# Kiva Editor's Assistant

 #!/usr/bin/env python
import sys
import StringIO
import re
import platform
import codecs

# Platform-specific clipboard access. Each supported platform defines the
# same pair of helpers: get_clipboard_text() and set_clipboard_text(text).
if platform.system() == 'Darwin':
    import subprocess

    def get_clipboard_text():
        """Return the clipboard contents as text, read via ``pbpaste``."""
        proc = subprocess.Popen('pbpaste', stdout=subprocess.PIPE)
        output, _ = proc.communicate()
        # NOTE(review): assumes pbpaste emits UTF-8; this depends on the
        # locale of the calling environment -- confirm.
        return output.decode('utf-8')

    def set_clipboard_text(text):
        """Replace the clipboard contents with ``text`` via ``pbcopy``."""
        if not isinstance(text, bytes):
            # The pipe needs bytes; non-ASCII text would otherwise fail on
            # the implicit ASCII conversion.
            text = text.encode('utf-8')
        proc = subprocess.Popen('pbcopy', stdin=subprocess.PIPE)
        proc.communicate(input=text)
elif platform.system() == 'Windows':
    import win32clipboard

    def get_clipboard_text():
        """Return the clipboard contents as unicode text."""
        # The clipboard has to be opened before it can be read, and must
        # always be closed again or other applications are locked out.
        win32clipboard.OpenClipboard()
        try:
            src = win32clipboard.GetClipboardData(win32clipboard.CF_UNICODETEXT)
        finally:
            win32clipboard.CloseClipboard()
        return src

    def set_clipboard_text(text):
        """Replace the clipboard contents with ``text``.

        A clipboard failure is reported on stdout rather than raised. The
        text is also written to ``kiva.txt`` in the current directory.
        """
        try:
            win32clipboard.OpenClipboard()
            try:
                win32clipboard.EmptyClipboard()
                win32clipboard.SetClipboardText(text, win32clipboard.CF_UNICODETEXT)
            finally:
                win32clipboard.CloseClipboard()
        except Exception as e:
            print('Set clipboard failed')
            print(e)
        # NOTE(review): the body of this ``with`` was missing from the
        # source; writing ``text`` is the presumed intent -- confirm.
        with codecs.open('kiva.txt', 'w', 'utf-8') as f:
            f.write(text)
else:
    print("Error: unsupported platform")

from collections import defaultdict

# look for matches in the order presented in the array.
# if a match is found, make the corresponding substitution.

# Ordered (pattern, replacement) pairs. Each pattern is a regular expression
# and each replacement a re.sub() template. Patterns containing backslashes
# are written as raw strings, which are valid on both Python 2 and Python 3
# (the ``ur''`` prefix is Python 2 only).
transforms = [
    (u'raised animals', u'grown animals'),

    (u'  +', u' '),

    (u' ,', u','),  # foo , -> foo,

    (u'adult-aged', u'adult'),

    # The captured amount must begin and end on a digit; otherwise the
    # separator after the number is swallowed and the currency code gets
    # glued to the next word ("Kshs 500 for" -> "500  KESfor").
    (r'Kshs ([0-9]+(?:[, ]+[0-9]+)*)', r'\1 KES'),
    (r'\bLe ([0-9]+(?:[, ]+[0-9]+)*)', r'\1 SLL'),
    (u'infant-aged', u'infant'),

    (u'junk foods', u'junk food'),

    (u'requested for', u'requested'),

    (u'lake victoria', u'Lake Victoria'),
    (r'To make a living, (?P<name>(\w|\s)+) owns & operates a business venture in the [a-z]+ sector \w+ (?P<business>[^.]+\.)',
     r'\g<name> owns and operates \g<business>'),

    (u'requesting for a', u'requesting a'),

    (u'While not the only means for generating revenue, the', u'The'),

    (u'main source of income for the business comes primarily from',
     'main source of income for the business comes from'),

    (r'Paglaum Multi-Purpose Cooperative \(PMPC is',
     'Paglaum Multi-Purpose Cooperative (PMPC) is'),

    (u'a month for these activities', u'a month from it'),

    (u'comes from buying and selling of', u'comes from buying and selling'),

    (u'engage in business activities', u'do business'),

    (u'improve/expand the borrower\'s business', u'improve and expand it'),

    (u'to invest in expanding the business', u'to expand the business'),

    (u'is one of accredited', u'is one of the accredited'),

    (u'fisherfolks', u'fisher-folk'),

    (u'aspired for', u'wanted'),

    (u'uplifting the family\'s standard', u'raising the family\'s standard'),

    (u'cycle loan', u'loan cycle'),

    (u'could continue to save up', u'can continue to save'),

    (u'clicking the link to the NWTF Kiva lending team',
     u'clicking the link to the <a href="">NWTF Kiva lending team</a>'),

    (u'raise & sell in future', u'raise and sell'),

    (r'\s+\.([^ \t])', r'. \1'),

    #(ur'([^ \t.])([.,])([^ \t.])', ur'\1\2 \3'),

    # As with Kshs/Le above: do not capture a trailing comma.
    (r'\bN([0-9]+(?:,[0-9]+)*)', r'\1 Nigerian naira'),

    (r'(?<!\.)\.\.(?!\.)', u'.'),  # blah.. -> blah.

    (u' 1 ', u' one '),
    (u' 2 ', u' two '),
    (u' 3 ', u' three '),
    (u' 4 ', u' four '),
    (u' 5 ', u' five '),
    (u' 6 ', u' six '),
    (u' 7 ', u' seven '),
    (u' 8 ', u' eight '),
    (u' 9 ', u' nine '),

    (r'\b1st', u'first'),
    (r'\b2nd', u'second'),
    (r'\b3rd', u'third'),
    (r'\b4th', u'fourth'),
    (r'\b5th', u'fifth'),
    (r'\b6th', u'sixth'),
    (r'\b7th', u'seventh'),
    (r'\b8th', u'eighth'),
    (r'\b9th', u'ninth'),
]

def apply_transforms(line):
    """Apply every entry of ``transforms`` to ``line``, in order.

    Each transform that changes the line is reported on stdout together
    with the line before and after. Once all transforms have run, digit
    grouping is applied via ``separate_digits``.

    Returns the rewritten line.
    """
    for pattern, replacement in transforms:
        # The flags must be passed by keyword: the fourth positional
        # parameter of re.sub() is ``count``, so passing them positionally
        # silently disabled both flags and capped the number of
        # replacements instead.
        new_version = re.sub(pattern, replacement, line, flags=re.U | re.I)
        if new_version != line:
            print('applied transform')
            print(u'     {0} -> {1}'.format(pattern, replacement))
            print('     original line:')
            print(line)
            print('     new line:')
            print(new_version)
            line = new_version
    line = separate_digits(line)
    return line

def digit_group_callback(match_obj):
    """Return the match with a comma between group 1 and group 2."""
    return match_obj.group(1) + ',' + match_obj.group(2)

def separate_digits(line):
    """Insert thousands separators into the numbers found in ``line``.

    For (almost) all numbers of 4 or more digits: repeatedly change the
    rightmost 4 digits to have a comma, i.e., go from 0000 to 0,000. The
    exception is four digit numbers that might be years (those directly
    preceded by "In " or "in "), which are left alone.
    """
    pattern = r'(?<![Ii]n )(\d)(\d{3})\b'
    # One substitution pass only groups the rightmost three digits of each
    # number, so repeat until nothing changes (1234567 -> 1234,567 ->
    # 1,234,567). Every pass that changes the line adds at least one comma,
    # so the loop terminates.
    while True:
        result = re.sub(pattern, digit_group_callback, line)
        if result == line:
            return result
        line = result

# Recognizes the boilerplate loan description and captures its variable
# parts as named groups. Every named group corresponds to a field of
# TEMPLATE_FMT.
# NOTE(review): the final ``revenue_use`` fragment was missing from the
# source and is reconstructed on the model of the other "use" groups --
# confirm against real descriptions.
TEMPLATE_RE = re.compile(
    r"In (?P<year>\d+), (?P<name>\w+) joined (?P<mfi>\w+) to gain access"
    r" to financial services to help improve (?P<poss>\w+) living situation"
    r" and ability to do business\. \w+ has successfully repaid a previous"
    r" loan of (?P<prev_loan_amt>[0-9,.]+) (?P<iso_currency>\w+) from \w+\."
    r" This previous loan was used to (?P<prev_loan_use>[^.]+)\. \w+ is"
    r" requesting a new loan of (?P<new_loan_amt>[0-9,.]+) \w+ which will be"
    r" used (?P<new_loan_use>[^.]+)\. This loan will be the "
    # \w already covers digits; the former (\w|\d)+ alternation matched the
    # same text but could backtrack exponentially on a failed match.
    r"(?P<nth_loan>\w+) loan taken out by \w+ from \w+\. (?P<prp>\w+) "
    r"plans to use the additional revenue generated from the business to "
    r"(?P<revenue_use>[^.]+)\.",
    re.IGNORECASE | re.UNICODE)

# Rewritten wording for a description matched by TEMPLATE_RE.
# ``prev_loan_use`` is captured without its leading "to" (the regex consumes
# "used to "), so the "to" is supplied here; ``new_loan_use`` is captured
# together with whatever follows "used ", so nothing is added in front of it.
TEMPLATE_FMT = u"""In {year}, {name} joined {mfi} to gain access to financial services to help improve {poss} living situation and ability to do business. {prp} has successfully repaid {poss} last loan from {mfi}, which was for {prev_loan_amt} {iso_currency} and was used to {prev_loan_use}.

This loan will be {poss} {nth_loan} from {mfi}. {name} is requesting a new loan of {new_loan_amt} {iso_currency} which will be used {new_loan_use}. {prp} plans to use the additional revenue generated from the business to {revenue_use}.\n"""

def handle_template(line):
    """Rewrite ``line`` when it follows the boilerplate loan description.

    Returns ``line`` unchanged when it does not match TEMPLATE_RE at its
    start; otherwise returns TEMPLATE_FMT filled in with the captured
    fields, and announces the replacement on stdout.
    """
    mo = TEMPLATE_RE.match(line)
    if not mo:
        return line
    print('performing template replacement')
    # The named groups of TEMPLATE_RE are exactly the fields of TEMPLATE_FMT.
    return TEMPLATE_FMT.format(**mo.groupdict())

# Display name for each known ISO currency code. Because this is a
# defaultdict(str), looking up an unknown code yields an empty string.
currency_display_names = defaultdict(str, (
    ('PHP', 'Philippine pesos'),
    ('KES', 'Kenyan shillings'),
    ('KSH', 'Kenyan shillings'),
    ('PEN', 'Peruvian nuevos soles'),
    ('SLL', 'Sierra Leonean leones'),
    ('MNT', 'Mongolian tugriks'),
    ('UGX', 'Ugandan shillings'),
    ('KGS', 'Kyrgyzstani soms'),
    ('VND', 'Vietnamese dong'),
))

def expand_iso_currency(lines):
    """Spell out ISO currency codes in ``lines``, modifying the list in place.

    For every line and every code in ``currency_display_names``, a code
    that appears next to an amount (after digits/spaces, or before them)
    is replaced by ``"<display name> (<code>)"``.

    NOTE(review): the calls in this function were truncated in the source
    and have been reconstructed; in particular, whether expansion was meant
    to stop after the first hit could not be determined -- confirm.
    """
    for i, line in enumerate(lines):
        for iso_currency, currency_display_name in currency_display_names.items():
            if not currency_display_name:
                # currency_display_names is a defaultdict(str): a lookup of
                # an unknown code elsewhere leaves an empty entry behind,
                # which must not be expanded.
                continue
            mo = re.search('[0-9 ]+({0})\\b'.format(iso_currency), line, re.I | re.U)
            if not mo:
                mo = re.search('({0})[0-9 ]+'.format(iso_currency), line, re.I | re.U)
            if mo:
                code = mo.group(1)
                # Carry the expanded text forward so that a second currency
                # found in the same line builds on this replacement instead
                # of overwriting it.
                line = line.replace(code,
                                    '{0} ({1})'.format(currency_display_name,
                                                       code))
                lines[i] = line

def process_file(infile):
    """Process the text in infile.

    Every line is passed through ``apply_transforms`` and then
    ``handle_template``; a separator row is printed after each line.

    Returns the processed lines joined into a single unicode string.
    """
    description = []
    for line in infile:
        line = apply_transforms(line)
        line = handle_template(line)
        # Collect the result; without this the function always returned an
        # empty string.
        description.append(line)
        print('***************************************************************************')
    # NOTE(review): expand_iso_currency() is defined in this file but is not
    # invoked here -- confirm whether it should run over ``description``.
    return u''.join(description)

def process_clipboard():
    """Run the clipboard text through ``process_file`` and put the result back."""
    result = process_file(StringIO.StringIO(get_clipboard_text()))
    # NOTE(review): the source computed ``result`` and then dropped it;
    # writing it back to the clipboard is the presumed intent -- confirm.
    set_clipboard_text(result)
if __name__ == '__main__':