Source

Kiva Editor's Assistant / regex_process.py

Full commit
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import sys
import re
import codecs

from collections import defaultdict

# Ordered list of (pattern, replacement) pairs.  apply_transforms() runs
# re.sub() with every pair, in the order presented here, using the
# IGNORECASE and UNICODE flags, so a later rule sees the output of all the
# earlier ones.  Order therefore matters: a specific rule must come before
# any more general rule that would rewrite part of the text it looks for.
#
# Backslashes are written doubled in plain u'' literals rather than using
# the ur'' prefix: ur'' is a syntax error on Python 3, while the doubled
# form yields exactly the same strings on both Python 2 and Python 3.

transforms = [
    (u'“|”', u'"'),

    (u'raised animals', u'grown animals'),

    (u'dependants', u'dependents'),

    (u'  +', u' '),

    (u' ,', u','), # foo , -> foo,

    # disabled rule:
#    (u'’', u"'"),

    (u'the congo town area', u'the Congo Town area'),

    (u'Pre-angkorean', u'pre-Angkorean'),

    (u'\\b1 infant-aged children', u'one infant child'),

    (u'adult-aged', u'adult'),

    (u'and etc\\.*', u'etc.'),

    # The trailing dot is optional and escaped so that the character after
    # "e.t.c" (a space, a comma, ...) is no longer swallowed.
    (u'e\\.t\\.c\\.?', u'etc.'),

    # Whole word "etc" that is not already followed by a dot.  The word
    # boundaries keep words such as "fetch" or "sketch" from being mangled.
    (u'\\betc\\b(?!\\.)', u'etc.'),

    (u'infant-aged', u'infant'),

    (u'\\b1 years', u'one year'),

    (u'pay Micro Finance loans', u'pay micro-finance loans'),

    (u'junk foods', u'junk food'),

    (u'requested for', u'requested'),

    (u'lake victoria', u'Lake Victoria'),

    (u'To make a living, (?P<name>(\\w|\\s)+) owns & operates a business venture in the [a-z]+ sector \\w+ (?P<business>[^.]+\\.)',
     u'\\g<name> owns and operates \\g<business>'),

    (u'requesting for a', u'requesting a'),

    (u'While not the only means for generating revenue, the', u'The'),

    (u'main source of income for the business comes primarily from',
     u'main source of income for the business comes from'),

    (u'Paglaum Multi-Purpose Cooperative \\(PMPC is',
     u'Paglaum Multi-Purpose Cooperative (PMPC) is'),

    (u'a month for these activities', u'a month from it'),

    (u'comes from buying and selling of', u'comes from selling'),

    (u'engage in business activities', u'do business'),

    (u"improve/expand (the borrower's|his|her) business", u'improve and expand it'),

    (u'borrowed a loan', u'took out a loan'),

    (u'in a business of', u'in the business of'),

    (u'with (.+) children and (.+) of them go to school', u'and has \\1 children, \\2 of whom go to school'),

    (u'to invest in expanding the business', u'to expand the business'),

    (u'is one of accredited', u'is one of the accredited'),

    (u'fisherfolks', u'fishermen'),

    # Must run before the generic "in future" rule below; otherwise that
    # rule rewrites the text first and this one can never match.
    (u'raise & sell in future', u'raise and sell'),

    (u'([iI]n) future', u'\\1 the future'),

    (u'KADET LTD', u'KADET Ltd.'),

    (u'KIVA', u'Kiva'),

    (u'aspired for', u'wanted'),

    (u"uplifting the family's standard", u"raising the family's standard"),

    # Anchored at a word boundary so "motorcycle loan" / "bicycle loan"
    # are left alone.
    (u'\\bcycle loan', u'loan'),

    (u'could continue to save up', u'can continue to save'),

    (u'clicking the link to the NWTF Kiva lending team',
     u'clicking the link to the <a href="http://www.kiva.org/team/nwtf_philippines">NWTF Kiva lending team</a>'),

    (u'fellowship\\* meeting', u'fellowship meeting*'),

    (u'from the Word of God she received', u'from the Word of God she studies'),

    (u'tyres', u'tires'),

    (u'mr\\.', u'Mr.'),

    (u'mrs\\.', u'Mrs.'),

    (u'([0-9]+) year old (man|woman)', u'\\1-year-old \\2'),

    # disabled rule:
    #(ur'\s+\.([^ \t])', ur'. \1'),

    (u'(?<!\\.)\\.\\.(?!\\.)', u'.'), # blah.. -> blah.

    (u'\\b1st\\b', u'first'),
    (u'\\b2nd\\b', u'second'),
    (u'\\b3rd\\b', u'third'),
    (u'\\b4th\\b', u'fourth'),
    (u'\\b5th\\b', u'fifth'),
    (u'\\b6th\\b', u'sixth'),
    (u'\\b7th\\b', u'seventh'),
    (u'\\b8th\\b', u'eighth'),
    (u'\\b9th\\b', u'ninth')
]

def apply_transforms(line, change_list):
    """Run every rule in the module-level ``transforms`` list over ``line``.

    The rules are applied one after another, in list order, each as a
    case-insensitive, Unicode-aware ``re.sub``; the output of one rule is
    the input of the next.

    ``change_list`` is accepted but not used by this function.

    Returns the rewritten line.
    """
    sub_flags = re.U | re.I
    for pattern, replacement in transforms:
        line = re.sub(pattern, replacement, line, flags=sub_flags)
    return line

# Matches one fixed, machine-generated loan-history paragraph and captures
# its variable parts as named groups (year, name, mfi, poss, prev_loan_amt,
# iso_currency, prev_loan_use, new_loan_amt, new_loan_use, nth_loan, prp,
# revenue_use).
#
# Regex escapes are written with doubled backslashes: in a non-raw literal
# "\d", "\w" and "\." are invalid string escapes, which Python 3 warns about,
# and the ur"" prefix is not available there.  The compiled pattern is
# unchanged.
TEMPLATE_RE = re.compile(
    u"In (?P<year>\\d+), (?P<name>\\w+) joined (?P<mfi>\\w+) to gain access"
    u" to financial services to help improve (?P<poss>\\w+) living situation"
    u" and ability to do business\\. \\w+ has successfully repaid a previous"
    u" loan of (?P<prev_loan_amt>[0-9,.]+) (?P<iso_currency>\\w+) from \\w+\\."
    u" This previous loan was used to (?P<prev_loan_use>[^.]+)\\. \\w+ is"
    u" requesting a new loan of (?P<new_loan_amt>[0-9,.]+) \\w+ which will be"
    u" used (?P<new_loan_use>[^.]+)\\. This loan will be the "
    u"(?P<nth_loan>(\\w|\\d)+) loan taken out by \\w+ from \\w+\\. (?P<prp>\\w+) "
    u"plans to use the additional revenue generated from the business to "
    u"(?P<revenue_use>[^.]+)\\.",
    re.IGNORECASE | re.UNICODE)

# Replacement text for a paragraph matched by TEMPLATE_RE; the placeholders
# are TEMPLATE_RE's named groups.
#  - prev_loan_use is captured *after* the words "used to", so the "to" has
#    to be supplied here.
#  - new_loan_use is captured right after "used", so it already carries its
#    own leading word and needs only a single space before it.
TEMPLATE_FMT = u"""In {year}, {name} joined {mfi} to gain access to financial services to help improve {poss} living situation and ability to do business. {prp} has successfully repaid {poss} last loan from {mfi}, which was for {prev_loan_amt} {iso_currency} and was used to {prev_loan_use}.

This loan will be {poss} {nth_loan} from {mfi}. {name} is requesting {new_loan_amt} {iso_currency} which will be used {new_loan_use}. {prp} plans to use the additional revenue generated from the business to {revenue_use}.\n"""



def handle_template(line, change_list):
    """Rewrite ``line`` with TEMPLATE_FMT if it starts with the TEMPLATE_RE text.

    When TEMPLATE_RE does not match at the beginning of ``line`` the line is
    returned unchanged.  Otherwise the named groups captured by the match
    are substituted into TEMPLATE_FMT and that formatted text is returned.

    ``change_list`` is accepted but not used by this function.
    """
    found = TEMPLATE_RE.match(line)
    if found is None:
        return line

    # The named groups of TEMPLATE_RE that TEMPLATE_FMT refers to.
    field_names = (
        'year', 'name', 'prev_loan_amt', 'iso_currency', 'mfi', 'poss',
        'prp', 'nth_loan', 'prev_loan_use', 'new_loan_amt', 'new_loan_use',
        'revenue_use',
    )
    captured = dict((field, found.group(field)) for field in field_names)
    return TEMPLATE_FMT.format(**captured)

def process_file(infile, change_list):
    """Process the text in infile.

    Every line yielded by iterating ``infile`` is passed through
    apply_transforms() and then handle_template(); ``change_list`` is
    handed on to both.  The processed lines are concatenated, in order,
    into a single unicode string, which is returned.
    """
    return u''.join(
        handle_template(apply_transforms(raw_line, change_list), change_list)
        for raw_line in infile
    )