Commits

david_walker committed 6594db4

checkpoint before massive change--about to merge transform code directly into rules, and have rules apply themselves directly rather than in a two-phase rule/transform process.
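
In rough sketch form (names are taken from the diff below; exact signatures
are still settling at this checkpoint), the move is away from "rules flag
matching tokens, then a separate RegexTransform pass rewrites them" and
toward each search object producing its own transform:

    # hypothetical single-phase flow this commit works toward
    for ts in RegexCleanupRule.regex_pairs:
        transform = ts.search(rule, tokens)  # find and build in one step
        if transform:
            transforms.append(transform)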

  • Parent commits a7cbe57
  • Branches parse

Files changed (3)

+def wc(s):
+    sys.stdout.write(' ' + str(s) + ' |')
+
+
+def print_table(caption, source, target, d):
+    print caption
+    # print source string as table header
+    sys.stdout.write('   |   |')
+    for s in source:
+        wc(s)
+    sys.stdout.write('\n')
+    # print table contents prefixed with target string
+    for row in range(len(target) + 1):
+        for col in range(len(source) + 2):
+            if col == 0:
+                if row == 0:
+                    wc(' ')
+                else:
+                    wc(target[row - 1])
+            else:
+                wc(d[row][col - 1].score)
+        sys.stdout.write('\n')
+    sys.stdout.write('\n\n')
+
 # This file contains code that is not currently used in the project, but
 # is kept just in case it might be useful.
 
     phase = INITIAL_PHASE
     regex_pairs = [
         # Character standardization
-        (u'“|”', u'"'),
-        (u"’", u"'"),
+        TokenSearchByRegexp(u'“|”', u'"'),
+        TokenSearchByRegexp(u"’", u"'"),
 
         # misspellings
-        (u'dependants', u'dependents'),
-        (ur'therefor\b', u'therefore'),
-        (ur'(?i)(micro) +finance', u'\1finance'),
+        TokenSearchByRegexp(u'dependants', u'dependents'),
+        TokenSearchByRegexp(ur'therefor\b', u'therefore'),
+        TokenSearchByRegexp(ur'(?i)(micro) +finance', ur'\1finance'),
 
         # proper nouns
-        (u'congo town', u'Congo Town'),
-        (u'lake victoria', u'Lake Victoria'),
-        (u'Pre-angkorean', u'pre-Angkorean'),
-        (u'KIVA', u'Kiva'),
-        (u'KADET LTD', u'KADET Ltd.'),
+        TokenSearchByRegexp(u'congo town', u'Congo Town'),
+        TokenSearchByRegexp(u'lake victoria', u'Lake Victoria'),
+        TokenSearchByRegexp(u'Pre-angkorean', u'pre-Angkorean'),
+        TokenSearchByRegexp(u'KIVA', u'Kiva'),
+        TokenSearchByRegexp(u'KADET LTD', u'KADET Ltd.'),
 
         # awkward or verbose constructions
-        (u'requested for', u'requested'),
-        (u'has given birth to', 'has'),
-        (u'requesting to borrow', u'asking to borrow'),
-        (u'adult-aged', u'adult'),
-        (ur'and etc\.*', u'etc.'),
-        (u'infant-aged', u'infant'),
-        (u'requesting for a', u'requesting a'),
-        (u'requested a loan for ([0-9]+)', ur'requested a loan of \1'),
-        (ur'\bhe is widowed', u'he is a widower'),
-        (u'borrowed a loan', u'took out a loan'),
-        (u'in a business of', u'in the business of'),
-        (u'with (.+) children and (.+) of them go to school',
+        TokenSearchByRegexp(u'requested for', u'requested'),
+        TokenSearchByRegexp(u'has given birth to', u'has'),
+        TokenSearchByRegexp(u'requesting to borrow', u'asking to borrow'),
+        TokenSearchByRegexp(u'adult-aged', u'adult'),
+        TokenSearchByRegexp(ur'and etc\.*', u'etc.'),
+        TokenSearchByRegexp(u'infant-aged', u'infant'),
+        TokenSearchByRegexp(u'requesting for a', u'requesting a'),
+        TokenSearchByRegexp(u'requested a loan for ([0-9]+)', ur'requested a loan of \1'),
+        TokenSearchByRegexp(ur'\bhe is widowed', u'he is a widower'),
+        TokenSearchByRegexp(u'borrowed a loan', u'took out a loan'),
+        TokenSearchByRegexp(u'in a business of', u'in the business of'),
+        TokenSearchByRegexp(u'with (.+) children and (.+) of them go to school',
          ur'and has \1 children, \2 of whom go to school'),
-        (u'to invest in expanding the business', u'to expand the business'),
-        (u'fisherfolks', u'fishermen'),
-        (u'aspired for', u'wanted'),
-        (u"uplifting the family's standard", u"raising the family's standard"),
-        (u'could continue to save up', u'can continue to save'),
-        (u'from the Word of God she received',
+        TokenSearchByRegexp(u'to invest in expanding the business', u'to expand the business'),
+        TokenSearchByRegexp(u'fisherfolks', u'fishermen'),
+        TokenSearchByRegexp(u'aspired for', u'wanted'),
+        TokenSearchByRegexp(u"uplifting the family's standard", u"raising the family's standard"),
+        TokenSearchByRegexp(u'could continue to save up', u'can continue to save'),
+        TokenSearchByRegexp(u'from the Word of God she received',
          u'from the Word of God she studies'),
-        (u'raise & sell in future', u'raise and sell'),
-        (u'married with ([0-9]+) (child|children)',
+        TokenSearchByRegexp(u'raise & sell in future', u'raise and sell'),
+        TokenSearchByRegexp(u'married with ([0-9]+) (child|children)',
          ur'married and has \1 \2'),
-        (u'has a long experience', u'has a lot of experience'),
-        (u'is aiming to gain more profits', u'aims to make more money'),
-        (u'has a good experience in this field and a good reputation '
+        TokenSearchByRegexp(u'has a long experience', u'has a lot of experience'),
+        TokenSearchByRegexp(u'is aiming to gain more profits', u'aims to make more money'),
+        TokenSearchByRegexp(u'has a good experience in this field and a good reputation '
          'and (s?he) is being well known in (his|her) area',
          ur' has a lot of experience in this field, a good reputation, '
          ur'and is well known in \2 area'),
 
         # Chiefly British
-        (u'([iI]n) future', ur'\1 the future'),
-        (u'tyres', u'tires'),
-        (u'neighbour', u'neighbor'),
-        (u'licencing', u'licensing'),
+        TokenSearchByRegexp(u'([iI]n) future', ur'\1 the future'),
+        TokenSearchByRegexp(u'tyres', u'tires'),
+        TokenSearchByRegexp(u'neighbour', u'neighbor'),
+        TokenSearchByRegexp(u'licencing', u'licensing'),
 
         # non-ISO currency abbreviations
-        (u'/=', u' UGX '),
-        (ur'(?i)ksh\.', u' KES '),
-        (ur'(?i)kshs(\.|)', u' KES '),
-        (ur'[Pp]hp', 'PHP'),
-        (ur'(?i)\bLE([0-9]*)\b', ur'SLL \1'),
-        (ur'\bRp\.', 'IDR'),
+        TokenSearchByRegexp(u'/=', u' UGX '),
+        TokenSearchByRegexp(ur'(?i)ksh\.', u' KES '),
+        TokenSearchByRegexp(ur'(?i)kshs(\.|)', u' KES '),
+        TokenSearchByRegexp(ur'[Pp]hp', u'PHP'),
+        TokenSearchByRegexp(ur'\bP([0-9,.]+)\b', ur'\1 PHP'),
+        TokenSearchByRegexp(ur'(?i)\bLE([0-9]*)\b', ur'SLL \1'),
+        TokenSearchByRegexp(ur'\bRp\.', u'IDR'),
 
         # incorrect punctuation
-        (ur'e\.t\.c\.?', u'etc.'),
-        (ur'\betc([^.])', ur'etc.\1'),
-        (ur'(?<!\.)\.\.(?!\.)', u'.'),  # blah.. -> blah.
+        TokenSearchByRegexp(ur'e\.t\.c\.?', u'etc.'),
+        TokenSearchByRegexp(ur'\betc([^.])', ur'etc.\1'),
+        TokenSearchByRegexp(ur'(?<!\.)\.\.(?!\.)', u'.'),  # blah.. -> blah.
 
         # grammatical errors
-        (ur'\b1 infant-aged children', u'one infant child'),
-        (ur'\b1 years', u'one year'),
-        (ur'never missed any meeting\.', u'never missed any meetings.'),
+        TokenSearchByRegexp(ur'\b1 infant-aged children', u'one infant child'),
+        TokenSearchByRegexp(ur'\b1 years', u'one year'),
+        TokenSearchByRegexp(ur'never missed any meeting\.', u'never missed any meetings.'),
 
         # Field partner template cleanup
-        (ur'To make a living, (?P<name>(\w|\s)+) owns & operates a business'
+        TokenSearchByRegexp(ur'To make a living, (?P<name>(\w|\s)+) owns & operates a business '
            'venture in the [a-z]+ sector \w+ (?P<business>[^.]+\.)',
          ur'\g<name> has a \g<business>'),
-        (u'[Ww]hile not the only means for generating revenue, the', u'The'),
-        (u'main source of income for the business comes primarily from',
+        TokenSearchByRegexp(u'[Ww]hile not the only means for generating revenue, the', u'The'),
+        TokenSearchByRegexp(u'main source of income for the business comes primarily from',
          u'main source of income for the business comes from'),
-        (u'a month for these activities', u'a month from it'),
-        (u'comes from buying and selling of', u'comes from selling'),
-        (u'engage in business activities', u'do business'),
-        (u"improve/expand (the borrower's|his|her) business",
+        TokenSearchByRegexp(u'a month for these activities', u'a month from it'),
+        TokenSearchByRegexp(u'comes from buying and selling of', u'comes from selling'),
+        TokenSearchByRegexp(u'engage in business activities', u'do business'),
+        TokenSearchByRegexp(u"improve/expand (the borrower's|his|her) business",
          u'improve and expand it'),
-        (ur'fellowship\* meeting', u'fellowship meeting*'),
-        (u'clicking the link to the NWTF Kiva lending team',
+        TokenSearchByRegexp(ur'fellowship\* meeting', u'fellowship meeting*'),
+        TokenSearchByRegexp(u'clicking the link to the NWTF Kiva lending team',
          ur'clicking the link to the '
          '<a href="http://www.kiva.org/team/nwtf_philippines">'
          'NWTF Kiva lending team</a>'),
-        (u'Kiva\'s Muslim World Lending helptext: http://tinyurl.com/3aekx8m',
+        TokenSearchByRegexp(u'Kiva\'s Muslim World Lending helptext: http://tinyurl.com/3aekx8m',
          u'Kiva\'s article on <a href="http://na3.salesforce.com/_ui/'
          'selfservice/pkb/'
          'PublicKnowledgeSolution/d?orgId=00D500000006svl&lang=1&id='
          'lang%3D1%26t%3D4&ps=1&pPv=1">Lending in the Muslim World</a>'),
 
         # Jargon
-        (u'cycle loan', u'loan'),
-        (u'loan cycle', u'loan'),
-        (u'loan facility', u'loan'),
+        TokenSearchByRegexp(u'cycle loan', u'loan'),
+        TokenSearchByRegexp(u'loan cycle', u'loan'),
+        TokenSearchByRegexp(u'loan facility', u'loan'),
 
         # Numeric expressions
-        (ur'\b1st\b', u'first'),
-        (ur'\b2nd\b', u'second'),
-        (ur'\b3rd\b', u'third'),
-        (ur'\b4th\b', u'fourth'),
-        (ur'\b5th\b', u'fifth'),
-        (ur'\b6th\b', u'sixth'),
-        (ur'\b7th\b', u'seventh'),
-        (ur'\b8th\b', u'eighth'),
-        (ur'\b9th\b', u'ninth'),
+        TokenSearchByRegexp(ur'\b1st\b', u'first'),
+        TokenSearchByRegexp(ur'\b2nd\b', u'second'),
+        TokenSearchByRegexp(ur'\b3rd\b', u'third'),
+        TokenSearchByRegexp(ur'\b4th\b', u'fourth'),
+        TokenSearchByRegexp(ur'\b5th\b', u'fifth'),
+        TokenSearchByRegexp(ur'\b6th\b', u'sixth'),
+        TokenSearchByRegexp(ur'\b7th\b', u'seventh'),
+        TokenSearchByRegexp(ur'\b8th\b', u'eighth'),
+        TokenSearchByRegexp(ur'\b9th\b', u'ninth'),
 
         ]
 
         self.tokens = tokens
         transforms = []
-        transform_tokens = []
-        for token in tokens:
-            if token.non_printing:
-                continue
-            for rp in RegexCleanupRule.regex_pairs:
-                mo = re.search(rp[0], token.str, re.U)
-                if mo:
-                    transform_tokens.append(token)
-                    break
-        if transform_tokens:
-            transforms.append(tr.RegexTransform(self, transform_tokens))
-        else:
+
+        for ts in RegexCleanupRule.regex_pairs:
+            transform = ts.search(self, tokens)
+            if transform:
+                transforms.append(transform)
+
+        if not transforms:
             self.enabled = False
         return transforms
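
For illustration, a single regex_pairs entry in action -- a minimal sketch
assuming a Token type with a .str attribute (Token and rule here are
stand-ins, not names introduced by this commit):

    tokens = [Token(s) for s in u'hopes to expand in future'.split()]
    ts = TokenSearchByRegexp(u'([iI]n) future', ur'\1 the future')
    transform = ts.search(rule, tokens)
    # the per-token patterns match "in" and "future", so ts returns a
    # RegexTransform that rewrites the pair to "in the future"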
 

File tokensearch.py

 #!/usr/bin/env python
 
 from collections import namedtuple
-import sys
+import re
 
 OpCode = namedtuple('OpCode', ['opcode', 'token', 'row', 'col'])
 
         self._value = value
         self._compare_func = compare_func
 
-    def match(self, token):
+    def search(self, token):
         """Compare token against the criteria stored in this object.
         """
         if self._compare_func:
 
 
 class TokenSearchByRegexp(object):
-    def __init__(self, regexp, attr_name='str'):
+    def __init__(self, regexp, replace, attr_name='str'):
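+        # Both the pattern and the replacement are whitespace-split into
+        # per-token pieces; each element of self._regexp is matched
+        # against the text of a single token.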
+        if isinstance(regexp, basestring):
+            self._regexp = regexp.split()
+        else:
+            self._regexp = regexp
+        if isinstance(replace, basestring):
+            self._replace = replace.split()
+        else:
+            self._replace = replace
         self._attr_name = attr_name
-        self._regexp = regexp
+        self._searched_tokens = []
 
-    def match(self, token):
-        return self._regexp.match(token.getattr(self._attr_name))
-
-
-class TokenSearchReplace(object):
-    def __init__(self, search_exprs, replacement):
-        self._search_exprs = search_exprs
-        self._replacement = replacement
-        self._match_start_idx = None
-
-    def search(self, tokens):
+    def search(self, rule, tokens):
         self._searched_tokens = tokens
         search_expr_idx = 0
         for token in tokens:
-            if self._search_exprs[search_expr_idx].match(token):
+            if re.match(self._regexp[search_expr_idx],
+                        getattr(token, self._attr_name), re.U):
                 search_expr_idx += 1
-                if search_expr_idx == len(self._search_exprs) - 1:
-                    self._match_start_idx = tokens.index(token)
-                    return True
+                if search_expr_idx == len(self._regexp):
+                    # index of the first token of the full match
+                    match_start_idx = tokens.index(token) - len(self._regexp) + 1
+                    transform = RegexTransform(rule, match_start_idx, self._replace)
+                    return transform
             else:
                 search_expr_idx = 0
-        return False
-
-    def replace():
-        pass
-
-
-def deploy():
-    tks = map(TokenSearch, u'with (.+) children and (.+) of them go to school'.split())
-    # convert the two plain TokenSearch objects into regular
-    # expression token search objects.
-    tks[1] = TokenSearchByRegexp(tks[1].value)
-    tks[4] = TokenSearchByRegexp(tks[4].value)
-
-    replacement = ur'and has \1 children, \2 of whom go to school'.split()
-    tksl = TokenSearchList(tks, replacement)
-
-
-def wc(s):
-    sys.stdout.write(' ' + str(s) + ' |')
-
-
-def print_table(caption, source, target, d):
-    print caption
-    # print source string as table header
-    sys.stdout.write('   |   |')
-    for s in source:
-        wc(s)
-    sys.stdout.write('\n')
-    # print table contents prefixed with target string
-    for row in range(len(target) + 1):
-        for col in range(len(source) + 2):
-            if col == 0:
-                if row == 0:
-                    wc(' ')
-                else:
-                    wc(target[row - 1])
-            else:
-                wc(d[row][col - 1].score)
-        sys.stdout.write('\n')
-    sys.stdout.write('\n\n')
+        return None
 
 
 def get_levenshtein_dist(source_tokens, target_tokens):
     # tokens into the target tokens using the minimum number of copy,
     # delete, insert, and change operations.
     #
-    #
 
     num_source_tokens = len(source_tokens)
     num_target_tokens = len(target_tokens)
 
     for col in range(1, num_source_tokens + 1):
         for row in range(1, num_target_tokens + 1):
-            if source_tokens[col - 1] == target_tokens[row - 1]:
+            if source_tokens[col - 1].str == target_tokens[row - 1].str:
                 d[row][col].score = d[row - 1][col - 1].score
                 d[row][col].hist = d[row - 1][col - 1].hist + [
                     OpCode('c', target_tokens[row - 1], row, col)]
 
                 if row > col:
                     if sub_cost < ins_cost:
-                        if not (d[row][col].score == sub_cost):
-                            print '[{}, {}] score={}, ins_cost={}, del_cost={}, SUB_COST={}'.format(
-                                row, col, d[row][col].score, ins_cost, del_cost, sub_cost)
-                            print_table('', source_tokens, target_tokens, d)
-                            assert(False)
                         d[row][col].hist = d[row - 1][col - 1].hist + [
                             OpCode('s', target_tokens[row - 1], row, col)]
                     else:
-                        if not (d[row][col].score == ins_cost):
-                            print '[{}, {}] score={}, INS_COST={}, del_cost={}, sub_cost={}'.format(
-                                row, col, d[row][col].score, ins_cost, del_cost, sub_cost)
-                            print_table('', source_tokens, target_tokens, d)
-                            assert(False)
                         d[row][col].hist = d[row - 1][col].hist + [
                             OpCode('i', target_tokens[row - 1], row, col)]
                 elif row < col:
                     if sub_cost < del_cost:
-                        assert(sub_cost < ins_cost)
-                        if not (d[row][col].score == sub_cost):
-                            print '[{}, {}] score={}, ins_cost={}, del_cost={}, SUB_COST={}'.format(
-                                row, col, d[row][col].score, ins_cost, del_cost, sub_cost)
-                            print_table('', source_tokens, target_tokens, d)
-                            assert(False)
                         d[row][col].hist = d[row - 1][col - 1].hist + [
                             OpCode('s', target_tokens[row - 1], row, col)]
                     else:
-                        if not (d[row][col].score == del_cost):
-                            print '[{}, {}] score={}, ins_cost={}, DEL_COST={}, sub_cost={}'.format(
-                                row, col, d[row][col].score, ins_cost, del_cost, sub_cost)
-                        if not (del_cost < ins_cost):
-                            print '[{}, {}] score={}, ins_cost={}, DEL_COST={}, sub_cost={}'.format(
-                                row, col, d[row][col].score, ins_cost, del_cost, sub_cost)
-
                         d[row][col].hist = d[row][col - 1].hist + [
                             OpCode('d', source_tokens[col - 1], row, col)]
-                else: # on the diagonal
+                else:  # on the diagonal
                     if d[row][col].score == sub_cost:
-                        d[row][col].hist = d[row - 1][col - 1].hist + [
-                        OpCode('s', target_tokens[row - 1], row, col)]
+                        d[row][col].hist = d[row - 1][col - 1].hist + [
+                            OpCode('s', target_tokens[row - 1], row, col)]
     return d[num_target_tokens][num_source_tokens].hist
 
 
-def modify_tokens(source_tokens, target_tokens):
-    operations = get_levenshtein_dist(source_tokens, target_tokens)
+def modify_tokens(source_tokens, source_pos, source_len, target_tokens):
+    source_slice = source_tokens[source_pos:(source_pos + source_len)]
+    operations = get_levenshtein_dist(source_slice, target_tokens)
 
     # Now d[num_source_tokens][num_target_tokens].hist contains a list of
     # operations to perform to make source_tokens look like
     # target_tokens.
-    source_idx = 0
+    source_idx = source_pos
     for op in operations:
         # opcode 'c' (copy) is a no-op
         if op.opcode == 'd':
         elif op.opcode == 'i':
             source_tokens.insert(source_idx, op.token)
         elif op.opcode == 's':
-            source_tokens[source_idx] = op.token
+            source_tokens[source_idx].str = op.token.str
 
         if op.opcode != 'd':
             source_idx += 1
     return source_tokens
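+
+# Worked example (hypothetical Token objects): for a token list spelling
+# "he requested for a loan", modify_tokens(tokens, 1, 2, [Token(u'requested')])
+# diffs the slice ("requested", "for") against ("requested",), producing a
+# copy and a delete, and edits the list in place to "he requested a loan".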
 
-def split_chars(s):
-    return [c for c in s]
 
-if __name__ == '__main__':
-    result_tokens = modify_tokens(split_chars(sys.argv[1]), split_chars(sys.argv[2]))
-    result = ''.join(result_tokens)
-    if result == sys.argv[2]:
-        print 'Correct:', result
-    else:
-        print 'ERROR:', result
+def deploy():
+    tks = map(TokenSearch,
+              u'with (.+) children and (.+) of them go to school'.split())
+    # convert the two plain TokenSearch objects into regular
+    # expression token search objects.
+    tks[1] = TokenSearchByRegexp(tks[1].value)
+    tks[4] = TokenSearchByRegexp(tks[4].value)
+
+    replacement = ur'and has \1 children, \2 of whom go to school'.split()
+    tksl = TokenSearchList(tks, replacement)
+
+    # Scratch code: TokenSearchByRegexp above predates the new
+    # two-argument signature, TokenSearchList is not defined in this
+    # file, and 'tokens' would be supplied by the caller.
+    transform = search_for_pattern(tokens, tksl)
+
+
+def search_for_pattern(tokens, token_search):
+    """
+    If the token_search object finds a match in tokens, ask it to
+    generate a Transform object and return it.
+    """
+    return token_search.search(tokens)
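
A minimal usage sketch of the new search protocol (note that at this
checkpoint search_for_pattern does not yet pass along the rule argument
that TokenSearchByRegexp.search expects, so the direct call is shown):

    ts = TokenSearchByRegexp(u'loan facility', u'loan')
    transform = ts.search(rule, tokens)  # rule and tokens come from the engine
    if transform is not None:
        transforms.append(transform)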