david_walker committed 8b06e29

debug tokensearch.py
change regex search replace rule to not use transform


Files changed (2)

 
         # misspellings
         TokenSearchByRegexp(u'dependants', u'dependents'),
-        TokenSearchByRegexp(ur'therefor\b', u'therefore'),
-        TokenSearchByRegexp(ur'(?i)(micro) +finance', u'\1finance'),
+        TokenSearchByRegexp(ur'therefor', u'therefore'),
+        TokenSearchByRegexp(ur'(?i)(micro) finance', u'\1finance'),
 
         # proper nouns
         TokenSearchByRegexp(u'congo town', u'Congo Town'),
         TokenSearchByRegexp(u'infant-aged', u'infant'),
         TokenSearchByRegexp(u'requesting for a', u'requesting a'),
         TokenSearchByRegexp(u'requested a loan for ([0-9]+)', ur'requested a loan of \1'),
-        TokenSearchByRegexp(ur'\bhe is widowed', u'he is a widower'),
+        TokenSearchByRegexp(ur'he is widowed', u'he is a widower'),
         TokenSearchByRegexp(u'borrowed a loan', u'took out a loan'),
         TokenSearchByRegexp(u'in a business of', u'in the business of'),
         TokenSearchByRegexp(u'with (.+) children and (.+) of them go to school',
                             ur'and has \1 children, \2 of whom go to school'),
         TokenSearchByRegexp(u'licencing', u'licensing'),
 
         # non-ISO currency abbreviations
-        TokenSearchByRegexp(u'/=', u' UGX '),
-        TokenSearchByRegexp(ur'(?i)ksh\.', u' KES '),
-        TokenSearchByRegexp(ur'(?i)kshs(\.|)', u' KES '),
+        TokenSearchByRegexp(u'/=', u'UGX'),
+        TokenSearchByRegexp(ur'(?i)ksh\.', u'KES'),
+        TokenSearchByRegexp(ur'(?i)kshs(\.|)', u'KES'),
         TokenSearchByRegexp(ur'[Pp]hp', 'PHP'),
-        TokenSearchByRegexp(ur'\bP([0-9,.]+)\b', ur'\1 PHP'),
-        TokenSearchByRegexp(ur'(?i)\bLE([0-9]*)\b', ur'SLL \1'),
-        TokenSearchByRegexp(ur'\bRp\.', 'IDR'),
+        TokenSearchByRegexp(ur'P([0-9,.]+)', ur'\1 PHP'),
+        TokenSearchByRegexp(ur'(?i)LE ([0-9]*)', ur'SLL \1'),
+        TokenSearchByRegexp(ur'Rp\.', 'IDR'),
 
         # incorrect punctuation
         TokenSearchByRegexp(ur'e\.t\.c\.?', u'etc.'),
         TokenSearchByRegexp(ur'(?<!\.)\.\.(?!\.)', u'.'),  # blah.. -> blah.
 
         # grammatical errors
-        TokenSearchByRegexp(ur'\b1 infant-aged children', u'one infant child'),
-        TokenSearchByRegexp(ur'\b1 years', u'one year'),
+        TokenSearchByRegexp(ur'1 infant-aged children', u'one infant child'),
+        TokenSearchByRegexp(ur'1 years', u'one year'),
         TokenSearchByRegexp(ur'never missed any meeting\.', u'never missed any meetings.'),
 
         # Field partner template cleanup
-        TokenSearchByRegexp(ur'To make a living, (?P<name>(\w|\s)+) owns & operates a business'
-           'venture in the [a-z]+ sector \w+ (?P<business>[^.]+\.)',
-         ur'\g<name> has a \g<business>'),
+        # TokenSearchByRegexp(ur'To make a living, (?P<name>(\w|\s)+) owns & operates a business'
+        #    'venture in the [a-z]+ sector \w+ (?P<business>[^.]+\.)',
+        #  ur'\g<name> has a \g<business>'),
         TokenSearchByRegexp(u'[Ww]hile not the only means for generating revenue, the', u'The'),
         TokenSearchByRegexp(u'main source of income for the business comes primarily from',
          u'main source of income for the business comes from'),
         TokenSearchByRegexp(u'loan facility', u'loan'),
 
         # Numeric expressions
-        TokenSearchByRegexp(ur'\b1st\b', u'first'),
-        TokenSearchByRegexp(ur'\b2nd\b', u'second'),
-        TokenSearchByRegexp(ur'\b3rd\b', u'third'),
-        TokenSearchByRegexp(ur'\b4th\b', u'fourth'),
-        TokenSearchByRegexp(ur'\b5th\b', u'fifth'),
-        TokenSearchByRegexp(ur'\b6th\b', u'sixth'),
-        TokenSearchByRegexp(ur'\b7th\b', u'seventh'),
-        TokenSearchByRegexp(ur'\b8th\b', u'eighth'),
-        TokenSearchByRegexp(ur'\b9th\b', u'ninth'),
+        TokenSearchByRegexp(ur'1st', u'first'),
+        TokenSearchByRegexp(ur'2nd', u'second'),
+        TokenSearchByRegexp(ur'3rd', u'third'),
+        TokenSearchByRegexp(ur'4th', u'fourth'),
+        TokenSearchByRegexp(ur'5th', u'fifth'),
+        TokenSearchByRegexp(ur'6th', u'sixth'),
+        TokenSearchByRegexp(ur'7th', u'seventh'),
+        TokenSearchByRegexp(ur'8th', u'eighth'),
+        TokenSearchByRegexp(ur'9th', u'ninth'),
 
         ]
 
     def __init__(self):
         Rule.__init__(self, 10, 1.0, "Search and replace specific strings")
 
-    def get_transforms(self, tokens):
-        self.tokens = tokens
-        transforms = []
-        transform_tokens = []
-
+    def apply(self, tokens):
         for ts in RegexCleanupRule.regex_pairs:
-            transform = ts.search(tokens)
-            if transform:
-                transforms.append(transform)
-
-        if not transforms:
-            self.enabled = False
-        return transforms
+            ts.apply(tokens)
 
 
 class ParagraphRule(Rule):
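
The change in the first file replaces the transform-collection protocol with direct in-place application: RegexCleanupRule.apply now simply runs every TokenSearchByRegexp over the token list. A minimal sketch of the new call pattern (Rule and Token are stand-ins here; neither class is shown in this commit):

    # Sketch only: Rule and Token stand in for classes this commit
    # doesn't show.
    class Rule(object):
        def __init__(self, rank, weight, description):
            pass

    class Token(object):
        def __init__(self, s):
            self.str = s
        def __repr__(self):
            return self.str

    rule = RegexCleanupRule()
    tokens = map(Token, u'she requested a loan for 500'.split())
    rule.apply(tokens)   # mutates tokens in place; no Transform objects
    print tokens         # -> [she, requested, a, loan, of, 500]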
tokensearch.py

#!/usr/bin/env python
-
+import re
+from mytoken import Token
 from collections import namedtuple
 
 OpCode = namedtuple('OpCode', ['opcode', 'token', 'row', 'col'])
 class TokenSearchByRegexp(object):
     def __init__(self, regexp, replace, attr_name='str'):
         if isinstance(regexp, basestring):
-            self._regexp = regexp.split()
+            self._regexp = map(re.compile, regexp.split())
         else:
             self._regexp = regexp
         if isinstance(replace, basestring):
             self._replace = replace.split()
         else:
             self._replace = replace
         self._attr_name = attr_name
-        self._searched_tokens = []
 
-    def search(self, rule, tokens):
-        self._searched_tokens = tokens
+    def _instantiate_replacements(self, match_groups):
+        inst_rep = []
+        for rep in self._replace:
+            # first, find out if there's a replacement escape sequence
+            # in the replacement string (\1, \2, etc.)
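+            # (e.g. rep == u'\1' with match_groups == [u'500']
+            # comes back as u'500')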
+            while True:
+                mo = re.search(r'\\([0-9])', rep)
+                if not mo:
+                    inst_rep.append(rep)
+                    break
+                # there is; find out which group is supposed to replace
+                # it
+                index = int(mo.group(1)) - 1
+                # substitute the appropriate match group for all
+                # occurrences of the escape sequence
+                rep = rep.replace(mo.group(0), match_groups[index])
+                # loop around to replace any further escapes
+        return inst_rep
+
+    def apply(self, tokens):
         search_expr_idx = 0
-        for token in tokens:
-            if self._regexp[search_expr_idx].match(token.getattr(self._attr_name)):
-                search_expr_idx += 1
-                if search_expr_idx == len(self._search_exprs) - 1:
-                    match_start_idx = tokens.index(token)
-                    transform = RegexTransform(rule, match_start_idx, self._replace)
-                    return transform
-            else:
+        match_groups = []
+        for i, token in enumerate(tokens):
+            mo = self._regexp[search_expr_idx].match(token.str)
+            if not mo:
+                # Reset the search, but re-test this token against
+                # the first pattern so a token that breaks a partial
+                # match can still begin a new one.
+                search_expr_idx = 0
+                match_groups = []
+                mo = self._regexp[0].match(token.str)
+                if not mo:
+                    continue
-        return None
+            # Save the Match groups, if non-empty; flatten the tuples
+            # into a single list so they can be referenced by the
+            # replacement escapes \1, \2, etc.
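+            # (e.g. the pattern u'with (.+) children and (.+)'
+            # accumulates [u'4', u'2'] from 'with 4 children and 2')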
+            if mo.groups():
+                match_groups += list(mo.groups())
+            # Move on to the next element of self._regexp; if we've
+            # successfully matched the entire length of the self._regexp
+            # list, then apply the replacement to the source tokens.
+            search_expr_idx += 1
+            if search_expr_idx == len(self._regexp):
+                modify_tokens(tokens,
+                              i - (len(self._regexp) - 1),
+                              len(self._regexp),
+                              self._instantiate_replacements(match_groups))
+                break
 
 
-def get_levenshtein_dist(source_tokens, target_tokens):
+def get_levenshtein_dist(source_tokens, target_strings):
     """Return a minimal list of operations required to transform the
     source tokens into the target tokens.
     """
     # d is a dynamic-programming matrix of ScoreHist cells; each cell
     # pairs a running edit-distance score with the operation history
     # that produced it (ScoreHist is assumed to be defined elsewhere
     # in this module).
 
     num_source_tokens = len(source_tokens)
-    num_target_tokens = len(target_tokens)
+    num_target_strings = len(target_strings)
     d = [[ScoreHist() for col in range(num_source_tokens + 1)]
-         for row in range(num_target_tokens + 1)]
+         for row in range(num_target_strings + 1)]
 
-    for row in range(num_target_tokens + 1):
+    for row in range(num_target_strings + 1):
         # init column 0 with the distance of any target string to an
         # empty source string
         d[row][0].score = row
         if row > 0:
-            d[row][0].hist = [OpCode('i', target_tokens[row - 1], row, col)]
+            d[row][0].hist = [OpCode('i', target_strings[row - 1], row, col)]
+            d[row][0].hist = [OpCode('i', target_strings[row - 1], row, col)]
     for col in range(num_source_tokens + 1):
         # init row 0 with the distance of any second string to an empty
         # first string
         d[0][col].score = col
         if col > 0:
             d[0][col].hist = [OpCode('d', source_tokens[col - 1], row, col)]
 
     for col in range(1, num_source_tokens + 1):
-        for row in range(1, num_target_tokens + 1):
-            if source_tokens[col - 1].str == target_tokens[row - 1].str:
+        for row in range(1, num_target_strings + 1):
+            if source_tokens[col - 1].str == target_strings[row - 1]:
                 d[row][col].score = d[row - 1][col - 1].score
                 d[row][col].hist = d[row - 1][col - 1].hist + [
-                    OpCode('c', target_tokens[row - 1], row, col)]
+                    OpCode('c', target_strings[row - 1], row, col)]
             else:
                 del_cost = d[row][col - 1].score + 1
                 ins_cost = d[row - 1][col].score + 1
                 sub_cost = d[row - 1][col - 1].score + 1
                 d[row][col].score = min(del_cost, ins_cost, sub_cost)
                 if row > col:
                     if sub_cost < ins_cost:
                         d[row][col].hist = d[row - 1][col - 1].hist + [
-                            OpCode('s', target_tokens[row - 1], row, col)]
+                            OpCode('s', target_strings[row - 1], row, col)]
                     else:
                         d[row][col].hist = d[row - 1][col].hist + [
-                            OpCode('i', target_tokens[row - 1], row, col)]
+                            OpCode('i', target_strings[row - 1], row, col)]
                 elif row < col:
                     if sub_cost < del_cost:
                         d[row][col].hist = d[row - 1][col - 1].hist + [
-                            OpCode('s', target_tokens[row - 1], row, col)]
+                            OpCode('s', target_strings[row - 1], row, col)]
                     else:
                         d[row][col].hist = d[row][col - 1].hist + [
                             OpCode('d', source_tokens[col - 1], row, col)]
                 else:  # on the diagonal
                     if d[row][col].score == sub_cost:
                         d[row][col].hist = d[row - 1][col - 1].hist + [
-                        OpCode('s', target_tokens[row - 1], row, col)]
+                        OpCode('s', target_strings[row - 1], row, col)]
                     elif d[row][col].score == ins_cost:
                         d[row][col].hist = d[row - 1][col].hist + [
-                        OpCode('i', target_tokens[row - 1], row, col)]
+                        OpCode('i', target_strings[row - 1], row, col)]
                     else:
                         d[row][col].hist = d[row][col - 1].hist + [
                         OpCode('d', source_tokens[col - 1], row, col)]
-    return d[num_target_tokens][num_source_tokens].hist
+    return d[num_target_strings][num_source_tokens].hist
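
For a concrete sense of the output, one minimal edit script for a short alignment (illustrative only; tie-breaking can pick a different but equally short script):

    # Aligning source tokens ['he', 'is', 'widowed'] with target
    # strings ['he', 'is', 'a', 'widower'] costs two edits, e.g.:
    #   'c' copy 'he', 'c' copy 'is', 'i' insert 'a',
    #   's' substitute 'widower' for 'widowed'
    # modify_tokens below replays such a list against the live tokens.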
 
 
-def modify_tokens(source_tokens, source_pos, source_len, target_tokens):
+def modify_tokens(source_tokens, source_pos, source_len, target_strings):
     source_slice = source_tokens[source_pos:(source_pos + source_len)]
-    operations = get_levenshtein_dist(source_slice, target_tokens)
+    operations = get_levenshtein_dist(source_slice, target_strings)
 
-    # Now d[num_source_tokens][num_target_tokens].hist contains a list of
+    # Now d[num_source_tokens][num_target_strings].hist contains a list of
     # operations to perform to make source_tokens look like
-    # target_tokens.
+    # target_strings.
     source_idx = source_pos
     for op in operations:
         # opcode 'c' (copy) is a no-op
         if op.opcode == 'd':
             source_tokens.pop(source_idx)
         elif op.opcode == 'i':
-            source_tokens.insert(source_idx, op.token)
+            source_tokens.insert(source_idx, Token(op.token))
         elif op.opcode == 's':
-            source_tokens[source_idx].str = op.token.str
+            source_tokens[source_idx].str = op.token
 
         if op.opcode != 'd':
             source_idx += 1
     return source_tokens
 
 
-def deploy():
-    tks = map(TokenSearch,
-              u'with (.+) children and (.+) of them go to school'.split())
-    # convert the two plain TokenSearch objects into regular
-    # expression token search objects.
-    tks[1] = TokenSearchByRegexp(tks[1].value)
-    tks[4] = TokenSearchByRegexp(tks[4].value)
-
-    replacement = ur'and has \1 children, \2 of whom go to school'.split()
-    tksl = TokenSearchList(tks, replacement)
-
-
-    transform = search_for_pattern(tokens, pattern, replacement)
-
-
-def search_for_pattern(tokens, token_search):
-    """
-    if the token_search object finds a match in tokens, ask it to
-    generate a Transform object and return that.
-    """
-    return token_search.search(tokens)
+if __name__ == '__main__':
+    ts = TokenSearchByRegexp(u'with (.+) children and (.+)',
+                             ur'and has \1 children, \2 of whom go to school')
+    tokens = map(Token, 'with 4 children and 2'.split())
+    ts.apply(tokens)
+    print tokens
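
modify_tokens can also be driven directly; a minimal sketch, reusing Token from mytoken:

    tokens = map(Token, u'he is widowed'.split())
    modify_tokens(tokens, 0, 3, u'he is a widower'.split())
    # tokens is now he / is / a / widower: 'a' was inserted as a new
    # Token and 'widowed' was rewritten in place.

Splicing through the edit script rather than replacing the whole slice looks deliberate: tokens that survive the rewrite keep whatever other attributes they carry.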