Commits

david_walker committed a7cbe57

tokensearch.py: finish get_levenshtein_dist and modify_tokens
rules.py: do whitespace split early

Comments (0)

Files changed (2)

             get_right_neighbor(lst, i, attrib_name))
 
 
+def find_token_pattern(tokens, search_exprs):
+    """Return the index of the first occurrence of a sequence of tokens
+    matching the search expressions in `search_exprs`.
+
+    Arguments:
+    tokens -- a list of Token objects
+    search_exprs -- a list of TokenSearch objects
+    """
+
+
 class RegexCleanupRule(Rule):
     """Generate transforms for tokens matching any of a variety of
     regular expressions used to clean up common errors.
     phase = INITIAL_PHASE
 
     def __init__(self):
-        Rule.__init__(self, 30, 1.0, "Separate text by whitespace.")
+        Rule.__init__(self, 5, 1.0, "Separate text by whitespace.")
 
     def get_transforms(self, tokens):
         self.tokens = tokens
 from collections import namedtuple
 import sys
 
-OpCode = namedtuple('OpCode', ['opcode', 'token'])
+OpCode = namedtuple('OpCode', ['opcode', 'token', 'row', 'col'])
 
 
 class ScoreHist(object):
 
 
 def get_levenshtein_dist(source_tokens, target_tokens):
-    """Change the source_tokens to match the target_tokens, preserving as
-    much of the source token data as possible.
+    """Return a minimal list of operations required to transform the
+    source tokens into the target tokens.
     """
-    # Compute the Levenshtein Distance between source and new token
-    # lists. (See http://en.wikipedia.org/wiki/Levenshtein_distance)
+    # Compute the Levenshtein Distance between source and target token
+    # lists. (See http://en.wikipedia.org/wiki/Levenshtein_distance for
+    # an explanation and http://www.merriampark.com/ld.htm for public-
+    # domain implementations.)
     #
+    # This results in a set of instructions for transforming the source
+    # tokens into the target tokens using the minimum number of copy,
+    # delete, insert, and change operations.
     #
     #
-    # For all i and j, d[i,j].score will hold the Levenshtein distance
-    # between the first i elements of source_tokens and the first j elements of
-    # target_tokens.
 
     num_source_tokens = len(source_tokens)
     num_target_tokens = len(target_tokens)
         # empty source string
         d[row][0].score = row
         if row > 0:
-            d[row][0].hist = [OpCode('i', target_tokens[row - 1])]
+            d[row][0].hist = [OpCode('i', target_tokens[row - 1], row, col)]
     for col in range(num_source_tokens + 1):
         # init row 0 with the distance of any second string to an empty
         # first string
         d[0][col].score = col
         if col > 0:
-            d[0][col].hist = [OpCode('d', source_tokens[col - 1])]
+            d[0][col].hist = [OpCode('d', source_tokens[col - 1], row, col)]
 
     for col in range(1, num_source_tokens + 1):
         for row in range(1, num_target_tokens + 1):
             if source_tokens[col - 1] == target_tokens[row - 1]:
                 d[row][col].score = d[row - 1][col - 1].score
                 d[row][col].hist = d[row - 1][col - 1].hist + [
-                    OpCode('c', target_tokens[row - 1])]
+                    OpCode('c', target_tokens[row - 1], row, col)]
             else:
-                #print 'row', row, 'col', col
-                del_cost = d[row - 1][col].score + 1
-                ins_cost = d[row][col - 1].score + 1
+                del_cost = d[row][col - 1].score + 1
+                ins_cost = d[row - 1][col].score + 1
                 sub_cost = d[row - 1][col - 1].score + 1
 
                 d[row][col].score = min(del_cost, ins_cost, sub_cost)
 
-                # if row > column then insert is required
-                # if row < column then delete is required
-                # if row == column then sub is required
+                if row > col:
+                    if sub_cost < ins_cost:
+                        if not (d[row][col].score == sub_cost):
+                            print '[{}, {}] score={}, ins_cost={}, del_cost={}, SUB_COST={}'.format(
+                                row, col, d[row][col].score, ins_cost, del_cost, sub_cost)
+                            print_table('', source_tokens, target_tokens, d)
+                            assert(False)
+                        d[row][col].hist = d[row - 1][col - 1].hist + [
+                            OpCode('s', target_tokens[row - 1], row, col)]
+                    else:
+                        if not (d[row][col].score == ins_cost):
+                            print '[{}, {}] score={}, INS_COST={}, del_cost={}, sub_cost={}'.format(
+                                row, col, d[row][col].score, ins_cost, del_cost, sub_cost)
+                            print_table('', source_tokens, target_tokens, d)
+                            assert(False)
+                        d[row][col].hist = d[row - 1][col].hist + [
+                            OpCode('i', target_tokens[row - 1], row, col)]
+                elif row < col:
+                    if sub_cost < del_cost:
+                        assert(sub_cost < ins_cost)
+                        if not (d[row][col].score == sub_cost):
+                            print '[{}, {}] score={}, ins_cost={}, del_cost={}, SUB_COST={}'.format(
+                                row, col, d[row][col].score, ins_cost, del_cost, sub_cost)
+                            print_table('', source_tokens, target_tokens, d)
+                            assert(False)
+                        d[row][col].hist = d[row - 1][col - 1].hist + [
+                            OpCode('s', target_tokens[row - 1], row, col)]
+                    else:
+                        if not (d[row][col].score == del_cost):
+                            print '[{}, {}] score={}, ins_cost={}, DEL_COST={}, sub_cost={}'.format(
+                                row, col, d[row][col].score, ins_cost, del_cost, sub_cost)
+                        if not (del_cost < ins_cost):
+                            print '[{}, {}] score={}, ins_cost={}, DEL_COST={}, sub_cost={}'.format(
+                                row, col, d[row][col].score, ins_cost, del_cost, sub_cost)
 
-                if row > col:
-                    assert(d[row][col].score == ins_cost)
-                    d[row][col].hist = d[row][col - 1].hist + [
-                        OpCode('i', target_tokens[row - 1])]
-                elif row < col:
-                    assert(d[row][col].score == del_cost)
-                    d[row][col].hist = d[row - 1][col].hist + [
-                        OpCode('d', source_tokens[col - 1])]
-                else:
-                    assert(d[row][col].score == sub_cost)
-                    d[row][col].hist = d[row - 1][col - 1].hist + [
-                        OpCode('s', target_tokens[row - 1])]
-    print_table('', source_tokens, target_tokens, d)
+                        d[row][col].hist = d[row][col - 1].hist + [
+                            OpCode('d', source_tokens[col - 1], row, col)]
+                else: # on the diagonal
+                    if d[row][col].score == sub_cost:
+                        d[row][col].hist = d[row - 1][col - 1].hist + [
+                        OpCode('s', target_tokens[row - 1], row, col)]
+                    elif d[row][col].score == ins_cost:
+                        d[row][col].hist = d[row - 1][col].hist + [
+                        OpCode('i', target_tokens[row - 1], row, col)]
+                    else:
+                        d[row][col].hist = d[row][col - 1].hist + [
+                        OpCode('d', source_tokens[col - 1], row, col)]
     return d[num_target_tokens][num_source_tokens].hist
 
 
     # target_tokens.
     source_idx = 0
     for op in operations:
-        if op.opcode == 'c':
-            print 'copy', op.token
-            source_idx += 1
-        elif op.opcode == 'd':
-            print 'delete', source_tokens[source_idx]
+        # opcode 'c' (copy) is a no-op
+        if op.opcode == 'd':
             source_tokens.pop(source_idx)
         elif op.opcode == 'i':
-            print 'insert', op.token, 'before index', source_idx
             source_tokens.insert(source_idx, op.token)
         elif op.opcode == 's':
-            print 'sub', op.token, 'for', source_tokens[source_idx]
             source_tokens[source_idx] = op.token
+
+        if op.opcode != 'd':
             source_idx += 1
-    print 'result:', source_tokens
+    return source_tokens
+
+def split_chars(s):
+    return [c for c in s]
+
+if __name__ == '__main__':
+    result_tokens = modify_tokens(split_chars(sys.argv[1]), split_chars(sys.argv[2]))
+    result = ''.join(result_tokens)
+    if result == sys.argv[2]:
+        print 'Correct:', result
+    else:
+        print 'ERROR:', result