1. Jason Scheirer
  2. match-o-matic

Commits

Jason Scheirer  committed 2710abd

Skeleton n-gram lookup (too slow, need better algorithm)

  • Participants
  • Parent commits adeb77e
  • Branches default

Comments (0)

Files changed (1)

File worddict.py

View file
 #! python
 
 import collections
+import itertools
 
 def ddmaker():
     """Return a defaultdict whose default factory is ddmaker itself.

     Looking up a missing key therefore creates and stores another
     defaultdict of the same kind, so nested levels can be indexed to
     any depth without being created explicitly first.
     """
     return collections.defaultdict(ddmaker)
 class WordLookup(object):
     def __init__(self, filename):
         self._word_lookup = ddmaker()
+        #self._ngram_index = collections.defaultdict(set)
         with open(filename, 'rb') as handle:
             for word in handle.readlines():
                 self.add_word(word.upper().strip())
         for letter in word:
             lookup = lookup[letter]
         lookup[None] = None
+        #for s, e in itertools.combinations(range(len(word) + 1), 2):
+        #    x = word[s:e]
+        #    self._ngram_index[x].add(word)
     def is_word(self, word):
         lookup = self._word_lookup
         for letter in word.upper().strip():
         return None in lookup
     def words_with_prefix(self, prefix, exact_length=-1):
         def yield_words(prefix, lookup_table, exact_length=-1):
-            if exact_length == 0:
-                return
-            if None in lookup_table:
+            if exact_length == 0 or None in lookup_table:
                 yield prefix
+                if exact_length == 0:
+                    return
             for k, v in lookup_table.iteritems():
                 if k is not None:
                     for word in yield_words(prefix + k, v, 
                                             exact_length - 1 if
-                                                exact_length > -1 else -1):
+                                                exact_length > 0 else -1):
                         if exact_length < 1:
                             yield word
         lookups = [('', self._word_lookup)]
                                      'words.txt'))
     print time.time() - x
     for word in sys.argv[1:]:
+        x = time.time()
         print "Word {:18} in dictionary: {}".format(word, 
                     'Yes' if lookup.is_word(word) else 'No')
+        for prefixed in lookup.words_with_prefix(word, len(word)):
+            print "     * {}".format(prefixed)
+        print "----"
         for prefixed in lookup.words_with_prefix(word):
             print "     * {}".format(prefixed)
+        print time.time() - x