Source

match-o-matic / worddict.py

#! python

import collections
import itertools

def ddmaker():
    return collections.defaultdict(ddmaker)

class WordLookup(object):
    def __init__(self, filename):
        self._word_lookup = ddmaker()
        self._wordbag_lookup = ddmaker()
        #self._ngram_index = collections.defaultdict(set)
        with open(filename, 'rb') as handle:
            for word in handle.readlines():
                self.add_word(word.upper().strip())
    def add_word(self, word):
        lookup = self._word_lookup
        for letter in word.upper():
            lookup = lookup[letter]
        lookup[None] = None

        bag_lookup = self._wordbag_lookup
        for letter in sorted(word.upper()):
            bag_lookup = bag_lookup[letter]
        if not None in bag_lookup:
            bag_lookup[None] = set([word])
        else:
            bag_lookup[None].add(word)
        #for s, e in itertools.combinations(range(len(word) + 1), 2):
        #    x = word[s:e]
        #    self._ngram_index[x].add(word)
    def is_word(self, word):
        lookup = self._word_lookup
        for letter in word.upper().strip():
            if letter in lookup or letter == '_':
                lookup = lookup[letter]
            else:
                return False
        return None in lookup
    def words_with_prefix(self, prefix, exact_length=-1):
        def yield_words(prefix, lookup_table, exact_length=-1):
            if None in lookup_table:
                yield prefix
            if exact_length == 0:
                return
            for k, v in lookup_table.iteritems():
                if k is not None:
                    for word in yield_words(prefix + k, v, 
                                            exact_length - 1 if
                                                exact_length > 0 else -1):
                        if exact_length < 1:
                            yield word
        lookups = [('', self._word_lookup)]
        for letter in prefix.upper().strip():
            if exact_length > 0:
                exact_length -= 1
            elif exact_length == 0:
                return
            if letter == '_':
                lookups = sum(([(lookup[0] + k, l) 
                                    for k, l in lookup[1].iteritems() if l]
                                    for lookup in lookups), [])
            else:
                lookups = [(lookup[0] + letter, lookup[1][letter])
                                for lookup in lookups
                                if letter in lookup[1]]
        if lookups:
            for prefix, lookup in lookups:
                for word in yield_words(prefix, lookup, exact_length):
                    yield word
    def matches(self, word):
        for word in self.words_with_prefix(word, len(word)):
            yield word
    def matches_for_bag(self, letterbag, use_whole_bag=False):
        wildcards = len(letterbag) - (len(letterbag) -
                                      len(letterbag.replace('_', '')))
        sort_bag = sorted(letterbag.replace('_', '').upper())
        lengths = ([len(letterbag)] 
                        if use_whole_bag else xrange(2, len(letterbag) + 1))
        seen_words = set()
        for subset in lengths:
            for smallerbag in itertools.combinations(sort_bag, subset):
                bag_lookup = self._wordbag_lookup
                for letter in smallerbag:
                    if letter in bag_lookup:
                        bag_lookup = bag_lookup[letter]
                    if None in bag_lookup:
                        for word in bag_lookup[None]:
                            if word in seen_words:
                                pass
                            else:
                                seen_words.add(word)
                                yield word

if __name__ == "__main__":
    import os
    import sys
    import time
    x = time.time()
    lookup = WordLookup(os.path.join(os.path.dirname(
                                        os.path.abspath(
                                            __file__)),
                                     'words.txt'))
    print "Loaded words:", time.time() - x
    for word in sys.argv[1:]:
        x = time.time()
        print "Word {:18} in dictionary: {}".format(word, 
                    'Yes' if lookup.is_word(word) else 'No')
        for prefixed in lookup.words_with_prefix(word, len(word)):
            pass
            print "     * {}".format(prefixed)
        print "----"
        for prefixed in lookup.words_with_prefix(word):
            pass
            print "     * {}".format(prefixed)
        print "----"
        for bagged in lookup.matches_for_bag(word):
            pass
            print "     * {}".format(bagged)
        print time.time() - x
        print "========"