Commits

Matt Chaput committed b8fc9b0

Continuing work on DAWG.

Comments (0)

Files changed (7)

docs/source/api/spelling.rst

 ``spelling`` module
 ===================
 
-See :doc:`how to use the Whoosh spell checker <../spelling>`.
+See :doc:`correcting errors in user queries <../spelling>`.
 
 .. automodule:: whoosh.spelling
 
-.. autoclass:: SpellChecker
-    :inherited-members:
-    :members:
+

src/whoosh/filedb/filereading.py

                 raise Exception("Field(s) %r have spelling=True but DAWG file %r not found" % (spelled, fname))
             
             dawgfile = self.storage.open_file(fname, mapped=False)
-            self.dawg = DawgReader(dawgfile)
+            self.dawg = DawgReader(dawgfile).root
         
         self.dc = segment.doc_count_all()
         assert self.dc == self.storedfields.length
         if not self.schema[fieldname].spelling:
             return False
         if self.dawg:
-            return fieldname in self.dawg.fields
+            return fieldname in self.dawg
         return False
 
     def word_graph(self, fieldname):
         if not self.has_word_graph(fieldname):
             raise Exception("No word graph for field %r" % fieldname)
-        return self.dawg.field_root(fieldname)
-
-    def _field_root(self, fieldname):
-        if not self.has_word_graph(fieldname):
-            raise Exception("No word graph for field %r" % fieldname)
-        
-        return self.dawg.field_root(fieldname)
+        return self.dawg.edge(fieldname)
     
     # Field cache methods
 

src/whoosh/filedb/filewriting.py

                             (lastfn, lasttext, fieldname, text))
     
         if fieldname in self.hasdawg:
-            self.dawg.add(fieldname, text)
-    
+            self.dawg.insert((fieldname, ) + tuple(text))
+        
         if fieldname != lastfn:
             self.format = self.schema[fieldname].format
         
         self.termsindex.close()
         self.postwriter.close()
         if self.dawg:
-            self.dawg.close()
+            self.dawg.write()
         
-        
+
 def add_spelling(ix, fieldnames, force=False):
     """Adds spelling files to an existing index that was created without
     them, and modifies the schema so the given fields have the ``spelling``
         f = storage.create_file(filename)
         dw = DawgWriter(f)
         for fieldname in fieldnames:
+            ft = (fieldname, )
             for word in r.lexicon(fieldname):
-                dw.add(fieldname, word)
+                dw.insert(ft + tuple(word))
         dw.close()
     writer.commit()
 

src/whoosh/reading.py

         
         raise NotImplementedError
     
+    def corrector(self, fieldname):
+        """Returns a :class:`whoosh.spelling.Corrector` object that suggests
+        corrections based on the terms in the given field.
+        """
+        
+        from whoosh.spelling import ReaderCorrector
+        return ReaderCorrector(self, fieldname)
+    
     def terms_within(self, fieldname, text, maxdist, prefix=0, seen=None):
         """Returns a generator of words in the given field within ``maxdist``
         Damerau-Levenshtein edit distance of the given text.
         
+        :param maxdist: the maximum edit distance.
         :param prefix: require suggestions to share a prefix of this length
             with the given word. This is often justifiable since most
             misspellings do not involve the first letter of the word.
             Using a prefix dramatically decreases the time it takes to generate
             the list of words.
+        :param seen: an optional set object. Words that appear in the set will
+            not be yielded.
         """
         
         if self.has_word_graph(fieldname):

src/whoosh/searching.py

         # Copy attributes/methods from wrapped reader
         for name in ("stored_fields", "all_stored_fields", "vector", "vector_as",
                      "lexicon", "frequency", "doc_frequency", 
-                     "field_length", "doc_field_length", "max_field_length"):
+                     "field_length", "doc_field_length", "max_field_length",
+                     "corrector"):
             setattr(self, name, getattr(self.ixreader, name))
 
     def __enter__(self):
             for docnum in q.docs(self):
                 yield docnum
 
-    def suggest(self, fieldname, text, limit=5, maxdist=2, prefix=0,
-                ranking=None):
+    def suggest(self, fieldname, text, limit=5, maxdist=2, prefix=0):
         """Returns a sorted list of suggested corrections for the given
-        mis-typed word based on the contents of the given field.
+        mis-typed word ``text`` based on the contents of the given field::
         
-        See :meth:`whoosh.spelling.suggest` for more information.
+            >>> searcher.suggest("content", "specail")
+            ["special"]
+        
+        This is a convenience method. If you are planning to get suggestions
+        for multiple words in the same field, it is more efficient to get a
+        :class:`~whoosh.spelling.Corrector` object and use it directly::
+        
+            corrector = searcher.corrector("fieldname")
+            for word in words:
+                print corrector.suggest(word)
+        
+        :param limit: only return up to this many suggestions. If there are not
+            enough terms in the field within ``maxdist`` of the given word, the
+            returned list will be shorter than this number.
+        :param maxdist: the largest edit distance from the given word to look
+            at. Numbers higher than 2 are not very effective or efficient.
+        :param prefix: require suggestions to share a prefix of this length
+            with the given word. This is often justifiable since most
+            misspellings do not involve the first letter of the word. Using a
+            prefix dramatically decreases the time it takes to generate the
+            list of words.
         """
         
-        return suggest(self.reader(), fieldname, text, limit=limit,
-                       maxdist=maxdist, prefix=prefix, ranking=ranking)
-
+        c = self.reader().corrector(fieldname)
+        return c.suggest(text, limit=limit, maxdist=maxdist, prefix=prefix)
+    
     def key_terms(self, docnums, fieldname, numterms=5,
                   model=classify.Bo1Model, normalize=True):
         """Returns the 'numterms' most important terms from the documents

src/whoosh/spelling.py

 from collections import defaultdict
 from heapq import heappush, heapreplace
 
+import whoosh.support.dawg as dawg
 from whoosh import analysis, fields, query, scoring
-from whoosh.support.dawg import within
 from whoosh.support.levenshtein import distance
 
 
-def default_ranking(reader, fieldname, word, k):
-    """This function ranks suggestions first by lowest Damerau-Levenshtein edit
-    distance, then by highest term frequency, so more common words will be
-    suggested first.
+# Suggestion scorers
+
+def simple_scorer(word, cost):
+    """Ranks suggestions by the edit distance.
     """
     
-    return (k, 0 - reader.frequency(fieldname, word))
-
-
-def suggest(reader, fieldname, text, limit=5, maxdist=2, prefix=0,
-            ranking=None):
-    """Returns a sorted list of suggested corrections for the given
-    mis-typed word based on the contents of the given field.
-    
-    >>> r = ix.reader()
-    >>> suggest(r, "text", "specail")
-    [u'special']
-    
-    :param reader: an object which implements the ``terms_within`` method.
-    :param fieldname: the field to use for words. This may be None if the
-        "reader" does not support fields.
-    :param limit: only return up to this many suggestions. If there are not
-        enough terms in the field within ``maxdist`` of the given word, the
-        returned list will be shorter than this number.
-    :param maxdist: the largest edit distance from the given word to look
-        at. Numbers higher than 2 are not very effective or efficient.
-    :param prefix: require suggestions to share a prefix of this length
-        with the given word. This is often justifiable since most misspellings
-        do not involve the first letter of the word. Using a prefix
-        dramatically decreases the time it takes to generate the list of words.
-    :param ranking: a custom ranking function. If this argument is ``None``,
-        :func:`default_ranking` is used. The custom function should accept the
-        arguments ``reader, fieldname, word, k`` (where k is the edit
-        distance) and return a ranking value, where lower values mean a better
-        suggestion.
-    """
-    
-    if ranking is None:
-        ranking = default_ranking
-    
-    heap = []
-    seen = set()
-    root = reader.word_graph(fieldname)
-    for k in xrange(1, maxdist+1):
-        for sug in within(root, text, k, prefix=prefix, seen=seen):
-            item = (ranking(reader, fieldname, sug, k), sug)
-            if len(heap) < limit:
-                heappush(heap, item)
-            elif item < heap[0]:
-                heapreplace(heap, item)
-        
-        if len(heap) >= limit:
-            break
-    
-    return [sug for _, sug in sorted(heap)]
+    return cost
 
 
 class Corrector(object):
     words based on a word list. Note that if you want to generate suggestions
     based on the content of a field in an index, you should turn spelling on
     for the field and use :func:`suggest` instead of this object.
-    
     """
     
-    def __init__(self):
-        pass
+    def score(self, word, cost):
+        """Returns a rank value (where lower values are better suggestions)
+        for the given word and "cost" (usually edit distance).
+        """
+        
+        return cost
+        
+    def suggest(self, text, limit=5, maxdist=2, prefix=0):
+        """
+        :param text: the text to check.
+        :param limit: only return up to this many suggestions. If there are not
+            enough terms in the field within ``maxdist`` of the given word, the
+            returned list will be shorter than this number.
+        :param maxdist: the largest edit distance from the given word to look
+            at. Numbers higher than 2 are not very effective or efficient.
+        :param prefix: require suggestions to share a prefix of this length
+            with the given word. This is often justifiable since most
+            misspellings do not involve the first letter of the word. Using a
+            prefix dramatically decreases the time it takes to generate the
+            list of words.
+        """
+        
+        score = self.score
+        suggestions = self.suggestions
+        
+        heap = []
+        seen = set()
+        for k in xrange(1, maxdist+1):
+            for sug in suggestions(text, k, prefix, seen):
+                item = (score(sug, k), sug)
+                if len(heap) < limit:
+                    heappush(heap, item)
+                elif item < heap[0]:
+                    heapreplace(heap, item)
+            
+            # If the heap is already at the required length, don't bother going
+            # to a higher edit distance
+            if len(heap) >= limit:
+                break
+        
+        return [sug for _, sug in sorted(heap)]
+        
+    def suggestions(self, text, maxdist, prefix, seen):
+        """Low-level method that yields a series of ("suggestion", cost) tuples.
+        
+        :param text: the text to check.
+        :param maxdist: the maximum edit distance.
+        :param prefix: require suggestions to share a prefix of this length
+            with the given word.
+        :param seen: a set object with which to track already-seen words.
+        """
+        
+        raise NotImplementedError
+        
+
+class ReaderCorrector(Corrector):
+    """Suggests corrections based on the content of a field in a reader.
+    
+    Ranks suggestions by the edit distance, then by highest to lowest
+    frequency.
+    """
+    
+    def __init__(self, reader, fieldname):
+        self.reader = reader
+        self.fieldname = fieldname
+    
+    def score(self, word, cost):
+        return (cost, 0 - self.reader.frequency(self.fieldname, word))
+    
+    def suggestions(self, text, maxdist, prefix, seen):
+        return self.reader.terms_within(self.fieldname, text, maxdist,
+                                        prefix=prefix, seen=seen)
+        
+
+def wordlist_to_graph_file(wordlist, dbfile):
+    """Writes a word graph file from a list of words.
+    
+    >>> # Open a dictionary file with one word on each line
+    >>> dictfile = open("mywords.txt")
+    >>> # Write the words to a word graph file
+    >>> wordlist_to_graph_file(dictfile, "mywords.dawg")
+    
+    :param wordlist: an iterable containing the words for the graph. The words
+        must be in sorted order.
+    :param dbfile: a filename string or file-like object to write the word
+        graph to. If you pass a file-like object, it will be closed when the
+        function completes.
+    """
+    
+    from whoosh.filedb.structfile import StructFile
+    
+    dw = dawg.DawgWriter()
+    for word in wordlist:
+        dw.insert(word)
+    
+    if isinstance(dbfile, basestring):
+        dbfile = open(dbfile, "wb")
+    if not isinstance(dbfile, StructFile):
+        dbfile = StructFile(dbfile)
+    dw.write(dbfile)
+    dbfile.close()
+
+
+class GraphCorrector(Corrector):
+    """Suggests corrections based on the content of a word list.
+    
+    By default ranks suggestions based on the edit distance.
+    """
+
+    def __init__(self, word_graph, ranking=None):
+        self.word_graph = word_graph
+        self.score = ranking or simple_scorer
+    
+    def suggestions(self, text, maxdist, prefix, seen):
+        return dawg.within(self.word_graph, text, maxdist, prefix=prefix,
+                           seen=seen)
+    
+    def save(self, filename):
+        f = open(filename, "wb")
+        self.word_graph.write(f)
+        f.close()
+    
+    @classmethod
+    def from_word_list(cls, wordlist, ranking=None, fieldname=""):
+        dw = dawg.DawgWriter()
+        for word in wordlist:
+            dw.add(fieldname, word)
+        return cls(dw.root, ranking=ranking)
+    
+    @classmethod
+    def from_graph_file(cls, dbfile, ranking=None, fieldname=""):
+        dr = dawg.DawgReader(dbfile)
+        return cls(dr.field_root(fieldname), ranking=ranking)
     
 
 # Old, obsolete spell checker
 
 class SpellChecker(object):
-    """This feature is obsolete. Instead use either a field with spelling
-    turned on or a :class:`Corrector`.
-    
-    Implements a spell-checking engine using a search index for the backend
-    storage and lookup. This class is based on the Lucene contributed spell-
-    checker code.
-    
-    To use this object::
-    
-        st = store.FileStorage("spelldict")
-        sp = SpellChecker(st)
-        
-        sp.add_words([u"aardvark", u"manticore", u"zebra", ...])
-        # or
-        ix = index.open_dir("index")
-        sp.add_field(ix, "content")
-        
-        suggestions = sp.suggest(u"ardvark", number = 2)
+    """This feature is obsolete.
     """
 
     def __init__(self, storage, indexname="SPELL",

src/whoosh/support/dawg.py

     
 
 class DawgWriter(object):
-    def __init__(self, dbfile, reduced=True):
+    def __init__(self, dbfile=None, reduced=True):
         self.dbfile = dbfile
         self.reduced = reduced
         
         # List of unique nodes that have been checked for duplication.
         self.minimized = {}
         
-        # Maps fieldnames to node starts
-        self.fields = {}
-        self._reset()
-        
-        dbfile.write_int(0)  # File flags
-        dbfile.write_uint(0)  # Pointer to field index
-    
-    def _reset(self):
-        self.fieldname = None
         self.root = BuildNode()
         self.offsets = {}
     
-    def add(self, fieldname, text):
-        if fieldname != self.fieldname:
-            if fieldname in self.fields:
-                raise Exception("I already wrote %r!" % fieldname)
-            if self.fieldname is not None:
-                self._write_field()
-            self.fieldname = fieldname
-        
-        self.insert(text)
-    
     def insert(self, word):
         if word < self.lastword:
             raise Exception("Out of order %r..%r." % (self.lastword, word))
                 self.minimized[child] = child;
             self.unchecked.pop()
 
-    def close(self):
-        if self.fieldname is not None:
-            self._write_field()
-        dbfile = self.dbfile
+    def write(self, dbfile=None):
+        dbfile = self.dbfile or dbfile
+        dbfile.write_int(0)  # File flags
+        dbfile.write_uint(0)  # Pointer to root node
         
-        self.indexpos = dbfile.tell()
-        dbfile.write_pickle(self.fields)
-        dbfile.flush()
-        dbfile.seek(_INT_SIZE)
-        dbfile.write_uint(self.indexpos)
-        dbfile.close()
-    
-    def _write_field(self):
         self._minimize(0)
         root = self.root
         if self.reduced:
             reduce(root)
-        self.fields[self.fieldname] = self._write_node(root)
+        offset = self._write_node(root)
         self._reset()
         
+        dbfile.flush()
+        dbfile.seek(_INT_SIZE)
+        dbfile.write_uint(offset)
+        dbfile.close()
+    
     def _write_node(self, node):
         dbfile = self.dbfile
         keys = node._edges.keys()
 
 
 class ComboNode(BaseNode):
+    """Base class for DAWG nodes that blend the nodes of two different graphs.
+    
+    Concrete subclasses need to implement the ``edge()`` method and possibly
+    the ``final`` property.
+    """
+    
     def __init__(self, a, b):
         self.a = a
         self.b = b
         return self.a.final or self.b.final
     
 
-class PriorityNode(ComboNode):
-    def edge(self, key):
-        if key in self.a:
-            return self.a.edge(key)
-        else:
-            return self.b.edge(key)
-        
-
-class MixedNode(ComboNode):
+class UnionNode(ComboNode):
+    """Makes two graphs appear to be the union of the two graphs.
+    """
+    
     def edge(self, key):
         a = self.a
         b = self.b
         if key in a and key in b:
-            return MixedNode(a.edge(key), b.edge(key))
+            return UnionNode(a.edge(key), b.edge(key))
         elif key in a:
             return a.edge(key)
         else:
             return b.edge(key)
         
 
+class IntersectionNode(ComboNode):
+    """Makes two graphs appear to be the intersection of the two graphs.
+    """
+    
+    def edge(self, key):
+        a = self.a
+        b = self.b
+        if key in a and key in b:
+            return IntersectionNode(a.edge(key), b.edge(key))
+
+
+# Reader for disk-based graph files
+
 class DawgReader(object):
     def __init__(self, dbfile):
         self.dbfile = dbfile
         
         dbfile.seek(0)
         self.fileflags = dbfile.read_int()
-        self.indexpos = dbfile.read_uint()
-        dbfile.seek(self.indexpos)
-        self.fields = dbfile.read_pickle()
-        
-    def field_root(self, fieldname):
-        v = self.fields[fieldname]
-        if not isinstance(v, BaseNode):
-            v = DiskNode(self, v)
-            self.fields[fieldname] = v
-        return v
-
+        self.root = DiskNode(self, dbfile.read_uint())
+    
 
 # Functions