Commits

Matt Chaput committed 9476d2b

Patches to fix Python 2.5 compatibility and matcher inheritance. Thanks, Daniel and Bernd!
Restored FileIndex.schema as a property forwarding to FileIndex._current_schema().
Restored Searcher.schema as a property forwarding to the wrapped IndexReader's schema attribute.
Removed debug print from test_fields.py.
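
The restored properties can be exercised with a small illustrative setup (a RamStorage-backed index, as in the test suite; the field names here are arbitrary):

    from whoosh import fields
    from whoosh.filedb.filestore import RamStorage

    schema = fields.Schema(id=fields.ID(stored=True), body=fields.TEXT)
    ix = RamStorage().create_index(schema)

    print ix.schema    # property forwarding to FileIndex._current_schema()
    s = ix.searcher()
    print s.schema     # property forwarding to the wrapped IndexReader's schema attribute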

  • Parent commits 62d545a

Files changed (12)

File .pydevproject

-<?xml version="1.0" encoding="UTF-8" standalone="no"?>
-<?eclipse-pydev version="1.0"?>
-
-<pydev_project>
-<pydev_property name="org.python.pydev.PYTHON_PROJECT_INTERPRETER">Python 2.6</pydev_property>
-<pydev_property name="org.python.pydev.PYTHON_PROJECT_VERSION">python 2.6</pydev_property>
-<pydev_pathproperty name="org.python.pydev.PROJECT_SOURCE_PATH">
-<path>/whoosh/src</path>
-<path>/whoosh/tests</path>
-<path>/whoosh/benchmark</path>
-<path>/whoosh/stress</path>
-</pydev_pathproperty>
-</pydev_project>
-
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<?eclipse-pydev version="1.0"?>
+
+<pydev_project>
+<pydev_property name="org.python.pydev.PYTHON_PROJECT_INTERPRETER">Python 2.5</pydev_property>
+<pydev_property name="org.python.pydev.PYTHON_PROJECT_VERSION">python 2.5</pydev_property>
+<pydev_pathproperty name="org.python.pydev.PROJECT_SOURCE_PATH">
+<path>/whoosh/src</path>
+<path>/whoosh/tests</path>
+<path>/whoosh/benchmark</path>
+<path>/whoosh/stress</path>
+</pydev_pathproperty>
+</pydev_project>

File benchmark/enron.py

 from __future__ import division
 from bz2 import compress, decompress
 from email import message_from_string
-import gc, marshal, os.path, tarfile
+import os.path, tarfile
 from marshal import dump, load
+from optparse import OptionParser
+from urllib import urlretrieve
 
-from whoosh import analysis, index
-from whoosh.fields import *
-from whoosh.filedb import pools
+from whoosh import analysis, fields, index, qparser
 from whoosh.util import now
 
 
-enronURL = "http://www.cs.cmu.edu/~enron/"
+# Archive home page: http://www.cs.cmu.edu/~enron/
+enron_archive_url = "http://www.cs.cmu.edu/~enron/enron_mail_082109.tar.gz"
+enron_archive_filename = "enron_mail_082109.tar.gz"
+cache_filename = "enron_cache.pickle"
 
 ana = analysis.StemmingAnalyzer(maxsize=40)
-schema = Schema(body=TEXT(analyzer=ana, stored=True), date=ID(stored=True),
-                frm=ID(stored=True), to=IDLIST(stored=True),
-                subject=TEXT(stored=True), cc=IDLIST, bcc=IDLIST)
+schema = fields.Schema(body=fields.TEXT(analyzer=ana, stored=True),
+                       date=fields.ID(stored=True),
+                       frm=fields.ID(stored=True),
+                       to=fields.IDLIST(stored=True),
+                       subject=fields.TEXT(stored=True),
+                       cc=fields.IDLIST,
+                       bcc=fields.IDLIST)
 
 header_to_field = {"Date": "date", "From": "frm", "To": "to",
                    "Subject": "subject", "Cc": "cc", "Bcc": "bcc"}
 
-def get_texts(tarfilename):
-    archive = tarfile.open(tarfilename)
+
+# Functions for downloading and then reading the email archive and caching
+# the messages in an easier-to-digest format
+
+def download_archive(archive):
+    print "Downloading Enron email archive to %r..." % archive
+    t = now()
+    urlretrieve(enron_archive_url, archive)
+    print "Downloaded in ", now() - t, "seconds"
+    
+def get_texts(archive):
+    archive = tarfile.open(archive, "r:gz")
     while True:
         entry = archive.next()
         archive.members = []
             text = f.read()
             yield text
 
-def get_messages(tarfilename, headers=True):
-    s = set()
-    for text in get_texts(tarfilename):
+def get_messages(archive, headers=True):
+    for text in get_texts(archive):
         message = message_from_string(text)
         body = message.as_string().decode("latin_1")
         blank = body.find("\n\n")
                     d[fn] = v.decode("latin_1")
         yield d
         
-def cache_messages(tarfilename, cachename, headers=True):
-    f = open(cachename, "wb")
+def cache_messages(archive, cache):
+    print "Caching messages in %s..." % cache
+    
+    if not os.path.exists(archive):
+        raise Exception("Archive file %r does not exist" % archive)
+    
+    t = now()
+    f = open(cache, "wb")
     c = 0
-    for d in get_messages(tarfilename):
+    for d in get_messages(archive):
         c += 1
         dump(d, f)
         if not c % 1000: print c
     f.close()
-    
-def get_cached_messages(cachename):
-    f = open(cachename, "rb")
+    print "Cached messages in ", now() - t, "seconds"
+
+
+# Functions for reading the cached messages
+
+def get_cached_messages(cache):
+    f = open(cache, "rb")
     try:
         while True:
             d = load(f)
         pass
     f.close()
 
-def do_index(cachename, chunk=1000, skip=1, upto=600000, **kwargs):
-    if not os.path.exists("testindex"):
-        os.mkdir("testindex")
-    ix = index.create_in("testindex", schema)
+
+# Main function for indexing the cached messages
+
+def do_index(cache, indexname, chunk=1000, skip=1, upto=600000, **kwargs):
+    print "Indexing..."
+    if not os.path.exists(indexname):
+        os.mkdir(indexname)
+    ix = index.create_in(indexname, schema)
     
-    #w = ix.writer(**kwargs)
-    from whoosh.filedb.multiproc import MultiSegmentWriter
-    w = MultiSegmentWriter(ix, **kwargs)
-    
+    w = ix.writer(**kwargs)
     starttime = chunkstarttime = now()
     c = 0
     skipc = skip
-    for d in get_cached_messages(cachename):
+    for d in get_cached_messages(cache):
         skipc -= 1
         if not skipc:
             d["_stored_body"] = compress(d["body"])
     print "Total", (committime - starttime), "for", c
 
 
+# Main function for testing the archive
+
+def do_search(indexname, q, limit=10):
+    ix = index.open_dir(indexname)
+    s = ix.searcher()
+    q = qparser.QueryParser("body", schema=s.schema).parse(q)
+    print "query=", q
+    r = s.search(q, limit=limit)
+    print "result=", r
+    for i, d in enumerate(r):
+        print i, d.get("subject")
+
+
 if __name__=="__main__":
-    #t = now()
-    #cache_messages("c:/Documents and Settings/matt/Desktop/Search/enron_mail_030204.tar", "messages.bin")
-    #print now() - t
+    parser = OptionParser()
+    parser.add_option("-d", "--dir", dest="dir", metavar="DIRNAME",
+                      help="directory in which to store files, index, etc.",
+                      default=".")
+    parser.add_option("-s", "--setup", dest="setup", action="store_true",
+                      help="Download and cache the document archive if necessary.",
+                      default=False)
+    parser.add_option("-i", "--index", dest="index", action="store_true",
+                      help="Index the documents.",
+                      default=False)
+    parser.add_option("-n", "--name", dest="indexname",
+                      help="Name of the index directory",
+                      default="index")
+    parser.add_option("-m", "--mb", dest="limitmb",
+                      help="Memory size, in MB",
+                      default="256")
+    parser.add_option("-c", "--chunk", dest="chunk",
+                      help="Report indexing progress in chunks of this many documents.",
+                      default="1000")
+    parser.add_option("-k", "--skip", dest="skip",
+                      help="Skip this many documents before indexing a document.",
+                      default="1")
+    parser.add_option("-u", "--upto", dest="upto",
+                      help="Only index up to this document.",
+                      default="600000")
+    parser.add_option("-p", "--procs", dest="procs",
+                      help="Use this many processors to index.",
+                      default="1")
+    parser.add_option("-l", "--limit", dest="limit",
+                      help="Maximum number of results to display for a search.",
+                      default="10")
+    parser.add_option("-P", "--pool", dest="pool", action="store_true", default=False)
+    options, args = parser.parse_args()
     
-    do_index("messages.bin", limitmb=128, procs=2, upto=1000)
     
-    #import cProfile
-    #cProfile.run('do_index("messages.bin", limitmb=128, upto=10000)', "index.profile")
-    #from pstats import Stats
-    #p = Stats("index.profile")
-    #p.sort_stats("time").print_stats()
+    archive = os.path.abspath(os.path.join(options.dir, enron_archive_filename))
+    cache = os.path.abspath(os.path.join(options.dir, cache_filename))
     
-    from whoosh.query import Term
-    from whoosh.support.bitvector import BitSet, BitVector
-    from sys import getsizeof
+    if options.setup:
+        if not os.path.exists(archive):
+            download_archive(archive)
+        else:
+            print "Archive is OK"
+        
+        if not os.path.exists(cache):
+            cache_messages(archive, cache)
+        else:
+            print "Cache is OK"
+            
+    if options.index:
+        poolclass = None
+        if options.pool:
+            from whoosh.filedb.pools2 import AltPool
+            poolclass = AltPool
+        do_index(cache, options.indexname, chunk=int(options.chunk),
+                 skip=int(options.skip), upto=int(options.upto),
+                 procs=int(options.procs), limitmb=int(options.limitmb),
+                 poolclass=poolclass)
     
-    t = now()
-    ix = index.open_dir("testindex")
-    s = ix.searcher()
-    print now() - t
+    if args:
+        qs = args[0].decode("utf8")
+        print "Query string=", repr(qs)
+        do_search(options.indexname, qs, limit=int(options.limit))
     
-    q = Term("body", u"enron")
-    t = now()
-    r = s.search(q)
-    print now() - t
+#    #t = now()
+#    #cache_messages("c:/Documents and Settings/matt/Desktop/Search/enron_mail_030204.tar", "messages.bin")
+#    #print now() - t
+#    
+#    do_index("messages.bin", limitmb=128, procs=2, upto=1000)
+#    
+#    #import cProfile
+#    #cProfile.run('do_index("messages.bin", limitmb=128, upto=10000)', "index.profile")
+#    #from pstats import Stats
+#    #p = Stats("index.profile")
+#    #p.sort_stats("time").print_stats()
+#    
+#    from whoosh.query import Term
+#    from whoosh.support.bitvector import BitSet, BitVector
+#    from sys import getsizeof
+#    
+#    t = now()
+#    ix = index.open_dir("testindex")
+#    s = ix.searcher()
+#    print now() - t
+#    
+#    q = Term("body", u"enron")
+#    t = now()
+#    r = s.search(q)
+#    print now() - t
+#    
+#    for doc in r:
+#        print doc["subject"]
     
-    for doc in r:
-        print doc["subject"]
-    
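
Roughly, the new OptionParser-driven flow (--setup, --index, then a positional query string) reduces to the sketch below, written against the module's own helpers; the filenames are the module-level defaults and "index" is the default --name value:

    import os.path

    archive = os.path.abspath(enron_archive_filename)   # enron_mail_082109.tar.gz
    cache = os.path.abspath(cache_filename)             # enron_cache.pickle

    if not os.path.exists(archive):
        download_archive(archive)        # --setup: fetch the tarball
    if not os.path.exists(cache):
        cache_messages(archive, cache)   # --setup: parse and marshal the messages

    do_index(cache, "index", limitmb=256, procs=1)   # --index
    do_search("index", u"enron", limit=10)           # positional query argument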

File src/whoosh/filedb/fileindex.py

     
     def _current_schema(self):
         return self._read_toc().schema
+    
+    @property
+    def schema(self):
+        return self._current_schema()
 
 
 # SegmentSet object

File src/whoosh/matching.py

     def value(self):
         return self.matchers[self.current].value()
     
+    def value_as(self, astype):
+        return self.matchers[self.current].value_as(astype)
+    
     def next(self):
         if not self.is_active(): raise ReadTooFar
         
         self._spans = None
         self._find_next()
     
+    def copy(self):
+        return self.__class__(self.wordmatchers[:], slop=self.slop, boost=self.boost)
+    
     def replace(self):
         if not self.is_active():
             return NullMatcher()
         return self
     
+    def all_ids(self):
+        # Need to redefine this because the WrappingMatcher parent class
+        # forwards to the submatcher, which in this case is just the
+        # IntersectionMatcher.
+        while self.is_active():
+            yield self.id()
+            self.next()
+    
     def next(self):
         ri = self.child.next()
         rn = self._find_next()
         rn = self._find_next()
         return rs or rn
     
+    def skip_to_quality(self, minquality):
+        skipped = 0
+        while self.is_active() and self.quality() <= minquality:
+            # TODO: doesn't count the documents matching the phrase yet
+            skipped += self.child.skip_to_quality(minquality/self.boost)
+            self._find_next()
+        return skipped
+    
     def positions(self):
         if not self.is_active():
             raise ReadTooFar
             #  [list of positions for word 2], ...]
             poses = [m.positions() for m in self.wordmatchers]
             
-            _
-            
             # Set the "active" position list to the list of positions of the
             # first word. We will then iteratively update this list with the
             # positions of subsequent words if they are within the "slop"

File src/whoosh/query.py

         self.boost = boost
 
     def copy(self):
-        return self.__class__(*self.subqueries, boost=self.boost)
+        return self.__class__(self.subqueries[0], self.subqueries[1],
+                              boost=self.boost)
 
     def normalize(self):
         required, optional = (q.normalize() for q in self.subqueries)
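
The spelled-out form sidesteps a Python 2.5 limitation: a keyword argument after *args in a call expression is only allowed from Python 2.6 on, so self.__class__(*self.subqueries, boost=self.boost) is a SyntaxError on 2.5. A toy illustration (not Whoosh code):

    def pair(a, b, boost=1.0):
        return (a, b, boost)

    subqueries = ("x", "y")
    # pair(*subqueries, boost=2.0)                 # SyntaxError on Python 2.5
    pair(subqueries[0], subqueries[1], boost=2.0)  # accepted on 2.5 and later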

File src/whoosh/reading.py

 
     def __init__(self, readers, generation=-1):
         self.readers = readers
+        self.schema = None
+        if readers:
+            self.schema = readers[0].schema
         self._generation = generation
         
         self.doc_offsets = []

File src/whoosh/searching.py

         self._idf_cache = {}
         self._sorter_cache = {}
 
+    @property
+    def schema(self):
+        return self.ixreader.schema
+
     def last_modified(self):
         return self.ix.last_modified()
 

File src/whoosh/util.py

 # Functions
 
 
-def make_binary_tree(cls, args, **kwargs):
-    """Takes a class that takes two positional arguments and a list of
+def make_binary_tree(fn, args, **kwargs):
+    """Takes a function/class that takes two positional arguments and a list of
     arguments and returns a binary tree of instances.
     
     >>> make_binary_tree(UnionMatcher, [matcher1, matcher2, matcher3])
         return args[0]
     
     half = count // 2
-    return cls(make_binary_tree(cls, args[:half], **kwargs),
-               make_binary_tree(cls, args[half:], **kwargs), **kwargs)
+    return fn(make_binary_tree(fn, args[:half], **kwargs),
+              make_binary_tree(fn, args[half:], **kwargs), **kwargs)
 
 
 # Varint cache

File tests/test_fields.py

 import unittest
 from datetime import datetime
 
-from whoosh import fields, index, qparser
+from whoosh import fields, qparser, query
 from whoosh.filedb.filestore import RamStorage
 
 
         self.assertEqual(len(r), 27)
         
         q = qp.parse(u"date:[2010-05 TO 2010-08]")
-        print q
+        self.assertEqual(q.__class__, query.TermRange)
+        self.assertEqual(q.start, u"201005")
+        self.assertEqual(q.end, u"201008")
     
     def test_boolean(self):
         schema = fields.Schema(id=fields.ID(stored=True),

File tests/test_indexing.py

         w.commit()
     
     def test_multipool(self):
+        try:
+            import multiprocessing
+        except ImportError:
+            return
+        
         domain = (u"alfa", u"bravo", u"charlie", u"delta", u"echo", u"foxtrot", u"golf",
                   u"hotel", u"india", u"juliet", u"kilo", u"lima", u"mike", u"november")
         

File tests/test_searching.py

         m = q.matcher(searcher)
         self.assertEqual(m.__class__.__name__, "PhraseMatcher")
         
-        self.assertEqual(names(searcher.search(q)), ["A"])
+        r = searcher.search(q)
+        self.assertEqual(names(r), ["A"])
+        self.assertEqual(len(r), 1)
         
         q = Phrase("value", [u"miss", u"muffet", u"sat", u"tuffet"])
         self.assertEqual(names(searcher.search(q)), ["A", "D"])
         
         q = Phrase("value", [u"falunk", u"gibberish"])
-        self.assertEqual(names(searcher.search(q)), [])
+        r = searcher.search(q)
+        self.assertEqual(names(r), [])
+        self.assertEqual(len(r), 0)
         
         q = Phrase("value", [u"gibberish", u"falunk"], slop=2)
         self.assertEqual(names(searcher.search(q)), ["D"])

File tests/test_spans.py

                 id = m.id()
                 sps = m.spans()
                 ids.add(id)
-                original = s.stored_fields(id)["text"]
+                original = list(s.stored_fields(id)["text"])
                 self.assertTrue(word in original, "%r not in %r" % (word, original))
                 
                 if word != "bravo":
         q = spans.SpanNot(nq, bq)
         m = q.matcher(s)
         while m.is_active():
-            orig = s.stored_fields(m.id())["text"]
+            orig = list(s.stored_fields(m.id())["text"])
             i1 = orig.index("alfa")
             i2 = orig.index("charlie")
             dist = i2 - i1
         bq = spans.SpanBefore(Term("text", "alfa"), Term("text", "charlie"))
         m = bq.matcher(s)
         while m.is_active():
-            orig = s.stored_fields(m.id())["text"]
+            orig = list(s.stored_fields(m.id())["text"])
             self.assertTrue("alfa" in orig)
             self.assertTrue("charlie" in orig)
             self.assertTrue(orig.index("alfa") < orig.index("charlie"))