Commits

Matt Chaput committed 6519cdd

Bumped version and changed index format to account for changes due to index compression work.

  • Participants
  • Parent commits a4b2068

Comments (0)

Files changed (3)

File benchmark/reuters.py

 import gzip, os.path
 from optparse import OptionParser
 
-from whoosh import analysis, fields, index, query
+from whoosh import analysis, fields, index, qparser, query
 from whoosh.util import now
 
 ana = analysis.StemmingAnalyzer()
 schema = fields.Schema(id=fields.ID(stored=True),
+                       headline=fields.STORED,
                        text=fields.TEXT(analyzer=ana, stored=True))
 
 def do_index(file, indexname, **kwargs):
     w = ix.writer(**kwargs)
     for line in gzip.GzipFile(file, "rb"):
         id, text = line.decode("latin1").split("\t")
-        w.add_document(id=id, text=text)
+        w.add_document(id=id, text=text, headline=text[:70])
     print "Spool:", now() - t
     ct = now()
     w.commit()
     print "Commit:", now() - ct
     print "Total:", now() - t
-    
 
+
+def do_search(indexname, q, limit=10, showbody=False):
+    ix = index.open_dir(indexname)
+    s = ix.searcher()
+    q = qparser.QueryParser("text", schema=s.schema).parse(q)
+    print "query=", q
+    r = s.search(q, limit=limit)
+    print "result=", r
+    for hit in r:
+        print hit.pos, hit["headline"]
+        if showbody:
+            print hit["text"]
+        
 if __name__ == "__main__":
     parser = OptionParser()
+    parser.add_option("-i", "--index", dest="index",
+                      help="Index the documents", default=False,
+                      action="store_true")
     parser.add_option("-s", "--source", dest="source", metavar="FILENAME",
-                      help="file containing the corpus date.",
+                      help="File containing the corpus date.",
                       default="reuters21578.txt.gz")
     parser.add_option("-d", "--dir", dest="dir", metavar="DIRNAME",
-                      help="directory in which to store files, index, etc.",
+                      help="Directory in which to store files, index, etc.",
                       default=".")
     parser.add_option("-n", "--name", dest="indexname",
                       help="Name of the index directory",
     parser.add_option("-l", "--limit", dest="limit",
                       help="Maximum number of results to display for a search.",
                       default="10")
+    parser.add_option("-b", "--body", dest="showbody",
+                      help="Show the body of emails found by a search.",
+                      default=False, action="store_true")
     parser.add_option("-t", "--tempdir", dest="tempdir",
                       help="Directory to use for temp file storage",
                       default=None)
+    
     options, args = parser.parse_args()
     
-    do_index(options.source, options.indexname, procs=int(options.procs),
-             limitmb=int(options.limitmb), dir=options.tempdir)
+    if options.index:
+        do_index(options.source, options.indexname, procs=int(options.procs),
+                 limitmb=int(options.limitmb), dir=options.tempdir)
+        
+    if args:
+        qs = " ".join(args).decode("utf8")
+        print "Query string=", repr(qs)
+        do_search(options.indexname, qs, limit=int(options.limit),
+                  showbody=options.showbody)
     
     

File src/whoosh/__init__.py

 # limitations under the License.
 #===============================================================================
 
-__version__ = (1, 0, 1)
+__version__ = (1, 1, 0)
 
 
 def versionstring(build=True, extra=True):

File src/whoosh/filedb/fileindex.py

 from whoosh.system import _INT_SIZE, _FLOAT_SIZE, _LONG_SIZE
 
 
-_INDEX_VERSION = -108
+_INDEX_VERSION = -109
 
 
 # TOC read/write functions