1. Takafumi Arakaki
  2. peps-search

Commits

Takafumi Arakaki  committed 1721001

added hash field to compare to the old document.

  • Participants
  • Parent commits c5f6345
  • Branches default

Comments (0)

Files changed (1)

File search.py

View file
  • Ignore whitespace
+import codecs
+from hashlib import md5
+
+
 def getschema():
+    # Build the whoosh Schema used by the index:
+    #   path    -- ID field, stored and marked unique
+    #   content -- TEXT field
+    #   hash    -- STORED field (filled with a content hash by the indexer)
-    from whoosh.fields import Schema, TEXT, ID
-    return Schema(path=ID(stored=True), content=TEXT)
+    from whoosh.fields import Schema, TEXT, ID, STORED
+    return Schema(path=ID(stored=True, unique=True),
+                  content=TEXT,
+                  hash=STORED)
 
 
-def makeindex(ix):
+def readpath(path):
+    # Return the whole content of the file at `path`, decoded as UTF-8.
+    # NOTE(review): the file object returned by codecs.open is never
+    # explicitly closed here.
+    return codecs.open(path, 'r', 'utf-8').read()
+
+
+def update(writer, searcher, path):
+    # Re-index the file at `path` only when its content hash has changed.
+    #
+    # The stored document for `path` is looked up through `searcher`; its
+    # 'hash' value is compared with the MD5 hex digest of the file's current
+    # content. On a mismatch (or when no document is found) the document is
+    # written through `writer.update_document`.
+    #
+    # Returns True if the document was written, False if it was left alone.
+    old_document = searcher.document(path=path)
+    if old_document:
+        old_hash = old_document['hash']
+    else:
+        # Nothing found for this path: None never equals a hex digest,
+        # so the comparison below always triggers a write.
+        old_hash = None
+
+    content = readpath(path)
+    # The digest is computed over the UTF-8 encoded bytes of the text.
+    new_hash = md5(content.encode('utf-8')).hexdigest()
+    if old_hash != new_hash:
+        writer.update_document(
+            path=unicode(path),
+            content=content,
+            hash=new_hash)
+        return True
+    else:
+        return False
+
+
+def updateall(ix, path_list):
+    # Run update() for every path in `path_list` using one writer (and one
+    # searcher obtained from that writer) on the index `ix`.
+    #
+    # Prints 'Start indexing.', then one '.' per document that update()
+    # reported as written, then a summary line with the number of paths
+    # and the number of updated documents.
     import sys
-    from glob import glob
-    from codecs import open
 
-    writer = ix.writer()
     print 'Start indexing.'
-    path_list = glob('peps/pep-*.txt')
-    for path in path_list:
-        writer.add_document(path=unicode(path),
-                            content=open(path, 'r', 'utf-8').read())
-        sys.stdout.write('.')
-        sys.stdout.flush()
-    writer.commit()
+    num_updated = 0
+    with ix.writer() as writer:
+        with writer.searcher() as searcher:
+            for path in path_list:
+                if update(writer, searcher, path):
+                    num_updated += 1
+                    # Progress dot is emitted only for documents written.
+                    sys.stdout.write('.')
+                    sys.stdout.flush()
     print
-    print 'Indexed {0} peps'.format(len(path_list))
-
+    print 'Indexed {0} documents (updated {1})'.format(
+        len(path_list), num_updated)
 
 
 def getindex(indexdir="indexdir"):
         print "Creating index."
         os.mkdir(indexdir)
         ix = index.create_in(indexdir, getschema())
-        makeindex(ix)
     else:
         print "Loading index from {0}.".format(indexdir)
         ix = index.open_dir(indexdir)
 
 
 def search(ix, querystr):
-    from codecs import open
     from whoosh.qparser import QueryParser
     from whoosh import highlight
 
             path = hit['path']
             print
             print path
-            print hit.highlights("content",
-                                 open(path, 'r', 'utf-8').read())
+            print hit.highlights("content", readpath(path))
 
 
 def main():
     import sys
+    from glob import glob
+
     ix = getindex()
+    updateall(ix, glob('peps/pep-*.txt'))
     querystr = ' '.join(sys.argv[1:])
     if querystr:
         search(ix, querystr)