# Source: peps-search / search.py (full commit)
import codecs
from hashlib import md5


def getschema():
    """Build the Whoosh schema used for the PEP index.

    Fields:
      * ``path``    -- ``ID`` field, stored and marked unique.
      * ``content`` -- ``TEXT`` field.
      * ``hash``    -- ``STORED`` field.
    """
    # Imported lazily, like every other Whoosh import in this file.
    from whoosh import fields

    path_field = fields.ID(stored=True, unique=True)
    return fields.Schema(path=path_field,
                         content=fields.TEXT,
                         hash=fields.STORED)


def readpath(path):
    """Return the whole content of the file at *path*, decoded as UTF-8.

    The file is opened with ``codecs.open`` and closed before returning,
    so reading many files in a row does not leave handles open.

    Errors raised while opening, reading or decoding the file propagate
    to the caller.
    """
    with codecs.open(path, 'r', 'utf-8') as stream:
        return stream.read()


def update(writer, searcher, path):
    """Re-index the file at *path* if its content changed.

    The document currently stored for *path* is looked up through
    *searcher*; its ``hash`` field is compared with the MD5 hex digest of
    the file's present (UTF-8 encoded) content.  When there is no stored
    document, or the digests differ, the document is written through
    ``writer.update_document``.

    Returns True when the document was written, False when the stored
    hash already matches.
    """
    indexed = searcher.document(path=path)
    previous_digest = indexed['hash'] if indexed else None

    text = readpath(path)
    current_digest = md5(text.encode('utf-8')).hexdigest()

    # Nothing to do when the stored digest still matches the file.
    if previous_digest == current_digest:
        return False

    writer.update_document(path=unicode(path),
                           content=text,
                           hash=current_digest)
    return True


def updateall(ix, path_list):
    """Bring the index *ix* up to date with every file in *path_list*.

    Each path is passed to ``update`` together with a writer obtained from
    ``ix.writer()`` and a searcher obtained from that writer.  A ``.`` is
    written to stdout for every document that was actually updated, and a
    summary line is printed at the end.

    *path_list* must support ``len()``.
    """
    import sys

    # Single-argument, parenthesized prints produce the same output as the
    # Python 2 print statement.
    print('Start indexing.')
    changed = 0
    with ix.writer() as writer:
        with writer.searcher() as searcher:
            for path in path_list:
                if not update(writer, searcher, path):
                    continue
                changed += 1
                # Progress marker, flushed so it shows up immediately.
                sys.stdout.write('.')
                sys.stdout.flush()
    # Terminate the line of progress dots.
    print('')
    summary = 'Indexed {0} documents (updated {1})'.format(
        len(path_list), changed)
    print(summary)


def getindex(indexdir="indexdir"):
    """Return the Whoosh index stored in *indexdir*, creating it if needed.

    * If *indexdir* does not exist it is created (including missing parent
      directories) and a new index with the schema from ``getschema()`` is
      created in it.
    * If *indexdir* exists but holds no index, a new index is created in
      it as well, instead of failing in ``open_dir``.
    * Otherwise the existing index is opened.

    A short status message is printed for each case.
    """
    import os
    from whoosh import index

    if not os.path.isdir(indexdir):
        print("Directory '{0}' does not exist.".format(indexdir))
        # makedirs rather than mkdir so a nested path such as "var/index"
        # can be used as indexdir.
        os.makedirs(indexdir)

    if not index.exists_in(indexdir):
        # Covers both the directory just created above and a pre-existing
        # directory that does not contain an index yet.
        print("Creating index.")
        return index.create_in(indexdir, getschema())

    print("Loading index from {0}.".format(indexdir))
    return index.open_dir(indexdir)


def search(ix, querystr):
    """Search the ``content`` field of *ix* for *querystr* and print the hits.

    *querystr* may be text or a byte string.  Byte strings (what Python 2
    delivers in ``sys.argv``) are decoded with the filesystem encoding
    rather than the implicit ASCII codec, so non-ASCII queries no longer
    raise ``UnicodeDecodeError`` on a correctly configured locale.  Any
    other object is converted to text.

    For every hit the stored path is printed, followed by highlighted
    excerpts; the text to highlight is re-read from the file at that path
    via ``readpath``, so errors from reading that file propagate.
    """
    import sys
    from whoosh.qparser import QueryParser
    from whoosh import highlight

    # type(u'') is ``unicode`` on Python 2 and ``str`` on Python 3.
    text_type = type(u'')
    if isinstance(querystr, bytes):
        querystr = querystr.decode(sys.getfilesystemencoding() or 'utf-8')
    elif not isinstance(querystr, text_type):
        querystr = text_type(querystr)

    print(u"Search '{0}'".format(querystr))
    with ix.searcher() as searcher:
        query = QueryParser("content", ix.schema).parse(querystr)
        results = searcher.search(query)
        # Matched terms are marked by upper-casing them.
        results.formatter = highlight.UppercaseFormatter()
        print(results)
        for hit in results:
            path = hit['path']
            print('')
            print(path)
            print(hit.highlights("content", readpath(path)))


def main():
    """Command-line entry point.

    Opens (or creates) the default index, re-indexes every file matching
    ``peps/pep-*.txt``, and, when command-line arguments were given, joins
    them with spaces and runs them as a search query.
    """
    from glob import glob
    import sys

    ix = getindex()
    pep_files = glob('peps/pep-*.txt')
    updateall(ix, pep_files)

    querystr = ' '.join(sys.argv[1:])
    if not querystr:
        # No query on the command line: indexing was all that was asked.
        return
    search(ix, querystr)


# Run the indexer (and optional search) only when this file is executed as
# a script, not when it is imported as a module.
if __name__ == '__main__':
    main()