Source

whoosh / benchmark / reuters.py

import gzip, os.path

from whoosh import analysis, fields, index, qparser, query
from whoosh.support.bench import Bench, Spec
from whoosh.util import now


class Reuters(Spec):
    """Benchmark spec for the corpus stored in ``reuters21578.txt.gz``.

    Supplies the Whoosh schema, the zcatalog index setup and the document
    stream for this corpus.
    """

    name = "reuters"
    # Gzipped corpus file, looked up inside ``self.options.dir``.
    filename = "reuters21578.txt.gz"
    main_field = "text"
    # NOTE(review): presumably read by the benchmark harness to pick the
    # field shown for a hit -- confirm against Spec/Bench.
    headline_text = "headline"

    def whoosh_schema(self):
        """Return the Whoosh schema used to index the corpus.

        ``id`` is a stored ID field, ``headline`` is stored only, and
        ``text`` is a stored TEXT field analyzed with ``StandardAnalyzer``.
        """
        # Alternative analyzer kept for reference:
        #ana = analysis.StemmingAnalyzer()
        ana = analysis.StandardAnalyzer()
        schema = fields.Schema(id=fields.ID(stored=True),
                               headline=fields.STORED,
                               text=fields.TEXT(analyzer=ana, stored=True))
        return schema

    def zcatalog_setup(self, cat):
        """Register the zcatalog indexes for this corpus on *cat*.

        Note that the ``"body"`` index reads the document's ``text`` field.
        """
        # Imported lazily so zcatalog is only required when it is benchmarked.
        from zcatalog import indexes  #@UnresolvedImport
        cat["id"] = indexes.FieldIndex(field_name="id")
        cat["headline"] = indexes.TextIndex(field_name="headline")
        cat["body"] = indexes.TextIndex(field_name="text")

    def documents(self):
        """Yield one document dict per line of the gzipped corpus file.

        Each line is decoded as latin1 and split on its first tab into an
        id and a text part. The yielded dict has the keys ``"id"``,
        ``"text"`` and ``"headline"`` (the first 70 characters of the text).

        Raises:
            ValueError: if a line contains no tab separator.
        """
        path = os.path.join(self.options.dir, self.filename)
        # The context manager closes the file once the generator is
        # exhausted or closed, instead of leaking the handle.
        with gzip.GzipFile(path) as f:
            for line in f:
                # Split only on the first tab so that tabs inside the text
                # do not break the two-way unpacking.
                docid, text = line.decode("latin1").split("\t", 1)
                yield {"id": docid, "text": text, "headline": text[:70]}

        
if __name__ == "__main__":
    # Executed as a script: hand the Reuters spec to the benchmark runner.
    runner = Bench()
    runner.run(Reuters)
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.