Commits

coady committed caf76b3

Forward-compatible lucene 4.0 migrations.
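
The migrations follow one idiom: wherever lucene 4.0 renames an API, the Python subclass exposes the 4.0 name up front, aliasing it to the 3.x method when running against an older lucene, so calling code can use the new names unconditionally. A minimal sketch of that idiom, mirroring the IndexWriter change in this commit:

    import lucene
    try:
        from org.apache.lucene import index
    except ImportError:
        index = lucene      # lucene 3.x: flat module namespace

    class IndexWriter(index.IndexWriter):
        # On lucene 3.x the 4.0 names are missing, so alias them to the old methods;
        # on lucene 4.x hasattr succeeds and the inherited methods are used unchanged.
        if not hasattr(index.IndexWriter, 'forceMerge'):
            forceMerge, forceMergeDeletes = index.IndexWriter.optimize, index.IndexWriter.expungeDeletes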


Files changed (6)

lupyne/engine/documents.py

     :param store,index,termvector: field parameters, expressed as bools or strs, with lucene defaults
     :param analyzed,omitNorms: additional index boolean settings
     :param withPositions,withOffsets: additional termvector boolean settings
+    :param boost: boost factor
     :param attrs: additional attributes to set on the field
     """
-    def __init__(self, name, store=False, index='analyzed', termvector=False, analyzed=False, omitNorms=False, withPositions=False, withOffsets=False, **attrs):
-        self.name, self.attrs = name, attrs
+    def __init__(self, name, store=False, index='analyzed', termvector=False, analyzed=False, omitNorms=False, \
+            withPositions=False, withOffsets=False, boost=1.0, **attrs):
+        self.name, self.boost, self.attrs = name, boost, attrs
         if isinstance(store, bool):
             store = 'yes' if store else 'no'
         self.store = document.Field.Store.valueOf(store.upper())
                 field = document.Field(self.name, value)
             else:
                 field = document.Field(self.name, value, self.termvector)
+            field.setBoost(self.boost)
             for name, value in self.attrs.items():
                 setattr(field, name, value)
             yield field
             yield self.sep.join(value[:index])
     def items(self, *values):
         "Generate indexed component fields."
-        if self.store.stored:
+        if self.store == document.Field.Store.YES:
             for value in values:
                 yield document.Field(self.name, value, self.store, document.Field.Index.NO)
         for value in values:
     def __init__(self, searcher, scoredocs, count=None, maxscore=None, fields=None):
         self.searcher, self.scoredocs = searcher, scoredocs
         self.count, self.maxscore = count, maxscore
-        self.fields = document.MapFieldSelector(fields) if isinstance(fields, collections.Iterable) else fields
+        self.fields = fields
+    def select(self, *fields):
+        "Only load selected fields."
+        self.fields = document.MapFieldSelector(fields)
     def __len__(self):
         return len(self.scoredocs)
     def __getitem__(self, index):
         return self.searchgroups.size()
     def __iter__(self):
         for searchgroup in self.searchgroups:
-            yield searchgroup.groupValue.toString()
+            yield convert(searchgroup.groupValue)
     def facets(self, filter):
         "Generate field values and counts which match given filter."
         collector = grouping.TermSecondPassGroupingCollector(self.field, self.searchgroups, self.sort, self.sort, 1, False, False, False)
         search.IndexSearcher.search(self.searcher, self.query, filter, collector)
         for groupdocs in collector.getTopGroups(0).groups:
-            yield groupdocs.groupValue.toString(), groupdocs.totalHits
+            yield convert(groupdocs.groupValue), groupdocs.totalHits
     def groups(self, count=1, sort=None, scores=False, maxscore=False):
         """Generate grouped `Hits`_ from second pass grouping collector.
         
         search.IndexSearcher.search(self.searcher, self.query, collector)
         for groupdocs in collector.getTopGroups(0).groups:
             hits = Hits(self.searcher, groupdocs.scoreDocs, groupdocs.totalHits, groupdocs.maxScore, getattr(self, 'fields', None))
-            hits.value = groupdocs.groupValue.toString()
+            hits.value = convert(groupdocs.groupValue)
             yield hits
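
A hedged usage sketch of the new Field boost parameter and Hits.select, assuming an in-memory index; field names and sample values are illustrative:

    import lucene
    lucene.initVM()
    from lupyne import engine

    indexer = engine.Indexer()                   # RAMDirectory-backed index by default
    indexer.set('title', store=True, boost=2.0)  # stored field with a boost factor
    indexer.set('text')                          # indexed-only field with lucene defaults
    indexer.add(title='Bill of Rights', text='freedom of speech and of the press')
    indexer.commit()

    hits = indexer.search('text:speech')
    hits.select('title')                         # only load the title field when reading hits
    titles = [hit['title'] for hit in hits]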

lupyne/engine/indexers.py

     """Copy the index commit to the destination directory.
     Optimized to use hard links if the destination is a file system path.
     """
-    src = commit.directory
     if isinstance(dest, store.Directory):
         for filename in commit.fileNames:
-            src.copy(dest, filename, filename)
+            commit.directory.copy(dest, filename, filename)
     else:
-        src = store.FSDirectory.cast_(src).directory.path
+        src = IndexSearcher.path.fget(commit)
         os.path.isdir(dest) or os.makedirs(dest)
         for filename in commit.fileNames:
             paths = os.path.join(src, filename), os.path.join(dest, filename)
         return payload and getattr(payload.data, 'string_', None)
     @payload.setter
     def payload(self, data):
-        self.Payload.payload = index.Payload(lucene.JArray_byte(data))
+        self.Payload.payload = index.Payload(lucene.JArray_byte(data.encode('utf8') if isinstance(data, unicode) else data))
     @property
     def positionIncrement(self):
         "Position relative to the previous token."
         "reader's lucene Directory"
         return self.indexReader.directory()
     @property
+    def path(self):
+        "FSDirectory path"
+        return store.FSDirectory.cast_(self.directory).directory.path
+    @property
     def timestamp(self):
         "timestamp of reader's last commit"
         return self.indexCommit.timestamp / 1000.0
     @property
+    def readers(self):
+        "segment readers"
+        return map(index.SegmentReader.cast_, self.sequentialSubReaders)
+    @property
     def segments(self):
         "segment filenames with document counts"
-        return dict((index.SegmentReader.cast_(reader).segmentName, reader.numDocs()) for reader in self.sequentialSubReaders)
+        return dict((reader.segmentName, reader.numDocs()) for reader in self.readers)
     def copy(self, dest, query=None, exclude=None, merge=0):
         """Copy the index to the destination directory.
         Optimized to use hard links if the destination is a file system path.
             if exclude:
                 writer.delete(exclude)
             writer.commit()
-            writer.expungeDeletes()
+            writer.forceMergeDeletes()
             if merge:
-                writer.optimize(merge)
+                writer.forceMerge(merge)
             return len(writer)
     def count(self, name, value):
         "Return number of documents with given term."
             args = fuzzy.pop('minSimilarity', 0.5), fuzzy.pop('prefixLength', 0)
             termenum = search.FuzzyTermEnum(self.indexReader, term, *args, **fuzzy)
         elif '*' in value or '?' in value:
+            value = value.rstrip('*')
+            if '*' in value or '?' in value:
+                warnings.warn('Wildcard term enumeration has been removed from lucene 4; use a prefix instead.', DeprecationWarning)
             termenum = search.WildcardTermEnum(self.indexReader, term)
         else:
             termenum = search.TermRangeTermEnum(self.indexReader, name, value, stop, True, False, None)
         :param type: type object or name compatible with FieldCache
         :param parser: lucene FieldCache.Parser or callable applied to field values
         """
-        return SortField(name, type, parser).comparator(self.indexReader)
+        return SortField(name, type, parser).comparator(self)
     def spans(self, query, positions=False, payloads=False):
         """Generate docs with occurrence counts for a span query.
         
     def overlap(self, left, right):
         "Return intersection count of cached filters."
         count, bitset = 0, getattr(util, 'FixedBitSet', util.OpenBitSet)
-        for reader in self.sequentialSubReaders:
+        for reader in self.readers:
             docsets = left.getDocIdSet(reader), right.getDocIdSet(reader)
             if search.DocIdSet.EMPTY_DOCIDSET not in docsets:
                 bits = [bitset.cast_(docset).bits for docset in docsets]
         return sorter if sorter.reverse == reverse else SortField(sorter.field, sorter.typename, sorter.parser, reverse)
     def comparator(self, field, type='string', parser=None):
         "Return :meth:`IndexReader.comparator` using a cached `SortField`_ if available."
-        return self.sorter(field, type, parser).comparator(self.indexReader)
+        return self.sorter(field, type, parser).comparator(self)
     def distances(self, lng, lat, lngfield, latfield):
         "Return distance comparator computed from cached lat/lng fields."
         arrays = (self.comparator(field, 'double') for field in (lngfield, latfield))
         IndexSearcher.__init__(self, reader, analyzer)
         self.shared.update(shared)
         shared.clear()
+        self.version = sum(reader.version for reader in self.sequentialSubReaders)
     @property
-    def version(self):
-        return sum(map(operator.attrgetter('version'), self.sequentialSubReaders))
+    def readers(self):
+        return itertools.chain.from_iterable(IndexReader(reader).readers for reader in self.sequentialSubReaders)
     @property
     def timestamp(self):
         return max(IndexReader(reader).timestamp for reader in self.sequentialSubReaders)
-    def overlap(self, *filters):
-        return sum(IndexReader(reader).overlap(*filters) for reader in self.sequentialSubReaders)
 
 class IndexWriter(index.IndexWriter):
     """Inherited lucene IndexWriter.
     """
     __len__ = index.IndexWriter.numDocs
     parse = IndexSearcher.__dict__['parse']
+    if not hasattr(index.IndexWriter, 'forceMerge'):
+        forceMerge, forceMergeDeletes = index.IndexWriter.optimize, index.IndexWriter.expungeDeletes
     def __init__(self, directory=None, mode='a', analyzer=None, version=None, **attrs):
         self.shared = closing()
         if version is None:
         IndexWriter.commit(self)
         if merge:
             if isinstance(merge, bool):
-                self.expungeDeletes()
+                self.forceMergeDeletes()
             else:
-                self.optimize(merge)
+                self.forceMerge(merge)
             IndexWriter.commit(self)
         self.refresh(**caches)
 
         self.termsfilters = {}
     def termsfilter(self, filter, *others):
         "Return `TermsFilter`_ synced to given filter and optionally associated with other indexers."
-        terms = self.sorter(self.field).terms(filter, *self.sequentialSubReaders)
+        terms = self.sorter(self.field).terms(filter, *self.readers)
         termsfilter = self.termsfilters[filter] = TermsFilter(self.field, terms)
         for other in others:
             termsfilter.refresh(other)
         "Store refreshed searcher and synchronize :attr:`termsfilters`."
         sorter, segments = self.sorter(self.field), self.segments
         searcher = self.indexSearcher.reopen(**caches)
-        readers = [reader for reader in searcher.sequentialSubReaders if index.SegmentReader.cast_(reader).segmentName not in segments]
+        readers = [reader for reader in searcher.readers if reader.segmentName not in segments]
         terms = list(itertools.chain.from_iterable(IndexReader(reader).terms(self.field) for reader in readers))
         for filter, termsfilter in self.termsfilters.items():
             if terms:
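
A hedged sketch of the indexer-level pieces touched here, assuming a small on-disk index (directory names and field values are illustrative):

    import lucene
    lucene.initVM()
    from lupyne import engine

    indexer = engine.Indexer('index')                # hypothetical on-disk index
    indexer.set('text')
    indexer.add(text='the right of the people')
    indexer.commit(merge=True)                       # forceMergeDeletes on lucene 4, expungeDeletes on 3.x
    print(indexer.path)                              # FSDirectory path of the underlying reader
    indexer.copy('backup', merge=1)                  # hard-linked copy of the commit, then forceMerge(1)
    terms = list(indexer.terms('text', 'right*'))    # prefix enumeration; '?' wildcards now warn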

lupyne/engine/queries.py

     """Caching filter based on a unique field and set of matching values.
     Optimized for many terms and docs, with support for incremental updates.
     Suitable for searching external metadata associated with indexed identifiers.
-    Call :meth:`refresh` to cache a new (or reopened) reader.
+    Call :meth:`refresh` to cache a new (or reopened) searcher.
     
     :param field: field name
     :param values: initial term values, synchronized with the cached filters
         "Return lucene TermsFilter, optionally using the FieldCache."
         if cache:
             return search.FieldCacheTermsFilter(self.field, tuple(values))
-        filter, term = search.TermsFilter(), index.Term(self.field)
+        filter = search.TermsFilter()
         for value in values:
-            filter.addTerm(term.createTerm(value))
+            filter.addTerm(index.Term(self.field, value))
         return filter
     def apply(self, filter, op, readers):
         for reader in readers:
         with self.lock:
             getattr(self.values, self.ops[op])(values)
             self.apply(filter, op, self.readers)
-    def refresh(self, reader):
-        "Refresh cached bitsets of current values for new segments of top-level reader."
-        readers = set(reader.sequentialSubReaders)
+    def refresh(self, searcher):
+        "Refresh cached bitsets of current values for new segments of searcher."
+        readers = set(searcher.readers)
         with self.lock:
             self.apply(self.filter(self.values), 'or', readers - self.readers)
             self.readers = set(reader for reader in readers | self.readers if reader.refCount)
     def array(self, reader):
         method = getattr(search.FieldCache.DEFAULT, 'get{0}s'.format(self.typename))
         return method(reader, self.field, *[self.parser][:bool(self.parser)])
-    def comparator(self, reader):
-        "Return indexed values from default FieldCache using the given top-level reader."
-        readers = reader.sequentialSubReaders
-        if index.MultiReader.instance_(reader):
-            readers = itertools.chain.from_iterable(reader.sequentialSubReaders for reader in readers)
-        arrays = list(map(self.array, readers))
+    def comparator(self, searcher):
+        "Return indexed values from default FieldCache using the given searcher."
+        arrays = list(map(self.array, searcher.readers))
         return arrays[0] if len(arrays) <= 1 else Comparator(arrays)
     def filter(self, start, stop, lower=True, upper=False):
         "Return lucene FieldCacheRangeFilter based on field and type."
 import warnings
 import lucene
 try:
-    from org.apache.lucene import document, index, search, store
+    from org.apache.lucene import index, search
 except ImportError:
-    document = index = search = store = lucene
+    index = search = lucene
 import cherrypy
 try:
     from . import engine, client
     response.headers['x-response-time'] = time.time() - response.time
 
 @tool('on_start_resource')
-def validate(etag=True, last_modified=True, max_age=None, expires=None):
+def validate(etag=True, last_modified=False, max_age=None, expires=None):
     """Return and validate caching headers.
     
     :param etag: return weak entity tag header based on index version and validate if-match headers
     def sync(self, host, path=''):
         "Sync with remote index."
         path = '/' + '{0}/update/{1}/'.format(path, uuid.uuid1()).lstrip('/')
-        directory = store.FSDirectory.cast_(self.searcher.directory).directory.path
+        directory = self.searcher.path
         resource = client.Resource(host)
         names = sorted(set(resource.put(path)).difference(os.listdir(directory)))
         try:
         if fields is None:
             fields = {}
         else:
-            hits.fields = document.MapFieldSelector(list(itertools.chain(fields, multi)))
+            hits.select(*itertools.chain(fields, multi))
         with HTTPError(httplib.BAD_REQUEST, AttributeError):
             groups = hits.groupby(searcher.comparator(*group.split(':')).__getitem__) if group else [hits]
         result['groups'], limit = [], options.get('group.limit', len(groups))
         if not name:
             return list(commit.fileNames)
         with HTTPError(httplib.NOT_FOUND, TypeError, AssertionError):
-            directory = store.FSDirectory.cast_(self.indexer.directory).directory.path
+            directory = self.searcher.path
             assert name in commit.fileNames, 'file not referenced in commit'
         return cherrypy.lib.static.serve_download(os.path.join(directory, name))
     @cherrypy.expose
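
With last_modified now off by default, deployments that still want Last-Modified validation opt back in through the tool's config; a hedged sketch mirroring the style of the test config below (other keys unchanged):

    # hypothetical server config re-enabling the optional caching headers
    config = {
        'tools.validate.last_modified': True,   # send Last-Modified and validate if-modified-since
        'tools.validate.max_age': 0,            # cache-control: revalidate on every request
    }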
 class typeAsPayload(engine.TokenFilter):
     "Custom implementation of lucene TypeAsPayloadTokenFilter."
     def setattrs(self):
-        self.payload = self.type.encode('utf8')
+        self.payload = self.type
 
 @contextlib.contextmanager
 def assertWarns(*categories):
         scores = list(searcher.match(doc, 'text:congress', 'text:law', 'amendment:27', 'date:19*'))
         assert 0.0 == scores[0] < scores[1] < scores[2] < scores[3] == 1.0
         searcher = engine.MultiSearcher([indexer.indexReader, self.tempdir])
-        assert searcher.refCount == 1
+        assert searcher.refCount == 1 and searcher.timestamp
         assert searcher.count() == len(searcher) == 2 * len(indexer)
        searcher.sorters['amendment'] = engine.SortField('amendment', int)
         comparator = searcher.comparator('amendment')
         assert sorted(map(int, indexer.terms('amendment'))) == range(1, 28)
         assert list(itertools.islice(indexer.terms('text', 'right'), 2)) == ['right', 'rights']
         assert list(indexer.terms('text', 'right*')) == ['right', 'rights']
+        with assertWarns(DeprecationWarning):
+            assert list(indexer.terms('text', 'right?')) == ['rights']
         assert list(indexer.terms('text', 'right', minSimilarity=0.5)) == ['eight', 'right', 'rights']
         word, count = next(indexer.terms('text', 'people', counts=True))
         assert word == 'people' and count == 8
         indexer.add(name='delta')
         indexer.delete('name', 'alpha')
         indexer.commit()
-        assert filter.readers > set(indexer.sequentialSubReaders)
+        assert filter.readers > set(indexer.readers)
         assert [hit['name'] for hit in indexer.search(filter=filter)] == ['bravo', 'delta']
         parallel.update('bravo')
         parallel.update('charlie', priority='high')
         assert [hit['name'] for hit in indexer.search(filter=filter)] == ['charlie', 'delta']
         parallel.commit()
         filter.refresh(indexer)
-        assert filter.readers == set(indexer.sequentialSubReaders)
+        assert filter.readers == set(indexer.readers)
 
 if __name__ == '__main__':
     lucene.initVM()
     
     def testInterface(self):
         "Remote reading and writing."
-        config = {'tools.json_out.indent': 2, 'tools.validate.expires': 0, 'tools.validate.max_age': 0}
+        config = {'tools.json_out.indent': 2, 'tools.validate.last_modified': True, 'tools.validate.expires': 0, 'tools.validate.max_age': 0}
         self.servers += (
             self.start(self.ports[0], self.tempdir, '--autoreload=1', **config),
             self.start(self.ports[1], self.tempdir, self.tempdir, '--autoupdate=2.0'), # concurrent searchers
         assert resource.get('/terms/text/:0') == []
         assert resource.get('/terms/text/z:') == []
         assert resource.get('/terms/text/right:right~') == resource.get('/terms/text/right*') == ['right', 'rights']
-        assert resource.get('/terms/text/writ%3f') == ['writs']
         assert resource.get('/terms/text/writ*') == ['writ', 'writing', 'writings', 'writs', 'written']
         assert resource.get('/terms/text/*?count=0') == []
         assert resource.get('/terms/text/writ*?count=10') == ['writs', 'writ', 'writing', 'writings', 'written']