Commits

coady committed 085ccab

PyLucene 4.1 supported.

  • Participants
  • Parent commits 4070d8f

Comments (0)

Files changed (6)

 Lupyne should run anywhere PyLucene does, though its primary testing is on the popular Unix variants.
 
  * Python 2.6.6+, 2.7
- * PyLucene 3.2, 3.3, 3.4, 3.5, 3.6, 4.0
+ * PyLucene 3.2, 3.3, 3.4, 3.5, 3.6, 4.0, 4.1
  * CherryPy 3.1.2+, 3.2 (only required for server)
 
 Usage

lupyne/engine/documents.py

     :param count: maximum number of groups
     :param sort: lucene Sort to order groups
     """
+    collectors = getattr(grouping, 'term', grouping)
     def __init__(self, searcher, field, query=None, count=None, sort=None):
         self.searcher, self.field = searcher, field
         self.query = query or search.MatchAllDocsQuery()
         self.sort = sort or search.Sort.RELEVANCE
         if count is None:
-            collector = grouping.TermAllGroupsCollector(field)
+            collector = self.collectors.TermAllGroupsCollector(field)
             search.IndexSearcher.search(self.searcher, self.query, collector)
             count = collector.groupCount
-        collector = grouping.TermFirstPassGroupingCollector(field, self.sort, count)
+        collector = self.collectors.TermFirstPassGroupingCollector(field, self.sort, count)
         search.IndexSearcher.search(self.searcher, self.query, collector)
         self.searchgroups = collector.getTopGroups(0, False).of_(grouping.SearchGroup)
     def __len__(self):
             yield convert(searchgroup.groupValue)
     def facets(self, filter):
         "Generate field values and counts which match given filter."
-        collector = grouping.TermSecondPassGroupingCollector(self.field, self.searchgroups, self.sort, self.sort, 1, False, False, False)
+        collector = self.collectors.TermSecondPassGroupingCollector(self.field, self.searchgroups, self.sort, self.sort, 1, False, False, False)
         search.IndexSearcher.search(self.searcher, self.query, filter, collector)
         for groupdocs in collector.getTopGroups(0).groups:
             yield convert(groupdocs.groupValue), groupdocs.totalHits
         sort = sort or self.sort
         if sort == search.Sort.RELEVANCE:
             scores = maxscore = True
-        collector = grouping.TermSecondPassGroupingCollector(self.field, self.searchgroups, self.sort, sort, count, scores, maxscore, True)
+        collector = self.collectors.TermSecondPassGroupingCollector(self.field, self.searchgroups, self.sort, sort, count, scores, maxscore, True)
         search.IndexSearcher.search(self.searcher, self.query, collector)
         for groupdocs in collector.getTopGroups(0).groups:
             hits = Hits(self.searcher, groupdocs.scoreDocs, groupdocs.totalHits, groupdocs.maxScore, getattr(self, 'fields', None))

lupyne/engine/indexers.py

     from lucene import File, StringReader, Float, Arrays, HashMap, HashSet, PythonAnalyzer, PythonTokenFilter, PythonQueryParser
     analysis = document = index = queryParser = search = store = util = \
     standard = tokenattributes = memory = similar = spans = lucene
-from .queries import Query, TermsFilter, SortField, Highlighter, FastVectorHighlighter, SpellChecker, SpellParser
+from .queries import Query, BooleanFilter, TermsFilter, SortField, Highlighter, FastVectorHighlighter, SpellChecker, SpellParser
 from .documents import Field, Document, Hits, Grouping
 from .spatial import DistanceComparator
 
         source = tokens = self.tokenizer.tokenStream(field, reader) if isinstance(self.tokenizer, analysis.Analyzer) else self.tokenizer(reader)
         for filter in self.filters:
             tokens = filter(tokens)
+        tokens.reset()
         return source, tokens
     def tokenStream(self, field, reader):
         return self.components(field, reader)[1]
             return mlt.like(StringReader(doc), '') if isinstance(doc, basestring) else mlt.like(doc)
         except lucene.InvalidArgsError:
             return mlt.like(StringReader(doc))
-    def overlap(self, left, right):
-        "Return intersection count of cached filters."
-        count, bitset = 0, getattr(util, 'FixedBitSet', util.OpenBitSet)
-        for reader in self.readers:
-            if hasattr(reader, 'liveDocs'):
-                docsets = [filter.getDocIdSet(reader.context, reader.liveDocs).bits() for filter in (left, right)]
-            else:
-                docsets = left.getDocIdSet(reader), right.getDocIdSet(reader)
-            if all(map(bitset.instance_, docsets)):
-                bits = [bitset.cast_(docset).getBits() for docset in docsets]
-                count += util.BitUtil.pop_intersect(bits[0], bits[1], 0, min(map(len, bits)))
-        return int(count)
 
 class IndexSearcher(search.IndexSearcher, IndexReader):
     """Inherited lucene IndexSearcher, with a mixed-in IndexReader.
         if isinstance(query, search.Query):
             query = search.QueryWrapperFilter(query)
         if not isinstance(query, search.CachingWrapperFilter):
-            flag = search.CachingWrapperFilter.DeletesMode.RECACHE if hasattr(search.CachingWrapperFilter, 'DeletesMode') else True
-            query = search.CachingWrapperFilter(query, flag)
+            query = search.CachingWrapperFilter(query)
         for key in keys:
             filters = self.filters.get(key)
             if key in self.groupings:
                 counts[key] = dict(self.groupings[key].facets(query))
             elif isinstance(filters, search.Filter):
-                counts[key] = self.overlap(query, filters)
+                counts[key] = self.count(filter=BooleanFilter.all(query, filters))
             else:
                 name, value = (key, None) if isinstance(key, basestring) else key
                 filters = self.filters.setdefault(name, {})
                 for value in values:
                     if value not in filters:
                         filters[value] = Query.term(name, value).filter()
-                    counts[name][value] = self.overlap(query, filters[value])
+                    counts[name][value] = self.count(filter=BooleanFilter.all(query, filters[value]))
         return dict(counts)
     def grouping(self, field, query=None, count=None, sort=None):
         "Return `Grouping`_ for unique field and lucene search parameters."

lupyne/engine/queries.py

         elif isinstance(self, search.TermRangeQuery):
             filter = search.TermRangeFilter(self.field, self.lowerTerm, self.upperTerm, self.includesLower(), self.includesUpper())
         elif isinstance(self, search.TermQuery):
-            filter = queries.TermsFilter()
-            filter.addTerm(self.getTerm())
+            if hasattr(queries.TermsFilter, 'addTerm'):
+                filter = queries.TermsFilter()
+                filter.addTerm(self.getTerm())
+            else:
+                filter = queries.TermsFilter([self.getTerm()])
         else:
             filter = search.QueryWrapperFilter(self)
-        if not cache:
-            return filter
-        flag = search.CachingWrapperFilter.DeletesMode.RECACHE if hasattr(search.CachingWrapperFilter, 'DeletesMode') else True
-        return search.CachingWrapperFilter(filter, flag)
+        return search.CachingWrapperFilter(filter) if cache else filter
     def terms(self):
         "Generate set of query term items."
         terms = HashSet().of_(index.Term)
         base = spans.SpanNearPayloadCheckQuery if spans.SpanNearQuery.instance_(self) else spans.SpanPayloadCheckQuery
         return SpanQuery(base, self, Arrays.asList(list(map(lucene.JArray_byte, values))))
 
+class BooleanFilter(queries.BooleanFilter):
+    "Inherited lucene BooleanFilter similar to BooleanQuery."
+    def __init__(self, occur, *filters):
+        queries.BooleanFilter.__init__(self)
+        for filter in filters:
+            self.add(queries.FilterClause(filter, occur))
+    @classmethod
+    def all(cls, *filters):
+        "Return `BooleanFilter`_ (AND) from filters."
+        return cls(search.BooleanClause.Occur.MUST, *filters)
+
 class TermsFilter(search.CachingWrapperFilter):
     """Caching filter based on a unique field and set of matching values.
     Optimized for many terms and docs, with support for incremental updates.
     ops = {'or': 'update', 'and': 'intersection_update', 'andNot': 'difference_update'}
     def __init__(self, field, values=()):
         assert lucene.VERSION >= '3.5', 'requires FixedBitSet set operations introduced in lucene 3.5'
-        args = [True] if lucene.VERSION >= '4' else []
-        search.CachingWrapperFilter.__init__(self, queries.TermsFilter(), *args)
+        search.CachingWrapperFilter.__init__(self, search.QueryWrapperFilter(search.MatchAllDocsQuery()))
         self.field = field
         self.values = set(values)
         self.readers = set()
         "Return lucene TermsFilter, optionally using the FieldCache."
         if cache:
             return search.FieldCacheTermsFilter(self.field, tuple(values))
+        terms = [index.Term(self.field, value) for value in values]
+        if not hasattr(queries.TermsFilter, 'addTerm'):
+            return queries.TermsFilter(terms)
         filter = queries.TermsFilter()
-        for value in values:
-            filter.addTerm(index.Term(self.field, value))
+        for term in terms:
+            filter.addTerm(term)
         return filter
     def apply(self, filter, op, readers):
         for reader in readers:
             try:
-                args = [reader.context, reader.liveDocs] if hasattr(index.IndexReader, 'context') else [reader]
+                args = [reader.context, None] if hasattr(index.IndexReader, 'context') else [reader]
                 bitset = util.FixedBitSet.cast_(self.getDocIdSet(*args))
+                if reader not in self.readers:
+                    bitset.clear(0, bitset.length())
                 getattr(bitset, op)(filter.getDocIdSet(*args).iterator())
             except lucene.JavaError as exc:
                 assert not reader.refCount, exc
 ==================
  * Engine:
    
-   - PyLucene 4.0 supported
+   - PyLucene 4.0 and 4.1 supported
    - PyLucene 3.2, 3.3, and 3.4 deprecated
    - Optimized searching and sorting with unlimited count
    - Support for contrib grouping collectors and faceting
         assert orange == 'CA.Orange' and facets[orange] > 10
         (field, facets), = indexer.facets(query, ('state.county', 'CA.*')).items()
         assert all(value.startswith('CA.') for value in facets) and set(facets) < set(indexer.filters[field])
-        if hasattr(grouping, 'TermFirstPassGroupingCollector'):
+        if lucene.VERSION >= '3.3':
             assert set(indexer.grouping('state', count=1)) < set(indexer.grouping('state')) == set(states)
             grouper = indexer.grouping(field, query, sort=search.Sort(indexer.sorter(field)))
             assert len(grouper) == 2 and list(grouper) == [la, orange]
             indexer.add(name=name)
         indexer.commit()
         filter = engine.TermsFilter('name')
-        assert indexer.count(filter=filter) == len(filter.readers) == 0
+        assert len(filter.readers) == 0
         filter.add('alpha', 'bravo')
         filter.discard('bravo', 'charlie')
         assert filter.values == set(['alpha'])