coady committed a88b8c2

Optimized collecting of unlimited hits.

Files changed (7)

examples/sorting.py

 PyLucene has several pitfalls when collecting or sorting a large query result.
 Generally they involve the overhead of traversing the VM in an internal loop.
 
-Lucene's performance itself drops off noticeably as the requested doc count increases.
-This is heavily compounded by having to iterate through a large set of ScoreDocs in PyLucene.
+Lucene also requires supplying a maximum doc count for searches,
+and supplying an excessively large count is a poor workaround because the collection heap is pre-allocated.
 
-Lucene also only supports sorting a query result when a doc count is supplied.
-And supplying an excessively large count is not a good workaround because of the aforementioned problem.
-
-Finally the custom sorting interface, although well-supported in PyLucene, is bascially useless.
-The sort key of every potential doc must realistically be cached anyway,
-but the performance overhead of O(n log n) comparison calls in java is still horrid.
+Finally the custom sorting interface, although well-supported in PyLucene, has horrible performance.
+The sort key of every potential doc must realistically be cached,
+but the overhead of O(n log n) comparison calls dispatched through the VM is far worse than iterating ScoreDocs.
 
 To mitigate all these problems, Lupyne first provides a unified search interface.
-The same Hits type is returned regardless of whether a doc count is supplied.
+The same Hits type is returned regardless of optional doc count or sorting parameters.
 As with lucene, the result is fully evaluated but each individual Hit object will only be loaded on demand.
-Internally an optimized custom hit Collector is used when all docs are requested.
+Internally a CachingCollector is used when all docs are requested.
 
-The search method does allow lucene Sort parameters to be passed through, since that's still optimal.
-So the only gotcha is that with no doc count the sort parameter must instead be a python callable key.
+The search method allows lucene Sort parameters to be passed through, since that's still optimal.
+Additionally the hits themselves can be sorted afterwards with any python callable key.
 The IndexSearcher.comparator method is convenient for creating a sort key table from indexed fields.
 The upshot is custom sorting and sorting large results are both easier and faster.
 
 
 ### lupyne ###
 
-hits = indexer.search(count=10, sort='color')
+hits = indexer.search(sort='color')
 assert [hit['color'] for hit in hits] == sorted(colors)
 comparator = indexer.comparator('color')
 assert list(comparator) == list(colors)
-hits = indexer.search(sort=comparator.__getitem__)
+hits = indexer.search().sorted(comparator.__getitem__)
 assert [hit['color'] for hit in hits] == sorted(colors)
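 
 The comparator above is just a cached field table exposed as a sequence indexed by doc id.
 A minimal pure-Python sketch of that idea (hypothetical colors data, no JVM or lucene required)
 shows why sorting by a precomputed key table beats per-comparison callbacks:
 
 ### sketch: comparator as a key table (illustrative only) ###
 
 colors = ['red', 'green', 'blue']                       # hypothetical indexed field values
 comparator = list(colors)                               # stand-in for indexer.comparator('color')
 doc_ids = list(range(len(colors)))                      # stand-in for hits.ids
 ranked = sorted(doc_ids, key=comparator.__getitem__)    # one O(1) lookup per comparison, no VM crossing
 assert [comparator[id] for id in ranked] == sorted(colors)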

lupyne/engine/documents.py

 
 from future_builtins import map, zip
 import datetime, calendar
+import operator
 import collections
 import lucene
 from .queries import Query
     Provides a read-only sequence interface to hit objects.
     
     :param searcher: `IndexSearcher`_ which can retrieve documents
-    :param ids: ordered doc ids
-    :param scores: ordered doc scores
+    :param scoredocs: lucene ScoreDocs
     :param count: total number of hits
     :param maxscore: maximum score
     :param fields: optional field selectors
     """
-    def __init__(self, searcher, ids, scores, count=None, maxscore=None, fields=None):
-        self.searcher = searcher
-        self.ids, self.scores = ids, scores
+    def __init__(self, searcher, scoredocs, count=None, maxscore=None, fields=None):
+        self.searcher, self.scoredocs = searcher, scoredocs
         self.count, self.maxscore = count, maxscore
         self.fields = lucene.MapFieldSelector(fields) if isinstance(fields, collections.Iterable) else fields
     def __len__(self):
-        return len(self.ids)
+        return len(self.scoredocs)
     def __getitem__(self, index):
-        id, score = self.ids[index], self.scores[index]
         if isinstance(index, slice):
-            return type(self)(self.searcher, id, score, self.count, self.maxscore, self.fields)
-        return Hit(self.searcher.doc(id, self.fields), id, score)
+            start, stop, step = index.indices(len(self))
+            assert step == 1, 'slice step is not supported'
+            scoredocs = self.scoredocs[start:stop] if stop - start < len(self) else self.scoredocs
+            return type(self)(self.searcher, scoredocs, self.count, self.maxscore, self.fields)
+        scoredoc = self.scoredocs[index]
+        return Hit(self.searcher.doc(scoredoc.doc, self.fields), scoredoc.doc, scoredoc.score)
+    @property
+    def ids(self):
+        return map(operator.attrgetter('doc'), self.scoredocs)
+    @property
+    def scores(self):
+        return map(operator.attrgetter('score'), self.scoredocs)
     def items(self):
         "Generate zipped ids and scores."
-        return zip(self.ids, self.scores)
+        return map(operator.attrgetter('doc', 'score'), self.scoredocs)
     def groupby(self, func):
         "Return ordered list of `Hits`_ grouped by value of function applied to doc ids."
         groups = {}
-        for id, score in self.items():
-            value = func(id)
+        for scoredoc in self.scoredocs:
+            value = func(scoredoc.doc)
             try:
                 group = groups[value]
             except KeyError:
-                group = groups[value] = type(self)(self.searcher, [], [], fields=self.fields)
+                group = groups[value] = type(self)(self.searcher, [], fields=self.fields)
                 group.index, group.value = len(groups), value
-            group.ids.append(id)
-            group.scores.append(score)
+            group.scoredocs.append(scoredoc)
         return sorted(groups.values(), key=lambda group: group.__dict__.pop('index'))
     def filter(self, func):
         "Return `Hits`_ filtered by function applied to doc ids."
-        ids, scores = [], []
-        for id, score in self.items():
-            if func(id):
-                ids.append(id)
-                scores.append(score)
-        return type(self)(self.searcher, ids, scores, fields=self.fields)
+        scoredocs = [scoredoc for scoredoc in self.scoredocs if func(scoredoc.doc)]
+        return type(self)(self.searcher, scoredocs, fields=self.fields)
     def sorted(self, key, reverse=False):
         "Return `Hits`_ sorted by key function applied to doc ids."
-        ids = sorted(self.ids, key=key, reverse=reverse)
-        scores = list(map(dict(self.items()).__getitem__, ids))
-        return type(self)(self.searcher, ids, scores, self.count, self.maxscore, self.fields)
+        scoredocs = sorted(self.scoredocs, key=lambda scoredoc: key(scoredoc.doc), reverse=reverse)
+        return type(self)(self.searcher, scoredocs, self.count, self.maxscore, self.fields)
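 
 With Hits now wrapping the ScoreDoc objects directly, ids and scores become lazy iterators rather
 than lists (hence the list(...) calls added to the tests below). A rough pure-Python sketch of that
 accessor pattern, with a namedtuple standing in for lucene's ScoreDoc java class:
 
 ### sketch: lazy ids and scores over scoredocs (illustrative only) ###
 
 import collections, operator
 ScoreDoc = collections.namedtuple('ScoreDoc', ['doc', 'score'])   # stand-in for lucene's ScoreDoc
 scoredocs = [ScoreDoc(0, 1.5), ScoreDoc(3, 0.7)]
 ids = map(operator.attrgetter('doc'), scoredocs)        # an iterator with future_builtins / py3 map
 scores = map(operator.attrgetter('score'), scoredocs)
 assert list(ids) == [0, 3] and list(scores) == [1.5, 0.7]          # materialize before comparing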

lupyne/engine/indexers.py

 import itertools, operator
 import contextlib
 import abc, collections
+import warnings
 import lucene
-from .queries import Query, Collector, TermsFilter, SortField, Highlighter, FastVectorHighlighter, SpellChecker, SpellParser
+from .queries import Query, TermsFilter, SortField, Highlighter, FastVectorHighlighter, SpellChecker, SpellParser
 from .documents import Field, Document, Hits
 from .spatial import DistanceComparator
 
         collector = lucene.TotalHitCountCollector()
         lucene.IndexSearcher.search(self, query, options.get('filter'), collector)
         return collector.totalHits
+    def collector(self, query, count=None, sort=None, reverse=False, scores=False, maxscore=False):
+        weight = self.createNormalizedWeight(query) if hasattr(self, 'createNormalizedWeight') else query.weight(self)
+        inorder = not weight.scoresDocsOutOfOrder()
+        if count is None:
+            return lucene.CachingCollector.create(not inorder, True, float('inf'))
+        count = min(count, self.maxDoc() or 1)
+        if callable(sort) or sort is None:
+            return lucene.TopScoreDocCollector.create(count, inorder)
+        if isinstance(sort, basestring):
+            sort = self.sorter(sort, reverse=reverse)
+        if not isinstance(sort, lucene.Sort):
+            sort = lucene.Sort(sort)
+        return lucene.TopFieldCollector.create(sort, count, False, scores, maxscore, inorder)
     def search(self, query=None, filter=None, count=None, sort=None, reverse=False, scores=False, maxscore=False, timeout=None, **parser):
         """Run query and return `Hits`_.
         
+        .. deprecated:: 1.2+ sort param for lucene only; use Hits.sorted with a callable
+        
         :param query: query string or lucene Query
         :param filter: lucene Filter
         :param count: maximum number of hits to retrieve
-        :param sort: if count is given, lucene Sort parameters, else a callable key
+        :param sort: lucene Sort parameters
         :param reverse: reverse flag used with sort
-        :param scores: compute scores for candidate results when using lucene Sort
-        :param maxscore: compute maximum score of all results when using lucene Sort
+        :param scores: compute scores for candidate results when sorting
+        :param maxscore: compute maximum score of all results when sorting
         :param timeout: stop search after elapsed number of seconds
         :param parser: :meth:`Analyzer.parse` options
         """
         query = lucene.MatchAllDocsQuery() if query is None else self.parse(query, **parser)
-        weight = self.createNormalizedWeight(query) if hasattr(self, 'createNormalizedWeight') else query.weight(self)
-        # use custom Collector if all results are necessary, otherwise use lucene's TopDocsCollectors
-        if count is None:
-            collector = Collector()
-        else:
-            count, inorder = min(count, self.maxDoc() or 1), not weight.scoresDocsOutOfOrder()
-            if sort is None:
-                collector = lucene.TopScoreDocCollector.create(count, inorder)
-            else:
-                if isinstance(sort, basestring):
-                    sort = self.sorter(sort, reverse=reverse)
-                if not isinstance(sort, lucene.Sort):
-                    sort = lucene.Sort(sort)
-                collector = lucene.TopFieldCollector.create(sort, count, False, scores, maxscore, inorder)
+        cache = collector = self.collector(query, count, sort, reverse, scores, maxscore)
         args = [lucene.TimeLimitingCollector.getGlobalCounter()] if hasattr(lucene, 'Counter') else []
         results = collector if timeout is None else lucene.TimeLimitingCollector(collector, *(args + [long(timeout * 1000)]))
         try:
-            lucene.IndexSearcher.search(self, weight, filter, results)
+            lucene.IndexSearcher.search(self, query, filter, results)
         except lucene.JavaError as timeout:
             if not lucene.TimeLimitingCollector.TimeExceededException.instance_(timeout.getJavaException()):
                 raise
-        if isinstance(collector, Collector):
-            ids, scores = collector.sorted(key=sort, reverse=reverse)
-            collector.finalize()
-            stats = len(ids), max(scores or [float('nan')])
-        else:
-            topdocs = collector.topDocs()
-            scoredocs = list(topdocs.scoreDocs)
-            ids, scores = (list(map(operator.attrgetter(name), scoredocs)) for name in ('doc', 'score'))
-            stats = topdocs.totalHits, topdocs.maxScore
-        stats *= not isinstance(timeout, lucene.JavaError)
-        return Hits(self, ids, scores, *stats)
+        if isinstance(cache, lucene.CachingCollector):
+            collector = lucene.TotalHitCountCollector()
+            cache.replay(collector)
+            collector = self.collector(query, collector.totalHits or 1, sort, reverse, scores, maxscore)
+            cache.replay(collector)
+        topdocs = collector.topDocs()
+        stats = (topdocs.totalHits, topdocs.maxScore) * (not isinstance(timeout, lucene.JavaError))
+        hits = Hits(self, topdocs.scoreDocs, *stats)
+        if callable(sort):
+            warnings.warn('Use lucene sorting or call Hits.sorted.', DeprecationWarning)
+            return hits.sorted(sort, reverse)
+        return hits
     def facets(self, query, *keys):
         """Return mapping of document counts for the intersection with each facet.
         

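 The new collector method and the replay logic above replace the custom Python Collector:
 an unlimited search first caches every hit, replays the cache into a TotalHitCountCollector to
 learn the total, then replays again into a properly sized top-docs collector. A toy pure-Python
 sketch of that cache-and-replay pattern (illustrative stand-ins, not the real lucene collectors):
 
 ### sketch: cache-and-replay for unlimited hits (illustrative only) ###
 
 class CachingCollector(object):
     "Toy stand-in for lucene.CachingCollector: remembers every hit for later replay."
     def __init__(self):
         self.cached = []
     def collect(self, doc, score):
         self.cached.append((doc, score))
     def replay(self, collector):
         for doc, score in self.cached:
             collector.collect(doc, score)
 
 class TotalHitCountCollector(object):
     "Toy stand-in: counts hits only."
     totalHits = 0
     def collect(self, doc, score):
         self.totalHits += 1
 
 class TopScoreDocCollector(object):
     "Toy stand-in: keeps the top count hits ordered by score."
     def __init__(self, count):
         self.count, self.hits = count, []
     def collect(self, doc, score):
         self.hits.append((doc, score))
     def topDocs(self):
         return sorted(self.hits, key=lambda hit: hit[1], reverse=True)[:self.count]
 
 cache = CachingCollector()
 for doc, score in enumerate([0.5, 1.5, 1.0]):   # pretend these hits came from a query
     cache.collect(doc, score)
 counter = TotalHitCountCollector()
 cache.replay(counter)                           # first pass: learn the total hit count
 top = TopScoreDocCollector(counter.totalHits or 1)
 cache.replay(top)                               # second pass: collect everything, properly sized
 assert [doc for doc, score in top.topDocs()] == [1, 2, 0]
 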
lupyne/engine/queries.py

         base = lucene.SpanNearPayloadCheckQuery if lucene.SpanNearQuery.instance_(self) else lucene.SpanPayloadCheckQuery
         return SpanQuery(base, self, lucene.Arrays.asList(list(map(lucene.JArray_byte, values))))
 
-class Collector(lucene.PythonCollector):
-    "Collect all ids and scores efficiently."
-    def __init__(self):
-        lucene.PythonCollector.__init__(self)
-        self.scores = {}
-    def collect(self, id, score):
-        self.scores[id + self.base] = score
-    def setNextReader(self, reader, base):
-        self.base = base
-    def acceptsDocsOutOfOrder(self):
-        return True
-    def sorted(self, key=None, reverse=False):
-        "Return ordered ids and scores."
-        ids = sorted(self.scores)
-        if key is None:
-            key, reverse = self.scores.__getitem__, True
-        ids.sort(key=key, reverse=reverse)
-        return ids, list(map(self.scores.__getitem__, ids))
-
 class TermsFilter(lucene.CachingWrapperFilter):
     """Caching filter based on a unique field and set of matching values.
     Optimized for many terms and docs, with support for incremental updates.
         if sort is not None:
             sort = (re.match('(-?)(\w+):?(\w*)', field).groups() for field in sort)
             sort = [(name, (type or 'string'), (reverse == '-')) for reverse, name, type in sort]
-            if count is None:
-                with HTTPError(httplib.BAD_REQUEST, ValueError, AttributeError):
-                    reverse, = set(reverse for name, type, reverse in sort) # only one sort direction allowed with unlimited count
-                    comparators = [searcher.comparator(name, type) for name, type, reverse in sort]
-                sort = comparators[0].__getitem__ if len(comparators) == 1 else lambda id: tuple(map(operator.itemgetter(id), comparators))
-            else:
-                with HTTPError(httplib.BAD_REQUEST, AttributeError):
-                    sort = [searcher.sorter(name, type, reverse=reverse) for name, type, reverse in sort]
+            with HTTPError(httplib.BAD_REQUEST, AttributeError):
+                sort = [searcher.sorter(name, type, reverse=reverse) for name, type, reverse in sort]
         q = params.q(searcher, q, **options)
         qfilter = options.pop('filter', None)
         if qfilter is not None and qfilter not in searcher.filters:
         qfilter = searcher.filters.get(qfilter)
         if mlt is not None:
             if q is not None:
-                mlt = searcher.search(q, count=mlt+1, sort=sort, reverse=reverse).ids[mlt]
+                mlt, = searcher.search(q, count=mlt+1, sort=sort, reverse=reverse)[mlt:].ids
             mltfields = options.pop('mlt.fields', ())
             with HTTPError(httplib.BAD_REQUEST, ValueError):
                 attrs = dict((key.partition('.')[-1], json.loads(options[key])) for key in options if key.startswith('mlt.'))
         assert list(indexer.positions('text', 'world', payloads=True)) == [(0, [(1, '<ALPHANUM>')])]
         hits = indexer.search('text:hello')
         assert len(hits) == hits.count == 1
-        assert hits.ids == [0]
+        self.assertRaises(AssertionError, hits.__getitem__, slice(None, None, 2))
+        assert hits.scoredocs is hits[:1].scoredocs and not hits[1:]
+        assert list(hits.ids) == [0]
         score, = hits.scores
         assert 0 < score < 1
         assert dict(hits.items()) == {0: score}
         assert len(hits) == hits.count == 8
         assert set(map(type, hits.ids)) == set([int]) and set(map(type, hits.scores)) == set([float])
         assert hits.maxscore == max(hits.scores)
-        ids = hits.ids
+        ids = list(hits.ids)
         hits = indexer.search('people', count=5, field='text')
-        assert hits.ids == ids[:len(hits)]
+        assert list(hits.ids) == ids[:len(hits)]
         assert len(hits) == 5 and hits.count == 8
         assert not any(map(math.isnan, hits.scores))
         assert hits.maxscore == max(hits.scores)
         hits = indexer.search('text:people', count=5, sort=lucene.Sort.INDEXORDER)
-        assert sorted(hits.ids) == hits.ids
+        assert sorted(hits.ids) == list(hits.ids)
         sort = engine.SortField('amendment', type=int)
         hits = indexer.search('text:people', count=5, sort=sort)
         assert [hit.get('amendment') for hit in hits] == [None, None, '1', '2', '4']
         hits = indexer.search('text:right', count=2, sort=sort, maxscore=True)
         assert hits.maxscore > max(hits.scores)
         comparator = indexer.comparator('amendment', type=int, parser=lambda value: int(value or -1))
-        hits = indexer.search('text:people', sort=comparator.__getitem__)
-        assert sorted(hits.ids) == hits.ids and hits.ids != ids
+        with assertWarns(DeprecationWarning):
+            hits = indexer.search('text:people', sort=comparator.__getitem__)
+        assert sorted(hits.ids) == list(hits.ids) and list(hits.ids) != ids
         comparator = list(zip(*map(indexer.comparator, ['article', 'amendment'])))
         hits = indexer.search('text:people', sort=comparator.__getitem__)
-        assert sorted(hits.ids) != hits.ids
+        assert sorted(hits.ids) != list(hits.ids)
         hits = indexer.search('text:people', count=5, sort='amendment', reverse=True)
         assert [hit['amendment'] for hit in hits] == ['9', '4', '2', '17', '10']
         hit, = indexer.search('freedom', field='text')
         query = field.within(x, y, 10**4)
         assert len(query) < 3
         distances = indexer.distances(x, y, 'longitude', 'latitude')
-        hits = indexer.search(query, sort=distances.__getitem__)
+        hits = indexer.search(query).sorted(distances.__getitem__)
         assert hits[0]['zipcode'] == zipcode and distances[hits[0].id] < 10
         cities = set(hit['city'] for hit in hits)
         assert city in cities and 100 > len(cities) > 50
         hits = hits.filter(lambda id: distances[id] < 10**4)
         assert 0 < len(hits) < sum(counts.values())
         hits = hits.sorted(distances.__getitem__, reverse=True)
-        assert 0 == distances[hits.ids[-1]] < distances[hits.ids[0]] < 10**4
+        ids = list(hits.ids)
+        assert 0 == distances[ids[-1]] < distances[ids[0]] < 10**4
     
     def testFields(self):
         "Custom fields."
         sizes = dict((id, int(indexer[id]['size'])) for id in indexer)
         ids = sorted((id for id in sizes if sizes[id] >= 1000), key=sizes.get)
         query = engine.Query.range('size', '1000', None)
-        hits = indexer.search(query, sort=sizes.get)
-        assert hits.ids == ids
+        hits = indexer.search(query).sorted(sizes.get)
+        assert list(hits.ids) == ids
         hits = indexer.search(query, count=3, sort=engine.SortField('size', type=long))
-        assert hits.ids == ids[:len(hits)]
+        assert list(hits.ids) == ids[:len(hits)]
         query = engine.Query.range('size', None, '1000')
         assert indexer.count(query) == len(sizes) - len(ids)
         indexer.sorters['year'] = engine.SortField('Y-m-d', type=int, parser=lambda date: int(date.split('-')[0]))
         sizes = dict((id, int(indexer[id]['size'])) for id in indexer)
         ids = sorted((id for id in sizes if sizes[id] >= 1000), key=sizes.get)
         query = field.range(1000, None)
-        hits = indexer.search(query, sort=sizes.get)
-        assert hits.ids == ids
+        hits = indexer.search(query).sorted(sizes.get)
+        assert list(hits.ids) == ids
         hits = indexer.search(query, count=3, sort=engine.SortField('size', type=long))
-        assert hits.ids == ids[:len(hits)]
+        assert list(hits.ids) == ids[:len(hits)]
         query = field.range(None, 1000)
         assert indexer.count(query) == len(sizes) - len(ids)
         self.assertRaises(OverflowError, list, field.items(-2**64))
         with assertRaises(httplib.HTTPException, httplib.BAD_REQUEST):
             resource.get('/search?count=')
         with assertRaises(httplib.HTTPException, httplib.BAD_REQUEST):
-            resource.get('/search', sort='-x,y')
-        with assertRaises(httplib.HTTPException, httplib.BAD_REQUEST):
             resource.get('/search', count=1, sort='x:str')
         with assertRaises(httplib.HTTPException, httplib.BAD_REQUEST):
             resource.get('/search', count=1, group='x:str')
         assert sorted(docs, key=operator.itemgetter('__score__'), reverse=True) == docs
         assert len(docs) == result['count'] == 8
         result = resource.get('/search', q='text:people', count=5)
+        maxscore = result['maxscore']
         assert docs[:5] == result['docs'] and result['count'] == len(docs)
         result = resource.get('/search', q='text:people', count=5, sort='-amendment:int')
         assert math.isnan(result['maxscore']) and all(math.isnan(doc['__score__']) for doc in result['docs'])
         assert [doc['amendment'] for doc in result['docs']] == ['17', '10', '9', '4', '2']
         result = resource.get('/search', q='text:people', sort='-amendment:int')
         assert [doc.get('amendment') for doc in result['docs']] == ['17', '10', '9', '4', '2', '1', None, None]
-        assert result == resource.get('/search', q='text:people', sort='-date,-amendment:int')
-        maxscore = result['maxscore']
-        assert maxscore == max(doc['__score__'] for doc in result['docs'])
         result = resource.get('/search', q='text:people', count=5, sort='-amendment:int', **{'sort.scores': ''})
         assert math.isnan(result['maxscore']) and maxscore in (doc['__score__'] for doc in result['docs'])
         result = resource.get('/search', q='text:people', count=1, sort='-amendment:int', **{'sort.scores': 'max'})
         assert maxscore == result['maxscore'] and maxscore not in (doc['__score__'] for doc in result['docs'])
         result = resource.get('/search', q='text:people', count=5, sort='-article,amendment:int')
         assert [doc.get('amendment') for doc in result['docs']] == [None, None, '1', '2', '4']
-        with assertRaises(httplib.HTTPException, httplib.BAD_REQUEST):
-            resource.get('/search', q='text:people', sort='-article,amendment:int')
         result = resource.get('/search', q='text:people', start=2, count=2, facets='article,amendment')
         assert [doc['amendment'] for doc in result['docs']] == ['10', '1']
         assert result['count'] == sum(sum(facets.values()) for facets in result['facets'].values())