Source

whoosh / src / whoosh / query / nested.py

Full commit
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
# Copyright 2012 Matt Chaput. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
#    1. Redistributions of source code must retain the above copyright notice,
#       this list of conditions and the following disclaimer.
#
#    2. Redistributions in binary form must reproduce the above copyright
#       notice, this list of conditions and the following disclaimer in the
#       documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# The views and conclusions contained in the software and documentation are
# those of the authors and should not be interpreted as representing official
# policies, either expressed or implied, of Matt Chaput.

from whoosh import matching
from whoosh.compat import text_type, u, xrange
from whoosh.query import qcore
from whoosh.query.wrappers import WrappingQuery


class NestedParent(WrappingQuery):
    """A query that allows you to search for "nested" documents, where you can
    index (possibly multiple levels of) "parent" and "child" documents using
    the :meth:`~whoosh.writing.IndexWriter.group` and/or
    :meth:`~whoosh.writing.IndexWriter.start_group` methods of a
    :class:`whoosh.writing.IndexWriter` to indicate that hierarchically related
    documents should be kept together::

        schema = fields.Schema(type=fields.ID, text=fields.TEXT(stored=True))

        with ix.writer() as w:
            # Say we're indexing chapters (type=chap) and each chapter has a
            # number of paragraphs (type=p)
            with w.group():
                w.add_document(type="chap", text="Chapter 1")
                w.add_document(type="p", text="Able baker")
                w.add_document(type="p", text="Bright morning")
            with w.group():
                w.add_document(type="chap", text="Chapter 2")
                w.add_document(type="p", text="Car trip")
                w.add_document(type="p", text="Dog eared")
                w.add_document(type="p", text="Every day")
            with w.group():
                w.add_document(type="chap", text="Chapter 3")
                w.add_document(type="p", text="Fine day")

    The ``NestedParent`` query wraps two sub-queries: the "parent query"
    matches a class of "parent documents". The "sub query" matches nested
    documents you want to find. For each "sub document" the "sub query" finds,
    this query acts as if it found the corresponding "parent document".

    >>> with ix.searcher() as s:
    ...   r = s.search(query.Term("text", "day"))
    ...   for hit in r:
    ...     print(hit["text"])
    ...
    Chapter 2
    Chapter 3
    """

    def __init__(self, parents, subq, per_parent_limit=None, score_fn=sum):
        """
        :param parents: a query, DocIdSet object, or Results object
            representing the documents you want to use as the "parent"
            documents. Where the sub-query matches, the corresponding document
            in these results will be returned as the match.
        :param subq: a query matching the information you want to find.
        :param per_parent_limit: a maximum number of "sub documents" to search
            per parent. The default is None, meaning no limit.
        :param score_fn: a function to use to combine the scores of matching
            sub-documents to calculate the score returned for the parent
            document. The default is ``sum``, that is, add up the scores of the
            sub-documents.
        """

        self.parents = parents
        self.child = subq
        self.per_parent_limit = per_parent_limit
        self.score_fn = score_fn

    def normalize(self):
        p = self.parents
        if isinstance(p, qcore.Query):
            p = p.normalize()
        q = self.child.normalize()

        if p is qcore.NullQuery or q is qcore.NullQuery:
            return qcore.NullQuery

        return self.__class__(p, q)

    def requires(self):
        return self.child.requires()

    def matcher(self, searcher, context=None):
        bits = searcher._filter_to_comb(self.parents)
        if not bits:
            return matching.NullMatcher
        m = self.child.matcher(searcher, context)
        if not m.is_active():
            return matching.NullMatcher

        return self.NestedParentMatcher(bits, m, self.per_parent_limit,
                                        searcher.doc_count_all())

    def deletion_docs(self, searcher):
        bits = searcher._filter_to_comb(self.parents)
        if not bits:
            return

        m = self.child.matcher(searcher, searcher.boolean_context())
        maxdoc = searcher.doc_count_all()
        while m.is_active():
            docnum = m.id()
            parentdoc = bits.before(docnum + 1)
            nextparent = bits.after(docnum) or maxdoc
            for i in xrange(parentdoc, nextparent):
                yield i
            m.skip_to(nextparent)

    class NestedParentMatcher(matching.Matcher):
        def __init__(self, comb, child, per_parent_limit, maxdoc):
            self.comb = comb
            self.child = child
            self.per_parent_limit = per_parent_limit
            self.maxdoc = maxdoc

            self._nextdoc = None
            if self.child.is_active():
                self._gather()

        def is_active(self):
            return self._nextdoc is not None

        def supports_block_quality(self):
            return False

        def _gather(self):
            # This is where the magic happens ;)
            child = self.child
            pplimit = self.per_parent_limit

            # The next document returned by this matcher is the parent of the
            # child's current document. We don't have to worry about whether
            # the parent is deleted, because the query that gave us the parents
            # wouldn't return deleted documents.
            self._nextdoc = self.comb.before(child.id() + 1)
            # The next parent after the child matcher's current document
            nextparent = self.comb.after(child.id()) or self.maxdoc

            # Sum the scores of all matching documents under the parent
            count = 1
            score = 0
            while child.is_active() and child.id() < nextparent:
                if pplimit and count > pplimit:
                    child.skip_to(nextparent)
                    break

                score += child.score()
                child.next()
                count += 1

            self._nextscore = score

        def id(self):
            return self._nextdoc

        def score(self):
            return self._nextscore

        def reset(self):
            self.child.reset()
            self._gather()

        def next(self):
            if self.child.is_active():
                self._gather()
            else:
                if self._nextdoc is None:
                    raise matching.ReadTooFar
                else:
                    self._nextdoc = None

        def skip_to(self, id):
            self.child.skip_to(id)
            self._gather()

        def value(self):
            raise NotImplementedError(self.__class__)

        def spans(self):
            return []


class NestedChildren(WrappingQuery):
    """This is the reverse of a :class:`NestedParent` query: instead of taking
    a query that matches children but returns the parent, this query matches
    parents but returns the children.

    This is useful, for example, to search for an album title and return the
    songs in the album::

        schema = fields.Schema(type=fields.ID(stored=True),
                               album_name=fields.TEXT(stored=True),
                               track_num=fields.NUMERIC(stored=True),
                               track_name=fields.TEXT(stored=True),
                               lyrics=fields.TEXT)
        ix = RamStorage().create_index(schema)

        # Indexing
        with ix.writer() as w:
            # For each album, index a "group" of a parent "album" document and
            # multiple child "track" documents.
            with w.group():
                w.add_document(type="album",
                               artist="The Cure", album_name="Disintegration")
                w.add_document(type="track", track_num=1,
                               track_name="Plainsong")
                w.add_document(type="track", track_num=2,
                               track_name="Pictures of You")
                # ...
            # ...


        # Find songs where the song name has "heaven" in the title and the
        # album the song is on has "hell" in the title
        qp = QueryParser("lyrics", ix.schema)
        with ix.searcher() as s:
            # A query that matches all parents
            all_albums = qp.parse("type:album")

            # A query that matches the parents we want
            albums_with_hell = qp.parse("album_name:hell")

            # A query that matches the desired albums but returns the tracks
            songs_on_hell_albums = NestedChildren(all_albums, albums_with_hell)

            # A query that matches tracks with heaven in the title
            songs_with_heaven = qp.parse("track_name:heaven")

            # A query that finds tracks with heaven in the title on albums
            # with hell in the title
            q = query.And([songs_on_hell_albums, songs_with_heaven])

    """

    def __init__(self, parents, subq, boost=1.0):
        self.parents = parents
        self.child = subq
        self.boost = boost

    def matcher(self, searcher, context=None):
        bits = searcher._filter_to_comb(self.parents)
        if not bits:
            return matching.NullMatcher

        m = self.child.matcher(searcher, context)
        if not m.is_active():
            return matching.NullMatcher

        return self.NestedChildMatcher(bits, m, searcher.doc_count_all(),
                                       searcher.reader().is_deleted,
                                       boost=self.boost)

    class NestedChildMatcher(matching.WrappingMatcher):
        def __init__(self, parent_comb, wanted_parent_matcher, limit,
                     is_deleted, boost=1.0):
            self.parent_comb = parent_comb
            self.wanted_parent_matcher = wanted_parent_matcher
            self.limit = limit
            self.is_deleted = is_deleted
            self.boost = boost
            self._nextchild = -1
            self._nextparent = -1
            self._find_next_children()

        def __repr__(self):
            return "%s(%r, %r)" % (self.__class__.__name__,
                                   self.parent_comb,
                                   self.wanted_parent_matcher)

        def reset(self):
            self.child.reset()
            self._reset()

        def _reset(self):
            self._nextchild = -1
            self._nextparent = -1
            self._find_next_children()

        def is_active(self):
            return self._nextchild < self._nextparent

        def replace(self, minscore):
            return self

        def _find_next_children(self):
            # "comb" contains the doc IDs of all parent documents
            comb = self.parent_comb
            # "m" is the matcher for "wanted" parents
            m = self.wanted_parent_matcher
            # Last doc ID + 1
            limit = self.limit
            # A function that returns True if a doc ID is deleted
            is_deleted = self.is_deleted
            nextchild = self._nextchild
            nextparent = self._nextparent

            while m.is_active():
                # Move the "child id" to the document after the current match
                nextchild = m.id() + 1
                # Move the parent matcher to the next match
                m.next()

                # Find the next parent document (matching or not) after this
                nextparent = comb.after(nextchild)
                if nextparent is None:
                    nextparent = limit

                # Skip any deleted child documents
                while is_deleted(nextchild):
                    nextchild += 1

                # If skipping deleted documents put us to or past the next
                # parent doc, go again
                if nextchild >= nextparent:
                    continue
                else:
                    # Otherwise, we're done
                    break

            self._nextchild = nextchild
            self._nextparent = nextparent

        def id(self):
            return self._nextchild

        def all_ids(self):
            while self.is_active():
                yield self.id()
                self.next()

        def next(self):
            is_deleted = self.is_deleted
            limit = self.limit
            nextparent = self._nextparent

            # Go to the next document
            nextchild = self._nextchild
            nextchild += 1

            # Skip over any deleted child documents
            while nextchild < nextparent and is_deleted(nextchild):
                nextchild += 1

            self._nextchild = nextchild
            # If we're at or past the next parent doc, go to the next set of
            # children
            if nextchild >= limit:
                return
            elif nextchild >= nextparent:
                self._find_next_children()

        def skip_to(self, docid):
            comb = self.parent_comb
            wpm = self.wanted_parent_matcher

            # self._nextchild is the "current" matching child ID
            if docid <= self._nextchild:
                return

            # self._nextparent is the next parent ID (matching or not)
            if docid < self._nextparent:
                # Just iterate
                while self.is_active() and self.id() < docid:
                    self.next()
            else:
                # Find the parent before the target ID
                pid = comb.before(docid)
                # Skip the parent matcher to that ID
                wpm.skip_to(pid)
                # If that made the matcher inactive, then we're done
                if not wpm.is_active():
                    self._nextchild = self._nextparent = self.limit
                else:
                    # Reestablish for the next child after the next matching
                    # parent
                    self._find_next_children()

        def value(self):
            raise NotImplementedError(self.__class__)

        def score(self):
            return self.boost

        def spans(self):
            return []