whoosh / src / whoosh / highlight.py

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
# Copyright 2008 Matt Chaput. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
#    1. Redistributions of source code must retain the above copyright notice,
#       this list of conditions and the following disclaimer.
#
#    2. Redistributions in binary form must reproduce the above copyright
#       notice, this list of conditions and the following disclaimer in the
#       documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# The views and conclusions contained in the software and documentation are
# those of the authors and should not be interpreted as representing official
# policies, either expressed or implied, of Matt Chaput.

"""The highlight module contains classes and functions for displaying short
excerpts from hit documents in the search results you present to the user, with
query terms highlighted.

The highlighting system has four main elements.

* **Fragmenters** chop up the original text into __fragments__, based on the
  locations of matched terms in the text.

* **Scorers** assign a score to each fragment, allowing the system to rank the
  best fragments by whatever criterion.

* **Order functions** control in what order the top-scoring fragments are
  presented to the user. For example, you can show the fragments in the order
  they appear in the document (FIRST) or show higher-scoring fragments first
  (SCORE)

* **Formatters** turn the fragment objects into human-readable output, such as
  an HTML string.

See :doc:`/highlight` for more information.
"""

from __future__ import division
from collections import deque
from heapq import nlargest

from whoosh.compat import htmlescape
from whoosh.analysis import Token


# The default value for the maximum chars to examine when fragmenting
DEFAULT_CHARLIMIT = 2 ** 15


# Fragment object

def mkfrag(text, tokens, startchar=None, endchar=None,
           charsbefore=0, charsafter=0):
    """Returns a :class:`Fragment` object based on the :class:`analysis.Token`
    objects in ``tokens`.
    """

    if startchar is None:
        startchar = tokens[0].startchar if tokens else 0
    if endchar is None:
        endchar = tokens[-1].endchar if tokens else len(text)

    startchar = max(0, startchar - charsbefore)
    endchar = min(len(text), endchar + charsafter)

    return Fragment(text, tokens, startchar, endchar)


class Fragment(object):
    """Represents a fragment (extract) from a hit document. This object is
    mainly used to keep track of the start and end points of the fragment and
    the "matched" character ranges inside; it does not contain the text of the
    fragment or do much else.

    The useful attributes are:

    ``Fragment.text``
        The entire original text from which this fragment is taken.

    ``Fragment.matches``
        An ordered list of objects representing the matched terms in the
        fragment. These objects have ``startchar`` and ``endchar`` attributes.

    ``Fragment.startchar``
        The index of the first character in the fragment.

    ``Fragment.endchar``
        The index of the last character in the fragment.

    ``Fragment.matched_terms``
        A ``set`` of the ``text`` of the matched terms in the fragment (if
        available).
    """

    def __init__(self, text, matches, startchar=0, endchar= -1):
        """
        :param text: the source text of the fragment.
        :param matches: a list of objects which have ``startchar`` and
            ``endchar`` attributes, and optionally a ``text`` attribute.
        :param startchar: the index into ``text`` at which the fragment starts.
            The default is 0.
        :param endchar: the index into ``text`` at which the fragment ends.
            The default is -1, which is interpreted as the length of ``text``.
        """

        self.text = text
        self.matches = matches

        if endchar == -1:
            endchar = len(text)
        self.startchar = startchar
        self.endchar = endchar

        self.matched_terms = set()
        for t in matches:
            if hasattr(t, "text"):
                self.matched_terms.add(t.text)

    def __repr__(self):
        return "<Fragment %d:%d %d>" % (self.startchar, self.endchar,
                                        len(self.matches))

    def __len__(self):
        return self.endchar - self.startchar

    def overlaps(self, fragment):
        sc = self.startchar
        ec = self.endchar
        fsc = fragment.startchar
        fec = fragment.endchar
        return (sc < fsc < ec) or (sc < fec < ec)

    def overlapped_length(self, fragment):
        sc = self.startchar
        ec = self.endchar
        fsc = fragment.startchar
        fec = fragment.endchar
        return max(ec, fec) - min(sc, fsc)

    def __lt__(self, other):
        return id(self) < id(other)


# Tokenizing

def set_matched_filter(tokens, termset):
    for t in tokens:
        t.matched = t.text in termset
        yield t


# Fragmenters

class Fragmenter(object):
    def must_retokenize(self):
        """Returns True if this fragmenter requires retokenized text.

        If this method returns True, the fragmenter's ``fragment_tokens``
        method  will be called with an iterator of ALL tokens from the text,
        with the tokens for matched terms having the ``matched`` attribute set
        to True.

        If this method returns False, the fragmenter's ``fragment_matches``
        method will be called with a LIST of matching tokens.
        """

        return True

    def fragment_tokens(self, text, all_tokens):
        """Yields :class:`Fragment` objects based on the tokenized text.

        :param text: the string being highlighted.
        :param all_tokens: an iterator of :class:`analysis.Token`
            objects from the string.
        """

        raise NotImplementedError

    def fragment_matches(self, text, matched_tokens):
        """Yields :class:`Fragment` objects based on the text and the matched
        terms.

        :param text: the string being highlighted.
        :param matched_tokens: a list of :class:`analysis.Token` objects
            representing the term matches in the string.
        """

        raise NotImplementedError


class WholeFragmenter(Fragmenter):
    """Doesn't fragment the token stream. This object just returns the entire
    entire stream as one "fragment". This is useful if you want to highlight
    the entire text.

    Note that even if you use the `WholeFragmenter`, the highlight code will
    return no fragment if no terms matched in the given field. To return the
    whole fragment even in that case, call `highlights()` with `minscore=0`::

        # Query where no terms match in the "text" field
        q = query.Term("tag", "new")

        r = mysearcher.search(q)
        r.fragmenter = highlight.WholeFragmenter()
        r.formatter = highlight.UppercaseFormatter()
        # Since no terms in the "text" field matched, we get no fragments back
        assert r[0].highlights("text") == ""

        # If we lower the minimum score to 0, we get a fragment even though it
        # has no matching terms
        assert r[0].highlights("text", minscore=0) == "This is the text field."

    """

    def __init__(self, charlimit=DEFAULT_CHARLIMIT):
        self.charlimit = charlimit

    def fragment_tokens(self, text, tokens):
        charlimit = self.charlimit
        matches = []
        for t in tokens:
            if charlimit and t.endchar > charlimit:
                break
            if t.matched:
                matches.append(t.copy())
        return [Fragment(text, matches)]


# Backwards compatiblity
NullFragmeter = WholeFragmenter


class SentenceFragmenter(Fragmenter):
    """Breaks the text up on sentence end punctuation characters
    (".", "!", or "?"). This object works by looking in the original text for a
    sentence end as the next character after each token's 'endchar'.

    When highlighting with this fragmenter, you should use an analyzer that
    does NOT remove stop words, for example::

        sa = StandardAnalyzer(stoplist=None)
    """

    def __init__(self, maxchars=200, sentencechars=".!?",
                 charlimit=DEFAULT_CHARLIMIT):
        """
        :param maxchars: The maximum number of characters allowed in a
            fragment.
        """

        self.maxchars = maxchars
        self.sentencechars = frozenset(sentencechars)
        self.charlimit = charlimit

    def fragment_tokens(self, text, tokens):
        maxchars = self.maxchars
        sentencechars = self.sentencechars
        charlimit = self.charlimit

        textlen = len(text)
        # startchar of first token in the current sentence
        first = None
        # Buffer for matched tokens in the current sentence
        tks = []
        endchar = None
        # Number of chars in the current sentence
        currentlen = 0

        for t in tokens:
            startchar = t.startchar
            endchar = t.endchar
            if charlimit and endchar > charlimit:
                break

            if first is None:
                # Remember the startchar of the first token in a sentence
                first = startchar
                currentlen = 0

            tlength = endchar - startchar
            currentlen += tlength

            if t.matched:
                tks.append(t.copy())

            # If the character after the current token is end-of-sentence
            # punctuation, finish the sentence and reset
            if endchar < textlen and text[endchar] in sentencechars:
                # Don't break for two periods in a row (e.g. ignore "...")
                if (endchar + 1 < textlen
                    and text[endchar + 1] in sentencechars):
                    continue

                # If the sentence had matches and it's not too long, yield it
                # as a token
                if tks and currentlen <= maxchars:
                    yield mkfrag(text, tks, startchar=first, endchar=endchar)
                # Reset the counts
                tks = []
                first = None
                currentlen = 0

        # If we get to the end of the text and there's still a sentence
        # in the buffer, yield it
        if tks:
            yield mkfrag(text, tks, startchar=first, endchar=endchar)


class ContextFragmenter(Fragmenter):
    """Looks for matched terms and aggregates them with their surrounding
    context.
    """

    def __init__(self, maxchars=200, surround=20, charlimit=DEFAULT_CHARLIMIT):
        """
        :param maxchars: The maximum number of characters allowed in a
            fragment.
        :param surround: The number of extra characters of context to add both
            before the first matched term and after the last matched term.
        """

        self.maxchars = maxchars
        self.surround = surround
        self.charlimit = charlimit

    def fragment_tokens(self, text, tokens):
        maxchars = self.maxchars
        surround = self.surround
        charlimit = self.charlimit

        # startchar of the first token in the fragment
        first = None
        # Stack of startchars
        firsts = deque()
        # Each time we see a matched token, we reset the countdown to finishing
        # the fragment. This also indicates whether we're currently inside a
        # fragment (< 0 not in fragment, >= 0 in fragment)
        countdown = -1
        # Tokens in current fragment
        tks = []
        endchar = None
        # Number of chars in the current fragment
        currentlen = 0

        for t in tokens:
            startchar = t.startchar
            endchar = t.endchar
            tlength = endchar - startchar
            if charlimit and endchar > charlimit:
                break

            if countdown < 0 and not t.matched:
                # We're not in a fragment currently, so just maintain the
                # "charsbefore" buffer
                firsts.append(startchar)
                while firsts and endchar - firsts[0] > surround:
                    firsts.popleft()
            elif currentlen + tlength > maxchars:
                # We're in a fragment, but adding this token would put us past
                # the maximum size. Zero the countdown so the code below will
                # cause the fragment to be emitted
                countdown = 0
            elif t.matched:
                # Start/restart the countdown
                countdown = surround
                # Remember the first char of this fragment
                if first is None:
                    if firsts:
                        first = firsts[0]
                    else:
                        first = startchar
                        # Add on unused front context
                        countdown += surround
                tks.append(t.copy())

            # If we're in a fragment...
            if countdown >= 0:
                # Update the counts
                currentlen += tlength
                countdown -= tlength

                # If the countdown is expired
                if countdown <= 0:
                    # Finish the fragment
                    yield mkfrag(text, tks, startchar=first, endchar=endchar)
                    # Reset the counts
                    tks = []
                    firsts = deque()
                    first = None
                    currentlen = 0

        # If there's a fragment left over at the end, yield it
        if tks:
            yield mkfrag(text, tks, startchar=first, endchar=endchar)


class PinpointFragmenter(Fragmenter):
    """This is a NON-RETOKENIZING fragmenter. It builds fragments from the
    positions of the matched terms.
    """

    def __init__(self, maxchars=200, surround=20, autotrim=False,
                 charlimit=DEFAULT_CHARLIMIT):
        """
        :param maxchars: The maximum number of characters allowed in a
            fragment.
        :param surround: The number of extra characters of context to add both
            before the first matched term and after the last matched term.
        :param autotrim: automatically trims text before the first space and
            after the last space in the fragments, to try to avoid truncated
            words at the start and end. For short fragments or fragments with
            long runs between spaces this may give strange results.
        """

        self.maxchars = maxchars
        self.surround = surround
        self.autotrim = autotrim
        self.charlimit = charlimit

    def must_retokenize(self):
        return False

    def fragment_tokens(self, text, tokens):
        matched = [t for t in tokens if t.matched]
        return self.fragment_matches(text, matched)

    @staticmethod
    def _autotrim(fragment):
        text = fragment.text
        startchar = fragment.startchar
        endchar = fragment.endchar

        firstspace = text.find(" ", startchar, endchar)
        if firstspace > 0:
            startchar = firstspace + 1
        lastspace = text.rfind(" ", startchar, endchar)
        if lastspace > 0:
            endchar = lastspace

        if fragment.matches:
            startchar = min(startchar, fragment.matches[0].startchar)
            endchar = max(endchar, fragment.matches[-1].endchar)

        fragment.startchar = startchar
        fragment.endchar = endchar

    def fragment_matches(self, text, tokens):
        maxchars = self.maxchars
        surround = self.surround
        autotrim = self.autotrim
        charlimit = self.charlimit

        for i, t in enumerate(tokens):
            j = i
            left = t.startchar
            right = t.endchar
            if charlimit and right > charlimit:
                break

            currentlen = right - left
            while j < len(tokens) - 1 and currentlen < maxchars:
                next = tokens[j + 1]
                ec = next.endchar
                if ec - right <= surround and ec - left <= maxchars:
                    j += 1
                    right = ec
                    currentlen += (ec - next.startchar)
                else:
                    break

            left = max(0, left - surround)
            right = min(len(text), right + surround)
            fragment = Fragment(text, tokens[i:j + 1], left, right)
            if autotrim:
                self._autotrim(fragment)
            yield fragment


# Fragment scorers

class FragmentScorer(object):
    pass


class BasicFragmentScorer(FragmentScorer):
    def __call__(self, f):
        # Add up the boosts for the matched terms in this passage
        score = sum(t.boost for t in f.matches)

        # Favor diversity: multiply score by the number of separate
        # terms matched
        score *= (len(f.matched_terms) * 100) or 1

        return score


# Fragment sorters

def SCORE(fragment):
    "Sorts higher scored passages first."
    return 1


def FIRST(fragment):
    "Sorts passages from earlier in the document first."
    return fragment.startchar


def LONGER(fragment):
    "Sorts longer passages first."
    return 0 - len(fragment)


def SHORTER(fragment):
    "Sort shorter passages first."
    return len(fragment)


# Formatters

def get_text(original, token, replace):
    """Convenience function for getting the text to use for a match when
    formatting.

    If ``replace`` is False, returns the part of ``original`` between
    ``token.startchar`` and ``token.endchar``. If ``replace`` is True, returns
    ``token.text``.
    """

    if replace:
        return token.text
    else:
        return original[token.startchar:token.endchar]


class Formatter(object):
    """Base class for formatters.

    For highlighters that return strings, it is usually only necessary to
    override :meth:`Formatter.format_token`.

    Use the :func:`get_text` function as a convenience to get the token text::

        class MyFormatter(Formatter):
            def format_token(text, token, replace=False):
                ttext = get_text(text, token, replace)
                return "[%s]" % ttext
    """

    between = "..."

    def _text(self, text):
        return text

    def format_token(self, text, token, replace=False):
        """Returns a formatted version of the given "token" object, which
        should have at least ``startchar`` and ``endchar`` attributes, and
        a ``text`` attribute if ``replace`` is True.

        :param text: the original fragment text being highlighted.
        :param token: an object having ``startchar`` and ``endchar`` attributes
            and optionally a ``text`` attribute (if ``replace`` is True).
        :param replace: if True, the original text between the token's
            ``startchar`` and ``endchar`` indices will be replaced with the
            value of the token's ``text`` attribute.
        """

        raise NotImplementedError

    def format_fragment(self, fragment, replace=False):
        """Returns a formatted version of the given text, using the "token"
        objects in the given :class:`Fragment`.

        :param fragment: a :class:`Fragment` object representing a list of
            matches in the text.
        :param replace: if True, the original text corresponding to each
            match will be replaced with the value of the token object's
            ``text`` attribute.
        """

        output = []
        index = fragment.startchar
        text = fragment.text

        for t in fragment.matches:
            if t.startchar < index:
                continue
            if t.startchar > index:
                output.append(self._text(text[index:t.startchar]))
            output.append(self.format_token(text, t, replace))
            index = t.endchar
        output.append(self._text(text[index:fragment.endchar]))

        out_string = "".join(output)
        return out_string

    def format(self, fragments, replace=False):
        """Returns a formatted version of the given text, using a list of
        :class:`Fragment` objects.
        """

        formatted = [self.format_fragment(f, replace=replace)
                     for f in fragments]
        return self.between.join(formatted)

    def __call__(self, text, fragments):
        # For backwards compatibility
        return self.format(fragments)


class NullFormatter(Formatter):
    """Formatter that does not modify the string.
    """

    def format_token(self, text, token, replace=False):
        return get_text(text, token, replace)


class UppercaseFormatter(Formatter):
    """Returns a string in which the matched terms are in UPPERCASE.
    """

    def __init__(self, between="..."):
        """
        :param between: the text to add between fragments.
        """

        self.between = between

    def format_token(self, text, token, replace=False):
        ttxt = get_text(text, token, replace)
        return ttxt.upper()


class HtmlFormatter(Formatter):
    """Returns a string containing HTML formatting around the matched terms.

    This formatter wraps matched terms in an HTML element with two class names.
    The first class name (set with the constructor argument ``classname``) is
    the same for each match. The second class name (set with the constructor
    argument ``termclass`` is different depending on which term matched. This
    allows you to give different formatting (for example, different background
    colors) to the different terms in the excerpt.

    >>> hf = HtmlFormatter(tagname="span", classname="match", termclass="term")
    >>> hf(mytext, myfragments)
    "The <span class="match term0">template</span> <span class="match term1">geometry</span> is..."

    This object maintains a dictionary mapping terms to HTML class names (e.g.
    ``term0`` and ``term1`` above), so that multiple excerpts will use the same
    class for the same term. If you want to re-use the same HtmlFormatter
    object with different searches, you should call HtmlFormatter.clear()
    between searches to clear the mapping.
    """

    template = '<%(tag)s class=%(q)s%(cls)s%(tn)s%(q)s>%(t)s</%(tag)s>'

    def __init__(self, tagname="strong", between="...",
                 classname="match", termclass="term", maxclasses=5,
                 attrquote='"'):
        """
        :param tagname: the tag to wrap around matching terms.
        :param between: the text to add between fragments.
        :param classname: the class name to add to the elements wrapped around
            matching terms.
        :param termclass: the class name prefix for the second class which is
            different for each matched term.
        :param maxclasses: the maximum number of term classes to produce. This
            limits the number of classes you have to define in CSS by recycling
            term class names. For example, if you set maxclasses to 3 and have
            5 terms, the 5 terms will use the CSS classes ``term0``, ``term1``,
            ``term2``, ``term0``, ``term1``.
        """

        self.between = between
        self.tagname = tagname
        self.classname = classname
        self.termclass = termclass
        self.attrquote = attrquote
        self.maxclasses = maxclasses
        self.seen = {}
        self.htmlclass = " ".join((self.classname, self.termclass))

    def _text(self, text):
        return htmlescape(text, quote=False)

    def format_token(self, text, token, replace=False):
        seen = self.seen
        ttext = self._text(get_text(text, token, replace))
        if ttext in seen:
            termnum = seen[ttext]
        else:
            termnum = len(seen) % self.maxclasses
            seen[ttext] = termnum

        return self.template % {"tag": self.tagname, "q": self.attrquote,
                                "cls": self.htmlclass, "t": ttext,
                                "tn": termnum}

    def clean(self):
        """Clears the dictionary mapping terms to HTML classnames.
        """
        self.seen = {}


class GenshiFormatter(Formatter):
    """Returns a Genshi event stream containing HTML formatting around the
    matched terms.
    """

    def __init__(self, qname="strong", between="..."):
        """
        :param qname: the QName for the tag to wrap around matched terms.
        :param between: the text to add between fragments.
        """

        self.qname = qname
        self.between = between

        from genshi.core import START, END, TEXT  # @UnresolvedImport
        from genshi.core import Attrs, Stream  # @UnresolvedImport
        self.START, self.END, self.TEXT = START, END, TEXT
        self.Attrs, self.Stream = Attrs, Stream

    def _add_text(self, text, output):
        if output and output[-1][0] == self.TEXT:
            output[-1] = (self.TEXT, output[-1][1] + text, output[-1][2])
        else:
            output.append((self.TEXT, text, (None, -1, -1)))

    def format_token(self, text, token, replace=False):
        qn = self.qname
        txt = get_text(text, token, replace)
        return self.Stream([(self.START, (qn, self.Attrs()), (None, -1, -1)),
                            (self.TEXT, txt, (None, -1, -1)),
                            (self.END, qn, (None, -1, -1))])

    def format_fragment(self, fragment, replace=False):
        output = []
        index = fragment.startchar
        text = fragment.text

        for t in fragment.matches:
            if t.startchar > index:
                self._add_text(text[index:t.startchar], output)
            output.append((text, t, replace))
            index = t.endchar
        if index < len(text):
            self._add_text(text[index:], output)
        return self.Stream(output)

    def format(self, fragments, replace=False):
        output = []
        first = True
        for fragment in fragments:
            if not first:
                self._add_text(self.between, output)
            output += self.format_fragment(fragment, replace=replace)
            first = False
        return self.Stream(output)


# Highlighting

def top_fragments(fragments, count, scorer, order, minscore=1):
    scored_fragments = ((scorer(f), f) for f in fragments)
    scored_fragments = nlargest(count, scored_fragments)
    best_fragments = [sf for score, sf in scored_fragments if score >= minscore]
    best_fragments.sort(key=order)
    return best_fragments


def highlight(text, terms, analyzer, fragmenter, formatter, top=3,
              scorer=None, minscore=1, order=FIRST, mode="query"):

    if scorer is None:
        scorer = BasicFragmentScorer()

    if type(fragmenter) is type:
        fragmenter = fragmenter()
    if type(formatter) is type:
        formatter = formatter()
    if type(scorer) is type:
        scorer = scorer()

    if scorer is None:
        scorer = BasicFragmentScorer()

    termset = frozenset(terms)
    tokens = analyzer(text, chars=True, mode=mode, removestops=False)
    tokens = set_matched_filter(tokens, termset)
    fragments = fragmenter.fragment_tokens(text, tokens)
    fragments = top_fragments(fragments, top, scorer, order, minscore)
    return formatter(text, fragments)


class Highlighter(object):
    def __init__(self, fragmenter=None, scorer=None, formatter=None,
                 always_retokenize=False, order=FIRST):
        self.fragmenter = fragmenter or ContextFragmenter()
        self.scorer = scorer or BasicFragmentScorer()
        self.formatter = formatter or HtmlFormatter(tagname="b")
        self.order = order
        self.always_retokenize = always_retokenize

    def can_load_chars(self, results, fieldname):
        # Is it possible to build a mapping between the matched terms/docs and
        # their start and end chars for "pinpoint" highlighting (ie not require
        # re-tokenizing text)?

        if self.always_retokenize:
            # No, we've been configured to always retokenize some text
            return False
        if not results.has_matched_terms():
            # No, we don't know what the matched terms are yet
            return False
        if self.fragmenter.must_retokenize():
            # No, the configured fragmenter doesn't support it
            return False

        # Maybe, if the field was configured to store characters
        field = results.searcher.schema[fieldname]
        return field.supports("characters")

    def _load_chars(self, results, fieldname, texts, to_bytes):
        # For each docnum, create a mapping of text -> [(startchar, endchar)]
        # for the matched terms

        results._char_cache[fieldname] = cache = {}
        sorted_ids = sorted(docnum for _, docnum in results.top_n)

        for docnum in sorted_ids:
            cache[docnum] = {}

        for text in texts:
            btext = to_bytes(text)
            m = results.searcher.postings(fieldname, btext)
            docset = set(results.termdocs[(fieldname, btext)])
            for docnum in sorted_ids:
                if docnum in docset:
                    m.skip_to(docnum)
                    assert m.id() == docnum
                    cache[docnum][text] = m.value_as("characters")

    def highlight_hit(self, hitobj, fieldname, text=None, top=3, minscore=1):
        results = hitobj.results
        schema = results.searcher.schema
        field = schema[fieldname]
        to_bytes = field.to_bytes
        from_bytes = field.from_bytes

        if text is None:
            if fieldname not in hitobj:
                raise KeyError("Field %r is not stored." % fieldname)
            text = hitobj[fieldname]

        # Get the terms searched for/matched in this field
        if results.has_matched_terms():
            bterms = (term for term in hitobj.matched_terms()
                      if term[0] == fieldname)
        else:
            bterms = results.query_terms(expand=True, fieldname=fieldname)
        # Convert bytes to unicode
        words = frozenset(from_bytes(term[1]) for term in bterms)

        # If we can do "pinpoint" highlighting...
        if self.can_load_chars(results, fieldname):
            # Build the docnum->[(startchar, endchar),] map
            if fieldname not in results._char_cache:
                self._load_chars(results, fieldname, words, to_bytes)

            # Grab the word->[(startchar, endchar)] map for this docnum
            cmap = results._char_cache[fieldname][hitobj.docnum]
            # A list of Token objects for matched words
            tokens = []
            charlimit = self.fragmenter.charlimit
            for word in words:
                chars = cmap[word]
                for pos, startchar, endchar in chars:
                    if charlimit and endchar > charlimit:
                        break
                    tokens.append(Token(text=word, pos=pos,
                                        startchar=startchar, endchar=endchar))
            tokens.sort(key=lambda t: t.startchar)
            fragments = self.fragmenter.fragment_matches(text, tokens)
        else:
            # Retokenize the text
            analyzer = results.searcher.schema[fieldname].analyzer
            tokens = analyzer(text, positions=True, chars=True, mode="query",
                              removestops=False)
            # Set Token.matched attribute for tokens that match a query term
            tokens = set_matched_filter(tokens, words)
            fragments = self.fragmenter.fragment_tokens(text, tokens)

        fragments = top_fragments(fragments, top, self.scorer, self.order,
                                  minscore=minscore)
        output = self.formatter.format(fragments)
        return output
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.