Commits

Matt Chaput committed af7ae10

Added TeeFilter.

  • Participants
  • Parent commits 0fa1c53

Comments (0)

Files changed (2)

src/whoosh/analysis.py

         return filter(chain([t], tokens))
         
 
class TeeFilter(Filter):
    """Interleaves the results of two or more filters (or filter chains).
    
    >>> target = "ALFA BRAVO CHARLIE"
    >>> # In one branch, we'll lower-case the tokens
    >>> f1 = LowercaseFilter()
    >>> # In the other branch, we'll reverse the tokens
    >>> f2 = ReverseTextFilter()
    >>> ana = RegexTokenizer(r"\S+") | TeeFilter(f1, f2)
    >>> [token.text for token in ana(target)]
    ["alfa", "AFLA", "bravo", "OVARB", "charlie", "EILRAHC"]
    
    To combine the incoming token stream with the output of a filter chain, use
    ``TeeFilter`` and make one of the filters a :class:`PassFilter`.
    
    >>> f1 = PassFilter()
    >>> f2 = BiWordFilter()
    >>> ana = RegexTokenizer(r"\S+") | TeeFilter(f1, f2) | LowercaseFilter()
    >>> [token.text for token in ana(target)]
    ["alfa", "alfa-bravo", "bravo", "bravo-charlie", "charlie"]
    """
    
    def __init__(self, *filters):
        """
        :param filters: two or more filter instances (or filter chains).
            The incoming token stream is duplicated, fed through each filter
            separately, and the outputs are interleaved round-robin.
        :raises Exception: if fewer than two filters are given.
        """
        if len(filters) < 2:
            raise Exception("TeeFilter requires two or more filters")
        self.filters = filters
    
    def __eq__(self, other):
        # Fixed: the original compared against ``other.fitlers`` (typo),
        # which raised AttributeError instead of testing equality.
        return (self.__class__ is other.__class__
                and self.filters == other.filters)
    
    def __call__(self, tokens):
        from itertools import tee
        
        count = len(self.filters)
        # Tee the token iterator and wrap each teed iterator with the
        # corresponding filter. Each branch receives copies of the tokens
        # so mutations in one branch don't leak into the others.
        # (Renamed the loop variable from ``filter`` to avoid shadowing the
        # builtin.)
        gens = [f(t.copy() for t in gen) for f, gen
                in zip(self.filters, tee(tokens, count))]
        # Round-robin over the branch generators, yielding one token from
        # each in turn; exhausted branches are marked None and skipped.
        running = count
        while running:
            for i, gen in enumerate(gens):
                if gen is not None:
                    try:
                        yield next(gen)
                    except StopIteration:
                        gens[i] = None
                        running -= 1
+
+
 class ReverseTextFilter(Filter):
     """Reverses the text of each token.
     
 
 
 
+
+
+
+
+

tests/test_analysis.py

     assert_equal([t.text for t in ana(text, mode="a")], ["alfa", "bravo", "charlie"])
     assert_equal([t.text for t in ana(text, mode="b")], ["ALFA", "BRAVO", "CHARLIE"])
 
def test_tee_filter():
    target = u("Alfa Bravo Charlie")
    
    # Branch 1 lower-cases tokens; branch 2 reverses their text.
    lower = analysis.LowercaseFilter()
    rev = analysis.ReverseTextFilter()
    analyzer = analysis.RegexTokenizer(r"\S+") | analysis.TeeFilter(lower, rev)
    assert_equal(" ".join(t.text for t in analyzer(target)),
                 "alfa aflA bravo ovarB charlie eilrahC")
    
    # A branch may itself be a chain of filters.
    class ucfilter(analysis.Filter):
        def __call__(self, tokens):
            for tk in tokens:
                tk.text = tk.text.upper()
                yield tk
    
    chain = analysis.ReverseTextFilter() | ucfilter()
    analyzer = analysis.RegexTokenizer(r"\S+") | analysis.TeeFilter(lower, chain)
    assert_equal(" ".join(t.text for t in analyzer(target)),
                 "alfa AFLA bravo OVARB charlie EILRAHC")
    
    # PassFilter keeps the original stream so it can be merged with the
    # output of a BiWordFilter branch.
    analyzer = (analysis.RegexTokenizer(r"\S+")
                | analysis.TeeFilter(analysis.PassFilter(),
                                     analysis.BiWordFilter())
                | analysis.LowercaseFilter())
    assert_equal(" ".join(t.text for t in analyzer(target)),
                 "alfa alfa-bravo bravo bravo-charlie charlie")
+
 def test_intraword():
     iwf = analysis.IntraWordFilter(mergewords=True, mergenums=True)
     ana = analysis.RegexTokenizer(r"\S+") | iwf