Commits

Matt Chaput committed 05ffc0a

Added AlphaNumTokenizer and AlphaNumAnalyzer.

  • Parent commits 2a34e3d
  • Branches keyvalue

Files changed (2)

File src/whoosh/analysis/analyzers.py

 from whoosh.analysis.morph import StemFilter
 from whoosh.analysis.intraword import IntraWordFilter
 from whoosh.analysis.tokenizers import default_pattern
+from whoosh.analysis.tokenizers import AlphaNumTokenizer
 from whoosh.analysis.tokenizers import CommaSeparatedTokenizer
 from whoosh.analysis.tokenizers import IDTokenizer
 from whoosh.analysis.tokenizers import RegexTokenizer
                               cachesize=cachesize)
 
 
+def AlphaNumAnalyzer(word_expr=r"([.-]|\w)+", alpha_expr=r"[^\W\d_]+",
+                     num_expr=r"\d+", stoplist=None, lang="en",
+                     minsize=2, maxsize=None):
+    """
+    Composes a :class:`whoosh.analysis.AlphaNumTokenizer` with a
+    :class:`LowercaseFilter` and an optional :class:`StopFilter`.
+
+    :param word_expr: a regular expression to match the "overall word".
+    :param alpha_expr: a regular expression to match runs of letters.
+    :param num_expr: a regular expression to match runs of numbers.
+    :param stoplist: a list of "stop" words. Set this and ``lang`` to None to
+        turn off the stop filter.
+    :param lang: a string specifying a language (default is ``"en"``). This is
+        passed to ``whoosh.lang.stopwords_for_language()`` to get a
+        language-specific list of stop words. Set this and ``stoplist`` to None
+        to turn off the stop filter.
+    :param minsize: Words smaller than this are removed from the stream.
+    :param maxsize: Words longer than this are removed from the stream.
+    """
+
+    ret = (
+        AlphaNumTokenizer(word_expr, alpha_expr, num_expr)
+        | LowercaseFilter()
+    )
+    if stoplist or lang:
+        ret |= StopFilter(stoplist=stoplist, lang=lang, minsize=minsize,
+                          maxsize=maxsize)
+    return ret
+
+
 def FancyAnalyzer(expression=r"\s+", stoplist=STOP_WORDS, minsize=2,
                   maxsize=None, gaps=True, splitwords=True, splitnums=True,
                   mergewords=False, mergenums=False):

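As a quick illustration (a minimal usage sketch, assuming this commit is
applied and the defaults above), the analyzer yields the overall word followed
by its letter and number runs; the StopFilter then drops "5" because it is
shorter than minsize=2:

    from whoosh.analysis.analyzers import AlphaNumAnalyzer

    ana = AlphaNumAnalyzer()
    # LowercaseFilter is a no-op on this input; StopFilter removes the
    # one-character token "5" (below minsize=2).
    print([t.text for t in ana("12-25corrosion5")])
    # ['12-25corrosion5', 'corrosion', '12', '25']
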
File src/whoosh/analysis/tokenizers.py

 # those of the authors and should not be interpreted as representing official
 # policies, either expressed or implied, of Matt Chaput.
 
+from itertools import chain
+
 from whoosh.compat import u, text_type
 from whoosh.analysis import Composable, Token
 from whoosh.util.text import rcompile
                 yield t
 
 
+class AlphaNumTokenizer(Tokenizer):
+    """
+    Tokenizes text using an "overall word" expression, then tokenizes runs of
+    letters and runs of numbers within each word.
+
+    >>> ant = AlphaNumTokenizer()
+    >>> [t.text for t in ant("12-25corrosion5")]
+    ["12-25corrosion5", "corrosion", "12", "25", "5"]
+    """
+
+    def __init__(self, word_expr=r"([-]|\w)+", alpha_expr=r"[^\W\d_]+",
+                 num_expr=r"\d+"):
+        """
+        :param word_expr: a regular expression to match the "overall word".
+        :param alpha_expr: a regular expression to match runs of letters.
+        :param num_expr: a regular expression to match runs of numbers.
+        """
+
+        self.word_expr = rcompile(word_expr)
+        self.alpha_expr = rcompile(alpha_expr)
+        self.num_expr = rcompile(num_expr)
+
+    def __call__(self, value, positions=False, chars=False,
+                 keeporiginal=False, start_pos=0, start_char=0, tokenize=True,
+                 **kwargs):
+        t = Token(positions, chars, **kwargs)
+
+        if not tokenize:
+            t.original = t.text = value
+            if positions:
+                t.pos = start_pos
+            if chars:
+                t.startchar = start_char
+                t.endchar = start_char + len(value)
+            yield t
+            return
+
+        alpha_expr = self.alpha_expr
+        num_expr = self.num_expr
+        pos = start_pos  # begin numbering at the caller-supplied position
+        for word_match in self.word_expr.finditer(value):
+            if positions:
+                t.pos = pos
+            if chars:
+                t.startchar = word_start = start_char + word_match.start()
+                t.endchar = start_char + word_match.end()
+            word_text = word_match.group(0)
+            matches = chain([word_match], alpha_expr.finditer(word_text),
+                            num_expr.finditer(word_text))
+            for i, match in enumerate(matches):
+                # The first match (i == 0) is the overall word, whose offsets
+                # were set above; sub-matches are offset from the word's start.
+                if chars and i > 0:
+                    t.startchar = word_start + match.start()
+                    t.endchar = word_start + match.end()
+                if keeporiginal:
+                    t.original = match.group(0)
+                t.text = match.group(0)
+                t.boost = 1.0
+                t.stopped = False
+                yield t
+
+            pos += 1
+
+
 class CharsetTokenizer(Tokenizer):
     """
     Tokenizes and translates text according to a character mapping object.
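To see how positions and character offsets are assigned, here is a minimal
sketch of the tokenizer on its own (assuming this commit is applied):
sub-tokens share the position of their overall word, while startchar/endchar
point at each run inside the original string.

    from whoosh.analysis.tokenizers import AlphaNumTokenizer

    ant = AlphaNumTokenizer()
    for t in ant("12-25corrosion5", positions=True, chars=True):
        print(t.text, t.pos, t.startchar, t.endchar)
    # 12-25corrosion5 0 0 15
    # corrosion 0 5 14
    # 12 0 0 2
    # 25 0 3 5
    # 5 0 14 15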