Commits

Ned Batchelder  committed afd13f1

Adopt the faster loop from issue #1. Also fix the quotes in a docstring so doctest works.

  • Participants
  • Parent commits 4f4bc53

Comments (0)

Files changed (1)

         Yields pairs (`name`, `tokentext`).
 
         """
-        while text:
-            eaten = 0
-            for match in self.regexes[self.state].finditer(text):
-                for name, toktext in match.groupdict().iteritems():
-                    if toktext is not None:
-                        tok = self.toks[name]
-                        new_state = tok.next
-                        eaten += len(toktext)
-                        yield (tok.name, toktext)
-                if new_state:
-                    self.state = new_state
+        end = len(text)
+        state = self.state
+        regexes = self.regexes
+        toks = self.toks
+        start = 0
+
+        while start < end:
+            for match in regexes[state].finditer(text, start):
+                name = match.lastgroup
+                tok = toks[name]
+                toktext = match.group(name)
+                start += len(toktext)
+                yield (tok.name, toktext)
+
+                if tok.next:
+                    state = tok.next
                     break
-            text = text[eaten:]
+
+        self.state = state
 
 
 class JsLexer(Lexer):
     
     >>> lexer = JsLexer()
     >>> list(lexer.lex("a = 1"))
-    [("id", "a"), ("ws", " "), ("punct", "="), ("ws", " "), ("dnum", "1")]
+    [('id', 'a'), ('ws', ' '), ('punct', '='), ('ws', ' '), ('dnum', '1')]
 
     This doesn't properly handle non-Ascii characters in the Javascript source.