Eric Rochester  committed 71916ce

Updated the Python tokenizer to produce output closer to Penn Treebank (PTB) tokenization.

File bin/

 Token = namedtuple('Token', 'text source raw offset length')
-reTOKEN = re.compile(r"\w+('+\w+)?")
+reTOKEN = re.compile(r"(\w+('+\w+)?)|(\S)")
 def get_files(args):
         for line in fin:
             for match in reTOKEN.finditer(line):
                 raw =
+                if not raw:
+                    raw =
+                if not raw:
+                    continue
                 (start, end) = match.span()
                 yield Token(