david_walker committed 05246e9

rename parser and token modules
expand handling of nn-years-old type expressions

Files changed (10)

-This project requires WAF to build and run.
-
-http://waf.googlecode.com
-
-To build, run:
-
-waf build
-
-To execute, run:
-
-python gkea.py
+This project requires DELPH-IN technology, namely the PET parser and the English Resource Grammar.  It also requires the TnT (Trigrams'n'Tags) part-of-speech tagger.
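+
+The "cheap" parser binary is launched automatically in XML-RPC server
+mode the first time a parse is requested.  A sketch of the defaults it
+is launched with (hard-coded in keaparser.Parser, and likely to need
+adjusting per install):
+
+    CHEAP_BIN = '/usr/local/bin/cheap'
+    CHEAP_ARGS = ('-v=9 -nsolutions=1 -tok=pic_counts -default-les -packing '
+                  '-results=1 -server /home/david/delphin/erg/english.grm')
+    SERVER_URL = u'http://localhost:4711/cheap-rpc2'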
 import codecs
 import logging
 import sys
+import re
 
 from clipboard import get_clipboard_text, set_clipboard_text
-from mytoken import Token
+from keatoken import Token
 import rules
 import tagger
-import myparser
+import keaparser
 
 
 class Sentence(object):
         sentence_idx = self.start
         match_index = 0
         while sentence_idx < self.end:
-            if self.ea.tokens[sentence_idx].str != match_list[match_index]:
+            if not re.match(match_list[match_index],
+                            self.ea.tokens[sentence_idx].str):
                 match_index = 0
             else:
                 match_index += 1
         self.tokens = [Token(self._original_text, 0,
                               len(self._original_text)), eof_token]
         self.sentences = []
-        self._parser = myparser.Parser()
+        self._parser = keaparser.Parser()
         # apply first phase rules to replace the original Token object
         # with multiple Token objects, one for each bit of the input
         # text that qualifies as a single input token.
 
             if append_space:
                 self.edited_text += u' '
+        self.edited_text = self.edited_text.strip()
 
     def _parse_sentences(self):
         # for each range of tokens representing a sentence, generate a
     def dump_pos_tags(self):
         """Write every token with a Part-Of-Speech tag to stdout."""
         for token in self.tokens:
-            if hasattr(token, 'pos') and token.pos:
+            if token.pos:
                 if len(token.pos) > 1:
                     sys.stdout.write(token.str + u'/[')
                     first_element = True
                         if not first_element:
                             sys.stdout.write(', ')
                         sys.stdout.write(postag.pos)
-                    sys.stdout.write(']\n')
+                        first_element = False
+                    sys.stdout.write('] ')
                 else:
-                    print u'{}/{}'.format(token.str, token.pos[0]),
+                    sys.stdout.write(u'{}/{} '.format(token.str, token.pos[0].pos))
             if token.str == '\n':
-                print
+                sys.stdout.write('\n')
+        if self.tokens:
+            sys.stdout.write('\n')
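+    # Illustrative output for a fully tagged sentence (the tag names come
+    # from the external POS tagger; the Penn-style tags shown here are an
+    # assumption, not part of this module):
+    #   She/PRP is/VBZ a/DT 47-year-old/JJ farmer/NN ./.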
 
 
 def parse_commandline():

keaparser.py

+#!/usr/bin/env python
+"""
+Interface to external parser.
+"""
+
+import subprocess
+import xmlwitch
+import xmlrpclib
+import time
+import os.path
+import logging
+import shlex
+
+from keatoken import Token
+
+
+class Parser(object):
+    CHEAP_BIN = '/usr/local/bin/cheap'
+    CHEAP_ARGS = ('-v=9 -nsolutions=1 -tok=pic_counts -default-les -packing '
+                  '-results=1 -server /home/david/delphin/erg/english.grm')
+    SERVER_URL = u'http://localhost:4711/cheap-rpc2'
+    PIC_FILE = 'pic.xml'
+    MAX_ALIVE_CHECKS = 10
+
+    def __init__(self):
+        self._server = xmlrpclib.ServerProxy(Parser.SERVER_URL)
+
+    def _build_tree(self, root, it, tokens):
+        # TODO: handle embedded parens
+        s = ''
+        for c in it:
+            if c == '(':
+                if s:
+                    root += self._make_list(s, tokens)
+                    s = ''
+                # create a new child of current parent
+                root.append(self._build_tree([], it, tokens))
+            elif c != ')':
+                s += c
+            else:  # c == ')'
+                if s:
+                    root += self._make_list(s, tokens)
+                break
+        return root
+
+    def _create_pet_input_chart(self, tokens):
+        """Encode the tokens as an XML document that the 'cheap' parser
+        can understand.
+        """
+        xml = xmlwitch.Builder(version='1.0', encoding='utf-8')
+        with xml.pet_input_chart:
+            i = 1
+            cpos = 1
+            for token in tokens:
+                if token.non_printing or token.is_para:
+                    continue
+                with xml.w(id='W' + str(i), cstart=str(cpos),
+                           cend=str(cpos + len(token.str))):
+                    xml.surface(token.str)
+                    for pos_tag in token.pos:
+                        xml.pos(None, tag=pos_tag.pos, prio=pos_tag.prob)
+                cpos += len(token.str) + 1
+                i += 1
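+        # For the two tokens u'She' and u'is' this yields, roughly (the
+        # tag and prio values depend on what the tagger stored on each
+        # token):
+        #   <pet-input-chart>
+        #     <w id="W1" cstart="1" cend="4">
+        #       <surface>She</surface>
+        #       <pos tag="PRP" prio="1.0"/>
+        #     </w>
+        #     <w id="W2" cstart="5" cend="7">
+        #       <surface>is</surface>
+        #       <pos tag="VBZ" prio="1.0"/>
+        #     </w>
+        #   </pet-input-chart>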
+        return unicode(xml).replace('pet_input_chart', 'pet-input-chart')
+
+    def _check_server(self):
+        """Attempt an XML-RPC call to check on the status of the cheap
+        parser server; if it does not respond, try to start it."""
+        try:
+            if self._server.cheap.alive():
+                pass
+        except:
+            self._start_server()
+
+    def _make_list(self, s, tokens):
+        parse_data = shlex.split(s)
+        if (len(parse_data) >= 3 and
+            parse_data[1][0].isdigit() and
+            parse_data[2][0] == '"'):
+            # parse_data was created from a string of the form
+            # '"is" 2 "\"is\""' or
+            # '"so as" 4 "\"so\"" 5 "\"as\""'
+            # the integer elements are 1-based indexes of
+            # tokens. These are leaf nodes of the parse tree.
+            return [tokens[int(i) - 1] for i in
+                    [n for n in parse_data[1:] if n[0].isdigit()]]
+        if len(parse_data) == 5:
+            # parse_data is the result of splitting a string of this form:
+            # '4406 subjh 5.1677 0 8'
+            # extract the second element, which is a lexical or syntactic
+            # rule name.
+            return [parse_data[1]]
+        return parse_data
+
+    def _start_server(self):
+        """Start the PET cheap parser in XML-RPC server mode and wait
+        for it to acknowledge the cheap.alive() call."""
+        # this starts cheap as a child process whose stdout and stderr
+        # go to new pipes rather than to this process' stdout and
+        # stderr. It will continue to run even after this process exits.
+        logging.debug('starting cheap server')
+        subprocess.Popen([Parser.CHEAP_BIN] + Parser.CHEAP_ARGS.split(),
+                         stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+        server_alive = False
+        attempts = 0
+        while not server_alive and attempts < Parser.MAX_ALIVE_CHECKS:
+            logging.debug('sleeping before checking cheap server')
+            time.sleep(5)
+            try:
+                logging.debug('checking cheap server')
+                if self._server.cheap.alive():
+                    server_alive = True
+                    logging.debug('cheap server is alive')
+            except:
+                attempts += 1
+                logging.debug('cheap server not alive')
+        if not server_alive:
+            logging.debug(
+                'failed to start server at {} after {} attempts'.format(
+                    Parser.CHEAP_BIN, attempts))
+
+    def parse(self, tokens):
+        logging.debug(u"parsing %s", tokens)
+        self._check_server()
+        # create an XML DOM object that represents the tagged tokens to parse
+        pic = self._create_pet_input_chart(tokens)
+        if not pic:
+            return None
+        # write it to a file to serve as input to the 'cheap' PET parser
+        pic_filename = os.path.realpath(Parser.PIC_FILE)
+        with open(pic_filename, 'w') as outfile:
+            outfile.write(str(pic))
+            # cheap requires two blank lines at end or it faults
+            outfile.write('\n\n')
+        start_time = time.time()
+        analysis = self._server.cheap.analyze(pic_filename)
+        logging.debug('analyzed {} tokens in {:.2f}s'.format(
+                len(tokens), time.time() - start_time))
+        # cheap.analyze returns a string, which contains a tree
+        # structure built of nested parenthesis. Given that string,
+        # build a tree structure of nested lists of strings.
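+        # An assumed sketch of that string, composed from the leaf and
+        # rule forms documented in _make_list:
+        #   (4406 subjh 5.1677 0 8
+        #     (... bare_npq ... ("she" 1 "\"she\""))
+        #     (... hcomp ... ("is" 2 "\"is\"") ...))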
+        root = []
+        self._build_tree(
+            root, iter(analysis['readings'][0]['derivation']), tokens)
+        # pp = pprint.PrettyPrinter()
+        # pp.pprint(root[0])
+        # Finally, transform the tree of strings into a tree of ParseNode
+        # objects.
+        pn = ParseNode(None, root[0])
+        pn.pprint()
+        return pn
+
+
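+# A sketch of typical use within this project (`tagged_tokens` stands in
+# for a list of POS-tagged keatoken.Token objects):
+#     parser = Parser()
+#     tree = parser.parse(tagged_tokens)
+#     leaf_parent = tree.node_from_token(tagged_tokens[0])
+#     rule_name = leaf_parent.parent().name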
+class ParseNode(object):
+    """ Represent a parse tree, with convenient traversal methods.
+    """
+    def __init__(self, parent, parse_list):
+        # create a tree of ParseNodes from the parse_list
+        self._parent = parent
+        self.name = parse_list[0]
+        if isinstance(parse_list[1][0], Token):
+            self.children = parse_list[1]
+        else:
+            self.children = []
+            for p in parse_list[1:]:
+                self.children.append(ParseNode(self, p))
+
+    def _pprint(self, indent):
+        logging.debug((u' ' * indent) + self.name)
+        for child in self.children:
+            if isinstance(child, Token):
+                logging.debug((u' ' * (indent + 2)) + unicode(child))
+            else:
+                child._pprint(indent + 2)
+
+    def node_from_token(self, token):
+        """Return the ParseNode whose child is `token`."""
+        # if children list is a leaf node, i.e. a Token, then either
+        # this is the parent being sought or `token` doesn't lie in this
+        # branch of the parse tree.
+        if isinstance(self.children[0], Token):
+            if self.children[0] == token:
+                return self
+            return None
+
+        # the child list is non-leaf ParseNodes, so recurse into each of
+        # them to find `token`. It is convenient to write this as a
+        # depth-first search, however since every token of the input
+        # should appear exactly once in the set of parse trees, the
+        # order of search is unimportant.
+        for child in self.children:
+            node = child.node_from_token(token)
+            if node:
+                return node
+        return None
+
+    def parent(self, generation=1):
+        """Return the specified generation of ancestor of this node.
+        Generation 1 is immediate parent, 2 is grandparent, etc.
+        """
+        assert(generation >= 1)
+        p = self
+        while generation > 0:
+            p = p._parent
+            generation -= 1
+        return p
+
+    def pprint(self):
+        self._pprint(0)

keatoken.py

+#!/usr/bin/env python
+"""
+The Token class, which is used to contain input tokens (that are later
+edited by transforms).
+
+A Token object represents some bit of text and has additional
+properties describing that text.
+
+Initially there is only one token object which contains the entire blob
+of text that appeared in the original input.
+
+After some Transforms have been applied, that original token will have
+been replaced by a number of Tokens; eventually after processing is
+complete each Token will represent a small element like an individual
+word or punctuation mark.
+
+This module is named "keatoken" rather than "token" to avoid overriding
+the Python standard library module named "token".
+"""
+
+import re
+import pycountry
+import unicodedata
+from abc import ABCMeta, abstractmethod
+import logging
+
+import tagger
+
+class AbbrevInfo(object):
+    """Hold information about an abbreviation."""
+    def __init__(self, regex_str, normal_form=None):
+        """Compile regex and store normal_form.
+
+        `regex` - a regular expression string.
+        `normal_form` - normal way the abbreviation appears.
+
+        Usually the regex exists just to have a case-insensitive way to
+        match, but some are more complicated than that.
+
+        The normal form has the standard capitalization of the
+        abbreviation and is substituted for the input text if it is
+        specified.
+        """
+        self.regex = re.compile(regex_str, re.I | re.U)
+        self.normal_form = normal_form
+
+
+class Token(object):
+    """Contains a portion of text, either part of the original input
+    or generated later, as well as properties describing it.
+
+    Token objects should only be modified by Transform objects. They
+    should not modify themselves.
+
+    This keeps all the token-modifying code in one place, as well as
+    providing a mechanism to resolve conflicts between multiple bits of
+    code that might both want to touch the same Token object.
+    """
+    abbreviations = [
+        AbbrevInfo(ur'e\.g\.'),
+        AbbrevInfo(ur'i\.e\.'),
+        AbbrevInfo(ur'etc\.'),
+        AbbrevInfo(ur'mr\.', u'Mr.'),
+        AbbrevInfo(ur'mrs\.', u'Mrs.'),
+        AbbrevInfo(ur'ksh\.', u'KES'),
+        AbbrevInfo(ur'kes\.', u'KES'),
+        AbbrevInfo(ur'ltd\.', u'Ltd.'),
+        AbbrevInfo(ur's\.a\.l(\.)?', u's.a.l.'),
+        AbbrevInfo(ur'u\.s\.s\.r\.', u'U.S.S.R.')]
+
+    _currency_terms = [
+        u'$',
+        u'dollar',
+        u'dollars',
+        u'/=',
+        u'peso',
+        u'pesos',
+        u'shilling',
+        u'shillings']
+
+    ordinal_res = [
+        re.compile(ur'^([0-9,]*[02-9]){0,1}1st$', re.I | re.U),
+        re.compile(ur'^([0-9,]*[02-9]){0,1}2nd$', re.I | re.U),
+        re.compile(ur'^([0-9,]*[02-9]){0,1}3rd$', re.I | re.U),
+        re.compile(ur'^[04-9]th$', re.I | re.U),
+        re.compile(ur'^[0-9,]*1[0-9]th$',  re.I | re.U),
+    ]
+
+    has_digits_re = re.compile(ur'.*\d+.*', re.U)
+
+    is_alpha_re = re.compile(ur'^\w+$', re.I | re.U)
+
+    # recognizes a decimal number with comma-delimited thousands groups
+    delimited_decimal_re = re.compile(
+        ur"""^            # start of string
+             [1-9]        # nonzero leading digit
+             [0-9]{,2}    # up to two more leading digits
+             (,[0-9]{3})* # any number of comma-delimited groups of 3 digits
+             (\.[0-9]+)?  # optional decimal followed by one or more digits
+             $            # end of string
+          """,
+        re.U | re.X)
+
+    # recognizes an integer with comma-delimited thousands groups
+    delimited_integer_re = re.compile(
+        ur"""^            # start of string
+             [0-9]{1,3}   # one to three leading digits
+             (,[0-9]{3})* # any number of comma-delimited groups of 3 digits
+             $            # end of string
+          """,
+        re.U | re.X)
+
+    url_re = re.compile(
+        """(\w+\.)+     # one or more dot-delimited words
+           # one of the TLDs that appear in Kiva loans
+           (com|edu|gov|info|mil|net|org|tj)
+           (\S*)        # any amount of non-space chars """,
+        re.I | re.U | re.VERBOSE)
+
+    def __init__(self, s, cbegin=None, cend=None):
+        """Initialize from text.
+
+        Arguments:
+        s -- unicode string
+
+        cbegin -- index into original text for the start of the initial
+        value of this token
+
+        cend -- index into original text of the character just past the
+        end of the initial value of this token.
+
+        It is frequently true that at initialization cend = cbegin +
+        len(s), but since cbegin and cend are offsets into the original
+        input text, they are held invariant as s changes.
+        """
+        # Note we use the setter here which initializes the cache.
+        self.str = s
+        self.cbegin = cbegin
+        self.cend = cend
+        self.pos = tagger.PosContainer()
+        assert(cbegin == None and cend == None or self.cend >= self.cbegin)
+
+    def __repr__(self):
+        """Return a string representation of this object suitable for
+        debugging output.
+        """
+        escaped_str = self._str.replace(u'\n', u'\\n')
+        if self.cbegin == None and self.cend == None:
+            r = u'<' + escaped_str + u'>'
+        else:
+            r = u'<{} {}:{}>'.format(escaped_str, self.cbegin, self.cend)
+        # Python 2.x requires that __repr__ return an ascii string.
+        # Python 3.x requires that it return a unicode string.
+        return r.encode(encoding='iso-8859-15', errors='replace')
+
+    def __str__(self):
+        return self.__repr__()
+
+    def __eq__(self, other):
+        if isinstance(other, basestring):
+            return self._str == other
+        assert(isinstance(other, Token))
+        for key, val in self.__dict__.items():
+            if getattr(other, key, None) != val:
+                return False
+        return True
+
+    def __ne__(self, other):
+        return not self.__eq__(other)
+
+    def _reset_cache(self):
+        self._abbrev_checked = False
+        self._abbrev_match = None
+        self._abbrev_match_len = 0
+        self.sentence_delim = None
+        self.eof = None
+        self._URL_checked = False
+        self._is_URL = None
+
+    @property
+    def str(self):
+        return self._str
+
+    @str.setter
+    def str(self, new_value):
+        self._str = unicode(new_value)
+        self._reset_cache()
+
+    @property
+    def abbrev_match_len(self):
+        if not self._abbrev_checked:
+            self._abbrev_checked = True
+            for abbrev in Token.abbreviations:
+                match_obj = abbrev.regex.match(self._str)
+                if match_obj:
+                    self._abbrev_match = abbrev
+                    self._abbrev_match_len = len(match_obj.group())
+                    break
+        return self._abbrev_match_len, self._abbrev_match
+
+    @property
+    def has_digits(self):
+        """Return True if `str` has digits in it."""
+        return Token.has_digits_re.search(self._str) != None
+
+    @property
+    def is_abbrev(self):
+        """Return True if token matches (not just starts with) an
+        abbreviation."""
+        match_len, abbrev = self.abbrev_match_len
+        return abbrev and match_len == len(self._str)
+
+    @property
+    def is_alpha(self):
+        """Return True if token contains only letters."""
+        return Token.is_alpha_re.match(self._str) != None
+
+    @property
+    def is_alphanumeric_ordinal(self):
+        """Return True if token is of the form 1st, 2nd, 3rd, 4th, etc."""
+        for regex in Token.ordinal_res:
+            if regex.match(self._str):
+                return True
+        return False
+
+    @property
+    def is_close(self):
+        """Return True if this token is any type of closing paren.
+        """
+        return len(self._str) == 1 and self._str in u')]}'
+
+    @property
+    def is_currency_symbol(self):
+        return len(self._str) == 1 and self._str == u'$'
+
+    @property
+    def is_currency_term(self):
+        if self._str.lower() in Token._currency_terms:
+            return True
+        return self.is_ISO_currency
+
+    @property
+    def is_eof(self):
+        return self.eof == True
+
+    @property
+    def is_delimited_decimal(self):
+        return Token.delimited_decimal_re.match(self._str) != None
+
+    @property
+    def is_delimited_integer(self):
+        return Token.delimited_integer_re.match(self._str) != None
+
+    @property
+    def is_ISO_currency(self):
+        try:
+            pycountry.currencies.get(letter=self._str.upper())
+            result = True
+        except:
+            result = False
+        return result
+
+    @property
+    def is_nonspacing_punc(self):
+        """Return True if this token is a punctuation character.
+        """
+        return len(self._str) == 1 and self._str in u',.!?;%:'
+
+    @property
+    def is_open(self):
+        """Return True if this token is any type of opening paren.
+        """
+        return len(self._str) == 1 and self._str in u'([{'
+
+    @property
+    def is_para(self):
+        return self._str == '\n'
+
+    @property
+    def is_punc(self):
+        """Return True if this token is a punctuation character.
+        """
+        return len(self._str) == 1 and unicodedata.category(
+            self._str).startswith(u'P')
+
+    @property
+    def is_quote(self):
+        """Return true if this token is any type of single or double quote.
+        """
+        return len(self._str) == 1 and self._str in u'\'`"'
+
+    @property
+    def is_URL(self):
+        """Check if token contains a URL, marking it if necessary.
+
+        Only a subset of possible URL forms likely to appear in a Kiva
+        description are recognized, since it is more likely that a token
+        that happens to conform to an exotic URL form is, in fact, a typo.
+        """
+        if not self._URL_checked:
+            self._URL_checked = True
+
+            # look for a scheme identifier; Kiva loans will only have an
+            # http or maybe https prefix, but won't use any of the others.
+            if self._str.lower().startswith('http'):
+                self._is_URL = True
+            elif Token.url_re.match(self._str):
+                self._is_URL = True
+        return self._is_URL
+
+    @property
+    def non_printing(self):
+        """Return True if any of the attributes are set which indicate a
+        non-printing token.
+        """
+        return self.sentence_delim or self.eof
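+
+# A quick illustration of the derived properties (hypothetical values;
+# this only runs inside the project, since Token pulls in the tagger
+# module):
+#     t = Token(u'1,500.00', 0, 8)
+#     t.is_delimited_decimal   # True
+#     t = Token(u'etc.')
+#     t.is_abbrev              # True
+#     t.str = u'Mrs.'          # the setter resets all cached checks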

myparser.py

-#!/usr/bin/env python
-"""
-Interface to external parser.
-"""
-
-import subprocess
-import xmlwitch
-import xmlrpclib
-import time
-import os.path
-import logging
-from collections import namedtuple
-
-from mytoken import Token
-Leaf = namedtuple('Leaf', 'text token')
-
-
-class Parser(object):
-    CHEAP_BIN = '/usr/local/bin/cheap'
-    CHEAP_ARGS = ('-v=9 -nsolutions=1 -tok=pic_counts -default-les -packing '
-                  '-results=1 -server /home/david/delphin/erg/english.grm')
-    SERVER_URL = u'http://localhost:4711/cheap-rpc2'
-    PIC_FILE = 'pic.xml'
-    MAX_ALIVE_CHECKS = 10
-
-    def __init__(self):
-        self._server = xmlrpclib.ServerProxy(Parser.SERVER_URL)
-
-    def _build_tree(self, root, it, tokens):
-        # TODO: handle embedded parens
-        s = ''
-        for c in it:
-            if c == '(':
-                if s:
-                    root += self._make_list(s, tokens)
-                    s = ''
-                # create a new child of current parent
-                root.append(self._build_tree([], it, tokens))
-            elif c != ')':
-                s += c
-            else:  # c == ')'
-                if s:
-                    root += self._make_list(s, tokens)
-                break
-        return root
-
-    def _create_pet_input_chart(self, tokens):
-        """Encode the tokens as an XML document that the 'cheap' parser
-        can understand.
-        """
-        xml = xmlwitch.Builder(version='1.0', encoding='utf-8')
-        with xml.pet_input_chart:
-            i = 1
-            cpos = 1
-            for token in tokens:
-                if token.non_printing or token.is_para:
-                    continue
-                with xml.w(id='W' + str(i), cstart=str(cpos),
-                           cend=str(cpos + len(token.str))):
-                    xml.surface(token.str)
-                    if not getattr(token, 'pos', None):
-                        continue
-                    with xml.pos(tag=token.pos, prio='1.0'):
-                        pass
-                cpos += len(token.str) + 1
-                i += 1
-        return unicode(xml).replace('pet_input_chart', 'pet-input-chart')
-
-    def _check_server(self):
-        """Attempt an XML-RPC call to check on the status of the cheap
-        parser server; if it does not respond, try to start it."""
-        try:
-            if self._server.cheap.alive():
-                pass
-        except:
-            self._start_server()
-
-    def _make_list(self, s, tokens):
-        parse_data = s.split()
-        if len(parse_data) == 5:
-            # parse_data is the result of splitting a string of this form:
-            # '4406 subjh 5.1677 0 8'
-            # extract the second element, which is a lexical or syntactic
-            # rule name.
-            return [parse_data[1]]
-        if len(parse_data) == 3:
-            # parse_data was created from a string of the form
-            # '"is" 2 "\"is\""'
-            # extract the second element, which is a 1-based index of the
-            # token. This is a leaf node of the parse tree.
-            token_index = int(parse_data[1]) - 1
-            return [Leaf(parse_data[0][1:-1], tokens[token_index])]
-        return parse_data
-
-    def _start_server(self):
-        """Start the PET cheap parser in XML-RPC server mode and wait
-        for it to acknowledge the cheap.alive() call."""
-        # this starts cheap as a child process whose stdout and stderr
-        # go to new pipes rather than to this process' stdout and
-        # stderr. It will continue to run even after this process exits.
-        logging.debug('starting cheap server')
-        subprocess.Popen([Parser.CHEAP_BIN] + Parser.CHEAP_ARGS.split(),
-                         stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-        server_alive = False
-        attempts = 0
-        while not server_alive and attempts < Parser.MAX_ALIVE_CHECKS:
-            logging.debug('sleeping before checking cheap server')
-            time.sleep(5)
-            try:
-                logging.debug('checking cheap server')
-                if self._server.cheap.alive():
-                    server_alive = True
-                    logging.debug('cheap server is alive')
-            except:
-                attempts += 1
-                logging.debug('cheap server not alive')
-        if not server_alive:
-            logging.debug(
-                'failed to start server at {} after {} attempts'.format(
-                    Parser.CHEAP_BIN, attempts))
-
-    def parse(self, tokens):
-        logging.debug(u"parsing %s", tokens)
-        self._check_server()
-        # create an XML DOM object that represents the tagged tokens to parse
-        pic = self._create_pet_input_chart(tokens)
-        if not pic:
-            return None
-        # write it to a file to serve as input to the 'cheap' PET parser
-        pic_filename = os.path.realpath(Parser.PIC_FILE)
-        with open(pic_filename, 'w') as outfile:
-            outfile.write(str(pic))
-            # cheap requires two blank lines at end or it faults
-            outfile.write('\n\n')
-        start_time = time.time()
-        analysis = self._server.cheap.analyze(pic_filename)
-        logging.debug('analyzed {} tokens in {:.2f}s'.format(
-                len(tokens), time.time() - start_time))
-        root = []
-        try:
-            self._build_tree(root, iter(analysis['readings'][0]['derivation']),
-                             tokens)
-            pn = ParseNode(None, root[0])
-            pn.pprint()
-        except:
-            pn = None
-            logging.error('parsing failed')
-        return pn
-
-
-class ParseNode(object):
-    def __init__(self, parent, parse_list):
-        # create a tree of ParseNodes from the parse_list
-        self.parent = parent
-        self.name = parse_list[0]
-        if isinstance(parse_list[1][0], Leaf):
-            self.children = [parse_list[1][0].token]
-        else:
-            self.children = []
-            for p in parse_list[1:]:
-                self.children.append(ParseNode(self, p))
-
-    def _pprint(self, indent):
-        logging.debug((u' ' * indent) + self.name)
-        for child in self.children:
-            if isinstance(child, Token):
-                logging.debug((u' ' * (indent + 2)) + unicode(child))
-            else:
-                child._pprint(indent + 2)
-
-    def node_from_token(self, token):
-        """Return the ParseNode whose child is `token`."""
-        # if children list is a leaf node, i.e. a Token, then either
-        # this is the parent being sought or `token` doesn't lie in this
-        # branch of the parse tree.
-        if isinstance(self.children[0], Token):
-            if self.children[0] == token:
-                return self
-            return None
-
-        # the child list is non-leaf ParseNodes, so recurse into each of
-        # them to find `token`. It is convenient to write this as a
-        # depth-first search, however since every token of the input
-        # should appear exactly once in the set of parse trees, the
-        # order of search is unimportant.
-        for child in self.children:
-            node = child.node_from_token(token)
-            if node:
-                return node
-        return None
-
-    def pprint(self):
-        self._pprint(0)

mytoken.py

-#!/usr/bin/env python
-"""
-The Token class, which is used to contain input tokens (that are later
-edited by transforms).
-
-A Token object represents some some bit of text and has additional
-properties describing that text.
-
-Initially there is only one token object which contains the entire blob
-of text that appeared in the original input.
-
-After some Transforms have been applied, that original token will have
-been replaced by a number of Tokens; eventually after processing is
-complete each Token will represent a small element like an individual
-word or punctuation mark.
-
-This module is named "mytoken" rather than "token" to avoid overriding
-the Python standard library module named "token".
-"""
-
-import re
-import pycountry
-import unicodedata
-from abc import ABCMeta, abstractmethod
-
-import logging
-
-
-class AbbrevInfo(object):
-    """Hold information about an abbreviation."""
-    def __init__(self, regex_str, normal_form=None):
-        """Compile regex and store normal_form.
-
-        `regex` - a regular expression string.
-        `normal_form` - normal way the abbreviation appears.
-
-        Usually the regex exists just to have a case-insensitive way to
-        match, but some are more complicated than that.
-
-        The normal form has the standard capitalization of the
-        abbreviation and is substituted for the input text if it is
-        specified.
-        """
-        self.regex = re.compile(regex_str, re.I | re.U)
-        self.normal_form = normal_form
-
-
-class Token(object):
-    """Contains a portion of text, either part of the original input
-    or generated later, as well as properties describing it.
-
-    Token objects should only be modified by Transform objects. They
-    should not modify themselves.
-
-    This keeps all the token-modifying code in one place, as well as
-    providing a mechanism to resolve conflicts between multiple bits of
-    code that might both want to touch the same Token object.
-    """
-    abbreviations = [
-        AbbrevInfo(ur'e\.g\.'),
-        AbbrevInfo(ur'i\.e\.'),
-        AbbrevInfo(ur'etc\.'),
-        AbbrevInfo(ur'mr\.', u'Mr.'),
-        AbbrevInfo(ur'mrs\.', u'Mrs.'),
-        AbbrevInfo(ur'ksh\.', u'KES'),
-        AbbrevInfo(ur'kes\.', u'KES'),
-        AbbrevInfo(ur'ltd\.', u'Ltd.'),
-        AbbrevInfo(ur's\.a\.l(\.)?', u's.a.l.'),
-        AbbrevInfo(ur'u\.s\.s\.r\.', u'U.S.S.R.')]
-
-    _currency_terms = [
-        u'$',
-        u'dollar',
-        u'dollars',
-        u'/=',
-        u'peso',
-        u'pesos',
-        u'shilling',
-        u'shillings']
-
-    ordinal_res = [
-        re.compile(ur'^([0-9,]*[02-9]){0,1}1st$', re.I | re.U),
-        re.compile(ur'^([0-9,]*[02-9]){0,1}2nd$', re.I | re.U),
-        re.compile(ur'^([0-9,]*[02-9]){0,1}3rd$', re.I | re.U),
-        re.compile(ur'^[04-9]th$', re.I | re.U),
-        re.compile(ur'^[0-9,]*1[0-9]th$',  re.I | re.U),
-    ]
-
-    has_digits_re = re.compile(ur'.*\d+.*', re.U)
-
-    is_alpha_re = re.compile(ur'^\w+$', re.I | re.U)
-
-    # recognizes a decimal number with comma-delimited thousands groups
-    delimited_decimal_re = re.compile(
-        ur"""^            # start of string
-             [1-9]        # nonzero leading digit
-             [0-9]{,2}    # up to two more leading digits
-             (,[0-9]{3})* # any number of comma-delimited groups of 3 digits
-             (\.[0-9]+)?  # optional decimal followed by one or more digits
-             $            # end of string
-          """,
-        re.U | re.X)
-
-    # recognizes an integer with comma-delimited thousands groups
-    delimited_integer_re = re.compile(
-        ur"""^            # start of string
-             [0-9]{1,3}   # one to three leading digits
-             (,[0-9]{3})* # any number of comma-delimited groups of 3 digits
-             $            # end of string
-          """,
-        re.U | re.X)
-
-    url_re = re.compile(
-        """(\w+\.)+     # one or more dot-delimited words
-           # one of the TLDs that appear in Kiva loans
-           (com|edu|gov|info|mil|net|org|tj)
-           (\S*)        # any amount of non-space chars """,
-        re.I | re.U | re.VERBOSE)
-
-    def __init__(self, s, cbegin=None, cend=None):
-        """Initialize from text.
-
-        Arguments:
-        s -- unicode string
-
-        cbegin -- index into original text for the start of the initial
-        value of this token
-
-        cend -- index into original text to character just past the end
-        of the initial value of this token.
-
-        It is frequently true that at initialization cend = cbegin +
-        len(s), but since cbegin and cend are offsets into the original
-        input text, they are held invariant as s changes.
-        """
-        # Note we use the setter here which initializes the cache.
-        self.str = s
-        self.cbegin = cbegin
-        self.cend = cend
-        self.pos = tagger.PosContainer()
-        assert(cbegin == None and cend == None or self.cend >= self.cbegin)
-
-    def __repr__(self):
-        """Return a string representation of this object suitable for
-        debugging output.
-        """
-        escaped_str = self._str.replace(u'\n', u'\\n')
-        if self.cbegin == None and self.cend == None:
-            r = u'<' + escaped_str + u'>'
-        else:
-            r = u'<{} {}:{}>'.format(escaped_str, self.cbegin, self.cend)
-        # Python 2.x requires that __repr__ return an ascii string.
-        # Python 3.x requires that it return a unicode string.
-        return r.encode(encoding='iso-8859-15', errors='replace')
-
-    def __str__(self):
-        return self.__repr__()
-
-    def __eq__(self, other):
-        if isinstance(other, basestring):
-            return self._str == other
-        assert(isinstance(other, Token))
-        for key, val in self.__dict__.items():
-            if getattr(other, key, None) != val:
-                return False
-        return True
-
-    def __ne__(self, other):
-        return not self.__eq__(other)
-
-    def _reset_cache(self):
-        self._abbrev_checked = False
-        self._abbrev_match = None
-        self._abbrev_match_len = 0
-        self.sentence_delim = None
-        self.eof = None
-        self._URL_checked = False
-        self._is_URL = None
-
-    @property
-    def str(self):
-        return self._str
-
-    @str.setter
-    def str(self, new_value):
-        self._str = unicode(new_value)
-        self._reset_cache()
-
-    @property
-    def abbrev_match_len(self):
-        if not self._abbrev_checked:
-            self._abbrev_checked = True
-            for abbrev in Token.abbreviations:
-                match_obj = abbrev.regex.match(self._str)
-                if match_obj:
-                    self._abbrev_match = abbrev
-                    self._abbrev_match_len = len(match_obj.group())
-                    break
-        return self._abbrev_match_len, self._abbrev_match
-
-    @property
-    def has_digits(self):
-        """Return True if `str` has digits in it."""
-        return Token.has_digits_re.search(self._str) != None
-
-    @property
-    def is_abbrev(self):
-        """Return True if token matches (not just starts with) an
-        abbreviation."""
-        match_len, abbrev = self.abbrev_match_len
-        return abbrev and match_len == len(self._str)
-
-    @property
-    def is_alpha(self):
-        """Return True if token contains only letters."""
-        return LexToken.is_alpha_re.match(self._str)
-
-    @property
-    def is_alphanumeric_ordinal(self):
-        """Return True if token is of the form 1st, 2nd, 3rd, 4th, etc."""
-        for regex in Token.ordinal_res:
-            if regex.match(self._str):
-                return True
-        return False
-
-    @property
-    def is_close(self):
-        """Return True if this token is any type of closing paren.
-        """
-        return len(self._str) == 1 and self._str in u')]}'
-
-    @property
-    def is_currency_symbol(self):
-        return len(self._str) == 1 and self._str == u'$'
-
-    @property
-    def is_currency_term(self):
-        if self._str.lower() in Token._currency_terms:
-            return True
-        return self.is_ISO_currency
-
-    @property
-    def is_eof(self):
-        return self.eof == True
-
-    @property
-    def is_delimited_decimal(self):
-        return Token.delimited_decimal_re.match(self._str) != None
-
-    @property
-    def is_delimited_integer(self):
-        return Token.delimited_integer_re.match(self._str) != None
-
-    @property
-    def is_ISO_currency(self):
-        try:
-            pycountry.currencies.get(letter=self._str.upper())
-            result = True
-        except:
-            result = False
-        return result
-
-    @property
-    def is_nonspacing_punc(self):
-        """Return True if this token is a punctuation character.
-        """
-        return len(self._str) == 1 and self._str in u',.!?;%:'
-
-    @property
-    def is_open(self):
-        """Return True if this token is any type of opening paren.
-        """
-        return len(self._str) == 1 and self._str in u'([{'
-
-    @property
-    def is_para(self):
-        return self._str == '\n'
-
-    @property
-    def is_punc(self):
-        """Return True if this token is a punctuation character.
-        """
-        return len(self._str) == 1 and unicodedata.category(
-            self._str).startswith(u'P')
-
-    @property
-    def is_quote(self):
-        """Return true if this token is any type of single or double quote.
-        """
-        return len(self._str) == 1 and self._str in u'\'`"'
-
-    @property
-    def is_URL(self):
-        """Check if token contains a URL, marking it if necessary.
-
-        Only a subset of possible URL forms likely to appear in a Kiva
-        description are recognized, since it is more likely that a token
-        that happens to conform to an exotic URL form is, in fact, a typo.
-        """
-        if not self._URL_checked:
-            self._URL_checked = True
-
-            # look for a scheme identifier; Kiva loans only will have an
-            # http or maybe https prefix, but won't use any of the others.
-            if self._str.lower().startswith('http'):
-                self._is_URL = True
-            elif Token.url_re.match(self._str):
-                self._is_URL = True
-        return self._is_URL
-
-    @property
-    def non_printing(self):
-        """Return True if any of the attributes are set which indicate a
-        non-printing token.
-        """
-        return self.sentence_delim or self.eof
 import unicodedata
 from abc import ABCMeta, abstractmethod
 
-from mytoken import Token
+from keatoken import Token
 from tokensearch import TokenSearchByRegexp
 
 
         TokenSearchByRegexp(u'KADET LTD', u'KADET Ltd.'),
 
         # awkward or verbose constructions
+        TokenSearchByRegexp(u'in the year ([0-9]+)', ur'in \1'),
+        TokenSearchByRegexp(u'three/four acre(s?)', ur'three quarters of an acre'),
         TokenSearchByRegexp(u'requested for', u'requested'),
         TokenSearchByRegexp(u'has given birth to', 'has'),
         TokenSearchByRegexp(u'requesting to borrow', u'asking to borrow'),
         # non-ISO currency abbreviations
         TokenSearchByRegexp(ur'(.+)/=', ur'\1 UGX'),
         TokenSearchByRegexp(ur'(?i)ksh(?:s|)(?:\.|)([0-9,.]+|)', ur'KES \1'),
-        TokenSearchByRegexp(ur'[Pp]hp', 'PHP'),
+        TokenSearchByRegexp(ur'[Pp]hp', ur'PHP'),
         TokenSearchByRegexp(ur'P([0-9,.]+)', ur'\1 PHP'),
         TokenSearchByRegexp(ur'(?i)LE ([0-9]*)', ur'SLL \1'),
-        TokenSearchByRegexp(ur'Rp\.', 'IDR'),
+        TokenSearchByRegexp(ur'Rp\.', ur'IDR'),
 
         # incorrect punctuation
-        TokenSearchByRegexp(ur'e\.t\.c\.?', u'etc.'),
+        TokenSearchByRegexp(ur'e\.t\.c\.?', ur'etc.'),
         TokenSearchByRegexp(ur'\betc([^.])', ur'etc.\1'),
-        TokenSearchByRegexp(ur'(?<!\.)\.\.(?!\.)', u'.'),  # blah.. -> blah.
+        TokenSearchByRegexp(ur'(?<!\.)\.\.(?!\.)', ur'.'),  # blah.. -> blah.
 
         # grammatical errors
-        TokenSearchByRegexp(ur'1 infant-aged children', u'one infant child'),
-        TokenSearchByRegexp(ur'1 years', u'one year'),
+        TokenSearchByRegexp(ur'1 infant-aged children', ur'one infant child'),
+        TokenSearchByRegexp(ur'1 years', ur'one year'),
         TokenSearchByRegexp(ur'never missed any meeting\.',
-                            u'never missed any meetings.'),
+                            ur'never missed any meetings.'),
 
         # Field partner template cleanup
         # TokenSearchByRegexp(ur'To make a living,'
         # |   11 | 43rd      | 43rd       |
         # |   12 | 1,200.    | 1,200 .    |
         # |   13 | 1,500.00  | 1,500.00   |
+        # |   14 | 47-year   | 47-year    |
         changed = False
         for token in ea.tokens:
             # skip non-printing, URL, and short tokens
                 mo = re.search(ur'\d', token.str)
             elif token.str[0].isdigit():
                 # if it starts with a digit, split at first alpha, except
-                # for cases 8 through 11 in the table above.
+                # for cases 8 through 11 and 14 in the table above.
                 if re.match(ur'[1-9][0-9]*0s', token.str):
                     # case 8
                     continue
                 if token.is_alphanumeric_ordinal:
                     # cases 9-11
                     continue
+                if '-' in token.str[1:]:
+                    # case 14
+                    continue
                 # case 12, note $ is for case 13
                 mo = re.match(ur'[1-9][0-9]{,2}(?:,[0-9]{3})*\.(?:[^0-9]|$)',
                               token.str)
 
     phase = PARSED_PHASE
 
+    regex_pairs = [
+        TokenSearchByRegexp(ur'([0-9]+)-year old', ur'\1-year-old'),
+        TokenSearchByRegexp(ur'([0-9]+) year-old', ur'\1-year-old'),
+        ]
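+    # These two rewrites cover the hyphen-on-one-side forms (cases 5 and
+    # 6 in apply() below): both u'47-year old' and u'47 year-old' become
+    # u'47-year-old'.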
+
     def __init__(self):
         Rule.__init__(
             self, 300,
             "Fix incorrect plural and lack of hyphens in phrases "
             "like '20 years old man'.")
 
+    def _change_nn_dash_years(self, ea, sent, years_idx):
+        changed = False
+        try:
+            logging.debug('>_change_nn_dash_years')
+            years_node = sent.parse.node_from_token(ea.tokens[years_idx])
+            if years_node.parent(2).children[1].name == 'nadj_rr':
+                if (ea.tokens[years_idx + 1].str == u'old' and
+                        ea.tokens[years_idx].str.endswith(u'-years')):
+                    # only the plural form reaches here; cases 5 and 6
+                    # were already rewritten by regex_pairs
+                    logging.debug('changing "nn-years old" to "nn-year-old"')
+                    ea.tokens[years_idx].str = ea.tokens[years_idx].str[:-1] + u'-old'
+                    del ea.tokens[years_idx + 1]
+                    changed = True
+                elif ea.tokens[years_idx].str.endswith(u'-years-old'):
+                    # case 10
+                    logging.debug('changing "nn-years-old" to "nn-year-old"')
+                    ea.tokens[years_idx].str = ea.tokens[years_idx].str.replace(u'years', u'year')
+                    changed = True
+        except Exception as e:
+            logging.debug('caught ' + str(e))
+        logging.debug('<_change_nn_dash_years')
+        return changed
+
+    def _change_years_dash_old(self, ea, sent, years_idx):
+        changed = False
+        try:
+            logging.debug('>_change_years_dash_old')
+            years_node = sent.parse.node_from_token(ea.tokens[years_idx])
+            if years_node.parent().name == 'noun_n_cmpnd':
+                logging.debug('changing "nn years-old" to "nn-year-old"')
+                ea.tokens[years_idx - 1].str += u'-year-old'
+                del ea.tokens[years_idx]
+                changed = True
+        except Exception as e:
+            logging.debug('caught ' + str(e))
+        logging.debug('<_change_years_dash_old')
+        return changed
+
     def apply(self, ea):
-        # search tokens for "years old"
+        # These cases are correct and should remain unchanged.
+        # 1. 'She is 47 years old.'         PASSED
+        # 2. 'She is a 47-year-old farmer.' PASSED
+        # 3. 'She is 1 year old.'           PASSED
+
+        # These cases should be changed to have a single token of
+        # '47-year-old':
+        # 4. 'She is a 47 year old farmer.'
+        # 5. 'She is a 47-year old farmer.'    PASSED
+        # 6. 'She is a 47 year-old farmer.'    PASSED
+
+        # These should have 'years' changed to 'year' and hyphens inserted.
+        # 7. 'She is a 47 years old farmer.'   PASSED
+        # 8. 'She is a 47-years old farmer.'   PASSED
+        # 9. 'She is a 47 years-old farmer.'   PASSED
+        # 10. 'She is a 47-years-old farmer.'  PASSED
+
+        changed = False
+        # cases 5 and 6 are handled by regular expression search and replace
+        for ts in YearOldRule.regex_pairs:
+            if ts.apply(self, ea.tokens):
+                changed = True
+
         for sent in ea.sentences:
+            # Cases 8 and 10
+            years_idx = sent.find_sequence('[0-9]+-year(s?)')
+            if (years_idx != None and
+                self._change_nn_dash_years(ea, sent, years_idx)):
+                changed = True
+                continue
+
+            # Case 9
+            years_idx = sent.find_sequence(u'years-old')
+            if (years_idx != None and
+                self._change_years_dash_old(ea, sent, years_idx)):
+                changed = True
+                continue
+
+            make_singular = False
+
             years_idx = sent.find_sequence(u'years old')
-            if years_idx == None:
-                years_idx = sent.find_sequence(u'years - old')
+
             if years_idx == None:
                 continue
-            # get the node whose only child is the token 'years'
+            # get the parse node whose child is the 'years' token
             years_node = sent.parse.node_from_token(ea.tokens[years_idx])
-            if years_node.parent.name != 'plur_noun_orule':
+
+            if years_node.parent().name != 'plur_noun_orule':
+                logging.debug('*****skipping %s != plur_noun_orule' % years_node.parent().name)
                 continue
-            make_singular = False
-        # She is 42 years old.
-        # root_informal
-        #   subjh
-        #     bare_npq
-        #       she
-        #         <She 0:3>
-        #     hcomp
-        #       be_c_is
-        #         <is 4:6>
-        #       npadv
-        #         appos
-        #           bare_np
-        #             adjn
-        #               attr_adj_verb_psv_part
-        #                 generic_trans_verb_pas
-        #                   <42 7:9>
-        #               plur_noun_orule
-        #                 year_n1
-        #                   <years 10:15>
-        #           proper_np
-        #             adjn
-        #               old_a1
-        #                 <old 16:19>
-        #               noptcomp
-        #                 generic_year_ne
-        #                   <. 19:20>
 
-
-            # nadj_rr
-            #   measure_np
-            #     generic_number
-            #       <51 2:4>
-            #     plur_noun_orule
-            #       year_n1
-            #         <years 5:10>
-            #   npadv
+            # Case 7
             try:
-                make_singular = years_node.parent.parent.parent.children[1].name == 'npadv'
+                make_singular = years_node.parent(3).children[1].name == 'npadv'
                 if make_singular:
-                    logging.debug('make_singular case 1')
+                    logging.debug('********NPADV')
             except:
                 pass
-            # npadv_mnp
-            #   adjh_s_xp
-            #     a_one_adj
-            #       <A 0:1>
-            #     measure_np
-            #       generic_card_ne
-            #         <51 2:4>
-            #       plur_noun_orule
-            #         year_n1
-            #           <years 5:10>
+
             if not make_singular:
                 try:
-                    make_singular = years_node.parent.parent.parent.parent.name == 'npadv_mnp'
+                    make_singular = years_node.parent(4).name == 'npadv_mnp'
                     if make_singular:
-                        logging.debug('make_singular case 2')
+                        logging.debug('********NPADV_MNP')
                 except:
                     pass
             # appos
             #         <years 5:10>
             if not make_singular:
                 try:
-                    make_singular = years_node.parent.parent.parent.name == 'appos'
+                    make_singular = years_node.parent(3).name == 'appos'
                     if make_singular:
-                        logging.debug('make_singular case 3')
+                        logging.debug('********APPOS')
                 except:
                     pass
-            # She is a 42-year old farmer.
-            # frag_np
-            #   bare_np
-            #     punct_hinit
-            #       plur_noun_orule
-            #         year_n1
-            #           <years 11:16>
-            #       s_dash_pct
-            #         <- 16:17>
+
             if not make_singular:
                 try:
                     make_singular = (
-                        years_node.parent.parent.parent.name == 'bare_np' and
-                        years_node.parent.parent.children[1].name == 's_dash_pct')
+                        years_node.parent(3).name == 'bare_np' and
+                        years_node.parent(2).children[1].name == 's_dash_pct')
                     if make_singular:
-                        logging.debug('make_singular case 4')
+                        logging.debug('********s_dash_pct')
                 except:
                     pass
 
             if make_singular:
-                years_token = years_node.children[0]
-                original = unicode(years_token)
-                years_token.str = u'year'
-                logging.debug(u'YearOldRule changed {} to {}'.format(
-                        original, unicode(years_token)))
-        return False
+                ea.tokens[years_idx - 1].str += u'-year-old'
+                del ea.tokens[years_idx:years_idx + 2]
+                # years_token = years_node.children[0]
+                # original = unicode(years_token)
+                # years_token.str = u'year'
+                # logging.debug(u'YearOldRule make_singular changed {} to {}'.format(
+                #         original, unicode(years_token)))
+                changed = True
+        return changed
                 return True
         return False
 
+    def __getitem__(self, i):
+        return self.tags_list[i]
+
+    def __iter__(self):
+        return iter(self.tags_list)
+
+    def __len__(self):
+        return len(self.tags_list)
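+
+    # Each element is a tag record exposing .pos and .prob, as consumed
+    # by dump_pos_tags and keaparser._create_pet_input_chart.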
+
 
 def tag_tokens(tokens):
     """Pass all tokens to an external POS tagger, then add its tags as
 
 def test_year_old():
     expect(u'Mahmoud is a 47-year-old married man from Lebanon.')
-    expect(u'This is 40 year-old Kadiatu.')
+    expect(u'This is 40 year-old Kadiatu.', u'This is 40-year-old Kadiatu.')
+    expect(u'Eman is a 32 years old young woman.', u'Eman is a 32-year-old young woman.')
+
+    # These cases are correct and should remain unchanged.
+    expect(u'She is 47 years old.')
+    expect(u'She is a 47-year-old farmer.')
+
+    # These cases should be changed to have a single token of
+    # '47-year-old'.
+    expect(u'She is a 47 year old farmer.', u'She is a 47-year-old farmer.')
+    expect(u'She is a 47-year old farmer.', u'She is a 47-year-old farmer.')
+    expect(u'She is a 47 year-old farmer.', u'She is a 47-year-old farmer.')
+
+    # These should have 'years' changed to 'year' and hyphens inserted.
+    expect(ur'She is a 47 years old farmer.', ur'She is a 47-year-old farmer.')  # ok
+    expect(ur'She is a 47-years old farmer.', ur'She is a 47-year-old farmer.')  # ok
+    expect(ur'She is a 47 years-old farmer.', ur'She is a 47-year-old farmer.')
+    expect(ur'She is a 47-years-old farmer.', ur'She is a 47-year-old farmer.')
+
+    #
+    # Here are the parse trees for all 9 cases:
+    #
+    # 1. She is 47 years old.
+    #
+    # root_informal
+    #   subjh
+    #     bare_npq
+    #       she
+    #         <She 0:3>
+    #     hcomp
+    #       be_c_is
+    #         <is 4:6>
+    #       npadv
+    #         appos
+    #           bare_np
+    #             adjn
+    #               attr_adj_verb_psv_part
+    #                 generic_trans_verb_pas
+    #                   <47 7:9>
+    #               plur_noun_orule
+    #                 year_n1
+    #                   <years 10:15>
+    #           proper_np
+    #             adjn
+    #               old_a1
+    #                 <old 16:19>
+    #               noptcomp
+    #                 generic_year_ne
+    #                   <. 19:20>
+    #
 from collections import namedtuple
 import logging
 
-from mytoken import Token
+from keatoken import Token
 
 OpCode = namedtuple('OpCode', ['opcode', 'str', 'row', 'col'])
 
 
 
 if __name__ == '__main__':
-    from mytoken import Token
+    from keatoken import Token
     ts = TokenSearchByRegexp(u'with (.+) children and (.+)',
                              ur'and has \1 children, \2 of whom go to school')
     tokens = map(Token, 'with 4 children and 2'.split())