Commits

david_walker  committed e6bb679

changes needed to support YearOldRule, which depends on parse trees
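
The POS phase now ends with DelimitSentencesRule inserting non-printing sentence-boundary tokens; EditAssistant._parse_sentences then wraps each delimited token range in a Sentence and attaches a parse tree obtained from the cheap parser, and rules in the new PARSED_PHASE (such as YearOldRule) receive the EditAssistant itself so they can walk those parses.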

  • Participants
  • Parent commits 173e692
  • Branches parse

Comments (0)

Files changed (5)

 import tagger
 import parser
 
+
+class Sentence(object):
+    """Acts as a container of Tokens which has a parse attribute."""
+    def __init__(self, ea, start_idx, end_idx, parse=None):
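+        # `ea` is the owning EditAssistant; start_idx and end_idx bound
+        # this sentence's tokens in ea.tokens as a half-open range
+        # [start_idx, end_idx).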
+        self.ea = ea
+        self.start = start_idx
+        self.end = end_idx
+        self.parse = parse
+
+    def find_sequence(self, item):
+        """Support set membership operator "in" for strings, Tokens, and arrays
+        of strings or Tokens."""
+        if isinstance(item, basestring):
+            match_list = item.split()
+        else:
+            match_list = item
+        sentence_idx = self.start
+        match_index = 0
+        while sentence_idx < self.end:
+            if self.ea.tokens[sentence_idx].str != match_list[match_index]:
+                if match_index:
+                    # a partial match just failed; re-examine this
+                    # token as a possible start of a new match
+                    match_index = 0
+                    continue
+            else:
+                match_index += 1
+                if match_index == len(match_list):
+                    # Found all the tokens in match_list consecutively
+                    # appearing in self.ea.tokens. Return the index into
+                    # self.ea.tokens of the first token of that
+                    # sequence. Since sentence_idx has not yet been
+                    # incremented for the last item matched, it hasn't
+                    # advanced the entire length of match_list, hence
+                    # the + 1 here.
+                    return sentence_idx - len(match_list) + 1
+            sentence_idx += 1
+        return None
+
+
 class EditAssistant(object):
     def __init__(self, infile):
         """Process the input file and generate an output string."""
         # start the tokens array with one Token object that contains all
         # the text, followed by the sentinel.
         self._original_text = infile.read()
-        self._tokens = [Token(self._original_text, 0,
+        self.tokens = [Token(self._original_text, 0,
                               len(self._original_text)), eof_token]
+        self.sentences = []
+        self._parser = parser.Parser()
         # apply first phase rules to replace the original Token object
         # with multiple Token objects, one for each bit of the input
         # text that qualifies as a single input token.
         self._process_tokens(rules.INITIAL_PHASE)
         # Add a part-of-speech property to all the tokens
-        tagger.tag_tokens(self._tokens)
-        # now apply rules/transforms that make use of the POS properties
+        tagger.tag_tokens(self.tokens)
+        # now apply rules/transforms that make use of the POS
+        # properties. This includes the DelimitSentencesRule which
+        # inserts non-printing tokens that mark sentence boundaries.
         self._process_tokens(rules.POS_PHASE)
-        # insert non-printing sentence-delimiter tokens
-        self._delimit_sentences()
         # for each sentence, generate a parse tree
-        self._parser = parser.Parser()
         self._parse_sentences()
         # now apply rules that require sentence parses
-        self._process_tokens(rules.PARSE_PHASE)
+        self._process_tokens(rules.PARSED_PHASE)
+        self._generate_output()
         self._report_changes()
-        self._generate_output()
 
     def _asterisk_at_bol(self, token):
         return (token.str == u'*' and
                 len(self.edited_text) >= 2 and
                 self.edited_text[-2:] == u'\n*')
 
-    def _delimit_sentences(self):
-        pass
+    def _dump_sentences(self):
+        for t in self.tokens:
+            print t.str,
+            if t.sentence_delim:
+                print
 
     def _generate_output(self):
         quote_stack = []
         self.edited_text = u''
-        for i, token in enumerate(self._tokens[:-1]):
+        for i, token in enumerate(self.tokens[:-1]):
             # if we have a paragraph break, insert that and go on to next token
             if token.is_para:
                 self.edited_text += u'\n\n'
 
             # now figure out if a space should follow it
             append_space = True
-            next_token = self._tokens[i + 1]
+            next_token = self.tokens[i + 1]
 
             if (token.is_open or
                 token.is_currency_symbol or
                 self.edited_text += u' '
 
     def _parse_sentences(self):
-        self._parse = self._parser.parse(self._tokens)
+        # for each range of tokens representing a sentence, generate a
+        # parse tree.
+        sentence_start_idx = 0
+        sentence_end_idx = 0
+        while sentence_end_idx < len(self.tokens):
+            cur_token = self.tokens[sentence_end_idx]
+            if cur_token.sentence_delim or cur_token.eof:
+                self.sentences.append(
+                    Sentence(self, sentence_start_idx, sentence_end_idx))
+                sentence_start_idx = sentence_end_idx + 1
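+                # advance the end scan two positions: past the
+                # delimiter and past the first token of the next
+                # sentence, so an eof sentinel immediately after a
+                # delimiter does not produce an empty trailing sentence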
+                sentence_end_idx += 2
+            else:
+                sentence_end_idx += 1
-        pp = pprint.PrettyPrinter()
-        pp.pprint(self._parse)
+        for sent in self.sentences:
+            sent.parse = self._parser.parse(self.tokens[sent.start:sent.end])
 
     def _process_tokens(self, phase):
-        all_rules = rules.get_rules(phase)
+        rules_to_run = rules.get_rules(phase)
         while True:
-            logging.debug('***calling rules')
+            logging.debug('***calling %s rules', rules.PHASE_NAME[phase])
             changed = False
-            for rule in all_rules:
-                if rule.enabled and rule.apply(self._tokens):
+            for rule in rules_to_run:
+                if rule.enabled and rule.apply(self):
                     changed = True
                     break
-
             # If no changes were made, we're done changing tokens and
             # it's time to generate the output text.
             if not changed:
     def _report_changes(self):
         """Write a description of all significant changes."""
         print "Change report:"
-        for token in self._tokens:
+        # TODO: report runs of changes instead of each individual token
+        # changed or inserted.
+        for token in self.tokens:
             # skip Token objects used internally by the program; these
             # do not represent input text.
             if token.non_printing:
             # token contains text; if its cbegin==cend then it didn't
             # appear in the original text and was inserted.
             if token.cbegin == token.cend:
-                print "Inserted", token.str
+                print "inserted", token.str
             elif self._original_text[token.cbegin:token.cend] != token.str:
                 print u'Changed "{}" to "{}"'.format(
                     self._original_text[token.cbegin:token.cend], token.str)
 
     def dump_pos_tags(self):
         """Write every token with a Part-Of-Speech tag to stdout."""
-        for token in self._tokens:
+        for token in self.tokens:
             if hasattr(token, 'pos'):
                 print u'{}/{}'.format(token.str, token.pos),
             if token.str == '\n':
     def __str__(self):
         return self.__repr__()
 
+    def __eq__(self, other):
+        if isinstance(other, basestring):
+            return self._str == other
+        assert(isinstance(other, Token))
+        for key, val in self.__dict__.items():
+            if getattr(other, key, None) != val:
+                return False
+        return True
+
+    def __ne__(self, other):
+        return not self.__eq__(other)
+
     def _reset_cache(self):
         self._abbrev_checked = False
         self._abbrev_match = None
 import xmlrpclib
 import time
 import os.path
+import logging
 from collections import namedtuple
 
+from mytoken import Token
 Leaf = namedtuple('Leaf', 'text token')
 
 
             for token in tokens:
                 if token.non_printing or token.is_para:
                     continue
-                with xml.w(id='W'+str(i), cstart=str(cpos), cend=str(cpos + len(token.str))):
+                with xml.w(id='W' + str(i), cstart=str(cpos),
+                           cend=str(cpos + len(token.str))):
                     xml.surface(token.str)
                     with xml.pos(tag=token.pos, prio='1.0'):
                         pass
         # this starts cheap as a child process whose stdout and stderr
         # go to new pipes rather than to this process' stdout and
         # stderr. It will continue to run even after this process exits.
+        logging.debug('starting cheap server')
         subprocess.Popen([Parser.CHEAP_BIN] + Parser.CHEAP_ARGS.split(),
                          stdout=subprocess.PIPE, stderr=subprocess.PIPE)
         server_alive = False
         attempts = 0
         while not server_alive and attempts < Parser.MAX_ALIVE_CHECKS:
-            time.sleep(3)
+            logging.debug('sleeping before checking cheap server')
+            time.sleep(5)
             try:
+                logging.debug('checking cheap server')
                 if self._server.cheap.alive():
                     server_alive = True
+                    logging.debug('cheap server is alive')
             except:
                 attempts += 1
+                logging.debug('cheap server not alive')
+        if not server_alive:
+            logging.error(
+                'failed to start server at {} after {} attempts'.format(
+                    Parser.CHEAP_BIN, attempts))
 
     def parse(self, tokens):
+        logging.debug(u"parsing %s", tokens)
         self._check_server()
         # create an XML DOM object that represents the tagged tokens to parse
         pic = self._create_pet_input_chart(tokens)
             outfile.write(str(pic))
             # cheap requires two blank lines at end or it faults
             outfile.write('\n\n')
+        start_time = time.time()
         analysis = self._server.cheap.analyze(pic_filename)
+        logging.debug('analyzed {} tokens in {:.2f}s'.format(
+                len(tokens), time.time() - start_time))
         root = []
-        self._build_tree(root, iter(analysis['readings'][0]['derivation']),
-                         tokens)
-        return root[0]
+        try:
+            self._build_tree(root, iter(analysis['readings'][0]['derivation']),
+                             tokens)
+            pn = ParseNode(None, root[0])
+            pn.pprint()
+        except Exception:
+            pn = None
+            logging.error('parsing failed', exc_info=True)
+        return pn
+
+
+class ParseNode(object):
+    def __init__(self, parent, parse_list):
+        # create a tree of ParseNodes from the parse_list
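+        # parse_list is expected to have the form [rule_name, child,
+        # child, ...]; a leaf child is a list whose first element is a
+        # Leaf namedtuple wrapping the matched Token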
+        self.parent = parent
+        self.name = parse_list[0]
+        if isinstance(parse_list[1][0], Leaf):
+            self.children = [parse_list[1][0].token]
+        else:
+            self.children = []
+            for p in parse_list[1:]:
+                self.children.append(ParseNode(self, p))
+
+    def _pprint(self, indent):
+        logging.debug((u' ' * indent) + self.name)
+        for child in self.children:
+            if isinstance(child, Token):
+                logging.debug((u' ' * (indent + 2)) + unicode(child))
+            else:
+                child._pprint(indent + 2)
+
+    def node_from_token(self, token):
+        """Return the ParseNode whose child is `token`."""
+        # if children list is a leaf node, i.e. a Token, then either
+        # this is the parent being sought or `token` doesn't lie in this
+        # branch of the parse tree.
+        if isinstance(self.children[0], Token):
+            if self.children[0] == token:
+                return self
+            return None
+
+        # the child list is non-leaf ParseNodes, so recurse into each of
+        # them to find `token`. It is convenient to write this as a
+        # depth-first search, however since every token of the input
+        # should appear exactly once in the set of parse trees, the
+        # order of search is unimportant.
+        for child in self.children:
+            node = child.node_from_token(token)
+            if node:
+                return node
+        return None
+
+    def pprint(self):
+        self._pprint(0)
         return self.__class__.__name__
 
     @abstractmethod
-    def apply(self, tokens):
-        """Apply this rule to `tokens`. """
+    def apply(self, ea):
+        """Apply this rule to the tokens in EditAssistant `ea`.
+
+        Return True if any tokens were changed.
+        """
         pass
 
 INITIAL_PHASE = 0
 POS_PHASE     = 1
-PARSE_PHASE   = 2
+PARSED_PHASE  = 2
+
+PHASE_NAME = ['Initial', 'Part-of-Speech Tagged', 'Parsed']
 
 
 def get_rules(desired_phase):
                                                    token_to_transform))
 
 
+# ---------------------------------------------------------------------------
+#
+# Initial Phase rules.
+#
+# ---------------------------------------------------------------------------
+
 class WhitespaceSplitRule(Rule):
     phase = INITIAL_PHASE
 
     def __init__(self):
         Rule.__init__(self, 5, "Separate text by whitespace.")
 
-    def apply(self, tokens):
+    def apply(self, ea):
         changed = False
-        for i, token in enumerate(tokens):
+        for i, token in enumerate(ea.tokens):
             if token.is_para or token.non_printing:
                 continue
             if ' ' in token.str:
-                split_token_at_delim(self, tokens, i, ' ', False)
+                split_token_at_delim(self, ea.tokens, i, ' ', False)
                 changed = True
         return changed
 
     def __init__(self):
         Rule.__init__(self, 10, "Search and replace specific strings")
 
-    def apply(self, tokens):
+    def apply(self, ea):
         changed = False
         for ts in RegexCleanupRule.regex_pairs:
-            if ts.apply(self, tokens):
+            if ts.apply(self, ea.tokens):
                 changed = True
         return changed
 
         Rule.__init__(self, 20,
                       "Separate text into paragraphs at line breaks.")
 
-    def apply(self, tokens):
+    def apply(self, ea):
         changed = False
         # search for a newline, consider it and all contiguous remaining
         # whitespace (including other newlines) to be a single paragraph break.
-        for transform_token_index, token in enumerate(tokens):
+        for transform_token_index, token in enumerate(ea.tokens):
             if token.is_para or token.non_printing:
                 continue
             mo = ParagraphRule.LINEBREAK_RE.search(token.str)
                 paragraph_token = Token(u'\n', token.cbegin, token.cbegin + 1)
 
             # replace the token we're transforming
-            tokens[transform_token_index] = paragraph_token
+            ea.tokens[transform_token_index] = paragraph_token
             changed = True
             # insert a token to its left if it is nonempty
             if left_token:
-                tokens.insert(transform_token_index, left_token)
+                ea.tokens.insert(transform_token_index, left_token)
                 # that insertion has made transform_token_index off by
                 # one; correct it.
                 transform_token_index += 1
             # now insert the token to the right, if it is nonempty
             if right_token:
-                tokens.insert(transform_token_index + 1, right_token)
+                ea.tokens.insert(transform_token_index + 1, right_token)
             logging.debug(u'ParagraphRule: %s',
                           filter(None,
                                  [left_token, paragraph_token, right_token]))
         tokens.insert(transform_token_index + 1, post_abbrev_token)
         return True
 
-    def apply(self, tokens):
+    def apply(self, ea):
         changed = False
-        for i, token in enumerate(tokens):
+        for i, token in enumerate(ea.tokens):
             if token.non_printing or not '.' in token.str:
                 continue
             # Token has a dot in it somewhere. Leave it alone if it's
                         logging.debug('DotSplitRule changing {} to {}'.format(
                                 original, token))
                         changed = True
-                elif self._transform_abbrev(tokens, i, abbrev, abbrev_len):
+                elif self._transform_abbrev(ea.tokens, i, abbrev, abbrev_len):
                     # token started with an abbreviation and was split
                     changed = True
             elif len(token.str) > 1:
                 # length check so we don't try to split '.'
-                split_token_at_delim(self, tokens, i, u'.', True)
+                split_token_at_delim(self, ea.tokens, i, u'.', True)
                 changed = True
         return changed
 
             self, 50, "Convert European style thousands"
             " delimiters '1.234.567,89' to American style '1,234,567.89'.")
 
-    def apply(self, tokens):
+    def apply(self, ea):
         changed = False
-        for token in tokens:
+        for token in ea.tokens:
             if token.non_printing or token.is_URL:
                 continue
             mo = EuroDelimiterRule.euro_decimal_number_re.match(token.str)
 
     Avoid splitting numeric punctuation, e.g., 11,000.34 should not be
     split at the comma or the decimal. Also avoid splitting at
-    apostrophes in contractions.
+    apostrophes in contractions, and do not split at single hyphens.
     """
 
     phase = INITIAL_PHASE
                       "Separate punctuation (other than periods)"
                       " into separate tokens.")
 
-    def apply(self, tokens):
+    def apply(self, ea):
         changed = False
-        for token in tokens:
+        for token in ea.tokens:
             if token.non_printing or len(token.str) < 2 or token.is_URL:
                 continue
             # get a list of match objects for embedded decimal numbers
                     logging.debug(
                         u"PunctSplitRule splitting '{}' at {}".format(
                             token.str, i))
-                    split_token_at_index(self, tokens, tokens.index(token), i,
-                                         True)
+                    split_token_at_index(
+                        self, ea.tokens, ea.tokens.index(token), i, True)
                     changed = True
                     break
         return changed
                       "Split conjoined words and "
                       "numbers into separate tokens.")
 
-    def apply(self, tokens):
+    def apply(self, ea):
         # | case | input     | output     |
         # |------+-----------+------------|
         # |    1 | 10am      | 10 am      |
         # |   12 | 1,200.    | 1,200 .    |
         # |   13 | 1,500.00  | 1,500.00   |
         changed = False
-        for token in tokens:
+        for token in ea.tokens:
             # skip non-printing, URL, and short tokens
             if token.non_printing or len(token.str) < 2 or token.is_URL:
                 continue
             if mo:
                 logging.debug(u"AlphaNumSplitRule '{}' at {}".format(
                         token.str, mo.start()))
-                split_token_at_index(self, tokens, tokens.index(token),
+                split_token_at_index(self, ea.tokens, ea.tokens.index(token),
                                      mo.start(), False)
                 changed = True
         return changed
         Rule.__init__(self, 80,
                       "Spell out single digit numbers.")
 
-    def apply(self, tokens):
+    def apply(self, ea):
         """Return True if any tokens are converted from digits to
         spelled numbers."""
         changed = False
-        for i, token in enumerate(tokens):
+        for i, token in enumerate(ea.tokens):
             # ignore tokens that aren't single digits
             if (token.non_printing or
                 not token.has_digits or len(token.str) != 1):
             # token is a digit 1..9, but it should only be spelled out
             # in some contexts. Get the tokens immediately preceding and
             # following it so they can be checked.
-            prev_token, next_token = get_neighbors(tokens, i)
+            prev_token, next_token = get_neighbors(ea.tokens, i)
 
             # don't spell out percentages, i.e. we want the final text
             # to have "7%" not "seven %"
                     original_token_str, token.str))
         return changed
 
-    def apply(self, tokens):
+    def apply(self, ea):
         changed = False
-        for i, token in enumerate(tokens):
+        for i, token in enumerate(ea.tokens):
             if token.non_printing:
                 continue
             if not DelimitThousandsRule._splittable_number_re.match(token.str):
                 continue
             # is preceding or following token recognized as a currency
             # symbol, ISO currency abbreviation, or term?
-            if (i > 0 and tokens[i - 1].is_currency_term or
-                i < len(tokens) - 1 and  tokens[i + 1].is_currency_term):
+            if (i > 0 and ea.tokens[i - 1].is_currency_term or
+                i < len(ea.tokens) - 1 and ea.tokens[i + 1].is_currency_term):
                 if self._separate_thousands(token):
                     changed = True
         return changed
         # the token at start_idx.
         del tokens[start_idx + 1:start_idx + num]
 
-    def apply(self, tokens):
+    def apply(self, ea):
         changed = False
-        for i, token in enumerate(tokens):
+        for i, token in enumerate(ea.tokens):
             if token.non_printing or not token.has_digits:
                 continue
             if not token.is_delimited_integer:
                 continue
-            right = get_right_neighbor(tokens, i)
+            right = get_right_neighbor(ea.tokens, i)
             if not right:
                 continue
             # Can merge with right if it is a 3 digit or longer number,
             # optionally containing delimiting commas and a decimal
             # point.
             if AccreteNumbersRule.mergeable_number_re.match(right.str):
-                self._concat(tokens, i, 2)
+                self._concat(ea.tokens, i, 2)
                 changed = True
                 break
             # can also merge if next token (right) is a comma and the
             # one that follows it is a mergeable number
             if right.str == u',':
-                right2 = get_right_neighbor(tokens, i + 1)
+                right2 = get_right_neighbor(ea.tokens, i + 1)
                 if AccreteNumbersRule.mergeable_number_re.match(right2.str):
-                    self._concat(tokens, i, 3)
+                    self._concat(ea.tokens, i, 3)
                     changed = True
         return changed
 
             self, 110,
             "Put currency abbreviations after the numbers they describe")
 
-    def apply(self, tokens):
+    def apply(self, ea):
         changed = False
-        for i, token in enumerate(tokens):
+        for i, token in enumerate(ea.tokens):
             if token.non_printing or not token.is_ISO_currency:
                 continue
             # token is a 3-letter ISO currency abbreviation. If the next
             # token is a number, swap them, UNLESS the previous token is
             # also a number.
-            left, right = get_neighbors(tokens, i)
+            left, right = get_neighbors(ea.tokens, i)
             if right and right.has_digits:
                 if not left or not left.is_delimited_decimal:
                     logging.debug(
             self, 120,
             "Spell out the first occurrence of an ISO currency abbreviation.")
 
-    def apply(self, tokens):
+    def apply(self, ea):
         changed = False
-        for i, token in enumerate(tokens):
+        for i, token in enumerate(ea.tokens):
             if token.non_printing:
                 continue
             # token is an ISO currency abbreviation like 'PHP' or 'AZN'.
             # to be considered a currency abbreviation, a token must be
             # preceded or followed by a number. Look at the tokens
             # immediately preceding and following the current one.
-            prev_token, next_token = get_neighbors(tokens, i)
+            prev_token, next_token = get_neighbors(ea.tokens, i)
             if (not (prev_token and prev_token.has_digits) and
                 not (next_token and next_token.has_digits)):
                 continue
             #   100 PHP (Philippine peso)
             #   100 Philippine peso (PHP)
             prev_alpha_token, next_alpha_token = get_neighbors(
-                tokens, i, 'is_alpha')
+                ea.tokens, i, 'is_alpha')
             currency_name = pycountry.currencies.get(
                 letter=token.str.upper()).name
             name_words = currency_name.split()
                     logging.debug(
                         u'ISOCurrencyRule prefixing {} '
                         'with $ and deleting {}'.format(prev_token,
-                                                        tokens[i - 1]))
+                                                        ea.tokens[i - 1]))
                     token.str = '$' + prev_token.str
-                    del tokens[i - 1]
+                    del ea.tokens[i - 1]
                     changed = True
                 else:
                     # swap them first
                         letter=token.str.upper()).name.split()):
                     logging.debug(u'ISOCurrencyRule inserting ' +
                                   new_token.str)
-                    tokens.insert(i, new_token)
+                    ea.tokens.insert(i, new_token)
                     i += 1
                 logging.debug(u'ISOCurrencyRule inserting ( before ' +
-                              tokens[i].str)
-                tokens.insert(i, Token(u'('))
+                              ea.tokens[i].str)
+                ea.tokens.insert(i, Token(u'('))
                 logging.debug(u'ISOCurrencyRule inserting ) after ' +
-                              tokens[i + 1].str)
-                tokens.insert(i + 2, Token(u')'))
+                              ea.tokens[i + 1].str)
+                ea.tokens.insert(i + 2, Token(u')'))
                 changed = True
             # Exit the loop here because only the first abbreviation
             # should be spelled out.
             break
         return changed
 
+# TODO: spell out sentence-initial numbers
 
-class SentenceDelimitRule(Rule):
+# ---------------------------------------------------------------------------
+#
+# Part-of-Speech Phase rules.
+#
+# These rules expect tokens to have part-of-speech tags.
+#
+# ---------------------------------------------------------------------------
+
+
+class DelimitSentencesRule(Rule):
     """Insert delimiter tokens between beginning and end of sentences.
     """
 
-    phase = INITIAL_PHASE
-
-    def __init__(self):
-        Rule.__init__(
-            self, 130,
-            "Surround every sentence with a sentence-delimiter token.")
-
-    def apply(self, tokens):
-        """Return a transform that will insert the delimiter tokens.
-
-        This rule is only intended to run once. It will disable itself
-        after the first run. If it detects any pre-existing sentence
-        delimiter tokens, it will return an empty list.
-        """
-        changed = False
-        # do nothing if this rule has ever been run.
-        for token in tokens:
-            if token.non_printing:
-                continue
-            if token.sentence_delim:
-                self.enabled = False
-                break
-        return changed
-
-
-class YearOldRule(Rule):
-    """Change 'xx years old' and 'xx year old' to 'xx-year-old' as
-    appropriate.
-    """
-
     phase = POS_PHASE
 
     def __init__(self):
         Rule.__init__(
             self, 200,
+            "Surround every sentence with a sentence-delimiter token.")
+
+    def apply(self, ea):
+        """Return a transform that will insert the delimiter tokens.
+
+        This rule is only intended to run once. It will disable itself
+        after the first run. If it detects any pre-existing sentence
+        delimiter tokens, it will return an empty list.
+        """
+        changed = False
+        i = 0
+        while i < len(ea.tokens):
+            # do nothing if this rule has ever been run.
+            if ea.tokens[i].sentence_delim:
+                self.enabled = False
+                break
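+            # sentence-final punctuation is assumed to carry the POS
+            # tag u'.' (the Treebank tag the tagger uses for . ! ?)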
+            if getattr(ea.tokens[i], 'pos', None) == u'.':
+                # give each boundary its own delimiter Token instance
+                sentence_delim_token = Token(u'*EOS*')
+                sentence_delim_token.sentence_delim = True
+                ea.tokens.insert(i + 1, sentence_delim_token)
+                changed = True
+                i += 1  # skip over the inserted token
+            i += 1
+        return changed
+
+
+# ---------------------------------------------------------------------------
+#
+# Parsed Phase rules.
+#
+# These rules expect ea.parses to be valid.
+#
+# ---------------------------------------------------------------------------
+
+class YearOldRule(Rule):
+    """Change 'xx years old' and 'xx year old' to 'xx-year-old' as
+    appropriate.
+    """
+
+    phase = PARSED_PHASE
+
+    def __init__(self):
+        Rule.__init__(
+            self, 300,
             "Fix incorrect plural and lack of hyphens in phrases "
             "like '20 years old man'.")
 
-    def apply(self, tokens):
+    def apply(self, ea):
+        changed = False
+        # search each sentence's tokens for "years old"
+        for sent in ea.sentences:
+            if sent.parse is None:
+                # the parser produced no tree for this sentence
+                continue
+            years_idx = sent.find_sequence(u'years old')
+            if years_idx is None:
+                years_idx = sent.find_sequence(u'years - old')
+            if years_idx is None:
+                continue
+            # get the node whose only child is the token 'years'
+            years_node = sent.parse.node_from_token(ea.tokens[years_idx])
+            if (years_node is None or
+                years_node.parent.name != 'plur_noun_orule'):
+                continue
+            make_singular = False
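+            # The full parse of "She is 42 years old." below shows a
+            # complete tree; the smaller fragments preceding each
+            # make_singular case show the ancestor patterns that case
+            # checks for.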
+        # She is 42 years old.
+        # root_informal
+        #   subjh
+        #     bare_npq
+        #       she
+        #         <She 0:3>
+        #     hcomp
+        #       be_c_is
+        #         <is 4:6>
+        #       npadv
+        #         appos
+        #           bare_np
+        #             adjn
+        #               attr_adj_verb_psv_part
+        #                 generic_trans_verb_pas
+        #                   <42 7:9>
+        #               plur_noun_orule
+        #                 year_n1
+        #                   <years 10:15>
+        #           proper_np
+        #             adjn
+        #               old_a1
+        #                 <old 16:19>
+        #               noptcomp
+        #                 generic_year_ne
+        #                   <. 19:20>
+
+
+            # nadj_rr
+            #   measure_np
+            #     generic_number
+            #       <51 2:4>
+            #     plur_noun_orule
+            #       year_n1
+            #         <years 5:10>
+            #   npadv
+            try:
+                make_singular = (
+                    years_node.parent.parent.parent.children[1].name ==
+                    'npadv')
+                if make_singular:
+                    logging.debug('make_singular case 1')
+            except (AttributeError, IndexError):
+                pass
+            # npadv_mnp
+            #   adjh_s_xp
+            #     a_one_adj
+            #       <A 0:1>
+            #     measure_np
+            #       generic_card_ne
+            #         <51 2:4>
+            #       plur_noun_orule
+            #         year_n1
+            #           <years 5:10>
+            if not make_singular:
+                try:
+                    make_singular = (
+                        years_node.parent.parent.parent.parent.name ==
+                        'npadv_mnp')
+                    if make_singular:
+                        logging.debug('make_singular case 2')
+                except (AttributeError, IndexError):
+                    pass
+            # appos
+            #   bare_np
+            #     plur_noun_orule
+            #       year_n1
+            #         <years 5:10>
+            if not make_singular:
+                try:
+                    make_singular = (
+                        years_node.parent.parent.parent.name == 'appos')
+                    if make_singular:
+                        logging.debug('make_singular case 3')
+                except (AttributeError, IndexError):
+                    pass
+            # She is a 42-year old farmer.
+            # frag_np
+            #   bare_np
+            #     punct_hinit
+            #       plur_noun_orule
+            #         year_n1
+            #           <years 11:16>
+            #       s_dash_pct
+            #         <- 16:17>
+            if not make_singular:
+                try:
+                    make_singular = (
+                        years_node.parent.parent.parent.name == 'bare_np' and
+                        years_node.parent.parent.children[1].name == 's_dash_pct')
+                    if make_singular:
+                        logging.debug('make_singular case 4')
+                except (AttributeError, IndexError):
+                    pass
+
+            if make_singular:
+                years_token = years_node.children[0]
+                original = unicode(years_token)
+                years_token.str = u'year'
+                logging.debug(u'YearOldRule changed {} to {}'.format(
+                        original, unicode(years_token)))
+                changed = True
-        return False
+        return changed
 import tempfile
 import codecs
 
+TNT_BIN = '/home/david/delphin/bin/tnt'
 TRIGRAM_PATH = '/home/david/delphin/components/tnt/models/wsj.tnt'
 
 
         token_file_writer.flush()
 
         # Execute TNT; capture stderr so it doesn't pollute the console
-        process = subprocess.Popen(['tnt', TRIGRAM_PATH, token_file.name],
+        process = subprocess.Popen([TNT_BIN, TRIGRAM_PATH, token_file.name],
                                    stdin=subprocess.PIPE,
                                    stdout=subprocess.PIPE,
                                    stderr=subprocess.PIPE)