david_walker avatar david_walker committed bcc8036

implementation progress report, initial (incomplete) version


Files changed (8)

 This project requires DELPH-IN technology, namely the PET parser and the English Resource Grammar.  It also requires the TnT (Trigrams'n'Tags) part-of-speech tagger.
+
 
 class Sentence(object):
     """Acts as a container of Tokens which has a parse attribute."""
-    def __init__(self, ea, start_idx, end_idx, parse=None):
+    def __init__(self, ea, start_idx, end_idx):
         self.ea = ea
-        self.start = start_idx
-        self.end = end_idx
-        self.parse = parse
+        # Because methods in rules.py and elsewhere may insert or delete
+        # tokens within and between sentences, the integer indexes of
+        # the start and end delimiters of the sentences will change.
+        #
+        # Therefore this class keeps track of the sentence delimiting
+        # tokens themselves, and computes their indexes on demand.
+        self.start_token = self.ea.tokens[start_idx]
+        assert self.start_token.bos
+        self.end_token = self.ea.tokens[end_idx]
+        assert self.end_token.eos
+        # Since parsing is very expensive (some sentences can take
+        # minutes to parse) it is done on-demand only.
+        self._parse = None
+
+    @property
+    def start_idx(self):
+        """Return the 0-based index into self.ea.tokens of this
+        sentence's beginning-of-sentence delimiter token.
+        """
+        return self.ea.tokens.index(self.start_token)
+
+    @property
+    def end_idx(self):
+        """Return the 0-based index into self.ea.tokens of this
+        sentence's end-of-sentence delimiter token.
+        """
+        return self.ea.tokens.index(self.end_token)
+
+    @property
+    def parse(self):
+        """Return the parse tree for this sentence.
+        """
+        if self._parse is None:
+            self._parse = self.ea.parser.parse(
+                self.ea.tokens[self.start_idx + 1:self.end_idx])
+        return self._parse
 
     def find_sequence(self, item):
-        """Support set membership operator "in" for strings, Tokens, and arrays
-        of strings or Tokens."""
+        """Search for a sequence of tokens whose strings match the
+        supplied regular expression(s).
+        """
         if isinstance(item, basestring):
             match_list = item.split()
         else:
             match_list = item
-        sentence_idx = self.start
+        sentence_idx = self.start_idx
+        end_idx = self.end_idx
         match_index = 0
-        while sentence_idx < self.end:
+        while sentence_idx < end_idx:
             if not re.match(match_list[match_index],
                             self.ea.tokens[sentence_idx].str):
                 match_index = 0
         self.tokens = [Token(self._original_text, 0,
                               len(self._original_text)), eof_token]
         self.sentences = []
-        self._parser = keaparser.Parser()
+        self.parser = keaparser.Parser()
         # apply first phase rules to replace the original Token object
         # with multiple Token objects, one for each bit of the input
         # text that qualifies as a single input token.
         # properties. This includes the DelimitSentencesRule which
         # inserts non-printing tokens that mark sentence boundaries.
         self._process_tokens(rules.POS_PHASE)
+        self._dump_tokens()
         # for each sentence, generate a parse tree
-        self._parse_sentences()
+        self._make_sentences()
         # now apply rules that require sentence parses
         self._process_tokens(rules.PARSED_PHASE)
         self._generate_output()
                  len(self.edited_text) >= 2 and
                  self.edited_text[-2:] == u'\n*'))
 
-    def _dump_sentences(self):
+    def _dump_tokens(self):
         for t in self.tokens:
-            print t.str,
-            if t.sentence_delim:
+            print t,
+            if t.str == '\n':
                 print
+        print
 
     def _generate_output(self):
         quote_stack = []
                 self.edited_text += u' '
         self.edited_text = self.edited_text.strip()
 
-    def _parse_sentences(self):
-        # for each range of tokens representing a sentence, generate a
-        # parse tree.
+    def _make_sentences(self):
+        """Populate the self.sentences array with Sentence objects.
+        """
         sentence_start_idx = 0
         sentence_end_idx = None
         while sentence_start_idx < len(self.tokens):
-            # The next sentence starts with the first printing token
-            # that is not a paragraph marker.
+            # The next sentence starts with a non-printing token that
+            # has the beginning of sentence property (bos) set.
             while (sentence_start_idx < len(self.tokens) and
-                   (self.tokens[sentence_start_idx].is_para or
-                    self.tokens[sentence_start_idx].non_printing)):
+                   not self.tokens[sentence_start_idx].bos):
                 sentence_start_idx += 1
 
             # If we couldn't find the start of the next sentence, stop
             # The end of the sentence must be beyond the starting token.
             sentence_end_idx = sentence_start_idx + 1
 
-            # move the end index to the right until something that
-            # delimits sentences is found.
+            # move the end index to the right until the end of sentence
+            # delimiting token is found.
             while sentence_end_idx < len(self.tokens):
-                cur_token = self.tokens[sentence_end_idx]
                # if we've found a delimiting token, make a
                 # sentence, then break out of this inner while loop to
                 # start searching for the start of the next sentence.
-                if cur_token.non_printing or cur_token.is_para:
+                if self.tokens[sentence_end_idx].eos:
                     self.sentences.append(
                         Sentence(self, sentence_start_idx, sentence_end_idx))
                     sentence_start_idx = sentence_end_idx + 1
                     break
                 # no delimiter yet, keep looking for end
                 sentence_end_idx += 1
-        for sent in self.sentences:
-            sent.parse = self._parser.parse(self.tokens[sent.start:sent.end])
 
     def _process_tokens(self, phase):
         rules_to_run = rules.get_rules(phase)
         while True:
             logging.debug('***calling %s rules', rules.PHASE_NAME[phase])
+            self._dump_tokens()
             changed = False
             for rule in rules_to_run:
                 if rule.enabled and rule.apply(self):
             # token contains text; if its cbegin==cend then it didn't
             # appear in the original text and was inserted.
             if token.cbegin == token.cend:
-                print "inserted", token.str
+                print 'inserted "{}"'.format(token.str)
             elif self._original_text[token.cbegin:token.cend] != token.str:
                 print u'Changed "{}" to "{}"'.format(
                     self._original_text[token.cbegin:token.cend], token.str)
                   '-results=1 -server /home/david/delphin/erg/english.grm')
     SERVER_URL = u'http://localhost:4711/cheap-rpc2'
     PIC_FILE = 'pic.xml'
+    DTD_PATH = u'/home/david/Projects/Kiva-dev/pic.dtd'
     MAX_ALIVE_CHECKS = 10
 
     def __init__(self):
         can understand.
         """
         xml = xmlwitch.Builder(version='1.0', encoding='utf-8')
+        xml.write(u'<!DOCTYPE pet-input-chart SYSTEM "{}">\n'.format(
+                Parser.DTD_PATH))
         with xml.pet_input_chart:
             i = 1
             cpos = 1
            (\S*)        # any amount of non-space chars """,
         re.I | re.U | re.VERBOSE)
 
+    @staticmethod
+    def bos_token():
+        """A convenience class-factory method to return an
+        beginning-of-sentence delimiter token."""
+        bos_token = Token('*BOS*')
+        bos_token.bos = True
+        return bos_token
+
+    @staticmethod
+    def eos_token():
+        """A convenience class-factory method to return an
+        end-of-sentence delimiter token."""
+        eos_token = Token('*EOS*')
+        eos_token.eos = True
+        return eos_token
+
     def __init__(self, s, cbegin=None, cend=None):
         """Initialize from text.
 
         self._abbrev_checked = False
         self._abbrev_match = None
         self._abbrev_match_len = 0
-        self.sentence_delim = None
-        self.eof = None
+        self.bos = None  # beginning of sentence
+        self.eos = None  # end of sentence
+        self.eof = None  # end of file
         self._URL_checked = False
         self._is_URL = None
 
         """Return True if any of the attributes are set which indicate a
         non-printing token.
         """
-        return self.sentence_delim or self.eof
+        return self.bos or self.eos or self.eof

report/progress.tex

+\documentclass[12pt]{article}
+
+\title{Kiva Editor's Assistant\\Progress Report}
+\author{
+        David Walker \\
+        david.walker64@gmail.com
+}
+\date{\today}
+
+\newcommand{\tkn}[2]{
+\ensuremath{\langle\!} #1 #2 \ensuremath{\!\rangle}
+}
+\begin{document}
+\maketitle
+
+\begin{abstract}
+The Kiva Editor's Assistant (KEA) provides automated support for microfinance charity Kiva.org's volunteer editors, whose job is to clean up English-language loan descriptions that are generally written by non-native English speakers. KEA iteratively applies rules to its input; these rules perform not only surface-structure tasks (such as tokenizing, tagging, determining sentence boundaries, applying regular expression search/replace operations, and expanding ISO currency abbreviations) but also deep-structure analysis, such as correcting the pluralization of phrases. The user is presented with the edited result and a report of the changes made. This paper describes the current status of the project and its implementation goals.
+\end{abstract}
+
+\section{Introduction}
+The Kiva Editor's Assistant (KEA) uses TnT and PET's ``cheap'' parser loaded with the ERG, with the goal of providing automated support for microfinance charity Kiva.org's volunteer editors, whose task is to clean up English-language loan descriptions that are generally written by non-native English speakers. KEA makes multiple passes over its input, tokenizing it and applying rules that range from simple regular expression search/replace operations, to expanding only the first occurrence of each ISO currency abbreviation, to correcting the pluralization of phrases like ``is a 20 years old farmer''. The pluralization rule consults the ERG parse tree so that a phrase like ``is a farmer who is 20 years old'' is left untouched.
+
+\section{Rules}
+
+KEA uses a rule-based approach to processing text.  Rules are grouped into phases and ordered by priority; all the rules of a given phase are run repeatedly, in priority order, until a pass produces no changes to the text, at which point the next phase is invoked. When the last phase no longer produces changes, KEA converts its tokens into output text and delivers that along with a report of the changes made to the original text.
+
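+A minimal sketch of this control loop in Python (the names here are
+illustrative; the actual driver is \verb|_process_tokens| in the
+source):
+
+\begin{verbatim}
+def run_phases(ea, phases):
+    # Run each phase's rules, in priority order, until a full
+    # pass produces no changes; then move on to the next phase.
+    for phase in phases:
+        rules_to_run = rules.get_rules(phase)
+        changed = True
+        while changed:
+            changed = False
+            for rule in rules_to_run:
+                if rule.enabled and rule.apply(ea):
+                    changed = True
+\end{verbatim}
+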
+The following sections describe the rules in priority order.
+
+\subsection{Initial Phase}
+\subsubsection{Splitting at Spaces}
+KEA initially creates one token that encompasses the entire input, then subdivides that token. The first subdivision is performed by splitting the input text at spaces, resulting in one token for every space-delimited stretch of characters.  Each token records the beginning and ending offset into the original text that it represents, and this indexing information is preserved when splitting (or, later, merging) tokens.
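+
+The following sketch shows how a whitespace split can preserve those
+offsets, assuming the \verb|Token(str, cbegin, cend)| constructor shown
+in the source:
+
+\begin{verbatim}
+import re
+
+def split_at_spaces(token):
+    # one new Token per run of non-space characters, each
+    # indexed into the original text via the parent's cbegin
+    return [Token(mo.group(),
+                  token.cbegin + mo.start(),
+                  token.cbegin + mo.end())
+            for mo in re.finditer(r'\S+', token.str)]
+\end{verbatim}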
+
+\subsubsection{Splitting at Newlines}
+
+Every run of one or more newline characters is converted into a single paragraph delimiter token. Besides marking a hard paragraph break, these tokens also provide a clue to the sentence boundary detection rule.
+
+\subsubsection{Splitting at Dots}
+
+A common error in the source text is a lack of spaces after sentence-final punctuation, leading to sentences of the form ``One sentence.And another.'' This rule splits periods into separate tokens, unless they are part of a URL, an abbreviation, or a decimal number.
+
+The preceding example, after whitespace split, would result in these tokens:
+
+\tkn{One}{0:3} \tkn{sentence.And}{4:16} \tkn{another.}{17:25} 
+
+After splitting at dots, the resulting tokens are:
+
+\tkn{One}{0:3} \tkn{sentence}{4:12} \tkn{.}{12:13} \tkn{And}{13:16} \tkn{another}{17:24} \tkn{.}{24:25} 
+
+\subsubsection{Converting Decimal Delimiters}
+
+Nearly all loans use American-style delimiters for decimal numbers, e.g., ``12,345.67''.  A very small number of loans use European-style delimiters, e.g., ``12.345,67''. For consistency, this rule converts European-style decimal numbers into American-style.
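+
+A sketch of the conversion (the function name is hypothetical):
+
+\begin{verbatim}
+import re
+
+def americanize(number_str):
+    # "12.345,67" -> "12,345.67": swap the roles of '.' and ','
+    if re.match(r'[0-9]{1,3}(\.[0-9]{3})*,[0-9]+$', number_str):
+        return (number_str.replace('.', '\x00')
+                .replace(',', '.').replace('\x00', ','))
+    return number_str
+\end{verbatim}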
+
+\subsubsection{Splitting at Other Punctuation}
+
+This rule splits tokens at punctuation other than dots.  It is constructed to avoid splitting tokens at numeric punctuation (thousands and decimal delimiters), apostrophes used as contractions and possessive markers (but not as quotes), and asterisks that appear at the start of words (these occur in loans as footnote markers).
+
+This rule also avoids splitting hyphenated words; with the way the token lattice is currently constructed, PET will fail to parse separate tokens like ``47'' ``year'' ``-'' ``old'' but will succeed with a sequence like ``47'' ``year-old''.
+
+\subsubsection{Alphanumeric Split}
+
+This rule splits one token into two for a number of cases, illustrated in table \ref{tab:alpha_split} (a regular-expression sketch of the digit/letter boundary cases follows the table):
+
+\begin{table}[h]
+\begin{center}
+\begin{tabular}{|l|l|}
+ input      &  output      \\
+\hline
+ 10am       &  10 am       \\
+ 10.00am    &  10.00 am    \\
+ 10:00am    &  10:00 am    \\
+ 10:00a.m.  &  10:00 a.m.  \\
+ 500foo     &  500 foo     \\
+ bar200     &  bar 200     \\
+ ksh.1000   &  ksh. 1000   \\
+ 1,200.     &  1,200 .     \\
+ 1200.      &  1200 .      \\
+\end{tabular}
+\caption{Alphanumeric token splitting.}
+\label{tab:alpha_split}
+\end{center}
+\end{table}
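+
+A regular-expression sketch covering the digit/letter boundary cases
+above (the ``ksh.'' case needs additional abbreviation handling not
+shown here):
+
+\begin{verbatim}
+import re
+
+def alnum_split(s):
+    # '10am' -> ('10', 'am'); 'bar200' -> ('bar', '200');
+    # '1,200.' -> ('1,200', '.')
+    mo = (re.match(r'([0-9.,:]+)([a-zA-Z.]+)$', s) or
+          re.match(r'([a-zA-Z]+)([0-9]+)$', s))
+    return (mo.group(1), mo.group(2)) if mo else (s,)
+\end{verbatim}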
+
+
+
+\subsubsection{Regular Expression Search and Replace }
+
+The user must have the opportunity to review every (non-whitespace) change that KEA has made to the original text. This requirement has an impact on the regular expression search and replace (regex) rules: it would be convenient to have the regex rules operate on the original all-encompassing token before any whitespace split occurs, but doing so would render these changes internal to that single token, which has the disadvantage that textual changes would have to be reconstructed by performing a diff operation between the original and the generated output.  
+
+Maintaining a link between individual tokens and the indexes of the original text they were initialized from not only makes generating a change report a simpler task but also makes for much easier debugging of the system, as tracking the changes to tokens as the result of successive rule applications is straightforward.
+
+For example, consider the sentence ``Pat joined in the year 2009.''  This is tokenized as:
+
+\tkn{Pat}{0:3} \tkn{joined}{4:10} \tkn{in}{11:13} \tkn{the}{14:17} \tkn{year}{18:22} \tkn{2009}{23:27} \tkn{.}{27:28} 
+
+and is changed by a regex rule to:
+
+\tkn{Pat}{0:3}  \tkn{joined}{4:10}  \tkn{in}{11:13}  \tkn{2009}{23:27}  \tkn{.}{27:28} 
+
+The regex rule processor uses dynamic programming to compute the Levenshtein distance at the token level between the source and target strings, which allows it to perform the minimal number of insertions, deletions, and edits to the source tokens to transform them so they represent the target strings.
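+
+A sketch of that computation (the operation names are illustrative;
+the actual routine is \verb|get_levenshtein_dist|):
+
+\begin{verbatim}
+def edit_ops(source, target):
+    # standard Levenshtein DP table, over tokens rather than
+    # characters
+    m, n = len(source), len(target)
+    d = [[0] * (n + 1) for _ in range(m + 1)]
+    for i in range(m + 1):
+        d[i][0] = i
+    for j in range(n + 1):
+        d[0][j] = j
+    for i in range(1, m + 1):
+        for j in range(1, n + 1):
+            cost = 0 if source[i - 1] == target[j - 1] else 1
+            d[i][j] = min(d[i - 1][j] + 1,         # delete
+                          d[i][j - 1] + 1,         # insert
+                          d[i - 1][j - 1] + cost)  # keep/substitute
+    # walk back through the table to recover a minimal edit script
+    ops, i, j = [], m, n
+    while i or j:
+        cost = 0 if i and j and source[i - 1] == target[j - 1] else 1
+        if i and j and d[i][j] == d[i - 1][j - 1] + cost:
+            ops.append('keep' if cost == 0 else 'substitute')
+            i, j = i - 1, j - 1
+        elif i and d[i][j] == d[i - 1][j] + 1:
+            ops.append('delete')
+            i -= 1
+        else:
+            ops.append('insert')
+            j -= 1
+    return list(reversed(ops))
+\end{verbatim}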
+
+\subsubsection{Spelling Single Digits}
+
+As a purely stylistic measure, single digits are spelled out, unless they indicate a percentage (7\%), are part of a list [e.g. 1) 2) 3)], or indicate an amount of currency (\$7).  Items for further work include recognizing when single digits are part of an address, or occur in a list that contains multi-digit numbers. A frequent example of the latter is a list of children's ages, such as ``Pat has 3 children aged 4, 8, and 12.'' In this case the ``3'' should be spelled out, but the ``4'' and ``8'' should not.
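+
+A sketch of the exclusion logic described above:
+
+\begin{verbatim}
+DIGIT_WORDS = {u'1': u'one', u'2': u'two', u'3': u'three',
+               u'4': u'four', u'5': u'five', u'6': u'six',
+               u'7': u'seven', u'8': u'eight', u'9': u'nine'}
+
+def spelled(prev_str, digit_str, next_str):
+    # keep "7%" and "7)" numeric, and "$7" as a currency amount
+    if next_str in (u'%', u')') or prev_str == u'$':
+        return digit_str
+    return DIGIT_WORDS.get(digit_str, digit_str)
+\end{verbatim}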
+
+\subsubsection{Delimiting Currency}
+\label{sec:delim_currency}
+
+Currency amounts of more than four digits are delimited with commas separating thousands. Identifying a number as a currency amount requires recognizing currency symbols like ``\$'', ISO abbreviations such as ``PHP'' (Philippine Peso), and local currency terms such as ``/='' (a Ugandan abbreviation for shilling).
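+
+Once a number is known to be a currency amount, inserting the
+delimiters is straightforward, as this sketch shows:
+
+\begin{verbatim}
+def delimit_thousands(s):
+    # '12345.67' -> '12,345.67'
+    whole, dot, frac = s.partition(u'.')
+    groups = []
+    while len(whole) > 3:
+        groups.insert(0, whole[-3:])
+        whole = whole[:-3]
+    groups.insert(0, whole)
+    return u','.join(groups) + dot + frac
+\end{verbatim}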
+
+\subsubsection{Concatenating Numbers}
+\label{sec:concat_num}
+
+This is the first of the rules that merge tokens. It searches for consecutive numeric tokens that either have spaces where thousands separators should be, or spaces in addition to thousands separators.
+
+\subsubsection{Currency Abbreviation Placement}
+
+This rule swaps the position of an ISO currency abbreviation and a following number, unless the abbreviation is also preceded by a number. For example, 
+
+\tkn{PHP}{0:3} \tkn{1000}{4:8}
+
+becomes:
+
+\tkn{1000}{0:3} \tkn{PHP}{4:8}
+
+\subsubsection{ISO Currency Abbreviation Expansion}
+
+The initial occurrence of an ISO currency abbreviation is spelled out.  For example, ``one 5000 KES loan and another 7000 KES loan'' becomes ``one 5000 Kenyan Shilling (KES) loan and another 7000 KES loan''.
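+
+A sketch of the expansion (the table of currency names is abbreviated
+here):
+
+\begin{verbatim}
+ISO_NAMES = {u'KES': u'Kenyan Shilling'}  # etc.
+
+def expand_first_occurrence(tokens):
+    # spell out only the first mention of each abbreviation
+    seen = set()
+    for token in tokens:
+        if token.str in ISO_NAMES and token.str not in seen:
+            seen.add(token.str)
+            token.str = u'{} ({})'.format(ISO_NAMES[token.str],
+                                          token.str)
+\end{verbatim}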
+
+\subsection{POS Phase Rules}
+
+Rules in this phase depend on the TnT part-of-speech tagger having supplied POS tags to all tokens.  Before processing any of the rules in this phase, KEA writes the text of each token to its own line in a temporary file, then invokes TnT on it, using the WSJ PTB tag set.  Early experimentation showed that providing PET with only the highest-priority tag for each token led to parse failures, so KEA launches TnT as a child process with command-line parameters that request not just the highest-probability tag but all tags with at least one hundredth that probability for each token.
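+
+A sketch of that invocation; the \verb|-z100| flag (emit every tag
+within 1/100 of the best tag's probability) and the \verb|wsj| model
+name are assumptions about the local TnT installation:
+
+\begin{verbatim}
+import subprocess, tempfile
+
+def tag_tokens(token_strings):
+    # one token per line, as TnT expects
+    with tempfile.NamedTemporaryFile('w', delete=False) as f:
+        f.write(u'\n'.join(token_strings).encode('utf-8'))
+    return subprocess.check_output(['tnt', '-z100', 'wsj', f.name])
+\end{verbatim}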
+
+\subsubsection{Sentence Boundary Detection}
+
+This rule looks for a POS tag indicating sentence-final punctuation, as well as terminating sentences when a hard newline is found.  The latter is of course generally \emph{not} a reliable indicator of a sentence boundary, but in Kiva loan description text, it is quite reliable.
+
+\subsection{Parsed Phase Rules}
+
+These rules expect that a parse can at least be attempted, which is why this phase must follow the part-of-speech tagging phase. For performance reasons, sentences are only parsed on demand.  KEA uses PET (specifically the binary ``cheap'') as an XML-RPC server; if the server isn't detected, KEA will launch it as a daemon.
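+
+A sketch of the detect-or-launch logic; the command line and
+\verb|MAX_ALIVE_CHECKS| come from the Parser class in the source:
+
+\begin{verbatim}
+import socket, subprocess, time
+
+def ensure_parser_running():
+    for attempt in range(Parser.MAX_ALIVE_CHECKS):
+        try:
+            socket.create_connection(('localhost', 4711), 1).close()
+            return  # server is up
+        except socket.error:
+            if attempt == 0:  # not up; launch it as a daemon
+                subprocess.Popen(
+                    'cheap -results=1 -server '
+                    '/home/david/delphin/erg/english.grm'.split())
+            time.sleep(1)
+    raise RuntimeError('cheap server did not come up')
+\end{verbatim}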
+
+KEA uses PET Input Chart (PIC) input mode, which requires an XML document\footnote{See http://moin.delph-in.net/PetInputChart}. 
+To request the parse of a sentence, KEA converts a sequence of tokens to an XML file using the document type definition pic.dtd from the Heart of Gold\footnote{See http://moin.delph-in.net/HeartofgoldTop} and supplies that to PET. Currently only the w, surface, and pos elements are used; see figure \ref{fig:pic} for an example.
+
+A high-priority work item is either to extend the KEA code to create a richer token lattice, or to outsource that job to Heart of Gold's TnTpiXML tokenizer: some test inputs fail to parse in a reasonable time using the current simplistic token lattice, but are manageable using a lattice created by TnTpiXML.
+
+In either case, there is another opportunity to improve the lattice that is unique to this application. Every loan description contains the name of at least one borrower; these names are available in an HTML table on the Kiva editing web page.  A future work item, then, is to extract the HTML table contents and use those to mark the names in the token lattice as named entities. As an example, see the handling of the name ``Kim Novak'' at http://moin.delph-in.net/PetInputChart.
+
+\subsubsection{Age Expression}
+
+The first (and currently only) parse-phase rule searches sentences for expressions like ``x years old'', then examines the parse provided by PET to determine whether the pluralization and hyphenation need to be corrected.  Currently it considers eleven cases.
+
+
+The first three cases are considered correct usages and are not changed:
+
+\begin{enumerate}
+\item Pat is 47 years old.
+\item Pat is a 47-year-old farmer.
+\item Pat is 1 year old.
+\end{enumerate}
+
+These cases have the correct pluralization, but lack one or more hyphens:
+
+\begin{enumerate}
+\setcounter{enumi}{3}
+\item Pat is a 47 year old farmer.
+\item Pat is a 47 year old lady who is a farmer.
+\item Pat is a 47-year old farmer.
+\item Pat is a 47 year-old farmer.
+\end{enumerate}
+
+The remaining cases all have incorrect pluralization and, except for the last, are missing one or more hyphens:
+
+\begin{enumerate}
+\setcounter{enumi}{7}
+\item Pat is a 47 years old farmer. \label{it:years}
+\item Pat is a 47-years old farmer.
+\item Pat is a 47 years-old farmer.
+\item Pat is a 47-years-old farmer.
+\end{enumerate}
+
+Consider case \ref{it:years}. Having determined that the sentence contains the sequence ``years old'', KEA requests a parse tree (figure \ref{fig:farmer}). Seeing that the grandparent of the \tkn{years}{12:17} token is $\tt plur\_noun\_orule$ and that its great-grandparent's sibling is $\tt npadv$, which satisfy the criteria for case \ref{it:years}, KEA changes the tokens \tkn{47}{9:11} \tkn{years}{12:17} \tkn{old}{18:21} to \tkn{47-year-old}{9:11}.
+
+\begin{figure}
+  \centering
+\begin{verbatim}
+<?xml version="1.0" encoding="utf-8"?>
+<!DOCTYPE pet-input-chart SYSTEM "/home/david/Projects/Kiva-dev/pic.dtd">
+<pet-input-chart>
+  <w id="W1" cend="4" cstart="1">
+    <surface>Pat</surface>
+    <pos tag="NNP" prio="1.000000e+00" />
+  </w>
+  <w id="W2" cend="7" cstart="5">
+    <surface>is</surface>
+    <pos tag="VBZ" prio="1.000000e+00" />
+  </w>
+  <w id="W3" cend="9" cstart="8">
+    <surface>a</surface>
+    <pos tag="DT" prio="1.000000e+00" />
+  </w>
+  <w id="W4" cend="12" cstart="10">
+    <surface>47</surface>
+    <pos tag="CD" prio="1.000000e+00" />
+  </w>
+  <w id="W5" cend="18" cstart="13">
+    <surface>years</surface>
+    <pos tag="NNS" prio="1.000000e+00" />
+  </w>
+  <w id="W6" cend="22" cstart="19">
+    <surface>old</surface>
+    <pos tag="JJ" prio="1.000000e+00" />
+  </w>
+  <w id="W7" cend="29" cstart="23">
+    <surface>farmer</surface>
+    <pos tag="NN" prio="1.000000e+00" />
+  </w>
+  <w id="W8" cend="31" cstart="30">
+    <surface>.</surface>
+    <pos tag="." prio="1.000000e+00" />
+  </w>
+</pet-input-chart>
+\end{verbatim}
+\caption{PET Input Chart for ``Pat is a 47 years old farmer.''}
+\label{fig:pic}
+\end{figure}
+
+\begin{figure}[H]
+\begin{verbatim}
+root_informal
+  subjh
+    proper_np
+      sing_noun_irule
+        pat
+          <Pat 0:3>
+    hcomp
+      be_c_is
+        <is 4:6>
+      npadv_mnp
+        adjh_s_xp
+          a_one_adj
+            <a 7:8>
+          nadj_rr
+            measure_np
+              generic_card_ne
+                <47 9:11>
+              plur_noun_orule
+                year_n1
+                  <years 12:17>
+            npadv
+              proper_np
+                adjn
+                  old_a1
+                    <old 18:21>
+                  noun_n_cmpnd
+                    farmer_n1
+                      <farmer 22:28>
+                    sing_noun_irule
+                      generic_date_ne
+                        <. 28:29>
+\end{verbatim}
+\caption{Parse tree for ``Pat is a 47 years old farmer.''}
+\label{fig:farmer}
+\end{figure}
+
+
+\section{Examples}
+
+Given the initial tokens:
+
+\tkn{10}{0:2} \tkn{000}{3:6} \tkn{PHP}{7:10}
+
+the concatenation rule (section \ref{sec:concat_num}) merges them; note that the indexes into the original text, spanning 0:6, are preserved:
+
+\tkn{10000}{0:6} \tkn{PHP}{7:10} 
+
+then the currency delimiting rule (section \ref{sec:delim_currency}) fires, producing:
+
+\tkn{10,000}{0:6} \tkn{PHP}{7:10} 
+
+
+
+\begin{table}[h]
+\begin{center}
+\begin{tabular}{|l|l|}
+ input      &  output      \\
+\hline
+ 10 000       &  10000       \\
+\end{tabular}
+\caption{Concatenating numeric tokens.}
+\label{tab:accrete}
+\end{center}
+\end{table}
+
+
+\section{Future Work}
+\begin{itemize}
+\item Improve the token lattice with Heart of Gold.
+\item Pluralize currency names.
+\item Provide a user-friendly GUI.
+\item Support other operating systems.
+\end{itemize}
+
+\bibliographystyle{abbrv}
+\bibliography{main}
+
+\end{document}
     deleted if `keep_delim` is not set.
     """
     token_to_transform = tokens[transform_token_index]
+    logging.debug(u'split_token_at_delim called on {} for delim {}'.format(
+            token_to_transform, delim))
     split_tokens = []
     # accumulate non-delimiter characters in split_str
     split_str = u''
         return changed
 
 
-class RegexCleanupRule(Rule):
-    """Generate transforms for tokens matching any of a variety of
-    regular expressions used to clean up common errors.
-
-    This rule works best when run before the original text is split.
-    """
-
-    phase = INITIAL_PHASE
-    regex_pairs = [
-        # Character standardization
-        TokenSearchByRegexp(u'“|”', u'"'),
-        TokenSearchByRegexp(u"’", u"'"),
-
-        # misspellings
-        TokenSearchByRegexp(u'dependants', u'dependents'),
-        TokenSearchByRegexp(ur'therefor', u'therefore'),
-        TokenSearchByRegexp(ur'(?i)(micro) finance', u'\1finance'),
-
-        # proper nouns
-        TokenSearchByRegexp(u'congo town', u'Congo Town'),
-        TokenSearchByRegexp(u'lake victoria', u'Lake Victoria'),
-        TokenSearchByRegexp(u'Pre-angkorean', u'pre-Angkorean'),
-        TokenSearchByRegexp(u'KIVA', u'Kiva'),
-        TokenSearchByRegexp(u'KADET LTD', u'KADET Ltd.'),
-
-        # awkward or verbose constructions
-        TokenSearchByRegexp(u'in the year ([0-9]+)', ur'in \1'),
-        TokenSearchByRegexp(u'three/four acre(s?)', ur'three quarters of an acre'),
-        TokenSearchByRegexp(u'requested for', u'requested'),
-        TokenSearchByRegexp(u'has given birth to', 'has'),
-        TokenSearchByRegexp(u'requesting to borrow', u'asking to borrow'),
-        TokenSearchByRegexp(u'adult-aged', u'adult'),
-        TokenSearchByRegexp(ur'and etc\.*', u'etc.'),
-        TokenSearchByRegexp(u'infant-aged', u'infant'),
-        TokenSearchByRegexp(u'requesting for a', u'requesting a'),
-        TokenSearchByRegexp(u'requested a loan for ([0-9]+)',
-                            ur'requested a loan of \1'),
-        TokenSearchByRegexp(ur'he is widowed', u'he is a widower'),
-        TokenSearchByRegexp(u'borrowed a loan', u'took out a loan'),
-        TokenSearchByRegexp(u'in a business of', u'in the business of'),
-        TokenSearchByRegexp(
-            u'with (.+) children and (.+) of them go to school',
-            ur'and has \1 children, \2 of whom go to school'),
-        TokenSearchByRegexp(u'to invest in expanding the business',
-                            u'to expand the business'),
-        TokenSearchByRegexp(u'fisherfolks', u'fishermen'),
-        TokenSearchByRegexp(u'aspired for', u'wanted'),
-        TokenSearchByRegexp(u"uplifting the family's standard",
-                            u"raising the family's standard"),
-        TokenSearchByRegexp(u'could continue to save up',
-                            u'can continue to save'),
-        TokenSearchByRegexp(u'from the Word of God she received',
-                            u'from the Word of God she studies'),
-        TokenSearchByRegexp(u'raise & sell in future', u'raise and sell'),
-        TokenSearchByRegexp(u'married with ([0-9]+) (child|children)',
-         ur'married and has \1 \2'),
-        TokenSearchByRegexp(u'has a long experience',
-                            u'has a lot of experience'),
-        TokenSearchByRegexp(u'is aiming to gain more profits',
-                            u'aims to make more money'),
-        TokenSearchByRegexp(u'has a good experience in this field and a good '
-         'reputation and (s?he) is being well known in (his|her) area',
-         ur' has a lot of experience in this field, a good reputation, '
-         ur'and is well known in \2 area'),
-
-        # Chiefly British
-        TokenSearchByRegexp(u'([iI]n) future', ur'\1 the future'),
-        TokenSearchByRegexp(u'tyres', u'tires'),
-        TokenSearchByRegexp(u'neighbour', u'neighbor'),
-        TokenSearchByRegexp(u'licencing', u'licensing'),
-
-        # non-ISO currency abbreviations
-        TokenSearchByRegexp(ur'(.+)/=', ur'\1 UGX'),
-        TokenSearchByRegexp(ur'(?i)ksh(?:s|)(?:\.|)([0-9,.]+|)', ur'KES \1'),
-        TokenSearchByRegexp(ur'[Pp]hp', ur'PHP'),
-        TokenSearchByRegexp(ur'P([0-9,.]+)', ur'\1 PHP'),
-        TokenSearchByRegexp(ur'(?i)LE ([0-9]*)', ur'SLL \1'),
-        TokenSearchByRegexp(ur'Rp\.', ur'IDR'),
-
-        # incorrect punctuation
-        TokenSearchByRegexp(ur'e\.t\.c\.?', ur'etc.'),
-        TokenSearchByRegexp(ur'\betc([^.])', ur'etc.\1'),
-        TokenSearchByRegexp(ur'(?<!\.)\.\.(?!\.)', ur'.'),  # blah.. -> blah.
-
-        # grammatical errors
-        TokenSearchByRegexp(ur'1 infant-aged children', ur'one infant child'),
-        TokenSearchByRegexp(ur'1 years', ur'one year'),
-        TokenSearchByRegexp(ur'never missed any meeting\.',
-                            ur'never missed any meetings.'),
-
-        # Field partner template cleanup
-        # TokenSearchByRegexp(ur'To make a living,'
-        #    '(?P<name>(\w|\s)+) owns & operates a business'
-        #    'venture in the [a-z]+ sector \w+ (?P<business>[^.]+\.)',
-        #  ur'\g<name> has a \g<business>'),
-        TokenSearchByRegexp(
-            u'[Ww]hile not the only means for generating revenue, the',
-            u'The'),
-        TokenSearchByRegexp(
-            u'main source of income for the business comes primarily from',
-            u'main source of income for the business comes from'),
-        TokenSearchByRegexp(u'a month for these activities',
-                            u'a month from it'),
-        TokenSearchByRegexp(u'comes from buying and selling of',
-                            u'comes from selling'),
-        TokenSearchByRegexp(u'engage in business activities', u'do business'),
-        # TODO: really want "improve / expand" NP "business"
-        # TokenSearchByRegexp(
-        #     u"improve / expand the borrower's business",
-        #     u'improve and expand it'),
-        TokenSearchByRegexp(ur'fellowship\* meeting', u'fellowship meeting*'),
-        TokenSearchByRegexp(u'clicking the link to the NWTF Kiva lending team',
-         ur'clicking the link to the '
-         '<a href="http://www.kiva.org/team/nwtf_philippines">'
-         'NWTF Kiva lending team</a>'),
-        TokenSearchByRegexp(
-            u'Kiva\'s Muslim World Lending helptext: '
-            u'http://tinyurl.com/3aekx8m',
-            u'Kiva\'s article on <a href="http://na3.salesforce.com/_ui/'
-            'selfservice/pkb/'
-            'PublicKnowledgeSolution/d?orgId=00D500000006svl&lang=1&id=501'
-            '50000000SN1N&retURL=/sol/public/solutionbrowser.jsp%3Fsearch%3D'
-            'muslim%2Bworld%26cid%3D02n50000000DUOS%26orgId%3D00D500000006svl'
-            '%26lang%3D1%26t%3D4&ps=1&pPv=1">Lending in the Muslim World</a>'),
-
-        # Jargon
-        TokenSearchByRegexp(u'cycle loan', u'loan'),
-        TokenSearchByRegexp(u'loan cycle', u'loan'),
-        TokenSearchByRegexp(u'loan facility', u'loan'),
-
-        # Numeric expressions
-        TokenSearchByRegexp(ur'1st', u'first'),
-        TokenSearchByRegexp(ur'2nd', u'second'),
-        TokenSearchByRegexp(ur'3rd', u'third'),
-        TokenSearchByRegexp(ur'4th', u'fourth'),
-        TokenSearchByRegexp(ur'5th', u'fifth'),
-        TokenSearchByRegexp(ur'6th', u'sixth'),
-        TokenSearchByRegexp(ur'7th', u'seventh'),
-        TokenSearchByRegexp(ur'8th', u'eighth'),
-        TokenSearchByRegexp(ur'9th', u'ninth'),
-
-        ]
-
-    def __init__(self):
-        Rule.__init__(self, 10, "Search and replace specific strings")
-
-    def apply(self, ea):
-        changed = False
-        for ts in RegexCleanupRule.regex_pairs:
-            if ts.apply(self, ea.tokens):
-                changed = True
-        return changed
-
-
 class ParagraphRule(Rule):
     """Divide a single token containing embedded newlines into
     multiple tokens.
         # |   12 | 1,200.    | 1,200 .    |
         # |   13 | 1,500.00  | 1,500.00   |
         # |   14 | 47-year   | 47-year    |
+        # |   15 | 1200.     | 1200 .     |
         changed = False
         for token in ea.tokens:
             # skip non-printing, URL, and short tokens
                 # case 12, note $ is for case 13
                 mo = re.match(ur'[1-9][0-9]{,2}(?:,[0-9]{3})*\.(?:[^0-9]|$)',
                               token.str)
+                # case 15
+                if not mo:
+                    mo = re.match(ur'[0-9]+\.$', token.str)
                 if mo:
                     # reposition at period
                     mo = re.search(ur'\.', token.str)
         return changed
 
 
+class RegexCleanupRule(Rule):
+    """Generate transforms for tokens matching any of a variety of
+    regular expressions used to clean up common errors.
+    """
+
+    phase = INITIAL_PHASE
+    regex_pairs = [
+        # Character standardization
+        TokenSearchByRegexp(u'“|”', u'"'),
+        TokenSearchByRegexp(u"’", u"'"),
+
+        # misspellings
+        TokenSearchByRegexp(u'dependants', u'dependents'),
+        TokenSearchByRegexp(ur'therefor', u'therefore'),
+        TokenSearchByRegexp(ur'(?i)(micro) finance', ur'\1finance'),
+
+        # proper nouns
+        TokenSearchByRegexp(u'congo town', u'Congo Town'),
+        TokenSearchByRegexp(u'lake victoria', u'Lake Victoria'),
+        TokenSearchByRegexp(u'Pre-angkorean', u'pre-Angkorean'),
+        TokenSearchByRegexp(u'KIVA', u'Kiva'),
+        TokenSearchByRegexp(u'KADET LTD', u'KADET Ltd.'),
+
+        # awkward or verbose constructions
+        TokenSearchByRegexp(u'in the year ([0-9]+)', ur'in \1'),
+        TokenSearchByRegexp(u'three/four acre(s?)', ur'three quarters of an acre'),
+        TokenSearchByRegexp(u'requested for', u'requested'),
+        TokenSearchByRegexp(u'has given birth to', 'has'),
+        TokenSearchByRegexp(u'requesting to borrow', u'asking to borrow'),
+        TokenSearchByRegexp(u'adult-aged', u'adult'),
+        TokenSearchByRegexp(ur'and etc\.*', u'etc.'),
+        TokenSearchByRegexp(u'infant-aged', u'infant'),
+        TokenSearchByRegexp(u'requesting for a', u'requesting a'),
+        TokenSearchByRegexp(u'requested a loan for ([0-9]+)',
+                            ur'requested a loan of \1'),
+        TokenSearchByRegexp(ur'he is widowed', u'he is a widower'),
+        TokenSearchByRegexp(u'borrowed a loan', u'took out a loan'),
+        TokenSearchByRegexp(u'in a business of', u'in the business of'),
+        TokenSearchByRegexp(
+            u'with (.+) children and (.+) of them go to school',
+            ur'and has \1 children, \2 of whom go to school'),
+        TokenSearchByRegexp(u'to invest in expanding the business',
+                            u'to expand the business'),
+        TokenSearchByRegexp(u'fisherfolks', u'fishermen'),
+        TokenSearchByRegexp(u'aspired for', u'wanted'),
+        TokenSearchByRegexp(u"uplifting the family's standard",
+                            u"raising the family's standard"),
+        TokenSearchByRegexp(u'could continue to save up',
+                            u'can continue to save'),
+        TokenSearchByRegexp(u'from the Word of God she received',
+                            u'from the Word of God she studies'),
+        TokenSearchByRegexp(u'raise & sell in future', u'raise and sell'),
+        TokenSearchByRegexp(u'married with ([0-9]+) (child|children)',
+         ur'married and has \1 \2'),
+        TokenSearchByRegexp(u'has a long experience',
+                            u'has a lot of experience'),
+        TokenSearchByRegexp(u'is aiming to gain more profits',
+                            u'aims to make more money'),
+        TokenSearchByRegexp(u'has a good experience in this field and a good '
+         'reputation and (s?he) is being well known in (his|her) area',
+         ur' has a lot of experience in this field, a good reputation, '
+         ur'and is well known in \2 area'),
+
+        # Chiefly British
+        TokenSearchByRegexp(u'([iI]n) future', ur'\1 the future'),
+        TokenSearchByRegexp(u'tyres', u'tires'),
+
+        # non-ISO currency abbreviations
+        TokenSearchByRegexp(ur'(.+)/=', ur'\1 UGX'),
+        TokenSearchByRegexp(ur'(?i)ksh(?:s|)(?:\.|)([0-9,.]+|)', ur'KES \1'),
+        TokenSearchByRegexp(ur'[Pp]hp', ur'PHP'),
+        TokenSearchByRegexp(ur'P([0-9,.]+)', ur'\1 PHP'),
+        TokenSearchByRegexp(ur'(?i)LE ([0-9]*)', ur'SLL \1'),
+        TokenSearchByRegexp(ur'Rp\.', ur'IDR'),
+
+        # incorrect punctuation
+        TokenSearchByRegexp(ur'e\.t\.c\.?', ur'etc.'),
+        TokenSearchByRegexp(ur'\betc([^.])', ur'etc.\1'),
+        TokenSearchByRegexp(ur'(?<!\.)\.\.(?!\.)', ur'.'),  # blah.. -> blah.
+
+        # grammatical errors
+        TokenSearchByRegexp(ur'1 infant-aged children', ur'one infant child'),
+        TokenSearchByRegexp(ur'1 years', ur'one year'),
+        TokenSearchByRegexp(ur'never missed any meeting',
+                            ur'never missed any meetings'),
+
+        # Field partner template cleanup
+        # TokenSearchByRegexp(ur'To make a living,'
+        #    '(?P<name>(\w|\s)+) owns & operates a business'
+        #    'venture in the [a-z]+ sector \w+ (?P<business>[^.]+\.)',
+        #  ur'\g<name> has a \g<business>'),
+        TokenSearchByRegexp(
+            u'[Ww]hile not the only means for generating revenue, the',
+            u'The'),
+        TokenSearchByRegexp(
+            u'main source of income for the business comes primarily from',
+            u'main source of income for the business comes from'),
+        TokenSearchByRegexp(u'a month for these activities',
+                            u'a month from it'),
+        TokenSearchByRegexp(u'comes from buying and selling of',
+                            u'comes from selling'),
+        TokenSearchByRegexp(u'engage in business activities', u'do business'),
+        # TODO: really want "improve / expand" NP "business"
+        # TokenSearchByRegexp(
+        #     u"improve / expand the borrower's business",
+        #     u'improve and expand it'),
+        TokenSearchByRegexp(ur'fellowship\* meeting', u'fellowship meeting*'),
+        TokenSearchByRegexp(u'clicking the link to the NWTF Kiva lending team',
+         ur'clicking the link to the '
+         '<a href="http://www.kiva.org/team/nwtf_philippines">'
+         'NWTF Kiva lending team</a>'),
+        TokenSearchByRegexp(
+            u'Kiva\'s Muslim World Lending helptext: '
+            u'http://tinyurl.com/3aekx8m',
+            u'Kiva\'s article on <a href="http://na3.salesforce.com/_ui/'
+            'selfservice/pkb/'
+            'PublicKnowledgeSolution/d?orgId=00D500000006svl&lang=1&id=501'
+            '50000000SN1N&retURL=/sol/public/solutionbrowser.jsp%3Fsearch%3D'
+            'muslim%2Bworld%26cid%3D02n50000000DUOS%26orgId%3D00D500000006svl'
+            '%26lang%3D1%26t%3D4&ps=1&pPv=1">Lending in the Muslim World</a>'),
+
+        # Jargon
+        TokenSearchByRegexp(u'cycle loan', u'loan'),
+        TokenSearchByRegexp(u'loan cycle', u'loan'),
+        TokenSearchByRegexp(u'loan facility', u'loan'),
+
+        # Numeric expressions
+        TokenSearchByRegexp(ur'1st', u'first'),
+        TokenSearchByRegexp(ur'2nd', u'second'),
+        TokenSearchByRegexp(ur'3rd', u'third'),
+        TokenSearchByRegexp(ur'4th', u'fourth'),
+        TokenSearchByRegexp(ur'5th', u'fifth'),
+        TokenSearchByRegexp(ur'6th', u'sixth'),
+        TokenSearchByRegexp(ur'7th', u'seventh'),
+        TokenSearchByRegexp(ur'8th', u'eighth'),
+        TokenSearchByRegexp(ur'9th', u'ninth'),
+
+        ]
+
+    def __init__(self):
+        Rule.__init__(self, 75, "Search and replace specific strings")
+
+    def apply(self, ea):
+        changed = False
+        for ts in RegexCleanupRule.regex_pairs:
+            if ts.apply(self, ea.tokens):
+                changed = True
+        return changed
+
+
 class SpellDigitsRule(Rule):
     """Spell out numbers 1..9.
 
             # following it so they can be checked.
             prev_token, next_token = get_neighbors(ea.tokens, i)
 
-            # don't spell out percentages, i.e. we want the final text
-            # to have "7%" not "seven %"
-            if next_token and next_token.str == '%':
+            # don't spell out a digit followed by '.', '%', or ')', so
+            # the final text keeps forms like "7%" and "7)" (lists)
+            if next_token and next_token.str in u'.%)':
                 continue
 
             # don't spell out single-digit currency values, i.e. don't
             # also a number.
             left, right = get_neighbors(ea.tokens, i)
             if right and right.has_digits:
+                # TODO: just checking for is_delimited_decimal doesn't
+                # handle case like "5000 PHP 1000 of which are for..."
                 if not left or not left.is_delimited_decimal:
                     logging.debug(
                         u'CurrencyOrderRule swapping {} and {}'.format(
 
 
 class DelimitSentencesRule(Rule):
-    """Insert delimiter tokens between beginning and end of sentences.
+    """Insert delimiter tokens at beginning and end of sentences.
     """
 
     phase = POS_PHASE
     def __init__(self):
         Rule.__init__(
             self, 200,
-            "Surround every sentence with a sentence-delimiter token.")
+            "Surround every sentence with sentence-delimiter tokens.")
 
     def apply(self, ea):
-        """Return a transform that will insert the delimiter tokens.
+        """Insert the delimiter tokens.
 
         This rule is only intended to run once. It will disable itself
         after the first run. If it detects any pre-existing sentence
-        delimiter tokens, it will return an empty list.
+        delimiter tokens, it will do nothing.
         """
-        sentence_delim_token = Token('*EOS*')
-        sentence_delim_token.sentence_delim = True
+        inside_sentence = False
         changed = False
         i = 0
         while i < len(ea.tokens):
+            cur_token = ea.tokens[i]
             # do nothing if this rule has ever been run.
-            if ea.tokens[i].sentence_delim:
-                self.enabled = False
+            if cur_token.bos or cur_token.eos:
                 break
-            if '.' in ea.tokens[i].pos:
-                ea.tokens.insert(i + 1, sentence_delim_token)
-                i += 1  # skip over inserted token
+
+            if not inside_sentence:
+                # insert bos token in front of current token if it is a
+                # printing token.
+                if not (cur_token.non_printing or cur_token.is_para):
+                    ea.tokens.insert(i, Token.bos_token())
+                    i += 1
+                    inside_sentence = True
+                    changed = True
+            else:
+                if '.' in cur_token.pos:
+                    # insert end of sentence marker AFTER sentence-final
+                    # punctuation
+                    ea.tokens.insert(i + 1, Token.eos_token())
+                    i += 1  # skip over inserted token
+                    changed = True
+                    inside_sentence = False
+                elif cur_token.eof or cur_token.is_para:
+                    # insert end of sentence marker BEFORE current token
+                    ea.tokens.insert(i, Token.eos_token())
+                    i += 1
+                    changed = True
+                    inside_sentence = False
             i += 1
+        self.enabled = False
         return changed
 
 
     def _change_nn_dash_years(self, ea, sent, years_idx):
         changed = False
         try:
-            logging.debug('>_change_nn_dash_years')
             years_node = sent.parse.node_from_token(ea.tokens[years_idx])
             if years_node.parent(2).children[1].name == 'nadj_rr':
                 if ea.tokens[years_idx + 1].str == u'old':
-                    logging.debug('changing "nn-year(s) old" to "nn-year-old"')
+                    logging.debug('_change_nn_dash_years changing "nn-year(s) old" to "nn-year-old"')
                     ea.tokens[years_idx].str = ea.tokens[years_idx].str[:-1] + u'-old'
                     del ea.tokens[years_idx + 1]
                     changed = True
                 elif ea.tokens[years_idx].str.endswith(u'-years-old'):
                     # case 10
-                    logging.debug('changing "nn-years-old" to "nn-year-old"')
+                    logging.debug('_change_nn_dash_years changing "nn-years-old" to "nn-year-old"')
                     ea.tokens[years_idx].str = ea.tokens[years_idx].str.replace(u'years', u'year')
                     changed = True
         except Exception as e:
-            logging.debug('caught ' + str(e))
-        logging.debug('<_change_nn_dash_years')
+            logging.debug('_change_nn_dash_years caught ' + str(e))
         return changed
 
     def _change_years_dash_old(self, ea, sent, years_idx):
         changed = False
         try:
-            logging.debug('>_change_years_dash_old')
             years_node = sent.parse.node_from_token(ea.tokens[years_idx])
             if years_node.parent().name == 'noun_n_cmpnd':
-                logging.debug('changing "nn years-old" to "nn-year-old"')
+                logging.debug('_change_years_dash_old changing "nn years-old" to "nn-year-old"')
                 ea.tokens[years_idx - 1].str += u'-year-old'
                 del ea.tokens[years_idx]
                 changed = True
         except Exception as e:
-            logging.debug('caught ' + str(e))
-        logging.debug('<_change_years_dash_old')
+            logging.debug('_change_years_dash_old caught ' + str(e))
         return changed
 
+    def _change_years_old(self, ea, sent, years_idx):
+        changed = False
+        try:
+            years_node = sent.parse.node_from_token(ea.tokens[years_idx])
+            if (years_node.parent().name == 'plur_noun_orule' and
+                years_node.parent(3).children[1].name == 'npadv'):
+                ea.tokens[years_idx - 1].str += u'-year-old'
+                del ea.tokens[years_idx:years_idx + 2]
+                changed = True
+        except Exception as e:
+            logging.debug('_change_years_old caught ' + str(e))
+        return changed
+
+    def _change_nn_year_old(self, ea, sent, years_idx):
+        changed = False
+        try:
+            years_node = sent.parse.node_from_token(ea.tokens[years_idx])
+            if years_node.parent(3).name == 'hspechc':
+                logging.debug('_change_nn_year_old changing "nn year old" to "nn-year-old"')
+                ea.tokens[years_idx].str += u'-year-old'
+                del ea.tokens[years_idx + 1:years_idx + 3]
+                changed = True
+        except Exception as e:
+            logging.debug('_change_nn_year_old caught ' + str(e))
+        return changed
 
     def apply(self, ea):
         # These cases are correct and should remain unchanged.
-        # 1. 'She is 47 years old.'         PASSED
-        # 2. 'She is a 47-year-old farmer.' PASSED
-        # 3. 'She is 1 year old.'           PASSED
+        # 1. 'She is 47 years old.'
+        # 2. 'She is a 47-year-old farmer.'
+        # 3. 'She is 1 year old.'
 
         # These cases should be changed to have a single token of
         # '47-year-old':
         # 4. 'She is a 47 year old farmer.'
-        # 5. 'She is a 47-year old farmer.'    PASSED
-        # 6. 'She is a 47 year-old farmer.'    PASSED
+        # 5. 'She is a 47-year old farmer.'
+        # 6. 'She is a 47 year-old farmer.'
 
         # These should have 'years' changed to 'year' and hyphens inserted.
-        # 7. 'She is a 47 years old farmer.'   PASSED
-        # 8. 'She is a 47-years old farmer.'   PASSED
-        # 9. 'She is a 47 years-old farmer.'   PASSED
-        # 10. 'She is a 47-years-old farmer.'  PASSED
+        # 7. 'She is a 47 years old farmer.'
+        # 8. 'She is a 47-years old farmer.'
+        # 9. 'She is a 47 years-old farmer.'
+        # 10. 'She is a 47-years-old farmer.'
+        # 11. 'She is a 47 year old lady who is a farmer.'
 
         changed = False
         # cases 5 and 6 are handled by regular expression search and replace
                 changed = True
 
         for sent in ea.sentences:
-            # Cases 5 and 8
-            years_idx = sent.find_sequence('[0-9]+-year(s?)')
+            # cases 4 and 11
+            years_idx = sent.find_sequence('[0-9]+ year old')
+            if (years_idx is not None and
+                self._change_nn_year_old(ea, sent, years_idx)):
+                changed = True
+                continue
+
+            # Case 7
+            years_idx = sent.find_sequence(u'years old')
+            if (years_idx is not None and
+                self._change_years_old(ea, sent, years_idx)):
+                changed = True
+                continue
+
+            # Case 8
+            years_idx = sent.find_sequence('[0-9]+-years')
             if (years_idx != None and
                 self._change_nn_dash_years(ea, sent, years_idx)):
                 changed = True
                 self._change_years_dash_old(ea, sent, years_idx)):
                 changed = True
                 continue
-
-            make_singular = False
-
-            years_idx = sent.find_sequence(u'years old')
-
-            if years_idx == None:
-                continue
-            # get the node that has child of token containing 'year'
-            years_node = sent.parse.node_from_token(ea.tokens[years_idx])
-
-
-            if years_node.parent().name != 'plur_noun_orule':
-                logging.debug('*****skipping %s != plur_noun_orule' % years_node.parent().name)
-                continue
-
-            # Case 7
-            try:
-                make_singular = years_node.parent(3).children[1].name == 'npadv'
-                if make_singular:
-                    logging.debug('********NPADV')
-            except:
-                pass
-
-            if not make_singular:
-                try:
-                    make_singular = years_node.parent(4).name == 'npadv_mnp'
-                    if make_singular:
-                        logging.debug('********NPADV_MNP')
-                except:
-                    pass
-            # appos
-            #   bare_np
-            #     plur_noun_orule
-            #       year_n1
-            #         <years 5:10>
-            if not make_singular:
-                try:
-                    make_singular = years_node.parent(3).name == 'appos'
-                    if make_singular:
-                        logging.debug('********APPOS')
-                except:
-                    pass
-
-            if not make_singular:
-                try:
-                    make_singular = (
-                        years_node.parent(3).name == 'bare_np' and
-                        years_node.parent(2).children[1].name == 's_dash_pct')
-                    if make_singular:
-                        logging.debug('********s_dash_pct')
-                except:
-                    pass
-
-            if make_singular:
-                ea.tokens[years_idx - 1].str += u'-year-old'
-                del ea.tokens[years_idx:years_idx + 2]
-                # years_token = years_node.children[0]
-                # original = unicode(years_token)
-                # years_token.str = u'year'
-                # logging.debug(u'YearOldRule make_singular changed {} to {}'.format(
-                #         original, unicode(years_token)))
-                changed = True
         return changed
 from expect import expect
 
 def test_year_old():
+    # All of these are taken from actual Kiva loan descriptions
     expect(u'Mahmoud is a 47-year-old married man from Lebanon.')
-    expect(u'This is 40 year-old Kadiatu.', 'This is 40-year-old Kadiatu.')
-    expect(u'Eman is a 32 years old young woman.', u'Eman is a 32-year-old young woman.')
+    expect(u'This is 40 year-old Kadiatu.',
+           u'This is 40-year-old Kadiatu.')
+    expect(u'Eman is a 32 years old young woman.',
+           u'Eman is a 32-year-old young woman.')
+    expect(u'Yuliana is a 57 year old lady who is originated from Semarang.',
+           u'Yuliana is a 57-year-old lady who is originated from Semarang.')
+    expect(u'Sabah is a 28 years old married female from Madaba, Jordan.',
+           u'Sabah is a 28-year-old married female from Madaba, Jordan.')
 
     # These cases are correct and should remain unchanged.
     expect(u'She is 47 years old.')
     expect(ur'She is a 47-years old farmer.', ur'She is a 47-year-old farmer.')  # ok
     expect(ur'She is a 47 years-old farmer.', ur'She is a 47-year-old farmer.')
     expect(ur'She is a 47-years-old farmer.', ur'She is a 47-year-old farmer.')
-
-    #
-    # Here are the parse trees for all 9 cases:
-    #
-    # 1. She is 47 years old.
-    #
-    # root_informal
-    #   subjh
-    #     bare_npq
-    #       she
-    #         <She 0:3>
-    #     hcomp
-    #       be_c_is
-    #         <is 4:6>
-    #       npadv
-    #         appos
-    #           bare_np
-    #             adjn
-    #               attr_adj_verb_psv_part
-    #                 generic_trans_verb_pas
-    #                   <47 7:9>
-    #               plur_noun_orule
-    #                 year_n1
-    #                   <years 10:15>
-    #           proper_np
-    #             adjn
-    #               old_a1
-    #                 <old 16:19>
-    #               noptcomp
-    #                 generic_year_ne
-    #                   <. 19:20>
-    #
 
 def get_levenshtein_dist(source_tokens, target_strings):
     """Return a minimal list of operations required to transform the
-    source tokens into the target tokens.
+    source tokens into the target strings.
     """
-    # Compute the Levenshtein Distance between source and target token
-    # lists. (See http://en.wikipedia.org/wiki/Levenshtein_distance for
-    # an explanation and http://www.merriampark.com/ld.htm for public-
+    # Compute the Levenshtein Distance between source and target lists.
+    # (See http://en.wikipedia.org/wiki/Levenshtein_distance for an
+    # explanation and http://www.merriampark.com/ld.htm for public-
     # domain implementations.)
     #
     # This results in a set of instructions for transforming the source