Commits

david_walker  committed 9630cd5

starting support for tracking character begin and end offsets of tokens in the original text
added POS tagger
refactored tests.py to extract expect

  • Participants
  • Parent commits 83b5bb7
  • Branches parse

Comments (0)

Files changed (8)
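
The per-token character tracking mentioned in the commit message lands in a new mytoken module, which is not reproduced in this excerpt. Judging from the call sites visible in the diff below (Token(text, 0, len(text)), Token(u'\n', cbegin, cbegin + 1), and the bare Token(u'') used for the end-of-file sentinel), the updated constructor presumably looks something like this minimal sketch:

    # Sketch only; the real class lives in mytoken.py and is inferred from
    # the call sites visible in this diff, not copied from the commit.
    class Token(object):
        def __init__(self, s, cbegin=0, cend=0):
            # `s`      -- the token's text
            # `cbegin` -- offset of the token's first character in the original input
            # `cend`   -- offset one past its last character (cend - cbegin == len(s))
            self.str = s
            self.cbegin = cbegin
            self.cend = cend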

 ^build
 
 # python-refactoring library support files
-\.ropeproject
+\.ropeproject
+
+# log file created by kea.py
+kea.log
 Fundamental classes used by the editor's assistant.
 """
 
-import re
-import pycountry
-import unicodedata
 from abc import ABCMeta, abstractmethod
 from collections import namedtuple
 import logging
         return []
 
 
-class AbbrevInfo(object):
-    """Hold information about an abbreviation."""
-    def __init__(self, regex_str, normal_form=None):
-        """Compile regex and store normal_form.
-
-        `regex` - a regular expression string.
-        `normal_form` - normal way the abbreviation appears.
-
-        Usually the regex exists just to have a case-insensitive way to
-        match, but some are more complicated than that.
-
-        The normal form has the standard capitalization of the
-        abbreviation and is substituted for the input text if it is
-        specified.
-        """
-        self.regex = re.compile(regex_str, re.I | re.U)
-        self.normal_form = normal_form
-
-
-#
-# The Token class.
-#
-# A Token object represents some some bit of text and has additional
-# properties describing that text.
-#
-# Initially there is only one token object which contains the entire
-# blob of text that appeared in the original input.
-#
-# After some Transforms have been applied, that original token will
-# have been replaced by a number of Tokens; eventually after
-# processing is complete each Token will represent a small element
-# like an individual word or punctuation mark.
-#
-
-
-class Token(object):
-    """Contains a portion of text, either part of the original input
-    or generated later, as well as properties describing it.
-
-    Token objects should only be modified by Transform objects. They
-    should not modify themselves.
-
-    This keeps all the token-modifying code in one place conceptually,
-    as well as providing a mechanism to resolve conflicts between
-    multiple bits of code that might both want to touch the same Token
-    object.
-    """
-    abbreviations = [
-        AbbrevInfo(ur'e\.g\.'),
-        AbbrevInfo(ur'i\.e\.'),
-        AbbrevInfo(ur'etc\.'),
-        AbbrevInfo(ur'mr\.', u'Mr.'),
-        AbbrevInfo(ur'mrs\.', u'Mrs.'),
-        AbbrevInfo(ur'ksh\.', u'KES'),
-        AbbrevInfo(ur'kes\.', u'KES'),
-        AbbrevInfo(ur'ltd\.', u'Ltd.'),
-        AbbrevInfo(ur's\.a\.l(\.)?', u's.a.l.'),
-        AbbrevInfo(ur'u\.s\.s\.r\.', u'U.S.S.R.')]
-
-    _currency_terms = [
-        u'$',
-        u'dollar',
-        u'dollars',
-        u'/=',
-        u'peso',
-        u'pesos',
-        u'shilling',
-        u'shillings']
-
-    ordinal_res = [
-        re.compile(ur'^([0-9,]*[02-9]){0,1}1st$', re.I | re.U),
-        re.compile(ur'^([0-9,]*[02-9]){0,1}2nd$', re.I | re.U),
-        re.compile(ur'^([0-9,]*[02-9]){0,1}3rd$', re.I | re.U),
-        re.compile(ur'^[04-9]th$', re.I | re.U),
-        re.compile(ur'^[0-9,]*1[0-9]th$',  re.I | re.U),
-    ]
-
-    has_digits_re = re.compile(ur'.*\d+.*', re.U)
-
-    is_alpha_re = re.compile(ur'^\w+$', re.I | re.U)
-
-    # recognizes a decimal number with comma-delimited thousands groups
-    delimited_decimal_re = re.compile(
-        ur"""^            # start of string
-             [1-9]        # nonzero leading digit
-             [0-9]{,2}    # up to two more leading digits
-             (,[0-9]{3})* # any number of comma-delimited groups of 3 digits
-             (\.[0-9]+)?  # optional decimal followed by one or more digits
-             $            # end of string
-          """,
-        re.U | re.X)
-
-    # recognizes an integer with comma-delimited thousands groups
-    delimited_integer_re = re.compile(
-        ur"""^            # start of string
-             [0-9]{1,3}   # one to three leading digits
-             (,[0-9]{3})* # any number of comma-delimited groups of 3 digits
-             $            # end of string
-          """,
-        re.U | re.X)
-
-    url_re = re.compile(
-        """(\w+\.)+     # one or more dot-delimited words
-           # one of the TLDs that appear in Kiva loans
-           (com|edu|gov|info|mil|net|org|tj)
-           (\S*)        # any amount of non-space chars """,
-        re.I | re.U | re.VERBOSE)
-
-    def __init__(self, s):
-        """Initialize from text. """
-        # Note we use the setter here which initializes the cache.
-        self.str = s
-
-    def __repr__(self):
-        """Return a string representation of this object suitable for
-        debugging output.
-        """
-        r = u'<'
-        for key, val in self.__dict__.items():
-            if val:
-                if len(r) > 1:
-                    r += u' '
-                if key == '_str':
-                    r += u'"{}"'.format(val)
-                else:
-                    r += u'{}: {}'.format(key, val)
-        r += u'>'
-        # Python 2.x requires that __repr__ return an ascii string.
-        # Python 3.x requires that it return a unicode string.
-        return r.encode(encoding='iso-8859-15', errors='replace')
-
-    def _reset_cache(self):
-        self._abbrev_checked = False
-        self._abbrev_match = None
-        self._abbrev_match_len = 0
-        self.sentence_delim = None
-        self.eof = None
-
-        self._URL_checked = False
-        self._is_URL = None
-
-    @property
-    def str(self):
-        return self._str
-
-    @str.setter
-    def str(self, new_value):
-        self._str = unicode(new_value)
-        self._reset_cache()
-
-    @property
-    def abbrev_match_len(self):
-        if not self._abbrev_checked:
-            self._abbrev_checked = True
-            for abbrev in Token.abbreviations:
-                match_obj = abbrev.regex.match(self._str)
-                if match_obj:
-                    self._abbrev_match = abbrev
-                    self._abbrev_match_len = len(match_obj.group())
-                    break
-        return self._abbrev_match_len, self._abbrev_match
-
-    @property
-    def has_digits(self):
-        """Return True if `str` has digits in it."""
-        return Token.has_digits_re.search(self._str) != None
-
-    @property
-    def is_abbrev(self):
-        """Return True if token matches (not just starts with) an
-        abbreviation."""
-        match_len, abbrev = self.abbrev_match_len
-        return abbrev and match_len == len(self._str)
-
-    @property
-    def is_alpha(self):
-        """Return True if token contains only letters."""
-        return is_alpha_re.match(self._str)
-
-    @property
-    def is_alphanumeric_ordinal(self):
-        """Return True if token is of the form 1st, 2nd, 3rd, 4th, etc."""
-        for regex in Token.ordinal_res:
-            if regex.match(self._str):
-                return True
-        return False
-
-    @property
-    def is_close(self):
-        """Return True if this token is any type of closing paren.
-        """
-        return len(self._str) == 1 and self._str in u')]}'
-
-    @property
-    def is_currency_symbol(self):
-        return len(self._str) == 1 and self._str == u'$'
-
-    @property
-    def is_currency_term(self):
-        if self._str.lower() in Token._currency_terms:
-            return True
-        return self.is_ISO_currency
-
-    @property
-    def is_eof(self):
-        return self.eof == True
-
-    @property
-    def is_delimited_decimal(self):
-        return Token.delimited_decimal_re.match(self._str) != None
-
-    @property
-    def is_delimited_integer(self):
-        return Token.delimited_integer_re.match(self._str) != None
-
-    @property
-    def is_ISO_currency(self):
-        try:
-            pycountry.currencies.get(letter=self._str.upper())
-            result = True
-        except:
-            result = False
-        return result
-
-    @property
-    def is_nonspacing_punc(self):
-        """Return True if this token is a punctuation character.
-        """
-        return len(self._str) == 1 and self._str in u',.!?;%:'
-
-    @property
-    def is_open(self):
-        """Return True if this token is any type of opening paren.
-        """
-        return len(self._str) == 1 and self._str in u'([{'
-
-    @property
-    def is_para(self):
-        return self._str == '\n'
-
-    @property
-    def is_punc(self):
-        """Return True if this token is a punctuation character.
-        """
-        return len(self._str) == 1 and unicodedata.category(
-            self._str).startswith(u'P')
-
-    @property
-    def is_quote(self):
-        """Return true if this token is any type of single or double quote.
-        """
-        return len(self._str) == 1 and self._str in u'\'`"'
-
-    @property
-    def is_URL(self):
-        """Check if token contains a URL, marking it if necessary.
-
-        Only a subset of possible URL forms likely to appear in a Kiva
-        description are recognized, since it is more likely that a token
-        that happens to conform to an exotic URL form is, in fact, a typo.
-        """
-        if not self._URL_checked:
-            self._URL_checked = True
-
-            # look for a scheme identifier; Kiva loans only will have an
-            # http or maybe https prefix, but won't use any of the others.
-            if self._str.lower().startswith('http'):
-                self._is_URL = True
-            elif Token.url_re.match(self._str):
-                self._is_URL = True
-        return self._is_URL
-
-    @property
-    def non_printing(self):
-        """Return True if any of the attributes are set which indicate a
-        non-printing token.
-        """
-        return self.sentence_delim or self.eof
-
-
 class Transform():
     """An abstract base class; derived classes should override `apply()`.
 
 import StringIO
 import codecs
 import logging
-import tempfile
 import sys
 
 from clipboard import get_clipboard_text, set_clipboard_text
-from base import Token
+from mytoken import Token
 import rules
-
+import tagger
+import parser
 
 class EditAssistant(object):
     def __init__(self, infile):
         """Process the input file and generate an output string."""
+        # create a sentinel end-of-file token
         eof_token = Token(u'')
         eof_token.eof = True
-        self._tokens = [Token(infile.read()), eof_token]
-        self._process_tokens(infile)
+        # start the tokens array with one Token object that contains all
+        # the text, followed by the sentinel.
+        text = infile.read()
+        self._tokens = [Token(text, 0, len(text)), eof_token]
+        # apply first phase rules/transforms to replace the original
+        # Token object with multiple Token objects, one for each bit of
+        # the input text that qualifies as a single input token.
+        self._process_tokens(rules.INITIAL_PHASE)
+        # Add a part-of-speech property to all the tokens
+        tagger.tag_tokens(self._tokens)
+        # now apply rules/transforms that make use of the POS properties
+        self._process_tokens(rules.POS_PHASE)
+        #self._parser = parser.Parser()
+        #self._parser.parse(self._tokens)
         self._generate_output()
 
     def _asterisk_at_bol(self, token):
             if append_space:
                 self.edited_text += u' '
 
-    def _process_tokens(self, infile):
-        all_rules = rules.get_rules()
-
+    def _process_tokens(self, phase):
+        all_rules = rules.get_rules(phase)
         #
         # The main loop.
         #
 
             self._tokens = winner.apply()
 
+    def dump_pos_tags(self):
+        """Write every token with a Part-Of-Speech tag to stdout."""
+        for token in self._tokens:
+            if hasattr(token, 'pos'):
+                print u'{}/{}'.format(token.str, token.pos),
+            if token.str == '\n':
+                print
+
 
 def parse_commandline():
     """Return an argparse parse of the command line
         "If this option is specified, then --infile is ignored.")
 
     parser.add_argument(
+        '-d', '--debug', action='store_true',
+        help="Print the raw argument list and exit.")
+
+    parser.add_argument(
         '-i', '--infile', default=sys.stdin,
         help="The UTF-8 encoded file to read, (defaults to stdin).")
 
         help="The UTF-8 encoded file to write (defaults to stdout).")
 
     parser.add_argument(
+        '-s', '--show_pos', action='store_true',
+        help="Print the tagged tokens and exit.")
+
+    parser.add_argument(
         '-t', '--test', nargs='*',
         help="Process the TEST strings instead of an input file. If this "
         "option is specified, then --infile and --clipboard are ignored.")
 
-    parser.add_argument(
-        '-d', '--debug', action='store_true',
-        help="Print the raw argument list and exit.")
-
     # Parse the command line and return it
     return parser.parse_args()
 
     """
 
     # Initialize logging to go to a file
-    handler = logging.FileHandler("/home/david/Dropbox/Projects/Kiva/logfile.txt", "w",
-                                  encoding="UTF-8")
+    handler = logging.FileHandler("kea.log", "w", encoding="UTF-8")
     formatter = logging.Formatter("%(message)s")
     handler.setFormatter(formatter)
     root_logger = logging.getLogger()
 
     args = parse_commandline()
 
-    # get an output file handle
+    # get an output file handle to either a user-supplied file name or
+    # stdout.
     if isinstance(args.outfile, basestring):
         # The user supplied a filename. Open it.
         outfile = codecs.open(args.outfile, 'w', 'utf-8')
         # as utf-8 encoded.
         outfile = codecs.getwriter('utf-8')(sys.stdout)
 
+    # Use this to hold an instance of an EditAssistant object, which can
+    # be created in a variety of ways (or not at all) depending on
+    # command-line arguments.
+    edit_assistant = None
+
     if args.debug:
         print args
     elif args.test:
         # the test option supersedes other input modes
         if not isinstance(args.test, basestring):
             args.test = u' '.join(args.test)
-        outfile.write(EditAssistant(StringIO.StringIO(
-                    unicode(args.test))).edited_text)
+        edit_assistant = EditAssistant(StringIO.StringIO(unicode(args.test)))
+        outfile.write(edit_assistant.edited_text)
         outfile.write('\n')
     elif args.clipboard:
         edit_assistant = EditAssistant(StringIO.StringIO(get_clipboard_text()))
             infile = codecs.open(args.infile, 'r', 'utf-8')
         else:
             infile = codecs.getreader('utf-8')(sys.stdin)
-        outfile.write(EditAssistant(infile).edited_text)
+        edit_assistant = EditAssistant(infile)
+        outfile.write(edit_assistant.edited_text)
+
+    if args.show_pos and edit_assistant:
+        edit_assistant.dump_pos_tags()
     sys.exit(0)
 
 if __name__ == '__main__':
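
A rough usage sketch tying the new phases together (assumptions: the module is still importable as kea2, as the old tests below do, and the TNT tagger is installed where tagger.py expects it; the sample string and the tags shown are illustrative only):

    # Sketch, not part of the commit: run a string through both rule phases
    # and the POS tagger, then dump the tagged tokens.
    import StringIO
    import kea2

    assistant = kea2.EditAssistant(StringIO.StringIO(u'She is 45 years old.'))
    print assistant.edited_text   # the corrected text
    assistant.dump_pos_tags()     # e.g. She/PRP is/VBZ 45/CD years/NNS old/JJ ./.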
 from base import AttrChange, Rule
 import transforms as tr
 
+INITIAL_PHASE = 0
+POS_PHASE = 1
 
-def get_rules():
+def get_rules(desired_phase):
     """Return a list containing instances of all the rules in this
-    module."""
+    module with a matching phase."""
     classes = []
     this_module = sys.modules[__name__]
     for name, obj in inspect.getmembers(this_module):
         if (inspect.isclass(obj) and
             obj.__module__ == this_module.__name__ and
-            'Rule' in str(obj)):
+            'Rule' in str(obj) and
+            obj.phase == desired_phase):
             classes.append(obj())
     return classes
 
     This rule works best when run before the original text is split.
     """
 
+    phase = INITIAL_PHASE
     regex_pairs = [
         # Character standardization
         (u'“|”', u'"'),
         # incorrect punctuation
         (ur'e\.t\.c\.?', u'etc.'),
         (ur'\betc([^.])', ur'etc.\1'),
-        (ur'([0-9]+) year(?:s?) old (man|woman|single|married|widow|widowed)',
-         ur'\1-year-old \2'),
         (ur'(?<!\.)\.\.(?!\.)', u'.'),  # blah.. -> blah.
 
         # grammatical errors
     multiple tokens.
     """
 
+    phase = INITIAL_PHASE
     LINEBREAK_RE = re.compile(r'(\s*)(\n+)(\s*)')
 
     def __init__(self):
 
 
 class WhitespaceSplitRule(Rule):
+    phase = INITIAL_PHASE
+
     def __init__(self):
         Rule.__init__(self, 30, 1.0, "Separate text by whitespace.")
 
 
         if transform_tokens:
             transforms = [
-                tr.RegexSplitTransform(self,
-                                       transform_tokens,
-                                       split_re=ur'\s+')]
-
-        # if transforms:
-        #     logging.debug('WhitespaceSplitRule returning %d',
-            # len(transforms))
+                tr.CharSplitTransform(self,
+                                      transform_tokens,
+                                      delimiter_char=u' ',
+                                      keep_delimiter=False)]
         return transforms
 
 
 
     """
 
+    phase = INITIAL_PHASE
+
     def __init__(self):
         Rule.__init__(self, 40, 1.0,
                       "Separate periods from words that aren't abbreviations.")
                 transform_tokens.append(token)
 
         if transform_tokens:
-            transforms += [tr.RegexSplitTransform(self,
-                                             transform_tokens,
-                                             split_re=ur'(\.)')]
+            transforms += [tr.CharSplitTransform(self,
+                                                 transform_tokens,
+                                                 delimiter_char=u'.',
+                                                 keep_delimiter=True)]
         # if transforms:
         #     logging.debug("DotSplitRule returning %d", len(transforms))
         return transforms
 
     For example, 1.234.567 becomes 1,234,567
     """
+
+    phase = INITIAL_PHASE
+
     euro_decimal_number_re = re.compile(
         """# up to three digits, but no leading zeros
            [1-9]\d{0,2}
     apostrophes in contractions.
     """
 
+    phase = INITIAL_PHASE
+
     # this is the same as Token.delimited_decimal_re except that
     # it is not bookended by ^ and $
     embedded_decimal_number_re = re.compile(
     """Split alphanumeric sequences.
     """
 
+    phase = INITIAL_PHASE
+
     def __init__(self):
         Rule.__init__(self, 70, 1.0,
                       "Split conjoined words and "
     - are part of a  numbered list "1. foo 2. bar 3. baz
     """
 
+    phase = INITIAL_PHASE
+
     spelled_digits = [
         u'one',
         u'two',
 
     def __init__(self):
         """Set rule priority and name. """
-        Rule.__init__(self, 80, 1.0, "Spell out single digit  numbers.")
+        Rule.__init__(self, 80, 1.0, "Spell out single digit numbers.")
 
     def get_transforms(self, tokens):
         """Return an array of transform objects."""
 class DelimitThousandsRule(Rule):
     """Insert comma separators in currency values larger than 4 digits.
     """
+    phase = INITIAL_PHASE
+
     _splittable_number_re = re.compile(ur'^[1-9][0-9]{4,}(\.[0-9]{2})?$', re.U)
 
     def __init__(self):
     """Merge consecutive tokens that together make a formatted decimal number.
     """
 
+    phase = INITIAL_PHASE
+
     mergeable_number_re = re.compile(
         ur"""(^
                [0-9]{3}       # three leading digits, may start with 0
         re.UNICODE | re.VERBOSE)
 
     def __init__(self):
-        Rule.__init__(self, 100, 1.0, "Remove spaces from numbers.")
+        Rule.__init__(self, 100, 1.0,
+                      "Remove spaces from numbers.")
 
     def get_transforms(self, tokens):
         self.tokens = tokens
 class CurrencyOrderRule(Rule):
     """Require that ISO currency abbreviations come *after* the
     associated numbers."""
+
+    phase = INITIAL_PHASE
+
     def __init__(self):
         Rule.__init__(
             self, 110, 1.0,
     """Spell out ISO currency abbreviations.
     """
 
+    phase = INITIAL_PHASE
+
     def __init__(self):
         Rule.__init__(
             self, 120, 1.0,
     """Insert delimiter tokens between beginning and end of sentences.
     """
 
+    phase = INITIAL_PHASE
+
     def __init__(self):
         Rule.__init__(
             self, 130, 1.0,
                 self.enabled = False
                 return []
         return []
+
+
+class YearOldRule(Rule):
+    """Change 'xx years old' and 'xx year old' to 'xx-year-old' as
+    appropriate.
+    """
+
+    phase = POS_PHASE
+
+    def __init__(self):
+        Rule.__init__(
+            self, 200, 1.0,
+            "Fix incorrect plural and lack of hyphens in phrases "
+            "like '20 years old man'.")
+
+    def get_transforms(self, tokens):
+        """Return a transform that will fix age phrases.
+        """
+        self.tokens = tokens
+
+        return []
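
YearOldRule.get_transforms is still a stub in this commit. Purely as a sketch, the POS-phase matching might eventually take a shape like the following, which follows the CD "years" "old" JJ|NN note added to the samples file below; FixAgePhraseTransform is a hypothetical transform that does not exist in this commit:

    # Sketch only: one possible body for the stubbed get_transforms above.
    def get_transforms(self, tokens):
        self.tokens = tokens
        transforms = []
        for i in range(len(tokens) - 3):
            window = tokens[i:i + 4]
            # match e.g. "45 years old man": a cardinal number, "year(s)",
            # "old", then an adjective or noun
            if (getattr(window[0], 'pos', None) == 'CD' and
                    window[1].str.lower() in (u'year', u'years') and
                    window[2].str.lower() == u'old' and
                    getattr(window[3], 'pos', '') in ('JJ', 'NN')):
                transforms.append(tr.FixAgePhraseTransform(self, window))
        return transforms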
-Thoi Thi Tran is 37 years old. She is living with her family in Ly nhan village. Being a farmer, she growing rice, and raising  pigs for meat Her business lasted for long time but it is not successful.To earn more money,she make wine to sell in retail in 2008. Everyday, she can have 70,000 Vietnamese dong profit. Ms Thoi  was a clients of our credit program in 2006.During that time, she is always a good re payer and use loans effectively. Currently, she wants to borrow 6,322,000 Vietnamese dong to buy  material to expand her business. 
-
-
-The woman person shown in the picture is Mrs. Nuon , 52 years old , who lives in Siem Reap Province . She owns a small store in local market which is full of people , and she uses this store to sell porridge in the morning and afternoon . She generates a profit approximately of US $ two a day to help support her family’s daily expenses with more stability . Her husband is the chief provider for the family , and he works as a motor taxi driver . This couple has six children , two of whom work in hotel as internal staffs . Mrs. Nuon will be using part of this requested money to purchase more necessary ingredients for making porridge to sell ; the rest of the loan will be used to a new motor - trailer for her husband to go on his business . 
-
-
-US$ 2 a day
-
-Cayetana is a hardworking entrepreneur who has a fishing business in the Philippines. 
-
-She is borrowing PHP 20000 through NWTF to buy material for fishing like cooler, fishing net, etc. for her fishing business.
-
-Cayetana has been in this business for 4 years. Cayetana earns more income from raising pigs.
-
-Cayetana has been sustaining her business activities through her own efforts with the help of the loans from NWTF. She dreams to build and expand her business to secure the future of his family.
-
-
-Lending helptext http://tinyurl.com/3aekx8m
-
----
-
-Isabella is 60 years old, married to Michael. She has been keeping poultry for ten years with a monthly income of KES 12,000.
-
-five peso's-worth
-
-wood Le500, 000. He
-
+Thoi Thi Tran is 37 years old. She is living with her family in Ly nhan village. Being a farmer, she growing rice, and raising  pigs for meat Her business lasted for long time but it is not successful.To earn more money,she make wine to sell in retail in 2008. Everyday, she can have 70,000 Vietnamese dong profit. Ms Thoi  was a clients of our credit program in 2006.During that time, she is always a good re payer and use loans effectively. Currently, she wants to borrow 6,322,000 Vietnamese dong to buy  material to expand her business. 
+
+
+The woman person shown in the picture is Mrs. Nuon , 52 years old , who lives in Siem Reap Province . She owns a small store in local market which is full of people , and she uses this store to sell porridge in the morning and afternoon . She generates a profit approximately of US $ two a day to help support her family’s daily expenses with more stability . Her husband is the chief provider for the family , and he works as a motor taxi driver . This couple has six children , two of whom work in hotel as internal staffs . Mrs. Nuon will be using part of this requested money to purchase more necessary ingredients for making porridge to sell ; the rest of the loan will be used to a new motor - trailer for her husband to go on his business . 
+
+
+US$ 2 a day
+
+Cayetana is a hardworking entrepreneur who has a fishing business in the Philippines. 
+
+She is borrowing PHP 20000 through NWTF to buy material for fishing like cooler, fishing net, etc. for her fishing business.
+
+Cayetana has been in this business for 4 years. Cayetana earns more income from raising pigs.
+
+Cayetana has been sustaining her business activities through her own efforts with the help of the loans from NWTF. She dreams to build and expand her business to secure the future of his family.
+
+
+Lending helptext http://tinyurl.com/3aekx8m
+
+---
+
+Isabella is 60 years old, married to Michael. She has been keeping poultry for ten years with a monthly income of KES 12,000.
+
+five peso's-worth
+
+wood Le500, 000. He
+
+--- Unicode samples
+
+She is 45 years old and lives in Sto. Niño, Mahayag.
+
+--- "n years old" samples
+
+correct usage:
+
+Annie is 64 years old and lives in barnersville.
+
+She is married and has five children between 22 to 40 years old.
+
+incorrect usage (lacking hyphens):
+
+Born in a village called Boya Romende, the 51 year old is married and has seven children, five of whom are attending primary, secondary and tertiary institutions.
+
+incorrect usage ("years" should be singular):
+
+A 51 years old farmer.
+
+CD "years" "old" JJ|NN
+#!/usr/bin/env python
+"""
+Function that, given a list of tokens, adds part-of-speech tagging
+information to each token as a `pos` attribute.
+"""
+import subprocess
+import tempfile
+import codecs
+
+TRIGRAM_PATH = '/home/david/delphin/components/tnt/models/wsj.tnt'
+
+
+def tag_tokens(tokens):
+    """Pass all tokens to an external POS tagger, then add its tags as
+    properties on the Token objects.
+    """
+    with tempfile.NamedTemporaryFile() as token_file:
+        # Create a temporary file for TNT (Trigrams'n'Tags) to process (it
+        # doesn't accept input from stdin).  Each token to be tagged must
+        # appear on a single line.
+        token_file_writer = codecs.getwriter('utf-8')(token_file)
+        for token in tokens:
+            if token.non_printing or token.is_para:
+                continue
+            token_file_writer.write(token.str)
+            token_file_writer.write('\n')
+        token_file_writer.flush()
+
+        # Execute TNT; capture stderr so it doesn't pollute the console
+        process = subprocess.Popen(['tnt', TRIGRAM_PATH, token_file.name],
+                                   stdin=subprocess.PIPE,
+                                   stdout=subprocess.PIPE,
+                                   stderr=subprocess.PIPE)
+
+        # add part of speech tag to tokens, being careful to align the
+        # pos assignments with the printable tokens we sent
+        i = 0
+        for line in process.communicate()[0].split('\n'):
+            if i == len(tokens):
+                break
+            # ignore empty and comment lines
+            if not line.strip() or line.startswith('%'):
+                continue
+            # find the next token that needs a part of speech assignment
+            while tokens[i].non_printing or tokens[i].is_para:
+                i += 1
+            # TNT output for tokens is the token, some spaces, and the
+            # POS tag.
+            tokens[i].pos = line.split()[1]
+            i += 1
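
A small usage sketch for tag_tokens (assumptions: tnt is on PATH, the wsj model exists at TRIGRAM_PATH, and the hand-written character offsets are illustrative):

    # Sketch: tag a hand-built token list, then read back the .pos
    # attributes that tag_tokens attaches.
    from mytoken import Token
    import tagger

    tokens = [Token(u'She', 0, 3), Token(u'farms', 4, 9), Token(u'.', 9, 10)]
    tagger.tag_tokens(tokens)
    for t in tokens:
        print t.str, getattr(t, 'pos', '?')   # prints e.g. "She PRP", "farms VBZ", ". ."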

File tests/test.py

-import kea2
-import collections
-import StringIO
-
-def expect(test_input, expected_output=None):
-    actual_output = kea2.EditAssistant(StringIO.StringIO(test_input)).edited_text
-    if expected_output == None:
-        expected_output = test_input
-    if actual_output != expected_output:
-        print 'expected "{0}" but got "{1}"'.format(expected_output, actual_output)
-        assert(False)
-
-
-
-def test_alphanumeric():
-    expect(u'', u'')
-    expect(u'100p of 43th', u'100 p of 43 th')
-    expect(u'0th')
-    expect(u'1st', u'first')
-    expect(u'2nd', u'second')
-    expect(u'3rd', u'third')
-    expect(u'4th', u'fourth')
-    expect(u'10th')
-    expect(u'11th')
-    expect(u'101st')
-    expect(u'102nd')
-    expect(u'102st', u'102 st')
-    expect(u'1,999,871st')
-    expect(u'1th', u'one th')
-    expect(u'2rd', u'two rd')
-
-def test_numeric():
-    """ Test handling of numbers.
-    """
-    expect(u'KES50, 000', u'50,000 Kenyan Shilling (KES)')
-    expect(u'15, 17, 19')
-    expect(u'50, 000, 34', u'50,000, 34')
-    expect(u'3.234.134', u'3,234,134')
-    expect(u'1', u'one')
-    expect(u'1.5')
-    expect(u'1000 USD', u'$1000')
-    expect(u'10000 USD', u'$10,000')
-    expect(u'100000 USD', u'$100,000')
-    expect(u'1000000 USD', u'$1,000,000')
-    expect(u'10000000 USD', u'$10,000,000')
-    expect(u'100000000 USD', u'$100,000,000')
-
-def test_asterisk():
-    expect(u'*foo')
-    expect(u'* foo', u'*foo')
-    expect(u'foo * bar', u'foo* bar')
-    expect(u'foo* bar')
-
-def test_currency():
-    """Test recognition of currency abbreviations."""
-    expect('Ksh.5,000', '5,000 Kenyan Shilling (KES)')
-    expect('Ksh.50, 000', '50,000 Kenyan Shilling (KES)')
-    expect('Php11, 000', '11,000 Philippine Peso (PHP)')
-    #expect('Php11,000, 100 PHP', '11,000 Philippine Peso (PHP), 100 PHP')
-    expect('180,000/= and 50/=', '180,000 Uganda Shilling (UGX) and 50 UGX')
-    expect('100 usd', '$100')
-    expect('usd 100', '$100')
-
-    expect('100 usd and 200 usd then', '$100 and $200 then')
-    expect('100 usd and usd 200 then', '$100 and $200 then')
-    expect('usd 100 and 200 usd then', '$100 and $200 then')
-    expect('usd 100 and usd 200 then', '$100 and $200 then')
-
-    expect('100 usd 200') # ambiguous, so unchanged
-    #expect('100 usd 200 usd', '$100 $200')
-    #expect('usd 100 usd 200', '$100 $200')
-
-def test_honorifics():
-    """Test recognition and capitalization of honorifics."""
-    expect('mr.', 'Mr.')
-    expect('mrs.', 'Mrs.')
-    expect('Mr.')
-    expect('Mrs.')
-
-def test_improve_expand():
-    """Test variations on the improve/expand replacement."""
-    expect("to improve/expand his business.", "to improve and expand it.")
-    expect("to improve/expand her business.", "to improve and expand it.")
-    expect("to improve/expand the borrower's business.", "to improve and expand it.")
-
-def test_output_generation():
-    expect(u'hello, world')
-    expect(u'Kiva/SMT')
+#!/usr/bin/env python
+from expect import expect
+
+
+def test_alphanumeric():
+    expect(u'', u'')
+    expect(u'100p of 43th', u'100 p of 43 th')
+    expect(u'0th')
+    expect(u'1st', u'first')
+    expect(u'2nd', u'second')
+    expect(u'3rd', u'third')
+    expect(u'4th', u'fourth')
+    expect(u'10th')
+    expect(u'11th')
+    expect(u'101st')
+    expect(u'102nd')
+    expect(u'102st', u'102 st')
+    expect(u'1,999,871st')
+    expect(u'1th', u'one th')
+    expect(u'2rd', u'two rd')
+
+
+def test_numeric():
+    """ Test handling of numbers.
+    """
+    expect(u'KES50, 000', u'50,000 Kenyan Shilling (KES)')
+    expect(u'15, 17, 19')
+    expect(u'50, 000, 34', u'50,000, 34')
+    expect(u'3.234.134', u'3,234,134')
+    expect(u'1', u'one')
+    expect(u'1.5')
+    expect(u'1000 USD', u'$1000')
+    expect(u'10000 USD', u'$10,000')
+    expect(u'100000 USD', u'$100,000')
+    expect(u'1000000 USD', u'$1,000,000')
+    expect(u'10000000 USD', u'$10,000,000')
+    expect(u'100000000 USD', u'$100,000,000')
+
+
+def test_asterisk():
+    expect(u'*foo')
+    expect(u'* foo', u'*foo')
+    expect(u'foo * bar', u'foo* bar')
+    expect(u'foo* bar')
+
+
+def test_currency():
+    """Test recognition of currency abbreviations."""
+    expect('Ksh.5,000', '5,000 Kenyan Shilling (KES)')
+    expect('Ksh.50, 000', '50,000 Kenyan Shilling (KES)')
+    expect('Php11, 000', '11,000 Philippine Peso (PHP)')
+    #expect('Php11,000, 100 PHP', '11,000 Philippine Peso (PHP), 100 PHP')
+    expect('180,000/= and 50/=', '180,000 Uganda Shilling (UGX) and 50 UGX')
+    expect('100 usd', '$100')
+    expect('usd 100', '$100')
+
+    expect('100 usd and 200 usd then', '$100 and $200 then')
+    expect('100 usd and usd 200 then', '$100 and $200 then')
+    expect('usd 100 and 200 usd then', '$100 and $200 then')
+    expect('usd 100 and usd 200 then', '$100 and $200 then')
+
+    expect('100 usd 200') # ambiguous, so unchanged
+    #expect('100 usd 200 usd', '$100 $200')
+    #expect('usd 100 usd 200', '$100 $200')
+
+
+def test_honorifics():
+    """Test recognition and capitalization of honorifics."""
+    expect('mr.', 'Mr.')
+    expect('mrs.', 'Mrs.')
+    expect('Mr.')
+    expect('Mrs.')
+
+
+def test_improve_expand():
+    """Test variations on the improve/expand replacement."""
+    expect("to improve/expand his business.", "to improve and expand it.")
+    expect("to improve/expand her business.", "to improve and expand it.")
+    expect("to improve/expand the borrower's business.", "to improve and expand it.")
+
+
+def test_output_generation():
+    expect(u'hello, world')
+    expect(u'Kiva/SMT')
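
The extracted expect helper itself is not shown in this excerpt; based on the version removed from tests/test.py above, it presumably looks roughly like this (module path assumed from the new `from expect import expect`):

    # Sketch of the extracted helper, reconstructed from the removed
    # in-file version above.
    import StringIO
    import kea2


    def expect(test_input, expected_output=None):
        """Assert that kea2 turns `test_input` into `expected_output`
        (or leaves it unchanged when no expected output is given)."""
        actual_output = kea2.EditAssistant(
            StringIO.StringIO(test_input)).edited_text
        if expected_output is None:
            expected_output = test_input
        if actual_output != expected_output:
            print 'expected "{0}" but got "{1}"'.format(
                expected_output, actual_output)
            assert False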

File transforms.py

 import re
 import pycountry
 import logging
-from base import Token, Transform
-
+from mytoken import Token
+from base import Transform
 
 def token_strings(tokens):
     return [t.str for t in tokens]
             # token we are to transform
             transform_token_index = self.rule.tokens.index(token_to_transform)
             # create the token that will be inserted to its left
-            left = Token(token_to_transform.str[:self._mo.start()])
-            # create the token that will replace it
-            paragraph_token = Token('\n')
-            # create the token that will be inserted to its right
-            right = Token(token_to_transform.str[self._mo.end():])
+            left_str = token_to_transform.str[:self._mo.start()]
+            right_str = token_to_transform.str[self._mo.end():]
+
+            left_token = None
+            right_token = None
+
+            if left_str:
+                if right_str:
+                    # token_to_transform.str has the form left_str +
+                    # '\n' + right_str, where left_str is some non-empty
+                    # string that doesn't contain a newline and
+                    # right_str is some non-empty string.
+                    left_cend = token_to_transform.cbegin + len(left_str)
+                    left_token = Token(
+                        left_str,
+                        token_to_transform.cbegin,
+                        left_cend)
+                    right_cbegin = left_cend + 1 # +1 to skip the \n
+                    right_token = Token(
+                        right_str,
+                        right_cbegin,
+                        right_cbegin + len(right_str))
+                else:
+                    # token_to_transform == left_str + '\n'
+                    left_token = Token(
+                        left_str,
+                        token_to_transform.cbegin,
+                        token_to_transform.cbegin + len(left_str))
+                paragraph_token = Token(u'\n', left_token.cend,
+                                        left_token.cend + 1)
+            else:
+                if right_str:
+                    # token_to_transform == '\n' + right_str
+                    right_token = Token(
+                        right_str,
+                        token_to_transform.cbegin + 1,
+                        token_to_transform.cbegin + 1 + len(right_str))
+                else:
+                    # token_to_transform == two or more '\n'
+                    pass
+                paragraph_token = Token(u'\n', token_to_transform.cbegin,
+                                        token_to_transform.cbegin + 1)
+
             # replace the token we're transforming
             self.rule.tokens[transform_token_index] = paragraph_token
             # insert a token to its left if it is nonempty
-            if left.str:
-                self.rule.tokens.insert(transform_token_index, left)
-                # that insertion has made transform_token_index out of
-                # date; correct it.
+            if left_token:
+                self.rule.tokens.insert(transform_token_index, left_token)
+                # that insertion has made transform_token_index off by
+                # one; correct it.
                 transform_token_index += 1
             # now insert the token to the right, if it is nonempty
-            if right.str:
-                self.rule.tokens.insert(transform_token_index + 1, right)
+            if right_token:
+                self.rule.tokens.insert(transform_token_index + 1, right_token)
         logging.debug(u'<ParagraphTransform %s',
                       token_strings(self.rule.tokens))
         return self.rule.tokens
         """Put keyword arguments in instance vars.
 
         Keyword args:
+        `matched_abbrev` -- the AbbrevInfo object which (at least
+        partially) matched the token to transform.
 
-        `matched_abbrev` - the AbbrevInfo object which (at least
-        partially) matched the token to transform.
-        `abbrev_match_len` - number of characters of token that matched.
+        `abbrev_match_len` -- number of characters of token that matched.
         """
         logging.debug(u'>AbbrevTransform.__init__')
         Transform.__init__(self, rule, tokens_to_transform, **kwargs)
                          len(tokens_to_transform))
 
     def apply(self):
-        logging.debug(u'>AbbrevTransform %s',
-                      token_strings(self.tokens_to_transform))
+        logging.debug(u'>AbbrevTransform')
         for token_to_transform in self.tokens_to_transform:
             token_to_transform.abbrev = True
 
             # place.
             if self._abbrev_match_len == len(token_to_transform.str):
                 if self._matched_abbrev.normal_form:
+                    logging.debug(u'%s => %s', token_to_transform,
+                                  self._matched_abbrev.normal_form)
                     token_to_transform.str = self._matched_abbrev.normal_form
                 continue
 
 
             # Now modify the existing token, and create a new one to
             # insert after it.
+            logging.debug(token_to_transform)
             token_to_transform.str = abbrev_part
-            post_abbrev_token = Token(extra_part)
+            token_to_transform.cend = (token_to_transform.cbegin +
+                                       len(abbrev_part))
+            post_abbrev_token = Token(extra_part, token_to_transform.cend,
+                                      token_to_transform.cend + len(extra_part))
+            logging.debug('=> {}, {}'.format(token_to_transform,
+                                             post_abbrev_token))
 
             # Find the index within the full list of tokens of the
             # token we are to transform
             transform_token_index = self.rule.tokens.index(token_to_transform)
             self.rule.tokens.insert(transform_token_index + 1,
                                     post_abbrev_token)
-        logging.debug(u'<AbbrevTransform %s', token_strings(self.rule.tokens))
+        logging.debug(u'<AbbrevTransform')
         return self.rule.tokens
 
 
-class RegexSplitTransform(Transform):
+class CharSplitTransform(Transform):
     """Split tokens on designated character."""
 
     def __init__(self, rule, token, **kwargs):
         Transform.__init__(self, rule, token, **kwargs)
-        self._split_re = kwargs.get('split_re')
+        self._delimiter_char = kwargs.get('delimiter_char')
+        self._keep_delimiter = kwargs.get('keep_delimiter')
+
+    def _add(self, split_tokens, i, split_str, token_to_transform):
+        cbegin = token_to_transform.cbegin + i - len(split_str)
+        cend = cbegin + len(split_str)
+        split_tokens.append(Token(split_str, cbegin, cend))
 
     def apply(self):
-        logging.debug(u'>RegexSplitTransform %s',
+        logging.debug(u'>CharSplitTransform %s',
                       token_strings(self.tokens_to_transform))
         for token_to_transform in self.tokens_to_transform:
             # find the index within the full list of tokens of the
             # to modify.
             left_of_split = self.rule.tokens[:transform_token_index]
             right_of_split = self.rule.tokens[transform_token_index + 1:]
-            split_strings = re.split(self._split_re,
-                                     token_to_transform.str)
-            # consecutive delimiters can result in empty strings in the
-            # output list, so use filter() to eliminate them
-            split_strings = filter(None, split_strings)
-            # create a new Token from each of the new non-empty strings
-            split_tokens = map(Token, split_strings)
+
+            split_tokens = []
+            split_str = u''
+            for i, c in enumerate(token_to_transform.str):
+                if c == self._delimiter_char:
+                    # if delimiter has just occurred after one or more
+                    # non-delimiter characters, add those characters as
+                    # a new token.
+                    if split_str:
+                        self._add(split_tokens, i, split_str, token_to_transform)
+                        split_str = u''
+                    # if we're keeping delimiter characters, make a new
+                    # token for it.
+                    if self._keep_delimiter:
+                        cbegin = token_to_transform.cbegin + i
+                        split_tokens.append(Token(c, cbegin, cbegin + 1))
+                else:
+                    split_str += c
+            if split_str:
+                self._add(split_tokens, i + 1, split_str, token_to_transform)
             # put it all back together
             self.rule.tokens = left_of_split + split_tokens + right_of_split
-        logging.debug(u'<RegexSplitTransform %s',
+            logging.debug(split_tokens)
+        logging.debug(u'<CharSplitTransform %s',
                       token_strings(self.rule.tokens))
         return self.rule.tokens
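
To make the offset bookkeeping above concrete, a hand-worked example (values computed by hand from the _add arithmetic, not produced by running the code):

    # Sketch: CharSplitTransform with delimiter_char=u'.' and
    # keep_delimiter=True, applied to a token covering "Ksh.5,000" that
    # starts at offset 10 in the original text:
    #
    #   original token:  str=u'Ksh.5,000'  cbegin=10  cend=19
    #   split tokens:    str=u'Ksh'        cbegin=10  cend=13
    #                    str=u'.'          cbegin=13  cend=14
    #                    str=u'5,000'      cbegin=14  cend=19
    #
    # Each piece keeps its own character span in the original input, which
    # is the begin/end tracking this commit starts to introduce.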