Commits

david_walker committed fa39edb

split Token class out of base into mytoken.py (token.py conflicts with standard module)
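
The conflict is easy to trip over: the directory of the running script sits at
the front of sys.path, so a local token.py shadows the standard library for
every import in the process. A minimal sketch (the layout is assumed for
illustration):

    # in a project directory containing its own token.py
    import token     # binds to the local file, not the stdlib module
    import tokenize  # stdlib tokenize itself does "import token",
                     # so it picks up the local file too and fails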

Comments (0)

Files changed (3)

mytoken.py

+#!/usr/bin/env python
+"""
+The Token class, which is used to contain input tokens (that are later
+edited by transforms).
+
+A Token object represents some bit of text and has additional
+properties describing that text.
+
+Initially there is only one token object which contains the entire blob
+of text that appeared in the original input.
+
+After some Transforms have been applied, that original token will have
+been replaced by a number of Tokens; eventually after processing is
+complete each Token will represent a small element like an individual
+word or punctuation mark.
+
+This module is named "mytoken" rather than "token" to avoid overriding
+the Python standard library module named "token".
+"""
+
+import re
+import pycountry
+import unicodedata
+
+
+class AbbrevInfo(object):
+    """Hold information about an abbreviation."""
+    def __init__(self, regex_str, normal_form=None):
+        """Compile regex and store normal_form.
+
+        `regex_str` - a regular expression string.
+        `normal_form` - normal way the abbreviation appears.
+
+        Usually the regex exists just to have a case-insensitive way to
+        match, but some are more complicated than that.
+
+        The normal form has the standard capitalization of the
+        abbreviation and is substituted for the input text if it is
+        specified.
+        """
+        self.regex = re.compile(regex_str, re.I | re.U)
+        self.normal_form = normal_form
+
+
+class Token(object):
+    """Contains a portion of text, either part of the original input
+    or generated later, as well as properties describing it.
+
+    Token objects should only be modified by Transform objects. They
+    should not modify themselves.
+
+    This keeps all the token-modifying code in one place, as well as
+    providing a mechanism to resolve conflicts between multiple bits of
+    code that might both want to touch the same Token object.
+    """
+    abbreviations = [
+        AbbrevInfo(ur'e\.g\.'),
+        AbbrevInfo(ur'i\.e\.'),
+        AbbrevInfo(ur'etc\.'),
+        AbbrevInfo(ur'mr\.', u'Mr.'),
+        AbbrevInfo(ur'mrs\.', u'Mrs.'),
+        AbbrevInfo(ur'ksh\.', u'KES'),
+        AbbrevInfo(ur'kes\.', u'KES'),
+        AbbrevInfo(ur'ltd\.', u'Ltd.'),
+        AbbrevInfo(ur's\.a\.l(\.)?', u's.a.l.'),
+        AbbrevInfo(ur'u\.s\.s\.r\.', u'U.S.S.R.')]
+
+    _currency_terms = [
+        u'$',
+        u'dollar',
+        u'dollars',
+        u'/=',
+        u'peso',
+        u'pesos',
+        u'shilling',
+        u'shillings']
+
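+    # Patterns for alphanumeric ordinals ("1st", "22nd", "43rd", "5th",
+    # "12th"); the suffix must agree with the trailing digit(s).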
+    ordinal_res = [
+        re.compile(ur'^([0-9,]*[02-9]){0,1}1st$', re.I | re.U),
+        re.compile(ur'^([0-9,]*[02-9]){0,1}2nd$', re.I | re.U),
+        re.compile(ur'^([0-9,]*[02-9]){0,1}3rd$', re.I | re.U),
+        re.compile(ur'^[0-9,]*[04-9]th$', re.I | re.U),
+        re.compile(ur'^[0-9,]*1[0-9]th$', re.I | re.U),
+    ]
+
+    has_digits_re = re.compile(ur'.*\d+.*', re.U)
+
+    is_alpha_re = re.compile(ur'^\w+$', re.I | re.U)
+
+    # recognizes a decimal number with comma-delimited thousands groups
+    delimited_decimal_re = re.compile(
+        ur"""^            # start of string
+             [1-9]        # nonzero leading digit
+             [0-9]{,2}    # up to two more leading digits
+             (,[0-9]{3})* # any number of comma-delimited groups of 3 digits
+             (\.[0-9]+)?  # optional decimal followed by one or more digits
+             $            # end of string
+          """,
+        re.U | re.X)
+
+    # recognizes an integer with comma-delimited thousands groups
+    delimited_integer_re = re.compile(
+        ur"""^            # start of string
+             [0-9]{1,3}   # one to three leading digits
+             (,[0-9]{3})* # any number of comma-delimited groups of 3 digits
+             $            # end of string
+          """,
+        re.U | re.X)
+
+    url_re = re.compile(
+        ur"""(\w+\.)+      # one or more dot-delimited words
+             # one of the TLDs that appear in Kiva loans
+             (com|edu|gov|info|mil|net|org|tj)
+             (\S*)         # any amount of non-space chars
+          """,
+        re.I | re.U | re.X)
+
+    def __init__(self, s, cbegin=None, cend=None):
+        """Initialize from text.
+
+        Arguments:
+        s -- unicode string
+
+        cbegin -- index into original text for the start of the initial
+        value of this token
+
+        cend -- index into original text to character just past the end
+        of the initial value of this token.
+
+        It is frequently true that at initialization cend = cbegin +
+        len(s), but since cbegin and cend are offsets into the original
+        input text, they are held invariant as s changes.
+        """
+        # Note we use the setter here which initializes the cache.
+        self.str = s
+        self.cbegin = cbegin
+        self.cend = cend
+        # cbegin and cend must be supplied together, and in order.
+        assert (cbegin is None) == (cend is None)
+        assert cbegin is None or cend >= cbegin
+
+    def __repr__(self):
+        """Return a string representation of this object suitable for
+        debugging output.
+        """
+        r = u'<'
+        for key, val in sorted(self.__dict__.items()):
+            if val is not None and val is not False:
+                if len(r) > 1:
+                    r += u' '
+                if key == '_str':
+                    r += u'"{}"'.format(val.replace(u'\n', u'\\n'))
+                else:
+                    r += u'{}: {}'.format(key, val)
+        r += u'>'
+        # Python 2.x requires that __repr__ return a byte string;
+        # Python 3.x requires that it return a unicode string.
+        return r.encode('iso-8859-15', 'replace')
+
+    def _reset_cache(self):
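+        """Discard lazily computed values; called whenever `str` changes."""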
+        self._abbrev_checked = False
+        self._abbrev_match = None
+        self._abbrev_match_len = 0
+        self.sentence_delim = None
+        self.eof = None
+        self._URL_checked = False
+        self._is_URL = None
+
+    @property
+    def str(self):
+        return self._str
+
+    @str.setter
+    def str(self, new_value):
+        self._str = unicode(new_value)
+        self._reset_cache()
+
+    @property
+    def abbrev_match_len(self):
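+        """Return (match_length, AbbrevInfo) for the abbreviation
+        matching the start of this token, or (0, None) if none does.
+        The result is cached until `str` changes.
+        """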
+        if not self._abbrev_checked:
+            self._abbrev_checked = True
+            for abbrev in Token.abbreviations:
+                match_obj = abbrev.regex.match(self._str)
+                if match_obj:
+                    self._abbrev_match = abbrev
+                    self._abbrev_match_len = len(match_obj.group())
+                    break
+        return self._abbrev_match_len, self._abbrev_match
+
+    @property
+    def has_digits(self):
+        """Return True if `str` has digits in it."""
+        return Token.has_digits_re.search(self._str) is not None
+
+    @property
+    def is_abbrev(self):
+        """Return True if token matches (not just starts with) an
+        abbreviation."""
+        match_len, abbrev = self.abbrev_match_len
+        return abbrev and match_len == len(self._str)
+
+    @property
+    def is_alpha(self):
+        """Return True if token contains only letters."""
+        return LexToken.is_alpha_re.match(self._str)
+
+    @property
+    def is_alphanumeric_ordinal(self):
+        """Return True if token is of the form 1st, 2nd, 3rd, 4th, etc."""
+        for regex in Token.ordinal_res:
+            if regex.match(self._str):
+                return True
+        return False
+
+    @property
+    def is_close(self):
+        """Return True if this token is any type of closing paren.
+        """
+        return len(self._str) == 1 and self._str in u')]}'
+
+    @property
+    def is_currency_symbol(self):
+        return self._str == u'$'
+
+    @property
+    def is_currency_term(self):
+        if self._str.lower() in Token._currency_terms:
+            return True
+        return self.is_ISO_currency
+
+    @property
+    def is_eof(self):
+        return self.eof is True
+
+    @property
+    def is_delimited_decimal(self):
+        return Token.delimited_decimal_re.match(self._str) is not None
+
+    @property
+    def is_delimited_integer(self):
+        return Token.delimited_integer_re.match(self._str) is not None
+
+    @property
+    def is_ISO_currency(self):
+        try:
+            pycountry.currencies.get(letter=self._str.upper())
+            result = True
+        except KeyError:
+            result = False
+        return result
+
+    @property
+    def is_nonspacing_punc(self):
+        """Return True if this token is a punctuation character.
+        """
+        return len(self._str) == 1 and self._str in u',.!?;%:'
+
+    @property
+    def is_open(self):
+        """Return True if this token is any type of opening paren.
+        """
+        return len(self._str) == 1 and self._str in u'([{'
+
+    @property
+    def is_para(self):
+        return self._str == u'\n'
+
+    @property
+    def is_punc(self):
+        """Return True if this token is a punctuation character.
+        """
+        return len(self._str) == 1 and unicodedata.category(
+            self._str).startswith(u'P')
+
+    @property
+    def is_quote(self):
+        """Return true if this token is any type of single or double quote.
+        """
+        return len(self._str) == 1 and self._str in u'\'`"'
+
+    @property
+    def is_URL(self):
+        """Check if token contains a URL, marking it if necessary.
+
+        Only a subset of possible URL forms likely to appear in a Kiva
+        description are recognized, since it is more likely that a token
+        that happens to conform to an exotic URL form is, in fact, a typo.
+        """
+        if not self._URL_checked:
+            self._URL_checked = True
+
+            # Look for a scheme identifier; Kiva loans will only have an
+            # http or maybe https prefix, and won't use any of the others.
+            if self._str.lower().startswith('http'):
+                self._is_URL = True
+            else:
+                self._is_URL = Token.url_re.match(self._str) is not None
+        return self._is_URL
+
+    @property
+    def non_printing(self):
+        """Return True if any of the attributes are set which indicate a
+        non-printing token.
+        """
+        return self.sentence_delim or self.eof
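
The new class can be exercised on its own; an illustrative sketch (not part of
the commit, with expected values following from the regexes above):

    from mytoken import Token

    t = Token(u'1,234.56', 0, 8)
    assert t.has_digits
    assert t.is_delimited_decimal
    assert not t.is_alpha

    t.str = u'Mrs.'   # the setter discards all cached properties
    match_len, abbrev = t.abbrev_match_len
    assert match_len == 4 and abbrev.normal_form == u'Mrs.'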

tests/expect.py

+import kea
+import StringIO
+
+
+def print_as_chars(s):
+    for c in s:
+        print ord(c),
+    print
+
+
+def expect(test_input, expected_output=None):
+    actual_output = kea.EditAssistant(
+        StringIO.StringIO(test_input)).edited_text
+    if expected_output is None:
+        expected_output = test_input
+    if actual_output != expected_output:
+        print 'expected "{0}" but got "{1}"'.format(
+            expected_output, actual_output)
+        print "expected:",
+        print_as_chars(expected_output)
+        print "actual:  ",
+        print_as_chars(actual_output)
+        assert False

tests/test_para.py

+from expect import expect
+
+def test_para():
+    expect(u'abc\nde', u'abc \n\nde')
+    expect(u'\nde', u'\n\nde')
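
The test functions follow the pytest naming convention, so (assuming pytest is
the runner here) the suite can be exercised with something like:

    py.test tests/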