#!/usr/bin/env python
"""The Token class, which is used to contain input tokens (that are
later edited by transforms).

A Token object represents some bit of text and has additional
properties describing that text.

Initially there is only one Token object, which contains the entire
blob of text that appeared in the original input.

After some Transforms have been applied, that original token will have
been replaced by a number of Tokens; eventually, after processing is
complete, each Token will represent a small element like an individual
word or punctuation mark.

This module is named "keatoken" rather than "token" to avoid overriding
the Python standard library module named "token".
"""

import re
import pycountry
import unicodedata
from abc import ABCMeta, abstractmethod
import logging

import tagger

class AbbrevInfo(object):
    """Hold information about an abbreviation."""
    def __init__(self, regex_str, normal_form=None):
        """Compile regex and store normal_form.

        `regex_str` - a regular expression string.
        `normal_form` - the normal way the abbreviation appears.

        Usually the regex exists just to have a case-insensitive way to
        match, but some are more complicated than that.

        The normal form has the standard capitalization of the
        abbreviation and is substituted for the input text if it is
        matched.
        """
        self.regex = re.compile(regex_str, re.I | re.U)
        self.normal_form = normal_form

class Token(object):
    """Contains a portion of text, either part of the original input
    or generated later, as well as properties describing it.

    Token objects should only be modified by Transform objects. They
    should not modify themselves.

    This keeps all the token-modifying code in one place, as well as
    providing a mechanism to resolve conflicts between multiple bits of
    code that might both want to touch the same Token object.
    """

    abbreviations = [
        AbbrevInfo(ur'mr\.', u'Mr.'),
        AbbrevInfo(ur'mrs\.', u'Mrs.'),
        AbbrevInfo(ur'ksh\.', u'KES'),
        AbbrevInfo(ur'kes\.', u'KES'),
        AbbrevInfo(ur'ltd\.', u'Ltd.'),
        AbbrevInfo(ur's\.a\.l(\.)?', u's.a.l.'),
        AbbrevInfo(ur'u\.s\.s\.r\.', u'U.S.S.R.')]
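
    # Illustrative behavior (assuming the reconstructed properties
    # below): a token whose text is u'ksh.' matches the
    # AbbrevInfo(ur'ksh\.', u'KES') entry, so its `is_abbrev` property
    # is True and u'KES' is the normal form a transform would
    # substitute for the token's text.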

    # Words that refer to currency. The original list was truncated in
    # this copy; the entries below are representative assumptions, not
    # the exhaustive original set.
    _currency_terms = [
        u'dollar',
        u'dollars',
        u'shilling',
        u'shillings']

    # Regexes that recognize alphanumeric ordinals like 1st, 42nd, 3rd,
    # 4th, 12th. The 1st/2nd/3rd patterns exclude numbers ending in 11,
    # 12, or 13, which take "th" and are matched by the final pattern.
    # NOTE: the [0-9,]* prefix on the fourth pattern is an assumption;
    # without it, multi-digit ordinals like 24th would not match.
    ordinal_res = [
        re.compile(ur'^([0-9,]*[02-9]){0,1}1st$', re.I | re.U),
        re.compile(ur'^([0-9,]*[02-9]){0,1}2nd$', re.I | re.U),
        re.compile(ur'^([0-9,]*[02-9]){0,1}3rd$', re.I | re.U),
        re.compile(ur'^[0-9,]*[04-9]th$', re.I | re.U),
        re.compile(ur'^[0-9,]*1[0-9]th$', re.I | re.U)]
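
    # Expected behavior, as a sketch: u'1st', u'22nd', u'3rd', u'4th',
    # u'11th', and u'1,001st' should each match one of the patterns
    # above, while a malformed ordinal like u'21th' should match none.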

    has_digits_re = re.compile(ur'.*\d+.*', re.U)

    is_alpha_re = re.compile(ur'^\w+$', re.I | re.U)

    # recognizes a decimal number with comma-delimited thousands groups
    delimited_decimal_re = re.compile(
        ur"""^            # start of string
             [1-9]        # nonzero leading digit
             [0-9]{,2}    # up to two more leading digits
             (,[0-9]{3})* # any number of comma-delimited groups of 3 digits
             (\.[0-9]+)?  # optional decimal point followed by one or more digits
             $            # end of string
          """,
        re.U | re.X)
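
    # For example, u'1,234.56' and u'999' match this pattern, while
    # u'01,234' (leading zero) and u'1234,5' (misplaced comma) do not.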

    # recognizes an integer with comma-delimited thousands groups
    delimited_integer_re = re.compile(
        ur"""^            # start of string
             [0-9]{1,3}   # one to three leading digits
             (,[0-9]{3})* # any number of comma-delimited groups of 3 digits
             $            # end of string
          """,
        re.U | re.X)
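
    # For example, u'1,000,000' and u'042' match (unlike the decimal
    # pattern above, a leading zero is permitted here), while u'1,00'
    # does not.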

    # NOTE: the TLD alternation below is a representative assumption;
    # the original list of TLDs was lost from this copy.
    url_re = re.compile(
        ur"""(\w+\.)+           # one or more dot-delimited words
             (com|net|org|info) # one of the TLDs that appear in Kiva loans
             (\S*)              # any amount of non-space chars
          """,
        re.I | re.U | re.VERBOSE)
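
    # For example, u'www.example.com/page' and u'kiva.org' match, while
    # a bare word like u'hello' does not (the hostnames here are
    # illustrative only).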

    @staticmethod
    def bos_token():
        """A convenience class-factory method to return a
        beginning-of-sentence delimiter token."""
        bos_token = Token('*BOS*')
        bos_token.bos = True
        return bos_token

    @staticmethod
    def eos_token():
        """A convenience class-factory method to return an
        end-of-sentence delimiter token."""
        eos_token = Token('*EOS*')
        eos_token.eos = True
        return eos_token

    def __init__(self, s, cbegin=None, cend=None):
        """Initialize from text.

        s -- unicode string

        cbegin -- index into the original text of the start of the
        initial value of this token

        cend -- index into the original text of the character just past
        the end of the initial value of this token.

        It is frequently true that at initialization cend == cbegin +
        len(s), but since cbegin and cend are offsets into the original
        input text, they are held invariant as s changes.
        """
        # Note we use the setter here, which initializes the cache.
        self.str = s
        self.cbegin = cbegin
        self.cend = cend
        self.pos = tagger.PosContainer()
        assert((cbegin is None and cend is None) or self.cend >= self.cbegin)

    def __repr__(self):
        """Return a string representation of this object suitable for
        debugging output."""
        escaped_str = self._str.replace(u'\n', u'\\n')
        if self.cbegin is None and self.cend is None:
            r = u'<' + escaped_str + u'>'
        else:
            r = u'<{} {}:{}>'.format(escaped_str, self.cbegin, self.cend)
        # Python 2.x requires that __repr__ return a byte string.
        # Python 3.x requires that it return a unicode string.
        return r.encode(encoding='iso-8859-15', errors='replace')

    def __str__(self):
        return self.__repr__()

    def __eq__(self, other):
        if isinstance(other, basestring):
            return self._str == other
        assert(isinstance(other, Token))
        for key, val in self.__dict__.items():
            if getattr(other, key, None) != val:
                return False
        return True

    def __ne__(self, other):
        return not self.__eq__(other)

    def _reset_cache(self):
        self._abbrev_checked = False
        self._abbrev_match = None
        self._abbrev_match_len = 0
        self.bos = None  # beginning of sentence
        self.eos = None  # end of sentence
        self.eof = None  # end of file
        self._URL_checked = False
        self._is_URL = None

    @property
    def str(self):
        return self._str

    @str.setter
    def str(self, new_value):
        """Set the token's text, resetting all cached properties."""
        self._str = unicode(new_value)
        self._reset_cache()

    @property
    def abbrev_match_len(self):
        """Return a (match length, AbbrevInfo) tuple for the first
        abbreviation matching this token, or (0, None) if none match."""
        if not self._abbrev_checked:
            self._abbrev_checked = True
            for abbrev in Token.abbreviations:
                match_obj = abbrev.regex.match(self._str)
                if match_obj:
                    self._abbrev_match = abbrev
                    self._abbrev_match_len = len(match_obj.group(0))
                    break
        return self._abbrev_match_len, self._abbrev_match

    @property
    def has_digits(self):
        """Return True if `str` has digits in it."""
        return Token.has_digits_re.match(self._str) is not None

    @property
    def is_abbrev(self):
        """Return True if token matches (not just starts with) an
        abbreviation."""
        match_len, abbrev = self.abbrev_match_len
        return abbrev is not None and match_len == len(self._str)

    @property
    def is_alpha(self):
        """Return True if token contains only letters."""
        return Token.is_alpha_re.match(self._str) is not None

    @property
    def is_alphanumeric_ordinal(self):
        """Return True if token is of the form 1st, 2nd, 3rd, 4th, etc."""
        for regex in Token.ordinal_res:
            if regex.match(self._str):
                return True
        return False

    @property
    def is_close(self):
        """Return True if this token is any type of closing paren."""
        return len(self._str) == 1 and self._str in u')]}'

    @property
    def is_currency_symbol(self):
        """Return True if this token is a currency symbol."""
        return self._str == u'$'

    @property
    def is_currency_term(self):
        """Return True if this token names or abbreviates a currency."""
        if self._str.lower() in Token._currency_terms:
            return True
        return self.is_ISO_currency

    @property
    def is_eof(self):
        """Return True if this token marks end of file."""
        return self.eof == True

    @property
    def is_delimited_decimal(self):
        return Token.delimited_decimal_re.match(self._str) is not None

    @property
    def is_delimited_integer(self):
        return Token.delimited_integer_re.match(self._str) is not None

    @property
    def is_ISO_currency(self):
        """Return True if this token is an ISO 4217 currency code.

        The body below is a reconstruction: it assumes the pycountry
        0.x API, where currencies.get() raises KeyError on a miss.
        """
        try:
            pycountry.currencies.get(letter=self._str.upper())
            result = True
        except KeyError:
            result = False
        return result

    @property
    def is_nonspacing_punc(self):
        """Return True if this token is a nonspacing punctuation
        character."""
        return len(self._str) == 1 and self._str in u',.!?;%:'

    @property
    def is_open(self):
        """Return True if this token is any type of opening paren."""
        return len(self._str) == 1 and self._str in u'([{'

    @property
    def is_para(self):
        """Return True if this token is a paragraph break."""
        return self._str == u'\n'

    @property
    def is_punc(self):
        """Return True if this token is a punctuation character."""
        # Unicode category strings for punctuation all start with 'P',
        # e.g. 'Po' (other), 'Ps' (open), 'Pe' (close).
        return (len(self._str) == 1 and
                unicodedata.category(self._str).startswith(u'P'))

    @property
    def is_quote(self):
        """Return True if this token is any type of single or double
        quote."""
        return len(self._str) == 1 and self._str in u'\'`"'

    @property
    def is_URL(self):
        """Check if token contains a URL, marking it if necessary.

        Only a subset of the possible URL forms likely to appear in a
        Kiva description are recognized, since it is more likely that a
        token that happens to conform to an exotic URL form is, in
        fact, a typo.
        """
        if not self._URL_checked:
            self._URL_checked = True

            # Look for a scheme identifier; Kiva loans will only have
            # an http or maybe https prefix, and won't use any of the
            # other schemes.
            if self._str.lower().startswith('http'):
                self._is_URL = True
            elif Token.url_re.match(self._str):
                self._is_URL = True
        return self._is_URL

    @property
    def non_printing(self):
        """Return True if any of the attributes are set which indicate
        a non-printing token."""
        return self.bos or self.eos or self.eof
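

# A minimal smoke test, assuming the reconstructed properties above and
# a `tagger` module that provides PosContainer. The token text and
# offsets here are hypothetical examples, not fixtures from the real
# pipeline.
if __name__ == '__main__':
    token = Token(u'1,234.56', 0, 8)
    assert token.is_delimited_decimal
    assert token.has_digits
    assert not token.is_alpha

    # Assigning through the `str` setter resets the cached properties,
    # but cbegin and cend still describe the original input span.
    token.str = u'Mrs.'
    assert token.is_abbrev
    assert (token.cbegin, token.cend) == (0, 8)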