Source

Kiva Editor's Assistant / classifier.py

Full commit
#!/usr/bin/env python
import re

class AbbrevInfo(object):
    """Pair a compiled abbreviation pattern with its canonical spelling."""

    def __init__(self, regex_str, normal_form=None):
        """Build the matcher for one abbreviation.

        `regex_str` - regular expression source text; it is compiled
        here with the IGNORECASE and UNICODE flags.
        `normal_form` - the standard way the abbreviation is written
        (including its capitalization), or None when no replacement
        spelling is specified.

        Most patterns exist only to get a case-insensitive match, though
        a few are more involved than that.

        When a normal form is given it is meant to be substituted for
        the matched input text.
        """
        match_flags = re.IGNORECASE | re.UNICODE
        compiled = re.compile(regex_str, match_flags)
        self.regex = compiled
        self.normal_form = normal_form


class TokenClassifier(object):
    """Contains methods that will test a token and set its attributes to
    reflect its membership in a class of things like URLs, acronyms,
    etc. (The term class is not used in a Pythonic sense here.)

    The methods read the token's text from its `str` attribute and read
    and set its boolean `url` and `has_digits` attributes.
    """
    # Raw string: the regex escapes (\w, \., \S) must reach the regex
    # engine untouched and are not valid string-literal escapes.
    url_re = re.compile(
        r"""(\w+\.)+     # one or more dot-delimited words
           # one of the TLDs that appear in Kiva loans
           (com|edu|gov|info|mil|net|org|tj)
           (\S*)        # any amount of non-space chars """,
        re.I | re.U | re.VERBOSE)

    # Plain r'' (not ur'') literals are used for every pattern below: the
    # `ur` prefix is a syntax error on Python 3, while r'' is accepted by
    # both Python 2 and 3 and yields the same pattern text.
    has_digits_re = re.compile(r'.*\d+.*')

    # Checked in order by get_abbrev_match(); the first match wins.
    abbreviations = [
        AbbrevInfo(r'e\.g\.'),
        AbbrevInfo(r'i\.e\.'),
        AbbrevInfo(r'etc\.'),
        AbbrevInfo(r'mr\.', 'Mr.'),
        AbbrevInfo(r'mrs\.', 'Mrs.'),
        AbbrevInfo(r'ksh\.', 'KES'),
        AbbrevInfo(r'kes\.', 'KES'),
        AbbrevInfo(r'ltd\.', 'Ltd.'),
        AbbrevInfo(r's\.a\.l(\.)?', 's.a.l.'),
        AbbrevInfo(r'u\.s\.s\.r\.', 'U.S.S.R.')]

    @staticmethod
    def check_URL(token):
        """Check if token contains a URL, marking it if necessary.

        Only a subset of possible URL forms likely to appear in a Kiva
        description are recognized, since it is more likely that a token
        that happens to conform to an exotic URL form is, in fact, a typo.

        Returns True (and ensures `token.url` is True) when the token is
        considered a URL, False otherwise.
        """
        # see if token is already marked as a URL
        if token.url:
            return True

        # look for a scheme identifier; Kiva loans only will have an
        # http or maybe https prefix, but won't use any of the others.
        if token.str.lower().startswith('http'):
            token.url = True
            return True

        # run it through the regex, which is pretty permissive.
        # match() anchors at the start of the token text only.
        if TokenClassifier.url_re.match(token.str):
            token.url = True
            return True
        return False

    @staticmethod
    def check_digits(token):
        """Check if token contains a digit, marking it if necessary.

        Returns True (and ensures `token.has_digits` is True) when the
        token is already marked or its text matches `has_digits_re`,
        False otherwise.
        """
        # see if token is already marked as containing digits
        if token.has_digits:
            return True
        if TokenClassifier.has_digits_re.match(token.str):
            token.has_digits = True
            return True
        return False

    @staticmethod
    def get_abbrev_match(token):
        """Find the first known abbreviation at the start of the token.

        Returns a `(length, abbrev)` tuple where `length` is the number
        of characters matched and `abbrev` is the matching AbbrevInfo
        from `abbreviations`, or `(0, None)` when nothing matches.
        """
        for abbrev in TokenClassifier.abbreviations:
            match_obj = abbrev.regex.match(token.str)
            if match_obj:
                return len(match_obj.group()), abbrev
        return 0, None

    @staticmethod
    def _get_match_len(matches, text):
        """Return the length of the first successful match against text.

        `matches` - iterable of objects with a `match()` method (such as
        compiled regexes), tried in order.
        `text` - the string to match against.

        Returns the length of the text matched by the first entry that
        matches, or 0 when none do.
        """
        for m in matches:
            match_obj = m.match(text)
            if match_obj:
                return len(match_obj.group())
        return 0