Commits

david_walker committed 7fa47b4

delete obsolete kea.py

  • Parent commits 485f42a

Files changed (1)

File kea.py

-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-import sys
-import StringIO
-import re
-import codecs
-from collections import defaultdict
-import regex_process
-from clipboard import *
-import argparse
-
-# for debugging
-import logging
-import tempfile
-import pdb
-
-#
-# A constant dictionary mapping ISO 4217 currency abbreviations to display
-# names. Note 'USD' maps only to 'US' because it is usually represented with
-# the symbol '$' and must be handled separately.
-#
-# TODO: consider using pycountry database
-#
-
-currency_display_names = defaultdict(str)
-currency_display_names[u'AMD'] = u'Armenian dram'
-currency_display_names[u'AZN'] = u'Azerbaijani manat'
-currency_display_names[u'GHS'] = u'Ghanaian cedis'
-currency_display_names[u'IDR'] = u'Indonesian rupiah'
-currency_display_names[u'KES'] = u'Kenyan shillings'
-currency_display_names[u'KGS'] = u'Kyrgyzstani soms'
-currency_display_names[u'MNT'] = u'Mongolian tugriks'
-currency_display_names[u'NGN'] = u'Nigerian naira'
-currency_display_names[u'PEN'] = u'Peruvian nuevos soles'
-currency_display_names[u'PHP'] = u'Philippine pesos'
-currency_display_names[u'SDG'] = u'Sudanese pounds'
-currency_display_names[u'SLL'] = u'Sierra Leonean leones'
-currency_display_names[u'TJS'] = u'Tajikistani somonis'
-currency_display_names[u'UAH'] = u'Ukrainian hryvnias'
-currency_display_names[u'UGX'] = u'Ugandan shillings'
-currency_display_names[u'USD'] = u'US' # this gets special handling for '$'
-currency_display_names[u'VND'] = u'Vietnamese dong'
-currency_display_names[u'WST'] = u'Samoan tala'
-
-#
-# A separate dictionary of idiomatic currency abbreviations
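-# (e.g. "Ksh" for the Kenyan shilling, "Le" for the Sierra Leonean leone,
-# "N" for the Nigerian naira), mapped to their ISO 4217 codes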
-#
-
-alternate_currency_abbrev = defaultdict(str)
-alternate_currency_abbrev[u'KSH']  = u'KES'
-alternate_currency_abbrev[u'KSHS'] = u'KES'
-alternate_currency_abbrev[u'KSH.'] = u'KES'
-alternate_currency_abbrev[u'KES.'] = u'KES'
-alternate_currency_abbrev[u'LE']   = u'SLL'
-alternate_currency_abbrev[u'N']    = u'NGN'
-
-
-
-#
-# Token Classes
-#
-
-class TokenType(object):
-    """ Used as a namespace to contain enumerated type values.
-    """
-    EOF      = 1
-    ALPHA    = 2
-    NUMERIC  = 3
-    PARA     = 4
-    SYM      = 5
-    ALPHANUM = 6
-
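-# Words whose trailing period belongs to the token rather than ending a
-# sentence; matched case-insensitively at the current position.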
-ABBREVIATIONS = [
-    re.compile(ur'e\.g\.', re.I | re.U),
-    re.compile(ur'i\.e\.', re.I | re.U),
-    re.compile(ur'etc\.', re.I | re.U),
-    re.compile(ur'mr\.', re.I | re.U),
-    re.compile(ur'mrs\.', re.I | re.U),
-    re.compile(ur'ksh\.', re.I | re.U),
-    re.compile(ur'kes\.', re.I | re.U),
-    re.compile(ur'Ltd\.', re.I | re.U),
-    re.compile(ur's\.a\.l(\.)?', re.I | re.U),
-    re.compile(ur'U\.S\.S\.R\.', re.I | re.U)
-    ]
-
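-# Ordinal numbers written with digits, e.g. 1st, 22nd, 103rd, 7th, 12th.
-# The -st/-nd/-rd patterns exclude a preceding '1' so that the irregular
-# teens (11th, 12th, 13th) are matched only by the last pattern.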
-ALPHANUMS = [
-    re.compile(ur'([0-9,]*[02-9]){0,1}1st', re.I | re.U),
-    re.compile(ur'([0-9,]*[02-9]){0,1}2nd', re.I | re.U),
-    re.compile(ur'([0-9,]*[02-9]){0,1}3rd', re.I | re.U),
-    re.compile(ur'[04-9]th', re.I | re.U),
-    re.compile(ur'[0-9,]*1[0-9]th',  re.I | re.U),
-    ]
-
-# These must be separate regexes because Python regex alternation is not
-# greedy: it takes the first alternative that matches, not the longest one.
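-# For example, re.match(ur'a|aa', u'aa').group() is u'a': the first matching
-# alternative wins even when a later one would match more characters.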
-
-US_DECIMAL_NUMBER_REGEX = re.compile(ur"""
-  [0-9]{1,3}        # one to three digits
-  (,[0-9]{3})*      # followed by zero or more comma-delimited
-                    # three-digit groups
-  (\.[0-9]{1,2})?   # with an optional decimal point and one or two digits
-""", re.I | re.U | re.VERBOSE)
-
-EURO_DECIMAL_NUMBER_REGEX = re.compile(ur"""
-  [0-9]{1,3}     # one to three digits
-  (\.[0-9]{3})*  # followed by zero or more period-delimited
-                 # three-digit groups
-""", re.I | re.U | re.VERBOSE)
-
-INTEGER_REGEX = re.compile(ur"""
-  [0-9 ]+   # digits, possibly space-grouped like '2 000'
-""", re.I | re.U | re.VERBOSE)
-
-class Token(object):
-    def is_alpha(self):
-        """Return True if this token is composed only of alpha characters.
-        """
-        return self._type == TokenType.ALPHA
-
-    def is_close(self):
-        """Return True if this token is any type of closing paren.
-        """
-        return len(self._str) == 1 and self._str in u')]}'
-
-    def is_currency_symbol(self):
-        """Return True if this token is a currency symbol (currently only '$').
-        """
-        return self._str == u'$'
-
-    def is_eof(self):
-        """Return true if this is an end of file token.
-        """
-        return self._type == TokenType.EOF
-
-    def is_numeric(self):
-        """Return true if this token represents a number using digits.
-        """
-        return self._type == TokenType.NUMERIC
-
-    def is_open(self):
-        """Return True if this token is any type of opening paren.
-        """
-        return len(self._str) == 1 and self._str in u'([{'
-
-    def is_quote(self):
-        """Return true if this token is any type of single or double quote.
-        """
-        return len(self._str) == 1 and self._str in u'\'`"'
-
-    def is_symbol(self):
-        """Return True if this token is any symbol (e.g., "$").
-        """
-        return not self.is_punc() and self._type == TokenType.SYM
-
-    def is_para(self):
-        """Return True if this token is a paragraph break.
-        """
-        return self._type == TokenType.PARA
-
-    def is_punc(self):
-        """Return True if this token is a punctuation character.
-        """
-        return self._type == TokenType.SYM and self._str in u',.!?;:%*'
-
-    def __repr__(self):
-        """Return string representation of this object.
-        """
-        return self._str
-
-    def __str__(self):
-        return self._str
-
-
-
-class InputToken(Token):
-    """ Represents a portion of string input
-    """
-
-    def __init__(self, text, startpos):
-        """ Create a token from the string in `text` starting at `startpos`.
-
-        Arguments:
-        - `text`: raw text to create a token from
-
-        - `startpos`: offset into `text` of the place to start looking for
-          characters from which to form a token
-        """
-        self._startpos = startpos
-        self._len = 0
-        self._str = u''
-
-        # skip leading whitespace other than newlines
-        while self._startpos < len(text) and \
-                  text[self._startpos].isspace() and \
-                  text[self._startpos] != u'\n':
-            self._startpos += 1
-
-        # if we stopped at a newline, consider it and all contiguous remaining
-        # whitespace (including other newlines) to be a single paragraph break.
-
-        if self._startpos >= len(text):
-            # there were no significant characters before the end of the string.
-            self._type = TokenType.EOF
-            self._str = u'*EOF*'
-        elif text[self._startpos] == u'\n':
-            self._type = TokenType.PARA
-            self._str = u'*PARA*'
-            self._len = 1
-            while (self._startpos + self._len) < len(text) and \
-                      text[self._startpos + self._len].isspace():
-                self._len += 1
-        else:
-            # here are some samples that should be recognized:
-            #  6.00am => '6:00 a.m.'
-            # 2 000 Kes => '2,000' 'KES'
-            next_char = text[self._startpos + self._len]
-            if next_char.isalpha():
-                self._init_alpha(text)
-            elif next_char.isdigit():
-                self._init_numeric(text)
-            else:
-                self._init_nonalnum(text)
-
-        if self._str == u'':
-            self._str = text[self._startpos:self._startpos + self._len]
-
-
-    def _abbrev_word_len(self, text):
-        """Return the length of the abbreviation found at _startpos, or 0 if
-        none was found.
-        """
-        return self._get_match_len(ABBREVIATIONS, text)
-
-    def _alphanum_match_len(self, text):
-        """Return the length of the first alphanumeric found in 'text' that
-        begins at self._startpos, or 0 if no matches are found.
-        """
-        return self._get_match_len(ALPHANUMS, text)
-
-    def _get_match_len(self, matches, text):
-        for m in matches:
-            match_obj = m.match(text, self._startpos)
-            if match_obj:
-                return len(match_obj.group())
-        return 0
-
-    def _init_alpha(self, text):
-        """ Finish initialization, given that text[self._startpos] is an alpha
-        character.
-        """
-        self._type = TokenType.ALPHA
-        while self._startpos + self._len < len(text):
-            next_char = text[self._startpos + self._len]
-            next_plus_1_char = self.peek(text, 1)
-            if next_char.isalpha():
-                # simplest case of additional alpha characters after the first
-                self._len += 1
-            elif next_char == u'.':
-                # Allow internal periods like "e.g." but don't be fooled by a
-                # missing space after a sentence-final period.
-                # Also allow abbreviations such as "Mrs."
-                new_len = self._abbrev_word_len(text)
-                if new_len != 0:
-                    self._len = new_len
-                # Either we found an abbreviation and now have its length, or we
-                # found a period which should not be included in the word. In
-                # both cases, we're done.
-                break
-            elif next_char in u"'-" and next_plus_1_char.isalpha():
-                # allow an internal dash like "each-other"
-                # allow an internal apostrophe like "isn't"
-                self._len += 2
-            else:
-                break
-
-    def _init_numeric(self, text):
-        """ Finish initialization, given that text[self._startpos] is a numeric
-        character.
-
-        Note the _type may change based on what follows the initial digit. For
-        example, the token "1st" will have _type ALPHANUM.
-        """
-        # Assume that this token, which starts with a digit, will be NUMERIC. It
-        # may turn out to be ALPHANUMERIC.
-        self._type = TokenType.NUMERIC
-
-        # Get the length of the longest integer match
-        mo = INTEGER_REGEX.match(text, self._startpos)
-        if mo:
-            self._len = len(mo.group())
-            # delete any spaces
-            no_spaces = mo.group().replace(u' ', u'')
-            if len(no_spaces) != len(mo.group()):
-                self._str = no_spaces
-
-        # now try United States style with comma delimiters
-        mo = US_DECIMAL_NUMBER_REGEX.match(text, self._startpos)
-        if mo and len(mo.group()) > self._len:
-            self._len = len(mo.group())
-
-        mo = EURO_DECIMAL_NUMBER_REGEX.match(text, self._startpos)
-        if mo and len(mo.group()) > self._len:
-            self._len = len(mo.group())
-            self._str = mo.group()
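-            # swap '.' and ',' through a placeholder so that euro-style
-            # u'1.234.567' becomes u'1,234,567'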
-            self._str = self._str.replace(u',', u'x')
-            self._str = self._str.replace(u'.', u',')
-            self._str = self._str.replace(u'x', u'.')
-
-        # check for the alphanumeric case ("1st", "22nd", ...) by looking at
-        # the character immediately after the matched digits
-        next_char = self.peek(text, 0)
-        if next_char.isalpha():
-            alnum_match_len = self._alphanum_match_len(text)
-            if alnum_match_len > self._len:
-                self._len = alnum_match_len
-                self._type = TokenType.ALPHANUM
-
-    def _init_nonalnum(self, text):
-        """ Finish initialization, given that text[self._startpos] is neither an
-        alpha nor a numeric character. In some cases symbols may be converted to
-        other strings, e.g. "/=" becomes currency abbreviation "UGX".
-        """
-        if text[self._startpos] == u'/' and self.peek(text, 1) == u'=':
-            self._type = TokenType.ALPHA
-            self._len = 2
-            self._str = u'UGX'
-        else:
-            self._type = TokenType.SYM
-            self._len = 1
-
-    def peek(self, text, lookahead):
-        """ Return the character at _startpos + _len + lookahead, or u'' if text
-        ends before there.
-        """
-        index = self._startpos + self._len + lookahead
-        if index < len(text):
-            return text[index]
-        return u''
-
-
-
-class InputTokenArray(object):
-    """ Create an array of InputToken objects which refer to specified text.
-    """
-
-    def __init__(self, text):
-        """ Fill the token array.
-
-        Arguments:
-        - `text`: the text to tokenize
-        """
-        # replace typographic apostrophes with the ASCII equivalent
-
-        if u"’" in text:
-            text = text.replace(u"’", u"'")
-
-        self.tokens = []
-        token_start = 0
-        while True:
-            new_token = InputToken(text, token_start)
-            self.tokens.append(new_token)
-
-            # if there was nothing (other than whitespace) before the end of
-            # the string, the token type will be EOF; keep it as a sentinel
-            # marking the end of the array, and finish.
-
-            if new_token.is_eof():
-                break
-
-            token_start = new_token._startpos + new_token._len
-
-    def __repr__(self):
-        return u'\n'.join([repr(t) for t in self.tokens])
-
-    def get_token_str(self, i):
-        """Return the string of ith token, or empty string if i is out of range.
-        """
-        if i >= 0 and i < len(self.tokens):
-            return unicode(self.tokens[i])
-        return u''
-
-
-
-def digit_group_callback(match_obj):
-    """Rejoin a matched digit group as '<leading digit>,<three digits>'."""
-    return match_obj.group(1) + u',' + match_obj.group(2)
-
-
-class OutputToken(Token):
-    """ Contains text and other features created from examining one or more input tokens.
-    """
-
-    def __init__(self, input_token):
-        """ Initialize the token.
-        """
-        self._type = input_token._type
-        self._str = input_token._str
-        if self._type == TokenType.NUMERIC:
-            # apply number formatting rules
-            self._format_number()
-
-    SPELLED_NUMBERS = [
-        u'one',
-        u'two',
-        u'three',
-        u'four',
-        u'five',
-        u'six',
-        u'seven',
-        u'eight',
-        u'nine']
-
-
-    def _format_number(self):
-        """ Apply numeric formatting rules to self._str.
-        """
-        # RULE: positive integers less than 10 should be spelled out.
-        try:
-            i = int(self._str)
-        except ValueError:
-            i = None
-
-        if u'.' in self._str:
-            # Number is either decimal or contains euro style thousands separators.
-
-            # Convert euro style thousands separators to American style.
-            self._str = re.sub(r'\.([0-9]{3})', r',\1', self._str)
-        elif i is not None and 0 < i < 10:
-            # spell out numbers less than 10
-            self._str = OutputToken.SPELLED_NUMBERS[i-1]
-            self._type = TokenType.ALPHA
-        elif u',' not in self._str and len(self._str) > 4:
-            # add thousands separators to numbers > 9999
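-            # each pass inserts a comma before the rightmost ungrouped three
-            # digits, e.g. u'12345678' -> u'12345,678' -> u'12,345,678'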
-            new_str = re.sub(ur'(\d)(\d{3})\b', digit_group_callback, self._str)
-            while new_str != self._str:
-                self._str = new_str
-                new_str = re.sub(ur'(\d)(\d{3})\b', digit_group_callback, self._str)
-
-
-
-class OutputTokenArray(object):
-    """ Create an array of OutputToken objects, which are the result of
-    processing an array of InputTokens.
-    """
-
-    def __init__(self):
-        self._output_tokens = []
-
-    def _init_from_ita(self, input_token_array, expand_currency):
-        """ Make a copy of the input token array. """
-        for it in input_token_array.tokens:
-            self._output_tokens.append(OutputToken(it))
-        self._process_output_tokens(expand_currency)
-
-    def _peek(self, index):
-        """ Return the item at _output_tokens[index] or an EOF token if the
-        index is out of range.
-        """
-        try:
-            result = self._output_tokens[index]
-        except IndexError:
-            # Out of range high: synthesize an EOF token from empty input.
-            # (A negative index wraps around to the end of the list, which is
-            # always the EOF token, so it behaves the same way.)
-            result = OutputToken(InputToken(u'', 0))
-        return result
-
-    def _process_output_tokens(self, expand_currency):
-        """ All processing of output tokens is launched from here.
-        """
-        i = 0
-
-        while i < len(self._output_tokens):
-            cur_token = self._output_tokens[i]
-
-            # look for <1-3 digit number> (<comma> <3 digit number>)+
-            # for example:
-            # 12 , 348 , 227 => 12,348,227
-            # but a sequence like 13, 34, 11 should not be changed
-
-            if (cur_token.is_numeric() and
-                len(cur_token._str) <= 3 and
-                cur_token._str != u'0'):
-
-                # combine as many <comma> <3 digit number> groups as possible
-                while (self._peek(i + 1)._str == u',' and
-                       self._peek(i + 2).is_numeric() and
-                       len(self._peek(i + 2)._str) == 3):
-                    self._output_tokens[i]._str += ',' + self._peek(i + 2)._str
-                    # delete tokens we just concatenated onto cur_token
-                    del self._output_tokens[i+1:i+3]
-
-            # change idiomatic currency abbreviations into the ISO version
-            if unicode(cur_token).upper() in alternate_currency_abbrev:
-                cur_token._str = alternate_currency_abbrev[unicode(cur_token).upper()]
-
-            # ensure currency abbreviations are capitalized
-            if unicode(cur_token).upper() in currency_display_names:
-                cur_token._str = cur_token._str.upper()
-            i += 1
-
-        # make another pass to change currency abbreviation order from before a
-        # number to after it. this is done as a separate pass to avoid changing
-        # "PHP 11, 000" to "11 PHP, 000".
-
-        i = 0
-        # if expansion is disabled, behave as though it has already happened
-        expanded_currency_abbrev = not expand_currency
-
-        while i < len(self._output_tokens):
-            cur_token = self._output_tokens[i]
-
-            # if a currency abbreviation appears before a number, swap it with
-            # the number, but make no change in this situation: 100 KES 200
-
-            if (unicode(cur_token).upper() in currency_display_names and
-                self._peek(i + 1).is_numeric() and
-                not self._peek(i - 1).is_numeric()):
-                cur_token._str = cur_token._str.upper()
-                self._output_tokens[i], self._output_tokens[i + 1] = \
-                    self._output_tokens[i + 1], self._output_tokens[i]
-                # refresh cur_token since it's different in the array now
-                cur_token = self._output_tokens[i]
-
-            # if the currency is USD (United States Dollars) handle it
-            # specially: both "USD 100" and "100 USD" should become $100 US (the
-            # expanded version) and simply $100 (for unexpanded). Note the
-            # previous code block has already swapped the currency abbreviation
-            # and the number, so both these cases become "100 USD".
-
-            if unicode(cur_token).upper() == u'USD':
-                # insert a '$' before preceding number
-                if self._peek(i - 1).is_numeric():
-                    self.insert_token('$', i - 1)
-                    # adjust current index to account for token we just inserted
-                    # behind it.
-                    i += 1
-                if expanded_currency_abbrev:
-                    del self._output_tokens[i]
-                    # avoid incrementing i
-                    continue
-                expanded_currency_abbrev = True
-                cur_token._str = u"US"
-
-            # expand the first currency abbreviation found, unless it seems to
-            # have already been done.
-
-            if (not expanded_currency_abbrev and
-                unicode(cur_token) in currency_display_names):
-                if not self._currency_expanded(i):
-                    currency_str = '{0} ({1})'.format(
-                        currency_display_names[unicode(cur_token)], unicode(cur_token))
-                    del self._output_tokens[i]
-                    # Insert the output tokens at the place where the single
-                    # abbreviated currency token was. Note the last output token
-                    # in the array is not inserted, because it is the EOF token.
-                    self.insert_token(currency_str, i)
-                expanded_currency_abbrev = True
-            i += 1
-
-    def _currency_expanded(self, i):
-        """ Return True if the next alpha token after i begins the display name
-        of the indicated currency.
-        """
-        # Find the next alpha token after i
-        j = i + 1
-        while j < len(self._output_tokens) and not self._output_tokens[j].is_alpha():
-            j += 1
-        # Display names are multi-word, while tokens never contain whitespace,
-        # so compare against the first word of the display name.
-        name_words = currency_display_names[self._output_tokens[i]._str].split()
-        if j >= len(self._output_tokens) or not name_words:
-            return False
-        return unicode(self._output_tokens[j]) == name_words[0]
-
-
-    def insert_token(self, s, i):
-        """Insert an output token constructed from string "s" at index "i".
-        """
-        ota = OutputTokenArray.from_text(s, False)
-        self._output_tokens[i:i] = ota._output_tokens[:-1]
-
-
-    @classmethod
-    def from_ita(cls, input_token_array, expand_currency=True):
-        """ Perform all the processing to create the output tokens.
-        """
-        obj = cls()
-        obj._init_from_ita(input_token_array, expand_currency)
-        return obj
-
-    @classmethod
-    def from_text(cls, text, expand_currency=True):
-        """ Perform all the processing to create the output tokens.
-        """
-        obj = cls()
-        obj._init_from_ita(InputTokenArray(text), expand_currency)
-        return obj
-
-    def _asterisk_at_bol(self, token, text):
-        """Return True if `token` is an asterisk at the beginning of a line of
-        the accumulated output `text`.
-        """
-        if unicode(token) != u'*':
-            return False
-        return text == u'*' or text[-2:] == u'\n*'
-
-    def generate_text(self):
-        """Return a string created by concatenating the output tokens together
-        in an appropriate way.
-        """
-        quote_stack = []
-        text = u''
-        for i, t in enumerate(self._output_tokens[:-1]):
-            # if we have a paragraph break, insert that and go on to next token
-            if t._type == TokenType.PARA:
-                text += u'\n\n'
-                continue
-
-            # append the non-paragraph-break token
-            text += unicode(t)
-
-            # now figure out if a space should follow it
-            append_space = True
-            peek_t = self._output_tokens[i+1]
-
-            if (t.is_open() or
-                t.is_currency_symbol() or
-                unicode(t) in u'-/' or
-                self._asterisk_at_bol(t, text) or
-                unicode(peek_t) in u'-/' or
-                peek_t.is_close() or
-                peek_t.is_punc() or
-                peek_t.is_eof()):
-                append_space = False
-            elif t.is_quote():
-                if quote_stack and quote_stack[-1] == t._str:
-                    # space after close quote
-                    quote_stack.pop()
-                else:
-                    # no space after open quote
-                    quote_stack.append(t._str)
-                    append_space = False
-
-            if append_space:
-                text += u' '
-        return text
-
-
-
-class EditAssistant(object):
-    """ Run the full pipeline over `infile`: regex preprocessing, tokenization,
-    and token processing. The result is stored in `edited_text`.
-    """
-    def __init__(self, infile):
-        change_list = []
-        regex_processed_text = regex_process.process_file(infile, change_list)
-        self.input_tokens = InputTokenArray(regex_processed_text)
-        self.output_tokens = OutputTokenArray.from_ita(self.input_tokens)
-        self.edited_text = self.output_tokens.generate_text()
-
-
-def main():
-    """ Process text from either stdin, the clipboard, the command line,
-    or a specified file. Apply various formatting rules designed to fix common
-    errors in Kiva loan descriptions.
-    """
-    # Create a command-line parsing object
-    parser = argparse.ArgumentParser(
-        description="Attempt to correct errors commonly found in Kiva loan descriptions.")
-    # Tell the parser about all the arguments this program supports.
-    parser.add_argument('-c', '--clipboard', action = 'store_true',
-                        help = "Use the contents of the clipboard instead of a file."
-                        " If this option is specified, then --infile is ignored.")
-    parser.add_argument('-i', '--infile', default = sys.stdin,
-                        help = "The UTF-8 encoded file to read (defaults to stdin).")
-    parser.add_argument('-o', '--outfile', default = sys.stdout,
-                        help = "The UTF-8 encoded file to write (defaults to stdout).")
-    parser.add_argument('-t', '--test', nargs='*',
-                        help = "Process the TEST strings instead of an input file."
-                        " If this option is specified, then --infile and --clipboard are ignored.")
-    parser.add_argument('-d', '--debug', action = 'store_true',
-                        help = "Print the raw argument list and exit.")
-    # Parse the command line
-    args = parser.parse_args()
-
-    # get an output file handle
-    if isinstance(args.outfile, basestring):
-        # The user supplied a filename. Open it.
-        outfile = codecs.open(args.outfile, 'w', 'utf-8')
-    else:
-        # The user didn't supply a filename, so use stdout. Since
-        # we'll be handling Unicode text, use codecs to write it out
-        # as utf-8 encoded.
-        outfile = codecs.getwriter('utf-8')(sys.stdout)
-
-    if args.debug:
-        print args
-    elif args.test:
-        # the test option supersedes other input modes
-        if not isinstance(args.test, basestring):
-            args.test = u' '.join(args.test)
-        outfile.write(EditAssistant(StringIO.StringIO(args.test)).edited_text)
-        outfile.write('\n')
-    elif args.clipboard:
-        edit_assistant = EditAssistant(StringIO.StringIO(get_clipboard_text()))
-        set_clipboard_text(edit_assistant.edited_text)
-    else:
-        # If args.infile is a string, treat it as a filename and
-        # assume it is encoded in utf8. Otherwise it should be the
-        # default, which is sys.stdin. sys.stdin needs to be decoded
-        # into unicode.
-        if isinstance(args.infile, basestring):
-            infile = codecs.open(args.infile, 'r', 'utf-8')
-        else:
-            infile = codecs.getreader('utf-8')(sys.stdin)
-        outfile.write(EditAssistant(infile).edited_text)
-    sys.exit(0)
-
-
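-# Typical invocations:
-#   python kea.py -i loan.txt -o fixed.txt
-#   python kea.py --clipboard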
-if __name__ == '__main__':
-    logfile = tempfile.NamedTemporaryFile()
-    logging.basicConfig(stream=logfile, level=logging.DEBUG)
-    main()