Source

Kiva Editor's Assistant / kea.py

#!/usr/bin/env python
"""
Kiva Editor's Assistant automatically corrects grammatical and stylistic
errors commonly found in Kiva loan descriptions.
"""

import argparse
import StringIO
import codecs
import logging
import sys
import re

from clipboard import get_clipboard_text, set_clipboard_text
from keatoken import Token
import rules
import tagger
import keaparser


class Sentence(object):
    """Acts as a container of Tokens which has a parse attribute."""
    def __init__(self, ea, start_idx, end_idx):
        self.ea = ea
        # Because methods in rules.py and elsewhere may insert or delete
        # tokens within and between sentences, the integer indexes of
        # the start and end delimiters of the sentences will change.
        #
        # Therefore this class keeps track of the sentence delimiting
        # tokens themselves, and computes their indexes on demand.
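        #
        # For example, if a rule later inserts a token ahead of this
        # sentence, self.ea.tokens.index(self.start_token) (see the
        # start_idx property below) yields the shifted position with no
        # extra bookkeeping required here.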
        self.start_token = self.ea.tokens[start_idx]
        assert self.start_token.bos
        self.end_token = self.ea.tokens[end_idx]
        assert self.end_token.eos
        # Since parsing is very expensive (some sentences can take
        # minutes to parse) it is done on-demand only.
        self._parse = None

    @property
    def start_idx(self):
        """Return the 0-based index into self.ea.tokens of this
        sentence's beginning-of-sentence delimiter token.
        """
        return self.ea.tokens.index(self.start_token)

    @property
    def end_idx(self):
        """Return the 0-based index into self.ea.tokens of this
        sentence's end-of-sentence delimiter token.
        """
        return self.ea.tokens.index(self.end_token)

    @property
    def parse(self):
        """Return the parse tree for this sentence.
        """
        if self._parse is None:
            self._parse = self.ea.parser.parse(
                self.ea.tokens[self.start_idx + 1:self.end_idx])
        return self._parse

    def find_sequence(self, item):
        """Search the sentence for a consecutive run of tokens whose
        strings match the supplied regular expression(s).

        item may be either a single space-separated string of patterns
        or a list of patterns. Return the index into self.ea.tokens of
        the first token of the matching run, or None if no run matches.
        """
        if isinstance(item, basestring):
            match_list = item.split()
        else:
            match_list = item
        sentence_idx = self.start_idx
        end_idx = self.end_idx
        match_index = 0
        while sentence_idx < end_idx:
            if re.match(match_list[match_index],
                        self.ea.tokens[sentence_idx].str):
                match_index += 1
                if match_index == len(match_list):
                    # Found all the tokens in match_list consecutively
                    # appearing in self.ea.tokens. sentence_idx still
                    # points at the last matched token, so back up by
                    # len(match_list) - 1 to return the index of the
                    # first token of the sequence.
                    return sentence_idx - len(match_list) + 1
            elif match_index:
                # A partial match just failed. Reset the pattern index
                # and re-examine the current token, which could begin a
                # new match, before advancing.
                match_index = 0
                continue
            sentence_idx += 1
        return None
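
    # Illustrative (hypothetical) use of find_sequence; each
    # space-separated pattern is a regular expression matched against
    # one token string:
    #
    #     idx = sentence.find_sequence(u'is a widow(ed)?')
    #     if idx is not None:
    #         first_token = sentence.ea.tokens[idx]  # matched u'is'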


class EditAssistant(object):
    def __init__(self, infile):
        """Process the input file and generate an output string."""
        # create a sentinel end-of-file token
        eof_token = Token(u'*EOF*')
        eof_token.eof = True
        # start the tokens array with one Token object that contains
        # all the text, followed by the sentinel.
        self._original_text = infile.read()
        self.tokens = [Token(self._original_text, 0,
                             len(self._original_text)), eof_token]
        self.sentences = []
        self.parser = keaparser.Parser()
        # apply first phase rules to replace the original Token object
        # with multiple Token objects, one for each bit of the input
        # text that qualifies as a single input token.
        self._process_tokens(rules.INITIAL_PHASE)
        # Add a part-of-speech property to all the tokens
        tagger.tag_tokens(self.tokens)
        # now apply rules/transforms that make use of the POS
        # properties. This includes the DelimitSentencesRule which
        # inserts non-printing tokens that mark sentence boundaries.
        self._process_tokens(rules.POS_PHASE)
        self._dump_tokens()
        # for each sentence, generate a parse tree
        self._make_sentences()
        # now apply rules that require sentence parses
        self._process_tokens(rules.PARSED_PHASE)
        self._generate_output()
        self._report_changes()

    def _asterisk_at_bol(self, token):
        """Return True if token is an asterisk that begins a line of
        the output generated so far, in which case no space should
        follow it.
        """
        return (token.str == u'*' and
                (self.edited_text == u'*' or
                 self.edited_text.endswith(u'\n*')))

    def _dump_tokens(self):
        for t in self.tokens:
            print t,
            if t.str == '\n':
                print
        print

    def _generate_output(self):
        quote_stack = []
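        # quote_stack records quote characters that are currently open:
        # a quote token matching the most recent entry is treated as a
        # closing quote (popped, with a space appended after it), while
        # any other quote token is treated as an opening quote (pushed,
        # with no space appended after it).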
        self.edited_text = u''
        for i, token in enumerate(self.tokens[:-1]):
            # if we have a paragraph break, insert that and go on to next token
            if token.is_para:
                self.edited_text += u'\n\n'
                continue
            # skip non-printing tokens
            if token.non_printing:
                continue
            self.edited_text += token.str

            # now figure out if a space should follow it
            append_space = True
            next_token = self.tokens[i + 1]

            if (token.is_open or
                    token.is_currency_symbol or
                    token.str in u'-/' or
                    self._asterisk_at_bol(token) or
                    next_token.str in u'-/*' or
                    next_token.is_close or
                    next_token.is_nonspacing_punc or
                    next_token.is_eof):
                append_space = False
            elif token.is_quote:
                if quote_stack and quote_stack[-1] == token.str:
                    # space after close quote
                    quote_stack.pop()
                else:
                    # no space after open quote
                    quote_stack.append(token.str)
                    append_space = False

            if append_space:
                self.edited_text += u' '
        self.edited_text = self.edited_text.strip()
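
        # End-to-end spacing example (illustrative, assuming the Token
        # class flags u'US$' as is_currency_symbol and u'.' as
        # is_nonspacing_punc): the tokens
        #     [u'She', u'paid', u'US$', u'500', u'.']
        # render as u'She paid US$500.'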

    def _make_sentences(self):
        """Populate the self.sentences array with Sentence objects.
        """
        sentence_start_idx = 0
        sentence_end_idx = None
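        # Illustrative token stream; the bos/eos delimiters are the
        # non-printing tokens inserted by the DelimitSentencesRule:
        #
        #     [... <bos>, u'She', u'farms', u'.', <eos>, <bos>, ...]
        #
        # The outer loop below scans for each bos token, and the inner
        # loop then scans ahead for its matching eos token.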
        while sentence_start_idx < len(self.tokens):
            # The next sentence starts with a non-printing token that
            # has the beginning of sentence property (bos) set.
            while (sentence_start_idx < len(self.tokens) and
                   not self.tokens[sentence_start_idx].bos):
                sentence_start_idx += 1

            # If we couldn't find the start of the next sentence, stop
            # looking for sentences.
            if sentence_start_idx >= len(self.tokens):
                break

            # The end of the sentence must be beyond the starting token.
            sentence_end_idx = sentence_start_idx + 1

            # move the end index to the right until the end of sentence
            # delimiting token is found.
            while sentence_end_idx < len(self.tokens):
                # if we've found a delimiting token, make a
                # sentence, then break out of this inner while loop to
                # start searching for the start of the next sentence.
                if self.tokens[sentence_end_idx].eos:
                    self.sentences.append(
                        Sentence(self, sentence_start_idx, sentence_end_idx))
                    sentence_start_idx = sentence_end_idx + 1
                    break
                # no delimiter yet, keep looking for end
                sentence_end_idx += 1

    def _process_tokens(self, phase):
        rules_to_run = rules.get_rules(phase)
        while True:
            logging.debug('***calling %s rules', rules.PHASE_NAME[phase])
            self._dump_tokens()
            changed = False
            for rule in rules_to_run:
                if rule.enabled and rule.apply(self):
                    changed = True
                    break
            # If no changes were made, we're done changing tokens and
            # it's time to generate the output text.
            if not changed:
                break
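
    # A minimal sketch of the rule interface that _process_tokens
    # assumes (the real definitions live in rules.py, not shown here):
    # each rule has an `enabled` flag and an apply(edit_assistant)
    # method that returns True when it changed the token stream, so
    # the loop above runs until a full pass makes no changes. The rule
    # below is hypothetical:
    #
    #     class ExampleRule(object):
    #         enabled = True
    #         def apply(self, ea):
    #             changed = False
    #             for tok in ea.tokens:
    #                 if not tok.non_printing and tok.str == u'childs':
    #                     tok.str = u'children'
    #                     changed = True
    #             return changed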

    def _report_changes(self):
        """Write a description of all significant changes."""
        print "Change report:"
        # TODO: report runs of changes instead of each individual token
        # changed or inserted.
        for token in self.tokens:
            # skip Token objects used internally by the program; these
            # do not represent input text.
            if token.non_printing:
                continue
            # token contains text; if its cbegin==cend then it didn't
            # appear in the original text and was inserted.
            if token.cbegin == token.cend:
                print u'Inserted "{}"'.format(token.str)
            elif self._original_text[token.cbegin:token.cend] != token.str:
                print u'Changed "{}" to "{}"'.format(
                    self._original_text[token.cbegin:token.cend], token.str)
        print

    def dump_pos_tags(self):
        """Write every token with a Part-Of-Speech tag to stdout."""
        for token in self.tokens:
            if token.pos:
                if len(token.pos) > 1:
                    sys.stdout.write(token.str + u'/[')
                    first_element = True
                    for postag in token.pos:
                        if not first_element:
                            sys.stdout.write(', ')
                        sys.stdout.write(postag.pos)
                        first_element = False
                    sys.stdout.write('] ')
                else:
                    sys.stdout.write(
                        u'{}/{} '.format(token.str, token.pos[0].pos))
            if token.str == '\n':
                sys.stdout.write('\n')
        if self.tokens:
            sys.stdout.write('\n')


def parse_commandline():
    """Return an argparse parse of the command line
    """
    # Create a command-line parsing object
    parser = argparse.ArgumentParser(
        description="Attempt to correct errors commonly found in Kiva "
        "loan descriptions.")
    # Tell the parser about all the arguments this program supports.
    parser.add_argument(
        '-a', '--arg-dump', dest='arg_dump', action='store_true',
        help="Print the raw argument list and exit.")

    parser.add_argument(
        '-c', '--clipboard', action='store_true',
        help="Use the contents of the clipboard instead of a file. "
        "If this option is specified, then --infile is ignored.")

    parser.add_argument(
        '-i', '--infile', default=sys.stdin,
        help="The UTF-8 encoded file to read, (defaults to stdin).")

    parser.add_argument(
        '-l', '--log-to-stdout', dest='log_to_stdout', action='store_true',
        help="Write logging to stdout rather than kea.log.")

    parser.add_argument(
        '-o', '--outfile', default=sys.stdout,
        help="The UTF-8 encoded file to write (defaults to stdout).")

    parser.add_argument(
        '-s', '--show-pos', dest='show_pos', action='store_true',
        help="Print the tagged tokens and exit.")

    parser.add_argument(
        '-t', '--test', nargs='*',
        help="Process the TEST strings instead of an input file. If this "
        "option is specified, then --infile and --clipboard are ignored.")

    # Parse the command line and return it
    return parser.parse_args()
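
# Example invocations (illustrative):
#
#   python kea.py -i loan.txt -o fixed.txt    # file to file
#   python kea.py -c                          # rewrite the clipboard
#   python kea.py -t "she have three childs"  # process a test string
#   python kea.py -s -t "She farms."          # also show POS tags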


def main():
    """ Process text from either stdin, the clipboard, the command line,
    or a specified file. Apply various formatting rules designed to fix common
    errors in Kiva loan descriptions.
    """

    args = parse_commandline()

    # get an output file handle to either a user-supplied file name or
    # stdout.
    if isinstance(args.outfile, basestring):
        # The user supplied a filename. Open it.
        outfile = codecs.open(args.outfile, 'w', 'utf-8')
    else:
        # The user didn't supply a filename, so use stdout. Since
        # we'll be handling Unicode text, use codecs to write it out
        # as utf-8 encoded.
        outfile = codecs.getwriter('utf-8')(sys.stdout)

    # Use this to hold an instance of an EditAssistant object, which can
    # be created in a variety of ways (or not at all) depending on
    # command-line arguments.
    edit_assistant = None

    # Initialize logging; send it to a file unless otherwise directed by
    # command-line arguments.
    root_logger = logging.getLogger()
    root_logger.setLevel(logging.DEBUG)
    if not args.log_to_stdout:
        handler = logging.FileHandler("kea.log", "w", encoding="UTF-8")
        formatter = logging.Formatter("%(message)s")
        handler.setFormatter(formatter)
        root_logger.addHandler(handler)


    if args.arg_dump:
        print args
    else:
        if args.test:
            # the test option supersedes other input modes
            if not isinstance(args.test, basestring):
                args.test = u' '.join(args.test)
            edit_assistant = EditAssistant(
                StringIO.StringIO(unicode(args.test)))
            outfile.write(edit_assistant.edited_text)
            outfile.write('\n')
        elif args.clipboard:
            edit_assistant = EditAssistant(
                StringIO.StringIO(get_clipboard_text()))
            set_clipboard_text(edit_assistant.edited_text)
        else:
            # If args.infile is a string, treat it as a filename and
            # assume it is encoded in utf8. Otherwise it should be the
            # default, which is sys.stdin. sys.stdin needs to be decoded
            # into unicode.
            if isinstance(args.infile, basestring):
                infile = codecs.open(args.infile, 'r', 'utf-8')
            else:
                infile = codecs.getreader('utf-8')(sys.stdin)
            edit_assistant = EditAssistant(infile)
            outfile.write(edit_assistant.edited_text)

        if args.show_pos and edit_assistant:
            edit_assistant.dump_pos_tags()
    sys.exit(0)

if __name__ == '__main__':
    main()