Kiva Editor's Assistant / kea.py
#!/usr/bin/env python
"""
Kiva Editor's Assistant automatically corrects grammatical and stylistic
errors commonly found in Kiva loan descriptions.
"""

import argparse
import StringIO
import codecs
import logging
import sys

from clipboard import get_clipboard_text, set_clipboard_text
from mytoken import Token
import rules
import tagger
import myparser


class Sentence(object):
    """Acts as a container of Tokens which has a parse attribute."""
    def __init__(self, ea, start_idx, end_idx, parse=None):
        self.ea = ea
        self.start = start_idx
        self.end = end_idx
        self.parse = parse

    def find_sequence(self, item):
        """Support set membership operator "in" for strings, Tokens, and arrays
        of strings or Tokens."""
        if isinstance(item, basestring):
            match_list = item.split()
        else:
            match_list = item
        sentence_idx = self.start
        match_index = 0
        while sentence_idx < self.end:
            if self.ea.tokens[sentence_idx].str != match_list[match_index]:
                # On a mismatch after a partial match, back up so the
                # scan resumes just past the token where the partial
                # match began; otherwise overlapping candidates (e.g.
                # "a a b" within "a a a b") would be missed.
                sentence_idx -= match_index
                match_index = 0
            else:
                match_index += 1
                if match_index == len(match_list):
                    # Found all the tokens in match_list consecutively
                    # appearing in self.ea.tokens. Return the index into
                    # self.ea.tokens of the first token of that
                    # sequence. Since sentence_idx has not yet been
                    # incremented for the last item matched, it hasn't
                    # advanced the entire length of match_list, hence
                    # the + 1 here.
                    return sentence_idx - len(match_list) + 1
            sentence_idx += 1
        return None
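
    # Usage sketch (hypothetical text; assumes an EditAssistant `ea` has
    # already been built and its sentences populated):
    #
    #     sent = ea.sentences[0]
    #     idx = sent.find_sequence(u'is a farmer')
    #     if idx is not None:
    #         first = ea.tokens[idx]   # Token for u'is'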


class EditAssistant(object):
    def __init__(self, infile):
        """Process the input file and generate an output string."""
        # create a sentinel end-of-file token
        eof_token = Token(u'*EOF*')
        eof_token.eof = True
        # start the tokens array with one Token object that contains all
        # the input text, followed by the sentinel.
        self._original_text = infile.read()
        self.tokens = [Token(self._original_text, 0,
                              len(self._original_text)), eof_token]
        self.sentences = []
        self._parser = myparser.Parser()
        # apply first phase rules to replace the original Token object
        # with multiple Token objects, one for each bit of the input
        # text that qualifies as a single input token.
        self._process_tokens(rules.INITIAL_PHASE)
        # Add a part-of-speech property to all the tokens
        tagger.tag_tokens(self.tokens)
        # now apply rules/transforms that make use of the POS
        # properties. This includes the DelimitSentencesRule which
        # inserts non-printing tokens that mark sentence boundaries.
        self._process_tokens(rules.POS_PHASE)
        # for each sentence, generate a parse tree
        self._parse_sentences()
        # now apply rules that require sentence parses
        self._process_tokens(rules.PARSED_PHASE)
        self._generate_output()
        self._report_changes()

    def _asterisk_at_bol(self, token):
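        """Return True if token is a '*' that begins a line of output:
        the asterisk just appended to edited_text is either the very
        first character of the output or directly follows a newline.
        _generate_output uses this to omit the space after a
        line-initial asterisk."""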
        return (token.str == u'*' and
                (self.edited_text == u'*' or
                 len(self.edited_text) >= 2 and
                 self.edited_text[-2:] == u'\n*'))

    def _dump_sentences(self):
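        """Debugging aid: print the tokens, starting a new line at each
        sentence delimiter."""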
        for t in self.tokens:
            print t.str,
            if t.sentence_delim:
                print

    def _generate_output(self):
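        """Build self.edited_text from the printing tokens, separating
        them with single spaces except where punctuation, currency
        symbols, open quotes, or line-initial asterisks suppress the
        space."""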
        quote_stack = []
        self.edited_text = u''
        for i, token in enumerate(self.tokens[:-1]):
            # if we have a paragraph break, insert that and go on to next token
            if token.is_para:
                self.edited_text += u'\n\n'
                continue
            # skip non-printing tokens
            if token.non_printing:
                continue
            self.edited_text += token.str

            # now figure out if a space should follow it
            append_space = True
            next_token = self.tokens[i + 1]

            if (token.is_open or
                    token.is_currency_symbol or
                    token.str in u'-/' or
                    self._asterisk_at_bol(token) or
                    next_token.str in u'-/*' or
                    next_token.is_close or
                    next_token.is_nonspacing_punc or
                    next_token.is_eof):
                append_space = False
            elif token.is_quote:
                if quote_stack and quote_stack[-1] == token.str:
                    # space after close quote
                    quote_stack.pop()
                else:
                    # no space after open quote
                    quote_stack.append(token.str)
                    append_space = False

            if append_space:
                self.edited_text += u' '
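
        # Spacing sketch: given the tokens u'She', u'borrowed', u'$',
        # u'500', u'.' (and assuming Token flags u'$' with
        # is_currency_symbol and u'.' with is_nonspacing_punc), the
        # result is u'She borrowed $500.' -- no space after the currency
        # symbol and none before the period.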

    def _parse_sentences(self):
        # for each range of tokens representing a sentence, generate a
        # parse tree.
        sentence_start_idx = 0
        while sentence_start_idx < len(self.tokens):
            # The next sentence starts with the first printing token
            # that is not a paragraph marker.
            while (sentence_start_idx < len(self.tokens) and
                   (self.tokens[sentence_start_idx].is_para or
                    self.tokens[sentence_start_idx].non_printing)):
                sentence_start_idx += 1

            # If we couldn't find the start of the next sentence, stop
            # looking for sentences.
            if sentence_start_idx >= len(self.tokens):
                break

            # The end of the sentence must be beyond the starting token.
            sentence_end_idx = sentence_start_idx + 1

            # move the end index to the right until something that
            # delimits sentences is found.
            while sentence_end_idx < len(self.tokens):
                cur_token = self.tokens[sentence_end_idx]
                # if we've found a delimiting token, make a sentence,
                # then break out of this inner while loop and go back to
                # searching for the start of the next sentence.
                if cur_token.non_printing or cur_token.is_para:
                    self.sentences.append(
                        Sentence(self, sentence_start_idx, sentence_end_idx))
                    sentence_start_idx = sentence_end_idx + 1
                    break
                # no delimiter yet, keep looking for the end
                sentence_end_idx += 1
            else:
                # Reached the end of the token stream without a
                # delimiter. Normally the *EOF* sentinel (assumed to be
                # non-printing) ends the last sentence; this fallback
                # treats any unterminated tail as the final sentence
                # rather than looping forever.
                self.sentences.append(
                    Sentence(self, sentence_start_idx, sentence_end_idx))
                break
        for sent in self.sentences:
            sent.parse = self._parser.parse(self.tokens[sent.start:sent.end])
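
        # Sketch: for the token stream
        #     ['He', 'farms', '.', <delim>, 'She', 'sews', '.', <delim>, *EOF*]
        # where <delim> is a non-printing marker inserted by the
        # DelimitSentencesRule, this produces Sentence(start=0, end=3)
        # and Sentence(start=4, end=7).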

    def _process_tokens(self, phase):
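        """Apply the enabled rules of the given phase repeatedly until a
        complete pass makes no change to the token stream."""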
        rules_to_run = rules.get_rules(phase)
        while True:
            logging.debug('***calling %s rules', rules.PHASE_NAME[phase])
            changed = False
            for rule in rules_to_run:
                if rule.enabled and rule.apply(self):
                    changed = True
                    break
            # If a complete pass applied no rule, this phase has reached
            # a fixed point and we're done.
            if not changed:
                break
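
    # What _process_tokens expects of a rule (a sketch; the real classes
    # live in rules.py, and ExampleRule here is purely illustrative):
    #
    #     class ExampleRule(object):
    #         enabled = True
    #
    #         def apply(self, ea):
    #             """Inspect or mutate ea.tokens; return True iff a
    #             change was made."""
    #             return False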

    def _report_changes(self):
        """Write a description of all significant changes."""
        print "Change report:"
        # TODO: report runs of changes instead of each individual token
        # changed or inserted.
        for token in self.tokens:
            # skip Token objects used internally by the program; these
            # do not represent input text.
            if token.non_printing:
                continue
            # token contains text; if its cbegin==cend then it didn't
            # appear in the original text and was inserted.
            if token.cbegin == token.cend:
                print "inserted", token.str
            elif self._original_text[token.cbegin:token.cend] != token.str:
                print u'Changed "{}" to "{}"'.format(
                    self._original_text[token.cbegin:token.cend], token.str)
        print

    def dump_pos_tags(self):
        """Write every token with a Part-Of-Speech tag to stdout."""
        for token in self.tokens:
            if hasattr(token, 'pos') and token.pos:
                if len(token.pos) > 1:
                    sys.stdout.write(token.str + u'/[')
                    first_element = True
                    for postag in token.pos:
                        if not first_element:
                            sys.stdout.write(', ')
                        sys.stdout.write(postag.pos)
                        first_element = False
                    sys.stdout.write(']\n')
                else:
                    print u'{}/{}'.format(token.str, token.pos[0]),
            if token.str == '\n':
                print


def parse_commandline():
    """Return an argparse parse of the command line
    """
    # Create a command-line parsing object
    parser = argparse.ArgumentParser(
        description="Attempt to correct errors commonly found in Kiva "
        "loan descriptions.")
    # Tell the parser about all the arguments this program supports.
    parser.add_argument(
        '-a', '--arg-dump', dest='arg_dump', action='store_true',
        help="Print the raw argument list and exit.")

    parser.add_argument(
        '-c', '--clipboard', action='store_true',
        help="Use the contents of the clipboard instead of a file. "
        "If this option is specified, then --infile is ignored.")

    parser.add_argument(
        '-i', '--infile', default=sys.stdin,
        help="The UTF-8 encoded file to read, (defaults to stdin).")

    parser.add_argument(
        '-l', '--log-to-stdout', dest='log_to_stdout', action='store_true',
        help="Write logging to stdout rather than kea.log.")

    parser.add_argument(
        '-o', '--outfile', default=sys.stdout,
        help="The UTF-8 encoded file to write (defaults to stdout).")

    parser.add_argument(
        '-s', '--show-pos', dest='show_pos', action='store_true',
        help="Print the tagged tokens and exit.")

    parser.add_argument(
        '-t', '--test', nargs='*',
        help="Process the TEST strings instead of an input file. If this "
        "option is specified, then --infile and --clipboard are ignored.")

    # Parse the command line and return it
    return parser.parse_args()
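
# Typical invocations (assuming this file is run as kea.py):
#
#     python kea.py -i loan.txt -o fixed.txt
#     python kea.py --clipboard
#     python kea.py -t "She have three childs." --show-pos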


def main():
    """ Process text from either stdin, the clipboard, the command line,
    or a specified file. Apply various formatting rules designed to fix common
    errors in Kiva loan descriptions.
    """

    args = parse_commandline()

    # get an output file handle to either a user-supplied file name or
    # stdout.
    if isinstance(args.outfile, basestring):
        # The user supplied a filename. Open it.
        outfile = codecs.open(args.outfile, 'w', 'utf-8')
    else:
        # The user didn't supply a filename, so use stdout. Since
        # we'll be handling Unicode text, use codecs to write it out
        # as utf-8 encoded.
        outfile = codecs.getwriter('utf-8')(sys.stdout)

    # Use this to hold an instance of an EditAssistant object, which can
    # be created in a variety of ways (or not at all) depending on
    # command-line arguments.
    edit_assistant = None

    # Initialize logging; send it to a file unless otherwise directed by
    # command-line arguments.
    root_logger = logging.getLogger()
    root_logger.setLevel(logging.DEBUG)
    if not args.log_to_stdout:
        handler = logging.FileHandler("kea.log", "w", encoding="UTF-8")
        formatter = logging.Formatter("%(message)s")
        handler.setFormatter(formatter)
        root_logger.addHandler(handler)


    if args.arg_dump:
        print args
    else:
        if args.test:
            # the test option supersedes the other input modes
            if not isinstance(args.test, basestring):
                args.test = u' '.join(args.test)
            edit_assistant = EditAssistant(
                StringIO.StringIO(unicode(args.test)))
            outfile.write(edit_assistant.edited_text)
            outfile.write('\n')
        elif args.clipboard:
            edit_assistant = EditAssistant(
                StringIO.StringIO(get_clipboard_text()))
            set_clipboard_text(edit_assistant.edited_text)
        else:
            # If args.infile is a string, treat it as a filename and
            # assume it is encoded in utf8. Otherwise it should be the
            # default, which is sys.stdin. sys.stdin needs to be decoded
            # into unicode.
            if isinstance(args.infile, basestring):
                infile = codecs.open(args.infile, 'r', 'utf-8')
            else:
                infile = codecs.getreader('utf-8')(sys.stdin)
            edit_assistant = EditAssistant(infile)
            outfile.write(edit_assistant.edited_text)

        if args.show_pos and edit_assistant:
            edit_assistant.dump_pos_tags()
    sys.exit(0)

if __name__ == '__main__':
    main()