Kiva Editor's Assistant / kea.py

#!/usr/bin/env python
"""
Kiva Editor's Assistant automatically corrects grammatical and stylistic
errors commonly found in Kiva loan descriptions.
"""

import argparse
import StringIO
import codecs
import logging
import os
import sys
import tempfile


from clipboard import get_clipboard_text, set_clipboard_text
from base import Token
import rules


class EditAssistant(object):
    """Apply the editing rules to a text stream and render the result.

    Construction does all of the work; afterwards the corrected text is
    available in the ``edited_text`` attribute.
    """

    def __init__(self, infile):
        """Process the input file and generate an output string.

        Args:
            infile: A file-like object. The whole result of ``infile.read()``
                becomes the single initial Token handed to the rules.
        """
        # The initial token list is the entire input followed by an
        # end-of-file marker token.
        # NOTE(review): ``eof`` is set here while _generate_output() reads
        # ``is_eof``; presumably base.Token derives one from the other --
        # confirm.
        eof_token = Token(u'')
        eof_token.eof = True
        self._tokens = [Token(infile.read()), eof_token]
        self._process_tokens(infile)
        self._generate_output()

    def _asterisk_at_bol(self, token):
        """Return True if `token` is a u'*' sitting at the start of a line.

        This inspects the tail of ``self.edited_text``, so it must be called
        after `token` has already been appended to the output.
        """
        return (token.str == u'*' and
                (self.edited_text == u'*' or
                 self.edited_text.endswith(u'\n*')))

    @staticmethod
    def _update_quote_stack(quote_stack, quote):
        """Record the quote string `quote` on `quote_stack`.

        A quote equal to the top of the stack closes that quotation and is
        popped; any other quote opens a new quotation and is pushed.

        Returns:
            True if `quote` opened a quotation, False if it closed one.
        """
        if quote_stack and quote_stack[-1] == quote:
            quote_stack.pop()
            return False
        quote_stack.append(quote)
        return True

    def _generate_output(self):
        """Join the tokens into ``self.edited_text`` with correct spacing.

        Every token except the last one in the list is emitted; the last
        token only serves as look-ahead for the one before it.
        """
        quote_stack = []
        self.edited_text = u''
        for i, token in enumerate(self._tokens[:-1]):
            next_token = self._tokens[i + 1]

            # if we have a paragraph break, insert that and go on to
            # next token
            if token.is_para:
                if not next_token.is_eof:
                    self.edited_text = self.edited_text.rstrip() + u'\n\n'
                continue
            # skip non-printing tokens
            if token.non_printing:
                continue
            self.edited_text += token.str

            # now figure out how many spaces should follow it
            append_space = ' '

            if (token.is_open or
                    token.is_currency_symbol or
                    token.str in u'-/' or
                    self._asterisk_at_bol(token) or
                    next_token.str in '-/*' or
                    next_token.is_close or
                    next_token.is_nonspacing_punc or
                    next_token.is_eof):
                append_space = ''
                # The spacing is already decided, but a quote must still be
                # tracked. Otherwise a closing quote followed by e.g. a
                # comma stays on the stack and every later quote is paired
                # with the wrong partner.
                if token.is_quote:
                    self._update_quote_stack(quote_stack, token.str)
            elif (next_token.is_quote and
                  quote_stack and
                  quote_stack[-1] == next_token.str):
                # no space before close quote
                append_space = ''
            elif token.is_quote:
                # space after close quote, no space after open quote
                if self._update_quote_stack(quote_stack, token.str):
                    append_space = ''

            self.edited_text += append_space

    def _process_tokens(self, infile):
        """Repeatedly apply the winning rule transform until none remain.

        Args:
            infile: Unused; kept so the signature stays unchanged.
        """
        all_rules = rules.get_rules()

        #
        # The main loop.
        #
        # Each rule in a master list of Rules is supplied with the Token list
        # so that it can generate a list of Transform objects. If no Rule
        # produces at least one Transform object, the main loop exits and the
        # final list of Tokens is transformed into the program's output text.
        #
        # Only one change to any given Token can be made in each iteration.
        # Therefore, if more than one Transform object wishes to touch the
        # same Token, there must be a way to pick the single one to apply for
        # the current iteration.
        #
        # This is done by choosing the Transform object that was created by
        # the Rule object having the highest `score`. If there is a tie in
        # `score`, then the lowest `id` wins.
        #
        # Note that this loop applies exactly one Transform per iteration:
        # the single overall winner as decided by Transform.beats(). All
        # other candidates are discarded and regenerated on the next pass.
        #
        # Since the entire rule set is applied iteratively until no more
        # Transform objects are created, Rules must be written so that they do
        # not produce Transform objects indefinitely.
        #
        # Similarly, any set of rules which each transform the tokens so that
        # the resulting transformation of one rule matches the predicate of
        # another will cause an infinite loop.  In other words, care must be
        # taken to avoid writing a set of rules which would forever take turns
        # changing the same bit of text back and forth.
        #

        while True:
            # build up a list of transforms which could apply to the
            # tokens in their current state.
            transforms = []

            logging.debug('***calling rules')
            for rule in all_rules:
                if rule.enabled:
                    transforms += rule.get_transforms(self._tokens)

            # If there are no transforms, we're done changing tokens and
            # it's time to generate the output text.
            if not transforms:
                break

            # Find the highest priority transform. Compare against None
            # explicitly so the choice never depends on the truthiness of a
            # Transform object.
            winner = None
            for transform in transforms:
                if winner is None or transform.beats(winner):
                    winner = transform

            # apply() hands back the token list to use for the next pass.
            self._tokens = winner.apply()


def parse_commandline(argv=None):
    """Return an argparse parse of the command line.

    Args:
        argv: Optional list of argument strings to parse. When None (the
            default), argparse reads the arguments from ``sys.argv``.

    Returns:
        The argparse namespace with the attributes ``clipboard``,
        ``infile``, ``outfile``, ``test`` and ``debug``. ``infile`` and
        ``outfile`` are filename strings when given on the command line and
        otherwise default to the ``sys.stdin`` / ``sys.stdout`` objects.
    """
    # Create a command-line parsing object. The text has to be passed as
    # `description`: the first positional parameter of ArgumentParser is
    # `prog`, which would put this sentence into the usage line as the
    # program name.
    parser = argparse.ArgumentParser(
        description="Attempt to correct errors commonly found in Kiva loan "
        "descriptions.")
    # Tell the parser about all the arguments this program supports.
    parser.add_argument(
        '-c', '--clipboard', action='store_true',
        help="Use the contents of the clipboard instead of a file. "
        "If this option is specified, then --infile is ignored.")

    parser.add_argument(
        '-i', '--infile', default=sys.stdin,
        help="The UTF-8 encoded file to read, (defaults to stdin).")

    parser.add_argument(
        '-o', '--outfile', default=sys.stdout,
        help="The UTF-8 encoded file to write (defaults to stdout).")

    parser.add_argument(
        '-t', '--test', nargs='*',
        help="Process the TEST strings instead of an input file. If this "
        "option is specified, then --infile and --clipboard are ignored.")

    parser.add_argument(
        '-d', '--debug', action='store_true',
        help="Print the raw argument list and exit.")

    # Parse the command line and return it
    return parser.parse_args(argv)


def main():
    """Run the editor's assistant as a command-line program.

    Process text from either stdin, the clipboard, the command line,
    or a specified file. Apply various formatting rules designed to fix common
    errors in Kiva loan descriptions.

    Always finishes by calling ``sys.exit(0)``.
    """

    # Initialize logging to go to a temporary file. The file is opened in
    # "w" mode, so each run overwrites the previous log.
    temp_file_name = os.path.join(tempfile.gettempdir(), "kea.log")
    handler = logging.FileHandler(temp_file_name, "w", encoding="UTF-8")
    formatter = logging.Formatter("%(message)s")
    handler.setFormatter(formatter)
    root_logger = logging.getLogger()
    root_logger.addHandler(handler)
    root_logger.setLevel(logging.DEBUG)

    args = parse_commandline()

    # Streams opened by this function. Only these are closed at the end;
    # the wrappers around sys.stdin / sys.stdout must stay open.
    opened_streams = []
    try:
        # get an output file handle
        if isinstance(args.outfile, basestring):
            # The user supplied a filename. Open it.
            outfile = codecs.open(args.outfile, 'w', 'utf-8')
            opened_streams.append(outfile)
        else:
            # The user didn't supply a filename, so use stdout. Since
            # we'll be handling Unicode text, use codecs to write it out
            # as utf-8 encoded.
            outfile = codecs.getwriter('utf-8')(sys.stdout)

        if args.debug:
            # Parenthesized form prints the same text as ``print args``.
            print(args)
        elif args.test:
            # the test option supersedes other input modes
            if not isinstance(args.test, basestring):
                args.test = u' '.join(args.test)
            outfile.write(EditAssistant(StringIO.StringIO(
                unicode(args.test))).edited_text)
            outfile.write('\n')
        elif args.clipboard:
            # Read from and write back to the clipboard; outfile is unused.
            edit_assistant = EditAssistant(
                StringIO.StringIO(get_clipboard_text()))
            set_clipboard_text(edit_assistant.edited_text)
        else:
            # If args.infile is a string, treat it as a filename and
            # assume it is encoded in utf8. Otherwise it should be the
            # default, which is sys.stdin. sys.stdin needs to be decoded
            # into unicode.
            if isinstance(args.infile, basestring):
                infile = codecs.open(args.infile, 'r', 'utf-8')
                opened_streams.append(infile)
            else:
                infile = codecs.getreader('utf-8')(sys.stdin)
            outfile.write(EditAssistant(infile).edited_text)
    finally:
        # Flush and release any files we opened, even if processing failed.
        for stream in opened_streams:
            stream.close()
    sys.exit(0)

# Run main() only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.