Kiva Editor's Assistant / transforms.py

#!/usr/bin/env python
"""
Derived Transform classes.
"""
import re
import pycountry
import logging
from base import Token, Transform


def token_strings(tokens):
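    """Return the string of each token in `tokens`, for logging."""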
    return [t.str for t in tokens]


class ParagraphTransform(Transform):
    """Break tokens containing a newline into three parts.
    """

    def __init__(self, rule, token, **kwargs):
        Transform.__init__(self, rule, token, **kwargs)
        self._mo = kwargs['match_obj']

    def apply(self):
        logging.debug(u'>ParagraphTransform %s',
                      token_strings(self.tokens_to_transform))
        # Replace each token to transform with up to three new tokens:
        # a token for the string before the newline (if nonempty), a
        # PARA token, and a token for the string after (if nonempty).
        for token_to_transform in self.tokens_to_transform:
            # find the index within the full list of tokens of the
            # token we are to transform
            transform_token_index = self.rule.tokens.index(token_to_transform)
            # create the token that will be inserted to its left
            left = Token(token_to_transform.str[:self._mo.start()])
            # create the token that will replace it
            paragraph_token = Token('\n')
            # create the token that will be inserted to its right
            right = Token(token_to_transform.str[self._mo.end():])
            # replace the token we're transforming
            self.rule.tokens[transform_token_index] = paragraph_token
            # insert a token to its left if it is nonempty
            if left.str:
                self.rule.tokens.insert(transform_token_index, left)
                # that insertion has made transform_token_index out of
                # date; correct it.
                transform_token_index += 1
            # now insert the token to the right, if it is nonempty
            if right.str:
                self.rule.tokens.insert(transform_token_index + 1, right)
        logging.debug(u'<ParagraphTransform %s',
                      token_strings(self.rule.tokens))
        return self.rule.tokens


class AbbrevTransform(Transform):
    """Mark a token as an abbreviation, splitting off any concatenation.

    Technically an instance of this class could apply to multiple
    tokens, but normally `tokens_to_transform` will only have one token
    in it.
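
    For example, if the known abbreviation "No." matches the first
    three characters of the token "No.42", the token is split into
    "No." and "42".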
    """
    def __init__(self, rule, tokens_to_transform, **kwargs):
        """Put keyword arguments in instance vars.

        Keyword args:

        `matched_abbrev` - the AbbrevInfo object which (at least
        partially) matched the token to transform.
        `abbrev_match_len` - number of characters of token that matched.
        """
        logging.debug(u'>AbbrevTransform.__init__')
        Transform.__init__(self, rule, tokens_to_transform, **kwargs)
        self._matched_abbrev = kwargs.get('matched_abbrev')
        self._abbrev_match_len = kwargs.get('abbrev_match_len')
        if len(tokens_to_transform) != 1:
            logging.warning('AbbrevTransform expected a single token '
                            'but got %d', len(tokens_to_transform))

    def apply(self):
        logging.debug(u'>AbbrevTransform %s',
                      token_strings(self.tokens_to_transform))
        for token_to_transform in self.tokens_to_transform:
            token_to_transform.abbrev = True

            # If the abbreviation matches the entire token, it does not
            # need to be split into multiple tokens, so update it in
            # place.
            if self._abbrev_match_len == len(token_to_transform.str):
                if self._matched_abbrev.normal_form:
                    token_to_transform.str = self._matched_abbrev.normal_form
                continue

            # The abbreviation matches just the start of the token.
            # Make a new token to insert after the abbreviation.
            if self._matched_abbrev.normal_form:
                abbrev_part = self._matched_abbrev.normal_form
            else:
                abbrev_part = token_to_transform.str[:self._abbrev_match_len]
            extra_part = token_to_transform.str[self._abbrev_match_len:]

            # Now modify the existing token, and create a new one to
            # insert after it.
            token_to_transform.str = abbrev_part
            post_abbrev_token = Token(extra_part)

            # Find the index within the full list of tokens of the
            # token we are to transform
            transform_token_index = self.rule.tokens.index(token_to_transform)
            self.rule.tokens.insert(transform_token_index + 1,
                                    post_abbrev_token)
        logging.debug(u'<AbbrevTransform %s', token_strings(self.rule.tokens))
        return self.rule.tokens


class RegexSplitTransform(Transform):
    """Split tokens on designated character."""

    def __init__(self, rule, token, **kwargs):
        Transform.__init__(self, rule, token, **kwargs)
        self._split_re = kwargs.get('split_re')

    def apply(self):
        logging.debug(u'>RegexSplitTransform %s',
                      token_strings(self.tokens_to_transform))
        for token_to_transform in self.tokens_to_transform:
            # find the index within the full list of tokens of the
            # token we are to transform
            transform_token_index = self.rule.tokens.index(token_to_transform)
            # get slices of the array that exclude the token we're going
            # to modify.
            left_of_split = self.rule.tokens[:transform_token_index]
            right_of_split = self.rule.tokens[transform_token_index + 1:]
            split_strings = re.split(self._split_re,
                                     token_to_transform.str)
            # consecutive delimiters can result in empty strings in the
            # output list, so use filter() to eliminate them
            split_strings = filter(None, split_strings)
            # create a new Token from each of the new non-empty strings
            split_tokens = map(Token, split_strings)
            # put it all back together
            self.rule.tokens = left_of_split + split_tokens + right_of_split
        logging.debug(u'<RegexSplitTransform %s',
                      token_strings(self.rule.tokens))
        return self.rule.tokens


class IndexSplitTransform(Transform):
    """Split tokens at designated character index.

    Keyword Arguments:
    `index` - the character position at which to split the string
    `three_way` - if True, split the string into three parts instead of two

    If `index` is 0, `three_way` is ignored and a special case of the
    two-way split is performed. The string is split into two tokens, the
    first composed of the token's initial character, and the second
    being the remainder of the string.

    If `three_way` is not True and `index` is non-zero, the token is
    split into two parts, where the first is all the characters
    preceding `index` and the second is all the characters from `index`
    to the end of the string.

    Otherwise, if `three_way` is True, the token will be split into
    three parts: the characters before `index`, the single character at
    `index`, and the characters after `index`.
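
    For example, with index=3 and three_way=True, the token "abc-def"
    becomes the tokens "abc", "-", and "def"; with index=0, the token
    "(abc" becomes "(" and "abc".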
    """

    def __init__(self, rule, token, **kwargs):
        Transform.__init__(self, rule, token, **kwargs)
        self._index = kwargs.get('index')
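        # a missing or zero index forces the two-way special-case split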
        if not self._index:
            self._three_way = False
        else:
            self._three_way = kwargs.get('three_way', False)

    def apply(self):
        logging.debug(u'>IndexSplitTransform %s',
                      token_strings(self.tokens_to_transform))
        for token_to_transform in self.tokens_to_transform:
            # find the index within the full list of tokens of the
            # token to be transformed
            transform_token_index = self.rule.tokens.index(token_to_transform)
            if (self._three_way and
                self._index < len(token_to_transform.str) - 1):
                left = token_to_transform.str[:self._index]
                middle = token_to_transform.str[self._index]
                right = token_to_transform.str[self._index + 1:]

                self.rule.tokens.insert(transform_token_index, Token(left))
                token_to_transform.str = middle
                self.rule.tokens.insert(transform_token_index + 2,
                                        Token(right))
            else:
                # split the token string at the supplied character index
                index = self._index
                if index == 0 and index < len(token_to_transform.str) - 1:
                    index += 1
                left = token_to_transform.str[:index]
                right = token_to_transform.str[index:]
                # insert a new token for the left part of the split
                self.rule.tokens.insert(transform_token_index, Token(left))
                token_to_transform.str = right
        logging.debug(u'<IndexSplitTransform %s',
                      token_strings(self.rule.tokens))
        return self.rule.tokens


class SetAttrTransform(Transform):
    """Change attributes of a token. """

    def __init__(self, rule, tokens, **kwargs):
        Transform.__init__(self, rule, tokens, **kwargs)
        self._attr_changes = kwargs['attr_changes']

    def apply(self):
        logging.debug(u'>SetAttrTransform %s',
                      token_strings(self.tokens_to_transform))
        for token in self.tokens_to_transform:
            for change in self._attr_changes:
                setattr(token, change.attr, change.value)
        logging.debug(u'<SetAttrTransform %s', token_strings(self.rule.tokens))
        return self.rule.tokens


class SeparateThousandsTransform(Transform):
    """Insert commas to separate thousands."""

    @staticmethod
    def digit_group_callback(match_obj):
        return match_obj.group(1) + ',' + match_obj.group(2)

    def __init__(self, rule, tokens, **kwargs):
        Transform.__init__(self, rule, tokens, **kwargs)

    def apply(self):
        logging.debug(u'>SeparateThousandsTransform %s',
                      token_strings(self.tokens_to_transform))
        for token in self.tokens_to_transform:
            # Each re.sub pass adds at most one comma per run of
            # digits, so repeat the substitution until the string
            # stops changing.
            while True:
                new_str = re.sub(
                    ur'(\d)(\d{3})\b',
                    SeparateThousandsTransform.digit_group_callback,
                    token.str)
                if new_str == token.str:
                    break
                token.str = new_str
        logging.debug(u'<SeparateThousandsTransform %s',
                      token_strings(self.rule.tokens))
        return self.rule.tokens


class ISOCurrencyTransform(Transform):
    """Spell out an ISO currency abbreviation.
    """

    def __init__(self, rule, tokens, **kwargs):
        Transform.__init__(self, rule, tokens, **kwargs)

    def apply(self):
        logging.debug(u'>ISOCurrencyTransform %s',
                      token_strings(self.tokens_to_transform))
        assert(len(self.tokens_to_transform) == 1)
        token = self.tokens_to_transform[0]
        transform_token_index = self.rule.tokens.index(token)
        left_of_split = self.rule.tokens[:transform_token_index]
        right_of_split = self.rule.tokens[transform_token_index + 1:]
        # token is an ISO currency abbreviation. Make new tokens to
        # replace it
        iso_letters = token.str.upper()
        new_token_str = '{} ( {} )'.format(
            pycountry.currencies.get(letter=iso_letters).name,
            iso_letters)
        new_tokens = map(Token, new_token_str.split())
        # mark the abbreviation token as already expanded so the rule
        # does not match it again and loop forever
        new_tokens[-2].ISO_currency_expanded = True
        self.rule.tokens = left_of_split + new_tokens + right_of_split
        logging.debug(u'<ISOCurrencyTransform %s',
                      token_strings(self.rule.tokens))
        return self.rule.tokens


class USCurrencyTransform(Transform):
    """Convert '100 USD' to '$100'.
    """

    def __init__(self, rule, tokens, **kwargs):
        Transform.__init__(self, rule, tokens, **kwargs)

    def apply(self):
        logging.debug(u'>USCurrencyTransform %s',
                      token_strings(self.tokens_to_transform))
        assert(len(self.tokens_to_transform) == 1)
        token = self.tokens_to_transform[0]
        transform_token_index = self.rule.tokens.index(token)
        del self.rule.tokens[transform_token_index]
        amount = self.rule.tokens[transform_token_index - 1]
        amount.str = '$' + amount.str
        logging.debug(u'<USCurrencyTransform %s',
                      token_strings(self.rule.tokens))
        return self.rule.tokens


class SwapTransform(Transform):
    """Swap two adjacent tokens."""

    def __init__(self, rule, tokens, **kwargs):
        Transform.__init__(self, rule, tokens, **kwargs)

    def apply(self):
        logging.debug(u'>SwapTransform %s',
                      token_strings(self.tokens_to_transform))
        assert(len(self.tokens_to_transform) == 2)
        index1 = self.rule.tokens.index(self.tokens_to_transform[0])
        index2 = self.rule.tokens.index(self.tokens_to_transform[1])
        self.rule.tokens[index1], self.rule.tokens[index2] = \
            self.rule.tokens[index2], self.rule.tokens[index1]
        logging.debug(u'<SwapTransform %s', token_strings(self.rule.tokens))
        return self.rule.tokens


class RegexTransform(Transform):
    """Alter tokens according to regular expressions.
    """

    def __init__(self, rule, tokens, **kwargs):
        Transform.__init__(self, rule, tokens, **kwargs)

    def apply(self):
        logging.debug(u'>RegexTransform %s',
                      token_strings(self.tokens_to_transform))
        for token in self.tokens_to_transform:
            for re_pair in self.rule.regex_pairs:
                token.str = re.sub(re_pair[0], re_pair[1], token.str)
        logging.debug(u'<RegexTransform %s', token_strings(self.rule.tokens))
        return self.rule.tokens


class ConcatenateTransform(Transform):
    """Replace 2 or more tokens with a single concatenated token."""

    def __init__(self, rule, tokens, **kwargs):
        Transform.__init__(self, rule, tokens, **kwargs)

    def apply(self):
        logging.debug(u'>ConcatenateTransform %s',
                      token_strings(self.tokens_to_transform))
        # first get the concatenated string
        new_str = u''
        first_token = True
        for token in self.tokens_to_transform:
            new_str += token.str
            if not first_token:
                del self.rule.tokens[self.rule.tokens.index(token)]
            first_token = False
        # set the first token in the series to contain the new string
        self.tokens_to_transform[0].str = new_str
        logging.debug(u'<ConcatenateTransform %s',
                      token_strings(self.rule.tokens))
        return self.rule.tokens