Commits

david_walker committed 71c07f0

code migrated into rules.py

Comments (0)

Files changed (1)

transforms.py

-#!/usr/bin/env python
-"""
-Derived Transform classes.
-"""
-import re
-import pycountry
-import logging
-from mytoken import Token
-from base import Transform
-
-def token_strings(tokens):
-    return [t.str for t in tokens]
-
-
-class ParagraphTransform(Transform):
-    """Break tokens containing a newline into three parts.
-    """
-
-    def __init__(self, rule, token, **kwargs):
-        Transform.__init__(self, rule, token, **kwargs)
-        self._mo = kwargs['match_obj']
-
-    def apply(self):
-        logging.debug(u'>ParagraphTransform')
-        logging.debug(self.tokens_to_transform)
-        # Replace each token to transform with three new tokens: one
-        # for the string before the newline, a PARA token, and one for
-        # the string after the newline.
-        for token_to_transform in self.tokens_to_transform:
-            transform_token_index = self.rule.tokens.index(token_to_transform)
-            left_str = token_to_transform.str[:self._mo.start()]
-            right_str = token_to_transform.str[self._mo.end():]
-            left_token = None
-            right_token = None
-
-            if left_str:
-                if right_str:
-                    # token_to_transform.str has the form left_str +
-                    # '\n' + right_str, where left_str is some non-empty
-                    # string that doesn't contain a newline and
-                    # right_str is some non-empty string.
-                    left_cend = token_to_transform.cbegin + len(left_str)
-                    left_token = Token(
-                        left_str,
-                        token_to_transform.cbegin,
-                        left_cend)
-                    right_cbegin = token_to_transform.cbegin + self._mo.end()
-                    right_token = Token(
-                        right_str,
-                        right_cbegin,
-                        right_cbegin + len(right_str))
-                else:
-                    # token_to_transform == left_str + '\n'
-                    left_token = Token(
-                        left_str,
-                        token_to_transform.cbegin,
-                        token_to_transform.cbegin + len(left_str))
-                paragraph_token = Token(u'\n', left_token.cend,
-                                        left_token.cend + 1)
-            else:
-                if right_str:
-                    # token_to_transform == '\n' + right_str
-                    right_token = Token(
-                        right_str,
-                        token_to_transform.cbegin + 1,
-                        token_to_transform.cbegin + 1 + len(right_str))
-                else:
-                    # token_to_transform == two or more '\n'
-                    pass
-                paragraph_token = Token(u'\n', token_to_transform.cbegin,
-                                        token_to_transform.cbegin + 1)
-
-            # replace the token we're transforming
-            self.rule.tokens[transform_token_index] = paragraph_token
-            # insert a token to its left if it is nonempty
-            if left_token:
-                self.rule.tokens.insert(transform_token_index, left_token)
-                # that insertion has made transform_token_index off by
-                # one; correct it.
-                transform_token_index += 1
-            # now insert the token to the right, if it is nonempty
-            if right_token:
-                self.rule.tokens.insert(transform_token_index + 1, right_token)
-            logging.debug(filter(None, [left_token, paragraph_token, right_token]))
-        logging.debug(u'<ParagraphTransform')
-        return self.rule.tokens
-
-
-class AbbrevTransform(Transform):
-    """Mark a token as an abbreviation, splitting off any concatenation.
-
-    Technically an instance of this class could apply to multiple
-    tokens, but normally `tokens_to_transform` will only have one token
-    in it.
-    """
-    def __init__(self, rule, tokens_to_transform, **kwargs):
-        """Put keyword arguments in instance vars.
-
-        Keyword args:
-        `matched_abbrev` -- the AbbrevInfo object which (at least
-        partially) matched the token to transform.
-
-        `abbrev_match_len` -- number of characters of token that matched.
-        """
-        logging.debug(u'>AbbrevTransform.__init__')
-        Transform.__init__(self, rule, tokens_to_transform, **kwargs)
-        self._matched_abbrev = kwargs.get('matched_abbrev')
-        self._abbrev_match_len = kwargs.get('abbrev_match_len')
-        if len(tokens_to_transform) != 1:
-            logging.warn('AbbrevTransform expected single token but got %d',
-                         len(tokens_to_transform))
-
-    def apply(self):
-        logging.debug(u'>AbbrevTransform')
-        for token_to_transform in self.tokens_to_transform:
-            token_to_transform.abbrev = True
-
-            # If the abbreviation matches the entire token, it does not
-            # need to be split into multiple tokens, so update it in
-            # place.
-            if self._abbrev_match_len == len(token_to_transform.str):
-                if self._matched_abbrev.normal_form:
-                    logging.debug(u'%s => %s', token_to_transform,
-                                  self._matched_abbrev.normal_form)
-                    token_to_transform.str = self._matched_abbrev.normal_form
-                continue
-
-            # The abbreviation matches just the start of the token.
-            # Make a new token to insert after the abbreviation.
-            if self._matched_abbrev.normal_form:
-                abbrev_part = self._matched_abbrev.normal_form
-            else:
-                abbrev_part = token_to_transform.str[:self._abbrev_match_len]
-            extra_part = token_to_transform.str[self._abbrev_match_len:]
-
-            # Now modify the existing token, and create a new one to
-            # insert after it.
-            logging.debug(token_to_transform)
-            token_to_transform.str = abbrev_part
-            token_to_transform.cend = (token_to_transform.cbegin +
-                                       len(abbrev_part))
-            post_abbrev_token = Token(extra_part, token_to_transform.cend,
-                                      token_to_transform.cend + len(extra_part))
-            logging.debug('=> {}, {}'.format(token_to_transform,
-                                             post_abbrev_token))
-
-            # Find the index within the full list of tokens of the
-            # token we are to transform
-            transform_token_index = self.rule.tokens.index(token_to_transform)
-            self.rule.tokens.insert(transform_token_index + 1,
-                                    post_abbrev_token)
-        logging.debug(u'<AbbrevTransform')
-        return self.rule.tokens
-
-
-class CharSplitTransform(Transform):
-    """Split tokens on designated character."""
-
-    def __init__(self, rule, token, **kwargs):
-        Transform.__init__(self, rule, token, **kwargs)
-        self._delimiter_char = kwargs.get('delimiter_char')
-        self._keep_delimiter = kwargs.get('keep_delimiter')
-
-    def _add(self, split_tokens, i, split_str, token_to_transform):
-        cbegin = token_to_transform.cbegin + i - len(split_str)
-        cend = cbegin + len(split_str)
-        split_tokens.append(Token(split_str, cbegin, cend))
-
-    def apply(self):
-        logging.debug(u'>CharSplitTransform %s',
-                      token_strings(self.tokens_to_transform))
-        for token_to_transform in self.tokens_to_transform:
-            # find the index within the full list of tokens of the
-            # token we are to transform
-            transform_token_index = self.rule.tokens.index(token_to_transform)
-            # get slices of the array that exclude the token we're going
-            # to modify.
-            left_of_split = self.rule.tokens[:transform_token_index]
-            right_of_split = self.rule.tokens[transform_token_index + 1:]
-
-            split_tokens = []
-            split_str = u''
-            for i, c in enumerate(token_to_transform.str):
-                if c == self._delimiter_char:
-                    # if delimiter has just occurred after one or more
-                    # non-delimiter characters, add those characters as
-                    # a new token.
-                    if split_str:
-                        self._add(split_tokens, i, split_str, token_to_transform)
-                        split_str = u''
-                    # if we're keeping delimiter characters, make a new
-                    # token for it.
-                    if self._keep_delimiter:
-                        cbegin = token_to_transform.cbegin + i
-                        split_tokens.append(Token(c, cbegin, cbegin + 1))
-                else:
-                    split_str += c
-            if split_str:
-                self._add(split_tokens, i + 1, split_str, token_to_transform)
-            # put it all back together
-            self.rule.tokens = left_of_split + split_tokens + right_of_split
-            logging.debug(split_tokens)
-        logging.debug(u'<CharSplitTransform %s',
-                      token_strings(self.rule.tokens))
-        return self.rule.tokens
-
-
-class IndexSplitTransform(Transform):
-    """Split tokens at designated character index.
-
-    Keyword Arguments:
-    `index` - the character position at which to split the string
-    `three_way` - if True, split the string into three parts instead of two
-
-    If `index` is 0, `three_way` is ignored and a special case of the
-    two-way split is performed. The string is split into two tokens, the
-    first composed of the token's initial character, and the second
-    being the remainder of the string.
-
-    If `three_way` is not True and `index` is non-zero, the token is
-    split into two parts, where the first is all the characters
-    preceding `index` and the second is all the characters from `index`
-    to the end of the string.
-
-    Otherwise, if `three_way` is True and `index` is not the last
-    character of the string, the token will be split into three parts:
-    the characters before `index`, the single character at `index`, and
-    the characters after `index`.
-    """
-
-    def __init__(self, rule, token, **kwargs):
-        Transform.__init__(self, rule, token, **kwargs)
-        self._index = kwargs.get('index')
-        if not self._index:
-            self._three_way = False
-        else:
-            self._three_way = kwargs.get('three_way', False)
-
-    def apply(self):
-        logging.debug(u'>IndexSplitTransform %s',
-                      token_strings(self.tokens_to_transform))
-        for token_to_transform in self.tokens_to_transform:
-            # find the index within the full list of tokens of the
-            # token to be transformed
-            transform_token_index = self.rule.tokens.index(token_to_transform)
-            if (self._three_way and
-                self._index < len(token_to_transform.str) - 1):
-                left = token_to_transform.str[:self._index]
-                middle = token_to_transform.str[self._index]
-                right = token_to_transform.str[self._index + 1:]
-
-                cbegin = token_to_transform.cbegin
-                cend = cbegin + len(left)
-                new_token = Token(left, cbegin, cend)
-                logging.debug(new_token)
-                self.rule.tokens.insert(transform_token_index, new_token)
-                cbegin = cend
-                cend = cbegin + len(middle)
-                token_to_transform.str = middle
-                token_to_transform.cbegin = cbegin
-                token_to_transform.cend = cend
-                logging.debug(token_to_transform)
-
-                cbegin = cend
-                cend = cbegin + len(right)
-                new_token = Token(right, cbegin, cend)
-                logging.debug(new_token)
-                self.rule.tokens.insert(transform_token_index + 2,
-                                        new_token)
-            else:
-                # split the token string at the supplied character index
-                index = self._index
-                if index == 0 and index < len(token_to_transform.str) - 1:
-                    index += 1
-                left = token_to_transform.str[:index]
-                right = token_to_transform.str[index:]
-                # insert a new token for the left part of the split
-                cbegin = token_to_transform.cbegin
-                cend = cbegin + len(left)
-                new_token = Token(left, cbegin, cend)
-                logging.debug(new_token)
-                self.rule.tokens.insert(transform_token_index,
-                                        new_token)
-                token_to_transform.str = right
-                token_to_transform.cbegin = cend
-                token_to_transform.cend = cend + len(right)
-                logging.debug(token_to_transform)
-        logging.debug(u'<IndexSplitTransform %s',
-                      token_strings(self.rule.tokens))
-        return self.rule.tokens
-
-
-class SetAttrTransform(Transform):
-    """Change attributes of a token. """
-
-    def __init__(self, rule, tokens, **kwargs):
-        Transform.__init__(self, rule, tokens, **kwargs)
-        self._attr_changes = kwargs['attr_changes']
-
-    def apply(self):
-        logging.debug(u'>SetAttrTransform %s',
-                      token_strings(self.tokens_to_transform))
-        for token in self.tokens_to_transform:
-            for change in self._attr_changes:
-                setattr(token, change.attr, change.value)
-        logging.debug(u'<SetAttrTransform %s', token_strings(self.rule.tokens))
-        return self.rule.tokens
-
-
-class SeparateThousandsTransform(Transform):
-    """Insert commas to separate thousands."""
-
-    @staticmethod
-    def digit_group_callback(match_obj):
-        return match_obj.group(1) + ',' + match_obj.group(2)
-
-    def __init__(self, rule, tokens, **kwargs):
-        Transform.__init__(self, rule, tokens, **kwargs)
-
-    def apply(self):
-        logging.debug(u'>SeparateThousandsTransform %s',
-                      token_strings(self.tokens_to_transform))
-        for token in self.tokens_to_transform:
-            new_str = re.sub(
-                ur'(\d)(\d{3})\b',
-                SeparateThousandsTransform.digit_group_callback,
-                token.str)
-            while new_str != token.str:
-                token.str = new_str
-                new_str = re.sub(
-                    ur'(\d)(\d{3})\b',
-                    SeparateThousandsTransform.digit_group_callback,
-                    token.str)
-        logging.debug(u'<SeparateThousandsTransform %s',
-                      token_strings(self.rule.tokens))
-        return self.rule.tokens
-
-
-class ISOCurrencyTransform(Transform):
-    """Spell out an ISO currency abbreviation.
-    """
-
-    def __init__(self, rule, tokens, **kwargs):
-        Transform.__init__(self, rule, tokens, **kwargs)
-
-    def apply(self):
-        logging.debug(u'>ISOCurrencyTransform %s',
-                      token_strings(self.tokens_to_transform))
-        assert(len(self.tokens_to_transform) == 1)
-        token = self.tokens_to_transform[0]
-        transform_token_index = self.rule.tokens.index(token)
-        left_of_split = self.rule.tokens[:transform_token_index]
-        right_of_split = self.rule.tokens[transform_token_index + 1:]
-        # token is an ISO currency abbreviation. Make new tokens to
-        # replace it
-        iso_letters = token.str.upper()
-        new_token_str = '{} ( {} )'.format(
-            pycountry.currencies.get(letter=iso_letters).name,
-            iso_letters)
-        new_tokens = map(Token, new_token_str.split())
-        # mark the token as expanded so the rule cannot match it again (avoids an infinite loop)
-        new_tokens[-2].ISO_currency_expanded = True
-        new_tokens[-2].cbegin = token.cbegin
-        new_tokens[-2].cend = token.cend
-        logging.debug(new_tokens)
-        self.rule.tokens = left_of_split + new_tokens + right_of_split
-        logging.debug(u'<ISOCurrencyTransform %s',
-                      token_strings(self.rule.tokens))
-        return self.rule.tokens
-
-
-class USCurrencyTransform(Transform):
-    """Convert '100 USD' to '$100'.
-    """
-
-    def __init__(self, rule, tokens, **kwargs):
-        Transform.__init__(self, rule, tokens, **kwargs)
-
-    def apply(self):
-        logging.debug(u'>USCurrencyTransform %s',
-                      token_strings(self.tokens_to_transform))
-        assert(len(self.tokens_to_transform) == 1)
-        token = self.tokens_to_transform[0]
-        transform_token_index = self.rule.tokens.index(token)
-        del self.rule.tokens[transform_token_index]
-        amount = self.rule.tokens[transform_token_index - 1]
-        amount.str = '$' + amount.str
-        logging.debug(u'<USCurrencyTransform %s',
-                      token_strings(self.rule.tokens))
-        return self.rule.tokens
-
-
-class SwapTransform(Transform):
-    """Swap two adjacent tokens."""
-
-    def __init__(self, rule, tokens, **kwargs):
-        Transform.__init__(self, rule, tokens, **kwargs)
-
-    def apply(self):
-        logging.debug(u'>SwapTransform %s',
-                      token_strings(self.tokens_to_transform))
-        assert(len(self.tokens_to_transform) == 2)
-        index1 = self.rule.tokens.index(self.tokens_to_transform[0])
-        index2 = self.rule.tokens.index(self.tokens_to_transform[1])
-        self.rule.tokens[index1], self.rule.tokens[index2] = \
-            self.rule.tokens[index2], self.rule.tokens[index1]
-        logging.debug(u'<SwapTransform %s', token_strings(self.rule.tokens))
-        return self.rule.tokens
-
-
-class RegexTransform(Transform):
-    """Alter tokens according to regular expressions.
-    """
-
-    def __init__(self, rule, tokens, **kwargs):
-        Transform.__init__(self, rule, tokens, **kwargs)
-
-    def apply(self):
-        logging.debug(u'>RegexTransform %s', token_strings(self.rule.tokens))
-        for token in self.tokens_to_transform:
-            for re_pair in self.rule.regex_pairs:
-                token.str = re.sub(re_pair[0], re_pair[1], token.str)
-        logging.debug(u'<RegexTransform %s', token_strings(self.rule.tokens))
-        return self.rule.tokens
-
-
-class ConcatenateTransform(Transform):
-    """Replace 2 or more tokens with a single concatenated token."""
-
-    def __init__(self, rule, tokens, **kwargs):
-        Transform.__init__(self, rule, tokens, **kwargs)
-
-    def apply(self):
-        logging.debug(u'>ConcatenateTransform %s',
-                      token_strings(self.tokens_to_transform))
-        # first get the concatenated string
-        new_str = u''
-        first_token = True
-        for token in self.tokens_to_transform:
-            new_str += token.str
-            if not first_token:
-                del self.rule.tokens[self.rule.tokens.index(token)]
-            first_token = False
-        # set the first token in the series to contain the new string
-        self.tokens_to_transform[0].str = new_str
-        logging.debug(u'<ConcatenateTransform %s',
-                      token_strings(self.rule.tokens))
-        return self.rule.tokens
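
For readers tracing this logic into rules.py, the delimiter handling in CharSplitTransform boils down to the following plain-string behaviour. This is an illustrative sketch only; the char_split helper is hypothetical and omits the cbegin/cend bookkeeping the real class performs on Token objects.

def char_split(s, delimiter, keep_delimiter=False):
    """Split s on delimiter, optionally keeping each delimiter as its own piece."""
    pieces = []
    current = ''
    for c in s:
        if c == delimiter:
            # close off any accumulated run of non-delimiter characters
            if current:
                pieces.append(current)
                current = ''
            if keep_delimiter:
                pieces.append(c)
        else:
            current += c
    if current:
        pieces.append(current)
    return pieces

# char_split('a-b-c', '-')                      -> ['a', 'b', 'c']
# char_split('a-b-c', '-', keep_delimiter=True) -> ['a', '-', 'b', '-', 'c']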
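
The two-way and three-way cases described in the IndexSplitTransform docstring reduce to the string splits below. Again a hedged sketch: index_split is a hypothetical stand-in that ignores the token-position arithmetic in the real class.

def index_split(s, index, three_way=False):
    """Split s as described in IndexSplitTransform's docstring."""
    if three_way and 0 < index < len(s) - 1:
        # three parts: before `index`, the character at `index`, the rest
        return [s[:index], s[index], s[index + 1:]]
    if index == 0 and len(s) > 1:
        # special case: index 0 peels off the initial character
        index = 1
    # two parts: everything before `index`, everything from `index` on
    return [s[:index], s[index:]]

# index_split('hello', 0)                  -> ['h', 'ello']
# index_split('hello', 2)                  -> ['he', 'llo']
# index_split('hello', 2, three_way=True)  -> ['he', 'l', 'lo']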
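
SeparateThousandsTransform works by re-applying one substitution until the string stops changing, because a single pass of the (\d)(\d{3})\b pattern inserts only one comma per run of digits. A minimal standalone sketch of the same technique, using a hypothetical group_thousands helper:

import re

def group_thousands(s):
    """Insert commas into runs of digits, e.g. '1234567' -> '1,234,567'."""
    pattern = r'(\d)(\d{3})\b'
    new_s = re.sub(pattern, r'\1,\2', s)
    # repeat until stable: '1234567' -> '1234,567' -> '1,234,567'
    while new_s != s:
        s = new_s
        new_s = re.sub(pattern, r'\1,\2', s)
    return s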