Commits

david_walker committed f21b4c3

work in progress: removing transforms and making rules directly change tokens

  • Participants
  • Parent commits 39769dd
  • Branches parse

Comments (0)

Files changed (4)

File base.py

-#!/usr/bin/env python
-"""
-Fundamental classes used by the editor's assistant.
-"""
-
-from abc import ABCMeta, abstractmethod
-from collections import namedtuple
-import logging
-
-AttrChange = namedtuple('AttrChange', 'attr, value')
-
-
-class Relation(object):
-    """Namespace for constants used to describe relation between
-    transforms.
-    """
-    dominates = 1
-    dominated_by = 2
-    disjoint = 3
-
-
-class Rule():
-    """Abstract base class for rules.
-
-    A Rule object, given a list of tokens, will return a (possibly
-    empty) list of Transform objects representing the changes that
-    Rule would make to the token list.
-    """
-
-    # for Python 3 use class Rule(metaclass=ABCMeta):
-    __metaclass__ = ABCMeta
-
-    def __init__(self, rule_id, description, enabled=True):
-        """Initialize descriptive parameters.
-
-        Arguments:
-        - `rule_id`: a globally unique integer.
-
-        - `description`: a human-readable explanation of what this
-          rule does.
-
-        - `enabled`: for efficiency, rules can opt out of the main
-          loop by setting this attribute to False. Rules should
-          tolerate being re-enabled at any time, and get_transforms()
-          should function correctly regardless of this value.
-        """
-        self.rule_id = rule_id
-        self.description = description
-        self.enabled = enabled
-
-    def __str__(self):
-        return self.__class__.__name__
-
-    @abstractmethod
-    def apply(self, tokens):
-        """Apply this rule to `tokens`. """
-        pass
-
-
-class Transform():
-    """An abstract base class; derived classes should override `apply()`.
-
-    A Transform object can add, modify, and remove Tokens from a list.
-    Transform objects are generated by Rule objects.
-    """
-
-    # for Python 3 use class Transform(metaclass=ABCMeta):
-    __metaclass__ = ABCMeta
-
-    def __init__(self, rule, tokens_to_transform, **kwargs):
-        """Remember the rule which created this object and the tokens
-        on which it should operate.
-
-        Arguments:
-        - `rule`: the Rule object that created this object.
-        - `tokens_to_transform`: a non-empty list of Token objects to modify
-        - `kwargs`: arguments needed by `apply()`
-        """
-        self.rule = rule
-        self.score = rule.score  # some transforms may compute their own score
-        self.tokens_to_transform = tokens_to_transform
-        self.kwargs = kwargs
-
-        # A transform is always created enabled, but is disabled if it
-        # conflicts with another transform that has a higher priority.
-        self.enabled = True
-
-    @abstractmethod
-    def apply(self):
-        """Return a copy of the `tokens_to_transform` list containing
-        the changed tokens."""
-        pass
-
-    def beats(self, other_transform):
-        """Return True if this transform has a higher priority than
-        `other_transform`, or False if `other_transform` has a higher
-        priority than this one. It is impossible for two transforms to
-        have the same priority.
-        """
-        if self.score > other_transform.score:
-            logging.debug('{} beats {}'.format(self.rule,
-                                               other_transform.rule))
-            return True
-        if self.score == other_transform.score:
-            if self.rule.rule_id < other_transform.rule.rule_id:
-                logging.debug('{} beats {}'.format(self.rule,
-                                                   other_transform.rule))
-                return True
-        logging.debug('{} beats {}'.format(other_transform.rule,
-                                           self.rule))
-        return False
-
-    @property
-    def rule_id(self):
-        """Return the id of the rule that created this object."""
-        return self._rule.rule_id
         self._original_text = infile.read()
         self._tokens = [Token(self._original_text, 0,
                               len(self._original_text)), eof_token]
-        # apply first phase rules/transforms to replace the original
-        # Token object with multiple Token objects, one for each bit of
-        # the input text that qualifies as a single input token.
+        # apply first phase rules to replace the original Token object
+        # with multiple Token objects, one for each bit of the input
+        # text that qualifies as a single input token.
         self._process_tokens(rules.INITIAL_PHASE)
         # Add a part-of-speech property to all the tokens
         tagger.tag_tokens(self._tokens)
 
     def _process_tokens(self, phase):
         all_rules = rules.get_rules(phase)
-        #
-        # The main loop.
-        #
-        # Each rule in a master list of Rules is supplied with the Token list
-        # so that it can generate a list of Transform objects. If no Rule
-        # produces at least one Transform object, the main loop exits and the
-        # final list of Tokens is transformed into the program's output text.
-        #
-        # Only one change to any given Token can be made in each iteration.
-        # Therefore, if more than one Transform object wishes to touch the
-        # same Token, there must be a way to pick the single one to apply for
-        # the current iteration.
-        #
-        # This is done by choosing the Transform object that was created by
-        # the Rule object having the highest `score`. If there is a tie in
-        # `score`, then the lowest `id` wins.
-        #
-        # Once the list of Transform objects has been reduced to only those
-        # that are non-conflicting, they can be applied to the Tokens list.
-        #
-        # Since the entire rule set is applied iteratively until no more
-        # Transform objects are created, Rules must be written so that they do
-        # not produce Transform objects indefinitely.
-        #
-        # Similarly, any set of rules which each transform the tokens so that
-        # the resulting transformation of one rule matches the predicate of
-        # another will cause an infinite loop.  In other words, care must be
-        # taken to avoid writing a set of rules which would forever take turns
-        # changing the same bit of text back and forth.
-        #
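+        # The main loop: on each pass, every enabled rule (in rule_id
+        # order) is given a chance to change the token list in place.
+        # A rule's apply() returns True if it changed anything, in
+        # which case the pass restarts so that higher-priority
+        # (lower-numbered) rules see the updated tokens first. When a
+        # complete pass makes no changes, the loop ends.
+        #
+        # As before, rules must be written so that they eventually
+        # stop reporting changes, or this loop will never terminate.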
+        while True:
+            logging.debug('***calling rules')
+            changed = False
+            for rule in all_rules:
+                if rule.enabled and rule.apply(self._tokens):
+                    changed = True
+                    break
 
-        while True:
-            # build up a list of transforms which could apply to the
-            # tokens in their current state.
-            transforms = []
-
-            logging.debug('***calling rules')
-            for rule in all_rules:
-                if rule.enabled:
-                    transforms += rule.get_transforms(self._tokens)
-
-            # If there are no transforms, we're done changing tokens and
+            # If no changes were made, we're done changing tokens and
             # it's time to generate the output text.
-            if not transforms:
+            if not changed:
                 break
 
-            # Find the highest priority transform
-            winner = None
-            for transform in transforms:
-                if not winner:
-                    winner = transform
-                elif transform.beats(winner):
-                    winner = transform
-
-            self._tokens = winner.apply()
-
     def _report_changes(self):
         """Write a description of all significant changes."""
         for token in self._tokens:
 import inspect
 import pycountry
 import unicodedata
-from base import AttrChange, Rule
-import transforms as tr
+from abc import ABCMeta, abstractmethod
+
+from mytoken import Token
 from tokensearch import TokenSearchByRegexp
 
+
+class Rule():
+    """Abstract base class for rules.
+
+    A Rule examines the token list and may change it in place; its
+    apply() method returns True if it made any change.
+    """
+
+    # for Python 3 use class Rule(metaclass=ABCMeta):
+    __metaclass__ = ABCMeta
+
+    def __init__(self, rule_id, description, enabled=True):
+        """Initialize descriptive parameters.
+
+        Arguments:
+        - `rule_id`: a globally unique integer.
+
+        - `description`: a human-readable explanation of what this
+          rule does.
+
+        - `enabled`: for efficiency, rules can opt out of the main
+          loop by setting this attribute to False. Rules should
+          tolerate being re-enabled at any time, and apply()
+          should function correctly regardless of this value.
+        """
+        self.rule_id = rule_id
+        self.description = description
+        self.enabled = enabled
+
+    def __str__(self):
+        return self.__class__.__name__
+
+    @abstractmethod
+    def apply(self, tokens):
+        """Apply this rule to `tokens`. """
+        pass
+
 INITIAL_PHASE = 0
 POS_PHASE = 1
 
 def get_rules(desired_phase):
     """Return a list containing instances of all the rules in this
-    module with a matching phase."""
+    module with a matching phase, sorted by rule id."""
     classes = []
     this_module = sys.modules[__name__]
     for name, obj in inspect.getmembers(this_module):
         if (inspect.isclass(obj) and
             obj.__module__ == this_module.__name__ and
             'Rule' in str(obj) and
-            obj.phase == desired_phase):
+            getattr(obj, 'phase', None) == desired_phase):
             classes.append(obj())
-    return classes
+    # sort by rule_id so that lower-numbered rules run first
+    return sorted(classes, key=lambda rule: rule.rule_id)
 
 
 def get_neighbor(lst, i, step, attrib_name=None):
             get_right_neighbor(lst, i, attrib_name))
 
 
-def split_token_at(tokens, transform_token_index, delim, keep_delim):
+def split_token_at_delim(tokens, transform_token_index, delim, keep_delim):
     token_to_transform = tokens[transform_token_index]
     split_tokens = []
     split_str = u''
         tokens[transform_token_index].cend = split_tokens[-1].cend
 
 
+def split_token_at_index(tokens, transform_token_index, split_index, three_way):
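+    """Split the token at `transform_token_index` at the character
+    position `split_index`, keeping the cbegin/cend offsets
+    consistent.
+
+    If `three_way` is True and `split_index` is not the last
+    character, the token becomes three tokens: the text before
+    `split_index`, the single character at `split_index`, and the
+    text after it. Otherwise the token is split into two tokens at
+    `split_index` (a split at index 0 is moved to 1 so the left part
+    is never empty).
+    """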
+    token_to_transform = tokens[transform_token_index]
+    if (three_way and
+        split_index < len(token_to_transform.str) - 1):
+        left = token_to_transform.str[:split_index]
+        middle = token_to_transform.str[split_index]
+        right = token_to_transform.str[split_index + 1:]
+
+        cbegin = token_to_transform.cbegin
+        cend = cbegin + len(left)
+        new_token = Token(left, cbegin, cend)
+        logging.debug(new_token)
+        tokens.insert(transform_token_index, new_token)
+        cbegin = cend
+        cend = cbegin + len(middle)
+        token_to_transform.str = middle
+        token_to_transform.cbegin = cbegin
+        token_to_transform.cend = cend
+
+        cbegin = cend
+        cend = cbegin + len(right)
+        new_token = Token(right, cbegin, cend)
+        tokens.insert(transform_token_index + 2, new_token)
+    else:
+        # split the token string at the supplied character index
+        index = split_index
+        if index == 0 and index < len(token_to_transform.str) - 1:
+            index += 1
+        left = token_to_transform.str[:index]
+        right = token_to_transform.str[index:]
+        # insert a new token for the left part of the split
+        cbegin = token_to_transform.cbegin
+        cend = cbegin + len(left)
+        new_token = Token(left, cbegin, cend)
+        tokens.insert(transform_token_index, new_token)
+        token_to_transform.str = right
+        token_to_transform.cbegin = cend
+        token_to_transform.cend = cend + len(right)
+
+
 class WhitespaceSplitRule(Rule):
     phase = INITIAL_PHASE
 
         Rule.__init__(self, 5, "Separate text by whitespace.")
 
     def apply(self, tokens):
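+        """Split any token whose text contains a space; return True
+        if a token was split."""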
+        changed = False
         for i, token in enumerate(tokens):
             if token.is_para or token.non_printing:
                 continue
             if ' ' in token.str:
-                split_token_at(tokens, i, ' ', False)
+                split_token_at_delim(tokens, i, ' ', False)
+                changed = True
+        return changed
 
 
 class RegexCleanupRule(Rule):
         Rule.__init__(self, 10, "Search and replace specific strings")
 
     def apply(self, tokens):
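+        """Run each search-and-replace pattern over the token list;
+        return True if any pattern made a change."""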
+        changed = False
         for ts in RegexCleanupRule.regex_pairs:
-            ts.apply(tokens)
+            if ts.apply(tokens):
+                changed = True
+        return changed
 
 
 class ParagraphRule(Rule):
                       "Separate text into paragraphs at line breaks.")
 
     def apply(self, tokens):
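+        """Replace each newline, along with the contiguous whitespace
+        around it, with a paragraph token; return True if any
+        paragraph breaks were created."""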
+        changed = False
         # search for a newline, consider it and all contiguous remaining
         # whitespace (including other newlines) to be a single paragraph break.
         for transform_token_index, token in enumerate(tokens):
 
             # replace the token we're transforming
             tokens[transform_token_index] = paragraph_token
+            changed = True
             # insert a token to its left if it is nonempty
             if left_token:
                 tokens.insert(transform_token_index, left_token)
             if right_token:
                 tokens.insert(transform_token_index + 1, right_token)
             logging.debug(filter(None, [left_token, paragraph_token, right_token]))
+        return changed
 
 
 class DotSplitRule(Rule):
 
 
     def apply(self, tokens):
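+        """Handle tokens containing periods: normalize or split known
+        abbreviations, and split other tokens at the period. Return
+        True on any change."""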
+        changed = False
         for i, token in enumerate(tokens):
             if token.non_printing or not '.' in token.str:
                 continue
                 if token.is_abbrev:
                     # token is an abbreviation
                     if abbrev.normal_form and token.str != abbrev.normal_form:
-                        # but it differs from the proper form of the abbrev
+                        # but it differs from the proper form of the
+                        # abbrev
                         token.str = abbrev.normal_form
+                        changed = True
                 else:
                     # token starts with an abbreviation and should be split
                     self._transform_abbrev(tokens, i, abbrev, abbrev_len)
+                    changed = True
             elif len(token.str) > 1:
                 # length check so we don't try to split '.'
-                split_token_at(tokens, i, u'.', True)
+                split_token_at_delim(tokens, i, u'.', True)
+                changed = True
+        return changed
 
 
 class EuroDelimiterRule(Rule):
             " delimiters '1.234.567,89' to American style '1,234,567.89'.")
 
     def apply(self, tokens):
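+        """Convert European-style number delimiters to American
+        style; return True if any token was rewritten."""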
+        changed = False
         for token in tokens:
             if token.non_printing or token.is_URL:
                 continue
                 replacement = replacement.replace(u'.', u',')
                 replacement = replacement.replace(u'x', u'.')
                 token.str = replacement
+                changed = True
+        return changed
 
 
 class PunctSplitRule(Rule):
                       "Separate punctuation (other than periods)"
                       " into separate tokens.")
 
-    def get_transforms(self, tokens):
-        """Return an array of transform objects."""
-        self.tokens = tokens
-        transforms = []
+    def apply(self, tokens):
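+        """Split punctuation (other than periods) into separate
+        tokens; return True if any token was split."""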
+        changed = False
         for token in tokens:
             if token.non_printing or len(token.str) < 2 or token.is_URL:
                 continue
                         token.str[i + 1:] in PunctSplitRule._contraction_endings):
                             continue
 
-                    # Create a transform to split the token at this
-                    # point.
+                    # Split the token at this point.
                     logging.debug(u"PunctSplitRule '{}' at {}".format(
                             token.str, i))
-                    transforms.append(
-                        tr.IndexSplitTransform(self,
-                                               [token],
-                                               index=i,
-                                               three_way=True))
+                    split_token_at_index(tokens, tokens.index(token), i, True)
+                    changed = True
                     break
-        return transforms
-
+        return changed
+
 
 class AlphaNumSplitRule(Rule):
     """Split alphanumeric sequences.
                       "Split conjoined words and "
                       "numbers into separate tokens.")
 
-    def get_transforms(self, tokens):
-        self.tokens = tokens
-        transforms = []
-
+    def apply(self, tokens):
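+        """Split conjoined words and numbers into separate tokens;
+        return True if any token was split."""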
         # | case | input     | output     |
         # |------+-----------+------------|
         # |    1 | 10am      | 10 am      |
         # |   11 | 43rd      | 43rd       |
         # |   12 | 1,200.    | 1,200 .    |
         # |   13 | 1,500.00  | 1,500.00   |
-
+        changed = False
         for token in tokens:
             # skip non-printing, URL, and short tokens
             if token.non_printing or len(token.str) < 2 or token.is_URL:
             if mo:
                 logging.debug(u"AlphaNumSplitRule '{}' at {}".format(
                         token.str, mo.start()))
-                transform = tr.IndexSplitTransform(self, [token],
-                                                   index=mo.start(),
-                                                   three_way=False)
-                transforms.append(transform)
-        return transforms
+                split_token_at_index(tokens,
+                                     tokens.index(token),
+                                     mo.start(),
+                                     False)
+                changed = True
+        return changed
 
 
 class SpellDigitsRule(Rule):
         Rule.__init__(self, 80,
                       "Spell out single digit numbers.")
 
-    def get_transforms(self, tokens):
-        """Return an array of transform objects."""
-        self.tokens = tokens
-        transforms = []
-
+    def apply(self, tokens):
+        """Return True if any tokens are converted from digits to
+        spelled numbers."""
+        changed = False
         for i, token in enumerate(tokens):
             # ignore tokens that aren't single digits
             if (token.non_printing or
                 continue
 
             # ok to spell out the digit; look up spelling
-            spelled_digit = SpellDigitsRule.spelled_digits[digit_value - 1]
-            # we'll use the generic attribute-changing transform,
-            # which takes an AttrChange list as a keyword argument
-            attr_change = AttrChange('str', spelled_digit)
-            transform = tr.SetAttrTransform(self,
-                                            [token],
-                                            attr_changes=[attr_change])
-            transforms.append(transform)
-
-        # if transforms:
-        #     logging.debug('SpellDigitsRule returning %d', len(transforms))
-        return transforms
+            token.str = SpellDigitsRule.spelled_digits[digit_value - 1]
+            changed = True
+        return changed
 
 
 class DelimitThousandsRule(Rule):
         Rule.__init__(self, 90,
                       "Format numbers which express amounts of currency.")
 
-    def get_transforms(self, tokens):
-        self.tokens = tokens
-        transforms = []
+    @staticmethod
+    def _digit_group_callback(match_obj):
+        return match_obj.group(1) + ',' + match_obj.group(2)
+
+    def _separate_thousands(self, token):
+        """Insert comma delimiters between thousands groups in
+        `token.str`; return True if the string was changed."""
+        changed = False
+        new_str = re.sub(ur'(\d)(\d{3})\b',
+                         DelimitThousandsRule._digit_group_callback,
+                         token.str)
+        while new_str != token.str:
+            changed = True
+            token.str = new_str
+            new_str = re.sub(ur'(\d)(\d{3})\b',
+                             DelimitThousandsRule._digit_group_callback,
+                             token.str)
+        return changed
+
+    def apply(self, tokens):
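+        """Add comma delimiters to numbers adjacent to a currency
+        term; return True if any number was reformatted."""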
+        changed = False
         for i, token in enumerate(tokens):
             if token.non_printing:
                 continue
             # symbol, ISO currency abbreviation, or term?
             if (i > 0 and tokens[i - 1].is_currency_term or
                 i < len(tokens) - 1 and  tokens[i + 1].is_currency_term):
-                transforms.append(tr.SeparateThousandsTransform(self, [token]))
-        # if transforms:
-        #     logging.debug('DelimitThousandsRule returning %d',
-        #                   len(transforms))
-        return transforms
+                if self._separate_thousands(token):
+                    changed = True
+        return changed
 
 
 class AccreteNumbersRule(Rule):
     """Merge consecutive tokens that together make a formatted decimal number.
     """
-
     phase = INITIAL_PHASE
 
     mergeable_number_re = re.compile(
         re.UNICODE | re.VERBOSE)
 
     def __init__(self):
-        Rule.__init__(self, 100,
-                      "Remove spaces from numbers.")
+        Rule.__init__(self, 100, "Remove spaces from numbers.")
 
-    def get_transforms(self, tokens):
-        self.tokens = tokens
-        transforms = []
+    def _concat(self, tokens, tokens_to_merge):
+        """Merge `tokens_to_merge` into its first token and remove
+        the other merged tokens from the main `tokens` list."""
+        # first get the concatenated string
+        new_str = u''.join(token.str for token in tokens_to_merge)
+        # the first token in the series gets the new string
+        tokens_to_merge[0].str = new_str
+        # the remaining merged tokens are removed from the token list
+        for token in tokens_to_merge[1:]:
+            tokens.remove(token)
+
+    def apply(self, tokens):
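+        """Merge consecutive tokens that together form one formatted
+        decimal number; return True if any tokens were merged."""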
+        changed = False
         for i, token in enumerate(tokens):
             if token.non_printing or not token.has_digits:
                 continue
             # optionally containing delimiting commas and a decimal
             # point.
             if AccreteNumbersRule.mergeable_number_re.match(right.str):
-                transforms.append(tr.ConcatenateTransform(
-                        self, [token, right]))
+                self._concat(tokens, [token, right])
+                changed = True
                 break
             # can also merge if next token (right) is a comma and the
             # one that follows it is a mergeable number
             if right.str == u',':
                 right2 = get_right_neighbor(tokens, i + 1)
                 if AccreteNumbersRule.mergeable_number_re.match(right2.str):
-                    transforms.append(
-                        tr.ConcatenateTransform(self, [token, right, right2]))
-        return transforms
+                    self._concat([token, right, right2])
+                    changed = True
+        return changed
 
 
 class CurrencyOrderRule(Rule):
             self, 110,
             "Put currency abbreviations after the numbers they describe")
 
-    def get_transforms(self, tokens):
-        self.tokens = tokens
-        transforms = []
+    def apply(self, tokens):
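+        """Move an ISO currency abbreviation after the number it
+        describes; return True if a swap was made."""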
+        changed = False
         for i, token in enumerate(tokens):
             if token.non_printing or not token.is_ISO_currency:
                 continue
             # token is a number, swap them, UNLESS the previous token is
             # also a number.
             left, right = get_neighbors(tokens, i)
-            if right and right.is_delimited_decimal:
+            if right and right.has_digits:
                 if not left or not left.is_delimited_decimal:
-                    transforms = [tr.SwapTransform(self, [token, right])]
+                    token.str, right.str = right.str, token.str
+                    changed = True
                     break
-        # if transforms:
-        #     logging.debug('CurrencyOrderRule returning %d', len(transforms))
-        return transforms
+        return changed
 
 
 def currency_name_match(s1, s2):
             self, 120,
             "Spell out the first occurrence of an ISO currency abbreviation.")
 
-    def get_transforms(self, tokens):
-        self.tokens = tokens
-        transforms = []
+    def apply(self, tokens):
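+        """Expand the first ISO currency abbreviation found; amounts
+        given as 'USD' are rewritten with a '$' sign. Return True on
+        any change."""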
+        changed = False
         for i, token in enumerate(tokens):
             if token.non_printing:
                 continue
-            # If the transform returned by this code has already
-            # processed it, it will have a flag set on it.
+            # If this rule has already expanded a currency
+            # abbreviation, that token will have a flag set on it.
             if getattr(token, 'ISO_currency_expanded', False):
+                # exit because the currency abbreviation should only be
+                # expanded once.
                 break
 
             # if token doesn't match a 3-letter ISO currency
                 currency_name_match(prev_alpha_token.str, name_words[-1]) or
                 next_alpha_token and
                 currency_name_match(next_alpha_token.str, name_words[0])):
-                set_expanded = AttrChange('ISO_currency_expanded', True)
-                transforms = [tr.SetAttrTransform(self,
-                                                  [token],
-                                                  attr_changes=[set_expanded])]
+                token.ISO_currency_expanded = True
+                changed = True
                 break
 
             if token.str.upper() == u'USD':
                 # 'next_token' has digits in it. If both have digits, do
                 # nothing because it's ambiguous which one the 'USD'
                 # describes.
-                #
-                # If the
                 if prev_token and prev_token.has_digits:
                     if next_token and next_token.has_digits:
                         continue
-                    transforms = [tr.USCurrencyTransform(self, [token])]
+                    token.str = u'$' + prev_token.str
+                    del tokens[i - 1]
+                    changed = True
                 else:
                     # swap them first
-                    transforms = [tr.SwapTransform(self, [prev_token, token])]
+                    prev_token.str, token.str = token.str, prev_token.str
+                    changed = True
             else:
-                # instantiate a transform to spell out the abbreviation.
-                transforms = [tr.ISOCurrencyTransform(self, [token])]
-
+                # this must be set to avoid infinite loop
+                token.ISO_currency_expanded = True
+                # insert tokens for the expanded currency abbreviation
+                # and a left paren before the existing abbreviation
+                # token.
+                for new_token in map(Token, pycountry.currencies.get(
+                        letter=token.str.upper()).name.split()):
+                    tokens.insert(i, new_token)
+                    i += 1
+                tokens.insert(i, Token(u'('))
+                tokens.insert(i + 2, Token(u')'))
+                changed = True
             # Exit the loop here because only the first abbreviation
             # should be spelled out.
             break
-        # if transforms:
-        #     logging.debug('ISOCurrencyRule returning %d', len(transforms))
-        return transforms
-
+        return changed
+
 
 class SentenceDelimitRule(Rule):
     """Insert delimiter tokens between beginning and end of sentences.
             self, 130,
             "Surround every sentence with a sentence-delimiter token.")
 
-    def get_transforms(self, tokens):
+    def apply(self, tokens):
         """Return a transform that will insert the delimiter tokens.
 
         This rule is only intended to run once. It will disable itself
         after the first run. If it detects any pre-existing sentence
         delimiter tokens, it will return an empty list.
         """
-        self.tokens = tokens
-
+        changed = False
         # do nothing if this rule has ever been run.
         for token in tokens:
             if token.non_printing:
                 continue
             if token.sentence_delim:
                 self.enabled = False
-                return []
-        return []
+                break
+        return changed
 
 
 class YearOldRule(Rule):
             "Fix incorrect plural and lack of hyphens in phrases "
             "like '20 years old man'.")
 
-    def get_transforms(self, tokens):
-        """Return a transform that will fix age phrases.
-        """
-        self.tokens = tokens
-
-        return []
-
-if __name__ == '__main__':
-    from mytoken import Token
-    s = u'1.234.567,89'
-    ws_rule = WhitespaceSplitRule()
-    tokens = [Token(s, 0, len(s))]
-    ws_rule.apply(tokens)
-    EuroDelimiterRule().apply(tokens)
-    print tokens
+    def apply(self, tokens):
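+        """Not yet implemented; makes no changes to the tokens."""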
+        return False

File tokensearch.py

         return inst_rep
 
     def apply(self, tokens):
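+        """Search the token list for this object's pattern and, on
+        the first match, substitute the replacement; return True if a
+        replacement was made."""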
+        changed = False
         search_expr_idx = 0
         match_groups = []
         for i, token in enumerate(tokens):
                               i - (len(self._regexp) - 1),
                               len(self._regexp),
                               self._instantiate_replacements(match_groups))
+                changed = True
                 break
+        return changed
 
 
 def get_levenshtein_dist(source_tokens, target_strings):