Commits

david_walker  committed 39769dd

revamp of rules.py continues

  • Participants
  • Parent commits 8b06e29
  • Branches parse

Comments (0)

Files changed (3)

File base.py

     # for Python 3 use class Rule(metaclass=ABCMeta):
     __metaclass__ = ABCMeta
 
-    def __init__(self, rule_id, score, description, enabled=True):
+    def __init__(self, rule_id, description, enabled=True):
         """Initialize descriptive parameters.
 
         Arguments:
         - `rule_id`: a globally unique integer.
 
-        - `score`: a value from 0.0 to 1.0 inclusive. It is meant to
-          be determined by supervised learning techniques.
-
         - `description`: a human-readable explanation of what this
           rule does.
 
           should function correctly regardless of this value.
         """
         self.rule_id = rule_id
-        self.score = score
         self.description = description
         self.enabled = enabled
 
         return self.__class__.__name__
 
     @abstractmethod
-    def get_transforms(self, edit_assistant):
-        """Return a (possibly empty) list of Transform objects which
-        would carry out the changes to `edit_assistant.tokens`
-        associated with this rule.
-        """
-        return []
+    def apply(self, tokens):
+        """Apply this rule to `tokens`. """
+        pass
 
 
 class Transform():

File rules.py

 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 """
-Rule and Transform derived classes.
+Classes derived from Rule.
 """
 import re
 import sys
 import unicodedata
 from base import AttrChange, Rule
 import transforms as tr
+from tokensearch import TokenSearchByRegexp
+from mytoken import Token
+import logging
 
 INITIAL_PHASE = 0
 POS_PHASE = 1
             get_right_neighbor(lst, i, attrib_name))
 
 
-def find_token_pattern(tokens, search_exprs):
-    """Return the index of the first occurrence of a sequence of tokens
-    matching the search expressions in `search_exprs`.
+def split_token_at(tokens, transform_token_index, delim, keep_delim):
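+    """Split the token at `tokens[transform_token_index]` on the
+    character `delim`, modifying `tokens` in place. If `keep_delim` is
+    true, each delimiter character becomes a token of its own. For
+    example, splitting u'a.b' on u'.' with keep_delim=True yields the
+    three tokens u'a', u'.', and u'b'.
+    """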
+    token_to_transform = tokens[transform_token_index]
+    split_tokens = []
+    split_str = u''
+    for i, c in enumerate(token_to_transform.str):
+        if c == delim:
+            # if delimiter has just occurred after one or more
+            # non-delimiter characters, add those characters as
+            # a new token.
+            if split_str:
+                cbegin = token_to_transform.cbegin + i - len(split_str)
+                cend = cbegin + len(split_str)
+                split_tokens.append(Token(split_str, cbegin, cend))
+                split_str = u''
+            # if we're keeping delimiter characters, make a new
+            # token for it.
+            if keep_delim:
+                cbegin = token_to_transform.cbegin + i
+                split_tokens.append(Token(c, cbegin, cbegin + 1))
+        else:
+            split_str += c
+    if split_str:
+        cbegin = token_to_transform.cbegin + i + 1 - len(split_str)
+        cend = cbegin + len(split_str)
+        split_tokens.append(Token(split_str, cbegin, cend))
+    if len(split_tokens) > 1:
+        # insert all but the last of split_tokens before the token at
+        # transform_token_index, then make the token at
+        # transform_token_index look like the last token in split_tokens.
+        for i, token_to_insert in enumerate(split_tokens[:-1]):
+            tokens.insert(transform_token_index + i, token_to_insert)
+        transform_token_index += i + 1
+        tokens[transform_token_index].str = split_tokens[-1].str
+        tokens[transform_token_index].cbegin = split_tokens[-1].cbegin
+        tokens[transform_token_index].cend = split_tokens[-1].cend
+    elif len(split_tokens) == 1:
+        # the delimiter occurred only at the start and/or end of the
+        # token; update the token in place to the stripped version.
+        tokens[transform_token_index].str = split_tokens[0].str
+        tokens[transform_token_index].cbegin = split_tokens[0].cbegin
+        tokens[transform_token_index].cend = split_tokens[0].cend
 
-    Arguments:
-    tokens -- a list of Token objects
-    search_exprs -- a list of TokenSearch objects
-    """
+
+class WhitespaceSplitRule(Rule):
+    phase = INITIAL_PHASE
+
+    def __init__(self):
+        Rule.__init__(self, 5, "Separate text by whitespace.")
+
+    def apply(self, tokens):
+        for i, token in enumerate(tokens):
+            if token.is_para or token.non_printing:
+                continue
+            if ' ' in token.str:
+                split_token_at(tokens, i, ' ', False)
 
 
 class RegexCleanupRule(Rule):
         TokenSearchByRegexp(u'a month for these activities', u'a month from it'),
         TokenSearchByRegexp(u'comes from buying and selling of', u'comes from selling'),
         TokenSearchByRegexp(u'engage in business activities', u'do business'),
-        TokenSearchByRegexp(u"improve/expand (the borrower's|his|her) business",
-         u'improve and expand it'),
+        #TokenSearchByRegexp([u"improve/expand", u"(the borrower's|his|her)", "business"], u'improve and expand it'),
         TokenSearchByRegexp(ur'fellowship\* meeting', u'fellowship meeting*'),
         TokenSearchByRegexp(u'clicking the link to the NWTF Kiva lending team',
          ur'clicking the link to the '
         ]
 
     def __init__(self):
-        Rule.__init__(self, 10, 1.0, "Search and replace specific strings")
+        Rule.__init__(self, 10, "Search and replace specific strings")
 
     def apply(self, tokens):
         for ts in RegexCleanupRule.regex_pairs:
     LINEBREAK_RE = re.compile(r'(\s*)(\n+)(\s*)')
 
     def __init__(self):
-        Rule.__init__(self, 20, 1.0,
+        Rule.__init__(self, 20,
                       "Separate text into paragraphs at line breaks.")
 
-    def get_transforms(self, tokens):
-        self.tokens = tokens
-        transforms = []
+    def apply(self, tokens):
         # search for a newline, consider it and all contiguous remaining
         # whitespace (including other newlines) to be a single paragraph break.
-        for token in tokens:
+        for transform_token_index, token in enumerate(tokens):
             if token.is_para or token.non_printing:
                 continue
             mo = ParagraphRule.LINEBREAK_RE.search(token.str)
-            if mo:
-                # Found a linebreak; create a transform that would
-                # break the token at the linebreak into two halves,
-                # inserting a paragraph token between them.
-                # Note there might be other paragraph breaks beyond
-                # the first one. They will be addressed when the main
-                # loop applies this rule again.
-                new_transform = tr.ParagraphTransform(self, [token],
-                                                      match_obj=mo)
-                transforms.append(new_transform)
-        # if transforms:
-        #     logging.debug('ParagraphRule returning %d', len(transforms))
-        return transforms
-
-
-class WhitespaceSplitRule(Rule):
-    phase = INITIAL_PHASE
-
-    def __init__(self):
-        Rule.__init__(self, 5, 1.0, "Separate text by whitespace.")
-
-    def get_transforms(self, tokens):
-        self.tokens = tokens
-        transforms = []
-        transform_tokens = []
-        for token in tokens:
-            # skip paragraph tokens as their string is '\n', which
-            # would be deleted by a split().
-            if token.is_para or token.non_printing:
+            if not mo:
                 continue
 
-            if ' ' in token.str:
-                transform_tokens.append(token)
+            # Found a linebreak. Break the token at the linebreak into
+            # two halves, inserting a paragraph token between them. Note
+            # there might be other paragraph breaks beyond the first
+            # one. They will be addressed when the main loop applies
+            # this rule again.
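+            # (For example, a single token u'one\ntwo' becomes three
+            # tokens: u'one', a paragraph token u'\n', and u'two'.)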
 
-        if transform_tokens:
-            transforms = [
-                tr.CharSplitTransform(self,
-                                      transform_tokens,
-                                      delimiter_char=u' ',
-                                      keep_delimiter=False)]
-        return transforms
+            left_str = token.str[:mo.start()]
+            right_str = token.str[mo.end():]
+            left_token = None
+            right_token = None
+
+            if left_str:
+                if right_str:
+                    # token.str has the form left_str + '\n' +
+                    # right_str, where left_str is some non-empty string
+                    # that doesn't contain a newline and right_str is
+                    # some non-empty string.
+                    left_cend = token.cbegin + len(left_str)
+                    left_token = Token(left_str, token.cbegin, left_cend)
+                    right_cbegin = token.cbegin + mo.end()
+                    right_token = Token(right_str, right_cbegin,
+                                        right_cbegin + len(right_str))
+                else:
+                    # token == left_str + '\n'
+                    left_token = Token(left_str, token.cbegin,
+                                       token.cbegin + len(left_str))
+                paragraph_token = Token(u'\n', left_token.cend,
+                                        left_token.cend + 1)
+            else:
+                if right_str:
+                    # token == '\n' + right_str
+                    right_cbegin = token.cbegin + mo.end()
+                    right_token = Token(right_str, right_cbegin,
+                                        right_cbegin + len(right_str))
+                else:
+                    # token == two or more '\n'
+                    pass
+                paragraph_token = Token(u'\n', token.cbegin, token.cbegin + 1)
+
+            # replace the token we're transforming
+            tokens[transform_token_index] = paragraph_token
+            # insert a token to its left if it is nonempty
+            if left_token:
+                tokens.insert(transform_token_index, left_token)
+                # that insertion has made transform_token_index off by
+                # one; correct it.
+                transform_token_index += 1
+            # now insert the token to the right, if it is nonempty
+            if right_token:
+                tokens.insert(transform_token_index + 1, right_token)
+            logging.debug(filter(None, [left_token, paragraph_token, right_token]))
 
 
 class DotSplitRule(Rule):
       will convert this to american style)
     - a decimal like 1.5 or .5 (latter should be changed to 0.5)
     - an abbreviation
-
     """
 
     phase = INITIAL_PHASE
 
     def __init__(self):
-        Rule.__init__(self, 40, 1.0,
+        Rule.__init__(self, 40,
                       "Separate periods from words that aren't abbreviations.")
 
-    def get_transforms(self, tokens):
-        self.tokens = tokens
-        transforms = []
-        transform_tokens = []
-        for token in tokens:
+    def _transform_abbrev(self, tokens, transform_token_index, abbrev,
+                          abbrev_len):
+        token_to_transform = tokens[transform_token_index]
+        token_to_transform.abbrev = True
+
+        # If the abbreviation matches the entire token, it does not need
+        # to be split into multiple tokens, so update it in place.
+        if abbrev_len == len(token_to_transform.str):
+            if abbrev.normal_form:
+                logging.debug(u'%s => %s', token_to_transform,
+                              abbrev.normal_form)
+                token_to_transform.str = abbrev.normal_form
+            return
+
+        # The abbreviation matches just the start of the token. Make a
+        # new token to insert after the abbreviation.
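+        # (For example, if u'Mr.' is a known abbreviation, a token
+        # u'Mr.Smith' becomes u'Mr.' followed by a new token u'Smith'.)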
+        if abbrev.normal_form:
+            abbrev_part = abbrev.normal_form
+        else:
+            abbrev_part = token_to_transform.str[:abbrev_len]
+        extra_part = token_to_transform.str[abbrev_len:]
+
+        # Now modify the existing token, and create a new one to insert
+        # after it.
+        logging.debug(token_to_transform)
+        token_to_transform.str = abbrev_part
+        token_to_transform.cend = (token_to_transform.cbegin +
+                                   len(abbrev_part))
+        post_abbrev_token = Token(extra_part, token_to_transform.cend,
+                                  token_to_transform.cend + len(extra_part))
+        logging.debug('=> {}, {}'.format(token_to_transform,
+                                         post_abbrev_token))
+        tokens.insert(transform_token_index + 1, post_abbrev_token)
+
+    def apply(self, tokens):
+        for i, token in enumerate(tokens):
             if token.non_printing or not '.' in token.str:
                 continue
             # Token has a dot in it somewhere. Leave it alone if it's
                     # token is an abbreviation
                     if abbrev.normal_form and token.str != abbrev.normal_form:
                         # but it differs from the proper form of the abbrev
-                        attr_change = AttrChange('str', abbrev.normal_form)
-                        transform = tr.SetAttrTransform(
-                            self, [token], attr_changes=[attr_change])
-                        transforms.append(transform)
+                        token.str = abbrev.normal_form
                 else:
                     # token starts with an abbreviation and should be split
-                    transforms.append(
-                        tr.AbbrevTransform(self, [token],
-                                           matched_abbrev=abbrev,
-                                           abbrev_match_len=abbrev_len))
+                    self._transform_abbrev(tokens, i, abbrev, abbrev_len)
             elif len(token.str) > 1:
                 # length check so we don't try to split '.'
-                transform_tokens.append(token)
-
-        if transform_tokens:
-            transforms += [tr.CharSplitTransform(self,
-                                                 transform_tokens,
-                                                 delimiter_char=u'.',
-                                                 keep_delimiter=True)]
-        # if transforms:
-        #     logging.debug("DotSplitRule returning %d", len(transforms))
-        return transforms
+                split_token_at(tokens, i, u'.', True)
 
 
 class EuroDelimiterRule(Rule):
 
     def __init__(self):
         Rule.__init__(
-            self, 50, 1.0, "Convert European style thousands"
+            self, 50, "Convert European style thousands"
             " delimiters '1.234.567,89' to American style '1,234,567.89'.")
 
-    def get_transforms(self, tokens):
-        self.tokens = tokens
-        transforms = []
+    def apply(self, tokens):
         for token in tokens:
             if token.non_printing or token.is_URL:
                 continue
                 replacement = token.str.replace(u',', u'x')
                 replacement = replacement.replace(u'.', u',')
                 replacement = replacement.replace(u'x', u'.')
-                new_str_attr = AttrChange('str', replacement)
-                transforms.append(tr.SetAttrTransform(
-                        self, [token], attr_changes=[new_str_attr]))
-        return transforms
+                token.str = replacement
 
 
 class PunctSplitRule(Rule):
 
     def __init__(self):
         """Set rule priority and name. """
-        Rule.__init__(self, 60, 1.0,
+        Rule.__init__(self, 60,
                       "Separate punctuation (other than periods)"
                       " into separate tokens.")
 
     phase = INITIAL_PHASE
 
     def __init__(self):
-        Rule.__init__(self, 70, 1.0,
+        Rule.__init__(self, 70,
                       "Split conjoined words and "
                       "numbers into separate tokens.")
 
 
     def __init__(self):
         """Set rule priority and name. """
-        Rule.__init__(self, 80, INITIAL_PHASE,
-                      1.0, "Spell out single digit numbers.")
+        Rule.__init__(self, 80,
+                      "Spell out single digit numbers.")
 
     def get_transforms(self, tokens):
         """Return an array of transform objects."""
     _splittable_number_re = re.compile(ur'^[1-9][0-9]{4,}(\.[0-9]{2})?$', re.U)
 
     def __init__(self):
-        Rule.__init__(self, 90, 1.0,
+        Rule.__init__(self, 90,
                       "Format numbers which express amounts of currency.")
 
     def get_transforms(self, tokens):
         re.UNICODE | re.VERBOSE)
 
     def __init__(self):
-        Rule.__init__(self, 100, 1.0,
+        Rule.__init__(self, 100,
                       "Remove spaces from numbers.")
 
     def get_transforms(self, tokens):
 
     def __init__(self):
         Rule.__init__(
-            self, 110, 1.0,
+            self, 110,
             "Put currency abbreviations after the numbers they describe")
 
     def get_transforms(self, tokens):
 
     def __init__(self):
         Rule.__init__(
-            self, 120, 1.0,
+            self, 120,
             "Spell out the first occurrence of an ISO currency abbreviation.")
 
     def get_transforms(self, tokens):
 
     def __init__(self):
         Rule.__init__(
-            self, 130, 1.0,
+            self, 130,
             "Surround every sentence with a sentence-delimiter token.")
 
     def get_transforms(self, tokens):
 
     def __init__(self):
         Rule.__init__(
-            self, 200, 1.0,
+            self, 200,
             "Fix incorrect plural and lack of hyphens in phrases "
             "like '20 years old man'.")
 
         self.tokens = tokens
 
         return []
+
+if __name__ == '__main__':
+    from mytoken import Token
+    s = u'1.234.567,89'
+    ws_rule = WhitespaceSplitRule()
+    tokens = [Token(s, 0, len(s))]
+    ws_rule.apply(tokens)
+    EuroDelimiterRule().apply(tokens)
+    print tokens
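+    # A further sketch, assuming mytoken.Token exposes the is_para and
+    # non_printing attributes the rules above rely on: ParagraphRule
+    # should split a token containing a newline into three tokens.
+    s = u'end of one paragraph\nstart of the next'
+    para_tokens = [Token(s, 0, len(s))]
+    ParagraphRule().apply(para_tokens)
+    print para_tokens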

File tokensearch.py

         if isinstance(regexp, basestring):
             self._regexp = map(re.compile, regexp.split())
         else:
-            self._regexp = regexp
+            self._regexp = map(re.compile, regexp)
         if isinstance(replace, basestring):
             self._replace = replace.split()
         else: