Commits

david_walker committed fef6cf0

add splitting of alphanumeric tokens

Files changed (3)

             if val:
                 if len(r) > 1:
                     r += u' '
-                r += u'{}: {}'.format(key, val)
+                if key == '_str':
+                    r += u'{}: "{}"'.format(key, val)
+                else:
+                    r += u'{}: {}'.format(key, val)
         r += u'>'
         return r
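
This quoting change only affects the debug representation, but it matters
once splitting rules can produce token strings containing spaces: an
unquoted _str value leaves token boundaries ambiguous. A sketch of the
intended effect (the exact attribute set shown in the <...> output is an
assumption):

    # before: <Token _str: a. m.>      -- one token or two?
    # after:  <Token _str: "a. m.">    -- quotes make the boundary explicit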
 
 
         if transform_tokens:
             transforms = [
-                tr.SplitTransform(self, transform_tokens, split_re=ur'\s+')]
+                tr.RegexSplitTransform(self,
+                                       transform_tokens,
+                                       split_re=ur'\s+')]
 
         if transforms:
             logging.debug('WhitespaceSplitRule returning %d', len(transforms))
                 transform_tokens.append(token)
 
         if transform_tokens:
-            transforms += [tr.SplitTransform(self,
-                                             transform_tokens,
-                                             split_re=ur'(\.)')]
+            transforms += [tr.RegexSplitTransform(self,
+                                                  transform_tokens,
+                                                  split_re=ur'(\.)')]
         if transforms:
         splittables = filter(PunctSplitRule.is_splittable, tokens)
         if splittables:
             transforms.append(
-                tr.SplitTransform(self, splittables,
-                                  split_re=PunctSplitRule.punct_re))
+                tr.RegexSplitTransform(self, splittables,
+                                       split_re=PunctSplitRule.punct_re))
         if transforms:
             logging.debug("PunctSplitRule returning %d", len(transforms))
             not token.is_formatted_decimal_number
 
 
+class AlphaNumSplitRule(Rule):
+    """Split alphanumeric sequences.
+    """
+
+    def __init__(self):
+        Rule.__init__(self, 6, 1.0,
+                      "Split conjoined words and "
+                      "numbers into separate tokens.")
+
+    def get_transforms(self, tokens):
+        self.tokens = tokens
+        transforms = []
+
+        # | input     | output     |
+        # |-----------+------------|
+        # | 10am      | 10 am      |
+        # | 10.00am   | 10.00 am   |
+        # | 10:00am   | 10:00 am   |
+        # | 10:00a.m. | 10:00 a.m. |
+        # | 500foo    | 500 foo    |
+        # | bar200    | bar 200    |
+        # | ksh.1000  | ksh. 1000  |
+
+        for token in tokens:
+            # skip non-printing tokens
+            if not token.str:
+                continue
+            # if it starts with an alpha char, split at first digit.
+            # if it starts with a digit, split at first alpha
+            mo = None
+            if token.str[0].isalpha():
+                mo = re.search(ur'\d', token.str)
+            elif token.str[0].isdigit():
+                mo = re.search(ur'[a-zA-Z]', token.str)
+            if mo:
+                transform = tr.IndexSplitTransform(self, [token],
+                                                   index=mo.start())
+                transforms.append(transform)
+        return transforms
+
+
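
A minimal end-to-end sketch of the new rule, assuming only what the diff
shows: Token wraps a plain string in a str attribute, and each transform's
apply() mutates and returns the rule's token list:

    rule = AlphaNumSplitRule()
    tokens = map(Token, [u'meet', u'10am', u'ksh.1000'])
    for transform in rule.get_transforms(tokens):
        tokens = transform.apply()
    print u' '.join(t.str for t in tokens)
    # expected output: meet 10 am ksh. 1000
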
 class SpellDigitsRule(Rule):
     """Spell out numbers 1..9.
 
 
     def __init__(self):
         """Set rule priority and name. """
-        Rule.__init__(self, 6, 1.0, "Spell out single digit  numbers.")
+        Rule.__init__(self, 7, 1.0, "Spell out single-digit numbers.")
 
     def get_transforms(self, tokens):
         """Return an array of transform objects."""
     """
 
     def __init__(self):
-        Rule.__init__(self, 7, 1.0,
+        Rule.__init__(self, 8, 1.0,
                       "Format numbers which express amounts of currency.")
 
     def get_transforms(self, tokens):
     associated numbers."""
     def __init__(self):
         Rule.__init__(
-            self, 7, 1.0,
+            self, 9, 1.0,
             "Put currency abbreviations after the numbers they describe")
 
     def get_transforms(self, tokens):
 
     def __init__(self):
         Rule.__init__(
-            self, 8, 1.0,
+            self, 10, 1.0,
             "Spell out the first occurrence of an ISO currency abbreviation.")
 
     def get_transforms(self, tokens):
             #   100 PHP (Philippine peso)
             #   100 Philippine peso (PHP)
             prev_token, next_token = get_neighbors(tokens, i, 'is_alpha')
-            currency_name = pycountry.currencies.get(letter=token.str.upper()).name
+            currency_name = pycountry.currencies.get(
+                letter=token.str.upper()).name
             name_words = currency_name.split()
             if (prev_token and currency_name_match(prev_token.str, name_words[-1]) or
                 next_token and currency_name_match(next_token.str, name_words[0])):
         return self.rule.tokens
 
 
-class SplitTransform(Transform):
+class RegexSplitTransform(Transform):
     """Split tokens on designated character."""
 
     def __init__(self, rule, token, **kwargs):
         Transform.__init__(self, rule, token, **kwargs)
         self._split_re = kwargs.get('split_re')
 
     def apply(self):
-        logging.debug('>SplitTransform %s', self.tokens_to_transform)
+        logging.debug('>RegexSplitTransform %s', self.tokens_to_transform)
         for token_to_transform in self.tokens_to_transform:
             # find the index within the full list of tokens of the
             # token we are to transform
             split_tokens = map(Token, split_strings)
             # put it all back together
             self.rule.tokens = left_of_split + split_tokens + right_of_split
-        logging.debug('<SplitTransform %s', self.rule.tokens)
+        logging.debug('<RegexSplitTransform %s', self.rule.tokens)
+        return self.rule.tokens
+
+
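
For reference, both split_re values passed to RegexSplitTransform above rely
on a standard re.split property: a pattern with a capturing group keeps the
delimiter in the output, while a groupless pattern discards it. A standalone
check:

    import re

    # groupless pattern: the whitespace separator is dropped
    print re.split(ur'\s+', u'10 am')    # [u'10', u'am']

    # capturing group: each period survives as its own piece
    print re.split(ur'(\.)', u'a.m.')    # [u'a', u'.', u'm', u'.', u'']
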
+class IndexSplitTransform(Transform):
+    """Split tokens at designated character index."""
+
+    def __init__(self, rule, token, **kwargs):
+        Transform.__init__(self, rule, token, **kwargs)
+        self._index = kwargs.get('index')
+        assert self._index > 0
+
+    def apply(self):
+        logging.debug('>IndexSplitTransform %s', self.tokens_to_transform)
+        for token_to_transform in self.tokens_to_transform:
+            # find the index within the full list of tokens of the
+            # token we are to transform
+            transform_token_index = self.rule.tokens.index(token_to_transform)
+            # split the token string at the supplied character index
+            left = token_to_transform.str[:self._index]
+            right = token_to_transform.str[self._index:]
+            # insert a new token holding the left part of the split at
+            # the transformed token's position; the transformed token
+            # itself keeps the right part
+            self.rule.tokens.insert(transform_token_index, Token(left))
+            token_to_transform.str = right
+        logging.debug('<IndexSplitTransform %s', self.rule.tokens)
         return self.rule.tokens
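
And a standalone check of the index-based split. _StubRule is purely
illustrative (real rules carry more state), and the Transform base
__init__ is assumed to record its arguments as self.rule and
self.tokens_to_transform, since apply() relies on both:

    class _StubRule(object):
        def __init__(self, tokens):
            self.tokens = tokens

    tokens = [Token(u'meet'), Token(u'10am')]
    transform = IndexSplitTransform(_StubRule(tokens), [tokens[1]], index=2)
    print [t.str for t in transform.apply()]
    # expected output: [u'meet', u'10', u'am']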