Commits

david_walker committed e3dbab2

fix bugs in punct and alphanum splits

  • Parent commits c497742

Files changed (4)

 
 import re
 import pycountry
+import unicodedata
 from abc import ABCMeta, abstractmethod
 from collections import namedtuple
 import logging
     def __init__(self, str):
         """Initialize from text. """
         # Note we use the setter here which initializes the cache.
-        self.str = str
+        self.str = unicode(str)
 
     def __repr__(self):
         """Return a string representation of this object containing all
     def is_punc(self):
         """Return True if this token is a punctuation character.
         """
-        return self._str in u',.!?;:%*'
+        return len(self._str) == 1 and unicodedata.category(
+            self._str).startswith(u'P')
 
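The new check covers every Unicode punctuation class rather than the old hard-coded ASCII set. A minimal sketch of what it now accepts (illustrative only; category codes per the standard unicodedata module, Python 2):

    import unicodedata

    # every punctuation category code starts with 'P':
    # Po (other), Pd (dash), Pi/Pf (initial/final quotes), ...
    for ch in u',.!?\u00ab':
        print ch, unicodedata.category(ch)   # Po Po Po Po Pi
    # letters and digits fail the test: u'a' is 'Ll', u'5' is 'Nd'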
     @property
     def is_quote(self):
 class EditAssistant(object):
     def __init__(self, infile):
         """Process the input file and generate an output string."""
-        eof_token = Token('')
+        eof_token = Token(u'')
         eof_token.eof = True
         self._tokens = [Token(infile.read()), eof_token]
         self._process_tokens(infile)
                     winner = transform
 
             self._tokens = winner.apply()
-            # # Disable any transform which conflicts with another transform
-            # # of higher priority.
-            # for i, transform in enumerate(transforms):
-            #     j = i + 1
-            #     while j < len(transforms):
-            #         relation = transform.get_relation(transforms[j])
-            #         if relation == Relation.dominates:
-            #             transforms[j].enabled = False
-            #         elif relation == Relation.dominated_by:
-            #             transform.enabled = False
-            #         j += 1
-
-            # # Now apply all enabled transforms
-            # logging.debug('---applying transforms')
-            # for transform in transforms:
-            #     if transform.enabled:
-            #         self._tokens = transform.apply()
 
 
 def parse_commandline():
         # the test option supersedes other input modes
         if not isinstance(args.test, basestring):
             args.test = u' '.join(args.test)
-        outfile.write(EditAssistant(StringIO.StringIO(args.test)).edited_text)
+        outfile.write(EditAssistant(
+            StringIO.StringIO(unicode(args.test))).edited_text)
         outfile.write('\n')
     elif args.clipboard:
         edit_assistant = EditAssistant(StringIO.StringIO(get_clipboard_text()))
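For reference, the test path above boils down to this invocation (a minimal sketch; EditAssistant accepts any file-like object, and the input text here is illustrative):

    import StringIO
    assistant = EditAssistant(StringIO.StringIO(u'php11,000.34'))
    print assistant.edited_text

Coercing args.test with unicode() keeps the whole token pipeline in unicode, matching the unicode(str) coercion added to Token.__init__.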
 import logging
 import inspect
 import pycountry
+import unicodedata
 from base import AttrChange, Rule
 import transforms as tr
 
                                        split_re=ur'\s+')]
 
         # if transforms:
-        #     logging.debug('WhitespaceSplitRule returning %d', len(transforms))
+        #     logging.debug('WhitespaceSplitRule returning %d',
+        #                   len(transforms))
         return transforms
 
 
     Avoid splitting numeric punctuation, e.g., 11,000.34 should not be
     split at the comma or the decimal.
     """
-    punct_re = re.compile(ur'(["\'!@#$%^&*()_=+[\]{}|\:;/?,<>-])', re.U)
 
     # this is the same as Token.formatted_decimal_number_re except that
     # it is not bookended by ^ and $
         """Return an array of transform objects."""
         self.tokens = tokens
         transforms = []
-        # extract just tokens that have splittable punctuation.
-        splittables = filter(PunctSplitRule.is_splittable, tokens)
-        if splittables:
-            transforms.append(
-                tr.RegexSplitTransform(self, splittables,
-                                  split_re=PunctSplitRule.punct_re))
-        # if transforms:
-        #     logging.debug("PunctSplitRule returning %d", len(transforms))
+        for token in tokens:
+            if len(token.str) < 2:
+                continue
+            # get a list of match objects for embedded decimal numbers
+            number_mos = list(
+                PunctSplitRule.embedded_decimal_number_re.finditer(token.str))
+            for i, char in enumerate(token.str):
+                if unicodedata.category(char).startswith(u'P'):
+                    # found punctuation character. does it lie within
+                    # any span of embedded decimal numbers?
+                    skip = False
+                    for mo in number_mos:
+                        if mo.start() < i < mo.end():
+                            skip = True
+                            break
+                    if skip:
+                        continue
+
+                    # found punctuation character, and it is not
+                    # embedded within a number as a thousands separator
+                    # or a decimal point. Create a transform to split
+                    # the token at this point.
+                    logging.debug(u"PunctSplitRule '%s' at %d", token, i)
+                    transforms.append(
+                        tr.IndexSplitTransform(self,
+                                               [token],
+                                               index=i,
+                                               three_way=True))
+                    break
         return transforms
 
-    @staticmethod
-    def is_splittable(token):
-        """Return True if token should be split into two or more tokens."""
-        if len(token.str) <= 1 or token.is_formatted_decimal_number:
-            return False
-        punct_mo = PunctSplitRule.punct_re.search(token.str)
-        if not punct_mo:
-            return False
-        # A punctuation character is in the token. Catch this case:
-        # 'php11,000', where token is not a formatted decimal number but
-        # has one embedded in it.
-        number_mo = PunctSplitRule.embedded_decimal_number_re.search(token.str)
-        if not number_mo:
-            return True
-        if number_mo.start() < punct_mo.start() < number_mo.end():
-            return False
-        return True
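The replacement scan above can be exercised in isolation. A sketch of the same check (the real embedded_decimal_number_re mirrors Token.formatted_decimal_number_re without the ^/$ anchors; the pattern below is an approximation for illustration):

    import re
    import unicodedata

    embedded_decimal_number_re = re.compile(ur'\d{1,3}(,\d{3})*(\.\d+)?', re.U)

    def first_splittable_punct(s):
        """Return the index of the first punctuation character in s that
        is not embedded in a decimal number, else None."""
        spans = [mo.span() for mo in embedded_decimal_number_re.finditer(s)]
        for i, char in enumerate(s):
            if unicodedata.category(char).startswith(u'P'):
                if not any(start < i < end for start, end in spans):
                    return i
        return None

    assert first_splittable_punct(u'11,000.34') is None  # number is protected
    assert first_splittable_punct(u'php11,000.') == 9    # trailing period splits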
 
 class AlphaNumSplitRule(Rule):
     """Split alphanumeric sequences.
         # | ksh.1000  | ksh. 1000  |
 
         for token in tokens:
-            # skip non-printing tokens
-            if not token.str:
+            # skip non-printing and short tokens
+            if len(token.str) < 2:
                 continue
             # if it starts with an alpha char, split at first digit.
             # if it starts with a digit, split at first alpha
             elif token.str[0].isdigit():
                 mo = re.search(ur'[a-zA-Z]', token.str)
             if mo:
+                logging.debug(u"AlphaNumSplitRule '%s' at %d",
+                              token, mo.start())
                 transform = tr.IndexSplitTransform(self, [token],
-                                                   index=mo.start())
+                                                   index=mo.start(),
+                                                   three_way=False)
                 transforms.append(transform)
         return transforms
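The split-point search amounts to a two-branch regex lookup. A sketch (the digit-first branch matches the code above; the alpha-first branch is assumed to search for the first digit symmetrically):

    import re

    def alnum_split_index(s):
        """Return the index dividing a leading alpha run from a digit
        run, or vice versa; None if the token is not mixed."""
        mo = None
        if s[0].isalpha():
            mo = re.search(ur'\d', s)
        elif s[0].isdigit():
            mo = re.search(ur'[a-zA-Z]', s)
        return mo.start() if mo else None

    assert alnum_split_index(u'ksh1000') == 3
    assert alnum_split_index(u'1000ksh') == 4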
 
             if not token.is_ISO_currency:
                 continue
             # token is a 3-letter ISO currency abbreviation. If the next
-            # token is a number, swap them.
-            right = get_right_neighbor(tokens, i)
+            # token is a number, swap them, UNLESS the previous token is
+            # also a number.
+            left, right = get_neighbors(tokens, i)
             if right and right.is_formatted_decimal_number:
-                transforms = [tr.SwapTransform(self, [token, right])]
-                break
+                if not left or not left.is_formatted_decimal_number:
+                    transforms = [tr.SwapTransform(self, [token, right])]
+                    break
         # if transforms:
         #     logging.debug('CurrencyOrderRule returning %d', len(transforms))
         return transforms
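The extra check prevents a second swap in sequences like u'100 usd 200', where the currency code already sits between two amounts. A compact statement of the decision (a sketch; get_neighbors is assumed to return a (left, right) pair with None at either boundary, per its use above):

    def should_swap(left, token, right):
        # swap a 3-letter ISO currency code with the number that
        # follows it, unless a number also precedes it (then the code
        # already separates two amounts and swapping is ambiguous)
        return (right is not None and
                right.is_formatted_decimal_number and
                (left is None or not left.is_formatted_decimal_number))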

File transforms.py

 
 
 class IndexSplitTransform(Transform):
-    """Split tokens at designated character index."""
+    """Split tokens at designated character index.
+
+    Keyword Arguments:
+    `index` - the character position at which to split the string
+    `three_way` - if True, split the string into three parts instead of two
+
+    If `index` is 0, `three_way` is ignored and a special case of the
+    two-way split is performed. The string is split into two tokens, the
+    first composed of the token's initial character, and the second
+    being the remainder of the string.
+
+    If `three_way` is not True and `index` is non-zero, the token is
+    split into two parts, where the first is all the characters
+    preceding `index` and the second is all the characters from `index`
+    to the end of the string.
+
+    Otherwise, if `three_way` is True, the token will be split into
+    three parts: the characters before `index`, the single character at
+    `index`, and the characters after `index`.
+    """
 
     def __init__(self, rule, token, **kwargs):
         Transform.__init__(self, rule, token, **kwargs)
         self._index = kwargs.get('index')
-        assert(self._index > 0)
+        if not self._index:
+            self._three_way = False
+        else:
+            self._three_way = kwargs.get('three_way', False)
 
     def apply(self):
         logging.debug('>IndexSplitTransform %s', self.tokens_to_transform)
         for token_to_transform in self.tokens_to_transform:
             # find the index within the full list of tokens of the
-            # token we are to transform
+            # token to be transformed
             transform_token_index = self.rule.tokens.index(token_to_transform)
-            # split the token string at the supplied character index
-            left = token_to_transform.str[:self._index]
-            right = token_to_transform.str[self._index:]
-            # insert a new token for the left part of the split
-            self.rule.tokens.insert(transform_token_index, Token(left))
-            token_to_transform.str = right
+            if (self._three_way and
+                    self._index < len(token_to_transform.str) - 1):
+                # three-way split: the characters before the index, the
+                # single character at the index, and the characters
+                # after it.
+                left = token_to_transform.str[:self._index]
+                middle = token_to_transform.str[self._index]
+                right = token_to_transform.str[self._index + 1:]
+
+                self.rule.tokens.insert(transform_token_index, Token(left))
+                token_to_transform.str = middle
+                self.rule.tokens.insert(transform_token_index + 2,
+                                        Token(right))
+            else:
+                # split the token string at the supplied character index
+                index = self._index
+                if index == 0 and len(token_to_transform.str) > 1:
+                    # special case: split off the initial character
+                    index += 1
+                left = token_to_transform.str[:index]
+                right = token_to_transform.str[index:]
+                # insert a new token for the left part of the split
+                self.rule.tokens.insert(transform_token_index, Token(left))
+                token_to_transform.str = right
         logging.debug('<IndexSplitTransform %s', self.rule.tokens)
         return self.rule.tokens
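The split arithmetic implements the three cases in the docstring. A standalone sketch (string-level only; Rule and Token wiring elided, inputs illustrative):

    def split_at(s, index, three_way):
        if index == 0 and len(s) > 1:
            # special case: split off the initial character
            return [s[:1], s[1:]]
        if three_way and index < len(s) - 1:
            return [s[:index], s[index], s[index + 1:]]
        return [s[:index], s[index:]]

    assert split_at(u'foo,bar', 3, True) == [u'foo', u',', u'bar']
    assert split_at(u'foo,bar', 3, False) == [u'foo', u',bar']
    assert split_at(u'!bar', 0, True) == [u'!', u'bar']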