Commits

david_walker committed 5190539 Merge

Merge the unicode logging fix for transforms.py from the dev branch
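
The fix logs each Token's unicode .str value (via the new token_strings() helper) instead of the Token objects themselves. A minimal sketch of the Python 2 failure mode this sidesteps; the Token class below is a hypothetical stand-in for the one imported from base.py, assuming a repr() that returns UTF-8 bytes:

# -*- coding: utf-8 -*-
# Hypothetical repro, not the project's real Token (that one lives in
# base.py). Assumes repr() returns non-ASCII UTF-8 bytes.
import logging

logging.basicConfig(level=logging.DEBUG)

class Token(object):
    def __init__(self, s):
        self.str = s

    def __repr__(self):
        return self.str.encode('utf-8')   # non-ASCII bytes

tokens = [Token(u'caf\xe9')]

# Unsafe under Python 2: %s on the list calls repr() on each Token,
# and mixing those bytes into the unicode format string forces an
# implicit ascii decode; the handler raises UnicodeDecodeError and
# the record is dropped.
#logging.debug(u'>Transform %s', tokens)

# Safe: a list of unicode strings formats cleanly. This is exactly
# what token_strings() extracts.
logging.debug(u'>Transform %s', [t.str for t in tokens])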


Files changed (1)

 from base import Token, Transform
 
 
+def token_strings(tokens):
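+    """Return each token's .str value so log formatting stays unicode."""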
+    return [t.str for t in tokens]
+
+
 class ParagraphTransform(Transform):
     """Break tokens containing a newline into three parts.
     """
         self._mo = kwargs['match_obj']
 
     def apply(self):
-        logging.debug(u'>ParagraphTransform %s', self.tokens_to_transform)
+        logging.debug(u'>ParagraphTransform %s',
+                      token_strings(self.tokens_to_transform))
         # Replace each token to transform with three new tokens: one
         # for the string before the newline, a PARA token, and one for
         # the string after the newline.
             right = Token(token_to_transform.str[self._mo.end():])
             # replace the token we're transforming
             self.rule.tokens[transform_token_index] = paragraph_token
-            # now insert a token to its left
-            self.rule.tokens.insert(transform_token_index, left)
-            # that insertion has made transform_token_index out of
-            # date; correct it.
-            transform_token_index += 1
-            # now insert the token to the right
-            self.rule.tokens.insert(transform_token_index + 1, right)
-        logging.debug(u'<ParagraphTransform %s', self.rule.tokens)
+            # insert a token to its left if it is nonempty
+            if left.str:
+                self.rule.tokens.insert(transform_token_index, left)
+                # that insertion has made transform_token_index out of
+                # date; correct it.
+                transform_token_index += 1
+            # now insert the token to the right, if it is nonempty
+            if right.str:
+                self.rule.tokens.insert(transform_token_index + 1, right)
+        logging.debug(u'<ParagraphTransform %s',
+                      token_strings(self.rule.tokens))
         return self.rule.tokens
 
 
                          len(tokens_to_transform))
 
     def apply(self):
-        logging.debug(u'>AbbrevTransform %s', self.tokens_to_transform)
+        logging.debug(u'>AbbrevTransform %s',
+                      token_strings(self.tokens_to_transform))
         for token_to_transform in self.tokens_to_transform:
             token_to_transform.abbrev = True
 
             transform_token_index = self.rule.tokens.index(token_to_transform)
             self.rule.tokens.insert(transform_token_index + 1,
                                     post_abbrev_token)
-        logging.debug(u'<AbbrevTransform %s', self.rule.tokens)
+        logging.debug(u'<AbbrevTransform %s', token_strings(self.rule.tokens))
         return self.rule.tokens
 
 
         self._split_re = kwargs.get('split_re')
 
     def apply(self):
-        logging.debug(u'>RegexSplitTransform %s', self.tokens_to_transform)
+        logging.debug(u'>RegexSplitTransform %s',
+                      token_strings(self.tokens_to_transform))
         for token_to_transform in self.tokens_to_transform:
             # find the index within the full list of tokens of the
             # token we are to transform
             split_tokens = map(Token, split_strings)
             # put it all back together
             self.rule.tokens = left_of_split + split_tokens + right_of_split
-        logging.debug(u'<RegexSplitTransform %s', self.rule.tokens)
+        logging.debug(u'<RegexSplitTransform %s',
+                      token_strings(self.rule.tokens))
         return self.rule.tokens
 
 
             self._three_way = kwargs.get('three_way', False)
 
     def apply(self):
-        logging.debug(u'>IndexSplitTransform %s', self.tokens_to_transform)
+        logging.debug(u'>IndexSplitTransform %s',
+                      token_strings(self.tokens_to_transform))
         for token_to_transform in self.tokens_to_transform:
             # find the index within the full list of tokens of the
             # token to be transformed
                 # insert a new token for the left part of the split
                 self.rule.tokens.insert(transform_token_index, Token(left))
                 token_to_transform.str = right
-        logging.debug(u'<IndexSplitTransform %s', self.rule.tokens)
+        logging.debug(u'<IndexSplitTransform %s',
+                      token_strings(self.rule.tokens))
         return self.rule.tokens
 
 
         self._attr_changes = kwargs['attr_changes']
 
     def apply(self):
-        logging.debug(u'>SetAttrTransform %s', self.tokens_to_transform)
+        logging.debug(u'>SetAttrTransform %s',
+                      token_strings(self.tokens_to_transform))
         for token in self.tokens_to_transform:
             for change in self._attr_changes:
                 setattr(token, change.attr, change.value)
-        logging.debug(u'<SetAttrTransform %s', self.rule.tokens)
+        logging.debug(u'<SetAttrTransform %s', token_strings(self.rule.tokens))
         return self.rule.tokens
 
 
 
     def apply(self):
         logging.debug(u'>SeparateThousandsTransform %s',
-                      self.tokens_to_transform)
+                      token_strings(self.tokens_to_transform))
         for token in self.tokens_to_transform:
             new_str = re.sub(
                 ur'(\d)(\d{3})\b',
                SeparateThousandsTransform.digit_group_callback,
                token.str)
-        logging.debug(u'<SeparateThousandsTransform %s', self.rule.tokens)
+        logging.debug(u'<SeparateThousandsTransform %s',
+                      token_strings(self.rule.tokens))
         return self.rule.tokens
 
 
             Transform.__init__(self, rule, tokens, **kwargs)
 
     def apply(self):
-        logging.debug(u'>ISOCurrencyTransform %s', self.tokens_to_transform)
+        logging.debug(u'>ISOCurrencyTransform %s',
+                      token_strings(self.tokens_to_transform))
         assert(len(self.tokens_to_transform) == 1)
         token = self.tokens_to_transform[0]
         transform_token_index = self.rule.tokens.index(token)
         # this must be set to avoid infinite loop
         new_tokens[-2].ISO_currency_expanded = True
         self.rule.tokens = left_of_split + new_tokens + right_of_split
-        logging.debug(u'<ISOCurrencyTransform %s', self.rule.tokens)
+        logging.debug(u'<ISOCurrencyTransform %s',
+                      token_strings(self.rule.tokens))
         return self.rule.tokens
 
 
             Transform.__init__(self, rule, tokens, **kwargs)
 
     def apply(self):
-        logging.debug(u'>USCurrencyTransform %s', self.tokens_to_transform)
+        logging.debug(u'>USCurrencyTransform %s',
+                      token_strings(self.tokens_to_transform))
         assert(len(self.tokens_to_transform) == 1)
         token = self.tokens_to_transform[0]
         transform_token_index = self.rule.tokens.index(token)
         del self.rule.tokens[transform_token_index]
         amount = self.rule.tokens[transform_token_index - 1]
         amount.str = '$' + amount.str
-        logging.debug(u'<USCurrencyTransform %s', self.rule.tokens)
+        logging.debug(u'<USCurrencyTransform %s',
+                      token_strings(self.rule.tokens))
         return self.rule.tokens
 
 
             Transform.__init__(self, rule, tokens, **kwargs)
 
     def apply(self):
-        logging.debug(u'>SwapTransform %s', self.tokens_to_transform)
+        logging.debug(u'>SwapTransform %s',
+                      token_strings(self.tokens_to_transform))
         assert(len(self.tokens_to_transform) == 2)
         index1 = self.rule.tokens.index(self.tokens_to_transform[0])
         index2 = self.rule.tokens.index(self.tokens_to_transform[1])
         self.rule.tokens[index1], self.rule.tokens[index2] = \
             self.rule.tokens[index2], self.rule.tokens[index1]
-        logging.debug(u'<SwapTransform %s', self.rule.tokens)
+        logging.debug(u'<SwapTransform %s', token_strings(self.rule.tokens))
         return self.rule.tokens
 
 
         for token in self.tokens_to_transform:
             for re_pair in self.rule.regex_pairs:
                 token.str = re.sub(re_pair[0], re_pair[1], token.str)
-        logging.debug(u'<RegexTransform %s', self.rule.tokens)
+        logging.debug(u'<RegexTransform %s', token_strings(self.rule.tokens))
         return self.rule.tokens
 
 
         Transform.__init__(self, rule, tokens, **kwargs)
 
     def apply(self):
-        logging.debug(u'>ConcatenateTransform %s', self.tokens_to_transform)
+        logging.debug(u'>ConcatenateTransform %s',
+                      token_strings(self.tokens_to_transform))
         # first get the concatenated string
         new_str = u''
         first_token = True
             first_token = False
         # set the first token in the series to contain the new string
         self.tokens_to_transform[0].str = new_str
-        logging.debug(u'<ConcatenateTransform %s', self.rule.tokens)
+        logging.debug(u'<ConcatenateTransform %s',
+                      token_strings(self.rule.tokens))
         return self.rule.tokens