david_walker avatar david_walker committed bd85e6c

Update token.cend when merging tokens in "years old"-type expressions.
For the "nn years old" rule, look for any direct ancestor with hspec or hspechc instead of only the grandparent.

Comments (0)

Files changed (1)

 
     def apply(self, ea):
         changed = False
-        for i, token in enumerate(ea.tokens):
+        token_idx = 0
+        while token_idx < len(ea.tokens):
+            token = ea.tokens[token_idx]
+            token_idx += 1
             if token.non_printing or not '.' in token.str:
                 continue
             # Token has a dot in it somewhere. Leave it alone if it's
                         logging.debug('DotSplitRule changing {} to {}'.format(
                                 original, token))
                         changed = True
-                elif self._transform_abbrev(ea.tokens, i, abbrev, abbrev_len):
+                elif self._transform_abbrev(ea.tokens, token_idx - 1, abbrev, abbrev_len):
                     # token started with an abbreviation and was split
                     changed = True
             else:
-                changed = split_token_at_delim(self, ea.tokens, i, u'.', True)
+                changed = split_token_at_delim(self, ea.tokens, token_idx - 1, u'.', True)
         return changed
 
 
         # |   14 | 47-year   | 47-year    |
         # |   15 | 1200.     | 1200 .     |
         changed = False
-        for token in ea.tokens:
+        token_idx = 0
+        # because the array of tokens can be lengthened or shortened
+        # during the loop, it is unsafe to use an expression like
+        # for token in ea.tokens. Use an index instead.
+        while token_idx < len(ea.tokens):
+            token = ea.tokens[token_idx]
+            # increment the index right away so that doing a continue is
+            # safe.
+            token_idx += 1
             # skip non-printing, URL, and short tokens
             if token.non_printing or len(token.str) < 2 or token.is_URL:
                 continue
 
         # non-ISO currency abbreviations
         TokenSearchByRegexp(ur'(.+)/=', ur'\1 UGX'),
-        TokenSearchByRegexp(ur'(?i)ksh(?:s|)(?:\.|)([0-9,.]+|)', ur'KES \1'),
+        TokenSearchByRegexp(ur'(?i)ksh(?:s?)([0-9,.]+)', ur'KES \1'),
+        TokenSearchByRegexp(ur'(?i)ksh(?:s?) \. ([0-9,.]+)', ur'KES \1'),
+        TokenSearchByRegexp(ur'(?i)ksh(s?\.?)$', ur'KES'),
+        TokenSearchByRegexp(ur'(?i)Rp \. ([0-9,.]+)', ur'IDR \1'),
         TokenSearchByRegexp(ur'[Pp]hp', ur'PHP'),
         TokenSearchByRegexp(ur'P([0-9,.]+)', ur'\1 PHP'),
         TokenSearchByRegexp(ur'(?i)LE ([0-9]*)', ur'SLL \1'),
 
     def apply(self, ea):
         changed = False
-        for i, token in enumerate(ea.tokens):
+        token_idx = 0
+        while token_idx < len(ea.tokens):
+            token = ea.tokens[token_idx]
             if token.non_printing:
+                token_idx += 1
                 continue
             if not DelimitThousandsRule._splittable_number_re.match(token.str):
+                token_idx += 1
                 continue
             # is preceding or following token recognized as a currency
             # symbol, ISO currency abbreviation, or term?
-            if (i > 0 and ea.tokens[i - 1].is_currency_term or
-                i < len(ea.tokens) - 1 and  ea.tokens[i + 1].is_currency_term):
+            if (token_idx > 0 and ea.tokens[token_idx - 1].is_currency_term or
+                token_idx < len(ea.tokens) - 1 and
+                ea.tokens[token_idx + 1].is_currency_term):
                 if self._separate_thousands(token):
                     changed = True
+            token_idx += 1
+
         return changed
 
 
                     logging.debug(
                         u'CurrencyOrderRule swapping {} and {}'.format(
                             token.str, right.str))
-                    token.str, right.str = right.str, token.str
+                    ea.tokens[i], ea.tokens[i+1] = ea.tokens[i+1], ea.tokens[i]
+                    #token.str, right.str = right.str, token.str
                     changed = True
                     break
         return changed
                 if ea.tokens[years_idx + 1].str == u'old':
                     logging.debug('_change_nn_dash_years changing "nn-year(s) old" to "nn-year-old"')
                     ea.tokens[years_idx].str = ea.tokens[years_idx].str[:-1] + u'-old'
+                    ea.tokens[years_idx].cend = ea.tokens[years_idx + 1].cend
                     del ea.tokens[years_idx + 1]
                     changed = True
                 elif ea.tokens[years_idx].str.endswith(u'-years-old'):
         try:
             years_node = sent.parse.node_from_token(ea.tokens[years_idx])
             if years_node.parent().name == 'noun_n_cmpnd':
-                logging.debug('_change_years_dash_old changing "nn years-old" to "nn-year-old"')
+                original = str(ea.tokens[years_idx - 1:years_idx + 1])
                 ea.tokens[years_idx - 1].str += u'-year-old'
+                ea.tokens[years_idx - 1].cend = ea.tokens[years_idx].cend
                 del ea.tokens[years_idx]
+                logging.debug('_change_years_dash_old changed %s to %s',
+                              original,
+                              ea.tokens[years_idx - 1])
                 changed = True
         except Exception as e:
             logging.debug('_change_years_dash_old caught ' + str(e))
             years_node = sent.parse.node_from_token(ea.tokens[years_idx])
             if (years_node.parent().name == 'plur_noun_orule' and
                 years_node.parent(3).children[1].name == 'npadv'):
+                logging.debug('_change_years_old changing "nn years old" to "nn-year-old"')
                 ea.tokens[years_idx - 1].str += u'-year-old'
+                ea.tokens[years_idx - 1].cend = ea.tokens[years_idx + 2].cend
                 del ea.tokens[years_idx:years_idx + 2]
                 changed = True
         except Exception as e:
         changed = False
         try:
             years_node = sent.parse.node_from_token(ea.tokens[years_idx])
-            if years_node.parent(3).name == 'hspechc':
+            if (years_node.has_parent('hspec') or
+                years_node.has_parent('hspechc')):
                 logging.debug('changing "nn year old" to "nn-year-old"')
                 ea.tokens[years_idx].str += u'-year-old'
+                ea.tokens[years_idx].cend = ea.tokens[years_idx + 2].cend
                 del ea.tokens[years_idx + 1:years_idx + 3]
                 changed = True
         except Exception as e:
-            logging.debug('caught ' + str(e))
+            logging.debug('_change_nn_year_old caught ' + str(e))
         return changed
 
     def apply(self, ea):
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.