david_walker avatar david_walker committed 6cf6c1b

force abbreviations to normal form

Comments (0)

Files changed (2)

         AbbrevInfo(ur'e\.g\.'),
         AbbrevInfo(ur'i\.e\.'),
         AbbrevInfo(ur'etc\.'),
-        AbbrevInfo(ur'mr\.', 'Mr.'),
-        AbbrevInfo(ur'mrs\.', 'Mrs.'),
-        AbbrevInfo(ur'ksh\.', 'KES'),
-        AbbrevInfo(ur'kes\.', 'KES'),
-        AbbrevInfo(ur'ltd\.', 'Ltd.'),
-        AbbrevInfo(ur's\.a\.l(\.)?', 's.a.l.'),
-        AbbrevInfo(ur'u\.s\.s\.r\.', 'U.S.S.R.')]
+        AbbrevInfo(ur'mr\.', u'Mr.'),
+        AbbrevInfo(ur'mrs\.', u'Mrs.'),
+        AbbrevInfo(ur'ksh\.', u'KES'),
+        AbbrevInfo(ur'kes\.', u'KES'),
+        AbbrevInfo(ur'ltd\.', u'Ltd.'),
+        AbbrevInfo(ur's\.a\.l(\.)?', u's.a.l.'),
+        AbbrevInfo(ur'u\.s\.s\.r\.', u'U.S.S.R.')]
 
     _currency_terms = [
         u'$',
 
         # Jargon
         (u'cycle loan', u'loan'),
+
+        (ur'\b1st\b', u'first'),
+        (ur'\b2nd\b', u'second'),
+        (ur'\b3rd\b', u'third'),
+        (ur'\b4th\b', u'fourth'),
+        (ur'\b5th\b', u'fifth'),
+        (ur'\b6th\b', u'sixth'),
+        (ur'\b7th\b', u'seventh'),
+        (ur'\b8th\b', u'eighth'),
+        (ur'\b9th\b', u'ninth'),
         ]
 
     def __init__(self):
         for token in tokens:
             if not '.' in token.str:
                 continue
-            # token has a dot in it somewhere.
-            # leave it alone if it's supposed to have embedded dots.
-            if token.is_abbrev or token.is_URL or token.has_digits:
+            # Token has a dot in it somewhere. Leave it alone if it's
+            # supposed to have embedded dots and is not an abbreviation.
+            if token.is_URL or token.has_digits:
                 continue
 
-            # Token is not marked as exactly matching an abbreviation,
-            # but there could be an abbreviation that matches the start
-            # of the token. In that case we should split token at the
-            # end of the matching abbrev.
+            # check if the token is or starts with an abbreviation
             abbrev_len, abbrev = token.abbrev_match_len
 
             if abbrev_len:
-                transforms.append(
-                    tr.AbbrevTransform(self, [token],
-                                       matched_abbrev=abbrev,
-                                       abbrev_match_len=abbrev_len))
+                if token.is_abbrev:
+                    # token is an abbreviation
+                    if abbrev.normal_form and token.str != abbrev.normal_form:
+                        # but it differs from the proper form of the abbrev
+                        attr_change = AttrChange('str', abbrev.normal_form)
+                        transform = tr.SetAttrTransform(
+                            self, [token], attr_changes=[attr_change])
+                        transforms.append(transform)
+                else:
+                    # token starts with an abbreviation and should be split
+                    transforms.append(
+                        tr.AbbrevTransform(self, [token],
+                                           matched_abbrev=abbrev,
+                                           abbrev_match_len=abbrev_len))
             elif len(token.str) > 1:
                 # length check so we don't try to split '.'
                 transform_tokens.append(token)
                 mo for mo in
                 PunctSplitRule.embedded_decimal_number_re.finditer(token.str)]
             for i, char in enumerate(token.str):
-                if unicodedata.category(char).startswith(u'P'):
+                if (unicodedata.category(char).startswith(u'P') and
+                    char != u'.'):
                     # found punctuation character. does it lie within
                     # any span of embedded decimal numbers?
                     skip = False
             elif token.str[0].isdigit():
                 mo = re.search(ur'[a-zA-Z]', token.str)
             if mo:
-                logging.debug("AlphaNumSplitRule '{}' at {}".format(token, mo.start()))
+                logging.debug("AlphaNumSplitRule '{}' at {}".format(
+                        token, mo.start()))
                 transform = tr.IndexSplitTransform(self, [token],
                                                    index=mo.start(),
                                                    three_way=False)
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search terms with spaces, e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use the ↑ and ↓ arrow keys to navigate, and press Return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.