Commits

david_walker  committed 0fdb7ac

prepare for sentence delimiter token
prevent split of decimal numbers in AlphaNumSplitRule

  • Participants
  • Parent commits 100412e

Comments (0)

Files changed (4)

         self._abbrev_checked = False
         self._abbrev_match = None
         self._abbrev_match_len = 0
-        self.bos = None
-        self.eos = None
+        self.sentence_delim = None
         self.eof = None
 
         self._URL_checked = False
         """Return True if any of the attributes are set which indicate a
         non-printing token.
         """
-        return self.bos or self.eos or self.eof
+        return self.sentence_delim or self.eof
 
 
 class Transform():

File regex_process.py

 
 transforms = [
     (u'“|”', u'"'),
-    
+
     (u'raised animals', u'grown animals'),
 
     (u'dependants', u'dependents'),
-    
+
     (u'  +', u' '),
 
     (u' ,', u','), # foo , -> foo,
     (u'Pre-angkorean', u'pre-Angkorean'),
 
     (ur'\b1 infant-aged children', u'one infant child'),
-    
+
     (u'adult-aged', u'adult'),
 
     (ur'and etc\.*', u'etc.'),
     (u'requested for', u'requested'),
 
     (u'lake victoria', u'Lake Victoria'),
-    
+
     (ur'To make a living, (?P<name>(\w|\s)+) owns & operates a business venture in the [a-z]+ sector \w+ (?P<business>[^.]+\.)',
      ur'\g<name> owns and operates \g<business>'),
 
     #(ur'\s+\.([^ \t])', ur'. \1'),
 
     (ur'(?<!\.)\.\.(?!\.)', u'.'), # blah.. -> blah.
-    
+
     (ur'\b1st\b', u'first'),
     (ur'\b2nd\b', u'second'),
     (ur'\b3rd\b', u'third'),
         line = handle_template(line, change_list)
         description.append(line)
     return u''.join(description)
-
-
-
         # misspellings
         (u'dependants', u'dependents'),
         (ur'therefor\b', u'therefore'),
+        (ur'(?i)(micro) +finance', u'\1finance'),
 
         # proper nouns
         (u'congo town', u'Congo Town'),
         (ur'and etc\.*', u'etc.'),
         (u'infant-aged', u'infant'),
         (u'requesting for a', u'requesting a'),
+        (u'requested a loan for ([0-9]+)', ur'requested a loan of \1'),
+        (u'he is widowed', u'he is a widower'),
         (u'borrowed a loan', u'took out a loan'),
         (u'in a business of', u'in the business of'),
         (u'with (.+) children and (.+) of them go to school',
         (u'from the Word of God she received',
          u'from the Word of God she studies'),
         (u'raise & sell in future', u'raise and sell'),
+        (u'married with ([0-9]+) (child|children)',
+         ur'married and has \1 \2'),
+        (u'has a long experience', u'has a lot of experience'),
+        (u'is aiming to gain more profits', u'aims to make more money'),
+        (u'has a good experience in this field and a good reputation '
+         'and (s?he) is being well known in (his|her) area',
+         ur' has a lot of experience in this field, a good reputation, '
+         ur'and is well known in \2 area'),
 
         # Chiefly British
         (u'([iI]n) future', ur'\1 the future'),
         (u'tyres', u'tires'),
+        (u'neighbour', u'neighbor'),
+        (u'licencing', u'licensing'),
 
-        # non-ISO currency abbreviations
+        # currency abbreviations
         (u'/=', u' UGX '),
         (ur'(?i)ksh\.', u' KES '),
+        (ur'(?i)kshs', u' KES '),
+        (ur'[Pp]hp', 'PHP'),
 
         # incorrect punctuation
         (ur'e\.t\.c\.?', u'etc.'),
         (ur'\betc([^.])', ur'etc.\1'),
-        (ur'([0-9]+) year old (man|woman)', ur'\1-year-old \2'),
+        (ur'([0-9]+) year(?:s?) old (man|woman)', ur'\1-year-old \2'),
         (ur'(?<!\.)\.\.(?!\.)', u'.'),  # blah.. -> blah.
 
         # grammatical errors
         (ur'\b1 infant-aged children', u'one infant child'),
         (ur'\b1 years', u'one year'),
+        (ur'never missed any meeting\.', u'never missed any meetings.'),
 
         # Field partner template cleanup
         (ur'To make a living, (?P<name>(\w|\s)+) owns & operates a business'
 
         # Jargon
         (u'cycle loan', u'loan'),
+        (u'loan cycle', u'loan'),
+        (u'loan facility', u'loan'),
 
         # Numeric expressions
         (ur'\b1st\b', u'first'),
         # |    9 | 14th      | 14th       |
         # |   10 | 2nd       | 2nd        |
         # |   11 | 43rd      | 43rd       |
+        # |   12 | 1,200.    | 1,200 .    |
+        # |   13 | 1,500.00  | 1,500.00   |
 
         for token in tokens:
             # skip non-printing, URL, and short tokens
                 if token.is_alphanumeric_ordinal:
                     # cases 9-11
                     continue
-                mo = re.search(ur'[a-zA-Z]', token.str)
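+                # case 12: a comma-grouped number followed by a period
+                # that is not a decimal point. `$` accepts a period that
+                # ends the token; `[^0-9]` rejects a digit after the
+                # period, so case 13 does not match here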
+                mo = re.match(ur'[1-9][0-9]{,2}(?:,[0-9]{3})*\.(?:[^0-9]|$)',
+                              token.str)
+                if mo:
+                    # reposition at period
+                    mo = re.search(ur'\.', token.str)
+                else:
+                    # split at first alpha
+                    mo = re.search(ur'[a-zA-Z]', token.str)
             if mo:
                 logging.debug(u"AlphaNumSplitRule '{}' at {}".format(
                         token.str, mo.start()))
         # if transforms:
         #     logging.debug('ISOCurrencyRule returning %d', len(transforms))
         return transforms
+
+
+class SentenceDelimitRule(Rule):
+    """Rule intended to surround sentences with delimiter tokens.
+
+    NOTE(review): get_transforms() currently returns an empty list on
+    every path, so no delimiter tokens are inserted by this class yet.
+    """
+
+    def __init__(self):
+        # 130 and 1.0 are positional arguments whose meaning is defined by
+        # Rule.__init__ (not visible here); the string is the description.
+        Rule.__init__(
+            self, 130, 1.0,
+            "Surround every sentence with a sentence-delimiter token.")
+
+    def get_transforms(self, tokens):
+        """Return the transforms to apply to `tokens` (currently always []).
+
+        Stores `tokens` on `self.tokens`. If any token already has a truthy
+        `sentence_delim` attribute, `self.enabled` is set to False before
+        returning; otherwise `self.enabled` is left unchanged.
+
+        The rule is only intended to run once, but the insertion of the
+        delimiter tokens is not implemented here.
+        """
+        self.tokens = tokens
+
+        # a token that already carries sentence_delim presumably means the
+        # delimiters were inserted earlier, so switch this rule off.
+        for token in tokens:
+            if token.sentence_delim:
+                self.enabled = False
+                return []
+        return []
 
 
 Lending helptext http://tinyurl.com/3aekx8m
+
+---
+
+Isabella is 60 years old, married to Michael. She has been keeping poultry for ten years with a monthly income of KES 12,000.
+
+five peso's-worth