Commits

david_walker committed 6d84598

improve handling of punctuation characters


Files changed (1)

 
 def split_token_at_delim(caller, tokens, transform_token_index, delim,
                          keep_delim):
+    """
+    Split the token at tokens[transform_token_index] into multiple
+    tokens delimited by `delim`.
+
+    Arguments:
+    caller -- name of calling function to print in debug logging
+
+    tokens -- list of Token objects; new Tokens will be inserted if a
+    delimiter character is found.
+
+    transform_token_index -- 0-based index into `tokens`
+
+    delim -- delimiter character
+
+    keep_delim -- if set, transform runs of delimiter characters into
+    Tokens, otherwise discard them when splitting.
+
+    If the token to split consists of nothing but delimiter characters,
+    it is left unchanged regardless of `keep_delim`.
+
+    Returns True if `tokens` was modified, False otherwise.
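+
+    Example (illustrative): splitting a Token whose str is u'a--b' at
+    delim u'-' with keep_delim set yields three tokens, u'a', u'--',
+    and u'b'; with keep_delim unset it yields u'a' and u'b'.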
+    """
     token_to_transform = tokens[transform_token_index]
     split_tokens = []
+    # accumulate non-delimiter characters in split_str
     split_str = u''
+    # if keep_delim is set, then accumulate runs of delimiter characters
+    # in delim_token.
+    delim_token = None
     for i, c in enumerate(token_to_transform.str):
         if c == delim:
             # if delimiter has just occurred after one or more
-            # non-delimiter characters, add those characters as
-            # a new token.
+            # non-delimiter characters, add those characters as a new
+            # token.
             if split_str:
                 cbegin = token_to_transform.cbegin + i - len(split_str)
                 cend = cbegin + len(split_str)
                 split_tokens.append(Token(split_str, cbegin, cend))
                 split_str = u''
-            # if we're keeping delimiter characters, make a new
-            # token for it.
+            # if we're keeping delimiter characters, accumulate this one
             if keep_delim:
-                cbegin = token_to_transform.cbegin + i
-                split_tokens.append(Token(c, cbegin, cbegin + 1))
+                if not delim_token:
+                    cbegin = token_to_transform.cbegin + i
+                    delim_token = Token(delim, cbegin, cbegin + 1)
+                else:
+                    delim_token.str = delim_token.str + delim
+                    delim_token.cend += 1
         else:
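+            # a run of delimiter characters just ended; emit its Token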
+            if delim_token:
+                split_tokens.append(delim_token)
+                delim_token = None
             split_str += c
+    # it shouldn't be possible to have both a split_str and a
+    # delim_token to append to the split_tokens list because whenever
+    # one is encountered the other is emptied.
+    assert not (split_str and delim_token)
     if split_str:
         cbegin = token_to_transform.cbegin + i + 1 - len(split_str)
         cend = cbegin + len(split_str)
         split_tokens.append(Token(split_str, cbegin, cend))
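+    # flush a trailing run of delimiter characters, if any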
+    if delim_token:
+        split_tokens.append(delim_token)
     if len(split_tokens) > 1:
         # insert all but the last of split_tokens before the token at
         # transform_token_index, then make the token at
         # transform_token_index look like the last token in split_tokens.
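+        # e.g. if split_tokens is [A, B, C], insert A and B before the
+        # original token, then overwrite the original token with C.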
         for i, token_to_insert in enumerate(split_tokens[:-1]):
-            logging.debug(u'{} inserting {} before {}'.format(
+            logging.debug(u'stad {} inserting {} before {}'.format(
                     caller, token_to_insert,
                     tokens[transform_token_index + i]))
             tokens.insert(transform_token_index + i, token_to_insert)
         transform_token_index += i + 1
-        logging.debug(u'{} changing {} to {}'.format(
+        logging.debug(u'stad {} changing {} to {}'.format(
                 caller, tokens[transform_token_index],
                 split_tokens[-1]))
         tokens[transform_token_index].str = split_tokens[-1].str
         tokens[transform_token_index].cbegin = split_tokens[-1].cbegin
         tokens[transform_token_index].cend = split_tokens[-1].cend
-
+        return True
+    return False
 
 def split_token_at_index(caller, tokens, transform_token_index, split_index,
                          three_way):
                 elif self._transform_abbrev(ea.tokens, i, abbrev, abbrev_len):
                     # token starting with an abbreviation was split
                     changed = True
-            elif len(token.str) > 1:
-                # length check so we don't try to split '.'
-                split_token_at_delim(self, ea.tokens, i, u'.', True)
-                changed = True
+            else:
+                if split_token_at_delim(self, ea.tokens, i, u'.', True):
+                    changed = True
         return changed
 
 
 
     Avoid splitting numeric punctuation, e.g., 11,000.34 should not be
     split at the comma or the decimal. Also avoid splitting at
-    apostrophes in contractions, and do not split at single hyphens.
+    apostrophes in contractions.
+
+    Do not split at single hyphens, because the parser fails at
+    expressions like "47" "year" "-" "old" but succeeds when given a
+    token sequence like "47" "year-old" (the tagger labels "year-old" as
+    JJ).
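+
+    Examples (illustrative): u'11,000.34', u"don't", and u'47-year-old'
+    are left intact, while the comma in u'foo,bar' and the hyphen run
+    in u'well--known' are split into separate tokens.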
     """
 
     phase = INITIAL_PHASE
     # this is the same as Token.delimited_decimal_re except that
     # it is not bookended by ^ and $
     embedded_decimal_number_re = re.compile(
-        ur"""[0-9]{1,3}   # one to three leading digits
+        ur"""(-|\+)?      # negative or positive sign
+             [0-9]{1,3}   # one to three leading digits
              (,[0-9]{3})* # any number of comma-delimited groups of 3 digits
              (\.[0-9]+)?  # optional decimal followed by one or more digits
           """,
         re.U | re.X)
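+    # Illustrative matches: u'11,000.34', u'-1,234.5', u'+0.5'; apply()
+    # uses finditer() to locate these spans inside a token.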
 
-    _contraction_endings = [u't', u's', u'd']
+    _contraction_endings = [u't', u's', u'd', u'T', u'S', u'D']
 
     def __init__(self):
         """Set rule priority and name. """
         Rule.__init__(self, 60,
-                      "Separate punctuation (other than periods)"
-                      " into separate tokens.")
+                      "Separate punctuation (other than periods, apostrophes "
+                      "not being used as quote marks, and single hypens) "
+                      "into separate tokens.")
 
     def apply(self, ea):
         changed = False
                 mo for mo in
                 PunctSplitRule.embedded_decimal_number_re.finditer(token.str)]
             for i, char in enumerate(token.str):
-                if (unicodedata.category(char).startswith(u'P') and
-                    char != u'.'):
-                    # found punctuation character. does it lie within
-                    # any span of embedded decimal numbers?
-                    skip = False
-                    for mo in number_mos:
-                        if mo.start() < i < mo.end():
-                            skip = True
-                            break
-                    if skip:
+                # DotSplitRule handles periods
+                if char == u'.':
+                    continue
+                # ignore non-punctuation characters
+                if not unicodedata.category(char).startswith(u'P'):
+                    continue
+                # found punctuation character. does it lie within any
+                # span of embedded decimal numbers?
+                skip = False
+                for mo in number_mos:
+                    if mo.start() <= i < mo.end():
+                        skip = True
+                        break
+                if skip:
+                    continue
+
+                # Found punctuation character, and it is not embedded
+                # within a number as a thousands separator or a decimal
+                # point. Check to see if it is an apostrophe in a
+                # contraction.
+                if (char == u"'" and token.str[i + 1:] in
+                    PunctSplitRule._contraction_endings):
-                        continue
+                    continue
 
-                    # Found punctuation character, and it is not
-                    # embedded within a number as a thousands separator
-                    # or a decimal point. Check to see if it is an
-                    # apostrophe in a contraction.
-                    if (char == u"'" and token.str[i + 1:] in
-                        PunctSplitRule._contraction_endings):
-                            continue
+                # Don't split single hyphens, but do split runs of
+                # hyphens
+                if char == u'-':
+                    # don't split trailing hyphen
+                    if i == len(token.str) - 1:
+                        continue
+                    # don't split single embedded hyphen
+                    if token.str[i + 1] != u'-':
+                        continue
+                    # we've found 2 or more hyphens in a row. split them
+                    # into their own token.
+                    if split_token_at_delim('PunctSplitRule', ea.tokens,
+                                            ea.tokens.index(token), u'-', True):
+                        changed = True
+                        break
+                    continue
 
-                    # Split the token at this point.
-                    logging.debug(
-                        u"PunctSplitRule splitting '{}' at {}".format(
-                            token.str, i))
-                    split_token_at_index(
-                        self, ea.tokens, ea.tokens.index(token), i, True)
-                    changed = True
-                    break
+                # Split the token at this point.
+                logging.debug(
+                    u"PunctSplitRule splitting '{}' at {}".format(
+                        token.str, i))
+                split_token_at_index(
+                    self, ea.tokens, ea.tokens.index(token), i, True)
+                changed = True
+                break
         return changed