1. yanchuan sim
  2. yc-pyutils

Commits

yanchuan sim  committed 8e547f3

Slightly faster due to checking for changes before re-tagging

  • Participants
  • Parent commits 3228d5e
  • Branches master

Comments (0)

Files changed (1)

File ycutils/tokenize.py

View file
  • Ignore whitespace
 # __TRANS_UNICODE__ = dict([(ord(c), u' ') for c in u'!"#$%&()*+,./:;<=>?@[\\]^`{|}~'] + [(ord(u'-'), None)]) # \', _, - not in here
 # __TRANS_ASCII__ = string.maketrans('!"#$%&()*+,./:;<=>?@[\\]^`{|}~', ' ' * 29)
 
-__DEFAULT_NORMALIZE__ = ['case', 'phone', 'time', 'url', 'email', 'number', 'punct-del', 'hyphen-split', 'consecutive', 'clitics-del', 'neg-clitics-keep']
+__DEFAULT_NORMALIZE__ = ['case', 'phone', 'time', 'url', 'email', 'number', 'punct-del', 'hyphen-split', 'clitics-del', 'neg-clitics-keep']
 __DEFAULT_TAG_LIST__ = ['phone', 'time', 'url', 'email', 'number']
 
 TAG_EMPTY = 0
     if tag == TAG_EMPTY: continue
     elif tag == TAG_WORD:
       if 'hyphen-del' in normalize:
-        tok = __RE_HYPHENS__.sub('', tok)
-        tag = tag_tokens([tok], tag_list=tag_list)[0]
+        tok, n = __RE_HYPHENS__.subn('', tok)
+        if n: tag = tag_tokens([tok], tag_list=tag_list)[0]
+      #end if
 
       elif 'hyphen-split' in normalize: tok = __RE_HYPHENS__.sub(' ', tok)
 
       if 'punct-split' in normalize: tok = ' '.join(re_punct.split(tok)).strip()
-      elif 'punct-del' in normalize: tok = re_punct.sub(r' ', tok).strip()
+      elif 'punct-del' in normalize: tok = re_punct.sub(r' ', tok)
+      #end if
 
-      if 'clitics-del' in normalize: tok = __RE_CLITICS__.sub(r'\1', tok)
+      if 'clitics-del' in normalize: 
+        tok, n = __RE_CLITICS__.subn(r'\1', tok)
+        if n: tag = tag_tokens([tok], tag_list=tag_list)[0]
       elif 'clitics-split' in normalize: tok = ' '.join(__RE_CLITICS__.split(tok)).strip()
+        
 
-      if 'neg-clitics-del' in normalize: tok = __RE_NEGATION_CLITIC__.sub(r'\1n', tok)
+      if 'neg-clitics-del' in normalize: 
+        tok, n = __RE_NEGATION_CLITIC__.subn(r'\1n', tok)
+        if n: tag = tag_tokens([tok], tag_list=tag_list)[0]
       elif 'neg-clitics-split' in normalize: tok = ' '.join(__RE_NEGATION_CLITIC__.split(tok)).strip()
 
       if re_punct_all.match(tok): tag = TAG_PUNCT
     elif tag == TAG_NUM and 'number' in normalize: tok = '__NUM__'
     elif tag == TAG_URL and 'url' in normalize: tok = '__URL__'
 
-    if u' ' in tok:
+    if ' ' in tok:
       for tok2 in tok.split():
         toks2, tags2 = words(tok2, strip_unicode=False, normalize=normalize, tag_list=tag_list, filter_stopwords=False, not_punctuations=not_punctuations, return_tags=True)
         new_tokens_tags.extend(zip(toks2, tags2))
       #end for
     #end if
 
-    elif tok and tag: new_tokens_tags.append((tok, tag))
+    elif tok and tag: 
+      new_tokens_tags.append((tok, tag))
   #end for
 
   if 'consecutive' in normalize:
 
   :param sents: a list of sentences.
   :param strip_unicode, normalize, tag_list, filter_stopwords, not_punctuations, return_tags: see :meth:`words`.
-  :returns: a list of list of tokens. # todo
-  :rtype: list of list."""
+  :returns: a list of list of tokens or (list of tokens, list of tags)."""
 
-  text,
   if return_tags:
     return filter(lambda (sent, tag): sent and tag, map(lambda sent: words(sent, strip_unicode=strip_unicode, normalize=normalize, tag_list=tag_list, filter_stopwords=filter_stopwords, not_punctuations='', return_tags=False), sents))