Commits

yanchuan sim committed eacd7fd

Added k/m/b suffixes for numerical/monetary values; stopword filtering now uses the Python-style "do first, worry later" approach (try/except).

  • Participants
  • Parent commits c138277

Comments (0)

Files changed (1)

File ycutils/tokenize.py

 STOPWORDS.add('i')
 
 __RE_WHITESPACES__ = re.compile(ur'\t\n\r\f\v', re.U)
-__RE_NUMBERS__ = re.compile(ur'[\+\-\p{Pd}\p{Pc}]?([\d,]*\.?[\d,]+)(th|st|nd|rd)?$', re.U | re.I)
-__RE_MONEY__ = re.compile(ur'\p{Sc}[\d\.,]+', re.U | re.I)
+__RE_NUMBERS__ = re.compile(ur'[\+\-\p{Pd}\p{Pc}]?([\d,]*\.?[\d,]+)(th|st|nd|rd|k|m|b)?$', re.U | re.I)
+__RE_MONEY__ = re.compile(ur'[\+-]?\p{Sc}[\d\.,kmb]+', re.U | re.I)
 __RE_TIME__ = re.compile(ur'((([0]?[1-9]|1[0-2])([:][0-5][0-9](:[0-5][0-9])?( )?)?(am|pm|a\.m|p\.m))|(([0]?[0-9]|1[0-9]|2[0-3]):[0-5][0-9](:[0-5][0-9])?)|(([0]?[1-9]|1[0-2]|2[0-3]):?[0-5][0-9]))$', re.I | re.U)
 __RE_PHONE_NUMBER__ = re.compile(ur'(\(?[2-9]\d\d\)?)?[-\. ]\d{3,4}[-\. ]\d{4}$', re.U)
 __RE_URL__ = re.compile(ur'((http|https|ftp)\://)?[a-z0-9\-\.]+\.[a-z]{2,3}(:[a-z0-9]*)?/?([a-z0-9\-\._\?\,\'/\\\+\&\%\$#\=~])*$', re.U | re.I) # with http/https/ftp
 
     if ' ' in tok:
       for tok2 in tok.split():
-        toks2, tags2 = words(tok2, strip_unicode=False, normalize=normalize, tag_list=tag_list, filter_stopwords=False, not_punctuations=not_punctuations, return_tags=True)
+        toks2, tags2 = words(tok2, strip_unicode=False, normalize=normalize, tag_list=tag_list, filter_stopwords=filter_stopwords, not_punctuations=not_punctuations, return_tags=True)
         new_tokens_tags.extend(zip(toks2, tags2))
       #end for
     #end if
   #end if
 
   if filter_stopwords:
-    if not isinstance(filter_stopwords, list): filter_stopwords = STOPWORDS
-    new_tokens_tags = filter(lambda (tok, tag): tok not in filter_stopwords, new_tokens_tags)
+    try: new_tokens_tags = filter(lambda (tok, tag): tok not in filter_stopwords, new_tokens_tags)
+    except TypeError: new_tokens_tags = filter(lambda (tok, tag): tok not in STOPWORDS, new_tokens_tags)
   #end if