1. wooparadog
  2. 42qu-data

Commits

wooparadog  committed ba1bc9e

f

  • Participants
  • Parent commits fd886d5
  • Branches default

Comments (0)

Files changed (3)

File tfidf/classification.py

View file
  • Ignore whitespace
 from yajl import loads
 from tfidf.train.topic_bayes import TAG2ID, WORD2ID 
 from tfidf.train.convert2array import DB_Kyoto
+from zkit.sp_txt import sp_txt
 
 import sys;
-reload(sys);
-sys.setdefaultencoding('utf-8')
+reload(sys); sys.setdefaultencoding('utf-8')
 
 ID2TAG = TAG2ID.id2word()
 
                 if rank_t<6:
                     break
 
-def sp_txt(txt):
-    for i in range(len(txt)-1):
-        yield txt[i:i+2]
 
 if __name__ == '__main__':
     txt = '''

File tfidf/find_parent_tag.py

View file
  • Ignore whitespace
 from idf import idf_zhihu
 from mmseg import seg_txt
 from yajl import loads
+from zkit.sp_txt import sp_txt
 
 class ParentTagger(object):
     def __init__(self):
 
         self.id_to_word = TAG2ID.id2word()
 
-    def sp_txt(self, txt):
-        txt = unicode(txt)
-        for i in range(len(txt)-1):
-            yield txt[i:i+2]
-
     def get_parent_tag(self, tag):
         set_list = []
 
-        for i in self.sp_txt(tag):
+        for i in sp_txt(tag):
             if i in self.word_to_id:
                 set_list.append(i)
 
     
 if __name__ == '__main__':
     finder = ParentTagger()
-    print finder.get_parent_tag(u'用户体验设计')
+    print ','.join(finder.get_parent_tag(u'用户体验设计'))

File zkit/sp_txt.py

View file
  • Ignore whitespace
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+
+def sp_txt(txt):  # generator: yields txt itself, or its overlapping 2-character slices
+    if str(txt).replace(" ",'').isalnum():  # alphanumeric once spaces are removed: yield it whole
+        yield txt
+    else:
+        txt = txt.decode('utf-8')  # NOTE(review): presumably expects a UTF-8 byte string; confirm this is safe when callers pass unicode
+        for i in range(len(txt)-1):  # len(txt)-1 windows, so text shorter than 2 characters yields nothing
+            yield txt[i:i+2]
+
+
+if __name__ == '__main__':  # manual demo: print whatever sp_txt yields for two sample strings
+    for i in sp_txt('这是什么东西'):
+        print i
+
+    for i in sp_txt('a quick brown fox jumps over the lazy dog'):
+        print i