Commits

wooparadog  committed fd886d5

f

  • Participants
  • Parent commits 8f0b0eb

Comments (0)

Files changed (4)

File marvin/similarity.py

     >>> sim_cosine({1:2,2:3,5:2},{1:1.9,2:2.1,5:2})
     0.986375822
     '''
-    all_keys = set(item_A_vector.keys())|set(item_B_vector.keys())
-    base = map(lambda vector_dict:math.sqrt(sum([x**2 for x in vector_dict.values()])),[item_A_vector,item_B_vector])
-    
-    h = sum([item_A_vector[key]*item_B_vector[key] for key in all_keys])
-    return h/float(multi(base))
+    if item_A_vector and item_B_vector:
+        base = map(lambda vector_dict:math.sqrt(sum([x**2 for x in vector_dict.values()])),[item_A_vector,item_B_vector])
+        
+        h = sum([item_A_vector[key]*item_B_vector[key] for key in item_A_vector.keys() if key in item_B_vector])
+        return h/float(multi(base))
 
 if __name__=='__main__':
     import doctest

File tfidf/classification.py

 from idf import idf_zhihu
 from mmseg import seg_txt
 from yajl import loads
-from tfidf.train.topic_bayes import TAG2ID, WORD2ID, BAYES_RANK
+from tfidf.train.topic_bayes import TAG2ID, WORD2ID 
+from tfidf.train.convert2array import DB_Kyoto
 
 import sys;
 reload(sys);
 class GetTag(object):
     def __init__(self ):
         self.idf = idf_zhihu()
+        self.db = DB_Kyoto('test.kch')
 
     def get_tag(self, txt):
-        print '------------------------'*2
-        print txt
         topic_rank = defaultdict(float)
         tfidf_list = sorted(self.idf.tf_idf(txt), key=lambda x:x[1], reverse=True)
+        average_tfidf = sum([i[1] for i in tfidf_list])/float(len(tfidf_list))
+        tfidf_list = [ i for i in tfidf_list if i[1]>average_tfidf]
 
-        highest_word_list = []
-        for word, tfidf in tfidf_list[:10]:
-            if word in ID2TAG.values():
-                highest_word_list.append(TAG2ID.id_by_tag(word))
 
-        for word_tfidf, word_id in zip(
-            [i[1] for i in tfidf_list],
+        for (word, word_tfidf), word_id in zip(
+            tfidf_list,
             WORD2ID.id_list_by_word_list(i[0] for i in tfidf_list)
         ):
-            if word_id in BAYES_RANK:
-                for topic_id, bayes in BAYES_RANK[word_id]:
+            topic_items_dict  = self.db.get(word_id)
+            if topic_items_dict:
+                for topic_id, bayes in topic_items_dict:
                     topic_rank[topic_id] += (word_tfidf*bayes)
 
         topic_rank = sorted(topic_rank.iteritems(), key=lambda x:x[1], reverse=True)
+        txt = txt.lower()
+        if topic_rank:
+            rank_avg = float(sum(i[1] for i in topic_rank))/len(topic_rank)
+            for topic_id, rank in topic_rank[:50]:
+                '''
+                推荐主题做二元分词, 如果文章中没有, 则去掉. 
+                '''
+                topic = ID2TAG[topic_id]
+                rank_t = rank/rank_avg
+                for seg in sp_txt(topic):
+                    if seg in txt:
+                        yield topic, rank_t
+                        break
 
-        for topic_id, rank in topic_rank[:10]:
-            '''
-            推荐主题做二元分词, 如果文章中没有, 则去掉. 
-            '''
-            for seg in sp_txt(ID2TAG[topic_id]):
-                if seg in txt:
-                    print ID2TAG[topic_id], rank
+                if rank_t<6:
                     break
-                
-
-        for k in highest_word_list:
-            print ID2TAG[k]
 
 def sp_txt(txt):
     for i in range(len(txt)-1):

File tfidf/test/ucdchina_test.py

 #for f in file_list:
 #    print '-------------------------',f
 #    parse_page(f)
-parse_page('ucdchina/2798')
+#parse_page('ucdchina/2798')

File tfidf/train/train_data/convert_zhihu.py

             entry = {}
             entry['txt'] = ''.join([str(i) for i in data['answer']])
             entry['tags'] = [str(i) for i in data['tags']]
-            print dumps(entry)
+            #print dumps(entry)
+            open(str(i)+'.test','w').write(data.encode('utf-8'))
 
 
     #with open('zhihu.js') as f: