Commits

wooparadog committed 8f0b0eb

f

  • Parent commits 37befa6

Files changed (12)

File marvin/similarity.py

+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import math
+
+def multi(ite):
+    # product of all items in the iterable
+    out = 1
+    for i in ite:
+        out *= i
+    return out
+
+def sim_cosine(item_A_vector, item_B_vector):
+    '''
+    Cosine similarity of two sparse vectors given as {id: rank} dicts;
+    keys missing from one vector are treated as 0.
+    >>> round(sim_cosine({1:2,2:3,5:2}, {1:1.9,2:2.1,5:2}), 6)
+    0.986376
+    '''
+    all_keys = set(item_A_vector.keys()) | set(item_B_vector.keys())
+    # the two vector norms
+    base = [math.sqrt(sum(x**2 for x in vector.values())) for vector in (item_A_vector, item_B_vector)]
+    # dot product over the union of keys
+    h = sum(item_A_vector.get(key, 0)*item_B_vector.get(key, 0) for key in all_keys)
+    return h/float(multi(base))
+
+if __name__=='__main__':
+    import doctest
+    doctest.testmod()
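
The doctest value can be checked by hand from the cosine formula dot(A, B) / (|A| * |B|); a short sketch of that computation, using the same vectors as the docstring above:

# Worked check of the docstring example.
import math

A = {1: 2, 2: 3, 5: 2}
B = {1: 1.9, 2: 2.1, 5: 2}

dot = sum(A[k] * B.get(k, 0) for k in A)               # 2*1.9 + 3*2.1 + 2*2 = 14.1
norm_a = math.sqrt(sum(v ** 2 for v in A.values()))    # sqrt(17)
norm_b = math.sqrt(sum(v ** 2 for v in B.values()))    # sqrt(12.02)
print dot / (norm_a * norm_b)                          # ~0.986376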

File tfidf/data/tag2id

Binary file modified.

File tfidf/data/word2id

Binary file modified.

File tfidf/data/word_id2tag_id

Binary file modified.

File tfidf/test/run_test.py

         for i in self.file_list:
             with open(i) as f:
                 title = i.rsplit('/', 1)[-1][:-4]
-                self.get_tag(title+'\n\n'+f.read())
+                txt = title+'\n\n'+f.read()
+                print '------------------------------------'
+                print txt
+                print '------------------------------------'
+                self.get_tag(txt)
 
     def get_tag(self,txt):
-        print '------------------------------------'
-        print txt
-        print '------------------------------------'
-
         topic_rank = defaultdict(float)
         tfidf_list = sorted(self.idf.tf_idf(txt), key=lambda x:x[1], reverse=True)
+        if tfidf_list:
+            average_tfidf = sum(i[1] for i in tfidf_list)/float(len(tfidf_list))
+            tfidf_list = [i for i in tfidf_list if i[1] > average_tfidf]
 
 
         for (word, word_tfidf), word_id in zip(
                 '''
                Segment recommended topics into bigrams; if none of the segments appear in the article, drop the topic.
                 '''
+                topic = ID2TAG[topic_id]
                 rank_t = rank/rank_avg
+                for seg in sp_txt(topic):
+                    if seg in txt:
+                        print topic, rank_t
+                        break
+
                 if rank_t<6:
                     break
 
-                topic = ID2TAG[topic_id]
-                if topic.replace(" ","").isalnum():
-                    if topic.lower() in txt:
-                        print topic, rank_t
-                else: 
-                    for seg in sp_txt(topic):
-                        if seg in txt:
-                            print topic, rank_t
-                            break
+                #if topic.replace(" ","").isalnum():
+                #    if topic.lower() in txt:
+                #        print topic, rank_t
 
 
         print ""
-        if tfidf_list:
-            idf_avg = float(sum(i[1] for i in tfidf_list))/len(tfidf_list)
-            for word, tfidf in tfidf_list:
-                if word in TAG2ID:
-                    rank = tfidf/ idf_avg
-                    if rank<6:
-                        break
-                    #print word, rank
+
+        word_to_id = WORD2ID.word_to_id()
+        for word, word_tfidf in tfidf_list[:50]:
+            if word in word_to_id:
+                word_id = word_to_id[word]
+                topic_items_dict = self.db.get(word_id)
+                out = ''
+                if topic_items_dict:
+                    # top 10 (topic_id, value) pairs for this word, values scaled down by (2<<32)-1
+                    out = u'|'.join([
+                        unicode(ID2TAG.get(i[0])+'"'+str(i[0]))+'":'+str(float(i[1])/((2<<32)-1))
+                        for i in sorted(topic_items_dict, key=lambda x: x[1], reverse=True)[:10]
+                    ])
+                print word, ':', word_tfidf, '---', out
+
+        #if tfidf_list:
+        #    idf_avg = float(sum(i[1] for i in tfidf_list))/len(tfidf_list)
+        #    for word, tfidf in tfidf_list:
+        #        if word in TAG2ID:
+        #            rank = tfidf/ idf_avg
+        #            if rank<6:
+        #                break
+        #            print word, rank
 
 #            print ID2TAG[k]
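
The rewritten `get_tag` keeps only terms scoring above the mean tf-idf and then matches topics by their segments, as the docstring above describes. A minimal standalone sketch of that filter-and-match idea (names such as `segment` are illustrative stand-ins, not functions from this module):

def filter_above_average(tfidf_list):
    # tfidf_list: (word, score) pairs; keep only the above-average scores
    if not tfidf_list:
        return []
    avg = sum(score for _, score in tfidf_list) / float(len(tfidf_list))
    return [(word, score) for word, score in tfidf_list if score > avg]

def topic_matches(topic, txt, segment):
    # a topic counts as present if any of its segments occurs in the text
    # (segment() stands in for the sp_txt() used above)
    return any(seg in txt for seg in segment(topic))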
 

File tfidf/test/ucdchina_test.py

+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import _env
+from zkit.bot_txt import txt_wrap_by_all, txt_wrap_by
+from zkit.htm2txt import htm2txt
+from zkit.txt_cleanup import clean_txt
+from run_test import GetTag
+from yajl import dumps
+
+TAGGER = GetTag('ucdchina/')
+def parse_page(filepath):
+    with open(filepath) as f:
+        page = f.read()
+
+        title = txt_wrap_by('<title>', '- UCD大社区', page)
+        author = txt_wrap_by('style=" float:left; color:#999;">', '</span', page)
+        author = txt_wrap_by('作者:', '|', author)
+        content_wrapper = txt_wrap_by('<div id="pageContentWrap" style="font-size:13px; ">', '</div', page)
+        url = txt_wrap_by('阅读和发布评论:<a href="', '"', page)
+
+        if content_wrapper:
+            content, pic_list = htm2txt(content_wrapper)
+        else:
+            return
+
+        content = str(content)
+        tags = TAGGER.get_tag(content+title)
+        out = dumps([title,url])
+        print out
+        print ""
+
+#from glob import glob
+#file_list=glob('ucdchina/*')
+#for f in file_list:
+#    print '-------------------------',f
+#    parse_page(f)
+parse_page('ucdchina/2798')
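
`txt_wrap_by` from `zkit.bot_txt` is used above to cut the fragment between two markers out of the page. Its implementation is not part of this commit; a rough sketch of the behaviour the call sites appear to assume:

# Assumed behaviour: return the text between the first occurrence of `start`
# and the next occurrence of `end`, or None when either marker is missing.
def txt_wrap_by_sketch(start, end, text):
    i = text.find(start)
    if i == -1:
        return None
    i += len(start)
    j = text.find(end, i)
    if j == -1:
        return None
    return text[i:j]

# e.g. txt_wrap_by_sketch('<title>', '</title>', '<title>Example</title>') -> 'Example'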

File tfidf/train/convert2array.py

         else:
             #print >>sys.stderr, self.ider.get_word_by_id(key)
             #print key
-            print >>sys.stderr, "%s error: "%key + str(self.db.error())
+            #print >>sys.stderr, "%s error: "%key + str(self.db.error())
+            pass
 
 
 def pairwise(iterable):

File tfidf/train/idf.py

 import os
 from tfidf.idf import Idf
 from tfidf.config import DATA_DIR
+from zkit.txt_cleanup import line_iter
 
 def tf_idf_by_zhihu():
     current_path = os.path.dirname(os.path.abspath(__file__))

File tfidf/train/topic_bayes.py

 from convert2array import DB_Kyoto
 
 current_path = os.path.dirname(os.path.abspath(__file__))
-banned_tag_list=['开放课程']
+banned_tag_list = ['开放课程']
+TAG_MAPPING = open(join(current_path, 'mapping.py'), 'w')
+WORD_DOC_COUNT = defaultdict(int)
 
 class WordId(object):
     def __init__(self):
         self._dict = {}
         self._id2word_dict = {}
+        self._word_doc_count = defaultdict(int)
 
     def word_to_id(self):
         return self._dict
             tag = tag[:tag.find(u'(')]
         if u'(' in tag:
             tag = tag[:tag.find(u'(')]
-        tag = tag.lower()
+        lower = tag.lower()
+
+        if tag != lower:
+            print >> TAG_MAPPING, lower, ":", tag
+            tag = lower
+
         tag = str(tag)
         _dict = self._dict
         if tag in _dict:
             return _dict[tag]
+
         id = len(_dict)+1
         _dict[tag] = id
+        WORD_DOC_COUNT[id] += 1
         return id
 
     def tofile(self, path):
         g = open(join(path, 'topic_dict'))
         topic_dict = loads(g.read())
 
-        #count = 0
+        count = 0
         for data_src in zhihu_data:
             print 'Processing...', data_src
             with open(data_src) as f:
                 for line in f:
-                    #if count>1000:
+                    #if count > 1000:
                     #    break
-                    #count+=1
+                    #count += 1
                     data = loads(line)
                     if 'tags' in data:
                         tags = data['tags']
         #word_topic_bayes = {}
         for word, topic_count in word_topic_count.iteritems():
             word_topic_freq = {}
+            word_doc_count = WORD_DOC_COUNT.get(word)
+            x = 1/float(word_doc_count)
             for topic_id, count in topic_count.iteritems():
 
                 word_topic_id = self.word2id.get_id_by_tag(self.tag2id.get_word_by_id(topic_id))
                 if topic2title < 10:
                     continue
                 if word_topic_id != topic_id:
-                    word_topic_freq[topic_id] = count/float(topic2title)
+                    word_topic_freq[topic_id] = (count+1)/float(topic2title+word_doc_count) - x
+
                 else:
                     word_topic_freq[topic_id] = 1
 
             count = sum(word_topic_freq.itervalues())
-            self.db.set((word,[(k,v/count) for k,v in word_topic_freq.iteritems()]))
+            self.db.set((word, [(k, (v+x)/(count+x*len(topic_count))) for k, v in word_topic_freq.iteritems()]))
             #wb = word_topic_bayes[word] = []
             #for k, v in word_topic_freq.iteritems():
             #    wb.append((k, v/count))
 
-        #return word_topic_bayes
+#return word_topic_bayes
 
 def main():
-    tagword = TagWord(join(current_path, 'train_data/'))
-    tagword.tofile()
-    WORD_ID2TAG_ID = fromfile(join(DATA_DIR, 'word_id2tag_id'))
+    #tagword = TagWord(join(current_path, 'train_data/'))
+    #tagword.tofile()
+    #WORD_ID2TAG_ID = fromfile(join(DATA_DIR, 'word_id2tag_id'))
     bayes_rank = BayesRank(WORD_ID2TAG_ID)
-    bayes_rank.rank()
+    
+    #bayes_rank.rank()
     #tofile(join(DATA_DIR, 'bayes_rank') , bayes_rank.rank())
 
 if __name__ == '__main__':
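
The reworked `rank()` appears to apply an additive-smoothing correction when normalising a word's topic frequencies (the `x = 1/word_doc_count` term above). A small sketch of that normalisation step under the same reading; the function name and shapes are illustrative, not this module's API:

def smooth_normalise(topic_freq, x):
    # topic_freq: {topic_id: raw frequency} for one word; x: smoothing constant.
    # Mirrors (v + x) / (count + x * len(topic_count)) from the change above.
    total = sum(topic_freq.itervalues())
    n = len(topic_freq)
    return dict((topic_id, (freq + x) / (total + x * n))
                for topic_id, freq in topic_freq.iteritems())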

File zhihu/zhihu_explore.py

+# -*- coding: utf-8 -*-
+
+import urllib2
+from random import choice
+from urllib2 import urlopen
+from urllib import urlencode
+from json import loads
+import _env
+from zkit.bot_txt import txt_wrap_by_all
+from xml.sax.saxutils import unescape
+from time import sleep
+import time
+from os.path import exists
+from zhihu_page import page_fetch, fetch
+
+
+a = [14, 50, 34, 2, 53, 33]
+
+
+def main():
+    cookies = (
+            ('b3179509@nwldx.com', '_xsrf=7ed86e897bae4b9e8cf3e660efed7baf; q_c0=MTk2OTAzfGdmWDM5Q2pZNVpaUW9UTzA=|1326267926|eedfe70f85add0db0ecda1e73200cac9b085ecc6; __utma=155987696.1247389772.1322703824.1326190947.1326266591.29; __utmb=155987696.34.10.1326266591; __utmc=155987696; __utmz=155987696.1325768571.27.6.utmcsr=zhihu.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __utmv=155987696.Logged%20In'),
+            )
+
+    headers = {
+            'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+            'Accept-Charset':'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
+            'Accept-Language':'en,en-US;q=0.8,zh-CN;q=0.6,zh;q=0.4',
+            'Cache-Control':'max-age=0',
+            'Connection':'keep-alive',
+            'Host':'www.zhihu.com',
+            'Referer': 'http://www.zhihu.com/',
+            'User-Agent':'Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.12 Safari/535.11',
+            }
+    count = 0
+    headers['cookie'] = cookies[0][1]
+    explore_page = fetch('http://www.zhihu.com/explore', headers=headers)
+    url_list = txt_wrap_by_all('question/', 'answer', explore_page)
+    for i in url_list:
+        print i
+
+    #for id in url_list:
+    #    if id.isdigit():
+    #        count += 1
+    #        path = 'explorer/%s.html'%id
+    #        if exists(path):
+    #            continue
+    #        try:
+    #            cookie = choice(cookies)
+    #            headers['cookie'] = cookie[1]
+    #            print id
+    #            html = page_fetch(id, headers)
+    #            if '您的帐户因为异常活动已被冻结' in html or '请输入图中的数字' in html:
+    #                cookies.remove(cookie)
+    #                if len(cookies)==0:
+    #                    return
+    #            time.sleep(choice(a))
+    #        except:
+    #            continue
+    #        with open(path, 'w') as zhihu:
+    #            zhihu.write(html)
+
+if __name__ == '__main__':
+    main()

File zhihu/zhihu_page.py

 from time import sleep
 from os.path import exists
 
-def page_fetch(id,headers=None):
+def fetch(url, headers=None):
     if not headers:
         headers = {
-                    'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1.7) Gecko/20091221 Firefox/3.5.7',
-                   'Accept': ' text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
-                   'Accept-Language':'zh-cn,zh;q=0.5',
-                   'Accept-Charset':'gb18030,utf-8;q=0.7,*;q=0.7',
-                   'Content-type':'application/x-www-form-urlencoded'
+                'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1.7) Gecko/20091221 Firefox/3.5.7',
+                'Accept': ' text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
+                'Accept-Language':'zh-cn,zh;q=0.5',
+                'Accept-Charset':'gb18030,utf-8;q=0.7,*;q=0.7',
+                'Content-type':'application/x-www-form-urlencoded'
                 }
 
-
-
+    print url
     request = urllib2.Request(
-        url='http://www.zhihu.com/question/%s'%id,
-        headers=headers
-    )
+            url,
+            headers=headers
+            )
     urlopener = urllib2.build_opener()
     r = urlopener.open(request)
 
     j = r.read()
+    print j
 
     return j
 
+def page_fetch(id, headers=None):
+    url = 'http://www.zhihu.com/question/%s' % id
+    return fetch(url, headers)
+
 
 def main():
     count = 0

File zkit/txt_cleanup.py

+#coding:utf-8
+from collections import defaultdict
+
+CN_CHAR = 1
+EN_CHAR = 2
+STOP_CHAR = 3
+
+
+def _en(char):
+    if len(char) >= 2:
+        v = []
+        for i in char:
+            if '.' <= i <= 'z':
+                v.append(i)
+        if v:
+            return ''.join(v) , EN_CHAR
+
+
+def _cn_en_iter(line):
+    line = line.decode('utf-8', 'ignore')
+    pre_char = []
+    for i in line:
+        if i.isdigit() or '.' <= i <= 'z':
+            pre_char.append(i)
+        else:
+            if pre_char:
+                r = _en(''.join(pre_char))
+                if r:
+                    yield r
+                pre_char = []
+
+            if u'\u4e00' <= i and i < u'\u9fa6':
+                yield i, CN_CHAR
+            else:
+                yield i, STOP_CHAR
+
+    r = _en(''.join(pre_char))
+    if r:
+        yield r
+
+def _iter(line):
+    for char, word_type in _cn_en_iter(line):
+        yield char.encode('utf-8', 'ignore'), word_type
+
+def line_iter(line):
+    for i in _line_iter(line):
+        yield ''.join(i)
+
+def _line_iter(line):
+    char_buffer = []
+    for char, word_type in _iter(line):
+        if word_type != STOP_CHAR:
+            char_buffer.append(char)
+        elif char_buffer:
+            yield char_buffer
+            char_buffer = []
+    if char_buffer:
+        yield char_buffer
+
+def clean_txt(txt):
+    return ' '.join([i for i in line_iter(txt) if i])
+
+if __name__ == '__main__':
+    txt = """
+第一次听说google的simhash算法[1]时,我感到很神奇。传统的hash算法只负责将原始内容尽量均匀随机地映射为一个签名值,原理上相当于伪随机数产生算法。传统hash算法产生的两个签名,如果相等,说明原始内容在一定概率下是相等的;如果不相等,除了说明原始内容不相等外,不再提供任何信息,因为即使原始内容只相差一个字节,所产生的签名也很可能差别极大。从这个意义上来说,要设计一个hash算法,对相似的内容产生的签名也相近,是更为艰难的任务,因为它的签名值除了提供原始内容是否相等的信息外,还能额外提供不相等的原始内容的差异程度的信息。
+
+    因此当 我知道google plus +1[[https://google.com]]的simhash算法产生的签名,可以用来比较原始内容的相似度时,便很想了解这种神奇的算法的原理。出人意料,这个算法并不深奥,其思想是非常清澈美妙的。
+
+simhash算法的输入是一个向量,输出是一个f位的签名值。为了陈述方便,假设输入的是一个文档的特征集合,每个特征有一定的权重。比如特征可以是文档中的词,其权重可以是这个词出现的次数。simhash算法如下:
+"""
+    #for i in line_iter(txt):
+    #    # top 50 words by tf-idf, sorted by tf-idf
+    #    if i:
+    #        print i
+    print clean_txt(txt)
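
The sample text above describes Google's simhash algorithm; purely as an illustration of what it sketches (not something this commit implements), a minimal 64-bit simhash over weighted features could look like this:

# Minimal simhash sketch: hash each feature, add or subtract its weight per bit,
# and keep the sign of each column as the corresponding signature bit.
import hashlib

def simhash(features, f=64):
    # features: iterable of (token, weight) pairs
    v = [0] * f
    for token, weight in features:
        h = int(hashlib.md5(token).hexdigest(), 16)
        for i in range(f):
            v[i] += weight if (h >> i) & 1 else -weight
    sig = 0
    for i in range(f):
        if v[i] > 0:
            sig |= 1 << i
    return sig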