
wooparadog committed 5059c63

firest


Files changed (21)

+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import _env
+from itertools import tee, izip
+from array import array
+import sys
+import os.path as path
+from kyotocabinet import *
+from tfidf.config import DB_PATH
+
+MAX_INT = (1<<32)-1
+CURRENT_PATH = path.dirname(path.abspath(__file__))
+
+class DB_Kyoto(object):
+    """docstring for DB_Kyoto"""
+    def __init__(self, db_file):
+        from topic_bayes import TAG2ID, WORD2ID#, BAYES_RANK
+        self.ider = WORD2ID
+        super(DB_Kyoto, self).__init__()
+        self.db = DB()
+        self.db_file = db_file
+        if not self.db.open(path.join(DB_PATH,self.db_file), DB.OWRITER | DB.OCREATE):
+            print >>sys.stderr, "open error: " + str(self.db.error())
+
+    def set(self,entry):
+        key = entry[0]
+        result_array = convert2array(entry[1]).tostring()
+        if not self.db.set(key,result_array):
+            print key
+            print result_array
+            print >>sys.stderr, "set error: " + str(self.db.error())
+
+    def get(self,key):
+        value = self.db.get(key)
+        if value:
+            result = array('L')
+            result.fromstring(value)
+            return convert2dict(result)
+        else:
+            #print >>sys.stderr, self.ider.get_word_by_id(key)
+            #print key
+            #print >>sys.stderr, "%s error: "%key + str(self.db.error())
+            pass
+
+def pairwise(iterable):
+    "s -> (s0,s1), (s1,s2), (s2, s3), ..."
+    a, b = tee(iterable)
+    next(b, None)
+    return izip(a, b)
+
+#for k,v in pairwise(range(10)):
+#    print k,v
+
+def convert2array(pairs):
+    '''
+    Pack (id, float) pairs into a flat array, scaling each float by MAX_INT.
+
+    >>> convert2array([(1, 0.1), (2, 0.3)])
+    array('L', [1L, 429496729L, 2L, 1288490188L])
+    '''
+    result_list = []
+    for k, v in pairs:
+        result_list.extend([k, int(v*MAX_INT)])
+    return array('L', result_list)
+
+def convert2dict(array_l):
+    '''
+    Unpack a flat array of alternating ids and scaled values into a list of
+    (id, value) pairs; the values stay scaled by MAX_INT (429496729 ~ 0.1).
+    '''
+    return [(array_l[i], array_l[i+1]) for i in range(0, len(array_l), 2)]
+
+if __name__=='__main__':
+    import doctest
+    doctest.testmod()
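
A quick round-trip check of the packing helpers above, as a minimal sketch (it assumes the module is importable as convert2array, which is how the training script imports it, and a Python 2 runtime):

    from convert2array import convert2array, convert2dict, MAX_INT

    pairs = [(1, 0.1), (2, 0.3)]
    packed = convert2array(pairs)        # array('L', [1L, 429496729L, 2L, 1288490188L])
    unpacked = convert2dict(packed)      # ids paired with values still scaled by MAX_INT
    recovered = [(k, v / float(MAX_INT)) for k, v in unpacked]
    print recovered                      # roughly [(1, 0.1), (2, 0.3)], up to rounding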
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import _env
+from collections import defaultdict
+from idf import idf_zhihu
+from mmseg import seg_txt
+from yajl import loads
+from zkit.sp_txt import sp_txt
+
+class ParentTagger(object):
+    def __init__(self):
+        from tfidf.train.topic_bayes import TAG2ID 
+        self.word_to_id = TAG2ID.word_to_id()
+        self.word_to_id = dict([(unicode(k),v) for k,v in self.word_to_id.iteritems()])
+
+        self.id_to_word = TAG2ID.id2word()
+
+    def get_parent_tag(self, tag):
+        set_list = []
+
+        for i in sp_txt(tag):
+            if i in self.word_to_id:
+                set_list.append(i)
+
+        return list(set(set_list))
+
+    def get_parent_tag_list_by_list(self,tag_list):
+        out = []
+        for tag in tag_list:
+            parent_tag_list = self.get_parent_tag(tag)
+            out.extend(parent_tag_list)
+        return out
+    
+if __name__ == '__main__':
+    finder = ParentTagger()
+    print ','.join(finder.get_parent_tag(u'用户体验设计'))
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+import _env
+from yajl import loads, dumps
+from collections import defaultdict
+from glob import glob
+from mmseg import seg_txt
+import os
+from os.path import join, dirname
+from zkit.tofromfile import tofile, fromfile
+from tfidf.find_parent_tag import ParentTagger
+from tfidf.config import DATA_DIR
+from convert2array import DB_Kyoto
+
+current_path = os.path.dirname(os.path.abspath(__file__))
+banned_tag_list = ['开放课程']
+TAG_MAPPING = open(join(current_path, 'mapping.py'), 'w')
+WORD_DOC_COUNT = defaultdict(int)
+
+class WordId(object):
+    def __init__(self):
+        self._dict = {}
+        self._id2word_dict = {}
+        self._word_doc_count = defaultdict(int)
+
+    def word_to_id(self):
+        return self._dict
+
+    def get_id_by_tag(self, tag):
+        if tag in self._dict:
+            return self._dict[tag]
+        return None
+
+    def id_by_tag(self, tag):
+        # Strip anything after an opening parenthesis (ASCII or full-width).
+        if u'(' in tag:
+            tag = tag[:tag.find(u'(')]
+        if u'（' in tag:
+            tag = tag[:tag.find(u'（')]
+        lower = tag.lower()
+
+        if tag != lower:
+            print >> TAG_MAPPING, lower, ":", tag
+            tag = lower
+
+        tag = str(tag)
+        _dict = self._dict
+        if tag in _dict:
+            return _dict[tag]
+
+        id = len(_dict)+1
+        _dict[tag] = id
+        WORD_DOC_COUNT[id] += 1
+        return id
+
+    def tofile(self, path):
+        tofile(path, self._dict)
+
+    def fromfile(self, path):
+        self._dict = fromfile(path)
+        return self
+
+    def id_list_by_word_list(self, tag_list):
+        result = []
+        for i in tag_list:
+            result.append(self.id_by_tag(i))
+        return result
+
+    def _reverse_dict(self):
+        if not self._id2word_dict:
+            self._id2word_dict = dict((k, v) for v, k in self._dict.iteritems())
+
+    def id2word(self):
+        self._reverse_dict()
+        return self._id2word_dict
+
+    def get_word_by_id(self, id):
+        self._reverse_dict()
+        if id in self._id2word_dict:
+            return self._id2word_dict[id]
+        return None
+
+    def get_max_id(self):
+        self._reverse_dict()
+        return max(self._id2word_dict.keys())
+
+
+#def word2id(self):
+#    return self._dict
+
+class TagWord(object):
+    def __init__(self, path):
+        print "Loading"
+        self.tag2id = WordId()
+        self.word2id = WordId()
+        self.path = path
+        self.parent_tag_finder = ParentTagger()
+        print "Loading done"
+
+    def _txt_tag_generator(self):
+        path = self.path
+        tag2id = self.tag2id
+        data_files = glob(join(path, '*.data'))
+        zhihu_data = [join(path, 'zhihu')]
+        zhihu_data.extend(data_files)
+
+
+        print 'Processing...'
+        g = open(join(path, 'topic_dict'))
+        topic_dict = loads(g.read())
+
+        count = 0
+        for data_src in zhihu_data:
+            print 'Processing...', data_src
+            with open(data_src) as f:
+                for line in f:
+                    #if count > 1000:
+                    #    break
+                    #count += 1
+                    data = loads(line)
+                    if 'tags' in data:
+                        tags = data['tags']
+                    else:
+                        continue
+
+
+                    tags_processed = []
+                    if 'zhihu' not in data_src:
+                        for tag in tags:
+                            if tag in topic_dict and tag not in banned_tag_list:
+                                tags_processed.append(tag)
+
+                        if not tags_processed:
+                            continue
+                        else:
+                            tags = tags_processed
+                            #print tags
+                            #raw_input()
+                    # Look up parent tags for this document's tags.
+                    parent_list = self.parent_tag_finder.get_parent_tag_list_by_list(tags)
+                    tags.extend(parent_list)
+                    id_list = tag2id.id_list_by_word_list(tags)
+                    yield data['txt'], id_list
+
+    def txt_tag_generator(self):
+        word2id = self.word2id
+        for k, v in self._txt_tag_generator():
+            words = [i for i in list(seg_txt(str(k).lower())) if not i.isdigit()]
+            yield word2id.id_list_by_word_list(words) , v
+
+    def tofile(self):
+        word_id2tag_id = list(self.txt_tag_generator())
+        path = DATA_DIR
+        self.tag2id.tofile(join(path, 'tag2id'))
+        self.word2id.tofile(join(path, 'word2id'))
+        tofile(join(path, 'word_id2tag_id'), word_id2tag_id)
+
+def word_tag_word2tag_fromfile( path):
+    return map(fromfile,
+                map(
+                    lambda x:join(path, x),
+                    ('tag2id', 'word2id')
+                )
+            )
+
+class BayesRank(object):
+    def __init__(self, word_id2tag_id):
+        topic_id_title_count = self.topic_id_title_count = defaultdict(int)
+        word_topic_count = self.word_topic_count = defaultdict(lambda:defaultdict(int))
+        self.tag2id = WordId().fromfile(join(DATA_DIR, 'tag2id'))
+        self.word2id = WordId().fromfile(join(DATA_DIR, 'word2id'))
+        self.db = DB_Kyoto('test.kch')
+
+        for word_id_list, tag_id_list in word_id2tag_id:
+            for tag_id in tag_id_list:
+                topic_id_title_count[tag_id] += 1
+                for word_id in word_id_list:
+                    word_topic_count[word_id][tag_id] += 1
+
+    def rank(self):
+        print 'Ranking'
+        topic_id_title_count = self.topic_id_title_count
+        word_topic_count = self.word_topic_count
+
+
+        #word_topic_bayes = {}
+        for word, topic_count in word_topic_count.iteritems():
+            word_topic_freq = {}
+            # Default to 1 when no document count was recorded for this word,
+            # so the division below cannot hit None or zero.
+            word_doc_count = WORD_DOC_COUNT.get(word, 1)
+            x = 1/float(word_doc_count)
+            for topic_id, count in topic_count.iteritems():
+
+                word_topic_id = self.word2id.get_id_by_tag(self.tag2id.get_word_by_id(topic_id))
+
+                topic2title = topic_id_title_count[topic_id]
+                if topic2title < 10:
+                    continue
+                if word_topic_id != topic_id:
+                    word_topic_freq[topic_id] = (count+1)/float(topic2title+word_doc_count) - x
+
+                else:
+                    word_topic_freq[topic_id] = 1
+
+            count = sum(word_topic_freq.itervalues())
+            self.db.set((word, [(k, (v+x)/(count+x*len(topic_count))) for k, v in word_topic_freq.iteritems()]))
+            #wb = word_topic_bayes[word] = []
+            #for k, v in word_topic_freq.iteritems():
+            #    wb.append((k, v/count))
+
+#return word_topic_bayes
+
+def main():
+    #tagword = TagWord(join(current_path, 'train_data/'))
+    #tagword.tofile()
+    WORD_ID2TAG_ID = fromfile(join(DATA_DIR, 'word_id2tag_id'))
+    bayes_rank = BayesRank(WORD_ID2TAG_ID)
+    bayes_rank.rank()
+    #tofile(join(DATA_DIR, 'bayes_rank') , bayes_rank.rank())
+
+if __name__ == '__main__':
+    #TAG2ID = WordId().fromfile(join(DATA_DIR, 'tag2id'))
+    #print   TAG2ID._dict.keys()
+    main()
+else:
+    #BAYES_RANK = fromfile(join(DATA_DIR, 'bayes_rank'))
+    TAG2ID = WordId().fromfile(join(DATA_DIR, 'tag2id'))
+    WORD2ID = WordId().fromfile(join(DATA_DIR, 'word2id'))
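
The smoothing in BayesRank.rank() is easier to read with toy numbers. A minimal sketch of the same arithmetic for a single word (all counts below are made up; the real code also skips topics attached to fewer than 10 titles):

    # Hypothetical counts for one word:
    word_doc_count = 50                    # documents containing the word
    topic_counts = {7: 20, 9: 12}          # word/topic co-occurrence counts
    titles_per_topic = {7: 40, 9: 25}      # documents carrying each topic

    x = 1.0 / word_doc_count
    freq = {}
    for topic_id, count in topic_counts.items():
        # add-one smoothing, assuming the word is not the topic's own name
        freq[topic_id] = (count + 1) / float(titles_per_topic[topic_id] + word_doc_count) - x

    total = sum(freq.values())
    weights = dict((t, (f + x) / (total + x * len(topic_counts))) for t, f in freq.items())
    print weights                          # these are the values packed into the Kyoto DB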

tag/classification.py

+#coding:utf-8
+
+from collections import defaultdict
+from idf import idf_zhihu
+from mmseg import seg_txt
+from yajl import loads
+from tfidf.train.topic_bayes import TAG2ID, WORD2ID 
+from tfidf.train.convert2array import DB_Kyoto
+from zkit.sp_txt import sp_txt
+
+import sys;
+reload(sys); sys.setdefaultencoding('utf-8')
+
+ID2TAG = TAG2ID.id2word()
+
+class GetTag(object):
+    def __init__(self ):
+        self.idf = idf_zhihu()
+        self.db = DB_Kyoto('test.kch')
+
+    def get_tag(self, txt):
+        topic_rank = defaultdict(float)
+        tfidf_list = sorted(self.idf.tf_idf(txt), key=lambda x:x[1], reverse=True)
+        average_tfidf = sum([i[1] for i in tfidf_list])/float(len(tfidf_list))
+        tfidf_list = [ i for i in tfidf_list if i[1]>average_tfidf]
+
+
+        for (word, word_tfidf), word_id in zip(
+            tfidf_list,
+            WORD2ID.id_list_by_word_list(i[0] for i in tfidf_list)
+        ):
+            topic_items_dict  = self.db.get(word_id)
+            if topic_items_dict:
+                for topic_id, bayes in topic_items_dict:
+                    topic_rank[topic_id] += (word_tfidf*bayes)
+
+        topic_rank = sorted(topic_rank.iteritems(), key=lambda x:x[1], reverse=True)
+        txt = txt.lower()
+        if topic_rank:
+            rank_avg = float(sum(i[1] for i in topic_rank))/len(topic_rank)
+            for topic_id, rank in topic_rank[:50]:
+                # Bigram-segment the candidate topic and keep it only if one
+                # of the bigrams actually appears in the text.
+                topic = ID2TAG[topic_id]
+                rank_t = rank/rank_avg
+                for seg in sp_txt(topic):
+                    if seg in txt:
+                        yield topic, rank_t
+                        break
+
+                if rank_t<6:
+                    break
+
+
+if __name__ == '__main__':
+    txt = '''
+Pinterest的一些思考
+周末在家的时候,除了重构了部分代码以外,最多的时候想的就是Pinterest这件事情。最近太多关关于Pinterest的新闻出来了,包括花瓣拿到的4.5 M 美金的投资。包括估值巨高的Pinterest的各种事情。
+其实回过来看Pinterest和最象它的豆瓣相册。附上我们对:豆瓣相册统计
+Pinterest的模式更为松散,确切的说,Pinterest的模式的信息粒度是单张照片,一花一世界。Pinterest的松散的方式让逛变得没有目的。
+豆瓣相册的模式信息粒度突出的其实是单个相册。相册和单个照片不一样,豆瓣热门的相册大部分都以:xxxx美食教学,xxxx的20种方法,最温馨的xxxx个瞬间这样的标题。我们是一个一个相册的获得信息,在看到单个照片前我们通常是带有一定的目的的。
+另外一个很类似的东西是微博的图片分享,但是绝大多数微博的图片分享都局限于自己的美食经历,自己的穿衣打扮,和生活状态。
+这是三个完全不同的目的导向的产品,虽然他们面向的人群和内容是有交集,有共性的,但是他们最终的走向的却是不同的内容和受众,看pinterest的人,看豆瓣相册的人,看微博相册的人,人都是不一样的,目的也都是不一样的。
+在中国分享的人群更少,大家耗在微博和qq空间,甚至豆瓣的时间都很多。而且从一个宏观的大角度上来看,中国远远还不到饱暖思淫欲的时刻,中国人很多时候还是在想如何在淘宝赚钱,或者说更多人还停留在网址导航,停留在打开电脑只看qq的年代。
+我一直坚信的是,facebook和twitter打通了一条信息的流动的通路,但是通往信息最终散落的地方的很多重要的,有价值的内容其实并没有 得到完全的承载。因此如果说前一阵(5年左右时间)的大事情是信息的传播,社会化的话,我相信在一段时间过去最大的价值是各种有价值的信息的承载和细分。
+这些细分已经逐渐的显现出来了。包括,音乐类Spotify。问答类Quora。旅行类daodao等等。在一段时间内的细分市场会更加垂直和深入,以不同的方式展示和聚合最有价值的部分信息,真正为社会化的网络搭建的这条信息通道输送内容。
+那下一个是Pinterest吗?它能不能在中国顺利的成长?我觉得借鉴一下delicious的经验就可以知道这是很难的一条路,yupoo也没有完全复制Flickr的成功。或许或许,在中国Pinterest的机会不在花瓣,而在于美丽说。    
+'''
+    cla = GetTag()
+    for i in cla.get_tag(txt):
+        print i
+
+#ID2TAG = TAG2ID.id2word()
+#
+#if __name__ == '__main__':
+#
+#    txt = '''
+#Pinterest的一些思考
+#周末在家的时候,除了重构了部分代码以外,最多的时候想的就是Pinterest这件事情。最近太多关关于Pinterest的新闻出来了,包括花瓣拿到的4.5 M 美金的投资。包括估值巨高的Pinterest的各种事情。
+#其实回过来看Pinterest和最象它的豆瓣相册。附上我们对:豆瓣相册统计
+#Pinterest的模式更为松散,确切的说,Pinterest的模式的信息粒度是单张照片,一花一世界。Pinterest的松散的方式让逛变得没有目的。
+#豆瓣相册的模式信息粒度突出的其实是单个相册。相册和单个照片不一样,豆瓣热门的相册大部分都以:xxxx美食教学,xxxx的20种方法,最温馨的xxxx个瞬间这样的标题。我们是一个一个相册的获得信息,在看到单个照片前我们通常是带有一定的目的的。
+#另外一个很类似的东西是微博的图片分享,但是绝大多数微博的图片分享都局限于自己的美食经历,自己的穿衣打扮,和生活状态。
+#这是三个完全不同的目的导向的产品,虽然他们面向的人群和内容是有交集,有共性的,但是他们最终的走向的却是不同的内容和受众,看pinterest的人,看豆瓣相册的人,看微博相册的人,人都是不一样的,目的也都是不一样的。
+#在中国分享的人群更少,大家耗在微博和qq空间,甚至豆瓣的时间都很多。而且从一个宏观的大角度上来看,中国远远还不到饱暖思淫欲的时刻,中国人很多时候还是在想如何在淘宝赚钱,或者说更多人还停留在网址导航,停留在打开电脑只看qq的年代。
+#我一直坚信的是,facebook和twitter打通了一条信息的流动的通路,但是通往信息最终散落的地方的很多重要的,有价值的内容其实并没有 得到完全的承载。因此如果说前一阵(5年左右时间)的大事情是信息的传播,社会化的话,我相信在一段时间过去最大的价值是各种有价值的信息的承载和细分。
+#这些细分已经逐渐的显现出来了。包括,音乐类Spotify。问答类Quora。旅行类daodao等等。在一段时间内的细分市场会更加垂直和深入,以不同的方式展示和聚合最有价值的部分信息,真正为社会化的网络搭建的这条信息通道输送内容。
+#那下一个是Pinterest吗?它能不能在中国顺利的成长?我觉得借鉴一下delicious的经验就可以知道这是很难的一条路,yupoo也没有完全复制Flickr的成功。或许或许,在中国Pinterest的机会不在花瓣,而在于美丽说。    
+#'''
+#    topic_rank = defaultdict(float)
+#    idf = idf_zhihu()
+#    tfidf_list = sorted(idf.tf_idf(txt), key=lambda x:x[1], reverse=True)
+#
+#    for word, tfidf in tfidf_list:
+#        print "-",word,tfidf
+#    print ''
+#    for word_tfidf, word_id in zip(
+#        [i[1] for i in tfidf_list],
+#        WORD2ID.id_list_by_word_list(i[0] for i in tfidf_list)
+#    ):
+#        if word_id in BAYES_RANK:
+#            for topic_id, bayes in BAYES_RANK[word_id]:
+#                topic_rank[topic_id] += (word_tfidf*bayes)
+#
+#    topic_rank = sorted(topic_rank.iteritems(), key=lambda x:x[1], reverse=True)
+#    for topic_id, rank in topic_rank:
+#        print ID2TAG[topic_id], rank

tfidf/.ropeproject/config.py

+# The default ``config.py``
+
+
+def set_prefs(prefs):
+    """This function is called before opening the project"""
+
+    # Specify which files and folders to ignore in the project.
+    # Changes to ignored resources are not added to the history and
+    # VCSs.  Also they are not returned in `Project.get_files()`.
+    # Note that ``?`` and ``*`` match all characters but slashes.
+    # '*.pyc': matches 'test.pyc' and 'pkg/test.pyc'
+    # 'mod*.pyc': matches 'test/mod1.pyc' but not 'mod/1.pyc'
+    # '.svn': matches 'pkg/.svn' and all of its children
+    # 'build/*.o': matches 'build/lib.o' but not 'build/sub/lib.o'
+    # 'build//*.o': matches 'build/lib.o' and 'build/sub/lib.o'
+    prefs['ignored_resources'] = ['*.pyc', '*~', '.ropeproject',
+                                  '.hg', '.svn', '_svn', '.git']
+
+    # Specifies which files should be considered python files.  It is
+    # useful when you have scripts inside your project.  Only files
+    # ending with ``.py`` are considered to be python files by
+    # default.
+    #prefs['python_files'] = ['*.py']
+
+    # Custom source folders:  By default rope searches the project
+    # for finding source folders (folders that should be searched
+    # for finding modules).  You can add paths to that list.  Note
+    # that rope guesses project source folders correctly most of the
+    # time; use this if you have any problems.
+    # The folders should be relative to project root and use '/' for
+    # separating folders regardless of the platform rope is running on.
+    # 'src/my_source_folder' for instance.
+    #prefs.add('source_folders', 'src')
+
+    # You can extend python path for looking up modules
+    #prefs.add('python_path', '~/python/')
+
+    # Should rope save object information or not.
+    prefs['save_objectdb'] = True
+    prefs['compress_objectdb'] = False
+
+    # If `True`, rope analyzes each module when it is being saved.
+    prefs['automatic_soa'] = True
+    # The depth of calls to follow in static object analysis
+    prefs['soa_followed_calls'] = 0
+
+    # If `False` when running modules or unit tests "dynamic object
+    # analysis" is turned off.  This makes them much faster.
+    prefs['perform_doa'] = True
+
+    # Rope can check the validity of its object DB when running.
+    prefs['validate_objectdb'] = True
+
+    # How many undos to hold?
+    prefs['max_history_items'] = 32
+
+    # Shows whether to save history across sessions.
+    prefs['save_history'] = True
+    prefs['compress_history'] = False
+
+    # Set the number spaces used for indenting.  According to
+    # :PEP:`8`, it is best to use 4 spaces.  Since most of rope's
+    # unit-tests use 4 spaces it is more reliable, too.
+    prefs['indent_size'] = 4
+
+    # Builtin and c-extension modules that are allowed to be imported
+    # and inspected by rope.
+    prefs['extension_modules'] = []
+
+    # Add all standard c-extensions to extension_modules list.
+    prefs['import_dynload_stdmods'] = True
+
+    # If `True` modules with syntax errors are considered to be empty.
+    # The default value is `False`; When `False` syntax errors raise
+    # `rope.base.exceptions.ModuleSyntaxError` exception.
+    prefs['ignore_syntax_errors'] = False
+
+    # If `True`, rope ignores unresolvable imports.  Otherwise, they
+    # appear in the importing namespace.
+    prefs['ignore_bad_imports'] = False
+
+
+def project_opened(project):
+    """This function is called after opening the project"""
+    # Do whatever you like here!

tfidf/__init__.py

Empty file added.
+import sys
+reload(sys)
+sys.setdefaultencoding('utf-8')
+from os.path import dirname, abspath, join
+PWD = dirname(abspath(__file__))
+sys.path.append(dirname(PWD))
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import _env
+import os.path as path
+
+CURRENT_PATH = path.dirname(path.abspath(__file__))
+
+DATA_DIR = path.join(CURRENT_PATH,'data')
+DB_PATH = "/mnt/42qu_data/db/"
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import _env
+import os
+from collections import defaultdict
+from math import log
+from mmseg import seg_txt
+from yajl import loads
+import sys;
+reload(sys);
+sys.setdefaultencoding('utf-8')
+from os.path import join
+from zkit.tofromfile import tofile, fromfile
+from tfidf.config import DATA_DIR
+
+class Idf(object):
+    def __init__(self):
+        self._idf = defaultdict(int)
+        self._count = 0
+
+    def append(self, txt):
+        for i in set(seg_txt(str(txt.lower()))):
+            self._idf[i] += 1
+        self._count += 1
+
+    def idf(self):
+        result = {}
+        count = float(self._count)
+        for k, v in self._idf.iteritems():
+            result[k] = log(count/v, 2)
+            # During IDF training, drop words whose IDF falls below
+            # 1/1,000,000, i.e. words that occur in virtually every document.
+            if result[k] < 1/1000000.0:
+                result.pop(k)
+        return result
+
+    def tofile(self, f):
+        tofile(
+                f, (self._count, self.idf())
+              )
+
+    def fromfile(self, f):
+        self._count , self._idf = fromfile(f)
+
+
+    def tf_idf(self, txt):
+        tf = defaultdict(int)
+        for i in seg_txt(str(txt.lower())):
+            tf[i] += 1
+        result = []
+        for k, v in tf.iteritems():
+            if k in self._idf:
+                result.append((k, v*self._idf[k]))
+        return result
+
+def idf_zhihu():
+    current_path = os.path.dirname(os.path.abspath(__file__))
+    idf = Idf()
+    idf.fromfile(join(DATA_DIR, 'zhihu.idf'))
+    return idf
+
+
+if __name__ == '__main__':
+    pass
+
+    #tf_idf_by_zhihu()
+    #idf = idf_zhihu()
+    #for k, v in idf.tf_idf('我不可思议是什么的人'):
+    #    print k, v
+
+
+#print tf_idf('我','我不可思议是什么的人')
+#current_path = os.path.dirname(os.path.abspath(__file__))
+#data=[]
+
+#total_files = len(data)
+#def idf_list(word_list):
+#    word_idf_dict = defaultdict(int)
+#
+#    ##for i in data:
+#    ##    ans = '\n'.join([x['answer'] for x in i['answer']])
+#    ##    for word in word_list:
+#    ##        if word in i['body'] or word in ans or word in i['title']:
+#    ##            word_idf_dict[word]+=1
+#    ##word_idf_dict = [(k,log(total_files/float(v))) for k,v in word_idf_dict.items()]
+#
+#    return word_idf_dict
+#
+##def idf(word):
+##    word_idf=0
+##    for i in data:
+##        ans = ''.join([x['answer'] for x in i['answer']])
+##        if word in i['body'] or word in ans or word in i['title']:
+##            word_idf+=1
+##
+##    word_idf = log(total_files/float(word_idf))
+##    return word_idf
+#
+##def tf(word,text):
+##    words = list(seg_txt(text))
+##    print words
+##    count = text.count(word)
+##    return count/float(len(words))
+##
+##def tf_idf(word,text):
+##    return tf(word,text)*idf(word)
+#
+
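
For reference, a minimal usage sketch of the Idf class above (the sample strings and output path are invented; it assumes mmseg and the zkit helpers are installed):

    from tfidf.idf import Idf

    idf = Idf()
    for doc in (u'用户体验设计的一些思考', u'豆瓣相册和图片分享'):
        idf.append(doc)                    # segment the text and count document frequency
    idf.tofile('/tmp/sample.idf')          # persists (document count, idf table)

    idf2 = Idf()
    idf2.fromfile('/tmp/sample.idf')
    print idf2.tf_idf(u'图片分享的用户体验')   # [(word, tf*idf), ...] for known words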

tfidf/misc/.ropeproject/config.py

+# The default ``config.py``
+
+
+def set_prefs(prefs):
+    """This function is called before opening the project"""
+
+    # Specify which files and folders to ignore in the project.
+    # Changes to ignored resources are not added to the history and
+    # VCSs.  Also they are not returned in `Project.get_files()`.
+    # Note that ``?`` and ``*`` match all characters but slashes.
+    # '*.pyc': matches 'test.pyc' and 'pkg/test.pyc'
+    # 'mod*.pyc': matches 'test/mod1.pyc' but not 'mod/1.pyc'
+    # '.svn': matches 'pkg/.svn' and all of its children
+    # 'build/*.o': matches 'build/lib.o' but not 'build/sub/lib.o'
+    # 'build//*.o': matches 'build/lib.o' and 'build/sub/lib.o'
+    prefs['ignored_resources'] = ['*.pyc', '*~', '.ropeproject',
+                                  '.hg', '.svn', '_svn', '.git']
+
+    # Specifies which files should be considered python files.  It is
+    # useful when you have scripts inside your project.  Only files
+    # ending with ``.py`` are considered to be python files by
+    # default.
+    #prefs['python_files'] = ['*.py']
+
+    # Custom source folders:  By default rope searches the project
+    # for finding source folders (folders that should be searched
+    # for finding modules).  You can add paths to that list.  Note
+    # that rope guesses project source folders correctly most of the
+    # time; use this if you have any problems.
+    # The folders should be relative to project root and use '/' for
+    # separating folders regardless of the platform rope is running on.
+    # 'src/my_source_folder' for instance.
+    #prefs.add('source_folders', 'src')
+
+    # You can extend python path for looking up modules
+    #prefs.add('python_path', '~/python/')
+
+    # Should rope save object information or not.
+    prefs['save_objectdb'] = True
+    prefs['compress_objectdb'] = False
+
+    # If `True`, rope analyzes each module when it is being saved.
+    prefs['automatic_soa'] = True
+    # The depth of calls to follow in static object analysis
+    prefs['soa_followed_calls'] = 0
+
+    # If `False` when running modules or unit tests "dynamic object
+    # analysis" is turned off.  This makes them much faster.
+    prefs['perform_doa'] = True
+
+    # Rope can check the validity of its object DB when running.
+    prefs['validate_objectdb'] = True
+
+    # How many undos to hold?
+    prefs['max_history_items'] = 32
+
+    # Shows whether to save history across sessions.
+    prefs['save_history'] = True
+    prefs['compress_history'] = False
+
+    # Set the number spaces used for indenting.  According to
+    # :PEP:`8`, it is best to use 4 spaces.  Since most of rope's
+    # unit-tests use 4 spaces it is more reliable, too.
+    prefs['indent_size'] = 4
+
+    # Builtin and c-extension modules that are allowed to be imported
+    # and inspected by rope.
+    prefs['extension_modules'] = []
+
+    # Add all standard c-extensions to extension_modules list.
+    prefs['import_dynload_stdmods'] = True
+
+    # If `True` modules with syntax errors are considered to be empty.
+    # The default value is `False`; When `False` syntax errors raise
+    # `rope.base.exceptions.ModuleSyntaxError` exception.
+    prefs['ignore_syntax_errors'] = False
+
+    # If `True`, rope ignores unresolvable imports.  Otherwise, they
+    # appear in the importing namespace.
+    prefs['ignore_bad_imports'] = False
+
+
+def project_opened(project):
+    """This function is called after opening the project"""
+    # Do whatever you like here!

tfidf/misc/.ropeproject/globalnames

Binary file added.

tfidf/misc/.ropeproject/history

Binary file added.

tfidf/misc/.ropeproject/objectdb

Binary file added.
+import sys
+reload(sys)
+sys.setdefaultencoding('utf-8')
+from os.path import dirname, abspath, join
+PWD = dirname(dirname(abspath(__file__)))
+sys.path.append(dirname(PWD))
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+import sys;
+reload(sys);
+sys.setdefaultencoding('utf-8')
+
+
+from yajl import loads, dumps
+
+def main():
+    with open('relations/topic_dict') as f:
+        topic_dict = dict([(v,k) for k,v in loads(f.read()).items()])
+    with open('relations/topic_member') as f:
+        topic_member = loads(f.read())
+    topic_member = sorted(topic_member.iteritems(), key=lambda x:len(x[1]), reverse=True)
+    for k,v in topic_member:
+        print len(v), ' - ', topic_dict[int(k)]
+
+
+if __name__ == '__main__':
+    main()
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import _env
+from tfidf.train.topic_bayes import TAG2ID, WORD2ID, BAYES_RANK
+
+def rmse(num_list):
+    # Note: despite the name this returns the mean squared deviation
+    # (the variance), not its square root.
+    if num_list:
+        length = float(len(num_list))
+        E = sum(num_list)/length
+        li = map(lambda x: (x-E)**2, num_list)
+        return sum(li)/length
+
+def main():
+    topic_count = TAG2ID.get_max_id()
+    for word_id,topic_list in BAYES_RANK.iteritems():
+        print WORD2ID.get_word_by_id(word_id),rmse(map(lambda x:x[1],topic_list))
+    
+
+if __name__ == '__main__':
+    main()

tfidf/test/__init__.py

+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+'''
+__init__.py
+Author: WooParadog
+Email:  Guohaochuan@gmail.com
+
+Created on
+2011-12-24
+'''
+
+def main():
+    pass
+
+if __name__ == '__main__':
+    pass
+import sys
+reload(sys)
+sys.setdefaultencoding('utf-8')
+from os.path import dirname, abspath, join
+PWD = dirname(dirname(abspath(__file__)))
+sys.path.append(dirname(PWD))

tfidf/test/run_test.py

+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import _env
+from collections import defaultdict
+from tfidf.train.topic_bayes import TAG2ID, WORD2ID#, BAYES_RANK
+from tfidf.train.convert2array import DB_Kyoto
+from tfidf.idf import idf_zhihu
+from mmseg import seg_txt
+from yajl import loads
+import os.path as path
+from glob import glob
+
+import sys;
+reload(sys);
+sys.setdefaultencoding('utf-8')
+
+ID2TAG = TAG2ID.id2word()
+TAG2ID = dict((k,v) for v,k in ID2TAG.iteritems())
+
+CURRENT_PATH = path.dirname(path.abspath(__file__))
+
+class GetTag(object):
+
+    def __init__(self, folder):
+        self.idf = idf_zhihu()
+        file_list = glob(path.join(CURRENT_PATH, folder)+'/*')
+        self.file_list = file_list
+        self.db = DB_Kyoto('test.kch')
+
+    def run_test(self):
+        for i in self.file_list:
+            with open(i) as f:
+                title = i.rsplit('/', 1)[-1][:-4]
+                txt = title+'\n\n'+f.read()
+                print '------------------------------------'
+                print txt
+                print '------------------------------------'
+                self.get_tag(txt)
+
+    def get_tag(self,txt):
+        topic_rank = defaultdict(float)
+        tfidf_list = sorted(self.idf.tf_idf(txt), key=lambda x:x[1], reverse=True)
+        average_tfidf = sum([i[1] for i in tfidf_list])/float(len(tfidf_list))
+        tfidf_list = [ i for i in tfidf_list if i[1]>average_tfidf]
+
+
+        for (word, word_tfidf), word_id in zip(
+            tfidf_list,
+            WORD2ID.id_list_by_word_list(i[0] for i in tfidf_list)
+        ):
+            topic_items_dict  = self.db.get(word_id)
+            if topic_items_dict:
+                for topic_id, bayes in topic_items_dict:
+                    topic_rank[topic_id] += (word_tfidf*bayes)
+
+        topic_rank = sorted(topic_rank.iteritems(), key=lambda x:x[1], reverse=True)
+        txt = txt.lower()
+        if topic_rank:
+            rank_avg = float(sum(i[1] for i in topic_rank))/len(topic_rank)
+            for topic_id, rank in topic_rank[:50]:
+                # Bigram-segment the candidate topic and keep it only if one
+                # of the bigrams actually appears in the text.
+                topic = ID2TAG[topic_id]
+                rank_t = rank/rank_avg
+                for seg in sp_txt(topic):
+                    if seg in txt:
+                        print topic, rank_t
+                        break
+
+                if rank_t<6:
+                    break
+
+                #if topic.replace(" ","").isalnum():
+                #    if topic.lower() in txt:
+                #        print topic, rank_t
+
+
+        print ""
+
+        for word,word_tfidf in tfidf_list[:50]:
+            if word in WORD2ID.word_to_id():
+                word_id = WORD2ID.word_to_id()[word]
+                topic_items_dict  = self.db.get(word_id)
+                out = ''
+                if topic_items_dict:
+                    # stored values are scaled by MAX_INT = (1<<32)-1
+                    out = u'|'.join([unicode(ID2TAG.get(i[0])+'"'+str(i[0]))+'":'+str(float(i[1])/((1<<32)-1))  for i in sorted(topic_items_dict,key=lambda x:x[1],reverse=True)[:10]])
+                print  word,':',word_tfidf,'---',out
+
+        #if tfidf_list:
+        #    idf_avg = float(sum(i[1] for i in tfidf_list))/len(tfidf_list)
+        #    for word, tfidf in tfidf_list:
+        #        if word in TAG2ID:
+        #            rank = tfidf/ idf_avg
+        #            if rank<6:
+        #                break
+        #            print word, rank
+
+#            print ID2TAG[k]
+
+def sp_txt(txt):
+    txt = unicode(txt)
+    for i in range(len(txt)-1):
+        yield str(txt[i:i+2])
+
+def main():
+    folder = 'articles'
+    test = GetTag(folder)
+    test.run_test()
+
+
+if __name__ == '__main__':
+    #import cProfile
+    #cProfile.run('main()')
+    main()
+

tfidf/test/ucdchina_test.py

+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import _env
+from zkit.bot_txt import txt_wrap_by_all, txt_wrap_by
+from zkit.htm2txt import htm2txt
+from zkit.txt_cleanup import clean_txt
+from run_test import GetTag
+from yajl import dumps
+
+TAGGER = GetTag('ucdchina/')
+def parse_page(filepath):
+    with open(filepath) as f:
+        page = f.read()
+
+        title = txt_wrap_by('<title>', '- UCD大社区', page)
+        author = txt_wrap_by('style=" float:left; color:#999;">', '</span', page)
+        author = txt_wrap_by('作者:', '|', author)
+        content_wrapper = txt_wrap_by('<div id="pageContentWrap" style="font-size:13px; ">', '</div', page)
+        url =txt_wrap_by('阅读和发布评论:<a href="','"',page)
+
+        if content_wrapper:
+            content,pic_list = htm2txt(content_wrapper)
+        else:
+            return 
+        
+        content = str(content)
+        tags = TAGGER.get_tag((content+title))
+        out = dumps([title,url])
+        print out
+        print ""
+
+#from glob import glob
+#file_list=glob('ucdchina/*')
+#for f in file_list:
+#    print '-------------------------',f
+#    parse_page(f)
+#parse_page('ucdchina/2798')
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import _env
+import os
+from os.path import join
+from yajl import loads
+from tfidf.idf import Idf
+from tfidf.config import DATA_DIR
+from zkit.txt_cleanup import line_iter
+
+def tf_idf_by_zhihu():
+    current_path = os.path.dirname(os.path.abspath(__file__))
+    infile = join(current_path,'train_data/','out.js')
+    outfile = join(DATA_DIR, 'zhihu.idf')
+    idf = Idf()
+
+    with open(infile) as lib:
+        for line in lib:
+            l = loads(line)
+            idf.append( l['title'] )
+            for j in l['answer']:
+                idf.append(j['answer'])
+
+    with open(join(DATA_DIR,"review.txt")) as review:
+        result = []
+        for line in review:
+            line = line.strip()
+            if not line:
+                continue
+            if line.startswith(">->->"):
+                if result:
+                    line = line.split(" ",5)
+                    result.append(line[-1])
+                    txt = "\n".join(result)
+                    idf.append(txt)
+                    #print line[1]
+                    #print txt
+                    #raw_input()
+                result = []
+            else:
+                result.append(line)
+
+    idf.tofile(outfile)
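
The layout of the two training inputs is only implied by the parsing above. A rough sketch of what a line of out.js is assumed to look like (the field names come from the code, the content is invented):

    from yajl import dumps

    # one JSON object per line in train_data/out.js
    sample_line = dumps({'title': u'Pinterest的一些思考',
                         'answer': [{'answer': u'信息粒度是单张照片'}]})
    print sample_line

review.txt is assumed to be plain text in which header lines starting with ">->->" separate the records; everything after the fifth space on such a header line is appended to the preceding record before it is added to the IDF counts.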