Commits

wooparadog committed 4421e03

f

Comments (0)

Files changed (2)

classification.py

-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-'''
-classification.py
-
-Created on
-2011-12-24
-'''
-import sys;
-reload(sys);
-sys.setdefaultencoding('utf-8')
-from tfidf.idf import idf_zhihu
-from mmseg import seg_txt
-from collections import defaultdict
-from yajl import loads
-
-topic_dict = loads(open('tfidf/relations/topic_dict').read())
-topic_dict = dict(
-    (str(k), int(v)) for k, v in topic_dict.iteritems()
-)
-word_dict = loads(open('tfidf/relations/word_dict').read())
-word_dict = dict(
-    (str(k), int(v)) for k, v in word_dict.iteritems()
-)
-_word_topics = loads(open('tfidf/relations/word_topics').read())
-word_topics = dict(
-    (int(k), dict(
-        (int(k2), v2) for k2, v2 in v.iteritems()
-    ))
-    for k, v in _word_topics.iteritems()
-)
-
-topic_member = loads(open('tfidf/relations/topic_member').read())
-
-TOPIC_ID_NAME_DICT = dict([ (v, k) for k, v in topic_dict.iteritems()])
-
-class ArticleClassification(object):
-    def __init__(self, text):
-        self.text = text.encode('utf-8')
-        self.word_list = list(seg_txt(self.text))
-        self.word_count = len(self.word_list)
-        self.topic_candidate = defaultdict(int)
-        self.idf = idf_zhihu()
-
-    def get_article_topic(self):
-        self.idf_dict = dict(self.idf.tf_idf(self.text))
-        self.process()
-        self.result = sorted(self.topic_candidate.iteritems(), key=lambda x:x[1], reverse=True)
-        return self.result
-
-    def get_nomalised_article_topic(self):
-        if not self.result:
-            self.get_article_topic()
-        sum_weight = sum([x[1] for x in self.result])
-        return
-
-    def process(self):
-        for word, tf_idf in self.idf_dict.items():
-            #print word,tf_idf
-            if word in word_dict:
-                word_id = word_dict[word]
-                if word_id in word_topics:
-                    for topic_id, weight in sorted(word_topics[word_id].iteritems(), key=lambda x:x[1], reverse=True):
-                        self.topic_candidate[topic_id] += tf_idf * weight
-
-def main():
-    t = '''
-    先驱者苹果的iPhone在2007年初次登上舞台,宣告了移动计算时代的到来。消费者下载了数十亿的应用到这些设备上。根据我们的估计,仅2011一年的时间,iOS和Android应用的下载量就达到了250亿次。而且预计这一数字在2012年还会翻番。
-
-    跟其他新技术一样,iOS和Android设备的采用主要是先从从北美和西欧这些可支配收入更高的地方开始的。然而,随着老款iOS型号价格的下降,以及支持Android的OEM提供了更加负担得起的低端市场设备,消费者应用的使用情况发生了明显国际化的转变,包括新兴市场在内。下面我们先上个图热热身,看看美国以外移动应用的市场扩张情况。
-    '''
-
-    new = ArticleClassification(t)
-    out = new.get_article_topic()
-    for i in new.get_article_topic():
-        id, rank = i
-        print TOPIC_ID_NAME_DICT[id], '\t', rank
-#print '\n'.join([ ])
-
-if __name__ == '__main__':
-    main()

design

-文章    ->    主题
-
-需求:
-
-根据文章内容对文章分类
-
-依赖:
-
-0. 主题列表
-1. 词对不同主题的idf表.
-2. 分词.
-
-接口:
-
-
-get_article_topic(text):
-   for each word:
-        for each in word_to_topic(word,text):
-            topicList['topic']+=word_to_topic['topic']
-   return ordered({"topic":weight})
-
-word_to_topic(word,text):
-   call tf(word,text)
-   call idf(word)
-   tf_idf=tf*idf
-   return { "topic":weight }
-
-tf(word,text):
-    return count(word)/count(text)
-
-idf(word):
-    return {"topic":weight}
-
-
-normalise(topic_list):
-    normalise(topicList)
-    return {"topic":_normal(topicList)}
-    
-
-
-
-