Commits

wooparadog  committed ec29b5c

f

  • Participants
  • Parent commits f205636

Comments (0)

Files changed (17)

File tfidf/classification.py

 from idf import idf_zhihu
 from mmseg import seg_txt
 from yajl import loads
-from generate_lib import TAG2ID, WORD2ID, BAYES_RANK
+from tfidf.train.generate_lib import TAG2ID, WORD2ID, BAYES_RANK
 
 import sys;
 reload(sys);

File tfidf/config.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-

# Shared path configuration for the tfidf package.
import _env
import os.path as path

# Absolute directory of this file (the tfidf package root).
CURRENT_PATH = path.dirname(path.abspath(__file__))

# All corpora and trained artifacts live under tfidf/data.
DATA_DIR = path.join(CURRENT_PATH,'data')

File tfidf/data/tag2id

Binary file modified.

File tfidf/data/word2id

Binary file modified.

File tfidf/data/word_id2tag_id

Binary file modified.

File tfidf/dump_load.py

Empty file removed.

File tfidf/find_parent_tag.py

 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 
+import _env
 from collections import defaultdict
 from idf import idf_zhihu
 from mmseg import seg_txt
 
         return out
 
-    def get_parent_tag_list_by_list(tag_list):
+    def get_parent_tag_list_by_list(self,tag_list):
         out = []
         for tag in tag_list:
             parent_tag_id_list = self.get_parent_tag(tag)

File tfidf/generate_lib.py

-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-import _env
-from yajl import loads, dumps
-from collections import defaultdict
-from mmseg import seg_txt
-import os
-from os.path import join, dirname
-from tofromfile import tofile, fromfile
-from find_parent_tag import ParentTagger
-
-current_path = os.path.dirname(os.path.abspath(__file__))
-
-class WordId(object):
-    def __init__(self):
-        self._dict = {}
-    
-    def word_to_id(self):
-        return self._dict
-
-    def get_id_by_tag(self,tag):
-        if tag in self._dict:
-            return _dict[tag]
-        return None
-
-    def id_by_tag(self, tag):
-        tag = str(tag)
-        _dict = self._dict
-        if tag in _dict:
-            return _dict[tag]
-        id = len(_dict)+1
-        _dict[tag] = id
-        return id
-
-    def tofile(self, path):
-        tofile(path, self._dict)
-
-    def fromfile(self, path):
-        self._dict = fromfile(path)
-        return self
-
-    def id_list_by_word_list(self, tag_list):
-        result = []
-        for i in tag_list:
-            result.append(self.id_by_tag(i))
-        return result
-
-    def id2word(self):
-        return dict((k,v) for v,k in self._dict.iteritems())
-
-class TagWord(object):
-    def __init__(self, path):
-        self.tag2id = WordId()
-        self.word2id = WordId()
-        self.path = path
-        self.parent_tag_finder = ParentTagger()
-
-    def _txt_tag_generator(self):
-        path = self.path
-        tag2id = self.tag2id
-        with open(path) as f:
-            for line in f:
-                data = loads(line)
-                tags = data['tags']
-                '''
-                查找上级标签
-                '''
-                parent_list = self.parent_tag_finder.get_parent_tag_list_by_list(tags)
-                tags.extend(parent_list)
-                id_list = tag2id.id_list_by_word_list(tags)
-                yield data['title'], id_list
-                for ans in data['answer']:
-                    yield ans['answer'], id_list
-                '''
-                训练时, 将主题也算作一个词来处理.
-                '''
-                for tag in tags:
-                    yield tag,id_list
-
-    def txt_tag_generator(self):
-        word2id = self.word2id
-        for k, v in self._txt_tag_generator():
-            words = list(seg_txt(str(k).lower()))
-            yield word2id.id_list_by_word_list(words) , v
-
-    def tofile(self):
-        word_id2tag_id = list(self.txt_tag_generator())
-        path = dirname(self.path)
-        self.tag2id.tofile(join(path, 'tag2id'))
-        self.word2id.tofile(join(path, 'word2id'))
-        tofile(join(path, 'word_id2tag_id'), word_id2tag_id)
-
-def word_tag_word2tag_fromfile( path):
-    return map(fromfile,
-                map(
-                    lambda x:join(path, x),
-                    ('tag2id', 'word2id')
-                )
-            )
-
-
-class BayesRank(object):
-    def __init__(self, word_id2tag_id):
-        topic_id_title_count = self.topic_id_title_count = defaultdict(int)
-        word_topic_count = self.word_topic_count = defaultdict(lambda:defaultdict(int))
-
-        for word_id_list, tag_id_list in word_id2tag_id:
-            for tag_id in tag_id_list:
-                topic_id_title_count[tag_id] += 1
-                for word_id in word_id_list:
-                    word_topic_count[word_id][tag_id] += 1
-
-    def rank(self):
-        topic_id_title_count = self.topic_id_title_count
-        word_topic_count = self.word_topic_count
-
-        word_topic_bayes = {}
-        for word, topic_count in word_topic_count.iteritems():
-            word_topic_freq = {}
-            for topic_id, count in topic_count.iteritems():
-                topic2title = topic_id_title_count[topic_id]
-                if topic2title<20:
-                    continue
-                word_topic_freq[topic_id] = count/float(topic2title)
-
-            count = sum(word_topic_freq.itervalues())
-            wb = word_topic_bayes[word] = []
-            for k, v in word_topic_freq.iteritems():
-                wb.append((k, v/count))
-        return word_topic_bayes
-
-def main():
-    tagword=TagWord("data/out.js")
-    tagword.tofile()
-    WORD_ID2TAG_ID = fromfile( "data/word_id2tag_id")
-    bayes_rank = BayesRank(WORD_ID2TAG_ID)
-    tofile( "data/bayes_rank" , bayes_rank.rank())
-
-if __name__ == '__main__':
-    main()
-else:
-    BAYES_RANK = fromfile( "data/bayes_rank")
-    TAG2ID = WordId().fromfile(join(current_path, 'data/tag2id'))
-    WORD2ID = WordId().fromfile(join(current_path, 'data/word2id'))

File tfidf/idf.py

 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 
+import _env
 import os
 from collections import defaultdict
 from math import log
 from mmseg import seg_txt
 from yajl import loads
-from prefix import prefix_word
 import sys;
 reload(sys);
 sys.setdefaultencoding('utf-8')
 from os.path import join
-from tofromfile import tofile, fromfile
+from zkit.tofromfile import tofile, fromfile
+from tfidf.config import DATA_DIR
 
 class Idf(object):
     def __init__(self):
 def idf_zhihu():
     current_path = os.path.dirname(os.path.abspath(__file__))
     idf = Idf()
-    idf.fromfile(join(current_path, 'zhihu.idf'))
+    idf.fromfile(join(DATA_DIR, 'zhihu.idf'))
     return idf
 
-def tf_idf_by_zhihu():
-    current_path = os.path.dirname(os.path.abspath(__file__))
-    infile = join(current_path, 'data/out.js')
-    outfile = join(current_path, 'zhihu.idf')
-    idf = Idf()
-
-
-    with open(infile) as lib:
-        for line in lib:
-            l = loads(line)
-            idf.append( l['title'] )
-            for j in l['answer']:
-                idf.append(j['answer'])
-
-    with open(join(current_path,"data/review.txt")) as review:
-        result = []
-        for line in review:
-            line = line.strip()
-            if not line:
-                continue
-            if line.startswith(">->->"):
-                if result:
-                    line = line.split(" ",5)
-                    result.append(line[-1])
-                    txt = "\n".join(result)
-                    idf.append(txt)
-                    print line[1]
-                    #print txt
-                    #raw_input()
-                result = []
-            else:
-                result.append(line)
-
-    idf.tofile(outfile)
 
 if __name__ == '__main__':
-    tf_idf_by_zhihu()
+    pass
 
+    #tf_idf_by_zhihu()
     #idf = idf_zhihu()
     #for k, v in idf.tf_idf('我不可思议是什么的人'):
     #    print k, v

File tfidf/misc/dist.py

+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+import sys;
+reload(sys);
+sys.setdefaultencoding('utf-8')
+
+
+from yajl import loads, dumps
+
+def main():
+    with open('relations/topic_dict')as f:
+        topic_dict = dict([(v,k) for k,v in loads(f.read()).items()])
+    with open('relations/topic_member') as f:
+        topic_member = loads(f.read())
+    topic_member = sorted(topic_member.iteritems(),key=lambda x:len(x[1]),reverse = True)
+    for k,v in topic_member:
+        print len(v) ,' - ',topic_dict[int(k)]
+    pass
+
+
+if __name__ == '__main__':
+    main()

File tfidf/prefix.py

-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-prefix_word = ["?",
-"、",
-"。",
-"“",
-"”",
-"《",
-"》",
-"!",
-",",
-":",
-";",
-"?",
-"末",
-"啊",
-"阿",
-"哎",
-"哎呀",
-"哎哟",
-"唉",
-"俺",
-"俺们",
-"按",
-"按照",
-"吧",
-"吧哒",
-"把",
-"罢了",
-"被",
-"本",
-"本着",
-"比",
-"比方",
-"比如",
-"鄙人",
-"彼",
-"彼此",
-"边",
-"别",
-"别的",
-"别说",
-"并",
-"并且",
-"不比",
-"不成",
-"不单",
-"不但",
-"不独",
-"不管",
-"不光",
-"不过",
-"不仅",
-"不拘",
-"不论",
-"不怕",
-"不然",
-"不如",
-"不特",
-"不惟",
-"不问",
-"不只",
-"朝",
-"朝着",
-"趁",
-"趁着",
-"乘",
-"冲",
-"除",
-"除此之外",
-"除非",
-"除了",
-"此",
-"此间",
-"此外",
-"从",
-"从而",
-"打",
-"待",
-"但",
-"但是",
-"当",
-"当着",
-"到",
-"得",
-"的",
-"的话",
-"等",
-"等等",
-"地",
-"第",
-"叮咚",
-"对",
-"对于",
-"多",
-"多少",
-"而",
-"而况",
-"而且",
-"而是",
-"而外",
-"而言",
-"而已",
-"尔后",
-"反过来",
-"反过来说",
-"反之",
-"非但",
-"非徒",
-"否则",
-"嘎",
-"嘎登",
-"该",
-"赶",
-"个",
-"各",
-"各个",
-"各位",
-"各种",
-"各自",
-"给",
-"根据",
-"跟",
-"故",
-"故此",
-"固然",
-"关于",
-"管",
-"归",
-"果然",
-"果真",
-"过",
-"哈",
-"哈哈",
-"呵",
-"和",
-"何",
-"何处",
-"何况",
-"何时",
-"嘿",
-"哼",
-"哼唷",
-"呼哧",
-"乎",
-"哗",
-"还是",
-"还有",
-"换句话说",
-"换言之",
-"或",
-"或是",
-"或者",
-"极了",
-"及",
-"及其",
-"及至",
-"即",
-"即便",
-"即或",
-"即令",
-"即若",
-"即使",
-"几",
-"几时",
-"己",
-"既",
-"既然",
-"既是",
-"继而",
-"加之",
-"假如",
-"假若",
-"假使",
-"鉴于",
-"将",
-"较",
-"较之",
-"叫",
-"接着",
-"结果",
-"借",
-"紧接着",
-"进而",
-"尽",
-"尽管",
-"经",
-"经过",
-"就",
-"就是",
-"就是说",
-"据",
-"具体地说",
-"具体说来",
-"开始",
-"开外",
-"靠",
-"咳",
-"可",
-"可见",
-"可是",
-"可以",
-"况且",
-"啦",
-"来",
-"来着",
-"离",
-"例如",
-"哩",
-"连",
-"连同",
-"两者",
-"了",
-"临",
-"另",
-"另外",
-"另一方面",
-"论",
-"嘛",
-"吗",
-"慢说",
-"漫说",
-"冒",
-"么",
-"每",
-"每当",
-"们",
-"莫若",
-"某",
-"某个",
-"某些",
-"拿",
-"哪",
-"哪边",
-"哪儿",
-"哪个",
-"哪里",
-"哪年",
-"哪怕",
-"哪天",
-"哪些",
-"哪样",
-"那",
-"那边",
-"那儿",
-"那个",
-"那会儿",
-"那里",
-"那么",
-"那么些",
-"那么样",
-"那时",
-"那些",
-"那样",
-"乃",
-"乃至",
-"呢",
-"能",
-"你",
-"你们",
-"您",
-"宁",
-"宁可",
-"宁肯",
-"宁愿",
-"哦",
-"呕",
-"啪达",
-"旁人",
-"呸",
-"凭",
-"凭借",
-"其",
-"其次",
-"其二",
-"其他",
-"其它",
-"其一",
-"其余",
-"其中",
-"起",
-"起见",
-"起见",
-"岂但",
-"恰恰相反",
-"前后",
-"前者",
-"且",
-"然而",
-"然后",
-"然则",
-"让",
-"人家",
-"任",
-"任何",
-"任凭",
-"如",
-"如此",
-"如果",
-"如何",
-"如其",
-"如若",
-"如上所述",
-"若",
-"若非",
-"若是",
-"啥",
-"上下",
-"尚且",
-"设若",
-"设使",
-"甚而",
-"甚么",
-"甚至",
-"省得",
-"时候",
-"什么",
-"什么样",
-"使得",
-"是",
-"是的",
-"首先",
-"谁",
-"谁知",
-"顺",
-"顺着",
-"似的",
-"虽",
-"虽然",
-"虽说",
-"虽则",
-"随",
-"随着",
-"所",
-"所以",
-"他",
-"他们",
-"他人",
-"它",
-"它们",
-"她",
-"她们",
-"倘",
-"倘或",
-"倘然",
-"倘若",
-"倘使",
-"腾",
-"替",
-"通过",
-"同",
-"同时",
-"哇",
-"万一",
-"往",
-"望",
-"为",
-"为何",
-"为了",
-"为什么",
-"为着",
-"喂",
-"嗡嗡",
-"我",
-"我们",
-"呜",
-"呜呼",
-"乌乎",
-"无论",
-"无宁",
-"毋宁",
-"嘻",
-"吓",
-"相对而言",
-"像",
-"向",
-"向着",
-"嘘",
-"呀",
-"焉",
-"沿",
-"沿着",
-"要",
-"要不",
-"要不然",
-"要不是",
-"要么",
-"要是",
-"也",
-"也罢",
-"也好",
-"一",
-"一般",
-"一旦",
-"一方面",
-"一来",
-"一切",
-"一样",
-"一则",
-"依",
-"依照",
-"矣",
-"以",
-"以便",
-"以及",
-"以免",
-"以至",
-"以至于",
-"以致",
-"抑或",
-"因",
-"因此",
-"因而",
-"因为",
-"哟",
-"用",
-"由",
-"由此可见",
-"由于",
-"有",
-"有的",
-"有关",
-"有些",
-"又",
-"于",
-"于是",
-"于是乎",
-"与",
-"与此同时",
-"与否",
-"与其",
-"越是",
-"云云",
-"哉",
-"再说",
-"再者",
-"在",
-"在下",
-"咱",
-"咱们",
-"则",
-"怎",
-"怎么",
-"怎么办",
-"怎么样",
-"怎样",
-"咋",
-"照",
-"照着",
-"者",
-"这",
-"这边",
-"这儿",
-"这个",
-"这会儿",
-"这就是说",
-"这里",
-"这么",
-"这么点儿",
-"这么些",
-"这么样",
-"这时",
-"这些",
-"这样",
-"正如",
-"吱",
-"之",
-"之类",
-"之所以",
-"之一",
-"只是",
-"只限",
-"只要",
-"只有",
-"至",
-"至于",
-"诸位",
-"着",
-"着呢",
-"自",
-"自从",
-"自个儿",
-"自各儿",
-"自己",
-"自家",
-"自身",
-"综上所述",
-"总的来看",
-"总的来说",
-"总的说来",
-"总而言之",
-"总之",
-"纵",
-"纵令",
-"纵然",
-"纵使",
-"遵照",
-"作为",
-"兮",
-"呃",
-"呗",
-"咚",
-"咦",
-"喏",
-"啐",
-"喔唷",
-"嗬",
-"嗯",
-"嗳",
-]

File tfidf/test/run_test.py

+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+from collections import defaultdict
+from generate_lib import TAG2ID, WORD2ID, BAYES_RANK
+from idf import idf_zhihu
+from mmseg import seg_txt
+from yajl import loads
+import os.path as path
+from glob import glob
+
+import sys;
+reload(sys);
+sys.setdefaultencoding('utf-8')
+
+ID2TAG = TAG2ID.id2word()
+
+CURRENT_PATH = path.dirname(path.abspath(__file__))
+
class GetTag(object):
    """Tag every article file found under *folder* using the trained
    tf-idf model plus the bayes-rank word->topic weights."""

    def __init__(self, folder):
        # trained idf model loaded from disk
        self.idf = idf_zhihu()
        # each file under <tests dir>/<folder>/ is one article; its file
        # name minus a 4-character extension is used as the title
        file_list = glob(path.join(CURRENT_PATH, folder)+'/*')
        self.file_list = file_list

    def run_test(self):
        """Tag each article, prefixing its body with the title."""
        for i in self.file_list:
            with open(i) as f:
                title = i.rsplit('/', 1)[-1][:-4]
                self.get_tag(title+'\n\n'+f.read())

    def get_tag(self,txt):
        """Print candidate topics for *txt*.

        Two signals are combined: (a) words among the 10 highest tf-idf
        terms that are themselves known tag names, and (b) topics scored
        by sum(word_tfidf * bayes_weight) over all words of the text.
        """
        topic_rank = defaultdict(float)
        # (word, tfidf) pairs, highest tf-idf first
        tfidf_list = sorted(self.idf.tf_idf(txt), key=lambda x:x[1], reverse=True)

        highest_word_list = []
        for word, tfidf in tfidf_list[:10]:
            # NOTE(review): linear scan of ID2TAG.values() per word;
            # id_by_tag only resolves (never allocates) here because the
            # word is already a known tag name
            if word in ID2TAG.values():
                highest_word_list.append(TAG2ID.id_by_tag(word))

        # accumulate bayes-weighted tf-idf mass per topic
        for word_tfidf, word_id in zip(
            [i[1] for i in tfidf_list],
            WORD2ID.id_list_by_word_list(i[0] for i in tfidf_list)
        ):
            if word_id in BAYES_RANK:
                for topic_id, bayes in BAYES_RANK[word_id]:
                    topic_rank[topic_id] += (word_tfidf*bayes)

        topic_rank = sorted(topic_rank.iteritems(), key=lambda x:x[1], reverse=True)

        for topic_id, rank in topic_rank[:10]:
            # keep a candidate topic only if one of its character bigrams
            # actually occurs in the text (original note below)
            '''
            推荐主题做二元分词, 如果文章中没有, 则去掉. 
            '''
            for seg in sp_txt(ID2TAG[topic_id]):
                if seg in txt:
                    print ID2TAG[topic_id], rank
                    break
                

        for k in highest_word_list:
            print ID2TAG[k]
+
def sp_txt(txt):
    """Yield every overlapping two-character slice (bigram) of *txt*.

    A string shorter than two characters yields nothing.
    """
    for left, right in zip(txt, txt[1:]):
        yield left + right
+
def main():
    """Run the tagger over every article in the 'articles' test corpus."""
    GetTag('articles').run_test()


if __name__ == '__main__':
    main()

File tfidf/train/__init__.py

+#!/usr/bin/env python
+# -*- coding: utf-8 -*-

File tfidf/train/_env.py

# Bootstrap module: switch the process-wide default encoding to utf-8
# and make the tfidf package importable when train/ scripts run directly.
import sys
# sys.setdefaultencoding is deleted by site.py at startup; reload(sys)
# restores it so it can be called below (Python 2 idiom).
reload(sys)
sys.setdefaultencoding('utf-8')
from os.path import dirname, abspath, join
# __file__ is .../tfidf/train/_env.py, so PWD is the tfidf package dir.
PWD = dirname(dirname(abspath(__file__)))
# Append the repository root so `import tfidf.<module>` resolves.
sys.path.append(dirname(PWD))

File tfidf/train/generate_lib.py

+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+import _env
+from yajl import loads, dumps
+from collections import defaultdict
+from mmseg import seg_txt
+import os
+from os.path import join, dirname
+from zkit.tofromfile import tofile, fromfile
+from tfidf.find_parent_tag import ParentTagger
+
+from tfidf.config import DATA_DIR
+
+current_path = os.path.dirname(os.path.abspath(__file__))
+
class WordId(object):
    """Bidirectional word <-> integer-id mapping.

    Ids are allocated incrementally starting at 1, in the order words
    are first seen by id_by_tag(). Keys are always stored as str.
    """

    def __init__(self):
        self._dict = {}

    def word_to_id(self):
        """Return the underlying word -> id dict."""
        return self._dict

    def get_id_by_tag(self,tag):
        """Return the id of *tag*, or None if it has no id yet.

        Unlike id_by_tag() this never allocates a new id.
        """
        # normalize to str for consistency with id_by_tag()
        tag = str(tag)
        if tag in self._dict:
            # BUGFIX: was `return _dict[tag]` -> NameError when the tag existed
            return self._dict[tag]
        return None

    def id_by_tag(self, tag):
        """Return the id of *tag*, allocating the next free id if unseen."""
        tag = str(tag)
        _dict = self._dict
        if tag in _dict:
            return _dict[tag]
        id = len(_dict)+1
        _dict[tag] = id
        return id

    def tofile(self, path):
        """Serialize the mapping to *path*."""
        tofile(path, self._dict)

    def fromfile(self, path):
        """Load the mapping from *path*; returns self for chaining."""
        self._dict = fromfile(path)
        return self

    def id_list_by_word_list(self, tag_list):
        """Map a sequence of words to ids, allocating ids as needed."""
        return [self.id_by_tag(tag) for tag in tag_list]

    def id2word(self):
        """Return the inverse mapping id -> word."""
        return dict((k,v) for v,k in self._dict.iteritems())
+
class TagWord(object):
    """Turn the raw zhihu dump (one json question per line) into
    id-encoded (word_id_list, tag_id_list) training pairs and persist
    the id tables next to the dump."""

    def __init__(self, path):
        # path: line-delimited json dump with 'title', 'answer', 'tags'
        self.tag2id = WordId()
        self.word2id = WordId()
        self.path = path
        self.parent_tag_finder = ParentTagger()

    def _txt_tag_generator(self):
        """Yield (raw_text, tag_id_list) for each title, answer and tag."""
        path = self.path
        tag2id = self.tag2id
        with open(path) as f:
            for line in f:
                data = loads(line)
                tags = data['tags']
                # enrich the question's tags with their parent tags
                '''
                查找上级标签
                '''
                parent_list = self.parent_tag_finder.get_parent_tag_list_by_list(tags)
                tags.extend(parent_list)
                id_list = tag2id.id_list_by_word_list(tags)
                yield data['title'], id_list
                for ans in data['answer']:
                    yield ans['answer'], id_list
                # during training each tag name itself also counts as a word
                '''
                训练时, 将主题也算作一个词来处理.
                '''
                for tag in tags:
                    yield tag,id_list

    def txt_tag_generator(self):
        """Yield (word_id_list, tag_id_list) pairs; text is lower-cased
        and segmented with mmseg before id encoding."""
        word2id = self.word2id
        for k, v in self._txt_tag_generator():
            words = list(seg_txt(str(k).lower()))
            yield word2id.id_list_by_word_list(words) , v

    def tofile(self):
        """Materialize all pairs, then write tag2id / word2id /
        word_id2tag_id into the directory containing the dump."""
        word_id2tag_id = list(self.txt_tag_generator())
        path = dirname(self.path)
        self.tag2id.tofile(join(path, 'tag2id'))
        self.word2id.tofile(join(path, 'word2id'))
        tofile(join(path, 'word_id2tag_id'), word_id2tag_id)
+
def word_tag_word2tag_fromfile( path):
    """Load the serialized tag2id and word2id tables stored under *path*.

    Returns [tag2id_data, word2id_data].
    """
    return [fromfile(join(path, name)) for name in ('tag2id', 'word2id')]
+
class BayesRank(object):
    """Per-word topic weights from (word_id_list, tag_id_list) pairs."""

    def __init__(self, word_id2tag_id):
        # how many titles/documents each topic was attached to
        self.topic_id_title_count = defaultdict(int)
        # word -> topic -> co-occurrence count
        self.word_topic_count = defaultdict(lambda:defaultdict(int))

        for word_ids, tag_ids in word_id2tag_id:
            for tag in tag_ids:
                self.topic_id_title_count[tag] += 1
                for word in word_ids:
                    self.word_topic_count[word][tag] += 1

    def rank(self):
        """Return {word_id: [(topic_id, normalized_weight), ...]}.

        Topics backed by fewer than 20 titles are skipped; the remaining
        per-topic frequencies are normalized to sum to 1 per word.
        """
        title_counts = self.topic_id_title_count
        word_topic_bayes = {}
        for word, per_topic in self.word_topic_count.iteritems():
            freq = {}
            for topic, hits in per_topic.iteritems():
                titles = title_counts[topic]
                if titles < 20:
                    # too little evidence for this topic
                    continue
                freq[topic] = hits / float(titles)

            total = sum(freq.itervalues())
            pairs = word_topic_bayes[word] = []
            for topic, value in freq.iteritems():
                pairs.append((topic, value / total))
        return word_topic_bayes
+
def main():
    # Train: build the tag/word id tables and the bayes-rank matrix
    # from the raw dump, persisting everything under DATA_DIR.
    tagword=TagWord(join(DATA_DIR,"out.js"))
    tagword.tofile()
    WORD_ID2TAG_ID = fromfile(join(DATA_DIR,"word_id2tag_id"))
    bayes_rank = BayesRank(WORD_ID2TAG_ID)
    tofile(join(DATA_DIR, "bayes_rank") , bayes_rank.rank())

if __name__ == '__main__':
    main()
else:
    # On import, expose the precomputed artifacts instead of retraining.
    BAYES_RANK = fromfile(join(DATA_DIR,"bayes_rank"))
    TAG2ID = WordId().fromfile(join(DATA_DIR,'tag2id'))
    WORD2ID = WordId().fromfile(join(DATA_DIR,'word2id'))

File tfidf/train/tf_idf.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import _env
import os
# BUGFIX: the body uses join() and loads() but neither was imported,
# so tf_idf_by_zhihu() raised NameError on its first line.
from os.path import join
from yajl import loads
from tfidf.idf import Idf
from tfidf.config import DATA_DIR

def tf_idf_by_zhihu():
    """Build the zhihu idf model from DATA_DIR/out.js and
    DATA_DIR/review.txt, writing the result to DATA_DIR/zhihu.idf."""
    infile = join(DATA_DIR, 'out.js')
    outfile = join(DATA_DIR, 'zhihu.idf')
    idf = Idf()

    # Feed every question title and every answer body into the model.
    with open(infile) as lib:
        for line in lib:
            l = loads(line)
            idf.append( l['title'] )
            for j in l['answer']:
                idf.append(j['answer'])

    # review.txt holds articles separated by ">->->" header lines;
    # accumulated lines are appended as one document per article.
    with open(join(DATA_DIR,"review.txt")) as review:
        result = []
        for line in review:
            line = line.strip()
            if not line:
                continue
            if line.startswith(">->->"):
                if result:
                    # NOTE(review): the tail of the *next* article's header
                    # is appended to the previous article's text — looks
                    # intentional (header carries metadata) but confirm.
                    line = line.split(" ",5)
                    result.append(line[-1])
                    txt = "\n".join(result)
                    idf.append(txt)
                result = []
            else:
                result.append(line)

    idf.tofile(outfile)

File wanfang/wanfang_arti.py

             writer.write(result['url'],out+'\n')
 
 def get_file_list():
-    with open(path.join(CURRNET_PATH,"article_list/article_list0")) as f:
+    with open(path.join(CURRNET_PATH,"article_list/article_list1")) as f:
         for url in f:
             url = url.strip()
             file_path = get_write_path(url)