Commits

wooparadog committed 793d960

f

Comments (0)

Files changed (9)

+import sys
+reload(sys)
+sys.setdefaultencoding('utf-8')
+from os.path import dirname, abspath, join
+PWD = dirname(abspath(__file__))
+sys.path.append(dirname(PWD))

marvin/item_vector.py

+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+
+import _env
+from tfidf.classification import GetTag
+from collections import defaultdict
+from zkit.tofromfile import tofile, fromfile
+from os import path
+
+class ItemVector(object):
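+    # Builds a normalized topic -> weight vector for every *.test file found under `path`.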
+    def __init__(self, path):
+        super(ItemVector, self).__init__()
+        self.path = path
+        self.item_vector_dict = defaultdict(dict)
+        self.tagger = GetTag()
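+        # tfidf.classification.GetTag is not part of this changeset; get_tag(txt)
+        # is assumed to return (topic, weight) pairs, highest weight first.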
+
+    def get_item_bayes(self, txt):
+        tags = self.tagger.get_tag(txt)
+        return tags
+
+    def get_item(self):
+        '''
+        TODO:To be rewritten
+        '''
+        from glob import glob
+        file_list = glob(path.join(self.path, '*.test'))
+        for f in file_list:
+            print f
+            yield f, open(f).read()
+
+    def compute_vector_dict(self):
+        for item_id, item in self.get_item():
+            if item.strip():
+                # keep at most 50 tags and normalize their weights to sum to 1
+                item_vector = list(self.get_item_bayes(item))[:50]
+                sum_v = sum(v for k, v in item_vector)
+                self.item_vector_dict[item_id] = dict((k, v / sum_v) for k, v in item_vector)
+
+    def get_item_vector(self):
+        if not self.item_vector_dict:
+            self.compute_vector_dict()
+        return self.item_vector_dict
+
+    def tofile(self, path):
+        if not self.item_vector_dict:
+            self.get_item_vector()
+        tofile(path, dict(self.item_vector_dict))
+
+    def fromfile(self, path):
+        self.item_vector_dict = fromfile(path)
+        return self.item_vector_dict
+
+def main():
+    a = ItemVector('test_items/')
+    a.tofile('test.d')
+
+if __name__ == '__main__':
+    main()

+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+from similarity import sim_cosine
+from user_vector import UserVector
+from item_vector import ItemVector
+from collections import defaultdict
+from yajl import dumps
+from zkit.pprint import pprint
+
+
+def find_sim(user_vector, item_vector_dict):
+    result = defaultdict(float)
+    for k, v in item_vector_dict.iteritems():
+        result[k] = sim_cosine(user_vector, v)
+
+    result = sorted(result.iteritems(), key=lambda x:x[1], reverse=True)[:10]
+    return result
+
+if __name__ == '__main__':
+    user_vector = UserVector(1000).fromfile('user_vectors/')
+    item_vector_dict = ItemVector('df').fromfile('test.d')
+    pprint(find_sim(user_vector,item_vector_dict))
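
similarity.py is not included in this changeset, so the shape of sim_cosine is an assumption: it is expected to take two topic -> weight dicts and return their cosine similarity. A minimal sketch under that assumption:

    def sim_cosine(a, b):
        # dot product over the topics the two vectors share
        dot = sum(a[k] * b[k] for k in a if k in b)
        norm_a = sum(v * v for v in a.itervalues()) ** 0.5
        norm_b = sum(v * v for v in b.itervalues()) ** 0.5
        if not norm_a or not norm_b:
            return 0.0
        return dot / (norm_a * norm_b)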

marvin/user_vector.py

+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import _env
+from tfidf.classification import GetTag
+from collections import defaultdict
+from zkit.tofromfile import tofile, fromfile
+from os import path
+
+class UserVector(object):
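+    # Accumulates tag weights over all of a user's items into a single
+    # normalized topic -> weight vector, persisted under <folder>/<user_id>.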
+    def __init__(self, user_id):
+        super(UserVector, self).__init__()
+        self.tagger = GetTag()
+        self.user_id = user_id
+        self.user_vector = defaultdict(float)
+
+    def get_item_bayes(self, txt):
+        tags = self.tagger.get_tag(txt)
+        return tags
+
+    def get_related_items(self):
+        from glob import glob
+        file_list = glob('10000000/*')
+        for f in file_list:
+            print f
+            yield open(f).read()
+
+    def get_user_vector(self):
+        if not self.user_vector:
+            self.compute_user_vector()
+        return self.user_vector
+
+    def compute_user_vector(self):
+        for item in self.get_related_items():
+            for topic, rank_t in self.get_item_bayes(item):
+                self.user_vector[topic] += rank_t
+
+        # keep the 50 highest-weighted topics, then normalize so the weights sum to 1
+        top = sorted(self.user_vector.items(), key=lambda x: x[1], reverse=True)[:50]
+        sum_v = sum(v for k, v in top)
+        self.user_vector = dict((k, v / sum_v) for k, v in top)
+
+    def path_builder(self,folder):
+        return path.join(folder,str(self.user_id))
+
+    def tofile(self,path):
+        if not self.user_vector:
+            self.get_user_vector()
+        path = self.path_builder(path)
+        tofile(path,self.user_vector)
+
+    def fromfile(self,path):
+        path = self.path_builder(path)
+        self.user_vector = fromfile(path)
+        return self.user_vector
+
+def main():
+    a = UserVector(1000)
+    a.tofile('user_vectors')
+
+if __name__ == '__main__':
+    main()

spider/__init__.py

+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import sys
+reload(sys)
+sys.setdefaultencoding('utf-8')
+from os.path import dirname, abspath
+PWD = dirname(dirname(abspath(__file__)))
+sys.path.append(dirname(PWD))
+
+from rolling import Rolling
+from urlfetch import Fetch, NoCacheFetch
+from gcrawler import GCrawler

spider/gcrawler.py

+# -*- coding: utf-8 -*-
+"""
+    gevent crawler
+    ~~~~~~~~~~~~~~~~
+    BSD License
+    2011 by raptor.zh@gmail.com.
+"""
+import _env
+import gevent
+from gevent import monkey, queue
+
+monkey.patch_all()
+
+import urllib2
+from time import sleep
+import os
+from hashlib import md5
+import urlparse
+from collections import  defaultdict
+import os.path as path
+import traceback
+
+
+CURRENT_PATH = path.dirname(path.abspath(__file__))
+
+class GCrawler(object):
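+    # Spawns `workers_count` greenlets; each one iterates spider.scheduler()
+    # and passes every item it gets to spider.worker(), logging failures.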
+    def __init__(self, spider,  workers_count=8):
+        self.spider = spider
+        self.jobs = [gevent.spawn(self._work) for i in range(workers_count)]
+        self.job_count = len(self.jobs)
+
+    def start(self):
+        gevent.joinall(self.jobs)
+
+    def _work(self):
+        scheduler = self.spider.scheduler()
+        try:
+            for item in scheduler:
+                #print item,"!!item"
+                try:
+                    self.spider.worker(item)
+                except Exception, e:
+                    print('Error on get %s:%s\n%s' % (item[1], e, traceback.format_exc()))
+        finally:
+            self.job_count -= 1
+            print("Worker done, job count: %s" % self.job_count)
+
+
+def main():
+    pass
+
+if __name__ == '__main__':
+    main()

spider/rolling.py

+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+
+import _env
+from gcrawler import GCrawler 
+from collections import  defaultdict
+import os
+from gevent import monkey, queue
+import Queue
+from urlfetch import Fetch, fetch
+
+class Rolling(object):
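+    # Spider for GCrawler: scheduler() yields (callback, url, args, kwds) tuples,
+    # worker() fetches the url and re-queues any follow-up tuples the callback returns.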
+    def __init__(self, fetch , url_iter=()):
+        self.subitems = queue.Queue(-1)
+        self.url_iter = iter(url_iter)
+        self.fetch = fetch
+
+    def scheduler(self):
+        subitems = self.subitems
+
+        for item in self.url_iter:
+            self.push(*item)
+            yield subitems.get(timeout=60)
+
+        while True:
+            try:
+                item = subitems.get(timeout=60)
+            except Queue.Empty:
+                return
+            else:
+                yield item
+
+    def push(self, callback, url, *args, **kwds):
+        self.subitems.put((callback, url, args, kwds))
+
+    def worker(self, item):
+        callback, url, args, kwds = item
+        r = self.fetch(url)
+        if r is not None:
+            new_items = callback(r, url, *args, **kwds)
+            # a callback may return further (callback, url, ...) tuples to crawl
+            if new_items is not None:
+                for new_item in new_items:
+                    self.push(*new_item)
+        return r
+
+
+
+def main():
+    def callback(html, url):
+        print  url
+
+    spider = Rolling(
+        fetch,
+        (
+            (callback, 'http://www.baidu.com'),
+        )
+    )
+    crawler = GCrawler(spider, workers_count=10)
+    crawler.start()
+
+if __name__ == '__main__':
+    main()

spider/urlfetch.py

+import os
+from os import path
+from hashlib import md5
+import urllib2
+import urlparse
+
+def retryOnURLError(trycnt=3):
+    # decorator factory: retry the wrapped call up to `trycnt` times on urllib2.URLError
+    def funcwrapper(fn):
+        def wrapper(*args, **kwargs):
+            for i in range(trycnt):
+                try:
+                    return fn(*args, **kwargs)
+                except urllib2.URLError, e:
+                    #logger.info('retry %s time(s)' % (i+1))
+                    if i == trycnt - 1:
+                        raise e
+        return wrapper
+    return funcwrapper
+
+class Fetch(object):
+    def __init__(self, cache, headers={}):
+        self.cache = cache
+        self.headers = headers
+
+    def cache_path(self, url):
+        # one cache file per url: <cache>/<hostname>/<md5 of url>
+        cache_dir = path.join(
+            self.cache, urlparse.urlparse(url).hostname
+        )
+        if not path.exists(cache_dir):
+            os.makedirs(cache_dir)
+        return path.join(cache_dir, md5(url).hexdigest())
+
+    def cache_get(self, url):
+        file_path = self.cache_path(url)
+        if path.exists(file_path):
+            with open(file_path) as f:
+                #logger.debug('Using Cache ...%s' % url)
+                return f.read()
+
+    def read(self, url):
+        conn = urllib2.urlopen(url, timeout=30)
+        data = conn.read()
+        conn.close()
+        return data
+
+    @retryOnURLError(3)
+    def __call__(self, url):
+        data = self.cache_get(url)
+        if data is None:
+            data = self.read(url)
+            with open(self.cache_path(url), 'w') as f:
+                f.write(data)
+        return data
+
+class NoCacheFetch(object):
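+    # Same callable interface as Fetch, but every call goes straight to the network.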
+    def __init__(self, headers={}):
+        self.headers = headers
+
+    def read(self, url):
+        print "reading url",url
+        conn = urllib2.urlopen(url, timeout=30)
+        data = conn.read()
+        conn.close()
+        return data
+
+    @retryOnURLError(3)
+    def __call__(self, url):
+        data  = self.read(url)
+        return data
+
+CURRENT_PATH = path.dirname(path.abspath(__file__))
+fetch = Fetch(path.join(CURRENT_PATH, "cache"))
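
The main() in rolling.py only fetches a single page; worker() also re-queues any (callback, url, ...) tuples a callback returns, which is how follow-up crawling would work. A minimal sketch, where crawl_page and the href regex are hypothetical and not part of this commit:

    import re
    from rolling import Rolling
    from urlfetch import NoCacheFetch
    from gcrawler import GCrawler

    LINK_RE = re.compile(r'href="(http[^"]+)"')

    def crawl_page(html, url, depth=0):
        print url, len(html)
        if depth < 1:
            # schedule a handful of outgoing links, one level deep
            return [(crawl_page, link, depth + 1) for link in LINK_RE.findall(html)[:5]]

    spider = Rolling(NoCacheFetch(), ((crawl_page, 'http://www.baidu.com', 0),))
    GCrawler(spider, workers_count=4).start()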