Commits

yang xiaoyong committed 89f2186 Draft Merge

merge

Comments (0)

Files changed (44)

py/compare_date.py

+import datetime
+
+dates = ['2012-05-30',
+         '2012-05-16',
+         '2012-05-13',
+         '2012-04-29',
+         '2012-04-28',
+         '2012-05-12',
+         '2012-04-27',
+         '2012-04-26',
+         '2012-06-03',
+         '2012-06-01',]
+
+day_fmt =  '%Y-%m-%d'
+
+t = ('2012-05-13', '2012-06-01')
+
+dates = [datetime.datetime.strptime(d, day_fmt) for d in dates]
+dt = [datetime.datetime.strptime(d, day_fmt) for d in t]
+
+for date in dates:
+    if date >= dt[1]:
+        print date
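
Note: dt[0] (the lower bound taken from t) is parsed but never used; only dt[1] is compared. If an inclusive range check was intended, a minimal sketch (not part of the commit) would be:

# Sketch, assuming an inclusive range between the two endpoints of t was intended:
for d in [d for d in dates if dt[0] <= d <= dt[1]]:
    print d.strftime(day_fmt)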

py/crawler/__init__.py

Empty file added.

py/crawler/crawler/__init__.py

Empty file added.

py/crawler/crawler/backup_db.sh

+#!/bin/sh
+
+DST="/data/backups"
+
+cp data.db $DST/$(date +%Y-%m-%d)-data.db

py/crawler/crawler/init_db.py

+import os
+import sqlite3
+
+db = 'data.db'
+def create_table(table_name):
+    #if os.path.exists(db):
+        #print 'db already was initialized.'
+        #return
+    conn = sqlite3.connect(db)
+    conn.execute("""CREATE TABLE IF NOT EXISTS %s (
+         url TEXT,
+         title TEXT,
+         content TEXT,
+         image_urls TEXT,
+         category TEXT,
+         download_urls TEXT)""" % table_name)
+    conn.commit()
+    conn.close()
+
+def add_column(tbl, cl):
+    conn = sqlite3.connect(db)
+    conn.execute("""ALTER table %s add column %s TEXT;""" % (tbl, cl))
+    conn.commit()
+    conn.close()
+
+# domains = ['gamersky', 'dmgame', 'tencent', 'sina', 'duowan',
+domains = ['dmgame', 'tencent', 'sina', 'duowan',
+           'sgamer', 'a9vg', 'g178', 'net163', 'youba', 'pcgames',
+           'verycd', 'g91', 'd_cn']
+
+domains = ['ali213']
+for d in domains:
+    create_table(d)
+    # add_column(d, 'download_urls')
+    #create_table(d)
+
+
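
Note: sqlite3 placeholders ("?") can only bind values, not identifiers, which is why create_table() and add_column() splice the table/column names in with %. A small usage sketch against one of the created tables, assuming the script above has already been run (the inserted row is made up):

conn = sqlite3.connect(db)
conn.execute("INSERT INTO ali213 (url, title) VALUES (?, ?)",
             ('http://example.com/post/1', 'demo title'))
print conn.execute("SELECT COUNT(*) FROM ali213").fetchone()[0]
conn.rollback()   # discard the demo row
conn.close()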

py/crawler/crawler/items.py

+# Define here the models for your scraped items
+#
+# See documentation in:
+# http://doc.scrapy.org/topics/items.html
+
+from scrapy.item import Item, Field
+
+class ImageItem(Item):
+    url = Field()
+    image_urls = Field()
+    images = Field()
+
+class CrawlerItem(Item):
+    table_name = Field()
+    url = Field()
+    title = Field()
+    content = Field()
+    image_urls = Field()
+    images = Field()
+    category = Field()
+    download_urls = Field()
+
+class DownloadItem(Item):
+    table_name = Field()
+    url = Field()
+    title = Field()
+    content = Field()
+    image_urls = Field()
+    images = Field()
+    category = Field()
+    download_urls = Field()
+
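
Note: CrawlerItem and DownloadItem declare exactly the same fields; the split appears to exist only so pipelines can tell them apart with isinstance(). A quick usage sketch (values are made up):

item = CrawlerItem(table_name='gamersky', url='http://example.com/news/1.shtml')
item['title'] = [u'demo title']
print dict(item)    # Scrapy items behave like dicts limited to their declared fields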

py/crawler/crawler/pipelines.py

+# -*- encoding:utf-8 -*-
+
+# Define your item pipelines here
+#
+# Don't forget to add your pipeline to the ITEM_PIPELINES setting
+# See: http://doc.scrapy.org/topics/item-pipeline.html
+import os
+import hashlib
+import sqlite3
+
+from scrapy.contrib.pipeline.images import ImagesPipeline
+from scrapy import signals
+from scrapy.xlib.pydispatch import dispatcher
+
+from utils import get_pathname_from_url, writefile, mkdir
+from items import CrawlerItem
+
+def unicode_to_str(raw, enc='utf8', sep='|'):
+    '''
+    If raw is an iterable (e.g. a list), its elements must be unicode.
+    '''
+    if hasattr(raw, '__iter__'):
+        return sep.join([i.encode(enc) for i in raw])
+
+    elif isinstance(raw, unicode):
+        return raw.encode(enc)
+    elif isinstance(raw, str):
+        return raw
+    else:
+        raise TypeError('Unsupported type: %r <%r>' % (raw, type(raw)))
+
+class CrawlerPipeline(object):
+    filename = 'data.db'
+    def __init__(self):
+        self.conn = None
+        dispatcher.connect(self.initialize, signals.engine_started)
+        dispatcher.connect(self.finalize, signals.engine_stopped)
+
+    def process_item(self, item, spider):
+        d = {}
+        for k, v in item.items():
+            sep = '|'
+            if k == 'content':
+                sep = '\n'
+            d[k] = unicode_to_str(v, sep=sep)
+
+        values = (d['url'], d['title'], d['content'], d['image_urls'], d['category'], d['download_urls'])
+        self.conn.execute('INSERT INTO %s values (?, ?, ?, ?, ?, ?)' % d['table_name'], values)
+
+        self.conn.commit()
+        # For now, skip writing items out to the filesystem
+        #if isinstance(item, CrawlerItem):
+            #fn = '/data/crawl/sites/' + get_pathname_from_url(item['url'])
+            #mkdir(os.path.dirname(fn))
+            ## content = '\n'.join([e.encode('utf8') for e in item['content']])
+            #writefile(fn, d['content'])
+        return item
+
+
+    def initialize(self):
+        self.conn = sqlite3.connect(self.filename)
+        self.conn.text_factory = str
+
+    def finalize(self):
+        if self.conn is not None:
+            self.conn.commit()
+            self.conn.close()
+            self.conn = None
+
+#class CrawlerPipeline(object):
+    #def process_item(self, item, spider):
+        #if isinstance(item, CrawlerItem):
+            #fn = os.path.join('dat', get_pathname_from_url(item['url']))
+            #fd = os.path.dirname(fn)
+            #mkdir(fd)
+            #content = '\n'.join([e.encode('utf8') for e in item['content']])
+            #writefile(fn, content)
+
+        #return item
+
+class MyImagePipeline(ImagesPipeline):
+    def _get_hash_key(self, s):
+        v = hashlib.sha1(s).hexdigest()
+        return (v[:2], v[2:4], v[4:])
+
+    def image_key(self, url):
+        fp = os.sep.join(self._get_hash_key(url))
+        path = 'full/%s.jpg' % fp
+        return path
+
+    def thumb_key(self, url, thumb_id):
+        fp = os.sep.join(self._get_hash_key(url))
+        path = 'thumbs/%s/%s.jpg' % (thumb_id, fp)
+        return path
+
+
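
Note: unicode_to_str() flattens whatever the spiders put into the item fields into plain byte strings before the SQLite INSERT. A behaviour sketch (inputs are made up):

print unicode_to_str([u'a', u'b'])             # 'a|b'  - iterables are joined with sep
print unicode_to_str([u'p1', u'p2'], sep='\n') # one element per line (used for 'content')
print unicode_to_str(u'\u6807\u9898')          # unicode -> UTF-8 encoded str
print unicode_to_str('already-bytes')          # plain str passes through unchanged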

py/crawler/crawler/restruct.py

+#!/usr/bin/env python
+# -*- encoding:utf-8 -*-
+import os
+import shutil
+
+DST = '/data/image/full'
+
+def get_file(path):
+    for root, dirs, files in os.walk(path):
+        for fn in files:
+            yield os.path.join(root, fn)
+
+def copyfile(src, dst):
+    shutil.copyfile(src, dst)
+
+def mkdir(path):
+    if not os.path.exists(path):
+        os.makedirs(path)
+
+def split(v):
+    return (v[:2], v[2:4], v[4:])
+
+
+def main():
+    for fn in get_file('.'):
+        if os.path.basename(fn) == __file__:
+            continue
+        sha = fn[2:].replace('/', '')
+        dst = DST + '/' + '/'.join(split(sha))
+        mkdir(os.path.dirname(dst))
+        print 'copy %s to %s' % (fn, dst)
+        copyfile(fn, dst)
+
+if __name__ == '__main__':
+    main()
+
+
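
Note: restruct.py rebuilds the two-level hash layout that MyImagePipeline._get_hash_key() produces (first two hex pairs of the SHA-1, then the rest). A quick sketch of the mapping, using an example SHA-1 string:

sha = '3a7bd3e2360a3d29eea436fcfb7e44c735d117c4'
print '/'.join(split(sha))
# -> 3a/7b/d3e2360a3d29eea436fcfb7e44c735d117c4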

py/crawler/crawler/scripts/download_img.py

+import os
+import hashlib
+import time
+import sqlite3
+from urllib2 import Request, urlopen
+
+from selenium import webdriver
+
+conn = sqlite3.connect('../data.db')
+cur = conn.cursor()
+
+# level 2, step 2 hash directory
+step = 2
+level = 2
+USER_AGENT = ('Mozilla/5.0 X11 Linux x86_64 AppleWebKit/535.19 KHTML, '
+              'like Gecko Ubuntu/12.04 Chromium/18.0.1025.151 Chrome/1'
+              '8.0.1025.151 Safari/535.19')
+
+def permu(li):
+    ret = []
+    for i, e in enumerate(li):
+        if i + 1 >= len(li):
+            break
+        ret.append([e, li[i+1]])
+    return ret
+
+indexs = permu(range(0, 50, step))[:level]
+
+def mkdir(path):
+    if not os.path.exists(path):
+        os.makedirs(path)
+
+def get_path_by_url(url):
+    '''hash url to get path'''
+    sha = hashlib.sha1(url).hexdigest()
+    fs = [sha[i[0]:i[1]] for i in indexs]
+    fs.append(sha[level*step:])
+    fs = os.sep.join(fs)
+    ext_index = url.rfind('.')
+    ext = url[ext_index:]
+    if not ext.lower() in ['.jpg', '.gif', '.png']:
+        print 'invalid url', url
+        return
+    fs += ext
+    return fs
+
+def crawl_img(r, fp):
+    try:
+        resp = urlopen(r)
+    except Exception, e:
+        print '%s --> %s' % (r.get_full_url(), e)
+        return None
+    size = 512 * 1024
+    with open(fp, 'wb') as fb:
+        while True:
+            chunk = resp.read(size)
+            if not chunk: break
+            fb.write(chunk)
+    print '[done] %s saved to %s' % (r.get_full_url(), fp)
+
+size_map = {'200x200': '550x550',
+            '116x86': '980x1200'}
+small_size = '200x200'
+big_size = '550x550'
+# small_pat = re.compile('.+%s' % small_size)
+
+def main(url):
+    headers = {'Referer': url, 'User-Agent': USER_AGENT}
+    driver = webdriver.Firefox(timeout=15)
+    driver.get(url)
+    time.sleep(8)
+    img_links = [t.get_attribute('src') for t in driver.find_elements_by_tag_name('img')]
+    if not img_links:
+        return
+    sql = 'UPDATE tencent SET image_urls="%s" WHERE url="%s"' % ('|'.join(img_links), url)
+    cur.execute(sql)
+    conn.commit()
+    driver.quit()
+    for small_size in size_map:
+        img_links.extend([l.replace(small_size, size_map[small_size]) for l in img_links if small_size in l])
+    requests = [Request(u, headers=headers) for u in img_links]
+
+    for req in requests:
+        fp = get_path_by_url(req.get_full_url())
+        if not fp: continue
+        fp = '/data/crawl/image/full/' + fp
+        if os.path.exists(fp):
+            continue
+        mkdir(os.path.dirname(fp))
+        crawl_img(req, fp)
+
+if __name__ == '__main__':
+    cur.execute('SELECT url, `image_urls` from tencent')
+    urls = [l[0] for l in cur.fetchall() if not l[1]]
+    for l in urls:
+        main(l)
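
Note: with step = 2 and level = 2, permu(range(0, 50, 2))[:2] is [[0, 2], [2, 4]], so get_path_by_url() shards images by the first four hex digits of the URL's SHA-1, mirroring MyImagePipeline._get_hash_key(). A sketch with a made-up image URL:

url = 'http://img.example.com/pic/12345.jpg'
print get_path_by_url(url)
# -> '<sha1[0:2]>/<sha1[2:4]>/<sha1[4:]>.jpg', which main() then prefixes with
#    /data/crawl/image/full/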

py/crawler/crawler/settings.py

+# Scrapy settings for crawler project
+#
+# For simplicity, this file contains only the most important settings by
+# default. All the other settings are documented here:
+#
+#     http://doc.scrapy.org/topics/settings.html
+#
+
+BOT_NAME = 'crawler'
+BOT_VERSION = '1.0'
+
+SPIDER_MODULES = ['crawler.spiders']
+NEWSPIDER_MODULE = 'crawler.spiders'
+# USER_AGENT = '%s/%s' % (BOT_NAME, BOT_VERSION)
+
+USER_AGENT = 'Mozilla/5.0 X11 Linux x86_64 AppleWebKit/535.19 KHTML, like Gecko Ubuntu/12.04 Chromium/18.0.1025.151 Chrome/18.0.1025.151 Safari/535.19'
+
+ITEM_PIPELINES = ['crawler.pipelines.CrawlerPipeline',
+                  'crawler.pipelines.MyImagePipeline'
+                  ]
+
+SPIDER_MIDDLEWARES = {'scrapy.contrib.spidermiddleware.referer.RefererMiddleware': 10}
+IMAGES_STORE = '/data/crawl/image'
+
+SITES = {'duowan': {'rules': [(r'com/1206/\d+\.html', 'parse_item')],
+                    'start_urls': ['http://pc.duowan.com/tag/184669959747.html'],
+                    'allowed_domains': ['duowan.com'],
+                    'table_name': 'duowan',
+                    },
+         'qq': {},
+         }
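
Note: SITES is a project-specific setting; duowan.py (below) reads it through scrapy.conf.settings and derives its start_urls, allowed_domains, table name and crawl rules from its entry. A sketch (not part of the commit) of what an additional entry could look like; the site name, URL and pattern are placeholders:

SITES['example'] = {
    'rules': [(r'example\.com/news/\d+\.html', 'parse_item')],
    'start_urls': ['http://www.example.com/news/'],
    'allowed_domains': ['example.com'],
    'table_name': 'example',
}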

py/crawler/crawler/spiders/__init__.py

+# This package will contain the spiders of your Scrapy project
+#
+# Please refer to the documentation for information on how to create and manage
+# your spiders.

py/crawler/crawler/spiders/a3dmgame.py

+# -*- encoding:utf-8 -*-
+import re
+
+from scrapy.selector import HtmlXPathSelector
+from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
+from scrapy.contrib.spiders import CrawlSpider, Rule
+from crawler.items import CrawlerItem
+
+date = '201206'
+thunder_pat = re.compile(r'href=["\'](thunder://.+?)["\']\s*')
+
+class A3dmSpider(CrawlSpider):
+    name = 'a3dmgame'
+    allowed_domains = ['3dmgame.com']
+    start_urls = [
+                  'http://www.3dmgame.com/news/',
+                  'http://www.3dmgame.com/preview/',
+                  'http://www.3dmgame.com/review/',
+                  'http://dl.3dmgame.com/SoftList_18.html',
+                  ]
+
+    rules = (
+        Rule(SgmlLinkExtractor(allow=r'com/%s/\d+(_\d+)?\.html' % date), callback='parse_soft', follow=True),
+        Rule(SgmlLinkExtractor(allow=r'news/%s/\d+(_\d+)?\.html' % date), callback='parse_news', follow=True),
+        Rule(SgmlLinkExtractor(allow=r'preview/%s/\d+(_\d+)?\.html' % date), callback='parse_news', follow=True),
+        Rule(SgmlLinkExtractor(allow=r'review/%s/\d+(_\d+)?\.html' % date), callback='parse_news', follow=True),
+    )
+
+    def _parse_rel_link(self, url, link):
+        i = url.rfind('/')
+        par_url = url[:i]
+        if link.startswith('..'):
+            link = '/'.join([par_url, link])
+        elif link.startswith('/'):
+            link = par_url + link
+        return link
+
+    def parse_news(self, response):
+        hxs = HtmlXPathSelector(response)
+
+        i = CrawlerItem()
+        i['table_name'] = self.name
+        i['url'] = response.url
+        i['title'] = hxs.select('//h1/text()').extract()
+        i['image_urls'] = [self._parse_rel_link(response.url.strip(), u.strip()) for u in hxs.select('//img/@src').extract()
+                          if u.strip()[-4:].lower() in ['.jpg', '.png', '.gif']] # TODO .jpeg support
+        i['category'] = hxs.select('//div[@id="Location"]/div[@class="LoTitle"]/text()').extract()[1]
+        i['content'] = hxs.select('//div[@class="DeContent"]').re('.+')
+        i['download_urls'] = ''
+        return i
+
+    def parse_soft(self, response):
+        hxs = HtmlXPathSelector(response)
+
+        i = CrawlerItem()
+        i['table_name'] = self.name
+        i['url'] = response.url
+        i['title'] = hxs.select('//h1/text()').extract()
+        i['image_urls'] = [self._parse_rel_link(response.url.strip(), u.strip()) for u in hxs.select('//img/@src').extract()]
+        i['content'] = hxs.select('//div[@class="gameContent"]').re('.+')
+        i['category'] = hxs.select('//div[@class="gameContent"]/div[@class="jbContentBOttom"]/dl/ul/li[3]/dd/div[@class="wenziRight"]/text()').extract()[0].strip()
+        i['download_urls'] = [thunder_pat.search(t).group(1) for t in hxs.select('//div[@class="xiazaiList"]/a').extract()
+                              if thunder_pat.search(t)]
+
+        return i

py/crawler/crawler/spiders/duowan.py

+from scrapy.selector import HtmlXPathSelector
+from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
+from scrapy.contrib.spiders import CrawlSpider, Rule
+from scrapy.conf import settings
+from crawler.items import CrawlerItem
+
+site_name = 'duowan'
+site_conf = settings['SITES'][site_name]
+
+class DuowanSpider(CrawlSpider):
+    name = site_name
+    allowed_domains = ['duowan.com']
+    start_urls = site_conf['start_urls']
+
+    rules = [Rule(SgmlLinkExtractor(allow=regex), callback=func, follow=True) for regex, func in site_conf['rules']]
+
+    def parse_item(self, response):
+        hxs = HtmlXPathSelector(response)
+        i = CrawlerItem()
+        i['table_name'] = site_conf['table_name']
+        i['url'] = response.url
+        i['title'] = hxs.select('//h1/text()').extract()
+        i['content'] = hxs.select('//div[@id="text"]/p/text()').extract()
+        i['image_urls'] = ''
+        i['category'] = hxs.select('//div[@class="mod-crumb"]/a/text()').extract()
+        i['download_urls'] = ''
+
+        return i

py/crawler/crawler/spiders/gamersky.py

+# -*- encoding:utf-8 -*-
+
+from scrapy.selector import HtmlXPathSelector
+from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
+from scrapy.contrib.spiders import CrawlSpider, Rule
+from crawler.items import CrawlerItem
+
+class GamerskySpider(CrawlSpider):
+    name = 'gamersky'
+    allowed_domains = ['gamersky.com']
+    start_urls = ['http://www.gamersky.com/news/pc/zx/',     # latest news
+                  'http://www.gamersky.com/news/pc/qz/',
+                  'http://www.gamersky.com/handbook/pc/gl/', # guides
+                  'http://www.gamersky.com/news/pc/dp/',     # reviews
+                  'http://www.gamersky.com/news/tv/zx/',
+                  ]
+
+    rules = (
+        Rule(SgmlLinkExtractor(allow=r'news/201206/\d+(_\d+)?\.shtml'), callback='parse_article', follow=True),
+        # TODO: crawl the full-size images
+        # Rule(SgmlLinkExtractor(allow=r'.+\.(jpg|png|gif)$'), callback='parse_img', follow=False),
+    )
+
+    def parse_img(self, response):
+        # hxs = HtmlXPathSelector(response)
+        i = CrawlerItem()
+        i['image_urls'] = response.url# hxs('//div[@id="gspaging"]/p/a/@href').extract()
+        return i
+
+    def parse_article(self, response):
+        hxs = HtmlXPathSelector(response)
+
+        i = CrawlerItem()
+
+        i['table_name'] = self.name
+        i['url'] = response.url
+        i['image_urls'] = hxs.select('//a/img/@src').extract()
+        i['title'] = hxs.select('//h1/b/text()').extract()
+        i['category'] = hxs.select('//div[@class="tit1 mid"]/a/text()').extract()
+        i['content'] = hxs.select('//div[@id="gspaging"]').re(r'(.+)')
+
+        return i

py/crawler/crawler/spiders/gamersky_dw.py

+# -*- encoding:utf-8 -*-
+
+from scrapy.selector import HtmlXPathSelector
+from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
+from scrapy.contrib.spiders import CrawlSpider, Rule
+from crawler.items import CrawlerItem, DownloadItem
+
+class GamerskySpider(CrawlSpider):
+    name = 'dw'
+    allowed_domains = ['gamersky.com']
+    start_urls = ['http://www.gamersky.com/Soft/te/',
+                  ]
+
+    rules = (
+        Rule(SgmlLinkExtractor(allow=r'Soft/201206/\d+(_\d+)?\.shtml'), callback='parse_article', follow=True),
+    )
+
+    def parse_article(self, response):
+        hxs = HtmlXPathSelector(response)
+
+        i = DownloadItem()
+
+        i['table_name'] = 'gamersky_dw'
+        i['url'] = response.url
+        i['image_urls'] = hxs.select('//a/img/@src').extract()
+        i['title'] = hxs.select('//h1/b/text()').extract()
+        i['category'] = hxs.select('//div[@class="tit1 mid"]/a/text()').extract()
+        i['content'] = hxs.select('//div[@class="actdl"]').re('.+')
+        i['download_urls'] = hxs.select('//div[@class="dvurl1"]/p/a/@href').extract()
+
+        return i

py/crawler/crawler/spiders/qq.py

+from scrapy.selector import HtmlXPathSelector
+from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
+from scrapy.contrib.spiders import CrawlSpider, Rule
+from crawler.items import CrawlerItem
+
+class QqSpider(CrawlSpider):
+    name = 'qq'
+    allowed_domains = ['qq.com']
+    start_urls = ['http://games.qq.com/pcgame/05pchotnews/list2009124183318.htm',
+                  'http://games.qq.com/l/tvgame/07zxzx/list2010098184725.htm',
+                  ]
+
+    rules = (
+        Rule(SgmlLinkExtractor(allow=r'a/201206\d+/\d+\.htm'), callback='parse_item', follow=True),
+    )
+
+    def parse_item(self, response):
+        hxs = HtmlXPathSelector(response)
+        i = CrawlerItem()
+        i['table_name'] = 'tencent'
+        i['title'] = hxs.select('//h1/text()').extract()
+        i['content'] = hxs.select('//div[@bosszone="content"]/p/text()').extract()
+        i['url'] = response.url
+        i['image_urls'] = ''
+        i['category'] = hxs.select('//span[@bosszone="crumbNav"]/a/text()').extract()
+        i['download_urls'] = ''
+
+        return i

py/crawler/crawler/spiders/test.py

+from scrapy.spider import BaseSpider
+
+class TestSpider(BaseSpider):
+    name = "test"
+    allowed_domains = ["google.com"]
+    start_urls = (
+        'http://www.google.com/',
+        'http://www.baidu.com/',
+        'http://www.bing.com/',
+        )
+
+    def parse(self, response):
+        self.log('A response from %s just arrived!' % response.url)

py/crawler/crawler/test.sh

+#!/bin/sh
+
+scrapy crawl gamersky
+# scrapy crawl 3dmgame

py/crawler/crawler/utils.py

+import os
+from urlparse import urlparse
+
+
+def writefile(fn, content):
+    with open(fn, 'wb') as fb:
+        fb.write(content)
+
+def mkdir(path):
+    if not os.path.exists(path):
+        os.makedirs(path)
+
+def get_pathname_from_url(url):
+    u = urlparse(url)
+    fn = '/'.join([u.netloc, u.path])
+    #mkdir(os.path.dirname(fn))
+    return fn
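
Note: get_pathname_from_url() keys files by host plus path; because u.path already starts with '/', the join produces a double slash. A behaviour sketch using a URL that appears elsewhere in this commit:

print get_pathname_from_url('http://www.gamersky.com/news/201206/203393.shtml')
# -> 'www.gamersky.com//news/201206/203393.shtml' (u.path keeps its leading '/')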

py/crawler/items.py

+# Define here the models for your scraped items
+#
+# See documentation in:
+# http://doc.scrapy.org/topics/items.html
+
+from scrapy.item import Item, Field
+
+class CrawlerItem(Item):
+    url = Field()
+    title = Field()
+    category = Field()
+    content = Field()
+
+class ImageItem(Item):
+    url = Field()
+    image_urls = Field()
+    images = Field()
+
+class GamerskyItem(Item):
+    url = Field()
+    title = Field()
+    content = Field()
+    image_urls = Field()
+    images = Field()
+    category = Field()
+

py/crawler/pipelines.py

+# Define your item pipelines here
+#
+# Don't forget to add your pipeline to the ITEM_PIPELINES setting
+# See: http://doc.scrapy.org/topics/item-pipeline.html
+import os
+
+from scrapy.contrib.pipeline.images import ImagesPipeline
+
+from utils import get_pathname_from_url, writefile, mkdir
+from items import CrawlerItem, ImageItem
+
+class CrawlerPipeline(object):
+    def process_item(self, item, spider):
+        if isinstance(item, CrawlerItem):
+            fn = os.path.join('dat', get_pathname_from_url(item['url']))
+            fd = os.path.dirname(fn)
+            mkdir(fd)
+            writefile(fn, '\n'.join([e.encode('utf8') for e in item['content']]))
+
+        return item
+
+class MyImagePipeline(ImagesPipeline):
+    def image_key(self, url):
+        return get_pathname_from_url(url)
+

py/crawler/scrapy.cfg

+# Automatically created by: scrapy startproject
+#
+# For more information about the [deploy] section see:
+# http://doc.scrapy.org/topics/scrapyd.html
+
+[settings]
+default = crawler.settings
+
+[deploy]
+#url = http://localhost:6800/
+project = crawler

py/crawler/settings.py

+# Scrapy settings for crawler project
+#
+# For simplicity, this file contains only the most important settings by
+# default. All the other settings are documented here:
+#
+#     http://doc.scrapy.org/topics/settings.html
+#
+
+BOT_NAME = 'crawler'
+BOT_VERSION = '1.0'
+
+SPIDER_MODULES = ['crawler.spiders']
+NEWSPIDER_MODULE = 'crawler.spiders'
+# USER_AGENT = '%s/%s' % (BOT_NAME, BOT_VERSION)
+
+USER_AGENT = 'Mozilla/5.0 X11 Linux x86_64 AppleWebKit/535.19 KHTML, like Gecko Ubuntu/12.04 Chromium/18.0.1025.151 Chrome/18.0.1025.151 Safari/535.19'
+
+ITEM_PIPELINES = ['crawler.pipelines.CrawlerPipeline',
+                  'crawler.pipelines.MyImagePipeline']
+
+IMAGES_STORE = 'dat/'

py/crawler/spiders/__init__.py

+# This package will contain the spiders of your Scrapy project
+#
+# Please refer to the documentation for information on how to create and manage
+# your spiders.

py/crawler/spiders/gamersky.py

+from scrapy.selector import HtmlXPathSelector
+from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
+from scrapy.contrib.spiders import CrawlSpider, Rule
+from crawler.items import GamerskyItem, ImageItem
+
+class GamerskySpider(CrawlSpider):
+    name = 'gamersky'
+    allowed_domains = ['gamersky.com']
+    start_urls = ['http://www.gamersky.com/news/pc/zx/',
+                  'http://www.gamersky.com/news/pc/qz/',
+                  'http://www.gamersky.com/news/tv/zx/']
+    # start_urls = ['http://www.gamersky.com/news/201206/203393.shtml']
+
+    rules = (
+        Rule(SgmlLinkExtractor(allow=r'news/201206/\d+(_\d+)?\.shtml'), callback='parse_article', follow=True),
+    )
+
+    def parse_article(self, response):
+        hxs = HtmlXPathSelector(response)
+
+        i = GamerskyItem()
+        #im = ImageItem()
+        i['url'] = response.url
+        #im['image_urls'] = hxs.select('//a/img/@src').extract()
+        i['image_urls'] = hxs.select('//a/img/@src').extract()
+        i['title'] = hxs.select('//h1/b/text()').extract()
+        i['category'] = hxs.select('//div[@class="tit1 mid"]/a/text()').extract()
+        i['content'] = hxs.select('//div[@id="gspaging"]').re(r'(.+)')
+
+        return i

py/crawler/spiders/test.py

+from scrapy.spider import BaseSpider
+
+class TestSpider(BaseSpider):
+    name = "test"
+    allowed_domains = ["google.com"]
+    start_urls = (
+        'http://www.google.com/',
+        'http://www.baidu.com/',
+        'http://www.bing.com/',
+        )
+
+    def parse(self, response):
+        self.log('A response from %s just arrived!' % response.url)

py/crawler/utils.py

+import os
+from urlparse import urlparse
+
+
+def writefile(fn, content):
+    with open(fn, 'wb') as fb:
+        fb.write(content)
+
+def mkdir(path):
+    if not os.path.exists(path):
+        os.makedirs(path)
+
+def get_pathname_from_url(url):
+    u = urlparse(url)
+    fn = '/'.join([u.netloc, u.path])
+    mkdir(os.path.dirname(fn))
+    return fn
+ #-*- encoding:utf-8 -*-
+
+from __future__ import with_statement
+
+import os
+
+from hashlib import md5
+
+def get_relpath(path):
+    '''
+    Strip the first path component; the result depends on os.sep.
+
+    On POSIX (os.sep == '/'):
+    >>> get_relpath('a/b/c')
+    'b/c'
+    >>> get_relpath('a')
+    'a'
+
+    On Windows (os.sep == '\\'):
+    >>> get_relpath('c:\\a\\b\\c')
+    'a\\b\\c'
+    '''
+    i = path.find(os.sep) + 1
+    if i == 0:
+        return path
+    return path[i:]
+
+def gen_file_md5(fp, chunk=2*2048):
+    m = md5()
+    with open(fp, 'rb') as fb:
+        while True:
+            content = fb.read(chunk)
+            if not content:
+                break
+            m.update(content)
+    return m.hexdigest()
+
+def get_files(path):
+    for root, dirs, files in os.walk(path):
+        for fn in files:
+            yield os.path.join(root, fn)
+
+def gen_dir_md5(dirname):
+    md5_list = [(f, gen_file_md5(f)) for f in get_files(dirname)]
+    return md5_list
+
+def is_same(file_a, file_b):
+    pass
+
+def compare_dirs(da, db):
+    filelst_a = dict(gen_dir_md5(da))
+    filelst_b = dict(gen_dir_md5(db))
+
+    writefile(da + '.md5.txt', os.linesep.join(['%s,%s' % (k,v) for k, v in filelst_a.iteritems()]))
+    writefile(db + '.md5.txt', os.linesep.join(['%s,%s' % (k,v) for k, v in filelst_b.iteritems()]))
+
+    rel_path_a = [p[len(da)+1:] for p in filelst_a]
+    rel_path_b = [p[len(db)+1:] for p in filelst_b]
+    only_a = [f for f in rel_path_a if f not in rel_path_b]
+    only_b = [f for f in rel_path_b if f not in rel_path_a]
+    isolate_files = only_a > only_b and only_b or only_a
+    file_set = set(rel_path_a) & set(rel_path_b)
+    diff_files = []
+    for fn in file_set:
+        if filelst_a[os.sep.join([da, fn])] != filelst_b[os.sep.join([db, fn])]:
+            diff_files.append(fn)
+
+    return (only_a, only_b, diff_files)
+
+def writefile(fp, content):
+    with open(fp, 'wb') as fb:
+        fb.write(content)
+
+def main():
+    cwd_dirs = [d for d in os.listdir('.') if os.path.isdir(d)]
+    cwd_dirs = dict(zip(map(str, range(len(cwd_dirs))), cwd_dirs))
+    for k, v in cwd_dirs.iteritems():
+        print '[%s] %s' % (k, v)
+    print 'Choose two dirs to compare.'
+    dir_a = raw_input('1st dirname:')
+    dir_b = raw_input('2nd dirname:')
+
+    assert dir_a in cwd_dirs
+    assert dir_b in cwd_dirs
+
+    only_in_a, only_in_b, diff_files = compare_dirs(cwd_dirs[dir_a], cwd_dirs[dir_b])
+    is_same = True
+    if only_in_a or only_in_b or diff_files:
+        is_same = False
+    for d in only_in_a:
+        print 'Only in [%s]: %s' % (cwd_dirs[dir_a], d)
+    for d in only_in_b:
+        print 'Only in [%s]: %s' % (cwd_dirs[dir_b], d)
+
+    for df in diff_files:
+        print 'diff file: %s' % df
+
+    if is_same:
+        print '%s, %s are the same' % (cwd_dirs[dir_a], cwd_dirs[dir_b])
+
+if __name__ == '__main__':
+    main()
+    raw_input('press any key to continue.')
+
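
Note: gen_file_md5() hashes the file in 4 KB chunks (chunk=2*2048), so large files never have to be read into memory at once; the digest is identical to hashing the whole file in one read. A small equivalence sketch (not part of the commit); the file path is a placeholder:

import hashlib

def md5_whole(fp):
    with open(fp, 'rb') as fb:
        return hashlib.md5(fb.read()).hexdigest()

# for any existing file path:
# assert gen_file_md5('some_file') == md5_whole('some_file')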

py/object/metaclass-1.py

+#!/usr/bin/env python
+# -*- encoding:utf-8 -*-
+
+class UpperAttrMetaclass(type):
+    def __new__(upperattr_metaclass, future_class_name,
+                future_class_parents, future_class_attr):
+        attrs = ((name, value) for name, value in future_class_attr.items()
+                if not name.startswith('__'))
+        uppercase_attr = dict((name.upper(), value) for name, value in attrs)
+        return type(future_class_name, future_class_parents, uppercase_attr)
+
+
+class Foo:
+    __metaclass__ = UpperAttrMetaclass
+    def foo(self):
+        print 'Upper'
+
+f = Foo()
+f.FOO()

py/object/metaclass-2.py

+#!/usr/bin/env python
+# -*- encoding:utf-8 -*-
+
+# upperattr_metaclass: a method always receives the current instance as its
+# first argument, just like ordinary methods receive `self`
+
+class UpperAttrMetaclass(type):
+    def __new__(upperattr_metaclass, future_class_name,
+                future_class_parents, future_class_attr):
+        attrs = ((name, value) for name, value in future_class_attr.items()
+                if not name.startswith('__'))
+        uppercase_attr = dict((name.upper(), value) for name, value in attrs)
+        # unlike metaclass-1, delegate to the parent's `__new__` (type.__new__)
+        # instead of calling `type` directly
+
+        return type.__new__(upperattr_metaclass,
+                            future_class_name,
+                            future_class_parents, uppercase_attr)
+
+
+class Foo:
+    __metaclass__ = UpperAttrMetaclass
+    def foo(self):
+        print 'Upper'
+
+f = Foo()
+f.FOO()

py/object/metaclass.py

+#!/usr/bin/env python
+# -*- encoding:utf-8 -*-
+
+'''
+http://stackoverflow.com/questions/100003/what-is-a-metaclass-in-python
+'''
+
+def upper_attr(future_class_name, future_class_parents, future_class_attr):
+    """Return a class Object, with the list of its attribute turned
+    into uppercase.
+    """
+    attrs = ((name, value) for name, value in future_class_attr.items() if not name.startswith('__'))
+    uppercase_attr = dict((name.upper(), value) for name, value in attrs)
+
+    # let `type` do the class creation
+    return type(future_class_name, future_class_parents, uppercase_attr)
+
+
+# __metaclass__ = upper_attr # this will affect all classes in the module
+
+class Foo(object):
+    __metaclass__ = upper_attr
+    # we can define __metaclass__ here instead to affect only this class
+    bar = 'bip'
+
+print hasattr(Foo, 'bar')
+# output: False
+print hasattr(Foo, 'BAR')
+# output: True
+
+f = Foo()
+print f.BAR
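
Note: upper_attr() relies on the three-argument form of type(), which builds a class object dynamically. A small standalone sketch of that call, independent of the metaclass machinery:

Bar = type('Bar', (object,), {'BIP': 'bip'})
print Bar.BIP        # 'bip'
print type(Bar)      # <type 'type'>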

py/pil/add_watermark.py

+#!/usr/bin/env python
+# -*- encoding:utf-8 -*-
+
+import os
+import Image, ImageEnhance
+
+
+def batch(infolder, outfolder, watermark):
+    mark = Image.open(watermark)
+    for root, dirs, files in os.walk(infolder):
+        for name in files:
+            try:
+                im = Image.open(os.path.join(root, name))
+                if im.mode != 'RGBA':
+                    im = im.convert('RGBA')
+                layer = Image.new('RGBA', im.size, (0, 0, 0, 0))
+                position = (im.size[0] - mark.size[0],
+                            im.size[1] - mark.size[1])
+                layer.paste(mark, position)
+                Image.composite(layer, im, layer).save(os.path.join(outfolder, name))
+            except Exception, e:
+                print e
+
+def test():
+    batch('/home/yxy/Pictures', '/home/yxy/tmp', '/home/yxy/tmp/t.png')
+
+if __name__ == '__main__':
+    test()

py/sa/obj.pkl

Binary file added.

py/sa/parse_log.py

 import sys
 import tarfile
 import cPickle
+import optparse
 from datetime import datetime, timedelta
 from collections import defaultdict
 from pprint import pprint
 
-fail_text = u'挖图失败'.encode('gb18030')
+fail_text = u'战斗失败'.encode('gb18030')
 
 watu_fail_pat = re.compile(fail_text)
 watu_succ_pat = re.compile(r'Get\s+reward')
-datu_fail_pat = re.compile(r'Fail')
-datu_succ_pat = re.compile(r'Complete')
+datu_fail_pat = re.compile(r'Fail:\d+\s+[\'"]?cg_cbt')
+datu_succ_pat = re.compile(r'Complete:\d+\s+[\'"]?cg_cbt')
 
 brackets_pat = re.compile(r'\[(.+?)\]')
+tarfile_pat = re.compile(r'\.t[(ar)|(gz)]+(\.gz)?(\.bz\d?)?')
 
 ums_pat = re.compile(r'ums=(.+?),')
 uid_pat = re.compile(r'uid=(\d+)')
 # ums --> uid mapping
 UMS_UID_MAP = defaultdict(set)
 
-# count uid login times
+# daily login info for every UID: {'day': {'uid': login_count}}
 UID_LOGIN = {}
-# track the last login ip per ums
-ums_login = {}
+
+# daily login info for every UMS (all login IPs for the day are kept; the last
+# one counts as the final login IP): {'day': {'ums': ['ip1', 'ip2']}}
+UMS_LOGIN = {}
+
+# daily login info for the mini client
+MINICLIENT_LOGIN = {}
 
 # watu (map digging) stats
-WATU = {}
+WA_TU = {}
 # datu (map battle) stats
-DATU = {}
+DA_TU = {}
+
+li2line = lambda x: ','.join(['"%s"' % l for l in x])
 
 def load_obj(fp):
-    print 'loading ', fp
+    # print 'loading ', fp
     return cPickle.load(file(fp))
 
-def tongji(server_id, start_date, end_date):
+def tongji_login(server_id):
+    '''
+    Statistics produced:
+    1. mini-client logins, counted by UMS; a UMS that logs in several times is counted once per day
+    2. regular-client logins, counted by UMS; same once-per-day rule
+    3. per-day ratio of mini-client logins
+
+    - before March
+    - after March
+    '''
+    uid_ums_map = load_obj('%s-uid_ums_map.pkl' % server_id)
+    uid_login = load_obj('%s-uid_login.pkl' % server_id)
+    ums_login = load_obj('%s-ums_login.pkl' % server_id)
+    # ums_uid_map = '%s-ums_uid_map.pkl' % server_id
+    miniclient_login = load_obj('%s-miniclient_login.pkl' % server_id)
+    csv = '%s-ums_login.csv' % server_id
+
+    days = ums_login.keys()
+    days.sort()
+    before_march = []
+    after_march = []
+    march = get_dateobj_from_str('2010-03-01', day_fmt)
+    for day in days:
+        dobj = get_dateobj_from_str(day, day_fmt)
+        if dobj < march:
+            before_march.append(day)
+        else:
+            after_march.append(day)
+
+    all_ums = set([])
+    # all ums that logged in via the mini client
+    mini_ums = set([])
+    # all ums that logged in via the regular client
+    norm_ums = set([])
+
+    for m in [before_march, after_march]:
+        n = 1
+        fobj = file(str(n) + csv, 'wb')
+        n += 1
+        sum_login_counts = len([len(v.keys()) for k, v in ums_login.iteritems()
+                                if k in m])
+        # uid -> ums
+        header_summary = [s.encode('gb18030') for s in [u'日期', u'SERVER_ID', u'总登录数',
+                            u'微端登录数', u'普通版登录数', u'微端登录比例']]
+        fobj.write(li2line(header_summary) + '\n')
+        fobj.write(li2line([n, server_id, sum_login_counts, ]))
+        fobj.write('\n')
+
+        for day in before_march:
+            pass
+
+        fobj.write()
+    for day, item in ums_login.iteritems():
+        pass
+
+
+    header_detail = [s.encode('gb18030') for s in [u'日期', u'微端登录数', u'正常版登录数', u'微端比例']]
+    fobj.write(li2line(header_detail) + '\n')
+    fobj.close()
+
+def tongji_cangbao(server_id, start_date, end_date):
     uid_ums_map = load_obj('%s-uid_ums_map.pkl' % server_id)
     watu = load_obj('%s-watu.pkl'% server_id)
     datu = load_obj('%s-datu.pkl' % server_id)
         for date in dates:
             dobj = get_dateobj_from_str(date, day_fmt)
             if dobj >= start_date:
-                if dobj <= end_date:
+                if dobj < (end_date + timedelta(1)):
                     continue
             del dct[date]
-        pprint( dct.keys())
         return dct
 
     def get_ip_by_uid(uid):
             days.reverse()
             for day in days:
                 if ums in ums_login[day]:
-                    ip = ums_login[day][ums]
+                    # take the last login IP of the day
+                    ip = ums_login[day][ums][-1]
         return ip
 
     def parse_cangbao(dct):
             uid_map[uid] = {'succ': succ, 'fail': fail}
         return succ_count, fail_count, uid_map
 
-    print 'write content to %s' % csv
-
     fobj = open(csv, 'wb')
 
     _uid_login = filter_by_date(uid_login, start_date, end_date)
+
     _watu = filter_by_date(watu, start_date, end_date)
     _datu = filter_by_date(datu, start_date, end_date)
 
     # total number of logins within the period
     login_count = 0
     uids = set([])
+    # count the number of valid uids
     for date, item in _uid_login.iteritems():
         for k, v in item.iteritems():
             uids.add(k)
             login_count += v
 
+    print '%s %s --> %d' % (server_id, date_line, len(uids))
     watu_count = {'fail': 0, 'succ': 0}
     uid_watu_count = {}
     for date, item in _watu.iteritems():
     for key in ['fail', 'succ']:
         cangbao_count += datu_count[key] + watu_count[key]
 
-    li2line = lambda x: ','.join(['"%s"' % l for l in x])
 
     fobj.write(u'日期,登录UID数,藏宝总数,挖图总数,打图总数\n'.encode('gb18030'))
     sum_line = [date_line, login_count, cangbao_count, watu_count, datu_count]
         uid = parts[1]
         day = datetime_to_day(date)
 
-        WATU.setdefault(day, {})
-        WATU[day].setdefault(uid, {})
-        WATU[day][uid].setdefault('succ', 0)
-        WATU[day][uid].setdefault('fail', 0)
+        WA_TU.setdefault(day, {})
+        WA_TU[day].setdefault(uid, {})
+        WA_TU[day][uid].setdefault('succ', 0)
+        WA_TU[day][uid].setdefault('fail', 0)
 
-        WATU[day][uid][s] += 1
-        return WATU[day][uid][s]
+        WA_TU[day][uid][s] += 1
+        return WA_TU[day][uid][s]
 
     for line in fobj:
         if watu_fail_pat.search(line):
         uid = parts[1]
         day = datetime_to_day(date)
 
-        DATU.setdefault(day, {})
-        DATU[day].setdefault(uid, {})
-        #DATU[day][uid].setdefault(s, 0)
-        DATU[day][uid].setdefault('fail', 0)
-        DATU[day][uid].setdefault('succ', 0)
+        DA_TU.setdefault(day, {})
+        DA_TU[day].setdefault(uid, {})
+        #DA_TU[day][uid].setdefault(s, 0)
+        DA_TU[day][uid].setdefault('fail', 0)
+        DA_TU[day][uid].setdefault('succ', 0)
 
-        DATU[day][uid][s] += 1
-        return DATU[day][uid][s]
+        DA_TU[day][uid][s] += 1
+        return DA_TU[day][uid][s]
 
     for line in fobj:
         if datu_fail_pat.search(line):
         elif datu_succ_pat.search(line):
             _parse(line, 'succ')
 
+def parse_minilog(fobj):
+    '''Parse the mini-client logs'''
+
+    for line in fobj:
+        parts = parse_line(line, brackets_pat)
+        if not all(parts) or len(parts) < 3:
+            print '[ERR LINE]', line
+            continue
+        date = parts[0]
+        uid = parts[1]
+        ip = parts[2]
+        day = datetime_to_day(date)
+        MINICLIENT_LOGIN.setdefault(day, defaultdict(list))
+        # record every login IP
+        MINICLIENT_LOGIN[day][uid].append(ip)
+
 def parse_login(fobj):
     '''
     count uid logins and the last login ip per ums
             # ums can only be obtained from the key field
             ums = key_parts[2]
 
-            # track the last login ip
-            if day not in ums_login:
-                ums_login[day] = {ums: ip}
-            else:
-                ums_login[day][ums] = ip
+            UMS_LOGIN.setdefault(day, defaultdict(list))
+            # record every login IP
+            UMS_LOGIN[day][ums].append(ip)
 
         elif 'uid enter game vfd=' in line:
             date = get_ret(date_pat, line)
     fobj = tf.extractfile(tf.members[0])
     return fobj
 
-def main():
-    logdir = sys.argv[1]
-    server_id = sys.argv[2]
+def cmd_parse(args, opts):
+    """parse - parse log directory Usage: <server_id_path> <server_id>
+    """
+    logdir = args[1]
+    server_id = args[2]
     for fn in os.listdir(logdir):
         fp = os.path.join(logdir, fn)
         print 'parsing', fp
-        fobj = get_fobj_from_tarfile(fp)
+
+        if tarfile_pat.search(fp):
+            fobj = get_fobj_from_tarfile(fp)
+        else:
+            fobj = file(fp)
+
         if 'login_' in fn:
             print 'enter login logic'
             parse_login(fobj)
         elif 'mission_' in fn:
             print 'enter mission logic'
             parse_datu(fobj)
+        elif 'mini' in fn:
+            print 'entering miniclient log logic'
+            parse_minilog(fobj)
         fobj.close()
         del fobj
 
     dump_obj('%s-uid_ums_map.pkl' % server_id, UID_UMS_MAP)
     dump_obj('%s-ums_uid_map.pkl' % server_id, UMS_UID_MAP)
-    dump_obj('%s-watu.pkl'% server_id, WATU)
-    dump_obj('%s-datu.pkl' % server_id, DATU)
+    dump_obj('%s-watu.pkl'% server_id, WA_TU)
+    dump_obj('%s-datu.pkl' % server_id, DA_TU)
     dump_obj('%s-uid_login.pkl' % server_id, UID_LOGIN)
-    dump_obj('%s-ums_login.pkl' % server_id, ums_login)
+    dump_obj('%s-ums_login.pkl' % server_id, UMS_LOGIN)
+    dump_obj('%s-miniclient_login.pkl' % server_id, MINICLIENT_LOGIN)
 
-def output():
-    date_tuple = (('2012-05-03', '2012-05-09'),
-                  ('2012-05-25', '2012-05-31'),
-                  ('2012-06-01', '2012-06-07'),
-                  )
+def cmd_tongji_cangbao(args, opts):
+    '''output log statistics'''
+    date_tuple = (
+            ('2012-05-03', '2012-05-09'),
+            ('2012-05-10', '2012-05-16'),
+            ('2012-05-17', '2012-05-23'),
+            ('2012-05-25', '2012-05-31'),
+            ('2012-06-01', '2012-06-07'),
+            ('2012-06-08', '2012-06-14'),
+   )
 
     server_ids = ('1036', '1037')
     for id in server_ids:
         for start_date, end_date in date_tuple:
-            output_result(id, start_date, end_date)
+            tongji_cangbao(id, start_date, end_date)
 
     for start_date, end_date in date_tuple[1:]:
-        output_result('1052', start_date, end_date)
+        tongji_cangbao('1052', start_date, end_date)
+
+def cmd_help(args, opts):
+    """help - list available commands"""
+
+    print "Available commands:"
+    for _, func in sorted(get_commands().items()):
+        print "   ", func.__doc__
+
+def get_commands():
+    return {'help': cmd_help,
+            'parse': cmd_parse,
+            'tongji_cangbao': cmd_tongji_cangbao}
+
+def parse_opts():
+    usage = "%prog [options] <command> [arg] ..."
+    description = (u"Log parse, statistics too. Use `%prog help`"
+        "to see the list of available commands.")
+    op = optparse.OptionParser(usage=usage, description=description)
+    opts, args = op.parse_args()
+    if not args:
+        op.print_help()
+        sys.exit(2)
+    cmdname, cmdargs, opts = args[0], args[1:], opts
+    commands = get_commands()
+    if cmdname not in commands:
+        print >> sys.stdout, "Unknown command: %s\n\n" % cmdname
+        cmd_help(None, None)
+        sys.exit(1)
+    return commands[cmdname], cmdargs, opts
+
+def main():
+    cmd, args, opts = parse_opts()
+    try:
+        cmd(args, opts)
+    except IndexError:
+        print cmd.__doc__
 
 if __name__ == '__main__':
     main()
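
Note: the refactor swaps the old sys.argv handling for an optparse-based command dispatcher (get_commands()/parse_opts()). Roughly, invocation and dispatch look like the sketch below; the log directory path is a placeholder:

# Usage sketch (paths are placeholders):
#   python parse_log.py help
#   python parse_log.py parse /path/to/logdir 1036   # dumps the *.pkl files
#   python parse_log.py tongji_cangbao               # writes the per-period CSVs
#
# parse_opts() strips the command name, so cmd_parse receives only its own args:
cmd = get_commands()['parse']          # -> cmd_parse
cmd(['/path/to/logdir', '1036'], None)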

py/sa/simple_cdn_check.py

+#!/usr/bin/env python
+# -*- encoding:utf-8 -*-
+
+import os
+import sys
+import re
+import traceback
+import urllib2
+import threading
+import Queue
+import logging
+import time
+
+cwd = os.path.abspath(os.path.dirname(__file__))
+logfile = os.path.join(cwd, os.path.splitext(__file__)[0] + '.log')
+logging.basicConfig(filename=logfile,
+                    level=logging.INFO)
+
+# DONE: build a multi-threaded model with a configurable thread count
+# DONE: make sure URL requests use the HEAD method
+
+cdn_ips = (
+           '61.187.102.68',
+           '218.60.35.175',
+           '218.8.52.229',
+           '61.138.133.14',
+           '221.204.210.146',
+           '221.204.210.147',
+           '60.5.252.167',
+           '221.195.3.139',
+           '60.210.16.87',
+           '60.210.16.88',
+           '60.165.99.197',
+           '222.208.168.214',
+           '222.208.168.215',
+           '218.6.12.169',
+           # '222.186.34.47' --> '222.186.58.206',
+           '222.186.58.206',
+           '58.221.38.68',
+           # '123.183.210.136', temporarily removed
+           '60.173.11.162',
+           '59.53.65.14',
+           # '61.183.42.56' -->'61.183.41.216',
+           '61.183.41.216',
+           '121.9.221.88',
+           '121.9.221.89',
+           '220.165.3.73',
+           # '218.29.176.196' --> '42.227.234.137',
+           '42.227.234.137',
+           '218.77.101.34')
+
+def sanity(ver):
+    '''
+    >>> sanity('mini_cn_v_2_9_0')
+    ('mini_cn', 'v', '2.9.0')
+    >>> sanity('v_2_9_0')
+    ('mini_cn', 'v', '2.9.0')
+    >>> sanity('v2.9.0')
+    ('mini_cn', 'v', '2.9.0')
+    >>> sanity('2.9.0')
+    ('mini_cn', 'v', '2.9.0')
+    '''
+    t = 'mini_cn'
+    v = '2.9.0'
+    pat1 = re.compile(r'(\w+_\w+)_v_(\d+[_\.]\d+[_\.]\d+)')
+    pat2 = re.compile(r'(v[_\.])?(\d+[\._]\d+[\._]\d+)')
+    pat3 = re.compile(r'_')
+    if pat1.search(ver):
+        t, v = pat1.search(ver).groups()
+    elif pat2.search(ver):
+        v = pat2.search(ver).group(2)
+    else:
+        raise TypeError('version type error %r' % ver)
+    v = pat3.sub(r'.', v)
+    return (t, 'v', v)
+
+def get_resources(path, excludes=[]):
+    '''
+    Yield every resource path under the given root directory.
+    Paths are returned relative to that root.
+
+    '''
+
+    if os.path.exists('obj.pkl'):
+        import pickle
+        with open('obj.pkl', 'rb') as fb:
+            resources = pickle.load(fb)
+            for rsc in resources:
+                relpath = rsc[len(path):].lstrip('/')
+                if relpath.startswith('/'):
+                    relpath = relpath[1:]
+                pardir = relpath.split('/', 1)[0]
+                if pardir in excludes:
+                    continue
+                yield relpath
+    else:
+        for root, dirs, files in os.walk(path):
+            for fn in files:
+                yield os.path.join(root, fn)[len(path):].lstrip('/')
+
+class ThreadUrlFetch(threading.Thread):
+    def __init__(self, fetch, queue):
+        threading.Thread.__init__(self)
+        self.fetch = fetch
+        self.queue = queue
+
+    def run(self):
+        while True:
+            request = self.queue.get(1)
+            status = 'ERR'
+            try:
+                u = self.fetch.open(request)
+                status = u.msg
+                assert u.code == 200, "server response code not equal 200"
+                msg = "{0} - [{1}] {2:<15} {3}".format(self.getName(),
+                                                       status,
+                                                       self.fetch.handlers[0].proxies['http'],
+                                                       request.get_full_url())
+                # if request.get_method() == 'GET':
+                #     # TODO: could also verify the response content
+                #     raise ValueError('Need to implement')
+                u.close()
+
+            except urllib2.HTTPError:
+                exc_type, exc_value, exc_traceback = sys.exc_info()
+                formatted_lines = traceback.format_exc().splitlines()
+                print '*' * 30
+                msg = "{0} - [{1}] {2:<15} {3} [{4}]".format(self.getName(),
+                                                             status,
+                                                             self.fetch.handlers[0].proxies['http'],
+                                                             request.get_full_url(),
+                                                             formatted_lines[-1])
+
+                print >> sys.stderr, msg
+                print '*' * 30
+                logging.error(msg)
+            finally:
+                self.queue.task_done()
+
+class HeadRequest(urllib2.Request):
+    '''HEAD Request'''
+    def get_method(self):
+        return "HEAD"
+
+def main():
+    start = time.time()
+
+    # default number of worker threads
+    threads_num = 10
+    # default version
+    _version = '2.11.0'
+    # CDN download base URL
+    cdn_link = 'http://download1.fs.175game.com'
+    # resource root directory
+    resource_root_dir = '/home/miniclient/rsync_input/mini.fs.175game.com/mini_cn'
+    if len(sys.argv) < 2:
+        print >> sys.stderr, "Please specified version, like: v2.11.0"
+        sys.exit(-1)
+    else:
+        _version = sys.argv[1]
+
+
+    client_type, _v, version = sanity(_version)
+    underline_version = version.replace('.', '_')
+    print "Use version: %s_%s_%s" % (client_type, _v, underline_version)
+    print "Use threads: %d" % threads_num
+    # client_type = 'mini_cn'
+
+    cdn_proxies = ({'http': ip} for ip in cdn_ips)
+
+    # resource publish path on the CDN
+    resource_pub_dir = 'fs/%s_v_%s' % (client_type, underline_version)
+
+    resources = get_resources(resource_root_dir, ['resource'])
+    resources = list(resources)
+    print 'grab url resources: %d' % len(resources)
+    resource_links = ['/'.join([cdn_link, resource_pub_dir, r]) for r in resources]
+    # resource_links = ['http://download1.fs.175game.com/fs/mini_cn_v_2_11_0/client/bin/release/fsresd.exe']
+
+    for proxy in cdn_proxies:
+        _start = time.time()
+        # print 'Starting check proxy [%s]' % proxy['http']
+        proxy_opener = urllib2.build_opener(urllib2.ProxyHandler(proxy))
+        queue = Queue.Queue()
+        [queue.put(HeadRequest(u)) for u in resource_links]
+        for i in range(threads_num):
+            t = ThreadUrlFetch(proxy_opener, queue)
+            t.setDaemon(True)
+            t.start()
+        queue.join()
+        print 'Done [%s] %.2f seconds' % (proxy['http'].ljust(12), time.time() - _start)
+
+    print 'Finished'
+    print 'Time elapsed %.2f seconds' % (time.time() - start)
+if __name__ == '__main__':
+    main()
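
Note: the checker forces HEAD requests by overriding get_method() and sends each pass through a single CDN node via ProxyHandler (the script passes bare IPs as the proxy value). A stripped-down sketch of that combination; the proxy IP and the request path are placeholders:

import urllib2

class HeadRequest(urllib2.Request):
    def get_method(self):
        return "HEAD"

# placeholder proxy IP; the real run iterates over cdn_ips
opener = urllib2.build_opener(urllib2.ProxyHandler({'http': '203.0.113.10'}))
resp = opener.open(HeadRequest('http://download1.fs.175game.com/fs/'))
print resp.code, resp.msg
resp.close()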

py/sa/statistics/data.sqlite3

Binary file added.

py/sa/statistics/do_statistics.py

+#!/usr/bin/env python
+# -*- encoding:utf-8 -*-
+
+import sys
+import os
+import re
+import cPickle
+import datetime
+import time
+
+from collections import defaultdict
+
+'''
+Log statistics.
+'''
+
+# Compute, across all servers, the mini-client login ratio before and after March, counted by ums
+
+def load_obj(pkl):
+    return cPickle.load(file(pkl, 'rb'))
+
+def str2date(s):
+    tt = time.strptime(s, '%Y-%m-%d')
+    return datetime.date(*tt[:3])
+
+# Count mini-client / regular-client logins for each server
+def login_stat(server_id, start_date, end_date):
+
+    mini_login_stat = load_obj('%s-miniclient_login.pkl' % server_id)
+    mini_uids = set()
+
+    [mini_uids.update(v.keys()) for k, v in mini_login_stat.iteritems()]
+    err_ids = [u for u in mini_uids if not u or not u.isdigit()]
+    if err_ids:
+        print 'Got error ids %s' % err_ids
+        for _id in err_ids:
+            mini_uids.remove(_id)
+
+    uid_ums_map = load_obj('%s-uid_ums_map.pkl' % server_id)
+    #ums_login = load_obj('%s-ums_login.pkl' % server_id)
+    uid_login = load_obj('%s-uid_login.pkl' % server_id)
+
+    mini_ums_login_stat = defaultdict(set)
+    norm_ums_login_stat = defaultdict(set)
+
+    for date, items in uid_login.iteritems():
+        tt = str2date(date)
+        if not (start_date <= tt < end_date):
+            print '[DEBUG] ignore time %s' % date
+            continue
+        for uid, count in items.iteritems():
+            # if the uid appears in the mini-client uid set, treat its ums as a mini-client login
+            ums = uid_ums_map.get(uid, 'unknown_uid-%s' % uid)
+            if uid in mini_uids:
+                mini_ums_login_stat[date].add(ums)
+            else:
+                norm_ums_login_stat[date].add(ums)
+
+    mini_ums_login_counts = sum([len(v) for k, v in mini_ums_login_stat.iteritems()])
+    norm_ums_login_counts = sum([len(v) for k, v in norm_ums_login_stat.iteritems()])
+
+    return (mini_ums_login_counts, norm_ums_login_counts)
+
+def stat(server_ids, start_date, end_date):
+    _stat = [login_stat(i, start_date, end_date) for i in server_ids]
+    mini_login_counts = sum([i[0] for i in _stat])
+    norm_login_counts = sum([i[1] for i in _stat])
+    login_counts = mini_login_counts + norm_login_counts
+    percent = '%d%%' % (mini_login_counts * 1.0 / login_counts * 100)
+    return (mini_login_counts, norm_login_counts, login_counts, percent)
+
+def main():
+    files = os.listdir('.')
+    id_pat = re.compile(r'^(\d{4})-')
+    server_ids = set()
+    server_ids.update([id_pat.search(f).group(1) for f in files if id_pat.search(f)])
+
+    min_date = str2date('1999-01-01')
+    max_date = str2date('2012-12-30')
+    mid_date = str2date('2012-03-01')
+
+    print 'before %s' % mid_date
+    mini_login_counts, norm_login_counts, login_counts, percent = stat(server_ids,
+            min_date, mid_date)
+    print '%s(%s) - %s -%s' % (mini_login_counts, percent, norm_login_counts, login_counts)
+    print '-' * 20
+    print 'after %s' % mid_date
+    mini_login_counts, norm_login_counts, login_counts, percent = stat(server_ids,
+            mid_date, max_date)
+    print '%s(%s) - %s -%s' % (mini_login_counts, percent, norm_login_counts, login_counts)
+
+if __name__ == '__main__':
+    main()
+
+

py/sa/statistics/machine_info_statistics.py

+#!/usr/bin/env python
+# -*- encoding:utf-8 -*-
+import tarfile
+import re
+import time
+import sys
+
+from utils import get_files_by_ext
+
+def get_logobj_from_tarfile(filename):
+    '''Yield log-file objects from a tar archive'''
+
+    tf = tarfile.open(filename)
+    for member in tf.members:
+        # if member.name == 'mini_stat_access.run':
+        yield LogLineGenerator(tf.extractfile(member))
+
+class LogLineGenerator(object):
+    def __init__(self, fobj, log_regpat=None):
+        self.log_regpat = log_regpat
+        if isinstance(fobj, (file, tarfile.ExFileObject)):
+            self.fobj = fobj
+        elif isinstance(fobj, str):
+            self.fobj = open(fobj)
+        else:
+            raise TypeError('fobj only support file object or file path. Got: %r' % type(fobj))
+
+    def get_loglines(self):
+        for line in self.fobj:
+            line = line.strip()
+            yield line
+
+def main():
+
+    tar_dir = sys.argv[1]
+    tar_files = get_files_by_ext(tar_dir, '.tar.gz')
+    uid_pat = re.compile(r'uid\=(.+?),')
+    machine_pat = re.compile(r'MachineInfo=(.+)')
+
+    cpu_pat = re.compile(r'(cpu.*?),')
+    device_pat = re.compile(r'device(\[.+?\]),')
+    #memory_pat = re.compile(r'memory\[(.+?)\]')
+    memory_pat = re.compile(r'mem\-physic:(.+?),')
+    screen_pat = re.compile(r'screen\-(.+?),')
+    window_pat = re.compile(r'window:(.+)')
+
+    #fp = open('statistics.csv', 'wb')
+    fp_map = {'memory': open('memory.csv', 'wb'),
+              'cpu': open('cpu.csv', 'wb'),
+              'device': open('gpu_device.csv', 'wb'),
+              'screen': open('screen.csv', 'wb'),
+              'window': open('window.csv', 'wb'),
+              #'all': open('statistics.csv', 'wb'),
+              }
+    fp_all = open('statistics.csv', 'wb')
+    fp_all.write('device,memory,cpu,screen,window\n')
+    for fp in fp_map:
+        fp_map[fp].write('%s,sum\n' % fp)
+    maps  = [{}, {}, {}, {}, {}]
+    for tar_file in tar_files:
+        print tar_file
+        start_time = time.time()
+        for logobj in get_logobj_from_tarfile(tar_file):
+            for line in logobj.get_loglines():
+                m = machine_pat.search(line)
+                if m:
+                    #uid = uid_pat.search(line).group(1)
+                    machine_info = m.group(1).strip()
+
+                    cpu = cpu_pat.search(machine_info).group(1).strip()
+                    device = device_pat.search(machine_info).group(1).strip()
+                    memory = memory_pat.search(machine_info).group(1).strip()
+                    screen = screen_pat.search(machine_info).group(1).strip()
+                    window = window_pat.search(machine_info).group(1).strip()
+
+                    li = [device, memory, cpu, screen, window]
+                    line = ','.join(['"%s"' % i for i in li]) + '\n'
+                    for index, dct in enumerate(maps):
+                        if li[index] not in maps[index]:
+                            maps[index][li[index]] = 1
+                        else:
+                            v = maps[index][li[index]]
+                            maps[index][li[index]] = v + 1
+                    fp_all.write(line)
+                    #print li
+
+    def co(m):
+        lines = ''
+        for k, v in m.iteritems():
+            lines += ','.join([k, str(v)]) + '\n'
+        return lines
+    #import pdb;pdb.set_trace()
+    fp_all.close()
+    fp_map['device'].write(co(maps[0]))
+    fp_map['memory'].write(co(maps[1]))
+    fp_map['cpu'].write(co(maps[2]))
+    fp_map['screen'].write(co(maps[3]))
+    fp_map['window'].write(co(maps[4]))
+    for fp in fp_map:
+        fp_map[fp].close()
+    print 'elapsed %.2f' % (time.time() - start_time)
+
+if __name__ == '__main__':
+    main()
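
Note: the five field regexes imply a log field of the form MachineInfo=cpu...,device[...],mem-physic:...,screen-...,window:.... A sketch against a made-up line; the sample values are invented to match the patterns, not taken from a real log:

import re
sample = ('MachineInfo=cpu:Intel(R) Core(TM)2 2.40GHz,device[GeForce 9600 GT],'
          'mem-physic:2048MB,screen-1440x900,window:1024x768')
machine_info = re.search(r'MachineInfo=(.+)', sample).group(1)
print re.search(r'(cpu.*?),', machine_info).group(1)         # cpu:Intel(R) Core(TM)2 2.40GHz
print re.search(r'device(\[.+?\]),', machine_info).group(1)  # [GeForce 9600 GT]
print re.search(r'mem\-physic:(.+?),', machine_info).group(1)# 2048MB
print re.search(r'screen\-(.+?),', machine_info).group(1)    # 1440x900
print re.search(r'window:(.+)', machine_info).group(1)       # 1024x768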

py/sa/statistics/parse_log.py

+#!/usr/bin/env python
+# -*- encoding:utf-8 -*-
+
+
+'''
+2012-05-03 -- 2012-05-09
+uid  total logins  total watu (map digging) attempts (success + failure)  total datu (map battle) attempts (success + failure)
+
+ID  IP  datu succeeded  datu failed  watu succeeded  watu failed
+
+'''
+import re
+import os
+import sys
+import tarfile
+import cPickle
+import optparse
+from datetime import datetime, timedelta
+from collections import defaultdict
+from pprint import pprint
+
+from log import get_logger
+
+cwd = os.path.realpath(os.path.dirname(__file__))
+logfile = os.path.join(cwd, __file__ + '.log')
+logger = get_logger('tongji', logfile)
+
+fail_text = u'战斗失败'.encode('gb18030')
+
+watu_fail_pat = re.compile(fail_text)
+watu_succ_pat = re.compile(r'Get\s+reward')
+datu_fail_pat = re.compile(r'Fail:\d+\s+[\'"]?cg_cbt')
+datu_succ_pat = re.compile(r'Complete:\d+\s+[\'"]?cg_cbt')
+
+brackets_pat = re.compile(r'\[(.+?)\]')
+tarfile_pat = re.compile(r'\.t[(ar)|(gz)]+(\.gz)?(\.bz\d?)?')
+
+ums_pat = re.compile(r'ums=(.+?),')
+uid_pat = re.compile(r'uid=(\d+)')
+ip_pat = re.compile(r'ip=([\d\.]+)')
+date_pat = re.compile(r'\[([-\d:\s]+)\]')
+key_pat = re.compile(r'key=(.+?),')
+mission_uid_pat = re.compile(r'[(Fail)|(Complete)]:(\d+)')
+
+datetime_fmt = '%Y-%m-%d %H:%M:%S'
+day_fmt = '%Y-%m-%d'
+hour_fmt = '%Y-%m-%d %H'
+minute_fmt = '%Y-%m-%d %H:%M'
+
+# uid --> ums mapping
+UID_UMS_MAP = {}
+# ums --> uid mapping
+UMS_UID_MAP = defaultdict(set)
+
+# daily login info for every UID: {'day': {'uid': login_count}}
+UID_LOGIN = {}
+
+# daily login info for every UMS (all login IPs for the day are kept; the last
+# one counts as the final login IP): {'day': {'ums': ['ip1', 'ip2']}}
+UMS_LOGIN = {}
+
+# daily login info for the mini client
+MINICLIENT_LOGIN = {}
+
+# watu (map digging) stats
+WA_TU = {}
+# datu (map battle) stats
+DA_TU = {}
+
+li2line = lambda x: ','.join(['"%s"' % l for l in x])
+
+def load_obj(fp):
+    # print 'loading ', fp
+    logger.info('loading %s', fp)
+    return cPickle.load(file(fp))
+
+def tongji_login(server_id):
+    '''
+    Statistics produced:
+    1. mini-client logins, counted by UMS; a UMS that logs in several times is counted once per day
+    2. regular-client logins, counted by UMS; same once-per-day rule
+    3. per-day ratio of mini-client logins
+
+    - before March
+    - after March
+    '''
+    uid_ums_map = load_obj('%s-uid_ums_map.pkl' % server_id)
+    uid_login = load_obj('%s-uid_login.pkl' % server_id)
+    ums_login = load_obj('%s-ums_login.pkl' % server_id)
+    # ums_uid_map = '%s-ums_uid_map.pkl' % server_id
+    miniclient_login = load_obj('%s-miniclient_login.pkl' % server_id)
+    csv = '%s-ums_login.csv' % server_id
+
+    days = ums_login.keys()
+    days.sort()
+    before_march = []
+    after_march = []
+    march = get_dateobj_from_str('2010-03-01', day_fmt)
+    for day in days:
+        dobj = get_dateobj_from_str(day, day_fmt)
+        if dobj < march:
+            before_march.append(day)
+        else:
+            after_march.append(day)
+
+    all_ums = set([])
+    # all ums that logged in via the mini client
+    mini_ums = set([])
+    # all ums that logged in via the regular client
+    norm_ums = set([])
+
+    for m in [before_march, after_march]:
+        n = 1
+        fobj = file(str(n) + csv, 'wb')
+        n += 1
+        sum_login_counts = len([len(v.keys()) for k, v in ums_login.iteritems()
+                                if k in m])
+        # uid -> ums
+        header_summary = [s.encode('gb18030') for s in [u'日期', u'SERVER_ID', u'总登录数',
+                            u'微端登录数', u'普通版登录数', u'微端登录比例']]
+        fobj.write(li2line(header_summary) + '\n')
+        fobj.write(li2line([n, server_id, sum_login_counts, ]))
+        fobj.write('\n')
+
+        for day in before_march:
+            pass
+
+        fobj.write()
+    for day, item in ums_login.iteritems():
+        pass
+
+
+    header_detail = [s.encode('gb18030') for s in [u'日期', u'微端登录数', u'正常版登录数', u'微端比例']]
+    fobj.write(li2line(header_detail) + '\n')
+    fobj.close()
+
+def tongji_cangbao(server_id, start_date, end_date):
+    uid_ums_map = load_obj('%s-uid_ums_map.pkl' % server_id)
+    watu = load_obj('%s-watu.pkl'% server_id)
+    datu = load_obj('%s-datu.pkl' % server_id)
+    uid_login = load_obj('%s-uid_login.pkl' % server_id)
+    ums_login = load_obj('%s-ums_login.pkl' % server_id)
+    # ums_uid_map = '%s-ums_uid_map.pkl' % server_id
+
+    date_line = '%s - %s' % (start_date, end_date)
+    csv = '%s-%s-result.csv' % (server_id, date_line)
+
+
+    start_date = get_dateobj_from_str(start_date, day_fmt)
+    end_date = get_dateobj_from_str(end_date, day_fmt)
+
+    def filter_by_date(dct, start, end):
+        dates = dct.keys()
+        for date in dates:
+            dobj = get_dateobj_from_str(date, day_fmt)
+            if dobj >= start_date:
+                if dobj < (end_date + timedelta(1)):
+                    continue
+            del dct[date]
+        return dct
+
+    def get_ip_by_uid(uid):
+        ip = 'unknown'
+        ums = uid_ums_map.get(uid, 'unknown')
+        if ums == 'unknown':
+            ip = 'unknown'
+        else:
+            days = ums_login.keys()
+            days.reverse()
+            for day in days:
+                if ums in ums_login[day]:
+                    # take the last login IP of the day
+                    ip = ums_login[day][ums][-1]
+        return ip
+
+    def parse_cangbao(dct):
+        succ_count = 0
+        fail_count = 0
+        uid_map = {}
+        for uid, info in dct.iteritems():
+            succ = info.get('succ', 0)
+            fail = info.get('fail', 0)
+            succ_count += succ
+            fail_count += fail
+            uid_map[uid] = {'succ': succ, 'fail': fail}
+        return succ_count, fail_count, uid_map
+
+    fobj = open(csv, 'wb')
+
+    _uid_login = filter_by_date(uid_login, start_date, end_date)
+
+    _watu = filter_by_date(watu, start_date, end_date)
+    _datu = filter_by_date(datu, start_date, end_date)
+
+    logger.debug('date_line = %s' % date_line)
+    # total number of logins within the period
+    login_count = 0
+    uids = set([])
+    # count the number of valid uids
+    for date, item in _uid_login.iteritems():
+        for k, v in item.iteritems():
+            uids.add(k)
+            login_count += v
+
+    logger.info('%s %s --> %d' % (server_id, date_line, len(uids)))
+    watu_count = {'fail': 0, 'succ': 0}
+    uid_watu_count = {}
+    for date, item in _watu.iteritems():
+        for uid, m in item.iteritems():
+            uid_watu_count.setdefault(uid, {})
+            uid_watu_count[uid].setdefault('fail', 0)
+            uid_watu_count[uid].setdefault('succ', 0)
+            uid_watu_count[uid]['fail'] += m['fail']
+            uid_watu_count[uid]['succ'] += m['succ']
+            watu_count['fail'] += m['fail']
+            watu_count['succ'] += m['succ']
+
+    uid_datu_count = {}
+    datu_count = {'fail': 0, 'succ': 0}
+    for date, item in _datu.iteritems():
+        for uid, m in item.iteritems():
+            uid_datu_count.setdefault(uid, {})
+            uid_datu_count[uid].setdefault('fail', 0)
+            uid_datu_count[uid].setdefault('succ', 0)
+            uid_datu_count[uid]['fail'] += m['fail']
+            uid_datu_count[uid]['succ'] += m['succ']
+            datu_count['fail'] += m['fail']
+            datu_count['succ'] += m['succ']
+
+    cangbao_count = 0
+    for key in ['fail', 'succ']:
+        cangbao_count += datu_count[key] + watu_count[key]
+
+
+    fobj.write(u'日期,登录UID数,藏宝总数,挖图总数,打图总数\n'.encode('gb18030'))
+    sum_line = [date_line, login_count, cangbao_count, watu_count, datu_count]
+    fobj.write(li2line(sum_line) + '\n')
+
+    fobj.write('\n')
+    fobj.write(u'ID,IP,打图成功,打图失败,挖图成功,挖图失败\n'.encode('gb18030'))
+
+    for uid in uids:
+        found_uid_watu = True
+        found_uid_datu = True
+        # skip players who took no part in the treasure-hunt activity at all
+        if uid not in uid_datu_count:
+            uid_datu_count[uid] = {'fail': 0, 'succ': 0}
+            found_uid_datu = False
+        if uid not in uid_watu_count:
+            uid_watu_count[uid] = {'fail': 0, 'succ': 0}
+            found_uid_watu = False
+        if not (found_uid_datu | found_uid_watu):
+            continue
+        line = [uid, get_ip_by_uid(uid),
+                uid_datu_count[uid]['succ'], uid_datu_count[uid]['fail'],
+                uid_watu_count[uid]['succ'], uid_watu_count[uid]['fail']]
+        fobj.write(li2line(line) + '\n')
+    fobj.close()
+
+def dump_obj(fn, obj):
+    with open(fn, 'wb') as fb:
+        cPickle.dump(obj, fb)
+