Source

woocode / py / sa / sum.py

#!/usr/bin/env python
# -*- encoding:utf-8 -*-
import tarfile
import re
import sys
import os
import datetime
from collections import defaultdict

from utils import dump_obj
from pprint import pprint

def get_logobj_from_tarfile(filename):
    """Yield a LogLineGenerator for every readable member of a tar archive.

    Args:
        filename: Path of the tar archive to open.

    Yields:
        A LogLineGenerator wrapping the extracted file object of each member
        that carries data.
    """
    # The archive is left open: the yielded objects read from it lazily, so
    # closing it here would invalidate them for callers that collect the
    # generators before reading.
    tf = tarfile.open(filename)
    # getmembers() forces the complete archive index to be read. The
    # undocumented ``members`` attribute is only filled as the archive is
    # scanned, so iterating it directly can miss members.
    for member in tf.getmembers():
        # if member.name == 'mini_stat_access.run':
        extracted = tf.extractfile(member)
        # extractfile() returns None for members without data (directories,
        # device nodes, ...); there is nothing to read from those.
        if extracted is None:
            continue
        yield LogLineGenerator(extracted)

class LogLineGenerator(object):
    """Wrap a log source and expose its lines with surrounding whitespace removed.

    Args:
        fobj: Either a path (``str``), which is opened for reading, or an
            already opened file-like object (anything exposing ``read``),
            such as a regular file or the object returned by
            ``tarfile.TarFile.extractfile``.
        log_regpat: Optional value stored on the instance as ``log_regpat``;
            nothing in this class reads it.

    Raises:
        TypeError: If ``fobj`` is neither a path string nor a file-like object.
    """

    def __init__(self, fobj, log_regpat=None):
        self.log_regpat = log_regpat
        if isinstance(fobj, str):
            self.fobj = open(fobj)
        elif hasattr(fobj, 'read'):
            # Duck-typed check instead of isinstance against concrete file
            # classes: it accepts every file-like object (plain files, tar
            # members, StringIO, ...) and does not depend on the ``file``
            # builtin, which only exists on Python 2.
            self.fobj = fobj
        else:
            raise TypeError('fobj only support file object or file path. Got: %r' % type(fobj))

    def get_loglines(self):
        """Yield each line of the wrapped file, stripped of leading/trailing whitespace."""
        for line in self.fobj:
            yield line.strip()

def get_ret(pat, s):
    """Return the first capture group of ``pat`` found in ``s``.

    ``pat`` is a compiled regular expression with at least one group.
    When the pattern does not occur in ``s`` the result is None.
    """
    found = pat.search(s)
    return found.group(1) if found else None

def _parse_log_time(text, time_format='%Y-%m-%d %H:%M:%S'):
    """Return ``text`` parsed as a datetime, or None if it does not match ``time_format``."""
    try:
        return datetime.datetime.strptime(text, time_format)
    except ValueError:
        return None


def main():
    """Build per-day login statistics from the tar archives in a directory.

    The directory is taken from the first command-line argument. Archives
    whose name contains ``login_`` are scanned for 'uid enter game' lines;
    archives whose name contains ``mini_client`` are scanned for lines with
    at least two bracketed fields (timestamp, uid). Only records between
    ``start_day`` and ``end_day`` are kept.

    For every day the result holds the ums values seen only in normal
    logins (``nor_ums``), the ums values whose uid also appears in the
    mini-client log (``mini_ums``) and the share of the latter
    (``mini_percent``). The result is pretty-printed and handed to
    ``dump_obj`` together with the name 'obj.pkl'.
    """
    start_day = datetime.datetime(2012, 5, 1)
    # end_day is midnight, so records later on 2012-06-13 are ignored too.
    end_day = datetime.datetime(2012, 6, 13)
    day_format = '%Y-%m-%d'

    tar_dir = sys.argv[1]
    all_login_logfiles = []
    mini_login_logfiles = []
    for name in os.listdir(tar_dir):
        path = os.path.join(tar_dir, name)
        # Two independent checks: a name matching both markers belongs to
        # both lists.
        if 'login_' in name:
            all_login_logfiles.append(path)
        if 'mini_client' in name:
            mini_login_logfiles.append(path)

    # Records are bucketed by calendar day so that logins and mini-client
    # logins from the same day are compared with each other (keying on the
    # full timestamp would only match records from the same second).
    # All login information: day -> set of (uid, ums) pairs.
    all_login_info = defaultdict(set)
    # Mini-client logins: day -> set of uids.
    mini_login_info = defaultdict(set)
    date_pat = re.compile(r'\[([-\d:\s]+)\]')
    ums_pat = re.compile(r'ums=(\w+?),')
    uid_pat = re.compile(r'uid=(\d+)')

    for logfile in all_login_logfiles:
        for logobj in get_logobj_from_tarfile(logfile):
            for line in logobj.get_loglines():
                if 'uid enter game' not in line:
                    continue
                stamp = date_pat.search(line)
                dte_obj = _parse_log_time(stamp.group(1)) if stamp else None
                if dte_obj is None:
                    # No usable timestamp on this line; it cannot be
                    # assigned to a day.
                    continue
                if dte_obj < start_day or dte_obj > end_day:
                    print('ignore %s' % dte_obj)
                    continue
                # A set de-duplicates repeated (uid, ums) pairs for free.
                all_login_info[dte_obj.strftime(day_format)].add(
                    (get_ret(uid_pat, line), get_ret(ums_pat, line)))

    print('all login info finished')
    mini_pat = re.compile(r'\[(.+?)\]')
    for logfile in mini_login_logfiles:
        for logobj in get_logobj_from_tarfile(logfile):
            for line in logobj.get_loglines():
                parts = mini_pat.findall(line)
                if len(parts) < 2:
                    # Need both the timestamp and the uid field.
                    continue
                dte_obj = _parse_log_time(parts[0])
                if dte_obj is None:
                    continue
                if dte_obj < start_day or dte_obj > end_day:
                    print('ignore %s' % dte_obj)
                    continue
                mini_login_info[dte_obj.strftime(day_format)].add(parts[1])

    print('mini login info finished')
    # {day: {'nor_ums': set, 'mini_ums': set, 'mini_percent': float}}
    statistics = {}
    for day, logins in all_login_info.items():
        # .get() avoids creating empty entries in the defaultdict.
        mini_uids = mini_login_info.get(day, ())
        all_ums = set(ums for _, ums in logins)
        mini_ums = set(ums for uid, ums in logins if uid in mini_uids)
        normal_ums = all_ums - mini_ums

        statistics[day] = {
            'nor_ums': normal_ums,
            'mini_ums': mini_ums,
            # all_ums is never empty: a day only exists once a pair was added.
            'mini_percent': len(mini_ums) * 1.0 / len(all_ums),
        }

    pprint(statistics)
    dump_obj('obj.pkl', statistics)

# Run the report only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()