Source

logparser / do_statistics.py

#!/usr/bin/env python
# -*- encoding:utf-8 -*-

import sys
import os
import re
import cPickle
import datetime
import time

from collections import defaultdict
from pprint import pprint

'''
Log statistics.
'''

# Compute the micro-client login ratio across all servers before March and after March, counted by ums

def load_obj(pkl):
    """Load and return the object pickled in the file ``pkl``.

    ``pkl`` is a path to a pickle file; it is opened in binary mode and the
    handle is closed as soon as the object has been read.

    Raises whatever ``open`` or ``cPickle.load`` raise (missing file,
    truncated or corrupt pickle).
    """
    # SECURITY: unpickling can execute arbitrary code. Only point this at
    # pickle files produced by a trusted source.
    with open(pkl, 'rb') as fh:
        return cPickle.load(fh)

def load_obj_new(pkl):
    """Load the pickled mapping ``pkl`` and merge in ``new/<pkl>`` if present.

    The base file is always loaded. When a file with the same name exists
    under the ``new/`` directory (relative to the current working directory),
    every key of that second mapping which is missing from the base mapping
    is copied over. Keys already present in the base mapping keep their base
    value. Progress messages are printed to stdout.

    Returns the (possibly extended) base mapping.
    """
    # SECURITY: unpickling can execute arbitrary code. Only point this at
    # pickle files produced by a trusted source.
    print('loading %s' % pkl)
    with open(pkl, 'rb') as fh:
        obj = cPickle.load(fh)

    new_path = 'new/%s' % pkl
    if os.path.exists(new_path):
        print('loading new %s' % pkl)
        with open(new_path, 'rb') as fh:
            new = cPickle.load(fh)
        for key in new:
            # Only fill gaps; never overwrite data from the base file.
            if key not in obj:
                print('update %s' % key)
                obj[key] = new[key]
    return obj

def str2date(s):
    """Parse a ``YYYY-MM-DD`` string and return it as a ``datetime.date``.

    Raises ``ValueError`` when ``s`` does not match that format.
    """
    parsed = datetime.datetime.strptime(s, '%Y-%m-%d')
    return parsed.date()

def login_stat_by_week(server_id):
    """Count mini-client and normal-client logins per week for one server.

    Reads ``<server_id>-miniclient_login.pkl``, ``<server_id>-uid_ums_map.pkl``
    and ``<server_id>-uid_login.pkl`` through ``load_obj_new``.

    Every uid that appears in any entry of the mini-client login data is
    treated as a mini-client user. Each (date, uid) entry of the uid login
    data then adds one to the counter of the week the date falls in, under
    ``'mini_ums_login'`` when the uid is a mini-client user and under
    ``'norm_ums_login'`` otherwise. The per-uid value stored in the login
    data is not used.

    Returns ``(server_id, stats)`` where ``stats`` maps the week number
    string (``strftime('%W')``) to a dict holding whichever of the two
    counters were incremented. The week key carries no year, so dates from
    different years that share a week number land in the same bucket.
    """
    mini_login_stat = load_obj_new('%s-miniclient_login.pkl' % server_id)
    # NOTE(review): the uid -> ums map is not needed for the weekly counts.
    # The load is kept so the function still reads (and requires) the same
    # files as before; consider dropping it.
    load_obj_new('%s-uid_ums_map.pkl' % server_id)
    uid_login = load_obj_new('%s-uid_login.pkl' % server_id)

    # Collect every uid seen in the mini-client login data.
    mini_uids = set()
    for day in mini_login_stat:
        mini_uids.update(mini_login_stat[day].keys())

    # Drop ids that are empty or not purely numeric. isdigit() must be
    # called: the bare bound method is always truthy, which would let every
    # non-empty id through. ''.isdigit() is False, so empty ids are covered.
    err_ids = [u for u in mini_uids if not u.isdigit()]
    if err_ids:
        print('Got erro ids %s' % err_ids)
        mini_uids.difference_update(err_ids)

    stats = defaultdict(dict)
    for date in uid_login:
        uids = uid_login[date]
        if not uids:
            # Do not create an empty bucket for a day without logins.
            continue
        week = str2date(date).strftime('%W')
        bucket = stats[week]
        for uid in uids:
            # A uid found in the mini id set counts as a mini-client login.
            key = 'mini_ums_login' if uid in mini_uids else 'norm_ums_login'
            # Start from 0 so the first login of a week counts once.
            bucket[key] = bucket.get(key, 0) + 1

    return (server_id, stats)

# Count the mini-client / normal-client logins of each server
def login_stat(server_id, start_date, end_date):
    """Count distinct daily ums logins for one server within a date range.

    Reads ``<server_id>-miniclient_login.pkl``, ``<server_id>-uid_ums_map.pkl``
    and ``<server_id>-uid_login.pkl`` through ``load_obj``.

    Only dates with ``start_date <= date < end_date`` are considered (both
    bounds are compared against the ``datetime.date`` returned by
    ``str2date``). For each such date, every uid is mapped to its ums (or to
    ``'unknown_uid-<uid>'`` when the uid is not in the map) and the ums is
    added to that date's mini-client set when the uid appears in the
    mini-client login data, otherwise to that date's normal-client set.

    Returns ``(server_id, (mini_count, norm_count))`` where each count is
    the sum over dates of the number of distinct ums values in that set.
    """
    mini_login_stat = load_obj('%s-miniclient_login.pkl' % server_id)

    # Collect every uid seen in the mini-client login data.
    mini_uids = set()
    for day in mini_login_stat:
        mini_uids.update(mini_login_stat[day].keys())

    # Drop ids that are empty or not purely numeric. isdigit() must be
    # called: the bare bound method is always truthy, which would let every
    # non-empty id through. ''.isdigit() is False, so empty ids are covered.
    err_ids = [u for u in mini_uids if not u.isdigit()]
    if err_ids:
        print('Got erro ids %s' % err_ids)
        mini_uids.difference_update(err_ids)

    uid_ums_map = load_obj('%s-uid_ums_map.pkl' % server_id)
    uid_login = load_obj('%s-uid_login.pkl' % server_id)

    mini_ums_login_stat = defaultdict(set)
    norm_ums_login_stat = defaultdict(set)

    for date in uid_login:
        if not (start_date <= str2date(date) < end_date):
            continue
        for uid in uid_login[date]:
            # A uid found in the mini id set marks its ums as a mini-client
            # login for that date.
            ums = uid_ums_map.get(uid, 'unknown_uid-%s' % uid)
            if uid in mini_uids:
                mini_ums_login_stat[date].add(ums)
            else:
                norm_ums_login_stat[date].add(ums)

    mini_ums_login_counts = sum(len(v) for v in mini_ums_login_stat.values())
    norm_ums_login_counts = sum(len(v) for v in norm_ums_login_stat.values())

    return (server_id, (mini_ums_login_counts, norm_ums_login_counts))

def stat(server_ids, weeks=('24', '25', '26')):
    """Aggregate weekly login counts over several servers into a CSV file.

    Calls ``login_stat_by_week`` for every id in ``server_ids``, sums the
    ``'mini_ums_login'`` and ``'norm_ums_login'`` counters per week across
    all servers, and writes ``<id1>-<id2>-...-result.csv`` in the current
    directory with one row per week, ordered by week number.

    ``server_ids`` is a sequence of server id strings (they are joined with
    ``'-'`` to build the file name).
    ``weeks`` is a collection of week number strings (as produced by
    ``strftime('%W')``) to include; any other week is skipped with an
    ``ignore`` message. The default keeps weeks 24, 25 and 26.

    Each row holds the week, the mini-client count followed by its share of
    all logins as a truncated integer percentage, and the normal-client
    count. Returns ``None``.
    """
    per_server = [login_stat_by_week(i) for i in server_ids]

    totals = {}
    for _id, data in per_server:
        for week in sorted(data):
            if week not in weeks:
                print('ignore %s' % week)
                continue
            bucket = totals.setdefault(
                week, {'mini_ums_login': 0, 'norm_ums_login': 0})
            bucket['mini_ums_login'] += data[week].get('mini_ums_login', 0)
            bucket['norm_ums_login'] += data[week].get('norm_ums_login', 0)

    # 'with' guarantees the file is closed even if writing a row fails.
    with open('%s-result.csv' % '-'.join(server_ids), 'wb') as fobj:
        fobj.write('weeknum,mini_ums_login (%),norm_ums_login\n')
        # Sorted so the row order does not depend on dict ordering.
        for week in sorted(totals):
            _mini = totals[week]['mini_ums_login']
            _norm = totals[week]['norm_ums_login']
            total = _mini + _norm
            # Integer floor division gives the exact truncated percentage
            # (a float product can fall just below the true value) and the
            # guard avoids dividing by zero for an empty week.
            percent = _mini * 100 // total if total else 0
            _mini_perc = "%s (%d%%)" % (_mini, percent)
            fobj.write('"%s","%s","%s"\n' % (week, _mini_perc, _norm))

def main():
    """Write the weekly login CSV for the old and for the new server group.

    The two groups are fixed lists of server ids; ``stat`` is called once
    per group (old group first) and produces one result file for each.
    """
    new_server_ids = ['1054', '1053', '1052', '1051', '1050']
    old_server_ids = ['1049', '1048', '1037', '1036', '1034']
    stat(old_server_ids)
    stat(new_server_ids)

# Run the report only when this file is executed as a script.
if __name__ == '__main__':
    main()