Source

woocode / py / sa / statistics / parse_log.py

Full commit
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
#!/usr/bin/env python
# -*- encoding:utf-8 -*-


'''
2012-05-03 -- 2012-05-09
uid 登录总数 总挖图次数(包含成功和失败) 总打图次数(包含成功和失败)

ID IP 打图成功 打图失败 挖图成功 挖图失败

'''
import re
import os
import sys
import tarfile
import cPickle
import optparse
from datetime import datetime, timedelta
from collections import defaultdict
from pprint import pprint

from log import get_logger

# Keep the log file next to this script.  ``__file__`` may carry a directory
# prefix (for example when started as ``python some/dir/parse_log.py``), so
# only its base name is appended to the resolved script directory.
cwd = os.path.realpath(os.path.dirname(__file__))
logfile = os.path.join(cwd, os.path.basename(__file__) + '.log')
logger = get_logger('tongji', logfile)

# GB18030-encoded text ("battle failed") that marks a failed dig in the log.
fail_text = u'战斗失败'.encode('gb18030')

# Outcome markers for digs ("wa tu") and fights ("da tu").
# NOTE(review): the dig success marker is matched case-sensitively with a
# lowercase "reward" -- confirm against real log lines.
watu_fail_pat = re.compile(fail_text)
watu_succ_pat = re.compile(r'Get\s+reward')
datu_fail_pat = re.compile(r'Fail:\d+\s+[\'"]?cg_cbt')
datu_succ_pat = re.compile(r'Complete:\d+\s+[\'"]?cg_cbt')

# Content of every "[...]" group on a line.
brackets_pat = re.compile(r'\[(.+?)\]')
# Paths ending in .tar, .tar.gz, .tar.bz / .tar.bzN or .tgz.  Alternation is
# spelled with groups: inside "[...]" the characters "(ar)|(gz)" would only
# form a character class and accept names such as "notes.tags".
tarfile_pat = re.compile(r'\.(?:tar(?:\.(?:gz|bz\d?))?|tgz)$')

ums_pat = re.compile(r'ums=(.+?),')
uid_pat = re.compile(r'uid=(\d+)')
ip_pat = re.compile(r'ip=([\d\.]+)')
# A bracketed timestamp: digits, dashes, colons and whitespace only.
date_pat = re.compile(r'\[([-\d:\s]+)\]')
key_pat = re.compile(r'key=(.+?),')
# Group 1 is the number that follows "Fail:" or "Complete:".  A non-capturing
# group keeps the number in group 1 while requiring the whole keyword.
mission_uid_pat = re.compile(r'(?:Fail|Complete):(\d+)')

# strptime/strftime formats, from full timestamp down to day granularity.
datetime_fmt = '%Y-%m-%d %H:%M:%S'
day_fmt = '%Y-%m-%d'
hour_fmt = '%Y-%m-%d %H'
minute_fmt = '%Y-%m-%d %H:%M'

# Mapping uid --> ums (parse_login keeps the first ums seen for a uid)
UID_UMS_MAP = {}
# Mapping ums --> set of uids seen for that ums
UMS_UID_MAP = defaultdict(set)

# Per-day login counts of every uid: {'day': {'uid': login_count}}
UID_LOGIN = {}

# Per-day login IPs of every ums: {'day': {'ums': ['ip1', 'ip2']}}
# Every login IP is appended in the order the log lines are read, so the
# last list item is the most recently read one.
UMS_LOGIN = {}

# Per-day mini-client logins: {'day': {id: ['ip1', 'ip2']}}
MINICLIENT_LOGIN = {}

# Dig ("wa tu") counters: {'day': {'uid': {'succ': n, 'fail': n}}}
WA_TU = {}
# Fight ("da tu") counters: {'day': {'uid': {'succ': n, 'fail': n}}}
DA_TU = {}

def li2line(x):
    """Render the items of ``x`` as one CSV line (without a line ending).

    Every item is formatted with ``%s`` and wrapped in double quotes; a
    double quote inside an item is doubled so the field stays well formed.
    An empty ``x`` gives an empty string.
    """
    texts = ['%s' % (item,) for item in x]
    return ','.join(['"%s"' % text.replace('"', '""') for text in texts])

def load_obj(fp):
    """Load and return the object pickled in the file at path ``fp``.

    The file is opened in binary mode and closed as soon as the object has
    been read.  Errors from opening or unpickling propagate to the caller.
    """
    logger.info('loading %s', fp)
    # NOTE: unpickling can run arbitrary code -- only load files that were
    # produced by this script.
    with open(fp, 'rb') as fobj:
        return cPickle.load(fobj)

def tongji_login(server_id):
    '''
    Write UMS login statistics of ``server_id`` to CSV files.

    Planned statistics (only the total login count is computed so far):
    1. mini-client logins, counted per UMS; an UMS that logs in several
       times on one day counts once for that day
    2. normal-client logins, counted per UMS with the same once-per-day rule
    3. share of mini-client logins per day

    The days found in ``<server_id>-ums_login.pkl`` are split into the days
    before the cut-off date and the days from the cut-off date on.  Each
    period gets its own file: ``1<server_id>-ums_login.csv`` for the earlier
    period and ``2<server_id>-ums_login.csv`` for the later one.  A file
    holds the summary header, one summary row, a blank line and the header
    of the per-day table.
    '''
    ums_login = load_obj('%s-ums_login.pkl' % server_id)
    csv = '%s-ums_login.csv' % server_id

    # NOTE(review): the cut-off year is 2010 although the sample log lines in
    # this file are from 2012 -- confirm the intended date.
    march = get_dateobj_from_str('2010-03-01', day_fmt)
    before_march = []
    after_march = []
    # Day keys are '%Y-%m-%d' strings, so text order is chronological order.
    for day in sorted(ums_login):
        if get_dateobj_from_str(day, day_fmt) < march:
            before_march.append(day)
        else:
            after_march.append(day)

    header_summary = [s.encode('gb18030') for s in [u'日期', u'SERVER_ID', u'总登录数',
                        u'微端登录数', u'普通版登录数', u'微端登录比例']]
    header_detail = [s.encode('gb18030') for s in [u'日期', u'微端登录数', u'正常版登录数', u'微端比例']]

    for n, period in enumerate([before_march, after_march], 1):
        # One login per UMS and day: a day contributes its number of UMS keys.
        sum_login_counts = sum(len(ums_login[day]) for day in period)
        if period:
            period_label = '%s - %s' % (period[0], period[-1])
        else:
            period_label = ''

        with open(str(n) + csv, 'wb') as fobj:
            fobj.write(li2line(header_summary) + '\n')
            # The three trailing columns (mini-client logins, normal-client
            # logins, mini-client share) are left empty until implemented.
            fobj.write(li2line([period_label, server_id, sum_login_counts,
                                '', '', '']) + '\n')
            fobj.write('\n')
            fobj.write(li2line(header_detail) + '\n')
            # TODO: write one row per day of ``period`` once the mini-client
            # and normal-client counts are defined; this needs the data of
            # ``<server_id>-miniclient_login.pkl``.

def tongji_cangbao(server_id, start_date, end_date):
    '''
    Write the treasure-hunt ("cangbao") report of one server and period.

    Loads the ``<server_id>-*.pkl`` files and writes
    ``<server_id>-<start_date> - <end_date>-result.csv`` containing:

    - a summary row: period, number of logins, number of treasure events,
      number of digs, number of fights (successes plus failures each)
    - a blank line
    - one row per uid that logged in during the period and has at least one
      dig or fight record in it: uid, last known login IP, fight successes,
      fight failures, dig successes, dig failures

    ``start_date`` and ``end_date`` are days in ``%Y-%m-%d`` form; both are
    part of the period.
    '''
    uid_ums_map = load_obj('%s-uid_ums_map.pkl' % server_id)
    watu = load_obj('%s-watu.pkl' % server_id)
    datu = load_obj('%s-datu.pkl' % server_id)
    uid_login = load_obj('%s-uid_login.pkl' % server_id)
    ums_login = load_obj('%s-ums_login.pkl' % server_id)

    date_line = '%s - %s' % (start_date, end_date)
    csv = '%s-%s-result.csv' % (server_id, date_line)

    start_date = get_dateobj_from_str(start_date, day_fmt)
    end_date = get_dateobj_from_str(end_date, day_fmt)

    def filter_by_date(dct, start, end):
        # Return the entries of ``dct`` whose day key lies in [start, end].
        upper = end + timedelta(1)
        kept = {}
        for date, value in dct.items():
            if start <= get_dateobj_from_str(date, day_fmt) < upper:
                kept[date] = value
        return kept

    # Day keys are '%Y-%m-%d' strings, so text order is chronological order.
    login_days_desc = sorted(ums_login, reverse=True)

    def get_ip_by_uid(uid):
        # Return the last IP recorded on the latest day the uid's ums logged
        # in, looking at all loaded days; 'unknown' when nothing is recorded.
        ums = uid_ums_map.get(uid, 'unknown')
        if ums == 'unknown':
            return 'unknown'
        for day in login_days_desc:
            ips = ums_login[day].get(ums)
            if ips:
                return ips[-1]
        return 'unknown'

    def sum_by_uid(per_day):
        # Collapse {day: {uid: {'succ': n, 'fail': n}}} into per-uid counters
        # and one overall counter.
        total = {'fail': 0, 'succ': 0}
        per_uid = {}
        for item in per_day.values():
            for uid, m in item.items():
                counts = per_uid.setdefault(uid, {'fail': 0, 'succ': 0})
                for key in ('fail', 'succ'):
                    counts[key] += m[key]
                    total[key] += m[key]
        return per_uid, total

    _uid_login = filter_by_date(uid_login, start_date, end_date)
    _watu = filter_by_date(watu, start_date, end_date)
    _datu = filter_by_date(datu, start_date, end_date)

    logger.debug('date_line = %s' % date_line)
    # Total number of logins in the period and the uids that logged in.
    login_count = 0
    uids = set()
    for item in _uid_login.values():
        for uid, count in item.items():
            uids.add(uid)
            login_count += count

    logger.info('%s %s --> %d' % (server_id, date_line, len(uids)))

    uid_watu_count, watu_count = sum_by_uid(_watu)
    uid_datu_count, datu_count = sum_by_uid(_datu)

    # The summary columns hold plain totals (successes plus failures).
    watu_total = watu_count['fail'] + watu_count['succ']
    datu_total = datu_count['fail'] + datu_count['succ']
    cangbao_count = watu_total + datu_total

    no_record = {'fail': 0, 'succ': 0}
    with open(csv, 'wb') as fobj:
        # NOTE(review): the second header column reads "number of logged-in
        # UIDs" while the value written is the total number of logins --
        # confirm which one is wanted.
        fobj.write(u'日期,登录UID数,藏宝总数,挖图总数,打图总数\n'.encode('gb18030'))
        sum_line = [date_line, login_count, cangbao_count, watu_total, datu_total]
        fobj.write(li2line(sum_line) + '\n')

        fobj.write('\n')
        fobj.write(u'ID,IP,打图成功,打图失败,挖图成功,挖图失败\n'.encode('gb18030'))

        # Sorted so that repeated runs produce identical files.
        for uid in sorted(uids):
            # Skip players without any dig or fight record in the period.
            if uid not in uid_datu_count and uid not in uid_watu_count:
                continue
            uid_datu = uid_datu_count.get(uid, no_record)
            uid_watu = uid_watu_count.get(uid, no_record)
            line = [uid, get_ip_by_uid(uid),
                    uid_datu['succ'], uid_datu['fail'],
                    uid_watu['succ'], uid_watu['fail']]
            fobj.write(li2line(line) + '\n')

def dump_obj(fn, obj):
    """Pickle ``obj`` into the file at path ``fn``, replacing its content."""
    with open(fn, 'wb') as sink:
        sink.write(cPickle.dumps(obj))

def get_ret(reg, s, gn=1):
    """Return group ``gn`` of the first match of ``reg`` in ``s``.

    ``reg`` is a compiled pattern.  Returns ``None`` when ``reg`` does not
    match anywhere in ``s``.
    """
    found = reg.search(s)
    if not found:
        return None
    return found.group(gn)

def get_dateobj_from_str(s, fmt=datetime_fmt):
    """Return ``s`` as a ``datetime`` object.

    A ``str`` is parsed with ``fmt`` (``datetime.strptime`` raises
    ``ValueError`` when it does not fit); a ``datetime`` is returned as is.
    Any other type raises ``TypeError``.
    """
    if isinstance(s, str):
        return datetime.strptime(s, fmt)
    if isinstance(s, datetime):
        return s
    raise TypeError("datetime type error")

def parse_line(line, pat=None):
    """Apply one or several compiled patterns to ``line``.

    ``pat`` is a compiled pattern or a list/tuple of compiled patterns.

    - no pattern (``None`` or an empty sequence): returns ``[]``
    - exactly one pattern: returns ``pattern.findall(line)``
    - several patterns: returns one item per pattern, the first group of
      its first match in ``line`` or ``None`` when it does not match
    """
    if pat is None:
        return []
    if not isinstance(pat, (list, tuple)):
        pat = [pat]
    if len(pat) == 1:
        return pat[0].findall(line)
    return [get_ret(pa, line) for pa in pat]

def datetime_to_day(date, fmt=datetime_fmt):
    """Return the day (``%Y-%m-%d``) of ``date``.

    ``date`` is a ``datetime`` or a string in format ``fmt``.  Raises
    ``ValueError`` when the string does not fit ``fmt`` and ``TypeError``
    for any other type.
    """
    # ``fmt`` has to be handed on, otherwise a custom format is ignored.
    return get_dateobj_from_str(date, fmt).strftime(day_fmt)

def parse_watu(fobj):
    '''
    Count digs ("wa tu") per day and uid into ``WA_TU``.

    A line matching ``watu_fail_pat`` (the GB18030 text for "battle failed")
    is a failed dig; otherwise a line matching ``watu_succ_pat``
    (``Get reward``) is a successful dig.  The first two ``[...]`` groups of
    such a line are taken as timestamp and uid.  Lines that lack them or
    carry an unparsable timestamp are logged and skipped.
    '''
    def _parse(line, s='fail'):
        # Add one to counter ``s`` of the line's day and uid; returns the
        # new value, or None when the line cannot be used.
        parts = parse_line(line, [brackets_pat])
        if len(parts) < 2 or (not all(parts)):
            logger.error('error line: %s' % line)
            return None
        date, uid = parts[0], parts[1]
        try:
            day = datetime_to_day(date)
        except ValueError:
            # The first bracket group is not a timestamp.
            logger.error('error line: %s' % line)
            return None

        counts = WA_TU.setdefault(day, {}).setdefault(uid, {'succ': 0, 'fail': 0})
        counts[s] += 1
        return counts[s]

    for line in fobj:
        if watu_fail_pat.search(line):
            _parse(line, 'fail')
        elif watu_succ_pat.search(line):
            _parse(line, 'succ')

def parse_datu(fobj):
    '''
    Count fights ("da tu") per day and uid into ``DA_TU``.

    A line matching ``datu_fail_pat`` (``Fail:<number> cg_cbt``) is a failed
    fight; otherwise a line matching ``datu_succ_pat``
    (``Complete:<number> cg_cbt``) is a successful one.  The timestamp comes
    from ``date_pat`` and the uid from ``mission_uid_pat``.  Lines that lack
    either or carry an unparsable timestamp are logged and skipped.
    '''
    def _parse(line, s='fail'):
        # Add one to counter ``s`` of the line's day and uid; returns the
        # new value, or None when the line cannot be used.
        parts = parse_line(line, [date_pat, mission_uid_pat])
        if len(parts) < 2 or (not all(parts)):
            logger.error('error line: %s' % line)
            return None
        date, uid = parts[0], parts[1]
        try:
            day = datetime_to_day(date)
        except ValueError:
            # ``date_pat`` also accepts bracketed text that is no timestamp.
            logger.error('error line: %s' % line)
            return None

        counts = DA_TU.setdefault(day, {}).setdefault(uid, {'fail': 0, 'succ': 0})
        counts[s] += 1
        return counts[s]

    for line in fobj:
        if datu_fail_pat.search(line):
            _parse(line, 'fail')
        elif datu_succ_pat.search(line):
            _parse(line, 'succ')

def parse_minilog(fobj):
    '''
    Collect mini-client logins per day into ``MINICLIENT_LOGIN``.

    The first three ``[...]`` groups of a line are taken as timestamp, id
    and IP; every IP is appended to ``MINICLIENT_LOGIN[day][id]``.  Lines
    with fewer than three groups or an unparsable timestamp are logged and
    skipped.
    '''
    for line in fobj:
        parts = parse_line(line, [brackets_pat])
        if len(parts) < 3 or not all(parts):
            logger.error('error line: %s' % line)
            continue

        date, uid, ip = parts[0], parts[1], parts[2]
        try:
            day = datetime_to_day(date)
        except ValueError:
            # Only a timestamp that does not fit the format is skipped; any
            # other error is a real problem and must not be hidden.
            logger.error('error line: %s' % line)
            continue

        # Keep every login IP, in the order the lines are read.
        MINICLIENT_LOGIN.setdefault(day, defaultdict(list))[uid].append(ip)

def parse_login(fobj):
    '''
    Collect login data from a login log.

    Two kinds of lines are used:

    [2012-04-26 09:04:40] uid enter game vfd=259,ums=fdg1204,uid=1067015
    [2012-04-26 09:04:39] client use loginkey ok vfd=259,ums=0,key=:1036:fdg1204:1919133540226506612:2012-04-26 09:04:38:3555,ip=113.95.229.13

    - "client use loginkey ok": the IP is appended to
      ``UMS_LOGIN[day][ums]``; the ums is the third ``:``-separated field of
      ``key`` because the ``ums=`` field of this line does not carry it
    - "uid enter game": ``UID_LOGIN[day][uid]`` is increased by one and the
      uid/ums pair is recorded in ``UMS_UID_MAP`` and ``UID_UMS_MAP``

    Lines with a missing field, a malformed key or an unparsable timestamp
    are logged together with their line number and skipped.
    '''
    for n, line in enumerate(fobj, 1):
        if 'client use loginkey ok vfd=' in line:
            date = get_ret(date_pat, line)
            key = get_ret(key_pat, line)
            ip = get_ret(ip_pat, line)
            if not all([date, key, ip]):
                logger.error('error line[%d]: %s' % (n, line))
                continue

            key_parts = key.split(':')
            if len(key_parts) < 3:
                # Without this check the IndexError would reach main() and
                # be reported as a usage error.
                logger.error('error line[%d]: %s' % (n, line))
                continue
            try:
                day = datetime_to_day(date)
            except ValueError:
                logger.error('error line[%d]: %s' % (n, line))
                continue

            # The ums is only available inside the key.
            ums = key_parts[2]
            # Keep every login IP, in the order the lines are read.
            UMS_LOGIN.setdefault(day, defaultdict(list))[ums].append(ip)

        elif 'uid enter game vfd=' in line:
            date = get_ret(date_pat, line)
            ums = get_ret(ums_pat, line)
            uid = get_ret(uid_pat, line)
            if not all([date, ums, uid]):
                logger.error('error line[%d]: %s' % (n, line))
                continue
            try:
                day = datetime_to_day(date)
            except ValueError:
                logger.error('error line[%d]: %s' % (n, line))
                continue

            day_logins = UID_LOGIN.setdefault(day, {})
            day_logins[uid] = day_logins.get(uid, 0) + 1

            UMS_UID_MAP[ums].add(uid)
            # The first ums seen for a uid wins.
            UID_UMS_MAP.setdefault(uid, ums)

def get_fobj_from_tarfile(tr):
    """Return a readable file object for the first regular file in ``tr``.

    ``tr`` is the path of a tar archive (compression is detected by
    ``tarfile.open``).  Directories and other non-file members in front of
    the first regular file are skipped.

    Raises ``ValueError`` when the archive holds no regular file.
    """
    tf = tarfile.open(tr)
    # Iterating reads members one by one, so the scan stops at the first hit.
    for member in tf:
        if member.isfile():
            # ``tf`` is left open on purpose: the returned object reads its
            # data from the archive.
            return tf.extractfile(member)
    tf.close()
    raise ValueError('no regular file in archive: %s' % tr)

def cmd_parse(args, opts):
    """parse - parse log directory Usage: <server_id_path> <server_id>
    """
    # The docstring above is printed as help text, so it stays short.
    #
    # args[0] is the directory holding the log files, args[1] the server id
    # used as prefix of the pickle files written at the end.  Missing
    # arguments raise IndexError, which main() turns into the usage text.
    logdir = args[0]
    server_id = args[1]
    # Sorted so that files are processed in the same order on every run.
    for fn in sorted(os.listdir(logdir)):
        fp = os.path.join(logdir, fn)
        if os.path.isdir(fp):
            continue

        # Pick the parser first, so that ignored files are never opened.
        # NOTE: 'cangbao_' files (parse_watu) and 'mission_' files
        # (parse_datu) are not parsed at the moment.
        if 'login_' in fn:
            kind, parser = 'login', parse_login
        elif 'mini' in fn:
            kind, parser = 'miniclient', parse_minilog
        else:
            logger.info('ignore %s' % fp)
            continue

        if tarfile_pat.search(fp):
            fobj = get_fobj_from_tarfile(fp)
        else:
            fobj = open(fp)
        try:
            logger.info('parsing %s, enter %s logic' % (fn, kind))
            parser(fobj)
        finally:
            # Close the log even when the parser fails.
            fobj.close()

    dump_obj('%s-uid_ums_map.pkl' % server_id, UID_UMS_MAP)
    dump_obj('%s-ums_uid_map.pkl' % server_id, UMS_UID_MAP)
    dump_obj('%s-watu.pkl' % server_id, WA_TU)
    dump_obj('%s-datu.pkl' % server_id, DA_TU)
    dump_obj('%s-uid_login.pkl' % server_id, UID_LOGIN)
    dump_obj('%s-ums_login.pkl' % server_id, UMS_LOGIN)
    dump_obj('%s-miniclient_login.pkl' % server_id, MINICLIENT_LOGIN)

def cmd_tongji_cangbao(args, opts):
    '''output log statistics'''
    # Writes one report per server and period by calling tongji_cangbao();
    # ``args`` and ``opts`` are not used.  The docstring above is printed as
    # help text.
    #
    # NOTE(review): 2012-05-24 is covered by none of the periods -- confirm
    # that the gap is intended.
    periods = [
        ('2012-05-03', '2012-05-09'),
        ('2012-05-10', '2012-05-16'),
        ('2012-05-17', '2012-05-23'),
        ('2012-05-25', '2012-05-31'),
        ('2012-06-01', '2012-06-07'),
        ('2012-06-08', '2012-06-14'),
    ]

    # Servers 1036 and 1037 get every period, server 1052 all but the first.
    jobs = [(server, period) for server in ('1036', '1037') for period in periods]
    jobs.extend([('1052', period) for period in periods[1:]])

    for server, (first_day, last_day) in jobs:
        tongji_cangbao(server, first_day, last_day)

def cmd_help(args, opts):
    """help - list available commands"""

    print "Available commands:"
    for _, func in sorted(get_commands().items()):
        print "   ", func.__doc__

def get_commands():
    """Return the mapping of command name to the function implementing it."""
    return dict(
        help=cmd_help,
        parse=cmd_parse,
        tongji_cangbao=cmd_tongji_cangbao,
    )

def parse_opts():
    """Parse the command line and return ``(command, args, opts)``.

    ``command`` is the function registered for the first positional
    argument, ``args`` are the remaining positional arguments and ``opts``
    the parsed options.

    Exits with status 2 after printing the help when no command is given,
    and with status 1 after listing the commands when the command is not
    known.
    """
    usage = "%prog [options] <command> [arg] ..."
    # The trailing space keeps the two sentence halves apart once the
    # adjacent literals are joined.
    description = (u"Log parse, statistics too. Use `%prog help` "
        "to see the list of available commands.")
    op = optparse.OptionParser(usage=usage, description=description)
    opts, args = op.parse_args()
    if not args:
        op.print_help()
        sys.exit(2)
    cmdname, cmdargs = args[0], args[1:]
    commands = get_commands()
    if cmdname not in commands:
        print >> sys.stdout, "Unknown command: %s\n\n" % cmdname
        cmd_help(None, None)
        sys.exit(1)
    return commands[cmdname], cmdargs, opts

def main():
    cmd, args, opts = parse_opts()
    try:
        cmd(args, opts)
    except IndexError:
        print cmd.__doc__

# Run the command-line interface only when the file is executed as a script.
if __name__ == '__main__':
    main()