#!/usr/bin/env python

# Much of the coroutine stuff inspired and/or stolen from Chapter 6 of "Python
# Essential Reference, Fourth Edition."
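#
# Usage:
#   loganalyze.py LOGFILE [LOGFILE ...]       (logs may be plain text or .gz)
#
# Parsing drops you at an interactive prompt; "topurls" is the only command
# wired up so far:
#
#   LogAnalyzer> topurls 10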

import os
import sys
import re
import subprocess
import readline  # imported for its side effect: line editing/history in raw_input()

from datetime import datetime, timedelta

# XXX: Debug
import pdb
from pprint import pprint as pp

# Base class for the analyses below: a dict mapping each url to a stats dict.
# _sort() returns the urls ordered by ascending hit count, so the busiest
# urls end up at the end of the returned list.
class UrlAggregator(dict):
    def _sort(self):
        return sorted(self.keys(), key=lambda url: self[url]['count'])

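# "topurls" command: counts hits per url within the analyzer's time window
# and prints the N most-requested urls.  Counts are cached, so repeating the
# command over the same window doesn't rescan the parsed data.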
class TopUrls(UrlAggregator):
    def __init__(self, log_analyzer):
        self._log_analyzer = log_analyzer
        self._start = None
        self._end = None

    def __call__(self, start, end, num_urls):
        num_urls = int(num_urls)
        if self._start == start and self._end == end:
            # Same window as last time: the cached counts are still valid.
            self.output(num_urls)
        else:
            self._start = start
            self._end = end
            self.clear()  # drop counts left over from a previous window
            for datetimeobj, url, ms, ip in self._log_analyzer.log_data:
                if start <= datetimeobj <= end:
                    self.add(datetimeobj, url, ms, ip)
            self.output(num_urls)

    def add(self, datetimeobj, url, ms, ip):
        url_dict = self.setdefault(url, {'count': 0})
        url_dict['count'] += 1
        
    def output(self, num_urls):
        url_list = self._sort()
        if url_list:
            # _sort() is ascending, so the last num_urls entries are the
            # most-requested urls
            for url in url_list[-num_urls:]:
                print "%6d %-40s" % (self[url]['count'], url)

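# Time-of-day distribution: for each url, count hits in fixed-width buckets
# of `period` seconds spanning the analyzer's whole time window (a linear
# quantization, in the spirit of dtrace's lquantize()).  Defined but not yet
# registered in LogAnalyzer.cmd_map.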
class LinearQuantizeTimeOfDay(UrlAggregator):
    def __init__(self, period, start, end):
        self._time_period = end - start
        self._period = int(period)
        total_seconds = self._time_period.seconds + self._time_period.days * 24 * 3600
        # Round up so a trailing partial bucket still catches the last entries
        self._num_buckets = (total_seconds + self._period - 1) / self._period
        self._start = start
        self._end = end

    def add(self, datetimeobj, url, ms, ip):
        url_dict = self.setdefault(url, {'count': 0,
                                         'buckets': [0] * self._num_buckets})
        url_dict['count'] += 1
        # Index straight into the right bucket instead of scanning them all
        offset = datetimeobj - self._start
        bucket_idx = (offset.seconds + offset.days * 24 * 3600) / self._period
        if 0 <= bucket_idx < self._num_buckets:
            url_dict['buckets'][bucket_idx] += 1

    def output(self):
        url_list = self._sort()
        if url_list:
            # only want distributions for the top 5 urls
            for url in url_list[-5:]:
                print "%-60s %6d\n" % (url, self[url]['count'])
                for bucket_idx in range(self._num_buckets):
                    bucket = self[url]['buckets'][bucket_idx]
                    bucket_start = self._start + timedelta(seconds=self._period*bucket_idx)
                    bucket_end = self._start + timedelta(seconds=(self._period*(bucket_idx+1))-1)
                    print "\t%s - %s: %d" % (bucket_start, bucket_end, bucket)
                print

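# Response-time distribution: for each url, histogram the request times into
# ms_step-millisecond buckets.  add() takes the same (datetimeobj, url, ms,
# ip) shape as LogAnalyzer.log_data.  Also not yet registered in cmd_map.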
class LinearQuantizeSpeed(UrlAggregator):
    def __init__(self, ms_step):
        self._ms_step = int(ms_step)

    def add(self, datetimeobj, url, ms, ip):
        url_dict = self.setdefault(url, {'count': 0, 'buckets': []})
        url_dict['count'] += 1

        # Grow the bucket list on demand: it only needs to be as long as the
        # slowest request seen so far for this url
        bucket_idx = ms / self._ms_step
        num_buckets = bucket_idx + 1
        bucket_list_len = len(url_dict['buckets'])
        if bucket_list_len < num_buckets:
            url_dict['buckets'].extend([0] * (num_buckets - bucket_list_len))

        url_dict['buckets'][bucket_idx] += 1

    def output(self):
        url_list = self._sort()
        if url_list:
            # only want distributions for the top 5 urls
            for url in url_list[-5:]:
                print "%-60s\n" % (url)
                url_dict = self[url]
                for bucket_idx in range(len(url_dict['buckets'])):
                    print "%5d ms: %d" % ((bucket_idx+1)*self._ms_step, url_dict['buckets'][bucket_idx])
                print
                
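# Placeholder for a non-linear response-time distribution (presumably a
# power-of-two histogram in the spirit of dtrace's quantize()); nothing is
# implemented yet.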
class QuantizeSpeed(UrlAggregator):
    def __init__(self):
        pass

    def add(self, datetimeobj, url, ms, ip):
        pass

    def output(self):
        pass

class LogAnalyzer(object):

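    # Maps the abbreviated month names found in the log timestamps to the
    # month numbers datetime() expects.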
    month_map = {
        "Jan": 1,
        "Feb": 2,
        "Mar": 3,
        "Apr": 4,
        "May": 5,
        "Jun": 6,
        "Jul": 7,
        "Aug": 8,
        "Sep": 9,
        "Oct": 10,
        "Nov": 11,
        "Dec": 12
    }


    def __init__(self):
        self.log_data = []
        self.start = None
        self.end = None

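        # Command dispatch table for the interactive prompt.  Only "topurls"
        # is wired up; the LinearQuantize* aggregators above aren't
        # registered here yet.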
        self.cmd_map = {
            "topurls": TopUrls(self)
        }

    def cmd_loop(self):
        while True:
            try:
                line = raw_input("\rLogAnalyzer> ")
                if not line.strip():
                    continue
                # First word selects the command; the rest are its arguments
                parts = line.split()
                cmd = self.cmd_map.get(parts[0].lower())
                if cmd:
                    try:
                        cmd(self.start, self.end, *parts[1:])
                    except (TypeError, ValueError):
                        print "Bad arguments for command: %s" % line
                else:
                    print "Unknown Command: %s" % line
            except EOFError:
                break

    def parse_logfile(self, path):
        # Pre-filter with grep so Python only ever sees the "request end"
        # lines, decompressing on the fly if the log is gzipped.  Argument
        # lists are built directly (no shell) so paths with spaces survive.
        cmds = []
        if path.endswith(".gz"):
            cmds.append(["gzip", "-dc", path])
            cmds.append(["grep", "request end"])
        else:
            cmds.append(["grep", "request end", path])

        # Chain the commands into a pipeline, each stage reading the
        # previous stage's stdout
        procs = []
        stdin = None
        for cmd in cmds:
            p = subprocess.Popen(cmd, stdin=stdin, stdout=subprocess.PIPE)
            stdin = p.stdout
            procs.append(p)

        # Consume the output of the final stage of the pipeline
        for line in procs[-1].stdout:
            self._process_line(line)

        for p in procs:
            p.wait()

    def _process_line(self, line):
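        # Timestamps are read from fixed character positions, assuming each
        # line begins with a "[26/Apr/2011:12:34:56"-style stamp; the url,
        # response time (ms), and client ip are taken from fixed
        # space-separated fields further along the line.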
        year   = int(line[8:12])
        month  = self.month_map[line[4:7]]
        day    = int(line[1:3])
        hour   = int(line[13:15])
        minute = int(line[16:18])
        second = int(line[19:21])
        datetimeobj = datetime(year, month, day, hour, minute, second)

        line = self._fixup_urls(line)
        line_elements = line.split(' ')
        url = line_elements[8]
        ip = line_elements[11]
        ms = int(line_elements[9])

        # Record the data for later analysis
        self.log_data.append((datetimeobj, url, ms, ip))

        # Track the overall time window covered by the parsed logs
        if self.start is None or datetimeobj < self.start:
            self.start = datetimeobj
        if self.end is None or datetimeobj > self.end:
            self.end = datetimeobj

    def _fixup_urls(self, line):
        # Strip volatile query parameters (jQuery request ids and timestamp
        # cache-busters) so hits on the same resource aggregate together
        line = re.sub(r'&uqid=jQuery[0-9_]+', '', line)
        line = re.sub(r'&ts=\d+', '', line)
        line = re.sub(r'&_=\d+', '', line)
        return line

def main():
    log_analyzer = LogAnalyzer()
    for path in sys.argv[1:]:
        print "Parsing log file: %s  " % os.path.basename(path),
        start = datetime.now()
        log_analyzer.parse_logfile(path)
        end = datetime.now()
        print "DONE (%s)" % (end - start)
    log_analyzer.cmd_loop()

if __name__ == "__main__":
#   import cProfile
#   cProfile.run('main()')
    main()

# vim: et sw=4 ts=4 softtabstop=4 foldmethod=expr tw=100