Source

weblog / combined.py

#!/usr/bin/env python

'''
Combined Web log parsing class.

Contents:

- Parser: logfile parser class
  log = weblog.combined.Parser(log filehandle)
  - methods:
	log.getlogent()
  - attributes:
  	log.buffer - buffer size (default 512k)
  - read-only attributes:
	log.client
	log.ident
	log.authuser
	log.utime
	log.method
	log.url
	log.proto
	log.status
	log.bytes
	log.referer
	log.agent
	log.num_processed - number of raw lines seen
	log.num_error - number of errors seen

- test: test function
'''


# (c) 1998 Copyright Mark Nottingham
# <mnot@pobox.com>
#
# This software may be freely distributed, modified and used, 
# provided that this copyright notice remain intact.
#
# This software is provided 'as is' without warranty of any kind.


# Combined Logfile Format
# -----------------------
#
# host rfc931 authuser [DD/Mon/YYYY:hh:mm:ss +zzzz] "request" ddd bbbb "referer" "agent"
#
# rfc931: identd info, - otherwise
# authuser: user id if http authenticated, - otherwise
# ddd: the status code returned by the server, - if not available
# bbbb: the total number of bytes sent, not including header, - if not available


__version__ = '1.0'


from time import mktime
from regex import symcomp
from string import atoi, split, rstrip
import sys

# Fallback pattern for lines the whitespace split cannot handle. Compiled
# with symcomp so each \(<name>...\) group can be fetched by name through
# _patgroup('name').
# NOTE(review): the trailing `\|\(-\)[\r\n ]*$` looks intended as an
# alternative to the quoted agent field only, but `\|` presumably binds
# looser than concatenation, which would make it a second top-level
# alternative (a line starting with "-") and leave the first alternative
# without an end anchor - confirm against the regex module's syntax.
_pattern = symcomp('^\(<client>[^ ]+\) \(<ident>[^ ]+\) \(<authuser>[^\[\n]+\) \[\(<mday>[0-9]+\)\/\(<mon_name>\w+\)\/\(<year>[0-9]+\):\(<hour>[0-9]+\):\(<min>[0-9]+\):\(<sec>[0-9]+\) \(<timediff>[^ ]+\)\] "\(<method>[^ ]+\) \(<url>[^ ]+\) \(<proto>[^ ]+\)" \(<status>[^ ]+\) \(<bytes>[^ ]+\) "\(<refer>[^ ]+\)" "\(<agent>[^"]+\)"\|\(-\)[\r\n ]*$') 
# Bound methods of the single shared pattern object: the groups read through
# _patgroup are those of the most recent _patmatch call.
_patmatch = _pattern.match
_patgroup = _pattern.group

_monlist = {	"Jan": 1,
				"Feb": 2,
				"Mar": 3,
				"Apr": 4,
				"May": 5,
				"Jun": 6,
				"Jul": 7,
				"Aug": 8,
				"Sep": 9,
				"Oct": 10,
				"Nov": 11,
				"Dec": 12 
			}


class Parser:
	'''
	Combined Web Logfile Parser.

	Wraps an open log file and exposes one entry at a time: every successful
	call to getlogent() overwrites the public attributes (client, ident,
	authuser, utime, method, url, proto, status, bytes, referer, agent)
	with the fields of the next parseable line.

	num_processed counts every raw line read; num_error counts the lines
	that neither the whitespace split nor the regex fallback could parse.
	'''

	def __init__(self, file_descriptor):
		''' file_descriptor: open file-like object providing readlines(sizehint). '''
		self.num_processed = 0
		self.num_error = 0
		self._fd = file_descriptor
		self._lines = []			# lines of the current read-ahead chunk
		self._lines_num = 0			# len(self._lines)
		self._lines_index = 0		# next unread position in self._lines
		self.buffer = 1024 * 512	# size hint handed to readlines()
		self.client = ''
		self.ident = ''
		self.authuser = ''
		self.utime = 0
		self.method = ''
		self.url = ''
		self.proto = ''
		self.status = 0
		self.bytes = 0
		self.referer = ''
		self.agent = ''


	def getlogent(self):
		'''
		Advance to the next parseable log line and populate the attributes.

		Returns 1 when an entry was loaded and 0 once the input is exhausted.
		Lines that cannot be parsed are skipped; each one increments
		num_error, whichever parsing stage rejected it.
		'''

		while 1:	# loop until we find a valid line, or end

			### this is the buffering for readline()
			if self._lines_index >= self._lines_num:
				self._lines_index = 0
				self._lines = self._fd.readlines(self.buffer)
				self._lines_num = len(self._lines)
				if self._lines_num == 0: return 0
			line = self._lines[self._lines_index]
			self._lines_index = self._lines_index + 1

			self.num_processed = self.num_processed + 1

			# cheap whitespace split first; the regex only runs if that fails
			if self._run_split(line) or self._run_regex(line):
				return 1
			self.num_error = self.num_error + 1


	def _atoi_or_zero(self, text):
		''' Convert text to an integer; return 0 for non-numeric text such as "-". '''

		try:
			return atoi(text)
		except ValueError:
			return 0


	def _run_split(self, line):
		'''
		Try to parse the line by splitting it on whitespace; return 1 if
		successful, 0 (leaving the attributes untouched) otherwise.
		'''

		n = split(line, None, 11)	# 12 fields expected; the agent keeps its spaces
		if len(n) != 12:
			return 0

		stamp = n[3]	# '[DD/Mon/YYYY:hh:mm:ss' - n[4], the zone offset, is not used
		try:
			# mktime takes one 9-tuple; the -1s leave weekday, yearday and DST unspecified
			utime = mktime((	atoi(stamp[8:12]),
								_monlist[stamp[4:7]],
								atoi(stamp[1:3]),
								atoi(stamp[13:15]),
								atoi(stamp[16:18]),
								atoi(stamp[19:21]),
								-1, -1, -1
							))
		except (ValueError, KeyError, OverflowError):
			return 0	# timestamp is not at the fixed offsets; caller falls back to the regex

		self.utime = utime
		self.client = n[0]
		self.ident = n[1]
		self.authuser = n[2]
		self.method = n[5][1:]		# drop the opening quote of the request
		self.url = n[6]
		self.proto = n[7][:-1]		# drop the closing quote of the request
		self.status = self._atoi_or_zero(n[8])
		self.bytes = self._atoi_or_zero(n[9])
		self.referer = n[10][1:-1]	# strip the first and last character (the quotes)
		self.agent = (rstrip(n[11]))[1:-1]
		return 1


	def _run_regex(self, line):
		'''
		Try to parse the line with a regex; return 1 if successful, 0
		(leaving the attributes untouched) otherwise.
		'''

		if _patmatch(line) == -1: return 0		# no match

		try:
			utime = mktime((	atoi(_patgroup('year')),
								_monlist[_patgroup('mon_name')],
								atoi(_patgroup('mday')),
								atoi(_patgroup('hour')),
								atoi(_patgroup('min')),
								atoi(_patgroup('sec')),
								-1, -1, -1
							))
		except (ValueError, KeyError, TypeError, OverflowError):
			# unknown month name, or a match in which the date groups did not
			# take part: report the line as unparseable instead of raising
			return 0

		self.utime = utime
		self.client = _patgroup('client')
		self.ident = _patgroup('ident')
		self.authuser = _patgroup('authuser')
		self.method = _patgroup('method')
		self.url = _patgroup('url')
		self.proto = _patgroup('proto')
		self.status = self._atoi_or_zero(_patgroup('status'))
		self.bytes = self._atoi_or_zero(_patgroup('bytes'))
		self.referer = _patgroup('refer')
		self.agent = _patgroup('agent')
		return 1



			
def test():
	''' Smoke test: parse a log from standard input, echo each entry, then print the counters. '''

	parser = Parser(sys.stdin)
	while parser.getlogent():
		fields = (parser.num_processed, parser.client, parser.utime, parser.url, parser.referer, parser.agent)
		print("%s %s %s %s %s %s" % fields)
	print("lines: %s" % (parser.num_processed))
	print("error: %s" % (parser.num_error))
		
		
# When executed as a script (rather than imported), run the stdin-driven test.
if __name__ == '__main__':
	test()