Commits

Peter Nixon committed 1907c53

Files changed (21)

._squid.py

Binary file added.

.hgignore

Empty file added.
+version 1.0
+- finally decided to release
+- added full_mime_hdrs to squid parser
+- made readline buffered in parsing classes - substantial speed improvement
+
+version 0.99
+- CHANGED ALL MODULE NAMES to a uniform package scheme (see README)
+- added StoreParser class to SquidCacheLog; changed Parser to AccessParser
+- bug in weblog.limit.Time (typo)
+- bug in Parser classes (depended on string.atoi('-') eval'ing to 0)
+- integrated FasterParser classes into Parser's; can always use Parser now
+
+version 0.98
+- added MultipleWebLog module
+- added WebLogQueryParse module
+- added example script to tail a file
+- added num_skipped attribute to WebLogLimit classes
+- made WebLogLimit's test() more explicit
+- made SquidCacheLog's error catching more thorough
+- revised WebLogClean to catch all % escaped sequences
+- moved host lower()ing to WebLogUrlParse
+- general cleanup
+
+version 0.96
+- added dummy attributes to __init__ methods of parsing classes
+- used a while loop instead of recursion for catching errors in parsing classes
+- renamed some attributes in SquidWebLog for clarity
+- added SquidWebLog example script
+- added WebLogLimit module (TimeLimit, HostLimit, PathLimit classes)
+- used a test in __init__ instead of try/except for attribute testing in
+  WebLogUrlParse, WebLogClean
+- moved caching tests into getlogent() in WebLogClean and WebLogUrlParse
+  (25% performance improvement)
+- fixed WebLogClean.test_clean()
+- tweaked siteurl attribute generation in WebLogReferType
+- siteurl list made non-case-sensitive in WebLogReferType
+
+
+Remember that the modules must be installed before using the examples; see
+the top-level README.

EXAMPLES/bad_passwords.py

+#!/usr/bin/env python
+
+# This script shows how many bad passwords each user has entered for HTTP
+# authentication. (use weblog.common instead of weblog.combined if your log
+# is in common format)
+#
+# usage: cat [logfile] | ./bad_passwords.py > [outfile]
+
+
+import sys
+from weblog import combined
+
+log = combined.Parser(sys.stdin)
+
+bad_pw = {}
+while log.getlogent():
+	if (log.status == 401 or log.status == 403 ) and log.authuser != '-':
+		bad_pw[log.authuser] = bad_pw.get(log.authuser, 0) + 1
+		
+for (user, num_bad) in bad_pw.items():
+	print "%s  %s" % (num_bad, user)

EXAMPLES/log_watch.py

+#!/usr/bin/env python
+
+# This script watches a log and prints out hits as they happen.
+#
+# usage: ./log_watch.py [logfile]
+
+
+import sys, time
+from weblog import combined
+
+file = open(sys.argv[1])
+log = combined.Parser(file)
+while 1:
+	while log.getlogent():
+		print "host: %s\n page: %s" % (log.client, log.url)
+	file.seek(0, 1)
+	time.sleep(5)

EXAMPLES/referers.py

+#!/usr/bin/env python	
+
+# This script shows the most popular referers to pages on your site.
+# Make sure you set log.siteurl and c_log.directory_index as appropriate.
+# (this requires a logfile in combined format)
+#
+# usage: cat [logfile] | ./referers.py [siteurl(s)] > [outfile]
+
+
+import sys
+from weblog import combined, url, clean, referer
+
+o_log = combined.Parser(sys.stdin)
+p_log = url.Parser(o_log)
+
+c_log = clean.Cleaner(p_log)
+c_log.directory_index = ['index.html', 'index.htm']
+
+log = referer.Typer(c_log)
+log.siteurl = sys.argv[1:]
+
+offsites = {}
+while log.getlogent():
+	if log.referer_type == 'OFFSITE':
+		try:
+			offsites[log.url][log.referer] = offsites[log.url].get(log.referer, 0) + 1
+		except KeyError:
+			offsites[log.url] = {}
+			offsites[log.url][log.referer] = 1
+
+
+pages = offsites.keys()
+pages.sort()	
+
+for page in pages:
+	print "\n%s" % (page)
+	ref_nums = offsites[page]
+	referers = ref_nums.keys()
+	referers.sort(lambda a, b, rn = ref_nums: cmp(rn[b], rn[a]))
+	for referer in referers:
+		print "%s  %s" % (ref_nums[referer], referer)
+		
+		

EXAMPLES/search_terms.py

+#!/usr/bin/env python
+
+# search_terms.py
+#
+# This script will process the log fed on stdin for query terms on the 
+# more popular Web search engines, and give statistics on a page-by-page
+# basis.
+#
+# usage: cat [logfile] | ./search_terms.py > [outfile]
+
+from weblog import combined, url, query
+from string import split, join, lower, strip
+import regex, sys
+from regsub import gsub
+
+replace_pat = regex.compile('\([\*\,\"\+]\|\-[^ ]*\)')
+
+engines = {	'digital.com':	'q',
+			'yahoo.com':	'p',
+			'hotbot.com':	'MT',
+			'excite.com':	'search',
+			'infoseek.com':	'qt',
+			'search.com':	'QUERY',
+			'metacrawler.com':	'general',
+			'metafind.com':		'q',
+			'webcrawler.com':	'searchText',
+			'lycos.com':		'query',
+			'inference.com':	'query',
+			'looksmart.com':	'key',
+			'northernlight.com': 'qr',
+		}			
+				
+o_log = combined.Parser(sys.stdin)
+p_log = url.Parser(o_log)
+q_log = query.Parser(p_log)
+
+searches = {}
+while q_log.getlogent():
+	if q_log.ref_query:
+		host = (split(p_log.ref_host, '.'))[-2:]
+		host = lower(join(host, '.'))
+		if engines.has_key(host):
+			try:
+				[terms] = q_log.ref_query_dict[engines[host]]
+			except KeyError:
+				continue
+			terms = lower(gsub(replace_pat, '', terms))
+
+			# to index by individual words, uncomment the next line ...
+#			terms = split(terms, None)
+			# ... and comment out the next line (which indexes by whole phrases).
+			terms = [terms]
+
+			for term in terms:
+				try:
+					searches[p_log.url][term] = searches[p_log.url].get(term, 0) + 1
+				except KeyError:
+					searches[p_log.url] = {}
+					searches[p_log.url][term] = 1
+
+
+pages = searches.keys()
+pages.sort()
+for page in pages:
+	print "\n%s" % (page)
+	term_nums = searches[page]
+	terms = term_nums.keys()
+	terms.sort(lambda a, b, tn = term_nums: cmp(tn[b], tn[a]))
+	for term in terms:
+		print "%s  %s" % (term_nums[term], term)

EXAMPLES/squid_users.py

+#!/usr/bin/env python	
+
+# This parses a Squid access logfile and outputs traffic by client and then by
+# Web site visited, with Kbytes per site, per user, and in total.
+#
+# NOTE THAT THERE ARE PRIVACY ISSUES IN ANALYSING WEB PROXY LOGFILES; THIS
+# IS INTENDED AS A DEMONSTRATION ONLY.
+#
+# usage: cat [logfile] | ./squid_users.py > [outfile]
+
+
+import sys
+from weblog import squid, url, resolve
+
+o_log = squid.AccessParser(sys.stdin)
+p_log = url.Parser(o_log)
+log = resolve.SimpleResolver(p_log)
+log.set_client = 'host'
+
+users = {}
+ttl_bytes = 0
+while log.getlogent():
+	ttl_bytes = ttl_bytes + log.bytes
+	try:
+		users[log.client]['TTL_B'] = users[log.client].get('TTL_B', 0) + log.bytes
+	except KeyError:
+		users[log.client] = {}
+		users[log.client]['HOSTS'] = {}
+		users[log.client]['TTL_B'] = log.bytes
+	users[log.client]['HOSTS'][log.url_host] = users[log.client]['HOSTS'].get(log.url_host, 0) + log.bytes
+
+print "TOTAL KBYTES: %s\n" % (int(ttl_bytes / 1024.0))
+
+names = users.keys()
+names.sort(lambda a, b, us = users: cmp(us[b]['TTL_B'], us[a]['TTL_B']))	
+for name in names:
+	print "\n%s - %s Kb" % (name, int(users[name]['TTL_B'] / 1024.0))
+	host_list = users[name]['HOSTS']
+	hosts = host_list.keys()
+	hosts.sort(lambda a, b, hl = host_list: cmp(hl[b], hl[a]))
+	for host in hosts:
+		print " %6i  %s" % (int(host_list[host] / 1024.0), host)
+		
+
+WebLog Classes - Python Logfile Analysis Toolkit
+------------------------------------------------
+Version 1.0
+
+(c) 1998 Mark Nottingham
+<mnot@pobox.com> - bug reports, questions, comments
+
+This software may be freely distributed, modified and used,
+provided that this copyright notice remain intact.
+THIS SOFTWARE IS PROVIDED 'AS IS' WITHOUT WARRANTY OF ANY KIND.
+
+Thanks to Ben Golding and Jeremy Hylton for their advice.
+
+If you use the classes in an interesting or large application, please drop me
+a line!
+
+Introduction
+------------
+WebLog is a group of Python modules containing several class definitions that
+are useful for parsing and manipulating common Web and Web proxy logfile 
+formats. 
+
+WebLog is reasonably fast, considering that it's written in a scripting
+language. The parsing modules are especially well optimised; for example, 
+the combined parser will process about 2500 lines a second on a Pentium 233,
+on a Unix operating system.
+
+
+Contents
+--------
+The modules can be broken up into two types: parsing and postprocessing. The
+classes inside them are used by first using a parsing class and then stacking
+postprocessing classes on top of it (a minimal sketch follows the module list
+below).
+
+Parsing Modules:
+common - Common (NCSA) Web log parser.
+combined - Combined/extended Web log parser (adds referer and agent).
+squid - Squid Web Proxy Cache log parsers (access.log, store.log v1.1).
+multiple - combines log files of the same content from different servers.
+
+Postprocessing Modules:
+url - parses url and referer (if available) into components.
+query - parses queries into dictionaries. *
+clean - normalises attributes of Web Log for more accurate analysis. *
+resolve - resolves client address to host and/or ip.
+referer - determines type of hit: local, offsite, manual, or file. *
+limit - limit output to certain domains, files, directories or times. *
+
+* requires use of url.Parser first
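+
+For instance, a minimal stack over a combined-format log on stdin might look
+like this (a sketch only; it assumes the package is installed as described
+below):
+
+import sys
+from weblog import combined, url, clean
+
+log = combined.Parser(sys.stdin)	# a parsing class always comes first
+p_log = url.Parser(log)			# postprocessing: split out URL components
+c_log = clean.Cleaner(p_log)		# postprocessing: normalise those URLs
+c_log.directory_index = ['index.html']
+while c_log.getlogent():
+	print c_log.client, c_log.url_path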
+
+The squid parsing module contains two classes: AccessParser (for
+access.log), and StoreParser (for store.log). If you have full_mime_hdrs set
+in squid.conf, make sure to set the corresponding attribute in AccessParser;
+however, use of this will appreciably slow down analysis.
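+
+For example (a sketch; it assumes a Squid access.log fed on stdin):
+
+import sys
+from weblog import squid
+
+log = squid.AccessParser(sys.stdin)
+log.full_mime_hdrs = 1		# only if header logging is enabled in squid.conf
+while log.getlogent():
+	print log.client, log.status, log.url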
+
+
+Installation
+------------
+
+To install the modules, put the weblog directory either in the same directory
+as your application, or in the site-packages directory. If you do so, remember
+to include a weblog.pth file in the top level; for instance:
+
+% mkdir /usr/local/lib/python-1.5/site-packages   # if it isn't there
+% mv weblog /usr/local/lib/python-1.5/site-packages
+% touch /usr/local/lib/python-1.5/site-packages/weblog.pth
+
+See the site.py module for more details. After doing this, the modules can
+be imported in several ways, such as:
+
+>>> import weblog.common		# referenced like: weblog.common.Parser
+>>> from weblog import common			# referenced like: common.Parser
+>>> from weblog.common import Parser	# referenced like: Parser
+
+
+Use
+---
+One of the Parsing classes must always be used first and only once, and then
+Postprocessing classes may be used on the resulting instance, if desired.
+
+All of the classes define a method, getlogent(). This method will make the
+next log line available through its attributes. It will return 0 when 
+there are no more lines to process.
+
+For full details of the classes and their interfaces, read the comments of 
+the individual modules, as well as their __doc__ strings. Note that several 
+of the postprocessing classes have specific requirements for their input.
+
+
+Examples
+--------
+A WebLog class can be as easy to use as this, which prints how many hits each
+page on your site gets:
+
+import sys
+from weblog import common
+log = common.Parser(sys.stdin)
+hits = {}
+while log.getlogent():
+	hits[log.url] = hits.get(log.url, 0) + 1
+for (page, hit_num) in hits.items():
+	print "%s %s" % (hit_num, page)
+
+Several moderately more complex demo scripts come with the WebLog package
+(in the EXAMPLES/ directory):
+
+bad_passwords.py - identify bad HTTP authentication attempts.
+referers.py - shows which referers lead to your pages, by page and referer.
+search_terms.py - shows what search terms are used to reach your pages on 
+                  popular search engines.
+squid_users.py - shows traffic through a cache by user and site.
+log_watch.py - watches a logfile (i.e., 'tail -f').
+
+The best way to learn to use the classes is to pick through the examples, as
+well as the test() functions of each of the modules.
+__all__ = [ 	'clean', 
+				'combined', 
+				'common', 
+				'limit', 
+				'multiple', 
+				'query', 
+				'referer', 
+				'resolve', 
+				'squid', 
+				'url' 
+			]
+#!/usr/bin/env python
+
+'''
+Web Log Cleaning class.
+
+Contents:
+
+- Cleaner: logfile cleaning class
+  clean_log = weblog.clean.Cleaner(log object)
+  methods:
+	- clean_log.getlogent()
+  variables:
+	- clean_log.directory_index  [ filenames that are equal to directories ]
+	- clean_log.cache_size  [ maximum size of url cache ]
+
+- test: test function
+'''
+
+
+# (c) 1998 Copyright Mark Nottingham
+# <mnot@pobox.com>
+#
+# This software may be freely distributed, modified and used,
+# provided that this copyright notice remain intact.
+#
+# This software is provided 'as is' without warranty of any kind.
+
+
+# Web Log Cleaning
+#
+# The log object fed to a Cleaner MUST be preprocessed with weblog.url.Parser.
+#
+# The Cleaner class will normalise the log entries as much as possible, 
+# resulting in more accurate statistics for analysis. Although its efforts
+# are not perfect (particularly with path collapsing, due to the differing
+# natures of browsers and servers), it tries to 'do the right thing'.
+# 
+# This class will do the following to a log fed to it, for both request URL
+# and referer (if available):
+# 
+# - collapse out ./ ../ // and other sequences where appropriate
+# 
+# - replace % escaped sequences in the path with their equivalents
+# 
+# Additionally, 
+# 
+# - if a request URL is for a page specified in the directory_index list,
+#   that page is trimmed from the path
+# 
+# - referer hosts are normalised to lowercase (actually done by UrlParser)
+# 
+# - referer hosts that specify a port which is the default port for that
+#   service will have it removed
+#
+# - referer news: URL's that specify a host have it removed
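+#
+# As a rough, worked example: a request for '/foo/../bar//baz/index.html',
+# with directory_index set to ['index.html'], comes out of the Cleaner with
+# a url_path of '/bar/baz/'.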
+
+
+__version__ = '1.0'
+
+
+
+from string import join, rfind
+from regsub import gsub
+from urlparse import urlunparse
+from urllib import unquote
+import regex
+
+_replace_pat = regex.compile('\(/[^/]+/\.\./?\|/\./\|//\|/\.$\|/\.\.$\)')
+
+_default_port = {	'http': ':80',
+					'https': ':443',
+					'gopher': ':70',
+					'news': ':119',
+					'snews': ':563',
+					'nntp': ':119',
+					'snntp': ':563',
+					'ftp': ':21',
+					'telnet': ':23',
+					'prospero': ':191',
+				}
+
+_relative_scheme = {	'http': 1,
+						'https': 1,
+						'news': 1,
+						'snews': 1,
+						'nntp': 1,
+						'snntp': 1,
+						'ftp': 1,
+						'file': 1,
+						'': 1
+					}
+
+
+class Cleaner:
+	def __init__(self, log):
+		self.log = log
+		self.directory_index = []
+		self.cache_size = 10000
+		self._cache = {'url': {}, 'ref': {}}
+		self._referer_present = 0
+		if hasattr(self.log, 'referer'):
+			self._referer_present = 1
+
+
+	def __getattr__(self, attr):
+		try:
+			return getattr(self.log, attr)
+		except AttributeError:
+			raise AttributeError, attr
+
+
+	def __setattr__(self, attr, value):
+		if attr == 'directory_index':
+			self._dir_pat = "/\(" + join(value, "\|") + "\)$"
+			self._dir_comp = regex.compile(self._dir_pat)
+		self.__dict__[attr] = value
+
+
+	def getlogent(self):
+		''' Increment position in the log and populate requested attributes '''
+
+		if self.log.getlogent():
+			### clean url
+			if not self._cache['url'].has_key(self.log.url):
+				self._cache['url'][self.log.url] = self._clean(self.log.url, 'url')
+			clean_url = self._cache['url'][self.log.url]
+			self.url = urlunparse(clean_url)
+			self.url_scheme = clean_url[0]
+			self.url_host = clean_url[1]
+			self.url_path = clean_url[2]
+			
+			### clean referer
+			if self._referer_present:
+				if not self._cache['ref'].has_key(self.log.referer):
+					self._cache['ref'][self.log.referer] = self._clean(self.log.referer, 'ref')
+				clean_ref = self._cache['ref'][self.log.referer]
+				self.referer = urlunparse(clean_ref)
+				self.ref_scheme = clean_ref[0]
+				self.ref_host = clean_ref[1]
+				self.ref_path = clean_ref[2]
+			return 1
+		else:
+			return 0
+
+
+	def _clean(self, url, url_type):
+		''' cleans url (url_type is url or ref) '''
+
+		if len(self._cache[url_type]) > self.cache_size:
+			self._cache[url_type] = {}
+			
+		scheme = getattr(self.log, url_type + "_scheme")
+		path = getattr(self.log, url_type + "_path")
+
+		if _relative_scheme.get(scheme, 0):
+			last_path = path
+			while 1:
+				path = gsub(_replace_pat, '/', path)
+				if last_path == path:
+					break
+				last_path = path
+
+		path = unquote(path)
+
+		if url_type == 'url':		 	
+			scheme, host = '', ''
+			if self._dir_comp:
+				if self._dir_comp.search(path) > -1:
+					slash_index = rfind(path, '/') + 1
+					path = path[:slash_index]
+
+		elif url_type == 'ref':
+			host = self.log.ref_host
+			colon_index = rfind(host, ':')
+			if colon_index:
+				if host[colon_index:] == _default_port.get(scheme, '#'):
+					host = host[:colon_index]
+			if scheme == 'news':
+				slash_index = rfind(path, '/') + 1
+				path = path[slash_index:]
+			
+		return (	scheme, 
+					host, 
+					path, 
+					getattr(self.log, url_type + "_parameters"),
+					getattr(self.log, url_type + "_query"),
+					getattr(self.log, url_type + "_fragment")
+				)
+
+
+
+
+def test():
+	''' basic test suite- modify at will to test full functionality '''
+
+	import sys
+	from weblog import combined, url
+
+	file = sys.stdin
+	log = combined.Parser(file)
+	p_log = url.Parser(log)
+
+	clean_log = Cleaner(p_log)
+	clean_log.directory_index = ['index.html', 'index.htm']
+
+	while clean_log.getlogent():
+		print "%s\n%s %s" % (log.url, clean_log.url, clean_log.url_path)
+		print "%s\n%s %s %s" % (log.referer, clean_log.referer, clean_log.ref_host, clean_log.ref_path)
+		print
+
+
+
+class Dummy:
+	''' dummy log class for test_clean() '''
+	pass
+
+
+def test_clean():
+	''' function to test _clean's operation '''
+	
+	from urlparse import urlparse
+
+	dummy = Dummy()
+	cleaner = Cleaner(dummy)
+	cleaner.directory_index = ['index.html', 'index.htm']	
+
+	u_tests = { '/foo/../bar':				'/bar',
+				'/foo/index.html':			'/foo/',
+				'/foo/index.html/':			'/foo/index.html/',
+				'/index.html.':				'/index.html.',
+				'/index.html':				'/',
+				'/index.htm':				'/',
+				'/index.ht':				'/index.ht'
+			}
+
+	r_tests = {	'/foo/bar/.':				'/foo/bar/', 
+				'/foo/bar/./':				'/foo/bar/',
+				'/foo/bar/..':				'/foo/',
+				'/foo/bar/../': 			'/foo/',
+				'/foo/bar/../baz': 			'/foo/baz',
+				'/foo/bar/../..': 			'/',
+				'/foo/bar/../../': 			'/',
+				'/foo/bar/../../baz': 		'/baz',
+				'/foo/bar/../../../baz':	'/../baz',
+				'/foo/bar/../../../../baz':	'/baz',
+				'/./foo':					'/foo',
+				'/../foo':					'/../foo',
+				'/foo.':					'/foo.',
+				'/.foo':					'/.foo',
+				'/foo..':					'/foo..',
+				'/..foo':					'/..foo',
+				'/./../foo':				'/foo',
+				'/./foo/.':					'/foo/',
+				'/foo/./bar':				'/foo/bar',
+				'/foo/../bar':				'/bar',
+				'/foo//':					'/foo/',
+				'/foo///bar//':				'/foo/bar/',	
+				'news:alt.this.group':		'news:alt.this.group',
+				'news://foo.com/user-full.article.number@whatever.server.net':	'news:user-full.article.number@whatever.server.net',
+				'news:user-full.article.number@whatever.server.net':	'news:user-full.article.number@whatever.server.net',
+				'http://www.foo.com:80/foo':	'http://www.foo.com/foo',
+				'http://www.foo.com:8000/foo':	'http://www.foo.com:8000/foo',
+				'http://www.foo.com/%7ebar':	'http://www.foo.com/~bar',
+				'http://www.foo.com/%7Ebar':	'http://www.foo.com/~bar',
+				'-':						'-',
+			}
+
+	n_correct, n_fail = 0, 0
+	test_types = { 	'url': u_tests,
+					'ref': r_tests,
+				}
+	type_keys = test_types.keys()
+	
+	for ty in type_keys:
+		test_keys = test_types[ty].keys()
+		test_keys.sort()		
+	
+		for i in test_keys:
+			print 'ORIGINAL:', i
+			(	dummy.__dict__[ty + '_scheme'],
+				dummy.__dict__[ty + '_host'],
+				dummy.__dict__[ty + '_path'],
+				dummy.__dict__[ty + '_parameters'],
+				dummy.__dict__[ty + '_query'],
+				dummy.__dict__[ty + '_fragment']	) = urlparse(i)
+			
+			cleaned = urlunparse(cleaner._clean(i, ty))
+			answer = test_types[ty][i]
+			print ty, 'CLEANED: ', cleaned
+			print ty, 'CORRECT: ', answer
+			if cleaned != answer:
+				print "*** TEST FAILED"
+				n_fail = n_fail + 1
+			else:
+				n_correct = n_correct + 1
+			print
+		
+	print "TOTAL CORRECT:", n_correct
+	print "TOTAL FAILURE:", n_fail
+
+
+
+
+if __name__ == '__main__':
+	test()
+#!/usr/bin/env python
+
+'''
+Combined Web log parsing class.
+
+Contents:
+
+- Parser: logfile parser class
+  log = weblog.combined.Parser(log filehandle)
+  - methods:
+	log.getlogent()
+  - attributes:
+  	log.buffer - buffer size (default 512k)
+  - read-only attributes:
+	log.client
+	log.ident
+	log.authuser
+	log.utime
+	log.method
+	log.url
+	log.proto
+	log.status
+	log.bytes
+	log.referer
+	log.agent
+	log.num_processed - number of raw lines seen
+	log.num_error - number of errors seen
+
+- test: test function
+'''
+
+
+# (c) 1998 Copyright Mark Nottingham
+# <mnot@pobox.com>
+#
+# This software may be freely distributed, modified and used, 
+# provided that this copyright notice remain intact.
+#
+# This software is provided 'as is' without warranty of any kind.
+
+
+# Combined Logfile Format
+# -----------------------
+#
+# host rfc931 authuser [DD/Mon/YYYY:hh:mm:ss] "request" ddd bbbb "referer" "agent"
+#
+# rfc931: identd info, - otherwise
+# authuser: user id if http authenticated, - otherwise
+# ddd: the status code returned by the server, - if not available
+# bbbb: the total number of bytes sent, not including header, - if not available
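+#
+# e.g. (a made-up example line):
+#
+# 10.0.0.1 - fred [12/Jan/1998:10:30:00 +1000] "GET /index.html HTTP/1.0" 200 1234 "http://www.example.com/" "Mozilla/4.0"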
+
+
+__version__ = '1.0'
+
+
+from time import mktime
+from regex import symcomp
+from string import atoi, split, rstrip
+import sys
+
+_pattern = symcomp('^\(<client>[^ ]+\) \(<ident>[^ ]+\) \(<authuser>[^\[\n]+\) \[\(<mday>[0-9]+\)\/\(<mon_name>\w+\)\/\(<year>[0-9]+\):\(<hour>[0-9]+\):\(<min>[0-9]+\):\(<sec>[0-9]+\) \(<timediff>[^ ]+\)\] "\(<method>[^ ]+\) \(<url>[^ ]+\) \(<proto>[^ ]+\)" \(<status>[^ ]+\) \(<bytes>[^ ]+\) "\(<refer>[^ ]+\)" "\(<agent>[^"]+\)"\|\(-\)[\r\n ]*$') 
+_patmatch = _pattern.match
+_patgroup = _pattern.group
+
+_monlist = {	"Jan": 1,
+				"Feb": 2,
+				"Mar": 3,
+				"Apr": 4,
+				"May": 5,
+				"Jun": 6,
+				"Jul": 7,
+				"Aug": 8,
+				"Sep": 9,
+				"Oct": 10,
+				"Nov": 11,
+				"Dec": 12 
+			}
+
+
+class Parser:
+	''' Combined Web Logfile Parser '''
+
+	def __init__(self, file_descriptor):
+		self.num_processed = 0
+		self.num_error = 0
+		self._fd = file_descriptor
+		self._lines = []
+		self._lines_num = 0
+		self._lines_index = 0
+		self.buffer = 1024 * 512
+		self.client = ''
+		self.ident = ''
+		self.authuser = ''
+		self.utime = 0
+		self.method = ''
+		self.url = ''
+		self.proto = ''
+		self.status = 0
+		self.bytes = 0
+		self.referer = ''
+		self.agent = ''
+
+
+	def getlogent(self):
+		''' Increment location in the log and populate object attributes '''
+
+		while 1:	# loop until we find a valid line, or end
+
+			### this is the buffering for readline()
+			if self._lines_index >= self._lines_num:
+				self._lines_index = 0
+				self._lines = self._fd.readlines(self.buffer)
+				self._lines_num = len(self._lines)
+				if self._lines_num == 0: return 0
+			line = self._lines[self._lines_index]
+			self._lines_index = self._lines_index + 1
+
+			self.num_processed = self.num_processed + 1
+				
+			n = split(line, None, 11)	# split the line on whitespace
+			if len(n) != 12:			# split didn't work; try regex
+				if self._run_regex(line):
+					return 1
+				else:
+					self.num_error = self.num_error + 1
+					continue
+			
+			try:
+				self.utime = mktime(    atoi(n[3][8:12]),
+										_monlist[n[3][4:7]],
+										atoi(n[3][1:3]),
+										atoi(n[3][13:15]),
+										atoi(n[3][16:18]),
+										atoi(n[3][19:21]),
+										-1, -1, -1
+									)
+				self.client = n[0]
+				self.ident = n[1]
+				self.authuser = n[2]
+				self.method = n[5][1:]
+				self.url = n[6]
+				self.proto = n[7][:-1]
+			except:					# split didn't work; try regex
+				if self._run_regex(line):
+					return 1
+				else:
+					continue
+
+			try:
+				self.status = atoi(n[8])
+			except ValueError:
+				self.status = 0
+			try:
+				self.bytes = atoi(n[9])
+			except ValueError:
+				self.bytes = 0
+			try:
+				self.referer = n[10][1:-1]
+			except IndexError:
+				self.referer = ''
+			try:
+				self.agent = (rstrip(n[11]))[1:-1]
+			except IndexError:
+				self.agent = ''
+			return 1		# valid line found
+
+
+	def _run_regex(self, line):
+		''' Try to parse the line with a regex; return 1 if successful. '''
+
+		n = _patmatch(line)
+		if n == -1: return 0		# no match
+		self.utime = mktime(	atoi(_patgroup('year')), 
+								_monlist[_patgroup('mon_name')], 
+								atoi(_patgroup('mday')), 
+								atoi(_patgroup('hour')), 
+								atoi(_patgroup('min')), 
+								atoi(_patgroup('sec')), 
+								-1, -1, -1
+							)				
+		self.client = _patgroup('client') 
+		self.ident = _patgroup('ident')
+		self.authuser = _patgroup('authuser') 
+		self.method = _patgroup('method') 
+		self.url = _patgroup('url') 
+		self.proto = _patgroup('proto')
+		try:
+			self.status = atoi(_patgroup('status')) 
+		except ValueError:
+			self.status = 0	
+		try:
+			self.bytes = atoi(_patgroup('bytes')) 
+		except ValueError:
+			self.bytes = 0
+		self.referer = _patgroup('refer') 
+		self.agent = _patgroup('agent')
+		return 1
+
+
+
+			
+def test():
+	''' basic test suite- modify at will to test all functionality '''
+	
+	file = sys.stdin
+	log = Parser(file)	
+	while log.getlogent():
+		pass
+		print "%s %s %s %s %s %s" % (log.num_processed, log.client, log.utime, log.url, log.referer, log.agent)
+	print "lines: %s" % (log.num_processed)
+	print "error: %s" % (log.num_error)
+		
+		
+if __name__ == '__main__':
+	test()
+#!/usr/bin/env python
+
+'''
+Common Web log parsing class.
+
+Contents:
+
+- Parser: logfile parser class
+  log = weblog.common.Parser(log filehandle)
+  - methods:
+	log.getlogent()
+  - attributes:
+  	log.buffer - buffer size (default 512k)
+  - read-only attributes:
+	log.client
+	log.ident
+	log.authuser
+	log.utime
+	log.method
+	log.url
+	log.proto
+	log.status
+	log.bytes
+	log.num_processed - number of raw lines seen
+	log.num_error - number of errors seen
+
+- test: test function
+'''
+
+
+# (c) 1998 Copyright Mark Nottingham
+# <mnot@pobox.com>
+#
+# This software may be freely distributed, modified and used, 
+# provided that this copyright notice remain intact.
+#
+# This software is provided 'as is' without warranty of any kind.
+
+
+# Common Logfile Format
+# ---------------------
+#
+# host rfc931 authuser [DD/Mon/YYYY:hh:mm:ss] "request" ddd bbbb
+#
+# rfc931: identd info, - otherwise
+# authuser: user id if http authenticated, - otherwise
+# ddd: the status code returned by the server, - if not available
+# bbbb: the total number of bytes sent, not including header, - if not available
+
+
+__version__ = '1.0'
+
+
+from time import mktime
+from regex import symcomp
+from string import atoi, split
+import sys
+
+_pattern = symcomp('^\(<client>[^ ]+\) \(<ident>[^ ]+\) \(<authuser>[^\[\n]+\) \[\(<mday>[0-9]+\)\/\(<mon_name>\w+\)\/\(<year>[0-9]+\):\(<hour>[0-9]+\):\(<min>[0-9]+\):\(<sec>[0-9]+\) \(<timediff>[^ ]+\)\] "\(<method>[^ ]+\) \(<url>[^ ]+\) \(<proto>[^ ]+\)" \(<status>[^ ]+\) \(<bytes>[^ ]+\)[\r\n ]*$') 
+_patmatch = _pattern.match
+_patgroup = _pattern.group
+
+_monlist = {	"Jan": 1,
+				"Feb": 2,
+				"Mar": 3,
+				"Apr": 4,
+				"May": 5,
+				"Jun": 6,
+				"Jul": 7,
+				"Aug": 8,
+				"Sep": 9,
+				"Oct": 10,
+				"Nov": 11,
+				"Dec": 12 
+			}
+
+
+
+class Parser:
+	''' Common Web Logfile Parser '''
+
+	def __init__(self, file_descriptor):
+		self.num_processed = 0
+		self.num_error = 0
+		self._fd = file_descriptor			
+		self._lines = []
+		self._lines_num = 0
+		self._lines_index = 0
+		self.buffer = 1024 * 512
+		self.client = ''
+		self.ident = ''
+		self.authuser = ''
+		self.utime = 0
+		self.method = ''
+		self.url = ''
+		self.proto = ''
+		self.status = 0
+		self.bytes = 0
+
+	
+	def getlogent(self):
+		''' Increment location in the log and populate object attributes '''
+
+		while 1:	# loop until we find a valid line, or end
+
+			### this is the buffering for readline()
+			if self._lines_index >= self._lines_num:
+				self._lines_index = 0
+				self._lines = self._fd.readlines(self.buffer)
+				self._lines_num = len(self._lines)
+				if self._lines_num == 0: return 0
+			line = self._lines[self._lines_index]
+			self._lines_index = self._lines_index + 1
+
+			self.num_processed = self.num_processed + 1
+			
+			n = split(line, None)	# split the line on whitespace
+			if len(n) != 10:		# split didn't work; try regex
+				if self._run_regex(line):
+					return 1
+				else:
+					self.num_error = self.num_error + 1
+					continue
+			try:
+				self.utime = mktime(	atoi(n[3][8:12]), 
+										_monlist[n[3][4:7]], 
+										atoi(n[3][1:3]), 
+										atoi(n[3][13:15]), 
+										atoi(n[3][16:18]), 
+										atoi(n[3][19:21]), 
+										-1, -1, -1
+									)
+				self.client = n[0] 
+				self.ident = n[1]
+				self.authuser = n[2]
+				self.method = n[5][1:] 
+				self.url = n[6]
+				self.proto = n[7][:-1]
+			except:					# split didn't work; try regex
+				if self._run_regex(line):
+					return 1
+				else:
+					continue
+
+			try:
+				self.status = atoi(n[8]) 
+			except ValueError:
+				self.status = 0
+			try:
+				self.bytes = atoi(n[9]) 
+			except ValueError:
+				self.bytes = 0
+			return 1
+
+
+	def _run_regex(self, line):
+		''' Try to parse the line with a regex; return 1 if successful. '''
+
+		n = _patmatch(line)
+		if n == -1: return 0		# no match
+		self.utime = mktime(	atoi(_patgroup('year')), 
+								_monlist[_patgroup('mon_name')], 
+								atoi(_patgroup('mday')), 
+								atoi(_patgroup('hour')), 
+								atoi(_patgroup('min')), 
+								atoi(_patgroup('sec')), 
+								-1, -1, -1
+							)
+		self.client = _patgroup('client') 
+		self.ident = _patgroup('ident')
+		self.authuser = _patgroup('authuser') 
+		self.method = _patgroup('method') 
+		self.url = _patgroup('url') 
+		self.proto = _patgroup('proto')
+		try:
+			self.status = atoi(_patgroup('status')) 
+		except ValueError:
+			self.status = 0	
+		try:
+			self.bytes = atoi(_patgroup('bytes')) 
+		except ValueError:
+			self.bytes = 0
+		return 1
+
+
+
+			
+def test():
+	''' basic test suite- modify at will to test all functionality '''
+	
+	file = sys.stdin
+	log = Parser(file)	
+	while log.getlogent():
+		print "%s %s %s %s" % (log.client, log.utime, log.bytes, log.url)
+	print "lines: %s" % (log.num_processed)
+	print "error: %s" % (log.num_error)
+		
+		
+if __name__ == '__main__':
+	test()
+
+#!/usr/bin/env python
+
+'''
+Web Log Limiting classes.
+
+Contents:
+
+- Path: directory/page limiting class
+  pl_log = weblog.limit.Path(log object)
+  methods:
+    - pl_log.getlogent()
+  variables:
+    - pl_log.page_limit [ list of page names/file types to limit to ]
+    - pl_log.page_exclude [ list of page names/file types to exclude ]
+    - pl_log.dir_limit [ list of specific pages/directories to limit to ]
+    - pl_log.dir_exclude [ list of specific pages/directories to exclude ]
+    - pl_log.cache_size (size of page cache)
+  read-only variables:
+    - pl_log.num_skipped (number of lines that have been passed over)
+  attributes:
+    - all attributes from the log object are available.
+
+  Page/Directory limiting requires previous use of weblog.url.Parser. Page
+  and directory limitations are case sensitive.
+  
+  The page_limit and page_exclude specifications take a page name or file
+  type; e.g., 'foo.html' or '.gif', not '/foo/bar/'.
+  The dir_limit and dir_exclude specifications are given from the document
+  root; e.g., '/foo/bar/baz.html' and '/bat.html', not 'baz.html' or 'bat.html'.
+  When setting any of these four attributes, make sure to pass the arguments
+  as a [list], all at once.
+
+
+- Host: host/domain/ip/network limiting class
+  hl_log = weblog.limit.Host(log object)
+  methods:
+    - hl_log.getlogent()
+  variables:
+    - hl_log.limit_host [ list of hostnames/domains to limit to ]  
+    - hl_log.exclude_host [ list of hostnames/domains to exclude ]
+    - hl_log.cache_size (size of page cache)
+  read-only variables:
+    - hl_log.num_skipped (number of lines that have been passed over)
+  attributes:
+    - all attributes from the log object are available.    
+
+  Hosts can be a FQDN or domain name; e.g., 'foo.bar.com', 'bar.com', '.com'.
+  Hosts are not case-sensitive.
+  
+  If weblog.resolve has been used, the .host attribute will be used when 
+  available; otherwise, the .client attribute will be used, and will 
+  not match if it is not of the correct type. 
+
+
+- Time: start/end time limiting class
+  tl_log = weblog.limit.Time(log object)
+  methods:
+	- tl_log.getlogent()
+  booleans:
+    - tl_log.end_stop ( if set, getlogent() will return 0 on the first
+                       line past tl_log.end )
+  variables:
+	- tl_log.start [ exclude log lines before this unix epoch time ]
+	- tl_log.end [ exclude log lines after this unix epoch time ]
+  read-only variables:
+    - tl_log.num_skipped (number of lines that have been passed over)
+  attributes:
+	- all attributes of the log object are available.
+
+  Because Time only needs the utime attribute from the parser, it
+  pays to put it at the top of the 'stack' of instances, right below
+  the Parser itself.
+
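+  For example (a sketch; the filename and times below are placeholders):
+
+	log = weblog.combined.Parser(open('access_log'))
+	tl_log = weblog.limit.Time(log)
+	tl_log.start = 899000000	# unix epoch time
+	tl_log.end_stop = 1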
+
+- test: test function
+'''
+
+
+# (c) 1998 Copyright Mark Nottingham
+# <mnot@pobox.com>
+#
+# This software may be freely distributed, modified and used,
+# provided that this copyright notice remain intact.
+#
+# This software is provided 'as is' without warranty of any kind.
+
+
+
+__version__ = '1.0'
+
+import re
+from string import join
+from types import ListType
+
+
+class Path:
+	''' Path Limiting Class '''
+
+	def __init__(self, log):
+		self.log = log
+		self.dir_limit = []
+		self.dir_exclude = ['::::::::']
+		self.page_limit = []
+		self.page_exclude = ['::::::::']
+		self.cache_size = 10000
+		self.num_skipped = 0
+		self._cache = {}
+
+
+	def __getattr__(self, attr):
+		try:
+			return getattr(self.log, attr)
+		except AttributeError:
+			raise AttributeError, attr
+
+
+	def __setattr__(self, attr, value):
+		if type(value) == ListType:
+			value = map(lambda a: re.escape(a), value)
+		if attr == 'dir_limit':
+			self._dir_limit_pat = "^(" + join(value, "|") + ")"
+			self._dir_limit_comp = re.compile(self._dir_limit_pat)
+		if attr == 'dir_exclude':
+			self._dir_exclude_pat = "^(" + join(value, "|") + ")"
+			self._dir_exclude_comp = re.compile(self._dir_exclude_pat)
+		if attr == 'page_limit':
+			self._page_limit_pat = "(" + join(value, "|") + ")$"
+			self._page_limit_comp = re.compile(self._page_limit_pat)
+		if attr == 'page_exclude':
+			self._page_exclude_pat = "(" + join(value, "|") + ")$"
+			self._page_exclude_comp = re.compile(self._page_exclude_pat)
+		self.__dict__[attr] = value
+
+
+	def getlogent(self):
+		''' Increment position in the log and populate requested attributes '''
+
+		while self.log.getlogent():
+			try:
+				if self._cache[self.log.url_path]:
+					return 1
+			except KeyError:
+				if self._path_match(self.log.url_path):
+					return 1
+			self.num_skipped = self.num_skipped + 1
+		return 0
+
+
+	def _path_match(self, path):
+		''' matching engine; returns 1 if page is to be ignored '''
+
+		if len(self._cache) > self.cache_size:
+			self._cache = {}
+		i = 1
+		if self._dir_limit_comp.search(path) == None: i = 0
+		elif self._page_limit_comp.search(path) == None: i = 0
+		elif self._dir_exclude_comp.search(path) != None: i = 0
+		elif self._page_exclude_comp.search(path) != None: i = 0					
+		self._cache[path] = i
+		return self._cache[path]
+
+
+
+class Host:
+	''' Host/Domain Limiting Class '''
+	
+	def __init__(self, log):
+		self.log = log
+		self.host_limit = []
+		self.host_exclude = ['::::::::']
+		self.cache_size = 10000
+		self.num_skipped = 0
+		self._cache = {}
+		if hasattr(self.log, 'host'):
+			self._loc = 'host'
+		else:
+			self._loc = 'client'
+
+		
+	def __getattr__(self, attr):
+		try:
+			return getattr(self.log, attr)
+		except AttributeError:
+			raise AttributeError, attr
+
+
+	def __setattr__(self, attr, value):
+		if attr == 'host_limit':
+			self._host_limit_pat = "(" + join(value, "|") + ")\.?$"
+			self._host_limit_comp = re.compile(self._host_limit_pat, re.IGNORECASE)
+		if attr == 'host_exclude':
+			self._host_exclude_pat = "(" + join(value, "|") + ")\.?$"
+			self._host_exclude_comp = re.compile(self._host_exclude_pat, re.IGNORECASE)
+		self.__dict__[attr] = value
+
+
+	def getlogent(self):
+		''' Increment position in the log and populate requested attributes '''
+
+		while self.log.getlogent():
+			try:
+				if self._cache[getattr(self.log, self._loc)]:
+					return 1
+			except KeyError:
+				if self._host_match(getattr(self.log, self._loc)):
+					return 1
+			self.num_skipped = self.num_skipped + 1
+		return 0
+		
+
+	def _host_match(self, host):
+		''' matching engine; returns 1 if page is to be ignored '''
+
+		if len(self._cache) > self.cache_size:
+			self._cache = {}
+		i = 1
+		if self._host_limit_comp.search(host) == None: i = 0
+		elif self._host_exclude_comp.search(host) != None: i = 0
+		self._cache[host] = i
+		return self._cache[host]
+			
+
+
+class Time:
+	''' Start/Stop Time Limiting Class '''
+	
+	def __init__(self, log):
+		self.log = log
+		self.start = 0
+		self.end = 2000000000
+		self.end_stop = 0
+		self.num_skipped = 0
+
+	
+	def __getattr__(self, attr):
+		try:
+			return getattr(self.log, attr)
+		except AttributeError:
+			raise AttributeError, attr
+
+
+	def getlogent(self):
+		''' Increment position in the log and populate requested attributes '''
+
+		while self.log.getlogent():
+			if self.log.utime < self.start:
+				self.num_skipped = self.num_skipped + 1
+				continue
+			if self.log.utime > self.end:
+				if self.end_stop:
+					return 0
+				else:
+					self.num_skipped = self.num_skipped + 1
+					continue
+			return 1
+		return 0	
+	
+
+
+
+
+def test():
+	''' basic test suite- modify at will to test full functionality '''
+
+	import sys
+	from weblog import combined, url
+
+	file = sys.stdin
+	log = combined.Parser(file)
+
+	up_log = url.Parser(log)
+
+	lim_log = Host(up_log)
+	lim_log.host_limit = ['.com']
+	lim_log.host_exclude = ['aol.com']
+
+	jpg_log = Path(lim_log)
+	jpg_log.page_limit = ['.jpg']
+
+	s_log = Time(jpg_log)
+	s_log.start = 899306347
+
+	while s_log.getlogent():
+		print "%s %s" % (s_log.client, s_log.url)
+	print "lines processed-", log.num_processed
+	print "error lines-", log.num_error
+	print "lines skipped- %s host, %s path, %s time, %s total" % (
+			lim_log.num_skipped, jpg_log.num_skipped, s_log.num_skipped,
+			lim_log.num_skipped + jpg_log.num_skipped + s_log.num_skipped)
+
+
+if __name__ == '__main__':
+	test()
+#!/usr/bin/env python
+
+'''
+Multiple Web Logfile combining class.
+
+Contents:
+- Combiner: multiple logfile combining class
+  ag_log = weblog.multiple.Combiner(WebLog_Class, [list of filehandles])
+  methods:
+	- ag_log.getlogent()
+  read-only variables:
+  	- ag_log.num_error
+	- ag_log.num_processed
+  	
+  This class is useful when you need to combine logfiles from separate servers
+  that log the same content; for instance, if you use a round-robin DNS or
+  other load distribution system.
+
+  The Combiner needs to be fed a prototype of the class of parser you want
+  it to use (NOT an instance), and a list of filehandles to the logs. For
+  instance:
+  
+  from weblog import common, multiple
+  log1 = open('log1_log')
+  log2 = open('log2_log')
+  mult_log = multiple.Combiner(common.Parser, [log1, log2])
+  [...]
+
+- test: test function
+'''
+
+
+# (c) 1998 Copyright Mark Nottingham
+# <mnot@pobox.com>
+#
+# This software may be freely distributed, modified and used,
+# provided that this copyright notice remain intact.
+#
+# This software is provided 'as is' without warranty of any kind.
+
+
+__version__ = '1.0'
+
+
+
+
+class Combiner:
+	''' 
+	Class to combine multiple logfiles of the same format. 
+	(chronologically correct)
+	'''
+
+	def __init__(self, logtype, fds):
+		self.logs = map(lambda a, lt=logtype: lt(a), fds)
+		self.processed_lines = 0
+		self.error_lines = 0
+		self._queue = []	# index used to determine next line to return
+		self._proc = []		# index of num_processed attributes
+		self._err = []		# index of num_error attributes
+		for log in self.logs:
+			if log.getlogent():
+				self._queue.append(log.utime)
+			else:
+				self._queue.append(0)
+			self._proc.append(log.num_processed)
+			self._err.append(log.num_error)
+		self._q_pos = self._queue.index(min(self._queue)) 
+		self.target = Dummy(min(self._queue))
+
+
+	def __getattr__(self, attr):
+		if attr == 'num_processed':
+			return reduce(lambda a,b: a+b, self._proc, 0)
+		elif attr == 'num_error':
+			return reduce(lambda a,b: a+b, self._err, 0)
+		else:
+			try:
+				return getattr(self.target, attr)
+			except AttributeError:
+				raise AttributeError, attr
+
+
+	def getlogent(self):
+		''' Increment position in the log and populate requested attributes '''
+
+		if self.target.getlogent():			# fetch next line from last log
+			self._queue[self._q_pos] = self.target.utime	# update candidates
+			self._proc[self._q_pos] = self.target.num_processed
+			self._err[self._q_pos] = self.target.num_error
+		else:
+			del self._queue[self._q_pos]	# close a log if done with it
+			del self.logs[self._q_pos]
+			if not len(self._queue): return 0
+		self._q_pos = self._queue.index(min(self._queue))	# find our next one
+		self.target = self.logs[self._q_pos]	# make it available
+		return 1
+
+
+class Dummy:
+	''' Dummy class for Combiner so it has something to start with '''
+	
+	def __init__(self, first):
+		self.utime = first
+		self.num_processed = 0
+		self.num_error = 0
+
+	def getlogent(self):
+		return 1
+	
+
+
+def test():
+	''' basic test suite- modify at will to test full functionality '''
+
+	import sys
+	from weblog import combined
+
+	logs = []
+	for arg in sys.argv[1:]:
+		logs.append(open(arg))
+
+	log = Combiner(combined.Parser, logs)
+
+	while log.getlogent():
+		print "%s %s %s" % (log.client, log.utime, log.url)
+	print "processed-", log.num_processed
+	print "error-", log.num_error
+
+
+if __name__ == '__main__':
+	test()
+#!/usr/bin/env python
+
+'''
+Web Log Query Parsing class.
+
+Contents:
+- Parser: logfile Query parsing class
+  p_log = weblog.query.Parser(log object)
+  methods:
+	- p_log.getlogent()
+  variables:
+	- p_log.cache_size  [ maximum size of url cache ]
+  attributes:
+    - url_query_dict  (dictionary of lists of items in the query)
+    - ref_query_dict  (as above, if referer is available)
+    - all attributes of the log object are available as well.
+
+  The weblog.url.Parser() class MUST be used before using this module.
+
+  This class will parse the url and referer (if available) queries into their
+  respective elements, making the query available as a dictionary.
+  
+  The values of the dictionaries are lists of elements; a list may contain
+  only a single element.
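+  
+  For example, a query string of 'q=foo+bar&hl=en' would parse to the
+  dictionary {'q': ['foo bar'], 'hl': ['en']}.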
+  
+- test: test function
+'''
+
+
+# (c) 1998 Copyright Mark Nottingham
+# <mnot@pobox.com>
+#
+# This software may be freely distributed, modified and used,
+# provided that this copyright notice remain intact.
+#
+# This software is provided 'as is' without warranty of any kind.
+
+
+__version__ = '1.0'
+
+
+from cgi import parse_qs
+
+
+class Parser:
+	def __init__(self, log):
+		self.log = log
+		self.url_query_dict = {}
+		self.ref_query_dict = {}
+		self.cache_size = 10000
+		self._cache = {'url': {}, 'ref': {}}
+		self._referer_present = 0
+		if hasattr(self.log, 'referer'):
+			self._referer_present = 1
+
+
+	def __getattr__(self, attr):
+		try:
+			return getattr(self.log, attr)
+		except AttributeError:
+			raise AttributeError, attr
+
+
+	def getlogent(self):
+		''' Increment position in the log and populate requested attributes '''
+
+		if self.log.getlogent():
+			### parse url query
+			if self.log.url_query:
+				if not self._cache['url'].has_key(self.log.url_query):
+					self._cache['url'][self.log.url_query] = self._parse(self.log.url_query, 'url')
+				self.url_query_dict = self._cache['url'][self.log.url_query]
+			else:
+				self.url_query_dict = {}
+
+			### parse referer query
+			if self._referer_present:
+				if self.log.ref_query:
+					if not self._cache['ref'].has_key(self.log.ref_query):
+						self._cache['ref'][self.log.ref_query] = self._parse(self.log.ref_query, 'ref')
+					self.ref_query_dict = self._cache['ref'][self.log.ref_query]
+				else:
+					self.ref_query_dict = {}
+			return 1
+		else:
+			return 0
+
+
+	def _parse(self, url, url_type):
+		if len(self._cache[url_type]) > self.cache_size:
+			self._cache[url_type] = {}
+		parsed = parse_qs(url)		
+		return parsed
+
+
+
+def test():
+	''' basic test suite- modify at will to test full functionality '''
+
+	import sys
+	from weblog import combined, url
+
+	file = sys.stdin
+	log = combined.Parser(file)
+
+	u_log = url.Parser(log)
+	p_log = Parser(u_log)		# query parser
+
+	while p_log.getlogent():
+		if p_log.ref_query_dict:
+			for (key, value) in p_log.ref_query_dict.items():
+				print key, value
+			print
+
+
+
+if __name__ == '__main__':
+	test()
+#!/usr/bin/env python
+
+'''
+Web Log Referer Typing class.
+
+Contents:
+- Typer: logfile referer typing class
+  ref_log = weblog.referer.Typer(log object)
+  methods:
+	- ref_log.getlogent()
+  variables:
+	- ref_log.siteurl [list of local site urls]
+  attributes:
+	- ref_log.referer_type (MANUAL|LOCAL|OFFSITE|FILE)
+    - all attributes available from the log object are available as well.
+
+- test: test function
+'''
+
+
+# (c) 1998 Copyright Mark Nottingham
+# <mnot@pobox.com>
+#
+# This software may be freely distributed, modified and used,
+# provided that this copyright notice remain intact.
+#
+# This software is provided 'as is' without warranty of any kind.
+
+
+# Web Log Referer Typing
+#
+# referer.Typer will determine the type of referer associated with a hit, 
+# based on the site URLS that you give it. The possible referer types are:
+# 
+# - LOCAL -  on one of the sites given
+# - MANUAL - a '-' hit
+# - FILE - an url beginning with 'file://'
+# - OFFSITE - not on one of the sites given, nor a file or manual hit
+#
+# the referer attribute of LOCAL hits will be truncated to exclude the 
+# scheme and host, and the corresponding attributes will be erased, if
+# present.
+#
+# Make sure the .siteurl attribute is fed a list, even if it's only one item. 
+#
+# This class MUST be fed the output of both a Web logfile parsing module
+# that produces a referer attribute, and the weblog.url.Parser class, 
+# which populates the individual components that it needs to operate. 
+# See the test() for an example.
+
+
+
+__version__ = '1.0'
+
+
+from urlparse import urlunparse
+from string import lower
+import socket
+
+
+class Typer:
+	def __init__(self, log):
+		self.log = log
+		self.siteurl = []
+		self.referer_type = ''
+
+
+	def __getattr__(self, attr):
+		try:
+			return getattr(self.log, attr)
+		except AttributeError:
+			raise AttributeError, attr
+
+
+	def __setattr__(self, attr, value):
+		if attr == 'siteurl':
+			siteurls = []
+			for item in value:
+				siteurls.append(lower(item))
+				try:
+					siteurls.append(socket.gethostbyname(item))
+				except socket.error:
+					pass
+			value = siteurls
+		self.__dict__[attr] = value
+
+
+	def getlogent(self):
+		''' Increment position in the log and populate requested attributes '''
+
+		if self.log.getlogent():
+			### clear attributes if last logent was a LOCAL
+			if self.referer_type == 'LOCAL':
+				delattr(self, 'referer')
+				delattr(self, 'ref_scheme')
+				delattr(self, 'ref_host')
+
+			if self.referer == '-':
+				self.__dict__['referer_type'] = 'MANUAL'
+			elif self.ref_host in self.siteurl:
+				self.__dict__['referer_type'] = 'LOCAL'
+				self.__dict__['ref_scheme'] = ''
+				self.__dict__['ref_host'] = ''
+				self.__dict__['referer'] = urlunparse((	'', 
+														'', 
+														self.ref_path, 
+														self.ref_parameters, 
+														self.ref_query, 
+														self.ref_fragment))
+			elif self.ref_scheme == 'file':
+				self.__dict__['referer_type'] = 'FILE'
+			else:
+				self.__dict__['referer_type'] = 'OFFSITE'
+
+			return 1
+		else:
+			return 0
+
+
+
+
+
+
+def test():
+	''' basic test suite- modify at will to test full functionality '''
+
+	import sys
+	from weblog import combined, url
+
+	file = sys.stdin
+	log = combined.Parser(file)
+
+	p_log = url.Parser(log)
+	
+	ref_log = Typer(p_log)
+	ref_log.siteurl = sys.argv[1:]
+
+	while ref_log.getlogent():
+		print "%20s	%20s	%s" % (ref_log.referer[:20], ref_log.ref_path[:20], ref_log.referer_type)
+
+if __name__ == '__main__':
+	test()
+#!/usr/bin/env python
+
+'''
+Web Log Resolving class.
+
+Contents:
+- SimpleResolver: logfile resolver class
+  res_log = weblog.resolve.SimpleResolver(log object)
+  methods:
+	- res_log.getlogent()
+  booleans:
+	- res_log.lookup_host [puts hostname in res_log.host]
+	- res_log.lookup_ip [puts ip address in res_log.ip]
+  variables:
+	- res_log.set_client ('host' | 'ip')
+	- res_log.cache_size [limits size of ip and host caches, in entries]
+  attributes:
+	- res_log.host [FQDN, if requested]
+	- res_log.ip [IP address, if requested]
+	- all attributes of the log object are available as well.
+
+  lookup_host will put the ip in the host attribute if it cannot resolve an
+  ip; lookup_ip will put None in the ip attribute if it cannot. If set_client
+  is set, it will replace the current contents of the client attribute with
+  the specified information.
+
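+  For example (a sketch only):
+
+	res_log = weblog.resolve.SimpleResolver(log)
+	res_log.lookup_host = 1		# makes res_log.host available
+	res_log.set_client = 'ip'	# replaces res_log.client with the IP address
+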
+- test: test function
+'''
+
+
+# (c) 1998 Copyright Mark Nottingham
+# <mnot@pobox.com>
+#
+# This software may be freely distributed, modified and used,
+# provided that this copyright notice remain intact.
+#
+# This software is provided 'as is' without warranty of any kind.
+
+
+__version__ = '1.0'
+
+
+import socket
+
+_gethost = socket.gethostbyaddr
+_getip = socket.gethostbyname
+
+class SimpleResolver:
+	def __init__(self, log):
+		self.log = log
+		self._namecache = {}
+		self._ipcache = {}
+		self.cache_size = 100000
+		self.lookup_host = 0
+		self.lookup_ip = 0
+		self.set_client = ''
+
+	def __getattr__(self, attr):
+		try:
+			return getattr(self.log, attr)
+		except AttributeError:
+			raise AttributeError, attr
+
+	def getlogent(self):
+		''' Increment position in the log and populate requested attributes '''
+
+		if self.log.getlogent():
+
+			### hostname lookup
+			if self.lookup_host or self.set_client == 'host':
+				if len(self._namecache) > self.cache_size:
+					self._namecache = {}
+				try:
+					self.host = self._namecache[self.log.client]
+				except KeyError:
+					try:
+						self.host = _gethost(self.log.client)[0]
+					except socket.error:
+						self.host = self.log.client
+					self._namecache[self.log.client] = self.host
+				if self.set_client == 'host':
+					self.client = self.host
+
+			### ip lookup
+			if self.lookup_ip or self.set_client == 'ip':
+				if len(self._ipcache) > self.cache_size:
+					self._ipcache = {}
+				try:
+					self.ip = self._ipcache[self.log.client]
+				except KeyError:
+					try:
+						self.ip = _getip(self.log.client)			
+					except socket.error:
+						self.ip = None
+					self._ipcache[self.log.client] = self.ip
+				if self.set_client == 'ip':
+					if self.ip:
+						self.client = self.ip
+
+			return 1
+		else:
+			return 0
+
+
+
+
+
+
+def test():
+	''' basic test suite- modify at will to test full functionality '''
+
+	import sys
+	from weblog import combined
+
+	file = sys.stdin
+	log = combined.Parser(file)
+
+	res_log = SimpleResolver(log)
+	res_log.set_client = 'host'
+
+	while res_log.getlogent():
+		print "%s %s" % (res_log.client, res_log.url)
+
+
+if __name__ == '__main__':
+	test()
+#!/usr/bin/env python
+
+'''
+Squid Web proxy cache log parsing classes.
+
+Contents:
+
+- AccessParser: squid access logfile parser class
+  log = AccessParser(log filehandle)
+  - methods:
+	log.getlogent()
+  - attributes:
+  	log.buffer - buffer size (default 512k)
+	log.full_mime_hdrs - set to 1 if you've enabled this in Squid
+  - read-only attributes:
+    log.utime
+    log.elapsed
+	log.client
+	log.log_tag
+	log.status
+	log.bytes
+	log.method
+	log.url
+	log.ident
+	log.peer_tag
+	log.peerhost
+	log.mimetype
+	log.hdr_request - request headers (if enabled)
+	log.hdr_response - response headers (if enabled)
+	log.num_processed - number of raw lines seen
+	log.num_error - number of errors seen
+
+Note that the full_mime_hdrs code is NOT optimised, and will substantially
+slow down the parser. full_mime_hdrs splits the request and response headers
+into forced-lowercase dictionaries, and does some rudimentary parsing of
+date values.
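+
+For example, a (made-up) logged header block such as 'Accept: */*%0d%0aHost: foo'
+comes back as the dictionary {'accept': '*/*', 'host': 'foo'}; values of the
+'date', 'last-modified' and 'expires' headers are additionally converted to
+unix epoch times.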
+
+- StoreParser: squid store logfile parser class
+  log = StoreParser(log filehandle)
+  - methods:
+	log.getlogent()
+  - attributes:
+  	log.buffer - buffer size (default 512k)
+	log.full_mime_hdrs - set to 1 if this is enabled in squid.conf
+  - read-only attributes:
+    log.utime
+    log.action
+	log.status
+	log.datehdr
+	log.lastmod
+	log.expires
+	log.mimetype
+	log.expect_len
+	log.real_len
+	log.method
+	log.url
+	log.num_processed - number of raw lines seen
+	log.num_error - number of errors seen
+
+- test: test function
+'''
+
+
+# (c) 1998 Copyright Mark Nottingham
+# <mnot@pobox.com>
+#
+# This software may be freely distributed, modified and used, 
+# provided that this copyright notice remain intact.
+#
+# This software is provided 'as is' without warranty of any kind.
+
+
+# Squid Access Logfile Format
+# ---------------------------
+#
+# Version 1.1 Access log
+# 
+# timestamp elapsed_time client log_tag/status bytes method URL rfc931 \
+# peer_tag/peerhost mimetype
+#
+# rfc931: identd info, - otherwise
+#
+#
+# Squid Store Logfile Format
+# --------------------------
+#
+# Version 1.1 Store log
+#
+# time action status datehdr lastmod expires type expect-len/real-len \
+# method key
+#
+#
+# for more information about both formats, see the Squid FAQ at
+# http://squid.nlanr.net/
+
+
+
+
+__version__ = '1.01'
+
+
+from string import atoi, atof, split, join, lower
+from re import compile
+from urllib import unquote
+import sys
+
+
+class AccessParser:
+	''' Splitting Squid Access Logfile Parser '''
+
+	def __init__(self, file_descriptor):
+		self.num_processed = 0
+		self.num_error = 0
+		self._fd = file_descriptor			
+		self._lines = []
+		self._lines_num = 0
+		self._lines_index = 0
+		self.buffer = 1024 * 512
+		self.full_mime_hdrs = 0
+		self.utime = 0
+		self.elapsed = 0
+		self.client = ''
+		self.log_tag = ''
+		self.status = 0
+		self.bytes = 0
+		self.method = ''
+		self.url = ''
+		self.ident = ''
+		self.peer_tag = ''
+		self.peerhost = ''
+		self.mimetype = ''
+		self.hdr_request = {}
+		self.hdr_response = {}
+		self._mime_splitter = compile("\[(.*?)\] \[(.*?)\]")
+		self._mime_indexer = compile("%0d%0a")
+		self._mime_hasher = compile("([\w\-_]+):\s*(.*)$")
+		self._time_headers = ['date', 'last-modified', 'expires']
+	
+	def getlogent(self):
+		''' Increment location in the log and populate object attributes '''
+
+		while 1: 	# loop until we find a valid line, or end
+
+			### this is the buffering for readline()
+			if self._lines_index >= self._lines_num:
+				self._lines_index = 0
+				self._lines = self._fd.readlines(self.buffer)
+				self._lines_num = len(self._lines)
+				if self._lines_num == 0: return 0
+			line = self._lines[self._lines_index]
+			self._lines_index = self._lines_index + 1
+
+			self.num_processed = self.num_processed + 1
+			
+			n = split(line, None)
+			if not self.full_mime_hdrs and len(n) != 10:
+				self.num_error = self.num_error + 1
+			else:
+				try:
+					self.utime = int(atof(n[0]))
+					self.elapsed = int(atoi(n[1]))
+					self.client = n[2]
+					(self.log_tag, status) = split(n[3], '/', 2) 
+					self.status = atoi(status)
+					self.bytes = atoi(n[4]) 
+					self.method = n[5] 
+					self.url = n[6]
+					self.ident = n[7]
+					(self.peer_tag, self.peerhost) = split(n[8], '/', 2)
+					self.mimetype = n[9]
+					if self.full_mime_hdrs:
+						raw_mime = join(n[10:], ' ')
+						self.hdr_request, self.hdr_response = self._parse_mime(raw_mime)
+				except:
+					self.num_error = self.num_error + 1
+					continue
+				return 1
+
+
+	def _parse_mime(self, raw):
+		match = self._mime_splitter.match(raw)
+		if not match:
+			return {}, {}
+		return (	self._process_hdr(match.group(1)), 
+					self._process_hdr(match.group(2))	)
+
+
+	def _process_hdr(self, raw_header):
+		from time import mktime, timezone
+		from rfc822 import parsedate
+	
+		hdrs = {}
+		header_list = self._mime_indexer.split(raw_header)
+		for header in header_list:
+			match = self._mime_hasher.match(header)
+			if not match:
+				continue
+
+			key = lower(match.group(1))
+			value = unquote(match.group(2))
+
+			if key in self._time_headers:
+				value = mktime(parsedate(value)) - timezone
+			hdrs[key] = value
+
+		return hdrs
+
+
+class StoreParser:
+	''' Splitting Squid Store Logfile Parser '''
+
+	def __init__(self, file_descriptor):
+		self.num_processed = 0
+		self.num_error = 0
+		self._fd = file_descriptor
+		self._lines = []
+		self._lines_num = 0
+		self._lines_index = 0
+		self.buffer = 1024 * 512
+		self.utime = 0
+		self.action = 0
+		self.status = ''
+		self.datehdr = ''
+		self.lastmod = 0
+		self.expires = 0
+		self.mimetype = ''
+		self.expect_len = ''
+		self.real_len = ''
+		self.method = ''
+		self.url = ''
+
+	
+	def getlogent(self):
+
+		''' Increment location in the log and populate object attributes '''
+
+		while 1: 	# loop until we find a valid line, or end
+
+			### this is the buffering for readline()
+			if self._lines_index >= self._lines_num:
+				self._lines_index = 0
+				self._lines = self._fd.readlines(self.buffer)
+				self._lines_num = len(self._lines)
+				if self._lines_num == 0: return 0
+			line = self._lines[self._lines_index]
+			self._lines_index = self._lines_index + 1
+
+			self.num_processed = self.num_processed + 1
+			
+			n = split(line, None)
+			if len(n) != 10:
+				self.num_error = self.num_error + 1
+			else:
+				try:
+					self.utime = int(atof(n[0]))
+					self.action = n[1]
+					self.status = atoi(n[2])
+					self.datehdr = atoi(n[3])
+					self.lastmod = atoi(n[4]) 
+					self.expires = atoi(n[5]) 
+					self.mimetype = n[6]
+					(expect_len, real_len) = split(n[7], '/', 2)
+					self.expect_len = atoi(expect_len)
+					self.real_len = atoi(real_len)
+					self.method = n[8]
+					self.url = n[9]
+				except:
+					self.num_error = self.num_error + 1
+					continue
+				return 1
+
+
+
+
+
+			
+def test_access():
+	''' basic test suite- modify at will to test all functionality '''
+	
+	file = sys.stdin
+	log = AccessParser(file)	
+	log.full_mime_hdrs = 0
+	while log.getlogent():
+		print "%s %s" % (log.client, log.url)
+	print "lines: %s" % (log.num_processed)
+	print "error: %s" % (log.num_error)
+		
+		
+if __name__ == '__main__':
+	test_access()
+
+
+
+
+#!/usr/bin/env python
+
+'''
+Web Log Url Parsing class.
+
+Contents:
+- Parser: logfile url parsing class
+  p_log = weblog.url.Parser(log object)
+  methods:
+	- p_log.getlogent()
+  variables:
+	- p_log.cache_size  [ maximum size of url cache ]
+  attributes:
+    - url_scheme      /  ref_scheme - (http|ftp|gopher...)
+    - url_host        /  ref_host 
+    - url_path        /  ref_path
+    - url_parameters  /  ref_parameters - (section after ';')
+    - url_query       /  ref_query - (section after '?')
+    - url_fragment    /  ref_fragment - (section after '#')
+    - all attributes of the log object are available as well.
+
+  This class will parse the url and referer (if available) into their
+  respective components. It will also replace each with the unparsed
+  result of those components; this assures that the input is fully 
+  conformant and in sync with the components.
+