Source

weblog / query.py

Full commit
#!/usr/bin/env python

'''
Web Log Url Parsing class.

Contents:
- Parser: logfile Query parsing class
  p_log = weblog.query.Parser(log object)
  methods:
	- p_log.getlogent()
  variables:
	- p_log.cache_size  [ maximum size of url cache ]
  attributes:
    - url_query_dict  (dictionary of lists of items in the query)
    - ref_query_dict  (as above, if referer is available)
    - all attributes of the log object are available as well.

  The weblog.url.Parser() class MUST be used before using this module.

  This class will parse the url and referer (if available) queries into their
  respective elements, making the query available as a dictionary.
  
  The values of the dictionaries are always lists, even when a key occurs
  only once in the query (the list then holds just that one element).
  
- test: test function
'''


# (c) 1998 Copyright Mark Nottingham
# <mnot@pobox.com>
#
# This software may be freely distributed, modified and used,
# provided that this copyright notice remain intact.
#
# This software is provided 'as is' without warranty of any kind.


# Version string of this module.
__version__ = '1.0'


# parse_qs has moved over time: cgi (original home), urlparse (Python 2.6+),
# urllib.parse (Python 3; the cgi alias was removed in Python 3.8).
try:
	from urllib.parse import parse_qs
except ImportError:
	try:
		from urlparse import parse_qs
	except ImportError:
		from cgi import parse_qs


class Parser:
	'''
	Query-string parsing wrapper around a log object.

	Every successful getlogent() call advances the wrapped log and refreshes:
	- url_query_dict: parse_qs() of log.url_query ({} when that is empty)
	- ref_query_dict: parse_qs() of log.ref_query ({} when that is empty);
	  only maintained if the wrapped log had a 'referer' attribute when
	  this object was constructed

	Parse results are cached per query string, with at most cache_size
	entries kept for each of the two caches.  The dictionaries handed out
	are the cached objects themselves, so callers should treat them as
	read-only.

	Attributes not found on this object are looked up on the wrapped log.
	'''

	def __init__(self, log):
		''' log: object providing getlogent() and url_query (plus ref_query
		when it has a referer attribute). '''
		self.log = log
		self.url_query_dict = {}
		self.ref_query_dict = {}
		self.cache_size = 10000
		self._cache = {'url': {}, 'ref': {}}
		# Decided once, at construction time: does the log carry referers?
		self._referer_present = 0
		if hasattr(log, 'referer'):
			self._referer_present = 1


	def __getattr__(self, attr):
		''' Delegate attribute lookups that fail here to the wrapped log. '''
		# This hook only runs when normal lookup has failed.  If 'log' itself
		# is missing (instance created without __init__, e.g. through
		# Parser.__new__), reading self.log below would re-enter this method
		# without end, so fail cleanly instead.
		if attr == 'log':
			raise AttributeError(attr)
		try:
			return getattr(self.log, attr)
		except AttributeError:
			raise AttributeError(attr)


	def getlogent(self):
		''' Increment position in the log and populate requested attributes.

		Returns 1 if the wrapped log produced an entry, 0 otherwise (in
		which case the query dictionaries are left untouched).
		'''
		if not self.log.getlogent():
			return 0

		### parse url query
		self.url_query_dict = self._query_dict(self.log.url_query, 'url')

		### parse referer query
		if self._referer_present:
			self.ref_query_dict = self._query_dict(self.log.ref_query, 'ref')
		return 1


	def _query_dict(self, query, url_type):
		''' Return the parsed form of query, using the url_type cache
		('url' or 'ref'); an empty query yields a fresh empty dict. '''
		if not query:
			return {}
		try:
			return self._cache[url_type][query]
		except KeyError:
			pass
		parsed = self._parse(query, url_type)
		# Look the cache up again rather than reusing an earlier reference:
		# _parse() may just have swapped in a new, empty dictionary.
		self._cache[url_type][query] = parsed
		return parsed


	def _parse(self, url, url_type):
		''' Parse the query string url with parse_qs(), first emptying the
		url_type cache if it is already full. '''
		# '>=' (not '>') so that, once the caller stores this result, the
		# cache never holds more than cache_size entries.
		if len(self._cache[url_type]) >= self.cache_size:
			self._cache[url_type] = {}
		return parse_qs(url)



def test():
	''' basic test suite- modify at will to test full functionality

	Reads a combined-format log from standard input and, for every entry
	with a non-empty referer query, writes one "key value" line per query
	key followed by a blank line.
	'''

	import sys
	from weblog import combined, url

	log = combined.Parser(sys.stdin)
	u_log = url.Parser(log)
	p_log = Parser(u_log)		# query parser

	# sys.stdout.write() instead of the print statement: same output, but
	# valid syntax on both Python 2 and Python 3.
	write = sys.stdout.write
	while p_log.getlogent():
		if p_log.ref_query_dict:
			for (key, value) in p_log.ref_query_dict.items():
				write('%s %s\n' % (key, value))
			write('\n')



# When executed as a script, run the stdin-driven test() routine.
if __name__ == '__main__':
	test()