Source

weblog / referer.py

Full commit
#!/usr/bin/env python

'''
Web Log Referer Typing class.

Contents:
- Typer: logfile referer typing class
  ref_log = weblog.referer.Typer(log object)
  methods:
	- ref_log.getlogent()
  variables:
	- ref_log.siteurl [list of local site urls]
  attributes:
	- ref_log.referer_type (MANUAL|LOCAL|OFFSITE|FILE)
	- all attributes available from the log object are available as well.

- test: test function
'''


# (c) 1998 Copyright Mark Nottingham
# <mnot@pobox.com>
#
# This software may be freely distributed, modified and used,
# provided that this copyright notice remain intact.
#
# This software is provided 'as is' without warranty of any kind.


# Web Log Referer Typing
#
# referer.Typer will determine the type of referer associated with a hit, 
# based on the site URLs that you give it. The possible referer types are:
# 
# - LOCAL - on one of the sites given
# - MANUAL - a '-' hit
# - FILE - a URL beginning with 'file://'
# - OFFSITE - anything else: not on one of the sites given, and neither a
#   file nor a manual hit
#
# the referer attribute of LOCAL hits will be truncated to exclude the 
# scheme and host, and the corresponding attributes will be erased, if
# present.
#
# Make sure the .siteurl attribute is fed a list, even if it's only one item. 
#
# This class MUST be fed the output of both a Web logfile parsing module
# that produces a referer attribute, and the weblog.url.Parser class, 
# which populates the individual components that it needs to operate. 
# See test() for an example.



# Version string of this module.
__version__ = '1.0'


from urlparse import urlunparse
from string import lower
import socket


class Typer:
	'''
	Wrap a parsed log object and classify the referer of each entry.

	After each successful getlogent() call, referer_type is one of
	'MANUAL', 'LOCAL', 'FILE' or 'OFFSITE'. Every attribute that is not
	set on the wrapper itself is looked up on the wrapped log object.

	Attributes:
	- log: the wrapped log object; it must provide getlogent() and the
	  referer / ref_* attributes read by getlogent() below.
	- siteurl: list of local site hosts. Assigning to it stores each item
	  lower-cased, followed by its resolved address when the name can be
	  resolved (so assignment may perform DNS lookups).
	- referer_type: type of the current entry's referer ('' before the
	  first entry has been read).
	'''

	def __init__(self, log):
		''' Wrap log; the site list starts empty and no type is set. '''
		self.log = log
		self.siteurl = []
		self.referer_type = ''


	def __getattr__(self, attr):
		''' Delegate attributes not found on the wrapper to the log object. '''
		# Read the log straight from __dict__: going through self.log while
		# it is missing (bare or half-built instance) would re-enter this
		# method and recurse until the interpreter gives up.
		try:
			log = self.__dict__['log']
		except KeyError:
			raise AttributeError(attr)
		try:
			return getattr(log, attr)
		except AttributeError:
			raise AttributeError(attr)


	def __setattr__(self, attr, value):
		''' Store attr; a siteurl list is normalised and expanded first. '''
		if attr == 'siteurl':
			siteurls = []
			for item in value:
				siteurls.append(item.lower())
				# Also accept referers that name the site by address;
				# names that cannot be resolved are simply not expanded.
				try:
					siteurls.append(socket.gethostbyname(item))
				except socket.error:
					pass
			value = siteurls
		# Write through __dict__ so this method is not re-entered.
		self.__dict__[attr] = value


	def getlogent(self):
		'''
		Increment position in the log and populate requested attributes.

		Returns 1 when the log produced another entry (referer_type is
		then set for it), 0 when the log is exhausted.

		For LOCAL entries the wrapper overrides referer with the URL minus
		scheme and host, and sets ref_scheme and ref_host to ''.
		'''

		if not self.log.getlogent():
			return 0

		### clear attributes if last logent was a LOCAL, so that lookups
		### fall through to the log object again; tolerate overrides that
		### are already gone.
		if self.referer_type == 'LOCAL':
			for name in ('referer', 'ref_scheme', 'ref_host'):
				self.__dict__.pop(name, None)

		# siteurl entries are stored lower-cased, so compare the host the
		# same way; host names are not case sensitive.
		host = self.ref_host
		if hasattr(host, 'lower'):
			host = host.lower()

		if self.referer == '-':
			self.__dict__['referer_type'] = 'MANUAL'
		elif host in self.siteurl:
			self.__dict__['referer_type'] = 'LOCAL'
			self.__dict__['ref_scheme'] = ''
			self.__dict__['ref_host'] = ''
			# Rebuild the referer without scheme and host; the remaining
			# components still come from the log object.
			self.__dict__['referer'] = urlunparse((
				'',
				'',
				self.ref_path,
				self.ref_parameters,
				self.ref_query,
				self.ref_fragment))
		elif self.ref_scheme == 'file':
			self.__dict__['referer_type'] = 'FILE'
		else:
			self.__dict__['referer_type'] = 'OFFSITE'

		return 1






def test():
	'''
	Demonstration driver: read a log from stdin, treat the command-line
	arguments as the local site list, and print one line per entry with
	the (truncated) referer, referer path and referer type.
	'''

	import sys
	from weblog import combined, url

	# Chain the parsers: weblog.combined parser -> URL parser -> referer typer.
	typed_log = Typer(url.Parser(combined.Parser(sys.stdin)))
	typed_log.siteurl = sys.argv[1:]

	row_format = "%20s	%20s	%s"
	while typed_log.getlogent():
		row = (typed_log.referer[:20], typed_log.ref_path[:20], typed_log.referer_type)
		print(row_format % row)

# When run as a script, execute the demonstration in test(): it reads a
# log from stdin and takes the local site list from the command line.
if __name__ == '__main__':
	test()