Source: scikits_index / code / tools.py
Full commit: e79bb2f (Janto Dreijer)

from itertools import *
from net import *
from utils import *

import cgi
import datetime
from BeautifulSoup import BeautifulSoup
import wsgiref.handlers

import re
import os
import os.path
import time
import random

from google.appengine.api import users, urlfetch, memcache, mail
from google.appengine.ext import webapp, db

from google.appengine.ext.webapp import template as templating

import xmlrpclib

import PyRSS2Gen

# set up locations
ROOT = os.path.dirname(__file__)
ON_DEV_SERVER = os.environ.get("SERVER_SOFTWARE", "dev").lower().startswith("dev")

SECONDS_IN_MINUTE = 60
SECONDS_IN_HOUR = SECONDS_IN_MINUTE * 60
SECONDS_IN_DAY = SECONDS_IN_HOUR * 24
SECONDS_IN_WEEK = SECONDS_IN_DAY * 7
SECONDS_IN_MONTH = SECONDS_IN_DAY * 28

# how often new data needs to be loaded
PACKAGE_INFO_CACHE_DURATION = SECONDS_IN_HOUR * 6
PACKAGE_NEWS_CACHE_DURATION = SECONDS_IN_DAY * 6
PACKAGE_LISTING_CACHE_DURATION = SECONDS_IN_HOUR * 6

# set up logging system
import logging
log_format = "%(levelname)s:%(module)s.%(name)s@%(asctime)s : %(message)s"
logging.basicConfig(
	level=logging.DEBUG,
	datefmt="%H:%M",
	format=log_format,
	)
logger = logging.getLogger("")

import rdfxml

class Sink(object):
	"""Collects the triples emitted by rdfxml's SAX-style parser."""
	def __init__(self):
		self.result = []
	def triple(self, s, p, o):
		self.result.append((s, p, o))

def rdfToPython(s, base=None):
	"""Parse an RDF/XML string into a list of (subject, predicate, object) triples."""
	sink = Sink()
	return rdfxml.parseRDF(s, base=base, sink=sink).result
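
# Usage sketch (not from the original source; the input file is made up):
# rdfToPython flattens an RDF/XML document, e.g. a project's DOAP file,
# into a list of triples.
#
#	doap = open("doap.rdf").read()
#	for subject, predicate, obj in rdfToPython(doap):
#		print subject, predicate, obj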

from docutils.core import publish_parts
import docutils.utils
def rst2html(s):
	"""
	from http://brizzled.clapper.org/id/77/
	"""
	settings = {
		'config' : None,
		'halt_level':2,

		# The following security precautions are probably unnecessary, as GAE
		# already restricts Python's file() capability.
		'file_insertion_enabled':'no',
		'raw_enabled':'no',
	}

	# Necessary, because otherwise docutils attempts to read a config file
	# via the codecs module, which doesn't work with AppEngine.
	os.environ['DOCUTILSCONFIG'] = ""
	parts = publish_parts(source=s,
		writer_name='html4css1',
		settings_overrides=settings)
	return parts['fragment']

# Example: render reST, falling back to HTML-quoted text if docutils rejects
# the input (htmlquote comes from one of the wildcard imports above).
#
#	s = "This is a scikit intended to include numerical methods for smoothing\ndata."
#	try:
#		print rst2html(s)
#	except docutils.utils.SystemMessage, e:
#		print htmlquote(s).replace("\n", "<br />\n") + "\n<!-- DOCUTILS WARNING! %s -->" % str(e)

class Cache(object):

	"""
	memcache wrapper that reports whether a cached value has expired, while
	still returning the stale value so callers can fall back to it
	"""

	@classmethod
	def get(cls, key):
		result = memcache.get(key)
		if result is None:
			return None, True
		value, timeout = result
		expired = False
		if timeout is not None and timeout < time.time():
			expired = True
		return value, expired

	@classmethod
	def set(cls, key, value, duration=None):
		# store the expiry timestamp alongside the value; None means never expires
		timeout = (time.time() + duration) if duration is not None else None
		return memcache.set(key=key, value=(value, timeout))
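
# Usage sketch (hypothetical key and value, not from the original source):
#
#	Cache.set("pypi:scipy", "0.7.0", duration=SECONDS_IN_HOUR)
#	value, expired = Cache.get("pypi:scipy")
#	if expired and value is not None:
#		pass  # value is stale but still usable while a refresh is attempted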

def get_url(url, force_fetch=False, cache_duration=None):
	"""Fetch a URL through the Cache, falling back to a stale value if the fetch fails."""
	response, expired = Cache.get(url)
	if expired or force_fetch:
		logger.debug("fetching %s" % url)
		try:
			response = urlfetch.fetch(url)
		except: # fetch failed
			if response is not None: # got a value in the past
				logger.warn("returning old value for %s" % url)
			else:
				raise
		else:
			if not Cache.set(key=url, value=response, duration=cache_duration):
				logger.error("could not cache %s" % url)

	return response
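
# Sketch (illustrative URL): fetch a listing page with a six-hour cache
# window; force_fetch bypasses the cache entirely.
#
#	response = get_url("http://pypi.python.org/simple/",
#		cache_duration=PACKAGE_LISTING_CACHE_DURATION)
#	print response.status_code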

def fetch_dir_links(url, cache_duration=None):
	"""Return the subdirectory links of an Apache-style directory index page."""
	result = get_url(url, cache_duration=cache_duration)
	if result.status_code != 200:
		return []

	items = re.findall('<a href="(.+?)/">.+?</a>', result.content)
	# note: os.path.join is used to join URLs; this relies on POSIX path
	# semantics and only works for the relative links matched above
	return [os.path.join(url, item) for item in items if not item.startswith("http://") and not item.startswith("..")]
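
# Sketch (illustrative URL): list the subdirectories of a directory index.
#
#	for link in fetch_dir_links("http://svn.scipy.org/svn/scikits/trunk/",
#			cache_duration=SECONDS_IN_DAY):
#		print link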

def fetch_links_with_dates(url, cache_duration=None):
	"""Return (name, url, date) tuples scraped from an Apache-style index page."""
	response = get_url(url, cache_duration=cache_duration)
	if response.status_code != 200:
		return []
	text = response.content
	if "404 Not Found" in text:
		return []

	items = []
	soup = BeautifulSoup(text)
	for tr in soup.findAll("tr"):
		link = tr.find("a")
		if link is None:
			continue
		name = link.contents[0]
		_url = link["href"]
		if _url.startswith("?"):
			continue
		_url = os.path.join(url, _url)

		# find the date field: try each cell until one parses as a date
		for td in tr.findAll("td"):
			try:
				t = datetime.datetime.strptime(str(td.contents[0]).strip(), "%d-%b-%Y %H:%M")
				break
			except ValueError:
				continue
		else:
			continue # no date found in any cell; skip this row

		items.append((name, _url, t))

	if not items:
		logger.warn("no items for %s" % url)

	return items
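
# Sketch (illustrative URL): collect dated download links, newest first.
#
#	links = fetch_links_with_dates("http://sourceforge.net/projects/scikits/files/",
#		cache_duration=PACKAGE_NEWS_CACHE_DURATION)
#	for name, link_url, date in sorted(links, key=lambda item: item[2], reverse=True):
#		print date, name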

class GoogleXMLRPCTransport(object):
	"""Handles an HTTP transaction to an XML-RPC server."""

	def __init__(self, use_datetime=0):
		self._use_datetime = use_datetime

	def request(self, host, handler, request_body, verbose=0):
		"""
		Send a complete request, and parse the response. See xmlrpclib.py.

		:Parameters:
			host : str
				target host

			handler : str
				RPC handler on server (i.e., path to handler)

			request_body : str
				XML-RPC request body

			verbose : bool/int
				debugging flag. Ignored by this implementation

		:rtype: dict
		:return: parsed response, as key/value pairs
		"""

		# issue XML-RPC request

		result = None
		url = 'http://%s%s' % (host, handler)
		try:
			response = urlfetch.fetch(
				url,
				payload=request_body,
				method=urlfetch.POST,
				headers={'Content-Type': 'text/xml'},
				)
		except:
			logger.error('failed to fetch %s' % url)
			raise

		if response.content_was_truncated:
			logger.warn("GAE truncated xmlrpc data")

		if response.status_code != 200:
			logger.error('%s returned status code %s' % (url, response.status_code))
			raise xmlrpclib.ProtocolError(host + handler,
				  response.status_code,
				  "",
				  response.headers)
		else:
			result = self.__parse_response(response.content)

		return result

	def __parse_response(self, response_body):
		p, u = xmlrpclib.getparser(use_datetime=self._use_datetime)
		p.feed(response_body)
		return u.close()
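
# Usage sketch: xmlrpclib.ServerProxy accepts a custom transport, so the
# urlfetch-backed transport above can stand in for the default httplib-based
# one on GAE. The PyPI endpoint and package name are only illustrative.
#
#	pypi = xmlrpclib.ServerProxy("http://pypi.python.org/pypi",
#		transport=GoogleXMLRPCTransport())
#	print pypi.package_releases("scikits.example")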