Source

Shorten URLs service / Shorten-URLs.py

Full commit
#!/usr/bin/env python

import re

class URLShortener(object):
	"""Base class for the URL shorteners.

	A concrete subclass supplies two members that this class relies on:
	canonical_URL_exp, a compiled regular expression that matches the long
	URLs, and shorten_URL_from_match(match), which returns the replacement
	text for one match.
	"""

	# Cache mapping canonical URL -> short URL. It lives on the class, so it
	# is shared by every instance and subclass that does not rebind it.
	known_URLs = {}

	def shorten_URLs_in_string(self, original_text):
		"""Return original_text with every match of canonical_URL_exp replaced by its short URL."""
		pattern = self.canonical_URL_exp
		replacement_for_match = self.shorten_URL_from_match
		return pattern.sub(replacement_for_match, original_text)

class RetrievingURLShortener(URLShortener):
	"""Abstract subclass of URLShortener that retrieves the canonical URL with curl and extracts the short URL from the response.

	Subclasses must override canonical_URL_exp and short_URL_exp.
	canonical_URL_prefix, canonical_URL_suffix and curl_arguments may be
	left alone.
	"""

	# Subclass must provide a compiled regular expression; group 1 is the
	# canonical URL, to which the prefix/suffix below are added if missing.
	canonical_URL_exp = 'Subclass must provide regular expression object.'
	# Subclass may provide a string to prepend to canonical URLs that don't
	# already start with it.
	canonical_URL_prefix = 'http://www.'
	# Subclass may provide a string to append to canonical URLs that don't
	# already end with it. Defaults to the empty string.
	canonical_URL_suffix = ''
	# Subclass must provide a compiled regular expression; group 1 is the
	# short URL as it appears in curl's output.
	short_URL_exp = 'Subclass must provide regular expression object.'
	# Extra arguments placed between "curl" and "--location -q <URL>".
	curl_arguments = []

	def shorten_URL_from_match(self, match):
		"""Return the short URL for the canonical URL captured in match.group(1).

		The result is looked up in known_URLs first; on a miss the canonical
		URL is fetched with curl, short_URL_exp is searched in the output and
		the result is cached.

		Raises AttributeError when short_URL_exp does not match the response.
		Before raising, the response is dumped to a temporary .html file
		whose path is reported on stderr.
		"""
		canonical_URL = match.group(1)
		if not canonical_URL.startswith(self.canonical_URL_prefix):
			canonical_URL = self.canonical_URL_prefix + canonical_URL
		if not canonical_URL.endswith(self.canonical_URL_suffix):
			canonical_URL += self.canonical_URL_suffix

		try:
			return self.known_URLs[canonical_URL]
		except KeyError:
			pass  # Not cached yet; fetch it below.

		raw_response = self._fetch_response(canonical_URL)
		# The patterns are text patterns. Python 3 pipes yield bytes, which
		# must be decoded before searching; on Python 2 the output already
		# is a str and is used unchanged.
		if isinstance(raw_response, str):
			response_text = raw_response
		else:
			response_text = raw_response.decode('utf-8', 'replace')

		found = self.short_URL_exp.search(response_text)
		if found is None:
			self._dump_response(canonical_URL, raw_response)
			# AttributeError is what the failed lookup has always raised.
			raise AttributeError('Could not find short URL in response from %s' % canonical_URL)

		short_URL = found.group(1)
		self.known_URLs[canonical_URL] = short_URL
		return short_URL

	def _fetch_response(self, canonical_URL):
		"""Run curl on canonical_URL and return everything it wrote to stdout."""
		import subprocess
		# NOTE(review): curl documents -q as effective only when it is the
		# first argument; it is kept in its original position here -- confirm
		# what was intended.
		curl_args = ['curl'] + self.curl_arguments + ['--location', '-q', canonical_URL]
		curl = subprocess.Popen(curl_args, stdout=subprocess.PIPE)
		# communicate() drains and closes the pipe and waits for this very
		# process, whereas os.wait() reaps whichever child exits first.
		return curl.communicate()[0]

	def _dump_response(self, canonical_URL, raw_response):
		"""Save an unparseable response to a temporary file and report its path on stderr."""
		import os
		import sys
		import tempfile
		dumpfile_fd, dumpfile_path = tempfile.mkstemp('.html')
		sys.stderr.write('Could not find short URL in response from %s! Dumping to %s\n' % (canonical_URL, dumpfile_path))
		# The with-block closes the descriptor even if the write fails.
		with os.fdopen(dumpfile_fd, 'wb') as dumpfile:
			dumpfile.write(raw_response)

class FlickrURLShortener(RetrievingURLShortener):
	"""Shortens Flickr photo-page URLs to the flic.kr URL given in the page's <link rev="canonical"> element."""
	# Group 1 is the photo page (flickr.com/photos/<user>/<number>). An
	# optional /in/<context> tail and trailing slash are matched as well, so
	# they are replaced along with the rest of the URL.
	canonical_URL_exp = re.compile(r'(?:http://)?(?:www\.)?(flickr\.com/photos/[-_a-zA-Z0-9@]+/[0-9]+)(/in/[-a-z0-9]+)?/?')
	# The dot in the host name is escaped so that only a literal '.' matches.
	short_URL_exp = re.compile(r'<link(?: id="shorturl")? rev="canonical" type="text/html" href="(http://flic\.kr/p/[A-Za-z0-9]+)" ?/?>')

class TechCrunchURLShortener(RetrievingURLShortener):
	"""Shortens TechCrunch article URLs to the tcrn.ch URL found in the page's short_url_field input."""
	# Group 1 is techcrunch.com/YYYY/MM/DD/<slug> with an optional trailing
	# slash. The dot in the host name is escaped so that only a literal '.'
	# matches.
	canonical_URL_exp = re.compile(r'(?:http://)?(?:www\.)?(techcrunch\.com/[0-9]{4}/[0-9]{2}/[0-9]{2}/[-a-z0-9]+/?)')
	# A trailing slash is appended to canonical URLs that lack one.
	canonical_URL_suffix = '/'
	short_URL_exp = re.compile(r'<input type="text" id="short_url_field" value="(http://tcrn\.ch/[A-Za-z0-9]+)" onClick="this\.select\(\)" ?/?>')

class ArsTechnicaURLShortener(RetrievingURLShortener):
	"""Shortens Ars Technica article URLs to the arst.ch URL given in the page's <link rev="canonical"> element."""
	# Group 1 is arstechnica.com/<sections...>/YYYY/MM/<slug>.ars. The dots in
	# the host names of both patterns are escaped so that only a literal '.'
	# matches.
	canonical_URL_exp = re.compile(r'(?:http://)?(?:www\.)?(arstechnica\.com/(?:[a-z]+/)*[0-9]{4}/[0-9]{2}/[-a-z0-9]+\.ars)')
	short_URL_exp = re.compile(r'<link rev="canonical"(?: type="text/html")? href="(http://arst\.ch/[A-Za-z0-9]+)" ?/?>')

class TheMacObserverURLShortener(RetrievingURLShortener):
	"""Shortens Mac Observer article URLs to the tmo.to URL given in the page's <link rel="shorturl"> element."""
	# Group 1 is macobserver.com/tmo/article/<slug> with an optional trailing
	# slash.
	canonical_URL_exp = re.compile(r'(?:http://)?(?:www\.)?(macobserver\.com/tmo/article/[-_.a-z0-9]+/?)')
	# A trailing slash is appended to canonical URLs that lack one.
	canonical_URL_suffix = '/'
	# The dot in the host name is escaped so that only a literal '.' matches.
	short_URL_exp = re.compile(r'<link rel="shorturl" href="(http://tmo\.to/[A-Za-z0-9]+)" ?/?>')

class YouTubeURLShortener(URLShortener):
	"""Rewrites youtube.com watch URLs as youtu.be URLs, built directly from the video ID."""
	# Group 1 is "youtube.com/watch?" and group 2 is the value of the v=
	# parameter. Other query parameters before and after it are matched but
	# not captured, so they are dropped from the result.
	canonical_URL_exp = re.compile(r'(?:http://)?(?:www\.)?(youtube\.com/watch\?)(?:[-_a-zA-Z0-9]+=[-_a-zA-Z0-9]+&)*v=([-_a-zA-Z0-9]+)(?:&[-_a-zA-Z0-9]+=[-_a-zA-Z0-9]+(?:\.be)?)*')

	def shorten_URL_from_match(self, match):
		"""Return the youtu.be URL for the video in match, recording it in known_URLs."""
		watch_path, video_ID = match.group(1), match.group(2)
		canonical_URL = 'http://www.' + watch_path + 'v=' + video_ID
		# setdefault returns the cached entry if there is one, and otherwise
		# stores and returns the newly built short URL.
		return self.known_URLs.setdefault(canonical_URL, 'http://youtu.be/' + video_ID)

class AmazonURLShortener(URLShortener):
	"""Rewrites amazon.com product URLs as amzn.com/<item ID> URLs, built directly from the item ID."""
	# "[::identifier::]" is a placeholder that is expanded to the character
	# class [-_a-zA-Z0-9] before compiling. Group 1 is the item ID.
	canonical_URL_exp = re.compile(
		r'(?:http://)?(?:www\.)?(?:amazon\.com(?:(?:/[::identifier::]+){2}|/dp)/)(?:album-redirect/)?(B[A-Z0-9]+|[0-9]{10})(/ref=[::identifier::]*)?([?&][::identifier::]*(=[::identifier::]*)?)*'.replace('[::identifier::]', '[-_a-zA-Z0-9]')
	)

	def shorten_URL_from_match(self, match):
		"""Return the amzn.com URL for the item in match, recording it in known_URLs."""
		item_ID = match.group(1)
		canonical_URL = 'http://www.amazon.com/dp/' + item_ID
		cache = self.known_URLs
		if canonical_URL not in cache:
			cache[canonical_URL] = 'http://amzn.com/' + item_ID
		return cache[canonical_URL]

class AmazonWishlistURLShortener(URLShortener):
	"""Rewrites amazon.com wishlist URLs as amzn.com/w/<wishlist ID> URLs, built directly from the wishlist ID."""
	# Group 1 is the wishlist ID. The optional /ref=... and ?... tails stop at
	# the first whitespace character (\S*); an unbounded .* would swallow, and
	# thereby delete, any text following the URL on the same line.
	canonical_URL_exp = re.compile(r'(?:http://)?(?:www\.)?amazon\.com/(?:[dg]p/)?registry/wishlist/([A-Z0-9]+)(?:/ref=\S*)?(?:\?\S*)?')

	def shorten_URL_from_match(self, match):
		"""Return the amzn.com/w/ URL for the wishlist in match, recording it in known_URLs."""
		wishlist_ID = match.group(1)
		canonical_URL = 'http://www.amazon.com/registry/wishlist/' + wishlist_ID
		try:
			short_URL = self.known_URLs[canonical_URL]
		except KeyError:
			short_URL = 'http://amzn.com/w/' + wishlist_ID
			self.known_URLs[canonical_URL] = short_URL
		return short_URL

class TumblrURLShortener(RetrievingURLShortener):
	"""Shortens Tumblr post URLs to the URL given in the response's "Link: <...>; rel=shorturl" header."""
	# Tumblr blogs live on arbitrary subdomains, so only the scheme is
	# prepended rather than "http://www.".
	canonical_URL_prefix = 'http://'
	# Group 1 is [<blog>.]tumblr.com/post/<number>[/<slug>].
	canonical_URL_exp = re.compile(r'(?:http://)?((?:[-0-9a-zA-Z]+\.)?tumblr\.com/post/[0-9]+/?[-_0-9a-zA-Z]*)')
	# Discard the body and send the response headers to stdout instead, since
	# the short URL is searched for in a header line.
	curl_arguments = ['-o', '/dev/null', '-D', '/dev/stdout']
	short_URL_exp = re.compile(r'Link: <(.+)>; rel=shorturl')

import sys
# Filter script: read all of stdin, shorten every recognised URL, and write
# the result to stdout.

# One shortener per supported service; they are applied in this order.
shorteners = [
	FlickrURLShortener(),
	YouTubeURLShortener(),
	TechCrunchURLShortener(),
	ArsTechnicaURLShortener(),
	AmazonURLShortener(),
	AmazonWishlistURLShortener(),
	TheMacObserverURLShortener(),
	TumblrURLShortener(),
]

original_text = sys.stdin.read()

# Each shortener rewrites the output of the one before it.
text = original_text
for shortener in shorteners:
	text = shortener.shorten_URLs_in_string(text)

sys.stdout.write(text)