# Source
#
# resimar / clean.py

# -*- coding: utf-8 -*-

import hashlib
import re
import tidy

# first-line prefixes ("code:" / "quote:") that Block treats as an explicit block type
BLOCK_TYPES = ('code', 'quote')


class Block(object):
	"""
	defines one block of rsm text

	a block is classified by how it starts:
	  - a first line of the form ``code:...`` / ``quote:...`` (see BLOCK_TYPES)
	    gives that type; the first line is dropped from the content
	  - a leading ``* `` gives type ``ul``; the content becomes <li> elements
	  - anything else is a plain paragraph, type ``p``

	attributes set by the constructor:
	  type: 'code', 'quote', 'ul' or 'p'
	  meta: whatever follows the first ``word:`` on the first line of a
	        multi-line block, or None when there is nothing there
	  content: the block rendered to html
	  links: list of (anchor html, md5 hex digest) pairs, one per url found
	"""

	# first line "word:meta", everything after the first newline is content
	_TYPE_RE = re.compile(r'^(?P<type>\w+):(?P<meta>.*?)\n(?P<content>.*)', re.DOTALL | re.UNICODE)
	# separator between list items: optional newline, a star, whitespace
	_LI_RE = re.compile(r'\n?\*\s+')
	_URL_RE = re.compile(r'((ht|f)tp(s?)\:\/\/|~/|/)?([\w]+:\w+@)?([a-zA-Z]{1}([\w\-]+\.)+([\w]{2,5}))(:[\d]{1,5})?((/?\w+/)+|/?)(\w+\.[\w]{3,4})?((\?\w+=\w+)?(&\w+=\w+)*)', re.UNICODE)
	_LONG_WORD_RE = re.compile(r'[a-zA-Z0-9]{50,}')
	# (pattern, replacement) pairs, applied in this order: bold, italic, deleted, underline.
	# the replacements are raw strings so that \g is not read as a string escape
	_SHORT_MARKUP = (
		(re.compile(r'\*(?P<content>[\w\d]+?)\*', re.UNICODE), r'<b>\g<content></b>'),
		(re.compile(r'/(?P<content>[\w\d]+?)/', re.UNICODE), r'<i>\g<content></i>'),
		(re.compile(r'-(?P<content>[\w\d]+?)-', re.UNICODE), r'<del>\g<content></del>'),
		(re.compile(r'_(?P<content>[\w\d]+?)_', re.UNICODE), r'<u>\g<content></u>'),
	)
	_LONG_MARKUP = (
		(re.compile(r'\*\*(?P<content>.*?)\*\*', re.UNICODE), r'<b>\g<content></b>'),
		(re.compile(r'//(?P<content>.*?)//', re.UNICODE), r'<i>\g<content></i>'),
		(re.compile(r'--(?P<content>.*?)--', re.UNICODE), r'<del>\g<content></del>'),
		(re.compile(r'__(?P<content>.*?)__', re.UNICODE), r'<u>\g<content></u>'),
	)

	def __init__(self, string):
		"""
		classify ``string`` and render it to html in ``self.content``
		"""
		# array in which we keep urls
		self.links = []
		match = self._TYPE_RE.match(string)

		if match and match.group('type') in BLOCK_TYPES:
			self.type = match.group('type')
			# drop the "type:" line but keep the line breaks between the
			# remaining lines, otherwise adjacent lines get glued together
			self.content = '\n'.join(string.splitlines()[1:])
		elif string.strip()[0:2] == '* ':
			self.type = 'ul'
			self.content = self.make_li(string)
		else:
			self.type = 'p'
			self.content = string

		if match and match.group('meta'):
			self.meta = match.group('meta')
		else:
			self.meta = None

		# urls are swapped for $hash$ placeholders before the markup pass so
		# that the "//" in "http://" is not taken for italic markup; they are
		# put back by resub_links() afterwards
		self.sanitize()
		self.parse_urls()
		# self.parse_short_markup()
		self.parse_long_markup()
		self.brake_long_words()
		self.resub_links()
		# list items already carry their own <br /> tags (see make_li)
		if self.type != 'ul':
			self.nl2br()

	def sanitize(self):
		"""
		get rid of leading and trailing whitespace
		"""
		self.content = self.content.strip()

	def make_li(self, string):
		"""
		turn a ``* item`` list into a string of <li> elements

		newlines inside an item become html breaks
		"""
		stuff = []
		for li in self._LI_RE.split(string):
			if li != '':
				stuff.append('\n<li>\n%s\n</li>' % li.replace('\n', '<br />\n'))
		return ''.join(stuff)

	def create_nice_url(self, match):
		"""
		re.sub callback: remember the anchor for a matched url and return
		the ``$hash$`` placeholder that stands in for it

		link texts longer than 30 characters are cut and get a ``[...]`` suffix
		"""
		url = match.group(0)
		if len(url) > 30:
			name = url[0:30] + u'[...]'
		else:
			name = url
		link = u'<a href="' + url + '">' + name + '</a>'
		# md5 works on bytes; hashing the text itself breaks on non-ascii urls
		link_hash = hashlib.md5(link.encode('utf-8')).hexdigest()
		self.links.append((link, link_hash))
		return '$' + link_hash + '$'

	def parse_urls(self):
		"""replace every url in the content with a placeholder"""
		self.content = self._URL_RE.sub(self.create_nice_url, self.content)

	def resub_links(self):
		"""get back in the urls we replaced with a safe string earlier"""
		for link, link_hash in self.links:
			self.content = self.content.replace(u'$' + link_hash + u'$', link)

	def parse_short_markup(self):
		"""parses short markup like _this_ (not called by the constructor)"""
		for pattern, replacement in self._SHORT_MARKUP:
			self.content = pattern.sub(replacement, self.content)

	def parse_long_markup(self):
		"""parses doubled markup: **bold**, //italic//, --deleted--, __underlined__"""
		for pattern, replacement in self._LONG_MARKUP:
			self.content = pattern.sub(replacement, self.content)

	def brake_long_word(self, match):
		"""
		re.sub callback for words of 50+ characters

		the word is returned unchanged; actual breaking is not implemented
		"""
		return match.group(0)

	def brake_long_words(self):
		"""run brake_long_word over every run of 50+ ascii letters/digits"""
		self.content = self._LONG_WORD_RE.sub(self.brake_long_word, self.content)

	def nl2br(self):
		"""
		replace newlines with html breaks
		"""
		self.content = self.content.replace('\n', '<br />')
	
	# def __str__(self):
	# 	if self.type == 'p' or self.type == 'code':
	# 		output = '\n<%s>\n%s\n</%s>' % (self.type, self.content, self.type)
	# 	else:
	# 		output = '\n<blockquote>\n<p>\n%s</p>\n</blockquote>' % self.content
	# 	return output

# TODO: make it possible to escape stuff
# TODO: add lists
# TODO: trim words/links that are too long
# TODO: make urls nameable
# TODO: think of something for _stuff_ (e.g. cd-rom-drive)
class SimpleMarkupText(object):
	"""
	instance of a text that has been parsed by sm

	input: original input (as text; byte strings are decoded as utf8)
	blocks: blocks of text
	output: final parsed text
	"""

	# three or more newlines in a row collapse to a single blank line
	_EXTRA_NEWLINES_RE = re.compile(r'(\n){3,}')

	def __init__(self, input, tidy=True):
		"""
		parse ``input`` and leave the html in ``self.output``

		input: the text to parse; a byte string is decoded as utf8
		tidy: when true the html is run through tidy at the end

		raises TypeError when ``input`` is neither text nor a byte string
		"""
		# the old ``assert(cond, msg)`` asserted a non-empty tuple and so could
		# never fail; check for real, but keep accepting byte strings by
		# decoding them instead of rejecting callers that used to work
		if isinstance(input, bytes):
			input = input.decode('utf8')
		elif not isinstance(input, type(u'')):
			raise TypeError('input isnt unicode string')

		self.blocks = []
		self.code_open = 0
		self.quote_open = 0

		self.input = input
		self.output = self.input
		self.sanitize()
		self.split_blocks()
		self.join_blocks()
		# ``tidy`` here is the boolean argument; self.tidy is the method
		if tidy:
			self.tidy()

	def sanitize(self):
		"""
		gets rid of tags by replacing < and > by the corresponding html entities
		also collapses runs of three or more newlines into two
		"""
		# tags have to go first
		escaped = self.output.replace('<', '&lt;').replace('>', '&gt;')
		# now we get rid of linebreaks
		self.output = self._EXTRA_NEWLINES_RE.sub('\n\n', escaped)

	def split_blocks(self):
		"""
		a block is a paragraph of text defined by two newlines
		"""
		for blockstring in self.output.split('\n\n'):
			self.blocks.append(Block(blockstring))

	def parse_blocks(self):
		"""unimplemented hook; does nothing and is not called by the constructor"""
		pass

	def join_blocks(self):
		"""
		wrap every block in the html element for its type and
		concatenate the results into ``self.output``
		"""
		parts = []
		for block in self.blocks:
			if block.type == 'p' or block.type == 'code':
				parts.append('\n<%s>\n%s\n</%s>' % (block.type, block.content, block.type))
			elif block.type == 'ul':
				parts.append('\n<ul>%s\n</ul>' % block.content)
			else:
				# everything else is rendered as a quote
				parts.append('\n<blockquote>\n<p>\n%s</p>\n</blockquote>' % block.content)
		self.output = ''.join(parts)

	def tidy(self):
		"""
		run the output through the ``tidy`` module and store the result as text
		"""
		# inside this method the bare name ``tidy`` is the imported module
		document = tidy.parseString(self.output.encode('utf8'), char_encoding='utf8', indent=True, output_xhtml=True, show_body_only=True)
		self.output = str(document).decode('utf8')


def simple_markup(text, **kwargs):
	"""
	parse ``text`` with SimpleMarkupText and return the resulting output

	keyword arguments are handed to SimpleMarkupText unchanged
	"""
	parsed = SimpleMarkupText(text, **kwargs)
	return parsed.output

# print simple_markup2(text)
# print simple_markup2(text2)
# print simple_markup2(text3)