# -*- coding: utf-8 -*-

__author__ = "Christian Kaula <chris@christiankaula.com>"
__version__ = "0.2.0"
# __url__ = "http://wiki.chad.org/SmartyPantsPy"
__description__ = "REallySImpleMARkup for stuff for which BBCode would be overkill."

import hashlib
import re
import tidy

class Block(object):
	"""
	defines one block of rsm text
	"""
	
	def __init__(self, content, meta=None):
		self.meta = meta
		self.content = content
		
		# array in which we keep urls
		self.links = []
		
	def nl2br(self):
		self.content = self.content.replace('\n', '<br />\n')


	def create_nice_url(self, match):
		# shorten the visible link text for overly long urls
		if len(match.group(0)) > 50:
			name = match.group(0)[0:50] + u'[...]'
		else:
			name = match.group(0)
		link = u'<a href="' + match.group(0) + u'">' + name + u'</a>'
		# stash the finished link under a hash so later parsing steps cannot touch the url
		link_hash = hashlib.md5(link.encode('utf8')).hexdigest()
		self.links.append((link, link_hash))
		return u'$' + link_hash + u'$'

	def parse_urls(self):
		# props to Oz http://regexlib.com/REDetails.aspx?regexp_id=1719
		# with changes of mine
		url_re =  re.compile(r"(http://|https://|ftp://)([a-zA-Z0-9]+\.[a-zA-Z0-9\-]+|[a-zA-Z0-9\-]+)\.[a-zA-Z\.]{2,6}(/[a-zA-Z0-9\.\?=/#%&\+-_]+|/|)")
		self.content = url_re.sub(self.create_nice_url, self.content)


	def create_nice_email_url(self, match):
		link = u'<a href="mailto:' + match.group(0) + u'">' + match.group(0) + u'</a>'
		# same placeholder-hash trick as create_nice_url
		link_hash = hashlib.md5(link.encode('utf8')).hexdigest()
		self.links.append((link, link_hash))
		return u'$' + link_hash + u'$'

	def parse_emails(self):
		# props to Roger Ramjet http://regexlib.com/REDetails.aspx?regexp_id=328
		email_re =  re.compile(r"(((\"[^\"\f\n\r\t\v\b]+\")|([\w\!\#\$\%\&\'\*\+\-\~\/\^\`\|\{\}]+(\.[\w\!\#\$\%\&\'\*\+\-\~\/\^\`\|\{\}]+)*))@((\[(((25[0-5])|(2[0-4][0-9])|([0-1]?[0-9]?[0-9]))\.((25[0-5])|(2[0-4][0-9])|([0-1]?[0-9]?[0-9]))\.((25[0-5])|(2[0-4][0-9])|([0-1]?[0-9]?[0-9]))\.((25[0-5])|(2[0-4][0-9])|([0-1]?[0-9]?[0-9])))\])|(((25[0-5])|(2[0-4][0-9])|([0-1]?[0-9]?[0-9]))\.((25[0-5])|(2[0-4][0-9])|([0-1]?[0-9]?[0-9]))\.((25[0-5])|(2[0-4][0-9])|([0-1]?[0-9]?[0-9]))\.((25[0-5])|(2[0-4][0-9])|([0-1]?[0-9]?[0-9])))|((([A-Za-z0-9\-])+\.)+[A-Za-z\-]+)))")
		self.content = email_re.sub(self.create_nice_email_url, self.content)

	
	def resub_links(self):
		"""get back in the urls we replaced with a safe string earlier"""
		for link, link_hash in self.links:
			# print u'$' + link_hash + u'$', link
			self.content = self.content.replace(u'$' + link_hash + u'$', link)


	def parse_short_markup(self):
		"""parses short markup like _this_"""
		b_re = re.compile(r'\*(?P<content>[\w\d]+?)\*', re.UNICODE)
		self.content = b_re.sub(r'<b>\g<content></b>', self.content)
		i_re = re.compile(r'/(?P<content>[\w\d]+?)/', re.UNICODE)
		self.content = i_re.sub(r'<i>\g<content></i>', self.content)
		s_re = re.compile(r'-(?P<content>[\w\d]+?)-', re.UNICODE)
		self.content = s_re.sub(r'<del>\g<content></del>', self.content)
		u_re = re.compile(r'_(?P<content>[\w\d]+?)_', re.UNICODE)
		self.content = u_re.sub(r'<u>\g<content></u>', self.content)


	def parse_long_markup(self):
		db_re = re.compile(r'\*\*(?P<content>.*?)\*\*', re.UNICODE)
		self.content = db_re.sub(r'<b>\g<content></b>', self.content)
		di_re = re.compile(r'//(?P<content>.*?)//', re.UNICODE)
		self.content = di_re.sub(r'<i>\g<content></i>', self.content)
		ds_re = re.compile(r'--(?P<content>.*?)--', re.UNICODE)
		self.content = ds_re.sub(r'<del>\g<content></del>', self.content)
		du_re = re.compile(r'__(?P<content>.*?)__', re.UNICODE)
		self.content = du_re.sub(r'<u>\g<content></u>', self.content)
 
	
	def parse(self):
		# self.sanitize()
		self.parse_urls()
		self.parse_emails()
		# self.parse_short_markup()
		self.parse_long_markup()
		# self.brake_long_words()
		self.resub_links()


class ParagraphBlock(Block):
	def parse(self):
		super(ParagraphBlock, self).parse()
		self.nl2br()
		return u'<p>\n%s\n</p>' % self.content


class CodeBlock(Block):
	def parse(self):
		super(CodeBlock, self).parse()
		endcode_re = re.compile(r'\n:code\s*$')
		self.content = endcode_re.sub('', self.content)
		output = u'<code>\n%s\n</code>' % self.content
		if self.meta:
			output = u'<div class="code-meta">%s</div>\n' % self.meta + output
		return output
		

class QuoteBlock(Block):
	def parse(self):
		super(QuoteBlock, self).parse()
		# BUG: call nl2br somewhere appropriate
		endquote_re = re.compile(r'\n:quote\s*$')
		self.content = endquote_re.sub('', self.content)
		output = u'<p>\n%s\n</p>' % self.content
		if self.meta:
			output = u'<div class="quote-meta">\n%s\n</div>\n' % self.meta + output
		return u'<blockquote>\n%s\n</blockquote>' % output
	

class ListBlock(Block):
	def parse_list(self):
		li_re = re.compile(r'(?:^|\n)\s*(?:\*|-|#)\s+', re.UNICODE)
		lis = li_re.split(self.content)
		output = []
		for li in lis:
			if not li.strip(' \n\r') == '':
				output.append('<li>\n  %s\n</li>' % li.strip(' \n\r').replace('\n', '<br />\n'))
		self.content = '\n'.join(output)
	
	def parse(self):
		self.parse_list()
		super(ListBlock, self).parse()


class OrderedListBlock(ListBlock):
	def parse(self):
		super(OrderedListBlock, self).parse()
		output = u'<ol>\n%s\n</ol>' % self.content
		return output
		

class UnorderedListBlock(ListBlock):
	def parse(self):
		super(UnorderedListBlock, self).parse()
		output = u'<ul>\n%s\n</ul>' % self.content
		return output


# TODO: make possible to escape stuff
# TODO: trim too long words/links
# TODO: make urls nameable
# TODO: think of something for _stuff_ cd-rom-drive
class ResimarText(object):
	"""
	instance of a text that has been parsed by sm

	input: original input
	blocks: blocks of text
	output: final parsed text
	"""
	
	def __init__(self, input, tidy=True):
		# remember whether the output should be run through HTML Tidy
		self.use_tidy = tidy
		self.blocks = []
		self.code_open = 0
		self.quote_open = 0

		if input is None:
			self.content = u''
			return

		# make sure input is unicode
		if not isinstance(input, unicode):
			input = input.decode('utf8')
		self.content = input


	def brake_long_word(self, match):
		# TODO: actually break up overly long words; for now they pass through unchanged
		return match.group(0)
	
	def sanitize(self):
		"""
		gets rid of tags by replacing < and > by the corresponding html entities
		replace carriage returns and stuff that doesnt belong with newlines
		also replaces newlines that are too much
		"""
		# tags have to go first
		self.content = self.content.replace('<', '&lt;')
		self.content = self.content.replace('>', '&gt;')
		# remove tabs
		self.content = self.content.replace('\t', '')
		# normalize linebreaks
		nl_re = re.compile(r'(\r\n|\r)', re.UNICODE)
		self.content = nl_re.sub('\n', self.content)
		# now we get rid of messy linebreaks
		br_re = re.compile(r'(\n){3,}', re.UNICODE)
		self.content = br_re.sub('\n\n', self.content)
		# remove messy spaces
		space_re = re.compile(r'(\n+) +', re.UNICODE)
		# print space_re.findall(self.content)
		self.content = space_re.sub(r'\1', self.content)
		# break long words
		long_re = re.compile(r'[a-zA-Z0-9]{50,}', re.UNICODE)
		self.content = long_re.sub(self.brake_long_word, self.content)

	
	def split_blocks(self):
		"""
		a block is a paragraph of text defined by two newlines
		split blocks of text and convert them into objects
		"""
		for blockstring in self.content.strip(' \n\r').split('\n\n'):
			type_re = re.compile(r'^(?P<type>\w+):(?P<meta>.*?)\n(?P<content>.*)', re.DOTALL | re.UNICODE)
			match = type_re.match(blockstring)
			
			if match:
				if match.group('type'):
					blocktype = match.group('type').lower()
				else:
					blocktype = None
				if match.group('meta'):
					blockmeta = match.group('meta')
				else:
					blockmeta = None
			
			blockident = blockstring[0:2]
			# print self.content
			# print 'bs', blockstring
			# print 'bi', blockident

			if match and blocktype and blocktype in ('code', 'quote'):
				if blocktype == 'code':
					self.blocks.append(CodeBlock(match.group('content'), blockmeta))
				elif blocktype == 'quote':
					self.blocks.append(QuoteBlock(match.group('content'), blockmeta))
					
			elif blockident == '* ' or blockident == '- ':
				self.blocks.append(UnorderedListBlock(content=blockstring))
			elif blockident == '# ':
				self.blocks.append(OrderedListBlock(content=blockstring))
			else:
				self.blocks.append(ParagraphBlock(blockstring))


	def tidy(self):
		"""run the generated markup through HTML Tidy and keep the result as unicode"""
		output = tidy.parseString(
			self.content.encode('utf8'),
			char_encoding='utf8',
			indent=True,
			output_xhtml=True,
			show_body_only=True,
		)
		self.content = str(output).decode('utf8')


	def parse(self):
		if self.content == u'':
			return u''
			
		self.sanitize()
		self.split_blocks()

		output = u''
		for block in self.blocks:
			output = output + block.parse() + u'\n'

		# keep the assembled markup on the instance so tidy() can clean it up
		self.content = output

		if self.use_tidy:
			self.tidy()

		return self.content
	

def resimar(text, **kwargs):
	# print [SimpleMarkupText(text, **kwargs).output]
	return ResimarText(text, **kwargs).parse()
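

# A minimal usage sketch (the sample text below is made up for illustration;
# it assumes the tidy module imported above is available).
if __name__ == '__main__':
	sample = (
		u"This is **really simple markup** with a link to http://example.com\n"
		u"\n"
		u"* first item\n"
		u"* second item\n"
		u"\n"
		u"code:example.py\n"
		u"print 'hello'\n"
		u":code"
	)
	print resimar(sample)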