Commits

Christian Kaula  committed f7a6081

added todos

  • Participants
  • Parent commits 559d40b

Comments (0)

Files changed (1)

File simple_markup.py

 import re
 import tidy
 
-text = """
+text = u"""
 ich bin __lustiger blabla__ text__ der=-- zum --testen gut is.
 ich hab **auch //mehere** zeilen//.
 
 blargh
 :quote
 
-code:"c"
+code:c
 int a = 1;
 int b = 2;
 if a > b:
 <script>
 :code
 
-code:"java"
+
+
+code:java
 blargh
 
-quote:"jo dude"
+quote:jo dude
 ich war ärster
 und zweiter auch
 :quote
 das is //auch// zum testen gut.
 <script type="javascript"></script>
 """
-text = unicode(text, 'latin1')
 
+class Block(object):
+	"""
+	defines one block of rsm text
+	"""
+	
+	def __init__(self, string):
+		# type_re = re.compile(r'(?P<type>code|quote):("(?P<meta>.*?)")?\n(?P<content>.*?)', re.DOTALL | re.MULTILINE)
+		type_re = re.compile(r'^(?P<type>\w+):(?P<meta>.*?)\n(?P<content>.*)', re.DOTALL | re.UNICODE)
+		match = type_re.match(string)
+
+		if match and match.group('type'):
+			self.type = match.group('type')
+			self.content = ''.join(string.splitlines()[1:])
+		else:
+			self.type = 'p'
+			self.content = string
+		if match and match.group('meta'):
+			self.meta = match.group('meta')
+		else:
+			self.meta = None
+		
+		self.sanitize()
+		self.parse_short_markup()
+		self.parse_long_markup()
+		self.nl2br()
+
+	def sanitize(self):
+		"""
+		get rid of leading and trailing whitespace (including newlines)
+		"""
+		self.content = self.content.strip()
+
+	def parse_short_markup(self):
+		"""parses short markup like _this_"""
+		# bold
+		b_re = re.compile(r'\*(?P<content>[\w\d]+?)\*')
+		self.content = b_re.sub('<b>\g<content></b>', self.content)
+		# italic
+		i_re = re.compile(r'/(?P<content>[\w\d]+?)/')
+		self.content = i_re.sub('<i>\g<content></i>', self.content)
+		# deleted
+		s_re = re.compile(r'-(?P<content>[\w\d]+?)-')
+		self.content = s_re.sub('<del>\g<content></del>', self.content)
+		# underline
+		u_re = re.compile(r'_(?P<content>[\w\d]+?)_')
+		self.content = u_re.sub('<u>\g<content></u>', self.content)
+
+	def parse_long_markup(self):
+		# bold
+		db_re = re.compile(r'\*\*(?P<content>.*?)\*\*')
+		self.content = db_re.sub('<b>\g<content></b>', self.content)
+		# italic
+		di_re = re.compile(r'//(?P<content>.*?)//')
+		self.content = di_re.sub('<i>\g<content></i>', self.content)
+		# deleted
+		ds_re = re.compile(r'--(?P<content>.*?)--')
+		self.content = ds_re.sub('<del>\g<content></del>', self.content)
+		# underlined
+		du_re = re.compile(r'__(?P<content>.*?)__')
+		self.content = du_re.sub('<u>\g<content></u>', self.content)
+
+	def nl2br(self):
+		"""
+		replace newlines with html breaks
+		"""
+		self.content = self.content.replace('\n', '<br />')
+	
+	# def __str__(self):
+	# 	if self.type == 'p' or self.type == 'code':
+	# 		output = '\n<%s>\n%s\n</%s>' % (self.type, self.content, self.type)
+	# 	else:
+	# 		output = '\n<blockquote>\n<p>\n%s</p>\n</blockquote>' % self.content
+	# 	return output
+
+# TODO: make it possible to escape markup characters
+# TODO: add lists
 class SimpleMarkupText(object):
+	"""
+	instance of a text that has been parsed by sm
+
+	input: original input
+	blocks: blocks of text
+	output: final parsed text
+	"""
 	
-	def __init__(self, input):
-		self.input = sanitize(input)
-		return self.output
+	def __init__(self, input, tidy=True):
+		# print type(input) is unicode
+		assert(type(input) is unicode, 'input isnt unicode string')
+		self.blocks = []
+		self.code_open = 0
+		self.quote_open = 0
+		
+		self.input = input
+		self.output = self.input
+		self.sanitize()
+		self.split_blocks()
+		self.join_blocks()
+		if tidy:
+			self.tidy()
 	
 	def sanitize(self):
-		text = text.replace('<', '&lt;')
-		text = text.replace('>', '&gt;')
-		text = text.replace('\n', '<br />')
-		return text
+		"""
+		neutralizes tags by replacing < and > with the corresponding html entities
+		also collapses runs of three or more newlines into exactly two
+		"""
+		# tags have to go first
+		self.output = self.output.replace('<', '&lt;')
+		self.output = self.output.replace('>', '&gt;')
+		# now we get rid of linebreaks
+		br_re = re.compile(r'(\n){3,}')
+		self.output = br_re.sub('\n\n', self.output)
+	
+	def split_blocks(self):
+		"""
+		a block is a paragraph of text; blocks are separated by two consecutive newlines
+		"""
+		# print self.output.encode('utf8')
+		# print self.output.split('\n\n').__len__()
+		for blockstring in self.output.split('\n\n'):
+			block = Block(blockstring)
+			self.blocks.append(block)
+		
+	def parse_blocks(self):
+		pass
+		# code_re = re.compile(r'(?P<type>code|quote):("(?P<meta>.*?)")?\n(?P<content>.*?)\n:\1', re.DOTALL | re.MULTILINE)
+		# match = code_re.match(string)
+		# 
+		# for block in self.blocks:
+		# 	block
+	
+	def join_blocks(self):
+		self.output = ''
+		for block in self.blocks:
+			content = block.content
+			if block.type == 'p' or block.type == 'code':
+				output = '\n<%s>\n%s\n</%s>' % (block.type, block.content, block.type)
+			else:
+				output = '\n<blockquote>\n<p>\n%s</p>\n</blockquote>' % block.content
+			self.output += output
+	
+	def tidy(self):
+		self.output = tidy.parseString(self.output.encode('utf8'), char_encoding='utf8', indent=True, output_xhtml=True, show_body_only=True).__str__().decode('utf8')
+
+
+def simple_markup2(text, **kwargs):
+	return SimpleMarkupText(text, **kwargs).output
+
+print simple_markup2(text).encode('utf8')
+
 
 def simple_markup(text):
 	def clean(text):