creole-py /

Full commit
#!/usr/bin/env python
# -*- coding: utf-8 -*-

WikiCreole to HTML converter
This program is an example of how the WikiCreole parser
can be used.

@copyright: 2007 MoinMoin:RadomirDopieralski
@license: BSD, see COPYING for details.

Test cases contributed by Jan Klopper,
modified by Radomir Dopieralski (MoinMoin:RadomirDopieralski).

>>> import lxml.html.usedoctest
>>> def parse(text):
...     print HtmlEmitter(Parser(text).parse()).emit()

>>> parse(u'test')

>>> parse(u'test\ntest')
<p>test test</p>

>>> parse(u'test\n\ntest')

>>> parse(u'test\\\\test')

>>> parse(u'ÓÔÕÖØÙÚÛÜÝßàáâãäåæçèéêëìíîïñòóôõöøùúûüýÿŒœ%0A')

>>> parse(u'----')

>>> parse(u'==test==')

>>> parse(u'== test')

>>> parse(u'==test====')

>>> parse(u'=====test')

>>> parse(u'==test==\ntest\n===test===')

>>> parse(u'test\n* test line one\n * test line two\ntest\n\ntest')
    <li>test line one</li>
    <li>test line two test</li>

>>> parse(u'* test line one\n* test line two\n** Nested item')
    <li>test line one</li>
    <li>test line two<ul>
        <li>Nested item</li>

>>> parse(u'* test line one\n* test line two\n# Nested item')
    <li>test line one</li>
    <li>test line two<ol>
        <li>Nested item</li>

>>> parse(u'test //test test// test **test test** test')
<p>test <i>test test</i> test <b>test test</b> test</p>

>>> parse(u'test //test **test// test** test')
<p>test <i>test <b>test<i> test<b> test</b></i></b></i></p>

>>> parse(u'**test')

>>> parse(u'|x|y|z|\n|a|b|c|\n|d|e|f|\ntest')

>>> parse(u'|=x|y|=z=|\n|a|b|c|\n|d|e|f|')

>>> parse(u'test test')
<p>test <a href=""></a> test</p>

>>> parse(u',test, test')
<p><a href=",test">,test</a>, test</p>

>>> parse(u'(')
<p>(<a href=""></a>)</p>

XXX This might be considered a bug, but it's impossible to detect in general.
>>> parse(u'')
<p><a href=""></a>)</p>

>>> parse(u'')
<p><a href=";test=1">;test=1</a></p>

>>> parse(u'~')

>>> parse(u'')
<p><a href=""></a></p>

>>> parse(u'[[test]] [[tset|test]]')
<p><a href="test">test</a> <a href="tset">test</a></p>

>>> parse(u'[[|test]]')
<p><a href="">test</a></p>

import re
from creole import Parser

class Rules:
    """Regular-expression fragments describing link targets.

    The fragments are plain pattern strings, not compiled objects.  The
    ``interwiki`` fragment is laid out with insignificant whitespace, so
    any pattern built from it must be compiled with ``re.X``.
    """

    # For the link targets:
    # URL schemes that mark a target as an external address.
    proto = r'http|https|ftp|nntp|news|mailto|telnet|file|irc'
    # Whole external address; exposes groups ``extern_addr`` (the full
    # target) and ``extern_proto`` (the scheme before the colon).
    extern = r'(?P<extern_addr>(?P<extern_proto>%s):.*)' % proto
    # ``WikiName:page`` reference; exposes groups ``inter_wiki`` and
    # ``inter_page``.
    interwiki = r'''
            (?P<inter_wiki> [A-Z][a-zA-Z]+ ) :
            (?P<inter_page> .* )
        '''

class HtmlEmitter:
    """
    Generate HTML output for the document
    tree consisting of DocNodes.

    Each node is rendered by the method named ``<kind>_emit``, where
    ``<kind>`` is the node's ``kind`` attribute; kinds without a matching
    method are routed to ``default_emit``.  Call ``emit()`` to render the
    tree that was passed to the constructor.
    """

    # Recognises link/image targets that are external addresses or
    # interwiki references.  re.X is required because Rules.interwiki is
    # written with insignificant whitespace.
    addr_re = re.compile('|'.join([
            Rules.extern,
            Rules.interwiki,
        ]), re.X | re.U) # for addresses

    def __init__(self, root):
        """Remember ``root``, the top node of the tree to be emitted."""
        self.root = root

    def get_text(self, node):
        """Try to emit whatever text is in the node.

        Returns the content of the node's first child when it has one,
        otherwise the node's own content.  Empty or ``None`` content is
        returned as ``''``.
        """

        try:
            return node.children[0].content or ''
        except (AttributeError, IndexError, TypeError):
            # No usable first child: fall back to the node itself.
            return node.content or ''

    def html_escape(self, text):
        """Escape ``&``, ``<`` and ``>`` for use in HTML text content."""
        # '&' must be replaced first so the entities produced by the later
        # replacements are not escaped a second time.
        return text.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')

    def attr_escape(self, text):
        """Escape ``text`` for use inside a double-quoted HTML attribute."""
        # The entity needs its terminating semicolon to be well-formed.
        return self.html_escape(text).replace('"', '&quot;')

    # *_emit methods for emitting nodes of the document:

    def document_emit(self, node):
        """Emit the document node: just the concatenated children."""
        return self.emit_children(node)

    def text_emit(self, node):
        """Emit a text node as escaped HTML text."""
        return self.html_escape(node.content)

    def separator_emit(self, node):
        """Emit a horizontal rule."""
        return u'<hr>'

    def paragraph_emit(self, node):
        """Emit a paragraph wrapping the node's children."""
        return u'<p>%s</p>\n' % self.emit_children(node)

    def bullet_list_emit(self, node):
        """Emit an unordered list wrapping the node's children."""
        return u'<ul>\n%s</ul>\n' % self.emit_children(node)

    def number_list_emit(self, node):
        """Emit an ordered list wrapping the node's children."""
        return u'<ol>\n%s</ol>\n' % self.emit_children(node)

    def list_item_emit(self, node):
        """Emit a list item wrapping the node's children."""
        return u'<li>%s</li>\n' % self.emit_children(node)

    def table_emit(self, node):
        """Emit a table wrapping the node's children."""
        return u'<table>\n%s</table>\n' % self.emit_children(node)

    def table_row_emit(self, node):
        """Emit a table row wrapping the node's children."""
        return u'<tr>%s</tr>\n' % self.emit_children(node)

    def table_cell_emit(self, node):
        """Emit a table data cell wrapping the node's children."""
        return u'<td>%s</td>' % self.emit_children(node)

    def table_head_emit(self, node):
        """Emit a table header cell wrapping the node's children."""
        return u'<th>%s</th>' % self.emit_children(node)

    def emphasis_emit(self, node):
        """Emit emphasised (italic) text wrapping the node's children."""
        return u'<i>%s</i>' % self.emit_children(node)

    def strong_emit(self, node):
        """Emit strong (bold) text wrapping the node's children."""
        return u'<b>%s</b>' % self.emit_children(node)

    def header_emit(self, node):
        """Emit a heading whose tag number is ``node.level``."""
        return u'<h%d>%s</h%d>\n' % (
            node.level, self.html_escape(node.content), node.level)

    def code_emit(self, node):
        """Emit inline code from the node's escaped content."""
        return u'<tt>%s</tt>' % self.html_escape(node.content)

    def link_emit(self, node):
        """Emit an anchor whose ``href`` is the node's content.

        The link text is the rendered children when there are any,
        otherwise the escaped target itself.

        Raises NotImplementedError for interwiki targets.
        """
        target = node.content
        if node.children:
            inside = self.emit_children(node)
        else:
            inside = self.html_escape(target)
        m = self.addr_re.match(target)
        if m:
            if m.group('extern_addr'):
                return u'<a href="%s">%s</a>' % (
                    self.attr_escape(target), inside)
            elif m.group('inter_wiki'):
                raise NotImplementedError
        # Anything else is emitted as a link to the target as written.
        return u'<a href="%s">%s</a>' % (
            self.attr_escape(target), inside)

    def image_emit(self, node):
        """Emit an image whose ``src`` is the node's content.

        The ``alt`` text comes from ``get_text(node)``.

        Raises NotImplementedError for interwiki targets.
        """
        target = node.content
        text = self.get_text(node)
        m = self.addr_re.match(target)
        if m:
            if m.group('extern_addr'):
                return u'<img src="%s" alt="%s">' % (
                    self.attr_escape(target), self.attr_escape(text))
            elif m.group('inter_wiki'):
                raise NotImplementedError
        # Anything else is emitted with the target as written.
        return u'<img src="%s" alt="%s">' % (
            self.attr_escape(target), self.attr_escape(text))

    def macro_emit(self, node):
        """Macros are not supported; always raises NotImplementedError."""
        raise NotImplementedError

    def break_emit(self, node):
        """Emit a forced line break."""
        return u"<br>"

    def preformatted_emit(self, node):
        """Emit a preformatted block from the node's escaped content."""
        return u"<pre>%s</pre>" % self.html_escape(node.content)

    def default_emit(self, node):
        """Fallback function for emitting unknown nodes.

        Always raises TypeError, naming the offending node kind.
        """

        raise TypeError(
            'no emitter for node kind %r' % (getattr(node, 'kind', None),))

    def emit_children(self, node):
        """Emit all the children of a node."""

        return u''.join([self.emit_node(child) for child in node.children])

    def emit_node(self, node):
        """Emit a single node."""

        # Dispatch on the node kind; unknown kinds go to default_emit.
        emit = getattr(self, '%s_emit' % node.kind, self.default_emit)
        return emit(node)

    def emit(self):
        """Emit the document represented by self.root DOM tree."""

        return self.emit_node(self.root)

if __name__ == "__main__":
    # Command-line filter: read WikiCreole markup from standard input and
    # write the resulting HTML to standard output, both as UTF-8.
    import sys
    # Input bytes that are not valid UTF-8 are dropped ('ignore') instead of
    # aborting the conversion.
    document = Parser(unicode(sys.stdin.read(), 'utf-8', 'ignore')).parse()
    sys.stdout.write(HtmlEmitter(document).emit().encode('utf-8', 'ignore'))