Commits

Anonymous committed 9a1f567

initial check-in

Comments (0)

Files changed (11)

+version 0.21
+(Nov ?? 2007)
+ * added inline_no_wiki_monospace option to dialects.Creole10 
+ * refactored core.pre_escape()
+ * CHANGES.txt, LISCENCE.txt, and INSTALL.txt added to distribution
+ * added use_additions option to dialects.Creole10
+ * implemented monospace (##) (for use_additions option only) 
+
+
+Version 0.2
+(Oct 26 2007)
+
+ * module split into separate files to create the creoleparser package
+ * other refactoring
+
+
+Version 0.1, 0.1.1, 0.1.2
+(Oct 2007)
+
+ * First public releases
+Installing CreoleParser
+=======================
+
+Prerequisites
+-------------
+
+ * Python 2.4+ <http://www.python.org>
+ * Genshi 0.4+ <http://genshi.edgewall.org>
+
+Installation
+------------
+
+Once you've downloaded and unpacked a Creoleparser source release,
+enter the directory where the archive was unpacked, and run:
+
+  $ python setup.py install
+
+Note that you may need administrator/root privileges for this step,
+as this command will by default attempt to install Creoleparser to
+the Python site-packages directory on your system.
+
+Support
+-------
+
+If you encounter any problems with Creoleparser, please don't
+hesitate to ask questions on the WikiCreole.org wiki:
+
+http://wikicreole.org/wiki/Creoleparser.py
+
+Copyright (c) 2007 Stephen Day
+
+Permission is hereby granted, free of charge, to any person obtaining a copy 
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+include *.txt
+
+About Creoleparser
+==================
+
+Creoleparser is a Python implementation of a parser for the Creole wiki markup language.
+
+For more information please visit:
+
+http://wikicreole.org/wiki/Creoleparser.py
+

creoleparser/__init__.py

+# __init__.py
+#
+# Copyright (c) 2007 Stephen Day
+#
+# This module is part of Creoleparser and is released under
+# the MIT License: http://www.opensource.org/licenses/mit-license.php
+#
+"""
+This is a Python implementation of a parser for the Creole wiki markup language.
+The specification of that can be found at http://wikicreole.org/wiki/Creole1.0
+
+Basic Usage
+===========
+>>> from creoleparser import creole_to_xhtml
+
+Simply call the creole_to_xhtml() function with one argument (the text to be parsed):
+
+>>> print creole_to_xhtml("Some real **simple** mark-up"),
+<p>Some real <strong>simple</strong> mark-up</p>
+
+To customize things a little, create your own dialect and parser:
+
+>>> from creoleparser.dialects import Creole10
+>>> from creoleparser.core import Parser
+
+>>> my_dialect=Creole10(wiki_links_base_url='http://www.mysite.net/',
+... interwiki_links_base_urls=dict(wikicreole='http://wikicreole.org/wiki/'))
+
+>>> my_parser = Parser(dialect=my_dialect)
+
+>>> print my_parser("[[Home]] and [[wikicreole:Home]]"),
+<p><a href="http://www.mysite.net/Home">Home</a> and <a href="http://wikicreole.org/wiki/Home">wikicreole:Home</a></p>
+
+TODO
+====
+ - Add (a lot) more docstrings (done)
+ - Package this module properly and make it easy_install'able (done)
+ - Add a 'use_additions' option to the Creole class (and implement them!)
+ - Move the tests to a separate file (done)
+ - Compile the re's used for preprocessing (done)
+"""
+
+from core import Parser
+from dialects import Creole10
+
+__docformat__ = 'restructuredtext en'
+
+
+creole_to_xhtml = Parser(dialect=Creole10(wiki_links_base_url='http://www.wikicreole.org/wiki/',
+                             interwiki_links_base_urls={'Ohana':'http://wikiohana.net/cgi-bin/wiki.pl/'}))
+"""This is a parser created for convenience"""
+
+
+def _test():
+    """Run this module's doctests (the usage examples in the module docstring)."""
+    import doctest
+    doctest.testmod()
+
+if __name__ == "__main__":
+    _test()

creoleparser/core.py

+# core.py
+#
+# Copyright (c) 2007 Stephen Day
+#
+# This module is part of Creoleparser and is released under
+# the MIT License: http://www.opensource.org/licenses/mit-license.php
+#
+
+import re
+
+import genshi.builder as bldr
+
+
+__docformat__ = 'restructuredtext en'
+
+escape_char = '~'
+esc_neg_look = '(?<!' + re.escape(escape_char) + ')'
+esc_to_remove = re.compile(''.join([r'(?<!',re.escape(escape_char),')',re.escape(escape_char),r'(?!([ \n]|$))']))
+
+def fragmentize(text,wiki_elements,remove_escapes = True):
+
+    """Takes a string of wiki markup and outputs a list of genshi
+    Fragments (Elements and strings).
+
+    This recursive function, with help from the WikiElement objects,
+    does almost all the parsing.
+
+    When no WikiElement objects are supplied, escapes are removed from
+    ``text`` (except if called from ``no_wiki`` or ``pre``)  and it is
+    returned as-is. This is the only way for recursion to stop.
+
+    :parameters:
+      text
+        the text to be parsed
+      wiki_elements
+        list of WikiElement objects to be searched for
+      remove_escapes
+        If False, escapes will not be removed
+    
+    """
+
+    # remove escape characters 
+    if not wiki_elements:
+        if remove_escapes:
+            return [esc_to_remove.sub('',text)]
+        else:
+            return [text]
+
+    # If the first supplied wiki_element is actually a list of elements, \
+    # search for all of them and match the closest one only.
+    if isinstance(wiki_elements[0],(list,tuple)):
+        found_elements = []
+        for wiki_element in wiki_elements[0]:
+            mo = wiki_element.regexp.search(text)
+            if mo:
+                found_elements.append((mo.start(),wiki_element,mo))
+        if found_elements:
+            # min() on (start, element, match) tuples selects the match
+            # that begins earliest in ``text``
+            x,wiki_element,mo = min(found_elements)
+        else:
+            mo = None
+    else:
+        wiki_element = wiki_elements[0]
+        mo = wiki_element.regexp.search(text)
+         
+    frags = []
+    if mo:
+        # call again for leading text and extend the result list 
+        # (only the remaining, lower-priority elements are searched there,
+        # since the current element was not found before this match)
+        if mo.start():
+            frags.extend(fragmentize(text[:mo.start()],wiki_elements[1:]))
+
+        # append the found wiki element to the result list
+        frags.append(wiki_element._build(mo))
+
+        # make the source output easier to read
+        if wiki_element.append_newline:
+            frags.append('\n')
+
+        # call again for trailing text and extend the result list
+        if mo.end() < len(text):
+            frags.extend(fragmentize(text[mo.end():],wiki_elements))
+    else:
+        # current element not found anywhere; retry with the rest of the list
+        frags = fragmentize(text,wiki_elements[1:])
+
+    return frags
+
+
+class Parser(object):
+
+    """Instantiates a parser with specified behaviour"""
+    
+    def __init__(self,dialect, method='xhtml', strip_whitespace=False, encoding='utf-8'):
+        """Constructor for Parser objects.
+
+        :parameters:
+          dialect
+            A Creole instance
+          method
+            This value is passed to Genshi's Stream.render(). Possible values
+            include ``xhtml``, ``html``, and ``xml``.
+          strip_whitespace
+            This value is passed to Genshi's Stream.render().
+          encoding
+            This value is passed to Genshi's Stream.render().
+        """
+        self.dialect = dialect
+        self.method = method
+        self.strip_whitespace = strip_whitespace
+        self.encoding=encoding
+
+    def generate(self,text):
+        """Returns a Genshi Stream."""
+        # normalize/escape the raw markup first, then parse it into a fragment tree
+        text = preprocess(text,self.dialect)
+        return bldr.tag(fragmentize(text,self.dialect.parse_order)).generate()
+
+    def render(self,text,**kwargs):
+        """Returns final output string (e.g., xhtml)
+
+        :parameter:
+          See Genshi documentation for additional keyword arguments.
+        """
+        return self.generate(text).render(method=self.method,strip_whitespace=self.strip_whitespace,
+                                          encoding=self.encoding,**kwargs)
+
+    def __call__(self,text):
+        """Wrapper for the render method. Returns final output string."""
+        return self.render(text)
+
+def preprocess(text, dialect):
+    """This should generally be called before fragmentize().
+
+    :parameters:
+      text
+        text to be processed.
+      dialect
+        a ``Creole`` object.
+    """
+    # normalize all line endings to '\n'
+    text = text.replace("\r\n", "\n")
+    text = text.replace("\r", "\n")
+    # guarantee the text ends with exactly one newline (block regexps rely on it)
+    text = ''.join([text.rstrip(),'\n']) 
+    # escape conflicting markup inside links/images, but not inside pre/no_wiki
+    text = ''.join(pre_escape(text,[dialect.pre,dialect.no_wiki],
+                              [dialect.link, dialect.img, dialect.http_link]))
+    return text
+
+
+def pre_escape(text, elements_to_skip=None,
+               elements_to_process=None):
+    """This is used to escape certain markup before parsing.
+
+    :parameters:
+      text
+        text to be processed.
+      elements_to_skip
+        these wiki elements will not be processed.
+      elements_to_process
+        these wiki elements will have an escape added according to
+        their ``esc_regexp``
+    """
+    # base case: nothing left to skip, so escape the whole remaining text
+    if not elements_to_skip:
+        for element in elements_to_process:
+            text = element.pre_escape(text)
+        return [text]
+    mo = elements_to_skip[0].regexp.search(text)
+    parts = []
+    if mo:
+        # recurse on the text before the skipped element
+        if mo.start():
+            parts.extend(pre_escape(text[:mo.start()],elements_to_skip[1:],
+                                    elements_to_process))
+        # keep the skipped element's text untouched
+        parts.append(mo.group(0))
+        # recurse on the text after it (same skip list: it may occur again)
+        if mo.end() < len(text):
+            parts.extend(pre_escape(text[mo.end():],elements_to_skip,
+                                    elements_to_process))
+    else:
+        parts = pre_escape(text,elements_to_skip[1:],elements_to_process)
+    return parts
+     
+
+def _test():
+    """Run this module's doctests."""
+    import doctest
+    doctest.testmod()
+
+if __name__ == "__main__":
+    _test()
+

creoleparser/dialects.py

+# dialects.py
+#
+# Copyright (c) 2007 Stephen Day
+#
+# This module is part of Creoleparser and is released under
+# the MIT License: http://www.opensource.org/licenses/mit-license.php
+#
+
+from elements import *
+
+class Creole10(object):
+
+    """This class contains most of the logic and specification of the markup."""
+
+    def __init__(self,wiki_links_base_url='http://',interwiki_links_base_urls={},
+                 no_wiki_monospace=False, use_additions=False):
+        """Most attributes of new Creole objects are derived from the WikiElement
+        class. Please see the constructor of that class and other specific element
+        classes for details.
+
+        :parameters:
+          wiki_links_base_url
+            self-explanatory
+          interwiki_links_base_urls
+            Dictionary of urls for interwiki links.
+          no_wiki_monospace = ``False``
+            If ``True``, inline no_wiki will be rendered as <tt> not <span>
+          use_additions = ``False``
+            If ``True``, markup beyond the Creole 1.0 spec will be allowed.
+            Including monospace (##).
+        """
+        # NOTE(review): ``interwiki_links_base_urls={}`` is a mutable default
+        # argument; it appears to be only read here, but confirm it is never
+        # mutated downstream before relying on that.
+
+        self.br = LineBreak('br', r'\\')
+        self.http_link = RawLink('a')
+        self.interwiki_link = InterWikiLink(delimiter=':',
+                                            base_urls=interwiki_links_base_urls,
+                                            space_char='_')
+        self.wiki_link = WikiLink(base_url=wiki_links_base_url,space_char='_')
+        self.img = Image('img',('{{','}}'),[],delimiter='|')
+        self.link = Link('a',('[[',']]'),[],delimiter='|',
+                        link_types=[self.http_link,self.interwiki_link,self.wiki_link])
+        self.strong = InlineElement('strong', '**',[])
+        self.em = InlineElement('em', '//',[self.strong,self.br,self.link,self.img,self.http_link])
+        if no_wiki_monospace:
+            no_wiki_tag = 'tt'
+        else:
+            no_wiki_tag = 'span'
+        self.no_wiki = NoWikiElement(no_wiki_tag,['{{{','}}}'],[])
+        # NOTE(review): the next two assignments are immediately overwritten in
+        # both branches of the if/else below and appear to be redundant.
+        self.strong.child_tags = [self.em,self.br,self.link,self.img,self.http_link]
+        self.link.child_tags = [(self.strong, self.em), self.img]
+
+        if use_additions:
+            # monospace (##) is a Creole "additions" element; it nests with strong/em
+            self.tt = InlineElement('tt', '##',[(self.strong,self.em),self.br,self.link,self.img,self.http_link])
+            self.strong.child_tags = [(self.em,self.tt),self.br,self.link,self.img,self.http_link]
+            self.em.child_tags = [(self.strong,self.tt),self.br,self.link,self.img,self.http_link]
+            self.link.child_tags = [(self.strong, self.em,self.tt), self.img]
+            header_children = [self.no_wiki,(self.strong, self.em, self.tt), self.br,self.link,self.img,self.http_link]
+
+        else:
+            self.strong.child_tags = [self.em,self.br,self.link,self.img,self.http_link]
+            self.link.child_tags = [(self.strong, self.em), self.img]
+            header_children = [self.no_wiki,(self.strong, self.em), self.br,self.link,self.img,self.http_link]
+                
+        self.hr = LoneElement('hr','----',[])
+        self.lone_br = LoneElement('br',r'\\',[])
+        self.blank_line = BlankLine()
+
+        self.h1 = Heading('h1','=',header_children)
+        self.h2 = Heading('h2','==',header_children)
+        self.h3 = Heading('h3','===',header_children)
+        self.h4 = Heading('h4','====',header_children)
+        self.h5 = Heading('h5','=====',header_children)
+        self.h6 = Heading('h6','======',header_children)
+
+        headings = [self.h1,self.h2,self.h3,self.h4,self.h5,self.h6]
+        
+        self.td = TableCell('td','|',header_children)
+        self.th = TableCell('th','|=',header_children)
+        self.tr = TableRow('tr','|',[self.th,self.td])
+        self.table = Table('table','|',[self.tr])
+
+        self.p = Paragraph('p',header_children)
+
+        self.li = ListItem('li',child_tags=[],list_tokens='*#')
+        self.ol = List('ol','#',[self.li],other_token='*')
+        self.ul = List('ul','*',[self.li],other_token='#')
+        self.nested_ol = NestedList('ol','#',[self.li])
+        self.nested_ul = NestedList('ul','*',[self.li])
+        self.li.child_tags = [(self.nested_ol,self.nested_ul)] + header_children
+
+        self.pre = PreBlock('pre',['{{{','}}}'])
+
+        self.parse_order = [self.pre,self.blank_line,self.table]+ headings\
+                           + [self.hr,self.lone_br,self.ul,self.ol,self.p]
+        """These are the wiki elements that are searched at the top level of text to be
+        processed. The order matters because elements later in the list need not have any
+        knowledge of those before (as those were parsed out already). This makes the
+        regular expression patterns for later elements very simple.
+        """
creoleparser/elements.py

+# elements.py
+#
+# Copyright (c) 2007 Stephen Day
+#
+# This module is part of Creoleparser and is released under
+# the MIT License: http://www.opensource.org/licenses/mit-license.php
+#
+
+import re
+
+import genshi.builder as bldr
+
+from core import escape_char, esc_neg_look, fragmentize
+
+
+__docformat__ = 'restructuredtext en'
+
+class WikiElement(object):
+    
+    """Baseclass for all wiki WikiElements."""
+    
+    append_newline = False
+    """Determines if newlines are appended to Element(s) during processing.
+    Should only affect readability of source xml.
+    """
+    
+    def __init__(self, tag, token, child_tags):
+        """Constructor for WikiElement objects.
+
+        Subclasses may have other keyword arguments.   
+
+        :parameters:
+          tag
+            The xhtml tag associated with the element.
+          token
+            The character string (or strings) that identifies the element
+            in wiki markup.
+          child_tags
+            A list of wiki_elements that will be searched for in the body of the
+            element.  The order of these elements matters, because if an element is
+            found before the element that encloses it, the enclosing element will
+            never be found.  In cases where this imposes limits (e.g, ``strong`` and
+            ``em`` should be allowed to nest each other), place the conflicting
+            elements in a sublist. The parser will then find which comes first.
+        """
+        self.tag = tag
+        self.token = token
+        self.child_tags = child_tags
+                
+    def _build(self,mo):
+        """Returns a genshi Element that has ``self.tag`` as the
+        outermost tag.
+
+        This method is called exclusively by ``fragmentize``
+
+        :parameters:
+          mo
+            match object, usually the one returned by
+            self.regexp.search(s) 
+        """
+        return bldr.tag.__getattr__(self.tag)(fragmentize(mo.group(1),
+                                                          self.child_tags))
+
+    def re_string(self):
+        """The regular expression pattern that is compiled into ``self.regexp``.
+
+        The regular expression must consume the entire wiki element,
+        including the tokens. For block elements, the newline on the last
+        line must be consumed also. group(1) should normally be the
+        entire string inside the tokens. If not, a custom ``_build``
+        method will be needed.
+        """
+        pass
+
+    def pre_escape(self,text):
+        """Finds the element in ``text`` and inserts an escape character \
+        to hide certain markup (i.e., | and //) contained within.
+
+        Returns the modified ``text``
+        """
+        pass
+
+    def __repr__(self):
+        return "<WikiElement "+str(self.tag)+">"
+
+
+class RawLink(WikiElement):
+    
+    """Used to find raw urls in wiki text and build xml from them.
+
+    In the example below, a tilde (~) is used to escape the "//" in
+    the url. This is normally done during preprocessing and it is used
+    to avoid conflict with other markup.
+
+    >>> raw_link = RawLink(tag='a')
+    >>> mo = raw_link.regexp.search(" a http:~//www.google.com url ")
+    >>> raw_link.href(mo)
+    'http://www.google.com'
+    >>> raw_link._build(mo).generate().render()
+    '<a href="http://www.google.com">http://www.google.com</a>'
+    
+    """
+
+    def __init__(self, tag):
+        super(RawLink,self).__init__(tag=tag, token=None, child_tags=None)
+        self.regexp = re.compile(self.re_string())
+        # NOTE(review): ftp urls are escaped here but re_string only matches
+        # http(s), so ftp urls presumably end up as plain text -- confirm intended.
+        self.pre_escape_regexp = re.compile('(http(s?)|ftp)://')
+
+    def pre_escape(self,text):
+        # inserts the escape char between the protocol and "//" (http:~//...)
+        return self.pre_escape_regexp.sub(r'\1:~//',text)
+
+    def re_string(self):
+        # matches only the escaped form produced by pre_escape above
+        protocol = '((https?:)' + re.escape(escape_char) + '(//'
+        rest_of_url = r'\S+?))'
+        look_ahead = r'(?=[,.?!:;"\']?(\s|$))' #allow one punctuation character
+        return esc_neg_look + protocol + rest_of_url + look_ahead
+
+    def _build(self,mo):
+        return bldr.tag.__getattr__(self.tag)(self.alias(mo),
+                                              href=self.href(mo))
+
+    def href(self,mo):
+        """Returns the string for the href attribute of the Element."""
+        # group(2)+group(3) reassembles the url without the escape char
+        return mo.group(2)+mo.group(3)
+
+    def alias(self,mo):
+        """Returns the string for the content of the Element."""
+        return self.href(mo)
+
+
+class InterWikiLink(WikiElement):
+
+    """Used to find interwiki links and return a href and alias.
+
+    The search scope for these is only inside wiki links,
+    before the pipe(|)! 
+
+    >>> interwiki_link = InterWikiLink(delimiter=':',
+    ... base_urls=dict(somewiki='http://somewiki.org/',
+    ...                bigwiki='http://bigwiki.net/'),
+    ...                space_char='_')
+    >>> mo = interwiki_link.regexp.search(" somewiki:Home Page ")
+    >>> interwiki_link.href(mo)
+    'http://somewiki.org/Home_Page'
+    >>> interwiki_link.alias(mo)
+    'somewiki:Home Page'
+    
+    """
+
+    def __init__(self,delimiter,base_urls,space_char):
+        self.delimiter = delimiter
+        self.regexp = re.compile(self.re_string())
+        self.base_urls = base_urls
+        self.space_char = space_char
+
+    def re_string(self):
+        wiki_id = r'(\w+)'
+        optional_spaces = ' *'
+        page_name = r'(\S+( +\S+)*)' #allows any number of single spaces 
+        return wiki_id + optional_spaces + re.escape(self.delimiter) + \
+               optional_spaces + page_name + optional_spaces + '$'
+
+    def href(self,mo):
+        """Returns the url, or None if the wiki id is not in ``base_urls``."""
+        base_url = self.base_urls.get(mo.group(1))
+        if not base_url:
+            return None
+        return base_url + mo.group(2).replace(' ',self.space_char)
+
+    def alias(self,mo):
+        return ''.join([mo.group(1),self.delimiter,mo.group(2)])
+
+
+class WikiLink(WikiElement):
+
+    """Used to find wiki links and return a href and alias.
+
+    The search scope for these is only inside wiki links, before the pipe(|)
+
+    >>> wiki_link = WikiLink(base_url='http://somewiki.org/',
+    ...                      space_char='_')
+    >>> mo = wiki_link.regexp.search(" Home Page ")
+    >>> wiki_link.href(mo)
+    'http://somewiki.org/Home_Page'
+    >>> wiki_link.alias(mo)
+    'Home Page'
+    
+    """
+
+    def __init__(self,base_url,space_char):
+        self.regexp = re.compile(self.re_string())
+        self.base_url = base_url
+        self.space_char = space_char
+
+    def re_string(self):
+        optional_spaces = ' *'
+        page_name = r'(\S+( +\S+)*)' #allows any number of single spaces 
+        return optional_spaces + page_name + optional_spaces + '$' 
+
+    def href(self,mo):
+        # spaces in the page name are replaced (e.g., with '_') to form the url
+        return self.base_url + mo.group(1).replace(' ',self.space_char)
+
+    def alias(self,mo):
+        return mo.group(1)
+
+
+class BlockElement(WikiElement):
+
+    """Wiki elements wanting ``append_newline = True`` should use this
+    as the base.
+
+    """
+
+    # a newline after each block element keeps the xml output readable
+    append_newline = True
+
+
+class List(BlockElement):
+
+    """Finds list wiki elements.
+
+    group(1) of the match object includes all lines from the list
+    including newline characters.
+        
+    """
+
+    def __init__(self, tag, token,child_tags,other_token):
+        # other_token is the marker of the *other* list kind ('*' vs '#'),
+        # used so an ol stops where a ul begins (and vice versa)
+        super(List,self).__init__(tag, token, child_tags)
+        self.other_token = other_token
+        self.regexp = re.compile(self.re_string(),re.DOTALL+re.MULTILINE)
+
+    def re_string(self):
+        """Lists are the last outer level elements to be searched. The
+        regexp only has to know about lists.
+        """
+        leading_whitespace = r'^([ \t]*'
+        # exactly one token: a second identical token would mean a nested item
+        only_one_token = re.escape(self.token)+'[^'+ re.escape(self.token) + ']'
+        rest_of_list = r'.*?\n)'
+        only_one_other_token = re.escape(self.other_token)+'[^'+ \
+                               re.escape(self.other_token) + ']'
+        look_ahead = '(?=([ \t]*' + only_one_other_token + '|$))'
+        return leading_whitespace + only_one_token + rest_of_list + \
+               look_ahead
+
+
+class ListItem(WikiElement):
+    r"""Matches the current list item.
+
+    Everything up to the next same-level list item is matched.
+
+    >>> list_item = ListItem('li',[],'#*')
+    >>> mo = list_item.regexp.search("*one\n**one.1\n**one.2\n*two\n")
+    >>> mo.group(2)
+    'one\n**one.1\n**one.2\n'
+    >>> mo.group(0)
+    '*one\n**one.1\n**one.2\n'
+    
+    """
+    
+    append_newline = True
+
+    def __init__(self, tag, child_tags, list_tokens):
+        """Constructor for list items.
+
+        :parameters:
+          list_tokens
+            A string that includes the tokens used for lists
+        """
+        super(ListItem,self).__init__(tag, token=None,
+                                      child_tags=child_tags)
+        self.list_tokens = list_tokens
+        self.regexp = re.compile(self.re_string(),re.DOTALL)
+
+    def re_string(self):
+        whitespace = r'\s*'
+        item_start = '([*#]+)'
+        rest_of_item = r'(.*?\n)'
+        # \1 backreference: the next item at the *same* nesting depth
+        start_of_same_level_item = r'\1(?![*#])'
+        look_ahead = '(?=(' + whitespace + start_of_same_level_item + '|$))'
+        return whitespace + item_start + whitespace + '?' + \
+               rest_of_item + look_ahead
+
+    def _build(self,mo):
+        # group(2) is the item body (group(1) is the token run)
+        return bldr.tag.__getattr__(self.tag)(fragmentize(mo.group(2),
+                                                          self.child_tags))
+
+
+class NestedList(WikiElement):
+
+    r"""Finds a list in the current list item.
+
+    >>> nested_ul = NestedList('ul','*',[])
+    >>> mo = nested_ul.regexp.search('one\n**one.1\n**one.2\n')
+    >>> mo.group(1)
+    '**one.1\n**one.2\n'
+    >>> mo.group(0) == mo.group(1)
+    True
+
+    """
+
+    def __init__(self, tag, token,child_tags):
+        super(NestedList,self).__init__(tag, token, child_tags)
+        self.regexp = re.compile(self.re_string(),re.DOTALL+re.MULTILINE)
+
+    def re_string(self):
+        look_behind = r'(?<=\n)' # have to avoid finding a list on the first line
+        whitespace = r'(\s*'
+        rest_of_list = '.*$)'
+        return look_behind + '^' + whitespace + re.escape(self.token) + \
+               rest_of_list
+
+
+class Paragraph(BlockElement):
+
+    """This should be the last outer level wiki element to be "searched".
+
+    Anything that is left over will be placed in paragraphs.
+
+    """
+
+    def __init__(self, tag, child_tags):
+        super(Paragraph,self).__init__(tag,token=None, child_tags=child_tags)
+        self.regexp = re.compile(self.re_string(),re.DOTALL+re.MULTILINE)
+
+    def re_string(self):
+        # any single line (DOTALL + non-greedy callers keep this simple)
+        return r'^(.*)\n'
+
+
+class Heading(BlockElement):
+
+    r"""Finds heading wiki elements.
+
+    >>> h1 = Heading('h1','=',[])
+    >>> mo = h1.regexp.search('before\n = An important thing = \n after')
+    >>> mo.group(1)
+    'An important thing'
+    >>> mo.group(0)
+    ' = An important thing = \n'
+
+    """
+  
+    def __init__(self, tag, token, child_tags):
+        super(Heading,self).__init__(tag,token , child_tags)
+        self.regexp = re.compile(self.re_string(),re.MULTILINE)
+
+    def re_string(self):
+        whitespace = r'[ \t]*'
+        # a longer token run means a deeper heading, so reject it here
+        neg_look_ahead = '(?!' + re.escape(self.token[0]) + ')'
+        content = '(.*?)'
+        # the closing '=' run is optional per the Creole spec
+        trailing_markup = '(' + re.escape(self.token[0]) + r'+[ \t]*)?\n'
+        return '^' + whitespace + re.escape(self.token) + neg_look_ahead + \
+               whitespace + content + whitespace + trailing_markup
+
+
+class Table(BlockElement):
+
+    r"""Find tables.
+
+    >>> table = Table('table','|',[])
+    >>> mo = table.regexp.search("before\n | one | two |\n|one|two \n hi")
+    >>> mo.group(1)
+    ' | one | two |\n|one|two \n'
+    >>> mo.group(0) == mo.group(1)
+    True
+    
+    """
+
+    # NOTE(review): child_tags=[] is a mutable default argument; it is only
+    # passed through to the base constructor here.
+    def __init__(self, tag, token, child_tags=[]):
+        super(Table,self).__init__(tag,token , child_tags)
+        self.regexp = re.compile(self.re_string(),re.MULTILINE)
+
+    def re_string(self):
+        whitespace = r'[ \t]*'
+        rest_of_line = r'.*?\n'
+        # one or more consecutive lines starting with the row token
+        return '^((' + whitespace + re.escape(self.token) + \
+               rest_of_line + ')+)'
+
+
+class TableRow(BlockElement):
+
+    r"""Finds rows in a table.
+
+    >>> row = TableRow('tr','|',[])
+    >>> mo = row.regexp.search(' | one | two |\n|one|two \n')
+    >>> mo.group(1)
+    '| one | two '
+    >>> mo.group(0)
+    ' | one | two |\n'
+    
+    """
+
+    def __init__(self, tag, token, child_tags=[]):
+        super(TableRow,self).__init__(tag,token , child_tags)
+        self.regexp = re.compile(self.re_string(),re.MULTILINE)
+
+    def re_string(self):
+        whitespace = r'[ \t]*'
+        content = '(' + re.escape(self.token) + '.*?)'
+        # a trailing pipe at the end of the row is optional
+        trailing_token = re.escape(self.token) + '?'
+        return '^' + whitespace + content + trailing_token + \
+               whitespace + r'\n'
+
+
+class TableCell(WikiElement):
+
+    r"""Finds cells in a table row.
+
+    >>> cell = TableCell('td','|',[])
+    >>> mo = cell.regexp.search('| one | two ')
+    >>> mo.group(1)
+    'one'
+    >>> mo.group(0)
+    '| one '
+    
+    """
+
+    def __init__(self, tag, token, child_tags=[]):
+        super(TableCell,self).__init__(tag,token , child_tags)
+        self.regexp = re.compile(self.re_string())
+
+    def re_string(self):
+        whitespace = r'[ \t]*'
+        content = '(.*?)'
+        # cell ends at the next unescaped pipe, or at end of the row text
+        look_ahead = '((?=' + esc_neg_look + re.escape(self.token[0]) + ')|$)'
+        return esc_neg_look + re.escape(self.token) + whitespace + \
+               content + whitespace + look_ahead    
+
+
+class InlineElement(WikiElement):
+
+    r"""For finding generic inline elements like ``strong`` and ``em``.
+
+    >>> em = InlineElement('em','//',[])
+    >>> mo1 = em.regexp.search('a //word// in a line')
+    >>> mo2 = em.regexp.search('a //word in a line\n or two\n')
+    >>> mo1.group(0),mo1.group(1)
+    ('//word//', 'word')
+    >>> mo2.group(0),mo2.group(1)
+    ('//word in a line\n or two', 'word in a line\n or two')
+
+    Use a list for the ``token`` argument to have different start
+    and end strings. These must be closed.
+
+    >>> foo = InlineElement('foo',['<<','>>'],[])
+    >>> mo = foo.regexp.search('blaa <<here it is >>\n')
+    >>> mo.group(1)
+    'here it is '
+        
+    """
+
+    def __init__(self, tag, token, child_tags=[]):
+        super(InlineElement,self).__init__(tag,token , child_tags)
+        self.regexp = re.compile(self.re_string(),re.DOTALL)
+
+    def re_string(self):
+        if isinstance(self.token,str):
+            # single token (e.g. '**'): the element closes at the next
+            # unescaped token or implicitly at the end of the text
+            content = '(.+?)'
+            end = '(' + esc_neg_look + re.escape(self.token) + r'|$)'
+            return esc_neg_look + re.escape(self.token) + content + end
+        else:
+            # token pair (e.g. ('{{','}}')): an explicit close is required
+            content = '(.+?)'
+            return esc_neg_look + re.escape(self.token[0]) + content + esc_neg_look + re.escape(self.token[1])
+             
+
+class Link(InlineElement):
+
+    """Finds and builds links."""
+    
+    def __init__(self, tag, token, child_tags, delimiter,link_types):
+        super(Link,self).__init__(tag,token , child_tags)
+        self.regexp = re.compile(self.re_string())
+        self.delimiter = delimiter
+        self.link_types = link_types
+        self.pre_escape_regexp = re.compile(self.pre_escape_pattern())
+
+    def pre_escape(self,text):
+        # insert the escape char before the delimiter inside the link tokens
+        return self.pre_escape_regexp.sub(r'\1~\2',text)
+
+    def pre_escape_pattern(self):
+        return '(' + re.escape(self.token[0]) + '.*?)' + \
+               '(' + re.escape(self.delimiter) + '.*?' + \
+               re.escape(self.token[1]) + ')'
+        
+    def _build(self,mo):
+        # split on the *escaped* delimiter inserted by pre_escape, at most once
+        body = mo.group(1).split(escape_char + self.delimiter, 1)
+        link = body[0]
+        if len(body) == 1:
+            alias = None
+        else:
+            alias = body[1].strip()
+        # first matching link type wins (order set by the dialect)
+        for link_type in self.link_types:
+            link_mo = link_type.regexp.search(link)
+            if link_mo:
+                break
+        # NOTE(review): if no link type matches (or link_types is empty),
+        # link_mo is None/unbound here and the next line will raise -- verify
+        # the dialect guarantees WikiLink always matches as a fallback.
+        href = link_type.href(link_mo)
+        if not href:
+            return bldr.tag.span('Bad Link - ',link)
+        if not alias:
+            alias = link_type.alias(link_mo)
+        else:
+            alias = fragmentize(alias,self.child_tags)
+        return bldr.tag.__getattr__(self.tag)(alias ,href=link_type.href(link_mo))
+
+
+class Image(InlineElement):
+
+    """Processes image elements.
+
+    In the example below, a tilde (~) is used to escape the "|".
+    This is normally done during preprocessing and it is used to avoid
+    conflict with other markup (it allows images to appear in tables).
+
+    >>> img = Image('img',('{{','}}'),[], delimiter='|')
+    >>> mo = img.regexp.search('{{ picture.jpg ~| An image of a house }}')
+    >>> img._build(mo).generate().render()
+    '<img src="picture.jpg" alt="An image of a house"/>'
+
+    """
+
+    def __init__(self, tag, token, child_tags,delimiter):
+        super(Image,self).__init__(tag,token , child_tags)
+        self.regexp = re.compile(self.re_string())
+        self.delimiter = delimiter
+        # the src must be a single non-whitespace run
+        self.src_regexp = re.compile(r'^\s*(\S+)\s*$')
+        self.pre_escape_regexp = re.compile(self.pre_escape_pattern())
+
+    def pre_escape(self,text):
+        # insert the escape char before the delimiter inside the image tokens
+        return self.pre_escape_regexp.sub(r'\1~\2',text)
+
+    def pre_escape_pattern(self):
+        return '(' + re.escape(self.token[0]) + '.*?)' + \
+               '(' + re.escape(self.delimiter) + '.*?' + \
+               re.escape(self.token[1]) + ')'
+
+    def _build(self,mo):
+        # split on the *escaped* delimiter inserted by pre_escape, at most once
+        body = mo.group(1).split(escape_char+self.delimiter,1)
+        src_mo = self.src_regexp.search(body[0])
+        if not src_mo:
+            return bldr.tag.span('Bad Image src')
+        link = src_mo.group(1)
+        if len(body) == 1:
+            # no alt text given: fall back to the src itself
+            alias = link
+        else:
+            alias = body[1].strip()
+        return bldr.tag.__getattr__(self.tag)(src=link ,alt=alias)
+
+
+class NoWikiElement(InlineElement):
+
+    """Inline no-wiki.
+
+    When two or more end tokens are found together, only last marks
+    the end of the element.
+
+    This element must be on a single line.
+    
+    """
+
+    def __init__(self, tag, token, child_tags=[]):
+        super(NoWikiElement,self).__init__(tag,token , child_tags)
+        self.regexp = re.compile(self.re_string()) 
+
+    def _build(self,mo):
+        # remove_escapes=False: escape characters inside no-wiki content
+        # are emitted verbatim rather than being stripped.
+        if self.tag:
+            return bldr.tag.__getattr__(self.tag)(
+                   fragmentize(mo.group(1), self.child_tags,
+                               remove_escapes=False))
+        else:
+            # No tag configured: emit the content as a bare fragment.
+            return bldr.tag(fragmentize(mo.group(1),self.child_tags,
+                                        remove_escapes=False))
+
+    def re_string(self):
+        # The trailing "<last end-token char>*" lets the content group
+        # absorb extra closing characters, so of several consecutive end
+        # tokens only the last one closes the element (see class docstring).
+        if isinstance(self.token,str):
+            content = '(.+?' + re.escape(self.token[-1]) + '*)'
+            return esc_neg_look + re.escape(self.token) + \
+                   content + re.escape(self.token)
+        else:
+            content = '(.+?' + re.escape(self.token[1][-1]) + '*)'
+            return esc_neg_look + re.escape(self.token[0]) + \
+                   content + re.escape(self.token[1])
+
+
+class PreBlock(BlockElement):
+
+    """A preformatted block.
+
+    If a closing token is found on a line with a space as the first
+    character, it will be remove from the output.
+    
+    """
+
+    def __init__(self, tag, token, child_tags=[]):
+        super(PreBlock,self).__init__(tag,token , child_tags)
+        self.regexp = re.compile(self.re_string(),re.DOTALL+re.MULTILINE)
+        self.regexp2 = re.compile(self.re_string2(),re.MULTILINE)
+
+    def re_string(self):
+        if isinstance(self.token,str):
+            return '^' + re.escape(self.token) + r'\s*?\n(.*?\n)' + \
+                   re.escape(self.token) + r'\s*?\n'
+        else:
+            start = '^' + re.escape(self.token[0]) + r'\s*?\n'
+            content = r'(.*?\n)'
+            end = re.escape(self.token[1]) + r'\s*?\n'
+            return start + content + end
+
+    def re_string2(self):
+        """Finds a closing token will a space at the start of the line."""
+        if isinstance(self.token,str):
+            return r'^ (\s*?' + re.escape(self.token) + r'\s*?\n)'
+        else:
+            return r'^ (\s*?' + re.escape(self.token[1]) + r'\s*?\n)'
+
+    def _build(self,mo):
+        match = self.regexp2.sub(r'\1',mo.group(1))
+        
+        return bldr.tag.__getattr__(self.tag)(fragmentize(match,self.child_tags,remove_escapes=False))
+
+
+class LoneElement(BlockElement):
+
+    """Element on a line by itself with no content (e.g., <hr/>)"""
+
+    def __init__(self, tag, token, child_tags):
+        super(LoneElement,self).__init__(tag,token , child_tags)
+        self.regexp = re.compile(self.re_string(),re.DOTALL+re.MULTILINE)
+
+    def re_string(self):
+        return r'^(\s*?' + re.escape(self.token) + r'\s*?\n)'
+
+    def _build(self,mo):
+        return bldr.tag.__getattr__(self.tag)()
+
+ 
+class BlankLine(WikiElement):
+
+    """Blank lines divide elements but don't add any output."""
+
+    def __init__(self):
+        super(BlankLine,self).__init__(tag=None,token='' , child_tags=[])
+        self.regexp = re.compile(self.re_string(),re.MULTILINE)
+
+    def re_string(self):
+        return r'^(\s*\n)+'
+     
+    def _build(self,mo):
+        return None
+
+    
+class LineBreak(WikiElement):
+
+    """An inline line break."""
+
+    append_newline = True
+    def __init__(self,tag, token, child_tags=[]):
+        super(LineBreak,self).__init__(tag,token , child_tags)
+        self.regexp = re.compile(self.re_string(),re.DOTALL)
+
+    def re_string(self):
+        return re.escape(self.token)
+    
+    def _build(self,mo):
+        return bldr.tag.__getattr__(self.tag)()
+
+
+
+def _test():
+    import doctest
+    doctest.testmod()
+
+if __name__ == "__main__":
+    _test()    

creoleparser/tests.py

+# tests.py
+#
+# Copyright (c) 2007 Stephen Day
+#
+# This module is part of Creoleparser and is released under
+# the MIT License: http://www.opensource.org/licenses/mit-license.php
+#
+
+from __init__ import creole_to_xhtml
+from dialects import Creole10
+from core import Parser
+
+def test_creole_to_xhtml():
+    """Exercise creole_to_xhtml against Creole 1.0 markup samples."""
+
+    # Inline bold/italic nesting and inline no-wiki spans.
+    assert creole_to_xhtml('**strong** soft\n') == '<p><strong>strong</strong> soft</p>\n'
+    assert creole_to_xhtml('//this**strong** soft//') == '<p><em>this<strong>strong</strong> soft</em></p>\n'
+    assert creole_to_xhtml('steve **is strong**\n{{{\nnot **weak**\n}}}\n') == \
+            '<p>steve <strong>is strong</strong></p>\n<pre>not **weak**\n</pre>\n'
+    assert creole_to_xhtml('{{{no **wiki** in here}}} but //here// is fine') == \
+            '<p><span>no **wiki** in here</span> but <em>here</em> is fine</p>\n'
+    assert creole_to_xhtml('steve **is strong //you know\n dude{{{not **weak**}}}\n') == \
+            '<p>steve <strong>is strong <em>you know\n dude</em></strong><span>not **weak**</span></p>\n'
+
+    # Tables, with and without trailing pipes; inline markup in cells.
+    assert creole_to_xhtml(
+r"""   |= Item|= Size|= Price |
+  | fish | **big**  |cheap   |
+  | crab | small|expesive|
+
+  |= Item|= Size|= Price 
+  | fish | big  |//cheap//   
+  | crab | small|**very\\expesive**
+  """) == """\
+<table><tr><th>Item</th><th>Size</th><th>Price</th></tr>
+<tr><td>fish</td><td><strong>big</strong></td><td>cheap</td></tr>
+<tr><td>crab</td><td>small</td><td>expesive</td></tr>
+</table>
+<table><tr><th>Item</th><th>Size</th><th>Price</th></tr>
+<tr><td>fish</td><td>big</td><td><em>cheap</em></td></tr>
+<tr><td>crab</td><td>small</td><td><strong>very<br />
+expesive</strong></td></tr>
+</table>
+"""
+
+    # Headings: levels 1-6, optional closing '='s, inline markup parsed.
+    assert creole_to_xhtml(r"""
+  = Level 1 (largest) =
+== Level 2 ==
+ === Level 3 ===
+==== Level 4 ====
+===== Level 5 =====
+====== Level 6 ======
+=== Also level 3
+=== Also level 3 =
+=== Also level 3 ==
+=== **is** //parsed// ===
+  """) == """\
+<h1>Level 1 (largest)</h1>
+<h2>Level 2</h2>
+<h3>Level 3</h3>
+<h4>Level 4</h4>
+<h5>Level 5</h5>
+<h6>Level 6</h6>
+<h3>Also level 3</h3>
+<h3>Also level 3</h3>
+<h3>Also level 3</h3>
+<h3><strong>is</strong> <em>parsed</em></h3>
+""" 
+
+    # Tilde escape behaviour: lone, doubled, and before markup tokens.
+    assert creole_to_xhtml(r"""
+a lone escape ~ in the middle of a line
+or at the end ~
+a double ~~ in the middle
+at end ~~
+preventing ~** **bold** and ~// //italics//
+ ~= stopping headers!
+| in table~| cells | too!
+""") == """\
+<p>a lone escape ~ in the middle of a line
+or at the end ~
+a double ~ in the middle
+at end ~
+preventing ** <strong>bold</strong> and // <em>italics</em>
+ = stopping headers!</p>
+<table><tr><td>in table| cells</td><td>too!</td></tr>
+</table>
+"""
+
+    # Block-level no-wiki (pre); a leading space escapes a closing token.
+    assert creole_to_xhtml(r"""
+{{{
+** some ** unformatted {{{ stuff }}} ~~~
+ }}}
+}}}
+""") == """\
+<pre>** some ** unformatted {{{ stuff }}} ~~~
+}}}
+</pre>
+"""
+
+    # Inline no-wiki: extra closing braces stay inside the span.
+    assert creole_to_xhtml("""\
+{{{** some ** unformatted {{{ stuff ~~ }}}}}}""") == """\
+<p><span>** some ** unformatted {{{ stuff ~~ }}}</span></p>
+"""
+
+    # Free links in tables and headings; tilde suppresses auto-linking.
+    assert creole_to_xhtml("""\
+|http://www.google.com| steve|
+
+hello **[[http://www.google.com|Google]]**
+= http://www.yahoo.com
+== ~http://www.yahoo.com
+""") == """\
+<table><tr><td><a href="http://www.google.com">http://www.google.com</a></td><td>steve</td></tr>
+</table>
+<p>hello <strong><a href="http://www.google.com">Google</a></strong></p>
+<h1><a href="http://www.yahoo.com">http://www.yahoo.com</a></h1>
+<h2>http://www.yahoo.com</h2>
+"""
+
+    # Bracketed links: raw URLs, aliases, wiki links, and interwiki links.
+    assert creole_to_xhtml(r"""
+Go to [[http://www.google.com]], it is [[http://www.google.com| Google]]\\
+even [[This Page]] is nice like [[This Page|this]].\\
+As is [[Ohana:Home|This one]].""") == """\
+<p>Go to <a href="http://www.google.com">http://www.google.com</a>, it is <a href="http://www.google.com">Google</a><br />
+
+even <a href="http://www.wikicreole.org/wiki/This_Page">This Page</a> is nice like <a href="http://www.wikicreole.org/wiki/This_Page">this</a>.<br />
+
+As is <a href="http://wikiohana.net/cgi-bin/wiki.pl/Home">This one</a>.</p>
+"""
+
+    # Nested ordered/unordered lists with inline markup and continuations.
+    assert creole_to_xhtml(r"""
+* this is list **item one**
+** item one - //subitem 1//
+### one **http://www.google.com**
+### two [[Creole1.0]]
+### three\\covers\\many\\lines
+** //subitem 2//
+### what is this?
+### no idea?
+**** A
+**** B
+### And lots of
+drivel here
+** //subitem 3//
+*** huh?
+* **item two
+* **item three**
+# new ordered list, item 1
+# item 2
+## sub item
+##sub item
+""") == """\
+<ul><li> this is list <strong>item one</strong>
+<ul><li> item one - <em>subitem 1</em>
+<ol><li> one <strong><a href="http://www.google.com">http://www.google.com</a></strong>
+</li>
+<li> two <a href="http://www.wikicreole.org/wiki/Creole1.0">Creole1.0</a>
+</li>
+<li> three<br />
+covers<br />
+many<br />
+lines
+</li>
+</ol></li>
+<li> <em>subitem 2</em>
+<ol><li> what is this?
+</li>
+<li> no idea?
+<ul><li> A
+</li>
+<li> B
+</li>
+</ul></li>
+<li> And lots of
+drivel here
+</li>
+</ol></li>
+<li> <em>subitem 3</em>
+<ul><li> huh?
+</li>
+</ul></li>
+</ul></li>
+<li> <strong>item two</strong>
+</li>
+<li> <strong>item three</strong>
+</li>
+</ul>
+<ol><li> new ordered list, item 1
+</li>
+<li> item 2
+<ol><li> sub item
+</li>
+<li>sub item
+</li>
+</ol></li>
+</ol>
+"""
+
+    # Headings, horizontal rule, forced breaks, and images inside tables.
+    assert creole_to_xhtml(r"""
+= Big Heading
+----
+\\
+|nice picture |{{campfire.jpg}}|\\
+|same picture as a link| [[http://google.com | {{ campfire.jpg | campfire.jpg }} ]]|""") == """\
+<h1>Big Heading</h1>
+<hr />
+<br />
+<table><tr><td>nice picture</td><td><img src="campfire.jpg" alt="campfire.jpg" /></td><td><br />
+</td></tr>
+<tr><td>same picture as a link</td><td><a href="http://google.com"><img src="campfire.jpg" alt="campfire.jpg" /></a></td></tr>
+</table>
+"""
+
+def test_no_wiki_monospace_option():
+    dialect = Creole10(no_wiki_monospace=True)
+    parser = Parser(dialect)
+    assert parser(r"""
+This block of {{{no_wiki **should** be monospace}}} now""") == """\
+<p>This block of <tt>no_wiki **should** be monospace</tt> now</p>
+"""
+    
+def test_use_additions_option():
+    dialect = Creole10(use_additions=True)
+    parser = Parser(dialect)
+    assert parser(r"""
+This block of ##text **should** be monospace## now""") == """\
+<p>This block of <tt>text <strong>should</strong> be monospace</tt> now</p>
+"""
+
+def _test():
+    import doctest
+    doctest.testmod()
+    test_creole_to_xhtml()
+    test_no_wiki_monospace_option()
+    test_use_additions_option()
+
+if __name__ == "__main__":
+    _test()
+
+
+#!/usr/bin/env python
+
+# Prefer setuptools so install_requires works; fall back to
+# distutils.core (which ignores install_requires) when it is absent.
+try:
+    from setuptools import setup
+except ImportError:
+    print 'setuptools not installed, using distutils.core'
+    print 'please ignore error message about "install_requires"'
+    from distutils.core import setup
+
+
+# Package metadata for the Creoleparser distribution.
+# zip_safe=False: install unpacked (not as a zipped egg).
+setup(name='Creoleparser',
+      version='0.2',
+      install_requires=['Genshi>=0.4'],
+      description='Parser for the Creole common wiki markup language',
+      author='Stephen Day',
+      author_email='stephen.h.day@-->reverse this-->moc.liamg',
+      url='http://www.wikicreole.org/wiki/Creoleparser.py',
+      packages=['creoleparser'],
+      license = 'MIT',
+      zip_safe = False,
+      classifiers = [
+        'Development Status :: 4 - Beta',
+        'Environment :: Web Environment',
+        'Intended Audience :: Developers',
+        'License :: OSI Approved :: MIT License',
+        'Operating System :: OS Independent',
+        'Programming Language :: Python',
+        'Topic :: Internet :: WWW/HTTP :: Dynamic Content',
+        'Topic :: Software Development :: Libraries :: Python Modules',
+        'Topic :: Text Processing :: Markup'
+        ],
+      long_description = """\
+What is Creoleparser?
+---------------------
+
+Creoleparser is a Python library for converting Creole wiki markup
+for output on the web. It is a full implementation of the Creole 1.0
+specification and aims to follow the spec exactly.
+
+What is Creole?
+---------------
+
+From wikicreole.org:
+  Creole is a common wiki markup language to be used across different
+  wikis. It's not replacing existing markup but instead enabling wiki
+  users to transfer content seamlessly across wikis, and for novice
+  users to contribute more easily.
+
+Find out more about Creole at <http://www.wikicreole.org>
+"""
+     )
+