# WebHelpers: webhelpers/html
"""HTML/XHTML tag builder

You create tags with attribute access.  I.e., the "A" anchor tag is
html.a.  The attributes of the HTML tag are done with keyword
arguments.  The contents of the tag are the non-keyword arguments
(concatenated).  You can also use the special "c" keyword, passing a
list, tuple, or single tag, and it will make up the contents (this is
useful because keywords have to come after all non-keyword arguments,
and it's unintuitive to give your content before your attributes).

If the value of an attribute is None, then no attribute will be
inserted.  Think of it as "does not apply".  So::

    >>> HTML.a(href="", name=None, 
    ... c="Click Here")
    literal(u'<a href="">Click Here</a>')

For tag *contents*, a value of None becomes the empty string.  Any other
value is converted to text and HTML-escaped, unless it is already a literal.

``HTML`` can also be called, and it will concatenate the quoted string
representations of its arguments.

``HTML.comment`` will generate an HTML comment, like
``HTML.comment('comment text', 'and some more text')`` -- note that
it cannot take keyword arguments (because they wouldn't mean anything).

``HTML.literal`` will allow you to give HTML source without any quoting.

If you cannot define an attribute because it conflicts with a Python
keyword (particularly ``class``), you can append an underscore and
it will be removed (like ``class_='whatever'``).

About XHTML and HTML

This builder always produces tags that are valid as *both* HTML and
XHTML.  "Empty" tags (like ``<br>``, ``<input>`` etc) are written like ``<br />``,
with a space and a trailing ``/``.

*Only* empty tags get this treatment.  The library will never, for example,
produce ``<script src="..." />``, which is invalid HTML.

The `W3C HTML validator <http://validator.w3.org/>`_ validates these
constructs as valid HTML Strict.  It does produce warnings, but those
warnings are about the ambiguity that arises if these same XML-style
self-closing tags are used for HTML elements that can take content
(``<script>``, ``<textarea>``, etc).  This library never produces markup
like that.

Rather than add options to generate different kinds of behavior, we
felt it was better to create markup that could be used in different
contexts without any real problems and without the overhead of passing
options around or maintaining different contexts, where you'd have to
keep track of whether markup is being rendered in an HTML or XHTML
context.
"""

import re
from cgi import escape as cgi_escape
from urllib import quote as url_escape
from UserDict import DictMixin
# Probe for the builtin ``set`` type; very old interpreters (Python 2.3)
# lack it, in which case the ``sets`` module provides an equivalent.
try:
    set
except NameError:
    from sets import Set as set

__all__ = ["HTML", "escape", "literal", "url_escape", "lit_sub"]

class UnfinishedTag(object):
    """A tag that has been named but not yet given attributes or content.

    Calling the instance builds the real tag; converting it to a string
    without calling it yields the self-closed form ``<tag />``.
    """

    def __init__(self, tag):
        """Remember the tag name this placeholder stands for."""
        self._tag = tag

    def __call__(self, *args, **kw):
        """Build the tag from the given content and attributes."""
        tag_name = self._tag
        return make_tag(tag_name, *args, **kw)

    def __str__(self):
        """Render the placeholder as a self-closed tag literal."""
        markup = '<%s />' % self._tag
        return literal(markup)

    def __html__(self):
        """Return the string form of the tag as ready-made markup."""
        rendered = str(self)
        return rendered

class UnfinishedComment(object):
    """Placeholder for an HTML comment that has not been given text yet."""

    def __call__(self, *args):
        """Concatenate the arguments and wrap them in ``<!-- -->``."""
        pieces = [str(piece) for piece in args]
        body = ''.join(pieces)
        return literal('<!--%s-->' % body)

    def __html__(self):
        """Refuse to render: a comment has no markup until it is called."""
        message = "You must call html.comment with some text"
        raise NotImplementedError(message)

class UnfinishedLiteral(object):
    """Placeholder for a literal that has not been given any text yet."""

    def __call__(self, *args):
        """Pass the arguments straight through to ``literal``."""
        markup = literal(*args)
        return markup

    def __html__(self):
        """Refuse to render: the placeholder has no text until it is called."""
        message = "You must call html.literal with some text"
        raise NotImplementedError(message)

class HTMLBuilder(object):
    """Base HTML object.

    Attribute access produces tag builders (``builder.a``, ``builder.div``),
    ``comment`` and ``literal`` are special-cased placeholders, and calling
    the builder itself escapes and concatenates its arguments.
    """
    comment = UnfinishedComment()
    # NOTE: this class attribute does not shadow the module-level ``literal``
    # inside the methods below; class scope is not visible from method bodies.
    literal = UnfinishedLiteral()

    def __getattr__(self, attr):
        """Generate the tag for the given attribute name.

        Names starting with an underscore are never treated as tags and
        raise ``AttributeError``.  Any other name yields an
        ``UnfinishedTag`` for the lower-cased name, which is cached in the
        instance ``__dict__`` so ``__getattr__`` is not hit again for it.
        """
        if attr.startswith('_'):
            # Include the name so the resulting error message is useful.
            raise AttributeError(attr)
        result = self.__dict__[attr] = UnfinishedTag(attr.lower())
        return result

    def __call__(self, *args):
        """HTML-escape every argument and join the results into a literal."""
        return literal(''.join([escape(x) for x in args]))

def _attr_decode(v):
    """Parse out attributes that begin with '_'."""
    if v.endswith('_'):
        return v[:-1]
        return v

def make_tag(tag, *args, **kw):
    """Build the markup for a single tag and return it as a ``literal``.

    ``tag`` is the tag name.  Positional arguments are the tag contents;
    each one is HTML-escaped and they are concatenated.  Keyword arguments
    become attributes, with two special cases:

    * ``c`` supplies the contents instead of positional arguments (the two
      cannot be combined).
    * ``_closed`` (default true) controls whether the tag is closed; when
      false no closing tag (or trailing ``/``) is emitted.

    Attributes whose value is None are omitted.  Attribute names go through
    ``_attr_decode`` and values through ``escape``; attributes are emitted
    sorted by keyword name.

    A tag listed in ``empty_tags`` that has no contents is written in the
    self-closing form ``<tag />``.
    """
    if "c" in kw:
        assert not args, (
            "The special 'c' keyword argument cannot be used "
            "in conjunction with non-keyword arguments")
        args = kw.pop("c")
    closed = kw.pop("_closed", True)
    htmlArgs = [' %s="%s"' % (_attr_decode(attr), escape(value))
                for attr, value in sorted(kw.items())
                if value is not None]
    if not args and tag in empty_tags and closed:
        return literal('<%s%s />' % (tag, "".join(htmlArgs)))
    close_tag = ""
    if closed:
        close_tag = "</%s>" % (tag,)
    return literal("<%s%s>%s%s" % (
        tag,
        "".join(htmlArgs),
        "".join([escape(x) for x in args]),
        close_tag))

class literal(unicode):
    """Represents an HTML literal.

    This subclass of unicode has a ``.__html__()`` method that is
    detected by the ``escape()`` function.

    Also, if you add another string to this string, the other string
    will be quoted and you will get back another literal object.  Also
    ``literal(...) % obj`` will quote any value(s) from ``obj``.  If
    you do something like ``literal(...) + literal(...)``, neither
    string will be changed because ``escape(literal(...))`` doesn't
    change the original literal.
    """

    def __new__(cls, string='', encoding='utf-8', errors="strict"):
        """Create the new literal string object.

        Unicode input is used as-is; anything else is decoded with
        ``encoding`` and ``errors``.  Both settings are remembered on the
        instance for later encoding and ``%`` formatting.
        """
        if isinstance(string, unicode):
            # Passing an encoding together with a unicode object would make
            # unicode.__new__ raise TypeError, so only decode non-unicode.
            obj = unicode.__new__(cls, string)
        else:
            obj = unicode.__new__(cls, string, encoding, errors)
        obj.encoding = encoding
        obj.error_mode = errors
        return obj

    def __str__(self):
        """Return the literal encoded with the encoding it was created with."""
        return self.encode(self.encoding)

    def __repr__(self):
        """Return ``literal(u'...')`` style representation."""
        return '%s(%s)' % (self.__class__.__name__, unicode.__repr__(self))

    def __html__(self):
        """Return self: a literal is already safe markup."""
        return self

    def __add__(self, other):
        """Concatenate, escaping ``other`` unless it is itself markup."""
        if hasattr(other, '__html__') or isinstance(other, basestring):
            return self.__class__(unicode.__add__(self, escape(other)))
        return NotImplemented

    def __radd__(self, other):
        """Reflected concatenation, escaping ``other`` unless it is markup."""
        if hasattr(other, '__html__') or isinstance(other, basestring):
            return self.__class__(unicode.__add__(escape(other), self))
        return NotImplemented

    def __mul__(self, count):
        """Repeat the literal, keeping the result a literal."""
        return self.__class__(unicode.__mul__(self, count))

    def __mod__(self, obj):
        """Format with ``%``, escaping every substituted value.

        Values are wrapped in ``_EscapedItem`` so that escaping happens at
        the moment each value is converted during substitution.
        """
        if isinstance(obj, tuple):
            escaped = [_EscapedItem(item, self.encoding,
                                    self.error_mode) for item in obj]
            return self.__class__(unicode.__mod__(self, tuple(escaped)))
        else:
            return self.__class__(unicode.__mod__(
                self, _EscapedItem(obj, self.encoding, self.error_mode)))

    def join(self, items):
        """Join ``items`` with this literal, escaping each item first."""
        return self.__class__(unicode.join(self, ([escape(i) for i in items])))

    def split(self, *args, **kwargs):
        """Like ``unicode.split`` but every piece is a literal."""
        return [literal(x) for x in unicode.split(self, *args, **kwargs)]

    def rsplit(self, *args, **kwargs):
        """Like ``unicode.rsplit`` but every piece is a literal."""
        return [literal(x) for x in unicode.rsplit(self, *args, **kwargs)]

    def splitlines(self, *args, **kwargs):
        """Like ``unicode.splitlines`` but every line is a literal."""
        return [literal(x) for x in unicode.splitlines(self, *args, **kwargs)]

# Yes, this is rather sucky, but I really don't want to write all these
# damn methods, so we write in all the appropriate literal results of these
# functions on module load
def wrapper(func):
    """Wrap a ``unicode`` method so that its result is a ``literal``."""
    def entangle(*args, **kwargs):
        result = func(*args, **kwargs)
        if isinstance(result, tuple):
            # partition/rpartition return a 3-tuple of strings; literal()
            # cannot be built from a tuple, so wrap each element instead.
            return tuple([literal(part) for part in result])
        return literal(result)
    try:
        entangle.__name__ = func.__name__
    except TypeError:
        # < Python 2.4: function __name__ is read-only
        pass
    entangle.__doc__ = func.__doc__
    return entangle

# Only names actually present on this interpreter's unicode type are wrapped
# (e.g. partition/rpartition do not exist before Python 2.5).
for k in dir(literal):
    if k in ['__getslice__', '__getitem__', 'capitalize', 'center',
             'expandtabs', 'ljust', 'lower', 'lstrip', 'partition',
             'replace', 'rjust', 'rpartition', 'rstrip', 'strip',
             'swapcase', 'title', 'translate', 'upper', 'zfill']:
        fun = getattr(unicode, k)
        setattr(literal, k, wrapper(fun))

def lit_sub(*args, **kw):
    """Literal-safe version of re.sub.

    Takes the same arguments as ``re.sub``.  If the string to be operated
    on has an ``__html__`` method (i.e. it is a literal), the result is
    wrapped in that string's class; otherwise the plain ``re.sub`` result
    is returned.
    """
    # The subject string is re.sub's third positional argument, but it may
    # also be supplied as the ``string`` keyword.
    if len(args) > 2:
        subject = args[2]
    else:
        subject = kw.get("string")
    result = re.sub(*args, **kw)
    if hasattr(subject, '__html__'):
        return subject.__class__(result)
    return result

def escape(val, force=False):
    """Does HTML-escaping of a value.

    Objects with a ``.__html__()`` method will have that method called,
    and the return value will *not* be quoted.  Thus objects with that
    magic method can be used to represent HTML that should not be
    quoted.

    As a special case, ``escape(None)`` returns ''

    If ``force`` is true, then it will always be quoted regardless of
    whether the value has an ``__html__`` method.

    The result is always a ``literal``.
    """
    if val is None:
        return literal('')
    elif not force and hasattr(val, '__html__'):
        return literal(val.__html__())
    elif isinstance(val, basestring):
        # quote=True so that double quotes are escaped too, making the
        # result safe inside attribute values.
        return literal(cgi_escape(val, True))
    else:
        return literal(cgi_escape(unicode(val), True))

class _EscapedItem(DictMixin):
    """Wrapper/helper for literal(...) % obj

    This quotes the object during string substitution, and if the
    object is dictionary(-like) it will quote all the values in the
    dictionary.
    """

    def __init__(self, obj, encoding, error_mode):
        """Store the wrapped object and the codec settings used for it."""
        self.obj = obj
        self.encoding = encoding
        self.error_mode = error_mode

    def __getitem__(self, key):
        """Look up ``key`` in the wrapped object and wrap the value too."""
        return _EscapedItem(self.obj[key], self.encoding, self.error_mode)

    def __str__(self):
        """Return the escaped value, encoded if escaping produced unicode."""
        v = escape(self.obj)
        if isinstance(v, unicode):
            v = v.encode(self.encoding)
        return v

    def __unicode__(self):
        """Return the escaped value, decoded if escaping produced bytes."""
        v = escape(self.obj)
        if isinstance(v, str):
            v = v.decode(self.encoding, self.error_mode)
        return v

    def __int__(self):
        """Return the wrapped object converted with ``int``."""
        return int(self.obj)

    def __float__(self):
        """Return the wrapped object converted with ``float``."""
        return float(self.obj)

    def __repr__(self):
        """Return the escaped ``repr`` of the wrapped object."""
        return escape(repr(self.obj))

# Tag names checked by make_tag when deciding whether a content-less tag is
# written in the self-closing ``<tag />`` form.
empty_tags = set([
    "area", "base", "basefont", "br", "col", "frame", "hr",
    "img", "input", "isindex", "link", "meta", "param",
])

# Module-level builder instance; exported through ``__all__`` as ``HTML``.
HTML = HTMLBuilder()