Commits

Virgil Dupras committed daa68a7

First commit, before running 2to3.

  • Participants

Comments (0)

Files changed (7)

+syntax: glob
+
+.DS_Store
+*.pyc
+Version 1.0.0 -- 2010/08/24
+---------------------------
+
+* Initial Release
+Copyright (c) 2010, Hardcoded Software Inc., http://www.hardcoded.net
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
+    * Neither the name of Hardcoded Software Inc. nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+==================================================
+sgmllib3k -- Py3k port of the old stdlib module
+==================================================
+
+sgmllib was dropped in Python 3. For those depending on it, that's somewhat unfortunate. This is a quick and dirty port of this old module. I just ran 2to3 on it and published it. I don't intend to maintain it, so it might be a good idea to eventually think about finding another module to use.
+from setuptools import setup
+
+CLASSIFIERS = [
+    'Development Status :: 5 - Production/Stable',
+    'Intended Audience :: Developers',
+    'License :: OSI Approved :: BSD License',
+    'Programming Language :: Python :: 3',
+]
+
+setup(
+    name='sgmllib3k',
+    version='1.0.0',
+    author='Hardcoded Software',
+    author_email='hsoft@hardcoded.net',
+    py_modules=['sgmllib'],
+    scripts=[],
+    url='http://hg.hardcoded.net/sgmllib'
+    license='BSD License',
+    description='Py3k port of sgmllib.',
+    long_description=open('README').read(),
+    classifiers=CLASSIFIERS,
+    test_suite='test_sgmllib',
+)
+"""A parser for SGML, using the derived class as a static DTD."""
+
+# XXX This only supports those SGML features used by HTML.
+
+# XXX There should be a way to distinguish between PCDATA (parsed
+# character data -- the normal case), RCDATA (replaceable character
+# data -- only char and entity references and end tags are special)
+# and CDATA (character data -- only end tags are special).  RCDATA is
+# not supported at all.
+
+import markupbase
+import re
+
+__all__ = ["SGMLParser", "SGMLParseError"]
+
+# Regular expressions used for parsing
+
+# Locates the next character that can start markup: '&' or '<'.
+interesting = re.compile('[&<]')
+# Matches a possibly unfinished reference or tag prefix.  goahead() uses it
+# as a last resort to decide between waiting for more input and emitting
+# the text as plain data.
+incomplete = re.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|'
+                           '<([a-zA-Z][^<>]*|'
+                              '/([a-zA-Z][^<>]*)?|'
+                              '![^<>]*)?')
+
+# Entity reference '&name' followed by the one character that ends it.
+entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
+# Decimal character reference '&#NNN' followed by the one character that
+# ends it.
+charref = re.compile('&#([0-9]+)[^0-9]')
+
+# '<' followed by '>' or a letter: the beginning of a start tag.
+starttagopen = re.compile('<[>a-zA-Z]')
+# Beginning of the SGML shorthand form '<tag/data/'.
+shorttagopen = re.compile('<[a-zA-Z][-.a-zA-Z0-9]*/')
+# Complete shorthand tag; group 1 is the tag name, group 2 the data.
+shorttag = re.compile('<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/')
+# End of a processing instruction.
+piclose = re.compile('>')
+# Either bracket; used to find where a start or end tag stops.
+endbracket = re.compile('[<>]')
+# Tag name.
+tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*')
+# Attribute: group 1 is the name, group 2 the optional '= value' part and
+# group 3 the value itself (single-quoted, double-quoted or unquoted).
+attrfind = re.compile(
+    r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
+    r'(\'[^\']*\'|"[^"]*"|[][\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?')
+
+
+class SGMLParseError(RuntimeError):
+    """Exception raised for all parse errors."""
+    # Raised by SGMLParser.error().
+    pass
+
+
+# SGML parser base class -- find tags and call handler functions.
+# Usage: p = SGMLParser(); p.feed(data); ...; p.close().
+# The dtd is defined by deriving a class which defines methods
+# with special names to handle tags: start_foo and end_foo to handle
+# <foo> and </foo>, respectively, or do_foo to handle <foo> by itself.
+# (Tags are converted to lower case for this purpose.)  The data
+# between tags is passed to the parser by calling self.handle_data()
+# with some data as argument (the data may be split up in arbitrary
+# chunks).  Entity references are passed by calling
+# self.handle_entityref() with the entity reference as argument.
+
+class SGMLParser(markupbase.ParserBase):
+    """Incremental SGML parser that dispatches markup to handler methods.
+
+    Text is supplied with feed() and flushed with close().  Tags are
+    routed to start_<tag>, do_<tag> and end_<tag> methods looked up on
+    the instance; anything without a handler goes to the unknown_*
+    hooks.
+    """
+
+    # Pattern for entity and character references inside attribute
+    # values (used by parse_starttag) -- derived classes may override
+    entity_or_charref = re.compile('&(?:'
+      '([a-zA-Z][-.a-zA-Z0-9]*)|#([0-9]+)'
+      ')(;?)')
+
+    def __init__(self, verbose=0):
+        """Initialize and reset this instance."""
+        # verbose is only consulted by report_unbalanced().
+        self.verbose = verbose
+        self.reset()
+
+    def reset(self):
+        """Reset this instance. Loses all unprocessed data."""
+        self.__starttag_text = None
+        self.rawdata = ''
+        self.stack = []
+        self.lasttag = '???'
+        self.nomoretags = 0
+        self.literal = 0
+        markupbase.ParserBase.reset(self)
+
+    def setnomoretags(self):
+        """Enter literal mode (CDATA) till EOF.
+
+        Intended for derived classes only.
+        """
+        self.nomoretags = self.literal = 1
+
+    def setliteral(self, *args):
+        """Enter literal mode (CDATA).
+
+        Intended for derived classes only.  Any arguments are ignored.
+        """
+        self.literal = 1
+
+    def feed(self, data):
+        """Feed some data to the parser.
+
+        Call this as often as you want, with as little or as much text
+        as you want (may include '\n').  (This just saves the text,
+        all the processing is done by goahead().)
+        """
+
+        self.rawdata = self.rawdata + data
+        self.goahead(0)
+
+    def close(self):
+        """Handle the remaining data."""
+        self.goahead(1)
+
+    def error(self, message):
+        """Raise SGMLParseError with the given message."""
+        raise SGMLParseError(message)
+
+    # Internal -- handle data as far as reasonable.  May leave state
+    # and data to be processed by a subsequent call.  If 'end' is
+    # true, force handling all data as if followed by EOF marker.
+    def goahead(self, end):
+        rawdata = self.rawdata
+        i = 0
+        n = len(rawdata)
+        while i < n:
+            if self.nomoretags:
+                # Everything left is plain data.
+                self.handle_data(rawdata[i:n])
+                i = n
+                break
+            match = interesting.search(rawdata, i)
+            if match: j = match.start()
+            else: j = n
+            if i < j:
+                self.handle_data(rawdata[i:j])
+            i = j
+            if i == n: break
+            if rawdata[i] == '<':
+                if starttagopen.match(rawdata, i):
+                    if self.literal:
+                        # In literal mode a start tag is just text.
+                        self.handle_data(rawdata[i])
+                        i = i+1
+                        continue
+                    k = self.parse_starttag(i)
+                    if k < 0: break
+                    i = k
+                    continue
+                if rawdata.startswith("</", i):
+                    k = self.parse_endtag(i)
+                    if k < 0: break
+                    i = k
+                    # Any end tag leaves literal mode.
+                    self.literal = 0
+                    continue
+                if self.literal:
+                    if n > (i + 1):
+                        self.handle_data("<")
+                        i = i+1
+                    else:
+                        # incomplete
+                        break
+                    continue
+                if rawdata.startswith("<!--", i):
+                        # Strictly speaking, a comment is --.*--
+                        # within a declaration tag <!...>.
+                        # This should be removed,
+                        # and comments handled only in parse_declaration.
+                    k = self.parse_comment(i)
+                    if k < 0: break
+                    i = k
+                    continue
+                if rawdata.startswith("<?", i):
+                    k = self.parse_pi(i)
+                    if k < 0: break
+                    # parse_pi() returns a length, not an index.
+                    i = i+k
+                    continue
+                if rawdata.startswith("<!", i):
+                    # This is some sort of declaration; in "HTML as
+                    # deployed," this should only be the document type
+                    # declaration ("<!DOCTYPE html...>").
+                    k = self.parse_declaration(i)
+                    if k < 0: break
+                    i = k
+                    continue
+            elif rawdata[i] == '&':
+                if self.literal:
+                    self.handle_data(rawdata[i])
+                    i = i+1
+                    continue
+                match = charref.match(rawdata, i)
+                if match:
+                    name = match.group(1)
+                    self.handle_charref(name)
+                    i = match.end(0)
+                    # The pattern consumed one terminating character;
+                    # unless it was ';' step back so it is processed next.
+                    if rawdata[i-1] != ';': i = i-1
+                    continue
+                match = entityref.match(rawdata, i)
+                if match:
+                    name = match.group(1)
+                    self.handle_entityref(name)
+                    i = match.end(0)
+                    # Same terminator handling as for character references.
+                    if rawdata[i-1] != ';': i = i-1
+                    continue
+            else:
+                self.error('neither < nor & ??')
+            # We get here only if incomplete matches but
+            # nothing else
+            match = incomplete.match(rawdata, i)
+            if not match:
+                self.handle_data(rawdata[i])
+                i = i+1
+                continue
+            j = match.end(0)
+            if j == n:
+                break # Really incomplete
+            self.handle_data(rawdata[i:j])
+            i = j
+        # end while
+        if end and i < n:
+            self.handle_data(rawdata[i:n])
+            i = n
+        # Keep whatever was not consumed for the next call.
+        self.rawdata = rawdata[i:]
+        # XXX if end: check for empty stack
+
+    # Extensions for the DOCTYPE scanner:
+    _decl_otherchars = '='
+
+    # Internal -- parse processing instr, return length or -1 if not terminated
+    def parse_pi(self, i):
+        rawdata = self.rawdata
+        if rawdata[i:i+2] != '<?':
+            self.error('unexpected call to parse_pi()')
+        match = piclose.search(rawdata, i+2)
+        if not match:
+            return -1
+        j = match.start(0)
+        self.handle_pi(rawdata[i+2: j])
+        j = match.end(0)
+        return j-i
+
+    def get_starttag_text(self):
+        """Return the text recorded by the most recent parse_starttag()."""
+        return self.__starttag_text
+
+    # Internal -- handle starttag, return the index at which parsing
+    # should resume, or -1 if the tag is not terminated
+    def parse_starttag(self, i):
+        self.__starttag_text = None
+        start_pos = i
+        rawdata = self.rawdata
+        if shorttagopen.match(rawdata, i):
+            # SGML shorthand: <tag/data/ == <tag>data</tag>
+            # XXX Can data contain &... (entity or char refs)?
+            # XXX Can data contain < or > (tag characters)?
+            # XXX Can there be whitespace before the first /?
+            match = shorttag.match(rawdata, i)
+            if not match:
+                return -1
+            tag, data = match.group(1, 2)
+            self.__starttag_text = '<%s/' % tag
+            tag = tag.lower()
+            k = match.end(0)
+            self.finish_shorttag(tag, data)
+            self.__starttag_text = rawdata[start_pos:match.end(1) + 1]
+            return k
+        # XXX The following should skip matching quotes (' or ")
+        # As a shortcut way to exit, this isn't so bad, but shouldn't
+        # be used to locate the actual end of the start tag since the
+        # < or > characters may be embedded in an attribute value.
+        match = endbracket.search(rawdata, i+1)
+        if not match:
+            return -1
+        j = match.start(0)
+        # Now parse the data between i+1 and j into a tag and attrs
+        attrs = []
+        if rawdata[i:i+2] == '<>':
+            # SGML shorthand: <> == <last open tag seen>
+            k = j
+            tag = self.lasttag
+        else:
+            match = tagfind.match(rawdata, i+1)
+            if not match:
+                self.error('unexpected call to parse_starttag')
+            k = match.end(0)
+            tag = rawdata[i+1:k].lower()
+            self.lasttag = tag
+        while k < j:
+            match = attrfind.match(rawdata, k)
+            if not match: break
+            attrname, rest, attrvalue = match.group(1, 2, 3)
+            if not rest:
+                # Attribute without a value: the name doubles as value.
+                attrvalue = attrname
+            else:
+                if (attrvalue[:1] == "'" == attrvalue[-1:] or
+                    attrvalue[:1] == '"' == attrvalue[-1:]):
+                    # strip quotes
+                    attrvalue = attrvalue[1:-1]
+                attrvalue = self.entity_or_charref.sub(
+                    self._convert_ref, attrvalue)
+            attrs.append((attrname.lower(), attrvalue))
+            k = match.end(0)
+        # Consume a closing '>'; a '<' is left for the next tag.
+        if rawdata[j] == '>':
+            j = j+1
+        self.__starttag_text = rawdata[start_pos:j]
+        self.finish_starttag(tag, attrs)
+        return j
+
+    # Internal -- convert entity or character reference
+    # (substitution callback for entity_or_charref; references that
+    # cannot be converted are returned as they were written)
+    def _convert_ref(self, match):
+        if match.group(2):
+            return self.convert_charref(match.group(2)) or \
+                '&#%s%s' % match.groups()[1:]
+        elif match.group(3):
+            return self.convert_entityref(match.group(1)) or \
+                '&%s;' % match.group(1)
+        else:
+            # Entity name without a trailing ';' is left untouched.
+            return '&%s' % match.group(1)
+
+    # Internal -- parse endtag, return the index at which parsing
+    # should resume, or -1 if the tag is not terminated
+    def parse_endtag(self, i):
+        rawdata = self.rawdata
+        match = endbracket.search(rawdata, i+1)
+        if not match:
+            return -1
+        j = match.start(0)
+        tag = rawdata[i+2:j].strip().lower()
+        if rawdata[j] == '>':
+            j = j+1
+        self.finish_endtag(tag)
+        return j
+
+    # Internal -- finish parsing of <tag/data/ (same as <tag>data</tag>)
+    def finish_shorttag(self, tag, data):
+        self.finish_starttag(tag, [])
+        self.handle_data(data)
+        self.finish_endtag(tag)
+
+    # Internal -- finish processing of start tag
+    # Return -1 for unknown tag, 0 for open-only tag, 1 for balanced tag
+    def finish_starttag(self, tag, attrs):
+        try:
+            method = getattr(self, 'start_' + tag)
+        except AttributeError:
+            try:
+                method = getattr(self, 'do_' + tag)
+            except AttributeError:
+                self.unknown_starttag(tag, attrs)
+                return -1
+            else:
+                self.handle_starttag(tag, method, attrs)
+                return 0
+        else:
+            # Only tags with a start_<tag> handler go on the stack.
+            self.stack.append(tag)
+            self.handle_starttag(tag, method, attrs)
+            return 1
+
+    # Internal -- finish processing of end tag
+    def finish_endtag(self, tag):
+        if not tag:
+            # Empty end tag: close the most recently opened element.
+            found = len(self.stack) - 1
+            if found < 0:
+                self.unknown_endtag(tag)
+                return
+        else:
+            if tag not in self.stack:
+                try:
+                    method = getattr(self, 'end_' + tag)
+                except AttributeError:
+                    self.unknown_endtag(tag)
+                else:
+                    self.report_unbalanced(tag)
+                return
+            # Locate the last occurrence of the tag on the stack.
+            found = len(self.stack)
+            for i in range(found):
+                if self.stack[i] == tag: found = i
+        # Close every element from the top of the stack down to 'found'.
+        while len(self.stack) > found:
+            tag = self.stack[-1]
+            try:
+                method = getattr(self, 'end_' + tag)
+            except AttributeError:
+                method = None
+            if method:
+                self.handle_endtag(tag, method)
+            else:
+                self.unknown_endtag(tag)
+            del self.stack[-1]
+
+    # Overridable -- handle start tag
+    def handle_starttag(self, tag, method, attrs):
+        method(attrs)
+
+    # Overridable -- handle end tag
+    def handle_endtag(self, tag, method):
+        method()
+
+    # Example -- report an unbalanced </...> tag.
+    def report_unbalanced(self, tag):
+        if self.verbose:
+            print '*** Unbalanced </' + tag + '>'
+            print '*** Stack:', self.stack
+
+    def convert_charref(self, name):
+        """Convert character reference, may be overridden."""
+        # Returns None for non-numeric names and for values outside
+        # the range 0..127.
+        try:
+            n = int(name)
+        except ValueError:
+            return
+        if not 0 <= n <= 127:
+            return
+        return self.convert_codepoint(n)
+
+    def convert_codepoint(self, codepoint):
+        """Return the character for the given code point."""
+        return chr(codepoint)
+
+    def handle_charref(self, name):
+        """Handle character reference, no need to override."""
+        replacement = self.convert_charref(name)
+        if replacement is None:
+            self.unknown_charref(name)
+        else:
+            self.handle_data(replacement)
+
+    # Definition of entities -- derived classes may override
+    entitydefs = \
+            {'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\''}
+
+    def convert_entityref(self, name):
+        """Convert entity references.
+
+        As an alternative to overriding this method; one can tailor the
+        results by setting up the self.entitydefs mapping appropriately.
+        """
+        table = self.entitydefs
+        if name in table:
+            return table[name]
+        else:
+            return
+
+    def handle_entityref(self, name):
+        """Handle entity references, no need to override."""
+        replacement = self.convert_entityref(name)
+        if replacement is None:
+            self.unknown_entityref(name)
+        else:
+            self.handle_data(replacement)
+
+    # Example -- handle data, should be overridden
+    def handle_data(self, data):
+        pass
+
+    # Example -- handle comment, could be overridden
+    def handle_comment(self, data):
+        pass
+
+    # Example -- handle declaration, could be overridden
+    def handle_decl(self, decl):
+        pass
+
+    # Example -- handle processing instruction, could be overridden
+    def handle_pi(self, data):
+        pass
+
+    # To be overridden -- handlers for unknown objects
+    def unknown_starttag(self, tag, attrs): pass
+    def unknown_endtag(self, tag): pass
+    def unknown_charref(self, ref): pass
+    def unknown_entityref(self, ref): pass
+
+
+class TestSGMLParser(SGMLParser):
+    """SGMLParser subclass that prints the events it receives."""
+
+    def __init__(self, verbose=0):
+        # Character data collected so far and not yet printed.
+        self.testdata = ""
+        SGMLParser.__init__(self, verbose)
+
+    def handle_data(self, data):
+        """Buffer data; print it once its repr reaches 70 characters."""
+        self.testdata = self.testdata + data
+        if len(repr(self.testdata)) >= 70:
+            self.flush()
+
+    def flush(self):
+        """Print and clear any buffered character data."""
+        data = self.testdata
+        if data:
+            self.testdata = ""
+            print 'data:', repr(data)
+
+    def handle_comment(self, data):
+        """Print a comment, eliding the middle of long ones."""
+        self.flush()
+        r = repr(data)
+        if len(r) > 68:
+            r = r[:32] + '...' + r[-32:]
+        print 'comment:', r
+
+    def unknown_starttag(self, tag, attrs):
+        """Print a start tag together with its attributes."""
+        self.flush()
+        if not attrs:
+            print 'start tag: <' + tag + '>'
+        else:
+            print 'start tag: <' + tag,
+            for name, value in attrs:
+                print name + '=' + '"' + value + '"',
+            print '>'
+
+    def unknown_endtag(self, tag):
+        """Print an end tag."""
+        self.flush()
+        print 'end tag: </' + tag + '>'
+
+    def unknown_entityref(self, ref):
+        """Print an entity reference that could not be converted."""
+        self.flush()
+        print '*** unknown entity ref: &' + ref + ';'
+
+    def unknown_charref(self, ref):
+        """Print a character reference that could not be converted."""
+        self.flush()
+        print '*** unknown char ref: &#' + ref + ';'
+
+    def unknown_decl(self, data):
+        """Print an unrecognized declaration."""
+        self.flush()
+        print '*** unknown decl: [' + data + ']'
+
+    def close(self):
+        """Finish parsing, then print whatever data is still buffered."""
+        SGMLParser.close(self)
+        self.flush()
+
+
+def test(args = None):
+    """Read a file and feed it to a parser one character at a time.
+
+    args defaults to sys.argv[1:].  A leading '-s' selects the plain
+    SGMLParser instead of TestSGMLParser.  The next argument is the
+    file name: 'test.html' when omitted, '-' for standard input.
+    Exits with status 1 if the file cannot be opened.
+    """
+    import sys
+
+    if args is None:
+        args = sys.argv[1:]
+
+    if args and args[0] == '-s':
+        args = args[1:]
+        klass = SGMLParser
+    else:
+        klass = TestSGMLParser
+
+    if args:
+        file = args[0]
+    else:
+        file = 'test.html'
+
+    if file == '-':
+        f = sys.stdin
+    else:
+        try:
+            f = open(file, 'r')
+        except IOError, msg:
+            print file, ":", msg
+            sys.exit(1)
+
+    data = f.read()
+    if f is not sys.stdin:
+        f.close()
+
+    x = klass()
+    # Feeding single characters exercises the incremental parsing paths.
+    for c in data:
+        x.feed(c)
+    x.close()
+
+
+if __name__ == '__main__':
+    test()

File test_sgmllib.py

+import pprint
+import re
+import unittest
+import sgmllib
+
+class EventCollector(sgmllib.SGMLParser):
+    """Parser that records every callback as a tuple in self.events."""
+
+    def __init__(self):
+        self.events = []
+        # Shortcut used by all the handlers below to record an event.
+        self.append = self.events.append
+        sgmllib.SGMLParser.__init__(self)
+
+    def get_events(self):
+        """Return the recorded events with adjacent data events merged."""
+        # Normalize the list of events so that buffer artefacts don't
+        # separate runs of contiguous characters.
+        L = []
+        prevtype = None
+        for event in self.events:
+            type = event[0]
+            if type == prevtype == "data":
+                L[-1] = ("data", L[-1][1] + event[1])
+            else:
+                L.append(event)
+            prevtype = type
+        self.events = L
+        return L
+
+    # structure markup
+
+    def unknown_starttag(self, tag, attrs):
+        self.append(("starttag", tag, attrs))
+
+    def unknown_endtag(self, tag):
+        self.append(("endtag", tag))
+
+    # all other markup
+
+    def handle_comment(self, data):
+        self.append(("comment", data))
+
+    def handle_charref(self, data):
+        self.append(("charref", data))
+
+    def handle_data(self, data):
+        self.append(("data", data))
+
+    def handle_decl(self, decl):
+        self.append(("decl", decl))
+
+    def handle_entityref(self, data):
+        self.append(("entityref", data))
+
+    def handle_pi(self, data):
+        self.append(("pi", data))
+
+    def unknown_decl(self, decl):
+        self.append(("unknown decl", decl))
+
+
+class CDATAEventCollector(EventCollector):
+    """EventCollector that switches to literal mode inside <cdata>."""
+
+    def start_cdata(self, attrs):
+        self.append(("starttag", "cdata", attrs))
+        self.setliteral()
+
+
+class HTMLEntityCollector(EventCollector):
+    """EventCollector that also records calls to the convert_* helpers."""
+
+    # Like the base pattern, but also accepts '&#x...' references.
+    entity_or_charref = re.compile('(?:&([a-zA-Z][-.a-zA-Z0-9]*)'
+        '|&#(x[0-9a-zA-Z]+|[0-9]+))(;?)')
+
+    def convert_charref(self, name):
+        self.append(("charref", "convert", name))
+        # Names starting with 'x' are recorded but not converted, so
+        # None is returned for them.
+        if name[0] != "x":
+            return EventCollector.convert_charref(self, name)
+
+    def convert_codepoint(self, codepoint):
+        self.append(("codepoint", "convert", codepoint))
+        # NOTE: the base class result is not returned, so this method
+        # yields None.
+        EventCollector.convert_codepoint(self, codepoint)
+
+    def convert_entityref(self, name):
+        self.append(("entityref", "convert", name))
+        return EventCollector.convert_entityref(self, name)
+
+    # These record that they were called, then pass the call along
+    # to the default implementation so that its actions can be
+    # recorded.
+
+    def handle_charref(self, data):
+        self.append(("charref", data))
+        sgmllib.SGMLParser.handle_charref(self, data)
+
+    def handle_entityref(self, data):
+        self.append(("entityref", data))
+        sgmllib.SGMLParser.handle_entityref(self, data)
+
+
+class SGMLParserTestCase(unittest.TestCase):
+
+    collector = EventCollector
+
+    def get_events(self, source):
+        parser = self.collector()
+        try:
+            for s in source:
+                parser.feed(s)
+            parser.close()
+        except:
+            #self.events = parser.events
+            raise
+        return parser.get_events()
+
+    def check_events(self, source, expected_events):
+        try:
+            events = self.get_events(source)
+        except:
+            #import sys
+            #print >>sys.stderr, pprint.pformat(self.events)
+            raise
+        if events != expected_events:
+            self.fail("received events did not match expected events\n"
+                      "Expected:\n" + pprint.pformat(expected_events) +
+                      "\nReceived:\n" + pprint.pformat(events))
+
+    def check_parse_error(self, source):
+        parser = EventCollector()
+        try:
+            parser.feed(source)
+            parser.close()
+        except sgmllib.SGMLParseError:
+            pass
+        else:
+            self.fail("expected SGMLParseError for %r\nReceived:\n%s"
+                      % (source, pprint.pformat(parser.get_events())))
+
+    def test_doctype_decl_internal(self):
+        inside = """\
+DOCTYPE html PUBLIC '-//W3C//DTD HTML 4.01//EN'
+             SYSTEM 'http://www.w3.org/TR/html401/strict.dtd' [
+  <!ELEMENT html - O EMPTY>
+  <!ATTLIST html
+      version CDATA #IMPLIED
+      profile CDATA 'DublinCore'>
+  <!NOTATION datatype SYSTEM 'http://xml.python.org/notations/python-module'>
+  <!ENTITY myEntity 'internal parsed entity'>
+  <!ENTITY anEntity SYSTEM 'http://xml.python.org/entities/something.xml'>
+  <!ENTITY % paramEntity 'name|name|name'>
+  %paramEntity;
+  <!-- comment -->
+]"""
+        self.check_events(["<!%s>" % inside], [
+            ("decl", inside),
+            ])
+
+    def test_doctype_decl_external(self):
+        inside = "DOCTYPE html PUBLIC '-//W3C//DTD HTML 4.01//EN'"
+        self.check_events("<!%s>" % inside, [
+            ("decl", inside),
+            ])
+
+    def test_underscore_in_attrname(self):
+        # SF bug #436621
+        """Make sure attribute names with underscores are accepted"""
+        self.check_events("<a has_under _under>", [
+            ("starttag", "a", [("has_under", "has_under"),
+                               ("_under", "_under")]),
+            ])
+
+    def test_underscore_in_tagname(self):
+        # SF bug #436621
+        """Make sure tag names with underscores are accepted"""
+        self.check_events("<has_under></has_under>", [
+            ("starttag", "has_under", []),
+            ("endtag", "has_under"),
+            ])
+
+    def test_quotes_in_unquoted_attrs(self):
+        # SF bug #436621
+        """Be sure quotes in unquoted attributes are made part of the value"""
+        self.check_events("<a href=foo'bar\"baz>", [
+            ("starttag", "a", [("href", "foo'bar\"baz")]),
+            ])
+
+    def test_xhtml_empty_tag(self):
+        """Handling of XHTML-style empty start tags"""
+        self.check_events("<br />text<i></i>", [
+            ("starttag", "br", []),
+            ("data", "text"),
+            ("starttag", "i", []),
+            ("endtag", "i"),
+            ])
+
+    def test_processing_instruction_only(self):
+        self.check_events("<?processing instruction>", [
+            ("pi", "processing instruction"),
+            ])
+
+    def test_bad_nesting(self):
+        self.check_events("<a><b></a></b>", [
+            ("starttag", "a", []),
+            ("starttag", "b", []),
+            ("endtag", "a"),
+            ("endtag", "b"),
+            ])
+
+    def test_bare_ampersands(self):
+        self.check_events("this text & contains & ampersands &", [
+            ("data", "this text & contains & ampersands &"),
+            ])
+
+    def test_bare_pointy_brackets(self):
+        self.check_events("this < text > contains < bare>pointy< brackets", [
+            ("data", "this < text > contains < bare>pointy< brackets"),
+            ])
+
+    def test_attr_syntax(self):
+        output = [
+          ("starttag", "a", [("b", "v"), ("c", "v"), ("d", "v"), ("e", "e")])
+          ]
+        self.check_events("""<a b='v' c="v" d=v e>""", output)
+        self.check_events("""<a  b = 'v' c = "v" d = v e>""", output)
+        self.check_events("""<a\nb\n=\n'v'\nc\n=\n"v"\nd\n=\nv\ne>""", output)
+        self.check_events("""<a\tb\t=\t'v'\tc\t=\t"v"\td\t=\tv\te>""", output)
+
+    def test_attr_values(self):
+        self.check_events("""<a b='xxx\n\txxx' c="yyy\t\nyyy" d='\txyz\n'>""",
+                        [("starttag", "a", [("b", "xxx\n\txxx"),
+                                            ("c", "yyy\t\nyyy"),
+                                            ("d", "\txyz\n")])
+                         ])
+        self.check_events("""<a b='' c="">""", [
+            ("starttag", "a", [("b", ""), ("c", "")]),
+            ])
+        # URL construction stuff from RFC 1808:
+        safe = "$-_.+"
+        extra = "!*'(),"
+        reserved = ";/?:@&="
+        url = "http://example.com:8080/path/to/file?%s%s%s" % (
+            safe, extra, reserved)
+        self.check_events("""<e a=%s>""" % url, [
+            ("starttag", "e", [("a", url)]),
+            ])
+        # Regression test for SF patch #669683.
+        self.check_events("<e a=rgb(1,2,3)>", [
+            ("starttag", "e", [("a", "rgb(1,2,3)")]),
+            ])
+
+    def test_attr_values_entities(self):
+        """Substitution of entities and charrefs in attribute values"""
+        # SF bug #1452246
+        self.check_events("""<a b=&lt; c=&lt;&gt; d=&lt-&gt; e='&lt; '
+                                f="&xxx;" g='&#32;&#33;' h='&#500;'
+                                i='x?a=b&c=d;'
+                                j='&amp;#42;' k='&#38;#42;'>""",
+            [("starttag", "a", [("b", "<"),
+                                ("c", "<>"),
+                                ("d", "&lt->"),
+                                ("e", "< "),
+                                ("f", "&xxx;"),
+                                ("g", " !"),
+                                ("h", "&#500;"),
+                                ("i", "x?a=b&c=d;"),
+                                ("j", "&#42;"),
+                                ("k", "&#42;"),
+                                ])])
+
+    def test_convert_overrides(self):
+        # This checks that the character and entity reference
+        # conversion helpers are called at the documented times.  No
+        # attempt is made to really change what the parser accepts.
+        #
+        self.collector = HTMLEntityCollector
+        self.check_events(('<a title="&ldquo;test&#x201d;">foo</a>'
+                           '&foobar;&#42;'), [
+            ('entityref', 'convert', 'ldquo'),
+            ('charref', 'convert', 'x201d'),
+            ('starttag', 'a', [('title', '&ldquo;test&#x201d;')]),
+            ('data', 'foo'),
+            ('endtag', 'a'),
+            ('entityref', 'foobar'),
+            ('entityref', 'convert', 'foobar'),
+            ('charref', '42'),
+            ('charref', 'convert', '42'),
+            ('codepoint', 'convert', 42),
+            ])
+
+    def test_attr_funky_names(self):
+        self.check_events("""<a a.b='v' c:d=v e-f=v>""", [
+            ("starttag", "a", [("a.b", "v"), ("c:d", "v"), ("e-f", "v")]),
+            ])
+
+    def test_attr_value_ip6_url(self):
+        # http://www.python.org/sf/853506
+        self.check_events(("<a href='http://[1080::8:800:200C:417A]/'>"
+                           "<a href=http://[1080::8:800:200C:417A]/>"), [
+            ("starttag", "a", [("href", "http://[1080::8:800:200C:417A]/")]),
+            ("starttag", "a", [("href", "http://[1080::8:800:200C:417A]/")]),
+            ])
+
+    def test_weird_starttags(self):
+        self.check_events("<a<a>", [
+            ("starttag", "a", []),
+            ("starttag", "a", []),
+            ])
+        self.check_events("</a<a>", [
+            ("endtag", "a"),
+            ("starttag", "a", []),
+            ])
+
+    def test_declaration_junk_chars(self):
+        self.check_parse_error("<!DOCTYPE foo $ >")
+
+    def test_get_starttag_text(self):
+        s = """<foobar   \n   one="1"\ttwo=2   >"""
+        self.check_events(s, [
+            ("starttag", "foobar", [("one", "1"), ("two", "2")]),
+            ])
+
+    def test_cdata_content(self):
+        s = ("<cdata> <!-- not a comment --> &not-an-entity-ref; </cdata>"
+             "<notcdata> <!-- comment --> </notcdata>")
+        self.collector = CDATAEventCollector
+        self.check_events(s, [
+            ("starttag", "cdata", []),
+            ("data", " <!-- not a comment --> &not-an-entity-ref; "),
+            ("endtag", "cdata"),
+            ("starttag", "notcdata", []),
+            ("data", " "),
+            ("comment", " comment "),
+            ("data", " "),
+            ("endtag", "notcdata"),
+            ])
+        s = """<cdata> <not a='start tag'> </cdata>"""
+        self.check_events(s, [
+            ("starttag", "cdata", []),
+            ("data", " <not a='start tag'> "),
+            ("endtag", "cdata"),
+            ])
+
+    def test_illegal_declarations(self):
+        s = 'abc<!spacer type="block" height="25">def'
+        self.check_events(s, [
+            ("data", "abc"),
+            ("unknown decl", 'spacer type="block" height="25"'),
+            ("data", "def"),
+            ])
+
+    def test_enumerated_attr_type(self):
+        s = "<!DOCTYPE doc [<!ATTLIST doc attr (a | b) >]>"
+        self.check_events(s, [
+            ('decl', 'DOCTYPE doc [<!ATTLIST doc attr (a | b) >]'),
+            ])
+
+    def test_only_decode_ascii(self):
+        # SF bug #1651995, make sure non-ascii character references are not decoded
+        s = '<signs exclamation="&#33" copyright="&#169" quoteleft="&#8216;">'
+        self.check_events(s, [
+            ('starttag', 'signs',
+             [('exclamation', '!'), ('copyright', '&#169'),
+              ('quoteleft', '&#8216;')]),
+            ])
+
+    # XXX These tests have been disabled by prefixing their names with
+    # an underscore.  The first two exercise outstanding bugs in the
+    # sgmllib module, and the third exhibits questionable behavior
+    # that needs to be carefully considered before changing it.
+
+    def _test_starttag_end_boundary(self):
+        self.check_events("<a b='<'>", [("starttag", "a", [("b", "<")])])
+        self.check_events("<a b='>'>", [("starttag", "a", [("b", ">")])])
+
+    def _test_buffer_artefacts(self):
+        output = [("starttag", "a", [("b", "<")])]
+        self.check_events(["<a b='<'>"], output)
+        self.check_events(["<a ", "b='<'>"], output)
+        self.check_events(["<a b", "='<'>"], output)
+        self.check_events(["<a b=", "'<'>"], output)
+        self.check_events(["<a b='<", "'>"], output)
+        self.check_events(["<a b='<'", ">"], output)
+
+        output = [("starttag", "a", [("b", ">")])]
+        self.check_events(["<a b='>'>"], output)
+        self.check_events(["<a ", "b='>'>"], output)
+        self.check_events(["<a b", "='>'>"], output)
+        self.check_events(["<a b=", "'>'>"], output)
+        self.check_events(["<a b='>", "'>"], output)
+        self.check_events(["<a b='>'", ">"], output)
+
+        output = [("comment", "abc")]
+        self.check_events(["", "<!--abc-->"], output)
+        self.check_events(["<", "!--abc-->"], output)
+        self.check_events(["<!", "--abc-->"], output)
+        self.check_events(["<!-", "-abc-->"], output)
+        self.check_events(["<!--", "abc-->"], output)
+        self.check_events(["<!--a", "bc-->"], output)
+        self.check_events(["<!--ab", "c-->"], output)
+        self.check_events(["<!--abc", "-->"], output)
+        self.check_events(["<!--abc-", "->"], output)
+        self.check_events(["<!--abc--", ">"], output)
+        self.check_events(["<!--abc-->", ""], output)
+
+    def _test_starttag_junk_chars(self):
+        self.check_parse_error("<")
+        self.check_parse_error("<>")
+        self.check_parse_error("</$>")
+        self.check_parse_error("</")
+        self.check_parse_error("</a")
+        self.check_parse_error("<$")
+        self.check_parse_error("<$>")
+        self.check_parse_error("<!")
+        self.check_parse_error("<a $>")
+        self.check_parse_error("<a")
+        self.check_parse_error("<a foo='bar'")
+        self.check_parse_error("<a foo='bar")
+        self.check_parse_error("<a foo='>'")
+        self.check_parse_error("<a foo='>")
+        self.check_parse_error("<a foo=>")
+
+
+# Run the test suite when this file is executed directly as a script.
+if __name__ == "__main__":
+    unittest.main()