1. Amirouche Boubekki
  2. n0v4pl4n3t

Source

n0v4pl4n3t / html5lib / filters / optionaltags.py

import _base

class Filter(_base.Filter):
    def slider(self):
        previous1 = previous2 = None
        for token in self.source:
            if previous1 is not None:
                yield previous2, previous1, token
            previous2 = previous1
            previous1 = token
        yield previous2, previous1, None

    def __iter__(self):
        for previous, token, next in self.slider():
            type = token["type"]
            if type == "StartTag":
                if (token["data"] or 
                    not self.is_optional_start(token["name"], previous, next)):
                    yield token
            elif type == "EndTag":
                if not self.is_optional_end(token["name"], next):
                    yield token
            else:
                yield token

    def is_optional_start(self, tagname, previous, next):
        type = next and next["type"] or None
        if tagname in 'html':
            # An html element's start tag may be omitted if the first thing
            # inside the html element is not a space character or a comment.
            return type not in ("Comment", "SpaceCharacters")
        elif tagname == 'head':
            # A head element's start tag may be omitted if the first thing
            # inside the head element is an element.
            # XXX: we also omit the start tag if the head element is empty
            if type in ("StartTag", "EmptyTag"):
                return True
            elif type == "EndTag":
                return next["name"] == "head"
        elif tagname == 'body':
            # A body element's start tag may be omitted if the first thing
            # inside the body element is not a space character or a comment,
            # except if the first thing inside the body element is a script
            # or style element and the node immediately preceding the body
            # element is a head element whose end tag has been omitted.
            if type in ("Comment", "SpaceCharacters"):
                return False
            elif type == "StartTag":
                # XXX: we do not look at the preceding event, so we never omit
                # the body element's start tag if it's followed by a script or
                # a style element.
                return next["name"] not in ('script', 'style')
            else:
                return True
        elif tagname == 'colgroup':
            # A colgroup element's start tag may be omitted if the first thing
            # inside the colgroup element is a col element, and if the element
            # is not immediately preceeded by another colgroup element whose
            # end tag has been omitted.
            if type in ("StartTag", "EmptyTag"):
                # XXX: we do not look at the preceding event, so instead we never
                # omit the colgroup element's end tag when it is immediately
                # followed by another colgroup element. See is_optional_end.
                return next["name"] == "col"
            else:
                return False
        elif tagname == 'tbody':
            # A tbody element's start tag may be omitted if the first thing
            # inside the tbody element is a tr element, and if the element is
            # not immediately preceeded by a tbody, thead, or tfoot element
            # whose end tag has been omitted.
            if type == "StartTag":
                # omit the thead and tfoot elements' end tag when they are
                # immediately followed by a tbody element. See is_optional_end.
                if previous and previous['type'] == 'EndTag' and \
                  previous['name'] in ('tbody','thead','tfoot'):
                    return False
                return next["name"] == 'tr'
            else:
                return False
        return False

    def is_optional_end(self, tagname, next):
        type = next and next["type"] or None
        if tagname in ('html', 'head', 'body'):
            # An html element's end tag may be omitted if the html element
            # is not immediately followed by a space character or a comment.
            return type not in ("Comment", "SpaceCharacters")
        elif tagname in ('li', 'optgroup', 'tr'):
            # A li element's end tag may be omitted if the li element is
            # immediately followed by another li element or if there is
            # no more content in the parent element.
            # An optgroup element's end tag may be omitted if the optgroup
            # element is immediately followed by another optgroup element,
            # or if there is no more content in the parent element.
            # A tr element's end tag may be omitted if the tr element is
            # immediately followed by another tr element, or if there is
            # no more content in the parent element.
            if type == "StartTag":
                return next["name"] == tagname
            else:
                return type == "EndTag" or type is None
        elif tagname in ('dt', 'dd'):
            # A dt element's end tag may be omitted if the dt element is
            # immediately followed by another dt element or a dd element.
            # A dd element's end tag may be omitted if the dd element is
            # immediately followed by another dd element or a dt element,
            # or if there is no more content in the parent element.
            if type == "StartTag":
                return next["name"] in ('dt', 'dd')
            elif tagname == 'dd':
                return type == "EndTag" or type is None
            else:
                return False
        elif tagname == 'p':
            # A p element's end tag may be omitted if the p element is
            # immediately followed by an address, article, aside,
            # blockquote, datagrid, dialog, dir, div, dl, fieldset,
            # footer, form, h1, h2, h3, h4, h5, h6, header, hr, menu,
            # nav, ol, p, pre, section, table, or ul, element, or if
            # there is no more content in the parent element.
            if type in ("StartTag", "EmptyTag"):
                return next["name"] in ('address', 'article', 'aside',
                                        'blockquote', 'datagrid', 'dialog', 
                                        'dir', 'div', 'dl', 'fieldset', 'footer',
                                        'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
                                        'header', 'hr', 'menu', 'nav', 'ol', 
                                        'p', 'pre', 'section', 'table', 'ul')
            else:
                return type == "EndTag" or type is None
        elif tagname == 'option':
            # An option element's end tag may be omitted if the option
            # element is immediately followed by another option element,
            # or if it is immediately followed by an <code>optgroup</code>
            # element, or if there is no more content in the parent
            # element.
            if type == "StartTag":
                return next["name"] in ('option', 'optgroup')
            else:
                return type == "EndTag" or type is None
        elif tagname in ('rt', 'rp'):
            # An rt element's end tag may be omitted if the rt element is
            # immediately followed by an rt or rp element, or if there is
            # no more content in the parent element.
            # An rp element's end tag may be omitted if the rp element is
            # immediately followed by an rt or rp element, or if there is
            # no more content in the parent element.
            if type == "StartTag":
                return next["name"] in ('rt', 'rp')
            else:
                return type == "EndTag" or type is None
        elif tagname == 'colgroup':
            # A colgroup element's end tag may be omitted if the colgroup
            # element is not immediately followed by a space character or
            # a comment.
            if type in ("Comment", "SpaceCharacters"):
                return False
            elif type == "StartTag":
                # XXX: we also look for an immediately following colgroup
                # element. See is_optional_start.
                return next["name"] != 'colgroup'
            else:
                return True
        elif tagname in ('thead', 'tbody'):
            # A thead element's end tag may be omitted if the thead element
            # is immediately followed by a tbody or tfoot element.
            # A tbody element's end tag may be omitted if the tbody element
            # is immediately followed by a tbody or tfoot element, or if
            # there is no more content in the parent element.
            # A tfoot element's end tag may be omitted if the tfoot element
            # is immediately followed by a tbody element, or if there is no
            # more content in the parent element.
            # XXX: we never omit the end tag when the following element is
            # a tbody. See is_optional_start.
            if type == "StartTag":
                return next["name"] in ['tbody', 'tfoot']
            elif tagname == 'tbody':
                return type == "EndTag" or type is None
            else:
                return False
        elif tagname == 'tfoot':
            # A tfoot element's end tag may be omitted if the tfoot element
            # is immediately followed by a tbody element, or if there is no
            # more content in the parent element.
            # XXX: we never omit the end tag when the following element is
            # a tbody. See is_optional_start.
            if type == "StartTag":
                return next["name"] == 'tbody'
            else:
                return type == "EndTag" or type is None
        elif tagname in ('td', 'th'):
            # A td element's end tag may be omitted if the td element is
            # immediately followed by a td or th element, or if there is
            # no more content in the parent element.
            # A th element's end tag may be omitted if the th element is
            # immediately followed by a td or th element, or if there is
            # no more content in the parent element.
            if type == "StartTag":
                return next["name"] in ('td', 'th')
            else:
                return type == "EndTag" or type is None
        return False