Commits

ms2ger committed 6e160fa

Use unicode literals and remove references to the unicode function.

Comments (0)

Files changed (10)

 Post-process a document, adding cross-references, table of contents, etc.
 """
 
+from __future__ import unicode_literals
+
 import sys
 from argparse import ArgumentParser, SUPPRESS
 
         try:
             input = open(args.input, "rb")
         except IOError as e:
-            sys.stderr.write(unicode(e) + u"\n")
+            sys.stderr.write(str(e) + "\n")
             sys.exit(1)
     else:
         input = sys.stdin
         try:
             output = open(args.output, "wb")
         except IOError as e:
-            sys.stderr.write(unicode(e) + u"\n")
+            sys.stderr.write(str(e) + "\n")
             sys.exit(1)
     else:
         output = sys.stdout
         generator.toFile(tree, output, **kwargs)
         output.close()
     except (utils.AnolisException, IOError, etree.XMLSyntaxError) as e:
-        sys.stderr.write(unicode(e) + u"\n")
+        sys.stderr.write(str(e) + "\n")
         sys.exit(1)
 
 
         processes=["filter", "sub", "toc", "xref", "annotate"],
         parser="html5lib",
         serializer="html5lib",
-        newline_char=u"\n",
-        indent_char=u" ",
+        newline_char="\n",
+        indent_char=" ",
         filter=None,
         annotation=None,
         annotate_whatwg_status=False,

anolislib/generator.py

 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 # THE SOFTWARE.
 
+from __future__ import unicode_literals
+
 import html5lib
 from html5lib import treebuilders, treewalkers
 from html5lib.serializer import htmlserializer
     for process in processes:
         try:
             process_module = getattr(__import__('processes', globals(),
-                                                locals(), [process], -1),
+                                                locals(), [str(process)], -1),
                                     process)
         except AttributeError:
             process_module = __import__(process, globals(), locals(), [], -1)

anolislib/processes/annotate.py

 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 # THE SOFTWARE.
 
+from __future__ import unicode_literals
+
 from lxml import etree
 from collections import defaultdict
 

anolislib/processes/outliner.py

 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 # THE SOFTWARE.
 
+from __future__ import unicode_literals
+
 from lxml import etree
 
 from anolislib import utils
 
 # Rank of heading elements (these are negative so h1 > h6)
-fixedRank = {u"h1": -1, u"h2": -2, u"h3": -3, u"h4": -4, u"h5": -5, u"h6": -6}
+fixedRank = {"h1": -1, "h2": -2, "h3": -3, "h4": -4, "h5": -5, "h6": -6}
 
 
 class section(list):
         # h1–h6 element descendant of the hgroup element, if there are any such
         # elements, or otherwise the same as for an h1 element (the highest
         # rank).
-        elif element.tag == u"hgroup":
+        elif element.tag == "hgroup":
             for i in range(1, 6):
-                if element.find(u".//h" + unicode(i)) is not None:
+                if element.find(".//h%i" % i) is not None:
                     return -i
             else:
                 return -1

anolislib/processes/replaceHeadings.py

 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 # THE SOFTWARE.
 
+from __future__ import unicode_literals
+
 from anolislib import utils
 from anolislib.processes import outliner
 
-numered_headings = frozenset([u"h1", u"h2", u"h3", u"h4", u"h5", u"h6"])
+numered_headings = frozenset(["h1", "h2", "h3", "h4", "h5", "h6"])
 
 
 class replaceHeadings(object):
             if section.header is not None and section.header.tag in \
                                               numered_headings:
                 if depth <= 6:
-                    section.header.tag = u"h" + unicode(depth)
+                    section.header.tag = "h%i" % depth
                 else:
-                    raise TooDeepException(u"Too deep for numbered headers")
+                    raise TooDeepException("Too deep for numbered headers")
             
             # Add subsections in reverse order (so the next one is executed
             # next) with a higher depth value

anolislib/processes/sub.py

 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 # THE SOFTWARE.
 
+from __future__ import unicode_literals
+
 import re
 import time
 import os
 
 from anolislib import utils
 
-latest_version = re.compile(u"latest[%s]+version" % utils.spaceCharacters,
+latest_version = re.compile("latest[%s]+version" % utils.spaceCharacters,
                             re.IGNORECASE)
 
 w3c_tr_url_status = r"http://www\.w3\.org/TR/[^/]*/(MO|WD|CR|PR|REC|PER|NOTE)-"
 w3c_tr_url_status = re.compile(w3c_tr_url_status)
 
 title = re.compile(r"\[TITLE[^\]]*\]")
-title_identifier = u"[TITLE"
+title_identifier = "[TITLE"
 
 status = re.compile(r"\[STATUS[^\]]*\]")
-status_identifier = u"[STATUS"
+status_identifier = "[STATUS"
 
 longstatus = re.compile(r"\[LONGSTATUS[^\]]*\]")
-longstatus_identifier = u"[LONGSTATUS"
+longstatus_identifier = "[LONGSTATUS"
 longstatus_map = {
-    u"MO": u"W3C Member-only Draft",
-    u"ED": u"Editor's Draft",
-    u"WD": u"W3C Working Draft",
-    u"CR": u"W3C Candidate Recommendation",
-    u"PR": u"W3C Proposed Recommendation",
-    u"REC": u"W3C Recommendation",
-    u"PER": u"W3C Proposed Edited Recommendation",
-    u"NOTE": u"W3C Working Group Note"
+    "MO": "W3C Member-only Draft",
+    "ED": "Editor's Draft",
+    "WD": "W3C Working Draft",
+    "CR": "W3C Candidate Recommendation",
+    "PR": "W3C Proposed Recommendation",
+    "REC": "W3C Recommendation",
+    "PER": "W3C Proposed Edited Recommendation",
+    "NOTE": "W3C Working Group Note"
 }
 
 shortname = re.compile(r"\[SHORTNAME[^\]]*\]")
-shortname_identifier = u"[SHORTNAME"
+shortname_identifier = "[SHORTNAME"
 
 latest = re.compile(r"\[LATEST[^\]]*\]")
-latest_identifier = u"[LATEST"
+latest_identifier = "[LATEST"
 
 version = re.compile(r"\[VERSION[^\]]*\]")
-version_identifier = u"[VERSION"
+version_identifier = "[VERSION"
 
 w3c_stylesheet = re.compile(r"http://www\.w3\.org/StyleSheets/TR/W3C-[A-Z]+")
-w3c_stylesheet_identifier = u"http://www.w3.org/StyleSheets/TR/W3C-"
+w3c_stylesheet_identifier = "http://www.w3.org/StyleSheets/TR/W3C-"
 
 basic_comment_subs = ()
 
                             **kwargs):
         # Get doc_title from the title element
         try:
-            doc_title = utils.textContent(ElementTree.getroot().find(u"head")
-                                                               .find(u"title"))
+            doc_title = utils.textContent(ElementTree.getroot().find("head")
+                                                               .find("title"))
         except (AttributeError, TypeError):
-            doc_title = u""
+            doc_title = ""
 
         year = re.compile(r"\[YEAR[^\]]*\]")
-        year_sub = time.strftime(u"%Y", self.pubdate)
-        year_identifier = u"[YEAR"
+        year_sub = time.strftime("%Y", self.pubdate)
+        year_identifier = "[YEAR"
 
         date = re.compile(r"\[DATE[^\]]*\]")
-        date_sub = time.strftime(u"%d %B %Y", self.pubdate).lstrip(u"0")
-        date_identifier = u"[DATE"
+        date_sub = time.strftime("%d %B %Y", self.pubdate).lstrip("0")
+        date_identifier = "[DATE"
 
         cdate = re.compile(r"\[CDATE[^\]]*\]")
-        cdate_sub = time.strftime(u"%Y%m%d", self.pubdate)
-        cdate_identifier = u"[CDATE"
+        cdate_sub = time.strftime("%Y%m%d", self.pubdate)
+        cdate_identifier = "[CDATE"
 
         udate = re.compile(r"\[UDATE[^\]]*\]")
-        udate_sub = time.strftime(u"%Y-%m-%d", self.pubdate)
-        udate_identifier = u"[UDATE"
+        udate_sub = time.strftime("%Y-%m-%d", self.pubdate)
+        udate_identifier = "[UDATE"
 
         string_subs = ((year, year_sub, year_identifier),
                        (date, date_sub, date_identifier),
 
         if w3c_compat_crazy_substitutions:
             # Get the right stylesheet
-            doc_w3c_stylesheet = u"http://www.w3.org/StyleSheets/TR/W3C-%s" % (self.w3c_status, )
+            doc_w3c_stylesheet = "http://www.w3.org/StyleSheets/TR/W3C-%s" % (self.w3c_status, )
 
         # Get all the subs we want
         string_subs += ((title, doc_title, title_identifier), )
             try:
                 shortname_sub = w3c_shortname or os.path.basename(os.getcwd())
             except OSError:
-                shortname_sub = u""
-            latest_sub = u"http://www.w3.org/TR/%s/" % (shortname_sub, )
-            version_sub = u"http://www.w3.org/TR/%s/%s-%s-%s/" % (year_sub, self.w3c_status, shortname_sub, cdate_sub)
+                shortname_sub = ""
+            latest_sub = "http://www.w3.org/TR/%s/" % (shortname_sub, )
+            version_sub = "http://www.w3.org/TR/%s/%s-%s-%s/" % (year_sub, self.w3c_status, shortname_sub, cdate_sub)
             string_subs += ((status, self.w3c_status, status_identifier),
                             (longstatus, doc_longstatus, longstatus_identifier),
                             (shortname, shortname_sub, shortname_identifier),
 
         # Add more basic substitutions in compat. mode
         if w3c_compat or w3c_compat_substitutions:
-            copyright = u"copyright"
-            copyright_sub = etree.fromstring(u'<p class="copyright"><a href="http://www.w3.org/Consortium/Legal/ipr-notice#Copyright">Copyright</a> &#xA9; %s <a href="http://www.w3.org/"><abbr title="World Wide Web Consortium">W3C</abbr></a><sup>&#xAE;</sup> (<a href="http://www.csail.mit.edu/"><abbr title="Massachusetts Institute of Technology">MIT</abbr></a>, <a href="http://www.ercim.eu/"><abbr title="European Research Consortium for Informatics and Mathematics">ERCIM</abbr></a>, <a href="http://www.keio.ac.jp/">Keio</a>, <a href="http://ev.buaa.edu.cn/">Beihang</a>), All Rights Reserved. W3C <a href="http://www.w3.org/Consortium/Legal/ipr-notice#Legal_Disclaimer">liability</a>, <a href="http://www.w3.org/Consortium/Legal/ipr-notice#W3C_Trademarks">trademark</a> and <a href="http://www.w3.org/Consortium/Legal/copyright-documents">document use</a> rules apply.</p>' % time.strftime(u"%Y", self.pubdate))
+            copyright = "copyright"
+            copyright_sub = etree.fromstring('<p class="copyright"><a href="http://www.w3.org/Consortium/Legal/ipr-notice#Copyright">Copyright</a> &#xA9; %s <a href="http://www.w3.org/"><abbr title="World Wide Web Consortium">W3C</abbr></a><sup>&#xAE;</sup> (<a href="http://www.csail.mit.edu/"><abbr title="Massachusetts Institute of Technology">MIT</abbr></a>, <a href="http://www.ercim.eu/"><abbr title="European Research Consortium for Informatics and Mathematics">ERCIM</abbr></a>, <a href="http://www.keio.ac.jp/">Keio</a>, <a href="http://ev.buaa.edu.cn/">Beihang</a>), All Rights Reserved. W3C <a href="http://www.w3.org/Consortium/Legal/ipr-notice#Legal_Disclaimer">liability</a>, <a href="http://www.w3.org/Consortium/Legal/ipr-notice#W3C_Trademarks">trademark</a> and <a href="http://www.w3.org/Consortium/Legal/copyright-documents">document use</a> rules apply.</p>' % time.strftime("%Y", self.pubdate))
 
-            logo = u"logo"
-            logo_str = u'<a href="http://www.w3.org/"><img height="48" width="72" alt="W3C" src="http://www.w3.org/Icons/w3c_home"/></a>'
+            logo = "logo"
+            logo_str = '<a href="http://www.w3.org/"><img height="48" width="72" alt="W3C" src="http://www.w3.org/Icons/w3c_home"/></a>'
             if enable_woolly:
-                logo_str += u'<a class="logo" href="https://www.w3.org/Style/Group/" rel="in-activity"><img alt="CSS WG" src="https://www.w3.org/Style/Woolly/woolly-icon"/></a>'
+                logo_str += '<a class="logo" href="https://www.w3.org/Style/Group/" rel="in-activity"><img alt="CSS WG" src="https://www.w3.org/Style/Woolly/woolly-icon"/></a>'
 
-            logo_sub = etree.fromstring(u'<p>%s</p>' % logo_str)
+            logo_sub = etree.fromstring('<p>%s</p>' % logo_str)
 
             instance_basic_comment_subs += ((logo, logo_sub),
                                             (copyright, copyright_sub))
         for node in ElementTree.iter():
             if link_parent is not None:
                 if node.tag is etree.Comment and \
-                   node.text.strip(utils.spaceCharacters) == u"end-link":
+                   node.text.strip(utils.spaceCharacters) == "end-link":
                     if node.getparent() is not link_parent:
-                        raise utils.DifferentParentException(u"begin-link and end-link have different parents")
+                        raise utils.DifferentParentException("begin-link and end-link have different parents")
                     utils.removeInteractiveContentChildren(link)
-                    link.set(u"href", utils.textContent(link))
+                    link.set("href", utils.textContent(link))
                     link_parent = None
                 else:
                     if node.getparent() is link_parent:
                         link.append(deepcopy(node))
                     to_remove.add(node)
             elif node.tag is etree.Comment and \
-                 node.text.strip(utils.spaceCharacters) == u"begin-link":
+                 node.text.strip(utils.spaceCharacters) == "begin-link":
                 link_parent = node.getparent()
-                link = etree.Element(u"a")
+                link = etree.Element("a")
                 link.text = node.tail
                 node.tail = None
                 node.addnext(link)
         # Get all text nodes that contain case-insensitively "latest version"
         # with any amount of whitespace inside the phrase, or contain
         # http://www.w3.org/TR/
-        for text in ElementTree.xpath(u"//text()[contains(translate(., 'LATEST', 'latest'), 'latest') and contains(translate(., 'VERSION', 'version'), 'version') or contains(., 'http://www.w3.org/TR/')]"):
+        for text in ElementTree.xpath("//text()[contains(translate(., 'LATEST', 'latest'), 'latest') and contains(translate(., 'VERSION', 'version'), 'version') or contains(., 'http://www.w3.org/TR/')]"):
             if latest_version.search(text):
-                return u"ED"
+                return "ED"
             elif w3c_tr_url_status.search(text):
                 return w3c_tr_url_status.search(text).group(1)
         # Didn't find any status, return the default (ED)
         else:
-            return u"ED"
+            return "ED"
 
 class DifferentParentException(utils.AnolisException):
     """begin-link and end-link do not have the same parent."""

anolislib/processes/toc.py

 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 # THE SOFTWARE.
 
+from __future__ import unicode_literals
+
 from lxml import etree
 from copy import deepcopy
 
 from anolislib.processes import outliner
 
 # These are just the non-interactive elements to be removed
-remove_elements_from_toc = frozenset([u"dfn", ])
+remove_elements_from_toc = frozenset(["dfn", ])
 # These are, however, all the attributes to be removed
-remove_attributes_from_toc = frozenset([u"id", ])
+remove_attributes_from_toc = frozenset(["id", ])
 
 
 class toc(object):
     toc = None
 
     def __init__(self, ElementTree, **kwargs):
-        self.toc = etree.Element(u"ol", {u"class": u"toc"})
+        self.toc = etree.Element("ol", {"class": "toc"})
         self.buildToc(ElementTree, **kwargs)
         self.addToc(ElementTree, **kwargs)
 
             # If we have a header, regardless of how deep we are
             if section.header is not None:
                 # Get the element that represents the section header's text
-                if section.header.tag == u"hgroup":
+                if section.header.tag == "hgroup":
                     i = 1
                     while i <= 6:
-                        header_text = section.header.find(u".//h" + unicode(i))
+                        header_text = section.header.find(".//h%i" % i)
                         if header_text is not None:
                             break
                         i += 1
             # If we have a section heading text element, regardless of depth
             if header_text is not None:
                 # Remove any existing number
-                for element in header_text.findall(u".//span"):
-                    if utils.elementHasClass(element, u"secno"):
+                for element in header_text.findall(".//span"):
+                    if utils.elementHasClass(element, "secno"):
                         # Copy content, to prepare for the node being
                         # removed
                         utils.copyContentForRemoval(element, text=False,
 
                 # Increment the current section's number
                 if header_text is not None and \
-                   not utils.elementHasClass(header_text, u"no-num") or \
+                   not utils.elementHasClass(header_text, "no-num") or \
                    header_text is None and section:
                     num[-1] += 1
 
                 # Get the current TOC section for this depth, and add another
                 # item to it
                 if header_text is not None and \
-                   not utils.elementHasClass(header_text, u"no-toc") or \
+                   not utils.elementHasClass(header_text, "no-toc") or \
                    header_text is None and section:
                     # Find the appropriate section of the TOC
                     i = 0
                             # If the final li has no children, or the last
                             # children isn't an ol element
                             if len(toc_section[-1]) == 0 or \
-                               toc_section[-1][-1].tag != u"ol":
-                                toc_section[-1].append(etree.Element(u"ol"))
+                               toc_section[-1][-1].tag != "ol":
+                                toc_section[-1].append(etree.Element("ol"))
                                 utils.indentNode(toc_section[-1][-1],
                                                  (i + 1) * 2, **kwargs)
                                 if w3c_compat or w3c_compat_class_toc:
-                                    toc_section[-1][-1].set(u"class", u"toc")
+                                    toc_section[-1][-1].set("class", "toc")
                         except IndexError:
                             # If the current ol has no li in it
-                            toc_section.append(etree.Element(u"li"))
+                            toc_section.append(etree.Element("li"))
                             utils.indentNode(toc_section[0], (i + 1) * 2 - 1,
                                              **kwargs)
-                            toc_section[0].append(etree.Element(u"ol"))
+                            toc_section[0].append(etree.Element("ol"))
                             utils.indentNode(toc_section[0][0], (i + 1) * 2,
                                              **kwargs)
                             if w3c_compat or w3c_compat_class_toc:
-                                toc_section[0][0].set(u"class", u"toc")
+                                toc_section[0][0].set("class", "toc")
                         # TOC Section is now the final child (ol) of the final
                         # item (li) in the previous section
-                        assert toc_section[-1].tag == u"li"
-                        assert toc_section[-1][-1].tag == u"ol"
+                        assert toc_section[-1].tag == "li"
+                        assert toc_section[-1][-1].tag == "ol"
                         toc_section = toc_section[-1][-1]
                         i += 1
                     # Add the current item to the TOC
-                    item = etree.Element(u"li")
+                    item = etree.Element("li")
                     toc_section.append(item)
                     utils.indentNode(item, (i + 1) * 2 - 1, **kwargs)
 
                 if header_text is not None:
                     # Add ID to header
                     id = utils.generateID(header_text, **kwargs)
-                    if header_text.get(u"id") is not None:
-                        del header_text.attrib[u"id"]
-                    section.header.set(u"id", id)
+                    if header_text.get("id") is not None:
+                        del header_text.attrib["id"]
+                    section.header.set("id", id)
 
                     # Add number, if @class doesn't contain no-num
-                    if not utils.elementHasClass(header_text, u"no-num"):
-                        header_text[0:0] = [etree.Element(u"span", {u"class":
-                                                                    u"secno"})]
+                    if not utils.elementHasClass(header_text, "no-num"):
+                        header_text[0:0] = [etree.Element("span", {"class":
+                                                                   "secno"})]
                         header_text[0].tail = header_text.text
                         header_text.text = None
-                        header_text[0].text = u".".join(map(unicode, num))
-                        header_text[0].text += u" "
+                        header_text[0].text = ".".join("%s" % n for n in num)
+                        header_text[0].text += " "
                     # Add to TOC, if @class doesn't contain no-toc
-                    if not utils.elementHasClass(header_text, u"no-toc"):
+                    if not utils.elementHasClass(header_text, "no-toc"):
                         link = deepcopy(header_text)
                         item.append(link)
                         # Make it link to the header
-                        link.tag = u"a"
-                        link.set(u"href", u"#" + id)
+                        link.tag = "a"
+                        link.set("href", "#" + id)
                         # Remove interactive content child elements
                         utils.removeInteractiveContentChildren(link)
                         # Remove other child elements
                         for element_name in remove_elements_from_toc:
                            # Iterate over all the descendants of the new link
                             # with that element name
-                            for element in link.findall(u".//" + element_name):
+                            for element in link.findall(".//" + element_name):
                                 # Copy content, to prepare for the node being
                                 # removed
                                 utils.copyContentForRemoval(element)
                              for child_section in reversed(section)])
 
     def addToc(self, ElementTree, **kwargs):
-        utils.replaceComment(ElementTree, u"toc", self.toc, **kwargs)
+        utils.replaceComment(ElementTree, "toc", self.toc, **kwargs)
 
 
 class DifferentParentException(utils.AnolisException):

anolislib/processes/xref.py

 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 # THE SOFTWARE.
 
+from __future__ import unicode_literals
+
 import re
 from lxml import etree
 
 
 from anolislib import utils
 
-instance_elements = frozenset([u"span", u"abbr", u"code", u"var", u"i"])
-w3c_instance_elements = frozenset([u"abbr", u"acronym", u"b", u"bdo", u"big",
-                                   u"code", u"del", u"em", u"i", u"ins",
-                                   u"kbd", u"label", u"legend", u"q", u"samp",
-                                   u"small", u"span", u"strong", u"sub",
-                                   u"sup", u"tt", u"var"])
+instance_elements = frozenset(["span", "abbr", "code", "var", "i"])
+w3c_instance_elements = frozenset(["abbr", "acronym", "b", "bdo", "big",
+                                   "code", "del", "em", "i", "ins",
+                                   "kbd", "label", "legend", "q", "samp",
+                                   "small", "span", "strong", "sub",
+                                   "sup", "tt", "var"])
 
 # Instances cannot be in the stack with any of these element, or with
 # interactive elements
-instance_not_in_stack_with = frozenset([u"dfn", ])
+instance_not_in_stack_with = frozenset(["dfn", ])
 
 non_alphanumeric_spaces = re.compile(r"[^a-zA-Z0-9 \-\_\/]+")
 
             self.dump(self.getDfns(dump_xrefs), dump_xrefs, **kwargs)
         self.addReferences(ElementTree, dump_backrefs=dump_backrefs, **kwargs)
         if dump_backrefs:
-            self.dump(self.instances, u"backrefs.json", **kwargs)
+            self.dump(self.instances, "backrefs.json", **kwargs)
 
     def buildReferences(self, ElementTree, allow_duplicate_dfns=False,
                         **kwargs):
-        for dfn in ElementTree.iter(u"dfn"):
+        for dfn in ElementTree.iter("dfn"):
             term = self.getTerm(dfn, **kwargs)
 
             if len(term) > 0:
                 if not allow_duplicate_dfns and term in self.dfns:
-                    raise DuplicateDfnException(u'The term "%s" is defined more than once' % term)
+                    raise DuplicateDfnException('The term "%s" is defined more than once' % term)
 
                 link_to = dfn
 
 
                 id = utils.generateID(link_to, **kwargs)
 
-                link_to.set(u"id", id)
+                link_to.set("id", id)
 
                 self.dfns[term] = id
                 self.instances[term] = []
     def dump(self, obj, f, **kwargs):
         d = json.dumps(obj, sort_keys=True, allow_nan=False, indent=2, separators=(',', ': '))
         fp = open(f, "w")
-        fp.write(d + u"\n")
+        fp.write(d + "\n")
         fp.close()
 
     def addReferences(self, ElementTree, w3c_compat=False,
                                 break
 
                     if goodParentingAndChildren:
-                        if element.tag == u"span":
-                            element.tag = u"a"
-                            element.set(u"href", u"#" + self.dfns[term])
+                        if element.tag == "span":
+                            element.tag = "a"
+                            element.set("href", "#" + self.dfns[term])
                             link = element
                         else:
-                            link = etree.Element(u"a",
-                                                 {u"href":
-                                                  u"#" + self.dfns[term]})
+                            link = etree.Element("a",
+                                                 {"href":
+                                                  "#" + self.dfns[term]})
                             if w3c_compat or w3c_compat_xref_a_placement:
                                 for node in element:
                                     link.append(node)
                                 link.tail = link[0].tail
                                 link[0].tail = None
                         if dump_backrefs:
-                            t = utils.non_ifragment.sub(u"-", term.strip(utils.spaceCharacters)).strip(u"-")
-                            id = u"instance_" + t + u"_" + str(len(self.instances[term]))
-                            link.set(u"id", id)
+                            t = utils.non_ifragment.sub("-", term.strip(utils.spaceCharacters)).strip("-")
+                            id = "instance_" + t + "_" + str(len(self.instances[term]))
+                            link.set("id", id)
                             self.instances[term].append(id)
                 elif use_strict and term and \
                      not utils.elementHasClass(element, "secno") and \
-                     not u"data-anolis-spec" in element.attrib and \
-                     not u"data-anolis-ref" in element.attrib and \
+                     not "data-anolis-spec" in element.attrib and \
+                     not "data-anolis-ref" in element.attrib and \
                      not element.getparent().tag in instance_not_in_stack_with:
                     raise SyntaxError("Term not defined: %s, %s." % (term, element))
 
     def getTerm(self, element, w3c_compat=False,
                 w3c_compat_xref_normalization=False, **kwargs):
-        if element.get(u"data-anolis-xref") is not None:
-            term = element.get(u"data-anolis-xref")
-        elif element.get(u"title") is not None:
-            term = element.get(u"title")
+        if element.get("data-anolis-xref") is not None:
+            term = element.get("data-anolis-xref")
+        elif element.get("title") is not None:
+            term = element.get("title")
         else:
             term = utils.textContent(element)
 
         term = term.strip(utils.spaceCharacters).lower()
 
-        term = utils.spacesRegex.sub(u" ", term)
+        term = utils.spacesRegex.sub(" ", term)
 
         if w3c_compat or w3c_compat_xref_normalization:
-            term = non_alphanumeric_spaces.sub(u"", term)
+            term = non_alphanumeric_spaces.sub("", term)
 
         return term
 

anolislib/processes/xspecxref.py

 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 # THE SOFTWARE.
 
+from __future__ import unicode_literals
+
 from lxml import etree
 
 try:
 
 from anolislib import utils
 
-instance_elements = frozenset([u"span", u"abbr", u"code", u"var", u"i"])
-w3c_instance_elements = frozenset([u"abbr", u"acronym", u"b", u"bdo", u"big",
-                   u"code", u"del", u"em", u"i", u"ins",
-                   u"kbd", u"label", u"legend", u"q", u"samp",
-                   u"small", u"span", u"strong", u"sub",
-                   u"sup", u"tt", u"var"])
+instance_elements = frozenset(["span", "abbr", "code", "var", "i"])
+w3c_instance_elements = frozenset(["abbr", "acronym", "b", "bdo", "big",
+                   "code", "del", "em", "i", "ins",
+                   "kbd", "label", "legend", "q", "samp",
+                   "small", "span", "strong", "sub",
+                   "sup", "tt", "var"])
 
 # Instances cannot be in the stack with any of these element, or with
 # interactive elements
-instance_not_in_stack_with = frozenset([u"dfn", ])
+instance_not_in_stack_with = frozenset(["dfn", ])
 
 class xspecxref(object):
   """Add cross-references."""
       if ((element.tag in instance_elements
           or (w3c_compat or w3c_compat_xref_elements)
           and element.tag in w3c_instance_elements)
-          and (element.get(u"data-anolis-spec") is not None)):
+          and (element.get("data-anolis-spec") is not None)):
         term = self.getTerm(element, **kwargs)
-        spec = element.get(u"data-anolis-spec")
+        spec = element.get("data-anolis-spec")
         if w3c_compat:
           del element.attrib["data-anolis-spec"]
-        if element.get(u"class") is not None:
-          element.set(u"class", element.get(u"class") + u" external")
+        if element.get("class") is not None:
+          element.set("class", element.get("class") + " external")
         else:
-          element.set(u"class", u"external")
+          element.set("class", "external")
 
         if not spec in self.dfns or not self.dfns[spec]:
           raise SyntaxError("Specification not found: %s." % spec)
               break
 
         if goodParentingAndChildren:
-          if element.tag == u"span":
-            element.tag = u"a"
-            element.set(u"href", obj["url"] + obj["values"][term])
+          if element.tag == "span":
+            element.tag = "a"
+            element.set("href", obj["url"] + obj["values"][term])
           else:
-            link = etree.Element(u"a",
-                       {u"href":
+            link = etree.Element("a",
+                       {"href":
                         obj["url"] + obj["values"][term]})
             if w3c_compat or w3c_compat_xref_a_placement:
               for node in element:
 
   def getTerm(self, element, w3c_compat=False,
               w3c_compat_xref_normalization=False, **kwargs):
-    if element.get(u"data-anolis-xref") is not None:
-      term = element.get(u"data-anolis-xref")
-    elif element.get(u"title") is not None:
-      term = element.get(u"title")
+    if element.get("data-anolis-xref") is not None:
+      term = element.get("data-anolis-xref")
+    elif element.get("title") is not None:
+      term = element.get("title")
     else:
       term = utils.textContent(element)
 
     term = term.strip(utils.spaceCharacters).lower()
 
-    return utils.spacesRegex.sub(u" ", term)
+    return utils.spacesRegex.sub(" ", term)

anolislib/utils.py

 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 # THE SOFTWARE.
 
+from __future__ import unicode_literals
+
 from copy import deepcopy
 import re
 import sys
 
 ids = {}
 
-spaceCharacters = u"".join(spaceCharacters)
-spacesRegex = re.compile(u"[%s]+" % spaceCharacters)
+spaceCharacters = "".join(spaceCharacters)
+spacesRegex = re.compile("[%s]+" % spaceCharacters)
 
-heading_content = frozenset([u"h1", u"h2", u"h3", u"h4", u"h5", u"h6",
-                             u"hgroup"])
-sectioning_content = frozenset([u"section", u"nav", u"article", u"aside"])
-sectioning_root = frozenset([u"body", u"blockquote", u"figure", u"td",
-                             u"datagrid"])
+heading_content = frozenset(["h1", "h2", "h3", "h4", "h5", "h6",
+                             "hgroup"])
+sectioning_content = frozenset(["section", "nav", "article", "aside"])
+sectioning_root = frozenset(["body", "blockquote", "figure", "td",
+                             "datagrid"])
 
-always_interactive_content = frozenset([u"a", u"bb", u"details", u"datagrid"])
-media_elements = frozenset([u"audio", u"video"])
+always_interactive_content = frozenset(["a", "bb", "details", "datagrid"])
+media_elements = frozenset(["audio", "video"])
 
 non_sgml_name = re.compile("[^A-Za-z0-9_:.]+")
 
 if sys.maxunicode == 0xFFFF:
     # UTF-16 Python
-    non_ifragment = re.compile(u"([\u0000-\u0020\u0022\u0023\u0025\\\u002D\u003C\u003E\u005B-\u005E\u0060\u007B-\u007D\u007F-\u0099\uD800-\uF8FF\uFDD0-\uFDDF\uFFF0-\uFFFF]|\U0001FFFE|\U0001FFFF|\U0002FFFE|\U0002FFFF|\U0003FFFE|\U0003FFFF|\U0004FFFE|\U0004FFFF|\U0005FFFE|\U0005FFFF|\U0006FFFE|\U0006FFFF|\U0007FFFE|\U0007FFFF|\U0008FFFE|\U0008FFFF|\U0009FFFE|\U0009FFFF|\U000AFFFE|\U000AFFFF|\U000BFFFE|\U000BFFFF|\U000CFFFE|\U000CFFFF|\uDB3F[\uDFFE-\uDFFF]|[\uDB40-\uDB43][\uDC00-\uDFFF]|\uDB7F[\uDFFE-\uDFFF]|[\uDB80-\uDBFF][\uDC00-\uDFFF])+")
+    non_ifragment = re.compile("([\u0000-\u0020\u0022\u0023\u0025\\\u002D\u003C\u003E\u005B-\u005E\u0060\u007B-\u007D\u007F-\u0099\uD800-\uF8FF\uFDD0-\uFDDF\uFFF0-\uFFFF]|\U0001FFFE|\U0001FFFF|\U0002FFFE|\U0002FFFF|\U0003FFFE|\U0003FFFF|\U0004FFFE|\U0004FFFF|\U0005FFFE|\U0005FFFF|\U0006FFFE|\U0006FFFF|\U0007FFFE|\U0007FFFF|\U0008FFFE|\U0008FFFF|\U0009FFFE|\U0009FFFF|\U000AFFFE|\U000AFFFF|\U000BFFFE|\U000BFFFF|\U000CFFFE|\U000CFFFF|\uDB3F[\uDFFE-\uDFFF]|[\uDB40-\uDB43][\uDC00-\uDFFF]|\uDB7F[\uDFFE-\uDFFF]|[\uDB80-\uDBFF][\uDC00-\uDFFF])+")
 else:
     # UTF-32 Python
-    non_ifragment = re.compile(u"[^A-Za-z0-9._~!$&'()*+,;=:@/\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF\U00010000-\U0001FFFD\U00020000-\U0002FFFD\U00030000-\U0003FFFD\U00040000-\U0004FFFD\U00050000-\U0005FFFD\U00060000-\U0006FFFD\U00070000-\U0007FFFD\U00080000-\U0008FFFD\U00090000-\U0009FFFD\U000A0000-\U000AFFFD\U000B0000-\U000BFFFD\U000C0000-\U000CFFFD\U000D0000-\U000DFFFD\U000E1000-\U000EFFFD]+")
+    non_ifragment = re.compile("[^A-Za-z0-9._~!$&'()*+,;=:@/\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF\U00010000-\U0001FFFD\U00020000-\U0002FFFD\U00030000-\U0003FFFD\U00040000-\U0004FFFD\U00050000-\U0005FFFD\U00060000-\U0006FFFD\U00070000-\U0007FFFD\U00080000-\U0008FFFD\U00090000-\U0009FFFD\U000A0000-\U000AFFFD\U000B0000-\U000BFFFD\U000C0000-\U000CFFFD\U000D0000-\U000DFFFD\U000E1000-\U000EFFFD]+")
 
 
 def splitOnSpaces(string):
 
 
 def elementHasClass(Element, class_name):
-    if Element.get(u"class") and \
-       class_name in splitOnSpaces(Element.get(u"class")):
+    if Element.get("class") and \
+       class_name in splitOnSpaces(Element.get("class")):
         return True
     else:
         return False
 
 
 def generateID(Element, force_html4_id=False, **kwargs):
-    if Element.get(u"id") is not None:
-        return Element.get(u"id")
-    elif Element.get(u"title") is not None and \
-         Element.get(u"title").strip(spaceCharacters) is not u"":
-        source = Element.get(u"title")
+    if Element.get("id") is not None:
+        return Element.get("id")
+    elif Element.get("title") is not None and \
+         Element.get("title").strip(spaceCharacters) != "":
+        source = Element.get("title")
     else:
         source = textContent(Element)
 
     source = source.strip(spaceCharacters).lower()
 
-    if source == u"":
-        source = u"generatedID"
+    if source == "":
+        source = "generatedID"
     elif force_html4_id or Element.getroottree().docinfo.public_id in \
-        (u"-//W3C//DTD HTML 4.0//EN",
-         u"-//W3C//DTD HTML 4.0 Transitional//EN",
-         u"-//W3C//DTD HTML 4.0 Frameset//EN",
-         u"-//W3C//DTD HTML 4.01//EN",
-         u"-//W3C//DTD HTML 4.01 Transitional//EN",
-         u"-//W3C//DTD HTML 4.01 Frameset//EN",
-         u"ISO/IEC 15445:2000//DTD HyperText Markup Language//EN",
-         u"ISO/IEC 15445:2000//DTD HTML//EN",
-         u"-//W3C//DTD XHTML 1.0 Strict//EN",
-         u"-//W3C//DTD XHTML 1.0 Transitional//EN",
-         u"-//W3C//DTD XHTML 1.0 Frameset//EN",
-         u"-//W3C//DTD XHTML 1.1//EN"):
-        source = non_sgml_name.sub(u"-", source).strip(u"-")
+        ("-//W3C//DTD HTML 4.0//EN",
+         "-//W3C//DTD HTML 4.0 Transitional//EN",
+         "-//W3C//DTD HTML 4.0 Frameset//EN",
+         "-//W3C//DTD HTML 4.01//EN",
+         "-//W3C//DTD HTML 4.01 Transitional//EN",
+         "-//W3C//DTD HTML 4.01 Frameset//EN",
+         "ISO/IEC 15445:2000//DTD HyperText Markup Language//EN",
+         "ISO/IEC 15445:2000//DTD HTML//EN",
+         "-//W3C//DTD XHTML 1.0 Strict//EN",
+         "-//W3C//DTD XHTML 1.0 Transitional//EN",
+         "-//W3C//DTD XHTML 1.0 Frameset//EN",
+         "-//W3C//DTD XHTML 1.1//EN"):
+        source = non_sgml_name.sub("-", source).strip("-")
         try:
             if not source[0].isalpha():
-                source = u"x" + source
+                source = "x" + source
         except IndexError:
-            source = u"generatedID"
+            source = "generatedID"
     else:
-        source = non_ifragment.sub(u"-", source).strip(u"-")
-        if source == u"":
-            source = u"generatedID"
+        source = non_ifragment.sub("-", source).strip("-")
+        if source == "":
+            source = "generatedID"
 
     # Initally set the id to the source
     id = source
 
     i = 0
     while getElementById(Element.getroottree().getroot(), id) is not None:
-        id = source + u"-" + unicode(i)
+        id = "%s-%i" % (source, i)
         i += 1
 
     ids[Element.getroottree().getroot()][id] = Element
     to_remove = set()
     
     # Replace img with its alt attribute
-    for child in Element.iter(tag=u"img"):
+    for child in Element.iter(tag="img"):
         # Add alt in its place
         if child.get("alt") is not None:
             if child.getprevious() is not None:
                 if child.getprevious().tail is None:
-                    child.getprevious().tail = child.get(u"alt")
+                    child.getprevious().tail = child.get("alt")
                 else:
-                    child.getprevious().tail += child.get(u"alt")
+                    child.getprevious().tail += child.get("alt")
             else:
                 if child.getparent().text is None:
-                    child.getparent().text = child.get(u"alt")
+                    child.getparent().text = child.get("alt")
                 else:
-                    child.getparent().text += child.get(u"alt")
+                    child.getparent().text += child.get("alt")
         # Preserve the element tail
         if child.tail is not None:
             if child.getprevious() is not None:
     else:
         ids[base] = {}
         for element in base.iter(tag=etree.Element):
-            if element.get(u"id"):
-                ids[base][element.get(u"id")] = element
+            if element.get("id"):
+                ids[base][element.get("id")] = element
         return getElementById(base, id)
 
 
 def escapeXPathString(string):
-    return u"concat('', '%s')" % string.replace(u"'", u"', \"'\", '")
+    return "concat('', '%s')" % string.replace("'", "', \"'\", '")
 
 
 def removeInteractiveContentChildren(element):
     # Iter over list of decendants of element
-    for child in element.findall(u".//*"):
+    for child in element.findall(".//*"):
         if isInteractiveContent(child):
             # Copy content, to prepare for the node being removed
             copyContentForRemoval(child)
 
 def isInteractiveContent(element):
     if element.tag in always_interactive_content \
-    or element.tag in media_elements and element.get(u"controls") is not None \
-    or element.tag == u"menu" and element.get(u"type") is not None and \
-       element.get(u"type").lower() == u"toolbar":
+    or element.tag in media_elements and element.get("controls") is not None \
+    or element.tag == "menu" and element.get("type") is not None and \
+       element.get("type").lower() == "toolbar":
         return True
     else:
         return False
                 node.getparent().text += node.tail
 
 def replaceComment(ElementTree, comment, sub, **kwargs):
-    begin_sub = u"begin-%s" % comment
-    end_sub = u"end-%s" % comment
+    begin_sub = "begin-%s" % comment
+    end_sub = "end-%s" % comment
     sub_parent = None
     to_remove = set()
     for node in ElementTree.iter():
             if node.tag is etree.Comment and \
                node.text.strip(spaceCharacters) == end_sub:
                 if node.getparent() is not sub_parent:
-                    raise DifferentParentException(u"%s and %s have different parents" % begin_sub, end_sub)
+                    raise DifferentParentException("%s and %s have different parents" % (begin_sub, end_sub))
                 sub_parent = None
             else:
                 to_remove.add(node)
     for node in to_remove:
         node.getparent().remove(node)
 
-def indentNode(node, indent=0, newline_char=u"\n", indent_char=u" ", **kwargs):
+def indentNode(node, indent=0, newline_char="\n", indent_char=" ", **kwargs):
     whitespace = newline_char + indent_char * indent
     if node.getprevious() is not None:
         if node.getprevious().tail is None: