Commits

Geoffrey Sneddon committed 9cbda25

Start move to PEP 8 coding standards.

  • Participants
  • Parent commits 2a8f8d0

Comments (0)

Files changed (8)

File anolislib/generator.py

 # coding=UTF-8
 # Copyright (c) 2008 Geoffrey Sneddon
-# 
+#
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
 # in the Software without restriction, including without limitation the rights
 # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 # copies of the Software, and to permit persons to whom the Software is
 # furnished to do so, subject to the following conditions:
-# 
+#
 # The above copyright notice and this permission notice shall be included in
 # all copies or substantial portions of the Software.
-# 
+#
 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 import lxml.html
 from lxml import etree
 
+
 def process(tree, processes=set(["sub", "toc", "xref"]), **kwargs):
-	""" Process the given tree. """
-	
-	# Find number of passes to do
-	for process in processes:
-		try:
-			process_module = getattr(__import__('processes', globals(), locals(), [process], -1), process)
-		except ImportError:
-			process_module = __import__(process, globals(), locals(), [], -1)
-		
-		getattr(process_module, process)(tree, **kwargs)
+    """ Process the given tree. """
 
-def fromFile(input, processes=set(["sub", "toc", "xref"]), xml=False, lxml_html=False, profile=False, **kwargs):
-	# Parse as XML:
-	#if xml:
-	if False:
-		tree = etree.parse(input)
-	# Parse as HTML using lxml.html
-	elif lxml_html:
-		tree = lxml.html.parse(input)
-	# Parse as HTML using html5lib
-	else:
-		parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("lxml", etree))
-		tree = parser.parse(input)
-	
-	# Close the input file
-	input.close()
-	
-	# Run the generator, and profile, or not, as the case may be
-	if profile:
-		import hotshot
-		import hotshot.stats
-		import os
-		import tempfile
-		statfile = tempfile.mkstemp()[1]
-		prof = hotshot.Profile(statfile)
-		prof.runcall(process, tree, processes, **kwargs)
-		prof.close()
-		stats = hotshot.stats.load(statfile)
-		stats.strip_dirs()
-		stats.sort_stats('time')
-		stats.print_stats()
-		os.remove(statfile)
-	else:
-		process(tree, processes, **kwargs)
-	
-	# Return the tree
-	return tree
+    # Find number of passes to do
+    for process in processes:
+        try:
+            process_module = getattr(__import__('processes', globals(),
+                                                locals(), [process], -1),
+                                    process)
+        except ImportError:
+            process_module = __import__(process, globals(), locals(), [], -1)
 
-def toFile(tree, output, xml=False, lxml_html=False, **kwargs):			
-	# Serialize to XML
-	#if xml:
-	if False:
-		rendered = etree.tostring(tree, encoding="utf-8")
-	# Serialize to HTML using lxml.html
-	elif lxml_html:
-		rendered = lxml.html.tostring(tree, encoding="utf-8")
-	# Serialize to HTML using html5lib
-	else:
-		walker = treewalkers.getTreeWalker("lxml")
-		s = serializer.htmlserializer.HTMLSerializer(**kwargs)
-		rendered = s.render(walker(tree), encoding="utf-8")
-	
-	# Write to the output
-	output.write(rendered)
+        getattr(process_module, process)(tree, **kwargs)
+
+
+def fromFile(input, processes=set(["sub", "toc", "xref"]), xml=False,
+             lxml_html=False, profile=False, **kwargs):
+    # Parse as XML:
+    #if xml:
+    if False:
+        tree = etree.parse(input)
+    # Parse as HTML using lxml.html
+    elif lxml_html:
+        tree = lxml.html.parse(input)
+    # Parse as HTML using html5lib
+    else:
+        parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("lxml",
+                                                                      etree))
+        tree = parser.parse(input)
+
+    # Close the input file
+    input.close()
+
+    # Run the generator, and profile, or not, as the case may be
+    if profile:
+        import hotshot
+        import hotshot.stats
+        import os
+        import tempfile
+        statfile = tempfile.mkstemp()[1]
+        prof = hotshot.Profile(statfile)
+        prof.runcall(process, tree, processes, **kwargs)
+        prof.close()
+        stats = hotshot.stats.load(statfile)
+        stats.strip_dirs()
+        stats.sort_stats('time')
+        stats.print_stats()
+        os.remove(statfile)
+    else:
+        process(tree, processes, **kwargs)
+
+    # Return the tree
+    return tree
+
+
+def toFile(tree, output, xml=False, lxml_html=False, **kwargs):
+    # Serialize to XML
+    #if xml:
+    if False:
+        rendered = etree.tostring(tree, encoding="utf-8")
+    # Serialize to HTML using lxml.html
+    elif lxml_html:
+        rendered = lxml.html.tostring(tree, encoding="utf-8")
+    # Serialize to HTML using html5lib
+    else:
+        walker = treewalkers.getTreeWalker("lxml")
+        s = serializer.htmlserializer.HTMLSerializer(**kwargs)
+        rendered = s.render(walker(tree), encoding="utf-8")
+
+    # Write to the output
+    output.write(rendered)
+
 
 def fromToFile(input, output, **kwargs):
-	tree = fromFile(input, **kwargs)
-	toFile(tree, output, **kwargs)
+    tree = fromFile(input, **kwargs)
+    toFile(tree, output, **kwargs)

File anolislib/processes/outliner.py

 # coding=UTF-8
 # Copyright (c) 2008 Geoffrey Sneddon
-# 
+#
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
 # in the Software without restriction, including without limitation the rights
 # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 # copies of the Software, and to permit persons to whom the Software is
 # furnished to do so, subject to the following conditions:
-# 
+#
 # The above copyright notice and this permission notice shall be included in
 # all copies or substantial portions of the Software.
-# 
+#
 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 from anolislib import utils
 
 # Rank of heading elements (these are negative so h1 > h6)
-rank = {u"h1": -1, u"h2": -2, u"h3": -3, u"h4": -4, u"h5": -5, u"h6": -6, u"header": -1}
+rank = {u"h1": -1, u"h2": -2, u"h3": -3, u"h4": -4, u"h5": -5, u"h6": -6,
+        u"header": -1}
 
 class section(list):
-	"""Represents the section of a document."""
-	
-	header = None
-	
-	def __repr__(self):
-		return "<section %s>" % (repr(self.header))
+    """Represents the section of a document."""
 
-	def append(self, child):
-		list.append(self, child)
-		child.parent = self
-	
-	def extend(self, children):
-		list.extend(self, children)
-		for child in children:
-			child.parent = self
+    header = None
+
+    def __repr__(self):
+        return "<section %s>" % (repr(self.header))
+
+    def append(self, child):
+        list.append(self, child)
+        child.parent = self
+
+    def extend(self, children):
+        list.extend(self, children)
+        for child in children:
+            child.parent = self
 
 class Outliner:
-	"""Build the outline of an HTML document."""
-	
-	def __init__(self, ElementTree, **kwargs):
-		self.ElementTree = ElementTree
-		self.stack = []
-		self.outlines = {}
-		self.current_outlinee = None
-		self.current_section = None
-	
-	def build(self, **kwargs):
-		for action, element in etree.iterwalk(self.ElementTree, events=("start", "end")):
-			# If the top of the stack is an element, and you are exiting that element
-			if action == "end" and self.stack and self.stack[-1] == element:
-				# Note: The element being exited is a heading content element.
-				assert element.tag in utils.heading_content
-				# Pop that element from the stack.
-				self.stack.pop()
-			
-			# If the top of the stack is a heading content element
-			elif self.stack and self.stack[-1].tag in utils.heading_content:
-				# Do nothing.
-				pass
-			
-			# When entering a sectioning content element or a sectioning root element
-			elif action == "start" and (element.tag in utils.sectioning_content or element.tag in utils.sectioning_root):
-				# If current outlinee is not null, push current outlinee onto the stack.
-				if self.current_outlinee is not None:
-					self.stack.append(self.current_outlinee)
-				# Let current outlinee be the element that is being entered.
-				self.current_outlinee = element
-				# Let current section be a newly created section for the current outlinee element.
-				self.current_section = section()
-				# Let there be a new outline for the new current outlinee, initialized with just the new current section as the only section in the outline.
-				self.outlines[self.current_outlinee] = [self.current_section]
-				
-			# When exiting a sectioning content element, if the stack is not empty
-			elif action == "end" and element.tag in utils.sectioning_content and self.stack:
-				# Pop the top element from the stack, and let the current outlinee be that element.
-				self.current_outlinee = self.stack.pop()
-				# Let current section be the last section in the outline of the current outlinee element.
-				self.current_section = self.outlines[self.current_outlinee][-1]
-				# Append the outline of the sectioning content element being exited to the current section. (This does not change which section is the last section in the outline.)
-				self.current_section += self.outlines[element]
-				
-			# When exiting a sectioning root element, if the stack is not empty
-			elif action == "end" and element.tag in utils.sectioning_root and self.stack:
-				# Pop the top element from the stack, and let the current outlinee be that element.
-				self.current_outlinee = self.stack.pop()
-				# Let current section be the last section in the outline of the current outlinee element.
-				self.current_section = self.outlines[self.current_outlinee][-1]
-				# Loop: If current section has no child sections, stop these steps.
-				while self.current_section:
-					# Let current section be the last child section of the current current section.
-					assert self.current_section != self.current_section[-1]
-					self.current_section = self.current_section[-1]
-					# Go back to the substep labeled Loop.
-					
-			# When exiting a sectioning content element or a sectioning root element
-			elif action == "end" and (element.tag in utils.sectioning_content or element.tag in utils.sectioning_root):
-				# Note: The current outlinee is the element being exited.
-				assert self.current_outlinee == element
-				# Let current section be the first section in the outline of the current outlinee element.
-				self.current_section = self.outlines[self.current_outlinee][0]
-				# Skip to the next step in the overall set of steps. (The walk is over.)
-				break
-				
-			# If the current outlinee is null.
-			elif self.current_outlinee is None:
-				# Do nothing.
-				pass
-			
-			# When entering a heading content element
-			elif action == "start" and element.tag in utils.heading_content:
-				# If the current section has no heading, let the element being entered be the heading for the current section.
-				if self.current_section.header is None:
-					self.current_section.header = element
-				
-				# Otherwise, if the element being entered has a rank equal to or greater than the heading of the last section of the outline of the current outlinee, then create a new section and append it to the outline of the current outlinee element, so that this new section is the new last section of that outline. Let current section be that new section. Let the element being entered be the new heading for the current section.
-				elif rank[element.tag] >= rank[self.outlines[self.current_outlinee][-1].header.tag]:
-					self.current_section = section()
-					self.outlines[self.current_outlinee].append(self.current_section)
-					self.current_section.header = element
-				
-				# Otherwise, run these substeps:
-				else:
-					# Let candidate section be current section.
-					candidate_section = self.current_section
-					while True:
-						# If the element being entered has a rank lower than the rank of the heading of the candidate section, then create a new section, and append it to candidate section. (This does not change which section is the last section in the outline.) Let current section be this new section. Let the element being entered be the new heading for the current section. Abort these substeps.
-						if rank[element.tag] < rank[candidate_section.header.tag]:
-							self.current_section = section()
-							candidate_section.append(self.current_section)
-							self.current_section.header = element
-							break
-						# Let new candidate section be the section that contains candidate section in the outline of current outlinee.
-						# Let candidate section be new candidate section.
-						candidate_section = candidate_section.parent
-						# Return to step 2.
-				# Push the element being entered onto the stack. (This causes the algorithm to skip any descendants of the element.)
-				self.stack.append(element)
-		
-		# If the current outlinee is null, then there was no sectioning content element or sectioning root element in the DOM. There is no outline.
-		try:
-			return self.outlines[self.current_outlinee]
-		except KeyError:
-			return None
+    """Build the outline of an HTML document."""
+
+    def __init__(self, ElementTree, **kwargs):
+        self.ElementTree = ElementTree
+        self.stack = []
+        self.outlines = {}
+        self.current_outlinee = None
+        self.current_section = None
+
+    def build(self, **kwargs):
+        for action, element in etree.iterwalk(self.ElementTree,
+                                              events=("start", "end")):
+            # If the top of the stack is an element, and you are exiting that
+            # element
+            if action == "end" and self.stack and self.stack[-1] == element:
+                # Note: The element being exited is a heading content element.
+                assert element.tag in utils.heading_content
+                # Pop that element from the stack.
+                self.stack.pop()
+
+            # If the top of the stack is a heading content element
+            elif self.stack and self.stack[-1].tag in utils.heading_content:
+                # Do nothing.
+                pass
+
+            # When entering a sectioning content element or a sectioning root
+            # element
+            elif action == "start" and \
+                 (element.tag in utils.sectioning_content or \
+                  element.tag in utils.sectioning_root):
+                # If current outlinee is not null, push current outlinee onto
+                # the stack.
+                if self.current_outlinee is not None:
+                    self.stack.append(self.current_outlinee)
+                # Let current outlinee be the element that is being entered.
+                self.current_outlinee = element
+                # Let current section be a newly created section for the
+                # current outlinee element.
+                self.current_section = section()
+                # Let there be a new outline for the new current outlinee,
+                # initialized with just the new current section as the only
+                # section in the outline.
+                self.outlines[self.current_outlinee] = [self.current_section]
+
+            # When exiting a sectioning content element, if the stack is not
+            # empty
+            elif action == "end" and \
+                 element.tag in utils.sectioning_content and self.stack:
+                # Pop the top element from the stack, and let the current
+                # outlinee be that element.
+                self.current_outlinee = self.stack.pop()
+                # Let current section be the last section in the outline of the
+                # current outlinee element.
+                self.current_section = self.outlines[self.current_outlinee][-1]
+                # Append the outline of the sectioning content element being
+                # exited to the current section. (This does not change which
+                # section is the last section in the outline.)
+                self.current_section += self.outlines[element]
+
+            # When exiting a sectioning root element, if the stack is not empty
+            elif action == "end" and element.tag in utils.sectioning_root and \
+                 self.stack:
+                # Pop the top element from the stack, and let the current
+                # outlinee be that element.
+                self.current_outlinee = self.stack.pop()
+                # Let current section be the last section in the outline of the
+                # current outlinee element.
+                self.current_section = self.outlines[self.current_outlinee][-1]
+                # Loop: If current section has no child sections, stop these
+                # steps.
+                while self.current_section:
+                    # Let current section be the last child section of the
+                    # current current section.
+                    assert self.current_section != self.current_section[-1]
+                    self.current_section = self.current_section[-1]
+                    # Go back to the substep labeled Loop.
+
+            # When exiting a sectioning content element or a sectioning root
+            # element
+            elif action == "end" and \
+                 (element.tag in utils.sectioning_content or \
+                  element.tag in utils.sectioning_root):
+                # Note: The current outlinee is the element being exited.
+                assert self.current_outlinee == element
+                # Let current section be the first section in the outline of
+                # the current outlinee element.
+                self.current_section = self.outlines[self.current_outlinee][0]
+                # Skip to the next step in the overall set of steps. (The walk
+                # is over.)
+                break
+
+            # If the current outlinee is null.
+            elif self.current_outlinee is None:
+                # Do nothing.
+                pass
+
+            # When entering a heading content element
+            elif action == "start" and element.tag in utils.heading_content:
+                # If the current section has no heading, let the element being
+                # entered be the heading for the current section.
+                if self.current_section.header is None:
+                    self.current_section.header = element
+
+                # Otherwise, if the element being entered has a rank equal to
+                # or greater than the heading of the last section of the
+                # outline of the current outlinee, then create a new section
+                # and append it to the outline of the current outlinee element,
+                # so that this new section is the new last section of that
+                # outline. Let current section be that new section. Let the
+                # element being entered be the new heading for the current
+                # section.
+                elif rank[element.tag] >= \
+                     rank[self.outlines[self.current_outlinee][-1].header.tag]:
+                    self.current_section = section()
+                    self.outlines[self.current_outlinee] \
+                        .append(self.current_section)
+                    self.current_section.header = element
+
+                # Otherwise, run these substeps:
+                else:
+                    # Let candidate section be current section.
+                    candidate_section = self.current_section
+                    while True:
+                        # If the element being entered has a rank lower than
+                        # the rank of the heading of the candidate section,
+                        # then create a new section, and append it to candidate
+                        # section. (This does not change which section is the
+                        # last section in the outline.) Let current section be
+                        # this new section. Let the element being entered be
+                        # the new heading for the current section. Abort these
+                        # substeps.
+                        if rank[element.tag] < rank[candidate_section.header.tag]:
+                            self.current_section = section()
+                            candidate_section.append(self.current_section)
+                            self.current_section.header = element
+                            break
+                        # Let new candidate section be the section that contains candidate section in the outline of current outlinee.
+                        # Let candidate section be new candidate section.
+                        candidate_section = candidate_section.parent
+                        # Return to step 2.
+                # Push the element being entered onto the stack. (This causes the algorithm to skip any descendants of the element.)
+                self.stack.append(element)
+
+        # If the current outlinee is null, then there was no sectioning content element or sectioning root element in the DOM. There is no outline.
+        try:
+            return self.outlines[self.current_outlinee]
+        except KeyError:
+            return None

File anolislib/processes/sub.py

 longstatus = re.compile(r"\[LONGSTATUS[^\]]*\]")
 longstatus_identifier = u"[LONGSTATUS"
 longstatus_map = {
-	u"MO": u"W3C Member-only Draft",
-	u"ED": u"Editor's Draft",
-	u"WD": u"W3C Working Draft",
-	u"CR": u"W3C Candidate Recommendation",
-	u"PR": u"W3C Proposed Recommendation",
-	u"REC": u"W3C Recommendation",
-	u"PER": u"W3C Proposed Edited Recommendation",
-	u"NOTE": u"W3C Working Group Note"
+    u"MO": u"W3C Member-only Draft",
+    u"ED": u"Editor's Draft",
+    u"WD": u"W3C Working Draft",
+    u"CR": u"W3C Candidate Recommendation",
+    u"PR": u"W3C Proposed Recommendation",
+    u"REC": u"W3C Recommendation",
+    u"PER": u"W3C Proposed Edited Recommendation",
+    u"NOTE": u"W3C Working Group Note"
 }
 
 w3c_stylesheet = re.compile(r"http://www\.w3\.org/StyleSheets/TR/W3C-[A-Z]+")
 basic_comment_subs = ()
 
 class sub(object):
-	"""Perform substitutions."""
-	
-	def __init__(self, ElementTree, w3c_compat=False, w3c_compat_substitutions=False, w3c_compat_crazy_substitutions=False, **kwargs):
-		if w3c_compat or w3c_compat_substitutions or w3c_compat_crazy_substitutions:
-			self.w3c_status = self.getW3CStatus(ElementTree, **kwargs)
-		self.stringSubstitutions(ElementTree, w3c_compat, w3c_compat_substitutions, w3c_compat_crazy_substitutions, **kwargs)
-		self.commentSubstitutions(ElementTree, w3c_compat, w3c_compat_substitutions, w3c_compat_crazy_substitutions, **kwargs)
-	
-	def stringSubstitutions(self, ElementTree, w3c_compat=False, w3c_compat_substitutions=False, w3c_compat_crazy_substitutions=False, **kwargs):
-		# Get doc_title from the title element
-		try:
-			doc_title = utils.textContent(ElementTree.getroot().find(u"head").find(u"title"))
-		except (AttributeError, TypeError):
-			doc_title = u""
-		
-		if w3c_compat or w3c_compat_substitutions:
-			# Get the right long status
-			doc_longstatus = longstatus_map[self.w3c_status]
-		
-		if w3c_compat_crazy_substitutions:
-			# Get the right stylesheet
-			doc_w3c_stylesheet = u"http://www.w3.org/StyleSheets/TR/W3C-" + self.w3c_status
-		
-		# Get all the subs we want
-		instance_string_subs = string_subs + ((title, doc_title, title_identifier),)
-		
-		# And even more in compat. mode
-		if w3c_compat or w3c_compat_substitutions:
-			instance_string_subs += ((status, self.w3c_status, status_identifier),
-			                         (longstatus, doc_longstatus, longstatus_identifier))
-		
-		# And more that aren't even enabled by default in compat. mode
-		if w3c_compat_crazy_substitutions:
-			instance_string_subs += ((w3c_stylesheet, doc_w3c_stylesheet, w3c_stylesheet_identifier),)
-		
-		for node in ElementTree.iter():
-			for regex, sub, identifier in instance_string_subs:
-				if node.text is not None and identifier in node.text:
-					node.text = regex.sub(sub, node.text)
-				if node.tail is not None and identifier in node.tail:
-					node.tail = regex.sub(sub, node.tail)
-				for name, value in node.attrib.items():
-					if identifier in value:
-						node.attrib[name] = regex.sub(sub, value)
-	
-	def commentSubstitutions(self, ElementTree, w3c_compat=False, w3c_compat_substitutions=False, w3c_compat_crazy_substitutions=False, **kwargs):
-		# Basic substitutions
-		instance_basic_comment_subs = basic_comment_subs
-		
-		# Add more basic substitutions in compat. mode
-		if w3c_compat or w3c_compat_substitutions:
-			instance_basic_comment_subs += ((logo, logo_sub),
-			                                (copyright, copyright_sub))
-		
-		# Set of nodes to remove
-		to_remove = set()
-		
-		# Link
-		in_link = False
-		for node in ElementTree.iter():
-			if in_link:
-				if node.tag is etree.Comment and node.text.strip(utils.spaceCharacters) == u"end-link":
-					if node.getparent() is not link_parent:
-						raise DifferentParentException, u"begin-link and end-link have different parents"
-					utils.removeInteractiveContentChildren(link)
-					link.set(u"href", utils.textContent(link))
-					in_link = False
-				else:
-					if node.getparent() is link_parent:
-						link.append(deepcopy(node))
-					to_remove.add(node)
-			elif node.tag is etree.Comment and node.text.strip(utils.spaceCharacters) == u"begin-link":
-				link_parent = node.getparent()
-				in_link = True
-				link = etree.Element(u"a")
-				link.text = node.tail
-				node.tail = None
-				node.addnext(link)
-		
-		# Basic substitutions
-		for comment, sub in instance_basic_comment_subs:
-			begin_sub = u"begin-" + comment
-			end_sub = u"end-" + comment
-			in_sub = False
-			for node in ElementTree.iter():
-				if in_sub:
-					if node.tag is etree.Comment and node.text.strip(utils.spaceCharacters) == end_sub:
-						if node.getparent() is not sub_parent:
-							raise DifferentParentException, u"%s and %s have different parents" % begin_sub, end_sub
-						in_sub = False
-					else:
-						to_remove.add(node)
-				elif node.tag is etree.Comment:
-					if node.text.strip(utils.spaceCharacters) == begin_sub:
-						sub_parent = node.getparent()
-						in_sub = True
-						node.tail = None
-						node.addnext(deepcopy(sub))
-					elif node.text.strip(utils.spaceCharacters) == comment:
-						node.addprevious(etree.Comment(begin_sub))
-						node.addprevious(deepcopy(sub))
-						node.addprevious(etree.Comment(end_sub))
-						node.getprevious().tail = node.tail
-						to_remove.add(node)
-		
-		# Remove nodes
-		for node in to_remove:
-			node.getparent().remove(node)
-	
-	def getW3CStatus(self, ElementTree, **kwargs):
-		# Get all text nodes that contain case-insensitively "latest version" with any amount of whitespace inside the phrase, or contain http://www.w3.org/TR/
-		for text in ElementTree.xpath(u"//text()[contains(translate(., 'LATEST', 'latest'), 'latest') and contains(translate(., 'VERSION', 'version'), 'version') or contains(., 'http://www.w3.org/TR/')]"):
-			if latest_version.search(text):
-				return u"ED"
-			elif w3c_tr_url_status.search(text):
-				return w3c_tr_url_status.search(text).group(1)
-		# Didn't find any status, return the default (ED)
-		else:
-			return u"ED"
+    """Perform substitutions."""
+    
+    def __init__(self, ElementTree, w3c_compat=False, w3c_compat_substitutions=False, w3c_compat_crazy_substitutions=False, **kwargs):
+        if w3c_compat or w3c_compat_substitutions or w3c_compat_crazy_substitutions:
+            self.w3c_status = self.getW3CStatus(ElementTree, **kwargs)
+        self.stringSubstitutions(ElementTree, w3c_compat, w3c_compat_substitutions, w3c_compat_crazy_substitutions, **kwargs)
+        self.commentSubstitutions(ElementTree, w3c_compat, w3c_compat_substitutions, w3c_compat_crazy_substitutions, **kwargs)
+    
+    def stringSubstitutions(self, ElementTree, w3c_compat=False, w3c_compat_substitutions=False, w3c_compat_crazy_substitutions=False, **kwargs):
+        # Get doc_title from the title element
+        try:
+            doc_title = utils.textContent(ElementTree.getroot().find(u"head").find(u"title"))
+        except (AttributeError, TypeError):
+            doc_title = u""
+        
+        if w3c_compat or w3c_compat_substitutions:
+            # Get the right long status
+            doc_longstatus = longstatus_map[self.w3c_status]
+        
+        if w3c_compat_crazy_substitutions:
+            # Get the right stylesheet
+            doc_w3c_stylesheet = u"http://www.w3.org/StyleSheets/TR/W3C-" + self.w3c_status
+        
+        # Get all the subs we want
+        instance_string_subs = string_subs + ((title, doc_title, title_identifier),)
+        
+        # And even more in compat. mode
+        if w3c_compat or w3c_compat_substitutions:
+            instance_string_subs += ((status, self.w3c_status, status_identifier),
+                                     (longstatus, doc_longstatus, longstatus_identifier))
+        
+        # And more that aren't even enabled by default in compat. mode
+        if w3c_compat_crazy_substitutions:
+            instance_string_subs += ((w3c_stylesheet, doc_w3c_stylesheet, w3c_stylesheet_identifier),)
+        
+        for node in ElementTree.iter():
+            for regex, sub, identifier in instance_string_subs:
+                if node.text is not None and identifier in node.text:
+                    node.text = regex.sub(sub, node.text)
+                if node.tail is not None and identifier in node.tail:
+                    node.tail = regex.sub(sub, node.tail)
+                for name, value in node.attrib.items():
+                    if identifier in value:
+                        node.attrib[name] = regex.sub(sub, value)
+    
+    def commentSubstitutions(self, ElementTree, w3c_compat=False, w3c_compat_substitutions=False, w3c_compat_crazy_substitutions=False, **kwargs):
+        # Basic substitutions
+        instance_basic_comment_subs = basic_comment_subs
+        
+        # Add more basic substitutions in compat. mode
+        if w3c_compat or w3c_compat_substitutions:
+            instance_basic_comment_subs += ((logo, logo_sub),
+                                            (copyright, copyright_sub))
+        
+        # Set of nodes to remove
+        to_remove = set()
+        
+        # Link
+        in_link = False
+        for node in ElementTree.iter():
+            if in_link:
+                if node.tag is etree.Comment and node.text.strip(utils.spaceCharacters) == u"end-link":
+                    if node.getparent() is not link_parent:
+                        raise DifferentParentException, u"begin-link and end-link have different parents"
+                    utils.removeInteractiveContentChildren(link)
+                    link.set(u"href", utils.textContent(link))
+                    in_link = False
+                else:
+                    if node.getparent() is link_parent:
+                        link.append(deepcopy(node))
+                    to_remove.add(node)
+            elif node.tag is etree.Comment and node.text.strip(utils.spaceCharacters) == u"begin-link":
+                link_parent = node.getparent()
+                in_link = True
+                link = etree.Element(u"a")
+                link.text = node.tail
+                node.tail = None
+                node.addnext(link)
+        
+        # Basic substitutions
+        for comment, sub in instance_basic_comment_subs:
+            begin_sub = u"begin-" + comment
+            end_sub = u"end-" + comment
+            in_sub = False
+            for node in ElementTree.iter():
+                if in_sub:
+                    if node.tag is etree.Comment and node.text.strip(utils.spaceCharacters) == end_sub:
+                        if node.getparent() is not sub_parent:
+                            raise DifferentParentException, u"%s and %s have different parents" % begin_sub, end_sub
+                        in_sub = False
+                    else:
+                        to_remove.add(node)
+                elif node.tag is etree.Comment:
+                    if node.text.strip(utils.spaceCharacters) == begin_sub:
+                        sub_parent = node.getparent()
+                        in_sub = True
+                        node.tail = None
+                        node.addnext(deepcopy(sub))
+                    elif node.text.strip(utils.spaceCharacters) == comment:
+                        node.addprevious(etree.Comment(begin_sub))
+                        node.addprevious(deepcopy(sub))
+                        node.addprevious(etree.Comment(end_sub))
+                        node.getprevious().tail = node.tail
+                        to_remove.add(node)
+        
+        # Remove nodes
+        for node in to_remove:
+            node.getparent().remove(node)
+    
+    def getW3CStatus(self, ElementTree, **kwargs):
+        # Get all text nodes that contain case-insensitively "latest version" with any amount of whitespace inside the phrase, or contain http://www.w3.org/TR/
+        for text in ElementTree.xpath(u"//text()[contains(translate(., 'LATEST', 'latest'), 'latest') and contains(translate(., 'VERSION', 'version'), 'version') or contains(., 'http://www.w3.org/TR/')]"):
+            if latest_version.search(text):
+                return u"ED"
+            elif w3c_tr_url_status.search(text):
+                return w3c_tr_url_status.search(text).group(1)
+        # Didn't find any status, return the default (ED)
+        else:
+            return u"ED"
 
 class DifferentParentException(utils.AnolisException):
-	"""begin-link and end-link do not have the same parent."""
-	pass
+    """A begin-*/end-* comment pair does not have the same parent."""
+    pass

File anolislib/processes/toc.py

 remove_attributes_from_toc = frozenset([u"id",])
 
 class toc(object):
-	"""Build and add TOC."""
-	
-	toc = None
-	
-	def __init__(self, ElementTree, **kwargs):
-		self.toc = etree.Element(u"ol", {u"class": u"toc"})
-		self.buildToc(ElementTree, **kwargs)
-		self.addToc(ElementTree, **kwargs)
-	
-	def buildToc(self, ElementTree, min_depth = 2, max_depth = 6, w3c_compat = False, w3c_compat_class_toc = False, **kwargs):
-		# Build the outline of the document
-		outline_creator = outliner.Outliner(ElementTree, **kwargs)
-		outline = outline_creator.build(**kwargs)
-		
-		# Get a list of all the top level sections, and their depth (0)
-		sections = [(section, 0) for section in reversed(outline)]
-		
-		# Numbering
-		num = []
-		
-		# Set of elements to remove (due to odd behaviour of Element.iter() this has to be done afterwards)
-		to_remove = set()
-		
-		# Loop over all sections in a DFS
-		while sections:
-			# Get the section and depth at the end of list
-			section, depth = sections.pop()
-					
-			# If we have a header, regardless of how deep we are
-			if section.header is not None:
-				# Get the element that represents the section header's text
-				if section.header.tag == u"header":
-					i = 1
-					while i <= 6:
-						section_header_text_element = section.header.find(u"h" + unicode(i))
-						if section_header_text_element is not None:
-							break
-					else:
-						section_header_text_element = None
-				else:
-					section_header_text_element = section.header
-			else:
-				section_header_text_element = None
-			
-			# If we have a section heading text element, regardless of depth
-			if section_header_text_element is not None:
-				# Remove any existing number
-				for element in section_header_text_element.iter(u"span"):
-					if utils.elementHasClass(element, u"secno"):
-						# Preserve the element tail
-						if element.tail is not None:
-							if element.getprevious() is not None:
-								if element.getprevious().tail is None:
-									element.getprevious().tail = element.tail
-								else:
-									element.getprevious().tail += element.tail
-							else:
-								if element.getparent().text is None:
-									element.getparent().text = element.tail
-								else:
-									element.getparent().text += element.tail
-						# Remove the element
-						to_remove.add(element)
-			
-			# Check we're in the valid depth range (min/max_depth are 1 based, depth is 0 based)
-			if depth >= min_depth - 1 and depth <= max_depth - 1:
-				# Calculate the corrected depth (i.e., the actual depth within the numbering/TOC)
-				corrected_depth = depth - min_depth + 1
-				
-				# Numbering:
-				# No children, no sibling, move back to parent's sibling
-				if corrected_depth + 1 < len(num):
-					del num[corrected_depth + 1:]
-				# Children
-				elif corrected_depth == len(num):
-					num.append(0)
-				
-				# Increment the current section's number
-				if section_header_text_element is not None and not utils.elementHasClass(section_header_text_element, u"no-num") or section_header_text_element is None and section:
-					num[-1] += 1
-				
-				# Get the current TOC section for this depth, and add another item to it
-				if section_header_text_element is not None and not utils.elementHasClass(section_header_text_element, u"no-toc") or section_header_text_element is None and section:
-					# Find the appropriate section of the TOC 
-					i = 0
-					toc_section = self.toc
-					while i < corrected_depth:
-						try:
-							# If the final li has no children, or the last children isn't an ol element
-							if len(toc_section[-1]) == 0 or toc_section[-1][-1].tag != u"ol":
-								toc_section[-1].append(etree.Element(u"ol"))
-								self.indentNode(toc_section[-1][-1], (i + 1) * 2, **kwargs)
-								if w3c_compat or w3c_compat_class_toc:
-									toc_section[-1][-1].set(u"class", u"toc")
-						except IndexError:
-							# If the current ol has no li in it
-							toc_section.append(etree.Element(u"li"))
-							self.indentNode(toc_section[0], (i + 1) * 2 - 1, **kwargs)
-							toc_section[0].append(etree.Element(u"ol"))
-							self.indentNode(toc_section[0][0], (i + 1) * 2, **kwargs)
-							if w3c_compat or w3c_compat_class_toc:
-								toc_section[0][0].set(u"class", u"toc")
-						# TOC Section is now the final child (ol) of the final item (li) in the previous section
-						assert toc_section[-1].tag == u"li"
-						assert toc_section[-1][-1].tag == u"ol"
-						toc_section = toc_section[-1][-1]
-						i += 1
-					# Add the current item to the TOC
-					item = etree.Element(u"li")
-					toc_section.append(item)
-					self.indentNode(item, (i + 1) * 2 - 1, **kwargs)
-					
-				# If we have a header
-				if section_header_text_element is not None:
-					# Remove all the elements in the list of nodes to remove (so that the removal of existing numbers doesn't lead to crazy IDs)
-					for element in to_remove:
-						element.getparent().remove(element)
-					to_remove = set()
-					
-					# Add ID to header
-					id = utils.generateID(section_header_text_element, **kwargs)
-					if section_header_text_element.get(u"id") is not None:
-						del section_header_text_element.attrib[u"id"]
-					section.header.set(u"id", id)
-					
-					# Add number, if @class doesn't contain no-num
-					if not utils.elementHasClass(section_header_text_element, u"no-num"):
-						section_header_text_element[0:0] = [etree.Element(u"span", {u"class": u"secno"})]
-						section_header_text_element[0].tail = section_header_text_element.text
-						section_header_text_element.text = None
-						section_header_text_element[0].text = u".".join(map(unicode, num))
-						section_header_text_element[0].text += u" "
-					# Add to TOC, if @class doesn't contain no-toc
-					if not utils.elementHasClass(section_header_text_element, u"no-toc"):
-						link = deepcopy(section_header_text_element)
-						item.append(link)
-						# Make it link to the header
-						link.tag = u"a"
-						link.set(u"href", u"#" + id)
-						# Remove interactive content child elements
-						utils.removeInteractiveContentChildren(link)
-						# Remove other child elements
-						for element_name in remove_elements_from_toc:
-							# Iterate over all the desendants of the new link with that element name
-							for element in link.iterdescendants(element_name):
-								# Copy content, to prepare for the node being removed
-								utils.copyContentForRemoval(element)
-								# Add the element of the list of elements to remove
-								to_remove.add(element)
-						# Remove unwanted attributes
-						for element in link.iter(tag=etree.Element):
-							for attribute_name in remove_attributes_from_toc:
-								if element.get(attribute_name) is not None:
-									del element.attrib[attribute_name]
-						# We don't want the old tail (or any tail, for that matter)
-						link.tail = None
-			# Add subsections in reverse order (so the next one is executed next) with a higher depth value
-			sections.extend([(child_section, depth + 1) for child_section in reversed(section)])
-		# Remove all the elements in the list of nodes to remove
-		for element in to_remove:
-			element.getparent().remove(element)
-	
-	def addToc(self, ElementTree, **kwargs):
-		to_remove = set()
-		in_toc = False
-		for node in ElementTree.iter():
-			if in_toc:
-				if node.tag is etree.Comment and node.text.strip(utils.spaceCharacters) == u"end-toc":
-					if node.getparent() is not toc_parent:
-						raise DifferentParentException, u"begin-toc and end-toc have different parents"
-					in_toc = False
-				else:
-					to_remove.add(node)
-			elif node.tag is etree.Comment:
-				if node.text.strip(utils.spaceCharacters) == u"begin-toc":
-					toc_parent = node.getparent()
-					in_toc = True
-					node.tail = None
-					node.addnext(deepcopy(self.toc))
-					self.indentNode(node.getnext(), 0, **kwargs)
-				elif node.text.strip(utils.spaceCharacters) == u"toc":
-					node.addprevious(etree.Comment(u"begin-toc"))
-					self.indentNode(node.getprevious(), 0, **kwargs)
-					node.addprevious(deepcopy(self.toc))
-					self.indentNode(node.getprevious(), 0, **kwargs)
-					node.addprevious(etree.Comment(u"end-toc"))
-					self.indentNode(node.getprevious(), 0, **kwargs)
-					node.getprevious().tail = node.tail
-					to_remove.add(node)
-		for node in to_remove:
-			node.getparent().remove(node)
-	
-	def indentNode(self, node, indent=0, newline_char=u"\n", indent_char=u"\t", **kwargs):
-		whitespace = newline_char + indent_char * indent
-		if node.getprevious() is not None:
-			if node.getprevious().tail is None:
-				node.getprevious().tail = whitespace
-			else:
-				node.getprevious().tail += whitespace
-		else:
-			if node.getparent().text is None:
-				node.getparent().text = whitespace
-			else:
-				node.getparent().text += whitespace
+    """Build and add TOC."""
+    
+    toc = None
+    
+    def __init__(self, ElementTree, **kwargs):
+        self.toc = etree.Element(u"ol", {u"class": u"toc"})
+        self.buildToc(ElementTree, **kwargs)
+        self.addToc(ElementTree, **kwargs)
+    
+    def buildToc(self, ElementTree, min_depth = 2, max_depth = 6, w3c_compat = False, w3c_compat_class_toc = False, **kwargs):
+        # Build the outline of the document
+        outline_creator = outliner.Outliner(ElementTree, **kwargs)
+        outline = outline_creator.build(**kwargs)
+        
+        # Get a list of all the top level sections, and their depth (0)
+        sections = [(section, 0) for section in reversed(outline)]
+        
+        # Numbering
+        num = []
+        
+        # Set of elements to remove (due to odd behaviour of Element.iter() this has to be done afterwards)
+        to_remove = set()
+        
+        # Loop over all sections in a DFS
+        while sections:
+            # Get the section and depth at the end of list
+            section, depth = sections.pop()
+                    
+            # If we have a header, regardless of how deep we are
+            if section.header is not None:
+                # Get the element that represents the section header's text
+                if section.header.tag == u"header":
+                    i = 1
+                    while i <= 6:
+                        section_header_text_element = section.header.find(u"h" + unicode(i))
+                        if section_header_text_element is not None:
+                            break
+                    else:
+                        section_header_text_element = None
+                else:
+                    section_header_text_element = section.header
+            else:
+                section_header_text_element = None
+            
+            # If we have a section heading text element, regardless of depth
+            if section_header_text_element is not None:
+                # Remove any existing number
+                for element in section_header_text_element.iter(u"span"):
+                    if utils.elementHasClass(element, u"secno"):
+                        # Preserve the element tail
+                        if element.tail is not None:
+                            if element.getprevious() is not None:
+                                if element.getprevious().tail is None:
+                                    element.getprevious().tail = element.tail
+                                else:
+                                    element.getprevious().tail += element.tail
+                            else:
+                                if element.getparent().text is None:
+                                    element.getparent().text = element.tail
+                                else:
+                                    element.getparent().text += element.tail
+                        # Remove the element
+                        to_remove.add(element)
+            
+            # Check we're in the valid depth range (min/max_depth are 1 based, depth is 0 based)
+            if depth >= min_depth - 1 and depth <= max_depth - 1:
+                # Calculate the corrected depth (i.e., the actual depth within the numbering/TOC)
+                corrected_depth = depth - min_depth + 1
+                
+                # Numbering:
+                # No children, no sibling, move back to parent's sibling
+                if corrected_depth + 1 < len(num):
+                    del num[corrected_depth + 1:]
+                # Children
+                elif corrected_depth == len(num):
+                    num.append(0)
+                
+                # Increment the current section's number
+                if section_header_text_element is not None and not utils.elementHasClass(section_header_text_element, u"no-num") or section_header_text_element is None and section:
+                    num[-1] += 1
+                
+                # Get the current TOC section for this depth, and add another item to it
+                if section_header_text_element is not None and not utils.elementHasClass(section_header_text_element, u"no-toc") or section_header_text_element is None and section:
+                    # Find the appropriate section of the TOC 
+                    i = 0
+                    toc_section = self.toc
+                    while i < corrected_depth:
+                        try:
+                            # If the final li has no children, or the last children isn't an ol element
+                            if len(toc_section[-1]) == 0 or toc_section[-1][-1].tag != u"ol":
+                                toc_section[-1].append(etree.Element(u"ol"))
+                                self.indentNode(toc_section[-1][-1], (i + 1) * 2, **kwargs)
+                                if w3c_compat or w3c_compat_class_toc:
+                                    toc_section[-1][-1].set(u"class", u"toc")
+                        except IndexError:
+                            # If the current ol has no li in it
+                            toc_section.append(etree.Element(u"li"))
+                            self.indentNode(toc_section[0], (i + 1) * 2 - 1, **kwargs)
+                            toc_section[0].append(etree.Element(u"ol"))
+                            self.indentNode(toc_section[0][0], (i + 1) * 2, **kwargs)
+                            if w3c_compat or w3c_compat_class_toc:
+                                toc_section[0][0].set(u"class", u"toc")
+                        # TOC Section is now the final child (ol) of the final item (li) in the previous section
+                        assert toc_section[-1].tag == u"li"
+                        assert toc_section[-1][-1].tag == u"ol"
+                        toc_section = toc_section[-1][-1]
+                        i += 1
+                    # Add the current item to the TOC
+                    item = etree.Element(u"li")
+                    toc_section.append(item)
+                    self.indentNode(item, (i + 1) * 2 - 1, **kwargs)
+                    
+                # If we have a header
+                if section_header_text_element is not None:
+                    # Remove all the elements in the list of nodes to remove (so that the removal of existing numbers doesn't lead to crazy IDs)
+                    for element in to_remove:
+                        element.getparent().remove(element)
+                    to_remove = set()
+                    
+                    # Add ID to header
+                    id = utils.generateID(section_header_text_element, **kwargs)
+                    if section_header_text_element.get(u"id") is not None:
+                        del section_header_text_element.attrib[u"id"]
+                    section.header.set(u"id", id)
+                    
+                    # Add number, if @class doesn't contain no-num
+                    if not utils.elementHasClass(section_header_text_element, u"no-num"):
+                        section_header_text_element[0:0] = [etree.Element(u"span", {u"class": u"secno"})]
+                        section_header_text_element[0].tail = section_header_text_element.text
+                        section_header_text_element.text = None
+                        section_header_text_element[0].text = u".".join(map(unicode, num))
+                        section_header_text_element[0].text += u" "
+                    # Add to TOC, if @class doesn't contain no-toc
+                    if not utils.elementHasClass(section_header_text_element, u"no-toc"):
+                        link = deepcopy(section_header_text_element)
+                        item.append(link)
+                        # Make it link to the header
+                        link.tag = u"a"
+                        link.set(u"href", u"#" + id)
+                        # Remove interactive content child elements
+                        utils.removeInteractiveContentChildren(link)
+                        # Remove other child elements
+                        for element_name in remove_elements_from_toc:
+                            # Iterate over all the desendants of the new link with that element name
+                            for element in link.iterdescendants(element_name):
+                                # Copy content, to prepare for the node being removed
+                                utils.copyContentForRemoval(element)
+                                # Add the element of the list of elements to remove
+                                to_remove.add(element)
+                        # Remove unwanted attributes
+                        for element in link.iter(tag=etree.Element):
+                            for attribute_name in remove_attributes_from_toc:
+                                if element.get(attribute_name) is not None:
+                                    del element.attrib[attribute_name]
+                        # We don't want the old tail (or any tail, for that matter)
+                        link.tail = None
+            # Add subsections in reverse order (so the next one is executed next) with a higher depth value
+            sections.extend([(child_section, depth + 1) for child_section in reversed(section)])
+        # Remove all the elements in the list of nodes to remove
+        for element in to_remove:
+            element.getparent().remove(element)
+    
+    def addToc(self, ElementTree, **kwargs):
+        """Insert a copy of self.toc at each TOC marker comment in the tree.
+
+        A begin-toc comment gets a copy of the TOC inserted right after it
+        and every node up to the matching end-toc comment is removed.  A
+        lone toc comment is replaced by begin-toc, the TOC and end-toc.
+
+        Raises DifferentParentException if begin-toc and end-toc do not
+        have the same parent.
+        """
+        # Removal is deferred until the iteration over the tree is done
+        to_remove = set()
+        in_toc = False
+        for node in ElementTree.iter():
+            if in_toc:
+                if node.tag is etree.Comment and node.text.strip(utils.spaceCharacters) == u"end-toc":
+                    if node.getparent() is not toc_parent:
+                        raise DifferentParentException, u"begin-toc and end-toc have different parents"
+                    in_toc = False
+                else:
+                    # Old content between begin-toc and end-toc is dropped
+                    to_remove.add(node)
+            elif node.tag is etree.Comment:
+                if node.text.strip(utils.spaceCharacters) == u"begin-toc":
+                    toc_parent = node.getparent()
+                    in_toc = True
+                    node.tail = None
+                    node.addnext(deepcopy(self.toc))
+                    self.indentNode(node.getnext(), 0, **kwargs)
+                elif node.text.strip(utils.spaceCharacters) == u"toc":
+                    # Expand the short form into begin-toc / TOC / end-toc,
+                    # each placed on its own line via indentNode
+                    node.addprevious(etree.Comment(u"begin-toc"))
+                    self.indentNode(node.getprevious(), 0, **kwargs)
+                    node.addprevious(deepcopy(self.toc))
+                    self.indentNode(node.getprevious(), 0, **kwargs)
+                    node.addprevious(etree.Comment(u"end-toc"))
+                    self.indentNode(node.getprevious(), 0, **kwargs)
+                    # The original comment's tail moves onto end-toc
+                    node.getprevious().tail = node.tail
+                    to_remove.add(node)
+        for node in to_remove:
+            node.getparent().remove(node)
+    
+    def indentNode(self, node, indent=0, newline_char=u"\n", indent_char=u"\t", **kwargs):
+        whitespace = newline_char + indent_char * indent
+        if node.getprevious() is not None:
+            if node.getprevious().tail is None:
+                node.getprevious().tail = whitespace
+            else:
+                node.getprevious().tail += whitespace
+        else:
+            if node.getparent().text is None:
+                node.getparent().text = whitespace
+            else:
+                node.getparent().text += whitespace
 
 class DifferentParentException(utils.AnolisException):
-	"""begin-toc and end-toc do not have the same parent."""
-	pass
+    """Raised when begin-toc and end-toc do not have the same parent."""
+    pass

File anolislib/processes/xref.py

 non_alphanumeric_spaces = re.compile(r"[^a-zA-Z0-9 \-]+")
 
 class xref(object):
-	"""Add cross-references."""
-	
-	def __init__(self, ElementTree, **kwargs):
-		self.dfns = {}
-		self.buildReferences(ElementTree, **kwargs)
-		self.addReferences(ElementTree, **kwargs)
-	
-	def buildReferences(self, ElementTree, allow_duplicate_dfns=False, **kwargs):
-		for dfn in ElementTree.iter(u"dfn"):
-			term = self.getTerm(dfn, **kwargs)
-			
-			if len(term) > 0:
-				if not allow_duplicate_dfns and term in self.dfns:
-					raise DuplicateDfnException, u'The term "%s" is defined more than once' % term
-				
-				link_to = dfn
-				
-				for parent_element in dfn.iterancestors(tag=etree.Element):
-					if parent_element.tag in utils.heading_content:
-						link_to = parent_element
-						break
-				
-				id = utils.generateID(link_to, **kwargs)
-				
-				link_to.set(u"id", id)
-				
-				self.dfns[term] = id
-	
-	def addReferences(self, ElementTree, w3c_compat = False, w3c_compat_xref_elements = False, w3c_compat_xref_a_placement = False, **kwargs):
-		for element in ElementTree.iter(tag=etree.Element):
-			if element.tag in instance_elements or (w3c_compat or w3c_compat_xref_elements) and element.tag in w3c_instance_elements:
-				term = self.getTerm(element, w3c_compat=w3c_compat, **kwargs)
-				
-				if term in self.dfns:
-					goodParentingAndChildren = True
-					
-					for parent_element in element.iterancestors(tag=etree.Element):
-						if parent_element.tag in instance_not_in_stack_with or utils.isInteractiveContent(parent_element):
-							goodParentingAndChildren = False
-							break
-					else:
-						for child_element in element.iterdescendants(tag=etree.Element):
-							if child_element.tag in instance_not_in_stack_with or utils.isInteractiveContent(child_element):
-								goodParentingAndChildren = False
-								break
-					
-					if goodParentingAndChildren:
-						if element.tag == u"span":
-							element.tag = u"a"
-							element.set(u"href", u"#" + self.dfns[term])
-						else:
-							link = etree.Element(u"a", {u"href": u"#" + self.dfns[term]})
-							if w3c_compat or w3c_compat_xref_a_placement:
-								for node in element:
-									link.append(node)
-								link.text = element.text
-								element.text = None
-								element.append(link)
-							else:
-								element.addprevious(link)
-								link.append(element)
-								link.tail = link[0].tail
-								link[0].tail = None
-	
-	def getTerm(self, element, w3c_compat = False, w3c_compat_xref_normalization = False, **kwargs):
-		if element.get(u"title") is not None:
-			term = element.get(u"title")
-		else:
-			term = utils.textContent(element)
-		
-		term = term.strip(utils.spaceCharacters).lower()
-		
-		term = utils.spacesRegex.sub(u" ", term)
-		
-		if w3c_compat or w3c_compat_xref_normalization:
-			term = non_alphanumeric_spaces.sub(u"", term)
-		
-		return term
+    """Add cross-references."""
+    
+    def __init__(self, ElementTree, **kwargs):
+        self.dfns = {}
+        self.buildReferences(ElementTree, **kwargs)
+        self.addReferences(ElementTree, **kwargs)
+    
+    def buildReferences(self, ElementTree, allow_duplicate_dfns=False, **kwargs):
+        for dfn in ElementTree.iter(u"dfn"):
+            term = self.getTerm(dfn, **kwargs)
+            
+            if len(term) > 0:
+                if not allow_duplicate_dfns and term in self.dfns:
+                    raise DuplicateDfnException, u'The term "%s" is defined more than once' % term
+                
+                link_to = dfn
+                
+                for parent_element in dfn.iterancestors(tag=etree.Element):
+                    if parent_element.tag in utils.heading_content:
+                        link_to = parent_element
+                        break
+                
+                id = utils.generateID(link_to, **kwargs)
+                
+                link_to.set(u"id", id)
+                
+                self.dfns[term] = id
+    
+    def addReferences(self, ElementTree, w3c_compat = False, w3c_compat_xref_elements = False, w3c_compat_xref_a_placement = False, **kwargs):
+        """Link every instance of a defined term to its definition.
+
+        Elements whose tag is in instance_elements (plus those in
+        w3c_instance_elements when w3c_compat or w3c_compat_xref_elements
+        is true) and whose term is a key of self.dfns are turned into, or
+        wrapped with, an <a href="#id"> link.  Elements that have an
+        ancestor or descendant that is interactive content or is listed
+        in instance_not_in_stack_with are left alone.
+        """
+        for element in ElementTree.iter(tag=etree.Element):
+            if element.tag in instance_elements or (w3c_compat or w3c_compat_xref_elements) and element.tag in w3c_instance_elements:
+                term = self.getTerm(element, w3c_compat=w3c_compat, **kwargs)
+                
+                if term in self.dfns:
+                    goodParentingAndChildren = True
+                    
+                    # Descendants are only checked (in the else clause)
+                    # when no disqualifying ancestor was found
+                    for parent_element in element.iterancestors(tag=etree.Element):
+                        if parent_element.tag in instance_not_in_stack_with or utils.isInteractiveContent(parent_element):
+                            goodParentingAndChildren = False
+                            break
+                    else:
+                        for child_element in element.iterdescendants(tag=etree.Element):
+                            if child_element.tag in instance_not_in_stack_with or utils.isInteractiveContent(child_element):
+                                goodParentingAndChildren = False
+                                break
+                    
+                    if goodParentingAndChildren:
+                        if element.tag == u"span":
+                            # A span simply becomes the link itself
+                            element.tag = u"a"
+                            element.set(u"href", u"#" + self.dfns[term])
+                        else:
+                            link = etree.Element(u"a", {u"href": u"#" + self.dfns[term]})
+                            if w3c_compat or w3c_compat_xref_a_placement:
+                                # Compat placement: the link goes inside the
+                                # element and takes over its content
+                                for node in element:
+                                    link.append(node)
+                                link.text = element.text
+                                element.text = None
+                                element.append(link)
+                            else:
+                                # Default placement: the link wraps the
+                                # element and takes over its tail
+                                element.addprevious(link)
+                                link.append(element)
+                                link.tail = link[0].tail
+                                link[0].tail = None
+    
+    def getTerm(self, element, w3c_compat = False, w3c_compat_xref_normalization = False, **kwargs):
+        if element.get(u"title") is not None:
+            term = element.get(u"title")
+        else:
+            term = utils.textContent(element)
+        
+        term = term.strip(utils.spaceCharacters).lower()
+        
+        term = utils.spacesRegex.sub(u" ", term)
+        
+        if w3c_compat or w3c_compat_xref_normalization:
+            term = non_alphanumeric_spaces.sub(u"", term)
+        
+        return term
 
 class DuplicateDfnException(utils.AnolisException):
-	"""Term already defined."""
-	pass
+    """Raised when a term is defined more than once."""
+    pass

File anolislib/utils.py

 # coding=UTF-8
 # Copyright (c) 2008 Geoffrey Sneddon
-# 
+#
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
 # in the Software without restriction, including without limitation the rights
 # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 # copies of the Software, and to permit persons to whom the Software is
 # furnished to do so, subject to the following conditions:
-# 
+#
 # The above copyright notice and this permission notice shall be included in
 # all copies or substantial portions of the Software.
-# 
+#
 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 spaceCharacters = u"".join(spaceCharacters)
 spacesRegex = re.compile(u"[%s]+" % spaceCharacters)
 
-heading_content = frozenset([u"h1", u"h2", u"h3", u"h4", u"h5", u"h6", u"header"])
-sectioning_content = frozenset([u"body", u"section", u"nav", u"article", u"aside"])
+heading_content = frozenset([u"h1", u"h2", u"h3", u"h4", u"h5", u"h6",
+                             u"header"])
+sectioning_content = frozenset([u"body", u"section", u"nav", u"article",
+                                u"aside"])
 sectioning_root = frozenset([u"blockquote", u"figure", u"td", u"datagrid"])
 
 always_interactive_content = frozenset([u"a", u"bb", u"details", u"datagrid"])
 non_sgml_name = re.compile("[^A-Za-z0-9_:.]+")
 
 if sys.maxunicode == 0xFFFF:
-	# UTF-16 Python
-	non_ifragment = re.compile(u"([\u0000-\u0020\u0022\u0023\u0025\\\u002D\u003C\u003E\u005B-\u005E\u0060\u007B-\u007D\u007F-\u0099\uD800-\uF8FF\uFDD0-\uFDDF\uFFF0-\uFFFF]|\U0001FFFE|\U0001FFFF|\U0002FFFE|\U0002FFFF|\U0003FFFE|\U0003FFFF|\U0004FFFE|\U0004FFFF|\U0005FFFE|\U0005FFFF|\U0006FFFE|\U0006FFFF|\U0007FFFE|\U0007FFFF|\U0008FFFE|\U0008FFFF|\U0009FFFE|\U0009FFFF|\U000AFFFE|\U000AFFFF|\U000BFFFE|\U000BFFFF|\U000CFFFE|\U000CFFFF|\uDB3F[\uDFFE-\uDFFF]|[\uDB40-\uDB43][\uDC00-\uDFFF]|\uDB7F[\uDFFE-\uDFFF]|[\uDB80-\uDBFF][\uDC00-\uDFFF])+")
+    # UTF-16 Python
+    non_ifragment = re.compile(u"([\u0000-\u0020\u0022\u0023\u0025\\\u002D\u003C\u003E\u005B-\u005E\u0060\u007B-\u007D\u007F-\u0099\uD800-\uF8FF\uFDD0-\uFDDF\uFFF0-\uFFFF]|\U0001FFFE|\U0001FFFF|\U0002FFFE|\U0002FFFF|\U0003FFFE|\U0003FFFF|\U0004FFFE|\U0004FFFF|\U0005FFFE|\U0005FFFF|\U0006FFFE|\U0006FFFF|\U0007FFFE|\U0007FFFF|\U0008FFFE|\U0008FFFF|\U0009FFFE|\U0009FFFF|\U000AFFFE|\U000AFFFF|\U000BFFFE|\U000BFFFF|\U000CFFFE|\U000CFFFF|\uDB3F[\uDFFE-\uDFFF]|[\uDB40-\uDB43][\uDC00-\uDFFF]|\uDB7F[\uDFFE-\uDFFF]|[\uDB80-\uDBFF][\uDC00-\uDFFF])+")
 else:
-	# UTF-32 Python
-	non_ifragment = re.compile(u"[^A-Za-z0-9._~!$&'()*+,;=:@/?\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF\U00010000-\U0001FFFD\U00020000-\U0002FFFD\U00030000-\U0003FFFD\U00040000-\U0004FFFD\U00050000-\U0005FFFD\U00060000-\U0006FFFD\U00070000-\U0007FFFD\U00080000-\U0008FFFD\U00090000-\U0009FFFD\U000A0000-\U000AFFFD\U000B0000-\U000BFFFD\U000C0000-\U000CFFFD\U000D0000-\U000DFFFD\U000E1000-\U000EFFFD]+")
+    # UTF-32 Python
+    non_ifragment = re.compile(u"[^A-Za-z0-9._~!$&'()*+,;=:@/?\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF\U00010000-\U0001FFFD\U00020000-\U0002FFFD\U00030000-\U0003FFFD\U00040000-\U0004FFFD\U00050000-\U0005FFFD\U00060000-\U0006FFFD\U00070000-\U0007FFFD\U00080000-\U0008FFFD\U00090000-\U0009FFFD\U000A0000-\U000AFFFD\U000B0000-\U000BFFFD\U000C0000-\U000CFFFD\U000D0000-\U000DFFFD\U000E1000-\U000EFFFD]+")
+
 
 def splitOnSpaces(string):
-	return spacesRegex.split(string)
+    return spacesRegex.split(string)
+
 
 def elementHasClass(Element, class_name):
-	if Element.get(u"class") and class_name in splitOnSpaces(Element.get(u"class")):
-		return True
-	else:
-		return False
+    if Element.get(u"class") and \
+       class_name in splitOnSpaces(Element.get(u"class")):
+        return True
+    else:
+        return False
+
 
 def generateID(Element, force_html4_id=False, **kwargs):
-	if Element.get(u"id") is not None:
-		return Element.get(u"id")
-	elif Element.get(u"title") is not None and Element.get(u"title").strip(spaceCharacters) is not u"":
-		source = Element.get(u"title")
-	else:
-		source = textContent(Element)
-	
-	source = source.strip(spaceCharacters).lower()
-	
-	if source == u"":
-		source = u"generatedID"
-	elif force_html4_id or Element.getroottree().docinfo.public_id in \
-		(u"-//W3C//DTD HTML 4.0//EN",
-		 u"-//W3C//DTD HTML 4.0 Transitional//EN",
-		 u"-//W3C//DTD HTML 4.0 Frameset//EN",
-		 u"-//W3C//DTD HTML 4.01//EN",
-		 u"-//W3C//DTD HTML 4.01 Transitional//EN",
-		 u"-//W3C//DTD HTML 4.01 Frameset//EN",
-		 u"ISO/IEC 15445:2000//DTD HyperText Markup Language//EN",
-		 u"ISO/IEC 15445:2000//DTD HTML//EN",
-		 u"-//W3C//DTD XHTML 1.0 Strict//EN",
-		 u"-//W3C//DTD XHTML 1.0 Transitional//EN",
-		 u"-//W3C//DTD XHTML 1.0 Frameset//EN",
-		 u"-//W3C//DTD XHTML 1.1//EN"):
-		source = non_sgml_name.sub(u"-", source).strip(u"-")
-		try:
-			if not source[0].isalpha():
-				source = u"x" + source
-		except IndexError:
-			source = u"generatedID"
-	else:
-		source = non_ifragment.sub(u"-", source).strip(u"-")
-	
-	# Initally set the id to the source
-	id = source
-	
-	i = 0
-	while getElementById(Element.getroottree().getroot(), id) is not None:
-		id = source + u"-" + unicode(i)
-		i += 1
-	
-	ids[Element.getroottree().getroot()][id] = Element
-	
-	return id
+    if Element.get(u"id") is not None:
+        return Element.get(u"id")
+    elif Element.get(u"title") is not None and \
+         Element.get(u"title").strip(spaceCharacters) is not u"":
+        source = Element.get(u"title")
+    else:
+        source = textContent(Element)
+
+    source = source.strip(spaceCharacters).lower()
+
+    if source == u"":
+        source = u"generatedID"
+    elif force_html4_id or Element.getroottree().docinfo.public_id in \
+        (u"-//W3C//DTD HTML 4.0//EN",
+         u"-//W3C//DTD HTML 4.0 Transitional//EN",
+         u"-//W3C//DTD HTML 4.0 Frameset//EN",
+         u"-//W3C//DTD HTML 4.01//EN",
+         u"-//W3C//DTD HTML 4.01 Transitional//EN",
+         u"-//W3C//DTD HTML 4.01 Frameset//EN",
+         u"ISO/IEC 15445:2000//DTD HyperText Markup Language//EN",
+         u"ISO/IEC 15445:2000//DTD HTML//EN",
+         u"-//W3C//DTD XHTML 1.0 Strict//EN",
+         u"-//W3C//DTD XHTML 1.0 Transitional//EN",
+         u"-//W3C//DTD XHTML 1.0 Frameset//EN",
+         u"-//W3C//DTD XHTML 1.1//EN"):
+        source = non_sgml_name.sub(u"-", source).strip(u"-")
+        try:
+            if not source[0].isalpha():
+                source = u"x" + source
+        except IndexError:
+            source = u"generatedID"
+    else:
+        source = non_ifragment.sub(u"-", source).strip(u"-")
+
+    # Initially set the id to the source
+    id = source
+
+    i = 0
+    while getElementById(Element.getroottree().getroot(), id) is not None:
+        id = source + u"-" + unicode(i)
+        i += 1
+
+    ids[Element.getroottree().getroot()][id] = Element
+
+    return id
+
 
 def textContent(Element):
-	return etree.tostring(Element, encoding=unicode, method='text', with_tail=False)
+    return etree.tostring(Element, encoding=unicode, method='text',
+                          with_tail=False)
+
 
 def getElementById(base, id):
-	if base in ids:
-		try:
-			return ids[base][id]
-		except KeyError:
-			return None
-	else:
-		ids[base] = {}
-		for element in base.iter(tag=etree.Element):
-			if element.get(u"id"):
-				ids[base][element.get(u"id")] = element
-		return getElementById(base, id)
+    if base in ids:
+        try:
+            return ids[base][id]
+        except KeyError:
+            return None
+    else:
+        ids[base] = {}
+        for element in base.iter(tag=etree.Element):
+            if element.get(u"id"):
+                ids[base][element.get(u"id")] = element
+        return getElementById(base, id)
+
 
 def escapeXPathString(string):
-	return u"concat('', '%s')" % string.replace(u"'", u"', \"'\", '")
+    return u"concat('', '%s')" % string.replace(u"'", u"', \"'\", '")
+
 
 def removeInteractiveContentChildren(element):
-	# Set of elements to remove
-	to_remove = set()
-	
-	# Iter over decendants of element
-	for child in element.iterdescendants(etree.Element):
-		if isInteractiveContent(child):
-			# Copy content, to prepare for the node being removed
-			copyContentForRemoval(child)
-			# Add the element of the list of elements to remove
-			to_remove.add(child)
-	
-	# Remove all elements to be removed
-	for element in to_remove:
-		element.getparent().remove(element)
+    # Set of elements to remove
+    to_remove = set()
+
+    # Iterate over descendants of element
+    for child in element.iterdescendants(etree.Element):
+        if isInteractiveContent(child):
+            # Copy content, to prepare for the node being removed
+            copyContentForRemoval(child)
+            # Add the element to the set of elements to remove
+            to_remove.add(child)
+
+    # Remove all elements to be removed
+    for element in to_remove:
+        element.getparent().remove(element)
+
 
 def isInteractiveContent(element):
-	if element.tag in always_interactive_content \
-	or element.tag in media_elements and element.get(u"controls") is not None \
-	or element.tag == u"menu" and element.get(u"type") is not None and element.get(u"type").lower() == u"toolbar":
-		return True
-	else:
-		return False
+    if element.tag in always_interactive_content \
+    or element.tag in media_elements and element.get(u"controls") is not None \
+    or element.tag == u"menu" and element.get(u"type") is not None and \
+       element.get(u"type").lower() == u"toolbar":
+        return True
+    else:
+        return False
+
 
 def copyContentForRemoval(node):
-	# Preserve the text, if it is an element
-	if isinstance(node.tag, basestring) and node.text is not None:
-		if node.getprevious() is not None:
-			if node.getprevious().tail is None:
-				node.getprevious().tail = node.text
-			else:
-				node.getprevious().tail += node.text
-		else:
-			if node.getparent().text is None:
-				node.getparent().text = node.text
-			else:
-				node.getparent().text += node.text
-	# Re-parent all the children of the element we're removing
-	for child in node:
-		node.addprevious(child)
-	# Preserve the element tail
-	if node.tail is not None:
-		if node.getprevious() is not None:
-			if node.getprevious().tail is None:
-				node.getprevious().tail = node.tail
-			else:
-				node.getprevious().tail += node.tail
-		else:
-			if node.getparent().text is None:
-				node.getparent().text = node.tail
-			else:
-				node.getparent().text += node.tail
+    # Preserve the text, if it is an element
+    if isinstance(node.tag, basestring) and node.text is not None:
+        if node.getprevious() is not None:
+            if node.getprevious().tail is None:
+                node.getprevious().tail = node.text
+            else:
+                node.getprevious().tail += node.text
+        else:
+            if node.getparent().text is None:
+                node.getparent().text = node.text
+            else:
+                node.getparent().text += node.text
+    # Re-parent all the children of the element we're removing
+    for child in node:
+        node.addprevious(child)
+    # Preserve the element tail
+    if node.tail is not None:
+        if node.getprevious() is not None:
+            if node.getprevious().tail is None:
+                node.getprevious().tail = node.tail
+            else:
+                node.getprevious().tail += node.tail
+        else:
+            if node.getparent().text is None:
+                node.getparent().text = node.tail
+            else:
+                node.getparent().text += node.tail
+
 
 global reversed
 try:
-	reversed
+    reversed
 except NameError:
-	def reversed(x):
-		if hasattr(x, 'keys'):
-			raise ValueError("mappings do not support reverse iteration")
-		i = len(x)
-		while i > 0:
-			i -= 1
-			yield x[i]
-						
+    def reversed(x):
+        if hasattr(x, 'keys'):
+            raise ValueError("mappings do not support reverse iteration")
+        i = len(x)
+        while i > 0:
+            i -= 1
+            yield x[i]
+
+
 class AnolisException(Exception):
-	"""Generic anolis error."""
-	pass
+    """Generic anolis error."""
+    pass
 # coding=UTF-8
 # Copyright (c) 2008 Geoffrey Sneddon
-# 
+#
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
 # in the Software without restriction, including without limitation the rights
 # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 # copies of the Software, and to permit persons to whom the Software is
 # furnished to do so, subject to the following conditions:
-# 
+#
 # The above copyright notice and this permission notice shall be included in
 # all copies or substantial portions of the Software.
-# 
+#
 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 
 from anolislib import generator
 
+
 def get_files(*args):
-	return glob.glob(os.path.join(*args))
+    return glob.glob(os.path.join(*args))
+
 
 class TestCase(unittest.TestCase):
-	pass
+    pass
+
 
 def buildTestSuite():
-	for file_name in get_files("tests", "basic", "*.src.html"):
-		def testFunc(self, file_name=file_name):
-			try:
-				# Get the input
-				input = open(file_name, "rb")
-				tree = generator.fromFile(input)
-				input.close()
-				
-				# Get the output
-				output = StringIO.StringIO()
-				generator.toFile(tree, output)
-				
-				# Get the expected result
-				expected = open(file_name[:-9] + ".html", "rb")
-				
-				# Run the test
-				self.assertEquals(output.getvalue(), expected.read())
-				
-				# Close the files
-				output.close()
-				expected.close()
-			except IOError, err:
-				self.fail(err)
-		setattr(TestCase, "test_%s" % (file_name), testFunc)
+    for file_name in get_files("tests", "basic", "*.src.html"):
+
+        def testFunc(self, file_name=file_name):
+            try:
+                # Get the input
+                input = open(file_name, "rb")
+                tree = generator.fromFile(input)
+                input.close()
+
+                # Get the output
+                output = StringIO.StringIO()
+                generator.toFile(tree, output)
+
+                # Get the expected result
+                expected = open(file_name[:-9] + ".html", "rb")
+
+                # Run the test
+                self.assertEquals(output.getvalue(), expected.read())
+
+                # Close the files
+                output.close()
+                expected.close()
+            except IOError, err:
+                self.fail(err)
+
+        setattr(TestCase, "test_%s" % (file_name), testFunc)
+
 
 def main():
-	buildTestSuite()
-	unittest.main()
+    buildTestSuite()
+    unittest.main()
 
 if __name__ == "__main__":
-	main()
+    main()
 from distutils.core import setup
 
 setup(name = "anolislib",
-	license="""MIT""",
-	version = "1.0",
-	author = "Geoffrey Sneddon",
-	author_email = "geoffers@gmail.com",
-	packages = ["anolislib", "anolislib/processes"],
-	scripts = ["anolis"],
-	)
+    license="""MIT""",
+    version = "1.0",
+    author = "Geoffrey Sneddon",
+    author_email = "geoffers@gmail.com",
+    packages = ["anolislib", "anolislib/processes"],
+    scripts = ["anolis"],
+    )