Commits

laiso committed fbc5feb

first commit

Comments (0)

Files changed (30)

+syntax: glob
+*.pyc
+*.orig
+*.swp
+*.tmp
+*~
+*.pdf
+.DS_Store
+docs/build
+docs/japanese/build
+kay/lib/jinja2/_speedups.so
+kay/*
+
+syntax: regexp
+(.*/)?\#[^/]*\#$
+(.*/)?\.\#[^/]*$
+kay-framework = https://kay-framework.googlecode.com/hg
+0000000000000000000000000000000000000000 kay-framework
+application: hatena-diary2epub
+version: 1
+runtime: python
+api_version: 1
+
+handlers:
+- url: /favicon.ico
+  static_files: favicon.ico
+  upload: favicon.ico
+  mime_type: image/x-icon
+
+- url: /media
+  static_dir: media
+
+- url: /_generated_media
+  static_dir: _generated_media
+
+- url: /_media
+  static_dir: kay/media
+
+- url: /_kay/.*
+  script: kay/main.py
+  login: admin
+
+- url: /remote_api
+  script: $PYTHON_LIB/google/appengine/ext/remote_api/handler.py
+  login: admin
+
+- url: /_ah/queue/deferred
+  script: kay/main.py
+  login: admin
+
+- url: /_ah/test.*
+  script: kay/ext/testutils/gaeunit.py
+  login: admin
+
+- url: /.*
+  script: kay/main.py
+
+skip_files: |
+  ^(.*/)?(
+  (_backup/.*)|
+  (app\.yaml)|
+  (app\.yml)|
+  (index\.yaml)|
+  (index\.yml)|
+  (#.*#)|
+  (.*~)|
+  (.*\.py[co])|
+  (.*\.po)|
+  (.*\.pot)|
+  (\..*)|
+  (app\.yaml\.sample)|
+  (index\.yaml\.sample)|
+  (cron\.yaml\.sample)|
+  (manage\.py)|
+  (TODO)|
+  (TODO\.pdf)|
+  (README)|
+  (README\.pdf)|
+  (LICENSE)|
+  (gaema-LICENSE)|
+  (kay\/docs\/.*)|
+  (kay\/management\/.*)|
+  (kay\/lib\/babel\/localedata\/.*)|
+  )$
+

hd2epub/__init__.py

+# -*- coding: utf-8 -*-
+# Kay application: hd2epub
+

hd2epub/models.py

+# -*- coding: utf-8 -*-
+# hd2epub.models
+from __future__ import (
+    absolute_import, with_statement
+)
+import os
+import csv
+import logging
+import zipfile
+from StringIO import StringIO
+
+from jinja2 import Template
+
+from google.appengine.ext import db
+
+TEMP_DIR = os.path.dirname(__file__) + '/templates/epub'
+
+class ExportedCSV(db.Model):
+    raw_data = db.TextProperty()
+    zip_data = db.BlobProperty()
+
+    @classmethod
+    def create(cls, stream):
+        txt = stream.read(1024*10000)
+        assert len(txt)
+        txt = unicode(txt, 'cp932')
+        mycsv = ExportedCSV(raw_data=txt)
+        mycsv.put()
+        return mycsv
+
+    def publish(self):
+        assert len(self.raw_data)
+        mycsv = csv.DictReader(
+                StringIO(self.raw_data), 
+                ['date', 'title', 'body', 'comment', 'text'])
+        io = StringIO()
+        myzip = zipfile.ZipFile(io, 
+                    'w', zipfile.ZIP_DEFLATED)
+        page = []
+        for i, line in enumerate(mycsv):
+            if i != 0:
+                page.append(i)
+                logging.debug(line)
+                html = ''
+                html += line.get('date')
+                html += "<br />"
+                html += line.get('title')
+                html += "<br />"
+                html += line.get('body')
+                myzip.writestr('Text/'+str(i)+'.xhtml', html)
+
+        toc = Template(file(TEMP_DIR+'/OEBPS/toc.ncx', 'rb').read())
+        myzip.writestr('OEBPS/toc.ncx', toc.render(pages=page))
+
+        opf = Template(file(TEMP_DIR+'/OEBPS/content.opf', 'rb').read())
+        myzip.writestr('OEBPS/content.opf', opf.render(pages=page))
+
+        myzip.write(TEMP_DIR+'/META-INF/container.xml', 'META-INF/container.xml')
+        myzip.write(TEMP_DIR+'/mimetype', 'mimetype')
+ 
+        self.zip_data = io.read()
+        myzip.close()
+        self.put()
+        return io
+
+
+
+

hd2epub/templates/epub/META-INF/container.xml

+<?xml version="1.0"?>
+<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
+    <rootfiles>
+        <rootfile full-path="OEBPS/content.opf" media-type="application/oebps-package+xml"/>
+   </rootfiles>
+</container>

hd2epub/templates/epub/OEBPS/Text/CheckingYourePubFile.xhtml

+<?xml version="1.0" encoding="utf-8" standalone="no"?>
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"
+  "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
+
+<html xmlns="http://www.w3.org/1999/xhtml">
+<head>
+  <title>Checking Your ePub File</title>
+  <style type="text/css">
+/*<![CDATA[*/
+
+  h1.sgc-1 {text-align: center;}
+
+  p.sgc-3 {font-weight: bold}
+  li.sgc-2 {list-style: none; display: inline}
+  /*]]>*/
+  </style>
+</head>
+
+<body>
+  <h1 class="sgc-1" id="heading_id_2">Checking Your ePub file</h1>
+
+  <p>So you've made a sample ePub book, and it won't open, or it opens with an error, or looks funky. What now?</p>
+
+  <p>epubcheck is a program that will scan your ePub file and display any errors it finds in the book.<br />
+  You can download it <a href="http://code.google.com/p/epubcheck/">here</a></p>
+
+  <p>You can also go to <a href="http://www.threepress.org/document/epub-validate/">threepress's</a> website to have it scanned online.</p>
+  <hr />
+
+  <p>If you have any comments, notice any bugs, or any questions on any of the steps here, please e-mail me at: yoda47 (at) jedisaber (dot) com</p>
+</body>
+</html>

hd2epub/templates/epub/OEBPS/Text/PrepareTheContent.xhtml

+<?xml version="1.0" encoding="utf-8" standalone="no"?>
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"
+  "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
+
+<html xmlns="http://www.w3.org/1999/xhtml">
+<head>
+  <title>Step 1: Prepare the Content</title>
+  <style type="text/css">
+/*<![CDATA[*/
+
+  h1.sgc-1 {text-align: center;}
+  /*]]>*/
+  </style>
+</head>
+
+<body>
+  <h1 class="sgc-1" id="heading_id_2">Step 1: Prepare the Content</h1>
+
+  <p>The first thing to do is prepare your content. Content will typically come in a .txt file or an .html file, if you get it from a public source such as Project Gutenberg. If you are turning your own work into an ePub eBook, it's probably in .doc or another word processor format.<br /></p>
+
+  <p>Content for ePub books needs to be in HTML format. It's out of the scope of this guide to show how to create an HTML file. As a quick overview:</p>
+
+  <p>If your file is in .doc (MS Word) format, save it as HTML, then use a tool mentioned above to clean it up.</p>
+
+  <p>If your file is a text document, add the <a href="http://www.w3.org/TR/xhtml-modularization/">appropriate markup</a> to make it XHTML compliant.</p>
+
+  <p>Another thing to note: If you are creating these files in a text editor, be sure to save in UTF-8 format, not ANSI. (In notepad, this is an option under the "save as" box.)</p>
+</body>
+</html>

hd2epub/templates/epub/OEBPS/Text/PrepareTheXMLFiles.xhtml

+<?xml version="1.0" encoding="utf-8" standalone="no"?>
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"
+  "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
+
+<html xmlns="http://www.w3.org/1999/xhtml">
+<head>
+  <title>Step 2: Prepare the XML files</title>
+  <style type="text/css">
+/*<![CDATA[*/
+
+  h1.sgc-1 {text-align: center;}
+
+  p.sgc-3 {font-weight: bold}
+  li.sgc-2 {list-style: none; display: inline}
+
+
+  /*]]>*/
+  </style>
+</head>
+
+<body>
+  <h1 class="sgc-1" id="heading_id_2">Step 2: Prepare the XML files</h1>
+
+  <p>The XML files are all the other stuff in the ePub book that tells where your content is, and what to do with it.</p>
+
+  <p>Before we start preparing our own eBook, let's look inside a sample file.</p>
+
+  <ul>
+    <li>Download the <a href="books/sample.epub">sample file</a> to your hard drive</li>
+
+    <li>Rename the .epub extension to .zip</li>
+
+    <li>Open the Zip file</li>
+  </ul>
+
+  <p>Great. Now what is all this stuff?</p>
+
+  <p><img alt="" src="../Images/Tutori1.jpg" />&nbsp;<img alt="" src="../Images/Tutori2.jpg" />&nbsp;<img alt="" src="../Images/Tutori3.jpg" /><br />
+  The root of the zip file&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; The Meta-inf folder&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; the OEBPS folder</p>
+
+  <p>A .epub file contains, at a bare minimum, the following files/folders:</p>
+
+  <ul>
+    <li>mimetype - tells a reader/operating system what's in here</li>
+
+    <li>META-INF folder - This folder contains, at minimum, the container.xml file, which tells the reader software where in the zip file to find the book.</li>
+
+    <li>OEBPS folder - Recommended location for the books content. It contains:</li>
+
+    <li class="sgc-2">
+      <ul>
+        <li>images folder - images go here</li>
+
+        <li>Content.opf - XML file that lists what's in the zip file</li>
+
+        <li>toc.ncx - This is the table of Contents</li>
+
+        <li>xhtml files - The book's contents are in these</li>
+
+        <li>page-template.xpgt - This file isn't really needed, but it makes it look pretty in Adobe Digital Editions</li>
+      </ul>
+    </li>
+  </ul>
+
+  <p>Let's look at each of these in more detail.<br />
+  Feel free to extract these files and use them as a template...</p>
+
+  <p>One thing to note before we get started: the filenames are case sensitive.<br />
+  This means that if you have a file named "Chapter1.xhtml" and you refer to it as "chapter1.xhtml" in the .OPF file or .NCX file, the book will not display properly.</p>
+
+  <p><b>mimetype</b><br />
+  This file is just a plain ASCII text file that contains the line:<br />
+  "<b>application/epub+zip</b>"<br />
+  The operating system can look at this file to figure out what a .epub file is instead of using the file extension.<br />
+  This file must be the first file in the zip file, and must not be compressed.</p>
+
+  <p><b>META-INF</b> Folder<br />
+  This contains the container.xml file, which points to the location of the Content.opf file.<br />
+  This folder is the same for every e-book, so you should be able to recycle the whole folder from the sample file without making changes.</p>
+
+  <p><b>OEBPS</b> Folder<br />
+  Notes on the OEBPS folder:<br />
+  This is the folder where the book content is stored. According to the IDPF spec, you don't have to put your book content in here, but it is recommended. I've come across at least two readers that won't read the book properly if the content isn't in this folder. (If you do put your book content somewhere else, make sure that you update container.xml to point to the correct location of the content.opf file.)</p>
+
+  <p>- <b>images</b> Folder<br />
+  If you have any images for your eBook, they go in here.</p>
+
+  <p>Note: most reading systems support a variety of images, but according to the OPF spec, only PNG must be supported by reading system</p>
+
+  <p>- <b>Content.opf</b><br />
+  This file gives a list of all files in the .epub container, defines the order of files, and stores meta data (author, genre, publisher, etc.) information.<br />
+  Note that this file can be named anything you want to call it, as long as the container.xml file mentioned above points to the correct filename.</p>
+
+  <p><img alt="" src="../Images/contentOPF.png" /><br /></p>
+
+  <p>Lots of stuff in this file. I'll go through each required tag here. Check the specs to see more information about optional meta data tags.</p>
+
+  <p>dc:title - Title of the book<br />
+  dc:language - Identifies the language used in the book content. The content has to comply with <a href="http://www.ietf.org/rfc/rfc3066.txt">RFC 3066</a>. <a href="http://www.loc.gov/standards/iso639-2/php/code_list.php">List of language codes</a>. (I'd just copy the language tag from the sample...)<br />
+  dc:identifier - This is the book's unique ID. This has to be a unique identifier for every different e-book. The spec doesn't give any sort of recommendation for what to use, but an ISBN number would be a good bet. I used the name of my web site and the date and time.<br />
+  One thing to note, because of how the file interacts with toc.ncx, just modify what's after the " uuid:" on this line.</p>
+
+  <p>Next comes the manifest. This is just a listing of the files in the .epub container, and their file type.<br />
+  Each item is also assigned an item ID that's used in the spine section of content.opf. This list does not have to be in any particular order. (But you'll be happier if it is. Also, see the section below on the NCX file for more information on the id attribute.)</p>
+
+  <p>The spine section lists the reading order of the contents. The spine doesn't have to list every file in the manifest, just the reading order. For example, if the manifest lists images, they do not have to be listed in the spine, and in fact, can't be. Only content (i.e. the XHTML files) can be listed here.</p>
+
+  <p>- <b>toc.ncx</b><br />
+  This is the table of contents. This file controls what shows up in the left Table of Contents pane in Digital Editions</p>
+
+  <p><img alt="" src="../Images/NCX.png" /><br /></p>
+
+  <p>Things you need to change (if you copy and re-use the sample toc.ncx file):</p>
+
+  <ul>
+    <li>Make sure the uid matches what you have in content.opf</li>
+
+    <li>doctitle: The text inside the text tag is what will show up as the book's title in the reader software</li>
+
+    <li>The navpoint tag.</li>
+  </ul>
+
+  <p class="sgc-3">The navPoint tag</p>
+
+  <p>Each nav point is a chapter listing, the text is the chapter name, and the src is the file it links to.<br />
+  If you copy a navpoint tag set to add chapters, make sure to update the id and playorder values.</p>
+
+  <p>Let's look at our example file to clarify this:</p>
+
+  <p>&lt;navPoint id="chapter01" playOrder="1"&gt;<br />
+  &nbsp;&nbsp; &nbsp; &lt;navLabel&gt;<br />
+  &nbsp;&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;&lt;text&gt;Chapter 1&lt;/text&gt;<br />
+  &nbsp;&nbsp; &nbsp; &lt;/navLabel&gt;<br />
+  &nbsp;&nbsp; &nbsp; &lt;content src="chap01.xhtml" /&gt;<br />
+  &lt;/navPoint&gt;</p>
+
+  <p>&nbsp;&nbsp; &nbsp; &lt;navPoint <b>id="chapter01"</b> playOrder="1"&gt;</p>
+
+  <p>According to the spec, the <b>id</b> can be anything you want, but it's easier to keep track of things if you use the same id you used for that file in the .OPF file. Also, some readers won't properly display the Table of Contents if the ID doesn't match.</p>
+
+  <p>&nbsp;&nbsp; &nbsp; &lt;navPoint&nbsp;id="chapter01"&nbsp;<b>playOrder="1"</b>&gt;<br /></p>
+
+  <p>The&nbsp;<b>playOrder</b>&nbsp;values have to be in order. (An item with playorder 1 will be before an item with playorder 2, etc.) They also have to be listed in order, and can't have any gaps. (You'll get an error if you jump from 1 to 20, etc)<br /></p>
+
+  <p>&nbsp;&nbsp; &nbsp;&nbsp;&lt;text&gt;Chapter 1&lt;/text&gt;</p>
+
+  <p>The stuff you type inside the text tag is what actually shows up in the reading software's table of contents. This can be any text you want.</p>
+
+  <p>&nbsp;&nbsp; &nbsp; &lt;content src="chap01.xhtml" /&gt;<br /></p>
+
+  <p>The content tag links the table of contents item to the XHTML file it points to. If your id tag and text tag both point to chapter 1, and your content tag points to chapter 4, you'll go to chapter 4 when you click the link to chapter 1 in the table of contents.</p>
+
+  <p class="sgc-3">Notes on the toc.ncx:</p>
+
+  <p>You can't format the contents of the toc.ncx. This file is used by the reading software to display the table of contents. Each program will display the contents of toc.ncx differently. If you want to present a formatted table of contents to the reader, you need to make a XHTML file with the contents formatted however you want. (In fact, this is a good idea as there are still some ePub reading programs that don't use toc.ncx.)<br /></p>
+
+  <p><br /></p>
+
+  <p>- <b>page-template.xpgt</b><br />
+  This file isn't part of the IDPF spec, but Adobe Digital Editions uses it for formatting and setting column settings and whatnot. You don't need this file at all, but your book will look nicer in Digital Editions if you include it. Other readers should just ignore it.</p>
+
+  <p>Note: You can use a .css style sheet file to layout styles for your book as well. Just make sure to list it in the manifest section of Content.opf<br />
+  Also of note here, any styling should be done in a CSS stylesheet, and not in the document.</p>
+
+  <p>- <b>Content .xhtml files</b><br />
+  Content files should be XHTML 1.1 documents<br />
+  If you're not familiar with XHTML, it's basically HTML with closing tags for every element, and several style tags are not supported.<br />
+  As far as how to put the content, you can have it all in one document with bookmarks at each chapter, or each chapter in a separate .xhtml file. The latter looks nicer in most readers.</p>
+</body>
+</html>

hd2epub/templates/epub/OEBPS/Text/PutitintheContainer.xhtml

+<?xml version="1.0" encoding="utf-8" standalone="no"?>
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"
+  "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
+
+<html xmlns="http://www.w3.org/1999/xhtml">
+<head>
+  <title>Step 3: Put it in the Container</title>
+  <style type="text/css">
+/*<![CDATA[*/
+
+  h1.sgc-1 {text-align: center;}
+
+  p.sgc-3 {font-weight: bold}
+  li.sgc-2 {list-style: none; display: inline}
+
+  p.sgc-4 {text-align: center;}
+  /*]]>*/
+  </style>
+</head>
+
+<body>
+  <h1 class="sgc-1" id="heading_id_2">Step 3: Put it in the Container</h1>
+
+  <p>Now we make the .epub container that all these files go in.</p>
+
+  <ol>
+    <li>Create an empty .zip file with whatever name you like (See notes below for creating the container in OS X)</li>
+
+    <li>Copy the mimetype file into the zip file (don't use compression on this file)</li>
+
+    <li>Copy the rest of the files and folders mentioned above into the zip file *</li>
+
+    <li>Re-name the .zip extension to .epub</li>
+  </ol>
+
+  <p>* The specification recommends that the books files go in an "OEBPS" folder inside the zip file. If you put them in another spot, be sure that container.xml in the META-INF folder points to the correct location of the *.opf file.</p>
+
+  <p>The zip file layout should look something like this:</p>
+  <pre>
+
+- mimetype
+META-INF
+   - container.xml
+OEBPS
+   images
+   - content.opf
+   - toc.ncx
+   - stylesheet.css
+   - content.xhtml
+</pre>
+
+  <p>You should now be able to open your eBook in Adobe Digital Editions, or any other reader that supports the .epub format.</p>
+
+  <p>One thing to note is that the ePub specification calls the mimetype file to be the first file in the container. If you are using a GUI zip program, this is done by making a blank zip file, then copying just the mimetype file into the zip file, then the rest of the files.<br />
+  If you are using a command line zip utility, then you can either add the mimetype file first, or if your zip program of choice supports adding multiple files/folders at a time, make sure the mimetype file is the first one in the list.</p>
+
+  <p>If you want to cheat, download the file below. It's a zip file that has empty chapter pages, and the content and toc files pre filled out, so all you have to do is copy and paste your content into the empty files, and modify the OPF and NCX files.<br />
+  <a href="books/sample2.zip">Blank Sample file</a></p>
+
+  <p class="sgc-3 sgc-4">Creating the Container in OS X</p>
+
+  <p>The way the built-in zip support in OS X works makes the process a tad different than on Windows or Linux.</p>
+
+  <ol>
+    <li>Prepare your content like normal</li>
+
+    <li>Right-click the folder the files are in (ensure they are in their own folder first) and click "Compress Folder"</li>
+
+    <li>Change the "zip" extension to "epub"</li>
+  </ol>
+
+  <p>The problem with this method is that the mimetype file isn't necessarily added to the zip folder first.<br />
+  A user at the mobileread forums has created a script file that will do the zipping for you, in the right order. You can get it here: <a href="http://www.mobileread.com/forums/showthread.php?t=55681" target="_blank">http://www.mobileread.com/forums/showthread.php?t=55681</a></p>
+
+  <p class="sgc-3">Note:</p>
+
+  <p>I have heard reports of OS X copying metadata files into the container. It's not supposed to do that, but if it does you can go through and delete any folder or file with a period in front of the name.</p>
+</body>
+</html>

hd2epub/templates/epub/OEBPS/Text/aboutepubformat.xhtml

+<?xml version="1.0" encoding="utf-8" standalone="no"?>
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"
+  "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
+
+<html xmlns="http://www.w3.org/1999/xhtml">
+<head>
+  <title>About the ePub Format</title>
+  <style type="text/css">
+/*<![CDATA[*/
+
+  p.sgc-3 {text-align: center;}
+  p.sgc-2 {font-weight: bold; text-align: center}
+  h1.sgc-1 {text-align: center;}
+
+  p.sgc-5 {font-weight: bold}
+  /*]]>*/
+  </style>
+</head>
+
+<body>
+  <h1 class="sgc-1" id="heading_id_2">About the ePub Format</h1>
+
+  <p>&nbsp;</p>
+
+  <p class="sgc-5">Why should I use the ePub format?</p>
+
+  <p>Because it's a completely open and free standard.<br />
+  The .epub format is a standard for eBooks created by the International Digital Publishing Forum. It consists of basic XHTML for the book content, XML for descriptions, and a re-named zip file to hold it all in. Anyone can make these eBooks, and since they're essentially just XHTML, anyone can read them.</p>
+
+  <p class="sgc-5">How can I read ePub books?</p>
+
+  <p>There are a variety of hardware and software devices that can read books in the ePub format. You can read ePub books on anything from a blackberry or iPhone, to a computer, or a dedicated reading device such as the Sony Reader or the Nook. For a review of programs that can read ePub formatted books, go to: <a href="http://www.jedisaber.com/eBooks/Readers.asp">www.jedisaber.com/eBooks/Readers.asp</a></p>
+
+  <p class="sgc-5">What is the ePub format?</p>
+
+  <p>ePub (also known as: EPUB, .epub, or other variations on capitalization) is a free and open standard for eBooks (electronic books) maintained by the International Digital Publishing Forum (IDPF). The ePub file format was designed to be open (anyone can use it or create it) and re-flowable (the text can be re-sized and re-arranged to suit whatever display it's being read on). Ideally, it will catch on as the standard for ebooks.</p>
+
+  <p>For more on the history of the ePub format, see Wikipedia's ePub page, and of course, the <a href="http://www.idpf.org/">IDPF's page</a>.</p>
+
+  <p class="sgc-5">Where to get ePub Books</p>
+
+  <p>Some books in the IDPF .epub format are available at <a href="http://www.jedisaber.com/eBooks/Books.asp">www.jedisaber.com/eBooks/Books.asp</a> (Along with links to other sites to get ePub books from.)</p>
+
+  <p class="sgc-5">Automated Tools and Software Programs for editing ePub files</p>
+
+  <p>There are now several tools for automating the process of creating ePub books. See <a href="http://www.jedisaber.com/eBooks/editors.asp">www.jedisaber.com/eBooks/editors.asp</a> for full reviews of a variety of software programs to create and edit ePub files. The abilities of these programs range from assisting in packaging your prepared content into the ePub format to providing a full WYSIWYG&nbsp;<!--StartFragment-->environment<!--EndFragment-->&nbsp;to create and edit in.</p>
+</body>
+</html>

hd2epub/templates/epub/OEBPS/Text/cover.xhtml

+<?xml version="1.0" encoding="utf-8" standalone="no"?>
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"
+  "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
+
+<html xmlns="http://www.w3.org/1999/xhtml">
+<head>
+  <title>Cover</title>
+  <style type="text/css">
+/*<![CDATA[*/
+
+  p.sgc-3 {text-align: center;}
+  p.sgc-2 {font-weight: bold; text-align: center}
+  h1.sgc-1 {text-align: center;}
+
+  span.sgc-4 {font-weight: normal; font-size: medium;}
+  /*]]>*/
+  </style>
+</head>
+
+<body>
+  <h1 class="sgc-1" id="heading_id_2" title="Cover"><span class="sgc-4"><img align="absmiddle" alt="" src="../Images/epub_logo_color.jpg" />&nbsp;&nbsp; &nbsp;</span>ePub eBooks Tutorial</h1>
+
+  <p class="sgc-2"><br /></p>
+
+  <p class="sgc-2">How to create eBooks in the ePub Format</p>
+
+  <p class="sgc-3"><br /></p>
+
+  <p class="sgc-3"><a href="http://www.jedisaber.com/ebooks/">www.jedisaber.com/ebooks</a></p>
+</body>
+</html>

hd2epub/templates/epub/OEBPS/Text/overview.xhtml

+<?xml version="1.0" encoding="utf-8" standalone="no"?>
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"
+  "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
+
+<html xmlns="http://www.w3.org/1999/xhtml">
+<head>
+  <title>Overview and Required Tools</title>
+  <style type="text/css">
+/*<![CDATA[*/
+
+  h2.sgc-3 {text-align: center}
+  h3.sgc-2 {font-weight: bold}
+  p.sgc-1 {font-weight: bold}
+
+  h1.sgc-4 {text-align: center;}
+
+  p.sgc-6 {font-weight: bold; text-align: center}
+  /*]]>*/
+  </style>
+</head>
+
+<body>
+  <h1 class="sgc-4" id="heading_id_2">Overview and Required Tools</h1>
+
+  <p>If you're interested, I figured out the information in this guide by a combination of reverse engineering the Sherlock Holmes book from Adobe's site, reading through the specs at the IDPF web site, and trial and error until I got a working eBook to load properly in Digital Editions. (I figure it's OK to do that since Holmes is in the public domain now...)</p>
+
+  <p>Since I originally posted this guide, the ePub standard has gone through a few revisions, making the hand made files I first created now throw out a bunch of errors when run through tools like ePubCheck. Since various websites now offer books in the ePub format, I'm not going to go through and update every single book I've made, but I am going to re-do this guide from scratch.</p>
+
+  <p>As I stated above, this page is about making ePub files by hand. When the format first came out, there weren't any tools to make ePub files, so this was the only way to go. Now there are several (see the section above for reviews of some of them) and there really isn't a need to do this the hard way anymore. Nevertheless, I feel it's beneficial to know the guts of something, whether it's to fix a weird error, or to know what that program is really doing. Or you might just want to have a little more control over the process... or you might be really bored. ;)</p>
+
+  <p class="sgc-1">Tools Needed:</p>
+
+  <ul>
+    <li>A text editor. Anything that can edit text files, HTML, and XML. (Example: Notepad)</li>
+
+    <li>A .zip program. Anything that can create .zip files. (Example: Windows XP's built-in .zip support)<br /></li>
+  </ul>
+
+  <p class="sgc-1">Optional Tools:</p>
+
+  <p>You can make ePub files with just the programs that came with your operating system, but here are some suggestions for tools that can make the process easier.</p>
+
+  <p>For Windows:</p>
+
+  <ul>
+    <li><a href="http://www.editplus.com/">Edit Plus</a> (shareware)</li>
+
+    <li><a href="http://notepad-plus.sourceforge.net/uk/site.htm">Notepad++</a></li>
+
+    <li><a href="http://www.info-zip.org/Zip.html">Info-Zip</a><br /></li>
+  </ul>
+
+  <p>For OS X:</p>
+
+  <ul>
+    <li><a href="http://www.barebones.com/products/textwrangler/">Text Wrangler</a></li>
+  </ul>
+
+  <p>Several people have written to tell me that OS X's built in zip support works just fine. Just layout your files, then compress the folder they're in (right-click, then click "Compress"). Others prefer the command line zip program.</p>
+
+  <p class="sgc-1">Tools for cleaning up source documents:</p>
+
+  <p>Below are some tools for cleaning up the HTML/XHTML files often used for sources for ePub books. Cleaner source code will produce a better looking book. Most of the ePub readers right now only support basic tags and do strange and wonderful things when they see a tag they don't recognize. Whether you make an eBook by hand, or use a program to convert a text or HTML document to an ePub file, the cleaner the HTML looks, the nicer the final book will look. The fewer the tags in the source document, the better.</p>
+
+  <ul>
+    <li><a href="http://home.ccil.org/~cowan/XML/tagsoup/">Tag Soup</a> - cleans up HTML tags</li>
+  </ul>
+
+  <p>I had a nice Windows program specifically to clean MS Word crap out of HTML pages... but I can't seem to find what I did with it. I'll keep looking and post a link here if I can find it again...</p>
+
+  <p>&nbsp;</p>
+
+  <p class="sgc-6">The process of making an ePub eBook</p>
+
+  <p>The process of making an ePub book can be broken down into three parts:</p>
+
+  <ol>
+    <li>Prepare the content</li>
+
+    <li>Prepare the XML files</li>
+
+    <li>Put it in the container.</li>
+  </ol>
+
+  <p>First, let's go check out the official specs. Yes, it's very boring and hard to follow, but aren't they all?<br />
+  These will come in handy later on though. After getting the basic structure of the file setup, the official specs are handy to reference for tags that aren't used very often, or if you can't remember what exactly goes in a certain tag.<br />
+  Don't let them scare you though, we really only have to fiddle with two XML files, the rest is either straight XHTML, or files that you can copy from the sample file that we'll be looking at later.</p>
+
+  <p class="sgc-1">IDPF Specs:</p>
+
+  <ul>
+    <li><a href="http://www.idpf.org/doc_library/epub/OPS_2.0.1_draft.htm">(Allowed Mark-up reference for included XHTML files)</a></li>
+
+    <li><a href="http://www.idpf.org/doc_library/epub/OPF_2.0.1_draft.htm">Structure</a></li>
+
+    <li><a href="http://www.idpf.org/ocf/ocf1.0/index.htm">Container</a></li>
+  </ul>
+</body>
+</html>

hd2epub/templates/epub/OEBPS/content.opf

+<?xml version="1.0" encoding="UTF-8"?>
+<package xmlns="http://www.idpf.org/2007/opf" unique-identifier="BookID" version="2.0">
+    <metadata xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:opf="http://www.idpf.org/2007/opf">
+        <dc:identifier id="BookID" opf:scheme="UUID">urn:uuid:7e41160a-1c9a-4beb-b268-3abf106df3e3</dc:identifier>
+        <meta name="Sigil version" content="0.3.1"/>
+    </metadata>
+    <manifest>
+        <item id="ncx" href="toc.ncx" media-type="application/x-dtbncx+xml"/>
+        {% for i in pages %}
+        <item id="{{ i }}.xhtml" href="Text/{{ i }}.xhtml" media-type="application/xhtml+xml"/>
+        {% endfor %}
+    </manifest>
+    <spine toc="ncx">
+        {% for i in pages %}
+        <itemref idref="{{ i }}.xhtml"/>
+        {% endfor %}
+    </spine>
+    <guide>
+        <reference type="cover" title="Cover" href="Text/1.xhtml"/>
+    </guide>
+</package>

hd2epub/templates/epub/OEBPS/toc.ncx

+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE ncx PUBLIC "-//NISO//DTD ncx 2005-1//EN"
+   "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">
+
+<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1">
+    <head>
+        <meta name="dtb:uid" content="7e41160a-1c9a-4beb-b268-3abf106df3e3"/>
+        <meta name="dtb:depth" content="1"/>
+        <meta name="dtb:totalPageCount" content="0"/>
+        <meta name="dtb:maxPageNumber" content="0"/>
+    </head>
+    <docTitle>
+        <text>Unknown</text>
+    </docTitle>
+    <navMap>
+        {% for i in pages %}
+        <navPoint id="navPoint-{{ i }}" playOrder="{{ i }}">
+            <navLabel>
+                <text>Page{{ i }}</text>
+            </navLabel>
+            <content src="Text/{{ i }}.xhtml"/>
+        </navPoint>
+        {% endfor %}
+    </navMap>
+</ncx>

hd2epub/templates/epub/mimetype

+application/epub+zip

hd2epub/templates/index.html

+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html>
+<head>
+<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
+<title>Top Page - hd2epub</title>
+</head>
+<body>
+    <form action="" method="POST" enctype="multipart/form-data">
+    {% if uploaded %}
+        <p>UPLOADED</p>
+    {% endif %}
+    <input type="file" name='csv'>
+    <input type="submit">
+    </form>
+</body>
+</html>

hd2epub/tests/__init__.py

+# -*- coding: utf-8 -*-
+
+from __future__ import (
+    absolute_import, with_statement
+)
+
+import os
+from zipfile import ZipFile
+from unittest import TestCase
+from StringIO import StringIO
+
+from nose.tools import *
+
+from ..models import (
+    ExportedCSV, 
+)
+
+TEST_CSV = os.path.dirname(__file__) + '/data/hoge.csv'
+TEMP_DIR = os.path.dirname(__file__) + '/data/tmp'
+
+class ReadCSVTestCase(TestCase):
+    def testResdCSV(self):
+        with open(TEST_CSV, 'rb') as f:
+            mycsv = ExportedCSV.create(f)
+            ok_(isinstance(mycsv, ExportedCSV))
+            ok_(isinstance(mycsv.raw_data, basestring))
+    
+class ReadEPUBTestCase(TestCase):
+    def testRead(self):
+        with open(TEST_CSV, 'rb') as f:
+            mycsv = ExportedCSV.create(f)
+            myepub = mycsv.publish()
+            ok_(isinstance(myepub, StringIO))
+
+

hd2epub/tests/data/hoge.csv

+date,title,body,comment,text
+2011-02-10,,"
+  <div class=""section"">
+   <h3 class=""title""><a href=""/ANONYMOUS/20110210#1297354397"" name=""1297354397"">�e�X�g</a></h3>
+   <p class=""sectionheader""> <span class=""timestamp"">01:13</span></p>
+   <p>�Ă��Ƃ���[</p>
+  </div>
+",,"*1297354397*�e�X�g
+�Ă��Ƃ���["
+2009-08-26,,"
+  <div class=""section"">
+   <h3 class=""title""><a href=""/ANONYMOUS/20090826#1251296064"" name=""1251296064"">aaaa</a></h3>
+   <p class=""sectionheader""> <span class=""timestamp"">23:14</span></p>
+   <p>Aaawwwww</p>
+  </div>
+",,"*1251296064*aaaa
+Aaawwwww"

hd2epub/tests/data/hoge.html

+date,title,body,comment,text
+2011-02-10,,"
+  <div class=""section"">
+   <h3 class=""title""><a href=""/ANONYMOUS/20110210#1297354397"" name=""1297354397"">�e�X�g</a></h3>
+   <p class=""sectionheader""> <span class=""timestamp"">01:13</span></p>
+   <p>�Ă��Ƃ���[</p>
+  </div>
+",,"*1297354397*�e�X�g
+�Ă��Ƃ���["
+2009-08-26,,"
+  <div class=""section"">
+   <h3 class=""title""><a href=""/ANONYMOUS/20090826#1251296064"" name=""1251296064"">aaaa</a></h3>
+   <p class=""sectionheader""> <span class=""timestamp"">23:14</span></p>
+   <p>Aaawwwww</p>
+  </div>
+",,"*1251296064*aaaa
+Aaawwwww"
Add a comment to this file

hd2epub/tests/data/tmp/aa.zip

Binary file added.

Add a comment to this file

hd2epub/tests/data/tmp/hoge.zip

Binary file added.

+# -*- coding: utf-8 -*-
+# hd2epub.urls
+# 
+
+# The following few lines are an example URL mapping that uses the older interface.
+"""
+from werkzeug.routing import EndpointPrefix, Rule
+
+def make_rules():
+  return [
+    EndpointPrefix('hd2epub/', [
+      Rule('/', endpoint='index'),
+    ]),
+  ]
+
+all_views = {
+  'hd2epub/index': 'hd2epub.views.index',
+}
+"""
+
+from kay.routing import (
+  ViewGroup, Rule
+)
+
+# URL map of the hd2epub application.
+view_groups = [
+  ViewGroup(
+    # Top page (hd2epub.views.index).
+    Rule('/', endpoint='index', view='hd2epub.views.index'),
+    # The same view answers under both a .zip and an .epub URL; the two
+    # rules share the 'file' endpoint and pass the integer id to the view.
+    Rule('/file/<int:id>.zip', endpoint='file', view='hd2epub.views.file'),
+    Rule('/file/<int:id>.epub', endpoint='file', view='hd2epub.views.file'),
+  )
+]
+
+# -*- coding: utf-8 -*-
+"""
+hd2epub.views
+"""
+from __future__ import (
+    absolute_import, with_statement
+)
+
+"""
+import logging
+
+from google.appengine.api import users
+from google.appengine.api import memcache
+
+from werkzeug.exceptions import (
+  NotFound, MethodNotAllowed, BadRequest
+)
+
+from kay.i18n import gettext as _
+from kay.auth.decorators import login_required
+
+"""
+from werkzeug import (
+  unescape, redirect, Response,
+)
+from kay.utils import (
+  render_to_response, reverse,
+  get_by_key_name_or_404, get_by_id_or_404,
+  to_utc, to_local_timezone, url_for, raise_on_dev
+)
+
+from kay.utils import render_to_response
+
+from .models import ExportedCSV
+
+def index(request):
+    uploaded = False
+    if request.method == 'POST':
+        uploaded = _upload(request)
+    return render_to_response('hd2epub/index.html', 
+            dict(uploaded=uploaded, ))
+
+def file(request, id):
+    epub = get_by_id_or_404(ExportedCSV, id)
+    myzip = epub.publish()
+    content_value = myzip.getvalue()
+    assert content_value
+    return Response(content_value, 
+            #mimetype='application/x-zip-compressed', 
+            mimetype='application/octet-stream',
+            content_type='application/octet-stream')
+    
+def _upload(request):
+    mycsv = request.files.get('csv')
+    return ExportedCSV.create(mycsv)
+

lib/BeautifulSoup.py

+"""Beautiful Soup
+Elixir and Tonic
+"The Screen-Scraper's Friend"
+http://www.crummy.com/software/BeautifulSoup/
+
+Beautiful Soup parses a (possibly invalid) XML or HTML document into a
+tree representation. It provides methods and Pythonic idioms that make
+it easy to navigate, search, and modify the tree.
+
+A well-formed XML/HTML document yields a well-formed data
+structure. An ill-formed XML/HTML document yields a correspondingly
+ill-formed data structure. If your document is only locally
+well-formed, you can use this library to find and process the
+well-formed part of it.
+
+Beautiful Soup works with Python 2.2 and up. It has no external
+dependencies, but you'll have more success at converting data to UTF-8
+if you also install these three packages:
+
+* chardet, for auto-detecting character encodings
+  http://chardet.feedparser.org/
+* cjkcodecs and iconv_codec, which add more encodings to the ones supported
+  by stock Python.
+  http://cjkpython.i18n.org/
+
+Beautiful Soup defines classes for two main parsing strategies:
+
+ * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific
+   language that kind of looks like XML.
+
+ * BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid
+   or invalid. This class has web browser-like heuristics for
+   obtaining a sensible parse tree in the face of common HTML errors.
+
+Beautiful Soup also defines a class (UnicodeDammit) for autodetecting
+the encoding of an HTML or XML document, and converting it to
+Unicode. Much of this code is taken from Mark Pilgrim's Universal Feed Parser.
+
+For more than you ever wanted to know about Beautiful Soup, see the
+documentation:
+http://www.crummy.com/software/BeautifulSoup/documentation.html
+
+Here, have some legalese:
+
+Copyright (c) 2004-2010, Leonard Richardson
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+  * Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+
+  * Redistributions in binary form must reproduce the above
+    copyright notice, this list of conditions and the following
+    disclaimer in the documentation and/or other materials provided
+    with the distribution.
+
+  * Neither the name of the the Beautiful Soup Consortium and All
+    Night Kosher Bakery nor the names of its contributors may be
+    used to endorse or promote products derived from this software
+    without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE, DAMMIT.
+
+"""
+from __future__ import generators
+
+__author__ = "Leonard Richardson (leonardr@segfault.org)"
+__version__ = "3.2.0"
+__copyright__ = "Copyright (c) 2004-2010 Leonard Richardson"
+__license__ = "New-style BSD"
+
+from sgmllib import SGMLParser, SGMLParseError
+import codecs
+import markupbase
+import types
+import re
+import sgmllib
+try:
+  from htmlentitydefs import name2codepoint
+except ImportError:
+  name2codepoint = {}
+try:
+    set
+except NameError:
+    from sets import Set as set
+
+#These hacks make Beautiful Soup able to parse XML with namespaces
+sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
+markupbase._declname_match = re.compile(r'[a-zA-Z][-_.:a-zA-Z0-9]*\s*').match
+
+DEFAULT_OUTPUT_ENCODING = "utf-8"
+
+def _match_css_class(str):
+    """Build a RE to match the given CSS class."""
+    return re.compile(r"(^|.*\s)%s($|\s)" % str)
+
+# First, the classes that represent markup elements.
+
+class PageElement(object):
+    """Contains the navigational information for some part of the page
+    (either a tag or a piece of text)"""
+
+    def setup(self, parent=None, previous=None):
+        """Sets up the initial relations between this element and
+        other elements."""
+        self.parent = parent
+        self.previous = previous
+        self.next = None
+        self.previousSibling = None
+        self.nextSibling = None
+        if self.parent and self.parent.contents:
+            self.previousSibling = self.parent.contents[-1]
+            self.previousSibling.nextSibling = self
+
+    def replaceWith(self, replaceWith):
+        oldParent = self.parent
+        myIndex = self.parent.index(self)
+        if hasattr(replaceWith, "parent")\
+                  and replaceWith.parent is self.parent:
+            # We're replacing this element with one of its siblings.
+            index = replaceWith.parent.index(replaceWith)
+            if index and index < myIndex:
+                # Furthermore, it comes before this element. That
+                # means that when we extract it, the index of this
+                # element will change.
+                myIndex = myIndex - 1
+        self.extract()
+        oldParent.insert(myIndex, replaceWith)
+
+    def replaceWithChildren(self):
+        myParent = self.parent
+        myIndex = self.parent.index(self)
+        self.extract()
+        reversedChildren = list(self.contents)
+        reversedChildren.reverse()
+        for child in reversedChildren:
+            myParent.insert(myIndex, child)
+
+    def extract(self):
+        """Destructively rips this element out of the tree."""
+        if self.parent:
+            try:
+                del self.parent.contents[self.parent.index(self)]
+            except ValueError:
+                pass
+
+        #Find the two elements that would be next to each other if
+        #this element (and any children) hadn't been parsed. Connect
+        #the two.
+        lastChild = self._lastRecursiveChild()
+        nextElement = lastChild.next
+
+        if self.previous:
+            self.previous.next = nextElement
+        if nextElement:
+            nextElement.previous = self.previous
+        self.previous = None
+        lastChild.next = None
+
+        self.parent = None
+        if self.previousSibling:
+            self.previousSibling.nextSibling = self.nextSibling
+        if self.nextSibling:
+            self.nextSibling.previousSibling = self.previousSibling
+        self.previousSibling = self.nextSibling = None
+        return self
+
+    def _lastRecursiveChild(self):
+        "Finds the last element beneath this object to be parsed."
+        lastChild = self
+        while hasattr(lastChild, 'contents') and lastChild.contents:
+            lastChild = lastChild.contents[-1]
+        return lastChild
+
+    def insert(self, position, newChild):
+        if isinstance(newChild, basestring) \
+            and not isinstance(newChild, NavigableString):
+            newChild = NavigableString(newChild)
+
+        position =  min(position, len(self.contents))
+        if hasattr(newChild, 'parent') and newChild.parent is not None:
+            # We're 'inserting' an element that's already one
+            # of this object's children.
+            if newChild.parent is self:
+                index = self.index(newChild)
+                if index > position:
+                    # Furthermore we're moving it further down the
+                    # list of this object's children. That means that
+                    # when we extract this element, our target index
+                    # will jump down one.
+                    position = position - 1
+            newChild.extract()
+
+        newChild.parent = self
+        previousChild = None
+        if position == 0:
+            newChild.previousSibling = None
+            newChild.previous = self
+        else:
+            previousChild = self.contents[position-1]
+            newChild.previousSibling = previousChild
+            newChild.previousSibling.nextSibling = newChild
+            newChild.previous = previousChild._lastRecursiveChild()
+        if newChild.previous:
+            newChild.previous.next = newChild
+
+        newChildsLastElement = newChild._lastRecursiveChild()
+
+        if position >= len(self.contents):
+            newChild.nextSibling = None
+
+            parent = self
+            parentsNextSibling = None
+            while not parentsNextSibling:
+                parentsNextSibling = parent.nextSibling
+                parent = parent.parent
+                if not parent: # This is the last element in the document.
+                    break
+            if parentsNextSibling:
+                newChildsLastElement.next = parentsNextSibling
+            else:
+                newChildsLastElement.next = None
+        else:
+            nextChild = self.contents[position]
+            newChild.nextSibling = nextChild
+            if newChild.nextSibling:
+                newChild.nextSibling.previousSibling = newChild
+            newChildsLastElement.next = nextChild
+
+        if newChildsLastElement.next:
+            newChildsLastElement.next.previous = newChildsLastElement
+        self.contents.insert(position, newChild)
+
+    def append(self, tag):
+        """Appends the given tag to the contents of this tag."""
+        self.insert(len(self.contents), tag)
+
+    def findNext(self, name=None, attrs={}, text=None, **kwargs):
+        """Returns the first item that matches the given criteria and
+        appears after this Tag in the document."""
+        return self._findOne(self.findAllNext, name, attrs, text, **kwargs)
+
+    def findAllNext(self, name=None, attrs={}, text=None, limit=None,
+                    **kwargs):
+        """Returns all items that match the given criteria and appear
+        after this Tag in the document."""
+        return self._findAll(name, attrs, text, limit, self.nextGenerator,
+                             **kwargs)
+
+    def findNextSibling(self, name=None, attrs={}, text=None, **kwargs):
+        """Returns the closest sibling to this Tag that matches the
+        given criteria and appears after this Tag in the document."""
+        return self._findOne(self.findNextSiblings, name, attrs, text,
+                             **kwargs)
+
+    def findNextSiblings(self, name=None, attrs={}, text=None, limit=None,
+                         **kwargs):
+        """Returns the siblings of this Tag that match the given
+        criteria and appear after this Tag in the document."""
+        return self._findAll(name, attrs, text, limit,
+                             self.nextSiblingGenerator, **kwargs)
+    fetchNextSiblings = findNextSiblings # Compatibility with pre-3.x
+
+    def findPrevious(self, name=None, attrs={}, text=None, **kwargs):
+        """Returns the first item that matches the given criteria and
+        appears before this Tag in the document."""
+        return self._findOne(self.findAllPrevious, name, attrs, text, **kwargs)
+
+    def findAllPrevious(self, name=None, attrs={}, text=None, limit=None,
+                        **kwargs):
+        """Returns all items that match the given criteria and appear
+        before this Tag in the document."""
+        return self._findAll(name, attrs, text, limit, self.previousGenerator,
+                           **kwargs)
+    fetchPrevious = findAllPrevious # Compatibility with pre-3.x
+
+    def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs):
+        """Returns the closest sibling to this Tag that matches the
+        given criteria and appears before this Tag in the document."""
+        return self._findOne(self.findPreviousSiblings, name, attrs, text,
+                             **kwargs)
+
+    def findPreviousSiblings(self, name=None, attrs={}, text=None,
+                             limit=None, **kwargs):
+        """Returns the siblings of this Tag that match the given
+        criteria and appear before this Tag in the document."""
+        return self._findAll(name, attrs, text, limit,
+                             self.previousSiblingGenerator, **kwargs)
+    fetchPreviousSiblings = findPreviousSiblings # Compatibility with pre-3.x
+
+    def findParent(self, name=None, attrs={}, **kwargs):
+        """Returns the closest parent of this Tag that matches the given
+        criteria."""
+        # NOTE: We can't use _findOne because findParents takes a different
+        # set of arguments.
+        r = None
+        l = self.findParents(name, attrs, 1)
+        if l:
+            r = l[0]
+        return r
+
+    def findParents(self, name=None, attrs={}, limit=None, **kwargs):
+        """Returns the parents of this Tag that match the given
+        criteria."""
+
+        return self._findAll(name, attrs, None, limit, self.parentGenerator,
+                             **kwargs)
+    fetchParents = findParents # Compatibility with pre-3.x
+
+    #These methods do the real heavy lifting.
+
+    def _findOne(self, method, name, attrs, text, **kwargs):
+        r = None
+        l = method(name, attrs, text, 1, **kwargs)
+        if l:
+            r = l[0]
+        return r
+
+    def _findAll(self, name, attrs, text, limit, generator, **kwargs):
+        "Iterates over a generator looking for things that match."
+
+        if isinstance(name, SoupStrainer):
+            strainer = name
+        # (Possibly) special case some findAll*(...) searches
+        elif text is None and not limit and not attrs and not kwargs:
+            # findAll*(True)
+            if name is True:
+                return [element for element in generator()
+                        if isinstance(element, Tag)]
+            # findAll*('tag-name')
+            elif isinstance(name, basestring):
+                return [element for element in generator()
+                        if isinstance(element, Tag) and
+                        element.name == name]
+            else:
+                strainer = SoupStrainer(name, attrs, text, **kwargs)
+        # Build a SoupStrainer
+        else:
+            strainer = SoupStrainer(name, attrs, text, **kwargs)
+        results = ResultSet(strainer)
+        g = generator()
+        while True:
+            try:
+                i = g.next()
+            except StopIteration:
+                break
+            if i:
+                found = strainer.search(i)
+                if found:
+                    results.append(found)
+                    if limit and len(results) >= limit:
+                        break
+        return results
+
+    #These Generators can be used to navigate starting from both
+    #NavigableStrings and Tags.
+    def nextGenerator(self):
+        i = self
+        while i is not None:
+            i = i.next
+            yield i
+
+    def nextSiblingGenerator(self):
+        i = self
+        while i is not None:
+            i = i.nextSibling
+            yield i
+
+    def previousGenerator(self):
+        i = self
+        while i is not None:
+            i = i.previous
+            yield i
+
+    def previousSiblingGenerator(self):
+        i = self
+        while i is not None:
+            i = i.previousSibling
+            yield i
+
+    def parentGenerator(self):
+        i = self
+        while i is not None:
+            i = i.parent
+            yield i
+
+    # Utility methods
+    def substituteEncoding(self, str, encoding=None):
+        encoding = encoding or "utf-8"
+        return str.replace("%SOUP-ENCODING%", encoding)
+
+    def toEncoding(self, s, encoding=None):
+        """Encodes an object to a string in some encoding, or to Unicode.
+        ."""
+        if isinstance(s, unicode):
+            if encoding:
+                s = s.encode(encoding)
+        elif isinstance(s, str):
+            if encoding:
+                s = s.encode(encoding)
+            else:
+                s = unicode(s)
+        else:
+            if encoding:
+                s  = self.toEncoding(str(s), encoding)
+            else:
+                s = unicode(s)
+        return s
+
+class NavigableString(unicode, PageElement):
+
+    def __new__(cls, value):
+        """Create a new NavigableString.
+
+        When unpickling a NavigableString, this method is called with
+        the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be
+        passed in to the superclass's __new__ or the superclass won't know
+        how to handle non-ASCII characters.
+        """
+        if isinstance(value, unicode):
+            return unicode.__new__(cls, value)
+        return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
+
+    def __getnewargs__(self):
+        return (NavigableString.__str__(self),)
+
+    def __getattr__(self, attr):
+        """text.string gives you text. This is for backwards
+        compatibility for Navigable*String, but for CData* it lets you
+        get the string without the CData wrapper."""
+        if attr == 'string':
+            return self
+        else:
+            raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr)
+
+    def __unicode__(self):
+        return str(self).decode(DEFAULT_OUTPUT_ENCODING)
+
+    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
+        if encoding:
+            return self.encode(encoding)
+        else:
+            return self
+
+class CData(NavigableString):
+
+    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
+        return "<![CDATA[%s]]>" % NavigableString.__str__(self, encoding)
+
+class ProcessingInstruction(NavigableString):
+    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
+        output = self
+        if "%SOUP-ENCODING%" in output:
+            output = self.substituteEncoding(output, encoding)
+        return "<?%s?>" % self.toEncoding(output, encoding)
+
+class Comment(NavigableString):
+    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
+        return "<!--%s-->" % NavigableString.__str__(self, encoding)
+
+class Declaration(NavigableString):
+    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
+        return "<!%s>" % NavigableString.__str__(self, encoding)
+
+class Tag(PageElement):
+
+    """Represents a found HTML tag with its attributes and contents."""
+
+    def _invert(h):
+        "Cheap function to invert a hash."
+        i = {}
+        for k,v in h.items():
+            i[v] = k
+        return i
+
+    XML_ENTITIES_TO_SPECIAL_CHARS = { "apos" : "'",
+                                      "quot" : '"',
+                                      "amp" : "&",
+                                      "lt" : "<",
+                                      "gt" : ">" }
+
+    XML_SPECIAL_CHARS_TO_ENTITIES = _invert(XML_ENTITIES_TO_SPECIAL_CHARS)
+
+    def _convertEntities(self, match):
+        """Used in a call to re.sub to replace HTML, XML, and numeric
+        entities with the appropriate Unicode characters. If HTML
+        entities are being converted, any unrecognized entities are
+        escaped."""
+        x = match.group(1)
+        if self.convertHTMLEntities and x in name2codepoint:
+            return unichr(name2codepoint[x])
+        elif x in self.XML_ENTITIES_TO_SPECIAL_CHARS:
+            if self.convertXMLEntities:
+                return self.XML_ENTITIES_TO_SPECIAL_CHARS[x]
+            else:
+                return u'&%s;' % x
+        elif len(x) > 0 and x[0] == '#':
+            # Handle numeric entities
+            if len(x) > 1 and x[1] == 'x':
+                return unichr(int(x[2:], 16))
+            else:
+                return unichr(int(x[1:]))
+
+        elif self.escapeUnrecognizedEntities:
+            return u'&amp;%s;' % x
+        else:
+            return u'&%s;' % x
+
+    def __init__(self, parser, name, attrs=None, parent=None,
+                 previous=None):
+        "Basic constructor."
+
+        # We don't actually store the parser object: that lets extracted
+        # chunks be garbage-collected
+        self.parserClass = parser.__class__
+        self.isSelfClosing = parser.isSelfClosingTag(name)
+        self.name = name
+        if attrs is None:
+            attrs = []
+        elif isinstance(attrs, dict):
+            attrs = attrs.items()
+        self.attrs = attrs
+        self.contents = []
+        self.setup(parent, previous)
+        self.hidden = False
+        self.containsSubstitutions = False
+        self.convertHTMLEntities = parser.convertHTMLEntities
+        self.convertXMLEntities = parser.convertXMLEntities
+        self.escapeUnrecognizedEntities = parser.escapeUnrecognizedEntities
+
+        # Convert any HTML, XML, or numeric entities in the attribute values.
+        convert = lambda(k, val): (k,
+                                   re.sub("&(#\d+|#x[0-9a-fA-F]+|\w+);",
+                                          self._convertEntities,
+                                          val))
+        self.attrs = map(convert, self.attrs)
+
+    def getString(self):
+        if (len(self.contents) == 1
+            and isinstance(self.contents[0], NavigableString)):
+            return self.contents[0]
+
+    def setString(self, string):
+        """Replace the contents of the tag with a string"""
+        self.clear()
+        self.append(string)
+
+    string = property(getString, setString)
+
+    def getText(self, separator=u""):
+        if not len(self.contents):
+            return u""
+        stopNode = self._lastRecursiveChild().next
+        strings = []
+        current = self.contents[0]
+        while current is not stopNode:
+            if isinstance(current, NavigableString):
+                strings.append(current.strip())
+            current = current.next
+        return separator.join(strings)
+
+    text = property(getText)
+
+    def get(self, key, default=None):
+        """Returns the value of the 'key' attribute for the tag, or
+        the value given for 'default' if it doesn't have that
+        attribute."""
+        return self._getAttrMap().get(key, default)
+
+    def clear(self):
+        """Extract all children."""
+        for child in self.contents[:]:
+            child.extract()
+
+    def index(self, element):
+        for i, child in enumerate(self.contents):
+            if child is element:
+                return i
+        raise ValueError("Tag.index: element not in tag")
+
+    def has_key(self, key):
+        return self._getAttrMap().has_key(key)
+
+    def __getitem__(self, key):
+        """tag[key] returns the value of the 'key' attribute for the tag,
+        and throws an exception if it's not there."""
+        return self._getAttrMap()[key]
+
+    def __iter__(self):
+        "Iterating over a tag iterates over its contents."
+        return iter(self.contents)
+
+    def __len__(self):
+        "The length of a tag is the length of its list of contents."
+        return len(self.contents)
+
+    def __contains__(self, x):
+        return x in self.contents
+
+    def __nonzero__(self):
+        "A tag is non-None even if it has no contents."
+        return True
+
+    def __setitem__(self, key, value):
+        """Setting tag[key] sets the value of the 'key' attribute for the
+        tag."""
+        self._getAttrMap()
+        self.attrMap[key] = value
+        found = False
+        for i in range(0, len(self.attrs)):
+            if self.attrs[i][0] == key:
+                self.attrs[i] = (key, value)
+                found = True
+        if not found:
+            self.attrs.append((key, value))
+        self._getAttrMap()[key] = value
+
+    def __delitem__(self, key):
+        "Deleting tag[key] deletes all 'key' attributes for the tag."
+        for item in self.attrs:
+            if item[0] == key:
+                self.attrs.remove(item)
+                #We don't break because bad HTML can define the same
+                #attribute multiple times.
+            self._getAttrMap()
+            if self.attrMap.has_key(key):
+                del self.attrMap[key]
+
+    def __call__(self, *args, **kwargs):
+        """Calling a tag like a function is the same as calling its
+        findAll() method. Eg. tag('a') returns a list of all the A tags
+        found within this tag."""
+        return apply(self.findAll, args, kwargs)
+
+    def __getattr__(self, tag):
+        #print "Getattr %s.%s" % (self.__class__, tag)
+        if len(tag) > 3 and tag.rfind('Tag') == len(tag)-3:
+            return self.find(tag[:-3])
+        elif tag.find('__') != 0:
+            return self.find(tag)
+        raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__, tag)
+
+    def __eq__(self, other):
+        """Returns true iff this tag has the same name, the same attributes,
+        and the same contents (recursively) as the given tag.
+
+        NOTE: right now this will return false if two tags have the
+        same attributes in a different order. Should this be fixed?"""
+        if other is self:
+            return True
+        if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other):
+            return False
+        for i in range(0, len(self.contents)):
+            if self.contents[i] != other.contents[i]:
+                return False
+        return True
+
+    def __ne__(self, other):
+        """Returns true iff this tag is not identical to the other tag,
+        as defined in __eq__."""
+        return not self == other
+
+    def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING):
+        """Renders this tag as a string."""
+        return self.__str__(encoding)
+
+    def __unicode__(self):
+        return self.__str__(None)
+
+    BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
+                                           + "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
+                                           + ")")
+
+    def _sub_entity(self, x):
+        """Used with a regular expression to substitute the
+        appropriate XML entity for an XML special character."""
+        return "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";"
+
+    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING,
+                prettyPrint=False, indentLevel=0):
+        """Returns a string or Unicode representation of this tag and
+        its contents. To get Unicode, pass None for encoding.
+
+        NOTE: since Python's HTML parser consumes whitespace, this
+        method is not certain to reproduce the whitespace present in
+        the original string."""
+
+        encodedName = self.toEncoding(self.name, encoding)
+
+        attrs = []
+        if self.attrs:
+            for key, val in self.attrs:
+                fmt = '%s="%s"'
+                if isinstance(val, basestring):
+                    if self.containsSubstitutions and '%SOUP-ENCODING%' in val:
+                        val = self.substituteEncoding(val, encoding)
+
+                    # The attribute value either:
+                    #
+                    # * Contains no embedded double quotes or single quotes.
+                    #   No problem: we enclose it in double quotes.
+                    # * Contains embedded single quotes. No problem:
+                    #   double quotes work here too.
+                    # * Contains embedded double quotes. No problem:
+                    #   we enclose it in single quotes.
+                    # * Embeds both single _and_ double quotes. This
+                    #   can't happen naturally, but it can happen if
+                    #   you modify an attribute value after parsing
+                    #   the document. Now we have a bit of a
+                    #   problem. We solve it by enclosing the
+                    #   attribute in single quotes, and escaping any
+                    #   embedded single quotes to XML entities.
+                    if '"' in val:
+                        fmt = "%s='%s'"
+                        if "'" in val:
+                            # TODO: replace with apos when
+                            # appropriate.
+                            val = val.replace("'", "&squot;")
+
+                    # Now we're okay w/r/t quotes. But the attribute
+                    # value might also contain angle brackets, or
+                    # ampersands that aren't part of entities. We need
+                    # to escape those to XML entities too.
+                    val = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, val)
+
+                attrs.append(fmt % (self.toEncoding(key, encoding),
+                                    self.toEncoding(val, encoding)))
+        close = ''
+        closeTag = ''
+        if self.isSelfClosing:
+            close = ' /'
+        else:
+            closeTag = '</%s>' % encodedName
+
+        indentTag, indentContents = 0, 0
+        if prettyPrint:
+            indentTag = indentLevel
+            space = (' ' * (indentTag-1))
+            indentContents = indentTag + 1
+        contents = self.renderContents(encoding, prettyPrint, indentContents)
+        if self.hidden:
+            s = contents
+        else:
+            s = []
+            attributeString = ''
+            if attrs:
+                attributeString = ' ' + ' '.join(attrs)
+            if prettyPrint:
+                s.append(space)
+            s.append('<%s%s%s>' % (encodedName, attributeString, close))
+            if prettyPrint:
+                s.append("\n")
+            s.append(contents)
+            if prettyPrint and contents and contents[-1] != "\n":
+                s.append("\n")
+            if prettyPrint and closeTag:
+                s.append(space)
+            s.append(closeTag)
+            if prettyPrint and closeTag and self.nextSibling:
+                s.append("\n")
+            s = ''.join(s)
+        return s
+
+    def decompose(self):
+        """Recursively destroys the contents of this tree."""
+        self.extract()
+        if len(self.contents) == 0:
+            return
+        current = self.contents[0]
+        while current is not None:
+            next = current.next
+            if isinstance(current, Tag):
+                del current.contents[:]
+            current.parent = None
+            current.previous = None
+            current.previousSibling = None
+            current.next = None
+            current.nextSibling = None
+            current = next
+
+    def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING):
+        return self.__str__(encoding, True)
+
+    def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
+                       prettyPrint=False, indentLevel=0):
+        """Renders the contents of this tag as a string in the given
+        encoding. If encoding is None, returns a Unicode string.."""
+        s=[]
+        for c in self:
+            text = None
+            if isinstance(c, NavigableString):
+                text = c.__str__(encoding)
+            elif isinstance(c, Tag):
+                s.append(c.__str__(encoding, prettyPrint, indentLevel))
+            if text and prettyPrint:
+                text = text.strip()
+            if text:
+                if prettyPrint:
+                    s.append(" " * (indentLevel-1))
+                s.append(text)
+                if prettyPrint:
+                    s.append("\n")
+        return ''.join(s)
+
+    #Soup methods
+
+    def find(self, name=None, attrs={}, recursive=True, text=None,
+             **kwargs):
+        """Return only the first child of this Tag matching the given
+        criteria."""
+        r = None
+        l = self.findAll(name, attrs, recursive, text, 1, **kwargs)
+        if l:
+            r = l[0]
+        return r
+    findChild = find
+
+    def findAll(self, name=None, attrs={}, recursive=True, text=None,
+                limit=None, **kwargs):
+        """Extracts a list of Tag objects that match the given
+        criteria.  You can specify the name of the Tag and any
+        attributes you want the Tag to have.
+
+        The value of a key-value pair in the 'attrs' map can be a
+        string, a list of strings, a regular expression object, or a
+        callable that takes a string and returns whether or not the
+        string matches for some custom definition of 'matches'. The
+        same is true of the tag name."""
+        generator = self.recursiveChildGenerator
+        if not recursive:
+            generator = self.childGenerator
+        return self._findAll(name, attrs, text, limit, generator, **kwargs)
+    findChildren = findAll
+
+    # Pre-3.x compatibility methods: the older names are kept as
+    # plain aliases of find and findAll.
+    first = find
+    fetch = findAll
+
+    def fetchText(self, text=None, recursive=True, limit=None):
+        return self.findAll(text=text, recursive=recursive, limit=limit)
+
+    def firstText(self, text=None, recursive=True):
+        return self.find(text=text, recursive=recursive)
+
+    #Private methods
+
+    def _getAttrMap(self):
+        """Initializes a map representation of this tag's attributes,
+        if not already initialized."""
+        if not getattr(self, 'attrMap'):
+            self.attrMap = {}
+            for (key, value) in self.attrs:
+                self.attrMap[key] = value
+        return self.attrMap
+
+    #Generator methods
+    def childGenerator(self):
+        # Just use the iterator from the contents
+        return iter(self.contents)
+
+    def recursiveChildGenerator(self):
+        if not len(self.contents):
+            raise StopIteration
+        stopNode = self._lastRecursiveChild().next
+        current = self.contents[0]
+        while current is not stopNode:
+            yield current
+            current = current.next
+
+
+# Next, a couple classes to represent queries and their results.
+class SoupStrainer:
+    """Encapsulates a number of ways of matching a markup element (tag or
+    text)."""
+
+    def __init__(self, name=None, attrs={}, text=None, **kwargs):
+        self.name = name
+        if isinstance(attrs, basestring):
+            kwargs['class'] = _match_css_class(attrs)
+            attrs = None
+        if kwargs:
+            if attrs:
+                attrs = attrs.copy()
+                attrs.update(kwargs)
+            else:
+                attrs = kwargs
+        self.attrs = attrs
+        self.text = text
+
+    def __str__(self):
+        if self.text:
+            return self.text
+        else:
+            return "%s|%s" % (self.name, self.attrs)
+
+    def searchTag(self, markupName=None, markupAttrs={}):
+        found = None
+        markup = None
+        if isinstance(markupName, Tag):
+            markup = markupName
+            markupAttrs = markup
+        callFunctionWithTagData = callable(self.name) \
+                                and not isinstance(markupName, Tag)
+
+        if (not self.name) \
+               or callFunctionWithTagData \
+               or (markup and self._matches(markup, self.name)) \
+               or (not markup and self._matches(markupName, self.name)):
+            if callFunctionWithTagData:
+                match = self.name(markupName, markupAttrs)
+            else:
+                match = True
+                markupAttrMap = None
+                for attr, matchAgainst in self.attrs.items():
+                    if not markupAttrMap:
+                         if hasattr(markupAttrs, 'get'):
+                            markupAttrMap = markupAttrs
+                         else:
+                            markupAttrMap = {}
+                            for k,v in markupAttrs:
+                                markupAttrMap[k] = v
+                    attrValue = markupAttrMap.get(attr)
+                    if not self._matches(attrValue, matchAgainst):
+                        match = False
+                        break
+            if match:
+                if markup:
+                    found = markup
+                else:
+                    found = markupName
+        return found
+
+    def search(self, markup):
+        #print 'looking for %s in %s' % (self, markup)
+        found = None
+        # If given a list of items, scan it for a text element that
+        # matches.
+        if hasattr(markup, "__iter__") \
+                and not isinstance(markup, Tag):
+            for element in markup:
+                if isinstance(element, NavigableString) \
+                       and self.search(element):
+                    found = element
+                    break
+        # If it's a Tag, make sure its name or attributes match.
+        # Don't bother with Tags if we're searching for text.
+        elif isinstance(markup, Tag):
+            if not self.text:
+                found = self.searchTag(markup)
+        # If it's text, make sure the text matches.
+        elif isinstance(markup, NavigableString) or \
+                 isinstance(markup, basestring):
+            if self._matches(markup, self.text):
+                found = markup
+        else:
+            raise Exception, "I don't know how to match against a %s" \
+                  % markup.__class__
+        return found
+
+    def _matches(self, markup, matchAgainst):
+        #print "Matching %s against %s" % (markup, matchAgainst)
+        result = False
+        if matchAgainst is True:
+            result = markup is not None
+        elif callable(matchAgainst):
+            result = matchAgainst(markup)
+        else:
+            #Custom match methods take the tag as an argument, but all
+            #other ways of matching match the tag name as a string.
+            if isinstance(markup, Tag):
+                markup = markup.name
+            if markup and not isinstance(markup, basestring):
+                markup = unicode(markup)
+            #Now we know that chunk is either a string, or None.
+            if hasattr(matchAgainst, 'match'):
+                # It's a regexp object.
+                result = markup and matchAgainst.search(markup)
+            elif hasattr(matchAgainst, '__iter__'): # list-like
+                result = markup in matchAgainst
+            elif hasattr(matchAgainst, 'items'):
+                result = markup.has_key(matchAgainst)
+            elif matchAgainst and isinstance(markup, basestring):
+                if isinstance(markup, unicode):
+                    matchAgainst = unicode(matchAgainst)
+                else:
+                    matchAgainst = str(matchAgainst)
+
+            if not result:
+                result = matchAgainst == markup
+        return result
+
+class ResultSet(list):
+    """A ResultSet is just a list that keeps track of the SoupStrainer
+    that created it."""
+    def __init__(self, source):
+        list.__init__([])
+        self.source = source
+
+# Now, some helper functions.
+
+def buildTagMap(default, *args):
+    """Turns a list of maps, lists, or scalars into a single map.
+    Used to build the SELF_CLOSING_TAGS, NESTABLE_TAGS, and
+    NESTING_RESET_TAGS maps out of lists and partial maps."""
+    built = {}
+    for portion in args:
+        if hasattr(portion, 'items'):
+            #It's a map. Merge it.
+            for k,v in portion.items():
+                built[k] = v
+        elif hasattr(portion, '__iter__'): # is a list
+            #It's a list. Map each item to the default.
+            for k in portion:
+                built[k] = default
+        else:
+            #It's a scalar. Map it to the default.
+            built[portion] = default
+    return built
+
+# Now, the parser classes.
+
+class BeautifulStoneSoup(Tag, SGMLParser):
+
+    """This class contains the basic parser and search code. It defines
+    a parser that knows nothing about tag behavior except for the
+    following:
+
+      You can't close a tag without closing all the tags it encloses.
+      That is, "<foo><bar></foo>" actually means
+      "<foo><bar></bar></foo>".
+
+    [Another possible explanation is "<foo><bar /></foo>", but since
+    this class defines no SELF_CLOSING_TAGS, it will never use that
+    explanation.]
+
+    This class is useful for parsing XML or made-up markup languages,
+    or when BeautifulSoup makes an assumption counter to what you were
+    expecting."""
+
+    SELF_CLOSING_TAGS = {}
+    NESTABLE_TAGS = {}
+    RESET_NESTING_TAGS = {}
+    QUOTE_TAGS = {}
+    PRESERVE_WHITESPACE_TAGS = []
+
+    MARKUP_MASSAGE = [(re.compile('(<[^<>]*)/>'),
+                       lambda x: x.group(1) + ' />'),
+                      (re.compile('<!\s+([^<>]*)>'),
+                       lambda x: '<!' + x.group(1) + '>')
+                      ]
+
+    ROOT_TAG_NAME = u'[document]'
+
+    HTML_ENTITIES = "html"
+    XML_ENTITIES = "xml"
+    XHTML_ENTITIES = "xhtml"
+    # TODO: This only exists for backwards-compatibility
+    ALL_ENTITIES = XHTML_ENTITIES
+
+    # Used when determining whether a text node is all whitespace and
+    # can be replaced with a single space. A text node that contains
+    # fancy Unicode spaces (usually non-breaking) should be left
+    # alone.
+    STRIP_ASCII_SPACES = { 9: None, 10: None, 12: None, 13: None, 32: None, }
+
+    def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None,
+                 markupMassage=True, smartQuotesTo=XML_ENTITIES,
+                 convertEntities=None, selfClosingTags=None, isHTML=False):
+        """The Soup object is initialized as the 'root tag', and the
+        provided markup (which can be a string or a file-like object)
+        is fed into the underlying parser.
+
+        sgmllib will process most bad HTML, and the BeautifulSoup
+        class has some tricks for dealing with some HTML that kills
+        sgmllib, but Beautiful Soup can nonetheless choke or lose data
+        if your data uses self-closing tags or declarations
+        incorrectly.
+
+        By default, Beautiful Soup uses regexes to sanitize input,
+        avoiding the vast majority of these problems. If the problems
+        don't apply to you, pass in False for markupMassage, and
+        you'll get better performance.
+
+        The default parser massage techniques fix the two most common
+        instances of invalid HTML that choke sgmllib:
+
+         <br/> (No space between name of closing tag and tag close)
+         <! --Comment--> (Extraneous whitespace in declaration)
+
+        You can pass in a custom list of (RE object, replace method)
+        tuples to get Beautiful Soup to scrub your input the way you
+        want."""
+
+        self.parseOnlyThese = parseOnlyThese
+        self.fromEncoding = fromEncoding
+        self.smartQuotesTo = smartQuotesTo
+        self.convertEntities = convertEntities
+        # Set the rules for how we'll deal with the entities we
+        # encounter
+        if self.convertEntities:
+            # It doesn't make sense to convert encoded characters to
+            # entities even while you're converting entities to Unicode.
+            # Just convert it all to Unicode.
+            self.smartQuotesTo = None
+            if convertEntities == self.HTML_ENTITIES:
+                self.convertXMLEntities = False
+                self.convertHTMLEntities = True
+                self.escapeUnrecognizedEntities = True
+            elif convertEntities == self.XHTML_ENTITIES:
+                self.convertXMLEntities = True
+                self.convertHTMLEntities = True
+                self.escapeUnrecognizedEntities = False
+            elif convertEntities == self.XML_ENTITIES:
+                self.convertXMLEntities = True
+                self.convertHTMLEntities = False
+                self.escapeUnrecognizedEntities = False
+        else:
+            self.convertXMLEntities = False
+            self.convertHTMLEntities = False
+            self.escapeUnrecognizedEntities = False