Source

serendipity2blogofile / serendipity2disqus.py

Full commit
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Author: Marcin Kasperski

"""Export Serendipity comments to Disqus compatible XML.

This file is MIT licensed, see http://blogofile.com/LICENSE.html for details.

I wrote it while working on serendipity→blogofile conversion,
but it should work for other s9y→disqus cases too.

Requirements:

  * An existing Serendipity database hosted on PostgreSQL

  * SQLAlchemy and Postgres driver

Output:

  * XML file in custom Disqus format
    http://docs.disqus.com/developers/export/import_format/

Usage:

  * Edit the database connection details and the other configuration elements below

  * Execute
  

    python /path/to/serendipity2disqus.py  output.xml

    If everything worked right, this will create xml
    file suitable for disqus import.

Important notes:

  * The numerical ids of entries are used as Disqus thread identifiers.
    Configure blogofile to refer to them appropriately.

  * Pingbacks and trackbacks are either imported as ordinary comments
    or skipped, depending on the settings below (the Disqus import
    format has no way to distinguish trackbacks and pingbacks from
    normal comments, and I have no clue how they are supposed to be
    imported)

"""

###########################################################################
# Configuration requiring edits
###########################################################################

### Database connection details

# Prefix put in front of every Serendipity table name mapped below.
table_prefix = "s9pl_"

db_username = "blogowner"
db_password = ""
db_host = "localhost"
db_port = "5432"
db_database = "BLOG"
# Connection URL handed to SQLAlchemy, assembled from the settings above.
db_conn = "postgres://{db_username}:{db_password}@{db_host}:{db_port}/{db_database}".format(
    db_username=db_username,
    db_password=db_password,
    db_host=db_host,
    db_port=db_port,
    db_database=db_database,
)

### Other configuration

# Also export comments that are not approved (moderated / spam)?
IMPORT_MODERATED_COMMENTS = False

# Export trackbacks (they are written as normal comments)?
IMPORT_TRACKBACKS = True

# Export pingbacks (they are written as normal comments)?
IMPORT_PINGBACKS = True

###########################################################################
# Importer code. No need to edit anything below
###########################################################################

import os
import re
import sys
import markdown
import codecs
import datetime
import sqlalchemy as sa
import sqlalchemy.orm as orm
from sqlalchemy.ext.declarative import declarative_base

import logging
# Markdown logging is noisy, so only let errors through:
logging.getLogger("MARKDOWN").setLevel(logging.ERROR)

###########################################################################
# SQLAlchemy objects
###########################################################################

# Engine built from the connection URL configured at the top of the file.
engine = sa.create_engine(db_conn)

# Declarative base bound to the engine; every mapper class below derives
# from it.
Base = declarative_base(bind=engine)

# Scoped session factory; neither autocommit nor autoflush is enabled.
Session = orm.scoped_session(
    orm.sessionmaker(bind=engine, autoflush=False, autocommit=False))

# The single session object used by the database helpers below.
session = Session()

###########################################################################
# Mapper objects
###########################################################################

class Config(Base):
    """
    Configuration table. Has fields name and value (and authorid, but
    who cares).

    Interesting properties (names):

    blogTitle, blogDescription, blogMail, lang, baseURL, 

    permalinkStructure, permalinkAuthorStructure,
    permalinkCategoryStructure, permalinkFeedCategoryStructure,
    permalinkArchivePath, permalinkArchivesPath, permalinkFeedsPath

    username, realname, email
    """
    # Columns are reflected from the database (autoload); only "name" is
    # declared explicitly so it can be marked as the primary key.
    __tablename__ = table_prefix + "config"
    __table_args__ = {'autoload': True}
    name = sa.Column("name", sa.String, primary_key = True)


class Author(Base):
    """
    Author information. Interesting fields:

    authorid (numerical id)
    username (nick)
    realname (full true name)
    email
    """
    # All columns are reflected from the database (autoload).
    __tablename__ = table_prefix + "authors"
    __table_args__ = {'autoload': True}

class Permalink(Base):
    """
    Permalinks of entries and categories.

    permalink  (relative, for example "archives/44-My-Article.html")
    entry_id   (numerical)
    type       ("entry" or "category")

    Note: as we don't use category permalinks, I haven't fought
    with SQLAlchemy to polymorphically make entry_id a foreign
    key to either entries or categories.
    """
    __tablename__ = table_prefix + "permalinks"
    __table_args__ = {'autoload': True}
    # entry_id is declared by hand as a foreign key to the entries table
    # and marked as the primary key for the mapper.
    # NOTE(review): presumably the reflected table has no usable primary
    # key of its own - confirm against the schema.
    entry_id = sa.Column("entry_id", sa.Integer,
                         sa.ForeignKey(table_prefix + "entries.id"),
                         primary_key = True)

class Entry(Base):
    """
    A blog entry, mapped onto the reflected entries table.

    Attributes meant to be used directly:

    id
    title
    body
    extended
    isdraft

    author (related Author object with username, realname, email)

    Everything else (timestamp, authorid, last_modified and the
    attributes mapped from other tables) is better reached through
    the methods defined below.
    """
    __tablename__ = table_prefix + "entries"
    __table_args__ = {'autoload': True}

    # NOTE(review): this ForeignKey names only the authors table, not a
    # column - confirm that the SQLAlchemy version in use resolves it to
    # the authorid column.
    authorid = sa.Column(
        "authorid",
        sa.ForeignKey(table_prefix + "authors"))
    # The "author" column of the table is exposed as author_nick, because
    # the name "author" is used for the relation below.
    author_nick = sa.Column(
        "author",
        sa.ForeignKey(table_prefix + 'authors.authorid'))

    # Two columns reference the authors table, so the join condition is
    # spelled out explicitly.
    author = orm.relation(
        "Author",
        primaryjoin="Entry.authorid == Author.authorid")
    # Only permalink rows of type 'entry' are joined; uselist=False makes
    # this a single object rather than a list.
    permalink_rel = orm.relation(
        "Permalink",
        primaryjoin="and_(Entry.id == Permalink.entry_id, "
                    "Permalink.type == 'entry')",
        uselist=False)
    comments_rel = orm.relation("Comment")

    def creation_time(self):
        """Return the entry's ``timestamp`` converted with
        ``datetime.fromtimestamp`` (i.e. as local time)."""
        created = datetime.datetime.fromtimestamp(self.timestamp)
        return created

    def creation_time_str(self):
        """Return the creation time formatted as YYYY-MM-DD HH:MM:SS."""
        created = self.creation_time()
        return created.strftime("%Y-%m-%d %H:%M:%S")

    def last_modification_time(self):
        """Return ``last_modified`` converted with
        ``datetime.fromtimestamp``."""
        modified = datetime.datetime.fromtimestamp(self.last_modified)
        return modified

    def permalink(self):
        """Return the permalink string of the related Permalink row."""
        link_row = self.permalink_rel
        return link_row.permalink

    def get_comments(self, also_moderated = False):
        """
        Yield the comments attached to this entry.

        Comments that are not approved are skipped unless
        also_moderated is true.
        """
        for item in self.comments_rel:
            if not also_moderated and not item.is_approved():
                continue
            yield item

    def body_html(self):
        """Return the entry body rendered to HTML by markdown."""
        rendered = markdown.markdown(self.body)
        return rendered


class Comment(Base):
    """
    A single comment, mapped onto the reflected comments table.

    Interesting fields:

    id         (numerical comment id)
    entry_id   (numerical id of the entry the comment belongs to)
    parent_id  (parent comment in case of threading)
    timestamp  (int, save time)
    title
    author
    email      (of author, as text - "somebody@some.com")
    url        (of author, as text - "http://some.where.com/x")
    ip         (of author, as text - "10.11.21.11")
    body       (markdown)
    type       ("NORMAL", "PINGBACK", "TRACKBACK")
    subscribed ("f", "t")
    status     ("approved", "pending")
    """
    __tablename__ = table_prefix + "comments"
    __table_args__ = {'autoload': True}

    # Declared by hand so that the column carries a foreign key to the
    # entries table.
    entry_id = sa.Column(
        "entry_id",
        sa.ForeignKey(table_prefix + 'entries.id'))

    def creation_time(self):
        """Return the comment's ``timestamp`` converted with
        ``datetime.fromtimestamp`` (i.e. as local time)."""
        saved_at = datetime.datetime.fromtimestamp(self.timestamp)
        return saved_at

    def creation_time_str(self):
        """Return the creation time formatted as YYYY-MM-DD HH:MM:SS."""
        saved_at = self.creation_time()
        return saved_at.strftime("%Y-%m-%d %H:%M:%S")

    def body_html(self):
        """Return the comment body rendered to HTML by markdown."""
        rendered = markdown.markdown(self.body)
        return rendered

    def is_subscribed(self):
        """Tell whether the ``subscribed`` field holds "t"."""
        return "t" == self.subscribed

    def is_approved(self):
        """Tell whether the ``status`` field holds "approved"."""
        return "approved" == self.status

###########################################################################
# Database helpers
###########################################################################

def get_blog_posts():
    """
    Generate every blog post ordered by id, leaving out drafts.

    The yielded objects are Entry instances; use the Entry class
    methods to examine their data.
    """
    entries_by_id = session.query(Entry).order_by(Entry.id)
    for post in entries_by_id:
        if post.isdraft:
            continue
        yield post

def get_config_item(item_name):
    """
    Return the ``value`` of the Config row whose primary key (name)
    is item_name.
    """
    # NOTE(review): presumably .get() yields None when no such row exists,
    # in which case the attribute access below fails - confirm.
    row = session.query(Config).get(item_name)
    return row.value

###########################################################################
# XML helpers
###########################################################################

def _xml_text(value):
    """Return value as text that is safe to place inside an XML element.

    None becomes an empty string; anything else is converted to text and
    has the characters &, < and > escaped.
    """
    # Local import: the rest of the file does not import this module.
    from xml.sax.saxutils import escape
    if value is None:
        return u""
    return escape(u"%s" % (value,))


def _cdata(text):
    """Return text prepared for embedding in a CDATA section.

    A literal "]]>" would end the section early, so each occurrence is
    split across two adjacent CDATA sections. None becomes an empty string.
    """
    if text is None:
        return u""
    return (u"%s" % (text,)).replace(u"]]>", u"]]]]><![CDATA[>")


def _gmt_timestamp(epoch_seconds):
    """Format a Unix timestamp as YYYY-MM-DD HH:MM:SS in GMT."""
    # Local import: the rest of the file does not import this module.
    import time
    return time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime(epoch_seconds))


class XMLWriter(object):
    """
    Writes entries and their comments to a file as the RSS/WXR-style
    XML used for Disqus imports.

    Usage: create the writer, call process_entry() once per entry, then
    call finalize() to write the closing tags and close the file.
    """

    def __init__(self, output_file):
        """Open output_file for writing (UTF-8) and emit the XML header."""
        self.output = codecs.open(output_file, "w", "utf-8")
        self.output.write("""<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0"
  xmlns:content="http://purl.org/rss/1.0/modules/content/"
  xmlns:dsq="http://www.disqus.com/"
  xmlns:dc="http://purl.org/dc/elements/1.1/"
  xmlns:wp="http://wordpress.org/export/1.0/"
>
  <channel>
""")

    def process_entry(self, entry):
        """
        Write one <item> element for entry, followed by its comments.

        Drafts are skipped. The link is built from the module-level
        baseURL and the entry permalink; the date is derived from
        entry.timestamp and written in GMT, as the *_gmt element name
        requires. Text fields are XML-escaped.
        """
        if entry.isdraft:
            return
        self.output.write(u"""
    <item>
      <title>%(title)s</title>
      <link>%(link)s</link>
      <content:encoded><![CDATA[%(body)s]]></content:encoded>
      <dsq:thread_identifier>%(id)s</dsq:thread_identifier>
      <wp:post_date_gmt>%(date)s</wp:post_date_gmt>
      <wp:comment_status>open</wp:comment_status>
""" % dict(
                title = _xml_text(entry.title),
                link = _xml_text(u"%s%s" % (baseURL, entry.permalink())),
                body = _cdata(entry.body_html()),
                id = _xml_text(entry.id),
                date = _gmt_timestamp(entry.timestamp),
                ))
        for comment in entry.get_comments(also_moderated = IMPORT_MODERATED_COMMENTS):
            self.process_comment(comment)
        self.output.write("""
    </item>
""")

    def process_comment(self, comment):
        """
        Write one <wp:comment> element for comment.

        Trackbacks and pingbacks are skipped when the corresponding
        IMPORT_* setting is off. Text fields are XML-escaped, missing
        (None) values are written as empty elements, a missing parent is
        written as 0, and the date is derived from comment.timestamp in
        GMT.
        """
        if comment.type == "TRACKBACK" and not IMPORT_TRACKBACKS:
            return
        if comment.type == "PINGBACK" and not IMPORT_PINGBACKS:
            return
        self.output.write(u"""
      <wp:comment>
        <wp:comment_id>%(comment_id)s</wp:comment_id>
        <wp:comment_author>%(author)s</wp:comment_author>
        <wp:comment_author_email>%(author_email)s</wp:comment_author_email>
        <wp:comment_author_url>%(author_url)s</wp:comment_author_url>
        <wp:comment_author_IP>%(author_ip)s</wp:comment_author_IP>
        <wp:comment_date_gmt>%(date)s</wp:comment_date_gmt>
        <wp:comment_content><![CDATA[%(body)s]]></wp:comment_content>
        <wp:comment_approved>%(approved)s</wp:comment_approved>
        <wp:comment_parent>%(parent_id)s</wp:comment_parent>
      </wp:comment>
""" % dict(
                comment_id = _xml_text(comment.id),
                author = _xml_text(comment.author),
                author_email = _xml_text(comment.email),
                author_url = _xml_text(comment.url),
                author_ip = _xml_text(comment.ip),
                date = _gmt_timestamp(comment.timestamp),
                body = _cdata(comment.body_html()),
                approved = "1" if comment.is_approved() else "0",
                parent_id = _xml_text(comment.parent_id or 0),
                ))

    def finalize(self):
        """Write the closing tags and close the output file."""
        self.output.write("""
  </channel>
</rss>
""")
        self.output.close()

###########################################################################
# Main
###########################################################################
    
if __name__ == '__main__':
    # Exactly one argument is expected: the path of the XML file to create.
    if len(sys.argv) != 2:
        print("Usage:\n")
        print("    python serendipity2disqus.py output.xml")
        sys.exit(1)
    output_file = sys.argv[1]

    # Refuse to clobber an existing file.
    if os.path.exists(output_file):
        print("There's already a %s file here, "
              "I'm not going to overwrite it." % output_file)
        sys.exit(1)

    # Module-level name: XMLWriter.process_entry reads it to build links.
    baseURL = get_config_item("baseURL")

    writer = XMLWriter(output_file)

    for entry in get_blog_posts():
        writer.process_entry(entry)

    writer.finalize()