Commits

Marcin Kasperski  committed 5dc9504

Preliminary version of disqus exporter

  • Participants
  • Parent commits 6b7e17a

Comments (0)

Files changed (2)

 (http://s9y.org) PHP blog and Blogofile (http://www.blogofile.com)
 static blog generator.
 
+   serendipity2blogofile.py
+
+      Export blog posts and static pages (reads Serendipity database,
+      writes blogofile-compatible _posts directory and some extra
+      files)
+
+   serendipity2disqus.py
+
+      Export blog comments (reads Serendipity database, writes
+      XML suitable for Disqus import).
+
+Read comments in both files for more detail.

File serendipity2disqus.py

-# Not yet implemented
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# Author: Marcin Kasperski
 
-_references   (trackbacki chyba albo nie wiem)
-_comments
+"""Export Serendipity comments to Disqus compatible XML.
+
+This file is MIT licensed, see http://blogofile.com/LICENSE.html for details.
+
+I wrote it while working on serendipity→blogofile conversion,
+but it should work for other s9y→disqus cases too.
+
+Requirements:
+
+  * An existing Serendipity database hosted on PostgreSQL
+
+  * SQLAlchemy and Postgres driver
+
+Output:
+
+  * XML file in custom Disqus format
+    http://docs.disqus.com/developers/export/import_format/
+
+Usage:
+
+  * Edit the database connection details and the other configuration elements below
+
+  * Execute
+  
+
+    python /path/to/serendipity2disqus.py  output.xml
+
+    If everything worked right, this will create xml
+    file suitable for disqus import.
+
+Important notes:
+
+  * Entries' numerical ids are used as Disqus thread identifiers.
+    Configure blogofile to refer to them appropriately.
+
+  * Pingbacks and trackbacks are either imported as ordinary comments
+    or skipped, depending on the settings below (the Disqus import
+    format has no way to distinguish trackbacks and pingbacks from
+    normal comments, and I have no clue how they are supposed to be
+    imported)
+
+"""
+
+###########################################################################
+# Configuration requiring edits
+###########################################################################
+
+### Database connection details
+
+table_prefix = "s9en_"
+
+db_username  = "blogowner"
+db_password  = ""                    
+db_host      = "linode.mekk.waw.pl"  
+db_port      = "5432"
+db_database  = "BLOG" 
+db_conn      = "postgres://{db_username}:{db_password}@{db_host}:{db_port}/{db_database}".format(**locals())
+
+### Other configuration
+
+# Import spam (moderated comments)?
+IMPORT_MODERATED_COMMENTS = False
+
+# Import trackbacks (as normal comments)
+IMPORT_TRACKBACKS = True
+
+# Import pingbacks (as normal comments)
+IMPORT_PINGBACKS = True
+
+###########################################################################
+# Importer code. No need to edit anything below
+###########################################################################
+
+import os
+import re
+import sys
+import markdown
+import codecs
+import datetime
+import sqlalchemy as sa
+import sqlalchemy.orm as orm
+from sqlalchemy.ext.declarative import declarative_base
+
+import logging
+# Markdown logging is noisy, turn it down:
+logging.getLogger("MARKDOWN").setLevel(logging.ERROR)
+
+###########################################################################
+# SQLAlchemy objects
+###########################################################################
+
+engine = sa.create_engine(db_conn)
+Session = orm.scoped_session(
+    orm.sessionmaker(autocommit=False,
+                     autoflush=False,
+                     bind=engine))
+Base = declarative_base(bind=engine)
+
+session = Session()
+
+###########################################################################
+# Mapper objects
+###########################################################################
+
+class Config(Base):
+    """
+    Configuration table. Has fields name and value (and authorid but
+    who cares)
+
+    Intersting properties (names)
+
+    blogTitle, blogDescription, blogMail, lang, baseURL, 
+
+    permalinkStructure, permalinkAuthorStructure,
+    permalinkCategoryStructure, permalinkFeedCategoryStructure,
+    permalinkArchivePath, permalinkArchivesPath, permalinkFeedsPath
+
+    username, realname, email
+    """
+    __tablename__ = table_prefix + "config"
+    __table_args__ = {'autoload': True}
+    name = sa.Column("name", sa.String, primary_key = True)
+
+
+class Author(Base):
+    """
+    Author information. Interesting fields:
+
+    authorid (numerical id)
+    username (nick)
+    realname (full true name)
+    email
+    """
+    __tablename__ = table_prefix + "authors"
+    __table_args__ = {'autoload': True}
+
+class Permalink(Base):
+    """
+    Entries and categories permalinks
+
+    permalink  (relative, for example "archives/44-My-Article.html")
+    entry_id   (numerical)
+    type       ("entry" or "category")
+
+    Note: as we don't use category permalink, I haven't fought
+    with SQLAlchemy tu polymorphically make entry_id foreign
+    key to either entries, or categories
+    """
+    __tablename__ = table_prefix + "permalinks"
+    __table_args__ = {'autoload': True}
+    entry_id = sa.Column("entry_id", sa.Integer,
+                         sa.ForeignKey(table_prefix + "entries.id"),
+                         primary_key = True)
+
+class Entry(Base):
+    """
+    Actual entry.
+
+    Properties to be used directly:
+
+    id
+    title
+    body
+    extended
+    isdraft
+
+    author (object with attributes username, realname, email)
+
+    Other properties (including timestamp, author, authorid, 
+    last_modified and mapped from other tables) better should be
+    used by methods.
+    """
+    __tablename__ = table_prefix + "entries"
+    __table_args__ = {'autoload': True}
+
+    authorid = sa.Column("authorid", 
+                         sa.ForeignKey(table_prefix + "authors"))
+    author_nick = sa.Column("author",
+            sa.ForeignKey(table_prefix + 'authors.authorid'))
+    author = orm.relation("Author",
+                          primaryjoin="Entry.authorid == Author.authorid")
+    permalink_rel = orm.relation(
+        "Permalink",
+        primaryjoin="and_"
+        "(Entry.id == Permalink.entry_id, "
+        "Permalink.type == 'entry')",
+        uselist = False)
+    comments_rel = orm.relation(
+        "Comment",
+        )
+
+    def creation_time(self):
+        return datetime.datetime.fromtimestamp(self.timestamp)
+    def creation_time_str(self):
+        return self.creation_time().strftime("%Y-%m-%d %H:%M:%S"),
+
+    def last_modification_time(self):
+        return datetime.datetime.fromtimestamp(self.last_modified)
+
+    def permalink(self):
+        return self.permalink_rel.permalink
+
+    def get_comments(self, also_moderated = False):
+        for comment in self.comments_rel:
+            if also_moderated or comment.is_approved():
+                yield comment
+
+    def body_html(self):
+        return markdown.markdown(self.body)
+
+
+class Comment(Base):
+    """
+    Actual comments. Interesting fields:
+
+    id   (numerical comment id)
+    entry_id   (numerical id of entry comment is bound to)
+    parent_id  (parent comment in case of threading)
+    timestamp (int, save time)
+    title     
+    author
+    email     (of author, as text - "somebody@some.com")
+    url       (of author, as text - "http://some.where.com/x")
+    ip        (of author, as text - "10.11.21.11")
+    body      (markdown)
+    type      ("NORMAL", "PINGBACK", "TRACKBACK")
+    subscribed ("f", "t")
+    status    ("approved", "pending")
+    """
+    __tablename__ = table_prefix + "comments"
+    __table_args__ = {'autoload': True}
+
+    entry_id = sa.Column("entry_id",
+                         sa.ForeignKey(table_prefix + 'entries.id'))
+
+    def creation_time(self):
+        return datetime.datetime.fromtimestamp(self.timestamp)
+    def creation_time_str(self):
+        return self.creation_time().strftime("%Y-%m-%d %H:%M:%S"),
+
+    def body_html(self):
+        return markdown.markdown(self.body)
+
+    def is_subscribed(self):
+        return self.subscribed == "t"
+
+    def is_approved(self):
+        return self.status == "approved"
+
+###########################################################################
+# Database helpers
+###########################################################################
+
+def get_blog_posts():
+    """
+    Yields all blog posts found, excluding drafts. Use Entry class
+    methods to examine data.
+    """
+    for item in session.query(Entry).order_by(Entry.id):
+        if not item.isdraft:
+            yield item
+
+def get_config_item(item_name):
+    return session.query(Config).get(item_name).value
+
+###########################################################################
+# XML helpers
+###########################################################################
+
+class XMLWriter(object):
+
+    def __init__(self, output_file):
+        self.output = codecs.open(output_file, "w", "utf-8")
+        self.output.write("""<?xml version="1.0" encoding="UTF-8"?>
+<rss version="2.0"
+  xmlns:content="http://purl.org/rss/1.0/modules/content/"
+  xmlns:dsq="http://www.disqus.com/"
+  xmlns:dc="http://purl.org/dc/elements/1.1/"
+  xmlns:wp="http://wordpress.org/export/1.0/"
+>
+  <channel>
+""")
+
+    def process_entry(self, entry):
+        if entry.isdraft:
+            return
+        self.output.write(u"""
+    <item>
+      <title>%(title)s</title>
+      <link>%(link)s</link>
+      <content:encoded><![CDATA[%(body)s]]></content:encoded>
+      <dsq:thread_identifier>%(id)s</dsq:thread_identifier>
+      <wp:post_date_gmt>%(date)s</wp:post_date_gmt>
+      <wp:comment_status>open</wp:comment_status>
+""" % dict(
+                title = entry.title,
+                link = "%s%s" % (baseURL, entry.permalink()),
+                body = entry.body_html(),
+                id = entry.id,
+                date = entry.creation_time_str(),
+                ))
+        for comment in entry.get_comments(also_moderated = IMPORT_MODERATED_COMMENTS):
+            self.process_comment(comment)
+        self.output.write("""
+    </item>
+""")
+
+    def process_comment(self, comment):
+        if comment.type == "TRACKBACK" and not IMPORT_TRACKBACKS:
+            return
+        if comment.type == "PINGBACK" and not IMPORT_PINGBACKS:
+            return
+        self.output.write(u"""
+      <wp:comment>
+        <wp:comment_id>%(comment_id)s</wp:comment_id>
+        <wp:comment_author>%(author)s</wp:comment_author>
+        <wp:comment_author_email>%(author_email)s</wp:comment_author_email>
+        <wp:comment_author_url>%(author_url)s</wp:comment_author_url>
+        <wp:comment_author_IP>%(author_ip)s</wp:comment_author_IP>
+        <wp:comment_date_gmt>%(date)s</wp:comment_date_gmt>
+        <wp:comment_content><![CDATA[%(body)s]]></wp:comment_content>
+        <wp:comment_approved>%(approved)s</wp:comment_approved>
+        <wp:comment_parent>%(parent_id)s</wp:comment_parent>
+      </wp:comment>
+""" % dict(
+                comment_id = comment.id,
+                author = comment.author,
+                author_email = comment.email,
+                author_url = comment.url,
+                author_ip = comment.ip,
+                date = comment.creation_time_str(),
+                body = comment.body_html(),
+                approved = comment.is_approved() and "1" or "0",
+                parent_id = comment.parent_id,
+                ))
+
+    def finalize(self):
+        self.output.write("""
+  </channel>
+</rss>
+""")
+        self.output.close()
+
+###########################################################################
+# Main
+###########################################################################
+    
+if __name__ == '__main__':
+    if len(sys.argv) != 2:
+        print "Usage:\n"
+        print "    python serendipity2disqus.py output.xml"
+        sys.exit(1)
+    output_file = sys.argv[1]
+
+    if os.path.exists(output_file):
+            print "There's already a %s file here, "\
+                "I'm not going to overwrite it." % output_file
+            sys.exit(1)
+
+    baseURL = get_config_item("baseURL")
+
+    writer = XMLWriter(output_file)
+
+    for entry in get_blog_posts():
+        writer.process_entry(entry)
+
+    writer.finalize()