newklet / rs2nk.py

import os.path
import sqlite3
import urllib2
from datetime import datetime
from os import mkdir
from shutil import rmtree, copytree

import feedparser
from jinja2 import Template, Markup

import settings
from decruft.decruft import Document
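
# The settings module is expected to provide roughly the following (a
# hypothetical sketch, inferred from the attributes used below; the example
# values are illustrative only):
#
#   destination = "out"   # output directory, wiped and rebuilt on every run
#   feeds = [             # (title, name, url) triples
#       ("Example Feed", "example", "http://example.com/rss.xml"),
#   ]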


base_dir = "nook_base"  # static assets copied into each fresh build of the destination

DATE_FORMAT = '%Y-%m-%d %H:%M:%S'

# Install a global urllib2 opener that keeps cookies between requests.
o = urllib2.build_opener(urllib2.HTTPCookieProcessor())
urllib2.install_opener(o)

def connect_sqlite():
    """Open (or create) the local cache database and ensure its schema exists."""
    db = sqlite3.connect("cache")
    c = db.cursor()
    c.execute("create table if not exists cache (datetime text, url text, content text, cleaned text)")
    c.execute("create unique index if not exists cache_url on cache(url)")
    c.execute("create index if not exists cache_datetime on cache(datetime)")
    db.isolation_level = None  # autocommit mode, so no explicit commit() is needed
    c.close()
    return db

db = connect_sqlite()
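
# Each cache row holds (datetime, url, content, cleaned). For article URLs,
# content is the raw page HTML and cleaned is the decrufted version; for feed
# URLs (see get_feed below), content is the raw feed XML and cleaned is unused.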

def cache_set(url, content, cleaned):
    c = db.cursor()
    c.execute("insert into cache (datetime, url, content, cleaned) values (?, ?, ?, ?)",
            [datetime.now().strftime(DATE_FORMAT), url, content, cleaned])
    c.close()

def render_template(name, **kwargs):
    with open("templates/" + name, "r") as f:
        return Template(f.read()).render(**kwargs)

def cache_get_content(url):
    c = db.cursor()
    result = c.execute("select content from cache where url=?", [url]).fetchone()
    c.close()
    if result and result[0]:
        return result[0]

def cache_get_cleaned(url):
    c = db.cursor()
    result = c.execute("select cleaned from cache where url=?", [url]).fetchone()
    c.close()
    if result and result[0]:
        return result[0]

def cache_update(url, cleaned):
    c = db.cursor()
    c.execute("update cache set datetime=?, cleaned=? where url=?", [datetime.now().strftime(DATE_FORMAT), cleaned, url])
    c.close()

def cache_touch(url):
    c = db.cursor()
    c.execute("update cache set datetime=? where url=?", [datetime.now().strftime(DATE_FORMAT), url])
    c.close()


def makepretty(url):
    """Return decrufted HTML for url, serving from the cache when possible."""
    print "Fetching url "+url
    cleaned = cache_get_cleaned(url)
    if cleaned:
        cache_touch(url)
        return cleaned

    html = cache_get_content(url)
    if not html:
        from_cache = False
        print "No cache"
        f = o.open(url)
        # str.decode() takes no keyword arguments in Python 2, so the error
        # handler must be passed positionally.
        html = f.read().decode('utf-8', 'ignore')
        f.close()
    else:
        from_cache = True
        print "From cache"
    cleaned = Document(html).summary()
    if from_cache:
        cache_update(url, cleaned)
    else:
        cache_set(url, html, cleaned)

    return cleaned
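
# e.g. makepretty("http://example.com/article.html") would return just the
# extracted article body as HTML (a hypothetical URL, for illustration).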

def get_feed(url):
    """Return the raw feed bytes for url, refreshing the cache if older than 30 minutes."""
    print "Getting feed"
    c = db.cursor()
    r = c.execute("select content, datetime from cache where url=?", [url]).fetchone()
    if r:
        content, time = r
        content = content.encode("utf-8")
        try:
            time = datetime.strptime(time, DATE_FORMAT)
        except ValueError:
            time = None
        # total_seconds() counts whole days too; timedelta.seconds alone wraps
        # at 24 hours and would treat a day-old entry as fresh.
        if not content or time is None or (datetime.now() - time).total_seconds() > 30 * 60:
            print time and (datetime.now() - time).total_seconds()
            print "Updating cache"
            content = urllib2.urlopen(url).read()
            c.execute("update cache set content=?, datetime=? where url=?", [content.decode("utf-8"), datetime.now().strftime(DATE_FORMAT), url])
        else:
            print "From cache"
    else:
        print "From the internet"
        content = urllib2.urlopen(url).read()
        c.execute("insert into cache (content, datetime, url) values (?, ?, ?)", [content.decode("utf-8"), datetime.now().strftime(DATE_FORMAT), url])
    c.close()
    return content

def parse(name, url):
    """Parse the feed at url into a template-friendly dict (see shape below)."""
    html = get_feed(url)
    d = feedparser.parse(html)
    result = {'name': name, 'title': d.feed.title, 'entries': []}

    for eid, e in enumerate(d.entries):
        result['entries'].append({'id': eid, 'title': unicode(e.title), 'summary': unicode(e.summary), 'url': e.link,
            'link': '{0}/{1}.html'.format(name, eid)})
    return result
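
# parse() returns a dict shaped roughly like this (values are illustrative):
#   {'name': 'example', 'title': u'Example Feed',
#    'entries': [{'id': 0, 'title': u'...', 'summary': u'...',
#                 'url': 'http://example.com/post', 'link': 'example/0.html'}]}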

def process_rss(name, url):
    print "Processing rss "+url
    feed_dir = os.path.join(settings.destination, name)
    mkdir(feed_dir)
    feed = parse(name, url)
    with open(os.path.join(settings.destination, name+".html"), "w") as f:
        html = render_template("feed.html", d=feed, is_eink=False)
        f.write(html.encode("utf-8"))
    with open(os.path.join(settings.destination, "eink_"+name+".html"), "w") as f:
        html = render_template("feed.html", d=feed, is_eink=True)
        f.write(html.encode("utf-8"))
    for entry in feed['entries']:
        filename = os.path.join(settings.destination, name, str(entry['id']) + ".html")
        try:
            with open(filename, "w") as f:
                f.write(render_template("singlefeed.html", e=entry, content=Markup(makepretty(entry['url']))).encode('utf-8'))
        except Exception as exc:  # don't let one bad article abort the whole feed
            print exc

def initialize_destination():
    """Wipe the output directory and re-seed it with the static base files."""
    if os.path.isdir(settings.destination):
        rmtree(settings.destination)
    copytree(base_dir, settings.destination)

if __name__ == '__main__':
    initialize_destination()
    links = []
    for title, name, url in settings.feeds:
        print "Processing "+title
        process_rss(name, url)
        links.append({'title': title, 'link': '{0}.html'.format(name)})
    with open(os.path.join(settings.destination, "touch.html"), "w") as f:
        f.write(render_template("index.html", links=links).encode("utf-8"))
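
# Typical invocation, assuming a settings.py as sketched near the top:
#
#   $ python rs2nk.py
#
# This rebuilds settings.destination with one HTML page per feed entry,
# per-feed index pages, and a top-level touch.html linking to each feed.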