# contentbrowser / src / storage.py
# -*- coding: utf-8 -*-
import os
import urlparse

from werkzeug.urls import url_quote_plus

import redis
import requests
from readability.readability import Document

from whoosh.query import Term
from whoosh.fields import Schema, ID, TEXT
from whoosh.index import create_in, open_dir, EmptyIndexError

from settings import (
    PROXY_URL, FORCE_REFRESH, REDIS_HOST, REDIS_PORT, WHOOSH_INDEX
)
from utils import guess_encoding, strip_tags


class BrowserDocument(Document):
    """A Readability ``Document`` customised for contents' filters.

    Its single purpose is to avoid re-parsing HTML that has already
    been parsed: a forced re-parse would discard the custom links'
    replacement before the title and content are extracted.
    """

    def _html(self, force=None):
        # Deliberately ignore the ``force`` re-parse argument, which
        # would overwrite the custom links' replacement.
        if self.html is not None:
            return self.html
        self.html = self._parse(self.input)
        return self.html


class Storage(object):
    """The storage that deals with parsing, persistence and search.

    Remote pages are fetched with `requests`, distilled through a
    Readability document, cached in Redis and indexed in a Whoosh
    full-text index.
    """

    def __init__(self):
        # Redis configuration
        self.redis = redis.Redis(REDIS_HOST, REDIS_PORT)

        # Creating Whoosh index if absent
        whoosh_index = WHOOSH_INDEX
        if not os.path.exists(whoosh_index):
            os.mkdir(whoosh_index)
        try:
            self.search_index = open_dir(whoosh_index)
        except EmptyIndexError:
            # The directory exists but holds no index yet: create one.
            schema = Schema(url=ID(stored=True),
                            title=TEXT(stored=True),
                            content=TEXT(stored=True))
            self.search_index = create_in(whoosh_index, schema)

    @property
    def searcher(self):
        """Quick access to Whoosh's searcher."""
        return self.search_index.searcher()

    def retrieve_content(self, url):
        """Retrieves title and content from distant `url` or cache.

        `title` and `content` are cached in Redis and indexed in
        Whoosh. Returns a `(title, content)` tuple of unicode strings.
        """
        # Remove existing content from Redis so the cache check below
        # misses and the content is re-fetched and re-indexed.
        if FORCE_REFRESH:
            self.redis.delete('title:' + url)
            self.redis.delete('content:' + url)

        # Checks from Redis' cache. Note: no Whoosh writer is opened
        # before this point — acquiring one and then returning early
        # here used to leave the index write lock held.
        if self.redis.exists('title:' + url):
            title = self.redis.get('title:' + url)
            content = self.redis.get('content:' + url)
            return title.decode('utf-8'), content.decode('utf-8')

        # Fallbacks to distant content and cache
        title, content = self.extract_meta(url)
        self.redis.set('title:' + url, title.encode('utf-8'))
        self.redis.set('content:' + url, content.encode('utf-8'))

        # Indexes the retrieved content in Whoosh. The writer is only
        # opened now, and any stale document for this URL is removed in
        # the same transaction as the fresh one.
        search_writer = self.search_index.writer()
        try:
            if FORCE_REFRESH:
                search_writer.delete_by_query(Term('url', url))
            search_writer.add_document(url=url,
                                       title=unicode(title),
                                       content=unicode(strip_tags(content)))
            search_writer.commit()
        except Exception:
            # Release the index write lock before propagating the error.
            search_writer.cancel()
            raise
        return title, content

    def prepend_proxy_url(self, link):
        """From an URL, prepend the proxy URL to be able to browse

        content from article to article.
        The link is quoted to prevent multiple arguments' errors.
        """
        return u"http://{proxy_url}/r?url={quoted_url}".format(
            proxy_url=PROXY_URL,
            quoted_url=url_quote_plus(link)
        )

    def extract_meta(self, url):
        """From an URL, extract title and content using Readability.

        The title is shortened through the `short_title` native method.
        The content doesn't contain `<body>` tags to be directly
        embeddable in the template and rendered as is.
        """
        # Base URL construction to deal with relative links
        parsed_url = urlparse.urlparse(url)
        base_url = "{scheme}://{netloc}".format(
            scheme=parsed_url.scheme,
            netloc=parsed_url.netloc
        )

        # Retrieves the resource and turns it into a Readability doc.
        # NOTE(review): no timeout is set, so a stalled remote host can
        # hang this call indefinitely — consider requests.get(url,
        # timeout=...) once an appropriate value is agreed on.
        response = requests.get(url)
        response.encoding = guess_encoding(response)
        document = BrowserDocument(response.text)
        document.html = self.rewrite_links(document._html(), base_url)

        # The short title is more concise and readable
        title = document.short_title()
        content = document.summary(html_partial=True)
        return title, content

    def rewrite_links(self, html, base_url):
        """Returns transformed HTML with proxied and absolute links.

        All links and images are turned to absolute for rendering,
        links are proxied to be able to browse content from peer to peer.
        """
        for element, attribute, link, position in html.iterlinks():
            link = link.strip()
            if link.startswith("/"):  # Deal with relative links
                link = base_url + link

            if attribute == "src":  # Do not proxy images' URLs
                element.attrib[attribute] = link
            else:
                element.attrib[attribute] = self.prepend_proxy_url(link)
        return html