Source

contentbrowser / src / storage.py

# -*- coding: utf-8 -*-
import os

import redis
import requests

from whoosh.query import Term
from whoosh.fields import Schema, ID, TEXT
from whoosh.index import create_in, open_dir, EmptyIndexError

from settings import (
    FORCE_REFRESH, USE_DEBUGGER,
    REDIS_HOST, REDIS_PORT, WHOOSH_INDEX
)
from document import BrowserDocument
from utils import guess_encoding, strip_tags


class Storage(object):
    """Storage layer dealing with fetching, persistence and search.

    Titles and contents are cached in Redis under the ``title:<url>`` and
    ``content:<url>`` keys (UTF-8 encoded), and every retrieved document
    is indexed in the Whoosh index located at `WHOOSH_INDEX`.
    """

    def __init__(self):
        """Connects to Redis and opens (or creates) the Whoosh index."""
        # Redis configuration
        self.redis = redis.Redis(REDIS_HOST, REDIS_PORT)

        # Creating Whoosh index if absent. `makedirs` is used so that a
        # nested index path whose parents are missing works too.
        whoosh_index = WHOOSH_INDEX
        if not os.path.exists(whoosh_index):
            os.makedirs(whoosh_index)
        try:
            self.search_index = open_dir(whoosh_index)
        except EmptyIndexError:
            # The directory exists but holds no index yet: create one.
            schema = Schema(url=ID(stored=True),
                            title=TEXT(stored=True),
                            content=TEXT(stored=True))
            self.search_index = create_in(whoosh_index, schema)

    @property
    def searcher(self):
        """Quick access to Whoosh's searcher.

        A new searcher is requested from the index on every access, so
        callers should keep a reference to the returned object (and
        close it once done) rather than reading the property repeatedly.
        """
        return self.search_index.searcher()

    def retrieve_content(self, url):
        """Retrieves title and content from distant `url` or cache.

        `title` and `content` are cached in Redis and the text content
        is indexed in Whoosh the first time the `url` is fetched.

        Returns a `(title, content)` tuple.
        """
        # Remove existing content from Redis and Whoosh
        if FORCE_REFRESH:
            self.delete_content(url)

        # Checks from Redis' cache. Both values must be present: a
        # missing `content:` key (partial write, eviction...) is treated
        # as a cache miss instead of crashing on `None.decode`.
        title = self.redis.get('title:' + url)
        content = self.redis.get('content:' + url)
        if title is not None and content is not None:
            return title.decode('utf-8'), content.decode('utf-8')

        # Fallbacks to distant content
        title, content = self.extract_meta(url)

        # Indexes the retrieved content in Whoosh *before* caching it:
        # if indexing fails nothing is cached, so the next call retries
        # instead of serving a cached page that is never searchable.
        # Used as a context manager, the writer commits on success and
        # cancels (releasing the index lock) on error.
        with self.search_index.writer() as search_writer:
            # Drop any stale entry for this URL (e.g. cache was flushed
            # but the index was not) to avoid duplicated search results.
            search_writer.delete_by_term('url', url)
            search_writer.add_document(url=url,
                                       title=unicode(title),
                                       content=unicode(strip_tags(content)))

        # Caches the freshly retrieved content
        self.redis.set('title:' + url, title.encode('utf-8'))
        self.redis.set('content:' + url, content.encode('utf-8'))
        return title, content

    def retrieve_archives(self):
        """Retrieves all available URLs and associated titles

        in a list of dicts (`url` and `title` keys) sorted by title,
        case-insensitively.
        """
        archives = []
        for key in self.redis.keys('title:*'):
            title = self.redis.get(key)
            if title is None:
                # The key has been deleted since `keys()` listed it.
                continue
            archives.append({
                # Only split on the first colon: URLs contain colons.
                'url': key.split(':', 1)[1],
                'title': title.decode('utf-8')
            })
        archives.sort(key=lambda d: d['title'].lower())
        return archives

    def delete_content(self, url):
        """Deletes all local data related to the given `url`."""
        # Redis first: the Whoosh writer (and its lock on the index) is
        # only acquired once the Redis calls can no longer fail.
        self.redis.delete('title:' + url, 'content:' + url)
        # Commits on success, cancels the writer on error.
        with self.search_index.writer() as search_writer:
            search_writer.delete_by_query(Term('url', url))

    def extract_meta(self, url, timeout=30):
        """From an URL, extract title and content using Readability.

        The title is the one returned by the document's `short_title`
        method. The content is requested as an HTML partial so that it
        doesn't contain `<body>` tags and is directly embeddable in the
        template and rendered as is.

        `timeout` is the number of seconds to wait for the distant
        server before giving up; without it an unresponsive host would
        block the caller forever.

        Returns a `(title, content)` tuple.
        """
        # Retrieves the resource and turns it into a Readability doc
        response = requests.get(url, timeout=timeout)
        # Overrides the encoding used by `response.text` below
        response.encoding = guess_encoding(response)
        document = BrowserDocument(response.text, debug=USE_DEBUGGER)

        # The short title is more concise and readable
        title = document.short_title()
        content = document.summary(html_partial=True, current_url=url)
        return title, content