# Source: contentbrowser/src/browser.py
# -*- coding: utf-8 -*-
import os
import urlparse

from werkzeug.utils import redirect
from werkzeug.routing import Map, Rule
from werkzeug.exceptions import HTTPException
from werkzeug.wsgi import SharedDataMiddleware
from werkzeug.wrappers import Request, Response
from werkzeug.urls import url_quote_plus, url_unquote_plus
from jinja2 import Environment, FileSystemLoader

import redis
import requests
from readability.readability import Document

from whoosh.query import Or, Term
from whoosh.fields import Schema, ID, TEXT
from whoosh.highlight import ContextFragmenter
from whoosh.index import create_in, open_dir, EmptyIndexError

from settings import (
    LOCAL_HOST, LOCAL_PORT, FORCE_REFRESH, CSS_THEME,
    REDIS_HOST, REDIS_PORT, WHOOSH_INDEX,
    USE_DEBUGGER, USE_RELOADER
)
from utils import (
    get_hostname, guess_encoding, highlights, is_valid_url, strip_tags
)


class BrowserDocument(Document):
    """Readability `Document` that keeps an already parsed HTML tree.

    The parsed tree is only built once, on first access. Parsing it
    again would overwrite the custom links' replacement before the
    title and the content are extracted.
    """

    def _html(self, force=None):
        # Custom: `force` is accepted but deliberately ignored, a forced
        # parse would discard the links rewritten on the current tree
        parsed = self.html
        if parsed is None:
            parsed = self.html = self._parse(self.input)
        return parsed


class Browser(object):
    """The WSGI application to browse contents from URIs.

    Views are prefixed with `on_`; `dispatch_request` looks them up from
    the endpoint names declared in `url_map`.
    """

    # Seconds to wait for a distant server before giving up on a fetch
    request_timeout = 10

    def __init__(self):
        """Configures the proxy address, Redis, Whoosh, Jinja and routes."""
        # Local configuration
        self.proxy_url = "{local_host}:{local_port}".format(
            local_host=LOCAL_HOST,
            local_port=LOCAL_PORT
        )
        self.force_refresh = FORCE_REFRESH

        # Redis configuration
        self.redis = redis.Redis(REDIS_HOST, REDIS_PORT)

        # Creating Whoosh index if absent
        whoosh_index = WHOOSH_INDEX
        if not os.path.exists(whoosh_index):
            os.mkdir(whoosh_index)
        try:
            self.search_index = open_dir(whoosh_index)
        except EmptyIndexError:
            schema = Schema(url=ID(stored=True),
                            title=TEXT(stored=True),
                            content=TEXT(stored=True))
            self.search_index = create_in(whoosh_index, schema)

        # Jinja environment and filters
        template_path = os.path.join(os.path.dirname(__file__), 'templates')
        self.jinja_env = Environment(loader=FileSystemLoader(template_path),
                                     autoescape=True)
        self.jinja_env.filters['hostname'] = get_hostname
        self.jinja_env.filters['highlights'] = highlights
        self.jinja_env.filters['proxify'] = self.__prepend_proxy_url

        # Local URLs
        self.url_map = Map([
            Rule('/', endpoint='new_url'),
            Rule('/r', endpoint='render_content'),
            Rule('/s', endpoint='search_content'),
        ])

    def on_new_url(self, request):
        """View that displays forms to submit a URL or search contents.

        A valid posted `url` redirects to `/r`, a posted `search`
        redirects to `/s`; anything else renders the forms, with an
        error message when the posted URL is rejected by `is_valid_url`.
        """
        error = None
        url = ''
        if request.method == 'POST':
            if 'url' in request.form:
                # Deals with the submitted URL (validity + quoting)
                url = request.form['url']
                if not is_valid_url(url):
                    error = 'Please enter a valid URL'
                else:
                    quoted_url = url_quote_plus(url)
                    return redirect('/r?url=%s' % quoted_url)
            elif 'search' in request.form:
                search = request.form['search']
                return redirect('/s?terms=%s' % url_quote_plus(search))

        return self.render_template('new_url.html',
            url=url,
            error=error,
            theme=CSS_THEME,
            proxy_url=self.proxy_url
        )

    def on_render_content(self, request):
        """View that renders content from the `url` GET argument.

        Redirects to `/` when the argument is missing. Failures raised
        by `requests` while fetching are reported in the rendered page
        instead of being propagated.
        """
        error = None
        title, content = '', ''

        # Werkzeug has already unquoted the values of `request.args`;
        # unquoting once more would corrupt URLs holding `+` or `%xx`
        url = request.args.get('url')
        if url is None:
            return redirect('/')

        # Retrieves title and content from the given URL
        try:
            title, content = self.retrieve_content(url)
        except requests.exceptions.SSLError:
            error = ('You tried to reach a web page with an invalid '
                     '(or self-signed) SSL certificate. This case is '
                     'not handled for now due to related security issues.')
        except requests.exceptions.RequestException:
            # Unsupported scheme, timeout, refused connection...
            error = ('The web page could not be retrieved. Please check '
                     'the URL and try again.')

        return self.render_template('render_content.html',
            url=url,
            title=title,
            error=error,
            theme=CSS_THEME,
            content=content
        )

    def on_search_content(self, request):
        """View that returns URLs related to the `terms` GET argument.

        Redirects to `/` when the argument is missing. A document
        matches as soon as one of the submitted words is found in its
        title or its content.
        """
        # Already unquoted by Werkzeug, see `on_render_content`
        terms = request.args.get('terms')
        if terms is None:
            return redirect('/')

        # A `Term` is compared verbatim with the indexed tokens, so each
        # word is looked up on its own. Lower-casing assumes the index
        # uses the default `TEXT` analysis declared in `__init__`.
        # Blank input falls back to the raw value.
        words = terms.lower().split() or [terms]
        clauses = []
        for word in words:
            clauses.append(Term("content", word))
            clauses.append(Term("title", word))

        # Searches terms in Whoosh index; the template is rendered while
        # the searcher is still open
        with self.search_index.searcher() as searcher:
            results = searcher.search(Or(clauses))
            results.fragmenter = ContextFragmenter(maxchars=100, surround=150)

            return self.render_template('render_results.html',
                results=results,
                theme=CSS_THEME
            )

    def retrieve_content(self, url):
        """Retrieves title and content from distant `url` or cache.

        `title` and `content` are cached in Redis under the `title:<url>`
        and `content:<url>` keys, and indexed in Whoosh when they are
        fetched. Returns a `(title, content)` tuple; exceptions raised
        by `extract_meta` are propagated.
        """
        title_key = 'title:' + url
        content_key = 'content:' + url

        if self.force_refresh:
            self.redis.delete(title_key, content_key)

        # Checks from Redis' cache: both entries are required, a lone
        # title (its content being gone) is treated as a miss
        cached_title = self.redis.get(title_key)
        cached_content = self.redis.get(content_key)
        if cached_title is not None and cached_content is not None:
            return cached_title.decode('utf-8'), cached_content.decode('utf-8')

        # Fallbacks to distant content and cache
        title, content = self.extract_meta(url)
        self.redis.set(title_key, title.encode('utf-8'))
        self.redis.set(content_key, content.encode('utf-8'))

        # Indexes the retrieved content in Whoosh. The writer commits on
        # success and cancels on error, so its lock is always released;
        # former entries for this URL are dropped to avoid duplicates.
        with self.search_index.writer() as search_writer:
            search_writer.delete_by_term('url', url)
            search_writer.add_document(url=url,
                                       title=unicode(title),
                                       content=unicode(strip_tags(content)))
        return title, content

    def __prepend_proxy_url(self, link):
        """From an URL, prepend the proxy URL to be able to browse
        content from article to article.

        The link is quoted to prevent multiple arguments' errors.
        """
        return u"http://{proxy_url}/r?url={quoted_url}".format(
            proxy_url=self.proxy_url,
            quoted_url=url_quote_plus(link)
        )

    def extract_meta(self, url):
        """From an URL, extract title and content using Readability.

        The title is shortened through the `short_title` native method.
        The content doesn't contain `<body>` tags to be directly
        embeddable in the template and rendered as is.

        Returns a `(title, content)` tuple; exceptions raised by
        `requests` (including timeouts) are propagated.
        """
        # Retrieves the resource and turns it into a Readability doc;
        # the timeout keeps a silent server from blocking the request
        response = requests.get(url, timeout=self.request_timeout)
        response.encoding = guess_encoding(response)
        document = BrowserDocument(response.text)

        # Links are resolved against the URL that was finally served,
        # which also covers redirects and path-relative links
        page_url = response.url or url
        document.html = self.rewrite_links(document._html(), page_url)

        # The short title is more concise and readable
        title = document.short_title()
        content = document.summary(html_partial=True)
        return title, content

    def rewrite_links(self, html, base_url):
        """Returns transformed HTML with proxied and absolute links.

        All links and images are turned to absolute for rendering,
        links are proxied to be able to browse content from peer to peer.

        `html` must provide `iterlinks()` and is modified in place.
        `base_url` is what relative links are resolved against, either
        a bare `scheme://host` or the full URL of the page. Links that
        only make up a part of their attribute (`style`, meta refresh)
        or that live in text are left untouched.
        """
        for element, attribute, link, position in html.iterlinks():
            # Links found in text (e.g. `@import` within a `<style>`)
            # come without an attribute to assign to
            if attribute is None:
                continue

            link = link.strip()
            raw_value = element.attrib.get(attribute)
            if not link or raw_value is None or raw_value.strip() != link:
                # Replacing the whole attribute would destroy the rest
                # of its value
                continue

            # Deals with root-relative, path-relative and `//host` links
            link = urlparse.urljoin(base_url, link)

            if attribute == "src":  # Do not proxy images' URLs
                element.attrib[attribute] = link
            else:
                element.attrib[attribute] = self.__prepend_proxy_url(link)
        return html

    ## WERKZEUG INTERNALS (see http://werkzeug.pocoo.org/docs/tutorial/)

    def render_template(self, template_name, **context):
        """Renders the Jinja template with `context` as an HTML response."""
        t = self.jinja_env.get_template(template_name)
        return Response(t.render(context), mimetype='text/html')

    def dispatch_request(self, request):
        """Calls the `on_<endpoint>` view matching the request's URL.

        HTTP errors (no matching rule included) are returned rather
        than raised, so that they are served as regular responses.
        """
        adapter = self.url_map.bind_to_environ(request.environ)
        try:
            endpoint, values = adapter.match()
            return getattr(self, 'on_' + endpoint)(request, **values)
        except HTTPException as e:
            return e

    def wsgi_app(self, environ, start_response):
        """The actual WSGI entry point, kept apart to allow middlewares."""
        request = Request(environ)
        response = self.dispatch_request(request)
        return response(environ, start_response)

    def __call__(self, environ, start_response):
        return self.wsgi_app(environ, start_response)


if __name__ == '__main__':
    from werkzeug.serving import run_simple

    # Static assets live next to this module and are served on /static
    static_dir = os.path.join(os.path.dirname(__file__), 'static')

    app = Browser()
    app.wsgi_app = SharedDataMiddleware(app.wsgi_app, {'/static': static_dir})

    # Runs the development server with the options from the settings
    run_simple(
        LOCAL_HOST,
        LOCAL_PORT,
        app,
        use_debugger=USE_DEBUGGER,
        use_reloader=USE_RELOADER
    )