Commits

David Larlet committed 68199f5

Split storage part to a dedicated file/class, thanks @tarek for the suggestion.

  • Participants
  • Parent commits dda42c8

Comments (0)

Files changed (3)

File src/browser.py

 # -*- coding: utf-8 -*-
 import os
-import urlparse
 
 from werkzeug.utils import redirect
 from werkzeug.routing import Map, Rule
 from werkzeug.urls import url_quote_plus, url_unquote_plus
 from jinja2 import Environment, FileSystemLoader
 
-import redis
 import requests
-from readability.readability import Document
 
 from whoosh.query import Or, Term
-from whoosh.fields import Schema, ID, TEXT
 from whoosh.highlight import ContextFragmenter
-from whoosh.index import create_in, open_dir, EmptyIndexError
 
+from storage import Storage
 from settings import (
-    LOCAL_HOST, LOCAL_PORT, FORCE_REFRESH, CSS_THEME,
-    REDIS_HOST, REDIS_PORT, WHOOSH_INDEX,
-    USE_DEBUGGER, USE_RELOADER
+    LOCAL_HOST, LOCAL_PORT, PROXY_URL,
+    CSS_THEME, USE_DEBUGGER, USE_RELOADER
 )
-from utils import (
-    get_hostname, guess_encoding, highlights, is_valid_url, strip_tags
-)
-
-
-class BrowserDocument(Document):
-    """A custom Readability's Document to deal with contents' filters.
-
-    The main goal for now is to do not force the parsing of the HTML
-    if it has already been done, otherwise the custom links' replacement
-    will be overidden before the extract of title and content.
-    """
-
-    def _html(self, force=None):
-        # Custom: getting rid of the force parse arg
-        # which overwrite the custom links' replacement
-        if self.html is None:
-            self.html = self._parse(self.input)
-        return self.html
+from utils import get_hostname, highlights, is_valid_url
 
 
 class Browser(object):
-    """The WSGI application to browse contents from URIs.
-
-    Views are prefixed with `on_`.
-    """
+    """The WSGI application to browse contents from URIs."""
 
     def __init__(self):
-        # Local configuration
-        self.proxy_url = "{local_host}:{local_port}".format(
-            local_host=LOCAL_HOST,
-            local_port=LOCAL_PORT
-        )
-        self.force_refresh = FORCE_REFRESH
-
-        # Redis configuration
-        self.redis = redis.Redis(REDIS_HOST, REDIS_PORT)
-
-        # Creating Whoosh index if absent
-        whoosh_index = WHOOSH_INDEX
-        if not os.path.exists(whoosh_index):
-            os.mkdir(whoosh_index)
-        try:
-            self.search_index = open_dir(whoosh_index)
-        except EmptyIndexError:
-            schema = Schema(url=ID(stored=True),
-                            title=TEXT(stored=True),
-                            content=TEXT(stored=True))
-            self.search_index = create_in(whoosh_index, schema)
+        # Storage configuration
+        self.storage = Storage()
 
         # Jinja environment and filters
         template_path = os.path.join(os.path.dirname(__file__), 'templates')
                                      autoescape=True)
         self.jinja_env.filters['hostname'] = get_hostname
         self.jinja_env.filters['highlights'] = highlights
-        self.jinja_env.filters['proxify'] = self.__prepend_proxy_url
+        self.jinja_env.filters['proxify'] = self.storage.prepend_proxy_url
 
         # Local URLs
         self.url_map = Map([
             url=url,
             error=error,
             theme=CSS_THEME,
-            proxy_url=self.proxy_url
+            proxy_url=PROXY_URL
         )
 
     def on_render_content(self, request):
 
         # Retrieves title and content from the given URL
         try:
-            title, content = self.retrieve_content(url)
+            title, content = self.storage.retrieve_content(url)
         except requests.exceptions.SSLError:
             error = ('You tried to reach a web page with an invalid '
                      '(or self-signed) SSL certificate. This case is '
         terms = url_unquote_plus(quoted_terms)
 
         # Searches terms in Whoosh index
-        with self.search_index.searcher() as searcher:
+        with self.storage.searcher as searcher:
             query = Or([Term("content", terms), Term("title", terms)])
             results = searcher.search(query)
             results.fragmenter = ContextFragmenter(maxchars=100, surround=150)
                 theme=CSS_THEME
             )
 
-    def retrieve_content(self, url):
-        """Retrieves title and content from distant `url` or cache.
-
-        `title` and `content` are cached in Redis.
-        """
-        if self.force_refresh:
-            self.redis.delete('title:' + url)
-            self.redis.delete('content:' + url)
-
-        # Checks from Redis' cache
-        if self.redis.exists('title:' + url):
-            title = self.redis.get('title:' + url)
-            content = self.redis.get('content:' + url)
-            return title.decode('utf-8'), content.decode('utf-8')
-
-        # Fallbacks to distant content and cache
-        title, content = self.extract_meta(url)
-        self.redis.set('title:' + url, title.encode('utf-8'))
-        self.redis.set('content:' + url, content.encode('utf-8'))
-
-        # Indexes the retrieved content in Whoosh
-        search_writer = self.search_index.writer()
-        search_writer.add_document(url=url,
-                                   title=unicode(title),
-                                   content=unicode(strip_tags(content)))
-        search_writer.commit()
-        return title, content
-
-    def __prepend_proxy_url(self, link):
-        """From an URL, prepend the proxy URL to be able to browse
-
-        content from article to article.
-        The link is quoted to prevent multiple arguments' errors.
-        """
-        return u"http://{proxy_url}/r?url={quoted_url}".format(
-            proxy_url=self.proxy_url,
-            quoted_url=url_quote_plus(link)
-        )
-
-    def extract_meta(self, url):
-        """From an URL, extract title and content using Readability.
-
-        The title is shortened through the `short_title` native method.
-        The content doesn't contain `<body>` tags to be directly
-        embeddable in the template and rendered as is.
-        """
-        # Base URL construction to deal with relative links
-        parsed_url = urlparse.urlparse(url)
-        base_url = "{scheme}://{netloc}".format(
-            scheme=parsed_url.scheme,
-            netloc=parsed_url.netloc
-        )
-
-        # Retrieves the resource and turns it into a Readability doc
-        response = requests.get(url)
-        response.encoding = guess_encoding(response)
-        document = BrowserDocument(response.text)
-        document.html = self.rewrite_links(document._html(), base_url)
-
-        # The short title is more concise and readable
-        title = document.short_title()
-        content = document.summary(html_partial=True)
-        return title, content
-
-    def rewrite_links(self, html, base_url):
-        """Returns transformed HTML with proxied and absolute links.
-
-        All links and images are turned to absolute for rendering,
-        links are proxied to be able to browse content from peer to peer.
-        """
-        for element, attribute, link, position in html.iterlinks():
-            link = link.strip()
-            if link.startswith("/"):  # Deal with relative links
-                link = base_url + link
-
-            if attribute == "src":  # Do not proxy images' URLs
-                element.attrib[attribute] = link
-            else:
-                element.attrib[attribute] = self.__prepend_proxy_url(link)
-        return html
-
     ## WERKZEUG INTERNALS (see http://werkzeug.pocoo.org/docs/tutorial/)
 
     def render_template(self, template_name, **context):

File src/settings.py

 # Proxy's settings
 LOCAL_HOST = '127.0.0.1'
 LOCAL_PORT = 5000
-FORCE_REFRESH = False  # Do not retrieve data from Redis' cache
+FORCE_REFRESH = False  # Will not retrieve data from Redis' cache if True
 CSS_THEME = 'default'  # Based on your CSS filename, try 'werkzeug'
 
 # Redis' settings
 # Werkzeug's settings
 USE_DEBUGGER = True
 USE_RELOADER = True
+
+# You shouldn't have to customize below this line
+PROXY_URL = "{local_host}:{local_port}".format(
+    local_host=LOCAL_HOST,
+    local_port=LOCAL_PORT
+)

File src/storage.py

+# -*- coding: utf-8 -*-
+import os
+import urlparse
+
+from werkzeug.urls import url_quote_plus
+
+import redis
+import requests
+from readability.readability import Document
+
+from whoosh.fields import Schema, ID, TEXT
+from whoosh.index import create_in, open_dir, EmptyIndexError
+
+from settings import (
+    PROXY_URL, FORCE_REFRESH, REDIS_HOST, REDIS_PORT, WHOOSH_INDEX
+)
+from utils import guess_encoding, strip_tags
+
+
+class BrowserDocument(Document):
+    """A custom Readability's Document to deal with contents' filters.
+
+    The main goal for now is to not force the parsing of the HTML
+    if it has already been done; otherwise the custom links' replacement
+    will be overridden before the extraction of title and content.
+    """
+
+    def _html(self, force=None):
+        # Custom: getting rid of the force-parse arg,
+        # which overwrites the custom links' replacement
+        if self.html is None:
+            self.html = self._parse(self.input)
+        return self.html
+
+
+class Storage(object):
+    """The storage that deals with parsing, persistence and search."""
+
+    def __init__(self):
+        # Redis configuration
+        self.redis = redis.Redis(REDIS_HOST, REDIS_PORT)
+
+        # Creating Whoosh index if absent
+        whoosh_index = WHOOSH_INDEX
+        if not os.path.exists(whoosh_index):
+            os.mkdir(whoosh_index)
+        try:
+            self.search_index = open_dir(whoosh_index)
+        except EmptyIndexError:
+            schema = Schema(url=ID(stored=True),
+                            title=TEXT(stored=True),
+                            content=TEXT(stored=True))
+            self.search_index = create_in(whoosh_index, schema)
+
+    @property
+    def searcher(self):
+        """Quick access to Whoosh's searcher."""
+        return self.search_index.searcher()
+
+    def retrieve_content(self, url):
+        """Retrieves title and content from distant `url` or cache.
+
+        `title` and `content` are cached in Redis.
+        """
+        if FORCE_REFRESH:
+            self.redis.delete('title:' + url)
+            self.redis.delete('content:' + url)
+
+        # Checks from Redis' cache
+        if self.redis.exists('title:' + url):
+            title = self.redis.get('title:' + url)
+            content = self.redis.get('content:' + url)
+            return title.decode('utf-8'), content.decode('utf-8')
+
+        # Falls back to distant content and cache
+        title, content = self.extract_meta(url)
+        self.redis.set('title:' + url, title.encode('utf-8'))
+        self.redis.set('content:' + url, content.encode('utf-8'))
+
+        # Indexes the retrieved content in Whoosh
+        search_writer = self.search_index.writer()
+        search_writer.add_document(url=url,
+                                   title=unicode(title),
+                                   content=unicode(strip_tags(content)))
+        search_writer.commit()
+        return title, content
+
+    def prepend_proxy_url(self, link):
+        """From a URL, prepend the proxy URL to be able to browse
+
+        content from article to article.
+        The link is quoted to prevent multiple arguments' errors.
+        """
+        return u"http://{proxy_url}/r?url={quoted_url}".format(
+            proxy_url=PROXY_URL,
+            quoted_url=url_quote_plus(link)
+        )
+
+    def extract_meta(self, url):
+        """From a URL, extract title and content using Readability.
+
+        The title is shortened through the `short_title` native method.
+        The content doesn't contain `<body>` tags to be directly
+        embeddable in the template and rendered as is.
+        """
+        # Base URL construction to deal with relative links
+        parsed_url = urlparse.urlparse(url)
+        base_url = "{scheme}://{netloc}".format(
+            scheme=parsed_url.scheme,
+            netloc=parsed_url.netloc
+        )
+
+        # Retrieves the resource and turns it into a Readability doc
+        response = requests.get(url)
+        response.encoding = guess_encoding(response)
+        document = BrowserDocument(response.text)
+        document.html = self.rewrite_links(document._html(), base_url)
+
+        # The short title is more concise and readable
+        title = document.short_title()
+        content = document.summary(html_partial=True)
+        return title, content
+
+    def rewrite_links(self, html, base_url):
+        """Returns transformed HTML with proxied and absolute links.
+
+        All links and images are turned to absolute for rendering,
+        links are proxied to be able to browse content from peer to peer.
+        """
+        for element, attribute, link, position in html.iterlinks():
+            link = link.strip()
+            if link.startswith("/"):  # Deal with relative links
+                link = base_url + link
+
+            if attribute == "src":  # Do not proxy images' URLs
+                element.attrib[attribute] = link
+            else:
+                element.attrib[attribute] = self.prepend_proxy_url(link)
+        return html