Commits

David Larlet committed e4829ce

Refactoring party, switching to a dedicated file for settings and removing the useless 404 page

Comments (0)

Files changed (4)

 ## Motivations
 
 * Fed up with bloated websites full of noise.
-* Tired of cmd++++ on each website I visit.
+* Tired of cmd+++++ on each website I visit.
 * Exhausted by crappy social widgets.
 
-## Installation
+## Installing
 
 Create a virtualenv (optional) and install dependencies from `requirements.txt` with pip:
 
     $ pip install -r requirements.txt
 
 You need to have a `redis-server` launched too for persistence/cache.
+Python 2.6+ is required, Python 3 is not (yet) supported.
 
-## Enjoyment
+## Enjoying
 
 Launch the python script:
 
 
 ## Hacking
 
-BSD licensed, just fork it:
+BSD licensed:
 
 > take my code with you  
 > and do whatever you want  
 > 
 > — [License Haiku](http://www.aaronsw.com/weblog/000360)
 
-## TODO
+## Helping
 
-* pre-fetching of links in background, need to be asynchronuous
-* add the location of the browsing using [doko](https://bitbucket.org/larsyencken/doko), can be useful for search (I often remember where I read articles)
+This is an *opinionated* tool, here are some features I'd like to add and
+I'll probably [not accept your pull-request](http://brianegranger.com/?p=249)
+otherwise, so if you plan to contribute, please
+[contact me](https://larlet.fr/david/) beforehand, or start a discussion using issues.
+
+* pre-fetching of links in background (on hover?), needs to be asynchronous
 * deal with non-article pages (homepages, lists, etc)
-* handle `CSS` themes with a good default
+* handle `CSS` themes with a good default one
 
 from werkzeug.utils import redirect
 from werkzeug.routing import Map, Rule
+from werkzeug.exceptions import HTTPException
 from werkzeug.wsgi import SharedDataMiddleware
 from werkzeug.wrappers import Request, Response
-from werkzeug.exceptions import HTTPException, NotFound
 from werkzeug.urls import url_quote_plus, url_unquote_plus
 from jinja2 import Environment, FileSystemLoader
 
 from whoosh.highlight import ContextFragmenter
 from whoosh.index import create_in, open_dir, EmptyIndexError
 
+from settings import (
+    LOCAL_HOST, LOCAL_PORT, FORCE_REFRESH,
+    REDIS_HOST, REDIS_PORT, WHOOSH_INDEX,
+    USE_DEBUGGER, USE_RELOADER
+)
+
 strip_tags_re = re.compile(r'</?\S([^=]*=(\s*"[^"]*"|\s*\'[^\']*\'|\S*)|[^>])*?>', re.IGNORECASE)
 meta_encoding_re = re.compile(r'<meta.*?charset=([^"\']+)', re.IGNORECASE)
 
     return strip_tags_re.sub(' ', content)
 
 
+def guess_encoding(response):
+    """Returns an HTML guessed encoding from a requests' response."""
+    encoding = response.encoding
+    if encoding == 'ISO-8859-1':
+        # By default, the fallback of the content-type text/html
+        # is ISO-8859-1, so in that case we double check that the
+        # encoding is not set in HTML's dedicated meta, see
+        # http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.7.1
+        # Warning: response.text MUST be reevaluated
+        encoding = re.findall(meta_encoding_re, response.text)
+        if encoding:
+            encoding = encoding[0]
+        else:  # guess from Charade as a final fallback
+            encoding = response.apparent_encoding
+    return encoding
+
+
 def is_valid_url(url):
     """Verifies the validity of the scheme for a given `url`."""
     parts = urlparse.urlparse(url)
     will be overidden before the extract of title and content.
     """
 
-    def _html(self, force=False):
+    def _html(self, force=None):
         # Custom: getting rid of the force parse arg
         # which overwrite the custom links' replacement
         if self.html is None:
 class Browser(object):
     """The WSGI application to browse contents from URIs."""
 
-    def __init__(self, config):
+    def __init__(self):
         # Local configuration
-        self.proxy_url = "%s:%s" % (config['local_host'], config['local_port'])
+        self.proxy_url = "{local_host}:{local_port}".format(
+            local_host=LOCAL_HOST,
+            local_port=LOCAL_PORT
+        )
+        self.force_refresh = FORCE_REFRESH
 
         # Redis configuration
-        self.redis = redis.Redis(config['redis_host'], config['redis_port'])
+        self.redis = redis.Redis(REDIS_HOST, REDIS_PORT)
 
         # Jinja environment and filters
         template_path = os.path.join(os.path.dirname(__file__), 'templates')
         self.jinja_env.filters['proxify'] = self.__prepend_proxy_url
 
         # Creating Whoosh index if absent
-        whoosh_index = config['whoosh_index']
+        whoosh_index = WHOOSH_INDEX
         if not os.path.exists(whoosh_index):
             os.mkdir(whoosh_index)
         try:
             elif 'search' in request.form:
                 search = request.form['search']
                 return redirect('/s?terms=%s' % url_quote_plus(search))
+
         return self.render_template('new_url.html',
             url=url,
             error=error,
             query = Or([Term("content", terms), Term("title", terms)])
             results = searcher.search(query)
             results.fragmenter = ContextFragmenter(maxchars=100, surround=150)
+
             return self.render_template('render_results.html',
                 results=results
             )
 
-    def retrieve_content(self, url, force_refresh=False):
+    def retrieve_content(self, url):
         """Retrieves title and content from distant `url` or cache.
 
         `title` and `content` are cached in Redis.
         """
-        if force_refresh:
+        if self.force_refresh:
             self.redis.delete('title:' + url)
             self.redis.delete('content:' + url)
 
                                    title=unicode(title),
                                    content=unicode(strip_tags(content)))
         search_writer.commit()
-
         return title, content
 
     def __prepend_proxy_url(self, link):
         content from article to article.
         The link is quoted to prevent multiple arguments' errors.
         """
-        return u"http://%(proxy_url)s/r?url=%(quoted_url)s" % {
-            "proxy_url": self.proxy_url,
-            "quoted_url": url_quote_plus(link)
-        }
+        return u"http://{proxy_url}/r?url={quoted_url}".format(
+            proxy_url=self.proxy_url,
+            quoted_url=url_quote_plus(link)
+        )
 
     def extract_meta(self, url):
         """From an URL, extract title and content using Readability.
         """
         # Base URL construction to deal with relative links
         parsed_url = urlparse.urlparse(url)
-        base_url = "%s://%s" % (parsed_url.scheme, parsed_url.netloc)
+        base_url = "{scheme}://{netloc}".format(
+            scheme=parsed_url.scheme,
+            netloc=parsed_url.netloc
+        )
 
         # Retrieves the resource and turns it into a Readability doc
         response = requests.get(url)
-        if response.encoding == 'ISO-8859-1':
-            # By default, the fallback of the content-type text/html
-            # is ISO-8859-1, so in that case we double check that the
-            # encoding is not set in HTML's dedicated meta, see
-            # http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.7.1
-            # Warning: response.text MUST be reevaluated
-            encoding = re.findall(meta_encoding_re, response.text)
-            if encoding:
-                response.encoding = encoding[0]
-            else:  # guess from Charade as a final fallback
-                response.encoding = response.apparent_encoding
+        response.encoding = guess_encoding(response)
         document = BrowserDocument(response.text)
-
-        # Explicitely parse the HTML to be able to rewrite links
-        # with base URL and prepend the proxy to all URLs (but images)
-        html = document._html()
-        for element, attribute, link, position in html.iterlinks():
-            # Deal with relative links
-            link = link.strip()
-            if link.startswith("/"):
-                link = base_url + link
-
-            # In case of an image, do not proxy the src URL
-            if attribute == "src":
-                element.attrib[attribute] = link
-            else:
-                element.attrib[attribute] = self.__prepend_proxy_url(link)
-        document.html = html
+        document.html = self.rewrite_links(document._html(), base_url)
 
         # The short title is more concise and readable
         title = document.short_title()
         content = document.summary(html_partial=True)
         return title, content
 
+    def rewrite_links(self, html, base_url):
+        """Returns transformed HTML with proxied and absolute links.
+
+        All links and images are turned to absolute for rendering,
+        links are proxied to be able to browse content from peer to peer.
+        """
+        for element, attribute, link, position in html.iterlinks():
+            link = link.strip()
+            if link.startswith("/"):  # Deal with relative links
+                link = base_url + link
+
+            if attribute == "src":  # Do not proxy images' URLs
+                element.attrib[attribute] = link
+            else:
+                element.attrib[attribute] = self.__prepend_proxy_url(link)
+        return html
+
     ## WERKZEUG INTERNALS (see http://werkzeug.pocoo.org/docs/tutorial/)
 
-    def error_404(self):
-        response = self.render_template('404.html')
-        response.status_code = 404
-        return response
-
     def render_template(self, template_name, **context):
         t = self.jinja_env.get_template(template_name)
         return Response(t.render(context), mimetype='text/html')
         try:
             endpoint, values = adapter.match()
             return getattr(self, 'on_' + endpoint)(request, **values)
-        except NotFound, e:
-            return self.error_404()
         except HTTPException, e:
             return e
 
         return self.wsgi_app(environ, start_response)
 
 
-def create_app(local_host, local_port,
-        redis_host='localhost', redis_port=6379,
-        whoosh_index='index', with_static=True):
-    app = Browser({
-        'local_host': local_host,
-        'local_port': local_port,
-        'redis_host': redis_host,
-        'redis_port': redis_port,
-        'whoosh_index': whoosh_index
-    })
-    if with_static:
-        app.wsgi_app = SharedDataMiddleware(app.wsgi_app, {
-            '/static':  os.path.join(os.path.dirname(__file__), 'static')
-        })
-    return app
-
-
 if __name__ == '__main__':
     from werkzeug.serving import run_simple
-    local_host = '127.0.0.1'
-    local_port = 5000
-    app = create_app(local_host, local_port)
-    run_simple(local_host, local_port, app,
-               use_debugger=True, use_reloader=True)
+    app = Browser()
+    app.wsgi_app = SharedDataMiddleware(app.wsgi_app, {
+        '/static':  os.path.join(os.path.dirname(__file__), 'static')
+    })
+    run_simple(LOCAL_HOST, LOCAL_PORT, app,
+               use_debugger=USE_DEBUGGER, use_reloader=USE_RELOADER)
+# Proxy's settings
+LOCAL_HOST = '127.0.0.1'
+LOCAL_PORT = 5000
+FORCE_REFRESH = False  # Do not retrieve data from Redis' cache
+
+# Redis' settings
+REDIS_HOST = 'localhost'
+REDIS_PORT = 6379
+
+# Whoosh's settings
+WHOOSH_INDEX = 'index'
+
+# Werkzeug's settings
+USE_DEBUGGER = True
+USE_RELOADER = True

src/templates/404.html

-{% extends "layout.html" %}
-{% block title %}Page Not Found{% endblock %}
-{% block body %}
-  <h2>Page Not Found</h2>
-  <p>I am sorry, but no such page was found here.
-{% endblock %}