Commits

David Larlet  committed a12d10e

Fix a bug that caused documents to be indexed multiple times when FORCE_REFRESH=True

  • Participants
  • Parent commits 68199f5

Comments (0)

Files changed (6)

File src/static/default.css

               color: brown; font-family: 'Georgia'; }
 .source     { font-weight: lighter; font-size: 20px; color: black;
               text-align: right; float: right; }
-.home a     { text-decoration: none; }
+.backhome a { text-decoration: none; }
 .content h1,
 .content h2,
 .content h3 { font-weight: normal; font-family: 'Georgia'; }
 .content a  { color: #11557C; }
 .content img{ float: left; margin: 1em; }
+.match      { font-weight: bold; }

File src/static/werkzeug.css

               font-weight: normal; color: #11557C; }
 .source     { font-weight: lighter; font-size: 20px; color: black;
               text-align: right; float: right; }
-.home a     { text-decoration: none; }
+.backhome a { text-decoration: none; }
 .content h1,
 .content h2,
 .content h3 { font-weight: normal; }
 .content a  { color: #11557C; }
 .content img{ float: left; margin: 1em; }
+.match      { font-weight: bold; }

File src/storage.py

 import requests
 from readability.readability import Document
 
+from whoosh.query import Term
 from whoosh.fields import Schema, ID, TEXT
 from whoosh.index import create_in, open_dir, EmptyIndexError
 
 
         `title` and `content` are cached in Redis.
         """
+        search_writer = self.search_index.writer()
+
+        # Remove existing content from Redis and Whoosh
         if FORCE_REFRESH:
             self.redis.delete('title:' + url)
             self.redis.delete('content:' + url)
+            query = Term('url', url)
+            search_writer.delete_by_query(query)
 
         # Checks from Redis' cache
         if self.redis.exists('title:' + url):
         self.redis.set('content:' + url, content.encode('utf-8'))
 
         # Indexes the retrieved content in Whoosh
-        search_writer = self.search_index.writer()
         search_writer.add_document(url=url,
                                    title=unicode(title),
                                    content=unicode(strip_tags(content)))

File src/templates/render_content.html

 {% block title %}Browsing {{ url }}{% endblock %}
 {% block body %}
   <h3 class="source">Source: <a href="{{ url }}">{{ url|hostname }}</a></h3>
-  <h1 class="home"><a href=/>← <img src="/static/home.png" alt="Home" /></a></h1>
+  <h1 class="backhome"><a href=/>← <img src="/static/home.png" alt="Home" /></a></h1>
   <h2 class="title">{{ title }}</h2>
     {% if error %}
       <p class=error><strong>Error:</strong> {{ error }}

File src/templates/render_results.html

 {% extends "layout.html" %}
 {% block title %}Looking for {{ terms }}{% endblock %}
 {% block body %}
-  <h1><a href=/>← <img src="/static/home.png" alt="Home" /></a></h1>
+  <h1 class="backhome"><a href=/>← <img src="/static/home.png" alt="Home" /></a></h1>
   {% for hit in results %}
-      <h2><a href="{{ hit.url|proxify }}">{{ hit.title }}</a> ({{ hit.url|hostname }})</h2>
+      <h3 class="result-title"><a href="{{ hit.url|proxify }}">{{ hit.title }}</a> ({{ hit.url|hostname }})</h3>
       {{ hit|highlights("content")|safe }}
   {% else %}
       <h2>No results found :(</h2>

File src/utils.py

 def highlights(hit, column_name):
     """Returns the highlighted extract for a given `hit`."""
     return hit.highlights(column_name)
-