Commits

hc committed 0dfe3d9

strip comments from html

Comments (0)

Files changed (1)

plugin_h_search/__init__.py

         self.db = globals_['db']
         self.url = URL(r=request, args=request.args, vars=request.get_vars)
         self.frequency = frequency
-        self.re_splitwords = re.compile(r'\W*')
     
     def add(self, data, url=None):
         " Add data to search index "
         for tag in ('script', 'style'):
             for t in soup(tag):
                 t.extract()
-        for comments in soup.findAll(text=lambda t: isinstance(t, Comment)):
-            for comment in comments:
-                comment.extract         
+        for comment in soup.findAll(text=lambda t: isinstance(t, Comment)):
+            comment.extract()       
         # extract page title
         title = soup.html.head.title.string or 'Untitled'
-        # strip html
+        # strip html and remove newlines
         contents = ' '.join(soup.findAll(text=True))
+        contents = re.sub('\n', ' ', contents)
         # split words
-        words = (w.lower() for w in self.re_splitwords.split(data) if w!='')
+        r = re.compile(r'\W*')
+        words = (w.lower() for w in r.split(data) if w!='')
         # remove stopwords
         words = (w for w in words if w not in STOPWORDS)
         # stem words
     def search(self, search_query, namespace=None, limit=15):
         results = self._query_index(search_query, namespace)
         if results:
-            scores = Scorer(results.values()).score(position_weight=2)
+            scores = Scorer(results.values()).score()
             db = self.db
             for score, ref_page in sorted(zip(scores, results.keys())[:limit],
                 reverse=True):
                 record = db[TABLE_PAGE][ref_page]
                 s = Storage()
-                s.title = record.title
+                s.title = self._highlight_matches(record.title)
                 s.snippet = self._get_snippet(record.contents)
                 s.url = record.url
                 yield s
             p = r'\b[\w\s]{0,%s}' % PRECONTEXT + p + r'.{0,%s}\b' % POSTCONTEXT
             self.regex = re.compile(p, re.I| re.S)
         best_len = sum(map(len, self.words)) + PRECONTEXT + POSTCONTEXT
-        snippet = sorted(self.regex.findall(text),
-            key=lambda x: abs(len(x)-best_len))[0][:400]
-        regex = re.compile(r'(%s)' % '|'.join(self.words), re.I)
-        return regex.subn(r'<b>\1</b>', snippet)[0]
+        try:
+            snippet = sorted(self.regex.findall(text),
+                key=lambda x: abs(len(x)-best_len))[0][:400]
+        except IndexError:
+            snippet = ''
+        return self._highlight_matches(snippet)
+    
+    def _highlight_matches(self, text):
+        r = re.compile(r'(%s)' % '|'.join(self.words), re.IGNORECASE)
+        return r.subn(r'<b>\1</b>', text)[0]
     
 
 class Scorer(object):