Commits

Ian Lewis committed d2710a4

Item titles should be text

Comments (0)

Files changed (2)

lifestream/plugins/__init__.py

 import dateutil.parser
 import copy
 
-from django.utils.encoding import iri_to_uri
+from django.utils.encoding import iri_to_uri,force_unicode
 from django.db.models import Q
 from django.utils.html import strip_tags
 
 from lifestream.models import *
+from lifestream.util import convert_entities
 
 class FeedPlugin(object):
   
     feed_description = entry.get('description')
     if feed_contents:
         content_type = feed_contents[0]['type']
-        content = feed_contents[0]['value']
+        content = force_unicode(feed_contents[0]['value'])
         clean_content = strip_tags(content)
     elif feed_description:
         content_type = "text/html"
-        content = feed_description
-        clean_content = strip_tags(feed_description)
+        content = force_unicode(feed_description)
+        clean_content = strip_tags(content)
     else:
       content_type = None
       content = None
       clean_content = None
+
+    # Make sure the title is clean too.
+    title = convert_entities(strip_tags(force_unicode(entry.get('title'))))
     
     media_url = None
     media_content_attrs = entry.get('media_content_attrs')
     if media_player_attrs:
       media_player_url = media_player_attrs.get('url')
 
+    media_description = force_unicode(entry.get("media_description"))
+
     item = Item(feed = self.feed,
              date = entry.get('published'),
-             title = entry.get('title'),
+             title = title,
              content = content,
              content_type = content_type,
              clean_content = clean_content,
              media_url = media_url,
              media_thumbnail_url = thumbnail_url,
              media_player_url = media_player_url,
-             media_description = entry.get("media_description"),
+             media_description = media_description,
              media_description_type = media_description_type,
     )
     return item

lifestream/util.py

     "font-weight",
 )
 
+def escape_entities(text):
+    """
+    Escape text so it can be placed safely inside HTML.
+
+    An ampersand that does not already begin a named entity (such as
+    ``&lt;``) becomes ``&amp;``. The characters ``<``, ``>``, ``"`` and
+    ``'`` become ``&lt;``, ``&gt;``, ``&quot;`` and ``&apos;``.
+    """
+    # Bare ampersands are escaped first so that the entities introduced
+    # by the replacements below are not escaped a second time.
+    return re.sub(r'&(?![A-Za-z]+;)', '&amp;', text)\
+             .replace('<','&lt;')\
+             .replace('>', '&gt;')\
+             .replace('"', '&quot;')\
+             .replace("'", '&apos;')
+
+def convert_entities(text):
+    """
+    Replace the five predefined XML entities in ``text`` (``&amp;``,
+    ``&lt;``, ``&gt;``, ``&quot;``, ``&apos;``) with the characters
+    they stand for.
+
+    Returns None when ``text`` is None.
+    """
+    if text is None:
+        return None
+    # An ordered sequence rather than a dict: ``&amp;`` has to be decoded
+    # last, otherwise ``&amp;lt;`` would first become ``&lt;`` and then be
+    # decoded a second time into ``<``.
+    entities = (
+        (u'&lt;', u'<'),
+        (u'&gt;', u'>'),
+        (u'&quot;', u'"'),
+        (u'&apos;', u"'"),
+        (u'&amp;', u'&'),
+    )
+    for entity, char in entities:
+        text = text.replace(entity, char)
+    return text
+
 def sanitize_html(htmlSource, encoding=None, valid_tags=None, valid_styles=None):
     """
     Clean bad html content. Currently this simply strips tags that
             style += "%s:%s;" % (key,val.strip())
         tag["style"] = style
 
-    def entities(text):
-        return re.sub(r'&(?![A-Za-z]+;)', '&amp;', text)\
-                 .replace('<','&lt;')\
-                 .replace('>', '&gt;')\
-                 .replace('"', '&quot;')\
-                 .replace("'", '&apos;')
-
-    # Sanitize html text by changing bad text to entities.
+        # Sanitize html text by changing bad text to entities.
     # BeautifulSoup will do this for href and src attributes
     # on anchors and image tags but not for text.
     for text in soup.findAll(text=True):
-        text.replaceWith(entities(text))
+        text.replaceWith(escape_entities(text))
    
     # Strip disallowed tags and attributes.
     return soup.renderContents().decode('utf8')