Commits

Blue committed cc83c5c

Parsing posts now decodes HTML entities.

  • Participants
  • Parent commits 0ad4a13

Comments (0)

Files changed (2)

File spotnet/post.py

 import logging
 import email.header
 from dateutil.parser import parse as parse_datetime
+from HTMLParser import HTMLParser
 from xml.dom.minidom import parseString
 from datetime import datetime, timedelta
 from nzb import decode_nzb, DecodeNzbError
             decoded = email.header.decode_header(string)
             if decoded[0][1] is None:
                 decoded[0] = (decoded[0][0], 'utf8')  # a sensible default, since it's probably ascii
-            return decoded[0][0].decode(decoded[0][1], 'replace')
+            uni = decoded[0][0].decode(decoded[0][1], 'replace')
+            # decode html entities
+            h = HTMLParser()
+            return h.unescape(uni)
         else:
             raise TypeError(string)
 

File spotnet/tests/parsing.py

 
 class EncodingParsingTest(ParsingTest):
 
+    def test_parse_html_entity(self):
+        post = self.construct(dict(
+            Subject='dokter van een pati&euml;nt',
+        ), 'blaat')
+        self.assertEqual(post.subject, u'dokter van een pati\xebnt')
+
     def test_parse_different_encoding_header(self):
         post = self.construct(dict(
             Subject='=?ISO-8859-7?B?1PHp4e303Pb16+vv8iDM4ezc6u/y?=',