Commits

Gregory Petukhov committed f84da5b

Fix the "incorrect" processing of &#[128-160]; entities. Add a new option, fix_special_entities, which is on by default

Comments (0)

Files changed (6)

         # This setting is overwritten after each request with
         # charset of retrieved document
         charset = 'utf-8',
+
         # Charset to use for converting content of response
         # into unicode, by default it is detected automatically
         document_charset = None,
         # For xml type XML DOM builder is used
         content_type = 'html',
 
+        # Fix &#X; entities, where X is between 128 and 160.
+        # Such entities are parsed by modern browsers as
+        # windows-1252 characters independently of the real charset of
+        # the document. If this option is True then such entities
+        # will be replaced with the correct unicode entities, e.g.:
+        # &#151; -> &#8212;
+        fix_special_entities = True,
+
         # Convert document body to lower case before building LXML tree
         # It does not affect `response.body`
         lowercased_tree = False,
         from lxml.etree import ParserError
 
         if self._lxml_tree is None:
-            body = self.response.unicode_body().strip()
+            body = self.response.unicode_body(
+                fix_special_entities=self.config['fix_special_entities']
+            ).strip()
             #if self.config['tidy']:
                 #from tidylib import tidy_document
                 #body, errors = tidy_document(body)
 import tempfile
 import webbrowser
 import codecs
+from grab.tools import encoding as encoding_tools
 
 from .tools.files import hashed_path
 
             else:
                 self.charset = charset
 
-    def unicode_body(self, ignore_errors=True):
+    def unicode_body(self, ignore_errors=True, fix_special_entities=True):
         """
         Return response body as unicode string.
         """
                 body = self.body[len(self.bom):]
             else:
                 body = self.body
+            if fix_special_entities:
+                body = encoding_tools.fix_special_entities(body)
             ubody = body.decode(self.charset, errors).strip()
             self._unicode_body = ubody
         return self._unicode_body

grab/tools/encoding.py

+import re
+
+# Matches decimal character references &#120; .. &#169;; the captured
+# number is range-checked again by the substitution callback.
+RE_SPECIAL_ENTITY = re.compile('&#(1[2-6][0-9]);')
+
 def smart_str(value, encoding='utf-8'):
     """
     Normalize unicode/byte string to byte string.
     if not isinstance(value, unicode):
         value = value.decode(encoding)
     return value
+
+
+def special_entity_handler(match):
+    """
+    Rewrite a single ``&#NNN;`` reference captured by a regex match.
+
+    If the number in group 1 lies in 128..160 it is interpreted as a
+    cp1252 byte and the reference is re-emitted with the matching
+    unicode code point (``&#151;`` becomes ``&#8212;``). Any other
+    reference, including numbers that cp1252 does not assign, is
+    returned unchanged.
+    """
+    num = int(match.group(1))
+    if 128 <= num <= 160:
+        try:
+            # bytearray.decode behaves the same on Python 2 and 3
+            char = bytearray([num]).decode('cp1252')
+        except UnicodeDecodeError:
+            # 129, 141, 143, 144 and 157 are unassigned in cp1252;
+            # keep the reference as is instead of aborting the whole
+            # document conversion
+            return match.group(0)
+        return '&#%d;' % ord(char)
+    return match.group(0)
+
+
+def fix_special_entities(body):
+    """
+    Return `body` with every RE_SPECIAL_ENTITY match replaced by the
+    result of special_entity_handler.
+    """
+    fixed_body = RE_SPECIAL_ENTITY.sub(special_entity_handler, body)
+    return fixed_body
     'test.limit_option',
     'test.cookies',
     'test.response_class',
+    'test.charset_issue',
     # test server
     'test.fake_server',
     # tools

test/charset_issue.py

+# coding: utf-8
+from unittest import TestCase
+from grab import Grab, DataNotFound
+
+from test.util import (GRAB_TRANSPORT, RESPONSE, BASE_URL,
+                       FakeServerThread)
+
+class LXMLExtensionTest(TestCase):
+    """Tests for the ``fix_special_entities`` option."""
+
+    def setUp(self):
+        # Launch the fake server thread before each test
+        FakeServerThread().start()
+
+    def test_dash_issue(self):
+        """&#151; becomes an em dash unless the fix is switched off."""
+        HTML = '<strong>&#151;</strong>'
+        RESPONSE['get'] = HTML
+        g = Grab()
+        g.go(BASE_URL)
+
+        # By default &#[128-160]; are fixed
+        self.assertNotEqual(g.xpath('//strong/text()'), unichr(151))
+        self.assertEqual(g.xpath('//strong/text()'), unichr(8212))
+
+        # Disable the fix-behaviour
+        g.setup(fix_special_entities=False)
+        g.go(BASE_URL)
+
+        # With the option off &#[128-160]; are left as they are
+        self.assertEqual(g.xpath('//strong/text()'), unichr(151))
+        self.assertNotEqual(g.xpath('//strong/text()'), unichr(8212))
+
+        # Explicitly use unicode_body func (the fix is on by default)
+        g = Grab()
+        g.go(BASE_URL)
+        self.assertTrue('&#8212;' in g.response.unicode_body())