Commits

Gregory Petukhov committed 6164040

Drop strip_xml_declaration option. It is deprecated now

  • Participants
  • Parent commits aa6aaab

Comments (0)

Files changed (9)

docs/grab/options.rst

 
 :Type: bool
 :Default: True
-
-.. _option_strip_xml_declaration
-
-strip_xml_declaration
----------------------
-
-Удаление XML declaration из тела документа перед тем, как строить его unicode-представление. Я забыл зачем это нужно :) Попозже допишу помощь.
-
-:Type: bool
-:Default: True
         # It does not affect `response.body`
         strip_null_bytes = True,
 
+        # Obsolete options, will be removed in future versions
         # Strip XML declaration before building unicode body
         strip_xml_declaration = True,
     )
             if not key in self.config.keys():
                 raise error.GrabMisuseError('Unknown option: %s' % key)
 
+        if key == 'strip_xml_declaration':
+            logging.error('Option strip_xml_declaration is deprecated. Now xml declarations is alwas striped out. This option will be removed in future versions')
+
         if 'url' in kwargs:
             if self.config.get('url'):
                 kwargs['url'] = self.make_url_absolute(kwargs['url'])
 
         # TODO: check max redirect count
         if self.config['follow_refresh']:
-            url = find_refresh_url(self.response.unicode_body(
-                strip_xml_declaration=self.config['strip_xml_declaration']))
+            url = find_refresh_url(self.response.unicode_body())
             if url:
                 return self.request(url=url)
 
 
         if self.config['url']:
             if resolve_base:
-                ubody = self.response.unicode_body(
-                    strip_xml_declaration=self.config['strip_xml_declaration']
-                )
+                ubody = self.response.unicode_body()
                 base_url = find_base_url(ubody)
                 if base_url:
                     return urljoin(base_url, url)
         from lxml.etree import ParserError
 
         if self._lxml_tree is None:
-            body = self.response.unicode_body(
-                strip_xml_declaration=self.config['strip_xml_declaration']).strip()
+            body = self.response.unicode_body().strip()
             #if self.config['tidy']:
                 #from tidylib import tidy_document
                 #body, errors = tidy_document(body)
         if byte:
             match =  regexp.search(self.response.body)
         else:
-            ubody = self.response.unicode_body(
-                strip_xml_declaration=self.config['strip_xml_declaration']
-            )
+            ubody = self.response.unicode_body()
             match = regexp.search(ubody)
         if match:
             return match
             if byte:
                 raise GrabMisuseError('The anchor should be bytes string in byte mode')
             else:
-                return anchor in self.response.unicode_body(
-                    strip_xml_declaration=self.config['strip_xml_declaration']
-                )
+                return anchor in self.response.unicode_body()
 
         if not isinstance(anchor, unicode):
             if byte:
             else:
                 self.charset = charset
 
-    def unicode_body(self, ignore_errors=True, strip_xml_declaration=False):
+    def unicode_body(self, ignore_errors=True):
         """
         Return response body as unicode string.
         """
             else:
                 body = self.body
             ubody = body.decode(self.charset, errors).strip()
-            if strip_xml_declaration:
-                ubody = RE_XML_DECLARATION.sub('', ubody)
+            ubody = RE_XML_DECLARATION.sub('', ubody)
             self._unicode_body = ubody
         return self._unicode_body
 

test/response_class.py

         g.setup(document_charset='utf-8')
         g.go(BASE_URL)
         self.assertTrue(u'крокодил' in g.response.unicode_body())
+
+    def test_xml_declaration(self):
+        """
+        HTML with XML declaration should be processed without errors.
+        """
+        RESPONSE['get'] = """<?xml version="1.0" encoding="UTF-8"?>
+        <html><body><h1>test</h1></body></html>
+        """
+        g = Grab()
+        g.go(BASE_URL)
+        self.assertEqual('test', g.xpath_text('//h1'))

usecase/setup_script.py

+import os
+import sys
+
+ROOT = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
+
+os.chdir(ROOT)
+sys.path.insert(0, ROOT)
+import setup_script
+
+import grab
+
+urls = ['http://karta-moskvy.ru']
+
+g = grab.Grab(hammer_mode=True)
+#g.setup(strip_xml_declaration=True)
+
+for url in urls:
+    g.go(url)
+    print g.response.unicode_body()
+    #for a in g.xpath_list('//a'):
+        #h = a.get('href','')