Gregory Petukhov  committed a3cd37e

Add normalize_space option to lxml-text functions

  • Participants
  • Parent commits a661b4e

Comments (0)

Files changed (4)

File grab/ext/lxml.py

 import time
 from ..base import DataNotFound, GrabMisuseError, GLOBAL_STATE
-from ..tools.text import normalize_space, find_number
+from ..tools.text import normalize_space as normalize_space_func, find_number
 from ..tools.lxml_tools import get_node_text
 NULL = object()
             return items 
-    def xpath_text(self, path, default=NULL, filter=None, smart=False):
+    def xpath_text(self, path, default=NULL, filter=None, smart=False,
+                   normalize_space=True):
         Get normalized text of node which matches the given xpath.
                 return default
             if isinstance(elem, basestring):
-                return normalize_space(elem)
+                return normalize_space_func(elem)
-                return get_node_text(elem, smart=smart)
+                return get_node_text(elem, smart=smart, normalize_space=normalize_space)
     def xpath_number(self, path, default=NULL, filter=None, ignore_spaces=False,
         return self.tree.cssselect(path)
-    def css_text(self, path, default=NULL, smart=False):
+    def css_text(self, path, default=NULL, smart=False, normalize_space=True):
         Get normalized text of node which matches the css path.
-            return get_node_text(self.css(path), smart=smart)
+            return get_node_text(self.css(path), smart=smart,
+                                 normalize_space=normalize_space)
         except IndexError:
             if default is NULL:
                 return default
-    def strip_tags(self, content):
+    def strip_tags(self, content, smart=False):
         Strip tags from the HTML content.
         from lxml.html import fromstring
-        return get_node_text(fromstring(content))
+        return get_node_text(fromstring(content), smart=smart)
     def assert_css(self, path):
         body = tostring(self.tree, encoding='utf-8').decode('utf-8')
         # Normalize spaces
-        body = normalize_space(body)
+        body = normalize_space_func(body)
         # Find text blocks
         block_rex = re.compile(r'[^<>]+')

File grab/tools/lxml_tools.py

 Functions to process content of lxml nodes.
-from text import normalize_space, find_number
+from text import normalize_space as normalize_space_func, find_number
-def get_node_text(node, smart=True):
+def get_node_text(node, smart=False, normalize_space=True):
     Extract text content of the `node` and all its descendants.
     if smart:
-        return normalize_space(' '.join(node.xpath(
+        value = ' '.join(node.xpath(
             './descendant-or-self::*[name() != "script" and '\
-            'name() != "style"]/text()[normalize-space()]')))
+            'name() != "style"]/text()[normalize-space()]'))
-        return normalize_space(node.text_content())
+        value = node.text_content()
+    if normalize_space:
+        value = normalize_space_func(value)
+    return value
 def find_node_number(node, ignore_spaces=False):

File tests/

     def test_strip_tags(self):
         self.assertEqual('foo', self.g.strip_tags('<b>foo</b>'))
         self.assertEqual('foo bar', self.g.strip_tags('<b>foo</b> <i>bar'))
-        self.assertEqual('foo bar', self.g.strip_tags('<b>foo</b><i>bar'))
+        self.assertEqual('foobar', self.g.strip_tags('<b>foo</b><i>bar'))
+        self.assertEqual('foo bar', self.g.strip_tags('<b>foo</b><i>bar', smart=True))
         self.assertEqual('', self.g.strip_tags('<b> <div>'))
     def test_css_exists(self):

File tests/

     def test_get_node_text(self):
         elem = self.lxml_tree.xpath('//div[@id="bee"]')[0]
-        self.assertEqual(get_node_text(elem), u'пче ла')
+        self.assertEqual(get_node_text(elem), u'пчела mozilla = 777; body { color: green; }')
+        self.assertEqual(get_node_text(elem, smart=True), u'пче ла')
         elem = self.lxml_tree.xpath('//div[@id="fly"]')[0]
         self.assertEqual(get_node_text(elem), u'му ха')