Commits

Gregory Petukhov  committed a3cd37e

Add normalize_space option to lxml-text functions

  • Participants
  • Parent commits a661b4e

Comments (0)

Files changed (4)

File grab/ext/lxml.py

 import time
 
 from ..base import DataNotFound, GrabMisuseError, GLOBAL_STATE
-from ..tools.text import normalize_space, find_number
+from ..tools.text import normalize_space as normalize_space_func, find_number
 from ..tools.lxml_tools import get_node_text
 
 NULL = object()
         else:
             return items 
 
-    def xpath_text(self, path, default=NULL, filter=None, smart=False):
+    def xpath_text(self, path, default=NULL, filter=None, smart=False,
+                   normalize_space=True):
         """
         Get normalized text of node which matches the given xpath.
         """
                 return default
         else:
             if isinstance(elem, basestring):
-                return normalize_space(elem)
+                return normalize_space_func(elem)
             else:
-                return get_node_text(elem, smart=smart)
+                return get_node_text(elem, smart=smart, normalize_space=normalize_space)
 
     def xpath_number(self, path, default=NULL, filter=None, ignore_spaces=False,
                      smart=False):
 
         return self.tree.cssselect(path)
 
-    def css_text(self, path, default=NULL, smart=False):
+    def css_text(self, path, default=NULL, smart=False, normalize_space=True):
         """
         Get normalized text of node which matches the css path.
         """
 
         try:
-            return get_node_text(self.css(path), smart=smart)
+            return get_node_text(self.css(path), smart=smart,
+                                 normalize_space=normalize_space)
         except IndexError:
             if default is NULL:
                 raise
             else:
                 return default
 
-    def strip_tags(self, content):
+    def strip_tags(self, content, smart=False):
         """
         Strip tags from the HTML content.
         """
         from lxml.html import fromstring
 
-        return get_node_text(fromstring(content))
+        return get_node_text(fromstring(content), smart=smart)
 
     def assert_css(self, path):
         """
         body = tostring(self.tree, encoding='utf-8').decode('utf-8')
 
         # Normalize spaces
-        body = normalize_space(body)
+        body = normalize_space_func(body)
 
         # Find text blocks
         block_rex = re.compile(r'[^<>]+')

File grab/tools/lxml_tools.py

 """
 Functions to process content of lxml nodes.
 """
-from text import normalize_space, find_number
+from text import normalize_space as normalize_space_func, find_number
 
-def get_node_text(node, smart=True):
+def get_node_text(node, smart=False, normalize_space=True):
     """
     Extract text content of the `node` and all its descendants.
 
     """
 
     if smart:
-        return normalize_space(' '.join(node.xpath(
+        value = ' '.join(node.xpath(
             './descendant-or-self::*[name() != "script" and '\
-            'name() != "style"]/text()[normalize-space()]')))
+            'name() != "style"]/text()[normalize-space()]'))
     else:
-        return normalize_space(node.text_content())
+        value = node.text_content()
+    if normalize_space:
+        value = normalize_space_func(value)
+    return value
 
 def find_node_number(node, ignore_spaces=False):
     """

File tests/test_lxml_extension.py

     def test_strip_tags(self):
         self.assertEqual('foo', self.g.strip_tags('<b>foo</b>'))
         self.assertEqual('foo bar', self.g.strip_tags('<b>foo</b> <i>bar'))
-        self.assertEqual('foo bar', self.g.strip_tags('<b>foo</b><i>bar'))
+        self.assertEqual('foobar', self.g.strip_tags('<b>foo</b><i>bar'))
+        self.assertEqual('foo bar', self.g.strip_tags('<b>foo</b><i>bar', smart=True))
         self.assertEqual('', self.g.strip_tags('<b> <div>'))
 
     def test_css_exists(self):

File tests/test_lxml_tools.py

 
     def test_get_node_text(self):
         elem = self.lxml_tree.xpath('//div[@id="bee"]')[0]
-        self.assertEqual(get_node_text(elem), u'пче ла')
+        self.assertEqual(get_node_text(elem), u'пчела mozilla = 777; body { color: green; }')
+        self.assertEqual(get_node_text(elem, smart=True), u'пче ла')
         elem = self.lxml_tree.xpath('//div[@id="fly"]')[0]
         self.assertEqual(get_node_text(elem), u'му ха')