1. mt3
  2. grab

Commits

Gregory Petukhov  committed 9df521a

Add smart option to all xpath/css_text/number functions

  • Participants
  • Parent commits d3fb37d
  • Branches default

Comments (0)

Files changed (3)

File grab/ext/lxml.py

View file
         else:
             return items 
 
-    def xpath_text(self, path, default=NULL, filter=None):
+    def xpath_text(self, path, default=NULL, filter=None, smart=False):
         """
         Get normalized text of node which matches the given xpath.
         """
             if isinstance(elem, basestring):
                 return normalize_space(elem)
             else:
-                return get_node_text(elem)
+                return get_node_text(elem, smart=smart)
 
-    def xpath_number(self, path, default=NULL, filter=None, ignore_spaces=False):
+    def xpath_number(self, path, default=NULL, filter=None, ignore_spaces=False,
+                     smart=False):
         """
         Find number in normalized text of node which matches the given xpath.
         """
 
         try:
-            return find_number(self.xpath_text(path, filter=filter),
+            return find_number(self.xpath_text(path, filter=filter, smart=smart),
                                     ignore_spaces=ignore_spaces)
         except IndexError:
             if default is NULL:
 
         return self.tree.cssselect(path)
 
-    def css_text(self, path, default=NULL):
+    def css_text(self, path, default=NULL, smart=False):
         """
         Get normalized text of node which matches the css path.
         """
 
         try:
-            return get_node_text(self.css(path))
+            return get_node_text(self.css(path), smart=smart)
         except IndexError:
             if default is NULL:
                 raise
             else:
                 return default
 
-    def css_number(self, path, default=NULL, ignore_spaces=False):
+    def css_number(self, path, default=NULL, ignore_spaces=False, smart=False):
         """
         Find number in normalized text of node which matches the given css path.
         """
 
         try:
-            return find_number(self.css_text(path), ignore_spaces=ignore_spaces)
+            return find_number(self.css_text(path, smart=smart), ignore_spaces=ignore_spaces)
         except IndexError:
             if default is NULL:
                 raise

File grab/tools/lxml_tools.py

View file
 """
 from text import normalize_space, find_number
 
-def get_node_text(node):
+def get_node_text(node, smart=True):
     """
     Extract text content of the `node` and all its descendants.
 
-    This is not equal to `node.text_content()`. The `get_node_text` insert
-    spaces between <tag><another tag> and also ignores content of the script
-    and style tags.
+    In smart mode `get_node_text` inserts spaces between <tag><another tag>
+    and also ignores the content of the script and style tags.
+
+    In non-smart mode this function just returns the text_content() of the node
+    with normalized spaces.
     """
 
-    return normalize_space(' '.join(node.xpath(
-        './descendant-or-self::*[name() != "script" and '\
-        'name() != "style"]/text()[normalize-space()]')))
+    if smart:
+        return normalize_space(' '.join(node.xpath(
+            './descendant-or-self::*[name() != "script" and '\
+            'name() != "style"]/text()[normalize-space()]')))
+    else:
+        return normalize_space(node.text_content())
 
 def find_node_number(node, ignore_spaces=False):
     """

File tests/test_lxml_extension.py

View file
         self.assertEqual('foo', self.g.xpath('//zzz', default='foo'))
 
     def test_xpath_text(self):
-        self.assertEqual(u'пче ла', self.g.xpath_text('//*[@id="bee"]'))
-        self.assertEqual(u'пче ла му ха item #100 2 item #2', self.g.xpath_text('/html/body'))
+        self.assertEqual(u'пче ла', self.g.xpath_text('//*[@id="bee"]', smart=True))
+        self.assertEqual(u'пчела mozilla = 777; body { color: green; }', self.g.xpath_text('//*[@id="bee"]', smart=False))
+        self.assertEqual(u'пче ла му ха item #100 2 item #2', self.g.xpath_text('/html/body', smart=True))
         self.assertRaises(DataNotFound,
             lambda: self.g.xpath_text('//code'))
         self.assertEqual(u'bee', self.g.xpath('//*[@id="bee"]/@id'))
         self.assertEqual('foo', self.g.css('zzz', default='foo'))
 
     def test_css_text(self):
-        self.assertEqual(u'пче ла', self.g.css_text('#bee'))
-        self.assertEqual(u'пче ла му ха item #100 2 item #2', self.g.css_text('html body'))
+        self.assertEqual(u'пче ла', self.g.css_text('#bee', smart=True))
+        self.assertEqual(u'пче ла му ха item #100 2 item #2', self.g.css_text('html body', smart=True))
         self.assertRaises(DataNotFound,
             lambda: self.g.css_text('code'))
         self.assertEqual('foo', self.g.css_text('zzz', default='foo'))