Commits

Gregory Petukhov committed c9e748b

Refactor Data processing in spider, write some tests for Data processing

  • Participants
  • Parent commits a1ad637

Comments (0)

Files changed (4)

grab/spider/base.py

-"""
-Global TODO:
-* make task_%s_preprocess methods
-"""
 from __future__ import absolute_import
 import types
 import signal
 import Queue
 from ..base import GLOBAL_STATE, Grab
 from .error import (SpiderError, SpiderMisuseError, FatalError,
-                    StopTaskProcessing, NoTaskHandler)
+                    StopTaskProcessing, NoTaskHandler, NoDataHandler)
 from .task import Task, NullTask
 from .data import Data
 from .pattern import SpiderPattern
     def process_handler_error(self, func_name, ex, task, error_tb=None):
         self.inc_count('error-%s' % ex.__class__.__name__.lower())
 
-        if error_tb:
+        if error_tb is not None:
             logger.error('Error in %s function' % func_name)
             logger.error(error_tb)
         else:
             except TypeError:
                 ex_str = str(ex)
 
-        self.add_item('fatal', '%s|%s|%s' % (ex.__class__.__name__,
-                                             ex_str, task.url))
+        self.add_item('fatal', '%s|%s|%s|%s' % (
+            func_name, ex.__class__.__name__, ex_str, task.url))
         if isinstance(ex, FatalError):
             raise
 
+    def find_data_handler(self, data):
+        try:
+            handler = getattr(self, 'data_%s' % data.name)
+        except AttributeError:
+            raise NoDataHandler('No handler defined for Data %s' % data.name)
+        else:
+            return handler
+
     def process_handler_result(self, result, task):
         """
         Process result received from the task handler.
         if isinstance(result, Task):
             self.add_task(result)
         elif isinstance(result, Data):
-            handler_name = 'data_%s' % result.name
-            try:
-                handler = getattr(self, handler_name)
-            except AttributeError:
-                raise SpiderError('No content handler for %s item' % result.name)
+            handler = self.find_data_handler(result)
             try:
                 handler(result.item)
             except Exception, ex:
-                self.process_handler_error(handler_name, ex, task)
+                self.process_handler_error('data_%s' % result.name, ex, task)
         elif result is None:
             pass
         else:
                         else:
                             for item in result:
                                 self.process_handler_result(item, res['task'])
+            except NoDataHandler, ex:
+                raise
             except Exception, ex:
                 self.process_handler_error(handler_name, ex, res['task'])
             else:
                 self.add_task(task)
             # TODO: allow to write error handlers
     
+    def find_task_handler(self, task):
+        callback = task.get('callback')
+        if callback:
+            return callback
+        else:
+            try:
+                handler = getattr(self, 'task_%s' % task.name)
+            except AttributeError:
+                raise NoTaskHandler('No handler or callback defined for task %s' % task.name)
+            else:
+                return handler
 
     def process_network_result(self, res):
         """
         if stop:
             return
 
-        # Process the response
-        handler_name = 'task_%s' % res['task'].name
-
-        try:
-            handler = getattr(self, handler_name)
-        except AttributeError:
-            handler = None
-
-        callback = res['task'].get('callback')
-
-        if handler is None and callback is None:
-            raise NoTaskHandler('No handler or callback defined for task %s' % res['task'].name)
-        else:
-            self.execute_task_handler(res, callback or handler)
+        handler = self.find_task_handler(res['task'])
+        self.execute_task_handler(res, handler)
 
     def change_proxy(self, task, grab):
         """

grab/spider/error.py

 __all__ = ('SpiderError', 'SpiderMisuseError', 'FatalError',
            'StopTaskProcessing', 'SpiderInternalError',
-           'NoTaskHandler')
+           'NoTaskHandler', 'NoDataHandler')
 
 class SpiderError(Exception):
     "Base class for Spider exceptions"
 
+
 class SpiderMisuseError(SpiderError):
     "Improper usage of Spider framework"
 
+
 class FatalError(SpiderError):
     "Fatal error which should stop parsing process"
 
+
 class StopTaskProcessing(SpiderError):
     """
     Used in middlewares to stop task processing
     """
 
+
 class SpiderInternalError(SpiderError):
     """
     Used to indicate error in some internal spider services
     Used when it is not possible to find which
     handler should be used to process network response.
     """
+
+
+class NoDataHandler(SpiderError):
+    """
+    Used when it is not possible to find which
+    handler should be used to process Data object.
+    """

test/spider_data.py

+from unittest import TestCase
+
+from grab import Grab
+from grab.spider import Spider, Task, Data, NoDataHandler, SpiderMisuseError
+
+from .tornado_util import SERVER
+
+class TestSpider(TestCase):
+    def setUp(self):
+        SERVER.reset()
+
+    def test_data_nohandler_error(self):
+        class TestSpider(Spider):
+            def task_page(self, grab, task):
+                yield Data('foo', 1)
+
+        bot = TestSpider()
+        bot.setup_queue()
+        bot.add_task(Task('page', url=SERVER.BASE_URL))
+        self.assertRaises(NoDataHandler, bot.run)
+
+    def test_exception_from_data_handler(self):
+        class TestSpider(Spider):
+            def task_page(self, grab, task):
+                yield Data('foo', 1)
+            
+            def data_foo(self, num):
+                1/0
+
+        bot = TestSpider()
+        bot.setup_queue()
+        bot.add_task(Task('page', url=SERVER.BASE_URL))
+        bot.run()
+        self.assertTrue('data_foo' in bot.items['fatal'][0])

test/spider_task.py

     base_url = 'http://google.com'
 
     def task_baz(self, grab, task):
-        return Data('foo', grab.response.body)
-
-    def data_foo(self, item):
-        self.SAVED_ITEM = item
+        self.SAVED_ITEM = grab.response.body
 
 class TestSpider(TestCase):
     def setUp(self):