Grigoriy Petukhov avatar Grigoriy Petukhov committed efdc4bf

Proxylist refactoring

Comments (0)

Files changed (5)

     #
 
     def setup_proxylist(self, proxy_file=None, proxy_type='http', read_timeout=None,
-                        auto_init=True, auto_change=False,
+                        auto_init=True, auto_change=True,
                         server_list=None):
         """
         Setup location of files with proxy servers
 
         self.proxylist = ProxyList(source, source_type, proxy_type=proxy_type,
                                    read_timeout=read_timeout)
-        if auto_init:
+        if not auto_change and auto_init:
             self.change_proxy()
         self.setup(proxy_auto_change=auto_change)
 

grab/spider/base.py

 from .stat  import SpiderStat
 from .transport.multicurl import MulticurlTransport
 from .transport.threadpool import ThreadPoolTransport
+from ..proxylist import ProxyList
 
 DEFAULT_TASK_PRIORITY = 100
 RANDOM_TASK_PRIORITY_RANGE = (80, 100)
         self.request_limit = request_limit
         self.counters = defaultdict(int)
         self.grab_config = {}
-        self.proxylist_config = None
         self.items = {}
         self.task_try_limit = task_try_limit
         self.network_try_limit = network_try_limit
         self.should_stop = False
         self.request_pause = request_pause
 
+        self.proxylist_enabled = None
+        self.proxylist = None
+        self.proxy = None
+        self.proxy_auto_change = False
+
     def setup_cache(self, backend='mongo', database=None, use_compression=True, **kwargs):
         if database is None:
             raise SpiderMisuseError('setup_cache method requires database option')
                                 break
 
                         self.inc_count('request-network')
-                        if task.use_proxylist and self.proxylist_config:
-                            args, kwargs = self.proxylist_config
-                            grab.setup_proxylist(*args, **kwargs)
+                        if task.use_proxylist and self.proxylist_enabled:
+                            if self.proxy_auto_change:
+                                self.proxy = self.proxylist.get_random()
+                            if self.proxy:
+                                proxy, proxy_userpwd, proxy_type = self.proxy
+                                grab.setup(proxy=proxy, proxy_userpwd=proxy_userpwd,
+                                           proxy_type=proxy_type)
 
                         transport.process_task(task, grab, grab_config_backup)
 
 
         logger.debug('Job done!')
 
-    def setup_proxylist(self, *args, **kwargs):
-        """
-        Save proxylist config which will be later passed to Grab
-        constructor.
-        """
-
-        self.proxylist_config = (args, kwargs)
-
     def process_handler_error(self, func_name, ex, task, error_tb=None):
         self.inc_count('error-%s' % ex.__class__.__name__.lower())
 
 
         self.should_stop = True
 
-    @classmethod
-    def init_with_config(cls, modname):
-        """
-        This method create spider instance and configure it
-        with options found in given config module.
-        
-        Args:
-            :modname string: name of module with settings
-        """
+    def load_proxylist(self, source, source_type, proxy_type='http',
+                       auto_init=True, auto_change=True,
+                       **kwargs):
+        self.proxylist = ProxyList(source, source_type, proxy_type=proxy_type, **kwargs)
 
-        # Load key, value dict from config module
-        config = __import__(modname, fromlist=[''])
-        config_dict = {}
-        for key in dir(config):
-            config_dict[key.lower()] = getattr(config, key)
+        self.proxylist_enabled = True
+        self.proxy = None
+        if not auto_change and auto_init:
+            self.proxy = self.proxylist.get_random()
+        self.proxy_auto_change = auto_change
 
-        # Find names of arguments of __init__ method
-        arg_names = inspect.getargspec(getattr(cls, '__init__'))[0]
-        arg_names = [x.lower() for x in arg_names]
+    # 
+    # Deprecated methods
+    #
 
-        # Find __init__ arguments in config module
-        kwargs = {}
-        for name in arg_names:
-            if name in config_dict:
-                kwargs[name] = config_dict[name]
+    def setup_proxylist(self, proxy_file=None, proxy_type='http', read_timeout=None,
+                        auto_init=True, auto_change=True,
+                        server_list=None):
+        logging.error('Method `setup_proxylist` is deprecated. Use `load_proxylist` instead.')
+        if server_list is not None:
+            raise error.GrabMisuseError('setup_proxylist: the argument `server_list` is not suppported more')
+        if proxy_file is None:
+            raise error.GrabMisuseError('setup_proxylist: value of argument `proxy_file` could not be None')
+        source = proxy_file
+        source_type = 'text_file'
 
-        # Create Spider instance
-        obj = cls(**kwargs)
 
-        # Configure proxy list
-        if 'proxylist' in config_dict:
-            obj.setup_proxylist(**config_dict['proxylist'])
-
-        return obj
-
-    def dump_title(self, grab):
-        print grab.xpath_text('//title', 'N/A')
+        self.proxylist = ProxyList(source, source_type, proxy_type=proxy_type,
+                                   read_timeout=read_timeout)
+        self.proxylist_enabled = True
+        self.proxy = None
+        if not auto_change and auto_init:
+            self.proxy = self.proxylist.get_random()
+        self.proxy_auto_change = auto_change

runtest_spider.py

     #'tests.test_distributed_spider',
     'test.spider_task',
     'test.spider_mongo_queue',
+    'test.spider_proxy',
 )
 
 def main():

test/spider_proxy.py

+from unittest import TestCase
+
+from grab.spider import Spider, Task, Data
+from util import (FakeServerThread, BASE_URL, RESPONSE, SLEEP, FAKE_SERVER_PORT,
+                  REQUEST)
+
+PORT1 = FAKE_SERVER_PORT + 1
+PORT2 = FAKE_SERVER_PORT + 2
+PORT3 = FAKE_SERVER_PORT + 3
+PROXY1 = 'localhost:%d' % PORT1
+PROXY2 = 'localhost:%d' % PORT2
+PROXY3 = 'localhost:%d' % PORT3
+
class SimpleSpider(Spider):
    """Spider that records which server port answered each request."""

    def prepare(self):
        # One entry per distinct port seen in responses.
        self.ports = set()

    def task_baz(self, grab, task):
        port = grab.response.headers.get('Listen-Port', 0)
        self.ports.add(int(port))
+
class TestSpider(TestCase):
    """Check proxy-list behaviour of Spider (setup_proxylist / load_proxylist)."""

    def setUp(self):
        # Main fake server plus three extra servers that play the role
        # of proxies (requests through a "proxy" hit one of these ports).
        FakeServerThread(port=FAKE_SERVER_PORT).start()
        FakeServerThread(port=PORT1).start()
        FakeServerThread(port=PORT2).start()
        FakeServerThread(port=PORT3).start()

    def test_load_proxylist(self):
        content = '%s\n%s\n%s' % (PROXY1, PROXY2, PROXY3)
        # Use a context manager so the file is closed (and flushed)
        # before any spider tries to read it.
        with open('/tmp/__proxy.txt', 'w') as proxy_file:
            proxy_file.write(content)

        # Simple test, one task
        bot = SimpleSpider(thread_number=1)
        bot.setup_proxylist('/tmp/__proxy.txt')
        bot.setup_queue()
        bot.add_task(Task('baz', 'http://yandex.ru'))
        bot.run()

        self.assertEqual(REQUEST['headers']['host'], 'yandex.ru')
        self.assertTrue(len(bot.ports) == 1)

        # By default auto_change is True: several tasks should be spread
        # over more than one proxy port.
        bot = SimpleSpider(thread_number=1)
        bot.setup_proxylist('/tmp/__proxy.txt')
        bot.setup_queue()
        for x in xrange(10):
            bot.add_task(Task('baz', 'http://yandex.ru'))
        bot.run()

        self.assertEqual(REQUEST['headers']['host'], 'yandex.ru')
        self.assertTrue(len(bot.ports) > 1)

        # Do the same test with the load_proxylist method
        bot = SimpleSpider(thread_number=1)
        bot.load_proxylist('/tmp/__proxy.txt', 'text_file')
        bot.setup_queue()
        for x in xrange(10):
            bot.add_task(Task('baz', 'http://yandex.ru'))
        bot.run()

        self.assertEqual(REQUEST['headers']['host'], 'yandex.ru')
        self.assertTrue(len(bot.ports) > 1)

        # Disable auto_change
        # By default auto_init is True: one proxy is chosen up-front and
        # reused, so exactly one port is seen.
        bot = SimpleSpider(thread_number=1)
        bot.setup_proxylist('/tmp/__proxy.txt', auto_change=False)
        bot.setup_queue()
        for x in xrange(10):
            bot.add_task(Task('baz', 'http://yandex.ru'))
        bot.run()

        self.assertEqual(REQUEST['headers']['host'], 'yandex.ru')
        self.assertTrue(len(bot.ports) == 1)

        # Disable auto_change AND auto_init:
        # the proxylist is loaded but not used, so requests go directly
        # to the main fake server.
        bot = SimpleSpider(thread_number=1)
        bot.setup_proxylist('/tmp/__proxy.txt', auto_change=False, auto_init=False)
        bot.setup_queue()
        for x in xrange(10):
            bot.add_task(Task('baz', BASE_URL))
        bot.run()

        self.assertEqual(REQUEST['headers'].get('host'),
                         '%s:%s' % ('localhost', FAKE_SERVER_PORT))
        self.assertTrue(len(bot.ports) == 1)
        self.assertEqual(list(bot.ports)[0], FAKE_SERVER_PORT)
 import os
 import shutil
 import tempfile
+from copy import deepcopy
 
 import grab
 
                 time.sleep(SLEEP['get'])
 
                 REQUEST['headers'] = self.headers
+                print "RH:", REQUEST['headers']
                 REQUEST['path'] = self.path
 
                 if RESPONSE['get_callback'] is not None:
                     while RESPONSE_ONCE_HEADERS:
                         self.send_header(*RESPONSE_ONCE_HEADERS.pop())
 
+                    self.send_header('Listen-Port', str(self.server.server_port))
+
                     self.end_headers()
 
                     if RESPONSE_ONCE['get'] is not None:
                     self.send_response(200)
                 while RESPONSE_ONCE_HEADERS:
                     self.send_header(*RESPONSE_ONCE_HEADERS.pop())
+
+                self.send_header('Listen-Port', str(self.server.server_port))
+
                 self.end_headers()
                 if RESPONSE_ONCE['post'] is not None:
                     self.wfile.write(RESPONSE_ONCE['post'])
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.