Commits

Gregory Petukhov committed 8565b7d

Refactor proxylist class. Add new method Grab.load_proxylist. Grab.setup_proxylist is deprecated. Fix auto_change bug (#37)

  • Participants
  • Parent commits c2a6c34

Comments (0)

Files changed (4)

         proxy = None,
         proxy_type = None,
         proxy_userpwd = None,
+        proxy_auto_change = True,
 
         # Method, Post
         method = None,
     # Attributes which should be processed when clone
     # of Grab instance is creating
     clonable_attributes = ('request_head', 'request_log', 'request_body',
-                           'proxylist', 'proxylist_auto_change')
+                           'proxylist')
 
     # Complex config items which points to mutable objects
     mutable_config_keys = copy(MUTABLE_CONFIG_KEYS)
 
         self.reset()
         self.proxylist = None
-        self.proxylist_auto_change = False
         if kwargs:
             self.setup(**kwargs)
         self.clone_counter = 0
         if not self._request_prepared:
             self.reset()
             self.request_counter = self.get_request_counter()
-            if self.proxylist_auto_change:
-                self.change_proxy()
             if kwargs:
                 self.setup(**kwargs)
+            if self.proxylist and self.config['proxy_auto_change']:
+                self.change_proxy()
             self.request_method = self.detect_request_method()
             self.transport.process_config(self)
             self._request_prepared = True
 
         self.response = res
 
-    def setup_proxylist(self, proxy_file=None, proxy_type='http', read_timeout=None,
-                        auto_init=True, auto_change=False,
-                        server_list=None):
-        """
-        Setup location of files with proxy servers
-
-        ``proxy_file`` - file which contains list of proxy servers
-        Each server could be a line of one of following formats:
-        * server:port
-        * server:port:username:password
-
-        ``proxy_type`` - type of proxy servers from proxy file.
-        For now all proxies should be of one type
-
-        ``auto_init`` - if True then ``change_proxy`` method will be automatically
-        called
-        """
-
-        self.proxylist = ProxyList(proxy_file=proxy_file, proxy_type=proxy_type,
-                                   server_list=server_list, read_timeout=read_timeout)
-        if auto_init:
+    def load_proxylist(self, source, source_type, proxy_type='http',
+                       auto_init=True, auto_change=True,
+                       **kwargs):
+        self.proxylist = ProxyList(source, source_type, proxy_type=proxy_type, **kwargs)
+        self.setup(proxy_auto_change=auto_change)
+        if not auto_change and auto_init:
             self.change_proxy()
-        self.proxylist_auto_change = auto_change
 
     def change_proxy(self):
         """
         Set random proxy from proxylist.
         """
 
-        server, userpwd = self.proxylist.get_random()
+        server, userpwd, proxy_type = self.proxylist.get_random()
         self.setup(proxy=server, proxy_userpwd=userpwd,
-                   proxy_type=self.proxylist.proxy_type)
+                   proxy_type=proxy_type)
 
     """
     Private methods
             out.write(json.dumps(self.config['cookies']))
 
     def setup_with_proxyline(self, line, proxy_type='http'):
+        # TODO: remove from base class
+        # maybe to proxylist?
         host, port, user, pwd = parse_proxyline(line)
         server_port = '%s:%s' % (host, port)
         self.setup(proxy=server_port, proxy_type=proxy_type)
             return {}
 
 
+    # 
+    # Deprecated methods
+    #
+
+    def setup_proxylist(self, proxy_file=None, proxy_type='http', read_timeout=None,
+                        auto_init=True, auto_change=False,
+                        server_list=None):
+        """
+        Setup location of files with proxy servers
+
+        ``proxy_file`` - file which contains list of proxy servers
+        Each server could be a line of one of following formats:
+        * server:port
+        * server:port:username:password
+
+        ``proxy_type`` - type of proxy servers from proxy file.
+        For now all proxies should be of one type
+
+        ``auto_init`` - if True then ``change_proxy`` method will be automatically
+        called
+        """
+
+        logging.error('Method `setup_proxylist` is deprecated. Use `load_proxylist` instead.')
+        if server_list is not None:
+            raise error.GrabMisuseError('setup_proxylist: the argument `server_list` is not suppported more')
+        if proxy_file is None:
+            raise error.GrabMisuseError('setup_proxylist: value of argument `proxy_file` could not be None')
+        source = proxy_file
+        source_type = 'text_file'
+
+
+        self.proxylist = ProxyList(source, source_type, proxy_type=proxy_type,
+                                   read_timeout=read_timeout)
+        if auto_init:
+            self.change_proxy()
+        self.setup(proxy_auto_change=auto_change)
+
+
+
 # For backward compatibility
 BaseGrab = Grab

grab/proxylist.py

 """
 import itertools
 from random import choice
-from datetime import datetime, timedelta
 import re
 import logging
+from copy import deepcopy
+import time
+import logging
 
 from error import GrabError, GrabMisuseError
 
+logger = logging.getLogger('grab.proxylist')
+
 READ_TIMEOUT = 60 * 10
 RE_SIMPLE_PROXY = re.compile(r'^([^:]+):([^:]+)$')
 RE_AUTH_PROXY = re.compile(r'^([^:]+):([^:]+):([^:]+):([^:]+)$')
     raise GrabError('Invalid proxy line: %s' % line)
 
 
-class ProxyList(object):
-    """
-    Class to work with proxy list which 
-    is stored in the plain text file.
-    """
+class TextFileSource(object):
+    source_type = 'text_file'
 
-    def __init__(self, proxy_file=None, proxy_type='http', read_timeout=None,
-                 server_list=None):
-        """
-        Create `ProxyList` object and load proxies from the specified source.
+    def __init__(self, filename, read_timeout=READ_TIMEOUT, proxy_type='http'):
+        self.filename = filename
+        self.proxy_type = proxy_type
+        self.read_timeout = read_timeout
 
-        You can specify source either with `proxy_file` or `server_list` options.
-
-        Args:
-            :param proxy_file: path to file which contains list of servers.
-                Each server could be in two forms:
-                * simple: "server:port"
-                * complex: "server:port:user:pwd"
-            :param server_list: list of servers. Each item should be a string
-                of format described in description of `proxy_file` option.
-            :param proxy_type: type of proxies. All proxies should be of the same
-                type
-            :param read_timeout: time after which the proxy list will be reloaded
-                By deafult, it is 600 seconds.
-        """
-
-        if proxy_file is None and server_list is None:
-            raise GrabMisuseError('ProxyList constructor: both proxy_file and'\
-                                  ' server_list options are None')
-        elif proxy_file is not None and server_list is not None:
-            raise GrabMisuseError('ProxyList constructor: only one of proxy_file and'
-                                  ' server_list options could be non-None')
-        elif proxy_file is not None:
-            self.proxy_file = proxy_file
-            self.server_list = None
-        else:
-            self.proxy_file = None
-            self.server_list = server_list
-        self.proxy_type = proxy_type
-        if read_timeout is None:
-            self.read_timeout = READ_TIMEOUT
-        else:
-            self.read_timeout = read_timeout
-        self.read_time = None
-        self.refresh_proxy_list()
-
-    def get_random(self):
-        """
-        Return random server from the list
-
-        """
-        self.refresh_proxy_list()
-        return choice(self._servers)
-
-    def get_next(self):
-        "Return next server in the list"
-        self.refresh_proxy_list()
-        return self.server_iterator.next()
-
-    def refresh_proxy_list(self):
-        """
-        Update proxy list.
-        
-        Re-read proxy file after each XX seconds.
-        """
-        
-        if (self.read_time is None or
-            (datetime.now() - self.read_time).seconds > self.read_timeout):
-            self.load_proxy_list()
-            self.read_time = datetime.now()
-            self.server_iterator = itertools.cycle(self._servers)
-
-    def load_proxy_list(self):
+    def load(self):
         """
         Load proxy list from specified source and validate loaded data.
 
         * complex: "server:port:user:pwd"
         """
 
-        if self.proxy_file:
-            with open(self.proxy_file) as src:
-                lines = src.read().splitlines()
-        else:
-            lines = self.server_list
+        with open(self.filename) as src:
+            lines = src.read().splitlines()
 
         servers = []
         for line in lines:
             line = line.strip().replace(' ', '')
             if line:
                 host, port, user, pwd = parse_proxyline(line)
-                server_port = '%s:%s' % (host, port)
+                server = '%s:%s' % (host, port)
                 if user:
                     user_pwd = '%s:%s' % (user, pwd)
                 else:
                     user_pwd = None
-                servers.append((server_port, user_pwd))
-        self._servers = servers
+                servers.append((server, user_pwd, self.proxy_type))
+        self.read_time = time.time()
+        self.server_list = servers
+        self.server_list_iterator = itertools.cycle(self.server_list)
+
+    def reload(self):
+        """
+        Update proxy list.
+        
+        Re-read proxy file after each XX seconds.
+        """
+        
+        if (self.read_time is None or
+            (time.time() - self.read_time) > self.read_timeout):
+            logging.debug('Reloading proxy list')
+            self.load()
+
+
+SOURCE_LIST = {
+    'text_file': TextFileSource,
+}
+
+
+class ProxyList(object):
+    """
+    Class to work with proxy list which 
+    is stored in the plain text file.
+    """
+
+    def __init__(self, source, source_type, proxy_type='http', **kwargs):
+        """
+        Create `ProxyList` object and load proxies from the specified source.
+
+        You should specify type of source in second argument to let ProxyList
+        instance know how to handle proxy source.
+
+        :param source: source of the project (file name, string or some object)
+        :param source_type: type of proxy source
+        :param proxy_type: default type of proxy (if proxy source does not provide
+            this information)
+        :param **kwargs: any additional arguments go to the specific proxy load method
+        """
+
+        self.init_kwargs = deepcopy(kwargs)
+
+        try:
+            source_class = SOURCE_LIST[source_type]
+        except AttributeError:
+            raise GrabMisuseError('Unknown proxy source type: %s' % source_type)
+        self.source = source_class(source, proxy_type=proxy_type, **kwargs)
+        self.source.load()
+
+    def get_random(self):
+        """
+        Return random server from the list
+        """
+
+        self.source.reload()
+        return choice(self.source.server_list)
+
+    def get_next(self):
+        """
+        Return next server in the list.
+        """
+
+        logging.debug('Changing proxy')
+        self.source.reload()
+        return self.source.server_list_iterator.next()

test/proxy_feature.py

 from util import (FakeServerThread, RESPONSE, REQUEST, FAKE_SERVER_PORT,
                   GRAB_TRANSPORT)
 
+PORT1 = FAKE_SERVER_PORT
+PORT2 = FAKE_SERVER_PORT + 1
+PORT3 = FAKE_SERVER_PORT + 2
+PROXY1 = 'localhost:%d' % PORT1
+PROXY2 = 'localhost:%d' % PORT2
+PROXY3 = 'localhost:%d' % PORT3
+
 class TestProxy(TestCase):
     def setUp(self):
-        FakeServerThread().start()
+        FakeServerThread(port=PORT1).start()
+        FakeServerThread(port=PORT2).start()
+        FakeServerThread(port=PORT3).start()
 
-    def test_proxy(self):
+    def test_proxy_option(self):
         g = Grab(transport=GRAB_TRANSPORT)
-        proxy = 'localhost:%d' % FAKE_SERVER_PORT 
-        g.setup(proxy=proxy, proxy_type='http')
+
+        g.setup(proxy=PROXY1, proxy_type='http')
         RESPONSE['get'] = '123'
+
         g.go('http://yandex.ru')
         self.assertEqual('123', g.response.body)
         self.assertEqual('yandex.ru', REQUEST['headers']['host'])
 
-    def test_file_proxylist(self):
+    def test_deprecated_setup_proxylist(self):
         g = Grab(transport=GRAB_TRANSPORT)
-        proxy = 'localhost:%d' % FAKE_SERVER_PORT 
-        open('/tmp/__proxy.txt', 'w').write(proxy)
+        open('/tmp/__proxy.txt', 'w').write(PROXY1)
         g.setup_proxylist('/tmp/__proxy.txt', 'http')
         RESPONSE['get'] = '123'
         g.change_proxy()
         self.assertEqual('123', g.response.body)
         self.assertEqual('yandex.ru', REQUEST['headers']['host'])
 
-    def test_memory_proxylist(self):
+    def test_load_proxylist(self):
+        content = '%s\n%s\n%s' % (PROXY1, PROXY2, PROXY3)
+        open('/tmp/__proxy.txt', 'w').write(content)
+
+        # By default auto_change is True
         g = Grab(transport=GRAB_TRANSPORT)
-        server_list = ['localhost:%d' % FAKE_SERVER_PORT]
-        g.setup_proxylist(server_list=server_list, proxy_type='http',
-                          auto_init=True)
-        RESPONSE['get'] = '123'
+        g.load_proxylist('/tmp/__proxy.txt', 'text_file')
+        self.assertEqual(g.config['proxy_auto_change'], True)
+        servers = set()
+        for x in xrange(10):
+            g.go('http://yandex.ru')
+            servers.add(g.config['proxy'])
+
+        self.assertTrue(len(servers) > 1)
+
+        # Disable auto_change
+        # By default auto_init is True
+        g = Grab(transport=GRAB_TRANSPORT)
+        g.load_proxylist('/tmp/__proxy.txt', 'text_file', auto_change=False)
+        self.assertEqual(g.config['proxy_auto_change'], False)
+        servers = set()
+        for x in xrange(10):
+            g.go('http://yandex.ru')
+            servers.add(g.config['proxy'])
+        self.assertEqual(len(servers), 1)
+
+        # Disable auto_change
+        # Disable auto_init
+        # Proxylist will not be used by default
+        g = Grab(transport=GRAB_TRANSPORT)
+        g.load_proxylist('/tmp/__proxy.txt', 'text_file', auto_change=False,
+                         auto_init=False)
+        self.assertEqual(g.config['proxy_auto_change'], False)
         g.go('http://yandex.ru')
-        self.assertEqual('123', g.response.body)
-        self.assertEqual('yandex.ru', REQUEST['headers']['host'])
+        self.assertEqual(g.config['proxy'], None)
+
+    #def test_memory_proxylist(self):
+        #g = Grab(transport=GRAB_TRANSPORT)
+        #server_list = ['localhost:%d' % PORT1]
+        #g.setup_proxylist(server_list=server_list, proxy_type='http',
+                          #auto_init=True)
+        #RESPONSE['get'] = '123'
+        #g.go('http://yandex.ru')
+        #self.assertEqual('123', g.response.body)
+        #self.assertEqual('yandex.ru', REQUEST['headers']['host'])
 
     def test_change_proxy(self):
         g = Grab(transport=GRAB_TRANSPORT)
 
     def test_proxylist_api(self):
         g = Grab(transport=GRAB_TRANSPORT)
-        self.assertRaises(GrabMisuseError,
-                          lambda: g.setup_proxylist(proxy_file='foo', server_list=[]))
+        #self.assertRaises(GrabMisuseError,
+                          #lambda: g.setup_proxylist(proxy_file='foo', server_list=[]))
         self.assertRaises(GrabMisuseError,
                           lambda: g.setup_proxylist(proxy_file=None, server_list=None))
         self.assertRaises(GrabMisuseError,
 
 
 class FakeServerThread(threading.Thread):
-    def __init__(self, *args, **kwargs):
+    def __init__(self, port=FAKE_SERVER_PORT, *args, **kwargs):
         super(FakeServerThread, self).__init__(*args, **kwargs)
         self.daemon = True
+        self.listen_port = port
 
     def start(self):
         super(FakeServerThread, self).start()
                 else:
                     self.wfile.write(RESPONSE['post'])
 
-        server_address = ('localhost', FAKE_SERVER_PORT)
+        server_address = ('localhost', self.listen_port)
         try:
             httpd = HTTPServer(server_address, RequestHandlerClass)
             httpd.serve_forever()