Commits

Gregory Petukhov committed c49e1c8

Issue #88: add correct processing of infinite meta refresh redirect

  • Participants
  • Parent commits f729a51

Comments (0)

Files changed (9)

File grab/base.py

         # Redirects
         follow_refresh = False,
         follow_location = True,
+        refresh_redirect_count = 0,
         redirect_limit = 10,
 
         # Authentication
 
         # It's important to delete old POST data after request is performed.
         # If POST data is not cleared then next request will try to use them again!
-
+        old_refresh_count = self.config['refresh_redirect_count']
         self.reset_temporary_options()
 
         if prepare_response_func:
         # TODO: check max redirect count
         if self.config['follow_refresh']:
             url = find_refresh_url(self.response.unicode_body())
-            if url:
-                return self.request(url=url)
+            print 'URL', url
+            if url is not None:
+                inc_count = old_refresh_count + 1
+                if inc_count > self.config['redirect_limit']:
+                    raise error.GrabTooManyRedirectsError()
+                else:
+                    print inc_count
+                    return self.request(url=url, refresh_redirect_count=inc_count)
 
         return None
 
         self.config['multipart_post'] = None
         self.config['method'] = None
         self.config['body_storage_filename'] = None
+        self.config['refresh_redirect_count'] = 0
 
     def save_failed_dump(self):
         """

File grab/error.py

 
 class GrabConnectionError(GrabError):
     """
-    Raised then it is not possible to establish network connection.
+    Raised when it is not possible to establish network connection.
 
     In curl transport it is CURLE_COULDNT_CONNECT (7)
     """
 
+
 class GrabAuthError(GrabError):
     """
-    Raised then remote server denies authentication credentials.
+    Raised when remote server denies authentication credentials.
 
     In curl transport it is CURLE_LOGIN_DENIED (67)
     """
+
+
+class GrabTooManyRedirectsError(GrabError):
+    """
+    Raised when Grab reached max. allowed number of redirects for
+    one request.
+    """

File grab/tools/html.py

 
 RE_TAG = re.compile(r'<[^>]+>')
 RE_REFRESH_TAG = re.compile(r'<meta[^>]+http-equiv\s*=\s*["\']*Refresh[^>]+', re.I)
-RE_REFRESH_URL = re.compile(r'url=["\']*([^\'"> ]+)', re.I)
+RE_REFRESH_URL = re.compile(r'''
+    content \s* = \s*
+    ["\']* \d+
+    (?:;\s*url=)? ["\']* ([^\'"> ]*)
+''', re.I | re.X)
+
 RE_ENTITY = re.compile(r'(&[a-z]+;)')
 RE_NUM_ENTITY = re.compile(r'(&#\d+;)')
 RE_BASE_URL = re.compile(r'<base[^>]+href\s*=["\']*([^\'"> ]+)', re.I)

File grab/transport/curl.py

                     raise error.GrabConnectionError(ex[0], ex[1])
                 elif ex[0] == 67:
                     raise error.GrabAuthError(ex[0], ex[1])
+                elif ex[0] == 47:
+                    raise error.GrabTooManyRedirectsError(ex[0], ex[1])
                 else:
                     raise error.GrabNetworkError(ex[0], ex[1])
 
     'test.tornado_server',
     # *** grab.tools
     'test.text_tools',
+    'test.tools_html',
     'test.lxml_tools',
     'test.tools_account',
     'test.tools_control',

File test/base_interface.py

         self.assertRaises(GrabMisuseError,
             lambda: g.setup(save_the_word=True))
 
-    @only_transport('curl.CurlTransport')
+    @only_transport('grab.transport.curl.CurlTransport')
     def test_empty_useragent_pycurl(self):
         g = Grab(transport=GRAB_TRANSPORT)
 
         self.assertEqual(blocks[0], porno.strip())
         #self.assertEqual(blocks[1], redis.strip())
 
-    def test_meta_refresh_redirect(self):
-        # By default meta-redirect is off
-        url = SERVER.BASE_URL + '/foo'
-        SERVER.RESPONSE_ONCE['get'] = '<meta http-equiv="refresh" content="5; url=%s">' % url
-        g = Grab(transport=GRAB_TRANSPORT)
-        g.go(SERVER.BASE_URL + '/')
-        self.assertEqual(SERVER.REQUEST['path'], '/')
-        self.assertEqual(g.response.url, SERVER.BASE_URL + '/')
-
-        # Now test meta-auto-redirect
-        SERVER.RESPONSE_ONCE['get'] = '<meta http-equiv="refresh" content="5; url=%s">' % url
-        g = Grab(transport=GRAB_TRANSPORT)
-        g.setup(follow_refresh=True)
-        g.go(SERVER.BASE_URL)
-        self.assertEqual(SERVER.REQUEST['path'], '/foo')
-        self.assertEqual(g.response.url, SERVER.BASE_URL + '/foo')
-
-    @only_transport('curl.CurlTransport')
-    def test_redirect_limit(self):
-        class Scope(object):
-            counter = None
-
-            def callback(self, server):
-                if self.counter:
-                    server.set_status(301)
-                    server.set_header('Location', SERVER.BASE_URL)
-                else:
-                    server.set_status(200)
-                self.counter -= 1
-
-        scope = Scope()
-        scope.counter = 10
-        g = Grab(transport=GRAB_TRANSPORT)
-        g.setup(redirect_limit=5)
-
-        SERVER.RESPONSE['get_callback'] = scope.callback
-
-        try:
-            try:
-                g.go(SERVER.BASE_URL)
-            except Exception, ex:
-                pass
-            self.assert_(ex is not None)
-            self.assertEqual(ex.errno, 47)
-
-            scope.counter = 10
-            g.setup(redirect_limit=20)
-
-            try:
-                g.go(SERVER.BASE_URL)
-            except Exception, ex:
-                pass
-            else:
-                ex = None
-            self.assert_(ex is None)
-
-        finally:
-            # Clean up test environment
-            SERVER.RESPONSE['get_callback'] = None
-
     def test_default_content_for_fake_response(self):
         content = '<strong>test</strong>'
         g = Grab(content)

File test/grab_redirect.py

+from unittest import TestCase
+
+from grab import Grab
+from grab.error import GrabTooManyRedirectsError
+from .tornado_util import SERVER
+from .util import GRAB_TRANSPORT, only_transport
+
+class RedirectController(object):
+    def __init__(self, counter):
+        self.setup_counter(counter)
+
+    def setup_counter(self, counter):
+        self.counter = counter
+
+    def request_handler(self, server):
+        if self.counter:
+            server.set_status(301)
+            server.set_header('Location', SERVER.BASE_URL)
+        else:
+            server.set_status(200)
+        self.counter -= 1
+
+
+class RefreshRedirectController(RedirectController):
+    def request_handler(self, server):
+        server.set_status(200)
+        if self.counter:
+            server.write('<html><head><meta http-equiv="refresh" content="5"></head>')
+        else:
+            server.write('OK')
+        self.counter -= 1
+
+
+class GrabRedirectTestCase(TestCase):
+    def setUp(self):
+        SERVER.reset()
+
+    def test_meta_refresh_redirect(self):
+        # By default meta-redirect is off
+        meta_url = SERVER.BASE_URL + '/foo'
+
+        SERVER.RESPONSE_ONCE['get'] = '<meta http-equiv="refresh" content="5; url=%s">' % meta_url
+        g = Grab(transport=GRAB_TRANSPORT)
+        g.go(SERVER.BASE_URL + '/')
+        self.assertEqual(SERVER.REQUEST['path'], '/')
+        self.assertEqual(g.response.url, SERVER.BASE_URL + '/')
+
+        # Now test meta-auto-redirect
+        SERVER.RESPONSE_ONCE['get'] = '<meta http-equiv="refresh" content="5; url=%s">' % meta_url
+        g = Grab(transport=GRAB_TRANSPORT)
+        g.setup(follow_refresh=True)
+        g.go(SERVER.BASE_URL)
+        self.assertEqual(SERVER.REQUEST['path'], '/foo')
+        self.assertEqual(g.response.url, meta_url)
+
+    @only_transport('grab.transport.curl.CurlTransport')
+    def test_redirect_limit(self):
+        ctl = RedirectController(10)
+        SERVER.RESPONSE['get_callback'] = ctl.request_handler
+
+        g = Grab(transport=GRAB_TRANSPORT)
+        g.setup(redirect_limit=5)
+
+        self.assertRaises(GrabTooManyRedirectsError,
+                          lambda: g.go(SERVER.BASE_URL))
+
+        ctl.setup_counter(10)
+        g.setup(redirect_limit=20)
+        g.go(SERVER.BASE_URL)
+
+    @only_transport('grab.transport.curl.CurlTransport')
+    def test_refresh_redirect_limit(self):
+        ctl = RefreshRedirectController(10)
+        SERVER.RESPONSE['get_callback'] = ctl.request_handler
+
+        g = Grab(transport=GRAB_TRANSPORT)
+        g.setup(redirect_limit=5, follow_refresh=False)
+        g.go(SERVER.BASE_URL)
+
+        ctl.setup_counter(10)
+        g.setup(redirect_limit=5, follow_refresh=True)
+        self.assertRaises(GrabTooManyRedirectsError,
+                          lambda: g.go(SERVER.BASE_URL))

File test/tools_html.py

+# coding: utf-8
+from unittest import TestCase
+from grab.tools.html import find_refresh_url
+
+class HtmlToolsTestCase(TestCase):
+
+    def test_find_refresh_url(self):
+        url = find_refresh_url("""
+            <meta http-equiv="refresh" content="5">
+        """)
+        self.assertEqual('', url)
+
+        url = find_refresh_url("""
+            <meta http-equiv="refresh" content="5;URL='http://example.com/'">
+        """)
+        self.assertEqual('http://example.com/', url)
+
+        url = find_refresh_url("""
+            <meta http-equiv="refresh" content="0;URL='http://example.com/'">
+        """)
+        self.assertEqual('http://example.com/', url)
+
+        url = find_refresh_url("""
+            <meta http-equiv="refresh" content="5; url=http://example.com/">
+        """)
+        self.assertEqual('http://example.com/', url)

File test/util.py

 import os
 import shutil
 import tempfile
+import functools
 
 TEST_DIR = os.path.dirname(os.path.realpath(__file__))
 
     """
 
     def wrapper(func):
+        @functools.wraps(func)
         def test_method(*args, **kwargs):
             if GRAB_TRANSPORT == transport:
                 return
     """
 
     def wrapper(func):
+        @functools.wraps(func)
         def test_method(*args, **kwargs):
             if GRAB_TRANSPORT == transport:
                 func(*args, **kwargs)