Commits

Gregory Petukhov committed d8e3913

Move http-headers functions into grab.tools.http module. Update documentation

  • Participants
  • Parent commits 019fad6

Comments (0)

Files changed (12)

File docs/api/base.rst

     .. automethod:: fake_response
     .. automethod:: setup_proxylist
     .. automethod:: change_proxy
-    .. automethod:: urlencode
     .. automethod:: make_url_absolute
     .. automethod:: detect_request_method
     .. automethod:: clear_cookies

File docs/api/tools.rst

 
 .. automodule:: grab.tools.text
     :members:
+
+grab.tools.http
+===============
+
+.. automodule:: grab.tools.http
+    :members:

File docs/api/upload.rst

+.. _upload:
+
+===========
+grab.upload
+===========
+
+.. automodule:: grab.upload
+    :members:

File docs/grab/options.rst

 url
 ---
 
-Сетевой запрашиваемого документа. Можно использовать относительный адрес, в таком
+Сетевой адрес запрашиваемого документа. Можно использовать относительный адрес, в таком
 случае полный адрес будет получен путём соединения с полным адресом предыдущего
 сетевого запроса. Grab ожидает адрес в корректном формате. Это ваша обязанность -
 преобразовать все нестандартные символы в escape-последовательности (`RFC 2396 <http://www.ietf.org/rfc/rfc2396.txt>`_).

File docs/grab/tools.rst

 * :func:`text.find_number` - поиск числа в строке
 * :func:`text.drop_space` - удаление *всех* пробелов в строке
 * :func:`text.normalize_space` - удаление начальных и конечных пробелов, приведение последовательности пробелов к одному пробелу.
+
+Работа с http-заголовками
+=========================
+
+* :func:`http.urlencode` - сериализация словаря или списка пар в строку, которую можно отправить в GET или POST-запросе. В отличие от стандартного `urllib.urlencode` может обрабатывать unicode, None и :class:`grab.upload.UploadFile` объекты.

File docs/index.rst

 
     api/tools_html
     api/tools
-    
-
+    api/tools_http
 
 
 Похожие проекты

File grab/base.py

 
 from error import (GrabError, GrabNetworkError, GrabMisuseError, DataNotFound,
                    GrabTimeoutError)
+from upload import UploadContent, UploadFile
+from tools.http import normalize_http_values
 
 # This counter will used in enumerating network queries.
 # Its value will be displayed in logging messages and also used
 
 logger = logging.getLogger('grab')
 
-class UploadContent(str):
-    """
-    TODO: docstring
-    """
-
-    def __new__(cls, value):
-        obj = str.__new__(cls, 'xxx')
-        obj.raw_value = value
-        return obj
-
-    def field_tuple(self):
-        # TODO: move to transport extension
-        import pycurl
-        return (pycurl.FORM_CONTENTS, self.raw_value)
-
-
-class UploadFile(str):
-    """
-    TODO: docstring
-    """
-
-    def __new__(cls, path):
-        obj = str.__new__(cls, 'xxx')
-        obj.path = path
-        return obj
-
-    def field_tuple(self):
-        # move to transport extension
-        import pycurl
-        return (pycurl.FORM_FILE, self.path)
-
-
 def default_config():
     return dict(
         # Common
                 if isinstance(post, basestring):
                     post = post[:150] + '...'
                 else:
-                    post = self.normalize_http_values(post, charset='utf-8')
+                    post = normalize_http_values(post, charset='utf-8')
                     items = sorted(post, key=lambda x: x[0])
                     new_items = []
                     for key, value in items:
                 self.request_counter, tname, fext))
             self.response.save(fname)
 
-    def urlencode(self, items):
-        """
-        Convert sequence of items into bytestring which could be submitted
-        in POST or GET request.
-
-        It differs from ``urllib.urlencode`` in that it can process unicode
-        and some special values.
-
-        ``items`` could dict or tuple or list.
-        """
-
-        if isinstance(items, dict):
-            items = items.items()
-        return urllib.urlencode(self.normalize_http_values(items))
-
-
-    def encode_cookies(self, items, join=True):
-        """
-        Serialize dict or sequence of two-element items into string suitable
-        for sending in Cookie http header.
-        """
-
-        def encode(val):
-            """
-            URL-encode special characters in the text.
-
-            In cookie value only ",", " ", "\t" and ";" should be encoded
-            """
-
-            return val.replace(' ', '%20').replace('\t', '%09')\
-                      .replace(';', '%3B').replace(',', '%2C')
-
-        if isinstance(items, dict):
-            items = items.items()
-        items = self.normalize_http_values(items)
-        tokens = []
-        for key, value in items:
-            tokens.append('%s=%s' % (encode(key), encode(value)))
-        if join:
-            return '; '.join(tokens)
-        else:
-            return tokens
-
-
-    def normalize_http_values(self, items, charset=None):
-        """
-        Accept sequence of (key, value) paris or dict and convert each
-        value into bytestring.
-
-        Unicode is converted into bytestring using charset of previous response
-        (or utf-8, if no requests were performed)
-
-        None is converted into empty string. 
-
-        Instances of ``UploadContent`` or ``UploadFile`` is converted
-        into special pycurl objects.
-        """
-
-        if isinstance(items, dict):
-            items = items.items()
-
-        def process(item):
-            key, value = item
-
-            # normalize value
-            if isinstance(value, (UploadContent, UploadFile)):
-                value = value.field_tuple()
-            elif isinstance(value, unicode):
-                value = self.normalize_unicode(value, charset=charset)
-            elif value is None:
-                value = ''
-
-            # normalize key
-            if isinstance(key, unicode):
-                key = self.normalize_unicode(key, charset=charset)
-
-            return key, value
-
-        items =  map(process, items)
-        items = sorted(items, key=lambda x: x[0])
-        return items
-
-    def normalize_unicode(self, value, charset=None):
-        """
-        Convert unicode into byte-string using detected charset (default or from
-        previous response)
-
-        By default, charset from previous response is used to encode unicode into
-        byte-string but you can enforce charset with ``charset`` option
-        """
-
-        if not isinstance(value, unicode):
-            raise GrabMisuseError('normalize_unicode method accepts only unicode values')
-        return value.encode(self.charset if charset is None else charset, 'ignore')
-
     def make_url_absolute(self, url, resolve_base=False):
         """
         Make url absolute using previous request url as base url.

File grab/tools/http.py

+import urllib
+
+from ..base import UploadFile, UploadContent
+from ..error import GrabMisuseError
+
+def urlencode(items):
+    """
+    Convert sequence of items into bytestring which could be submitted
+    in POST or GET request.
+
+    It differs from ``urllib.urlencode`` in that it can process unicode
+    and some special values.
+
+    ``items`` could be a dict, tuple or list.
+    """
+
+    if isinstance(items, dict):
+        items = items.items()
+    return urllib.urlencode(normalize_http_values(items))
+
+
+def encode_cookies(items, join=True):
+    """
+    Serialize dict or sequence of two-element items into string suitable
+    for sending in Cookie http header.
+    """
+
+    def encode(val):
+        """
+        URL-encode special characters in the text.
+
+        In cookie value only ",", " ", "\t" and ";" should be encoded
+        """
+
+        return val.replace(' ', '%20').replace('\t', '%09')\
+                  .replace(';', '%3B').replace(',', '%2C')
+
+    if isinstance(items, dict):
+        items = items.items()
+    items = normalize_http_values(items)
+    tokens = []
+    for key, value in items:
+        tokens.append('%s=%s' % (encode(key), encode(value)))
+    if join:
+        return '; '.join(tokens)
+    else:
+        return tokens
+
+
+def normalize_http_values(items, charset='utf-8'):
+    """
+    Accept sequence of (key, value) pairs or dict and convert each
+    value into bytestring.
+
+    Unicode is converted into bytestring using the ``charset`` argument
+    (utf-8 by default).
+
+    None is converted into empty string. 
+
+    Instances of ``UploadContent`` or ``UploadFile`` are converted
+    into special pycurl objects.
+    """
+
+    if isinstance(items, dict):
+        items = items.items()
+
+    def process(item):
+        key, value = item
+
+        # normalize value
+        if isinstance(value, (UploadContent, UploadFile)):
+            value = value.field_tuple()
+        elif isinstance(value, unicode):
+            value = normalize_unicode(value, charset=charset)
+        elif value is None:
+            value = ''
+
+        # normalize key
+        if isinstance(key, unicode):
+            key = normalize_unicode(key, charset=charset)
+
+        return key, value
+
+    items =  map(process, items)
+    items = sorted(items, key=lambda x: x[0])
+    return items
+
+
+def normalize_unicode(value, charset='utf-8'):
+    """
+    Convert unicode into byte-string using the ``charset`` argument.
+
+    By default, utf-8 is used to encode unicode into
+    byte-string but you can enforce another charset with ``charset`` option
+    """
+
+    if not isinstance(value, unicode):
+        raise GrabMisuseError('normalize_unicode function accepts only unicode values')
+    return value.encode(charset, 'ignore')

File grab/transport/curl.py

 
 from ..base import (GrabError, GrabMisuseError, UploadContent, UploadFile,
                     GrabTimeoutError, GrabNetworkError)
+from ..tools.http import encode_cookies, urlencode, normalize_unicode,\
+                         normalize_http_values
 
 logger = logging.getLogger('grab')
 
             if self.config['multipart_post']:
                 if isinstance(self.config['multipart_post'], basestring):
                     raise GrabMisuseError('multipart_post option could not be a string')
-                post_items = self.normalize_http_values(self.config['multipart_post'])
+                post_items = normalize_http_values(self.config['multipart_post'],
+                                                   charset=self.charset)
                 self.curl.setopt(pycurl.HTTPPOST, post_items) 
             elif self.config['post']:
                 if isinstance(self.config['post'], basestring):
                     # bytes-string should be posted as-is
                     # unicode should be converted into byte-string
                     if isinstance(self.config['post'], unicode):
-                        post_data = self.normalize_unicode(self.config['post'])
+                        post_data = normalize_unicode(self.config['post'])
                     else:
                         post_data = self.config['post']
                 else:
                     # dict, tuple, list should be serialized into byte-string
-                    post_data = self.urlencode(self.config['post'])
+                    post_data = urlencode(self.config['post'])
                 self.curl.setopt(pycurl.POSTFIELDS, post_data)
         elif self.request_method == 'PUT':
             self.curl.setopt(pycurl.PUT, 1)
         if self.config['cookies']:
             if not isinstance(self.config['cookies'], dict):
                 raise GrabMisuseError('cookies option shuld be a dict')
-            items = self.encode_cookies(self.config['cookies'], join=False)
+            items = encode_cookies(self.config['cookies'], join=False)
             self.curl.setopt(pycurl.COOKIELIST, 'ALL')
             for item in items:
                 self.curl.setopt(pycurl.COOKIELIST, 'Set-Cookie: %s' % item)

File grab/transport/requests.py

                 raise NotImplementedError
                 #if isinstance(self.config['multipart_post'], basestring):
                     #raise GrabMisuseError('multipart_post option could not be a string')
-                #post_items = self.normalize_http_values(self.config['multipart_post'])
+                #post_items = self.normalize_http_values(self.config['multipart_post'],
+                                                         #charset=self.charset)
                 #self.curl.setopt(pycurl.HTTPPOST, post_items) 
             elif self.config['post']:
                 if isinstance(self.config['post'], basestring):

File grab/transport/selenium.py

             #if self.config['multipart_post']:
                 #if not isinstance(self.config['multipart_post'], (list, tuple)):
                     #raise GrabMisuseError('multipart_post should be tuple or list, not dict')
-                #post_items = self.normalize_http_values(self.config['multipart_post'])
+                #post_items = self.normalize_http_values(self.config['multipart_post'], charset=self.charset)
                 #self.curl.setopt(pycurl.HTTPPOST, post_items) 
             #elif self.config['post']:
                 #if isinstance(self.config['post'], basestring):

File grab/upload.py

+class UploadContent(str):
+    """
+    TODO: docstring
+    """
+
+    def __new__(cls, value):
+        obj = str.__new__(cls, 'xxx')
+        obj.raw_value = value
+        return obj
+
+    def field_tuple(self):
+        # TODO: move to transport extension
+        import pycurl
+        return (pycurl.FORM_CONTENTS, self.raw_value)
+
+
+class UploadFile(str):
+    """
+    TODO: docstring
+    """
+
+    def __new__(cls, path):
+        obj = str.__new__(cls, 'xxx')
+        obj.path = path
+        return obj
+
+    def field_tuple(self):
+        # move to transport extension
+        import pycurl
+        return (pycurl.FORM_FILE, self.path)
+
+