Commits

Robert Brewer committed 0ecd7ed

Fix for #946 (Problem with encoded text in multipart/form-data). Reworked the structure for attempting various charsets when decoding request entities. New 'decode' Tool which is backward-compatible with the one in 3.1.

Comments (0)

Files changed (5)

cherrypy/_cpreqbody.py

+"""Request body processing for CherryPy.
+
+When an HTTP request includes an entity body, it is often desirable to
+provide that information to applications in a form other than the raw bytes.
+Different content types demand different approaches. Examples:
+
+ * For a GIF file, we want the raw bytes in a stream.
+ * An HTML form is better parsed into its component fields, and each text field
+    decoded from bytes to unicode.
+ * A JSON body should be deserialized into a Python dict or list.
+
+When the request contains a Content-Type header, the media type is used as a
+key to look up a value in the 'request.body.processors' dict. If the full media
+type is not found, then the major type is tried; for example, if no processor
+is found for the 'image/jpeg' type, then we look for a processor for the 'image'
+types altogether. If neither the full type nor the major type has a matching
+processor, then a default processor is used (self.default_proc). For most
+types, this means no processing is done, and the body is left unread as a
+raw byte stream. Processors are configurable in an 'on_start_resource' hook.
+
+Some processors, especially those for the 'text' types, attempt to decode bytes
+to unicode. If the Content-Type request header includes a 'charset' parameter,
+this is used to decode the entity. Otherwise, one or more default charsets may
+be attempted, although this decision is up to each processor. If a processor
+successfully decodes an Entity or Part, it should set the 'charset' attribute
+on the Entity or Part to the name of the successful charset, so that
+applications can easily re-encode or transcode the value if they wish.
+
+If the Content-Type of the request entity is of major type 'multipart', then
+the above parsing process, and possibly a decoding process, is performed for
+each part.
+
+For both the full entity and multipart parts, a Content-Disposition header may
+be used to fill .name and .filename attributes on the request.body or the Part.
+"""
+
 import re
 import tempfile
 from urllib import unquote_plus
         # which is valid for the decoded entity-body.
         raise cherrypy.HTTPError(411)
     
-    params = entity.params
     qs = entity.fp.read()
-    for aparam in qs.split('&'):
-        for pair in aparam.split(';'):
-            if not pair:
-                continue
-            
-            atoms = pair.split('=', 1)
-            if len(atoms) == 1:
-                atoms.append('')
-            
-            key = unquote_plus(atoms[0]).decode(entity.encoding)
-            value = unquote_plus(atoms[1]).decode(entity.encoding)
-            
-            if key in params:
-                if not isinstance(params[key], list):
-                    params[key] = [params[key]]
-                params[key].append(value)
-            else:
-                params[key] = value
+    for charset in entity.attempt_charsets:
+        try:
+            params = {}
+            for aparam in qs.split('&'):
+                for pair in aparam.split(';'):
+                    if not pair:
+                        continue
+                    
+                    atoms = pair.split('=', 1)
+                    if len(atoms) == 1:
+                        atoms.append('')
+                    
+                    key = unquote_plus(atoms[0]).decode(charset)
+                    value = unquote_plus(atoms[1]).decode(charset)
+                    
+                    if key in params:
+                        if not isinstance(params[key], list):
+                            params[key] = [params[key]]
+                        params[key].append(value)
+                    else:
+                        params[key] = value
+        except UnicodeDecodeError:
+            pass
+        else:
+            entity.charset = charset
+            break
+    else:
+        raise cherrypy.HTTPError(
+            400, "The request entity could not be decoded. The following "
+            "charsets were attempted: %s" % repr(entity.attempt_charsets))
+        
+    # Now that all values have been successfully parsed and decoded,
+    # apply them to the entity.params dict.
+    for key, value in params.iteritems():
+        if key in entity.params:
+            if not isinstance(entity.params[key], list):
+                entity.params[key] = [entity.params[key]]
+            entity.params[key].append(value)
+        else:
+            entity.params[key] = value
+
 
 def process_multipart(entity):
     """Read all multipart parts into entity.parts."""
     can be sent with various HTTP method verbs). This value is set between
     the 'before_request_body' and 'before_handler' hooks (assuming that
     process_request_body is True)."""
-
+    
     default_content_type = u'application/x-www-form-urlencoded'
     # http://tools.ietf.org/html/rfc2046#section-4.1.2:
     # "The default character set, which must be assumed in the
     # absence of a charset parameter, is US-ASCII."
-    default_text_encoding = u'us-ascii'
-    # For MIME multiparts, if the payload has no charset, leave as bytes.
-    default_encoding = None
-    force_encoding = None
+    # However, many browsers send data in utf-8 with no charset.
+    attempt_charsets = [u'utf-8']
     processors = {u'application/x-www-form-urlencoded': process_urlencoded,
                   u'multipart/form-data': process_multipart_form_data,
                   u'multipart': process_multipart,
             self.content_type = httputil.HeaderElement.from_str(
                 self.default_content_type)
         
-        # Encoding
-        self.encoding = self.best_encoding()
+        # Copy the class 'attempt_charsets', prepending any Content-Type charset
+        dec = self.content_type.params.get(u"charset", None)
+        if dec:
+            dec = dec.decode('ISO-8859-1')
+            self.attempt_charsets = [dec] + [c for c in self.attempt_charsets
+                                             if c != dec]
+        else:
+            self.attempt_charsets = self.attempt_charsets[:]
         
         # Length
         self.length = None
                 if self.filename.startswith(u'"') and self.filename.endswith(u'"'):
                     self.filename = self.filename[1:-1]
     
-    def best_encoding(self):
-        """Return the best encoding based on Content-Type (and defaults)."""
-        if self.force_encoding:
-            return self.force_encoding
-        
-        encoding = self.content_type.params.get(u"charset", None)
-        if not encoding:
-            ct = self.content_type.value.lower()
-            if ct.lower().startswith(u"text/"):
-                return self.default_text_encoding
-        
-        return encoding or self.default_encoding
-    
     def read(self, size=None, fp=None):
         return self.fp.read(size, fp)
     
             self.default_proc()
         else:
             proc(self)
-
     
     def default_proc(self):
         # Leave the fp alone for someone else to read. This works fine
 class Part(Entity):
     """A MIME part entity, part of a multipart entity."""
     
-    default_content_type = u'text/plain; charset=us-ascii'
+    default_content_type = u'text/plain'
     # "The default character set, which must be assumed in the absence of a
     # charset parameter, is US-ASCII."
-    default_encoding = u'us-ascii'
+    attempt_charsets = [u'us-ascii', u'utf-8']
     # This is the default in stdlib cgi. We may want to increase it.
     maxrambytes = 1000
     
         
         if fp is None:
             result = ''.join(lines)
-            if self.encoding is not None:
-                result = result.decode(self.encoding)
-            return result
+            for charset in self.attempt_charsets:
+                try:
+                    result = result.decode(charset)
+                except UnicodeDecodeError:
+                    pass
+                else:
+                    self.charset = charset
+                    return result
+            else:
+                raise cherrypy.HTTPError(
+                    400, "The request entity could not be decoded. The following "
+                    "charsets were attempted: %s" % repr(self.attempt_charsets))
         else:
             fp.seek(0)
             return fp
     # a Content-Type header. See http://www.cherrypy.org/ticket/790
     default_content_type = u''
     
-    default_encoding = u'utf-8'
-    # http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.7.1
-    # When no explicit charset parameter is provided by the
-    # sender, media subtypes of the "text" type are defined
-    # to have a default charset value of "ISO-8859-1" when
-    # received via HTTP.
-    default_text_encoding = u'ISO-8859-1'
     bufsize = 8 * 1024
     maxbytes = None
     
     def __init__(self, fp, headers, params=None, request_params=None):
         Entity.__init__(self, fp, headers, params)
+        
+        # http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.7.1
+        # When no explicit charset parameter is provided by the
+        # sender, media subtypes of the "text" type are defined
+        # to have a default charset value of "ISO-8859-1" when
+        # received via HTTP.
+        if self.content_type.value.startswith('text/'):
+            for c in (u'ISO-8859-1', u'iso-8859-1', u'Latin-1', u'latin-1'):
+                if c in self.attempt_charsets:
+                    break
+            else:
+                self.attempt_charsets.append(u'ISO-8859-1')
+        
         self.fp = SizedReader(self.fp, self.length,
                               self.maxbytes, bufsize=self.bufsize)
         # Temporary fix while deprecating passing .parts as .params.

cherrypy/_cprequest.py

 
 def request_namespace(k, v):
     """Attach request attributes declared in config."""
-    setattr(cherrypy.serving.request, k, v)
+    # Provides config entries to set request.body attrs (like attempt_charsets).
+    if k[:5] == 'body.':
+        setattr(cherrypy.serving.request.body, k[5:], v)
+    else:
+        setattr(cherrypy.serving.request, k, v)
 
 def response_namespace(k, v):
     """Attach response attributes declared in config."""

cherrypy/_cptools.py

 _d.log_hooks = Tool('on_end_request', cptools.log_hooks, priority=100)
 _d.err_redirect = ErrorTool(cptools.redirect)
 _d.etags = Tool('before_finalize', cptools.validate_etags, priority=75)
+_d.decode = Tool('before_request_body', encoding.decode)
 # the order of encoding, gzip, caching is important
 _d.encode = Tool('before_handler', encoding.ResponseEncoder, priority=70)
 _d.gzip = Tool('before_finalize', encoding.gzip, priority=80)

cherrypy/lib/encoding.py

 from cherrypy.lib import set_vary_header
 
 
+def decode(encoding=None, default_encoding='utf-8'):
+    """Replace or extend the list of charsets used to decode a request entity.
+    
+    Either argument may be a single string or a list of strings.
+    
+    encoding: If not None, restricts the set of charsets attempted while decoding
+    a request entity to the given set (even if a different charset is given in
+    the Content-Type request header).
+    
+    default_encoding: Only in effect if the 'encoding' argument is not given.
+    If given, the set of charsets attempted while decoding a request entity is
+    *extended* with the given value(s).
+    """
+    body = cherrypy.request.body
+    if encoding is not None:
+        if not isinstance(encoding, list):
+            encoding = [encoding]
+        body.attempt_charsets = encoding
+    elif default_encoding:
+        if not isinstance(default_encoding, list):
+            default_encoding = [default_encoding]
+        body.attempt_charsets = body.attempt_charsets + default_encoding
+
+
 class ResponseEncoder:
     
     default_encoding = 'utf-8'

cherrypy/test/test_encoding.py

         noshow_stream.exposed = True
         noshow_stream._cp_config = {'response.stream': True}
     
+    class Decode:
+        def extra_charset(self, *args, **kwargs):
+            return repr(cherrypy.request.params)
+        extra_charset.exposed = True
+        extra_charset._cp_config = {
+            'tools.decode.on': True,
+            'tools.decode.default_encoding': [u'utf-16'],
+            }
+        
+        def force_charset(self, *args, **kwargs):
+            return repr(cherrypy.request.params)
+        force_charset.exposed = True
+        force_charset._cp_config = {
+            'tools.decode.on': True,
+            'tools.decode.encoding': u'utf-16',
+            }
+    
     root = Root()
     root.gzip = GZIP()
+    root.decode = Decode()
     cherrypy.tree.mount(root, config={'/gzip': {'tools.gzip.on': True}})
 
 
 
 class EncodingTests(helper.CPWebCase):
     
-    def testDecoding(self):
+    def test_query_string_decoding(self):
         europoundUtf8 = europoundUnicode.encode('utf-8')
         self.getPage('/?param=' + europoundUtf8)
         self.assertBody(europoundUtf8)
             "The given query string could not be processed. Query "
             "strings for this resource must be encoded with 'utf8'.")
     
+    def test_urlencoded_decoding(self):
+        # Test the decoding of an application/x-www-form-urlencoded entity.
+        europoundUtf8 = europoundUnicode.encode('utf-8')
+        body="param=" + europoundUtf8
+        self.getPage('/', method='POST',
+                     headers=[("Content-Type", "application/x-www-form-urlencoded"),
+                              ("Content-Length", str(len(body))),
+                              ],
+                     body=body),
+        self.assertBody(europoundUtf8)
+        
+        # Encoded utf8 entities MUST be parsed and decoded correctly.
+        # Here, q is the POUND SIGN U+00A3 encoded in utf8
+        body = "q=\xc2\xa3"
+        self.getPage('/reqparams', method='POST',
+                     headers=[("Content-Type", "application/x-www-form-urlencoded"),
+                              ("Content-Length", str(len(body))),
+                              ],
+                     body=body),
+        self.assertBody(r"{'q': u'\xa3'}")
+        
+        # ...and in utf16, which is not in the default attempt_charsets list:
+        body = "\xff\xfeq\x00=\xff\xfe\xa3\x00"
+        self.getPage('/reqparams', method='POST',
+                     headers=[("Content-Type", "application/x-www-form-urlencoded;charset=utf-16"),
+                              ("Content-Length", str(len(body))),
+                              ],
+                     body=body),
+        self.assertBody(r"{'q': u'\xa3'}")
+        
+        # Entities that are incorrectly encoded MUST raise 400.
+        # Here, q is the POUND SIGN U+00A3 encoded in utf16, but
+        # the Content-Type incorrectly labels it utf-8.
+        body = "\xff\xfeq\x00=\xff\xfe\xa3\x00"
+        self.getPage('/reqparams', method='POST',
+                     headers=[("Content-Type", "application/x-www-form-urlencoded;charset=utf-8"),
+                              ("Content-Length", str(len(body))),
+                              ],
+                     body=body),
+        self.assertStatus(400)
+        self.assertErrorPage(400, 
+            "The request entity could not be decoded. The following charsets "
+            "were attempted: [u'utf-8']")
+    
+    def test_decode_tool(self):
+        # An extra charset should be tried first, and succeed if it matches.
+        # Here, we add utf-16 as a charset and pass a utf-16 body.
+        body = "\xff\xfeq\x00=\xff\xfe\xa3\x00"
+        self.getPage('/decode/extra_charset', method='POST',
+                     headers=[("Content-Type", "application/x-www-form-urlencoded"),
+                              ("Content-Length", str(len(body))),
+                              ],
+                     body=body),
+        self.assertBody(r"{'q': u'\xa3'}")
+        
+        # An extra charset should be tried first, and continue to other default
+        # charsets if it doesn't match.
+        # Here, we add utf-16 as a charset but still pass a utf-8 body.
+        body = "q=\xc2\xa3"
+        self.getPage('/decode/extra_charset', method='POST',
+                     headers=[("Content-Type", "application/x-www-form-urlencoded"),
+                              ("Content-Length", str(len(body))),
+                              ],
+                     body=body),
+        self.assertBody(r"{'q': u'\xa3'}")
+        
+        # An extra charset should error if force is True and it doesn't match.
+        # Here, we force utf-16 as a charset but still pass a utf-8 body.
+        body = "q=\xc2\xa3"
+        self.getPage('/decode/force_charset', method='POST',
+                     headers=[("Content-Type", "application/x-www-form-urlencoded"),
+                              ("Content-Length", str(len(body))),
+                              ],
+                     body=body),
+        self.assertErrorPage(400, 
+            "The request entity could not be decoded. The following charsets "
+            "were attempted: [u'utf-16']")
+    
+    def test_multipart_decoding(self):
+        # Test the decoding of a multipart entity when the charset (utf16) is
+        # explicitly given.
+        body='\r\n'.join(['--X',
+                          'Content-Type: text/plain;charset=utf-16',
+                          'Content-Disposition: form-data; name="text"',
+                          '',
+                          '\xff\xfea\x00b\x00\x1c c\x00',
+                          '--X',
+                          'Content-Type: text/plain;charset=utf-16',
+                          'Content-Disposition: form-data; name="submit"',
+                          '',
+                          '\xff\xfeC\x00r\x00e\x00a\x00t\x00e\x00',
+                          '--X--'])
+        self.getPage('/reqparams', method='POST',
+                     headers=[("Content-Type", "multipart/form-data;boundary=X"),
+                              ("Content-Length", str(len(body))),
+                              ],
+                     body=body),
+        self.assertBody(r"{'text': u'ab\u201cc', 'submit': u'Create'}")
+    
+    def test_multipart_decoding_no_charset(self):
+        # Test the decoding of a multipart entity when the charset (utf8) is
+        # NOT explicitly given, but is in the list of charsets to attempt.
+        body='\r\n'.join(['--X',
+                          'Content-Disposition: form-data; name="text"',
+                          '',
+                          '\xe2\x80\x9c',
+                          '--X',
+                          'Content-Disposition: form-data; name="submit"',
+                          '',
+                          'Create',
+                          '--X--'])
+        self.getPage('/reqparams', method='POST',
+                     headers=[("Content-Type", "multipart/form-data;boundary=X"),
+                              ("Content-Length", str(len(body))),
+                              ],
+                     body=body),
+        self.assertBody(r"{'text': u'\u201c', 'submit': u'Create'}")
+    
+    def test_multipart_decoding_no_successful_charset(self):
+        # Test the decoding of a multipart entity when the charset (utf16) is
+        # NOT explicitly given, and is NOT in the list of charsets to attempt.
+        body='\r\n'.join(['--X',
+                          'Content-Disposition: form-data; name="text"',
+                          '',
+                          '\xff\xfea\x00b\x00\x1c c\x00',
+                          '--X',
+                          'Content-Disposition: form-data; name="submit"',
+                          '',
+                          '\xff\xfeC\x00r\x00e\x00a\x00t\x00e\x00',
+                          '--X--'])
+        self.getPage('/reqparams', method='POST',
+                     headers=[("Content-Type", "multipart/form-data;boundary=X"),
+                              ("Content-Length", str(len(body))),
+                              ],
+                     body=body),
+        self.assertStatus(400)
+        self.assertErrorPage(400, 
+            "The request entity could not be decoded. The following charsets "
+            "were attempted: [u'us-ascii', u'utf-8']")
+    
     def testEncoding(self):
         # Default encoding should be utf-8
         self.getPage('/mao_zedong')