Commits

benoitc  committed 99d8185

Better detection of encoding (should be more reliable) by using some
code from feedparser. While I'm here, fix some errors spotted by namlook.

  • Participants
  • Parent commits d1f952c

Comments (0)

Files changed (4)

File docs/whatsnew.txt

 Changes
 =======
 
+1.3.1
+-----
+
+- Better detection of content encoding
+- Make sure we keep the trailing slash
+
 1.3
 ---
 

File restclient/rest.py

 # ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 # OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 #
+# _getCharacterEncoding from Feedparser under BSD License :
+#
+# Copyright (c) 2002-2006, Mark Pilgrim, All rights reserved.
+# 
+# Redistribution and use in source and binary forms, with or without modification,
+# are permitted provided that the following conditions are met:
+# 
+# * Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# 
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+
 
 """
 restclient.rest
 import types
 import urllib
 
+try:
+    import chardet
+except ImportError:
+    chardet = False
+
 from restclient.errors import *
 from restclient.transport import getDefaultHTTPTransport, HTTPTransportBase
 from restclient.utils import to_bytestring
         self.status  = status_code = resp.status
         self.response = resp
         
+        
         if status_code >= 400:
             if status_code == 404:
                 raise ResourceNotFound(data, http_code=404, response=resp)
                 raise RequestFailed(data, http_code=status_code,
                     response=resp)
 
-        try:
-            return data.decode('utf-8')
-        except UnicodeDecodeError:
-            pass    
+        # determine character encoding
+        true_encoding, http_encoding, xml_encoding, sniffed_xml_encoding, \
+        acceptable_content_type = _getCharacterEncoding(resp, data)
+        
+
+        tried_encodings = []
+        # try: HTTP encoding, declared XML encoding, encoding sniffed from BOM
+        for proposed_encoding in (true_encoding, xml_encoding, sniffed_xml_encoding):
+            if not proposed_encoding: continue
+            if proposed_encoding in tried_encodings: continue
+            tried_encodings.append(proposed_encoding)
+            try:
+               return data.decode(proposed_encoding)
+               break
+            except:
+                pass
+                
+        # if still no luck and we haven't tried utf-8 yet, try that
+        if 'utf-8' not in tried_encodings:
+            try:
+                proposed_encoding = 'utf-8'
+                tried_encodings.append(proposed_encoding)
+                return data.decode(proposed_encoding)
+              
+            except:
+                pass
+                
+        # if still no luck and we haven't tried windows-1252 yet, try that
+        if 'windows-1252' not in tried_encodings:
+            try:
+                proposed_encoding = 'windows-1252'
+                tried_encodings.append(proposed_encoding)
+                return data.decode(proposed_encoding)
+            except:
+                pass
+                
+        # if no luck and we have auto-detection library, try that
+        if chardet:
+            try:
+                proposed_encoding = chardet.detect(data)['encoding']
+                if proposed_encoding and (proposed_encoding not in tried_encodings):
+                    tried_encodings.append(proposed_encoding)
+                    return data.decode(proposed_encoding)
+            except:
+                pass
+              
+        # give up, return data as is.   
         return data 
 
     def get_response(self):
 
     return '&'.join(tmp)
 
+import cgi
+
+def _getCharacterEncoding(http_headers, xml_data):
+    '''Get the character encoding of the XML document
+
+    http_headers is a dictionary
+    xml_data is a raw string (not Unicode)
+    
+    This is so much trickier than it sounds, it's not even funny.
+    According to RFC 3023 ('XML Media Types'), if the HTTP Content-Type
+    is application/xml, application/*+xml,
+    application/xml-external-parsed-entity, or application/xml-dtd,
+    the encoding given in the charset parameter of the HTTP Content-Type
+    takes precedence over the encoding given in the XML prefix within the
+    document, and defaults to 'utf-8' if neither are specified.  But, if
+    the HTTP Content-Type is text/xml, text/*+xml, or
+    text/xml-external-parsed-entity, the encoding given in the XML prefix
+    within the document is ALWAYS IGNORED and only the encoding given in
+    the charset parameter of the HTTP Content-Type header should be
+    respected, and it defaults to 'us-ascii' if not specified.
+
+    Furthermore, discussion on the atom-syntax mailing list with the
+    author of RFC 3023 leads me to the conclusion that any document
+    served with a Content-Type of text/* and no charset parameter
+    must be treated as us-ascii.  (We now do this.)  And also that it
+    must always be flagged as non-well-formed.  (We now do this too.)
+    
+    If Content-Type is unspecified (input was local file or non-HTTP source)
+    or unrecognized (server just got it totally wrong), then go by the
+    encoding given in the XML prefix of the document and default to
+    'iso-8859-1' as per the HTTP specification (RFC 2616).
+    
+    Then, assuming we didn't find a character encoding in the HTTP headers
+    (and the HTTP Content-type allowed us to look in the body), we need
+    to sniff the first few bytes of the XML data and try to determine
+    whether the encoding is ASCII-compatible.  Section F of the XML
+    specification shows the way here:
+    http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
+
+    If the sniffed encoding is not ASCII-compatible, we need to make it
+    ASCII compatible so that we can sniff further into the XML declaration
+    to find the encoding attribute, which will tell us the true encoding.
+
+    Of course, none of this guarantees that we will be able to parse the
+    feed in the declared character encoding (assuming it was declared
+    correctly, which many are not).  CJKCodecs and iconv_codec help a lot;
+    you should definitely install them if you can.
+    http://cjkpython.i18n.org/
+    '''
+
+    def _parseHTTPContentType(content_type):
+        '''takes HTTP Content-Type header and returns (content type, charset)
+
+        If no charset is specified, returns (content type, '')
+        If no content type is specified, returns ('', '')
+        Both return parameters are guaranteed to be lowercase strings
+        '''
+        content_type = content_type or ''
+        content_type, params = cgi.parse_header(content_type)
+        return content_type, params.get('charset', '').replace("'", '')
+
+    sniffed_xml_encoding = ''
+    xml_encoding = ''
+    true_encoding = ''
+    http_content_type, http_encoding = _parseHTTPContentType(http_headers.get('Content-Type'))
+    # Must sniff for non-ASCII-compatible character encodings before
+    # searching for XML declaration.  This heuristic is defined in
+    # section F of the XML specification:
+    # http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
+    try:
+        if xml_data[:4] == '\x4c\x6f\xa7\x94':
+            # EBCDIC
+            xml_data = _ebcdic_to_ascii(xml_data)
+        elif xml_data[:4] == '\x00\x3c\x00\x3f':
+            # UTF-16BE
+            sniffed_xml_encoding = 'utf-16be'
+            xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
+        elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') and (xml_data[2:4] != '\x00\x00'):
+            # UTF-16BE with BOM
+            sniffed_xml_encoding = 'utf-16be'
+            xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
+        elif xml_data[:4] == '\x3c\x00\x3f\x00':
+            # UTF-16LE
+            sniffed_xml_encoding = 'utf-16le'
+            xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
+        elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and (xml_data[2:4] != '\x00\x00'):
+            # UTF-16LE with BOM
+            sniffed_xml_encoding = 'utf-16le'
+            xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
+        elif xml_data[:4] == '\x00\x00\x00\x3c':
+            # UTF-32BE
+            sniffed_xml_encoding = 'utf-32be'
+            xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
+        elif xml_data[:4] == '\x3c\x00\x00\x00':
+            # UTF-32LE
+            sniffed_xml_encoding = 'utf-32le'
+            xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
+        elif xml_data[:4] == '\x00\x00\xfe\xff':
+            # UTF-32BE with BOM
+            sniffed_xml_encoding = 'utf-32be'
+            xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
+        elif xml_data[:4] == '\xff\xfe\x00\x00':
+            # UTF-32LE with BOM
+            sniffed_xml_encoding = 'utf-32le'
+            xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
+        elif xml_data[:3] == '\xef\xbb\xbf':
+            # UTF-8 with BOM
+            sniffed_xml_encoding = 'utf-8'
+            xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
+        else:
+            # ASCII-compatible
+            pass
+        xml_encoding_match = re.compile('^<\?.*encoding=[\'"](.*?)[\'"].*\?>').match(xml_data)
+    except:
+        xml_encoding_match = None
+    if xml_encoding_match:
+        xml_encoding = xml_encoding_match.groups()[0].lower()
+        if sniffed_xml_encoding and (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode', 'iso-10646-ucs-4', 'ucs-4', 'csucs4', 'utf-16', 'utf-32', 'utf_16', 'utf_32', 'utf16', 'u16')):
+            xml_encoding = sniffed_xml_encoding
+    acceptable_content_type = 0
+    application_content_types = ('application/xml', 'application/xml-dtd', 'application/xml-external-parsed-entity')
+    text_content_types = ('text/xml', 'text/xml-external-parsed-entity')
+    if (http_content_type in application_content_types) or \
+       (http_content_type.startswith('application/') and http_content_type.endswith('+xml')):
+        acceptable_content_type = 1
+        true_encoding = http_encoding or xml_encoding or 'utf-8'
+    elif (http_content_type in text_content_types) or \
+         (http_content_type.startswith('text/')) and http_content_type.endswith('+xml'):
+        acceptable_content_type = 1
+        true_encoding = http_encoding or 'us-ascii'
+    elif http_content_type.startswith('text/'):
+        true_encoding = http_encoding or 'us-ascii'
+    elif http_headers and (not http_headers.has_key('content-type')):
+        true_encoding = xml_encoding or 'iso-8859-1'
+    else:
+        true_encoding = xml_encoding or 'utf-8'
+    return true_encoding, http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type

File restclient/transport/_httplib2.py

 
         if not (url.startswith('http://') or url.startswith('https://')):
             error = 'URL is not a HTTP URL: %r' % (url,)
-            if DEBUG:
+            if restclient.debuglevel > 0:
                 print >>sys.stderr, str(error)
             raise InvalidUrl(error)
 
 
 setup(
     name = 'py-restclient',
-    version = '1.3',
+    version = '1.3.1',
     description = 'Python REST client',
     long_description = \
 """A simple REST client for Python, inspired by the microframework (Camping, Sinatra) style of specifying actions: get, put, post, delete.""",
         ]
     },
 
-    install_requires = ['httplib2'],
+    install_requires = [
+        'httplib2'
+    ],
 
     test_suite = 'nose.collector',