# 1. pcdinh
#   2. google_translate
#
#
# google_translate / google_translate.py

import urllib
import urllib2
import urlparse
import gzip as gzip_

try:
    from cStringIO import StringIO
except ImportError:
    from StringIO import StringIO

try:
    import simplejson as json
except ImportError:
    import json

# Endpoint that translate() posts its requests to.
API_URL = 'http://translate.googleapis.com/translate_a/t'
# Default value of translate()'s ``user_agent`` argument: a regular desktop
# browser signature, so requests do not reach Google with a suspicious
# user-agent.
DEFAULT_USER_AGENT = ('Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 '
                      '(KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6')

def translate(text, target_language, source_language=None, html=True,
              user_agent=DEFAULT_USER_AGENT, referer=None, origin=None,
              timeout=None, flat=False, urlopen=urllib2.urlopen, gzip=True):
    """Translate one or more blocks of text by posting them to API_URL.

    Parameters:
        text: a single string, or an iterable of strings. Byte strings must
            be utf8 encoded; unicode strings are encoded to utf8.
        target_language: language code sent as the ``tl`` field.
        source_language: language code sent as the ``sl`` field. When it is
            empty, the last element of the response is taken as the source
            language and returned instead.
        html: when true, ``format=html`` is sent in the query string.
        user_agent: value of the ``User-Agent`` request header.
        referer: optional ``Referer`` header; the ``Origin`` header is
            derived from its scheme and host.
        origin: optional ``Origin`` header. If ``referer`` is also given,
            it must equal the value derived from ``referer``.
        timeout: passed to ``urlopen`` as ``timeout`` when truthy.
        flat: when true, return only the translations joined with spaces.
        urlopen: callable used to perform the request; it receives the
            ``urllib2.Request`` and must return an object with ``read()``.
        gzip: when true, ask for a gzip-compressed response.

    Returns:
        ``(translations, source_language)`` where ``translations`` is a
        tuple, or, if ``flat`` is true, a single unicode string.

    Raises:
        ValueError: if a byte string is not valid utf8, or if ``origin``
            does not match the origin derived from ``referer``.
    """
    if isinstance(text, basestring):
        text = (text,)
    else:
        # Materialise once: the blocks are iterated below and counted after
        # the request, which a generator would not support.
        text = tuple(text)
    #TODO: split for smaller queries
    #official clients split queries for smaller parts, but service can
    #translate blocks for almost 2MB.

    headers = {
        'User-Agent': user_agent,
        'Accept-Charset': 'utf-8',
    }
    if gzip:
        headers['Accept-Encoding'] = 'gzip'
    #better supply information below not to be banned by google
    if referer:
        headers['Referer'] = referer
        headers['Origin'] = '%s://%s/' % urlparse.urlsplit(referer)[:2]
        if origin and origin != headers['Origin']:
            raise ValueError('Origin and referer not matched', origin,
                             headers['Origin'])
    elif origin:
        headers['Origin'] = origin

    qs = (
        #('anno', '3'), #wtf? annotation?
        #If sentences more than 1 it returns original after translation.
        #NOTE: this isn't happening from official clients for some reason.

        #('client', 'te'), #google javascript translator
        ('client', 'te_lib'), #google-chrome translator
        ('format', 'html' if html else ''), #encodes html-entities in response
        ('v', '1.0'), #google translate version
        ('logld', 'v10'), #obviously protocol version
    )

    data = []
    for block in text:
        if isinstance(block, unicode):
            block = block.encode('utf8')
        else:
            #just to make sure this string is utf8 encoded
            try:
                block.decode('utf8')
            except UnicodeDecodeError:
                raise ValueError('Text must be utf8 or unicode')
        data.append(('q', block))
    data += [
        ('sl', source_language or ''),
        ('tl', target_language),
        ('tc', '1'), #counter of split requests related to one content
        #('ctt', '1'), #wtf? absent in google-chrome translator
    ]

    req = urllib2.Request(API_URL + '?' + urllib.urlencode(qs),
                          urllib.urlencode(data), headers=headers)
    # A falsy timeout (None or 0) means "do not pass a timeout at all".
    open_kwargs = {'timeout': timeout} if timeout else {}
    resp = urlopen(req, **open_kwargs)
    try:
        resp_cont = resp.read()
    finally:
        # Release the connection; tolerate custom urlopen results that only
        # implement read().
        close = getattr(resp, 'close', None)
        if close is not None:
            close()
    # Only gunzip when the body really is gzip data (magic bytes 1f 8b); the
    # server may answer uncompressed even though gzip was offered.
    if gzip and resp_cont[:2] == '\x1f\x8b':
        resp_cont = gzip_.GzipFile(fileobj=StringIO(resp_cont),
                                   mode='r').read()
    # NOTE(review): 'string-escape' is a Python 2-only codec; presumably the
    # service emits backslash escapes that plain JSON parsing rejects --
    # confirm before changing.
    result = json.loads(resp_cont.decode('string-escape'))

    if not source_language:
        #return detected source language as last element if it wasn't specified
        result, source_language = result[:-1], result[-1]
    elif len(text) == 1:
        #this isn't json list, just plain text
        result = (result,)

    if flat:
        return u' '.join(result)
    return tuple(result), source_language