# Source: slurpy / slurpy / browsers.py

"""Browser profiles (default headers and user-agent strings) used to make
HTTP requests look as if they were sent by a real browser.
"""

# This is a smattering of user agent strings from various browsers. Maybe should be a bit better.
import random
import requests

from bs4 import BeautifulSoup, NavigableString

# Registry of browser classes; the browser classes defined further down in
# this module append themselves to it right after their definition.
all_browsers = []

#-------------------------------------------------------------------------------
class Browser(object):
    """Encapsulates a browser session.

    A ``Browser`` wraps a ``requests`` session and decorates every request
    with a fixed set of headers (including a User-Agent), the configured
    proxies, a default timeout and a ``Referer`` pointing at the previously
    requested URL.

    Subclasses describe a concrete browser by overriding
    ``browser_wide_headers`` and ``user_agents``.
    """

    # Headers sent with every request. Values may contain the placeholders
    # ``%(sub-language)s`` and ``%(main-language)s``.
    browser_wide_headers = {}

    # Candidate User-Agent strings; one is picked at random when the caller
    # does not supply one. They may contain the same placeholders.
    user_agents = []

    # (sub-language, main-language) pairs to pick from at random when the
    # caller does not supply one. Region subtags follow ISO 3166, so British
    # English is ``en-GB`` (there is no ``en-UK`` tag).
    languages = [
        ('en-CA', 'en'),
        ('en-US', 'en'),
        ('en-GB', 'en'),
        ('fr-FR', 'fr'),
        ('de-DE', 'de'),
    ]

    # TODO: Get the Referer header working properly.
    # Referer: http://en.wikipedia.org/wiki/HTTP_referer

    #---------------------------------------------------------------------------
    def __init__(self, headers=None, user_agent=None, language=None, proxies=None, timeout=1.0):
        """Initializes the browser.

        headers    -- extra headers layered on top of ``browser_wide_headers``.
        user_agent -- User-Agent string; when omitted a random entry of
                      ``user_agents`` is used.
        language   -- two-tuple of a sub-language followed by a language,
                      such as ('en-US', 'en'); when omitted a random entry
                      of ``languages`` is used.
        proxies    -- proxies mapping merged into every request.
        timeout    -- default per-request timeout.

        Every header value (including the User-Agent) is run through
        ``%``-formatting with the keys ``sub-language`` and
        ``main-language``, so literal percent signs in custom header values
        must be written as ``%%``.

        Raises IndexError when no user agent is given and ``user_agents`` is
        empty, as it is on the base class.
        """
        if not headers:
            headers = {}
        if not user_agent:
            user_agent = random.choice(self.user_agents)
        if not language:
            language = random.choice(self.languages)
        if not proxies:
            proxies = {}
        self.proxies = proxies
        self.timeout = timeout

        self.referer = None
        self.headers = self.browser_wide_headers.copy()
        self.headers.update(headers)
        self.user_agent = user_agent
        self.headers['User-Agent'] = self.user_agent

        # Make some general replacements to the headers based on languages.
        substitutions = {
            'sub-language': language[0],
            'main-language': language[1],
        }
        for key in list(self.headers):
            self.headers[key] = self.headers[key] % substitutions
        self.session = requests.session()

    #---------------------------------------------------------------------------
    @staticmethod
    def _rating(cell):
        """Returns 'fast', 'medium' or 'slow' for a speed/connection cell.

        The rating is the first CSS class of the first ``div`` in the cell
        whose first class is one of those three words; None when there is no
        such ``div``.
        """
        ratings = ('fast', 'slow', 'medium')
        for div in cell.find_all("div"):
            classes = div.get('class') or []
            if classes and classes[0] in ratings:
                return classes[0]
        return None

    #---------------------------------------------------------------------------
    @staticmethod
    def _decode_ip(cell):
        """Reassembles the IP address from an obfuscated table cell.

        The cell mixes visible and hidden fragments. A ``span``/``div``
        fragment is dropped when one of its classes is declared
        ``display:none`` in the cell's inline stylesheet or when its own
        ``style`` attribute is exactly ``display:none``; the text of
        everything else is concatenated in document order.

        Raises IndexError when the cell contains no ``style`` element.
        """
        text_type = type(u"")  # ``unicode`` on Python 2, ``str`` on Python 3.
        stylesheet = text_type(cell.find_all("style")[0].get_text())

        pieces = []
        for node in list(cell.children)[0].children:
            if isinstance(node, NavigableString):
                pieces.append(text_type(node))
                continue
            if node.name == "style":
                continue
            if node.name in ("span", "div"):
                classes = node.get('class') or []
                if any((u".%s{display:none}" % css_class) in stylesheet
                       for css_class in classes):
                    continue
                if node.get('style') == 'display:none':
                    continue
            pieces.append(text_type(node.get_text()))
        return u"".join(pieces)

    #---------------------------------------------------------------------------
    def hide_my_ass_proxies(self):
        """Scrapes Hide My Ass's proxy list for high-anonymity proxies.

        Only rows whose anonymity column reads "High +KA" are considered;
        the proxy type (HTTP/HTTPS/...) is not filtered on. Each proxy is
        reported as an "ip:port" string and sorted into a bucket according
        to its speed and connection-time ratings (in either order).

        Returns a six-tuple of lists, best first:
        (fast/fast, medium/fast, medium/medium, slow/fast, slow/medium,
        slow/slow). Rows whose ratings cannot be determined are dropped.
        """
        response = self.get('http://hidemyass.com/proxy-list/')
        doc = BeautifulSoup(response.text)

        def is_secure(tag):
            return (
                tag.name == 'td'
                and len(tag.contents) == 1
                and tag.contents[0] == u"High +KA"
            )

        fast_fast_potentials = []
        medium_fast_potentials = []
        medium_medium_potentials = []
        slow_fast_potentials = []
        slow_medium_potentials = []
        slow_slow_potentials = []

        # The two ratings are interchangeable, so key the buckets on the
        # *set* of ratings a row carries.
        buckets = {
            frozenset(['fast']): fast_fast_potentials,
            frozenset(['fast', 'medium']): medium_fast_potentials,
            frozenset(['medium']): medium_medium_potentials,
            frozenset(['fast', 'slow']): slow_fast_potentials,
            frozenset(['medium', 'slow']): slow_medium_potentials,
            frozenset(['slow']): slow_slow_potentials,
        }

        for td in doc.find_all(is_secure):
            cells = td.parent.find_all('td')
            if len(cells) < 8:
                # Not a regular proxy row; columns 1-7 are needed below.
                continue

            ip_cell = cells[1]
            port_cell = cells[2]
            speed_cell = cells[4]
            connection_cell = cells[5]
            anonymity_cell = cells[7]

            if anonymity_cell.get_text().strip() != "High +KA":
                continue

            ratings = frozenset([
                self._rating(speed_cell),
                self._rating(connection_cell),
            ])
            bucket = buckets.get(ratings)
            if bucket is None:
                continue

            proxy = "%s:%s" % (
                self._decode_ip(ip_cell),
                port_cell.get_text().strip(),
            )
            bucket.append(proxy)

        return (
            fast_fast_potentials,
            medium_fast_potentials,
            medium_medium_potentials,
            slow_fast_potentials,
            slow_medium_potentials,
            slow_slow_potentials,
        )

    #---------------------------------------------------------------------------
    def __unicode__(self):
        return u"%s: %s" % (self.__class__.__name__, self.user_agent)

    #---------------------------------------------------------------------------
    def _setup_request(self, url, kwargs):
        """Fills in the session-wide defaults for one request.

        ``kwargs`` is the keyword-argument dict of the request being made;
        it is updated and returned. Caller-supplied ``headers`` and
        ``proxies`` dicts are copied before merging so they are never
        modified, which matters when the caller passes ``self.headers``
        itself. As a side effect ``url`` is remembered as the Referer for
        the next request.
        """
        # Session headers take precedence over per-request headers with the
        # same name.
        headers = dict(kwargs.get('headers') or {})
        headers.update(self.headers)
        if self.referer:
            headers.setdefault('Referer', self.referer)
        kwargs['headers'] = headers

        # Likewise, session proxies win over per-request proxies.
        proxies = dict(kwargs.get('proxies') or {})
        proxies.update(self.proxies)
        kwargs['proxies'] = proxies

        # An explicit per-request timeout overrides the session default.
        kwargs['timeout'] = kwargs.get('timeout', self.timeout)

        # Store it for the next time.
        self.referer = url

        return kwargs

    #---------------------------------------------------------------------------
    def _send(self, verb, url, kwargs):
        """Dispatches to the session method named ``verb`` with defaults applied."""
        kwargs = self._setup_request(url, kwargs)
        return getattr(self.session, verb)(url, **kwargs)

    #---------------------------------------------------------------------------
    def get(self, url, **kwargs):
        """Sends a GET request through the session."""
        return self._send('get', url, kwargs)

    #---------------------------------------------------------------------------
    def post(self, url, **kwargs):
        """Sends a POST request through the session."""
        return self._send('post', url, kwargs)

    #---------------------------------------------------------------------------
    def options(self, url, **kwargs):
        """Sends an OPTIONS request through the session."""
        return self._send('options', url, kwargs)

    #---------------------------------------------------------------------------
    def head(self, url, **kwargs):
        """Sends a HEAD request through the session."""
        return self._send('head', url, kwargs)

    #---------------------------------------------------------------------------
    def put(self, url, **kwargs):
        """Sends a PUT request through the session."""
        return self._send('put', url, kwargs)

    #---------------------------------------------------------------------------
    def patch(self, url, **kwargs):
        """Sends a PATCH request through the session."""
        return self._send('patch', url, kwargs)

    #---------------------------------------------------------------------------
    def delete(self, url, **kwargs):
        """Sends a DELETE request through the session."""
        return self._send('delete', url, kwargs)

#-------------------------------------------------------------------------------
class Firefox(Browser):
    """Browser profile imitating Mozilla Firefox (14.x / 15 alpha builds)."""

    # Values may contain the ``%(sub-language)s`` / ``%(main-language)s``
    # placeholders, which Browser.__init__ fills in.
    browser_wide_headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': '%(sub-language)s,%(main-language)s;q=0.5',
        # The header name must not carry trailing whitespace: 'Connection '
        # is not a valid field name and is not the Connection header.
        'Connection': 'keep-alive',
    }

    user_agents = [
        'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:14.0) Gecko/20100101 Firefox/14.0.1',
        'Mozilla/5.0 (Windows NT 6.1; rv:15.0) Gecko/20120716 Firefox/15.0a2',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:15.0) Gecko/20120427 Firefox/15.0a1',
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:14.0) Gecko/20120405 Firefox/14.0a1',
        'Mozilla/5.0 (Windows NT 6.1; rv:14.0) Gecko/20120405 Firefox/14.0a1',
        'Mozilla/5.0 (Windows NT 5.1; rv:14.0) Gecko/20120405 Firefox/14.0a1',
        'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:14.0) Gecko/20100101 Firefox/14.0.1',
        'Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:14.0) Gecko/20100101 Firefox/14.0.1',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; WOW64; %(sub-language)s; rv:2.0.4) Gecko/20120718 AskTbAVR-IDW/3.12.5.17700 Firefox/14.0.1',
        'Mozilla/5.0 (Windows NT 6.1; rv:12.0) Gecko/20120403211507 Firefox/14.0.1',
        'Mozilla/5.0 (Windows NT 6.1; rv:12.0) Gecko/ 20120405 Firefox/14.0.1',
        'Mozilla/5.0 (Windows NT 6.0; rv:14.0) Gecko/20100101 Firefox/14.0.1',
    ]
all_browsers.append(Firefox)

#-------------------------------------------------------------------------------
class Opera(Browser):
    """Browser profile imitating Opera (11.5x / 12.0x, Presto engine)."""

    # Values may contain the ``%(sub-language)s`` / ``%(main-language)s``
    # placeholders, which Browser.__init__ fills in.
    browser_wide_headers = {
        'Accept': 'text/html, application/xml;q=0.9, application/xhtml+xml, image/png, image/webp, image/jpeg, image/gif, image/x-xbitmap, */*;q=0.1',
        'Accept-Language': '%(sub-language)s,%(main-language)s;q=0.9',
        'Accept-Encoding': 'gzip, deflate',
        'Cache-Control': 'no-cache',
        'Connection': 'Keep-Alive',
    }

    # Every entry takes its locale from the session language so that the
    # User-Agent never disagrees with the Accept-Language header. The
    # Mac OS X 11.52 build appears twice on purpose: it used to be listed as
    # two locale variants, and keeping both entries preserves its weight in
    # the random selection.
    user_agents = [
        'Opera/9.80 (X11; Linux x86_64; U; %(main-language)s) Presto/2.10.289 Version/12.01',
        'Opera/9.80 (Windows NT 6.1; U; %(main-language)s) Presto/2.9.181 Version/12.00',
        'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; %(main-language)s) Presto/2.9.168 Version/11.52',
        'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; %(main-language)s) Presto/2.9.168 Version/11.52',
        'Opera/9.80 (Windows NT 5.1; U; %(main-language)s) Presto/2.9.168 Version/11.51',
        'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; %(main-language)s) Opera 11.51',
        'Opera/9.80 (X11; Linux i686; U; %(main-language)s) Presto/2.9.168 Version/11.50',
    ]
all_browsers.append(Opera)


#-------------------------------------------------------------------------------
class Chrome(Browser):
    """Browser profile imitating Google Chrome / Chromium (18 - 22)."""

    # Values may contain the ``%(sub-language)s`` / ``%(main-language)s``
    # placeholders, which Browser.__init__ fills in.
    browser_wide_headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
        'Accept-Encoding': 'gzip,deflate,sdch',
        # Chrome lists the regional tag first with no q-value and the bare
        # language second (e.g. "en-US,en;q=0.8"), the same shape as the
        # Firefox and Opera templates in this module.
        'Accept-Language': '%(sub-language)s,%(main-language)s;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
    }
    user_agents = [
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.19 (KHTML, like Gecko) Ubuntu/12.04 Chromium/18.0.1025.168 Chrome/18.0.1025.168 Safari/535.19',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1',
        'Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6',
        'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6',
        'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5',
        'Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5',
    ]
all_browsers.append(Chrome)