Pavel Zhukov / pbot

Commits

Pavel Zhukov committed 3b3abfa

Add force_encoding option

  • Parent commits e7f85c1
  • Branches default


Files changed (4)

File README

 Spider gives you special features::
 
     from pbot.spider import Spider
-    bot = Spider()
+    bot = Spider() # or Spider(force_encoding='utf-8') to force encoding for parser
     bot.open('http://example.com')
     bot.open('http://example.com')
     bot.tree.xpath('//a') # the lxml tree is available as .tree; the response is automatically read and parsed by lxml.html
     form = bot.xpath('//form[@id="main"]') # xpath shortcut for bot.tree.xpath
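
As a rough illustration of the option documented above (the URL and the choice of UTF-8 are placeholders, not part of the commit), forcing an encoding looks like this:

    from pbot.spider import Spider

    # Force the parser encoding for pages whose HTTP headers or <meta>
    # tags declare the wrong charset; plain Spider() keeps lxml's
    # automatic detection.
    bot = Spider(force_encoding='utf-8')
    bot.open('http://example.com')
    links = bot.tree.xpath('//a')            # tree parsed with the forced encoding
    form = bot.xpath('//form[@id="main"]')   # xpath shortcut for bot.tree.xpath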

File pbot/pbot.py

 
 import urllib
 import urllib2
+
 from cookielib import CookieJar, Cookie
 import logging
 from copy import copy
     '''
 
     def __init__(self, send_referral=True, parse_cookies=True, proxies=None, cookiejar=None, add_headers=None,
-                 encoding='utf-8'):
+                 encoding='utf-8', debug=False):
         '''
         Set basic headers settings
         '''
         self._add_headers = add_headers or {}
         self.current_url = None
         self.encoding = encoding
+        self.debug = debug
 
     def refresh_connector(self):
         '''
         Create new OpenerDirector object using selected parameters
         '''
+        if self.debug:
+            h = urllib2.HTTPHandler(debuglevel=1)
+        else:
+            h = urllib2.HTTPHandler()
         if self.proxies:
             proxy_handler = urllib2.ProxyHandler(self.proxies)
-            self.browser = urllib2.build_opener(urllib2.HTTPCookieProcessor(self.cj), proxy_handler)
+            self.browser = urllib2.build_opener(urllib2.HTTPCookieProcessor(self.cj), proxy_handler, h)
         else:
-            self.browser = urllib2.build_opener(urllib2.HTTPCookieProcessor(self.cj))
-        self.browser.add_headers = self.headers
+            self.browser = urllib2.build_opener(urllib2.HTTPCookieProcessor(self.cj), h)
+        self.browser.addheaders = self.headers.items()
 
     def open(self, url, post=None, get=None, referrer=None, ):
         '''
         logging.debug('referrer: %s' % referrer)
         if post:
             post = force_unicode(post, self.encoding)
-            self.response = self.browser.open(target, urllib.urlencode(post))
+            post = urllib.urlencode(post)
+            logging.debug(post)
+            self.response = self.browser.open(target, post)
         else:
             self.response = self.browser.open(target)
         self.current_url = target
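
Besides the corrected addheaders assignment (urllib2 openers expect a list of (name, value) tuples), this file gains a debug flag that installs urllib2.HTTPHandler(debuglevel=1). A minimal sketch of how it might be used, assuming the flag is passed through the Spider subclass shown in the next file (the URL and form data are placeholders):

    from pbot.spider import Spider

    # debug=True is forwarded to the base constructor and turns on
    # HTTPHandler(debuglevel=1), so urllib2 prints the raw request and
    # response lines to stdout; the urlencoded POST body is also passed
    # to logging.debug before the request is sent.
    bot = Spider(debug=True)
    bot.open('http://example.com', post={'q': 'test'})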

File pbot/spider.py

     Lxml powered crawler
     '''
     def __init__(self, *args, **kwargs):
+        self.force_encoding = kwargs.pop('force_encoding', None)
         super(Spider, self).__init__(*args, **kwargs)
         self._tree = None
         self.base_url = None
 
 
+
     def open(self, *args, **kwargs):
         self._tree = None
         response = super(Spider, self).open(*args, **kwargs)
            return self._tree
         else:
             if self.response:
-                self._tree = html.parse(self.response)
+                if self.force_encoding:
+                    parser = html.HTMLParser(encoding=self.force_encoding)
+                    self._tree = html.parse(self.response, parser)
+                else:
+                    self._tree = html.parse(self.response)
                 return self._tree
             else:
                 raise NoResponse('Spider.response is empty or already read')
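
The lxml side of the change, roughly: when force_encoding is set, the tree property builds an HTMLParser with a fixed encoding instead of relying on auto-detection. A standalone sketch (the file name and the cp1251 encoding are illustrative, not from the commit):

    from lxml import html

    # A parser with an explicit encoding overrides whatever lxml would
    # detect from the document itself; Spider passes the urllib2 response
    # object where a local file is opened here.
    parser = html.HTMLParser(encoding='cp1251')
    with open('page.html') as f:
        tree = html.parse(f, parser)
    links = tree.xpath('//a')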

File setup.py

 
 setup(
         name = 'pbot',
-        version = '1.3.0',
+        version = '1.4.0',
         packages = ['pbot'],
         install_requires = ['lxml'],
         author = 'Pavel Zhukov',