Commits

Gregory Petukhov committed db7a1fa Merge

Automated merge with ssh://athlon//web/grab

Comments (0)

Files changed (27)

 ^ghost$
 ^var/
 ^\.tox/
+^script/

grab/captcha/__init__.py

+from .service import CaptchaService
+from .error import *

grab/captcha/backend/__init__.py

Empty file added.

grab/captcha/backend/antigate.py

+from tempfile import mkstemp
+from base64 import b64encode
+import urllib
+
+from grab import Grab
+from .base import CaptchaBackend
+from ..error import (CaptchaServiceError, ServiceTooBusy, BalanceTooLow,
+                     SolutionNotReady)
+
class AntigateBackend(CaptchaBackend):
    """Captcha backend talking to the antigate.com HTTP API."""

    def setup(self, api_key):
        # Antigate requires only an API key.
        self.api_key = api_key

    def get_submit_captcha_request(self, data, **kwargs):
        """Build a Grab request which uploads the captcha image `data`."""
        fields = {
            'key': self.api_key,
            'method': 'base64',
            'body': b64encode(data),
        }
        fields.update(kwargs)
        grab = Grab()
        grab.setup(post=fields)
        grab.setup(url='http://antigate.com/in.php')
        return grab

    def parse_submit_captcha_response(self, res):
        """Extract the captcha id from the submit response or raise an error."""
        if res.code != 200:
            raise CaptchaServiceError('Returned HTTP code: %d' % res.code)
        if res.body.startswith('OK|'):
            return res.body.split('|', 1)[1]
        if res.body == 'ERROR_NO_SLOT_AVAILABLE':
            raise ServiceTooBusy('Service too busy')
        if res.body == 'ERROR_ZERO_BALANCE':
            raise BalanceTooLow('Balance too low')
        raise CaptchaServiceError(res.body)

    def get_check_solution_request(self, captcha_id):
        """Build a Grab request which polls for the solution of `captcha_id`."""
        query = urllib.urlencode({'key': self.api_key, 'action': 'get',
                                  'id': captcha_id})
        grab = Grab()
        grab.setup(url='http://antigate.com/res.php?%s' % query)
        return grab

    def parse_check_solution_response(self, res):
        """Extract the solution text from the poll response or raise an error."""
        if res.code != 200:
            raise CaptchaServiceError('Returned HTTP code: %d' % res.code)
        if res.body.startswith('OK|'):
            return res.body.split('|', 1)[1]
        if res.body == 'CAPCHA_NOT_READY':
            raise SolutionNotReady('Solution not ready')
        raise CaptchaServiceError(res.body)

grab/captcha/backend/base.py

class CaptchaBackend(object):
    """Base class of captcha solving backends.

    Concrete backends override `setup` to receive their configuration
    (API keys and so on); the default implementation accepts and
    ignores any keyword arguments.
    """

    def setup(self, **kwargs):
        # No configuration is required by default.
        pass

grab/captcha/backend/browser.py

+import tempfile
+import webbrowser
+import time
+import os
+
+from grab import Grab
+from .base import CaptchaBackend
+
class BrowserBackend(CaptchaBackend):
    """Interactive backend: shows the captcha in a web browser and asks
    the operator to type the solution into the console.
    """

    def get_submit_captcha_request(self, data):
        # Dump the image into a temporary file and build a request
        # pointing at it through a file:// URL.
        handle, image_path = tempfile.mkstemp()
        with open(image_path, 'w') as image_file:
            image_file.write(data)
        grab = Grab()
        grab.setup(url='file://' + image_path)
        return grab

    def parse_submit_captcha_response(self, res):
        # The temporary file path doubles as the captcha id.
        return res.url.replace('file://', '')

    def get_check_solution_request(self, captcha_id):
        grab = Grab()
        grab.setup(url='file://' + captcha_id)
        return grab

    def parse_check_solution_response(self, res):
        webbrowser.open(url=res.url)
        # Wait some time, skip some debug messages
        # which browser could dump to console
        time.sleep(0.5)
        solution = raw_input('Enter solution: ')
        os.unlink(res.url.replace('file://', ''))
        return solution

grab/captcha/backend/gui.py

import tempfile
import webbrowser
import time
import os

# BUG FIX: pygtk.require() must be called *before* importing gtk,
# otherwise the version requirement has no effect on which gtk
# module gets loaded.
import pygtk
pygtk.require('2.0')
import gtk
from StringIO import StringIO

from grab import Grab
from .base import CaptchaBackend
+
class CaptchaWindow(object):
    """Simple GTK window which displays a captcha image and collects the
    solution typed by the operator.

    The answer is appended to the `solution` list given to the
    constructor — a mutable container is used to pass the value out of
    the GTK main loop.
    """

    def __init__(self, path, solution):
        self.solution = solution

        self.window = gtk.Window(gtk.WINDOW_TOPLEVEL)
        self.window.show()
        self.window.connect('destroy', self.destroy)

        # Widgets: captcha image, text entry and submit button,
        # laid out in one horizontal box.
        self.box = gtk.HBox()
        self.image = gtk.Image()
        self.image.set_from_file(path)
        self.entry = gtk.Entry()
        self.entry.connect('activate', self.solve)
        self.button = gtk.Button('Go')
        self.button.connect('clicked', self.solve)

        self.window.add(self.box)
        self.box.pack_start(self.image)
        self.box.pack_start(self.entry)
        self.box.pack_start(self.button)
        for widget in (self.box, self.image, self.button, self.entry):
            widget.show()
        self.entry.grab_focus()

    def destroy(self, *args):
        # Window closed without a solution: just leave the main loop.
        gtk.main_quit()

    def solve(self, *args):
        # Remember the entered text and leave the main loop.
        self.solution.append(self.entry.get_text())
        self.window.hide()
        gtk.main_quit()

    def main(self):
        # Block until the window is closed or a solution is submitted.
        gtk.main()
+
+
class GuiBackend(CaptchaBackend):
    """Interactive backend which displays the captcha in a GTK window."""

    def get_submit_captcha_request(self, data):
        # Store the image in a temporary file referenced through a
        # file:// URL.
        handle, image_path = tempfile.mkstemp()
        with open(image_path, 'w') as image_file:
            image_file.write(data)
        grab = Grab()
        grab.setup(url='file://' + image_path)
        return grab

    def parse_submit_captcha_response(self, res):
        # The temporary file path serves as the captcha id.
        return res.url.replace('file://', '')

    def get_check_solution_request(self, captcha_id):
        grab = Grab()
        grab.setup(url='file://' + captcha_id)
        return grab

    def parse_check_solution_response(self, res):
        path = res.url.replace('file://', '')
        # CaptchaWindow appends the operator's answer to this list
        # from inside the GTK main loop.
        answer = []
        CaptchaWindow(path, answer).main()
        os.unlink(path)
        return answer[0]

grab/captcha/const.py

# Short backend names accepted by CaptchaService, mapped onto the
# dotted import path of the corresponding backend class.
BACKEND_ALIAS = {
    'antigate': 'grab.captcha.backend.antigate.AntigateBackend',
    'browser': 'grab.captcha.backend.browser.BrowserBackend',
    'gui': 'grab.captcha.backend.gui.GuiBackend',
}

grab/captcha/error.py

__all__ = ('CaptchaError', 'CaptchaServiceError', 'SolutionNotReady',
           'ServiceTooBusy', 'BalanceTooLow')


class CaptchaError(Exception):
    """Root of the captcha exception hierarchy."""


class CaptchaServiceError(CaptchaError):
    """The remote captcha service reported an error."""


class SolutionNotReady(CaptchaServiceError):
    """The captcha was submitted but its solution is not ready yet."""


class ServiceTooBusy(CaptchaServiceError):
    """The service has no free slot to accept a new captcha."""


class BalanceTooLow(CaptchaServiceError):
    """The account balance is not sufficient to submit a captcha."""

grab/captcha/service.py

+import logging
+
+from ..util.module import import_string
+from .const import BACKEND_ALIAS
+
+__all__ = ('CaptchaService',)
+
+logger = logging.getLogger('grab.captcha')
+
class CaptchaService(object):
    """
    This class implements API to communicate with
    remote captcha solving service.
    """

    def __init__(self, backend, **kwargs):
        # `backend` may be a known alias or a full dotted class path.
        backend_path = BACKEND_ALIAS.get(backend, backend)
        self.backend = import_string(backend_path)()
        self.backend.setup(**kwargs)

    def submit_captcha(self, data, **kwargs):
        """Upload the captcha image `data`, return the service captcha id."""
        grab = self.backend.get_submit_captcha_request(data, **kwargs)
        grab.request()
        return self.backend.parse_submit_captcha_response(grab.response)

    def check_solution(self, captcha_id):
        """
        Poll the service for the solution of `captcha_id`.

        Raises:
        * SolutionNotReady
        * ServiceTooBusy
        """

        grab = self.backend.get_check_solution_request(captcha_id)
        grab.request()
        return self.backend.parse_check_solution_response(grab.response)
+
+import re
+import random
+
# Patterns used to locate the recaptcha widget in a page:
# a script tag served from recaptcha.net ...
RE_SCRIPT = re.compile(r'<script[^>]+recaptcha\.net[^>]+>', re.S)
# ... or from the google.com recaptcha API ...
RE_SCRIPT2 = re.compile(r'<script[^>]+google\.com/recaptcha/api/challenge[^>]+>', re.S)
# ... or an inline Recaptcha.create("<key>") call.
RE_SCRIPT3 = re.compile(r'Recaptcha\.create\("([^"]+)', re.S | re.I)
# Extracts the value of a src="..." attribute.
RE_SRC = re.compile(r'src="([^"]+)"')
+
+    #def solve_captcha(self, g, url=None, data=None):
+        #if not g.clone_counter:
+            #logging.error('Warning: maybe you forgot to make the clone of Grab instance')
+
+        #if url:
+            #logging.debug('Downloading captcha')
+            #g.request(url=url)
+            #data = g.response.body
+
+        #logging.debug('Solving captcha')
+        #solution = self.module.solve_captcha(key=self.key, data=data)
+
+        #logging.debug('Captcha solved: %s' % solution)
+        #return solution
+
+
+    #def solve_recaptcha(self, g):
+        #if not g.clone_counter:
+            #logging.error('Warning: maybe you forgot to make the clone of Grab instance')
+
+        #def fetch_challenge():
+            #for x in xrange(5):
+                #url = None
+                #match = RE_SCRIPT.search(g.response.body)
+                #if match:
+                    #url = RE_SRC.search(match.group(0)).group(1)
+                #if not url:
+                    #match = RE_SCRIPT2.search(g.response.body)
+                    #if match:
+                        #url = RE_SRC.search(match.group(0)).group(1)
+                #if not url:
+                    #if 'google.com/recaptcha/api/js/recaptcha_ajax.js' in g.response.body:
+                        ## It is type of google recaptcha
+                        #match = RE_SCRIPT3.search(g.response.body)
+                        #code = match.group(1)
+                        #url = 'http://www.google.com/recaptcha/api/challenge'\
+                              #'?k=%s&ajax=1&cachestop=%s' % (code, str(random.random()))
+                        ##response = frame_loader.response.body
+                        ##rex = re.compile(r"challenge : '[^\"\s]+',")
+                        ##challenge_code = rex.search(response).group(0)[13:-2]
+                        
+                        ##image_loader = frame_loader.clone()
+                        ##image_url = 'https://www.google.com/recaptcha/api/image?c=%s' % challenge_code
+                        ##solution = solve_captcha(image_loader, url=image_url)
+
+                #if not url:
+                    #raise Exception('Unknown recaptcha implementation')
+
+                #g.request(url=url)
+                #html = g.response.body
+
+                #if not html:
+                    #logging.error('Empty response from recaptcha server')
+                    #continue
+
+                #server = re.compile(r'server\s*:\s*\'([^\']+)').search(html).group(1)
+                #challenge = re.compile(r'challenge\s*:\s*\'([^\']+)').search(html).group(1)
+                #url = server + 'image?c=' + challenge
+                #return challenge, url
+            #raise CaptchaError('Could not get valid response from recaptcha server')
+
+        #challenge, url = fetch_challenge()
+        #solution = self.solve_captcha(g, url=url)
+        #return challenge, solution
     parser.add_argument('--slave', action='store_true', default=False)
     #parser.add_argument('--force-url', type=str)
     parser.add_argument('spider_name', type=str)
+    parser.add_argument('--propagate-network-logger', action='store_true',
+                        default=False)
 
 
 def main(spider_name, thread_number=None, slave=False, force_url=None,
          settings='settings', *args, **kwargs):
-    default_logging()
+    default_logging(propagate_network_logger=kwargs['propagate_network_logger'])
 
     lock_key = None
     if not slave:

grab/script/start_project.py

+import os
+import logging
+import shutil
+import re
+
+logger = logging.getLogger('grab.script.start_project')
+
def setup_arg_parser(parser):
    """Register the command line options of the start_project script."""
    # Positional: name of the new project directory.
    parser.add_argument('project_name')
    # Optional path to a custom template directory.
    parser.add_argument('--template')
+
+
def process_macros(content, context):
    """Substitute template macros in `content`.

    Each key of `context` replaces occurrences of ``{{ key }}``
    (whitespace around the key is optional).

    :returns: tuple ``(changed, content)`` where `changed` tells whether
        at least one substitution was performed
    """
    changed = False
    for key, value in context.items():
        re_macros = re.compile(r'\{\{\s*%s\s*\}\}' % re.escape(key))
        if re_macros.search(content):
            changed = True
            # BUG FIX: use a callable replacement so backslashes and
            # group references inside `value` are inserted literally
            # instead of being interpreted as replacement escapes by
            # re.sub.
            content = re_macros.sub(lambda match: value, content)
    return changed, content
+
+
def underscore_to_camelcase(val):
    """Convert ``some_project_name`` into ``SomeProjectName``."""
    return ''.join(map(str.title, val.lower().split('_')))
+
+
def main(project_name, template, **kwargs):
    """Create a new project directory from a template.

    :param project_name: name of the directory created inside the
        current working directory; also substituted into template macros
    :param template: path to a template directory; when None the
        default template shipped with grab is used
    :raises Exception: if the target directory already exists
    """
    cur_dir = os.getcwd()
    project_dir = os.path.join(cur_dir, project_name)

    if template is None:
        grab_root = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
        template_path = os.path.join(grab_root, 'util/default_project')
    else:
        template_path = template

    if os.path.exists(project_dir):
        # BUG FIX: `logger.error` was referenced without being called
        # (a no-op attribute access); actually log before raising.
        logger.error('Directory %s already exists' % project_dir)
        raise Exception('Directory %s already exists' % project_dir)

    logger.debug('Copying %s to %s' % (template_path, project_dir))
    shutil.copytree(template_path, project_dir)

    # The macro context does not depend on the file being processed,
    # so build it once outside the loop.
    context = {
        'PROJECT_NAME': project_name,
        'PROJECT_NAME_CAMELCASE': underscore_to_camelcase(project_name),
    }
    for base, dir_names, file_names in os.walk(project_dir):
        for file_name in file_names:
            if not file_name.endswith('.py'):
                continue
            file_path = os.path.join(base, file_name)
            # Close file handles explicitly instead of relying on GC.
            with open(file_path) as inp:
                changed, content = process_macros(inp.read(), context)
            if changed:
                with open(file_path, 'w') as out:
                    out.write(content)
 
 DEFAULT_TASK_PRIORITY = 100
 RANDOM_TASK_PRIORITY_RANGE = (50, 100)
-TASK_QUEUE_TIMEOUT = 0.01
 NULL = object()
 
 logger = logging.getLogger('grab.spider.base')
             return randint(*RANDOM_TASK_PRIORITY_RANGE)
 
     def add_task_handler(self, task):
-        self.taskq.put(task, task.priority)
+        self.taskq.put(task, task.priority, schedule_time=task.schedule_time)
 
     def add_task(self, task):
         """
         while True:
             try:
                 with self.save_timer('task_queue'):
-                    return self.taskq.get(TASK_QUEUE_TIMEOUT)
+                    return self.taskq.get()
             except Queue.Empty:
+                if self.taskq.size():
+                    logger_verbose.debug('Waiting for scheduled task')
+                    return True
                 if not self.slave:
                     logger_verbose.debug('Task queue is empty.')
                     return None
                 else:
                    # Temporary hack which forces the slave crawler
-                    # to wait 2 seconds for new tasks, this solves
+                    # to wait 5 seconds for new tasks, this solves
                    # the problem that sometimes the slave crawler stops
                    # its work because it could not receive new
                    # tasks from the task queue immediately
                     for x in xrange(5):
                         task = self.load_new_task()
-                        if not task:
+                        if task is None:
                             if not self.transport.active_task_number():
                                 self.process_task_generator()
+                        elif task is True:
+                            # If only delayed tasks in queue
+                            break
                         else:
+                            # If got some task
                             break
 
                     if not task:
                             if task.sleep:
                                 logger.debug('Got NullTask with sleep instruction. Sleeping for %.2f seconds' % task.sleep)
                                 time.sleep(task.sleep)
+                    elif task == True:
+                        pass
                     else:
                         #if self.wating_shutdown_event.is_set():
                             #self.wating_shutdown_event.clear()

grab/spider/mixin/__init__.py

Empty file added.

grab/spider/mixin/captcha_solver.py

+import logging
+from ..task import Task
+from grab.captcha import SolutionNotReady
+
+logger = logging.getLogger('grab.spider.mixin.captcha_solver')
+
class CaptchaSolverInterface(object):
    """Spider mixin providing task handlers for the
    download -> submit -> poll captcha solving cycle.

    Expects the spider to provide a `solver` attribute exposing a
    `backend` object (see grab.captcha).
    """

    def task_download_captcha(self, grab, task):
        logger.debug('Got captcha image')
        submit_grab = self.solver.backend.get_submit_captcha_request(grab.response.body)
        yield Task('submit_captcha', grab=submit_grab, meta=task.meta)

    def task_submit_captcha(self, grab, task):
        captcha_id = self.solver.backend.parse_submit_captcha_response(grab.response)
        check_grab = self.solver.backend.get_check_solution_request(captcha_id)
        yield Task('check_solution', grab=check_grab, delay=5, meta=task.meta)

    def task_check_solution(self, grab, task):
        try:
            solution = self.solver.backend.parse_check_solution_response(grab.response)
        except SolutionNotReady:
            # Poll again after the task's original delay.
            logger.debug('SOLUTION IS NOT READY')
            yield task.clone(delay=task.original_delay)
        else:
            logger.debug('GOT CAPTCHA SOLUTION: %s' % solution)
            yield task.meta['handler'](solution, task.meta)
+
+

grab/spider/queue_backend/base.py

     def put(self, task, priority):
         pass
 
-    def get(self, timeout):
+    def get(self):
         """
         Return `Task` object or raise `Queue.Empty` exception
-        after `timeout` seconds.
 
         @returns: `grab.spider.task.Task` object
         @raises: `Queue.Empty` exception

grab/spider/queue_backend/memory.py

 from __future__ import absolute_import
+from datetime import datetime
 
 from .base import QueueInterface
 from Queue import PriorityQueue, Empty
 
 class QueueBackend(QueueInterface):
-    def __init__(self, spider_name, unique=False, **kwargs):
+    def __init__(self, spider_name, **kwargs):
         super(QueueInterface, self).__init__(**kwargs)
         self.queue_object = PriorityQueue()
-        self.unique = unique
-        self.unique_dict = {}
+        self.schedule_list = []
 
-    def put(self, task, priority):
-        if self.unique:
-            key = unique_key(task)
-            if key in self.unique_dict:
-                return
-            self.unique_dict[key] = True
-        self.queue_object.put((priority, task))
+    def put(self, task, priority, schedule_time=None):
+        if schedule_time is None:
+            self.queue_object.put((priority, task))
+        else:
+            self.schedule_list.append((schedule_time, task))
 
-    def get(self, timeout):
-        priority, task = self.queue_object.get(True, timeout)
-        if self.unique:
-            key = unique_key(task)
-            del self.unique_dict[key]
+    def get(self):
+        now = datetime.now()
+
+        removed_indexes = []
+        idx = 0
+        for schedule_time, task in self.schedule_list:
+            if schedule_time <= now:
+                self.put(task, 1)
+                removed_indexes.append(idx)
+            idx += 1
+
+        self.schedule_list = [x for idx, x in enumerate(self.schedule_list)
+                              if not idx in removed_indexes]
+
+        priority, task = self.queue_object.get(block=False)
         return task
 
     def size(self):
-        return self.queue_object.qsize()
+        return self.queue_object.qsize() + len(self.schedule_list)
 
     def clear(self):
         try:
                 self.queue_object.get(False)
         except Empty:
             pass
-
-def unique_key(task):
-    return '%s:%s' % (task.name, task.url)
+        self.schedule_list = []

grab/spider/queue_backend/mongo.py

 import pymongo
 
 from .base import QueueInterface
+from ..error import SpiderMisuseError
 
 logger = logging.getLogger('grab.spider.queue_backend.mongo')
 
     def size(self):
         return self.collection.count()
 
-    def put(self, task, priority):
+    def put(self, task, priority, schedule_time=None):
+        if schedule_time is not None:
+            raise SpiderMisuseError('Mongo task queue does not support delayed task') 
         item = {
             'task': Binary(pickle.dumps(task)),
             'priority': priority,
         }
         self.collection.save(item)
 
-    def get(self, timeout):
+    def get(self):
         item = self.collection.find_and_modify(
             sort=[('priority', pymongo.ASCENDING)],
             remove=True

grab/spider/queue_backend/redis.py

 """
 from __future__ import absolute_import
 
-from .base import QueueInterface
 from qr import PriorityQueue
 import Queue
 import random
 
+from .base import QueueInterface
+from ..error import SpiderMisuseError
+
 class QueueBackend(QueueInterface):
     def __init__(self, spider_name, queue_name=None, **kwargs):
         super(QueueInterface, self).__init__(**kwargs)
         self.queue_name = queue_name
         self.queue_object = PriorityQueue(queue_name)
 
-    def put(self, task, priority):
+    def put(self, task, priority, schedule_time=None):
         # Add attribute with random value
         # This is required because qr library
         # does not allow to store multiple values with same hash
         # in the PriorityQueue
+
+        if schedule_time is not None:
+            raise SpiderMisuseError('Mongo task queue does not support delayed task') 
         task._rnd = random.random()
         self.queue_object.push(task, priority)
 
-    def get(self, timeout):
+
+    def get(self):
         task = self.queue_object.pop()
         if task is None:
             raise Queue.Empty()
     def clear(self):
         try:
             while True:
-                self.get(0)
+                self.get()
         except Queue.Empty:
             pass
 from __future__ import absolute_import
 from random import randint
+from datetime import datetime, timedelta
 
 from .error import SpiderMisuseError
 from ..base import copy_config
                  network_try_count=0, task_try_count=0, 
                  disable_cache=False, refresh_cache=False,
                  valid_status=[], use_proxylist=True,
-                 cache_timeout=None,
+                 cache_timeout=None, delay=0,
                  **kwargs):
         """
         Create `Task` object.
             :param use_proxylist: it means to use proxylist which was configured
                 via `setup_proxylist` method of spider
             :param cache_timeout: maximum age (in seconds) of cache record to be valid
-
+            :param delay: if specified, tells the spider to schedule the task
+                and execute it after `delay` seconds
             Any non-standard named arguments passed to `Task` constructor will be saved as
             attributes of the object. You can get their values later as attributes or with
             `get` method which allows to use default value if attrubute does not exist.
             self.url = url
             self.grab_config = None
 
+        self.process_delay_option(delay)
+
         self.priority_is_custom = priority_is_custom
         self.priority = priority
         self.network_try_count = network_try_count
         """
         return getattr(self, key, default)
 
+    def process_delay_option(self, delay):
+        if delay:
+            self.schedule_time = datetime.now() + timedelta(seconds=delay)
+            self.original_delay = delay
+        else:
+            self.schedule_time = None
+            self.original_delay = None
+
     def clone(self, **kwargs):
         """
         Clone Task instance.
         for key, value in kwargs.items():
             setattr(task, key, value)
 
+        task.process_delay_option(task.get('delay', None))
+
         return task
 
     def __repr__(self):
             return data
     else:
         # dict, tuple, list should be serialized into byte-string
-        return urlencode(data, charset)
+        return smart_urlencode(data, charset)

grab/util/default_project/database.py

import pymongo

# Project-wide mongo database handle; the {{ PROJECT_NAME }} macro is
# substituted by the `start_project` script.
# NOTE(review): the spider.py template does `from database import
# mongodb`, but this module defines only `db` — confirm which name is
# intended.
db = pymongo.Connection()['{{ PROJECT_NAME }}']

grab/util/default_project/settings.py

# Proxy list used by the project spiders: a plain-text file with one
# proxy address per line.
PROXY_LIST = {
    'source': '/web/proxy.txt',
    'source_type': 'text_file',
}

grab/util/default_project/spider.py

+#!/usr/bin/env python
+# coding: utf-8
+from grab.spider import Spider, Task
+from grab.tools.logs import default_logging
+from grab import Grab
+import logging
+from urlparse import urlsplit, parse_qs, parse_qsl, urlunsplit, urljoin
+from grab.tools.lxml_tools import parse_html, render_html, drop_node, clone_node
+import traceback
+import urllib
+from collections import defaultdict
+import re
+
+from database import mongodb
+
+class {{ PROJECT_NAME_CAMELCASE }}Spider(Spider):
+    initial_urls = ['']
+
+    def task_initial(self, grab, task):
+        pass
         try:
             mod = __import__(path, None, None, ['foo'])
         except ImportError, ex:
-            logging.error('', exc_info=ex)
+            if not path in unicode(ex):
+                logging.error('', exc_info=ex)
         else:
             for key in dir(mod):
                 val = getattr(mod, key)
 
     def setup_queue(self, bot):
         bot.setup_queue(backend='memory')
+
+    def test_schedule(self):
+        """
+        In this test I create a number of delayed task
+        and then check the order in which they was executed
+        """
+
+        class TestSpider(Spider):
+            def prepare(self):
+                self.numbers = []
+
+            def task_generator(self):
+                yield Task('page', url=SERVER.BASE_URL, num=1)
+                yield Task('page', url=SERVER.BASE_URL, delay=1.5, num=2)
+                yield Task('page', url=SERVER.BASE_URL, delay=0.5, num=3)
+                yield Task('page', url=SERVER.BASE_URL, delay=1, num=4)
+
+            def task_page(self, grab, task):
+                self.numbers.append(task.num)
+
+        bot = TestSpider()
+        self.setup_queue(bot)
+        bot.run()
+        self.assertEqual(bot.numbers, [1, 3, 4, 2])