# Path: grab/grab/ext/
# Copyright: 2011, Grigoriy Petukhov
# Author: Grigoriy Petukhov
# License: BSD
from __future__ import absolute_import
# NOTE(review): the module path of the original `urlencode` import was lost
# (the line read `from import urlencode`); the stdlib function is assumed
# here -- confirm against the project's own helper if one exists.
from urllib import urlencode
from urlparse import urljoin

from ..base import DataNotFound, GrabMisuseError

# TODO: refactor this hell

class FormExtension(object):
    """
    Form handling methods.

    The methods expect the host object to provide the `tree`, `css`,
    `xpath`, `response`, `setup` and `request` attributes.
    """

    def extra_reset(self):
        """
        Forget the default form and the remembered file field values.
        """

        self._lxml_form = None
        self._file_fields = {}

    def choose_form(self, number=None, id=None, name=None, xpath=None):
        """
        Set the default form.

        :param number: number of form (starting from zero)
        :param id: value of "id" attribute
        :param name: value of "name" attribute
        :param xpath: XPath query
        :raises: :class:`DataNotFound` if form not found
        :raises: :class:`GrabMisuseError` if method is called without parameters

        If several arguments are given only one is used, the priority is:
        `id`, `name`, `number`, `xpath`.

        Selected form will be available via `form` attribute of `Grab`
        instance. All form methods will work with default form.

        Examples::

            # Select second form
            g.choose_form(1)

            # Select by id
            g.choose_form(id="register")

            # Select by name
            g.choose_form(name="signup")

            # Select by xpath
            g.choose_form(xpath='//form[contains(@action, "/submit")]')
        """

        if id is not None:
            try:
                self._lxml_form = self.css('form[id="%s"]' % id)
            except IndexError:
                raise DataNotFound("There is no form with id: %s" % id)
        elif name is not None:
            try:
                self._lxml_form = self.css('form[name="%s"]' % name)
            except IndexError:
                raise DataNotFound('There is no form with name: %s' % name)
        elif number is not None:
            try:
                self._lxml_form = self.tree.forms[number]
            except IndexError:
                raise DataNotFound('There is no form with number: %s' % number)
        elif xpath is not None:
            try:
                self._lxml_form = self.xpath(xpath)
            except IndexError:
                raise DataNotFound('Could not find form with xpath: %s' % xpath)
        else:
            raise GrabMisuseError('choose_form methods requires one of '
                                  '[number, id, name, xpath] arguments')

    @property
    def form(self):
        """
        This attribute points to default form.

        If form was not selected manually then select the form
        which has the biggest number of fields (the first such form
        if several forms have the same number of fields).

        The form value is just an `lxml.html` form element.

        Usage::

            g.go('some URL')
            # Choose form automatically
            print(g.form)

            # And now choose form manually
            g.choose_form(1)
            print(g.form)
        """

        if self._lxml_form is None:
            forms = [(idx, len(x.fields)) for idx, x in enumerate(self.tree.forms)]
            # sorted() is stable, so among equally sized forms the first wins
            idx = sorted(forms, key=lambda x: x[1], reverse=True)[0][0]
            self.choose_form(idx)
        return self._lxml_form

    def set_input(self, name, value):
        """
        Set the value of form element by its `name` attribute.

        :param name: name of element
        :param value: value which should be set to element

        To check/uncheck the checkbox pass boolean value.

        If no default form is selected yet then the form which contains
        an element with such name becomes the default form.

        Usage::

            g.set_input('sex', 'male')

            # Check the checkbox
            g.set_input('accept', True)
        """

        if self._lxml_form is None:
            self.choose_form_by_element('.//*[@name="%s"]' % name)
        elem = self.form.inputs[name]

        # A boolean value toggles the checkbox instead of changing its value
        if getattr(elem, 'type', None) == 'checkbox' and isinstance(value, bool):
            elem.checked = value
            return

        # We need to remember original values of file fields
        # because lxml will convert UploadContent/UploadFile object to string
        if getattr(elem, 'type', '').lower() == 'file':
            self._file_fields[name] = value
        elem.value = value

    def set_input_by_id(self, _id, value):
        """
        Set the value of form element by its `id` attribute.

        :param _id: id of element
        :param value: value which should be set to element
        """

        xpath = './/*[@id="%s"]' % _id
        if self._lxml_form is None:
            self.choose_form_by_element(xpath)
        elem = self.form.xpath(xpath)[0]
        return self.set_input(elem.get('name'), value)

    def set_input_by_number(self, number, value):
        """
        Set the value of form element by its number in the form

        Only `input` elements with `type="text"` are counted.

        :param number: number of element
        :param value: value which should be set to element
        """

        elem = self.form.xpath('.//input[@type="text"]')[number]
        return self.set_input(elem.get('name'), value)

    def set_input_by_xpath(self, xpath, value):
        """
        Set the value of form element by xpath

        :param xpath: xpath path (evaluated against the whole document)
        :param value: value which should be set to element
        """

        elem = self.tree.xpath(xpath)[0]

        if self._lxml_form is None:
            # Explicitly set the default form
            # which contains found element.
            # If the element is not inside any form then the default form
            # stays unset and `set_input` looks the form up by element name.
            parent = elem.getparent()
            while parent is not None:
                if parent.tag == 'form':
                    self._lxml_form = parent
                    break
                parent = parent.getparent()

        return self.set_input(elem.get('name'), value)

    # TODO:
    # Remove set_input_by_id
    # Remove set_input_by_number
    # New method: set_input_by(id=None, number=None, xpath=None)

    def submit(self, submit_name=None, make_request=True,
               url=None, extra_post=None):
        """
        Submit default form.

        :param submit_name: name of button which should be "clicked" to
            submit form
        :param make_request: if `False` then grab instance will be
            configured with form post data but request will not be
            performed
        :param url: explicitly specify form action url
        :param extra_post: additional form data which will override
            data automatically extracted from the form.
        :returns: result of `request()` or `None` if `make_request`
            is `False`

        Following input elements are automatically processed:

        * input[type="hidden"] - default value
        * select: value of last option
        * radio - value of the first radio button if nothing is selected
        * checkbox - not submitted if it is not checked

        Multipart forms are correctly recognized by grab library.

        Usage::

            # Assume that we going to some page with some form
            g.go('some url')
            # Fill some fields
            g.set_input('username', 'bob')
            g.set_input('pwd', '123')
            # Submit the form
            g.submit()

            # or we can just fill the form
            # and do manual submission
            g.set_input('foo', 'bar')
            g.submit(make_request=False)
            g.request()

            # for multipart forms we can specify files
            from grab import UploadFile
            g.set_input('img', UploadFile('/path/to/image.png'))
            g.submit()
        """

        # TODO: add .x and .y items
        # if submit element is image

        post = self.form_fields()

        # Build list of submit buttons which have a name
        submit_controls = {}
        for elem in self.form.inputs:
            if (elem.tag == 'input' and elem.type == 'submit' and
                    elem.get('name') is not None):
                submit_controls[elem.name] = elem

        # All this code need only for one reason:
        # to not send multiple submit keys in form data
        # in real life only this key is submitted whose button
        # was pressed
        if submit_controls:
            # If name of submit control is not given (or is unknown) then
            # use the name of first submit control in alphabetical order
            if submit_name is None or submit_name not in submit_controls:
                controls = sorted(submit_controls.values(),
                                  key=lambda x: x.name)
                submit_name = controls[0].name

            # Form data should contain only one submit control
            for name in submit_controls:
                if name != submit_name:
                    post.pop(name, None)

        if url:
            action_url = urljoin(self.response.url, url)
        else:
            action_url = urljoin(self.response.url, self.form.action)

        if extra_post:
            post.update(extra_post)

        if self.form.method == 'POST':
            if 'multipart' in self.form.get('enctype', ''):
                # Restore original upload objects which lxml
                # has converted to strings
                for key, obj in self._file_fields.items():
                    post[key] = obj
                self.setup(multipart_post=list(post.items()))
            else:
                self.setup(post=post)
            self.setup(url=action_url)
        else:
            # Form data replaces the query string of the action url
            url = (action_url.split('?')[0] + '?' +
                   urlencode(list(post.items())))
            self.setup(url=url)

        if make_request:
            return self.request()
        else:
            return None

    def form_fields(self):
        """
        Return fields of default form.

        Fill some fields with reasonable values:

        * disabled fields are removed
        * select without value gets its last option
        * radio group without value gets the value of its first button
        * unchecked checkbox is removed

        :returns: `dict` which maps field name to field value
        """

        fields = dict(self.form.fields)
        for elem in self.form.inputs:
            # Ignore elements without name
            if not elem.get('name'):
                continue

            # Do not submit disabled fields.
            # The name could be already removed by another element
            # with the same name, that is why `pop` is used.
            if elem.get('disabled'):
                fields.pop(elem.name, None)
                continue

            if elem.tag == 'select':
                if elem.name in fields and not fields[elem.name]:
                    if len(elem.value_options):
                        fields[elem.name] = elem.value_options[-1]
            if getattr(elem, 'type', None) == 'radio':
                if elem.name in fields and not fields[elem.name]:
                    fields[elem.name] = elem.get('value')
            if getattr(elem, 'type', None) == 'checkbox':
                if not elem.checked:
                    if elem.name is not None:
                        if elem.name in fields:
                            del fields[elem.name]
        return fields

    def choose_form_by_element(self, xpath):
        """
        Set the default form to the first form which contains
        an element matching `xpath`.

        If no form contains such element then the first form
        of the document is selected.

        :param xpath: XPath query evaluated relative to each form
        """

        forms = self.tree.xpath('//form')
        found_form = None
        for form in forms:
            if len(form.xpath(xpath)):
                found_form = form
                break
        self._lxml_form = found_form if found_form is not None else forms[0]