Module `pywebcopy.core`

Expand source code

# Copyright 2020; Raja Tomar
# See license for more details
import logging
import operator
import os

from lxml.html import HTMLParser
from lxml.html import XHTML_NAMESPACE
from lxml.html import parse
from requests.models import Response

from .elements import HTMLResource
from .helpers import RewindableResponse
from .schedulers import crawler_scheduler
from .schedulers import default_scheduler
from .schedulers import threading_crawler_scheduler
from .schedulers import threading_default_scheduler

__all__ = ['WebPage', 'Crawler']

logger = logging.getLogger(__name__)


class WebPage(HTMLResource):
    """
    WebPage built upon HTMLResource element.
    It provides various utilities like form-filling,
    external response processing, getting list of links,
    dumping html and opening the html in the browser.
    """
    @classmethod
    def from_config(cls, config):
        """It creates a `WebPage` object from a set config object.
        Under the hood it checks whether the config is set or not,
        then it creates a `session` using the `config.create_session()` method.
        It then creates a `scheduler` based on whether the threading is enabled or not.
        It also defines a `context` object which stores the path metadata for this structure.

        :param config: configuration object providing `is_set()`, `get()`,
            `create_session()`, `create_context()` and
            `get_thread_join_timeout()`.
        :raises AttributeError: if the config reports that it is not set.
        """
        if config and not config.is_set():
            raise AttributeError("Configuration is not setup.")

        session = config.create_session()
        if config.get('threaded'):
            scheduler = threading_default_scheduler(
                timeout=config.get_thread_join_timeout())
        else:
            scheduler = default_scheduler()
        context = config.create_context()
        ans = cls(session, config, scheduler, context)
        # XXX: Check connection to the url here?
        return ans

    def __repr__(self):
        """Short representation of this instance."""
        return '<{}: {}>'.format(self.__class__.__name__, self.url)

    element_map = property(
        operator.attrgetter('scheduler.data'),
        doc="Registry of different handler for different tags."
    )

    def set_response(self, response):
        """
        Set an explicit `requests.Response` object directly.
        It accepts a `requests.Response` object and wraps its raw stream
        in a `RewindableResponse` object.
        It also enables decoding in the original `urllib3` response object.

        You can use it like this
            import requests
            resp = requests.get('https://www.httpbin.org/', stream=True)
            wp = WebPage.from_config(config)
            wp.set_response(resp)
            wp.get_forms()

        :raises ValueError: if `response` is not a `requests.Response`.
        """
        if not isinstance(response, Response):
            raise ValueError("Expected %r, got %r" % (Response, response))
        response.raw.decode_content = True
        response.raw = RewindableResponse(response.raw)
        return super(WebPage, self).set_response(response)

    def get_source(self, buffered=False):
        """Returns the page source together with its encoding.

        The raw stream of the response is rewound before it is handed out.
        When the buffered stream is consumed by the caller, its `.rewind()`
        method should be called to restore the contents.

            wp = WebPage.from_config(config)
            wp.get('http://httpbin.org/forms/')
            src, encoding = wp.get_source(buffered=True)
            contents = src.read()
            src.rewind()

        :param buffered: when true the rewound raw stream object itself is
            returned, otherwise the result of its `.read()` call.
        :returns: a `(source, encoding)` tuple.
        :raises ValueError: if no response with a `raw` attribute is set.
        """
        raw = getattr(self.response, 'raw', None)
        if raw is None:
            raise ValueError(
                "HTTP Response is not set at the `.response` attribute! "
                "Use the `.get()` method or `.set_response()` methods to set it.")

        # Return the raw object which will decode the
        # buffer while reading otherwise errors will follow
        raw.decode_content = True

        raw.rewind()
        if buffered:
            return raw, self.encoding
        return raw.read(), self.encoding

    def refresh(self):
        """Re-fetches the resource from the internet using the session."""
        self.set_response(self.session.get(self.url, stream=True))

    def get_forms(self):
        """Returns a list of form elements available on the page."""
        source, encoding = self.get_source(buffered=True)
        return parse(
            source, parser=HTMLParser(encoding=encoding, collect_ids=False)
        ).xpath(
            "descendant-or-self::form|descendant-or-self::x:form",
            namespaces={'x': XHTML_NAMESPACE}
        )

    def submit_form(self, form, **extra_values):
        """
        Helper function to submit a `lxml` form.

        Example:

            wp = WebPage.from_config(config)
            wp.get('http://httpbin.org/forms/')
            form = wp.get_forms()[0]
            form.inputs['email'].value = 'bar' # etc
            form.inputs['password'].value = 'baz' # etc
            wp.submit_form(form)
            wp.get_links()

        The request method is taken from `form.method`, the target URL is the
        first non-empty value of `form.action`, `form.base_url` and
        `self.url`, and the data is the form values extended with any
        `extra_values` given as keyword arguments.
        """
        values = form.form_values()
        if extra_values:
            if hasattr(extra_values, 'items'):
                extra_values = extra_values.items()
            values.extend(extra_values)

        if form.action:
            url = form.action
        elif form.base_url:
            url = form.base_url
        else:
            url = self.url
        return self.request(form.method, url, data=values)

    def get_files(self):
        """
        Returns a generator over the third item of every entry
        produced by `self.parse()` (urls of css, js, images etc.).
        """
        return (e[2] for e in self.parse())

    def get_links(self):
        """
        Returns a generator over the urls found in the anchor tags only.
        """
        return (e[2] for e in self.parse() if e[0].tag == 'a')

    def scrap_html(self, url):
        """Returns the html of the given url.

        :param url: address of the target page.
        """
        response = self.session.get(url)
        response.raise_for_status()
        return response.content

    def scrap_links(self, url):
        """Returns the `links` attribute of the response for a given url.

        :param url: address of the target page.
        """
        response = self.session.get(url)
        response.raise_for_status()
        # `links` is an attribute of the response, not a method; calling it
        # raised a TypeError.
        return response.links

    def dump_html(self, filename=None):
        """Saves the html of the page to a default or specified file.

        :param filename: path of the file to write the contents to;
            falls back to `self.filepath` when empty.
        :returns: the path that was written.
        """
        filename = filename or self.filepath
        # Read the source before opening the file so that a failing
        # `get_source()` cannot leave behind an empty or truncated file.
        source, _ = self.get_source()
        with open(filename, 'wb') as fh:
            fh.write(source)
        return filename

    def save_complete(self, pop=False):
        """Saves the complete html+assets on page to a file and
        also writes its linked files to the disk.

        Implements the combined logic of save_assets and save_html in
        compact form with checks and validation.

        :param pop: when true, `open_in_browser()` is called afterwards.
        :returns: `self.filepath`.
        :raises TypeError: if `viewing_html()` is false.
        """
        if not self.viewing_html():
            raise TypeError(
                "Not viewing a html page. Please check the link!")

        self.scheduler.handle_resource(self)
        if pop:
            self.open_in_browser()
        return self.filepath

    def open_in_browser(self):
        """Open the page in the default browser if it has been saved.

        You need to use the :meth:`~WebPage.save_complete` to make it work.

        :returns: False when the file does not exist, otherwise the
            result of `webbrowser.open()`.
        """
        if not os.path.exists(self.filepath):
            self.logger.info(
                "Can't find the file to open in browser: %s" % self.filepath)
            return False

        self.logger.info(
            "Opening default browser with file: %s" % self.filepath)
        import webbrowser
        return webbrowser.open('file:///' + self.filepath)

    # handy shortcuts
    run = crawl = save_assets = save_complete


class Crawler(WebPage):
    """`WebPage` variant that is built with the crawler schedulers."""

    @classmethod
    def from_config(cls, config):
        """
        Build a `Crawler` from an already set-up config object.

        The config is first checked with `config.is_set()`; afterwards a
        `session` is created through `config.create_session()`, a crawler
        scheduler is chosen depending on the `threaded` option and a
        `context` is created through `config.create_context()`.

        :raises AttributeError: if the config reports that it is not set.
        """
        if config and not config.is_set():
            raise AttributeError("Configuration is not setup.")

        session = config.create_session()
        # The threaded scheduler additionally receives the join timeout.
        scheduler = (
            threading_crawler_scheduler(
                timeout=config.get_thread_join_timeout())
            if config.get('threaded')
            else crawler_scheduler()
        )
        context = config.create_context()
        # XXX: Check connection to the url here?
        return cls(session, config, scheduler, context)

Classes

class Crawler (session, config, scheduler, context, response=None)

WebPage built upon HTMLResource element. It provides various utilities like form-filling, external response processing, getting list of links, dumping html and opening the html in the browser.

Generic internet resource which processes a server response based on responses content-type. Downloadable file if allowed in config would be downloaded. Css file would be parsed using suitable parser. Html will also be parsed using suitable html parser.

:param session: http client used for networking.
:param config: project configuration handler.
:param response: http response from the server.
:param scheduler: response processor scheduler.
:param context: context of this response; should contain base-location, base-url etc.

Expand source code

class Crawler(WebPage):
    """`WebPage` variant that is built with the crawler schedulers."""

    @classmethod
    def from_config(cls, config):
        """
        Build a `Crawler` from an already set-up config object.

        The config is first checked with `config.is_set()`; afterwards a
        `session` is created through `config.create_session()`, a crawler
        scheduler is chosen depending on the `threaded` option and a
        `context` is created through `config.create_context()`.

        :raises AttributeError: if the config reports that it is not set.
        """
        if config and not config.is_set():
            raise AttributeError("Configuration is not setup.")

        session = config.create_session()
        # The threaded scheduler additionally receives the join timeout.
        scheduler = (
            threading_crawler_scheduler(
                timeout=config.get_thread_join_timeout())
            if config.get('threaded')
            else crawler_scheduler()
        )
        context = config.create_context()
        # XXX: Check connection to the url here?
        return cls(session, config, scheduler, context)

Ancestors

Static methods

def from_config(config)

It creates a Crawler object from a set config object. Under the hood it checks whether the config is set or not, then it creates a session using the config.create_session() method. It then creates a scheduler based on whether the threading is enabled or not. The scheduler is different from the WebPage objects scheduler due to its ability to process the anchor tags links to different pages. It also defines a context object which stores the path metadata for this structure.

Expand source code

@classmethod
def from_config(cls, config):
    """
    Build a `Crawler` from an already set-up config object.

    The config is first checked with `config.is_set()`; afterwards a
    `session` is created through `config.create_session()`, a crawler
    scheduler is chosen depending on the `threaded` option and a
    `context` is created through `config.create_context()`.

    :raises AttributeError: if the config reports that it is not set.
    """
    if config and not config.is_set():
        raise AttributeError("Configuration is not setup.")

    session = config.create_session()
    # The threaded scheduler additionally receives the join timeout.
    scheduler = (
        threading_crawler_scheduler(
            timeout=config.get_thread_join_timeout())
        if config.get('threaded')
        else crawler_scheduler()
    )
    context = config.create_context()
    # XXX: Check connection to the url here?
    return cls(session, config, scheduler, context)

Inherited members

WebPage:
- close
- content_type
- crawl
- dump_html
- element_map
- encoding
- extract_children
- filename
- filepath
- get
- get_files
- get_forms
- get_links
- get_source
- open_in_browser
- parse
- post
- refresh
- request
- resolve
- retrieve
- run
- save_assets
- save_complete
- scrap_html
- scrap_links
- set_response
- submit_form
- url
- viewing_css
- viewing_html
- viewing_js

class WebPage (session, config, scheduler, context, response=None)

WebPage built upon HTMLResource element. It provides various utilities like form-filling, external response processing, getting list of links, dumping html and opening the html in the browser.

Expand source code

class WebPage(HTMLResource):
    """
    WebPage built upon HTMLResource element.
    It provides various utilities like form-filling,
    external response processing, getting list of links,
    dumping html and opening the html in the browser.
    """
    @classmethod
    def from_config(cls, config):
        """It creates a `WebPage` object from a set config object.
        Under the hood it checks whether the config is set or not,
        then it creates a `session` using the `config.create_session()` method.
        It then creates a `scheduler` based on whether the threading is enabled or not.
        It also defines a `context` object which stores the path metadata for this structure.

        :param config: configuration object providing `is_set()`, `get()`,
            `create_session()`, `create_context()` and
            `get_thread_join_timeout()`.
        :raises AttributeError: if the config reports that it is not set.
        """
        if config and not config.is_set():
            raise AttributeError("Configuration is not setup.")

        session = config.create_session()
        if config.get('threaded'):
            scheduler = threading_default_scheduler(
                timeout=config.get_thread_join_timeout())
        else:
            scheduler = default_scheduler()
        context = config.create_context()
        ans = cls(session, config, scheduler, context)
        # XXX: Check connection to the url here?
        return ans

    def __repr__(self):
        """Short representation of this instance."""
        return '<{}: {}>'.format(self.__class__.__name__, self.url)

    element_map = property(
        operator.attrgetter('scheduler.data'),
        doc="Registry of different handler for different tags."
    )

    def set_response(self, response):
        """
        Set an explicit `requests.Response` object directly.
        It accepts a `requests.Response` object and wraps its raw stream
        in a `RewindableResponse` object.
        It also enables decoding in the original `urllib3` response object.

        You can use it like this
            import requests
            resp = requests.get('https://www.httpbin.org/', stream=True)
            wp = WebPage.from_config(config)
            wp.set_response(resp)
            wp.get_forms()

        :raises ValueError: if `response` is not a `requests.Response`.
        """
        if not isinstance(response, Response):
            raise ValueError("Expected %r, got %r" % (Response, response))
        response.raw.decode_content = True
        response.raw = RewindableResponse(response.raw)
        return super(WebPage, self).set_response(response)

    def get_source(self, buffered=False):
        """Returns the page source together with its encoding.

        The raw stream of the response is rewound before it is handed out.
        When the buffered stream is consumed by the caller, its `.rewind()`
        method should be called to restore the contents.

            wp = WebPage.from_config(config)
            wp.get('http://httpbin.org/forms/')
            src, encoding = wp.get_source(buffered=True)
            contents = src.read()
            src.rewind()

        :param buffered: when true the rewound raw stream object itself is
            returned, otherwise the result of its `.read()` call.
        :returns: a `(source, encoding)` tuple.
        :raises ValueError: if no response with a `raw` attribute is set.
        """
        raw = getattr(self.response, 'raw', None)
        if raw is None:
            raise ValueError(
                "HTTP Response is not set at the `.response` attribute! "
                "Use the `.get()` method or `.set_response()` methods to set it.")

        # Return the raw object which will decode the
        # buffer while reading otherwise errors will follow
        raw.decode_content = True

        raw.rewind()
        if buffered:
            return raw, self.encoding
        return raw.read(), self.encoding

    def refresh(self):
        """Re-fetches the resource from the internet using the session."""
        self.set_response(self.session.get(self.url, stream=True))

    def get_forms(self):
        """Returns a list of form elements available on the page."""
        source, encoding = self.get_source(buffered=True)
        return parse(
            source, parser=HTMLParser(encoding=encoding, collect_ids=False)
        ).xpath(
            "descendant-or-self::form|descendant-or-self::x:form",
            namespaces={'x': XHTML_NAMESPACE}
        )

    def submit_form(self, form, **extra_values):
        """
        Helper function to submit a `lxml` form.

        Example:

            wp = WebPage.from_config(config)
            wp.get('http://httpbin.org/forms/')
            form = wp.get_forms()[0]
            form.inputs['email'].value = 'bar' # etc
            form.inputs['password'].value = 'baz' # etc
            wp.submit_form(form)
            wp.get_links()

        The request method is taken from `form.method`, the target URL is the
        first non-empty value of `form.action`, `form.base_url` and
        `self.url`, and the data is the form values extended with any
        `extra_values` given as keyword arguments.
        """
        values = form.form_values()
        if extra_values:
            if hasattr(extra_values, 'items'):
                extra_values = extra_values.items()
            values.extend(extra_values)

        if form.action:
            url = form.action
        elif form.base_url:
            url = form.base_url
        else:
            url = self.url
        return self.request(form.method, url, data=values)

    def get_files(self):
        """
        Returns a generator over the third item of every entry
        produced by `self.parse()` (urls of css, js, images etc.).
        """
        return (e[2] for e in self.parse())

    def get_links(self):
        """
        Returns a generator over the urls found in the anchor tags only.
        """
        return (e[2] for e in self.parse() if e[0].tag == 'a')

    def scrap_html(self, url):
        """Returns the html of the given url.

        :param url: address of the target page.
        """
        response = self.session.get(url)
        response.raise_for_status()
        return response.content

    def scrap_links(self, url):
        """Returns the `links` attribute of the response for a given url.

        :param url: address of the target page.
        """
        response = self.session.get(url)
        response.raise_for_status()
        # `links` is an attribute of the response, not a method; calling it
        # raised a TypeError.
        return response.links

    def dump_html(self, filename=None):
        """Saves the html of the page to a default or specified file.

        :param filename: path of the file to write the contents to;
            falls back to `self.filepath` when empty.
        :returns: the path that was written.
        """
        filename = filename or self.filepath
        # Read the source before opening the file so that a failing
        # `get_source()` cannot leave behind an empty or truncated file.
        source, _ = self.get_source()
        with open(filename, 'wb') as fh:
            fh.write(source)
        return filename

    def save_complete(self, pop=False):
        """Saves the complete html+assets on page to a file and
        also writes its linked files to the disk.

        Implements the combined logic of save_assets and save_html in
        compact form with checks and validation.

        :param pop: when true, `open_in_browser()` is called afterwards.
        :returns: `self.filepath`.
        :raises TypeError: if `viewing_html()` is false.
        """
        if not self.viewing_html():
            raise TypeError(
                "Not viewing a html page. Please check the link!")

        self.scheduler.handle_resource(self)
        if pop:
            self.open_in_browser()
        return self.filepath

    def open_in_browser(self):
        """Open the page in the default browser if it has been saved.

        You need to use the :meth:`~WebPage.save_complete` to make it work.

        :returns: False when the file does not exist, otherwise the
            result of `webbrowser.open()`.
        """
        if not os.path.exists(self.filepath):
            self.logger.info(
                "Can't find the file to open in browser: %s" % self.filepath)
            return False

        self.logger.info(
            "Opening default browser with file: %s" % self.filepath)
        import webbrowser
        return webbrowser.open('file:///' + self.filepath)

    # handy shortcuts
    run = crawl = save_assets = save_complete

Ancestors

Subclasses

Crawler

Static methods

def from_config(config)

It creates a WebPage object from a set config object. Under the hood it checks whether the config is set or not, then it creates a session using the config.create_session() method. It then creates a scheduler based on whether the threading is enabled or not. It also defines a context object which stores the path metadata for this structure.

Expand source code

@classmethod
def from_config(cls, config):
    """Build a `WebPage` from an already set-up config object.

    The config is first checked with `config.is_set()`; afterwards a
    `session` is created through `config.create_session()`, a default
    scheduler is chosen depending on the `threaded` option and a
    `context` is created through `config.create_context()`.

    :raises AttributeError: if the config reports that it is not set.
    """
    if config and not config.is_set():
        raise AttributeError("Configuration is not setup.")

    session = config.create_session()
    # The threaded scheduler additionally receives the join timeout.
    scheduler = (
        threading_default_scheduler(
            timeout=config.get_thread_join_timeout())
        if config.get('threaded')
        else default_scheduler()
    )
    context = config.create_context()
    # XXX: Check connection to the url here?
    return cls(session, config, scheduler, context)

Instance variables

var element_map: Registry of different handler for different tags.

Methods

def crawl(self, pop=False)

Saves the complete html+assets on page to a file and also writes its linked files to the disk.

Implements the combined logic of save_assets and save_html in compact form with checks and validation.

Expand source code

def save_complete(self, pop=False):
    """Save the html page together with its linked files to the disk.

    The page is handed to `self.scheduler.handle_resource()`; the method
    refuses to run when `viewing_html()` is false.

    :param pop: when true, `open_in_browser()` is called afterwards.
    :returns: `self.filepath`.
    :raises TypeError: if the current resource is not a html page.
    """
    if self.viewing_html():
        self.scheduler.handle_resource(self)
        if pop:
            self.open_in_browser()
        return self.filepath

    raise TypeError(
        "Not viewing a html page. Please check the link!")

def dump_html(self, filename=None)

Saves the html of the page to a default or specified file.

:param filename: path of the file to write the contents to

Expand source code

def dump_html(self, filename=None):
    """Saves the html of the page to a default or specified file.

    :param filename: path of the file to write the contents to;
        falls back to `self.filepath` when empty.
    :returns: the path that was written.
    """
    filename = filename or self.filepath
    # Read the source before opening the file so that a failing
    # `get_source()` cannot leave behind an empty or truncated file.
    source, _ = self.get_source()
    with open(filename, 'wb') as fh:
        fh.write(source)
    return filename

def get_files(self)

Returns a generator over the urls of the files found on the page (css, js, images etc.).

Expand source code

def get_files(self):
    """
    Returns a generator over the third item of every entry
    produced by `self.parse()` (urls of css, js, images etc.).
    """
    entries = self.parse()
    return (entry[2] for entry in entries)

def get_forms(self)

Returns a list of form elements available on the page.

Expand source code

def get_forms(self):
    """Returns a list of form elements available on the page."""
    stream, charset = self.get_source(buffered=True)
    html_parser = HTMLParser(encoding=charset, collect_ids=False)
    document = parse(stream, parser=html_parser)
    # Match plain as well as XHTML-namespaced form elements.
    return document.xpath(
        "descendant-or-self::form|descendant-or-self::x:form",
        namespaces={'x': XHTML_NAMESPACE}
    )

def get_links(self)

Returns a generator over the urls found in the anchor tags only.

Expand source code

def get_links(self):
    """
    Returns a generator over the urls found in the anchor tags only.
    """
    entries = self.parse()
    return (entry[2] for entry in entries if entry[0].tag == 'a')

def get_source(self, buffered=False)

Returns the requests.Response object in a rewindable io wrapper. Contents can be consumed then the .rewind() method should be called to restore the contents of the original response.

wp = WebPage()
wp.get('http://httpbin.org/forms/')
src, encoding = wp.get_source(buffered=True)
contents = src.read()
src.rewind()

:param buffered: whether to return the readable file-like object or its already-read contents. :returns: a `(source, encoding)` tuple.

Expand source code

def get_source(self, buffered=False):
    """Returns the page source together with its encoding.

    The raw stream of the response is rewound before it is handed out.
    When the buffered stream is consumed by the caller, its `.rewind()`
    method should be called to restore the contents.

        wp = WebPage.from_config(config)
        wp.get('http://httpbin.org/forms/')
        src, encoding = wp.get_source(buffered=True)
        contents = src.read()
        src.rewind()

    :param buffered: when true the rewound raw stream object itself is
        returned, otherwise the result of its `.read()` call.
    :returns: a `(source, encoding)` tuple.
    :raises ValueError: if no response with a `raw` attribute is set.
    """
    raw = getattr(self.response, 'raw', None)
    if raw is None:
        raise ValueError(
            "HTTP Response is not set at the `.response` attribute! "
            "Use the `.get()` method or `.set_response()` methods to set it.")

    # Return the raw object which will decode the
    # buffer while reading otherwise errors will follow
    raw.decode_content = True

    raw.rewind()
    if buffered:
        return raw, self.encoding
    return raw.read(), self.encoding

def open_in_browser(self)

Open the page in the default browser if it has been saved.

You need to call `WebPage.save_complete` first to make it work.

Expand source code

def open_in_browser(self):
    """Open the saved page in the default browser.

    The file at `self.filepath` has to exist already, i.e.
    :meth:`~WebPage.save_complete` should have been used before.

    :returns: False when the file does not exist, otherwise the
        result of `webbrowser.open()`.
    """
    if os.path.exists(self.filepath):
        self.logger.info(
            "Opening default browser with file: %s" % self.filepath)
        import webbrowser
        return webbrowser.open('file:///' + self.filepath)

    # Nothing on disk to show.
    self.logger.info(
        "Can't find the file to open in browser: %s" % self.filepath)
    return False

def refresh(self)

Re-fetches the resource from the internet using the session.

Expand source code

def refresh(self):
    """Fetch `self.url` again through the session and store the new response."""
    fresh = self.session.get(self.url, stream=True)
    self.set_response(fresh)

def run(self, pop=False)

Saves the complete html+assets on page to a file and also writes its linked files to the disk.

Implements the combined logic of save_assets and save_html in compact form with checks and validation.

Expand source code

def save_complete(self, pop=False):
    """Save the html page together with its linked files to the disk.

    The page is handed to `self.scheduler.handle_resource()`; the method
    refuses to run when `viewing_html()` is false.

    :param pop: when true, `open_in_browser()` is called afterwards.
    :returns: `self.filepath`.
    :raises TypeError: if the current resource is not a html page.
    """
    if self.viewing_html():
        self.scheduler.handle_resource(self)
        if pop:
            self.open_in_browser()
        return self.filepath

    raise TypeError(
        "Not viewing a html page. Please check the link!")

def save_assets(self, pop=False)

Saves the complete html+assets on page to a file and also writes its linked files to the disk.

Implements the combined logic of save_assets and save_html in compact form with checks and validation.

Expand source code

def save_complete(self, pop=False):
    """Save the html page together with its linked files to the disk.

    The page is handed to `self.scheduler.handle_resource()`; the method
    refuses to run when `viewing_html()` is false.

    :param pop: when true, `open_in_browser()` is called afterwards.
    :returns: `self.filepath`.
    :raises TypeError: if the current resource is not a html page.
    """
    if self.viewing_html():
        self.scheduler.handle_resource(self)
        if pop:
            self.open_in_browser()
        return self.filepath

    raise TypeError(
        "Not viewing a html page. Please check the link!")

def save_complete(self, pop=False)

Saves the complete html+assets on page to a file and also writes its linked files to the disk.

Implements the combined logic of save_assets and save_html in compact form with checks and validation.

Expand source code

def save_complete(self, pop=False):
    """Save the html page together with its linked files to the disk.

    The page is handed to `self.scheduler.handle_resource()`; the method
    refuses to run when `viewing_html()` is false.

    :param pop: when true, `open_in_browser()` is called afterwards.
    :returns: `self.filepath`.
    :raises TypeError: if the current resource is not a html page.
    """
    if self.viewing_html():
        self.scheduler.handle_resource(self)
        if pop:
            self.open_in_browser()
        return self.filepath

    raise TypeError(
        "Not viewing a html page. Please check the link!")

def scrap_html(self, url)

Returns the html of the given url.

:param url: address of the target page.

Expand source code

def scrap_html(self, url):
    """Fetch `url` through the session and return the response content.

    An error status is raised through `raise_for_status()` before the
    content is returned.

    :param url: address of the target page.
    """
    reply = self.session.get(url)
    reply.raise_for_status()
    html = reply.content
    return html

def scrap_links(self, url)

Returns all the links from a given url.

:param url: address of the target page.

Expand source code

def scrap_links(self, url):
    """Returns the `links` attribute of the response for a given url.

    An error status is raised through `raise_for_status()` first.

    :param url: address of the target page.
    """
    response = self.session.get(url)
    response.raise_for_status()
    # `links` is an attribute of the response, not a method; calling it
    # raised a TypeError.
    return response.links

def set_response(self, response)

Set an explicit requests.Response object directly. It accepts a requests.Response object and wraps it in a RewindableResponse object. It also enables decoding in the original urllib3 response object.

You can use it like this:

    import requests
    resp = requests.get('https://www.httpbin.org/')
    wp = WebPage()
    wp.set_response(resp)
    wp.get_forms()

Expand source code

def set_response(self, response):
    """
    Store an externally created `requests.Response` on this page.

    Content decoding is switched on for the raw stream of the response
    and the stream is wrapped in a `RewindableResponse` before the
    response is passed on to the parent implementation.

    You can use it like this
        import requests
        resp = requests.get('https://www.httpbin.org/')
        wp = WebPage()
        wp.set_response(resp)
        wp.get_forms()

    :raises ValueError: if `response` is not a `requests.Response`.
    """
    if isinstance(response, Response):
        stream = response.raw
        stream.decode_content = True
        response.raw = RewindableResponse(stream)
        return super(WebPage, self).set_response(response)

    raise ValueError("Expected %r, got %r" % (Response, response))

def submit_form(self, form, **extra_values)

Helper function to submit a lxml form.

Example

    wp = WebPage()
    wp.get('http://httpbin.org/forms/')
    form = wp.get_forms()[0]
    form.inputs['email'].value = 'bar'  # etc
    form.inputs['password'].value = 'baz'  # etc
    wp.submit_form(form)
    wp.get_links()

The action is one of 'GET' or 'POST', the URL is the target URL as a string, and the values are a sequence of (name, value) tuples with the form data.

Expand source code

def submit_form(self, form, **extra_values):
    """
    Submit a `lxml` form through this page's `request()` method.

    Example:

        wp = WebPage()
        wp.get('http://httpbin.org/forms/')
        form = wp.get_forms()[0]
        form.inputs['email'].value = 'bar' # etc
        form.inputs['password'].value = 'baz' # etc
        wp.submit_form(form)
        wp.get_links()

    The request method is taken from `form.method`, the target URL is the
    first non-empty value of `form.action`, `form.base_url` and `self.url`,
    and the data is the form values extended with any `extra_values`
    given as keyword arguments.
    """
    payload = form.form_values()
    if extra_values:
        # Keyword arguments always arrive as a dict.
        payload.extend(extra_values.items())

    target = form.action or form.base_url or self.url
    return self.request(form.method, target, data=payload)

Inherited members

HTMLResource:
- close
- content_type
- encoding
- extract_children
- filename
- filepath
- get
- parse
- post
- request
- resolve
- retrieve
- url
- viewing_css
- viewing_html
- viewing_js