Module pywebcopy.configs

Expand source code
# Copyright 2020; Raja Tomar
# See license for more details
import logging
import os
import sys
import tempfile
from functools import partial

from requests.structures import CaseInsensitiveDict
from six import text_type
from six import string_types

from .__version__ import __title__
from .__version__ import __version__
from .urls import HIERARCHY
from .urls import get_host
from .urls import secure_filename
from .session import default_headers

__all__ = [
    'ConfigHandler',
    'get_config',
    'default_config',
    'safe_file_types',
    'safe_http_headers'
]

logger = logging.getLogger(__name__)


def add_stderr_logger(name=__title__, level=logging.DEBUG):
    """
    Helper for quickly adding a StreamHandler to the logger. Useful for
    debugging.

    If the named logger already carries a stream handler bound to
    ``sys.stderr`` that handler is reused, otherwise a new one is created
    and attached. In both cases the handler and the logger are set to
    `level`.

    :param name: name of the logger to attach to (defaults to ``__title__``).
    :param level: logging level applied to the handler and to the logger.
    :return: the stderr handler that is attached to the logger.
    """
    target = logging.getLogger(name)
    #: If there is already a stderr logger then we don't need to add another.
    for h in target.handlers:
        if isinstance(h, logging.StreamHandler) and h.stream == sys.stderr:
            # NOTE(review): stdlib handlers do not appear to consult a
            # `disabled` attribute; kept for backward compatibility.
            h.disabled = False
            h.setLevel(level)
            # The logger filters records before any handler sees them, so
            # its level has to follow the requested one as well; otherwise
            # a reused handler would never receive the lower-level records.
            target.setLevel(level)
            return h
    handler = logging.StreamHandler(sys.stderr)
    handler.setFormatter(
        logging.Formatter(
            '%(levelname)-8s - %(name)s:%(lineno)d - %(message)s'
        )
    )
    target.addHandler(handler)
    target.setLevel(level)
    target.debug('Added a stderr logging handler to logger: %s', name)
    return handler


# FIXME: Do something about these.
# Content-type patterns; presumably the types considered safe to download.
# Not referenced by `default_config` at present (see the commented-out
# 'allowed_file_types' key below).
safe_file_types = [
    'text/*',
    'image/*',
    'font/*',
    'application/pdf',
    'application/json',
]


# Header values handed to `default_headers()` as keyword arguments when the
# default configuration is built. The User-Agent embeds the package version.
safe_http_headers = {
    "Accept-Language": "en-US,en;q=0.9",
    'User-Agent':
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:70.0) "
        "Gecko/20100101 Firefox/70.0 PyWebCopyBot/%s" % __version__
}

#: Base configuration with preconfigured values.
#: The project_* keys start as None and are filled in by
#: `ConfigHandler.setup_config()` / `ConfigHandler.setup_paths()`.
default_config = {
    'debug': False,
    'project_url': None,
    'project_name': None,
    'project_folder': None,
    'threaded': None,
    'thread_join_timeout': None,
    'tree_type': HIERARCHY,

    # TODO: Allow a `last-modified-time` overwrite mode
    'overwrite': False,

    'bypass_robots': False,
    'http_cache': False,
    # Evaluated once at import time; the resulting object is shared by every
    # config that is built from (or reset to) `default_config`.
    'http_headers': default_headers(**safe_http_headers),
    'delay': None,

    # TODO: Disabled for now until I figure it out.
    # 'allowed_file_types': safe_file_types,
}


class ConfigError(AttributeError, TypeError):
    """Bad config value or operation.

    Derives from both ``AttributeError`` and ``TypeError`` so that handlers
    written for either built-in exception also catch configuration errors.
    """


class ConfigHandler(CaseInsensitiveDict):
    """Provides functionality to the config instance which
    stores and provides configuration values in every module.

    Besides the mapping interface, every key currently present is reachable
    through dynamically generated ``get_<key>()`` / ``set_<key>(value)``
    accessors (see :meth:`__getattribute__`).
    """
    def __repr__(self):  # pragma: no cover
        return '<ConfigHandler(%s)>' % self.get('project_name', 'Not Set')

    def __getattribute__(self, item):
        """Dynamic method of name `get_(key)` and `set_(key)` generation
        for all of the keys available.
        for example to change the `project_url` key
        instead of using dictionary like operation you would do
        `.get_project_url()` instead of `['project_url']`.
        `.set_project_url(new)` instead of `['project_url'] = new`.

        Accessors exist only for keys that are present at lookup time; any
        other name falls through to the regular attribute lookup.
        """
        if isinstance(item, string_types) and item.startswith('set_'):
            if item[4:] in self:
                return partial(self.__setitem__, item[4:])
        elif isinstance(item, string_types) and item.startswith('get_'):
            if item[4:] in self:
                return partial(self.__getitem__, item[4:])
        return super(ConfigHandler, self).__getattribute__(item)

    def resolve_url(self):
        """Resolves any redirects in the url and sets the final url as base url.

        Not implemented yet: always raises ``NotImplementedError``.
        """
        raise NotImplementedError()

    def reset_config(self):
        """Resets all configuration keys found in `default_config` to their default values."""
        self.update(default_config)

    def reset_key(self, key):
        """Resets a single key to its value in `default_config`.

        A key which has no default is set to None.
        """
        self.update({key: default_config.get(key)})

    def is_set(self):
        """Checks whether the configuration has required attributes or not.

        :return: True only when `project_folder`, `project_url` and
            `project_name` are all present and not None.
        """
        # Explicit comparison instead of `assert`: assertions are stripped
        # when Python runs with -O, which would make this check always pass.
        required = ('project_folder', 'project_url', 'project_name')
        return all(self.get(key) is not None for key in required)

    def setup_paths(self, project_folder, project_name):
        """Fills the project_folder, project_name and its
        dependent keys after evaluation.

        The stored ``project_folder`` is the absolute, normalised join of
        `project_folder` and `project_name`; that directory is created
        when it does not exist yet.

        .. version changed :: 6.0.0
            Added string type checks and os based path normalising.

        .. version changed :: 6.1.0
            FIX: fixed path issue when using relative path for project_folder

        .. version changed :: 6.3.0
            FIX: Removed file based logging.
            FIX: Disabled dir change on setup

        :param project_name: new name of the project
        :param project_folder: folder where to store all the downloaded files
        :raises ConfigError: if either argument is not a string or if
            `project_folder` contains no path separator.
        """
        if not isinstance(project_name, string_types):
            raise ConfigError("project_name value must be a string")

        if not isinstance(project_folder, string_types):
            raise ConfigError("project_folder value must be a string!")

        # Use the native separator throughout; `os.altsep` is None when the
        # platform defines no alternative separator.
        if os.altsep:
            project_folder = project_folder.replace(os.altsep, os.sep)
        if os.sep not in project_folder:  # pragma: no cover
            raise ConfigError("Project_folder path doesn't seem to be a valid path.")

        project_folder = os.path.abspath(project_folder)

        norm_p = os.path.join(
            os.path.normpath(project_folder),
            os.path.normpath(project_name)
        )
        self.set_project_name(project_name)
        self.set_project_folder(norm_p)

        if not os.path.exists(norm_p):
            try:
                os.makedirs(norm_p)
            except OSError:
                # The directory may have been created by someone else
                # between the check and the call; only a real failure
                # is propagated.
                if not os.path.isdir(norm_p):
                    raise

    def setup_config(self,
                     project_url=None,
                     project_folder=None,
                     project_name=None,
                     overwrite=False,
                     bypass_robots=False,
                     debug=False,
                     delay=None,
                     threaded=None):
        """Sets up the complete config parts which requires a project_url to be present.

        Complete configuration is done here and subject to change according to application structure.
        You are advised to use only the .setup_paths() method if you get any unusual behaviour.

        :param project_url: value stored under the `project_url` key
        :param project_folder: forwarded to :meth:`setup_paths`
        :param project_name: forwarded to :meth:`setup_paths`
        :param overwrite: value stored under the `overwrite` key
        :param bypass_robots: value stored under the `bypass_robots` key
        :param debug: stored under `debug`; when true a stderr log handler
            is attached at DEBUG level
        :param delay: value stored under the `delay` key
        :param threaded: value stored under the `threaded` key
        :raises ConfigError: propagated from :meth:`setup_paths`
        """
        self.set_overwrite(overwrite)
        self.set_bypass_robots(bypass_robots)
        self.set_debug(debug)
        self.set_delay(delay)
        self.set_threaded(threaded)
        self.set_project_url(project_url)
        self.setup_paths(project_folder, project_name)

        #: Add a stderr logger to this library.
        if debug:
            add_stderr_logger(level=logging.DEBUG)

        #: Log this new configuration for debug purposes. The config is
        #: passed as an argument so it is only rendered when the record
        #: is actually emitted.
        logger.debug('%s', dict(self))

    def create_context(self):
        """Returns ``Context.from_config(self)``.

        :raises ConfigError: if :meth:`is_set` reports missing attributes.
        """
        if not self.is_set():
            raise ConfigError("Config is missing required attributes!")
        # Imported here rather than at module level; presumably to avoid
        # a circular import.
        from .urls import Context
        return Context.from_config(self)

    def create_session(self):
        """Returns ``Session.from_config(self)``.

        :raises ConfigError: if :meth:`is_set` reports missing attributes.
        """
        if not self.is_set():
            raise ConfigError("Config is missing required attributes!")
        from .session import Session
        return Session.from_config(self)

    def create_crawler(self):
        """Returns ``Crawler.from_config(self)``.

        :raises ConfigError: if :meth:`is_set` reports missing attributes.
        """
        if not self.is_set():
            raise ConfigError("Config is missing required attributes!")
        from .core import Crawler
        return Crawler.from_config(self)

    def create_page(self):
        """Returns ``WebPage.from_config(self)``.

        :raises ConfigError: if :meth:`is_set` reports missing attributes.
        """
        if not self.is_set():
            raise ConfigError("Config is missing required attributes!")
        from .core import WebPage
        return WebPage.from_config(self)


def get_config(project_url,
               project_folder=None,
               project_name=None,
               bypass_robots=False,
               debug=False,
               delay=None,
               threaded=None):
    """Create a ConfigHandler instance and return it.
    If the project_folder is not supplied it will use the user's temp directory.
    If the project_name is not supplied it is generated from the host
    parts of the project_url.

    :param project_url: project_url of the web page to work with
    :type project_url: str
    :param project_folder: folder in which the files will be downloaded
    :type project_folder: str
    :param project_name: name of the project to distinguish it
    :type project_name: str | None
    :param bypass_robots: whether to follow the robots.txt rules or not
    :param debug: whether to print deep logs or not.
    :param delay: amount of delay between two concurrent requests to a same server.
    :param threaded: whether to use threading or not (it can break some site).
    :return: a ConfigHandler built from `default_config` and the arguments.
    :raises ConfigError: if project_url, or a supplied project_folder,
        is not a string.
    """
    # The offending value is wrapped in a one-tuple so that a tuple argument
    # is reported as-is instead of being unpacked by the `%` operator.
    if not isinstance(project_url, string_types):
        raise ConfigError("Expected string type, got %r" % (project_url,))
    if project_folder and not isinstance(project_folder, string_types):
        raise ConfigError("Expected string type, got %r" % (project_folder,))

    if not project_folder:
        logger.debug('No project folder provided, %temp% dir will be used instead.')
        project_folder = tempfile.gettempdir()

    if not project_name:
        # Join the non-empty parts returned by `get_host`, each one passed
        # through `secure_filename`.
        host_parts = filter(None, get_host(project_url))
        project_name = '_'.join(
            secure_filename(text_type(part)) for part in host_parts)
        logger.debug('No project name provided, generated from url: %s', project_name)

    ans = ConfigHandler(default_config)
    ans.setup_config(
        project_url=project_url,
        project_folder=project_folder,
        project_name=project_name,
        bypass_robots=bypass_robots,
        debug=debug,
        delay=delay,
        threaded=threaded,
    )
    return ans

Global variables

var default_config

Base configuration with preconfigured values.

Functions

def get_config(project_url, project_folder=None, project_name=None, bypass_robots=False, debug=False, delay=None, threaded=None)

Create a ConfigHandler instance and return it. If the project_folder is not supplied, the user's temp directory is used.

:param project_url: project_url of the web page to work with :type project_url: str :param project_folder: folder in which the files will be downloaded :type project_folder: str :param project_name: name of the project to distinguish it :type project_name: str | None :param bypass_robots: whether to follow the robots.txt rules or not :param debug: whether to print deep logs or not. :param delay: amount of delay between two concurrent requests to a same server. :param threaded: whether to use threading or not (it can break some site).

Expand source code
def get_config(project_url,
               project_folder=None,
               project_name=None,
               bypass_robots=False,
               debug=False,
               delay=None,
               threaded=None):
    """Create a ConfigHandler instance and return it.
    If the project_folder is not supplied it will use the user's temp directory.
    If the project_name is not supplied it is generated from the host
    parts of the project_url.

    :param project_url: project_url of the web page to work with
    :type project_url: str
    :param project_folder: folder in which the files will be downloaded
    :type project_folder: str
    :param project_name: name of the project to distinguish it
    :type project_name: str | None
    :param bypass_robots: whether to follow the robots.txt rules or not
    :param debug: whether to print deep logs or not.
    :param delay: amount of delay between two concurrent requests to a same server.
    :param threaded: whether to use threading or not (it can break some site).
    :return: a ConfigHandler built from `default_config` and the arguments.
    :raises ConfigError: if project_url, or a supplied project_folder,
        is not a string.
    """
    # The offending value is wrapped in a one-tuple so that a tuple argument
    # is reported as-is instead of being unpacked by the `%` operator.
    if not isinstance(project_url, string_types):
        raise ConfigError("Expected string type, got %r" % (project_url,))
    if project_folder and not isinstance(project_folder, string_types):
        raise ConfigError("Expected string type, got %r" % (project_folder,))

    if not project_folder:
        logger.debug('No project folder provided, %temp% dir will be used instead.')
        project_folder = tempfile.gettempdir()

    if not project_name:
        # Join the non-empty parts returned by `get_host`, each one passed
        # through `secure_filename`.
        host_parts = filter(None, get_host(project_url))
        project_name = '_'.join(
            secure_filename(text_type(part)) for part in host_parts)
        logger.debug('No project name provided, generated from url: %s', project_name)

    ans = ConfigHandler(default_config)
    ans.setup_config(
        project_url=project_url,
        project_folder=project_folder,
        project_name=project_name,
        bypass_robots=bypass_robots,
        debug=debug,
        delay=delay,
        threaded=threaded,
    )
    return ans

Classes

class ConfigHandler (data=None, **kwargs)

Provides functionality to the config instance which stores and provides configuration values in every module.

Expand source code
class ConfigHandler(CaseInsensitiveDict):
    """Provides functionality to the config instance which
    stores and provides configuration values in every module.

    Besides the mapping interface, every key currently present is reachable
    through dynamically generated ``get_<key>()`` / ``set_<key>(value)``
    accessors (see :meth:`__getattribute__`).
    """
    def __repr__(self):  # pragma: no cover
        return '<ConfigHandler(%s)>' % self.get('project_name', 'Not Set')

    def __getattribute__(self, item):
        """Dynamic method of name `get_(key)` and `set_(key)` generation
        for all of the keys available.
        for example to change the `project_url` key
        instead of using dictionary like operation you would do
        `.get_project_url()` instead of `['project_url']`.
        `.set_project_url(new)` instead of `['project_url'] = new`.

        Accessors exist only for keys that are present at lookup time; any
        other name falls through to the regular attribute lookup.
        """
        if isinstance(item, string_types) and item.startswith('set_'):
            if item[4:] in self:
                return partial(self.__setitem__, item[4:])
        elif isinstance(item, string_types) and item.startswith('get_'):
            if item[4:] in self:
                return partial(self.__getitem__, item[4:])
        return super(ConfigHandler, self).__getattribute__(item)

    def resolve_url(self):
        """Resolves any redirects in the url and sets the final url as base url.

        Not implemented yet: always raises ``NotImplementedError``.
        """
        raise NotImplementedError()

    def reset_config(self):
        """Resets all configuration keys found in `default_config` to their default values."""
        self.update(default_config)

    def reset_key(self, key):
        """Resets a single key to its value in `default_config`.

        A key which has no default is set to None.
        """
        self.update({key: default_config.get(key)})

    def is_set(self):
        """Checks whether the configuration has required attributes or not.

        :return: True only when `project_folder`, `project_url` and
            `project_name` are all present and not None.
        """
        # Explicit comparison instead of `assert`: assertions are stripped
        # when Python runs with -O, which would make this check always pass.
        required = ('project_folder', 'project_url', 'project_name')
        return all(self.get(key) is not None for key in required)

    def setup_paths(self, project_folder, project_name):
        """Fills the project_folder, project_name and its
        dependent keys after evaluation.

        The stored ``project_folder`` is the absolute, normalised join of
        `project_folder` and `project_name`; that directory is created
        when it does not exist yet.

        .. version changed :: 6.0.0
            Added string type checks and os based path normalising.

        .. version changed :: 6.1.0
            FIX: fixed path issue when using relative path for project_folder

        .. version changed :: 6.3.0
            FIX: Removed file based logging.
            FIX: Disabled dir change on setup

        :param project_name: new name of the project
        :param project_folder: folder where to store all the downloaded files
        :raises ConfigError: if either argument is not a string or if
            `project_folder` contains no path separator.
        """
        if not isinstance(project_name, string_types):
            raise ConfigError("project_name value must be a string")

        if not isinstance(project_folder, string_types):
            raise ConfigError("project_folder value must be a string!")

        # Use the native separator throughout; `os.altsep` is None when the
        # platform defines no alternative separator.
        if os.altsep:
            project_folder = project_folder.replace(os.altsep, os.sep)
        if os.sep not in project_folder:  # pragma: no cover
            raise ConfigError("Project_folder path doesn't seem to be a valid path.")

        project_folder = os.path.abspath(project_folder)

        norm_p = os.path.join(
            os.path.normpath(project_folder),
            os.path.normpath(project_name)
        )
        self.set_project_name(project_name)
        self.set_project_folder(norm_p)

        if not os.path.exists(norm_p):
            try:
                os.makedirs(norm_p)
            except OSError:
                # The directory may have been created by someone else
                # between the check and the call; only a real failure
                # is propagated.
                if not os.path.isdir(norm_p):
                    raise

    def setup_config(self,
                     project_url=None,
                     project_folder=None,
                     project_name=None,
                     overwrite=False,
                     bypass_robots=False,
                     debug=False,
                     delay=None,
                     threaded=None):
        """Sets up the complete config parts which requires a project_url to be present.

        Complete configuration is done here and subject to change according to application structure.
        You are advised to use only the .setup_paths() method if you get any unusual behaviour.

        :param project_url: value stored under the `project_url` key
        :param project_folder: forwarded to :meth:`setup_paths`
        :param project_name: forwarded to :meth:`setup_paths`
        :param overwrite: value stored under the `overwrite` key
        :param bypass_robots: value stored under the `bypass_robots` key
        :param debug: stored under `debug`; when true a stderr log handler
            is attached at DEBUG level
        :param delay: value stored under the `delay` key
        :param threaded: value stored under the `threaded` key
        :raises ConfigError: propagated from :meth:`setup_paths`
        """
        self.set_overwrite(overwrite)
        self.set_bypass_robots(bypass_robots)
        self.set_debug(debug)
        self.set_delay(delay)
        self.set_threaded(threaded)
        self.set_project_url(project_url)
        self.setup_paths(project_folder, project_name)

        #: Add a stderr logger to this library.
        if debug:
            add_stderr_logger(level=logging.DEBUG)

        #: Log this new configuration for debug purposes. The config is
        #: passed as an argument so it is only rendered when the record
        #: is actually emitted.
        logger.debug('%s', dict(self))

    def create_context(self):
        """Returns ``Context.from_config(self)``.

        :raises ConfigError: if :meth:`is_set` reports missing attributes.
        """
        if not self.is_set():
            raise ConfigError("Config is missing required attributes!")
        # Imported here rather than at module level; presumably to avoid
        # a circular import.
        from .urls import Context
        return Context.from_config(self)

    def create_session(self):
        """Returns ``Session.from_config(self)``.

        :raises ConfigError: if :meth:`is_set` reports missing attributes.
        """
        if not self.is_set():
            raise ConfigError("Config is missing required attributes!")
        from .session import Session
        return Session.from_config(self)

    def create_crawler(self):
        """Returns ``Crawler.from_config(self)``.

        :raises ConfigError: if :meth:`is_set` reports missing attributes.
        """
        if not self.is_set():
            raise ConfigError("Config is missing required attributes!")
        from .core import Crawler
        return Crawler.from_config(self)

    def create_page(self):
        """Returns ``WebPage.from_config(self)``.

        :raises ConfigError: if :meth:`is_set` reports missing attributes.
        """
        if not self.is_set():
            raise ConfigError("Config is missing required attributes!")
        from .core import WebPage
        return WebPage.from_config(self)

Ancestors

  • requests.structures.CaseInsensitiveDict
  • collections.abc.MutableMapping
  • collections.abc.Mapping
  • collections.abc.Collection
  • collections.abc.Sized
  • collections.abc.Iterable
  • collections.abc.Container

Methods

def create_context(self)
Expand source code
def create_context(self):
    """Build a ``Context`` from this configuration.

    Delegates to ``Context.from_config(self)`` once `is_set()` confirms the
    required attributes are present; raises ``ConfigError`` otherwise.
    """
    if self.is_set():
        from .urls import Context
        return Context.from_config(self)
    raise ConfigError("Config is missing required attributes!")
def create_crawler(self)
Expand source code
def create_crawler(self):
    """Build a ``Crawler`` from this configuration.

    Delegates to ``Crawler.from_config(self)`` once `is_set()` confirms the
    required attributes are present; raises ``ConfigError`` otherwise.
    """
    if self.is_set():
        from .core import Crawler
        return Crawler.from_config(self)
    raise ConfigError("Config is missing required attributes!")
def create_page(self)
Expand source code
def create_page(self):
    """Build a ``WebPage`` from this configuration.

    Delegates to ``WebPage.from_config(self)`` once `is_set()` confirms the
    required attributes are present; raises ``ConfigError`` otherwise.
    """
    if self.is_set():
        from .core import WebPage
        return WebPage.from_config(self)
    raise ConfigError("Config is missing required attributes!")
def create_session(self)
Expand source code
def create_session(self):
    """Build a ``Session`` from this configuration.

    Delegates to ``Session.from_config(self)`` once `is_set()` confirms the
    required attributes are present; raises ``ConfigError`` otherwise.
    """
    if self.is_set():
        from .session import Session
        return Session.from_config(self)
    raise ConfigError("Config is missing required attributes!")
def is_set(self)

Checks whether the configuration has required attributes or not.

Expand source code
def is_set(self):
    """Checks whether the configuration has required attributes or not.

    :return: True only when `project_folder`, `project_url` and
        `project_name` are all present and not None.
    """
    # Explicit comparison instead of `assert`: assertions are stripped
    # when Python runs with -O, which would make this check always pass.
    required = ('project_folder', 'project_url', 'project_name')
    return all(self.get(key) is not None for key in required)
def reset_config(self)

Resets all configuration to the default state.

Expand source code
def reset_config(self):
    """Resets all configuration keys found in `default_config` to their default values."""
    self.update(default_config)
def reset_key(self, key)
Expand source code
def reset_key(self, key):
    """Resets a single key to its value in `default_config`.

    A key which has no default is set to None.
    """
    self.update({key: default_config.get(key)})
def resolve_url(self)

Resolves any redirects in the url and sets the final url as base url.

Expand source code
def resolve_url(self):
    """Resolves any redirects in the url and sets the final url as base url.

    Not implemented yet: always raises ``NotImplementedError``.
    """
    raise NotImplementedError()
def setup_config(self, project_url=None, project_folder=None, project_name=None, overwrite=False, bypass_robots=False, debug=False, delay=None, threaded=None)

Sets up the complete config parts which requires a project_url to be present.

Complete configuration is done here and is subject to change according to the application structure. You are advised to use only the .setup_paths() method if you get any unusual behaviour.

Expand source code
def setup_config(self,
                 project_url=None,
                 project_folder=None,
                 project_name=None,
                 overwrite=False,
                 bypass_robots=False,
                 debug=False,
                 delay=None,
                 threaded=None):
    """Sets up the complete config parts which requires a project_url to be present.

    Complete configuration is done here and subject to change according to application structure.
    You are advised to use only the .setup_paths() method if you get any unusual behaviour.

    :param project_url: value stored under the `project_url` key
    :param project_folder: forwarded to `setup_paths`
    :param project_name: forwarded to `setup_paths`
    :param overwrite: value stored under the `overwrite` key
    :param bypass_robots: value stored under the `bypass_robots` key
    :param debug: stored under `debug`; when true a stderr log handler
        is attached at DEBUG level
    :param delay: value stored under the `delay` key
    :param threaded: value stored under the `threaded` key
    """
    self.set_overwrite(overwrite)
    self.set_bypass_robots(bypass_robots)
    self.set_debug(debug)
    self.set_delay(delay)
    self.set_threaded(threaded)
    self.set_project_url(project_url)
    self.setup_paths(project_folder, project_name)

    #: Add a stderr logger to this library.
    if debug:
        add_stderr_logger(level=logging.DEBUG)

    #: Log this new configuration for debug purposes. The config is
    #: passed as an argument so it is only rendered when the record
    #: is actually emitted.
    logger.debug('%s', dict(self))
def setup_paths(self, project_folder, project_name)

Fills the project_folder, project_name and their dependent keys after evaluation.

.. version changed :: 6.0.0 Added string type checks and os based path normalising.

.. version changed :: 6.1.0 FIX: fixed path issue when using relative path for project_folder

.. version changed :: 6.3.0 FIX: Removed file based logging. FIX: Disabled dir change on setup

:param project_name: new name of the project :param project_folder: folder where to store all the downloaded files

Expand source code
def setup_paths(self, project_folder, project_name):
    """Fills the project_folder, project_name and its
    dependent keys after evaluation.

    The stored ``project_folder`` is the absolute, normalised join of
    `project_folder` and `project_name`; that directory is created
    when it does not exist yet.

    .. version changed :: 6.0.0
        Added string type checks and os based path normalising.

    .. version changed :: 6.1.0
        FIX: fixed path issue when using relative path for project_folder

    .. version changed :: 6.3.0
        FIX: Removed file based logging.
        FIX: Disabled dir change on setup

    :param project_name: new name of the project
    :param project_folder: folder where to store all the downloaded files
    :raises ConfigError: if either argument is not a string or if
        `project_folder` contains no path separator.
    """
    if not isinstance(project_name, string_types):
        raise ConfigError("project_name value must be a string")

    if not isinstance(project_folder, string_types):
        raise ConfigError("project_folder value must be a string!")

    # Use the native separator throughout; `os.altsep` is None when the
    # platform defines no alternative separator.
    if os.altsep:
        project_folder = project_folder.replace(os.altsep, os.sep)
    if os.sep not in project_folder:  # pragma: no cover
        raise ConfigError("Project_folder path doesn't seem to be a valid path.")

    project_folder = os.path.abspath(project_folder)

    norm_p = os.path.join(
        os.path.normpath(project_folder),
        os.path.normpath(project_name)
    )
    self.set_project_name(project_name)
    self.set_project_folder(norm_p)

    if not os.path.exists(norm_p):
        try:
            os.makedirs(norm_p)
        except OSError:
            # The directory may have been created by someone else
            # between the check and the call; only a real failure
            # is propagated.
            if not os.path.isdir(norm_p):
                raise