from typing import Iterable, Optional

import bs4
import requests

from scraper_toolkit import exceptions
# Parser backend handed to BeautifulSoup; 'lxml' requires the lxml package to be installed.
BS4_PARSER = 'lxml'
class PageFetcher:
    """Fetch URLs and return web pages' HTML as strings.

    :param domain: Prefix to be added to scraped URLs missing the domain.
    """

    def __init__(self, domain: str):
        self.domain = domain

    def select_links_from_page(self, selector: str,
                               target_url: Optional[str] = None) -> Iterable[str]:
        """Yield the HTML of all pages linked on target_url located by the given CSS selector.

        :param selector: CSS selector for elements containing an href attribute.
        :param target_url: URL to search for links. If none is provided, the
            domain URL will be used.
        :return: Generator for HTML as strings, fetched from the selected links.
        """
        if not target_url:
            target_url = self.domain  # str is immutable; no defensive copy needed
        target_page = self.get_html(target_url)
        elems = self.select_elements_from_html(target_page, selector)
        for elem in elems:
            page_url = self.get_full_url(self.domain, elem['href'])
            # Route all fetching through get_html() so there is one code path
            # for HTTP access (previously this called requests.get directly).
            yield self.get_html(page_url)

    def get_links_from_page(self, target_url: Optional[str] = None) -> Iterable[str]:
        """Return a list of every href URL found from target_url.

        Duplicate hrefs are collapsed via a set, so result order is unspecified.

        :param target_url: URL of page to search for href links.
        :return: List of every discovered href link on the page.
        """
        if not target_url:
            target_url = self.domain
        html = self.get_html(target_url)
        elems = self.select_elements_from_html(html, 'a[href]')
        links = {elem['href'] for elem in elems}
        return [self.get_full_url(self.domain, link) for link in links]

    @staticmethod
    def select_elements_from_html(html: str, selector: str):
        """Return a list of HTML elements from the given html that match the provided CSS selector.

        :param html: HTML of the page to parse.
        :param selector: CSS selector for target elements.
        :return: List of HTML elements matching the CSS selector.
        :raises exceptions.SelectorSyntaxError: if the selector cannot be parsed.
        """
        soup = bs4.BeautifulSoup(html, BS4_PARSER)
        try:
            return soup.select(selector)
        except SyntaxError as err:
            # NOTE(review): recent soupsieve raises its own SelectorSyntaxError,
            # which is NOT a SyntaxError subclass — confirm this clause still
            # fires with the pinned bs4/soupsieve versions.
            raise exceptions.SelectorSyntaxError from err

    @staticmethod
    def get_full_url(domain: str, suffix: str) -> str:
        """Return a complete URL given a domain and suffix, even if the provided suffix is the complete URL.

        :param domain: The domain of the target page URL.
        :param suffix: The URL of the target page, with or without the domain prefix.
        :return: The complete URL.
        """
        if suffix.startswith(domain):
            return suffix  # already a complete URL
        if domain.endswith('/') and suffix.startswith('/'):
            return domain[:-1] + suffix  # drop one slash to avoid doubling
        if not domain.endswith('/') and not suffix.startswith('/'):
            return domain + '/' + suffix  # insert the missing separator
        return domain + suffix  # exactly one side supplies the slash

    def get_html(self, url: Optional[str] = None) -> str:
        """Fetch the page HTML from the given URL.

        :param url: URL of target page; defaults to the instance's domain.
        :return: HTML as a string.
        """
        if not url:
            url = self.domain
        # NOTE(review): no timeout is set, so a stalled server blocks forever —
        # consider requests.get(url, timeout=...) in a follow-up.
        return requests.get(url).text