# Source code for scraper_toolkit.components.PageFetcher

from typing import Iterable, Optional

import bs4
import requests

from scraper_toolkit import exceptions

# Parser backend handed to BeautifulSoup in select_elements_from_html.
# 'lxml' is fast and lenient, but requires the third-party lxml package
# to be installed alongside bs4.
BS4_PARSER = 'lxml'


class PageFetcher:
    """Fetch URLs and return web pages' HTML as strings.

    :param domain: Prefix to be added to scraped URLs missing the domain.
    """

    def __init__(self, domain: str):
        self.domain = domain

    @staticmethod
    def select_elements_from_html(html: str, selector: str) -> list:
        """Return a list of HTML elements from the given html that match the
        provided CSS selector.

        :param html: HTML of the page to parse.
        :param selector: CSS selector for target elements.
        :return: List of HTML elements matching the CSS selector.
        :raises exceptions.SelectorSyntaxError: If the selector is rejected
            by the parser.
        """
        soup = bs4.BeautifulSoup(html, BS4_PARSER)
        try:
            elems = soup.select(selector)
        except SyntaxError as err:
            # Chain the original error so the details of the invalid
            # selector are preserved in the traceback.
            raise exceptions.SelectorSyntaxError from err
        return elems

    @staticmethod
    def get_full_url(domain: str, suffix: str) -> str:
        """Return a complete URL given a domain and suffix, even if the
        provided suffix is the complete URL.

        :param domain: The domain of the target page URL.
        :param suffix: The URL of the target page, with or without the
            domain prefix.
        :return: The complete URL.
        """
        # Suffix already carries the domain: it is the complete URL.
        if suffix.startswith(domain):
            return suffix
        # Otherwise join with exactly one '/' between domain and suffix.
        if domain.endswith('/') and suffix.startswith('/'):
            return domain[:-1] + suffix
        if not domain.endswith('/') and not suffix.startswith('/'):
            return domain + '/' + suffix
        return domain + suffix

    def get_html(self, url: Optional[str] = None, timeout: float = 30.0) -> str:
        """Fetch the page HTML from the given URL.

        :param url: URL of target page. Defaults to ``self.domain`` when
            omitted or empty.
        :param timeout: Seconds to wait for the server before aborting;
            prevents the request from hanging indefinitely.
        :return: HTML as a string.
        """
        if not url:
            url = self.domain
        # NOTE(review): HTTP error statuses (4xx/5xx) are not raised here,
        # so callers receive the error page's body — confirm intended.
        r = requests.get(url, timeout=timeout)
        return r.text