from typing import Iterable, Optional

import bs4
import requests

from scraper_toolkit import exceptions
# Parser backend handed to BeautifulSoup; 'lxml' requires the lxml package to be installed.
BS4_PARSER = 'lxml'
class PageFetcher:
    """Fetch URLs and return web pages' HTML as strings.

    :param domain: Prefix to be added to scraped URLs missing the domain.
    """

    def __init__(self, domain: str):
        self.domain = domain

    def select_links_from_page(self, selector: str,
                               target_url: Optional[str] = None) -> Iterable[str]:
        """Yield the HTML of all pages linked on target_url located by the given CSS selector.

        :param selector: CSS selector for elements containing an href attribute.
        :param target_url: URL to search for links. If none is provided, the
            domain URL will be used.
        :return: Generator for HTML as strings, fetched from the selected links.
        """
        if not target_url:
            target_url = self.domain  # str is immutable; no defensive copy needed
        target_page = self.get_html(target_url)
        elems = self.select_elements_from_html(target_page, selector)
        for elem in elems:
            page_url = self.get_full_url(self.domain, elem['href'])
            # Route all fetching through get_html() so there is one code path
            # for HTTP access (previously this called requests.get directly).
            yield self.get_html(page_url)

    def get_links_from_page(self, target_url: Optional[str] = None) -> Iterable[str]:
        """Return a list of every href URL found from target_url.

        Duplicate hrefs are collapsed via a set, so result order is unspecified.

        :param target_url: URL of page to search for href links.
        :return: List of every discovered href link on the page.
        """
        if not target_url:
            target_url = self.domain
        html = self.get_html(target_url)
        elems = self.select_elements_from_html(html, 'a[href]')
        links = {elem['href'] for elem in elems}
        return [self.get_full_url(self.domain, link) for link in links]

    @staticmethod
    def select_elements_from_html(html: str, selector: str):
        """Return a list of HTML elements from the given html that match the provided CSS selector.

        :param html: HTML of the page to parse.
        :param selector: CSS selector for target elements.
        :return: List of HTML elements matching the CSS selector.
        :raises exceptions.SelectorSyntaxError: if the selector cannot be parsed.
        """
        soup = bs4.BeautifulSoup(html, BS4_PARSER)
        try:
            return soup.select(selector)
        except SyntaxError as err:
            # NOTE(review): recent soupsieve raises its own SelectorSyntaxError,
            # which is NOT a SyntaxError subclass — confirm this clause still
            # fires with the pinned bs4/soupsieve versions.
            raise exceptions.SelectorSyntaxError from err

    @staticmethod
    def get_full_url(domain: str, suffix: str) -> str:
        """Return a complete URL given a domain and suffix, even if the provided suffix is the complete URL.

        :param domain: The domain of the target page URL.
        :param suffix: The URL of the target page, with or without the domain prefix.
        :return: The complete URL.
        """
        if suffix.startswith(domain):
            return suffix  # already a complete URL
        if domain.endswith('/') and suffix.startswith('/'):
            return domain[:-1] + suffix  # drop one slash to avoid doubling
        if not domain.endswith('/') and not suffix.startswith('/'):
            return domain + '/' + suffix  # insert the missing separator
        return domain + suffix  # exactly one side supplies the slash

    def get_html(self, url: Optional[str] = None) -> str:
        """Fetch the page HTML from the given URL.

        :param url: URL of target page; defaults to the instance's domain.
        :return: HTML as a string.
        """
        if not url:
            url = self.domain
        # NOTE(review): no timeout is set, so a stalled server blocks forever —
        # consider requests.get(url, timeout=...) in a follow-up.
        return requests.get(url).text