from typing import Union, List, Callable
from scraper_toolkit.components.PageFetcher import PageFetcher
from scraper_toolkit.components.Parser import Parser
from scraper_toolkit.components.Selector import Selector
from scraper_toolkit.components.Exporter import Exporter
from pathlib import Path
from scraper_toolkit import exceptions
class ScraperProject:
    """Handle the page fetching, HTML parsing, and exporting of a web scraping project.

    Constructing a project immediately calls :meth:`fetch` with no URL, so
    ``parser``, ``selectors`` and ``parsed`` are populated as soon as
    ``__init__`` returns.

    :param domain: Prefix to be added to scraped URLs missing the domain.
    """

    def __init__(self, domain: str):
        self.fetcher = PageFetcher(domain)
        self.parser = None
        self.selectors = None
        self.parsed = None
        # The initial fetch builds the first parser and binds the
        # ``selectors`` / ``parsed`` aliases to it.
        self.fetch()

    def _attach_parser(self, parser) -> None:
        """Make ``parser`` the active parser and re-point the aliases at it.

        ``selectors`` and ``parsed`` are plain references to the parser's own
        attributes; they must be refreshed whenever the parser is replaced,
        otherwise :meth:`export_to_csv` would keep reading the previous
        page's results.

        :param parser: Parser object exposing ``selectors`` and ``parsed``.
        """
        self.parser = parser
        self.selectors = parser.selectors
        self.parsed = parser.parsed

    def fetch(self, url: Union[str, None] = None) -> str:
        """Fetch HTML from the page at the given URL.

        A new parser is created for the fetched HTML and becomes the active
        one; ``selectors`` and ``parsed`` are re-bound to it.
        NOTE(review): the new parser presumably starts without the selectors
        added to the previous one -- confirm against ``Parser``.

        :param url: URL of the target page. Passed through unchanged to the
            fetcher's ``get_html``.
        :return: HTML page as a string.
        """
        html = self.fetcher.get_html(url)
        self._attach_parser(Parser(html))
        return html

    def add_selector(self, selector: Union[str, 'Selector'], attribute: str = None, name: str = None,
                     post_processing: Callable = None) -> None:
        """Add the given selector to loaded CSS selectors.

        :param selector: CSS selector as a string or a Selector type object.
        :param attribute: HTML attribute of the element to store
        :param name: Optional name for the parsed attribute, useful for creating the header row
            when exporting as a CSV file.
        :param post_processing: Optional function called on the parsed attribute before it is stored.
            Useful for cleaning up and splitting data.
        """
        self.parser.add_selector(
            selector=selector,
            attribute=attribute,
            name=name,
            post_processing=post_processing,
        )

    def add_selectors(self, selectors: List['Selector']) -> None:
        """Add multiple CSS selectors to loaded selectors.

        Each item is forwarded to :meth:`add_selector` with the default
        ``attribute``, ``name`` and ``post_processing``.

        :param selectors: List of Selector objects.
        """
        for selector in selectors:
            self.add_selector(selector)

    def parse(self) -> None:
        """Parse HTML for elements using loaded CSS selectors and
        append matching elements to self.parsed as dictionary objects.

        The work is delegated to the active parser's ``parse``.
        """
        self.parser.parse()

    def export_to_csv(self, csv_path: Path, encoding: str = 'UTF-8', write_header: bool = True) -> None:
        """Export parsed data to a CSV file.

        :param csv_path: Path of the location to save the CSV file.
        :param encoding: CSV file encoding. Default is UTF-8.
        :param write_header: If true, write a header row to the CSV file using the "name" keys in the provided data.
        :raises exceptions.ExportBeforeParsingError: If ``self.parsed`` is empty or ``None``.
        """
        # Guard: nothing to export until parsing has produced results.
        if not self.parsed:
            raise exceptions.ExportBeforeParsingError
        Exporter(self.parsed).export_to_csv(csv_path=csv_path, encoding=encoding, write_header=write_header)