Source code for scraper_toolkit.ScraperProject

from typing import Union, List, Callable

from scraper_toolkit.components.PageFetcher import PageFetcher
from scraper_toolkit.components.Parser import Parser
from scraper_toolkit.components.Selector import Selector
from scraper_toolkit.components.Exporter import Exporter
from pathlib import Path

from scraper_toolkit import exceptions


class ScraperProject:
    """Handle the page fetching, HTML parsing, and exporting of a web scraping project.

    The project owns one ``PageFetcher`` for its whole lifetime and one
    ``Parser`` per fetched page.  ``self.selectors`` and ``self.parsed`` are
    convenience aliases for the *current* parser's ``selectors`` and
    ``parsed`` attributes.

    :param domain: Prefix to be added to scraped URLs missing the domain.
    """

    def __init__(self, domain: str) -> None:
        self.fetcher = PageFetcher(domain)
        self.parser = None
        self.selectors = None
        self.parsed = None
        # Initial fetch with no explicit URL; fetch() builds the first parser
        # and binds self.selectors / self.parsed to it.
        # NOTE(review): presumably PageFetcher.get_html(None) falls back to
        # the domain itself -- confirm against PageFetcher.
        self.fetch()

    def fetch(self, url: Union[str, None] = None) -> str:
        """Fetch HTML from the page at the given URL.

        Every call builds a new ``Parser`` around the fetched HTML and
        re-points ``self.selectors`` and ``self.parsed`` at that parser, so
        they never refer to a parser that has been replaced.  Selectors added
        to the previous parser are not copied to the new one.

        :param url: URL of the target page.
        :return: HTML page as a string.
        """
        html = self.fetcher.get_html(url)
        self.parser = Parser(html)
        # Re-bind the aliases: without this they would keep pointing at the
        # previous parser's collections after a second fetch.
        self.selectors = self.parser.selectors
        self.parsed = self.parser.parsed
        return html

    def add_selector(self, selector: Union[str, 'Selector'], attribute: str = None, name: str = None,
                     post_processing: Callable = None) -> None:
        """Add the given selector to loaded CSS selectors.

        The call is forwarded unchanged to the current parser.

        :param selector: CSS selector as a string or a Selector type object.
        :param attribute: HTML attribute of the element to store.
        :param name: Optional name for the parsed attribute, useful for
            creating the header row when exporting as a CSV file.
        :param post_processing: Optional function called on the parsed
            attribute before it is stored. Useful for cleaning up and
            splitting data.
        """
        self.parser.add_selector(selector=selector, attribute=attribute, name=name,
                                 post_processing=post_processing)

    def add_selectors(self, selectors: List['Selector']) -> None:
        """Add multiple CSS selectors to loaded selectors.

        Each item is passed to :meth:`add_selector` on its own, with no
        ``attribute``, ``name`` or ``post_processing`` override.

        :param selectors: List of Selector objects.
        """
        for selector in selectors:
            self.add_selector(selector)

    def parse(self) -> None:
        """Parse HTML for elements using loaded CSS selectors.

        Delegates to the current parser's ``parse()``; per the parser's
        contract, matching elements are appended to ``self.parsed`` as
        dictionary objects.
        """
        self.parser.parse()

    def export_to_csv(self, csv_path: Path, encoding: str = 'UTF-8', write_header: bool = True) -> None:
        """Export parsed data to a CSV file.

        :param csv_path: Path of the location to save the CSV file.
        :param encoding: CSV file encoding. Default is UTF-8.
        :param write_header: If true, write a header row to the CSV file
            using the "name" keys in the provided data.
        :raises exceptions.ExportBeforeParsingError: If ``self.parsed`` is
            empty, i.e. nothing has been parsed for the current page.
        """
        if not self.parsed:
            raise exceptions.ExportBeforeParsingError
        Exporter(self.parsed).export_to_csv(csv_path=csv_path, encoding=encoding, write_header=write_header)