Source code for scraper_toolkit.components.Parser

from typing import Callable
from typing import List, Union

import bs4

from scraper_toolkit import exceptions
from scraper_toolkit.components.Selector import Selector

# Name of the parser backend handed to ``bs4.BeautifulSoup`` when a Parser
# builds its soup.
BS4_PARSER = 'lxml'


class Parser:
    """Parse HTML for specific elements or attributes.

    Selectors are registered with :meth:`add_selector`. :meth:`parse` then
    runs every registered selector against the document and collects the
    results in ``self.parsed``, a list of dictionaries. The dictionary at
    index ``i`` holds, for each selector, the value taken from that
    selector's ``i``-th matching element, keyed by the selector's name.

    :param html: HTML to parse, as a string.
    """

    def __init__(self, html: str):
        self.soup = bs4.BeautifulSoup(html, BS4_PARSER)
        # Selectors to run, in the order they were added.
        self.selectors = []
        # One dictionary per match index, filled in by parse().
        self.parsed = []

    def parse(self):
        """Parse HTML for elements using loaded CSS selectors and store the
        matching values in ``self.parsed`` as dictionary objects.

        :raises exceptions.InvalidAttributeError: if a matched element does
            not have the attribute requested by its selector.
        """
        for s in self.selectors:
            elems = self.soup.select(s.selector_str)
            # The attribute name depends only on the selector, so normalise
            # it once instead of once per matched element.
            attr = str(s.attribute).lower()
            for i, e in enumerate(elems):
                parsed_attribute = self.__parse_attribute(attr, e)
                if s.post_processing:
                    parsed_attribute = s.post_processing(parsed_attribute)
                self.__append_attribute_to_parsed(i, parsed_attribute, s)

    def add_selector(self, selector: Union[str, 'Selector'] = None,
                     attribute: str = None, name: str = None,
                     post_processing: Callable = None):
        """Add the given selector to loaded CSS selectors.

        A selector that compares equal to one already loaded is not added
        again.

        :param selector: CSS selector as a string or a Selector type object.
            When a Selector object is given it is used as-is and the
            remaining arguments are ignored.
        :param attribute: HTML attribute of the element to store.
        :param name: Optional name for the parsed attribute, useful for
            creating the header row when exporting as a CSV file.
        :param post_processing: Optional function called on the parsed
            attribute before it is stored. Useful for cleaning up and
            splitting data.
        :raises TypeError: if ``selector`` is neither a string nor a
            Selector object (including when it is omitted).
        """
        if isinstance(selector, str):
            new_selector = Selector(selector_str=selector, name=name,
                                    attribute=attribute,
                                    post_processing=post_processing)
        elif isinstance(selector, Selector):
            new_selector = selector
        else:
            raise TypeError(
                'selector must be a str or a Selector, not {}'.format(
                    type(selector).__name__))

        if new_selector not in self.selectors:
            self.selectors.append(new_selector)

    def __append_attribute_to_parsed(self, i, parsed_attribute, s):
        """Store ``parsed_attribute`` under ``s.name`` in the ``i``-th row.

        An existing row is updated in place; otherwise a new row is appended
        to ``self.parsed``.
        """
        if len(self.parsed) > i:
            self.parsed[i][s.name] = parsed_attribute
        else:
            self.parsed.append({s.name: parsed_attribute})

    @staticmethod
    def __parse_attribute(attr, e):
        """Return the value of ``attr`` for element ``e``.

        The special name ``'text'`` returns ``e.text``; any other name is
        looked up as ``e[attr]``.

        :raises exceptions.InvalidAttributeError: if the lookup raises
            ``KeyError``.
        """
        if attr == 'text':
            return e.text
        try:
            return e[attr]
        except KeyError as err:
            # Keep the original KeyError attached as the explicit cause.
            raise exceptions.InvalidAttributeError(attr) from err