from typing import Callable
from typing import List, Union
import bs4
from scraper_toolkit import exceptions
from scraper_toolkit.components.Selector import Selector
# Name of the parser backend handed to bs4.BeautifulSoup when a Parser is built.
BS4_PARSER = 'lxml'
class Parser:
    """Parse HTML for specific elements or attributes.

    Selectors are registered with :meth:`add_selector`; :meth:`parse` then
    evaluates each of them against the document and stores the results in
    ``self.parsed``, a list holding one dictionary per match position.

    :param html: HTML to parse, as a string.
    """

    def __init__(self, html: str):
        self.soup = bs4.BeautifulSoup(html, BS4_PARSER)
        # Selectors to evaluate, in the order they were added.
        self.selectors = []
        # parsed[i] maps a selector's name to the value taken from its i-th match.
        self.parsed = []

    def parse(self) -> None:
        """Parse HTML for elements using loaded CSS selectors and
        append matching elements to self.parsed as dictionary objects.

        For every loaded selector, each matching element contributes one
        value: the element's text when the selector's attribute is ``text``
        (compared case-insensitively), otherwise the value of that HTML
        attribute. If the selector has a truthy ``post_processing`` callable,
        its return value is stored instead of the raw value.

        The value from the i-th match of a selector is stored under the
        selector's name in ``self.parsed[i]``; a new dictionary is appended
        when no entry exists at that position yet.

        NOTE: the attribute is converted with ``str()`` before lookup, so a
        selector whose attribute is ``None`` is looked up as the literal
        attribute name ``none``.

        :raises exceptions.InvalidAttributeError: If a matched element does
            not carry the requested attribute.
        """
        for s in self.selectors:
            # The attribute name depends only on the selector, so normalise
            # it once here rather than once per matched element.
            attr = str(s.attribute).lower()
            for i, e in enumerate(self.soup.select(s.selector_str)):
                parsed_attribute = self.__parse_attribute(attr, e)
                if s.post_processing:
                    parsed_attribute = s.post_processing(parsed_attribute)
                self.__append_attribute_to_parsed(i, parsed_attribute, s)

    def add_selector(self, selector: Union[str, Selector] = None, attribute: str = None, name: str = None,
                     post_processing: Callable = None) -> None:
        """Add the given selector to loaded CSS selectors.

        A selector that is already loaded (as decided by the ``in`` test on
        ``self.selectors``) is not added a second time.

        :param selector: CSS selector as a string or a Selector type object.
            When a Selector object is given it is used as-is and the
            ``attribute``, ``name`` and ``post_processing`` arguments are
            ignored.
        :param attribute: HTML attribute of the element to store
        :param name: Optional name for the parsed attribute, useful for creating the header row
            when exporting as a CSV file.
        :param post_processing: Optional function called on the parsed attribute before it is stored.
            Useful for cleaning up and splitting data.
        :raises TypeError: If ``selector`` is neither a string nor a Selector
            (this includes the default ``None``).
        """
        if isinstance(selector, str):
            new_selector = Selector(selector_str=selector, name=name, attribute=attribute,
                                    post_processing=post_processing)
        elif isinstance(selector, Selector):
            new_selector = selector
        else:
            raise TypeError(
                'selector must be a str or a Selector, not {}'.format(type(selector).__name__)
            )
        if new_selector not in self.selectors:
            self.selectors.append(new_selector)

    def __append_attribute_to_parsed(self, i, parsed_attribute, s):
        """Store ``parsed_attribute`` under ``s.name`` in the i-th result row.

        Updates ``self.parsed[i]`` when that row already exists; otherwise
        appends a new single-entry dictionary to ``self.parsed``.
        """
        if len(self.parsed) > i:
            self.parsed[i][s.name] = parsed_attribute
        else:
            self.parsed.append({s.name: parsed_attribute})

    @staticmethod
    def __parse_attribute(attr, e):
        """Return the text of element ``e`` or the value of its ``attr`` attribute.

        :param attr: Attribute name; the value ``'text'`` selects ``e.text``.
        :param e: Element supporting ``.text`` and item access by attribute name.
        :raises exceptions.InvalidAttributeError: If ``e[attr]`` raises KeyError.
        """
        if attr == 'text':
            return e.text
        try:
            return e[attr]
        except KeyError as err:
            # Chain explicitly so the originating KeyError stays attached as the cause.
            raise exceptions.InvalidAttributeError(attr) from err