Source code for scraper_toolkit.components.Parser

from typing import Callable
from typing import List, Union

import bs4

from scraper_toolkit import exceptions
from scraper_toolkit.components.Selector import Selector

BS4_PARSER = 'lxml'


class Parser:
    """Parse HTML for specific elements or attributes.

    CSS selectors are registered with :meth:`add_selector`. Calling
    :meth:`parse` runs every registered selector against the document and
    stores the results in ``self.parsed``, a list of dictionaries. The n-th
    dictionary holds, for each selector, the value taken from that selector's
    n-th matching element, keyed by the selector's name.

    :param html: HTML to parse, as a string.
    """

    def __init__(self, html: str):
        self.soup = bs4.BeautifulSoup(html, BS4_PARSER)
        # Selector objects to apply, in the order they were added.
        self.selectors = []
        # One dictionary per match index, filled in by parse().
        self.parsed = []

    def parse(self):
        """Parse HTML for elements using loaded CSS selectors and
        append matching elements to self.parsed as dictionary objects.

        For every loaded selector, each matching element contributes one value
        (its text or one of its HTML attributes). The selector's
        ``post_processing`` callable, when set, is applied to the value before
        it is stored.

        :raises exceptions.InvalidAttributeError: if a matching element does not
            have the attribute requested by its selector.
        """
        for s in self.selectors:
            elems = self.soup.select(s.selector_str)
            # The attribute name depends only on the selector, so it is
            # normalised once here rather than once per matched element.
            attr = str(s.attribute).lower()

            for i, e in enumerate(elems):
                parsed_attribute = self.__parse_attribute(attr, e)

                if s.post_processing:
                    parsed_attribute = s.post_processing(parsed_attribute)

                self.__append_attribute_to_parsed(i, parsed_attribute, s)

    def add_selector(self, selector: Union[str, 'Selector'] = None, attribute: str = None, name: str = None,
                     post_processing: Callable = None):
        """Add the given selector to loaded CSS selectors.

        A selector that compares equal to one that is already loaded is not
        added a second time. When ``selector`` is already a Selector object it
        is used as is and the remaining arguments are ignored.

        :param selector: CSS selector as a string or a Selector type object.
        :param attribute: HTML attribute of the element to store
        :param name: Optional name for the parsed attribute, useful for creating the header row
                      when exporting as a CSV file.
        :param post_processing: Optional function called on the parsed attribute before it is stored.
                                Useful for cleaning up and splitting data.
        :raises TypeError: if ``selector`` is neither a string nor a Selector object.
        """
        if isinstance(selector, str):
            new_selector = Selector(selector_str=selector, name=name, attribute=attribute,
                                    post_processing=post_processing)
        elif isinstance(selector, Selector):
            new_selector = selector
        else:
            raise TypeError(
                'selector must be a str or a Selector, not {}'.format(type(selector).__name__)
            )

        if new_selector not in self.selectors:
            self.selectors.append(new_selector)

    def __append_attribute_to_parsed(self, i, parsed_attribute, s):
        """Store ``parsed_attribute`` under ``s.name`` in the i-th result dictionary.

        The i-th dictionary is updated when it already exists; otherwise a new
        dictionary is appended to ``self.parsed``.
        """
        if len(self.parsed) > i:
            self.parsed[i][s.name] = parsed_attribute
        else:
            self.parsed.append({s.name: parsed_attribute})

    @staticmethod
    def __parse_attribute(attr, e):
        """Return the value of ``attr`` for element ``e``.

        ``'text'`` is treated specially and returns ``e.text``; any other name
        is looked up on the element with ``e[attr]``.

        :raises exceptions.InvalidAttributeError: if the element has no such attribute.
        """
        if attr == 'text':
            return e.text

        try:
            return e[attr]
        except KeyError as err:
            # Keep the original KeyError attached as the explicit cause.
            raise exceptions.InvalidAttributeError(attr) from err