# moviewyrm/bookwyrm/sanitize_html.py

""" html parser to clean up incoming text from unknown sources """
from html import escape
from html.parser import HTMLParser


class InputHtmlParser(HTMLParser):  # pylint: disable=abstract-method
    """Removes any html that isn't explicitly allowed from a block of text

    Usage: create an instance, pass the untrusted markup to ``feed()``, then
    read the cleaned string from ``get_output()``.

    Only tags in ``allowed_tags`` and attributes in ``allowed_attrs`` are
    kept. If the allowed tags are not properly nested and closed, the markup
    is considered invalid and *all* tags are dropped, leaving only the text.
    Text and attribute values are always re-escaped on the way out, so the
    result never contains markup that was not rebuilt by this class.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        self.allowed_tags = [
            "p",
            "blockquote",
            "br",
            "b",
            "i",
            "strong",
            "em",
            "pre",
            "a",
            "span",
            "ul",
            "ol",
            "li",
        ]
        # NOTE(review): attribute *values* are escaped but not otherwise
        # validated, so url schemes such as "javascript:" in href/src are
        # passed through unchanged.
        self.allowed_attrs = ["href", "rel", "src", "alt"]
        # allowed tags that never take a closing tag, so they must not be
        # tracked on the stack of open tags
        self.void_tags = ["br"]
        # allowed tags that have been opened but not yet closed
        self.tag_stack = []
        # list of ("tag" | "data", text) tuples, in document order
        self.output = []
        # if the html appears invalid, we just won't allow any at all
        self.allow_html = True

    def handle_starttag(self, tag, attrs):
        """keep an opening tag if it is allowed, minus any disallowed attrs

        ``attrs`` is the list of ``(name, value)`` pairs supplied by
        ``HTMLParser``; values arrive with entities already decoded and are
        ``None`` for attributes written without a value.
        """
        if not self.allow_html or tag not in self.allowed_tags:
            self.output.append(("data", ""))
            return

        parts = [tag]
        for name, value in attrs:
            if name not in self.allowed_attrs:
                continue
            if value is None:
                # valueless attribute, e.g. <a href>
                parts.append(name)
            else:
                # the parser decoded entities in the value, so it has to be
                # re-escaped or a quote could break out of the attribute
                parts.append(f'{name}="{escape(value, quote=True)}"')
        self.output.append(("tag", f"<{' '.join(parts)}>"))

        if tag not in self.void_tags:
            self.tag_stack.append(tag)

    def handle_endtag(self, tag):
        """keep a closing tag if it matches the most recently opened tag"""
        if not self.allow_html or tag not in self.allowed_tags:
            self.output.append(("data", ""))
            return

        if tag in self.void_tags:
            # reached for "<br/>" (start tag already emitted) or a stray
            # "</br>"; void tags have no closing tag, so emit nothing
            self.output.append(("data", ""))
            return

        if not self.tag_stack or self.tag_stack[-1] != tag:
            # the end tag doesn't match the most recent start tag
            self.allow_html = False
            self.output.append(("data", ""))
            return

        self.tag_stack.pop()
        self.output.append(("tag", f"</{tag}>"))

    def handle_data(self, data):
        """keep text content, re-escaped so it can't be read as markup"""
        # the parser hands over text with character references already
        # decoded ("&lt;" arrives as "<"), so escape it again
        self.output.append(("data", escape(data, quote=False)))

    def get_output(self):
        """convert the output from a list of tuples to a string

        Returns the sanitized markup, or only the text content when the
        input html was invalid (mismatched or unclosed allowed tags).
        """
        # flush text the parser is still buffering; without this, trailing
        # text containing a bare "&" (e.g. "AT&T") would be silently lost
        self.close()
        if self.tag_stack:
            # something was opened and never closed
            self.allow_html = False
        if not self.allow_html:
            return "".join(v for (k, v) in self.output if k == "data")
        return "".join(v for (_, v) in self.output)