bookwyrm/bookwyrm/tests/validate_html.py

""" html validation on rendered templates """
from html.parser import HTMLParser
from tidylib import tidy_document


def validate_html(html):
    """run tidy on html, then apply the custom HtmlValidator checks

    Args:
        html: an object exposing the rendered page as ``html.content``
            (presumably a Django test response -- confirm against callers).

    Raises:
        Exception: with the remaining tidy messages when tidy reports
            anything that is not on the ignore list below, or with the
            HtmlValidator message when a custom check fails.
    """
    _, errors = tidy_document(
        html.content,
        options={
            "doctype": "html5",
            "drop-empty-elements": False,
            "warn-proprietary-attributes": False,
        },
    )
    # idk how else to filter out these unescape amp errs
    # (any tidy message containing one of these fragments is dropped)
    ignored_fragments = (
        "&book",
        "&type",
        "&resolved",
        "id and name attribute",
        "illegal characters found in URI",
        "escaping malformed URI reference",
    )
    errors = "\n".join(
        e
        for e in errors.split("\n")
        if not any(fragment in e for fragment in ignored_fragments)
    )
    if errors:
        raise Exception(errors)

    # str() on bytes yields the "b'...'" repr, which escapes quotes, newlines
    # and non-ascii bytes and can hide attributes from the parser, so bytes
    # are decoded to real text instead
    content = html.content
    if isinstance(content, (bytes, bytearray)):
        charset = getattr(html, "charset", None) or "utf-8"
        content = content.decode(charset, errors="replace")
    else:
        content = str(content)

    validator = HtmlValidator()
    # will raise exceptions
    validator.feed(content)


class HtmlValidator(HTMLParser):  # pylint: disable=abstract-method
    """Checks for custom html validation requirements

    The only rule enforced here: every ``<a>`` tag carrying
    ``target="_blank"`` must also carry a ``rel`` attribute that lists
    ``nofollow``, ``noopener`` and ``noreferrer``.
    """

    # link types that must all be present on links opening in a new tab
    required_rel_tokens = frozenset(("nofollow", "noopener", "noreferrer"))

    def handle_starttag(self, tag, attrs):
        """check if the tag is valid

        Args:
            tag: lower-cased tag name supplied by HTMLParser.
            attrs: list of ``(name, value)`` pairs; ``value`` is ``None``
                for an attribute written without a value.

        Raises:
            Exception: when a link opening in a new tab lacks the required
                rel tokens.
        """
        # filter out everything besides links that open in new tabs
        if tag != "a" or ("target", "_blank") not in attrs:
            return

        for attr, value in attrs:
            if attr != "rel":
                continue
            # rel is a whitespace separated, case-insensitive token list;
            # a bare `rel` attribute arrives as None and has no tokens
            tokens = set((value or "").lower().split())
            if self.required_rel_tokens <= tokens:
                return
        raise Exception(
            'Links to a new tab must have rel="nofollow noopener noreferrer"'
        )
Helper function for html validation 2021-10-01 05:22:20 +00:00			`""" html validation on rendered templates """`
Adds unit tests that checks if rel field is set on links 2022-08-05 18:00:43 +00:00			`from html.parser import HTMLParser`
Helper function for html validation 2021-10-01 05:22:20 +00:00			`from tidylib import tidy_document`

Python formatting 2021-10-01 05:23:29 +00:00
Helper function for html validation 2021-10-01 05:22:20 +00:00			`def validate_html(html):`
Python formatting 2021-10-01 05:23:29 +00:00			`"""run tidy on html"""`
Helper function for html validation 2021-10-01 05:22:20 +00:00			`_, errors = tidy_document(`
			`html.content,`
			`options={`
Explicitly set doctype to html5 when invoking tidy_document() Many tests break without this on newer versions of html-tidy. 2023-12-11 22:40:48 +00:00			`"doctype": "html5",`
Helper function for html validation 2021-10-01 05:22:20 +00:00			`"drop-empty-elements": False,`
			`"warn-proprietary-attributes": False,`
			`},`
			`)`
			`# idk how else to filter out these unescape amp errs`
			`errors = "\n".join(`
			`e`
			`for e in errors.split("\n")`
Fixes search html And suppresses more warnings 2021-12-29 20:26:40 +00:00			`if "&book" not in e`
			`and "&type" not in e`
Updates ignore html error list 2023-05-15 12:31:07 +00:00			`and "&resolved" not in e`
Fixes search html And suppresses more warnings 2021-12-29 20:26:40 +00:00			`and "id and name attribute" not in e`
			`and "illegal characters found in URI" not in e`
			`and "escaping malformed URI reference" not in e`
Helper function for html validation 2021-10-01 05:22:20 +00:00			`)`
			`if errors:`
			`raise Exception(errors)`
Adds unit tests that checks if rel field is set on links 2022-08-05 18:00:43 +00:00
			`validator = HtmlValidator()`
			`# will raise exceptions`
			`validator.feed(str(html.content))`


			`class HtmlValidator(HTMLParser): # pylint: disable=abstract-method`
			`"""Checks for custom html validation requirements"""`

			`def __init__(self):`
			`HTMLParser.__init__(self)`

			`def handle_starttag(self, tag, attrs):`
			`"""check if the tag is valid"""`
			`# filter out everything besides links that open in new tabs`
			`if tag != "a" or ("target", "_blank") not in attrs:`
			`return`

			`for attr, value in attrs:`
			`if (`
			`attr == "rel"`
			`and "nofollow" in value`
			`and "noopener" in value`
			`and "noreferrer" in value`
			`):`
			`return`
			`raise Exception(`
			`'Links to a new tab must have rel="nofollow noopener noreferrer"'`
			`)`