bookwyrm/bookwyrm/tests/validate_html.py

58 lines
1.6 KiB
Python
Raw Normal View History

2021-10-01 05:22:20 +00:00
""" html validation on rendered templates """
from html.parser import HTMLParser
2021-10-01 05:22:20 +00:00
from tidylib import tidy_document
2021-10-01 05:23:29 +00:00
2021-10-01 05:22:20 +00:00
def validate_html(html):
    """Run tidy on a rendered page, then apply the custom link checks.

    `html` must expose the rendered markup as `html.content` (bytes or str).
    Raises `Exception` with the remaining tidy messages if tidy reports
    anything that is not on the ignore list below, or with the message from
    `HtmlValidator` if a custom check fails.
    """
    content = html.content
    _, errors = tidy_document(
        content,
        options={
            "doctype": "html5",
            "drop-empty-elements": False,
            "warn-proprietary-attributes": False,
        },
    )

    # tidy messages containing any of these fragments are dropped: the first
    # three are unescaped-ampersand complaints about query-string parameters,
    # the rest are attribute/URI warnings that are deliberately ignored
    ignored_fragments = (
        "&book",
        "&type",
        "&resolved",
        "id and name attribute",
        "illegal characters found in URI",
        "escaping malformed URI reference",
    )
    errors = "\n".join(
        line
        for line in errors.split("\n")
        if not any(fragment in line for fragment in ignored_fragments)
    )
    if errors:
        raise Exception(errors)

    # HTMLParser needs text. Calling str() on bytes yields the "b'...'" repr,
    # where newlines and quotes are backslash-escaped, so tags spanning several
    # lines would be mis-tokenised and skip validation. Decode instead.
    if isinstance(content, (bytes, bytearray)):
        # presumably `html` is a Django response exposing `charset`;
        # fall back to utf-8 when it does not
        encoding = getattr(html, "charset", None) or "utf-8"
        markup = bytes(content).decode(encoding, errors="replace")
    else:
        markup = str(content)

    validator = HtmlValidator()
    # will raise exceptions
    validator.feed(markup)
class HtmlValidator(HTMLParser):  # pylint: disable=abstract-method
    """Checks for custom html validation requirements"""

    # every one of these must appear in the `rel` value of a new-tab link
    _required_rel_values = ("nofollow", "noopener", "noreferrer")

    def handle_starttag(self, tag, attrs):
        """Check that links opening in a new tab carry the required `rel`.

        `attrs` is the list of `(name, value)` pairs supplied by HTMLParser;
        `value` is None for an attribute written without a value.
        Raises `Exception` when an `<a target="_blank">` has no `rel`
        attribute containing all of the required values.
        """
        # filter out everything besides links that open in new tabs
        if tag != "a" or ("target", "_blank") not in attrs:
            return

        for attr, value in attrs:
            # `value` is None for a bare `rel` attribute; treat it as missing
            # rather than crashing with a TypeError on the `in` test
            if (
                attr == "rel"
                and value
                and all(required in value for required in self._required_rel_values)
            ):
                return

        raise Exception(
            'Links to a new tab must have rel="nofollow noopener noreferrer"'
        )