bookwyrm/bookwyrm/tests/validate_html.py
Ross Chapman dd1999eb8e
Adds view tests for shelf filters (#3162)
* Adds test file

* Adds success assertion

* Updates tests

* Updates shelf books creation

* Updates assertion to use isbn for Edition model

* Updates query

* trigger workflow test

* Updates validate_html

* Updates comment and test

* Fixes none test

* Adds management command to clear all deleted user data

* Adds success message

---------

Co-authored-by: Mouse Reeve <mousereeve@riseup.net>
Co-authored-by: Mouse Reeve <mouse.reeve@gmail.com>
2024-02-20 16:25:01 -08:00

68 lines
2.2 KiB
Python

""" html validation on rendered templates """
from html.parser import HTMLParser
from tidylib import tidy_document
def validate_html(html):
    """Run tidy and the custom link checks on rendered markup.

    Args:
        html: object whose ``content`` attribute holds the rendered markup
            (bytes or str) -- presumably a test-client response; confirm
            against callers.

    Raises:
        Exception: carrying tidy's remaining error lines when any are left
            after the known false positives are filtered out. Anything
            raised by ``HtmlValidator`` while parsing also propagates.
    """
    content = html.content
    _, errors = tidy_document(
        content,
        options={
            "doctype": "html5",
            "drop-empty-elements": False,
            "warn-proprietary-attributes": False,
        },
    )
    # Tidy's parser is strict when validating unescaped/encoded ampersands found
    # within the html document that are not part of a character or entity
    # reference (eg: `&amp;` or `&#38`). Despite the fact the HTML5 spec no longer
    # recommends escaping ampersands in URLs, Tidy will still complain if they are
    # used as query param keys. Unfortunately, there is no way currently to
    # configure tidy to ignore this so we must explicitly exclude related strings
    # that will appear in Tidy's errors output.
    #
    # See further discussion: https://github.com/htacg/tidy-html5/issues/1017
    excluded = (
        "&book",
        "&type",
        "&resolved",
        "id and name attribute",
        "illegal characters found in URI",
        "escaping malformed URI reference",
        "&filter",
    )
    errors = "\n".join(
        line
        for line in errors.split("\n")
        if not any(marker in line for marker in excluded)
    )
    if errors:
        raise Exception(errors)

    # str() on bytes produces the "b'...'" repr, which escapes newlines,
    # non-ASCII bytes and single quotes; single-quoted attributes would then
    # be misread by the parser and slip past the link check. Decode instead.
    if isinstance(content, (bytes, bytearray)):
        markup = content.decode("utf-8", errors="replace")
    else:
        markup = str(content)

    validator = HtmlValidator()
    # will raise exceptions
    validator.feed(markup)
class HtmlValidator(HTMLParser):  # pylint: disable=abstract-method
    """Checks for custom html validation requirements

    Currently enforces a single rule: every ``<a>`` that opens in a new tab
    (``target="_blank"``) must carry ``nofollow``, ``noopener`` and
    ``noreferrer`` in a ``rel`` attribute.
    """

    # link types that must all be present on links opening in a new tab
    _required_rel = frozenset(("nofollow", "noopener", "noreferrer"))

    def __init__(self):
        super().__init__()

    def handle_starttag(self, tag, attrs):
        """check if the tag is valid

        Args:
            tag: lower-cased tag name supplied by ``HTMLParser``.
            attrs: list of ``(name, value)`` pairs; ``value`` is ``None`` for
                attributes written without a value.

        Raises:
            Exception: when a link targeting a new tab has no ``rel``
                attribute containing all three required link types.
        """
        # filter out everything besides links that open in new tabs
        if tag != "a" or ("target", "_blank") not in attrs:
            return

        for attr, value in attrs:
            # a bare `rel` has value None; treat it like an empty list of
            # link types instead of failing with a TypeError
            if attr != "rel" or not value:
                continue
            # rel is a whitespace-separated token list, so compare whole
            # tokens rather than substrings ("nofollowed" must not count)
            if self._required_rel <= set(value.lower().split()):
                return

        raise Exception(
            'Links to a new tab must have rel="nofollow noopener noreferrer"'
        )