Adds an allowlist for HTML attributes

This commit is contained in:
Mouse Reeve 2022-02-03 13:15:06 -08:00
parent 3b48d986d5
commit 1f6ecc39ac
2 changed files with 23 additions and 2 deletions

View file

@ -22,6 +22,9 @@ class InputHtmlParser(HTMLParser): # pylint: disable=abstract-method
"ol",
"li",
]
self.allowed_attrs = [
"href", "rel", "src", "alt"
]
self.tag_stack = []
self.output = []
# if the html appears invalid, we just won't allow any at all
@ -30,7 +33,14 @@ class InputHtmlParser(HTMLParser): # pylint: disable=abstract-method
def handle_starttag(self, tag, attrs):
    """check if the tag is valid and rebuild it from allowlisted attrs

    When html is still allowed and ``tag`` is in ``self.allowed_tags``, a
    fresh start tag is reconstructed containing only the attributes named
    in ``self.allowed_attrs``; it is appended to ``self.output`` as a
    ``("tag", text)`` entry and ``tag`` is pushed onto ``self.tag_stack``.
    Otherwise an empty ``("data", "")`` entry is appended in its place.

    ``attrs`` is the list of ``(name, value)`` pairs supplied by
    ``HTMLParser``; ``value`` is ``None`` for an attribute written without
    a value.
    """
    # Function-scope import: the file's import block is not visible here.
    from html import escape

    if not (self.allow_html and tag in self.allowed_tags):
        self.output.append(("data", ""))
        return

    pieces = [f"<{tag}"]
    for name, value in attrs:
        if name not in self.allowed_attrs:
            continue
        if value is None:
            # Valueless attribute: keep it bare rather than emitting the
            # literal string "None".
            pieces.append(name)
        else:
            # HTMLParser hands over values with character references
            # already decoded, so a value may contain a raw double quote.
            # Re-escape it so it cannot close the attribute and smuggle
            # in attributes that are not on the allowlist.
            pieces.append(f'{name}="{escape(value, quote=True)}"')

    self.output.append(("tag", " ".join(pieces) + ">"))
    self.tag_stack.append(tag)

View file

@ -24,13 +24,24 @@ class Sanitizer(TestCase):
self.assertEqual(input_text, output)
def test_valid_html_attrs(self):
    """allowlisted attributes survive sanitizing unchanged"""
    markup = '<a href="fish.com">yes </a> <i>html</i>'
    sanitizer = InputHtmlParser()
    sanitizer.feed(markup)
    # href is an approved attribute, so the markup must round-trip as-is
    self.assertEqual(markup, sanitizer.get_output())
def test_valid_html_invalid_attrs(self):
    """attributes that are not approved get stripped from the tag"""
    sanitizer = InputHtmlParser()
    sanitizer.feed('<a href="fish.com" fish="hello">yes </a> <i>html</i>')
    # only href is approved; the made-up "fish" attribute must be dropped
    expected = '<a href="fish.com">yes </a> <i>html</i>'
    self.assertEqual(sanitizer.get_output(), expected)
def test_invalid_html(self):
"""remove all html when the html is malformed"""
input_text = "<b>yes <i>html</i>"