Merge pull request #1912 from bookwyrm-social/sanitize-tags

Adds allowlist for html attrs
This commit is contained in:
Mouse Reeve 2022-02-03 13:29:40 -08:00 committed by GitHub
commit 8518b4f877
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 18 additions and 2 deletions

View file

@ -22,6 +22,7 @@ class InputHtmlParser(HTMLParser): # pylint: disable=abstract-method
"ol",
"li",
]
self.allowed_attrs = ["href", "rel", "src", "alt"]
self.tag_stack = []
self.output = []
# if the html appears invalid, we just won't allow any at all
@ -30,7 +31,14 @@ class InputHtmlParser(HTMLParser): # pylint: disable=abstract-method
def handle_starttag(self, tag, attrs):
"""check if the tag is valid"""
if self.allow_html and tag in self.allowed_tags:
self.output.append(("tag", self.get_starttag_text()))
allowed_attrs = " ".join(
f'{a}="{v}"' for a, v in attrs if a in self.allowed_attrs
)
reconstructed = f"<{tag}"
if allowed_attrs:
reconstructed += " " + allowed_attrs
reconstructed += ">"
self.output.append(("tag", reconstructed))
self.tag_stack.append(tag)
else:
self.output.append(("data", ""))

View file

@ -24,13 +24,21 @@ class Sanitizer(TestCase):
self.assertEqual(input_text, output)
def test_valid_html_attrs(self):
"""and don't remove attributes"""
"""and don't remove useful attributes"""
input_text = '<a href="fish.com">yes </a> <i>html</i>'
parser = InputHtmlParser()
parser.feed(input_text)
output = parser.get_output()
self.assertEqual(input_text, output)
def test_valid_html_invalid_attrs(self):
"""do remove un-approved attributes"""
input_text = '<a href="fish.com" fish="hello">yes </a> <i>html</i>'
parser = InputHtmlParser()
parser.feed(input_text)
output = parser.get_output()
self.assertEqual(output, '<a href="fish.com">yes </a> <i>html</i>')
def test_invalid_html(self):
"""remove all html when the html is malformed"""
input_text = "<b>yes <i>html</i>"