forked from mirrors/bookwyrm
Adds allowlist for html attrs
This commit is contained in:
parent
3b48d986d5
commit
1f6ecc39ac
2 changed files with 23 additions and 2 deletions
|
@ -22,6 +22,9 @@ class InputHtmlParser(HTMLParser): # pylint: disable=abstract-method
|
|||
"ol",
|
||||
"li",
|
||||
]
|
||||
self.allowed_attrs = [
|
||||
"href", "rel", "src", "alt"
|
||||
]
|
||||
self.tag_stack = []
|
||||
self.output = []
|
||||
# if the html appears invalid, we just won't allow any at all
|
||||
|
@ -30,7 +33,14 @@ class InputHtmlParser(HTMLParser): # pylint: disable=abstract-method
|
|||
def handle_starttag(self, tag, attrs):
    """Record an allowed start tag, rebuilt from allowlisted attributes only.

    Args:
        tag: the tag name reported by the parser.
        attrs: list of ``(name, value)`` pairs for the tag; ``value`` is
            ``None`` for an attribute written without a value.

    When HTML is allowed and ``tag`` is in ``self.allowed_tags``, the tag is
    re-serialized with only the attributes named in ``self.allowed_attrs``,
    appended to ``self.output`` as a ``("tag", ...)`` entry and pushed onto
    ``self.tag_stack``. Otherwise an empty ``("data", "")`` entry is
    appended and the tag is dropped.
    """
    # Local import so this method does not depend on the module's import block.
    from html import escape

    if not (self.allow_html and tag in self.allowed_tags):
        self.output.append(("data", ""))
        return

    rendered = []
    for name, value in attrs:
        if name not in self.allowed_attrs:
            continue
        if value is None:
            # Valueless attribute: emit the bare name, not name="None".
            rendered.append(name)
        else:
            # The parser hands over values with character references already
            # decoded, so a literal '"' could close the attribute early and
            # smuggle in extra attributes. Re-escape before quoting.
            rendered.append(f'{name}="{escape(value, quote=True)}"')

    # NOTE(review): URL schemes in href/src (e.g. "javascript:") are not
    # checked here — confirm whether that is handled elsewhere.
    reconstructed = "<" + " ".join([tag, *rendered]) + ">"
    self.output.append(("tag", reconstructed))
    self.tag_stack.append(tag)
|
||||
|
|
|
@ -24,13 +24,24 @@ class Sanitizer(TestCase):
|
|||
self.assertEqual(input_text, output)
|
||||
|
||||
def test_valid_html_attrs(self):
    """allowlisted attributes pass through the parser untouched"""
    markup = '<a href="fish.com">yes </a> <i>html</i>'
    sanitizer = InputHtmlParser()
    sanitizer.feed(markup)
    self.assertEqual(markup, sanitizer.get_output())
|
||||
|
||||
def test_valid_html_invalid_attrs(self):
    """attributes outside the allowlist are dropped, the rest is kept"""
    expected = '<a href="fish.com">yes </a> <i>html</i>'
    sanitizer = InputHtmlParser()
    sanitizer.feed('<a href="fish.com" fish="hello">yes </a> <i>html</i>')
    self.assertEqual(sanitizer.get_output(), expected)
|
||||
|
||||
def test_invalid_html(self):
|
||||
"""remove all html when the html is malformed"""
|
||||
input_text = "<b>yes <i>html</i>"
|
||||
|
|
Loading…
Reference in a new issue