mirror of
https://github.com/bookwyrm-social/bookwyrm.git
synced 2024-12-13 11:46:35 +00:00
71 lines
2.3 KiB
Python
71 lines
2.3 KiB
Python
""" html parser to clean up incoming text from unknown sources """
|
|
from html import escape
from html.parser import HTMLParser
|
|
|
|
|
|
class InputHtmlParser(HTMLParser):  # pylint: disable=abstract-method
    """Removes any html that isn't explicitly allowed from a block of text.

    Feed untrusted markup with ``feed()``, call ``close()`` so the base parser
    flushes any buffered text, then read the cleaned markup from
    ``get_output()``.

    Tags listed in ``allowed_tags`` are rebuilt from scratch, keeping only the
    attributes named in ``allowed_attrs``. Every other tag is dropped while
    its text content is kept. If the markup is unbalanced (a closing tag that
    does not match the most recent opening tag, or tags left open at the end)
    all markup is discarded and only the text is returned.
    """

    # Elements that never take a closing tag; they are emitted by
    # handle_starttag alone and must not be tracked on the tag stack.
    _VOID_TAGS = frozenset(
        {
            "area",
            "base",
            "br",
            "col",
            "embed",
            "hr",
            "img",
            "input",
            "link",
            "meta",
            "source",
            "track",
            "wbr",
        }
    )
    # Attributes whose value is a URL and therefore needs a scheme check.
    _URL_ATTRS = frozenset({"href", "src"})
    # URL schemes that can execute script when followed or loaded.
    _UNSAFE_SCHEMES = ("javascript:", "vbscript:", "data:")

    def __init__(self):
        super().__init__()
        self.allowed_tags = [
            "p",
            "blockquote",
            "br",
            "b",
            "i",
            "strong",
            "em",
            "pre",
            "a",
            "span",
            "ul",
            "ol",
            "li",
        ]
        self.allowed_attrs = ["href", "rel", "src", "alt"]
        # currently open (allowed, non-void) tags, innermost last
        self.tag_stack = []
        # list of ("tag" | "data", text) tuples in document order
        self.output = []
        # if the html appears invalid, we just won't allow any at all
        self.allow_html = True

    def _render_attr(self, name, value):
        """Serialise one attribute; returns None if it has to be dropped."""
        if value is None:
            # valueless attribute such as <a href>; the parser reports None
            return name
        if name in self._URL_ATTRS:
            # browsers ignore whitespace and control characters in the scheme,
            # so remove them before comparing
            compact = "".join(char for char in value if char > " ").lower()
            if compact.startswith(self._UNSAFE_SCHEMES):
                return None
        # the parser hands over the *decoded* value, so it has to be re-escaped
        # or a quote inside it would terminate the attribute early
        return f'{name}="{escape(value, quote=True)}"'

    def handle_starttag(self, tag, attrs):
        """Keep the opening tag if it is allowed, minus any unsafe attributes.

        ``attrs`` is the list of ``(name, value)`` pairs supplied by the base
        parser; ``value`` is None for attributes written without a value.
        """
        if not self.allow_html or tag not in self.allowed_tags:
            self.output.append(("data", ""))
            return

        rendered = (
            self._render_attr(name, value)
            for name, value in attrs
            if name in self.allowed_attrs
        )
        kept_attrs = " ".join(attr for attr in rendered if attr is not None)

        reconstructed = f"<{tag} {kept_attrs}>" if kept_attrs else f"<{tag}>"
        self.output.append(("tag", reconstructed))
        if tag not in self._VOID_TAGS:
            self.tag_stack.append(tag)

    def handle_endtag(self, tag):
        """Keep the closing tag if it matches the most recent opening tag."""
        if not self.allow_html or tag not in self.allowed_tags:
            self.output.append(("data", ""))
            return

        if tag in self._VOID_TAGS:
            # reached for "<br/>" (via handle_startendtag) or a stray "</br>";
            # the start tag already produced all the output that is needed
            return

        if not self.tag_stack or self.tag_stack[-1] != tag:
            # the end tag doesn't match the most recent start tag
            self.allow_html = False
            self.output.append(("data", ""))
            return

        self.tag_stack.pop()
        self.output.append(("tag", f"</{tag}>"))

    def handle_data(self, data):
        """Keep text content, re-escaped so it can never turn into markup.

        The base parser decodes character references before calling this, so
        "&lt;script&gt;" arrives as "<script>" and must be escaped again.
        """
        self.output.append(("data", escape(data, quote=False)))

    def get_output(self):
        """Convert the output from a list of tuples to a string.

        Returns the cleaned markup, or only the (escaped) text content when
        the input was unbalanced or still has open tags.
        """
        if self.tag_stack:
            # something was opened and never closed
            self.allow_html = False
        if not self.allow_html:
            return "".join(text for kind, text in self.output if kind == "data")
        return "".join(text for _, text in self.output)
|