bookwyrm/bookwyrm/sanitize_html.py

57 lines
1.9 KiB
Python
Raw Normal View History

2020-02-19 08:35:12 +00:00
''' html parser to clean up incoming text from unknown sources '''
2020-02-15 05:45:13 +00:00
from html import escape
from html.parser import HTMLParser
2020-12-17 00:47:05 +00:00
class InputHtmlParser(HTMLParser):  # pylint: disable=abstract-method
    ''' Removes any html that isn't explicitly allowed from a block of text.

    Feed the untrusted text with ``feed()``, call ``close()`` so buffered
    text is flushed, then read the result from ``get_output()``. Allowed
    tags are rebuilt from an allow-list of attributes, text is re-escaped,
    and if the tags turn out to be unbalanced every tag is dropped and only
    the text is returned.
    '''

    # elements that never take a closing tag; they are emitted whole by
    # handle_starttag and are not tracked on the tag stack
    void_tags = ('br',)
    # url schemes permitted in an href; a url without a scheme is relative
    allowed_schemes = ('http', 'https', 'mailto')

    def __init__(self):
        # handle_data re-escapes its input, which is only correct when the
        # parser hands it decoded text, so make that setting explicit
        HTMLParser.__init__(self, convert_charrefs=True)
        self.allowed_tags = [
            'p', 'blockquote', 'br',
            'b', 'i', 'strong', 'em', 'pre',
            'a', 'span', 'ul', 'ol', 'li'
        ]
        # every attribute not listed here is removed from allowed tags
        self.allowed_attrs = ['href', 'rel', 'title']
        # allowed tags that have been opened but not yet closed
        self.tag_stack = []
        # ('tag' | 'data', text) tuples, in document order
        self.output = []
        # if the html appears invalid, we just won't allow any at all
        self.allow_html = True

    def handle_starttag(self, tag, attrs):
        ''' keep an allowed opening tag, rebuilt from its safe attributes '''
        if not self.allow_html or tag not in self.allowed_tags:
            self.output.append(('data', ''))
            return
        self.output.append(('tag', self._rebuild_starttag(tag, attrs)))
        if tag not in self.void_tags:
            self.tag_stack.append(tag)

    def handle_endtag(self, tag):
        ''' keep a closing tag only if it closes the innermost open tag '''
        if not self.allow_html or tag not in self.allowed_tags:
            self.output.append(('data', ''))
            return
        if tag in self.void_tags:
            # also reached for self-closing input like <br/>, where the
            # start tag handler has already written the whole element
            self.output.append(('data', ''))
            return
        if not self.tag_stack or self.tag_stack[-1] != tag:
            # the end tag doesn't match the most recent start tag
            self.allow_html = False
            self.output.append(('data', ''))
            return
        self.tag_stack.pop()
        self.output.append(('tag', '</%s>' % tag))

    def handle_data(self, data):
        ''' keep text content, re-escaped so it is never read as markup '''
        # the parser decodes character references before calling this, so
        # without escaping, "&lt;script&gt;" would be emitted as a real tag
        self.output.append(('data', escape(data, quote=False)))

    def get_output(self):
        ''' convert the output from a list of tuples to a string '''
        if self.tag_stack:
            # something was opened and never closed
            self.allow_html = False
        if not self.allow_html:
            return ''.join(text for (kind, text) in self.output if kind == 'data')
        return ''.join(text for (_, text) in self.output)

    def _rebuild_starttag(self, tag, attrs):
        ''' serialize a start tag using only allow-listed attributes '''
        parts = [tag]
        for name, value in attrs:
            if name not in self.allowed_attrs:
                continue
            if value is None:
                # attribute given without a value, e.g. <a href>
                value = ''
            if name == 'href' and not self._is_safe_url(value):
                continue
            parts.append('%s="%s"' % (name, escape(value, quote=True)))
        return '<%s>' % ' '.join(parts)

    def _is_safe_url(self, url):
        ''' true for relative urls and urls with an allowed scheme '''
        # browsers ignore whitespace and control characters inside a
        # scheme ("java\tscript:"), so remove them before checking
        compact = ''.join(char for char in url if char > ' ')
        scheme, colon, _ = compact.partition(':')
        if not colon or any(char in scheme for char in '/?#'):
            # the first colon (if any) comes after the path, query or
            # fragment started, so there is no scheme
            return True
        return scheme.lower() in self.allowed_schemes