mirror of
https://github.com/bookwyrm-social/bookwyrm.git
synced 2024-11-29 13:01:08 +00:00
ec45f8d565
Fixes #39
51 lines
1.7 KiB
Python
51 lines
1.7 KiB
Python
''' html parser to clean up incoming text from unknown sources '''
|
|
from html import escape
from html.parser import HTMLParser
|
|
|
|
class InputHtmlParser(HTMLParser):
    ''' Removes any html that isn't whitelisted from a block

    Usage: create an instance, pass the untrusted markup to ``feed()``
    (followed by ``close()`` to flush buffered text), then read the cleaned
    string from ``get_output()``.

    Tags outside ``whitelist`` are replaced by a single space. If the markup
    is malformed (an end tag that does not match the most recent start tag,
    or a whitelisted tag that is never closed) every tag is dropped and only
    the text content is returned.
    '''

    # attribute values starting with one of these can execute script or
    # smuggle markup when the browser follows or loads them
    _UNSAFE_SCHEMES = ('javascript:', 'vbscript:', 'data:')

    def __init__(self):
        super().__init__()
        # tags that are allowed through to the output
        self.whitelist = ['p', 'b', 'i', 'pre', 'a', 'span']
        # whitelisted tags that have been opened and not yet closed
        self.tag_stack = []
        # ordered (kind, text) fragments, where kind is 'tag' or 'data'
        self.output = []
        # if the html appears invalid, we just won't allow any at all
        self.allow_html = True

    @classmethod
    def _is_safe_attr(cls, name, value):
        ''' False for event handlers and script-scheme attribute values '''
        if name.startswith('on'):
            # onclick, onmouseover, onerror, ...
            return False
        if value is None:
            # valueless attribute, nothing to inspect
            return True
        # whitespace and control characters can be used to disguise a
        # scheme ("java\tscript:"), so drop them before comparing
        compact = ''.join(char for char in value if char > ' ').lower()
        return not compact.startswith(cls._UNSAFE_SCHEMES)

    @staticmethod
    def _rebuild_starttag(tag, attrs):
        ''' serialize a start tag from already-parsed attribute pairs '''
        pieces = [tag]
        for name, value in attrs:
            if value is None:
                pieces.append(name)
            else:
                # the parser hands over unescaped values; re-escape them so
                # a quote in the value cannot terminate the attribute
                pieces.append('%s="%s"' % (name, escape(value, quote=True)))
        return '<%s>' % ' '.join(pieces)

    def handle_starttag(self, tag, attrs):
        ''' keep whitelisted start tags, minus any unsafe attributes

        ``tag`` is the lower-cased tag name and ``attrs`` the list of
        ``(name, value)`` pairs supplied by ``HTMLParser``.
        '''
        if not self.allow_html or tag not in self.whitelist:
            # stripped tags leave a space so adjacent words don't merge
            self.output.append(('data', ' '))
            return

        safe_attrs = [
            (name, value) for (name, value) in attrs
            if self._is_safe_attr(name, value)
        ]
        if len(safe_attrs) == len(attrs):
            # nothing was dropped: keep the author's markup byte-for-byte
            starttag = self.get_starttag_text()
        else:
            starttag = self._rebuild_starttag(tag, safe_attrs)

        self.output.append(('tag', starttag))
        self.tag_stack.append(tag)

    def handle_endtag(self, tag):
        ''' keep the close tag if it matches the most recent open tag '''
        if not self.allow_html or tag not in self.whitelist:
            self.output.append(('data', ' '))
            return

        if not self.tag_stack or self.tag_stack[-1] != tag:
            # the end tag doesn't match the most recent start tag
            self.allow_html = False
            self.output.append(('data', ' '))
            return

        self.tag_stack.pop()
        self.output.append(('tag', '</%s>' % tag))

    def handle_data(self, data):
        ''' keep text content exactly as the parser delivers it '''
        # NOTE(review): with HTMLParser's default convert_charrefs=True,
        # character references are already decoded here, so "&lt;b&gt;" in
        # the input is emitted as "<b>". Confirm that callers escape or
        # otherwise account for this before rendering the output as html.
        self.output.append(('data', data))

    def get_output(self):
        ''' convert the output from a list of tuples to a string

        Returns the tags and text joined in document order when the markup
        was well formed, otherwise only the text fragments.
        '''
        # a tag still on the stack was opened but never closed, which makes
        # the markup just as invalid as a mismatched end tag
        if self.allow_html and not self.tag_stack:
            return ''.join(text for (_, text) in self.output)
        return ''.join(
            text for (kind, text) in self.output if kind == 'data'
        )
|
|
|