bookwyrm/bookwyrm/sanitize_html.py

''' html parser to clean up incoming text from unknown sources '''
from html import escape
from html.parser import HTMLParser

class InputHtmlParser(HTMLParser):#pylint: disable=abstract-method
    ''' Removes any html that isn't explicitly allowed from a block

    Feed untrusted text with ``feed()``, call ``close()`` so the base parser
    flushes any buffered text, then read the result with ``get_output()``.

    The result is always an html fragment: text is html-escaped, allowed
    tags are rebuilt from scratch with only allowed attributes, and if the
    markup is unbalanced every tag is dropped and only the text is kept.
    '''

    # elements that never have a closing tag; they must not be put on the
    # tag stack, otherwise a lone <br> would make the whole input look
    # unbalanced and get all of its markup stripped
    void_tags = frozenset([
        'area', 'base', 'br', 'col', 'embed', 'hr', 'img',
        'input', 'link', 'meta', 'source', 'track', 'wbr',
    ])

    # url schemes accepted in href; anything else (javascript:, data:, ...)
    # is dropped. urls without a scheme are relative and are kept.
    safe_url_schemes = frozenset(['http', 'https', 'mailto'])

    def __init__(self):
        HTMLParser.__init__(self)
        self.allowed_tags = [
            'p', 'blockquote', 'br',
            'b', 'i', 'strong', 'em', 'pre',
            'a', 'span', 'ul', 'ol', 'li'
        ]
        # attributes that survive on allowed tags; everything else
        # (event handlers, style, ...) is removed
        self.allowed_attrs = ['href', 'rel']
        # currently open (allowed, non-void) tags, innermost last
        self.tag_stack = []
        # list of ('tag' | 'data', text) tuples in document order
        self.output = []
        # if the html appears invalid, we just won't allow any at all
        self.allow_html = True

    def _is_safe_url(self, url):
        ''' true if the url is relative or uses an accepted scheme '''
        # browsers ignore whitespace and control characters inside the
        # scheme, so "java\tscript:" has to be recognised as "javascript:"
        compact = ''.join(ch for ch in url if ch > ' ').lower()
        scheme, colon, _ = compact.partition(':')
        if not colon or any(ch in scheme for ch in '/?#'):
            # no scheme at all: the first colon (if any) is part of the
            # path, query or fragment of a relative url
            return True
        return scheme in self.safe_url_schemes

    def _build_starttag(self, tag, attrs):
        ''' reconstruct a start tag keeping only the allowed attributes '''
        parts = [tag]
        for name, value in attrs:
            if name not in self.allowed_attrs:
                continue
            if value is None:
                # attribute given without a value, e.g. <a href>
                parts.append(name)
                continue
            if name == 'href' and not self._is_safe_url(value):
                continue
            # the parser hands over unescaped values, so re-escape them
            parts.append('%s="%s"' % (name, escape(value, quote=True)))
        return '<%s>' % ' '.join(parts)

    def handle_starttag(self, tag, attrs):
        ''' keep the start tag if it is allowed, minus unsafe attributes '''
        if not self.allow_html or tag not in self.allowed_tags:
            self.output.append(('data', ''))
            return

        self.output.append(('tag', self._build_starttag(tag, attrs)))
        if tag not in self.void_tags:
            self.tag_stack.append(tag)

    def handle_endtag(self, tag):
        ''' keep the close tag if it matches the most recent open tag '''
        if not self.allow_html or tag not in self.allowed_tags:
            self.output.append(('data', ''))
            return

        if tag in self.void_tags:
            # void elements have no closing tag: this is either the second
            # half of a self-closing <br/> or a stray </br>, so ignore it
            self.output.append(('data', ''))
            return

        if not self.tag_stack or self.tag_stack[-1] != tag:
            # the end tag doesn't match the most recent start tag
            self.allow_html = False
            self.output.append(('data', ''))
            return

        self.tag_stack.pop()
        self.output.append(('tag', '</%s>' % tag))

    def handle_data(self, data):
        ''' keep text content, escaped so it can't be read back as markup '''
        # the base parser has already decoded character references, so
        # "&lt;script&gt;" arrives here as "<script>" and must be re-escaped
        self.output.append(('data', escape(data, quote=False)))

    def get_output(self):
        ''' convert the output from a list of tuples to a string '''
        if self.tag_stack:
            # something was opened and never closed: treat it all as invalid
            self.allow_html = False
        if not self.allow_html:
            return ''.join(v for (k, v) in self.output if k == 'data')
        return ''.join(v for (k, v) in self.output)
comment and todo cleanup 2020-02-19 08:35:12 +00:00			`''' html parser to clean up incoming text from unknown sources '''`
Sanitizes html input 2020-02-15 05:45:13 +00:00			`from html.parser import HTMLParser`

Sanitize incoming html 2020-12-17 00:47:05 +00:00			`class InputHtmlParser(HTMLParser):#pylint: disable=abstract-method`
Fixes linter issues 2020-09-21 17:25:26 +00:00			`''' Removes any html that isn't allowed_tagsed from a block '''`
Sanitizes html input 2020-02-15 05:45:13 +00:00
			`def __init__(self):`
			`HTMLParser.__init__(self)`
Allow markdown in html fields 2020-12-20 02:54:56 +00:00			`self.allowed_tags = [`
Allows blockquote markdown 2021-01-29 16:00:19 +00:00			`'p', 'blockquote', 'br',`
Allow markdown in html fields 2020-12-20 02:54:56 +00:00			`'b', 'i', 'strong', 'em', 'pre',`
			`'a', 'span', 'ul', 'ol', 'li'`
			`]`
Sanitizes html input 2020-02-15 05:45:13 +00:00			`self.tag_stack = []`
			`self.output = []`
			`# if the html appears invalid, we just won't allow any at all`
			`self.allow_html = True`


			`def handle_starttag(self, tag, attrs):`
			`''' check if the tag is valid '''`
Fixes linter issues 2020-09-21 17:25:26 +00:00			`if self.allow_html and tag in self.allowed_tags:`
Allow a and span tags in posts Fixes #39 2020-03-13 00:23:55 +00:00			`self.output.append(('tag', self.get_starttag_text()))`
Sanitizes html input 2020-02-15 05:45:13 +00:00			`self.tag_stack.append(tag)`
			`else:`
Tests and fixes whitespace bugs in sanitizer 2020-05-10 01:30:24 +00:00			`self.output.append(('data', ''))`
Sanitizes html input 2020-02-15 05:45:13 +00:00

			`def handle_endtag(self, tag):`
			`''' keep the close tag '''`
Fixes linter issues 2020-09-21 17:25:26 +00:00			`if not self.allow_html or tag not in self.allowed_tags:`
Tests and fixes whitespace bugs in sanitizer 2020-05-10 01:30:24 +00:00			`self.output.append(('data', ''))`
Sanitizes html input 2020-02-15 05:45:13 +00:00			`return`

			`if not self.tag_stack or self.tag_stack[-1] != tag:`
			`# the end tag doesn't match the most recent start tag`
			`self.allow_html = False`
Tests and fixes whitespace bugs in sanitizer 2020-05-10 01:30:24 +00:00			`self.output.append(('data', ''))`
Sanitizes html input 2020-02-15 05:45:13 +00:00			`return`

			`self.tag_stack = self.tag_stack[:-1]`
			`self.output.append(('tag', '</%s>' % tag))`


			`def handle_data(self, data):`
			`''' extract the answer, if we're in an answer tag '''`
			`self.output.append(('data', data))`


			`def get_output(self):`
			`''' convert the output from a list of tuples to a string '''`
Tests and fixes whitespace bugs in sanitizer 2020-05-10 01:30:24 +00:00			`if self.tag_stack:`
			`self.allow_html = False`
Sanitizes html input 2020-02-15 05:45:13 +00:00			`if not self.allow_html:`
			`return ''.join(v for (k, v) in self.output if k == 'data')`
			`return ''.join(v for (k, v) in self.output)`