From 7f78140015d6e2c9bac86ba9931911009c407dba Mon Sep 17 00:00:00 2001 From: Mouse Reeve Date: Mon, 4 Jul 2022 13:14:22 -0700 Subject: [PATCH] Uses library for html cleanup --- bookwyrm/models/fields.py | 6 +-- bookwyrm/sanitize_html.py | 71 ---------------------------- bookwyrm/tests/test_sanitize_html.py | 36 ++++++-------- bookwyrm/utils/sanitizer.py | 25 ++++++++++ bookwyrm/views/status.py | 7 +-- requirements.txt | 1 + 6 files changed, 44 insertions(+), 102 deletions(-) delete mode 100644 bookwyrm/sanitize_html.py create mode 100644 bookwyrm/utils/sanitizer.py diff --git a/bookwyrm/models/fields.py b/bookwyrm/models/fields.py index 62c61cc40..785f3397c 100644 --- a/bookwyrm/models/fields.py +++ b/bookwyrm/models/fields.py @@ -16,7 +16,7 @@ from django.utils.encoding import filepath_to_uri from bookwyrm import activitypub from bookwyrm.connectors import get_image -from bookwyrm.sanitize_html import InputHtmlParser +from bookwyrm.utils.sanitizer import clean from bookwyrm.settings import MEDIA_FULL_URL @@ -497,9 +497,7 @@ class HtmlField(ActivitypubFieldMixin, models.TextField): def field_from_activity(self, value): if not value or value == MISSING: return None - sanitizer = InputHtmlParser() - sanitizer.feed(value) - return sanitizer.get_output() + return clean(value) class ArrayField(ActivitypubFieldMixin, DjangoArrayField): diff --git a/bookwyrm/sanitize_html.py b/bookwyrm/sanitize_html.py deleted file mode 100644 index 4edd2818e..000000000 --- a/bookwyrm/sanitize_html.py +++ /dev/null @@ -1,71 +0,0 @@ -""" html parser to clean up incoming text from unknown sources """ -from html.parser import HTMLParser - - -class InputHtmlParser(HTMLParser): # pylint: disable=abstract-method - """Removes any html that isn't allowed_tagsed from a block""" - - def __init__(self): - HTMLParser.__init__(self) - self.allowed_tags = [ - "p", - "blockquote", - "br", - "b", - "i", - "strong", - "em", - "pre", - "a", - "span", - "ul", - "ol", - "li", - ] - self.allowed_attrs = ["href", "rel", "src", "alt"] - self.tag_stack = [] - self.output = [] - # if the html appears invalid, we just won't allow any at all - self.allow_html = True - - def handle_starttag(self, tag, attrs): - """check if the tag is valid""" - if self.allow_html and tag in self.allowed_tags: - allowed_attrs = " ".join( - f'{a}="{v}"' for a, v in attrs if a in self.allowed_attrs - ) - reconstructed = f"<{tag}" - if allowed_attrs: - reconstructed += " " + allowed_attrs - reconstructed += ">" - self.output.append(("tag", reconstructed)) - self.tag_stack.append(tag) - else: - self.output.append(("data", "")) - - def handle_endtag(self, tag): - """keep the close tag""" - if not self.allow_html or tag not in self.allowed_tags: - self.output.append(("data", "")) - return - - if not self.tag_stack or self.tag_stack[-1] != tag: - # the end tag doesn't match the most recent start tag - self.allow_html = False - self.output.append(("data", "")) - return - - self.tag_stack = self.tag_stack[:-1] - self.output.append(("tag", f"")) - - def handle_data(self, data): - """extract the answer, if we're in an answer tag""" - self.output.append(("data", data)) - - def get_output(self): - """convert the output from a list of tuples to a string""" - if self.tag_stack: - self.allow_html = False - if not self.allow_html: - return "".join(v for (k, v) in self.output if k == "data") - return "".join(v for (k, v) in self.output) diff --git a/bookwyrm/tests/test_sanitize_html.py b/bookwyrm/tests/test_sanitize_html.py index 5814f2207..ecdd69793 100644 --- a/bookwyrm/tests/test_sanitize_html.py +++ b/bookwyrm/tests/test_sanitize_html.py @@ -1,7 +1,7 @@ """ make sure only valid html gets to the app """ from django.test import TestCase -from bookwyrm.sanitize_html import InputHtmlParser +from bookwyrm.utils.sanitizer import clean class Sanitizer(TestCase): @@ -10,53 +10,45 @@ class Sanitizer(TestCase): def test_no_html(self): """just text""" input_text = "no html " - parser = InputHtmlParser() - parser.feed(input_text) - output = parser.get_output() + output = clean(input_text) self.assertEqual(input_text, output) def test_valid_html(self): """leave the html untouched""" input_text = "yes html" - parser = InputHtmlParser() - parser.feed(input_text) - output = parser.get_output() + output = clean(input_text) self.assertEqual(input_text, output) def test_valid_html_attrs(self): """and don't remove useful attributes""" input_text = 'yes html' - parser = InputHtmlParser() - parser.feed(input_text) - output = parser.get_output() + output = clean(input_text) self.assertEqual(input_text, output) def test_valid_html_invalid_attrs(self): """do remove un-approved attributes""" input_text = 'yes html' - parser = InputHtmlParser() - parser.feed(input_text) - output = parser.get_output() + output = clean(input_text) self.assertEqual(output, 'yes html') def test_invalid_html(self): """remove all html when the html is malformed""" input_text = "yes html" - parser = InputHtmlParser() - parser.feed(input_text) - output = parser.get_output() + output = clean(input_text) self.assertEqual("yes html", output) input_text = "yes html " - parser = InputHtmlParser() - parser.feed(input_text) - output = parser.get_output() + output = clean(input_text) self.assertEqual("yes html ", output) def test_disallowed_html(self): """remove disallowed html but keep allowed html""" input_text = "
yes html
" - parser = InputHtmlParser() - parser.feed(input_text) - output = parser.get_output() + output = clean(input_text) self.assertEqual(" yes html", output) + + def test_escaped_bracket(self): + """remove > and <""" + input_text = "<dev>hi</div>" + output = clean(input_text) + self.assertEqual("hi", output) diff --git a/bookwyrm/utils/sanitizer.py b/bookwyrm/utils/sanitizer.py new file mode 100644 index 000000000..676921949 --- /dev/null +++ b/bookwyrm/utils/sanitizer.py @@ -0,0 +1,25 @@ +"""Clean user-provided text""" +import bleach + + +def clean(input_text): + """Run through "bleach" """ + return bleach.clean( + input_text, + tags=[ + "p", + "blockquote", + "br", + "b", + "i", + "strong", + "em", + "pre", + "a", + "span", + "ul", + "ol", + "li", + ], + attributes=["href", "rel", "src", "alt"], + ) diff --git a/bookwyrm/views/status.py b/bookwyrm/views/status.py index 670ea5717..0dd9e0f80 100644 --- a/bookwyrm/views/status.py +++ b/bookwyrm/views/status.py @@ -16,9 +16,8 @@ from django.views.decorators.http import require_POST from markdown import markdown from bookwyrm import forms, models -from bookwyrm.sanitize_html import InputHtmlParser from bookwyrm.settings import DOMAIN -from bookwyrm.utils import regex +from bookwyrm.utils import regex, sanitizer from .helpers import handle_remote_webfinger, is_api_request from .helpers import load_date_in_user_tz_as_utc @@ -268,6 +267,4 @@ def to_markdown(content): content = format_links(content) content = markdown(content) # sanitize resulting html - sanitizer = InputHtmlParser() - sanitizer.feed(content) - return sanitizer.get_output() + return sanitizer.clean(content) diff --git a/requirements.txt b/requirements.txt index 0155782cc..3d9f686ae 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,5 @@ aiohttp==3.8.1 +bleach==5.0.1 celery==5.2.2 colorthief==0.2.1 Django==3.2.13