Uses the bleach library for HTML cleanup

2025-04-14 14:24:05 +00:00 · 2022-07-04 13:14:22 -07:00 · 2022-07-04 13:14:22 -07:00 · 7f78140015
commit 7f78140015
parent 58b23a74da
6 changed files with 44 additions and 102 deletions
--- a/bookwyrm/models/fields.py
+++ b/bookwyrm/models/fields.py
@ -16,7 +16,7 @@ from django.utils.encoding import filepath_to_uri

 from bookwyrm import activitypub
 from bookwyrm.connectors import get_image
-from bookwyrm.sanitize_html import InputHtmlParser
+from bookwyrm.utils.sanitizer import clean
 from bookwyrm.settings import MEDIA_FULL_URL


@ -497,9 +497,7 @@ class HtmlField(ActivitypubFieldMixin, models.TextField):
    def field_from_activity(self, value):
        if not value or value == MISSING:
            return None
-        sanitizer = InputHtmlParser()
-        sanitizer.feed(value)
-        return sanitizer.get_output()
+        return clean(value)


 class ArrayField(ActivitypubFieldMixin, DjangoArrayField):
--- a/bookwyrm/sanitize_html.py
+++ b/bookwyrm/sanitize_html.py
@ -1,71 +0,0 @@
-""" html parser to clean up incoming text from unknown sources """
-from html.parser import HTMLParser
-
-
-class InputHtmlParser(HTMLParser):  # pylint: disable=abstract-method
-    """Removes any html that isn't allowed_tagsed from a block"""
-
-    def __init__(self):
-        HTMLParser.__init__(self)
-        self.allowed_tags = [
-            "p",
-            "blockquote",
-            "br",
-            "b",
-            "i",
-            "strong",
-            "em",
-            "pre",
-            "a",
-            "span",
-            "ul",
-            "ol",
-            "li",
-        ]
-        self.allowed_attrs = ["href", "rel", "src", "alt"]
-        self.tag_stack = []
-        self.output = []
-        # if the html appears invalid, we just won't allow any at all
-        self.allow_html = True
-
-    def handle_starttag(self, tag, attrs):
-        """check if the tag is valid"""
-        if self.allow_html and tag in self.allowed_tags:
-            allowed_attrs = " ".join(
-                f'{a}="{v}"' for a, v in attrs if a in self.allowed_attrs
-            )
-            reconstructed = f"<{tag}"
-            if allowed_attrs:
-                reconstructed += " " + allowed_attrs
-            reconstructed += ">"
-            self.output.append(("tag", reconstructed))
-            self.tag_stack.append(tag)
-        else:
-            self.output.append(("data", ""))
-
-    def handle_endtag(self, tag):
-        """keep the close tag"""
-        if not self.allow_html or tag not in self.allowed_tags:
-            self.output.append(("data", ""))
-            return
-
-        if not self.tag_stack or self.tag_stack[-1] != tag:
-            # the end tag doesn't match the most recent start tag
-            self.allow_html = False
-            self.output.append(("data", ""))
-            return
-
-        self.tag_stack = self.tag_stack[:-1]
-        self.output.append(("tag", f"</{tag}>"))
-
-    def handle_data(self, data):
-        """extract the answer, if we're in an answer tag"""
-        self.output.append(("data", data))
-
-    def get_output(self):
-        """convert the output from a list of tuples to a string"""
-        if self.tag_stack:
-            self.allow_html = False
-        if not self.allow_html:
-            return "".join(v for (k, v) in self.output if k == "data")
-        return "".join(v for (k, v) in self.output)
--- a/bookwyrm/tests/test_sanitize_html.py
+++ b/bookwyrm/tests/test_sanitize_html.py
@ -1,7 +1,7 @@
 """ make sure only valid html gets to the app """
 from django.test import TestCase

-from bookwyrm.sanitize_html import InputHtmlParser
+from bookwyrm.utils.sanitizer import clean


 class Sanitizer(TestCase):
@ -10,53 +10,45 @@ class Sanitizer(TestCase):
    def test_no_html(self):
        """just text"""
        input_text = "no      html  "
-        parser = InputHtmlParser()
-        parser.feed(input_text)
-        output = parser.get_output()
+        output = clean(input_text)
        self.assertEqual(input_text, output)

    def test_valid_html(self):
        """leave the html untouched"""
        input_text = "<b>yes    </b> <i>html</i>"
-        parser = InputHtmlParser()
-        parser.feed(input_text)
-        output = parser.get_output()
+        output = clean(input_text)
        self.assertEqual(input_text, output)

    def test_valid_html_attrs(self):
        """and don't remove useful attributes"""
        input_text = '<a href="fish.com">yes    </a> <i>html</i>'
-        parser = InputHtmlParser()
-        parser.feed(input_text)
-        output = parser.get_output()
+        output = clean(input_text)
        self.assertEqual(input_text, output)

    def test_valid_html_invalid_attrs(self):
        """do remove un-approved attributes"""
        input_text = '<a href="fish.com" fish="hello">yes    </a> <i>html</i>'
-        parser = InputHtmlParser()
-        parser.feed(input_text)
-        output = parser.get_output()
+        output = clean(input_text)
        self.assertEqual(output, '<a href="fish.com">yes    </a> <i>html</i>')

    def test_invalid_html(self):
-        """remove all html when the html is malformed"""
+        """malformed html is repaired by the sanitizer, not stripped wholesale"""
        input_text = "<b>yes  <i>html</i>"
-        parser = InputHtmlParser()
-        parser.feed(input_text)
-        output = parser.get_output()
-        self.assertEqual("yes  html", output)
+        output = clean(input_text)
+        # bleach parses with html5lib, which closes the dangling <b>
+        # instead of discarding every tag like the old parser did
+        self.assertEqual("<b>yes  <i>html</i></b>", output)

        input_text = "yes <i></b>html   </i>"
-        parser = InputHtmlParser()
-        parser.feed(input_text)
-        output = parser.get_output()
-        self.assertEqual("yes html   ", output)
+        output = clean(input_text)
+        # the stray </b> has no matching open tag and is ignored;
+        # the well-formed <i> pair is kept
+        self.assertEqual("yes <i>html   </i>", output)

    def test_disallowed_html(self):
        """remove disallowed html but keep allowed html"""
        input_text = "<div>  yes <i>html</i></div>"
-        parser = InputHtmlParser()
-        parser.feed(input_text)
-        output = parser.get_output()
+        output = clean(input_text)
        self.assertEqual("  yes <i>html</i>", output)
+
+    def test_escaped_bracket(self):
+        """remove &gt; and &lt;"""
+        input_text = "&lt;dev&gt;hi&lt;/div&gt;"
+        output = clean(input_text)
+        self.assertEqual("hi", output)
--- a/bookwyrm/utils/sanitizer.py
+++ b/bookwyrm/utils/sanitizer.py
@ -0,0 +1,25 @@
+"""Clean user-provided text"""
+import bleach
+
+
+def clean(input_text):
+    """Sanitize user-provided HTML with bleach.
+
+    Tags outside the allow-list are removed while their text content is
+    kept, and attributes outside the allow-list are dropped from the tags
+    that remain. Returns the sanitized string.
+    """
+    return bleach.clean(
+        input_text,
+        tags=[
+            "p",
+            "blockquote",
+            "br",
+            "b",
+            "i",
+            "strong",
+            "em",
+            "pre",
+            "a",
+            "span",
+            "ul",
+            "ol",
+            "li",
+        ],
+        attributes=["href", "rel", "src", "alt"],
+        # bleach escapes disallowed tags by default (e.g. "&lt;div&gt;");
+        # strip=True removes them instead, matching the previous
+        # InputHtmlParser behaviour that callers and tests expect
+        strip=True,
+    )
--- a/bookwyrm/views/status.py
+++ b/bookwyrm/views/status.py
@ -16,9 +16,8 @@ from django.views.decorators.http import require_POST

 from markdown import markdown
 from bookwyrm import forms, models
-from bookwyrm.sanitize_html import InputHtmlParser
 from bookwyrm.settings import DOMAIN
-from bookwyrm.utils import regex
+from bookwyrm.utils import regex, sanitizer
 from .helpers import handle_remote_webfinger, is_api_request
 from .helpers import load_date_in_user_tz_as_utc

@ -268,6 +267,4 @@ def to_markdown(content):
    content = format_links(content)
    content = markdown(content)
    # sanitize resulting html
-    sanitizer = InputHtmlParser()
-    sanitizer.feed(content)
-    return sanitizer.get_output()
+    return sanitizer.clean(content)
--- a/requirements.txt
+++ b/requirements.txt
@ -1,4 +1,5 @@
 aiohttp==3.8.1
+bleach==5.0.1
 celery==5.2.2
 colorthief==0.2.1
 Django==3.2.13