From 7f78140015d6e2c9bac86ba9931911009c407dba Mon Sep 17 00:00:00 2001
From: Mouse Reeve <mousereeve@riseup.net>
Date: Mon, 4 Jul 2022 13:14:22 -0700
Subject: [PATCH] Uses library for html cleanup

---
 bookwyrm/models/fields.py            |  6 +--
 bookwyrm/sanitize_html.py            | 71 ----------------------------
 bookwyrm/tests/test_sanitize_html.py | 36 ++++++--------
 bookwyrm/utils/sanitizer.py          | 25 ++++++++++
 bookwyrm/views/status.py             |  7 +--
 requirements.txt                     |  1 +
 6 files changed, 44 insertions(+), 102 deletions(-)
 delete mode 100644 bookwyrm/sanitize_html.py
 create mode 100644 bookwyrm/utils/sanitizer.py

diff --git a/bookwyrm/models/fields.py b/bookwyrm/models/fields.py
index 62c61cc40..785f3397c 100644
--- a/bookwyrm/models/fields.py
+++ b/bookwyrm/models/fields.py
@@ -16,7 +16,7 @@ from django.utils.encoding import filepath_to_uri
 
 from bookwyrm import activitypub
 from bookwyrm.connectors import get_image
-from bookwyrm.sanitize_html import InputHtmlParser
+from bookwyrm.utils.sanitizer import clean
 from bookwyrm.settings import MEDIA_FULL_URL
 
 
@@ -497,9 +497,7 @@ class HtmlField(ActivitypubFieldMixin, models.TextField):
     def field_from_activity(self, value):
         if not value or value == MISSING:
             return None
-        sanitizer = InputHtmlParser()
-        sanitizer.feed(value)
-        return sanitizer.get_output()
+        return clean(value)
 
 
 class ArrayField(ActivitypubFieldMixin, DjangoArrayField):
diff --git a/bookwyrm/sanitize_html.py b/bookwyrm/sanitize_html.py
deleted file mode 100644
index 4edd2818e..000000000
--- a/bookwyrm/sanitize_html.py
+++ /dev/null
@@ -1,71 +0,0 @@
-""" html parser to clean up incoming text from unknown sources """
-from html.parser import HTMLParser
-
-
-class InputHtmlParser(HTMLParser):  # pylint: disable=abstract-method
-    """Removes any html that isn't allowed_tagsed from a block"""
-
-    def __init__(self):
-        HTMLParser.__init__(self)
-        self.allowed_tags = [
-            "p",
-            "blockquote",
-            "br",
-            "b",
-            "i",
-            "strong",
-            "em",
-            "pre",
-            "a",
-            "span",
-            "ul",
-            "ol",
-            "li",
-        ]
-        self.allowed_attrs = ["href", "rel", "src", "alt"]
-        self.tag_stack = []
-        self.output = []
-        # if the html appears invalid, we just won't allow any at all
-        self.allow_html = True
-
-    def handle_starttag(self, tag, attrs):
-        """check if the tag is valid"""
-        if self.allow_html and tag in self.allowed_tags:
-            allowed_attrs = " ".join(
-                f'{a}="{v}"' for a, v in attrs if a in self.allowed_attrs
-            )
-            reconstructed = f"<{tag}"
-            if allowed_attrs:
-                reconstructed += " " + allowed_attrs
-            reconstructed += ">"
-            self.output.append(("tag", reconstructed))
-            self.tag_stack.append(tag)
-        else:
-            self.output.append(("data", ""))
-
-    def handle_endtag(self, tag):
-        """keep the close tag"""
-        if not self.allow_html or tag not in self.allowed_tags:
-            self.output.append(("data", ""))
-            return
-
-        if not self.tag_stack or self.tag_stack[-1] != tag:
-            # the end tag doesn't match the most recent start tag
-            self.allow_html = False
-            self.output.append(("data", ""))
-            return
-
-        self.tag_stack = self.tag_stack[:-1]
-        self.output.append(("tag", f"</{tag}>"))
-
-    def handle_data(self, data):
-        """extract the answer, if we're in an answer tag"""
-        self.output.append(("data", data))
-
-    def get_output(self):
-        """convert the output from a list of tuples to a string"""
-        if self.tag_stack:
-            self.allow_html = False
-        if not self.allow_html:
-            return "".join(v for (k, v) in self.output if k == "data")
-        return "".join(v for (k, v) in self.output)
diff --git a/bookwyrm/tests/test_sanitize_html.py b/bookwyrm/tests/test_sanitize_html.py
index 5814f2207..ecdd69793 100644
--- a/bookwyrm/tests/test_sanitize_html.py
+++ b/bookwyrm/tests/test_sanitize_html.py
@@ -1,7 +1,7 @@
 """ make sure only valid html gets to the app """
 from django.test import TestCase
 
-from bookwyrm.sanitize_html import InputHtmlParser
+from bookwyrm.utils.sanitizer import clean
 
 
 class Sanitizer(TestCase):
@@ -10,53 +10,45 @@ class Sanitizer(TestCase):
     def test_no_html(self):
         """just text"""
         input_text = "no      html  "
-        parser = InputHtmlParser()
-        parser.feed(input_text)
-        output = parser.get_output()
+        output = clean(input_text)
         self.assertEqual(input_text, output)
 
     def test_valid_html(self):
         """leave the html untouched"""
         input_text = "<b>yes    </b> <i>html</i>"
-        parser = InputHtmlParser()
-        parser.feed(input_text)
-        output = parser.get_output()
+        output = clean(input_text)
         self.assertEqual(input_text, output)
 
     def test_valid_html_attrs(self):
         """and don't remove useful attributes"""
         input_text = '<a href="fish.com">yes    </a> <i>html</i>'
-        parser = InputHtmlParser()
-        parser.feed(input_text)
-        output = parser.get_output()
+        output = clean(input_text)
         self.assertEqual(input_text, output)
 
     def test_valid_html_invalid_attrs(self):
         """do remove un-approved attributes"""
         input_text = '<a href="fish.com" fish="hello">yes    </a> <i>html</i>'
-        parser = InputHtmlParser()
-        parser.feed(input_text)
-        output = parser.get_output()
+        output = clean(input_text)
         self.assertEqual(output, '<a href="fish.com">yes    </a> <i>html</i>')
 
     def test_invalid_html(self):
         """remove all html when the html is malformed"""
         input_text = "<b>yes  <i>html</i>"
-        parser = InputHtmlParser()
-        parser.feed(input_text)
-        output = parser.get_output()
+        output = clean(input_text)
         self.assertEqual("yes  html", output)
 
         input_text = "yes <i></b>html   </i>"
-        parser = InputHtmlParser()
-        parser.feed(input_text)
-        output = parser.get_output()
+        output = clean(input_text)
         self.assertEqual("yes html   ", output)
 
     def test_disallowed_html(self):
         """remove disallowed html but keep allowed html"""
         input_text = "<div>  yes <i>html</i></div>"
-        parser = InputHtmlParser()
-        parser.feed(input_text)
-        output = parser.get_output()
+        output = clean(input_text)
         self.assertEqual("  yes <i>html</i>", output)
+
+    def test_escaped_bracket(self):
+        """remove &gt; and &lt;"""
+        input_text = "&lt;dev&gt;hi&lt;/div&gt;"
+        output = clean(input_text)
+        self.assertEqual("hi", output)
diff --git a/bookwyrm/utils/sanitizer.py b/bookwyrm/utils/sanitizer.py
new file mode 100644
index 000000000..676921949
--- /dev/null
+++ b/bookwyrm/utils/sanitizer.py
@@ -0,0 +1,25 @@
+"""Clean user-provided text"""
+import bleach
+
+
+def clean(input_text):
+    """Run through "bleach" """
+    return bleach.clean(
+        input_text,
+        tags=[
+            "p",
+            "blockquote",
+            "br",
+            "b",
+            "i",
+            "strong",
+            "em",
+            "pre",
+            "a",
+            "span",
+            "ul",
+            "ol",
+            "li",
+        ],
+        attributes=["href", "rel", "src", "alt"],
+    )
diff --git a/bookwyrm/views/status.py b/bookwyrm/views/status.py
index 670ea5717..0dd9e0f80 100644
--- a/bookwyrm/views/status.py
+++ b/bookwyrm/views/status.py
@@ -16,9 +16,8 @@ from django.views.decorators.http import require_POST
 
 from markdown import markdown
 from bookwyrm import forms, models
-from bookwyrm.sanitize_html import InputHtmlParser
 from bookwyrm.settings import DOMAIN
-from bookwyrm.utils import regex
+from bookwyrm.utils import regex, sanitizer
 from .helpers import handle_remote_webfinger, is_api_request
 from .helpers import load_date_in_user_tz_as_utc
 
@@ -268,6 +267,4 @@ def to_markdown(content):
     content = format_links(content)
     content = markdown(content)
     # sanitize resulting html
-    sanitizer = InputHtmlParser()
-    sanitizer.feed(content)
-    return sanitizer.get_output()
+    return sanitizer.clean(content)
diff --git a/requirements.txt b/requirements.txt
index 0155782cc..3d9f686ae 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,5 @@
 aiohttp==3.8.1
+bleach==5.0.1
 celery==5.2.2
 colorthief==0.2.1
 Django==3.2.13