Uses library for html cleanup

2024-05-13 22:12:57 +00:00 · 2022-07-04 13:14:22 -07:00 · 2022-07-04 13:14:22 -07:00 · 7f78140015
parent 58b23a74da
commit 7f78140015
6 changed files with 44 additions and 102 deletions
--- a/bookwyrm/models/fields.py
+++ b/bookwyrm/models/fields.py
@ -16,7 +16,7 @@ from django.utils.encoding import filepath_to_uri
 from bookwyrm import activitypub
 from bookwyrm.connectors import get_image
-from bookwyrm.sanitize_html import InputHtmlParser
+from bookwyrm.utils.sanitizer import clean
 from bookwyrm.settings import MEDIA_FULL_URL
@ -497,9 +497,7 @@ class HtmlField(ActivitypubFieldMixin, models.TextField):
    def field_from_activity(self, value):
        if not value or value == MISSING:
            return None
-        sanitizer = InputHtmlParser()
+        return clean(value)
        sanitizer.feed(value)
        return sanitizer.get_output()
 class ArrayField(ActivitypubFieldMixin, DjangoArrayField):
--- a/bookwyrm/sanitize_html.py
+++ b/bookwyrm/sanitize_html.py
@ -1,71 +0,0 @@
 """ html parser to clean up incoming text from unknown sources """
 from html import escape
 from html.parser import HTMLParser
 class InputHtmlParser(HTMLParser):  # pylint: disable=abstract-method
    """Removes any html that isn't explicitly allowed from a block of text

    Usage: call ``feed(text)`` and then read the result with ``get_output()``.

    Allowed tags are passed through with only their allowed attributes, every
    other tag is dropped while its text content is kept. If the allowed tags
    are not properly nested and closed, all markup is dropped and only the
    text is returned.

    The base parser hands over text and attribute values with character
    references already decoded, so both are escaped again on output;
    otherwise ``&lt;script&gt;`` would come out as live markup.

    NOTE(review): URL schemes in ``href``/``src`` are not validated here
    (``javascript:`` links pass through) -- confirm this is handled elsewhere.
    """

    # elements without a closing tag: they are emitted but never tracked on
    # the open-tag stack, otherwise a bare <br> would look like unclosed html
    void_tags = frozenset(["br"])

    def __init__(self):
        HTMLParser.__init__(self)
        self.allowed_tags = [
            "p",
            "blockquote",
            "br",
            "b",
            "i",
            "strong",
            "em",
            "pre",
            "a",
            "span",
            "ul",
            "ol",
            "li",
        ]
        self.allowed_attrs = ["href", "rel", "src", "alt"]
        # allowed tags that have been opened and not closed yet
        self.tag_stack = []
        # ("tag" | "data", text) tuples in document order
        self.output = []
        # if the html appears invalid, we just won't allow any at all
        self.allow_html = True

    def handle_starttag(self, tag, attrs):
        """keep an allowed opening tag, with only its allowed attributes"""
        if not self.allow_html or tag not in self.allowed_tags:
            return

        pieces = [f"<{tag}"]
        for name, value in attrs:
            if name not in self.allowed_attrs:
                continue
            # the parser gives None for an attribute written without a value
            # and decodes character references, so escape before re-quoting
            safe_value = escape(value or "", quote=True)
            pieces.append(f' {name}="{safe_value}"')
        pieces.append(">")

        self.output.append(("tag", "".join(pieces)))
        if tag not in self.void_tags:
            self.tag_stack.append(tag)

    def handle_endtag(self, tag):
        """keep the close tag if it matches the most recent open tag"""
        if not self.allow_html or tag not in self.allowed_tags:
            return
        if tag in self.void_tags:
            # nothing to close; this also covers the self-closing <br/> form
            return
        if not self.tag_stack or self.tag_stack[-1] != tag:
            # the end tag doesn't match the most recent start tag
            self.allow_html = False
            return
        self.tag_stack.pop()
        self.output.append(("tag", f"</{tag}>"))

    def handle_data(self, data):
        """keep text content, escaped so it cannot be read back as markup"""
        self.output.append(("data", escape(data, quote=False)))

    def get_output(self):
        """convert the output from a list of tuples to a string"""
        # flush text the parser is still buffering (e.g. a trailing "AT&T",
        # which it holds back in case a character reference is incomplete)
        self.close()
        if self.tag_stack:
            # something was opened and never closed: treat the html as invalid
            self.allow_html = False
        if not self.allow_html:
            return "".join(text for kind, text in self.output if kind == "data")
        return "".join(text for _, text in self.output)
--- a/bookwyrm/tests/test_sanitize_html.py
+++ b/bookwyrm/tests/test_sanitize_html.py
@ -1,7 +1,7 @@
 """ make sure only valid html gets to the app """
 from django.test import TestCase
-from bookwyrm.sanitize_html import InputHtmlParser
+from bookwyrm.utils.sanitizer import clean
 class Sanitizer(TestCase):
@ -10,53 +10,45 @@ class Sanitizer(TestCase):
    def test_no_html(self):
        """just text"""
        input_text = "no      html  "
-        parser = InputHtmlParser()
+        output = clean(input_text)
        parser.feed(input_text)
        output = parser.get_output()
        self.assertEqual(input_text, output)
    def test_valid_html(self):
        """leave the html untouched"""
        input_text = "<b>yes    </b> <i>html</i>"
-        parser = InputHtmlParser()
+        output = clean(input_text)
        parser.feed(input_text)
        output = parser.get_output()
        self.assertEqual(input_text, output)
    def test_valid_html_attrs(self):
        """and don't remove useful attributes"""
        input_text = '<a href="fish.com">yes    </a> <i>html</i>'
-        parser = InputHtmlParser()
+        output = clean(input_text)
        parser.feed(input_text)
        output = parser.get_output()
        self.assertEqual(input_text, output)
    def test_valid_html_invalid_attrs(self):
        """do remove un-approved attributes"""
        input_text = '<a href="fish.com" fish="hello">yes    </a> <i>html</i>'
-        parser = InputHtmlParser()
+        output = clean(input_text)
        parser.feed(input_text)
        output = parser.get_output()
        self.assertEqual(output, '<a href="fish.com">yes    </a> <i>html</i>')
    def test_invalid_html(self):
        """remove all html when the html is malformed"""
        input_text = "<b>yes  <i>html</i>"
-        parser = InputHtmlParser()
+        output = clean(input_text)
        parser.feed(input_text)
        output = parser.get_output()
        self.assertEqual("yes  html", output)
        input_text = "yes <i></b>html   </i>"
-        parser = InputHtmlParser()
+        output = clean(input_text)
        parser.feed(input_text)
        output = parser.get_output()
        self.assertEqual("yes html   ", output)
    def test_disallowed_html(self):
        """remove disallowed html but keep allowed html"""
        input_text = "<div>  yes <i>html</i></div>"
-        parser = InputHtmlParser()
+        output = clean(input_text)
        parser.feed(input_text)
        output = parser.get_output()
        self.assertEqual("  yes <i>html</i>", output)
    def test_escaped_bracket(self):
        """remove &gt; and &lt;"""
        input_text = "&lt;dev&gt;hi&lt;/div&gt;"
        output = clean(input_text)
        self.assertEqual("hi", output)
--- a/bookwyrm/utils/sanitizer.py
+++ b/bookwyrm/utils/sanitizer.py
@ -0,0 +1,25 @@
 """Clean user-provided text"""
 import bleach
 def clean(input_text):
    """Sanitize user-provided html with "bleach"

    Only the tags and attributes listed below are kept. Disallowed tags are
    removed (their text content stays) rather than escaped, which is what
    the sanitizer tests expect, e.g. "<div>  yes <i>html</i></div>" becomes
    "  yes <i>html</i>".
    """
    return bleach.clean(
        input_text,
        tags=[
            "p",
            "blockquote",
            "br",
            "b",
            "i",
            "strong",
            "em",
            "pre",
            "a",
            "span",
            "ul",
            "ol",
            "li",
        ],
        attributes=["href", "rel", "src", "alt"],
        # bleach escapes disallowed markup by default; strip it instead
        strip=True,
    )
--- a/bookwyrm/views/status.py
+++ b/bookwyrm/views/status.py
@ -16,9 +16,8 @@ from django.views.decorators.http import require_POST
 from markdown import markdown
 from bookwyrm import forms, models
 from bookwyrm.sanitize_html import InputHtmlParser
 from bookwyrm.settings import DOMAIN
-from bookwyrm.utils import regex
+from bookwyrm.utils import regex, sanitizer
 from .helpers import handle_remote_webfinger, is_api_request
 from .helpers import load_date_in_user_tz_as_utc
@ -268,6 +267,4 @@ def to_markdown(content):
    content = format_links(content)
    content = markdown(content)
    # sanitize resulting html
-    sanitizer = InputHtmlParser()
+    return sanitizer.clean(content)
    sanitizer.feed(content)
    return sanitizer.get_output()
--- a/requirements.txt
+++ b/requirements.txt
@ -1,4 +1,5 @@
 aiohttp==3.8.1
 bleach==5.0.1
 celery==5.2.2
 colorthief==0.2.1
 Django==3.2.13