Uses library for html cleanup

This commit is contained in:
Mouse Reeve 2022-07-04 13:14:22 -07:00
parent 58b23a74da
commit 7f78140015
6 changed files with 44 additions and 102 deletions

View file

@ -16,7 +16,7 @@ from django.utils.encoding import filepath_to_uri
from bookwyrm import activitypub from bookwyrm import activitypub
from bookwyrm.connectors import get_image from bookwyrm.connectors import get_image
from bookwyrm.sanitize_html import InputHtmlParser from bookwyrm.utils.sanitizer import clean
from bookwyrm.settings import MEDIA_FULL_URL from bookwyrm.settings import MEDIA_FULL_URL
@ -497,9 +497,7 @@ class HtmlField(ActivitypubFieldMixin, models.TextField):
def field_from_activity(self, value): def field_from_activity(self, value):
if not value or value == MISSING: if not value or value == MISSING:
return None return None
sanitizer = InputHtmlParser() return clean(value)
sanitizer.feed(value)
return sanitizer.get_output()
class ArrayField(ActivitypubFieldMixin, DjangoArrayField): class ArrayField(ActivitypubFieldMixin, DjangoArrayField):

View file

@ -1,71 +0,0 @@
""" html parser to clean up incoming text from unknown sources """
from html import escape
from html.parser import HTMLParser
class InputHtmlParser(HTMLParser):  # pylint: disable=abstract-method
    """Removes any html that isn't in the allowed tags from a block

    Usage: create an instance, ``feed()`` it the untrusted text, then call
    ``get_output()`` to get the cleaned string. Tags outside ``allowed_tags``
    and attributes outside ``allowed_attrs`` are dropped while their text
    content is kept. If the allowed tags are not properly nested/closed, all
    markup is discarded and only the text is returned.

    Text and attribute values are always re-escaped on output, because the
    base parser hands them over with character references already decoded.

    NOTE(review): attribute values are escaped but not checked for dangerous
    URL schemes (e.g. ``javascript:`` in ``href``); callers that render the
    output as html should be aware of this.
    """

    # void elements never have a closing tag, so they must not be tracked on
    # the stack of open tags
    void_tags = frozenset(["br"])

    def __init__(self):
        # character references are decoded by the base class before they
        # reach handle_data, which is why the handlers re-escape everything
        super().__init__(convert_charrefs=True)
        self.allowed_tags = [
            "p",
            "blockquote",
            "br",
            "b",
            "i",
            "strong",
            "em",
            "pre",
            "a",
            "span",
            "ul",
            "ol",
            "li",
        ]
        self.allowed_attrs = ["href", "rel", "src", "alt"]
        # currently open (allowed, non-void) tags, innermost last
        self.tag_stack = []
        # list of ("tag" | "data", text) tuples in document order
        self.output = []
        # if the html appears invalid, we just won't allow any at all
        self.allow_html = True

    def handle_starttag(self, tag, attrs):
        """keep an allowed opening tag, with only its allowed attributes"""
        if not self.allow_html or tag not in self.allowed_tags:
            return
        pieces = [tag]
        for name, value in attrs:
            if name not in self.allowed_attrs:
                continue
            # a valueless attribute arrives as None; the value was decoded by
            # the parser, so quotes must be escaped again to keep it from
            # breaking out of the attribute
            safe_value = escape(value or "", quote=True)
            pieces.append(f'{name}="{safe_value}"')
        self.output.append(("tag", "<" + " ".join(pieces) + ">"))
        if tag not in self.void_tags:
            self.tag_stack.append(tag)

    def handle_endtag(self, tag):
        """keep the close tag if it matches the most recent open tag"""
        if not self.allow_html or tag not in self.allowed_tags:
            return
        if tag in self.void_tags:
            # "<br/>" is reported as a start tag followed by an end tag; the
            # start tag already produced the complete element
            return
        if not self.tag_stack or self.tag_stack[-1] != tag:
            # the end tag doesn't match the most recent start tag
            self.allow_html = False
            return
        self.tag_stack.pop()
        self.output.append(("tag", f"</{tag}>"))

    def handle_data(self, data):
        """keep text content, escaped so it can't be read as markup"""
        self.output.append(("data", escape(data, quote=False)))

    def get_output(self):
        """convert the output from a list of tuples to a string"""
        # flush any trailing text the parser is still buffering
        self.close()
        if self.tag_stack:
            # some allowed tag was never closed
            self.allow_html = False
        if not self.allow_html:
            return "".join(text for kind, text in self.output if kind == "data")
        return "".join(text for _, text in self.output)

View file

@ -1,7 +1,7 @@
""" make sure only valid html gets to the app """ """ make sure only valid html gets to the app """
from django.test import TestCase from django.test import TestCase
from bookwyrm.sanitize_html import InputHtmlParser from bookwyrm.utils.sanitizer import clean
class Sanitizer(TestCase): class Sanitizer(TestCase):
@ -10,53 +10,45 @@ class Sanitizer(TestCase):
def test_no_html(self): def test_no_html(self):
"""just text""" """just text"""
input_text = "no html " input_text = "no html "
parser = InputHtmlParser() output = clean(input_text)
parser.feed(input_text)
output = parser.get_output()
self.assertEqual(input_text, output) self.assertEqual(input_text, output)
def test_valid_html(self): def test_valid_html(self):
"""leave the html untouched""" """leave the html untouched"""
input_text = "<b>yes </b> <i>html</i>" input_text = "<b>yes </b> <i>html</i>"
parser = InputHtmlParser() output = clean(input_text)
parser.feed(input_text)
output = parser.get_output()
self.assertEqual(input_text, output) self.assertEqual(input_text, output)
def test_valid_html_attrs(self): def test_valid_html_attrs(self):
"""and don't remove useful attributes""" """and don't remove useful attributes"""
input_text = '<a href="fish.com">yes </a> <i>html</i>' input_text = '<a href="fish.com">yes </a> <i>html</i>'
parser = InputHtmlParser() output = clean(input_text)
parser.feed(input_text)
output = parser.get_output()
self.assertEqual(input_text, output) self.assertEqual(input_text, output)
def test_valid_html_invalid_attrs(self): def test_valid_html_invalid_attrs(self):
"""do remove un-approved attributes""" """do remove un-approved attributes"""
input_text = '<a href="fish.com" fish="hello">yes </a> <i>html</i>' input_text = '<a href="fish.com" fish="hello">yes </a> <i>html</i>'
parser = InputHtmlParser() output = clean(input_text)
parser.feed(input_text)
output = parser.get_output()
self.assertEqual(output, '<a href="fish.com">yes </a> <i>html</i>') self.assertEqual(output, '<a href="fish.com">yes </a> <i>html</i>')
def test_invalid_html(self): def test_invalid_html(self):
"""remove all html when the html is malformed""" """remove all html when the html is malformed"""
input_text = "<b>yes <i>html</i>" input_text = "<b>yes <i>html</i>"
parser = InputHtmlParser() output = clean(input_text)
parser.feed(input_text)
output = parser.get_output()
self.assertEqual("yes html", output) self.assertEqual("yes html", output)
input_text = "yes <i></b>html </i>" input_text = "yes <i></b>html </i>"
parser = InputHtmlParser() output = clean(input_text)
parser.feed(input_text)
output = parser.get_output()
self.assertEqual("yes html ", output) self.assertEqual("yes html ", output)
def test_disallowed_html(self): def test_disallowed_html(self):
"""remove disallowed html but keep allowed html""" """remove disallowed html but keep allowed html"""
input_text = "<div> yes <i>html</i></div>" input_text = "<div> yes <i>html</i></div>"
parser = InputHtmlParser() output = clean(input_text)
parser.feed(input_text)
output = parser.get_output()
self.assertEqual(" yes <i>html</i>", output) self.assertEqual(" yes <i>html</i>", output)
def test_escaped_bracket(self):
"""remove &gt; and &lt;"""
input_text = "&lt;dev&gt;hi&lt;/div&gt;"
output = clean(input_text)
self.assertEqual("hi", output)

View file

@ -0,0 +1,25 @@
"""Clean user-provided text"""
import bleach
def clean(input_text):
    """Run through "bleach"

    Returns ``input_text`` with only the listed tags and attributes kept.
    Disallowed tags are removed (their text content stays) rather than
    escaped, so they don't show up as literal markup in the rendered text.
    """
    return bleach.clean(
        input_text,
        tags=[
            "p",
            "blockquote",
            "br",
            "b",
            "i",
            "strong",
            "em",
            "pre",
            "a",
            "span",
            "ul",
            "ol",
            "li",
        ],
        attributes=["href", "rel", "src", "alt"],
        # bleach escapes disallowed tags by default; strip them instead
        strip=True,
    )

View file

@ -16,9 +16,8 @@ from django.views.decorators.http import require_POST
from markdown import markdown from markdown import markdown
from bookwyrm import forms, models from bookwyrm import forms, models
from bookwyrm.sanitize_html import InputHtmlParser
from bookwyrm.settings import DOMAIN from bookwyrm.settings import DOMAIN
from bookwyrm.utils import regex from bookwyrm.utils import regex, sanitizer
from .helpers import handle_remote_webfinger, is_api_request from .helpers import handle_remote_webfinger, is_api_request
from .helpers import load_date_in_user_tz_as_utc from .helpers import load_date_in_user_tz_as_utc
@ -268,6 +267,4 @@ def to_markdown(content):
content = format_links(content) content = format_links(content)
content = markdown(content) content = markdown(content)
# sanitize resulting html # sanitize resulting html
sanitizer = InputHtmlParser() return sanitizer.clean(content)
sanitizer.feed(content)
return sanitizer.get_output()

View file

@ -1,4 +1,5 @@
aiohttp==3.8.1 aiohttp==3.8.1
bleach==5.0.1
celery==5.2.2 celery==5.2.2
colorthief==0.2.1 colorthief==0.2.1
Django==3.2.13 Django==3.2.13