mirror of
https://github.com/bookwyrm-social/bookwyrm.git
synced 2025-01-25 00:18:10 +00:00
Uses library for html cleanup
This commit is contained in:
parent
58b23a74da
commit
7f78140015
6 changed files with 44 additions and 102 deletions
|
@ -16,7 +16,7 @@ from django.utils.encoding import filepath_to_uri
|
||||||
|
|
||||||
from bookwyrm import activitypub
|
from bookwyrm import activitypub
|
||||||
from bookwyrm.connectors import get_image
|
from bookwyrm.connectors import get_image
|
||||||
from bookwyrm.sanitize_html import InputHtmlParser
|
from bookwyrm.utils.sanitizer import clean
|
||||||
from bookwyrm.settings import MEDIA_FULL_URL
|
from bookwyrm.settings import MEDIA_FULL_URL
|
||||||
|
|
||||||
|
|
||||||
|
@ -497,9 +497,7 @@ class HtmlField(ActivitypubFieldMixin, models.TextField):
|
||||||
def field_from_activity(self, value):
|
def field_from_activity(self, value):
|
||||||
if not value or value == MISSING:
|
if not value or value == MISSING:
|
||||||
return None
|
return None
|
||||||
sanitizer = InputHtmlParser()
|
return clean(value)
|
||||||
sanitizer.feed(value)
|
|
||||||
return sanitizer.get_output()
|
|
||||||
|
|
||||||
|
|
||||||
class ArrayField(ActivitypubFieldMixin, DjangoArrayField):
|
class ArrayField(ActivitypubFieldMixin, DjangoArrayField):
|
||||||
|
|
|
@ -1,71 +0,0 @@
|
||||||
""" html parser to clean up incoming text from unknown sources """
|
|
||||||
from html import escape
from html.parser import HTMLParser
|
|
||||||
|
|
||||||
|
|
||||||
class InputHtmlParser(HTMLParser):  # pylint: disable=abstract-method
    """Removes any html that isn't on the allow-list from a block of text

    Feed untrusted markup with ``feed()`` and read the cleaned string back
    with ``get_output()``. Allowed tags are re-emitted with only their
    allowed attributes; other tags are dropped while the text between them
    is kept. If the markup is malformed (a closing tag that doesn't match
    the most recent opening tag, or a tag left open), every tag is dropped
    and only the text is returned.

    Text and attribute values are html-escaped on the way out, because the
    base parser hands them over already unescaped.
    """

    # elements that never have a closing tag; they are not tracked on the
    # open-tag stack, otherwise a plain "<br>" would look like an unclosed tag
    void_tags = frozenset({"br"})

    def __init__(self):
        super().__init__()
        self.allowed_tags = [
            "p",
            "blockquote",
            "br",
            "b",
            "i",
            "strong",
            "em",
            "pre",
            "a",
            "span",
            "ul",
            "ol",
            "li",
        ]
        self.allowed_attrs = ["href", "rel", "src", "alt"]
        # allowed tags that have been opened but not closed yet
        self.tag_stack = []
        # ("tag" | "data", text) tuples in document order
        self.output = []
        # if the html appears invalid, we just won't allow any at all
        self.allow_html = True

    def handle_starttag(self, tag, attrs):
        """keep an allowed opening tag, reduced to its allowed attributes"""
        if not self.allow_html or tag not in self.allowed_tags:
            self.output.append(("data", ""))
            return

        # attribute values arrive unescaped (and as None when the attribute
        # has no value), so escape them again: otherwise a value containing
        # a double quote could close the attribute and inject new ones
        # NOTE(review): the url scheme of href/src is not checked here, so a
        # "javascript:" link still passes through
        kept_attrs = " ".join(
            f'{name}="{escape(value or "", quote=True)}"'
            for name, value in attrs
            if name in self.allowed_attrs
        )
        reconstructed = f"<{tag} {kept_attrs}>" if kept_attrs else f"<{tag}>"
        self.output.append(("tag", reconstructed))
        if tag not in self.void_tags:
            self.tag_stack.append(tag)

    def handle_endtag(self, tag):
        """keep the close tag if it matches the most recent open tag"""
        if not self.allow_html or tag not in self.allowed_tags:
            self.output.append(("data", ""))
            return

        if tag in self.void_tags:
            # reached for "<br/>" (the base class reports it as a start tag
            # followed by an end tag) and for a stray "</br>"; the start tag
            # is all that should be written
            self.output.append(("data", ""))
            return

        if not self.tag_stack or self.tag_stack[-1] != tag:
            # the end tag doesn't match the most recent start tag
            self.allow_html = False
            self.output.append(("data", ""))
            return

        self.tag_stack.pop()
        self.output.append(("tag", f"</{tag}>"))

    def handle_data(self, data):
        """keep text content, escaped so it can never be read as markup"""
        # character references were already decoded by the base parser, so
        # "&lt;script&gt;" shows up here as "<script>" and must be re-escaped
        self.output.append(("data", escape(data, quote=False)))

    def get_output(self):
        """convert the output from a list of tuples to a string"""
        if self.tag_stack:
            # something was opened and never closed: treat it all as invalid
            self.allow_html = False
        if not self.allow_html:
            return "".join(v for (k, v) in self.output if k == "data")
        return "".join(v for (k, v) in self.output)
|
|
|
@ -1,7 +1,7 @@
|
||||||
""" make sure only valid html gets to the app """
|
""" make sure only valid html gets to the app """
|
||||||
from django.test import TestCase
|
from django.test import TestCase
|
||||||
|
|
||||||
from bookwyrm.sanitize_html import InputHtmlParser
|
from bookwyrm.utils.sanitizer import clean
|
||||||
|
|
||||||
|
|
||||||
class Sanitizer(TestCase):
|
class Sanitizer(TestCase):
|
||||||
|
@ -10,53 +10,45 @@ class Sanitizer(TestCase):
|
||||||
def test_no_html(self):
|
def test_no_html(self):
|
||||||
"""just text"""
|
"""just text"""
|
||||||
input_text = "no html "
|
input_text = "no html "
|
||||||
parser = InputHtmlParser()
|
output = clean(input_text)
|
||||||
parser.feed(input_text)
|
|
||||||
output = parser.get_output()
|
|
||||||
self.assertEqual(input_text, output)
|
self.assertEqual(input_text, output)
|
||||||
|
|
||||||
def test_valid_html(self):
|
def test_valid_html(self):
|
||||||
"""leave the html untouched"""
|
"""leave the html untouched"""
|
||||||
input_text = "<b>yes </b> <i>html</i>"
|
input_text = "<b>yes </b> <i>html</i>"
|
||||||
parser = InputHtmlParser()
|
output = clean(input_text)
|
||||||
parser.feed(input_text)
|
|
||||||
output = parser.get_output()
|
|
||||||
self.assertEqual(input_text, output)
|
self.assertEqual(input_text, output)
|
||||||
|
|
||||||
def test_valid_html_attrs(self):
|
def test_valid_html_attrs(self):
|
||||||
"""and don't remove useful attributes"""
|
"""and don't remove useful attributes"""
|
||||||
input_text = '<a href="fish.com">yes </a> <i>html</i>'
|
input_text = '<a href="fish.com">yes </a> <i>html</i>'
|
||||||
parser = InputHtmlParser()
|
output = clean(input_text)
|
||||||
parser.feed(input_text)
|
|
||||||
output = parser.get_output()
|
|
||||||
self.assertEqual(input_text, output)
|
self.assertEqual(input_text, output)
|
||||||
|
|
||||||
def test_valid_html_invalid_attrs(self):
|
def test_valid_html_invalid_attrs(self):
|
||||||
"""do remove un-approved attributes"""
|
"""do remove un-approved attributes"""
|
||||||
input_text = '<a href="fish.com" fish="hello">yes </a> <i>html</i>'
|
input_text = '<a href="fish.com" fish="hello">yes </a> <i>html</i>'
|
||||||
parser = InputHtmlParser()
|
output = clean(input_text)
|
||||||
parser.feed(input_text)
|
|
||||||
output = parser.get_output()
|
|
||||||
self.assertEqual(output, '<a href="fish.com">yes </a> <i>html</i>')
|
self.assertEqual(output, '<a href="fish.com">yes </a> <i>html</i>')
|
||||||
|
|
||||||
def test_invalid_html(self):
|
def test_invalid_html(self):
|
||||||
"""remove all html when the html is malformed"""
|
"""remove all html when the html is malformed"""
|
||||||
input_text = "<b>yes <i>html</i>"
|
input_text = "<b>yes <i>html</i>"
|
||||||
parser = InputHtmlParser()
|
output = clean(input_text)
|
||||||
parser.feed(input_text)
|
|
||||||
output = parser.get_output()
|
|
||||||
self.assertEqual("yes html", output)
|
self.assertEqual("yes html", output)
|
||||||
|
|
||||||
input_text = "yes <i></b>html </i>"
|
input_text = "yes <i></b>html </i>"
|
||||||
parser = InputHtmlParser()
|
output = clean(input_text)
|
||||||
parser.feed(input_text)
|
|
||||||
output = parser.get_output()
|
|
||||||
self.assertEqual("yes html ", output)
|
self.assertEqual("yes html ", output)
|
||||||
|
|
||||||
def test_disallowed_html(self):
|
def test_disallowed_html(self):
|
||||||
"""remove disallowed html but keep allowed html"""
|
"""remove disallowed html but keep allowed html"""
|
||||||
input_text = "<div> yes <i>html</i></div>"
|
input_text = "<div> yes <i>html</i></div>"
|
||||||
parser = InputHtmlParser()
|
output = clean(input_text)
|
||||||
parser.feed(input_text)
|
|
||||||
output = parser.get_output()
|
|
||||||
self.assertEqual(" yes <i>html</i>", output)
|
self.assertEqual(" yes <i>html</i>", output)
|
||||||
|
|
||||||
|
def test_escaped_bracket(self):
    """markup made up only of disallowed tags is reduced to its text"""
    self.assertEqual("hi", clean("<dev>hi</div>"))
|
||||||
|
|
25
bookwyrm/utils/sanitizer.py
Normal file
25
bookwyrm/utils/sanitizer.py
Normal file
|
@ -0,0 +1,25 @@
|
||||||
|
"""Clean user-provided text"""
|
||||||
|
import bleach
|
||||||
|
|
||||||
|
|
||||||
|
def clean(input_text):
    """Sanitize user-provided html by running it through "bleach"

    Only the tags and attributes listed below are kept. Any other markup is
    removed while the text inside it is preserved.

    Args:
        input_text: the untrusted html (or plain text) to sanitize

    Returns:
        the sanitized string
    """
    return bleach.clean(
        input_text,
        tags=[
            "p",
            "blockquote",
            "br",
            "b",
            "i",
            "strong",
            "em",
            "pre",
            "a",
            "span",
            "ul",
            "ol",
            "li",
        ],
        attributes=["href", "rel", "src", "alt"],
        # bleach escapes disallowed tags by default, which would show them
        # to the reader as literal "&lt;div&gt;" text; strip them instead
        strip=True,
    )
|
|
@ -16,9 +16,8 @@ from django.views.decorators.http import require_POST
|
||||||
|
|
||||||
from markdown import markdown
|
from markdown import markdown
|
||||||
from bookwyrm import forms, models
|
from bookwyrm import forms, models
|
||||||
from bookwyrm.sanitize_html import InputHtmlParser
|
|
||||||
from bookwyrm.settings import DOMAIN
|
from bookwyrm.settings import DOMAIN
|
||||||
from bookwyrm.utils import regex
|
from bookwyrm.utils import regex, sanitizer
|
||||||
from .helpers import handle_remote_webfinger, is_api_request
|
from .helpers import handle_remote_webfinger, is_api_request
|
||||||
from .helpers import load_date_in_user_tz_as_utc
|
from .helpers import load_date_in_user_tz_as_utc
|
||||||
|
|
||||||
|
@ -268,6 +267,4 @@ def to_markdown(content):
|
||||||
content = format_links(content)
|
content = format_links(content)
|
||||||
content = markdown(content)
|
content = markdown(content)
|
||||||
# sanitize resulting html
|
# sanitize resulting html
|
||||||
sanitizer = InputHtmlParser()
|
return sanitizer.clean(content)
|
||||||
sanitizer.feed(content)
|
|
||||||
return sanitizer.get_output()
|
|
||||||
|
|
|
@ -1,4 +1,5 @@
|
||||||
aiohttp==3.8.1
|
aiohttp==3.8.1
|
||||||
|
bleach==5.0.1
|
||||||
celery==5.2.2
|
celery==5.2.2
|
||||||
colorthief==0.2.1
|
colorthief==0.2.1
|
||||||
Django==3.2.13
|
Django==3.2.13
|
||||||
|
|
Loading…
Reference in a new issue