Merge pull request #394 from mouse-reeve/sanitize-html

Sanitize html
This commit is contained in:
Mouse Reeve 2020-12-16 19:12:37 -08:00 committed by GitHub
commit f0767de363
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
9 changed files with 110 additions and 26 deletions

View file

@ -0,0 +1,39 @@
# Generated by Django 3.0.7 on 2020-12-17 00:46
import bookwyrm.models.fields
from django.db import migrations
class Migration(migrations.Migration):
dependencies = [
('bookwyrm', '0024_merge_20201216_1721'),
]
operations = [
migrations.AlterField(
model_name='author',
name='bio',
field=bookwyrm.models.fields.HtmlField(blank=True, null=True),
),
migrations.AlterField(
model_name='book',
name='description',
field=bookwyrm.models.fields.HtmlField(blank=True, null=True),
),
migrations.AlterField(
model_name='quotation',
name='quote',
field=bookwyrm.models.fields.HtmlField(),
),
migrations.AlterField(
model_name='status',
name='content',
field=bookwyrm.models.fields.HtmlField(blank=True, null=True),
),
migrations.AlterField(
model_name='user',
name='summary',
field=bookwyrm.models.fields.HtmlField(default=''),
),
]

View file

@ -25,7 +25,7 @@ class Author(ActivitypubMixin, BookWyrmModel):
aliases = fields.ArrayField( aliases = fields.ArrayField(
models.CharField(max_length=255), blank=True, default=list models.CharField(max_length=255), blank=True, default=list
) )
bio = fields.TextField(null=True, blank=True) bio = fields.HtmlField(null=True, blank=True)
def save(self, *args, **kwargs): def save(self, *args, **kwargs):
''' can't be abstract for query reasons, but you shouldn't USE it ''' ''' can't be abstract for query reasons, but you shouldn't USE it '''

View file

@ -36,7 +36,7 @@ class Book(ActivitypubMixin, BookWyrmModel):
title = fields.CharField(max_length=255) title = fields.CharField(max_length=255)
sort_title = fields.CharField(max_length=255, blank=True, null=True) sort_title = fields.CharField(max_length=255, blank=True, null=True)
subtitle = fields.CharField(max_length=255, blank=True, null=True) subtitle = fields.CharField(max_length=255, blank=True, null=True)
description = fields.TextField(blank=True, null=True) description = fields.HtmlField(blank=True, null=True)
languages = fields.ArrayField( languages = fields.ArrayField(
models.CharField(max_length=255), blank=True, default=list models.CharField(max_length=255), blank=True, default=list
) )

View file

@ -12,6 +12,7 @@ from django.db import models
from django.utils import timezone from django.utils import timezone
from django.utils.translation import gettext_lazy as _ from django.utils.translation import gettext_lazy as _
from bookwyrm import activitypub from bookwyrm import activitypub
from bookwyrm.sanitize_html import InputHtmlParser
from bookwyrm.settings import DOMAIN from bookwyrm.settings import DOMAIN
from bookwyrm.connectors import get_image from bookwyrm.connectors import get_image
@ -362,6 +363,15 @@ class DateTimeField(ActivitypubFieldMixin, models.DateTimeField):
except (ParserError, TypeError): except (ParserError, TypeError):
return None return None
class HtmlField(ActivitypubFieldMixin, models.TextField):
''' a text field for storing html '''
def field_from_activity(self, value):
if not value or value == MISSING:
return None
sanitizer = InputHtmlParser()
sanitizer.feed(value)
return sanitizer.get_output()
class ArrayField(ActivitypubFieldMixin, DjangoArrayField): class ArrayField(ActivitypubFieldMixin, DjangoArrayField):
''' activitypub-aware array field ''' ''' activitypub-aware array field '''
def field_to_activity(self, value): def field_to_activity(self, value):

View file

@ -14,7 +14,7 @@ class Status(OrderedCollectionPageMixin, BookWyrmModel):
''' any post, like a reply to a review, etc ''' ''' any post, like a reply to a review, etc '''
user = fields.ForeignKey( user = fields.ForeignKey(
'User', on_delete=models.PROTECT, activitypub_field='attributedTo') 'User', on_delete=models.PROTECT, activitypub_field='attributedTo')
content = fields.TextField(blank=True, null=True) content = fields.HtmlField(blank=True, null=True)
mention_users = fields.TagField('User', related_name='mention_user') mention_users = fields.TagField('User', related_name='mention_user')
mention_books = fields.TagField('Edition', related_name='mention_book') mention_books = fields.TagField('Edition', related_name='mention_book')
local = models.BooleanField(default=True) local = models.BooleanField(default=True)
@ -134,7 +134,7 @@ class Comment(Status):
class Quotation(Status): class Quotation(Status):
''' like a review but without a rating and transient ''' ''' like a review but without a rating and transient '''
quote = fields.TextField() quote = fields.HtmlField()
book = fields.ForeignKey( book = fields.ForeignKey(
'Edition', on_delete=models.PROTECT, activitypub_field='inReplyToBook') 'Edition', on_delete=models.PROTECT, activitypub_field='inReplyToBook')

View file

@ -42,7 +42,7 @@ class User(OrderedCollectionPageMixin, AbstractUser):
blank=True, blank=True,
) )
outbox = fields.RemoteIdField(unique=True) outbox = fields.RemoteIdField(unique=True)
summary = fields.TextField(default='') summary = fields.HtmlField(default='')
local = models.BooleanField(default=False) local = models.BooleanField(default=False)
bookwyrm_user = fields.BooleanField(default=True) bookwyrm_user = fields.BooleanField(default=True)
localname = models.CharField( localname = models.CharField(

View file

@ -1,7 +1,7 @@
''' html parser to clean up incoming text from unknown sources ''' ''' html parser to clean up incoming text from unknown sources '''
from html.parser import HTMLParser from html.parser import HTMLParser
class InputHtmlParser(HTMLParser): class InputHtmlParser(HTMLParser):#pylint: disable=abstract-method
''' Removes any html that isn't allowed_tagsed from a block ''' ''' Removes any html that isn't allowed_tagsed from a block '''
def __init__(self): def __init__(self):

View file

@ -21,31 +21,23 @@ from bookwyrm.activitypub.base_activity import ActivityObject
from bookwyrm.models import fields, User, Status from bookwyrm.models import fields, User, Status
from bookwyrm.models.base_model import ActivitypubMixin, BookWyrmModel from bookwyrm.models.base_model import ActivitypubMixin, BookWyrmModel
#pylint: disable=too-many-public-methods
class ActivitypubFields(TestCase): class ActivitypubFields(TestCase):
''' overwrites standard model feilds to work with activitypub ''' ''' overwrites standard model feilds to work with activitypub '''
def test_validate_remote_id(self): def test_validate_remote_id(self):
''' should look like a url ''' ''' should look like a url '''
self.assertIsNone(fields.validate_remote_id( self.assertIsNone(fields.validate_remote_id('http://www.example.com'))
'http://www.example.com' self.assertIsNone(fields.validate_remote_id('https://www.example.com'))
)) self.assertIsNone(fields.validate_remote_id('http://exle.com/dlg-23/x'))
self.assertIsNone(fields.validate_remote_id(
'https://www.example.com'
))
self.assertIsNone(fields.validate_remote_id(
'http://example.com/dlfjg-23/x'
))
self.assertRaises( self.assertRaises(
ValidationError, fields.validate_remote_id, ValidationError, fields.validate_remote_id,
'http:/example.com/dlfjg-23/x' 'http:/example.com/dlfjg-23/x')
)
self.assertRaises( self.assertRaises(
ValidationError, fields.validate_remote_id, ValidationError, fields.validate_remote_id,
'www.example.com/dlfjg-23/x' 'www.example.com/dlfjg-23/x')
)
self.assertRaises( self.assertRaises(
ValidationError, fields.validate_remote_id, ValidationError, fields.validate_remote_id,
'http://www.example.com/dlfjg 23/x' 'http://www.example.com/dlfjg 23/x')
)
def test_activitypub_field_mixin(self): def test_activitypub_field_mixin(self):
''' generic mixin with super basic to and from functionality ''' ''' generic mixin with super basic to and from functionality '''
@ -71,6 +63,38 @@ class ActivitypubFields(TestCase):
instance.name = 'snake_case_name' instance.name = 'snake_case_name'
self.assertEqual(instance.get_activitypub_field(), 'snakeCaseName') self.assertEqual(instance.get_activitypub_field(), 'snakeCaseName')
def test_set_field_from_activity(self):
''' setter from entire json blob '''
@dataclass
class TestModel:
''' real simple mock '''
field_name: str
mock_model = TestModel(field_name='bip')
TestActivity = namedtuple('test', ('fieldName', 'unrelated'))
data = TestActivity(fieldName='hi', unrelated='bfkjh')
instance = fields.ActivitypubFieldMixin()
instance.name = 'field_name'
instance.set_field_from_activity(mock_model, data)
self.assertEqual(mock_model.field_name, 'hi')
def test_set_activity_from_field(self):
''' set json field given entire model '''
@dataclass
class TestModel:
''' real simple mock '''
field_name: str
unrelated: str
mock_model = TestModel(field_name='bip', unrelated='field')
instance = fields.ActivitypubFieldMixin()
instance.name = 'field_name'
data = {}
instance.set_activity_from_field(data, mock_model)
self.assertEqual(data['fieldName'], 'bip')
def test_remote_id_field(self): def test_remote_id_field(self):
''' just sets some defaults on charfield ''' ''' just sets some defaults on charfield '''
instance = fields.RemoteIdField() instance = fields.RemoteIdField()
@ -408,3 +432,12 @@ class ActivitypubFields(TestCase):
''' idk why it makes them strings but probably for a good reason ''' ''' idk why it makes them strings but probably for a good reason '''
instance = fields.ArrayField(fields.IntegerField) instance = fields.ArrayField(fields.IntegerField)
self.assertEqual(instance.field_to_activity([0, 1]), ['0', '1']) self.assertEqual(instance.field_to_activity([0, 1]), ['0', '1'])
def test_html_field(self):
''' sanitizes html, the sanitizer has its own tests '''
instance = fields.HtmlField()
self.assertEqual(
instance.field_from_activity('<marquee><p>hi</p></marquee>'),
'<p>hi</p>'
)

View file

@ -1,34 +1,36 @@
''' make sure only valid html gets to the app '''
from django.test import TestCase from django.test import TestCase
from bookwyrm.sanitize_html import InputHtmlParser from bookwyrm.sanitize_html import InputHtmlParser
class Sanitizer(TestCase): class Sanitizer(TestCase):
''' sanitizer tests '''
def test_no_html(self): def test_no_html(self):
''' just text '''
input_text = 'no html ' input_text = 'no html '
parser = InputHtmlParser() parser = InputHtmlParser()
parser.feed(input_text) parser.feed(input_text)
output = parser.get_output() output = parser.get_output()
self.assertEqual(input_text, output) self.assertEqual(input_text, output)
def test_valid_html(self): def test_valid_html(self):
''' leave the html untouched '''
input_text = '<b>yes </b> <i>html</i>' input_text = '<b>yes </b> <i>html</i>'
parser = InputHtmlParser() parser = InputHtmlParser()
parser.feed(input_text) parser.feed(input_text)
output = parser.get_output() output = parser.get_output()
self.assertEqual(input_text, output) self.assertEqual(input_text, output)
def test_valid_html_attrs(self): def test_valid_html_attrs(self):
''' and don't remove attributes '''
input_text = '<a href="fish.com">yes </a> <i>html</i>' input_text = '<a href="fish.com">yes </a> <i>html</i>'
parser = InputHtmlParser() parser = InputHtmlParser()
parser.feed(input_text) parser.feed(input_text)
output = parser.get_output() output = parser.get_output()
self.assertEqual(input_text, output) self.assertEqual(input_text, output)
def test_invalid_html(self): def test_invalid_html(self):
''' remove all html when the html is malformed '''
input_text = '<b>yes <i>html</i>' input_text = '<b>yes <i>html</i>'
parser = InputHtmlParser() parser = InputHtmlParser()
parser.feed(input_text) parser.feed(input_text)
@ -41,8 +43,8 @@ class Sanitizer(TestCase):
output = parser.get_output() output = parser.get_output()
self.assertEqual('yes html ', output) self.assertEqual('yes html ', output)
def test_disallowed_html(self): def test_disallowed_html(self):
''' remove disallowed html but keep allowed html '''
input_text = '<div> yes <i>html</i></div>' input_text = '<div> yes <i>html</i></div>'
parser = InputHtmlParser() parser = InputHtmlParser()
parser.feed(input_text) parser.feed(input_text)