From a3c7d324d67fe29202e76fb332b7c328e61faf3c Mon Sep 17 00:00:00 2001 From: Mouse Reeve Date: Wed, 16 Dec 2020 16:47:05 -0800 Subject: [PATCH 1/2] Sanitize incoming html --- .../migrations/0025_auto_20201217_0046.py | 39 +++++++++++++++++++ bookwyrm/models/author.py | 2 +- bookwyrm/models/book.py | 2 +- bookwyrm/models/fields.py | 10 +++++ bookwyrm/models/status.py | 4 +- bookwyrm/models/user.py | 2 +- bookwyrm/sanitize_html.py | 2 +- bookwyrm/tests/test_sanitize_html.py | 12 +++--- 8 files changed, 62 insertions(+), 11 deletions(-) create mode 100644 bookwyrm/migrations/0025_auto_20201217_0046.py diff --git a/bookwyrm/migrations/0025_auto_20201217_0046.py b/bookwyrm/migrations/0025_auto_20201217_0046.py new file mode 100644 index 000000000..a3ffe8c13 --- /dev/null +++ b/bookwyrm/migrations/0025_auto_20201217_0046.py @@ -0,0 +1,39 @@ +# Generated by Django 3.0.7 on 2020-12-17 00:46 + +import bookwyrm.models.fields +from django.db import migrations + + +class Migration(migrations.Migration): + + dependencies = [ + ('bookwyrm', '0024_merge_20201216_1721'), + ] + + operations = [ + migrations.AlterField( + model_name='author', + name='bio', + field=bookwyrm.models.fields.HtmlField(blank=True, null=True), + ), + migrations.AlterField( + model_name='book', + name='description', + field=bookwyrm.models.fields.HtmlField(blank=True, null=True), + ), + migrations.AlterField( + model_name='quotation', + name='quote', + field=bookwyrm.models.fields.HtmlField(), + ), + migrations.AlterField( + model_name='status', + name='content', + field=bookwyrm.models.fields.HtmlField(blank=True, null=True), + ), + migrations.AlterField( + model_name='user', + name='summary', + field=bookwyrm.models.fields.HtmlField(default=''), + ), + ] diff --git a/bookwyrm/models/author.py b/bookwyrm/models/author.py index 331d2dd6f..47714d4ec 100644 --- a/bookwyrm/models/author.py +++ b/bookwyrm/models/author.py @@ -25,7 +25,7 @@ class Author(ActivitypubMixin, BookWyrmModel): aliases = fields.ArrayField( models.CharField(max_length=255), blank=True, default=list ) - bio = fields.TextField(null=True, blank=True) + bio = fields.HtmlField(null=True, blank=True) def save(self, *args, **kwargs): ''' can't be abstract for query reasons, but you shouldn't USE it ''' diff --git a/bookwyrm/models/book.py b/bookwyrm/models/book.py index bcd4bc046..3080a1158 100644 --- a/bookwyrm/models/book.py +++ b/bookwyrm/models/book.py @@ -36,7 +36,7 @@ class Book(ActivitypubMixin, BookWyrmModel): title = fields.CharField(max_length=255) sort_title = fields.CharField(max_length=255, blank=True, null=True) subtitle = fields.CharField(max_length=255, blank=True, null=True) - description = fields.TextField(blank=True, null=True) + description = fields.HtmlField(blank=True, null=True) languages = fields.ArrayField( models.CharField(max_length=255), blank=True, default=list ) diff --git a/bookwyrm/models/fields.py b/bookwyrm/models/fields.py index 5e12f5d56..529337150 100644 --- a/bookwyrm/models/fields.py +++ b/bookwyrm/models/fields.py @@ -12,6 +12,7 @@ from django.db import models from django.utils import timezone from django.utils.translation import gettext_lazy as _ from bookwyrm import activitypub +from bookwyrm.sanitize_html import InputHtmlParser from bookwyrm.settings import DOMAIN from bookwyrm.connectors import get_image @@ -362,6 +363,15 @@ class DateTimeField(ActivitypubFieldMixin, models.DateTimeField): except (ParserError, TypeError): return None +class HtmlField(ActivitypubFieldMixin, models.TextField): + ''' a text field for storing html ''' + def field_from_activity(self, value): + if not value or value == MISSING: + return None + sanitizer = InputHtmlParser() + sanitizer.feed(value) + return sanitizer.get_output() + class ArrayField(ActivitypubFieldMixin, DjangoArrayField): ''' activitypub-aware array field ''' def field_to_activity(self, value): diff --git a/bookwyrm/models/status.py b/bookwyrm/models/status.py index fcf4a2907..66114e7cc 100644 --- a/bookwyrm/models/status.py +++ b/bookwyrm/models/status.py @@ -14,7 +14,7 @@ class Status(OrderedCollectionPageMixin, BookWyrmModel): ''' any post, like a reply to a review, etc ''' user = fields.ForeignKey( 'User', on_delete=models.PROTECT, activitypub_field='attributedTo') - content = fields.TextField(blank=True, null=True) + content = fields.HtmlField(blank=True, null=True) mention_users = fields.TagField('User', related_name='mention_user') mention_books = fields.TagField('Edition', related_name='mention_book') local = models.BooleanField(default=True) @@ -134,7 +134,7 @@ class Comment(Status): class Quotation(Status): ''' like a review but without a rating and transient ''' - quote = fields.TextField() + quote = fields.HtmlField() book = fields.ForeignKey( 'Edition', on_delete=models.PROTECT, activitypub_field='inReplyToBook') diff --git a/bookwyrm/models/user.py b/bookwyrm/models/user.py index 63549d360..9d66eb5cf 100644 --- a/bookwyrm/models/user.py +++ b/bookwyrm/models/user.py @@ -42,7 +42,7 @@ class User(OrderedCollectionPageMixin, AbstractUser): blank=True, ) outbox = fields.RemoteIdField(unique=True) - summary = fields.TextField(default='') + summary = fields.HtmlField(default='') local = models.BooleanField(default=False) bookwyrm_user = fields.BooleanField(default=True) localname = models.CharField( diff --git a/bookwyrm/sanitize_html.py b/bookwyrm/sanitize_html.py index 9c5ca73ac..933fc43c6 100644 --- a/bookwyrm/sanitize_html.py +++ b/bookwyrm/sanitize_html.py @@ -1,7 +1,7 @@ ''' html parser to clean up incoming text from unknown sources ''' from html.parser import HTMLParser -class InputHtmlParser(HTMLParser): +class InputHtmlParser(HTMLParser):#pylint: disable=abstract-method ''' Removes any html that isn't allowed_tagsed from a block ''' def __init__(self): diff --git a/bookwyrm/tests/test_sanitize_html.py b/bookwyrm/tests/test_sanitize_html.py index 3344a9347..58d94311c 100644 --- a/bookwyrm/tests/test_sanitize_html.py +++ b/bookwyrm/tests/test_sanitize_html.py @@ -1,34 +1,36 @@ +''' make sure only valid html gets to the app ''' from django.test import TestCase from bookwyrm.sanitize_html import InputHtmlParser - class Sanitizer(TestCase): + ''' sanitizer tests ''' def test_no_html(self): + ''' just text ''' input_text = 'no html ' parser = InputHtmlParser() parser.feed(input_text) output = parser.get_output() self.assertEqual(input_text, output) - def test_valid_html(self): + ''' leave the html untouched ''' input_text = 'yes html' parser = InputHtmlParser() parser.feed(input_text) output = parser.get_output() self.assertEqual(input_text, output) - def test_valid_html_attrs(self): + ''' and don't remove attributes ''' input_text = 'yes html' parser = InputHtmlParser() parser.feed(input_text) output = parser.get_output() self.assertEqual(input_text, output) - def test_invalid_html(self): + ''' remove all html when the html is malformed ''' input_text = 'yes html' parser = InputHtmlParser() parser.feed(input_text) @@ -41,8 +43,8 @@ class Sanitizer(TestCase): output = parser.get_output() self.assertEqual('yes html ', output) - def test_disallowed_html(self): + ''' remove disallowed html but keep allowed html ''' input_text = '
yes html
' parser = InputHtmlParser() parser.feed(input_text) From 42167af3e9dad8aa040b74f66f4b50c0ea98a77b Mon Sep 17 00:00:00 2001 From: Mouse Reeve Date: Wed, 16 Dec 2020 18:39:18 -0800 Subject: [PATCH 2/2] Tests fro html field --- bookwyrm/tests/models/test_fields.py | 63 +++++++++++++++++++++------- 1 file changed, 48 insertions(+), 15 deletions(-) diff --git a/bookwyrm/tests/models/test_fields.py b/bookwyrm/tests/models/test_fields.py index 8c86b23ce..81b1d528a 100644 --- a/bookwyrm/tests/models/test_fields.py +++ b/bookwyrm/tests/models/test_fields.py @@ -21,31 +21,23 @@ from bookwyrm.activitypub.base_activity import ActivityObject from bookwyrm.models import fields, User, Status from bookwyrm.models.base_model import ActivitypubMixin, BookWyrmModel +#pylint: disable=too-many-public-methods class ActivitypubFields(TestCase): ''' overwrites standard model feilds to work with activitypub ''' def test_validate_remote_id(self): ''' should look like a url ''' - self.assertIsNone(fields.validate_remote_id( - 'http://www.example.com' - )) - self.assertIsNone(fields.validate_remote_id( - 'https://www.example.com' - )) - self.assertIsNone(fields.validate_remote_id( - 'http://example.com/dlfjg-23/x' - )) + self.assertIsNone(fields.validate_remote_id('http://www.example.com')) + self.assertIsNone(fields.validate_remote_id('https://www.example.com')) + self.assertIsNone(fields.validate_remote_id('http://exle.com/dlg-23/x')) self.assertRaises( ValidationError, fields.validate_remote_id, - 'http:/example.com/dlfjg-23/x' - ) + 'http:/example.com/dlfjg-23/x') self.assertRaises( ValidationError, fields.validate_remote_id, - 'www.example.com/dlfjg-23/x' - ) + 'www.example.com/dlfjg-23/x') self.assertRaises( ValidationError, fields.validate_remote_id, - 'http://www.example.com/dlfjg 23/x' - ) + 'http://www.example.com/dlfjg 23/x') def test_activitypub_field_mixin(self): ''' generic mixin with super basic to and from functionality ''' @@ -71,6 +63,38 @@ class ActivitypubFields(TestCase): instance.name = 'snake_case_name' self.assertEqual(instance.get_activitypub_field(), 'snakeCaseName') + def test_set_field_from_activity(self): + ''' setter from entire json blob ''' + @dataclass + class TestModel: + ''' real simple mock ''' + field_name: str + + mock_model = TestModel(field_name='bip') + TestActivity = namedtuple('test', ('fieldName', 'unrelated')) + data = TestActivity(fieldName='hi', unrelated='bfkjh') + + instance = fields.ActivitypubFieldMixin() + instance.name = 'field_name' + + instance.set_field_from_activity(mock_model, data) + self.assertEqual(mock_model.field_name, 'hi') + + def test_set_activity_from_field(self): + ''' set json field given entire model ''' + @dataclass + class TestModel: + ''' real simple mock ''' + field_name: str + unrelated: str + mock_model = TestModel(field_name='bip', unrelated='field') + instance = fields.ActivitypubFieldMixin() + instance.name = 'field_name' + + data = {} + instance.set_activity_from_field(data, mock_model) + self.assertEqual(data['fieldName'], 'bip') + def test_remote_id_field(self): ''' just sets some defaults on charfield ''' instance = fields.RemoteIdField() @@ -408,3 +432,12 @@ class ActivitypubFields(TestCase): ''' idk why it makes them strings but probably for a good reason ''' instance = fields.ArrayField(fields.IntegerField) self.assertEqual(instance.field_to_activity([0, 1]), ['0', '1']) + + + def test_html_field(self): + ''' sanitizes html, the sanitizer has its own tests ''' + instance = fields.HtmlField() + self.assertEqual( + instance.field_from_activity('

hi

'), + '

hi

' + )