Sanitize incoming html

2020-12-16 16:47:05 -08:00 · 2020-12-16 16:47:05 -08:00 · a3c7d324d6
commit a3c7d324d6
parent d79a756813
8 changed files with 62 additions and 11 deletions
--- a/bookwyrm/migrations/0025_auto_20201217_0046.py
+++ b/bookwyrm/migrations/0025_auto_20201217_0046.py
@ -0,0 +1,39 @@
+# Generated by Django 3.0.7 on 2020-12-17 00:46
+
+import bookwyrm.models.fields
+from django.db import migrations
+
+
+class Migration(migrations.Migration):
+    ''' switch bio, description, quote, content and summary to HtmlField '''
+    dependencies = [
+        ('bookwyrm', '0024_merge_20201216_1721'),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name='author',
+            name='bio',
+            field=bookwyrm.models.fields.HtmlField(blank=True, null=True),
+        ),
+        migrations.AlterField(
+            model_name='book',
+            name='description',
+            field=bookwyrm.models.fields.HtmlField(blank=True, null=True),
+        ),
+        migrations.AlterField(
+            model_name='quotation',
+            name='quote',
+            field=bookwyrm.models.fields.HtmlField(),
+        ),
+        migrations.AlterField(
+            model_name='status',
+            name='content',
+            field=bookwyrm.models.fields.HtmlField(blank=True, null=True),
+        ),
+        migrations.AlterField(
+            model_name='user',
+            name='summary',
+            field=bookwyrm.models.fields.HtmlField(default=''),
+        ),
+    ]
--- a/bookwyrm/models/author.py
+++ b/bookwyrm/models/author.py
@ -25,7 +25,7 @@ class Author(ActivitypubMixin, BookWyrmModel):
    aliases = fields.ArrayField(
        models.CharField(max_length=255), blank=True, default=list
    )
-    bio = fields.TextField(null=True, blank=True)
+    bio = fields.HtmlField(null=True, blank=True)

    def save(self, *args, **kwargs):
        ''' can't be abstract for query reasons, but you shouldn't USE it '''
--- a/bookwyrm/models/book.py
+++ b/bookwyrm/models/book.py
@ -36,7 +36,7 @@ class Book(ActivitypubMixin, BookWyrmModel):
    title = fields.CharField(max_length=255)
    sort_title = fields.CharField(max_length=255, blank=True, null=True)
    subtitle = fields.CharField(max_length=255, blank=True, null=True)
-    description = fields.TextField(blank=True, null=True)
+    description = fields.HtmlField(blank=True, null=True)
    languages = fields.ArrayField(
        models.CharField(max_length=255), blank=True, default=list
    )
--- a/bookwyrm/models/fields.py
+++ b/bookwyrm/models/fields.py
@ -12,6 +12,7 @@ from django.db import models
 from django.utils import timezone
 from django.utils.translation import gettext_lazy as _
 from bookwyrm import activitypub
+from bookwyrm.sanitize_html import InputHtmlParser
 from bookwyrm.settings import DOMAIN
 from bookwyrm.connectors import get_image

@ -362,6 +363,15 @@ class DateTimeField(ActivitypubFieldMixin, models.DateTimeField):
        except (ParserError, TypeError):
            return None

+class HtmlField(ActivitypubFieldMixin, models.TextField):
+    ''' a text field for html; field_from_activity runs InputHtmlParser '''
+    def field_from_activity(self, value):
+        if not value or value == MISSING:  # nothing to parse, give back None
+            return None
+        sanitizer = InputHtmlParser()
+        sanitizer.feed(value)
+        return sanitizer.get_output()
+
 class ArrayField(ActivitypubFieldMixin, DjangoArrayField):
    ''' activitypub-aware array field '''
    def field_to_activity(self, value):
--- a/bookwyrm/models/status.py
+++ b/bookwyrm/models/status.py
@ -14,7 +14,7 @@ class Status(OrderedCollectionPageMixin, BookWyrmModel):
    ''' any post, like a reply to a review, etc '''
    user = fields.ForeignKey(
        'User', on_delete=models.PROTECT, activitypub_field='attributedTo')
-    content = fields.TextField(blank=True, null=True)
+    content = fields.HtmlField(blank=True, null=True)
    mention_users = fields.TagField('User', related_name='mention_user')
    mention_books = fields.TagField('Edition', related_name='mention_book')
    local = models.BooleanField(default=True)
@ -134,7 +134,7 @@ class Comment(Status):

 class Quotation(Status):
    ''' like a review but without a rating and transient '''
-    quote = fields.TextField()
+    quote = fields.HtmlField()
    book = fields.ForeignKey(
        'Edition', on_delete=models.PROTECT, activitypub_field='inReplyToBook')

--- a/bookwyrm/models/user.py
+++ b/bookwyrm/models/user.py
@ -42,7 +42,7 @@ class User(OrderedCollectionPageMixin, AbstractUser):
        blank=True,
    )
    outbox = fields.RemoteIdField(unique=True)
-    summary = fields.TextField(default='')
+    summary = fields.HtmlField(default='')
    local = models.BooleanField(default=False)
    bookwyrm_user = fields.BooleanField(default=True)
    localname = models.CharField(
--- a/bookwyrm/sanitize_html.py
+++ b/bookwyrm/sanitize_html.py
@ -1,7 +1,7 @@
 ''' html parser to clean up incoming text from unknown sources '''
 from html.parser import HTMLParser

-class InputHtmlParser(HTMLParser):
+class InputHtmlParser(HTMLParser):#pylint: disable=abstract-method
    ''' Removes any html that isn't allowed_tagsed from a block '''

    def __init__(self):
--- a/bookwyrm/tests/test_sanitize_html.py
+++ b/bookwyrm/tests/test_sanitize_html.py
@ -1,34 +1,36 @@
+''' make sure only valid html gets to the app '''
 from django.test import TestCase

 from bookwyrm.sanitize_html import InputHtmlParser

-
 class Sanitizer(TestCase):
+    ''' sanitizer tests '''
    def test_no_html(self):
+        ''' text without any tags comes back unchanged '''
        input_text = 'no      html  '
        parser = InputHtmlParser()
        parser.feed(input_text)
        output = parser.get_output()
        self.assertEqual(input_text, output)

-
    def test_valid_html(self):
+        ''' well-formed <b> and <i> markup comes back unchanged '''
        input_text = '<b>yes    </b> <i>html</i>'
        parser = InputHtmlParser()
        parser.feed(input_text)
        output = parser.get_output()
        self.assertEqual(input_text, output)

-
    def test_valid_html_attrs(self):
+        ''' an <a> tag keeps its href attribute '''
        input_text = '<a href="fish.com">yes    </a> <i>html</i>'
        parser = InputHtmlParser()
        parser.feed(input_text)
        output = parser.get_output()
        self.assertEqual(input_text, output)

-
    def test_invalid_html(self):
+        ''' remove all html when the html is malformed '''
        input_text = '<b>yes  <i>html</i>'
        parser = InputHtmlParser()
        parser.feed(input_text)
@ -41,8 +43,8 @@ class Sanitizer(TestCase):
        output = parser.get_output()
        self.assertEqual('yes html   ', output)

-
    def test_disallowed_html(self):
+        ''' remove disallowed html but keep allowed html '''
        input_text = '<div>  yes <i>html</i></div>'
        parser = InputHtmlParser()
        parser.feed(input_text)