From 78d2283458b5c4a300f381171f624cd853deac01 Mon Sep 17 00:00:00 2001 From: Andrew Godwin Date: Tue, 20 Dec 2022 13:10:35 +0000 Subject: [PATCH] Move linkifying to all http-prefixed links --- core/html.py | 22 ++++++++++++++++++++-- tests/core/test_html.py | 12 ++++++++++++ 2 files changed, 32 insertions(+), 2 deletions(-) diff --git a/core/html.py b/core/html.py index 5af476e..fc5cde7 100644 --- a/core/html.py +++ b/core/html.py @@ -1,7 +1,21 @@ +import re +from functools import partial + import bleach from bleach.linkifier import LinkifyFilter from django.utils.safestring import mark_safe +url_regex = re.compile( + r"""\(* # Match any opening parentheses. + \b(?<![@.])(?:https?://(?:(?:\w+:)?\w+@)?) # http:// + ([\w-]+\.)+(?:[\w-]+)(?:\:[0-9]+)?(?!\.\w)\b # xx.yy.tld(:##)? + (?:[/?][^\s\{{\}}\|\\\^\[\]`<>"]*)? + # /path/zz (excluding "unsafe" chars from RFC 1738, + # except for # and ~, which happen in practice) + """, + re.IGNORECASE | re.VERBOSE | re.UNICODE, +) + def allow_a(tag: str, name: str, value: str): if name in ["href", "title", "class"]: @@ -26,7 +40,7 @@ def sanitize_html(post_html: str) -> str: "p": ["class"], "span": ["class"], }, - filters=[LinkifyFilter], + filters=[partial(LinkifyFilter, url_re=url_regex)], strip=True, ) return mark_safe(cleaner.clean(post_html)) @@ -36,7 +50,11 @@ def strip_html(post_html: str) -> str: """ Strips all tags from the text, then linkifies it. """ - cleaner = bleach.Cleaner(tags=[], strip=True, filters=[LinkifyFilter]) + cleaner = bleach.Cleaner( + tags=[], + strip=True, + filters=[partial(LinkifyFilter, url_re=url_regex)], + ) return mark_safe(cleaner.clean(post_html)) diff --git a/tests/core/test_html.py b/tests/core/test_html.py index 5d798ac..ff28305 100644 --- a/tests/core/test_html.py +++ b/tests/core/test_html.py @@ -19,3 +19,15 @@ def test_sanitize_post(): assert sanitize_html("

<p>Hello!</p>

") == "

<p>Hello!</p>

" assert sanitize_html("

<p>It's great</p>

") == "

<p>It's great</p>

" + + # Note that we only want to linkify things with protocol prefixes to prevent + # too many false positives. + assert sanitize_html("

<p>test.com</p>

") == "

<p>test.com</p>

" + assert ( + sanitize_html("

<p>https://test.com</p>

") + == '

<p><a href="https://test.com" rel="nofollow">https://test.com</a></p>

' + ) + assert ( + sanitize_html("

<p>@someone@subdomain.some-domain.com</p>

") + == "

<p>@someone@subdomain.some-domain.com</p>

" + )