Move linkifying to all http-prefixed links

This commit is contained in:
Andrew Godwin 2022-12-20 13:10:35 +00:00
parent 202046247c
commit 78d2283458
2 changed files with 32 additions and 2 deletions

View file

@ -1,7 +1,21 @@
import re
from functools import partial
import bleach import bleach
from bleach.linkifier import LinkifyFilter from bleach.linkifier import LinkifyFilter
from django.utils.safestring import mark_safe from django.utils.safestring import mark_safe
# URL pattern handed to bleach's LinkifyFilter (as ``url_re``) by the cleaners
# in this file. The scheme group is mandatory, so only text that starts with
# an explicit http:// or https:// is matched; a bare domain such as "test.com"
# does not match.
#
# Pieces of the pattern, in order (the pattern itself is annotated inline,
# which works because it is compiled with re.VERBOSE):
#   * any run of leading "(" characters;
#   * the scheme, optionally followed by "user@" or "user:pass@" credentials;
#     the lookbehind rejects a scheme immediately preceded by "@" or ".";
#   * one or more dot-terminated host labels, a final label and an optional
#     ":port"; the negative lookahead refuses to end the host where another
#     ".label" follows;
#   * an optional path/query starting with "/" or "?" and running until
#     whitespace or one of { } | \ ^ [ ] ` < > ".
#
# NOTE(review): the doubled braces in the character class look like leftovers
# from a str.format() template; inside a character class the repetition is
# redundant but harmless.
url_regex = re.compile(
r"""\(* # Match any opening parentheses.
\b(?<![@.])(?:https?://(?:(?:\w+:)?\w+@)?) # http://
([\w-]+\.)+(?:[\w-]+)(?:\:[0-9]+)?(?!\.\w)\b # xx.yy.tld(:##)?
(?:[/?][^\s\{{\}}\|\\\^\[\]`<>"]*)?
# /path/zz (excluding "unsafe" chars from RFC 1738,
# except for # and ~, which happen in practice)
""",
# VERBOSE is required: the pattern above contains layout whitespace and
# "#" comments that must be ignored by the regex engine.
re.IGNORECASE | re.VERBOSE | re.UNICODE,
)
def allow_a(tag: str, name: str, value: str): def allow_a(tag: str, name: str, value: str):
if name in ["href", "title", "class"]: if name in ["href", "title", "class"]:
@ -26,7 +40,7 @@ def sanitize_html(post_html: str) -> str:
"p": ["class"], "p": ["class"],
"span": ["class"], "span": ["class"],
}, },
filters=[LinkifyFilter], filters=[partial(LinkifyFilter, url_re=url_regex)],
strip=True, strip=True,
) )
return mark_safe(cleaner.clean(post_html)) return mark_safe(cleaner.clean(post_html))
@ -36,7 +50,11 @@ def strip_html(post_html: str) -> str:
""" """
Strips all tags from the text, then linkifies it. Strips all tags from the text, then linkifies it.
""" """
cleaner = bleach.Cleaner(tags=[], strip=True, filters=[LinkifyFilter]) cleaner = bleach.Cleaner(
tags=[],
strip=True,
filters=[partial(LinkifyFilter, url_re=url_regex)],
)
return mark_safe(cleaner.clean(post_html)) return mark_safe(cleaner.clean(post_html))

View file

@ -19,3 +19,15 @@ def test_sanitize_post():
assert sanitize_html("<p>Hello!</p>") == "<p>Hello!</p>" assert sanitize_html("<p>Hello!</p>") == "<p>Hello!</p>"
assert sanitize_html("<p>It&#39;s great</p>") == "<p>It&#39;s great</p>" assert sanitize_html("<p>It&#39;s great</p>") == "<p>It&#39;s great</p>"
# Note that we only want to linkify things with protocol prefixes to prevent
# too many false positives.
assert sanitize_html("<p>test.com</p>") == "<p>test.com</p>"
assert (
sanitize_html("<p>https://test.com</p>")
== '<p><a href="https://test.com" rel="nofollow">https://test.com</a></p>'
)
assert (
sanitize_html("<p>@someone@subdomain.some-domain.com</p>")
== "<p>@someone@subdomain.some-domain.com</p>"
)