Move linkifying to all http-prefixed links

2024-11-26 09:11:00 +00:00 · 2022-12-20 13:10:35 +00:00 · 2022-12-20 13:10:35 +00:00 · 78d2283458
commit 78d2283458
parent 202046247c
2 changed files with 32 additions and 2 deletions
--- a/core/html.py
+++ b/core/html.py
@@ -1,7 +1,21 @@
 import re
 from functools import partial
 import bleach
 from bleach.linkifier import LinkifyFilter
 from django.utils.safestring import mark_safe
 # URL pattern passed to bleach's LinkifyFilter through its `url_re` argument.
 # The `https?://` scheme group below is mandatory, so only text carrying an
 # explicit http:// or https:// prefix (optionally followed by user[:pass]@)
 # matches; bare host names such as "test.com" do not.
 # NOTE(review): the doubled braces `\{{\}}` in the path character class look
 # like a leftover from a str.format() template. Inside a character class the
 # duplicates are harmless -- confirm before simplifying.
 url_regex = re.compile(
    r"""\(*  # Match any opening parentheses.
    \b(?<![@.])(?:https?://(?:(?:\w+:)?\w+@)?)  # http://
    ([\w-]+\.)+(?:[\w-]+)(?:\:[0-9]+)?(?!\.\w)\b   # xx.yy.tld(:##)?
    (?:[/?][^\s\{{\}}\|\\\^\[\]`<>"]*)?
        # /path/zz (excluding "unsafe" chars from RFC 1738,
        # except for # and ~, which happen in practice)
    """,
    re.IGNORECASE | re.VERBOSE | re.UNICODE,
 )
 def allow_a(tag: str, name: str, value: str):
    if name in ["href", "title", "class"]:
@@ -26,7 +40,7 @@ def sanitize_html(post_html: str) -> str:
            "p": ["class"],
            "span": ["class"],
        },
-        filters=[LinkifyFilter],
+        filters=[partial(LinkifyFilter, url_re=url_regex)],
        strip=True,
    )
    return mark_safe(cleaner.clean(post_html))
@@ -36,7 +50,11 @@ def strip_html(post_html: str) -> str:
    """
    Strips all tags from the text, then linkifies it.
    """
-    cleaner = bleach.Cleaner(tags=[], strip=True, filters=[LinkifyFilter])
+    cleaner = bleach.Cleaner(
        tags=[],
        strip=True,
        filters=[partial(LinkifyFilter, url_re=url_regex)],
    )
    return mark_safe(cleaner.clean(post_html))
--- a/tests/core/test_html.py
+++ b/tests/core/test_html.py
@@ -19,3 +19,15 @@ def test_sanitize_post():
    assert sanitize_html("<p>Hello!</p>") == "<p>Hello!</p>"
    assert sanitize_html("<p>It&#39;s great</p>") == "<p>It&#39;s great</p>"
    # Note that we only want to linkify things with protocol prefixes to prevent
    # too many false positives.
    assert sanitize_html("<p>test.com</p>") == "<p>test.com</p>"
    assert (
        sanitize_html("<p>https://test.com</p>")
        == '<p><a href="https://test.com" rel="nofollow">https://test.com</a></p>'
    )
    assert (
        sanitize_html("<p>@someone@subdomain.some-domain.com</p>")
        == "<p>@someone@subdomain.some-domain.com</p>"
    )