Merge pull request #3027 from dato/find_links_wrapped_punct

Fix parsing of punctuation in format_links() fixes #2993 fixes #3049
2024-06-02 21:39:23 +00:00 · 2023-11-06 09:42:57 +11:00 · 2023-11-06 09:42:57 +11:00 · a93519ec3e
parent 1190ea7e69 afad39bf80
commit a93519ec3e
2 changed files with 54 additions and 58 deletions
--- a/bookwyrm/tests/views/test_status.py
+++ b/bookwyrm/tests/views/test_status.py
@ -420,21 +420,25 @@ http://www.fish.com/"""
            'okay\n\n<a href="http://www.fish.com/">www.fish.com/</a>',
        )
    def test_format_links_parens(self, *_):
        """a url wrapped in parentheses is linked, the parentheses stay outside"""
        url = "http://www.fish.com/"
        # The brackets must end up around the anchor tag, not inside the href.
        expected = f'(<a href="{url}">www.fish.com/</a>)'
        formatted = views.status.format_links(f"({url})")
        self.assertEqual(formatted, expected)
    def test_format_links_punctuation(self, *_):
        """test many combinations of brackets, URLs, and punctuation"""
        url = "https://bookwyrm.social"
        html = f'<a href="{url}">bookwyrm.social</a>'
        # Each row is (sub-test description, input text, expected output).
        test_table = [
            ("punct", f"text and {url}.", f"text and {html}."),
            ("multi_punct", f"text, then {url}?...", f"text, then {html}?..."),
            ("bracket_punct", f"here ({url}).", f"here ({html})."),
            ("punct_bracket", f"there [{url}?]", f"there [{html}?]"),
            ("punct_bracket_punct", f"not here? ({url}!).", f"not here? ({html}!)."),
            (
                "multi_punct_bracket",
                f"not there ({url}...);",
                f"not there ({html}...);",
            ),
        ]
        for desc, text, output in test_table:
            # subTest keeps going after a failing row and names it in the report.
            with self.subTest(desc=desc):
                self.assertEqual(views.status.format_links(text), output)
    def test_format_links_special_chars(self, *_):
        """find and format urls into a tags"""
@ -464,6 +468,13 @@ http://www.fish.com/"""
            views.status.format_links(url), f'<a href="{url}">{url[8:]}</a>'
        )
    def test_format_links_ignore_non_urls(self, *_):
        """formatting links should leave plain text untouched"""
        plain_texts = (
            "> “The distinction is significant.” [...]",  # bookwyrm#2993
            "some kind of gene-editing technology (?)",  # bookwyrm#3049
        )
        # Neither sample contains a URL, so each must come back unchanged.
        for plain_text in plain_texts:
            self.assertEqual(views.status.format_links(plain_text), plain_text)
    def test_format_mentions_with_at_symbol_links(self, *_):
        """A link with an @username shouldn't treat the username as a mention"""
        content = "a link to https://example.com/user/@mouse"
--- a/bookwyrm/views/status.py
+++ b/bookwyrm/views/status.py
@ -1,7 +1,6 @@
 """ what are we here for if not for posting """
 import re
 import logging
 from urllib.parse import urlparse
 from django.contrib.auth.decorators import login_required
 from django.core.validators import URLValidator
@ -297,65 +296,51 @@ def find_or_create_hashtags(content):
 def format_links(content):
    """detect and format links

    Splits ``content`` on runs of whitespace (keeping the whitespace), and
    replaces every token that contains an ``http://`` or ``https://`` URL
    with an ``<a>`` tag. Brackets wrapped around the URL and punctuation
    trailing it are kept outside the tag. Tokens that do not validate as a
    URL are returned exactly as they were written.
    """
    validator = URLValidator(schemes=["http", "https"])
    schema_re = re.compile(r"\bhttps?://")
    # The capture group makes re.split() keep the separators, so joining
    # the list back together restores the original whitespace.
    split_content = re.split(r"(\s+)", content)

    for i, potential_link in enumerate(split_content):
        # Cheap pre-filter: whitespace and plain words are never links.
        if not schema_re.search(potential_link):
            continue

        # Strip surrounding brackets and trailing punctuation.
        prefix, potential_link, suffix = _unwrap(potential_link)
        try:
            # raises an error on anything that's not a valid link
            validator(potential_link)
        except (ValidationError, UnicodeError):
            # not a link after all: leave the token as it was
            continue

        # Use everything but the scheme in the presentation of the link;
        # count=1 keeps any URL embedded further along the path intact.
        link = schema_re.sub("", potential_link, count=1)
        # NOTE(review): the URL is interpolated into HTML unescaped here;
        # presumably the caller sanitizes the result -- confirm.
        split_content[i] = f'{prefix}<a href="{potential_link}">{link}</a>{suffix}'

    return "".join(split_content)
def _unwrap(text):
    """split surrounding brackets and trailing punctuation from a string of text

    Returns a ``(prefix, text, suffix)`` tuple: ``prefix`` is the opening
    bracket (or ""), ``suffix`` is the closing bracket together with any
    punctuation found just inside or just after it, and ``text`` is what
    remains. Joining the three parts gives back the input.
    """
    punct = re.compile(r'([.,;:!?"’”»]+)$')
    prefix = suffix = ""

    if punct.search(text):
        # Move punctuation to suffix segment.
        text, suffix, _ = punct.split(text)

    for wrapper in ("()", "[]", "{}"):
        # startswith/endswith rather than indexing: text can be empty here
        # (e.g. the input was only punctuation) and text[0] would raise.
        if text.startswith(wrapper[0]) and text.endswith(wrapper[-1]):
            # Split out wrapping chars.
            suffix = text[-1] + suffix
            prefix, text = text[:1], text[1:-1]
            break  # Nested wrappers not supported atm.

    if punct.search(text):
        # Move inner punctuation to suffix segment.
        text, inner_punct, _ = punct.split(text)
        suffix = inner_punct + suffix

    return prefix, text, suffix
 def to_markdown(content):