format_links: refactor; support multiple punctuation

2025-06-04 23:08:48 +00:00 · 2023-10-09 21:41:22 -03:00 · 2023-10-09 21:41:22 -03:00 · 294788aa1a
commit 294788aa1a
parent 17d741039c
2 changed files with 28 additions and 47 deletions
--- a/bookwyrm/tests/views/test_status.py
+++ b/bookwyrm/tests/views/test_status.py
@@ -427,6 +427,10 @@ http://www.fish.com/"""
            views.status.format_links(f"{url}."),
            f'<a href="{url}">www.fish.com/</a>.',
        )
+        self.assertEqual(
+            views.status.format_links(f"{url}!?!"),
+            f'<a href="{url}">www.fish.com/</a>!?!',
+        )

    def test_format_links_punctuation_parens(self, *_):
        """ignore trailing punctuation and brackets combined"""
--- a/bookwyrm/views/status.py
+++ b/bookwyrm/views/status.py
@@ -1,7 +1,6 @@
 """ what are we here for if not for posting """
 import re
 import logging
-from urllib.parse import urlparse

 from django.contrib.auth.decorators import login_required
 from django.core.validators import URLValidator
@@ -297,67 +296,45 @@ def find_or_create_hashtags(content):

 def format_links(content):
    """detect and format links"""
-    validator = URLValidator()
-    formatted_content = ""
+    validator = URLValidator(["http", "https"])
+    schema_re = re.compile(r"\bhttps?://")
    split_content = re.split(r"(\s+)", content)

-    for potential_link in split_content:
-        if not potential_link:
+    for i, potential_link in enumerate(split_content):
+        if not schema_re.search(potential_link):
            continue

-        # FIXME: allow for multiple punctuation characters, e.g. `...` and `!?`.
-        ends_with_punctuation = _ends_with_punctuation(potential_link)
-        if ends_with_punctuation:
-            punctuation_glyph = potential_link[-1]
-            potential_link = potential_link[0:-1]
-
-        wrapped = _wrapped(potential_link)
-        if wrapped:
-            wrapper_close = potential_link[-1]
-            formatted_content += potential_link[0]
-            potential_link = potential_link[1:-1]
-
+        # Strip surrounding brackets and trailing punctuation.
+        prefix, potential_link, suffix = _unwrap(potential_link)
        try:
            # raises an error on anything that's not a valid link
            validator(potential_link)

            # use everything but the scheme in the presentation of the link
-            url = urlparse(potential_link)
-            link = url.netloc + url.path + url.params
-            if url.query != "":
-                link += "?" + url.query
-            if url.fragment != "":
-                link += "#" + url.fragment
-
-            formatted_content += f'<a href="{potential_link}">{link}</a>'
+            link = schema_re.sub("", potential_link)
+            split_content[i] = f'{prefix}<a href="{potential_link}">{link}</a>{suffix}'
        except (ValidationError, UnicodeError):
-            formatted_content += potential_link
+            pass

-        if wrapped:
-            formatted_content += wrapper_close
-
-        if ends_with_punctuation:
-            formatted_content += punctuation_glyph
-
-    return formatted_content
+    return "".join(split_content)


-def _wrapped(text):
-    """check if a line of text is wrapped"""
-    wrappers = ["()", "[]", "{}"]
-    for wrapper in wrappers:
+def _unwrap(text):
+    """split surrounding brackets and trailing punctuation from a string of text"""
+    punct = re.compile(r'([.,;:!?"’”»]+)\Z')
+    prefix = suffix = ""
+
+    if punct.search(text):
+        # Move punctuation to suffix segment.
+        text, suffix, _ = punct.split(text)
+
+    for wrapper in ("()", "[]", "{}"):
        if text[0] == wrapper[0] and text[-1] == wrapper[-1]:
-            return True
-    return False
+            # Split out wrapping chars.
+            suffix = text[-1] + suffix
+            prefix, text = text[:1], text[1:-1]

-
-def _ends_with_punctuation(text):
-    """check if a line of text ends with a punctuation glyph"""
-    glyphs = [".", ",", ";", ":", "!", "?", "”", "’", '"', "»"]
-    for glyph in glyphs:
-        if text[-1] == glyph:
-            return True
-    return False
+    return prefix, text, suffix


 def to_markdown(content):