From c3c22022f604ec1c934aaf60d4536beaebfdc7be Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adeodato=20Sim=C3=B3?= <dato@users.noreply.github.com>
Date: Mon, 9 Oct 2023 20:54:06 -0300
Subject: [PATCH 1/7] Check for punctuation before checking for wrapping
 parenthesis

This allows to parse `(URL).` correctly, which was not detected
as URL before.
---
 bookwyrm/tests/views/test_status.py |  9 +++++++++
 bookwyrm/views/status.py            | 12 +++++++-----
 2 files changed, 16 insertions(+), 5 deletions(-)
diff --git a/bookwyrm/tests/views/test_status.py b/bookwyrm/tests/views/test_status.py
index 33bd8b53a..6e630a370 100644
--- a/bookwyrm/tests/views/test_status.py
+++ b/bookwyrm/tests/views/test_status.py
@@ -436,6 +436,15 @@ http://www.fish.com/"""
             f'<a href="{url}">www.fish.com/</a>.',
         )
 
+    def test_format_links_punctuation_parens(self, *_):
+        """ignore trailing punctuation and brackets combined"""
+        # Period at the end, wrapped in brackets.
+        url = "http://www.fish.com"
+        self.assertEqual(
+            views.status.format_links(f"({url})."),
+            f'(<a href="{url}">www.fish.com</a>).',
+        )
+
     def test_format_links_special_chars(self, *_):
         """find and format urls into a tags"""
         url = "https://archive.org/details/dli.granth.72113/page/n25/mode/2up"
diff --git a/bookwyrm/views/status.py b/bookwyrm/views/status.py
index 7a0517b01..196db4eb9 100644
--- a/bookwyrm/views/status.py
+++ b/bookwyrm/views/status.py
@@ -304,17 +304,19 @@ def format_links(content):
     for potential_link in split_content:
         if not potential_link:
             continue
+
+        # FIXME: allow for multiple punctuation characters, e.g. `...` and `!?`.
+        ends_with_punctuation = _ends_with_punctuation(potential_link)
+        if ends_with_punctuation:
+            punctuation_glyph = potential_link[-1]
+            potential_link = potential_link[0:-1]
+
         wrapped = _wrapped(potential_link)
         if wrapped:
             wrapper_close = potential_link[-1]
             formatted_content += potential_link[0]
             potential_link = potential_link[1:-1]
 
-        ends_with_punctuation = _ends_with_punctuation(potential_link)
-        if ends_with_punctuation:
-            punctuation_glyph = potential_link[-1]
-            potential_link = potential_link[0:-1]
-
         try:
             # raises an error on anything that's not a valid link
             validator(potential_link)

From 0043329cc189ec23390101a9e9055ed592a8db4b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adeodato=20Sim=C3=B3?= <dato@users.noreply.github.com>
Date: Mon, 9 Oct 2023 21:01:34 -0300
Subject: [PATCH 2/7] Simplify literals in _wrapped

---
 bookwyrm/views/status.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bookwyrm/views/status.py b/bookwyrm/views/status.py
index 196db4eb9..bb8f6ac17 100644
--- a/bookwyrm/views/status.py
+++ b/bookwyrm/views/status.py
@@ -344,7 +344,7 @@ def format_links(content):
 
 def _wrapped(text):
     """check if a line of text is wrapped"""
-    wrappers = [("(", ")"), ("[", "]"), ("{", "}")]
+    wrappers = ["()", "[]", "{}"]
     for wrapper in wrappers:
         if text[0] == wrapper[0] and text[-1] == wrapper[-1]:
             return True

From 17d741039c95e1a00272e3ed6ed1d3d219218daf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adeodato=20Sim=C3=B3?= <dato@users.noreply.github.com>
Date: Mon, 9 Oct 2023 21:33:35 -0300
Subject: [PATCH 3/7] Remove duplicate test

(Test case already part of test_format_links_simple_url.)
---
 bookwyrm/tests/views/test_status.py | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/bookwyrm/tests/views/test_status.py b/bookwyrm/tests/views/test_status.py
index 6e630a370..42e83c5b8 100644
--- a/bookwyrm/tests/views/test_status.py
+++ b/bookwyrm/tests/views/test_status.py
@@ -420,14 +420,6 @@ http://www.fish.com/"""
             'okay\n\n<a href="http://www.fish.com/">www.fish.com/</a>',
         )
 
-    def test_format_links_parens(self, *_):
-        """find and format urls into a tags"""
-        url = "http://www.fish.com/"
-        self.assertEqual(
-            views.status.format_links(f"({url})"),
-            f'(<a href="{url}">www.fish.com/</a>)',
-        )
-
     def test_format_links_punctuation(self, *_):
         """don’t take trailing punctuation into account pls"""
         url = "http://www.fish.com/"

From 294788aa1ae2f10ffeae188177dfb96e9291a6d3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adeodato=20Sim=C3=B3?= <dato@users.noreply.github.com>
Date: Mon, 9 Oct 2023 21:41:22 -0300
Subject: [PATCH 4/7] format_links: refactor; support multiple punctuation

---
 bookwyrm/tests/views/test_status.py |  4 ++
 bookwyrm/views/status.py            | 71 ++++++++++-------------------
 2 files changed, 28 insertions(+), 47 deletions(-)

diff --git a/bookwyrm/tests/views/test_status.py b/bookwyrm/tests/views/test_status.py
index 42e83c5b8..33c8f0e41 100644
--- a/bookwyrm/tests/views/test_status.py
+++ b/bookwyrm/tests/views/test_status.py
@@ -427,6 +427,10 @@ http://www.fish.com/"""
             views.status.format_links(f"{url}."),
             f'<a href="{url}">www.fish.com/</a>.',
         )
+        self.assertEqual(
+            views.status.format_links(f"{url}!?!"),
+            f'<a href="{url}">www.fish.com/</a>!?!',
+        )
 
     def test_format_links_punctuation_parens(self, *_):
         """ignore trailing punctuation and brackets combined"""
diff --git a/bookwyrm/views/status.py b/bookwyrm/views/status.py
index bb8f6ac17..4c1d049df 100644
--- a/bookwyrm/views/status.py
+++ b/bookwyrm/views/status.py
@@ -1,7 +1,6 @@
 """ what are we here for if not for posting """
 import re
 import logging
-from urllib.parse import urlparse
 
 from django.contrib.auth.decorators import login_required
 from django.core.validators import URLValidator
@@ -297,67 +296,45 @@ def find_or_create_hashtags(content):
 
 def format_links(content):
     """detect and format links"""
-    validator = URLValidator()
-    formatted_content = ""
+    validator = URLValidator(["http", "https"])
+    schema_re = re.compile(r"\bhttps?://")
     split_content = re.split(r"(\s+)", content)
 
-    for potential_link in split_content:
-        if not potential_link:
+    for i, potential_link in enumerate(split_content):
+        if not schema_re.search(potential_link):
             continue
 
-        # FIXME: allow for multiple punctuation characters, e.g. `...` and `!?`.
-        ends_with_punctuation = _ends_with_punctuation(potential_link)
-        if ends_with_punctuation:
-            punctuation_glyph = potential_link[-1]
-            potential_link = potential_link[0:-1]
-
-        wrapped = _wrapped(potential_link)
-        if wrapped:
-            wrapper_close = potential_link[-1]
-            formatted_content += potential_link[0]
-            potential_link = potential_link[1:-1]
-
+        # Strip surrounding brackets and trailing punctuation.
+        prefix, potential_link, suffix = _unwrap(potential_link)
         try:
             # raises an error on anything that's not a valid link
             validator(potential_link)
 
             # use everything but the scheme in the presentation of the link
-            url = urlparse(potential_link)
-            link = url.netloc + url.path + url.params
-            if url.query != "":
-                link += "?" + url.query
-            if url.fragment != "":
-                link += "#" + url.fragment
-
-            formatted_content += f'<a href="{potential_link}">{link}</a>'
+            link = schema_re.sub("", potential_link)
+            split_content[i] = f'{prefix}<a href="{potential_link}">{link}</a>{suffix}'
         except (ValidationError, UnicodeError):
-            formatted_content += potential_link
+            pass
 
-        if wrapped:
-            formatted_content += wrapper_close
-
-        if ends_with_punctuation:
-            formatted_content += punctuation_glyph
-
-    return formatted_content
+    return "".join(split_content)
 
 
-def _wrapped(text):
-    """check if a line of text is wrapped"""
-    wrappers = ["()", "[]", "{}"]
-    for wrapper in wrappers:
+def _unwrap(text):
+    """split surrounding brackets and trailing punctuation from a string of text"""
+    punct = re.compile(r'([.,;:!?"’”»]+)\Z')
+    prefix = suffix = ""
+
+    if punct.search(text):
+        # Move punctuation to suffix segment.
+        text, suffix, _ = punct.split(text)
+
+    for wrapper in ("()", "[]", "{}"):
         if text[0] == wrapper[0] and text[-1] == wrapper[-1]:
-            return True
-    return False
+            # Split out wrapping chars.
+            suffix = text[-1] + suffix
+            prefix, text = text[:1], text[1:-1]
 
-
-def _ends_with_punctuation(text):
-    """check if a line of text ends with a punctuation glyph"""
-    glyphs = [".", ",", ";", ":", "!", "?", "”", "’", '"', "»"]
-    for glyph in glyphs:
-        if text[-1] == glyph:
-            return True
-    return False
+    return prefix, text, suffix
 
 
 def to_markdown(content):

From 7d13cbb10b45a8599e12aa9c379e1617d6e76f27 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adeodato=20Sim=C3=B3?= <dato@users.noreply.github.com>
Date: Thu, 2 Nov 2023 21:27:55 -0300
Subject: [PATCH 5/7] Add failing tests for reported bugs in format_links()

---
 bookwyrm/tests/views/test_status.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/bookwyrm/tests/views/test_status.py b/bookwyrm/tests/views/test_status.py
index 33c8f0e41..67e1f6d76 100644
--- a/bookwyrm/tests/views/test_status.py
+++ b/bookwyrm/tests/views/test_status.py
@@ -469,6 +469,13 @@ http://www.fish.com/"""
             views.status.format_links(url), f'<a href="{url}">{url[8:]}</a>'
         )
 
+    def test_format_links_ignore_non_urls(self, *_):
+        """formating links should leave plain text untouced"""
+        text_elision = "> “The distinction is significant.” [...]"  # bookwyrm#2993
+        text_quoteparens = "some kind of gene-editing technology (?)"  # bookwyrm#3049
+        self.assertEqual(views.status.format_links(text_elision), text_elision)
+        self.assertEqual(views.status.format_links(text_quoteparens), text_quoteparens)
+
     def test_format_mentions_with_at_symbol_links(self, *_):
         """A link with an @username shouldn't treat the username as a mention"""
         content = "a link to https://example.com/user/@mouse"

From 954a02126eb21ca230319cd8ff62adf04d962de9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adeodato=20Sim=C3=B3?= <dato@users.noreply.github.com>
Date: Thu, 2 Nov 2023 21:59:34 -0300
Subject: [PATCH 6/7] format_links: parse punctuation inside brackets

Also, consolidate all punctuation tests into a single table-driven one.
---
 bookwyrm/tests/views/test_status.py | 37 ++++++++++++++---------------
 bookwyrm/views/status.py            |  6 +++++
 2 files changed, 24 insertions(+), 19 deletions(-)

diff --git a/bookwyrm/tests/views/test_status.py b/bookwyrm/tests/views/test_status.py
index 67e1f6d76..424698130 100644
--- a/bookwyrm/tests/views/test_status.py
+++ b/bookwyrm/tests/views/test_status.py
@@ -421,25 +421,24 @@ http://www.fish.com/"""
         )
 
     def test_format_links_punctuation(self, *_):
-        """don’t take trailing punctuation into account pls"""
-        url = "http://www.fish.com/"
-        self.assertEqual(
-            views.status.format_links(f"{url}."),
-            f'<a href="{url}">www.fish.com/</a>.',
-        )
-        self.assertEqual(
-            views.status.format_links(f"{url}!?!"),
-            f'<a href="{url}">www.fish.com/</a>!?!',
-        )
-
-    def test_format_links_punctuation_parens(self, *_):
-        """ignore trailing punctuation and brackets combined"""
-        # Period at the end, wrapped in brackets.
-        url = "http://www.fish.com"
-        self.assertEqual(
-            views.status.format_links(f"({url})."),
-            f'(<a href="{url}">www.fish.com</a>).',
-        )
+        """test many combinations of brackets, URLs, and punctuation"""
+        url = "https://bookwyrm.social"
+        html = f'<a href="{url}">bookwyrm.social</a>'
+        test_table = [
+            ("punct", f"text and {url}.", f"text and {html}."),
+            ("multi_punct", f"text, then {url}?...", f"text, then {html}?..."),
+            ("bracket_punct", f"here ({url}).", f"here ({html})."),
+            ("punct_bracket", f"there [{url}?]", f"there [{html}?]"),
+            ("punct_bracket_punct", f"not here? ({url}!).", f"not here? ({html}!)."),
+            (
+                "multi_punct_bracket",
+                f"not there ({url}...);",
+                f"not there ({html}...);",
+            ),
+        ]
+        for desc, text, output in test_table:
+            with self.subTest(desc=desc):
+                self.assertEqual(views.status.format_links(text), output)
 
     def test_format_links_special_chars(self, *_):
         """find and format urls into a tags"""
diff --git a/bookwyrm/views/status.py b/bookwyrm/views/status.py
index 4c1d049df..8dab11a27 100644
--- a/bookwyrm/views/status.py
+++ b/bookwyrm/views/status.py
@@ -333,6 +333,12 @@ def _unwrap(text):
             # Split out wrapping chars.
             suffix = text[-1] + suffix
             prefix, text = text[:1], text[1:-1]
+            break  # Nested wrappers not supported atm.
+
+    if punct.search(text):
+        # Move inner punctuation to suffix segment.
+        text, inner_punct, _ = punct.split(text)
+        suffix = inner_punct + suffix
 
     return prefix, text, suffix
 

From afad39bf80265f74de100cf3550924a7916d02a3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adeodato=20Sim=C3=B3?= <dato@users.noreply.github.com>
Date: Fri, 3 Nov 2023 19:31:03 -0300
Subject: [PATCH 7/7] Use $ instead of \Z for end of string

They're identical here, since re.M is not used, and the better-known
should be used, for readability.
---
 bookwyrm/views/status.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bookwyrm/views/status.py b/bookwyrm/views/status.py
index 8dab11a27..34b62d0b4 100644
--- a/bookwyrm/views/status.py
+++ b/bookwyrm/views/status.py
@@ -321,7 +321,7 @@ def format_links(content):
 
 def _unwrap(text):
     """split surrounding brackets and trailing punctuation from a string of text"""
-    punct = re.compile(r'([.,;:!?"’”»]+)\Z')
+    punct = re.compile(r'([.,;:!?"’”»]+)$')
     prefix = suffix = ""
 
     if punct.search(text):