diff --git a/bookwyrm/tests/views/test_status.py b/bookwyrm/tests/views/test_status.py
index 33bd8b53a..424698130 100644
--- a/bookwyrm/tests/views/test_status.py
+++ b/bookwyrm/tests/views/test_status.py
@@ -420,21 +420,25 @@ http://www.fish.com/"""
'okay\n\nwww.fish.com/',
)
- def test_format_links_parens(self, *_):
- """find and format urls into a tags"""
- url = "http://www.fish.com/"
- self.assertEqual(
- views.status.format_links(f"({url})"),
- f'(www.fish.com/)',
- )
-
def test_format_links_punctuation(self, *_):
- """don’t take trailing punctuation into account pls"""
- url = "http://www.fish.com/"
- self.assertEqual(
- views.status.format_links(f"{url}."),
- f'www.fish.com/.',
- )
+ """test many combinations of brackets, URLs, and punctuation"""
+ url = "https://bookwyrm.social"
+ html = f'bookwyrm.social'
+ test_table = [ # each row: (subtest label, input text, expected output)
+ ("punct", f"text and {url}.", f"text and {html}."),
+ ("multi_punct", f"text, then {url}?...", f"text, then {html}?..."),
+ ("bracket_punct", f"here ({url}).", f"here ({html})."),
+ ("punct_bracket", f"there [{url}?]", f"there [{html}?]"),
+ ("punct_bracket_punct", f"not here? ({url}!).", f"not here? ({html}!)."),
+ (
+ "multi_punct_bracket",
+ f"not there ({url}...);",
+ f"not there ({html}...);",
+ ),
+ ]
+ for desc, text, output in test_table:
+ with self.subTest(desc=desc):
+ self.assertEqual(views.status.format_links(text), output)
def test_format_links_special_chars(self, *_):
"""find and format urls into a tags"""
@@ -464,6 +468,13 @@ http://www.fish.com/"""
views.status.format_links(url), f'{url[8:]}'
)
+ def test_format_links_ignore_non_urls(self, *_):
+ """formatting links should leave plain text untouched"""
+ text_elision = "> “The distinction is significant.” [...]" # bookwyrm#2993
+ text_quoteparens = "some kind of gene-editing technology (?)" # bookwyrm#3049
+ self.assertEqual(views.status.format_links(text_elision), text_elision)
+ self.assertEqual(views.status.format_links(text_quoteparens), text_quoteparens)
+
def test_format_mentions_with_at_symbol_links(self, *_):
"""A link with an @username shouldn't treat the username as a mention"""
content = "a link to https://example.com/user/@mouse"
diff --git a/bookwyrm/views/status.py b/bookwyrm/views/status.py
index 7a0517b01..34b62d0b4 100644
--- a/bookwyrm/views/status.py
+++ b/bookwyrm/views/status.py
@@ -1,7 +1,6 @@
""" what are we here for if not for posting """
import re
import logging
-from urllib.parse import urlparse
from django.contrib.auth.decorators import login_required
from django.core.validators import URLValidator
@@ -297,65 +296,51 @@ def find_or_create_hashtags(content):
def format_links(content):
"""detect and format links"""
- validator = URLValidator()
- formatted_content = ""
+ validator = URLValidator(["http", "https"])
+ schema_re = re.compile(r"\bhttps?://", re.IGNORECASE) # URI schemes are case-insensitive
split_content = re.split(r"(\s+)", content)
- for potential_link in split_content:
- if not potential_link:
+ for i, potential_link in enumerate(split_content):
+ if not schema_re.search(potential_link):
continue
- wrapped = _wrapped(potential_link)
- if wrapped:
- wrapper_close = potential_link[-1]
- formatted_content += potential_link[0]
- potential_link = potential_link[1:-1]
-
- ends_with_punctuation = _ends_with_punctuation(potential_link)
- if ends_with_punctuation:
- punctuation_glyph = potential_link[-1]
- potential_link = potential_link[0:-1]
+ # Strip surrounding brackets and trailing punctuation.
+ prefix, potential_link, suffix = _unwrap(potential_link)
try:
# raises an error on anything that's not a valid link
validator(potential_link)
# use everything but the scheme in the presentation of the link
- url = urlparse(potential_link)
- link = url.netloc + url.path + url.params
- if url.query != "":
- link += "?" + url.query
- if url.fragment != "":
- link += "#" + url.fragment
-
- formatted_content += f'{link}'
+ link = schema_re.sub("", potential_link, count=1) # only the leading scheme; keep any URL embedded in the path
+ split_content[i] = f'{prefix}{link}{suffix}'
except (ValidationError, UnicodeError):
- formatted_content += potential_link
+ pass
- if wrapped:
- formatted_content += wrapper_close
-
- if ends_with_punctuation:
- formatted_content += punctuation_glyph
-
- return formatted_content
+ return "".join(split_content)
-def _wrapped(text):
- """check if a line of text is wrapped"""
- wrappers = [("(", ")"), ("[", "]"), ("{", "}")]
- for wrapper in wrappers:
+def _unwrap(text):
+ """split text into (prefix, text, suffix): one opening bracket, the inner text, and the closing bracket plus trailing punctuation"""
+ punct = re.compile(r'([.,;:!?"’”»]+)$') # the capture group makes split() return the punctuation run itself
+ prefix = suffix = ""
+
+ if punct.search(text):
+ # Move punctuation that follows any closing bracket to the suffix segment.
+ text, suffix, _ = punct.split(text)
+
+ for wrapper in ("()", "[]", "{}"):
if text[0] == wrapper[0] and text[-1] == wrapper[-1]:
- return True
- return False
+ # Split out wrapping chars, keeping the closing one ahead of any punctuation already in suffix.
+ suffix = text[-1] + suffix
+ prefix, text = text[:1], text[1:-1]
+ break # Only one level of wrapping is removed; nested wrappers stay in text.
+ if punct.search(text):
+ # Move punctuation that sat inside the brackets to the front of the suffix segment.
+ text, inner_punct, _ = punct.split(text)
+ suffix = inner_punct + suffix
-def _ends_with_punctuation(text):
- """check if a line of text ends with a punctuation glyph"""
- glyphs = [".", ",", ";", ":", "!", "?", "”", "’", '"', "»"]
- for glyph in glyphs:
- if text[-1] == glyph:
- return True
- return False
+ return prefix, text, suffix
def to_markdown(content):