Merge pull request #3027 from dato/find_links_wrapped_punct

Fix parsing of punctuation in format_links()

fixes #2993  
fixes #3049
This commit is contained in:
Hugh Rundle 2023-11-06 09:42:57 +11:00 committed by GitHub
commit a93519ec3e
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 54 additions and 58 deletions

View file

@ -420,21 +420,25 @@ http://www.fish.com/"""
'okay\n\n<a href="http://www.fish.com/">www.fish.com/</a>', 'okay\n\n<a href="http://www.fish.com/">www.fish.com/</a>',
) )
def test_format_links_parens(self, *_):
"""find and format urls into a tags"""
url = "http://www.fish.com/"
self.assertEqual(
views.status.format_links(f"({url})"),
f'(<a href="{url}">www.fish.com/</a>)',
)
def test_format_links_punctuation(self, *_): def test_format_links_punctuation(self, *_):
"""dont take trailing punctuation into account pls""" """test many combinations of brackets, URLs, and punctuation"""
url = "http://www.fish.com/" url = "https://bookwyrm.social"
self.assertEqual( html = f'<a href="{url}">bookwyrm.social</a>'
views.status.format_links(f"{url}."), test_table = [
f'<a href="{url}">www.fish.com/</a>.', ("punct", f"text and {url}.", f"text and {html}."),
) ("multi_punct", f"text, then {url}?...", f"text, then {html}?..."),
("bracket_punct", f"here ({url}).", f"here ({html})."),
("punct_bracket", f"there [{url}?]", f"there [{html}?]"),
("punct_bracket_punct", f"not here? ({url}!).", f"not here? ({html}!)."),
(
"multi_punct_bracket",
f"not there ({url}...);",
f"not there ({html}...);",
),
]
for desc, text, output in test_table:
with self.subTest(desc=desc):
self.assertEqual(views.status.format_links(text), output)
def test_format_links_special_chars(self, *_): def test_format_links_special_chars(self, *_):
"""find and format urls into a tags""" """find and format urls into a tags"""
@ -464,6 +468,13 @@ http://www.fish.com/"""
views.status.format_links(url), f'<a href="{url}">{url[8:]}</a>' views.status.format_links(url), f'<a href="{url}">{url[8:]}</a>'
) )
def test_format_links_ignore_non_urls(self, *_):
"""formatting links should leave plain text untouched"""
text_elision = "> “The distinction is significant.” [...]" # bookwyrm#2993
text_quoteparens = "some kind of gene-editing technology (?)" # bookwyrm#3049
self.assertEqual(views.status.format_links(text_elision), text_elision)
self.assertEqual(views.status.format_links(text_quoteparens), text_quoteparens)
def test_format_mentions_with_at_symbol_links(self, *_): def test_format_mentions_with_at_symbol_links(self, *_):
"""A link with an @username shouldn't treat the username as a mention""" """A link with an @username shouldn't treat the username as a mention"""
content = "a link to https://example.com/user/@mouse" content = "a link to https://example.com/user/@mouse"

View file

@ -1,7 +1,6 @@
""" what are we here for if not for posting """ """ what are we here for if not for posting """
import re import re
import logging import logging
from urllib.parse import urlparse
from django.contrib.auth.decorators import login_required from django.contrib.auth.decorators import login_required
from django.core.validators import URLValidator from django.core.validators import URLValidator
@ -297,65 +296,51 @@ def find_or_create_hashtags(content):
def format_links(content): def format_links(content):
"""detect and format links""" """detect and format links"""
validator = URLValidator() validator = URLValidator(["http", "https"])
formatted_content = "" schema_re = re.compile(r"\bhttps?://")
split_content = re.split(r"(\s+)", content) split_content = re.split(r"(\s+)", content)
for potential_link in split_content: for i, potential_link in enumerate(split_content):
if not potential_link: if not schema_re.search(potential_link):
continue continue
wrapped = _wrapped(potential_link)
if wrapped:
wrapper_close = potential_link[-1]
formatted_content += potential_link[0]
potential_link = potential_link[1:-1]
ends_with_punctuation = _ends_with_punctuation(potential_link)
if ends_with_punctuation:
punctuation_glyph = potential_link[-1]
potential_link = potential_link[0:-1]
# Strip surrounding brackets and trailing punctuation.
prefix, potential_link, suffix = _unwrap(potential_link)
try: try:
# raises an error on anything that's not a valid link # raises an error on anything that's not a valid link
validator(potential_link) validator(potential_link)
# use everything but the scheme in the presentation of the link # use everything but the scheme in the presentation of the link
url = urlparse(potential_link) link = schema_re.sub("", potential_link)
link = url.netloc + url.path + url.params split_content[i] = f'{prefix}<a href="{potential_link}">{link}</a>{suffix}'
if url.query != "":
link += "?" + url.query
if url.fragment != "":
link += "#" + url.fragment
formatted_content += f'<a href="{potential_link}">{link}</a>'
except (ValidationError, UnicodeError): except (ValidationError, UnicodeError):
formatted_content += potential_link pass
if wrapped: return "".join(split_content)
formatted_content += wrapper_close
if ends_with_punctuation:
formatted_content += punctuation_glyph
return formatted_content
def _wrapped(text): def _unwrap(text):
"""check if a line of text is wrapped""" """split surrounding brackets and trailing punctuation from a string of text"""
wrappers = [("(", ")"), ("[", "]"), ("{", "}")] punct = re.compile(r'([.,;:!?"’”»]+)$')
for wrapper in wrappers: prefix = suffix = ""
if punct.search(text):
# Move punctuation to suffix segment.
text, suffix, _ = punct.split(text)
for wrapper in ("()", "[]", "{}"):
if text[0] == wrapper[0] and text[-1] == wrapper[-1]: if text[0] == wrapper[0] and text[-1] == wrapper[-1]:
return True # Split out wrapping chars.
return False suffix = text[-1] + suffix
prefix, text = text[:1], text[1:-1]
break # Nested wrappers not supported atm.
if punct.search(text):
# Move inner punctuation to suffix segment.
text, inner_punct, _ = punct.split(text)
suffix = inner_punct + suffix
def _ends_with_punctuation(text): return prefix, text, suffix
"""check if a line of text ends with a punctuation glyph"""
glyphs = [".", ",", ";", ":", "!", "?", "”", "’", '"', "»"]
for glyph in glyphs:
if text[-1] == glyph:
return True
return False
def to_markdown(content): def to_markdown(content):