mirror of
https://github.com/bookwyrm-social/bookwyrm.git
synced 2024-06-02 21:39:23 +00:00
Merge pull request #3027 from dato/find_links_wrapped_punct
Fix parsing of punctuation in format_links() (fixes #2993, fixes #3049)
This commit is contained in:
commit
a93519ec3e
|
@ -420,21 +420,25 @@ http://www.fish.com/"""
|
||||||
'okay\n\n<a href="http://www.fish.com/">www.fish.com/</a>',
|
'okay\n\n<a href="http://www.fish.com/">www.fish.com/</a>',
|
||||||
)
|
)
|
||||||
|
|
||||||
def test_format_links_parens(self, *_):
|
|
||||||
"""find and format urls into a tags"""
|
|
||||||
url = "http://www.fish.com/"
|
|
||||||
self.assertEqual(
|
|
||||||
views.status.format_links(f"({url})"),
|
|
||||||
f'(<a href="{url}">www.fish.com/</a>)',
|
|
||||||
)
|
|
||||||
|
|
||||||
def test_format_links_punctuation(self, *_):
|
def test_format_links_punctuation(self, *_):
|
||||||
"""don’t take trailing punctuation into account pls"""
|
"""test many combinations of brackets, URLs, and punctuation"""
|
||||||
url = "http://www.fish.com/"
|
url = "https://bookwyrm.social"
|
||||||
self.assertEqual(
|
html = f'<a href="{url}">bookwyrm.social</a>'
|
||||||
views.status.format_links(f"{url}."),
|
test_table = [
|
||||||
f'<a href="{url}">www.fish.com/</a>.',
|
("punct", f"text and {url}.", f"text and {html}."),
|
||||||
)
|
("multi_punct", f"text, then {url}?...", f"text, then {html}?..."),
|
||||||
|
("bracket_punct", f"here ({url}).", f"here ({html})."),
|
||||||
|
("punct_bracket", f"there [{url}?]", f"there [{html}?]"),
|
||||||
|
("punct_bracket_punct", f"not here? ({url}!).", f"not here? ({html}!)."),
|
||||||
|
(
|
||||||
|
"multi_punct_bracket",
|
||||||
|
f"not there ({url}...);",
|
||||||
|
f"not there ({html}...);",
|
||||||
|
),
|
||||||
|
]
|
||||||
|
for desc, text, output in test_table:
|
||||||
|
with self.subTest(desc=desc):
|
||||||
|
self.assertEqual(views.status.format_links(text), output)
|
||||||
|
|
||||||
def test_format_links_special_chars(self, *_):
|
def test_format_links_special_chars(self, *_):
|
||||||
"""find and format urls into a tags"""
|
"""find and format urls into a tags"""
|
||||||
|
@ -464,6 +468,13 @@ http://www.fish.com/"""
|
||||||
views.status.format_links(url), f'<a href="{url}">{url[8:]}</a>'
|
views.status.format_links(url), f'<a href="{url}">{url[8:]}</a>'
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def test_format_links_ignore_non_urls(self, *_):
|
||||||
|
"""formatting links should leave plain text untouched"""
|
||||||
|
text_elision = "> “The distinction is significant.” [...]" # bookwyrm#2993
|
||||||
|
text_quoteparens = "some kind of gene-editing technology (?)" # bookwyrm#3049
|
||||||
|
self.assertEqual(views.status.format_links(text_elision), text_elision)
|
||||||
|
self.assertEqual(views.status.format_links(text_quoteparens), text_quoteparens)
|
||||||
|
|
||||||
def test_format_mentions_with_at_symbol_links(self, *_):
|
def test_format_mentions_with_at_symbol_links(self, *_):
|
||||||
"""A link with an @username shouldn't treat the username as a mention"""
|
"""A link with an @username shouldn't treat the username as a mention"""
|
||||||
content = "a link to https://example.com/user/@mouse"
|
content = "a link to https://example.com/user/@mouse"
|
||||||
|
|
|
@ -1,7 +1,6 @@
|
||||||
""" what are we here for if not for posting """
|
""" what are we here for if not for posting """
|
||||||
import re
|
import re
|
||||||
import logging
|
import logging
|
||||||
from urllib.parse import urlparse
|
|
||||||
|
|
||||||
from django.contrib.auth.decorators import login_required
|
from django.contrib.auth.decorators import login_required
|
||||||
from django.core.validators import URLValidator
|
from django.core.validators import URLValidator
|
||||||
|
@ -297,65 +296,51 @@ def find_or_create_hashtags(content):
|
||||||
|
|
||||||
def format_links(content):
|
def format_links(content):
|
||||||
"""detect and format links"""
|
"""detect and format links"""
|
||||||
validator = URLValidator()
|
validator = URLValidator(["http", "https"])
|
||||||
formatted_content = ""
|
schema_re = re.compile(r"\bhttps?://")
|
||||||
split_content = re.split(r"(\s+)", content)
|
split_content = re.split(r"(\s+)", content)
|
||||||
|
|
||||||
for potential_link in split_content:
|
for i, potential_link in enumerate(split_content):
|
||||||
if not potential_link:
|
if not schema_re.search(potential_link):
|
||||||
continue
|
continue
|
||||||
wrapped = _wrapped(potential_link)
|
|
||||||
if wrapped:
|
|
||||||
wrapper_close = potential_link[-1]
|
|
||||||
formatted_content += potential_link[0]
|
|
||||||
potential_link = potential_link[1:-1]
|
|
||||||
|
|
||||||
ends_with_punctuation = _ends_with_punctuation(potential_link)
|
|
||||||
if ends_with_punctuation:
|
|
||||||
punctuation_glyph = potential_link[-1]
|
|
||||||
potential_link = potential_link[0:-1]
|
|
||||||
|
|
||||||
|
# Strip surrounding brackets and trailing punctuation.
|
||||||
|
prefix, potential_link, suffix = _unwrap(potential_link)
|
||||||
try:
|
try:
|
||||||
# raises an error on anything that's not a valid link
|
# raises an error on anything that's not a valid link
|
||||||
validator(potential_link)
|
validator(potential_link)
|
||||||
|
|
||||||
# use everything but the scheme in the presentation of the link
|
# use everything but the scheme in the presentation of the link
|
||||||
url = urlparse(potential_link)
|
link = schema_re.sub("", potential_link)
|
||||||
link = url.netloc + url.path + url.params
|
split_content[i] = f'{prefix}<a href="{potential_link}">{link}</a>{suffix}'
|
||||||
if url.query != "":
|
|
||||||
link += "?" + url.query
|
|
||||||
if url.fragment != "":
|
|
||||||
link += "#" + url.fragment
|
|
||||||
|
|
||||||
formatted_content += f'<a href="{potential_link}">{link}</a>'
|
|
||||||
except (ValidationError, UnicodeError):
|
except (ValidationError, UnicodeError):
|
||||||
formatted_content += potential_link
|
pass
|
||||||
|
|
||||||
if wrapped:
|
return "".join(split_content)
|
||||||
formatted_content += wrapper_close
|
|
||||||
|
|
||||||
if ends_with_punctuation:
|
|
||||||
formatted_content += punctuation_glyph
|
|
||||||
|
|
||||||
return formatted_content
|
|
||||||
|
|
||||||
|
|
||||||
def _wrapped(text):
|
def _unwrap(text):
|
||||||
"""check if a line of text is wrapped"""
|
"""split surrounding brackets and trailing punctuation from a string of text"""
|
||||||
wrappers = [("(", ")"), ("[", "]"), ("{", "}")]
|
punct = re.compile(r'([.,;:!?"’”»]+)$')
|
||||||
for wrapper in wrappers:
|
prefix = suffix = ""
|
||||||
|
|
||||||
|
if punct.search(text):
|
||||||
|
# Move punctuation to suffix segment.
|
||||||
|
text, suffix, _ = punct.split(text)
|
||||||
|
|
||||||
|
for wrapper in ("()", "[]", "{}"):
|
||||||
if text[0] == wrapper[0] and text[-1] == wrapper[-1]:
|
if text[0] == wrapper[0] and text[-1] == wrapper[-1]:
|
||||||
return True
|
# Split out wrapping chars.
|
||||||
return False
|
suffix = text[-1] + suffix
|
||||||
|
prefix, text = text[:1], text[1:-1]
|
||||||
|
break # Nested wrappers not supported atm.
|
||||||
|
|
||||||
|
if punct.search(text):
|
||||||
|
# Move inner punctuation to suffix segment.
|
||||||
|
text, inner_punct, _ = punct.split(text)
|
||||||
|
suffix = inner_punct + suffix
|
||||||
|
|
||||||
def _ends_with_punctuation(text):
|
return prefix, text, suffix
|
||||||
"""check if a line of text ends with a punctuation glyph"""
|
|
||||||
glyphs = [".", ",", ";", ":", "!", "?", "”", "’", '"', "»"]
|
|
||||||
for glyph in glyphs:
|
|
||||||
if text[-1] == glyph:
|
|
||||||
return True
|
|
||||||
return False
|
|
||||||
|
|
||||||
|
|
||||||
def to_markdown(content):
|
def to_markdown(content):
|
||||||
|
|
Loading…
Reference in a new issue