From c3c22022f604ec1c934aaf60d4536beaebfdc7be Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adeodato=20Sim=C3=B3?= Date: Mon, 9 Oct 2023 20:54:06 -0300 Subject: [PATCH 1/7] Check for punctuation before checking for wrapping parenthesis This allows to parse `(URL).` correctly, which was not detected as URL before. --- bookwyrm/tests/views/test_status.py | 9 +++++++++ bookwyrm/views/status.py | 12 +++++++----- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/bookwyrm/tests/views/test_status.py b/bookwyrm/tests/views/test_status.py index 33bd8b53a..6e630a370 100644 --- a/bookwyrm/tests/views/test_status.py +++ b/bookwyrm/tests/views/test_status.py @@ -436,6 +436,15 @@ http://www.fish.com/""" f'www.fish.com/.', ) + def test_format_links_punctuation_parens(self, *_): + """ignore trailing punctuation and brackets combined""" + # Period at the end, wrapped in brackets. + url = "http://www.fish.com" + self.assertEqual( + views.status.format_links(f"({url})."), + f'(www.fish.com).', + ) + def test_format_links_special_chars(self, *_): """find and format urls into a tags""" url = "https://archive.org/details/dli.granth.72113/page/n25/mode/2up" diff --git a/bookwyrm/views/status.py b/bookwyrm/views/status.py index 7a0517b01..196db4eb9 100644 --- a/bookwyrm/views/status.py +++ b/bookwyrm/views/status.py @@ -304,17 +304,19 @@ def format_links(content): for potential_link in split_content: if not potential_link: continue + + # FIXME: allow for multiple punctuation characters, e.g. `...` and `!?`. + ends_with_punctuation = _ends_with_punctuation(potential_link) + if ends_with_punctuation: + punctuation_glyph = potential_link[-1] + potential_link = potential_link[0:-1] + wrapped = _wrapped(potential_link) if wrapped: wrapper_close = potential_link[-1] formatted_content += potential_link[0] potential_link = potential_link[1:-1] - ends_with_punctuation = _ends_with_punctuation(potential_link) - if ends_with_punctuation: - punctuation_glyph = potential_link[-1] - potential_link = potential_link[0:-1] - try: # raises an error on anything that's not a valid link validator(potential_link) From 0043329cc189ec23390101a9e9055ed592a8db4b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adeodato=20Sim=C3=B3?= Date: Mon, 9 Oct 2023 21:01:34 -0300 Subject: [PATCH 2/7] Simplify literals in _wrapped --- bookwyrm/views/status.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bookwyrm/views/status.py b/bookwyrm/views/status.py index 196db4eb9..bb8f6ac17 100644 --- a/bookwyrm/views/status.py +++ b/bookwyrm/views/status.py @@ -344,7 +344,7 @@ def format_links(content): def _wrapped(text): """check if a line of text is wrapped""" - wrappers = [("(", ")"), ("[", "]"), ("{", "}")] + wrappers = ["()", "[]", "{}"] for wrapper in wrappers: if text[0] == wrapper[0] and text[-1] == wrapper[-1]: return True From 17d741039c95e1a00272e3ed6ed1d3d219218daf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adeodato=20Sim=C3=B3?= Date: Mon, 9 Oct 2023 21:33:35 -0300 Subject: [PATCH 3/7] Remove duplicate test (Test case already part of test_format_links_simple_url.) --- bookwyrm/tests/views/test_status.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/bookwyrm/tests/views/test_status.py b/bookwyrm/tests/views/test_status.py index 6e630a370..42e83c5b8 100644 --- a/bookwyrm/tests/views/test_status.py +++ b/bookwyrm/tests/views/test_status.py @@ -420,14 +420,6 @@ http://www.fish.com/""" 'okay\n\nwww.fish.com/', ) - def test_format_links_parens(self, *_): - """find and format urls into a tags""" - url = "http://www.fish.com/" - self.assertEqual( - views.status.format_links(f"({url})"), - f'(www.fish.com/)', - ) - def test_format_links_punctuation(self, *_): """don’t take trailing punctuation into account pls""" url = "http://www.fish.com/" From 294788aa1ae2f10ffeae188177dfb96e9291a6d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adeodato=20Sim=C3=B3?= Date: Mon, 9 Oct 2023 21:41:22 -0300 Subject: [PATCH 4/7] format_links: refactor; support multiple punctuation --- bookwyrm/tests/views/test_status.py | 4 ++ bookwyrm/views/status.py | 71 ++++++++++------------------- 2 files changed, 28 insertions(+), 47 deletions(-) diff --git a/bookwyrm/tests/views/test_status.py b/bookwyrm/tests/views/test_status.py index 42e83c5b8..33c8f0e41 100644 --- a/bookwyrm/tests/views/test_status.py +++ b/bookwyrm/tests/views/test_status.py @@ -427,6 +427,10 @@ http://www.fish.com/""" views.status.format_links(f"{url}."), f'www.fish.com/.', ) + self.assertEqual( + views.status.format_links(f"{url}!?!"), + f'www.fish.com/!?!', + ) def test_format_links_punctuation_parens(self, *_): """ignore trailing punctuation and brackets combined""" diff --git a/bookwyrm/views/status.py b/bookwyrm/views/status.py index bb8f6ac17..4c1d049df 100644 --- a/bookwyrm/views/status.py +++ b/bookwyrm/views/status.py @@ -1,7 +1,6 @@ """ what are we here for if not for posting """ import re import logging -from urllib.parse import urlparse from django.contrib.auth.decorators import login_required from django.core.validators import URLValidator @@ -297,67 +296,45 @@ def find_or_create_hashtags(content): def format_links(content): """detect and format links""" - validator = URLValidator() - formatted_content = "" + validator = URLValidator(["http", "https"]) + schema_re = re.compile(r"\bhttps?://") split_content = re.split(r"(\s+)", content) - for potential_link in split_content: - if not potential_link: + for i, potential_link in enumerate(split_content): + if not schema_re.search(potential_link): continue - # FIXME: allow for multiple punctuation characters, e.g. `...` and `!?`. - ends_with_punctuation = _ends_with_punctuation(potential_link) - if ends_with_punctuation: - punctuation_glyph = potential_link[-1] - potential_link = potential_link[0:-1] - - wrapped = _wrapped(potential_link) - if wrapped: - wrapper_close = potential_link[-1] - formatted_content += potential_link[0] - potential_link = potential_link[1:-1] - + # Strip surrounding brackets and trailing punctuation. + prefix, potential_link, suffix = _unwrap(potential_link) try: # raises an error on anything that's not a valid link validator(potential_link) # use everything but the scheme in the presentation of the link - url = urlparse(potential_link) - link = url.netloc + url.path + url.params - if url.query != "": - link += "?" + url.query - if url.fragment != "": - link += "#" + url.fragment - - formatted_content += f'{link}' + link = schema_re.sub("", potential_link) + split_content[i] = f'{prefix}{link}{suffix}' except (ValidationError, UnicodeError): - formatted_content += potential_link + pass - if wrapped: - formatted_content += wrapper_close - - if ends_with_punctuation: - formatted_content += punctuation_glyph - - return formatted_content + return "".join(split_content) -def _wrapped(text): - """check if a line of text is wrapped""" - wrappers = ["()", "[]", "{}"] - for wrapper in wrappers: +def _unwrap(text): + """split surrounding brackets and trailing punctuation from a string of text""" + punct = re.compile(r'([.,;:!?"’”»]+)\Z') + prefix = suffix = "" + + if punct.search(text): + # Move punctuation to suffix segment. + text, suffix, _ = punct.split(text) + + for wrapper in ("()", "[]", "{}"): if text[0] == wrapper[0] and text[-1] == wrapper[-1]: - return True - return False + # Split out wrapping chars. + suffix = text[-1] + suffix + prefix, text = text[:1], text[1:-1] - -def _ends_with_punctuation(text): - """check if a line of text ends with a punctuation glyph""" - glyphs = [".", ",", ";", ":", "!", "?", "”", "’", '"', "»"] - for glyph in glyphs: - if text[-1] == glyph: - return True - return False + return prefix, text, suffix def to_markdown(content): From 7d13cbb10b45a8599e12aa9c379e1617d6e76f27 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adeodato=20Sim=C3=B3?= Date: Thu, 2 Nov 2023 21:27:55 -0300 Subject: [PATCH 5/7] Add failing tests for reported bugs in format_links() --- bookwyrm/tests/views/test_status.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/bookwyrm/tests/views/test_status.py b/bookwyrm/tests/views/test_status.py index 33c8f0e41..67e1f6d76 100644 --- a/bookwyrm/tests/views/test_status.py +++ b/bookwyrm/tests/views/test_status.py @@ -469,6 +469,13 @@ http://www.fish.com/""" views.status.format_links(url), f'{url[8:]}' ) + def test_format_links_ignore_non_urls(self, *_): + """formating links should leave plain text untouced""" + text_elision = "> “The distinction is significant.” [...]" # bookwyrm#2993 + text_quoteparens = "some kind of gene-editing technology (?)" # bookwyrm#3049 + self.assertEqual(views.status.format_links(text_elision), text_elision) + self.assertEqual(views.status.format_links(text_quoteparens), text_quoteparens) + def test_format_mentions_with_at_symbol_links(self, *_): """A link with an @username shouldn't treat the username as a mention""" content = "a link to https://example.com/user/@mouse" From 954a02126eb21ca230319cd8ff62adf04d962de9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adeodato=20Sim=C3=B3?= Date: Thu, 2 Nov 2023 21:59:34 -0300 Subject: [PATCH 6/7] format_links: parse punctuation inside brackets Also, consolidate all punctuation tests into a single table-driven one. --- bookwyrm/tests/views/test_status.py | 37 ++++++++++++++--------------- bookwyrm/views/status.py | 6 +++++ 2 files changed, 24 insertions(+), 19 deletions(-) diff --git a/bookwyrm/tests/views/test_status.py b/bookwyrm/tests/views/test_status.py index 67e1f6d76..424698130 100644 --- a/bookwyrm/tests/views/test_status.py +++ b/bookwyrm/tests/views/test_status.py @@ -421,25 +421,24 @@ http://www.fish.com/""" ) def test_format_links_punctuation(self, *_): - """don’t take trailing punctuation into account pls""" - url = "http://www.fish.com/" - self.assertEqual( - views.status.format_links(f"{url}."), - f'www.fish.com/.', - ) - self.assertEqual( - views.status.format_links(f"{url}!?!"), - f'www.fish.com/!?!', - ) - - def test_format_links_punctuation_parens(self, *_): - """ignore trailing punctuation and brackets combined""" - # Period at the end, wrapped in brackets. - url = "http://www.fish.com" - self.assertEqual( - views.status.format_links(f"({url})."), - f'(www.fish.com).', - ) + """test many combinations of brackets, URLs, and punctuation""" + url = "https://bookwyrm.social" + html = f'bookwyrm.social' + test_table = [ + ("punct", f"text and {url}.", f"text and {html}."), + ("multi_punct", f"text, then {url}?...", f"text, then {html}?..."), + ("bracket_punct", f"here ({url}).", f"here ({html})."), + ("punct_bracket", f"there [{url}?]", f"there [{html}?]"), + ("punct_bracket_punct", f"not here? ({url}!).", f"not here? ({html}!)."), + ( + "multi_punct_bracket", + f"not there ({url}...);", + f"not there ({html}...);", + ), + ] + for desc, text, output in test_table: + with self.subTest(desc=desc): + self.assertEqual(views.status.format_links(text), output) def test_format_links_special_chars(self, *_): """find and format urls into a tags""" diff --git a/bookwyrm/views/status.py b/bookwyrm/views/status.py index 4c1d049df..8dab11a27 100644 --- a/bookwyrm/views/status.py +++ b/bookwyrm/views/status.py @@ -333,6 +333,12 @@ def _unwrap(text): # Split out wrapping chars. suffix = text[-1] + suffix prefix, text = text[:1], text[1:-1] + break # Nested wrappers not supported atm. + + if punct.search(text): + # Move inner punctuation to suffix segment. + text, inner_punct, _ = punct.split(text) + suffix = inner_punct + suffix return prefix, text, suffix From afad39bf80265f74de100cf3550924a7916d02a3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adeodato=20Sim=C3=B3?= Date: Fri, 3 Nov 2023 19:31:03 -0300 Subject: [PATCH 7/7] Use $ instead of \Z for end of string They're identical here, since re.M is not used, and the better-known should be used, for readability. --- bookwyrm/views/status.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bookwyrm/views/status.py b/bookwyrm/views/status.py index 8dab11a27..34b62d0b4 100644 --- a/bookwyrm/views/status.py +++ b/bookwyrm/views/status.py @@ -321,7 +321,7 @@ def format_links(content): def _unwrap(text): """split surrounding brackets and trailing punctuation from a string of text""" - punct = re.compile(r'([.,;:!?"’”»]+)\Z') + punct = re.compile(r'([.,;:!?"’”»]+)$') prefix = suffix = "" if punct.search(text):