Merge pull request #1343 from reesporte/fix-link-formatter

fix link formatter
2021-08-30 13:37:04 -07:00 · 2021-08-30 13:37:04 -07:00 · 8d23f1d356
commit 8d23f1d356
parent 11e752185d 4f321e5f33
2 changed files with 61 additions and 7 deletions
--- a/bookwyrm/tests/views/test_status.py
+++ b/bookwyrm/tests/views/test_status.py
@ -303,6 +303,19 @@ class StatusViews(TestCase):
            '<a href="%s">openlibrary.org/search'
            "?q=arkady+strugatsky&mode=everything</a>" % url,
        )
+        url = "https://tech.lgbt/@bookwyrm"
+        self.assertEqual(
+            views.status.format_links(url), '<a href="%s">tech.lgbt/@bookwyrm</a>' % url
+        )
+        url = "https://users.speakeasy.net/~lion/nb/book.pdf"
+        self.assertEqual(
+            views.status.format_links(url),
+            '<a href="%s">users.speakeasy.net/~lion/nb/book.pdf</a>' % url,
+        )
+        url = "https://pkm.one/#/page/The%20Book%20which%20launched%20a%201000%20Note%20taking%20apps"
+        self.assertEqual(
+            views.status.format_links(url), '<a href="%s">%s</a>' % (url, url[8:])
+        )

    def test_to_markdown(self, *_):
        """this is mostly handled in other places, but nonetheless"""
--- a/bookwyrm/views/status.py
+++ b/bookwyrm/views/status.py
@ -1,13 +1,17 @@
 """ what are we here for if not for posting """
 import re
+from urllib.parse import urlparse
+
 from django.contrib.auth.decorators import login_required
+from django.core.validators import URLValidator
+from django.core.exceptions import ValidationError
 from django.http import HttpResponseBadRequest
 from django.shortcuts import get_object_or_404, redirect
 from django.template.response import TemplateResponse
 from django.utils.decorators import method_decorator
 from django.views import View
-from markdown import markdown

+from markdown import markdown
 from bookwyrm import forms, models
 from bookwyrm.sanitize_html import InputHtmlParser
 from bookwyrm.settings import DOMAIN
@ -149,17 +153,54 @@ def find_mentions(content):

 def format_links(content):
    """detect and format links"""
-    return re.sub(
-        r'([^(href=")]|^|\()(https?:\/\/(%s([\w\.\-_\/+&\?=:;,@#])*))' % regex.DOMAIN,
-        r'\g<1><a href="\g<2>">\g<3></a>',
-        content,
-    )
+    validator = URLValidator()
+    formatted_content = ""
+    split_content = content.split()
+
+    for index, potential_link in enumerate(split_content):
+        wrapped = _wrapped(potential_link)
+        if wrapped:
+            wrapper_close = potential_link[-1]
+            formatted_content += potential_link[0]
+            potential_link = potential_link[1:-1]
+
+        try:
+            # raises an error on anything that's not a valid link
+            validator(potential_link)
+
+            # use everything but the scheme in the presentation of the link
+            url = urlparse(potential_link)
+            link = url.netloc + url.path + url.params
+            if url.query != "":
+                link += "?" + url.query
+            if url.fragment != "":
+                link += "#" + url.fragment
+
+            formatted_content += '<a href="%s">%s</a>' % (potential_link, link)
+        except (ValidationError, UnicodeError):
+            formatted_content += potential_link
+
+        if wrapped:
+            formatted_content += wrapper_close
+        if index < len(split_content) - 1:
+            formatted_content += " "
+
+    return formatted_content
+
+
+def _wrapped(text):
+    """check if a line of text is wrapped"""
+    wrappers = [("(", ")"), ("[", "]"), ("{", "}")]
+    for wrapper in wrappers:
+        if text[0] == wrapper[0] and text[-1] == wrapper[-1]:
+            return True
+    return False


 def to_markdown(content):
    """catch links and convert to markdown"""
-    content = markdown(content)
    content = format_links(content)
+    content = markdown(content)
    # sanitize resulting html
    sanitizer = InputHtmlParser()
    sanitizer.feed(content)