Merge pull request #488 from mouse-reeve/url-regex

Better matching for links in statuses
This commit is contained in:
Mouse Reeve 2021-01-06 11:00:30 -08:00 committed by GitHub
commit e7e21c194e
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 68 additions and 9 deletions

View file

@ -293,13 +293,16 @@ def find_mentions(content):
yield (match.group(), mention_user)
def to_markdown(content):
''' catch links and convert to markdown '''
content = re.sub(
r'([^(href=")])(https?:\/\/([A-Za-z\.\-_\/]+' \
r'\.[A-Za-z]{2,}[A-Za-z\.\-_\/]+))',
def format_links(content):
''' detect and format links '''
return re.sub(
r'([^(href=")]|^)(https?:\/\/(%s([\w\.\-_\/])*))' % regex.domain,
r'\g<1><a href="\g<2>">\g<3></a>',
content)
def to_markdown(content):
''' catch links and convert to markdown '''
content = format_links(content)
content = markdown(content)
# sanitize resulting html
sanitizer = InputHtmlParser()

View file

@ -447,3 +447,59 @@ class Outgoing(TestCase):
self.assertEqual(reply.user, user)
self.assertTrue(self.remote_user in reply.mention_users.all())
self.assertTrue(self.local_user in reply.mention_users.all())
def test_find_mentions(self):
''' detect and look up @ mentions of users '''
user = models.User.objects.create_user(
'nutria@%s' % DOMAIN, 'nutria@nutria.com', 'password',
local=True, localname='nutria')
self.assertEqual(user.username, 'nutria@%s' % DOMAIN)
self.assertEqual(
list(outgoing.find_mentions('@nutria'))[0],
('@nutria', user)
)
self.assertEqual(
list(outgoing.find_mentions('leading text @nutria'))[0],
('@nutria', user)
)
self.assertEqual(
list(outgoing.find_mentions('leading @nutria trailing text'))[0],
('@nutria', user)
)
self.assertEqual(
list(outgoing.find_mentions('@rat@example.com'))[0],
('@rat@example.com', self.remote_user)
)
multiple = list(outgoing.find_mentions('@nutria and @rat@example.com'))
self.assertEqual(multiple[0], ('@nutria', user))
self.assertEqual(multiple[1], ('@rat@example.com', self.remote_user))
with patch('bookwyrm.outgoing.handle_remote_webfinger') as rw:
rw.return_value = self.local_user
self.assertEqual(
list(outgoing.find_mentions('@beep@beep.com'))[0],
('@beep@beep.com', self.local_user)
)
with patch('bookwyrm.outgoing.handle_remote_webfinger') as rw:
rw.return_value = None
self.assertEqual(list(outgoing.find_mentions('@beep@beep.com')), [])
self.assertEqual(
list(outgoing.find_mentions('@nutria@%s' % DOMAIN))[0],
('@nutria@%s' % DOMAIN, user)
)
def test_format_links(self):
''' find and format urls into a tags '''
url = 'http://www.fish.com/'
self.assertEqual(
outgoing.format_links(url),
'<a href="%s">www.fish.com/</a>' % url)
url = 'https://archive.org/details/dli.granth.72113/page/n25/mode/2up'
self.assertEqual(
outgoing.format_links(url),
'<a href="%s">' \
'archive.org/details/dli.granth.72113/page/n25/mode/2up</a>' \
% url)

View file

@ -1,10 +1,10 @@
''' defining regexes for regularly used concepts '''
domain = r'[a-z-A-Z0-9_\-]+\.[a-z]+'
localname = r'@?[a-zA-Z_\-\.0-9]+'
domain = r'[\w_\-\.]+\.[a-z]{2,}'
localname = r'@?[a-zA-Z_\-\.0-9]+\b'
strict_localname = r'@[a-zA-Z_\-\.0-9]+'
username = r'%s(@%s)?' % (localname, domain)
strict_username = r'%s(@%s)?' % (strict_localname, domain)
full_username = r'%s@%s' % (localname, domain)
strict_username = r'\B%s(@%s)?\b' % (strict_localname, domain)
full_username = r'\B%s@%s\b' % (localname, domain)
# should match (BookWyrm/1.0.0; or (BookWyrm/99.1.2;
bookwyrm_user_agent = r'\(BookWyrm/[0-9]+\.[0-9]+\.[0-9]+;'