Ignore mentions and hashtags inside code blocks

This commit is contained in:
silverpill 2022-12-18 00:38:29 +00:00
parent 67b1729621
commit 2b8063990a
3 changed files with 45 additions and 6 deletions

View file

@ -2,6 +2,7 @@ use regex::{Captures, Regex};
use crate::errors::ValidationError; use crate::errors::ValidationError;
use crate::frontend::get_tag_page_url; use crate::frontend::get_tag_page_url;
use super::links::is_inside_code_block;
const HASHTAG_RE: &str = r"(?m)(?P<before>^|\s|>|[\(])#(?P<tag>[^\s<]+)"; const HASHTAG_RE: &str = r"(?m)(?P<before>^|\s|>|[\(])#(?P<tag>[^\s<]+)";
const HASHTAG_SECONDARY_RE: &str = r"^(?P<tag>[0-9A-Za-z]+)(?P<after>[\.,:?\)]?)$"; const HASHTAG_SECONDARY_RE: &str = r"^(?P<tag>[0-9A-Za-z]+)(?P<after>[\.,:?\)]?)$";
@ -13,6 +14,11 @@ pub fn find_hashtags(text: &str) -> Vec<String> {
let hashtag_secondary_re = Regex::new(HASHTAG_SECONDARY_RE).unwrap(); let hashtag_secondary_re = Regex::new(HASHTAG_SECONDARY_RE).unwrap();
let mut tags = vec![]; let mut tags = vec![];
for caps in hashtag_re.captures_iter(text) { for caps in hashtag_re.captures_iter(text) {
let tag_match = caps.name("tag").expect("should have tag group");
if is_inside_code_block(&tag_match, text) {
// Ignore hashtags inside code blocks
continue;
};
if let Some(secondary_caps) = hashtag_secondary_re.captures(&caps["tag"]) { if let Some(secondary_caps) = hashtag_secondary_re.captures(&caps["tag"]) {
let tag_name = secondary_caps["tag"].to_string().to_lowercase(); let tag_name = secondary_caps["tag"].to_string().to_lowercase();
if !tags.contains(&tag_name) { if !tags.contains(&tag_name) {
@ -28,6 +34,11 @@ pub fn replace_hashtags(instance_url: &str, text: &str, tags: &[String]) -> Stri
let hashtag_re = Regex::new(HASHTAG_RE).unwrap(); let hashtag_re = Regex::new(HASHTAG_RE).unwrap();
let hashtag_secondary_re = Regex::new(HASHTAG_SECONDARY_RE).unwrap(); let hashtag_secondary_re = Regex::new(HASHTAG_SECONDARY_RE).unwrap();
let result = hashtag_re.replace_all(text, |caps: &Captures| { let result = hashtag_re.replace_all(text, |caps: &Captures| {
let tag_match = caps.name("tag").expect("should have tag group");
if is_inside_code_block(&tag_match, text) {
// Don't replace hashtags inside code blocks
return caps[0].to_string();
};
if let Some(secondary_caps) = hashtag_secondary_re.captures(&caps["tag"]) { if let Some(secondary_caps) = hashtag_secondary_re.captures(&caps["tag"]) {
let before = caps["before"].to_string(); let before = caps["before"].to_string();
let tag = secondary_caps["tag"].to_string(); let tag = secondary_caps["tag"].to_string();

View file

@ -1,6 +1,6 @@
use std::collections::HashMap; use std::collections::HashMap;
use regex::{Captures, Regex}; use regex::{Captures, Match, Regex};
use tokio_postgres::GenericClient; use tokio_postgres::GenericClient;
use crate::database::DatabaseError; use crate::database::DatabaseError;
@ -10,10 +10,10 @@ use super::types::Post;
// MediaWiki-like syntax: [[url|text]] // MediaWiki-like syntax: [[url|text]]
const OBJECT_LINK_SEARCH_RE: &str = r"(?m)\[\[(?P<url>[^\s\|]+)(\|(?P<text>.+?))?\]\]"; const OBJECT_LINK_SEARCH_RE: &str = r"(?m)\[\[(?P<url>[^\s\|]+)(\|(?P<text>.+?))?\]\]";
fn is_inside_code_block(caps: &Captures, text: &str) -> bool { pub fn is_inside_code_block(match_: &Match, text: &str) -> bool {
// TODO: remove workaround. // TODO: remove workaround.
// Perform replacement only inside text nodes during markdown parsing // Perform replacement only inside text nodes during markdown parsing
let text_before = &text[0..caps.name("url").unwrap().start()]; let text_before = &text[0..match_.start()];
let code_open = text_before.matches("<code>").count(); let code_open = text_before.matches("<code>").count();
let code_closed = text_before.matches("</code>").count(); let code_closed = text_before.matches("</code>").count();
code_open > code_closed code_open > code_closed
@ -24,10 +24,12 @@ fn find_object_links(text: &str) -> Vec<String> {
let link_re = Regex::new(OBJECT_LINK_SEARCH_RE).unwrap(); let link_re = Regex::new(OBJECT_LINK_SEARCH_RE).unwrap();
let mut links = vec![]; let mut links = vec![];
for caps in link_re.captures_iter(text) { for caps in link_re.captures_iter(text) {
let url = caps["url"].to_string(); let url_match = caps.name("url").expect("should have url group");
if is_inside_code_block(&caps, text) { if is_inside_code_block(&url_match, text) {
// Ignore links inside code blocks
continue; continue;
}; };
let url = caps["url"].to_string();
if !links.contains(&url) { if !links.contains(&url) {
links.push(url); links.push(url);
}; };
@ -60,12 +62,17 @@ pub fn replace_object_links(
) -> String { ) -> String {
let mention_re = Regex::new(OBJECT_LINK_SEARCH_RE).unwrap(); let mention_re = Regex::new(OBJECT_LINK_SEARCH_RE).unwrap();
let result = mention_re.replace_all(text, |caps: &Captures| { let result = mention_re.replace_all(text, |caps: &Captures| {
let url_match = caps.name("url").expect("should have url group");
if is_inside_code_block(&url_match, text) {
// Don't replace inside code blocks
return caps[0].to_string();
};
let url = caps["url"].to_string(); let url = caps["url"].to_string();
let link_text = caps.name("text") let link_text = caps.name("text")
.map(|match_| match_.as_str()) .map(|match_| match_.as_str())
.unwrap_or(&url) .unwrap_or(&url)
.to_string(); .to_string();
if link_map.contains_key(&url) && !is_inside_code_block(caps, text) { if link_map.contains_key(&url) {
return format!(r#"<a href="{0}">{1}</a>"#, url, link_text); return format!(r#"<a href="{0}">{1}</a>"#, url, link_text);
}; };
// Leave unchanged if post does not exist // Leave unchanged if post does not exist
@ -84,6 +91,16 @@ mod tests {
"test ([[https://example.org/2]])", "test ([[https://example.org/2]])",
); );
#[test]
fn test_is_inside_code_block() {
    let regexp = Regex::new("&&").unwrap();

    // "abc" (3 bytes) + "<code>" (6 bytes) puts the match at offset 9,
    // i.e. after an opening tag that has not been closed yet.
    let text = "abc<code>&&</code>xyz";
    let mat = regexp.find(text).unwrap();
    assert_eq!(mat.start(), 9);
    assert!(is_inside_code_block(&mat, text));

    // A match that follows the closing tag sees one "<code>" and one
    // "</code>" before it, so it must not be reported as inside a block.
    let text = "abc<code>x</code>&&";
    let mat = regexp.find(text).unwrap();
    assert_eq!(mat.start(), 17);
    assert!(!is_inside_code_block(&mat, text));
}
#[test] #[test]
fn test_find_object_links() { fn test_find_object_links() {
let results = find_object_links(TEXT_WITH_OBJECT_LINKS); let results = find_object_links(TEXT_WITH_OBJECT_LINKS);

View file

@ -8,6 +8,7 @@ use crate::database::DatabaseError;
use crate::errors::ValidationError; use crate::errors::ValidationError;
use crate::models::profiles::queries::get_profiles_by_accts; use crate::models::profiles::queries::get_profiles_by_accts;
use crate::models::profiles::types::DbActorProfile; use crate::models::profiles::types::DbActorProfile;
use super::links::is_inside_code_block;
// See also: ACTOR_ADDRESS_RE in activitypub::actors::types // See also: ACTOR_ADDRESS_RE in activitypub::actors::types
const MENTION_RE: &str = r"@?(?P<username>[\w\.-]+)@(?P<hostname>.+)"; const MENTION_RE: &str = r"@?(?P<username>[\w\.-]+)@(?P<hostname>.+)";
@ -23,6 +24,11 @@ fn find_mentions(
let mention_secondary_re = Regex::new(MENTION_SEARCH_SECONDARY_RE).unwrap(); let mention_secondary_re = Regex::new(MENTION_SEARCH_SECONDARY_RE).unwrap();
let mut mentions = vec![]; let mut mentions = vec![];
for caps in mention_re.captures_iter(text) { for caps in mention_re.captures_iter(text) {
let mention_match = caps.name("mention").expect("should have mention group");
if is_inside_code_block(&mention_match, text) {
// No mentions inside code blocks
continue;
};
if let Some(secondary_caps) = mention_secondary_re.captures(&caps["mention"]) { if let Some(secondary_caps) = mention_secondary_re.captures(&caps["mention"]) {
let username = secondary_caps["username"].to_string(); let username = secondary_caps["username"].to_string();
let hostname = secondary_caps.name("hostname") let hostname = secondary_caps.name("hostname")
@ -62,6 +68,11 @@ pub fn replace_mentions(
let mention_re = Regex::new(MENTION_SEARCH_RE).unwrap(); let mention_re = Regex::new(MENTION_SEARCH_RE).unwrap();
let mention_secondary_re = Regex::new(MENTION_SEARCH_SECONDARY_RE).unwrap(); let mention_secondary_re = Regex::new(MENTION_SEARCH_SECONDARY_RE).unwrap();
let result = mention_re.replace_all(text, |caps: &Captures| { let result = mention_re.replace_all(text, |caps: &Captures| {
let mention_match = caps.name("mention").expect("should have mention group");
if is_inside_code_block(&mention_match, text) {
// Don't replace mentions inside code blocks
return caps[0].to_string();
};
if let Some(secondary_caps) = mention_secondary_re.captures(&caps["mention"]) { if let Some(secondary_caps) = mention_secondary_re.captures(&caps["mention"]) {
let username = secondary_caps["username"].to_string(); let username = secondary_caps["username"].to_string();
let hostname = secondary_caps.name("hostname") let hostname = secondary_caps.name("hostname")