Ignore mentions and hashtags inside code blocks

This commit is contained in:
silverpill 2022-12-18 00:38:29 +00:00
parent 67b1729621
commit 2b8063990a
3 changed files with 45 additions and 6 deletions

View file

@ -2,6 +2,7 @@ use regex::{Captures, Regex};
use crate::errors::ValidationError;
use crate::frontend::get_tag_page_url;
use super::links::is_inside_code_block;
const HASHTAG_RE: &str = r"(?m)(?P<before>^|\s|>|[\(])#(?P<tag>[^\s<]+)";
const HASHTAG_SECONDARY_RE: &str = r"^(?P<tag>[0-9A-Za-z]+)(?P<after>[\.,:?\)]?)$";
@ -13,6 +14,11 @@ pub fn find_hashtags(text: &str) -> Vec<String> {
let hashtag_secondary_re = Regex::new(HASHTAG_SECONDARY_RE).unwrap();
let mut tags = vec![];
for caps in hashtag_re.captures_iter(text) {
let tag_match = caps.name("tag").expect("should have tag group");
if is_inside_code_block(&tag_match, text) {
// Ignore hashtags inside code blocks
continue;
};
if let Some(secondary_caps) = hashtag_secondary_re.captures(&caps["tag"]) {
let tag_name = secondary_caps["tag"].to_string().to_lowercase();
if !tags.contains(&tag_name) {
@ -28,6 +34,11 @@ pub fn replace_hashtags(instance_url: &str, text: &str, tags: &[String]) -> Stri
let hashtag_re = Regex::new(HASHTAG_RE).unwrap();
let hashtag_secondary_re = Regex::new(HASHTAG_SECONDARY_RE).unwrap();
let result = hashtag_re.replace_all(text, |caps: &Captures| {
let tag_match = caps.name("tag").expect("should have tag group");
if is_inside_code_block(&tag_match, text) {
// Don't replace hashtags inside code blocks
return caps[0].to_string();
};
if let Some(secondary_caps) = hashtag_secondary_re.captures(&caps["tag"]) {
let before = caps["before"].to_string();
let tag = secondary_caps["tag"].to_string();

View file

@ -1,6 +1,6 @@
use std::collections::HashMap;
use regex::{Captures, Regex};
use regex::{Captures, Match, Regex};
use tokio_postgres::GenericClient;
use crate::database::DatabaseError;
@ -10,10 +10,10 @@ use super::types::Post;
// MediaWiki-like syntax: [[url|text]]
const OBJECT_LINK_SEARCH_RE: &str = r"(?m)\[\[(?P<url>[^\s\|]+)(\|(?P<text>.+?))?\]\]";
fn is_inside_code_block(caps: &Captures, text: &str) -> bool {
pub fn is_inside_code_block(match_: &Match, text: &str) -> bool {
// TODO: remove workaround.
// Perform replacement only inside text nodes during markdown parsing
let text_before = &text[0..caps.name("url").unwrap().start()];
let text_before = &text[0..match_.start()];
let code_open = text_before.matches("<code>").count();
let code_closed = text_before.matches("</code>").count();
code_open > code_closed
@ -24,10 +24,12 @@ fn find_object_links(text: &str) -> Vec<String> {
let link_re = Regex::new(OBJECT_LINK_SEARCH_RE).unwrap();
let mut links = vec![];
for caps in link_re.captures_iter(text) {
let url = caps["url"].to_string();
if is_inside_code_block(&caps, text) {
let url_match = caps.name("url").expect("should have url group");
if is_inside_code_block(&url_match, text) {
// Ignore links inside code blocks
continue;
};
let url = caps["url"].to_string();
if !links.contains(&url) {
links.push(url);
};
@ -60,12 +62,17 @@ pub fn replace_object_links(
) -> String {
let mention_re = Regex::new(OBJECT_LINK_SEARCH_RE).unwrap();
let result = mention_re.replace_all(text, |caps: &Captures| {
let url_match = caps.name("url").expect("should have url group");
if is_inside_code_block(&url_match, text) {
// Don't replace inside code blocks
return caps[0].to_string();
};
let url = caps["url"].to_string();
let link_text = caps.name("text")
.map(|match_| match_.as_str())
.unwrap_or(&url)
.to_string();
if link_map.contains_key(&url) && !is_inside_code_block(caps, text) {
if link_map.contains_key(&url) {
return format!(r#"<a href="{0}">{1}</a>"#, url, link_text);
};
// Leave unchanged if post does not exist
@ -84,6 +91,16 @@ mod tests {
"test ([[https://example.org/2]])",
);
#[test]
fn test_is_inside_code_block() {
    // Checks `is_inside_code_block` for matches located inside, before and
    // after a single `<code>…</code>` pair in the same text.
    let text = "abc<code>&&</code>xyz";

    // A match between `<code>` and `</code>` is inside a code block.
    let inside = Regex::new("&&").unwrap().find(text).unwrap();
    assert_eq!(inside.start(), 9);
    assert!(is_inside_code_block(&inside, text));

    // A match before the opening tag has no tags in front of it.
    let before = Regex::new("abc").unwrap().find(text).unwrap();
    assert_eq!(before.start(), 0);
    assert!(!is_inside_code_block(&before, text));

    // A match after the closing tag is preceded by one balanced open/close pair.
    let after = Regex::new("xyz").unwrap().find(text).unwrap();
    assert_eq!(after.start(), 18);
    assert!(!is_inside_code_block(&after, text));
}
#[test]
fn test_find_object_links() {
let results = find_object_links(TEXT_WITH_OBJECT_LINKS);

View file

@ -8,6 +8,7 @@ use crate::database::DatabaseError;
use crate::errors::ValidationError;
use crate::models::profiles::queries::get_profiles_by_accts;
use crate::models::profiles::types::DbActorProfile;
use super::links::is_inside_code_block;
// See also: ACTOR_ADDRESS_RE in activitypub::actors::types
const MENTION_RE: &str = r"@?(?P<username>[\w\.-]+)@(?P<hostname>.+)";
@ -23,6 +24,11 @@ fn find_mentions(
let mention_secondary_re = Regex::new(MENTION_SEARCH_SECONDARY_RE).unwrap();
let mut mentions = vec![];
for caps in mention_re.captures_iter(text) {
let mention_match = caps.name("mention").expect("should have mention group");
if is_inside_code_block(&mention_match, text) {
// No mentions inside code blocks
continue;
};
if let Some(secondary_caps) = mention_secondary_re.captures(&caps["mention"]) {
let username = secondary_caps["username"].to_string();
let hostname = secondary_caps.name("hostname")
@ -62,6 +68,11 @@ pub fn replace_mentions(
let mention_re = Regex::new(MENTION_SEARCH_RE).unwrap();
let mention_secondary_re = Regex::new(MENTION_SEARCH_SECONDARY_RE).unwrap();
let result = mention_re.replace_all(text, |caps: &Captures| {
let mention_match = caps.name("mention").expect("should have mention group");
if is_inside_code_block(&mention_match, text) {
// Don't replace mentions inside code blocks
return caps[0].to_string();
};
if let Some(secondary_caps) = mention_secondary_re.captures(&caps["mention"]) {
let username = secondary_caps["username"].to_string();
let hostname = secondary_caps.name("hostname")