From a8b37977c79a98b96baebe40bc70ea7a3db46805 Mon Sep 17 00:00:00 2001 From: Dessalines Date: Mon, 16 Sep 2024 11:15:41 -0400 Subject: [PATCH] Adding clearurls crate to clean tracking params from links and markdown. (#5018) * Adding clearurls crate to clean tracking params from links and markdown. - Thanks to @jenrdikw for creating this - Fixes #4905 * Upgrading to new version of clearurls * Fix clippy --- Cargo.lock | 15 +++++++ crates/api_common/src/utils.rs | 2 + crates/db_schema/src/utils.rs | 6 +-- crates/utils/Cargo.toml | 1 + crates/utils/src/utils/validation.rs | 58 ++++++++++++++++++---------- 5 files changed, 59 insertions(+), 23 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 6fdaa0c98..8b21dce35 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1058,6 +1058,20 @@ version = "0.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4b82cf0babdbd58558212896d1a4272303a57bdb245c2bf1147185fb45640e70" +[[package]] +name = "clearurls" +version = "0.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e291c00af89ac0a5b400d9ba46a682e38015ae3cd8926dbbe85b3b864d550be3" +dependencies = [ + "linkify", + "percent-encoding", + "regex", + "serde", + "serde_json", + "url", +] + [[package]] name = "clokwerk" version = "0.4.0" @@ -3144,6 +3158,7 @@ dependencies = [ "actix-web", "anyhow", "cfg-if", + "clearurls", "deser-hjson", "diesel", "doku", diff --git a/crates/api_common/src/utils.rs b/crates/api_common/src/utils.rs index 0b8e56273..64cd41135 100644 --- a/crates/api_common/src/utils.rs +++ b/crates/api_common/src/utils.rs @@ -49,6 +49,7 @@ use lemmy_utils::{ utils::{ markdown::{markdown_check_for_blocked_urls, markdown_rewrite_image_links}, slurs::{build_slur_regex, remove_slurs}, + validation::clean_urls_in_text, }, CACHE_DURATION_FEDERATION, }; @@ -947,6 +948,7 @@ pub async fn process_markdown( context: &LemmyContext, ) -> LemmyResult { let text = remove_slurs(text, slur_regex); + let text = 
clean_urls_in_text(&text); markdown_check_for_blocked_urls(&text, url_blocklist)?; diff --git a/crates/db_schema/src/utils.rs b/crates/db_schema/src/utils.rs index b71c43495..8e4e35006 100644 --- a/crates/db_schema/src/utils.rs +++ b/crates/db_schema/src/utils.rs @@ -30,7 +30,7 @@ use i_love_jesus::CursorKey; use lemmy_utils::{ error::{LemmyErrorExt, LemmyErrorType, LemmyResult}, settings::SETTINGS, - utils::validation::clean_url_params, + utils::validation::clean_url, }; use regex::Regex; use rustls::{ @@ -305,7 +305,7 @@ pub fn diesel_url_update(opt: Option<&str>) -> LemmyResult> // An empty string is an erase Some("") => Ok(Some(None)), Some(str_url) => Url::parse(str_url) - .map(|u| Some(Some(clean_url_params(&u).into()))) + .map(|u| Some(Some(clean_url(&u).into()))) .with_lemmy_type(LemmyErrorType::InvalidUrl), None => Ok(None), } @@ -316,7 +316,7 @@ pub fn diesel_url_update(opt: Option<&str>) -> LemmyResult> pub fn diesel_url_create(opt: Option<&str>) -> LemmyResult> { match opt { Some(str_url) => Url::parse(str_url) - .map(|u| Some(clean_url_params(&u).into())) + .map(|u| Some(clean_url(&u).into())) .with_lemmy_type(LemmyErrorType::InvalidUrl), None => Ok(None), } diff --git a/crates/utils/Cargo.toml b/crates/utils/Cargo.toml index e94fce9d6..f83fa8ab1 100644 --- a/crates/utils/Cargo.toml +++ b/crates/utils/Cargo.toml @@ -83,6 +83,7 @@ markdown-it = { version = "0.6.1", optional = true } ts-rs = { workspace = true, optional = true } enum-map = { workspace = true, optional = true } cfg-if = "1" +clearurls = { version = "0.0.4", features = ["linkify"] } [dev-dependencies] reqwest = { workspace = true } diff --git a/crates/utils/src/utils/validation.rs b/crates/utils/src/utils/validation.rs index 0a59e2fea..b7e437ee1 100644 --- a/crates/utils/src/utils/validation.rs +++ b/crates/utils/src/utils/validation.rs @@ -1,4 +1,5 @@ use crate::error::{LemmyErrorExt, LemmyErrorType, LemmyResult}; +use clearurls::UrlCleaner; use itertools::Itertools; use regex::{Regex, 
RegexBuilder, RegexSet}; use std::sync::LazyLock; @@ -10,12 +11,8 @@ static VALID_MATRIX_ID_REGEX: LazyLock<Regex> = LazyLock::new(|| { .expect("compile regex") }); // taken from https://en.wikipedia.org/wiki/UTM_parameters -static CLEAN_URL_PARAMS_REGEX: LazyLock<Regex> = LazyLock::new(|| { - Regex::new( - r"^(utm_source|utm_medium|utm_campaign|utm_term|utm_content|gclid|gclsrc|dclid|fbclid)=", - ) - .expect("compile regex") -}); +static URL_CLEANER: LazyLock<UrlCleaner> = + LazyLock::new(|| UrlCleaner::from_embedded_rules().expect("compile clearurls")); const ALLOWED_POST_URL_SCHEMES: [&str; 3] = ["http", "https", "magnet"]; const BODY_MAX_LENGTH: usize = 10000; @@ -257,16 +254,22 @@ pub fn build_and_check_regex(regex_str_opt: &Option<&str>) -> LemmyResult