mirror of
https://github.com/LemmyNet/lemmy.git
synced 2025-01-23 07:18:21 +00:00
Adding clearurls crate to clean tracking params from links and markdown. (#5018)
* Adding clearurls crate to clean tracking params from links and markdown.
  - Thanks to @jenrdikw for creating this
  - Fixes #4905
* Upgrading to new version of clearurls
* Fix clippy
This commit is contained in:
parent
ff939e04fd
commit
5febf2b8fb
5 changed files with 59 additions and 23 deletions
15
Cargo.lock
generated
15
Cargo.lock
generated
|
@ -877,6 +877,20 @@ version = "0.7.2"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1462739cb27611015575c0c11df5df7601141071f07518d56fcc1be504cbec97"
|
||||
|
||||
[[package]]
|
||||
name = "clearurls"
|
||||
version = "0.0.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e291c00af89ac0a5b400d9ba46a682e38015ae3cd8926dbbe85b3b864d550be3"
|
||||
dependencies = [
|
||||
"linkify",
|
||||
"percent-encoding",
|
||||
"regex",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"url",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "clokwerk"
|
||||
version = "0.4.0"
|
||||
|
@ -2781,6 +2795,7 @@ dependencies = [
|
|||
"actix-web",
|
||||
"anyhow",
|
||||
"cfg-if",
|
||||
"clearurls",
|
||||
"deser-hjson",
|
||||
"diesel",
|
||||
"doku",
|
||||
|
|
|
@ -49,6 +49,7 @@ use lemmy_utils::{
|
|||
utils::{
|
||||
markdown::{markdown_check_for_blocked_urls, markdown_rewrite_image_links},
|
||||
slurs::{build_slur_regex, remove_slurs},
|
||||
validation::clean_urls_in_text,
|
||||
},
|
||||
CACHE_DURATION_FEDERATION,
|
||||
};
|
||||
|
@ -947,6 +948,7 @@ pub async fn process_markdown(
|
|||
context: &LemmyContext,
|
||||
) -> LemmyResult<String> {
|
||||
let text = remove_slurs(text, slur_regex);
|
||||
let text = clean_urls_in_text(&text);
|
||||
|
||||
markdown_check_for_blocked_urls(&text, url_blocklist)?;
|
||||
|
||||
|
|
|
@ -30,7 +30,7 @@ use i_love_jesus::CursorKey;
|
|||
use lemmy_utils::{
|
||||
error::{LemmyErrorExt, LemmyErrorType, LemmyResult},
|
||||
settings::SETTINGS,
|
||||
utils::validation::clean_url_params,
|
||||
utils::validation::clean_url,
|
||||
};
|
||||
use regex::Regex;
|
||||
use rustls::{
|
||||
|
@ -305,7 +305,7 @@ pub fn diesel_url_update(opt: Option<&str>) -> LemmyResult<Option<Option<DbUrl>>
|
|||
// An empty string is an erase
|
||||
Some("") => Ok(Some(None)),
|
||||
Some(str_url) => Url::parse(str_url)
|
||||
.map(|u| Some(Some(clean_url_params(&u).into())))
|
||||
.map(|u| Some(Some(clean_url(&u).into())))
|
||||
.with_lemmy_type(LemmyErrorType::InvalidUrl),
|
||||
None => Ok(None),
|
||||
}
|
||||
|
@ -316,7 +316,7 @@ pub fn diesel_url_update(opt: Option<&str>) -> LemmyResult<Option<Option<DbUrl>>
|
|||
pub fn diesel_url_create(opt: Option<&str>) -> LemmyResult<Option<DbUrl>> {
|
||||
match opt {
|
||||
Some(str_url) => Url::parse(str_url)
|
||||
.map(|u| Some(clean_url_params(&u).into()))
|
||||
.map(|u| Some(clean_url(&u).into()))
|
||||
.with_lemmy_type(LemmyErrorType::InvalidUrl),
|
||||
None => Ok(None),
|
||||
}
|
||||
|
|
|
@ -81,6 +81,7 @@ markdown-it = { version = "0.6.1", optional = true }
|
|||
ts-rs = { workspace = true, optional = true }
|
||||
enum-map = { workspace = true, optional = true }
|
||||
cfg-if = "1"
|
||||
clearurls = { version = "0.0.4", features = ["linkify"] }
|
||||
|
||||
[dev-dependencies]
|
||||
reqwest = { workspace = true }
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
use crate::error::{LemmyErrorExt, LemmyErrorType, LemmyResult};
|
||||
use clearurls::UrlCleaner;
|
||||
use itertools::Itertools;
|
||||
use regex::{Regex, RegexBuilder, RegexSet};
|
||||
use std::sync::LazyLock;
|
||||
|
@ -10,12 +11,8 @@ static VALID_MATRIX_ID_REGEX: LazyLock<Regex> = LazyLock::new(|| {
|
|||
.expect("compile regex")
|
||||
});
|
||||
// taken from https://en.wikipedia.org/wiki/UTM_parameters
|
||||
static CLEAN_URL_PARAMS_REGEX: LazyLock<Regex> = LazyLock::new(|| {
|
||||
Regex::new(
|
||||
r"^(utm_source|utm_medium|utm_campaign|utm_term|utm_content|gclid|gclsrc|dclid|fbclid)=",
|
||||
)
|
||||
.expect("compile regex")
|
||||
});
|
||||
static URL_CLEANER: LazyLock<UrlCleaner> =
|
||||
LazyLock::new(|| UrlCleaner::from_embedded_rules().expect("compile clearurls"));
|
||||
const ALLOWED_POST_URL_SCHEMES: [&str; 3] = ["http", "https", "magnet"];
|
||||
|
||||
const BODY_MAX_LENGTH: usize = 10000;
|
||||
|
@ -257,16 +254,22 @@ pub fn build_and_check_regex(regex_str_opt: &Option<&str>) -> LemmyResult<Option
|
|||
)
|
||||
}
|
||||
|
||||
pub fn clean_url_params(url: &Url) -> Url {
|
||||
let mut url_out = url.clone();
|
||||
if let Some(query) = url.query() {
|
||||
let new_query = query
|
||||
.split_inclusive('&')
|
||||
.filter(|q| !CLEAN_URL_PARAMS_REGEX.is_match(q))
|
||||
.collect::<String>();
|
||||
url_out.set_query(Some(&new_query));
|
||||
/// Cleans a url of tracking parameters.
|
||||
pub fn clean_url(url: &Url) -> Url {
|
||||
match URL_CLEANER.clear_single_url(url) {
|
||||
Ok(res) => res.into_owned(),
|
||||
// If there are any errors, just return the original url
|
||||
Err(_) => url.clone(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Cleans all the links in a string of tracking parameters.
|
||||
pub fn clean_urls_in_text(text: &str) -> String {
|
||||
match URL_CLEANER.clear_text(text) {
|
||||
Ok(res) => res.into_owned(),
|
||||
// If there are any errors, just return the original text
|
||||
Err(_) => text.to_owned(),
|
||||
}
|
||||
url_out
|
||||
}
|
||||
|
||||
pub fn check_site_visibility_valid(
|
||||
|
@ -357,7 +360,8 @@ mod tests {
|
|||
build_and_check_regex,
|
||||
check_site_visibility_valid,
|
||||
check_urls_are_valid,
|
||||
clean_url_params,
|
||||
clean_url,
|
||||
clean_urls_in_text,
|
||||
is_url_blocked,
|
||||
is_valid_actor_name,
|
||||
is_valid_bio_field,
|
||||
|
@ -378,18 +382,32 @@ mod tests {
|
|||
|
||||
#[test]
|
||||
fn test_clean_url_params() -> LemmyResult<()> {
|
||||
let url = Url::parse("https://example.com/path/123?utm_content=buffercf3b2&utm_medium=social&user+name=random+user%20&id=123")?;
|
||||
let cleaned = clean_url_params(&url);
|
||||
let expected = Url::parse("https://example.com/path/123?user+name=random+user%20&id=123")?;
|
||||
let url = Url::parse("https://example.com/path/123?utm_content=buffercf3b2&utm_medium=social&user+name=random+user&id=123")?;
|
||||
let cleaned = clean_url(&url);
|
||||
let expected = Url::parse("https://example.com/path/123?user+name=random+user&id=123")?;
|
||||
assert_eq!(expected.to_string(), cleaned.to_string());
|
||||
|
||||
let url = Url::parse("https://example.com/path/123")?;
|
||||
let cleaned = clean_url_params(&url);
|
||||
let cleaned = clean_url(&url);
|
||||
assert_eq!(url.to_string(), cleaned.to_string());
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_clean_body() -> LemmyResult<()> {
|
||||
let text = "[a link](https://example.com/path/123?utm_content=buffercf3b2&utm_medium=social&user+name=random+user&id=123)";
|
||||
let cleaned = clean_urls_in_text(text);
|
||||
let expected = "[a link](https://example.com/path/123?user+name=random+user&id=123)";
|
||||
assert_eq!(expected.to_string(), cleaned.to_string());
|
||||
|
||||
let text = "[a link](https://example.com/path/123)";
|
||||
let cleaned = clean_urls_in_text(text);
|
||||
assert_eq!(text.to_string(), cleaned);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn regex_checks() {
|
||||
assert!(is_valid_post_title("hi").is_err());
|
||||
|
|
Loading…
Reference in a new issue