mirror of
https://github.com/LemmyNet/lemmy.git
synced 2025-01-21 14:28:31 +00:00
Reduce false positives in URL blocklist to reduce scunthorpe problem by matching at word boundaries. (#5282)
This addresses an issue brought up on matrix where blocking rt.com resulted in links to deviantart.com getting blocked.
This commit is contained in:
parent
76034f058d
commit
f76322e3f5
3 changed files with 45 additions and 23 deletions
|
@ -552,7 +552,9 @@ pub async fn get_url_blocklist(context: &LemmyContext) -> LemmyResult<RegexSet>
|
||||||
let urls = LocalSiteUrlBlocklist::get_all(&mut context.pool()).await?;
|
let urls = LocalSiteUrlBlocklist::get_all(&mut context.pool()).await?;
|
||||||
|
|
||||||
// The urls are already validated on saving, so just escape them.
|
// The urls are already validated on saving, so just escape them.
|
||||||
let regexes = urls.iter().map(|url| escape(&url.url));
|
// If this regex creation changes it must be synced with
|
||||||
|
// lemmy_utils::utils::markdown::create_url_blocklist_test_regex_set.
|
||||||
|
let regexes = urls.iter().map(|url| format!(r"\b{}\b", escape(&url.url)));
|
||||||
|
|
||||||
let set = RegexSet::new(regexes)?;
|
let set = RegexSet::new(regexes)?;
|
||||||
Ok(set)
|
Ok(set)
|
||||||
|
|
|
@ -151,6 +151,8 @@ pub async fn update_site(
|
||||||
.ok();
|
.ok();
|
||||||
|
|
||||||
if let Some(url_blocklist) = data.blocked_urls.clone() {
|
if let Some(url_blocklist) = data.blocked_urls.clone() {
|
||||||
|
// If this validation changes it must be synced with
|
||||||
|
// lemmy_utils::utils::markdown::create_url_blocklist_test_regex_set.
|
||||||
let parsed_urls = check_urls_are_valid(&url_blocklist)?;
|
let parsed_urls = check_urls_are_valid(&url_blocklist)?;
|
||||||
LocalSiteUrlBlocklist::replace(&mut context.pool(), parsed_urls).await?;
|
LocalSiteUrlBlocklist::replace(&mut context.pool(), parsed_urls).await?;
|
||||||
}
|
}
|
||||||
|
|
|
@ -47,8 +47,10 @@ pub fn markdown_check_for_blocked_urls(text: &str, blocklist: &RegexSet) -> Lemm
|
||||||
mod tests {
|
mod tests {
|
||||||
|
|
||||||
use super::*;
|
use super::*;
|
||||||
|
use crate::utils::validation::check_urls_are_valid;
|
||||||
use image_links::markdown_rewrite_image_links;
|
use image_links::markdown_rewrite_image_links;
|
||||||
use pretty_assertions::assert_eq;
|
use pretty_assertions::assert_eq;
|
||||||
|
use regex::escape;
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_basic_markdown() {
|
fn test_basic_markdown() {
|
||||||
|
@ -191,9 +193,20 @@ mod tests {
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// This replicates the logic when saving url blocklist patterns and querying them.
|
||||||
|
// Refer to lemmy_api_crud::site::update::update_site and
|
||||||
|
// lemmy_api_common::utils::get_url_blocklist().
|
||||||
|
fn create_url_blocklist_test_regex_set(patterns: Vec<&str>) -> LemmyResult<RegexSet> {
|
||||||
|
let url_blocklist = patterns.iter().map(|&s| s.to_string()).collect();
|
||||||
|
let valid_urls = check_urls_are_valid(&url_blocklist)?;
|
||||||
|
let regexes = valid_urls.iter().map(|p| format!(r"\b{}\b", escape(p)));
|
||||||
|
let set = RegexSet::new(regexes)?;
|
||||||
|
Ok(set)
|
||||||
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_url_blocking() -> LemmyResult<()> {
|
fn test_url_blocking() -> LemmyResult<()> {
|
||||||
let set = RegexSet::new(vec![r"(https://)?example\.com/?"])?;
|
let set = create_url_blocklist_test_regex_set(vec!["example.com/"])?;
|
||||||
|
|
||||||
assert!(
|
assert!(
|
||||||
markdown_check_for_blocked_urls(&String::from("[](https://example.com)"), &set).is_err()
|
markdown_check_for_blocked_urls(&String::from("[](https://example.com)"), &set).is_err()
|
||||||
|
@ -221,37 +234,42 @@ mod tests {
|
||||||
)
|
)
|
||||||
.is_err());
|
.is_err());
|
||||||
|
|
||||||
let set = RegexSet::new(vec![r"(https://)?example\.com/spam\.jpg"])?;
|
let set = create_url_blocklist_test_regex_set(vec!["example.com/spam.jpg"])?;
|
||||||
assert!(markdown_check_for_blocked_urls(
|
assert!(markdown_check_for_blocked_urls("![](https://example.com/spam.jpg)", &set).is_err());
|
||||||
&String::from("![](https://example.com/spam.jpg)"),
|
assert!(markdown_check_for_blocked_urls("![](https://example.com/spam.jpg1)", &set).is_ok());
|
||||||
&set
|
// TODO: the following should not be matched, scunthorpe problem.
|
||||||
)
|
assert!(
|
||||||
.is_err());
|
markdown_check_for_blocked_urls("![](https://example.com/spam.jpg.html)", &set).is_err()
|
||||||
|
);
|
||||||
|
|
||||||
let set = RegexSet::new(vec![
|
let set = create_url_blocklist_test_regex_set(vec![
|
||||||
r"(https://)?quo\.example\.com/?",
|
r"quo.example.com/",
|
||||||
r"(https://)?foo\.example\.com/?",
|
r"foo.example.com/",
|
||||||
r"(https://)?bar\.example\.com/?",
|
r"bar.example.com/",
|
||||||
])?;
|
])?;
|
||||||
|
|
||||||
assert!(
|
assert!(markdown_check_for_blocked_urls("https://baz.example.com", &set).is_ok());
|
||||||
markdown_check_for_blocked_urls(&String::from("https://baz.example.com"), &set).is_ok()
|
|
||||||
);
|
|
||||||
|
|
||||||
assert!(
|
assert!(markdown_check_for_blocked_urls("https://bar.example.com", &set).is_err());
|
||||||
markdown_check_for_blocked_urls(&String::from("https://bar.example.com"), &set).is_err()
|
|
||||||
);
|
|
||||||
|
|
||||||
let set = RegexSet::new(vec![r"(https://)?example\.com/banned_page"])?;
|
let set = create_url_blocklist_test_regex_set(vec!["example.com/banned_page"])?;
|
||||||
|
|
||||||
assert!(
|
assert!(markdown_check_for_blocked_urls("https://example.com/page", &set).is_ok());
|
||||||
markdown_check_for_blocked_urls(&String::from("https://example.com/page"), &set).is_ok()
|
|
||||||
);
|
|
||||||
|
|
||||||
let set = RegexSet::new(vec![r"(https://)?ex\.mple\.com/?"])?;
|
let set = create_url_blocklist_test_regex_set(vec!["ex.mple.com/"])?;
|
||||||
|
|
||||||
assert!(markdown_check_for_blocked_urls("example.com", &set).is_ok());
|
assert!(markdown_check_for_blocked_urls("example.com", &set).is_ok());
|
||||||
|
|
||||||
|
let set = create_url_blocklist_test_regex_set(vec!["rt.com/"])?;
|
||||||
|
|
||||||
|
assert!(markdown_check_for_blocked_urls("deviantart.com", &set).is_ok());
|
||||||
|
assert!(markdown_check_for_blocked_urls("art.com.example.com", &set).is_ok());
|
||||||
|
assert!(markdown_check_for_blocked_urls("https://rt.com/abc", &set).is_err());
|
||||||
|
assert!(markdown_check_for_blocked_urls("go to rt.com.", &set).is_err());
|
||||||
|
assert!(markdown_check_for_blocked_urls("check out rt.computer", &set).is_ok());
|
||||||
|
// TODO: the following should not be matched, scunthorpe problem.
|
||||||
|
assert!(markdown_check_for_blocked_urls("rt.com.example.com", &set).is_err());
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue