use aho-corasick for chunk filtering

This commit is contained in:
Dull Bananas 2025-05-23 22:18:47 -07:00
parent 664453cf88
commit 6b1f3e4416
4 changed files with 102 additions and 66 deletions

1
Cargo.lock generated
View file

@ -3366,6 +3366,7 @@ dependencies = [
name = "lemmy_db_schema_file" name = "lemmy_db_schema_file"
version = "1.0.0-alpha.4" version = "1.0.0-alpha.4"
dependencies = [ dependencies = [
"aho-corasick",
"anyhow", "anyhow",
"chrono", "chrono",
"diesel", "diesel",

View file

@ -46,3 +46,4 @@ tracing = { workspace = true, optional = true }
serial_test = { workspace = true } serial_test = { workspace = true }
diff = "0.1.13" diff = "0.1.13"
itertools = { workspace = true } itertools = { workspace = true }
aho-corasick = "1.1.3"

View file

@ -1,5 +1,6 @@
#![cfg(test)] #![cfg(test)]
#![expect(clippy::expect_used)] #![expect(clippy::expect_used)]
use aho_corasick::{AhoCorasick, AhoCorasickKind};
use itertools::Itertools; use itertools::Itertools;
use lemmy_utils::settings::SETTINGS; use lemmy_utils::settings::SETTINGS;
use std::{ use std::{
@ -45,39 +46,83 @@ pub fn get_dump() -> String {
String::from_utf8(output.stdout).expect("pg_dump output is not valid UTF-8 text") String::from_utf8(output.stdout).expect("pg_dump output is not valid UTF-8 text")
} }
pub fn check_dump_diff(mut dumps: [&str; 2], label_of_change_from_dump_0_to_dump_1: &str) { pub struct DiffChecker {
// Performance optimizations ignored_chunk_prefixes: AhoCorasick,
if dumps[0] == dumps[1] { }
return;
}
dumps = trim_matching_chunks_at_beginning_and_end(dumps);
let [before_chunks, after_chunks] = dumps.map(|dump| { impl DiffChecker {
dump pub fn new() -> Self {
.split("\n\n") let prefixes = [
.filter_map(remove_ignored_details_from_chunk) "CREATE MATERIALIZED VIEW ",
.sorted_unstable() "CREATE OR REPLACE VIEW ",
.collect::<Vec<_>>() "CREATE VIEW ",
}); ]
let diff_results = diff::slice(&before_chunks, &after_chunks); .into_iter()
let mut only_in_before = HashSet::new(); .map(|s| s.to_owned())
let mut only_in_after = HashSet::new(); .chain(
for res in diff_results { [
match res { "refresh_comment",
diff::Result::Left(chunk) => only_in_before.insert(&**chunk), "refresh_comment_like",
diff::Result::Right(chunk) => only_in_after.insert(&**chunk), "refresh_community",
diff::Result::Both(_, _) => continue, "refresh_community_follower",
}; "refresh_community_user_ban",
} "refresh_post",
"refresh_post_like",
if !(only_in_before.is_empty() && only_in_after.is_empty()) { "refresh_private_message",
panic!( "refresh_user",
"{}", ]
[&label_of_change_from_dump_0_to_dump_1, "\n\n"] .into_iter()
.into_iter() .flat_map(|s| {
.chain(display_diffs(only_in_before, only_in_after)) [
.collect::<String>() format!("CREATE FUNCTION public.{s}("),
format!("CREATE TRIGGER {s} "),
]
}),
); );
DiffChecker {
ignored_chunk_prefixes: AhoCorasick::builder()
.start_kind(aho_corasick::StartKind::Anchored)
.kind(Some(AhoCorasickKind::DFA))
.build(prefixes)
.expect("AhoCorasick build failed"),
}
}
pub fn check_dump_diff(&self, mut dumps: [&str; 2], label_of_change_from_dump_0_to_dump_1: &str) {
// Performance optimizations
if dumps[0] == dumps[1] {
return;
}
dumps = trim_matching_chunks_at_beginning_and_end(dumps);
let [before_chunks, after_chunks] = dumps.map(|dump| {
dump
.split("\n\n")
.filter_map(|chunk| remove_ignored_details_from_chunk(chunk, &self.ignored_chunk_prefixes))
.sorted_unstable()
.collect::<Vec<_>>()
});
let diff_results = diff::slice(&before_chunks, &after_chunks);
let mut only_in_before = HashSet::new();
let mut only_in_after = HashSet::new();
for res in diff_results {
match res {
diff::Result::Left(chunk) => only_in_before.insert(&**chunk),
diff::Result::Right(chunk) => only_in_after.insert(&**chunk),
diff::Result::Both(_, _) => continue,
};
}
if !(only_in_before.is_empty() && only_in_after.is_empty()) {
panic!(
"{}",
[&label_of_change_from_dump_0_to_dump_1, "\n\n"]
.into_iter()
.chain(display_diffs(only_in_before, only_in_after))
.collect::<String>()
);
}
} }
} }
@ -213,25 +258,17 @@ fn count_inserted_or_deleted_chars_in_diff(a: &str, b: &str) -> usize {
.count() .count()
} }
fn remove_ignored_details_from_chunk(mut chunk: &str) -> Option<Cow<'_, str>> { fn remove_ignored_details_from_chunk<'a>(
mut chunk: &'a str,
ignored_chunk_prefixes: &AhoCorasick,
) -> Option<Cow<'a, str>> {
while let Some(s) = trim_start_of_chunk(chunk) { while let Some(s) = trim_start_of_chunk(chunk) {
chunk = s; chunk = s;
} }
if chunk.is_empty() || if chunk.is_empty()
// Skip old views and fast table triggers || ignored_chunk_prefixes
chunk.strip_prefix("CREATE ").is_some_and(|c| { .is_match(aho_corasick::Input::new(chunk).anchored(aho_corasick::Anchored::Yes))
c {
.starts_with("VIEW ")
|| c.starts_with("OR REPLACE VIEW ")
|| c.starts_with("MATERIALIZED VIEW ")
|| c.strip_prefix("FUNCTION public.")
.and_then(after_skipped_trigger_name)
.is_some_and(|a| a.starts_with('('))
||
c.strip_prefix("TRIGGER ")
.and_then(after_skipped_trigger_name)
.is_some_and(|a| a.starts_with(' '))
}) {
return None; return None;
} }
let mut chunk = Cow::Borrowed(chunk); let mut chunk = Cow::Borrowed(chunk);
@ -281,18 +318,6 @@ fn after_first_occurence<'a>(s: &'a str, pat: &str) -> &'a str {
s.split_once(pat).unwrap_or_default().1 s.split_once(pat).unwrap_or_default().1
} }
fn after_skipped_trigger_name(s: &str) -> Option<&str> {
s.strip_prefix("refresh_comment_like")
.or_else(|| s.strip_prefix("refresh_comment"))
.or_else(|| s.strip_prefix("refresh_community_follower"))
.or_else(|| s.strip_prefix("refresh_community_user_ban"))
.or_else(|| s.strip_prefix("refresh_community"))
.or_else(|| s.strip_prefix("refresh_post_like"))
.or_else(|| s.strip_prefix("refresh_post"))
.or_else(|| s.strip_prefix("refresh_private_message"))
.or_else(|| s.strip_prefix("refresh_user"))
}
// cfg(test) would be redundant here // cfg(test) would be redundant here
mod tests { mod tests {
#[test] #[test]

View file

@ -51,7 +51,7 @@ const REPLACEABLE_SCHEMA_PATH: &str = "crates/db_schema/replaceable_schema";
struct MigrationHarnessWrapper<'a> { struct MigrationHarnessWrapper<'a> {
conn: &'a mut PgConnection, conn: &'a mut PgConnection,
#[cfg(test)] #[cfg(test)]
enable_diff_check: bool, diff_checker: Option<&'a diff_check::DiffChecker>,
options: &'a Options, options: &'a Options,
} }
@ -80,7 +80,7 @@ impl MigrationHarness<Pg> for MigrationHarnessWrapper<'_> {
migration: &dyn Migration<Pg>, migration: &dyn Migration<Pg>,
) -> diesel::migration::Result<MigrationVersion<'static>> { ) -> diesel::migration::Result<MigrationVersion<'static>> {
#[cfg(test)] #[cfg(test)]
if self.enable_diff_check { if let Some(diff_checker) = self.diff_checker {
let before = diff_check::get_dump(); let before = diff_check::get_dump();
self.run_migration_inner(migration)?; self.run_migration_inner(migration)?;
@ -88,7 +88,7 @@ impl MigrationHarness<Pg> for MigrationHarnessWrapper<'_> {
let after = diff_check::get_dump(); let after = diff_check::get_dump();
diff_check::check_dump_diff( diff_checker.check_dump_diff(
[&after, &before], [&after, &before],
&format!( &format!(
"These changes need to be applied in migrations/{}/down.sql:", "These changes need to be applied in migrations/{}/down.sql:",
@ -180,6 +180,8 @@ pub enum Branch {
pub fn run(options: Options) -> LemmyResult<Branch> { pub fn run(options: Options) -> LemmyResult<Branch> {
let db_url = SETTINGS.get_database_url(); let db_url = SETTINGS.get_database_url();
#[cfg(test)]
let diff_checker = options.enable_diff_check.then(diff_check::DiffChecker::new);
// Migrations don't support async connection, and this function doesn't need to be async // Migrations don't support async connection, and this function doesn't need to be async
let mut conn = PgConnection::establish(&db_url)?; let mut conn = PgConnection::establish(&db_url)?;
@ -216,7 +218,13 @@ pub fn run(options: Options) -> LemmyResult<Branch> {
// it existing // it existing
revert_replaceable_schema(&mut conn)?; revert_replaceable_schema(&mut conn)?;
run_selected_migrations(&mut conn, &options).map_err(convert_err)?; run_selected_migrations(
&mut conn,
&options,
#[cfg(test)]
diff_checker.as_ref(),
)
.map_err(convert_err)?;
// Only run replaceable_schema if newest migration was applied // Only run replaceable_schema if newest migration was applied
let output = if (options.run && options.limit.is_none()) let output = if (options.run && options.limit.is_none())
@ -225,7 +233,7 @@ pub fn run(options: Options) -> LemmyResult<Branch> {
.map_err(convert_err)? .map_err(convert_err)?
{ {
#[cfg(test)] #[cfg(test)]
if options.enable_diff_check { if let Some(diff_checker) = diff_checker {
let before = diff_check::get_dump(); let before = diff_check::get_dump();
run_replaceable_schema(&mut conn)?; run_replaceable_schema(&mut conn)?;
@ -233,7 +241,7 @@ pub fn run(options: Options) -> LemmyResult<Branch> {
let after = diff_check::get_dump(); let after = diff_check::get_dump();
diff_check::check_dump_diff([&before, &after], "The code in crates/db_schema/replaceable_schema incorrectly created or modified things outside of the `r` schema, causing these changes to be left behind after dropping the schema:"); diff_checker.check_dump_diff([&before, &after], "The code in crates/db_schema/replaceable_schema incorrectly created or modified things outside of the `r` schema, causing these changes to be left behind after dropping the schema:");
} }
run_replaceable_schema(&mut conn)?; run_replaceable_schema(&mut conn)?;
@ -278,12 +286,13 @@ fn revert_replaceable_schema(conn: &mut PgConnection) -> LemmyResult<()> {
fn run_selected_migrations( fn run_selected_migrations(
conn: &mut PgConnection, conn: &mut PgConnection,
options: &Options, options: &Options,
#[cfg(test)] diff_checker: Option<&diff_check::DiffChecker>,
) -> diesel::migration::Result<()> { ) -> diesel::migration::Result<()> {
let mut wrapper = MigrationHarnessWrapper { let mut wrapper = MigrationHarnessWrapper {
conn, conn,
options, options,
#[cfg(test)] #[cfg(test)]
enable_diff_check: options.enable_diff_check, diff_checker,
}; };
if options.revert { if options.revert {