From 6b1f3e4416d0e63b137a07774e50718f6753ea5d Mon Sep 17 00:00:00 2001 From: Dull Bananas Date: Fri, 23 May 2025 22:18:47 -0700 Subject: [PATCH] use aho-corasick for chunk filtering --- Cargo.lock | 1 + crates/db_schema_file/Cargo.toml | 1 + crates/db_schema_file/src/diff_check.rs | 143 +++++++++++++--------- crates/db_schema_file/src/schema_setup.rs | 23 ++-- 4 files changed, 102 insertions(+), 66 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 588d7361a..1972227bd 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3366,6 +3366,7 @@ dependencies = [ name = "lemmy_db_schema_file" version = "1.0.0-alpha.4" dependencies = [ + "aho-corasick", "anyhow", "chrono", "diesel", diff --git a/crates/db_schema_file/Cargo.toml b/crates/db_schema_file/Cargo.toml index 0b9dd260e..0d29c949c 100644 --- a/crates/db_schema_file/Cargo.toml +++ b/crates/db_schema_file/Cargo.toml @@ -46,3 +46,4 @@ tracing = { workspace = true, optional = true } serial_test = { workspace = true } diff = "0.1.13" itertools = { workspace = true } +aho-corasick = "1.1.3" diff --git a/crates/db_schema_file/src/diff_check.rs b/crates/db_schema_file/src/diff_check.rs index 4eb91acf4..12f21d722 100644 --- a/crates/db_schema_file/src/diff_check.rs +++ b/crates/db_schema_file/src/diff_check.rs @@ -1,5 +1,6 @@ #![cfg(test)] #![expect(clippy::expect_used)] +use aho_corasick::{AhoCorasick, AhoCorasickKind}; use itertools::Itertools; use lemmy_utils::settings::SETTINGS; use std::{ @@ -45,39 +46,83 @@ pub fn get_dump() -> String { String::from_utf8(output.stdout).expect("pg_dump output is not valid UTF-8 text") } -pub fn check_dump_diff(mut dumps: [&str; 2], label_of_change_from_dump_0_to_dump_1: &str) { - // Performance optimizations - if dumps[0] == dumps[1] { - return; - } - dumps = trim_matching_chunks_at_beginning_and_end(dumps); +pub struct DiffChecker { + ignored_chunk_prefixes: AhoCorasick, +} - let [before_chunks, after_chunks] = dumps.map(|dump| { - dump - .split("\n\n") - .filter_map(remove_ignored_details_from_chunk) - .sorted_unstable() - .collect::>() - }); - let diff_results = diff::slice(&before_chunks, &after_chunks); - let mut only_in_before = HashSet::new(); - let mut only_in_after = HashSet::new(); - for res in diff_results { - match res { - diff::Result::Left(chunk) => only_in_before.insert(&**chunk), - diff::Result::Right(chunk) => only_in_after.insert(&**chunk), - diff::Result::Both(_, _) => continue, - }; - } - - if !(only_in_before.is_empty() && only_in_after.is_empty()) { - panic!( - "{}", - [&label_of_change_from_dump_0_to_dump_1, "\n\n"] - .into_iter() - .chain(display_diffs(only_in_before, only_in_after)) - .collect::() +impl DiffChecker { + pub fn new() -> Self { + let prefixes = [ + "CREATE MATERIALIZED VIEW ", + "CREATE OR REPLACE VIEW ", + "CREATE VIEW ", + ] + .into_iter() + .map(|s| s.to_owned()) + .chain( + [ + "refresh_comment", + "refresh_comment_like", + "refresh_community", + "refresh_community_follower", + "refresh_community_user_ban", + "refresh_post", + "refresh_post_like", + "refresh_private_message", + "refresh_user", + ] + .into_iter() + .flat_map(|s| { + [ + format!("CREATE FUNCTION public.{s}("), + format!("CREATE TRIGGER {s} "), + ] + }), ); + + DiffChecker { + ignored_chunk_prefixes: AhoCorasick::builder() + .start_kind(aho_corasick::StartKind::Anchored) + .kind(Some(AhoCorasickKind::DFA)) + .build(prefixes) + .expect("AhoCorasick build failed"), + } + } + + pub fn check_dump_diff(&self, mut dumps: [&str; 2], label_of_change_from_dump_0_to_dump_1: &str) { + // Performance optimizations + if dumps[0] == dumps[1] { + return; + } + dumps = trim_matching_chunks_at_beginning_and_end(dumps); + + let [before_chunks, after_chunks] = dumps.map(|dump| { + dump + .split("\n\n") + .filter_map(|chunk| remove_ignored_details_from_chunk(chunk, &self.ignored_chunk_prefixes)) + .sorted_unstable() + .collect::>() + }); + let diff_results = diff::slice(&before_chunks, &after_chunks); + let mut only_in_before = HashSet::new(); + let mut only_in_after = HashSet::new(); + for res in diff_results { + match res { + diff::Result::Left(chunk) => only_in_before.insert(&**chunk), + diff::Result::Right(chunk) => only_in_after.insert(&**chunk), + diff::Result::Both(_, _) => continue, + }; + } + + if !(only_in_before.is_empty() && only_in_after.is_empty()) { + panic!( + "{}", + [&label_of_change_from_dump_0_to_dump_1, "\n\n"] + .into_iter() + .chain(display_diffs(only_in_before, only_in_after)) + .collect::() + ); + } } } @@ -213,25 +258,17 @@ fn count_inserted_or_deleted_chars_in_diff(a: &str, b: &str) -> usize { .count() } -fn remove_ignored_details_from_chunk(mut chunk: &str) -> Option> { +fn remove_ignored_details_from_chunk<'a>( + mut chunk: &'a str, + ignored_chunk_prefixes: &AhoCorasick, +) -> Option> { while let Some(s) = trim_start_of_chunk(chunk) { chunk = s; } - if chunk.is_empty() || - // Skip old views and fast table triggers - chunk.strip_prefix("CREATE ").is_some_and(|c| { - c - .starts_with("VIEW ") - || c.starts_with("OR REPLACE VIEW ") - || c.starts_with("MATERIALIZED VIEW ") - || c.strip_prefix("FUNCTION public.") - .and_then(after_skipped_trigger_name) - .is_some_and(|a| a.starts_with('(')) - || - c.strip_prefix("TRIGGER ") - .and_then(after_skipped_trigger_name) - .is_some_and(|a| a.starts_with(' ')) - }) { + if chunk.is_empty() + || ignored_chunk_prefixes + .is_match(aho_corasick::Input::new(chunk).anchored(aho_corasick::Anchored::Yes)) + { return None; } let mut chunk = Cow::Borrowed(chunk); @@ -281,18 +318,6 @@ fn after_first_occurence<'a>(s: &'a str, pat: &str) -> &'a str { s.split_once(pat).unwrap_or_default().1 } -fn after_skipped_trigger_name(s: &str) -> Option<&str> { - s.strip_prefix("refresh_comment_like") - .or_else(|| s.strip_prefix("refresh_comment")) - .or_else(|| s.strip_prefix("refresh_community_follower")) - .or_else(|| s.strip_prefix("refresh_community_user_ban")) - .or_else(|| s.strip_prefix("refresh_community")) - .or_else(|| s.strip_prefix("refresh_post_like")) - .or_else(|| s.strip_prefix("refresh_post")) - .or_else(|| s.strip_prefix("refresh_private_message")) - .or_else(|| s.strip_prefix("refresh_user")) -} - // cfg(test) would be redundant here mod tests { #[test] diff --git a/crates/db_schema_file/src/schema_setup.rs b/crates/db_schema_file/src/schema_setup.rs index 486702f7c..72ba80c6b 100644 --- a/crates/db_schema_file/src/schema_setup.rs +++ b/crates/db_schema_file/src/schema_setup.rs @@ -51,7 +51,7 @@ const REPLACEABLE_SCHEMA_PATH: &str = "crates/db_schema/replaceable_schema"; struct MigrationHarnessWrapper<'a> { conn: &'a mut PgConnection, #[cfg(test)] - enable_diff_check: bool, + diff_checker: Option<&'a diff_check::DiffChecker>, options: &'a Options, } @@ -80,7 +80,7 @@ impl MigrationHarness for MigrationHarnessWrapper<'_> { migration: &dyn Migration, ) -> diesel::migration::Result> { #[cfg(test)] - if self.enable_diff_check { + if let Some(diff_checker) = self.diff_checker { let before = diff_check::get_dump(); self.run_migration_inner(migration)?; @@ -88,7 +88,7 @@ impl MigrationHarness for MigrationHarnessWrapper<'_> { let after = diff_check::get_dump(); - diff_check::check_dump_diff( + diff_checker.check_dump_diff( [&after, &before], &format!( "These changes need to be applied in migrations/{}/down.sql:", @@ -180,6 +180,8 @@ pub enum Branch { pub fn run(options: Options) -> LemmyResult { let db_url = SETTINGS.get_database_url(); + #[cfg(test)] + let diff_checker = options.enable_diff_check.then(diff_check::DiffChecker::new); // Migrations don't support async connection, and this function doesn't need to be async let mut conn = PgConnection::establish(&db_url)?; @@ -216,7 +218,13 @@ pub fn run(options: Options) -> LemmyResult { // it existing revert_replaceable_schema(&mut conn)?; - run_selected_migrations(&mut conn, &options).map_err(convert_err)?; + run_selected_migrations( + &mut conn, + &options, + #[cfg(test)] + diff_checker.as_ref(), + ) + .map_err(convert_err)?; // Only run replaceable_schema if newest migration was applied let output = if (options.run && options.limit.is_none()) @@ -225,7 +233,7 @@ pub fn run(options: Options) -> LemmyResult { .map_err(convert_err)? { #[cfg(test)] - if options.enable_diff_check { + if let Some(diff_checker) = diff_checker { let before = diff_check::get_dump(); run_replaceable_schema(&mut conn)?; @@ -233,7 +241,7 @@ pub fn run(options: Options) -> LemmyResult { let after = diff_check::get_dump(); - diff_check::check_dump_diff([&before, &after], "The code in crates/db_schema/replaceable_schema incorrectly created or modified things outside of the `r` schema, causing these changes to be left behind after dropping the schema:"); + diff_checker.check_dump_diff([&before, &after], "The code in crates/db_schema/replaceable_schema incorrectly created or modified things outside of the `r` schema, causing these changes to be left behind after dropping the schema:"); } run_replaceable_schema(&mut conn)?; @@ -278,12 +286,13 @@ fn revert_replaceable_schema(conn: &mut PgConnection) -> LemmyResult<()> { fn run_selected_migrations( conn: &mut PgConnection, options: &Options, + #[cfg(test)] diff_checker: Option<&diff_check::DiffChecker>, ) -> diesel::migration::Result<()> { let mut wrapper = MigrationHarnessWrapper { conn, options, #[cfg(test)] - enable_diff_check: options.enable_diff_check, + diff_checker, }; if options.revert {