use aho-corasick for chunk filtering

This commit is contained in:
Dull Bananas 2025-05-23 22:18:47 -07:00
parent 664453cf88
commit 6b1f3e4416
4 changed files with 102 additions and 66 deletions

1
Cargo.lock generated
View file

@ -3366,6 +3366,7 @@ dependencies = [
name = "lemmy_db_schema_file"
version = "1.0.0-alpha.4"
dependencies = [
"aho-corasick",
"anyhow",
"chrono",
"diesel",

View file

@ -46,3 +46,4 @@ tracing = { workspace = true, optional = true }
serial_test = { workspace = true }
diff = "0.1.13"
itertools = { workspace = true }
aho-corasick = "1.1.3"

View file

@ -1,5 +1,6 @@
#![cfg(test)]
#![expect(clippy::expect_used)]
use aho_corasick::{AhoCorasick, AhoCorasickKind};
use itertools::Itertools;
use lemmy_utils::settings::SETTINGS;
use std::{
@ -45,39 +46,83 @@ pub fn get_dump() -> String {
String::from_utf8(output.stdout).expect("pg_dump output is not valid UTF-8 text")
}
pub fn check_dump_diff(mut dumps: [&str; 2], label_of_change_from_dump_0_to_dump_1: &str) {
// Performance optimizations
if dumps[0] == dumps[1] {
return;
}
dumps = trim_matching_chunks_at_beginning_and_end(dumps);
pub struct DiffChecker {
ignored_chunk_prefixes: AhoCorasick,
}
let [before_chunks, after_chunks] = dumps.map(|dump| {
dump
.split("\n\n")
.filter_map(remove_ignored_details_from_chunk)
.sorted_unstable()
.collect::<Vec<_>>()
});
let diff_results = diff::slice(&before_chunks, &after_chunks);
let mut only_in_before = HashSet::new();
let mut only_in_after = HashSet::new();
for res in diff_results {
match res {
diff::Result::Left(chunk) => only_in_before.insert(&**chunk),
diff::Result::Right(chunk) => only_in_after.insert(&**chunk),
diff::Result::Both(_, _) => continue,
};
}
if !(only_in_before.is_empty() && only_in_after.is_empty()) {
panic!(
"{}",
[&label_of_change_from_dump_0_to_dump_1, "\n\n"]
.into_iter()
.chain(display_diffs(only_in_before, only_in_after))
.collect::<String>()
impl DiffChecker {
pub fn new() -> Self {
let prefixes = [
"CREATE MATERIALIZED VIEW ",
"CREATE OR REPLACE VIEW ",
"CREATE VIEW ",
]
.into_iter()
.map(|s| s.to_owned())
.chain(
[
"refresh_comment",
"refresh_comment_like",
"refresh_community",
"refresh_community_follower",
"refresh_community_user_ban",
"refresh_post",
"refresh_post_like",
"refresh_private_message",
"refresh_user",
]
.into_iter()
.flat_map(|s| {
[
format!("CREATE FUNCTION public.{s}("),
format!("CREATE TRIGGER {s} "),
]
}),
);
DiffChecker {
ignored_chunk_prefixes: AhoCorasick::builder()
.start_kind(aho_corasick::StartKind::Anchored)
.kind(Some(AhoCorasickKind::DFA))
.build(prefixes)
.expect("AhoCorasick build failed"),
}
}
pub fn check_dump_diff(&self, mut dumps: [&str; 2], label_of_change_from_dump_0_to_dump_1: &str) {
// Performance optimizations
if dumps[0] == dumps[1] {
return;
}
dumps = trim_matching_chunks_at_beginning_and_end(dumps);
let [before_chunks, after_chunks] = dumps.map(|dump| {
dump
.split("\n\n")
.filter_map(|chunk| remove_ignored_details_from_chunk(chunk, &self.ignored_chunk_prefixes))
.sorted_unstable()
.collect::<Vec<_>>()
});
let diff_results = diff::slice(&before_chunks, &after_chunks);
let mut only_in_before = HashSet::new();
let mut only_in_after = HashSet::new();
for res in diff_results {
match res {
diff::Result::Left(chunk) => only_in_before.insert(&**chunk),
diff::Result::Right(chunk) => only_in_after.insert(&**chunk),
diff::Result::Both(_, _) => continue,
};
}
if !(only_in_before.is_empty() && only_in_after.is_empty()) {
panic!(
"{}",
[&label_of_change_from_dump_0_to_dump_1, "\n\n"]
.into_iter()
.chain(display_diffs(only_in_before, only_in_after))
.collect::<String>()
);
}
}
}
@ -213,25 +258,17 @@ fn count_inserted_or_deleted_chars_in_diff(a: &str, b: &str) -> usize {
.count()
}
fn remove_ignored_details_from_chunk(mut chunk: &str) -> Option<Cow<'_, str>> {
fn remove_ignored_details_from_chunk<'a>(
mut chunk: &'a str,
ignored_chunk_prefixes: &AhoCorasick,
) -> Option<Cow<'a, str>> {
while let Some(s) = trim_start_of_chunk(chunk) {
chunk = s;
}
if chunk.is_empty() ||
// Skip old views and fast table triggers
chunk.strip_prefix("CREATE ").is_some_and(|c| {
c
.starts_with("VIEW ")
|| c.starts_with("OR REPLACE VIEW ")
|| c.starts_with("MATERIALIZED VIEW ")
|| c.strip_prefix("FUNCTION public.")
.and_then(after_skipped_trigger_name)
.is_some_and(|a| a.starts_with('('))
||
c.strip_prefix("TRIGGER ")
.and_then(after_skipped_trigger_name)
.is_some_and(|a| a.starts_with(' '))
}) {
if chunk.is_empty()
|| ignored_chunk_prefixes
.is_match(aho_corasick::Input::new(chunk).anchored(aho_corasick::Anchored::Yes))
{
return None;
}
let mut chunk = Cow::Borrowed(chunk);
@ -281,18 +318,6 @@ fn after_first_occurence<'a>(s: &'a str, pat: &str) -> &'a str {
s.split_once(pat).unwrap_or_default().1
}
fn after_skipped_trigger_name(s: &str) -> Option<&str> {
s.strip_prefix("refresh_comment_like")
.or_else(|| s.strip_prefix("refresh_comment"))
.or_else(|| s.strip_prefix("refresh_community_follower"))
.or_else(|| s.strip_prefix("refresh_community_user_ban"))
.or_else(|| s.strip_prefix("refresh_community"))
.or_else(|| s.strip_prefix("refresh_post_like"))
.or_else(|| s.strip_prefix("refresh_post"))
.or_else(|| s.strip_prefix("refresh_private_message"))
.or_else(|| s.strip_prefix("refresh_user"))
}
// cfg(test) would be redundant here
mod tests {
#[test]

View file

@ -51,7 +51,7 @@ const REPLACEABLE_SCHEMA_PATH: &str = "crates/db_schema/replaceable_schema";
struct MigrationHarnessWrapper<'a> {
conn: &'a mut PgConnection,
#[cfg(test)]
enable_diff_check: bool,
diff_checker: Option<&'a diff_check::DiffChecker>,
options: &'a Options,
}
@ -80,7 +80,7 @@ impl MigrationHarness<Pg> for MigrationHarnessWrapper<'_> {
migration: &dyn Migration<Pg>,
) -> diesel::migration::Result<MigrationVersion<'static>> {
#[cfg(test)]
if self.enable_diff_check {
if let Some(diff_checker) = self.diff_checker {
let before = diff_check::get_dump();
self.run_migration_inner(migration)?;
@ -88,7 +88,7 @@ impl MigrationHarness<Pg> for MigrationHarnessWrapper<'_> {
let after = diff_check::get_dump();
diff_check::check_dump_diff(
diff_checker.check_dump_diff(
[&after, &before],
&format!(
"These changes need to be applied in migrations/{}/down.sql:",
@ -180,6 +180,8 @@ pub enum Branch {
pub fn run(options: Options) -> LemmyResult<Branch> {
let db_url = SETTINGS.get_database_url();
#[cfg(test)]
let diff_checker = options.enable_diff_check.then(diff_check::DiffChecker::new);
// Migrations don't support async connection, and this function doesn't need to be async
let mut conn = PgConnection::establish(&db_url)?;
@ -216,7 +218,13 @@ pub fn run(options: Options) -> LemmyResult<Branch> {
// it existing
revert_replaceable_schema(&mut conn)?;
run_selected_migrations(&mut conn, &options).map_err(convert_err)?;
run_selected_migrations(
&mut conn,
&options,
#[cfg(test)]
diff_checker.as_ref(),
)
.map_err(convert_err)?;
// Only run replaceable_schema if newest migration was applied
let output = if (options.run && options.limit.is_none())
@ -225,7 +233,7 @@ pub fn run(options: Options) -> LemmyResult<Branch> {
.map_err(convert_err)?
{
#[cfg(test)]
if options.enable_diff_check {
if let Some(diff_checker) = diff_checker {
let before = diff_check::get_dump();
run_replaceable_schema(&mut conn)?;
@ -233,7 +241,7 @@ pub fn run(options: Options) -> LemmyResult<Branch> {
let after = diff_check::get_dump();
diff_check::check_dump_diff([&before, &after], "The code in crates/db_schema/replaceable_schema incorrectly created or modified things outside of the `r` schema, causing these changes to be left behind after dropping the schema:");
diff_checker.check_dump_diff([&before, &after], "The code in crates/db_schema/replaceable_schema incorrectly created or modified things outside of the `r` schema, causing these changes to be left behind after dropping the schema:");
}
run_replaceable_schema(&mut conn)?;
@ -278,12 +286,13 @@ fn revert_replaceable_schema(conn: &mut PgConnection) -> LemmyResult<()> {
fn run_selected_migrations(
conn: &mut PgConnection,
options: &Options,
#[cfg(test)] diff_checker: Option<&diff_check::DiffChecker>,
) -> diesel::migration::Result<()> {
let mut wrapper = MigrationHarnessWrapper {
conn,
options,
#[cfg(test)]
enable_diff_check: options.enable_diff_check,
diff_checker,
};
if options.revert {