From 9785b2084357fe614ec615ef15123891245d93c3 Mon Sep 17 00:00:00 2001 From: Dessalines Date: Wed, 6 Sep 2023 13:43:27 -0400 Subject: [PATCH] Adding a scaled sort, to boost smaller communities. (#3907) * Adding a scaled sort, to boost smaller communities. - Previously referred to as *best* . - Fixes #3622 * Fixing scheduled task update. * Converting hot_rank integers to floats. * Altering hot_rank psql function to default to zero after a week. * Setting scaled_rank to zero, where hot_rank is zero. * Adding image_upload table. --- .../src/activities/create_or_update/post.rs | 2 +- .../src/aggregates/post_aggregates.rs | 27 +++++- crates/db_schema/src/aggregates/structs.rs | 12 +-- crates/db_schema/src/lib.rs | 2 + crates/db_schema/src/schema.rs | 21 ++++- crates/db_schema/src/utils.rs | 8 +- crates/db_views/src/comment_report_view.rs | 2 +- crates/db_views/src/comment_view.rs | 2 +- crates/db_views/src/post_view.rs | 8 +- crates/db_views_actor/src/community_view.rs | 2 +- .../2023-08-23-182533_scaled_rank/down.sql | 87 +++++++++++++++++++ .../2023-08-23-182533_scaled_rank/up.sql | 74 ++++++++++++++++ src/scheduled_tasks.rs | 63 +++++++++++--- 13 files changed, 279 insertions(+), 31 deletions(-) create mode 100644 migrations/2023-08-23-182533_scaled_rank/down.sql create mode 100644 migrations/2023-08-23-182533_scaled_rank/up.sql diff --git a/crates/apub/src/activities/create_or_update/post.rs b/crates/apub/src/activities/create_or_update/post.rs index cf4a6af1f..99e7a3d45 100644 --- a/crates/apub/src/activities/create_or_update/post.rs +++ b/crates/apub/src/activities/create_or_update/post.rs @@ -150,7 +150,7 @@ impl ActivityHandler for CreateOrUpdatePage { PostLike::like(&mut context.pool(), &like_form).await?; // Calculate initial hot_rank for post - PostAggregates::update_hot_rank(&mut context.pool(), post.id).await?; + PostAggregates::update_ranks(&mut context.pool(), post.id).await?; Ok(()) } diff --git a/crates/db_schema/src/aggregates/post_aggregates.rs b/crates/db_schema/src/aggregates/post_aggregates.rs index 7847f3946..1816ee5f3 100644 --- a/crates/db_schema/src/aggregates/post_aggregates.rs +++ b/crates/db_schema/src/aggregates/post_aggregates.rs @@ -1,10 +1,14 @@ use crate::{ aggregates::structs::PostAggregates, newtypes::PostId, - schema::post_aggregates, - utils::{functions::hot_rank, get_conn, DbPool}, + schema::{community_aggregates, post, post_aggregates}, + utils::{ + functions::{hot_rank, scaled_rank}, + get_conn, + DbPool, + }, }; -use diesel::{result::Error, ExpressionMethods, QueryDsl}; +use diesel::{result::Error, ExpressionMethods, JoinOnDsl, QueryDsl}; use diesel_async::RunQueryDsl; impl PostAggregates { @@ -16,9 +20,19 @@ impl PostAggregates { .await } - pub async fn update_hot_rank(pool: &mut DbPool<'_>, post_id: PostId) -> Result { + pub async fn update_ranks(pool: &mut DbPool<'_>, post_id: PostId) -> Result { let conn = &mut get_conn(pool).await?; + // Diesel can't update based on a join, which is necessary for the scaled_rank + // https://github.com/diesel-rs/diesel/issues/1478 + // Just select the users_active_month manually for now, since its a single post anyway + let users_active_month = community_aggregates::table + .select(community_aggregates::users_active_month) + .inner_join(post::table.on(community_aggregates::community_id.eq(post::community_id))) + .filter(post::id.eq(post_id)) + .first::(conn) + .await?; + diesel::update(post_aggregates::table) .filter(post_aggregates::post_id.eq(post_id)) .set(( @@ -27,6 +41,11 @@ impl PostAggregates { post_aggregates::score, post_aggregates::newest_comment_time_necro, )), + post_aggregates::scaled_rank.eq(scaled_rank( + post_aggregates::score, + post_aggregates::published, + users_active_month, + )), )) .get_result::(conn) .await diff --git a/crates/db_schema/src/aggregates/structs.rs b/crates/db_schema/src/aggregates/structs.rs index f34ff9c89..f24f02f49 100644 --- a/crates/db_schema/src/aggregates/structs.rs +++ b/crates/db_schema/src/aggregates/structs.rs @@ -27,11 +27,11 @@ pub struct CommentAggregates { pub published: DateTime, /// The total number of children in this comment branch. pub child_count: i32, - pub hot_rank: i32, + pub hot_rank: f64, pub controversy_rank: f64, } -#[derive(PartialEq, Eq, Debug, Serialize, Deserialize, Clone)] +#[derive(PartialEq, Debug, Serialize, Deserialize, Clone)] #[cfg_attr(feature = "full", derive(Queryable, Associations, Identifiable, TS))] #[cfg_attr(feature = "full", diesel(table_name = community_aggregates))] #[cfg_attr( @@ -55,7 +55,7 @@ pub struct CommunityAggregates { pub users_active_month: i64, /// The number of users with any activity in the last year. pub users_active_half_year: i64, - pub hot_rank: i32, + pub hot_rank: f64, } #[derive(PartialEq, Eq, Debug, Serialize, Deserialize, Clone, Default)] @@ -95,11 +95,13 @@ pub struct PostAggregates { pub featured_community: bool, /// If the post is featured on the site / to local. pub featured_local: bool, - pub hot_rank: i32, - pub hot_rank_active: i32, + pub hot_rank: f64, + pub hot_rank_active: f64, pub community_id: CommunityId, pub creator_id: PersonId, pub controversy_rank: f64, + /// A rank that amplifies smaller communities + pub scaled_rank: f64, } #[derive(PartialEq, Eq, Debug, Serialize, Deserialize, Clone)] diff --git a/crates/db_schema/src/lib.rs b/crates/db_schema/src/lib.rs index 079a1fe39..80fc9ffb7 100644 --- a/crates/db_schema/src/lib.rs +++ b/crates/db_schema/src/lib.rs @@ -54,6 +54,7 @@ use ts_rs::TS; )] #[cfg_attr(feature = "full", DbValueStyle = "verbatim")] #[cfg_attr(feature = "full", ts(export))] +// TODO add the controversial and scaled rankings to the doc below /// The post sort types. See here for descriptions: https://join-lemmy.org/docs/en/users/03-votes-and-ranking.html pub enum SortType { #[default] @@ -75,6 +76,7 @@ pub enum SortType { TopSixMonths, TopNineMonths, Controversial, + Scaled, } #[derive(EnumString, Display, Debug, Serialize, Deserialize, Clone, Copy)] diff --git a/crates/db_schema/src/schema.rs b/crates/db_schema/src/schema.rs index ffcdbebe8..1ffc14116 100644 --- a/crates/db_schema/src/schema.rs +++ b/crates/db_schema/src/schema.rs @@ -100,7 +100,7 @@ diesel::table! { downvotes -> Int8, published -> Timestamptz, child_count -> Int4, - hot_rank -> Int4, + hot_rank -> Float8, controversy_rank -> Float8, } } @@ -198,7 +198,7 @@ diesel::table! { users_active_week -> Int8, users_active_month -> Int8, users_active_half_year -> Int8, - hot_rank -> Int4, + hot_rank -> Float8, } } @@ -299,6 +299,16 @@ diesel::table! { } } +diesel::table! { + image_upload (id) { + id -> Int4, + local_user_id -> Int4, + pictrs_alias -> Text, + pictrs_delete_token -> Text, + published -> Timestamptz, + } +} + diesel::table! { instance (id) { id -> Int4, @@ -683,11 +693,12 @@ diesel::table! { newest_comment_time -> Timestamptz, featured_community -> Bool, featured_local -> Bool, - hot_rank -> Int4, - hot_rank_active -> Int4, + hot_rank -> Float8, + hot_rank_active -> Float8, community_id -> Int4, creator_id -> Int4, controversy_rank -> Float8, + scaled_rank -> Float8, } } @@ -893,6 +904,7 @@ diesel::joinable!(custom_emoji_keyword -> custom_emoji (custom_emoji_id)); diesel::joinable!(email_verification -> local_user (local_user_id)); diesel::joinable!(federation_allowlist -> instance (instance_id)); diesel::joinable!(federation_blocklist -> instance (instance_id)); +diesel::joinable!(image_upload -> local_user (local_user_id)); diesel::joinable!(local_site -> site (site_id)); diesel::joinable!(local_site_rate_limit -> local_site (local_site_id)); diesel::joinable!(local_user -> person (person_id)); @@ -967,6 +979,7 @@ diesel::allow_tables_to_appear_in_same_query!( email_verification, federation_allowlist, federation_blocklist, + image_upload, instance, language, local_site, diff --git a/crates/db_schema/src/utils.rs b/crates/db_schema/src/utils.rs index eea05b4b4..f90bb5c3d 100644 --- a/crates/db_schema/src/utils.rs +++ b/crates/db_schema/src/utils.rs @@ -347,7 +347,7 @@ pub fn naive_now() -> DateTime { pub fn post_to_comment_sort_type(sort: SortType) -> CommentSortType { match sort { - SortType::Active | SortType::Hot => CommentSortType::Hot, + SortType::Active | SortType::Hot | SortType::Scaled => CommentSortType::Hot, SortType::New | SortType::NewComments | SortType::MostComments => CommentSortType::New, SortType::Old => CommentSortType::Old, SortType::Controversial => CommentSortType::Controversial, @@ -384,7 +384,11 @@ pub mod functions { use diesel::sql_types::{BigInt, Text, Timestamptz}; sql_function! { - fn hot_rank(score: BigInt, time: Timestamptz) -> Integer; + fn hot_rank(score: BigInt, time: Timestamptz) -> Double; + } + + sql_function! { + fn scaled_rank(score: BigInt, time: Timestamptz, users_active_month: BigInt) -> Double; } sql_function! { diff --git a/crates/db_views/src/comment_report_view.rs b/crates/db_views/src/comment_report_view.rs index 99832d8b5..cded46ac9 100644 --- a/crates/db_views/src/comment_report_view.rs +++ b/crates/db_views/src/comment_report_view.rs @@ -432,7 +432,7 @@ mod tests { downvotes: 0, published: agg.published, child_count: 0, - hot_rank: 1728, + hot_rank: 0.1728, controversy_rank: 0.0, }, my_vote: None, diff --git a/crates/db_views/src/comment_view.rs b/crates/db_views/src/comment_view.rs index 0288bfd0f..b93f12842 100644 --- a/crates/db_views/src/comment_view.rs +++ b/crates/db_views/src/comment_view.rs @@ -886,7 +886,7 @@ mod tests { downvotes: 0, published: agg.published, child_count: 5, - hot_rank: 1728, + hot_rank: 0.1728, controversy_rank: 0.0, }, } diff --git a/crates/db_views/src/post_view.rs b/crates/db_views/src/post_view.rs index 860e3fa7d..4851c5507 100644 --- a/crates/db_views/src/post_view.rs +++ b/crates/db_views/src/post_view.rs @@ -380,6 +380,9 @@ fn queries<'a>() -> Queries< SortType::Hot => query .then_order_by(post_aggregates::hot_rank.desc()) .then_order_by(post_aggregates::published.desc()), + SortType::Scaled => query + .then_order_by(post_aggregates::scaled_rank.desc()) + .then_order_by(post_aggregates::published.desc()), SortType::Controversial => query.then_order_by(post_aggregates::controversy_rank.desc()), SortType::New => query.then_order_by(post_aggregates::published.desc()), SortType::Old => query.then_order_by(post_aggregates::published.asc()), @@ -1154,9 +1157,10 @@ mod tests { newest_comment_time: inserted_post.published, featured_community: false, featured_local: false, - hot_rank: 1728, - hot_rank_active: 1728, + hot_rank: 0.1728, + hot_rank_active: 0.1728, controversy_rank: 0.0, + scaled_rank: 0.3621, community_id: inserted_post.community_id, creator_id: inserted_post.creator_id, }, diff --git a/crates/db_views_actor/src/community_view.rs b/crates/db_views_actor/src/community_view.rs index 7970183dc..974267061 100644 --- a/crates/db_views_actor/src/community_view.rs +++ b/crates/db_views_actor/src/community_view.rs @@ -105,7 +105,7 @@ fn queries<'a>() -> Queries< } match options.sort.unwrap_or(Hot) { - Hot | Active => query = query.order_by(community_aggregates::hot_rank.desc()), + Hot | Active | Scaled => query = query.order_by(community_aggregates::hot_rank.desc()), NewComments | TopDay | TopTwelveHour | TopSixHour | TopHour => { query = query.order_by(community_aggregates::users_active_day.desc()) } diff --git a/migrations/2023-08-23-182533_scaled_rank/down.sql b/migrations/2023-08-23-182533_scaled_rank/down.sql new file mode 100644 index 000000000..afe1bfc21 --- /dev/null +++ b/migrations/2023-08-23-182533_scaled_rank/down.sql @@ -0,0 +1,87 @@ +DROP FUNCTION scaled_rank; + +ALTER TABLE community_aggregates + ALTER COLUMN hot_rank TYPE integer, + ALTER COLUMN hot_rank SET DEFAULT 1728; + +ALTER TABLE comment_aggregates + ALTER COLUMN hot_rank TYPE integer, + ALTER COLUMN hot_rank SET DEFAULT 1728; + +ALTER TABLE post_aggregates + ALTER COLUMN hot_rank TYPE integer, + ALTER COLUMN hot_rank SET DEFAULT 1728, + ALTER COLUMN hot_rank_active TYPE integer, + ALTER COLUMN hot_rank_active SET DEFAULT 1728; + +-- Change back to integer version +DROP FUNCTION hot_rank (numeric, published timestamp with time zone); + +CREATE OR REPLACE FUNCTION hot_rank (score numeric, published timestamp with time zone) + RETURNS integer + AS $$ +DECLARE + hours_diff numeric := EXTRACT(EPOCH FROM (now() - published)) / 3600; +BEGIN + IF (hours_diff > 0) THEN + RETURN floor(10000 * log(greatest (1, score + 3)) / power((hours_diff + 2), 1.8))::integer; + ELSE + -- if the post is from the future, set hot score to 0. otherwise you can game the post to + -- always be on top even with only 1 vote by setting it to the future + RETURN 0; + END IF; +END; +$$ +LANGUAGE plpgsql +IMMUTABLE PARALLEL SAFE; + +ALTER TABLE post_aggregates + DROP COLUMN scaled_rank; + +-- The following code is necessary because postgres can't remove +-- a single enum value. +ALTER TABLE local_user + ALTER default_sort_type DROP DEFAULT; + +UPDATE + local_user +SET + default_sort_type = 'Hot' +WHERE + default_sort_type = 'Scaled'; + +-- rename the old enum +ALTER TYPE sort_type_enum RENAME TO sort_type_enum__; + +-- create the new enum +CREATE TYPE sort_type_enum AS ENUM ( + 'Active', + 'Hot', + 'New', + 'Old', + 'TopDay', + 'TopWeek', + 'TopMonth', + 'TopYear', + 'TopAll', + 'MostComments', + 'NewComments', + 'TopHour', + 'TopSixHour', + 'TopTwelveHour', + 'TopThreeMonths', + 'TopSixMonths', + 'TopNineMonths' +); + +-- alter all your enum columns +ALTER TABLE local_user + ALTER COLUMN default_sort_type TYPE sort_type_enum + USING default_sort_type::text::sort_type_enum; + +ALTER TABLE local_user + ALTER default_sort_type SET DEFAULT 'Active'; + +-- drop the old enum +DROP TYPE sort_type_enum__; + diff --git a/migrations/2023-08-23-182533_scaled_rank/up.sql b/migrations/2023-08-23-182533_scaled_rank/up.sql new file mode 100644 index 000000000..43a235889 --- /dev/null +++ b/migrations/2023-08-23-182533_scaled_rank/up.sql @@ -0,0 +1,74 @@ +-- Change hot ranks and functions from an int to a float +ALTER TABLE community_aggregates + ALTER COLUMN hot_rank TYPE float, + ALTER COLUMN hot_rank SET DEFAULT 0.1728; + +ALTER TABLE comment_aggregates + ALTER COLUMN hot_rank TYPE float, + ALTER COLUMN hot_rank SET DEFAULT 0.1728; + +ALTER TABLE post_aggregates + ALTER COLUMN hot_rank TYPE float, + ALTER COLUMN hot_rank SET DEFAULT 0.1728, + ALTER COLUMN hot_rank_active TYPE float, + ALTER COLUMN hot_rank_active SET DEFAULT 0.1728; + +DROP FUNCTION hot_rank (numeric, published timestamp with time zone); + +CREATE OR REPLACE FUNCTION hot_rank (score numeric, published timestamp with time zone) + RETURNS float + AS $$ +DECLARE + hours_diff numeric := EXTRACT(EPOCH FROM (now() - published)) / 3600; +BEGIN + -- 24 * 7 = 168, so after a week, it will default to 0. + IF (hours_diff > 0 AND hours_diff < 168) THEN + RETURN log(greatest (1, score + 3)) / power((hours_diff + 2), 1.8); + ELSE + -- if the post is from the future, set hot score to 0. otherwise you can game the post to + -- always be on top even with only 1 vote by setting it to the future + RETURN 0.0; + END IF; +END; +$$ +LANGUAGE plpgsql +IMMUTABLE PARALLEL SAFE; + +-- The new scaled rank function +CREATE OR REPLACE FUNCTION scaled_rank (score numeric, published timestamp with time zone, users_active_month numeric) + RETURNS float + AS $$ +BEGIN + -- Add 2 to avoid divide by zero errors + -- Default for score = 1, active users = 1, and now, is (0.1728 / log(2 + 1)) = 0.3621 + -- There may need to be a scale factor multiplied to users_active_month, to make + -- the log curve less pronounced. This can be tuned in the future. + RETURN (hot_rank (score, published) / log(2 + users_active_month)); +END; +$$ +LANGUAGE plpgsql +IMMUTABLE PARALLEL SAFE; + +ALTER TABLE post_aggregates + ADD COLUMN scaled_rank float NOT NULL DEFAULT 0.3621; + +UPDATE + post_aggregates +SET + scaled_rank = 0 +WHERE + hot_rank = 0 + OR hot_rank_active = 0; + +CREATE INDEX idx_post_aggregates_featured_community_scaled ON post_aggregates (featured_community DESC, scaled_rank DESC, published DESC); + +CREATE INDEX idx_post_aggregates_featured_local_scaled ON post_aggregates (featured_local DESC, scaled_rank DESC, published DESC); + +-- We forgot to add the controversial sort type +ALTER TYPE sort_type_enum + ADD VALUE 'Controversial'; + +-- Add the Scaled enum +ALTER TYPE sort_type_enum + ADD VALUE 'Scaled'; + diff --git a/src/scheduled_tasks.rs b/src/scheduled_tasks.rs index 3b2bfe4b5..30ab5bfe6 100644 --- a/src/scheduled_tasks.rs +++ b/src/scheduled_tasks.rs @@ -154,22 +154,16 @@ fn startup_jobs(db_url: &str) { fn update_hot_ranks(conn: &mut PgConnection) { info!("Updating hot ranks for all history..."); - process_hot_ranks_in_batches( - conn, - "post_aggregates", - "a.hot_rank != 0 OR a.hot_rank_active != 0", - "SET hot_rank = hot_rank(a.score, a.published), - hot_rank_active = hot_rank(a.score, a.newest_comment_time_necro)", - ); + process_post_aggregates_ranks_in_batches(conn); - process_hot_ranks_in_batches( + process_ranks_in_batches( conn, "comment_aggregates", "a.hot_rank != 0", "SET hot_rank = hot_rank(a.score, a.published)", ); - process_hot_ranks_in_batches( + process_ranks_in_batches( conn, "community_aggregates", "a.hot_rank != 0", @@ -189,7 +183,7 @@ struct HotRanksUpdateResult { /// In `where_clause` and `set_clause`, "a" will refer to the current aggregates table. /// Locked rows are skipped in order to prevent deadlocks (they will likely get updated on the next /// run) -fn process_hot_ranks_in_batches( +fn process_ranks_in_batches( conn: &mut PgConnection, table_name: &str, where_clause: &str, @@ -241,6 +235,55 @@ fn process_hot_ranks_in_batches( ); } +/// Post aggregates is a special case, since it needs to join to the community_aggregates +/// table, to get the active monthly user counts. +fn process_post_aggregates_ranks_in_batches(conn: &mut PgConnection) { + let process_start_time: DateTime = Utc + .timestamp_opt(0, 0) + .single() + .expect("0 timestamp creation"); + + let update_batch_size = 1000; // Bigger batches than this tend to cause seq scans + let mut processed_rows_count = 0; + let mut previous_batch_result = Some(process_start_time); + while let Some(previous_batch_last_published) = previous_batch_result { + let result = sql_query( + r#"WITH batch AS (SELECT pa.id + FROM post_aggregates pa + WHERE pa.published > $1 + AND (pa.hot_rank != 0 OR pa.hot_rank_active != 0) + ORDER BY pa.published + LIMIT $2 + FOR UPDATE SKIP LOCKED) + UPDATE post_aggregates pa + SET hot_rank = hot_rank(pa.score, pa.published), + hot_rank_active = hot_rank(pa.score, pa.newest_comment_time_necro), + scaled_rank = scaled_rank(pa.score, pa.published, ca.users_active_month) + FROM batch, community_aggregates ca + WHERE pa.id = batch.id and pa.community_id = ca.community_id RETURNING pa.published; + "#, + ) + .bind::(previous_batch_last_published) + .bind::(update_batch_size) + .get_results::(conn); + + match result { + Ok(updated_rows) => { + processed_rows_count += updated_rows.len(); + previous_batch_result = updated_rows.last().map(|row| row.published); + } + Err(e) => { + error!("Failed to update {} hot_ranks: {}", "post_aggregates", e); + break; + } + } + } + info!( + "Finished process_hot_ranks_in_batches execution for {} (processed {} rows)", + "post_aggregates", processed_rows_count + ); +} + fn delete_expired_captcha_answers(conn: &mut PgConnection) { diesel::delete( captcha_answer::table.filter(captcha_answer::published.lt(now() - IntervalDsl::minutes(10))),