Adding a scaled sort, to boost smaller communities. (#3907)

* Adding a scaled sort, to boost smaller communities.

- Previously referred to as *best*.
- Fixes #3622

* Fixing scheduled task update.

* Converting hot_rank integers to floats.

* Altering hot_rank psql function to default to zero after a week.

* Setting scaled_rank to zero, where hot_rank is zero.

* Adding image_upload table.
This commit is contained in:
Dessalines 2023-09-06 13:43:27 -04:00 committed by GitHub
parent 4121fc4d56
commit 9785b20843
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
13 changed files with 279 additions and 31 deletions

View file

@ -150,7 +150,7 @@ impl ActivityHandler for CreateOrUpdatePage {
PostLike::like(&mut context.pool(), &like_form).await?; PostLike::like(&mut context.pool(), &like_form).await?;
// Calculate initial hot_rank for post // Calculate initial hot_rank for post
PostAggregates::update_hot_rank(&mut context.pool(), post.id).await?; PostAggregates::update_ranks(&mut context.pool(), post.id).await?;
Ok(()) Ok(())
} }

View file

@ -1,10 +1,14 @@
use crate::{ use crate::{
aggregates::structs::PostAggregates, aggregates::structs::PostAggregates,
newtypes::PostId, newtypes::PostId,
schema::post_aggregates, schema::{community_aggregates, post, post_aggregates},
utils::{functions::hot_rank, get_conn, DbPool}, utils::{
functions::{hot_rank, scaled_rank},
get_conn,
DbPool,
},
}; };
use diesel::{result::Error, ExpressionMethods, QueryDsl}; use diesel::{result::Error, ExpressionMethods, JoinOnDsl, QueryDsl};
use diesel_async::RunQueryDsl; use diesel_async::RunQueryDsl;
impl PostAggregates { impl PostAggregates {
@ -16,9 +20,19 @@ impl PostAggregates {
.await .await
} }
pub async fn update_hot_rank(pool: &mut DbPool<'_>, post_id: PostId) -> Result<Self, Error> { pub async fn update_ranks(pool: &mut DbPool<'_>, post_id: PostId) -> Result<Self, Error> {
let conn = &mut get_conn(pool).await?; let conn = &mut get_conn(pool).await?;
// Diesel can't update based on a join, which is necessary for the scaled_rank
// https://github.com/diesel-rs/diesel/issues/1478
// Just select the users_active_month manually for now, since it's a single post anyway
let users_active_month = community_aggregates::table
.select(community_aggregates::users_active_month)
.inner_join(post::table.on(community_aggregates::community_id.eq(post::community_id)))
.filter(post::id.eq(post_id))
.first::<i64>(conn)
.await?;
diesel::update(post_aggregates::table) diesel::update(post_aggregates::table)
.filter(post_aggregates::post_id.eq(post_id)) .filter(post_aggregates::post_id.eq(post_id))
.set(( .set((
@ -27,6 +41,11 @@ impl PostAggregates {
post_aggregates::score, post_aggregates::score,
post_aggregates::newest_comment_time_necro, post_aggregates::newest_comment_time_necro,
)), )),
post_aggregates::scaled_rank.eq(scaled_rank(
post_aggregates::score,
post_aggregates::published,
users_active_month,
)),
)) ))
.get_result::<Self>(conn) .get_result::<Self>(conn)
.await .await

View file

@ -27,11 +27,11 @@ pub struct CommentAggregates {
pub published: DateTime<Utc>, pub published: DateTime<Utc>,
/// The total number of children in this comment branch. /// The total number of children in this comment branch.
pub child_count: i32, pub child_count: i32,
pub hot_rank: i32, pub hot_rank: f64,
pub controversy_rank: f64, pub controversy_rank: f64,
} }
#[derive(PartialEq, Eq, Debug, Serialize, Deserialize, Clone)] #[derive(PartialEq, Debug, Serialize, Deserialize, Clone)]
#[cfg_attr(feature = "full", derive(Queryable, Associations, Identifiable, TS))] #[cfg_attr(feature = "full", derive(Queryable, Associations, Identifiable, TS))]
#[cfg_attr(feature = "full", diesel(table_name = community_aggregates))] #[cfg_attr(feature = "full", diesel(table_name = community_aggregates))]
#[cfg_attr( #[cfg_attr(
@ -55,7 +55,7 @@ pub struct CommunityAggregates {
pub users_active_month: i64, pub users_active_month: i64,
/// The number of users with any activity in the last year. /// The number of users with any activity in the last year.
pub users_active_half_year: i64, pub users_active_half_year: i64,
pub hot_rank: i32, pub hot_rank: f64,
} }
#[derive(PartialEq, Eq, Debug, Serialize, Deserialize, Clone, Default)] #[derive(PartialEq, Eq, Debug, Serialize, Deserialize, Clone, Default)]
@ -95,11 +95,13 @@ pub struct PostAggregates {
pub featured_community: bool, pub featured_community: bool,
/// If the post is featured on the site / to local. /// If the post is featured on the site / to local.
pub featured_local: bool, pub featured_local: bool,
pub hot_rank: i32, pub hot_rank: f64,
pub hot_rank_active: i32, pub hot_rank_active: f64,
pub community_id: CommunityId, pub community_id: CommunityId,
pub creator_id: PersonId, pub creator_id: PersonId,
pub controversy_rank: f64, pub controversy_rank: f64,
/// A rank that amplifies smaller communities
pub scaled_rank: f64,
} }
#[derive(PartialEq, Eq, Debug, Serialize, Deserialize, Clone)] #[derive(PartialEq, Eq, Debug, Serialize, Deserialize, Clone)]

View file

@ -54,6 +54,7 @@ use ts_rs::TS;
)] )]
#[cfg_attr(feature = "full", DbValueStyle = "verbatim")] #[cfg_attr(feature = "full", DbValueStyle = "verbatim")]
#[cfg_attr(feature = "full", ts(export))] #[cfg_attr(feature = "full", ts(export))]
// TODO add the controversial and scaled rankings to the doc below
/// The post sort types. See here for descriptions: https://join-lemmy.org/docs/en/users/03-votes-and-ranking.html /// The post sort types. See here for descriptions: https://join-lemmy.org/docs/en/users/03-votes-and-ranking.html
pub enum SortType { pub enum SortType {
#[default] #[default]
@ -75,6 +76,7 @@ pub enum SortType {
TopSixMonths, TopSixMonths,
TopNineMonths, TopNineMonths,
Controversial, Controversial,
Scaled,
} }
#[derive(EnumString, Display, Debug, Serialize, Deserialize, Clone, Copy)] #[derive(EnumString, Display, Debug, Serialize, Deserialize, Clone, Copy)]

View file

@ -100,7 +100,7 @@ diesel::table! {
downvotes -> Int8, downvotes -> Int8,
published -> Timestamptz, published -> Timestamptz,
child_count -> Int4, child_count -> Int4,
hot_rank -> Int4, hot_rank -> Float8,
controversy_rank -> Float8, controversy_rank -> Float8,
} }
} }
@ -198,7 +198,7 @@ diesel::table! {
users_active_week -> Int8, users_active_week -> Int8,
users_active_month -> Int8, users_active_month -> Int8,
users_active_half_year -> Int8, users_active_half_year -> Int8,
hot_rank -> Int4, hot_rank -> Float8,
} }
} }
@ -299,6 +299,16 @@ diesel::table! {
} }
} }
// Diesel schema for the `image_upload` table; `id` is the primary key.
diesel::table! {
image_upload (id) {
id -> Int4,
// Joined to `local_user` through the `joinable!` declaration for image_upload.
local_user_id -> Int4,
// NOTE(review): presumably the pict-rs alias of the uploaded image and the
// token needed to delete it — confirm against the code that writes this table.
pictrs_alias -> Text,
pictrs_delete_token -> Text,
published -> Timestamptz,
}
}
diesel::table! { diesel::table! {
instance (id) { instance (id) {
id -> Int4, id -> Int4,
@ -683,11 +693,12 @@ diesel::table! {
newest_comment_time -> Timestamptz, newest_comment_time -> Timestamptz,
featured_community -> Bool, featured_community -> Bool,
featured_local -> Bool, featured_local -> Bool,
hot_rank -> Int4, hot_rank -> Float8,
hot_rank_active -> Int4, hot_rank_active -> Float8,
community_id -> Int4, community_id -> Int4,
creator_id -> Int4, creator_id -> Int4,
controversy_rank -> Float8, controversy_rank -> Float8,
scaled_rank -> Float8,
} }
} }
@ -893,6 +904,7 @@ diesel::joinable!(custom_emoji_keyword -> custom_emoji (custom_emoji_id));
diesel::joinable!(email_verification -> local_user (local_user_id)); diesel::joinable!(email_verification -> local_user (local_user_id));
diesel::joinable!(federation_allowlist -> instance (instance_id)); diesel::joinable!(federation_allowlist -> instance (instance_id));
diesel::joinable!(federation_blocklist -> instance (instance_id)); diesel::joinable!(federation_blocklist -> instance (instance_id));
diesel::joinable!(image_upload -> local_user (local_user_id));
diesel::joinable!(local_site -> site (site_id)); diesel::joinable!(local_site -> site (site_id));
diesel::joinable!(local_site_rate_limit -> local_site (local_site_id)); diesel::joinable!(local_site_rate_limit -> local_site (local_site_id));
diesel::joinable!(local_user -> person (person_id)); diesel::joinable!(local_user -> person (person_id));
@ -967,6 +979,7 @@ diesel::allow_tables_to_appear_in_same_query!(
email_verification, email_verification,
federation_allowlist, federation_allowlist,
federation_blocklist, federation_blocklist,
image_upload,
instance, instance,
language, language,
local_site, local_site,

View file

@ -347,7 +347,7 @@ pub fn naive_now() -> DateTime<Utc> {
pub fn post_to_comment_sort_type(sort: SortType) -> CommentSortType { pub fn post_to_comment_sort_type(sort: SortType) -> CommentSortType {
match sort { match sort {
SortType::Active | SortType::Hot => CommentSortType::Hot, SortType::Active | SortType::Hot | SortType::Scaled => CommentSortType::Hot,
SortType::New | SortType::NewComments | SortType::MostComments => CommentSortType::New, SortType::New | SortType::NewComments | SortType::MostComments => CommentSortType::New,
SortType::Old => CommentSortType::Old, SortType::Old => CommentSortType::Old,
SortType::Controversial => CommentSortType::Controversial, SortType::Controversial => CommentSortType::Controversial,
@ -384,7 +384,11 @@ pub mod functions {
use diesel::sql_types::{BigInt, Text, Timestamptz}; use diesel::sql_types::{BigInt, Text, Timestamptz};
sql_function! { sql_function! {
fn hot_rank(score: BigInt, time: Timestamptz) -> Integer; fn hot_rank(score: BigInt, time: Timestamptz) -> Double;
}
sql_function! {
fn scaled_rank(score: BigInt, time: Timestamptz, users_active_month: BigInt) -> Double;
} }
sql_function! { sql_function! {

View file

@ -432,7 +432,7 @@ mod tests {
downvotes: 0, downvotes: 0,
published: agg.published, published: agg.published,
child_count: 0, child_count: 0,
hot_rank: 1728, hot_rank: 0.1728,
controversy_rank: 0.0, controversy_rank: 0.0,
}, },
my_vote: None, my_vote: None,

View file

@ -886,7 +886,7 @@ mod tests {
downvotes: 0, downvotes: 0,
published: agg.published, published: agg.published,
child_count: 5, child_count: 5,
hot_rank: 1728, hot_rank: 0.1728,
controversy_rank: 0.0, controversy_rank: 0.0,
}, },
} }

View file

@ -380,6 +380,9 @@ fn queries<'a>() -> Queries<
SortType::Hot => query SortType::Hot => query
.then_order_by(post_aggregates::hot_rank.desc()) .then_order_by(post_aggregates::hot_rank.desc())
.then_order_by(post_aggregates::published.desc()), .then_order_by(post_aggregates::published.desc()),
SortType::Scaled => query
.then_order_by(post_aggregates::scaled_rank.desc())
.then_order_by(post_aggregates::published.desc()),
SortType::Controversial => query.then_order_by(post_aggregates::controversy_rank.desc()), SortType::Controversial => query.then_order_by(post_aggregates::controversy_rank.desc()),
SortType::New => query.then_order_by(post_aggregates::published.desc()), SortType::New => query.then_order_by(post_aggregates::published.desc()),
SortType::Old => query.then_order_by(post_aggregates::published.asc()), SortType::Old => query.then_order_by(post_aggregates::published.asc()),
@ -1154,9 +1157,10 @@ mod tests {
newest_comment_time: inserted_post.published, newest_comment_time: inserted_post.published,
featured_community: false, featured_community: false,
featured_local: false, featured_local: false,
hot_rank: 1728, hot_rank: 0.1728,
hot_rank_active: 1728, hot_rank_active: 0.1728,
controversy_rank: 0.0, controversy_rank: 0.0,
scaled_rank: 0.3621,
community_id: inserted_post.community_id, community_id: inserted_post.community_id,
creator_id: inserted_post.creator_id, creator_id: inserted_post.creator_id,
}, },

View file

@ -105,7 +105,7 @@ fn queries<'a>() -> Queries<
} }
match options.sort.unwrap_or(Hot) { match options.sort.unwrap_or(Hot) {
Hot | Active => query = query.order_by(community_aggregates::hot_rank.desc()), Hot | Active | Scaled => query = query.order_by(community_aggregates::hot_rank.desc()),
NewComments | TopDay | TopTwelveHour | TopSixHour | TopHour => { NewComments | TopDay | TopTwelveHour | TopSixHour | TopHour => {
query = query.order_by(community_aggregates::users_active_day.desc()) query = query.order_by(community_aggregates::users_active_day.desc())
} }

View file

@ -0,0 +1,87 @@
-- Down migration: remove the scaled_rank function, then turn the hot rank
-- columns back into integers with their integer default of 1728.
DROP FUNCTION scaled_rank;
-- NOTE(review): the float values are cast to integer as-is, not multiplied
-- back by the 10000 factor the integer hot_rank function uses — presumably
-- they get recomputed by the rank update job afterwards; confirm.
ALTER TABLE community_aggregates
ALTER COLUMN hot_rank TYPE integer,
ALTER COLUMN hot_rank SET DEFAULT 1728;
ALTER TABLE comment_aggregates
ALTER COLUMN hot_rank TYPE integer,
ALTER COLUMN hot_rank SET DEFAULT 1728;
-- post_aggregates carries two hot rank columns; both are reverted.
ALTER TABLE post_aggregates
ALTER COLUMN hot_rank TYPE integer,
ALTER COLUMN hot_rank SET DEFAULT 1728,
ALTER COLUMN hot_rank_active TYPE integer,
ALTER COLUMN hot_rank_active SET DEFAULT 1728;
-- Replace the float hot_rank with the earlier integer-returning version
DROP FUNCTION hot_rank (numeric, published timestamp with time zone);
CREATE OR REPLACE FUNCTION hot_rank (score numeric, published timestamp with time zone)
    RETURNS integer
    AS $$
DECLARE
    -- Age of the item in hours, measured from now()
    age_hours numeric := EXTRACT(EPOCH FROM (now() - published)) / 3600;
BEGIN
    -- Only items with a positive age get a computed rank.
    IF (age_hours > 0) THEN
        RETURN floor(10000 * log(greatest (1, score + 3)) / power((age_hours + 2), 1.8))::integer;
    END IF;
    -- An item dated in the future ranks 0; otherwise a post could be gamed to
    -- stay on top with a single vote just by giving it a future date.
    RETURN 0;
END;
$$
LANGUAGE plpgsql
IMMUTABLE PARALLEL SAFE;
-- Remove the scaled_rank column added by the up migration.
-- Postgres automatically drops indexes that involve a dropped column, so the
-- two *_scaled indexes created by the up migration are removed with it.
ALTER TABLE post_aggregates
DROP COLUMN scaled_rank;
-- The following code is necessary because postgres can't remove
-- a single enum value.
ALTER TABLE local_user
    ALTER default_sort_type DROP DEFAULT;

-- Both 'Scaled' and 'Controversial' were added to the enum by the up
-- migration and are absent from the recreated enum below. Every row using
-- either value has to be moved to a surviving value first, otherwise the
-- ::sort_type_enum cast further down fails and the rollback aborts.
UPDATE
    local_user
SET
    default_sort_type = 'Hot'
WHERE
    default_sort_type IN ('Scaled', 'Controversial');

-- rename the old enum
ALTER TYPE sort_type_enum RENAME TO sort_type_enum__;

-- create the new enum
CREATE TYPE sort_type_enum AS ENUM (
    'Active',
    'Hot',
    'New',
    'Old',
    'TopDay',
    'TopWeek',
    'TopMonth',
    'TopYear',
    'TopAll',
    'MostComments',
    'NewComments',
    'TopHour',
    'TopSixHour',
    'TopTwelveHour',
    'TopThreeMonths',
    'TopSixMonths',
    'TopNineMonths'
);

-- alter all your enum columns
ALTER TABLE local_user
    ALTER COLUMN default_sort_type TYPE sort_type_enum
    USING default_sort_type::text::sort_type_enum;

ALTER TABLE local_user
    ALTER default_sort_type SET DEFAULT 'Active';

-- drop the old enum
DROP TYPE sort_type_enum__;

View file

@ -0,0 +1,74 @@
-- Change hot ranks and functions from an int to a float
-- The column default moves from the integer 1728 to 0.1728, because the float
-- hot_rank function defined below no longer multiplies its result by 10000.
-- NOTE(review): existing integer values are converted as-is (not divided by
-- 10000) — presumably the scheduled rank update recomputes them; confirm.
ALTER TABLE community_aggregates
ALTER COLUMN hot_rank TYPE float,
ALTER COLUMN hot_rank SET DEFAULT 0.1728;
ALTER TABLE comment_aggregates
ALTER COLUMN hot_rank TYPE float,
ALTER COLUMN hot_rank SET DEFAULT 0.1728;
-- post_aggregates carries two hot rank columns; both are converted.
ALTER TABLE post_aggregates
ALTER COLUMN hot_rank TYPE float,
ALTER COLUMN hot_rank SET DEFAULT 0.1728,
ALTER COLUMN hot_rank_active TYPE float,
ALTER COLUMN hot_rank_active SET DEFAULT 0.1728;
-- The return type changes, so the old function has to be dropped before it is recreated.
DROP FUNCTION hot_rank (numeric, published timestamp with time zone);
CREATE OR REPLACE FUNCTION hot_rank (score numeric, published timestamp with time zone)
    RETURNS float
    AS $$
DECLARE
    -- Age of the item in hours, measured from now()
    age_hours numeric := EXTRACT(EPOCH FROM (now() - published)) / 3600;
BEGIN
    -- A rank is only computed inside the window 0 < age < 168 hours (24 * 7 = one week).
    IF (age_hours > 0 AND age_hours < 168) THEN
        RETURN log(greatest (1, score + 3)) / power((age_hours + 2), 1.8);
    END IF;
    -- Outside that window the rank is 0: items older than a week drop out, and
    -- items dated in the future cannot be gamed to stay on top with a single vote.
    RETURN 0.0;
END;
$$
LANGUAGE plpgsql
IMMUTABLE PARALLEL SAFE;
-- Scaled rank: the hot rank divided by a log of the community's monthly active users
CREATE OR REPLACE FUNCTION scaled_rank (score numeric, published timestamp with time zone, users_active_month numeric)
    RETURNS float
    AS $$
DECLARE
    base_rank float := hot_rank (score, published);
BEGIN
    -- The 2 inside the log keeps the divisor away from zero when there are no active users.
    -- For score = 1, active users = 1, published now: 0.1728 / log(2 + 1) = 0.3621
    -- A scale factor on users_active_month may be needed later to flatten the
    -- log curve; that is left for future tuning.
    RETURN (base_rank / log(2 + users_active_month));
END;
$$
LANGUAGE plpgsql
IMMUTABLE PARALLEL SAFE;
-- Add the scaled_rank column. The default 0.3621 is the value worked out in the
-- scaled_rank function comment for score = 1, one active user, published now.
ALTER TABLE post_aggregates
ADD COLUMN scaled_rank float NOT NULL DEFAULT 0.3621;
-- Rows whose hot_rank or hot_rank_active is already 0 get a scaled_rank of 0
-- instead of the column default.
UPDATE
post_aggregates
SET
scaled_rank = 0
WHERE
hot_rank = 0
OR hot_rank_active = 0;
-- One index per featured flag: (featured flag, scaled_rank, published), all descending.
CREATE INDEX idx_post_aggregates_featured_community_scaled ON post_aggregates (featured_community DESC, scaled_rank DESC, published DESC);
CREATE INDEX idx_post_aggregates_featured_local_scaled ON post_aggregates (featured_local DESC, scaled_rank DESC, published DESC);
-- We forgot to add the controversial sort type
ALTER TYPE sort_type_enum
ADD VALUE 'Controversial';
-- Add the Scaled enum
ALTER TYPE sort_type_enum
ADD VALUE 'Scaled';

View file

@ -154,22 +154,16 @@ fn startup_jobs(db_url: &str) {
fn update_hot_ranks(conn: &mut PgConnection) { fn update_hot_ranks(conn: &mut PgConnection) {
info!("Updating hot ranks for all history..."); info!("Updating hot ranks for all history...");
process_hot_ranks_in_batches( process_post_aggregates_ranks_in_batches(conn);
conn,
"post_aggregates",
"a.hot_rank != 0 OR a.hot_rank_active != 0",
"SET hot_rank = hot_rank(a.score, a.published),
hot_rank_active = hot_rank(a.score, a.newest_comment_time_necro)",
);
process_hot_ranks_in_batches( process_ranks_in_batches(
conn, conn,
"comment_aggregates", "comment_aggregates",
"a.hot_rank != 0", "a.hot_rank != 0",
"SET hot_rank = hot_rank(a.score, a.published)", "SET hot_rank = hot_rank(a.score, a.published)",
); );
process_hot_ranks_in_batches( process_ranks_in_batches(
conn, conn,
"community_aggregates", "community_aggregates",
"a.hot_rank != 0", "a.hot_rank != 0",
@ -189,7 +183,7 @@ struct HotRanksUpdateResult {
/// In `where_clause` and `set_clause`, "a" will refer to the current aggregates table. /// In `where_clause` and `set_clause`, "a" will refer to the current aggregates table.
/// Locked rows are skipped in order to prevent deadlocks (they will likely get updated on the next /// Locked rows are skipped in order to prevent deadlocks (they will likely get updated on the next
/// run) /// run)
fn process_hot_ranks_in_batches( fn process_ranks_in_batches(
conn: &mut PgConnection, conn: &mut PgConnection,
table_name: &str, table_name: &str,
where_clause: &str, where_clause: &str,
@ -241,6 +235,55 @@ fn process_hot_ranks_in_batches(
); );
} }
/// Post aggregates is a special case, since it needs to join to the community_aggregates
/// table, to get the active monthly user counts.
///
/// Recomputes `hot_rank`, `hot_rank_active` and `scaled_rank` for every post whose
/// `hot_rank` or `hot_rank_active` is non-zero, walking the table in batches ordered by
/// `published`. Rows locked by another transaction are skipped (`FOR UPDATE SKIP LOCKED`).
/// A failing batch is logged and stops the loop; nothing is returned to the caller.
fn process_post_aggregates_ranks_in_batches(conn: &mut PgConnection) {
  let process_start_time: DateTime<Utc> = Utc
    .timestamp_opt(0, 0)
    .single()
    .expect("0 timestamp creation");

  let update_batch_size = 1000; // Bigger batches than this tend to cause seq scans
  let mut processed_rows_count = 0;
  // Cursor: the newest `published` value handled so far; `None` ends the loop.
  let mut previous_batch_result = Some(process_start_time);
  while let Some(previous_batch_last_published) = previous_batch_result {
    let result = sql_query(
      r#"WITH batch AS (SELECT pa.id
FROM post_aggregates pa
WHERE pa.published > $1
AND (pa.hot_rank != 0 OR pa.hot_rank_active != 0)
ORDER BY pa.published
LIMIT $2
FOR UPDATE SKIP LOCKED)
UPDATE post_aggregates pa
SET hot_rank = hot_rank(pa.score, pa.published),
hot_rank_active = hot_rank(pa.score, pa.newest_comment_time_necro),
scaled_rank = scaled_rank(pa.score, pa.published, ca.users_active_month)
FROM batch, community_aggregates ca
WHERE pa.id = batch.id and pa.community_id = ca.community_id RETURNING pa.published;
"#,
    )
    .bind::<Timestamptz, _>(previous_batch_last_published)
    .bind::<Integer, _>(update_batch_size)
    .get_results::<HotRanksUpdateResult>(conn);

    match result {
      Ok(updated_rows) => {
        processed_rows_count += updated_rows.len();
        // The rows returned by UPDATE ... RETURNING are not guaranteed to come back in the
        // batch's ORDER BY order, so advance the cursor to the newest `published` value
        // rather than to whichever row happens to be last. An empty batch yields `None`.
        previous_batch_result = updated_rows.iter().map(|row| row.published).max();
      }
      Err(e) => {
        error!("Failed to update {} hot_ranks: {}", "post_aggregates", e);
        break;
      }
    }
  }
  info!(
    "Finished process_hot_ranks_in_batches execution for {} (processed {} rows)",
    "post_aggregates", processed_rows_count
  );
}
fn delete_expired_captcha_answers(conn: &mut PgConnection) { fn delete_expired_captcha_answers(conn: &mut PgConnection) {
diesel::delete( diesel::delete(
captcha_answer::table.filter(captcha_answer::published.lt(now() - IntervalDsl::minutes(10))), captcha_answer::table.filter(captcha_answer::published.lt(now() - IntervalDsl::minutes(10))),