Adding a scaled sort, to boost smaller communities. (#3907)

* Adding a scaled sort, to boost smaller communities.

- Previously referred to as *best* .
- Fixes #3622

* Fixing scheduled task update.

* Converting hot_rank integers to floats.

* Altering hot_rank psql function to default to zero after a week.

* Setting scaled_rank to zero, where hot_rank is zero.

* Adding image_upload table.
This commit is contained in:
Dessalines 2023-09-06 13:43:27 -04:00 committed by GitHub
parent 4121fc4d56
commit 9785b20843
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
13 changed files with 279 additions and 31 deletions

View file

@ -150,7 +150,7 @@ impl ActivityHandler for CreateOrUpdatePage {
PostLike::like(&mut context.pool(), &like_form).await?;
// Calculate initial hot_rank for post
PostAggregates::update_hot_rank(&mut context.pool(), post.id).await?;
PostAggregates::update_ranks(&mut context.pool(), post.id).await?;
Ok(())
}

View file

@ -1,10 +1,14 @@
use crate::{
aggregates::structs::PostAggregates,
newtypes::PostId,
schema::post_aggregates,
utils::{functions::hot_rank, get_conn, DbPool},
schema::{community_aggregates, post, post_aggregates},
utils::{
functions::{hot_rank, scaled_rank},
get_conn,
DbPool,
},
};
use diesel::{result::Error, ExpressionMethods, QueryDsl};
use diesel::{result::Error, ExpressionMethods, JoinOnDsl, QueryDsl};
use diesel_async::RunQueryDsl;
impl PostAggregates {
@ -16,9 +20,19 @@ impl PostAggregates {
.await
}
pub async fn update_hot_rank(pool: &mut DbPool<'_>, post_id: PostId) -> Result<Self, Error> {
pub async fn update_ranks(pool: &mut DbPool<'_>, post_id: PostId) -> Result<Self, Error> {
let conn = &mut get_conn(pool).await?;
// Diesel can't update based on a join, which is necessary for the scaled_rank
// https://github.com/diesel-rs/diesel/issues/1478
// Just select the users_active_month manually for now, since its a single post anyway
let users_active_month = community_aggregates::table
.select(community_aggregates::users_active_month)
.inner_join(post::table.on(community_aggregates::community_id.eq(post::community_id)))
.filter(post::id.eq(post_id))
.first::<i64>(conn)
.await?;
diesel::update(post_aggregates::table)
.filter(post_aggregates::post_id.eq(post_id))
.set((
@ -27,6 +41,11 @@ impl PostAggregates {
post_aggregates::score,
post_aggregates::newest_comment_time_necro,
)),
post_aggregates::scaled_rank.eq(scaled_rank(
post_aggregates::score,
post_aggregates::published,
users_active_month,
)),
))
.get_result::<Self>(conn)
.await

View file

@ -27,11 +27,11 @@ pub struct CommentAggregates {
pub published: DateTime<Utc>,
/// The total number of children in this comment branch.
pub child_count: i32,
pub hot_rank: i32,
pub hot_rank: f64,
pub controversy_rank: f64,
}
#[derive(PartialEq, Eq, Debug, Serialize, Deserialize, Clone)]
#[derive(PartialEq, Debug, Serialize, Deserialize, Clone)]
#[cfg_attr(feature = "full", derive(Queryable, Associations, Identifiable, TS))]
#[cfg_attr(feature = "full", diesel(table_name = community_aggregates))]
#[cfg_attr(
@ -55,7 +55,7 @@ pub struct CommunityAggregates {
pub users_active_month: i64,
/// The number of users with any activity in the last year.
pub users_active_half_year: i64,
pub hot_rank: i32,
pub hot_rank: f64,
}
#[derive(PartialEq, Eq, Debug, Serialize, Deserialize, Clone, Default)]
@ -95,11 +95,13 @@ pub struct PostAggregates {
pub featured_community: bool,
/// If the post is featured on the site / to local.
pub featured_local: bool,
pub hot_rank: i32,
pub hot_rank_active: i32,
pub hot_rank: f64,
pub hot_rank_active: f64,
pub community_id: CommunityId,
pub creator_id: PersonId,
pub controversy_rank: f64,
/// A rank that amplifies smaller communities
pub scaled_rank: f64,
}
#[derive(PartialEq, Eq, Debug, Serialize, Deserialize, Clone)]

View file

@ -54,6 +54,7 @@ use ts_rs::TS;
)]
#[cfg_attr(feature = "full", DbValueStyle = "verbatim")]
#[cfg_attr(feature = "full", ts(export))]
// TODO add the controversial and scaled rankings to the doc below
/// The post sort types. See here for descriptions: https://join-lemmy.org/docs/en/users/03-votes-and-ranking.html
pub enum SortType {
#[default]
@ -75,6 +76,7 @@ pub enum SortType {
TopSixMonths,
TopNineMonths,
Controversial,
Scaled,
}
#[derive(EnumString, Display, Debug, Serialize, Deserialize, Clone, Copy)]

View file

@ -100,7 +100,7 @@ diesel::table! {
downvotes -> Int8,
published -> Timestamptz,
child_count -> Int4,
hot_rank -> Int4,
hot_rank -> Float8,
controversy_rank -> Float8,
}
}
@ -198,7 +198,7 @@ diesel::table! {
users_active_week -> Int8,
users_active_month -> Int8,
users_active_half_year -> Int8,
hot_rank -> Int4,
hot_rank -> Float8,
}
}
@ -299,6 +299,16 @@ diesel::table! {
}
}
diesel::table! {
image_upload (id) {
id -> Int4,
local_user_id -> Int4,
pictrs_alias -> Text,
pictrs_delete_token -> Text,
published -> Timestamptz,
}
}
diesel::table! {
instance (id) {
id -> Int4,
@ -683,11 +693,12 @@ diesel::table! {
newest_comment_time -> Timestamptz,
featured_community -> Bool,
featured_local -> Bool,
hot_rank -> Int4,
hot_rank_active -> Int4,
hot_rank -> Float8,
hot_rank_active -> Float8,
community_id -> Int4,
creator_id -> Int4,
controversy_rank -> Float8,
scaled_rank -> Float8,
}
}
@ -893,6 +904,7 @@ diesel::joinable!(custom_emoji_keyword -> custom_emoji (custom_emoji_id));
diesel::joinable!(email_verification -> local_user (local_user_id));
diesel::joinable!(federation_allowlist -> instance (instance_id));
diesel::joinable!(federation_blocklist -> instance (instance_id));
diesel::joinable!(image_upload -> local_user (local_user_id));
diesel::joinable!(local_site -> site (site_id));
diesel::joinable!(local_site_rate_limit -> local_site (local_site_id));
diesel::joinable!(local_user -> person (person_id));
@ -967,6 +979,7 @@ diesel::allow_tables_to_appear_in_same_query!(
email_verification,
federation_allowlist,
federation_blocklist,
image_upload,
instance,
language,
local_site,

View file

@ -347,7 +347,7 @@ pub fn naive_now() -> DateTime<Utc> {
pub fn post_to_comment_sort_type(sort: SortType) -> CommentSortType {
match sort {
SortType::Active | SortType::Hot => CommentSortType::Hot,
SortType::Active | SortType::Hot | SortType::Scaled => CommentSortType::Hot,
SortType::New | SortType::NewComments | SortType::MostComments => CommentSortType::New,
SortType::Old => CommentSortType::Old,
SortType::Controversial => CommentSortType::Controversial,
@ -384,7 +384,11 @@ pub mod functions {
use diesel::sql_types::{BigInt, Text, Timestamptz};
sql_function! {
fn hot_rank(score: BigInt, time: Timestamptz) -> Integer;
fn hot_rank(score: BigInt, time: Timestamptz) -> Double;
}
sql_function! {
fn scaled_rank(score: BigInt, time: Timestamptz, users_active_month: BigInt) -> Double;
}
sql_function! {

View file

@ -432,7 +432,7 @@ mod tests {
downvotes: 0,
published: agg.published,
child_count: 0,
hot_rank: 1728,
hot_rank: 0.1728,
controversy_rank: 0.0,
},
my_vote: None,

View file

@ -886,7 +886,7 @@ mod tests {
downvotes: 0,
published: agg.published,
child_count: 5,
hot_rank: 1728,
hot_rank: 0.1728,
controversy_rank: 0.0,
},
}

View file

@ -380,6 +380,9 @@ fn queries<'a>() -> Queries<
SortType::Hot => query
.then_order_by(post_aggregates::hot_rank.desc())
.then_order_by(post_aggregates::published.desc()),
SortType::Scaled => query
.then_order_by(post_aggregates::scaled_rank.desc())
.then_order_by(post_aggregates::published.desc()),
SortType::Controversial => query.then_order_by(post_aggregates::controversy_rank.desc()),
SortType::New => query.then_order_by(post_aggregates::published.desc()),
SortType::Old => query.then_order_by(post_aggregates::published.asc()),
@ -1154,9 +1157,10 @@ mod tests {
newest_comment_time: inserted_post.published,
featured_community: false,
featured_local: false,
hot_rank: 1728,
hot_rank_active: 1728,
hot_rank: 0.1728,
hot_rank_active: 0.1728,
controversy_rank: 0.0,
scaled_rank: 0.3621,
community_id: inserted_post.community_id,
creator_id: inserted_post.creator_id,
},

View file

@ -105,7 +105,7 @@ fn queries<'a>() -> Queries<
}
match options.sort.unwrap_or(Hot) {
Hot | Active => query = query.order_by(community_aggregates::hot_rank.desc()),
Hot | Active | Scaled => query = query.order_by(community_aggregates::hot_rank.desc()),
NewComments | TopDay | TopTwelveHour | TopSixHour | TopHour => {
query = query.order_by(community_aggregates::users_active_day.desc())
}

View file

@ -0,0 +1,87 @@
DROP FUNCTION scaled_rank;
ALTER TABLE community_aggregates
ALTER COLUMN hot_rank TYPE integer,
ALTER COLUMN hot_rank SET DEFAULT 1728;
ALTER TABLE comment_aggregates
ALTER COLUMN hot_rank TYPE integer,
ALTER COLUMN hot_rank SET DEFAULT 1728;
ALTER TABLE post_aggregates
ALTER COLUMN hot_rank TYPE integer,
ALTER COLUMN hot_rank SET DEFAULT 1728,
ALTER COLUMN hot_rank_active TYPE integer,
ALTER COLUMN hot_rank_active SET DEFAULT 1728;
-- Change back to integer version
DROP FUNCTION hot_rank (numeric, published timestamp with time zone);
CREATE OR REPLACE FUNCTION hot_rank (score numeric, published timestamp with time zone)
RETURNS integer
AS $$
DECLARE
hours_diff numeric := EXTRACT(EPOCH FROM (now() - published)) / 3600;
BEGIN
IF (hours_diff > 0) THEN
RETURN floor(10000 * log(greatest (1, score + 3)) / power((hours_diff + 2), 1.8))::integer;
ELSE
-- if the post is from the future, set hot score to 0. otherwise you can game the post to
-- always be on top even with only 1 vote by setting it to the future
RETURN 0;
END IF;
END;
$$
LANGUAGE plpgsql
IMMUTABLE PARALLEL SAFE;
ALTER TABLE post_aggregates
DROP COLUMN scaled_rank;
-- The following code is necessary because postgres can't remove
-- a single enum value.
ALTER TABLE local_user
ALTER default_sort_type DROP DEFAULT;
UPDATE
local_user
SET
default_sort_type = 'Hot'
WHERE
default_sort_type = 'Scaled';
-- rename the old enum
ALTER TYPE sort_type_enum RENAME TO sort_type_enum__;
-- create the new enum
CREATE TYPE sort_type_enum AS ENUM (
'Active',
'Hot',
'New',
'Old',
'TopDay',
'TopWeek',
'TopMonth',
'TopYear',
'TopAll',
'MostComments',
'NewComments',
'TopHour',
'TopSixHour',
'TopTwelveHour',
'TopThreeMonths',
'TopSixMonths',
'TopNineMonths'
);
-- alter all your enum columns
ALTER TABLE local_user
ALTER COLUMN default_sort_type TYPE sort_type_enum
USING default_sort_type::text::sort_type_enum;
ALTER TABLE local_user
ALTER default_sort_type SET DEFAULT 'Active';
-- drop the old enum
DROP TYPE sort_type_enum__;

View file

@ -0,0 +1,74 @@
-- Change hot ranks and functions from an int to a float
ALTER TABLE community_aggregates
ALTER COLUMN hot_rank TYPE float,
ALTER COLUMN hot_rank SET DEFAULT 0.1728;
ALTER TABLE comment_aggregates
ALTER COLUMN hot_rank TYPE float,
ALTER COLUMN hot_rank SET DEFAULT 0.1728;
ALTER TABLE post_aggregates
ALTER COLUMN hot_rank TYPE float,
ALTER COLUMN hot_rank SET DEFAULT 0.1728,
ALTER COLUMN hot_rank_active TYPE float,
ALTER COLUMN hot_rank_active SET DEFAULT 0.1728;
DROP FUNCTION hot_rank (numeric, published timestamp with time zone);
CREATE OR REPLACE FUNCTION hot_rank (score numeric, published timestamp with time zone)
RETURNS float
AS $$
DECLARE
hours_diff numeric := EXTRACT(EPOCH FROM (now() - published)) / 3600;
BEGIN
-- 24 * 7 = 168, so after a week, it will default to 0.
IF (hours_diff > 0 AND hours_diff < 168) THEN
RETURN log(greatest (1, score + 3)) / power((hours_diff + 2), 1.8);
ELSE
-- if the post is from the future, set hot score to 0. otherwise you can game the post to
-- always be on top even with only 1 vote by setting it to the future
RETURN 0.0;
END IF;
END;
$$
LANGUAGE plpgsql
IMMUTABLE PARALLEL SAFE;
-- The new scaled rank function
CREATE OR REPLACE FUNCTION scaled_rank (score numeric, published timestamp with time zone, users_active_month numeric)
RETURNS float
AS $$
BEGIN
-- Add 2 to avoid divide by zero errors
-- Default for score = 1, active users = 1, and now, is (0.1728 / log(2 + 1)) = 0.3621
-- There may need to be a scale factor multiplied to users_active_month, to make
-- the log curve less pronounced. This can be tuned in the future.
RETURN (hot_rank (score, published) / log(2 + users_active_month));
END;
$$
LANGUAGE plpgsql
IMMUTABLE PARALLEL SAFE;
ALTER TABLE post_aggregates
ADD COLUMN scaled_rank float NOT NULL DEFAULT 0.3621;
UPDATE
post_aggregates
SET
scaled_rank = 0
WHERE
hot_rank = 0
OR hot_rank_active = 0;
CREATE INDEX idx_post_aggregates_featured_community_scaled ON post_aggregates (featured_community DESC, scaled_rank DESC, published DESC);
CREATE INDEX idx_post_aggregates_featured_local_scaled ON post_aggregates (featured_local DESC, scaled_rank DESC, published DESC);
-- We forgot to add the controversial sort type
ALTER TYPE sort_type_enum
ADD VALUE 'Controversial';
-- Add the Scaled enum
ALTER TYPE sort_type_enum
ADD VALUE 'Scaled';

View file

@ -154,22 +154,16 @@ fn startup_jobs(db_url: &str) {
fn update_hot_ranks(conn: &mut PgConnection) {
info!("Updating hot ranks for all history...");
process_hot_ranks_in_batches(
conn,
"post_aggregates",
"a.hot_rank != 0 OR a.hot_rank_active != 0",
"SET hot_rank = hot_rank(a.score, a.published),
hot_rank_active = hot_rank(a.score, a.newest_comment_time_necro)",
);
process_post_aggregates_ranks_in_batches(conn);
process_hot_ranks_in_batches(
process_ranks_in_batches(
conn,
"comment_aggregates",
"a.hot_rank != 0",
"SET hot_rank = hot_rank(a.score, a.published)",
);
process_hot_ranks_in_batches(
process_ranks_in_batches(
conn,
"community_aggregates",
"a.hot_rank != 0",
@ -189,7 +183,7 @@ struct HotRanksUpdateResult {
/// In `where_clause` and `set_clause`, "a" will refer to the current aggregates table.
/// Locked rows are skipped in order to prevent deadlocks (they will likely get updated on the next
/// run)
fn process_hot_ranks_in_batches(
fn process_ranks_in_batches(
conn: &mut PgConnection,
table_name: &str,
where_clause: &str,
@ -241,6 +235,55 @@ fn process_hot_ranks_in_batches(
);
}
/// Post aggregates is a special case, since it needs to join to the community_aggregates
/// table, to get the active monthly user counts.
fn process_post_aggregates_ranks_in_batches(conn: &mut PgConnection) {
let process_start_time: DateTime<Utc> = Utc
.timestamp_opt(0, 0)
.single()
.expect("0 timestamp creation");
let update_batch_size = 1000; // Bigger batches than this tend to cause seq scans
let mut processed_rows_count = 0;
let mut previous_batch_result = Some(process_start_time);
while let Some(previous_batch_last_published) = previous_batch_result {
let result = sql_query(
r#"WITH batch AS (SELECT pa.id
FROM post_aggregates pa
WHERE pa.published > $1
AND (pa.hot_rank != 0 OR pa.hot_rank_active != 0)
ORDER BY pa.published
LIMIT $2
FOR UPDATE SKIP LOCKED)
UPDATE post_aggregates pa
SET hot_rank = hot_rank(pa.score, pa.published),
hot_rank_active = hot_rank(pa.score, pa.newest_comment_time_necro),
scaled_rank = scaled_rank(pa.score, pa.published, ca.users_active_month)
FROM batch, community_aggregates ca
WHERE pa.id = batch.id and pa.community_id = ca.community_id RETURNING pa.published;
"#,
)
.bind::<Timestamptz, _>(previous_batch_last_published)
.bind::<Integer, _>(update_batch_size)
.get_results::<HotRanksUpdateResult>(conn);
match result {
Ok(updated_rows) => {
processed_rows_count += updated_rows.len();
previous_batch_result = updated_rows.last().map(|row| row.published);
}
Err(e) => {
error!("Failed to update {} hot_ranks: {}", "post_aggregates", e);
break;
}
}
}
info!(
"Finished process_hot_ranks_in_batches execution for {} (processed {} rows)",
"post_aggregates", processed_rows_count
);
}
fn delete_expired_captcha_answers(conn: &mut PgConnection) {
diesel::delete(
captcha_answer::table.filter(captcha_answer::published.lt(now() - IntervalDsl::minutes(10))),