* generate sitemap.xml file

* set up endpoint for sitemap

* Update sitemap generation

- remove sitemap generation from scheduled tasks
- add posts query for sitemap
- create sitemap module in API crate

* remove priority and change freq from sitemap

* add configuration option for number of posts for sitemap

* fix default config

* rate limit sitemap endpoint

* update sitemap query

* update sitemap generation

- remove config value for query limit
- adjust sitemap generation to query changes
- tidy up error handling

* refactor sitemap generation loop

* remove `limit` argument

* refactor `generate_urlset` and add unit test

* change query to only fetch local posts of past 24h

* fix outdated comment and log

* cargo fmt
This commit is contained in:
Lukas Trombach 2023-08-23 02:30:15 +12:00 committed by GitHub
parent ab828b81e4
commit 28324ad2c8
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 198 additions and 0 deletions

28
Cargo.lock generated
View file

@ -1603,6 +1603,15 @@ version = "1.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "90e5c1c8368803113bf0c9584fc495a58b86dc8a29edbf8fe877d21d9507e797" checksum = "90e5c1c8368803113bf0c9584fc495a58b86dc8a29edbf8fe877d21d9507e797"
[[package]]
name = "elementtree"
version = "1.2.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3efd4742acf458718a6456e0adf0b4d734d6b783e452bbf1ac36bf31f4085cb3"
dependencies = [
"string_cache",
]
[[package]] [[package]]
name = "email-encoding" name = "email-encoding"
version = "0.2.0" version = "0.2.0"
@ -2581,6 +2590,7 @@ dependencies = [
"bcrypt", "bcrypt",
"captcha", "captcha",
"chrono", "chrono",
"elementtree",
"lemmy_api_common", "lemmy_api_common",
"lemmy_db_schema", "lemmy_db_schema",
"lemmy_db_views", "lemmy_db_views",
@ -2589,8 +2599,10 @@ dependencies = [
"lemmy_utils", "lemmy_utils",
"serde", "serde",
"serial_test", "serial_test",
"sitemap-rs",
"tokio", "tokio",
"tracing", "tracing",
"url",
"uuid", "uuid",
"wav", "wav",
] ]
@ -4745,6 +4757,16 @@ version = "0.3.10"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7bd3e3206899af3f8b12af284fafc038cc1dc2b41d1b89dd17297221c5d225de" checksum = "7bd3e3206899af3f8b12af284fafc038cc1dc2b41d1b89dd17297221c5d225de"
[[package]]
name = "sitemap-rs"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "95b58125f0ab4317b5ba3cdc1f60696e47958760e356874c759334fa56ae1596"
dependencies = [
"chrono",
"xml-builder",
]
[[package]] [[package]]
name = "skeptic" name = "skeptic"
version = "0.13.7" version = "0.13.7"
@ -6132,6 +6154,12 @@ dependencies = [
"winapi", "winapi",
] ]
[[package]]
name = "xml-builder"
version = "0.5.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "efc4f1a86af7800dfc4056c7833648ea4515ae21502060b5c98114d828f5333b"
[[package]] [[package]]
name = "xml5ever" name = "xml5ever"
version = "0.17.0" version = "0.17.0"

View file

@ -31,8 +31,11 @@ captcha = { workspace = true }
anyhow = { workspace = true } anyhow = { workspace = true }
tracing = { workspace = true } tracing = { workspace = true }
chrono = { workspace = true } chrono = { workspace = true }
url = { workspace = true }
wav = "1.0.0" wav = "1.0.0"
sitemap-rs = "0.2.0"
[dev-dependencies] [dev-dependencies]
serial_test = { workspace = true } serial_test = { workspace = true }
tokio = { workspace = true } tokio = { workspace = true }
elementtree = "1.2.3"

View file

@ -18,6 +18,7 @@ pub mod post_report;
pub mod private_message; pub mod private_message;
pub mod private_message_report; pub mod private_message_report;
pub mod site; pub mod site;
pub mod sitemap;
#[async_trait::async_trait(?Send)] #[async_trait::async_trait(?Send)]
pub trait Perform { pub trait Perform {

142
crates/api/src/sitemap.rs Normal file
View file

@ -0,0 +1,142 @@
use actix_web::{
http::header::{self, CacheDirective},
web::Data,
HttpResponse,
};
use chrono::{DateTime, FixedOffset};
use lemmy_api_common::context::LemmyContext;
use lemmy_db_schema::{newtypes::DbUrl, source::post::Post};
use lemmy_utils::error::LemmyResult;
use sitemap_rs::{url::Url, url_set::UrlSet};
use tracing::info;
async fn generate_urlset(posts: Vec<(DbUrl, chrono::NaiveDateTime)>) -> LemmyResult<UrlSet> {
let urls = posts
.into_iter()
.map_while(|post| {
Url::builder(post.0.to_string())
.last_modified(DateTime::from_utc(
post.1,
FixedOffset::east_opt(0).expect("Error setting timezone offset"), // TODO what is the proper timezone offset here?
))
.build()
.ok()
})
.collect();
Ok(UrlSet::new(urls)?)
}
/// HTTP handler that responds with a freshly generated XML sitemap.
///
/// Loads the posts selected by `Post::list_for_sitemap`, turns them into a URL
/// set, serializes that set into an in-memory buffer and returns it as
/// `application/xml` with a `Cache-Control: max-age` of 86 400 seconds (24 h).
///
/// # Errors
///
/// Propagates failures from the database query, from building the URL set and
/// from writing the XML.
pub async fn get_sitemap(context: Data<LemmyContext>) -> LemmyResult<HttpResponse> {
  info!("Generating sitemap with posts from last {} hours...", 24);
  let posts = Post::list_for_sitemap(&mut context.pool()).await?;
  info!("Loaded latest {} posts", posts.len());

  // Serialize the whole sitemap into memory before building the response.
  let url_set = generate_urlset(posts).await?;
  let mut xml: Vec<u8> = Vec::new();
  url_set.write(&mut xml)?;

  // 86 400 s = 24 h
  let cache_control = header::CacheControl(vec![CacheDirective::MaxAge(86_400)]);
  let response = HttpResponse::Ok()
    .content_type("application/xml")
    .insert_header(cache_control)
    .body(xml);
  Ok(response)
}
#[cfg(test)]
pub(crate) mod tests {
  #![allow(clippy::unwrap_used)]

  use crate::sitemap::generate_urlset;
  use chrono::{NaiveDate, NaiveDateTime};
  use elementtree::Element;
  use lemmy_db_schema::newtypes::DbUrl;
  use url::Url;

  /// Builds a `NaiveDateTime` from calendar and clock components, panicking on
  /// invalid input (acceptable in test fixtures).
  fn naive(year: i32, month: u32, day: u32, hour: u32, min: u32, sec: u32) -> NaiveDateTime {
    NaiveDate::from_ymd_opt(year, month, day)
      .unwrap()
      .and_hms_opt(hour, min, sec)
      .unwrap()
  }

  /// Returns the text of the first child of `parent` whose tag name is `tag`.
  fn child_text<'a>(parent: &'a Element, tag: &str) -> &'a str {
    parent
      .children()
      .find(|element| element.tag().name() == tag)
      .unwrap()
      .text()
  }

  /// Generates a URL set from two posts and checks the structure and contents
  /// of the resulting XML document.
  #[tokio::test]
  async fn test_generate_urlset() {
    let posts: Vec<(DbUrl, NaiveDateTime)> = vec![
      (
        Url::parse("https://example.com").unwrap().into(),
        naive(2022, 12, 1, 9, 10, 11),
      ),
      (
        Url::parse("https://lemmy.ml").unwrap().into(),
        naive(2023, 1, 1, 1, 2, 3),
      ),
    ];

    let url_set = generate_urlset(posts).await.unwrap();
    let mut xml = Vec::<u8>::new();
    url_set.write(&mut xml).unwrap();

    let root = Element::from_reader(xml.as_slice()).unwrap();
    assert_eq!(root.tag().name(), "urlset");
    assert_eq!(root.child_count(), 2);

    // Expected (loc, lastmod) text for each <url> entry, in document order.
    let expected = [
      ("https://example.com/", "2022-12-01T09:10:11+00:00"),
      ("https://lemmy.ml/", "2023-01-01T01:02:03+00:00"),
    ];

    for (url, (loc, lastmod)) in root.children().zip(expected) {
      assert_eq!(url.tag().name(), "url");
      assert_eq!(url.child_count(), 2);

      // Children must appear as <loc> followed by <lastmod>.
      let mut fields = url.children();
      assert!(fields
        .next()
        .is_some_and(|element| element.tag().name() == "loc"));
      assert!(fields
        .next()
        .is_some_and(|element| element.tag().name() == "lastmod"));

      assert_eq!(child_text(url, "loc"), loc);
      assert_eq!(child_text(url, "lastmod"), lastmod);
    }
  }
}

View file

@ -1,3 +1,4 @@
use super::instance::coalesce;
use crate::{ use crate::{
newtypes::{CommunityId, DbUrl, PersonId, PostId}, newtypes::{CommunityId, DbUrl, PersonId, PostId},
schema::post::dsl::{ schema::post::dsl::{
@ -7,6 +8,7 @@ use crate::{
creator_id, creator_id,
deleted, deleted,
featured_community, featured_community,
local,
name, name,
post, post,
published, published,
@ -30,6 +32,7 @@ use crate::{
utils::{get_conn, naive_now, DbPool, DELETED_REPLACEMENT_TEXT, FETCH_LIMIT_MAX}, utils::{get_conn, naive_now, DbPool, DELETED_REPLACEMENT_TEXT, FETCH_LIMIT_MAX},
}; };
use ::url::Url; use ::url::Url;
use chrono::{Duration, Utc};
use diesel::{dsl::insert_into, result::Error, ExpressionMethods, QueryDsl, TextExpressionMethods}; use diesel::{dsl::insert_into, result::Error, ExpressionMethods, QueryDsl, TextExpressionMethods};
use diesel_async::RunQueryDsl; use diesel_async::RunQueryDsl;
@ -96,6 +99,21 @@ impl Post {
.await .await
} }
/// Lists the posts to include in the sitemap.
///
/// Selects `ap_id` together with `coalesce(updated, published)` for every post
/// that is local, neither deleted nor removed, and published within the last
/// day, ordered by publication time, newest first.
pub async fn list_for_sitemap(
  pool: &mut DbPool<'_>,
) -> Result<Vec<(DbUrl, chrono::NaiveDateTime)>, Error> {
  let conn = &mut get_conn(pool).await?;

  // Only posts published at or after this instant are listed.
  let one_day_ago = Utc::now().naive_utc() - Duration::days(1);

  let recent_local_posts = post
    .select((ap_id, coalesce(updated, published)))
    .filter(local)
    .filter(deleted.eq(false))
    .filter(removed.eq(false))
    .filter(published.ge(one_day_ago))
    .order(published.desc());

  recent_local_posts
    .load::<(DbUrl, chrono::NaiveDateTime)>(conn)
    .await
}
pub async fn permadelete_for_creator( pub async fn permadelete_for_creator(
pool: &mut DbPool<'_>, pool: &mut DbPool<'_>,
for_creator_id: PersonId, for_creator_id: PersonId,

View file

@ -16,6 +16,7 @@ use lemmy_api::{
local_user::{ban_person::ban_from_site, notifications::mark_reply_read::mark_reply_as_read}, local_user::{ban_person::ban_from_site, notifications::mark_reply_read::mark_reply_as_read},
post::{feature::feature_post, like::like_post, lock::lock_post}, post::{feature::feature_post, like::like_post, lock::lock_post},
post_report::create::create_post_report, post_report::create::create_post_report,
sitemap::get_sitemap,
Perform, Perform,
}; };
use lemmy_api_common::{ use lemmy_api_common::{
@ -340,6 +341,11 @@ pub fn config(cfg: &mut web::ServiceConfig, rate_limit: &RateLimitCell) {
.route("/delete", web::post().to(delete_custom_emoji)), .route("/delete", web::post().to(delete_custom_emoji)),
), ),
); );
cfg.service(
web::scope("/sitemap.xml")
.wrap(rate_limit.message())
.route("", web::get().to(get_sitemap)),
);
} }
async fn perform<'a, Data>( async fn perform<'a, Data>(