Auto-detect post language (fixes #2870)

This commit is contained in:
Felix Ableitner 2024-11-21 16:18:05 +01:00
parent 63ea99d38a
commit d566710e96
6 changed files with 1114 additions and 42 deletions

1004
Cargo.lock generated

File diff suppressed because it is too large Load diff

View file

@ -120,6 +120,7 @@ reqwest = { version = "0.12.7", default-features = false, features = [
"gzip", "gzip",
"rustls-tls", "rustls-tls",
] } ] }
cfg-if = "1"
reqwest-middleware = "0.3.3" reqwest-middleware = "0.3.3"
reqwest-tracing = "0.5.3" reqwest-tracing = "0.5.3"
clokwerk = "0.4.0" clokwerk = "0.4.0"

View file

@ -38,7 +38,7 @@ full = [
"rustls", "rustls",
"i-love-jesus", "i-love-jesus",
"tuplex", "tuplex",
"diesel-bind-if-some", "diesel-bind-if-some","lingua"
] ]
[dependencies] [dependencies]
@ -82,6 +82,8 @@ diesel-bind-if-some = { workspace = true, optional = true }
moka.workspace = true moka.workspace = true
derive-new.workspace = true derive-new.workspace = true
tuplex = { workspace = true, optional = true } tuplex = { workspace = true, optional = true }
lingua = { version = "1.6.2", optional = true }
cfg-if.workspace =true
[dev-dependencies] [dev-dependencies]
serial_test = { workspace = true } serial_test = { workspace = true }

View file

@ -0,0 +1,84 @@
use crate::{newtypes::LanguageId, source::language::Language, utils::DbPool};
use lemmy_utils::error::LemmyResult;
use lingua::{IsoCode639_1, Language as LinguaLanguage, LanguageDetectorBuilder};
pub async fn detect_language(input: &str, pool: &mut DbPool<'_>) -> LemmyResult<LanguageId> {
// TODO: should only detect languages which are allowed in community
let detector = LanguageDetectorBuilder::from_iso_codes_639_1(&[
IsoCode639_1::EN,
IsoCode639_1::ES,
IsoCode639_1::DE,
])
.build();
let lang: Option<LinguaLanguage> = detector.detect_language_of(input);
let Some(lang) = lang else {
return Ok(LanguageId(0));
};
let confidence = detector.compute_language_confidence("languages are awesome", lang);
let lang = lang.iso_code_639_1().to_string().to_lowercase();
dbg!(&lang, &confidence);
if confidence < 0.4 {
return Ok(LanguageId(0));
}
Ok(Language::read_id_from_code(pool, &lang).await?)
}
#[cfg(test)]
#[expect(clippy::indexing_slicing)]
mod tests {
use super::*;
use crate::utils::build_db_pool_for_tests;
use pretty_assertions::assert_eq;
use serial_test::serial;
#[tokio::test]
#[serial]
async fn test_detect_language() -> LemmyResult<()> {
let pool = &build_db_pool_for_tests();
let pool = &mut pool.into();
// some easy comments
assert_eq!(
LanguageId(37),
detect_language(
"I don't think it's supposed to be taken seriously. It's just a throwaway meme.
",
pool
)
.await?
);
assert_eq!(
LanguageId(39),
detect_language(
"Oh! Mencion casual de la mejor pelicula navideña… Die hard!
",
pool
)
.await?
);
assert_eq!(
LanguageId(32),
detect_language(
"Die Forderung finde ich nutzlos.
",
pool
)
.await?
);
// different languages
assert_eq!(
LanguageId(0),
detect_language(
"Die Forderung finde ich nutzlos. It's just a throwaway meme.
",
pool
)
.await?
);
Ok(())
}
}

View file

@ -1,34 +1,23 @@
#![recursion_limit = "256"] #![recursion_limit = "256"]
use cfg_if::cfg_if;
#[cfg(feature = "full")] cfg_if! {
if #[cfg(feature = "full")] {
#[macro_use] #[macro_use]
extern crate diesel; extern crate diesel;
#[cfg(feature = "full")]
#[macro_use] #[macro_use]
extern crate diesel_derive_newtype; extern crate diesel_derive_newtype;
#[cfg(feature = "full")]
#[macro_use] #[macro_use]
extern crate diesel_derive_enum; extern crate diesel_derive_enum;
// this is used in tests // this is used in tests
#[cfg(feature = "full")]
#[macro_use] #[macro_use]
extern crate diesel_migrations; extern crate diesel_migrations;
#[cfg(feature = "full")]
#[macro_use] #[macro_use]
extern crate async_trait; extern crate async_trait;
pub mod aggregates;
#[cfg(feature = "full")]
pub mod impls; pub mod impls;
pub mod newtypes;
pub mod sensitive;
#[cfg(feature = "full")]
#[rustfmt::skip] #[rustfmt::skip]
pub mod schema; pub mod schema;
#[cfg(feature = "full")] pub mod detect_language;
pub mod aliases { pub mod aliases {
use crate::schema::{community_actions, person}; use crate::schema::{community_actions, person};
diesel::alias!( diesel::alias!(
@ -37,6 +26,12 @@ pub mod aliases {
person as person2: Person2, person as person2: Person2,
); );
} }
}
}
pub mod aggregates;
pub mod newtypes;
pub mod sensitive;
pub mod source; pub mod source;
#[cfg(feature = "full")] #[cfg(feature = "full")]
pub mod traits; pub mod traits;

View file

@ -83,7 +83,7 @@ lettre = { version = "0.11.10", default-features = false, features = [
markdown-it = { version = "0.6.1", optional = true } markdown-it = { version = "0.6.1", optional = true }
ts-rs = { workspace = true, optional = true } ts-rs = { workspace = true, optional = true }
enum-map = { workspace = true, optional = true } enum-map = { workspace = true, optional = true }
cfg-if = "1" cfg-if.workspace = true
clearurls = { version = "0.0.4", features = ["linkify"] } clearurls = { version = "0.0.4", features = ["linkify"] }
markdown-it-block-spoiler = "1.0.0" markdown-it-block-spoiler = "1.0.0"
markdown-it-sub = "1.0.0" markdown-it-sub = "1.0.0"