mirror of
https://github.com/LemmyNet/lemmy.git
synced 2024-11-25 10:51:03 +00:00
Auto-detect post language (fixes #2870)
This commit is contained in:
parent
63ea99d38a
commit
d566710e96
6 changed files with 1114 additions and 42 deletions
1004
Cargo.lock
generated
1004
Cargo.lock
generated
File diff suppressed because it is too large
Load diff
|
@ -120,6 +120,7 @@ reqwest = { version = "0.12.7", default-features = false, features = [
|
|||
"gzip",
|
||||
"rustls-tls",
|
||||
] }
|
||||
cfg-if = "1"
|
||||
reqwest-middleware = "0.3.3"
|
||||
reqwest-tracing = "0.5.3"
|
||||
clokwerk = "0.4.0"
|
||||
|
|
|
@ -38,7 +38,7 @@ full = [
|
|||
"rustls",
|
||||
"i-love-jesus",
|
||||
"tuplex",
|
||||
"diesel-bind-if-some",
|
||||
"diesel-bind-if-some","lingua"
|
||||
]
|
||||
|
||||
[dependencies]
|
||||
|
@ -82,6 +82,8 @@ diesel-bind-if-some = { workspace = true, optional = true }
|
|||
moka.workspace = true
|
||||
derive-new.workspace = true
|
||||
tuplex = { workspace = true, optional = true }
|
||||
lingua = { version = "1.6.2", optional = true }
|
||||
cfg-if.workspace = true
|
||||
|
||||
[dev-dependencies]
|
||||
serial_test = { workspace = true }
|
||||
|
|
84
crates/db_schema/src/detect_language.rs
Normal file
84
crates/db_schema/src/detect_language.rs
Normal file
|
@ -0,0 +1,84 @@
|
|||
use crate::{newtypes::LanguageId, source::language::Language, utils::DbPool};
|
||||
use lemmy_utils::error::LemmyResult;
|
||||
use lingua::{IsoCode639_1, Language as LinguaLanguage, LanguageDetectorBuilder};
|
||||
|
||||
pub async fn detect_language(input: &str, pool: &mut DbPool<'_>) -> LemmyResult<LanguageId> {
|
||||
// TODO: should only detect languages which are allowed in community
|
||||
let detector = LanguageDetectorBuilder::from_iso_codes_639_1(&[
|
||||
IsoCode639_1::EN,
|
||||
IsoCode639_1::ES,
|
||||
IsoCode639_1::DE,
|
||||
])
|
||||
.build();
|
||||
|
||||
let lang: Option<LinguaLanguage> = detector.detect_language_of(input);
|
||||
let Some(lang) = lang else {
|
||||
return Ok(LanguageId(0));
|
||||
};
|
||||
let confidence = detector.compute_language_confidence("languages are awesome", lang);
|
||||
let lang = lang.iso_code_639_1().to_string().to_lowercase();
|
||||
dbg!(&lang, &confidence);
|
||||
if confidence < 0.4 {
|
||||
return Ok(LanguageId(0));
|
||||
}
|
||||
|
||||
Ok(Language::read_id_from_code(pool, &lang).await?)
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {

    use super::*;
    use crate::utils::build_db_pool_for_tests;
    use pretty_assertions::assert_eq;
    use serial_test::serial;

    /// Checks detection for single-language comments and for a comment mixing
    /// two languages.
    // NOTE(review): the expected ids (37, 39, 32) are presumably the rows for
    // en/es/de in the test database's language table; confirm against the seed.
    #[tokio::test]
    #[serial]
    async fn test_detect_language() -> LemmyResult<()> {
        let pool = &build_db_pool_for_tests();
        let pool = &mut pool.into();

        // some easy comments
        assert_eq!(
            LanguageId(37),
            detect_language(
                "I don't think it's supposed to be taken seriously. It's just a throwaway meme.
",
                pool
            )
            .await?
        );
        assert_eq!(
            LanguageId(39),
            detect_language(
                "Oh! Mencion casual de la mejor pelicula navideña… Die hard!
",
                pool
            )
            .await?
        );
        assert_eq!(
            LanguageId(32),
            detect_language(
                "Die Forderung finde ich nutzlos.
",
                pool
            )
            .await?
        );

        // different languages
        assert_eq!(
            LanguageId(0),
            detect_language(
                "Die Forderung finde ich nutzlos. It's just a throwaway meme.
",
                pool
            )
            .await?
        );
        Ok(())
    }
}
|
|
@ -1,34 +1,23 @@
|
|||
#![recursion_limit = "256"]
|
||||
use cfg_if::cfg_if;
|
||||
|
||||
#[cfg(feature = "full")]
|
||||
cfg_if! {
|
||||
if #[cfg(feature = "full")] {
|
||||
#[macro_use]
|
||||
extern crate diesel;
|
||||
#[cfg(feature = "full")]
|
||||
#[macro_use]
|
||||
extern crate diesel_derive_newtype;
|
||||
|
||||
#[cfg(feature = "full")]
|
||||
#[macro_use]
|
||||
extern crate diesel_derive_enum;
|
||||
|
||||
// this is used in tests
|
||||
#[cfg(feature = "full")]
|
||||
#[macro_use]
|
||||
extern crate diesel_migrations;
|
||||
|
||||
#[cfg(feature = "full")]
|
||||
#[macro_use]
|
||||
extern crate async_trait;
|
||||
|
||||
pub mod aggregates;
|
||||
#[cfg(feature = "full")]
|
||||
pub mod impls;
|
||||
pub mod newtypes;
|
||||
pub mod sensitive;
|
||||
#[cfg(feature = "full")]
|
||||
#[rustfmt::skip]
|
||||
pub mod schema;
|
||||
#[cfg(feature = "full")]
|
||||
pub mod detect_language;
|
||||
pub mod aliases {
|
||||
use crate::schema::{community_actions, person};
|
||||
diesel::alias!(
|
||||
|
@ -37,6 +26,12 @@ pub mod aliases {
|
|||
person as person2: Person2,
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub mod aggregates;
|
||||
pub mod newtypes;
|
||||
pub mod sensitive;
|
||||
pub mod source;
|
||||
#[cfg(feature = "full")]
|
||||
pub mod traits;
|
||||
|
|
|
@ -83,7 +83,7 @@ lettre = { version = "0.11.10", default-features = false, features = [
|
|||
markdown-it = { version = "0.6.1", optional = true }
|
||||
ts-rs = { workspace = true, optional = true }
|
||||
enum-map = { workspace = true, optional = true }
|
||||
cfg-if = "1"
|
||||
cfg-if.workspace = true
|
||||
clearurls = { version = "0.0.4", features = ["linkify"] }
|
||||
markdown-it-block-spoiler = "1.0.0"
|
||||
markdown-it-sub = "1.0.0"
|
||||
|
|
Loading…
Reference in a new issue