mirror of
https://github.com/LemmyNet/lemmy.git
synced 2024-11-25 19:01:03 +00:00
Auto-detect post language (fixes #2870)
This commit is contained in:
parent
63ea99d38a
commit
d566710e96
6 changed files with 1114 additions and 42 deletions
1004
Cargo.lock
generated
1004
Cargo.lock
generated
File diff suppressed because it is too large
Load diff
|
@ -120,6 +120,7 @@ reqwest = { version = "0.12.7", default-features = false, features = [
|
||||||
"gzip",
|
"gzip",
|
||||||
"rustls-tls",
|
"rustls-tls",
|
||||||
] }
|
] }
|
||||||
|
cfg-if = "1"
|
||||||
reqwest-middleware = "0.3.3"
|
reqwest-middleware = "0.3.3"
|
||||||
reqwest-tracing = "0.5.3"
|
reqwest-tracing = "0.5.3"
|
||||||
clokwerk = "0.4.0"
|
clokwerk = "0.4.0"
|
||||||
|
|
|
@ -38,7 +38,7 @@ full = [
|
||||||
"rustls",
|
"rustls",
|
||||||
"i-love-jesus",
|
"i-love-jesus",
|
||||||
"tuplex",
|
"tuplex",
|
||||||
"diesel-bind-if-some",
|
"diesel-bind-if-some","lingua"
|
||||||
]
|
]
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
|
@ -82,6 +82,8 @@ diesel-bind-if-some = { workspace = true, optional = true }
|
||||||
moka.workspace = true
|
moka.workspace = true
|
||||||
derive-new.workspace = true
|
derive-new.workspace = true
|
||||||
tuplex = { workspace = true, optional = true }
|
tuplex = { workspace = true, optional = true }
|
||||||
|
lingua = { version = "1.6.2", optional = true }
|
||||||
|
cfg-if.workspace =true
|
||||||
|
|
||||||
[dev-dependencies]
|
[dev-dependencies]
|
||||||
serial_test = { workspace = true }
|
serial_test = { workspace = true }
|
||||||
|
|
84
crates/db_schema/src/detect_language.rs
Normal file
84
crates/db_schema/src/detect_language.rs
Normal file
|
@ -0,0 +1,84 @@
|
||||||
|
use crate::{newtypes::LanguageId, source::language::Language, utils::DbPool};
|
||||||
|
use lemmy_utils::error::LemmyResult;
|
||||||
|
use lingua::{IsoCode639_1, Language as LinguaLanguage, LanguageDetectorBuilder};
|
||||||
|
|
||||||
|
pub async fn detect_language(input: &str, pool: &mut DbPool<'_>) -> LemmyResult<LanguageId> {
|
||||||
|
// TODO: should only detect languages which are allowed in community
|
||||||
|
let detector = LanguageDetectorBuilder::from_iso_codes_639_1(&[
|
||||||
|
IsoCode639_1::EN,
|
||||||
|
IsoCode639_1::ES,
|
||||||
|
IsoCode639_1::DE,
|
||||||
|
])
|
||||||
|
.build();
|
||||||
|
|
||||||
|
let lang: Option<LinguaLanguage> = detector.detect_language_of(input);
|
||||||
|
let Some(lang) = lang else {
|
||||||
|
return Ok(LanguageId(0));
|
||||||
|
};
|
||||||
|
let confidence = detector.compute_language_confidence("languages are awesome", lang);
|
||||||
|
let lang = lang.iso_code_639_1().to_string().to_lowercase();
|
||||||
|
dbg!(&lang, &confidence);
|
||||||
|
if confidence < 0.4 {
|
||||||
|
return Ok(LanguageId(0));
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(Language::read_id_from_code(pool, &lang).await?)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {

  use super::*;
  use crate::utils::build_db_pool_for_tests;
  use pretty_assertions::assert_eq;
  use serial_test::serial;

  /// Runs `detect_language` against a few short comments and checks the resolved ids.
  ///
  /// The expected ids (37, 39, 32) are hard-coded.
  /// NOTE(review): presumably these are the ids of the English, Spanish and German rows in
  /// the language table; confirm against the migrations.
  #[tokio::test]
  #[serial]
  async fn test_detect_language() -> LemmyResult<()> {
    let pool = &build_db_pool_for_tests();
    let pool = &mut pool.into();

    // some easy comments
    assert_eq!(
      LanguageId(37),
      detect_language(
        "I don't think it's supposed to be taken seriously. It's just a throwaway meme.
",
        pool
      )
      .await?
    );
    assert_eq!(
      LanguageId(39),
      detect_language(
        "Oh! Mencion casual de la mejor pelicula navideña… Die hard!
",
        pool
      )
      .await?
    );
    assert_eq!(
      LanguageId(32),
      detect_language(
        "Die Forderung finde ich nutzlos.
",
        pool
      )
      .await?
    );

    // different languages: a text mixing German and English is expected to resolve to
    // `LanguageId(0)` rather than to either language
    assert_eq!(
      LanguageId(0),
      detect_language(
        "Die Forderung finde ich nutzlos. It's just a throwaway meme.
",
        pool
      )
      .await?
    );
    Ok(())
  }
}
|
|
@ -1,34 +1,23 @@
|
||||||
#![recursion_limit = "256"]
|
#![recursion_limit = "256"]
|
||||||
|
use cfg_if::cfg_if;
|
||||||
|
|
||||||
#[cfg(feature = "full")]
|
cfg_if! {
|
||||||
|
if #[cfg(feature = "full")] {
|
||||||
#[macro_use]
|
#[macro_use]
|
||||||
extern crate diesel;
|
extern crate diesel;
|
||||||
#[cfg(feature = "full")]
|
|
||||||
#[macro_use]
|
#[macro_use]
|
||||||
extern crate diesel_derive_newtype;
|
extern crate diesel_derive_newtype;
|
||||||
|
|
||||||
#[cfg(feature = "full")]
|
|
||||||
#[macro_use]
|
#[macro_use]
|
||||||
extern crate diesel_derive_enum;
|
extern crate diesel_derive_enum;
|
||||||
|
|
||||||
// this is used in tests
|
// this is used in tests
|
||||||
#[cfg(feature = "full")]
|
|
||||||
#[macro_use]
|
#[macro_use]
|
||||||
extern crate diesel_migrations;
|
extern crate diesel_migrations;
|
||||||
|
|
||||||
#[cfg(feature = "full")]
|
|
||||||
#[macro_use]
|
#[macro_use]
|
||||||
extern crate async_trait;
|
extern crate async_trait;
|
||||||
|
|
||||||
pub mod aggregates;
|
|
||||||
#[cfg(feature = "full")]
|
|
||||||
pub mod impls;
|
pub mod impls;
|
||||||
pub mod newtypes;
|
|
||||||
pub mod sensitive;
|
|
||||||
#[cfg(feature = "full")]
|
|
||||||
#[rustfmt::skip]
|
#[rustfmt::skip]
|
||||||
pub mod schema;
|
pub mod schema;
|
||||||
#[cfg(feature = "full")]
|
pub mod detect_language;
|
||||||
pub mod aliases {
|
pub mod aliases {
|
||||||
use crate::schema::{community_actions, person};
|
use crate::schema::{community_actions, person};
|
||||||
diesel::alias!(
|
diesel::alias!(
|
||||||
|
@ -37,6 +26,12 @@ pub mod aliases {
|
||||||
person as person2: Person2,
|
person as person2: Person2,
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub mod aggregates;
|
||||||
|
pub mod newtypes;
|
||||||
|
pub mod sensitive;
|
||||||
pub mod source;
|
pub mod source;
|
||||||
#[cfg(feature = "full")]
|
#[cfg(feature = "full")]
|
||||||
pub mod traits;
|
pub mod traits;
|
||||||
|
|
|
@ -83,7 +83,7 @@ lettre = { version = "0.11.10", default-features = false, features = [
|
||||||
markdown-it = { version = "0.6.1", optional = true }
|
markdown-it = { version = "0.6.1", optional = true }
|
||||||
ts-rs = { workspace = true, optional = true }
|
ts-rs = { workspace = true, optional = true }
|
||||||
enum-map = { workspace = true, optional = true }
|
enum-map = { workspace = true, optional = true }
|
||||||
cfg-if = "1"
|
cfg-if.workspace = true
|
||||||
clearurls = { version = "0.0.4", features = ["linkify"] }
|
clearurls = { version = "0.0.4", features = ["linkify"] }
|
||||||
markdown-it-block-spoiler = "1.0.0"
|
markdown-it-block-spoiler = "1.0.0"
|
||||||
markdown-it-sub = "1.0.0"
|
markdown-it-sub = "1.0.0"
|
||||||
|
|
Loading…
Reference in a new issue