Add Prometheus endpoint (#3456)

Add a server for serving Prometheus metrics. Include a configuration
block in the config file. Provide HTTP metrics on the API, along with
process-level metrics and DB pool metrics.
This commit is contained in:
Andrew Fields 2023-07-05 06:25:19 -05:00 committed by GitHub
parent 657c2e37c0
commit 1e99e8b9dc
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
8 changed files with 232 additions and 6 deletions

50
Cargo.lock generated
View file

@ -317,6 +317,18 @@ dependencies = [
"syn 1.0.103", "syn 1.0.103",
] ]
[[package]]
name = "actix-web-prom"
version = "0.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9df3127d20a5d01c9fc9aceb969a38d31a6767e1b48a54d55a8f56c769a84923"
dependencies = [
"actix-web",
"futures-core",
"pin-project-lite",
"prometheus",
]
[[package]] [[package]]
name = "addr2line" name = "addr2line"
version = "0.19.0" version = "0.19.0"
@ -2765,6 +2777,7 @@ dependencies = [
"activitypub_federation", "activitypub_federation",
"actix-cors", "actix-cors",
"actix-web", "actix-web",
"actix-web-prom",
"chrono", "chrono",
"clokwerk", "clokwerk",
"console-subscriber", "console-subscriber",
@ -2782,6 +2795,7 @@ dependencies = [
"opentelemetry 0.17.0", "opentelemetry 0.17.0",
"opentelemetry-otlp 0.10.0", "opentelemetry-otlp 0.10.0",
"pict-rs", "pict-rs",
"prometheus",
"reqwest", "reqwest",
"reqwest-middleware", "reqwest-middleware",
"reqwest-tracing", "reqwest-tracing",
@ -4052,6 +4066,36 @@ dependencies = [
"unicode-ident", "unicode-ident",
] ]
[[package]]
name = "procfs"
version = "0.14.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b1de8dacb0873f77e6aefc6d71e044761fcc68060290f5b1089fcdf84626bb69"
dependencies = [
"bitflags 1.3.2",
"byteorder",
"hex",
"lazy_static",
"rustix",
]
[[package]]
name = "prometheus"
version = "0.13.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "449811d15fbdf5ceb5c1144416066429cf82316e2ec8ce0c1f6f8a02e7bbcf8c"
dependencies = [
"cfg-if",
"fnv",
"lazy_static",
"libc",
"memchr",
"parking_lot 0.12.1",
"procfs",
"protobuf",
"thiserror",
]
[[package]] [[package]]
name = "prost" name = "prost"
version = "0.9.0" version = "0.9.0"
@ -4138,6 +4182,12 @@ dependencies = [
"prost 0.11.0", "prost 0.11.0",
] ]
[[package]]
name = "protobuf"
version = "2.28.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "106dd99e98437432fed6519dedecfade6a06a73bb7b2a1e019fdd2bee5778d94"
[[package]] [[package]]
name = "psm" name = "psm"
version = "0.1.21" version = "0.1.21"

View file

@ -28,6 +28,7 @@ lto = "thin"
embed-pictrs = ["pict-rs"] embed-pictrs = ["pict-rs"]
console = ["console-subscriber", "opentelemetry", "opentelemetry-otlp", "tracing-opentelemetry", "reqwest-tracing/opentelemetry_0_16"] console = ["console-subscriber", "opentelemetry", "opentelemetry-otlp", "tracing-opentelemetry", "reqwest-tracing/opentelemetry_0_16"]
json-log = ["tracing-subscriber/json"] json-log = ["tracing-subscriber/json"]
prometheus-metrics = ["prometheus", "actix-web-prom"]
default = [] default = []
[workspace] [workspace]
@ -144,3 +145,5 @@ futures-util = { workspace = true }
tokio-postgres = { workspace = true } tokio-postgres = { workspace = true }
tokio-postgres-rustls = { workspace = true } tokio-postgres-rustls = { workspace = true }
chrono = { workspace = true } chrono = { workspace = true }
prometheus = { version = "0.13.3", features = ["process"], optional = true }
actix-web-prom = { version = "0.6.0", optional = true }

View file

@ -80,4 +80,8 @@
worker_count: 0 worker_count: 0
# The number of activitypub federation retry workers that can be in-flight concurrently # The number of activitypub federation retry workers that can be in-flight concurrently
retry_count: 0 retry_count: 0
prometheus: {
bind: "127.0.0.1"
port: 10002
}
} }

View file

@ -45,6 +45,10 @@ pub struct Settings {
/// The number of activitypub federation retry workers that can be in-flight concurrently /// The number of activitypub federation retry workers that can be in-flight concurrently
#[default(0)] #[default(0)]
pub retry_count: usize, pub retry_count: usize,
// Prometheus configuration.
#[default(None)]
#[doku(example = "Some(Default::default())")]
pub prometheus: Option<PrometheusConfig>,
} }
#[derive(Debug, Deserialize, Serialize, Clone, SmartDefault, Document)] #[derive(Debug, Deserialize, Serialize, Clone, SmartDefault, Document)]
@ -157,3 +161,16 @@ pub struct SetupConfig {
#[default(None)] #[default(None)]
pub admin_email: Option<String>, pub admin_email: Option<String>,
} }
// Settings for the optional Prometheus metrics endpoint.
// Unknown keys inside this block are rejected on deserialization (`deny_unknown_fields`).
// NOTE(review): the comments in this struct are plain `//` comments, unlike the `///` doc
// comments used on other settings fields, so they presumably do not reach rustdoc or the
// generated config documentation. Confirm whether that is intended.
#[derive(Debug, Deserialize, Serialize, Clone, SmartDefault, Document)]
#[serde(deny_unknown_fields)]
pub struct PrometheusConfig {
// Address that the Prometheus metrics will be served on.
// Defaults to the IPv4 loopback address 127.0.0.1.
#[default(Some(IpAddr::V4(Ipv4Addr::new(127, 0, 0, 1))))]
#[doku(example = "127.0.0.1")]
pub bind: Option<IpAddr>,
// Port that the Prometheus metrics will be served on.
// Defaults to 10002.
// NOTE(review): stored as `i32` although a TCP port fits in `u16`, so consumers have to
// range-check the value before narrowing it.
#[default(Some(10002))]
#[doku(example = "10002")]
pub port: Option<i32>,
}

View file

@ -2,6 +2,9 @@ FROM clux/muslrust:1.70.0 as builder
WORKDIR /app WORKDIR /app
ARG CARGO_BUILD_TARGET=x86_64-unknown-linux-musl ARG CARGO_BUILD_TARGET=x86_64-unknown-linux-musl
# comma-separated list of features to enable
ARG CARGO_BUILD_FEATURES=default
# This can be set to release using --build-arg # This can be set to release using --build-arg
ARG RUST_RELEASE_MODE="debug" ARG RUST_RELEASE_MODE="debug"
@ -13,7 +16,7 @@ COPY . .
RUN --mount=type=cache,target=/app/target \ RUN --mount=type=cache,target=/app/target \
if [ "$RUST_RELEASE_MODE" = "debug" ] ; then \ if [ "$RUST_RELEASE_MODE" = "debug" ] ; then \
echo "pub const VERSION: &str = \"$(git describe --tag)\";" > "crates/utils/src/version.rs" \ echo "pub const VERSION: &str = \"$(git describe --tag)\";" > "crates/utils/src/version.rs" \
&& cargo build --target ${CARGO_BUILD_TARGET} \ && cargo build --target ${CARGO_BUILD_TARGET} --features ${CARGO_BUILD_FEATURES} \
&& cp ./target/$CARGO_BUILD_TARGET/$RUST_RELEASE_MODE/lemmy_server /app/lemmy_server; \ && cp ./target/$CARGO_BUILD_TARGET/$RUST_RELEASE_MODE/lemmy_server /app/lemmy_server; \
fi fi
@ -21,7 +24,7 @@ RUN --mount=type=cache,target=/app/target \
RUN \ RUN \
if [ "$RUST_RELEASE_MODE" = "release" ] ; then \ if [ "$RUST_RELEASE_MODE" = "release" ] ; then \
echo "pub const VERSION: &str = \"$(git describe --tag)\";" > "crates/utils/src/version.rs" \ echo "pub const VERSION: &str = \"$(git describe --tag)\";" > "crates/utils/src/version.rs" \
&& cargo build --target ${CARGO_BUILD_TARGET} --release \ && cargo build --target ${CARGO_BUILD_TARGET} --features ${CARGO_BUILD_FEATURES} --release \
&& cp ./target/$CARGO_BUILD_TARGET/$RUST_RELEASE_MODE/lemmy_server /app/lemmy_server; \ && cp ./target/$CARGO_BUILD_TARGET/$RUST_RELEASE_MODE/lemmy_server /app/lemmy_server; \
fi fi

View file

@ -32,12 +32,17 @@ services:
dockerfile: docker/Dockerfile dockerfile: docker/Dockerfile
# args: # args:
# RUST_RELEASE_MODE: release # RUST_RELEASE_MODE: release
# CARGO_BUILD_FEATURES: default
# this hostname is used in nginx reverse proxy and also for lemmy ui to connect to the backend, do not change # this hostname is used in nginx reverse proxy and also for lemmy ui to connect to the backend, do not change
hostname: lemmy hostname: lemmy
restart: always restart: always
environment: environment:
- RUST_LOG="warn,lemmy_server=debug,lemmy_api=debug,lemmy_api_common=debug,lemmy_api_crud=debug,lemmy_apub=debug,lemmy_db_schema=debug,lemmy_db_views=debug,lemmy_db_views_actor=debug,lemmy_db_views_moderator=debug,lemmy_routes=debug,lemmy_utils=debug,lemmy_websocket=debug" - RUST_LOG="warn,lemmy_server=debug,lemmy_api=debug,lemmy_api_common=debug,lemmy_api_crud=debug,lemmy_apub=debug,lemmy_db_schema=debug,lemmy_db_views=debug,lemmy_db_views_actor=debug,lemmy_db_views_moderator=debug,lemmy_routes=debug,lemmy_utils=debug,lemmy_websocket=debug"
- RUST_BACKTRACE=full - RUST_BACKTRACE=full
ports:
# prometheus metrics available at the path /metrics on port 10002 by default
# enable prometheus metrics by setting the CARGO_BUILD_FEATURES build arg above to "prometheus-metrics"
- "10002:10002"
volumes: volumes:
- ./lemmy.hjson:/config/config.hjson:Z - ./lemmy.hjson:/config/config.hjson:Z
depends_on: depends_on:

View file

@ -1,5 +1,7 @@
pub mod api_routes_http; pub mod api_routes_http;
pub mod code_migrations; pub mod code_migrations;
#[cfg(feature = "prometheus-metrics")]
pub mod prometheus_metrics;
pub mod root_span_builder; pub mod root_span_builder;
pub mod scheduled_tasks; pub mod scheduled_tasks;
#[cfg(feature = "console")] #[cfg(feature = "console")]
@ -35,6 +37,12 @@ use tracing_error::ErrorLayer;
use tracing_log::LogTracer; use tracing_log::LogTracer;
use tracing_subscriber::{filter::Targets, layer::SubscriberExt, Layer, Registry}; use tracing_subscriber::{filter::Targets, layer::SubscriberExt, Layer, Registry};
use url::Url; use url::Url;
#[cfg(feature = "prometheus-metrics")]
use {
actix_web_prom::PrometheusMetricsBuilder,
prometheus::default_registry,
prometheus_metrics::serve_prometheus,
};
/// Max timeout for http requests /// Max timeout for http requests
pub(crate) const REQWEST_TIMEOUT: Duration = Duration::from_secs(10); pub(crate) const REQWEST_TIMEOUT: Duration = Duration::from_secs(10);
@ -119,6 +127,9 @@ pub async fn start_lemmy_server() -> Result<(), LemmyError> {
}); });
} }
#[cfg(feature = "prometheus-metrics")]
serve_prometheus(settings.prometheus.as_ref(), context.clone());
let settings_bind = settings.clone(); let settings_bind = settings.clone();
let federation_config = FederationConfig::builder() let federation_config = FederationConfig::builder()
@ -134,6 +145,14 @@ pub async fn start_lemmy_server() -> Result<(), LemmyError> {
.build() .build()
.await?; .await?;
// this must come before the HttpServer creation
// creates a middleware that populates http metrics for each path, method, and status code
#[cfg(feature = "prometheus-metrics")]
let prom_api_metrics = PrometheusMetricsBuilder::new("lemmy_api")
.registry(default_registry().clone())
.build()
.unwrap();
// Create Http server with websocket support // Create Http server with websocket support
HttpServer::new(move || { HttpServer::new(move || {
let cors_config = if cfg!(debug_assertions) { let cors_config = if cfg!(debug_assertions) {
@ -145,7 +164,7 @@ pub async fn start_lemmy_server() -> Result<(), LemmyError> {
.allowed_origin(&settings.get_protocol_and_hostname()) .allowed_origin(&settings.get_protocol_and_hostname())
}; };
App::new() let app = App::new()
.wrap(middleware::Logger::new( .wrap(middleware::Logger::new(
// This is the default log format save for the usage of %{r}a over %a to guarantee to record the client's (forwarded) IP and not the last peer address, since the latter is frequently just a reverse proxy // This is the default log format save for the usage of %{r}a over %a to guarantee to record the client's (forwarded) IP and not the last peer address, since the latter is frequently just a reverse proxy
"%{r}a '%r' %s %b '%{Referer}i' '%{User-Agent}i' %T", "%{r}a '%r' %s %b '%{Referer}i' '%{User-Agent}i' %T",
@ -155,8 +174,13 @@ pub async fn start_lemmy_server() -> Result<(), LemmyError> {
.wrap(TracingLogger::<QuieterRootSpanBuilder>::new()) .wrap(TracingLogger::<QuieterRootSpanBuilder>::new())
.app_data(Data::new(context.clone())) .app_data(Data::new(context.clone()))
.app_data(Data::new(rate_limit_cell.clone())) .app_data(Data::new(rate_limit_cell.clone()))
.wrap(FederationMiddleware::new(federation_config.clone())) .wrap(FederationMiddleware::new(federation_config.clone()));
#[cfg(feature = "prometheus-metrics")]
let app = app.wrap(prom_api_metrics.clone());
// The routes // The routes
app
.configure(|cfg| api_routes_http::config(cfg, rate_limit_cell)) .configure(|cfg| api_routes_http::config(cfg, rate_limit_cell))
.configure(|cfg| { .configure(|cfg| {
if federation_enabled { if federation_enabled {

120
src/prometheus_metrics.rs Normal file
View file

@ -0,0 +1,120 @@
use actix_web::{rt::System, web, App, HttpResponse, HttpServer, Responder};
use lemmy_api_common::context::LemmyContext;
use lemmy_utils::settings::structs::PrometheusConfig;
use prometheus::{default_registry, Encoder, Gauge, Opts, TextEncoder};
use std::{
net::{IpAddr, Ipv4Addr},
sync::Arc,
thread,
};
/// State shared with the `/metrics` request handler.
struct PromContext {
/// Lemmy application context; its database pool status is read on every scrape.
lemmy: LemmyContext,
/// Gauges that are refreshed from the database pool status.
db_pool_metrics: DbPoolMetrics,
}
/// Gauges describing the database connection pool (exported as `lemmy_db_pool_*`).
struct DbPoolMetrics {
/// Set from the pool status field `max_size`; exported as `lemmy_db_pool_max_connections`.
max_size: Gauge,
/// Set from the pool status field `size`; exported as `lemmy_db_pool_connections`.
size: Gauge,
/// Set from the pool status field `available`; exported as
/// `lemmy_db_pool_available_connections`.
available: Gauge,
}
/// Listen address used when the configuration does not provide `bind`: IPv4 loopback.
const DEFAULT_BIND: IpAddr = IpAddr::V4(Ipv4Addr::LOCALHOST);
/// Listen port used when the configuration does not provide `port`.
const DEFAULT_PORT: i32 = 10002;
/// Starts a dedicated HTTP server that exposes Prometheus metrics at `/metrics`.
///
/// The listen address and port are taken from `config`; every value that is missing
/// (or a missing `config` altogether) falls back to `DEFAULT_BIND` / `DEFAULT_PORT`.
/// The server runs on its own thread with its own actix `System`, so this function
/// returns without waiting for it.
///
/// If the configured port does not fit into a `u16`, an error is printed to stderr and
/// no server is started.
///
/// # Panics
///
/// The spawned thread (not the caller) panics if the address cannot be bound.
pub fn serve_prometheus(config: Option<&PrometheusConfig>, lemmy_context: LemmyContext) {
  let (bind, port) = match config {
    Some(config) => (
      config.bind.unwrap_or(DEFAULT_BIND),
      config.port.unwrap_or(DEFAULT_PORT),
    ),
    None => (DEFAULT_BIND, DEFAULT_PORT),
  };

  // The port is configured as an `i32`. A plain `as u16` cast would silently wrap
  // out-of-range values (e.g. 70000 -> 4464, -1 -> 65535) and listen on an unintended port.
  let port = match u16::try_from(port) {
    Ok(port) => port,
    Err(_) => {
      eprintln!("Invalid Prometheus port {port}: must be between 0 and 65535");
      return;
    }
  };

  let context = Arc::new(PromContext {
    lemmy: lemmy_context,
    db_pool_metrics: create_db_pool_metrics(),
  });

  // spawn thread that blocks on handling requests
  // only mapping /metrics to a handler
  thread::spawn(move || {
    let sys = System::new();
    sys.block_on(async {
      let server = HttpServer::new(move || {
        App::new()
          .app_data(web::Data::new(Arc::clone(&context)))
          .route("/metrics", web::get().to(metrics))
      })
      .bind((bind, port))
      // the panic message is only formatted when binding actually fails
      .unwrap_or_else(|err| panic!("Cannot bind to {bind}:{port}: {err}"))
      .run();

      if let Err(err) = server.await {
        eprintln!("Prometheus server error: {}", err);
      }
    })
  });
}
/// Handler for the `/metrics` path.
///
/// Refreshes the database pool gauges, gathers every metric family from the default
/// registry and returns them in the Prometheus text exposition format. The response
/// carries the content type reported by the encoder. If encoding fails, a
/// `500 Internal Server Error` with the error message is returned instead of panicking
/// inside the request handler.
async fn metrics(context: web::Data<Arc<PromContext>>) -> impl Responder {
  // the pool gauges are only updated here, right before they are scraped
  collect_db_pool_metrics(&context).await;

  let encoder = TextEncoder::new();
  let mut buffer = Vec::new();

  // gather metrics from registry and encode in prometheus format
  match encoder.encode(&prometheus::gather(), &mut buffer) {
    // the encoded bytes are sent as-is, no round trip through `String` is needed
    Ok(()) => HttpResponse::Ok()
      .content_type(encoder.format_type())
      .body(buffer),
    Err(err) => HttpResponse::InternalServerError()
      .body(format!("Failed to encode Prometheus metrics: {err}")),
  }
}
// create lemmy_db_pool_* metrics and register them with the default registry
fn create_db_pool_metrics() -> DbPoolMetrics {
let metrics = DbPoolMetrics {
max_size: Gauge::with_opts(Opts::new(
"lemmy_db_pool_max_connections",
"Maximum number of connections in the pool",
))
.unwrap(),
size: Gauge::with_opts(Opts::new(
"lemmy_db_pool_connections",
"Current number of connections in the pool",
))
.unwrap(),
available: Gauge::with_opts(Opts::new(
"lemmy_db_pool_available_connections",
"Number of available connections in the pool",
))
.unwrap(),
};
default_registry()
.register(Box::new(metrics.max_size.clone()))
.unwrap();
default_registry()
.register(Box::new(metrics.size.clone()))
.unwrap();
default_registry()
.register(Box::new(metrics.available.clone()))
.unwrap();
return metrics;
}
async fn collect_db_pool_metrics(context: &PromContext) {
let pool_status = context.lemmy.pool().status();
context
.db_pool_metrics
.max_size
.set(pool_status.max_size as f64);
context.db_pool_metrics.size.set(pool_status.size as f64);
context
.db_pool_metrics
.available
.set(pool_status.available as f64);
}