Rework tokio workers to be supervised

This commit is contained in:
asonix 2024-01-14 12:48:02 -05:00
parent 6665ced671
commit 7eaa3d8c72
3 changed files with 118 additions and 67 deletions

View file

@ -50,6 +50,8 @@
//! //!
//! // tokio::signal::ctrl_c().await?; //! // tokio::signal::ctrl_c().await?;
//! //!
//! drop(queue_handle);
//!
//! Ok(()) //! Ok(())
//! } //! }
//! //!
@ -118,10 +120,11 @@ use background_jobs_core::{
memory_storage::Timer, new_job, new_scheduled_job, Job, ProcessorMap, Storage as StorageTrait, memory_storage::Timer, new_job, new_scheduled_job, Job, ProcessorMap, Storage as StorageTrait,
}; };
use std::{ use std::{
collections::BTreeMap, collections::{BTreeMap, HashMap},
sync::Arc, sync::{Arc, Mutex},
time::{Duration, SystemTime}, time::{Duration, SystemTime},
}; };
use tokio::task::{JoinHandle, JoinSet};
mod every; mod every;
mod spawn; mod spawn;
@ -151,6 +154,7 @@ where
{ {
QueueHandle { QueueHandle {
inner: Storage::new(storage), inner: Storage::new(storage),
manager_handle: Some(Arc::new(Mutex::new(None))),
} }
} }
@ -218,23 +222,80 @@ where
} }
/// Start the workers in the provided arbiter /// Start the workers in the provided arbiter
pub fn start(self) -> QueueHandle { pub fn start(self) -> std::io::Result<QueueHandle> {
for (key, count) in self.queues.iter() { let Self {
processors,
queues,
queue_handle,
} = self;
let mut sets = HashMap::new();
for (key, count) in queues.iter() {
let mut set = JoinSet::new();
for _ in 0..*count { for _ in 0..*count {
let queue = key.clone(); let queue = key.clone();
let processors = self.processors.clone(); let processors = processors.clone();
let server = self.queue_handle.inner.clone(); let server = queue_handle.inner.clone();
if let Err(e) = spawn::spawn( spawn::spawn_in(
&mut set,
"local-worker", "local-worker",
worker::local_worker(queue, processors.clone(), server), worker::local_worker(queue, processors.clone(), server),
) { )?;
tracing::error!("Failed to spawn worker {e}");
}
} }
sets.insert(key.clone(), set);
} }
self.queue_handle let server = queue_handle.inner.clone();
let manager_task = crate::spawn::spawn("set-supervisor", async move {
let mut superset = JoinSet::new();
for (queue, mut set) in sets {
let server = server.clone();
let processors = processors.clone();
if let Err(e) = spawn::spawn_in(&mut superset, "worker-supervisor", async move {
while let Some(_) = set.join_next().await {
metrics::counter!("background-jobs.tokio.worker.finished", "queue" => queue.clone())
.increment(1);
tracing::warn!("worker closed, spawning another");
if let Err(e) = spawn::spawn_in(
&mut set,
"local-worker",
worker::local_worker(queue.clone(), processors.clone(), server.clone()),
) {
tracing::warn!("Failed to respawn worker: {e}");
break;
}
metrics::counter!("background-jobs.tokio.worker.restart").increment(1);
}
}) {
tracing::warn!("Failed to spawn worker supervisor: {e}");
break;
}
}
let mut count = 0;
while superset.join_next().await.is_some() {
count += 1;
tracing::info!("Joined worker-supervisor {count}");
}
})?;
*queue_handle
.manager_handle
.as_ref()
.unwrap()
.lock()
.unwrap() = Some(manager_task);
Ok(queue_handle)
} }
} }
@ -245,6 +306,7 @@ where
#[derive(Clone)] #[derive(Clone)]
pub struct QueueHandle { pub struct QueueHandle {
inner: Storage, inner: Storage,
manager_handle: Option<Arc<Mutex<Option<JoinHandle<()>>>>>,
} }
impl QueueHandle { impl QueueHandle {
@ -285,3 +347,17 @@ impl QueueHandle {
spawn::spawn("every", every(self.clone(), duration, job)).map(|_| ()) spawn::spawn("every", every(self.clone(), duration, job)).map(|_| ())
} }
} }
impl Drop for QueueHandle {
    /// Abort the supervisor task once the last `QueueHandle` clone goes away.
    fn drop(&mut self) {
        let manager_task = self
            .manager_handle
            .take()
            // Only the final owner of the `Arc` gets the mutex back; every other clone
            // sees `None` here and does nothing.
            .and_then(Arc::into_inner)
            // The mutex is owned outright at this point, so consume it rather than lock
            // it. Recovering from poisoning keeps `drop` from panicking.
            .and_then(|slot| {
                slot.into_inner()
                    .unwrap_or_else(|poisoned| poisoned.into_inner())
            });

        if let Some(handle) = manager_task {
            tracing::debug!("Dropping last QueueHandle");
            handle.abort();
        }
    }
}

View file

@ -1,6 +1,6 @@
use std::future::Future; use std::future::Future;
use tokio::task::JoinHandle; use tokio::task::{AbortHandle, JoinHandle, JoinSet};
#[cfg(tokio_unstable)] #[cfg(tokio_unstable)]
pub(crate) fn spawn<F>(name: &str, future: F) -> std::io::Result<JoinHandle<F::Output>> pub(crate) fn spawn<F>(name: &str, future: F) -> std::io::Result<JoinHandle<F::Output>>
@ -11,6 +11,19 @@ where
tokio::task::Builder::new().name(name).spawn(future) tokio::task::Builder::new().name(name).spawn(future)
} }
/// Spawn `future` into `set` as a task labelled `name`.
///
/// Returns the `AbortHandle` for the new task, or the `std::io::Error` reported by the
/// task builder.
#[cfg(tokio_unstable)]
pub(crate) fn spawn_in<F>(
    set: &mut JoinSet<F::Output>,
    name: &str,
    future: F,
) -> std::io::Result<AbortHandle>
where
    F: Future + Send + 'static,
    F::Output: Send + 'static,
{
    let named_builder = set.build_task().name(name);
    named_builder.spawn(future)
}
#[cfg(not(tokio_unstable))] #[cfg(not(tokio_unstable))]
pub(crate) fn spawn<F>(name: &str, future: F) -> std::io::Result<JoinHandle<F::Output>> pub(crate) fn spawn<F>(name: &str, future: F) -> std::io::Result<JoinHandle<F::Output>>
where where
@ -20,3 +33,17 @@ where
let _ = name; let _ = name;
Ok(tokio::task::spawn(future)) Ok(tokio::task::spawn(future))
} }
/// Spawn `future` into `set`, returning the `AbortHandle` for the new task.
///
/// This variant never produces an error; it always returns `Ok`.
#[cfg(not(tokio_unstable))]
pub(crate) fn spawn_in<F>(
    set: &mut JoinSet<F::Output>,
    name: &str,
    future: F,
) -> std::io::Result<AbortHandle>
where
    F: Future + Send + 'static,
    F::Output: Send + 'static,
{
    // The name is deliberately unused in this configuration.
    let _ = name;

    let abort_handle = set.spawn(future);
    Ok(abort_handle)
}

View file

@ -7,52 +7,6 @@ use std::{
use tracing::{Instrument, Span}; use tracing::{Instrument, Span};
use uuid::Uuid; use uuid::Uuid;
// Holds everything needed to call `local_worker` again; its `Drop` impl uses these
// fields to spawn a replacement worker.
struct LocalWorkerStarter<State: Send + Clone + 'static> {
// Queue name: passed to `local_worker` and used as the "queue" metric label.
queue: String,
// Processor map handed to the replacement worker.
processors: ProcessorMap<State>,
// Storage handle handed to the replacement worker.
server: Storage,
}
/// Check that tasks can still be spawned by spawning an empty, named task.
///
/// Any error from the task builder is converted into an `anyhow::Error`.
#[cfg(tokio_unstable)]
fn test_runtime() -> anyhow::Result<()> {
    let probe = tokio::task::Builder::new().name("runtime-test");

    match probe.spawn(async move {}) {
        Ok(_) => Ok(()),
        Err(e) => Err(From::from(e)),
    }
}
/// Check that tasks can still be spawned by spawning an empty task.
///
/// A panic raised by `tokio::spawn` is caught and reported as an error instead of
/// propagating to the caller.
#[cfg(not(tokio_unstable))]
fn test_runtime() -> anyhow::Result<()> {
    std::panic::catch_unwind(|| tokio::spawn(async move {}))
        .map(|_| ())
        // The panic payload is a `Box<dyn Any + Send>`, which is not a
        // `std::error::Error` and therefore has no `From` conversion into
        // `anyhow::Error`; build the error explicitly.
        .map_err(|_| anyhow::anyhow!("failed to spawn a task on the tokio runtime"))
}
impl<State> Drop for LocalWorkerStarter<State> where State: Send + Clone + 'static {
fn drop(&mut self) {
metrics::counter!("background-jobs.tokio.worker.finished", "queue" => self.queue.clone())
.increment(1);
let res = test_runtime();
if res.is_ok() {
if let Err(e) = crate::spawn::spawn(
"local-worker",
local_worker(
self.queue.clone(),
self.processors.clone(),
self.server.clone(),
),
) {
tracing::error!("Failed to re-spawn local worker: {e}");
} else {
metrics::counter!("background-jobs.tokio.worker.restart").increment(1);
}
} else {
tracing::info!("Shutting down worker");
}
}
}
struct RunOnDrop<F>(F) struct RunOnDrop<F>(F)
where where
F: Fn(); F: Fn();
@ -148,18 +102,13 @@ pub(crate) async fn local_worker<State>(
) where ) where
State: Send + Clone + 'static, State: Send + Clone + 'static,
{ {
metrics::counter!("background-jobs.tokio.worker.started", "queue" => queue.clone()).increment(1); metrics::counter!("background-jobs.tokio.worker.started", "queue" => queue.clone())
.increment(1);
let starter = LocalWorkerStarter {
queue: queue.clone(),
processors: processors.clone(),
server: server.clone(),
};
let id = Uuid::now_v7(); let id = Uuid::now_v7();
let log_on_drop = RunOnDrop(|| { let log_on_drop = RunOnDrop(|| {
make_span(id, &queue, "closing").in_scope(|| tracing::info!("Worker closing")); make_span(id, &queue, "closing").in_scope(|| tracing::debug!("Worker closing"));
}); });
loop { loop {
@ -219,7 +168,6 @@ pub(crate) async fn local_worker<State>(
} }
drop(log_on_drop); drop(log_on_drop);
drop(starter);
} }
fn make_span(id: Uuid, queue: &str, operation: &str) -> Span { fn make_span(id: Uuid, queue: &str, operation: &str) -> Span {