Add new feature for testing with errors, test & fix job retries

This commit is contained in:
asonix 2024-03-10 22:02:27 -05:00
parent 286279cdf5
commit 6f95c72070
13 changed files with 114 additions and 21 deletions

1
Cargo.lock generated
View file

@ -1837,6 +1837,7 @@ dependencies = [
"metrics", "metrics",
"metrics-exporter-prometheus", "metrics-exporter-prometheus",
"mime", "mime",
"nanorand",
"opentelemetry", "opentelemetry",
"opentelemetry-otlp", "opentelemetry-otlp",
"opentelemetry_sdk", "opentelemetry_sdk",

View file

@ -16,6 +16,7 @@ strip = true
default = [] default = []
io-uring = ["dep:tokio-uring", "sled/io_uring", "actix-web/experimental-io-uring"] io-uring = ["dep:tokio-uring", "sled/io_uring", "actix-web/experimental-io-uring"]
poll-timer-warnings = [] poll-timer-warnings = []
random-errors = ["dep:nanorand"]
[dependencies] [dependencies]
actix-form-data = "0.7.0-beta.6" actix-form-data = "0.7.0-beta.6"
@ -40,6 +41,7 @@ md-5 = "0.10.5"
metrics = "0.22.0" metrics = "0.22.0"
metrics-exporter-prometheus = { version = "0.13.0", default-features = false, features = ["http-listener"] } metrics-exporter-prometheus = { version = "0.13.0", default-features = false, features = ["http-listener"] }
mime = "0.3.1" mime = "0.3.1"
nanorand = { version = "0.7", optional = true }
opentelemetry_sdk = { version = "0.22", features = ["rt-tokio"] } opentelemetry_sdk = { version = "0.22", features = ["rt-tokio"] }
opentelemetry = "0.22" opentelemetry = "0.22"
opentelemetry-otlp = "0.15" opentelemetry-otlp = "0.15"

View file

@ -59,12 +59,12 @@ impl BytesStream {
} }
pub(crate) fn into_io_stream(self) -> impl Stream<Item = std::io::Result<Bytes>> { pub(crate) fn into_io_stream(self) -> impl Stream<Item = std::io::Result<Bytes>> {
streem::from_fn(move |yielder| async move { crate::stream::error_injector(streem::from_fn(move |yielder| async move {
for bytes in self { for bytes in self {
crate::sync::cooperate().await; crate::sync::cooperate().await;
yielder.yield_ok(bytes).await; yielder.yield_ok(bytes).await;
} }
}) }))
} }
} }

View file

@ -167,6 +167,10 @@ pub(crate) enum UploadError {
#[error("Failed external validation")] #[error("Failed external validation")]
FailedExternalValidation, FailedExternalValidation,
#[cfg(feature = "random-errors")]
#[error("Randomly generated error for testing purposes")]
RandomError,
} }
impl UploadError { impl UploadError {
@ -205,6 +209,8 @@ impl UploadError {
Self::ProcessTimeout => ErrorCode::COMMAND_TIMEOUT, Self::ProcessTimeout => ErrorCode::COMMAND_TIMEOUT,
Self::FailedExternalValidation => ErrorCode::FAILED_EXTERNAL_VALIDATION, Self::FailedExternalValidation => ErrorCode::FAILED_EXTERNAL_VALIDATION,
Self::InvalidJob(_, _) => ErrorCode::INVALID_JOB, Self::InvalidJob(_, _) => ErrorCode::INVALID_JOB,
#[cfg(feature = "random-errors")]
Self::RandomError => ErrorCode::RANDOM_ERROR,
} }
} }

View file

@ -147,4 +147,8 @@ impl ErrorCode {
pub(crate) const INVALID_JOB: ErrorCode = ErrorCode { pub(crate) const INVALID_JOB: ErrorCode = ErrorCode {
code: "invalid-job", code: "invalid-job",
}; };
#[cfg(feature = "random-errors")]
pub(crate) const RANDOM_ERROR: ErrorCode = ErrorCode {
code: "random-error",
};
} }

View file

@ -167,11 +167,18 @@ where
#[cfg(not(feature = "poll-timer-warnings"))] #[cfg(not(feature = "poll-timer-warnings"))]
tracing::debug!("Future {} polled for {} ms", this.name, elapsed.as_millis()); tracing::debug!("Future {} polled for {} ms", this.name, elapsed.as_millis());
} else if elapsed > Duration::from_micros(200) { } else if elapsed > Duration::from_micros(200) {
#[cfg(feature = "poll-timer-warnings")]
tracing::debug!( tracing::debug!(
"Future {} polled for {} microseconds", "Future {} polled for {} microseconds",
this.name, this.name,
elapsed.as_micros(), elapsed.as_micros(),
); );
#[cfg(not(feature = "poll-timer-warnings"))]
tracing::trace!(
"Future {} polled for {} microseconds",
this.name,
elapsed.as_micros(),
);
} else if elapsed > Duration::from_micros(1) { } else if elapsed > Duration::from_micros(1) {
tracing::trace!( tracing::trace!(
"Future {} polled for {} microseconds", "Future {} polled for {} microseconds",

View file

@ -1914,6 +1914,11 @@ impl PictRsConfiguration {
/// } /// }
/// ``` /// ```
pub async fn run(self) -> color_eyre::Result<()> { pub async fn run(self) -> color_eyre::Result<()> {
#[cfg(feature = "random-errors")]
tracing::error!("pict-rs has been compiled with with the 'random-errors' feature enabled.");
#[cfg(feature = "random-errors")]
tracing::error!("This is not suitable for production environments");
let PictRsConfiguration { config, operation } = self; let PictRsConfiguration { config, operation } = self;
// describe all the metrics pict-rs produces // describe all the metrics pict-rs produces

View file

@ -23,6 +23,15 @@ where
Box::pin(async move { Box::pin(async move {
let job_text = format!("{job}"); let job_text = format!("{job}");
#[cfg(feature = "random-errors")]
{
use nanorand::Rng;
if nanorand::tls_rng().generate_range(0..25) < 1 {
return Err(crate::error::UploadError::RandomError).retry();
}
}
let job = serde_json::from_value(job) let job = serde_json::from_value(job)
.map_err(|e| UploadError::InvalidJob(e, job_text)) .map_err(|e| UploadError::InvalidJob(e, job_text))
.abort()?; .abort()?;

View file

@ -1548,14 +1548,14 @@ impl QueueRepo for PostgresRepo {
let mut conn = self.get_connection().await?; let mut conn = self.get_connection().await?;
if matches!(job_status, JobResult::Failure) { let count = if matches!(job_status, JobResult::Failure) {
diesel::update(job_queue) diesel::update(job_queue)
.filter( .filter(
id.eq(job_id.0) id.eq(job_id.0)
.and(queue.eq(queue_name)) .and(queue.eq(queue_name))
.and(worker.eq(worker_id)), .and(worker.eq(worker_id)),
) )
.set(retry.eq(retry - 1)) .set((retry.eq(retry - 1), worker.eq(Option::<Uuid>::None)))
.execute(&mut conn) .execute(&mut conn)
.with_metrics(crate::init_metrics::POSTGRES_QUEUE_RETRY) .with_metrics(crate::init_metrics::POSTGRES_QUEUE_RETRY)
.with_timeout(Duration::from_secs(5)) .with_timeout(Duration::from_secs(5))
@ -1564,18 +1564,13 @@ impl QueueRepo for PostgresRepo {
.map_err(PostgresError::Diesel)?; .map_err(PostgresError::Diesel)?;
diesel::delete(job_queue) diesel::delete(job_queue)
.filter( .filter(id.eq(job_id.0).and(retry.le(0)))
id.eq(job_id.0)
.and(queue.eq(queue_name))
.and(worker.eq(worker_id))
.and(retry.le(0)),
)
.execute(&mut conn) .execute(&mut conn)
.with_metrics(crate::init_metrics::POSTGRES_QUEUE_CLEANUP) .with_metrics(crate::init_metrics::POSTGRES_QUEUE_CLEANUP)
.with_timeout(Duration::from_secs(5)) .with_timeout(Duration::from_secs(5))
.await .await
.map_err(|_| PostgresError::DbTimeout)? .map_err(|_| PostgresError::DbTimeout)?
.map_err(PostgresError::Diesel)?; .map_err(PostgresError::Diesel)?
} else { } else {
diesel::delete(job_queue) diesel::delete(job_queue)
.filter( .filter(
@ -1588,7 +1583,20 @@ impl QueueRepo for PostgresRepo {
.with_timeout(Duration::from_secs(5)) .with_timeout(Duration::from_secs(5))
.await .await
.map_err(|_| PostgresError::DbTimeout)? .map_err(|_| PostgresError::DbTimeout)?
.map_err(PostgresError::Diesel)?; .map_err(PostgresError::Diesel)?
};
match job_status {
JobResult::Success => tracing::debug!("completed {job_id:?}"),
JobResult::Failure if count == 0 => {
tracing::info!("{job_id:?} failed, marked for retry")
}
JobResult::Failure => tracing::warn!("{job_id:?} failed permantently"),
JobResult::Aborted => tracing::warn!("{job_id:?} dead"),
}
if count > 0 {
tracing::debug!("Deleted {count} jobs");
} }
Ok(()) Ok(())

View file

@ -899,15 +899,25 @@ impl QueueRepo for SledRepo {
job_retries.remove(&key[..])?; job_retries.remove(&key[..])?;
} }
Ok(()) Ok(retry_count > 0 && retry)
}, },
) )
}) })
.await .await
.map_err(|_| RepoError::Canceled)?; .map_err(|_| RepoError::Canceled)?;
if let Err(TransactionError::Abort(e) | TransactionError::Storage(e)) = res { match res {
return Err(RepoError::from(SledError::from(e))); Err(TransactionError::Abort(e) | TransactionError::Storage(e)) => {
return Err(RepoError::from(SledError::from(e)));
}
Ok(retried) => match job_status {
JobResult::Success => tracing::debug!("completed {job_id:?}"),
JobResult::Failure if retried => {
tracing::info!("{job_id:?} failed, marked for retry")
}
JobResult::Failure => tracing::warn!("{job_id:?} failed permantently"),
JobResult::Aborted => tracing::warn!("{job_id:?} dead"),
},
} }
Ok(()) Ok(())

View file

@ -62,7 +62,10 @@ impl Store for FileStore {
{ {
let path = self.next_file(extension); let path = self.next_file(extension);
if let Err(e) = self.safe_save_stream(&path, stream).await { if let Err(e) = self
.safe_save_stream(&path, crate::stream::error_injector(stream))
.await
{
self.safe_remove_file(&path).await?; self.safe_remove_file(&path).await?;
return Err(e.into()); return Err(e.into());
} }
@ -95,7 +98,7 @@ impl Store for FileStore {
.instrument(file_span) .instrument(file_span)
.await?; .await?;
Ok(Box::pin(stream)) Ok(Box::pin(crate::stream::error_injector(stream)))
} }
#[tracing::instrument(skip(self))] #[tracing::instrument(skip(self))]

View file

@ -216,7 +216,11 @@ impl Store for ObjectStore {
S: Stream<Item = std::io::Result<Bytes>>, S: Stream<Item = std::io::Result<Bytes>>,
{ {
match self match self
.start_upload(stream, content_type.clone(), extension) .start_upload(
crate::stream::error_injector(stream),
content_type.clone(),
extension,
)
.await? .await?
{ {
UploadState::Single(first_chunk) => { UploadState::Single(first_chunk) => {
@ -306,9 +310,11 @@ impl Store for ObjectStore {
return Err(status_error(response, Some(identifier.clone())).await); return Err(status_error(response, Some(identifier.clone())).await);
} }
Ok(Box::pin(crate::stream::metrics( Ok(Box::pin(crate::stream::error_injector(
crate::init_metrics::OBJECT_STORAGE_GET_OBJECT_REQUEST_STREAM, crate::stream::metrics(
crate::stream::map_err(response.bytes_stream(), payload_to_io_error), crate::init_metrics::OBJECT_STORAGE_GET_OBJECT_REQUEST_STREAM,
crate::stream::map_err(response.bytes_stream(), payload_to_io_error),
),
))) )))
} }

View file

@ -5,6 +5,38 @@ use streem::IntoStreamer;
use crate::future::WithMetrics; use crate::future::WithMetrics;
/// Pass-through variant of the error injector.
///
/// Compiled only when the `random-errors` feature is disabled: the given
/// stream is returned as-is, so call sites can wrap their streams
/// unconditionally without any change in behavior.
#[cfg(not(feature = "random-errors"))]
pub(crate) fn error_injector(
    stream: impl Stream<Item = std::io::Result<Bytes>>,
) -> impl Stream<Item = std::io::Result<Bytes>> {
    // Nothing to inject in this configuration; hand the stream straight back.
    stream
}
/// Fault-injecting variant of the error injector.
///
/// Compiled only when the `random-errors` feature is enabled. Every item read
/// from `stream` is forwarded downstream first; after each forwarded item a
/// number is drawn from `0..1000`, and when it is `0` the resulting stream
/// ends with an `std::io::Error` wrapping `UploadError::RandomError`.
///
/// Errors produced by the wrapped stream itself are propagated unchanged and
/// also end the resulting stream.
#[cfg(feature = "random-errors")]
pub(crate) fn error_injector(
    stream: impl Stream<Item = std::io::Result<Bytes>>,
) -> impl Stream<Item = std::io::Result<Bytes>> {
    use nanorand::Rng;

    streem::try_from_fn(|yielder| async move {
        let pinned = std::pin::pin!(stream);
        let mut source = pinned.into_streamer();

        loop {
            // A finished upstream means a clean end for the wrapped stream too.
            let Some(chunk) = source.try_next().await? else {
                return Ok(());
            };

            yielder.yield_ok(chunk).await;

            // The RNG handle is a temporary of this statement only, so it is
            // never kept alive across an `.await` point.
            let should_fail = nanorand::tls_rng().generate_range(0..1000) < 1;
            if should_fail {
                let injected = std::io::Error::new(
                    std::io::ErrorKind::Other,
                    crate::error::UploadError::RandomError,
                );
                return Err(injected);
            }
        }
    })
}
pub(crate) fn take<S>(stream: S, amount: usize) -> impl Stream<Item = S::Item> pub(crate) fn take<S>(stream: S, amount: usize) -> impl Stream<Item = S::Item>
where where
S: Stream, S: Stream,