Less strict timeouts

This commit is contained in:
Alex Auvolat 2022-09-01 16:30:44 +02:00
parent f3bf34b6a1
commit df094bd807
No known key found for this signature in database
GPG key ID: 0E496D15096376BE
7 changed files with 16 additions and 10 deletions

2
Cargo.lock generated
View file

@ -2176,7 +2176,7 @@ dependencies = [
[[package]] [[package]]
name = "netapp" name = "netapp"
version = "0.5.0" version = "0.5.0"
source = "git+https://git.deuxfleurs.fr/lx/netapp?branch=stream-body#22d96929d5416750e1f5889ee6cc16b382293104" source = "git+https://git.deuxfleurs.fr/lx/netapp?branch=stream-body#f6ad1d0fab340e77fbfcb3488a98c342d334838e"
dependencies = [ dependencies = [
"arc-swap", "arc-swap",
"async-trait", "async-trait",

View file

@ -48,10 +48,14 @@ use crate::repair::*;
pub const INLINE_THRESHOLD: usize = 3072; pub const INLINE_THRESHOLD: usize = 3072;
// Timeout for RPCs that read and write blocks to remote nodes // Timeout for RPCs that read and write blocks to remote nodes
const BLOCK_RW_TIMEOUT: Duration = Duration::from_secs(30); const BLOCK_RW_TIMEOUT: Duration = Duration::from_secs(60);
// Timeout for RPCs that ask other nodes whether they need a copy // Timeout for RPCs that ask other nodes whether they need a copy
// of a given block before we delete it locally // of a given block before we delete it locally
const NEED_BLOCK_QUERY_TIMEOUT: Duration = Duration::from_secs(5); // The timeout here is relatively low because we don't want to block
// the entire resync loop when some nodes are not responding.
// Nothing will be deleted if the nodes don't answer the queries;
// we will simply retry later.
const NEED_BLOCK_QUERY_TIMEOUT: Duration = Duration::from_secs(15);
// The delay between the time where a resync operation fails // The delay between the time where a resync operation fails
// and the time when it is retried, with exponential backoff // and the time when it is retried, with exponential backoff

View file

@ -31,7 +31,7 @@ use garage_util::metrics::RecordDuration;
use crate::metrics::RpcMetrics; use crate::metrics::RpcMetrics;
use crate::ring::Ring; use crate::ring::Ring;
const DEFAULT_TIMEOUT: Duration = Duration::from_secs(10); const DEFAULT_TIMEOUT: Duration = Duration::from_secs(30);
// Don't allow more than 100 concurrent outgoing RPCs. // Don't allow more than 100 concurrent outgoing RPCs.
const MAX_CONCURRENT_REQUESTS: usize = 100; const MAX_CONCURRENT_REQUESTS: usize = 100;

View file

@ -38,7 +38,7 @@ use crate::rpc_helper::*;
const DISCOVERY_INTERVAL: Duration = Duration::from_secs(60); const DISCOVERY_INTERVAL: Duration = Duration::from_secs(60);
const STATUS_EXCHANGE_INTERVAL: Duration = Duration::from_secs(10); const STATUS_EXCHANGE_INTERVAL: Duration = Duration::from_secs(10);
const PING_TIMEOUT: Duration = Duration::from_secs(2); const SYSTEM_RPC_TIMEOUT: Duration = Duration::from_secs(15);
/// Version tag used for version check upon Netapp connection /// Version tag used for version check upon Netapp connection
pub const GARAGE_VERSION_TAG: u64 = 0x6761726167650007; // garage 0x0007 pub const GARAGE_VERSION_TAG: u64 = 0x6761726167650007; // garage 0x0007
@ -561,7 +561,7 @@ impl System {
.broadcast( .broadcast(
&self.system_endpoint, &self.system_endpoint,
SystemRpc::AdvertiseStatus(local_status), SystemRpc::AdvertiseStatus(local_status),
RequestStrategy::with_priority(PRIO_HIGH).with_timeout(PING_TIMEOUT), RequestStrategy::with_priority(PRIO_HIGH).with_timeout(SYSTEM_RPC_TIMEOUT),
) )
.await; .await;
@ -685,7 +685,7 @@ impl System {
&self.system_endpoint, &self.system_endpoint,
peer, peer,
SystemRpc::PullClusterLayout, SystemRpc::PullClusterLayout,
RequestStrategy::with_priority(PRIO_HIGH).with_timeout(PING_TIMEOUT), RequestStrategy::with_priority(PRIO_HIGH).with_timeout(SYSTEM_RPC_TIMEOUT),
) )
.await; .await;
if let Ok(SystemRpc::AdvertiseClusterLayout(layout)) = resp { if let Ok(SystemRpc::AdvertiseClusterLayout(layout)) = resp {

View file

@ -25,7 +25,8 @@ use crate::replication::*;
use crate::schema::*; use crate::schema::*;
const TABLE_GC_BATCH_SIZE: usize = 1024; const TABLE_GC_BATCH_SIZE: usize = 1024;
const TABLE_GC_RPC_TIMEOUT: Duration = Duration::from_secs(30); // Same timeout as NEED_BLOCK_QUERY_TIMEOUT in block manager
const TABLE_GC_RPC_TIMEOUT: Duration = Duration::from_secs(15);
// GC delay for table entries: 1 day (24 hours) // GC delay for table entries: 1 day (24 hours)
// (the delay before the entry is added in the GC todo list // (the delay before the entry is added in the GC todo list

View file

@ -24,7 +24,8 @@ use crate::merkle::*;
use crate::replication::*; use crate::replication::*;
use crate::*; use crate::*;
const TABLE_SYNC_RPC_TIMEOUT: Duration = Duration::from_secs(30); // Sync RPCs can contain a lot of data, so use a 1-minute timeout
const TABLE_SYNC_RPC_TIMEOUT: Duration = Duration::from_secs(60);
// Do anti-entropy every 10 minutes // Do anti-entropy every 10 minutes
const ANTI_ENTROPY_INTERVAL: Duration = Duration::from_secs(10 * 60); const ANTI_ENTROPY_INTERVAL: Duration = Duration::from_secs(10 * 60);

View file

@ -31,7 +31,7 @@ use crate::schema::*;
use crate::sync::*; use crate::sync::*;
use crate::util::*; use crate::util::*;
pub const TABLE_RPC_TIMEOUT: Duration = Duration::from_secs(10); pub const TABLE_RPC_TIMEOUT: Duration = Duration::from_secs(30);
pub struct Table<F: TableSchema + 'static, R: TableReplication + 'static> { pub struct Table<F: TableSchema + 'static, R: TableReplication + 'static> {
pub system: Arc<System>, pub system: Arc<System>,