Less strict timeouts

2024-11-25 09:31:00 +00:00 · 2022-09-01 16:30:44 +02:00 · 2022-09-01 16:30:44 +02:00 · df094bd807
commit df094bd807
parent f3bf34b6a1
7 changed files with 16 additions and 10 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2176,7 +2176,7 @@ dependencies = [
 [[package]]
 name = "netapp"
 version = "0.5.0"
-source = "git+https://git.deuxfleurs.fr/lx/netapp?branch=stream-body#22d96929d5416750e1f5889ee6cc16b382293104"
+source = "git+https://git.deuxfleurs.fr/lx/netapp?branch=stream-body#f6ad1d0fab340e77fbfcb3488a98c342d334838e"
 dependencies = [
 "arc-swap",
 "async-trait",
--- a/src/block/manager.rs
+++ b/src/block/manager.rs
@@ -48,10 +48,14 @@ use crate::repair::*;
 pub const INLINE_THRESHOLD: usize = 3072;
 // Timeout for RPCs that read and write blocks to remote nodes
-const BLOCK_RW_TIMEOUT: Duration = Duration::from_secs(30);
+const BLOCK_RW_TIMEOUT: Duration = Duration::from_secs(60);
 // Timeout for RPCs that ask other nodes whether they need a copy
 // of a given block before we delete it locally
-const NEED_BLOCK_QUERY_TIMEOUT: Duration = Duration::from_secs(5);
+// The timeout here is relatively low because we don't want to block
+// the entire resync loop when some nodes are not responding.
+// Nothing will be deleted if the nodes don't answer the queries,
+// we will just retry later.
+const NEED_BLOCK_QUERY_TIMEOUT: Duration = Duration::from_secs(15);
 // The delay between the time where a resync operation fails
 // and the time when it is retried, with exponential backoff
--- a/src/rpc/rpc_helper.rs
+++ b/src/rpc/rpc_helper.rs
@@ -31,7 +31,7 @@ use garage_util::metrics::RecordDuration;
 use crate::metrics::RpcMetrics;
 use crate::ring::Ring;
-const DEFAULT_TIMEOUT: Duration = Duration::from_secs(10);
+const DEFAULT_TIMEOUT: Duration = Duration::from_secs(30);
 // Don't allow more than 100 concurrent outgoing RPCs.
 const MAX_CONCURRENT_REQUESTS: usize = 100;
--- a/src/rpc/system.rs
+++ b/src/rpc/system.rs
@@ -38,7 +38,7 @@ use crate::rpc_helper::*;
 const DISCOVERY_INTERVAL: Duration = Duration::from_secs(60);
 const STATUS_EXCHANGE_INTERVAL: Duration = Duration::from_secs(10);
-const PING_TIMEOUT: Duration = Duration::from_secs(2);
+const SYSTEM_RPC_TIMEOUT: Duration = Duration::from_secs(15);
 /// Version tag used for version check upon Netapp connection
 pub const GARAGE_VERSION_TAG: u64 = 0x6761726167650007; // garage 0x0007
@@ -561,7 +561,7 @@ impl System {
 				.broadcast(
 					&self.system_endpoint,
 					SystemRpc::AdvertiseStatus(local_status),
-					RequestStrategy::with_priority(PRIO_HIGH).with_timeout(PING_TIMEOUT),
+					RequestStrategy::with_priority(PRIO_HIGH).with_timeout(SYSTEM_RPC_TIMEOUT),
 				)
 				.await;
@@ -685,7 +685,7 @@ impl System {
 				&self.system_endpoint,
 				peer,
 				SystemRpc::PullClusterLayout,
-				RequestStrategy::with_priority(PRIO_HIGH).with_timeout(PING_TIMEOUT),
+				RequestStrategy::with_priority(PRIO_HIGH).with_timeout(SYSTEM_RPC_TIMEOUT),
 			)
 			.await;
 		if let Ok(SystemRpc::AdvertiseClusterLayout(layout)) = resp {
--- a/src/table/gc.rs
+++ b/src/table/gc.rs
@@ -25,7 +25,8 @@ use crate::replication::*;
 use crate::schema::*;
 const TABLE_GC_BATCH_SIZE: usize = 1024;
-const TABLE_GC_RPC_TIMEOUT: Duration = Duration::from_secs(30);
+// Same timeout as NEED_BLOCK_QUERY_TIMEOUT in block manager
+const TABLE_GC_RPC_TIMEOUT: Duration = Duration::from_secs(15);
 // GC delay for table entries: 1 day (24 hours)
 // (the delay before the entry is added in the GC todo list
--- a/src/table/sync.rs
+++ b/src/table/sync.rs
@@ -24,7 +24,8 @@ use crate::merkle::*;
 use crate::replication::*;
 use crate::*;
-const TABLE_SYNC_RPC_TIMEOUT: Duration = Duration::from_secs(30);
+// Sync RPC can contain a lot of data, so have a 1min timeout
+const TABLE_SYNC_RPC_TIMEOUT: Duration = Duration::from_secs(60);
 // Do anti-entropy every 10 minutes
 const ANTI_ENTROPY_INTERVAL: Duration = Duration::from_secs(10 * 60);
--- a/src/table/table.rs
+++ b/src/table/table.rs
@@ -31,7 +31,7 @@ use crate::schema::*;
 use crate::sync::*;
 use crate::util::*;
-pub const TABLE_RPC_TIMEOUT: Duration = Duration::from_secs(10);
+pub const TABLE_RPC_TIMEOUT: Duration = Duration::from_secs(30);
 pub struct Table<F: TableSchema + 'static, R: TableReplication + 'static> {
 	pub system: Arc<System>,