layout: allow sync update tracker to progress with only quorums

This commit is contained in:
Alex Auvolat 2023-12-07 14:27:53 +01:00
parent aa59059a91
commit 9cecea64d4
No known key found for this signature in database
GPG key ID: 0E496D15096376BE
7 changed files with 152 additions and 21 deletions

View file

@ -365,9 +365,9 @@ pub async fn cmd_layout_history(
table.push(format!( table.push(format!(
"{:?}\t#{}\t#{}\t#{}", "{:?}\t#{}\t#{}\t#{}",
node, node,
layout.update_trackers.ack_map.get(node), layout.update_trackers.ack_map.get(node, min_stored),
layout.update_trackers.sync_map.get(node), layout.update_trackers.sync_map.get(node, min_stored),
layout.update_trackers.sync_ack_map.get(node), layout.update_trackers.sync_ack_map.get(node, min_stored),
)); ));
} }
table[1..].sort(); table[1..].sort();

View file

@ -7,6 +7,7 @@ use serde::{Deserialize, Serialize};
use garage_util::data::*; use garage_util::data::*;
use super::schema::*; use super::schema::*;
use crate::replication_mode::ReplicationMode;
use crate::rpc_helper::RpcHelper; use crate::rpc_helper::RpcHelper;
#[derive(Debug, Clone, Serialize, Deserialize, Default, PartialEq, Eq)] #[derive(Debug, Clone, Serialize, Deserialize, Default, PartialEq, Eq)]
@ -22,6 +23,7 @@ pub struct LayoutDigest {
} }
pub struct LayoutHelper { pub struct LayoutHelper {
replication_mode: ReplicationMode,
layout: Option<LayoutHistory>, layout: Option<LayoutHistory>,
// cached values // cached values
@ -48,7 +50,23 @@ impl Deref for LayoutHelper {
} }
impl LayoutHelper { impl LayoutHelper {
pub fn new(mut layout: LayoutHistory, mut ack_lock: HashMap<u64, AtomicUsize>) -> Self { pub fn new(
replication_mode: ReplicationMode,
mut layout: LayoutHistory,
mut ack_lock: HashMap<u64, AtomicUsize>,
) -> Self {
// In the new() function of the helper, we do a bunch of cleanup
// and calculations on the layout history to make sure things are
// correct and we have rapid access to important values such as
// the layout versions to use when reading to ensure consistency.
if !replication_mode.is_read_after_write_consistent() {
// Fast path for when no consistency is required.
// In this case we only need to keep the last version of the layout,
// we don't care about coordinating stuff in the cluster.
layout.keep_current_version_only();
}
layout.cleanup_old_versions(); layout.cleanup_old_versions();
let all_nodes = layout.get_all_nodes(); let all_nodes = layout.get_all_nodes();
@ -68,7 +86,7 @@ impl LayoutHelper {
.ack_map .ack_map
.min_among(&all_nodes, min_version); .min_among(&all_nodes, min_version);
// sync_map_min is the minimum value of sync_map among all storage nodes // sync_map_min is the minimum value of sync_map among storage nodes
// in the cluster (non-gateway nodes only, current and previous layouts). // in the cluster (non-gateway nodes only, current and previous layouts).
// It is the highest layout version for which we know that all relevant // It is the highest layout version for which we know that all relevant
// storage nodes have fullfilled a sync, and therefore it is safe to // storage nodes have fullfilled a sync, and therefore it is safe to
@ -76,11 +94,10 @@ impl LayoutHelper {
// Gateway nodes are excluded here because they hold no relevant data // Gateway nodes are excluded here because they hold no relevant data
// (they store the bucket and access key tables, but we don't have // (they store the bucket and access key tables, but we don't have
// consistency on those). // consistency on those).
// TODO: this value could take quorums into account instead. // This value is calculated using quorums to allow progress even
let sync_map_min = layout // if not all nodes have successfully completed a sync.
.update_trackers let sync_map_min =
.sync_map layout.calculate_sync_map_min_with_quorum(replication_mode, &all_nongateway_nodes);
.min_among(&all_nongateway_nodes, min_version);
let trackers_hash = layout.calculate_trackers_hash(); let trackers_hash = layout.calculate_trackers_hash();
let staging_hash = layout.calculate_staging_hash(); let staging_hash = layout.calculate_staging_hash();
@ -91,6 +108,7 @@ impl LayoutHelper {
.or_insert(AtomicUsize::new(0)); .or_insert(AtomicUsize::new(0));
LayoutHelper { LayoutHelper {
replication_mode,
layout: Some(layout), layout: Some(layout),
ack_map_min, ack_map_min,
sync_map_min, sync_map_min,
@ -115,6 +133,7 @@ impl LayoutHelper {
let changed = f(&mut self.layout.as_mut().unwrap()); let changed = f(&mut self.layout.as_mut().unwrap());
if changed { if changed {
*self = Self::new( *self = Self::new(
self.replication_mode,
self.layout.take().unwrap(), self.layout.take().unwrap(),
std::mem::take(&mut self.ack_lock), std::mem::take(&mut self.ack_lock),
); );

View file

@ -6,6 +6,7 @@ use garage_util::encode::nonversioned_encode;
use garage_util::error::*; use garage_util::error::*;
use super::*; use super::*;
use crate::replication_mode::ReplicationMode;
impl LayoutHistory { impl LayoutHistory {
pub fn new(replication_factor: usize) -> Self { pub fn new(replication_factor: usize) -> Self {
@ -64,6 +65,13 @@ impl LayoutHistory {
// ---- housekeeping (all invoked by LayoutHelper) ---- // ---- housekeeping (all invoked by LayoutHelper) ----
pub(crate) fn keep_current_version_only(&mut self) {
while self.versions.len() > 1 {
let removed = self.versions.remove(0);
self.old_versions.push(removed);
}
}
pub(crate) fn cleanup_old_versions(&mut self) { pub(crate) fn cleanup_old_versions(&mut self) {
// If there are invalid versions before valid versions, remove them // If there are invalid versions before valid versions, remove them
if self.versions.len() > 1 && self.current().check().is_ok() { if self.versions.len() > 1 && self.current().check().is_ok() {
@ -114,6 +122,99 @@ impl LayoutHistory {
} }
} }
pub(crate) fn calculate_sync_map_min_with_quorum(
&self,
replication_mode: ReplicationMode,
all_nongateway_nodes: &[Uuid],
) -> u64 {
// This function calculates the minimum layout version from which
// it is safe to read if we want to maintain read-after-write consistency.
// In the general case the computation can be a bit expensive so
// we try to optimize it in several ways.
// If there is only one layout version, we know that's the one
// we need to read from.
if self.versions.len() == 1 {
return self.current().version;
}
let quorum = replication_mode.write_quorum();
let min_version = self.min_stored();
let global_min = self
.update_trackers
.sync_map
.min_among(&all_nongateway_nodes, min_version);
// If the write quorums are equal to the total number of nodes,
// i.e. no writes can succeed while they are not written to all nodes,
// then we must in all case wait for all nodes to complete a sync.
// This is represented by reading from the layout with version
// number global_min, the smallest layout version for which all nodes
// have completed a sync.
if quorum == self.current().replication_factor {
return global_min;
}
// In the general case, we need to look at all write sets for all partitions,
// and find a safe layout version to read for that partition. We then
// take the minimum value among all partition as the safe layout version
// to read in all cases (the layout version to which all reads are directed).
let mut current_min = self.current().version;
let mut sets_done = HashSet::<Vec<Uuid>>::new();
for (_, p_hash) in self.current().partitions() {
for v in self.versions.iter() {
if v.version == self.current().version {
// We don't care about whether nodes in the latest layout version
// have completed a sync or not, as the sync is push-only
// and by definition nodes in the latest layout version do not
// hold data that must be pushed to nodes in the latest layout
// version, since that's the same version (any data that's
// already in the latest version is assumed to have been written
// by an operation that ensured a quorum of writes within
// that version).
continue;
}
// Determine set of nodes for partition p in layout version v.
// Sort the node set to avoid duplicate computations.
let mut set = v
.nodes_of(&p_hash, v.replication_factor)
.collect::<Vec<Uuid>>();
set.sort();
// If this set was already processed, skip it.
if sets_done.contains(&set) {
continue;
}
// Find the value of the sync update trackers that is the
// highest possible minimum within a quorum of nodes.
let mut sync_values = set
.iter()
.map(|x| self.update_trackers.sync_map.get(x, min_version))
.collect::<Vec<_>>();
sync_values.sort();
let set_min = sync_values[sync_values.len() - quorum];
if set_min < current_min {
current_min = set_min;
}
// defavorable case, we know we are at the smallest possible version,
// so we can stop early
assert!(current_min >= global_min);
if current_min == global_min {
return current_min;
}
// Add set to already processed sets
sets_done.insert(set);
}
}
current_min
}
pub(crate) fn calculate_trackers_hash(&self) -> Hash { pub(crate) fn calculate_trackers_hash(&self) -> Hash {
blake2sum(&nonversioned_encode(&self.update_trackers).unwrap()[..]) blake2sum(&nonversioned_encode(&self.update_trackers).unwrap()[..])
} }

View file

@ -14,12 +14,13 @@ use garage_util::error::*;
use garage_util::persister::Persister; use garage_util::persister::Persister;
use super::*; use super::*;
use crate::replication_mode::ReplicationMode;
use crate::rpc_helper::*; use crate::rpc_helper::*;
use crate::system::*; use crate::system::*;
pub struct LayoutManager { pub struct LayoutManager {
node_id: Uuid, node_id: Uuid,
replication_factor: usize, replication_mode: ReplicationMode,
persist_cluster_layout: Persister<LayoutHistory>, persist_cluster_layout: Persister<LayoutHistory>,
layout: Arc<RwLock<LayoutHelper>>, layout: Arc<RwLock<LayoutHelper>>,
@ -37,14 +38,16 @@ impl LayoutManager {
node_id: NodeID, node_id: NodeID,
system_endpoint: Arc<Endpoint<SystemRpc, System>>, system_endpoint: Arc<Endpoint<SystemRpc, System>>,
fullmesh: Arc<FullMeshPeeringStrategy>, fullmesh: Arc<FullMeshPeeringStrategy>,
replication_factor: usize, replication_mode: ReplicationMode,
) -> Result<Arc<Self>, Error> { ) -> Result<Arc<Self>, Error> {
let replication_factor = replication_mode.replication_factor();
let persist_cluster_layout: Persister<LayoutHistory> = let persist_cluster_layout: Persister<LayoutHistory> =
Persister::new(&config.metadata_dir, "cluster_layout"); Persister::new(&config.metadata_dir, "cluster_layout");
let cluster_layout = match persist_cluster_layout.load() { let cluster_layout = match persist_cluster_layout.load() {
Ok(x) => { Ok(x) => {
if x.current().replication_factor != replication_factor { if x.current().replication_factor != replication_mode.replication_factor() {
return Err(Error::Message(format!( return Err(Error::Message(format!(
"Prevous cluster layout has replication factor {}, which is different than the one specified in the config file ({}). The previous cluster layout can be purged, if you know what you are doing, simply by deleting the `cluster_layout` file in your metadata directory.", "Prevous cluster layout has replication factor {}, which is different than the one specified in the config file ({}). The previous cluster layout can be purged, if you know what you are doing, simply by deleting the `cluster_layout` file in your metadata directory.",
x.current().replication_factor, x.current().replication_factor,
@ -62,7 +65,8 @@ impl LayoutManager {
} }
}; };
let mut cluster_layout = LayoutHelper::new(cluster_layout, Default::default()); let mut cluster_layout =
LayoutHelper::new(replication_mode, cluster_layout, Default::default());
cluster_layout.update_trackers(node_id.into()); cluster_layout.update_trackers(node_id.into());
let layout = Arc::new(RwLock::new(cluster_layout)); let layout = Arc::new(RwLock::new(cluster_layout));
@ -77,7 +81,7 @@ impl LayoutManager {
Ok(Arc::new(Self { Ok(Arc::new(Self {
node_id: node_id.into(), node_id: node_id.into(),
replication_factor, replication_mode,
persist_cluster_layout, persist_cluster_layout,
layout, layout,
change_notify, change_notify,
@ -291,11 +295,11 @@ impl LayoutManager {
adv.update_trackers adv.update_trackers
); );
if adv.current().replication_factor != self.replication_factor { if adv.current().replication_factor != self.replication_mode.replication_factor() {
let msg = format!( let msg = format!(
"Received a cluster layout from another node with replication factor {}, which is different from what we have in our configuration ({}). Discarding the cluster layout we received.", "Received a cluster layout from another node with replication factor {}, which is different from what we have in our configuration ({}). Discarding the cluster layout we received.",
adv.current().replication_factor, adv.current().replication_factor,
self.replication_factor self.replication_mode.replication_factor()
); );
error!("{}", msg); error!("{}", msg);
return Err(Error::Message(msg)); return Err(Error::Message(msg));

View file

@ -411,13 +411,13 @@ impl UpdateTracker {
pub(crate) fn min_among(&self, storage_nodes: &[Uuid], min_version: u64) -> u64 { pub(crate) fn min_among(&self, storage_nodes: &[Uuid], min_version: u64) -> u64 {
storage_nodes storage_nodes
.iter() .iter()
.map(|x| self.0.get(x).copied().unwrap_or(min_version)) .map(|x| self.get(x, min_version))
.min() .min()
.unwrap_or(min_version) .unwrap_or(min_version)
} }
pub fn get(&self, node: &Uuid) -> u64 { pub fn get(&self, node: &Uuid, min_version: u64) -> u64 {
self.0.get(node).copied().unwrap_or(0) self.0.get(node).copied().unwrap_or(min_version)
} }
} }

View file

@ -54,4 +54,11 @@ impl ReplicationMode {
Self::ThreeWayDangerous => 1, Self::ThreeWayDangerous => 1,
} }
} }
pub fn is_read_after_write_consistent(&self) -> bool {
match self {
Self::None | Self::TwoWay | Self::ThreeWay => true,
_ => false,
}
}
} }

View file

@ -280,7 +280,7 @@ impl System {
netapp.id, netapp.id,
system_endpoint.clone(), system_endpoint.clone(),
fullmesh.clone(), fullmesh.clone(),
replication_factor, replication_mode,
)?; )?;
// ---- set up metrics and status exchange ---- // ---- set up metrics and status exchange ----