Simplify replication logic

This commit is contained in:
Alex Auvolat 2021-03-16 11:14:27 +01:00
parent 6a8439fd13
commit 1d9961e411
9 changed files with 60 additions and 58 deletions

View file

@ -319,10 +319,8 @@ impl BlockManager {
if exists && !needed { if exists && !needed {
trace!("Offloading block {:?}", hash); trace!("Offloading block {:?}", hash);
let ring = self.system.ring.borrow().clone(); let mut who = self.replication.write_nodes(&hash);
if who.len() < self.replication.write_quorum() {
let mut who = self.replication.replication_nodes(&hash, &ring);
if who.len() < self.replication.write_quorum(&self.system) {
return Err(Error::Message(format!("Not trying to offload block because we don't have a quorum of nodes to write to"))); return Err(Error::Message(format!("Not trying to offload block because we don't have a quorum of nodes to write to")));
} }
who.retain(|id| *id != self.system.id); who.retain(|id| *id != self.system.id);
@ -367,7 +365,7 @@ impl BlockManager {
) )
.await?; .await?;
} }
trace!( info!(
"Deleting block {:?}, offload finished ({} / {})", "Deleting block {:?}, offload finished ({} / {})",
hash, hash,
need_nodes.len(), need_nodes.len(),
@ -391,7 +389,7 @@ impl BlockManager {
} }
pub async fn rpc_get_block(&self, hash: &Hash) -> Result<Vec<u8>, Error> { pub async fn rpc_get_block(&self, hash: &Hash) -> Result<Vec<u8>, Error> {
let who = self.replication.read_nodes(&hash, &self.system); let who = self.replication.read_nodes(&hash);
let resps = self let resps = self
.rpc_client .rpc_client
.try_call_many( .try_call_many(
@ -415,12 +413,12 @@ impl BlockManager {
} }
pub async fn rpc_put_block(&self, hash: Hash, data: Vec<u8>) -> Result<(), Error> { pub async fn rpc_put_block(&self, hash: Hash, data: Vec<u8>) -> Result<(), Error> {
let who = self.replication.write_nodes(&hash, &self.system); let who = self.replication.write_nodes(&hash);
self.rpc_client self.rpc_client
.try_call_many( .try_call_many(
&who[..], &who[..],
Message::PutBlock(PutBlockMessage { hash, data }), Message::PutBlock(PutBlockMessage { hash, data }),
RequestStrategy::with_quorum(self.replication.write_quorum(&self.system)) RequestStrategy::with_quorum(self.replication.write_quorum())
.with_timeout(BLOCK_RW_TIMEOUT), .with_timeout(BLOCK_RW_TIMEOUT),
) )
.await?; .await?;

View file

@ -54,18 +54,23 @@ impl Garage {
); );
let data_rep_param = TableShardedReplication { let data_rep_param = TableShardedReplication {
system: system.clone(),
replication_factor: config.data_replication_factor, replication_factor: config.data_replication_factor,
write_quorum: (config.data_replication_factor + 1) / 2, write_quorum: (config.data_replication_factor + 1) / 2,
read_quorum: 1, read_quorum: 1,
}; };
let meta_rep_param = TableShardedReplication { let meta_rep_param = TableShardedReplication {
system: system.clone(),
replication_factor: config.meta_replication_factor, replication_factor: config.meta_replication_factor,
write_quorum: (config.meta_replication_factor + 1) / 2, write_quorum: (config.meta_replication_factor + 1) / 2,
read_quorum: (config.meta_replication_factor + 1) / 2, read_quorum: (config.meta_replication_factor + 1) / 2,
}; };
let control_rep_param = TableFullReplication::new(config.control_write_max_faults); let control_rep_param = TableFullReplication {
system: system.clone(),
max_faults: config.control_write_max_faults,
};
info!("Initialize block manager..."); info!("Initialize block manager...");
let block_manager = BlockManager::new( let block_manager = BlockManager::new(

View file

@ -170,6 +170,11 @@ impl Ring {
Self { config, ring } Self { config, ring }
} }
/// Returns the index of the ring partition that `from` belongs to.
///
/// The first two bytes of the hash are read as a big-endian `u16`, and only
/// its top `PARTITION_BITS` bits are kept as the partition index.
pub fn partition_of(&self, from: &Hash) -> u16 {
    // The slice is exactly two bytes long, so the array conversion cannot fail.
    let prefix: [u8; 2] = from.as_slice()[0..2].try_into().unwrap();
    u16::from_be_bytes(prefix) >> (16 - PARTITION_BITS)
}
pub fn walk_ring(&self, from: &Hash, n: usize) -> Vec<UUID> { pub fn walk_ring(&self, from: &Hash, n: usize) -> Vec<UUID> {
if self.ring.len() != 1 << PARTITION_BITS { if self.ring.len() != 1 << PARTITION_BITS {
warn!("Ring not yet ready, read/writes will be lost!"); warn!("Ring not yet ready, read/writes will be lost!");
@ -177,8 +182,9 @@ impl Ring {
} }
let top = u16::from_be_bytes(from.as_slice()[0..2].try_into().unwrap()); let top = u16::from_be_bytes(from.as_slice()[0..2].try_into().unwrap());
let partition_idx = (top >> (16 - PARTITION_BITS)) as usize; let partition_idx = (top >> (16 - PARTITION_BITS)) as usize;
assert_eq!(partition_idx, self.partition_of(from) as usize);
let partition = &self.ring[partition_idx]; let partition = &self.ring[partition_idx];
let partition_top = let partition_top =

View file

@ -130,7 +130,7 @@ where
let mut partitions = HashMap::new(); let mut partitions = HashMap::new();
for (k, vhash, v) in entries { for (k, vhash, v) in entries {
let pkh = Hash::try_from(&k[..32]).unwrap(); let pkh = Hash::try_from(&k[..32]).unwrap();
let mut nodes = self.aux.replication.write_nodes(&pkh, &self.aux.system); let mut nodes = self.aux.replication.write_nodes(&pkh);
nodes.retain(|x| *x != self.aux.system.id); nodes.retain(|x| *x != self.aux.system.id);
nodes.sort(); nodes.sort();

View file

@ -8,21 +8,10 @@ use crate::replication::*;
#[derive(Clone)] #[derive(Clone)]
pub struct TableFullReplication { pub struct TableFullReplication {
pub system: Arc<System>,
pub max_faults: usize, pub max_faults: usize,
} }
#[derive(Clone)]
struct Neighbors {
ring: Arc<Ring>,
neighbors: Vec<UUID>,
}
impl TableFullReplication {
pub fn new(max_faults: usize) -> Self {
TableFullReplication { max_faults }
}
}
impl TableReplication for TableFullReplication { impl TableReplication for TableFullReplication {
// Full replication schema: all nodes store everything // Full replication schema: all nodes store everything
// Writes are disseminated in an epidemic manner in the network // Writes are disseminated in an epidemic manner in the network
@ -30,18 +19,23 @@ impl TableReplication for TableFullReplication {
// Advantage: do all reads locally, extremely fast // Advantage: do all reads locally, extremely fast
// Inconvenient: only suitable to reasonably small tables // Inconvenient: only suitable to reasonably small tables
fn read_nodes(&self, _hash: &Hash, system: &System) -> Vec<UUID> { fn partition_of(&self, _hash: &Hash) -> u16 {
vec![system.id] 0u16
}
fn read_nodes(&self, _hash: &Hash) -> Vec<UUID> {
vec![self.system.id]
} }
fn read_quorum(&self) -> usize { fn read_quorum(&self) -> usize {
1 1
} }
fn write_nodes(&self, hash: &Hash, system: &System) -> Vec<UUID> { fn write_nodes(&self, _hash: &Hash) -> Vec<UUID> {
self.replication_nodes(hash, system.ring.borrow().as_ref()) let ring = self.system.ring.borrow();
ring.config.members.keys().cloned().collect::<Vec<_>>()
} }
fn write_quorum(&self, system: &System) -> usize { fn write_quorum(&self) -> usize {
let nmembers = system.ring.borrow().config.members.len(); let nmembers = self.system.ring.borrow().config.members.len();
if nmembers > self.max_faults { if nmembers > self.max_faults {
nmembers - self.max_faults nmembers - self.max_faults
} else { } else {
@ -52,9 +46,6 @@ impl TableReplication for TableFullReplication {
self.max_faults self.max_faults
} }
fn replication_nodes(&self, _hash: &Hash, ring: &Ring) -> Vec<UUID> {
ring.config.members.keys().cloned().collect::<Vec<_>>()
}
fn split_points(&self, _ring: &Ring) -> Vec<Hash> { fn split_points(&self, _ring: &Ring) -> Vec<Hash> {
let mut ret = vec![]; let mut ret = vec![];
ret.push([0u8; 32].into()); ret.push([0u8; 32].into());

View file

@ -1,4 +1,3 @@
use garage_rpc::membership::System;
use garage_rpc::ring::Ring; use garage_rpc::ring::Ring;
use garage_util::data::*; use garage_util::data::*;
@ -7,16 +6,18 @@ pub trait TableReplication: Send + Sync {
// See examples in table_sharded.rs and table_fullcopy.rs // See examples in table_sharded.rs and table_fullcopy.rs
// To understand various replication methods // To understand various replication methods
// Partition number of data item (for Merkle tree)
fn partition_of(&self, hash: &Hash) -> u16;
// Which nodes to send reads from // Which nodes to send reads from
fn read_nodes(&self, hash: &Hash, system: &System) -> Vec<UUID>; fn read_nodes(&self, hash: &Hash) -> Vec<UUID>;
fn read_quorum(&self) -> usize; fn read_quorum(&self) -> usize;
// Which nodes to send writes to // Which nodes to send writes to
fn write_nodes(&self, hash: &Hash, system: &System) -> Vec<UUID>; fn write_nodes(&self, hash: &Hash) -> Vec<UUID>;
fn write_quorum(&self, system: &System) -> usize; fn write_quorum(&self) -> usize;
fn max_write_errors(&self) -> usize; fn max_write_errors(&self) -> usize;
// Which are the nodes that do actually replicate the data // Get partition boundaries
fn replication_nodes(&self, hash: &Hash, ring: &Ring) -> Vec<UUID>;
fn split_points(&self, ring: &Ring) -> Vec<Hash>; fn split_points(&self, ring: &Ring) -> Vec<Hash>;
} }

View file

@ -1,3 +1,5 @@
use std::sync::Arc;
use garage_rpc::membership::System; use garage_rpc::membership::System;
use garage_rpc::ring::Ring; use garage_rpc::ring::Ring;
use garage_util::data::*; use garage_util::data::*;
@ -6,6 +8,7 @@ use crate::replication::*;
#[derive(Clone)] #[derive(Clone)]
pub struct TableShardedReplication { pub struct TableShardedReplication {
pub system: Arc<System>,
pub replication_factor: usize, pub replication_factor: usize,
pub read_quorum: usize, pub read_quorum: usize,
pub write_quorum: usize, pub write_quorum: usize,
@ -19,28 +22,29 @@ impl TableReplication for TableShardedReplication {
// - reads are done on all of the nodes that replicate the data // - reads are done on all of the nodes that replicate the data
// - writes as well // - writes as well
fn read_nodes(&self, hash: &Hash, system: &System) -> Vec<UUID> { fn partition_of(&self, hash: &Hash) -> u16 {
let ring = system.ring.borrow().clone(); self.system.ring.borrow().partition_of(hash)
}
fn read_nodes(&self, hash: &Hash) -> Vec<UUID> {
let ring = self.system.ring.borrow().clone();
ring.walk_ring(&hash, self.replication_factor) ring.walk_ring(&hash, self.replication_factor)
} }
fn read_quorum(&self) -> usize { fn read_quorum(&self) -> usize {
self.read_quorum self.read_quorum
} }
fn write_nodes(&self, hash: &Hash, system: &System) -> Vec<UUID> { fn write_nodes(&self, hash: &Hash) -> Vec<UUID> {
let ring = system.ring.borrow().clone(); let ring = self.system.ring.borrow();
ring.walk_ring(&hash, self.replication_factor) ring.walk_ring(&hash, self.replication_factor)
} }
fn write_quorum(&self, _system: &System) -> usize { fn write_quorum(&self) -> usize {
self.write_quorum self.write_quorum
} }
fn max_write_errors(&self) -> usize { fn max_write_errors(&self) -> usize {
self.replication_factor - self.write_quorum self.replication_factor - self.write_quorum
} }
fn replication_nodes(&self, hash: &Hash, ring: &Ring) -> Vec<UUID> {
ring.walk_ring(&hash, self.replication_factor)
}
fn split_points(&self, ring: &Ring) -> Vec<Hash> { fn split_points(&self, ring: &Ring) -> Vec<Hash> {
let mut ret = vec![]; let mut ret = vec![];

View file

@ -218,10 +218,7 @@ where
let nodes = self let nodes = self
.aux .aux
.replication .replication
.write_nodes( .write_nodes(&hash_of_merkle_partition(partition.range.begin))
&hash_of_merkle_partition(partition.range.begin),
&self.aux.system,
)
.into_iter() .into_iter()
.filter(|node| *node != my_id) .filter(|node| *node != my_id)
.collect::<Vec<_>>(); .collect::<Vec<_>>();
@ -293,7 +290,7 @@ where
let nodes = self let nodes = self
.aux .aux
.replication .replication
.write_nodes(&begin, &self.aux.system) .write_nodes(&begin)
.into_iter() .into_iter()
.collect::<Vec<_>>(); .collect::<Vec<_>>();
if nodes.contains(&self.aux.system.id) { if nodes.contains(&self.aux.system.id) {
@ -303,7 +300,7 @@ where
); );
break; break;
} }
if nodes.len() < self.aux.replication.write_quorum(&self.aux.system) { if nodes.len() < self.aux.replication.write_quorum() {
return Err(Error::Message(format!( return Err(Error::Message(format!(
"Not offloading as we don't have a quorum of nodes to write to." "Not offloading as we don't have a quorum of nodes to write to."
))); )));
@ -616,7 +613,7 @@ impl SyncTodo {
let begin_hash = hash_of_merkle_partition(begin); let begin_hash = hash_of_merkle_partition(begin);
let end_hash = hash_of_merkle_partition_opt(end); let end_hash = hash_of_merkle_partition_opt(end);
let nodes = aux.replication.replication_nodes(&begin_hash, &ring); let nodes = aux.replication.write_nodes(&begin_hash);
let retain = nodes.contains(&my_id); let retain = nodes.contains(&my_id);
if !retain { if !retain {

View file

@ -91,7 +91,7 @@ where
pub async fn insert(&self, e: &F::E) -> Result<(), Error> { pub async fn insert(&self, e: &F::E) -> Result<(), Error> {
let hash = e.partition_key().hash(); let hash = e.partition_key().hash();
let who = self.aux.replication.write_nodes(&hash, &self.aux.system); let who = self.aux.replication.write_nodes(&hash);
//eprintln!("insert who: {:?}", who); //eprintln!("insert who: {:?}", who);
let e_enc = Arc::new(ByteBuf::from(rmp_to_vec_all_named(e)?)); let e_enc = Arc::new(ByteBuf::from(rmp_to_vec_all_named(e)?));
@ -101,7 +101,7 @@ where
.try_call_many( .try_call_many(
&who[..], &who[..],
rpc, rpc,
RequestStrategy::with_quorum(self.aux.replication.write_quorum(&self.aux.system)) RequestStrategy::with_quorum(self.aux.replication.write_quorum())
.with_timeout(TABLE_RPC_TIMEOUT), .with_timeout(TABLE_RPC_TIMEOUT),
) )
.await?; .await?;
@ -113,7 +113,7 @@ where
for entry in entries.iter() { for entry in entries.iter() {
let hash = entry.partition_key().hash(); let hash = entry.partition_key().hash();
let who = self.aux.replication.write_nodes(&hash, &self.aux.system); let who = self.aux.replication.write_nodes(&hash);
let e_enc = Arc::new(ByteBuf::from(rmp_to_vec_all_named(entry)?)); let e_enc = Arc::new(ByteBuf::from(rmp_to_vec_all_named(entry)?));
for node in who { for node in who {
if !call_list.contains_key(&node) { if !call_list.contains_key(&node) {
@ -150,7 +150,7 @@ where
sort_key: &F::S, sort_key: &F::S,
) -> Result<Option<F::E>, Error> { ) -> Result<Option<F::E>, Error> {
let hash = partition_key.hash(); let hash = partition_key.hash();
let who = self.aux.replication.read_nodes(&hash, &self.aux.system); let who = self.aux.replication.read_nodes(&hash);
//eprintln!("get who: {:?}", who); //eprintln!("get who: {:?}", who);
let rpc = TableRPC::<F>::ReadEntry(partition_key.clone(), sort_key.clone()); let rpc = TableRPC::<F>::ReadEntry(partition_key.clone(), sort_key.clone());
@ -207,7 +207,7 @@ where
limit: usize, limit: usize,
) -> Result<Vec<F::E>, Error> { ) -> Result<Vec<F::E>, Error> {
let hash = partition_key.hash(); let hash = partition_key.hash();
let who = self.aux.replication.read_nodes(&hash, &self.aux.system); let who = self.aux.replication.read_nodes(&hash);
let rpc = TableRPC::<F>::ReadRange(partition_key.clone(), begin_sort_key, filter, limit); let rpc = TableRPC::<F>::ReadRange(partition_key.clone(), begin_sort_key, filter, limit);