Not fully tested: new multi-dc MagLev

This commit is contained in:
Alex Auvolat 2021-03-05 16:22:29 +01:00
parent 3882d5ba36
commit d7e005251d
4 changed files with 152 additions and 94 deletions

View file

@ -262,21 +262,21 @@ if __name__ == "__main__":
print("------") print("------")
print("method 2 (custom ring)") print("method 2 (custom ring)")
nodes = [('digitale', 'atuin', 4), nodes = [('digitale', 'atuin', 1),
('drosera', 'atuin', 4), ('drosera', 'atuin', 1),
('datura', 'atuin', 4), ('datura', 'atuin', 1),
('io', 'jupiter', 8)] ('io', 'jupiter', 2)]
nodes2 = [('digitale', 'atuin', 8), nodes2 = [('digitale', 'atuin', 2),
('drosera', 'atuin', 8), ('drosera', 'atuin', 2),
('datura', 'atuin', 8), ('datura', 'atuin', 2),
('io', 'jupiter', 16), ('io', 'jupiter', 4),
('isou', 'jupiter', 8), ('isou', 'jupiter', 2),
('mini', 'grog', 4), ('mini', 'grog', 1),
('mixi', 'grog', 4), ('mixi', 'grog', 1),
('moxi', 'grog', 4), ('moxi', 'grog', 1),
('modi', 'grog', 4), ('modi', 'grog', 1),
('geant', 'grisou', 16), ('geant', 'grisou', 4),
('gipsie', 'grisou', 16), ('gipsie', 'grisou', 4),
] ]
evaluate_method(method2, nodes2) evaluate_method(method2, nodes2)

View file

@ -218,12 +218,7 @@ impl System {
.unwrap_or("<invalid utf-8>".to_string()), .unwrap_or("<invalid utf-8>".to_string()),
}; };
let mut ring = Ring { let ring = Ring::new(net_config);
config: net_config,
ring: Vec::new(),
n_datacenters: 0,
};
ring.rebuild_ring();
let (update_ring, ring) = watch::channel(Arc::new(ring)); let (update_ring, ring) = watch::channel(Arc::new(ring));
let rpc_path = MEMBERSHIP_RPC_PATH.to_string(); let rpc_path = MEMBERSHIP_RPC_PATH.to_string();
@ -531,10 +526,7 @@ impl System {
let ring: Arc<Ring> = self.ring.borrow().clone(); let ring: Arc<Ring> = self.ring.borrow().clone();
if adv.version > ring.config.version { if adv.version > ring.config.version {
let mut ring = ring.as_ref().clone(); let ring = Ring::new(adv.clone());
ring.config = adv.clone();
ring.rebuild_ring();
update_lock.1.broadcast(Arc::new(ring))?; update_lock.1.broadcast(Arc::new(ring))?;
drop(update_lock); drop(update_lock);

View file

@ -1,9 +1,22 @@
use std::collections::HashMap; use std::collections::{HashMap, HashSet};
use std::convert::TryInto;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use garage_util::data::*; use garage_util::data::*;
// TODO: make this constant parametrizable in the config file
// For deployments with many nodes it might make sense to bump
// it up to 10.
// Maximum value : 16
pub const PARTITION_BITS: usize = 8;
const PARTITION_MASK_U16: u16 = ((1 << PARTITION_BITS) - 1) << (16 - PARTITION_BITS);
// TODO: make this constant parametrizable in the config file
// (most deployments use a replication factor of 3, so...)
pub const MAX_REPLICATION: usize = 3;
#[derive(Clone, Debug, Serialize, Deserialize)] #[derive(Clone, Debug, Serialize, Deserialize)]
pub struct NetworkConfig { pub struct NetworkConfig {
pub members: HashMap<UUID, NetworkConfigEntry>, pub members: HashMap<UUID, NetworkConfigEntry>,
@ -30,96 +43,150 @@ pub struct NetworkConfigEntry {
pub struct Ring { pub struct Ring {
pub config: NetworkConfig, pub config: NetworkConfig,
pub ring: Vec<RingEntry>, pub ring: Vec<RingEntry>,
pub n_datacenters: usize,
} }
#[derive(Clone, Debug)] #[derive(Clone, Debug)]
pub struct RingEntry { pub struct RingEntry {
pub location: Hash, pub location: Hash,
pub node: UUID, pub nodes: [UUID; MAX_REPLICATION],
datacenter: usize,
} }
impl Ring { impl Ring {
pub(crate) fn rebuild_ring(&mut self) { pub(crate) fn new(config: NetworkConfig) -> Self {
let mut new_ring = vec![]; // Create a vector of partition indices (0 to 2**PARTITION_BITS-1)
let mut datacenters = vec![]; let partitions_idx = (0usize..(1usize << PARTITION_BITS)).collect::<Vec<_>>();
for (id, config) in self.config.members.iter() { let datacenters = config
let datacenter = &config.datacenter; .members
.iter()
.map(|(_id, info)| info.datacenter.as_str())
.collect::<HashSet<&str>>();
let n_datacenters = datacenters.len();
if !datacenters.contains(datacenter) { // Prepare ring
datacenters.push(datacenter.to_string()); let mut partitions: Vec<Vec<(&UUID, &NetworkConfigEntry)>> = partitions_idx
.iter()
.map(|_i| Vec::new())
.collect::<Vec<_>>();
// Create MagLev priority queues for each node
let mut queues = config
.members
.iter()
.map(|(node_id, node_info)| {
let mut parts = partitions_idx
.iter()
.map(|i| {
let part_data =
[&u16::to_be_bytes(*i as u16)[..], node_id.as_slice()].concat();
(*i, fasthash(&part_data[..]))
})
.collect::<Vec<_>>();
parts.sort_by_key(|(_i, h)| *h);
let parts_i = parts.iter().map(|(i, _h)| *i).collect::<Vec<_>>();
(node_id, node_info, parts_i, 0)
})
.collect::<Vec<_>>();
let max_toktok = config
.members
.iter()
.map(|(_, node_info)| node_info.n_tokens)
.fold(0, std::cmp::max);
// Fill up ring
for rep in 0..MAX_REPLICATION {
queues.sort_by_key(|(ni, _np, _q, _p)| {
let queue_data = [&u16::to_be_bytes(rep as u16)[..], ni.as_slice()].concat();
fasthash(&queue_data[..])
});
for (_, _, _, pos) in queues.iter_mut() {
*pos = 0;
} }
let datacenter_idx = datacenters
.iter()
.enumerate()
.find(|(_, dc)| *dc == datacenter)
.unwrap()
.0;
for i in 0..config.n_tokens { let mut remaining = partitions_idx.len();
let location = sha256sum(format!("{} {}", hex::encode(&id), i).as_bytes()); while remaining > 0 {
let remaining0 = remaining;
new_ring.push(RingEntry { for toktok in 0..max_toktok {
location: location.into(), for (node_id, node_info, q, pos) in queues.iter_mut() {
node: *id, if toktok >= node_info.n_tokens {
datacenter: datacenter_idx, continue;
}) }
for pos2 in *pos..q.len() {
let qv = q[pos2];
if partitions[qv].len() != rep {
continue;
}
let p_dcs = partitions[qv]
.iter()
.map(|(_id, info)| info.datacenter.as_str())
.collect::<HashSet<&str>>();
if !partitions[qv]
.iter()
.any(|(_id, i)| *i.datacenter == node_info.datacenter)
|| (p_dcs.len() == n_datacenters
&& !partitions[qv].iter().any(|(id, _i)| id == node_id))
{
partitions[qv].push((node_id, node_info));
remaining -= 1;
*pos = pos2 + 1;
break;
}
}
}
}
if remaining == remaining0 {
// No progress made, exit
warn!("Could not build ring, not enough nodes configured.");
return Self {
config,
ring: vec![],
};
}
} }
} }
new_ring.sort_unstable_by(|x, y| x.location.cmp(&y.location)); let ring = partitions
self.ring = new_ring; .iter()
self.n_datacenters = datacenters.len(); .enumerate()
.map(|(i, nodes)| {
let top = (i as u16) << (16 - PARTITION_BITS);
let mut hash = [0u8; 32];
hash[0..2].copy_from_slice(&u16::to_be_bytes(top)[..]);
let nodes = nodes.iter().map(|(id, _info)| **id).collect::<Vec<UUID>>();
RingEntry {
location: hash.into(),
nodes: nodes.try_into().unwrap(),
}
})
.collect::<Vec<_>>();
// eprintln!("RING: --"); eprintln!("RING: --");
// for e in self.ring.iter() { for e in ring.iter() {
// eprintln!("{:?}", e); eprintln!("{:?}", e);
// } }
// eprintln!("END --"); eprintln!("END --");
Self { config, ring }
} }
pub fn walk_ring(&self, from: &Hash, n: usize) -> Vec<UUID> { pub fn walk_ring(&self, from: &Hash, n: usize) -> Vec<UUID> {
if n >= self.config.members.len() { if self.ring.len() != 1 << PARTITION_BITS {
return self.config.members.keys().cloned().collect::<Vec<_>>(); warn!("Ring not yet ready, read/writes will be lost");
return vec![];
} }
let start = match self.ring.binary_search_by(|x| x.location.cmp(from)) { let top = u16::from_be_bytes(from.as_slice()[0..2].try_into().unwrap());
Ok(i) => i,
Err(i) => {
if i == 0 {
self.ring.len() - 1
} else {
i - 1
}
}
};
self.walk_ring_from_pos(start, n) let partition_idx = (top >> (16 - PARTITION_BITS)) as usize;
} let partition = &self.ring[partition_idx];
fn walk_ring_from_pos(&self, start: usize, n: usize) -> Vec<UUID> { let partition_top =
if n >= self.config.members.len() { u16::from_be_bytes(partition.location.as_slice()[0..2].try_into().unwrap());
return self.config.members.keys().cloned().collect::<Vec<_>>(); assert!(partition_top & PARTITION_MASK_U16 == top & PARTITION_MASK_U16);
}
let mut ret = vec![]; assert!(n <= partition.nodes.len());
let mut datacenters = vec![]; partition.nodes[..n].iter().cloned().collect::<Vec<_>>()
let mut delta = 0;
while ret.len() < n {
let i = (start + delta) % self.ring.len();
delta += 1;
if !datacenters.contains(&self.ring[i].datacenter) {
ret.push(self.ring[i].node);
datacenters.push(self.ring[i].datacenter);
} else if datacenters.len() == self.n_datacenters && !ret.contains(&self.ring[i].node) {
ret.push(self.ring[i].node);
}
}
ret
} }
} }

View file

@ -44,7 +44,6 @@ impl TableReplication for TableShardedReplication {
fn split_points(&self, ring: &Ring) -> Vec<Hash> { fn split_points(&self, ring: &Ring) -> Vec<Hash> {
let mut ret = vec![]; let mut ret = vec![];
ret.push([0u8; 32].into());
for entry in ring.ring.iter() { for entry in ring.ring.iter() {
ret.push(entry.location); ret.push(entry.location);
} }