garage/src/membership.rs

523 lines
13 KiB
Rust

use std::collections::HashMap;
use std::hash::Hash as StdHash;
use std::hash::Hasher;
use std::io::Read;
use std::net::{IpAddr, SocketAddr};
use std::path::PathBuf;
use std::sync::Arc;
use std::time::Duration;
use futures::future::join_all;
use futures::select;
use futures_util::future::*;
use sha2::{Digest, Sha256};
use tokio::prelude::*;
use tokio::sync::watch;
use tokio::sync::Mutex;
use crate::background::BackgroundRunner;
use crate::data::*;
use crate::error::Error;
use crate::proto::*;
use crate::rpc_client::*;
use crate::server::Config;
const PING_INTERVAL: Duration = Duration::from_secs(10);
const PING_TIMEOUT: Duration = Duration::from_secs(2);
const MAX_FAILED_PINGS: usize = 3;
pub struct System {
pub config: Config,
pub id: UUID,
pub rpc_client: RpcClient,
pub status: watch::Receiver<Arc<Status>>,
pub ring: watch::Receiver<Arc<Ring>>,
update_lock: Mutex<(watch::Sender<Arc<Status>>, watch::Sender<Arc<Ring>>)>,
pub background: Arc<BackgroundRunner>,
}
#[derive(Debug, Clone)]
pub struct Status {
pub nodes: HashMap<UUID, NodeStatus>,
pub hash: Hash,
}
#[derive(Debug, Clone)]
pub struct NodeStatus {
pub addr: SocketAddr,
pub remaining_ping_attempts: usize,
}
#[derive(Clone)]
pub struct Ring {
pub config: NetworkConfig,
pub ring: Vec<RingEntry>,
pub n_datacenters: usize,
}
#[derive(Clone, Debug)]
pub struct RingEntry {
pub location: Hash,
pub node: UUID,
pub datacenter: u64,
}
impl Status {
fn handle_ping(&mut self, ip: IpAddr, info: &PingMessage) -> bool {
let addr = SocketAddr::new(ip, info.rpc_port);
let old_status = self.nodes.insert(
info.id.clone(),
NodeStatus {
addr: addr.clone(),
remaining_ping_attempts: MAX_FAILED_PINGS,
},
);
match old_status {
None => {
eprintln!("Newly pingable node: {}", hex::encode(&info.id));
true
}
Some(x) => x.addr != addr,
}
}
fn recalculate_hash(&mut self) {
let mut nodes = self.nodes.iter().collect::<Vec<_>>();
nodes.sort_unstable_by_key(|(id, _status)| *id);
let mut hasher = Sha256::new();
eprintln!("Current set of pingable nodes: --");
for (id, status) in nodes {
eprintln!("{} {}", hex::encode(&id), status.addr);
hasher.input(format!("{} {}\n", hex::encode(&id), status.addr));
}
eprintln!("END --");
self.hash
.as_slice_mut()
.copy_from_slice(&hasher.result()[..]);
}
}
impl Ring {
fn rebuild_ring(&mut self) {
let mut new_ring = vec![];
let mut datacenters = vec![];
for (id, config) in self.config.members.iter() {
let mut dc_hasher = std::collections::hash_map::DefaultHasher::new();
config.datacenter.hash(&mut dc_hasher);
let datacenter = dc_hasher.finish();
if !datacenters.contains(&datacenter) {
datacenters.push(datacenter);
}
for i in 0..config.n_tokens {
let location = hash(format!("{} {}", hex::encode(&id), i).as_bytes());
new_ring.push(RingEntry {
location: location.into(),
node: id.clone(),
datacenter,
})
}
}
new_ring.sort_unstable_by(|x, y| x.location.cmp(&y.location));
self.ring = new_ring;
self.n_datacenters = datacenters.len();
// eprintln!("RING: --");
// for e in self.ring.iter() {
// eprintln!("{:?}", e);
// }
// eprintln!("END --");
}
pub fn walk_ring(&self, from: &Hash, n: usize) -> Vec<UUID> {
if n >= self.config.members.len() {
return self.config.members.keys().cloned().collect::<Vec<_>>();
}
let start = match self.ring.binary_search_by(|x| x.location.cmp(from)) {
Ok(i) => i,
Err(i) => {
if i == 0 {
self.ring.len() - 1
} else {
i - 1
}
}
};
self.walk_ring_from_pos(start, n)
}
pub fn walk_ring_from_pos(&self, start: usize, n: usize) -> Vec<UUID> {
let mut ret = vec![];
let mut datacenters = vec![];
for delta in 0..self.ring.len() {
if ret.len() == n {
break;
}
let i = (start + delta) % self.ring.len();
if datacenters.len() == self.n_datacenters && !ret.contains(&self.ring[i].node) {
ret.push(self.ring[i].node.clone());
} else if !datacenters.contains(&self.ring[i].datacenter) {
ret.push(self.ring[i].node.clone());
datacenters.push(self.ring[i].datacenter);
}
}
ret
}
}
fn read_network_config(metadata_dir: &PathBuf) -> Result<NetworkConfig, Error> {
let mut path = metadata_dir.clone();
path.push("network_config");
let mut file = std::fs::OpenOptions::new()
.read(true)
.open(path.as_path())?;
let mut net_config_bytes = vec![];
file.read_to_end(&mut net_config_bytes)?;
let net_config = rmp_serde::decode::from_read_ref(&net_config_bytes[..])?;
Ok(net_config)
}
impl System {
pub fn new(config: Config, id: UUID, background: Arc<BackgroundRunner>) -> Self {
let net_config = match read_network_config(&config.metadata_dir) {
Ok(x) => x,
Err(e) => {
println!(
"No valid previous network configuration stored ({}), starting fresh.",
e
);
NetworkConfig {
members: HashMap::new(),
version: 0,
}
}
};
let mut status = Status {
nodes: HashMap::new(),
hash: Hash::default(),
};
status.recalculate_hash();
let (update_status, status) = watch::channel(Arc::new(status));
let mut ring = Ring {
config: net_config,
ring: Vec::new(),
n_datacenters: 0,
};
ring.rebuild_ring();
let (update_ring, ring) = watch::channel(Arc::new(ring));
let rpc_client = RpcClient::new(&config.rpc_tls).expect("Could not create RPC client");
System {
config,
id,
rpc_client,
status,
ring,
update_lock: Mutex::new((update_status, update_ring)),
background,
}
}
async fn save_network_config(self: Arc<Self>) -> Result<(), Error> {
let mut path = self.config.metadata_dir.clone();
path.push("network_config");
let ring = self.ring.borrow().clone();
let data = rmp_to_vec_all_named(&ring.config)?;
let mut f = tokio::fs::File::create(path.as_path()).await?;
f.write_all(&data[..]).await?;
Ok(())
}
pub fn make_ping(&self) -> Message {
let status = self.status.borrow().clone();
let ring = self.ring.borrow().clone();
Message::Ping(PingMessage {
id: self.id.clone(),
rpc_port: self.config.rpc_port,
status_hash: status.hash.clone(),
config_version: ring.config.version,
})
}
pub async fn broadcast(self: Arc<Self>, msg: Message, timeout: Duration) {
let status = self.status.borrow().clone();
let to = status
.nodes
.keys()
.filter(|x| **x != self.id)
.cloned()
.collect::<Vec<_>>();
rpc_call_many(self.clone(), &to[..], msg, timeout).await;
}
pub async fn bootstrap(self: Arc<Self>) {
let bootstrap_peers = self
.config
.bootstrap_peers
.iter()
.map(|ip| (ip.clone(), None))
.collect::<Vec<_>>();
self.clone().ping_nodes(bootstrap_peers).await;
self.clone()
.background
.spawn_worker(|stop_signal| self.ping_loop(stop_signal).map(Ok))
.await;
}
async fn ping_nodes(self: Arc<Self>, peers: Vec<(SocketAddr, Option<UUID>)>) {
let ping_msg = self.make_ping();
let ping_resps = join_all(peers.iter().map(|(addr, id_option)| {
let sys = self.clone();
let ping_msg_ref = &ping_msg;
async move {
(
id_option,
addr.clone(),
sys.rpc_client.call(&addr, ping_msg_ref, PING_TIMEOUT).await,
)
}
}))
.await;
let update_locked = self.update_lock.lock().await;
let mut status: Status = self.status.borrow().as_ref().clone();
let ring = self.ring.borrow().clone();
let mut has_changes = false;
let mut to_advertise = vec![];
for (id_option, addr, ping_resp) in ping_resps {
if let Ok(Message::Ping(info)) = ping_resp {
let is_new = status.handle_ping(addr.ip(), &info);
if is_new {
has_changes = true;
to_advertise.push(AdvertisedNode {
id: info.id.clone(),
addr: addr.clone(),
});
}
if is_new || status.hash != info.status_hash {
self.background
.spawn_cancellable(self.clone().pull_status(info.id.clone()).map(Ok));
}
if is_new || ring.config.version < info.config_version {
self.background
.spawn_cancellable(self.clone().pull_config(info.id.clone()).map(Ok));
}
} else if let Some(id) = id_option {
let remaining_attempts = status
.nodes
.get(id)
.map(|x| x.remaining_ping_attempts)
.unwrap_or(0);
if remaining_attempts == 0 {
eprintln!(
"Removing node {} after too many failed pings",
hex::encode(&id)
);
status.nodes.remove(&id);
has_changes = true;
} else {
if let Some(st) = status.nodes.get_mut(id) {
st.remaining_ping_attempts = remaining_attempts - 1;
}
}
}
}
if has_changes {
status.recalculate_hash();
}
if let Err(e) = update_locked.0.broadcast(Arc::new(status)) {
eprintln!("In ping_nodes: could not save status update ({})", e);
}
drop(update_locked);
if to_advertise.len() > 0 {
self.broadcast(Message::AdvertiseNodesUp(to_advertise), PING_TIMEOUT)
.await;
}
}
pub async fn handle_ping(
self: Arc<Self>,
from: &SocketAddr,
ping: &PingMessage,
) -> Result<Message, Error> {
let update_locked = self.update_lock.lock().await;
let mut status: Status = self.status.borrow().as_ref().clone();
let is_new = status.handle_ping(from.ip(), ping);
if is_new {
status.recalculate_hash();
}
let status_hash = status.hash.clone();
let config_version = self.ring.borrow().config.version;
update_locked.0.broadcast(Arc::new(status))?;
drop(update_locked);
if is_new || status_hash != ping.status_hash {
self.background
.spawn_cancellable(self.clone().pull_status(ping.id.clone()).map(Ok));
}
if is_new || config_version < ping.config_version {
self.background
.spawn_cancellable(self.clone().pull_config(ping.id.clone()).map(Ok));
}
Ok(self.make_ping())
}
pub fn handle_pull_status(&self) -> Result<Message, Error> {
let status = self.status.borrow().clone();
let mut mem = vec![];
for (node, status) in status.nodes.iter() {
mem.push(AdvertisedNode {
id: node.clone(),
addr: status.addr.clone(),
});
}
Ok(Message::AdvertiseNodesUp(mem))
}
pub fn handle_pull_config(&self) -> Result<Message, Error> {
let ring = self.ring.borrow().clone();
Ok(Message::AdvertiseConfig(ring.config.clone()))
}
pub async fn handle_advertise_nodes_up(
self: Arc<Self>,
adv: &[AdvertisedNode],
) -> Result<Message, Error> {
let mut to_ping = vec![];
let update_lock = self.update_lock.lock().await;
let mut status: Status = self.status.borrow().as_ref().clone();
let mut has_changed = false;
for node in adv.iter() {
if node.id == self.id {
// learn our own ip address
let self_addr = SocketAddr::new(node.addr.ip(), self.config.rpc_port);
let old_self = status.nodes.insert(
node.id.clone(),
NodeStatus {
addr: self_addr,
remaining_ping_attempts: MAX_FAILED_PINGS,
},
);
has_changed = match old_self {
None => true,
Some(x) => x.addr != self_addr,
};
} else if !status.nodes.contains_key(&node.id) {
to_ping.push((node.addr.clone(), Some(node.id.clone())));
}
}
if has_changed {
status.recalculate_hash();
}
update_lock.0.broadcast(Arc::new(status))?;
drop(update_lock);
if to_ping.len() > 0 {
self.background
.spawn_cancellable(self.clone().ping_nodes(to_ping).map(Ok));
}
Ok(Message::Ok)
}
pub async fn handle_advertise_config(
self: Arc<Self>,
adv: &NetworkConfig,
) -> Result<Message, Error> {
let mut ring: Ring = self.ring.borrow().as_ref().clone();
let update_lock = self.update_lock.lock().await;
if adv.version > ring.config.version {
ring.config = adv.clone();
ring.rebuild_ring();
update_lock.1.broadcast(Arc::new(ring))?;
drop(update_lock);
self.background.spawn_cancellable(
self.clone()
.broadcast(Message::AdvertiseConfig(adv.clone()), PING_TIMEOUT)
.map(Ok),
);
self.background.spawn(self.clone().save_network_config());
}
Ok(Message::Ok)
}
pub async fn ping_loop(self: Arc<Self>, mut stop_signal: watch::Receiver<bool>) {
loop {
let restart_at = tokio::time::delay_for(PING_INTERVAL);
let status = self.status.borrow().clone();
let ping_addrs = status
.nodes
.iter()
.filter(|(id, _)| **id != self.id)
.map(|(id, status)| (status.addr.clone(), Some(id.clone())))
.collect::<Vec<_>>();
self.clone().ping_nodes(ping_addrs).await;
select! {
_ = restart_at.fuse() => (),
must_exit = stop_signal.recv().fuse() => {
match must_exit {
None | Some(true) => return,
_ => (),
}
}
}
}
}
pub fn pull_status(
self: Arc<Self>,
peer: UUID,
) -> impl futures::future::Future<Output = ()> + Send + 'static {
async move {
let resp = rpc_call(self.clone(), &peer, &Message::PullStatus, PING_TIMEOUT).await;
if let Ok(Message::AdvertiseNodesUp(nodes)) = resp {
let _: Result<_, _> = self.handle_advertise_nodes_up(&nodes).await;
}
}
}
pub async fn pull_config(self: Arc<Self>, peer: UUID) {
let resp = rpc_call(self.clone(), &peer, &Message::PullConfig, PING_TIMEOUT).await;
if let Ok(Message::AdvertiseConfig(config)) = resp {
let _: Result<_, _> = self.handle_advertise_config(&config).await;
}
}
}