[block-ref-repair] Block refcount recalculation and repair

- We always recalculate the reference count of a block before deleting
  it locally, to make sure that it is indeed zero.

- If we had to fetch a remote block but we were not able to get it,
  check that refcount is indeed > 0.

- Repair procedure that checks everything
This commit is contained in:
Alex Auvolat 2024-03-19 11:04:20 +01:00
parent 0038ca8a78
commit dc0b78cdb8
No known key found for this signature in database
GPG key ID: 0E496D15096376BE
10 changed files with 285 additions and 8 deletions

View file

@ -11,3 +11,4 @@ mod metrics;
mod rc; mod rc;
pub use block::zstd_encode; pub use block::zstd_encode;
pub use rc::CalculateRefcount;

View file

@ -88,7 +88,7 @@ pub struct BlockManager {
mutation_lock: Vec<Mutex<BlockManagerLocked>>, mutation_lock: Vec<Mutex<BlockManagerLocked>>,
pub(crate) rc: BlockRc, pub rc: BlockRc,
pub resync: BlockResyncManager, pub resync: BlockResyncManager,
pub(crate) system: Arc<System>, pub(crate) system: Arc<System>,
@ -229,6 +229,12 @@ impl BlockManager {
} }
} }
/// Initialization: set how block references are recalculated
/// for repair operations
pub fn set_recalc_rc(&self, recalc: Vec<CalculateRefcount>) {
self.rc.recalc_rc.store(Some(Arc::new(recalc)));
}
/// Ask nodes that might have a (possibly compressed) block for it /// Ask nodes that might have a (possibly compressed) block for it
/// Return it as a stream with a header /// Return it as a stream with a header
async fn rpc_get_raw_block_streaming( async fn rpc_get_raw_block_streaming(
@ -316,9 +322,9 @@ impl BlockManager {
}; };
} }
let msg = format!("Get block {:?}: no node returned a valid block", hash); let err = Error::MissingBlock(*hash);
debug!("{}", msg); debug!("{}", err);
Err(Error::Message(msg)) Err(err)
} }
// ---- Public interface ---- // ---- Public interface ----

View file

@ -1,5 +1,7 @@
use std::convert::TryInto; use std::convert::TryInto;
use arc_swap::ArcSwapOption;
use garage_db as db; use garage_db as db;
use garage_util::data::*; use garage_util::data::*;
@ -8,13 +10,20 @@ use garage_util::time::*;
use crate::manager::BLOCK_GC_DELAY; use crate::manager::BLOCK_GC_DELAY;
pub type CalculateRefcount =
Box<dyn Fn(&db::Transaction, &Hash) -> db::TxResult<usize, Error> + Send + Sync>;
pub struct BlockRc { pub struct BlockRc {
pub(crate) rc: db::Tree, pub rc: db::Tree,
pub(crate) recalc_rc: ArcSwapOption<Vec<CalculateRefcount>>,
} }
impl BlockRc { impl BlockRc {
pub(crate) fn new(rc: db::Tree) -> Self { pub(crate) fn new(rc: db::Tree) -> Self {
Self { rc } Self {
rc,
recalc_rc: ArcSwapOption::new(None),
}
} }
/// Increment the reference counter associated to a hash. /// Increment the reference counter associated to a hash.
@ -68,6 +77,58 @@ impl BlockRc {
})?; })?;
Ok(()) Ok(())
} }
/// Recalculate the reference counter of a block
/// to fix potential inconsistencies
pub fn recalculate_rc(&self, hash: &Hash) -> Result<(usize, bool), Error> {
if let Some(recalc_fns) = self.recalc_rc.load().as_ref() {
trace!("Repair block RC for {:?}", hash);
let res = self
.rc
.db()
.transaction(|tx| {
let mut cnt = 0;
for f in recalc_fns.iter() {
cnt += f(&tx, hash)?;
}
let old_rc = RcEntry::parse_opt(tx.get(&self.rc, hash)?);
trace!(
"Block RC for {:?}: stored={}, calculated={}",
hash,
old_rc.as_u64(),
cnt
);
if cnt as u64 != old_rc.as_u64() {
warn!(
"Fixing inconsistent block RC for {:?}: was {}, should be {}",
hash,
old_rc.as_u64(),
cnt
);
let new_rc = if cnt > 0 {
RcEntry::Present { count: cnt as u64 }
} else {
RcEntry::Deletable {
at_time: now_msec() + BLOCK_GC_DELAY.as_millis() as u64,
}
};
tx.insert(&self.rc, hash, new_rc.serialize().unwrap())?;
Ok((cnt, true))
} else {
Ok((cnt, false))
}
})
.map_err(Error::from);
if let Err(e) = &res {
error!("Failed to fix RC for block {:?}: {}", hash, e);
}
res
} else {
Err(Error::Message(
"Block RC recalculation is not available at this point".into(),
))
}
}
} }
/// Describes the state of the reference counter for a block /// Describes the state of the reference counter for a block

View file

@ -367,6 +367,13 @@ impl BlockResyncManager {
} }
if exists && rc.is_deletable() { if exists && rc.is_deletable() {
if manager.rc.recalculate_rc(hash)?.0 > 0 {
return Err(Error::Message(format!(
"Refcount for block {:?} was inconsistent, retrying later",
hash
)));
}
info!("Resync block {:?}: offloading and deleting", hash); info!("Resync block {:?}: offloading and deleting", hash);
let existing_path = existing_path.unwrap(); let existing_path = existing_path.unwrap();
@ -453,7 +460,15 @@ impl BlockResyncManager {
hash hash
); );
let block_data = manager.rpc_get_raw_block(hash, None).await?; let block_data = manager.rpc_get_raw_block(hash, None).await;
if matches!(block_data, Err(Error::MissingBlock(_))) {
warn!(
"Could not fetch needed block {:?}, no node returned valid data. Checking that refcount is correct.",
hash
);
manager.rc.recalculate_rc(hash)?;
}
let block_data = block_data?;
manager.metrics.resync_recv_counter.add(1); manager.metrics.resync_recv_counter.add(1);

View file

@ -473,8 +473,11 @@ pub enum RepairWhat {
#[structopt(name = "mpu", version = garage_version())] #[structopt(name = "mpu", version = garage_version())]
MultipartUploads, MultipartUploads,
/// Repropagate version deletions to the block ref table /// Repropagate version deletions to the block ref table
#[structopt(name = "block_refs", version = garage_version())] #[structopt(name = "block-refs", version = garage_version())]
BlockRefs, BlockRefs,
/// Recalculate block reference counters
#[structopt(name = "block-rc", version = garage_version())]
BlockRc,
/// Verify integrity of all blocks on disc /// Verify integrity of all blocks on disc
#[structopt(name = "scrub", version = garage_version())] #[structopt(name = "scrub", version = garage_version())]
Scrub { Scrub {

View file

@ -4,6 +4,7 @@ use std::time::Duration;
use async_trait::async_trait; use async_trait::async_trait;
use tokio::sync::watch; use tokio::sync::watch;
use garage_block::manager::BlockManager;
use garage_block::repair::ScrubWorkerCommand; use garage_block::repair::ScrubWorkerCommand;
use garage_model::garage::Garage; use garage_model::garage::Garage;
@ -16,11 +17,14 @@ use garage_table::replication::*;
use garage_table::*; use garage_table::*;
use garage_util::background::*; use garage_util::background::*;
use garage_util::data::*;
use garage_util::error::Error; use garage_util::error::Error;
use garage_util::migrate::Migrate; use garage_util::migrate::Migrate;
use crate::*; use crate::*;
const RC_REPAIR_ITER_COUNT: usize = 64;
pub async fn launch_online_repair( pub async fn launch_online_repair(
garage: &Arc<Garage>, garage: &Arc<Garage>,
bg: &BackgroundRunner, bg: &BackgroundRunner,
@ -47,6 +51,13 @@ pub async fn launch_online_repair(
info!("Repairing the block refs table"); info!("Repairing the block refs table");
bg.spawn_worker(TableRepairWorker::new(garage.clone(), RepairBlockRefs)); bg.spawn_worker(TableRepairWorker::new(garage.clone(), RepairBlockRefs));
} }
RepairWhat::BlockRc => {
info!("Repairing the block reference counters");
bg.spawn_worker(BlockRcRepair::new(
garage.block_manager.clone(),
garage.block_ref_table.clone(),
));
}
RepairWhat::Blocks => { RepairWhat::Blocks => {
info!("Repairing the stored blocks"); info!("Repairing the stored blocks");
bg.spawn_worker(garage_block::repair::RepairWorker::new( bg.spawn_worker(garage_block::repair::RepairWorker::new(
@ -282,3 +293,98 @@ impl TableRepair for RepairMpu {
Ok(false) Ok(false)
} }
} }
// ===== block reference counter repair =====
pub struct BlockRcRepair {
block_manager: Arc<BlockManager>,
block_ref_table: Arc<Table<BlockRefTable, TableShardedReplication>>,
cursor: Hash,
counter: u64,
repairs: u64,
}
impl BlockRcRepair {
fn new(
block_manager: Arc<BlockManager>,
block_ref_table: Arc<Table<BlockRefTable, TableShardedReplication>>,
) -> Self {
Self {
block_manager,
block_ref_table,
cursor: [0u8; 32].into(),
counter: 0,
repairs: 0,
}
}
}
#[async_trait]
impl Worker for BlockRcRepair {
fn name(&self) -> String {
format!("Block refcount repair worker")
}
fn status(&self) -> WorkerStatus {
WorkerStatus {
progress: Some(format!("{} ({})", self.counter, self.repairs)),
..Default::default()
}
}
async fn work(&mut self, _must_exit: &mut watch::Receiver<bool>) -> Result<WorkerState, Error> {
for _i in 0..RC_REPAIR_ITER_COUNT {
let next1 = self
.block_manager
.rc
.rc
.range(self.cursor.as_slice()..)?
.next()
.transpose()?
.map(|(k, _)| Hash::try_from(k.as_slice()).unwrap());
let next2 = self
.block_ref_table
.data
.store
.range(self.cursor.as_slice()..)?
.next()
.transpose()?
.map(|(k, _)| Hash::try_from(&k[..32]).unwrap());
let next = match (next1, next2) {
(Some(k1), Some(k2)) => std::cmp::min(k1, k2),
(Some(k), None) | (None, Some(k)) => k,
(None, None) => {
info!(
"{}: finished, done {}, fixed {}",
self.name(),
self.counter,
self.repairs
);
return Ok(WorkerState::Done);
}
};
if self.block_manager.rc.recalculate_rc(&next)?.1 {
self.repairs += 1;
}
self.counter += 1;
if let Some(next_incr) = next.increment() {
self.cursor = next_incr;
} else {
info!(
"{}: finished, done {}, fixed {}",
self.name(),
self.counter,
self.repairs
);
return Ok(WorkerState::Done);
}
}
Ok(WorkerState::Busy)
}
async fn wait_for_work(&mut self) -> WorkerState {
unreachable!()
}
}

View file

@ -247,6 +247,14 @@ impl Garage {
#[cfg(feature = "k2v")] #[cfg(feature = "k2v")]
let k2v = GarageK2V::new(system.clone(), &db, meta_rep_param); let k2v = GarageK2V::new(system.clone(), &db, meta_rep_param);
// ---- setup block refcount recalculation ----
// this function can be used to fix inconsistencies in the RC table
block_manager.set_recalc_rc(vec![
block_ref_recount_fn(&block_ref_table),
// other functions could be added here if we had other tables
// that hold references to data blocks
]);
// -- done -- // -- done --
Ok(Arc::new(Self { Ok(Arc::new(Self {
config, config,

View file

@ -3,8 +3,12 @@ use std::sync::Arc;
use garage_db as db; use garage_db as db;
use garage_util::data::*; use garage_util::data::*;
use garage_util::error::*;
use garage_util::migrate::Migrate;
use garage_block::CalculateRefcount;
use garage_table::crdt::Crdt; use garage_table::crdt::Crdt;
use garage_table::replication::TableShardedReplication;
use garage_table::*; use garage_table::*;
use garage_block::manager::*; use garage_block::manager::*;
@ -84,3 +88,38 @@ impl TableSchema for BlockRefTable {
filter.apply(entry.deleted.get()) filter.apply(entry.deleted.get())
} }
} }
pub fn block_ref_recount_fn(
block_ref_table: &Arc<Table<BlockRefTable, TableShardedReplication>>,
) -> CalculateRefcount {
let table = Arc::downgrade(block_ref_table);
Box::new(move |tx: &db::Transaction, block: &Hash| {
let table = table
.upgrade()
.ok_or_message("cannot upgrade weak ptr to block_ref_table")
.map_err(db::TxError::Abort)?;
Ok(calculate_refcount(&table, tx, block)?)
})
}
fn calculate_refcount(
block_ref_table: &Table<BlockRefTable, TableShardedReplication>,
tx: &db::Transaction,
block: &Hash,
) -> db::TxResult<usize, Error> {
let mut result = 0;
for entry in tx.range(&block_ref_table.data.store, block.as_slice()..)? {
let (key, value) = entry?;
if &key[..32] != block.as_slice() {
break;
}
let value = BlockRef::decode(&value)
.ok_or_message("could not decode block_ref")
.map_err(db::TxError::Abort)?;
assert_eq!(value.block, *block);
if !value.deleted.get() {
result += 1;
}
}
Ok(result)
}

View file

@ -83,6 +83,19 @@ impl FixedBytes32 {
ret.copy_from_slice(by); ret.copy_from_slice(by);
Some(Self(ret)) Some(Self(ret))
} }
/// Return the next hash
pub fn increment(&self) -> Option<Self> {
let mut ret = *self;
for byte in ret.0.iter_mut().rev() {
if *byte == u8::MAX {
*byte = 0;
} else {
*byte = *byte + 1;
return Some(ret);
}
}
return None;
}
} }
impl From<garage_net::NodeID> for FixedBytes32 { impl From<garage_net::NodeID> for FixedBytes32 {
@ -140,3 +153,25 @@ pub fn fasthash(data: &[u8]) -> FastHash {
pub fn gen_uuid() -> Uuid { pub fn gen_uuid() -> Uuid {
rand::thread_rng().gen::<[u8; 32]>().into() rand::thread_rng().gen::<[u8; 32]>().into()
} }
#[cfg(test)]
mod test {
use super::*;
#[test]
fn test_increment() {
let zero: FixedBytes32 = [0u8; 32].into();
let mut one: FixedBytes32 = [0u8; 32].into();
one.0[31] = 1;
let max: FixedBytes32 = [0xFFu8; 32].into();
assert_eq!(zero.increment(), Some(one));
assert_eq!(max.increment(), None);
let mut test: FixedBytes32 = [0u8; 32].into();
let i = 0x198DF97209F8FFFFu64;
test.0[24..32].copy_from_slice(&u64::to_be_bytes(i));
let mut test2: FixedBytes32 = [0u8; 32].into();
test2.0[24..32].copy_from_slice(&u64::to_be_bytes(i + 1));
assert_eq!(test.increment(), Some(test2));
}
}

View file

@ -70,6 +70,9 @@ pub enum Error {
#[error(display = "Corrupt data: does not match hash {:?}", _0)] #[error(display = "Corrupt data: does not match hash {:?}", _0)]
CorruptData(Hash), CorruptData(Hash),
#[error(display = "Missing block {:?}: no node returned a valid block", _0)]
MissingBlock(Hash),
#[error(display = "{}", _0)] #[error(display = "{}", _0)]
Message(String), Message(String),
} }