From 10eff2567971ca57b1e821f704de490add021c8e Mon Sep 17 00:00:00 2001 From: xie xingguo Date: Fri, 24 Jul 2020 09:57:40 +0800 Subject: [PATCH] osd/PeeringState: prevent peer's num_objects going negative Saw it in a teuthology run: -5645> 2020-07-20 04:34:32.067 7f351e329700 5 osd.5 pg_epoch: 667 ... exit Started/Primary/Active/Backfilling -5642> 2020-07-20 04:34:32.067 7f351e329700 5 osd.5 pg_epoch: 667 ... enter Started/Primary/Active/Recovered -5633> 2020-07-20 04:34:32.067 7f351e329700 20 osd.5 pg_epoch: 667 ... _update_calc_stats shard 5 primary objects 0 missing 0 -5632> 2020-07-20 04:34:32.067 7f351e329700 20 osd.5 pg_epoch: 667 ... _update_calc_stats shard 3 objects -1 missing 1 -5631> 2020-07-20 04:34:32.067 7f351e329700 20 osd.5 pg_epoch: 667 ... _update_calc_stats shard 6 objects 0 missing 0 This will crash the choose_acting() procedure as it will mistakenly think that peer 3 should continue to perform asynchronous recovery (e.g., due to num_objects_missing = 1) in contrast to fully backfill-recovered. While I did not dig into the real cause, there are a couple of possible explanations of how num_objects can be off. I think that if a roll forward or log replay could delete something twice, maybe there would be an undercount. Or maybe something as simple as a corruption. Since _update_calc_stats() is going to fix num_objects_missing for that peer anyway, let's make sure it always starts with a clean state. Fixes: https://tracker.ceph.com/issues/46705 Signed-off-by: xie xingguo --- src/osd/PeeringState.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/osd/PeeringState.cc b/src/osd/PeeringState.cc index 5daf8fbb49e..322bea953b0 100644 --- a/src/osd/PeeringState.cc +++ b/src/osd/PeeringState.cc @@ -3276,7 +3276,8 @@ void PeeringState::update_calc_stats() // Primary should not be in the peer_info, skip if it is. if (peer.first == pg_whoami) continue; int64_t missing = 0; - int64_t peer_num_objects = peer.second.stats.stats.sum.num_objects; + int64_t peer_num_objects = + std::max((int64_t)0, peer.second.stats.stats.sum.num_objects); // Backfill targets always track num_objects accurately // all other peers track missing accurately. if (is_backfill_target(peer.first)) { -- 2.39.5