From bca3fd651f4222e4370ba175b447ca91b84d23b0 Mon Sep 17 00:00:00 2001 From: xie xingguo Date: Fri, 24 Jul 2020 09:57:40 +0800 Subject: [PATCH] osd/PeeringState: prevent peer's num_objects going negative Saw it in a teuthology run: -5645> 2020-07-20 04:34:32.067 7f351e329700 5 osd.5 pg_epoch: 667 ... exit Started/Primary/Active/Backfilling -5642> 2020-07-20 04:34:32.067 7f351e329700 5 osd.5 pg_epoch: 667 ... enter Started/Primary/Active/Recovered -5633> 2020-07-20 04:34:32.067 7f351e329700 20 osd.5 pg_epoch: 667 ... _update_calc_stats shard 5 primary objects 0 missing 0 -5632> 2020-07-20 04:34:32.067 7f351e329700 20 osd.5 pg_epoch: 667 ... _update_calc_stats shard 3 objects -1 missing 1 -5631> 2020-07-20 04:34:32.067 7f351e329700 20 osd.5 pg_epoch: 667 ... _update_calc_stats shard 6 objects 0 missing 0 This will crash the choose_acting() procedure as it will mistakenly think that peer 3 should continue to perform asynchronous recovery (e.g., due to num_objects_missing = 1) in contrast to fully backfill-recovered. While I did not dig into the real cause, there are a couple of possible explanations of how num_objects can be off. I think that if a roll forward or log replay could delete something twice, maybe there would be an undercount. Or maybe something as simple as a corruption. Since _update_calc_stats() is going to fix num_objects_missing for that peer anyway, let's make sure it always starts with a clean state. Fixes: https://tracker.ceph.com/issues/46705 Signed-off-by: xie xingguo (cherry picked from commit 10eff2567971ca57b1e821f704de490add021c8e) Conflicts: src/osd/PeeringState.cc - file does not exist in nautilus; made the change manually in src/osd/PG.cc instead --- src/osd/PG.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/osd/PG.cc b/src/osd/PG.cc index 76bbbeb1d6cf6..36bb4d242562b 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -3349,7 +3349,8 @@ void PG::_update_calc_stats() // Primary should not be in the peer_info, skip if it is. if (peer.first == pg_whoami) continue; int64_t missing = 0; - int64_t peer_num_objects = peer.second.stats.stats.sum.num_objects; + int64_t peer_num_objects = + std::max((int64_t)0, peer.second.stats.stats.sum.num_objects); // Backfill targets always track num_objects accurately // all other peers track missing accurately. if (is_backfill_targets(peer.first)) { -- 2.39.5