From d96e53285b4e748eacda314bf0958b87cfa42130 Mon Sep 17 00:00:00 2001 From: xie xingguo Date: Sat, 31 Aug 2019 10:17:57 +0800 Subject: [PATCH] osd/PG: fix _finish_recovery vs repair race On detecting a corrupted object, primary may automatically repair that object by leveraging the existing recovery procedure, which turned out to be racy with a previous unfinished _finish_recovery callback - the problem would then be that _finish_recovery might continue to purge some strays that we still want to pull data from. Fix by re-checking if there are any newly added missing objects when executing _finish_recovery. Note that before https://github.com/ceph/ceph/pull/29756 we might instead have to call needs_recovery to catch the race condition since we did not evict pg from clean state when triggering an auto-repair. Signed-off-by: xie xingguo --- src/osd/PG.cc | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/osd/PG.cc b/src/osd/PG.cc index 88d86957ad9..6556d13301e 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -494,11 +494,12 @@ Context *PG::finish_recovery() void PG::_finish_recovery(Context *c) { std::scoped_lock locker{*this}; - // When recovery is initiated by a repair, that flag is left on - state_clear(PG_STATE_REPAIR); - if (recovery_state.is_deleting()) { + if (recovery_state.is_deleting() || !is_clean()) { + dout(10) << __func__ << " raced with delete or repair" << dendl; return; } + // When recovery is initiated by a repair, that flag is left on + state_clear(PG_STATE_REPAIR); if (c == finish_sync_event) { dout(10) << "_finish_recovery" << dendl; finish_sync_event = 0; -- 2.47.3