From: xie xingguo Date: Sat, 31 Aug 2019 02:17:57 +0000 (+0800) Subject: osd/PG: fix _finish_recovery vs repair race X-Git-Tag: v15.1.0~1647^2 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=d96e53285b4e748eacda314bf0958b87cfa42130;p=ceph.git osd/PG: fix _finish_recovery vs repair race On detecting a corrupted object, the primary may automatically repair that object by leveraging the existing recovery procedure, which turned out to be racy with a previous unfinished _finish_recovery callback - the problem would then be that _finish_recovery might continue to purge some strays that we still want to pull data from. Fix by re-checking if there are any newly added missing objects when executing _finish_recovery. Note that before https://github.com/ceph/ceph/pull/29756 we might instead have to call needs_recovery to catch the race condition since we did not evict pg from clean state when triggering an auto-repair. Signed-off-by: xie xingguo --- diff --git a/src/osd/PG.cc b/src/osd/PG.cc index 88d86957ad9e..6556d13301e1 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -494,11 +494,12 @@ Context *PG::finish_recovery() void PG::_finish_recovery(Context *c) { std::scoped_lock locker{*this}; - // When recovery is initiated by a repair, that flag is left on - state_clear(PG_STATE_REPAIR); - if (recovery_state.is_deleting()) { + if (recovery_state.is_deleting() || !is_clean()) { + dout(10) << __func__ << " raced with delete or repair" << dendl; return; } + // When recovery is initiated by a repair, that flag is left on + state_clear(PG_STATE_REPAIR); if (c == finish_sync_event) { dout(10) << "_finish_recovery" << dendl; finish_sync_event = 0;