From 7bbc724d99e998bf6e06c3d32dc68348ab6aa45a Mon Sep 17 00:00:00 2001
From: Sage Weil
Date: Mon, 25 Nov 2019 13:15:24 -0600
Subject: [PATCH] osd/PeeringState: clear LAGGY and WAIT states on exiting
 Started

These flags were not getting cleared except in recheck_readable(), which
meant that a flag from a prior interval could bleed into a new interval.
More dangerously, in a mixed-version cluster, one interval might include
all octopus+ OSDs while the next might include a pre-octopus OSD,
bypassing most of the laggy recheck code.

This could lead to a stalled request and/or requeue ordering bug when
release_object_locks() looked at is_laggy() and put a lock waiter on the
waiting_for_readable list.

Fixes: https://tracker.ceph.com/issues/42978
Signed-off-by: Sage Weil
---
 src/osd/PeeringState.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/osd/PeeringState.cc b/src/osd/PeeringState.cc
index a9b5b4fde41d9..a326dc28a4e34 100644
--- a/src/osd/PeeringState.cc
+++ b/src/osd/PeeringState.cc
@@ -4301,6 +4301,7 @@ void PeeringState::Started::exit()
   DECLARE_LOCALS;
   utime_t dur = ceph_clock_now() - enter_time;
   pl->get_peering_perf().tinc(rs_started_latency, dur);
+  ps->state_clear(PG_STATE_WAIT | PG_STATE_LAGGY);
 }
 
 /*--------Reset---------*/
-- 
2.39.5