From 71b5b39220b5f69995faffad1ef6992db6a337df Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 7 Feb 2020 10:33:26 -0600 Subject: [PATCH] osd/PeeringState: do not start renewing leases until PG is activated The activate() work renews the lease so that we can send lease info out to the peers immediately. However, these messages may get delayed. Since we immediately start scheduling renewals, it's possible for the renewal to go out before the PG is active, crashing the replicas. Fix by not scheduling renewals until the PG is really active. Also, renew aggressively at that point in time, since it may have been a while since we first started the activation. Fixes: https://tracker.ceph.com/issues/44041 Signed-off-by: Sage Weil --- src/osd/PeeringState.cc | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/osd/PeeringState.cc b/src/osd/PeeringState.cc index aa9e047c9337..3c84ebf350a2 100644 --- a/src/osd/PeeringState.cc +++ b/src/osd/PeeringState.cc @@ -2390,7 +2390,7 @@ void PeeringState::activate( if (HAVE_FEATURE(upacting_features, SERVER_OCTOPUS)) { renew_lease(pl->get_mnow()); - schedule_renew_lease(); + // do not schedule until we are actually activated } // adjust purged_snaps: PG may have been inactive while snaps were pruned @@ -5880,6 +5880,16 @@ void PeeringState::Active::all_activated_and_committed() ceph_assert(!ps->acting_recovery_backfill.empty()); ceph_assert(ps->blocked_by.empty()); + if (HAVE_FEATURE(ps->upacting_features, SERVER_OCTOPUS)) { + // this is overkill when the activation is quick, but when it is slow it + // is important, because the lease was renewed by the activate itself but we + // don't know how long ago that was, and simply scheduling now may leave + // a gap in lease coverage. keep it simple and aggressively renew. + ps->renew_lease(pl->get_mnow()); + ps->send_lease(); + ps->schedule_renew_lease(); + } + // Degraded? ps->update_calc_stats(); if (ps->info.stats.stats.sum.num_objects_degraded) { -- 2.47.3