From c4960aa42a3c8b839a18448cebb7db4b8f815b78 Mon Sep 17 00:00:00 2001
From: Yuval Lifshitz
Date: Mon, 22 Aug 2022 13:29:56 +0000
Subject: [PATCH] rgw/multisite: check for late lease renewals

also make lease renewal logs more uniform

Signed-off-by: Yuval Lifshitz
---
 src/rgw/rgw_cr_rados.cc  |  9 +++++++++
 src/rgw/rgw_cr_rados.h   | 13 ++++++++++++-
 src/rgw/rgw_data_sync.cc | 12 +++++++++---
 src/rgw/rgw_sync.cc      | 13 +++++++------
 4 files changed, 37 insertions(+), 10 deletions(-)

diff --git a/src/rgw/rgw_cr_rados.cc b/src/rgw/rgw_cr_rados.cc
index 10e1345f3938..05079723792f 100644
--- a/src/rgw/rgw_cr_rados.cc
+++ b/src/rgw/rgw_cr_rados.cc
@@ -917,8 +917,16 @@ int RGWContinuousLeaseCR::operate(const DoutPrefixProvider *dpp)
     return set_cr_done();
   }
   reenter(this) {
+    last_renew_try_time = ceph::coarse_mono_clock::now();
     while (!going_down) {
       yield call(new RGWSimpleRadosLockCR(async_rados, store, obj, lock_name, cookie, interval));
+      current_time = ceph::coarse_mono_clock::now();
+      if (current_time - last_renew_try_time > interval_tolerance) {
+        // renewal should happen between 50%-90% of interval
+        ldout(store->ctx(), 1) << *this << ": WARNING: did not renew lock " << obj << ":" << lock_name << ": within 90\% of interval. " <<
+          (current_time - last_renew_try_time) << " > " << interval_tolerance << dendl;
+      }
+      last_renew_try_time = current_time;

       caller->set_sleeping(false); /* will only be relevant when we return, that's why we can do it early */
       if (retcode < 0) {
@@ -926,6 +934,7 @@ int RGWContinuousLeaseCR::operate(const DoutPrefixProvider *dpp)
         ldout(store->ctx(), 20) << *this << ": couldn't lock " << obj << ":" << lock_name << ": retcode=" << retcode << dendl;
         return set_state(RGWCoroutine_Error, retcode);
       }
+      ldout(store->ctx(), 20) << *this << ": successfully locked " << obj << ":" << lock_name << dendl;
       set_locked(true);
       yield wait(utime_t(interval / 2, 0));
     }
diff --git a/src/rgw/rgw_cr_rados.h b/src/rgw/rgw_cr_rados.h
index 9e380e78ad31..cb392c05ff10 100644
--- a/src/rgw/rgw_cr_rados.h
+++ b/src/rgw/rgw_cr_rados.h
@@ -13,6 +13,7 @@
 #include "common/Throttle.h"

 #include <atomic>
+#include "common/ceph_time.h"

 #include "services/svc_sys_obj.h"
 #include "services/svc_bucket.h"
@@ -1414,10 +1415,16 @@ class RGWContinuousLeaseCR : public RGWCoroutine {
   int interval;
   bool going_down{ false };
   bool locked{false};
+
+  const ceph::timespan interval_tolerance;
+  const ceph::timespan ts_interval;

   RGWCoroutine *caller;

   bool aborted{false};
+
+  ceph::coarse_mono_time last_renew_try_time;
+  ceph::coarse_mono_time current_time;

 public:
   RGWContinuousLeaseCR(RGWAsyncRadosProcessor *_async_rados, rgw::sal::RadosStore* _store,
@@ -1426,7 +1433,8 @@ public:
     : RGWCoroutine(_store->ctx()), async_rados(_async_rados), store(_store),
     obj(_obj), lock_name(_lock_name),
     cookie(RGWSimpleRadosLockCR::gen_random_cookie(cct)),
-    interval(_interval), caller(_caller)
+    interval(_interval), interval_tolerance(ceph::make_timespan(9*interval/10)), ts_interval(ceph::make_timespan(interval)),
+    caller(_caller)
   {}

   virtual ~RGWContinuousLeaseCR() override;
@@ -1434,6 +1442,9 @@ public:
   int operate(const DoutPrefixProvider *dpp) override;

   bool is_locked() const {
+    if (ceph::coarse_mono_clock::now() - last_renew_try_time > ts_interval) {
+      return false;
+    }
     return locked;
   }

diff --git a/src/rgw/rgw_data_sync.cc b/src/rgw/rgw_data_sync.cc
index 88cb9db899fd..17ec3e86846a 100644
--- a/src/rgw/rgw_data_sync.cc
+++ b/src/rgw/rgw_data_sync.cc
@@ -1579,6 +1579,7 @@ public:
       if (!lease_cr->is_locked()) {
         lease_cr->go_down();
         drain_all();
+        tn->log(1, "lease is lost, abort");
         return set_cr_error(-ECANCELED);
       }
       omapvals = std::make_shared<RGWRadosGetOmapValsCR::Result>();
@@ -1672,6 +1673,7 @@ public:
       if (!lease_cr->is_locked()) {
         lease_cr->go_down();
         drain_all();
+        tn->log(1, "lease is lost, abort");
         return set_cr_error(-ECANCELED);
       }
       current_modified.clear();
@@ -4006,6 +4008,7 @@ int RGWBucketFullSyncCR::operate(const DoutPrefixProvider *dpp)
     do {
       if (lease_cr && !lease_cr->is_locked()) {
         drain_all();
+        tn->log(1, "no lease or lease is lost, abort");
         return set_cr_error(-ECANCELED);
       }
       set_status("listing remote bucket");
@@ -4030,6 +4033,7 @@ int RGWBucketFullSyncCR::operate(const DoutPrefixProvider *dpp)
       for (; entries_iter != list_result.entries.end(); ++entries_iter) {
         if (lease_cr && !lease_cr->is_locked()) {
           drain_all();
+          tn->log(1, "no lease or lease is lost, abort");
           return set_cr_error(-ECANCELED);
         }
         tn->log(20, SSTR("[full sync] syncing object: "
@@ -4075,6 +4079,7 @@ int RGWBucketFullSyncCR::operate(const DoutPrefixProvider *dpp)
     });
     tn->unset_flag(RGW_SNS_FLAG_ACTIVE);
     if (lease_cr && !lease_cr->is_locked()) {
+      tn->log(1, "no lease or lease is lost, abort");
       return set_cr_error(-ECANCELED);
     }
     yield call(marker_tracker.flush());
@@ -4265,7 +4270,7 @@ int RGWBucketShardIncrementalSyncCR::operate(const DoutPrefixProvider *dpp)
   do {
     if (lease_cr && !lease_cr->is_locked()) {
       drain_all();
-      tn->log(0, "ERROR: lease is not taken, abort");
+      tn->log(1, "no lease or lease is lost, abort");
       return set_cr_error(-ECANCELED);
     }
     tn->log(20, SSTR("listing bilog for incremental sync; position=" << sync_info.inc_marker.position));
@@ -4321,6 +4326,7 @@ int RGWBucketShardIncrementalSyncCR::operate(const DoutPrefixProvider *dpp)
     for (; entries_iter != entries_end; ++entries_iter) {
       if (lease_cr && !lease_cr->is_locked()) {
         drain_all();
+        tn->log(1, "no lease or lease is lost, abort");
         return set_cr_error(-ECANCELED);
       }
       entry = &(*entries_iter);
@@ -5200,7 +5206,7 @@ int RGWSyncBucketCR::operate(const DoutPrefixProvider *dpp)
       yield spawn(bucket_lease_cr.get(), false);
       while (!bucket_lease_cr->is_locked()) {
         if (bucket_lease_cr->is_done()) {
-          tn->log(5, "ERROR: failed to take bucket lease");
+          tn->log(5, "failed to take lease");
           set_status("lease lock failed, early abort");
           drain_all();
           return set_cr_error(bucket_lease_cr->get_ret_status());
@@ -5257,7 +5263,7 @@ int RGWSyncBucketCR::operate(const DoutPrefixProvider *dpp)
       yield spawn(bucket_lease_cr.get(), false);
       while (!bucket_lease_cr->is_locked()) {
         if (bucket_lease_cr->is_done()) {
-          tn->log(5, "ERROR: failed to take bucket lease");
+          tn->log(5, "failed to take lease");
           set_status("lease lock failed, early abort");
           drain_all();
           return set_cr_error(bucket_lease_cr->get_ret_status());
diff --git a/src/rgw/rgw_sync.cc b/src/rgw/rgw_sync.cc
index 815405e0a02b..705f3eac872f 100644
--- a/src/rgw/rgw_sync.cc
+++ b/src/rgw/rgw_sync.cc
@@ -679,7 +679,7 @@ public:
       }
       while (!lease_cr->is_locked()) {
         if (lease_cr->is_done()) {
-          ldpp_dout(dpp, 5) << "lease cr failed, done early " << dendl;
+          ldpp_dout(dpp, 5) << "failed to take lease" << dendl;
           set_status("lease lock failed, early abort");
           return set_cr_error(lease_cr->get_ret_status());
         }
@@ -926,8 +926,8 @@ public:
       }
       while (!lease_cr->is_locked()) {
         if (lease_cr->is_done()) {
-          ldpp_dout(dpp, 5) << "lease cr failed, done early " << dendl;
-          set_status("failed acquiring lock");
+          ldpp_dout(dpp, 5) << "failed to take lease" << dendl;
+          set_status("lease lock failed, early abort");
           return set_cr_error(lease_cr->get_ret_status());
         }
         set_sleeping(true);
@@ -977,6 +977,7 @@ public:
         for (; iter != result.keys.end(); ++iter) {
           if (!lease_cr->is_locked()) {
             lost_lock = true;
+            tn->log(1, "lease is lost, abort");
             break;
           }
           yield; // allow entries_index consumer to make progress
@@ -1618,7 +1619,7 @@ public:
       /* sync! */
       do {
         if (!lease_cr->is_locked()) {
-          tn->log(10, "lost lease");
+          tn->log(1, "lease is lost, abort");
           lost_lock = true;
           break;
         }
@@ -1751,7 +1752,7 @@ public:
       while (!lease_cr->is_locked()) {
         if (lease_cr->is_done()) {
           drain_all();
-          tn->log(10, "failed to take lease");
+          tn->log(5, "failed to take lease");
           return lease_cr->get_ret_status();
         }
         set_sleeping(true);
@@ -1784,7 +1785,7 @@ public:
       do {
         if (!lease_cr->is_locked()) {
           lost_lock = true;
-          tn->log(10, "lost lease");
+          tn->log(1, "lease is lost, abort");
           break;
         }
 #define INCREMENTAL_MAX_ENTRIES 100
-- 
2.47.3