From 5dacff4c136cd9eac02b623044467f1a0d5e0683 Mon Sep 17 00:00:00 2001 From: Leonid Chernin Date: Thu, 9 Oct 2025 08:24:20 +0300 Subject: [PATCH] nvmeofgw: fast-failover changes beacon timeouts are measured also in prepare_beacon function to be able to correctly implement shorter timeout values default failover detection time was set to 7 sec default beacon tick was set to 1 second changed condition for detection of ceph slowness in NVMeofgwMon Signed-off-by: Leonid Chernin (cherry picked from commit b49ac690d97c8e1659fc58c91cc278403d720c3b) --- src/common/options/mon.yaml.in | 4 +-- src/mon/NVMeofGwMon.cc | 58 ++++++++++++++++++++-------------- src/mon/NVMeofGwMon.h | 11 ++++++- 3 files changed, 46 insertions(+), 27 deletions(-) diff --git a/src/common/options/mon.yaml.in b/src/common/options/mon.yaml.in index 15922b4dbdb..f3e3f889505 100644 --- a/src/common/options/mon.yaml.in +++ b/src/common/options/mon.yaml.in @@ -77,7 +77,7 @@ options: level: advanced desc: Period in seconds from last beacon to monitor marking a NVMeoF gateway as failed - default: 10 + default: 7 services: - mon - name: mon_nvmeofgw_skip_failovers_interval @@ -1399,7 +1399,7 @@ options: type: secs level: advanced desc: Period in seconds of nvmeof gateway beacon messages to monitor - default: 2 + default: 1 services: - mon - name: enable_availability_tracking diff --git a/src/mon/NVMeofGwMon.cc b/src/mon/NVMeofGwMon.cc index 597aee4e144..687cc450db3 100644 --- a/src/mon/NVMeofGwMon.cc +++ b/src/mon/NVMeofGwMon.cc @@ -42,7 +42,7 @@ void NVMeofGwMon::on_restart() { dout(10) << "called " << dendl; last_beacon.clear(); - last_tick = ceph::coarse_mono_clock::now(); + last_beacon_check = ceph::coarse_mono_clock::now(); cleanup_pending_map(); synchronize_last_beacon(); } @@ -63,7 +63,7 @@ void NVMeofGwMon::synchronize_last_beacon() gw_availability_t::GW_AVAILABLE) { dout(10) << "synchronize last_beacon for GW :" << gw_id << dendl; LastBeacon lb = {gw_id, group_key}; - last_beacon[lb] = last_tick; + 
last_beacon[lb] = last_beacon_check; } // force send ack after nearest beacon after leader re-election gw_created_pair.second.beacon_index = @@ -77,6 +77,28 @@ void NVMeofGwMon::on_shutdown() dout(10) << "called " << dendl; } +void NVMeofGwMon::check_beacon_timeout(ceph::coarse_mono_clock::time_point now, + bool &propose_pending) +{ + // Hand every GW whose last beacon is older than the grace period to process_gw_map_gw_down() and drop its last_beacon entry. + const auto nvmegw_beacon_grace = + g_conf().get_val<std::chrono::seconds>("mon_nvmeofgw_beacon_grace"); + for (auto it = last_beacon.begin(); it != last_beacon.end(); ) { + // Advance before a possible erase: std::map::erase invalidates only the erased element. + const auto cur = it++; + const auto& lb = cur->first; + if (cur->second < (now - nvmegw_beacon_grace)) { + const auto seconds = std::chrono::duration_cast<std::chrono::seconds>(now - cur->second).count(); + dout(1) << "beacon timeout for GW " << lb.gw_id << " for " << seconds <<" sec" << dendl; + pending_map.process_gw_map_gw_down(lb.gw_id, lb.group_key, propose_pending); + last_beacon.erase(cur); + } else { + dout(20) << "beacon live for GW " << lb.group_key <<" "<< lb.gw_id << dendl; + } + } + last_beacon_check = now; +} + void NVMeofGwMon::tick() { if (!is_active() || !mon.is_leader()) { @@ -87,50 +109,37 @@ void NVMeofGwMon::tick() bool _propose_pending = false; const auto now = ceph::coarse_mono_clock::now(); - const auto nvmegw_beacon_grace = - g_conf().get_val("mon_nvmeofgw_beacon_grace"); + const std::chrono::duration + mon_tick_interval(g_conf()->mon_tick_interval); + dout(15) << "NVMeofGwMon leader got a tick, pending epoch " << pending_map.epoch << dendl; - const auto client_tick_period = - g_conf().get_val("nvmeof_mon_client_tick_period"); // handle exception of tick overdued in order to avoid false detection of // overdued beacons, like it done in MgrMonitor::tick - if (last_tick != ceph::coarse_mono_clock::zero() && - (now - last_tick > (nvmegw_beacon_grace - client_tick_period))) { + if (last_beacon_check != ceph::coarse_mono_clock::zero() && + (now - last_beacon_check > (2 * mon_tick_interval))) { // 1 mon tick was missed // This case handles either local slowness (calls being delayed // for
whatever reason) or cluster election slowness (a long gap // between calls while an election happened) dout(4) << ": resetting beacon timeouts due to mon delay " - "(slow election?) of " << now - last_tick << " seconds" << dendl; + "(slow election?) of " << now - last_beacon_check << " seconds" << dendl; for (auto &i : last_beacon) { i.second = now; } } - last_tick = now; bool propose = false; // Periodic: check active FSM timers pending_map.update_active_timers(propose); _propose_pending |= propose; - const auto cutoff = now - nvmegw_beacon_grace; - // Pass over all the stored beacons NvmeGroupKey old_group_key; - for (auto &itr : last_beacon) { - auto& lb = itr.first; - auto last_beacon_time = itr.second; - if (last_beacon_time < cutoff) { - dout(1) << "beacon timeout for GW " << lb.gw_id << dendl; - pending_map.process_gw_map_gw_down(lb.gw_id, lb.group_key, propose); - _propose_pending |= propose; - last_beacon.erase(lb); - } else { - dout(20) << "beacon live for GW key: " << lb.gw_id << dendl; - } - } + check_beacon_timeout(now, propose); + _propose_pending |= propose; + BeaconSubsystems empty_subsystems; for (auto &[group_key, gws_states]: pending_map.created_gws) { BeaconSubsystems *subsystems = &empty_subsystems; @@ -674,6 +683,7 @@ bool NVMeofGwMon::prepare_beacon(MonOpRequestRef op) bool apply_ack_logic = true; bool send_ack = false; + check_beacon_timeout(now, gw_propose); if (avail == gw_availability_t::GW_CREATED) { if (gw == group_gws.end()) { gw_created = false; diff --git a/src/mon/NVMeofGwMon.h b/src/mon/NVMeofGwMon.h index 1c4617e3723..16ca9e568fc 100644 --- a/src/mon/NVMeofGwMon.h +++ b/src/mon/NVMeofGwMon.h @@ -39,7 +39,7 @@ class NVMeofGwMon: public PaxosService, NVMeofGwMap map; //NVMeGWMap NVMeofGwMap pending_map; std::map last_beacon; - ceph::coarse_mono_clock::time_point last_tick; + ceph::coarse_mono_clock::time_point last_beacon_check; public: NVMeofGwMon(Monitor &mn, Paxos &p, const std::string& service_name) @@ -100,6 +100,15 @@ private: 
void recreate_gw_epoch(); void restore_pending_map_info(NVMeofGwMap & tmp_map); void cleanup_pending_map(); + void get_gw_listeners(ceph::Formatter *f, std::pair& group_key); + int apply_beacon(const NvmeGwId &gw_id, int gw_version, + const NvmeGroupKey& group_key, void *msg, + const BeaconSubsystems& sub, gw_availability_t &avail, bool &propose_pending); + void do_send_map_ack(MonOpRequestRef op, bool gw_created, bool gw_propose, + uint64_t stored_sequence, bool is_correct_sequence, + const NvmeGroupKey& group_key, const NvmeGwId &gw_id); + void check_beacon_timeout(ceph::coarse_mono_clock::time_point now, + bool &propose_pending); }; #endif /* MON_NVMEGWMONITOR_H_ */ -- 2.47.3