From: Leonid Chernin
Date: Sun, 6 Apr 2025 10:31:16 +0000 (+0300)
Subject: nvmeofgw: fix host issue during redeploy, improves previous redeploy fix
X-Git-Tag: v20.3.0~46^2
X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=3b05ba180ca63aee27e06192dfd641de723c2dd0;p=ceph.git

nvmeofgw: fix host issue during redeploy, improves previous redeploy fix

This commit fixes an issue where hosts might get stuck during redeploy
because of many failovers/failbacks happening within a very short time
frame.

It improves the previous redeploy fix, since it allows using a short
beacon timeout with no impact on the failover time.

Signed-off-by: Leonid Chernin
---

diff --git a/src/common/options/mon.yaml.in b/src/common/options/mon.yaml.in
index db2b6f6afcc..dc8163ff480 100644
--- a/src/common/options/mon.yaml.in
+++ b/src/common/options/mon.yaml.in
@@ -77,14 +77,15 @@ options:
   level: advanced
   desc: Period in seconds from last beacon to monitor marking a NVMeoF gateway
     as failed
-  default: 15
+  default: 10
   services:
   - mon
 - name: mon_nvmeofgw_skip_failovers_interval
   type: secs
   level: advanced
   desc: Period in seconds in which no failovers are performed in GW's pool-group
-  default: 12
+    this is equal to the max GW redeploy interval
+  default: 16
   services:
   - mon
 - name: mon_nvmeofgw_set_group_id_retry
diff --git a/src/mon/NVMeofGwMap.cc b/src/mon/NVMeofGwMap.cc
index 8a9113a3a26..12bd93cef74 100755
--- a/src/mon/NVMeofGwMap.cc
+++ b/src/mon/NVMeofGwMap.cc
@@ -234,9 +234,34 @@ int NVMeofGwMap::do_delete_gw(
 void NVMeofGwMap::gw_performed_startup(const NvmeGwId &gw_id,
     const NvmeGroupKey& group_key, bool &propose_pending)
 {
+  std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
   dout(4) << "GW performed the full startup " << gw_id << dendl;
   propose_pending = true;
   increment_gw_epoch( group_key);
+  auto &st = created_gws[group_key][gw_id];
+  const auto skip_failovers_sec = g_conf().get_val<std::chrono::seconds>
+      ("mon_nvmeofgw_skip_failovers_interval");
+  const auto beacon_grace_sec =
+      g_conf().get_val<std::chrono::seconds>("mon_nvmeofgw_beacon_grace");
+  /*
+   This is a heuristic meant to identify a "cephadm redeploy" of the nvmeof GWs.
+   We want to detect that a redeploy is in progress because it lets us prevent
+   redundant failover and failback (fo/fb) actions.
+   It is very important to minimize fo/fb during redeploy: all GWs go down and
+   come back up again, and the amount of fo/fb that this can drive is large,
+   which also triggers a lot of changes on the hosts that are nvmeof-connected
+   to the GWs, even up to the point that a host gets stuck.
+   This heuristic assumes that if a GW disappears and shows up again in less
+   than REDEPLOY_TIMEOUT seconds, a redeploy might have started, so we do a
+   failover for this GW but skip failovers for the next REDEPLOY_TIMEOUT,
+   then again for the next GW that disappears, and so on.
+   If it works as designed, then regardless of the number of GWs, a redeploy
+   will only cause one fo/fb.
+  */
+  if ((now - (st.last_gw_down_ts - beacon_grace_sec)) < skip_failovers_sec) {
+    skip_failovers_for_group(group_key);
+    dout(4) << "startup: set skip-failovers for gw " << gw_id << " group "
+        << group_key << dendl;
+  }
 }

 void NVMeofGwMap::increment_gw_epoch(const NvmeGroupKey& group_key)
@@ -332,6 +357,7 @@ int NVMeofGwMap::process_gw_map_gw_down(
   dout(10) << "GW down " << gw_id << dendl;
   auto& st = gw_state->second;
   st.set_unavailable_state();
+  st.set_last_gw_down_ts();
   for (auto& state_itr: created_gws[group_key][gw_id].sm_state) {
     fsm_handle_gw_down(
       gw_id, group_key, state_itr.second,
diff --git a/src/mon/NVMeofGwMon.cc b/src/mon/NVMeofGwMon.cc
index 1e5efad1810..e41060084ea 100644
--- a/src/mon/NVMeofGwMon.cc
+++ b/src/mon/NVMeofGwMon.cc
@@ -158,7 +158,7 @@ version_t NVMeofGwMon::get_trim_to() const
  * function called during new paxos epochs
  * function called to restore in pending map all data that is not serialized
  * to paxos peons. Othervise it would be overriden in "pending_map = map"
- * currently just "allow_failovers_ts" variable is restored
+ * currently "allow_failovers_ts" and "last_gw_down_ts" variables are restored
  */
 void NVMeofGwMon::restore_pending_map_info(NVMeofGwMap & tmp_map) {
   std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
@@ -173,6 +173,8 @@ void NVMeofGwMon::restore_pending_map_info(NVMeofGwMap & tmp_map) {
         pending_map.created_gws[group_key][gw_id].allow_failovers_ts =
           gw_created_pair.second.allow_failovers_ts;
       }
+      pending_map.created_gws[group_key][gw_id].last_gw_down_ts =
+        gw_created_pair.second.last_gw_down_ts;
     }
   }
 }
@@ -671,7 +673,7 @@ bool NVMeofGwMon::prepare_beacon(MonOpRequestRef op)
         << gw_id << dendl;
       process_gw_down(gw_id, group_key, gw_propose, avail);
       pending_map.skip_failovers_for_group(group_key);
-      dout(4) << "set skip-failovers for gw's group " << gw_id << " group "
+      dout(4) << "fast_reboot: set skip-failovers for gw " << gw_id << " group "
         << group_key << dendl;
     } else if (
       pending_map.created_gws[group_key][gw_id].performed_full_startup ==
diff --git a/src/mon/NVMeofGwTypes.h b/src/mon/NVMeofGwTypes.h
index cf264339348..097397795a9 100755
--- a/src/mon/NVMeofGwTypes.h
+++ b/src/mon/NVMeofGwTypes.h
@@ -153,6 +153,8 @@ struct NvmeGwMonState {
   */
   std::chrono::system_clock::time_point allow_failovers_ts =
     std::chrono::system_clock::now();
+  std::chrono::system_clock::time_point last_gw_down_ts =
+    std::chrono::system_clock::now() - std::chrono::seconds(30);
   NvmeGwMonState(): ana_grp_id(REDUNDANT_GW_ANA_GROUP_ID) {}
   NvmeGwMonState(NvmeAnaGrpId id)
@@ -174,6 +176,9 @@ struct NvmeGwMonState {
     sm_state[grpid] = gw_states_per_group_t::GW_ACTIVE_STATE;
     blocklist_data[grpid].osd_epoch = 0;
   }
+  void set_last_gw_down_ts() {
+    last_gw_down_ts = std::chrono::system_clock::now();
+  }
 };

 struct NqnState {
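
For illustration only, the following is a minimal standalone C++ sketch of the
timing check added in gw_performed_startup(); it is not part of the patch. The
names GwState and looks_like_redeploy are hypothetical, and the 10 s / 16 s
values mirror the new mon_nvmeofgw_beacon_grace and
mon_nvmeofgw_skip_failovers_interval defaults.

// Standalone sketch of the redeploy-detection heuristic (illustrative names).
#include <chrono>
#include <iostream>

using Clock = std::chrono::system_clock;

struct GwState {
  // Timestamp of the last time the monitor marked this GW unavailable;
  // initialized far enough in the past that a fresh GW never matches.
  Clock::time_point last_gw_down_ts = Clock::now() - std::chrono::seconds(30);
};

// Returns true if the GW reappeared quickly enough after going down that the
// monitor should treat it as part of a redeploy and skip further failovers
// for the group.
bool looks_like_redeploy(const GwState& st,
                         std::chrono::seconds beacon_grace,
                         std::chrono::seconds skip_failovers_interval,
                         Clock::time_point now = Clock::now()) {
  // last_gw_down_ts is recorded when the beacon grace expires, i.e. roughly
  // beacon_grace after the GW actually went down, so subtract that back out
  // to approximate the real outage start.
  return (now - (st.last_gw_down_ts - beacon_grace)) < skip_failovers_interval;
}

int main() {
  using namespace std::chrono_literals;
  GwState st;
  const auto now = Clock::now();

  // Marked down 4 s ago: 4 s + 10 s grace = 14 s since the real outage,
  // below the 16 s skip-failovers window -> treated as redeploy (prints 1).
  st.last_gw_down_ts = now - 4s;
  std::cout << "down 4s ago:  redeploy? "
            << looks_like_redeploy(st, 10s, 16s, now) << "\n";

  // Marked down 20 s ago: well past the window -> normal failback (prints 0).
  st.last_gw_down_ts = now - 20s;
  std::cout << "down 20s ago: redeploy? "
            << looks_like_redeploy(st, 10s, 16s, now) << "\n";
  return 0;
}

Because the check accounts for the beacon grace, the 16 s skip window can stay
close to the expected per-GW redeploy gap without requiring a longer beacon
timeout, which is what keeps failover time unaffected.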