From 962699073c75cd1e6c03d0d167054eb14e5df0b1 Mon Sep 17 00:00:00 2001
From: Leonid Chernin
Date: Thu, 23 Oct 2025 08:48:24 +0300
Subject: [PATCH] mon/nvmeofgw: add two commands, set-location and admin
 enable/disable

1. The failover logic now incorporates the GW location.
2. Implement the GW admin enable/disable commands.

Signed-off-by: Leonid Chernin
---
 src/mon/MonCommands.h       |  23 +++++
 src/mon/NVMeofGwMap.cc      | 165 ++++++++++++++++++++++++++++--------
 src/mon/NVMeofGwMap.h       |   6 ++
 src/mon/NVMeofGwMon.cc      |  54 +++++++++++-
 src/mon/NVMeofGwSerialize.h |  22 ++++-
 src/mon/NVMeofGwTypes.h     |   8 ++
 6 files changed, 239 insertions(+), 39 deletions(-)

diff --git a/src/mon/MonCommands.h b/src/mon/MonCommands.h
index 504619bd2bb..302453b6059 100644
--- a/src/mon/MonCommands.h
+++ b/src/mon/MonCommands.h
@@ -1453,6 +1453,29 @@ COMMAND("nvme-gw show"
 	" show nvmeof gateways within (pool, group)",
 	"mon", "r")
 
+COMMAND("nvme-gw enable"
+	" name=id,type=CephString"
+	" name=pool,type=CephString"
+	" name=group,type=CephString",
+	"administratively enable nvmeof gateway id for (pool, group)",
+	"mgr", "rw")
+
+COMMAND("nvme-gw disable"
+	" name=id,type=CephString"
+	" name=pool,type=CephString"
+	" name=group,type=CephString",
+	"administratively disable nvmeof gateway id for (pool, group)",
+	"mgr", "rw")
+
+COMMAND("nvme-gw set-locale"
+	" name=id,type=CephString"
+	" name=pool,type=CephString"
+	" name=group,type=CephString"
+	" name=locale,type=CephString",
+	"set location for nvmeof gateway id within (pool, group)",
+	"mgr", "rw")
+
 // these are tell commands that were implemented as CLI commands in
 // the broken pre-octopus way that we want to allow to work when a
 // monitor has upgraded to octopus+ but the monmap min_mon_release is
diff --git a/src/mon/NVMeofGwMap.cc b/src/mon/NVMeofGwMap.cc
index 964946cda3d..795b6940b22 100755
--- a/src/mon/NVMeofGwMap.cc
+++ b/src/mon/NVMeofGwMap.cc
@@ -259,6 +259,65 @@ int NVMeofGwMap::do_delete_gw(
   return -EINVAL;
 }
 
+int NVMeofGwMap::cfg_admin_state_change(const NvmeGwId &gw_id,
+    const NvmeGroupKey& group_key,
+    gw_admin_state_t state, bool &propose_pending, bool test)
+{
+  auto& gws_states = created_gws[group_key];
+  auto gw_state = gws_states.find(gw_id);
+  if (gw_state != gws_states.end()) {
+    auto& st = gw_state->second;
+    if (state == gw_admin_state_t::GW_ADMIN_DISABLED) {
+      if (st.gw_admin_state == gw_admin_state_t::GW_ADMIN_ENABLED) {
+        dout(4) << "GW-id set admin Disabled " << group_key
+                << " " << gw_id << dendl;
+        if (st.availability == gw_availability_t::GW_AVAILABLE) {
+          skip_failovers_for_group(group_key, 5);
+          process_gw_map_gw_down(gw_id, group_key, propose_pending);
+        }
+        propose_pending = true;
+      }
+    } else if (state == gw_admin_state_t::GW_ADMIN_ENABLED) {
+      if (st.gw_admin_state == gw_admin_state_t::GW_ADMIN_DISABLED) {
+        dout(4) << "GW-id set admin Enabled " << group_key
+                << " " << gw_id << dendl;
+        propose_pending = true;
+      }
+    }
+    st.gw_admin_state = state;
+  } else {
+    dout(4) << "GW-id not created yet " << group_key << " " << gw_id << dendl;
+    return -EINVAL;
+  }
+  return 0;
+}
+
+int NVMeofGwMap::cfg_set_location(const NvmeGwId &gw_id,
+    const NvmeGroupKey& group_key,
+    std::string &location, bool &propose_pending, bool test)
+{
+  // validate that the new location differs from the current GW location
+  auto& gws_states = created_gws[group_key];
+  auto gw_state = gws_states.find(gw_id);
+  if (gw_state != gws_states.end()) {
+    auto& st = gw_state->second;
+    if (st.location == location) {
+      dout(4) << "GW-id location unchanged " << group_key
+              << " " << gw_id << " " << location << dendl;
+      return 0;
+    } else {
+      st.location = location;
+      dout(10) << "GW-id location is set " << group_key
+               << " " << gw_id << " " << location << dendl;
+      propose_pending = true;
+      return 0;
+    }
+  } else {
+    dout(4) << "GW-id not created yet " << group_key << " " << gw_id << dendl;
+    return -EINVAL;
+  }
+}
+
 void NVMeofGwMap::gw_performed_startup(const NvmeGwId &gw_id,
   const NvmeGroupKey& group_key, bool &propose_pending)
 {
@@ -613,15 +672,70 @@ void NVMeofGwMap::find_failback_gw(
   }
 }
 
+int NVMeofGwMap::find_failover_gw_logic(NvmeGwMonStates& gws_states,
+  NvmeLocation& location, NvmeGwId& min_loaded_gw_id)
+{
+#define ILLEGAL_GW_ID " "
+#define MIN_NUM_ANA_GROUPS 0xFFF
+  int min_num_ana_groups_in_gw = MIN_NUM_ANA_GROUPS;
+  min_loaded_gw_id = ILLEGAL_GW_ID;
+  int current_ana_groups_in_gw = 0;
+  int num_busy = 0, num_gws = 0;
+  // for all the gateways of the subsystem, consider only the GWs that
+  // belong to the given location (or all GWs if the location is empty)
+  for (auto& found_gw_state: gws_states) {
+    auto st = found_gw_state.second;
+    if (st.availability == gw_availability_t::GW_AVAILABLE &&
+        (location == "" || st.location == location)) {
+      num_gws++;
+      current_ana_groups_in_gw = 0;
+      for (auto& state_itr: st.sm_state) {
+        NvmeAnaGrpId anagrp = state_itr.first;
+        if ((st.sm_state[anagrp] ==
+             gw_states_per_group_t::GW_OWNER_WAIT_FAILBACK_PREPARED) ||
+            (st.sm_state[anagrp] ==
+             gw_states_per_group_t::GW_WAIT_FAILBACK_PREPARED) ||
+            (st.sm_state[anagrp] ==
+             gw_states_per_group_t::GW_WAIT_BLOCKLIST_CMPL)) {
+          current_ana_groups_in_gw = 0xFFFF;
+          num_busy++;
+          break; // don't take GWs in a transient state into account
+        } else if (st.sm_state[anagrp] ==
+                   gw_states_per_group_t::GW_ACTIVE_STATE) {
+          // how many ANA groups are handled by this GW
+          current_ana_groups_in_gw++;
+        }
+      }
+      if (min_num_ana_groups_in_gw > current_ana_groups_in_gw) {
+        min_num_ana_groups_in_gw = current_ana_groups_in_gw;
+        min_loaded_gw_id = found_gw_state.first;
+        dout(10) << "choose: gw-id min_ana_groups " << min_loaded_gw_id
+                 << " " << current_ana_groups_in_gw << " min "
+                 << min_num_ana_groups_in_gw << dendl;
+      }
+    }
+  }
+  if (min_loaded_gw_id != ILLEGAL_GW_ID) { // some GW chosen
+    return 0;
+  } else if (num_busy) {
+    dout(4) << "some GWs are busy " << num_busy
+            << " num Available " << num_gws << dendl;
+    return -EBUSY;
+  } else {
+    dout(4) << "no GWs in Active state."
num Available " << num_gws << dendl; + return -ENOENT; + } +} + void NVMeofGwMap::find_failover_candidate( const NvmeGwId &gw_id, const NvmeGroupKey& group_key, NvmeAnaGrpId grpid, bool &propose_pending) { dout(10) << __func__<< " " << gw_id << dendl; -#define ILLEGAL_GW_ID " " -#define MIN_NUM_ANA_GROUPS 0xFFF - int min_num_ana_groups_in_gw = 0; - int current_ana_groups_in_gw = 0; + //int current_ana_groups_in_gw = 0; + NvmeLocation ana_location = ""; std::chrono::system_clock::time_point now = std::chrono::system_clock::now(); NvmeGwId min_loaded_gw_id = ILLEGAL_GW_ID; auto& gws_states = created_gws[group_key]; @@ -648,40 +762,19 @@ void NVMeofGwMap::find_failover_candidate( gw_state->second.standby_state(grpid); return ; } + if (st.ana_grp_id == grpid) { + ana_location = st.location; // found original location of the ANA group + dout(10) << "Found location " << ana_location + << " for anagrp " << grpid << dendl; + } } // Find a GW that takes over the ANA group(s) - min_num_ana_groups_in_gw = MIN_NUM_ANA_GROUPS; - min_loaded_gw_id = ILLEGAL_GW_ID; - - // for all the gateways of the subsystem - for (auto& found_gw_state: gws_states) { - auto st = found_gw_state.second; - if (st.availability == gw_availability_t::GW_AVAILABLE) { - current_ana_groups_in_gw = 0; - for (auto& state_itr: created_gws[group_key][gw_id].sm_state) { - NvmeAnaGrpId anagrp = state_itr.first; - if ((st.sm_state[anagrp] == - gw_states_per_group_t::GW_OWNER_WAIT_FAILBACK_PREPARED) || - (st.sm_state[anagrp] == - gw_states_per_group_t::GW_WAIT_FAILBACK_PREPARED) || - (st.sm_state[anagrp] == - gw_states_per_group_t::GW_WAIT_BLOCKLIST_CMPL)) { - current_ana_groups_in_gw = 0xFFFF; - break; // dont take into account GWs in the transitive state - } else if (st.sm_state[anagrp] == - gw_states_per_group_t::GW_ACTIVE_STATE) { - // how many ANA groups are handled by this GW - current_ana_groups_in_gw++; - } - } - if (min_num_ana_groups_in_gw > current_ana_groups_in_gw) { - min_num_ana_groups_in_gw = current_ana_groups_in_gw; - min_loaded_gw_id = found_gw_state.first; - dout(10) << "choose: gw-id min_ana_groups " << min_loaded_gw_id - << current_ana_groups_in_gw << " min " - << min_num_ana_groups_in_gw << dendl; - } - } + //Find GW among the GWs belong to the same location + int rc = find_failover_gw_logic(gws_states, ana_location, min_loaded_gw_id); + if (rc == -ENOENT) { + ana_location = ""; // looks at all GWs + dout(10) << "Find Failover GW -look at all Gateways in the pool/group" << dendl; + rc = find_failover_gw_logic(gws_states, ana_location, min_loaded_gw_id); } if (min_loaded_gw_id != ILLEGAL_GW_ID) { propose_pending = true; diff --git a/src/mon/NVMeofGwMap.h b/src/mon/NVMeofGwMap.h index 0b675f7beaf..aecce7e0036 100755 --- a/src/mon/NVMeofGwMap.h +++ b/src/mon/NVMeofGwMap.h @@ -70,6 +70,10 @@ public: int cfg_add_gw(const NvmeGwId &gw_id, const NvmeGroupKey& group_key, bool test = false); int cfg_delete_gw(const NvmeGwId &gw_id, const NvmeGroupKey& group_key); + int cfg_admin_state_change(const NvmeGwId &gw_id, const NvmeGroupKey& group_key, + gw_admin_state_t state, bool &propose_pending, bool test = false); + int cfg_set_location(const NvmeGwId &gw_id, const NvmeGroupKey& group_key, + std::string &location, bool &propose_pending, bool test = false); void process_gw_map_ka( const NvmeGwId &gw_id, const NvmeGroupKey& group_key, epoch_t& last_osd_epoch, bool &propose_pending); @@ -146,6 +150,8 @@ private: void validate_gw_map( const NvmeGroupKey& group_key); void increment_gw_epoch(const NvmeGroupKey& group_key); + int 
+    NvmeLocation& location, NvmeGwId& min_loaded_gw_id);
 
 public:
   int blocklist_gw(
diff --git a/src/mon/NVMeofGwMon.cc b/src/mon/NVMeofGwMon.cc
index 2b029f8cf24..269472e8305 100644
--- a/src/mon/NVMeofGwMon.cc
+++ b/src/mon/NVMeofGwMon.cc
@@ -510,6 +510,8 @@ bool NVMeofGwMon::preprocess_command(MonOpRequestRef op)
       f->open_object_section("stat");
       f->dump_string("gw-id", gw_id);
       f->dump_unsigned("anagrp-id",state.ana_grp_id+1);
+      f->dump_string("location", state.location);
+      f->dump_unsigned("admin-state", (uint32_t)state.gw_admin_state);
       f->dump_unsigned("num-namespaces", num_ns[state.ana_grp_id+1]);
       f->dump_unsigned("performed-full-startup", state.performed_full_startup);
       std::stringstream sstrm1;
@@ -605,13 +607,60 @@ bool NVMeofGwMon::prepare_command(MonOpRequestRef op)
       response = true;
     }
   }
+  else if (prefix == "nvme-gw enable" || prefix == "nvme-gw disable") {
+    std::string id, pool, group;
+    cmd_getval(cmdmap, "id", id);
+    cmd_getval(cmdmap, "pool", pool);
+    cmd_getval(cmdmap, "group", group);
+    auto group_key = std::make_pair(pool, group);
+    dout(10) << " id " << id << " pool " << pool << " group " << group
+             << " " << prefix << dendl;
+    gw_admin_state_t set = (prefix == "nvme-gw enable") ?
+      gw_admin_state_t::GW_ADMIN_ENABLED :
+      gw_admin_state_t::GW_ADMIN_DISABLED;
+    bool propose = false;
+    rc = pending_map.cfg_admin_state_change(id, group_key, set, propose);
+    if (rc == -EINVAL) {
+      err = rc;
+      dout(4) << "Error: cannot change admin state of GW " << id
+              << " " << pool << " " << group << " rc " << rc << dendl;
+      sstrm.str("");
+    }
+    // propose pending would be generated by the PaxosService
+    if (rc == 0 && propose == true) {
+      response = true;
+    }
+  }
+  else if (prefix == "nvme-gw set-locale") {
+    std::string id, pool, group, locale;
+    cmd_getval(cmdmap, "id", id);
+    cmd_getval(cmdmap, "pool", pool);
+    cmd_getval(cmdmap, "group", group);
+    cmd_getval(cmdmap, "locale", locale);
+    auto group_key = std::make_pair(pool, group);
+    dout(10) << " id " << id << " pool " << pool << " group " << group
+             << " locale " << locale << dendl;
+    bool propose = false;
+    rc = pending_map.cfg_set_location(id, group_key, locale, propose);
+    if (rc == -EINVAL) {
+      err = rc;
+      dout(4) << "Error: cannot set location for GW " << id
+              << " " << pool << " " << group << " rc " << rc << dendl;
+      sstrm.str("");
+    }
+    // propose pending would be generated by the PaxosService
+    if (rc == 0 && propose == true) {
+      response = true;
+    }
+  }
   getline(sstrm, rs);
   if (response == false) {
     if (err < 0 && rs.length() == 0) {
       rs = cpp_strerror(err);
       dout(10) << "Error command err : "<< err << " rs-len: "
-	      << rs.length() << dendl;
+               << rs.length() << dendl;
     }
     mon.reply_command(op, err, rs, rdata, get_last_committed());
   } else {
@@ -786,6 +835,9 @@ int NVMeofGwMon::apply_beacon(const NvmeGwId &gw_id, int gw_version,
   if (changed) {
     avail = gw_availability_t::GW_AVAILABLE;
   }
+  if (state.gw_admin_state == gw_admin_state_t::GW_ADMIN_DISABLED) {
+    avail = gw_availability_t::GW_CREATED;
+  }
   if (gw_subs.size() == 0) {
     avail = gw_availability_t::GW_CREATED;
     dout(10) << "No-subsystems condition detected for GW " << gw_id << dendl;
diff --git a/src/mon/NVMeofGwSerialize.h b/src/mon/NVMeofGwSerialize.h
--- a/src/mon/NVMeofGwSerialize.h
+++ b/src/mon/NVMeofGwSerialize.h
   if (struct_v >= 2) {
-    decode(state.last_beacon_seq_number, bl);
+    decode(last_beacon_seq_number, bl);
+    state.last_beacon_seq_number = last_beacon_seq_number;
     decode(state.last_beacon_seq_ooo, bl);
   }
   DECODE_FINISH(bl);
@@ -476,6 +478,9 @@ inline void encode(const NvmeGwMonStates& gws, ceph::bufferlist &bl,
   if (HAVE_FEATURE(features, NVMEOFHAMAP)) {
     version = 3;
   }
+  if (HAVE_FEATURE(features, NVMEOF_BEACON_DIFF)) {
+    version = 4;
+  }
   ENCODE_START(version, version, bl);
   dout(20) << "encode NvmeGwMonStates. struct_v: " << (int)version << dendl;
   encode ((uint32_t)gws.size(), bl); // number of gws in the group
@@ -528,6 +533,11 @@ inline void encode(const NvmeGwMonStates& gws, ceph::bufferlist &bl,
       gw.second.addr_vect.encode(bl, features);
       encode(gw.second.beacon_index, bl);
     }
+    if (version >= 4) {
+      encode((int)gw.second.gw_admin_state, bl);
+      dout(10) << "encode location " << gw.second.location << dendl;
+      encode(gw.second.location, bl);
+    }
   }
   ENCODE_FINISH(bl);
 }
@@ -536,7 +546,7 @@ inline void decode(
   NvmeGwMonStates& gws, ceph::buffer::list::const_iterator &bl) {
   gws.clear();
   uint32_t num_created_gws;
-  DECODE_START(3, bl);
+  DECODE_START(4, bl);
   dout(20) << "decode NvmeGwMonStates. struct_v: " << struct_v << dendl;
   decode(num_created_gws, bl);
   dout(20) << "decode NvmeGwMonStates. num gws " << num_created_gws << dendl;
@@ -615,6 +625,14 @@ inline void decode(
       decode(gw_created.beacon_index, bl);
       dout(20) << "decoded beacon_index " << gw_created.beacon_index << dendl;
     }
+    if (struct_v >= 4) {
+      dout(20) << "decode admin state and location" << dendl;
+      int admin_state;
+      decode(admin_state, bl);
+      gw_created.gw_admin_state = (gw_admin_state_t)admin_state;
+      decode(gw_created.location, bl);
+      dout(20) << "decoded location " << gw_created.location << dendl;
+    }
     gws[gw_name] = gw_created;
   }
diff --git a/src/mon/NVMeofGwTypes.h b/src/mon/NVMeofGwTypes.h
index cd22dcbc4fe..91c6b4834b3 100755
--- a/src/mon/NVMeofGwTypes.h
+++ b/src/mon/NVMeofGwTypes.h
@@ -26,6 +26,7 @@
 #include "msg/msg_types.h"
 
 using NvmeGwId = std::string;
+using NvmeLocation = std::string;
 using NvmeGroupKey = std::pair<std::string, std::string>;
 using NvmeNqnId = std::string;
 using NvmeAnaGrpId = uint32_t;
@@ -53,6 +54,11 @@ enum class gw_availability_t {
   GW_DELETED
 };
 
+enum class gw_admin_state_t {
+  GW_ADMIN_ENABLED = 0,
+  GW_ADMIN_DISABLED,
+};
+
 enum class subsystem_change_t {
   SUBSYSTEM_ADDED,
   SUBSYSTEM_CHANGED,
@@ -167,6 +173,8 @@ struct NvmeGwMonState {
    * it from being overriden by new epochs in monitor's function create_pending -
    * function restore_pending_map_info is called for this purpose
    */
+  gw_admin_state_t gw_admin_state = gw_admin_state_t::GW_ADMIN_ENABLED;
+  std::string location = "";
  std::chrono::system_clock::time_point allow_failovers_ts =
    std::chrono::system_clock::now();
  std::chrono::system_clock::time_point last_gw_down_ts =
-- 
2.39.5
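
A quick usage sketch of the new commands, assuming a hypothetical gateway
"gw1" in pool "rbd" and group "group1" (the syntax follows the MonCommands.h
entries added above; the location value "dc1" is an arbitrary label):

  # administratively disable the GW: the monitor treats it as unavailable
  # and fails its ANA groups over, preferring GWs in the same location
  ceph nvme-gw disable gw1 rbd group1

  # re-enable the GW so it becomes eligible for ANA group ownership again
  ceph nvme-gw enable gw1 rbd group1

  # tag the GW with a location; the location-aware failover first looks
  # for candidates whose location matches that of the failed ANA group
  ceph nvme-gw set-locale gw1 rbd group1 dc1

  # location and admin-state now appear in the show output
  ceph nvme-gw show rbd group1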