git.apps.os.sepia.ceph.com Git - ceph-ci.git/commitdiff
2 commands added: set location and admin state, wip-leonidc2710-stretched-cluster
author Leonid Chernin <leonidc@il.ibm.com>
Thu, 23 Oct 2025 05:48:24 +0000 (08:48 +0300)
committer Leonid Chernin <leonidc@il.ibm.com>
Mon, 27 Oct 2025 14:47:31 +0000 (16:47 +0200)
1. Failover logic incorporates GW location
2. Implement GW admin commands enable/disable

Signed-off-by: Leonid Chernin <leonidc@il.ibm.com>
src/mon/MonCommands.h
src/mon/NVMeofGwMap.cc
src/mon/NVMeofGwMap.h
src/mon/NVMeofGwMon.cc
src/mon/NVMeofGwSerialize.h
src/mon/NVMeofGwTypes.h

index 504619bd2bbc5e68f1e5ae5bb3d6f3d83adb1c56..302453b6059be0a0001cf740982845e0f42b4712 100644 (file)
@@ -1453,6 +1453,29 @@ COMMAND("nvme-gw show"
    " show nvmeof gateways within (pool, group)",
    "mon", "r")
 
+COMMAND("nvme-gw enable"
+   " name=id,type=CephString"
+   " name=pool,type=CephString"
+   " name=group,type=CephString",
+   "administratively enables nvmeof gateway id for (pool, group)",
+   "mgr", "rw")
+
+COMMAND("nvme-gw disable"
+   " name=id,type=CephString"
+   " name=pool,type=CephString"
+   " name=group,type=CephString",
+   "administratively disables nvmeof gateway id for (pool, group)",
+   "mgr", "rw")
+
+COMMAND("nvme-gw set-locale"
+   " name=id,type=CephString"
+   " name=pool,type=CephString"
+   " name=group,type=CephString"
+   " name=locale,type=CephString",
+   "set location for nvmeof gateway id for (pool, group)",
+   "mgr", "rw")
+
 // these are tell commands that were implemented as CLI commands in
 // the broken pre-octopus way that we want to allow to work when a
 // monitor has upgraded to octopus+ but the monmap min_mon_release is
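
Review note: once the entries above land, the new commands are issued through the
standard ceph CLI wrapping of monitor commands. A minimal usage sketch, with
hypothetical gateway/pool/group names:

  ceph nvme-gw disable gw1 rbd group1           # administratively disable gw1
  ceph nvme-gw enable gw1 rbd group1            # re-enable it
  ceph nvme-gw set-locale gw1 rbd group1 dc-A   # record gw1's location as "dc-A"

The positional argument order follows the name=... parameter order declared in
each COMMAND entry.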
index 964946cda3d2bc0ded76e834539c64f5f43990b3..795b6940b2205759b3a3f1ed4102cc6c74390d44 100755 (executable)
@@ -259,6 +259,65 @@ int NVMeofGwMap::do_delete_gw(
   return -EINVAL;
 }
 
+int NVMeofGwMap::cfg_admin_state_change(const NvmeGwId &gw_id,
+    const NvmeGroupKey& group_key,
+    gw_admin_state_t state, bool &propose_pending, bool test)
+{
+  auto& gws_states = created_gws[group_key];
+  auto gw_state = gws_states.find(gw_id);
+  if (gw_state == gws_states.end()) {
+    dout(4) << "GW-id not created yet " << group_key << " " << gw_id << dendl;
+    return -EINVAL;
+  }
+  auto& st = gw_state->second;
+  if (state == gw_admin_state_t::GW_ADMIN_DISABLED) {
+    if (st.gw_admin_state == gw_admin_state_t::GW_ADMIN_ENABLED) {
+      dout(4) << "GW-id set admin Disabled " << group_key
+              << " " << gw_id << dendl;
+      if (st.availability == gw_availability_t::GW_AVAILABLE) {
+        // treat an admin disable of a live GW like a failure, but
+        // postpone automatic failovers in this group for 5 seconds
+        skip_failovers_for_group(group_key, 5);
+        process_gw_map_gw_down(gw_id, group_key, propose_pending);
+      }
+      propose_pending = true;
+    }
+  } else if (state == gw_admin_state_t::GW_ADMIN_ENABLED) {
+    if (st.gw_admin_state == gw_admin_state_t::GW_ADMIN_DISABLED) {
+      dout(4) << "GW-id set admin Enabled " << group_key
+              << " " << gw_id << dendl;
+      propose_pending = true;
+    }
+  }
+  st.gw_admin_state = state;
+  return 0;
+}
+
+int NVMeofGwMap::cfg_set_location(const NvmeGwId &gw_id,
+    const NvmeGroupKey& group_key,
+    std::string &location, bool &propose_pending, bool test)
+{
+  auto& gws_states = created_gws[group_key];
+  auto gw_state = gws_states.find(gw_id);
+  if (gw_state == gws_states.end()) {
+    dout(4) << "GW-id not created yet " << group_key << " " << gw_id << dendl;
+    return -EINVAL;
+  }
+  auto& st = gw_state->second;
+  // no-op if the requested location matches the GW's current location
+  if (st.location == location) {
+    dout(4) << "GW-id location already set " << group_key
+            << " " << gw_id << " " << location << dendl;
+    return 0;
+  }
+  st.location = location;
+  dout(10) << "GW-id location is set " << group_key
+           << " " << gw_id << " " << location << dendl;
+  propose_pending = true;
+  return 0;
+}
+
 void  NVMeofGwMap::gw_performed_startup(const NvmeGwId &gw_id,
       const NvmeGroupKey& group_key, bool &propose_pending)
 {
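
Review note: the two new cfg_* entry points share the same contract: they return
-EINVAL for a GW that was never created in the (pool, group), and set
propose_pending only when they actually change state. A minimal sketch of that
contract (hypothetical test harness, assuming NVMeofGwMap can be driven
standalone in the test=true mode that cfg_add_gw already supports):

  NVMeofGwMap map;
  NvmeGroupKey key = std::make_pair("rbd", "group1");  // hypothetical names
  bool propose = false;

  map.cfg_add_gw("gw1", key, true /* test */);

  int rc = map.cfg_admin_state_change(
      "gw1", key, gw_admin_state_t::GW_ADMIN_DISABLED, propose, true);
  // rc == 0, propose == true: ENABLED -> DISABLED is a real change

  propose = false;
  std::string loc = "dc-A";
  rc = map.cfg_set_location("gw1", key, loc, propose, true);
  // rc == 0, propose == true; repeating the identical call is a no-op

  propose = false;
  rc = map.cfg_admin_state_change(
      "gw2", key, gw_admin_state_t::GW_ADMIN_DISABLED, propose, true);
  // rc == -EINVAL: "gw2" was never created in this (pool, group)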
@@ -613,15 +672,70 @@ void NVMeofGwMap::find_failback_gw(
   }
 }
 
+
+int NVMeofGwMap::find_failover_gw_logic(NvmeGwMonStates& gws_states,
+    NvmeLocation& location, NvmeGwId& min_loaded_gw_id)
+{
+#define ILLEGAL_GW_ID " "
+#define MIN_NUM_ANA_GROUPS 0xFFF
+  int min_num_ana_groups_in_gw = MIN_NUM_ANA_GROUPS;
+  min_loaded_gw_id = ILLEGAL_GW_ID;
+  int current_ana_groups_in_gw = 0;
+  int num_busy = 0, num_gws = 0;
+  // among all the gateways of the subsystem, consider only those that are
+  // available and located in the requested location (an empty location
+  // matches any gateway)
+  for (auto& found_gw_state: gws_states) {
+    auto st = found_gw_state.second;
+    if (st.availability == gw_availability_t::GW_AVAILABLE &&
+        (location == "" || st.location == location)) {
+      num_gws++;
+      current_ana_groups_in_gw = 0;
+      for (auto& state_itr: st.sm_state) {
+        NvmeAnaGrpId anagrp = state_itr.first;
+        if ((st.sm_state[anagrp] ==
+             gw_states_per_group_t::GW_OWNER_WAIT_FAILBACK_PREPARED) ||
+            (st.sm_state[anagrp] ==
+             gw_states_per_group_t::GW_WAIT_FAILBACK_PREPARED) ||
+            (st.sm_state[anagrp] ==
+             gw_states_per_group_t::GW_WAIT_BLOCKLIST_CMPL)) {
+          current_ana_groups_in_gw = 0xFFFF;
+          num_busy++;
+          break; // don't take into account GWs in a transitive state
+        } else if (st.sm_state[anagrp] ==
+                   gw_states_per_group_t::GW_ACTIVE_STATE) {
+          // how many ANA groups are handled by this GW
+          current_ana_groups_in_gw++;
+        }
+      }
+      if (min_num_ana_groups_in_gw > current_ana_groups_in_gw) {
+        min_num_ana_groups_in_gw = current_ana_groups_in_gw;
+        min_loaded_gw_id = found_gw_state.first;
+        dout(10) << "choose: gw-id " << min_loaded_gw_id
+                 << " ana_groups " << current_ana_groups_in_gw
+                 << " min " << min_num_ana_groups_in_gw << dendl;
+      }
+    }
+  }
+  if (min_loaded_gw_id != ILLEGAL_GW_ID) { // some GW was chosen
+    return 0;
+  } else if (num_busy) {
+    dout(4) << "some GWs are busy " << num_busy
+            << " num Available " << num_gws << dendl;
+    return -EBUSY;
+  } else {
+    dout(4) << "no GWs in Active state, num Available " << num_gws << dendl;
+    return -ENOENT;
+  }
+}
+
 void  NVMeofGwMap::find_failover_candidate(
   const NvmeGwId &gw_id, const NvmeGroupKey& group_key,
   NvmeAnaGrpId grpid, bool &propose_pending)
 {
   dout(10) << __func__<< " " << gw_id << dendl;
-#define ILLEGAL_GW_ID " "
-#define MIN_NUM_ANA_GROUPS 0xFFF
-  int min_num_ana_groups_in_gw = 0;
-  int current_ana_groups_in_gw = 0;
+  NvmeLocation ana_location = "";
   std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
   NvmeGwId min_loaded_gw_id = ILLEGAL_GW_ID;
   auto& gws_states = created_gws[group_key];
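
Review note: the extracted helper scans the AVAILABLE GWs whose location matches
the requested one (an empty location matches every GW) and picks the one owning
the fewest ANA groups in GW_ACTIVE_STATE; any GW with a group in a
WAIT_FAILBACK_PREPARED or WAIT_BLOCKLIST_CMPL transitive state is disqualified
for this round. For example, if gw-a and gw-b in the requested location
currently own 3 and 1 active groups, gw-b is chosen; if every candidate is in a
transitive state the helper returns -EBUSY (retry on a later tick), and -ENOENT
if no candidate matched at all.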
@@ -648,40 +762,19 @@ void  NVMeofGwMap::find_failover_candidate(
        gw_state->second.standby_state(grpid);
        return ;
       }
+      if (st.ana_grp_id == grpid) {
+        ana_location = st.location; // original location of this ANA group
+        dout(10) << "Found location " << ana_location
+                 << " for anagrp " << grpid << dendl;
+      }
     }
     // Find a GW that takes over the ANA group(s)
-    min_num_ana_groups_in_gw = MIN_NUM_ANA_GROUPS;
-    min_loaded_gw_id = ILLEGAL_GW_ID;
-
-    // for all the gateways of the subsystem
-    for (auto& found_gw_state: gws_states) {
-      auto st = found_gw_state.second;
-      if (st.availability == gw_availability_t::GW_AVAILABLE) {
-       current_ana_groups_in_gw = 0;
-       for (auto& state_itr: created_gws[group_key][gw_id].sm_state) {
-         NvmeAnaGrpId anagrp = state_itr.first;
-         if ((st.sm_state[anagrp] ==
-              gw_states_per_group_t::GW_OWNER_WAIT_FAILBACK_PREPARED) ||
-             (st.sm_state[anagrp] ==
-              gw_states_per_group_t::GW_WAIT_FAILBACK_PREPARED) ||
-             (st.sm_state[anagrp] ==
-              gw_states_per_group_t::GW_WAIT_BLOCKLIST_CMPL)) {
-           current_ana_groups_in_gw = 0xFFFF;
-           break; // dont take into account   GWs in the transitive state
-         } else if (st.sm_state[anagrp] ==
-                    gw_states_per_group_t::GW_ACTIVE_STATE) {
-            // how many ANA groups are handled by this GW
-           current_ana_groups_in_gw++;
-         }
-       }
-       if (min_num_ana_groups_in_gw > current_ana_groups_in_gw) {
-         min_num_ana_groups_in_gw = current_ana_groups_in_gw;
-         min_loaded_gw_id = found_gw_state.first;
-         dout(10) << "choose: gw-id  min_ana_groups " << min_loaded_gw_id
-                  << current_ana_groups_in_gw << " min "
-                  << min_num_ana_groups_in_gw << dendl;
-       }
-      }
+    // prefer a GW that belongs to the same location as the ANA group
+    int rc = find_failover_gw_logic(gws_states, ana_location, min_loaded_gw_id);
+    if (rc == -ENOENT) {
+      ana_location = ""; // fall back: consider all GWs
+      dout(10) << "Find failover GW - look at all gateways in the pool/group"
+               << dendl;
+      rc = find_failover_gw_logic(gws_states, ana_location, min_loaded_gw_id);
     }
     if (min_loaded_gw_id != ILLEGAL_GW_ID) {
       propose_pending = true;
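
Review note: the failover search is now two-pass. The first
find_failover_gw_logic call restricts candidates to ana_location, the location
recorded on the failed ANA group's original owner; only -ENOENT widens the
second call to every GW in the (pool, group). -EBUSY deliberately does not
trigger the fallback, so a stretched cluster keeps an ANA group inside its own
location while same-location GWs are merely in transitive states, e.g.
mid-failback.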
index 0b675f7beaf400712c9aa4b41ab3c5fbff2ce3a4..aecce7e003651613f13a6bc2e0675b636049be1f 100755 (executable)
@@ -70,6 +70,10 @@ public:
   int cfg_add_gw(const NvmeGwId &gw_id, const NvmeGroupKey& group_key,
     bool test = false);
   int cfg_delete_gw(const NvmeGwId &gw_id, const NvmeGroupKey& group_key);
+  int cfg_admin_state_change(const NvmeGwId &gw_id, const NvmeGroupKey& group_key,
+                 gw_admin_state_t state, bool &propose_pending, bool test = false);
+  int cfg_set_location(const NvmeGwId &gw_id, const NvmeGroupKey& group_key,
+                 std::string &location, bool &propose_pending, bool test = false);
   void process_gw_map_ka(
     const NvmeGwId &gw_id, const NvmeGroupKey& group_key,
     epoch_t& last_osd_epoch,  bool &propose_pending);
@@ -146,6 +150,8 @@ private:
   void validate_gw_map(
     const NvmeGroupKey& group_key);
   void increment_gw_epoch(const NvmeGroupKey& group_key);
+  int find_failover_gw_logic(NvmeGwMonStates& gws_states,
+    NvmeLocation& location, NvmeGwId& min_loaded_gw_id);
 
 public:
   int blocklist_gw(
index 2b029f8cf24646999fbd91d69a2e14b5099a5056..269472e8305d1dc79b0dc3853cbad522e26d07bb 100644 (file)
@@ -510,6 +510,8 @@ bool NVMeofGwMon::preprocess_command(MonOpRequestRef op)
        f->open_object_section("stat");
        f->dump_string("gw-id", gw_id);
        f->dump_unsigned("anagrp-id",state.ana_grp_id+1);
+       f->dump_string("location", state.location);
+       f->dump_unsigned("admin state", (uint32_t)state.gw_admin_state);
        f->dump_unsigned("num-namespaces", num_ns[state.ana_grp_id+1]);
        f->dump_unsigned("performed-full-startup", state.performed_full_startup);
        std::stringstream  sstrm1;
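
Review note: with the two added dump calls, a stat section in nvme-gw show
output would look roughly like this (illustrative values only):

  "gw-id": "gw1",
  "anagrp-id": 1,
  "location": "dc-A",
  "admin-state": 1,
  "num-namespaces": 4,

where admin-state is the raw gw_admin_state_t value (0 = enabled, 1 = disabled).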
@@ -605,13 +607,60 @@ bool NVMeofGwMon::prepare_command(MonOpRequestRef op)
       response = true;
     }
   }
+  else if (prefix == "nvme-gw enable" || prefix == "nvme-gw disable") {
 
+      std::string id, pool, group;
+      cmd_getval(cmdmap, "id", id);
+      cmd_getval(cmdmap, "pool", pool);
+      cmd_getval(cmdmap, "group", group);
+      auto group_key = std::make_pair(pool, group);
+      dout(10) << " id "<< id <<" pool "<< pool << " group "<< group
+                 << " " << prefix << dendl;
+      gw_admin_state_t set =  (prefix == "nvme-gw enable") ?
+                gw_admin_state_t::GW_ADMIN_ENABLED :
+             gw_admin_state_t::GW_ADMIN_DISABLED;
+      bool propose = false;
+      rc = pending_map.cfg_admin_state_change(id, group_key, set, propose);
+      if (rc == -EINVAL) {
+           err = rc;
+        dout (4) << "Error: GW cannot be set to admin state " << id
+          << " " << pool << " " << group << "  rc " << rc << dendl;
+        sstrm.str("");
+      }
+      // propose pending would be generated by the PaxosService
+      if (rc == 0 && propose == true) {
+        response = true;
+      }
+  }
+  else if (prefix == "nvme-gw set-locale") {
+
+    std::string id, pool, group, locale;
+    cmd_getval(cmdmap, "id", id);
+    cmd_getval(cmdmap, "pool", pool);
+    cmd_getval(cmdmap, "group", group);
+    cmd_getval(cmdmap, "locale", locale);
+    auto group_key = std::make_pair(pool, group);
+    dout(10) << " id "<< id <<" pool "<< pool << " group "<< group
+                   <<" locale "<< locale << dendl;
+    bool propose = false;
+    rc = pending_map.cfg_set_location(id, group_key, locale, propose);
+    if (rc == -EINVAL) {
+      err = rc;
+      dout (4) << "Error: GW cannot  set location " << id
+           << " " << pool << " " << group << "  rc " << rc << dendl;
+      sstrm.str("");
+    }
+    // propose pending would be generated by the PaxosService
+    if (rc == 0 && propose == true) {
+      response = true;
+    }
+  }
   getline(sstrm, rs);
   if (response == false) {
     if (err < 0 && rs.length() == 0) {
       rs = cpp_strerror(err);
       dout(10) << "Error command  err : "<< err  << " rs-len: "
-              << rs.length() <<  dendl;
+             << rs.length() <<  dendl;
     }
     mon.reply_command(op, err, rs, rdata, get_last_committed());
   } else {
@@ -786,6 +835,9 @@ int NVMeofGwMon::apply_beacon(const NvmeGwId &gw_id, int gw_version,
   if (changed) {
     avail = gw_availability_t::GW_AVAILABLE;
   }
+  // an administratively disabled GW never becomes AVAILABLE,
+  // regardless of its beacons
+  if (state.gw_admin_state == gw_admin_state_t::GW_ADMIN_DISABLED) {
+    avail = gw_availability_t::GW_CREATED;
+  }
   if (gw_subs.size() == 0) {
       avail = gw_availability_t::GW_CREATED;
       dout(10) << "No-subsystems condition detected for GW " << gw_id <<dendl;
index f9aabeb985dec39cc76106e24aa5b74d64f428cb..b75bebc31073d4c947852bb40e7e413ee8544f1b 100755 (executable)
@@ -337,10 +337,12 @@ inline  void decode(
   decode(state.gw_map_epoch, bl);
   decode(state.subsystems, bl);
   uint32_t avail;
+  uint64_t last_beacon_seq_number;
   decode(avail, bl);
   state.availability = (gw_availability_t)avail;
   if (struct_v >= 2) {
-    decode(state.last_beacon_seq_number, bl);
+    decode(last_beacon_seq_number, bl);
+    state.last_beacon_seq_number = last_beacon_seq_number;
     decode(state.last_beacon_seq_ooo, bl);
   }
   DECODE_FINISH(bl);
@@ -476,6 +478,9 @@ inline void encode(const NvmeGwMonStates& gws,  ceph::bufferlist &bl,
   if (HAVE_FEATURE(features, NVMEOFHAMAP)) {
     version = 3;
   }
+  if (HAVE_FEATURE(features, NVMEOF_BEACON_DIFF)) {
+    version = 4;
+  }
   ENCODE_START(version, version, bl);
   dout(20) << "encode NvmeGwMonStates. struct_v: " << (int)version << dendl;
   encode ((uint32_t)gws.size(), bl); // number of gws in the group
@@ -528,6 +533,11 @@ inline void encode(const NvmeGwMonStates& gws,  ceph::bufferlist &bl,
       gw.second.addr_vect.encode(bl, features);
       encode(gw.second.beacon_index, bl);
     }
+    if (version >= 4) {
+      encode((int)gw.second.gw_admin_state, bl);
+      dout(10) << "encode location " << gw.second.location << dendl;
+      encode(gw.second.location, bl);
+    }
   }
   ENCODE_FINISH(bl);
 }
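
Review note: struct version 4 is emitted only when the quorum advertises the
beacon-diff feature bit (NVMEOF_BEACON_DIFF, assumed to be introduced elsewhere
in this series), so mixed-version monitors keep exchanging version 3 maps
without the admin-state/location fields; the decoder below mirrors this with
its struct_v >= 4 guard.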
@@ -536,7 +546,7 @@ inline void decode(
   NvmeGwMonStates& gws, ceph::buffer::list::const_iterator &bl) {
   gws.clear();
   uint32_t num_created_gws;
-  DECODE_START(3, bl);
+  DECODE_START(4, bl);
   dout(20) << "decode NvmeGwMonStates. struct_v: " << struct_v << dendl;
   decode(num_created_gws, bl);
   dout(20) << "decode NvmeGwMonStates. num gws  " << num_created_gws << dendl;
@@ -615,6 +625,14 @@ inline void decode(
       decode(gw_created.beacon_index, bl);
       dout(20) << "decoded beacon_index " << gw_created.beacon_index << dendl;
     }
+    if (struct_v >= 4) {
+      dout(20) << "decode admin state and location" << dendl;
+      int admin_state;
+      decode(admin_state, bl);
+      gw_created.gw_admin_state = (gw_admin_state_t)admin_state;
+      decode(gw_created.location, bl);
+      dout(20) << "decoded location " << gw_created.location << dendl;
+    }
 
     gws[gw_name] = gw_created;
   }
index cd22dcbc4fe6a4250295e8fb23ba017580d51063..91c6b4834b3edc74845502995e71e577996fbbfc 100755 (executable)
@@ -26,6 +26,7 @@
 #include "msg/msg_types.h"
 
 using NvmeGwId = std::string;
+using NvmeLocation = std::string;
 using NvmeGroupKey = std::pair<std::string, std::string>;
 using NvmeNqnId = std::string;
 using NvmeAnaGrpId = uint32_t;
@@ -53,6 +54,11 @@ enum class gw_availability_t {
   GW_DELETED
 };
 
+enum class gw_admin_state_t {
+  GW_ADMIN_ENABLED = 0,
+  GW_ADMIN_DISABLED,
+};
+
 enum class subsystem_change_t {
   SUBSYSTEM_ADDED,
   SUBSYSTEM_CHANGED,
@@ -167,6 +173,8 @@ struct NvmeGwMonState {
    * it from being overriden by new epochs in monitor's function create_pending -
    * function restore_pending_map_info is called for this purpose
   */
+  gw_admin_state_t gw_admin_state = gw_admin_state_t::GW_ADMIN_ENABLED;
+  std::string location = "";
   std::chrono::system_clock::time_point allow_failovers_ts =
              std::chrono::system_clock::now();
   std::chrono::system_clock::time_point last_gw_down_ts =