]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph-ci.git/commitdiff
nvmeofgw: added support to nvmeof stretched cluster: wip-leonidc-23121-stretched-cluster
authorLeonid Chernin <leonidc@il.ibm.com>
Thu, 23 Oct 2025 05:48:24 +0000 (08:48 +0300)
committerLeonid Chernin <leonidc@il.ibm.com>
Tue, 23 Dec 2025 09:52:57 +0000 (11:52 +0200)
 note:
  GW commands added : set location and set admin state enable/disable
  added start-failback <location> command.

  failover logic is impacted by GW location
  implement GW admin commands  enable/disable
  added map  for location-failback-in-progress
  failback between locations happens only by monitor command
  implemented new ana-group relocation process used
  when inter-location failback command sent
  added upgrade rules

fixes: https://tracker.ceph.com/issues/74210

Signed-off-by: Leonid Chernin <leonidc@il.ibm.com>
src/mon/MonCommands.h
src/mon/NVMeofGwMap.cc
src/mon/NVMeofGwMap.h
src/mon/NVMeofGwMon.cc
src/mon/NVMeofGwSerialize.h
src/mon/NVMeofGwTypes.h

index ca9907c51e6d1f068c31cb2b2a22350f372dbcf5..484603e3d5e417b573c7bea6ac9f2e19c00748c5 100644 (file)
@@ -1459,6 +1459,35 @@ COMMAND("nvme-gw listeners"
        " show all nvmeof gateways listeners within (pool, group)",
        "mon", "r")
 
+COMMAND("nvme-gw enable"
+   " name=id,type=CephString"
+   " name=pool,type=CephString"
+   " name=group,type=CephString",
+   "administratively enables nvmeof gateway id for (pool, group)",
+   "mgr", "rw")
+
+COMMAND("nvme-gw disable"
+   " name=id,type=CephString"
+   " name=pool,type=CephString"
+   " name=group,type=CephString",
+   "administratively disables nvmeof gateway id for (pool, group)",
+   "mgr", "rw")
+
+COMMAND("nvme-gw set-location"
+   " name=id,type=CephString"
+   " name=pool,type=CephString"
+   " name=group,type=CephString"
+   " name=location,type=CephString",
+   "set location for nvmeof gateway id for (pool, group)",
+   "mgr", "rw")
+
+COMMAND("nvme-gw start-failback"
+  " name=pool,type=CephString"
+  " name=group,type=CephString"
+  " name=location,type=CephString",
+  "start failbacks for recovered location within (pool, group)",
+  "mgr", "rw")
+
 // these are tell commands that were implemented as CLI commands in
 // the broken pre-octopus way that we want to allow to work when a
 // monitor has upgraded to octopus+ but the monmap min_mon_release is
index 5d1a42089e4770515c121afba88ba211ed0739e8..03ca657e1262dc423d679a7bc430fc24b61649e6 100755 (executable)
@@ -259,6 +259,141 @@ int NVMeofGwMap::do_delete_gw(
   return -EINVAL;
 }
 
+int NVMeofGwMap::cfg_admin_state_change(const NvmeGwId &gw_id,
+        const NvmeGroupKey& group_key,
+        gw_admin_state_t state, bool &propose_pending, bool test)
+{
+  auto& gws_states = created_gws[group_key];
+  auto  gw_state = gws_states.find(gw_id);
+  if (gw_state != gws_states.end()) {
+    auto& st = gw_state->second;
+    if (state == gw_admin_state_t::GW_ADMIN_DISABLED) {
+      if (st.gw_admin_state == gw_admin_state_t::GW_ADMIN_ENABLED) {
+        dout(4) << "GW-id set admin Disabled " << group_key
+                << " " << gw_id << dendl;
+        if (st.availability == gw_availability_t::GW_AVAILABLE) {
+          skip_failovers_for_group(group_key, 5);
+          process_gw_map_gw_down(gw_id, group_key, propose_pending);
+        }
+        propose_pending = true;
+      }
+    } else if (state == gw_admin_state_t::GW_ADMIN_ENABLED) {
+      if (st.gw_admin_state == gw_admin_state_t::GW_ADMIN_DISABLED) {
+        dout(4) << "GW-id set admin Enabled " << group_key
+                << " " << gw_id << dendl;
+        propose_pending = true;
+      }
+    }
+    st.gw_admin_state = state;
+  } else {
+     dout(4) << "GW-id not created yet " << group_key << " " << gw_id << dendl;
+     return -EINVAL;
+  }
+  return 0;
+}
+
+bool NVMeofGwMap::validate_number_locations(int num_gws, int num_locations)
+{
+  return true; // This function may have a non-empty body in the next commit
+}
+
+int NVMeofGwMap::cfg_set_location(const NvmeGwId &gw_id,
+    const NvmeGroupKey& group_key,
+    std::string &location, bool &propose_pending, bool test) {
+ // validate that location differs from gw location
+  auto& gws_states = created_gws[group_key];
+  auto  gw_state = gws_states.find(gw_id);
+  std::set<std::string>  locations;
+  if (!HAVE_FEATURE(mon->get_quorum_con_features(), NVMEOF_BEACON_DIFF)) {
+    dout(4) << "Command is not allowed - feature is not installed"
+    << group_key << " " << gw_id << dendl;
+    return -EINVAL;
+  }
+  int num_gws = gws_states.size();
+  if (gw_state != gws_states.end()) {
+    auto& st = gw_state->second;
+    if (st.location == location) {
+      dout(4) << "GW-id same location is set " << group_key
+              << " " << gw_id << " " << location << dendl;
+      return 0;
+    } else {
+      bool last_gw = true;
+      for (auto& states: created_gws[group_key]) {
+        auto &state = states.second;
+        // calculate number set locations
+        locations.insert(state.location);
+        if (state.location == st.location && states.first != gw_id) {
+          last_gw = false;
+          break;
+        }
+      }
+      if (last_gw) { // this location would be removed so erase from set
+        locations.erase(st.location);
+      }
+      locations.insert(location);
+      dout(10) << "num GWs " << num_gws << " num set locations "
+              << locations.size() << dendl;
+      bool rc = validate_number_locations(num_gws, locations.size());
+      if (rc ==false) {
+        dout(4) << "Try to define invalid number of locations "
+        << locations.size() << dendl;
+        return -EINVAL;
+      }
+      if (last_gw) {
+       dout(4) << "remove location:last gw-id " << gw_id << " location "
+              << st.location << dendl;
+      }
+      st.location = location;
+      dout(10) << "GW-id  location is set " << group_key
+        << " " << gw_id << " " << location << dendl;
+      propose_pending = true;
+      return 0;
+    }
+  } else {
+    dout(4) << "GW-id not created yet " << group_key << " " << gw_id << dendl;
+    return -EINVAL;
+  }
+}
+
+int NVMeofGwMap::cfg_start_inter_location_failback(
+         const NvmeGroupKey& group_key,
+         std::string &location, bool &propose_pending) {
+  auto& gws_states = created_gws[group_key];
+  bool accept = false;
+  // for all the gateways of the subsystem
+  if (!HAVE_FEATURE(mon->get_quorum_con_features(), NVMEOF_BEACON_DIFF)) {
+    dout(4) << "Command is not allowed - feature is not installed"
+            << group_key << dendl;
+       return -EINVAL;
+  }
+  if (failbacks_in_progress.find(group_key) != failbacks_in_progress.end()) {
+     dout(4) << "command cannot be accepted since found active failback for a group "
+             << failbacks_in_progress[group_key] << dendl;
+     return -EEXIST;
+  }
+  for (auto& found_gw_state: gws_states) {
+    auto st = found_gw_state.second;
+    if (st.location == location) {
+      if(st.availability == gw_availability_t::GW_AVAILABLE &&
+        st.sm_state[st.ana_grp_id] == gw_states_per_group_t::GW_STANDBY_STATE) {
+        dout(10) << "command  accepted found gw in state " << st.availability
+                 << " ana grp state " << st.sm_state[st.ana_grp_id] << dendl;
+        accept = true;
+        break;
+      }
+    }
+  }
+  if (accept) {
+    failbacks_in_progress[group_key] = location;
+    propose_pending = true;
+    return 0;
+  } else {
+    dout(10) << "command  not accepted: not found AVAILABLE GW"
+                "with ANA grp in standby state" << dendl;
+    return -EINVAL;
+  }
+}
+
 void  NVMeofGwMap::gw_performed_startup(const NvmeGwId &gw_id,
       const NvmeGroupKey& group_key, bool &propose_pending)
 {
@@ -520,6 +655,7 @@ void NVMeofGwMap::handle_abandoned_ana_groups(bool& propose)
        find_failback_gw(gw_id, group_key, propose);
       }
     }
+    check_relocate_ana_groups(group_key, propose);
     if (propose) {
       validate_gw_map(group_key);
       increment_gw_epoch(group_key);
@@ -527,6 +663,123 @@ void NVMeofGwMap::handle_abandoned_ana_groups(bool& propose)
   }
 }
 
+void NVMeofGwMap::check_relocate_ana_groups(const NvmeGroupKey& group_key,
+         bool &propose) {
+  /* if location exists in failbacks_in_progress find all gws in location.
+   * add ana-grp of not Available gws to the list.
+   * if ana-grp is already active on some gw in location skip it
+   * for ana-grp in list make relocation.
+   * if all ana-grps in location active remove location from the map failbacks_in_progress
+  */
+  FailbackLocation location = "";
+  std::list<NvmeAnaGrpId>  reloc_list;
+  auto& gws_states = created_gws[group_key];
+  if (failbacks_in_progress.find(group_key) != failbacks_in_progress.end()) {
+    location = failbacks_in_progress[group_key];
+    uint32_t num_gw_in_location = 0;
+    uint32_t num_active_ana_in_location = 0;
+    for (auto& gw_state : gws_states) { // loop for GWs inside group-key
+      NvmeGwMonState& state = gw_state.second;
+      if (state.location == location) {
+        num_gw_in_location ++;
+        if (state.availability != gw_availability_t::GW_AVAILABLE) {
+          reloc_list.push_back(state.ana_grp_id);
+        } else { // in parallel check condition to complete failback-in-process
+          for (auto& state_it: state.sm_state) {
+            if (state_it.second == gw_states_per_group_t::GW_ACTIVE_STATE) {
+              num_active_ana_in_location ++;
+            }
+          }
+        }
+      }
+    }
+    if (num_gw_in_location == num_active_ana_in_location) {
+      failbacks_in_progress.erase(group_key); // All ana groups of location are in Active
+      dout(4) <<  "the location entry is erased "<< location
+          << " num_ana_groups in location " << num_gw_in_location
+          << " from the failbacks-in-progress of group " << group_key <<dendl;
+      propose = true;
+      return;
+    }
+    // for all ana groups in the list do relocate
+    for (auto& anagrp : reloc_list) {
+      for (auto& gw_state : gws_states) { // loop for GWs inside group-key
+        NvmeGwMonState& state = gw_state.second;
+        if (state.sm_state[anagrp] == gw_states_per_group_t::GW_ACTIVE_STATE) {
+          if (state.location == location) { // already relocated to the location
+            dout(10) << "ana " << anagrp << " already in " << location << dendl;
+            break;
+          } else { // try to relocate
+              dout(10) << "ana " << anagrp
+                 << " to relocate to " << location << dendl;
+              relocate_ana_grp(gw_state.first, group_key, anagrp,
+                    location, propose);
+          }
+        }
+      }
+    }
+  }
+}
+
+int NVMeofGwMap::relocate_ana_grp(const NvmeGwId &src_gw_id,
+  const NvmeGroupKey& group_key, NvmeAnaGrpId grpid, NvmeLocation& location,
+  bool &propose){
+  /* grpid should be in Active state on src_gw_id as precondition
+   * function tries to find the minimum loaded gw in location
+   * if found, relocate ana grp to it using regular failback FSM
+  */
+#define MIN_NUM_ANA_GROUPS 0xFFF
+  uint32_t min_num_ana_groups_in_gw = MIN_NUM_ANA_GROUPS;
+  NvmeGwId min_gw_id = "empty";
+  auto& gws_states = created_gws[group_key];
+
+  for (auto& gw_state : gws_states) { // loop for GWs inside group-key
+    NvmeGwMonState& state = gw_state.second;
+    uint32_t current_ana_groups_in_gw = 0;
+    if (state.location == location && state.availability ==
+        gw_availability_t::GW_AVAILABLE) {
+     // find minimum loaded gw in location to relocate the anagrp to it
+      for (auto& state_itr: state.sm_state) {
+        NvmeAnaGrpId anagrp = state_itr.first;
+        // if found some gw in intermediate state - exit the process
+        // need to calculate the correct loading of Active groups
+        if ( (state.sm_state[anagrp] !=
+           gw_states_per_group_t::GW_ACTIVE_STATE) &&
+            (state.sm_state[anagrp] !=
+                gw_states_per_group_t::GW_STANDBY_STATE)
+           ) {
+          dout(10) << "relocate: found gw in intermediate state "
+           << gw_state.first << " state " << state.sm_state[anagrp] << dendl;
+          return 0;
+        }
+        if (state.sm_state[anagrp] ==
+          gw_states_per_group_t::GW_ACTIVE_STATE) {
+          // how many ANA groups are handled by this GW
+          current_ana_groups_in_gw ++;
+          if (current_ana_groups_in_gw < min_num_ana_groups_in_gw ) {
+            min_num_ana_groups_in_gw =  current_ana_groups_in_gw;
+            min_gw_id = gw_state.first;
+          }
+        }
+      }
+    }
+  }
+  dout(10) << "found min loaded gw for relocate " << min_gw_id << " location "
+      << location << "min load " << min_num_ana_groups_in_gw << dendl;
+
+  if (min_num_ana_groups_in_gw < MIN_NUM_ANA_GROUPS) {
+    dout(4) << "relocate starts " << grpid << " location " << location << dendl;
+    gws_states[src_gw_id].sm_state[grpid] =
+       gw_states_per_group_t::GW_WAIT_FAILBACK_PREPARED;
+    // Add timestamp of start Failback preparation
+    start_timer(src_gw_id, group_key, grpid, 3);
+    gws_states[min_gw_id].sm_state[grpid] =
+       gw_states_per_group_t::GW_OWNER_WAIT_FAILBACK_PREPARED;
+    propose = true;
+  }
+  return 0;
+}
+
 void NVMeofGwMap::set_failover_gw_for_ANA_group(
   const NvmeGwId &failed_gw_id, const NvmeGroupKey& group_key,
   const NvmeGwId &gw_id, NvmeAnaGrpId ANA_groupid)
@@ -564,7 +817,18 @@ void NVMeofGwMap::find_failback_gw(
   auto& gws_states = created_gws[group_key];
   auto& gw_state = created_gws[group_key][gw_id];
   bool do_failback = false;
-  dout(10) << "Find failback GW for GW " << gw_id << dendl;
+  bool force_inter_location = false;
+  FailbackLocation location = "";
+
+  if (failbacks_in_progress.find(group_key) !=
+       failbacks_in_progress.end()) {
+    location = failbacks_in_progress[group_key];
+    if (gw_state.location == location) {
+      force_inter_location = true;
+    }
+  }
+  dout(10) << "Find failback GW for GW " << gw_id << "location "
+           << gw_state.location << dendl;
   for (auto& gw_state_it: gws_states) {
     auto& st = gw_state_it.second;
     // some other gw owns or owned the desired ana-group
@@ -599,7 +863,13 @@ void NVMeofGwMap::find_failback_gw(
       dout(10)  << "Found Failback GW " << failback_gw_id
                << " that previously took over the ANAGRP "
                << gw_state.ana_grp_id << " of the available GW "
-               << gw_id << dendl;
+               << gw_id << "location " << st.location << dendl;
+      if (st.location != gw_state.location && !force_inter_location ) {
+        //not allowed inter-location failbacks
+        dout(10) << "not allowed interlocation failbacks. GW "
+        << gw_id << dendl;
+        return;
+      }
       st.sm_state[gw_state.ana_grp_id] =
        gw_states_per_group_t::GW_WAIT_FAILBACK_PREPARED;
 
@@ -613,15 +883,69 @@ void NVMeofGwMap::find_failback_gw(
   }
 }
 
+
+int  NVMeofGwMap::find_failover_gw_logic(NvmeGwMonStates& gws_states, NvmeLocation& location,
+                   NvmeGwId& min_loaded_gw_id)
+{
+#define ILLEGAL_GW_ID " "
+#define MIN_NUM_ANA_GROUPS 0xFFF
+    int min_num_ana_groups_in_gw = MIN_NUM_ANA_GROUPS;
+    min_loaded_gw_id = ILLEGAL_GW_ID;
+    int current_ana_groups_in_gw = 0;
+    int num_busy = 0, num_gws = 0;
+    // for all the gateways of the subsystem
+    // find the gws  related to the same location as in anagrp
+    for (auto& found_gw_state: gws_states) {
+      auto st = found_gw_state.second;
+      if (st.availability == gw_availability_t::GW_AVAILABLE)
+           if (location == "" || st.location == location) {
+        num_gws ++;
+       current_ana_groups_in_gw = 0;
+       //for (auto& state_itr: created_gws[group_key][gw_id].sm_state) {
+       for (auto& state_itr: st.sm_state) {
+         NvmeAnaGrpId anagrp = state_itr.first;
+         if ((st.sm_state[anagrp] ==
+              gw_states_per_group_t::GW_OWNER_WAIT_FAILBACK_PREPARED) ||
+             (st.sm_state[anagrp] ==
+              gw_states_per_group_t::GW_WAIT_FAILBACK_PREPARED) ||
+             (st.sm_state[anagrp] ==
+              gw_states_per_group_t::GW_WAIT_BLOCKLIST_CMPL)) {
+           current_ana_groups_in_gw = 0xFFFF;
+           num_busy ++;
+           break; // dont take into account   GWs in the transitive state
+         } else if (st.sm_state[anagrp] ==
+                    gw_states_per_group_t::GW_ACTIVE_STATE) {
+            // how many ANA groups are handled by this GW
+           current_ana_groups_in_gw++;
+         }
+       }
+       if (min_num_ana_groups_in_gw > current_ana_groups_in_gw) {
+         min_num_ana_groups_in_gw = current_ana_groups_in_gw;
+         min_loaded_gw_id = found_gw_state.first;
+         dout(10) << "choose: gw-id  min_ana_groups " << min_loaded_gw_id
+                  << current_ana_groups_in_gw << " min "
+                  << min_num_ana_groups_in_gw << dendl;
+       }
+      }
+    }
+    if (min_loaded_gw_id !=ILLEGAL_GW_ID) { // some GW chosen
+      return 0;
+    } else if (num_busy) {
+      dout(4) << "some GWs are busy " << num_busy
+              << "num Available " << num_gws << dendl;
+      return -EBUSY;
+    } else {
+      dout(4) << "no GWs in Active state. num Available " << num_gws << dendl;
+      return -ENOENT;
+    }
+}
+
 void  NVMeofGwMap::find_failover_candidate(
   const NvmeGwId &gw_id, const NvmeGroupKey& group_key,
   NvmeAnaGrpId grpid, bool &propose_pending)
 {
   dout(10) << __func__<< " " << gw_id << dendl;
-#define ILLEGAL_GW_ID " "
-#define MIN_NUM_ANA_GROUPS 0xFFF
-  int min_num_ana_groups_in_gw = 0;
-  int current_ana_groups_in_gw = 0;
+  NvmeLocation ana_location = "";
   std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
   NvmeGwId min_loaded_gw_id = ILLEGAL_GW_ID;
   auto& gws_states = created_gws[group_key];
@@ -648,40 +972,19 @@ void  NVMeofGwMap::find_failover_candidate(
        gw_state->second.standby_state(grpid);
        return ;
       }
+      if (st.ana_grp_id == grpid) {
+        ana_location = st.location; // found original location of the ANA group
+        dout(10) << "Found location " << ana_location
+                 << " for anagrp " << grpid << dendl;
+      }
     }
     // Find a GW that takes over the ANA group(s)
-    min_num_ana_groups_in_gw = MIN_NUM_ANA_GROUPS;
-    min_loaded_gw_id = ILLEGAL_GW_ID;
-
-    // for all the gateways of the subsystem
-    for (auto& found_gw_state: gws_states) {
-      auto st = found_gw_state.second;
-      if (st.availability == gw_availability_t::GW_AVAILABLE) {
-       current_ana_groups_in_gw = 0;
-       for (auto& state_itr: created_gws[group_key][gw_id].sm_state) {
-         NvmeAnaGrpId anagrp = state_itr.first;
-         if ((st.sm_state[anagrp] ==
-              gw_states_per_group_t::GW_OWNER_WAIT_FAILBACK_PREPARED) ||
-             (st.sm_state[anagrp] ==
-              gw_states_per_group_t::GW_WAIT_FAILBACK_PREPARED) ||
-             (st.sm_state[anagrp] ==
-              gw_states_per_group_t::GW_WAIT_BLOCKLIST_CMPL)) {
-           current_ana_groups_in_gw = 0xFFFF;
-           break; // dont take into account   GWs in the transitive state
-         } else if (st.sm_state[anagrp] ==
-                    gw_states_per_group_t::GW_ACTIVE_STATE) {
-            // how many ANA groups are handled by this GW
-           current_ana_groups_in_gw++;
-         }
-       }
-       if (min_num_ana_groups_in_gw > current_ana_groups_in_gw) {
-         min_num_ana_groups_in_gw = current_ana_groups_in_gw;
-         min_loaded_gw_id = found_gw_state.first;
-         dout(10) << "choose: gw-id  min_ana_groups " << min_loaded_gw_id
-                  << current_ana_groups_in_gw << " min "
-                  << min_num_ana_groups_in_gw << dendl;
-       }
-      }
+    //Find GW among the GWs belong to the same location
+    int rc = find_failover_gw_logic(gws_states, ana_location, min_loaded_gw_id);
+    if (rc == -ENOENT) {
+      ana_location = ""; // looks at all GWs
+      dout(10) << "Find Failover GW -look at all Gateways in the pool/group" << dendl;
+      rc = find_failover_gw_logic(gws_states, ana_location, min_loaded_gw_id);
     }
     if (min_loaded_gw_id != ILLEGAL_GW_ID) {
       propose_pending = true;
@@ -910,7 +1213,8 @@ void NVMeofGwMap::fsm_handle_to_expired(
     for (auto& gw_state: created_gws[group_key]) {
       auto& st = gw_state.second;
       // group owner
-      if (st.ana_grp_id == grpid) {
+      if (st.sm_state[grpid] ==
+          gw_states_per_group_t::GW_OWNER_WAIT_FAILBACK_PREPARED) {
        grp_owner_found = true;
        if (st.availability == gw_availability_t::GW_AVAILABLE) {
          if (!(fbp_gw_state.last_gw_map_epoch_valid &&
@@ -924,34 +1228,29 @@ void NVMeofGwMap::fsm_handle_to_expired(
        }
        cancel_timer(gw_id, group_key, grpid);
        map_modified = true;
-       if ((st.sm_state[grpid] ==
-            gw_states_per_group_t::GW_OWNER_WAIT_FAILBACK_PREPARED) &&
-           (st.availability == gw_availability_t::GW_AVAILABLE)) {
-          // Previous failover GW  set to standby
-         fbp_gw_state.standby_state(grpid);
+       // unconditionaly previous failover GW  set to standby
+       fbp_gw_state.standby_state(grpid);
+       if (st.availability == gw_availability_t::GW_AVAILABLE) {
          st.active_state(grpid);
          dout(10)  << "Expired Failback-preparation timer from GW "
-                   << gw_id << " ANA groupId "<< grpid << dendl;
-         map_modified = true;
+                   << gw_id << " ANA groupId "<< grpid << dendl;
          break;
-       } else if ((st.sm_state[grpid] ==
-                   gw_states_per_group_t::GW_STANDBY_STATE) &&
-                  (st.availability == gw_availability_t::GW_AVAILABLE)) {
-          // GW failed during the persistency interval
-         st.standby_state(grpid);
-         dout(10)  << "Failback unsuccessfull. GW: " << gw_state.first
-                   << " becomes Standby for the ANA groupId " << grpid << dendl;
        }
-       fbp_gw_state.standby_state(grpid);
-       dout(10) << "Failback unsuccessfull GW: " << gw_id
-                << " becomes Standby for the ANA groupId " << grpid  << dendl;
-       map_modified = true;
-       break;
+       else {
+      st.standby_state(grpid);
+      dout(10) << "GW failed durind failback/relocation persistency interval"
+               << gw_state.first << dendl;
+    }
       }
     }
     if (grp_owner_found == false) {
-      // when  GW group owner is deleted the fbk gw is put to standby
-      dout(4) << "group owner not found " << grpid << " GW: " << gw_id << dendl;
+      cancel_timer(gw_id, group_key, grpid);
+      fbp_gw_state.standby_state(grpid);
+      dout(4) << "group owner/relocation target not found "
+              << grpid << " GW: " << gw_id << dendl;
+      dout(10) << "Failback unsuccessfull GW: " << gw_id
+               << " becomes Standby for the ANA groupId " << grpid  << dendl;
+      map_modified = true;
     }
   } else if (fbp_gw_state.sm_state[grpid] ==
             gw_states_per_group_t::GW_WAIT_BLOCKLIST_CMPL) {
index bfc45a009854eb178d5953422197e6aa3a9b5969..2cc11150a3ada19f9035cb3b432b27293be49926 100755 (executable)
@@ -61,6 +61,10 @@ public:
    * updating the map with a change affecting gws in group_key.
    */
   std::map<NvmeGroupKey, epoch_t> gw_epoch;
+  /* in stretched cluster configuration
+   * failbacks between locations do not happen automatically
+   * */
+  std::map<NvmeGroupKey, FailbackLocation> failbacks_in_progress;
 
   void to_gmap(std::map<NvmeGroupKey, NvmeGwMonClientStates>& Gmap) const;
   void track_deleting_gws(const NvmeGroupKey& group_key,
@@ -70,6 +74,12 @@ public:
   int cfg_add_gw(const NvmeGwId &gw_id, const NvmeGroupKey& group_key,
     uint64_t features);
   int cfg_delete_gw(const NvmeGwId &gw_id, const NvmeGroupKey& group_key);
+  int cfg_admin_state_change(const NvmeGwId &gw_id, const NvmeGroupKey& group_key,
+                 gw_admin_state_t state, bool &propose_pending, bool test = false);
+  int cfg_set_location(const NvmeGwId &gw_id, const NvmeGroupKey& group_key,
+                 std::string &location, bool &propose_pending, bool test = false);
+  int cfg_start_inter_location_failback(const NvmeGroupKey& group_key,
+          std::string &location, bool &propose_pending);
   void process_gw_map_ka(
     const NvmeGwId &gw_id, const NvmeGroupKey& group_key,
     epoch_t& last_osd_epoch,  bool &propose_pending);
@@ -146,6 +156,14 @@ private:
   void validate_gw_map(
     const NvmeGroupKey& group_key);
   void increment_gw_epoch(const NvmeGroupKey& group_key);
+  int find_failover_gw_logic(NvmeGwMonStates& gws_states,
+    NvmeLocation& location, NvmeGwId& min_loaded_gw_id);
+  bool validate_number_locations(int num_gws, int num_locations);
+  void check_relocate_ana_groups(const NvmeGroupKey& group_key,
+           bool &propose);
+  int relocate_ana_grp(const NvmeGwId &src_gw_id,
+   const NvmeGroupKey& group_key, NvmeAnaGrpId grpid,
+   NvmeLocation& location, bool &propose);
 
 public:
   int blocklist_gw(
@@ -156,7 +174,10 @@ public:
     using ceph::encode;
     uint8_t version = 1;
     if (HAVE_FEATURE(features, NVMEOFHAMAP)) {
-       version = 2;
+      version = 2;
+    }
+    if (HAVE_FEATURE(features, NVMEOF_BEACON_DIFF)) {
+      version = 3;
     }
     ENCODE_START(version, version, bl);
     encode(epoch, bl);// global map epoch
@@ -166,12 +187,15 @@ public:
     if (version >= 2) {
       encode(gw_epoch, bl);
     }
+    if (version >=3) {
+      encode(failbacks_in_progress, bl);
+    }
     ENCODE_FINISH(bl);
   }
 
   void decode(ceph::buffer::list::const_iterator &bl) {
     using ceph::decode;
-    DECODE_START(2, bl);
+    DECODE_START(3, bl);
 
     decode(epoch, bl);
     decode(created_gws, bl);
@@ -179,6 +203,9 @@ public:
     if (struct_v >= 2) {
       decode(gw_epoch, bl);
     }
+    if (struct_v >=3) {
+      decode(failbacks_in_progress, bl);
+    }
     DECODE_FINISH(bl);
   }
 
index b4c5113ff376d6264207cf8ac552fea6d1dbcff0..7ced136546c075795f4ef016f5f907fb6ddd87bb 100644 (file)
@@ -510,13 +510,18 @@ bool NVMeofGwMon::preprocess_command(MonOpRequestRef op)
        f->open_object_section("stat");
        f->dump_string("gw-id", gw_id);
        f->dump_unsigned("anagrp-id",state.ana_grp_id+1);
+       f->dump_string("location", state.location);
+       std::string admin_state = (state.gw_admin_state ==
+           gw_admin_state_t::GW_ADMIN_ENABLED) ? "ENABLED" : "DISABLED";
+       f->dump_string("admin state", admin_state);
        f->dump_unsigned("num-namespaces", num_ns[state.ana_grp_id+1]);
        f->dump_unsigned("performed-full-startup", state.performed_full_startup);
        std::stringstream  sstrm1;
        sstrm1 << state.availability;
        f->dump_string("Availability", sstrm1.str());
        uint32_t num_listeners = 0;
-       if (state.availability == gw_availability_t::GW_AVAILABLE) {
+       if ((state.availability == gw_availability_t::GW_AVAILABLE) ||
+           (state.availability == gw_availability_t::GW_CREATED)) {
          for (auto &subs: state.subsystems) {
            num_listeners += subs.listeners.size();
          }
@@ -669,18 +674,91 @@ bool NVMeofGwMon::prepare_command(MonOpRequestRef op)
       response = true;
     }
   }
+  else if (prefix == "nvme-gw enable" || prefix == "nvme-gw disable") {
 
+    std::string id, pool, group;
+    cmd_getval(cmdmap, "id", id);
+    cmd_getval(cmdmap, "pool", pool);
+    cmd_getval(cmdmap, "group", group);
+    auto group_key = std::make_pair(pool, group);
+    dout(10) << " id "<< id <<" pool "<< pool << " group "<< group
+             << " " << prefix << dendl;
+    gw_admin_state_t set =  (prefix == "nvme-gw enable") ?
+             gw_admin_state_t::GW_ADMIN_ENABLED :
+             gw_admin_state_t::GW_ADMIN_DISABLED;
+    bool propose = false;
+    rc = pending_map.cfg_admin_state_change(id, group_key, set, propose);
+    if (rc == -EINVAL) {
+      err = rc;
+      dout (4) << "Error: GW cannot be set to admin state " << id
+          << " " << pool << " " << group << "  rc " << rc << dendl;
+      sstrm.str("");
+    }
+    // propose pending would be generated by the PaxosService
+    if (rc == 0 && propose == true) {
+      response = true;
+    }
+  } else if (prefix == "nvme-gw set-location") {
+
+    std::string id, pool, group, location;
+    cmd_getval(cmdmap, "id", id);
+    cmd_getval(cmdmap, "pool", pool);
+    cmd_getval(cmdmap, "group", group);
+    cmd_getval(cmdmap, "location", location);
+    auto group_key = std::make_pair(pool, group);
+    dout(10) << " id "<< id <<" pool "<< pool << " group "<< group
+             <<" location "<< location << dendl;
+    bool propose = false;
+    rc = pending_map.cfg_set_location(id, group_key, location, propose);
+    if (rc == -EINVAL || rc == -EEXIST) {
+      err = rc;
+      dout (4) << "Error: GW cannot  set location " << id
+           << " " << pool << " " << group << "  rc " << rc << dendl;
+      sstrm.str("");
+      if (rc == -EEXIST) {
+        sstrm.str("old location removed but exist namespaces assigned to it");
+      }
+    }
+    // propose pending would be generated by the PaxosService
+    if (rc == 0 && propose == true) {
+      response = true;
+    }
+  } else if (prefix == "nvme-gw start-failback") {
+    std::string id, pool, group, location;
+    bool propose = false;
+    cmd_getval(cmdmap, "pool", pool);
+    cmd_getval(cmdmap, "group", group);
+    cmd_getval(cmdmap, "location", location);
+    auto group_key = std::make_pair(pool, group);
+    dout(10) << id <<" pool "<< pool << " group "<< group
+             <<" location "<< location << dendl;
+    rc = pending_map.cfg_start_inter_location_failback(group_key,
+                     location, propose);
+    if (rc == -EINVAL || rc == -EEXIST) {
+      err = rc;
+      sstrm.str("");
+      if (rc == -EEXIST) {
+        sstrm.str("command already set please wait until completed");
+      }
+      if (rc == EINVAL) {
+        sstrm.str("command cannot be executed");
+      }
+    }
+    if (rc == 0 && propose == true) {
+      response = true;
+    }
+  }
   getline(sstrm, rs);
   if (response == false) {
     if (err < 0 && rs.length() == 0) {
       rs = cpp_strerror(err);
       dout(10) << "Error command  err : "<< err  << " rs-len: "
-              << rs.length() <<  dendl;
+               << rs.length() <<  dendl;
     }
     mon.reply_command(op, err, rs, rdata, get_last_committed());
   } else {
     wait_for_commit(op, new Monitor::C_Command(mon, op, 0, rs,
-                                              get_last_committed() + 1));
+       get_last_committed() + 1));
   }
   return response;
 }
@@ -850,6 +928,9 @@ int NVMeofGwMon::apply_beacon(const NvmeGwId &gw_id, int gw_version,
   if (changed) {
     avail = gw_availability_t::GW_AVAILABLE;
   }
+  if (state.gw_admin_state ==gw_admin_state_t::GW_ADMIN_DISABLED) {
+    avail = gw_availability_t::GW_CREATED;
+  }
   if (gw_subs.size() == 0) {
       avail = gw_availability_t::GW_CREATED;
       dout(10) << "No-subsystems condition detected for GW " << gw_id <<dendl;
index 4ddbe18c9f5708c94c26643dd33662e5adf368e4..8da0121421b58b080e9032cd8b2b92f0a7b367a9 100755 (executable)
@@ -85,6 +85,22 @@ inline std::ostream& operator<<(
   return os;
 }
 
+inline std::ostream& operator<<(
+  std::ostream& os, const  gw_admin_state_t value) {
+  switch (value) {
+
+  case gw_admin_state_t:: GW_ADMIN_ENABLED:
+    os << "ADMIN_ENABLED";
+    break;
+  case gw_admin_state_t:: GW_ADMIN_DISABLED:
+    os << "ADMIN_DISABLED";
+    break;
+  default:
+    os << "Invalid " << (int)value << " ";
+  }
+  return os;
+}
+
 inline std::ostream& operator<<(std::ostream& os, const SmState value) {
   os << "SM_STATE [ ";
   for (auto& state_itr: value ) {
@@ -188,7 +204,8 @@ inline std::ostream& print_gw_created_t(
     os << " " << state_itr.first <<": " << state_itr.second << ",";
   }
   os << "]\n"<< MODULE_PREFFIX << " entity-addr : " << value.addr_vect
-     << " availability " << value.availability
+     << " availability " << value.availability << " location " << value.location
+     << " admin state " << value.gw_admin_state
      << " full-startup " << value.performed_full_startup  << " ]";
 
   return os;
@@ -235,6 +252,10 @@ inline std::ostream& operator<<(std::ostream& os, const NVMeofGwMap value) {
     os <<  "\n" <<  MODULE_PREFFIX  << "{ " << group_gws.first
        << " } -> GW epoch: " << group_gws.second << " }";
   }
+  for (auto& group_gws: value.failbacks_in_progress) {
+    os <<  "\n" <<  MODULE_PREFFIX  << "{ " << group_gws.first
+       << " } -> failback-to: " << group_gws.second << " }";
+  }
   for (auto& group_gws: value.created_gws) {
    os <<  "\n" <<  MODULE_PREFFIX  << "{ " << group_gws.first
       << " } -> { " << group_gws.second << " }";
@@ -335,10 +356,12 @@ inline  void decode(
   decode(state.gw_map_epoch, bl);
   decode(state.subsystems, bl);
   uint32_t avail;
+  uint64_t last_beacon_seq_number;
   decode(avail, bl);
   state.availability = (gw_availability_t)avail;
   if (struct_v >= 2) {
-    decode(state.last_beacon_seq_number, bl);
+    decode(last_beacon_seq_number, bl);
+    state.last_beacon_seq_number = last_beacon_seq_number;
     decode(state.last_beacon_seq_ooo, bl);
   }
   DECODE_FINISH(bl);
@@ -474,6 +497,9 @@ inline void encode(const NvmeGwMonStates& gws,  ceph::bufferlist &bl,
   if (HAVE_FEATURE(features, NVMEOFHAMAP)) {
     version = 3;
   }
+  if (HAVE_FEATURE(features, NVMEOF_BEACON_DIFF)) {
+    version = 4;
+  }
   ENCODE_START(version, version, bl);
   dout(20) << "encode NvmeGwMonStates. struct_v: " << (int)version << dendl;
   encode ((uint32_t)gws.size(), bl); // number of gws in the group
@@ -526,6 +552,11 @@ inline void encode(const NvmeGwMonStates& gws,  ceph::bufferlist &bl,
       gw.second.addr_vect.encode(bl, features);
       encode(gw.second.beacon_index, bl);
     }
+    if (version >= 4) {
+      encode((int)gw.second.gw_admin_state, bl);
+      dout(10) << "encode location " << gw.second.location << dendl;
+      encode(gw.second.location, bl);
+    }
   }
   ENCODE_FINISH(bl);
 }
@@ -534,7 +565,7 @@ inline void decode(
   NvmeGwMonStates& gws, ceph::buffer::list::const_iterator &bl) {
   gws.clear();
   uint32_t num_created_gws;
-  DECODE_START(3, bl);
+  DECODE_START(4, bl);
   dout(20) << "decode NvmeGwMonStates. struct_v: " << struct_v << dendl;
   decode(num_created_gws, bl);
   dout(20) << "decode NvmeGwMonStates. num gws  " << num_created_gws << dendl;
@@ -613,6 +644,14 @@ inline void decode(
       decode(gw_created.beacon_index, bl);
       dout(20) << "decoded beacon_index " << gw_created.beacon_index << dendl;
     }
+    if (struct_v >= 4) {
+      dout(20) << "decode admin state and location" << dendl;
+      int admin_state;
+      decode(admin_state, bl);
+      gw_created.gw_admin_state = (gw_admin_state_t)admin_state;
+      decode(gw_created.location, bl);
+      dout(20) << "decoded location " << gw_created.location << dendl;
+    }
 
     gws[gw_name] = gw_created;
   }
@@ -661,6 +700,39 @@ inline void decode(std::map<NvmeGroupKey, epoch_t>& gw_epoch,
   DECODE_FINISH(bl);
 }
 
+// Serialize the per-group failback-destination map.
+// Wire format (struct v1): u32 element count, then for each entry the
+// pool name, the group name and the target location, in map order.
+inline void encode(
+    const  std::map<NvmeGroupKey, FailbackLocation> &failbacks_in_progress,
+    ceph::bufferlist &bl) {
+  ENCODE_START(1, 1, bl);
+  encode((uint32_t)failbacks_in_progress.size(), bl); // number of groups
+  for (const auto& [group_key, location] : failbacks_in_progress) {
+    encode(group_key.first, bl);   // pool
+    encode(group_key.second, bl);  // group
+    encode(location, bl);          // failback destination
+  }
+  ENCODE_FINISH(bl);
+}
+
+// Deserialize the per-group failback-destination map written by the
+// matching encode() above; any previous contents are discarded.
+inline void decode(
+    std::map<NvmeGroupKey, FailbackLocation> &failbacks_in_progress,
+    ceph::buffer::list::const_iterator &bl) {
+  failbacks_in_progress.clear();
+  DECODE_START(1, bl);
+  uint32_t ngroups;
+  decode(ngroups, bl);
+  for (uint32_t i = 0; i < ngroups; ++i) {
+    std::string pool;
+    std::string group;
+    FailbackLocation location;
+    decode(pool, bl);      // pool name
+    decode(group, bl);     // group name
+    decode(location, bl);  // failback destination
+    failbacks_in_progress[{pool, group}] = location;
+  }
+  DECODE_FINISH(bl);
+}
+
+
 inline void encode(
   const std::map<NvmeGroupKey, NvmeGwMonStates>& created_gws,
   ceph::bufferlist &bl, uint64_t features) {
index cd22dcbc4fe6a4250295e8fb23ba017580d51063..940107f2006771604995644c509622a7e56bf505 100755 (executable)
@@ -26,6 +26,8 @@
 #include "msg/msg_types.h"
 
 using NvmeGwId = std::string;
+// Location a (pool, group) is failing back to; set by the
+// "nvme-gw start-failback <location>" monitor command.
+using FailbackLocation = std::string;
+// Site/location label assigned to a GW via "nvme-gw set-location".
+using NvmeLocation = std::string;
 using NvmeGroupKey = std::pair<std::string, std::string>;
 using NvmeNqnId = std::string;
 using NvmeAnaGrpId = uint32_t;
@@ -53,6 +55,11 @@ enum class gw_availability_t {
   GW_DELETED
 };
 
+// Administrative state of a gateway, driven by the "nvme-gw enable" /
+// "nvme-gw disable" monitor commands; a disabled GW is reported as
+// GW_CREATED regardless of its beacons (see apply_beacon).
+enum class gw_admin_state_t {
+  GW_ADMIN_ENABLED = 0,   // default: GW participates in the group normally
+  GW_ADMIN_DISABLED,      // operator has taken the GW out of service
+};
+
 enum class subsystem_change_t {
   SUBSYSTEM_ADDED,
   SUBSYSTEM_CHANGED,
@@ -167,6 +174,8 @@ struct NvmeGwMonState {
    * it from being overriden by new epochs in monitor's function create_pending -
    * function restore_pending_map_info is called for this purpose
   */
+  gw_admin_state_t gw_admin_state = gw_admin_state_t::GW_ADMIN_ENABLED;
+  std::string location = "";
   std::chrono::system_clock::time_point allow_failovers_ts =
              std::chrono::system_clock::now();
   std::chrono::system_clock::time_point last_gw_down_ts =