]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph-ci.git/commitdiff
add structures for failback-in-progress to location wip-leonidc-0412-stretched-cluster
authorLeonid Chernin <leonidc@il.ibm.com>
Thu, 4 Dec 2025 05:04:17 +0000 (07:04 +0200)
committerLeonid Chernin <leonidc@il.ibm.com>
Thu, 4 Dec 2025 05:10:04 +0000 (07:10 +0200)
Signed-off-by: Leonid Chernin <leonidc@il.ibm.com>
src/mon/NVMeofGwMap.cc
src/mon/NVMeofGwMap.h
src/mon/NVMeofGwMon.cc
src/mon/NVMeofGwSerialize.h
src/mon/NVMeofGwTypes.h

index 94f56e85b1ef073457e53006dfe28b9554636c70..4c41ce24b2b67bbb0b48f7b0f65387089bd95d25 100755 (executable)
@@ -374,26 +374,40 @@ int NVMeofGwMap::cfg_start_inter_location_failback(
          const NvmeGroupKey& group_key,
          std::string &location, bool &propose_pending) {
   auto& gws_states = created_gws[group_key];
+  bool accept = false;
   // for all the gateways of the subsystem
-  for (auto& found_gw_state: gws_states) {
-    auto st = found_gw_state.second;
-    if (st.location == location) {
-      if(st.availability != gw_availability_t::GW_AVAILABLE ||
-         st.sm_state[st.ana_grp_id] != gw_states_per_group_t::GW_STANDBY_STATE) {
-         dout(4) << "command rejected found gw in state " << st.availability
-                         << " ana grp state " << st.sm_state[st.ana_grp_id] << dendl;
-            return -EINVAL;
-         }
-    }
+  if (!HAVE_FEATURE(mon->get_quorum_con_features(), NVMEOF_BEACON_DIFF)) {
+       dout(4) << "Command is not allowed - feature is not installed, group "
+               << group_key << dendl;
+       return -EINVAL;
+  }
+  if (failbacks_in_progress.find(group_key) != failbacks_in_progress.end())
+  {
+     dout(4) << "command cannot be accepted since an active failback was found for group "
+             << failbacks_in_progress[group_key] << dendl;
+     return -EEXIST;
   }
   for (auto& found_gw_state: gws_states) {
     auto st = found_gw_state.second;
     if (st.location == location) {
-      auto gw_id = found_gw_state.first;
-      find_failback_gw(gw_id, group_key, propose_pending, true);
+      if(st.availability == gw_availability_t::GW_AVAILABLE &&
+         st.sm_state[st.ana_grp_id] == gw_states_per_group_t::GW_STANDBY_STATE) {
+         dout(10) << "command accepted, found gw in state " << st.availability
+                 << " ana grp state " << st.sm_state[st.ana_grp_id] << dendl;
+         accept = true;
+         break;
+      }
     }
   }
-  return 0;
+  if (accept) {
+    failbacks_in_progress[group_key] = location;
+    propose_pending = true;
+    return 0;
+  } else {
+    dout(10) << "command not accepted: no AVAILABLE GW found "
+                "with ANA grp in standby state" << dendl;
+    return -EINVAL;
+  }
 }
 
 void  NVMeofGwMap::gw_performed_startup(const NvmeGwId &gw_id,
@@ -676,6 +690,7 @@ void NVMeofGwMap::handle_abandoned_ana_groups(bool& propose)
        find_failback_gw(gw_id, group_key, propose);
       }
     }
+    check_relocate_ana_groups(group_key, propose);
     if (propose) {
       validate_gw_map(group_key);
       increment_gw_epoch(group_key);
@@ -683,6 +698,123 @@ void NVMeofGwMap::handle_abandoned_ana_groups(bool& propose)
   }
 }
 
+void NVMeofGwMap::check_relocate_ana_groups(const NvmeGroupKey& group_key,
+         bool &propose) {
+  /* if location exists in failbacks_in_progress find all gws in location.
+   * add ana-grp of not Available gws to the list.
+   * if ana-grp is already active on some gw in location skip it
+   * for ana-grp in list make relocation.
+   * if all ana-grps in location active remove location from the map failbacks_in_progress
+  */
+  FailbackLocation location = "";
+  std::list<NvmeAnaGrpId>  reloc_list;
+  auto& gws_states = created_gws[group_key];
+  if (failbacks_in_progress.find(group_key) != failbacks_in_progress.end()) {
+    location = failbacks_in_progress[group_key];
+    uint32_t num_gw_in_location = 0;
+    uint32_t num_active_ana_in_location = 0;
+    for (auto& gw_state : gws_states) { // loop for GWs inside group-key
+      NvmeGwMonState& state = gw_state.second;
+      if (state.location == location) {
+        num_gw_in_location ++;
+        if (state.availability != gw_availability_t::GW_AVAILABLE) {
+          reloc_list.push_back(state.ana_grp_id);
+        } else { // in parallel check condition to complete failback-in-process
+          for (auto& state_it: state.sm_state) {
+            if (state_it.second == gw_states_per_group_t::GW_ACTIVE_STATE) {
+              num_active_ana_in_location ++;
+            }
+          }
+        }
+      }
+    }
+    if (num_gw_in_location == num_active_ana_in_location) {
+      failbacks_in_progress.erase(group_key); // All ana groups of location are in Active
+      dout(4) <<  "the location entry is erased "<< location
+          << " num_ana_groups in location " << num_gw_in_location
+          << " from the failbacks-in-progress of group " << group_key <<dendl;
+      propose = true;
+      return;
+    }
+    // for all ana groups in the list do relocate
+    for (auto& anagrp : reloc_list) {
+      for (auto& gw_state : gws_states) { // loop for GWs inside group-key
+        NvmeGwMonState& state = gw_state.second;
+        if (state.sm_state[anagrp] == gw_states_per_group_t::GW_ACTIVE_STATE) {
+          if (state.location == location) { // already relocated to the location
+            dout(10) << "ana " << anagrp << " already in " << location << dendl;
+            break;
+          } else { // try to relocate
+              dout(10) << "ana " << anagrp
+                 << " to relocate to " << location << dendl;
+              relocate_ana_grp(gw_state.first, group_key, anagrp,
+                    location, propose);
+          }
+        }
+      }
+    }
+  }
+}
+
+int NVMeofGwMap::relocate_ana_grp(const NvmeGwId &src_gw_id,
+  const NvmeGroupKey& group_key, NvmeAnaGrpId grpid, NvmeLocation& location,
+  bool &propose){
+  /* grpid should be in Active state on src_gw_id as precondition
+   * function tries to find the minimum loaded gw in location
+   * if found, relocate ana grp to it using regular failback FSM
+   * */
+#define MIN_NUM_ANA_GROUPS 0xFFF
+  uint32_t min_num_ana_groups_in_gw = MIN_NUM_ANA_GROUPS;
+  NvmeGwId min_gw_id = "empty";
+  auto& gws_states = created_gws[group_key];
+
+  for (auto& gw_state : gws_states) { // loop for GWs inside group-key
+    NvmeGwMonState& state = gw_state.second;
+    uint32_t current_ana_groups_in_gw = 0;
+    if (state.location == location && state.availability ==
+        gw_availability_t::GW_AVAILABLE) {
+     // find minimum loaded gw in location to relocate the ana-grp to it
+      for (auto& state_itr: state.sm_state) {
+        NvmeAnaGrpId anagrp = state_itr.first;
+        //TODO - if found some gw in intermediate state - exit the process
+        // need to calculate the correct loading of Active groups
+        if ( (state.sm_state[anagrp] !=
+           gw_states_per_group_t::GW_ACTIVE_STATE) &&
+            (state.sm_state[anagrp] !=
+                gw_states_per_group_t::GW_STANDBY_STATE)
+           ) {
+          dout(10) << "relocation found gw in intermediate state "
+           << gw_state.first << " state " << state.sm_state[anagrp] << dendl;
+          return 0;
+        }
+        if (state.sm_state[anagrp] ==
+          gw_states_per_group_t::GW_ACTIVE_STATE) {
+          // how many ANA groups are handled by this GW
+          current_ana_groups_in_gw ++;
+          if (current_ana_groups_in_gw < min_num_ana_groups_in_gw ) {
+            min_num_ana_groups_in_gw =  current_ana_groups_in_gw;
+            min_gw_id = gw_state.first;
+          }
+        }
+      }
+    }
+  }
+  dout(10) << "found min loaded gw for relocate " << min_gw_id << " location "
+      << location << " min load " << min_num_ana_groups_in_gw << dendl;
+
+  if (min_num_ana_groups_in_gw < MIN_NUM_ANA_GROUPS) {
+    dout(4) << "relocate starts " << grpid << " location " << location << dendl;
+    gws_states[src_gw_id].sm_state[grpid] =
+       gw_states_per_group_t::GW_WAIT_FAILBACK_PREPARED;
+     // Add timestamp of start Failback preparation
+    start_timer(src_gw_id, group_key, grpid, 3);
+    gws_states[min_gw_id].sm_state[grpid] =
+       gw_states_per_group_t::GW_OWNER_WAIT_FAILBACK_PREPARED;
+    propose = true;
+  }
+  return 0;
+}
+
 void NVMeofGwMap::set_failover_gw_for_ANA_group(
   const NvmeGwId &failed_gw_id, const NvmeGroupKey& group_key,
   const NvmeGwId &gw_id, NvmeAnaGrpId ANA_groupid)
@@ -715,12 +847,21 @@ void NVMeofGwMap::set_failover_gw_for_ANA_group(
 }
 
 void NVMeofGwMap::find_failback_gw(
-  const NvmeGwId &gw_id, const NvmeGroupKey& group_key, bool &propose,
-  bool force_inter_location)
+  const NvmeGwId &gw_id, const NvmeGroupKey& group_key, bool &propose)
 {
   auto& gws_states = created_gws[group_key];
   auto& gw_state = created_gws[group_key][gw_id];
   bool do_failback = false;
+  bool force_inter_location = false;
+  FailbackLocation location = "";
+
+  if (failbacks_in_progress.find(group_key) !=
+       failbacks_in_progress.end()) {
+    location = failbacks_in_progress[group_key];
+    if (gw_state.location == location) {
+      force_inter_location = true;
+    }
+  }
   dout(10) << "Find failback GW for GW " << gw_id << "location "
            << gw_state.location << dendl;
   for (auto& gw_state_it: gws_states) {
@@ -1107,7 +1248,8 @@ void NVMeofGwMap::fsm_handle_to_expired(
     for (auto& gw_state: created_gws[group_key]) {
       auto& st = gw_state.second;
       // group owner
-      if (st.ana_grp_id == grpid) {
+      if (st.sm_state[grpid] ==
+          gw_states_per_group_t::GW_OWNER_WAIT_FAILBACK_PREPARED) {
        grp_owner_found = true;
        if (st.availability == gw_availability_t::GW_AVAILABLE) {
          if (!(fbp_gw_state.last_gw_map_epoch_valid &&
@@ -1121,34 +1263,29 @@ void NVMeofGwMap::fsm_handle_to_expired(
        }
        cancel_timer(gw_id, group_key, grpid);
        map_modified = true;
-       if ((st.sm_state[grpid] ==
-            gw_states_per_group_t::GW_OWNER_WAIT_FAILBACK_PREPARED) &&
-           (st.availability == gw_availability_t::GW_AVAILABLE)) {
-          // Previous failover GW  set to standby
-         fbp_gw_state.standby_state(grpid);
+       // unconditionally set previous failover GW to standby
+       fbp_gw_state.standby_state(grpid);
+       if (st.availability == gw_availability_t::GW_AVAILABLE) {
          st.active_state(grpid);
          dout(10)  << "Expired Failback-preparation timer from GW "
                    << gw_id << " ANA groupId "<< grpid << dendl;
-         map_modified = true;
          break;
-       } else if ((st.sm_state[grpid] ==
-                   gw_states_per_group_t::GW_STANDBY_STATE) &&
-                  (st.availability == gw_availability_t::GW_AVAILABLE)) {
-          // GW failed during the persistency interval
-         st.standby_state(grpid);
-         dout(10)  << "Failback unsuccessfull. GW: " << gw_state.first
-                   << " becomes Standby for the ANA groupId " << grpid << dendl;
        }
-       fbp_gw_state.standby_state(grpid);
-       dout(10) << "Failback unsuccessfull GW: " << gw_id
-                << " becomes Standby for the ANA groupId " << grpid  << dendl;
-       map_modified = true;
-       break;
+       else {
+          st.standby_state(grpid);
+          dout(10) << "GW failed during failback/relocation persistency interval "
+            << gw_state.first << dendl;
+        }
       }
     }
     if (grp_owner_found == false) {
-      // when  GW group owner is deleted the fbk gw is put to standby
-      dout(4) << "group owner not found " << grpid << " GW: " << gw_id << dendl;
+      cancel_timer(gw_id, group_key, grpid);
+      fbp_gw_state.standby_state(grpid);
+      dout(4) << "group owner/relocation target not found "
+         << grpid << " GW: " << gw_id << dendl;
+      dout(10) << "Failback unsuccessful GW: " << gw_id
+            << " becomes Standby for the ANA groupId " << grpid  << dendl;
+      map_modified = true;
     }
   } else if (fbp_gw_state.sm_state[grpid] ==
             gw_states_per_group_t::GW_WAIT_BLOCKLIST_CMPL) {
index 142d92a858f7b763ecae3211d9eee7ff0b524ad5..0456ddcb9ce8c4040dd527be2cd22a5e1cf632ba 100755 (executable)
@@ -61,6 +61,10 @@ public:
    * updating the map with a change affecting gws in group_key.
    */
   std::map<NvmeGroupKey, epoch_t> gw_epoch;
+  /* in stretched cluster configuration
+   * failbacks between locations do not happen automatically
+   * */
+  std::map<NvmeGroupKey, FailbackLocation> failbacks_in_progress;
 
   void to_gmap(std::map<NvmeGroupKey, NvmeGwMonClientStates>& Gmap) const;
   void track_deleting_gws(const NvmeGroupKey& group_key,
@@ -135,9 +139,8 @@ private:
   void find_failover_candidate(
     const NvmeGwId &gw_id, const NvmeGroupKey& group_key,
     NvmeAnaGrpId grpid, bool &propose_pending);
-  void find_failback_gw(
-    const NvmeGwId &gw_id, const NvmeGroupKey& group_key,
-    bool &propose_pending, bool force_inter_location = false);
+  void find_failback_gw(const NvmeGwId &gw_id, const NvmeGroupKey& group_key,
+    bool &propose_pending);
   void set_failover_gw_for_ANA_group(
     const NvmeGwId &failed_gw_id, const NvmeGroupKey& group_key,
     const NvmeGwId &gw_id, NvmeAnaGrpId groupid);
@@ -158,6 +161,11 @@ private:
   int find_failover_gw_logic(NvmeGwMonStates& gws_states,
     NvmeLocation& location, NvmeGwId& min_loaded_gw_id);
   bool validate_number_locations(int num_gws, int num_locations);
+  void check_relocate_ana_groups(const NvmeGroupKey& group_key,
+           bool &propose);
+  int relocate_ana_grp(const NvmeGwId &src_gw_id,
+   const NvmeGroupKey& group_key, NvmeAnaGrpId grpid,
+   NvmeLocation& location, bool &propose);
 
 public:
   int blocklist_gw(
@@ -168,7 +176,10 @@ public:
     using ceph::encode;
     uint8_t version = 1;
     if (HAVE_FEATURE(features, NVMEOFHAMAP)) {
-       version = 2;
+      version = 2;
+    }
+    if (HAVE_FEATURE(features, NVMEOF_BEACON_DIFF)) {
+      version = 3;
     }
     ENCODE_START(version, version, bl);
     encode(epoch, bl);// global map epoch
@@ -178,12 +189,15 @@ public:
     if (version >= 2) {
       encode(gw_epoch, bl);
     }
+    if (version >=3) {
+      encode(failbacks_in_progress, bl);
+    }
     ENCODE_FINISH(bl);
   }
 
   void decode(ceph::buffer::list::const_iterator &bl) {
     using ceph::decode;
-    DECODE_START(2, bl);
+    DECODE_START(3, bl);
 
     decode(epoch, bl);
     decode(created_gws, bl);
@@ -191,6 +205,9 @@ public:
     if (struct_v >= 2) {
       decode(gw_epoch, bl);
     }
+    if (struct_v >=3) {
+      decode(failbacks_in_progress, bl);
+    }
     DECODE_FINISH(bl);
   }
 
index 361f4f4120eefc8005c49c51d4d13d9e3df5240a..cd54637efbe2ae68e84ed4cf00741804a5e76821 100644 (file)
@@ -667,6 +667,19 @@ bool NVMeofGwMon::prepare_command(MonOpRequestRef op)
              <<" location "<< location << dendl;
     rc = pending_map.cfg_start_inter_location_failback(group_key,
                      location, propose);
+    if (rc == -EINVAL || rc == -EEXIST) {
+      err = rc;
+      sstrm.str("");
+      if (rc == -EEXIST) {
+        sstrm.str("command already set please wait until completed");
+      }
+      if (rc == -EINVAL) {
+        sstrm.str("command cannot be executed");
+      }
+    }
+    if (rc == 0 && propose == true) {
+      response = true;
+    }
   }
   getline(sstrm, rs);
   if (response == false) {
index b75bebc31073d4c947852bb40e7e413ee8544f1b..17b816dd14ec400cec582f0a14dcd520410f45cd 100755 (executable)
@@ -190,7 +190,7 @@ inline std::ostream& print_gw_created_t(
     os << " " << state_itr.first <<": " << state_itr.second << ",";
   }
   os << "]\n"<< MODULE_PREFFIX << " entity-addr : " << value.addr_vect
-     << " availability " << value.availability
+     << " availability " << value.availability << " location " << value.location
      << " full-startup " << value.performed_full_startup  << " ]";
 
   return os;
@@ -237,6 +237,10 @@ inline std::ostream& operator<<(std::ostream& os, const NVMeofGwMap value) {
     os <<  "\n" <<  MODULE_PREFFIX  << "{ " << group_gws.first
        << " } -> GW epoch: " << group_gws.second << " }";
   }
+  for (auto& group_gws: value.failbacks_in_progress) {
+    os <<  "\n" <<  MODULE_PREFFIX  << "{ " << group_gws.first
+       << " } -> failback-to: " << group_gws.second << " }";
+  }
   for (auto& group_gws: value.created_gws) {
    os <<  "\n" <<  MODULE_PREFFIX  << "{ " << group_gws.first
       << " } -> { " << group_gws.second << " }";
@@ -681,6 +685,39 @@ inline void decode(std::map<NvmeGroupKey, epoch_t>& gw_epoch,
   DECODE_FINISH(bl);
 }
 
+inline void encode(
+    const  std::map<NvmeGroupKey, FailbackLocation> &failbacks_in_progress,
+    ceph::bufferlist &bl) {
+  ENCODE_START(1, 1, bl);
+  encode ((uint32_t)failbacks_in_progress.size(), bl); // number of groups
+  for (auto& group_failbacks: failbacks_in_progress) {
+    auto& group_key = group_failbacks.first;
+    encode(group_key.first, bl); // pool
+    encode(group_key.second, bl); // group
+    encode(group_failbacks.second, bl);
+  }
+  ENCODE_FINISH(bl);
+}
+
+inline void decode(
+    std::map<NvmeGroupKey, FailbackLocation> &failbacks_in_progress,
+    ceph::buffer::list::const_iterator &bl) {
+  failbacks_in_progress.clear();
+  uint32_t ngroups;
+  DECODE_START(1, bl);
+  decode(ngroups, bl);
+  for(uint32_t i = 0; i<ngroups; i++){
+    std::string pool, group;
+    decode(pool, bl);
+    decode(group, bl);
+    FailbackLocation location;
+    decode(location, bl);
+    failbacks_in_progress[std::make_pair(pool, group)] = location;
+  }
+  DECODE_FINISH(bl);
+}
+
+
 inline void encode(
   const std::map<NvmeGroupKey, NvmeGwMonStates>& created_gws,
   ceph::bufferlist &bl, uint64_t features) {
index 91c6b4834b3edc74845502995e71e577996fbbfc..940107f2006771604995644c509622a7e56bf505 100755 (executable)
@@ -26,6 +26,7 @@
 #include "msg/msg_types.h"
 
 using NvmeGwId = std::string;
+using FailbackLocation = std::string;
 using NvmeLocation = std::string;
 using NvmeGroupKey = std::pair<std::string, std::string>;
 using NvmeNqnId = std::string;