]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
nvmeofgw: fix host issue during redeploy, improves previous redeploy fix 62712/head
authorLeonid Chernin <leonidc@il.ibm.com>
Sun, 6 Apr 2025 10:31:16 +0000 (13:31 +0300)
committerLeonid Chernin <leonidc@il.ibm.com>
Mon, 7 Apr 2025 15:47:49 +0000 (18:47 +0300)
  This commit fixes an issue where hosts might get stuck during redeploy due to
  many failovers/failbacks occurring within a very short time frame.
  This commit improves the previous redeploy fix, since it allows using a
  short beacon timeout - with no impact on the failover time.

Signed-off-by: Leonid Chernin <leonidc@il.ibm.com>
src/common/options/mon.yaml.in
src/mon/NVMeofGwMap.cc
src/mon/NVMeofGwMon.cc
src/mon/NVMeofGwTypes.h

index db2b6f6afccd9e241fac8bdebe434a3b836cab6f..dc8163ff48049af1226cd11ceb5c8660ef65148b 100644 (file)
@@ -77,14 +77,15 @@ options:
   level: advanced
   desc: Period in seconds from last beacon to monitor marking a  NVMeoF gateway as
     failed
-  default: 15
+  default: 10
   services:
   - mon
 - name: mon_nvmeofgw_skip_failovers_interval
   type: secs
   level: advanced
   desc: Period in seconds in which no failovers are performed in GW's pool-group
-  default: 12
+    this should be equal to the maximum GW redeploy interval
+  default: 16
   services:
   - mon
 - name: mon_nvmeofgw_set_group_id_retry
index 8a9113a3a26b67ec4ed736a102a9d0294fef840c..12bd93cef7466f59c90e26ab4b3ce1f53d34224b 100755 (executable)
@@ -234,9 +234,34 @@ int NVMeofGwMap::do_delete_gw(
 void  NVMeofGwMap::gw_performed_startup(const NvmeGwId &gw_id,
       const NvmeGroupKey& group_key, bool &propose_pending)
 {
+  std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
   dout(4) << "GW  performed the full startup " << gw_id << dendl;
   propose_pending = true;
   increment_gw_epoch( group_key);
+  auto &st = created_gws[group_key][gw_id];
+  const auto skip_failovers_sec = g_conf().get_val<std::chrono::seconds>
+    ("mon_nvmeofgw_skip_failovers_interval");
+  const auto beacon_grace_sec =
+    g_conf().get_val<std::chrono::seconds>("mon_nvmeofgw_beacon_grace");
+ /*
+    This is a heuristic that meant to identify "cephadm redeploy" of the nvmeof gws.
+    We would like to identify that redeploy is going on, because it helps us to prevent
+    redundant failover and failback actions.
+    It is very important to minimize fo/fb during redeploy, because during redeploy
+    all GWs go down and up again, and the amount of fo/fb that could be driven by that
+    is big, which also triggers a lot of changes on the hosts the are nvmeof connected
+    to the gws, even up to the point that the host will get stuck.
+    This heuristic assumes that if a gw disappears and shows back in less than
+    REDEPLOY_TIMEOUT seconds, then it might be that a redeploy started, so we will
+    do a failover for this GW, but will not do failover for the next REDEPLOY_TIMEOUT.
+    Then again for the next GW that disappears and so on.
+    If it works as designed, than regardless of the number of GWs, redeploy will only
+    cause one fo/fb. */
+  if ((now - (st.last_gw_down_ts - beacon_grace_sec)) < skip_failovers_sec) {
+    skip_failovers_for_group(group_key);
+    dout(4) << "startup: set skip-failovers for group " << gw_id << " group "
+                << group_key << dendl;
+  }
 }
 
 void NVMeofGwMap::increment_gw_epoch(const NvmeGroupKey& group_key)
@@ -332,6 +357,7 @@ int NVMeofGwMap::process_gw_map_gw_down(
     dout(10) << "GW down " << gw_id << dendl;
     auto& st = gw_state->second;
     st.set_unavailable_state();
+    st.set_last_gw_down_ts();
     for (auto& state_itr: created_gws[group_key][gw_id].sm_state) {
       fsm_handle_gw_down(
        gw_id, group_key, state_itr.second,
index 1e5efad1810dacc93adc1a732f618f5f4bea56e2..e41060084eae95a9d6b42ef0cc88d196c4bc0896 100644 (file)
@@ -158,7 +158,7 @@ version_t NVMeofGwMon::get_trim_to() const
  * function called during new paxos epochs
  * function called to restore in pending map all data that is not serialized
  * to paxos peons. Othervise it would be overriden in "pending_map = map"
- * currently  just "allow_failovers_ts" variable is restored
+ * currently the "allow_failovers_ts" and "last_gw_down_ts" variables are restored
  */
 void NVMeofGwMon::restore_pending_map_info(NVMeofGwMap & tmp_map) {
   std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
@@ -173,6 +173,8 @@ void NVMeofGwMon::restore_pending_map_info(NVMeofGwMap & tmp_map) {
         pending_map.created_gws[group_key][gw_id].allow_failovers_ts =
           gw_created_pair.second.allow_failovers_ts;
       }
+      pending_map.created_gws[group_key][gw_id].last_gw_down_ts =
+          gw_created_pair.second.last_gw_down_ts;
     }
   }
 }
@@ -671,7 +673,7 @@ bool NVMeofGwMon::prepare_beacon(MonOpRequestRef op)
                << gw_id << dendl;
         process_gw_down(gw_id, group_key, gw_propose, avail);
         pending_map.skip_failovers_for_group(group_key);
-        dout(4) << "set skip-failovers for gw's group " << gw_id << " group "
+        dout(4) << "fast_reboot:set skip-failovers for group " << gw_id << " group "
         << group_key << dendl;
       } else if (
        pending_map.created_gws[group_key][gw_id].performed_full_startup ==
index cf264339348b39d005f62c67b124b2f7c2df569d..097397795a9308b3fa5ef4cb4ab7fd8bfb8211f1 100755 (executable)
@@ -153,6 +153,8 @@ struct NvmeGwMonState {
   */
   std::chrono::system_clock::time_point allow_failovers_ts =
              std::chrono::system_clock::now();
+  std::chrono::system_clock::time_point last_gw_down_ts =
+             std::chrono::system_clock::now() - std::chrono::seconds(30);
   NvmeGwMonState(): ana_grp_id(REDUNDANT_GW_ANA_GROUP_ID) {}
 
   NvmeGwMonState(NvmeAnaGrpId id)
@@ -174,6 +176,9 @@ struct NvmeGwMonState {
     sm_state[grpid]       = gw_states_per_group_t::GW_ACTIVE_STATE;
     blocklist_data[grpid].osd_epoch = 0;
   }
+  void set_last_gw_down_ts(){
+    last_gw_down_ts = std::chrono::system_clock::now();
+  }
 };
 
 struct NqnState {