apply NVMEOF_BEACON_DIFF and use it
author     Leonid Chernin <leonidc@il.ibm.com>
           Wed, 13 Aug 2025 07:10:56 +0000 (10:10 +0300)
committer  Leonid Chernin <leonidc@il.ibm.com>
           Sun, 14 Sep 2025 06:37:52 +0000 (09:37 +0300)
fix GW startup
fixes for blocklisting and skip-failover
fix sending the ack when gw_propose = true

Signed-off-by: Leonid Chernin <leonidc@il.ibm.com>
src/include/ceph_features.h
src/mon/NVMeofGwMap.cc
src/mon/NVMeofGwMap.h
src/mon/NVMeofGwMon.cc
src/mon/NVMeofGwSerialize.h

index 88187fe5c2d3b6df82d921a9d366007d0e95eb35..b9ca2690abef3333b9cf8a6c0b5d9ea98736819b 100644 (file)
@@ -163,6 +163,7 @@ DEFINE_CEPH_FEATURE(49, 2, SERVER_SQUID);
 DEFINE_CEPH_FEATURE_RETIRED(50, 1, MON_METADATA, MIMIC, OCTOPUS)
 DEFINE_CEPH_FEATURE(50, 2, SERVER_TENTACLE);
 DEFINE_CEPH_FEATURE_RETIRED(51, 1, OSD_BITWISE_HOBJ_SORT, MIMIC, OCTOPUS)
+DEFINE_CEPH_FEATURE(51, 2, NVMEOF_BEACON_DIFF)
 // available
 DEFINE_CEPH_FEATURE_RETIRED(52, 1, OSD_PROXY_WRITE_FEATURES, MIMIC, OCTOPUS)
 // available
@@ -259,6 +260,7 @@ DEFINE_CEPH_FEATURE_RETIRED(63, 1, RESERVED_BROKEN, LUMINOUS, QUINCY) // client-
         CEPH_FEATUREMASK_SERVER_REEF | \
         CEPH_FEATUREMASK_SERVER_SQUID | \
         CEPH_FEATUREMASK_SERVER_TENTACLE | \
+        CEPH_FEATUREMASK_NVMEOF_BEACON_DIFF | \
         0ULL)
 
 #define CEPH_FEATURES_SUPPORTED_DEFAULT  CEPH_FEATURES_ALL
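
For context on how the new bit is consumed: the hunks below gate the new beacon handling on the monitor quorum advertising NVMEOF_BEACON_DIFF. A minimal sketch of that gating pattern, reusing the HAVE_FEATURE macro and the mon->get_quorum_con_features() call that appear later in this commit (the surrounding variable is illustrative only):

    // illustrative only: gate new behaviour on the quorum feature bit
    uint64_t quorum_features = mon->get_quorum_con_features();
    if (HAVE_FEATURE(quorum_features, NVMEOF_BEACON_DIFF)) {
      // every monitor in the quorum understands diff-style beacons
    } else {
      // legacy path: behave exactly as before this commit
    }
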
index 84e00d5840f4414a96710ba013024281efe8cfd6..ca4599fabe4201c7fe0cdbf2dbe1002fb96df051 100755 (executable)
@@ -178,11 +178,16 @@ int NVMeofGwMap::cfg_delete_gw(
     for (auto& gws_states: created_gws[group_key]) {
       if (gws_states.first == gw_id) {
         auto& state = gws_states.second;
+        if (state.availability == gw_availability_t::GW_AVAILABLE) {
+          /* prevent failover because blocklisting right now causes IO errors */
+          dout(4) << "Delete GW: set skip-failovers for GW " << gw_id
+                  << " group " << group_key << dendl;
+          skip_failovers_for_group(group_key, 5);
+        }
         state.availability = gw_availability_t::GW_DELETING;
         dout(4) << " Deleting  GW :"<< gw_id  << " in state  "
             << state.availability <<  " Resulting GW availability: "
             << state.availability  << dendl;
-        state.subsystems.clear();//ignore subsystems of this GW
         utime_t now = ceph_clock_now();
         mon->nvmegwmon()->gws_deleting_time[group_key][gw_id] = now;
         return 0;
@@ -342,10 +347,16 @@ void NVMeofGwMap::track_deleting_gws(const NvmeGroupKey& group_key,
   }
 }
 
-void NVMeofGwMap::skip_failovers_for_group(const NvmeGroupKey& group_key)
+void NVMeofGwMap::skip_failovers_for_group(const NvmeGroupKey& group_key,
+   int interval_sec)
 {
-  const auto skip_failovers = g_conf().get_val<std::chrono::seconds>
-    ("mon_nvmeofgw_skip_failovers_interval");
+  std::chrono::seconds skip_failovers;
+  if (interval_sec == 0) {
+    skip_failovers = g_conf().get_val<std::chrono::seconds>
+      ("mon_nvmeofgw_skip_failovers_interval");
+  } else {
+    skip_failovers = std::chrono::seconds(interval_sec);
+  }
   for (auto& gw_created: created_gws[group_key]) {
     gw_created.second.allow_failovers_ts = std::chrono::system_clock::now()
         + skip_failovers;
@@ -1146,7 +1157,7 @@ bool NVMeofGwMap::put_gw_beacon_sequence_number(const NvmeGwId &gw_id,
   NvmeGwMonState& gw_map = created_gws[group_key][gw_id];
   //gw_map.beacon_sequence_ooo = false;
 
-  if (HAVE_FEATURE(mon->get_quorum_con_features(), NVMEOFHAMAP)) { //TODO BEACONDIFF
+  if (HAVE_FEATURE(mon->get_quorum_con_features(), NVMEOF_BEACON_DIFF)) {
     if (beacon_sequence == 40 && inject1 == 0) { //Inject sequence ooo
         inject1 = 1;
         gw_map.beacon_sequence -= 5;
@@ -1167,6 +1178,18 @@ bool NVMeofGwMap::put_gw_beacon_sequence_number(const NvmeGwId &gw_id,
   return rc;
 }
 
+bool NVMeofGwMap::set_gw_beacon_sequence_number(const NvmeGwId &gw_id,
+    const NvmeGroupKey& group_key, uint64_t beacon_sequence)
+{
+  NvmeGwMonState& gw_map = created_gws[group_key][gw_id];
+  if (HAVE_FEATURE(mon->get_quorum_con_features(), NVMEOF_BEACON_DIFF)) {
+    gw_map.beacon_sequence = beacon_sequence;
+    dout(10) << gw_id << " set beacon_sequence " << beacon_sequence << dendl;
+  }
+  return true;
+}
+
+
 void NVMeofGwMap::update_active_timers(bool &propose_pending)
 {
   const auto now = std::chrono::system_clock::now();
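
The reworked helper keeps its old behaviour when called without an interval and lets callers shorten the window; a hedged usage sketch based on the call sites added in this commit:

    // default window, taken from mon_nvmeofgw_skip_failovers_interval
    skip_failovers_for_group(group_key);
    // 5-second window while a GW is being deleted (cfg_delete_gw above)
    skip_failovers_for_group(group_key, 5);
    // 7-second window on an out-of-order beacon sequence (prepare_beacon below)
    pending_map.skip_failovers_for_group(group_key, 7);
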
index 9da7548f428394d27fb0335332fc3cdb74f20904..49452276933767f6a06f92c31913040893ed44b0 100755 (executable)
@@ -91,10 +91,13 @@ public:
        const NvmeGroupKey& group_key, bool &propose_pending);
   void set_addr_vect(const NvmeGwId &gw_id,
       const NvmeGroupKey& group_key, const entity_addr_t &addr_vect);
-  void skip_failovers_for_group(const NvmeGroupKey& group_key);
+  void skip_failovers_for_group(const NvmeGroupKey& group_key,
+      int interval_sec = 0);
   bool put_gw_beacon_sequence_number(const NvmeGwId &gw_id,
       const NvmeGroupKey& group_key, uint64_t beacon_sequence,
       uint64_t& old_sequence);
+  bool set_gw_beacon_sequence_number(const NvmeGwId &gw_id,
+      const NvmeGroupKey& group_key, uint64_t beacon_sequence);
 private:
   int  do_delete_gw(const NvmeGwId &gw_id, const NvmeGroupKey& group_key);
   int  do_erase_gw_id(const NvmeGwId &gw_id,
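
The map now exposes two sequence-number helpers: put_gw_beacon_sequence_number validates an incoming sequence against the stored one and reports the previous value, while the new set_gw_beacon_sequence_number simply records the value on the full-startup path. A hedged calling sketch matching the prepare_beacon hunk below:

    uint64_t stored_sequence = 0;
    // normal path: validate ordering and learn the previously stored value
    bool correct_sequence = pending_map.put_gw_beacon_sequence_number(
        gw_id, group_key, sequence, stored_sequence);
    // full-startup path: just adopt the sequence the GW reported
    pending_map.set_gw_beacon_sequence_number(gw_id, group_key, sequence);
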
index c4751445160896c07fe305f283bcebc23f55ace7..c1d1be543d150d8801da7e85628049016b77b8ca 100644 (file)
@@ -138,7 +138,9 @@ void NVMeofGwMon::tick()
   for (auto &[group_key, gws_states]: pending_map.created_gws) {
     BeaconSubsystems *subsystems = &empty_subsystems;
     for (auto& gw_state : gws_states) { // loop for GWs inside nqn group
-      subsystems = &gw_state.second.subsystems;
+      if (gw_state.second.availability == gw_availability_t::GW_AVAILABLE) {
+        subsystems = &gw_state.second.subsystems;
+      }
       if (subsystems->size()) { // Set subsystems to the valid value
         break;
       }
@@ -680,7 +682,7 @@ int NVMeofGwMon::apply_beacon(const NvmeGwId &gw_id,
                      pending_map.created_gws[group_key][gw_id].subsystems;
   auto &state = pending_map.created_gws[group_key][gw_id];
 
-  if (!HAVE_FEATURE(mon.get_quorum_con_features(), NVMEOFHAMAP)) { //TODO beacondiff//NVMEOF_BEACONDIFF)) {
+  if (!HAVE_FEATURE(mon.get_quorum_con_features(), NVMEOF_BEACON_DIFF)) {
     if (gw_subs != sub) {
       dout(10) << "subsystems of GW changed, propose pending " << gw_id << dendl;
       gw_subs = sub;
@@ -733,6 +735,9 @@ int NVMeofGwMon::apply_beacon(const NvmeGwId &gw_id,
      }
   }
 }
+  if (changed) {
+    avail = gw_availability_t::GW_AVAILABLE;
+  }
   if (gw_subs.size() == 0) {
       avail = gw_availability_t::GW_CREATED;
       dout(10) << "No-subsystems condition detected for GW " << gw_id <<dendl;
@@ -750,6 +755,9 @@ int NVMeofGwMon::apply_beacon(const NvmeGwId &gw_id,
       }
     }// for HA no-subsystems and no-listeners are same usecases
   //dout(10) << " GWid " << gw_id  << " beacon subsystems changed = " << changed << dendl;
+  if (avail == gw_availability_t::GW_UNAVAILABLE) {
+         dout(4) << "Warning: UNAVAILABLE gw " << gw_id << dendl;
+  }
   return (changed == true ? 1:0);
 }
 
@@ -765,6 +773,9 @@ bool NVMeofGwMon::prepare_beacon(MonOpRequestRef op)
   ConnectionRef con = op->get_connection();
   NvmeGwId gw_id = m->get_gw_id();
   NvmeGroupKey group_key = std::make_pair(m->get_gw_pool(),  m->get_gw_group());
+  //"avail" variable will be changed inside the function
+  // when it becomes CREATED for several reasons GW's load balance group
+ //  is serviced by another GW
   gw_availability_t  avail = m->get_availability();
   bool propose = false;
   bool nonce_propose = false;
@@ -793,8 +804,8 @@ bool NVMeofGwMon::prepare_beacon(MonOpRequestRef op)
               << map.created_gws << dendl;
       goto set_propose;
     } else {
-      correct_sequence = pending_map.put_gw_beacon_sequence_number
-           (gw_id, group_key, sequence, stored_sequence);
+      pending_map.created_gws[group_key][gw_id].subsystems.clear();
+      pending_map.set_gw_beacon_sequence_number(gw_id, group_key, sequence);
       dout(4) << "GW beacon: Created state - full startup done " << gw_id
               << " GW state in monitor data-base : "
               << pending_map.created_gws[group_key][gw_id].availability
@@ -804,8 +815,6 @@ bool NVMeofGwMon::prepare_beacon(MonOpRequestRef op)
        dout(1) << " Warning :GW marked as Available in the NVmeofGwMon "
                << "database, performed full startup - Apply it but don't allow failover!"
                << gw_id << dendl;
-       correct_sequence = true; //ack with ooo indication wouldn't sent this time
-                                //to prevent duplicated exception handling
         process_gw_down(gw_id, group_key, gw_propose, avail);
         pending_map.skip_failovers_for_group(group_key);
         dout(4) << "fast_reboot:set skip-failovers for group " << gw_id << " group "
@@ -845,8 +854,14 @@ bool NVMeofGwMon::prepare_beacon(MonOpRequestRef op)
        goto false_return;
       }
       if (!correct_sequence) {
+        if (avail == gw_availability_t::GW_AVAILABLE) {
+          /* prevent failover because blocklisting is not possible */
+          dout(4) << "sequence ooo: set skip-failovers for GW " << gw_id
+                  << " group " << group_key << dendl;
+          pending_map.skip_failovers_for_group(group_key, 7);
+        }
         avail = gw_availability_t::GW_CREATED;
-        // GW in service mode, not active up to correct sequence
+        // GW is not active until its beacon sequence is correct again
         goto check_availability;
       }
     }
@@ -929,19 +944,23 @@ set_propose:
            * if epoch-filter-bit: send ack to beacon in case no propose
            * or if changed something not relevant to gw-epoch
           */
-    if (gw_created) {
+    if (gw_created) { //TODO make func
       // respond with a map slice correspondent to the same GW
-      ack_map.created_gws[group_key][gw_id] = map.created_gws[group_key][gw_id];
+      ack_map.created_gws[group_key][gw_id] = (gw_propose) ? // avail was set to CREATED
+          pending_map.created_gws[group_key][gw_id] :
+          map.created_gws[group_key][gw_id];
       ack_map.created_gws[group_key][gw_id].beacon_sequence = sequence;
       if (!correct_sequence) {
-        dout(4) << "beacon from GW " << gw_id <<
+        dout(4) << " GW " << gw_id <<
           " sending ACK due to receiving beacon_sequence out of order"
           << dendl;
         ack_map.created_gws[group_key][gw_id].beacon_sequence =
           stored_sequence;
         ack_map.created_gws[group_key][gw_id].beacon_sequence_ooo = true;
       }
-
+      if (gw_propose) {
+        dout(10) << "GW in Created " << gw_id << " ack map " << ack_map << dendl;
+      }
     }
     ack_map.epoch = get_ack_map_epoch(gw_created, group_key);
     if (!gw_created)
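
When the beacon itself triggered a propose, the ack is now built from the pending map, so a GW that was just moved to CREATED sees that state immediately instead of the stale committed map. Condensed from the hunk above (same expressions, shown here only for readability):

    ack_map.created_gws[group_key][gw_id] = gw_propose
        ? pending_map.created_gws[group_key][gw_id]   // not yet committed state
        : map.created_gws[group_key][gw_id];          // committed state
    ack_map.created_gws[group_key][gw_id].beacon_sequence = sequence;
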
index df9bae0f3db1b84b3d9e17cc6c495020170831e8..716b60b4d07551533303b498a917a9693a37a018 100755 (executable)
@@ -311,7 +311,7 @@ inline  void decode(
 
 inline void encode(const NvmeGwClientState& state,  ceph::bufferlist &bl, uint64_t features) {
   uint8_t version = 1;
-  if (HAVE_FEATURE(features, NVMEOFHAMAP)) { //TODO beacondiff//NVMEOF_BEACONDIFF)) {
+  if (HAVE_FEATURE(features, NVMEOF_BEACON_DIFF)) {
      version = 2;
   }
   ENCODE_START(version, version, bl);
@@ -842,7 +842,7 @@ inline void decode(BeaconListener& ls, ceph::buffer::list::const_iterator &bl) {
 
 inline void encode(const BeaconSubsystem& sub,  ceph::bufferlist &bl, uint64_t features) {
   uint8_t version = 1;
-  if (HAVE_FEATURE(features, NVMEOFHAMAP)) { //TODO beacondiff//NVMEOF_BEACONDIFF)) {
+  if (HAVE_FEATURE(features, NVMEOF_BEACON_DIFF)) {
     version = 2;
   }
   ENCODE_START(version, version, bl);
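
Both encoders follow the usual pattern of bumping the struct version only when the whole cluster advertises the feature, so mixed-version clusters keep exchanging version-1 payloads. A minimal sketch of that pattern; ExampleState and its fields are placeholders, only the gating mirrors the hunks above:

    inline void encode(const ExampleState& s, ceph::bufferlist& bl, uint64_t features) {
      uint8_t version = 1;
      if (HAVE_FEATURE(features, NVMEOF_BEACON_DIFF)) {
        version = 2;                 // peers can decode the new fields
      }
      ENCODE_START(version, version, bl);
      // ... version-1 fields ...
      if (version >= 2) {
        // ... fields introduced together with NVMEOF_BEACON_DIFF ...
      }
      ENCODE_FINISH(bl);
    }
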