apply NVMEOF_BEACON_DIFF and use it
author     Leonid Chernin <leonidc@il.ibm.com>
           Wed, 13 Aug 2025 07:10:56 +0000 (10:10 +0300)
committer  Leonid Chernin <leonidc@il.ibm.com>
           Sun, 14 Sep 2025 06:37:52 +0000 (09:37 +0300)
fix GW startup
fixes for blocklisting and skip-failover
fix sending the ack when gw_propose = true

Signed-off-by: Leonid Chernin <leonidc@il.ibm.com>
src/include/ceph_features.h
src/mon/NVMeofGwMap.cc
src/mon/NVMeofGwMap.h
src/mon/NVMeofGwMon.cc
src/mon/NVMeofGwSerialize.h

index 88187fe5c2d3b6df82d921a9d366007d0e95eb35..b9ca2690abef3333b9cf8a6c0b5d9ea98736819b 100644 (file)
@@ -163,6 +163,7 @@ DEFINE_CEPH_FEATURE(49, 2, SERVER_SQUID);
 DEFINE_CEPH_FEATURE_RETIRED(50, 1, MON_METADATA, MIMIC, OCTOPUS)
 DEFINE_CEPH_FEATURE(50, 2, SERVER_TENTACLE);
 DEFINE_CEPH_FEATURE_RETIRED(51, 1, OSD_BITWISE_HOBJ_SORT, MIMIC, OCTOPUS)
+DEFINE_CEPH_FEATURE(51, 2, NVMEOF_BEACON_DIFF)
 // available
 DEFINE_CEPH_FEATURE_RETIRED(52, 1, OSD_PROXY_WRITE_FEATURES, MIMIC, OCTOPUS)
 // available
@@ -259,6 +260,7 @@ DEFINE_CEPH_FEATURE_RETIRED(63, 1, RESERVED_BROKEN, LUMINOUS, QUINCY) // client-
         CEPH_FEATUREMASK_SERVER_REEF | \
         CEPH_FEATUREMASK_SERVER_SQUID | \
         CEPH_FEATUREMASK_SERVER_TENTACLE | \
+        CEPH_FEATUREMASK_NVMEOF_BEACON_DIFF | \
         0ULL)
 
 #define CEPH_FEATURES_SUPPORTED_DEFAULT  CEPH_FEATURES_ALL
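
For context on how the new bit is consumed: the hunks below gate the new beacon handling on the monitor quorum advertising NVMEOF_BEACON_DIFF. A minimal sketch of that gating pattern, reusing the HAVE_FEATURE macro and the mon->get_quorum_con_features() call that appear later in this commit (the surrounding variable is illustrative only):

    // illustrative only: gate new behaviour on the quorum feature bit
    uint64_t quorum_features = mon->get_quorum_con_features();
    if (HAVE_FEATURE(quorum_features, NVMEOF_BEACON_DIFF)) {
      // every monitor in the quorum understands diff-style beacons
    } else {
      // legacy path: behave exactly as before this commit
    }
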
index 84e00d5840f4414a96710ba013024281efe8cfd6..ca4599fabe4201c7fe0cdbf2dbe1002fb96df051 100755 (executable)
@@ -178,11 +178,16 @@ int NVMeofGwMap::cfg_delete_gw(
     for (auto& gws_states: created_gws[group_key]) {
       if (gws_states.first == gw_id) {
         auto& state = gws_states.second;
+        if (state.availability == gw_availability_t::GW_AVAILABLE) {
+          /* prevent failover because blocklisting right now causes IO errors */
+          dout(4) << "Delete GW: set skip-failovers for GW " << gw_id
+                  << " group " << group_key << dendl;
+          skip_failovers_for_group(group_key, 5);
+        }
         state.availability = gw_availability_t::GW_DELETING;
         dout(4) << " Deleting  GW :"<< gw_id  << " in state  "
             << state.availability <<  " Resulting GW availability: "
             << state.availability  << dendl;
-        state.subsystems.clear();//ignore subsystems of this GW
         utime_t now = ceph_clock_now();
         mon->nvmegwmon()->gws_deleting_time[group_key][gw_id] = now;
         return 0;
@@ -342,10 +347,16 @@ void NVMeofGwMap::track_deleting_gws(const NvmeGroupKey& group_key,
   }
 }
 
-void NVMeofGwMap::skip_failovers_for_group(const NvmeGroupKey& group_key)
+void NVMeofGwMap::skip_failovers_for_group(const NvmeGroupKey& group_key,
+   int interval_sec)
 {
-  const auto skip_failovers = g_conf().get_val<std::chrono::seconds>
-    ("mon_nvmeofgw_skip_failovers_interval");
+  std::chrono::seconds skip_failovers;
+  if (interval_sec == 0) {
+    skip_failovers = g_conf().get_val<std::chrono::seconds>
+      ("mon_nvmeofgw_skip_failovers_interval");
+  } else {
+    skip_failovers = std::chrono::seconds(interval_sec);
+  }
   for (auto& gw_created: created_gws[group_key]) {
     gw_created.second.allow_failovers_ts = std::chrono::system_clock::now()
         + skip_failovers;
@@ -1146,7 +1157,7 @@ bool NVMeofGwMap::put_gw_beacon_sequence_number(const NvmeGwId &gw_id,
   NvmeGwMonState& gw_map = created_gws[group_key][gw_id];
   //gw_map.beacon_sequence_ooo = false;
 
-  if (HAVE_FEATURE(mon->get_quorum_con_features(), NVMEOFHAMAP)) { //TODO BEACONDIFF
+  if (HAVE_FEATURE(mon->get_quorum_con_features(), NVMEOF_BEACON_DIFF)) {
     if (beacon_sequence == 40 && inject1 == 0) { //Inject sequence ooo
         inject1 = 1;
         gw_map.beacon_sequence -= 5;
@@ -1167,6 +1178,18 @@ bool NVMeofGwMap::put_gw_beacon_sequence_number(const NvmeGwId &gw_id,
   return rc;
 }
 
+bool NVMeofGwMap::set_gw_beacon_sequence_number(const NvmeGwId &gw_id,
+    const NvmeGroupKey& group_key, uint64_t beacon_sequence)
+{
+  NvmeGwMonState& gw_map = created_gws[group_key][gw_id];
+  if (HAVE_FEATURE(mon->get_quorum_con_features(), NVMEOF_BEACON_DIFF)) {
+    gw_map.beacon_sequence = beacon_sequence;
+    dout(10) << gw_id << " set beacon_sequence " << beacon_sequence << dendl;
+  }
+  return true;
+}
+
+
 void NVMeofGwMap::update_active_timers(bool &propose_pending)
 {
   const auto now = std::chrono::system_clock::now();
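
The reworked helper keeps its old behaviour when called without an interval and lets callers shorten the window; a hedged usage sketch based on the call sites added in this commit:

    // default window, taken from mon_nvmeofgw_skip_failovers_interval
    skip_failovers_for_group(group_key);
    // 5-second window while a GW is being deleted (cfg_delete_gw above)
    skip_failovers_for_group(group_key, 5);
    // 7-second window on an out-of-order beacon sequence (prepare_beacon below)
    pending_map.skip_failovers_for_group(group_key, 7);
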
index 9da7548f428394d27fb0335332fc3cdb74f20904..49452276933767f6a06f92c31913040893ed44b0 100755 (executable)
@@ -91,10 +91,13 @@ public:
        const NvmeGroupKey& group_key, bool &propose_pending);
   void set_addr_vect(const NvmeGwId &gw_id,
       const NvmeGroupKey& group_key, const entity_addr_t &addr_vect);
-  void skip_failovers_for_group(const NvmeGroupKey& group_key);
+  void skip_failovers_for_group(const NvmeGroupKey& group_key,
+      int interval_sec = 0);
   bool put_gw_beacon_sequence_number(const NvmeGwId &gw_id,
       const NvmeGroupKey& group_key, uint64_t beacon_sequence,
       uint64_t& old_sequence);
+  bool set_gw_beacon_sequence_number(const NvmeGwId &gw_id,
+      const NvmeGroupKey& group_key, uint64_t beacon_sequence);
 private:
   int  do_delete_gw(const NvmeGwId &gw_id, const NvmeGroupKey& group_key);
   int  do_erase_gw_id(const NvmeGwId &gw_id,
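
The map now exposes two sequence-number helpers: put_gw_beacon_sequence_number validates an incoming sequence against the stored one and reports the previous value, while the new set_gw_beacon_sequence_number simply records the value on the full-startup path. A hedged calling sketch matching the prepare_beacon hunk below:

    uint64_t stored_sequence = 0;
    // normal path: validate ordering and learn the previously stored value
    bool correct_sequence = pending_map.put_gw_beacon_sequence_number(
        gw_id, group_key, sequence, stored_sequence);
    // full-startup path: just adopt the sequence the GW reported
    pending_map.set_gw_beacon_sequence_number(gw_id, group_key, sequence);
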
index c4751445160896c07fe305f283bcebc23f55ace7..c1d1be543d150d8801da7e85628049016b77b8ca 100644 (file)
@@ -138,7 +138,9 @@ void NVMeofGwMon::tick()
   for (auto &[group_key, gws_states]: pending_map.created_gws) {
     BeaconSubsystems *subsystems = &empty_subsystems;
     for (auto& gw_state : gws_states) { // loop for GWs inside nqn group
-      subsystems = &gw_state.second.subsystems;
+      if (gw_state.second.availability == gw_availability_t::GW_AVAILABLE) {
+        subsystems = &gw_state.second.subsystems;
+      }
       if (subsystems->size()) { // Set subsystems to the valid value
         break;
       }
@@ -680,7 +682,7 @@ int NVMeofGwMon::apply_beacon(const NvmeGwId &gw_id,
                      pending_map.created_gws[group_key][gw_id].subsystems;
   auto &state = pending_map.created_gws[group_key][gw_id];
 
-  if (!HAVE_FEATURE(mon.get_quorum_con_features(), NVMEOFHAMAP)) { //TODO beacondiff//NVMEOF_BEACONDIFF)) {
+  if (!HAVE_FEATURE(mon.get_quorum_con_features(), NVMEOF_BEACON_DIFF)) {
     if (gw_subs != sub) {
       dout(10) << "subsystems of GW changed, propose pending " << gw_id << dendl;
       gw_subs = sub;
@@ -733,6 +735,9 @@ int NVMeofGwMon::apply_beacon(const NvmeGwId &gw_id,
      }
   }
 }
+  if (changed) {
+    avail = gw_availability_t::GW_AVAILABLE;
+  }
   if (gw_subs.size() == 0) {
       avail = gw_availability_t::GW_CREATED;
       dout(10) << "No-subsystems condition detected for GW " << gw_id <<dendl;
@@ -750,6 +755,9 @@ int NVMeofGwMon::apply_beacon(const NvmeGwId &gw_id,
       }
     }// for HA no-subsystems and no-listeners are same usecases
   //dout(10) << " GWid " << gw_id  << " beacon subsystems changed = " << changed << dendl;
+  if (avail == gw_availability_t::GW_UNAVAILABLE) {
+         dout(4) << "Warning: UNAVAILABLE gw " << gw_id << dendl;
+  }
   return (changed == true ? 1:0);
 }
 
@@ -765,6 +773,9 @@ bool NVMeofGwMon::prepare_beacon(MonOpRequestRef op)
   ConnectionRef con = op->get_connection();
   NvmeGwId gw_id = m->get_gw_id();
   NvmeGroupKey group_key = std::make_pair(m->get_gw_pool(),  m->get_gw_group());
+  //"avail" variable will be changed inside the function
+  // when it becomes CREATED for several reasons GW's load balance group
+ //  is serviced by another GW
   gw_availability_t  avail = m->get_availability();
   bool propose = false;
   bool nonce_propose = false;
@@ -793,8 +804,8 @@ bool NVMeofGwMon::prepare_beacon(MonOpRequestRef op)
               << map.created_gws << dendl;
       goto set_propose;
     } else {
-      correct_sequence = pending_map.put_gw_beacon_sequence_number
-           (gw_id, group_key, sequence, stored_sequence);
+      pending_map.created_gws[group_key][gw_id].subsystems.clear();
+      pending_map.set_gw_beacon_sequence_number(gw_id, group_key, sequence);
       dout(4) << "GW beacon: Created state - full startup done " << gw_id
               << " GW state in monitor data-base : "
               << pending_map.created_gws[group_key][gw_id].availability
@@ -804,8 +815,6 @@ bool NVMeofGwMon::prepare_beacon(MonOpRequestRef op)
        dout(1) << " Warning :GW marked as Available in the NVmeofGwMon "
                << "database, performed full startup - Apply it but don't allow failover!"
                << gw_id << dendl;
-       correct_sequence = true; //ack with ooo indication wouldn't sent this time
-                                //to prevent duplicated exception handling
         process_gw_down(gw_id, group_key, gw_propose, avail);
         pending_map.skip_failovers_for_group(group_key);
         dout(4) << "fast_reboot:set skip-failovers for group " << gw_id << " group "
@@ -845,8 +854,14 @@ bool NVMeofGwMon::prepare_beacon(MonOpRequestRef op)
        goto false_return;
       }
       if (!correct_sequence) {
+        if (avail == gw_availability_t::GW_AVAILABLE) {
+          /* prevent failover because blocklisting is not possible */
+          dout(4) << "sequence ooo: set skip-failovers for GW " << gw_id
+                  << " group " << group_key << dendl;
+          pending_map.skip_failovers_for_group(group_key, 7);
+        }
         avail = gw_availability_t::GW_CREATED;
-        // GW in service mode, not active up to correct sequence
+        // GW is not active until its beacon sequence is correct again
         goto check_availability;
       }
     }
@@ -929,19 +944,23 @@ set_propose:
            * if epoch-filter-bit: send ack to beacon in case no propose
            * or if changed something not relevant to gw-epoch
           */
-    if (gw_created) {
+    if (gw_created) { //TODO make func
       // respond with a map slice correspondent to the same GW
-      ack_map.created_gws[group_key][gw_id] = map.created_gws[group_key][gw_id];
+      ack_map.created_gws[group_key][gw_id] = (gw_propose) ? // avail was set to CREATED
+          pending_map.created_gws[group_key][gw_id] :
+          map.created_gws[group_key][gw_id];
       ack_map.created_gws[group_key][gw_id].beacon_sequence = sequence;
       if (!correct_sequence) {
-        dout(4) << "beacon from GW " << gw_id <<
+        dout(4) << " GW " << gw_id <<
           " sending ACK due to receiving beacon_sequence out of order"
           << dendl;
         ack_map.created_gws[group_key][gw_id].beacon_sequence =
           stored_sequence;
         ack_map.created_gws[group_key][gw_id].beacon_sequence_ooo = true;
       }
-
+      if (gw_propose) {
+        dout(10) << "GW in Created " << gw_id << " ack map " << ack_map << dendl;
+      }
     }
     ack_map.epoch = get_ack_map_epoch(gw_created, group_key);
     if (!gw_created)
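
When the beacon itself triggered a propose, the ack is now built from the pending map, so a GW that was just moved to CREATED sees that state immediately instead of the stale committed map. Condensed from the hunk above (same expressions, shown here only for readability):

    ack_map.created_gws[group_key][gw_id] = gw_propose
        ? pending_map.created_gws[group_key][gw_id]   // not yet committed state
        : map.created_gws[group_key][gw_id];          // committed state
    ack_map.created_gws[group_key][gw_id].beacon_sequence = sequence;
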
index df9bae0f3db1b84b3d9e17cc6c495020170831e8..716b60b4d07551533303b498a917a9693a37a018 100755 (executable)
@@ -311,7 +311,7 @@ inline  void decode(
 
 inline void encode(const NvmeGwClientState& state,  ceph::bufferlist &bl, uint64_t features) {
   uint8_t version = 1;
-  if (HAVE_FEATURE(features, NVMEOFHAMAP)) { //TODO beacondiff//NVMEOF_BEACONDIFF)) {
+  if (HAVE_FEATURE(features, NVMEOF_BEACON_DIFF)) {
      version = 2;
   }
   ENCODE_START(version, version, bl);
@@ -842,7 +842,7 @@ inline void decode(BeaconListener& ls, ceph::buffer::list::const_iterator &bl) {
 
 inline void encode(const BeaconSubsystem& sub,  ceph::bufferlist &bl, uint64_t features) {
   uint8_t version = 1;
-  if (HAVE_FEATURE(features, NVMEOFHAMAP)) { //TODO beacondiff//NVMEOF_BEACONDIFF)) {
+  if (HAVE_FEATURE(features, NVMEOF_BEACON_DIFF)) {
     version = 2;
   }
   ENCODE_START(version, version, bl);
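
Both encoders follow the usual pattern of bumping the struct version only when the whole cluster advertises the feature, so mixed-version clusters keep exchanging version-1 payloads. A minimal sketch of that pattern; ExampleState and its fields are placeholders, only the gating mirrors the hunks above:

    inline void encode(const ExampleState& s, ceph::bufferlist& bl, uint64_t features) {
      uint8_t version = 1;
      if (HAVE_FEATURE(features, NVMEOF_BEACON_DIFF)) {
        version = 2;                 // peers can decode the new fields
      }
      ENCODE_START(version, version, bl);
      // ... version-1 fields ...
      if (version >= 2) {
        // ... fields introduced together with NVMEOF_BEACON_DIFF ...
      }
      ENCODE_FINISH(bl);
    }
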