mon: fix duplicated entity addr in the map during reboot of several nvmeof GWs

author Leonid Chernin <leonidc@il.ibm.com>

Mon, 21 Apr 2025 13:56:07 +0000 (16:56 +0300)

committer Leonid Chernin <leonidc@il.ibm.com>

Mon, 30 Jun 2025 08:08:28 +0000 (11:08 +0300)
author Leonid Chernin <leonidc@il.ibm.com>
Mon, 21 Apr 2025 13:56:07 +0000 (16:56 +0300)
committer Leonid Chernin <leonidc@il.ibm.com>
Mon, 30 Jun 2025 08:08:28 +0000 (11:08 +0300)
diff --git a/src/common/options/mon.yaml.in b/src/common/options/mon.yaml.in

index 06db74895b80290d631f6cbde98d0bd1dd4f2a05..2c337fcea70721b73abd548bb9657fcd15300bb1 100644 (file)
--- a/src/common/options/mon.yaml.in
+++ b/src/common/options/mon.yaml.in
@@ -111,6 +111,13 @@ options:
    default: 15_min
    services:
    - mon
+- name: mon_nvmeofgw_wrong_map_ignore_sec
+  type: uint
+  level: advanced
+  desc: Period in seconds from MonClient startup to ignore wrong maps from Monitor
+  default: 15  # seconds
+  services:
+  - mon  # NOTE(review): read via g_conf() in NVMeofGwMonitorClient::handle_nvmeof_gw_map - confirm the gateway daemon sees a mon-scoped option
  - name: mon_mgr_inactive_grace
    type: int
    level: advanced
diff --git a/src/mon/NVMeofGwMap.cc b/src/mon/NVMeofGwMap.cc

index 12bd93cef7466f59c90e26ab4b3ce1f53d34224b..80d5306051f7da29c6765368d9e228ccd5f2a270 100755 (executable)
--- a/src/mon/NVMeofGwMap.cc
+++ b/src/mon/NVMeofGwMap.cc
@@ -264,6 +264,25 @@ void  NVMeofGwMap::gw_performed_startup(const NvmeGwId &gw_id,
    }
  }
  
+void NVMeofGwMap::set_addr_vect(const NvmeGwId &gw_id,
+    const NvmeGroupKey& group_key, const entity_addr_t &addr) { // record addr for gw_id; clear it from any other GW of the group
+  entity_addrvec_t addrvec(addr); // wrap the single address so it can be compared with and stored in addr_vect
+  for (auto& gws_states: created_gws[group_key]) { // scan every GW entry of this group
+     auto &state = gws_states.second;
+     auto &gw_found = gws_states.first;
+     if (state.addr_vect == addrvec && gw_found != gw_id) { // a different GW already holds this exact address
+      /* This can happen when several GWs restart simultaneously and
+       * come back with entity addresses that differ from their previous ones.
+       */
+       entity_addr_t a; // default-constructed address used as the "cleared" value
+       state.addr_vect = entity_addrvec_t(a);// overwrite the duplicate held by the other GW
+       dout(4) << "found duplicated addr vect in gw " << gw_found << dendl;
+     }
+  }
+  created_gws[group_key][gw_id].addr_vect = addrvec; // assign only after duplicates held by other GWs were cleared
+  dout(10) << "Set addr vect " << addrvec << " for gw " << gw_id << dendl;
+}
+
  void NVMeofGwMap::increment_gw_epoch(const NvmeGroupKey& group_key)
  {
    if (HAVE_FEATURE(mon->get_quorum_con_features(), NVMEOFHAMAP)) {
diff --git a/src/mon/NVMeofGwMap.h b/src/mon/NVMeofGwMap.h

index c3a95d082266fd5536e01efdded1b81fe5134eac..015577f248ad7bf97ecdbe4fe818ea2d0266f050 100755 (executable)
--- a/src/mon/NVMeofGwMap.h
+++ b/src/mon/NVMeofGwMap.h
@@ -87,6 +87,8 @@ public:
         const NvmeGroupKey& group_key, bool &map_modified);
    void gw_performed_startup(const NvmeGwId &gw_id,
         const NvmeGroupKey& group_key, bool &propose_pending);
+  void set_addr_vect(const NvmeGwId &gw_id, // stores the address for gw_id within group_key
+      const NvmeGroupKey& group_key, const entity_addr_t &addr_vect); // NOTE(review): addr_vect is a single entity_addr_t; the definition names it addr
    void skip_failovers_for_group(const NvmeGroupKey& group_key);
  private:
    int  do_delete_gw(const NvmeGwId &gw_id, const NvmeGroupKey& group_key);
diff --git a/src/mon/NVMeofGwMon.cc b/src/mon/NVMeofGwMon.cc

index e41060084eae95a9d6b42ef0cc88d196c4bc0896..5c5d4f7194fc2a24fac45c6cc1ae9bd9b97a7bdb 100644 (file)
--- a/src/mon/NVMeofGwMon.cc
+++ b/src/mon/NVMeofGwMon.cc
@@ -680,8 +680,7 @@ bool NVMeofGwMon::prepare_beacon(MonOpRequestRef op)
         false) {
         pending_map.created_gws[group_key][gw_id].performed_full_startup = true;
         pending_map.gw_performed_startup(gw_id, group_key, gw_propose);
-       pending_map.created_gws[group_key][gw_id].addr_vect =
-           entity_addrvec_t(con->get_peer_addr());
+       pending_map.set_addr_vect(gw_id, group_key, con->get_peer_addr());
        }
        LastBeacon lb = {gw_id, group_key};
        last_beacon[lb] = now; //Update last beacon
@@ -730,8 +729,7 @@ bool NVMeofGwMon::prepare_beacon(MonOpRequestRef op)
      dout(4) << "Warning: entity addr need to set for GW client " << gw_id
        << " was " <<  pending_map.created_gws[group_key][gw_id].addr_vect
        << " now " << entity_addrvec_t(con->get_peer_addr()) << dendl;
-    pending_map.created_gws[group_key][gw_id].addr_vect =
-      entity_addrvec_t(con->get_peer_addr());
+    pending_map.set_addr_vect(gw_id, group_key, con->get_peer_addr());
      gw_propose = true;
    }
    // deep copy the whole nonce map of this GW
diff --git a/src/nvmeof/NVMeofGwMonitorClient.cc b/src/nvmeof/NVMeofGwMonitorClient.cc

index ae5562a0718b775def7c130edf7e363f32e8e7cc..0b798c370a2e764f68008fff03c04bdd6a8d847e 100644 (file)
--- a/src/nvmeof/NVMeofGwMonitorClient.cc
+++ b/src/nvmeof/NVMeofGwMonitorClient.cc
@@ -39,6 +39,7 @@ NVMeofGwMonitorClient::NVMeofGwMonitorClient(int argc, const char **argv) :
    osdmap_epoch(0),
    gwmap_epoch(0),
    last_map_time(std::chrono::steady_clock::now()),
+  reset_timestamp(std::chrono::steady_clock::now()),
    monc{g_ceph_context, poolctx},
    client_messenger(Messenger::create(g_ceph_context, "async", entity_name_t::CLIENT(-1), "client", getpid())),
    objecter{g_ceph_context, client_messenger.get(), &monc, poolctx},
@@ -304,18 +305,32 @@ void NVMeofGwMonitorClient::shutdown()
  
  void NVMeofGwMonitorClient::handle_nvmeof_gw_map(ceph::ref_t<MNVMeofGwMap> nmap)
  {
-  last_map_time = std::chrono::steady_clock::now(); // record time of last monitor message
+  auto now = std::chrono::steady_clock::now();
+  last_map_time = now; // record time of last monitor message
  
    auto &new_map = nmap->get_map();
    gwmap_epoch = nmap->get_gwmap_epoch();
    auto group_key = std::make_pair(pool, group);
    dout(10) << "handle nvmeof gw map: " << new_map << dendl;
-
+  uint64_t reset_elapsed_seconds =
+      std::chrono::duration_cast<std::chrono::seconds>(now - reset_timestamp).count();
    NvmeGwClientState old_gw_state;
+  uint64_t ignore_wrong_map_interval_sec =
+       g_conf().get_val<uint64_t>("mon_nvmeofgw_wrong_map_ignore_sec");
    auto got_old_gw_state = get_gw_state("old map", map, group_key, name, old_gw_state); 
    NvmeGwClientState new_gw_state;
    auto got_new_gw_state = get_gw_state("new map", new_map, group_key, name, new_gw_state); 
  
+  /* In rare cases, when several GWs reboot and the entity address of the monitor client changes, the monitor
+  * may send a wrong second map: it can unicast the map to the wrong destination because the "old" address still
+  * appears in its map. This is an asynchronous process in the monitor, so it is better to protect against
+  * this scenario by silently ignoring the wrong map. This can happen only in the first several seconds after restart.
+  */
+  if ( (reset_elapsed_seconds < ignore_wrong_map_interval_sec) &&
+        !got_new_gw_state && got_old_gw_state) {
+    dout(4) << "Wrong map received, Ignore it" << dendl;
+    return;
+  }
    // ensure that the gateway state has not vanished
    ceph_assert(got_new_gw_state || !got_old_gw_state);
  
diff --git a/src/nvmeof/NVMeofGwMonitorClient.h b/src/nvmeof/NVMeofGwMonitorClient.h

index dc5fcbce2b2ce45c969545b1f39c8814d906816f..546fff27db7eb21c1593c5f763bc2eb957963e0a 100644 (file)
--- a/src/nvmeof/NVMeofGwMonitorClient.h
+++ b/src/nvmeof/NVMeofGwMonitorClient.h
@@ -45,6 +45,9 @@ private:
    epoch_t     gwmap_epoch;  // last received gw map epoch
    std::chrono::time_point<std::chrono::steady_clock>
                last_map_time; // used to panic on disconnect
+  std::chrono::time_point<std::chrono::steady_clock>
+                reset_timestamp; // used to bypass some validations
+
    bool first_beacon = true;
    // init gw ssl opts
    void init_gw_ssl_opts();
author	Leonid Chernin <leonidc@il.ibm.com>
	Mon, 21 Apr 2025 13:56:07 +0000 (16:56 +0300)
committer	Leonid Chernin <leonidc@il.ibm.com>
	Mon, 30 Jun 2025 08:08:28 +0000 (11:08 +0300)
src/common/options/mon.yaml.in		patch \| blob \| history
src/mon/NVMeofGwMap.cc		patch \| blob \| history
src/mon/NVMeofGwMap.h		patch \| blob \| history
src/mon/NVMeofGwMon.cc		patch \| blob \| history
src/nvmeof/NVMeofGwMonitorClient.cc		patch \| blob \| history
src/nvmeof/NVMeofGwMonitorClient.h		patch \| blob \| history