From: Samuel Just
Date: Wed, 11 Jun 2025 15:30:29 +0000 (-0700)
Subject: Merge pull request #63003 from leonidc/fix_duplicate_entity_addr
X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=8fb15d16eed476982fa79341a94d2719123c8781;p=ceph.git

Merge pull request #63003 from leonidc/fix_duplicate_entity_addr

fix duplicated entity addr in the map during reboot of several GWs

Reviewed-by: Samuel Just
---

8fb15d16eed476982fa79341a94d2719123c8781
diff --cc src/mon/NVMeofGwMap.cc
index 12bd93cef746,e959f6d20af8..80d5306051f7
--- a/src/mon/NVMeofGwMap.cc
+++ b/src/mon/NVMeofGwMap.cc
@@@ -238,32 -237,27 +238,51 @@@ void NVMeofGwMap::gw_performed_startup
    dout(4) << "GW performed the full startup " << gw_id << dendl;
    propose_pending = true;
    increment_gw_epoch( group_key);
+   auto &st = created_gws[group_key][gw_id];
+   const auto skip_failovers_sec = g_conf().get_val<std::chrono::seconds>
+     ("mon_nvmeofgw_skip_failovers_interval");
+   const auto beacon_grace_sec =
+     g_conf().get_val<std::chrono::seconds>("mon_nvmeofgw_beacon_grace");
+   /*
+    This is a heuristic meant to identify a "cephadm redeploy" of the
+    nvmeof GWs. We would like to detect that a redeploy is in progress,
+    because it helps us prevent redundant failover and failback actions.
+    It is very important to minimize fo/fb during redeploy: all GWs go
+    down and come back up again, and the amount of fo/fb that this can
+    drive is large. It also triggers a lot of changes on the hosts that
+    are nvmeof-connected to the GWs, even up to the point that a host
+    gets stuck.
+    This heuristic assumes that if a GW disappears and shows up again in
+    less than REDEPLOY_TIMEOUT seconds, a redeploy might have started, so
+    we do a failover for this GW but skip failovers for the next
+    REDEPLOY_TIMEOUT, and likewise for the next GW that disappears.
+    If it works as designed, then regardless of the number of GWs, a
+    redeploy will only cause one fo/fb. */
+   if ((now - (st.last_gw_down_ts - beacon_grace_sec)) < skip_failovers_sec) {
+     skip_failovers_for_group(group_key);
+     dout(4) << "startup: set skip-failovers for gw " << gw_id << " group "
+             << group_key << dendl;
+   }
  }
  
+ void NVMeofGwMap::set_addr_vect(const NvmeGwId &gw_id,
+     const NvmeGroupKey& group_key, const entity_addr_t &addr) {
+   entity_addrvec_t addrvec(addr);
+   for (auto& gws_states: created_gws[group_key]) {
+     auto &state = gws_states.second;
+     auto &gw_found = gws_states.first;
+     if (state.addr_vect == addrvec && gw_found != gw_id) {
+       /* This can happen when several GWs restart simultaneously and
+        * get an entity_addr that differs from their previous one.
+        */
+       entity_addr_t a;
+       state.addr_vect = entity_addrvec_t(a); // clean up the duplicated address
+       dout(4) << "found duplicated addr vect in gw " << gw_found << dendl;
+     }
+   }
+   created_gws[group_key][gw_id].addr_vect = addrvec;
+   dout(10) << "Set addr vect " << addrvec << " for gw " << gw_id << dendl;
+ }
+ 
  void NVMeofGwMap::increment_gw_epoch(const NvmeGroupKey& group_key)
  {
    if (HAVE_FEATURE(mon->get_quorum_con_features(), NVMEOFHAMAP)) {
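
A note on the timing check in gw_performed_startup(): below is a minimal
standalone sketch of the same comparison, using plain std::chrono in place
of Ceph's clock and config machinery. The interval values are hypothetical,
not the real config defaults; last_gw_down_ts in the real code is recorded
when the monitor declares the GW down, roughly beacon_grace seconds after
its last beacon, which appears to be why the grace period is subtracted
back out before comparing.

  // Sketch only: models the redeploy-detection check with std::chrono.
  // All constants here are assumptions, not Ceph defaults.
  #include <chrono>
  #include <iostream>

  int main() {
    using namespace std::chrono;
    const seconds skip_failovers_interval{300}; // assumed config value
    const seconds beacon_grace{10};             // assumed config value

    const auto gw_declared_down = steady_clock::now(); // ~last_gw_down_ts
    const auto now = gw_declared_down + seconds{42};   // GW back ~42s later

    // Estimate when the GW actually disappeared (declared-down time minus
    // the beacon grace), then check whether it returned quickly enough to
    // look like a redeploy rather than a real failure.
    if (now - (gw_declared_down - beacon_grace) < skip_failovers_interval) {
      std::cout << "likely redeploy: skip failovers for this group\n";
    } else {
      std::cout << "GW was gone too long: allow a normal failover\n";
    }
  }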
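
Similarly, here is a simplified model of the invariant set_addr_vect()
enforces: at most one GW in a group may own a given address, so a stale
duplicate left behind by a simultaneous restart is cleared before the new
owner is recorded. std::string stands in for the real NvmeGwId and
entity_addrvec_t types, and the sample addresses are made up.

  #include <iostream>
  #include <map>
  #include <string>

  // Assign addr to gw_id, clearing the address of any other GW that
  // still holds it (the stale entry the patch guards against).
  static void set_addr(std::map<std::string, std::string>& gws,
                       const std::string& gw_id, const std::string& addr) {
    for (auto& [other_id, other_addr] : gws) {
      if (other_addr == addr && other_id != gw_id) {
        other_addr.clear(); // duplicate found: drop the old mapping
        std::cout << "cleared duplicated addr on " << other_id << "\n";
      }
    }
    gws[gw_id] = addr;
  }

  int main() {
    std::map<std::string, std::string> gws;
    // Two GWs reboot; the second comes back with the address the first
    // used to have, so the stale entry on gw1 must be cleared.
    set_addr(gws, "gw1", "10.0.0.5");
    set_addr(gws, "gw2", "10.0.0.5");
    for (const auto& [id, addr] : gws)
      std::cout << id << " -> " << (addr.empty() ? "(none)" : addr) << "\n";
  }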