From: Alexander Indenbaum
Date: Thu, 9 May 2024 11:46:31 +0000 (+0000)
Subject: ceph-nvmeof-mon fixes
X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=5319ed2e868ae60588443c317870ecc525f81295;p=ceph-ci.git

ceph-nvmeof-mon fixes

Resolves: rhbz#2279862

- ceph-nvmeof-mon: nvme-gw create/delete

  * Move gateway creation to the `daemon_check_post` method to prevent
    the creation of zombie gateways. This change aligns with the
    dashboard logic, which uses the same callback.
  * Implement a `purge` method to ensure that no zombie gateways are
    left behind once the service is removed.

Signed-off-by: Alexander Indenbaum

- nvmeof service spec: set enable_monitor_client by default

Signed-off-by: Alexander Indenbaum

- fix gw stuck in ANA state GW_WAIT_FAILBACK_PREPARED

Signed-off-by: Leonid Chernin
Signed-off-by: Alexander Indenbaum

(cherry picked from commit b0c764b6e22fbb51f59e469d9dd99895c152f73e)
---

diff --git a/src/mon/NVMeofGwMap.cc b/src/mon/NVMeofGwMap.cc
index 1d656574b8d..f4af5d0caae 100755
--- a/src/mon/NVMeofGwMap.cc
+++ b/src/mon/NVMeofGwMap.cc
@@ -471,10 +471,12 @@ void NVMeofGwMap::fsm_handle_to_expired(const NvmeGwId &gw_id, const NvmeGroupKe
         auto& st = gw_state.second;
         if (st.ana_grp_id == grpid){// group owner
             grp_owner_found = true;
-            if( ! (fbp_gw_state.last_gw_map_epoch_valid && st.last_gw_map_epoch_valid) ){
-                //Timer is not cancelled so it would expire over and over as long as both gws are not updated
-                dout(1) << "gw " << gw_id <<" or gw " << gw_state.first << "map epochs are not updated "<< dendl;
-                return;
+            if(st.availability == GW_AVAILABILITY_E::GW_AVAILABLE) {
+                if( ! (fbp_gw_state.last_gw_map_epoch_valid && st.last_gw_map_epoch_valid) ){
+                    //Timer is not cancelled so it would expire over and over as long as both gws are not updated
+                    dout(1) << "gw " << gw_id <<" or gw " << gw_state.first << "map epochs are not updated "<< dendl;
+                    return;
+                }
             }
             cancel_timer(gw_id, group_key, grpid);
             if (st.sm_state[grpid] == GW_STATES_PER_AGROUP_E::GW_OWNER_WAIT_FAILBACK_PREPARED && st.availability == GW_AVAILABILITY_E::GW_AVAILABLE )
diff --git a/src/pybind/mgr/cephadm/services/nvmeof.py b/src/pybind/mgr/cephadm/services/nvmeof.py
index 454f5b8c11a..ac01bc7c4b5 100644
--- a/src/pybind/mgr/cephadm/services/nvmeof.py
+++ b/src/pybind/mgr/cephadm/services/nvmeof.py
@@ -56,21 +56,36 @@ class NvmeofService(CephService):
         daemon_spec.extra_files = {'ceph-nvmeof.conf': gw_conf}
         daemon_spec.final_config, daemon_spec.deps = self.generate_config(daemon_spec)
         daemon_spec.deps = []
-        # Notify monitor about this gateway creation
-        cmd = {
-            'prefix': 'nvme-gw create',
-            'id': name,
-            'group': spec.group,
-            'pool': spec.pool
-        }
-        _, _, err = self.mgr.mon_command(cmd)
-        # if send command failed, raise assertion exception, failing the daemon creation
-        assert not err, f"Unable to send monitor command {cmd}, error {err}"
         if not hasattr(self, 'gws'):
             self.gws = {} # id -> name map of gateways for this service.
         self.gws[nvmeof_gw_id] = name # add to map of service's gateway names
         return daemon_spec

+    def daemon_check_post(self, daemon_descrs: List[DaemonDescription]) -> None:
+        """Overrides daemon_check_post to register nvmeof gateways safely
+        """
+        self.mgr.log.info(f"nvmeof daemon_check_post {daemon_descrs}")
+        # The service spec must have a configured pool and group
+        assert self.pool
+        assert self.group is not None
+        for dd in daemon_descrs:
+            self.mgr.log.info(f"nvmeof daemon_descr {dd}")
+            assert dd.daemon_id in self.gws
+            name = self.gws[dd.daemon_id]
+            self.mgr.log.info(f"nvmeof daemon name={name}")
+            # Notify the monitor about this gateway creation
+            cmd = {
+                'prefix': 'nvme-gw create',
+                'id': name,
+                'group': self.group,
+                'pool': self.pool
+            }
+            self.mgr.log.info(f"create gateway: monitor command {cmd}")
+            _, _, err = self.mgr.mon_command(cmd)
+            if err:
+                self.mgr.log.error(f"Unable to send monitor command {cmd}, error {err}")
+        super().daemon_check_post(daemon_descrs)
+
     def config_dashboard(self, daemon_descrs: List[DaemonDescription]) -> None:
         # TODO: what integration do we need with the dashboard?
         pass
@@ -124,12 +139,28 @@ class NvmeofService(CephService):
             'group': self.group,
             'pool': self.pool
         }
+        self.mgr.log.info(f"delete gateway: monitor command {cmd}")
         _, _, err = self.mgr.mon_command(cmd)
         if err:
             self.mgr.log.error(f"Unable to send monitor command {cmd}, error {err}")

     def purge(self, service_name: str) -> None:
-        """Removes configuration
+        """Make sure no zombie gateways are left behind
         """
-        # TODO: what should we purge in this case (if any)?
-        pass
+        # The service spec must have a configured pool and group
+        assert self.pool
+        assert self.group is not None
+        for daemon_id in list(self.gws):  # copy the keys: we pop while iterating
+            name = self.gws[daemon_id]
+            self.gws.pop(daemon_id)
+            # Notify the monitor about this gateway deletion
+            cmd = {
+                'prefix': 'nvme-gw delete',
+                'id': name,
+                'group': self.group,
+                'pool': self.pool
+            }
+            self.mgr.log.info(f"purge delete gateway: monitor command {cmd}")
+            _, _, err = self.mgr.mon_command(cmd)
+            if err:
+                self.mgr.log.error(f"Unable to send monitor command {cmd}, error {err}")
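
Reviewer note, not part of the patch: below is a minimal standalone sketch
of the create-on-check / delete-on-purge lifecycle this series implements.
The FakeMgr class, the gateway name, and the simplified method signatures
are illustrative assumptions only; the real service talks to the monitor
through mgr.mon_command exactly as in the diff above.

# Sketch of the gateway bookkeeping lifecycle (illustrative only).
from typing import Dict, List, Tuple

class FakeMgr:
    """Stands in for the mgr module; records monitor commands."""
    def __init__(self) -> None:
        self.sent: List[dict] = []

    def mon_command(self, cmd: dict) -> Tuple[int, str, str]:
        self.sent.append(cmd)
        return 0, '', ''  # retcode, stdout, stderr

class SketchNvmeofService:
    def __init__(self, mgr: FakeMgr, pool: str, group: str) -> None:
        self.mgr = mgr
        self.pool = pool
        self.group = group
        self.gws: Dict[str, str] = {}  # daemon_id -> gateway name

    def prepare_create(self, daemon_id: str, name: str) -> None:
        # Only record the gateway; do NOT tell the monitor yet. If the
        # daemon never comes up, no monitor-side state is created.
        self.gws[daemon_id] = name

    def daemon_check_post(self, running_ids: List[str]) -> None:
        # The daemon is confirmed running, so registering it with the
        # monitor here cannot leave a zombie entry behind.
        for daemon_id in running_ids:
            name = self.gws[daemon_id]
            _, _, err = self.mgr.mon_command({
                'prefix': 'nvme-gw create',
                'id': name, 'group': self.group, 'pool': self.pool,
            })
            if err:
                print(f"create failed for {name}: {err}")

    def purge(self) -> None:
        # Iterate over a snapshot of the keys so popping entries does
        # not mutate the dict mid-iteration.
        for daemon_id in list(self.gws):
            name = self.gws.pop(daemon_id)
            _, _, err = self.mgr.mon_command({
                'prefix': 'nvme-gw delete',
                'id': name, 'group': self.group, 'pool': self.pool,
            })
            if err:
                print(f"delete failed for {name}: {err}")

if __name__ == '__main__':
    mgr = FakeMgr()
    svc = SketchNvmeofService(mgr, pool='rbd', group='g1')
    svc.prepare_create('nvmeof.a', 'nvmeof.rbd.g1.host1')
    svc.daemon_check_post(['nvmeof.a'])  # registered once running
    svc.purge()                          # no zombie entries remain
    assert [c['prefix'] for c in mgr.sent] == ['nvme-gw create',
                                               'nvme-gw delete']

Registering only after the daemon is confirmed running mirrors the
dashboard callback path, so a failed deployment never creates
monitor-side state that purge would later have to clean up.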