]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph-ci.git/commitdiff
ceph-nvmeof-mon fixes
authorAlexander Indenbaum <aindenba@redhat.com>
Thu, 9 May 2024 11:46:31 +0000 (11:46 +0000)
committerAlexander Indenbaum <aindenba@redhat.com>
Thu, 20 Nov 2025 08:55:27 +0000 (10:55 +0200)
Resolves: rhbz#2279862

- ceph-nvmeof-mon: nvme-gw create/delete
  * move creation to the `daemon_check_post` method to prevent zombie creation.
    this change aligns with the dashboard logic, which uses the same callback.
  * implement `purge` method to ensure that no zombie gateways are left behind
    once the service is removed.
Signed-off-by: Alexander Indenbaum <aindenba@redhat.com>
- nvmeof service spec: enable_monitor_client by default
Signed-off-by: Alexander Indenbaum <aindenba@redhat.com>
- fix gw stuck in ana-state GW_WAIT_FAILBACK_PREPARED
Signed-off-by: Leonid Chernin <leonidc@il.ibm.com>
Signed-off-by: Alexander Indenbaum <aindenba@redhat.com>
(cherry picked from commit b0c764b6e22fbb51f59e469d9dd99895c152f73e)

src/mon/NVMeofGwMap.cc
src/pybind/mgr/cephadm/services/nvmeof.py

index 1d656574b8d22c3ec75ac640a8546190ea24064e..f4af5d0caae6d05fbe2c2891c9b82079b4b9a19d 100755 (executable)
@@ -471,10 +471,12 @@ void NVMeofGwMap::fsm_handle_to_expired(const NvmeGwId &gw_id, const NvmeGroupKe
             auto& st = gw_state.second;
             if (st.ana_grp_id == grpid){// group owner
                 grp_owner_found = true;
-                if( ! (fbp_gw_state.last_gw_map_epoch_valid  && st.last_gw_map_epoch_valid) ){
-                   //Timer is not cancelled so it would expire over and over as long as both gws are not updated
-                   dout(1) << "gw " << gw_id  <<" or gw " << gw_state.first  << "map epochs are not updated "<< dendl;
-                   return;
+                if(st.availability == GW_AVAILABILITY_E::GW_AVAILABLE) {
+                   if( ! (fbp_gw_state.last_gw_map_epoch_valid  && st.last_gw_map_epoch_valid) ){
+                     //Timer is not cancelled so it would expire over and over as long as both gws are not updated
+                     dout(1) << "gw " << gw_id  <<" or gw " << gw_state.first  << "map epochs are not updated "<< dendl;
+                     return;
+                   }
                 }
                 cancel_timer(gw_id, group_key, grpid);
                 if (st.sm_state[grpid] == GW_STATES_PER_AGROUP_E::GW_OWNER_WAIT_FAILBACK_PREPARED && st.availability == GW_AVAILABILITY_E::GW_AVAILABLE )
index 454f5b8c11a5b35e493a22affa8f310733496b2b..ac01bc7c4b5c178edae061145d49edc55fb59622 100644 (file)
@@ -56,21 +56,44 @@ class NvmeofService(CephService):
         daemon_spec.extra_files = {'ceph-nvmeof.conf': gw_conf}
         daemon_spec.final_config, daemon_spec.deps = self.generate_config(daemon_spec)
         daemon_spec.deps = []
-        # Notify monitor about this gateway creation
-        cmd = {
-            'prefix': 'nvme-gw create',
-            'id': name,
-            'group': spec.group,
-            'pool': spec.pool
-        }
-        _, _, err = self.mgr.mon_command(cmd)
-        # if send command failed, raise assertion exception, failing the daemon creation
-        assert not err, f"Unable to send monitor command {cmd}, error {err}"
         if not hasattr(self, 'gws'):
             self.gws = {} # id -> name map of gateways for this service.
         self.gws[nvmeof_gw_id] = name # add to map of service's gateway names
         return daemon_spec
 
+    def daemon_check_post(self, daemon_descrs: List[DaemonDescription]) -> None:
+        """ Overrides the daemon_check_post to add nvmeof gateways safely
+        """
+        self.mgr.log.info(f"nvmeof daemon_check_post {daemon_descrs}")
+        # Assert configured
+        assert self.pool
+        assert self.group is not None
+        # self.gws is created lazily in prepare_create, so it may not exist
+        # yet in this mgr incarnation (e.g. right after a mgr restart).
+        if not hasattr(self, 'gws'):
+            self.gws = {}
+        for dd in daemon_descrs:
+            self.mgr.log.info(f"nvmeof daemon_descr {dd}")
+            name = self.gws.get(dd.daemon_id)
+            if name is None:
+                # Unknown daemon (in-memory map is lost on mgr failover):
+                # skip it instead of asserting and crashing the caller.
+                self.mgr.log.warning(f"nvmeof daemon {dd.daemon_id} not in gws map, skipping")
+                continue
+            self.mgr.log.info(f"nvmeof daemon name={name}")
+            # Notify monitor about this gateway creation
+            cmd = {
+                'prefix': 'nvme-gw create',
+                'id': name,
+                'group': self.group,
+                'pool': self.pool
+            }
+            self.mgr.log.info(f"create gateway: monitor command {cmd}")
+            _, _, err = self.mgr.mon_command(cmd)
+            if err:
+                self.mgr.log.error(f"Unable to send monitor command {cmd}, error {err}")
+        super().daemon_check_post(daemon_descrs)
+
     def config_dashboard(self, daemon_descrs: List[DaemonDescription]) -> None:
         # TODO: what integration do we need with the dashboard?
         pass
@@ -124,12 +139,28 @@ class NvmeofService(CephService):
             'group': self.group,
             'pool': self.pool
         }
+        self.mgr.log.info(f"delete gateway: monitor command {cmd}")
         _, _, err = self.mgr.mon_command(cmd)
         if err:
             self.mgr.log.error(f"Unable to send monitor command {cmd}, error {err}")
 
     def purge(self, service_name: str) -> None:
-        """Removes configuration
+        """Make sure no zombie gateway is left behind
         """
-        #  TODO: what should we purge in this case (if any)?
-        pass
+        # Assert configured
+        assert self.pool
+        assert self.group is not None
+        for daemon_id in self.gws:
+            name = self.gws[daemon_id]
+            self.gws.pop(daemon_id)
+            # Notify monitor about this gateway deletion
+            cmd = {
+                'prefix': 'nvme-gw delete',
+                'id': name,
+                'group': self.group,
+                'pool': self.pool
+            }
+            self.mgr.log.info(f"purge delete gateway: monitor command {cmd}")
+            _, _, err = self.mgr.mon_command(cmd)
+            if err:
+                self.mgr.log.error(f"Unable to send monitor command {cmd}, error {err}")