From: Alexander Indenbaum
Date: Thu, 9 May 2024 11:46:31 +0000 (+0000)
Subject: ceph-nvmeof-mon fixes
X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=5319ed2e868ae60588443c317870ecc525f81295;p=ceph-ci.git

ceph-nvmeof-mon fixes

Resolves: rhbz#2279862

- ceph-nvmeof-mon: nvme-gw create/delete

  * Move gateway creation to the `daemon_check_post` method to prevent
    the creation of zombie gateways. This change aligns with the
    dashboard logic, which uses the same callback.
  * Implement a `purge` method to ensure that no zombie gateways are
    left behind once the service is removed.

Signed-off-by: Alexander Indenbaum

- nvmeof service spec: set enable_monitor_client by default

Signed-off-by: Alexander Indenbaum

- fix gw stuck in ANA state GW_WAIT_FAILBACK_PREPARED

Signed-off-by: Leonid Chernin
Signed-off-by: Alexander Indenbaum

(cherry picked from commit b0c764b6e22fbb51f59e469d9dd99895c152f73e)
---

diff --git a/src/mon/NVMeofGwMap.cc b/src/mon/NVMeofGwMap.cc
index 1d656574b8d..f4af5d0caae 100755
--- a/src/mon/NVMeofGwMap.cc
+++ b/src/mon/NVMeofGwMap.cc
@@ -471,10 +471,12 @@ void NVMeofGwMap::fsm_handle_to_expired(const NvmeGwId &gw_id, const NvmeGroupKe
         auto& st = gw_state.second;
         if (st.ana_grp_id == grpid){// group owner
             grp_owner_found = true;
-            if( ! (fbp_gw_state.last_gw_map_epoch_valid && st.last_gw_map_epoch_valid) ){
-                //Timer is not cancelled so it would expire over and over as long as both gws are not updated
-                dout(1) << "gw " << gw_id <<" or gw " << gw_state.first << "map epochs are not updated "<< dendl;
-                return;
+            if(st.availability == GW_AVAILABILITY_E::GW_AVAILABLE) {
+                if( ! (fbp_gw_state.last_gw_map_epoch_valid && st.last_gw_map_epoch_valid) ){
+                    //Timer is not cancelled so it would expire over and over as long as both gws are not updated
+                    dout(1) << "gw " << gw_id <<" or gw " << gw_state.first << "map epochs are not updated "<< dendl;
+                    return;
+                }
             }
             cancel_timer(gw_id, group_key, grpid);
             if (st.sm_state[grpid] == GW_STATES_PER_AGROUP_E::GW_OWNER_WAIT_FAILBACK_PREPARED && st.availability == GW_AVAILABILITY_E::GW_AVAILABLE )
diff --git a/src/pybind/mgr/cephadm/services/nvmeof.py b/src/pybind/mgr/cephadm/services/nvmeof.py
index 454f5b8c11a..ac01bc7c4b5 100644
--- a/src/pybind/mgr/cephadm/services/nvmeof.py
+++ b/src/pybind/mgr/cephadm/services/nvmeof.py
@@ -56,21 +56,36 @@ class NvmeofService(CephService):
         daemon_spec.extra_files = {'ceph-nvmeof.conf': gw_conf}
         daemon_spec.final_config, daemon_spec.deps = self.generate_config(daemon_spec)
         daemon_spec.deps = []
-        # Notify monitor about this gateway creation
-        cmd = {
-            'prefix': 'nvme-gw create',
-            'id': name,
-            'group': spec.group,
-            'pool': spec.pool
-        }
-        _, _, err = self.mgr.mon_command(cmd)
-        # if send command failed, raise assertion exception, failing the daemon creation
-        assert not err, f"Unable to send monitor command {cmd}, error {err}"
         if not hasattr(self, 'gws'):
             self.gws = {} # id -> name map of gateways for this service.
         self.gws[nvmeof_gw_id] = name # add to map of service's gateway names
         return daemon_spec

+    def daemon_check_post(self, daemon_descrs: List[DaemonDescription]) -> None:
+        """Overrides daemon_check_post to register nvmeof gateways safely
+        """
+        self.mgr.log.info(f"nvmeof daemon_check_post {daemon_descrs}")
+        # The service spec must have a configured pool and group
+        assert self.pool
+        assert self.group is not None
+        for dd in daemon_descrs:
+            self.mgr.log.info(f"nvmeof daemon_descr {dd}")
+            assert dd.daemon_id in self.gws
+            name = self.gws[dd.daemon_id]
+            self.mgr.log.info(f"nvmeof daemon name={name}")
+            # Notify the monitor about this gateway creation
+            cmd = {
+                'prefix': 'nvme-gw create',
+                'id': name,
+                'group': self.group,
+                'pool': self.pool
+            }
+            self.mgr.log.info(f"create gateway: monitor command {cmd}")
+            _, _, err = self.mgr.mon_command(cmd)
+            if err:
+                self.mgr.log.error(f"Unable to send monitor command {cmd}, error {err}")
+        super().daemon_check_post(daemon_descrs)
+
     def config_dashboard(self, daemon_descrs: List[DaemonDescription]) -> None:
         # TODO: what integration do we need with the dashboard?
         pass
@@ -124,12 +139,28 @@ class NvmeofService(CephService):
             'group': self.group,
             'pool': self.pool
         }
+        self.mgr.log.info(f"delete gateway: monitor command {cmd}")
         _, _, err = self.mgr.mon_command(cmd)
         if err:
             self.mgr.log.error(f"Unable to send monitor command {cmd}, error {err}")

     def purge(self, service_name: str) -> None:
-        """Removes configuration
+        """Make sure no zombie gateways are left behind
         """
-        # TODO: what should we purge in this case (if any)?
-        pass
+        # The service spec must have a configured pool and group
+        assert self.pool
+        assert self.group is not None
+        for daemon_id in list(self.gws):  # copy the keys: we pop while iterating
+            name = self.gws[daemon_id]
+            self.gws.pop(daemon_id)
+            # Notify the monitor about this gateway deletion
+            cmd = {
+                'prefix': 'nvme-gw delete',
+                'id': name,
+                'group': self.group,
+                'pool': self.pool
+            }
+            self.mgr.log.info(f"purge delete gateway: monitor command {cmd}")
+            _, _, err = self.mgr.mon_command(cmd)
+            if err:
+                self.mgr.log.error(f"Unable to send monitor command {cmd}, error {err}")
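
Reviewer note, not part of the patch: below is a minimal standalone sketch
of the create-on-check / delete-on-purge lifecycle this series implements.
The FakeMgr class, the gateway name, and the simplified method signatures
are illustrative assumptions only; the real service talks to the monitor
through mgr.mon_command exactly as in the diff above.

# Sketch of the gateway bookkeeping lifecycle (illustrative only).
from typing import Dict, List, Tuple

class FakeMgr:
    """Stands in for the mgr module; records monitor commands."""
    def __init__(self) -> None:
        self.sent: List[dict] = []

    def mon_command(self, cmd: dict) -> Tuple[int, str, str]:
        self.sent.append(cmd)
        return 0, '', ''  # retcode, stdout, stderr

class SketchNvmeofService:
    def __init__(self, mgr: FakeMgr, pool: str, group: str) -> None:
        self.mgr = mgr
        self.pool = pool
        self.group = group
        self.gws: Dict[str, str] = {}  # daemon_id -> gateway name

    def prepare_create(self, daemon_id: str, name: str) -> None:
        # Only record the gateway; do NOT tell the monitor yet. If the
        # daemon never comes up, no monitor-side state is created.
        self.gws[daemon_id] = name

    def daemon_check_post(self, running_ids: List[str]) -> None:
        # The daemon is confirmed running, so registering it with the
        # monitor here cannot leave a zombie entry behind.
        for daemon_id in running_ids:
            name = self.gws[daemon_id]
            _, _, err = self.mgr.mon_command({
                'prefix': 'nvme-gw create',
                'id': name, 'group': self.group, 'pool': self.pool,
            })
            if err:
                print(f"create failed for {name}: {err}")

    def purge(self) -> None:
        # Iterate over a snapshot of the keys so popping entries does
        # not mutate the dict mid-iteration.
        for daemon_id in list(self.gws):
            name = self.gws.pop(daemon_id)
            _, _, err = self.mgr.mon_command({
                'prefix': 'nvme-gw delete',
                'id': name, 'group': self.group, 'pool': self.pool,
            })
            if err:
                print(f"delete failed for {name}: {err}")

if __name__ == '__main__':
    mgr = FakeMgr()
    svc = SketchNvmeofService(mgr, pool='rbd', group='g1')
    svc.prepare_create('nvmeof.a', 'nvmeof.rbd.g1.host1')
    svc.daemon_check_post(['nvmeof.a'])  # registered once running
    svc.purge()                          # no zombie entries remain
    assert [c['prefix'] for c in mgr.sent] == ['nvme-gw create',
                                               'nvme-gw delete']

Registering only after the daemon is confirmed running mirrors the
dashboard callback path, so a failed deployment never creates
monitor-side state that purge would later have to clean up.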