From: Shweta Bhosale
Date: Fri, 14 Nov 2025 12:04:25 +0000 (+0530)
Subject: mgr/cephadm: Handle ganesha-rados-grace tool failure
X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=a7c4259ba0daea3d23fc73c6b856f04c13ca115b;p=ceph-ci.git

mgr/cephadm: Handle ganesha-rados-grace tool failure

Fixes: https://tracker.ceph.com/issues/73851
Signed-off-by: Shweta Bhosale
---

diff --git a/src/pybind/mgr/cephadm/serve.py b/src/pybind/mgr/cephadm/serve.py
index b61db25ba65..d5816026731 100644
--- a/src/pybind/mgr/cephadm/serve.py
+++ b/src/pybind/mgr/cephadm/serve.py
@@ -33,6 +33,7 @@ from cephadm.utils import forall_hosts, cephadmNoImage, is_repo_digest, \
 from mgr_module import MonCommandFailed
 from mgr_util import format_bytes
 from cephadm.services.service_registry import service_registry
+from cephadm.services.nfs import NFSService
 
 from . import utils
 from . import exchange
@@ -120,6 +121,8 @@ class CephadmServe:
 
                     self._check_for_moved_osds()
 
+                    self._retry_failed_operations()
+
                     if self.mgr.agent_helpers._handle_use_agent_setting():
                         continue
 
@@ -1869,6 +1872,34 @@ class CephadmServe:
         await self.mgr.ssh._write_remote_file(host, self.mgr.cephadm_binary_path,
                                               self.mgr._cephadm, addr=addr)
 
+    def _retry_failed_operations(self) -> None:
+        """Retry operations that failed in an earlier serve loop iteration.
+
+        Currently this re-runs NFS old-rank fencing for every service listed
+        under the 'nfs_fencing_failed_services' store key, and drops entries
+        whose service spec no longer exists.
+        """
+        self.log.debug('_retry_failed_operations')
+        # retry nfs fencing for failed specs
+        failed_services = self.mgr.get_store('nfs_fencing_failed_services')
+        services = failed_services.split(',') if failed_services else []
+        if not services:
+            return
+        svc = cast(NFSService, service_registry.get_service('nfs'))
+        to_remove = []
+        for service_name in services:
+            if service_name not in self.mgr.spec_store:
+                to_remove.append(service_name)
+                continue
+            spec_entry = self.mgr.spec_store[service_name]
+            rank_map = spec_entry.rank_map or {}
+            daemons = self.mgr.cache.get_daemons_by_service(service_name)
+            self.log.debug('Retry NFS fence old rank for %s service', service_name)
+            svc.fence_old_ranks(spec_entry.spec, rank_map, len(daemons))
+        if to_remove:
+            self.log.debug('Remove NFS service from retry fence old ranks as services %s are removed', to_remove)
+            svc.update_failed_fencing_services_remove_missing(to_remove)
+
 
 def _host_selector(svc: Any) -> Optional[HostSelector]:
     if hasattr(svc, 'filter_host_candidates'):
diff --git a/src/pybind/mgr/cephadm/services/nfs.py b/src/pybind/mgr/cephadm/services/nfs.py
index d622e68e4b8..bb8c9fa5c57 100644
--- a/src/pybind/mgr/cephadm/services/nfs.py
+++ b/src/pybind/mgr/cephadm/services/nfs.py
@@ -4,6 +4,7 @@ import logging
 import os
 import subprocess
 import tempfile
+from threading import Lock
 from typing import Dict, Tuple, Any, List, cast, Optional, TYPE_CHECKING
 from configparser import ConfigParser
 from io import StringIO
@@ -28,6 +29,11 @@ class NFSService(CephService):
     TYPE = 'nfs'
     DEFAULT_EXPORTER_PORT = 9587
 
+    def __init__(self, mgr: "CephadmOrchestrator"):
+        super().__init__(mgr)
+        # guards read-modify-write updates of the failed-fencing store key
+        self._fencing_store_lock = Lock()
+
     @property
     def needs_monitoring(self) -> bool:
         return True
@@ -35,6 +41,58 @@ class NFSService(CephService):
     def ranked(self, spec: ServiceSpec) -> bool:
         return True
 
+    def _update_failed_fencing_services(
+        self,
+        service_name: str,
+        add_if_failed: bool,
+        remove_if_success: bool
+    ) -> None:
+        """Record or clear a service in the failed-fencing retry list.
+
+        The list is kept as a comma-separated string under the
+        'nfs_fencing_failed_services' store key.
+
+        :param service_name: NFS service whose fencing result is recorded
+        :param add_if_failed: add the service to the list if it is missing
+        :param remove_if_success: remove the service from the list; only
+            honoured when add_if_failed is False
+        """
+        with self._fencing_store_lock:
+            failed_services = self.mgr.get_store('nfs_fencing_failed_services')
+            services = failed_services.split(',') if failed_services else []
+            changed = False
+            if add_if_failed:
+                if service_name not in services:
+                    services.append(service_name)
+                    changed = True
+            elif remove_if_success:
+                if service_name in services:
+                    services.remove(service_name)
+                    changed = True
+            # this runs after every fence_old_ranks() call, so skip the
+            # store write when the list did not change
+            if changed:
+                val = ','.join(services) if services else None
+                self.mgr.set_store('nfs_fencing_failed_services', val)
+
+    def update_failed_fencing_services_remove_missing(self, services_to_remove: List[str]) -> None:
+        """Drop the given services from the failed-fencing retry list.
+
+        :param services_to_remove: service names to remove; names that are
+            not in the list are ignored
+        """
+        if not services_to_remove:
+            return
+        with self._fencing_store_lock:
+            failed_services = self.mgr.get_store('nfs_fencing_failed_services')
+            services = failed_services.split(',') if failed_services else []
+            updated = [s for s in services if s not in services_to_remove]
+            if len(updated) == len(services):
+                # nothing was removed, avoid a redundant store write
+                return
+            val = ','.join(updated) if updated else None
+            self.mgr.set_store('nfs_fencing_failed_services', val)
+
     def fence(self, daemon_id: str) -> None:
         logger.info(f'Fencing old nfs.{daemon_id}')
         ret, out, err = self.mgr.mon_command({
@@ -48,16 +106,28 @@ class NFSService(CephService):
                         spec: ServiceSpec,
                         rank_map: Dict[int, Dict[int, Optional[str]]],
                         num_ranks: int) -> None:
+        service_name = spec.service_name()
+        fence_failed = False
         for rank, m in list(rank_map.items()):
             if rank >= num_ranks:
                 for daemon_id in m.values():
                     if daemon_id is not None:
                         self.fence(daemon_id)
-                del rank_map[rank]
                 nodeid = f'{rank}'
-                self.mgr.log.info(f'Removing {nodeid} from the ganesha grace table')
-                self.run_grace_tool(cast(NFSServiceSpec, spec), 'remove', nodeid)
-                self.mgr.spec_store.save_rank_map(spec.service_name(), rank_map)
+                self.mgr.log.info(
+                    "Removing %s from the ganesha grace table for service %s", nodeid, service_name
+                )
+                try:
+                    self.run_grace_tool(cast(NFSServiceSpec, spec), 'remove', nodeid)
+                    # Only delete from rank_map if grace tool succeeds
+                    del rank_map[rank]
+                    self.mgr.spec_store.save_rank_map(service_name, rank_map)
+                except (RuntimeError, subprocess.TimeoutExpired):
+                    # run_grace_tool() runs the tool with a timeout, so a hung
+                    # tool surfaces as TimeoutExpired rather than RuntimeError;
+                    # both leave the rank in place to be retried later
+                    self.mgr.log.exception('Got exception while removing node id from grace table')
+                    fence_failed = True
             else:
                 max_gen = max(m.keys())
                 for gen, daemon_id in list(m.items()):
@@ -65,7 +135,8 @@ class NFSService(CephService):
                         if daemon_id is not None:
                             self.fence(daemon_id)
                         del rank_map[rank][gen]
-                        self.mgr.spec_store.save_rank_map(spec.service_name(), rank_map)
+                        self.mgr.spec_store.save_rank_map(service_name, rank_map)
+        self._update_failed_fencing_services(service_name, fence_failed, not fence_failed)
 
     def config(self, spec: NFSServiceSpec) -> None:  # type: ignore
         from nfs.cluster import create_ganesha_pool
@@ -117,7 +188,9 @@ class NFSService(CephService):
         rados_keyring = self.create_keyring(daemon_spec)
 
         # ensure rank is known to ganesha
-        self.mgr.log.info(f'Ensuring {nodeid} is in the ganesha grace table')
+        self.mgr.log.info(
+            'Ensuring %s is in the ganesha grace table for service %s', nodeid, daemon_spec.service_name
+        )
         self.run_grace_tool(spec, 'add', nodeid)
 
         # create the rados config object
@@ -316,10 +389,19 @@ class NFSService(CephService):
             result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                                     timeout=10)
             if result.returncode:
+                stderr = result.stderr.decode("utf-8")
                 self.mgr.log.warning(
-                    f'ganesha-rados-grace tool failed: {result.stderr.decode("utf-8")}'
+                    'ganesha-rados-grace tool failed for service %s, err: %s rc: %s',
+                    spec.service_name(), stderr, result.returncode
                 )
-                raise RuntimeError(f'grace tool failed: {result.stderr.decode("utf-8")}')
+                if action == 'remove' and 'Failure: -126' in stderr:
+                    self.mgr.log.info(
+                        'Ignore ganesha-rados-grace tool remove failure as %s does not exists for %s service',
+                        nodeid, spec.service_name()
+                    )
+                    return
+
+                raise RuntimeError(f'grace tool failed for service {spec.service_name()}: {stderr}')
 
         finally:
             self.mgr.check_mon_command({