From a03d33117967d8685a5a13eb3de50d7908a78275 Mon Sep 17 00:00:00 2001
From: Adam King
Date: Wed, 29 Jan 2025 15:48:53 -0500
Subject: [PATCH] mgr/cephadm: continue in nfs service purge if grace file is
 already deleted

The test_nfs task we run in teuthology creates and removes
a number of nfs clusters during the task. I think it's possible
based on timing for it to end up in a situation where it
tries to remove an nfs service before the grace file
has been created. In that case, cephadm doesn't know it
hasn't created the grace file and just repeatedly fails
forever attempting to remove the nonexistent file.

This patch adds handling for the error case where
we get a nonzero rc but the error message implies
the command failed because the file already does not exist.

Fixes: https://tracker.ceph.com/issues/69736

Signed-off-by: Adam King
---
 src/pybind/mgr/cephadm/services/nfs.py | 26 +++++++++++++++++++-------
 1 file changed, 19 insertions(+), 7 deletions(-)

diff --git a/src/pybind/mgr/cephadm/services/nfs.py b/src/pybind/mgr/cephadm/services/nfs.py
index 8b6ea752d54..ee12175ff6d 100644
--- a/src/pybind/mgr/cephadm/services/nfs.py
+++ b/src/pybind/mgr/cephadm/services/nfs.py
@@ -14,7 +14,7 @@ from mgr_module import NFS_POOL_NAME as POOL_NAME
 from ceph.deployment.service_spec import ServiceSpec, NFSServiceSpec
 
 from .service_registry import register_cephadm_service
-from orchestrator import DaemonDescription
+from orchestrator import DaemonDescription, OrchestratorError
 
 from cephadm.services.cephadmservice import AuthEntity, CephadmDaemonDeploySpec, CephService
 
@@ -319,12 +319,24 @@ class NFSService(CephService):
             '--namespace', cast(str, spec.service_id),
             'rm', 'grace',
         ]
-        subprocess.run(
-            cmd,
-            stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE,
-            timeout=10
-        )
+        try:
+            result = subprocess.run(
+                cmd,
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                timeout=10
+            )
+        except Exception as e:
+            err_msg = f'Got unexpected exception trying to remove ganesha grace file for nfs.{spec.service_id} service: {str(e)}'
+            self.mgr.log.warning(err_msg)
+            raise OrchestratorError(err_msg)
+        if result.returncode:
+            if "No such file" in result.stderr.decode('utf-8'):
+                logger.info(f'Grace file for nfs.{spec.service_id} already deleted')
+            else:
+                err_msg = f'Failed to remove ganesha grace file for nfs.{spec.service_id} service: {result.stderr.decode("utf-8")}'
+                self.mgr.log.warning(err_msg)
+                raise OrchestratorError(err_msg)
 
     def _haproxy_hosts(self) -> List[str]:
         # NB: Ideally, we would limit the list to IPs on hosts running
-- 
2.39.5