From: Adam King Date: Wed, 29 Jan 2025 20:48:53 +0000 (-0500) Subject: mgr/cephadm: continue in nfs service purge if grace file is already deleted X-Git-Tag: v20.0.0~243^2 X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=a03d33117967d8685a5a13eb3de50d7908a78275;p=ceph.git mgr/cephadm: continue in nfs service purge if grace file is already deleted The test_nfs task we run in teuthology creates and removes a number of nfs clusters during the task. I think it's possible based on timing for it to end up in a situation where it tries to remove an nfs service before the grace file has been created. In that case, cephadm doesn't know it hasn't created the grace file and just repeatedly fails forever attempting to remove the nonexistent file. This patch adds handling for the error case where we get a nonzero rc but the error message implies the command failed because the file already does not exist. Fixes: https://tracker.ceph.com/issues/69736 Signed-off-by: Adam King --- diff --git a/src/pybind/mgr/cephadm/services/nfs.py b/src/pybind/mgr/cephadm/services/nfs.py index 8b6ea752d5484..ee12175ff6de3 100644 --- a/src/pybind/mgr/cephadm/services/nfs.py +++ b/src/pybind/mgr/cephadm/services/nfs.py @@ -14,7 +14,7 @@ from mgr_module import NFS_POOL_NAME as POOL_NAME from ceph.deployment.service_spec import ServiceSpec, NFSServiceSpec from .service_registry import register_cephadm_service -from orchestrator import DaemonDescription +from orchestrator import DaemonDescription, OrchestratorError from cephadm.services.cephadmservice import AuthEntity, CephadmDaemonDeploySpec, CephService @@ -319,12 +319,24 @@ class NFSService(CephService): '--namespace', cast(str, spec.service_id), 'rm', 'grace', ] - subprocess.run( - cmd, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - timeout=10 - ) + try: + result = subprocess.run( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + timeout=10 + ) + except Exception as e: + err_msg = f'Got unexpected exception trying to remove ganesha grace file for nfs.{spec.service_id} service: {str(e)}' + self.mgr.log.warning(err_msg) + raise OrchestratorError(err_msg) + if result.returncode: + if "No such file" in result.stderr.decode('utf-8'): + logger.info(f'Grace file for nfs.{spec.service_id} already deleted') + else: + err_msg = f'Failed to remove ganesha grace file for nfs.{spec.service_id} service: {result.stderr.decode("utf-8")}' + self.mgr.log.warning(err_msg) + raise OrchestratorError(err_msg) def _haproxy_hosts(self) -> List[str]: # NB: Ideally, we would limit the list to IPs on hosts running