git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
mgr/cephadm: continue in nfs service purge if grace file is already deleted 61594/head
author Adam King <adking@redhat.com>
Wed, 29 Jan 2025 20:48:53 +0000 (15:48 -0500)
committer Adam King <adking@redhat.com>
Wed, 5 Feb 2025 00:01:41 +0000 (19:01 -0500)
The test_nfs task we run in teuthology creates and removes a number of
nfs clusters during the task. Depending on timing, it appears possible for
the task to try to remove an nfs service before the grace file has been
created. In that case, cephadm does not know that the grace file was never
created, so it fails repeatedly, forever, attempting to remove the
nonexistent file. This patch adds handling for the error case where we get
a nonzero rc but the error message implies the command failed because the
file already does not exist.

Fixes: https://tracker.ceph.com/issues/69736
Signed-off-by: Adam King <adking@redhat.com>
src/pybind/mgr/cephadm/services/nfs.py

index 8b6ea752d5484e0793b2f9cdded63108b37687ba..ee12175ff6de33ad260a37c5e240372127c7b669 100644 (file)
@@ -14,7 +14,7 @@ from mgr_module import NFS_POOL_NAME as POOL_NAME
 from ceph.deployment.service_spec import ServiceSpec, NFSServiceSpec
 from .service_registry import register_cephadm_service
 
-from orchestrator import DaemonDescription
+from orchestrator import DaemonDescription, OrchestratorError
 
 from cephadm.services.cephadmservice import AuthEntity, CephadmDaemonDeploySpec, CephService
 
@@ -319,12 +319,24 @@ class NFSService(CephService):
             '--namespace', cast(str, spec.service_id),
             'rm', 'grace',
         ]
-        subprocess.run(
-            cmd,
-            stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE,
-            timeout=10
-        )
+        try:
+            result = subprocess.run(
+                cmd,
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                timeout=10
+            )
+        except Exception as e:
+            err_msg = f'Got unexpected exception trying to remove ganesha grace file for nfs.{spec.service_id} service: {str(e)}'
+            self.mgr.log.warning(err_msg)
+            raise OrchestratorError(err_msg)
+        if result.returncode:
+            if "No such file" in result.stderr.decode('utf-8'):
+                logger.info(f'Grace file for nfs.{spec.service_id} already deleted')
+            else:
+                err_msg = f'Failed to remove ganesha grace file for nfs.{spec.service_id} service: {result.stderr.decode("utf-8")}'
+                self.mgr.log.warning(err_msg)
+                raise OrchestratorError(err_msg)
 
     def _haproxy_hosts(self) -> List[str]:
         # NB: Ideally, we would limit the list to IPs on hosts running