From a03d33117967d8685a5a13eb3de50d7908a78275 Mon Sep 17 00:00:00 2001
From: Adam King <adking@redhat.com>
Date: Wed, 29 Jan 2025 15:48:53 -0500
Subject: [PATCH] mgr/cephadm: continue in nfs service purge if grace file is
 already deleted

The test_nfs task we run in teuthology creates and removes a number of
nfs clusters during the task. Depending on timing, I think it can end up
trying to remove an nfs service before the grace file has been created.
In that case, cephadm doesn't know that the grace file was never created
and just repeatedly fails forever attempting to remove the nonexistent
file. This patch adds handling for the error case where we get a nonzero
rc but the error message implies the command failed because the file
does not exist.

Fixes: https://tracker.ceph.com/issues/69736

Signed-off-by: Adam King <adking@redhat.com>
---
 src/pybind/mgr/cephadm/services/nfs.py | 26 +++++++++++++++++++-------
 1 file changed, 19 insertions(+), 7 deletions(-)

diff --git a/src/pybind/mgr/cephadm/services/nfs.py b/src/pybind/mgr/cephadm/services/nfs.py
index 8b6ea752d54..ee12175ff6d 100644
--- a/src/pybind/mgr/cephadm/services/nfs.py
+++ b/src/pybind/mgr/cephadm/services/nfs.py
@@ -14,7 +14,7 @@ from mgr_module import NFS_POOL_NAME as POOL_NAME
 from ceph.deployment.service_spec import ServiceSpec, NFSServiceSpec
 from .service_registry import register_cephadm_service
 
-from orchestrator import DaemonDescription
+from orchestrator import DaemonDescription, OrchestratorError
 
 from cephadm.services.cephadmservice import AuthEntity, CephadmDaemonDeploySpec, CephService
 
@@ -319,12 +319,24 @@ class NFSService(CephService):
             '--namespace', cast(str, spec.service_id),
             'rm', 'grace',
         ]
-        subprocess.run(
-            cmd,
-            stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE,
-            timeout=10
-        )
+        try:
+            result = subprocess.run(
+                cmd,
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                timeout=10
+            )
+        except Exception as e:
+            err_msg = f'Got unexpected exception trying to remove ganesha grace file for nfs.{spec.service_id} service: {str(e)}'
+            self.mgr.log.warning(err_msg)
+            raise OrchestratorError(err_msg)
+        if result.returncode:
+            if "No such file" in result.stderr.decode('utf-8'):
+                logger.info(f'Grace file for nfs.{spec.service_id} already deleted')
+            else:
+                err_msg = f'Failed to remove ganesha grace file for nfs.{spec.service_id} service: {result.stderr.decode("utf-8")}'
+                self.mgr.log.warning(err_msg)
+                raise OrchestratorError(err_msg)
 
     def _haproxy_hosts(self) -> List[str]:
         # NB: Ideally, we would limit the list to IPs on hosts running
-- 
2.39.5