From 457cb8b407de2bf1e2fda43d205a159163d1b73a Mon Sep 17 00:00:00 2001
From: Adam King <adking@redhat.com>
Date: Thu, 22 Jan 2026 11:25:02 -0500
Subject: [PATCH] cephadm: retry cleaning old cgroups when it fails

When redeploying a daemon, the shutdown triggered by cephadm
running `systemctl stop` may not have completed by the time we
try to clean up the old cgroup, in which case the cleanup fails.
In these cases, moving on immediately to start the systemd unit
tends to result in the unit failing to start. This patch adds a
retry (after 0.5, 1.0 and 2.0 seconds) to cleaning the old
cgroups, which should hopefully avoid the race condition and
the daemons failing to start because of it.

Signed-off-by: Adam King <adking@redhat.com>
---
 src/cephadm/cephadm.py | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/src/cephadm/cephadm.py b/src/cephadm/cephadm.py
index 4e255e551656..03234518cc74 100755
--- a/src/cephadm/cephadm.py
+++ b/src/cephadm/cephadm.py
@@ -1015,10 +1015,18 @@ def clean_cgroup(ctx: CephadmContext, fsid: str, unit_name: str) -> None:
             if p.is_dir():
                 cg_trim(p)
         path.rmdir()
-    try:
-        cg_trim(cg_path)
-    except OSError:
-        logger.warning(f'Failed to trim old cgroups {cg_path}')
+
+    for s in [0.5, 1.0, 2.0, False]:
+        try:
+            cg_trim(cg_path)
+        except OSError:
+            if not s:
+                logger.warning(f'Failed 4 times to trim old cgroups <{cg_path}>. Giving up!')
+            else:
+                logger.warning(f'Failed to trim old cgroups <{cg_path}>. Retrying in {s} seconds...')
+                time.sleep(s)
+        else:
+            break
 
 
 def deploy_daemon_units(
-- 
2.47.3