From 457cb8b407de2bf1e2fda43d205a159163d1b73a Mon Sep 17 00:00:00 2001 From: Adam King Date: Thu, 22 Jan 2026 11:25:02 -0500 Subject: [PATCH] cephadm: retry cleaning old cgroups when it fails It is possible that, when attempting to redeploy a daemon, the shutdown of the daemon initiated by cephadm running `systemctl stop` has not yet completed, and we are unable to finish cleaning the old cgroup. In these cases, moving on immediately to try to start the systemd unit tends to result in it failing to start. This patch adds a retry when cleaning the old cgroups, which should hopefully avoid the race condition and the daemons failing to start because of it. Signed-off-by: Adam King --- src/cephadm/cephadm.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/src/cephadm/cephadm.py b/src/cephadm/cephadm.py index 4e255e551656..03234518cc74 100755 --- a/src/cephadm/cephadm.py +++ b/src/cephadm/cephadm.py @@ -1015,10 +1015,18 @@ def clean_cgroup(ctx: CephadmContext, fsid: str, unit_name: str) -> None: if p.is_dir(): cg_trim(p) path.rmdir() - try: - cg_trim(cg_path) - except OSError: - logger.warning(f'Failed to trim old cgroups {cg_path}') + + for s in [0.5, 1.0, 2.0, False]: + try: + cg_trim(cg_path) + except OSError: + if not s: + logger.warning(f'Failed 4 times to trim old cgroups <{cg_path}>. Giving up!') + else: + logger.warning(f'Failed to trim old cgroups <{cg_path}>. Retrying in {s} seconds...') + time.sleep(s) + else: + break def deploy_daemon_units( -- 2.47.3