From: Adam King Date: Thu, 22 Jan 2026 16:25:02 +0000 (-0500) Subject: cephadm: retry cleaning old cgroups when it fails X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=457cb8b407de2bf1e2fda43d205a159163d1b73a;p=ceph.git cephadm: retry cleaning old cgroups when it fails It is possible that, when attempting to redeploy a daemon, the shutdown of the daemon initiated by cephadm running `systemctl stop` has not yet completed, leaving us unable to finish cleaning the old cgroup. In these cases, moving on immediately to start the systemd unit tends to result in the unit failing to start. This patch adds a retry when cleaning the old cgroups, which should avoid the race condition and the resulting daemon start failures. Signed-off-by: Adam King --- diff --git a/src/cephadm/cephadm.py b/src/cephadm/cephadm.py index 4e255e551656..03234518cc74 100755 --- a/src/cephadm/cephadm.py +++ b/src/cephadm/cephadm.py @@ -1015,10 +1015,18 @@ def clean_cgroup(ctx: CephadmContext, fsid: str, unit_name: str) -> None: if p.is_dir(): cg_trim(p) path.rmdir() - try: - cg_trim(cg_path) - except OSError: - logger.warning(f'Failed to trim old cgroups {cg_path}') + + for s in [0.5, 1.0, 2.0, False]: + try: + cg_trim(cg_path) + except OSError: + if not s: + logger.warning(f'Failed 4 times to trim old cgroups <{cg_path}>. Giving up!') + else: + logger.warning(f'Failed to trim old cgroups <{cg_path}>. Retrying in {s} seconds...') + time.sleep(s) + else: + break def deploy_daemon_units(