From: Adam King Date: Fri, 28 Jul 2023 17:24:06 +0000 (-0400) Subject: mgr/cephadm: still remove daemons in error state if ok-to-stop fails X-Git-Tag: v18.2.1~326^2~29 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=c431e3af0fd947edafde8a5ac37fdeceb5c83515;p=ceph.git mgr/cephadm: still remove daemons in error state if ok-to-stop fails The ok-to-stop function works for certain daemons by checking if there are at least a certain number (typically 1) of daemon(s) that are actually running and saying it's not ok-to-stop if that won't be true after the removals. This case breaks down when all the daemons are in error state, making it so cephadm will refuse to remove a set of daemons that aren't even working because they're not "ok to stop". Since ok-to-stop works in a yes-or-no fashion, for something like this, where we want to be willing to remove a certain subset of (or potentially all currently deployed) daemons, it's easier to keep this logic as part of applying the service. Signed-off-by: Adam King (cherry picked from commit 8bf99ba7b0862d7340690c74b6aa3ad995ae1098) --- diff --git a/src/pybind/mgr/cephadm/serve.py b/src/pybind/mgr/cephadm/serve.py index 85532b0b938c..6f557bafdc6f 100644 --- a/src/pybind/mgr/cephadm/serve.py +++ b/src/pybind/mgr/cephadm/serve.py @@ -920,7 +920,18 @@ class CephadmServe: while daemons_to_remove and not _ok_to_stop(daemons_to_remove): # let's find a subset that is ok-to-stop - daemons_to_remove.pop() + non_error_daemon_index = -1 + # prioritize removing daemons in error state + for i, dmon in enumerate(daemons_to_remove): + if dmon.status != DaemonDescriptionStatus.error: + non_error_daemon_index = i + break + if non_error_daemon_index != -1: + daemons_to_remove.pop(non_error_daemon_index) + else: + # all daemons in list are in error state + # we should be able to remove all of them + break for d in daemons_to_remove: r = True assert d.hostname is not None