mgr/cephadm: still remove daemons in error state if ok-to-stop fails

author Adam King <adking@redhat.com>

Fri, 28 Jul 2023 17:24:06 +0000 (13:24 -0400)

committer Adam King <adking@redhat.com>

Thu, 31 Aug 2023 17:36:14 +0000 (13:36 -0400)
author Adam King <adking@redhat.com>
Fri, 28 Jul 2023 17:24:06 +0000 (13:24 -0400)
committer Adam King <adking@redhat.com>
Thu, 31 Aug 2023 17:36:14 +0000 (13:36 -0400)
diff --git a/src/pybind/mgr/cephadm/serve.py b/src/pybind/mgr/cephadm/serve.py

index 85532b0b938c6aae0d6e3255607b6c69d3d974cd..6f557bafdc6fdd3ea70adca7e0e4114600dac8cc 100644 (file)
--- a/src/pybind/mgr/cephadm/serve.py
+++ b/src/pybind/mgr/cephadm/serve.py
@@ -920,7 +920,18 @@ class CephadmServe:
  
              while daemons_to_remove and not _ok_to_stop(daemons_to_remove):
                  # let's find a subset that is ok-to-stop
-                daemons_to_remove.pop()
+                non_error_daemon_index = -1
+                # keep error-state daemons for removal; drop a non-error one first
+                for i, dmon in enumerate(daemons_to_remove):
+                    if dmon.status != DaemonDescriptionStatus.error:
+                        non_error_daemon_index = i
+                        break
+                if non_error_daemon_index != -1:
+                    daemons_to_remove.pop(non_error_daemon_index)
+                else:
+                    # all daemons in list are in error state
+                    # we should be able to remove all of them
+                    break
              for d in daemons_to_remove:
                  r = True
                  assert d.hostname is not None
author	Adam King <adking@redhat.com>
	Fri, 28 Jul 2023 17:24:06 +0000 (13:24 -0400)
committer	Adam King <adking@redhat.com>
	Thu, 31 Aug 2023 17:36:14 +0000 (13:36 -0400)