From f7cf8ce1e6576188841fda5bb9bf30624b092f21 Mon Sep 17 00:00:00 2001 From: bachmanity1 Date: Tue, 10 Mar 2026 23:13:03 +0900 Subject: [PATCH] mgr/cephadm: fix KeyError when host is removed during serve loop Signed-off-by: bachmanity1 --- src/pybind/mgr/cephadm/inventory.py | 2 +- src/pybind/mgr/cephadm/serve.py | 16 ++++++++++++---- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/src/pybind/mgr/cephadm/inventory.py b/src/pybind/mgr/cephadm/inventory.py index aed6ba03efad..3fda8c0e5907 100644 --- a/src/pybind/mgr/cephadm/inventory.py +++ b/src/pybind/mgr/cephadm/inventory.py @@ -1331,7 +1331,7 @@ class HostCache(): if host in self.mgr.offline_hosts: dd.status = orchestrator.DaemonDescriptionStatus.error dd.status_desc = 'host is offline' - elif self.mgr.inventory._inventory[host].get("status", "").lower() == "maintenance": + elif self.mgr.inventory._inventory.get(host, {}).get("status", "").lower() == "maintenance": # We do not refresh daemons on hosts in maintenance mode, so stored daemon statuses # could be wrong. We must assume maintenance is working and daemons are stopped dd.status = orchestrator.DaemonDescriptionStatus.stopped diff --git a/src/pybind/mgr/cephadm/serve.py b/src/pybind/mgr/cephadm/serve.py index 3f1c63bcda11..22083ffbbacd 100644 --- a/src/pybind/mgr/cephadm/serve.py +++ b/src/pybind/mgr/cephadm/serve.py @@ -252,8 +252,11 @@ class CephadmServe: @forall_hosts def refresh(host: str) -> None: - # skip hosts that are in maintenance - they could be powered off - if self.mgr.inventory._inventory[host].get("status", "").lower() == "maintenance": + # skip hosts that were removed or are in maintenance - they could be powered off + host_info = self.mgr.inventory._inventory.get(host) + if host_info is None: + return + if host_info.get("status", "").lower() == "maintenance": return if self.mgr.use_agent: @@ -862,8 +865,13 @@ class CephadmServe: try: all_slots, slots_to_add, daemons_to_remove = ha.place() - daemons_to_remove = [d for d in daemons_to_remove if (d.hostname and self.mgr.inventory._inventory[d.hostname].get( - 'status', '').lower() not in ['maintenance', 'offline'] and d.hostname not in self.mgr.offline_hosts)] + daemons_to_remove = [ + d for d in daemons_to_remove if ( + d.hostname + and d.hostname in self.mgr.inventory._inventory + and self.mgr.inventory._inventory.get(d.hostname, {}).get( + 'status', '').lower() not in ['maintenance', 'offline'] + and d.hostname not in self.mgr.offline_hosts)] self.log.debug('Add %s, remove %s' % (slots_to_add, daemons_to_remove)) except OrchestratorError as e: msg = f'Failed to apply {spec.service_name()} spec {spec}: {str(e)}' -- 2.47.3