From de5f1e47156f4c3777dc9adf3107a928f64717ec Mon Sep 17 00:00:00 2001 From: Shweta Bhosale Date: Mon, 24 Nov 2025 13:34:06 +0530 Subject: [PATCH] mgr/cephadm: Fixed Keepalived behavior for failover 1. Keepalived will get failover to other node when daemon node is down 2. While removing extra daemon of Ingress service make sure to first remove it from non dependent daemon node Fixes: https://tracker.ceph.com/issues/73949 Signed-off-by: Shweta Bhosale --- src/pybind/mgr/cephadm/schedule.py | 36 +++- .../mgr/cephadm/tests/test_scheduling.py | 163 ++++++++++++++++++ src/pybind/mgr/cephadm/utils.py | 2 +- 3 files changed, 195 insertions(+), 6 deletions(-) diff --git a/src/pybind/mgr/cephadm/schedule.py b/src/pybind/mgr/cephadm/schedule.py index fe3cc093efd7..166a8326168a 100644 --- a/src/pybind/mgr/cephadm/schedule.py +++ b/src/pybind/mgr/cephadm/schedule.py @@ -171,7 +171,7 @@ class DaemonPlacement(NamedTuple): if self.ports: if self.ports != dd.ports and dd.ports: return False - if self.ip != dd.ip and dd.ip: + if self.ip and dd.ip and self.ip != dd.ip: return False return True @@ -436,10 +436,36 @@ class HostAssignment(object): # The number of new slots that need to be selected in order to fulfill count need = count - len(existing) - # we don't need any additional placements - if need <= 0: - to_remove.extend(existing[count:]) - del existing_slots[count:] + # Scaling down: more daemons exist than required. + # When related services exist, prioritize keeping daemons co-located with them + # by removing from non-related hosts first, then from related hosts if needed. + if need < 0: + non_matching_daemons = [] + if self.related_service_daemons: + # Get unique hostnames where related service daemons are running + related_service_hosts = list(set(dd.hostname for dd in self.related_service_daemons)) + + total_excess = len(existing) - count + to_delete = [] + # First, prefer removing daemons from hosts that don't have related services + non_related = [dd for dd in existing if dd.hostname not in related_service_hosts] + to_delete.extend(non_related[-total_excess:]) + + # If we still need to remove more, remove from hosts with related services + remaining_needed = total_excess - len(to_delete) + if remaining_needed > 0: + remaining = [dd for dd in existing if dd not in to_delete] + to_delete.extend(remaining[count:]) + + non_matching_daemons = to_delete + else: + # No related services - simply remove excess daemons beyond target count + non_matching_daemons = existing[count:] + + to_remove.extend(non_matching_daemons) + # remove from existing_slots + non_matching_hostnames = {dd.hostname for dd in non_matching_daemons} + existing_slots = [slot for slot in existing_slots if slot.hostname not in non_matching_hostnames] return self.place_per_host_daemons(existing_slots, [], to_remove) if self.related_service_daemons: diff --git a/src/pybind/mgr/cephadm/tests/test_scheduling.py b/src/pybind/mgr/cephadm/tests/test_scheduling.py index 926afa6ba372..bb8cb572b621 100644 --- a/src/pybind/mgr/cephadm/tests/test_scheduling.py +++ b/src/pybind/mgr/cephadm/tests/test_scheduling.py @@ -1857,3 +1857,166 @@ def test_blocking_daemon_host( ).place() assert sorted([h.hostname for h in to_add]) in expected_add assert sorted([h.name() for h in to_remove]) in expected_remove + + +@pytest.mark.parametrize( + "service_type,placement,hosts,daemons,related_service_daemons,rank_map,expected,expected_add,expected_remove", + [ + ( + 'ingress', + PlacementSpec(count=2), + 'host1 host2 host3 host4'.split(), + [ + DaemonDescription('haproxy', 'ingress1', 'host1'), + DaemonDescription('keepalived', 'ingress1', 'host1'), + DaemonDescription('haproxy', 'ingress2', 'host2'), + DaemonDescription('keepalived', 'ingress2', 'host2'), + DaemonDescription('haproxy', 'ingress3', 'host3'), + DaemonDescription('keepalived', 'ingress3', 'host3'), + DaemonDescription('haproxy', 'ingress4', 'host4'), + DaemonDescription('keepalived', 'ingress4', 'host4'), + ], + [ + DaemonDescription('nfs', 'nfs1', 'host1'), + DaemonDescription('nfs', 'nfs3', 'host3'), + ], + None, + ['haproxy:host1(*:443,8888)', 'haproxy:host3(*:443,8888)', 'keepalived:host1', 'keepalived:host3'], + [], + ['haproxy.ingress2', 'haproxy.ingress4', 'keepalived.ingress2', 'keepalived.ingress4'] # to_remove + ), + ( + 'ingress', + PlacementSpec(count=1), + 'host1 host2 host3'.split(), + [ + DaemonDescription('haproxy', 'ingress1', 'host1'), + DaemonDescription('keepalived', 'ingress1', 'host1'), + DaemonDescription('haproxy', 'ingress2', 'host2'), + DaemonDescription('keepalived', 'ingress2', 'host2'), + DaemonDescription('haproxy', 'ingress3', 'host3'), + DaemonDescription('keepalived', 'ingress3', 'host3'), + ], + [ + DaemonDescription('nfs', 'nfs1', 'host1'), + DaemonDescription('nfs', 'nfs2', 'host2'), + ], + None, + ['haproxy:host1(*:443,8888)', 'keepalived:host1'], + [], + ['haproxy.ingress2', 'haproxy.ingress3', 'keepalived.ingress2', 'keepalived.ingress3'] # to_remove + ), + ( + 'nfs', + PlacementSpec(count=1), + 'host1 host2 host3'.split(), + [ + DaemonDescription('nfs', 'nfs1', 'host1'), + DaemonDescription('nfs', 'nfs2', 'host2'), + ], + [], + None, + ['nfs:host1'], + [], + ['nfs.nfs2'] # to_remove + ), + ( + 'ingress', + PlacementSpec(count=2), + 'host1 host2 host3 host4 host5 host6'.split(), + [ + DaemonDescription('haproxy', 'ingress1', 'host1'), + DaemonDescription('keepalived', 'ingress1', 'host1'), + DaemonDescription('haproxy', 'ingress2', 'host2'), + DaemonDescription('keepalived', 'ingress2', 'host2'), + DaemonDescription('haproxy', 'ingress3', 'host3'), + DaemonDescription('keepalived', 'ingress3', 'host3'), + DaemonDescription('haproxy', 'ingress4', 'host4'), + DaemonDescription('keepalived', 'ingress4', 'host4'), + ], + [ + DaemonDescription('nfs', 'nfs5', 'host5'), + DaemonDescription('nfs', 'nfs6', 'host6'), + ], + None, + ['haproxy:host1(*:443,8888)', 'haproxy:host2(*:443,8888)', 'keepalived:host1', 'keepalived:host2'], + [], + ['haproxy.ingress3', 'haproxy.ingress4', 'keepalived.ingress3', 'keepalived.ingress4'] # to_remove + ), + ( + 'nfs', + PlacementSpec(count=2), + 'host1 host2 host3 host4'.split(), + [ + DaemonDescription('nfs', 'nfs1', 'host1'), + DaemonDescription('nfs', 'nfs2', 'host2'), + DaemonDescription('nfs', 'nfs3', 'host3'), + DaemonDescription('nfs', 'nfs4', 'host4'), + ], + [ + DaemonDescription('haproxy', 'ingress3', 'host3'), + DaemonDescription('keepalived', 'ingress3', 'host3'), + DaemonDescription('haproxy', 'ingress4', 'host4'), + DaemonDescription('keepalived', 'ingress4', 'host4'), + ], + None, + ['nfs:host3', 'nfs:host4'], + [], + ['nfs.nfs1', 'nfs.nfs2'] # to_remove + ), + ( + 'nfs', + PlacementSpec(count=2), + 'host1 host2 host3'.split(), + [ + DaemonDescription('nfs', '0.2', 'host1', rank=0, rank_generation=2), + DaemonDescription('nfs', '0.1', 'host2', rank=0, rank_generation=1), + DaemonDescription('nfs', '1.1', 'host3', rank=1, rank_generation=1), + ], + [ + DaemonDescription('haproxy', 'ingress2', 'host2'), + DaemonDescription('keepalived', 'ingress2', 'host2'), + DaemonDescription('haproxy', 'ingress3', 'host3'), + DaemonDescription('keepalived', 'ingress3', 'host3'), + ], + {0: {1: '0.1', 2: '0.2'}, 1: {1: '1.1'}}, + ['nfs:host1(rank=0.2)', 'nfs:host3(rank=1.1)'], + [], + ['nfs.0.1'] + ), + ]) +def test_related_service_downsize(service_type, placement, hosts, daemons, related_service_daemons, rank_map, expected, expected_add, expected_remove): + if service_type == 'ingress': + spec = IngressSpec( + service_type='ingress', + service_id='nfs.test', + frontend_port=443, + monitor_port=8888, + virtual_ip='10.0.0.20/8', + backend_service='nfs.test', + placement=placement, + ) + else: + spec = ServiceSpec(service_type=service_type, + service_id='test', + placement=placement) + + all_slots, to_add, to_remove = HostAssignment( + spec=spec, + hosts=[HostSpec(h) for h in hosts], + unreachable_hosts=[], + draining_hosts=[], + daemons=daemons, + related_service_daemons=related_service_daemons, + primary_daemon_type='haproxy' if spec.service_type == 'ingress' else spec.service_type, + per_host_daemon_type='keepalived' if spec.service_type == 'ingress' else None, + rank_map=rank_map + ).place() + + got = [str(p) for p in all_slots] + assert sorted(got) == sorted(expected) + + got_add = [str(p) for p in to_add] + assert sorted(got_add) == sorted(expected_add) + + assert sorted([d.name() for d in to_remove]) == sorted(expected_remove) diff --git a/src/pybind/mgr/cephadm/utils.py b/src/pybind/mgr/cephadm/utils.py index 9ef394e86dc3..cb17faf2a306 100644 --- a/src/pybind/mgr/cephadm/utils.py +++ b/src/pybind/mgr/cephadm/utils.py @@ -28,7 +28,7 @@ GATEWAY_TYPES = ['iscsi', 'nfs', 'nvmeof', 'smb'] MONITORING_STACK_TYPES = ['node-exporter', 'prometheus', 'alertmanager', 'grafana', 'loki', 'promtail', 'alloy'] MGMT_GATEWAY_STACK_TYPES = ['mgmt-gateway', 'oauth2-proxy'] -RESCHEDULE_FROM_OFFLINE_HOSTS_TYPES = ['haproxy', 'nfs'] +RESCHEDULE_FROM_OFFLINE_HOSTS_TYPES = ['haproxy', 'nfs', 'keepalived'] CEPH_UPGRADE_ORDER = CEPH_TYPES + GATEWAY_TYPES + MONITORING_STACK_TYPES + MGMT_GATEWAY_STACK_TYPES -- 2.47.3