From 8f1232a1799f3349ad702f9b79803e5cf0f03505 Mon Sep 17 00:00:00 2001
From: Adam King
Date: Wed, 14 Apr 2021 15:29:58 -0400
Subject: [PATCH] mgr/cephadm: don't remove daemons from hosts in maintenance or offline mode

Fixes: https://tracker.ceph.com/issues/50364
Signed-off-by: Adam King
(cherry picked from commit eebb842d0487660c93baf9eafda28a2f87e482f3)
---
Reviewer notes (this area is ignored by git-am and is not part of the commit):
  * serve.py: right after ha.place(), daemons_to_remove is filtered so that
    a daemon stays in the list only when it has a hostname and that host's
    inventory 'status' (compared case-insensitively, default '') is neither
    'maintenance' nor 'offline'.
  * test_cephadm.py: adds
    test_dont_touch_offline_or_maintenance_host_daemons, which marks test2
    as 'offline' and test3 as 'maintenance', then checks that all 3 mgr
    daemons survive a re-apply and that only 1 crash daemon is deployed.
  * NOTE(review): leading whitespace of the hunk lines was reconstructed
    from the Python nesting and the @@ line counts - confirm against the
    upstream commit before applying.

 src/pybind/mgr/cephadm/serve.py              |  2 ++
 src/pybind/mgr/cephadm/tests/test_cephadm.py | 31 ++++++++++++++++++++
 2 files changed, 33 insertions(+)

diff --git a/src/pybind/mgr/cephadm/serve.py b/src/pybind/mgr/cephadm/serve.py
index f17219730c421..ea2d8a296102d 100644
--- a/src/pybind/mgr/cephadm/serve.py
+++ b/src/pybind/mgr/cephadm/serve.py
@@ -560,6 +560,8 @@ class CephadmServe:
 
         try:
             all_slots, slots_to_add, daemons_to_remove = ha.place()
+            daemons_to_remove = [d for d in daemons_to_remove if (d.hostname and self.mgr.inventory._inventory[d.hostname].get(
+                'status', '').lower() not in ['maintenance', 'offline'])]
             self.log.debug('Add %s, remove %s' % (slots_to_add, daemons_to_remove))
         except OrchestratorError as e:
             self.log.error('Failed to apply %s spec %s: %s' % (
diff --git a/src/pybind/mgr/cephadm/tests/test_cephadm.py b/src/pybind/mgr/cephadm/tests/test_cephadm.py
index 8c1949e74db69..db9bfc4c7a599 100644
--- a/src/pybind/mgr/cephadm/tests/test_cephadm.py
+++ b/src/pybind/mgr/cephadm/tests/test_cephadm.py
@@ -984,6 +984,37 @@ class TestCephadm(object):
             out = wait(cephadm_module, cephadm_module.get_hosts())[0].to_json()
             assert out == HostSpec('test', 'test').to_json()
 
+    @mock.patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('{}'))
+    def test_dont_touch_offline_or_maintenance_host_daemons(self, cephadm_module):
+        # test daemons on offline/maint hosts not removed when applying specs
+        # test daemons not added to hosts in maint/offline state
+        with with_host(cephadm_module, 'test1'):
+            with with_host(cephadm_module, 'test2'):
+                with with_host(cephadm_module, 'test3'):
+                    with with_service(cephadm_module, ServiceSpec('mgr', placement=PlacementSpec(host_pattern='*'))):
+                        # should get a mgr on all 3 hosts
+                        # CephadmServe(cephadm_module)._apply_all_services()
+                        assert len(cephadm_module.cache.get_daemons_by_type('mgr')) == 3
+
+                        # put one host in offline state and one host in maintenance state
+                        cephadm_module.inventory._inventory['test2']['status'] = 'offline'
+                        cephadm_module.inventory._inventory['test3']['status'] = 'maintenance'
+                        cephadm_module.inventory.save()
+
+                        # being in offline/maint mode should disqualify hosts from being
+                        # candidates for scheduling
+                        candidates = [
+                            h.hostname for h in cephadm_module._hosts_with_daemon_inventory()]
+                        assert 'test2' not in candidates
+                        assert 'test3' not in candidates
+
+                        with with_service(cephadm_module, ServiceSpec('crash', placement=PlacementSpec(host_pattern='*'))):
+                            # re-apply services. No mgr should be removed from maint/offline hosts
+                            # crash daemon should only be on host not in maint/offline mode
+                            CephadmServe(cephadm_module)._apply_all_services()
+                            assert len(cephadm_module.cache.get_daemons_by_type('mgr')) == 3
+                            assert len(cephadm_module.cache.get_daemons_by_type('crash')) == 1
+
     def test_stale_connections(self, cephadm_module):
         class Connection(object):
             """
-- 
2.39.5