mgr/cephadm: fix handling of draining hosts with explicit placement specs
author Adam King <adking@redhat.com>
Fri, 29 Jul 2022 20:10:09 +0000 (16:10 -0400)
committer Adam King <adking@redhat.com>
Wed, 17 Aug 2022 17:40:42 +0000 (13:40 -0400)
Basically, if you have a placement that explicitly defines the hosts
to place on, and then add the _no_schedule label to one of those hosts
(which should cause all daemons to be removed from it), cephadm will
simply fail to apply the spec, saying the host with the _no_schedule
label is "Unknown". This happens because hosts with the _no_schedule
label are removed entirely from the pool of hosts the scheduler works
with. If we also provide the scheduler with a list of currently
draining hosts, it can handle this case properly and the daemons can
be drained off the host as expected.

Fixes: https://tracker.ceph.com/issues/56972
Signed-off-by: Adam King <adking@redhat.com>
(cherry picked from commit 7e8c07a3dd998dd3745b7f36919a21ca613484e4)
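For illustration, a minimal standalone sketch of the validation change
described above (the helper names and types here are simplified
stand-ins, not the real HostAssignment internals): under the old check
an explicitly placed host that is draining is reported as unknown;
under the new one the spec still validates.

    # Standalone sketch; simplified stand-ins for HostAssignment.validate().
    from typing import List, Set

    def unknown_hosts_before(explicit: Set[str], schedulable: List[str]) -> Set[str]:
        # Old behavior: draining hosts were absent from `schedulable`,
        # so an explicitly placed draining host looked "Unknown".
        return explicit.difference(set(schedulable))

    def unknown_hosts_after(explicit: Set[str], schedulable: List[str],
                            draining: List[str]) -> Set[str]:
        # New behavior: draining hosts still count as known, so the
        # spec applies and daemons can be drained off as expected.
        return explicit.difference(set(schedulable + draining))

    explicit = {'host1', 'host2'}  # placement explicitly names both hosts
    print(unknown_hosts_before(explicit, ['host1']))            # {'host2'}: spec fails
    print(unknown_hosts_after(explicit, ['host1'], ['host2']))  # set(): spec applies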

src/pybind/mgr/cephadm/inventory.py
src/pybind/mgr/cephadm/migrations.py
src/pybind/mgr/cephadm/module.py
src/pybind/mgr/cephadm/schedule.py
src/pybind/mgr/cephadm/serve.py
src/pybind/mgr/cephadm/tests/test_scheduling.py
src/pybind/mgr/cephadm/tests/test_tuned_profiles.py
src/pybind/mgr/cephadm/tuned_profiles.py

index 7a88258f002dc401993f70a0ec309063c82b764c..bf0dbb534101e0dafa4d60b7f5b33c93e4ee7e9e 100644 (file)
@@ -936,6 +936,15 @@ class HostCache():
             h for h in self.mgr.inventory.all_specs() if '_no_schedule' not in h.labels
         ]
 
+    def get_draining_hosts(self) -> List[HostSpec]:
+        """
+        Returns all hosts that have _no_schedule label and therefore should have
+        no daemons placed on them, but are potentially still reachable
+        """
+        return [
+            h for h in self.mgr.inventory.all_specs() if '_no_schedule' in h.labels
+        ]
+
     def get_unreachable_hosts(self) -> List[HostSpec]:
         """
         Return all hosts that are offline or in maintenance mode.
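As a rough illustration of how the two accessors partition the
inventory (a toy HostSpec type is assumed here, not the real
orchestrator one):

    from typing import List, NamedTuple

    class HostSpec(NamedTuple):
        hostname: str
        labels: List[str] = []

    specs = [HostSpec('host1'), HostSpec('host2', ['_no_schedule'])]

    # Mirrors get_schedulable_hosts() / get_draining_hosts(): every
    # spec falls in exactly one bucket, keyed on the _no_schedule label.
    schedulable = [h for h in specs if '_no_schedule' not in h.labels]
    draining = [h for h in specs if '_no_schedule' in h.labels]

    print([h.hostname for h in schedulable])  # ['host1']
    print([h.hostname for h in draining])     # ['host2']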
index 672a895bc14c8009239e7f69b4c3fa8cd425dcb2..69f39cb9107703eb286b7ce0ee7085ace3bde420 100644 (file)
@@ -112,6 +112,7 @@ class Migrations:
                 spec=spec,
                 hosts=self.mgr.inventory.all_specs(),
                 unreachable_hosts=self.mgr.cache.get_unreachable_hosts(),
+                draining_hosts=self.mgr.cache.get_draining_hosts(),
                 daemons=existing_daemons,
             ).place()
 
index e06ff1359e5d832b7c57ecb25d9c378f6689d35a..118670d2870071b9a3e16b7ba7e74ccbbe9ea970 100644 (file)
@@ -2538,6 +2538,7 @@ Then run the following:
             spec=spec,
             hosts=self.cache.get_schedulable_hosts(),
             unreachable_hosts=self.cache.get_unreachable_hosts(),
+            draining_hosts=self.cache.get_draining_hosts(),
             networks=self.cache.networks,
             daemons=self.cache.get_daemons_by_service(spec.service_name()),
             allow_colo=svc.allow_colo(),
@@ -2616,6 +2617,7 @@ Then run the following:
             spec=spec,
             hosts=self.inventory.all_specs(),  # All hosts, even those without daemon refresh
             unreachable_hosts=self.cache.get_unreachable_hosts(),
+            draining_hosts=self.cache.get_draining_hosts(),
             networks=self.cache.networks,
             daemons=self.cache.get_daemons_by_service(spec.service_name()),
             allow_colo=self.cephadm_services[spec.service_type].allow_colo(),
index 5002ec6e5060e66f71c378f8e574bb8639384ec1..c80b6781869da6f04ed05d92e640487c0edd104d 100644 (file)
@@ -143,6 +143,7 @@ class HostAssignment(object):
                  spec,  # type: ServiceSpec
                  hosts: List[orchestrator.HostSpec],
                  unreachable_hosts: List[orchestrator.HostSpec],
+                 draining_hosts: List[orchestrator.HostSpec],
                  daemons: List[orchestrator.DaemonDescription],
                  networks: Dict[str, Dict[str, Dict[str, List[str]]]] = {},
                  filter_new_host=None,  # type: Optional[Callable[[str],bool]]
@@ -156,6 +157,7 @@ class HostAssignment(object):
         self.primary_daemon_type = primary_daemon_type or spec.service_type
         self.hosts: List[orchestrator.HostSpec] = hosts
         self.unreachable_hosts: List[orchestrator.HostSpec] = unreachable_hosts
+        self.draining_hosts: List[orchestrator.HostSpec] = draining_hosts
         self.filter_new_host = filter_new_host
         self.service_name = spec.service_name()
         self.daemons = daemons
@@ -189,7 +191,8 @@ class HostAssignment(object):
 
         if self.spec.placement.hosts:
             explicit_hostnames = {h.hostname for h in self.spec.placement.hosts}
-            unknown_hosts = explicit_hostnames.difference(set(self.get_hostnames()))
+            known_hosts = self.get_hostnames() + [h.hostname for h in self.draining_hosts]
+            unknown_hosts = explicit_hostnames.difference(set(known_hosts))
             if unknown_hosts:
                 raise OrchestratorValidationError(
                     f'Cannot place {self.spec.one_line_str()} on {", ".join(sorted(unknown_hosts))}: Unknown hosts')
@@ -371,7 +374,7 @@ class HostAssignment(object):
                 DaemonPlacement(daemon_type=self.primary_daemon_type,
                                 hostname=h.hostname, network=h.network, name=h.name,
                                 ports=self.ports_start)
-                for h in self.spec.placement.hosts
+                for h in self.spec.placement.hosts if h.hostname not in [dh.hostname for dh in self.draining_hosts]
             ]
         elif self.spec.placement.label:
             ls = [
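The other half of the fix: draining hosts now pass validation, but
when building placements for an explicit host list they are filtered
out, which is what lets their daemons be removed. A simplified sketch
of that filter (toy types, not the real DaemonPlacement logic):

    from typing import List, NamedTuple

    class HostSpec(NamedTuple):
        hostname: str

    placement_hosts = [HostSpec('host1'), HostSpec('host2')]  # explicit placement
    draining_hosts = [HostSpec('host2')]                      # has _no_schedule

    # Same check as the list comprehension in the hunk above: drop
    # draining hosts from the candidates so their daemons get removed.
    candidates = [
        h for h in placement_hosts
        if h.hostname not in [dh.hostname for dh in draining_hosts]
    ]
    print([h.hostname for h in candidates])  # ['host1']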
index 2f26ca70900f757431e237f913c2f63bcf7281b5..88d7b2083779fd4e87676bf35a81be4fdcc051f1 100644 (file)
@@ -627,6 +627,7 @@ class CephadmServe:
             hosts=self.mgr.cache.get_non_draining_hosts() if spec.service_name(
             ) == 'agent' else self.mgr.cache.get_schedulable_hosts(),
             unreachable_hosts=self.mgr.cache.get_unreachable_hosts(),
+            draining_hosts=self.mgr.cache.get_draining_hosts(),
             daemons=daemons,
             networks=self.mgr.cache.networks,
             filter_new_host=(
@@ -1005,6 +1006,7 @@ class CephadmServe:
                     spec=ServiceSpec('mon', placement=pspec),
                     hosts=self.mgr.cache.get_schedulable_hosts(),
                     unreachable_hosts=self.mgr.cache.get_unreachable_hosts(),
+                    draining_hosts=self.mgr.cache.get_draining_hosts(),
                     daemons=[],
                     networks=self.mgr.cache.networks,
                 )
@@ -1035,6 +1037,7 @@ class CephadmServe:
                     spec=ServiceSpec('mon', placement=ks.placement),
                     hosts=self.mgr.cache.get_schedulable_hosts(),
                     unreachable_hosts=self.mgr.cache.get_unreachable_hosts(),
+                    draining_hosts=self.mgr.cache.get_draining_hosts(),
                     daemons=[],
                     networks=self.mgr.cache.networks,
                 )
index 2454dc0d1ad0b2ed746ee1a36c8d86ad44494943..ffdad8e94068a59fc482651500008f79a39e5275 100644 (file)
@@ -133,6 +133,7 @@ def run_scheduler_test(results, mk_spec, hosts, daemons, key_elems):
                 spec=spec,
                 hosts=hosts,
                 unreachable_hosts=[],
+                draining_hosts=[],
                 daemons=daemons,
             ).place()
             if isinstance(host_res, list):
@@ -149,6 +150,7 @@ def run_scheduler_test(results, mk_spec, hosts, daemons, key_elems):
                 spec=spec,
                 hosts=hosts,
                 unreachable_hosts=[],
+                draining_hosts=[],
                 daemons=daemons
             ).place()
 
@@ -841,6 +843,7 @@ def test_node_assignment(service_type, placement, hosts, daemons, rank_map, post
         spec=spec,
         hosts=[HostSpec(h, labels=['foo']) for h in hosts],
         unreachable_hosts=[],
+        draining_hosts=[],
         daemons=daemons,
         allow_colo=allow_colo,
         rank_map=rank_map,
@@ -935,6 +938,7 @@ def test_node_assignment_random_shuffle(service_type, placement, available_hosts
         spec=spec,
         hosts=[HostSpec(h, labels=['foo']) for h in available_hosts],
         unreachable_hosts=[],
+        draining_hosts=[],
         daemons=[],
         allow_colo=allow_colo,
     ).get_candidates()
@@ -1019,6 +1023,7 @@ def test_node_assignment2(service_type, placement, hosts,
         spec=ServiceSpec(service_type, placement=placement),
         hosts=[HostSpec(h, labels=['foo']) for h in hosts],
         unreachable_hosts=[],
+        draining_hosts=[],
         daemons=daemons,
     ).place()
     assert len(hosts) == expected_len
@@ -1053,6 +1058,7 @@ def test_node_assignment3(service_type, placement, hosts,
         spec=ServiceSpec(service_type, placement=placement),
         hosts=[HostSpec(h) for h in hosts],
         unreachable_hosts=[],
+        draining_hosts=[],
         daemons=daemons,
     ).place()
     assert len(hosts) == expected_len
@@ -1150,6 +1156,7 @@ def test_node_assignment4(spec, networks, daemons,
         spec=spec,
         hosts=[HostSpec(h, labels=['foo']) for h in networks.keys()],
         unreachable_hosts=[],
+        draining_hosts=[],
         daemons=daemons,
         allow_colo=True,
         networks=networks,
@@ -1236,6 +1243,7 @@ def test_bad_specs(service_type, placement, hosts, daemons, expected):
             spec=ServiceSpec(service_type, placement=placement),
             hosts=[HostSpec(h) for h in hosts],
             unreachable_hosts=[],
+            draining_hosts=[],
             daemons=daemons,
         ).place()
     assert str(e.value) == expected
@@ -1412,6 +1420,7 @@ def test_active_assignment(service_type, placement, hosts, daemons, expected, ex
         spec=spec,
         hosts=[HostSpec(h) for h in hosts],
         unreachable_hosts=[],
+        draining_hosts=[],
         daemons=daemons,
     ).place()
     assert sorted([h.hostname for h in hosts]) in expected
@@ -1509,6 +1518,7 @@ def test_unreachable_host(service_type, placement, hosts, unreachable_hosts, dae
         spec=spec,
         hosts=[HostSpec(h) for h in hosts],
         unreachable_hosts=[HostSpec(h) for h in unreachable_hosts],
+        draining_hosts=[],
         daemons=daemons,
     ).place()
     assert sorted([h.hostname for h in to_add]) in expected_add
@@ -1585,6 +1595,7 @@ def test_remove_from_offline(service_type, placement, hosts, maintenance_hosts,
         spec=spec,
         hosts=host_specs,
         unreachable_hosts=[h for h in host_specs if h.status],
+        draining_hosts=[],
         daemons=daemons,
     ).place()
     assert sorted([h.hostname for h in to_add]) in expected_add
index bec433fdcb7cc983e20f836427d56328e9c2ebb2..41c0d96fcf3fe9896ac2c2f50733231ffd858aed 100644 (file)
@@ -34,6 +34,9 @@ class FakeCache:
     def get_unreachable_hosts(self):
         return self.unreachable_hosts
 
+    def get_draining_hosts(self):
+        return []
+
     @property
     def networks(self):
         return {h: {'a': {'b': ['c']}} for h in self.hosts}
index f07f8f3e48fbcec670b2f459bf26d507a5ef3886..7a5ce6fe602854dae373770d671699564c82c06f 100644 (file)
@@ -34,6 +34,7 @@ class TunedProfileUtils():
                     'crash', placement=profile.placement),
                 hosts=self.mgr.cache.get_schedulable_hosts(),
                 unreachable_hosts=self.mgr.cache.get_unreachable_hosts(),
+                draining_hosts=self.mgr.cache.get_draining_hosts(),
                 daemons=[],
                 networks=self.mgr.cache.networks,
             )