git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
mgr/cephadm: Reschedule nfs daemons from offline hosts 44343/head
author Adam King <adking@redhat.com>
Tue, 22 Mar 2022 22:57:21 +0000 (18:57 -0400)
committer Adam King <adking@redhat.com>
Tue, 22 Mar 2022 22:57:21 +0000 (18:57 -0400)
In order to improve nfs availability, if there are other
hosts we can place an nfs daemon on, or if there is a host
with a lower rank nfs daemon when a higher rank one is on
an offline host, we should reschedule the nfs daemons.

Signed-off-by: Adam King <adking@redhat.com>
src/pybind/mgr/cephadm/schedule.py
src/pybind/mgr/cephadm/tests/test_scheduling.py
src/pybind/mgr/cephadm/utils.py

index 9ee0a5e3c970f4e48bc148f3015a88c510107a7a..9a8bad3c906292861bf515758c1a444e424c3f96 100644 (file)
@@ -7,6 +7,7 @@ import orchestrator
 from ceph.deployment.service_spec import ServiceSpec
 from orchestrator._interface import DaemonDescription
 from orchestrator import OrchestratorValidationError
+from .utils import RESCHEDULE_FROM_OFFLINE_HOSTS_TYPES
 
 logger = logging.getLogger(__name__)
 T = TypeVar('T')
@@ -255,6 +256,10 @@ class HostAssignment(object):
 
         # get candidate hosts based on [hosts, label, host_pattern]
         candidates = self.get_candidates()  # type: List[DaemonPlacement]
+        if self.primary_daemon_type in RESCHEDULE_FROM_OFFLINE_HOSTS_TYPES:
+            # remove unreachable hosts that are not in maintenance so daemons
+            # on these hosts will be rescheduled
+            candidates = self.remove_non_maintenance_unreachable_candidates(candidates)
 
         def expand_candidates(ls: List[DaemonPlacement], num: int) -> List[DaemonPlacement]:
             r = []
@@ -433,3 +438,14 @@ class HostAssignment(object):
         final = sorted(ls)
         random.Random(seed).shuffle(final)
         return ls
+
+    def remove_non_maintenance_unreachable_candidates(self, candidates: List[DaemonPlacement]) -> List[DaemonPlacement]:
+        in_maintenance: Dict[str, bool] = {}
+        for h in self.hosts:
+            if h.status.lower() == 'maintenance':
+                in_maintenance[h.hostname] = True
+                continue
+            in_maintenance[h.hostname] = False
+        unreachable_hosts = [h.hostname for h in self.unreachable_hosts]
+        candidates = [c for c in candidates if c.hostname not in unreachable_hosts or in_maintenance[c.hostname]]
+        return candidates
index ec4b87f59e486f6453f1df97b21a5fcc91dc1dc8..c70ef9fb5ee1106ebffde9b37aa4a5a00cd91ad9 100644 (file)
@@ -1441,3 +1441,79 @@ def test_unreachable_host(service_type, placement, hosts, unreachable_hosts, dae
     ).place()
     assert sorted([h.hostname for h in to_add]) in expected_add
     assert sorted([h.name() for h in to_remove]) in expected_remove
+
+
+class RescheduleFromOfflineTest(NamedTuple):
+    """One parametrized case for ``test_remove_from_offline``.
+
+    Field order matches the argument names in the ``parametrize`` decorator.
+    """
+    # service type used to build the ServiceSpec under test
+    service_type: str
+    # placement passed to the ServiceSpec
+    placement: PlacementSpec
+    # hostnames turned into HostSpec objects for the assignment
+    hosts: List[str]
+    # hostnames whose HostSpec status is set to 'maintenance'
+    maintenance_hosts: List[str]
+    # hostnames whose HostSpec status is set to 'offline'
+    offline_hosts: List[str]
+    # daemons that already exist before place() is called
+    daemons: List[DaemonDescription]
+    # acceptable outcomes: the sorted hostnames to add must equal one entry
+    expected_add: List[List[str]]
+    # acceptable outcomes: the sorted daemon names to remove must equal one entry
+    expected_remove: List[List[str]]
+
+
+@pytest.mark.parametrize("service_type,placement,hosts,maintenance_hosts,offline_hosts,daemons,expected_add,expected_remove",
+                         [
+                             RescheduleFromOfflineTest(
+                                 'nfs',
+                                 PlacementSpec(count=2),
+                                 'host1 host2 host3'.split(),
+                                 [],
+                                 ['host2'],
+                                 [
+                                     DaemonDescription('nfs', 'a', 'host1'),
+                                     DaemonDescription('nfs', 'b', 'host2'),
+                                 ],
+                                 [['host3']],
+                                 [[]],
+                             ),
+                             RescheduleFromOfflineTest(
+                                 'nfs',
+                                 PlacementSpec(count=2),
+                                 'host1 host2 host3'.split(),
+                                 ['host2'],
+                                 [],
+                                 [
+                                     DaemonDescription('nfs', 'a', 'host1'),
+                                     DaemonDescription('nfs', 'b', 'host2'),
+                                 ],
+                                 [[]],
+                                 [[]],
+                             ),
+                             RescheduleFromOfflineTest(
+                                 'mon',
+                                 PlacementSpec(count=2),
+                                 'host1 host2 host3'.split(),
+                                 [],
+                                 ['host2'],
+                                 [
+                                     DaemonDescription('mon', 'a', 'host1'),
+                                     DaemonDescription('mon', 'b', 'host2'),
+                                 ],
+                                 [[]],
+                                 [[]],
+                             ),
+                         ])
+def test_remove_from_offline(service_type, placement, hosts, maintenance_hosts, offline_hosts, daemons, expected_add, expected_remove):
+
+    spec = ServiceSpec(service_type=service_type,
+                       service_id='test',
+                       placement=placement)
+
+    host_specs = [HostSpec(h) for h in hosts]
+    for h in host_specs:
+        if h.hostname in offline_hosts:
+            h.status = 'offline'
+        if h.hostname in maintenance_hosts:
+            h.status = 'maintenance'
+
+    hosts, to_add, to_remove = HostAssignment(
+        spec=spec,
+        hosts=host_specs,
+        unreachable_hosts=[h for h in host_specs if h.status],
+        daemons=daemons,
+    ).place()
+    assert sorted([h.hostname for h in to_add]) in expected_add
+    assert sorted([h.name() for h in to_remove]) in expected_remove
index 94b9162affa7c0c435adc23a3f861f56083ecbb9..3a5b564d59e830f57bdfcf5a01458c7a99fd2a19 100644 (file)
@@ -24,6 +24,7 @@ class CephadmNoImage(Enum):
 CEPH_TYPES = ['mgr', 'mon', 'crash', 'osd', 'mds', 'rgw', 'rbd-mirror', 'cephfs-mirror']
 GATEWAY_TYPES = ['iscsi', 'nfs']
 MONITORING_STACK_TYPES = ['node-exporter', 'prometheus', 'alertmanager', 'grafana', 'loki', 'promtail']
+# Daemon types for which the scheduler removes unreachable, non-maintenance
+# hosts from the placement candidates so daemons on them get rescheduled.
+RESCHEDULE_FROM_OFFLINE_HOSTS_TYPES = ['nfs']
 
 CEPH_UPGRADE_ORDER = CEPH_TYPES + GATEWAY_TYPES + MONITORING_STACK_TYPES