From 0ebe83c714a93abf13b8af6148d51efb6f1a3435 Mon Sep 17 00:00:00 2001
From: Adam King
Date: Thu, 30 Sep 2021 10:04:14 -0400
Subject: [PATCH] mgr/cephadm: move schedulable, unreachable and non_draining
 hosts to HostCache

they can be accessed similar to how we access daemons
with certain attributes rather than being random functions
in the cephadm mgr module

Signed-off-by: Adam King
---
 src/pybind/mgr/cephadm/agent.py              |  2 +-
 src/pybind/mgr/cephadm/inventory.py          | 48 ++++++++++++++++-
 src/pybind/mgr/cephadm/migrations.py         |  2 +-
 src/pybind/mgr/cephadm/module.py             | 54 ++------------------
 src/pybind/mgr/cephadm/serve.py              | 17 +++---
 src/pybind/mgr/cephadm/services/osd.py       |  6 +--
 src/pybind/mgr/cephadm/tests/test_cephadm.py |  5 +-
 7 files changed, 68 insertions(+), 66 deletions(-)

diff --git a/src/pybind/mgr/cephadm/agent.py b/src/pybind/mgr/cephadm/agent.py
index cd9cd199f114..ead4c0a3bee5 100644
--- a/src/pybind/mgr/cephadm/agent.py
+++ b/src/pybind/mgr/cephadm/agent.py
@@ -297,7 +297,7 @@ class CephadmAgentHelpers:
     def _agent_down(self, host: str) -> bool:
         # if host is draining or drained (has _no_schedule label) there should not
         # be an agent deployed there and therefore we should return False
-        if host not in [h.hostname for h in self.mgr._non_draining_hosts()]:
+        if host not in [h.hostname for h in self.mgr.cache.get_non_draining_hosts()]:
             return False
         # if we don't have a timestamp, it's likely because of a mgr fail over.
         # just set the timestamp to now. However, if host was offline before, we
diff --git a/src/pybind/mgr/cephadm/inventory.py b/src/pybind/mgr/cephadm/inventory.py
index a637f4ae0b91..002aaa52060d 100644
--- a/src/pybind/mgr/cephadm/inventory.py
+++ b/src/pybind/mgr/cephadm/inventory.py
@@ -741,6 +741,52 @@ class HostCache():
             r.append(host)
         return r
 
+    def get_schedulable_hosts(self) -> List[HostSpec]:
+        """
+        Returns all usable hosts that went through _refresh_host_daemons().
+
+        This mitigates a potential race, where new host was added *after*
+        ``_refresh_host_daemons()`` was called, but *before*
+        ``_apply_all_specs()`` was called. thus we end up with a hosts
+        where daemons might be running, but we have not yet detected them.
+        """
+        return [
+            h for h in self.mgr.inventory.all_specs()
+            if (
+                self.host_had_daemon_refresh(h.hostname)
+                and '_no_schedule' not in h.labels
+            )
+        ]
+
+    def get_non_draining_hosts(self) -> List[HostSpec]:
+        """
+        Returns all hosts that do not have _no_schedule label.
+
+        Useful for the agent who needs this specific list rather than the
+        schedulable_hosts since the agent needs to be deployed on hosts with
+        no daemon refresh
+        """
+        return [
+            h for h in self.mgr.inventory.all_specs() if '_no_schedule' not in h.labels
+        ]
+
+    def get_unreachable_hosts(self) -> List[HostSpec]:
+        """
+        Return all hosts that are offline or in maintenance mode.
+
+        The idea is we should not touch the daemons on these hosts (since
+        in theory the hosts are inaccessible so we CAN'T touch them) but
+        we still want to count daemons that exist on these hosts toward the
+        placement so daemons on these hosts aren't just moved elsewhere
+        """
+        return [
+            h for h in self.mgr.inventory.all_specs()
+            if (
+                h.status.lower() in ['maintenance', 'offline']
+                or h.hostname in self.mgr.offline_hosts
+            )
+        ]
+
     def get_facts(self, host: str) -> Dict[str, Any]:
         return self.facts.get(host, {})
 
@@ -953,7 +999,7 @@ class HostCache():
         return True
 
     def all_host_metadata_up_to_date(self) -> bool:
-        unreachables = [h.hostname for h in self.mgr._unreachable_hosts()]
+        unreachables = [h.hostname for h in self.get_unreachable_hosts()]
         if [h for h in self.get_hosts() if (not self.host_metadata_up_to_date(h) and h not in unreachables)]:
             # this function is primarily for telling if it's safe to try and apply a service
             # spec. Since offline/maintenance hosts aren't considered in that process anyway
diff --git a/src/pybind/mgr/cephadm/migrations.py b/src/pybind/mgr/cephadm/migrations.py
index 941ede3666de..e5a73f306896 100644
--- a/src/pybind/mgr/cephadm/migrations.py
+++ b/src/pybind/mgr/cephadm/migrations.py
@@ -103,7 +103,7 @@ class Migrations:
             placements, to_add, to_remove = HostAssignment(
                 spec=spec,
                 hosts=self.mgr.inventory.all_specs(),
-                unreachable_hosts=self.mgr._unreachable_hosts(),
+                unreachable_hosts=self.mgr.cache.get_unreachable_hosts(),
                 daemons=existing_daemons,
             ).place()
 
diff --git a/src/pybind/mgr/cephadm/module.py b/src/pybind/mgr/cephadm/module.py
index 3cc37845754a..0dc18c8daf15 100644
--- a/src/pybind/mgr/cephadm/module.py
+++ b/src/pybind/mgr/cephadm/module.py
@@ -1374,52 +1374,6 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule,
 
         return image
 
-    def _schedulable_hosts(self) -> List[HostSpec]:
-        """
-        Returns all usable hosts that went through _refresh_host_daemons().
-
-        This mitigates a potential race, where new host was added *after*
-        ``_refresh_host_daemons()`` was called, but *before*
-        ``_apply_all_specs()`` was called. thus we end up with a hosts
-        where daemons might be running, but we have not yet detected them.
-        """
-        return [
-            h for h in self.inventory.all_specs()
-            if (
-                self.cache.host_had_daemon_refresh(h.hostname)
-                and '_no_schedule' not in h.labels
-            )
-        ]
-
-    def _non_draining_hosts(self) -> List[HostSpec]:
-        """
-        Returns all hosts that do not have _no_schedule label.
-
-        Useful for the agent who needs this specific list rather than the
-        _schedulable_hosts since the agent needs to be deployed on hosts with
-        no daemon refresh
-        """
-        return [
-            h for h in self.inventory.all_specs() if '_no_schedule' not in h.labels
-        ]
-
-    def _unreachable_hosts(self) -> List[HostSpec]:
-        """
-        Return all hosts that are offline or in maintenance mode.
-
-        The idea is we should not touch the daemons on these hosts (since
-        in theory the hosts are inaccessible so we CAN'T touch them) but
-        we still want to count daemons that exist on these hosts toward the
-        placement so daemons on these hosts aren't just moved elsewhere
-        """
-        return [
-            h for h in self.inventory.all_specs()
-            if (
-                h.status.lower() in ['maintenance', 'offline']
-                or h.hostname in self.offline_hosts
-            )
-        ]
-
     def _check_valid_addr(self, host: str, addr: str) -> str:
         # make sure hostname is resolvable before trying to make a connection
         try:
@@ -1823,7 +1777,7 @@ Then run the following:
                 continue
             sm[nm] = orchestrator.ServiceDescription(
                 spec=spec,
-                size=spec.placement.get_target_count(self._schedulable_hosts()),
+                size=spec.placement.get_target_count(self.cache.get_schedulable_hosts()),
                 running=0,
                 events=self.events.get_for_service(spec.service_name()),
                 created=self.spec_store.spec_created[nm],
@@ -2400,8 +2354,8 @@ Then run the following:
         svc = self.cephadm_services[spec.service_type]
         ha = HostAssignment(
             spec=spec,
-            hosts=self._schedulable_hosts(),
-            unreachable_hosts=self._unreachable_hosts(),
+            hosts=self.cache.get_schedulable_hosts(),
+            unreachable_hosts=self.cache.get_unreachable_hosts(),
             networks=self.cache.networks,
             daemons=self.cache.get_daemons_by_service(spec.service_name()),
             allow_colo=svc.allow_colo(),
@@ -2477,7 +2431,7 @@ Then run the following:
             HostAssignment(
                 spec=spec,
                 hosts=self.inventory.all_specs(),  # All hosts, even those without daemon refresh
-                unreachable_hosts=self._unreachable_hosts(),
+                unreachable_hosts=self.cache.get_unreachable_hosts(),
                 networks=self.cache.networks,
                 daemons=self.cache.get_daemons_by_service(spec.service_name()),
                 allow_colo=self.cephadm_services[spec.service_type].allow_colo(),
diff --git a/src/pybind/mgr/cephadm/serve.py b/src/pybind/mgr/cephadm/serve.py
index ef15bab72b18..3559e687867d 100644
--- a/src/pybind/mgr/cephadm/serve.py
+++ b/src/pybind/mgr/cephadm/serve.py
@@ -209,8 +209,8 @@ class CephadmServe:
             pspec = PlacementSpec.from_string(self.mgr.manage_etc_ceph_ceph_conf_hosts)
             ha = HostAssignment(
                 spec=ServiceSpec('mon', placement=pspec),
-                hosts=self.mgr._schedulable_hosts(),
-                unreachable_hosts=self.mgr._unreachable_hosts(),
+                hosts=self.mgr.cache.get_schedulable_hosts(),
+                unreachable_hosts=self.mgr.cache.get_unreachable_hosts(),
                 daemons=[],
                 networks=self.mgr.cache.networks,
             )
@@ -241,8 +241,8 @@ class CephadmServe:
                     keyring.encode('utf-8')).digest())
             ha = HostAssignment(
                 spec=ServiceSpec('mon', placement=ks.placement),
-                hosts=self.mgr._schedulable_hosts(),
-                unreachable_hosts=self.mgr._unreachable_hosts(),
+                hosts=self.mgr.cache.get_schedulable_hosts(),
+                unreachable_hosts=self.mgr.cache.get_unreachable_hosts(),
                 daemons=[],
                 networks=self.mgr.cache.networks,
             )
@@ -295,7 +295,7 @@ class CephadmServe:
                 if r is not None:
                     bad_hosts.append(r)
 
-            if not self.mgr.use_agent or host not in [h.hostname for h in self.mgr._non_draining_hosts()]:
+            if not self.mgr.use_agent or host not in [h.hostname for h in self.mgr.cache.get_non_draining_hosts()]:
                 if self.mgr.cache.host_needs_daemon_refresh(host):
                     self.log.debug('refreshing %s daemons' % host)
                     r = self._refresh_host_daemons(host)
@@ -717,8 +717,9 @@ class CephadmServe:
         rank_map = self.mgr.spec_store[spec.service_name()].rank_map or {}
         ha = HostAssignment(
             spec=spec,
-            hosts=self.mgr._non_draining_hosts() if spec.service_name() == 'agent' else self.mgr._schedulable_hosts(),
-            unreachable_hosts=self.mgr._unreachable_hosts(),
+            hosts=self.mgr.cache.get_non_draining_hosts() if spec.service_name(
+            ) == 'agent' else self.mgr.cache.get_schedulable_hosts(),
+            unreachable_hosts=self.mgr.cache.get_unreachable_hosts(),
             daemons=daemons,
             networks=self.mgr.cache.networks,
             filter_new_host=(
@@ -909,7 +910,7 @@ class CephadmServe:
             if self.mgr.use_agent:
                 # can only send ack to agents if we know for sure port they bound to
                 hosts_altered = set([h for h in hosts_altered if (h in self.mgr.cache.agent_ports and h in [
-                    h2.hostname for h2 in self.mgr._non_draining_hosts()])])
+                    h2.hostname for h2 in self.mgr.cache.get_non_draining_hosts()])])
                 self.mgr.agent_helpers._request_agent_acks(hosts_altered, increment=True)
 
             if r is None:
diff --git a/src/pybind/mgr/cephadm/services/osd.py b/src/pybind/mgr/cephadm/services/osd.py
index 7b63fe7aa4e9..f5996eeae764 100644
--- a/src/pybind/mgr/cephadm/services/osd.py
+++ b/src/pybind/mgr/cephadm/services/osd.py
@@ -152,7 +152,7 @@ class OSDService(CephService):
     def prepare_drivegroup(self, drive_group: DriveGroupSpec) -> List[Tuple[str, DriveSelection]]:
         # 1) use fn_filter to determine matching_hosts
         matching_hosts = drive_group.placement.filter_matching_hostspecs(
-            self.mgr._schedulable_hosts())
+            self.mgr.cache.get_schedulable_hosts())
 
         # 2) Map the inventory to the InventoryHost object
         host_ds_map = []
@@ -261,7 +261,7 @@ class OSDService(CephService):
         if not osdspecs:
             self.mgr.log.debug("No OSDSpecs found")
             return []
-        return sum([spec.placement.filter_matching_hostspecs(self.mgr._schedulable_hosts()) for spec in osdspecs], [])
+        return sum([spec.placement.filter_matching_hostspecs(self.mgr.cache.get_schedulable_hosts()) for spec in osdspecs], [])
 
     def resolve_osdspecs_for_host(self, host: str,
                                   specs: Optional[List[DriveGroupSpec]] = None) -> List[DriveGroupSpec]:
@@ -271,7 +271,7 @@ class OSDService(CephService):
             specs = [cast(DriveGroupSpec, spec) for (sn, spec) in self.mgr.spec_store.spec_preview.items()
                      if spec.service_type == 'osd']
         for spec in specs:
-            if host in spec.placement.filter_matching_hostspecs(self.mgr._schedulable_hosts()):
+            if host in spec.placement.filter_matching_hostspecs(self.mgr.cache.get_schedulable_hosts()):
                 self.mgr.log.debug(f"Found OSDSpecs for host: <{host}> -> <{spec}>")
                 matching_specs.append(spec)
         return matching_specs
diff --git a/src/pybind/mgr/cephadm/tests/test_cephadm.py b/src/pybind/mgr/cephadm/tests/test_cephadm.py
index 50628ca3f78b..b25411f92f4e 100644
--- a/src/pybind/mgr/cephadm/tests/test_cephadm.py
+++ b/src/pybind/mgr/cephadm/tests/test_cephadm.py
@@ -1127,11 +1127,12 @@ spec:
 
         # being in offline/maint mode should disqualify hosts from being
         # candidates for scheduling
         candidates = [
-            h.hostname for h in cephadm_module._schedulable_hosts()]
+            h.hostname for h in cephadm_module.cache.get_schedulable_hosts()]
         assert 'test2' in candidates
         assert 'test3' in candidates
 
-        unreachable = [h.hostname for h in cephadm_module._unreachable_hosts()]
+        unreachable = [
+            h.hostname for h in cephadm_module.cache.get_unreachable_hosts()]
         assert 'test2' in unreachable
         assert 'test3' in unreachable
-- 
2.47.3
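
Reviewer note (not part of the patch): a minimal sketch of the call pattern this
change standardizes. The HostCache accessors and the HostAssignment keyword
arguments are taken from the hunks above; the helper name place_service, the bare
`mgr` handle (standing in for a CephadmOrchestrator instance) and the import paths
are illustrative assumptions rather than code from the tree.

    # Hypothetical example, mirroring the migrations.py and module.py call sites above.
    # Import paths assume the Ceph mgr module environment.
    from typing import Any

    from ceph.deployment.service_spec import ServiceSpec
    from cephadm.schedule import HostAssignment


    def place_service(mgr: Any, spec: ServiceSpec):
        # Hosts eligible for scheduling: daemon-refreshed and not labeled _no_schedule.
        hosts = mgr.cache.get_schedulable_hosts()
        # Offline/maintenance hosts: their daemons still count toward placement,
        # but cephadm will not touch them.
        unreachable = mgr.cache.get_unreachable_hosts()
        placements, to_add, to_remove = HostAssignment(
            spec=spec,
            hosts=hosts,
            unreachable_hosts=unreachable,
            daemons=mgr.cache.get_daemons_by_service(spec.service_name()),
            networks=mgr.cache.networks,
        ).place()
        return placements, to_add, to_remove

The deliberate exception is the agent service in serve.py: it schedules against
mgr.cache.get_non_draining_hosts() instead, since the agent should be deployed even
on hosts that have not yet completed a daemon refresh.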