From 0ebe83c714a93abf13b8af6148d51efb6f1a3435 Mon Sep 17 00:00:00 2001
From: Adam King
Date: Thu, 30 Sep 2021 10:04:14 -0400
Subject: [PATCH] mgr/cephadm: move schedulable, unreachable and non_draining
 hosts to HostCache

they can be accessed similar to how we access daemons
with certain attributes rather than being random functions
in the cephadm mgr module

Signed-off-by: Adam King
---
 src/pybind/mgr/cephadm/agent.py              |  2 +-
 src/pybind/mgr/cephadm/inventory.py          | 48 ++++++++++++++++-
 src/pybind/mgr/cephadm/migrations.py         |  2 +-
 src/pybind/mgr/cephadm/module.py             | 54 ++------------------
 src/pybind/mgr/cephadm/serve.py              | 17 +++---
 src/pybind/mgr/cephadm/services/osd.py       |  6 +--
 src/pybind/mgr/cephadm/tests/test_cephadm.py |  5 +-
 7 files changed, 68 insertions(+), 66 deletions(-)

diff --git a/src/pybind/mgr/cephadm/agent.py b/src/pybind/mgr/cephadm/agent.py
index cd9cd199f114..ead4c0a3bee5 100644
--- a/src/pybind/mgr/cephadm/agent.py
+++ b/src/pybind/mgr/cephadm/agent.py
@@ -297,7 +297,7 @@ class CephadmAgentHelpers:
     def _agent_down(self, host: str) -> bool:
         # if host is draining or drained (has _no_schedule label) there should not
         # be an agent deployed there and therefore we should return False
-        if host not in [h.hostname for h in self.mgr._non_draining_hosts()]:
+        if host not in [h.hostname for h in self.mgr.cache.get_non_draining_hosts()]:
             return False
         # if we don't have a timestamp, it's likely because of a mgr fail over.
         # just set the timestamp to now. However, if host was offline before, we
diff --git a/src/pybind/mgr/cephadm/inventory.py b/src/pybind/mgr/cephadm/inventory.py
index a637f4ae0b91..002aaa52060d 100644
--- a/src/pybind/mgr/cephadm/inventory.py
+++ b/src/pybind/mgr/cephadm/inventory.py
@@ -741,6 +741,52 @@ class HostCache():
             r.append(host)
         return r
 
+    def get_schedulable_hosts(self) -> List[HostSpec]:
+        """
+        Returns all usable hosts that went through _refresh_host_daemons().
+
+        This mitigates a potential race, where new host was added *after*
+        ``_refresh_host_daemons()`` was called, but *before*
+        ``_apply_all_specs()`` was called. thus we end up with a hosts
+        where daemons might be running, but we have not yet detected them.
+        """
+        return [
+            h for h in self.mgr.inventory.all_specs()
+            if (
+                self.host_had_daemon_refresh(h.hostname)
+                and '_no_schedule' not in h.labels
+            )
+        ]
+
+    def get_non_draining_hosts(self) -> List[HostSpec]:
+        """
+        Returns all hosts that do not have _no_schedule label.
+
+        Useful for the agent who needs this specific list rather than the
+        schedulable_hosts since the agent needs to be deployed on hosts with
+        no daemon refresh
+        """
+        return [
+            h for h in self.mgr.inventory.all_specs() if '_no_schedule' not in h.labels
+        ]
+
+    def get_unreachable_hosts(self) -> List[HostSpec]:
+        """
+        Return all hosts that are offline or in maintenance mode.
+
+        The idea is we should not touch the daemons on these hosts (since
+        in theory the hosts are inaccessible so we CAN'T touch them) but
+        we still want to count daemons that exist on these hosts toward the
+        placement so daemons on these hosts aren't just moved elsewhere
+        """
+        return [
+            h for h in self.mgr.inventory.all_specs()
+            if (
+                h.status.lower() in ['maintenance', 'offline']
+                or h.hostname in self.mgr.offline_hosts
+            )
+        ]
+
     def get_facts(self, host: str) -> Dict[str, Any]:
         return self.facts.get(host, {})
 
@@ -953,7 +999,7 @@ class HostCache():
         return True
 
     def all_host_metadata_up_to_date(self) -> bool:
-        unreachables = [h.hostname for h in self.mgr._unreachable_hosts()]
+        unreachables = [h.hostname for h in self.get_unreachable_hosts()]
         if [h for h in self.get_hosts() if (not self.host_metadata_up_to_date(h) and h not in unreachables)]:
             # this function is primarily for telling if it's safe to try and apply a service
             # spec. Since offline/maintenance hosts aren't considered in that process anyway
diff --git a/src/pybind/mgr/cephadm/migrations.py b/src/pybind/mgr/cephadm/migrations.py
index 941ede3666de..e5a73f306896 100644
--- a/src/pybind/mgr/cephadm/migrations.py
+++ b/src/pybind/mgr/cephadm/migrations.py
@@ -103,7 +103,7 @@ class Migrations:
             placements, to_add, to_remove = HostAssignment(
                 spec=spec,
                 hosts=self.mgr.inventory.all_specs(),
-                unreachable_hosts=self.mgr._unreachable_hosts(),
+                unreachable_hosts=self.mgr.cache.get_unreachable_hosts(),
                 daemons=existing_daemons,
             ).place()
 
diff --git a/src/pybind/mgr/cephadm/module.py b/src/pybind/mgr/cephadm/module.py
index 3cc37845754a..0dc18c8daf15 100644
--- a/src/pybind/mgr/cephadm/module.py
+++ b/src/pybind/mgr/cephadm/module.py
@@ -1374,52 +1374,6 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule,
 
         return image
 
-    def _schedulable_hosts(self) -> List[HostSpec]:
-        """
-        Returns all usable hosts that went through _refresh_host_daemons().
-
-        This mitigates a potential race, where new host was added *after*
-        ``_refresh_host_daemons()`` was called, but *before*
-        ``_apply_all_specs()`` was called. thus we end up with a hosts
-        where daemons might be running, but we have not yet detected them.
-        """
-        return [
-            h for h in self.inventory.all_specs()
-            if (
-                self.cache.host_had_daemon_refresh(h.hostname)
-                and '_no_schedule' not in h.labels
-            )
-        ]
-
-    def _non_draining_hosts(self) -> List[HostSpec]:
-        """
-        Returns all hosts that do not have _no_schedule label.
-
-        Useful for the agent who needs this specific list rather than the
-        _schedulable_hosts since the agent needs to be deployed on hosts with
-        no daemon refresh
-        """
-        return [
-            h for h in self.inventory.all_specs() if '_no_schedule' not in h.labels
-        ]
-
-    def _unreachable_hosts(self) -> List[HostSpec]:
-        """
-        Return all hosts that are offline or in maintenance mode.
-
-        The idea is we should not touch the daemons on these hosts (since
-        in theory the hosts are inaccessible so we CAN'T touch them) but
-        we still want to count daemons that exist on these hosts toward the
-        placement so daemons on these hosts aren't just moved elsewhere
-        """
-        return [
-            h for h in self.inventory.all_specs()
-            if (
-                h.status.lower() in ['maintenance', 'offline']
-                or h.hostname in self.offline_hosts
-            )
-        ]
-
     def _check_valid_addr(self, host: str, addr: str) -> str:
         # make sure hostname is resolvable before trying to make a connection
         try:
@@ -1823,7 +1777,7 @@ Then run the following:
                 continue
             sm[nm] = orchestrator.ServiceDescription(
                 spec=spec,
-                size=spec.placement.get_target_count(self._schedulable_hosts()),
+                size=spec.placement.get_target_count(self.cache.get_schedulable_hosts()),
                 running=0,
                 events=self.events.get_for_service(spec.service_name()),
                 created=self.spec_store.spec_created[nm],
@@ -2400,8 +2354,8 @@ Then run the following:
         svc = self.cephadm_services[spec.service_type]
         ha = HostAssignment(
             spec=spec,
-            hosts=self._schedulable_hosts(),
-            unreachable_hosts=self._unreachable_hosts(),
+            hosts=self.cache.get_schedulable_hosts(),
+            unreachable_hosts=self.cache.get_unreachable_hosts(),
             networks=self.cache.networks,
             daemons=self.cache.get_daemons_by_service(spec.service_name()),
             allow_colo=svc.allow_colo(),
@@ -2477,7 +2431,7 @@ Then run the following:
             HostAssignment(
                 spec=spec,
                 hosts=self.inventory.all_specs(),  # All hosts, even those without daemon refresh
-                unreachable_hosts=self._unreachable_hosts(),
+                unreachable_hosts=self.cache.get_unreachable_hosts(),
                 networks=self.cache.networks,
                 daemons=self.cache.get_daemons_by_service(spec.service_name()),
                 allow_colo=self.cephadm_services[spec.service_type].allow_colo(),
diff --git a/src/pybind/mgr/cephadm/serve.py b/src/pybind/mgr/cephadm/serve.py
index ef15bab72b18..3559e687867d 100644
--- a/src/pybind/mgr/cephadm/serve.py
+++ b/src/pybind/mgr/cephadm/serve.py
@@ -209,8 +209,8 @@ class CephadmServe:
             pspec = PlacementSpec.from_string(self.mgr.manage_etc_ceph_ceph_conf_hosts)
             ha = HostAssignment(
                 spec=ServiceSpec('mon', placement=pspec),
-                hosts=self.mgr._schedulable_hosts(),
-                unreachable_hosts=self.mgr._unreachable_hosts(),
+                hosts=self.mgr.cache.get_schedulable_hosts(),
+                unreachable_hosts=self.mgr.cache.get_unreachable_hosts(),
                 daemons=[],
                 networks=self.mgr.cache.networks,
             )
@@ -241,8 +241,8 @@ class CephadmServe:
                     keyring.encode('utf-8')).digest())
             ha = HostAssignment(
                 spec=ServiceSpec('mon', placement=ks.placement),
-                hosts=self.mgr._schedulable_hosts(),
-                unreachable_hosts=self.mgr._unreachable_hosts(),
+                hosts=self.mgr.cache.get_schedulable_hosts(),
+                unreachable_hosts=self.mgr.cache.get_unreachable_hosts(),
                 daemons=[],
                 networks=self.mgr.cache.networks,
             )
@@ -295,7 +295,7 @@ class CephadmServe:
                 if r is not None:
                     bad_hosts.append(r)
 
-            if not self.mgr.use_agent or host not in [h.hostname for h in self.mgr._non_draining_hosts()]:
+            if not self.mgr.use_agent or host not in [h.hostname for h in self.mgr.cache.get_non_draining_hosts()]:
                 if self.mgr.cache.host_needs_daemon_refresh(host):
                     self.log.debug('refreshing %s daemons' % host)
                     r = self._refresh_host_daemons(host)
@@ -717,8 +717,9 @@ class CephadmServe:
         rank_map = self.mgr.spec_store[spec.service_name()].rank_map or {}
         ha = HostAssignment(
             spec=spec,
-            hosts=self.mgr._non_draining_hosts() if spec.service_name() == 'agent' else self.mgr._schedulable_hosts(),
-            unreachable_hosts=self.mgr._unreachable_hosts(),
+            hosts=self.mgr.cache.get_non_draining_hosts() if spec.service_name(
+            ) == 'agent' else self.mgr.cache.get_schedulable_hosts(),
+            unreachable_hosts=self.mgr.cache.get_unreachable_hosts(),
             daemons=daemons,
             networks=self.mgr.cache.networks,
             filter_new_host=(
@@ -909,7 +910,7 @@ class CephadmServe:
             if self.mgr.use_agent:
                 # can only send ack to agents if we know for sure port they bound to
                 hosts_altered = set([h for h in hosts_altered if (h in self.mgr.cache.agent_ports and h in [
-                    h2.hostname for h2 in self.mgr._non_draining_hosts()])])
+                    h2.hostname for h2 in self.mgr.cache.get_non_draining_hosts()])])
                 self.mgr.agent_helpers._request_agent_acks(hosts_altered, increment=True)
 
             if r is None:
diff --git a/src/pybind/mgr/cephadm/services/osd.py b/src/pybind/mgr/cephadm/services/osd.py
index 7b63fe7aa4e9..f5996eeae764 100644
--- a/src/pybind/mgr/cephadm/services/osd.py
+++ b/src/pybind/mgr/cephadm/services/osd.py
@@ -152,7 +152,7 @@ class OSDService(CephService):
     def prepare_drivegroup(self, drive_group: DriveGroupSpec) -> List[Tuple[str, DriveSelection]]:
         # 1) use fn_filter to determine matching_hosts
         matching_hosts = drive_group.placement.filter_matching_hostspecs(
-            self.mgr._schedulable_hosts())
+            self.mgr.cache.get_schedulable_hosts())
 
         # 2) Map the inventory to the InventoryHost object
         host_ds_map = []
@@ -261,7 +261,7 @@ class OSDService(CephService):
         if not osdspecs:
             self.mgr.log.debug("No OSDSpecs found")
             return []
-        return sum([spec.placement.filter_matching_hostspecs(self.mgr._schedulable_hosts()) for spec in osdspecs], [])
+        return sum([spec.placement.filter_matching_hostspecs(self.mgr.cache.get_schedulable_hosts()) for spec in osdspecs], [])
 
     def resolve_osdspecs_for_host(self, host: str,
                                   specs: Optional[List[DriveGroupSpec]] = None) -> List[DriveGroupSpec]:
@@ -271,7 +271,7 @@ class OSDService(CephService):
             specs = [cast(DriveGroupSpec, spec) for (sn, spec) in self.mgr.spec_store.spec_preview.items()
                      if spec.service_type == 'osd']
         for spec in specs:
-            if host in spec.placement.filter_matching_hostspecs(self.mgr._schedulable_hosts()):
+            if host in spec.placement.filter_matching_hostspecs(self.mgr.cache.get_schedulable_hosts()):
                 self.mgr.log.debug(f"Found OSDSpecs for host: <{host}> -> <{spec}>")
                 matching_specs.append(spec)
         return matching_specs
diff --git a/src/pybind/mgr/cephadm/tests/test_cephadm.py b/src/pybind/mgr/cephadm/tests/test_cephadm.py
index 50628ca3f78b..b25411f92f4e 100644
--- a/src/pybind/mgr/cephadm/tests/test_cephadm.py
+++ b/src/pybind/mgr/cephadm/tests/test_cephadm.py
@@ -1127,11 +1127,12 @@ spec:
 
         # being in offline/maint mode should disqualify hosts from being
         # candidates for scheduling
         candidates = [
-            h.hostname for h in cephadm_module._schedulable_hosts()]
+            h.hostname for h in cephadm_module.cache.get_schedulable_hosts()]
         assert 'test2' in candidates
         assert 'test3' in candidates
 
-        unreachable = [h.hostname for h in cephadm_module._unreachable_hosts()]
+        unreachable = [
+            h.hostname for h in cephadm_module.cache.get_unreachable_hosts()]
         assert 'test2' in unreachable
         assert 'test3' in unreachable
-- 
2.47.3
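
Reviewer note (not part of the patch): a minimal sketch of the call pattern this
change standardizes. The HostCache accessors and the HostAssignment keyword
arguments are taken from the hunks above; the helper name place_service, the bare
`mgr` handle (standing in for a CephadmOrchestrator instance) and the import paths
are illustrative assumptions rather than code from the tree.

    # Hypothetical example, mirroring the migrations.py and module.py call sites above.
    # Import paths assume the Ceph mgr module environment.
    from typing import Any

    from ceph.deployment.service_spec import ServiceSpec
    from cephadm.schedule import HostAssignment


    def place_service(mgr: Any, spec: ServiceSpec):
        # Hosts eligible for scheduling: daemon-refreshed and not labeled _no_schedule.
        hosts = mgr.cache.get_schedulable_hosts()
        # Offline/maintenance hosts: their daemons still count toward placement,
        # but cephadm will not touch them.
        unreachable = mgr.cache.get_unreachable_hosts()
        placements, to_add, to_remove = HostAssignment(
            spec=spec,
            hosts=hosts,
            unreachable_hosts=unreachable,
            daemons=mgr.cache.get_daemons_by_service(spec.service_name()),
            networks=mgr.cache.networks,
        ).place()
        return placements, to_add, to_remove

The deliberate exception is the agent service in serve.py: it schedules against
mgr.cache.get_non_draining_hosts() instead, since the agent should be deployed even
on hosts that have not yet completed a daemon refresh.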