From f3c3c66b12425fe7078a1c00ea3fafd3647207b8 Mon Sep 17 00:00:00 2001
From: Sage Weil
Date: Fri, 9 Apr 2021 16:26:00 -0400
Subject: [PATCH] mgr/cephadm: rewrite/simplify describe_service

The prior implementation first tried to fabricate services based on the
running daemons, and then filled in defined services on top. This led to
duplication and a range of small errors.

Instead, flip this around: start with the services that are defined, and
only fill in 'unmanaged' services where we need to.

Drop the osd kludges and instead rely on DaemonDescription.service_id to
return the right thing.

Signed-off-by: Sage Weil
(cherry picked from commit 58d9e90425679fd715aa31d7c8f1044f4582388e)
---
 src/pybind/mgr/cephadm/module.py | 114 +++++++++++++------------------
 1 file changed, 46 insertions(+), 68 deletions(-)

diff --git a/src/pybind/mgr/cephadm/module.py b/src/pybind/mgr/cephadm/module.py
index f5d4307a87313..7c5008ec3a8a0 100644
--- a/src/pybind/mgr/cephadm/module.py
+++ b/src/pybind/mgr/cephadm/module.py
@@ -1562,97 +1562,75 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule,
             self._invalidate_daemons_and_kick_serve()
             self.log.info('Kicked serve() loop to refresh all services')
 
-        # <service_map>
         sm: Dict[str, orchestrator.ServiceDescription] = {}
-        osd_count = 0
+
+        # known services
+        for nm, spec in self.spec_store.all_specs.items():
+            if service_type is not None and service_type != spec.service_type:
+                continue
+            if service_name is not None and service_name != nm:
+                continue
+            sm[nm] = orchestrator.ServiceDescription(
+                spec=spec,
+                size=spec.placement.get_target_count(self.inventory.all_specs()),
+                running=0,
+                events=self.events.get_for_service(spec.service_name()),
+                created=self.spec_store.spec_created[nm],
+                deleted=self.spec_store.spec_deleted.get(nm, None),
+            )
+            if service_type == 'nfs':
+                spec = cast(NFSServiceSpec, spec)
+                sm[nm].rados_config_location = spec.rados_config_location()
+            if spec.service_type == 'ha-rgw':
+                # ha-rgw has 2 daemons running per host
+                sm[nm].size *= 2
+
+        # factor daemons into status
         for h, dm in self.cache.get_daemons_with_volatile_status():
             for name, dd in dm.items():
                 assert dd.hostname is not None, f'no hostname for {dd!r}'
                 assert dd.daemon_type is not None, f'no daemon_type for {dd!r}'
 
-                if service_type and service_type != dd.daemon_type:
-                    continue
                 n: str = dd.service_name()
+
+                if (
+                    service_type
+                    and service_type != n
+                    and not dd.daemon_type.startswith(n + '.')
+                ):
+                    continue
                 if service_name and service_name != n:
                     continue
-                if dd.daemon_type == 'osd':
-                    """
-                    OSDs do not know the affinity to their spec out of the box.
-                    """
-                    n = f"osd.{dd.osdspec_affinity}"
-                    if not dd.osdspec_affinity:
-                        # If there is no osdspec_affinity, the spec should suffice for displaying
-                        continue
-                if n in self.spec_store.all_specs:
-                    spec = self.spec_store.all_specs[n]
-                else:
+
+                if n not in sm:
+                    # new unmanaged service
                     spec = ServiceSpec(
                         unmanaged=True,
                         service_type=daemon_type_to_service(dd.daemon_type),
                         service_id=dd.service_id(),
-                        placement=PlacementSpec(
-                            hosts=[dd.hostname]
-                        )
                     )
-                if n not in sm:
                     sm[n] = orchestrator.ServiceDescription(
                         last_refresh=dd.last_refresh,
                         container_image_id=dd.container_image_id,
                         container_image_name=dd.container_image_name,
                         spec=spec,
-                        events=self.events.get_for_service(spec.service_name()),
+                        size=0,
                     )
-                if n in self.spec_store.all_specs:
-                    if dd.daemon_type == 'osd':
-                        """
-                        The osd count can't be determined by the Placement spec.
-                        Showing an actual/expected representation cannot be determined
-                        here. So we're setting running = size for now.
-                        """
-                        osd_count += 1
-                        sm[n].size = osd_count
-                    else:
-                        sm[n].size = spec.placement.get_target_count(
-                            self.inventory.all_specs())
-
-                    sm[n].created = self.spec_store.spec_created[n]
-                    sm[n].deleted = self.spec_store.spec_deleted.get(n, None)
-
-                    if service_type == 'nfs':
-                        spec = cast(NFSServiceSpec, spec)
-                        sm[n].rados_config_location = spec.rados_config_location()
-                else:
-                    sm[n].size = 0
+
                 if dd.status == DaemonDescriptionStatus.running:
                     sm[n].running += 1
-                if not sm[n].last_refresh or not dd.last_refresh or dd.last_refresh < sm[n].last_refresh:  # type: ignore
+                if dd.daemon_type == 'osd':
+                    # The osd count can't be determined by the Placement spec.
+                    # Showing an actual/expected representation cannot be determined
+                    # here. So we're setting running = size for now.
+                    sm[n].size += 1
+                if (
+                    not sm[n].last_refresh
+                    or not dd.last_refresh
+                    or dd.last_refresh < sm[n].last_refresh  # type: ignore
+                ):
                     sm[n].last_refresh = dd.last_refresh
-                if sm[n].container_image_id != dd.container_image_id:
-                    sm[n].container_image_id = 'mix'
-                if sm[n].container_image_name != dd.container_image_name:
-                    sm[n].container_image_name = 'mix'
-                if dd.daemon_type == 'haproxy' or dd.daemon_type == 'keepalived':
-                    # ha-rgw has 2 daemons running per host
-                    sm[n].size = sm[n].size * 2
-        for n, spec in self.spec_store.all_specs.items():
-            if n in sm:
-                continue
-            if service_type is not None and service_type != spec.service_type:
-                continue
-            if service_name is not None and service_name != n:
-                continue
-            sm[n] = orchestrator.ServiceDescription(
-                spec=spec,
-                size=spec.placement.get_target_count(self.inventory.all_specs()),
-                running=0,
-                events=self.events.get_for_service(spec.service_name()),
-            )
-            if service_type == 'nfs':
-                spec = cast(NFSServiceSpec, spec)
-                sm[n].rados_config_location = spec.rados_config_location()
-            if spec.service_type == 'ha-rgw':
-                # ha-rgw has 2 daemons running per host
-                sm[n].size = sm[n].size * 2
+
         return list(sm.values())
 
     @handle_orch_error
-- 
2.39.5
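Reviewer note, not part of the patch: the standalone sketch below illustrates the control flow the rewrite adopts, as described in the commit message. Entries are built from the defined specs first, then the observed daemons are folded into them, and an 'unmanaged' placeholder is created only when a daemon has no covering spec. The Spec, Daemon and ServiceStatus dataclasses and the describe() helper are simplified, hypothetical stand-ins for cephadm's ServiceSpec, DaemonDescription, ServiceDescription and describe_service(), not the real classes.

# Illustrative sketch only; simplified stand-ins for the cephadm types.
from dataclasses import dataclass
from typing import Dict, List


@dataclass
class Spec:
    service_name: str      # e.g. 'mon' or 'osd.all-available-devices'
    target_count: int      # what the placement asks for
    unmanaged: bool = False


@dataclass
class Daemon:
    service_name: str      # service this daemon maps to
    running: bool = True


@dataclass
class ServiceStatus:
    spec: Spec
    size: int = 0          # expected number of daemons
    running: int = 0       # daemons observed running


def describe(specs: List[Spec], daemons: List[Daemon]) -> Dict[str, ServiceStatus]:
    sm: Dict[str, ServiceStatus] = {}

    # 1. known services: one entry per defined spec, sized from its placement
    for spec in specs:
        sm[spec.service_name] = ServiceStatus(spec=spec, size=spec.target_count)

    # 2. factor daemons into status; a daemon without a covering spec gets an
    #    'unmanaged' placeholder instead of fabricating a second entry
    for dd in daemons:
        n = dd.service_name
        if n not in sm:
            sm[n] = ServiceStatus(spec=Spec(n, 0, unmanaged=True), size=0)
        if dd.running:
            sm[n].running += 1
    return sm


if __name__ == '__main__':
    specs = [Spec('mon', target_count=3)]
    daemons = [Daemon('mon'), Daemon('mon'), Daemon('crash')]
    for name, st in describe(specs, daemons).items():
        print(f'{name}: {st.running}/{st.size} running (unmanaged={st.spec.unmanaged})')

Because the spec map is populated before any daemon is examined, a daemon can never fabricate a second entry for a service that already has a spec, which is the duplication the old daemon-first ordering was prone to.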