From 12a9aba43caedf6ea2cb897708e0b31d96ee358f Mon Sep 17 00:00:00 2001 From: Adam King Date: Mon, 9 Sep 2024 18:28:45 -0400 Subject: [PATCH] cephadm: add ability to continue on failure when applying multiple specs Additionally, add the flag that does so when cephadm applies a spec during bootstrap. Bootstrap will continue to completion regardless of whether applying the spec fails, so we might as well try applying all of it while reporting errors we do see back to the user Fixes: https://tracker.ceph.com/issues/65338 Signed-off-by: Adam King --- src/cephadm/cephadm.py | 2 +- src/pybind/mgr/cephadm/module.py | 16 +++++++- src/pybind/mgr/orchestrator/_interface.py | 7 +++- src/pybind/mgr/orchestrator/module.py | 48 +++++++++++++++++++---- 4 files changed, 62 insertions(+), 11 deletions(-) diff --git a/src/cephadm/cephadm.py b/src/cephadm/cephadm.py index e71addf7bfa..33cac4a3403 100755 --- a/src/cephadm/cephadm.py +++ b/src/cephadm/cephadm.py @@ -2946,7 +2946,7 @@ def command_bootstrap(ctx): mounts = {} mounts[pathify(ctx.apply_spec)] = '/tmp/spec.yml:ro' try: - out = cli(['orch', 'apply', '-i', '/tmp/spec.yml'], extra_mounts=mounts) + out = cli(['orch', 'apply', '--continue-on-error', '-i', '/tmp/spec.yml'], extra_mounts=mounts) logger.info(out) except Exception: ctx.error_code = -errno.EINVAL diff --git a/src/pybind/mgr/cephadm/module.py b/src/pybind/mgr/cephadm/module.py index 1a9a1086218..650d9711bd3 100644 --- a/src/pybind/mgr/cephadm/module.py +++ b/src/pybind/mgr/cephadm/module.py @@ -3516,7 +3516,12 @@ Then run the following: return "Scheduled %s update..." % spec.service_name() @handle_orch_error - def apply(self, specs: Sequence[GenericSpec], no_overwrite: bool = False) -> List[str]: + def apply( + self, + specs: Sequence[GenericSpec], + no_overwrite: bool = False, + continue_on_error: bool = True + ) -> List[str]: results = [] for spec in specs: if no_overwrite: @@ -3528,7 +3533,14 @@ Then run the following: results.append('Skipped %s service spec. To change %s spec omit --no-overwrite flag' % (cast(ServiceSpec, spec).service_name(), cast(ServiceSpec, spec).service_name())) continue - results.append(self._apply(spec)) + try: + res = self._apply(spec) + results.append(res) + except Exception as e: + if continue_on_error: + results.append(f'Failed to apply spec for {spec}: {str(e)}') + else: + raise e return results @handle_orch_error diff --git a/src/pybind/mgr/orchestrator/_interface.py b/src/pybind/mgr/orchestrator/_interface.py index c33f38cfdd4..7b602401a7e 100644 --- a/src/pybind/mgr/orchestrator/_interface.py +++ b/src/pybind/mgr/orchestrator/_interface.py @@ -574,7 +574,12 @@ class Orchestrator(object): raise NotImplementedError() @handle_orch_error - def apply(self, specs: Sequence["GenericSpec"], no_overwrite: bool = False) -> List[str]: + def apply( + self, + specs: Sequence["GenericSpec"], + no_overwrite: bool = False, + continue_on_error: bool = False + ) -> List[str]: """ Applies any spec """ diff --git a/src/pybind/mgr/orchestrator/module.py b/src/pybind/mgr/orchestrator/module.py index d0f3286177c..7c943b076f4 100644 --- a/src/pybind/mgr/orchestrator/module.py +++ b/src/pybind/mgr/orchestrator/module.py @@ -1620,12 +1620,14 @@ Usage: format: Format = Format.plain, unmanaged: bool = False, no_overwrite: bool = False, + continue_on_error: bool = False, inbuf: Optional[str] = None) -> HandleCommandResult: """Update the size or placement for a service or apply a large yaml spec""" usage = """Usage: ceph orch apply -i [--dry-run] ceph orch apply [--placement=] [--unmanaged] """ + errs: List[str] = [] if inbuf: if service_type or placement or unmanaged: raise OrchestratorValidationError(usage) @@ -1635,7 +1637,14 @@ Usage: # None entries in the output. Let's skip them silently. content = [o for o in yaml_objs if o is not None] for s in content: - spec = json_to_generic_spec(s) + try: + spec = json_to_generic_spec(s) + except Exception as e: + if continue_on_error: + errs.append(f'Failed to convert {s} from json object: {str(e)}') + continue + else: + raise e # validate the config (we need MgrModule for that) if isinstance(spec, ServiceSpec) and spec.config: @@ -1643,7 +1652,12 @@ Usage: try: self.get_foreign_ceph_option('mon', k) except KeyError: - raise SpecValidationError(f'Invalid config option {k} in spec') + err = SpecValidationError(f'Invalid config option {k} in spec') + if continue_on_error: + errs.append(str(err)) + continue + else: + raise err # There is a general "osd" service with no service id, but we use # that to dump osds created individually with "ceph orch daemon add osd" @@ -1658,7 +1672,12 @@ Usage: and spec.service_type == 'osd' and not spec.service_id ): - raise SpecValidationError('Please provide the service_id field in your OSD spec') + err = SpecValidationError('Please provide the service_id field in your OSD spec') + if continue_on_error: + errs.append(str(err)) + continue + else: + raise err if dry_run and not isinstance(spec, HostSpec): spec.preview_only = dry_run @@ -1668,15 +1687,30 @@ Usage: continue specs.append(spec) else: + # Note in this case there is only ever one spec + # being applied so there is no need to worry about + # handling of continue_on_error placementspec = PlacementSpec.from_string(placement) if not service_type: raise OrchestratorValidationError(usage) specs = [ServiceSpec(service_type.value, placement=placementspec, unmanaged=unmanaged, preview_only=dry_run)] - return self._apply_misc(specs, dry_run, format, no_overwrite) - - def _apply_misc(self, specs: Sequence[GenericSpec], dry_run: bool, format: Format, no_overwrite: bool = False) -> HandleCommandResult: - completion = self.apply(specs, no_overwrite) + cmd_result = self._apply_misc(specs, dry_run, format, no_overwrite, continue_on_error) + if errs: + # HandleCommandResult is a named tuple, so use + # _replace to modify it. + cmd_result = cmd_result._replace(stdout=cmd_result.stdout + '\n' + '\n'.join(errs)) + return cmd_result + + def _apply_misc( + self, + specs: Sequence[GenericSpec], + dry_run: bool, + format: Format, + no_overwrite: bool = False, + continue_on_error: bool = False + ) -> HandleCommandResult: + completion = self.apply(specs, no_overwrite, continue_on_error) raise_if_exception(completion) out = completion.result_str() if dry_run: -- 2.47.3