From 3cadc37500600366a78c21ca5676bd61aeaca1e8 Mon Sep 17 00:00:00 2001
From: Adam King <adking@redhat.com>
Date: Mon, 9 Sep 2024 18:28:45 -0400
Subject: [PATCH] cephadm: add ability to continue on failure when applying
 multiple specs

Additionally, add the flag that does so when cephadm applies
a spec during bootstrap. Bootstrap will continue to completion
regardless of whether applying the spec fails, so we might
as well try applying all of it while reporting errors we
do see back to the user

Fixes: https://tracker.ceph.com/issues/65338

Signed-off-by: Adam King <adking@redhat.com>
(cherry picked from commit 12a9aba43caedf6ea2cb897708e0b31d96ee358f)
---
 src/cephadm/cephadm.py                    |  2 +-
 src/pybind/mgr/cephadm/module.py          | 16 +++++++-
 src/pybind/mgr/orchestrator/_interface.py |  7 +++-
 src/pybind/mgr/orchestrator/module.py     | 48 +++++++++++++++++++----
 4 files changed, 62 insertions(+), 11 deletions(-)

diff --git a/src/cephadm/cephadm.py b/src/cephadm/cephadm.py
index 5758e7c258b36..45ec18fe1a748 100755
--- a/src/cephadm/cephadm.py
+++ b/src/cephadm/cephadm.py
@@ -2915,7 +2915,7 @@ def command_bootstrap(ctx):
         mounts = {}
         mounts[pathify(ctx.apply_spec)] = '/tmp/spec.yml:ro'
         try:
-            out = cli(['orch', 'apply', '-i', '/tmp/spec.yml'], extra_mounts=mounts)
+            out = cli(['orch', 'apply', '--continue-on-error', '-i', '/tmp/spec.yml'], extra_mounts=mounts)
             logger.info(out)
         except Exception:
             ctx.error_code = -errno.EINVAL
diff --git a/src/pybind/mgr/cephadm/module.py b/src/pybind/mgr/cephadm/module.py
index a8b71a1081eff..1ddfb0240e713 100644
--- a/src/pybind/mgr/cephadm/module.py
+++ b/src/pybind/mgr/cephadm/module.py
@@ -3487,7 +3487,12 @@ Then run the following:
         return "Scheduled %s update..." % spec.service_name()
 
     @handle_orch_error
-    def apply(self, specs: Sequence[GenericSpec], no_overwrite: bool = False) -> List[str]:
+    def apply(
+        self,
+        specs: Sequence[GenericSpec],
+        no_overwrite: bool = False,
+        continue_on_error: bool = True
+    ) -> List[str]:
         results = []
         for spec in specs:
             if no_overwrite:
@@ -3499,7 +3504,14 @@ Then run the following:
                     results.append('Skipped %s service spec. To change %s spec omit --no-overwrite flag'
                                    % (cast(ServiceSpec, spec).service_name(), cast(ServiceSpec, spec).service_name()))
                     continue
-            results.append(self._apply(spec))
+            try:
+                res = self._apply(spec)
+                results.append(res)
+            except Exception as e:
+                if continue_on_error:
+                    results.append(f'Failed to apply spec for {spec}: {str(e)}')
+                else:
+                    raise e
         return results
 
     @handle_orch_error
diff --git a/src/pybind/mgr/orchestrator/_interface.py b/src/pybind/mgr/orchestrator/_interface.py
index 9e5c08f7959e3..7bd6943795601 100644
--- a/src/pybind/mgr/orchestrator/_interface.py
+++ b/src/pybind/mgr/orchestrator/_interface.py
@@ -591,7 +591,12 @@ class Orchestrator(object):
         raise NotImplementedError()
 
     @handle_orch_error
-    def apply(self, specs: Sequence["GenericSpec"], no_overwrite: bool = False) -> List[str]:
+    def apply(
+        self,
+        specs: Sequence["GenericSpec"],
+        no_overwrite: bool = False,
+        continue_on_error: bool = False
+    ) -> List[str]:
         """
         Applies any spec
         """
diff --git a/src/pybind/mgr/orchestrator/module.py b/src/pybind/mgr/orchestrator/module.py
index fd7ffddaa5158..9cecb377c2bdb 100644
--- a/src/pybind/mgr/orchestrator/module.py
+++ b/src/pybind/mgr/orchestrator/module.py
@@ -1641,12 +1641,14 @@ Usage:
                    format: Format = Format.plain,
                    unmanaged: bool = False,
                    no_overwrite: bool = False,
+                   continue_on_error: bool = False,
                    inbuf: Optional[str] = None) -> HandleCommandResult:
         """Update the size or placement for a service or apply a large yaml spec"""
         usage = """Usage:
   ceph orch apply -i <yaml spec> [--dry-run]
   ceph orch apply <service_type> [--placement=<placement_string>] [--unmanaged]
         """
+        errs: List[str] = []
         if inbuf:
             if service_type or placement or unmanaged:
                 raise OrchestratorValidationError(usage)
@@ -1656,7 +1658,14 @@ Usage:
             # None entries in the output. Let's skip them silently.
             content = [o for o in yaml_objs if o is not None]
             for s in content:
-                spec = json_to_generic_spec(s)
+                try:
+                    spec = json_to_generic_spec(s)
+                except Exception as e:
+                    if continue_on_error:
+                        errs.append(f'Failed to convert {s} from json object: {str(e)}')
+                        continue
+                    else:
+                        raise e
 
                 # validate the config (we need MgrModule for that)
                 if isinstance(spec, ServiceSpec) and spec.config:
@@ -1664,7 +1673,12 @@ Usage:
                         try:
                             self.get_foreign_ceph_option('mon', k)
                         except KeyError:
-                            raise SpecValidationError(f'Invalid config option {k} in spec')
+                            err = SpecValidationError(f'Invalid config option {k} in spec')
+                            if continue_on_error:
+                                errs.append(str(err))
+                                continue
+                            else:
+                                raise err
 
                 # There is a general "osd" service with no service id, but we use
                 # that to dump osds created individually with "ceph orch daemon add osd"
@@ -1679,7 +1693,12 @@ Usage:
                     and spec.service_type == 'osd'
                     and not spec.service_id
                 ):
-                    raise SpecValidationError('Please provide the service_id field in your OSD spec')
+                    err = SpecValidationError('Please provide the service_id field in your OSD spec')
+                    if continue_on_error:
+                        errs.append(str(err))
+                        continue
+                    else:
+                        raise err
 
                 if dry_run and not isinstance(spec, HostSpec):
                     spec.preview_only = dry_run
@@ -1689,15 +1708,30 @@ Usage:
                     continue
                 specs.append(spec)
         else:
+            # Note in this case there is only ever one spec
+            # being applied so there is no need to worry about
+            # handling of continue_on_error
             placementspec = PlacementSpec.from_string(placement)
             if not service_type:
                 raise OrchestratorValidationError(usage)
             specs = [ServiceSpec(service_type.value, placement=placementspec,
                                  unmanaged=unmanaged, preview_only=dry_run)]
-        return self._apply_misc(specs, dry_run, format, no_overwrite)
-
-    def _apply_misc(self, specs: Sequence[GenericSpec], dry_run: bool, format: Format, no_overwrite: bool = False) -> HandleCommandResult:
-        completion = self.apply(specs, no_overwrite)
+        cmd_result = self._apply_misc(specs, dry_run, format, no_overwrite, continue_on_error)
+        if errs:
+            # HandleCommandResult is a named tuple, so use
+            # _replace to modify it.
+            cmd_result = cmd_result._replace(stdout=cmd_result.stdout + '\n' + '\n'.join(errs))
+        return cmd_result
+
+    def _apply_misc(
+        self,
+        specs: Sequence[GenericSpec],
+        dry_run: bool,
+        format: Format,
+        no_overwrite: bool = False,
+        continue_on_error: bool = False
+    ) -> HandleCommandResult:
+        completion = self.apply(specs, no_overwrite, continue_on_error)
         raise_if_exception(completion)
         out = completion.result_str()
         if dry_run:
-- 
2.39.5