From: Guillaume Abrioux Date: Wed, 10 Apr 2024 13:00:21 +0000 (+0200) Subject: cephadm: check if file exists when passing `--apply_spec` X-Git-Tag: v20.0.0~2048^2 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=2ede1484925452c1ba717de0b9e8f9310c128bfb;p=ceph.git cephadm: check if file exists when passing `--apply_spec` cephadm deploys the cluster, fails and does a rollback. If the passed file doesn't exist we can make the CLI fail early instead. ``` ... omitted output ... Applying ../host-spec.yaml to cluster FileNotFoundError: [Errno 2] No such file or directory: '../host-spec.yaml' *************** Cephadm hit an issue during cluster installation. Current cluster files will be deleted automatically. To disable this behaviour you can pass the --no-cleanup-on-failure flag. In case of any previous broken installation, users must use the following command to completely delete the broken cluster: > cephadm rm-cluster --force --zap-osds --fsid for more information please refer to https://docs.ceph.com/en/latest/cephadm/operations/#purging-a-cluster *************** Deleting cluster with fsid: 6e6a2dbe-f73a-11ee-8262-98be948800fd Traceback (most recent call last): File "/usr/lib64/python3.9/runpy.py", line 197, in _run_module_as_main return _run_code(code, main_globals, None, File "/usr/lib64/python3.9/runpy.py", line 87, in _run_code exec(code, run_globals) File "/tmp/tmpive4g9gs.cephadm.build/app/__main__.py", line 5615, in File "/tmp/tmpive4g9gs.cephadm.build/app/__main__.py", line 5603, in main File "/tmp/tmpive4g9gs.cephadm.build/app/__main__.py", line 2693, in _rollback File "/tmp/tmpive4g9gs.cephadm.build/app/__main__.py", line 445, in _default_image File "/tmp/tmpive4g9gs.cephadm.build/app/__main__.py", line 2958, in command_bootstrap FileNotFoundError: [Errno 2] No such file or directory: '../host-spec.yaml' ``` Signed-off-by: Guillaume Abrioux --- diff --git a/src/cephadm/cephadm.py b/src/cephadm/cephadm.py index 
6257fb11d13..8e2677d35ea 100755 --- a/src/cephadm/cephadm.py +++ b/src/cephadm/cephadm.py @@ -2695,8 +2695,9 @@ def rollback(func: FuncT) -> FuncT: # another cluster with the provided fsid already exists: don't remove. raise except (KeyboardInterrupt, Exception) as e: - logger.error(f'{type(e).__name__}: {e}') - if ctx.no_cleanup_on_failure: + # If ctx.fsid is None it would print a meaningless message suggesting + # running "cephadm rm-cluster --force --fsid None" + if ctx.no_cleanup_on_failure and ctx.fsid is not None: logger.info('\n\n' '\t***************\n' '\tCephadm hit an issue during cluster installation. Current cluster files will NOT BE DELETED automatically. To change\n' @@ -2706,7 +2707,10 @@ def rollback(func: FuncT) -> FuncT: '\t > cephadm rm-cluster --force --zap-osds --fsid \n\n' '\tfor more information please refer to https://docs.ceph.com/en/latest/cephadm/operations/#purging-a-cluster\n' '\t***************\n\n') - else: + if not ctx.no_cleanup_on_failure: + # The logger.error() used to be called before these conditions, which resulted in the error being printed twice. + # Moving it inside this condition to print the error if _rm_cluster() is called and also fails. + logger.error(f'{type(e).__name__}: {e}') logger.info('\n\n' '\t***************\n' '\tCephadm hit an issue during cluster installation. 
Current cluster files will be deleted automatically.\n' @@ -2734,6 +2738,13 @@ def command_bootstrap(ctx): if not ctx.output_pub_ssh_key: ctx.output_pub_ssh_key = os.path.join(ctx.output_dir, CEPH_PUBKEY) + if ctx.apply_spec and not os.path.exists(ctx.apply_spec): + # Given that nothing has been deployed at this point, setting `ctx.no_cleanup_on_failure = True` + # as there's no need to call _rm_cluster() which would generate the message: + # "ERROR: must select the cluster to delete by passing --fsid to proceed" + ctx.no_cleanup_on_failure = True + raise Error(f"--apply-spec has been specified but {ctx.apply_spec} doesn't exist.") + if ( (bool(ctx.ssh_private_key) is not bool(ctx.ssh_public_key)) and (bool(ctx.ssh_private_key) is not bool(ctx.ssh_signed_cert))