cephadm: adding rollback mechanism to handle bootstrap failures

author Redouane Kachach <rkachach@redhat.com>

Tue, 23 May 2023 12:01:26 +0000 (14:01 +0200)

committer Adam King <adking@redhat.com>

Tue, 10 Oct 2023 17:03:38 +0000 (13:03 -0400)
author Redouane Kachach <rkachach@redhat.com>
Tue, 23 May 2023 12:01:26 +0000 (14:01 +0200)
committer Adam King <adking@redhat.com>
Tue, 10 Oct 2023 17:03:38 +0000 (13:03 -0400)
diff --git a/src/cephadm/cephadm.py b/src/cephadm/cephadm.py

index 36f2e9adebcbb0e1c992b47390e26b3371990b56..081870fb0b8c37c42d2fe1c0799b6e8d23b8b20e 100755 (executable)
--- a/src/cephadm/cephadm.py
+++ b/src/cephadm/cephadm.py
@@ -377,6 +377,10 @@ class Error(Exception):
      pass
  
  
+class ClusterAlreadyExists(Exception):  # bootstrap target (fsid data dir or an --output-* file) already exists
+    pass  # rollback() re-raises this without running cleanup, so the pre-existing cluster is not removed
+
+
  class TimeoutExpired(Error):
      pass
  
@@ -3349,10 +3353,14 @@ def get_container_mounts(ctx, fsid, daemon_type, daemon_id,
      if daemon_type == 'osd':
          # selinux-policy in the container may not match the host.
          if HostFacts(ctx).selinux_enabled:
-            selinux_folder = '/var/lib/ceph/%s/selinux' % fsid
-            if not os.path.exists(selinux_folder):
-                os.makedirs(selinux_folder, mode=0o755)
-            mounts[selinux_folder] = '/sys/fs/selinux:ro'
+            cluster_dir = f'{ctx.data_dir}/{fsid}'
+            selinux_folder = f'{cluster_dir}/selinux'
+            if os.path.exists(cluster_dir):
+                if not os.path.exists(selinux_folder):
+                    os.makedirs(selinux_folder, mode=0o755)
+                mounts[selinux_folder] = '/sys/fs/selinux:ro'
+            else:
+                logger.error(f'Cluster direcotry {cluster_dir} does not exist.')
          mounts['/'] = '/rootfs'
  
      try:
@@ -5683,6 +5691,7 @@ def create_mgr(
          except Exception as e:
              logger.debug('status failed: %s' % e)
              return False
+
      is_available(ctx, 'mgr', is_mgr_available)
  
  
@@ -6078,6 +6087,43 @@ def save_cluster_config(ctx: CephadmContext, uid: int, gid: int, fsid: str) -> N
          logger.warning(f'Cannot create cluster configuration directory {conf_dir}')
  
  
+def rollback(func: FuncT) -> FuncT:
+    """Decorator for bootstrap commands: if func(ctx) raises, log guidance, delete the partial
+    cluster via _rm_cluster() when ctx.cleanup_on_failure is set, and re-raise the exception."""
+    @wraps(func)
+    def _rollback(ctx: CephadmContext) -> Any:
+        try:
+            return func(ctx)
+        except ClusterAlreadyExists:
+            # another cluster with the provided fsid already exists: don't remove.
+            raise
+        except (KeyboardInterrupt, Exception) as e:  # KeyboardInterrupt is not an Exception subclass, so list it explicitly
+            logger.error(f'{type(e).__name__}: {e}')
+            if ctx.cleanup_on_failure:
+                logger.info('\n\n'
+                            '\t***************\n'
+                            '\tCephadm hit an issue during cluster installation. Current cluster files will be deleted automatically,\n'
+                            '\tto disable this behaviour you can pass the --no-cleanup-on-failure flag. In case of any previous\n'
+                            '\tbroken installation user must use the following command to completely delete the broken cluster:\n\n'
+                            '\t> cephadm rm-cluster --force --zap-osds --fsid <fsid>\n\n'
+                            '\tfor more information please refer to https://docs.ceph.com/en/latest/cephadm/operations/#purging-a-cluster\n'
+                            '\t***************\n\n')
+                _rm_cluster(ctx, keep_logs=False, zap_osds=False)  # logs are removed too; OSD devices are not zapped
+            else:
+                logger.info('\n\n'
+                            '\t***************\n'
+                            '\tCephadm hit an issue during cluster installation. Current cluster files will NOT BE DELETED automatically to change\n'
+                            '\tthis behaviour you can pass the --cleanup-on-failure. To remove this broken cluster manually please run:\n\n'
+                            f'\t   > cephadm rm-cluster --force --fsid {ctx.fsid}\n\n'
+                            '\tin case of any previous broken installation user must use the rm-cluster command to delete the broken cluster:\n\n'
+                            '\t   > cephadm rm-cluster --force --zap-osds --fsid <fsid>\n\n'
+                            '\tfor more information please refer to https://docs.ceph.com/en/latest/cephadm/operations/#purging-a-cluster\n'
+                            '\t***************\n\n')
+            raise  # re-raise the failure that was caught, whether or not cleanup ran
+    return cast(FuncT, _rollback)
+
+
+@rollback
  @default_image
  def command_bootstrap(ctx):
      # type: (CephadmContext) -> int
@@ -6108,17 +6154,21 @@ def command_bootstrap(ctx):
      if ctx.fsid:
          data_dir_base = os.path.join(ctx.data_dir, ctx.fsid)
          if os.path.exists(data_dir_base):
-            raise Error(f"A cluster with the same fsid '{ctx.fsid}' already exists.")
+            raise ClusterAlreadyExists(f"A cluster with the same fsid '{ctx.fsid}' already exists.")
          else:
              logger.warning('Specifying an fsid for your cluster offers no advantages and may increase the likelihood of fsid conflicts.')
  
+    # initial vars
+    ctx.fsid = ctx.fsid or make_fsid()
+    fsid = ctx.fsid
+    if not is_fsid(fsid):
+        raise Error('not an fsid: %s' % fsid)
+
      # verify output files
-    for f in [ctx.output_config, ctx.output_keyring,
-              ctx.output_pub_ssh_key]:
+    for f in [ctx.output_config, ctx.output_keyring, ctx.output_pub_ssh_key]:
          if not ctx.allow_overwrite:
              if os.path.exists(f):
-                raise Error('%s already exists; delete or pass '
-                            '--allow-overwrite to overwrite' % f)
+                raise ClusterAlreadyExists('%s already exists; delete or pass --allow-overwrite to overwrite' % f)
          dirname = os.path.dirname(f)
          if dirname and not os.path.exists(dirname):
              fname = os.path.basename(f)
@@ -6139,12 +6189,7 @@ def command_bootstrap(ctx):
      else:
          logger.info('Skip prepare_host')
  
-    # initial vars
-    fsid = ctx.fsid or make_fsid()
-    if not is_fsid(fsid):
-        raise Error('not an fsid: %s' % fsid)
      logger.info('Cluster fsid: %s' % fsid)
-
      hostname = get_hostname()
      if '.' in hostname and not ctx.allow_fqdn_hostname:
          raise Error('hostname is a fully qualified domain name (%s); either fix (e.g., "sudo hostname %s" or similar) or pass --allow-fqdn-hostname' % (hostname, hostname.split('.')[0]))
@@ -7870,14 +7915,20 @@ def get_ceph_cluster_count(ctx: CephadmContext) -> int:
      return len([c for c in os.listdir(ctx.data_dir) if is_fsid(c)])
  
  
-def command_rm_cluster(ctx):
-    # type: (CephadmContext) -> None
+def command_rm_cluster(ctx: CephadmContext) -> None:  # rm-cluster handler: checks --force, takes the lock, delegates to _rm_cluster()
      if not ctx.force:
          raise Error('must pass --force to proceed: '
                      'this command may destroy precious data!')
  
      lock = FileLock(ctx, ctx.fsid)
      lock.acquire()
+    _rm_cluster(ctx, ctx.keep_logs, ctx.zap_osds)  # removal logic lives in _rm_cluster() so rollback() can reuse it
+
+
+def _rm_cluster(ctx: CephadmContext, keep_logs: bool, zap_osds: bool) -> None:
+
+    if not ctx.fsid:
+        raise Error('must select the cluster to delete by passing --fsid to proceed')
  
      def disable_systemd_service(unit_name: str) -> None:
          call(ctx, ['systemctl', 'stop', unit_name],
@@ -7887,6 +7938,8 @@ def command_rm_cluster(ctx):
          call(ctx, ['systemctl', 'disable', unit_name],
               verbosity=CallVerbosity.DEBUG)
  
+    logger.info(f'Deleting cluster with fsid: {ctx.fsid}')
+
      # stop + disable individual daemon units
      for d in list_daemons(ctx, detail=False):
          if d['fsid'] != ctx.fsid:
@@ -7904,7 +7957,7 @@ def command_rm_cluster(ctx):
           verbosity=CallVerbosity.DEBUG)
  
      # osds?
-    if ctx.zap_osds:
+    if zap_osds:
          _zap_osds(ctx)
  
      # rm units
@@ -7917,7 +7970,7 @@ def command_rm_cluster(ctx):
      # rm data
      call_throws(ctx, ['rm', '-rf', ctx.data_dir + '/' + ctx.fsid])
  
-    if not ctx.keep_logs:
+    if not keep_logs:
          # rm logs
          call_throws(ctx, ['rm', '-rf', ctx.log_dir + '/' + ctx.fsid])
          call_throws(ctx, ['rm', '-rf', ctx.log_dir
@@ -7937,7 +7990,7 @@ def command_rm_cluster(ctx):
          # rm cephadm logrotate config
          call_throws(ctx, ['rm', '-f', ctx.logrotate_dir + '/cephadm'])
  
-        if not ctx.keep_logs:
+        if not keep_logs:
              # remove all cephadm logs
              for fname in glob(f'{ctx.log_dir}/cephadm.log*'):
                  os.remove(fname)
@@ -10254,6 +10307,23 @@ def _get_parser():
          '--allow-overwrite',
          action='store_true',
          help='allow overwrite of existing --output-* config/keyring/ssh files')
+    # argparse gained built-in support for paired '--cleanup-on-failure' /
+    # '--no-cleanup-on-failure' flags in python v3.9 (argparse.BooleanOptionalAction).
+    # Since we have to support older python versions, the two options are declared
+    # manually below. Once python v3.9 becomes the minimum supported version, this
+    # can be replaced with argparse.BooleanOptionalAction.
+    group = parser_bootstrap.add_mutually_exclusive_group()
+    group.add_argument(
+        '--cleanup-on-failure',
+        action='store_true',
+        default=True,
+        help='Delete cluster files in case of a failed installation')
+    group.add_argument(
+        '--no-cleanup-on-failure',
+        action='store_const',
+        const=False,
+        dest='cleanup_on_failure',
+        help='Do not delete cluster files in case of a failed installation')
      parser_bootstrap.add_argument(
          '--allow-fqdn-hostname',
          action='store_true',
@@ -10560,7 +10630,7 @@ def main() -> None:
              check_container_engine(ctx)
          # command handler
          r = ctx.func(ctx)
-    except Error as e:
+    except (Error, ClusterAlreadyExists) as e:
          if ctx.verbose:
              raise
          logger.error('ERROR: %s' % e)
author	Redouane Kachach <rkachach@redhat.com>
	Tue, 23 May 2023 12:01:26 +0000 (14:01 +0200)
committer	Adam King <adking@redhat.com>
	Tue, 10 Oct 2023 17:03:38 +0000 (13:03 -0400)