From: Redouane Kachach
Date: Tue, 23 May 2023 12:01:26 +0000 (+0200)
Subject: cephadm: adding rollback mechanism to handle bootstrap failures
X-Git-Tag: v18.2.1~116^2~1
X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=e4e6a19011927fd6c1fbb77428a6f4238034f39e;p=ceph.git

cephadm: adding rollback mechanism to handle bootstrap failures

Fixes: https://tracker.ceph.com/issues/57016
Signed-off-by: Redouane Kachach
(cherry picked from commit 4067e2fe8a37b0dd621e247d139f250428858d9c)
---

diff --git a/src/cephadm/cephadm.py b/src/cephadm/cephadm.py
index 36f2e9adebcb..081870fb0b8c 100755
--- a/src/cephadm/cephadm.py
+++ b/src/cephadm/cephadm.py
@@ -377,6 +377,10 @@ class Error(Exception):
     pass
 
 
+class ClusterAlreadyExists(Exception):
+    pass
+
+
 class TimeoutExpired(Error):
     pass
 
@@ -3349,10 +3353,14 @@ def get_container_mounts(ctx, fsid, daemon_type, daemon_id,
     if daemon_type == 'osd':
         # selinux-policy in the container may not match the host.
         if HostFacts(ctx).selinux_enabled:
-            selinux_folder = '/var/lib/ceph/%s/selinux' % fsid
-            if not os.path.exists(selinux_folder):
-                os.makedirs(selinux_folder, mode=0o755)
-            mounts[selinux_folder] = '/sys/fs/selinux:ro'
+            cluster_dir = f'{ctx.data_dir}/{fsid}'
+            selinux_folder = f'{cluster_dir}/selinux'
+            if os.path.exists(cluster_dir):
+                if not os.path.exists(selinux_folder):
+                    os.makedirs(selinux_folder, mode=0o755)
+                mounts[selinux_folder] = '/sys/fs/selinux:ro'
+            else:
+                logger.error(f'Cluster directory {cluster_dir} does not exist.')
         mounts['/'] = '/rootfs'
 
     try:
@@ -5683,6 +5691,7 @@ def create_mgr(
         except Exception as e:
             logger.debug('status failed: %s' % e)
             return False
+
     is_available(ctx, 'mgr', is_mgr_available)
 
 
@@ -6078,6 +6087,43 @@ def save_cluster_config(ctx: CephadmContext, uid: int, gid: int, fsid: str) -> N
         logger.warning(f'Cannot create cluster configuration directory {conf_dir}')
 
 
+def rollback(func: FuncT) -> FuncT:
+    """Roll back a failed bootstrap: remove the partially created cluster unless
+    --no-cleanup-on-failure was given; a pre-existing cluster is never removed."""
+    @wraps(func)
+    def _rollback(ctx: CephadmContext) -> Any:
+        try:
+            return func(ctx)
+        except ClusterAlreadyExists:
+            # another cluster with the provided fsid already exists: don't remove.
+            raise
+        except (KeyboardInterrupt, Exception) as e:
+            logger.error(f'{type(e).__name__}: {e}')
+            if ctx.cleanup_on_failure:
+                logger.info('\n\n'
+                            '\t***************\n'
+                            '\tCephadm hit an issue during cluster installation. Current cluster files will be deleted automatically.\n'
+                            '\tTo disable this behaviour you can pass the --no-cleanup-on-failure flag. In case of any previous\n'
+                            '\tbroken installation, the user must run the following command to completely delete the broken cluster:\n\n'
+                            '\t> cephadm rm-cluster --force --zap-osds --fsid <fsid>\n\n'
+                            '\tfor more information please refer to https://docs.ceph.com/en/latest/cephadm/operations/#purging-a-cluster\n'
+                            '\t***************\n\n')
+                _rm_cluster(ctx, keep_logs=False, zap_osds=False)
+            else:
+                logger.info('\n\n'
+                            '\t***************\n'
+                            '\tCephadm hit an issue during cluster installation. Current cluster files will NOT BE DELETED automatically. To change\n'
+                            '\tthis behaviour you can pass the --cleanup-on-failure flag. To remove this broken cluster manually please run:\n\n'
+                            f'\t> cephadm rm-cluster --force --fsid {ctx.fsid}\n\n'
+                            '\tin case of any previous broken installation, the user must use the rm-cluster command to delete the broken cluster:\n\n'
+                            '\t> cephadm rm-cluster --force --zap-osds --fsid <fsid>\n\n'
+                            '\tfor more information please refer to https://docs.ceph.com/en/latest/cephadm/operations/#purging-a-cluster\n'
+                            '\t***************\n\n')
+            raise
+    return cast(FuncT, _rollback)
+
+
+@rollback
 @default_image
 def command_bootstrap(ctx):
     # type: (CephadmContext) -> int
@@ -6108,17 +6154,21 @@ def command_bootstrap(ctx):
     if ctx.fsid:
         data_dir_base = os.path.join(ctx.data_dir, ctx.fsid)
         if os.path.exists(data_dir_base):
-            raise Error(f"A cluster with the same fsid '{ctx.fsid}' already exists.")
+            raise ClusterAlreadyExists(f"A cluster with the same fsid '{ctx.fsid}' already exists.")
         else:
            logger.warning('Specifying an fsid for your cluster offers no advantages and may increase the likelihood of fsid conflicts.')
 
+    # initial vars
+    ctx.fsid = ctx.fsid or make_fsid()
+    fsid = ctx.fsid
+    if not is_fsid(fsid):
+        raise Error('not an fsid: %s' % fsid)
+
     # verify output files
-    for f in [ctx.output_config, ctx.output_keyring,
-              ctx.output_pub_ssh_key]:
+    for f in [ctx.output_config, ctx.output_keyring, ctx.output_pub_ssh_key]:
         if not ctx.allow_overwrite:
             if os.path.exists(f):
-                raise Error('%s already exists; delete or pass '
-                            '--allow-overwrite to overwrite' % f)
+                raise ClusterAlreadyExists('%s already exists; delete or pass --allow-overwrite to overwrite' % f)
             dirname = os.path.dirname(f)
             if dirname and not os.path.exists(dirname):
                 fname = os.path.basename(f)
@@ -6139,12 +6189,7 @@ def command_bootstrap(ctx):
     else:
         logger.info('Skip prepare_host')
 
-    # initial vars
-    fsid = ctx.fsid or make_fsid()
-    if not is_fsid(fsid):
-        raise Error('not an fsid: %s' % fsid)
     logger.info('Cluster fsid: %s' % fsid)
-
     hostname = get_hostname()
     if '.' in hostname and not ctx.allow_fqdn_hostname:
         raise Error('hostname is a fully qualified domain name (%s); either fix (e.g., "sudo hostname %s" or similar) or pass --allow-fqdn-hostname' % (hostname, hostname.split('.')[0]))
@@ -7870,14 +7915,20 @@ def get_ceph_cluster_count(ctx: CephadmContext) -> int:
     return len([c for c in os.listdir(ctx.data_dir) if is_fsid(c)])
 
 
-def command_rm_cluster(ctx):
-    # type: (CephadmContext) -> None
+def command_rm_cluster(ctx: CephadmContext) -> None:
     if not ctx.force:
         raise Error('must pass --force to proceed: '
                     'this command may destroy precious data!')
 
     lock = FileLock(ctx, ctx.fsid)
     lock.acquire()
+    _rm_cluster(ctx, ctx.keep_logs, ctx.zap_osds)
+
+
+def _rm_cluster(ctx: CephadmContext, keep_logs: bool, zap_osds: bool) -> None:
+
+    if not ctx.fsid:
+        raise Error('must select the cluster to delete by passing --fsid to proceed')
 
     def disable_systemd_service(unit_name: str) -> None:
         call(ctx, ['systemctl', 'stop', unit_name],
@@ -7887,6 +7938,8 @@ def command_rm_cluster(ctx):
         call(ctx, ['systemctl', 'disable', unit_name],
              verbosity=CallVerbosity.DEBUG)
 
+    logger.info(f'Deleting cluster with fsid: {ctx.fsid}')
+
     # stop + disable individual daemon units
     for d in list_daemons(ctx, detail=False):
         if d['fsid'] != ctx.fsid:
@@ -7904,7 +7957,7 @@
          verbosity=CallVerbosity.DEBUG)
 
     # osds?
-    if ctx.zap_osds:
+    if zap_osds:
         _zap_osds(ctx)
 
     # rm units
@@ -7917,7 +7970,7 @@
 
     # rm data
     call_throws(ctx, ['rm', '-rf', ctx.data_dir + '/' + ctx.fsid])
-    if not ctx.keep_logs:
+    if not keep_logs:
         # rm logs
         call_throws(ctx, ['rm', '-rf', ctx.log_dir + '/' + ctx.fsid])
         call_throws(ctx, ['rm', '-rf', ctx.log_dir
@@ -7937,7 +7990,7 @@
 
     # rm cephadm logrotate config
     call_throws(ctx, ['rm', '-f', ctx.logrotate_dir + '/cephadm'])
-    if not ctx.keep_logs:
+    if not keep_logs:
         # remove all cephadm logs
         for fname in glob(f'{ctx.log_dir}/cephadm.log*'):
             os.remove(fname)
@@ -10254,6 +10307,23 @@ def _get_parser():
         '--allow-overwrite',
         action='store_true',
         help='allow overwrite of existing --output-* config/keyring/ssh files')
+    # the following logic, providing both '--cleanup-on-failure' and '--no-cleanup-on-failure',
+    # has been included in argparse as of python v3.9; however, since we have to support
+    # older python versions, the following is more generic. Once python v3.9 becomes
+    # the minimum supported version we can implement the same by using the new option
+    # argparse.BooleanOptionalAction
+    group = parser_bootstrap.add_mutually_exclusive_group()
+    group.add_argument(
+        '--cleanup-on-failure',
+        action='store_true',
+        default=True,
+        help='Delete cluster files in case of a failed installation')
+    group.add_argument(
+        '--no-cleanup-on-failure',
+        action='store_const',
+        const=False,
+        dest='cleanup_on_failure',
+        help='Do not delete cluster files in case of a failed installation')
     parser_bootstrap.add_argument(
         '--allow-fqdn-hostname',
         action='store_true',
@@ -10560,7 +10630,7 @@ def main() -> None:
             check_container_engine(ctx)
         # command handler
         r = ctx.func(ctx)
-    except Error as e:
+    except (Error, ClusterAlreadyExists) as e:
         if ctx.verbose:
             raise
         logger.error('ERROR: %s' % e)
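
For readers who want to see the mechanism outside the context of the diff, below is a minimal, self-contained Python sketch of the two techniques this patch relies on: a rollback decorator that purges the partially created cluster on failure unless cleanup was disabled, and the mutually exclusive argparse pair used to emulate argparse.BooleanOptionalAction on Python versions older than 3.9. It is an illustration only, not cephadm code: the context is reduced to an argparse.Namespace, and purge_cluster() and bootstrap() are hypothetical stand-ins for cephadm's _rm_cluster() and command_bootstrap().

# Standalone sketch (not cephadm code): rollback decorator + paired cleanup flags.
import argparse
import functools
import logging
import sys
from typing import Any, Callable, TypeVar, cast

logger = logging.getLogger(__name__)
FuncT = TypeVar('FuncT', bound=Callable[..., Any])


class ClusterAlreadyExists(Exception):
    """Raised when the target cluster already exists; rollback must not delete it."""
    pass


def purge_cluster(ctx: argparse.Namespace) -> None:
    # Stand-in for cephadm's _rm_cluster(): delete everything the failed run created.
    logger.info('purging partially created cluster %s', ctx.fsid)


def rollback(func: FuncT) -> FuncT:
    """Run func; on failure, purge the new cluster unless cleanup was disabled."""
    @functools.wraps(func)
    def _rollback(ctx: argparse.Namespace) -> Any:
        try:
            return func(ctx)
        except ClusterAlreadyExists:
            # A pre-existing cluster was found: never delete someone else's data.
            raise
        except (KeyboardInterrupt, Exception) as e:
            logger.error('%s: %s', type(e).__name__, e)
            if ctx.cleanup_on_failure:
                purge_cluster(ctx)
            else:
                logger.info('leaving files in place; remove them manually later')
            raise
    return cast(FuncT, _rollback)


@rollback
def bootstrap(ctx: argparse.Namespace) -> int:
    # Stand-in for command_bootstrap(): fail on purpose to demonstrate the rollback.
    raise RuntimeError('simulated mid-bootstrap failure')


def main() -> int:
    parser = argparse.ArgumentParser()
    parser.add_argument('--fsid', default='00000000-0000-0000-0000-000000000000')
    # Emulate argparse.BooleanOptionalAction (Python >= 3.9) with two exclusive flags.
    group = parser.add_mutually_exclusive_group()
    group.add_argument('--cleanup-on-failure', action='store_true', default=True)
    group.add_argument('--no-cleanup-on-failure', action='store_const',
                       const=False, dest='cleanup_on_failure')
    ctx = parser.parse_args()
    try:
        return bootstrap(ctx)
    except Exception:
        return 1


if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    sys.exit(main())

Running the sketch with no flags logs the purge before re-raising the simulated error; running it with --no-cleanup-on-failure leaves the (pretend) cluster files alone and only re-raises, which mirrors the behaviour the patch adds around command_bootstrap.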