From: Redouane Kachach Date: Tue, 23 May 2023 12:01:26 +0000 (+0200) Subject: cephadm: adding rollback mechanism to handle bootstrap failures X-Git-Tag: v19.0.0~908^2 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=4067e2fe8a37b0dd621e247d139f250428858d9c;p=ceph.git cephadm: adding rollback mechanism to handle bootstrap failures Fixes: https://tracker.ceph.com/issues/57016 Signed-off-by: Redouane Kachach --- diff --git a/src/cephadm/cephadm.py b/src/cephadm/cephadm.py index 672c1701155e..3137fc16db1c 100755 --- a/src/cephadm/cephadm.py +++ b/src/cephadm/cephadm.py @@ -363,6 +363,10 @@ class Error(Exception): pass +class ClusterAlreadyExists(Exception): + pass + + class TimeoutExpired(Error): pass @@ -3071,10 +3075,14 @@ def get_container_mounts(ctx, fsid, daemon_type, daemon_id, if daemon_type == 'osd': # selinux-policy in the container may not match the host. if HostFacts(ctx).selinux_enabled: - selinux_folder = '/var/lib/ceph/%s/selinux' % fsid - if not os.path.exists(selinux_folder): - os.makedirs(selinux_folder, mode=0o755) - mounts[selinux_folder] = '/sys/fs/selinux:ro' + cluster_dir = f'{ctx.data_dir}/{fsid}' + selinux_folder = f'{cluster_dir}/selinux' + if os.path.exists(cluster_dir): + if not os.path.exists(selinux_folder): + os.makedirs(selinux_folder, mode=0o755) + mounts[selinux_folder] = '/sys/fs/selinux:ro' + else: + logger.error(f'Cluster direcotry {cluster_dir} does not exist.') mounts['/'] = '/rootfs' try: @@ -5398,6 +5406,7 @@ def create_mgr( except Exception as e: logger.debug('status failed: %s' % e) return False + is_available(ctx, 'mgr', is_mgr_available) @@ -5783,6 +5792,43 @@ def save_cluster_config(ctx: CephadmContext, uid: int, gid: int, fsid: str) -> N logger.warning(f'Cannot create cluster configuration directory {conf_dir}') +def rollback(func: FuncT) -> FuncT: + """ + """ + @wraps(func) + def _rollback(ctx: CephadmContext) -> Any: + try: + return func(ctx) + except ClusterAlreadyExists: + # another 
cluster with the provided fsid already exists: don't remove. + raise + except (KeyboardInterrupt, Exception) as e: + logger.error(f'{type(e).__name__}: {e}') + if ctx.cleanup_on_failure: + logger.info('\n\n' + '\t***************\n' + '\tCephadm hit an issue during cluster installation. Current cluster files will be deleted automatically,\n' + '\tto disable this behaviour you can pass the --no-cleanup-on-failure flag. In case of any previous\n' + '\tbroken installation user must use the following command to completely delete the broken cluster:\n\n' + '\t> cephadm rm-cluster --force --zap-osds --fsid \n\n' + '\tfor more information please refer to https://docs.ceph.com/en/latest/cephadm/operations/#purging-a-cluster\n' + '\t***************\n\n') + _rm_cluster(ctx, keep_logs=False, zap_osds=False) + else: + logger.info('\n\n' + '\t***************\n' + '\tCephadm hit an issue during cluster installation. Current cluster files will NOT BE DELETED automatically to change\n' + '\tthis behaviour you can pass the --cleanup-on-failure. 
To remove this broken cluster manually please run:\n\n' + f'\t > cephadm rm-cluster --force --fsid {ctx.fsid}\n\n' + '\tin case of any previous broken installation user must use the rm-cluster command to delete the broken cluster:\n\n' + '\t > cephadm rm-cluster --force --zap-osds --fsid \n\n' + '\tfor more information please refer to https://docs.ceph.com/en/latest/cephadm/operations/#purging-a-cluster\n' + '\t***************\n\n') + raise + return cast(FuncT, _rollback) + + +@rollback @default_image def command_bootstrap(ctx): # type: (CephadmContext) -> int @@ -5802,17 +5848,21 @@ def command_bootstrap(ctx): if ctx.fsid: data_dir_base = os.path.join(ctx.data_dir, ctx.fsid) if os.path.exists(data_dir_base): - raise Error(f"A cluster with the same fsid '{ctx.fsid}' already exists.") + raise ClusterAlreadyExists(f"A cluster with the same fsid '{ctx.fsid}' already exists.") else: logger.warning('Specifying an fsid for your cluster offers no advantages and may increase the likelihood of fsid conflicts.') + # initial vars + ctx.fsid = ctx.fsid or make_fsid() + fsid = ctx.fsid + if not is_fsid(fsid): + raise Error('not an fsid: %s' % fsid) + # verify output files - for f in [ctx.output_config, ctx.output_keyring, - ctx.output_pub_ssh_key]: + for f in [ctx.output_config, ctx.output_keyring, ctx.output_pub_ssh_key]: if not ctx.allow_overwrite: if os.path.exists(f): - raise Error('%s already exists; delete or pass ' - '--allow-overwrite to overwrite' % f) + raise ClusterAlreadyExists('%s already exists; delete or pass --allow-overwrite to overwrite' % f) dirname = os.path.dirname(f) if dirname and not os.path.exists(dirname): fname = os.path.basename(f) @@ -5833,12 +5883,7 @@ def command_bootstrap(ctx): else: logger.info('Skip prepare_host') - # initial vars - fsid = ctx.fsid or make_fsid() - if not is_fsid(fsid): - raise Error('not an fsid: %s' % fsid) logger.info('Cluster fsid: %s' % fsid) - hostname = get_hostname() if '.' 
in hostname and not ctx.allow_fqdn_hostname: raise Error('hostname is a fully qualified domain name (%s); either fix (e.g., "sudo hostname %s" or similar) or pass --allow-fqdn-hostname' % (hostname, hostname.split('.')[0])) @@ -7463,14 +7508,20 @@ def get_ceph_cluster_count(ctx: CephadmContext) -> int: return len([c for c in os.listdir(ctx.data_dir) if is_fsid(c)]) -def command_rm_cluster(ctx): - # type: (CephadmContext) -> None +def command_rm_cluster(ctx: CephadmContext) -> None: if not ctx.force: raise Error('must pass --force to proceed: ' 'this command may destroy precious data!') lock = FileLock(ctx, ctx.fsid) lock.acquire() + _rm_cluster(ctx, ctx.keep_logs, ctx.zap_osds) + + +def _rm_cluster(ctx: CephadmContext, keep_logs: bool, zap_osds: bool) -> None: + + if not ctx.fsid: + raise Error('must select the cluster to delete by passing --fsid to proceed') def disable_systemd_service(unit_name: str) -> None: call(ctx, ['systemctl', 'stop', unit_name], @@ -7480,6 +7531,8 @@ def command_rm_cluster(ctx): call(ctx, ['systemctl', 'disable', unit_name], verbosity=CallVerbosity.DEBUG) + logger.info(f'Deleting cluster with fsid: {ctx.fsid}') + # stop + disable individual daemon units for d in list_daemons(ctx, detail=False): if d['fsid'] != ctx.fsid: @@ -7497,7 +7550,7 @@ def command_rm_cluster(ctx): verbosity=CallVerbosity.DEBUG) # osds? 
- if ctx.zap_osds: + if zap_osds: _zap_osds(ctx) # rm units @@ -7510,7 +7563,7 @@ def command_rm_cluster(ctx): # rm data call_throws(ctx, ['rm', '-rf', ctx.data_dir + '/' + ctx.fsid]) - if not ctx.keep_logs: + if not keep_logs: # rm logs call_throws(ctx, ['rm', '-rf', ctx.log_dir + '/' + ctx.fsid]) call_throws(ctx, ['rm', '-rf', ctx.log_dir @@ -7530,7 +7583,7 @@ def command_rm_cluster(ctx): # rm cephadm logrotate config call_throws(ctx, ['rm', '-f', ctx.logrotate_dir + '/cephadm']) - if not ctx.keep_logs: + if not keep_logs: # remove all cephadm logs for fname in glob(f'{ctx.log_dir}/cephadm.log*'): os.remove(fname) @@ -7543,7 +7596,7 @@ def command_rm_cluster(ctx): p.unlink() # cleanup remaining ceph directories - ceph_dirs = [f'/run/ceph/{ctx.fsid}', f'/tmp/var/lib/ceph/{ctx.fsid}', f'/var/run/ceph/{ctx.fsid}'] + ceph_dirs = [f'/run/ceph/{ctx.fsid}', f'/tmp/cephadm-{ctx.fsid}', f'/var/run/ceph/{ctx.fsid}'] for dd in ceph_dirs: shutil.rmtree(dd, ignore_errors=True) @@ -9746,6 +9799,23 @@ def _get_parser(): '--allow-overwrite', action='store_true', help='allow overwrite of existing --output-* config/keyring/ssh files') + # following logic to have both '--cleanup-on-failure' and '--no-cleanup-on-failure' + # has been included in argparse of python v3.9, however since we have to support + # older python versions the following is more generic. 
Once python v3.9 becomes
+    # the minimum supported version we can implement the same by using the new option
+    # argparse.BooleanOptionalAction
+    group = parser_bootstrap.add_mutually_exclusive_group()
+    group.add_argument(
+        '--cleanup-on-failure',
+        action='store_true',
+        default=True,
+        help='Delete cluster files in case of a failed installation')
+    group.add_argument(
+        '--no-cleanup-on-failure',
+        action='store_const',
+        const=False,
+        dest='cleanup_on_failure',
+        help='Do not delete cluster files in case of a failed installation')
     parser_bootstrap.add_argument(
         '--allow-fqdn-hostname',
         action='store_true',
@@ -10078,7 +10148,7 @@ def main() -> None:
             check_container_engine(ctx)
             # command handler
             r = ctx.func(ctx)
-    except Error as e:
+    except (Error, ClusterAlreadyExists) as e:
         if ctx.verbose:
             raise
         logger.error('ERROR: %s' % e)