From 9accc576d6fd727b46210eff526ddf4b27fe4aa7 Mon Sep 17 00:00:00 2001 From: Adam King Date: Wed, 14 May 2025 13:16:43 -0400 Subject: [PATCH] mgr/cephadm: rotate keyring for core ceph daemons during upgrade Specifically, this causes us to rotate the mgr, mon, OSD, and mds keyrings. The mgr and mon keyrings are rotated as soon as we see that all the mons have been upgraded, and the OSD/mds keyrings are rotated when we reach those daemons in the upgrade order. Signed-off-by: Adam King --- src/cephadm/cephadm.py | 26 ++++ src/cephadm/cephadmlib/daemons/ceph.py | 184 ++++++++++++++++++++++++- src/pybind/mgr/cephadm/module.py | 12 ++ src/pybind/mgr/cephadm/serve.py | 9 ++ src/pybind/mgr/cephadm/services/osd.py | 4 - src/pybind/mgr/cephadm/upgrade.py | 66 ++++++++- 6 files changed, 294 insertions(+), 7 deletions(-) diff --git a/src/cephadm/cephadm.py b/src/cephadm/cephadm.py index 4e255e55165..dba0c85a290 100755 --- a/src/cephadm/cephadm.py +++ b/src/cephadm/cephadm.py @@ -180,6 +180,7 @@ from cephadmlib.daemons import ( Keepalived, Monitoring, NFSGanesha, + OSD, SMB, SNMPGateway, MgmtGateway, @@ -621,8 +622,28 @@ def create_daemon_dirs( if keyring: keyring_path = os.path.join(data_dir, 'keyring') + config_json = fetch_configs(ctx) + key_path_exists = False + key_path_content = 'N/A' + try: + key_path_exists = os.path.exists(keyring_path) + key_path_content = open(keyring_path, 'r').read() + except Exception: + pass + update_bluestore_label_osd_keyring = False + if ( + ident.daemon_type == 'osd' + and key_path_exists + and key_path_content != keyring + ): + # need to update keyring with ceph-bluestore-tool + update_bluestore_label_osd_keyring = True with write_new(keyring_path, owner=(uid, gid)) as f: f.write(keyring) + if update_bluestore_label_osd_keyring: + osd_daemon_form = OSD.create(ctx, ident) + # osd_daemon_form = OSD.init(ctx, ctx.fsid, ident.daemon_id) + osd_daemon_form.rotate_osd_lv_keyring(ctx, keyring_path) if daemon_type in Monitoring.components.keys(): config_json = fetch_configs(ctx) @@ 
-4555,6 +4576,11 @@ def _add_deploy_parser_args( default=None, help='Time in seconds to wait for graceful service shutdown before forcefully killing it' ) + parser_deploy.add_argument( + '--osd-dm-crypt-key', + default=None, + help="dm-crypt key for OSD, needed for deployment if OSD's cephx keyring has been rotated" + ) def _name_opts(parser: argparse.ArgumentParser) -> None: diff --git a/src/cephadm/cephadmlib/daemons/ceph.py b/src/cephadm/cephadmlib/daemons/ceph.py index ac16bbebcfc..4a27e5bbc0c 100644 --- a/src/cephadm/cephadmlib/daemons/ceph.py +++ b/src/cephadm/cephadmlib/daemons/ceph.py @@ -1,5 +1,6 @@ import logging import os +import json from typing import Any, Dict, List, Optional, Tuple, Union @@ -12,6 +13,7 @@ from ..context_getters import ( ) from ..daemon_form import register as register_daemon_form from ..daemon_identity import DaemonIdentity +from ..call_wrappers import call, CallVerbosity from ..constants import DEFAULT_IMAGE from ..context import CephadmContext from ..deployment_utils import to_deployment_container @@ -22,7 +24,9 @@ from ..file_utils import ( pathify, populate_files, makedirs, + read_file, recursive_chown, + write_new, ) from ..data_utils import dict_get from ..host_facts import HostFacts @@ -304,9 +308,11 @@ class OSD(Ceph): ctx: CephadmContext, ident: DaemonIdentity, osd_fsid: Optional[str] = None, + osd_dm_crypt_key: Optional[str] = None, ) -> None: super().__init__(ctx, ident) self._osd_fsid = osd_fsid + self._osd_dm_crypt_key: str = osd_dm_crypt_key or '' @classmethod def create(cls, ctx: CephadmContext, ident: DaemonIdentity) -> 'OSD': @@ -315,7 +321,8 @@ class OSD(Ceph): logger.info( 'Creating an OSD daemon form without an OSD FSID value' ) - return cls(ctx, ident, osd_fsid) + osd_dm_crypt_key = getattr(ctx, 'osd_dm_crypt_key', None) + return cls(ctx, ident, osd_fsid, osd_dm_crypt_key) @staticmethod def get_sysctl_settings() -> List[str]: @@ -332,6 +339,181 @@ class OSD(Ceph): def osd_fsid(self) -> Optional[str]: return 
self._osd_fsid + def rotate_osd_lv_keyring( + self, ctx: CephadmContext, keyring_path: str + ) -> None: + keyring_content = read_file([keyring_path]) + if not keyring_content or keyring_content == 'Unknown': + raise Error( + f'Failed to find OSD keyring content at expected path: {keyring_path}' + ) + actual_keyring = keyring_content + # if our keyring is the full thing with sections and caps, we don't want that + # just the actual keyring itself + try: + actual_keyring = ( + keyring_content.split('key =', 1)[1] + .split('caps', 1)[0] + .strip() + ) + except IndexError: + logger.error( + f'Failed to parse keyring from {keyring_content} for rotation of key for osd.{self.identity.daemon_id}' + ) + c_v_container = CephContainer( + ctx, + image=ctx.image, + privileged=True, + entrypoint='ceph-volume', + args=[ + 'lvm', + 'list', + str(self.identity.daemon_id), + '--format', + 'json', + ], + volume_mounts=get_ceph_mounts_for_type( + ctx, ctx.fsid, 'ceph-volume' + ), + ) + out, err, ret = call( + ctx, + c_v_container.run_cmd(), + verbosity=CallVerbosity.QUIET_UNLESS_ERROR, + ) + if ret: + raise Error( + f'Got error using ceph-volume lvm list to get lv path for osd.{self.identity.daemon_id}\n' + f'Out:{out}\n' + f'Err:{err}' + ) + osd_bluestore_data = json.loads(out) + lv_path = '' + encrypted = False + osd_lv_data = osd_bluestore_data.get(self.identity.daemon_id, []) + if osd_lv_data: + for dev in osd_lv_data: + if dev.get('type') == 'block': + lv_path = dev.get('lv_path', '') + encrypted = ( + dev.get('tags', {}).get('ceph.encrypted', '0') == '1' + ) + if not lv_path: + raise Error( + f'Failed to find lv path for osd with id "{self.identity.daemon_id}". Key not rotated using ceph-bluestore-tool' + ) + if encrypted and not self._osd_dm_crypt_key: + raise Error( + f'Cannot rotate keyring for encrypted osd with id "{self.identity.daemon_id}" without dm-crypt key.' 
+ 'Key not rotated using ceph-bluestore-tool' + ) + dev_path = lv_path + + # OSD must be stopped to make changes to bluestore labels + logger.info( + f'Stopping osd.{self.identity.daemon_id} to update osd_key bluestore label' + ) + call( + ctx, + ['systemctl', 'stop', self.identity.unit_name], + verbosity=CallVerbosity.QUIET_UNLESS_ERROR, + ) + + if encrypted: + dev_path = f'/dev/mapper/tmp_open_osd_{self._osd_fsid}' + cryptsetup_action = """#!/bin/bash +DM_CRYPT_KEY=%s +LV_PATH=%s +DEV_NAME=%s + +echo "$DM_CRYPT_KEY" | cryptsetup luksOpen $LV_PATH $DEV_NAME +""" % ( + self._osd_dm_crypt_key, + lv_path, + dev_path.split('/')[-1], + ) + helper_script_path = ( + f'/tmp/cephadm-osd-{self.identity.daemon_id}-rotate-helper.sh' + ) + with write_new(helper_script_path, perms=0o700) as f: + f.write(cryptsetup_action) + cryptsetup_open_container = CephContainer( + ctx, + image=ctx.image, + privileged=True, + entrypoint='/tmp/cryptsetup_action.sh', + volume_mounts={ + '/dev': '/dev', + helper_script_path: '/tmp/cryptsetup_action.sh', + }, + ) + out, err, ret = call( + ctx, + cryptsetup_open_container.run_cmd(), + verbosity=CallVerbosity.QUIET_UNLESS_ERROR, + ) + os.remove(helper_script_path) + if ret: + raise Error( + 'Got error rotating osd keyring while using cryptsetup tool\n' + f'Out:{out}\n' + f'Err:{err}' + ) + + bluestore_tool_container = CephContainer( + ctx, + image=ctx.image, + privileged=True, + entrypoint='ceph-bluestore-tool', + args=[ + '--dev', + dev_path, + 'set-label-key', + '-k', + 'osd_key', + '-v', + actual_keyring, + ], + volume_mounts={'/dev': '/dev'}, + ) + out, err, ret = call( + ctx, + bluestore_tool_container.run_cmd(), + verbosity=CallVerbosity.QUIET_UNLESS_ERROR, + ) + if ret: + raise Error( + 'Got error rotating osd keyring using ceph-bluestore-tool\n' + f'Out:{out}\n' + f'Err:{err}' + ) + + if encrypted: + cryptsetup_close_container = CephContainer( + ctx, + image=ctx.image, + privileged=True, + entrypoint='cryptsetup', + args=[ + 'luksClose', + 
dev_path, + ], + volume_mounts={ + '/dev': '/dev', + }, + ) + out, err, ret = call( + ctx, + cryptsetup_close_container.run_cmd(), + verbosity=CallVerbosity.QUIET_UNLESS_ERROR, + ) + if ret: + raise Error( + 'Got error rotating osd keyring while using cryptsetup tool\n' + f'Out:{out}\n' + f'Err:{err}' + ) + @register_daemon_form class CephExporter(ContainerDaemonForm): diff --git a/src/pybind/mgr/cephadm/module.py b/src/pybind/mgr/cephadm/module.py index aec32f1fb7b..b9abd99a4fd 100644 --- a/src/pybind/mgr/cephadm/module.py +++ b/src/pybind/mgr/cephadm/module.py @@ -2619,6 +2619,18 @@ Then run the following: return self.perform_service_action(action, service_name) + def key_rotate(self, daemon_spec: CephadmDaemonDeploySpec) -> None: + rc, out, err = self.mon_command({ + 'prefix': 'auth rotate', + 'entity': daemon_spec.entity_name(), + 'format': 'json', + }) + if rc: + raise OrchestratorError( + f'Failed to rotate daemon key for {daemon_spec.entity_name()}.\n' + f'Rc: {rc}\nOut: {out}\nErr: {err}' + ) + def _rotate_daemon_key(self, daemon_spec: CephadmDaemonDeploySpec) -> str: self.log.info(f'Rotating authentication key for {daemon_spec.name()}') rc, out, err = self.mon_command({ diff --git a/src/pybind/mgr/cephadm/serve.py b/src/pybind/mgr/cephadm/serve.py index 3f1c63bcda1..815d5ad13b3 100644 --- a/src/pybind/mgr/cephadm/serve.py +++ b/src/pybind/mgr/cephadm/serve.py @@ -1429,6 +1429,15 @@ class CephadmServe: if not osd_uuid: raise OrchestratorError('osd.%s not in osdmap' % daemon_spec.daemon_id) daemon_params['osd_fsid'] = osd_uuid + # we may need a dm-crypt key to rotate this OSD's keyring + # if it is encrypted. 
If it is not encrypted, no such + # key will exist + rc, ckg_out, ckg_err = self.mgr.mon_command({ + 'prefix': 'config-key get', + 'key': f'dm-crypt/osd/{osd_uuid}/luks', + }) + if not rc and ckg_out: + daemon_params['osd_dm_crypt_key'] = ckg_out if reconfig: daemon_params['reconfig'] = True diff --git a/src/pybind/mgr/cephadm/services/osd.py b/src/pybind/mgr/cephadm/services/osd.py index 60a399149f9..31039b9b25b 100644 --- a/src/pybind/mgr/cephadm/services/osd.py +++ b/src/pybind/mgr/cephadm/services/osd.py @@ -40,10 +40,6 @@ class OSDService(CephService): async def create_from_spec_one(host: str, drive_selection: DriveSelection) -> Optional[str]: # skip this host if there has been no change in inventory - if not self.mgr.cache.osdspec_needs_apply(host, drive_group): - self.mgr.log.debug("skipping apply of %s on %s (no change)" % ( - host, drive_group)) - return None # skip this host if we cannot schedule here if self.mgr.inventory.has_label(host, SpecialHostLabels.DRAIN_DAEMONS): return None diff --git a/src/pybind/mgr/cephadm/upgrade.py b/src/pybind/mgr/cephadm/upgrade.py index 26396f7f93d..c0e7c63cd78 100644 --- a/src/pybind/mgr/cephadm/upgrade.py +++ b/src/pybind/mgr/cephadm/upgrade.py @@ -71,6 +71,7 @@ class UpgradeState: services: Optional[List[str]] = None, total_count: Optional[int] = None, remaining_count: Optional[int] = None, + rotated_mgr_mon_auth_key_daemons: Optional[List[str]] = None, ): self._target_name: str = target_name # Use CephadmUpgrade.target_image instead. 
self.progress_id: str = progress_id @@ -88,6 +89,7 @@ class UpgradeState: self.services = services self.total_count = total_count self.remaining_count = remaining_count + self.rotated_mgr_mon_auth_key_daemons = rotated_mgr_mon_auth_key_daemons def to_json(self) -> dict: return { @@ -106,6 +108,7 @@ class UpgradeState: 'services': self.services, 'total_count': self.total_count, 'remaining_count': self.remaining_count, + 'rotated_mgr_mon_auth_key_daemons': self.rotated_mgr_mon_auth_key_daemons, } @classmethod @@ -127,7 +130,8 @@ class CephadmUpgrade: 'UPGRADE_REDEPLOY_DAEMON', 'UPGRADE_BAD_TARGET_VERSION', 'UPGRADE_EXCEPTION', - 'UPGRADE_OFFLINE_HOST' + 'UPGRADE_OFFLINE_HOST', + 'UPGRADE_KEY_ROTATION' ] def __init__(self, mgr: "CephadmOrchestrator"): @@ -867,11 +871,52 @@ class CephadmUpgrade: break return True, to_upgrade + def _rotate_mgr_mon_auth_keys(self, target_image: str, target_digests: Optional[List[str]] = None) -> None: + if self.upgrade_state: + if self.upgrade_state.rotated_mgr_mon_auth_key_daemons is None: + self.upgrade_state.rotated_mgr_mon_auth_key_daemons = [] + # do mgr and mon keyrings as one off after mons have been upgraded + mon_daemons = self.mgr.cache.get_daemons_by_service('mon') + _, mons_needing_upgrade, __, ___ = self._detect_need_upgrade(mon_daemons, target_digests, target_image) + if not mons_needing_upgrade: + # all mons have been upgraded if we get here + for dd in self.mgr.cache.get_daemons_by_service('mgr'): + if dd.name() in self.upgrade_state.rotated_mgr_mon_auth_key_daemons: + continue + daemon_spec = CephadmDaemonDeploySpec.from_daemon_description(dd) + self.mgr.key_rotate(daemon_spec) + if self.mgr.daemon_is_self(daemon_spec.daemon_type, daemon_spec.daemon_id): + self.mgr._schedule_daemon_action(daemon_spec.name(), 'redeploy') + else: + self.mgr._daemon_action(daemon_spec, action='redeploy') + self.upgrade_state.rotated_mgr_mon_auth_key_daemons.append(daemon_spec.name()) + self._save_upgrade_state() + # mon daemons share a key, 
only do one key rotation + # but still trigger redeploy for each mon + if 'mon' not in self.upgrade_state.rotated_mgr_mon_auth_key_daemons: + self.mgr.key_rotate( + CephadmDaemonDeploySpec.from_daemon_description( + mon_daemons[0] + ) + ) + self.upgrade_state.rotated_mgr_mon_auth_key_daemons.append('mon') + self._save_upgrade_state() + for dd in mon_daemons: + if dd.name() in self.upgrade_state.rotated_mgr_mon_auth_key_daemons: + continue + daemon_spec = CephadmDaemonDeploySpec.from_daemon_description(dd) + self.mgr._daemon_action(daemon_spec, action='redeploy') + self.upgrade_state.rotated_mgr_mon_auth_key_daemons.append(daemon_spec.name()) + self._save_upgrade_state() + else: + self.mgr.log.debug('Skipping mgr/mon key rotation, mons not upgraded') + def _upgrade_daemons(self, to_upgrade: List[Tuple[DaemonDescription, bool]], target_image: str, target_digests: Optional[List[str]] = None) -> None: assert self.upgrade_state is not None num = 1 if target_digests is None: target_digests = [] + self._rotate_mgr_mon_auth_keys(target_image, target_digests) for d_entry in to_upgrade: if self.upgrade_state.remaining_count is not None and self.upgrade_state.remaining_count <= 0 and not d_entry[1]: self.mgr.log.info( @@ -924,9 +969,26 @@ class CephadmUpgrade: else: logger.info('Upgrade: Updating %s.%s' % (d.daemon_type, d.daemon_id)) + + daemon_spec = CephadmDaemonDeploySpec.from_daemon_description(d) + + try: + if daemon_spec.daemon_type in ['mds', 'osd']: + daemon_spec.keyring = None + self.mgr.key_rotate(daemon_spec) + except Exception as e: + self._fail_upgrade('UPGRADE_KEY_ROTATION', { + 'severity': 'warning', + 'summary': f'Rotation of cephx key for daemon {d.name()} on host {d.hostname} failed.', + 'count': 1, + 'detail': [ + f'Upgrade daemon key rotation: {d.name()}: {e}' + ], + }) + return + action = 'Upgrading' if not d_entry[1] else 'Redeploying' try: - daemon_spec = CephadmDaemonDeploySpec.from_daemon_description(d) self.mgr._daemon_action( daemon_spec, 
'redeploy', -- 2.47.3