From 55dcd9767734d6db71ca90e320d4dce6d2489020 Mon Sep 17 00:00:00 2001 From: Adam King Date: Wed, 14 May 2025 13:16:43 -0400 Subject: [PATCH] mgr/cephadm: rotate keyring for core ceph daemons during upgrade Specifically, this causes us to rotate the mgr, mon, OSD, and mds keyrings. The mgr and mon keyrings are rotated as soon as we see that all the mons have been upgraded, and the OSD/mds keyrings are rotated when we reach those daemons in the upgrade order. NOTE: This patch alone is not enough to get this working for encrypted OSDs Signed-off-by: Adam King --- src/cephadm/cephadm.py | 21 ++++++ src/cephadm/cephadmlib/daemons/ceph.py | 99 ++++++++++++++++++++++++++ src/pybind/mgr/cephadm/module.py | 12 ++++ src/pybind/mgr/cephadm/services/osd.py | 4 -- src/pybind/mgr/cephadm/upgrade.py | 66 ++++++++++++++++- 5 files changed, 196 insertions(+), 6 deletions(-) diff --git a/src/cephadm/cephadm.py b/src/cephadm/cephadm.py index 26f13ecda34..3ff55244ceb 100755 --- a/src/cephadm/cephadm.py +++ b/src/cephadm/cephadm.py @@ -180,6 +180,7 @@ from cephadmlib.daemons import ( Keepalived, Monitoring, NFSGanesha, + OSD, SMB, SNMPGateway, MgmtGateway, @@ -621,8 +622,28 @@ def create_daemon_dirs( if keyring: keyring_path = os.path.join(data_dir, 'keyring') + config_json = fetch_configs(ctx) + key_path_exists = False + key_path_content = 'N/A' + try: + key_path_exists = os.path.exists(keyring_path) + key_path_content = open(keyring_path, 'r').read() + except Exception: + pass + update_bluestore_label_osd_keyring = False + if ( + ident.daemon_type == 'osd' + and os.path.exists(keyring_path) + and open(keyring_path, 'r').read() != keyring + ): + # need to update keyring with ceph-bluestore-tool + update_bluestore_label_osd_keyring = True with write_new(keyring_path, owner=(uid, gid)) as f: f.write(keyring) + if update_bluestore_label_osd_keyring: + osd_daemon_form = OSD.create(ctx, ident) + # osd_daemon_form = OSD.init(ctx, ctx.fsid, ident.daemon_id) + osd_daemon_form.rotate_osd_lv_keyring(ctx, keyring_path) if 
daemon_type in Monitoring.components.keys(): config_json = fetch_configs(ctx) diff --git a/src/cephadm/cephadmlib/daemons/ceph.py b/src/cephadm/cephadmlib/daemons/ceph.py index ac16bbebcfc..0730cf82028 100644 --- a/src/cephadm/cephadmlib/daemons/ceph.py +++ b/src/cephadm/cephadmlib/daemons/ceph.py @@ -1,5 +1,6 @@ import logging import os +import json from typing import Any, Dict, List, Optional, Tuple, Union @@ -12,6 +13,7 @@ from ..context_getters import ( ) from ..daemon_form import register as register_daemon_form from ..daemon_identity import DaemonIdentity +from ..call_wrappers import call, CallVerbosity from ..constants import DEFAULT_IMAGE from ..context import CephadmContext from ..deployment_utils import to_deployment_container @@ -22,6 +24,7 @@ from ..file_utils import ( pathify, populate_files, makedirs, + read_file, recursive_chown, ) from ..data_utils import dict_get @@ -332,6 +335,102 @@ class OSD(Ceph): def osd_fsid(self) -> Optional[str]: return self._osd_fsid + def rotate_osd_lv_keyring( + self, ctx: CephadmContext, keyring_path: str + ) -> None: + keyring_content = read_file([keyring_path]) + if not keyring_content or keyring_content == 'Unknown': + raise Error( + f'Failed to find OSD keyring content at expected path: {keyring_path}' + ) + actual_keyring = keyring_content + # if our keyring is the full thing with sections and caps, we don't want that + # just the actual keyring itself + try: + actual_keyring = ( + keyring_content.split('key =', 1)[1] + .split('caps', 1)[0] + .strip() + ) + except IndexError: + logger.error( + f'Failed to parse keyring from {keyring_content} for rotation of key for osd.{self.identity.daemon_id}' + ) + c_v_container = CephContainer( + ctx, + image=ctx.image, + privileged=True, + entrypoint='ceph-volume', + args=[ + 'lvm', + 'list', + str(self.identity.daemon_id), + '--format', + 'json', + ], + volume_mounts=get_ceph_mounts_for_type( + ctx, ctx.fsid, 'ceph-volume' + ), + ) + out, err, ret = call( + ctx, + 
c_v_container.run_cmd(), + verbosity=CallVerbosity.QUIET_UNLESS_ERROR, + ) + if ret: + raise Error( + f'Got error using ceph-volume lvm list to get lv path for osd.{self.identity.daemon_id}\n' + f'Out:{out}\n' + f'Err:{err}' + ) + osd_bluestore_data = json.loads(out) + lv_path = '' + osd_lv_data = osd_bluestore_data.get(self.identity.daemon_id, []) + if osd_lv_data: + for dev in osd_lv_data: + if dev.get('type') == 'block': + lv_path = dev.get('lv_path', '') + if not lv_path: + raise Error( + f'Failed to find lv path for osd with id "{self.identity.daemon_id}". Key not rotated using ceph-bluestore-tool' + ) + bluestore_tool_container = CephContainer( + ctx, + image=ctx.image, + privileged=True, + entrypoint='ceph-bluestore-tool', + args=[ + '--dev', + lv_path, + 'set-label-key', + '-k', + 'osd_key', + '-v', + actual_keyring, + ], + volume_mounts={'/dev': '/dev'}, + ) + # OSD must be stopped to make changes to bluestore labels + logger.info( + f'Stopping osd.{self.identity.daemon_id} to update osd_key bluestore label' + ) + call( + ctx, + ['systemctl', 'stop', self.identity.unit_name], + verbosity=CallVerbosity.QUIET_UNLESS_ERROR, + ) + out, err, ret = call( + ctx, + bluestore_tool_container.run_cmd(), + verbosity=CallVerbosity.QUIET_UNLESS_ERROR, + ) + if ret: + raise Error( + 'Got error rotating osd keyring using ceph-bluestore-tool\n' + f'Out:{out}\n' + f'Err:{err}' + ) + @register_daemon_form class CephExporter(ContainerDaemonForm): diff --git a/src/pybind/mgr/cephadm/module.py b/src/pybind/mgr/cephadm/module.py index be17f4a4223..04656c9449d 100644 --- a/src/pybind/mgr/cephadm/module.py +++ b/src/pybind/mgr/cephadm/module.py @@ -2616,6 +2616,18 @@ Then run the following: return self.perform_service_action(action, service_name) + def key_rotate(self, daemon_spec: CephadmDaemonDeploySpec) -> None: + rc, out, err = self.mon_command({ + 'prefix': 'auth rotate', + 'entity': daemon_spec.entity_name(), + 'format': 'json', + }) + if rc: + raise OrchestratorError( + 
f'Failed to rotate daemon key for {daemon_spec.entity_name()}.\n' + f'Rc: {rc}\nOut: {out}\nErr: {err}' + ) + def _rotate_daemon_key(self, daemon_spec: CephadmDaemonDeploySpec) -> str: self.log.info(f'Rotating authentication key for {daemon_spec.name()}') rc, out, err = self.mon_command({ diff --git a/src/pybind/mgr/cephadm/services/osd.py b/src/pybind/mgr/cephadm/services/osd.py index 60a399149f9..31039b9b25b 100644 --- a/src/pybind/mgr/cephadm/services/osd.py +++ b/src/pybind/mgr/cephadm/services/osd.py @@ -40,10 +40,6 @@ class OSDService(CephService): async def create_from_spec_one(host: str, drive_selection: DriveSelection) -> Optional[str]: # skip this host if there has been no change in inventory - if not self.mgr.cache.osdspec_needs_apply(host, drive_group): - self.mgr.log.debug("skipping apply of %s on %s (no change)" % ( - host, drive_group)) - return None # skip this host if we cannot schedule here if self.mgr.inventory.has_label(host, SpecialHostLabels.DRAIN_DAEMONS): return None diff --git a/src/pybind/mgr/cephadm/upgrade.py b/src/pybind/mgr/cephadm/upgrade.py index 26396f7f93d..c0e7c63cd78 100644 --- a/src/pybind/mgr/cephadm/upgrade.py +++ b/src/pybind/mgr/cephadm/upgrade.py @@ -71,6 +71,7 @@ class UpgradeState: services: Optional[List[str]] = None, total_count: Optional[int] = None, remaining_count: Optional[int] = None, + rotated_mgr_mon_auth_key_daemons: Optional[List[str]] = None, ): self._target_name: str = target_name # Use CephadmUpgrade.target_image instead. 
self.progress_id: str = progress_id @@ -88,6 +89,7 @@ class UpgradeState: self.services = services self.total_count = total_count self.remaining_count = remaining_count + self.rotated_mgr_mon_auth_key_daemons = rotated_mgr_mon_auth_key_daemons def to_json(self) -> dict: return { @@ -106,6 +108,7 @@ class UpgradeState: 'services': self.services, 'total_count': self.total_count, 'remaining_count': self.remaining_count, + 'rotated_mgr_mon_auth_key_daemons': self.rotated_mgr_mon_auth_key_daemons, } @classmethod @@ -127,7 +130,8 @@ class CephadmUpgrade: 'UPGRADE_REDEPLOY_DAEMON', 'UPGRADE_BAD_TARGET_VERSION', 'UPGRADE_EXCEPTION', - 'UPGRADE_OFFLINE_HOST' + 'UPGRADE_OFFLINE_HOST', + 'UPGRADE_KEY_ROTATION' ] def __init__(self, mgr: "CephadmOrchestrator"): @@ -867,11 +871,52 @@ class CephadmUpgrade: break return True, to_upgrade + def _rotate_mgr_mon_auth_keys(self, target_image: str, target_digests: Optional[List[str]] = None) -> None: + if self.upgrade_state: + if self.upgrade_state.rotated_mgr_mon_auth_key_daemons is None: + self.upgrade_state.rotated_mgr_mon_auth_key_daemons = [] + # do mgr and mon keyrings as one off after mons have been upgraded + mon_daemons = self.mgr.cache.get_daemons_by_service('mon') + _, mons_needing_upgrade, __, ___ = self._detect_need_upgrade(mon_daemons, target_digests, target_image) + if not mons_needing_upgrade: + # all mons have been upgraded if we get here + for dd in self.mgr.cache.get_daemons_by_service('mgr'): + if dd.name() in self.upgrade_state.rotated_mgr_mon_auth_key_daemons: + continue + daemon_spec = CephadmDaemonDeploySpec.from_daemon_description(dd) + self.mgr.key_rotate(daemon_spec) + if self.mgr.daemon_is_self(daemon_spec.daemon_type, daemon_spec.daemon_id): + self.mgr._schedule_daemon_action(daemon_spec.name(), 'redeploy') + else: + self.mgr._daemon_action(daemon_spec, action='redeploy') + self.upgrade_state.rotated_mgr_mon_auth_key_daemons.append(daemon_spec.name()) + self._save_upgrade_state() + # mon daemons share a key, 
only do one key rotation + # but still trigger redeploy for each mon + if 'mon' not in self.upgrade_state.rotated_mgr_mon_auth_key_daemons: + self.mgr.key_rotate( + CephadmDaemonDeploySpec.from_daemon_description( + mon_daemons[0] + ) + ) + self.upgrade_state.rotated_mgr_mon_auth_key_daemons.append('mon') + self._save_upgrade_state() + for dd in mon_daemons: + if dd.name() in self.upgrade_state.rotated_mgr_mon_auth_key_daemons: + continue + daemon_spec = CephadmDaemonDeploySpec.from_daemon_description(dd) + self.mgr._daemon_action(daemon_spec, action='redeploy') + self.upgrade_state.rotated_mgr_mon_auth_key_daemons.append(daemon_spec.name()) + self._save_upgrade_state() + else: + self.mgr.log.debug('Skipping mgr/mon key rotation, mons not upgraded') + def _upgrade_daemons(self, to_upgrade: List[Tuple[DaemonDescription, bool]], target_image: str, target_digests: Optional[List[str]] = None) -> None: assert self.upgrade_state is not None num = 1 if target_digests is None: target_digests = [] + self._rotate_mgr_mon_auth_keys(target_image, target_digests) for d_entry in to_upgrade: if self.upgrade_state.remaining_count is not None and self.upgrade_state.remaining_count <= 0 and not d_entry[1]: self.mgr.log.info( @@ -924,9 +969,26 @@ class CephadmUpgrade: else: logger.info('Upgrade: Updating %s.%s' % (d.daemon_type, d.daemon_id)) + + daemon_spec = CephadmDaemonDeploySpec.from_daemon_description(d) + + try: + if daemon_spec.daemon_type in ['mds', 'osd']: + daemon_spec.keyring = None + self.mgr.key_rotate(daemon_spec) + except Exception as e: + self._fail_upgrade('UPGRADE_KEY_ROTATION', { + 'severity': 'warning', + 'summary': f'Rotation of cephx key for daemon {d.name()} on host {d.hostname} failed.', + 'count': 1, + 'detail': [ + f'Upgrade daemon key rotation: {d.name()}: {e}' + ], + }) + return + action = 'Upgrading' if not d_entry[1] else 'Redeploying' try: - daemon_spec = CephadmDaemonDeploySpec.from_daemon_description(d) self.mgr._daemon_action( daemon_spec, 
'redeploy', -- 2.39.5