From: Ujjawal Anand <ujjawal.anand@ibm.com>
Date: Sun, 15 Feb 2026 19:41:41 +0000 (+0530)
Subject: cephadm: Handling OSD flags during upgrade
X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=63416f28dfe7b774caf76dbcc819853e0e3d7d65;p=ceph-ci.git

cephadm: Handling OSD flags during upgrade

Fixes: https://tracker.ceph.com/issues/74956

Signed-off-by: Ujjawal Anand <ujjawal.anand@ibm.com>
---

diff --git a/doc/cephadm/upgrade.rst b/doc/cephadm/upgrade.rst
index 9c3eba8fbea..8736e8289e0 100644
--- a/doc/cephadm/upgrade.rst
+++ b/doc/cephadm/upgrade.rst
@@ -243,6 +243,54 @@ you need. For example, the following command upgrades to a development build:
 
 For more information about available container images, see :ref:`containers`.
 
+Setting OSD flags during upgrade
+================================
+
+Cephadm can set a configurable list of OSD flags when an upgrade starts and unset them again
+when the upgrade completes or is stopped. By default, these are ``noout``, ``noscrub``, and ``nodeep-scrub``.
+
+To see which OSD flags cephadm is currently configured to set, run:
+
+.. prompt:: bash #
+
+  ceph config get mgr mgr/cephadm/upgrade_osd_flags
+
+The config option is a comma-separated list of the flags to be set. It can be modified
+by running:
+
+.. prompt:: bash #
+
+  ceph config set mgr mgr/cephadm/upgrade_osd_flags <flag1>,<flag2>,...,<flagN>
+
+Note that setting the config option replaces the list of flags that cephadm will set; it
+does not add to it. For example, if cephadm is currently configured to set ``flag1`` and
+``flag2`` and you set the option to ``flag3,flag4``, cephadm will set ONLY ``flag3`` and
+``flag4``, NOT ``flag1``, ``flag2``, ``flag3``, and ``flag4``.
+
+In versions that support this feature, cephadm sets these flags by default on upgrade.
+To have cephadm skip setting these flags for a specific upgrade without changing the
+persistent configuration, pass ``--no-osd-flags`` to the upgrade
+command:
+
+.. prompt:: bash #
+
+  ceph orch upgrade start --image <image> --no-osd-flags
+
+.. note::
+
+   To check whether the current version of cephadm supports setting OSD flags, run
+   ``ceph orch upgrade start --help`` and see whether ``--no-osd-flags`` is listed
+   as a command argument. If so, it is supported and cephadm will set these flags by
+   default during the upgrade.
+
+.. note::
+
+   When OSD flag management is enabled (the default), cephadm sets/unsets the flags as
+   part of the ``ceph orch upgrade start`` command. This can make the command take a
+   little longer to return than it used to.
+
+Cephadm will only unset OSD flags that it set itself for the upgrade, and will
+not remove flags that were already set before the upgrade started.
 
 .. _cephadm_staggered_upgrade:
 
diff --git a/src/pybind/mgr/cephadm/module.py b/src/pybind/mgr/cephadm/module.py
index c815bbb1287..f26765957fd 100644
--- a/src/pybind/mgr/cephadm/module.py
+++ b/src/pybind/mgr/cephadm/module.py
@@ -411,6 +411,12 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule):
             default=16,
             desc='Maximum number of OSD daemons upgraded in parallel.'
         ),
+        Option(
+            'upgrade_osd_flags',
+            type='str',
+            default='noout,noscrub,nodeep-scrub',
+            desc='Comma separated list of OSD flags to set for the duration of an upgrade'
+        ),
         Option(
             'service_discovery_port',
             type='int',
@@ -597,6 +603,7 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule):
             self.apply_spec_fails: List[Tuple[str, str]] = []
             self.max_osd_draining_count = 10
             self.max_parallel_osd_upgrades = 16
+            self.upgrade_osd_flags = 'noout,noscrub,nodeep-scrub'
             self.device_enhanced_scan = False
             self.inventory_list_all = False
             self.cgroups_split = True
@@ -4080,7 +4087,7 @@ Then run the following:
 
     @handle_orch_error
     def upgrade_start(self, image: str, version: str, daemon_types: Optional[List[str]] = None, host_placement: Optional[str] = None,
-                      services: Optional[List[str]] = None, limit: Optional[int] = None) -> str:
+                      services: Optional[List[str]] = None, limit: Optional[int] = None, no_osd_flags: bool = False) -> str:
         if self.inventory.get_host_with_state("maintenance"):
             raise OrchestratorError("Upgrade aborted - you have host(s) in maintenance state")
         if self.offline_hosts:
@@ -4112,7 +4119,7 @@ Then run the following:
                 raise OrchestratorError(
                     f'Upgrade aborted - --limit arg must be a positive integer, not {limit}')
 
-        return self.upgrade.upgrade_start(image, version, daemon_types, hosts, services, limit)
+        return self.upgrade.upgrade_start(image, version, daemon_types, hosts, services, limit, no_osd_flags)
 
     @handle_orch_error
     def upgrade_pause(self) -> str:
diff --git a/src/pybind/mgr/cephadm/upgrade.py b/src/pybind/mgr/cephadm/upgrade.py
index 26396f7f93d..b06773285d2 100644
--- a/src/pybind/mgr/cephadm/upgrade.py
+++ b/src/pybind/mgr/cephadm/upgrade.py
@@ -71,6 +71,7 @@ class UpgradeState:
                  services: Optional[List[str]] = None,
                  total_count: Optional[int] = None,
                  remaining_count: Optional[int] = None,
+                 osd_flags: Optional[List[str]] = None,
                  ):
         self._target_name: str = target_name  # Use CephadmUpgrade.target_image instead.
         self.progress_id: str = progress_id
@@ -88,6 +89,8 @@ class UpgradeState:
         self.services = services
         self.total_count = total_count
         self.remaining_count = remaining_count
+        # Global OSD flags actually set by cephadm for the upgrade and then unset afterwards.
+        self.osd_flags: Optional[List[str]] = osd_flags
 
     def to_json(self) -> dict:
         return {
@@ -106,6 +109,7 @@ class UpgradeState:
             'services': self.services,
             'total_count': self.total_count,
             'remaining_count': self.remaining_count,
+            'osd_flags': self.osd_flags,
         }
 
     @classmethod
@@ -310,8 +314,135 @@ class CephadmUpgrade:
             r["tags"] = sorted(ls)
         return r
 
+    def _get_osd_flags_for_upgrade(self) -> List[str]:
+        """
+        Return the OSD flags configured in mgr/cephadm/upgrade_osd_flags (a
+        comma-separated string) as a sorted list with no duplicates or blank entries.
+
+        An empty or unset option yields [], meaning "do not manage any flags".
+        """
+        # This is a cephadm module option, so read it from the mgr attribute.
+        s = str(self.mgr.upgrade_osd_flags or '').strip()
+        if not s:
+            return []
+
+        flags = [f.strip() for f in s.split(',') if f.strip()]  # tolerate 'a, b,,c'
+        # set() drops duplicates; sorting keeps the result deterministic and readable.
+        return sorted(set(flags))
+
+    def _rollback_osd_flags(self, flags: List[str]) -> List[str]:  # returns the failed flags
+        rollback_failed: List[str] = []  # flags whose `osd unset` command failed
+        for flag in flags:
+            try:
+                self.mgr.check_mon_command({
+                    'prefix': 'osd unset',
+                    'key': flag,
+                })
+            except MonCommandFailed as e:  # log and keep going with the remaining flags
+                rollback_failed.append(flag)
+                self.mgr.log.error(f'Upgrade: failed to rollback OSD flag {flag}: {e}')
+        return rollback_failed
+
+    def _set_osd_flags_for_upgrade(self) -> None:
+        """
+        Set the configured OSD flags for the duration of the upgrade.
+
+        Only flags cephadm sets itself are recorded in upgrade_state.osd_flags, so that
+        flags which were already set are left alone afterwards. If a flag cannot be set,
+        the flags set so far are rolled back and OrchestratorError is raised; the error
+        names any flag that could not be rolled back.
+        """
+        assert self.upgrade_state is not None
+
+        desired_flags = self._get_osd_flags_for_upgrade()
+        if not desired_flags:
+            # Explicitly configured to manage no flags.
+            self.upgrade_state.osd_flags = []
+            self._save_upgrade_state()
+            return
+
+        osd_map = self.mgr.get("osd_map")
+        flags_str = osd_map.get('flags', '') or ''
+        current_flags = set(f for f in flags_str.split(',') if f)
+
+        self.upgrade_state.osd_flags = []
+        self._save_upgrade_state()
+        for flag in desired_flags:
+            if flag in current_flags:
+                # One-off at upgrade start: be explicit when we skip a flag.
+                self.mgr.log.info(f'Upgrade: OSD flag {flag} already set; leaving as-is')
+                continue
+            self.mgr.log.info(f'Upgrade: Setting OSD flag {flag} for upgrade duration')
+            try:
+                self.mgr.check_mon_command({'prefix': 'osd set', 'key': flag})
+            except MonCommandFailed as e:
+                self.mgr.log.error(f'Upgrade: failed to set OSD flag {flag}: {e}')
+                # Unset anything we already set in this start attempt.
+                rollback_failed = self._rollback_osd_flags(self.upgrade_state.osd_flags or [])
+                self.upgrade_state.osd_flags = rollback_failed
+                self._save_upgrade_state()
+                # The caller drops upgrade_state on failure, so the error message is the
+                # only place left to report flags that are still set.
+                msg = f'Failed to set OSD flag {flag}: {e}'
+                if rollback_failed:
+                    msg += f'. Please manually unset: {",".join(rollback_failed)}'
+                raise OrchestratorError(msg) from e
+            # Record the flag only once it is set, and persist the updated list.
+            self.upgrade_state.osd_flags.append(flag)
+            self._save_upgrade_state()
+
+    def _restore_osd_flags_after_upgrade(self) -> None:
+        """
+        Unset the OSD flags that cephadm set for this upgrade (upgrade_state.osd_flags).
+
+        Flags cephadm did not set are left untouched. Raises OrchestratorError, keeping
+        the flags that failed to unset in upgrade_state, if any `osd unset` fails.
+        """
+        if not self.upgrade_state:
+            # No upgrade in progress, nothing to restore.
+            return
+
+        osd_map = self.mgr.get("osd_map")
+        flags_str = osd_map.get('flags', '') or ''
+        current_flags = set(f for f in flags_str.split(',') if f)
+
+        # Only unset the flags cephadm actually set. Flags whose unset fails are collected
+        # so that they stay recorded instead of being left behind silently.
+        remaining_flags: List[str] = []
+        for flag in (self.upgrade_state.osd_flags or []):
+            if flag not in current_flags:
+                self.mgr.log.info(f'Upgrade: OSD flag {flag} already unset; skipping')
+                continue
+
+            self.mgr.log.info(f'Upgrade: Unsetting OSD flag {flag} after upgrade')
+            try:
+                # A failure is logged below and the loop carries on with the other flags.
+                self.mgr.check_mon_command({
+                    'prefix': 'osd unset',
+                    'key': flag,
+                })
+            except MonCommandFailed as e:
+                remaining_flags.append(flag)
+                self.mgr.log.error(f'Upgrade: failed to unset OSD flag {flag}: {e}')
+                continue
+
+        if remaining_flags:
+            # At least one `osd unset` failed: persist the flags that are still pending
+            # and raise so the operator is told to unset them manually.
+            self.upgrade_state.osd_flags = remaining_flags
+            self._save_upgrade_state()
+            raise OrchestratorError(  # NOTE(review): reaches stop/complete callers; confirm
+                'Failed to restore OSD flags after upgrade. '
+                f'Please manually unset: {",".join(remaining_flags)}'
+            )
+
+        # Clear stored state now that flags have been restored.
+        self.upgrade_state.osd_flags = []
+        self._save_upgrade_state()
+
     def upgrade_start(self, image: str, version: str, daemon_types: Optional[List[str]] = None,
-                      hosts: Optional[List[str]] = None, services: Optional[List[str]] = None, limit: Optional[int] = None) -> str:
+                      hosts: Optional[List[str]] = None, services: Optional[List[str]] = None,
+                      limit: Optional[int] = None, no_osd_flags: bool = False) -> str:
         fail_fs_value = cast(bool, self.mgr.get_module_option_ex(
             'orchestrator', 'fail_fs', False))
         if self.mgr.mode != 'root':
@@ -358,6 +489,20 @@ class CephadmUpgrade:
             total_count=limit,
             remaining_count=limit,
         )
+        # Set OSD flags for the duration of the upgrade (unless --no-osd-flags was requested).
+        # If this fails, abort and clear upgrade_state so we don't leave a half-started upgrade around.
+        try:
+            if no_osd_flags:
+                # User passed --no-osd-flags, so we don't manage any flags for this upgrade.
+                self.upgrade_state.osd_flags = []
+                self._save_upgrade_state()
+            else:
+                self._set_osd_flags_for_upgrade()
+        except OrchestratorError as e:
+            self.mgr.log.error(f'Upgrade: failed to set OSD flags, aborting upgrade start: {e}')
+            self.upgrade_state = None
+            self._save_upgrade_state()
+            raise
         self._update_upgrade_progress(0.0)
         self._save_upgrade_state()
         self._clear_upgrade_health_checks()
@@ -487,6 +632,8 @@ class CephadmUpgrade:
         if self.upgrade_state.progress_id:
             self.mgr.remote('progress', 'complete',
                             self.upgrade_state.progress_id)
+        # Restore any OSD flags we temporarily set for this upgrade.
+        self._restore_osd_flags_after_upgrade()
         target_image = self.target_image
         self.mgr.log.info('Upgrade: Stopped')
         self.upgrade_state = None
@@ -1086,6 +1233,8 @@ class CephadmUpgrade:
         if not self.upgrade_state:
             logger.debug('_mark_upgrade_complete upgrade already marked complete, exiting')
             return
+        # Restore OSD flags before we clear the upgrade state.
+        self._restore_osd_flags_after_upgrade()
         logger.info('Upgrade: Complete!')
         if self.upgrade_state.progress_id:
             self.mgr.remote('progress', 'complete',
diff --git a/src/pybind/mgr/orchestrator/_interface.py b/src/pybind/mgr/orchestrator/_interface.py
index 136fde595ac..5c9ea15cc74 100644
--- a/src/pybind/mgr/orchestrator/_interface.py
+++ b/src/pybind/mgr/orchestrator/_interface.py
@@ -949,7 +949,7 @@ class Orchestrator(object):
         raise NotImplementedError()
 
     def upgrade_start(self, image: Optional[str], version: Optional[str], daemon_types: Optional[List[str]],
-                      hosts: Optional[str], services: Optional[List[str]], limit: Optional[int]) -> OrchResult[str]:
+                      hosts: Optional[str], services: Optional[List[str]], limit: Optional[int], no_osd_flags: bool = False) -> OrchResult[str]:
         raise NotImplementedError()
 
     def upgrade_pause(self) -> OrchResult[str]:
diff --git a/src/pybind/mgr/orchestrator/module.py b/src/pybind/mgr/orchestrator/module.py
index 5fc2fce63fa..a471fc39505 100644
--- a/src/pybind/mgr/orchestrator/module.py
+++ b/src/pybind/mgr/orchestrator/module.py
@@ -2569,12 +2569,13 @@ Usage:
                        hosts: Optional[str] = None,
                        services: Optional[str] = None,
                        limit: Optional[int] = None,
+                       no_osd_flags: bool = False,
                        ceph_version: Optional[str] = None) -> HandleCommandResult:
         """Initiate upgrade"""
         self._upgrade_check_image_name(image, ceph_version)
         dtypes = daemon_types.split(',') if daemon_types is not None else None
         service_names = services.split(',') if services is not None else None
-        completion = self.upgrade_start(image, ceph_version, dtypes, hosts, service_names, limit)
+        completion = self.upgrade_start(image, ceph_version, dtypes, hosts, service_names, limit, no_osd_flags)
         raise_if_exception(completion)
         return HandleCommandResult(stdout=completion.result_str())