https://docs.ceph.com/en/quincy/cephadm/upgrade/#staggered-upgrade
Relevant tracker: https://tracker.ceph.com/issues/55715
+ Relevant tracker: https://tracker.ceph.com/issues/5614
Starting the upgrade
====================
-.. note::
-
- `Staggered Upgrade`_ of the mons/mgrs may be necessary to have access
- to this new feature.
-
- Cephadm by default reduces `max_mds` to `1`. This can be disruptive for
- large-scale CephFS deployments because the cluster cannot quickly reduce
- active MDS(s) to `1`, and a single active MDS cannot easily handle the load
- of all clients even for a short time. Therefore, to upgrade MDS(s) without
- reducing `max_mds`, the `fail_fs` option can be set to `true` (default
- value is `false`) prior to initiating the upgrade:
-
- .. prompt:: bash #
-
- ceph config set mgr mgr/orchestrator/fail_fs true
-
- This would:
- #. Fail CephFS filesystems, bringing active MDS daemon(s) to
- `up:standby` state.
-
- #. Upgrade MDS daemons safely.
-
- #. Bring CephFS filesystems back up, bringing the state of active
- MDS daemon(s) from `up:standby` to `up:active`.
-
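While a `fail_fs`-based upgrade runs, the resulting MDS state transitions can
be observed directly (a minimal check, assuming a filesystem named `cephfs`):

.. prompt:: bash #

   ceph fs status cephfs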
Before you use cephadm to upgrade Ceph, verify that all hosts are currently online and that your cluster is healthy by running the following command:
.. prompt:: bash #

   ceph -s
+++ /dev/null
-../.qa/
\ No newline at end of file
+++ /dev/null
-.qa/cephfs/objectstore-ec/bluestore-bitmap.yaml
\ No newline at end of file
+++ /dev/null
-.qa/distros/podman/centos_8.stream_container_tools.yaml
\ No newline at end of file
+++ /dev/null
-.qa/cephfs/conf/
\ No newline at end of file
+++ /dev/null
-../.qa/
\ No newline at end of file
+++ /dev/null
-.qa/cephfs/overrides/ignorelist_health.yaml
\ No newline at end of file
+++ /dev/null
-.qa/cephfs/overrides/ignorelist_wrongly_marked_down.yaml
\ No newline at end of file
+++ /dev/null
-overrides:
- ceph:
- conf:
- global:
- mon pg warn min per osd: 0
+++ /dev/null
-overrides:
- kclient:
- syntax: 'v1'
+++ /dev/null
-roles:
-- - host.a
- - client.0
- - osd.0
- - osd.1
- - osd.2
-- - host.b
- - client.1
- - osd.3
- - osd.4
- - osd.5
+++ /dev/null
-../.qa/
\ No newline at end of file
+++ /dev/null
-../.qa/
\ No newline at end of file
+++ /dev/null
-meta:
-- desc: |
- setup ceph/pacific
-
-tasks:
-- install:
- branch: pacific
- exclude_packages:
- - ceph-volume
-- print: "**** done install task..."
-- cephadm:
- image: quay.io/ceph/daemon-base:latest-pacific
- roleless: true
- cephadm_branch: pacific
- cephadm_git_url: https://github.com/ceph/ceph
- conf:
- osd:
- # set config option for which cls modules are allowed to be loaded / used
- osd_class_load_list: "*"
- osd_class_default_list: "*"
-- print: "**** done end installing pacific cephadm ..."
-- cephadm.shell:
- host.a:
- - ceph config set mgr mgr/cephadm/use_repo_digest true --force
-- print: "**** done cephadm.shell ceph config set mgr..."
-- cephadm.shell:
- host.a:
- - ceph orch status
- - ceph orch ps
- - ceph orch ls
- - ceph orch host ls
- - ceph orch device ls
+++ /dev/null
-meta:
-- desc: |
- setup ceph/pacific v16.2.4
-
-tasks:
-# Disable metrics sending by kclient as it may crash (assert) a v16.2.4 MDS
-- pexec:
- clients:
- - sudo modprobe -r ceph
- - sudo modprobe ceph disable_send_metrics=on
-- install:
- tag: v16.2.4
- exclude_packages:
- - ceph-volume
-- print: "**** done install task..."
-- cephadm:
- roleless: true
- image: quay.io/ceph/ceph:v16.2.4
- cephadm_branch: v16.2.4
- cephadm_git_url: https://github.com/ceph/ceph
- # needed for v16.2.4 due to --skip-admin-label
- avoid_pacific_features: true
-- print: "**** done starting v16.2.4"
-- cephadm.shell:
- host.a:
- - ceph orch status
- - ceph orch ps
- - ceph orch ls
- - ceph orch host ls
- - ceph orch device ls
+++ /dev/null
-../.qa/
\ No newline at end of file
+++ /dev/null
-tasks:
-- cephadm.shell:
- host.a:
- - ceph fs volume create cephfs --placement=4
- - ceph fs dump
+++ /dev/null
-../.qa/
\ No newline at end of file
+++ /dev/null
-tasks:
-- cephadm.shell:
- host.a:
- - ceph fs set cephfs max_mds 2
+++ /dev/null
-../.qa/
\ No newline at end of file
+++ /dev/null
-tasks:
-- cephadm.shell:
- host.a:
- - ceph fs set cephfs allow_standby_replay false
+++ /dev/null
-tasks:
-- cephadm.shell:
- host.a:
- - ceph fs set cephfs inline_data false
+++ /dev/null
-tasks:
-- cephadm.shell:
- host.a:
- - ceph fs set cephfs inline_data true --yes-i-really-really-mean-it
+++ /dev/null
-tasks:
-- cephadm.shell:
- host.a:
- - ceph fs dump
- - ceph --format=json fs dump | jq -e ".filesystems | length == 1"
- - while ! ceph --format=json mds versions | jq -e ". | add == 4"; do sleep 1; done
-- fs.pre_upgrade_save:
+++ /dev/null
-tasks:
-- kclient:
-- print: "**** done client"
+++ /dev/null
-tasks:
-- parallel:
- - upgrade-tasks
- - workload-tasks
-
-upgrade-tasks:
- sequential:
- - cephadm.shell:
- env: [sha1]
- host.a:
- - ceph config set mon mon_warn_on_insecure_global_id_reclaim false --force
- - ceph config set mon mon_warn_on_insecure_global_id_reclaim_allowed false --force
- - ceph config set global log_to_journald false --force
- - ceph orch ps
- - ceph versions
- - ceph -s
- - ceph orch ls
- - ceph orch daemon redeploy "mgr.$(ceph mgr dump -f json | jq .standbys | jq .[] | jq -r .name)" --image quay.ceph.io/ceph-ci/ceph:$sha1
- - ceph orch ps --refresh
- - sleep 300
- - ceph orch ps
- - ceph versions
- - ceph -s
- - ceph versions | jq -e '.mgr | length == 2'
- - ceph mgr fail
- - sleep 180
- - ceph orch daemon redeploy "mgr.$(ceph mgr dump -f json | jq .standbys | jq .[] | jq -r .name)" --image quay.ceph.io/ceph-ci/ceph:$sha1
- - ceph orch ps --refresh
- - sleep 180
- - ceph orch ps
- - ceph versions
- - ceph -s
- - ceph mgr fail
- - sleep 300
- - ceph orch ps
- - ceph versions
- - ceph -s
- - ceph versions | jq -e '.mgr | length == 1'
- - ceph mgr fail
- - sleep 180
- - ceph orch ps
- - ceph versions
- - ceph orch upgrade start --image quay.ceph.io/ceph-ci/ceph:$sha1 --daemon-types mgr
- - while ceph orch upgrade status | jq '.in_progress' | grep true && ! ceph orch upgrade status | jq '.message' | grep Error ; do ceph orch ps ; ceph versions ; ceph orch upgrade status ; sleep 30 ; done
- - ceph versions | jq -e '.mgr | length == 1'
- - ceph versions | jq -e '.mgr | keys' | grep $sha1
- - ceph versions | jq -e '.overall | length == 2'
- - ceph orch upgrade check quay.ceph.io/ceph-ci/ceph:$sha1 | jq -e '.up_to_date | length == 2'
- - ceph orch ps --refresh
- - sleep 180
- - ceph config set mgr mgr/orchestrator/fail_fs true
- - ceph orch upgrade start --image quay.ceph.io/ceph-ci/ceph:$sha1
- - cephadm.shell:
- env: [sha1]
- host.a:
- - while ceph orch upgrade status | jq '.in_progress' | grep true && ! ceph orch upgrade status | jq '.message' | grep Error ; do ceph orch ps ; ceph versions ; ceph fs dump; ceph orch upgrade status ; sleep 30 ; done
- - ceph orch ps
- - ceph versions
- - echo "wait for servicemap items w/ changing names to refresh"
- - sleep 60
- - ceph orch ps
- - ceph health detail
- - ceph orch upgrade status
- - ceph versions
- - ceph versions | jq -e '.overall | length == 1'
- - ceph versions | jq -e '.overall | keys' | grep $sha1
-
-workload-tasks:
- sequential:
- - workunit:
- clients:
- all:
- - suites/fsstress.sh
+++ /dev/null
-tasks:
-- cephadm.shell:
- host.a:
- - ceph fs dump
-- fs.post_upgrade_checks:
# Everything up to CEPH_MDSMAP_ALLOW_STANDBY_REPLAY
CEPH_MDSMAP_ALLOW_STANDBY_REPLAY = (1<<5)
-CEPH_MDSMAP_NOT_JOINABLE = (1 << 0)
CEPH_MDSMAP_LAST = CEPH_MDSMAP_ALLOW_STANDBY_REPLAY
UPGRADE_FLAGS_MASK = ((CEPH_MDSMAP_LAST<<1) - 1)
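As a worked example, with the last tracked flag at bit 5 the mask covers bits
0 through 5 inclusive:

    # Worked example of the mask arithmetic above (values per ceph_fs.h).
    CEPH_MDSMAP_ALLOW_STANDBY_REPLAY = (1 << 5)         # 0b100000 == 32
    CEPH_MDSMAP_LAST = CEPH_MDSMAP_ALLOW_STANDBY_REPLAY
    UPGRADE_FLAGS_MASK = ((CEPH_MDSMAP_LAST << 1) - 1)  # 0b111111 == 63
    assert UPGRADE_FLAGS_MASK == 0x3f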
def pre_upgrade_save(ctx, config):
epoch = mdsmap['epoch']
pre_upgrade_epoch = fs_state['epoch']
assert pre_upgrade_epoch < epoch
- multiple_max_mds = fs_state['max_mds'] > 1
+ should_decrease_max_mds = fs_state['max_mds'] > 1
did_decrease_max_mds = False
should_disable_allow_standby_replay = fs_state['flags'] & CEPH_MDSMAP_ALLOW_STANDBY_REPLAY
did_disable_allow_standby_replay = False
- did_fail_fs = False
for i in range(pre_upgrade_epoch+1, mdsmap['epoch']):
old_status = mdsc.status(epoch=i)
old_fs = old_status.get_fsmap(fscid)
old_mdsmap = old_fs['mdsmap']
- if not multiple_max_mds \
- and (old_mdsmap['flags'] & CEPH_MDSMAP_NOT_JOINABLE):
- raise RuntimeError('mgr is failing fs when there is only one '
- f'rank in epoch {i}.')
- if multiple_max_mds \
- and (old_mdsmap['flags'] & CEPH_MDSMAP_NOT_JOINABLE) \
- and old_mdsmap['max_mds'] == 1:
- raise RuntimeError('mgr is failing fs as well as reducing '
- f'max_mds in epoch {i}')
- if old_mdsmap['flags'] & CEPH_MDSMAP_NOT_JOINABLE:
- log.debug(f"max_mds not reduced in epoch {i} as the fs was "
- "failed to carry out a rapid multi-rank mds upgrade")
- did_fail_fs = True
- if multiple_max_mds and old_mdsmap['max_mds'] == 1:
+ if should_decrease_max_mds and old_mdsmap['max_mds'] == 1:
log.debug(f"max_mds reduced in epoch {i}")
did_decrease_max_mds = True
if should_disable_allow_standby_replay and not (old_mdsmap['flags'] & CEPH_MDSMAP_ALLOW_STANDBY_REPLAY):
log.debug(f"allow_standby_replay disabled in epoch {i}")
did_disable_allow_standby_replay = True
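# Each assertion below encodes an implication: if the pre-upgrade state
# required a change (max_mds > 1, or standby-replay enabled), then some
# epoch in the scanned range must show that change actually happening.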
- assert not multiple_max_mds or did_fail_fs or did_decrease_max_mds
+ assert not should_decrease_max_mds or did_decrease_max_mds
assert not should_disable_allow_standby_replay or did_disable_allow_standby_replay
import logging
import time
import uuid
-from typing import TYPE_CHECKING, Optional, Dict, List, Tuple, Any, cast
+from typing import TYPE_CHECKING, Optional, Dict, List, Tuple, Any
import orchestrator
from cephadm.registry import Registry
# from ceph_fs.h
CEPH_MDSMAP_ALLOW_STANDBY_REPLAY = (1 << 5)
-CEPH_MDSMAP_NOT_JOINABLE = (1 << 0)
def normalize_image_digest(digest: str, default_registry: str) -> str:
target_version: Optional[str] = None,
error: Optional[str] = None,
paused: Optional[bool] = None,
- fail_fs: bool = False,
fs_original_max_mds: Optional[Dict[str, int]] = None,
fs_original_allow_standby_replay: Optional[Dict[str, bool]] = None,
daemon_types: Optional[List[str]] = None,
self.fs_original_max_mds: Optional[Dict[str, int]] = fs_original_max_mds
self.fs_original_allow_standby_replay: Optional[Dict[str,
bool]] = fs_original_allow_standby_replay
- self.fail_fs = fail_fs
self.daemon_types = daemon_types
self.hosts = hosts
self.services = services
'target_id': self.target_id,
'target_digests': self.target_digests,
'target_version': self.target_version,
- 'fail_fs': self.fail_fs,
'fs_original_max_mds': self.fs_original_max_mds,
'fs_original_allow_standby_replay': self.fs_original_allow_standby_replay,
'error': self.error,
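After the revert, the serialized upgrade state no longer carries a `fail_fs`
key. A minimal sketch of the persisted dict, limited to the fields visible in
this hunk (all values are illustrative, not taken from a real cluster):

    # Illustrative UpgradeState.to_json() snapshot; only keys shown in the
    # hunk above are included, and every value here is made up.
    example_state = {
        'target_name': 'quay.io/ceph/ceph:v17.2.6',
        'target_id': None,
        'target_digests': None,
        'target_version': '17.2.6',
        'fs_original_max_mds': {'1': 2},
        'fs_original_allow_standby_replay': {'1': False},
        'error': None,
    }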
def upgrade_start(self, image: str, version: str, daemon_types: Optional[List[str]] = None,
hosts: Optional[List[str]] = None, services: Optional[List[str]] = None, limit: Optional[int] = None) -> str:
- fail_fs_value = cast(bool, self.mgr.get_module_option_ex(
- 'orchestrator', 'fail_fs', False))
if self.mgr.mode != 'root':
raise OrchestratorError('upgrade is not supported in %s mode' % (
self.mgr.mode))
self.upgrade_state = UpgradeState(
target_name=target_name,
progress_id=str(uuid.uuid4()),
- fail_fs=fail_fs_value,
daemon_types=daemon_types,
hosts=hosts,
services=services,
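A sketch of how a staggered upgrade might reach this entry point (assumptions:
`upgrade` is the CephadmUpgrade instance, and the image tag and limit are
illustrative; see the staggered-upgrade link in the header for the
corresponding CLI flags):

    # Hypothetical call site, mirroring something like:
    #   ceph orch upgrade start --image quay.io/ceph/ceph:v17.2.6 \
    #       --daemon-types mgr --limit 2
    upgrade.upgrade_start(
        image='quay.io/ceph/ceph:v17.2.6',
        version='',               # image is pinned, so no version is given
        daemon_types=['mgr'],
        hosts=None,
        services=None,
        limit=2,
    )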
# scale down this filesystem?
if mdsmap["max_mds"] > 1:
- if self.upgrade_state.fail_fs:
- if not (mdsmap['flags'] & CEPH_MDSMAP_NOT_JOINABLE) and \
- len(mdsmap['up']) > 0:
- self.mgr.log.info(f'Upgrade: failing fs {fs_name} for '
- f'rapid multi-rank mds upgrade')
- ret, out, err = self.mgr.check_mon_command({
- 'prefix': 'fs fail',
- 'fs_name': fs_name
- })
- if ret != 0:
- continue_upgrade = False
- continue
- else:
- self.mgr.log.info('Upgrade: Scaling down filesystem %s' % (
- fs_name
- ))
- if fscid not in self.upgrade_state.fs_original_max_mds:
- self.upgrade_state.fs_original_max_mds[fscid] = \
- mdsmap['max_mds']
- self._save_upgrade_state()
- ret, out, err = self.mgr.check_mon_command({
- 'prefix': 'fs set',
- 'fs_name': fs_name,
- 'var': 'max_mds',
- 'val': '1',
- })
- continue_upgrade = False
- continue
+ self.mgr.log.info('Upgrade: Scaling down filesystem %s' % (
+ fs_name
+ ))
+ if fscid not in self.upgrade_state.fs_original_max_mds:
+ self.upgrade_state.fs_original_max_mds[fscid] = mdsmap['max_mds']
+ self._save_upgrade_state()
+ ret, out, err = self.mgr.check_mon_command({
+ 'prefix': 'fs set',
+ 'fs_name': fs_name,
+ 'var': 'max_mds',
+ 'val': '1',
+ })
+ continue_upgrade = False
+ continue
- if not self.upgrade_state.fail_fs:
- if not (mdsmap['in'] == [0] and len(mdsmap['up']) <= 1):
- self.mgr.log.info(
- 'Upgrade: Waiting for fs %s to scale down to reach 1 MDS' % (
- fs_name))
- time.sleep(10)
- continue_upgrade = False
- continue
+ if not (mdsmap['in'] == [0] and len(mdsmap['up']) <= 1):
+ self.mgr.log.info(
+ 'Upgrade: Waiting for fs %s to scale down to reach 1 MDS' % (fs_name))
+ time.sleep(10)
+ continue_upgrade = False
+ continue
if len(mdsmap['up']) == 0:
self.mgr.log.warning(
return False, to_upgrade
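Condensed, the restored per-filesystem gate above is a pure predicate (a
sketch; `mdsmap` stands for the decoded MDSMap dict of one filesystem):

    def mds_scale_down_complete(mdsmap: dict) -> bool:
        # The upgrade may touch MDS daemons only once rank 0 is the sole
        # 'in' rank and at most one MDS remains up.
        return mdsmap['in'] == [0] and len(mdsmap['up']) <= 1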
if d.daemon_type == 'mds' and self._enough_mds_for_ok_to_stop(d):
- # when fail_fs is set to true, all MDS daemons will be moved to
- # up:standby state, so Cephadm won't be able to upgrade due to
- # this check and will warn with "It is NOT safe to stop
- # mds.<daemon_name> at this time: one or more filesystems is
- # currently degraded", therefore we bypass this check for that
- # case.
- assert self.upgrade_state is not None
- if not self.upgrade_state.fail_fs \
- and not self._wait_for_ok_to_stop(d, known_ok_to_stop):
+ if not self._wait_for_ok_to_stop(d, known_ok_to_stop):
return False, to_upgrade
to_upgrade.append(d_entry)
def _complete_mds_upgrade(self) -> None:
assert self.upgrade_state is not None
- if self.upgrade_state.fail_fs:
- for fs in self.mgr.get("fs_map")['filesystems']:
- fs_name = fs['mdsmap']['fs_name']
- self.mgr.log.info('Upgrade: Setting filesystem '
- f'{fs_name} Joinable')
- try:
- ret, _, err = self.mgr.check_mon_command({
- 'prefix': 'fs set',
- 'fs_name': fs_name,
- 'var': 'joinable',
- 'val': 'true',
- })
- except Exception as e:
- logger.error("Failed to set fs joinable "
- f"true due to {e}")
- raise OrchestratorError("Failed to set"
- "fs joinable true"
- f"due to {e}")
- elif self.upgrade_state.fs_original_max_mds:
+ if self.upgrade_state.fs_original_max_mds:
for fs in self.mgr.get("fs_map")['filesystems']:
fscid = fs["id"]
fs_name = fs['mdsmap']['fs_name']
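The remainder of this path is cut off in the hunk; presumably it restores each
filesystem's saved `max_mds`. A sketch of that restore step, mirroring the
scale-down command earlier in this file (an assumption, not the verbatim code):

    # Presumed restore step: undo the pre-upgrade scale-down using the
    # value stashed in fs_original_max_mds at upgrade start.
    ret, out, err = self.mgr.check_mon_command({
        'prefix': 'fs set',
        'fs_name': fs_name,
        'var': 'max_mds',
        'val': str(self.upgrade_state.fs_original_max_mds[fscid]),
    })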
desc='Orchestrator backend',
enum_allowed=['cephadm', 'rook', 'test_orchestrator'],
runtime=True,
- ),
- Option(
- 'fail_fs',
- type='bool',
- default=False,
- desc='Fail filesystem for rapid multi-rank mds upgrade'
- ),
+ )
]
NATIVE_OPTIONS = [] # type: List[dict]
def _select_orchestrator(self) -> str:
return cast(str, self.get_module_option("orchestrator"))
- def _get_fail_fs_value(self) -> bool:
- return bool(self.get_module_option("fail_fs"))
-
@_cli_write_command('orch host add')
def _add_host(self,
hostname: str,
self._set_backend('')
assert self._select_orchestrator() is None
self._set_backend(old_orch)
- old_fs_fail_value = self._get_fail_fs_value()
- self.set_module_option("fail_fs", True)
- assert self._get_fail_fs_value() is True
- self.set_module_option("fail_fs", False)
- assert self._get_fail_fs_value() is False
- self.set_module_option("fail_fs", old_fs_fail_value)
e1 = self.remote('selftest', 'remote_from_orchestrator_cli_self_test', "ZeroDivisionError")
try: