From a8a86789c7b6af720568721f60fc6f8760fa5725 Mon Sep 17 00:00:00 2001
From: Patrick Donnelly <pdonnell@redhat.com>
Date: Mon, 22 Aug 2022 13:09:58 -0400
Subject: [PATCH] Revert "Merge pull request #47092 from
 dparmar18/wip-dparmar-cephadm-simple-1"

This reverts commit 1c4da3dbd20d9683fe681cd7083478809cbb19b9, reversing
changes made to ee1e163b1e69c4db558cdf6b857c7c82cd4820d8.

Signed-off-by: Patrick Donnelly <pdonnell@redhat.com>
---
 PendingReleaseNotes                           |  1 +
 doc/cephadm/upgrade.rst                       | 25 -----
 .../upgrade_without_reducing_max_mds/%        |  0
 .../upgrade_without_reducing_max_mds/.qa      |  1 -
 .../bluestore-bitmap.yaml                     |  1 -
 .../centos_8.stream_container_tools.yaml      |  1 -
 .../upgrade_without_reducing_max_mds/conf     |  1 -
 .../overrides/%                               |  0
 .../overrides/.qa                             |  1 -
 .../overrides/ignorelist_health.yaml          |  1 -
 .../ignorelist_wrongly_marked_down.yaml       |  1 -
 .../overrides/pg-warn.yaml                    |  5 -
 .../overrides/syntax.yaml                     |  3 -
 .../roles.yaml                                | 11 ---
 .../upgrade_without_reducing_max_mds/tasks/%  |  0
 .../tasks/.qa                                 |  1 -
 .../tasks/0-from/.qa                          |  1 -
 .../tasks/0-from/pacific.yaml                 | 32 -------
 .../tasks/0-from/v16.2.4.yaml                 | 30 ------
 .../tasks/1-volume/%                          |  0
 .../tasks/1-volume/.qa                        |  1 -
 .../tasks/1-volume/0-create.yaml              |  5 -
 .../tasks/1-volume/1-ranks/.qa                |  1 -
 .../tasks/1-volume/1-ranks/1.yaml             |  4 -
 .../tasks/1-volume/2-allow_standby_replay/.qa |  1 -
 .../1-volume/2-allow_standby_replay/no.yaml   |  4 -
 .../tasks/1-volume/3-inline/no.yaml           |  4 -
 .../tasks/1-volume/3-inline/yes.yaml          |  4 -
 .../tasks/1-volume/4-verify.yaml              |  7 --
 .../tasks/2-client.yaml                       |  3 -
 .../tasks/3-upgrade-with-workload.yaml        | 73 --------------
 .../tasks/4-verify.yaml                       |  5 -
 qa/tasks/fs.py                                | 21 +---
 src/pybind/mgr/cephadm/upgrade.py             | 95 +++++--------------
 src/pybind/mgr/orchestrator/module.py         | 17 +---
 35 files changed, 28 insertions(+), 333 deletions(-)
 delete mode 100644 qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/%
 delete mode 120000 qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/.qa
 delete mode 120000 qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/bluestore-bitmap.yaml
 delete mode 120000 qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/centos_8.stream_container_tools.yaml
 delete mode 120000 qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/conf
 delete mode 100644 qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/overrides/%
 delete mode 120000 qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/overrides/.qa
 delete mode 120000 qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/overrides/ignorelist_health.yaml
 delete mode 120000 qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/overrides/ignorelist_wrongly_marked_down.yaml
 delete mode 100644 qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/overrides/pg-warn.yaml
 delete mode 100644 qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/overrides/syntax.yaml
 delete mode 100644 qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/roles.yaml
 delete mode 100644 qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/tasks/%
 delete mode 120000 qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/tasks/.qa
 delete mode 120000 qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/tasks/0-from/.qa
 delete mode 100644 qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/tasks/0-from/pacific.yaml
 delete mode 100644 qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/tasks/0-from/v16.2.4.yaml
 delete mode 100644 qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/tasks/1-volume/%
 delete mode 120000 qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/tasks/1-volume/.qa
 delete mode 100644 qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/tasks/1-volume/0-create.yaml
 delete mode 120000 qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/tasks/1-volume/1-ranks/.qa
 delete mode 100644 qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/tasks/1-volume/1-ranks/1.yaml
 delete mode 120000 qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/tasks/1-volume/2-allow_standby_replay/.qa
 delete mode 100644 qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/tasks/1-volume/2-allow_standby_replay/no.yaml
 delete mode 100644 qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/tasks/1-volume/3-inline/no.yaml
 delete mode 100644 qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/tasks/1-volume/3-inline/yes.yaml
 delete mode 100644 qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/tasks/1-volume/4-verify.yaml
 delete mode 100644 qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/tasks/2-client.yaml
 delete mode 100644 qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/tasks/3-upgrade-with-workload.yaml
 delete mode 100644 qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/tasks/4-verify.yaml

diff --git a/PendingReleaseNotes b/PendingReleaseNotes
index ddc6be10d149f..7e8da3f8ba165 100644
--- a/PendingReleaseNotes
+++ b/PendingReleaseNotes
@@ -99,3 +99,4 @@
   of the feature, refer this link on how to perform it:
   https://docs.ceph.com/en/quincy/cephadm/upgrade/#staggered-upgrade
   Relevant tracker: https://tracker.ceph.com/issues/55715
+  Relevant tracker: https://tracker.ceph.com/issues/5614
diff --git a/doc/cephadm/upgrade.rst b/doc/cephadm/upgrade.rst
index 8e62af61e440b..221f212449f79 100644
--- a/doc/cephadm/upgrade.rst
+++ b/doc/cephadm/upgrade.rst
@@ -48,31 +48,6 @@ The automated upgrade process follows Ceph best practices. For example:
 Starting the upgrade
 ====================
 
-.. note::
-   .. note::
-      `Staggered Upgrade`_ of the mons/mgrs may be necessary to have access
-      to this new feature.
-
-   Cephadm by default reduces `max_mds` to `1`. This can be disruptive for large
-   scale CephFS deployments because the cluster cannot quickly reduce active MDS(s)
-   to `1` and a single active MDS cannot easily handle the load of all clients
-   even for a short time. Therefore, to upgrade MDS(s) without reducing `max_mds`,
-   the `fail_fs` option can to be set to `true` (default value is `false`) prior
-   to initiating the upgrade:
-
-   .. prompt:: bash #
-
-      ceph config set mgr mgr/orchestrator/fail_fs true
-
-   This would:
-   #. Fail CephFS filesystems, bringing active MDS daemon(s) to
-      `up:standby` state.
-
-   #. Upgrade MDS daemons safely.
-
-   #. Bring CephFS filesystems back up, bringing the state of active
-      MDS daemon(s) from `up:standby` to `up:active`.
-
 Before you use cephadm to upgrade Ceph, verify that all hosts are currently online and that your cluster is healthy by running the following command:
 
 .. prompt:: bash #
diff --git a/qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/% b/qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/%
deleted file mode 100644
index e69de29bb2d1d..0000000000000
diff --git a/qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/.qa b/qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/.qa
deleted file mode 120000
index a602a0353e751..0000000000000
--- a/qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/.qa
+++ /dev/null
@@ -1 +0,0 @@
-../.qa/
\ No newline at end of file
diff --git a/qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/bluestore-bitmap.yaml b/qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/bluestore-bitmap.yaml
deleted file mode 120000
index fb603bc9a64c8..0000000000000
--- a/qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/bluestore-bitmap.yaml
+++ /dev/null
@@ -1 +0,0 @@
-.qa/cephfs/objectstore-ec/bluestore-bitmap.yaml
\ No newline at end of file
diff --git a/qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/centos_8.stream_container_tools.yaml b/qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/centos_8.stream_container_tools.yaml
deleted file mode 120000
index 7a86f967f0203..0000000000000
--- a/qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/centos_8.stream_container_tools.yaml
+++ /dev/null
@@ -1 +0,0 @@
-.qa/distros/podman/centos_8.stream_container_tools.yaml
\ No newline at end of file
diff --git a/qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/conf b/qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/conf
deleted file mode 120000
index 6d47129847fad..0000000000000
--- a/qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/conf
+++ /dev/null
@@ -1 +0,0 @@
-.qa/cephfs/conf/
\ No newline at end of file
diff --git a/qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/overrides/% b/qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/overrides/%
deleted file mode 100644
index e69de29bb2d1d..0000000000000
diff --git a/qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/overrides/.qa b/qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/overrides/.qa
deleted file mode 120000
index a602a0353e751..0000000000000
--- a/qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/overrides/.qa
+++ /dev/null
@@ -1 +0,0 @@
-../.qa/
\ No newline at end of file
diff --git a/qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/overrides/ignorelist_health.yaml b/qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/overrides/ignorelist_health.yaml
deleted file mode 120000
index 5cb891a95c3c8..0000000000000
--- a/qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/overrides/ignorelist_health.yaml
+++ /dev/null
@@ -1 +0,0 @@
-.qa/cephfs/overrides/ignorelist_health.yaml
\ No newline at end of file
diff --git a/qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/overrides/ignorelist_wrongly_marked_down.yaml b/qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/overrides/ignorelist_wrongly_marked_down.yaml
deleted file mode 120000
index f317cb714ca28..0000000000000
--- a/qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/overrides/ignorelist_wrongly_marked_down.yaml
+++ /dev/null
@@ -1 +0,0 @@
-.qa/cephfs/overrides/ignorelist_wrongly_marked_down.yaml
\ No newline at end of file
diff --git a/qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/overrides/pg-warn.yaml b/qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/overrides/pg-warn.yaml
deleted file mode 100644
index 4ae54a40d3195..0000000000000
--- a/qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/overrides/pg-warn.yaml
+++ /dev/null
@@ -1,5 +0,0 @@
-overrides:
-  ceph:
-    conf:
-      global:
-        mon pg warn min per osd: 0
diff --git a/qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/overrides/syntax.yaml b/qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/overrides/syntax.yaml
deleted file mode 100644
index 84d5d43b2570b..0000000000000
--- a/qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/overrides/syntax.yaml
+++ /dev/null
@@ -1,3 +0,0 @@
-overrides:
-  kclient:
-    syntax: 'v1'
diff --git a/qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/roles.yaml b/qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/roles.yaml
deleted file mode 100644
index bce4ecd34ccb3..0000000000000
--- a/qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/roles.yaml
+++ /dev/null
@@ -1,11 +0,0 @@
-roles:
-- - host.a
-  - client.0
-  - osd.0
-  - osd.1
-  - osd.2
-- - host.b
-  - client.1
-  - osd.3
-  - osd.4
-  - osd.5
diff --git a/qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/tasks/% b/qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/tasks/%
deleted file mode 100644
index e69de29bb2d1d..0000000000000
diff --git a/qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/tasks/.qa b/qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/tasks/.qa
deleted file mode 120000
index a602a0353e751..0000000000000
--- a/qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/tasks/.qa
+++ /dev/null
@@ -1 +0,0 @@
-../.qa/
\ No newline at end of file
diff --git a/qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/tasks/0-from/.qa b/qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/tasks/0-from/.qa
deleted file mode 120000
index a602a0353e751..0000000000000
--- a/qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/tasks/0-from/.qa
+++ /dev/null
@@ -1 +0,0 @@
-../.qa/
\ No newline at end of file
diff --git a/qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/tasks/0-from/pacific.yaml b/qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/tasks/0-from/pacific.yaml
deleted file mode 100644
index 67c27ba6bac3e..0000000000000
--- a/qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/tasks/0-from/pacific.yaml
+++ /dev/null
@@ -1,32 +0,0 @@
-meta:
-- desc: |
-    setup ceph/pacific
-
-tasks:
-- install:
-    branch: pacific
-    exclude_packages:
-      - ceph-volume
-- print: "**** done install task..."
-- cephadm:
-    image: quay.io/ceph/daemon-base:latest-pacific
-    roleless: true
-    cephadm_branch: pacific
-    cephadm_git_url: https://github.com/ceph/ceph
-    conf:
-      osd:
-        #set config option for which cls modules are allowed to be loaded / used
-        osd_class_load_list: "*"
-        osd_class_default_list: "*"
-- print: "**** done end installing pacific cephadm ..."
-- cephadm.shell:
-    host.a:
-      - ceph config set mgr mgr/cephadm/use_repo_digest true --force
-- print: "**** done cephadm.shell ceph config set mgr..."
-- cephadm.shell:
-    host.a:
-      - ceph orch status
-      - ceph orch ps
-      - ceph orch ls
-      - ceph orch host ls
-      - ceph orch device ls
diff --git a/qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/tasks/0-from/v16.2.4.yaml b/qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/tasks/0-from/v16.2.4.yaml
deleted file mode 100644
index c732d692ac13d..0000000000000
--- a/qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/tasks/0-from/v16.2.4.yaml
+++ /dev/null
@@ -1,30 +0,0 @@
-meta:
-- desc: |
-    setup ceph/pacific v16.2.4
-
-tasks:
-# Disable metrics sending by kclient as it may crash (assert) a v16.2.4 MDS
-- pexec:
-    clients:
-      - sudo modprobe -r ceph
-      - sudo modprobe ceph disable_send_metrics=on
-- install:
-    tag: v16.2.4
-    exclude_packages:
-      - ceph-volume
-- print: "**** done install task..."
-- cephadm:
-    roleless: true
-    image: quay.io/ceph/ceph:v16.2.4
-    cephadm_branch: v16.2.4
-    cephadm_git_url: https://github.com/ceph/ceph
-    # needed for v16.2.4 due to --skip-admin-label
-    avoid_pacific_features: true
-- print: "**** done starting v16.2.4"
-- cephadm.shell:
-    host.a:
-      - ceph orch status
-      - ceph orch ps
-      - ceph orch ls
-      - ceph orch host ls
-      - ceph orch device ls
diff --git a/qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/tasks/1-volume/% b/qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/tasks/1-volume/%
deleted file mode 100644
index e69de29bb2d1d..0000000000000
diff --git a/qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/tasks/1-volume/.qa b/qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/tasks/1-volume/.qa
deleted file mode 120000
index a602a0353e751..0000000000000
--- a/qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/tasks/1-volume/.qa
+++ /dev/null
@@ -1 +0,0 @@
-../.qa/
\ No newline at end of file
diff --git a/qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/tasks/1-volume/0-create.yaml b/qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/tasks/1-volume/0-create.yaml
deleted file mode 100644
index 5ee0022c6bd6e..0000000000000
--- a/qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/tasks/1-volume/0-create.yaml
+++ /dev/null
@@ -1,5 +0,0 @@
-tasks:
-- cephadm.shell:
-    host.a:
-      - ceph fs volume create cephfs --placement=4
-      - ceph fs dump
diff --git a/qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/tasks/1-volume/1-ranks/.qa b/qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/tasks/1-volume/1-ranks/.qa
deleted file mode 120000
index a602a0353e751..0000000000000
--- a/qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/tasks/1-volume/1-ranks/.qa
+++ /dev/null
@@ -1 +0,0 @@
-../.qa/
\ No newline at end of file
diff --git a/qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/tasks/1-volume/1-ranks/1.yaml b/qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/tasks/1-volume/1-ranks/1.yaml
deleted file mode 100644
index fcd3b1ea42ea7..0000000000000
--- a/qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/tasks/1-volume/1-ranks/1.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-tasks:
-- cephadm.shell:
-    host.a:
-      - ceph fs set cephfs max_mds 2
diff --git a/qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/tasks/1-volume/2-allow_standby_replay/.qa b/qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/tasks/1-volume/2-allow_standby_replay/.qa
deleted file mode 120000
index a602a0353e751..0000000000000
--- a/qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/tasks/1-volume/2-allow_standby_replay/.qa
+++ /dev/null
@@ -1 +0,0 @@
-../.qa/
\ No newline at end of file
diff --git a/qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/tasks/1-volume/2-allow_standby_replay/no.yaml b/qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/tasks/1-volume/2-allow_standby_replay/no.yaml
deleted file mode 100644
index 3dbc810899465..0000000000000
--- a/qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/tasks/1-volume/2-allow_standby_replay/no.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-tasks:
-- cephadm.shell:
-    host.a:
-      - ceph fs set cephfs allow_standby_replay false
diff --git a/qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/tasks/1-volume/3-inline/no.yaml b/qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/tasks/1-volume/3-inline/no.yaml
deleted file mode 100644
index 107f30ecd218e..0000000000000
--- a/qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/tasks/1-volume/3-inline/no.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-tasks:
-- cephadm.shell:
-    host.a:
-      - ceph fs set cephfs inline_data false
diff --git a/qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/tasks/1-volume/3-inline/yes.yaml b/qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/tasks/1-volume/3-inline/yes.yaml
deleted file mode 100644
index 246ed71b448f1..0000000000000
--- a/qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/tasks/1-volume/3-inline/yes.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-tasks:
-- cephadm.shell:
-    host.a:
-      - ceph fs set cephfs inline_data true --yes-i-really-really-mean-it
diff --git a/qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/tasks/1-volume/4-verify.yaml b/qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/tasks/1-volume/4-verify.yaml
deleted file mode 100644
index e71365ad113c9..0000000000000
--- a/qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/tasks/1-volume/4-verify.yaml
+++ /dev/null
@@ -1,7 +0,0 @@
-tasks:
-- cephadm.shell:
-    host.a:
-      - ceph fs dump
-      - ceph --format=json fs dump | jq -e ".filesystems | length == 1"
-      - while ! ceph --format=json mds versions | jq -e ". | add == 4"; do sleep 1; done
-- fs.pre_upgrade_save:
diff --git a/qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/tasks/2-client.yaml b/qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/tasks/2-client.yaml
deleted file mode 100644
index 92b9dda84794f..0000000000000
--- a/qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/tasks/2-client.yaml
+++ /dev/null
@@ -1,3 +0,0 @@
-tasks:
-- kclient:
-- print: "**** done client"
diff --git a/qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/tasks/3-upgrade-with-workload.yaml b/qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/tasks/3-upgrade-with-workload.yaml
deleted file mode 100644
index 876cffd191564..0000000000000
--- a/qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/tasks/3-upgrade-with-workload.yaml
+++ /dev/null
@@ -1,73 +0,0 @@
-tasks:
-- parallel:
-  - upgrade-tasks
-  - workload-tasks
-
-upgrade-tasks:
-  sequential:
-  - cephadm.shell:
-      env: [sha1]
-      host.a:
-        - ceph config set mon mon_warn_on_insecure_global_id_reclaim false --force
-        - ceph config set mon mon_warn_on_insecure_global_id_reclaim_allowed false --force
-        - ceph config set global log_to_journald false --force
-        - ceph orch ps
-        - ceph versions
-        - ceph -s
-        - ceph orch ls
-        - ceph orch daemon redeploy "mgr.$(ceph mgr dump -f json | jq .standbys | jq .[] | jq -r .name)" --image quay.ceph.io/ceph-ci/ceph:$sha1
-        - ceph orch ps --refresh
-        - sleep 300
-        - ceph orch ps
-        - ceph versions
-        - ceph -s
-        - ceph versions | jq -e '.mgr | length == 2'
-        - ceph mgr fail
-        - sleep 180
-        - ceph orch daemon redeploy "mgr.$(ceph mgr dump -f json | jq .standbys | jq .[] | jq -r .name)" --image quay.ceph.io/ceph-ci/ceph:$sha1
-        - ceph orch ps --refresh
-        - sleep 180
-        - ceph orch ps
-        - ceph versions
-        - ceph -s
-        - ceph mgr fail
-        - sleep 300
-        - ceph orch ps
-        - ceph versions
-        - ceph -s
-        - ceph versions | jq -e '.mgr | length == 1'
-        - ceph mgr fail
-        - sleep 180
-        - ceph orch ps
-        - ceph versions
-        - ceph orch upgrade start --image quay.ceph.io/ceph-ci/ceph:$sha1 --daemon-types mgr
-        - while ceph orch upgrade status | jq '.in_progress' | grep true && ! ceph orch upgrade status | jq '.message' | grep Error ; do ceph orch ps ; ceph versions ; ceph orch upgrade status ; sleep 30 ; done
-        - ceph versions | jq -e '.mgr | length == 1'
-        - ceph versions | jq -e '.mgr | keys' | grep $sha1
-        - ceph versions | jq -e '.overall | length == 2'
-        - ceph orch upgrade check quay.ceph.io/ceph-ci/ceph:$sha1 | jq -e '.up_to_date | length == 2'
-        - ceph orch ps --refresh
-        - sleep 180
-        - ceph config set mgr mgr/orchestrator/fail_fs true
-        - ceph orch upgrade start --image quay.ceph.io/ceph-ci/ceph:$sha1
-  - cephadm.shell:
-      env: [sha1]
-      host.a:
-        - while ceph orch upgrade status | jq '.in_progress' | grep true && ! ceph orch upgrade status | jq '.message' | grep Error ; do ceph orch ps ; ceph versions ; ceph fs dump; ceph orch upgrade status ; sleep 30 ; done
-        - ceph orch ps
-        - ceph versions
-        - echo "wait for servicemap items w/ changing names to refresh"
-        - sleep 60
-        - ceph orch ps
-        - ceph health detail
-        - ceph orch upgrade status
-        - ceph versions
-        - ceph versions | jq -e '.overall | length == 1'
-        - ceph versions | jq -e '.overall | keys' | grep $sha1
-
-workload-tasks:
-  sequential:
-  - workunit:
-      clients:
-        all:
-          - suites/fsstress.sh
diff --git a/qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/tasks/4-verify.yaml b/qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/tasks/4-verify.yaml
deleted file mode 100644
index c2b657e5a3b8a..0000000000000
--- a/qa/suites/fs/upgrade/upgrade_without_reducing_max_mds/tasks/4-verify.yaml
+++ /dev/null
@@ -1,5 +0,0 @@
-tasks:
-- cephadm.shell:
-    host.a:
-      - ceph fs dump
-- fs.post_upgrade_checks:
diff --git a/qa/tasks/fs.py b/qa/tasks/fs.py
index 7e62c80318726..f7a9330e29b50 100644
--- a/qa/tasks/fs.py
+++ b/qa/tasks/fs.py
@@ -11,7 +11,6 @@ log = logging.getLogger(__name__)
 
 # Everything up to CEPH_MDSMAP_ALLOW_STANDBY_REPLAY
 CEPH_MDSMAP_ALLOW_STANDBY_REPLAY = (1<<5)
-CEPH_MDSMAP_NOT_JOINABLE = (1 << 0)
 CEPH_MDSMAP_LAST = CEPH_MDSMAP_ALLOW_STANDBY_REPLAY
 UPGRADE_FLAGS_MASK = ((CEPH_MDSMAP_LAST<<1) - 1)
 def pre_upgrade_save(ctx, config):
@@ -60,35 +59,21 @@ def post_upgrade_checks(ctx, config):
     epoch = mdsmap['epoch']
     pre_upgrade_epoch = fs_state['epoch']
     assert pre_upgrade_epoch < epoch
-    multiple_max_mds = fs_state['max_mds'] > 1
+    should_decrease_max_mds = fs_state['max_mds'] > 1
     did_decrease_max_mds = False
     should_disable_allow_standby_replay = fs_state['flags'] & CEPH_MDSMAP_ALLOW_STANDBY_REPLAY
     did_disable_allow_standby_replay = False
-    did_fail_fs = False
     for i in range(pre_upgrade_epoch+1, mdsmap['epoch']):
         old_status = mdsc.status(epoch=i)
         old_fs = old_status.get_fsmap(fscid)
         old_mdsmap = old_fs['mdsmap']
-        if not multiple_max_mds \
-           and (old_mdsmap['flags'] & CEPH_MDSMAP_NOT_JOINABLE):
-            raise RuntimeError('mgr is failing fs when there is only one '
-                               f'rank in epoch {i}.')
-        if multiple_max_mds \
-           and (old_mdsmap['flags'] & CEPH_MDSMAP_NOT_JOINABLE) \
-           and old_mdsmap['max_mds'] == 1:
-            raise RuntimeError('mgr is failing fs as well the max_mds '
-                               f'is reduced in epoch {i}')
-        if old_mdsmap['flags'] & CEPH_MDSMAP_NOT_JOINABLE:
-            log.debug(f"max_mds not reduced in epoch {i} as fs was failed "
-                      "for carrying out rapid multi-rank mds upgrade")
-            did_fail_fs = True
-        if multiple_max_mds and old_mdsmap['max_mds'] == 1:
+        if should_decrease_max_mds and old_mdsmap['max_mds'] == 1:
             log.debug(f"max_mds reduced in epoch {i}")
             did_decrease_max_mds = True
         if should_disable_allow_standby_replay and not (old_mdsmap['flags'] & CEPH_MDSMAP_ALLOW_STANDBY_REPLAY):
             log.debug(f"allow_standby_replay disabled in epoch {i}")
             did_disable_allow_standby_replay = True
-    assert not multiple_max_mds or did_fail_fs or did_decrease_max_mds
+    assert not should_decrease_max_mds or did_decrease_max_mds
     assert not should_disable_allow_standby_replay or did_disable_allow_standby_replay
 
 
diff --git a/src/pybind/mgr/cephadm/upgrade.py b/src/pybind/mgr/cephadm/upgrade.py
index b7ad4a8b66eb3..c2cc0aff9775a 100644
--- a/src/pybind/mgr/cephadm/upgrade.py
+++ b/src/pybind/mgr/cephadm/upgrade.py
@@ -2,7 +2,7 @@ import json
 import logging
 import time
 import uuid
-from typing import TYPE_CHECKING, Optional, Dict, List, Tuple, Any, cast
+from typing import TYPE_CHECKING, Optional, Dict, List, Tuple, Any
 
 import orchestrator
 from cephadm.registry import Registry
@@ -20,7 +20,6 @@ logger = logging.getLogger(__name__)
 
 # from ceph_fs.h
 CEPH_MDSMAP_ALLOW_STANDBY_REPLAY = (1 << 5)
-CEPH_MDSMAP_NOT_JOINABLE = (1 << 0)
 
 
 def normalize_image_digest(digest: str, default_registry: str) -> str:
@@ -59,7 +58,6 @@ class UpgradeState:
                  target_version: Optional[str] = None,
                  error: Optional[str] = None,
                  paused: Optional[bool] = None,
-                 fail_fs: bool = False,
                  fs_original_max_mds: Optional[Dict[str, int]] = None,
                  fs_original_allow_standby_replay: Optional[Dict[str, bool]] = None,
                  daemon_types: Optional[List[str]] = None,
@@ -78,7 +76,6 @@ class UpgradeState:
         self.fs_original_max_mds: Optional[Dict[str, int]] = fs_original_max_mds
         self.fs_original_allow_standby_replay: Optional[Dict[str, bool]] = \
             fs_original_allow_standby_replay
-        self.fail_fs = fail_fs
         self.daemon_types = daemon_types
         self.hosts = hosts
         self.services = services
@@ -92,7 +89,6 @@ class UpgradeState:
             'target_id': self.target_id,
             'target_digests': self.target_digests,
             'target_version': self.target_version,
-            'fail_fs': self.fail_fs,
             'fs_original_max_mds': self.fs_original_max_mds,
             'fs_original_allow_standby_replay': self.fs_original_allow_standby_replay,
             'error': self.error,
@@ -303,8 +299,6 @@ class CephadmUpgrade:
     def upgrade_start(self, image: str, version: str, daemon_types: Optional[List[str]] = None,
                       hosts: Optional[List[str]] = None, services: Optional[List[str]] = None,
                       limit: Optional[int] = None) -> str:
-        fail_fs_value = cast(bool, self.mgr.get_module_option_ex(
-            'orchestrator', 'fail_fs', False))
         if self.mgr.mode != 'root':
             raise OrchestratorError('upgrade is not supported in %s mode' % (
                 self.mgr.mode))
@@ -342,7 +336,6 @@ class CephadmUpgrade:
         self.upgrade_state = UpgradeState(
             target_name=target_name,
             progress_id=str(uuid.uuid4()),
-            fail_fs=fail_fs_value,
             daemon_types=daemon_types,
             hosts=hosts,
             services=services,
@@ -619,43 +612,27 @@ class CephadmUpgrade:
 
             # scale down this filesystem?
             if mdsmap["max_mds"] > 1:
-                if self.upgrade_state.fail_fs:
-                    if not (mdsmap['flags'] & CEPH_MDSMAP_NOT_JOINABLE) and \
-                            len(mdsmap['up']) > 0:
-                        self.mgr.log.info(f'Upgrade: failing fs {fs_name} for '
-                                          f'rapid multi-rank mds upgrade')
-                        ret, out, err = self.mgr.check_mon_command({
-                            'prefix': 'fs fail',
-                            'fs_name': fs_name
-                        })
-                        if ret != 0:
-                            continue_upgrade = False
-                            continue
-                else:
-                    self.mgr.log.info('Upgrade: Scaling down filesystem %s' % (
-                        fs_name
-                    ))
-                    if fscid not in self.upgrade_state.fs_original_max_mds:
-                        self.upgrade_state.fs_original_max_mds[fscid] = \
-                            mdsmap['max_mds']
-                        self._save_upgrade_state()
-                    ret, out, err = self.mgr.check_mon_command({
-                        'prefix': 'fs set',
-                        'fs_name': fs_name,
-                        'var': 'max_mds',
-                        'val': '1',
-                    })
-                    continue_upgrade = False
-                    continue
+                self.mgr.log.info('Upgrade: Scaling down filesystem %s' % (
+                    fs_name
+                ))
+                if fscid not in self.upgrade_state.fs_original_max_mds:
+                    self.upgrade_state.fs_original_max_mds[fscid] = mdsmap['max_mds']
+                    self._save_upgrade_state()
+                ret, out, err = self.mgr.check_mon_command({
+                    'prefix': 'fs set',
+                    'fs_name': fs_name,
+                    'var': 'max_mds',
+                    'val': '1',
+                })
+                continue_upgrade = False
+                continue
 
-            if not self.upgrade_state.fail_fs:
-                if not (mdsmap['in'] == [0] and len(mdsmap['up']) <= 1):
-                    self.mgr.log.info(
-                        'Upgrade: Waiting for fs %s to scale down to reach 1 MDS' % (
-                            fs_name))
-                    time.sleep(10)
-                    continue_upgrade = False
-                    continue
+            if not (mdsmap['in'] == [0] and len(mdsmap['up']) <= 1):
+                self.mgr.log.info(
+                    'Upgrade: Waiting for fs %s to scale down to reach 1 MDS' % (fs_name))
+                time.sleep(10)
+                continue_upgrade = False
+                continue
 
             if len(mdsmap['up']) == 0:
                 self.mgr.log.warning(
@@ -799,15 +776,7 @@ class CephadmUpgrade:
                     return False, to_upgrade
 
                 if d.daemon_type == 'mds' and self._enough_mds_for_ok_to_stop(d):
-                    # when fail_fs is set to true, all MDS daemons will be moved to
-                    # up:standby state, so Cephadm won't be able to upgrade due to
-                    # this check and and will warn with "It is NOT safe to stop
-                    # mds. at this time: one or more filesystems is
-                    # currently degraded", therefore we bypass this check for that
-                    # case.
-                    assert self.upgrade_state is not None
-                    if not self.upgrade_state.fail_fs \
-                            and not self._wait_for_ok_to_stop(d, known_ok_to_stop):
+                    if not self._wait_for_ok_to_stop(d, known_ok_to_stop):
                         return False, to_upgrade
 
                 to_upgrade.append(d_entry)
@@ -953,25 +922,7 @@ class CephadmUpgrade:
 
     def _complete_mds_upgrade(self) -> None:
         assert self.upgrade_state is not None
-        if self.upgrade_state.fail_fs:
-            for fs in self.mgr.get("fs_map")['filesystems']:
-                fs_name = fs['mdsmap']['fs_name']
-                self.mgr.log.info('Upgrade: Setting filesystem '
-                                  f'{fs_name} Joinable')
-                try:
-                    ret, _, err = self.mgr.check_mon_command({
-                        'prefix': 'fs set',
-                        'fs_name': fs_name,
-                        'var': 'joinable',
-                        'val': 'true',
-                    })
-                except Exception as e:
-                    logger.error("Failed to set fs joinable "
-                                 f"true due to {e}")
-                    raise OrchestratorError("Failed to set"
-                                            "fs joinable true"
-                                            f"due to {e}")
-        elif self.upgrade_state.fs_original_max_mds:
+        if self.upgrade_state.fs_original_max_mds:
             for fs in self.mgr.get("fs_map")['filesystems']:
                 fscid = fs["id"]
                 fs_name = fs['mdsmap']['fs_name']
diff --git a/src/pybind/mgr/orchestrator/module.py b/src/pybind/mgr/orchestrator/module.py
index 753ae6b7cfcf7..a11d87cb08c2b 100644
--- a/src/pybind/mgr/orchestrator/module.py
+++ b/src/pybind/mgr/orchestrator/module.py
@@ -213,13 +213,7 @@ class OrchestratorCli(OrchestratorClientMixin, MgrModule,
             desc='Orchestrator backend',
             enum_allowed=['cephadm', 'rook', 'test_orchestrator'],
             runtime=True,
-        ),
-        Option(
-            'fail_fs',
-            type='bool',
-            default=False,
-            desc='Fail filesystem for rapid multi-rank mds upgrade'
-        ),
+        )
     ]
     NATIVE_OPTIONS = []  # type: List[dict]
 
@@ -345,9 +339,6 @@ class OrchestratorCli(OrchestratorClientMixin, MgrModule,
     def _select_orchestrator(self) -> str:
         return cast(str, self.get_module_option("orchestrator"))
 
-    def _get_fail_fs_value(self) -> bool:
-        return bool(self.get_module_option("fail_fs"))
-
     @_cli_write_command('orch host add')
     def _add_host(self,
                   hostname: str,
@@ -1493,12 +1484,6 @@ Usage:
         self._set_backend('')
         assert self._select_orchestrator() is None
         self._set_backend(old_orch)
-        old_fs_fail_value = self._get_fail_fs_value()
-        self.set_module_option("fail_fs", True)
-        assert self._get_fail_fs_value() is True
-        self.set_module_option("fail_fs", False)
-        assert self._get_fail_fs_value() is False
-        self.set_module_option("fail_fs", old_fs_fail_value)
         e1 = self.remote('selftest', 'remote_from_orchestrator_cli_self_test',
                          "ZeroDivisionError")
         try:
-- 
2.39.5
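
Editor's note: with fail_fs removed, the MDS upgrade flow that this revert restores
in upgrade.py is: save each filesystem's max_mds, scale it down to a single active
rank, wait for the extra ranks to stop, upgrade the MDS daemons, then restore the
saved max_mds. The sketch below is an illustration of that flow only, not code from
the patch; mon_command() and the fsmap dictionaries are simplified stand-ins for the
mgr interfaces (check_mon_command, self.mgr.get("fs_map")) used in upgrade.py.

from typing import Any, Dict

def mon_command(cmd: Dict[str, Any]) -> None:
    # Stand-in for MgrModule.check_mon_command(); prints instead of talking
    # to a monitor so the sketch stays self-contained and runnable.
    print("mon command:", cmd)

def scale_down_before_mds_upgrade(fsmap: Dict[str, Any],
                                  original_max_mds: Dict[int, int]) -> bool:
    """Reduce every filesystem to one active MDS, remembering the old max_mds.

    Returns True once all filesystems are down to a single active rank and
    the MDS daemons can be upgraded; the caller re-invokes this on each pass
    of the upgrade loop (mirroring continue_upgrade in upgrade.py)."""
    ready = True
    for fs in fsmap['filesystems']:
        fscid = fs['id']
        mdsmap = fs['mdsmap']
        if mdsmap['max_mds'] > 1:
            # Save the original value so it can be restored after the upgrade.
            original_max_mds.setdefault(fscid, mdsmap['max_mds'])
            mon_command({'prefix': 'fs set', 'fs_name': mdsmap['fs_name'],
                         'var': 'max_mds', 'val': '1'})
            ready = False  # check again on the next pass
        elif not (mdsmap['in'] == [0] and len(mdsmap['up']) <= 1):
            ready = False  # still waiting for extra ranks to stop
    return ready

def restore_after_mds_upgrade(fsmap: Dict[str, Any],
                              original_max_mds: Dict[int, int]) -> None:
    # Mirrors _complete_mds_upgrade() after the revert: put max_mds back.
    for fs in fsmap['filesystems']:
        fscid = fs['id']
        if fscid in original_max_mds:
            mon_command({'prefix': 'fs set', 'fs_name': fs['mdsmap']['fs_name'],
                         'var': 'max_mds', 'val': str(original_max_mds[fscid])})

if __name__ == '__main__':
    # Hypothetical two-rank filesystem, shaped loosely like an fsmap entry.
    saved: Dict[int, int] = {}
    fsmap = {'filesystems': [
        {'id': 1, 'mdsmap': {'fs_name': 'cephfs', 'max_mds': 2,
                             'in': [0, 1], 'up': {'mds_0': 0, 'mds_1': 1}}}]}
    scale_down_before_mds_upgrade(fsmap, saved)   # issues: fs set max_mds 1
    # ... MDS daemons would be upgraded here ...
    restore_after_mds_upgrade(fsmap, saved)       # issues: fs set max_mds 2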