From: David Zafman Date: Sun, 8 Nov 2020 17:07:04 +0000 (-0800) Subject: osd test: Delay reporting until mon_warn_older_version_delay has passed X-Git-Tag: v16.1.0~634^2~2 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=93373746f529e8485748a8ffaa8a50be25d3287d;p=ceph.git osd test: Delay reporting until mon_warn_older_version_delay has passed Move release notes description to 16.0.0 and update Update documentation Signed-off-by: David Zafman --- diff --git a/PendingReleaseNotes b/PendingReleaseNotes index bf7b727a2499..b900e2b50006 100644 --- a/PendingReleaseNotes +++ b/PendingReleaseNotes @@ -22,6 +22,14 @@ ``osd_scrub_end_week_day`` are 0 - 6. The use of 7 is now illegal. Specifying ``0`` for both values causes every day of the week to be allowed. +* A new health check will warn if different versions of Ceph are running + on daemons. It will be a health error if multiple versions are detected. + This condition must exist for over 1 week by default in order for the + health condition to be triggered. This allows most upgrades to proceed + without falsely seeing the warning. If an upgrade is paused for an extended + time period, health mute can be used like this + "ceph health mute DAEMON_OLD_VERSION --sticky". In this case, after + upgrading has finished, use "ceph health unmute DAEMON_OLD_VERSION". >=15.0.0 -------- @@ -196,9 +204,3 @@ * ``ceph pg #.# list_unfound`` output has been enhanced to provide might_have_unfound information which indicates which OSDs may contain the unfound objects. - -* A new health check will warn if different versions of Ceph are running - on deamons. This will be an expected warning while in the middle of - an upgrade. A manual upgrade should health mute like this - "ceph health mute DAEMON_OLD_VERSION --sticky". After upgrading has - finish use ceph health unmute DAEMON_OLD_VERSION". 
diff --git a/doc/rados/operations/health-checks.rst b/doc/rados/operations/health-checks.rst index 9a133df72b57..65aeedd5df38 100644 --- a/doc/rados/operations/health-checks.rst +++ b/doc/rados/operations/health-checks.rst @@ -29,10 +29,13 @@ DAEMON_OLD_VERSION __________________ Warn if old version(s) of Ceph are running on any deamons. -This will be an expected warning while in the middle of -an upgrade. A manual upgrade should health mute like this -"ceph health mute DAEMON_OLD_VERSION --sticky". After upgrading has -finish use ceph health unmute DAEMON_OLD_VERSION". +It will be a health error if multiple versions are detected. +This condition must exist for over 1 week by default in order for the +health condition to be triggered. This allows most upgrades to proceed +without falsely seeing the warning. If an upgrade is paused for an extended +time period, health mute can be used like this +"ceph health mute DAEMON_OLD_VERSION --sticky". In this case, after +upgrading has finished, use "ceph health unmute DAEMON_OLD_VERSION". 
MON_DOWN ________ diff --git a/qa/standalone/misc/ver-health.sh b/qa/standalone/misc/ver-health.sh index 4dfbdb34da28..3f3350a22c97 100755 --- a/qa/standalone/misc/ver-health.sh +++ b/qa/standalone/misc/ver-health.sh @@ -46,8 +46,8 @@ function TEST_check_version_health_1() { setup $dir || return 1 # create a cluster with one monitor and three osds - run_mon $dir a --public-addr=$CEPH_MON_A || return 1 - run_mon $dir b --public-addr=$CEPH_MON_B || return 1 + run_mon $dir a --public-addr=$CEPH_MON_A --mon_warn_older_version_delay=0.0 || return 1 + run_mon $dir b --public-addr=$CEPH_MON_B --mon_warn_older_version_delay=0.0 || return 1 run_osd $dir 0 || return 1 run_osd $dir 1 || return 1 run_osd $dir 2 || return 1 @@ -95,8 +95,8 @@ function TEST_check_version_health_2() { setup $dir || return 1 # create a cluster with one monitor and three osds - run_mon $dir a --public-addr=$CEPH_MON_A || return 1 - run_mon $dir b --public-addr=$CEPH_MON_B || return 1 + run_mon $dir a --public-addr=$CEPH_MON_A --mon_warn_older_version_delay=0.0 || return 1 + run_mon $dir b --public-addr=$CEPH_MON_B --mon_warn_older_version_delay=0.0 || return 1 run_osd $dir 0 || return 1 run_osd $dir 1 || return 1 run_osd $dir 2 || return 1 @@ -111,7 +111,7 @@ function TEST_check_version_health_2() { ceph health detail | grep DAEMON_OLD_VERSION && return 1 kill_daemons $dir KILL mon.b - EXTRA_OPTS=" --debug_version_for_testing=01.00.00-gversion-test" run_mon $dir b + EXTRA_OPTS=" --debug_version_for_testing=01.00.00-gversion-test" run_mon $dir b --mon_warn_older_version_delay=0.0 # XXX: Manager doesn't seem to use the test specific config for version #kill_daemons $dir KILL mgr.x #EXTRA_OPTS=" --debug_version_for_testing=02.00.00-gversion-test" run_mgr $dir x @@ -140,4 +140,61 @@ function TEST_check_version_health_2() { ceph health detail | grep -q "osd.0 is running an older version of ceph: 02.00.00-gversion-test" || return 1 } +# Verify that DAEMON_OLD_VERSION is not reported until mon_warn_older_version_delay has passed +function TEST_check_version_health_3() { + 
local dir=$1 + + # Assume MON_A is the leader? + CEPH_ARGS="$ORIG_CEPH_ARGS --mon-host=$CEPH_MON_A " + # setup + setup $dir || return 1 + + # create a cluster with two monitors, three osds, two mgrs and two mds + run_mon $dir a --public-addr=$CEPH_MON_A --mon_warn_older_version_delay=20.0 || return 1 + run_mon $dir b --public-addr=$CEPH_MON_B --mon_warn_older_version_delay=20.0 || return 1 + run_osd $dir 0 || return 1 + run_osd $dir 1 || return 1 + run_osd $dir 2 || return 1 + run_mgr $dir x || return 1 + run_mgr $dir y || return 1 + run_mds $dir m || return 1 + run_mds $dir n || return 1 + + sleep 5 + ceph health detail + # DAEMON_OLD_VERSION should not be reported yet + ceph health detail | grep DAEMON_OLD_VERSION && return 1 + + kill_daemons $dir KILL osd.1 + EXTRA_OPTS=" --debug_version_for_testing=01.00.00-gversion-test" activate_osd $dir 1 + sleep 5 # give kill time + + sleep 10 + # DAEMON_OLD_VERSION should still not be reported: the 20 second delay has not elapsed + ceph health detail | grep DAEMON_OLD_VERSION && return 1 + + # Now make sure that at least 20 seconds have passed + sleep 10 + + ceph health detail + # Should notice that osd.1 is a different version + ceph health | grep -q "HEALTH_WARN .*There is a daemon running an older version of ceph" || return 1 + ceph health detail | grep -q "HEALTH_WARN .*There is a daemon running an older version of ceph" || return 1 + ceph health detail | grep -q "^[[]WRN[]] DAEMON_OLD_VERSION: There is a daemon running an older version of ceph" || return 1 + ceph health detail | grep -q "osd.1 is running an older version of ceph: 01.00.00-gversion-test" || return 1 + + kill_daemons $dir KILL osd.2 + EXTRA_OPTS=" --debug_version_for_testing=01.00.00-gversion-test" activate_osd $dir 2 + kill_daemons $dir KILL osd.0 + EXTRA_OPTS=" --debug_version_for_testing=02.00.00-gversion-test" activate_osd $dir 0 + sleep 5 + + ceph health detail + ceph health | grep -q "HEALTH_ERR .*There are daemons running multiple old versions of ceph" || return 1 + ceph health detail | grep -q "HEALTH_ERR .*There are daemons running multiple old 
versions of ceph" || return 1 + ceph health detail | grep -q "^[[]ERR[]] DAEMON_OLD_VERSION: There are daemons running multiple old versions of ceph" || return 1 + ceph health detail | grep -q "osd.1 osd.2 are running an older version of ceph: 01.00.00-gversion-test" || return 1 + ceph health detail | grep -q "osd.0 is running an older version of ceph: 02.00.00-gversion-test" || return 1 +} + main ver-health "$@" diff --git a/src/common/options.cc b/src/common/options.cc index ac0cb9b3a0d6..8f50223893ea 100644 --- a/src/common/options.cc +++ b/src/common/options.cc @@ -2233,6 +2233,11 @@ std::vector