osd test: Delay reporting until mon_warn_older_version_delay has passed

author David Zafman <dzafman@redhat.com>

Sun, 8 Nov 2020 17:07:04 +0000 (09:07 -0800)

committer David Zafman <dzafman@redhat.com>

Wed, 11 Nov 2020 23:10:11 +0000 (15:10 -0800)
author David Zafman <dzafman@redhat.com>
Sun, 8 Nov 2020 17:07:04 +0000 (09:07 -0800)
committer David Zafman <dzafman@redhat.com>
Wed, 11 Nov 2020 23:10:11 +0000 (15:10 -0800)
diff --git a/PendingReleaseNotes b/PendingReleaseNotes

index bf7b727a2499a5c7feaaee3faeb4fef658af3279..b900e2b50006966d8745a1f8a324e1dc27cc4d73 100644 (file)
--- a/PendingReleaseNotes
+++ b/PendingReleaseNotes
@@ -22,6 +22,14 @@
    ``osd_scrub_end_week_day`` are 0 - 6.  The use of 7 is now illegal.
    Specifying ``0`` for both values causes every day of the week to be allowed.
  
+* A new health check will warn if different versions of Ceph are running
+  on daemons. It will be a health error if multiple old versions are detected.
+  This condition must exist for over 1 week by default in order for the
+  health condition to be triggered.  This allows most upgrades to proceed
+  without falsely seeing the warning.  If an upgrade is paused for an extended
+  time period, health mute can be used like this:
+  "ceph health mute DAEMON_OLD_VERSION --sticky".  In this case, after the
+  upgrade has finished, use "ceph health unmute DAEMON_OLD_VERSION".
  
  >=15.0.0
  --------
@@ -196,9 +204,3 @@
  * ``ceph pg #.# list_unfound`` output has been enhanced to provide
    might_have_unfound information which indicates which OSDs may
    contain the unfound objects.
-
-* A new health check will warn if different versions of Ceph are running
-  on deamons. This will be an expected warning while in the middle of
-  an upgrade. A manual upgrade should health mute like this
-  "ceph health mute DAEMON_OLD_VERSION --sticky".  After upgrading has
-  finish use ceph health unmute DAEMON_OLD_VERSION".
diff --git a/doc/rados/operations/health-checks.rst b/doc/rados/operations/health-checks.rst

index 9a133df72b57bff532088f648727d25d6aef22bc..65aeedd5df388349194847e78441f7ccd280d5c6 100644 (file)
--- a/doc/rados/operations/health-checks.rst
+++ b/doc/rados/operations/health-checks.rst
@@ -29,10 +29,13 @@ DAEMON_OLD_VERSION
  __________________
  
  Warn if old version(s) of Ceph are running on any daemons.
-This will be an expected warning while in the middle of
-an upgrade. A manual upgrade should health mute like this
-"ceph health mute DAEMON_OLD_VERSION --sticky".  After upgrading has
-finish use ceph health unmute DAEMON_OLD_VERSION".
+It will be a health error if multiple old versions are detected.
+This condition must exist for over 1 week by default in order for the
+health condition to be triggered.  This allows most upgrades to proceed
+without falsely seeing the warning.  If an upgrade is paused for an extended
+time period, health mute can be used like this:
+"ceph health mute DAEMON_OLD_VERSION --sticky".  In this case, after the
+upgrade has finished, use "ceph health unmute DAEMON_OLD_VERSION".
  
  MON_DOWN
  ________
diff --git a/qa/standalone/misc/ver-health.sh b/qa/standalone/misc/ver-health.sh

index 4dfbdb34da284c168801de3c3f115d6270b2d13d..3f3350a22c97b01458b98ab4afa6613048ba6e39 100755 (executable)
--- a/qa/standalone/misc/ver-health.sh
+++ b/qa/standalone/misc/ver-health.sh
@@ -46,8 +46,8 @@ function TEST_check_version_health_1() {
      setup $dir || return 1
  
      # create a cluster with one monitor and three osds
-    run_mon $dir a --public-addr=$CEPH_MON_A || return 1
-    run_mon $dir b --public-addr=$CEPH_MON_B || return 1
+    run_mon $dir a --public-addr=$CEPH_MON_A --mon_warn_older_version_delay=0.0 || return 1
+    run_mon $dir b --public-addr=$CEPH_MON_B --mon_warn_older_version_delay=0.0 || return 1
      run_osd $dir 0 || return 1
      run_osd $dir 1 || return 1
      run_osd $dir 2 || return 1
@@ -95,8 +95,8 @@ function TEST_check_version_health_2() {
      setup $dir || return 1
  
      # create a cluster with one monitor and three osds
-    run_mon $dir a --public-addr=$CEPH_MON_A || return 1
-    run_mon $dir b --public-addr=$CEPH_MON_B || return 1
+    run_mon $dir a --public-addr=$CEPH_MON_A --mon_warn_older_version_delay=0.0 || return 1
+    run_mon $dir b --public-addr=$CEPH_MON_B --mon_warn_older_version_delay=0.0 || return 1
      run_osd $dir 0 || return 1
      run_osd $dir 1 || return 1
      run_osd $dir 2 || return 1
@@ -111,7 +111,7 @@ function TEST_check_version_health_2() {
      ceph health detail | grep DAEMON_OLD_VERSION && return 1
  
      kill_daemons $dir KILL mon.b
-    EXTRA_OPTS=" --debug_version_for_testing=01.00.00-gversion-test" run_mon $dir b
+    EXTRA_OPTS=" --debug_version_for_testing=01.00.00-gversion-test" run_mon $dir b --mon_warn_older_version_delay=0.0
      # XXX: Manager doesn't seem to use the test specific config for version
      #kill_daemons $dir KILL mgr.x
      #EXTRA_OPTS=" --debug_version_for_testing=02.00.00-gversion-test" run_mgr $dir x
@@ -140,4 +140,61 @@ function TEST_check_version_health_2() {
      ceph health detail | grep -q "osd.0 is running an older version of ceph: 02.00.00-gversion-test" || return 1
  }
  
+# Verify delay handling
+function TEST_check_version_health_3() {
+    local dir=$1
+
+    # Assume MON_A is leader?
+    CEPH_ARGS="$ORIG_CEPH_ARGS --mon-host=$CEPH_MON_A "
+    # setup
+    setup $dir || return 1
+
+    # create a cluster with two monitors, three osds, two mgrs and two mds
+    run_mon $dir a --public-addr=$CEPH_MON_A --mon_warn_older_version_delay=20.0 || return 1
+    run_mon $dir b --public-addr=$CEPH_MON_B --mon_warn_older_version_delay=20.0 || return 1
+    run_osd $dir 0 || return 1
+    run_osd $dir 1 || return 1
+    run_osd $dir 2 || return 1
+    run_mgr $dir x || return 1
+    run_mgr $dir y || return 1
+    run_mds $dir m || return 1
+    run_mds $dir n || return 1
+
+    sleep 5
+    ceph health detail
+    # should not see this yet
+    ceph health detail | grep DAEMON_OLD_VERSION && return 1
+
+    kill_daemons $dir KILL osd.1
+    EXTRA_OPTS=" --debug_version_for_testing=01.00.00-gversion-test" activate_osd $dir 1
+    sleep 5 # give kill time
+
+    sleep 10
+    # should not see this yet
+    ceph health detail | grep DAEMON_OLD_VERSION && return 1
+
+    # Now make sure that at least 20 seconds have passed
+    sleep 10
+
+    ceph health detail
+    # Should notice that osd.1 is a different version
+    ceph health | grep -q "HEALTH_WARN .*There is a daemon running an older version of ceph" || return 1
+    ceph health detail | grep -q "HEALTH_WARN .*There is a daemon running an older version of ceph" || return 1
+    ceph health detail | grep -q "^[[]WRN[]] DAEMON_OLD_VERSION: There is a daemon running an older version of ceph" || return 1
+    ceph health detail | grep -q "osd.1 is running an older version of ceph: 01.00.00-gversion-test" || return 1
+
+    kill_daemons $dir KILL osd.2
+    EXTRA_OPTS=" --debug_version_for_testing=01.00.00-gversion-test" activate_osd $dir 2
+    kill_daemons $dir KILL osd.0
+    EXTRA_OPTS=" --debug_version_for_testing=02.00.00-gversion-test" activate_osd $dir 0
+    sleep 5
+
+    ceph health detail
+    ceph health | grep -q "HEALTH_ERR .*There are daemons running multiple old versions of ceph" || return 1
+    ceph health detail | grep -q "HEALTH_ERR .*There are daemons running multiple old versions of ceph" || return 1
+    ceph health detail | grep -q "^[[]ERR[]] DAEMON_OLD_VERSION: There are daemons running multiple old versions of ceph" || return 1
+    ceph health detail | grep -q "osd.1 osd.2 are running an older version of ceph: 01.00.00-gversion-test" || return 1
+    ceph health detail | grep -q "osd.0 is running an older version of ceph: 02.00.00-gversion-test" || return 1
+}
+
  main ver-health "$@"
diff --git a/src/common/options.cc b/src/common/options.cc

index ac0cb9b3a0d60c4c4af6068fa4fcadaa976ca1a8..8f50223893eac97a3866c47d769523bc2052dd93 100644 (file)
--- a/src/common/options.cc
+++ b/src/common/options.cc
@@ -2233,6 +2233,11 @@ std::vector<Option> get_global_options() {
      .add_service("mon")
      .set_description("issue DAEMON_OLD_VERSION health warning if daemons are not all running the same version"),
  
+    Option("mon_warn_older_version_delay", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(7_day)
+    .add_service("mon")
+    .set_description("issue DAEMON_OLD_VERSION health warning after this amount of time has elapsed"),
+
      // PAXOS
  
      Option("paxos_stash_full_interval", Option::TYPE_INT, Option::LEVEL_ADVANCED)
diff --git a/src/mon/HealthMonitor.cc b/src/mon/HealthMonitor.cc

index 582f99f9b5ffa0644110e33248126bb67e9f75b8..0e6bf5be2d4ce251a09e633fd8fd225071209b7c 100644 (file)
--- a/src/mon/HealthMonitor.cc
+++ b/src/mon/HealthMonitor.cc
@@ -684,8 +684,14 @@ bool HealthMonitor::check_leader_health()
  
    health_check_map_t next;
  
+  static utime_t old_version_first_time;
+
   // DAEMON_OLD_VERSION
    if (g_conf().get_val<bool>("mon_warn_on_older_version")) {
+    utime_t now = ceph_clock_now();
+    if (old_version_first_time == utime_t())
+      old_version_first_time = now;
+    if ((now - old_version_first_time) > g_conf().get_val<double>("mon_warn_older_version_delay")) {
    std::map<string, std::list<string> > all_versions;
    mon->get_all_versions(all_versions);
    if (all_versions.size() > 1) {
@@ -720,6 +726,9 @@ bool HealthMonitor::check_leader_health()
           << " running an older version of ceph: " << g.first;
        d.detail.push_back(ds.str());
      }
+  } else {
+    old_version_first_time = utime_t();
+  }
    }
    }
author	David Zafman <dzafman@redhat.com>
	Sun, 8 Nov 2020 17:07:04 +0000 (09:07 -0800)
committer	David Zafman <dzafman@redhat.com>
	Wed, 11 Nov 2020 23:10:11 +0000 (15:10 -0800)
PendingReleaseNotes		patch \| blob \| history
doc/rados/operations/health-checks.rst		patch \| blob \| history
qa/standalone/misc/ver-health.sh		patch \| blob \| history
src/common/options.cc		patch \| blob \| history
src/mon/HealthMonitor.cc		patch \| blob \| history