osd, test: Special Nautilus handling because there is no health mute

author David Zafman <dzafman@redhat.com>

Thu, 30 Jul 2020 00:55:10 +0000 (00:55 +0000)

committer David Zafman <dzafman@redhat.com>

Sat, 8 Aug 2020 00:29:20 +0000 (00:29 +0000)
author David Zafman <dzafman@redhat.com>
Thu, 30 Jul 2020 00:55:10 +0000 (00:55 +0000)
committer David Zafman <dzafman@redhat.com>
Sat, 8 Aug 2020 00:29:20 +0000 (00:29 +0000)
diff --git a/PendingReleaseNotes b/PendingReleaseNotes

index 0cb749f0283f951b87f583e786ec5b0681cf1dde..61af03c31fb0617806ed155267b467157adc961d 100644 (file)
--- a/PendingReleaseNotes
+++ b/PendingReleaseNotes
@@ -15,4 +15,9 @@
  
  * Monitors now have a config option ``mon_osd_warn_num_repaired``, 10 by default.
    If any OSD has repaired more than this many I/O errors in stored data a
- ``OSD_TOO_MANY_REPAIRS`` health warning is generated.
+  ``OSD_TOO_MANY_REPAIRS`` health warning is generated.  In order to allow
+  clearing of the warning, a new command ``ceph tell osd.# clear_shards_repaired [count]``
+  has been added.  By default it will set the repair count to 0.  If you want
+  to be warned again as soon as any additional repairs are performed, you can
+  pass the value of ``mon_osd_warn_num_repaired`` as the count to the command.
+  This command will be replaced in future releases by the health mute/unmute feature.
diff --git a/doc/rados/operations/health-checks.rst b/doc/rados/operations/health-checks.rst

index 5a2cb96280640cc24092cc243c37293233402f3a..7bf477e91701ba442a62b3edf0aa25ca7f89db9a 100644 (file)
--- a/doc/rados/operations/health-checks.rst
+++ b/doc/rados/operations/health-checks.rst
@@ -594,6 +594,13 @@ that aren't seeing scrub errors, a count of read repairs is maintained.  If
  it exceeds a config value threshold *mon_osd_warn_num_repaired* default 10,
  this health warning is generated.
  
+In order to allow clearing of the warning, a new command
+``ceph tell osd.# clear_shards_repaired [count]`` has been added.
+By default it will set the repair count to 0.  If you want the warning to be
+raised again as soon as any additional repairs are performed, you can pass the
+value of ``mon_osd_warn_num_repaired`` as the count to the command.
+This command will be replaced in future releases by the health mute/unmute feature.
+
  LARGE_OMAP_OBJECTS
  __________________
  
diff --git a/qa/standalone/osd/osd-rep-recov-eio.sh b/qa/standalone/osd/osd-rep-recov-eio.sh

index 613bfc316f7ca2c23e66ee6ba210b1559676f99a..ab7c1501e4269bb8b26ea677302b0827e3851be2 100755 (executable)
--- a/qa/standalone/osd/osd-rep-recov-eio.sh
+++ b/qa/standalone/osd/osd-rep-recov-eio.sh
@@ -209,12 +209,17 @@ function TEST_rados_repair_warning() {
      ceph health | grep -q "Too many repaired reads on 1 OSDs" || return 1
      ceph health detail | grep -q "osd.$primary had $OBJS reads repaired" || return 1
  
-    ceph health mute OSD_TOO_MANY_REPAIRS
+    ceph tell osd.$primary clear_shards_repaired
+    sleep 10
+
      set -o pipefail
      # Should mute this
      ceph health | $(! grep -q "Too many repaired reads on 1 OSDs") || return 1
      set +o pipefail
  
+    ceph tell osd.$primary clear_shards_repaired $OBJS
+    sleep 10
+
      for i in $(seq 1 $OBJS)
       do
         inject_$inject rep data $poolname ${objbase}-$i $dir 0 || return 1
@@ -230,7 +235,7 @@ function TEST_rados_repair_warning() {
      COUNT=$(ceph pg dump --format=json-pretty | jq ".pg_map.osd_stats_sum.num_shards_repaired")
      test "$COUNT" = "$(expr $OBJS \* 3)" || return 1
  
-    # Give mon a chance to notice additional OSD and unmute
+    # Give mon a chance to notice additional OSD and reset num_shards_repaired
      # The default tick time is 5 seconds
      CHECKTIME=10
      LOOPS=0
diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc

index 5e6064e0ef6861b20fe60c210c459419b5acbe4f..9c59bce24da89a55d05b674a779181623aaaffd2 100644 (file)
--- a/src/osd/OSD.cc
+++ b/src/osd/OSD.cc
@@ -982,6 +982,13 @@ void OSDService::inc_osd_stat_repaired()
    return;
  }
  
+void OSDService::set_osd_stat_repaired(int64_t count)
+{
+  std::lock_guard l(stat_lock);
+  osd_stat.num_shards_repaired = count;
+  return;
+}
+
  float OSDService::compute_adjusted_ratio(osd_stat_t new_stat, float *pratio,
                                          uint64_t adjust_used)
  {
@@ -6750,6 +6757,10 @@ COMMAND("cluster_log " \
         "name=message,type=CephString,n=N",
         "log a message to the cluster log",
         "osd", "rw")
+COMMAND("clear_shards_repaired " \
+       "name=count,type=CephInt,req=false",
+       "clear num_shards_repaired to clear health warning",
+       "osd", "rw")
  COMMAND("bench " \
         "name=count,type=CephInt,req=false " \
         "name=size,type=CephInt,req=false " \
@@ -6968,6 +6979,11 @@ int OSD::_do_command(
      }
      clog->do_log(level, message);
    }
+  else if (prefix == "clear_shards_repaired") {
+    int64_t count;
+    cmd_getval(cct, cmdmap, "count", count, (int64_t) 0);
+    service.set_osd_stat_repaired(count);
+  }
  
    // either 'pg <pgid> <command>' or
    // 'tell <pgid>' (which comes in without any of that prefix)?
diff --git a/src/osd/OSD.h b/src/osd/OSD.h

index 80ca9af538cd006c21c381516eb515a403c09fd4..8c87823d2da70832c340e7f89171c4dcbb796e15 100644 (file)
--- a/src/osd/OSD.h
+++ b/src/osd/OSD.h
@@ -909,6 +909,7 @@ public:
      osd_alert_list_t& alerts);
    osd_stat_t set_osd_stat(vector<int>& hb_peers, int num_pgs);
    void inc_osd_stat_repaired(void);
+  void set_osd_stat_repaired(int64_t);
    float compute_adjusted_ratio(osd_stat_t new_stat, float *pratio, uint64_t adjust_used = 0);
    osd_stat_t get_osd_stat() {
      std::lock_guard l(stat_lock);
diff --git a/src/osd/PGBackend.h b/src/osd/PGBackend.h

index b9641a5305c7b9b3caeece706b47188686f454e2..e19695fcd6dbb855198981000575c79323525ad0 100644 (file)
--- a/src/osd/PGBackend.h
+++ b/src/osd/PGBackend.h
@@ -298,6 +298,7 @@ typedef std::shared_ptr<const OSDMap> OSDMapRef;
  
       virtual bool pg_is_repair() = 0;
       virtual void inc_osd_stat_repaired() = 0;
+     virtual void set_osd_stat_repaired(int64_t) = 0;
       virtual bool pg_is_remote_backfilling() = 0;
       virtual void pg_add_local_num_bytes(int64_t num_bytes) = 0;
       virtual void pg_sub_local_num_bytes(int64_t num_bytes) = 0;
diff --git a/src/osd/PrimaryLogPG.h b/src/osd/PrimaryLogPG.h

index 56fdc87fe60915e8c064fded3e5ecbcfd9155b58..39f3b04c28db7bb11feaa127a38ef995472f3ba3 100644 (file)
--- a/src/osd/PrimaryLogPG.h
+++ b/src/osd/PrimaryLogPG.h
@@ -410,6 +410,9 @@ public:
    void inc_osd_stat_repaired() override {
      osd->inc_osd_stat_repaired();
    }
+  void set_osd_stat_repaired(int64_t count) override {
+    osd->set_osd_stat_repaired(count);
+  }
    bool pg_is_remote_backfilling() override {
      return is_remote_backfilling();
    }
author	David Zafman <dzafman@redhat.com>
	Thu, 30 Jul 2020 00:55:10 +0000 (00:55 +0000)
committer	David Zafman <dzafman@redhat.com>
	Sat, 8 Aug 2020 00:29:20 +0000 (00:29 +0000)
PendingReleaseNotes		patch \| blob \| history
doc/rados/operations/health-checks.rst		patch \| blob \| history
qa/standalone/osd/osd-rep-recov-eio.sh		patch \| blob \| history
src/osd/OSD.cc		patch \| blob \| history
src/osd/OSD.h		patch \| blob \| history
src/osd/PGBackend.h		patch \| blob \| history
src/osd/PrimaryLogPG.h		patch \| blob \| history