mgr: Warn when too many reads are repaired on an OSD

author David Zafman <dzafman@redhat.com>

Wed, 10 Jun 2020 02:24:00 +0000 (19:24 -0700)

committer Josh Durgin <jdurgin@redhat.com>

Wed, 1 Jul 2020 19:25:45 +0000 (12:25 -0700)
author David Zafman <dzafman@redhat.com>
Wed, 10 Jun 2020 02:24:00 +0000 (19:24 -0700)
committer Josh Durgin <jdurgin@redhat.com>
Wed, 1 Jul 2020 19:25:45 +0000 (12:25 -0700)
diff --git a/PendingReleaseNotes b/PendingReleaseNotes

index e49425bd0a3ed3cc115f38c279b74da5f3afcae1..ff977162fddf112f4c49aa08e147a99f35232d63 100644 (file)
--- a/PendingReleaseNotes
+++ b/PendingReleaseNotes
@@ -1,3 +1,6 @@
  >=15.2.5
  --------
  
+* Monitors now have a config option ``mon_osd_warn_num_repaired``, 10 by default.
+  If any OSD has repaired more than this many I/O errors in stored data, an
+  ``OSD_TOO_MANY_REPAIRS`` health warning is generated.
diff --git a/doc/rados/operations/health-checks.rst b/doc/rados/operations/health-checks.rst

index bd71a2ee937f8c8918d8842ed7183868866d84d6..4b3d5a7a2f58e0289a819a2f360ec0f1549015d8 100644 (file)
--- a/doc/rados/operations/health-checks.rst
+++ b/doc/rados/operations/health-checks.rst
@@ -688,6 +688,16 @@ paired with *PG_DAMAGED* (see above).
  
  See :doc:`pg-repair` for more information.
  
+OSD_TOO_MANY_REPAIRS
+____________________
+
+When a read error occurs and another replica is available, it is used to repair
+the error immediately, so that the client can get the object data.  Scrub
+handles errors for data at rest.  In order to identify possible failing disks
+that aren't seeing scrub errors, a count of read repairs is maintained.  If
+it exceeds the configured threshold *mon_osd_warn_num_repaired* (default 10),
+this health warning is generated.
+
  LARGE_OMAP_OBJECTS
  __________________
  
diff --git a/qa/standalone/osd/osd-rep-recov-eio.sh b/qa/standalone/osd/osd-rep-recov-eio.sh

index 8dce41a98bbfd1906e277c75b1471b714de4a99d..613bfc316f7ca2c23e66ee6ba210b1559676f99a 100755 (executable)
--- a/qa/standalone/osd/osd-rep-recov-eio.sh
+++ b/qa/standalone/osd/osd-rep-recov-eio.sh
@@ -19,6 +19,8 @@
  
  source $CEPH_ROOT/qa/standalone/ceph-helpers.sh
  
+warnings=10
+
  function run() {
      local dir=$1
      shift
@@ -32,7 +34,8 @@ function run() {
      local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
      for func in $funcs ; do
          setup $dir || return 1
-        run_mon $dir a || return 1
+        # Set the warning threshold explicitly in case the default changes
+        run_mon $dir a --mon_osd_warn_num_repaired=$warnings || return 1
         run_mgr $dir x || return 1
         ceph osd pool create foo 8 || return 1
  
@@ -171,6 +174,86 @@ function TEST_rados_get_with_eio() {
      delete_pool $poolname
  }
  
+function TEST_rados_repair_warning() {
+    local dir=$1
+    local OBJS=$(expr $warnings + 1)
+
+    setup_osds 4 || return 1
+
+    local poolname=pool-rep
+    create_pool $poolname 1 1 || return 1
+    wait_for_clean || return 1
+
+    local poolname=pool-rep
+    local obj-base=obj-warn-
+    local inject=eio
+
+   for i in $(seq 1 $OBJS)
+    do
+      rados_put $dir $poolname ${objbase}-$i || return 1
+      inject_$inject rep data $poolname ${objbase}-$i $dir 0 || return 1
+      rados_get $dir $poolname ${objbase}-$i || return 1
+    done
+    local pgid=$(get_pg $poolname ${objbase}-1)
+
+    local object_osds=($(get_osds $poolname ${objbase}-1))
+    local primary=${object_osds[0]}
+    local bad_peer=${object_osds[1]}
+
+    COUNT=$(ceph pg $pgid query | jq '.info.stats.stat_sum.num_objects_repaired')
+    test "$COUNT" = "$OBJS" || return 1
+    flush_pg_stats
+    COUNT=$(ceph pg dump --format=json-pretty | jq ".pg_map.osd_stats_sum.num_shards_repaired")
+    test "$COUNT" = "$OBJS" || return 1
+
+    ceph health | grep -q "Too many repaired reads on 1 OSDs" || return 1
+    ceph health detail | grep -q "osd.$primary had $OBJS reads repaired" || return 1
+
+    ceph health mute OSD_TOO_MANY_REPAIRS
+    set -o pipefail
+    # Should mute this
+    ceph health | $(! grep -q "Too many repaired reads on 1 OSDs") || return 1
+    set +o pipefail
+
+    for i in $(seq 1 $OBJS)
+     do
+       inject_$inject rep data $poolname ${objbase}-$i $dir 0 || return 1
+       inject_$inject rep data $poolname ${objbase}-$i $dir 1 || return 1
+       # Force primary to pull from the bad peer, so we can repair it too!
+       set_config osd $primary osd_debug_feed_pullee $bad_peer || return 1
+       rados_get $dir $poolname ${objbase}-$i || return 1
+    done
+
+    COUNT=$(ceph pg $pgid query | jq '.info.stats.stat_sum.num_objects_repaired')
+    test "$COUNT" = "$(expr $OBJS \* 2)" || return 1
+    flush_pg_stats
+    COUNT=$(ceph pg dump --format=json-pretty | jq ".pg_map.osd_stats_sum.num_shards_repaired")
+    test "$COUNT" = "$(expr $OBJS \* 3)" || return 1
+
+    # Give mon a chance to notice additional OSD and unmute
+    # The default tick time is 5 seconds
+    CHECKTIME=10
+    LOOPS=0
+    while(true)
+    do
+      sleep 1
+      if ceph health | grep -q "Too many repaired reads on 2 OSDs"
+      then
+             break
+      fi
+      LOOPS=$(expr $LOOPS + 1)
+      if test "$LOOPS" = "$CHECKTIME"
+      then
+             echo "Too many repaired reads not seen after $CHECKTIME seconds"
+             return 1
+      fi
+    done
+    ceph health detail | grep -q "osd.$primary had $(expr $OBJS \* 2) reads repaired" || return 1
+    ceph health detail | grep -q "osd.$bad_peer had $OBJS reads repaired" || return 1
+
+    delete_pool $poolname
+}
+
  # Test backfill with unfound object
  function TEST_rep_backfill_unfound() {
      local dir=$1
diff --git a/qa/suites/rados/singleton/all/random-eio.yaml b/qa/suites/rados/singleton/all/random-eio.yaml

index fd120680515981ada53e3a4f0b4d8589dd0daa28..1d5adae75877114ddcf1c9c6d0f1a59bbf6727a2 100644 (file)
--- a/qa/suites/rados/singleton/all/random-eio.yaml
+++ b/qa/suites/rados/singleton/all/random-eio.yaml
@@ -24,6 +24,7 @@ tasks:
      - overall HEALTH_
      - \(POOL_APP_NOT_ENABLED\)
      - \(PG_DEGRADED\)
+    - \(OSD_TOO_MANY_REPAIRS\)
  - full_sequential:
    - exec:
        client.0:
diff --git a/src/common/options.cc b/src/common/options.cc

index b9d5b675a0a3e43e025a5d70d8018f4fd52b4ff6..331dac6a21104dcfd226b5d0c9d7a30c1d907886 100644 (file)
--- a/src/common/options.cc
+++ b/src/common/options.cc
@@ -1501,6 +1501,11 @@ std::vector<Option> get_global_options() {
      .add_service("mgr")
      .set_description("issue REQUEST_SLOW health warning if OSD ops are slower than this age (seconds)"),
  
+    Option("mon_osd_warn_num_repaired", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(10)
+    .add_service("mon")
+    .set_description("issue OSD_TOO_MANY_REPAIRS health warning if an OSD has more than this many read repairs"),
+
      Option("mon_osd_err_op_age_ratio", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
      .set_default(128)
      .add_service("mgr")
diff --git a/src/mon/PGMap.cc b/src/mon/PGMap.cc

index 636ab0c7156327814c384b2e1050a9a361226d66..a340fd0562a17bce41084c258136b25fe36b2ee7 100644 (file)
--- a/src/mon/PGMap.cc
+++ b/src/mon/PGMap.cc
@@ -2823,6 +2823,7 @@ void PGMap::get_health_checks(
  
      list<string> detail_back;
      list<string> detail_front;
+    list<string> detail;
      set<mon_ping_item_t> back_sorted, front_sorted;
      for (auto i : osd_stat) {
        for (auto j : i.second.hb_pingtime) {
@@ -2853,6 +2854,19 @@ void PGMap::get_health_checks(
           front_sorted.emplace(front);
         }
        }
+      if (i.second.num_shards_repaired >
+                     cct->_conf.get_val<uint64_t>("mon_osd_warn_num_repaired")) {
+        ostringstream ss;
+       ss << "osd." << i.first << " had " << i.second.num_shards_repaired << " reads repaired";
+        detail.push_back(ss.str());
+      }
+    }
+    if (!detail.empty()) {
+      ostringstream ss;
+      ss << "Too many repaired reads on " << detail.size() << " OSDs";
+      auto& d = checks->add("OSD_TOO_MANY_REPAIRS", HEALTH_WARN, ss.str(),
+                     detail.size());
+      d.detail.swap(detail);
      }
      int max_detail = 10;
      for (auto &sback : boost::adaptors::reverse(back_sorted)) {
author	David Zafman <dzafman@redhat.com>
	Wed, 10 Jun 2020 02:24:00 +0000 (19:24 -0700)
committer	Josh Durgin <jdurgin@redhat.com>
	Wed, 1 Jul 2020 19:25:45 +0000 (12:25 -0700)
PendingReleaseNotes		patch \| blob \| history
doc/rados/operations/health-checks.rst		patch \| blob \| history
qa/standalone/osd/osd-rep-recov-eio.sh		patch \| blob \| history
qa/suites/rados/singleton/all/random-eio.yaml		patch \| blob \| history
src/common/options.cc		patch \| blob \| history
src/mon/PGMap.cc		patch \| blob \| history