osd: Replace active/pending scrub tracking for local/remote
author     David Zafman <dzafman@redhat.com>
Wed, 31 Jul 2019 00:15:32 +0000 (00:15 +0000)
committer  David Zafman <dzafman@redhat.com>
Tue, 10 Sep 2019 20:33:27 +0000 (13:33 -0700)
This is similar to how recovery reservations are split between
local and remote.

Previously, scrubs_pending was used for reservations at the replicas
as well as at the primary while it was requesting reservations from
the replicas.  There was no need for scrubs_pending to turn into
scrubs_active at the primary, since nothing treated that value as
special; scrubber.active is set to true when scrubbing is actually
in progress.

Now scrubber.local_reserved indicates that scrubs_local was incremented
Now scrubber.remote_reserved indicates that scrubs_remote was incremented
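
For a quick sanity check of the new counters, the per-OSD reservation
dump can be queried through the admin socket.  A minimal sketch (osd.0
is only an example id; the jq filter mirrors the check used by the new
standalone test):

    # Dump the per-OSD scrub reservation counters.
    ceph daemon osd.0 dump_scrub_reservations
    # Local + remote reservations held should never exceed osd_max_scrubs.
    ceph daemon osd.0 dump_scrub_reservations | \
        jq '.scrubs_local + .scrubs_remote <= .osd_max_scrubs'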

Fixes: https://tracker.ceph.com/issues/41669
Signed-off-by: David Zafman <dzafman@redhat.com>
PendingReleaseNotes
doc/dev/osd_internals/scrub.rst
qa/standalone/scrub/osd-scrub-dump.sh [new file with mode: 0755]
src/osd/OSD.cc
src/osd/OSD.h
src/osd/PG.cc
src/osd/PG.h

diff --git a/PendingReleaseNotes b/PendingReleaseNotes
index d793483a08d1c2959c208f27621b57fc7f5bf8c9..1f819531271f575413a85e5fe642f3346f43f979 100644
   recovery locks held (in_progress) and waiting in priority queues.
 
 * New OSD daemon command dump_scrub_reservations which reveals the
-  scrub reservations that are active and pending.
+  scrub reservations that are held for local (primary) and remote (replica) PGs.
diff --git a/doc/dev/osd_internals/scrub.rst b/doc/dev/osd_internals/scrub.rst
index 3343b3986c52a9c48c8547cb947f40234b460807..f20dc042e458db2aa8dcbe0b6ae7c0d1aa66f4f9 100644
@@ -1,6 +1,9 @@
 
+Scrub internals and diagnostics
+===============================
+
 Scrubbing Behavior Table
-========================
+------------------------
 
 +-------------------------------------------------+----------+-----------+---------------+----------------------+
 |                                          Flags  | none     | noscrub   | nodeep_scrub  | noscrub/nodeep_scrub |
@@ -28,3 +31,11 @@ State variables
 - Initiated scrub state is  must_scrub && !must_deep_scrub && !time_for_deep
 - Initiated scrub after osd_deep_scrub_interval state is must scrub && !must_deep_scrub && time_for_deep
 - Initiated deep scrub state is  must_scrub && must_deep_scrub
+
+Scrub Reservations
+------------------
+
+An OSD daemon command dumps total local and remote reservations::
+
+  ceph daemon osd.<id> dump_scrub_reservations
+
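
Illustrative output of the reservation dump described above (the field
names match the dump; the counts are examples only)::

  $ ceph daemon osd.0 dump_scrub_reservations
  {
      "scrubs_local": 0,
      "scrubs_remote": 1,
      "osd_max_scrubs": 1
  }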
diff --git a/qa/standalone/scrub/osd-scrub-dump.sh b/qa/standalone/scrub/osd-scrub-dump.sh
new file mode 100755
index 0000000..e218834
--- /dev/null
@@ -0,0 +1,173 @@
+#!/usr/bin/env bash
+#
+# Copyright (C) 2019 Red Hat <contact@redhat.com>
+#
+# Author: David Zafman <dzafman@redhat.com>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Library Public License for more details.
+#
+
+source $CEPH_ROOT/qa/standalone/ceph-helpers.sh
+
+MAX_SCRUBS=4
+SCRUB_SLEEP=2
+POOL_SIZE=3
+
+function run() {
+    local dir=$1
+    shift
+    local SLEEP=0
+    local CHUNK_MAX=5
+
+    export CEPH_MON="127.0.0.1:7184" # git grep '\<7184\>' : there must be only one
+    export CEPH_ARGS
+    CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
+    CEPH_ARGS+="--mon-host=$CEPH_MON "
+    CEPH_ARGS+="--osd_max_scrubs=$MAX_SCRUBS "
+    CEPH_ARGS+="--osd_scrub_sleep=$SLEEP "
+    CEPH_ARGS+="--osd_scrub_chunk_max=$CHUNK_MAX "
+    CEPH_ARGS+="--osd_scrub_sleep=$SCRUB_SLEEP "
+    CEPH_ARGS+="--osd_pool_default_size=$POOL_SIZE "
+
+    export -n CEPH_CLI_TEST_DUP_COMMAND
+    local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
+    for func in $funcs ; do
+        setup $dir || return 1
+        $func $dir || return 1
+        teardown $dir || return 1
+    done
+}
+
+function TEST_recover_unexpected() {
+    local dir=$1
+    shift
+    local OSDS=6
+    local PGS=16
+    local POOLS=3
+    local OBJS=1000
+
+    run_mon $dir a || return 1
+    run_mgr $dir x || return 1
+    for o in $(seq 0 $(expr $OSDS - 1))
+    do
+        run_osd $dir $o
+    done
+
+    for i in $(seq 1 $POOLS)
+    do
+        create_pool test$i $PGS $PGS
+    done
+
+    wait_for_clean || return 1
+
+    dd if=/dev/urandom of=datafile bs=4k count=2
+    for i in $(seq 1 $POOLS)
+    do
+       for j in $(seq 1 $OBJS)
+       do
+              rados -p test$i put obj$j datafile
+       done
+    done
+    rm datafile
+
+    ceph osd set noscrub
+    ceph osd set nodeep-scrub
+
+    for qpg in $(ceph pg dump pgs --format=json-pretty | jq '.pg_stats[].pgid')
+    do
+       primary=$(ceph pg dump pgs --format=json | jq ".pg_stats[] | select(.pgid == $qpg) | .acting_primary")
+       eval pg=$qpg   # strip quotes around qpg
+       CEPH_ARGS='' ceph daemon $(get_asok_path osd.$primary) trigger_scrub $pg
+    done
+
+    ceph pg dump pgs
+
+    max=$(CEPH_ARGS='' ceph daemon $(get_asok_path osd.0) dump_scrub_reservations | jq '.osd_max_scrubs')
+    if [ $max != $MAX_SCRUBS ];
+    then
+       echo "ERROR: Incorrect osd_max_scrubs from dump_scrub_reservations"
+       return 1
+    fi
+
+    ceph osd unset noscrub
+
+    ok=false
+    for i in $(seq 0 300)
+    do
+       ceph pg dump pgs
+       if ceph pg dump pgs | grep scrubbing; then
+           ok=true
+           break
+       fi
+       sleep 1
+    done
+    if test $ok = "false"; then
+       echo "ERROR: Test set-up failed no scrubbing"
+       return 1
+    fi
+
+    local total=0
+    local zerocount=0
+    local maxzerocount=3
+    while(true)
+    do
+       pass=0
+       for o in $(seq 0 $(expr $OSDS - 1))
+       do
+               CEPH_ARGS='' ceph daemon $(get_asok_path osd.$o) dump_scrub_reservations
+               scrubs=$(CEPH_ARGS='' ceph daemon $(get_asok_path osd.$o) dump_scrub_reservations | jq '.scrubs_local + .scrubs_remote')
+               if [ $scrubs -gt $MAX_SCRUBS ]; then
+                   echo "ERROR: More than $MAX_SCRUBS currently reserved"
+                   return 1
+               fi
+               pass=$(expr $pass + $scrubs)
+        done
+       if [ $pass = "0" ]; then
+           zerocount=$(expr $zerocount + 1)
+       fi
+       if [ $zerocount -gt $maxzerocount ]; then
+           break
+       fi
+       total=$(expr $total + $pass)
+       sleep $(expr $SCRUB_SLEEP \* 2)
+    done
+
+    # Check that there are no more scrubs
+    for i in $(seq 0 5)
+    do
+        if ceph pg dump pgs | grep scrubbing; then
+           echo "ERROR: Extra scrubs after test completion...not expected"
+           return 1
+        fi
+       sleep $SCRUB_SLEEP
+    done
+
+    echo $total total reservations seen
+
+    # Sort of arbitrary number based on PGS * POOLS * POOL_SIZE as the number of total scrub
+    # reservations that must occur.  However, the loop above might see the same reservation more
+    # than once.
+    actual_reservations=$(expr $PGS \* $POOLS \* $POOL_SIZE)
+    if [ $total -lt $actual_reservations ]; then
+       echo "ERROR: Unexpectedly low amount of scrub reservations seen during test"
+       return 1
+    fi
+
+    return 0
+}
+
+
+main osd-scrub-dump "$@"
+
+# Local Variables:
+# compile-command: "cd build ; make check && \
+#    ../qa/run-standalone.sh osd-scrub-dump.sh"
+# End:
diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc
index 14e2c064816ce13bd7ecdeb379e83fc9679762a2..2eaedc8a281b7fd35912eeda898ac55058ecf2dc 100644
@@ -233,8 +233,8 @@ OSDService::OSDService(OSD *osd) :
   publish_lock{ceph::make_mutex("OSDService::publish_lock")},
   pre_publish_lock{ceph::make_mutex("OSDService::pre_publish_lock")},
   max_oldest_map(0),
-  scrubs_pending(0),
-  scrubs_active(0),
+  scrubs_local(0),
+  scrubs_remote(0),
   agent_valid_iterator(false),
   agent_ops(0),
   flush_mode_high_count(0),
@@ -1195,79 +1195,76 @@ void OSDService::prune_pg_created()
 // --------------------------------------
 // dispatch
 
-bool OSDService::can_inc_scrubs_pending()
+bool OSDService::can_inc_scrubs()
 {
   bool can_inc = false;
   std::lock_guard l(sched_scrub_lock);
 
-  if (scrubs_pending + scrubs_active < cct->_conf->osd_max_scrubs) {
-    dout(20) << __func__ << " " << scrubs_pending << " -> " << (scrubs_pending+1)
-            << " (max " << cct->_conf->osd_max_scrubs << ", active " << scrubs_active
-            << ")" << dendl;
+  if (scrubs_local + scrubs_remote < cct->_conf->osd_max_scrubs) {
+    dout(20) << __func__ << " == true " << scrubs_local << " local + " << scrubs_remote
+            << " remote < max " << cct->_conf->osd_max_scrubs << dendl;
     can_inc = true;
   } else {
-    dout(20) << __func__ << " " << scrubs_pending << " + " << scrubs_active
-            << " active >= max " << cct->_conf->osd_max_scrubs << dendl;
+    dout(20) << __func__ << " == false " << scrubs_local << " local + " << scrubs_remote
+            << " remote >= max " << cct->_conf->osd_max_scrubs << dendl;
   }
 
   return can_inc;
 }
 
-bool OSDService::inc_scrubs_pending()
+bool OSDService::inc_scrubs_local()
 {
   bool result = false;
   std::lock_guard l{sched_scrub_lock};
-  if (scrubs_pending + scrubs_active < cct->_conf->osd_max_scrubs) {
-    dout(20) << "inc_scrubs_pending " << scrubs_pending << " -> " << (scrubs_pending+1)
-            << " (max " << cct->_conf->osd_max_scrubs << ", active " << scrubs_active << ")" << dendl;
+  if (scrubs_local + scrubs_remote < cct->_conf->osd_max_scrubs) {
+    dout(20) << __func__ << " " << scrubs_local << " -> " << (scrubs_local+1)
+            << " (max " << cct->_conf->osd_max_scrubs << ", remote " << scrubs_remote << ")" << dendl;
     result = true;
-    ++scrubs_pending;
+    ++scrubs_local;
   } else {
-    dout(20) << "inc_scrubs_pending " << scrubs_pending << " + " << scrubs_active << " active >= max " << cct->_conf->osd_max_scrubs << dendl;
+    dout(20) << __func__ << " " << scrubs_local << " local + " << scrubs_remote << " remote >= max " << cct->_conf->osd_max_scrubs << dendl;
   }
   return result;
 }
 
-void OSDService::dec_scrubs_pending()
+void OSDService::dec_scrubs_local()
 {
   std::lock_guard l{sched_scrub_lock};
-  dout(20) << "dec_scrubs_pending " << scrubs_pending << " -> " << (scrubs_pending-1)
-          << " (max " << cct->_conf->osd_max_scrubs << ", active " << scrubs_active << ")" << dendl;
-  --scrubs_pending;
-  ceph_assert(scrubs_pending >= 0);
+  dout(20) << __func__ << " " << scrubs_local << " -> " << (scrubs_local-1)
+          << " (max " << cct->_conf->osd_max_scrubs << ", remote " << scrubs_remote << ")" << dendl;
+  --scrubs_local;
+  ceph_assert(scrubs_local >= 0);
 }
 
-void OSDService::inc_scrubs_active(bool reserved)
+bool OSDService::inc_scrubs_remote()
 {
+  bool result = false;
   std::lock_guard l{sched_scrub_lock};
-  ++(scrubs_active);
-  if (reserved) {
-    --(scrubs_pending);
-    dout(20) << "inc_scrubs_active " << (scrubs_active-1) << " -> " << scrubs_active
-            << " (max " << cct->_conf->osd_max_scrubs
-            << ", pending " << (scrubs_pending+1) << " -> " << scrubs_pending << ")" << dendl;
-    ceph_assert(scrubs_pending >= 0);
+  if (scrubs_local + scrubs_remote < cct->_conf->osd_max_scrubs) {
+    dout(20) << __func__ << " " << scrubs_remote << " -> " << (scrubs_remote+1)
+            << " (max " << cct->_conf->osd_max_scrubs << ", local " << scrubs_local << ")" << dendl;
+    result = true;
+    ++scrubs_remote;
   } else {
-    dout(20) << "inc_scrubs_active " << (scrubs_active-1) << " -> " << scrubs_active
-            << " (max " << cct->_conf->osd_max_scrubs
-            << ", pending " << scrubs_pending << ")" << dendl;
+    dout(20) << __func__ << " " << scrubs_local << " local + " << scrubs_remote << " remote >= max " << cct->_conf->osd_max_scrubs << dendl;
   }
+  return result;
 }
 
-void OSDService::dec_scrubs_active()
+void OSDService::dec_scrubs_remote()
 {
   std::lock_guard l{sched_scrub_lock};
-  dout(20) << "dec_scrubs_active " << scrubs_active << " -> " << (scrubs_active-1)
-          << " (max " << cct->_conf->osd_max_scrubs << ", pending " << scrubs_pending << ")" << dendl;
-  --scrubs_active;
-  ceph_assert(scrubs_active >= 0);
+  dout(20) << __func__ << " " << scrubs_remote << " -> " << (scrubs_remote-1)
+          << " (max " << cct->_conf->osd_max_scrubs << ", local " << scrubs_local << ")" << dendl;
+  --scrubs_remote;
+  ceph_assert(scrubs_remote >= 0);
 }
 
 void OSDService::dump_scrub_reservations(Formatter *f)
 {
   std::lock_guard l{sched_scrub_lock};
-  f->dump_int("scrubs_active", scrubs_active);
-  f->dump_int("scrubs_pending", scrubs_pending);
+  f->dump_int("scrubs_local", scrubs_local);
+  f->dump_int("scrubs_remote", scrubs_remote);
   f->dump_int("osd_max_scrubs", cct->_conf->osd_max_scrubs);
 }
 
@@ -7654,7 +7651,7 @@ bool OSD::scrub_load_below_threshold()
 void OSD::sched_scrub()
 {
   // if not permitted, fail fast
-  if (!service.can_inc_scrubs_pending()) {
+  if (!service.can_inc_scrubs()) {
     return;
   }
   bool allow_requested_repair_only = false;
@@ -7711,7 +7708,7 @@ void OSD::sched_scrub()
         continue;
       }
       // If it is reserving, let it resolve before going to the next scrub job
-      if (pg->scrubber.reserved) {
+      if (pg->scrubber.local_reserved && !pg->scrubber.active) {
        pg->unlock();
        dout(30) << __func__ << ": reserve in progress pgid " << scrub.pgid << dendl;
        break;
diff --git a/src/osd/OSD.h b/src/osd/OSD.h
index 770087970b34a870a50dacbfae5c1377911b498a..babf1561f0f0eb125242c711a794d4a3f6aaa258 100644
@@ -273,8 +273,8 @@ public:
 private:
   // -- scrub scheduling --
   ceph::mutex sched_scrub_lock = ceph::make_mutex("OSDService::sched_scrub_lock");
-  int scrubs_pending;
-  int scrubs_active;
+  int scrubs_local;
+  int scrubs_remote;
 
 public:
   struct ScrubJob {
@@ -349,11 +349,11 @@ public:
     f->close_section();
   }
 
-  bool can_inc_scrubs_pending();
-  bool inc_scrubs_pending();
-  void inc_scrubs_active(bool reserved);
-  void dec_scrubs_pending();
-  void dec_scrubs_active();
+  bool can_inc_scrubs();
+  bool inc_scrubs_local();
+  void dec_scrubs_local();
+  bool inc_scrubs_remote();
+  void dec_scrubs_remote();
   void dump_scrub_reservations(Formatter *f);
 
   void reply_op_error(OpRequestRef op, int err);
diff --git a/src/osd/PG.cc b/src/osd/PG.cc
index e655dc61a7bd9dbe649dd27dff769a585e0d3b23..5fef5299fa2eae6ef0e2d97e08a63f7937e6809a 100644
@@ -352,7 +352,7 @@ void PG::clear_primary_state()
 }
 
 PG::Scrubber::Scrubber()
- : reserved(false), reserve_failed(false),
+ : local_reserved(false), remote_reserved(false), reserve_failed(false),
    epoch_start(0),
    active(false),
    shallow_errors(0), deep_errors(0), fixed(0),
@@ -1323,22 +1323,24 @@ void PG::requeue_map_waiters()
 /*
  * when holding pg and sched_scrub_lock, then the states are:
  *   scheduling:
- *     scrubber.reserved = true
- *     scrub_rserved_peers includes whoami
- *     osd->scrub_pending++
+ *     scrubber.local_reserved = true
+ *     scrubber.active = false
+ *     scrubber.reserved_peers includes whoami
+ *     osd->scrubs_local++
  *   scheduling, replica declined:
- *     scrubber.reserved = true
+ *     scrubber.local_reserved = true
  *     scrubber.reserved_peers includes -1
- *     osd->scrub_pending++
+ *     osd->scrubs_local++
  *   pending:
- *     scrubber.reserved = true
+ *     scrubber.local_reserved = true
+ *     scrubber.active = false
  *     scrubber.reserved_peers.size() == acting.size();
  *     pg on scrub_wq
- *     osd->scrub_pending++
+ *     osd->scrubs_local++
  *   scrubbing:
- *     scrubber.reserved = false;
+ *     scrubber.local_reserved = true;
+ *     scrubber.active = true
  *     scrubber.reserved_peers empty
- *     osd->scrubber.active++
  */
 
 // returns true if a scrub has been newly kicked off
@@ -1352,7 +1354,7 @@ bool PG::sched_scrub()
 
   // All processing the first time through commits us to whatever
   // choices are made.
-  if (!scrubber.reserved) {
+  if (!scrubber.local_reserved) {
     dout(20) << __func__ << ": Start processing pg " << info.pgid << dendl;
 
     bool allow_deep_scrub = !(get_osdmap()->test_flag(CEPH_OSDMAP_NODEEP_SCRUB) ||
@@ -1442,9 +1444,9 @@ bool PG::sched_scrub()
                           (cct->_conf->osd_repair_during_recovery && scrubber.must_repair) ||
                           !osd->is_recovery_active();
     if (allow_scrubing &&
-         osd->inc_scrubs_pending()) {
+         osd->inc_scrubs_local()) {
       dout(20) << __func__ << ": reserved locally, reserving replicas" << dendl;
-      scrubber.reserved = true;
+      scrubber.local_reserved = true;
       scrubber.reserved_peers.insert(pg_whoami);
       scrub_reserve_replicas();
     } else {
@@ -1453,7 +1455,7 @@ bool PG::sched_scrub()
     }
   }
 
-  if (scrubber.reserved) {
+  if (scrubber.local_reserved) {
     if (scrubber.reserve_failed) {
       dout(20) << __func__ << ": failed, a peer declined" << dendl;
       clear_scrub_reserved();
@@ -1895,23 +1897,23 @@ void PG::handle_scrub_reserve_request(OpRequestRef op)
 {
   dout(7) << __func__ << " " << *op->get_req() << dendl;
   op->mark_started();
-  if (scrubber.reserved) {
+  if (scrubber.remote_reserved) {
     dout(10) << __func__ << " ignoring reserve request: Already reserved"
             << dendl;
     return;
   }
   if ((cct->_conf->osd_scrub_during_recovery || !osd->is_recovery_active()) &&
-      osd->inc_scrubs_pending()) {
-    scrubber.reserved = true;
+      osd->inc_scrubs_remote()) {
+    scrubber.remote_reserved = true;
   } else {
     dout(20) << __func__ << ": failed to reserve remotely" << dendl;
-    scrubber.reserved = false;
+    scrubber.remote_reserved = false;
   }
   auto m = op->get_req<MOSDScrubReserve>();
   Message *reply = new MOSDScrubReserve(
     spg_t(info.pgid.pgid, get_primary().shard),
     m->map_epoch,
-    scrubber.reserved ? MOSDScrubReserve::GRANT : MOSDScrubReserve::REJECT,
+    scrubber.remote_reserved ? MOSDScrubReserve::GRANT : MOSDScrubReserve::REJECT,
     pg_whoami);
   osd->send_message_osd_cluster(reply, op->get_req()->get_connection());
 }
@@ -1920,7 +1922,7 @@ void PG::handle_scrub_reserve_grant(OpRequestRef op, pg_shard_t from)
 {
   dout(7) << __func__ << " " << *op->get_req() << dendl;
   op->mark_started();
-  if (!scrubber.reserved) {
+  if (!scrubber.local_reserved) {
     dout(10) << "ignoring obsolete scrub reserve reply" << dendl;
     return;
   }
@@ -1937,7 +1939,7 @@ void PG::handle_scrub_reserve_reject(OpRequestRef op, pg_shard_t from)
 {
   dout(7) << __func__ << " " << *op->get_req() << dendl;
   op->mark_started();
-  if (!scrubber.reserved) {
+  if (!scrubber.local_reserved) {
     dout(10) << "ignoring obsolete scrub reserve reply" << dendl;
     return;
   }
@@ -2053,9 +2055,13 @@ void PG::clear_scrub_reserved()
   scrubber.reserved_peers.clear();
   scrubber.reserve_failed = false;
 
-  if (scrubber.reserved) {
-    scrubber.reserved = false;
-    osd->dec_scrubs_pending();
+  if (scrubber.local_reserved) {
+    scrubber.local_reserved = false;
+    osd->dec_scrubs_local();
+  }
+  if (scrubber.remote_reserved) {
+    scrubber.remote_reserved = false;
+    osd->dec_scrubs_remote();
   }
 }
 
@@ -2621,12 +2627,6 @@ void PG::chunky_scrub(ThreadPool::TPHandle &handle)
         scrubber.epoch_start = info.history.same_interval_since;
         scrubber.active = true;
 
-       osd->inc_scrubs_active(scrubber.reserved);
-       if (scrubber.reserved) {
-         scrubber.reserved = false;
-         scrubber.reserved_peers.clear();
-       }
-
        {
          ObjectStore::Transaction t;
          scrubber.cleanup_store(&t);
@@ -2989,9 +2989,12 @@ void PG::scrub_clear_state(bool has_error)
   state_clear(PG_STATE_DEEP_SCRUB);
   publish_stats_to_osd();
 
-  // active -> nothing.
-  if (scrubber.active)
-    osd->dec_scrubs_active();
+  // local -> nothing.
+  if (scrubber.local_reserved) {
+    osd->dec_scrubs_local();
+    scrubber.local_reserved = false;
+    scrubber.reserved_peers.clear();
+  }
 
   requeue_ops(waiting_for_scrub);
 
diff --git a/src/osd/PG.h b/src/osd/PG.h
index b146081c51535fe5993b5572337f341fc93bf16b..585c93e51b08cf005a311a195ba68106f8a7bb2a 100644
@@ -1091,7 +1091,7 @@ public:
 
     // metadata
     set<pg_shard_t> reserved_peers;
-    bool reserved, reserve_failed;
+    bool local_reserved, remote_reserved, reserve_failed;
     epoch_t epoch_start;
 
     // common to both scrubs