From: David Zafman
Date: Wed, 31 Jul 2019 00:15:32 +0000 (+0000)
Subject: osd: Replace active/pending scrub tracking for local/remote
X-Git-Tag: v15.1.0~1591^2
X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=b3e1c58b0ee70630d9e41269c181166c0b7ce703;p=ceph-ci.git

osd: Replace active/pending scrub tracking for local/remote

This is similar to how recovery reservations are split between local and
remote.

It was the case that scrubs_pending was used for reservations at the
replicas as well as at the primary while requesting reservations from the
replicas.  There was no need for scrubs_pending to turn into scrubs_active
at the primary, as nothing treated that value as special.

scrubber.active = true while scrubbing is actually in progress.
Now scrubber.local_reserved indicates that scrubs_local was incremented.
Now scrubber.remote_reserved indicates that scrubs_remote was incremented.

Fixes: https://tracker.ceph.com/issues/41669

Signed-off-by: David Zafman
---
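The accounting rule behind the change can be summarized in a small,
self-contained sketch (illustration only, not part of the patch): both
counters draw from the single osd_max_scrubs budget, and an increment on
either side succeeds only while their sum stays below it.  The class name
and the hard-coded budget below are invented for this sketch; only the
scrubs_local/scrubs_remote names and the shared-cap check come from the
patch.

#include <cassert>
#include <iostream>
#include <mutex>

// Hypothetical stand-in for the OSDService counters; not Ceph code.
class ScrubReservations {
  std::mutex lock;
  int scrubs_local = 0;   // reservations held while acting as scrubbing primary
  int scrubs_remote = 0;  // reservations granted to other primaries (replica side)
  const int max_scrubs;   // plays the role of osd_max_scrubs

public:
  explicit ScrubReservations(int max) : max_scrubs(max) {}

  // Primary-side reservation: succeeds only while local + remote < max.
  bool inc_local() {
    std::lock_guard<std::mutex> l(lock);
    if (scrubs_local + scrubs_remote >= max_scrubs)
      return false;
    ++scrubs_local;
    return true;
  }

  // Replica-side reservation: gated by the same shared budget.
  bool inc_remote() {
    std::lock_guard<std::mutex> l(lock);
    if (scrubs_local + scrubs_remote >= max_scrubs)
      return false;
    ++scrubs_remote;
    return true;
  }

  void dec_local()  { std::lock_guard<std::mutex> l(lock); --scrubs_local;  assert(scrubs_local >= 0); }
  void dec_remote() { std::lock_guard<std::mutex> l(lock); --scrubs_remote; assert(scrubs_remote >= 0); }
};

int main() {
  ScrubReservations res(2);              // pretend osd_max_scrubs = 2
  std::cout << res.inc_local() << "\n";  // 1: primary reservation granted
  std::cout << res.inc_remote() << "\n"; // 1: replica reservation granted
  std::cout << res.inc_local() << "\n";  // 0: budget exhausted, request rejected
  res.dec_remote();
  std::cout << res.inc_local() << "\n";  // 1: freed budget can be reserved again
  return 0;
}

In the patch itself the same transitions are logged at debug level 20, and
both counters are exposed through the dump_scrub_reservations admin socket
command.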
diff --git a/PendingReleaseNotes b/PendingReleaseNotes
index d793483a08d..1f819531271 100644
--- a/PendingReleaseNotes
+++ b/PendingReleaseNotes
@@ -134,4 +134,4 @@
   recovery locks held (in_progress) and waiting in priority queues.
 
 * New OSD daemon command dump_scrub_reservations which reveals the
-  scrub reservations that are active and pending.
+  scrub reservations that are held for local (primary) and remote (replica) PGs.
diff --git a/doc/dev/osd_internals/scrub.rst b/doc/dev/osd_internals/scrub.rst
index 3343b3986c5..f20dc042e45 100644
--- a/doc/dev/osd_internals/scrub.rst
+++ b/doc/dev/osd_internals/scrub.rst
@@ -1,6 +1,9 @@
+Scrub internals and diagnostics
+===============================
+
 Scrubbing Behavior Table
-========================
+------------------------
 
 +-------------------------------------------------+----------+-----------+---------------+----------------------+
 | Flags                                           | none     | noscrub   | nodeep_scrub  | noscrub/nodeep_scrub |
 +=================================================+==========+===========+===============+======================+
@@ -28,3 +31,11 @@ State variables
 - Initiated scrub state is must_scrub && !must_deep_scrub && !time_for_deep
 - Initiated scrub after osd_deep_scrub_interval state is must scrub && !must_deep_scrub && time_for_deep
 - Initiated deep scrub state is must_scrub && must_deep_scrub
+
+Scrub Reservations
+------------------
+
+An OSD daemon command dumps total local and remote reservations::
+
+  ceph daemon osd.<id> dump_scrub_reservations
+
diff --git a/qa/standalone/scrub/osd-scrub-dump.sh b/qa/standalone/scrub/osd-scrub-dump.sh
new file mode 100755
index 00000000000..e218834c657
--- /dev/null
+++ b/qa/standalone/scrub/osd-scrub-dump.sh
@@ -0,0 +1,173 @@
+#!/usr/bin/env bash
+#
+# Copyright (C) 2019 Red Hat
+#
+# Author: David Zafman
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Library Public License for more details.
+#
+
+source $CEPH_ROOT/qa/standalone/ceph-helpers.sh
+
+MAX_SCRUBS=4
+SCRUB_SLEEP=2
+POOL_SIZE=3
+
+function run() {
+    local dir=$1
+    shift
+    local SLEEP=0
+    local CHUNK_MAX=5
+
+    export CEPH_MON="127.0.0.1:7184" # git grep '\<7184\>' : there must be only one
+    export CEPH_ARGS
+    CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
+    CEPH_ARGS+="--mon-host=$CEPH_MON "
+    CEPH_ARGS+="--osd_max_scrubs=$MAX_SCRUBS "
+    CEPH_ARGS+="--osd_scrub_sleep=$SLEEP "
+    CEPH_ARGS+="--osd_scrub_chunk_max=$CHUNK_MAX "
+    CEPH_ARGS+="--osd_scrub_sleep=$SCRUB_SLEEP "
+    CEPH_ARGS+="--osd_pool_default_size=$POOL_SIZE "
+
+    export -n CEPH_CLI_TEST_DUP_COMMAND
+    local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
+    for func in $funcs ; do
+        setup $dir || return 1
+        $func $dir || return 1
+        teardown $dir || return 1
+    done
+}
+
+function TEST_recover_unexpected() {
+    local dir=$1
+    shift
+    local OSDS=6
+    local PGS=16
+    local POOLS=3
+    local OBJS=1000
+
+    run_mon $dir a || return 1
+    run_mgr $dir x || return 1
+    for o in $(seq 0 $(expr $OSDS - 1))
+    do
+        run_osd $dir $o
+    done
+
+    for i in $(seq 1 $POOLS)
+    do
+        create_pool test$i $PGS $PGS
+    done
+
+    wait_for_clean || return 1
+
+    dd if=/dev/urandom of=datafile bs=4k count=2
+    for i in $(seq 1 $POOLS)
+    do
+        for j in $(seq 1 $OBJS)
+        do
+            rados -p test$i put obj$j datafile
+        done
+    done
+    rm datafile
+
+    ceph osd set noscrub
+    ceph osd set nodeep-scrub
+
+    for qpg in $(ceph pg dump pgs --format=json-pretty | jq '.pg_stats[].pgid')
+    do
+        primary=$(ceph pg dump pgs --format=json | jq ".pg_stats[] | select(.pgid == $qpg) | .acting_primary")
+        eval pg=$qpg   # strip quotes around qpg
+        CEPH_ARGS='' ceph daemon $(get_asok_path osd.$primary) trigger_scrub $pg
+    done
+
+    ceph pg dump pgs
+
+    max=$(CEPH_ARGS='' ceph daemon $(get_asok_path osd.0) dump_scrub_reservations | jq '.osd_max_scrubs')
+    if [ $max != $MAX_SCRUBS ];
+    then
+        echo "ERROR: Incorrect osd_max_scrubs from dump_scrub_reservations"
+        return 1
+    fi
+
+    ceph osd unset noscrub
+
+    ok=false
+    for i in $(seq 0 300)
+    do
+        ceph pg dump pgs
+        if ceph pg dump pgs | grep scrubbing; then
+            ok=true
+            break
+        fi
+        sleep 1
+    done
+    if test $ok = "false"; then
+        echo "ERROR: Test set-up failed no scrubbing"
+        return 1
+    fi
+
+    local total=0
+    local zerocount=0
+    local maxzerocount=3
+    while(true)
+    do
+        pass=0
+        for o in $(seq 0 $(expr $OSDS - 1))
+        do
+            CEPH_ARGS='' ceph daemon $(get_asok_path osd.$o) dump_scrub_reservations
+            scrubs=$(CEPH_ARGS='' ceph daemon $(get_asok_path osd.$o) dump_scrub_reservations | jq '.scrubs_local + .scrubs_remote')
+            if [ $scrubs -gt $MAX_SCRUBS ]; then
+                echo "ERROR: More than $MAX_SCRUBS currently reserved"
+                return 1
+            fi
+            pass=$(expr $pass + $scrubs)
+        done
+        if [ $pass = "0" ]; then
+            zerocount=$(expr $zerocount + 1)
+        fi
+        if [ $zerocount -gt $maxzerocount ]; then
+            break
+        fi
+        total=$(expr $total + $pass)
+        sleep $(expr $SCRUB_SLEEP \* 2)
+    done
+
+    # Check that there are no more scrubs
+    for i in $(seq 0 5)
+    do
+        if ceph pg dump pgs | grep scrubbing; then
+            echo "ERROR: Extra scrubs after test completion...not expected"
+            return 1
+        fi
+        sleep $SCRUB_SLEEP
+    done
+
+    echo $total total reservations seen
+
+    # Sort of arbitrary number based on PGS * POOLS * POOL_SIZE as the number of total scrub
+    # reservations that must occur.  However, the loop above might see the same reservation more
+    # than once.
+    actual_reservations=$(expr $PGS \* $POOLS \* $POOL_SIZE)
+    if [ $total -lt $actual_reservations ]; then
+        echo "ERROR: Unexpectedly low amount of scrub reservations seen during test"
+        return 1
+    fi
+
+    return 0
+}
+
+
+main osd-scrub-dump "$@"
+
+# Local Variables:
+# compile-command: "cd build ; make check && \
+#    ../qa/run-standalone.sh osd-scrub-dump.sh"
+# End:
diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc
index 14e2c064816..2eaedc8a281 100644
--- a/src/osd/OSD.cc
+++ b/src/osd/OSD.cc
@@ -233,8 +233,8 @@ OSDService::OSDService(OSD *osd) :
   publish_lock{ceph::make_mutex("OSDService::publish_lock")},
   pre_publish_lock{ceph::make_mutex("OSDService::pre_publish_lock")},
   max_oldest_map(0),
-  scrubs_pending(0),
-  scrubs_active(0),
+  scrubs_local(0),
+  scrubs_remote(0),
   agent_valid_iterator(false),
   agent_ops(0),
   flush_mode_high_count(0),
@@ -1195,79 +1195,76 @@ void OSDService::prune_pg_created()
 
 // --------------------------------------
 // dispatch
 
-bool OSDService::can_inc_scrubs_pending()
+bool OSDService::can_inc_scrubs()
 {
   bool can_inc = false;
   std::lock_guard l(sched_scrub_lock);
 
-  if (scrubs_pending + scrubs_active < cct->_conf->osd_max_scrubs) {
-    dout(20) << __func__ << " " << scrubs_pending << " -> " << (scrubs_pending+1)
-             << " (max " << cct->_conf->osd_max_scrubs << ", active " << scrubs_active
-             << ")" << dendl;
+  if (scrubs_local + scrubs_remote < cct->_conf->osd_max_scrubs) {
+    dout(20) << __func__ << " == true " << scrubs_local << " local + " << scrubs_remote
+             << " remote < max " << cct->_conf->osd_max_scrubs << dendl;
     can_inc = true;
   } else {
-    dout(20) << __func__ << " " << scrubs_pending << " + " << scrubs_active
-             << " active >= max " << cct->_conf->osd_max_scrubs << dendl;
+    dout(20) << __func__ << " == false " << scrubs_local << " local + " << scrubs_remote
+             << " remote >= max " << cct->_conf->osd_max_scrubs << dendl;
   }
 
   return can_inc;
 }
 
-bool OSDService::inc_scrubs_pending()
+bool OSDService::inc_scrubs_local()
 {
   bool result = false;
   std::lock_guard l{sched_scrub_lock};
-  if (scrubs_pending + scrubs_active < cct->_conf->osd_max_scrubs) {
-    dout(20) << "inc_scrubs_pending " << scrubs_pending << " -> " << (scrubs_pending+1)
-             << " (max " << cct->_conf->osd_max_scrubs << ", active " << scrubs_active << ")" << dendl;
+  if (scrubs_local + scrubs_remote < cct->_conf->osd_max_scrubs) {
+    dout(20) << __func__ << " " << scrubs_local << " -> " << (scrubs_local+1)
+             << " (max " << cct->_conf->osd_max_scrubs << ", remote " << scrubs_remote << ")" << dendl;
     result = true;
-    ++scrubs_pending;
+    ++scrubs_local;
   } else {
-    dout(20) << "inc_scrubs_pending " << scrubs_pending << " + " << scrubs_active << " active >= max " << cct->_conf->osd_max_scrubs << dendl;
+    dout(20) << __func__ << " " << scrubs_local << " local + " << scrubs_remote << " remote >= max " << cct->_conf->osd_max_scrubs << dendl;
   }
   return result;
 }
 
-void OSDService::dec_scrubs_pending()
+void OSDService::dec_scrubs_local()
 {
   std::lock_guard l{sched_scrub_lock};
-  dout(20) << "dec_scrubs_pending " << scrubs_pending << " -> " << (scrubs_pending-1)
-           << " (max " << cct->_conf->osd_max_scrubs << ", active " << scrubs_active << ")" << dendl;
-  --scrubs_pending;
-  ceph_assert(scrubs_pending >= 0);
+  dout(20) << __func__ << " " << scrubs_local << " -> " << (scrubs_local-1)
+           << " (max " << cct->_conf->osd_max_scrubs << ", remote " << scrubs_remote << ")" << dendl;
+  --scrubs_local;
+  ceph_assert(scrubs_local >= 0);
 }
 
-void OSDService::inc_scrubs_active(bool reserved)
+bool OSDService::inc_scrubs_remote()
 {
+  bool result = false;
   std::lock_guard l{sched_scrub_lock};
-  ++(scrubs_active);
-  if (reserved) {
-    --(scrubs_pending);
-    dout(20) << "inc_scrubs_active " << (scrubs_active-1) << " -> " << scrubs_active
-             << " (max " << cct->_conf->osd_max_scrubs
-             << ", pending " << (scrubs_pending+1) << " -> " << scrubs_pending << ")" << dendl;
-    ceph_assert(scrubs_pending >= 0);
+  if (scrubs_local + scrubs_remote < cct->_conf->osd_max_scrubs) {
+    dout(20) << __func__ << " " << scrubs_remote << " -> " << (scrubs_remote+1)
+             << " (max " << cct->_conf->osd_max_scrubs << ", local " << scrubs_local << ")" << dendl;
+    result = true;
+    ++scrubs_remote;
   } else {
-    dout(20) << "inc_scrubs_active " << (scrubs_active-1) << " -> " << scrubs_active
-             << " (max " << cct->_conf->osd_max_scrubs
-             << ", pending " << scrubs_pending << ")" << dendl;
+    dout(20) << __func__ << " " << scrubs_local << " local + " << scrubs_remote << " remote >= max " << cct->_conf->osd_max_scrubs << dendl;
   }
+  return result;
 }
 
-void OSDService::dec_scrubs_active()
+void OSDService::dec_scrubs_remote()
 {
   std::lock_guard l{sched_scrub_lock};
-  dout(20) << "dec_scrubs_active " << scrubs_active << " -> " << (scrubs_active-1)
-           << " (max " << cct->_conf->osd_max_scrubs << ", pending " << scrubs_pending << ")" << dendl;
-  --scrubs_active;
-  ceph_assert(scrubs_active >= 0);
+  dout(20) << __func__ << " " << scrubs_remote << " -> " << (scrubs_remote-1)
+           << " (max " << cct->_conf->osd_max_scrubs << ", local " << scrubs_local << ")" << dendl;
+  --scrubs_remote;
+  ceph_assert(scrubs_remote >= 0);
 }
 
 void OSDService::dump_scrub_reservations(Formatter *f)
 {
   std::lock_guard l{sched_scrub_lock};
-  f->dump_int("scrubs_active", scrubs_active);
-  f->dump_int("scrubs_pending", scrubs_pending);
+  f->dump_int("scrubs_local", scrubs_local);
+  f->dump_int("scrubs_remote", scrubs_remote);
   f->dump_int("osd_max_scrubs", cct->_conf->osd_max_scrubs);
 }
 
@@ -7654,7 +7651,7 @@ bool OSD::scrub_load_below_threshold()
 void OSD::sched_scrub()
 {
   // if not permitted, fail fast
-  if (!service.can_inc_scrubs_pending()) {
+  if (!service.can_inc_scrubs()) {
     return;
   }
   bool allow_requested_repair_only = false;
@@ -7711,7 +7708,7 @@ void OSD::sched_scrub()
       continue;
     }
     // If it is reserving, let it resolve before going to the next scrub job
-    if (pg->scrubber.reserved) {
+    if (pg->scrubber.local_reserved && !pg->scrubber.active) {
       pg->unlock();
       dout(30) << __func__ << ": reserve in progress pgid " << scrub.pgid << dendl;
       break;
diff --git a/src/osd/OSD.h b/src/osd/OSD.h
index 770087970b3..babf1561f0f 100644
--- a/src/osd/OSD.h
+++ b/src/osd/OSD.h
@@ -273,8 +273,8 @@ public:
 private:
   // -- scrub scheduling --
   ceph::mutex sched_scrub_lock = ceph::make_mutex("OSDService::sched_scrub_lock");
-  int scrubs_pending;
-  int scrubs_active;
+  int scrubs_local;
+  int scrubs_remote;
 
 public:
   struct ScrubJob {
@@ -349,11 +349,11 @@ public:
     f->close_section();
   }
 
-  bool can_inc_scrubs_pending();
-  bool inc_scrubs_pending();
-  void inc_scrubs_active(bool reserved);
-  void dec_scrubs_pending();
-  void dec_scrubs_active();
+  bool can_inc_scrubs();
+  bool inc_scrubs_local();
+  void dec_scrubs_local();
+  bool inc_scrubs_remote();
+  void dec_scrubs_remote();
   void dump_scrub_reservations(Formatter *f);
 
   void reply_op_error(OpRequestRef op, int err);
diff --git a/src/osd/PG.cc b/src/osd/PG.cc
index e655dc61a7b..5fef5299fa2 100644
--- a/src/osd/PG.cc
+++ b/src/osd/PG.cc
@@ -352,7 +352,7 @@ void PG::clear_primary_state()
 }
 
 PG::Scrubber::Scrubber()
- : reserved(false), reserve_failed(false),
+ : local_reserved(false), remote_reserved(false), reserve_failed(false),
   epoch_start(0),
   active(false),
   shallow_errors(0), deep_errors(0), fixed(0),
@@ -1323,22 +1323,24 @@ void PG::requeue_map_waiters()
 /*
  * when holding pg and sched_scrub_lock, then the states are:
  *   scheduling:
- *     scrubber.reserved = true
- *     scrub_rserved_peers includes whoami
- *     osd->scrub_pending++
+ *     scrubber.local_reserved = true
+ *     scrubber.active = false
+ *     scrubber.reserved_peers includes whoami
+ *     osd->scrubs_local++
  *   scheduling, replica declined:
- *     scrubber.reserved = true
+ *     scrubber.local_reserved = true
  *     scrubber.reserved_peers includes -1
- *     osd->scrub_pending++
+ *     osd->scrub_local++
  *   pending:
- *     scrubber.reserved = true
+ *     scrubber.local_reserved = true
+ *     scrubber.active = false
  *     scrubber.reserved_peers.size() == acting.size();
 *      pg on scrub_wq
- *     osd->scrub_pending++
+ *     osd->scrub_local++
  *   scrubbing:
- *     scrubber.reserved = false;
+ *     scrubber.local_reserved = true;
+ *     scrubber.active = true
 *      scrubber.reserved_peers empty
- *     osd->scrubber.active++
  */
 
 // returns true if a scrub has been newly kicked off
@@ -1352,7 +1354,7 @@
 
   // All processing the first time through commits us to whatever
   // choices are made.
-  if (!scrubber.reserved) {
+  if (!scrubber.local_reserved) {
     dout(20) << __func__ << ": Start processing pg " << info.pgid << dendl;
 
     bool allow_deep_scrub = !(get_osdmap()->test_flag(CEPH_OSDMAP_NODEEP_SCRUB) ||
@@ -1442,9 +1444,9 @@
                           (cct->_conf->osd_repair_during_recovery && scrubber.must_repair) ||
                           !osd->is_recovery_active();
     if (allow_scrubing &&
-        osd->inc_scrubs_pending()) {
+        osd->inc_scrubs_local()) {
       dout(20) << __func__ << ": reserved locally, reserving replicas" << dendl;
-      scrubber.reserved = true;
+      scrubber.local_reserved = true;
       scrubber.reserved_peers.insert(pg_whoami);
       scrub_reserve_replicas();
     } else {
@@ -1453,7 +1455,7 @@
     }
   }
 
-  if (scrubber.reserved) {
+  if (scrubber.local_reserved) {
    if (scrubber.reserve_failed) {
      dout(20) << __func__ << ": failed, a peer declined" << dendl;
      clear_scrub_reserved();
@@ -1895,23 +1897,23 @@ void PG::handle_scrub_reserve_request(OpRequestRef op)
 {
   dout(7) << __func__ << " " << *op->get_req() << dendl;
   op->mark_started();
-  if (scrubber.reserved) {
+  if (scrubber.remote_reserved) {
     dout(10) << __func__ << " ignoring reserve request: Already reserved" << dendl;
     return;
   }
 
   if ((cct->_conf->osd_scrub_during_recovery || !osd->is_recovery_active()) &&
-      osd->inc_scrubs_pending()) {
-    scrubber.reserved = true;
+      osd->inc_scrubs_remote()) {
+    scrubber.remote_reserved = true;
   } else {
     dout(20) << __func__ << ": failed to reserve remotely" << dendl;
-    scrubber.reserved = false;
+    scrubber.remote_reserved = false;
   }
   auto m = op->get_req<MOSDScrubReserve>();
   Message *reply = new MOSDScrubReserve(
     spg_t(info.pgid.pgid, get_primary().shard),
     m->map_epoch,
-    scrubber.reserved ? MOSDScrubReserve::GRANT : MOSDScrubReserve::REJECT,
+    scrubber.remote_reserved ? MOSDScrubReserve::GRANT : MOSDScrubReserve::REJECT,
     pg_whoami);
   osd->send_message_osd_cluster(reply, op->get_req()->get_connection());
 }
 
@@ -1920,7 +1922,7 @@ void PG::handle_scrub_reserve_grant(OpRequestRef op, pg_shard_t from)
 {
   dout(7) << __func__ << " " << *op->get_req() << dendl;
   op->mark_started();
-  if (!scrubber.reserved) {
+  if (!scrubber.local_reserved) {
     dout(10) << "ignoring obsolete scrub reserve reply" << dendl;
     return;
   }
@@ -1937,7 +1939,7 @@ void PG::handle_scrub_reserve_reject(OpRequestRef op, pg_shard_t from)
 {
   dout(7) << __func__ << " " << *op->get_req() << dendl;
   op->mark_started();
-  if (!scrubber.reserved) {
+  if (!scrubber.local_reserved) {
     dout(10) << "ignoring obsolete scrub reserve reply" << dendl;
     return;
   }
@@ -2053,9 +2055,13 @@ void PG::clear_scrub_reserved()
   scrubber.reserved_peers.clear();
   scrubber.reserve_failed = false;
 
-  if (scrubber.reserved) {
-    scrubber.reserved = false;
-    osd->dec_scrubs_pending();
+  if (scrubber.local_reserved) {
+    scrubber.local_reserved = false;
+    osd->dec_scrubs_local();
+  }
+  if (scrubber.remote_reserved) {
+    scrubber.remote_reserved = false;
+    osd->dec_scrubs_remote();
   }
 }
 
@@ -2621,12 +2627,6 @@ void PG::chunky_scrub(ThreadPool::TPHandle &handle)
         scrubber.epoch_start = info.history.same_interval_since;
         scrubber.active = true;
 
-        osd->inc_scrubs_active(scrubber.reserved);
-        if (scrubber.reserved) {
-          scrubber.reserved = false;
-          scrubber.reserved_peers.clear();
-        }
-
         {
           ObjectStore::Transaction t;
           scrubber.cleanup_store(&t);
@@ -2989,9 +2989,12 @@ void PG::scrub_clear_state(bool has_error)
     state_clear(PG_STATE_DEEP_SCRUB);
   publish_stats_to_osd();
 
-  // active -> nothing.
-  if (scrubber.active)
-    osd->dec_scrubs_active();
+  // local -> nothing.
+  if (scrubber.local_reserved) {
+    osd->dec_scrubs_local();
+    scrubber.local_reserved = false;
+    scrubber.reserved_peers.clear();
+  }
 
   requeue_ops(waiting_for_scrub);
 
diff --git a/src/osd/PG.h b/src/osd/PG.h
index b146081c515..585c93e51b0 100644
--- a/src/osd/PG.h
+++ b/src/osd/PG.h
@@ -1091,7 +1091,7 @@ public:
 
     // metadata
     set<pg_shard_t> reserved_peers;
-    bool reserved, reserve_failed;
+    bool local_reserved, remote_reserved, reserve_failed;
     epoch_t epoch_start;  // common to both scrubs