recovery locks held (in_progress) and waiting in priority queues.
* New OSD daemon command dump_scrub_reservations which reveals the
- scrub reservations that are active and pending.
+ scrub reservations that are held for local (primary) and remote (replica) PGs.
+Scrub internals and diagnostics
+===============================
+
Scrubbing Behavior Table
-========================
+------------------------
+-------------------------------------------------+----------+-----------+---------------+----------------------+
| Flags                                            | none     | noscrub   | nodeep_scrub  | noscrub/nodeep_scrub |
- Initiated scrub state is must_scrub && !must_deep_scrub && !time_for_deep
- Initiated scrub after osd_deep_scrub_interval state is must_scrub && !must_deep_scrub && time_for_deep
- Initiated deep scrub state is must_scrub && must_deep_scrub
+
+Scrub Reservations
+------------------
+
+An OSD daemon command dumps the current local (primary) and remote (replica) scrub reservation counts::
+
+ ceph daemon osd.<id> dump_scrub_reservations
+
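+An illustrative result (the exact counts depend on scrub activity at the time;
+``osd_max_scrubs`` defaults to 1) might look like::
+
+  {
+      "scrubs_local": 1,
+      "scrubs_remote": 0,
+      "osd_max_scrubs": 1
+  }
+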
--- /dev/null
+#!/usr/bin/env bash
+#
+# Copyright (C) 2019 Red Hat <contact@redhat.com>
+#
+# Author: David Zafman <dzafman@redhat.com>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Library Public License for more details.
+#
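+# Overview: trigger scrubs on every PG of several pools at once, then use the
+# new dump_scrub_reservations admin socket command to verify that
+# scrubs_local + scrubs_remote on any single OSD never exceeds osd_max_scrubs
+# while the scrubs drain.
+#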
+
+source $CEPH_ROOT/qa/standalone/ceph-helpers.sh
+
+MAX_SCRUBS=4    # value used for osd_max_scrubs: cap on concurrent local+remote scrub reservations per OSD
+SCRUB_SLEEP=2   # value used for osd_scrub_sleep: per-chunk delay that keeps scrubs running long enough to observe
+POOL_SIZE=3     # replica count for the test pools (osd_pool_default_size)
+
+function run() {
+ local dir=$1
+ shift
+ local CHUNK_MAX=5
+
+ export CEPH_MON="127.0.0.1:7184" # git grep '\<7184\>' : there must be only one
+ export CEPH_ARGS
+    CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
+    CEPH_ARGS+="--mon-host=$CEPH_MON "
+    CEPH_ARGS+="--osd_max_scrubs=$MAX_SCRUBS "
+    CEPH_ARGS+="--osd_scrub_chunk_max=$CHUNK_MAX "
+    CEPH_ARGS+="--osd_scrub_sleep=$SCRUB_SLEEP "
+    CEPH_ARGS+="--osd_pool_default_size=$POOL_SIZE "
+
+ export -n CEPH_CLI_TEST_DUP_COMMAND
+ local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
+ for func in $funcs ; do
+ setup $dir || return 1
+ $func $dir || return 1
+ teardown $dir || return 1
+ done
+}
+
+function TEST_recover_unexpected() {
+ local dir=$1
+ shift
+ local OSDS=6
+ local PGS=16
+ local POOLS=3
+ local OBJS=1000
+
+ run_mon $dir a || return 1
+ run_mgr $dir x || return 1
+ for o in $(seq 0 $(expr $OSDS - 1))
+ do
+        run_osd $dir $o || return 1
+ done
+
+ for i in $(seq 1 $POOLS)
+ do
+        create_pool test$i $PGS $PGS || return 1
+ done
+
+ wait_for_clean || return 1
+
+ dd if=/dev/urandom of=datafile bs=4k count=2
+ for i in $(seq 1 $POOLS)
+ do
+ for j in $(seq 1 $OBJS)
+ do
+ rados -p test$i put obj$j datafile
+ done
+ done
+ rm datafile
+
+ ceph osd set noscrub
+ ceph osd set nodeep-scrub
+
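+    # Queue a scrub for every PG via its primary's admin socket. While the
+    # noscrub/nodeep-scrub flags are set the scrubs cannot start, so they all
+    # become runnable together when noscrub is cleared below.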
+ for qpg in $(ceph pg dump pgs --format=json-pretty | jq '.pg_stats[].pgid')
+ do
+ primary=$(ceph pg dump pgs --format=json | jq ".pg_stats[] | select(.pgid == $qpg) | .acting_primary")
+ eval pg=$qpg # strip quotes around qpg
+ CEPH_ARGS='' ceph daemon $(get_asok_path osd.$primary) trigger_scrub $pg
+ done
+
+ ceph pg dump pgs
+
+ max=$(CEPH_ARGS='' ceph daemon $(get_asok_path osd.0) dump_scrub_reservations | jq '.osd_max_scrubs')
+    if [ "$max" != "$MAX_SCRUBS" ];
+ then
+ echo "ERROR: Incorrect osd_max_scrubs from dump_scrub_reservations"
+ return 1
+ fi
+
+ ceph osd unset noscrub
+
+ ok=false
+ for i in $(seq 0 300)
+ do
+ ceph pg dump pgs
+ if ceph pg dump pgs | grep scrubbing; then
+ ok=true
+ break
+ fi
+ sleep 1
+ done
+ if test $ok = "false"; then
+      echo "ERROR: Test set-up failed: no scrubbing started"
+ return 1
+ fi
+
+ local total=0
+ local zerocount=0
+ local maxzerocount=3
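+    # Poll every OSD's reservation counts: fail if local + remote ever exceeds
+    # osd_max_scrubs, and stop once several polling passes have seen zero
+    # reservations (all scrubs have finished).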
+ while(true)
+ do
+ pass=0
+ for o in $(seq 0 $(expr $OSDS - 1))
+ do
+ CEPH_ARGS='' ceph daemon $(get_asok_path osd.$o) dump_scrub_reservations
+ scrubs=$(CEPH_ARGS='' ceph daemon $(get_asok_path osd.$o) dump_scrub_reservations | jq '.scrubs_local + .scrubs_remote')
+ if [ $scrubs -gt $MAX_SCRUBS ]; then
+ echo "ERROR: More than $MAX_SCRUBS currently reserved"
+ return 1
+ fi
+ pass=$(expr $pass + $scrubs)
+ done
+ if [ $pass = "0" ]; then
+ zerocount=$(expr $zerocount + 1)
+ fi
+ if [ $zerocount -gt $maxzerocount ]; then
+ break
+ fi
+ total=$(expr $total + $pass)
+ sleep $(expr $SCRUB_SLEEP \* 2)
+ done
+
+ # Check that there are no more scrubs
+ for i in $(seq 0 5)
+ do
+ if ceph pg dump pgs | grep scrubbing; then
+ echo "ERROR: Extra scrubs after test completion...not expected"
+ return 1
+ fi
+ sleep $SCRUB_SLEEP
+ done
+
+ echo $total total reservations seen
+
+    # A somewhat arbitrary lower bound: PGS * POOLS * POOL_SIZE is the number of scrub
+    # reservations that must occur (one on the primary plus one on each replica, for
+    # every PG). The polling loop above may count the same reservation more than once.
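+    # With the values above: 16 PGs * 3 pools * 3 replicas = 144 reservations at minimum.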
+ actual_reservations=$(expr $PGS \* $POOLS \* $POOL_SIZE)
+ if [ $total -lt $actual_reservations ]; then
+      echo "ERROR: Unexpectedly low number of scrub reservations seen during test"
+ return 1
+ fi
+
+ return 0
+}
+
+
+main osd-scrub-dump "$@"
+
+# Local Variables:
+# compile-command: "cd build ; make check && \
+# ../qa/run-standalone.sh osd-scrub-dump.sh"
+# End:
publish_lock{ceph::make_mutex("OSDService::publish_lock")},
pre_publish_lock{ceph::make_mutex("OSDService::pre_publish_lock")},
max_oldest_map(0),
- scrubs_pending(0),
- scrubs_active(0),
+ scrubs_local(0),
+ scrubs_remote(0),
agent_valid_iterator(false),
agent_ops(0),
flush_mode_high_count(0),
// --------------------------------------
// dispatch
-bool OSDService::can_inc_scrubs_pending()
+bool OSDService::can_inc_scrubs()
{
bool can_inc = false;
std::lock_guard l(sched_scrub_lock);
- if (scrubs_pending + scrubs_active < cct->_conf->osd_max_scrubs) {
- dout(20) << __func__ << " " << scrubs_pending << " -> " << (scrubs_pending+1)
- << " (max " << cct->_conf->osd_max_scrubs << ", active " << scrubs_active
- << ")" << dendl;
+ if (scrubs_local + scrubs_remote < cct->_conf->osd_max_scrubs) {
+ dout(20) << __func__ << " == true " << scrubs_local << " local + " << scrubs_remote
+ << " remote < max " << cct->_conf->osd_max_scrubs << dendl;
can_inc = true;
} else {
- dout(20) << __func__ << " " << scrubs_pending << " + " << scrubs_active
- << " active >= max " << cct->_conf->osd_max_scrubs << dendl;
+ dout(20) << __func__ << " == false " << scrubs_local << " local + " << scrubs_remote
+ << " remote >= max " << cct->_conf->osd_max_scrubs << dendl;
}
return can_inc;
}
-bool OSDService::inc_scrubs_pending()
+bool OSDService::inc_scrubs_local()
{
bool result = false;
std::lock_guard l{sched_scrub_lock};
- if (scrubs_pending + scrubs_active < cct->_conf->osd_max_scrubs) {
- dout(20) << "inc_scrubs_pending " << scrubs_pending << " -> " << (scrubs_pending+1)
- << " (max " << cct->_conf->osd_max_scrubs << ", active " << scrubs_active << ")" << dendl;
+ if (scrubs_local + scrubs_remote < cct->_conf->osd_max_scrubs) {
+ dout(20) << __func__ << " " << scrubs_local << " -> " << (scrubs_local+1)
+ << " (max " << cct->_conf->osd_max_scrubs << ", remote " << scrubs_remote << ")" << dendl;
result = true;
- ++scrubs_pending;
+ ++scrubs_local;
} else {
- dout(20) << "inc_scrubs_pending " << scrubs_pending << " + " << scrubs_active << " active >= max " << cct->_conf->osd_max_scrubs << dendl;
+ dout(20) << __func__ << " " << scrubs_local << " local + " << scrubs_remote << " remote >= max " << cct->_conf->osd_max_scrubs << dendl;
}
return result;
}
-void OSDService::dec_scrubs_pending()
+void OSDService::dec_scrubs_local()
{
std::lock_guard l{sched_scrub_lock};
- dout(20) << "dec_scrubs_pending " << scrubs_pending << " -> " << (scrubs_pending-1)
- << " (max " << cct->_conf->osd_max_scrubs << ", active " << scrubs_active << ")" << dendl;
- --scrubs_pending;
- ceph_assert(scrubs_pending >= 0);
+ dout(20) << __func__ << " " << scrubs_local << " -> " << (scrubs_local-1)
+ << " (max " << cct->_conf->osd_max_scrubs << ", remote " << scrubs_remote << ")" << dendl;
+ --scrubs_local;
+ ceph_assert(scrubs_local >= 0);
}
-void OSDService::inc_scrubs_active(bool reserved)
+bool OSDService::inc_scrubs_remote()
{
+ bool result = false;
std::lock_guard l{sched_scrub_lock};
- ++(scrubs_active);
- if (reserved) {
- --(scrubs_pending);
- dout(20) << "inc_scrubs_active " << (scrubs_active-1) << " -> " << scrubs_active
- << " (max " << cct->_conf->osd_max_scrubs
- << ", pending " << (scrubs_pending+1) << " -> " << scrubs_pending << ")" << dendl;
- ceph_assert(scrubs_pending >= 0);
+ if (scrubs_local + scrubs_remote < cct->_conf->osd_max_scrubs) {
+ dout(20) << __func__ << " " << scrubs_remote << " -> " << (scrubs_remote+1)
+ << " (max " << cct->_conf->osd_max_scrubs << ", local " << scrubs_local << ")" << dendl;
+ result = true;
+ ++scrubs_remote;
} else {
- dout(20) << "inc_scrubs_active " << (scrubs_active-1) << " -> " << scrubs_active
- << " (max " << cct->_conf->osd_max_scrubs
- << ", pending " << scrubs_pending << ")" << dendl;
+ dout(20) << __func__ << " " << scrubs_local << " local + " << scrubs_remote << " remote >= max " << cct->_conf->osd_max_scrubs << dendl;
}
+ return result;
}
-void OSDService::dec_scrubs_active()
+void OSDService::dec_scrubs_remote()
{
std::lock_guard l{sched_scrub_lock};
- dout(20) << "dec_scrubs_active " << scrubs_active << " -> " << (scrubs_active-1)
- << " (max " << cct->_conf->osd_max_scrubs << ", pending " << scrubs_pending << ")" << dendl;
- --scrubs_active;
- ceph_assert(scrubs_active >= 0);
+ dout(20) << __func__ << " " << scrubs_remote << " -> " << (scrubs_remote-1)
+ << " (max " << cct->_conf->osd_max_scrubs << ", local " << scrubs_local << ")" << dendl;
+ --scrubs_remote;
+ ceph_assert(scrubs_remote >= 0);
}
void OSDService::dump_scrub_reservations(Formatter *f)
{
std::lock_guard l{sched_scrub_lock};
- f->dump_int("scrubs_active", scrubs_active);
- f->dump_int("scrubs_pending", scrubs_pending);
+ f->dump_int("scrubs_local", scrubs_local);
+ f->dump_int("scrubs_remote", scrubs_remote);
f->dump_int("osd_max_scrubs", cct->_conf->osd_max_scrubs);
}
void OSD::sched_scrub()
{
// if not permitted, fail fast
- if (!service.can_inc_scrubs_pending()) {
+ if (!service.can_inc_scrubs()) {
return;
}
bool allow_requested_repair_only = false;
continue;
}
// If it is reserving, let it resolve before going to the next scrub job
- if (pg->scrubber.reserved) {
+ if (pg->scrubber.local_reserved && !pg->scrubber.active) {
pg->unlock();
dout(30) << __func__ << ": reserve in progress pgid " << scrub.pgid << dendl;
break;
private:
// -- scrub scheduling --
ceph::mutex sched_scrub_lock = ceph::make_mutex("OSDService::sched_scrub_lock");
- int scrubs_pending;
- int scrubs_active;
+ int scrubs_local;
+ int scrubs_remote;
public:
struct ScrubJob {
f->close_section();
}
- bool can_inc_scrubs_pending();
- bool inc_scrubs_pending();
- void inc_scrubs_active(bool reserved);
- void dec_scrubs_pending();
- void dec_scrubs_active();
+ bool can_inc_scrubs();
+ bool inc_scrubs_local();
+ void dec_scrubs_local();
+ bool inc_scrubs_remote();
+ void dec_scrubs_remote();
void dump_scrub_reservations(Formatter *f);
void reply_op_error(OpRequestRef op, int err);
}
PG::Scrubber::Scrubber()
- : reserved(false), reserve_failed(false),
+ : local_reserved(false), remote_reserved(false), reserve_failed(false),
epoch_start(0),
active(false),
shallow_errors(0), deep_errors(0), fixed(0),
/*
* when holding pg and sched_scrub_lock, then the states are:
* scheduling:
- * scrubber.reserved = true
- * scrub_rserved_peers includes whoami
- * osd->scrub_pending++
+ * scrubber.local_reserved = true
+ * scrubber.active = false
+ * scrubber.reserved_peers includes whoami
+ * osd->scrubs_local++
* scheduling, replica declined:
- * scrubber.reserved = true
+ * scrubber.local_reserved = true
* scrubber.reserved_peers includes -1
- * osd->scrub_pending++
+ * osd->scrubs_local++
* pending:
- * scrubber.reserved = true
+ * scrubber.local_reserved = true
+ * scrubber.active = false
* scrubber.reserved_peers.size() == acting.size();
* pg on scrub_wq
- * osd->scrub_pending++
+ * osd->scrubs_local++
* scrubbing:
- * scrubber.reserved = false;
+ * scrubber.local_reserved = true;
+ * scrubber.active = true
* scrubber.reserved_peers empty
- * osd->scrubber.active++
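+ * (on each replica that granted its reservation:
+ *  scrubber.remote_reserved = true and osd->scrubs_remote was incremented)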
*/
// returns true if a scrub has been newly kicked off
// All processing the first time through commits us to whatever
// choices are made.
- if (!scrubber.reserved) {
+ if (!scrubber.local_reserved) {
dout(20) << __func__ << ": Start processing pg " << info.pgid << dendl;
bool allow_deep_scrub = !(get_osdmap()->test_flag(CEPH_OSDMAP_NODEEP_SCRUB) ||
(cct->_conf->osd_repair_during_recovery && scrubber.must_repair) ||
!osd->is_recovery_active();
if (allow_scrubing &&
- osd->inc_scrubs_pending()) {
+ osd->inc_scrubs_local()) {
dout(20) << __func__ << ": reserved locally, reserving replicas" << dendl;
- scrubber.reserved = true;
+ scrubber.local_reserved = true;
scrubber.reserved_peers.insert(pg_whoami);
scrub_reserve_replicas();
} else {
}
}
- if (scrubber.reserved) {
+ if (scrubber.local_reserved) {
if (scrubber.reserve_failed) {
dout(20) << __func__ << ": failed, a peer declined" << dendl;
clear_scrub_reserved();
{
dout(7) << __func__ << " " << *op->get_req() << dendl;
op->mark_started();
- if (scrubber.reserved) {
+ if (scrubber.remote_reserved) {
dout(10) << __func__ << " ignoring reserve request: Already reserved"
<< dendl;
return;
}
if ((cct->_conf->osd_scrub_during_recovery || !osd->is_recovery_active()) &&
- osd->inc_scrubs_pending()) {
- scrubber.reserved = true;
+ osd->inc_scrubs_remote()) {
+ scrubber.remote_reserved = true;
} else {
dout(20) << __func__ << ": failed to reserve remotely" << dendl;
- scrubber.reserved = false;
+ scrubber.remote_reserved = false;
}
auto m = op->get_req<MOSDScrubReserve>();
Message *reply = new MOSDScrubReserve(
spg_t(info.pgid.pgid, get_primary().shard),
m->map_epoch,
- scrubber.reserved ? MOSDScrubReserve::GRANT : MOSDScrubReserve::REJECT,
+ scrubber.remote_reserved ? MOSDScrubReserve::GRANT : MOSDScrubReserve::REJECT,
pg_whoami);
osd->send_message_osd_cluster(reply, op->get_req()->get_connection());
}
{
dout(7) << __func__ << " " << *op->get_req() << dendl;
op->mark_started();
- if (!scrubber.reserved) {
+ if (!scrubber.local_reserved) {
dout(10) << "ignoring obsolete scrub reserve reply" << dendl;
return;
}
{
dout(7) << __func__ << " " << *op->get_req() << dendl;
op->mark_started();
- if (!scrubber.reserved) {
+ if (!scrubber.local_reserved) {
dout(10) << "ignoring obsolete scrub reserve reply" << dendl;
return;
}
scrubber.reserved_peers.clear();
scrubber.reserve_failed = false;
- if (scrubber.reserved) {
- scrubber.reserved = false;
- osd->dec_scrubs_pending();
+ if (scrubber.local_reserved) {
+ scrubber.local_reserved = false;
+ osd->dec_scrubs_local();
+ }
+ if (scrubber.remote_reserved) {
+ scrubber.remote_reserved = false;
+ osd->dec_scrubs_remote();
}
}
scrubber.epoch_start = info.history.same_interval_since;
scrubber.active = true;
- osd->inc_scrubs_active(scrubber.reserved);
- if (scrubber.reserved) {
- scrubber.reserved = false;
- scrubber.reserved_peers.clear();
- }
-
{
ObjectStore::Transaction t;
scrubber.cleanup_store(&t);
state_clear(PG_STATE_DEEP_SCRUB);
publish_stats_to_osd();
- // active -> nothing.
- if (scrubber.active)
- osd->dec_scrubs_active();
+ // local -> nothing.
+ if (scrubber.local_reserved) {
+ osd->dec_scrubs_local();
+ scrubber.local_reserved = false;
+ scrubber.reserved_peers.clear();
+ }
requeue_ops(waiting_for_scrub);
// metadata
set<pg_shard_t> reserved_peers;
- bool reserved, reserve_failed;
+ bool local_reserved, remote_reserved, reserve_failed;
epoch_t epoch_start;
// common to both scrubs