teardown $dir || return 1
}
+#
+# Allow a repair to be scheduled while recovery is still in progress on the same OSD
+#
+function TEST_allow_repair_during_recovery() {
+ local dir=$1
+ local poolname=rbd
+
+ setup $dir || return 1
+ run_mon $dir a --osd_pool_default_size=2 || return 1
+ run_mgr $dir x || return 1
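+ # Disallow regular scrubs during recovery, allow explicitly requested repairs,
+ # and use the debug flag so each OSD reports recovery as active.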
+ run_osd $dir 0 --osd_scrub_during_recovery=false \
+ --osd_repair_during_recovery=true \
+ --osd_debug_pretend_recovery_active=true || return 1
+ run_osd $dir 1 --osd_scrub_during_recovery=false \
+ --osd_repair_during_recovery=true \
+ --osd_debug_pretend_recovery_active=true || return 1
+ create_rbd_pool || return 1
+ wait_for_clean || return 1
+
+ add_something $dir $poolname || return 1
+ corrupt_and_repair_one $dir $poolname $(get_not_primary $poolname SOMETHING) || return 1
+
+ teardown $dir || return 1
+}
+
+#
+# Skip a non-repair scrub correctly while recovery is active
+#
+function TEST_skip_non_repair_during_recovery() {
+ local dir=$1
+ local poolname=rbd
+
+ setup $dir || return 1
+ run_mon $dir a --osd_pool_default_size=2 || return 1
+ run_mgr $dir x || return 1
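+ # Same OSD settings as above: with recovery reported as active, a plain
+ # (non-repair) scrub request must not be scheduled.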
+ run_osd $dir 0 --osd_scrub_during_recovery=false \
+ --osd_repair_during_recovery=true \
+ --osd_debug_pretend_recovery_active=true || return 1
+ run_osd $dir 1 --osd_scrub_during_recovery=false \
+ --osd_repair_during_recovery=true \
+ --osd_debug_pretend_recovery_active=true || return 1
+ create_rbd_pool || return 1
+ wait_for_clean || return 1
+
+ add_something $dir $poolname || return 1
+ scrub_and_not_schedule $dir $poolname $(get_not_primary $poolname SOMETHING) || return 1
+
+ teardown $dir || return 1
+}
+
+function scrub_and_not_schedule() {
+ local dir=$1
+ local poolname=$2
+ local osd=$3
+
+ #
+ # 1) start a non-repair scrub
+ #
+ local pg=$(get_pg $poolname SOMETHING)
+ local last_scrub=$(get_last_scrub_stamp $pg)
+ ceph pg scrub $pg
+
+ #
+ # 2) Ensure the scrub is not scheduled
+ #
+ for ((i=0; i < 3; i++)); do
+ if test "$(get_last_scrub_stamp $pg)" '>' "$last_scrub" ; then
+ return 1
+ fi
+ sleep 1
+ done
+
+ #
+ # 3) Access to the object must still succeed
+ #
+ objectstore_tool $dir $osd SOMETHING list-attrs || return 1
+ rados --pool $poolname get SOMETHING $dir/COPY || return 1
+ diff $dir/ORIGINAL $dir/COPY || return 1
+}
+
function corrupt_and_repair_two() {
local dir=$1
local poolname=$2
OPTION(osd_max_push_objects, OPT_U64) // max objects in single push op
OPTION(osd_max_scrubs, OPT_INT)
OPTION(osd_scrub_during_recovery, OPT_BOOL) // Allow new scrubs to start while recovery is active on the OSD
+OPTION(osd_repair_during_recovery, OPT_BOOL) // Allow new, explicitly requested repairs to start while recovery is active on the OSD
OPTION(osd_scrub_begin_hour, OPT_INT)
OPTION(osd_scrub_end_hour, OPT_INT)
OPTION(osd_scrub_begin_week_day, OPT_INT)
OPTION(osd_debug_verify_cached_snaps, OPT_BOOL)
OPTION(osd_debug_deep_scrub_sleep, OPT_FLOAT)
OPTION(osd_debug_no_acting_change, OPT_BOOL)
+OPTION(osd_debug_pretend_recovery_active, OPT_BOOL)
OPTION(osd_enable_op_tracker, OPT_BOOL) // enable/disable OSD op tracking
OPTION(osd_num_op_tracker_shard, OPT_U32) // The number of shards for holding the ops
OPTION(osd_op_history_size, OPT_U32) // Max number of completed ops to track
.set_default(false)
.set_description("Allow scrubbing when PGs on the OSD are undergoing recovery"),
+ Option("osd_repair_during_recovery", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description("Allow requested repairing when PGs on the OSD are undergoing recovery"),
+
Option("osd_scrub_begin_hour", Option::TYPE_INT, Option::LEVEL_ADVANCED)
.set_default(0)
.set_description("Restrict scrubbing to this hour of the day or later")
Option("osd_debug_no_purge_strays", Option::TYPE_BOOL, Option::LEVEL_DEV)
.set_default(false),
+ Option("osd_debug_pretend_recovery_active", Option::TYPE_BOOL, Option::LEVEL_DEV)
+ .set_default(false)
+ .set_description(""),
+
Option("osd_enable_op_tracker", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
.set_default(true)
.set_description(""),
if (!service.can_inc_scrubs_pending()) {
return;
}
- if (!cct->_conf->osd_scrub_during_recovery && service.is_recovery_active()) {
- dout(20) << __func__ << " not scheduling scrubs due to active recovery" << dendl;
- return;
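+ // While recovery is active, either restrict scheduling to explicitly
+ // requested repairs or skip scrub scheduling entirely, depending on the
+ // osd_scrub_during_recovery and osd_repair_during_recovery settings.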
+ bool allow_requested_repair_only = false;
+ if (service.is_recovery_active()) {
+ if (!cct->_conf->osd_scrub_during_recovery && cct->_conf->osd_repair_during_recovery) {
+ dout(10) << __func__
+ << " will only schedule explicitly requested repair due to active recovery"
+ << dendl;
+ allow_requested_repair_only = true;
+ } else if (!cct->_conf->osd_scrub_during_recovery && !cct->_conf->osd_repair_during_recovery) {
+ dout(20) << __func__ << " not scheduling scrubs due to active recovery" << dendl;
+ return;
+ }
}
-
utime_t now = ceph_clock_now();
bool time_permit = scrub_time_permit(now);
bool load_is_low = scrub_load_below_threshold();
dout(30) << __func__ << ": already in progress pgid " << scrub.pgid << dendl;
continue;
}
+ // Skip other kinds of scrubbing if only explicitly requested repairs are allowed
+ if (allow_requested_repair_only && !pg->scrubber.must_repair) {
+ pg->unlock();
+ dout(10) << __func__ << " skip " << scrub.pgid
+ << " because repairing is not explicitly requested on it"
+ << dendl;
+ continue;
+ }
// If it is reserving, let it resolve before going to the next scrub job
if (pg->scrubber.reserved) {
pg->unlock();
bool OSDService::is_recovery_active()
{
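+ // Testing hook: report recovery as active regardless of reservations.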
+ if (cct->_conf->osd_debug_pretend_recovery_active) {
+ return true;
+ }
return local_reserver.has_reservation() || remote_reserver.has_reservation();
}
scrubber.need_auto = false;
ceph_assert(scrubber.reserved_peers.empty());
- if ((cct->_conf->osd_scrub_during_recovery || !osd->is_recovery_active()) &&
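+ // Scrubbing may be reserved if scrubs are permitted during recovery, this is
+ // an explicitly requested repair and repairs are permitted during recovery,
+ // or no recovery is currently active on this OSD.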
+ bool allow_scrubbing = cct->_conf->osd_scrub_during_recovery ||
+ (cct->_conf->osd_repair_during_recovery && scrubber.must_repair) ||
+ !osd->is_recovery_active();
+ if (allow_scrubbing &&
osd->inc_scrubs_pending()) {
dout(20) << __func__ << ": reserved locally, reserving replicas" << dendl;
scrubber.reserved = true;