From 7008b85fc562d228f61dc382a3e5068547acdc2a Mon Sep 17 00:00:00 2001
From: Ronen Friedman
Date: Sun, 14 Nov 2021 20:37:38 +0000
Subject: [PATCH] qa/standalone: test for scrub behavior when noscrub is set
 but nodeepscrub is not

A bug (https://tracker.ceph.com/issues/52901 - now fixed) resulted in
this combination of conditions leaving the PG in "scrubbing" state
forever.
That bug was fixed by PR#43521. The patch here adds a test to detect
the (now fixed) wrong behavior.

Signed-off-by: Ronen Friedman
---
 qa/standalone/scrub/osd-scrub-test.sh |  59 ++++++++++++++
 qa/standalone/scrub/scrub-helpers.sh  | 112 +++++++++++++++++++++++++-
 2 files changed, 167 insertions(+), 4 deletions(-)

diff --git a/qa/standalone/scrub/osd-scrub-test.sh b/qa/standalone/scrub/osd-scrub-test.sh
index 6a1132ef22555..b389465b2312c 100755
--- a/qa/standalone/scrub/osd-scrub-test.sh
+++ b/qa/standalone/scrub/osd-scrub-test.sh
@@ -441,6 +441,65 @@ function TEST_scrub_permit_time() {
     done
 }
 
+# a test to recreate the problem described in bug #52901 - setting 'noscrub'
+# without explicitly preventing deep scrubs made the PG 'unscrubbable'.
+# Fixed by PR#43521
+function TEST_just_deep_scrubs() {
+    local dir=$1
+    local -A cluster_conf=(
+        ['osds_num']="3"
+        ['pgs_in_pool']="4"
+        ['pool_name']="test"
+    )
+
+    standard_scrub_cluster $dir cluster_conf
+    local poolid=${cluster_conf['pool_id']}
+    local poolname=${cluster_conf['pool_name']}
+    echo "Pool: $poolname : $poolid"
+
+    TESTDATA="testdata.$$"
+    local objects=15
+    dd if=/dev/urandom of=$TESTDATA bs=1032 count=1
+    for i in `seq 1 $objects`
+    do
+        rados -p $poolname put obj${i} $TESTDATA
+    done
+    rm -f $TESTDATA
+
+    # set 'no scrub', then request a deep-scrub.
+    # we do not expect to see the scrub scheduled.
+
+    ceph osd set noscrub || return 1
+    sleep 6 # the 'noscrub' command takes a long time to reach the OSDs
+    local now_is=`date -I"ns"`
+    declare -A sched_data
+    local pgid="${poolid}.2"
+
+    # turn on the publishing of test data in the 'scrubber' section of 'pg query' output
+    set_query_debug $pgid
+
+    extract_published_sch $pgid $now_is $now_is sched_data
+    local saved_last_stamp=${sched_data['query_last_stamp']}
+    local dbg_counter_at_start=${sched_data['query_scrub_seq']}
+    echo "test counter @ start: $dbg_counter_at_start"
+
+    ceph pg $pgid deep_scrub
+
+    sleep 5 # 5s is the 'pg dump' interval
+    declare -A sc_data_2
+    extract_published_sch $pgid $now_is $now_is sc_data_2
+    echo "test counter @ should show no change: " ${sc_data_2['query_scrub_seq']}
+    (( ${sc_data_2['dmp_last_duration']} == 0)) || return 1
+    (( ${sc_data_2['query_scrub_seq']} == $dbg_counter_at_start)) || return 1
+
+    # unset the 'no scrub'. Deep scrubbing should start now.
+    ceph osd unset noscrub || return 1
+    sleep 5
+    declare -A expct_qry_duration=( ['query_last_duration']="0" ['query_last_duration_neg']="not0" )
+    sc_data_2=()
+    echo "test counter @ should be higher than before the unset: " ${sc_data_2['query_scrub_seq']}
+    wait_any_cond $pgid 10 $saved_last_stamp expct_qry_duration "WaitingAfterScrub " sc_data_2 || return 1
+}
 
 function TEST_dump_scrub_schedule() {
     local dir=$1
diff --git a/qa/standalone/scrub/scrub-helpers.sh b/qa/standalone/scrub/scrub-helpers.sh
index cf13732c53854..c7f48150cc1d4 100644
--- a/qa/standalone/scrub/scrub-helpers.sh
+++ b/qa/standalone/scrub/scrub-helpers.sh
@@ -66,9 +66,9 @@ function extract_published_sch() {
       query_last_duration: .info.stats.last_scrub_duration,
       query_last_stamp: .info.history.last_scrub_stamp,
       query_last_scrub: (.info.history.last_scrub| sub($spt;"x") ),
-      query_is_future: .q_when_is_future,
-      query_vs_date: .q_vs_date,
-
+      query_is_future: .q_when_is_future,
+      query_vs_date: .q_vs_date,
+      query_scrub_seq: .scrubber.test_sequence
      }
    '`
   (( extr_dbg >= 1 )) && echo $from_qry " " $from_dmp | jq -s -r 'add | "(",(to_entries | .[] | "["+(.key)+"]="+(.value|@sh)),")"'
@@ -171,7 +171,7 @@ function schedule_against_expected() {
   local -n ep=$2  # the expected results
   local extr_dbg=1
 
-  #turn off '-x' (but remember previous state)
+  # turn off '-x' (but remember previous state)
   local saved_echo_flag=${-//[^x]/}
   set +x
 
@@ -194,3 +194,107 @@
   if [[ -n "$saved_echo_flag" ]]; then set -x; fi
   return 0
 }
+
+
+# Start the cluster "nodes" and create a pool for testing.
+#
+# The OSDs are started with a set of parameters aimed at creating a repeatable
+# and stable scrub sequence:
+# - no scrub randomizations/backoffs
+# - no autoscaler
+#
+# $1: the test directory
+# $2: [in/out] an array of configuration values
+#
+# The function adds/updates the configuration dictionary with the name of the
+# pool created, and its ID.
+#
+# Argument 2 might look like this:
+#
+#  declare -A test_conf=(
+#    ['osds_num']="3"
+#    ['pgs_in_pool']="7"
+#    ['extras']="--extra1 --extra2"
+#    ['pool_name']="testpl"
+#  )
+function standard_scrub_cluster() {
+    local dir=$1
+    local -n args=$2
+
+    local OSDS=${args['osds_num']:-"3"}
+    local pg_num=${args['pgs_in_pool']:-"8"}
+    local poolname="${args['pool_name']:-test}"
+    args['pool_name']=$poolname
+    local extra_pars=${args['extras']}
+    local debug_msg=${args['msg']:-"dbg"}
+
+    # turn off '-x' (but remember previous state)
+    local saved_echo_flag=${-//[^x]/}
+    set +x
+
+    run_mon $dir a --osd_pool_default_size=$OSDS || return 1
+    run_mgr $dir x || return 1
+
+    local ceph_osd_args="--osd_deep_scrub_randomize_ratio=0 \
+            --osd_scrub_interval_randomize_ratio=0 \
+            --osd_scrub_backoff_ratio=0.0 \
+            --osd_pool_default_pg_autoscale_mode=off \
+            $extra_pars"
+
+    for osd in $(seq 0 $(expr $OSDS - 1))
+    do
+        run_osd $dir $osd $ceph_osd_args || return 1
+    done
+
+    create_pool $poolname $pg_num $pg_num
+    wait_for_clean || return 1
+
+    # update the in/out 'args' with the ID of the new pool
+    sleep 1
+    name_n_id=`ceph osd dump | awk '/^pool.*'$poolname'/ { gsub(/'"'"'/," ",$3); print $3," ", $2}'`
+    echo "standard_scrub_cluster: $debug_msg: test pool is $name_n_id"
+    args['pool_id']="${name_n_id##* }"
+    if [[ -n "$saved_echo_flag" ]]; then set -x; fi
+}
+
+
+# Start the cluster "nodes" and create a pool for testing - wpq version.
+#
+# A variant of standard_scrub_cluster() that selects the wpq scheduler and sets a value
+# for osd_scrub_sleep. To be used when the test is attempting to "catch" the scrubber
+# during an ongoing scrub.
+#
+# See standard_scrub_cluster() for more details.
+#
+# $1: the test directory
+# $2: [in/out] an array of configuration values
+# $3: osd_scrub_sleep
+#
+# The function adds/updates the configuration dictionary with the name of the
+# pool created, and its ID.
+function standard_scrub_wpq_cluster() {
+    local dir=$1
+    local -n conf=$2
+    local osd_sleep=$3
+
+    conf['extras']=" --osd_op_queue=wpq --osd_scrub_sleep=$osd_sleep ${conf['extras']}"
+
+    standard_scrub_cluster $dir conf || return 1
+}
+
+
+# A debug flag is set for the PG specified, causing the 'pg query' command to display
+# an additional 'scrub sessions counter' field.
+#
+# $1: PG id
+#
+function set_query_debug() {
+    local pgid=$1
+    local prim_osd=`ceph pg dump pgs_brief | \
+                   awk -v pg="^$pgid" -n -e '$0 ~ pg { print(gensub(/[^0-9]*([0-9]+).*/,"\\\\1","g",$5)); }' `
+
+    echo "Setting scrub debug data. Primary for $pgid is $prim_osd"
+    CEPH_ARGS='' ceph --format=json daemon $(get_asok_path osd.$prim_osd) \
+            scrubdebug $pgid set sessions
+}
+
-- 
2.39.5
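
For reference, a sketch of how the new test can be exercised locally. This assumes the usual Ceph standalone-test workflow (a compiled tree plus the qa/run-standalone.sh wrapper, which runs every TEST_* function in the given script); the exact invocation may differ between branches:

    # from the root of a built Ceph checkout (assumed layout)
    cd build
    # runs all tests in the script, including the new TEST_just_deep_scrubs
    ../qa/run-standalone.sh osd-scrub-test.sh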