From: Ronen Friedman
Date: Thu, 9 Nov 2023 15:21:42 +0000 (-0600)
Subject: tests/standalone: fix scrub-related tests following command changes
X-Git-Tag: v19.0.0~85^2
X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=fbb7d73e6f2070fb1732ec2d94ae05d23a31e51b;p=ceph.git

tests/standalone: fix scrub-related tests following command changes

Use 'ceph tell $pgid [deep-]scrub' to initiate an 'operator initiated'
scrub, and 'ceph tell $pgid schedule[-deep]-scrub' to cause a
'periodic' scrub to be scheduled.

Signed-off-by: Ronen Friedman
---

diff --git a/qa/standalone/ceph-helpers.sh b/qa/standalone/ceph-helpers.sh
index bf2c91bc04274..fc4756daf8b13 100755
--- a/qa/standalone/ceph-helpers.sh
+++ b/qa/standalone/ceph-helpers.sh
@@ -1865,6 +1865,9 @@ function test_repair() {
 # **get_last_scrub_stamp** function reports a timestamp different from
 # the one stored before starting the scrub.
 #
+# The scrub is initiated using the "operator initiated" method, and
+# the scrub triggered is not subject to no-scrub flags etc.
+#
 # @param pgid the id of the PG
 # @return 0 on success, 1 on error
 #
@@ -1899,6 +1902,48 @@ function test_pg_scrub() {
 
 #######################################################################
 
+##
+# Trigger a "scheduled" scrub on **pgid** (by manually modifying the relevant
+# last-scrub stamp) and wait until it completes. The pg_scrub
+# function will fail if scrubbing does not complete within $TIMEOUT
+# seconds. The pg_scrub is complete whenever the
+# **get_last_scrub_stamp** function reports a timestamp different from
+# the one stored before starting the scrub.
+#
+# @param pgid the id of the PG
+# @return 0 on success, 1 on error
+#
+function pg_schedule_scrub() {
+    local pgid=$1
+    local last_scrub=$(get_last_scrub_stamp $pgid)
+    ceph pg scrub $pgid
+    wait_for_scrub $pgid "$last_scrub"
+}
+
+function pg_schedule_deep_scrub() {
+    local pgid=$1
+    local last_scrub=$(get_last_scrub_stamp $pgid last_deep_scrub_stamp)
+    ceph pg deep-scrub $pgid
+    wait_for_scrub $pgid "$last_scrub" last_deep_scrub_stamp
+}
+
+function test_pg_schedule_scrub() {
+    local dir=$1
+
+    setup $dir || return 1
+    run_mon $dir a --osd_pool_default_size=1 --mon_allow_pool_size_one=true || return 1
+    run_mgr $dir x || return 1
+    run_osd $dir 0 || return 1
+    create_rbd_pool || return 1
+    wait_for_clean || return 1
+    pg_schedule_scrub 1.0 || return 1
+    kill_daemons $dir KILL osd || return 1
+    ! TIMEOUT=1 pg_scrub 1.0 || return 1
+    teardown $dir || return 1
+}
+
+#######################################################################
+
 ##
 # Run the *command* and expect it to fail (i.e. return a non zero status).
 # The output (stderr and stdout) is stored in a temporary file in *dir*
diff --git a/qa/standalone/scrub/osd-mapper.sh b/qa/standalone/scrub/osd-mapper.sh
index ed18f94f1af1f..bfe57eac03e9b 100755
--- a/qa/standalone/scrub/osd-mapper.sh
+++ b/qa/standalone/scrub/osd-mapper.sh
@@ -77,7 +77,7 @@ function TEST_truncated_sna_record() {
     (( extr_dbg >= 1 )) && rados --format json-pretty -p $poolname listsnaps $objname
 
     # scrub the PG
-    ceph pg $pgid deep_scrub || return 1
+    ceph pg $pgid deep-scrub || return 1
 
     # we aren't just waiting for the scrub to terminate, but also for the
     # logs to be published
@@ -149,7 +149,7 @@ function TEST_truncated_sna_record() {
     local cur_prim=`ceph --format=json-pretty osd map $poolname $objname | jq -r '.up[0]'`
     ceph pg dump pgs
     sleep 2
-    ceph pg $pgid deep_scrub || return 1
+    ceph pg $pgid deep-scrub || return 1
    sleep 5
     ceph pg dump pgs
     (( extr_dbg >= 1 )) && grep -a "ERR" $dir/osd.$cur_prim.log
@@ -161,7 +161,7 @@ function TEST_truncated_sna_record() {
     echo "prev count: $prev_err_cnt"
 
     # scrub again. No errors expected this time
-    ceph pg $pgid deep_scrub || return 1
+    ceph pg $pgid deep-scrub || return 1
     sleep 5
     ceph pg dump pgs
     (( extr_dbg >= 1 )) && grep -a "ERR" $dir/osd.$cur_prim.log
diff --git a/qa/standalone/scrub/osd-scrub-repair.sh b/qa/standalone/scrub/osd-scrub-repair.sh
index 13b30360c4e06..008e8ea1959dd 100755
--- a/qa/standalone/scrub/osd-scrub-repair.sh
+++ b/qa/standalone/scrub/osd-scrub-repair.sh
@@ -16,6 +16,7 @@
 # set -x
 
 source $CEPH_ROOT/qa/standalone/ceph-helpers.sh
+source $CEPH_ROOT/qa/standalone/scrub/scrub-helpers.sh
 
 if [ `uname` = FreeBSD ]; then
     # erasure coding overwrites are only tested on Bluestore
@@ -160,7 +161,7 @@ function scrub_and_not_schedule() {
     #
     local pg=$(get_pg $poolname SOMETHING)
     local last_scrub=$(get_last_scrub_stamp $pg)
-    ceph pg scrub $pg
+    ceph tell $pg schedule-scrub
 
     #
     # 2) Assure the scrub is not scheduled
@@ -329,8 +330,7 @@ function initiate_and_fetch_state() {
     date --rfc-3339=ns
 
     # note: must initiate a "regular" (periodic) deep scrub - not an operator-initiated one
-    env CEPH_ARGS= ceph --format json daemon $(get_asok_path $the_osd) deep_scrub "$pgid"
-    env CEPH_ARGS= ceph --format json daemon $(get_asok_path $the_osd) scrub "$pgid"
+    env CEPH_ARGS= ceph --format json daemon $(get_asok_path $the_osd) schedule-deep-scrub "$pgid"
 
     # wait for 'scrubbing' to appear
     for ((i=0; i < 80; i++)); do
@@ -436,19 +436,17 @@ function TEST_auto_repair_bluestore_tag() {
 
 function TEST_auto_repair_bluestore_basic() {
     local dir=$1
-    local poolname=testpool
-
-    # Launch a cluster with 5 seconds scrub interval
-    run_mon $dir a || return 1
-    run_mgr $dir x || return 1
-    local ceph_osd_args="--osd-scrub-auto-repair=true \
-            --osd_deep_scrub_randomize_ratio=0 \
-            --osd-scrub-interval-randomize-ratio=0"
-    for id in $(seq 0 2) ; do
-        run_osd $dir $id $ceph_osd_args || return 1
-    done
+    local -A cluster_conf=(
+        ['osds_num']="3"
+        ['pgs_in_pool']="1"
+        ['pool_name']="testpool"
+        ['extras']=" --osd_scrub_auto_repair=true"
+    )
+    local extr_dbg=3
+    standard_scrub_cluster $dir cluster_conf
+    local poolid=${cluster_conf['pool_id']}
+    local poolname=${cluster_conf['pool_name']}
 
-    create_pool $poolname 1 1 || return 1
     ceph osd pool set $poolname size 2
     wait_for_clean || return 1
 
@@ -460,12 +458,14 @@ function TEST_auto_repair_bluestore_basic() {
     # Remove the object from one shard physically
     # Restarted osd get $ceph_osd_args passed
     objectstore_tool $dir $(get_not_primary $poolname SOMETHING) SOMETHING remove || return 1
+    ceph tell osd.* config set osd_scrub_auto_repair true
 
     local pgid=$(get_pg $poolname SOMETHING)
     local primary=$(get_primary $poolname SOMETHING)
     local last_scrub_stamp="$(get_last_scrub_stamp $pgid)"
-    ceph tell $pgid deep_scrub
-    ceph tell $pgid scrub
+    # note: the scrub initiated must be a "regular" (periodic) deep scrub - not an
+    # operator-initiated one (as there's no 'auto-repair' for the latter)
+    ceph tell $pgid schedule-deep-scrub
 
     # Wait for auto repair
     wait_for_scrub $pgid "$last_scrub_stamp" || return 1
@@ -510,12 +510,16 @@ function TEST_auto_repair_bluestore_scrub() {
     local pgid=$(get_pg $poolname SOMETHING)
     local primary=$(get_primary $poolname SOMETHING)
     local last_scrub_stamp="$(get_last_scrub_stamp $pgid)"
-    ceph tell $pgid scrub
+    ceph tell $pgid schedule-scrub
 
     # Wait for scrub -> auto repair
     wait_for_scrub $pgid "$last_scrub_stamp" || return 1
     ceph pg dump pgs
     # Actually this causes 2 scrubs, so we better wait a little longer
+    sleep 2
+    ceph pg dump pgs
+    sleep 2
+    ceph pg dump pgs
     sleep 5
     wait_for_clean || return 1
     ceph pg dump pgs
@@ -567,8 +571,7 @@ function TEST_auto_repair_bluestore_failed() {
     local pgid=$(get_pg $poolname obj1)
     local primary=$(get_primary $poolname obj1)
     local last_scrub_stamp="$(get_last_scrub_stamp $pgid)"
-    ceph tell $pgid deep_scrub
-    ceph tell $pgid scrub
+    ceph tell $pgid schedule-deep-scrub
 
     # Wait for auto repair
     wait_for_scrub $pgid "$last_scrub_stamp" || return 1
@@ -631,12 +634,12 @@ function TEST_auto_repair_bluestore_failed_norecov() {
     # obj2 can't be repaired
     objectstore_tool $dir $(get_not_primary $poolname SOMETHING) obj2 remove || return 1
     objectstore_tool $dir $(get_primary $poolname SOMETHING) obj2 rm-attr _ || return 1
+    ceph tell osd.* config set osd_scrub_auto_repair true
 
     local pgid=$(get_pg $poolname obj1)
     local primary=$(get_primary $poolname obj1)
     local last_scrub_stamp="$(get_last_scrub_stamp $pgid)"
-    ceph tell $pgid deep_scrub
-    ceph tell $pgid scrub
+    ceph tell $pgid schedule-deep-scrub
 
     # Wait for auto repair
     wait_for_scrub $pgid "$last_scrub_stamp" || return 1
@@ -5793,7 +5796,7 @@ function TEST_periodic_scrub_replicated() {
     flush_pg_stats
     local last_scrub=$(get_last_scrub_stamp $pg)
     # Fake a schedule scrub
-    ceph tell $pg scrub || return 1
+    ceph tell $pg schedule-scrub || return 1
     # Wait for schedule regular scrub
     wait_for_scrub $pg "$last_scrub"
@@ -5811,7 +5814,7 @@ function TEST_periodic_scrub_replicated() {
     sleep 5
 
     # Fake a schedule scrub
-    ceph tell $pg scrub || return 1
+    ceph tell $pg schedule-scrub || return 1
     # Wait for schedule regular scrub
     # to notice scrub and skip it
     local found=false
@@ -5828,7 +5831,7 @@ function TEST_periodic_scrub_replicated() {
     flush_pg_stats
 
     # Request a regular scrub and it will be done
-    pg_scrub $pg
+    pg_schedule_scrub $pg
     grep -q "Regular scrub request, deep-scrub details will be lost" $dir/osd.${primary}.log || return 1
 
     # deep-scrub error is no longer present
@@ -5885,7 +5888,7 @@ function TEST_scrub_warning() {
         else
             overdue_seconds=$conf_overdue_seconds
         fi
-        ceph tell ${i}.0 scrub $(expr ${overdue_seconds} + ${i}00) || return 1
+        ceph tell ${i}.0 schedule-scrub $(expr ${overdue_seconds} + ${i}00) || return 1
     done
     # Fake schedule deep scrubs
     for i in $(seq $(expr $scrubs + 1) $(expr $scrubs + $deep_scrubs))
     do
         if [ $i = "3" ]; then
             overdue_seconds=$pool_overdue_seconds
         else
             overdue_seconds=$conf_overdue_seconds
         fi
-        ceph tell ${i}.0 deep_scrub $(expr ${overdue_seconds} + ${i}00) || return 1
+        ceph tell ${i}.0 schedule-deep-scrub $(expr ${overdue_seconds} + ${i}00) || return 1
     done
     flush_pg_stats
@@ -5905,7 +5908,7 @@ function TEST_scrub_warning() {
     ceph health | grep -q " pgs not deep-scrubbed in time" || return 1
     ceph health | grep -q " pgs not scrubbed in time" || return 1
 
-    # note that the 'ceph tell pg deep_scrub' command now also sets the regular scrub
+    # note that the 'ceph tell pg deep-scrub' command now also sets the regular scrub
     # time-stamp. I.e. - all 'late for deep scrubbing' pgs are also late for
     # regular scrubbing. For now, we'll allow both responses.
     COUNT=$(ceph health detail | grep "not scrubbed since" | wc -l)
@@ -6222,15 +6225,15 @@ function TEST_request_scrub_priority() {
         otherpgs="${otherpgs}${opg} "
         local other_last_scrub=$(get_last_scrub_stamp $pg)
         # Fake a schedule scrub
-        ceph tell $opg scrub $opg || return 1
+        ceph tell $opg schedule-scrub $opg || return 1
     done
 
     sleep 15
     flush_pg_stats
 
-    # Request a regular scrub and it will be done
+    # Force a shallow scrub and it will be done
     local last_scrub=$(get_last_scrub_stamp $pg)
-    ceph pg scrub $pg
+    ceph tell $pg scrub || return 1
 
     ceph osd unset noscrub || return 1
     ceph osd unset nodeep-scrub || return 1
diff --git a/qa/standalone/scrub/osd-scrub-test.sh b/qa/standalone/scrub/osd-scrub-test.sh
index 354bd22880e36..e11100f8a1988 100755
--- a/qa/standalone/scrub/osd-scrub-test.sh
+++ b/qa/standalone/scrub/osd-scrub-test.sh
@@ -231,11 +231,11 @@ function TEST_scrub_extended_sleep() {
     create_pool $poolname 1 1
     wait_for_clean || return 1
 
-    # Trigger a scrub on a PG
+    # Trigger a periodic scrub on a PG (no 'extended sleep' for high-priority scrubs)
     local pgid=$(get_pg $poolname SOMETHING)
     local primary=$(get_primary $poolname SOMETHING)
     local last_scrub=$(get_last_scrub_stamp $pgid)
-    ceph tell $pgid scrub || return 1
+    ceph tell $pgid schedule-scrub || return 1
 
     # Allow scrub to start extended sleep
     PASSED="false"
@@ -330,12 +330,7 @@ function _scrub_abort() {
     local primary=$(get_primary $poolname obj1)
     local pgid="${poolid}.0"
 
-    ceph tell $pgid $type || return 1
-    # deep-scrub won't start without scrub noticing
-    if [ "$type" = "deep_scrub" ];
-    then
-        ceph tell $pgid scrub || return 1
-    fi
+    ceph tell $pgid schedule-$type || return 1
 
     # Wait for scrubbing to start
     set -o pipefail
@@ -359,7 +354,7 @@ function _scrub_abort() {
     fi
 
     ceph osd set $stopscrub
-    if [ "$type" = "deep_scrub" ];
+    if [ "$type" = "deep-scrub" ];
     then
         ceph osd set noscrub
     fi
@@ -390,7 +385,7 @@ function _scrub_abort() {
     ceph config set osd "osd_scrub_sleep" "0.1"
 
     ceph osd unset $stopscrub
-    if [ "$type" = "deep_scrub" ];
+    if [ "$type" = "deep-scrub" ];
     then
         ceph osd unset noscrub
     fi
@@ -405,7 +400,7 @@ function TEST_scrub_abort() {
 
 function TEST_deep_scrub_abort() {
     local dir=$1
-    _scrub_abort $dir deep_scrub
+    _scrub_abort $dir deep-scrub
 }
 
 function TEST_scrub_permit_time() {
@@ -441,7 +436,7 @@ function TEST_scrub_permit_time() {
     # current time to set last_scrub_stamp, it sets the deadline
     # back by osd_max_interval which would cause the time permit checking
    # to be skipped. Set back 1 day, the default scrub_min_interval.
-    ceph tell $pgid scrub $(( 24 * 60 * 60 )) || return 1
+    ceph tell $pgid schedule-scrub $(( 24 * 60 * 60 )) || return 1
 
     # Scrub should not run
     for ((i=0; i < 30; i++)); do
@@ -495,7 +490,7 @@ function TEST_just_deep_scrubs() {
 
     local dbg_counter_at_start=${sched_data['query_scrub_seq']}
     echo "test counter @ start: $dbg_counter_at_start"
-    ceph pg $pgid deep_scrub
+    ceph tell $pgid schedule-deep-scrub
 
     sleep 5 # 5s is the 'pg dump' interval
     declare -A sc_data_2
@@ -574,8 +569,7 @@ function TEST_dump_scrub_schedule() {
 
     saved_last_stamp=${sched_data['query_last_stamp']}
     ceph tell osd.* config set osd_scrub_sleep "0"
-    ceph pg deep-scrub $pgid
-    ceph pg scrub $pgid
+    ceph tell $pgid deep-scrub
 
     # wait for the 'last duration' entries to change. Note that the 'dump' one will need
     # up to 5 seconds to sync
@@ -602,7 +596,7 @@ function TEST_dump_scrub_schedule() {
 
     sleep 2
     saved_last_stamp=${sched_data['query_last_stamp']}
-    ceph pg $pgid scrub
+    ceph tell $pgid schedule-scrub
     sleep 1
     sched_data=()
     declare -A expct_scrub_peri_sched=( ['query_is_future']="false" )
diff --git a/src/ceph.in b/src/ceph.in
index 2ba2c74768cf8..11a76511a8ec2 100755
--- a/src/ceph.in
+++ b/src/ceph.in
@@ -1310,7 +1310,7 @@ def main():
         if final_e:
             raise final_e
 
-    # Block until command completion (currently scrub and deep_scrub only)
+    # Block until command completion (currently scrub and deep scrub only)
     if block:
         wait(childargs, waitdata)
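
For reference, a minimal usage sketch contrasting the two initiation paths this
change standardizes on (the PG id 1.0 is illustrative only; per the helper
comments above, the operator-initiated form is not subject to the no-scrub
flags, while the schedule-* form is handled like a regular periodic scrub):

    # operator-initiated scrub / deep scrub
    ceph tell 1.0 scrub
    ceph tell 1.0 deep-scrub

    # ask for a 'periodic' scrub / deep scrub to be scheduled instead
    ceph tell 1.0 schedule-scrub
    ceph tell 1.0 schedule-deep-scrub

    # helper added to ceph-helpers.sh: trigger a scheduled scrub on PG 1.0
    # and wait for its last-scrub stamp to change
    pg_schedule_scrub 1.0 || return 1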