From 7008b85fc562d228f61dc382a3e5068547acdc2a Mon Sep 17 00:00:00 2001
From: Ronen Friedman
Date: Sun, 14 Nov 2021 20:37:38 +0000
Subject: [PATCH] qa/standalone: test for scrub behavior when noscrub is set
 but nodeepscrub is not

A bug (https://tracker.ceph.com/issues/52901 - now fixed) resulted in
this combination of conditions leaving the PG in "scrubbing" state
forever.
That bug was fixed by PR#43521. The patch here adds a test to detect
the (now fixed) wrong behavior.

Signed-off-by: Ronen Friedman
---
 qa/standalone/scrub/osd-scrub-test.sh |  59 ++++++++++++++
 qa/standalone/scrub/scrub-helpers.sh  | 112 +++++++++++++++++++++++++-
 2 files changed, 167 insertions(+), 4 deletions(-)

diff --git a/qa/standalone/scrub/osd-scrub-test.sh b/qa/standalone/scrub/osd-scrub-test.sh
index 6a1132ef22555..b389465b2312c 100755
--- a/qa/standalone/scrub/osd-scrub-test.sh
+++ b/qa/standalone/scrub/osd-scrub-test.sh
@@ -441,6 +441,65 @@ function TEST_scrub_permit_time() {
     done
 }
 
+# a test to recreate the problem described in bug #52901 - setting 'noscrub'
+# without explicitly preventing deep scrubs made the PG 'unscrubbable'.
+# Fixed by PR#43521
+function TEST_just_deep_scrubs() {
+    local dir=$1
+    local -A cluster_conf=(
+        ['osds_num']="3"
+        ['pgs_in_pool']="4"
+        ['pool_name']="test"
+    )
+
+    standard_scrub_cluster $dir cluster_conf
+    local poolid=${cluster_conf['pool_id']}
+    local poolname=${cluster_conf['pool_name']}
+    echo "Pool: $poolname : $poolid"
+
+    TESTDATA="testdata.$$"
+    local objects=15
+    dd if=/dev/urandom of=$TESTDATA bs=1032 count=1
+    for i in `seq 1 $objects`
+    do
+        rados -p $poolname put obj${i} $TESTDATA
+    done
+    rm -f $TESTDATA
+
+    # set 'no scrub', then request a deep-scrub.
+    # we do not expect to see the scrub scheduled.
+
+    ceph osd set noscrub || return 1
+    sleep 6 # the 'noscrub' command takes a long time to reach the OSDs
+    local now_is=`date -I"ns"`
+    declare -A sched_data
+    local pgid="${poolid}.2"
+
+    # turn on the publishing of test data in the 'scrubber' section of 'pg query' output
+    set_query_debug $pgid
+
+    extract_published_sch $pgid $now_is $now_is sched_data
+    local saved_last_stamp=${sched_data['query_last_stamp']}
+    local dbg_counter_at_start=${sched_data['query_scrub_seq']}
+    echo "test counter @ start: $dbg_counter_at_start"
+
+    ceph pg $pgid deep_scrub
+
+    sleep 5 # 5s is the 'pg dump' interval
+    declare -A sc_data_2
+    extract_published_sch $pgid $now_is $now_is sc_data_2
+    echo "test counter @ should show no change: " ${sc_data_2['query_scrub_seq']}
+    (( ${sc_data_2['dmp_last_duration']} == 0)) || return 1
+    (( ${sc_data_2['query_scrub_seq']} == $dbg_counter_at_start)) || return 1
+
+    # unset the 'no scrub'. Deep scrubbing should start now.
+    ceph osd unset noscrub || return 1
+    sleep 5
+    declare -A expct_qry_duration=( ['query_last_duration']="0" ['query_last_duration_neg']="not0" )
+    sc_data_2=()
+    echo "test counter @ should be higher than before the unset: " ${sc_data_2['query_scrub_seq']}
+    wait_any_cond $pgid 10 $saved_last_stamp expct_qry_duration "WaitingAfterScrub " sc_data_2 || return 1
+}
 
 function TEST_dump_scrub_schedule() {
     local dir=$1
diff --git a/qa/standalone/scrub/scrub-helpers.sh b/qa/standalone/scrub/scrub-helpers.sh
index cf13732c53854..c7f48150cc1d4 100644
--- a/qa/standalone/scrub/scrub-helpers.sh
+++ b/qa/standalone/scrub/scrub-helpers.sh
@@ -66,9 +66,9 @@ function extract_published_sch() {
       query_last_duration: .info.stats.last_scrub_duration,
       query_last_stamp: .info.history.last_scrub_stamp,
       query_last_scrub: (.info.history.last_scrub| sub($spt;"x") ),
-      query_is_future: .q_when_is_future,
-      query_vs_date: .q_vs_date,
-
+      query_is_future: .q_when_is_future,
+      query_vs_date: .q_vs_date,
+      query_scrub_seq: .scrubber.test_sequence
      }
    '`
   (( extr_dbg >= 1 )) && echo $from_qry " " $from_dmp | jq -s -r 'add | "(",(to_entries | .[] | "["+(.key)+"]="+(.value|@sh)),")"'
@@ -171,7 +171,7 @@ function schedule_against_expected() {
   local -n ep=$2  # the expected results
   local extr_dbg=1
 
-  #turn off '-x' (but remember previous state)
+  # turn off '-x' (but remember previous state)
   local saved_echo_flag=${-//[^x]/}
   set +x
 
@@ -194,3 +194,107 @@
   if [[ -n "$saved_echo_flag" ]]; then set -x; fi
   return 0
 }
+
+
+# Start the cluster "nodes" and create a pool for testing.
+#
+# The OSDs are started with a set of parameters aimed at creating a repeatable
+# and stable scrub sequence:
+# - no scrub randomizations/backoffs
+# - no autoscaler
+#
+# $1: the test directory
+# $2: [in/out] an array of configuration values
+#
+# The function adds/updates the configuration dictionary with the name of the
+# pool created, and its ID.
+#
+# Argument 2 might look like this:
+#
+#  declare -A test_conf=(
+#    ['osds_num']="3"
+#    ['pgs_in_pool']="7"
+#    ['extras']="--extra1 --extra2"
+#    ['pool_name']="testpl"
+#  )
+function standard_scrub_cluster() {
+    local dir=$1
+    local -n args=$2
+
+    local OSDS=${args['osds_num']:-"3"}
+    local pg_num=${args['pgs_in_pool']:-"8"}
+    local poolname="${args['pool_name']:-test}"
+    args['pool_name']=$poolname
+    local extra_pars=${args['extras']}
+    local debug_msg=${args['msg']:-"dbg"}
+
+    # turn off '-x' (but remember previous state)
+    local saved_echo_flag=${-//[^x]/}
+    set +x
+
+    run_mon $dir a --osd_pool_default_size=$OSDS || return 1
+    run_mgr $dir x || return 1
+
+    local ceph_osd_args="--osd_deep_scrub_randomize_ratio=0 \
+            --osd_scrub_interval_randomize_ratio=0 \
+            --osd_scrub_backoff_ratio=0.0 \
+            --osd_pool_default_pg_autoscale_mode=off \
+            $extra_pars"
+
+    for osd in $(seq 0 $(expr $OSDS - 1))
+    do
+        run_osd $dir $osd $ceph_osd_args || return 1
+    done
+
+    create_pool $poolname $pg_num $pg_num
+    wait_for_clean || return 1
+
+    # update the in/out 'args' with the ID of the new pool
+    sleep 1
+    name_n_id=`ceph osd dump | awk '/^pool.*'$poolname'/ { gsub(/'"'"'/," ",$3); print $3," ", $2}'`
+    echo "standard_scrub_cluster: $debug_msg: test pool is $name_n_id"
+    args['pool_id']="${name_n_id##* }"
+    if [[ -n "$saved_echo_flag" ]]; then set -x; fi
+}
+
+
+# Start the cluster "nodes" and create a pool for testing - wpq version.
+#
+# A variant of standard_scrub_cluster() that selects the wpq scheduler and sets a value
+# for osd_scrub_sleep. To be used when the test is attempting to "catch" the scrubber
+# during an ongoing scrub.
+#
+# See standard_scrub_cluster() for more details.
+#
+# $1: the test directory
+# $2: [in/out] an array of configuration values
+# $3: osd_scrub_sleep
+#
+# The function adds/updates the configuration dictionary with the name of the
+# pool created, and its ID.
+function standard_scrub_wpq_cluster() {
+    local dir=$1
+    local -n conf=$2
+    local osd_sleep=$3
+
+    conf['extras']=" --osd_op_queue=wpq --osd_scrub_sleep=$osd_sleep ${conf['extras']}"
+
+    standard_scrub_cluster $dir conf || return 1
+}
+
+
+# A debug flag is set for the PG specified, causing the 'pg query' command to display
+# an additional 'scrub sessions counter' field.
+#
+# $1: PG id
+#
+function set_query_debug() {
+    local pgid=$1
+    local prim_osd=`ceph pg dump pgs_brief | \
+                   awk -v pg="^$pgid" -n -e '$0 ~ pg { print(gensub(/[^0-9]*([0-9]+).*/,"\\\\1","g",$5)); }' `
+
+    echo "Setting scrub debug data. Primary for $pgid is $prim_osd"
+    CEPH_ARGS='' ceph --format=json daemon $(get_asok_path osd.$prim_osd) \
+            scrubdebug $pgid set sessions
+}
+
-- 
2.39.5
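
For reference, a sketch of how the new test can be exercised locally. This assumes the usual Ceph standalone-test workflow (a compiled tree plus the qa/run-standalone.sh wrapper, which runs every TEST_* function in the given script); the exact invocation may differ between branches:

    # from the root of a built Ceph checkout (assumed layout)
    cd build
    # runs all tests in the script, including the new TEST_just_deep_scrubs
    ../qa/run-standalone.sh osd-scrub-test.sh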