tests/standalone: fix scrub-related tests following command changes

author Ronen Friedman <rfriedma@redhat.com>

Thu, 9 Nov 2023 15:21:42 +0000 (09:21 -0600)

committer Ronen Friedman <rfriedma@redhat.com>

Wed, 15 Nov 2023 09:14:15 +0000 (03:14 -0600)
author Ronen Friedman <rfriedma@redhat.com>
Thu, 9 Nov 2023 15:21:42 +0000 (09:21 -0600)
committer Ronen Friedman <rfriedma@redhat.com>
Wed, 15 Nov 2023 09:14:15 +0000 (03:14 -0600)
diff --git a/qa/standalone/ceph-helpers.sh b/qa/standalone/ceph-helpers.sh

index bf2c91bc04274d09b2c97bd18b36ff48c7015f1e..fc4756daf8b13712d1c3d7d91135cd9da5c049ec 100755 (executable)
--- a/qa/standalone/ceph-helpers.sh
+++ b/qa/standalone/ceph-helpers.sh
@@ -1865,6 +1865,9 @@ function test_repair() {
  # **get_last_scrub_stamp** function reports a timestamp different from
  # the one stored before starting the scrub.
  #
+# The scrub is initiated using the "operator initiated" method, and
+# the scrub triggered is not subject to no-scrub flags etc.
+#
  # @param pgid the id of the PG
  # @return 0 on success, 1 on error
  #
@@ -1899,6 +1902,48 @@ function test_pg_scrub() {
  
  #######################################################################
  
+##
+# Trigger a "scheduled" scrub on **pgid** (by manually modifying the relevant
+# last-scrub stamp) and wait until it completes. The pg_schedule_scrub
+# function will fail if scrubbing does not complete within $TIMEOUT
+# seconds. The scrub is complete whenever the
+# **get_last_scrub_stamp** function reports a timestamp different from
+# the one stored before starting the scrub.
+#
+# @param pgid the id of the PG
+# @return 0 on success, 1 on error
+#
+function pg_schedule_scrub() {
+    local pgid=$1
+    local last_scrub=$(get_last_scrub_stamp $pgid)
+    ceph pg scrub $pgid
+    wait_for_scrub $pgid "$last_scrub"
+}
+
+function pg_schedule_deep_scrub() {
+    local pgid=$1
+    local last_scrub=$(get_last_scrub_stamp $pgid last_deep_scrub_stamp)
+    ceph pg deep-scrub $pgid
+    wait_for_scrub $pgid "$last_scrub" last_deep_scrub_stamp
+}
+
+function test_pg_schedule_scrub() {   # self-test for pg_schedule_scrub; $1 is the test directory
+    local dir=$1
+
+    setup $dir || return 1
+    run_mon $dir a --osd_pool_default_size=1 --mon_allow_pool_size_one=true || return 1
+    run_mgr $dir x || return 1
+    run_osd $dir 0 || return 1   # a single OSD, matching the pool size of 1 set on the mon
+    create_rbd_pool || return 1
+    wait_for_clean || return 1
+    pg_schedule_scrub 1.0 || return 1   # must succeed while the OSD is still running
+    kill_daemons $dir KILL osd || return 1
+    ! TIMEOUT=1 pg_scrub 1.0 || return 1   # the '!' makes a failing scrub the passing outcome here
+    teardown $dir || return 1
+}
+
+#######################################################################
+
  ##
  # Run the *command* and expect it to fail (i.e. return a non zero status).
  # The output (stderr and stdout) is stored in a temporary file in *dir*
diff --git a/qa/standalone/scrub/osd-mapper.sh b/qa/standalone/scrub/osd-mapper.sh

index ed18f94f1af1f37298a6d6bd6ae107da9c5bd46d..bfe57eac03e9bdb473cfdb620d068cc6e0b2ccbd 100755 (executable)
--- a/qa/standalone/scrub/osd-mapper.sh
+++ b/qa/standalone/scrub/osd-mapper.sh
@@ -77,7 +77,7 @@ function TEST_truncated_sna_record() {
      (( extr_dbg >= 1 )) && rados --format json-pretty -p $poolname listsnaps $objname
  
      # scrub the PG
-    ceph pg $pgid deep_scrub || return 1
+    ceph pg $pgid deep-scrub || return 1
  
      # we aren't just waiting for the scrub to terminate, but also for the
      # logs to be published
@@ -149,7 +149,7 @@ function TEST_truncated_sna_record() {
      local cur_prim=`ceph --format=json-pretty osd map $poolname $objname | jq -r '.up[0]'`
      ceph pg dump pgs
      sleep 2
-    ceph pg $pgid deep_scrub || return 1
+    ceph pg $pgid deep-scrub || return 1
      sleep 5
      ceph pg dump pgs
      (( extr_dbg >= 1 )) && grep -a "ERR" $dir/osd.$cur_prim.log
@@ -161,7 +161,7 @@ function TEST_truncated_sna_record() {
      echo "prev count: $prev_err_cnt"
  
      # scrub again. No errors expected this time
-    ceph pg $pgid deep_scrub || return 1
+    ceph pg $pgid deep-scrub || return 1
      sleep 5
      ceph pg dump pgs
      (( extr_dbg >= 1 )) && grep -a "ERR" $dir/osd.$cur_prim.log
diff --git a/qa/standalone/scrub/osd-scrub-repair.sh b/qa/standalone/scrub/osd-scrub-repair.sh

index 13b30360c4e06c2c1b1258a1dbf4fe9bf5441285..008e8ea1959dde34fa1283d0342fe74e67dfb9f8 100755 (executable)
--- a/qa/standalone/scrub/osd-scrub-repair.sh
+++ b/qa/standalone/scrub/osd-scrub-repair.sh
@@ -16,6 +16,7 @@
  #
  set -x
  source $CEPH_ROOT/qa/standalone/ceph-helpers.sh
+source $CEPH_ROOT/qa/standalone/scrub/scrub-helpers.sh
  
  if [ `uname` = FreeBSD ]; then
      # erasure coding overwrites are only tested on Bluestore
@@ -160,7 +161,7 @@ function scrub_and_not_schedule() {
      #
      local pg=$(get_pg $poolname SOMETHING)
      local last_scrub=$(get_last_scrub_stamp $pg)
-    ceph pg scrub $pg
+    ceph tell $pg schedule-scrub
  
      #
      # 2) Assure the scrub is not scheduled
@@ -329,8 +330,7 @@ function initiate_and_fetch_state() {
      date  --rfc-3339=ns
  
      # note: must initiate a "regular" (periodic) deep scrub - not an operator-initiated one
-    env CEPH_ARGS= ceph --format json daemon $(get_asok_path $the_osd) deep_scrub "$pgid"
-    env CEPH_ARGS= ceph --format json daemon $(get_asok_path $the_osd) scrub "$pgid"
+    env CEPH_ARGS= ceph --format json daemon $(get_asok_path $the_osd) schedule-deep-scrub "$pgid"
  
      # wait for 'scrubbing' to appear
      for ((i=0; i < 80; i++)); do
@@ -436,19 +436,17 @@ function TEST_auto_repair_bluestore_tag() {
  
  function TEST_auto_repair_bluestore_basic() {
      local dir=$1
-    local poolname=testpool
-
-    # Launch a cluster with 5 seconds scrub interval
-    run_mon $dir a || return 1
-    run_mgr $dir x || return 1
-    local ceph_osd_args="--osd-scrub-auto-repair=true \
-            --osd_deep_scrub_randomize_ratio=0 \
-            --osd-scrub-interval-randomize-ratio=0"
-    for id in $(seq 0 2) ; do
-        run_osd $dir $id $ceph_osd_args || return 1
-    done
+    local -A cluster_conf=(
+        ['osds_num']="3" 
+        ['pgs_in_pool']="1"
+        ['pool_name']="testpool"
+        ['extras']=" --osd_scrub_auto_repair=true"
+    )
+    local extr_dbg=3
+    standard_scrub_cluster $dir cluster_conf
+    local poolid=${cluster_conf['pool_id']}
+    local poolname=${cluster_conf['pool_name']}
  
-    create_pool $poolname 1 1 || return 1
      ceph osd pool set $poolname size 2
      wait_for_clean || return 1
  
@@ -460,12 +458,14 @@ function TEST_auto_repair_bluestore_basic() {
      # Remove the object from one shard physically
      # Restarted osd get $ceph_osd_args passed
      objectstore_tool $dir $(get_not_primary $poolname SOMETHING) SOMETHING remove || return 1
+    ceph tell osd.* config set osd_scrub_auto_repair true
  
      local pgid=$(get_pg $poolname SOMETHING)
      local primary=$(get_primary $poolname SOMETHING)
      local last_scrub_stamp="$(get_last_scrub_stamp $pgid)"
-    ceph tell $pgid deep_scrub
-    ceph tell $pgid scrub
+    # note: the scrub initiated must be a "regular" (periodic) deep scrub - not an
+    # operator-initiated one (as there's no 'auto-repair' for the latter)
+    ceph tell $pgid schedule-deep-scrub
  
      # Wait for auto repair
      wait_for_scrub $pgid "$last_scrub_stamp" || return 1
@@ -510,12 +510,16 @@ function TEST_auto_repair_bluestore_scrub() {
      local pgid=$(get_pg $poolname SOMETHING)
      local primary=$(get_primary $poolname SOMETHING)
      local last_scrub_stamp="$(get_last_scrub_stamp $pgid)"
-    ceph tell $pgid scrub
+    ceph tell $pgid schedule-scrub
  
      # Wait for scrub -> auto repair
      wait_for_scrub $pgid "$last_scrub_stamp" || return 1
      ceph pg dump pgs
      # Actually this causes 2 scrubs, so we better wait a little longer
+    sleep 2
+    ceph pg dump pgs
+    sleep 2
+    ceph pg dump pgs
      sleep 5
      wait_for_clean || return 1
      ceph pg dump pgs
@@ -567,8 +571,7 @@ function TEST_auto_repair_bluestore_failed() {
      local pgid=$(get_pg $poolname obj1)
      local primary=$(get_primary $poolname obj1)
      local last_scrub_stamp="$(get_last_scrub_stamp $pgid)"
-    ceph tell $pgid deep_scrub
-    ceph tell $pgid scrub
+    ceph tell $pgid schedule-deep-scrub
  
      # Wait for auto repair
      wait_for_scrub $pgid "$last_scrub_stamp" || return 1
@@ -631,12 +634,12 @@ function TEST_auto_repair_bluestore_failed_norecov() {
      # obj2 can't be repaired
      objectstore_tool $dir $(get_not_primary $poolname SOMETHING) obj2 remove || return 1
      objectstore_tool $dir $(get_primary $poolname SOMETHING) obj2 rm-attr _ || return 1
+    ceph tell osd.* config set osd_scrub_auto_repair true
  
      local pgid=$(get_pg $poolname obj1)
      local primary=$(get_primary $poolname obj1)
      local last_scrub_stamp="$(get_last_scrub_stamp $pgid)"
-    ceph tell $pgid deep_scrub
-    ceph tell $pgid scrub
+    ceph tell $pgid schedule-deep-scrub
  
      # Wait for auto repair
      wait_for_scrub $pgid "$last_scrub_stamp" || return 1
@@ -5793,7 +5796,7 @@ function TEST_periodic_scrub_replicated() {
      flush_pg_stats
      local last_scrub=$(get_last_scrub_stamp $pg)
      # Fake a schedule scrub
-    ceph tell $pg scrub || return 1
+    ceph tell $pg schedule-scrub || return 1
      # Wait for schedule regular scrub
      wait_for_scrub $pg "$last_scrub"
  
@@ -5811,7 +5814,7 @@ function TEST_periodic_scrub_replicated() {
      sleep 5
  
      # Fake a schedule scrub
-    ceph tell $pg scrub || return 1
+    ceph tell $pg schedule-scrub || return 1
      # Wait for schedule regular scrub
      # to notice scrub and skip it
      local found=false
@@ -5828,7 +5831,7 @@ function TEST_periodic_scrub_replicated() {
  
      flush_pg_stats
      # Request a regular scrub and it will be done
-    pg_scrub $pg
+    pg_schedule_scrub $pg
      grep -q "Regular scrub request, deep-scrub details will be lost" $dir/osd.${primary}.log || return 1
  
      # deep-scrub error is no longer present
@@ -5885,7 +5888,7 @@ function TEST_scrub_warning() {
        else
          overdue_seconds=$conf_overdue_seconds
        fi
-      ceph tell ${i}.0 scrub $(expr ${overdue_seconds} + ${i}00) || return 1
+      ceph tell ${i}.0 schedule-scrub $(expr ${overdue_seconds} + ${i}00) || return 1
      done
      # Fake schedule deep scrubs
      for i in $(seq $(expr $scrubs + 1) $(expr $scrubs + $deep_scrubs))
@@ -5896,7 +5899,7 @@ function TEST_scrub_warning() {
        else
          overdue_seconds=$conf_overdue_seconds
        fi
-      ceph tell ${i}.0 deep_scrub $(expr ${overdue_seconds} + ${i}00) || return 1
+      ceph tell ${i}.0 schedule-deep-scrub $(expr ${overdue_seconds} + ${i}00) || return 1
      done
      flush_pg_stats
  
@@ -5905,7 +5908,7 @@ function TEST_scrub_warning() {
      ceph health | grep -q " pgs not deep-scrubbed in time" || return 1
      ceph health | grep -q " pgs not scrubbed in time" || return 1
  
-    # note that the 'ceph tell pg deep_scrub' command now also sets the regular scrub
+    # note that the 'ceph tell pg deep-scrub' command now also sets the regular scrub
      # time-stamp. I.e. - all 'late for deep scrubbing' pgs are also late for
      # regular scrubbing. For now, we'll allow both responses.
      COUNT=$(ceph health detail | grep "not scrubbed since" | wc -l)
@@ -6222,15 +6225,15 @@ function TEST_request_scrub_priority() {
          otherpgs="${otherpgs}${opg} "
          local other_last_scrub=$(get_last_scrub_stamp $pg)
          # Fake a schedule scrub
-        ceph tell $opg scrub $opg || return 1
+        ceph tell $opg schedule-scrub $opg || return 1
      done
  
      sleep 15
      flush_pg_stats
  
-    # Request a regular scrub and it will be done
+    # Force a shallow scrub and it will be done
      local last_scrub=$(get_last_scrub_stamp $pg)
-    ceph pg scrub $pg
+    ceph tell $pg scrub || return 1
  
      ceph osd unset noscrub || return 1
      ceph osd unset nodeep-scrub || return 1
diff --git a/qa/standalone/scrub/osd-scrub-test.sh b/qa/standalone/scrub/osd-scrub-test.sh

index 354bd22880e3658925e43d9615956f6cfe907ad0..e11100f8a1988313a93fdc5f6570751046ae4ad0 100755 (executable)
--- a/qa/standalone/scrub/osd-scrub-test.sh
+++ b/qa/standalone/scrub/osd-scrub-test.sh
@@ -231,11 +231,11 @@ function TEST_scrub_extended_sleep() {
      create_pool $poolname 1 1
      wait_for_clean || return 1
  
-    # Trigger a scrub on a PG
+    # Trigger a periodic scrub on a PG (no 'extended sleep' for h.p. scrubs)
      local pgid=$(get_pg $poolname SOMETHING)
      local primary=$(get_primary $poolname SOMETHING)
      local last_scrub=$(get_last_scrub_stamp $pgid)
-    ceph tell $pgid scrub || return 1
+    ceph tell $pgid schedule-scrub || return 1
  
      # Allow scrub to start extended sleep
      PASSED="false"
@@ -330,12 +330,7 @@ function _scrub_abort() {
      local primary=$(get_primary $poolname obj1)
      local pgid="${poolid}.0"
  
-    ceph tell $pgid $type || return 1
-    # deep-scrub won't start without scrub noticing
-    if [ "$type" = "deep_scrub" ];
-    then
-      ceph tell $pgid scrub || return 1
-    fi
+    ceph tell $pgid schedule-$type || return 1
  
      # Wait for scrubbing to start
      set -o pipefail
@@ -359,7 +354,7 @@ function _scrub_abort() {
      fi
  
      ceph osd set $stopscrub
-    if [ "$type" = "deep_scrub" ];
+    if [ "$type" = "deep-scrub" ];
      then
        ceph osd set noscrub
      fi
@@ -390,7 +385,7 @@ function _scrub_abort() {
      ceph config set osd "osd_scrub_sleep" "0.1"
  
      ceph osd unset $stopscrub
-    if [ "$type" = "deep_scrub" ];
+    if [ "$type" = "deep-scrub" ];
      then
        ceph osd unset noscrub
      fi
@@ -405,7 +400,7 @@ function TEST_scrub_abort() {
  
  function TEST_deep_scrub_abort() {
      local dir=$1
-    _scrub_abort $dir deep_scrub
+    _scrub_abort $dir deep-scrub
  }
  
  function TEST_scrub_permit_time() {
@@ -441,7 +436,7 @@ function TEST_scrub_permit_time() {
      # current time to set last_scrub_stamp, it sets the deadline
      # back by osd_max_interval which would cause the time permit checking
      # to be skipped.  Set back 1 day, the default scrub_min_interval.
-    ceph tell $pgid scrub $(( 24 * 60 * 60 )) || return 1
+    ceph tell $pgid schedule-scrub $(( 24 * 60 * 60 )) || return 1
  
      # Scrub should not run
      for ((i=0; i < 30; i++)); do
@@ -495,7 +490,7 @@ function TEST_just_deep_scrubs() {
      local dbg_counter_at_start=${sched_data['query_scrub_seq']}
      echo "test counter @ start: $dbg_counter_at_start"
  
-    ceph pg $pgid deep_scrub
+    ceph tell $pgid schedule-deep-scrub
  
      sleep 5 # 5s is the 'pg dump' interval
      declare -A sc_data_2
@@ -574,8 +569,7 @@ function TEST_dump_scrub_schedule() {
  
      saved_last_stamp=${sched_data['query_last_stamp']}
      ceph tell osd.* config set osd_scrub_sleep "0"
-    ceph pg deep-scrub $pgid
-    ceph pg scrub $pgid
+    ceph tell $pgid deep-scrub
  
      # wait for the 'last duration' entries to change. Note that the 'dump' one will need
      # up to 5 seconds to sync
@@ -602,7 +596,7 @@ function TEST_dump_scrub_schedule() {
      sleep 2
      saved_last_stamp=${sched_data['query_last_stamp']}
  
-    ceph pg $pgid scrub
+    ceph tell $pgid schedule-scrub
      sleep 1
      sched_data=()
      declare -A expct_scrub_peri_sched=( ['query_is_future']="false" )
diff --git a/src/ceph.in b/src/ceph.in

index 2ba2c74768cf81779810055bf17efd362edc8919..11a76511a8ec2b758afcb2ca05e39df19e3b0a60 100755 (executable)
--- a/src/ceph.in
+++ b/src/ceph.in
@@ -1310,7 +1310,7 @@ def main():
          if final_e:
              raise final_e
  
-    # Block until command completion (currently scrub and deep_scrub only)
+    # Block until command completion (currently scrub and deep scrub only)
      if block:
          wait(childargs, waitdata)
author	Ronen Friedman <rfriedma@redhat.com>
	Thu, 9 Nov 2023 15:21:42 +0000 (09:21 -0600)
committer	Ronen Friedman <rfriedma@redhat.com>
	Wed, 15 Nov 2023 09:14:15 +0000 (03:14 -0600)
qa/standalone/ceph-helpers.sh		patch \| blob \| history
qa/standalone/scrub/osd-mapper.sh		patch \| blob \| history
qa/standalone/scrub/osd-scrub-repair.sh		patch \| blob \| history
qa/standalone/scrub/osd-scrub-test.sh		patch \| blob \| history
src/ceph.in		patch \| blob \| history