From 914ea7f57602691575cd6bf2ce6fb94775f1bd8a Mon Sep 17 00:00:00 2001 From: Ronen Friedman Date: Wed, 6 Aug 2025 23:54:30 -0500 Subject: [PATCH] qa/standalone/scrub: re-code osd-scrub-dump.sh to test scrub repair functionality. The new version of osd-scrub-dump.sh is designed to allow multiple "corruption methods" on a subset of objects. The functionality includes specifying: - the number of objects created; - the number to have their Primary version modified; - the number to have their Replicas modified; - the set of "manipulations" to perform on the objects. Signed-off-by: Ronen Friedman --- qa/standalone/scrub/osd-scrub-dump.sh | 839 +++++++++++++++++++++----- qa/standalone/scrub/scrub-helpers.sh | 116 +++- 2 files changed, 799 insertions(+), 156 deletions(-) diff --git a/qa/standalone/scrub/osd-scrub-dump.sh b/qa/standalone/scrub/osd-scrub-dump.sh index 403ffacd9a6..7bcfc2516f7 100755 --- a/qa/standalone/scrub/osd-scrub-dump.sh +++ b/qa/standalone/scrub/osd-scrub-dump.sh @@ -1,8 +1,9 @@ #!/usr/bin/env bash # -# Copyright (C) 2019 Red Hat +# Copyright (C) 2025 Red Hat # # Author: David Zafman +# Author: Ronen Friedman # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU Library Public License as published by @@ -15,166 +16,706 @@ # GNU Library Public License for more details. # +source $CEPH_ROOT/qa/standalone/ceph-helpers.sh +source $CEPH_ROOT/qa/standalone/scrub/scrub-helpers.sh -# 30.11.2023: the test is now disabled, as the reservation mechanism has been -# thoroughly reworked and the test is no longer valid. The test is left here -# as a basis for a new set of primary vs. replicas scrub activation tests. +function run() { + local dir=$1 + shift -source $CEPH_ROOT/qa/standalone/ceph-helpers.sh + export CEPH_MON="127.0.0.1:7138" # git grep '\<7138\>' : there must be only one + export CEPH_ARGS + CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none " + CEPH_ARGS+="--mon-host=$CEPH_MON --osd_op_queue=wpq " -MAX_SCRUBS=4 -SCRUB_SLEEP=3 -POOL_SIZE=3 + export -n CEPH_CLI_TEST_DUP_COMMAND + local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')} + for func in $funcs ; do + echo "-------------- Prepare Test $func -------------------" + setup $dir || return 1 + echo "-------------- Run Test $func -----------------------" + $func $dir || return 1 + echo "-------------- Teardown Test $func ------------------" + teardown $dir || return 1 + echo "-------------- Complete Test $func ------------------" + done +} -function run() { - echo "This test is disabled" + +# collect_pool_stamps() returns a dictionary with PG IDs as keys +# and their last scrub timestamps as values. 
+function collect_pool_stamps() {
+    local dir=$1
+    local poolid="$2"
+    local -n out_dict=$3
+    local saved_echo_flag=${-//[^x]/}
+    set +x
+
+    while IFS="=" read -r pgid stamp; do
+        out_dict["$pgid"]="$stamp"
+    done < <(
+        ceph --format json pg dump pgs 2>/dev/null |
+        jq -r --arg poolid "$poolid" \
+          '.pg_stats[] | select(.pgid | startswith($poolid + ".")) | "\(.pgid)=\(.last_scrub_stamp)"'
+    )
+    if [[ -n "$saved_echo_flag" ]]; then set -x; fi
+    return 0
+}
+
+
+# we are assuming the keys fully match
+function arrays_all_values_differ() {
+    local -n a1=$1   # the 'last_stamps' (in the assumed usage)
+    local -n a2=$2   # the 'current_stamps'
+    local key
+    for key in "${!a1[@]}"; do
+        [[ -v a2[$key] ]] || return 1                    # missing in 2nd → fail
+        [[ ${a1[$key]} != "${a2[$key]}" ]] || return 1   # same value → fail
+    done
+    return 0   # every key in 1st exists in 2nd and all values differ
+}
+
+
+# wait for a whole pool to be scrubbed.
+# 'last-scrub' timestamps are compared to a reference timestamp array
+# (ostensibly taken just before all the PGs in the pool were instructed to
+# scrub).
+# All 'last-scrub' timestamps must differ from their reference values.
+function wait_for_pool_scrubbed() {
+    local dir=$1
+    local pool_id=$2
+    local -n prev_stamps=$3
+    local timeout=${4:-1000}
+    local saved_echo_flag=${-//[^x]/}
+    set +x
+    local elapsed=0
+    while true; do
+        local -A current_stamps
+        collect_pool_stamps $dir $pool_id current_stamps
+        if arrays_all_values_differ prev_stamps current_stamps; then
+            break
+        fi
+        elapsed=$((elapsed + 1))
+        if [ $elapsed -gt $((timeout * 10)) ]; then
+            echo "wait_for_pool_scrubbed: Timeout waiting for scrub completion after $timeout seconds"
+            return 1
+        fi
+        sleep 0.1
+    done
+    if [[ -n "$saved_echo_flag" ]]; then set -x; fi
+    return 0
+}
+
+
+# manipulator functions:
+
+# delete_oi: delete the object info
+function delete_oi()
+{
+    local dir=$1
+    local target_osd=$2
+    local obj=$3
+    ceph-objectstore-tool --data-path $dir/$target_osd "$obj" rm-attr '_' || return 1
+    return 0
+}
+
+# remove one copy of an object
+function rm_object()
+{
+    local dir=$1
+    local target_osd=$2
+    local obj=$3
+    # remove the object from the OSD
+    ceph-objectstore-tool --data-path $dir/$target_osd "$obj" remove || return 1
+    return 0
+}
+
+
+# applying the manipulators to objects of a specific OSD
+
+
+# modify objects for which the downed OSD is primary / secondary.
+# Parameters:
+# $1: the test directory
+# $2: the OSD holding the objects to modify (must be down)
+# $3: a space-separated list of the objects to modify
+# $4: the name of an object-name -> PGID dictionary
+# $5: a space-separated list of manipulator functions, applied round-robin
+function modify_obs_of_an_osd()
+{
+    local dir=$1
+    local target_osd=$2
+    # list of objects to modify
+    local objects_to_modify=$3
+    local -n o_to_pg=$4
+    local manips=$5
+    local extr_dbg=1
+
+    # Skip if no objects to modify
+    if [[ -z "$objects_to_modify" ]]; then
         return 0
-    local dir=$1
-    shift
-    local CHUNK_MAX=5
-
-    export CEPH_MON="127.0.0.1:7184" # git grep '\<7184\>' : there must be only one
-    export CEPH_ARGS
-    CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
-    CEPH_ARGS+="--mon-host=$CEPH_MON "
-    CEPH_ARGS+="--osd_max_scrubs=$MAX_SCRUBS "
-    CEPH_ARGS+="--osd_shallow_scrub_chunk_max=$CHUNK_MAX "
-    CEPH_ARGS+="--osd_scrub_sleep=$SCRUB_SLEEP "
-    CEPH_ARGS+="--osd_pool_default_size=$POOL_SIZE "
-    # Set scheduler to "wpq" until there's a reliable way to query scrub states
-    # with "--osd-scrub-sleep" set to 0. The "mclock_scheduler" overrides the
-    # scrub sleep to 0 and as a result the checks in the test fail.
-    CEPH_ARGS+="--osd_op_queue=wpq "
-
-    export -n CEPH_CLI_TEST_DUP_COMMAND
-    local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
-    for func in $funcs ; do
-        setup $dir || return 1
-        $func $dir || return 1
-        teardown $dir || return 1
-    done
-}
-
-function TEST_recover_unexpected() {
-    local dir=$1
-    shift
-    local OSDS=6
-    local PGS=16
-    local POOLS=3
-    local OBJS=1000
-
-    run_mon $dir a || return 1
-    run_mgr $dir x || return 1
-    for o in $(seq 0 $(expr $OSDS - 1))
-    do
-        run_osd $dir $o
-    done
-
-    for i in $(seq 1 $POOLS)
-    do
-        create_pool test$i $PGS $PGS
-    done
-
-    wait_for_clean || return 1
-
-    dd if=/dev/urandom of=datafile bs=4k count=2
-    for i in $(seq 1 $POOLS)
-    do
-        for j in $(seq 1 $OBJS)
-        do
-            rados -p test$i put obj$j datafile
-        done
-    done
-    rm datafile
-
-    ceph osd set noscrub
-    ceph osd set nodeep-scrub
-
-    for qpg in $(ceph pg dump pgs --format=json-pretty | jq '.pg_stats[].pgid')
-    do
-        eval pg=$qpg # strip quotes around qpg
-        ceph tell $pg scrub
-    done
-
-    ceph pg dump pgs
-
-    max=$(CEPH_ARGS='' ceph daemon $(get_asok_path osd.0) dump_scrub_reservations | jq '.osd_max_scrubs')
-    if [ $max != $MAX_SCRUBS ]; then
-        echo "ERROR: Incorrect osd_max_scrubs from dump_scrub_reservations"
-        return 1
+    fi
+
+    # the objects to modify:
+    (( extr_dbg >= 1 )) && echo "Objects to modify on osd.$target_osd: ${objects_to_modify[@]}"
+
+    # Note: all OSDs are down
+    local -a manip_arr=($manips)
+    local cur_fn_index=0
+    for obj in $objects_to_modify; do
+        (( extr_dbg >= 2 )) && echo "Object $obj: PGID: ${o_to_pg[$obj]}"
+        # get the PGID of the object
+        local pgid=${o_to_pg[$obj]}
+        if [[ -z "$pgid" ]]; then
+            echo "ERROR: cannot find PGID for object $obj on osd.$target_osd"
+            return 1
+        fi
+        # pick the next manipulator function, round-robin
+        local manipulator=${manip_arr[$((cur_fn_index % ${#manip_arr[@]}))]}
+        (( extr_dbg >= 2 )) && echo "Manipulator: $manipulator"
+        if declare -f "$manipulator" > /dev/null; then
+            $manipulator "$dir" "$target_osd" "$obj" || return 1
+        else
+            echo "ERROR: $manipulator is not a known manipulator function"
+            return 1
+        fi
+        (( cur_fn_index++ ))
+    done
+    return 0
+}
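+
+# Usage sketch (the object names and PG ids below are hypothetical):
+# with the target OSD down, each listed object is handed to the next
+# manipulator in turn - here obj1 and obj3 get delete_oi, obj2 gets rm_object:
+#
+#   declare -A o2pg=( ['obj1']='2.0' ['obj2']='2.3' ['obj3']='2.1' )
+#   modify_obs_of_an_osd "$dir" 0 "obj1 obj2 obj3" o2pg "delete_oi rm_object"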
+
+
+# \todo change to combine the counters from all OSDs
+# Note: use the following to aggregate the JSON objects of a single OSD:
+#   ceph tell osd.$osd counter dump --format=json |
+#     jq '[recurse | objects | to_entries[] | select(.key | test("scrub")) | {(.key): .value}] | add' |
+#     jq -s '.' >> ${fnm}_b
+function dump_scrub_counters()
+{
+    local dir=$1
+    local OSDS=$2
+    local hdr_msg=$3
+    local extr_dbg=1
+
+    fnm="/tmp/dscrub_counters_`date +%d_%H%M`"
+    ((extr_dbg >= 3)) && echo "$hdr_msg: Scrub counters at `date +%T.%N`" > $fnm
+    for ((osd=0; osd<OSDS; osd++)); do
+        ((extr_dbg >= 3)) && echo "osd.$osd:" >> $fnm
+        all_c=$(ceph tell osd.$osd counter dump --format=json |
+            jq 'recurse | objects | to_entries[] | select(.key | test("scrub"))')
+        ((extr_dbg >= 3)) && echo "$all_c" >> $fnm
+        echo "$all_c"
+    done
+}
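+
+# Invocation sketch (mirroring the calls made from corrupt_and_measure below):
+#   dump_scrub_counters "$dir" "$OSDS" "After repair"
+# echoes every scrub-related perf counter of each OSD as {key, value} JSON
+# entries, and (with extr_dbg >= 3) also appends them to the $fnm file.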
+
+# WIP: a dump_scrub_counters version that truly aggregates across OSDs
+function dump_scrub_counters_wip()
+{
+    local dir=$1
+    local OSDS=$2
+    local hdr_msg=$3
+    local fnm="/tmp/dscrub_counters_$(date +%d_%H%M).json"
+
+    local tmpd
+    tmpd=$(mktemp -d)
+    for ((osd=0; osd<OSDS; osd++)); do
+        ceph tell osd.$osd counter dump --format=json > "$tmpd/osd_${osd}.json"
+    done
+
+    jq -s --argjson n "$OSDS" '
+      # Collect all scrub-related entries at any depth as {key,value} pairs
+      def scrub_entries:
+        recurse | objects | to_entries[] | select(.key | test("scrub"));
+
+      # Element-wise sum for arrays of equal-length numbers; else concat
+      def sum_arrays(a; b):
+        if (a|length) == (b|length)
+           and ((a|all(type=="number")) and (b|all(type=="number"))) then
+          [ range(0; (a|length)) | (a[.] + b[.]) ]
+        else a + b end;
+
+      # Recursively merge two values with numeric/array/object-aware addition
+      def sum_merge(a; b):
+        if a == null then b
+        elif b == null then a
+        elif (a|type) == "number" and (b|type) == "number" then a + b
+        elif (a|type) == "array" and (b|type) == "array" then sum_arrays(a; b)
+        elif (a|type) == "object" and (b|type) == "object" then
+          ( (a|keys) + (b|keys) | unique ) as $ks
+          | reduce $ks[] as $k ({}; .[$k] = sum_merge(a[$k]; b[$k]))
+        else b end;
+
+      # Recompute averages:
+      #  - If sum and (count or avgcount) exist: avg/avgtime/average = sum / count
+      #  - Else: simple mean across OSDs (we summed earlier -> divide by n)
+      def recompute_avgs($n):
+        walk(
+          if type == "object" then
+            ( .count // .avgcount ) as $cnt
+            | if (.sum? != null) and ($cnt != null) then
+                (if ($cnt|tonumber)==0 then 0 else (.sum / $cnt) end) as $avg
+                | (if .avg? then (.avg = $avg) else . end)
+                | (if .average? then (.average = $avg) else . end)
+                | (if .avgtime? then (.avgtime = $avg) else . end)
+              else
+                (if .avg? then (.avg = (.avg / $n)) else . end)
+                | (if .average? then (.average = (.average / $n)) else . end)
+                | (if .avgtime? then (.avgtime = (.avgtime / $n)) else . end)
+              end
+          else . end
+        );
+
+      # Slurp all OSD JSONs -> flatten to list of {key,value} scrub entries
+      map([scrub_entries]) | add
+      # Group by key (counter name) so we can aggregate across OSDs
+      | group_by(.key)
+      # For each counter, sum/merge all values from different OSDs
+      | map({ key: .[0].key, value: (reduce .[].value as $v ({}; sum_merge(.; $v))) })
+      | from_entries
+      | recompute_avgs($n)
+    ' "$tmpd"/osd_*.json > "$fnm"
-
-    ceph osd unset noscrub
-
-    ok=false
-    for i in $(seq 0 300)
-    do
-        ceph pg dump pgs
-        if ceph pg dump pgs | grep '+scrubbing'; then
-            ok=true
-            break
-        fi
-        sleep 1
-    done
-    if test $ok = "false"; then
-        echo "ERROR: Test set-up failed no scrubbing"
-        return 1
+    rm "$tmpd"/osd_*.json
+    rm -f -d "$tmpd"
+    echo "Aggregated scrub counters written to $fnm"
+}
+
+
+# create K objects in the named pool, using one common random data
+# file of size S. Use NTHR parallel threads.
+function create_objects()
+{
+    local dir=$1
+    local poolname=$2
+    local K=$3
+    local S=$4
+    local NTHR=$5
+
+    local start_times=$(date +%s%N)
+    local testdata_file=$(file_with_random_data $S)
+
+    # create the objects in parallel
+    for ((i=1; i<=K; i++)); do
+        rados -p $poolname put obj${i} $testdata_file &
+        if (( i % NTHR == 0 )); then
+            wait
         fi
+    done
+    wait
+    rm $testdata_file
+    local end_times=$(date +%s%N)
+    local duration=$(( (end_times - start_times) / 1000000 ))
+    echo "create_objects(): created $K objects in pool $poolname in $duration ms"
+}
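+
+# Call sketch (matching the use in corrupt_and_measure below):
+#   create_objects "$dir" "$poolname" 700 256 4
+# creates obj1..obj700 from a single random data file of size 256 (in
+# whatever units file_with_random_data expects), waiting after every 4
+# parallel 'rados put's.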
+
+function create_objects_2()
+{
+    local dir=$1
+    local poolname=$2
+    local K=$3
+    local S=$4
+    local NTHR=$5
-    local total=0
-    local zerocount=0
-    local maxzerocount=3
-    while(true)
-    do
-        pass=0
-        for o in $(seq 0 $(expr $OSDS - 1))
-        do
-            CEPH_ARGS='' ceph daemon $(get_asok_path osd.$o) dump_scrub_reservations
-            scrubs=$(CEPH_ARGS='' ceph daemon $(get_asok_path osd.$o) dump_scrub_reservations | jq '.scrubs_local + .granted_reservations')
-            if [ $scrubs -gt $MAX_SCRUBS ]; then
-                echo "ERROR: More than $MAX_SCRUBS currently reserved"
-                return 1
-            fi
-            pass=$(expr $pass + $scrubs)
-        done
-        if [ $pass = "0" ]; then
-            zerocount=$(expr $zerocount + 1)
-        fi
-        if [ $zerocount -gt $maxzerocount ]; then
-            break
-        fi
-        total=$(expr $total + $pass)
-        if [ $total -gt 0 ]; then
-            # already saw some reservations, so wait longer to avoid excessive over-counting.
-            # Note the loop itself takes about 2-3 seconds
-            sleep $(expr $SCRUB_SLEEP - 2)
-        else
-            sleep 0.5
-        fi
-    done
-
-    # Check that there are no more scrubs
-    for i in $(seq 0 5)
-    do
-        if ceph pg dump pgs | grep '+scrubbing'; then
-            echo "ERROR: Extra scrubs after test completion...not expected"
-            return 1
+    local start_times=$(date +%s%N)
+    local testdata_file=$(file_with_random_data $S)
+
+    (( NTHR > K )) && NTHR=$K   # no more than K parallel threads
+    local objs_per_proc=$((K / NTHR))
+    if (( K % NTHR != 0 )); then
+        objs_per_proc=$((objs_per_proc + 1))
+    fi
+
+    (( NTHR > 1 )) && echo "Creating $K objects in pool $poolname using $NTHR parallel threads," \
+        "$objs_per_proc objects per thread"
+    for i in `seq 1 $NTHR`; do
+        (
+        for j in `seq 1 $objs_per_proc`; do
+            local obj_id=$(( (i - 1) * objs_per_proc + j ))
+            if (( obj_id > K )); then
+                break
            fi
-        sleep $SCRUB_SLEEP
-    done
-
-    echo $total total reservations seen
-
-    # Sort of arbitraty number based on PGS * POOLS * POOL_SIZE as the number of total scrub
-    # reservations that must occur.  However, the loop above might see the same reservation more
-    # than once.
-    actual_reservations=$(expr $PGS \* $POOLS \* $POOL_SIZE)
-    if [ $total -lt $actual_reservations ]; then
-        echo "ERROR: Unexpectedly low amount of scrub reservations seen during test"
-        return 1
+            rados -p $poolname put obj${obj_id} $testdata_file
+        done
+        ) &
+    done
+
+    wait
+    rm $testdata_file
+    local end_times=$(date +%s%N)
+    local duration=$(( (end_times - start_times) / 1000000 ))
+    echo "create_objects_2(): created $K objects in pool $poolname in $duration ms"
+}
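+
+# Worked example (illustrative numbers): K=10 objects with NTHR=4 threads
+# gives objs_per_proc = ceil(10/4) = 3, so the subshells handle obj1-3,
+# obj4-6, obj7-9 and obj10 (the last one stops as soon as obj_id exceeds K).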
+
+
+# Parameters:
+# $1: the test directory
+# $2: [in/out] an array of configuration values
+#
+# Argument 2 might look like this:
+# (note: the dict contains all the elements required by standard_scrub_cluster,
+#  and the values controlling the test)
+#
+# It should contain:
+# - osds_num: number of OSDs in the cluster
+# - pgs_in_pool: number of PGs in the pool
+# - pool_name: name of the pool
+# - pool_default_size: size of the pool
+# - objects_per_pg: average number of objects per PG
+# - modify_as_prim_cnt: number of objects to have their Primary version corrupted
+# - modify_as_repl_cnt: number of objects to have one of their replicas corrupted
+# - manipulations: a list of functions to perform on the objects.
+#      Assuming N manipulators, they are applied round-robin: the i'th
+#      manipulator is called on the (k*N+i)'th object modified on a
+#      specific OSD.
+#
+# declare -A test_conf=(
+#    ['osds_num']="3"
+#    ['pgs_in_pool']="7"
+#    ['extras']="--extra1 --extra2"
+#    ['pool_name']="testpl"
+#    ['pool_default_size']="3"
+#    ['msg']="Test message"
+#    ['objects_per_pg']="100"
+#    ['modify_as_prim_cnt']="10"   # elements to corrupt their Primary version
+#    ['modify_as_repl_cnt']="10"   # elements to corrupt one of their replicas
+#    ['manipulations']="delete_oi rm_object"   # the list of manipulations to perform
+# )
+function corrupt_and_measure()
+{
+    local dir=$1
+    local -n alargs=$2
+
+    local OSDS=${alargs['osds_num']}
+    local PGS=${alargs['pgs_in_pool']}
+    local OBJS_PER_PG=${alargs['objects_per_pg']}
+    local modify_as_prim_cnt=${alargs['modify_as_prim_cnt']}
+    local modify_as_repl_cnt=${alargs['modify_as_repl_cnt']}
+
+    local objects=$(($PGS * $OBJS_PER_PG))
+    # the total number of corrupted objects cannot exceed the number of objects
+    if [ $(($modify_as_prim_cnt + $modify_as_repl_cnt)) -gt $objects ]; then
+        echo "ERROR: too many corruptions requested ($modify_as_prim_cnt + $modify_as_repl_cnt > $objects)"
+        return 1
+    fi
+
+    local extr_dbg=2 # note: levels 3 and above leave some temp files around
+    # must not do that: set -o nounset
+    standard_scrub_wpq_cluster "$dir" alargs 0 || return 1
+    orig_osd_args=" ${alargs['osd_args']}"
+    orig_osd_args=" $(echo $orig_osd_args)"
+
+    local poolid=${alargs['pool_id']}
+    local poolname=${alargs['pool_name']}
+    (( extr_dbg >= 1 )) && echo "Pool: $poolname : $poolid"
+    # prevent scrubbing while we corrupt objects
+    ceph osd pool set $poolname noscrub 1
+    ceph osd pool set $poolname nodeep-scrub 1
+
+    # turn off '-x' (but remember the previous state)
+    local saved_echo_flag=${-//[^x]/}
+    set +x
+
+    # Create some objects
+    #create_objects_2 "$dir" "$poolname" "$objects" 256 4 || return 1
+    create_objects "$dir" "$poolname" "$objects" 256 4 || return 1
+
+    echo "Pre-wait-for-clean: $(date +%T.%N)"
+    wait_for_clean || return 1
+    ceph osd pool stats
+    sleep 5
+    ceph pg dump pgs
+
+    echo "Pre dict creation: $(date +%T.%N)"
+    local start_dict=$(date +%s%N)
+    declare -A obj_to_pgid
+    declare -A obj_to_primary
+    declare -A obj_to_acting
+    objs_to_prim_dict_fast "$dir" $poolname "obj" $objects obj_to_pgid obj_to_primary obj_to_acting
+    local end_dict=$(date +%s%N)
+    echo "Post dict creation: $(date +%T.%N) ($(( (end_dict - start_dict)/1000000 )) ms)"
+
+    # create a subset of modify_as_prim_cnt+modify_as_repl_cnt objects
+    # that we will corrupt. Note that no object is to have both its primary
+    # and replica versions corrupted.
+    local all_errors=$(($modify_as_prim_cnt + $modify_as_repl_cnt))
+    # select the objects to corrupt (both primary and replica)
+    mapfile -t selected_keys < <(printf "%s\n" "${!obj_to_primary[@]}" | shuf -n "$all_errors")
+    # list the number of elements, and the first element
+    (( extr_dbg >= 2 )) && echo "Selected keys: ${#selected_keys[@]}"
+    (( extr_dbg >= 2 )) && echo "First element: ${selected_keys[0]}"
+    (( extr_dbg >= 2 )) && echo "obj_to_primary: ${#obj_to_primary[@]}"
+
+    # take the first modify_as_prim_cnt of them to corrupt their primary
+    # version; the rest will be corrupted on their replicas.
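+    # For example (an illustration, not the output of a real run): with
+    # modify_as_prim_cnt=2 and modify_as_repl_cnt=1, selected_keys may come
+    # out as (obj7 obj42 obj13); obj7 and obj42 are then corrupted on their
+    # primary OSDs, and obj13 on one of its replicas.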
+
+    declare -A prim_objs_to_corrupt
+    declare -A repl_objs_to_corrupt
+    # group by the primary OSD (the dict value)
+    for ((i=0; i < $modify_as_prim_cnt; i++)); do
+        (( extr_dbg >= 2 )) && echo "Corrupting primary object ($i) ${selected_keys[$i]}"
+        k=${selected_keys[$i]}
+        prim_osd=${obj_to_primary[$k]}
+        prim_objs_to_corrupt["$prim_osd"]+="$k "
+    done
+    for ((i=$modify_as_prim_cnt; i < $all_errors; i++)); do
+        k=${selected_keys[$i]}
+        # find a replica OSD to "fix"
+        (( extr_dbg >= 2 )) && echo "Object $k: acting: ${obj_to_acting[$k]}"
+        (( extr_dbg >= 2 )) && echo "${obj_to_acting[$k]}" | awk '{print $NF}'
+        # modify either the first replica (in the acting-set order) or the
+        # last one, decided based on the object sequential number
+        acting_as_arr=(${obj_to_acting[$k]})
+        (( extr_dbg >= 2 )) && echo "r----- ${acting_as_arr} for $k ($i) ${acting_as_arr[1]} ${acting_as_arr[-1]} ${acting_as_arr[0]}"
+        if (( i % 2 == 0 )); then
+            repl_osd=${acting_as_arr[1]}
+        else
+            repl_osd=${acting_as_arr[-1]}
+        fi
+        (( extr_dbg >= 2 )) && echo "replosd ${repl_osd} for $k"
+        repl_objs_to_corrupt["$repl_osd"]+="$k "
+    done
+    if [[ -n "$saved_echo_flag" ]]; then set -x; fi
+
+    # disable rescheduling of the queue due to 'no-scrub' flags
+    ceph tell osd.* config set osd_scrub_backoff_ratio 0.9999
+
+
+    # -------------------------- step 2: corruption of objects --------------------------
+
+    kill_daemons $dir TERM osd || return 1
+    sleep 1
+    for ((osd=0; osd<OSDS; osd++)); do
+        # with the OSD down, apply the manipulators to this OSD's share of
+        # the selected objects - first those for which it is the primary,
+        # then those for which it serves as a replica
+        modify_obs_of_an_osd "$dir" "$osd" "${prim_objs_to_corrupt[$osd]}" \
+            obj_to_pgid "${alargs['manipulations']}" || return 1
+        modify_obs_of_an_osd "$dir" "$osd" "${repl_objs_to_corrupt[$osd]}" \
+            obj_to_pgid "${alargs['manipulations']}" || return 1
+        activate_osd $dir $osd $orig_osd_args || return 1
+    done
+    wait_for_clean || return 1
+
+    # map each PG to its primary, acting set and pool
+    declare -A pg_pr
+    declare -A pg_ac
+    declare -A pg_po
+    build_pg_dicts "$dir" pg_pr pg_ac pg_po "-"
+    (( extr_dbg >= 2 )) && echo "PGs table:"
+    for pg in "${!pg_pr[@]}"; do
+        #wait_for_pg_clean $pg || return 1
+        (( extr_dbg >= 2 )) && echo "Got: $pg: ${pg_pr[$pg]} ( ${pg_ac[$pg]} ) ${pg_po[$pg]}"
+    done
+
+    local -A saved_last_stamp
+    collect_pool_stamps $dir "$poolid" saved_last_stamp
+    ceph pg dump pgs
+
+    ceph tell osd.* config set debug_osd 10/10
+    local start_time=$(date +%s%N)
+    for pg in "${!pg_pr[@]}"; do
+        (( extr_dbg >= 1 )) && echo "deep-scrub $pg"
+        ceph pg $pg deep-scrub || return 1
+    done
+
+    # wait for the scrubs to complete
+    wait_for_pool_scrubbed "$dir" "$poolid" saved_last_stamp
+
+    local end_time=$(date +%s%N)
+    local duration=$(( (end_time - start_time)/1000000 ))
+    ceph tell osd.* config set debug_osd 20/20
+
+    sleep 2
+    ceph pg dump pgs
+    printf 'MSR NAUT %3d %3d %3d %3d/%3d %6d\n' "$OSDS" "$PGS" "$OBJS_PER_PG" \
+        "$modify_as_prim_cnt" "$modify_as_repl_cnt" "$duration"
+    for pg in "${!pg_pr[@]}"; do
+        echo "list-inconsistent for PG $pg"
+        rados -p $poolname list-inconsistent-obj $pg --format=json-pretty | jq '.' | wc -l
+    done
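+    # (illustration) rather than counting pretty-printed lines, the number
+    # of inconsistent objects in a PG can be read directly, e.g.:
+    #   rados -p $poolname list-inconsistent-obj $pg | jq '.inconsistents | length'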
+    for ((osd=0; osd<OSDS; osd++)); do
+        ceph tell osd.$osd dump_scrub_reservations --format=json-pretty
+    done
+    (( extr_dbg >= 3 )) && ceph pg dump pgs --format=json-pretty | jq '.pg_stats[]' > /tmp/pg_stats.json
+    (( extr_dbg >= 3 )) && ceph pg dump pgs --format=json-pretty | jq '.pg_stats[] |
+        select(.state | contains("inconsistent"))' >> /tmp/pg_stats_inconsistent.json
+
+    collect_pool_stamps $dir "$poolid" saved_last_stamp
+    ceph tell osd.* config set debug_osd 10/10
+    start_time=$(date +%s%N)
+    for pg in "${!pg_pr[@]}"; do
+        ceph pg repair $pg || return 1
+        ## or, if auto-repairing:
+        #ceph pg $pg deep-scrub || return 1
+    done
+    wait_for_pool_scrubbed "$dir" "$poolid" saved_last_stamp
+    end_time=$(date +%s%N)
+    duration=$(( (end_time - start_time)/1000000 ))
+    ceph tell osd.* config set debug_osd 20/20
+
+    sleep 5
+    ceph pg dump pgs
+    printf 'MSR REPR %3d %3d %3d %3d/%3d %6d\n' "$OSDS" "$PGS" "$OBJS_PER_PG" \
+        "$modify_as_prim_cnt" "$modify_as_repl_cnt" "$duration"
+    dump_scrub_counters "$dir" "$OSDS" "After repair"
+
+    # -- collecting some data after the repair
+
+    for pg in "${!pg_pr[@]}"; do
+        incon_lines=$(rados -p $poolname list-inconsistent-obj $pg --format=json-pretty | jq '.' | wc -l)
+        if [ "$incon_lines" -gt 5 ]; then
+            echo "PG $pg still seems to have $incon_lines inconsistent objects!!!!"
+            # for some reason, the list-inconsistent does not get fully updated immediately
+            # (10 seconds, for example, are not enough)
+            # return 1
+        fi
+    done
+
+    for ((osd=0; osd<OSDS; osd++)); do
+        ceph tell osd.$osd dump_scrub_reservations --format=json-pretty
+    done
+
+    return 0
+}
+
+
+# a basic test entry point; the configuration mirrors the documented
+# example above
+function TEST_corrupt_and_repair() {
+    local dir=$1
+
+    declare -A test_conf=(
+        ['osds_num']="3"
+        ['pgs_in_pool']="7"
+        ['pool_name']="testpl"
+        ['pool_default_size']="3"
+        ['objects_per_pg']="100"
+        ['modify_as_prim_cnt']="10"
+        ['modify_as_repl_cnt']="10"
+        ['manipulations']="delete_oi rm_object"
+    )
+    corrupt_and_measure "$dir" test_conf || return 1
+}
 
 main osd-scrub-dump "$@"
diff --git a/qa/standalone/scrub/scrub-helpers.sh b/qa/standalone/scrub/scrub-helpers.sh
--- a/qa/standalone/scrub/scrub-helpers.sh
+++ b/qa/standalone/scrub/scrub-helpers.sh
+# For a set of objects named <basename><i>, where i is a number from 1 to obj_num,
+# query the cluster for their PGs, primary OSDs and acting sets.
+# The results are stored in the following dictionaries:
+# - obj_pgid_dict:   object name -> pgid
+# - obj_prim_dict:   object name -> primary OSD
+# - obj_acting_dict: object name -> acting set
+function objs_to_prim_dict_fast()
+{
+    local dir=$1
+    local poolname=$2
+    local basename=$3
+    local obj_num=$4
+    local -n obj_pgid_dict=$5
+    local -n obj_prim_dict=$6
+    local -n obj_acting_dict=$7
+    # turn off '-x' (but remember the previous state)
+    local saved_echo_flag=${-//[^x]/}
+    set +x
+    local extr_dbg=0
+
+    # Read the Python output and populate the dictionaries
+    while IFS=$'\t' read -r obj pgid primary acting; do
+        (( extr_dbg >= 3 )) && printf "Processing object: %s, PGID: %s, Primary: %s, Acting: %s\n" \
+            "$obj" "$pgid" "$primary" "$acting"
+        if [[ -n "$obj" && -n "$pgid" ]]; then
+            obj_pgid_dict["$obj"]=$pgid
+            obj_prim_dict["$obj"]=$primary
+            obj_acting_dict["$obj"]=$acting
+        fi
+    done < <(python3 << EOF
+import subprocess
+import json
+from concurrent.futures import ThreadPoolExecutor, as_completed
+
+def get_object_mapping(obj_name, pool_name):
+    """Get PG mapping for a single object."""
+    try:
+        cmd = ['ceph', '--format=json', 'osd', 'map', pool_name, obj_name]
+        result = subprocess.run(cmd, stdout=subprocess.PIPE, text=True, stderr=subprocess.DEVNULL)
+        if result.returncode == 0:
+            data = json.loads(result.stdout)
+            pgid = data.get('pgid', '')
+            primary = data.get('acting_primary', '')
+            acting = ' '.join(map(str, data.get('acting', [])))
+            return obj_name, pgid, primary, acting
+        else:
+            return obj_name, '', '', ''
+    except Exception:
+        return obj_name, '', '', ''
+
+# Parameters from bash variables
+pool_name = "$poolname"
+base_name = "$basename"
+obj_count = $obj_num
+
+# Generate object names
+objects = [f"{base_name}{i}" for i in range(1, obj_count + 1)]
+
+# Use ThreadPoolExecutor for parallel execution
+max_workers = min(16, len(objects))
+
+with ThreadPoolExecutor(max_workers=max_workers) as executor:
+    future_to_obj = {
+        executor.submit(get_object_mapping, obj, pool_name): obj
+        for obj in objects
+    }
+
+    for future in as_completed(future_to_obj):
+        obj_name, pgid, primary, acting = future.result()
+        if pgid:
+            print(f"{obj_name}\t{pgid}\t{primary}\t{acting}")
+EOF
+)
+
+    if [[ -n $saved_echo_flag ]]; then
+        set -x
+    fi
+}
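+
+# Usage sketch (illustrative names): map obj1..obj700 of pool 'testpl',
+# then look a single object up:
+#
+#   declare -A o2pg o2prim o2act
+#   objs_to_prim_dict_fast "$dir" testpl "obj" 700 o2pg o2prim o2act
+#   echo "obj1 -> pg ${o2pg['obj1']}, primary osd.${o2prim['obj1']}"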
+
+# a version of 'objs_to_prim_dict_fast' that does not use Python
+# (maps the objects one at a time, so it is slower).
+function objs_to_prim_dict()
+{
+    {
+    local dir=$1
+    local poolname=$2
+    local basename=$3
+    local obj_num=$4
+    local -n obj_pgid_dict=$5
+    local -n obj_prim_dict=$6
+    local -n obj_acting_dict=$7
+
+    for i in $(seq 1 $obj_num); do
+        local obj="${basename}${i}"
+        IFS=$'\t' read -r pgid primary_osd acting <<<$(ceph --format=json osd map $poolname $obj |\
+            jq -r '"\(.pgid)\t\(.acting_primary)\t\(.acting | join(" "))"')
+        obj_pgid_dict["$obj"]=$pgid
+        obj_prim_dict["$obj"]=$primary_osd
+        obj_acting_dict["$obj"]=$acting
+    done
+    } 2> /dev/null
+}
-- 
2.39.5