From 0cc5c3aa27e9c9c5502dbc19bf6910d9643b221e Mon Sep 17 00:00:00 2001 From: xie xingguo Date: Tue, 2 Apr 2019 16:17:52 +0800 Subject: [PATCH] qa: add new test case for pulling error Signed-off-by: xie xingguo (cherry picked from commit 6a8aedc1074d487510d8e546ec9e70e169523008) --- qa/standalone/osd/osd-rep-recov-eio.sh | 20 ++++++++++++++++---- src/common/legacy_config_opts.h | 1 + src/common/options.cc | 5 +++++ src/osd/ReplicatedBackend.cc | 20 ++++++++++++++++---- 4 files changed, 38 insertions(+), 8 deletions(-) diff --git a/qa/standalone/osd/osd-rep-recov-eio.sh b/qa/standalone/osd/osd-rep-recov-eio.sh index af4dfa3370b..adf6fc7967a 100755 --- a/qa/standalone/osd/osd-rep-recov-eio.sh +++ b/qa/standalone/osd/osd-rep-recov-eio.sh @@ -122,15 +122,27 @@ function rados_get_data() { COUNT=$(ceph pg dump --format=json-pretty | jq ".pg_map.osd_stats_sum.num_shards_repaired") test "$COUNT" = "1" || return 1 + local object_osds=($(get_osds $poolname $objname)) + local primary=${object_osds[0]} + local bad_peer=${object_osds[1]} inject_$inject rep data $poolname $objname $dir 0 || return 1 inject_$inject rep data $poolname $objname $dir 1 || return 1 + # Force primary to pull from the bad peer, so we can repair it too! + set_config osd $primary osd_debug_feed_pullee $bad_peer || return 1 + rados_get $dir $poolname $objname || return 1 + + # Wait until automatic repair of bad peer is done + wait_for_clean || return 1 + + inject_$inject rep data $poolname $objname $dir 0 || return 1 + inject_$inject rep data $poolname $objname $dir 2 || return 1 rados_get $dir $poolname $objname || return 1 COUNT=$(ceph pg $pgid query | jq '.info.stats.stat_sum.num_objects_repaired') - test "$COUNT" = "2" || return 1 + test "$COUNT" = "3" || return 1 flush_pg_stats COUNT=$(ceph pg dump --format=json-pretty | jq ".pg_map.osd_stats_sum.num_shards_repaired") - test "$COUNT" = "2" || return 1 + test "$COUNT" = "4" || return 1 inject_$inject rep data $poolname $objname $dir 0 || return 1 inject_$inject rep data $poolname $objname $dir 1 || return 1 @@ -139,10 +151,10 @@ function rados_get_data() { # After hang another repair couldn't happen, so count stays the same COUNT=$(ceph pg $pgid query | jq '.info.stats.stat_sum.num_objects_repaired') - test "$COUNT" = "2" || return 1 + test "$COUNT" = "3" || return 1 flush_pg_stats COUNT=$(ceph pg dump --format=json-pretty | jq ".pg_map.osd_stats_sum.num_shards_repaired") - test "$COUNT" = "2" || return 1 + test "$COUNT" = "4" || return 1 } function TEST_rados_get_with_eio() { diff --git a/src/common/legacy_config_opts.h b/src/common/legacy_config_opts.h index 3270fb4976d..cea68f34256 100644 --- a/src/common/legacy_config_opts.h +++ b/src/common/legacy_config_opts.h @@ -659,6 +659,7 @@ OPTION(osd_read_ec_check_for_errors, OPT_BOOL) // return error if any ec shard h // Only use clone_overlap for recovery if there are fewer than // osd_recover_clone_overlap_limit entries in the overlap set OPTION(osd_recover_clone_overlap_limit, OPT_INT) +OPTION(osd_debug_feed_pullee, OPT_INT) OPTION(osd_backfill_scan_min, OPT_INT) OPTION(osd_backfill_scan_max, OPT_INT) diff --git a/src/common/options.cc b/src/common/options.cc index 0d0b51f5cf2..578a52d0fb2 100644 --- a/src/common/options.cc +++ b/src/common/options.cc @@ -3192,6 +3192,11 @@ std::vector