From d8f26bde62fef23617b6eb02feadb201fa7018d3 Mon Sep 17 00:00:00 2001
From: David Zafman
Date: Tue, 12 Mar 2019 22:22:53 -0700
Subject: [PATCH 1/3] test, osd: Improvements to auto_repair

Allow auto_repair for replicated bluestore pools
Regular scrub within auto repair parameters will trigger deep scrub
New state failed_repair if PG repair attempt could not fix everything
Set failed_repair if not possible to repair anything

Fixes: http://tracker.ceph.com/issues/38616

Signed-off-by: David Zafman
(cherry picked from commit 2202e5d0b107795837ce79ffce2a980e8c12fc62)
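
A minimal operator-facing sketch of what this enables (illustrative
only; the osd_scrub_auto_repair options shown below already exist,
while the failed_repair state and the scrub-triggered deep scrub are
what this change adds):

    # Opt in to automatic repair during regularly scheduled scrubs.
    ceph config set osd osd_scrub_auto_repair true

    # Repair is only attempted when a scrub finds at most this many errors.
    ceph config set osd osd_scrub_auto_repair_num_errors 5

    # With this change, a regular scrub that finds repairable errors
    # reschedules itself as a deep scrub, which performs the repair.
    # PGs whose repair could not fix every error now report failed_repair:
    ceph pg ls failed_repair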
---
 qa/standalone/scrub/osd-scrub-repair.sh | 223 ++++++++++++++++++++++++
 src/osd/PG.cc                           |  66 +++++--
 src/osd/PG.h                            |   7 +
 src/osd/ReplicatedBackend.h             |   2 +-
 src/osd/osd_types.cc                    |   4 +
 src/osd/osd_types.h                     |   1 +
 6 files changed, 292 insertions(+), 11 deletions(-)

diff --git a/qa/standalone/scrub/osd-scrub-repair.sh b/qa/standalone/scrub/osd-scrub-repair.sh
index 3d587d603190e..b326df8c38ac4 100755
--- a/qa/standalone/scrub/osd-scrub-repair.sh
+++ b/qa/standalone/scrub/osd-scrub-repair.sh
@@ -268,6 +268,229 @@ function TEST_auto_repair_erasure_coded_overwrites() {
     fi
 }
 
+function TEST_auto_repair_bluestore_basic() {
+    local dir=$1
+    local poolname=testpool
+
+    # Launch a cluster with 5 seconds scrub interval
+    setup $dir || return 1
+    run_mon $dir a || return 1
+    run_mgr $dir x || return 1
+    local ceph_osd_args="--osd-scrub-auto-repair=true \
+            --osd_deep_scrub_randomize_ratio=0 \
+            --osd-scrub-interval-randomize-ratio=0"
+    for id in $(seq 0 2) ; do
+        run_osd_bluestore $dir $id $ceph_osd_args || return 1
+    done
+
+    create_pool $poolname 1 1 || return 1
+    ceph osd pool set $poolname size 2
+    wait_for_clean || return 1
+
+    # Put an object
+    local payload=ABCDEF
+    echo $payload > $dir/ORIGINAL
+    rados --pool $poolname put SOMETHING $dir/ORIGINAL || return 1
+
+    # Remove the object from one shard physically
+    # Restarted osd get $ceph_osd_args passed
+    objectstore_tool $dir $(get_not_primary $poolname SOMETHING) SOMETHING remove || return 1
+
+    local pgid=$(get_pg $poolname SOMETHING)
+    local primary=$(get_primary $poolname SOMETHING)
+    local last_scrub_stamp="$(get_last_scrub_stamp $pgid)"
+    CEPH_ARGS='' ceph daemon $(get_asok_path osd.$primary) trigger_deep_scrub $pgid
+    CEPH_ARGS='' ceph daemon $(get_asok_path osd.$primary) trigger_scrub $pgid
+
+    # Wait for auto repair
+    wait_for_scrub $pgid "$last_scrub_stamp" || return 1
+    wait_for_clean || return 1
+    ceph pg dump pgs
+    # Verify - the file should be back
+    # Restarted osd get $ceph_osd_args passed
+    objectstore_tool $dir $(get_not_primary $poolname SOMETHING) SOMETHING list-attrs || return 1
+    objectstore_tool $dir $(get_not_primary $poolname SOMETHING) SOMETHING get-bytes $dir/COPY || return 1
+    diff $dir/ORIGINAL $dir/COPY || return 1
+    grep scrub_finish $dir/osd.${primary}.log
+
+    # Tear down
+    teardown $dir || return 1
+}
+
+function TEST_auto_repair_bluestore_scrub() {
+    local dir=$1
+    local poolname=testpool
+
+    # Launch a cluster with 5 seconds scrub interval
+    setup $dir || return 1
+    run_mon $dir a || return 1
+    run_mgr $dir x || return 1
+    local ceph_osd_args="--osd-scrub-auto-repair=true \
+            --osd_deep_scrub_randomize_ratio=0 \
+            --osd-scrub-interval-randomize-ratio=0"
+    for id in $(seq 0 2) ; do
+        run_osd_bluestore $dir $id $ceph_osd_args || return 1
+    done
+
+    create_pool $poolname 1 1 || return 1
+    ceph osd pool set $poolname size 2
+    wait_for_clean || return 1
+
+    # Put an object
+    local payload=ABCDEF
+    echo $payload > $dir/ORIGINAL
+    rados --pool $poolname put SOMETHING $dir/ORIGINAL || return 1
+
+    # Remove the object from one shard physically
+    # Restarted osd get $ceph_osd_args passed
+    objectstore_tool $dir $(get_not_primary $poolname SOMETHING) SOMETHING remove || return 1
+
+    local pgid=$(get_pg $poolname SOMETHING)
+    local primary=$(get_primary $poolname SOMETHING)
+    local last_scrub_stamp="$(get_last_scrub_stamp $pgid)"
+    CEPH_ARGS='' ceph daemon $(get_asok_path osd.$primary) trigger_scrub $pgid
+
+    # Wait for scrub -> auto repair
+    wait_for_scrub $pgid "$last_scrub_stamp" || return 1
+    ceph pg dump pgs
+    # Actually this causes 2 scrubs, so we better wait a little longer
+    sleep 5
+    wait_for_clean || return 1
+    ceph pg dump pgs
+    # Verify - the file should be back
+    # Restarted osd get $ceph_osd_args passed
+    objectstore_tool $dir $(get_not_primary $poolname SOMETHING) SOMETHING list-attrs || return 1
+    rados --pool $poolname get SOMETHING $dir/COPY || return 1
+    diff $dir/ORIGINAL $dir/COPY || return 1
+    grep scrub_finish $dir/osd.${primary}.log
+
+    # Tear down
+    teardown $dir || return 1
+}
+
+function TEST_auto_repair_bluestore_failed() {
+    local dir=$1
+    local poolname=testpool
+
+    # Launch a cluster with 5 seconds scrub interval
+    setup $dir || return 1
+    run_mon $dir a || return 1
+    run_mgr $dir x || return 1
+    local ceph_osd_args="--osd-scrub-auto-repair=true \
+            --osd_deep_scrub_randomize_ratio=0 \
+            --osd-scrub-interval-randomize-ratio=0"
+    for id in $(seq 0 2) ; do
+        run_osd_bluestore $dir $id $ceph_osd_args || return 1
+    done
+
+    create_pool $poolname 1 1 || return 1
+    ceph osd pool set $poolname size 2
+    wait_for_clean || return 1
+
+    # Put an object
+    local payload=ABCDEF
+    echo $payload > $dir/ORIGINAL
+    for i in $(seq 1 10)
+    do
+        rados --pool $poolname put obj$i $dir/ORIGINAL || return 1
+    done
+
+    # Remove the object from one shard physically
+    # Restarted osd get $ceph_osd_args passed
+    objectstore_tool $dir $(get_not_primary $poolname SOMETHING) obj1 remove || return 1
+    # obj2 can't be repaired
+    objectstore_tool $dir $(get_not_primary $poolname SOMETHING) obj2 remove || return 1
+    objectstore_tool $dir $(get_primary $poolname SOMETHING) obj2 rm-attr _ || return 1
+
+    local pgid=$(get_pg $poolname obj1)
+    local primary=$(get_primary $poolname obj1)
+    local last_scrub_stamp="$(get_last_scrub_stamp $pgid)"
+    CEPH_ARGS='' ceph daemon $(get_asok_path osd.$primary) trigger_deep_scrub $pgid
+    CEPH_ARGS='' ceph daemon $(get_asok_path osd.$primary) trigger_scrub $pgid
+
+    # Wait for auto repair
+    wait_for_scrub $pgid "$last_scrub_stamp" || return 1
+    wait_for_clean || return 1
+    flush_pg_stats
+    grep scrub_finish $dir/osd.${primary}.log
+    grep -q "scrub_finish.*still present after re-scrub" $dir/osd.${primary}.log || return 1
+    ceph pg dump pgs
+    ceph pg dump pgs | grep -q "^${pgid}.*+failed_repair" || return 1
+
+    # Verify - obj1 should be back
+    # Restarted osd get $ceph_osd_args passed
+    objectstore_tool $dir $(get_not_primary $poolname obj1) obj1 list-attrs || return 1
+    rados --pool $poolname get obj1 $dir/COPY || return 1
+    diff $dir/ORIGINAL $dir/COPY || return 1
+    grep scrub_finish $dir/osd.${primary}.log
+
+    # Make it repairable
+    objectstore_tool $dir $(get_primary $poolname SOMETHING) obj2 remove || return 1
+    repair $pgid
+    sleep 2
+
+    ceph pg dump pgs
+    ceph pg dump pgs | grep -q "^${pgid}.* active+clean " || return 1
+    grep scrub_finish $dir/osd.${primary}.log
+
+    # Tear down
+    teardown $dir || return 1
+}
+
+function TEST_auto_repair_bluestore_failed_norecov() {
+    local dir=$1
+    local poolname=testpool
+
+    # Launch a cluster with 5 seconds scrub interval
+    setup $dir || return 1
+    run_mon $dir a || return 1
+    run_mgr $dir x || return 1
+    local ceph_osd_args="--osd-scrub-auto-repair=true \
+            --osd_deep_scrub_randomize_ratio=0 \
+            --osd-scrub-interval-randomize-ratio=0"
+    for id in $(seq 0 2) ; do
+        run_osd_bluestore $dir $id $ceph_osd_args || return 1
+    done
+
+    create_pool $poolname 1 1 || return 1
+    ceph osd pool set $poolname size 2
+    wait_for_clean || return 1
+
+    # Put an object
+    local payload=ABCDEF
+    echo $payload > $dir/ORIGINAL
+    for i in $(seq 1 10)
+    do
+        rados --pool $poolname put obj$i $dir/ORIGINAL || return 1
+    done
+
+    # Remove the object from one shard physically
+    # Restarted osd get $ceph_osd_args passed
+    # obj1 can't be repaired
+    objectstore_tool $dir $(get_not_primary $poolname SOMETHING) obj1 remove || return 1
+    objectstore_tool $dir $(get_primary $poolname SOMETHING) obj1 rm-attr _ || return 1
+    # obj2 can't be repaired
+    objectstore_tool $dir $(get_not_primary $poolname SOMETHING) obj2 remove || return 1
+    objectstore_tool $dir $(get_primary $poolname SOMETHING) obj2 rm-attr _ || return 1
+
+    local pgid=$(get_pg $poolname obj1)
+    local primary=$(get_primary $poolname obj1)
+    local last_scrub_stamp="$(get_last_scrub_stamp $pgid)"
+    CEPH_ARGS='' ceph daemon $(get_asok_path osd.$primary) trigger_deep_scrub $pgid
+    CEPH_ARGS='' ceph daemon $(get_asok_path osd.$primary) trigger_scrub $pgid
+
+    # Wait for auto repair
+    wait_for_scrub $pgid "$last_scrub_stamp" || return 1
+    wait_for_clean || return 1
+    flush_pg_stats
+    grep -q "scrub_finish.*present with no repair possible" $dir/osd.${primary}.log || return 1
+    ceph pg dump pgs
+    ceph pg dump pgs | grep -q "^${pgid}.*+failed_repair" || return 1
+
+    # Tear down
+    teardown $dir || return 1
+}
+
 function corrupt_and_repair_jerasure() {
     local dir=$1
     local allow_overwrites=$2
diff --git a/src/osd/PG.cc b/src/osd/PG.cc
index a2a6c790ae590..ef7fe44574c58 100644
--- a/src/osd/PG.cc
+++ b/src/osd/PG.cc
@@ -1123,6 +1123,8 @@ PG::Scrubber::Scrubber()
    shallow_errors(0), deep_errors(0), fixed(0),
    must_scrub(false), must_deep_scrub(false), must_repair(false),
    auto_repair(false),
+   check_repair(false),
+   deep_scrub_on_error(false),
    num_digest_updates_pending(0),
    state(INACTIVE),
    deep(false)
@@ -2557,6 +2559,7 @@ void PG::_finish_recovery(Context *c)
       dout(10) << "_finish_recovery requeueing for scrub" << dendl;
       scrub_after_recovery = false;
       scrubber.must_deep_scrub = true;
+      scrubber.check_repair = true;
       queue_scrub();
     }
   } else {
@@ -3446,8 +3449,10 @@ void PG::publish_stats_to_osd()
 
   if (info.stats.stats.sum.num_scrub_errors)
     state_set(PG_STATE_INCONSISTENT);
-  else
+  else {
     state_clear(PG_STATE_INCONSISTENT);
+    state_clear(PG_STATE_FAILED_REPAIR);
+  }
 
   utime_t now = ceph_clock_now();
   if (info.stats.state != state) {
@@ -4309,19 +4314,23 @@ bool PG::sched_scrub()
     }
   }
 
+  // Clear these in case user issues the scrub/repair command during
+  // the scheduling of the scrub/repair (e.g. request reservation)
+  scrubber.deep_scrub_on_error = false;
+  scrubber.auto_repair = false;
   if (cct->_conf->osd_scrub_auto_repair
       && get_pgbackend()->auto_repair_supported()
-      && time_for_deep
       // respect the command from user, and not do auto-repair
      && !scrubber.must_repair
      && !scrubber.must_scrub
      && !scrubber.must_deep_scrub) {
-    dout(20) << __func__ << ": auto repair with deep scrubbing" << dendl;
-    scrubber.auto_repair = true;
-  } else {
-    // this happens when user issue the scrub/repair command during
-    // the scheduling of the scrub/repair (e.g. request reservation)
-    scrubber.auto_repair = false;
+    if (time_for_deep) {
+      dout(20) << __func__ << ": auto repair with deep scrubbing" << dendl;
+      scrubber.auto_repair = true;
+    } else {
+      dout(20) << __func__ << ": auto repair with scrubbing, rescrub if errors found" << dendl;
+      scrubber.deep_scrub_on_error = true;
+    }
   }
 
   bool ret = true;
@@ -5723,7 +5732,9 @@ bool PG::ops_blocked_by_scrub() const {
 // the part that actually finalizes a scrub
 void PG::scrub_finish()
 {
+  dout(20) << __func__ << dendl;
   bool repair = state_test(PG_STATE_REPAIR);
+  bool do_deep_scrub = false;
   // if the repair request comes from auto-repair and large number of errors,
   // we would like to cancel auto-repair
   if (repair && scrubber.auto_repair
@@ -5734,6 +5745,15 @@
   bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
   const char *mode = (repair ? "repair": (deep_scrub ? "deep-scrub" : "scrub"));
 
+  // if a regular scrub had errors within the limit, do a deep scrub to auto repair.
+  if (scrubber.deep_scrub_on_error
+      && scrubber.authoritative.size() <= cct->_conf->osd_scrub_auto_repair_num_errors) {
+    ceph_assert(!deep_scrub);
+    scrubber.deep_scrub_on_error = false;
+    do_deep_scrub = true;
+    dout(20) << __func__ << " Try to auto repair after scrub errors" << dendl;
+  }
+
   // type-specific finish (can tally more errors)
   _scrub_finish();
@@ -5773,10 +5793,17 @@
       if (scrubber.fixed == scrubber.shallow_errors + scrubber.deep_errors) {
 	ceph_assert(deep_scrub);
 	scrubber.shallow_errors = scrubber.deep_errors = 0;
-      } else {
+	dout(20) << __func__ << " All may be fixed" << dendl;
+      } else if (has_error) {
 	// Deep scrub in order to get corrected error counts
 	scrub_after_recovery = true;
-      }
+	dout(20) << __func__ << " Set scrub_after_recovery" << dendl;
+      } else if (scrubber.shallow_errors || scrubber.deep_errors) {
+	// We have errors but nothing can be fixed, so there is no repair
+	// possible.
+	state_set(PG_STATE_FAILED_REPAIR);
+	dout(10) << __func__ << " " << (scrubber.shallow_errors + scrubber.deep_errors)
+		 << " error(s) present with no repair possible" << dendl;
+      }
     }
     if (deep_scrub) {
       if ((scrubber.shallow_errors == 0) && (scrubber.deep_errors == 0))
@@ -5799,7 +5826,22 @@
   info.stats.stats.sum.num_scrub_errors =
     info.stats.stats.sum.num_shallow_scrub_errors +
     info.stats.stats.sum.num_deep_scrub_errors;
+  if (scrubber.check_repair) {
+    scrubber.check_repair = false;
+    if (info.stats.stats.sum.num_scrub_errors) {
+      state_set(PG_STATE_FAILED_REPAIR);
+      dout(10) << __func__ << " " << info.stats.stats.sum.num_scrub_errors
+	       << " error(s) still present after re-scrub" << dendl;
+    }
+  }
   publish_stats_to_osd();
+  if (do_deep_scrub) {
+    // XXX: Auto scrub won't activate if must_scrub is set, but
+    // setting the scrub stamps affects what users see.
+    utime_t stamp = utime_t(0,1);
+    set_last_scrub_stamp(stamp);
+    set_last_deep_scrub_stamp(stamp);
+  }
   reg_next_scrub();
 
   {
@@ -6462,6 +6504,10 @@ ostream& operator<<(ostream& out, const PG& pg)
     out << " MUST_REPAIR";
   if (pg.scrubber.auto_repair)
     out << " AUTO_REPAIR";
+  if (pg.scrubber.check_repair)
+    out << " CHECK_REPAIR";
+  if (pg.scrubber.deep_scrub_on_error)
+    out << " DEEP_SCRUB_ON_ERROR";
   if (pg.scrubber.must_deep_scrub)
     out << " MUST_DEEP_SCRUB";
   if (pg.scrubber.must_scrub)
diff --git a/src/osd/PG.h b/src/osd/PG.h
index 1bf7a60a5edb9..110572c9afd79 100644
--- a/src/osd/PG.h
+++ b/src/osd/PG.h
@@ -1702,6 +1702,11 @@ public:
 
     // this flag indicates whether we would like to do auto-repair of the PG or not
     bool auto_repair;
+    // this flag indicates that we are scrubbing post repair to verify everything is fixed
+    bool check_repair;
+    // this flag indicates that if a regular scrub detects errors <= osd_scrub_auto_repair_num_errors,
+    // we should deep scrub in order to auto repair
+    bool deep_scrub_on_error;
 
     // Maps from objects with errors to missing/inconsistent peers
     map<hobject_t, set<pg_shard_t>> missing;
@@ -1814,6 +1819,8 @@ public:
       must_deep_scrub = false;
       must_repair = false;
       auto_repair = false;
+      check_repair = false;
+      deep_scrub_on_error = false;
 
       state = PG::Scrubber::INACTIVE;
       start = hobject_t();
diff --git a/src/osd/ReplicatedBackend.h b/src/osd/ReplicatedBackend.h
index 35feef16bce7b..c5593080deed0 100644
--- a/src/osd/ReplicatedBackend.h
+++ b/src/osd/ReplicatedBackend.h
@@ -416,7 +416,7 @@ private:
   struct C_OSD_RepModifyCommit;
 
   void repop_commit(RepModifyRef rm);
-  bool auto_repair_supported() const override { return false; }
+  bool auto_repair_supported() const override { return store->has_builtin_csum(); }
 
   int be_deep_scrub(
diff --git a/src/osd/osd_types.cc b/src/osd/osd_types.cc
index 130a8f2e2f86a..ad839ad911fa8 100644
--- a/src/osd/osd_types.cc
+++ b/src/osd/osd_types.cc
@@ -976,6 +976,8 @@ std::string pg_state_string(uint64_t state)
     oss << "snaptrim_wait+";
   if (state & PG_STATE_SNAPTRIM_ERROR)
     oss << "snaptrim_error+";
+  if (state & PG_STATE_FAILED_REPAIR)
+    oss << "failed_repair+";
   string ret(oss.str());
   if (ret.length() > 0)
     ret.resize(ret.length() - 1);
@@ -1047,6 +1049,8 @@ boost::optional<uint64_t> pg_string_state(const std::string& state)
     type = PG_STATE_SNAPTRIM_ERROR;
   else if (state == "creating")
     type = PG_STATE_CREATING;
+  else if (state == "failed_repair")
+    type = PG_STATE_FAILED_REPAIR;
   else if (state == "unknown")
     type = 0;
   else
diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h
index a8dc8ee6425c5..c401d9c7b06a1 100644
--- a/src/osd/osd_types.h
+++ b/src/osd/osd_types.h
@@ -963,6 +963,7 @@ WRITE_CLASS_ENCODER_FEATURES(objectstore_perf_stat_t)
 #define PG_STATE_SNAPTRIM_ERROR (1ULL << 29) // error stopped trimming snaps
 #define PG_STATE_FORCED_RECOVERY (1ULL << 30) // force recovery of this pg before any other
 #define PG_STATE_FORCED_BACKFILL (1ULL << 31) // force backfill of this pg before any other
+#define PG_STATE_FAILED_REPAIR (1ULL << 32) // A repair failed to fix all errors
 
 std::string pg_state_string(uint64_t state);
 std::string pg_vector_string(const vector<int32_t> &a);
-- 
2.39.5