From: David Zafman
Date: Thu, 2 Jul 2020 17:05:57 +0000 (-0700)
Subject: osd: Cancel in-progress scrubs (not user requested)
X-Git-Tag: v14.2.11~14^2~1
X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=16f52b1b1e29d9c4e94d5d146c7b404f1d8ef4da;p=ceph.git

osd: Cancel in-progress scrubs (not user requested)

This change adds new scrubber.req_scrub to track user requested scrubs,
deep_scrub or repair.

Fixes: https://tracker.ceph.com/issues/46275

Signed-off-by: David Zafman
(cherry picked from commit 33749cc3c39131d7abed9b8c14064dbfaa87f3a2)

Conflicts:
	PendingReleaseNotes (trivial)
	src/osd/PG.cc (Due to code re-arrangement changes add manually)
---

diff --git a/PendingReleaseNotes b/PendingReleaseNotes
index c0ec6fb1709e4..76cc45fcb1e7c 100644
--- a/PendingReleaseNotes
+++ b/PendingReleaseNotes
@@ -8,3 +8,7 @@
   results on the cluster, which could fill a nearly-full cluster.
   They have been replaced by a tool, currently considered experimental,
   ``rgw-orphan-list``.
+
+* Now when noscrub and/or nodeep-scrub flags are set globally or per pool,
+  scheduled scrubs of the type disabled will be aborted. All user initiated
+  scrubs are NOT interrupted.
diff --git a/src/osd/PG.cc b/src/osd/PG.cc
index d421fc512302f..40da25a3bc67e 100644
--- a/src/osd/PG.cc
+++ b/src/osd/PG.cc
@@ -363,6 +363,7 @@ PG::PG(OSDService *o, OSDMapRef curmap,
   finish_sync_event(NULL),
   backoff_lock("PG::backoff_lock"),
   scrub_after_recovery(false),
+  save_req_scrub(false),
   active_pushes(0),
   recovery_state(this),
   peer_features(CEPH_FEATURES_SUPPORTED_DEFAULT),
@@ -1122,6 +1123,7 @@ void PG::clear_primary_state()
   scrubber.reserved_peers.clear();
 
   scrub_after_recovery = false;
+  save_req_scrub = false;
 
   agent_clear();
 }
@@ -1132,7 +1134,7 @@ PG::Scrubber::Scrubber()
    active(false),
    shallow_errors(0), deep_errors(0), fixed(0),
    must_scrub(false), must_deep_scrub(false), must_repair(false),
-   need_auto(false), time_for_deep(false),
+   need_auto(false), req_scrub(false), time_for_deep(false),
    auto_repair(false),
    check_repair(false),
    deep_scrub_on_error(false),
@@ -2635,6 +2637,8 @@ void PG::_finish_recovery(Context *c)
       scrub_after_recovery = false;
       scrubber.must_deep_scrub = true;
       scrubber.check_repair = true;
+      // We remember whether req_scrub was set when scrub_after_recovery set to true
+      scrubber.req_scrub = save_req_scrub;
       queue_scrub();
     }
   } else {
@@ -4581,6 +4585,7 @@ void PG::scrub_requested(bool deep, bool repair, bool need_auto)
     scrubber.must_repair = repair;
     // User might intervene, so clear this
     scrubber.need_auto = false;
+    scrubber.req_scrub = true;
   }
   reg_next_scrub();
 }
@@ -5239,6 +5244,12 @@ void PG::scrub(epoch_t queued, ThreadPool::TPHandle &handle)
   chunky_scrub(handle);
 }
 
+void PG::abort_scrub()
+{
+  scrub_clear_state();
+  scrub_unreserve_replicas();
+}
+
 /*
  * Chunky scrub scrubs objects one chunk at a time with writes blocked for that
  * chunk.
@@ -5319,12 +5330,29 @@ void PG::scrub(epoch_t queued, ThreadPool::TPHandle &handle)
  */
 void PG::chunky_scrub(ThreadPool::TPHandle &handle)
 {
+  // Since repair is only by request and we need to scrub afterward
+  // treat the same as req_scrub.
+  if (!scrubber.req_scrub) {
+    if (state_test(PG_STATE_DEEP_SCRUB)) {
+      if (get_osdmap()->test_flag(CEPH_OSDMAP_NODEEP_SCRUB) ||
+          pool.info.has_flag(pg_pool_t::FLAG_NODEEP_SCRUB)) {
+        dout(10) << "nodeep_scrub set, aborting" << dendl;
+        abort_scrub();
+        return;
+      }
+    } else if (state_test(PG_STATE_SCRUBBING)) {
+      if (get_osdmap()->test_flag(CEPH_OSDMAP_NOSCRUB) || pool.info.has_flag(pg_pool_t::FLAG_NOSCRUB)) {
+        dout(10) << "noscrub set, aborting" << dendl;
+        abort_scrub();
+        return;
+      }
+    }
+  }
   // check for map changes
   if (scrubber.is_chunky_scrub_active()) {
     if (scrubber.epoch_start != info.history.same_interval_since) {
-      dout(10) << "scrub pg changed, aborting" << dendl;
-      scrub_clear_state();
-      scrub_unreserve_replicas();
+      dout(10) << "scrub pg changed, aborting" << dendl;
+      abort_scrub();
       return;
     }
   }
@@ -5705,6 +5733,7 @@ void PG::scrub_clear_state(bool has_error)
   state_clear(PG_STATE_DEEP_SCRUB);
   publish_stats_to_osd();
 
+  scrubber.req_scrub = false;
   // local -> nothing.
   if (scrubber.local_reserved) {
     osd->dec_scrubs_local();
@@ -5958,7 +5987,8 @@ void PG::scrub_finish()
     } else if (has_error) {
       // Deep scrub in order to get corrected error counts
       scrub_after_recovery = true;
-      dout(20) << __func__ << " Set scrub_after_recovery" << dendl;
+      save_req_scrub = scrubber.req_scrub;
+      dout(20) << __func__ << " Set scrub_after_recovery, req_scrub=" << save_req_scrub << dendl;
     } else if (scrubber.shallow_errors || scrubber.deep_errors) {
       // We have errors but nothing can be fixed, so there is no repair
       // possible.
@@ -6673,6 +6703,8 @@ ostream& operator<<(ostream& out, const PG& pg)
     out << " TIME_FOR_DEEP";
   if (pg.scrubber.need_auto)
     out << " NEED_AUTO";
+  if (pg.scrubber.req_scrub)
+    out << " REQ_SCRUB";
 
   //out << " (" << pg.pg_log.get_tail() << "," << pg.pg_log.get_head() << "]";
   if (pg.pg_log.get_missing().num_missing()) {
diff --git a/src/osd/PG.h b/src/osd/PG.h
index b9383376f75b5..1ebf0c8f875fb 100644
--- a/src/osd/PG.h
+++ b/src/osd/PG.h
@@ -1717,7 +1717,7 @@ public:
     utime_t sleep_start;
 
     // flags to indicate explicitly requested scrubs (by admin)
-    bool must_scrub, must_deep_scrub, must_repair, need_auto;
+    bool must_scrub, must_deep_scrub, must_repair, need_auto, req_scrub;
 
     // Priority to use for scrub scheduling
     unsigned priority = 0;
@@ -1842,6 +1842,7 @@ public:
       must_deep_scrub = false;
       must_repair = false;
       need_auto = false;
+      req_scrub = false;
       time_for_deep = false;
       auto_repair = false;
       check_repair = false;
@@ -1878,6 +1879,7 @@ public:
 
 protected:
   bool scrub_after_recovery;
+  bool save_req_scrub; // Saved for scrub_after_recovery
 
   int active_pushes;
 
@@ -1896,6 +1898,7 @@ protected:
     const hobject_t& soid,
     list<pair<ScrubMap::object, pg_shard_t> > *ok_peers,
     pg_shard_t bad_peer);
+  void abort_scrub();
   void chunky_scrub(ThreadPool::TPHandle &handle);
   void scrub_compare_maps();
   /**
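For illustration only (not part of the patch): the behaviour described in the
PendingReleaseNotes entry can be exercised with the standard Ceph CLI. The pool
name "rbd" and pg id "2.7" below are placeholders for a pool and PG in your
cluster.

    # Setting either flag causes an in-progress scheduled scrub of that type
    # to be aborted the next time chunky_scrub() runs:
    ceph osd set noscrub                    # cluster wide
    ceph osd pool set rbd nodeep-scrub 1    # or per pool

    # A user-initiated scrub sets scrubber.req_scrub and is not interrupted:
    ceph pg deep-scrub 2.7
    ceph pg repair 2.7

    # Re-enable scheduled scrubs afterwards:
    ceph osd unset noscrub
    ceph osd pool set rbd nodeep-scrub 0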