From ad5471f4ccd669075a54a495dfa9238e9c49f1d0 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 19 Jun 2020 10:52:52 +0800 Subject: [PATCH] mds: track scrub status in multiple mds Scrubs are always initialized from mds.0. So mds.0 can ensure that scrub tags are unique globally. mds.0 periodically gathers scrubs running in itself and in other mds. A scrub is finished only if it's not running in any mds. Signed-off-by: "Yan, Zheng" --- src/common/strtol.h | 2 + src/mds/CDir.cc | 10 +- src/mds/CDir.h | 5 +- src/mds/CInode.cc | 3 + src/mds/CInode.h | 5 +- src/mds/MDCache.cc | 101 ++++++++++---- src/mds/MDCache.h | 14 +- src/mds/MDSRank.cc | 16 +++ src/mds/ScrubHeader.h | 18 ++- src/mds/ScrubStack.cc | 249 ++++++++++++++++++++++++++++------ src/mds/ScrubStack.h | 21 ++- src/mds/Server.cc | 29 +--- src/messages/MMDSScrub.h | 16 ++- src/messages/MMDSScrubStats.h | 75 ++++++++++ src/msg/Message.cc | 5 + src/msg/Message.h | 1 + 16 files changed, 454 insertions(+), 116 deletions(-) create mode 100644 src/messages/MMDSScrubStats.h diff --git a/src/common/strtol.h b/src/common/strtol.h index e01d25547ac2e..5ebfc2a46b2b0 100644 --- a/src/common/strtol.h +++ b/src/common/strtol.h @@ -134,8 +134,10 @@ auto consume(std::string_view& sv, int base = 10) bool strict_strtob(const char* str, std::string *err); +long long strict_strtoll(std::string_view str, int base, std::string *err); long long strict_strtoll(const char *str, int base, std::string *err); +int strict_strtol(std::string_view str, int base, std::string *err); int strict_strtol(const char *str, int base, std::string *err); double strict_strtod(const char *str, std::string *err); diff --git a/src/mds/CDir.cc b/src/mds/CDir.cc index 58149e653b2c9..3a734e989ffb4 100644 --- a/src/mds/CDir.cc +++ b/src/mds/CDir.cc @@ -3479,16 +3479,18 @@ void CDir::scrub_initialize(const ScrubHeaderRef& header) // FIXME: weird implicit construction, is someone else meant // to be calling scrub_info_create first? scrub_info(); - scrub_infop->header = header; scrub_infop->directory_scrubbing = true; + scrub_infop->header = header; + header->inc_num_pending(); } void CDir::scrub_aborted() { dout(20) << __func__ << dendl; ceph_assert(scrub_is_in_progress()); - scrub_infop->directory_scrubbing = false; scrub_infop->last_scrub_dirty = false; + scrub_infop->directory_scrubbing = false; + scrub_infop->header->dec_num_pending(); scrub_infop.reset(); } @@ -3496,7 +3498,6 @@ void CDir::scrub_finished() { dout(20) << __func__ << dendl; ceph_assert(scrub_is_in_progress()); - scrub_infop->directory_scrubbing = false; scrub_infop->last_local.time = ceph_clock_now(); scrub_infop->last_local.version = get_version(); @@ -3504,6 +3505,9 @@ void CDir::scrub_finished() scrub_infop->last_recursive = scrub_infop->last_local; scrub_infop->last_scrub_dirty = true; + + scrub_infop->directory_scrubbing = false; + scrub_infop->header->dec_num_pending(); } void CDir::scrub_maybe_delete_info() diff --git a/src/mds/CDir.h b/src/mds/CDir.h index 581f3a818f80b..6d876975e111b 100644 --- a/src/mds/CDir.h +++ b/src/mds/CDir.h @@ -298,8 +298,9 @@ public: * @post It has set up its internal scrubbing state. */ void scrub_initialize(const ScrubHeaderRef& header); - ScrubHeaderRef get_scrub_header() { - return scrub_infop ? scrub_infop->header : nullptr; + const ScrubHeaderRef& get_scrub_header() { + static const ScrubHeaderRef nullref; + return scrub_infop ? scrub_infop->header : nullref; } bool scrub_is_in_progress() const { diff --git a/src/mds/CInode.cc b/src/mds/CInode.cc index c812b7bfd2d63..9d3dd85e73969 100644 --- a/src/mds/CInode.cc +++ b/src/mds/CInode.cc @@ -5117,6 +5117,7 @@ void CInode::scrub_initialize(ScrubHeaderRef& header) scrub_infop->scrub_in_progress = true; scrub_infop->queued_frags.clear(); scrub_infop->header = header; + header->inc_num_pending(); // right now we don't handle remote inodes } @@ -5125,6 +5126,7 @@ void CInode::scrub_aborted() { ceph_assert(scrub_is_in_progress()); scrub_infop->scrub_in_progress = false; + scrub_infop->header->dec_num_pending(); scrub_maybe_delete_info(); } @@ -5136,6 +5138,7 @@ void CInode::scrub_finished() { scrub_infop->last_scrub_stamp = ceph_clock_now(); scrub_infop->last_scrub_dirty = true; scrub_infop->scrub_in_progress = false; + scrub_infop->header->dec_num_pending(); } int64_t CInode::get_backtrace_pool() const diff --git a/src/mds/CInode.h b/src/mds/CInode.h index 44e71f0b6a871..513113ba146f0 100644 --- a/src/mds/CInode.h +++ b/src/mds/CInode.h @@ -428,8 +428,9 @@ class CInode : public MDSCacheObject, public InodeStoreBase, public Counterheader : nullptr; + const ScrubHeaderRef& get_scrub_header() { + static const ScrubHeaderRef nullref; + return scrub_infop ? scrub_infop->header : nullref; } bool scrub_is_in_progress() const { diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc index d8de397d69f57..12fb2068fb642 100644 --- a/src/mds/MDCache.cc +++ b/src/mds/MDCache.cc @@ -12695,18 +12695,20 @@ int MDCache::dump_cache(std::string_view fn, Formatter *f) return r; } - - -C_MDS_RetryRequest::C_MDS_RetryRequest(MDCache *c, MDRequestRef& r) - : MDSInternalContext(c->mds), cache(c), mdr(r) -{} - void C_MDS_RetryRequest::finish(int r) { mdr->retry++; cache->dispatch_request(mdr); } +MDSContext *CF_MDS_RetryRequestFactory::build() +{ + if (drop_locks) { + mdcache->mds->locker->drop_locks(mdr.get(), nullptr); + mdr->drop_local_auth_pins(); + } + return new C_MDS_RetryRequest(mdcache, mdr); +} class C_MDS_EnqueueScrub : public Context { @@ -12722,13 +12724,11 @@ public: if (r == 0) { // since recursive scrub is asynchronous, dump minimal output // to not upset cli tools. - if (header && header->get_recursive()) { - formatter->open_object_section("results"); - formatter->dump_int("return_code", 0); - formatter->dump_string("scrub_tag", tag); - formatter->dump_string("mode", "asynchronous"); - formatter->close_section(); // results - } + formatter->open_object_section("results"); + formatter->dump_int("return_code", 0); + formatter->dump_string("scrub_tag", tag); + formatter->dump_string("mode", "asynchronous"); + formatter->close_section(); // results } else { // we failed the lookup or something; dump ourselves formatter->open_object_section("results"); formatter->dump_int("return_code", r); @@ -12747,14 +12747,26 @@ void MDCache::enqueue_scrub( Formatter *f, Context *fin) { dout(10) << __func__ << " " << path << dendl; - MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_ENQUEUE_SCRUB); - if (path == "~mdsdir") { - filepath fp(MDS_INO_MDSDIR(mds->get_nodeid())); - mdr->set_filepath(fp); - } else { - filepath fp(path); - mdr->set_filepath(path); + + filepath fp; + if (path.compare(0, 4, "~mds") == 0) { + mds_rank_t rank; + if (path == "~mdsdir") { + rank = mds->get_nodeid(); + } else { + std::string err; + rank = strict_strtoll(path.substr(4), 10, &err); + if (!err.empty()) + rank = MDS_RANK_NONE; + } + if (rank >= 0 && rank < MAX_MDS) + fp.set_path("", MDS_INO_MDSDIR(rank)); } + if (fp.get_ino() == inodeno_t(0)) + fp.set_path(path); + + MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_ENQUEUE_SCRUB); + mdr->set_filepath(fp); bool is_internal = false; std::string tag_str(tag); @@ -12774,21 +12786,32 @@ void MDCache::enqueue_scrub( void MDCache::enqueue_scrub_work(MDRequestRef& mdr) { - CInode *in = mds->server->rdlock_path_pin_ref(mdr, true); - if (NULL == in) + CInode *in; + CF_MDS_RetryRequestFactory cf(this, mdr, true); + int r = path_traverse(mdr, cf, mdr->get_filepath(), + MDS_TRAVERSE_DISCOVER | MDS_TRAVERSE_RDLOCK_PATH, + nullptr, &in); + if (r > 0) + return; + if (r < 0) { + mds->server->respond_to_request(mdr, r); return; + } - // TODO: Remove this restriction - ceph_assert(in->is_auth()); + // Cannot scrub same dentry twice at same time + if (in->scrub_is_in_progress()) { + mds->server->respond_to_request(mdr, -EBUSY); + return; + } else { + in->scrub_info(); + } C_MDS_EnqueueScrub *cs = static_cast(mdr->internal_op_finish); - ScrubHeaderRef header = cs->header; - header->set_origin(in); + ScrubHeaderRef& header = cs->header; - int r = mds->scrubstack->enqueue(in, header, !header->get_recursive()); + r = mds->scrubstack->enqueue(in, header, !header->get_recursive()); mds->server->respond_to_request(mdr, r); - return; } struct C_MDC_RespondInternalRequest : public MDCacheLogContext { @@ -12801,12 +12824,27 @@ struct C_MDC_RespondInternalRequest : public MDCacheLogContext { } }; +struct C_MDC_ScrubRepaired : public MDCacheContext { + ScrubHeaderRef header; +public: + C_MDC_ScrubRepaired(MDCache *m, const ScrubHeaderRef& h) + : MDCacheContext(m), header(h) { + header->inc_num_pending(); + } + void finish(int r) override { + header->dec_num_pending(); + } +}; + void MDCache::repair_dirfrag_stats(CDir *dir) { MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_REPAIR_FRAGSTATS); mdr->pin(dir); mdr->internal_op_private = dir; - mdr->internal_op_finish = new C_MDSInternalNoop; + if (dir->scrub_is_in_progress()) + mdr->internal_op_finish = new C_MDC_ScrubRepaired(this, dir->get_scrub_header()); + else + mdr->internal_op_finish = new C_MDSInternalNoop; repair_dirfrag_stats_work(mdr); } @@ -12911,7 +12949,10 @@ void MDCache::repair_inode_stats(CInode *diri) MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_REPAIR_INODESTATS); mdr->auth_pin(diri); // already auth pinned by CInode::validate_disk_state() mdr->internal_op_private = diri; - mdr->internal_op_finish = new C_MDSInternalNoop; + if (diri->scrub_is_in_progress()) + mdr->internal_op_finish = new C_MDC_ScrubRepaired(this, diri->get_scrub_header()); + else + mdr->internal_op_finish = new C_MDSInternalNoop; repair_inode_stats_work(mdr); } diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h index 2db916d70a73c..5c837d620c5a2 100644 --- a/src/mds/MDCache.h +++ b/src/mds/MDCache.h @@ -1345,8 +1345,20 @@ class C_MDS_RetryRequest : public MDSInternalContext { MDCache *cache; MDRequestRef mdr; public: - C_MDS_RetryRequest(MDCache *c, MDRequestRef& r); + C_MDS_RetryRequest(MDCache *c, MDRequestRef& r) : + MDSInternalContext(c->mds), cache(c), mdr(r) {} void finish(int r) override; }; +class CF_MDS_RetryRequestFactory : public MDSContextFactory { +public: + CF_MDS_RetryRequestFactory(MDCache *cache, MDRequestRef &mdr, bool dl) : + mdcache(cache), mdr(mdr), drop_locks(dl) {} + MDSContext *build() override; +private: + MDCache *mdcache; + MDRequestRef mdr; + bool drop_locks; +}; + #endif diff --git a/src/mds/MDSRank.cc b/src/mds/MDSRank.cc index b6b06a06d61cb..42df3fb69b948 100644 --- a/src/mds/MDSRank.cc +++ b/src/mds/MDSRank.cc @@ -757,6 +757,9 @@ void MDSRankDispatcher::tick() set_mdsmap_multimds_snaps_allowed(); } } + + if (whoami == 0) + scrubstack->advance_scrub_status(); } if (is_active() || is_stopping()) { @@ -1172,6 +1175,7 @@ bool MDSRank::is_valid_message(const cref_t &m) { type == MSG_MDS_LOCK || type == MSG_MDS_INODEFILECAPS || type == MSG_MDS_SCRUB || + type == MSG_MDS_SCRUB_STATS || type == CEPH_MSG_CLIENT_CAPS || type == CEPH_MSG_CLIENT_CAPRELEASE || type == CEPH_MSG_CLIENT_LEASE) { @@ -1258,6 +1262,7 @@ void MDSRank::handle_message(const cref_t &m) break; case MSG_MDS_SCRUB: + case MSG_MDS_SCRUB_STATS: ALLOW_MESSAGES_FROM(CEPH_ENTITY_TYPE_MDS); scrubstack->dispatch(m); break; @@ -2608,6 +2613,12 @@ void MDSRankDispatcher::handle_asok_command( r = config_client(client_id, !got_value, option, value, *css); } else if (command == "scrub start" || command == "scrub_start") { + if (whoami != 0) { + *css << "Not rank 0"; + r = -EXDEV; + goto out; + } + string path; string tag; vector scrubop_vec; @@ -2664,6 +2675,11 @@ void MDSRankDispatcher::handle_asok_command( } else if (command == "scrub status") { command_scrub_status(f); } else if (command == "tag path") { + if (whoami != 0) { + *css << "Not rank 0"; + r = -EXDEV; + goto out; + } string path; cmd_getval(cmdmap, "path", path); string tag; diff --git a/src/mds/ScrubHeader.h b/src/mds/ScrubHeader.h index ecf24c1abcb21..eb79090b0073c 100644 --- a/src/mds/ScrubHeader.h +++ b/src/mds/ScrubHeader.h @@ -41,27 +41,39 @@ public: // Set after construction because it won't be known until we've // started resolving path and locking - void set_origin(CInode *origin_) { origin = origin_; } + void set_origin(inodeno_t ino) { origin = ino; } bool get_recursive() const { return recursive; } bool get_repair() const { return repair; } bool get_force() const { return force; } bool is_internal_tag() const { return is_tag_internal; } - CInode *get_origin() const { return origin; } + inodeno_t get_origin() const { return origin; } const std::string& get_tag() const { return tag; } bool get_repaired() const { return repaired; } void set_repaired() { repaired = true; } + void set_epoch_last_forwarded(unsigned epoch) { epoch_last_forwarded = epoch; } + unsigned get_epoch_last_forwarded() const { return epoch_last_forwarded; } + + void inc_num_pending() { ++num_pending; } + void dec_num_pending() { + ceph_assert(num_pending > 0); + --num_pending; + } + unsigned get_num_pending() const { return num_pending; } + protected: const std::string tag; bool is_tag_internal; const bool force; const bool recursive; const bool repair; - CInode *origin = nullptr; + inodeno_t origin; bool repaired = false; // May be set during scrub if repairs happened + unsigned epoch_last_forwarded = 0; + unsigned num_pending = 0; }; typedef std::shared_ptr ScrubHeaderRef; diff --git a/src/mds/ScrubStack.cc b/src/mds/ScrubStack.cc index d5c04f6634483..505ff98531dde 100644 --- a/src/mds/ScrubStack.cc +++ b/src/mds/ScrubStack.cc @@ -92,21 +92,29 @@ int ScrubStack::_enqueue(MDSCacheObject *obj, ScrubHeaderRef& header, bool top) scrub_stack.push_front(&obj->item_scrub); else scrub_stack.push_back(&obj->item_scrub); + return 0; } -void ScrubStack::enqueue(CInode *in, ScrubHeaderRef& header, bool top) +int ScrubStack::enqueue(CInode *in, ScrubHeaderRef& header, bool top) { // abort in progress if (clear_stack) return -EAGAIN; - scrub_origins.emplace(in); - clog_scrub_summary(in); + header->set_origin(in->ino()); + auto ret = scrubbing_map.emplace(header->get_tag(), header); + if (!ret.second) { + dout(10) << __func__ << " with {" << *in << "}" + << ", conflicting tag " << header->get_tag() << dendl; + return -EEXIST; + } int r = _enqueue(in, header, top); if (r < 0) return r; + clog_scrub_summary(in); + kick_off_scrubs(); return 0; } @@ -259,9 +267,9 @@ bool ScrubStack::validate_inode_auth(CInode *in) dout(10) << __func__ << " forward to mds." << auth << dendl; auto r = make_message(MMDSScrub::OP_QUEUEINO, in->ino(), std::move(in->scrub_queued_frags()), - header->get_tag(), header->is_internal_tag(), - header->get_force(), header->get_recursive(), - header->get_repair()); + header->get_tag(), header->get_origin(), + header->is_internal_tag(), header->get_force(), + header->get_recursive(), header->get_repair()); mdcache->mds->send_message_mds(r, auth); scrub_r.gather_set.insert(auth); @@ -338,8 +346,9 @@ void ScrubStack::scrub_dir_inode(CInode *in, bool *added_children, bool *done) dout(20) << __func__ << " forward " << p.second << " to mds." << p.first << dendl; auto r = make_message(MMDSScrub::OP_QUEUEDIR, in->ino(), std::move(p.second), header->get_tag(), - header->is_internal_tag(), header->get_force(), - header->get_recursive(), header->get_repair()); + header->get_origin(), header->is_internal_tag(), + header->get_force(), header->get_recursive(), + header->get_repair()); mds->send_message_mds(r, p.first); scrub_r.gather_set.insert(p.first); } @@ -486,11 +495,6 @@ void ScrubStack::_validate_inode_done(CInode *in, int r, dout(10) << __func__ << " scrub passed on inode " << *in << dendl; } - if (in == header->get_origin()) { - scrub_origins.erase(in); - clog_scrub_summary(in); - } - in->scrub_finished(); } @@ -532,7 +536,9 @@ std::string_view ScrubStack::scrub_summary() { CachedStackStringStream cs; if (state == STATE_IDLE) { - return "idle"; + if (scrubbing_map.empty()) + return "idle"; + *cs << "idle+waiting"; } if (state == STATE_RUNNING) { @@ -558,14 +564,18 @@ std::string_view ScrubStack::scrub_summary() { } } - if (!scrub_origins.empty()) { - *cs << " [paths:"; - for (auto inode = scrub_origins.begin(); inode != scrub_origins.end(); ++inode) { - if (inode != scrub_origins.begin()) { - *cs << ","; - } - - *cs << scrub_inode_path(*inode); + if (!scrubbing_map.empty()) { + *cs << " paths ["; + bool first = true; + for (auto &p : scrubbing_map) { + if (!first) + *cs << ","; + auto& header = p.second; + if (CInode *in = mdcache->get_inode(header->get_origin())) + *cs << scrub_inode_path(in); + else + *cs << "#" << header->get_origin(); + first = false; } *cs << "]"; } @@ -582,7 +592,10 @@ void ScrubStack::scrub_status(Formatter *f) { bool have_more = false; if (state == STATE_IDLE) { - *css << "no active scrubs running"; + if (scrubbing_map.empty()) + *css << "no active scrubs running"; + else + *css << state << " (waiting for more scrubs)"; } else if (state == STATE_RUNNING) { if (clear_stack) { *css << "ABORTING"; @@ -607,14 +620,20 @@ void ScrubStack::scrub_status(Formatter *f) { f->dump_string("status", css->strv()); f->open_object_section("scrubs"); - for (auto &inode : scrub_origins) { + + for (auto& p : scrubbing_map) { have_more = false; - ScrubHeaderRefConst header = inode->get_scrub_header(); + auto& header = p.second; std::string tag(header->get_tag()); f->open_object_section(tag.c_str()); // scrub id - f->dump_string("path", scrub_inode_path(inode)); + if (CInode *in = mdcache->get_inode(header->get_origin())) + f->dump_string("path", scrub_inode_path(in)); + else + f->dump_stream("path") << "#" << header->get_origin(); + + f->dump_string("tag", header->get_tag()); CachedStackStringStream optcss; if (header->get_recursive()) { @@ -648,10 +667,6 @@ void ScrubStack::abort_pending_scrubs() { auto abort_one = [this](MDSCacheObject *obj) { if (CInode *in = dynamic_cast(obj)) { - if (in == in->scrub_info()->header->get_origin()) { - scrub_origins.erase(in); - clog_scrub_summary(in); - } in->scrub_aborted(); } else if (CDir *dir = dynamic_cast(obj)) { dir->scrub_aborted(); @@ -747,7 +762,7 @@ void ScrubStack::clog_scrub_summary(CInode *in) { std::string what; if (clear_stack) { what = "aborted"; - } else if (scrub_origins.count(in)) { + } else if (in->scrub_is_in_progress()) { what = "queued"; } else { what = "completed"; @@ -765,6 +780,10 @@ void ScrubStack::dispatch(const cref_t &m) handle_scrub(ref_cast(m)); break; + case MSG_MDS_SCRUB_STATS: + handle_scrub_stats(ref_cast(m)); + break; + default: derr << " scrub stack unknown message " << m->get_type() << dendl_impl; ceph_abort_msg("scrub stack unknown message"); @@ -811,12 +830,19 @@ void ScrubStack::handle_scrub(const cref_t &m) fragset_t queued; if (!dfs.empty()) { - ScrubHeaderRef header = std::make_shared(m->get_tag(), m->is_internal_tag(), - m->is_force(), m->is_recursive(), - m->is_repair()); + ScrubHeaderRef header; + if (auto it = scrubbing_map.find(m->get_tag()); it != scrubbing_map.end()) { + header = it->second; + } else { + header = std::make_shared(m->get_tag(), m->is_internal_tag(), + m->is_force(), m->is_recursive(), + m->is_repair()); + header->set_origin(m->get_origin()); + scrubbing_map.emplace(header->get_tag(), header); + } for (auto dir : dfs) { queued.insert_raw(dir->get_frag()); - _enqueue(dir, header, nullptr, true); + _enqueue(dir, header, true); } queued.simplify(); kick_off_scrubs(); @@ -842,6 +868,9 @@ void ScrubStack::handle_scrub(const cref_t &m) if (it->second.gather_set.empty()) { remote_scrubs.erase(it); + + const auto& header = diri->get_scrub_header(); + header->set_epoch_last_forwarded(scrub_epoch); remove_from_waiting(diri); } } @@ -853,9 +882,16 @@ void ScrubStack::handle_scrub(const cref_t &m) CInode *in = mdcache->get_inode(m->get_ino()); ceph_assert(in); - ScrubHeaderRef header = std::make_shared(m->get_tag(), m->is_internal_tag(), - m->is_force(), m->is_recursive(), - m->is_repair()); + ScrubHeaderRef header; + if (auto it = scrubbing_map.find(m->get_tag()); it != scrubbing_map.end()) { + header = it->second; + } else { + header = std::make_shared(m->get_tag(), m->is_internal_tag(), + m->is_force(), m->is_recursive(), + m->is_repair()); + header->set_origin(m->get_origin()); + scrubbing_map.emplace(header->get_tag(), header); + } _enqueue(in, header, true); in->scrub_queued_frags() = m->get_frags(); @@ -881,10 +917,8 @@ void ScrubStack::handle_scrub(const cref_t &m) remove_from_waiting(in, false); dequeue(in); - if (in == in->scrub_info()->header->get_origin()) { - scrub_origins.erase(in); - clog_scrub_summary(in); - } + const auto& header = in->get_scrub_header(); + header->set_epoch_last_forwarded(scrub_epoch); in->scrub_finished(); kick_off_scrubs(); @@ -897,6 +931,137 @@ void ScrubStack::handle_scrub(const cref_t &m) } } +void ScrubStack::handle_scrub_stats(const cref_t &m) +{ + mds_rank_t from = mds_rank_t(m->get_source().num()); + dout(7) << __func__ << " " << *m << " from mds." << from << dendl; + + if (from == 0) { + if (scrub_epoch != m->get_epoch() - 1) { + scrub_epoch = m->get_epoch() - 1; + for (auto& p : scrubbing_map) { + if (p.second->get_epoch_last_forwarded()) + p.second->set_epoch_last_forwarded(scrub_epoch); + } + } + bool any_finished = false; + bool any_repaired = false; + std::set scrubbing_tags; + for (auto it = scrubbing_map.begin(); it != scrubbing_map.end(); ) { + auto& header = it->second; + if (header->get_num_pending() || + header->get_epoch_last_forwarded() >= scrub_epoch) { + scrubbing_tags.insert(it->first); + ++it; + } else if (m->is_finished(it->first)) { + any_finished = true; + if (header->get_repaired()) + any_repaired = true; + scrubbing_map.erase(it++); + } else { + ++it; + } + } + + scrub_epoch = m->get_epoch(); + + auto ack = make_message(scrub_epoch, std::move(scrubbing_tags)); + mdcache->mds->send_message_mds(ack, 0); + + if (any_finished) + clog_scrub_summary(); + if (any_repaired) + mdcache->mds->mdlog->trim_all(); + } else { + if (scrub_epoch == m->get_epoch() && + (size_t)from < mds_scrub_stats.size()) { + auto& stat = mds_scrub_stats[from]; + stat.epoch_acked = m->get_epoch(); + stat.scrubbing_tags = m->get_scrubbing_tags(); + } + } +} + +void ScrubStack::advance_scrub_status() +{ + if (scrubbing_map.empty()) + return; + + MDSRank *mds = mdcache->mds; + + set up_mds; + mds->get_mds_map()->get_up_mds_set(up_mds); + auto up_max = *up_mds.rbegin(); + + bool update_scrubbing = false; + std::set scrubbing_tags; + + if (up_max == 0) { + update_scrubbing = true; + } else if (mds_scrub_stats.size() > (size_t)(up_max)) { + bool fully_acked = true; + for (const auto& stat : mds_scrub_stats) { + if (stat.epoch_acked != scrub_epoch) { + fully_acked = false; + break; + } + scrubbing_tags.insert(stat.scrubbing_tags.begin(), + stat.scrubbing_tags.end()); + } + if (fully_acked) { + // handle_scrub_stats() reports scrub is still in-progress if it has + // forwarded any object to other mds since previous epoch. Let's assume, + // at time 'A', we got scrub stats from all mds for previous epoch. If + // a scrub is not reported by any mds, we know there is no forward of + // the scrub since time 'A'. So we can consider the scrub is finished. + if (scrub_epoch_fully_acked + 1 == scrub_epoch) + update_scrubbing = true; + scrub_epoch_fully_acked = scrub_epoch; + } + } + + if (mds_scrub_stats.size() != (size_t)up_max + 1) + mds_scrub_stats.resize((size_t)up_max + 1); + mds_scrub_stats.at(0).epoch_acked = scrub_epoch + 1; + + bool any_finished = false; + bool any_repaired = false; + + for (auto it = scrubbing_map.begin(); it != scrubbing_map.end(); ) { + auto& header = it->second; + if (header->get_num_pending() || + header->get_epoch_last_forwarded() >= scrub_epoch) { + if (update_scrubbing && up_max != 0) + scrubbing_tags.insert(it->first); + ++it; + } else if (update_scrubbing && !scrubbing_tags.count(it->first)) { + // no longer being scrubbed globally + any_finished = true; + if (header->get_repaired()) + any_repaired = true; + scrubbing_map.erase(it++); + } else { + ++it; + } + } + + ++scrub_epoch; + + for (auto& r : up_mds) { + if (r == 0) + continue; + auto m = update_scrubbing ? + make_message(scrub_epoch, scrubbing_tags) : + make_message(scrub_epoch); + mds->send_message_mds(m, r); + } + + if (any_finished) + clog_scrub_summary(); + if (any_repaired) + mdcache->mds->mdlog->trim_all(); +} + void ScrubStack::handle_mds_failure(mds_rank_t mds) { bool kick = false; diff --git a/src/mds/ScrubStack.h b/src/mds/ScrubStack.h index 4bd725390d42c..a4488ff535379 100644 --- a/src/mds/ScrubStack.h +++ b/src/mds/ScrubStack.h @@ -24,6 +24,7 @@ #include "common/LogClient.h" #include "include/elist.h" #include "messages/MMDSScrub.h" +#include "messages/MMDSScrubStats.h" class MDCache; class Finisher; @@ -46,7 +47,7 @@ public: * @param in The inode to scrub * @param header The ScrubHeader propagated from wherever this scrub */ - void enqueue(CInode *in, ScrubHeaderRef& header, bool top); + int enqueue(CInode *in, ScrubHeaderRef& header, bool top); /** * Abort an ongoing scrub operation. The abort operation could be * delayed if there are in-progress scrub operations on going. The @@ -94,6 +95,8 @@ public: bool is_scrubbing() const { return !scrub_stack.empty(); } + void advance_scrub_status(); + void handle_mds_failure(mds_rank_t mds); void dispatch(const cref_t &m); @@ -121,6 +124,17 @@ protected: }; std::map remote_scrubs; + unsigned scrub_epoch = 2; + unsigned scrub_epoch_fully_acked = 0; + + struct scrub_stat_t { + unsigned epoch_acked = 0; + std::set scrubbing_tags; + }; + std::vector mds_scrub_stats; + + std::map scrubbing_map; + friend class C_RetryScrub; private: // scrub abort is _not_ a state, rather it's an operation that's @@ -241,6 +255,7 @@ private: void clog_scrub_summary(CInode *in=nullptr); void handle_scrub(const cref_t &m); + void handle_scrub_stats(const cref_t &m); State state = STATE_IDLE; bool clear_stack = false; @@ -248,10 +263,6 @@ private: // list of pending context completions for asynchronous scrub // control operations. std::vector control_ctxs; - - // list of inodes for which scrub operations are running -- used - // to diplay out in `scrub status`. - std::set scrub_origins; }; #endif /* SCRUBSTACK_H_ */ diff --git a/src/mds/Server.cc b/src/mds/Server.cc index 96e358813af6b..cbc31013f1f17 100644 --- a/src/mds/Server.cc +++ b/src/mds/Server.cc @@ -3382,23 +3382,6 @@ public: } }; -class CF_MDS_MDRContextFactory : public MDSContextFactory { -public: - CF_MDS_MDRContextFactory(MDCache *cache, MDRequestRef &mdr, bool dl) : - mdcache(cache), mdr(mdr), drop_locks(dl) {} - MDSContext *build() { - if (drop_locks) { - mdcache->mds->locker->drop_locks(mdr.get(), nullptr); - mdr->drop_local_auth_pins(); - } - return new C_MDS_RetryRequest(mdcache, mdr); - } -private: - MDCache *mdcache; - MDRequestRef mdr; - bool drop_locks; -}; - /* If this returns null, the request has been handled * as appropriate: forwarded on, or the client's been replied to */ CInode* Server::rdlock_path_pin_ref(MDRequestRef& mdr, @@ -3412,7 +3395,7 @@ CInode* Server::rdlock_path_pin_ref(MDRequestRef& mdr, return mdr->in[0]; // traverse - CF_MDS_MDRContextFactory cf(mdcache, mdr, true); + CF_MDS_RetryRequestFactory cf(mdcache, mdr, true); int flags = 0; if (refpath.is_last_snap()) { if (!no_want_auth) @@ -3505,7 +3488,7 @@ CDentry* Server::rdlock_path_xlock_dentry(MDRequestRef& mdr, } // traverse to parent dir - CF_MDS_MDRContextFactory cf(mdcache, mdr, true); + CF_MDS_RetryRequestFactory cf(mdcache, mdr, true); int flags = MDS_TRAVERSE_RDLOCK_SNAP | MDS_TRAVERSE_RDLOCK_PATH | MDS_TRAVERSE_WANT_DENTRY | MDS_TRAVERSE_XLOCK_DENTRY | MDS_TRAVERSE_WANT_AUTH; @@ -3600,7 +3583,7 @@ Server::rdlock_two_paths_xlock_destdn(MDRequestRef& mdr, bool xlock_srcdn) } // traverse to parent dir - CF_MDS_MDRContextFactory cf(mdcache, mdr, true); + CF_MDS_RetryRequestFactory cf(mdcache, mdr, true); int flags = MDS_TRAVERSE_RDLOCK_SNAP | MDS_TRAVERSE_WANT_DENTRY | MDS_TRAVERSE_WANT_AUTH; int r = mdcache->path_traverse(mdr, cf, refpath, flags, &mdr->dn[0]); if (r != 0) { @@ -3788,7 +3771,7 @@ void Server::handle_client_getattr(MDRequestRef& mdr, bool is_lookup) want_auth = true; // set want_auth for CEPH_STAT_RSTAT mask if (!mdr->is_batch_head() && mdr->can_batch()) { - CF_MDS_MDRContextFactory cf(mdcache, mdr, false); + CF_MDS_RetryRequestFactory cf(mdcache, mdr, false); int r = mdcache->path_traverse(mdr, cf, mdr->get_filepath(), (want_auth ? MDS_TRAVERSE_WANT_AUTH : 0), &mdr->dn[0], &mdr->in[0]); @@ -7554,7 +7537,7 @@ void Server::handle_peer_rmdir_prep(MDRequestRef& mdr) filepath srcpath(mdr->peer_request->srcdnpath); dout(10) << " src " << srcpath << dendl; CInode *in; - CF_MDS_MDRContextFactory cf(mdcache, mdr, false); + CF_MDS_RetryRequestFactory cf(mdcache, mdr, false); int r = mdcache->path_traverse(mdr, cf, srcpath, MDS_TRAVERSE_DISCOVER | MDS_TRAVERSE_PATH_LOCKED, &trace, &in); @@ -9098,7 +9081,7 @@ void Server::handle_peer_rename_prep(MDRequestRef& mdr) filepath destpath(mdr->peer_request->destdnpath); dout(10) << " dest " << destpath << dendl; vector trace; - CF_MDS_MDRContextFactory cf(mdcache, mdr, false); + CF_MDS_RetryRequestFactory cf(mdcache, mdr, false); int r = mdcache->path_traverse(mdr, cf, destpath, MDS_TRAVERSE_DISCOVER | MDS_TRAVERSE_PATH_LOCKED | MDS_TRAVERSE_WANT_DENTRY, &trace); diff --git a/src/messages/MMDSScrub.h b/src/messages/MMDSScrub.h index f1a7e788b6281..9988bebfd1c03 100644 --- a/src/messages/MMDSScrub.h +++ b/src/messages/MMDSScrub.h @@ -53,6 +53,7 @@ public: encode(ino, payload); encode(frags, payload); encode(tag, payload); + encode(origin, payload); encode(flags, payload); } void decode_payload() override { @@ -62,6 +63,7 @@ public: decode(ino, p); decode(frags, p); decode(tag, p); + decode(origin, p); decode(flags, p); } inodeno_t get_ino() const { @@ -73,6 +75,9 @@ public: const std::string& get_tag() const { return tag; } + inodeno_t get_origin() const { + return origin; + } int get_op() const { return op; } @@ -93,12 +98,12 @@ protected: static constexpr int HEAD_VERSION = 1; static constexpr int COMPAT_VERSION = 1; - MMDSScrub() : SafeMessage(MSG_MDS_SCRUB, HEAD_VERSION, COMPAT_VERSION) {} + MMDSScrub() : MMDSOp(MSG_MDS_SCRUB, HEAD_VERSION, COMPAT_VERSION) {} MMDSScrub(int o, inodeno_t i, fragset_t&& _frags, std::string_view _tag, - bool internal_tag=false, bool force=false, - bool recursive=false, bool repair=false) - : SafeMessage(MSG_MDS_SCRUB, HEAD_VERSION, COMPAT_VERSION), - op(o), ino(i), frags(std::move(_frags)), tag(_tag) { + inodeno_t _origin=inodeno_t(), bool internal_tag=false, + bool force=false, bool recursive=false, bool repair=false) + : MMDSOp(MSG_MDS_SCRUB, HEAD_VERSION, COMPAT_VERSION), + op(o), ino(i), frags(std::move(_frags)), tag(_tag), origin(_origin) { if (internal_tag) flags |= FLAG_INTERNAL_TAG; if (force) flags |= FLAG_FORCE; if (recursive) flags |= FLAG_RECURSIVE; @@ -119,6 +124,7 @@ private: inodeno_t ino; fragset_t frags; std::string tag; + inodeno_t origin; unsigned flags = 0; }; #endif // CEPH_MMDSSCRUB_H diff --git a/src/messages/MMDSScrubStats.h b/src/messages/MMDSScrubStats.h new file mode 100644 index 0000000000000..84fb5cdb8a2b2 --- /dev/null +++ b/src/messages/MMDSScrubStats.h @@ -0,0 +1,75 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_MMDSSCRUBSTATS_H +#define CEPH_MMDSSCRUBSTATS_H + +#include "messages/MMDSOp.h" + +class MMDSScrubStats : public MMDSOp { + static constexpr int HEAD_VERSION = 1; + static constexpr int COMPAT_VERSION = 1; + +public: + std::string_view get_type_name() const override { return "mds_scrub_stats"; } + void print(ostream& o) const override { + o << "mds_scrub_stats(e" << epoch; + if (update_scrubbing) + o << " [" << scrubbing_tags << "])"; + else + o << ")"; + } + + unsigned get_epoch() const { return epoch; } + const auto& get_scrubbing_tags() const { return scrubbing_tags; } + bool is_finished(const std::string& tag) const { + return update_scrubbing && !scrubbing_tags.count(tag); + } + + void encode_payload(uint64_t features) override { + using ceph::encode; + encode(epoch, payload); + encode(scrubbing_tags, payload); + encode(update_scrubbing, payload); + } + void decode_payload() override { + using ceph::decode; + auto p = payload.cbegin(); + decode(epoch, p); + decode(scrubbing_tags, p); + decode(update_scrubbing, p); + } + +protected: + MMDSScrubStats(unsigned e=0) : + MMDSOp(MSG_MDS_SCRUB_STATS, HEAD_VERSION, COMPAT_VERSION), + epoch(e) {} + MMDSScrubStats(unsigned e, std::set&& tags) : + MMDSOp(MSG_MDS_SCRUB_STATS, HEAD_VERSION, COMPAT_VERSION), + epoch(e), scrubbing_tags(std::move(tags)), update_scrubbing(true) {} + MMDSScrubStats(unsigned e, const std::set& tags) : + MMDSOp(MSG_MDS_SCRUB_STATS, HEAD_VERSION, COMPAT_VERSION), + epoch(e), scrubbing_tags(tags), update_scrubbing(true) {} + ~MMDSScrubStats() override {} + +private: + unsigned epoch; + std::set scrubbing_tags; + bool update_scrubbing = false; + + template + friend boost::intrusive_ptr ceph::make_message(Args&&... args); +}; + +#endif diff --git a/src/msg/Message.cc b/src/msg/Message.cc index 7b4408bf8ba61..24413e7889f6e 100644 --- a/src/msg/Message.cc +++ b/src/msg/Message.cc @@ -146,6 +146,7 @@ #include "messages/MMDSOpenInoReply.h" #include "messages/MMDSSnapUpdate.h" #include "messages/MMDSScrub.h" +#include "messages/MMDSScrubStats.h" #include "messages/MDirUpdate.h" #include "messages/MDiscover.h" @@ -765,6 +766,10 @@ Message *decode_message(CephContext *cct, m = make_message(); break; + case MSG_MDS_SCRUB_STATS: + m = make_message(); + break; + case MSG_MDS_EXPORTDIRDISCOVER: m = make_message(); break; diff --git a/src/msg/Message.h b/src/msg/Message.h index 0f1e19a0c00dd..a36cc4dfbc277 100644 --- a/src/msg/Message.h +++ b/src/msg/Message.h @@ -194,6 +194,7 @@ #define MSG_MDS_HEARTBEAT 0x500 // for mds load balancer #define MSG_MDS_METRICS 0x501 // for mds metric aggregator #define MSG_MDS_PING 0x502 // for mds pinger +#define MSG_MDS_SCRUB_STATS 0x503 // for mds scrub stack // *** generic *** #define MSG_TIMECHECK 0x600 -- 2.39.5