From 7bdc74eccbab5dcf5ca16f19991a0f1c13a3cd3f Mon Sep 17 00:00:00 2001
From: Patrick Donnelly
Date: Tue, 14 Nov 2023 13:17:57 -0500
Subject: [PATCH] mds: add quiesce op

Signed-off-by: Patrick Donnelly
Fixes: https://tracker.ceph.com/issues/63664
(cherry picked from commit 06916dd2a7f30874d29b03e0cd998370cf595f58)
---
 src/common/ceph_strings.cc     |   2 +
 src/common/options/mds.yaml.in |  18 ++
 src/include/ceph_fs.h          |   4 +-
 src/include/fs_types.h         |   1 +
 src/mds/Locker.cc              |  32 +++-
 src/mds/Locker.h               |   2 +
 src/mds/MDCache.cc             | 330 ++++++++++++++++++++++++++++++++-
 src/mds/MDCache.h              |  20 +-
 src/mds/MDSDaemon.cc           |   6 +
 src/mds/MDSRank.cc             |  51 +++++
 src/mds/MDSRank.h              |   1 +
 src/mds/Migrator.cc            |   2 +-
 src/mds/Mutation.h             |   6 +
 13 files changed, 461 insertions(+), 14 deletions(-)

diff --git a/src/common/ceph_strings.cc b/src/common/ceph_strings.cc
index 5103b1a3f83..e36df170feb 100644
--- a/src/common/ceph_strings.cc
+++ b/src/common/ceph_strings.cc
@@ -311,6 +311,8 @@ const char *ceph_mds_op_name(int op)
 	case CEPH_MDS_OP_ENQUEUE_SCRUB: return "enqueue_scrub";
 	case CEPH_MDS_OP_REPAIR_FRAGSTATS: return "repair_fragstats";
 	case CEPH_MDS_OP_REPAIR_INODESTATS: return "repair_inodestats";
+	case CEPH_MDS_OP_QUIESCE_PATH: return "quiesce_path";
+	case CEPH_MDS_OP_QUIESCE_INODE: return "quiesce_inode";
 	}
 	return "???";
 }
diff --git a/src/common/options/mds.yaml.in b/src/common/options/mds.yaml.in
index 7bd958f5951..6b7ef89080a 100644
--- a/src/common/options/mds.yaml.in
+++ b/src/common/options/mds.yaml.in
@@ -74,6 +74,24 @@ options:
   - mds
   flags:
   - runtime
+- name: mds_cache_quiesce_delay
+  type: millisecs
+  level: dev
+  desc: delay before starting recursive quiesce inode operations
+  default: 0
+  services:
+  - mds
+  flags:
+  - runtime
+- name: mds_cache_quiesce_splitauth
+  type: bool
+  level: advanced
+  desc: allow recursive quiesce across auth boundaries
+  default: false
+  services:
+  - mds
+  flags:
+  - runtime
 - name: mds_cache_release_free_interval
   type: secs
   level: dev
diff --git a/src/include/ceph_fs.h b/src/include/ceph_fs.h
index 016645ab5b0..49b45f26eb3 100644
--- a/src/include/ceph_fs.h
+++ b/src/include/ceph_fs.h
@@ -436,7 +436,9 @@ enum {
 	CEPH_MDS_OP_ENQUEUE_SCRUB  = 0x01503,
 	CEPH_MDS_OP_REPAIR_FRAGSTATS = 0x01504,
 	CEPH_MDS_OP_REPAIR_INODESTATS = 0x01505,
-	CEPH_MDS_OP_RDLOCK_FRAGSSTATS = 0x01507
+	CEPH_MDS_OP_RDLOCK_FRAGSSTATS = 0x01507,
+	CEPH_MDS_OP_QUIESCE_PATH = 0x01508,
+	CEPH_MDS_OP_QUIESCE_INODE = 0x01509,
 };

 #define IS_CEPH_MDS_OP_NEWINODE(op) (op == CEPH_MDS_OP_CREATE || \
diff --git a/src/include/fs_types.h b/src/include/fs_types.h
index 606b9c6503d..5be4f74ebfd 100644
--- a/src/include/fs_types.h
+++ b/src/include/fs_types.h
@@ -48,6 +48,7 @@ class JSONObj;
 #define CEPHFS_EFAULT 14
 #define CEPHFS_EISCONN 106
 #define CEPHFS_EMULTIHOP 72
+#define CEPHFS_EINPROGRESS 115

 // taken from linux kernel: include/uapi/linux/fcntl.h
 #define CEPHFS_AT_FDCWD -100 /* Special value used to indicate
diff --git a/src/mds/Locker.cc b/src/mds/Locker.cc
index 9e74952588b..0633156c81f 100644
--- a/src/mds/Locker.cc
+++ b/src/mds/Locker.cc
@@ -229,7 +229,8 @@ struct MarkEventOnDestruct {
  * on the appropriate wait list */
 bool Locker::acquire_locks(const MDRequestRef& mdr,
 			   MutationImpl::LockOpVec& lov,
-			   CInode *auth_pin_freeze,
+			   CInode* auth_pin_freeze,
+			   std::set<MDSCacheObject*> mustpin,
 			   bool auth_pin_nonblocking,
 			   bool skip_quiesce)
 {
@@ -244,7 +245,6 @@ bool Locker::acquire_locks(const MDRequestRef& mdr,

   client_t client = mdr->get_client();

-  set<MDSCacheObject*> mustpin;  // items to authpin
   if (auth_pin_freeze)
     mustpin.insert(auth_pin_freeze);
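The new defaulted `mustpin` parameter sits between `auth_pin_freeze` and
`auth_pin_nonblocking`, so existing positional callers must now pass `{}`
explicitly (see the `acquire_locks(mdr, lov, NULL, {}, true)` updates in
MDCache.cc and Migrator.cc below), while the quiesce ops use it to authpin
the target inode up front. A minimal standalone sketch of the mechanics,
with hypothetical names rather than Ceph's types:

  #include <cassert>
  #include <set>

  struct Obj {};

  // Shape of the new signature: a defaulted set in the middle.
  bool acquire(Obj* freeze = nullptr,
               std::set<Obj*> mustpin = {},
               bool nonblocking = false) {
    if (freeze)
      mustpin.insert(freeze);  // mirrors mustpin.insert(auth_pin_freeze)
    return !mustpin.empty() || !nonblocking;
  }

  int main() {
    Obj o;
    // acquire(&o, true) would not compile: bool cannot convert to the set,
    // hence call sites that pass later flags must spell out the "{}".
    assert(acquire(&o, {}, true));
    assert(acquire(nullptr, {&o}));
    return 0;
  }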
@@ -447,7 +447,7 @@ bool Locker::acquire_locks(const MDRequestRef& mdr,
       continue;
     }
     int err = 0;
-    if (!object->can_auth_pin(&err)) {
+    if (!object->can_auth_pin(&err, skip_quiesce)) {
       if (mdr->lock_cache) {
 	CDir *dir;
 	if (CInode *in = dynamic_cast<CInode*>(object)) {
@@ -544,6 +544,8 @@ bool Locker::acquire_locks(const MDRequestRef& mdr,
     }
     if (auth_pin_nonblocking)
       req->mark_nonblocking();
+    if (skip_quiesce)
+      req->mark_bypassfreezing();
     else if (!mdr->locks.empty())
       req->mark_notify_blocking();

@@ -676,7 +678,7 @@ bool Locker::acquire_locks(const MDRequestRef& mdr,
 /* Dropping *all* locks here is necessary so parent directory
  * snap/layout/quiesce locks are unlocked for a future mksnap. This is the
  * primary purpose of the new quiescelock. An op, e.g. getattr, cannot block
- * waiting for another lock held by quiesce_subvolume_inode, e.g. filelock,
+ * waiting for another lock held by quiesce_inode, e.g. filelock,
  * which will prevent a mksnap on a subvolume inode (because getattr will
  * already have gotten parent snaplocks, see Locker::try_rdlock_snap_layout).
  */
@@ -863,6 +865,28 @@ void Locker::drop_rdlocks_for_early_reply(MutationImpl *mut)
   issue_caps_set(need_issue);
 }

+void Locker::drop_rdlock(MutationImpl* mut, SimpleLock* what)
+{
+  dout(20) << __func__ << ": " << *what << dendl;
+
+  for (auto it = mut->locks.begin(); it != mut->locks.end(); ++it) {
+    auto* lock = it->lock;
+    if (lock == what) {
+      dout(20) << __func__ << ": found lock " << *lock << dendl;
+      ceph_assert(it->is_rdlock());
+      bool ni = false;
+      rdlock_finish(it, mut, &ni);
+      if (ni) {
+        set<CInode*> need_issue;
+        need_issue.insert(static_cast<CInode*>(lock->get_parent()));
+        issue_caps_set(need_issue);
+      }
+      return;
+    }
+  }
+  dout(20) << __func__ << ": not found!" << dendl;
+}
+
 void Locker::drop_locks_for_fragment_unfreeze(MutationImpl *mut)
 {
   set<CInode*> need_issue;
diff --git a/src/mds/Locker.h b/src/mds/Locker.h
index e3ecdf8131b..cfc0d9ace9a 100644
--- a/src/mds/Locker.h
+++ b/src/mds/Locker.h
@@ -54,6 +54,7 @@ public:
   bool acquire_locks(const MDRequestRef& mdr,
 		     MutationImpl::LockOpVec& lov,
 		     CInode *auth_pin_freeze=NULL,
+		     std::set<MDSCacheObject*> mustpin = {},
 		     bool auth_pin_nonblocking=false,
 		     bool skip_quiesce=false);

@@ -66,6 +67,7 @@ public:
   void set_xlocks_done(MutationImpl *mut, bool skip_dentry=false);
   void drop_non_rdlocks(MutationImpl *mut, std::set<CInode*> *pneed_issue=0);
   void drop_rdlocks_for_early_reply(MutationImpl *mut);
+  void drop_rdlock(MutationImpl* mut, SimpleLock* what);
   void drop_locks_for_fragment_unfreeze(MutationImpl *mut);

   int get_cap_bit_for_lock_cache(int op);
diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
index d2c11867574..bc0271251c6 100644
--- a/src/mds/MDCache.cc
+++ b/src/mds/MDCache.cc
@@ -9663,8 +9663,18 @@ MDRequestRef MDCache::request_start_internal(int op)
   params.all_read = now;
   params.dispatched = now;
   params.internal_op = op;
-  MDRequestRef mdr =
-      mds->op_tracker.create_request<MDRequestImpl,const MDRequestImpl::Params*>(&params);
+
+  switch (op) {
+    case CEPH_MDS_OP_QUIESCE_PATH:
+    case CEPH_MDS_OP_QUIESCE_INODE:
+      params.continuous = true;
+      break;
+    default:
+      params.continuous = false;
+      break;
+  }
+
+  MDRequestRef mdr = mds->op_tracker.create_request<MDRequestImpl,const MDRequestImpl::Params*>(&params);

   if (active_requests.count(mdr->reqid)) {
     auto& _mdr = active_requests[mdr->reqid];
@@ -9708,6 +9718,12 @@ void MDCache::request_finish(const MDRequestRef& mdr)
   }

   switch(mdr->internal_op) {
+    case CEPH_MDS_OP_QUIESCE_PATH:
+      logger->inc(l_mdss_ireq_quiesce_path);
+      break;
+    case CEPH_MDS_OP_QUIESCE_INODE:
+      logger->inc(l_mdss_ireq_quiesce_inode);
+      break;
     case CEPH_MDS_OP_FRAGMENTDIR:
       logger->inc(l_mdss_ireq_fragmentdir);
       break;
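The `continuous` flag set above changes the lifecycle of these internal ops:
a quiesce request does not complete and release its locks on its own; it
stays registered (holding the tree quiescent) until it is explicitly killed,
which is why `request_kill` below becomes the accounting point for the new
perf counters. A toy model of that lifecycle (assumed semantics, not the
real OpTracker API):

  #include <cassert>
  #include <cstdio>

  struct Request {
    int op;
    bool continuous = false;
    bool alive = true;
    void finish() { assert(!continuous); alive = false; }  // normal completion
    void kill() { alive = false; }  // the only way a continuous op ends
  };

  int main() {
    Request getattr{0x00101};
    Request quiesce{0x01508, /*continuous=*/true};
    getattr.finish();  // ordinary internal op completes itself
    quiesce.kill();    // quiesce op is completed via request_kill()
    std::printf("getattr alive=%d quiesce alive=%d\n", getattr.alive, quiesce.alive);
    return 0;
  }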
@@ -9765,6 +9781,12 @@ void MDCache::dispatch_request(const MDRequestRef& mdr)
     mds->server->dispatch_peer_request(mdr);
   } else {
     switch (mdr->internal_op) {
+      case CEPH_MDS_OP_QUIESCE_PATH:
+        dispatch_quiesce_path(mdr);
+        break;
+      case CEPH_MDS_OP_QUIESCE_INODE:
+        dispatch_quiesce_inode(mdr);
+        break;
       case CEPH_MDS_OP_FRAGMENTDIR:
         dispatch_fragment_dir(mdr);
         break;
@@ -9871,6 +9893,24 @@ void MDCache::request_cleanup(const MDRequestRef& mdr)
       mdr->clear_ambiguous_auth();
     if (!mdr->more()->waiting_for_finish.empty())
       mds->queue_waiters(mdr->more()->waiting_for_finish);
+    for (auto& [in, reqid] : mdr->more()->quiesce_ops) {
+      if (auto it = active_requests.find(reqid); it != active_requests.end()) {
+        auto qimdr = it->second;
+        dout(20) << "killing quiesce op " << *qimdr << dendl;
+        request_kill(qimdr);
+      }
+    }
+  }
+
+  if (mdr->internal_op == CEPH_MDS_OP_QUIESCE_PATH) {
+    /* This construction is obviously not performant but it's rarely done and only for subvolumes */
+    for (auto it = quiesced_subvolumes.begin(); it != quiesced_subvolumes.end();) {
+      if (it->second == mdr) {
+        it = quiesced_subvolumes.erase(it);
+      } else {
+        ++it;
+      }
+    }
   }

   request_drop_locks(mdr);
@@ -9930,6 +9970,18 @@ void MDCache::request_kill(const MDRequestRef& mdr)
     return;
   }

+  /* quiesce ops are all completed via request_kill */
+  switch(mdr->internal_op) {
+    case CEPH_MDS_OP_QUIESCE_PATH:
+      logger->inc(l_mdss_ireq_quiesce_path);
+      break;
+    case CEPH_MDS_OP_QUIESCE_INODE:
+      logger->inc(l_mdss_ireq_quiesce_inode);
+      break;
+    default:
+      break;
+  }
+
   mdr->killed = true;
   mdr->mark_event("killing request");

@@ -11992,7 +12044,7 @@ void MDCache::dispatch_fragment_dir(const MDRequestRef& mdr)
     // prevent a racing gather on any other scatterlocks too
     lov.lock_scatter_gather(&diri->nestlock);
     lov.lock_scatter_gather(&diri->filelock);
-    if (!mds->locker->acquire_locks(mdr, lov, NULL, true)) {
+    if (!mds->locker->acquire_locks(mdr, lov, NULL, {}, true)) {
       if (!mdr->aborted)
 	return;
     }
@@ -13380,6 +13432,10 @@ void MDCache::register_perfcounters()
                         "Stray dentries migrated");

     // low prio internal request stats
+    pcb.add_u64_counter(l_mdss_ireq_quiesce_path, "ireq_quiesce_path",
+                        "Internal Request type quiesce subvolume");
+    pcb.add_u64_counter(l_mdss_ireq_quiesce_inode, "ireq_quiesce_inode",
+                        "Internal Request type quiesce subvolume inode");
     pcb.add_u64_counter(l_mdss_ireq_enqueue_scrub, "ireq_enqueue_scrub",
                         "Internal Request type enqueue scrub");
     pcb.add_u64_counter(l_mdss_ireq_exportdir, "ireq_exportdir",
@@ -13448,6 +13504,274 @@ void MDCache::clear_dirty_bits_for_stray(CInode* diri) {
   }
 }

+void MDCache::dispatch_quiesce_inode(const MDRequestRef& mdr)
+{
+  if (mdr->killed) {
+    dout(20) << __func__ << " not dispatching killed " << *mdr << dendl;
+    return;
+  } else if (mdr->internal_op_finish == nullptr) {
+    dout(20) << __func__ << " " << *mdr << " already finished quiesce" << dendl;
+    return;
+  }
+
+  auto* qfinisher = static_cast<C_MDS_QuiescePath*>(mdr->internal_op_private);
+  auto delay = qfinisher->delay;
+  auto splitauth = qfinisher->splitauth;
+  auto& qs = *qfinisher->qs;
+  auto qrmdr = qfinisher->mdr;
+
+  CInode *in = get_inode(mdr->get_filepath().get_ino());
+  if (in == nullptr) {
+    qs.add_failed(mdr, -CEPHFS_ENOENT);
+    mds->server->respond_to_request(mdr, -CEPHFS_ENOENT);
+    return;
+  }
+  const bool is_root = (mdr->get_filepath().get_ino() == mdr->get_filepath2().get_ino());
+
+  dout(20) << __func__ << " " << *mdr << " quiescing " << *in << dendl;
+
+  {
+    /* Acquire authpins on `in` to prevent migrations after this rank considers
+     * it (and its children) quiesced.
+     */
+
+    MutationImpl::LockOpVec lov;
+    if (!mds->locker->acquire_locks(mdr, lov, nullptr, {in}, false, true)) {
+      return;
+    }
+  }
+
+  /* TODO: Consider:
+   *
+   * rank0 is auth for /foo
+   * rank1 quiesces /foo with no dirents in cache (and stops)
+   * rank0 begins quiescing /foo
+   * rank0 exports a dirfrag of /foo/bar to rank1 (/foo/bar is not authpinned by rank1 nor by rank0 (yet))
+   * rank1 discovers relevant paths in /foo/bar
+   * rank1 now has /foo/bar in cache and may issue caps / execute operations
+   *
+   * The solution is probably to have rank1 mark /foo as STATE_QUIESCED and reject export ops from rank0.
+   */
+
+  if (in->is_auth()) {
+    /* Acquire rdlocks on anything which prevents writing.
+     *
+     * Because files are treated specially allowing multiple reader/writers, we
+     * need an xlock here to recall all write caps. This unfortunately means
+     * there can be no readers.
+     *
+     * The xlock on the quiescelock is important to prevent future requests
+     * from blocking on other inode locks while holding path traversal locks.
+     * See dev doc doc/dev/mds_internals/quiesce.rst for more details.
+     */
+
+    MutationImpl::LockOpVec lov;
+    lov.add_rdlock(&in->authlock);
+    lov.add_rdlock(&in->dirfragtreelock);
+    lov.add_rdlock(&in->filelock);
+    lov.add_rdlock(&in->linklock);
+    lov.add_rdlock(&in->nestlock);
+    lov.add_rdlock(&in->policylock);
+    // N.B.: NO xlock/wrlock on quiescelock; we need to allow access to mksnap/lookup
+    // This is an unfortunate inconsistency. It may be possible to circumvent
+    // this issue by having those ops acquire the quiescelock only if necessary.
+    if (is_root) {
+      lov.add_rdlock(&in->quiescelock);
+    } else {
+      lov.add_xlock(&in->quiescelock); /* !! */
+    }
+    lov.add_rdlock(&in->snaplock);
+    lov.add_rdlock(&in->xattrlock);
+    if (!mds->locker->acquire_locks(mdr, lov, nullptr, {in}, false, true)) {
+      return;
+    }
+  } else if (!splitauth) {
+    dout(5) << "auth is split and splitauth is false: " << *in << dendl;
+    qs.add_failed(mdr, -CEPHFS_EPERM);
+    mds->server->respond_to_request(mdr, -CEPHFS_EPERM);
+    return;
+  }
+
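The lock pattern above is the heart of the quiesce: every lock that permits
writes is rdlocked, and (except at the quiesce root) the new quiescelock is
xlocked so that later client operations park on it immediately, before they
can pick up other inode locks while holding path-traversal locks. As a rough
analogy only (std::shared_mutex is an assumption here, not how SimpleLock is
implemented), the effect resembles:

  #include <cstdio>
  #include <shared_mutex>
  #include <thread>

  std::shared_mutex quiescelock;  // stand-in for the per-inode quiescelock

  void client_op() {
    // New operations effectively take the quiescelock first (rdlock)...
    std::shared_lock r(quiescelock);
    std::printf("op admitted\n");  // ...so they run only after release
  }

  int main() {
    std::unique_lock w(quiescelock);  // quiesce holds the xlock
    std::thread t(client_op);         // op parks without holding other locks
    w.unlock();                       // quiesce released
    t.join();
    return 0;
  }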
+  if (in->is_dir()) {
+    for (auto& dir : in->get_dirfrags()) {
+      if (!dir->is_auth() && !splitauth) {
+        dout(5) << "auth is split and splitauth is false: " << *dir << dendl;
+        qs.add_failed(mdr, -CEPHFS_EPERM);
+        mds->server->respond_to_request(mdr, -CEPHFS_EPERM);
+        return;
+      }
+    }
+    MDSGatherBuilder gather(g_ceph_context, new C_MDS_RetryRequest(this, mdr));
+    auto& qops = qrmdr->more()->quiesce_ops;
+    for (auto& dir : in->get_dirfrags()) {
+      for (auto& [dnk, dn] : *dir) {
+        auto* in = dn->get_projected_inode();
+        if (!in) {
+          continue;
+        }
+
+        if (auto it = qops.find(in); it != qops.end()) {
+          dout(25) << __func__ << ": existing quiesce metareqid: " << it->second << dendl;
+          if (auto reqit = active_requests.find(it->second); reqit != active_requests.end()) {
+            auto& qimdr = reqit->second;
+            dout(25) << __func__ << ": found in-progress " << *qimdr << dendl;
+            continue;
+          }
+        }
+        dout(10) << __func__ << ": scheduling op to quiesce " << *in << dendl;
+
+        MDRequestRef qimdr = request_start_internal(CEPH_MDS_OP_QUIESCE_INODE);
+        qimdr->set_filepath(filepath(in->ino()));
+        qimdr->internal_op_finish = gather.new_sub();
+        qimdr->internal_op_private = qfinisher;
+        qops[in] = qimdr->reqid;
+        qs.inc_inodes();
+        if (delay > 0ms) {
+          mds->timer.add_event_after(delay, new LambdaContext([cache=this,qimdr](int r) {
+            cache->dispatch_request(qimdr);
+          }));
+        } else {
+          dispatch_request(qimdr);
+        }
+        if (!(qs.inc_heartbeat_count() % mds->heartbeat_reset_grace())) {
+          mds->heartbeat_reset();
+        }
+      }
+    }
+    if (gather.has_subs()) {
+      dout(20) << __func__ << ": waiting for sub-ops to gather" << dendl;
+      gather.activate();
+      return;
+    }
+  }
+
+  if (in->is_auth()) {
+    dout(10) << __func__ << " " << *mdr << " quiesce complete of " << *in << dendl;
+    mdr->mark_event("quiesce complete");
+  } else {
+    dout(10) << __func__ << " " << *mdr << " non-auth quiesce complete of " << *in << dendl;
+    mdr->mark_event("quiesce complete for non-auth inode");
+  }
+
+  qs.inc_inodes_quiesced();
+  mdr->internal_op_finish->complete(0);
+  mdr->internal_op_finish = nullptr;
+
+  /* do not respond/complete so locks are not lost, parent request will complete */
+}
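`dispatch_quiesce_inode` fans out one QUIESCE_INODE op per child inode and
re-dispatches itself only once every sub-op has completed, via the
MDSGatherBuilder/C_MDS_RetryRequest pair above. A condensed, self-contained
model of that gather pattern (simplified semantics assumed, not the real
MDSGatherBuilder API):

  #include <cstdio>
  #include <functional>
  #include <vector>

  struct Gather {
    explicit Gather(std::function<void()> fin) : onfinish(std::move(fin)) {}
    std::function<void()> new_sub() {  // one per scheduled child op
      ++subs;
      return [this] { if (--subs == 0) onfinish(); };
    }
    bool has_subs() const { return subs > 0; }
    int subs = 0;
    std::function<void()> onfinish;
  };

  int main() {
    Gather g([] { std::printf("all children quiesced; retry parent op\n"); });
    std::vector<std::function<void()>> pending;
    for (int child = 0; child < 3; ++child)
      pending.push_back(g.new_sub());  // qimdr->internal_op_finish = gather.new_sub()
    if (g.has_subs())
      for (auto& complete : pending)
        complete();  // each QUIESCE_INODE op finishing its subtree
    return 0;
  }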
+
+void MDCache::dispatch_quiesce_path(const MDRequestRef& mdr)
+{
+  if (mdr->killed) {
+    dout(20) << __func__ << " not dispatching killed " << *mdr << dendl;
+    return;
+  }
+
+  if (!mds->is_active()) {
+    dout(20) << __func__ << " is not active!" << dendl;
+    mds->server->respond_to_request(mdr, -CEPHFS_EAGAIN);
+    return;
+  }
+
+  ceph_assert(mdr->internal_op_finish);
+
+  dout(5) << __func__ << ": dispatching" << dendl;
+
+  C_MDS_QuiescePath* qfinisher = static_cast<C_MDS_QuiescePath*>(mdr->internal_op_finish);
+  auto& qs = *qfinisher->qs;
+  auto delay = qfinisher->delay = g_conf().get_val<std::chrono::milliseconds>("mds_cache_quiesce_delay");
+  auto splitauth = qfinisher->splitauth = g_conf().get_val<bool>("mds_cache_quiesce_splitauth");
+
+  CInode* diri = nullptr;
+  CF_MDS_RetryRequestFactory cf(this, mdr, true);
+  static const int ptflags = 0
+    | MDS_TRAVERSE_DISCOVER
+    | MDS_TRAVERSE_RDLOCK_PATH
+    | MDS_TRAVERSE_WANT_INODE
+    ;
+  int r = path_traverse(mdr, cf, mdr->get_filepath(), ptflags, nullptr, &diri);
+  if (r > 0)
+    return;
+  if (r < 0) {
+    mds->server->respond_to_request(mdr, r);
+    return;
+  }
+
+  if (!diri->is_dir()) {
+    dout(5) << __func__ << ": file is not a directory" << dendl;
+    mds->server->respond_to_request(mdr, -CEPHFS_ENOTDIR);
+    return;
+  }
+
+  if (auto [it, inserted] = quiesced_subvolumes.try_emplace(diri->ino(), mdr); !inserted) {
+    if (!it->second) {
+      it->second = mdr;
+    } else if (it->second != mdr) {
+      dout(5) << __func__ << ": quiesce operation already in flight: " << it->second << dendl;
+      mds->server->respond_to_request(mdr, -CEPHFS_EINPROGRESS);
+      return;
+    }
+  }
+
+  qfinisher->mdr = mdr;
+
+  for (auto& [qimdr, rc] : qs.get_failed()) {
+    dout(5) << __func__ << ": op " << *qimdr << " failed with " << rc << "!" << dendl;
+    mds->server->respond_to_request(mdr, rc);
+    return;
+  }
+
+  if (!diri->is_auth() && !splitauth) {
+    dout(5) << __func__ << ": skipping recursive quiesce of path for non-auth inode" << dendl;
+    mdr->mark_event("quiesce complete for non-auth tree");
+  } else if (auto& qops = mdr->more()->quiesce_ops; qops.count(diri) == 0) {
+    MDRequestRef qimdr = request_start_internal(CEPH_MDS_OP_QUIESCE_INODE);
+    qimdr->set_filepath(filepath(diri->ino()));
+    qimdr->set_filepath2(filepath(diri->ino())); /* is_root! */
+    qimdr->internal_op_finish = new C_MDS_RetryRequest(this, mdr);
+    qimdr->internal_op_private = qfinisher;
+    qops[diri] = qimdr->reqid;
+    qs.inc_inodes();
+    if (delay > 0ms) {
+      mds->timer.add_event_after(delay, new LambdaContext([cache=this,qimdr](int r) {
+        cache->dispatch_request(qimdr);
+      }));
+    } else {
+      dispatch_request(qimdr);
+    }
+    return;
+  } else {
+    dout(5) << __func__ << ": fully quiesced " << *diri << dendl;
+    mdr->mark_event("quiesce complete");
+  }
+
+  if (qfinisher) {
+    qfinisher->complete(0);
+    mdr->internal_op_finish = nullptr;
+  }
+  mdr->result = 0;
+
+  /* caller kills this op */
+}
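`dispatch_quiesce_path` admits at most one quiesce per tree root: the
`try_emplace` into `quiesced_subvolumes` either records this request or, if
a different request already owns the slot, fails with the newly added
CEPHFS_EINPROGRESS (115). The same check in isolation (the stand-in types
for inodeno_t and MDRequestRef are assumptions):

  #include <cstdio>
  #include <map>

  using Ino = unsigned long;  // stand-in for inodeno_t
  using Ref = const char*;    // stand-in for MDRequestRef

  int admit(std::map<Ino, Ref>& live, Ino ino, Ref mdr) {
    if (auto [it, inserted] = live.try_emplace(ino, mdr); !inserted) {
      if (!it->second)
        it->second = mdr;      // empty slot: adopt this request
      else if (it->second != mdr)
        return -115;           // -CEPHFS_EINPROGRESS: someone else is quiescing
    }
    return 0;                  // admitted (or the same op being re-dispatched)
  }

  int main() {
    std::map<Ino, Ref> live;
    Ref op1 = "op1", op2 = "op2";
    std::printf("%d\n", admit(live, 0x100, op1));  // 0
    std::printf("%d\n", admit(live, 0x100, op1));  // 0: same op retried
    std::printf("%d\n", admit(live, 0x100, op2));  // -115: already in flight
    return 0;
  }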
+
+MDRequestRef MDCache::quiesce_path(filepath p, C_MDS_QuiescePath* c, Formatter *f, std::chrono::milliseconds delay) {
+  MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_QUIESCE_PATH);
+  mdr->set_filepath(p);
+  mdr->internal_op_finish = c;
+
+  if (delay > 0ms) {
+    mds->timer.add_event_after(delay, new LambdaContext([cache=this,mdr=mdr](int r) {
+      cache->dispatch_request(mdr);
+    }));
+  } else {
+    dispatch_request(mdr);
+  }
+
+  return mdr;
+}
+
 bool MDCache::dump_inode(Formatter *f, uint64_t number) {
   CInode *in = get_inode(number);
   if (!in) {
diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h
index 990951b5531..ba91a24e145 100644
--- a/src/mds/MDCache.h
+++ b/src/mds/MDCache.h
@@ -108,6 +108,8 @@ enum {
   // How many inodes ever completed size recovery
   l_mdc_recovery_completed,

+  l_mdss_ireq_quiesce_path,
+  l_mdss_ireq_quiesce_inode,
   l_mdss_ireq_enqueue_scrub,
   l_mdss_ireq_exportdir,
   l_mdss_ireq_flush,
@@ -535,6 +537,9 @@ public:
     void inc_inodes_quiesced() {
       inodes_quiesced++;
     }
+    uint64_t inc_heartbeat_count() {
+      return ++heartbeat_count;
+    }
     uint64_t get_inodes() const {
       return inodes;
     }
@@ -564,6 +569,7 @@ public:
       f->close_section();
     }
   private:
+    uint64_t heartbeat_count = 0;
     uint64_t inodes = 0;
     uint64_t inodes_quiesced = 0;
     std::map<MDRequestRef, int> failed;
@@ -574,7 +580,7 @@ private:
       MDSInternalContext(c->mds), cache(c), finisher(_finisher) {}
     ~C_MDS_QuiescePath() {
       if (finisher) {
-        finisher->complete(-ECANCELED);
+        finisher->complete(-CEPHFS_ECANCELED);
         finisher = nullptr;
       }
     }
@@ -587,12 +593,14 @@ private:
         finisher = nullptr;
       }
     }
-    QuiesceStatistics qs;
+    std::shared_ptr<QuiesceStatistics> qs = std::make_shared<QuiesceStatistics>();
+    std::chrono::milliseconds delay = 0ms;
+    bool splitauth = false;
     MDCache *cache;
     MDRequestRef mdr;
     Context* finisher = nullptr;
   };
-  MDRequestRef quiesce_path(filepath p, C_MDS_QuiescePath* c, Formatter *f = nullptr, std::chrono::milliseconds delay = 0ms) { c->complete(-ENOTSUP); return nullptr; }
+  MDRequestRef quiesce_path(filepath p, C_MDS_QuiescePath* c, Formatter *f = nullptr, std::chrono::milliseconds delay = 0ms);

   void clean_open_file_lists();
   void dump_openfiles(Formatter *f);
@@ -1435,8 +1443,8 @@ private:
   void finish_uncommitted_fragment(dirfrag_t basedirfrag, int op);
   void rollback_uncommitted_fragment(dirfrag_t basedirfrag, frag_vec_t&& old_frags);

-  void dispatch_quiesce_path(const MDRequestRef& mdr) { }
-  void dispatch_quiesce_inode(const MDRequestRef& mdr) { }
+  void dispatch_quiesce_path(const MDRequestRef& mdr);
+  void dispatch_quiesce_inode(const MDRequestRef& mdr);

   void upkeep_main(void);

@@ -1479,6 +1487,8 @@ private:
   std::atomic<bool> upkeep_trim_shutdown{false};
   uint64_t kill_shutdown_at = 0;
+
+  std::map<inodeno_t, MDRequestRef> quiesced_subvolumes;
 };

 class C_MDS_RetryRequest : public MDSInternalContext {
diff --git a/src/mds/MDSDaemon.cc b/src/mds/MDSDaemon.cc
index 3f29e756b4d..fa430a30bc4 100644
--- a/src/mds/MDSDaemon.cc
+++ b/src/mds/MDSDaemon.cc
@@ -352,6 +352,12 @@ void
MDSDaemon::set_up_admin_socket() asok_hook, "show cache status"); ceph_assert(r == 0); + r = admin_socket->register_command("quiesce path" + " name=path,type=CephString,req=true" + " name=wait,type=CephBool,req=false" + ,asok_hook + ,"quiesce a subtree"); + ceph_assert(r == 0); r = admin_socket->register_command("dump tree " "name=root,type=CephString,req=true " "name=depth,type=CephInt,req=false " diff --git a/src/mds/MDSRank.cc b/src/mds/MDSRank.cc index b51148eedcb..209de7cb86e 100644 --- a/src/mds/MDSRank.cc +++ b/src/mds/MDSRank.cc @@ -3001,6 +3001,8 @@ void MDSRankDispatcher::handle_asok_command( } else if (command == "cache status") { std::lock_guard l(mds_lock); mdcache->cache_status(f); + } else if (command == "quiesce path") { + r = command_quiesce_path(f, cmdmap, *css); } else if (command == "dump tree") { command_dump_tree(cmdmap, *css, f); } else if (command == "dump loads") { @@ -3483,6 +3485,55 @@ void MDSRank::command_openfiles_ls(Formatter *f) mdcache->dump_openfiles(f); } +class C_MDS_QuiescePathCommand : public MDCache::C_MDS_QuiescePath { +public: + C_MDS_QuiescePathCommand(MDCache* cache, Context* fin) : C_MDS_QuiescePath(cache), finisher(fin) {} + void finish(int rc) override { + if (finisher) { + finisher->complete(rc); + finisher = nullptr; + } + } +private: + Context* finisher = nullptr; +}; + +int MDSRank::command_quiesce_path(Formatter* f, const cmdmap_t& cmdmap, std::ostream& ss) +{ + std::string path; + { + bool got = cmd_getval(cmdmap, "path", path); + if (!got) { + ss << "missing path"; + return -CEPHFS_EINVAL; + } + } + + bool wait = false; + cmd_getval(cmdmap, "wait", wait); + + C_SaferCond cond; + auto* finisher = new C_MDS_QuiescePathCommand(mdcache, wait ? &cond : nullptr); + auto qs = finisher->qs; + MDRequestRef mdr; + f->open_object_section("quiesce"); + { + std::lock_guard l(mds_lock); + mdr = mdcache->quiesce_path(filepath(path), finisher, f); + if (!wait) { + f->dump_object("op", *mdr); + } + } + if (wait) { + cond.wait(); + std::lock_guard l(mds_lock); + f->dump_object("op", *mdr); + } + f->dump_object("state", *qs); + f->close_section(); + return 0; +} + void MDSRank::command_dump_inode(Formatter *f, const cmdmap_t &cmdmap, std::ostream &ss) { std::lock_guard l(mds_lock); diff --git a/src/mds/MDSRank.h b/src/mds/MDSRank.h index 41c45cd7d39..97cab3b91a5 100644 --- a/src/mds/MDSRank.h +++ b/src/mds/MDSRank.h @@ -526,6 +526,7 @@ class MDSRank { std::ostream &ss); void command_openfiles_ls(Formatter *f); void command_dump_tree(const cmdmap_t &cmdmap, std::ostream &ss, Formatter *f); + int command_quiesce_path(Formatter *f, const cmdmap_t &cmdmap, std::ostream &ss); void command_dump_inode(Formatter *f, const cmdmap_t &cmdmap, std::ostream &ss); void command_dump_dir(Formatter *f, const cmdmap_t &cmdmap, std::ostream &ss); void command_cache_drop(uint64_t timeout, Formatter *f, Context *on_finish); diff --git a/src/mds/Migrator.cc b/src/mds/Migrator.cc index b5690f4e2d7..dc898543d15 100644 --- a/src/mds/Migrator.cc +++ b/src/mds/Migrator.cc @@ -1067,7 +1067,7 @@ void Migrator::dispatch_export_dir(const MDRequestRef& mdr, int count) } lov.add_rdlock(&dir->get_inode()->dirfragtreelock); - if (!mds->locker->acquire_locks(mdr, lov, nullptr, true)) { + if (!mds->locker->acquire_locks(mdr, lov, nullptr, {}, true)) { if (mdr->aborted) export_try_cancel(dir); return; diff --git a/src/mds/Mutation.h b/src/mds/Mutation.h index fcb0b5a21ad..4b9d0d7528c 100644 --- a/src/mds/Mutation.h +++ b/src/mds/Mutation.h @@ -349,6 +349,8 @@ struct MDRequestImpl : public 
MutationImpl {
     MDSContext::vec waiting_for_finish;

+    std::map<CInode*, metareqid_t> quiesce_ops;
+
     // export & fragment
     CDir* export_dir = nullptr;
     dirfrag_t fragment_base;
@@ -374,6 +376,9 @@ struct MDRequestImpl : public MutationImpl {
     const utime_t& get_dispatch_stamp() const {
       return dispatched;
     }
+    bool is_continuous() const {
+      return continuous;
+    }
     metareqid_t reqid;
     __u32 attempt = 0;
     ceph::cref_t<MClientRequest> client_req;
@@ -382,6 +387,7 @@ struct MDRequestImpl : public MutationImpl {
     utime_t initiated;
     utime_t throttled, all_read, dispatched;
     int internal_op = -1;
+    bool continuous = false;
   };

   MDRequestImpl(const Params* params, OpTracker *tracker) :
     MutationImpl(tracker, params->initiated,
-- 
2.39.5
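A closing note on driving this from the admin socket: "quiesce path" takes a
path and an optional wait flag, so an operator can block until the tree is
quiescent (the exact invocation depends on the usual tell/asok plumbing,
e.g. something like `ceph daemon mds.<id> quiesce path <path> wait=true`).
When wait is set, `command_quiesce_path` parks on a C_SaferCond that the
quiesce finisher completes. A minimal model of that latch (assumed shape,
not Ceph's C_SaferCond):

  #include <condition_variable>
  #include <cstdio>
  #include <mutex>
  #include <thread>

  struct SaferCond {
    void complete(int r) {
      { std::lock_guard l(m); rc = r; done = true; }
      cv.notify_all();
    }
    int wait() {
      std::unique_lock l(m);
      cv.wait(l, [this] { return done; });
      return rc;
    }
    std::mutex m;
    std::condition_variable cv;
    bool done = false;
    int rc = 0;
  };

  int main() {
    SaferCond cond;                              // wait ? &cond : nullptr
    std::thread mds([&] { cond.complete(0); });  // quiesce finisher fires
    std::printf("quiesce rc=%d\n", cond.wait()); // command thread blocks
    mds.join();
    return 0;
  }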