From 759925748e8be355a743f6edd106052d5a026dfd Mon Sep 17 00:00:00 2001
From: Leonid Usov
Date: Wed, 28 Feb 2024 15:11:17 +0200
Subject: [PATCH] mds/quiesce: resolve the quiesce cluster at the mds monitor

Signed-off-by: Leonid Usov
---
 qa/tasks/mgr/dashboard/test_health.py |  4 +-
 src/include/cephfs/types.h            |  8 +++
 src/mds/FSMap.h                       | 16 ++++-
 src/mds/MDSMap.cc                     | 16 ++++-
 src/mds/MDSMap.h                      | 26 +++++++
 src/mds/MDSRankQuiesce.cc            | 53 ++++++-----------
 src/mds/QuiesceDbManager.h            | 12 +---
 src/mon/MDSMonitor.cc                 | 52 +++++++++++++++++
 src/mon/MDSMonitor.h                  |  2 +
 src/pybind/mgr/mgr_module.py          | 84 ++++++++-------------
 src/pybind/mgr/volumes/fs/volume.py   | 17 +++---
 src/pybind/mgr/volumes/module.py      |  3 +-
 src/test/mds/TestQuiesceDb.cc         |  2 +-
 13 files changed, 175 insertions(+), 120 deletions(-)

diff --git a/qa/tasks/mgr/dashboard/test_health.py b/qa/tasks/mgr/dashboard/test_health.py
index 0b7b7a3b449..3ebea97e3d2 100644
--- a/qa/tasks/mgr/dashboard/test_health.py
+++ b/qa/tasks/mgr/dashboard/test_health.py
@@ -63,7 +63,9 @@ class HealthTest(DashboardTestCase):
                 'balance_automate': bool,
             }),
             'ever_allowed_features': int,
-            'root': int
+            'root': int,
+            'qdb_leader': int,
+            'qdb_cluster': JList(int)
         })
 
     def test_minimal_health(self):
diff --git a/src/include/cephfs/types.h b/src/include/cephfs/types.h
index 068c9ef8199..23f6e44a9c2 100644
--- a/src/include/cephfs/types.h
+++ b/src/include/cephfs/types.h
@@ -48,6 +48,14 @@ BOOST_STRONG_TYPEDEF(uint64_t, mds_gid_t)
 extern const mds_gid_t MDS_GID_NONE;
 
+template <>
+struct std::hash<mds_gid_t> {
+  size_t operator()(const mds_gid_t& gid) const
+  {
+    return hash<uint64_t>{}(gid);
+  }
+};
+
 typedef int32_t fs_cluster_id_t;
 constexpr fs_cluster_id_t FS_CLUSTER_ID_NONE = -1;
diff --git a/src/mds/FSMap.h b/src/mds/FSMap.h
index a6aa92f218b..e2046fb7af8 100644
--- a/src/mds/FSMap.h
+++ b/src/mds/FSMap.h
@@ -21,6 +21,7 @@
 #include <map>
 #include <memory>
 #include <set>
+#include <type_traits>
 #include <string>
 
 #include <errno.h>
@@ -479,9 +480,18 @@ public:
   void modify_filesystem(fs_cluster_id_t fscid, T&& fn)
   {
     auto& fs = filesystems.at(fscid);
-    fn(fs);
-    fs.mds_map.epoch = epoch;
-    fs.mds_map.modified = ceph_clock_now();
+    bool did_update = true;
+
+    if constexpr (std::is_convertible_v<std::invoke_result_t<T, Filesystem&>, bool>) {
+      did_update = fn(fs);
+    } else {
+      fn(fs);
+    }
+
+    if (did_update) {
+      fs.mds_map.epoch = epoch;
+      fs.mds_map.modified = ceph_clock_now();
+    }
   }
 
   /* This is method is written for the option of "ceph fs swap" commmand
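A note on the FSMap::modify_filesystem change above: it dispatches on the mutator's return type, so a bool-returning functor can veto the epoch/modified bump when nothing actually changed. Below is a minimal standalone sketch of that if-constexpr dispatch; the Filesystem struct and the apply_mutation name are illustrative stand-ins, not Ceph code.

#include <iostream>
#include <type_traits>

struct Filesystem {
  int epoch = 0;
};

// Accepts both void-returning and bool-returning mutators; only the
// latter can suppress the epoch bump by returning false.
template <typename T>
void apply_mutation(Filesystem& fs, int new_epoch, T&& fn) {
  bool did_update = true;
  if constexpr (std::is_convertible_v<std::invoke_result_t<T, Filesystem&>, bool>) {
    did_update = fn(fs);
  } else {
    fn(fs);  // void-returning mutators always count as an update
  }
  if (did_update) {
    fs.epoch = new_epoch;
  }
}

int main() {
  Filesystem fs;
  apply_mutation(fs, 1, [](Filesystem&) {});                 // void: bumps
  apply_mutation(fs, 2, [](Filesystem&) { return false; });  // bool: no bump
  std::cout << fs.epoch << "\n";  // prints 1
}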
diff --git a/src/mds/MDSMap.cc b/src/mds/MDSMap.cc
index 4096b23ab9d..bedeed165ab 100644
--- a/src/mds/MDSMap.cc
+++ b/src/mds/MDSMap.cc
@@ -228,6 +228,12 @@ void MDSMap::dump(Formatter *f) const
   f->dump_string("balancer", balancer);
   f->dump_string("bal_rank_mask", bal_rank_mask);
   f->dump_int("standby_count_wanted", std::max(0, standby_count_wanted));
+  f->dump_unsigned("qdb_leader", qdb_cluster_leader);
+  f->open_array_section("qdb_cluster");
+  for (auto m: qdb_cluster_members) {
+    f->dump_int("member", m);
+  }
+  f->close_section();
 }
 
 void MDSMap::dump_flags_state(Formatter *f) const
@@ -290,6 +296,7 @@ void MDSMap::print(ostream& out) const
   out << "balancer\t" << balancer << "\n";
   out << "bal_rank_mask\t" << bal_rank_mask << "\n";
   out << "standby_count_wanted\t" << std::max(0, standby_count_wanted) << "\n";
+  out << "qdb_cluster\tleader: " << qdb_cluster_leader << " members: " << qdb_cluster_members << std::endl;
 
   multimap< pair<mds_rank_t, unsigned>, mds_gid_t > foo;
   for (const auto &p : mds_info) {
@@ -773,7 +780,7 @@ void MDSMap::encode(bufferlist& bl, uint64_t features) const
     encode(data_pools, bl);
     encode(cas_pool, bl);
 
-    __u16 ev = 18;
+    __u16 ev = 19;
     encode(ev, bl);
     encode(compat, bl);
     encode(metadata_pool, bl);
@@ -802,6 +809,8 @@ void MDSMap::encode(bufferlist& bl, uint64_t features) const
   encode(required_client_features, bl);
   encode(bal_rank_mask, bl);
   encode(max_xattr_size, bl);
+  encode(qdb_cluster_leader, bl);
+  encode(qdb_cluster_members, bl);
   ENCODE_FINISH(bl);
 }
@@ -957,6 +966,11 @@ void MDSMap::decode(bufferlist::const_iterator& p)
     decode(max_xattr_size, p);
   }
 
+  if (ev >= 19) {
+    decode(qdb_cluster_leader, p);
+    decode(qdb_cluster_members, p);
+  }
+
   /* All MDS since at least v14.0.0 understand INLINE */
   /* TODO: remove after R is released */
   compat.incompat.insert(MDS_FEATURE_INCOMPAT_INLINE);
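The ev bump from 18 to 19 follows the usual append-only encoding discipline: new fields are encoded at the tail of the payload, and decoders read them only when the encoder declared version 19 or newer, which keeps old and new daemons interoperable. A toy sketch of the same pattern, using a plain byte vector rather than Ceph's bufferlist:

#include <cassert>
#include <cstdint>
#include <cstring>
#include <vector>

// Toy byte stream standing in for ceph's bufferlist.
struct Stream {
  std::vector<uint8_t> buf;
  size_t rpos = 0;
  void put(uint64_t v) {
    uint8_t b[8]; std::memcpy(b, &v, 8);
    buf.insert(buf.end(), b, b + 8);
  }
  uint64_t get() {
    uint64_t v; std::memcpy(&v, buf.data() + rpos, 8); rpos += 8;
    return v;
  }
};

struct Map {
  uint64_t root = 0;
  uint64_t qdb_leader = 0;  // new field, guarded by version 19

  void encode(Stream& s) const {
    s.put(19);          // ev: always write the newest version
    s.put(root);
    s.put(qdb_leader);  // appended at the end, like qdb_cluster_*
  }
  void decode(Stream& s) {
    uint64_t ev = s.get();
    root = s.get();
    if (ev >= 19) qdb_leader = s.get();  // older encoders simply stop here
  }
};

int main() {
  Map a; a.root = 1; a.qdb_leader = 42;
  Stream s; a.encode(s);
  Map b; b.decode(s);
  assert(b.qdb_leader == 42);
}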
diff --git a/src/mds/MDSMap.h b/src/mds/MDSMap.h
index 746ae859715..2dd8fba8342 100644
--- a/src/mds/MDSMap.h
+++ b/src/mds/MDSMap.h
@@ -19,6 +19,7 @@
 #include <algorithm>
 #include <map>
 #include <set>
+#include <unordered_set>
 #include <string>
 #include <string_view>
@@ -312,6 +313,29 @@ public:
   mds_rank_t get_tableserver() const { return tableserver; }
   mds_rank_t get_root() const { return root; }
 
+  void get_quiesce_db_cluster(mds_gid_t &leader, std::unordered_set<mds_gid_t> &members) const {
+    leader = qdb_cluster_leader;
+    members = qdb_cluster_members;
+  }
+
+  mds_gid_t get_quiesce_db_cluster_leader() {
+    return qdb_cluster_leader;
+  }
+
+  bool update_quiesce_db_cluster(mds_gid_t const& leader, std::same_as<std::unordered_set<mds_gid_t>> auto && members) {
+    if (leader == qdb_cluster_leader && members == qdb_cluster_members) {
+      return false;
+    }
+
+    ceph_assert(leader == MDS_GID_NONE || mds_info.contains(leader));
+    ceph_assert(std::ranges::all_of(members, [this](auto &m) { return mds_info.contains(m); }));
+
+    qdb_cluster_leader = leader;
+    qdb_cluster_members = std::forward<decltype(members)>(members);
+
+    return true;
+  }
+
   const std::vector<int64_t> &get_data_pools() const { return data_pools; }
   int64_t get_first_data_pool() const { return *data_pools.begin(); }
   int64_t get_metadata_pool() const { return metadata_pool; }
@@ -634,6 +658,8 @@ protected:
   mds_rank_t tableserver = 0;   // which MDS has snaptable
   mds_rank_t root = 0;          // which MDS has root directory
 
+  std::unordered_set<mds_gid_t> qdb_cluster_members;
+  mds_gid_t qdb_cluster_leader = MDS_GID_NONE;
 
   __u32 session_timeout = 60;
   __u32 session_autoclose = 300;
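update_quiesce_db_cluster constrains its members argument with std::same_as<std::unordered_set<mds_gid_t>> auto&&: the deduced type must be exactly that set type, which in practice admits rvalues (an lvalue argument would deduce a reference type and fail the concept), so the caller hands ownership over with std::move. A reduced sketch of the idiom, with an illustrative Cluster type that is not Ceph's:

#include <concepts>
#include <cstdint>
#include <iostream>
#include <unordered_set>
#include <utility>

using gid = uint64_t;  // stand-in for mds_gid_t

struct Cluster {
  std::unordered_set<gid> members;

  // The constrained placeholder deduces the argument's type and requires it
  // to be exactly std::unordered_set<gid>; rvalues deduce the plain type and
  // pass, while an lvalue would deduce a reference type and be rejected.
  bool update(std::same_as<std::unordered_set<gid>> auto&& m) {
    if (m == members) return false;          // unchanged: report "no update"
    members = std::forward<decltype(m)>(m);  // take ownership by move
    return true;
  }
};

int main() {
  Cluster c;
  std::unordered_set<gid> s{1, 2, 3};
  std::cout << c.update(std::move(s)) << "\n";  // prints 1
  // c.update(s);  // would not compile: deduces std::unordered_set<gid>&
}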
" (degraded)" : "") << dendl; - if (leader != QuiesceClusterMembership::INVALID_MEMBER) { + if (membership.leader != QuiesceClusterMembership::INVALID_MEMBER) { membership.send_ack = [=, this](QuiesceMap&& ack) { - if (me == leader) { + if (me == membership.leader) { // loopback quiesce_db_manager->submit_ack_from(me, std::move(ack)); return 0; } else { std::lock_guard guard(mds_lock); - if (mdsmap->get_state_gid(leader) == MDSMap::STATE_NULL) { - dout(5) << "couldn't find the leader " << leader << " in the map" << dendl; + if (mdsmap->get_state_gid(membership.leader) == MDSMap::STATE_NULL) { + dout(5) << "couldn't find the leader " << membership.leader << " in the map" << dendl; return -ENOENT; } - auto addrs = mdsmap->get_info_gid(leader).addrs; + auto addrs = mdsmap->get_info_gid(membership.leader).addrs; auto ack_msg = make_message(); - dout(10) << "sending ack " << ack << " to the leader " << leader << dendl; + dout(10) << "sending ack " << ack << " to the leader " << membership.leader << dendl; ack_msg->encode_payload_from(me, ack); return send_message_mds(ack_msg, addrs); } diff --git a/src/mds/QuiesceDbManager.h b/src/mds/QuiesceDbManager.h index a20b0cfc70a..7b7e77ff2f6 100644 --- a/src/mds/QuiesceDbManager.h +++ b/src/mds/QuiesceDbManager.h @@ -19,14 +19,6 @@ #include #include -template <> -struct std::hash { - size_t operator()(const mds_gid_t& gid) const - { - return hash {}(gid); - } -}; - struct QuiesceClusterMembership { static const QuiesceInterface::PeerId INVALID_MEMBER; @@ -36,7 +28,7 @@ struct QuiesceClusterMembership { QuiesceInterface::PeerId me = INVALID_MEMBER; QuiesceInterface::PeerId leader = INVALID_MEMBER; - std::set members; + std::unordered_set members; // A courier interface to decouple from the messaging layer // Failures can be ignored, manager will call this repeatedly if needed @@ -69,7 +61,7 @@ class QuiesceDbManager { // ============================ // quiesce db leader interface: - // -> EPERM unless this is the leader + // -> ENOTTY unless this is the leader // client interface to the DB int submit_request(RequestContext* ctx) { diff --git a/src/mon/MDSMonitor.cc b/src/mon/MDSMonitor.cc index aeefe5c9592..c319715a6a8 100644 --- a/src/mon/MDSMonitor.cc +++ b/src/mon/MDSMonitor.cc @@ -14,6 +14,8 @@ #include #include +#include +#include #include #include "MDSMonitor.h" @@ -174,11 +176,61 @@ void MDSMonitor::create_pending() dout(10) << "create_pending e" << fsmap.get_epoch() << dendl; } +void MDSMonitor::assign_quiesce_db_leader(FSMap &fsmap) { + + // the quiesce leader is the lowest rank with the highest state up to ACTIVE + auto less_leader = [](MDSMap::mds_info_t const* l, MDSMap::mds_info_t const* r) { + ceph_assert(l->rank != MDS_RANK_NONE); + ceph_assert(r->rank != MDS_RANK_NONE); + ceph_assert(l->state <= MDSMap::STATE_ACTIVE); + ceph_assert(r->state <= MDSMap::STATE_ACTIVE); + if (l->rank == r->rank) { + return l->state < r->state; + } else { + return l->rank > r->rank; + } + }; + + for (const auto& [fscid, fs] : std::as_const(fsmap)) { + auto &&mdsmap = fs.get_mds_map(); + + if (mdsmap.get_epoch() < fsmap.get_epoch()) { + // no changes in this fs, we can skip the calculation below + // NB! be careful with this clause when updating the leader selection logic. + // When the input from outside of this fsmap will affect the decision + // this clause will have to be updated, too. 
diff --git a/src/mds/QuiesceDbManager.h b/src/mds/QuiesceDbManager.h
index a20b0cfc70a..7b7e77ff2f6 100644
--- a/src/mds/QuiesceDbManager.h
+++ b/src/mds/QuiesceDbManager.h
@@ -19,14 +19,6 @@
 #include <set>
 #include <thread>
 
-template <>
-struct std::hash<mds_gid_t> {
-  size_t operator()(const mds_gid_t& gid) const
-  {
-    return hash<uint64_t>{}(gid);
-  }
-};
-
 struct QuiesceClusterMembership {
   static const QuiesceInterface::PeerId INVALID_MEMBER;
@@ -36,7 +28,7 @@ struct QuiesceClusterMembership {
 
   QuiesceInterface::PeerId me = INVALID_MEMBER;
   QuiesceInterface::PeerId leader = INVALID_MEMBER;
-  std::set<QuiesceInterface::PeerId> members;
+  std::unordered_set<QuiesceInterface::PeerId> members;
 
   // A courier interface to decouple from the messaging layer
   // Failures can be ignored, manager will call this repeatedly if needed
@@ -69,7 +61,7 @@ class QuiesceDbManager {
 
   // ============================
   // quiesce db leader interface:
-  //   -> EPERM unless this is the leader
+  //   -> ENOTTY unless this is the leader
 
   // client interface to the DB
   int submit_request(RequestContext* ctx) {
diff --git a/src/mon/MDSMonitor.cc b/src/mon/MDSMonitor.cc
index aeefe5c9592..c319715a6a8 100644
--- a/src/mon/MDSMonitor.cc
+++ b/src/mon/MDSMonitor.cc
@@ -14,6 +14,8 @@
 #include <sstream>
 #include <boost/utility.hpp>
+#include <queue>
+#include <unordered_set>
 #include <boost/regex.hpp>
 
 #include "MDSMonitor.h"
@@ -174,11 +176,61 @@ void MDSMonitor::create_pending()
   dout(10) << "create_pending e" << fsmap.get_epoch() << dendl;
 }
 
+void MDSMonitor::assign_quiesce_db_leader(FSMap &fsmap) {
+
+  // the quiesce leader is the lowest rank with the highest state up to ACTIVE
+  auto less_leader = [](MDSMap::mds_info_t const* l, MDSMap::mds_info_t const* r) {
+    ceph_assert(l->rank != MDS_RANK_NONE);
+    ceph_assert(r->rank != MDS_RANK_NONE);
+    ceph_assert(l->state <= MDSMap::STATE_ACTIVE);
+    ceph_assert(r->state <= MDSMap::STATE_ACTIVE);
+    if (l->rank == r->rank) {
+      return l->state < r->state;
+    } else {
+      return l->rank > r->rank;
+    }
+  };
+
+  for (const auto& [fscid, fs] : std::as_const(fsmap)) {
+    auto&& mdsmap = fs.get_mds_map();
+
+    if (mdsmap.get_epoch() < fsmap.get_epoch()) {
+      // no changes in this fs, we can skip the calculation below.
+      // NB! be careful with this clause when updating the leader selection logic:
+      // once input from outside of this fsmap can affect the decision,
+      // this clause will have to be updated, too.
+      continue;
+    }
+
+    std::priority_queue<MDSMap::mds_info_t const*, std::vector<MDSMap::mds_info_t const*>, decltype(less_leader)>
+        member_info(less_leader);
+
+    std::unordered_set<mds_gid_t> members;
+
+    for (auto&& [gid, info] : mdsmap.get_mds_info()) {
+      // if it has a rank and state <= ACTIVE, it's good enough
+      // if (info.rank != MDS_RANK_NONE && info.state <= MDSMap::STATE_ACTIVE) {
+      if (info.rank != MDS_RANK_NONE && info.state == MDSMap::STATE_ACTIVE) {
+        member_info.push(&info);
+        members.insert(info.global_id);
+      }
+    }
+
+    auto leader = member_info.empty() ? MDS_GID_NONE : member_info.top()->global_id;
+
+    fsmap.modify_filesystem(fscid, [&leader, &members](auto &writable_fs) -> bool {
+      return writable_fs.get_mds_map().update_quiesce_db_cluster(leader, std::move(members));
+    });
+  }
+}
+
 void MDSMonitor::encode_pending(MonitorDBStore::TransactionRef t)
 {
   auto &pending = get_pending_fsmap_writeable();
   auto epoch = pending.get_epoch();
 
+  assign_quiesce_db_leader(pending);
+
   dout(10) << "encode_pending e" << epoch << dendl;
 
   // print map iff 'debug mon = 30' or higher
diff --git a/src/mon/MDSMonitor.h b/src/mon/MDSMonitor.h
index 36d53fe4e48..25f27535c77 100644
--- a/src/mon/MDSMonitor.h
+++ b/src/mon/MDSMonitor.h
@@ -129,6 +129,8 @@ class MDSMonitor : public PaxosService, public PaxosFSMap, protected CommandHand
   int load_metadata(std::map<mds_gid_t, Metadata>& m);
   void count_metadata(const std::string& field, ceph::Formatter *f);
 
+  void assign_quiesce_db_leader(FSMap &fsmap);
+
 public:
   void print_fs_summary(std::ostream& out) {
     get_fsmap().print_fs_summary(out);
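assign_quiesce_db_leader picks, per filesystem, the lowest rank with the highest state among the active daemons; the comparator reads inverted because std::priority_queue::top() yields the greatest element under the supplied ordering. A self-contained check of that ordering (the state value 13 corresponds to up:active in the MDS_STATE_ORD table removed from mgr_module.py below):

#include <iostream>
#include <queue>
#include <vector>

struct Info { int rank; int state; unsigned long gid; };

int main() {
  // Mirrors less_leader: prefer the lowest rank; on equal ranks prefer
  // the higher state. priority_queue pops its "largest" element, so the
  // comparator says l is "less" than r when r is the better leader.
  auto less_leader = [](Info const* l, Info const* r) {
    if (l->rank == r->rank) {
      return l->state < r->state;
    } else {
      return l->rank > r->rank;
    }
  };

  std::vector<Info> infos = {
    {1, 13, 100},  // rank 1, up:active
    {0, 12, 101},  // rank 0, up:clientreplay
    {0, 13, 102},  // rank 0, up:active  <- expected leader
  };

  std::priority_queue<Info const*, std::vector<Info const*>, decltype(less_leader)>
      q(less_leader);
  for (auto& i : infos) q.push(&i);

  std::cout << "leader gid: " << q.top()->gid << "\n";  // prints 102
}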
diff --git a/src/pybind/mgr/mgr_module.py b/src/pybind/mgr/mgr_module.py
index 2e25f994b6b..41e54fd8790 100644
--- a/src/pybind/mgr/mgr_module.py
+++ b/src/pybind/mgr/mgr_module.py
@@ -1731,42 +1731,9 @@ class MgrModule(ceph_module.BaseMgrModule, MgrModuleLoggingMixin):
             ))
         return r
 
-    MDS_STATE_ORD = {
-        "down:dne": 0,            # CEPH_MDS_STATE_DNE,
-        "down:stopped": -1,       # CEPH_MDS_STATE_STOPPED,
-        "down:damaged": 15,       # CEPH_MDS_STATE_DAMAGED,
-        "up:boot": -4,            # CEPH_MDS_STATE_BOOT,
-        "up:standby": -5,         # CEPH_MDS_STATE_STANDBY,
-        "up:standby-replay": -8,  # CEPH_MDS_STATE_STANDBY_REPLAY,
-        "up:oneshot-replay": -9,  # CEPH_MDS_STATE_REPLAYONCE,
-        "up:creating": -6,        # CEPH_MDS_STATE_CREATING,
-        "up:starting": -7,        # CEPH_MDS_STATE_STARTING,
-        "up:replay": 8,           # CEPH_MDS_STATE_REPLAY,
-        "up:resolve": 9,          # CEPH_MDS_STATE_RESOLVE,
-        "up:reconnect": 10,       # CEPH_MDS_STATE_RECONNECT,
-        "up:rejoin": 11,          # CEPH_MDS_STATE_REJOIN,
-        "up:clientreplay": 12,    # CEPH_MDS_STATE_CLIENTREPLAY,
-        "up:active": 13,          # CEPH_MDS_STATE_ACTIVE,
-        "up:stopping": 14,        # CEPH_MDS_STATE_STOPPING,
-    }
-    MDS_STATE_ACTIVE_ORD = MDS_STATE_ORD["up:active"]
-
-    def get_quiesce_leader_info(self, fscid: str) -> Optional[dict]:
-        """
-        Helper for `tell_quiesce_leader` to chose the mds to send the command to.
-
-        Quiesce DB is managed by a leader which is selected based on the current MDSMap.
-        The logic is currently implemented both here and on the MDS side,
-        see MDSRank::quiesce_cluster_update().
-
-        Ideally, this logic should be part of the MDSMonitor and the result should
-        be exposed via a dedicated field in the map, but until that is implemented
-        this function will have to be kept in sync with the corresponding logic
-        on the MDS side
-        """
-        leader_info: Optional[dict] = None
+    def get_quiesce_leader_gid(self, fscid: str) -> Optional[int]:
+        leader_gid: Optional[int] = None
         for fs in self.get("fs_map")['filesystems']:
             if fscid != fs["id"]:
                 continue
@@ -1774,33 +1741,30 @@ class MgrModule(ceph_module.BaseMgrModule, MgrModuleLoggingMixin):
             # quiesce leader is the lowest rank
             # with the highest state
             mdsmap = fs["mdsmap"]
-            for info in mdsmap['info'].values():
-                if info['rank'] == -1:
-                    continue
-                if leader_info is None:
-                    leader_info = info
-                else:
-                    if info['rank'] < leader_info['rank']:
-                        leader_info = info
-                    elif info['rank'] == leader_info['rank']:
-                        state_ord = self.MDS_STATE_ORD.get(info['state'])
-                        # if there are more than one daemons with the same rank
-                        # only one of them can be active
-                        if state_ord == self.MDS_STATE_ACTIVE_ORD:
-                            leader_info = info
+            leader_gid = mdsmap.get("qdb_leader", None)
             break
 
-        return leader_info
-
-    def tell_quiesce_leader(self, fscid: str, cmd_dict: dict) -> Tuple[int, str, str]:
-        qleader = self.get_quiesce_leader_info(fscid)
-        if qleader is None:
-            self.log.warn("Couldn't resolve the quiesce leader for fscid %s" % fscid)
-            return (-errno.ENOENT, "", "Couldn't resolve the quiesce leader for fscid %s" % fscid)
-        self.log.debug("resolved quiesce leader for fscid {fscid} at daemon '{name}' gid {gid} rank {rank} ({state})".format(fscid=fscid, **qleader))
-        # We use the one_shot here to cover for cases when the mds crashes
-        # without this parameter the client may get stuck awaiting response from a dead MDS
-        return self.tell_command('mds', str(qleader['gid']), cmd_dict, one_shot=True)
+        return leader_gid
+
+    def tell_quiesce_leader(self, leader: int, cmd_dict: dict) -> Tuple[int, str, str]:
+        max_retries = 5
+        for _ in range(max_retries):
+            # We use "one_shot" here to cover for cases when the mds crashes;
+            # without this parameter the client may get stuck awaiting a response
+            # from a dead MDS (which is particularly bad for the volumes plugin
+            # finisher thread)
+            rc, stdout, stderr = self.tell_command('mds', str(leader), cmd_dict, one_shot=True)
+            if rc == -errno.ENOTTY:
+                try:
+                    resp = json.loads(stdout)
+                    leader = int(resp['leader'])
+                    self.log.info("Retrying a quiesce db command with leader %d" % leader)
+                except Exception as e:
+                    self.log.error("Couldn't parse ENOTTY response from an mds with error: %s\n%s" % (str(e), stdout))
+                    break
+            else:
+                break
+
+        return (rc, stdout, stderr)
 
     def send_command(
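The new mgr-side contract: a non-leader MDS rejects a quiesce db command with ENOTTY and reports the current leader gid in its JSON response, so the caller re-targets and retries a bounded number of times instead of duplicating the leader-selection logic. The same loop sketched in C++ against a stubbed transport; the authoritative implementation is the Python tell_quiesce_leader above.

#include <cerrno>
#include <iostream>
#include <optional>

struct Reply { int rc; unsigned long leader_hint; };

// Stub transport: peers other than 3 bounce us with a hint, 3 is the leader.
Reply tell(unsigned long gid) {
  if (gid != 3) return {-ENOTTY, 3};
  return {0, 3};
}

// Bounded re-targeting loop, mirroring tell_quiesce_leader's max_retries=5.
std::optional<unsigned long> run_on_leader(unsigned long leader) {
  for (int attempt = 0; attempt < 5; ++attempt) {
    Reply r = tell(leader);
    if (r.rc == -ENOTTY) {
      leader = r.leader_hint;  // parsed from the JSON "leader" field
      std::cout << "retrying with leader " << leader << "\n";
      continue;
    }
    if (r.rc == 0) return leader;
    break;  // any other error is final
  }
  return std::nullopt;
}

int main() {
  if (auto l = run_on_leader(1)) std::cout << "done via gid " << *l << "\n";
}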
diff --git a/src/pybind/mgr/volumes/fs/volume.py b/src/pybind/mgr/volumes/fs/volume.py
index ef3b171af76..8fb7defed71 100644
--- a/src/pybind/mgr/volumes/fs/volume.py
+++ b/src/pybind/mgr/volumes/fs/volume.py
@@ -456,19 +456,20 @@ class VolumeClient(CephfsClient["Module"]):
         volname = cmd['vol_name']
         default_group_name = cmd.get('group_name', None)
         roots = []
-        fscid = None
+        leader_gid = cmd.get('with_leader', None)
 
         with open_volume(self, volname) as fs_handle:
-            fscid = fs_handle.get_fscid()
+            if leader_gid is None:
+                fscid = fs_handle.get_fscid()
+                leader_gid = self.mgr.get_quiesce_leader_gid(fscid)
+                if leader_gid is None:
+                    return -errno.ENOENT, "", "Couldn't resolve the quiesce leader for volume %s (%s)" % (volname, fscid)
 
             if cmd.get('leader', False):
-                leader_info = self.mgr.get_quiesce_leader_info(fscid)
-                if leader_info is None:
-                    return -errno.ENOENT, "", "Couldn't resolve the quiesce leader for volume %s (%s)" % (volname, fscid)
-
                 return (
                     0,
-                    "mds.%d" % leader_info['gid'],
-                    "Resolved the quiesce leader for volume '{volname}' as daemon '{name}' ({gid}) {state} rank {rank}".format(volname=volname, **leader_info)
+                    "mds.%d" % leader_gid,
+                    "Resolved the quiesce leader for volume '{volname}' as gid {gid}".format(volname=volname, gid=leader_gid)
                 )
 
@@ -493,7 +494,7 @@ class VolumeClient(CephfsClient["Module"]):
             cmd['roots'] = roots
             cmd['prefix'] = 'quiesce db'
 
-        return self.mgr.tell_quiesce_leader(fscid, cmd)
+        return self.mgr.tell_quiesce_leader(leader_gid, cmd)
 
     def set_user_metadata(self, **kwargs):
         ret = 0, "", ""
diff --git a/src/pybind/mgr/volumes/module.py b/src/pybind/mgr/volumes/module.py
index ee36005d406..e059648261e 100644
--- a/src/pybind/mgr/volumes/module.py
+++ b/src/pybind/mgr/volumes/module.py
@@ -291,7 +291,8 @@ class Module(orchestrator.OrchestratorClientMixin, MgrModule):
                    'name=all,type=CephBool,req=false '
                    'name=cancel,type=CephBool,req=false '
                    'name=group_name,type=CephString,req=false '
-                   'name=leader,type=CephBool,req=false ',
+                   'name=leader,type=CephBool,req=false '
+                   'name=with_leader,type=CephInt,range=0,req=false ',
            'desc': "Manage quiesce sets of subvolumes",
            'perm': 'rw'
        },
diff --git a/src/test/mds/TestQuiesceDb.cc b/src/test/mds/TestQuiesceDb.cc
index e820db40448..3c48474fa32 100644
--- a/src/test/mds/TestQuiesceDb.cc
+++ b/src/test/mds/TestQuiesceDb.cc
@@ -136,7 +136,7 @@ class QuiesceDbTest: public testing::Test {
   {
     ++epoch;
     ASSERT_GE(leader_and_replicas.size(), 1);
-    std::set<QuiesceInterface::PeerId> members(leader_and_replicas.begin(), leader_and_replicas.end());
+    std::unordered_set<QuiesceInterface::PeerId> members(leader_and_replicas.begin(), leader_and_replicas.end());
     auto leader = leader_and_replicas[0];
     for (const auto &[this_peer, mgr] : managers) {
       QuiesceClusterMembership mem = {
-- 
2.39.5