'balance_automate': bool,
}),
'ever_allowed_features': int,
- 'root': int
+ 'root': int,
+ 'qdb_leader': int,
+ 'qdb_cluster': JList(int)
})
def test_minimal_health(self):
BOOST_STRONG_TYPEDEF(uint64_t, mds_gid_t)
extern const mds_gid_t MDS_GID_NONE;
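+// Allow mds_gid_t to be used as a key in std::unordered_set/std::unordered_map:
+// BOOST_STRONG_TYPEDEF does not provide a std::hash specialization by itself.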
+template <>
+struct std::hash<mds_gid_t> {
+ size_t operator()(const mds_gid_t& gid) const
+ {
+ return hash<uint64_t> {}(gid);
+ }
+};
+
typedef int32_t fs_cluster_id_t;
constexpr fs_cluster_id_t FS_CLUSTER_ID_NONE = -1;
#include <set>
#include <string>
#include <string_view>
+#include <type_traits>
#include <errno.h>
void modify_filesystem(fs_cluster_id_t fscid, T&& fn)
{
auto& fs = filesystems.at(fscid);
- fn(fs);
- fs.mds_map.epoch = epoch;
- fs.mds_map.modified = ceph_clock_now();
+ bool did_update = true;
+
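+ // If the mutator returns something convertible to bool, interpret it as
+ // "this call actually modified the filesystem" and only bump the map epoch
+ // and modification time when it reports true.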
+ if constexpr (std::is_convertible_v<std::invoke_result_t<T, Filesystem&>, bool>) {
+ did_update = fn(fs);
+ } else {
+ fn(fs);
+ }
+
+ if (did_update) {
+ fs.mds_map.epoch = epoch;
+ fs.mds_map.modified = ceph_clock_now();
+ }
}
/* This method is written for the option of the "ceph fs swap" command
f->dump_string("balancer", balancer);
f->dump_string("bal_rank_mask", bal_rank_mask);
f->dump_int("standby_count_wanted", std::max(0, standby_count_wanted));
+ f->dump_unsigned("qdb_leader", qdb_cluster_leader);
+ f->open_array_section("qdb_cluster");
+ for (auto m: qdb_cluster_members) {
+ f->dump_int("member", m);
+ }
+ f->close_section();
}
void MDSMap::dump_flags_state(Formatter *f) const
out << "balancer\t" << balancer << "\n";
out << "bal_rank_mask\t" << bal_rank_mask << "\n";
out << "standby_count_wanted\t" << std::max(0, standby_count_wanted) << "\n";
+ out << "qdb_cluster\tleader: " << qdb_cluster_leader << " members: " << qdb_cluster_members << std::endl;
multimap< pair<mds_rank_t, unsigned>, mds_gid_t > foo;
for (const auto &p : mds_info) {
encode(data_pools, bl);
encode(cas_pool, bl);
- __u16 ev = 18;
+ __u16 ev = 19;
encode(ev, bl);
encode(compat, bl);
encode(metadata_pool, bl);
encode(required_client_features, bl);
encode(bal_rank_mask, bl);
encode(max_xattr_size, bl);
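+ // added in ev 19: the quiesce db cluster (leader gid and member gids)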
+ encode(qdb_cluster_leader, bl);
+ encode(qdb_cluster_members, bl);
ENCODE_FINISH(bl);
}
decode(max_xattr_size, p);
}
+ if (ev >= 19) {
+ decode(qdb_cluster_leader, p);
+ decode(qdb_cluster_members, p);
+ }
+
/* All MDS since at least v14.0.0 understand INLINE */
/* TODO: remove after R is released */
compat.incompat.insert(MDS_FEATURE_INCOMPAT_INLINE);
#include <map>
#include <set>
#include <string>
+#include <ranges>
+#include <unordered_set>
#include <string_view>
#include <errno.h>
mds_rank_t get_tableserver() const { return tableserver; }
mds_rank_t get_root() const { return root; }
+ void get_quiesce_db_cluster(mds_gid_t &leader, std::unordered_set<mds_gid_t> &members) const {
+ leader = qdb_cluster_leader;
+ members = qdb_cluster_members;
+ }
+
+ mds_gid_t get_quiesce_db_cluster_leader() const {
+ return qdb_cluster_leader;
+ }
+
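+ // Replaces the stored quiesce db cluster. Returns false when nothing changed,
+ // which allows FSMap::modify_filesystem() to skip bumping the map epoch.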
+ bool update_quiesce_db_cluster(mds_gid_t const& leader, std::same_as<std::unordered_set<mds_gid_t>> auto && members) {
+ if (leader == qdb_cluster_leader && members == qdb_cluster_members) {
+ return false;
+ }
+
+ ceph_assert(leader == MDS_GID_NONE || mds_info.contains(leader));
+ ceph_assert(std::ranges::all_of(members, [this](auto &m) {return mds_info.contains(m);}));
+
+ qdb_cluster_leader = leader;
+ qdb_cluster_members = std::forward<decltype(members)>(members);
+
+ return true;
+ }
+
const std::vector<int64_t> &get_data_pools() const { return data_pools; }
int64_t get_first_data_pool() const { return *data_pools.begin(); }
int64_t get_metadata_pool() const { return metadata_pool; }
mds_rank_t tableserver = 0; // which MDS has snaptable
mds_rank_t root = 0; // which MDS has root directory
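+ // quiesce db cluster: member gids and the elected leader,
+ // assigned by the mon (see MDSMonitor::assign_quiesce_db_leader)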
+ std::unordered_set<mds_gid_t> qdb_cluster_members;
+ mds_gid_t qdb_cluster_leader = MDS_GID_NONE;
__u32 session_timeout = 60;
__u32 session_autoclose = 300;
struct Ctx : public QuiesceDbManager::RequestContext {
std::function<void(int, const std::string&, bufferlist&)> on_finish;
bool all = false;
+ mds_gid_t me;
double sec(QuiesceTimeInterval duration) {
return duration_cast<dd>(duration).count();
f->open_object_section("response"); {
f->dump_int("epoch", response.db_version.epoch);
+ f->dump_int("leader", me);
f->dump_int("set_version", response.db_version.set_version);
f->open_object_section("sets"); {
for (auto&& [set_id, set] : response.sets) {
auto* ctx = new Ctx();
+ QuiesceInterface::PeerId me = mds_gid_t(monc->get_global_id());
ctx->on_finish = std::move(on_finish);
ctx->all = all;
+ ctx->me = me;
ctx->request.reset([&](auto& r) {
r.set_id = set_id;
int rc = quiesce_db_manager->submit_request(ctx);
if (rc != 0) {
bufferlist bl;
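+ // Local submission failed (e.g. this mds is not the quiesce db leader, ENOTTY).
+ // Reply with the current epoch and the leader known from the mdsmap so the
+ // caller can retry the command against the actual leader.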
+ auto f = Formatter::create_unique("json-pretty");
+ f->open_object_section("response");
+ f->dump_int("epoch", mdsmap->get_epoch());
+ f->dump_int("leader", mdsmap->get_quiesce_db_cluster_leader());
+ f->close_section();
+ f->flush(bl);
// on_finish was moved there, so should only call via the ctx.
ctx->on_finish(rc, "Error submitting the command to the local db manager", bl);
delete ctx;
void MDSRank::quiesce_cluster_update() {
- // the quiesce leader is the lowest rank with the highest state up to ACTIVE
+ // the quiesce db cluster (leader and members) is now computed by the MDSMonitor
+ // and distributed via the MDSMap, see MDSMonitor::assign_quiesce_db_leader()
- auto less_leader = [](MDSMap::mds_info_t const* l, MDSMap::mds_info_t const* r) {
- ceph_assert(l->rank != MDS_RANK_NONE);
- ceph_assert(r->rank != MDS_RANK_NONE);
- ceph_assert(l->state <= MDSMap::STATE_ACTIVE);
- ceph_assert(r->state <= MDSMap::STATE_ACTIVE);
- if (l->rank == r->rank) {
- return l->state < r->state;
- } else {
- return l->rank > r->rank;
- }
- };
-
- std::priority_queue<MDSMap::mds_info_t const*, std::vector<MDSMap::mds_info_t const*>, decltype(less_leader)> member_info(less_leader);
QuiesceClusterMembership membership;
-
QuiesceInterface::PeerId me = mds_gid_t(monc->get_global_id());
- for (auto&& [gid, info] : mdsmap->get_mds_info()) {
- // if it has a rank and state <= ACTIVE, it's good enough
- // if (info.rank != MDS_RANK_NONE && info.state <= MDSMap::STATE_ACTIVE) {
- if (info.rank != MDS_RANK_NONE && info.state == MDSMap::STATE_ACTIVE) {
- member_info.push(&info);
- membership.members.insert(info.global_id);
- }
- }
-
- QuiesceInterface::PeerId leader =
- member_info.empty()
- ? QuiesceClusterMembership::INVALID_MEMBER
- : member_info.top()->global_id;
+ mdsmap->get_quiesce_db_cluster(membership.leader, membership.members);
membership.epoch = mdsmap->get_epoch();
- membership.leader = leader;
membership.me = me;
membership.fs_name = mdsmap->get_fs_name();
- dout(5) << "epoch:" << membership.epoch << " me:" << me << " leader:" << leader << " members:" << membership.members
+ dout(5) << "epoch:" << membership.epoch << " me:" << me << " leader:" << membership.leader << " members:" << membership.members
<< (mdsmap->is_degraded() ? " (degraded)" : "") << dendl;
- if (leader != QuiesceClusterMembership::INVALID_MEMBER) {
+ if (membership.leader != QuiesceClusterMembership::INVALID_MEMBER) {
membership.send_ack = [=, this](QuiesceMap&& ack) {
- if (me == leader) {
+ if (me == membership.leader) {
// loopback
quiesce_db_manager->submit_ack_from(me, std::move(ack));
return 0;
} else {
std::lock_guard guard(mds_lock);
- if (mdsmap->get_state_gid(leader) == MDSMap::STATE_NULL) {
- dout(5) << "couldn't find the leader " << leader << " in the map" << dendl;
+ if (mdsmap->get_state_gid(membership.leader) == MDSMap::STATE_NULL) {
+ dout(5) << "couldn't find the leader " << membership.leader << " in the map" << dendl;
return -ENOENT;
}
- auto addrs = mdsmap->get_info_gid(leader).addrs;
+ auto addrs = mdsmap->get_info_gid(membership.leader).addrs;
auto ack_msg = make_message<MMDSQuiesceDbAck>();
- dout(10) << "sending ack " << ack << " to the leader " << leader << dendl;
+ dout(10) << "sending ack " << ack << " to the leader " << membership.leader << dendl;
ack_msg->encode_payload_from(me, ack);
return send_message_mds(ack_msg, addrs);
}
#include <set>
#include <queue>
+#include <unordered_set>
-template <>
-struct std::hash<mds_gid_t> {
- size_t operator()(const mds_gid_t& gid) const
- {
- return hash<uint64_t> {}(gid);
- }
-};
-
struct QuiesceClusterMembership {
static const QuiesceInterface::PeerId INVALID_MEMBER;
QuiesceInterface::PeerId me = INVALID_MEMBER;
QuiesceInterface::PeerId leader = INVALID_MEMBER;
- std::set<QuiesceInterface::PeerId> members;
+ std::unordered_set<QuiesceInterface::PeerId> members;
// A courier interface to decouple from the messaging layer
// Failures can be ignored, manager will call this repeatedly if needed
// ============================
// quiesce db leader interface:
- // -> EPERM unless this is the leader
+ // -> ENOTTY unless this is the leader
// client interface to the DB
int submit_request(RequestContext* ctx) {
#include <regex>
#include <sstream>
+#include <queue>
+#include <ranges>
#include <boost/utility.hpp>
#include "MDSMonitor.h"
dout(10) << "create_pending e" << fsmap.get_epoch() << dendl;
}
+void MDSMonitor::assign_quiesce_db_leader(FSMap &fsmap) {
+
+ // the quiesce leader is the lowest rank with the highest state up to ACTIVE
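+ // less_leader is an ordering for a max-heap (std::priority_queue): a higher rank,
+ // or a lower state among equal ranks, compares "less", so top() yields the daemon
+ // with the lowest rank and the highest state.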
+ auto less_leader = [](MDSMap::mds_info_t const* l, MDSMap::mds_info_t const* r) {
+ ceph_assert(l->rank != MDS_RANK_NONE);
+ ceph_assert(r->rank != MDS_RANK_NONE);
+ ceph_assert(l->state <= MDSMap::STATE_ACTIVE);
+ ceph_assert(r->state <= MDSMap::STATE_ACTIVE);
+ if (l->rank == r->rank) {
+ return l->state < r->state;
+ } else {
+ return l->rank > r->rank;
+ }
+ };
+
+ for (const auto& [fscid, fs] : std::as_const(fsmap)) {
+ auto &&mdsmap = fs.get_mds_map();
+
+ if (mdsmap.get_epoch() < fsmap.get_epoch()) {
+ // no changes in this fs, we can skip the calculation below
+ // NB! be careful with this clause when updating the leader selection logic.
+ // When the input from outside of this fsmap will affect the decision
+ // this clause will have to be updated, too.
+ continue;
+ }
+
+ std::priority_queue<MDSMap::mds_info_t const*, std::vector<MDSMap::mds_info_t const*>, decltype(less_leader)>
+ member_info(less_leader);
+
+ std::unordered_set<mds_gid_t> members;
+
+ for (auto&& [gid, info] : mdsmap.get_mds_info()) {
+ // any daemon with a rank and state <= ACTIVE would be good enough, but for now
+ // only ACTIVE daemons are admitted into the quiesce db cluster
+ // if (info.rank != MDS_RANK_NONE && info.state <= MDSMap::STATE_ACTIVE) {
+ if (info.rank != MDS_RANK_NONE && info.state == MDSMap::STATE_ACTIVE) {
+ member_info.push(&info);
+ members.insert(info.global_id);
+ }
+ }
+
+ auto leader = member_info.empty() ? MDS_GID_NONE : member_info.top()->global_id;
+
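+ // update_quiesce_db_cluster() returns false when the cluster hasn't changed,
+ // so modify_filesystem() only bumps the map epoch for a real update.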
+ fsmap.modify_filesystem(fscid, [&leader, &members](auto &writable_fs) -> bool {
+ return writable_fs.get_mds_map().update_quiesce_db_cluster(leader, std::move(members));
+ });
+ }
+}
+
void MDSMonitor::encode_pending(MonitorDBStore::TransactionRef t)
{
auto &pending = get_pending_fsmap_writeable();
auto epoch = pending.get_epoch();
+ assign_quiesce_db_leader(pending);
+
dout(10) << "encode_pending e" << epoch << dendl;
// print map iff 'debug mon = 30' or higher
int load_metadata(std::map<mds_gid_t, Metadata>& m);
void count_metadata(const std::string& field, ceph::Formatter *f);
+ void assign_quiesce_db_leader(FSMap &fsmap);
+
public:
void print_fs_summary(std::ostream& out) {
get_fsmap().print_fs_summary(out);
))
return r
-
- MDS_STATE_ORD = {
- "down:dne": 0, # CEPH_MDS_STATE_DNE,
- "down:stopped": -1, # CEPH_MDS_STATE_STOPPED,
- "down:damaged": 15, # CEPH_MDS_STATE_DAMAGED,
- "up:boot": -4, # CEPH_MDS_STATE_BOOT,
- "up:standby": -5, # CEPH_MDS_STATE_STANDBY,
- "up:standby-replay": -8, # CEPH_MDS_STATE_STANDBY_REPLAY,
- "up:oneshot-replay": -9, # CEPH_MDS_STATE_REPLAYONCE,
- "up:creating": -6, # CEPH_MDS_STATE_CREATING,
- "up:starting": -7, # CEPH_MDS_STATE_STARTING,
- "up:replay": 8, # CEPH_MDS_STATE_REPLAY,
- "up:resolve": 9, # CEPH_MDS_STATE_RESOLVE,
- "up:reconnect": 10, # CEPH_MDS_STATE_RECONNECT,
- "up:rejoin": 11, # CEPH_MDS_STATE_REJOIN,
- "up:clientreplay": 12, # CEPH_MDS_STATE_CLIENTREPLAY,
- "up:active": 13, # CEPH_MDS_STATE_ACTIVE,
- "up:stopping": 14, # CEPH_MDS_STATE_STOPPING,
- }
- MDS_STATE_ACTIVE_ORD = MDS_STATE_ORD["up:active"]
-
- def get_quiesce_leader_info(self, fscid: str) -> Optional[dict]:
- """
- Helper for `tell_quiesce_leader` to chose the mds to send the command to.
-
- Quiesce DB is managed by a leader which is selected based on the current MDSMap
- The logic is currently implemented both here and on the MDS side,
- see MDSRank::quiesce_cluster_update().
-
- Ideally, this logic should be part of the MDSMonitor and the result should
- be exposed via a dedicated field in the map, but until that is implemented
- this function will have to be kept in sync with the corresponding logic
- on the MDS side
- """
- leader_info: Optional[dict] = None
+ def get_quiesce_leader_gid(self, fscid: str) -> Optional[int]:
+ leader_gid: Optional[int] = None
for fs in self.get("fs_map")['filesystems']:
if fscid != fs["id"]:
continue
- # quiesce leader is the lowest rank
- # with the highest state
+ # the quiesce db leader is selected by the mon
+ # and exposed in the mdsmap as "qdb_leader"
mdsmap = fs["mdsmap"]
- for info in mdsmap['info'].values():
- if info['rank'] == -1:
- continue
- if leader_info is None:
- leader_info = info
- else:
- if info['rank'] < leader_info['rank']:
- leader_info = info
- elif info['rank'] == leader_info['rank']:
- state_ord = self.MDS_STATE_ORD.get(info['state'])
- # if there are more than one daemons with the same rank
- # only one of them can be active
- if state_ord == self.MDS_STATE_ACTIVE_ORD:
- leader_info = info
+ leader_gid = mdsmap.get("qdb_leader", None)
break
- return leader_info
-
- def tell_quiesce_leader(self, fscid: str, cmd_dict: dict) -> Tuple[int, str, str]:
- qleader = self.get_quiesce_leader_info(fscid)
- if qleader is None:
- self.log.warn("Couldn't resolve the quiesce leader for fscid %s" % fscid)
- return (-errno.ENOENT, "", "Couldn't resolve the quiesce leader for fscid %s" % fscid)
- self.log.debug("resolved quiesce leader for fscid {fscid} at daemon '{name}' gid {gid} rank {rank} ({state})".format(fscid=fscid, **qleader))
- # We use the one_shot here to cover for cases when the mds crashes
- # without this parameter the client may get stuck awaiting response from a dead MDS
- return self.tell_command('mds', str(qleader['gid']), cmd_dict, one_shot=True)
+ return leader_gid
+
+ def tell_quiesce_leader(self, leader: int, cmd_dict: dict) -> Tuple[int, str, str]:
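+ # A non-leader mds replies with ENOTTY and a JSON payload carrying the gid of
+ # the current quiesce db leader, so we retry against the reported leader a
+ # bounded number of times.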
+ max_retries = 5
+ for _ in range(max_retries):
+ # We use "one_shot" here to cover for cases when the mds crashes
+ # without this parameter the client may get stuck awaiting response from a dead MDS
+ # (which is particularly bad for the volumes plugin finisher thread)
+ rc, stdout, stderr = self.tell_command('mds', str(leader), cmd_dict, one_shot=True)
+ if rc == -errno.ENOTTY:
+ try:
+ resp = json.loads(stdout)
+ leader = int(resp['leader'])
+ self.log.info("Retrying a quiesce db command with leader %d" % leader)
+ except Exception as e:
+ self.log.error("Couldn't parse ENOTTY response from an mds with error: %s\n%s" % (str(e), stdout))
+ break
+ else:
+ break
+
+ return (rc, stdout, stderr)
def send_command(
self,
volname = cmd['vol_name']
default_group_name = cmd.get('group_name', None)
roots = []
- fscid = None
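+ # an explicit target gid may be passed via 'with_leader'; otherwise the current
+ # quiesce db leader is resolved from the mdsmap below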
+ leader_gid = cmd.get('with_leader', None)
with open_volume(self, volname) as fs_handle:
- fscid = fs_handle.get_fscid()
+ if leader_gid is None:
+ fscid = fs_handle.get_fscid()
+ leader_gid = self.mgr.get_quiesce_leader_gid(fscid)
+ if leader_gid is None:
+ return -errno.ENOENT, "", "Couldn't resolve the quiesce leader for volume %s (%s)" % (volname, fscid)
if cmd.get('leader', False):
- leader_info = self.mgr.get_quiesce_leader_info(fscid)
- if leader_info is None:
- return -errno.ENOENT, "", "Couldn't resolve the quiesce leader for volume %s (%s)" % (volname, fscid)
return (
0,
- "mds.%d" % leader_info['gid'],
- "Resolved the quiesce leader for volume '{volname}' as daemon '{name}' ({gid}) {state} rank {rank}".format(volname=volname, **leader_info)
+ "mds.%d" % leader_gid,
+ "Resolved the quiesce leader for volume '{volname}' as gid {gid}".format(volname=volname, gid=leader_gid)
)
cmd['roots'] = roots
cmd['prefix'] = 'quiesce db'
- return self.mgr.tell_quiesce_leader(fscid, cmd)
+ return self.mgr.tell_quiesce_leader(leader_gid, cmd)
def set_user_metadata(self, **kwargs):
ret = 0, "", ""
'name=all,type=CephBool,req=false '
'name=cancel,type=CephBool,req=false '
'name=group_name,type=CephString,req=false '
- 'name=leader,type=CephBool,req=false ',
+ 'name=leader,type=CephBool,req=false '
+ 'name=with_leader,type=CephInt,range=0,req=false ',
'desc': "Manage quiesce sets of subvolumes",
'perm': 'rw'
},
{
++epoch;
ASSERT_GE(leader_and_replicas.size(), 1);
- std::set<QuiesceInterface::PeerId> members(leader_and_replicas.begin(), leader_and_replicas.end());
+ std::unordered_set<QuiesceInterface::PeerId> members(leader_and_replicas.begin(), leader_and_replicas.end());
auto leader = leader_and_replicas[0];
for (const auto &[this_peer, mgr] : managers) {
QuiesceClusterMembership mem = {