OPTION(mon_fake_pool_delete, OPT_BOOL) // fake pool deletion (add _DELETED suffix)
OPTION(mon_globalid_prealloc, OPT_U32) // how many globalids to prealloc
OPTION(mon_osd_report_timeout, OPT_INT) // grace period before declaring unresponsive OSDs dead
-OPTION(mon_force_standby_active, OPT_BOOL) // should mons force standby-replay mds to be active
OPTION(mon_warn_on_legacy_crush_tunables, OPT_BOOL) // warn if crush tunables are too old (older than mon_crush_min_required_version)
OPTION(mon_crush_min_required_version, OPT_STR) // threshold referenced by mon_warn_on_legacy_crush_tunables
OPTION(mon_warn_on_crush_straw_calc_version_zero, OPT_BOOL) // warn if crush straw_calc_version==0
OPTION(mds_wipe_sessions, OPT_BOOL)
OPTION(mds_wipe_ino_prealloc, OPT_BOOL)
OPTION(mds_skip_ino, OPT_INT)
-OPTION(mds_standby_for_name, OPT_STR)
-OPTION(mds_standby_for_rank, OPT_INT)
-OPTION(mds_standby_for_fscid, OPT_INT)
-OPTION(mds_standby_replay, OPT_BOOL)
OPTION(mds_enable_op_tracker, OPT_BOOL) // enable/disable MDS op tracking
OPTION(mds_op_history_size, OPT_U32) // Max number of completed ops to track
OPTION(mds_op_history_duration, OPT_U32) // Oldest completed op to track
.add_service("mon")
.set_description("time before OSDs who do not report to the mons are marked down (seconds)"),
- Option("mon_force_standby_active", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
- .set_default(true)
- .add_service("mon")
- .set_description("allow use of MDS daemons in standby-replay as replacements"),
-
Option("mon_warn_on_msgr2_not_enabled", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
.set_default(true)
.add_service("mon")
.set_default(0)
.set_description(""),
- Option("mds_standby_for_name", Option::TYPE_STR, Option::LEVEL_ADVANCED)
- .set_default("")
- .set_description("standby for named MDS daemon when not active"),
-
- Option("mds_standby_for_rank", Option::TYPE_INT, Option::LEVEL_BASIC)
- .set_default(-1)
- .set_description("allow MDS to become a standby:replay daemon"),
-
- Option("mds_standby_for_fscid", Option::TYPE_INT, Option::LEVEL_ADVANCED)
- .set_default(-1)
- .set_description("standby only for the file system with the given fscid"),
-
- Option("mds_standby_replay", Option::TYPE_BOOL, Option::LEVEL_BASIC)
- .set_default(false)
- .set_description("allow MDS to standby replay for an active MDS"),
-
Option("mds_enable_op_tracker", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
.set_default(true)
.set_description("track remote operation progression and statistics"),
std::unique_lock lock(mutex);
_notify_mdsmap(mdsmap);
- standby_for_rank = mds_rank_t(g_conf()->mds_standby_for_rank);
- standby_for_name = g_conf()->mds_standby_for_name;
- standby_for_fscid = fs_cluster_id_t(g_conf()->mds_standby_for_fscid);
- standby_replay = g_conf()->mds_standby_replay;
sender = std::thread([this]() {
std::unique_lock<std::mutex> lock(mutex);
last_seq,
CEPH_FEATURES_SUPPORTED_DEFAULT);
- beacon->set_standby_for_rank(standby_for_rank);
- beacon->set_standby_for_name(standby_for_name);
- beacon->set_standby_for_fscid(standby_for_fscid);
- beacon->set_standby_replay(standby_replay);
beacon->set_health(health);
beacon->set_compat(compat);
// piggyback the sys info on beacon msg
std::string name;
version_t epoch = 0;
CompatSet compat;
- mds_rank_t standby_for_rank = MDS_RANK_NONE;
- std::string standby_for_name;
- fs_cluster_id_t standby_for_fscid = FS_CLUSTER_ID_NONE;
- bool standby_replay = false;
MDSMap::DaemonState want_state = MDSMap::STATE_BOOT;
// Internal beacon state
std::set<mds_rank_t> stuck_failed;
for (const auto &rank : fs->mds_map.failed) {
- const mds_gid_t replacement = find_replacement_for(
- {fs->fscid, rank}, {}, g_conf()->mon_force_standby_active);
+ auto&& replacement = find_replacement_for({fs->fscid, rank}, {});
if (replacement == MDS_GID_NONE) {
stuck_failed.insert(rank);
}
// Construct mds_roles, standby_daemons, and remove
// standbys from the MDSMap in the Filesystem.
- for (auto &p : migrate_fs->mds_map.mds_info) {
- if (p.second.state == MDSMap::STATE_STANDBY_REPLAY) {
- // In legacy MDSMap, standby replay daemons don't have
- // rank set, but since FSMap they do.
- p.second.rank = p.second.standby_for_rank;
- }
- if (p.second.rank == MDS_RANK_NONE) {
- if (p.second.state != MDSMap::STATE_STANDBY) {
+ for (const auto& [gid, info] : migrate_fs->mds_map.mds_info) {
+ if (info.state == MDSMap::STATE_STANDBY_REPLAY) {
+ /* drop any legacy standby-replay daemons */
+ drop_gids.insert(gid);
+ } else if (info.rank == MDS_RANK_NONE) {
+ if (info.state != MDSMap::STATE_STANDBY) {
// Old MDSMaps can have down:dne here, which
// is invalid in an FSMap (#17837)
- drop_gids.insert(p.first);
+ drop_gids.insert(gid);
} else {
- insert(p.second); // into standby_daemons
+ insert(info); // into standby_daemons
}
} else {
- mds_roles[p.first] = migrate_fs->fscid;
+ mds_roles[gid] = migrate_fs->fscid;
}
}
for (const auto &p : standby_daemons) {
mds_map.print(out);
}
/**
 * Pick a daemon that can take over `role`.
 *
 * Selection order implemented by the new (+) body:
 *  1. a daemon in role.fscid's MDSMap whose rank equals role.rank and whose
 *     state is STATE_STANDBY_REPLAY;
 *  2. otherwise the first entry of standby_daemons (in iteration order)
 *     that is not laggy.
 *
 * `name` is not read by the new body; it remains in the signature only.
 *
 * @return the GID of the chosen daemon, or MDS_GID_NONE when none qualifies.
 */
-mds_gid_t FSMap::find_standby_for(mds_role_t role, std::string_view name) const
+mds_gid_t FSMap::find_replacement_for(mds_role_t role, std::string_view name) const
{
- mds_gid_t result = MDS_GID_NONE;
+ auto&& fs = get_filesystem(role.fscid);
// First see if we have a STANDBY_REPLAY
// (i.e. a daemon in this filesystem already holding role.rank in standby-replay state)
- auto fs = get_filesystem(role.fscid);
- for (const auto &i : fs->mds_map.mds_info) {
- const auto &info = i.second;
+ for (const auto& [gid, info] : fs->mds_map.mds_info) {
if (info.rank == role.rank && info.state == MDSMap::STATE_STANDBY_REPLAY) {
- return info.global_id;
+ return gid;
}
}
// See if there are any STANDBY daemons available
// (every entry of standby_daemons is asserted to be rank-less and in STATE_STANDBY)
- for (const auto &i : standby_daemons) {
- const auto &gid = i.first;
- const auto &info = i.second;
- ceph_assert(info.state == MDSMap::STATE_STANDBY);
+ for (const auto& [gid, info] : standby_daemons) {
ceph_assert(info.rank == MDS_RANK_NONE);
+ ceph_assert(info.state == MDSMap::STATE_STANDBY);
if (info.laggy()) {
continue;
}
- // The mds_info_t may or may not tell us exactly which filesystem
- // the standby_for_rank refers to: lookup via legacy_client_fscid
- mds_role_t target_role = {
- info.standby_for_fscid == FS_CLUSTER_ID_NONE ?
- legacy_client_fscid : info.standby_for_fscid,
- info.standby_for_rank};
-
- if ((target_role.rank == role.rank && target_role.fscid == role.fscid)
- || (name.length() && info.standby_for_name == name)) {
- // It's a named standby for *me*, use it.
- return gid;
- } else if (
- info.standby_for_rank < 0 && info.standby_for_name.length() == 0 &&
- (info.standby_for_fscid == FS_CLUSTER_ID_NONE ||
- info.standby_for_fscid == role.fscid)) {
- // It's not a named standby for anyone, use it if we don't find
- // a named standby for me later, unless it targets another FSCID.
- result = gid;
- }
+ return gid; // no per-daemon targeting is checked any more: the first non-laggy standby is taken
}
- return result;
-}
-
-mds_gid_t FSMap::find_unused_for(mds_role_t role,
- bool force_standby_active) const {
- for (const auto &i : standby_daemons) {
- const auto &gid = i.first;
- const auto &info = i.second;
- ceph_assert(info.state == MDSMap::STATE_STANDBY);
-
- if (info.laggy() || info.rank >= 0)
- continue;
-
- if (info.standby_for_fscid != FS_CLUSTER_ID_NONE &&
- info.standby_for_fscid != role.fscid)
- continue;
- if (info.standby_for_rank != MDS_RANK_NONE &&
- info.standby_for_rank != role.rank)
- continue;
-
- // To be considered 'unused' a daemon must either not
- // be selected for standby-replay or the force_standby_active
- // setting must be enabled to use replay daemons anyway.
- if (!info.standby_replay || force_standby_active) {
- return gid;
- }
- }
return MDS_GID_NONE; // neither a standby-replay follower nor a non-laggy standby was found
}
-mds_gid_t FSMap::find_replacement_for(mds_role_t role, std::string_view name,
- bool force_standby_active) const {
- const mds_gid_t standby = find_standby_for(role, name);
- if (standby)
- return standby;
- else
- return find_unused_for(role, force_standby_active);
-}
-
void FSMap::sanity() const
{
if (legacy_client_fscid != FS_CLUSTER_ID_NONE) {
void FSMap::promote(
mds_gid_t standby_gid,
- const Filesystem::ref& filesystem,
+ Filesystem& filesystem,
mds_rank_t assigned_rank)
{
ceph_assert(gid_exists(standby_gid));
ceph_assert(standby_daemons.at(standby_gid).state == MDSMap::STATE_STANDBY);
}
- MDSMap &mds_map = filesystem->mds_map;
+ MDSMap &mds_map = filesystem.mds_map;
// Insert daemon state to Filesystem
if (!is_standby_replay) {
}
info.rank = assigned_rank;
info.inc = epoch;
- mds_roles[standby_gid] = filesystem->fscid;
+ mds_roles[standby_gid] = filesystem.fscid;
// Update the rank state in Filesystem
mds_map.in.insert(assigned_rank);
*/
void promote(
mds_gid_t standby_gid,
- const Filesystem::ref& filesystem,
+ Filesystem& filesystem,
mds_rank_t assigned_rank);
/**
 * Mutator helper for Filesystem objects: expose a non-const
 * Filesystem pointer to `fn` and update epochs appropriately.
 *
 * `fn` is invoked exactly once with the `filesystems` entry for `fscid`;
 * afterwards that filesystem's mds_map.epoch is set to the current `epoch`.
 *
 * NOTE(review): the lookup uses filesystems.at(), so an unknown fscid is
 * presumably reported by the container (std::out_of_range for a std::map)
 * rather than creating an entry -- confirm the container type.
 * NOTE(review): the `auto&&` parameter in the new (+) signature is an
 * abbreviated function template; confirm the build's language level allows it.
 */
- void modify_filesystem(
- const fs_cluster_id_t fscid,
- std::function<void(Filesystem::ref)> fn)
+ void modify_filesystem(fs_cluster_id_t fscid, auto&& fn)
{
- auto fs = filesystems.at(fscid);
+ auto& fs = filesystems.at(fscid); // bind to the stored entry instead of copying it
fn(fs);
fs->mds_map.epoch = epoch; // stamp the modified filesystem with the current epoch
}
* Apply a mutation to the mds_info_t structure for a particular
* daemon (identified by GID), and make appropriate updates to epochs.
*/
- void modify_daemon(
- mds_gid_t who,
- std::function<void(MDSMap::mds_info_t *info)> fn)
+ void modify_daemon(mds_gid_t who, auto&& fn) // fn now receives the daemon's info by reference, not by pointer
{
- if (mds_roles.at(who) == FS_CLUSTER_ID_NONE) {
- auto &info = standby_daemons.at(who);
- fn(&info);
+ const auto& fscid = mds_roles.at(who); // looked up once and reused by both branches
+ if (fscid == FS_CLUSTER_ID_NONE) { // no filesystem: the daemon lives in standby_daemons
+ auto& info = standby_daemons.at(who);
+ fn(info);
ceph_assert(info.state == MDSMap::STATE_STANDBY); // fn must leave a standby in STATE_STANDBY
standby_epochs[who] = epoch; // stamp this standby with the current epoch
} else {
- const auto &fs = filesystems[mds_roles.at(who)];
- auto &info = fs->mds_map.mds_info.at(who);
- fn(&info);
-
+ auto& fs = filesystems.at(fscid); // at() instead of operator[]; presumably avoids creating an entry for an unknown fscid -- confirm container type
+ auto& info = fs->mds_map.mds_info.at(who);
+ fn(info);
fs->mds_map.epoch = epoch; // stamp the owning filesystem's MDSMap with the current epoch
}
}
/**
 * Replace the export_targets recorded for daemon `who` with `targets`.
 *
 * The filesystem is resolved through mds_roles and the assignment is made
 * via modify_filesystem(), which also updates that filesystem's MDSMap epoch.
 *
 * NOTE(review): this looks like it expects `who` to belong to a filesystem;
 * a standby's role would be passed straight to modify_filesystem() --
 * confirm callers never pass a standby gid.
 */
void update_export_targets(mds_gid_t who, const std::set<mds_rank_t> &targets)
{
auto fscid = mds_roles.at(who);
- modify_filesystem(fscid, [who, &targets](auto fs) {
+ modify_filesystem(fscid, [who, &targets](auto&& fs) {
fs->mds_map.mds_info.at(who).export_targets = targets;
});
}
return false;
}
- mds_gid_t find_standby_for(mds_role_t mds, std::string_view name) const;
-
- mds_gid_t find_unused_for(mds_role_t mds, bool force_standby_active) const;
-
- mds_gid_t find_replacement_for(mds_role_t mds, std::string_view name,
- bool force_standby_active) const;
+ mds_gid_t find_replacement_for(mds_role_t mds, std::string_view name) const;
void get_health(list<pair<health_status_t,std::string> >& summary,
list<pair<health_status_t,std::string> > *detail) const;
if (laggy_since != utime_t())
f->dump_stream("laggy_since") << laggy_since;
- f->dump_int("standby_for_rank", standby_for_rank);
- f->dump_int("standby_for_fscid", standby_for_fscid);
- f->dump_string("standby_for_name", standby_for_name);
- f->dump_bool("standby_replay", standby_replay);
f->open_array_section("export_targets");
for (set<mds_rank_t>::iterator p = export_targets.begin();
p != export_targets.end(); ++p) {
if (laggy()) {
out << " laggy since " << laggy_since;
}
- if (standby_for_rank != -1 ||
- !standby_for_name.empty()) {
- out << " (standby for";
- //if (standby_for_rank >= 0)
- out << " rank " << standby_for_rank;
- if (!standby_for_name.empty()) {
- out << " '" << standby_for_name << "'";
- }
- out << ")";
- }
if (!export_targets.empty()) {
out << " export_targets=" << export_targets;
}
encode(addrs, bl, features);
}
encode(laggy_since, bl);
- encode(standby_for_rank, bl);
- encode(standby_for_name, bl);
+ encode(MDS_RANK_NONE, bl); /* standby_for_rank */
+ encode(std::string(), bl); /* standby_for_name */
encode(export_targets, bl);
encode(mds_features, bl);
- encode(standby_for_fscid, bl);
- encode(standby_replay, bl);
+ encode(FS_CLUSTER_ID_NONE, bl); /* standby_for_fscid */
+ encode(false, bl);
ENCODE_FINISH(bl);
}
encode(state_seq, bl);
encode(addrs.legacy_addr(), bl, 0);
encode(laggy_since, bl);
- encode(standby_for_rank, bl);
- encode(standby_for_name, bl);
+ encode(MDS_RANK_NONE, bl);
+ encode(std::string(), bl);
encode(export_targets, bl);
}
decode(state_seq, bl);
decode(addrs, bl);
decode(laggy_since, bl);
- decode(standby_for_rank, bl);
- decode(standby_for_name, bl);
+ {
+ mds_rank_t standby_for_rank;
+ decode(standby_for_rank, bl);
+ }
+ {
+ std::string standby_for_name;
+ decode(standby_for_name, bl);
+ }
if (struct_v >= 2)
decode(export_targets, bl);
if (struct_v >= 5)
decode(mds_features, bl);
if (struct_v >= 6) {
+ fs_cluster_id_t standby_for_fscid;
decode(standby_for_fscid, bl);
}
if (struct_v >= 7) {
+ bool standby_replay;
decode(standby_replay, bl);
}
DECODE_FINISH(bl);
version_t state_seq = 0;
entity_addrvec_t addrs;
utime_t laggy_since;
- mds_rank_t standby_for_rank = MDS_RANK_NONE;
- std::string standby_for_name;
- fs_cluster_id_t standby_for_fscid = FS_CLUSTER_ID_NONE;
- bool standby_replay = false;
std::set<mds_rank_t> export_targets;
uint64_t mds_features = 0;
return is_clientreplay(m) || is_active(m) || is_stopping(m);
}
- bool is_followable(mds_rank_t m) const {
- return (is_resolve(m) ||
- is_replay(m) ||
- is_rejoin(m) ||
- is_clientreplay(m) ||
- is_active(m) ||
- is_stopping(m));
+  /**
+   * Whether rank `r` can be given a standby-replay follower.
+   *
+   * True only if at least one daemon in mds_info holds rank `r` and every
+   * daemon holding rank `r` is STATE_ACTIVE. A standby-replay daemon carries
+   * the rank it follows, so an existing follower of `r` (or `r` being in any
+   * other non-active state) makes the rank not followable.
+   *
+   * Daemons attached to other ranks are ignored: a standby-replay follower
+   * of a different rank must not prevent `r` from getting its own follower.
+   */
+  bool is_followable(mds_rank_t r) const {
+    bool has_followable_rank = false;
+    for (const auto& p : mds_info) {
+      auto& info = p.second;
+      if (info.rank != r) {
+        continue;  // unrelated rank (or no rank): does not affect r
+      }
+      if (info.state != STATE_ACTIVE) {
+        // r is not active, or this entry is r's existing standby-replay follower
+        return false;
+      }
+      has_followable_rank = true;
+    }
+    return has_followable_rank;
}
bool is_laggy_gid(mds_gid_t gid) const {
// might not include barriers from the previous incarnation of this MDS)
set_osd_epoch_barrier(objecter->with_osdmap(
std::mem_fn(&OSDMap::get_epoch)));
- }
- if (is_active()) {
+ /* Now check if we should hint to the OSD that a read may follow */
bool found = false;
- MDSMap::mds_info_t info = mdsmap->get_info(whoami);
-
- for (map<mds_gid_t,MDSMap::mds_info_t>::const_iterator p = mdsmap->get_mds_info().begin();
- p != mdsmap->get_mds_info().end();
- ++p) {
- if (p->second.state == MDSMap::STATE_STANDBY_REPLAY &&
- (p->second.standby_for_rank == whoami ||(info.name.length() && p->second.standby_for_name == info.name))) {
+ for (const auto& p : mdsmap->get_mds_info()) {
+ auto& info = p.second;
+ if (info.state == MDSMap::STATE_STANDBY_REPLAY && info.rank == whoami) {
found = true;
break;
}
MDSMap::DaemonState state = MDSMap::STATE_NULL;
version_t seq = 0;
- mds_rank_t standby_for_rank = MDS_RANK_NONE;
- string standby_for_name;
- fs_cluster_id_t standby_for_fscid = FS_CLUSTER_ID_NONE;
- bool standby_replay = false;
-
CompatSet compat;
MDSHealth health;
MDSMap::DaemonState get_state() const { return state; }
version_t get_seq() const { return seq; }
std::string_view get_type_name() const override { return "mdsbeacon"; }
- mds_rank_t get_standby_for_rank() const { return standby_for_rank; }
- const string& get_standby_for_name() const { return standby_for_name; }
- const fs_cluster_id_t& get_standby_for_fscid() const { return standby_for_fscid; }
- bool get_standby_replay() const { return standby_replay; }
uint64_t get_mds_features() const { return mds_features; }
CompatSet const& get_compat() const { return compat; }
MDSHealth const& get_health() const { return health; }
void set_health(const MDSHealth &h) { health = h; }
- void set_standby_for_rank(mds_rank_t r) { standby_for_rank = r; }
- void set_standby_for_name(string& n) { standby_for_name = n; }
- void set_standby_for_name(const char* c) { standby_for_name.assign(c); }
- void set_standby_for_fscid(fs_cluster_id_t f) { standby_for_fscid = f; }
- void set_standby_replay(bool r) { standby_replay = r; }
-
const map<string, string>& get_sys_info() const { return sys_info; }
void set_sys_info(const map<string, string>& i) { sys_info = i; }
encode((__u32)state, payload);
encode(seq, payload);
encode(name, payload);
- encode(standby_for_rank, payload);
- encode(standby_for_name, payload);
+ encode(MDS_RANK_NONE, payload);
+ encode(std::string(), payload);
encode(compat, payload);
encode(health, payload);
if (state == MDSMap::STATE_BOOT) {
encode(sys_info, payload);
}
encode(mds_features, payload);
- encode(standby_for_fscid, payload);
- encode(standby_replay, payload);
+ encode(FS_CLUSTER_ID_NONE, payload);
+ encode(false, payload);
}
void decode_payload() override {
using ceph::decode;
decode((__u32&)state, p);
decode(seq, p);
decode(name, p);
- decode(standby_for_rank, p);
- decode(standby_for_name, p);
+ {
+ mds_rank_t standby_for_rank;
+ decode(standby_for_rank, p);
+ }
+ {
+ std::string standby_for_name;
+ decode(standby_for_name, p);
+ }
decode(compat, p);
decode(health, p);
if (state == MDSMap::STATE_BOOT) {
decode(sys_info, p);
}
decode(mds_features, p);
- decode(standby_for_fscid, p);
+ {
+ fs_cluster_id_t standby_for_fscid;
+ decode(standby_for_fscid, p);
+ }
if (header.version >= 7) {
+ bool standby_replay;
decode(standby_replay, p);
}
if (header.version < 7 && state == MDSMap::STATE_STANDBY_REPLAY) {
// Old MDS daemons request the state, instead of explicitly
// advertising that they are configured as a replay daemon.
- standby_replay = true;
state = MDSMap::STATE_STANDBY;
}
}
mon->osdmon()->propose_pending();
// All checks passed, go ahead and create.
- auto fs = fsmap.create_filesystem(fs_name, metadata, data,
+ auto&& fs = fsmap.create_filesystem(fs_name, metadata, data,
mon->get_quorum_con_features());
ss << "new fs with metadata pool " << metadata << " and data pool " << data;
// assign a standby to rank 0 to avoid health warnings
std::string _name;
- mds_gid_t gid = fsmap.find_replacement_for({fs->fscid, 0}, _name,
- g_conf()->mon_force_standby_active);
+ mds_gid_t gid = fsmap.find_replacement_for({fs->fscid, 0}, _name);
if (gid != MDS_GID_NONE) {
const auto &info = fsmap.get_info_gid(gid);
mon->clog->info() << info.human_name() << " assigned to filesystem "
<< fs_name << " as rank 0";
- fsmap.promote(gid, fs, 0);
+ fsmap.promote(gid, *fs, 0);
}
return 0;
new_info.mds_features = m->get_mds_features();
new_info.state = MDSMap::STATE_STANDBY;
new_info.state_seq = seq;
- new_info.standby_for_rank = m->get_standby_for_rank();
- new_info.standby_for_name = m->get_standby_for_name();
- new_info.standby_for_fscid = m->get_standby_for_fscid();
- new_info.standby_replay = m->get_standby_replay();
pending.insert(new_info);
}
- // Resolve standby_for_name to a rank
- const MDSMap::mds_info_t &info = pending.get_info_gid(gid);
- if (!info.standby_for_name.empty()) {
- const MDSMap::mds_info_t *leaderinfo = pending.find_by_name(
- info.standby_for_name);
- if (leaderinfo && (leaderinfo->rank >= 0)) {
- const auto &fscid = pending.mds_roles.at(leaderinfo->global_id);
-
- pending.modify_daemon(gid, [fscid, leaderinfo](
- MDSMap::mds_info_t *info) {
- info->standby_for_rank = leaderinfo->rank;
- info->standby_for_fscid = fscid;
- });
- }
- }
-
// initialize the beacon timer
auto &beacon = last_beacon[gid];
beacon.stamp = mono_clock::now();
return true;
}
- const MDSMap::mds_info_t &info = pending.get_info_gid(gid);
- // Old MDS daemons don't mention that they're standby replay until
- // after they've sent their boot beacon, so update this field.
- if (info.standby_replay != m->get_standby_replay()) {
- pending.modify_daemon(info.global_id, [&m](
- MDSMap::mds_info_t *i)
- {
- i->standby_replay = m->get_standby_replay();
- });
- }
-
+ const auto& info = pending.get_info_gid(gid);
if (info.state == MDSMap::STATE_STOPPING &&
state != MDSMap::STATE_STOPPING &&
state != MDSMap::STATE_STOPPED) {
if (info.laggy()) {
dout(1) << "prepare_beacon clearing laggy flag on " << addrs << dendl;
- pending.modify_daemon(info.global_id, [](MDSMap::mds_info_t *info)
+ pending.modify_daemon(info.global_id, [](auto& info)
{
- info->clear_laggy();
+ info.clear_laggy();
}
);
}
dout(5) << "prepare_beacon mds." << info.rank
<< " " << ceph_mds_state_name(info.state)
<< " -> " << ceph_mds_state_name(state)
- << " standby_for_rank=" << m->get_standby_for_rank()
<< dendl;
if (state == MDSMap::STATE_STOPPED) {
const auto fscid = pending.mds_roles.at(gid);
// Made it through special cases and validations, record the
// daemon's reported state to the FSMap.
- pending.modify_daemon(gid, [state, seq](MDSMap::mds_info_t *info) {
- info->state = state;
- info->state_seq = seq;
+ pending.modify_daemon(gid, [state, seq](auto& info) {
+ info.state = state;
+ info.state_seq = seq;
});
}
}
return -EINVAL;
}
if (fsmap.gid_exists(gid)) {
- fsmap.modify_daemon(gid, [state](MDSMap::mds_info_t *info) {
- info->state = state;
+ fsmap.modify_daemon(gid, [state](auto& info) {
+ info.state = state;
});
ss << "set mds gid " << gid << " to state " << state << " "
<< ceph_mds_state_name(state);
bool MDSMonitor::maybe_resize_cluster(FSMap &fsmap, fs_cluster_id_t fscid)
{
// MDSMap for this fscid as returned by get_fsmap(); the `fsmap` argument is looked up separately below.
auto &current_mds_map = get_fsmap().get_filesystem(fscid)->mds_map;
- auto fs = fsmap.get_filesystem(fscid);
+ auto&& fs = fsmap.get_filesystem(fscid);
auto &mds_map = fs->mds_map;
int in = mds_map.get_num_in_mds();
while (mds_map.is_in(mds)) {
mds++;
}
- mds_gid_t newgid = fsmap.find_replacement_for({fscid, mds},
- name, g_conf()->mon_force_standby_active);
+ auto&& newgid = fsmap.find_replacement_for({fscid, mds}, name);
if (newgid == MDS_GID_NONE) {
return false;
}
"filesystem " << mds_map.fs_name << " as rank "
<< mds << " (now has " << mds_map.get_num_in_mds() + 1
<< " ranks)";
- fsmap.promote(newgid, fs, mds);
+ fsmap.promote(newgid, *fs, mds);
return true;
} else if (in > max) {
mds_rank_t target = in - 1;
if (mds_map.is_active(target)) {
dout(1) << "stopping " << target << dendl;
mon->clog->info() << "stopping " << info.human_name();
- fsmap.modify_daemon(info.global_id,
- [] (MDSMap::mds_info_t *info) {
- info->state = MDSMap::STATE_STOPPING;
- });
+ auto f = [](auto& info) {
+ info.state = MDSMap::STATE_STOPPING;
+ };
+ fsmap.modify_daemon(info.global_id, f);
return true;
} else {
dout(20) << "skipping stop of " << target << dendl;
info.state != MDSMap::STATE_STANDBY_REPLAY &&
may_replace &&
!fsmap.get_filesystem(fscid)->mds_map.test_flag(CEPH_MDSMAP_NOT_JOINABLE) &&
- (sgid = fsmap.find_replacement_for({fscid, info.rank}, info.name,
- g_conf()->mon_force_standby_active)) != MDS_GID_NONE)
+ (sgid = fsmap.find_replacement_for({fscid, info.rank}, info.name)) != MDS_GID_NONE)
{
MDSMap::mds_info_t si = fsmap.get_info_gid(sgid);
*osd_propose |= fail_mds_gid(fsmap, gid);
// Promote the replacement
- auto fs = fsmap.filesystems.at(fscid);
- fsmap.promote(sgid, fs, info.rank);
+ auto&& fs = fsmap.filesystems.at(fscid);
+ fsmap.promote(sgid, *fs, info.rank);
*mds_propose = true;
} else if ((info.state == MDSMap::STATE_STANDBY_REPLAY ||
<< " mds." << info.rank << "." << info.inc
<< " " << ceph_mds_state_name(info.state)
<< " laggy" << dendl;
- fsmap.modify_daemon(info.global_id, [](MDSMap::mds_info_t *info) {
- info->laggy_since = ceph_clock_now();
+ fsmap.modify_daemon(info.global_id, [](auto& info) {
+ info.laggy_since = ceph_clock_now();
});
*mds_propose = true;
}
}
-bool MDSMonitor::maybe_promote_standby(FSMap &fsmap, std::shared_ptr<Filesystem> &fs)
+bool MDSMonitor::maybe_promote_standby(FSMap &fsmap, Filesystem& fs)
{
- if (fs->mds_map.test_flag(CEPH_MDSMAP_NOT_JOINABLE)) {
+ if (fs.mds_map.test_flag(CEPH_MDSMAP_NOT_JOINABLE)) {
return false;
}
// have a standby take over?
set<mds_rank_t> failed;
- fs->mds_map.get_failed_mds_set(failed);
- if (!failed.empty()) {
- set<mds_rank_t>::iterator p = failed.begin();
- while (p != failed.end()) {
- mds_rank_t f = *p++;
- mds_gid_t sgid = fsmap.find_replacement_for({fs->fscid, f}, {},
- g_conf()->mon_force_standby_active);
- if (sgid) {
- const MDSMap::mds_info_t si = fsmap.get_info_gid(sgid);
- dout(1) << " taking over failed mds." << f << " with " << sgid
- << "/" << si.name << " " << si.addrs << dendl;
- mon->clog->info() << "Standby " << si.human_name()
- << " assigned to filesystem " << fs->mds_map.fs_name
- << " as rank " << f;
-
- fsmap.promote(sgid, fs, f);
- do_propose = true;
- }
- }
- } else if (!fs->mds_map.is_degraded()) {
- // There were no failures to replace, so try using any available standbys
- // as standby-replay daemons. Don't do this when the cluster is degraded
- // as a standby-replay daemon may try to read a journal being migrated.
-
- // Take a copy of the standby GIDs so that we can iterate over
- // them while perhaps-modifying standby_daemons during the loop
- // (if we promote anyone they are removed from standby_daemons)
- std::vector<mds_gid_t> standby_gids;
- for (const auto &j : fsmap.standby_daemons) {
- standby_gids.push_back(j.first);
- }
-
- for (const auto &gid : standby_gids) {
- const auto &info = fsmap.standby_daemons.at(gid);
- ceph_assert(info.state == MDSMap::STATE_STANDBY);
-
- if (!info.standby_replay) {
- continue;
- }
-
- /*
- * This mds is standby but has no rank assigned.
- * See if we can find it somebody to shadow
- */
- dout(20) << "gid " << gid << " is standby and following nobody" << dendl;
-
- // standby for someone specific?
- if (info.standby_for_rank >= 0) {
- // The mds_info_t may or may not tell us exactly which filesystem
- // the standby_for_rank refers to: lookup via legacy_client_fscid
- mds_role_t target_role = {
- info.standby_for_fscid == FS_CLUSTER_ID_NONE ?
- fsmap.legacy_client_fscid : info.standby_for_fscid,
- info.standby_for_rank};
-
- // It is possible that the map contains a standby_for_fscid
- // that doesn't correspond to an existing filesystem, especially
- // if we loaded from a version with a bug (#17466)
- if (info.standby_for_fscid != FS_CLUSTER_ID_NONE
- && !fsmap.filesystem_exists(info.standby_for_fscid)) {
- derr << "gid " << gid << " has invalid standby_for_fscid "
- << info.standby_for_fscid << dendl;
- continue;
- }
-
- // If we managed to resolve a full target role
- if (target_role.fscid != FS_CLUSTER_ID_NONE) {
- const auto &fs = fsmap.get_filesystem(target_role.fscid);
- if (fs->mds_map.is_followable(target_role.rank)) {
- do_propose |= try_standby_replay(fsmap, info, *fs,
- fs->mds_map.get_info(target_role.rank));
- }
- }
+ fs.mds_map.get_failed_mds_set(failed);
+ for (const auto& rank : failed) {
+ auto&& sgid = fsmap.find_replacement_for({fs.fscid, rank}, {});
+ if (sgid) {
+ auto&& info = fsmap.get_info_gid(sgid);
+ dout(1) << " taking over failed mds." << rank << " with " << sgid
+ << "/" << info.name << " " << info.addrs << dendl;
+ mon->clog->info() << "Standby " << info.human_name()
+ << " assigned to filesystem " << fs.mds_map.fs_name
+ << " as rank " << rank;
- continue;
- }
-
- // check everyone
- for (const auto &p : fsmap.filesystems) {
- if (info.standby_for_fscid != FS_CLUSTER_ID_NONE &&
- info.standby_for_fscid != p.first)
- continue;
-
- bool assigned = false;
- const auto &fs = p.second;
- const MDSMap &mds_map = fs->mds_map;
- for (const auto &mds_i : mds_map.mds_info) {
- const MDSMap::mds_info_t &cand_info = mds_i.second;
- if (cand_info.rank >= 0 && mds_map.is_followable(cand_info.rank)) {
- if ((info.standby_for_name.length() && info.standby_for_name != cand_info.name) ||
- info.standby_for_rank != MDS_RANK_NONE) {
- continue; // we're supposed to follow someone else
- }
-
- if (try_standby_replay(fsmap, info, *fs, cand_info)) {
- assigned = true;
- break;
- }
- }
- }
- if (assigned) {
- do_propose = true;
- break;
- }
- }
+ fsmap.promote(sgid, fs, rank);
+ do_propose = true;
}
}
}
for (auto &p : pending.filesystems) {
- do_propose |= maybe_promote_standby(pending, p.second);
+ do_propose |= maybe_promote_standby(pending, *p.second);
}
if (do_propose) {
}
}
-/**
- * finfo: the would-be follower
- * leader_fs: the Filesystem containing the would-be leader
- * ainfo: the would-be leader
- */
-bool MDSMonitor::try_standby_replay(
- FSMap &fsmap,
- const MDSMap::mds_info_t& finfo,
- const Filesystem &leader_fs,
- const MDSMap::mds_info_t& ainfo)
-{
- // someone else already following?
- if (leader_fs.has_standby_replay(ainfo.global_id)) {
- dout(20) << " mds." << ainfo.rank << " already has a follower" << dendl;
- return false;
- } else {
- // Assign the new role to the standby
- dout(10) << " setting to follow mds rank " << ainfo.rank << dendl;
- fsmap.assign_standby_replay(finfo.global_id, leader_fs.fscid, ainfo.rank);
- return true;
- }
-}
-
MDSMonitor::MDSMonitor(Monitor *mn, Paxos *p, string service_name)
: PaxosService(mn, p, service_name)
{
};
map<mds_gid_t, beacon_info_t> last_beacon;
- bool try_standby_replay(FSMap &fsmap, const MDSMap::mds_info_t& finfo,
- const Filesystem &leader_fs, const MDSMap::mds_info_t& ainfo);
-
std::list<std::shared_ptr<FileSystemCommandHandler> > handlers;
- bool maybe_promote_standby(FSMap &fsmap, std::shared_ptr<Filesystem> &fs);
+ bool maybe_promote_standby(FSMap& fsmap, Filesystem& fs);
bool maybe_resize_cluster(FSMap &fsmap, fs_cluster_id_t fscid);
void maybe_replace_gid(FSMap &fsmap, mds_gid_t gid,
const MDSMap::mds_info_t& info, bool *mds_propose, bool *osd_propose);