f->open_object_section("info");
info.dump(f);
f->dump_int("epoch", standby_epochs.at(gid));
- auto p = standby_daemon_fscid.find(gid);
- if (p != standby_daemon_fscid.end()) {
- f->dump_int("fscid", p->second);
- }
f->close_section();
}
f->close_section();
mds_roles = rhs.mds_roles;
standby_daemons = rhs.standby_daemons;
standby_epochs = rhs.standby_epochs;
- standby_daemon_fscid = rhs.standby_daemon_fscid;
filesystems.clear();
for (const auto &i : rhs.filesystems) {
}
for (const auto& p : standby_daemons) {
- out << p.second;
- auto q = standby_daemon_fscid.find(p.first);
- if (q != standby_daemon_fscid.end()) {
- out << " (" << q->second << ")";
- }
- out << std::endl;
+ out << p.second << std::endl;
}
}
std::set<mds_rank_t> stuck_failed;
for (const auto &rank : fs->mds_map.failed) {
- auto&& replacement = find_replacement_for({fs->fscid, rank}, {});
- if (replacement == MDS_GID_NONE) {
+ auto rep_info = find_replacement_for({fs->fscid, rank});
+ if (!rep_info) {
stuck_failed.insert(rank);
}
}
void FSMap::encode(bufferlist& bl, uint64_t features) const
{
- ENCODE_START(8, 6, bl);
+ ENCODE_START(7, 6, bl); // struct_v back to 7: standby_daemon_fscid, the field v8 appended, is no longer encoded
encode(epoch, bl);
encode(next_filesystem_id, bl);
encode(legacy_client_fscid, bl);
encode(standby_daemons, bl, features);
encode(standby_epochs, bl);
encode(ever_enabled_multiple, bl);
- encode(standby_daemon_fscid, bl);
ENCODE_FINISH(bl);
}
if (struct_v >= 7) {
decode(ever_enabled_multiple, p);
}
- if (struct_v >= 8) {
- decode(standby_daemon_fscid, p);
- }
}
DECODE_FINISH(p);
std::map<mds_gid_t, MDSMap::mds_info_t> FSMap::get_mds_info() const
{
- std::map<mds_gid_t, MDSMap::mds_info_t> result;
+ std::map<mds_gid_t, mds_info_t> result;
for (const auto &i : standby_daemons) {
result[i.first] = i.second;
}
return result;
}
-mds_gid_t FSMap::get_available_standby(fs_cluster_id_t fscid) const
+const MDSMap::mds_info_t* FSMap::get_available_standby(fs_cluster_id_t fscid) const
{
+ const mds_info_t* who = nullptr;
for (const auto& [gid, info] : standby_daemons) {
ceph_assert(info.rank == MDS_RANK_NONE);
ceph_assert(info.state == MDSMap::STATE_STANDBY);
continue;
}
- auto p = standby_daemon_fscid.find(gid);
- if (p != standby_daemon_fscid.end() &&
- p->second != fscid) {
- continue;
+ if (info.join_fscid == fscid) {
+ who = &info;
+ break;
+ } else if (info.join_fscid == FS_CLUSTER_ID_NONE) {
+ who = &info; /* vanilla standby */
+ } else if (who == nullptr) {
+ who = &info; /* standby for another fs, last resort */
}
-
- return gid;
}
- return MDS_GID_NONE;
+ return who;
}
mds_gid_t FSMap::find_mds_gid_by_name(std::string_view s) const
const MDSMap::mds_info_t* FSMap::find_by_name(std::string_view name) const
{
- std::map<mds_gid_t, MDSMap::mds_info_t> result;
+ std::map<mds_gid_t, mds_info_t> result;
for (const auto &i : standby_daemons) {
if (i.second.name == name) {
return &(i.second);
return nullptr;
}
-mds_gid_t FSMap::find_replacement_for(mds_role_t role, std::string_view name) const
+const MDSMap::mds_info_t* FSMap::find_replacement_for(mds_role_t role) const
{
auto&& fs = get_filesystem(role.fscid);
if (info.rank == role.rank && info.state == MDSMap::STATE_STANDBY_REPLAY) {
if (info.is_frozen()) {
/* the standby-replay is frozen, do nothing! */
- return MDS_GID_NONE;
+ return nullptr;
} else {
- return gid;
+ return &info;
}
}
}
ceph_assert(mds_map.mds_info.at(standby_gid).state == MDSMap::STATE_STANDBY_REPLAY);
ceph_assert(mds_map.mds_info.at(standby_gid).rank == assigned_rank);
}
- MDSMap::mds_info_t &info = mds_map.mds_info[standby_gid];
+ auto& info = mds_map.mds_info[standby_gid];
if (mds_map.stopped.erase(assigned_rank)) {
// The cluster is being expanded with a stopped rank
if (!is_standby_replay) {
standby_daemons.erase(standby_gid);
standby_epochs.erase(standby_gid);
- standby_daemon_fscid.erase(standby_gid);
}
// Indicate that Filesystem has been modified
// Remove from the list of standbys
standby_daemons.erase(standby_gid);
standby_epochs.erase(standby_gid);
- standby_daemon_fscid.erase(standby_gid);
// Indicate that Filesystem has been modified
fs->mds_map.epoch = epoch;
if (mds_roles.at(who) == FS_CLUSTER_ID_NONE) {
standby_daemons.erase(who);
standby_epochs.erase(who);
- standby_daemon_fscid.erase(who);
} else {
auto &fs = filesystems.at(mds_roles.at(who));
const auto &info = fs->mds_map.mds_info.at(who);
standby_epochs[new_info.global_id] = epoch;
}
-void FSMap::adjust_standby_fscid(mds_gid_t standby_gid,
- fs_cluster_id_t fscid)
-{
- standby_daemon_fscid[standby_gid] = fscid;
-}
-
-std::size_t FSMap::clear_standby_fscid(mds_gid_t standby_gid)
-{
- auto count = standby_daemon_fscid.erase(standby_gid);
- return count;
-}
-
std::vector<mds_gid_t> FSMap::stop(mds_gid_t who)
{
ceph_assert(mds_roles.at(who) != FS_CLUSTER_ID_NONE);
}
return false;
}
+
+/**
+ * Remove a file system from the map and clear any affinity for it.
+ *
+ * Every daemon, whether a standby or one assigned to a remaining file
+ * system, whose join_fscid names the erased file system is reset to
+ * FS_CLUSTER_ID_NONE through modify_daemon().
+ */
+void FSMap::erase_filesystem(fs_cluster_id_t fscid)
+{
+  // Applied to each daemon that still prefers the erased file system.
+  const auto forget_affinity = [](auto& daemon) {
+    daemon.join_fscid = FS_CLUSTER_ID_NONE;
+  };
+
+  filesystems.erase(fscid);
+
+  // Standbys first ...
+  for (auto& [gid, info] : standby_daemons) {
+    if (info.join_fscid != fscid) {
+      continue;
+    }
+    modify_daemon(gid, forget_affinity);
+  }
+
+  // ... then the daemons held by the file systems that remain.
+  for (auto& entry : filesystems) {
+    for (auto& [gid, info] : entry.second->mds_map.get_mds_info()) {
+      if (info.join_fscid != fscid) {
+        continue;
+      }
+      modify_daemon(gid, forget_affinity);
+    }
+  }
+}
goto ignore;
}
- // did the standby fs change
- if (info.state == MDSMap::STATE_STANDBY &&
- state == MDSMap::STATE_STANDBY) {
- if (m->get_fs().size()) {
- fs_cluster_id_t fscid = FS_CLUSTER_ID_NONE;
- auto f = fsmap.get_filesystem(m->get_fs());
- if (f) {
- fscid = f->fscid;
- }
- auto p = fsmap.standby_daemon_fscid.find(gid);
- if (p == fsmap.standby_daemon_fscid.end() ||
- p->second != fscid) {
- dout(10) << __func__ << " standby mds_join_fs changed to " << fscid
- << " (" << m->get_fs() << ")" << dendl;
- return false;
- }
- } else {
- auto p = fsmap.standby_daemon_fscid.find(gid);
- if (p != fsmap.standby_daemon_fscid.end()) {
- dout(10) << __func__ << " standby mds_join_fs was cleared" << dendl;
- return false;
- }
+ // did the join_fscid change
+ if (m->get_fs().size()) {
+ fs_cluster_id_t fscid = FS_CLUSTER_ID_NONE;
+ auto f = fsmap.get_filesystem(m->get_fs());
+ if (f) {
+ fscid = f->fscid;
+ }
+ if (info.join_fscid != fscid) {
+ dout(10) << __func__ << " standby mds_join_fs changed to " << fscid
+ << " (" << m->get_fs() << ")" << dendl;
+ _note_beacon(m);
+ return false;
+ }
+ } else {
+ if (info.join_fscid != FS_CLUSTER_ID_NONE) {
+ dout(10) << __func__ << " standby mds_join_fs was cleared" << dendl;
+ _note_beacon(m);
+ return false;
}
}
if (f) {
fscid = f->fscid;
}
- pending.adjust_standby_fscid(gid, fscid);
+ new_info.join_fscid = fscid;
}
}
}
);
}
-
+
dout(5) << "prepare_beacon mds." << info.rank
<< " " << ceph_mds_state_name(info.state)
<< " -> " << ceph_mds_state_name(state)
<< dendl;
+
+ fs_cluster_id_t fscid = FS_CLUSTER_ID_NONE;
+ if (m->get_fs().size()) {
+ auto f = pending.get_filesystem(m->get_fs());
+ if (f) {
+ fscid = f->fscid;
+ }
+ }
+ pending.modify_daemon(gid, [fscid](auto& info) {
+ info.join_fscid = fscid;
+ });
+
if (state == MDSMap::STATE_STOPPED) {
const auto fscid = pending.mds_roles.at(gid);
const auto &fs = pending.get_filesystem(fscid);
auto erased = pending.stop(gid);
erased.push_back(gid);
- for (const auto &erased_gid : erased) {
+ for (const auto& erased_gid : erased) {
last_beacon.erase(erased_gid);
if (pending_daemon_health.count(erased_gid)) {
pending_daemon_health.erase(erased_gid);
info.state = state;
info.state_seq = seq;
});
-
- // Process standby mds_join_fs change
- if (state == MDSMap::STATE_STANDBY) {
- if (m->get_fs().size()) {
- fs_cluster_id_t fscid = FS_CLUSTER_ID_NONE;
- auto f = pending.get_filesystem(m->get_fs());
- if (f) {
- fscid = f->fscid;
- }
- pending.adjust_standby_fscid(gid, fscid);
- } else {
- pending.clear_standby_fscid(gid);
- }
- }
}
}
bool MDSMonitor::fail_mds_gid(FSMap &fsmap, mds_gid_t gid)
{
- const MDSMap::mds_info_t &info = fsmap.get_info_gid(gid);
+ const auto& info = fsmap.get_info_gid(gid);
dout(1) << "fail_mds_gid " << gid << " mds." << info.name << " role " << info.rank << dendl;
ceph_assert(mon->osdmon()->is_writeable());
if (in < max && !mds_map.test_flag(CEPH_MDSMAP_NOT_JOINABLE)) {
mds_rank_t mds = mds_rank_t(0);
- string name;
while (mds_map.is_in(mds)) {
mds++;
}
- auto&& newgid = fsmap.find_replacement_for({fscid, mds}, name);
- if (newgid == MDS_GID_NONE) {
+ auto info = fsmap.find_replacement_for({fscid, mds});
+ if (!info) {
return false;
}
- const auto &new_info = fsmap.get_info_gid(newgid);
- dout(1) << "assigned standby " << new_info.addrs
+ dout(1) << "assigned standby " << info->addrs
<< " as mds." << mds << dendl;
-
- mon->clog->info() << new_info.human_name() << " assigned to "
+ mon->clog->info() << info->human_name() << " assigned to "
"filesystem " << mds_map.fs_name << " as rank "
<< mds << " (now has " << mds_map.get_num_in_mds() + 1
<< " ranks)";
- fsmap.promote(newgid, *fs, mds);
+ fsmap.promote(info->global_id, *fs, mds);
return true;
} else if (in > max) {
mds_rank_t target = in - 1;
/**
- * If a daemon is laggy, and a suitable replacement
- * is available, fail this daemon (remove from map) and pass its
- * role to another daemon.
+ * Fail a daemon and replace it with a suitable standby.
+ *
+ * A frozen daemon is never dropped. A standby or standby-replay daemon is
+ * failed and removed without a replacement. A daemon holding a rank
+ * (rank >= 0) is only failed when rep_info is non-null and its file system
+ * does not have CEPH_MDSMAP_NOT_JOINABLE set; the replacement is then
+ * promoted into the vacated rank.
+ *
+ * @param fsmap       map to modify
+ * @param gid         daemon to drop; looked up with fsmap.mds_roles.at()
+ * @param rep_info    standby to promote in place of gid, or nullptr if none
+ * @param osd_propose must not be null; OR-ed with the result of fail_mds_gid()
+ * @return true if fail_mds_gid() was called for gid, false if nothing changed
*/
-void MDSMonitor::maybe_replace_gid(FSMap &fsmap, mds_gid_t gid,
- const MDSMap::mds_info_t& info, bool *mds_propose, bool *osd_propose)
+bool MDSMonitor::drop_mds(FSMap &fsmap, mds_gid_t gid, const mds_info_t* rep_info, bool *osd_propose)
{
- ceph_assert(mds_propose != nullptr);
ceph_assert(osd_propose != nullptr);
const auto fscid = fsmap.mds_roles.at(gid);
+ const auto& info = fsmap.get_info_gid(gid);
+ const auto rank = info.rank;
+ const auto state = info.state;
+
+ if (info.is_frozen()) {
+ return false;
+ } else if (state == MDSMap::STATE_STANDBY_REPLAY ||
+ state == MDSMap::STATE_STANDBY) {
+ dout(1) << " failing and removing standby " << gid << " " << info.addrs
+ << " mds." << rank
+ << "." << info.inc << " " << ceph_mds_state_name(state)
+ << dendl;
+ *osd_propose |= fail_mds_gid(fsmap, gid);
+ return true;
+ } else if (rank >= 0 && rep_info) {
+ auto fs = fsmap.filesystems.at(fscid);
+ if (fs->mds_map.test_flag(CEPH_MDSMAP_NOT_JOINABLE)) {
+ return false;
+ }
+ // are we in?
+ // and is there a non-laggy standby that can take over for us?
+ dout(1) << " replacing " << gid << " " << info.addrs
+ << " mds." << rank << "." << info.inc
+ << " " << ceph_mds_state_name(state)
+ << " with " << rep_info->global_id << "/" << rep_info->name << " " << rep_info->addrs
+ << dendl;
+
+ mon->clog->warn() << "Replacing " << info.human_name()
+ << " as rank " << rank
+ << " with standby " << rep_info->human_name();
+
+ // Remove the old one
+ *osd_propose |= fail_mds_gid(fsmap, gid);
+
+ // Promote the replacement
+ fsmap.promote(rep_info->global_id, *fs, rank);
+
+ return true;
+ }
+ return false;
+}
+
+bool MDSMonitor::check_health(FSMap& fsmap, bool* propose_osdmap)
+{
+ bool do_propose = false;
+ const auto now = mono_clock::now();
+ const bool osdmap_writeable = mon->osdmon()->is_writeable();
+ const auto mds_beacon_grace = g_conf().get_val<double>("mds_beacon_grace");
+ const auto mds_beacon_interval = g_conf().get_val<double>("mds_beacon_interval");
+
+ if (mono_clock::is_zero(last_tick)) {
+ last_tick = now;
+ }
+
+ {
+ auto since_last = std::chrono::duration<double>(now-last_tick);
+
+ if (since_last.count() > (mds_beacon_grace-mds_beacon_interval)) {
+ // This case handles either local slowness (calls being delayed
+ // for whatever reason) or cluster election slowness (a long gap
+ // between calls while an election happened)
+ dout(1) << __func__ << ": resetting beacon timeouts due to mon delay "
+ "(slow election?) of " << since_last.count() << " seconds" << dendl;
+ for (auto& p : last_beacon) {
+ p.second.stamp = now;
+ }
+ }
+ }
+
+ // make sure last_beacon is fully populated
+ for (auto& p : fsmap.mds_roles) {
+ auto& gid = p.first;
+ last_beacon.emplace(std::piecewise_construct,
+ std::forward_as_tuple(gid),
+ std::forward_as_tuple(now, 0));
+ }
// We will only take decisive action (replacing/removing a daemon)
- // if we have some indicating that some other daemon(s) are successfully
+ // if we have some indication that some other daemon(s) are successfully
// getting beacons through recently.
mono_time latest_beacon = mono_clock::zero();
- for (const auto &p : last_beacon) {
+ for (const auto& p : last_beacon) {
latest_beacon = std::max(p.second.stamp, latest_beacon);
}
- mono_time now = mono_clock::now();
- chrono::duration<double> since = now-latest_beacon;
- const bool frozen = info.is_frozen();
+ auto since = chrono::duration<double>(now-latest_beacon);
const bool may_replace = since.count() <
std::max(g_conf()->mds_beacon_interval, g_conf()->mds_beacon_grace * 0.5);
- // are we in?
- // and is there a non-laggy standby that can take over for us?
- mds_gid_t sgid;
- if (info.rank >= 0 &&
- info.state != MDSMap::STATE_STANDBY &&
- info.state != MDSMap::STATE_STANDBY_REPLAY &&
- may_replace &&
- !frozen &&
- !fsmap.get_filesystem(fscid)->mds_map.test_flag(CEPH_MDSMAP_NOT_JOINABLE) &&
- (sgid = fsmap.find_replacement_for({fscid, info.rank}, info.name)) != MDS_GID_NONE)
- {
-
- MDSMap::mds_info_t si = fsmap.get_info_gid(sgid);
- dout(1) << " replacing " << gid << " " << info.addrs
- << " mds." << info.rank << "." << info.inc
- << " " << ceph_mds_state_name(info.state)
- << " with " << sgid << "/" << si.name << " " << si.addrs
- << dendl;
+ // check beacon timestamps
+ std::vector<mds_gid_t> to_remove;
+ for (auto it = last_beacon.begin(); it != last_beacon.end(); ) {
+ auto& [gid, beacon_info] = *it;
+ auto since_last = chrono::duration<double>(now-beacon_info.stamp);
+
+ if (!fsmap.gid_exists(gid)) {
+ // gid no longer exists, remove from tracked beacons
+ it = last_beacon.erase(it);
+ continue;
+ }
- mon->clog->warn() << info.human_name()
- << " is not responding, replacing it "
- << "as rank " << info.rank
- << " with standby " << si.human_name();
+ if (since_last.count() >= g_conf()->mds_beacon_grace) {
+ auto& info = fsmap.get_info_gid(gid);
+ dout(1) << "no beacon from mds." << info.rank << "." << info.inc
+ << " (gid: " << gid << " addr: " << info.addrs
+ << " state: " << ceph_mds_state_name(info.state) << ")"
+ << " since " << since_last.count() << dendl;
+ // If the OSDMap is writeable, we can blacklist things, so we can
+ // try failing any laggy MDS daemons. Consider each one for failure.
+ if (!info.laggy()) {
+ dout(1) << " marking " << gid << " " << info.addrs
+ << " mds." << info.rank << "." << info.inc
+ << " " << ceph_mds_state_name(info.state)
+ << " laggy" << dendl;
+ fsmap.modify_daemon(info.global_id, [](auto& info) {
+ info.laggy_since = ceph_clock_now();
+ });
+ do_propose = true;
+ }
+ if (osdmap_writeable && may_replace) {
+ to_remove.push_back(gid); // drop_mds may invalidate iterator
+ }
+ }
- // Remember what NS the old one was in
- const fs_cluster_id_t fscid = fsmap.mds_roles.at(gid);
+ ++it;
+ }
- // Remove the old one
- *osd_propose |= fail_mds_gid(fsmap, gid);
+ // Try to drop (and, for ranks, replace) every daemon whose beacon timed out.
+ for (const auto& gid : to_remove) {
+   // Take a copy, not a reference: drop_mds() calls fail_mds_gid(), which
+   // erases this daemon from the FSMap. A reference into the map would be
+   // dangling by the time human_name() is read for the log message below.
+   const auto info = fsmap.get_info_gid(gid);
+   const mds_info_t* rep_info = nullptr;
+   if (info.rank >= 0) {
+     // Only a daemon holding a rank needs a standby to take over.
+     auto fscid = fsmap.gid_fscid(gid);
+     rep_info = fsmap.find_replacement_for({fscid, info.rank});
+   }
+   bool dropped = drop_mds(fsmap, gid, rep_info, propose_osdmap);
+   if (dropped) {
+     mon->clog->info() << "MDS " << info.human_name()
+       << " is removed because it is dead or otherwise unavailable.";
+     do_propose = true;
+   }
+ }
- // Promote the replacement
- auto&& fs = fsmap.filesystems.at(fscid);
- fsmap.promote(sgid, *fs, info.rank);
-
- *mds_propose = true;
- } else if ((info.state == MDSMap::STATE_STANDBY_REPLAY ||
- info.state == MDSMap::STATE_STANDBY) && may_replace && !frozen) {
- dout(1) << " failing and removing " << gid << " " << info.addrs
- << " mds." << info.rank
- << "." << info.inc << " " << ceph_mds_state_name(info.state)
- << dendl;
- mon->clog->info() << "Standby " << info.human_name() << " is not "
- "responding, dropping it";
- fail_mds_gid(fsmap, gid);
- *mds_propose = true;
- } else if (!info.laggy()) {
- dout(1) << " marking " << gid << " " << info.addrs
- << " mds." << info.rank << "." << info.inc
- << " " << ceph_mds_state_name(info.state)
- << " laggy" << dendl;
- fsmap.modify_daemon(info.global_id, [](auto& info) {
- info.laggy_since = ceph_clock_now();
- });
- *mds_propose = true;
+ if (osdmap_writeable) {
+ for (auto& [fscid, fs] : fsmap.filesystems) {
+ if (!fs->mds_map.test_flag(CEPH_MDSMAP_NOT_JOINABLE) &&
+ fs->mds_map.is_resizeable()) {
+ // Check if a rank or standby-replay should be replaced with a stronger
+ // affinity standby. This looks at ranks and standby-replay:
+ for (const auto& [gid, info] : fs->mds_map.get_mds_info()) {
+ const auto join_fscid = info.join_fscid;
+ if (join_fscid == fscid)
+ continue;
+ const auto rank = info.rank;
+ const auto state = info.state;
+ const mds_info_t* rep_info = nullptr;
+ if (state == MDSMap::STATE_STANDBY_REPLAY) {
+ rep_info = fsmap.get_available_standby(fscid);
+ } else if (state == MDSMap::STATE_ACTIVE) {
+ rep_info = fsmap.find_replacement_for({fscid, rank});
+ } else {
+ /* N.B. !is_degraded() */
+ ceph_abort_msg("invalid state in MDSMap");
+ }
+ if (!rep_info) {
+ break;
+ }
+ bool better_affinity = false;
+ if (join_fscid == FS_CLUSTER_ID_NONE) {
+ better_affinity = (rep_info->join_fscid == fscid);
+ } else {
+ better_affinity = (rep_info->join_fscid == fscid) ||
+ (rep_info->join_fscid == FS_CLUSTER_ID_NONE);
+ }
+ if (better_affinity) {
+ if (state == MDSMap::STATE_STANDBY_REPLAY) {
+ mon->clog->info() << "Dropping low affinity standby-replay "
+ << info.human_name()
+ << " in favor of higher affinity standby.";
+ *propose_osdmap |= fail_mds_gid(fsmap, gid);
+ /* Now let maybe_promote_standby do the promotion. */
+ } else {
+ mon->clog->info() << "Dropping low affinity active "
+ << info.human_name()
+ << " in favor of higher affinity standby.";
+ do_propose |= drop_mds(fsmap, gid, rep_info, propose_osdmap);
+ }
+ break; /* don't replace more than one per tick per fs */
+ }
+ }
+ }
+ }
}
+ return do_propose;
}
bool MDSMonitor::maybe_promote_standby(FSMap &fsmap, Filesystem& fs)
set<mds_rank_t> failed;
fs.mds_map.get_failed_mds_set(failed);
for (const auto& rank : failed) {
- auto&& sgid = fsmap.find_replacement_for({fs.fscid, rank}, {});
- if (sgid) {
- auto&& info = fsmap.get_info_gid(sgid);
- dout(1) << " taking over failed mds." << rank << " with " << sgid
- << "/" << info.name << " " << info.addrs << dendl;
- mon->clog->info() << "Standby " << info.human_name()
+ auto info = fsmap.find_replacement_for({fs.fscid, rank});
+ if (info) {
+ dout(1) << " taking over failed mds." << rank << " with " << info->global_id
+ << "/" << info->name << " " << info->addrs << dendl;
+ mon->clog->info() << "Standby " << info->human_name()
<< " assigned to filesystem " << fs.mds_map.fs_name
<< " as rank " << rank;
- fsmap.promote(sgid, fs, rank);
+ fsmap.promote(info->global_id, fs, rank);
do_propose = true;
}
}
- if (fs.mds_map.allows_standby_replay() && !fs.mds_map.is_degraded()) {
+ if (!fs.mds_map.is_degraded() && fs.mds_map.allows_standby_replay()) {
// There were no failures to replace, so try using any available standbys
// as standby-replay daemons. Don't do this when the cluster is degraded
// as a standby-replay daemon may try to read a journal being migrated.
for (;;) {
- auto standby_gid = fsmap.get_available_standby(fs.fscid);
- if (standby_gid == MDS_GID_NONE) break;
- dout(20) << "standby available mds." << standby_gid << dendl;
+ auto info = fsmap.get_available_standby(fs.fscid);
+ if (!info) break;
+ dout(20) << "standby available mds." << info->global_id << dendl;
bool changed = false;
for (const auto& rank : fs.mds_map.in) {
- dout(20) << "exmaining " << rank << dendl;
+ dout(20) << "examining " << rank << dendl;
if (fs.mds_map.is_followable(rank)) {
- dout(1) << " setting mds." << standby_gid
+ dout(1) << " setting mds." << info->global_id
<< " to follow mds rank " << rank << dendl;
- fsmap.assign_standby_replay(standby_gid, fs.fscid, rank);
+ fsmap.assign_standby_replay(info->global_id, fs.fscid, rank);
do_propose = true;
changed = true;
break;
void MDSMonitor::tick()
{
- // make sure mds's are still alive
- // ...if i am an active leader
-
if (!is_active() || !is_leader()) return;
auto &pending = get_pending_fsmap_writeable();
bool do_propose = false;
+ bool propose_osdmap = false;
do_propose |= pending.check_health();
- // resize mds cluster (adjust @in)?
- for (auto &p : pending.filesystems) {
- do_propose |= maybe_resize_cluster(pending, p.second->fscid);
- }
-
- mono_time now = mono_clock::now();
- if (mono_clock::is_zero(last_tick)) {
- last_tick = now;
- }
- chrono::duration<double> since_last = now-last_tick;
+ /* Check health and affinity of ranks */
+ do_propose |= check_health(pending, &propose_osdmap);
- if (since_last.count() >
- (g_conf()->mds_beacon_grace - g_conf()->mds_beacon_interval)) {
- // This case handles either local slowness (calls being delayed
- // for whatever reason) or cluster election slowness (a long gap
- // between calls while an election happened)
- dout(1) << __func__ << ": resetting beacon timeouts due to mon delay "
- "(slow election?) of " << now - last_tick << " seconds" << dendl;
- for (auto &p : last_beacon) {
- p.second.stamp = now;
- }
+ /* Resize the cluster according to max_mds. */
+ for (auto& p : pending.filesystems) {
+ do_propose |= maybe_resize_cluster(pending, p.second->fscid);
}
- last_tick = now;
-
- // make sure last_beacon is fully populated
- for (auto &p : pending.mds_roles) {
- auto &gid = p.first;
- last_beacon.emplace(std::piecewise_construct,
- std::forward_as_tuple(gid),
- std::forward_as_tuple(mono_clock::now(), 0));
+ /* Replace any failed ranks. */
+ for (auto& p : pending.filesystems) {
+ do_propose |= maybe_promote_standby(pending, *p.second);
}
-
- // check beacon timestamps
- bool propose_osdmap = false;
- bool osdmap_writeable = mon->osdmon()->is_writeable();
- for (auto it = last_beacon.begin(); it != last_beacon.end(); ) {
- mds_gid_t gid = it->first;
- auto beacon_info = it->second;
- chrono::duration<double> since_last = now-beacon_info.stamp;
-
- if (!pending.gid_exists(gid)) {
- // clean it out
- it = last_beacon.erase(it);
- continue;
- }
-
-
- if (since_last.count() >= g_conf()->mds_beacon_grace) {
- auto &info = pending.get_info_gid(gid);
- dout(1) << "no beacon from mds." << info.rank << "." << info.inc
- << " (gid: " << gid << " addr: " << info.addrs
- << " state: " << ceph_mds_state_name(info.state) << ")"
- << " since " << since_last.count() << "s" << dendl;
- // If the OSDMap is writeable, we can blacklist things, so we can
- // try failing any laggy MDS daemons. Consider each one for failure.
- if (osdmap_writeable) {
- maybe_replace_gid(pending, gid, info, &do_propose, &propose_osdmap);
- }
- }
-
- ++it;
- }
if (propose_osdmap) {
request_proposal(mon->osdmon());
}
- for (auto &p : pending.filesystems) {
- do_propose |= maybe_promote_standby(pending, *p.second);
- }
-
if (do_propose) {
propose_pending();
}
+
+ last_tick = mono_clock::now();
}
MDSMonitor::MDSMonitor(Monitor *mn, Paxos *p, string service_name)