For use by high-level storage operators like Rook.
Fixes: https://tracker.ceph.com/issues/62849
Signed-off-by: Patrick Donnelly <pdonnell@redhat.com>
services:
- mon
with_legacy: true
+- name: mon_fsmap_prune_threshold
+ type: secs
+ level: advanced
+ desc: prune fsmap older than this threshold in seconds
+ fmt_desc: The monitors keep historical fsmaps in memory so that they can
+ quickly answer when an MDS daemon was last seen in the FSMap. This option
+ controls how far back in time the monitors will look.
+ default: 300
+ flags:
+ - runtime
+ services:
+ - mon
- name: mds_beacon_mon_down_grace
type: secs
level: advanced
#include <sstream>
#include <queue>
#include <ranges>
+#include <boost/range/adaptors.hpp>
#include <boost/utility.hpp>
#include "MDSMonitor.h"
ds << fsmap;
}
r = 0;
+ } else if (prefix == "mds last-seen") {
+ std::string id;
+ cmd_getval(cmdmap, "id", id);
+
+ dout(10) << "last seen check for " << id << dendl;
+
+ auto& history = get_fsmap_history();
+ auto now = real_clock::now();
+ bool found = false;
+ /* Special case:
+ * If the mons consider the MDS "in" the latest FSMap, then the mds
+ * is always "last seen" **now** (for the purposes of this API). We
+ * don't look at past beacons because that is only managed by the
+ * leader and the logic is fudged in places in the event of suspected
+ * network partitions.
+ */
+ std::chrono::seconds since = std::chrono::seconds(0);
+
+ for (auto& [epoch, fsmaph] : boost::adaptors::reverse(history)) {
+ dout(25) << "looking at epoch " << epoch << dendl;
+ auto* info = fsmaph.find_by_name(id);
+ if (info) {
+ dout(10) << "found: " << *info << dendl;
+ found = true;
+ if (f) {
+ f->open_object_section("mds last-seen");
+ f->dump_object("info", *info);
+ f->dump_string("last-seen", fmt::format("{}", since));
+ f->dump_int("epoch", epoch);
+ f->close_section();
+ f->flush(ds);
+ } else {
+ ds << fmt::format("{}", since);
+ }
+ break;
+ }
+ /* The history is walked from newest to oldest. If the MDS appears in the
+ * next (older) epoch we examine, then it went away as of this epoch's btime.
+ */
+ since = std::chrono::duration_cast<std::chrono::seconds>(now - fsmaph.get_btime());
+ }
+ if (found) {
+ r = 0;
+ } else {
+ ss << "mds " << id << " not found in recent FSMaps";
+ r = -ENOENT;
+ }
} else if (prefix == "mds ok-to-stop") {
vector<string> ids;
if (!cmd_getval(cmdmap, "ids", ids)) {
void MDSMonitor::tick()
{
+ {
+ auto _history_prune_time = g_conf().get_val<std::chrono::seconds>("mon_fsmap_prune_threshold");
+ set_fsmap_history_threshold(_history_prune_time);
+ dout(20) << _history_prune_time << dendl;
+ prune_fsmap_history();
+ auto& history = get_fsmap_history();
+ auto now = real_clock::now();
+ if (auto it = history.begin(); it != history.end()) {
+ auto start = it->second.get_epoch();
+ dout(20) << "oldest epoch in history is " << start << dendl;
+ for (;;) {
+ --start;
+ bufferlist bl;
+ FSMap fsmaph;
+ int err = get_version(start, bl);
+ if (err == -ENOENT) {
+ break;
+ }
+ ceph_assert(err == 0);
+ ceph_assert(bl.length());
+ fsmaph.decode(bl);
+ auto btime = fsmaph.get_btime();
+ auto since = std::chrono::duration_cast<std::chrono::milliseconds>(now - btime);
+ dout(20) << "loaded epoch " << fsmaph.get_epoch() << " which is " << since << " old" << dendl;
+ if (since <= _history_prune_time) {
+ put_fsmap_history(fsmaph);
+ } else {
+ break;
+ }
+ }
+ }
+ }
+
if (!is_active() || !is_leader()) return;
auto &pending = get_pending_fsmap_writeable();
#define FS_NAME_GOODCHARS "[A-Za-z0-9-_.]"
COMMAND_WITH_FLAG("mds stat", "show MDS status", "mds", "r", FLAG(HIDDEN))
+/* Reports how long ago the MDS named <id> last appeared in the FSMaps the
+ * monitor keeps in memory. */
+COMMAND("mds last-seen name=id,type=CephString,req=true",
+        "fetch how long ago mds <id> was last seen in the FSMap",
+        "mds", "r")
COMMAND("fs dump "
"name=epoch,type=CephInt,req=false,range=0",
"dump all CephFS status, optionally from epoch", "mds", "r")
#ifndef CEPH_PAXOS_FSMAP_H
#define CEPH_PAXOS_FSMAP_H
+#include <chrono>
+
#include "mds/FSMap.h"
#include "mds/MDSMap.h"
return pending_fsmap;
}
+  /**
+   * Drop the oldest entries of the fsmap history that have aged out.
+   *
+   * The oldest map is erased only while both it and its successor are older
+   * than history_prune_time. The newest map that predates the threshold is
+   * therefore retained, and pruning never empties the history.
+   */
+  void prune_fsmap_history() {
+    const auto now = real_clock::now();
+    const auto aged_out = [&](const FSMap& m) {
+      return (now - m.get_btime()) > history_prune_time;
+    };
+    /* A lone remaining map is always kept so the history does not become empty. */
+    while (history.size() > 1) {
+      auto oldest = history.begin();
+      auto successor = std::next(oldest);
+      /* Keep the map just before the prune time threshold:
+       *   [ e-1 (lifetime > history_prune_time) | e (lifetime 1s) ]
+       * If an mds was removed in (e), then we want to be able to say it was
+       * last seen 1 second ago.
+       */
+      if (!aged_out(oldest->second) || !aged_out(successor->second)) {
+        break;
+      }
+      history.erase(oldest);
+    }
+  }
+
+  /**
+   * Remember a copy of _fsmap in the history, keyed by its epoch.
+   *
+   * The map is ignored unless its age (now minus its btime) is strictly less
+   * than history_prune_time. An epoch that is already recorded is left
+   * untouched.
+   */
+  void put_fsmap_history(const FSMap& _fsmap) {
+    const auto age = real_clock::now() - _fsmap.get_btime();
+    if (age >= history_prune_time) {
+      return;
+    }
+    history.try_emplace(_fsmap.get_epoch(), _fsmap);
+  }
+
+  /** Set the age threshold used when recording and pruning fsmap history. */
+  void set_fsmap_history_threshold(std::chrono::seconds t) {
+    this->history_prune_time = t;
+  }
+  /** Return the age threshold currently applied to the fsmap history. */
+  std::chrono::seconds get_fsmap_history_threshold() const {
+    return this->history_prune_time;
+  }
+
+  /** Read-only access to the retained fsmaps, keyed (and ordered) by epoch. */
+  const auto& get_fsmap_history() const {
+    return this->history;
+  }
+
void decode(ceph::buffer::list &bl) { /* load the current epoch's map from bl */
fsmap.decode(bl);
+ put_fsmap_history(fsmap); /* offer the freshly decoded map to the history; it is stored only if its btime is within the prune threshold */
pending_fsmap = FSMap(); /* nuke it to catch invalid access */
}
private:
/* Keep these PRIVATE to prevent unprotected manipulation. */
+ std::map<epoch_t, FSMap> history;
+ std::chrono::seconds history_prune_time = std::chrono::seconds(0);
FSMap fsmap; /* the current epoch */
FSMap pending_fsmap; /* the next epoch */
};