From 4b908318a2bc806c3ffb43004595586dc33646fb Mon Sep 17 00:00:00 2001 From: Patrick Donnelly Date: Mon, 18 Sep 2023 13:34:39 -0400 Subject: [PATCH] mon/MDSMonitor: add command to lookup when mds was last seen For use by high-level storage operators like Rook. Fixes: https://tracker.ceph.com/issues/62849 Signed-off-by: Patrick Donnelly --- src/common/options/mon.yaml.in | 12 +++++ src/mon/MDSMonitor.cc | 80 ++++++++++++++++++++++++++++++++++ src/mon/MonCommands.h | 3 ++ src/mon/PaxosFSMap.h | 47 ++++++++++++++++++++ 4 files changed, 142 insertions(+) diff --git a/src/common/options/mon.yaml.in b/src/common/options/mon.yaml.in index 075b335a08f74..1ec9871b6a8ea 100644 --- a/src/common/options/mon.yaml.in +++ b/src/common/options/mon.yaml.in @@ -778,6 +778,18 @@ options: services: - mon with_legacy: true +- name: mon_fsmap_prune_threshold + type: secs + level: advanced + desc: prune fsmap older than this threshold in seconds + fmt_desc: The monitors keep historical fsmaps in memory to optimize asking + when an MDS daemon was last seen in the FSMap. This option controls + how far back in time the monitors will look. + default: 300 + flags: + - runtime + services: + - mon - name: mds_beacon_mon_down_grace type: secs level: advanced diff --git a/src/mon/MDSMonitor.cc b/src/mon/MDSMonitor.cc index 9dd72152327da..76a57ac443de7 100644 --- a/src/mon/MDSMonitor.cc +++ b/src/mon/MDSMonitor.cc @@ -16,6 +16,7 @@ #include #include #include +#include #include #include "MDSMonitor.h" @@ -1025,6 +1026,52 @@ bool MDSMonitor::preprocess_command(MonOpRequestRef op) ds << fsmap; } r = 0; + } else if (prefix == "mds last-seen") { + std::string id; + cmd_getval(cmdmap, "id", id); + + dout(10) << "last seen check for " << id << dendl; + + auto& history = get_fsmap_history(); + auto now = real_clock::now(); + bool found = false; + /* Special case: + * If the mons consider the MDS "in" the latest FSMap, then the mds + * is always "last seen" **now** (for the purposes of this API). 
We + * don't look at past beacons because that is only managed by the + * leader and the logic is fudged in places in the event of suspected + * network partitions. + */ + std::chrono::seconds since = std::chrono::seconds(0); + + for (auto& [epoch, fsmaph] : boost::adaptors::reverse(history)) { + dout(25) << "looking at epoch " << epoch << dendl; + auto* info = fsmaph.find_by_name(id); + if (info) { + dout(10) << "found: " << *info << dendl; + found = true; + if (f) { + f->open_object_section("mds last-seen"); + f->dump_object("info", *info); + f->dump_string("last-seen", fmt::format("{}", since)); + f->dump_int("epoch", epoch); + f->close_section(); + f->flush(ds); + } else { + ds << fmt::format("{}", since); + } + break; + } + /* If the MDS appears in the next epoch, then it went away as of this epoch's btime. + */ + since = std::chrono::duration_cast(now - fsmaph.get_btime()); + } + if (found) { + r = 0; + } else { + ss << "mds " << id << " not found in recent FSMaps"; + r = -ENOENT; + } } else if (prefix == "mds ok-to-stop") { vector ids; if (!cmd_getval(cmdmap, "ids", ids)) { @@ -2380,6 +2427,39 @@ bool MDSMonitor::maybe_promote_standby(FSMap &fsmap, const Filesystem& fs) void MDSMonitor::tick() { + { + auto _history_prune_time = g_conf().get_val("mon_fsmap_prune_threshold"); + set_fsmap_history_threshold(_history_prune_time); + dout(20) << _history_prune_time << dendl; + prune_fsmap_history(); + auto& history = get_fsmap_history(); + auto now = real_clock::now(); + if (auto it = history.begin(); it != history.end()) { + auto start = it->second.get_epoch(); + dout(20) << "oldest epoch in history is " << start << dendl; + for (;;) { + --start; + bufferlist bl; + FSMap fsmaph; + int err = get_version(start, bl); + if (err == -ENOENT) { + break; + } + ceph_assert(err == 0); + ceph_assert(bl.length()); + fsmaph.decode(bl); + auto btime = fsmaph.get_btime(); + auto since = std::chrono::duration_cast(now - btime); + dout(20) << "loaded epoch " << fsmaph.get_epoch() 
<< " which is " << since << " old" << dendl; + if (since <= _history_prune_time) { + put_fsmap_history(fsmaph); + } else { + break; + } + } + } + } + if (!is_active() || !is_leader()) return; auto &pending = get_pending_fsmap_writeable(); diff --git a/src/mon/MonCommands.h b/src/mon/MonCommands.h index b2a678dff53c5..d7f7e28fe97a8 100644 --- a/src/mon/MonCommands.h +++ b/src/mon/MonCommands.h @@ -294,6 +294,9 @@ COMMAND("versions", #define FS_NAME_GOODCHARS "[A-Za-z0-9-_.]" COMMAND_WITH_FLAG("mds stat", "show MDS status", "mds", "r", FLAG(HIDDEN)) +COMMAND("mds last-seen name=id,type=CephString,req=true", + "show how long ago mds <id> was last seen in the FSMap", + "mds", "r") COMMAND("fs dump " "name=epoch,type=CephInt,req=false,range=0", "dump all CephFS status, optionally from epoch", "mds", "r") diff --git a/src/mon/PaxosFSMap.h b/src/mon/PaxosFSMap.h index 7299988316118..4312d7e1f4d7b 100644 --- a/src/mon/PaxosFSMap.h +++ b/src/mon/PaxosFSMap.h @@ -15,6 +15,8 @@ #ifndef CEPH_PAXOS_FSMAP_H #define CEPH_PAXOS_FSMAP_H +#include <chrono> + #include "mds/FSMap.h" #include "mds/MDSMap.h" @@ -39,13 +41,58 @@ protected: return pending_fsmap; } + void prune_fsmap_history() { + auto now = real_clock::now(); + for (auto it = history.begin(); it != history.end(); ) { + auto since = now - it->second.get_btime(); + /* Be sure to not make the map empty */ + auto itnext = std::next(it); + if (itnext == history.end()) { + break; + } + /* Keep the map just before the prune time threshold: + * [ e-1 (lifetime > history_prune_time) | e (lifetime 1s) ] + * If an mds was removed in (e), then we want to be able to say it was + * last seen 1 second ago. 
+ */ + auto since2 = now - itnext->second.get_btime(); + if (since > history_prune_time && since2 > history_prune_time) { + it = history.erase(it); + } else { + break; + } + } + } + + void put_fsmap_history(const FSMap& _fsmap) { + auto now = real_clock::now(); + auto since = now - _fsmap.get_btime(); + if (since < history_prune_time) { + history.emplace(std::piecewise_construct, std::forward_as_tuple(_fsmap.get_epoch()), std::forward_as_tuple(_fsmap)); + } + } + + void set_fsmap_history_threshold(std::chrono::seconds t) { + history_prune_time = t; + } + std::chrono::seconds get_fsmap_history_threshold() const { + return history_prune_time; + } + + const auto& get_fsmap_history() const { + return history; + } + void decode(ceph::buffer::list &bl) { fsmap.decode(bl); + history.try_emplace(fsmap.get_epoch(), fsmap); /* always record the current epoch, whatever its age: put_fsmap_history() would drop it when older than the threshold (0 until first set), leaving history empty */ pending_fsmap = FSMap(); /* nuke it to catch invalid access */ } private: /* Keep these PRIVATE to prevent unprotected manipulation. */ + std::map<epoch_t, FSMap> history; + std::chrono::seconds history_prune_time = std::chrono::seconds(0); FSMap fsmap; /* the current epoch */ FSMap pending_fsmap; /* the next epoch */ }; -- 2.39.5