From 8db596944f12e66910d4732d98f9b7716a2704c2 Mon Sep 17 00:00:00 2001 From: Venky Shankar Date: Tue, 4 Jun 2024 03:02:41 -0400 Subject: [PATCH] mds: emit warning for estinated replay time If replay runs more than 30 seconds, emit a warning with estimated replay completion time. Fixes: https://tracker.ceph.com/issues/61863 Signed-off-by: Manish M Yathnalli Signed-off-by: Venky Shankar --- src/common/options/mds.yaml.in | 6 +++++ src/mds/Beacon.cc | 14 ++++++++++ src/mds/MDLog.cc | 48 +++++++++++++++++++++++++++++++++- src/mds/MDLog.h | 4 +++ src/mds/mdstypes.cc | 5 ++++ src/mds/mdstypes.h | 8 ++++++ src/messages/MMDSBeacon.h | 4 +++ 7 files changed, 88 insertions(+), 1 deletion(-) diff --git a/src/common/options/mds.yaml.in b/src/common/options/mds.yaml.in index 94824faef6b..4cbc079a215 100644 --- a/src/common/options/mds.yaml.in +++ b/src/common/options/mds.yaml.in @@ -1713,6 +1713,12 @@ options: default: 1000 services: - mds +- name: mds_delay_journal_replay_for_testing + type: millisecs + level: dev + desc: Delay the journal replay to verify the replay time estimate + long_desc: Jorunal replay warning is activated if the mds has been in replay state for more than 30 seconds. This config delays replay for validating the replay warning in tests. + default: 0 flags: - runtime - name: mds_server_dispatch_killpoint_random diff --git a/src/mds/Beacon.cc b/src/mds/Beacon.cc index 6fbfc79d416..1c1eeb4ecf8 100644 --- a/src/mds/Beacon.cc +++ b/src/mds/Beacon.cc @@ -26,6 +26,7 @@ #include "mds/MDSRank.h" #include "mds/MDSMap.h" #include "mds/Locker.h" +#include "mds/mdstypes.h" #include "Beacon.h" @@ -550,6 +551,19 @@ void Beacon::notify_health(MDSRank const *mds) } } } + if (mds->is_replay()) { + CachedStackStringStream css; + auto estimate = mds->mdlog->get_estimated_replay_finish_time(); + // this probably should be configurable, however, its fine to report + // if replay is running for more than 30 seconds. + if (estimate.elapsed_time > std::chrono::seconds(30)) { + *css << "replay: " << estimate.percent_complete << "% complete - elapsed time: " + << estimate.elapsed_time << ", estimated time remaining: " + << estimate.estimated_time; + MDSHealthMetric m(MDS_HEALTH_ESTIMATED_REPLAY_TIME, HEALTH_WARN, css->strv()); + health.metrics.push_back(m); + } + } } MDSMap::DaemonState Beacon::get_want_state() const diff --git a/src/mds/MDLog.cc b/src/mds/MDLog.cc index 4bbf2a1a141..d041e3b2fc8 100644 --- a/src/mds/MDLog.cc +++ b/src/mds/MDLog.cc @@ -221,7 +221,46 @@ uint64_t MDLog::get_safe_pos() const return journaler->get_write_safe_pos(); } +// estimate the replay completion time based on mdlog journal pointers +EstimatedReplayTime MDLog::get_estimated_replay_finish_time() { + ceph_assert(mds->is_replay()); + EstimatedReplayTime estimated_time{0, std::chrono::seconds::zero(), std::chrono::seconds::zero()}; + if (!journaler) { + return estimated_time; + } + + auto read_pos = journaler->get_read_pos(); + auto write_pos = journaler->get_write_pos(); + auto trimmed_pos = journaler->get_trimmed_pos(); + + dout(20) << __func__ << ": read_pos=" << read_pos << ", write_pos=" + << write_pos << ", trimmed_pos=" << trimmed_pos << dendl; + + if (read_pos == trimmed_pos || write_pos == trimmed_pos) { + return estimated_time; + } + + auto total_bytes = write_pos - trimmed_pos; + double percent_complete = ((double)(read_pos - trimmed_pos)) / (double)total_bytes; + auto elapsed_time = std::chrono::duration_cast + (ceph::coarse_mono_clock::now() - replay_start_time); + auto time = ((1 - percent_complete) / percent_complete) * elapsed_time; + + dout(20) << __func__ << "percent_complete=" << percent_complete + << ", elapsed_time=" << elapsed_time + << ", estimated_time=" << std::chrono::round(time) + << dendl; + + estimated_time.percent_complete = percent_complete * 100; + estimated_time.elapsed_time = elapsed_time; + estimated_time.estimated_time = std::chrono::round(time); + dout(20) << __func__ << "estimated_time.percent_complete=" << estimated_time.percent_complete + << ", estimated_time.elapsed_time=" << estimated_time.elapsed_time + << ", estimated_time.estimated_time=" << estimated_time.estimated_time + << dendl; + return estimated_time; +} void MDLog::create(MDSContext *c) { @@ -1137,6 +1176,7 @@ void MDLog::_recovery_thread(MDSContext *completion) { std::lock_guard l(mds->mds_lock); journaler = front_journal; + replay_start_time = ceph::coarse_mono_clock::now(); } C_SaferCond recover_wait; @@ -1374,11 +1414,17 @@ void MDLog::_reformat_journal(JournalPointer const &jp_in, Journaler *old_journa // i am a separate thread void MDLog::_replay_thread() { - dout(10) << "_replay_thread start" << dendl; + dout(10) << __func__ << ": start time: " << replay_start_time << ", now: " + << ceph::coarse_mono_clock::now() << dendl; // loop int r = 0; while (1) { + auto sleep_time = g_conf().get_val("mds_delay_journal_replay_for_testing"); + if (unlikely(sleep_time > 0ms)) { + dout(10) << __func__ << ": sleeping for " << sleep_time << "ms" << dendl; + std::this_thread::sleep_for(sleep_time); + } // wait for read? journaler->check_isreadable(); if (journaler->get_error()) { diff --git a/src/mds/MDLog.h b/src/mds/MDLog.h index a858b40fa03..180a34c9d82 100644 --- a/src/mds/MDLog.h +++ b/src/mds/MDLog.h @@ -53,6 +53,7 @@ enum { #include "LogSegment.h" #include "MDSMap.h" #include "SegmentBoundary.h" +#include "mdstypes.h" #include #include @@ -162,6 +163,7 @@ public: void reopen(MDSContext *onopen); void append(); void replay(MDSContext *onfinish); + EstimatedReplayTime get_estimated_replay_finish_time(); void standby_trim_segments(); @@ -328,5 +330,7 @@ private: std::atomic upkeep_log_trim_shutdown{false}; std::map> waiting_for_expire; // protected by mds_lock + + ceph::coarse_mono_time replay_start_time = ceph::coarse_mono_clock::zero(); }; #endif diff --git a/src/mds/mdstypes.cc b/src/mds/mdstypes.cc index 680218e62e3..f9424eed6dc 100644 --- a/src/mds/mdstypes.cc +++ b/src/mds/mdstypes.cc @@ -1042,3 +1042,8 @@ void snaprealm_reconnect_t::generate_test_instances(std::listrealm.seq = 2; ls.back()->realm.parent = 1; } + +void EstimatedReplayTime::print(std::ostream& out) { + out << "replay: " << percent_complete << "% complete - elapsed time: " + << elapsed_time << ", estimated time remaining: " << estimated_time; +} diff --git a/src/mds/mdstypes.h b/src/mds/mdstypes.h index 3b8269006cb..742d7b23432 100644 --- a/src/mds/mdstypes.h +++ b/src/mds/mdstypes.h @@ -1044,4 +1044,12 @@ inline bool operator==(const MDSCacheObjectInfo& l, const MDSCacheObjectInfo& r) } WRITE_CLASS_ENCODER(MDSCacheObjectInfo) +struct EstimatedReplayTime { + double percent_complete; + std::chrono::seconds estimated_time; + std::chrono::seconds elapsed_time; + + void print(std::ostream& out); +}; + #endif diff --git a/src/messages/MMDSBeacon.h b/src/messages/MMDSBeacon.h index c157c33e758..526285aae8c 100644 --- a/src/messages/MMDSBeacon.h +++ b/src/messages/MMDSBeacon.h @@ -48,6 +48,7 @@ enum mds_metric_t { MDS_HEALTH_CLIENTS_LAGGY, MDS_HEALTH_CLIENTS_LAGGY_MANY, MDS_HEALTH_CLIENTS_BROKEN_ROOTSQUASH, + MDS_HEALTH_ESTIMATED_REPLAY_TIME, MDS_HEALTH_DUMMY, // not a real health warning, for testing }; @@ -69,6 +70,7 @@ inline const char *mds_metric_name(mds_metric_t m) case MDS_HEALTH_CLIENTS_LAGGY: return "MDS_CLIENTS_LAGGY"; case MDS_HEALTH_CLIENTS_LAGGY_MANY: return "MDS_CLIENTS_LAGGY_MANY"; case MDS_HEALTH_CLIENTS_BROKEN_ROOTSQUASH: return "MDS_CLIENTS_BROKEN_ROOTSQUASH"; + case MDS_HEALTH_ESTIMATED_REPLAY_TIME: return "MDS_ESTIMATED_REPLAY_TIME"; case MDS_HEALTH_DUMMY: return "MDS_DUMMY"; default: return "???"; @@ -107,6 +109,8 @@ inline const char *mds_metric_summary(mds_metric_t m) return "%num% client(s) laggy due to laggy OSDs"; case MDS_HEALTH_CLIENTS_BROKEN_ROOTSQUASH: return "%num% MDS report clients with broken root_squash implementation"; + case MDS_HEALTH_ESTIMATED_REPLAY_TIME: + return "%num% estimated journal replay time"; default: return "???"; } -- 2.39.5