From 718afe26ba880c75279d2c0540aa8ce9dadfc1ed Mon Sep 17 00:00:00 2001 From: Venky Shankar Date: Tue, 22 Oct 2024 08:00:23 +0000 Subject: [PATCH] mds: start a new major segment after reaching minor segment threshold Credit goes to Patrick (@batrick) for identifying this. When there is a huge number of subtree exports (as in the export thrashing test), the MDS logs EExport events. The EExport event is relatively large in size. This causes the MDS to log new minor log segments frequently. Moreover, the MDS logs a major segment (boundary) after a certain number of events have been logged. This causes a large number of (minor) events to build up and delays the trimming of expired segments, since the journal expire position is updated on segment boundaries. To mitigate this issue, the MDS now starts a major segment after a configured number of minor segments have been logged. This threshold is configurable by adjusting the `mds_log_minor_segments_per_major_segment` MDS config (defaults to 16). 
Fixes: https://tracker.ceph.com/issues/66948 Signed-off-by: Venky Shankar --- src/common/options/mds.yaml.in | 19 +++++++++---------- src/mds/MDLog.cc | 20 +++++++++++--------- src/mds/MDLog.h | 4 ++-- 3 files changed, 22 insertions(+), 21 deletions(-) diff --git a/src/common/options/mds.yaml.in b/src/common/options/mds.yaml.in index 18efba561ed..94824faef6b 100644 --- a/src/common/options/mds.yaml.in +++ b/src/common/options/mds.yaml.in @@ -586,16 +586,6 @@ options: min: 1 services: - mds -- name: mds_log_major_segment_event_ratio - type: uint - level: advanced - desc: multiple of mds_log_events_per_segment between major segments - default: 12 - services: - - mds - min: 1 - see_also: - - mds_log_events_per_segment # segment size for mds log, default to default file_layout_t - name: mds_log_segment_size type: size @@ -1741,3 +1731,12 @@ options: - mds flags: - runtime +- name: mds_log_minor_segments_per_major_segment + type: uint + level: advanced + desc: number of minor segments per major segment. + long_desc: The number of minor mds log segments since last major segment after which a major segment is started/logged. 
+ default: 16 + services: + - mds + min: 8 diff --git a/src/mds/MDLog.cc b/src/mds/MDLog.cc index 2e1212e8cf4..177bd32cbb9 100644 --- a/src/mds/MDLog.cc +++ b/src/mds/MDLog.cc @@ -53,12 +53,12 @@ MDLog::MDLog(MDSRank* m) event_large_threshold = g_conf().get_val("mds_log_event_large_threshold"); events_per_segment = g_conf().get_val("mds_log_events_per_segment"); pause = g_conf().get_val("mds_log_pause"); - major_segment_event_ratio = g_conf().get_val("mds_log_major_segment_event_ratio"); max_segments = g_conf().get_val("mds_log_max_segments"); max_events = g_conf().get_val("mds_log_max_events"); skip_corrupt_events = g_conf().get_val("mds_log_skip_corrupt_events"); skip_unbounded_events = g_conf().get_val("mds_log_skip_unbounded_events"); log_warn_factor = g_conf().get_val("mds_log_warn_factor"); + minor_segments_per_major_segment = g_conf().get_val("mds_log_minor_segments_per_major_segment"); upkeep_thread = std::thread(&MDLog::log_trim_upkeep, this); } @@ -358,14 +358,15 @@ void MDLog::_submit_entry(LogEvent *le, MDSLogContextBase* c) ceph_assert(!mds_is_shutting_down); event_seq++; - events_since_last_major_segment++; if (auto sb = dynamic_cast(le); sb) { auto ls = _start_new_segment(sb); if (sb->is_major_segment_boundary()) { major_segments.insert(ls->seq); logger->set(l_mdl_segmjr, major_segments.size()); - events_since_last_major_segment = 0; + minor_segments_since_last_major_segment = 0; + } else { + ++minor_segments_since_last_major_segment; } } @@ -404,7 +405,7 @@ void MDLog::_segment_upkeep() uint64_t period = journaler->get_layout_period(); auto ls = get_current_segment(); // start a new segment? 
- if (events_since_last_major_segment > events_per_segment*major_segment_event_ratio) { + if (minor_segments_since_last_major_segment > minor_segments_per_major_segment) { dout(10) << __func__ << ": starting new major segment, current " << *ls << dendl; auto sle = mds->mdcache->create_subtree_map(); _submit_entry(sle, NULL); @@ -1479,7 +1480,6 @@ void MDLog::_replay_thread() } le->set_start_off(pos); - events_since_last_major_segment++; if (auto sb = dynamic_cast(le.get()); sb) { auto seq = sb->get_seq(); if (seq > 0) { @@ -1492,7 +1492,9 @@ void MDLog::_replay_thread() if (sb->is_major_segment_boundary()) { major_segments.insert(event_seq); logger->set(l_mdl_segmjr, major_segments.size()); - events_since_last_major_segment = 0; + minor_segments_since_last_major_segment = 0; + } else { + ++minor_segments_since_last_major_segment; } } else { event_seq++; @@ -1623,9 +1625,6 @@ void MDLog::handle_conf_change(const std::set& changed, const MDSMa if (changed.count("mds_log_events_per_segment")) { events_per_segment = g_conf().get_val("mds_log_events_per_segment"); } - if (changed.count("mds_log_major_segment_event_ratio")) { - major_segment_event_ratio = g_conf().get_val("mds_log_major_segment_event_ratio"); - } if (changed.count("mds_log_max_events")) { max_events = g_conf().get_val("mds_log_max_events"); } @@ -1650,4 +1649,7 @@ void MDLog::handle_conf_change(const std::set& changed, const MDSMa if (changed.count("mds_log_warn_factor")) { log_warn_factor = g_conf().get_val("mds_log_warn_factor"); } + if (changed.count("mds_log_minor_segments_per_major_segment")) { + minor_segments_per_major_segment = g_conf().get_val("mds_log_minor_segments_per_major_segment"); + } } diff --git a/src/mds/MDLog.h b/src/mds/MDLog.h index 49e3c025dde..a858b40fa03 100644 --- a/src/mds/MDLog.h +++ b/src/mds/MDLog.h @@ -305,9 +305,9 @@ private: bool debug_subtrees; std::atomic_uint64_t event_large_threshold; // accessed by submit thread uint64_t events_per_segment; - uint64_t 
major_segment_event_ratio; int64_t max_events; uint64_t max_segments; + uint64_t minor_segments_per_major_segment; bool pause; bool skip_corrupt_events; bool skip_unbounded_events; @@ -315,7 +315,7 @@ private: std::set major_segments; std::set expired_segments; std::set expiring_segments; - uint64_t events_since_last_major_segment = 0; + uint64_t minor_segments_since_last_major_segment = 0; double log_warn_factor; // log trimming decay counter -- 2.39.5