From: Patrick Donnelly Date: Tue, 7 Mar 2023 18:24:38 +0000 (-0500) Subject: mds: introduce ELid event to create/close log X-Git-Tag: v19.0.0~760^2~13 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=5a5374765441ee2db66a2ec5dcfea00ff57f9eab;p=ceph.git mds: introduce ELid event to create/close log Prior to this set of commits, the MDS would write the ESubtreeMap to the journal, trim everything up to that segment, then finally force the trimming of that last segment (`MDLog::trim(0)`). This is awkward in the new code which preserves a major segment boundary at the beginning of the journal during trimming. Instead of writing a special case for this situation, it seems more natural to just use a new "lid" or "cap" event to mark the beginning of the journal when no subtree map can yet be written but we need sequence numbers to tie in other MDS tables. Like ESegment, ELid doesn't actually contain any state. It's just a marker for the beginning of the log after rank deactivation or rank creation. It can appear in the middle of the log if the shutdown sequence is interrupted while writing the event but the MDS will skip it during replay in that case. Signed-off-by: Patrick Donnelly --- diff --git a/src/common/options/mds.yaml.in b/src/common/options/mds.yaml.in index 87f216433b58..af7216f2581b 100644 --- a/src/common/options/mds.yaml.in +++ b/src/common/options/mds.yaml.in @@ -512,7 +512,19 @@ options: - mds fmt_desc: Determines whether the MDS should try to skip corrupt journal events during journal replay. - with_legacy: true + flags: + - runtime +- name: mds_log_skip_unbounded_events + type: bool + level: dev + default: false + services: + - mds + fmt_desc: Determines whether the MDS should try to skip journal + events during journal replay that wrongly exist before + a major segment boundary. 
+ flags: + - runtime - name: mds_log_max_events type: int level: advanced diff --git a/src/mds/LogEvent.cc b/src/mds/LogEvent.cc index 8f65024b2d77..dcf2f9c7cffb 100644 --- a/src/mds/LogEvent.cc +++ b/src/mds/LogEvent.cc @@ -40,6 +40,7 @@ #include "events/ENoOp.h" #include "events/ESegment.h" +#include "events/ELid.h" #define dout_context g_ceph_context @@ -92,6 +93,7 @@ std::string_view LogEvent::get_type_str() const case EVENT_TABLESERVER: return "TABLESERVER"; case EVENT_NOOP: return "NOOP"; case EVENT_SEGMENT: return "SEGMENT"; + case EVENT_LID: return "LID"; default: generic_dout(0) << "get_type_str: unknown type " << _type << dendl; @@ -118,7 +120,8 @@ const std::map LogEvent::types = { {"TABLECLIENT", EVENT_TABLECLIENT}, {"TABLESERVER", EVENT_TABLESERVER}, {"NOOP", EVENT_NOOP}, - {"SEGMENT", EVENT_SEGMENT} + {"SEGMENT", EVENT_SEGMENT}, + {"LID", EVENT_LID} }; /* @@ -202,6 +205,9 @@ std::unique_ptr LogEvent::decode_event(bufferlist::const_iterator& p, case EVENT_SEGMENT: le = std::make_unique(); break; + case EVENT_LID: + le = std::make_unique(); + break; default: generic_dout(0) << "uh oh, unknown log event type " << type << " length " << length << dendl; return nullptr; diff --git a/src/mds/LogEvent.h b/src/mds/LogEvent.h index 88c70b47660d..49955290fd69 100644 --- a/src/mds/LogEvent.h +++ b/src/mds/LogEvent.h @@ -43,6 +43,7 @@ #define EVENT_NOOP 51 #define EVENT_SEGMENT 100 +#define EVENT_LID 101 #include "include/buffer_fwd.h" diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc index 2cc3b177e344..014e9bb09c2b 100644 --- a/src/mds/MDCache.cc +++ b/src/mds/MDCache.cc @@ -54,6 +54,7 @@ #include "osdc/Filer.h" #include "events/ESubtreeMap.h" +#include "events/ELid.h" #include "events/EUpdate.h" #include "events/EPeerUpdate.h" #include "events/EImportFinish.h" @@ -7958,7 +7959,7 @@ bool MDCache::shutdown_pass() // Fully trim the log so that all objects in cache are clean and may be // trimmed by a future MDCache::trim. 
Note that MDSRank::tick does not // trim the log such that the cache eventually becomes clean. - if (mds->mdlog->get_num_segments() > 0) { + if (mds->mdlog->get_num_segments() > 0 && !mds->mdlog->is_capped()) { auto ls = mds->mdlog->get_current_segment(); if (ls->num_events > 1 || !ls->dirty_dirfrags.empty()) { // Current segment contains events other than subtreemap or @@ -8019,26 +8020,10 @@ bool MDCache::shutdown_pass() // (only do this once!) if (!mds->mdlog->is_capped()) { dout(7) << "capping the mdlog" << dendl; + mds->mdlog->submit_entry(new ELid()); + mds->mdlog->flush(); mds->mdlog->cap(); - } - ceph_assert(kill_shutdown_at != KILL_SHUTDOWN_AT::SHUTDOWN_LOGCAP); - - if (!mds->mdlog->empty()) - mds->mdlog->trim(0); - - if (!mds->mdlog->empty()) { - dout(7) << "waiting for log to flush.. " << mds->mdlog->get_num_events() - << " in " << mds->mdlog->get_num_segments() << " segments" << dendl; - return false; - } - - if (!did_shutdown_log_cap) { - // flush journal header - dout(7) << "writing header for (now-empty) journal" << dendl; - ceph_assert(mds->mdlog->empty()); - mds->mdlog->write_head(0); - // NOTE: filer active checker below will block us until this completes. 
- did_shutdown_log_cap = true; + ceph_assert(kill_shutdown_at != KILL_SHUTDOWN_AT::SHUTDOWN_LOGCAP); return false; } diff --git a/src/mds/MDLog.cc b/src/mds/MDLog.cc index fb5e25f678c5..9e724fa96104 100644 --- a/src/mds/MDLog.cc +++ b/src/mds/MDLog.cc @@ -27,6 +27,7 @@ #include "events/ESubtreeMap.h" #include "events/ESegment.h" +#include "events/ELid.h" #include "common/config.h" #include "common/errno.h" @@ -53,6 +54,8 @@ MDLog::MDLog(MDSRank* m) major_segment_event_ratio = g_conf().get_val("mds_log_major_segment_event_ratio"); max_segments = g_conf().get_val("mds_log_max_segments"); max_events = g_conf().get_val("mds_log_max_events"); + skip_corrupt_events = g_conf().get_val("mds_log_skip_corrupt_events"); + skip_unbounded_events = g_conf().get_val("mds_log_skip_unbounded_events"); } MDLog::~MDLog() @@ -1400,22 +1403,23 @@ void MDLog::_replay_thread() mds->clog->error() << "corrupt journal event at " << pos << "~" << bl.length() << " / " << journaler->get_write_pos(); - if (g_conf()->mds_log_skip_corrupt_events) { + if (skip_corrupt_events) { continue; } else { mds->damaged_unlocked(); ceph_abort(); // Should be unreachable because damaged() calls // respawn() } - } - le->set_start_off(pos); - - // have we seen an import map yet? - if (segments.empty() && !dynamic_cast(le.get())) { - dout(1) << "_replay " << pos << "~" << bl.length() << " / " << journaler->get_write_pos() - << " " << le->get_stamp() << " -- waiting for ESubtreeMap. (skipping " << *le << ")" << dendl; + } else if (!segments.empty() && dynamic_cast(le.get())) { + /* This can reasonably happen when a up:stopping MDS restarts after + * writing ELid. We will merge with the previous segment. + * We are enforcing the constraint that ESubtreeMap should begin + * the journal. 
+ */ + dout(20) << "found ELid not at the start of the journal" << dendl; continue; } + le->set_start_off(pos); events_since_last_major_segment++; if (auto sb = dynamic_cast(le.get()); sb) { @@ -1436,6 +1440,22 @@ void MDLog::_replay_thread() event_seq++; } + if (major_segments.empty()) { + dout(0) << __func__ << " " << pos << "~" << bl.length() << " / " + << journaler->get_write_pos() << " " << le->get_stamp() + << " -- waiting for major segment." + << dendl; + dout(0) << " Log event is " << *le << dendl; + if (skip_unbounded_events) { + dout(5) << __func__ << " skipping!" << dendl; + continue; + } else { + mds->damaged_unlocked(); + ceph_abort(); // Should be unreachable because damaged() calls + // respawn() + } + } + dout(10) << "_replay " << pos << "~" << bl.length() << " / " << journaler->get_write_pos() << " " << le->get_stamp() << ": " << *le << dendl; le->_segment = get_current_segment(); // replay may need this @@ -1558,4 +1578,10 @@ void MDLog::handle_conf_change(const std::set& changed, const MDSMa kick_submitter(); } } + if (changed.count("mds_log_skip_corrupt_events")) { + skip_corrupt_events = g_conf().get_val("mds_log_skip_corrupt_events"); + } + if (changed.count("mds_log_skip_unbounded_events")) { + skip_unbounded_events = g_conf().get_val("mds_log_skip_unbounded_events"); + } } diff --git a/src/mds/MDLog.h b/src/mds/MDLog.h index 43817de5268c..5f8b78620ef1 100644 --- a/src/mds/MDLog.h +++ b/src/mds/MDLog.h @@ -294,6 +294,8 @@ private: int64_t max_events; uint64_t max_segments; bool pause; + bool skip_corrupt_events; + bool skip_unbounded_events; std::set major_segments; std::set expired_segments; diff --git a/src/mds/MDSRank.cc b/src/mds/MDSRank.cc index 6e6e518870d2..6b437ae04436 100644 --- a/src/mds/MDSRank.cc +++ b/src/mds/MDSRank.cc @@ -39,7 +39,7 @@ #include "common/HeartbeatMap.h" #include "ScrubStack.h" #include "events/ESubtreeMap.h" -#include "events/ESegment.h" +#include "events/ELid.h" #include "MDSRank.h" @@ -2164,8 +2164,7 @@ void 
MDSRank::boot_create() mdlog->create(fin.new_sub()); // open new journal segment, but do not journal subtree map (yet) - // N.B. this singular event will be skipped during replay - auto le = new ESegment(); + auto le = new ELid(); mdlog->submit_entry(le); if (whoami == mdsmap->get_root()) { @@ -3827,6 +3826,8 @@ const char** MDSRankDispatcher::get_tracked_conf_keys() const "mds_log_max_events", "mds_log_max_segments", "mds_log_pause", + "mds_log_skip_corrupt_events", + "mds_log_skip_unbounded_events", "mds_max_caps_per_client", "mds_max_export_size", "mds_max_purge_files", diff --git a/src/mds/events/ELid.h b/src/mds/events/ELid.h new file mode 100644 index 000000000000..1ac4efb85666 --- /dev/null +++ b/src/mds/events/ELid.h @@ -0,0 +1,44 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2022 Red Hat, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#ifndef CEPH_MDS_ELID_H +#define CEPH_MDS_ELID_H + +#include + +#include "../LogEvent.h" +#include "../SegmentBoundary.h" + +class ELid : public LogEvent, public SegmentBoundary { +public: + ELid() : LogEvent(EVENT_LID) {} + ELid(LogSegment::seq_t _seq) : LogEvent(EVENT_LID), SegmentBoundary(_seq) {} // tag as EVENT_LID: LogEvent::decode_event() dispatches on this type + + bool is_major_segment_boundary() const override { + return true; + } + + void print(std::ostream& out) const override { + out << "ELid(" << seq << ")"; + } + + void encode(bufferlist& bl, uint64_t features) const override; + void decode(bufferlist::const_iterator& bl) override; + void dump(Formatter *f) const override; + void replay(MDSRank *mds) override; + static void generate_test_instances(std::list<ELid*>& ls); +}; +WRITE_CLASS_ENCODER_FEATURES(ELid) + +#endif diff --git a/src/mds/journal.cc b/src/mds/journal.cc index 16155f813fc5..f9eb8a1eecf2 100644 --- a/src/mds/journal.cc +++ b/src/mds/journal.cc @@ -36,6 +36,7 @@ #include "events/ETableClient.h" #include "events/ETableServer.h" #include "events/ESegment.h" +#include "events/ELid.h" #include "include/stringify.h" @@ -3296,6 +3297,34 @@ void ESegment::generate_test_instances(std::list<ESegment*>& ls) ls.push_back(new ESegment); } +void ELid::encode(bufferlist &bl, uint64_t features) const +{ + ENCODE_START(1, 1, bl); + encode(seq, bl); + ENCODE_FINISH(bl); +} + +void ELid::decode(bufferlist::const_iterator &bl) +{ + DECODE_START(1, bl); + decode(seq, bl); + DECODE_FINISH(bl); +} + +void ELid::replay(MDSRank *mds) +{ + dout(4) << "ELid::replay, seq " << seq << dendl; +} + +void ELid::dump(Formatter *f) const +{ + f->dump_int("seq", seq); +} + +void ELid::generate_test_instances(std::list<ELid*>& ls) +{ + ls.push_back(new ELid); +} void ENoOp::encode(bufferlist &bl, uint64_t features) const {