From b6f0324b157da99846a5cfd0ca11fabe2b51f99d Mon Sep 17 00:00:00 2001 From: Nitzan Mordechai Date: Thu, 27 Jan 2022 15:13:28 +0200 Subject: [PATCH] osd/OSD: osd_fast_shutdown_notify_mon not quite right When osd_fast_shutdown and osd_fast_shutdown_notify_mon are both set to true, the OSD is marked as Down, but it should be marked as Dead. Fixes: https://tracker.ceph.com/issues/53327 Signed-off-by: Nitzan Mordechai nd nd (cherry picked from commit 07302d5e41c49c885c9398c1c478638023e3f264) --- src/messages/MOSDMarkMeDown.h | 13 ++++++++++++- src/mon/OSDMonitor.cc | 8 +++++++- src/osd/OSD.cc | 6 ++++-- 3 files changed, 23 insertions(+), 4 deletions(-) diff --git a/src/messages/MOSDMarkMeDown.h b/src/messages/MOSDMarkMeDown.h index 2ff0e1cd809ea..44d57395fc9e2 100644 --- a/src/messages/MOSDMarkMeDown.h +++ b/src/messages/MOSDMarkMeDown.h @@ -19,7 +19,7 @@ class MOSDMarkMeDown final : public PaxosServiceMessage { private: - static constexpr int HEAD_VERSION = 3; + static constexpr int HEAD_VERSION = 4; static constexpr int COMPAT_VERSION = 3; public: @@ -28,6 +28,7 @@ private: entity_addrvec_t target_addrs; epoch_t epoch = 0; bool request_ack = false; // ack requested + bool down_and_dead = false; // mark down and dead MOSDMarkMeDown() : PaxosServiceMessage{MSG_OSD_MARK_ME_DOWN, 0, @@ -38,6 +39,12 @@ private: HEAD_VERSION, COMPAT_VERSION}, fsid(fs), target_osd(osd), target_addrs(av), epoch(e), request_ack(request_ack) {} + MOSDMarkMeDown(const uuid_d &fs, int osd, const entity_addrvec_t& av, + epoch_t e, bool request_ack, bool down_and_dead) + : PaxosServiceMessage{MSG_OSD_MARK_ME_DOWN, e, + HEAD_VERSION, COMPAT_VERSION}, + fsid(fs), target_osd(osd), target_addrs(av), + epoch(e), request_ack(request_ack), down_and_dead(down_and_dead) {} private: ~MOSDMarkMeDown() final {} @@ -63,6 +70,8 @@ public: decode(target_addrs, p); decode(epoch, p); decode(request_ack, p); + if(header.version >= 4) + decode(down_and_dead, p); } void encode_payload(uint64_t features) override { @@ -86,12 +95,14 @@ 
public: encode(target_addrs, payload, features); encode(epoch, payload); encode(request_ack, payload); + encode(down_and_dead, payload); } std::string_view get_type_name() const override { return "MOSDMarkMeDown"; } void print(std::ostream& out) const override { out << "MOSDMarkMeDown(" << "request_ack=" << request_ack + << ", down_and_dead=" << down_and_dead << ", osd." << target_osd << ", " << target_addrs << ", fsid=" << fsid diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index 332673821c7ee..eba9fdc25258e 100644 --- a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -3038,8 +3038,14 @@ bool OSDMonitor::prepare_mark_me_down(MonOpRequestRef op) ceph_assert(osdmap.is_up(target_osd)); ceph_assert(osdmap.get_addrs(target_osd) == m->target_addrs); - mon.clog->info() << "osd." << target_osd << " marked itself down"; + mon.clog->info() << "osd." << target_osd << " marked itself " << ((m->down_and_dead) ? "down and dead" : "down"); pending_inc.new_state[target_osd] = CEPH_OSD_UP; + if (m->down_and_dead) { + if (!pending_inc.new_xinfo.count(target_osd)) { + pending_inc.new_xinfo[target_osd] = osdmap.osd_xinfo[target_osd]; + } + pending_inc.new_xinfo[target_osd].dead_epoch = m->get_epoch(); + } if (m->request_ack) wait_for_finished_proposal(op, new C_AckMarkedDown(this, op)); return true; diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index ac9bd245f2d17..ea4c8db73d130 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -1378,7 +1378,7 @@ bool OSDService::prepare_to_stop() OSDMapRef osdmap = get_osdmap(); if (osdmap && osdmap->is_up(whoami)) { - dout(0) << __func__ << " telling mon we are shutting down" << dendl; + dout(0) << __func__ << " telling mon we are shutting down and dead " << dendl; set_state(PREPARING_TO_STOP); monc->send_mon_message( new MOSDMarkMeDown( @@ -1386,12 +1386,14 @@ bool OSDService::prepare_to_stop() whoami, osdmap->get_addrs(whoami), osdmap->get_epoch(), - true // request ack + true, // request ack + true // mark as down and dead )); 
const auto timeout = ceph::make_timespan(cct->_conf->osd_mon_shutdown_timeout); is_stopping_cond.wait_for(l, timeout, [this] { return get_state() == STOPPING; }); } + dout(0) << __func__ << " starting shutdown" << dendl; set_state(STOPPING); return true; -- 2.39.5