From 26f205dbead82f4f9fb7ae9cf5ba1681cb85797b Mon Sep 17 00:00:00 2001 From: Radoslaw Zarzynski Date: Mon, 17 May 2021 13:50:55 +0000 Subject: [PATCH] crimson/monc: fix send_message() racing with reopen_session(). The `send_message()` method is a high-level facility for communicating with a monitor. If there is an active conn available, it sends the message immediately; otherwise the message is queued. This method assumes the queue is already drained if the connection is available. `active_con` is managed by `reopen_session()` where it's first cleared and then reset after finding new alive mon. This is followed by draining the `pending_messages` queue which happens in `on_session_opened()` after the `MAuth` exchange is finished. Unfortunately, the path from the `active_con` clearing to draining the queue is long and divided into multiple continuations which results in lack of atomicity. When e.g. `run_command()` interleaves the stages, following crash happens: ``` INFO 2021-05-07 08:13:43,914 [shard 0] monc - do_auth_single: mon v2:172.21.15.82:6805/34166 => v2:172.21.15.82:3300/0 returns auth_reply(proto 2 0 (0) Success) v1: 0 ceph-osd: /home/jenkins-build/build/workspace/ceph-dev-new-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos8/DIST/centos8/MACHINE_SIZE/gigantic/release/17.0.0-3910-g1b18e076/rpm/el8/BUILD/ceph-17.0.0-3910-g1b18e076/src/crimson/mon/MonClient.cc:1034: seastar::future<> crimson::mon::Client::send_message(MessageRef): Assertion `pending_messages.empty()' failed. Aborting on shard 0. Backtrace: 0# 0x000055CDE6DB532F in ceph-osd 1# FatalSignal::signaled(int, siginfo_t const*) in ceph-osd 2# FatalSignal::install_oneshot_signal_handler<6>()::{lambda(int, siginfo_t*, void*)#1}::_FUN(int, siginfo_t*, void*) in ceph-osd 3# 0x00007FC1BF20BB20 in /lib64/libpthread.so.0 4# gsignal in /lib64/libc.so.6 5# abort in /lib64/libc.so.6 6# 0x00007FC1BD806B09 in /lib64/libc.so.6 7# 0x00007FC1BD814DE6 in /lib64/libc.so.6 8# crimson::mon::Client::send_message(boost::intrusive_ptr) in ceph-osd 9# crimson::mon::Client::renew_subs() in ceph-osd 10# 0x000055CDE764FB0B in ceph-osd 11# 0x000055CDE10457F0 in ceph-osd 12# 0x000055CDEA0AB88F in ceph-osd 13# 0x000055CDEA0B0DD0 in ceph-osd 14# 0x000055CDEA2689FB in ceph-osd 15# 0x000055CDE9DC0EDA in ceph-osd 16# main in ceph-osd 17# __libc_start_main in /lib64/libc.so.6 18# _start in ceph-osd ``` The problem caused following failure at Sepia: http://pulpito.front.sepia.ceph.com/rzarzynski-2021-05-07_07:41:02-rados-master-distro-basic-smithi/6104549 Signed-off-by: Radoslaw Zarzynski --- src/crimson/mon/MonClient.cc | 8 +++++++- src/crimson/mon/MonClient.h | 1 + 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/src/crimson/mon/MonClient.cc b/src/crimson/mon/MonClient.cc index e5a50918c58b0..6234f7555bbaf 100644 --- a/src/crimson/mon/MonClient.cc +++ b/src/crimson/mon/MonClient.cc @@ -901,6 +901,7 @@ seastar::future<> Client::stop() logger().info("{}", __func__); auto fut = gate.close(); timer.cancel(); + ready_to_send = false; for (auto& pending_con : pending_conns) { pending_con->close(); } @@ -929,6 +930,7 @@ static entity_addr_t choose_client_addr( seastar::future Client::reopen_session(int rank) { logger().info("{} to mon.{}", __func__, rank); + ready_to_send = false; if (active_con) { active_con->close(); active_con = nullptr; @@ -1001,6 +1003,9 @@ void Client::_finish_auth(const entity_addr_t& peer) } ceph_assert(!active_con && !pending_conns.empty()); + // It's too early to toggle the `ready_to_send` flag. It will + // be set atfer finishing the MAuth exchange and draining out + // the `pending_messages` queue. active_con = std::move(*found); *found = nullptr; for (auto& conn : pending_conns) { @@ -1028,7 +1033,7 @@ Client::run_command(std::string&& cmd, seastar::future<> Client::send_message(MessageRef m) { - if (active_con) { + if (active_con && ready_to_send) { assert(pending_messages.empty()); return active_con->get_conn()->send(m); } else { @@ -1047,6 +1052,7 @@ seastar::future<> Client::on_session_opened() m.pr.set_value(); } pending_messages.clear(); + ready_to_send = true; return seastar::now(); }).then([this] { return seastar::parallel_for_each(mon_commands, diff --git a/src/crimson/mon/MonClient.h b/src/crimson/mon/MonClient.h index cd15b2cd22d42..a0dfa45481d34 100644 --- a/src/crimson/mon/MonClient.h +++ b/src/crimson/mon/MonClient.h @@ -55,6 +55,7 @@ class Client : public crimson::net::Dispatcher, const uint32_t want_keys; MonMap monmap; + bool ready_to_send = false; seastar::shared_ptr active_con; std::vector> pending_conns; seastar::timer timer; -- 2.39.5