From 93469d4895c7d3257d68ab2815104b95b8567919 Mon Sep 17 00:00:00 2001 From: Patrick Donnelly Date: Tue, 31 Mar 2026 18:40:08 +0530 Subject: [PATCH] mon/MonClient: check stopping for auth request handling When the MonClient is shutting down, it is no longer safe to access MonClient::auth and other members. The AuthClient methods should be checking the stopping flag in this case. The key bit from the segfault backtrace (thanks Brad Hubbard!) is here: #13 0x00007f921ee23c40 in ProtocolV2::handle_auth_done (this=0x7f91cc0945f0, payload=...) at /usr/include/c++/12/bits/shared_ptr_base.h:1665 #14 0x00007f921ee16a29 in ProtocolV2::run_continuation (this=0x7f91cc0945f0, continuation=...) at msg/./src/msg/async/ProtocolV2.cc:54 #15 0x00007f921edee56e in std::function::operator()(char*, long) const (__args#1=0, __args#0=, this=0x7f91cc0744d8) at /usr/include/c++/12/bits/std_function.h:591 #16 AsyncConnection::process (this=0x7f91cc074140) at msg/./src/msg/async/AsyncConnection.cc:485 #17 0x00007f921ee3300c in EventCenter::process_events (this=0x55efc9d0a058, timeout_microseconds=, working_dur=0x7f921a891d88) at msg/./src/msg/async/Event.cc:465 #18 0x00007f921ee38bf9 in operator() (__closure=) at msg/./src/msg/async/Stack.cc:50 #19 std::__invoke_impl&> (__f=...) at /usr/include/c++/12/bits/invoke.h:61 #20 std::__invoke_r&> (__fn=...) at /usr/include/c++/12/bits/invoke.h:111 #21 std::_Function_handler >::_M_invoke(const std::_Any_data &) (__functor=...) at /usr/include/c++/12/bits/std_function.h:290 #22 0x00007f921e81f253 in std::execute_native_thread_routine (__p=0x55efc9e9c5f0) at ../../../../../src/libstdc++-v3/src/c++11/thread.cc:82 #23 0x00007f921f5e8ac3 in start_thread (arg=) at ./nptl/pthread_create.c:442 #24 0x00007f921f67a8d0 in clone3 () at ../sysdeps/unix/sysv/linux/x86_64/clone3.S:81 I originally thought this may be the issue causing [1] however that turned out to be an issue caused by OpenSSL's use of atexit handlers. I still think there is a bug here so I am continuing with this change. [1] https://tracker.ceph.com/issues/59335 Fixes: https://tracker.ceph.com/issues/76017 Signed-off-by: Patrick Donnelly --- src/mon/MonClient.cc | 44 ++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 40 insertions(+), 4 deletions(-) diff --git a/src/mon/MonClient.cc b/src/mon/MonClient.cc index 2d486b0d11a3..b035ae3ca28c 100644 --- a/src/mon/MonClient.cc +++ b/src/mon/MonClient.cc @@ -318,6 +318,12 @@ bool MonClient::ms_dispatch(Message *m) std::lock_guard lock(monc_lock); + if (stopping) { + ldout(cct, 5) << " dropping because stopping: " << *m << dendl; + m->put(); + return true; + } + if (!m->get_connection()->is_anon() && m->get_source().type() == CEPH_ENTITY_TYPE_MON) { if (_hunting()) { @@ -488,11 +494,13 @@ int MonClient::init() { ldout(cct, 10) << __func__ << dendl; + std::lock_guard l(monc_lock); + stopping = false; + entity_name = cct->_conf->name; auth_registry.refresh_config(); - std::lock_guard l(monc_lock); keyring.reset(new KeyRing); if (auth_registry.is_supported_method(messenger->get_mytype(), CEPH_AUTH_CEPHX)) { @@ -566,7 +574,6 @@ void MonClient::shutdown() } monc_lock.lock(); timer.shutdown(); - stopping = false; monc_lock.unlock(); } @@ -1445,6 +1452,11 @@ int MonClient::get_auth_request( ldout(cct,10) << __func__ << " con " << con << " auth_method " << *auth_method << dendl; + if (stopping) { + ldout(cct, 5) << __func__ << " dropping because stopping" << dendl; + return -ECANCELED; + } + // connection to mon? if (con->get_peer_type() == CEPH_ENTITY_TYPE_MON) { ceph_assert(!auth_meta->authorizer); @@ -1494,6 +1506,11 @@ int MonClient::handle_auth_reply_more( { std::lock_guard l(monc_lock); + if (stopping) { + ldout(cct, 5) << __func__ << " dropping because stopping" << dendl; + return -ECANCELED; + } + if (con->get_peer_type() == CEPH_ENTITY_TYPE_MON) { if (con->is_anon()) { for (auto& i : mon_commands) { @@ -1530,8 +1547,14 @@ int MonClient::handle_auth_done( CryptoKey *session_key, std::string *connection_secret) { + std::lock_guard l(monc_lock); + + if (stopping) { + ldout(cct, 5) << __func__ << " dropping because stopping" << dendl; + return -ECANCELED; + } + if (con->get_peer_type() == CEPH_ENTITY_TYPE_MON) { - std::lock_guard l(monc_lock); if (con->is_anon()) { for (auto& i : mon_commands) { if (i.second->target_con == con) { @@ -1586,9 +1609,15 @@ int MonClient::handle_auth_bad_method( const std::vector& allowed_methods, const std::vector& allowed_modes) { + std::lock_guard l(monc_lock); + + if (stopping) { + ldout(cct, 5) << __func__ << " dropping because stopping" << dendl; + return -ECANCELED; + } + auth_meta->allowed_methods = allowed_methods; - std::lock_guard l(monc_lock); if (con->get_peer_type() == CEPH_ENTITY_TYPE_MON) { if (con->is_anon()) { for (auto& i : mon_commands) { @@ -1644,6 +1673,13 @@ int MonClient::handle_auth_request( const ceph::buffer::list& payload, ceph::buffer::list *reply) { + std::lock_guard l(monc_lock); + + if (stopping) { + ldout(cct, 5) << __func__ << " dropping because stopping" << dendl; + return -ECANCELED; + } + if (payload.length() == 0) { // for some channels prior to nautilus (osd heartbeat), we // tolerate the lack of an authorizer. -- 2.47.3