From b800149243b593ff7946d9a5df23f5a49247c0fd Mon Sep 17 00:00:00 2001 From: Nitzan Mordechai Date: Tue, 10 Dec 2024 09:04:34 +0000 Subject: [PATCH] msg/async: race condition between reset_recv_state and shutdown_connections When shutting down monitors while valgrind is involved, we can sometimes hit a race condition and locks that cause the shutdown process to hang for a long time. reset_recv_state was issuing a log message without holding the proper locks, which caused the shutdown to hang during connection shutdown (network drain). Fixes: https://tracker.ceph.com/issues/63501 Signed-off-by: Nitzan Mordechai --- src/msg/async/ProtocolV1.cc | 4 ++-- src/msg/async/ProtocolV2.cc | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/msg/async/ProtocolV1.cc b/src/msg/async/ProtocolV1.cc index a53f6389c31..17d3dcfbe2c 100644 --- a/src/msg/async/ProtocolV1.cc +++ b/src/msg/async/ProtocolV1.cc @@ -1281,11 +1281,11 @@ void ProtocolV1::reset_recv_state() // `write_message()`. `submit_to()` here is NOT blocking. if (!connection->center->in_thread()) { connection->center->submit_to(connection->center->get_id(), [this] { - ldout(cct, 5) << "reset_recv_state (warped) reseting security handlers" - << dendl; // Possibly unnecessary. See the comment in `deactivate_existing`. std::lock_guard l(connection->lock); std::lock_guard wl(connection->write_lock); + ldout(cct, 5) << "reset_recv_state (warped) reseting security handlers" + << dendl; reset_security(); }, /* always_async = */true); } else { diff --git a/src/msg/async/ProtocolV2.cc b/src/msg/async/ProtocolV2.cc index ed6f93cdd48..fd1dfb5470e 100644 --- a/src/msg/async/ProtocolV2.cc +++ b/src/msg/async/ProtocolV2.cc @@ -251,11 +251,11 @@ void ProtocolV2::reset_recv_state() { // `write_event()` unlocks it just before calling `write_message()`. // `submit_to()` here is NOT blocking. 
connection->center->submit_to(connection->center->get_id(), [this] { - ldout(cct, 5) << "reset_recv_state (warped) reseting crypto and compression handlers" - << dendl; // Possibly unnecessary. See the comment in `deactivate_existing`. std::lock_guard l(connection->lock); std::lock_guard wl(connection->write_lock); + ldout(cct, 5) << "reset_recv_state (warped) reseting crypto and compression handlers" + << dendl; reset_security(); reset_compression(); }, /* always_async = */true); -- 2.39.5