From: Nitzan Mordechai Date: Tue, 10 Dec 2024 09:04:34 +0000 (+0000) Subject: msg/async: race condition between reset_recv_state and shutdown_connections X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=8d1a5892a36d9b69d1748ab480a37a671b149cb0;p=ceph-ci.git msg/async: race condition between reset_recv_state and shutdown_connections When shutting down monitors while valgrind is involved, we can sometimes hit a race condition and lock contention that cause the shutdown process to hang for a long time. reset_recv_state was issuing a log message without holding the proper locks, which caused the shutdown to hang during shutdown_connections (draining the network). The log message is now issued only after connection->lock and connection->write_lock are taken. Fixes: https://tracker.ceph.com/issues/63501 Signed-off-by: Nitzan Mordechai (cherry picked from commit b800149243b593ff7946d9a5df23f5a49247c0fd) --- diff --git a/src/msg/async/ProtocolV1.cc b/src/msg/async/ProtocolV1.cc index 041942fd906..a73d173ae60 100644 --- a/src/msg/async/ProtocolV1.cc +++ b/src/msg/async/ProtocolV1.cc @@ -1282,11 +1282,11 @@ void ProtocolV1::reset_recv_state() // `write_message()`. `submit_to()` here is NOT blocking. if (!connection->center->in_thread()) { connection->center->submit_to(connection->center->get_id(), [this] { - ldout(cct, 5) << "reset_recv_state (warped) reseting security handlers" - << dendl; // Possibly unnecessary. See the comment in `deactivate_existing`. std::lock_guard l(connection->lock); std::lock_guard wl(connection->write_lock); + ldout(cct, 5) << "reset_recv_state (warped) reseting security handlers" + << dendl; reset_security(); }, /* always_async = */true); } else { diff --git a/src/msg/async/ProtocolV2.cc b/src/msg/async/ProtocolV2.cc index 7c4a4d0fe94..3631898da8e 100644 --- a/src/msg/async/ProtocolV2.cc +++ b/src/msg/async/ProtocolV2.cc @@ -250,11 +250,11 @@ void ProtocolV2::reset_recv_state() { // `write_event()` unlocks it just before calling `write_message()`. // `submit_to()` here is NOT blocking. 
connection->center->submit_to(connection->center->get_id(), [this] { - ldout(cct, 5) << "reset_recv_state (warped) reseting crypto and compression handlers" - << dendl; // Possibly unnecessary. See the comment in `deactivate_existing`. std::lock_guard l(connection->lock); std::lock_guard wl(connection->write_lock); + ldout(cct, 5) << "reset_recv_state (warped) reseting crypto and compression handlers" + << dendl; reset_security(); reset_compression(); }, /* always_async = */true);