From d8e8ac6bcc1f62749c4c2ff4d2f1de8f9c993337 Mon Sep 17 00:00:00 2001 From: Kamoltat Sirivadhna Date: Mon, 23 Jun 2025 19:55:23 +0000 Subject: [PATCH] msg/async/ProtocolV2: Server drops existing connection when client restarts When a client is restarted, it loses its state, including global_seq, and gets a new client_cookie. This creates an issue during reconnection because the server has an existing connection with a higher global_seq value, causing it to reject the new connection as "stale" with the error: "this is a stale connection, peer_global_seq=" This commit adds detection logic in ProtocolV2::handle_existing_connection() that identifies client restarts by checking for: 1. peer_global_seq < exproto->peer_global_seq The reason is that the global sequence should only increase during a session, so a decrease strongly indicates a restart. 2. client_cookie has changed (the client generated a new cookie) When these conditions are met, the server now drops the existing connection and accepts the new one (by sending the server ident to the client, which the client accepts, after which both sides are ready to exchange messages). This makes events such as a Monitor restarting and rejoining the quorum faster, and prevents the MON_NETSPLIT warning from popping up. This allows clients to successfully reconnect after a restart without having to wait either for the server-side callback handler to trigger (the server will also try to connect, and will succeed because it will use the reconnect path instead, since it contains the client's cookie) or for the client's global_seq to catch up to the server's.
Fixes: https://tracker.ceph.com/issues/71344 Signed-off-by: Kamoltat Sirivadhna --- src/msg/async/ProtocolV2.cc | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/msg/async/ProtocolV2.cc b/src/msg/async/ProtocolV2.cc index 58e4f4df21df5..377cb73d7846d 100644 --- a/src/msg/async/ProtocolV2.cc +++ b/src/msg/async/ProtocolV2.cc @@ -2671,6 +2671,17 @@ CtPtr ProtocolV2::handle_existing_connection(const AsyncConnectionRef& existing) return WRITE(wait, "wait", read_frame); } + if (peer_global_seq < exproto->peer_global_seq && + exproto->client_cookie && client_cookie && + exproto->client_cookie != client_cookie) { + ldout(cct, 1) << __func__ << " client has clearly restarted (peer_global_seq < ex_peer_global_seq && cookie changed), " + << "dropping existing connection=" << existing << " in favor of new one" << dendl; + existing->protocol->stop(); + existing->dispatch_queue->queue_reset(existing.get()); + l.unlock(); + return send_server_ident(); + } + if (exproto->peer_global_seq > peer_global_seq) { ldout(cct, 1) << __func__ << " this is a stale connection, peer_global_seq=" << peer_global_seq -- 2.39.5