From 8346e397631c6faa8d4305b71b86e3de4f099fc4 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 30 Nov 2018 09:40:49 -0600 Subject: [PATCH] msg/async: do not trigger RESETSESSION from connect fault during connection phase Previously, if we got a connection fault during the connect/connect_reply phase, we would increment connect_seq on the client side and trigger a RESETSESSION on the server side (because there was not yet an existing connection to replace). This led to dropped messages, usually in the form of stuck peering in the rados/thrash suite. The problem is that the condition for 'reconnect' vs 'backoff' inherited the test from SimpleMessenger, which had only a STATE_CONNECTING. In contract, AsyncMessenger also as CONNECTING_WAIT_BANNER_AND_IDENTIFY and CONNECTING_SEND_CONNECT_MSG, and if we were in these states, we would increment connect_seq instead of backing off and retrying (without an increment). Fix by adjusting the condition to match the range of CONNECTING states in asyncmessenger. Fixes: http://tracker.ceph.com/issues/36612 Signed-off-by: Sage Weil --- src/msg/async/Protocol.cc | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/src/msg/async/Protocol.cc b/src/msg/async/Protocol.cc index 0fb8bb389ce0..5e202412502b 100644 --- a/src/msg/async/Protocol.cc +++ b/src/msg/async/Protocol.cc @@ -182,20 +182,9 @@ void ProtocolV1::fault() { connection->write_lock.unlock(); - if (state != START_CONNECT && state != CONNECTING && state != WAIT) { - // policy maybe empty when state is in accept - if (connection->policy.server) { - ldout(cct, 0) << __func__ << " server, going to standby" << dendl; - state = STANDBY; - } else { - ldout(cct, 0) << __func__ << " initiating reconnect" << dendl; - connect_seq++; - state = START_CONNECT; - connection->state = AsyncConnection::STATE_CONNECTING; - } - backoff = utime_t(); - connection->center->dispatch_event_external(connection->read_handler); - } else { + if ((state >= START_CONNECT && state <= CONNECTING_SEND_CONNECT_MSG) || + state == WAIT) { + // backoff! if (state == WAIT) { backoff.set_from_double(cct->_conf->ms_max_backoff); } else if (backoff == utime_t()) { @@ -213,6 +202,19 @@ void ProtocolV1::fault() { connection->register_time_events.insert( connection->center->create_time_event(backoff.to_nsec() / 1000, connection->wakeup_handler)); + } else { + // policy maybe empty when state is in accept + if (connection->policy.server) { + ldout(cct, 0) << __func__ << " server, going to standby" << dendl; + state = STANDBY; + } else { + ldout(cct, 0) << __func__ << " initiating reconnect" << dendl; + connect_seq++; + state = START_CONNECT; + connection->state = AsyncConnection::STATE_CONNECTING; + } + backoff = utime_t(); + connection->center->dispatch_event_external(connection->read_handler); } } -- 2.47.3