From: Sage Weil Date: Fri, 30 Nov 2018 15:40:49 +0000 (-0600) Subject: msg/async: do not trigger RESETSESSION from connect fault during connection phase X-Git-Tag: v14.1.0~734^2 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=refs%2Fpull%2F25343%2Fhead;p=ceph.git msg/async: do not trigger RESETSESSION from connect fault during connection phase Previously, if we got a connection fault during the connect/connect_reply phase, we would increment connect_seq on the client side and trigger a RESETSESSION on the server side (because there was not yet an existing connection to replace). This led to dropped messages, usually in the form of stuck peering in the rados/thrash suite. The problem is that the condition for 'reconnect' vs 'backoff' inherited the test from SimpleMessenger, which had only a STATE_CONNECTING. In contract, AsyncMessenger also as CONNECTING_WAIT_BANNER_AND_IDENTIFY and CONNECTING_SEND_CONNECT_MSG, and if we were in these states, we would increment connect_seq instead of backing off and retrying (without an increment). Fix by adjusting the condition to match the range of CONNECTING states in asyncmessenger. Fixes: http://tracker.ceph.com/issues/36612 Signed-off-by: Sage Weil --- diff --git a/src/msg/async/Protocol.cc b/src/msg/async/Protocol.cc index 0fb8bb389ce0..5e202412502b 100644 --- a/src/msg/async/Protocol.cc +++ b/src/msg/async/Protocol.cc @@ -182,20 +182,9 @@ void ProtocolV1::fault() { connection->write_lock.unlock(); - if (state != START_CONNECT && state != CONNECTING && state != WAIT) { - // policy maybe empty when state is in accept - if (connection->policy.server) { - ldout(cct, 0) << __func__ << " server, going to standby" << dendl; - state = STANDBY; - } else { - ldout(cct, 0) << __func__ << " initiating reconnect" << dendl; - connect_seq++; - state = START_CONNECT; - connection->state = AsyncConnection::STATE_CONNECTING; - } - backoff = utime_t(); - connection->center->dispatch_event_external(connection->read_handler); - } else { + if ((state >= START_CONNECT && state <= CONNECTING_SEND_CONNECT_MSG) || + state == WAIT) { + // backoff! if (state == WAIT) { backoff.set_from_double(cct->_conf->ms_max_backoff); } else if (backoff == utime_t()) { @@ -213,6 +202,19 @@ void ProtocolV1::fault() { connection->register_time_events.insert( connection->center->create_time_event(backoff.to_nsec() / 1000, connection->wakeup_handler)); + } else { + // policy maybe empty when state is in accept + if (connection->policy.server) { + ldout(cct, 0) << __func__ << " server, going to standby" << dendl; + state = STANDBY; + } else { + ldout(cct, 0) << __func__ << " initiating reconnect" << dendl; + connect_seq++; + state = START_CONNECT; + connection->state = AsyncConnection::STATE_CONNECTING; + } + backoff = utime_t(); + connection->center->dispatch_event_external(connection->read_handler); } }