From: Sage Weil <sage@redhat.com>
Date: Fri, 30 Nov 2018 15:40:49 +0000 (-0600)
Subject: msg/async: do not trigger RESETSESSION from connect fault during connection phase
X-Git-Tag: v14.1.0~734^2
X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=refs%2Fpull%2F25343%2Fhead;p=ceph.git

msg/async: do not trigger RESETSESSION from connect fault during connection phase

Previously, if we got a connection fault during the connect/connect_reply
phase, we would increment connect_seq on the client side and trigger a
RESETSESSION on the server side (because there was not yet an existing
connection to replace).  This led to dropped messages, usually in the
form of stuck peering in the rados/thrash suite.

The problem is that the condition for 'reconnect' vs 'backoff' inherited
the test from SimpleMessenger, which had only a STATE_CONNECTING.  In
contract, AsyncMessenger also as CONNECTING_WAIT_BANNER_AND_IDENTIFY and
CONNECTING_SEND_CONNECT_MSG, and if we were in these states, we would
increment connect_seq instead of backing off and retrying (without an
increment).

Fix by adjusting the condition to match the range of CONNECTING states
in asyncmessenger.

Fixes: http://tracker.ceph.com/issues/36612
Signed-off-by: Sage Weil <sage@redhat.com>
---

diff --git a/src/msg/async/Protocol.cc b/src/msg/async/Protocol.cc
index 0fb8bb389ce..5e202412502 100644
--- a/src/msg/async/Protocol.cc
+++ b/src/msg/async/Protocol.cc
@@ -182,20 +182,9 @@ void ProtocolV1::fault() {
 
   connection->write_lock.unlock();
 
-  if (state != START_CONNECT && state != CONNECTING && state != WAIT) {
-    // policy maybe empty when state is in accept
-    if (connection->policy.server) {
-      ldout(cct, 0) << __func__ << " server, going to standby" << dendl;
-      state = STANDBY;
-    } else {
-      ldout(cct, 0) << __func__ << " initiating reconnect" << dendl;
-      connect_seq++;
-      state = START_CONNECT;
-      connection->state = AsyncConnection::STATE_CONNECTING;
-    }
-    backoff = utime_t();
-    connection->center->dispatch_event_external(connection->read_handler);
-  } else {
+  if ((state >= START_CONNECT && state <= CONNECTING_SEND_CONNECT_MSG) ||
+      state == WAIT) {
+    // backoff!
     if (state == WAIT) {
       backoff.set_from_double(cct->_conf->ms_max_backoff);
     } else if (backoff == utime_t()) {
@@ -213,6 +202,19 @@ void ProtocolV1::fault() {
     connection->register_time_events.insert(
         connection->center->create_time_event(backoff.to_nsec() / 1000,
                                               connection->wakeup_handler));
+  } else {
+    // policy maybe empty when state is in accept
+    if (connection->policy.server) {
+      ldout(cct, 0) << __func__ << " server, going to standby" << dendl;
+      state = STANDBY;
+    } else {
+      ldout(cct, 0) << __func__ << " initiating reconnect" << dendl;
+      connect_seq++;
+      state = START_CONNECT;
+      connection->state = AsyncConnection::STATE_CONNECTING;
+    }
+    backoff = utime_t();
+    connection->center->dispatch_event_external(connection->read_handler);
   }
 }