From: root <penglaiyxy>
Date: Mon, 30 Jul 2018 01:29:48 +0000 (-0400)
Subject: msg: ceph_abort() when there are enough accepter errors in msg server
X-Git-Tag: v14.0.1~280^2
X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=00e0ab407b2e9659d9121be1217e95c8117c411e;p=ceph-ci.git

msg: ceph_abort() when there are enough accepter errors in msg server
In some extreme cases (we have hit one in our production cluster), when the Accepter thread breaks out of its loop, new clients cannot connect to the OSD. Because the existing heartbeat connections are already established, other OSDs cannot detect the failure and therefore never notify the monitor to mark the failed OSD down.
With this patch, when there are abnormal communication errors, we simply call ceph_abort() so that the OSD goes down quickly and other OSDs can notify the monitor to mark the failed OSD down.
Signed-off-by: penglaiyxy@gmail.com <penglaiyxy@gmail.com>
---

diff --git a/src/common/legacy_config_opts.h b/src/common/legacy_config_opts.h
index 6a38549b33b..40ef425ba65 100644
--- a/src/common/legacy_config_opts.h
+++ b/src/common/legacy_config_opts.h
@@ -175,6 +175,10 @@ OPTION(ms_async_rdma_dscp, OPT_INT)            // in RoCE, this means DSCP
 OPTION(ms_async_rdma_cm, OPT_BOOL)
 OPTION(ms_async_rdma_type, OPT_STR)
 
+// When there are enough accept failures, indicating an unrecoverable failure,
+// just call ceph_abort(). The failure threshold is configurable.
+OPTION(ms_max_accept_failures, OPT_INT)
+
 OPTION(ms_dpdk_port_id, OPT_INT)
 SAFE_OPTION(ms_dpdk_coremask, OPT_STR)        // it is modified in unittest so that use SAFE_OPTION to declare 
 OPTION(ms_dpdk_memory_channel, OPT_STR)
diff --git a/src/common/options.cc b/src/common/options.cc
index 79e88c5f5fa..1d91bd2188e 100644
--- a/src/common/options.cc
+++ b/src/common/options.cc
@@ -1100,6 +1100,11 @@ std::vector<Option> get_global_options() {
     .set_default(96)
     .set_description(""),
 
+    Option("ms_max_accept_failures", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(4)
+    .set_description("The maximum number of consecutive failed accept() calls before "
+                     "considering the daemon is misconfigured and abort it."),
+
     Option("ms_async_rdma_cm", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
     .set_default(false)
     .set_description(""),
diff --git a/src/msg/async/AsyncMessenger.cc b/src/msg/async/AsyncMessenger.cc
index 741e730ec20..6d892ea0d52 100644
--- a/src/msg/async/AsyncMessenger.cc
+++ b/src/msg/async/AsyncMessenger.cc
@@ -171,6 +171,8 @@ void Processor::accept()
   for (auto& listen_socket : listen_sockets) {
     ldout(msgr->cct, 10) << __func__ << " listen_fd=" << listen_socket.fd()
 			 << dendl;
+    unsigned accept_error_num = 0;
+
     while (true) {
       entity_addr_t addr;
       ConnectedSocket cli_socket;
@@ -185,6 +187,7 @@ void Processor::accept()
 			     << cli_socket.fd() << dendl;
 
 	msgr->add_accept(w, std::move(cli_socket), addr);
+	accept_error_num = 0;
 	continue;
       } else {
 	if (r == -EINTR) {
@@ -194,7 +197,11 @@ void Processor::accept()
 	} else if (r == -EMFILE || r == -ENFILE) {
 	  lderr(msgr->cct) << __func__ << " open file descriptions limit reached sd = " << listen_socket.fd()
 			   << " errno " << r << " " << cpp_strerror(r) << dendl;
-	  break;
+	  if (++accept_error_num > msgr->cct->_conf->ms_max_accept_failures) {
+	    lderr(msgr->cct) << "Proccessor accept has encountered enough error numbers, just do ceph_abort()." << dendl;
+	    ceph_abort();
+	  }
+	  continue;
 	} else if (r == -ECONNABORTED) {
 	  ldout(msgr->cct, 0) << __func__ << " it was closed because of rst arrived sd = " << listen_socket.fd()
 			      << " errno " << r << " " << cpp_strerror(r) << dendl;
@@ -202,7 +209,11 @@ void Processor::accept()
 	} else {
 	  lderr(msgr->cct) << __func__ << " no incoming connection?"
 			   << " errno " << r << " " << cpp_strerror(r) << dendl;
-	  break;
+	  if (++accept_error_num > msgr->cct->_conf->ms_max_accept_failures) {
+	    lderr(msgr->cct) << "Proccessor accept has encountered enough error numbers, just do ceph_abort()." << dendl;
+	    ceph_abort();
+	  }
+	  continue;
 	}
       }
     }
diff --git a/src/msg/simple/Accepter.cc b/src/msg/simple/Accepter.cc
index eadcffbb6cf..a9122ac1be2 100644
--- a/src/msg/simple/Accepter.cc
+++ b/src/msg/simple/Accepter.cc
@@ -317,7 +317,7 @@ void *Accepter::entry()
       }
       ldout(msgr->cct,1) << __func__ << " poll got error"  
  			  << " errno " << errno << " " << cpp_strerror(errno) << dendl;
-      break;
+      ceph_abort();
     }
     ldout(msgr->cct,10) << __func__ << " poll returned oke: " << r << dendl;
     ldout(msgr->cct,20) << __func__ <<  " pfd.revents[0]=" << pfd[0].revents << dendl;
@@ -326,7 +326,7 @@ void *Accepter::entry()
     if (pfd[0].revents & (POLLERR | POLLNVAL | POLLHUP)) {
       ldout(msgr->cct,1) << __func__ << " poll got errors in revents "  
  			 <<  pfd[0].revents << dendl;
-      break;
+      ceph_abort();
     }
     if (pfd[1].revents & (POLLIN | POLLERR | POLLNVAL | POLLHUP)) {
       // We got "signaled" to exit the poll
@@ -358,8 +358,10 @@ void *Accepter::entry()
     } else {
       ldout(msgr->cct,0) << __func__ << " no incoming connection?  sd = " << sd
 	      << " errno " << errno << " " << cpp_strerror(errno) << dendl;
-      if (++errors > 4)
-	break;
+      if (++errors > msgr->cct->_conf->ms_max_accept_failures) {
+        lderr(msgr->cct) << "accetper has encoutered enough errors, just do ceph_abort()." << dendl;
+        ceph_abort();
+      }
     }
   }