msg: ceph_abort() when there are too many accepter errors in the msg server

author root <penglaiyxy>

Mon, 30 Jul 2018 01:29:48 +0000 (21:29 -0400)

committer root <penglaiyxy>

Thu, 2 Aug 2018 01:08:13 +0000 (21:08 -0400)
author root <penglaiyxy>
Mon, 30 Jul 2018 01:29:48 +0000 (21:29 -0400)
committer root <penglaiyxy>
Thu, 2 Aug 2018 01:08:13 +0000 (21:08 -0400)
diff --git a/src/common/legacy_config_opts.h b/src/common/legacy_config_opts.h

index 6a38549b33bd4850c23e953efb3784e27fa260a0..40ef425ba65b8089a25af685eb1b086b5fab891e 100644 (file)
--- a/src/common/legacy_config_opts.h
+++ b/src/common/legacy_config_opts.h
@@ -175,6 +175,10 @@ OPTION(ms_async_rdma_dscp, OPT_INT)            // in RoCE, this means DSCP
  OPTION(ms_async_rdma_cm, OPT_BOOL)
  OPTION(ms_async_rdma_type, OPT_STR)
  
+// When enough consecutive accept failures occur, the failure is treated as
+// unrecoverable and ceph_abort() is called. The threshold is configurable here.
+OPTION(ms_max_accept_failures, OPT_INT)
+
  OPTION(ms_dpdk_port_id, OPT_INT)
  SAFE_OPTION(ms_dpdk_coremask, OPT_STR)        // it is modified in unittest so that use SAFE_OPTION to declare 
  OPTION(ms_dpdk_memory_channel, OPT_STR)
diff --git a/src/common/options.cc b/src/common/options.cc

index 79e88c5f5fae2d85f9eeb99fd7a3388efb99ec39..1d91bd2188e569277662e8486912755c26d3658f 100644 (file)
--- a/src/common/options.cc
+++ b/src/common/options.cc
@@ -1100,6 +1100,11 @@ std::vector<Option> get_global_options() {
      .set_default(96)
      .set_description(""),
  
+    Option("ms_max_accept_failures", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(4)
+    .set_description("The maximum number of consecutive failed accept() calls before "
+                     "considering the daemon is misconfigured and abort it."),
+
      Option("ms_async_rdma_cm", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
      .set_default(false)
      .set_description(""),
diff --git a/src/msg/async/AsyncMessenger.cc b/src/msg/async/AsyncMessenger.cc

index 741e730ec206ade64f520bde14b27e667280e1b5..6d892ea0d5272eb2cb17ca67c0be07ff2e3d3a23 100644 (file)
--- a/src/msg/async/AsyncMessenger.cc
+++ b/src/msg/async/AsyncMessenger.cc
@@ -171,6 +171,8 @@ void Processor::accept()
    for (auto& listen_socket : listen_sockets) {
      ldout(msgr->cct, 10) << __func__ << " listen_fd=" << listen_socket.fd()
                          << dendl;
+    unsigned accept_error_num = 0;
+
      while (true) {
        entity_addr_t addr;
        ConnectedSocket cli_socket;
@@ -185,6 +187,7 @@ void Processor::accept()
                              << cli_socket.fd() << dendl;
  
         msgr->add_accept(w, std::move(cli_socket), addr);
+       accept_error_num = 0;
         continue;
        } else {
         if (r == -EINTR) {
@@ -194,7 +197,11 @@ void Processor::accept()
         } else if (r == -EMFILE || r == -ENFILE) {
           lderr(msgr->cct) << __func__ << " open file descriptions limit reached sd = " << listen_socket.fd()
                            << " errno " << r << " " << cpp_strerror(r) << dendl;
-         break;
+         if (++accept_error_num > msgr->cct->_conf->ms_max_accept_failures) {
+           lderr(msgr->cct) << "Proccessor accept has encountered enough error numbers, just do ceph_abort()." << dendl;
+           ceph_abort();
+         }
+         continue;
         } else if (r == -ECONNABORTED) {
           ldout(msgr->cct, 0) << __func__ << " it was closed because of rst arrived sd = " << listen_socket.fd()
                               << " errno " << r << " " << cpp_strerror(r) << dendl;
@@ -202,7 +209,11 @@ void Processor::accept()
         } else {
           lderr(msgr->cct) << __func__ << " no incoming connection?"
                            << " errno " << r << " " << cpp_strerror(r) << dendl;
-         break;
+         if (++accept_error_num > msgr->cct->_conf->ms_max_accept_failures) {
+           lderr(msgr->cct) << "Proccessor accept has encountered enough error numbers, just do ceph_abort()." << dendl;
+           ceph_abort();
+         }
+         continue;
         }
        }
      }
diff --git a/src/msg/simple/Accepter.cc b/src/msg/simple/Accepter.cc

index eadcffbb6cfa553b46b6ee51bac85835cfea2412..a9122ac1be29d811638b6279ff6d56a488625744 100644 (file)
--- a/src/msg/simple/Accepter.cc
+++ b/src/msg/simple/Accepter.cc
@@ -317,7 +317,7 @@ void *Accepter::entry()
        }
        ldout(msgr->cct,1) << __func__ << " poll got error"  
                           << " errno " << errno << " " << cpp_strerror(errno) << dendl;
-      break;
+      ceph_abort();
      }
      ldout(msgr->cct,10) << __func__ << " poll returned oke: " << r << dendl;
      ldout(msgr->cct,20) << __func__ <<  " pfd.revents[0]=" << pfd[0].revents << dendl;
@@ -326,7 +326,7 @@ void *Accepter::entry()
      if (pfd[0].revents & (POLLERR | POLLNVAL | POLLHUP)) {
        ldout(msgr->cct,1) << __func__ << " poll got errors in revents "  
                          <<  pfd[0].revents << dendl;
-      break;
+      ceph_abort();
      }
      if (pfd[1].revents & (POLLIN | POLLERR | POLLNVAL | POLLHUP)) {
        // We got "signaled" to exit the poll
@@ -358,8 +358,10 @@ void *Accepter::entry()
      } else {
        ldout(msgr->cct,0) << __func__ << " no incoming connection?  sd = " << sd
               << " errno " << errno << " " << cpp_strerror(errno) << dendl;
-      if (++errors > 4)
-       break;
+      if (++errors > msgr->cct->_conf->ms_max_accept_failures) {
+        lderr(msgr->cct) << "accetper has encoutered enough errors, just do ceph_abort()." << dendl;
+        ceph_abort();
+      }
      }
    }
author	root <penglaiyxy>
	Mon, 30 Jul 2018 01:29:48 +0000 (21:29 -0400)
committer	root <penglaiyxy>
	Thu, 2 Aug 2018 01:08:13 +0000 (21:08 -0400)
src/common/legacy_config_opts.h		patch \| blob \| history
src/common/options.cc		patch \| blob \| history
src/msg/async/AsyncMessenger.cc		patch \| blob \| history
src/msg/simple/Accepter.cc		patch \| blob \| history