In some extreme cases (we have hit one in our production cluster), when the Accepter thread breaks out of its accept loop, new clients cannot connect to the OSD. Because the existing heartbeat connections are already established, other OSDs cannot detect the failure and therefore never notify the monitor to mark the failed OSD down.
In this patch, when there are abnormal communication errors, we simply call ceph_abort() so that the OSD goes down quickly and the other OSDs can notify the monitor to mark the failed OSD down.
Signed-off-by: penglaiyxy@gmail.com <penglaiyxy@gmail.com>
OPTION(ms_async_rdma_cm, OPT_BOOL)
OPTION(ms_async_rdma_type, OPT_STR)
+// When enough accept failures have occurred, we treat the failure as unrecoverable
+// and call ceph_abort(). The threshold is configurable.
+OPTION(ms_max_accept_failures, OPT_INT)
+
OPTION(ms_dpdk_port_id, OPT_INT)
SAFE_OPTION(ms_dpdk_coremask, OPT_STR) // it is modified in unittest so that use SAFE_OPTION to declare
OPTION(ms_dpdk_memory_channel, OPT_STR)
.set_default(96)
.set_description(""),
+ Option("ms_max_accept_failures", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(4)
+ .set_description("The maximum number of consecutive failed accept() calls before "
+ "considering the daemon is misconfigured and abort it."),
+
Option("ms_async_rdma_cm", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
.set_default(false)
.set_description(""),
for (auto& listen_socket : listen_sockets) {
ldout(msgr->cct, 10) << __func__ << " listen_fd=" << listen_socket.fd()
<< dendl;
+ unsigned accept_error_num = 0;
+
while (true) {
entity_addr_t addr;
ConnectedSocket cli_socket;
<< cli_socket.fd() << dendl;
msgr->add_accept(w, std::move(cli_socket), addr);
+ accept_error_num = 0;
continue;
} else {
if (r == -EINTR) {
} else if (r == -EMFILE || r == -ENFILE) {
lderr(msgr->cct) << __func__ << " open file descriptions limit reached sd = " << listen_socket.fd()
<< " errno " << r << " " << cpp_strerror(r) << dendl;
- break;
+ if (++accept_error_num > msgr->cct->_conf->ms_max_accept_failures) {
+ lderr(msgr->cct) << "Proccessor accept has encountered enough error numbers, just do ceph_abort()." << dendl;
+ ceph_abort();
+ }
+ continue;
} else if (r == -ECONNABORTED) {
ldout(msgr->cct, 0) << __func__ << " it was closed because of rst arrived sd = " << listen_socket.fd()
<< " errno " << r << " " << cpp_strerror(r) << dendl;
} else {
lderr(msgr->cct) << __func__ << " no incoming connection?"
<< " errno " << r << " " << cpp_strerror(r) << dendl;
- break;
+ if (++accept_error_num > msgr->cct->_conf->ms_max_accept_failures) {
+ lderr(msgr->cct) << "Proccessor accept has encountered enough error numbers, just do ceph_abort()." << dendl;
+ ceph_abort();
+ }
+ continue;
}
}
}
}
ldout(msgr->cct,1) << __func__ << " poll got error"
<< " errno " << errno << " " << cpp_strerror(errno) << dendl;
- break;
+ ceph_abort();
}
ldout(msgr->cct,10) << __func__ << " poll returned oke: " << r << dendl;
ldout(msgr->cct,20) << __func__ << " pfd.revents[0]=" << pfd[0].revents << dendl;
if (pfd[0].revents & (POLLERR | POLLNVAL | POLLHUP)) {
ldout(msgr->cct,1) << __func__ << " poll got errors in revents "
<< pfd[0].revents << dendl;
- break;
+ ceph_abort();
}
if (pfd[1].revents & (POLLIN | POLLERR | POLLNVAL | POLLHUP)) {
// We got "signaled" to exit the poll
} else {
ldout(msgr->cct,0) << __func__ << " no incoming connection? sd = " << sd
<< " errno " << errno << " " << cpp_strerror(errno) << dendl;
- if (++errors > 4)
- break;
+ if (++errors > msgr->cct->_conf->ms_max_accept_failures) {
+ lderr(msgr->cct) << "accetper has encoutered enough errors, just do ceph_abort()." << dendl;
+ ceph_abort();
+ }
}
}