From: Wido den Hollander Date: Tue, 18 Nov 2014 13:49:00 +0000 (+0100) Subject: SimpleMessenger: Retry binding on addresses if binding fails X-Git-Tag: v0.92~100^2 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=2d4dca757eaa7572ed4d8a1c798c9c09e5ada3d7;p=ceph.git SimpleMessenger: Retry binding on addresses if binding fails If binding to an IP address fails, wait and retry. This happens mainly on IPv6 deployments: because of DAD (Duplicate Address Detection) or SLAAC, the IPv6 address may not yet be available when the daemons start. Monitor daemons try to bind to a static IPv6 address; if that address is not yet available, the monitor fails to start. Fixes: #10029 --- diff --git a/src/common/config_opts.h b/src/common/config_opts.h index d34a147b4c9f..bc00d6a2e212 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -128,6 +128,8 @@ OPTION(ms_dispatch_throttle_bytes, OPT_U64, 100 << 20) OPTION(ms_bind_ipv6, OPT_BOOL, false) OPTION(ms_bind_port_min, OPT_INT, 6800) OPTION(ms_bind_port_max, OPT_INT, 7300) +OPTION(ms_bind_retry_count, OPT_INT, 3) // If binding fails, how many times in total we attempt to bind before giving up +OPTION(ms_bind_retry_delay, OPT_INT, 5) // Delay in seconds between attempts to bind OPTION(ms_rwthread_stack_bytes, OPT_U64, 1024 << 10) OPTION(ms_tcp_read_timeout, OPT_U64, 900) OPTION(ms_pq_max_tokens_per_priority, OPT_U64, 16777216) diff --git a/src/msg/simple/Accepter.cc b/src/msg/simple/Accepter.cc index 7fb2baa56fbe..7d989a93691e 100644 --- a/src/msg/simple/Accepter.cc +++ b/src/msg/simple/Accepter.cc @@ -69,43 +69,66 @@ int Accepter::bind(const entity_addr_t &bind_addr, const set<int>& avoid_ports) /* bind to port */ int rc = -1; - if (listen_addr.get_port()) { - // specific port - - // reuse addr+port when possible - int on = 1; - rc = ::setsockopt(listen_sd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)); - if (rc < 0) { - lderr(msgr->cct) << "accepter.bind unable to setsockopt: " - << cpp_strerror(errno) << dendl; - 
return -errno; - } + int r = -1; - rc = ::bind(listen_sd, (struct sockaddr *) &listen_addr.ss_addr(), listen_addr.addr_size()); - if (rc < 0) { - lderr(msgr->cct) << "accepter.bind unable to bind to " << listen_addr.ss_addr() - << ": " << cpp_strerror(errno) << dendl; - return -errno; - } - } else { - // try a range of ports - for (int port = msgr->cct->_conf->ms_bind_port_min; port <= msgr->cct->_conf->ms_bind_port_max; port++) { - if (avoid_ports.count(port)) - continue; - listen_addr.set_port(port); - rc = ::bind(listen_sd, (struct sockaddr *) &listen_addr.ss_addr(), listen_addr.addr_size()); - if (rc == 0) - break; + for (int i = 0; i < conf->ms_bind_retry_count; i++) { + + if (i > 0) { + lderr(msgr->cct) << "accepter.bind was unable to bind. Trying again in " << conf->ms_bind_retry_delay << " seconds " << dendl; + sleep(conf->ms_bind_retry_delay); } - if (rc < 0) { - lderr(msgr->cct) << "accepter.bind unable to bind to " << listen_addr.ss_addr() - << " on any port in range " << msgr->cct->_conf->ms_bind_port_min - << "-" << msgr->cct->_conf->ms_bind_port_max - << ": " << cpp_strerror(errno) - << dendl; - return -errno; + + if (listen_addr.get_port()) { + // specific port + + // reuse addr+port when possible + int on = 1; + rc = ::setsockopt(listen_sd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)); + if (rc < 0) { + lderr(msgr->cct) << "accepter.bind unable to setsockopt: " + << cpp_strerror(errno) << dendl; + r = -errno; + continue; + } + + rc = ::bind(listen_sd, (struct sockaddr *) &listen_addr.ss_addr(), listen_addr.addr_size()); + if (rc < 0) { + lderr(msgr->cct) << "accepter.bind unable to bind to " << listen_addr.ss_addr() + << ": " << cpp_strerror(errno) << dendl; + r = -errno; + continue; + } + } else { + // try a range of ports + for (int port = msgr->cct->_conf->ms_bind_port_min; port <= msgr->cct->_conf->ms_bind_port_max; port++) { + if (avoid_ports.count(port)) + continue; + + listen_addr.set_port(port); + rc = ::bind(listen_sd, (struct sockaddr *) 
&listen_addr.ss_addr(), listen_addr.addr_size()); + if (rc == 0) + break; + } + if (rc < 0) { + lderr(msgr->cct) << "accepter.bind unable to bind to " << listen_addr.ss_addr() + << " on any port in range " << msgr->cct->_conf->ms_bind_port_min + << "-" << msgr->cct->_conf->ms_bind_port_max + << ": " << cpp_strerror(errno) + << dendl; + r = -errno; + continue; + } + ldout(msgr->cct,10) << "accepter.bind bound on random port " << listen_addr << dendl; } - ldout(msgr->cct,10) << "accepter.bind bound on random port " << listen_addr << dendl; + + if (rc == 0) + break; + } + + // It seems that binding completely failed, return with that exit status + if (rc < 0) { + lderr(msgr->cct) << "accepter.bind was unable to bind after " << conf->ms_bind_retry_count << " attempts: " << cpp_strerror(errno) << dendl; + return r; } // what port did we get?