From 4de5407ac96686748497253e4daf51177f809a95 Mon Sep 17 00:00:00 2001 From: Haomai Wang Date: Sun, 19 Jun 2016 23:42:36 +0800 Subject: [PATCH] msg/async: close STATE_WAIT connection in short period 1. In practice, a STATE_WAIT connection caused by a racing connect should be resolved within milliseconds, so we don't need to keep this connection forever. 2. It avoids unexpected OSD peering hangs caused by external network problems. Fixes: http://tracker.ceph.com/issues/16378 Signed-off-by: Haomai Wang --- src/msg/async/AsyncConnection.cc | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/src/msg/async/AsyncConnection.cc b/src/msg/async/AsyncConnection.cc index 229d2e06bb9e3..71ea16848885c 100644 --- a/src/msg/async/AsyncConnection.cc +++ b/src/msg/async/AsyncConnection.cc @@ -932,8 +932,8 @@ void AsyncConnection::process() case STATE_WAIT: { - ldout(async_msgr->cct, 20) << __func__ << " enter wait state" << dendl; - break; + ldout(async_msgr->cct, 1) << __func__ << " enter wait state, failing" << dendl; + goto fail; } default: @@ -2132,7 +2132,8 @@ void AsyncConnection::fault() } write_lock.Unlock(); - if (!(state >= STATE_CONNECTING && state < STATE_CONNECTING_READY)) { + if (!(state >= STATE_CONNECTING && state < STATE_CONNECTING_READY) && + state != STATE_WAIT) { // STATE_WAIT is coming from STATE_CONNECTING_* // policy maybe empty when state is in accept if (policy.server) { ldout(async_msgr->cct, 0) << __func__ << " server, going to standby" << dendl; @@ -2143,8 +2144,11 @@ void AsyncConnection::fault() state = STATE_CONNECTING; } backoff = utime_t(); + center->dispatch_event_external(read_handler); } else { - if (backoff == utime_t()) { + if (state == STATE_WAIT) { + backoff.set_from_double(async_msgr->cct->_conf->ms_max_backoff); + } else if (backoff == utime_t()) { backoff.set_from_double(async_msgr->cct->_conf->ms_initial_backoff); } else { backoff += backoff; @@ -2154,11 +2158,10 @@ void AsyncConnection::fault() state = 
STATE_CONNECTING; ldout(async_msgr->cct, 10) << __func__ << " waiting " << backoff << dendl; + // woke up again; + register_time_events.insert(center->create_time_event( + backoff.to_nsec()/1000, wakeup_handler)); } - - // woke up again; - register_time_events.insert(center->create_time_event( - backoff.to_nsec()/1000, wakeup_handler)); } void AsyncConnection::was_session_reset() -- 2.39.5