From: xie xingguo Date: Mon, 16 May 2016 05:50:28 +0000 (+0800) Subject: osd: refactor heartbeat health check X-Git-Tag: v13.2.7~26^2~24 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=3d471b61f59b7e19c8954d26721e24452bd9e92d;p=ceph.git osd: refactor heartbeat health check The original logic will reuse the timestamp which we send pings to the specific heartbeat peer to update the last_rx_front[back] field on receiving the corresponding replies, which later shall be honoured as the exact time we succeed in getting the corresponding replies and is used to calculate the heartbeat latency and determine whether the relevant peer is dead. However this is not accurate enough as there may be a delay between we receive a reply and call heartbeat_check(). We can eliminate the delay by introducing a map to track the ping-history here, each entry of which consists of three elements: 1. "tx_time", worked as the map key, indicates the exact timestamp we send pings. 2. "deadline", indicates we shall receive all replies by then, otherwise we consider this peer as "dead". 3. "unacknowledged", indicates how many pings for the corresponding ping are still unacknowledged. The initial value is 2(as we send two pings from the front and back side for each peer). We insert an item into the map on every time we sending out a ping, and decrease the "unacknowledged" counter by 1 each time we get a reply from the tracked ping. If "unacknowledged" drops to 0, we know all the replies have been successfully collected and we can safely erase the relevant item from the map as well as the earlier sent ones, if there is any. By comparing the current timestamp with the oldest deadline, we can now make a much accurate decision about whether the corresponding peer is healthy or not. And by setting last_rx_* to the timestamp we receiving the reply, the lower bound when we can no longer hear a reply from the corresponding connection is also much clear now. Signed-off-by: xie xingguo (cherry picked from commit 477774ceee42641f6d6884536462f92567bfea11) Conflicts: src/osd/OSD.cc (send_still_alive() has 1 less argument) --- diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index f46aca8c032..690252eb41c 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -4613,46 +4613,71 @@ void OSD::handle_osd_ping(MOSDPing *m) { map::iterator i = heartbeat_peers.find(from); if (i != heartbeat_peers.end()) { - if (m->get_connection() == i->second.con_back) { - dout(25) << "handle_osd_ping got reply from osd." << from - << " first_tx " << i->second.first_tx - << " last_tx " << i->second.last_tx - << " last_rx_back " << i->second.last_rx_back << " -> " << m->stamp - << " last_rx_front " << i->second.last_rx_front - << dendl; - i->second.last_rx_back = m->stamp; - // if there is no front con, set both stamps. - if (i->second.con_front == NULL) - i->second.last_rx_front = m->stamp; - } else if (m->get_connection() == i->second.con_front) { - dout(25) << "handle_osd_ping got reply from osd." << from - << " first_tx " << i->second.first_tx - << " last_tx " << i->second.last_tx - << " last_rx_back " << i->second.last_rx_back - << " last_rx_front " << i->second.last_rx_front << " -> " << m->stamp - << dendl; - i->second.last_rx_front = m->stamp; - } + auto acked = i->second.ping_history.find(m->stamp); + if (acked != i->second.ping_history.end()) { + utime_t now = ceph_clock_now(); + int &unacknowledged = acked->second.second; + if (m->get_connection() == i->second.con_back) { + dout(25) << "handle_osd_ping got reply from osd." << from + << " first_tx " << i->second.first_tx + << " last_tx " << i->second.last_tx + << " last_rx_back " << i->second.last_rx_back << " -> " << now + << " last_rx_front " << i->second.last_rx_front + << dendl; + i->second.last_rx_back = now; + assert(unacknowledged > 0); + --unacknowledged; + // if there is no front con, set both stamps. + if (i->second.con_front == NULL) { + i->second.last_rx_front = now; + assert(unacknowledged > 0); + --unacknowledged; + } + } else if (m->get_connection() == i->second.con_front) { + dout(25) << "handle_osd_ping got reply from osd." << from + << " first_tx " << i->second.first_tx + << " last_tx " << i->second.last_tx + << " last_rx_back " << i->second.last_rx_back + << " last_rx_front " << i->second.last_rx_front << " -> " << now + << dendl; + i->second.last_rx_front = now; + assert(unacknowledged > 0); + --unacknowledged; + } - utime_t cutoff = ceph_clock_now(); - cutoff -= cct->_conf->osd_heartbeat_grace; - if (i->second.is_healthy(cutoff)) { - // Cancel false reports - auto failure_queue_entry = failure_queue.find(from); - if (failure_queue_entry != failure_queue.end()) { - dout(10) << "handle_osd_ping canceling queued " - << "failure report for osd." << from << dendl; - failure_queue.erase(failure_queue_entry); + if (unacknowledged == 0) { + // succeeded in getting all replies + dout(25) << "handle_osd_ping got all replies from osd." << from + << " , erase pending ping(sent at " << m->stamp << ")" + << " and older pending ping(s)" + << dendl; + i->second.ping_history.erase(i->second.ping_history.begin(), ++acked); } - auto failure_pending_entry = failure_pending.find(from); - if (failure_pending_entry != failure_pending.end()) { - dout(10) << "handle_osd_ping canceling in-flight " - << "failure report for osd." << from << dendl; - send_still_alive(curmap->get_epoch(), - failure_pending_entry->second.second); - failure_pending.erase(failure_pending_entry); + if (i->second.is_healthy(now)) { + // Cancel false reports + auto failure_queue_entry = failure_queue.find(from); + if (failure_queue_entry != failure_queue.end()) { + dout(10) << "handle_osd_ping canceling queued " + << "failure report for osd." << from << dendl; + failure_queue.erase(failure_queue_entry); + } + + auto failure_pending_entry = failure_pending.find(from); + if (failure_pending_entry != failure_pending.end()) { + dout(10) << "handle_osd_ping canceling in-flight " + << "failure report for osd." << from << dendl; + send_still_alive(curmap->get_epoch(), + failure_pending_entry->second.second); + failure_pending.erase(failure_pending_entry); + } } + } else { + // old replies, deprecated by newly sent pings. + dout(10) << "handle_osd_ping no pending ping(sent at " << m->stamp + << ") is found, treat as covered by newly sent pings " + << "and ignore" + << dendl; } } @@ -4704,9 +4729,7 @@ void OSD::heartbeat_check() assert(heartbeat_lock.is_locked()); utime_t now = ceph_clock_now(); - // check for heartbeat replies (move me elsewhere?) - utime_t cutoff = now; - cutoff -= cct->_conf->osd_heartbeat_grace; + // check for incoming heartbeats (move me elsewhere?) for (map::iterator p = heartbeat_peers.begin(); p != heartbeat_peers.end(); ++p) { @@ -4723,19 +4746,26 @@ void OSD::heartbeat_check() << " last_rx_back " << p->second.last_rx_back << " last_rx_front " << p->second.last_rx_front << dendl; - if (p->second.is_unhealthy(cutoff)) { + if (p->second.is_unhealthy(now)) { + utime_t oldest_deadline = p->second.ping_history.begin()->second.first; if (p->second.last_rx_back == utime_t() || p->second.last_rx_front == utime_t()) { - derr << "heartbeat_check: no reply from " << p->second.con_front->get_peer_addr().get_sockaddr() - << " osd." << p->first << " ever on either front or back, first ping sent " - << p->second.first_tx << " (cutoff " << cutoff << ")" << dendl; + derr << "heartbeat_check: no reply from " + << p->second.con_front->get_peer_addr().get_sockaddr() + << " osd." << p->first + << " ever on either front or back, first ping sent " + << p->second.first_tx + << " (oldest deadline " << oldest_deadline << ")" + << dendl; // fail failure_queue[p->first] = p->second.last_tx; } else { - derr << "heartbeat_check: no reply from " << p->second.con_front->get_peer_addr().get_sockaddr() + derr << "heartbeat_check: no reply from " + << p->second.con_front->get_peer_addr().get_sockaddr() << " osd." << p->first << " since back " << p->second.last_rx_back << " front " << p->second.last_rx_front - << " (cutoff " << cutoff << ")" << dendl; + << " (oldest deadline " << oldest_deadline << ")" + << dendl; // fail failure_queue[p->first] = std::min(p->second.last_rx_back, p->second.last_rx_front); } @@ -4781,6 +4811,8 @@ void OSD::heartbeat() service.check_full_status(ratio); utime_t now = ceph_clock_now(); + utime_t deadline = now; + deadline += cct->_conf->osd_heartbeat_grace; // send heartbeats for (map::iterator i = heartbeat_peers.begin(); @@ -4790,6 +4822,8 @@ void OSD::heartbeat() i->second.last_tx = now; if (i->second.first_tx == utime_t()) i->second.first_tx = now; + i->second.ping_history[now] = make_pair(deadline, + HeartbeatInfo::HEARTBEAT_MAX_CONN); dout(30) << "heartbeat sending ping to osd." << peer << dendl; i->second.con_back->send_message(new MOSDPing(monc->get_fsid(), service.get_osdmap_epoch(), @@ -5451,13 +5485,12 @@ bool OSD::_is_healthy() if (is_waiting_for_healthy()) { Mutex::Locker l(heartbeat_lock); - utime_t cutoff = ceph_clock_now(); - cutoff -= cct->_conf->osd_heartbeat_grace; + utime_t now = ceph_clock_now(); int num = 0, up = 0; for (map::iterator p = heartbeat_peers.begin(); p != heartbeat_peers.end(); ++p) { - if (p->second.is_healthy(cutoff)) + if (p->second.is_healthy(now)) ++up; ++num; } diff --git a/src/osd/OSD.h b/src/osd/OSD.h index 45a6d4fef65..115d1804992 100644 --- a/src/osd/OSD.h +++ b/src/osd/OSD.h @@ -1512,20 +1512,26 @@ private: utime_t last_rx_front; ///< last time we got a ping reply on the front side utime_t last_rx_back; ///< last time we got a ping reply on the back side epoch_t epoch; ///< most recent epoch we wanted this peer + /// number of connections we send and receive heartbeat pings/replies + const static int HEARTBEAT_MAX_CONN = 2; + /// history of inflight pings, arranging by timestamp we sent + /// send time -> deadline -> remaining replies + map> ping_history; + + bool is_unhealthy(utime_t now) { + if (ping_history.empty()) { + /// we haven't sent a ping yet or we have got all replies, + /// in either way we are safe and healthy for now + return false; + } - bool is_unhealthy(utime_t cutoff) const { - return - ! ((last_rx_front > cutoff || - (last_rx_front == utime_t() && (last_tx == utime_t() || - first_tx > cutoff))) && - (last_rx_back > cutoff || - (last_rx_back == utime_t() && (last_tx == utime_t() || - first_tx > cutoff)))); - } - bool is_healthy(utime_t cutoff) const { - return last_rx_front > cutoff && last_rx_back > cutoff; + utime_t oldest_deadline = ping_history.begin()->second.first; + return now > oldest_deadline; } + bool is_healthy(utime_t now) { + return !is_unhealthy(now); + } }; /// state attached to outgoing heartbeat connections struct HeartbeatSession : public RefCountedObject {