From: Yingxin Cheng Date: Thu, 26 Mar 2020 08:47:00 +0000 (+0800) Subject: crimson/osd: heartbeat cleanup, show gray health states X-Git-Tag: v16.1.0~1895^2~5 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=5c871d6d00d76ff8fc3a4f5148aa390f03678e18;p=ceph.git crimson/osd: heartbeat cleanup, show gray health states Be very explicit about gray states between healthy and unhealthy. Signed-off-by: Yingxin Cheng --- diff --git a/src/crimson/osd/heartbeat.cc b/src/crimson/osd/heartbeat.cc index 4ea6185415e..57cb3ed6510 100644 --- a/src/crimson/osd/heartbeat.cc +++ b/src/crimson/osd/heartbeat.cc @@ -372,55 +372,57 @@ void Heartbeat::Peer::disconnect() Heartbeat::Peer::~Peer() { disconnect(); } -bool Heartbeat::Peer::is_unhealthy(clock::time_point now) const +bool Heartbeat::Peer::pinged() const { - if (ping_history.empty()) { - // we haven't sent a ping yet or we have got all replies, - // in either way we are safe and healthy for now + if (clock::is_zero(first_tx)) { + // i can never receive a pong without sending any ping message first. 
+ assert(clock::is_zero(last_rx_front) && + clock::is_zero(last_rx_back)); return false; } else { - auto oldest_ping = ping_history.begin(); - return now > oldest_ping->second.deadline; + return true; } } -bool Heartbeat::Peer::is_healthy(clock::time_point now) const +Heartbeat::Peer::health_state +Heartbeat::Peer::do_health_screen(clock::time_point now) const { - if (clock::is_zero(last_rx_front)) { - return false; - } - if (clock::is_zero(last_rx_back)) { - return false; + if (!pinged()) { + // we are neither healthy nor unhealthy because we haven't sent anything yet + return health_state::UNKNOWN; + } else if (!ping_history.empty() && ping_history.begin()->second.deadline < now) { + return health_state::UNHEALTHY; + } else if (!clock::is_zero(last_rx_front) && + !clock::is_zero(last_rx_back)) { + // only declare to be healthy until we have received the first + // replies from both front/back connections + return health_state::HEALTHY; + } else { + return health_state::UNKNOWN; } - // only declare to be healthy until we have received the first - // replies from both front/back connections - return !is_unhealthy(now); } Heartbeat::clock::time_point Heartbeat::Peer::failed_since(clock::time_point now) const { - if (clock::is_zero(first_tx)) { - return clock::zero(); - } - if (!is_unhealthy(now)) { - return clock::zero(); - } - - auto oldest_deadline = ping_history.begin()->second.deadline; - auto failed_since = std::min(last_rx_back, last_rx_front); - if (clock::is_zero(failed_since)) { - logger().error("failed_since: no reply from osd.{} " - "ever on either front or back, first ping sent {} " - "(oldest deadline {})", - peer, first_tx, oldest_deadline); - failed_since = first_tx; + if (do_health_screen(now) == health_state::UNHEALTHY) { + auto oldest_deadline = ping_history.begin()->second.deadline; + auto failed_since = std::min(last_rx_back, last_rx_front); + if (clock::is_zero(failed_since)) { + logger().error("failed_since: no reply from osd.{} " + "ever on either 
front or back, first ping sent {} " + "(oldest deadline {})", + peer, first_tx, oldest_deadline); + failed_since = first_tx; + } else { + logger().error("failed_since: no reply from osd.{} " + "since back {} front {} (oldest deadline {})", + peer, last_rx_back, last_rx_front, oldest_deadline); + } + return failed_since; } else { - logger().error("failed_since: no reply from osd.{} " - "since back {} front {} (oldest deadline {})", - peer, last_rx_back, last_rx_front, oldest_deadline); + return clock::zero(); } - return failed_since; } void Heartbeat::Peer::send_heartbeat( @@ -428,7 +430,7 @@ void Heartbeat::Peer::send_heartbeat( ceph::signedspan mnow, std::vector<seastar::future<>>& futures) { - if (clock::is_zero(first_tx)) { + if (!pinged()) { first_tx = now; } last_tx = now; @@ -475,7 +477,7 @@ seastar::future<> Heartbeat::Peer::handle_reply( if (unacked == 0) { ping_history.erase(ping_history.begin(), ++ping); } - if (is_healthy(now)) { + if (do_health_screen(now) == health_state::HEALTHY) { // cancel false reports if (auto pending = heartbeat.failure_pending.find(peer); pending != heartbeat.failure_pending.end()) { diff --git a/src/crimson/osd/heartbeat.h b/src/crimson/osd/heartbeat.h index 63f77a2b9de..83676372c62 100644 --- a/src/crimson/osd/heartbeat.h +++ b/src/crimson/osd/heartbeat.h @@ -130,9 +130,13 @@ class Heartbeat::Peer { void handle_reset(crimson::net::ConnectionRef); private: - bool is_unhealthy(clock::time_point now) const; - bool is_healthy(clock::time_point now) const; - + bool pinged() const; + enum class health_state { + UNKNOWN, + UNHEALTHY, + HEALTHY, + }; + health_state do_health_screen(clock::time_point now) const; void connect(); void disconnect();