OSDs Check Heartbeats
=====================
-Each Ceph OSD Daemon checks the heartbeat of other Ceph OSD Daemons every 6
-seconds. You can change the heartbeat interval by adding an ``osd heartbeat
-interval`` setting under the ``[osd]`` section of your Ceph configuration file,
-or by setting the value at runtime. If a neighboring Ceph OSD Daemon doesn't
+Each Ceph OSD Daemon checks the heartbeat of other Ceph OSD Daemons at random
+intervals less than every 6 seconds. If a neighboring Ceph OSD Daemon doesn't
show a heartbeat within a 20 second grace period, the Ceph OSD Daemon may
consider the neighboring Ceph OSD Daemon ``down`` and report it back to a Ceph
Monitor, which will update the Ceph Cluster Map. You may change this grace
.add_service("mgr")
.set_description("Issue a health warning if there are fewer OSDs than osd_pool_default_size"),
+ Option("mon_warn_on_slow_ping_time", Option::TYPE_UINT, Option::LEVEL_BASIC)
+ .set_default(1000000)
+ .add_service("mgr")
+ .set_description("Issue a health warning if heartbeat ping longer than specified microseconds"),
+
Option("mon_max_snap_prune_per_epoch", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
.set_default(100)
.set_description("Max number of pruned snaps we will process in a single OSDMap epoch"),
.set_default(entity_addr_t())
.set_description(""),
- Option("osd_heartbeat_interval", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ Option("osd_heartbeat_interval", Option::TYPE_INT, Option::LEVEL_DEV)
.set_default(6)
- .set_min_max(1, 86400)
+ .set_min_max(1, 60)
.set_description("Interval (in seconds) between peer pings"),
Option("osd_heartbeat_grace", Option::TYPE_INT, Option::LEVEL_ADVANCED)
#include "osd/osd_types.h"
#include "osd/OSDMap.h"
+#include <boost/range/adaptor/reversed.hpp>
#define dout_context g_ceph_context
checks->add("TOO_FEW_OSDS", HEALTH_WARN, ss.str());
}
+ // SLOW_PING_TIME
+ auto warn_slow_ping_time = cct->_conf.get_val<uint64_t>("mon_warn_on_slow_ping_time");
+ if (warn_slow_ping_time > 0) {
+
+ struct mon_ping_item_t {
+ uint32_t pingtime;
+ int from;
+ int to;
+ bool improving;
+
+ bool operator<(const mon_ping_item_t& rhs) const {
+ if (pingtime < rhs.pingtime)
+ return true;
+ if (pingtime > rhs.pingtime)
+ return false;
+ if (from < rhs.from)
+ return true;
+ if (from > rhs.from)
+ return false;
+ return to < rhs.to;
+ }
+ };
+
+ list<string> detail_back;
+ list<string> detail_front;
+ set<mon_ping_item_t> back_sorted, front_sorted;
+ for (auto i : osd_stat) {
+ for (auto j : i.second.hb_pingtime) {
+
+ mon_ping_item_t back;
+ back.pingtime = std::max(j.second.back_pingtime[0], j.second.back_pingtime[1]);
+ back.pingtime = std::max(back.pingtime, j.second.back_pingtime[2]);
+ back.from = i.first;
+ back.to = j.first;
+ if (back.pingtime > warn_slow_ping_time) {
+ back.improving = (j.second.back_pingtime[0] < j.second.back_pingtime[1]
+ && j.second.back_pingtime[1] < j.second.back_pingtime[2]);
+ back_sorted.emplace(back);
+ }
+
+ mon_ping_item_t front;
+ front.pingtime = std::max(j.second.front_pingtime[0], j.second.front_pingtime[1]);
+ front.pingtime = std::max(front.pingtime, j.second.front_pingtime[2]);
+ front.from = i.first;
+ front.to = j.first;
+ if (front.pingtime > warn_slow_ping_time) {
+ front.improving = (j.second.front_pingtime[0] < j.second.front_pingtime[1]
+ && j.second.front_pingtime[1] < j.second.back_pingtime[2]);
+ front_sorted.emplace(front);
+ }
+ }
+ }
+ int max_detail = 10;
+ for (auto &sback : boost::adaptors::reverse(back_sorted)) {
+ ostringstream ss;
+ if (max_detail == 0) {
+ ss << "Truncated long network list.";
+ detail_back.push_back(ss.str());
+ break;
+ }
+ max_detail--;
+ ss << "Slow heartbeat ping on back interface from osd." << sback.from
+ << " to osd." << sback.to << " " << sback.pingtime << " usec"
+ << (sback.improving ? " possibly improving" : "");
+ detail_back.push_back(ss.str());
+ }
+ max_detail = 10;
+ for (auto &sfront : boost::adaptors::reverse(front_sorted)) {
+ ostringstream ss;
+ if (max_detail == 0) {
+ ss << "Truncated long network list.";
+ detail_front.push_back(ss.str());
+ break;
+ }
+ max_detail--;
+ ss << "Slow heartbeat ping on front interface from osd." << sfront.from
+ << " to osd." << sfront.to << " " << sfront.pingtime << " usec"
+ << (sfront.improving ? " possibly improving" : "");
+ detail_front.push_back(ss.str());
+ }
+ if (detail_back.size() != 0) {
+ ostringstream ss;
+ ss << "Long heartbeat ping times on back interface seen, longest is "
+ << back_sorted.rbegin()->pingtime << " usec";
+ auto& d = checks->add("OSD_SLOW_PING_TIME_BACK", HEALTH_WARN, ss.str());
+ d.detail.swap(detail_back);
+ }
+ if (detail_front.size() != 0) {
+ ostringstream ss;
+ ss << "Long heartbeat ping times on front interface seen, longest is "
+ << front_sorted.rbegin()->pingtime << " usec";
+ auto& d = checks->add("OSD_SLOW_PING_TIME_FRONT", HEALTH_WARN, ss.str());
+ d.detail.swap(detail_front);
+ }
+ }
+
// SMALLER_PGP_NUM
// MANY_OBJECTS_PER_PG
if (!pg_stat.empty()) {
hi = &heartbeat_peers[p];
hi->peer = p;
RefCountedPtr s{new HeartbeatSession{p}, false};
+ hi->hb_interval_start = ceph_clock_now();
hi->con_back = cons.first.get();
hi->con_back->set_priv(s);
if (cons.second) {
<< " , erase pending ping(sent at " << m->stamp << ")"
<< " and older pending ping(s)"
<< dendl;
+
+#define ROUND_S_TO_USEC(sec) (uint32_t)((sec) * 1000 * 1000 + 0.5)
+ ++i->second.hb_average_count;
+ uint32_t back_pingtime = ROUND_S_TO_USEC(i->second.last_rx_back - m->stamp);
+ i->second.hb_total_back += back_pingtime;
+ uint32_t front_pingtime = ROUND_S_TO_USEC(i->second.last_rx_front - m->stamp);
+ i->second.hb_total_front += front_pingtime;
+
+ ceph_assert(i->second.hb_interval_start != utime_t());
+ if (i->second.hb_interval_start == utime_t())
+ i->second.hb_interval_start = now;
+ if (now - i->second.hb_interval_start >= utime_t(hb_avg, 0)) {
+ uint32_t back_pingtime = i->second.hb_total_back / i->second.hb_average_count;
+ uint32_t front_pingtime = i->second.hb_total_front / i->second.hb_average_count;
+
+ // Reset for new interval
+ i->second.hb_average_count = 0;
+ i->second.hb_interval_start = now;
+ i->second.hb_total_back = 0;
+ i->second.hb_total_front = 0;
+
+ // Record per osd interace ping times
+ // Based on osd_heartbeat_interval ignoring that it is randomly short than this interval
+ if (i->second.hb_back_pingtime.size() < hb_vector_size) {
+ ceph_assert(i->second.hb_front_pingtime.size() == i->second.hb_back_pingtime.size());
+ i->second.hb_back_pingtime.push_back(back_pingtime);
+ i->second.hb_front_pingtime.push_back(front_pingtime);
+ } else {
+ i->second.hb_back_pingtime[i->second.hb_index & (hb_vector_size - 1)] = back_pingtime;
+ i->second.hb_front_pingtime[i->second.hb_index & (hb_vector_size - 1)] = front_pingtime;
+ }
+ ++i->second.hb_index;
+
+ {
+ std::lock_guard l(service.stat_lock);
+ uint32_t total = 0;
+ uint32_t count = 0;
+ uint32_t which = 0;
+ uint32_t size = (uint32_t)i->second.hb_back_pingtime.size();
+ for (int32_t k = size - 1 ; k >= 0; --k) {
+ ++count;
+ total += i->second.hb_back_pingtime[(i->second.hb_index + k) % size];
+ if (count == 1 || count == 5 || count == 15) {
+ service.osd_stat.hb_pingtime[from].back_pingtime[which++] = total / count;
+ if (count == 15)
+ break;
+ }
+ }
+
+ if (i->second.con_front != NULL) {
+ total = 0;
+ count = 0;
+ which = 0;
+ for (int32_t k = size - 1 ; k >= 0; --k) {
+ ++count;
+ total += i->second.hb_front_pingtime[(i->second.hb_index + k) % size];
+ if (count == 1 || count == 5 || count == 15) {
+ service.osd_stat.hb_pingtime[from].front_pingtime[which++] = total / count;
+ if (count == 15)
+ break;
+ }
+ }
+ }
+ }
+ }
i->second.ping_history.erase(i->second.ping_history.begin(), ++acked);
}
i->second.first_tx = now;
i->second.ping_history[now] = make_pair(deadline,
HeartbeatInfo::HEARTBEAT_MAX_CONN);
+ if (i->second.hb_interval_start == utime_t())
+ i->second.hb_interval_start = now;
dout(30) << "heartbeat sending ping to osd." << peer << dendl;
i->second.con_back->send_message(new MOSDPing(monc->get_fsid(),
service.get_osdmap_epoch(),
/// send time -> deadline -> remaining replies
map<utime_t, pair<utime_t, int>> ping_history;
+ utime_t hb_interval_start;
+ uint32_t hb_average_count = 0;
+ uint32_t hb_index = 0;
+
+ uint32_t hb_total_back = 0;
+ vector<uint32_t> hb_back_pingtime;
+
+ uint32_t hb_total_front = 0;
+ vector<uint32_t> hb_front_pingtime;
+
bool is_unhealthy(utime_t now) {
if (ping_history.empty()) {
/// we haven't sent a ping yet or we have got all replies,
Messenger *hb_back_server_messenger;
utime_t last_heartbeat_resample; ///< last time we chose random peers in waiting-for-healthy state
double daily_loadavg;
-
+
+ const uint32_t hb_avg = 60;
+ // Track ping repsonse times using vector as a circular buffer
+ // MUST BE A POWER OF 2
+ const uint32_t hb_vector_size = 16;
+
void _add_heartbeat_peer(int p);
void _remove_heartbeat_peer(int p);
bool heartbeat_reset(Connection *con);
f->open_object_section("perf_stat");
os_perf_stat.dump(f);
f->close_section();
+ // Call to update ping times?
+ f->open_array_section("network_ping_times");
+ for (auto &i : hb_pingtime) {
+ f->open_object_section("osd");
+ f->dump_int("osd", i.first);
+
+ f->open_array_section("interfaces");
+ f->open_object_section("interface");
+ f->dump_string("interface", "back");
+ f->dump_int("1min", i.second.back_pingtime[0]);
+ f->dump_int("5min", i.second.back_pingtime[1]);
+ f->dump_int("15min", i.second.back_pingtime[2]);
+ f->close_section();
+
+ if (i.second.front_pingtime[0] != 0) {
+ f->open_object_section("interface");
+ f->dump_string("interface", "front");
+ f->dump_int("1min", i.second.front_pingtime[0]);
+ f->dump_int("5min", i.second.front_pingtime[1]);
+ f->dump_int("15min", i.second.front_pingtime[2]);
+ f->close_section();
+ }
+ f->close_section();
+ f->close_section();
+ }
+ f->close_section();
}
void osd_stat_t::encode(bufferlist &bl, uint64_t features) const
{
- ENCODE_START(8, 2, bl);
+ ENCODE_START(9, 2, bl);
encode(kb, bl);
encode(kb_used, bl);
encode(kb_avail, bl);
encode(kb_used_data, bl);
encode(kb_used_omap, bl);
encode(kb_used_meta, bl);
+ // hb_pingtime map
+ encode((int)hb_pingtime.size(), bl);
+ for (auto i : hb_pingtime) {
+ encode(i.first, bl); // osd
+ encode(i.second.back_pingtime[0], bl);
+ encode(i.second.back_pingtime[1], bl);
+ encode(i.second.back_pingtime[2], bl);
+ encode(i.second.front_pingtime[0], bl);
+ encode(i.second.front_pingtime[1], bl);
+ encode(i.second.front_pingtime[2], bl);
+ }
ENCODE_FINISH(bl);
}
void osd_stat_t::decode(bufferlist::iterator &bl)
{
- DECODE_START_LEGACY_COMPAT_LEN(8, 2, 2, bl);
+ DECODE_START_LEGACY_COMPAT_LEN(9, 2, 2, bl);
decode(kb, bl);
decode(kb_used, bl);
decode(kb_avail, bl);
kb_used_omap = 0;
kb_used_meta = 0;
}
+ hb_pingtime.clear();
+ if (struct_v >= 9) {
+ int count;
+ decode(count, bl);
+ for (int i = 0 ; i < count ; i++) {
+ int osd;
+ decode(osd, bl);
+ struct Interfaces ifs;
+ decode(ifs.back_pingtime[0],bl);
+ decode(ifs.back_pingtime[1], bl);
+ decode(ifs.back_pingtime[2], bl);
+ decode(ifs.front_pingtime[0], bl);
+ decode(ifs.front_pingtime[1], bl);
+ decode(ifs.front_pingtime[2], bl);
+ hb_pingtime[osd] = ifs;
+ }
+ }
DECODE_FINISH(bl);
}
o.back()->hb_peers.push_back(7);
o.back()->snap_trim_queue_len = 8;
o.back()->num_snap_trimming = 99;
+ struct Interfaces gen_interfaces = { { 1000, 995, 990 }, {980, 985, 990} };
+ o.back()->hb_pingtime[20] = gen_interfaces;
+ gen_interfaces = { { 700, 800, 900 } };
+ o.back()->hb_pingtime[30] = gen_interfaces;
}
// -- pg_t --
uint32_t num_pgs = 0;
+ struct Interfaces {
+ uint32_t back_pingtime[3];
+ uint32_t front_pingtime[3];
+ };
+ map<int, Interfaces> hb_pingtime; ///< map of osd id to Interfaces
+
void add(const osd_stat_t& o) {
kb += o.kb;
kb_used += o.kb_used;