:Default: ``30``
+``osd mon heartbeat stat stale``
+
+:Description: Stop reporting on heartbeat ping times which haven't been updated for
+ this many seconds. Set to zero to disable this action.
+
+:Type: 64-bit Integer
+:Default: ``3600``
+
+
``osd mon report interval``
:Description: The number of seconds a Ceph OSD Daemon may wait
.set_default(30)
.set_description(""),
+ Option("osd_mon_heartbeat_stat_stale", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(1_hr)
+ .set_description("Stop reporting on heartbeat ping times not updated for this many seconds.")
+ .set_long_description("Stop reporting on old heartbeat information unless this is set to zero"),
+
Option("osd_mon_report_interval", Option::TYPE_INT, Option::LEVEL_ADVANCED)
.set_default(5)
.set_description("Frequency of OSD reports to mon for peer failures, fullness status changes"),
};
set<mgr_ping_time_t> sorted;
+ utime_t now = ceph_clock_now();
for (auto i : pg_map.osd_stat) {
for (auto j : i.second.hb_pingtime) {
+
+ if (j.second.last_update == 0)
+ continue;
+ auto stale_time = g_ceph_context->_conf.get_val<int64_t>("osd_mon_heartbeat_stat_stale");
+ if (now.sec() - j.second.last_update > stale_time) {
+ dout(20) << __func__ << " time out heartbeat for osd " << i.first
+ << " last_update " << j.second.last_update << dendl;
+ continue;
+ }
mgr_ping_time_t item;
item.pingtime = std::max(j.second.back_pingtime[0], j.second.back_pingtime[1]);
item.pingtime = std::max(item.pingtime, j.second.back_pingtime[2]);
/**
 * Refresh the cached OSD statistics and return a snapshot of them.
 *
 * Swaps the caller's heartbeat peer list into osd_stat, refreshes the op
 * queue age histogram and the PG count, then removes at most one heartbeat
 * ping-time entry whose last_update is more than
 * osd_mon_heartbeat_stat_stale seconds old.
 *
 * @param hb_peers heartbeat peer ids; swapped with the stored list, so on
 *                 return it holds the previously stored peers
 * @param num_pgs  PG count to record in osd_stat
 * @return a copy of the updated osd_stat, made while stat_lock is held
 */
osd_stat_t OSDService::set_osd_stat(vector<int>& hb_peers,
                                    int num_pgs)
{
  // Sample the clock and the option before taking stat_lock.
  utime_t now = ceph_clock_now();
  auto stale_time = g_conf().get_val<int64_t>("osd_mon_heartbeat_stat_stale");
  std::lock_guard l(stat_lock);
  osd_stat.hb_peers.swap(hb_peers);
  osd->op_tracker.get_age_ms_histogram(&osd_stat.op_queue_age_hist);
  osd_stat.num_pgs = num_pgs;
  // Clean entries that aren't updated.
  // This is called often enough that we can just remove 1 at a time.
  // A stale_time of zero disables the cleanup, so skip the scan entirely.
  if (stale_time) {
    for (auto it = osd_stat.hb_pingtime.begin();
         it != osd_stat.hb_pingtime.end(); ++it) {
      // Bind by reference: copying each entry under the lock is wasted work.
      const auto& peer = *it;
      // last_update == 0 entries are never expired here
      // (presumably "not yet updated" -- confirm against the writer side).
      if (peer.second.last_update == 0)
        continue;
      if (now.sec() - peer.second.last_update > stale_time) {
        dout(20) << __func__ << " time out heartbeat for osd " << peer.first
                 << " last_update " << peer.second.last_update << dendl;
        // Erase through the iterator and stop immediately; `it` and `peer`
        // are invalid after this point and must not be touched.
        osd_stat.hb_pingtime.erase(it);
        break;
      }
    }
  }
  return osd_stat;
}
map<int, osd_stat_t::Interfaces> *pingtimes = new map<int, osd_stat_t::Interfaces>;
service.get_hb_pingtime(pingtimes);
for (auto j : *pingtimes) {
+ if (j.second.last_update == 0)
+ continue;
osd_ping_time_t item;
item.pingtime = std::max(j.second.back_pingtime[0], j.second.back_pingtime[1]);
item.pingtime = std::max(item.pingtime, j.second.back_pingtime[2]);