OPTION(osd_heartbeat_grace, OPT_INT, 20)
OPTION(osd_mon_report_interval_max, OPT_INT, 120)
OPTION(osd_mon_report_interval_min, OPT_INT, 5) // pg stats, failures, up_thru, boot.
+OPTION(osd_mon_ack_timeout, OPT_INT, 30) // seconds to wait for a mon to ack pg stats before the OSD reopens its mon session
OPTION(osd_min_down_reporters, OPT_INT, 1) // number of OSDs who need to report a down OSD for it to count
OPTION(osd_min_down_reports, OPT_INT, 3) // number of times a down OSD must be reported for it to count
OPTION(osd_default_data_pool_replay_window, OPT_INT, 45)
map_lock("OSD::map_lock"),
peer_map_epoch_lock("OSD::peer_map_epoch_lock"),
map_cache_lock("OSD::map_cache_lock"),
+ outstanding_pg_stats(false),
up_thru_wanted(0), up_thru_pending(0),
pg_stat_queue_lock("OSD::pg_stat_queue_lock"),
osd_stat_updated(false),
timer.add_event_after(1.0, new C_Tick(this));
+ // If stat reports are awaiting an ack and the mon has been silent for longer
+ // than osd_mon_ack_timeout, give up on this mon and reopen the session.
+ if (outstanding_pg_stats
+ && (now - g_conf->osd_mon_ack_timeout) > last_pg_stats_ack) {
+ dout(1) << "mon hasn't acked PGStats in " << now - last_pg_stats_ack
+ << " seconds, reconnecting elsewhere" << dendl;
+ monc->reopen_session();
+ // Restart the ack clock: without this the condition above stays true and
+ // every subsequent check would reopen the session again before the new
+ // mon has had a chance to ack anything.
+ last_pg_stats_ack = ceph_clock_now(g_ceph_context);
+ }
+
// only do waiters if dispatch() isn't currently running. (if it is,
// it'll do the waiters, and doing them here may screw up ordering
// of op_queue vs handle_osd_map.)
}
pg->pg_stats_lock.Unlock();
}
-
+
+ if (!outstanding_pg_stats) { // nothing was awaiting an ack: start the ack clock with this send
+ outstanding_pg_stats = true;
+ last_pg_stats_ack = ceph_clock_now(g_ceph_context);
+ }
monc->send_mon_message(m);
}
return;
}
+ last_pg_stats_ack = ceph_clock_now(g_ceph_context);
+
pg_stat_queue_lock.Lock();
if (ack->get_tid() > pg_stat_tid_flushed) {
dout(30) << " still pending " << pg->info.pgid << " " << pg->pg_stats_stable.reported << dendl;
}
+ if (!pg_stat_queue.size()) { // stat queue is empty: stop the ack-timeout check until the next send
+ outstanding_pg_stats = false;
+ }
+
pg_stat_queue_lock.Unlock();
ack->put();
utime_t last_mon_report;
utime_t last_pg_stats_sent;
+ /* if our monitor dies, we want to notice it and reconnect.
+ * So we keep track of when it last acked our stat updates,
+ * and if too much time passes (and we've been sending
+ * more updates) then we can call it dead and reconnect
+ * elsewhere.
+ */
+ utime_t last_pg_stats_ack; // time of the last ack, or of the send that first left stats unacked
+ bool outstanding_pg_stats; // some stat updates haven't been acked yet
+
void do_mon_report();
// -- boot --