OPTION(osd_mon_report_interval_min, OPT_INT, 5) // pg stats, failures, up_thru, boot.
OPTION(osd_pg_stat_report_interval_max, OPT_INT, 500) // report pg stats for any given pg at least this often
OPTION(osd_mon_ack_timeout, OPT_INT, 30) // time out a mon if it doesn't ack stats; initial value and lower bound of the adaptive stats ack timeout
+OPTION(osd_stats_ack_timeout_factor, OPT_DOUBLE, 2.0) // back-off: the current stats ack timeout is multiplied by this each time the mon times out (never below osd_mon_ack_timeout)
+OPTION(osd_stats_ack_timeout_decay, OPT_DOUBLE, .9) // decay: the current stats ack timeout is multiplied by this to shrink it again (never below osd_mon_ack_timeout)
OPTION(osd_default_data_pool_replay_window, OPT_INT, 45)
OPTION(osd_preserve_trimmed_log, OPT_BOOL, false)
OPTION(osd_auto_mark_unfound_lost, OPT_BOOL, false)
debug_drop_pg_create_probability(cct->_conf->osd_debug_drop_pg_create_probability),
debug_drop_pg_create_duration(cct->_conf->osd_debug_drop_pg_create_duration),
debug_drop_pg_create_left(-1),
+ stats_ack_timeout(cct->_conf->osd_mon_ack_timeout),
outstanding_pg_stats(false),
timeout_mon_on_pg_stats(true),
up_thru_wanted(0), up_thru_pending(0),
// mon report?
utime_t now = ceph_clock_now(cct);
if (outstanding_pg_stats && timeout_mon_on_pg_stats &&
- (now - cct->_conf->osd_mon_ack_timeout) > last_pg_stats_ack) {
- dout(1) << "mon hasn't acked PGStats in " << now - last_pg_stats_ack
+ (now - stats_ack_timeout) > last_pg_stats_ack) {
+ dout(1) << __func__ << " mon hasn't acked PGStats in "
+ << now - last_pg_stats_ack
<< " seconds, reconnecting elsewhere" << dendl;
monc->reopen_session(new C_MonStatsAckTimer(this));
timeout_mon_on_pg_stats = false;
last_pg_stats_ack = ceph_clock_now(cct); // reset clock
last_pg_stats_sent = utime_t();
+ stats_ack_timeout =
+ MAX(g_conf->osd_mon_ack_timeout,
+ stats_ack_timeout * g_conf->osd_stats_ack_timeout_factor);
}
if (now - last_pg_stats_sent > cct->_conf->osd_mon_report_interval_max) {
osd_stat_updated = true;
last_pg_stats_ack = ceph_clock_now(cct);
+ // decay timeout slowly (analogous to TCP)
+ stats_ack_timeout =
+ MAX(g_conf->osd_mon_ack_timeout,
+ stats_ack_timeout * g_conf->osd_stats_ack_timeout_decay);
+ dout(20) << __func__ << " timeout now " << stats_ack_timeout << dendl;
+
pg_stat_queue_lock.Lock();
if (ack->get_tid() > pg_stat_tid_flushed) {