From: Sage Weil Date: Sat, 22 Dec 2012 00:47:50 +0000 (-0800) Subject: osd: fix pg stat msgs vs timeout X-Git-Tag: v0.56~27 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=8f5de156056de78c90f1dc7bf7c5a131c32c1bb8;p=ceph.git osd: fix pg stat msgs vs timeout We can get a pattern like so: - new mon session - after say 120 seconds, we decide to send a stats msg - outstanding_pg_stats is finally true, we immediately time out (30 second grace), and reconnect to a new mon -> repeat The problem is that we don't reset the last_sent timestamp when we send. Or that we do this check after sending instead of before. Fix both. This should resolve the issue #3661 where osds that don't have pgs updating are not sending stats messages to the mon to check in, and are eventually getting marked down as a result. Signed-off-by: Sage Weil Reviewed-by: Samuel Just --- diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index ff9992a1c32..b71227d0f33 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -2241,23 +2241,22 @@ void OSD::tick() // mon report? 
utime_t now = ceph_clock_now(g_ceph_context); + if (outstanding_pg_stats && + (now - g_conf->osd_mon_ack_timeout) > last_pg_stats_ack) { + dout(1) << "mon hasn't acked PGStats in " << now - last_pg_stats_ack + << " seconds, reconnecting elsewhere" << dendl; + monc->reopen_session(); + last_pg_stats_ack = ceph_clock_now(g_ceph_context); // reset clock + last_pg_stats_sent = utime_t(); + } if (now - last_pg_stats_sent > g_conf->osd_mon_report_interval_max) { osd_stat_updated = true; do_mon_report(); - } - else if (now - last_mon_report > g_conf->osd_mon_report_interval_min) { + } else if (now - last_mon_report > g_conf->osd_mon_report_interval_min) { do_mon_report(); } map_lock.put_read(); - - if (outstanding_pg_stats - &&(now - g_conf->osd_mon_ack_timeout) > last_pg_stats_ack) { - dout(1) << "mon hasn't acked PGStats in " << now - last_pg_stats_ack - << " seconds, reconnecting elsewhere" << dendl; - monc->reopen_session(); - last_pg_stats_ack = ceph_clock_now(g_ceph_context); // reset clock - } } // only do waiters if dispatch() isn't currently running. (if it is,