From 622e9b479e1ea3ecc86a00166d722f302da28ae4 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 22 Jul 2010 15:55:23 -0700 Subject: [PATCH] osd: simplify heartbeat checks - Only check heartbeats when we have heartbeat_lock and osdmap rdlocked, and thus _know_ heartbeat info and map are in sync. Drop unnecessary consistency checks. - Check heartbeats in tick(), since the heartbeat thread may miss it if it's unlucky. --- src/osd/OSD.cc | 59 ++++++++++++++++++++++++++++++-------------------- src/osd/OSD.h | 1 + 2 files changed, 36 insertions(+), 24 deletions(-) diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index 762016b1f8565..b91e62cbdec66 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -1264,6 +1264,27 @@ void OSD::heartbeat_entry() heartbeat_lock.Unlock(); } +void OSD::heartbeat_check() +{ + assert(heartbeat_lock.is_locked()); + // we should also have map_lock rdlocked. + + // check for incoming heartbeats (move me elsewhere?) + utime_t grace = g_clock.now(); + grace -= g_conf.osd_heartbeat_grace; + for (map::iterator p = heartbeat_from.begin(); + p != heartbeat_from.end(); + p++) { + if (heartbeat_from_stamp.count(p->first) && + heartbeat_from_stamp[p->first] < grace) { + dout(0) << "heartbeat_check: no heartbeat from osd" << p->first + << " since " << heartbeat_from_stamp[p->first] + << " (cutoff " << grace << ")" << dendl; + queue_failure(p->first); + } + } +} + void OSD::heartbeat() { utime_t now = g_clock.now(); @@ -1318,37 +1339,23 @@ void OSD::heartbeat() } } - // check for incoming heartbeats (move me elsewhere?) - utime_t grace = now; - grace -= g_conf.osd_heartbeat_grace; + // request heartbeats? for (map::iterator p = heartbeat_from.begin(); p != heartbeat_from.end(); p++) { - if (heartbeat_from_stamp.count(p->first)) { - if (!osdmap->is_up(p->first)) { - dout(10) << "not checking timeout on down osd" << p->first << dendl; - } else if (osdmap->get_hb_inst(p->first) != heartbeat_inst[p->first]) { - dout(10) << "not checking timeout on osd" << p->first - << " hb inst " << heartbeat_inst[p->first] - << " != map's " << osdmap->get_hb_inst(p->first) - << dendl; - } else if (heartbeat_from_stamp[p->first] < grace) { - dout(0) << "no heartbeat from osd" << p->first - << " since " << heartbeat_from_stamp[p->first] - << " (cutoff " << grace << ")" << dendl; - queue_failure(p->first); - } - } else { + if (heartbeat_from_stamp.count(p->first) == 0) { // fake initial stamp. and send them a ping so they know we expect it. - if (heartbeat_inst.count(p->first)) { - heartbeat_from_stamp[p->first] = now; - Message *m = new MOSDPing(osdmap->get_fsid(), 0, heartbeat_epoch, my_stat, true); // request ack - m->set_priority(CEPH_MSG_PRIO_HIGH); - heartbeat_messenger->send_message(m, heartbeat_inst[p->first]); - } + dout(10) << "requesting heartbeats from " << heartbeat_inst[p->first] << dendl; + heartbeat_from_stamp[p->first] = now; + Message *m = new MOSDPing(osdmap->get_fsid(), 0, heartbeat_epoch, my_stat, true); // request ack + m->set_priority(CEPH_MSG_PRIO_HIGH); + heartbeat_messenger->send_message(m, heartbeat_inst[p->first]); } } + if (map_locked) + heartbeat_check(); + if (logger) logger->set(l_osd_hbto, heartbeat_to.size()); if (logger) logger->set(l_osd_hbfrom, heartbeat_from.size()); @@ -1389,6 +1396,10 @@ void OSD::tick() map_lock.get_read(); + heartbeat_lock.Lock(); + heartbeat_check(); + heartbeat_lock.Unlock(); + check_replay_queue(); // mon report? diff --git a/src/osd/OSD.h b/src/osd/OSD.h index 100370bfef298..b9e2ce59cbdde 100644 --- a/src/osd/OSD.h +++ b/src/osd/OSD.h @@ -213,6 +213,7 @@ private: void update_heartbeat_peers(); void reset_heartbeat_peers(); void heartbeat(); + void heartbeat_check(); void heartbeat_entry(); struct T_Heartbeat : public Thread { -- 2.39.5