From: Sage Weil Date: Thu, 4 Aug 2011 20:48:55 +0000 (-0700) Subject: osd: expect heartbeats from anyone peering depends on X-Git-Tag: v0.33~14^2 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=72b0851fcb11e13816b753b397f2f2fe2ed1a1d8;p=ceph.git osd: expect heartbeats from anyone peering depends on We were getting heartbeats from just acting replicas. That's really not enough if we want to be sure to detect failures of OSDs we depend on, which includes any stray or up OSDs as well. Signed-off-by: Sage Weil --- diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index 36a24ad24e8b..2c867dff2d6b 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -1305,6 +1305,30 @@ void OSD::update_osd_stat() dout(20) << "update_osd_stat " << osd_stat << dendl; } +void OSD::_add_heartbeat_source(int p, map<int, epoch_t>& old_from, map<int, utime_t>& old_from_stamp, + map<int, Connection*>& old_con) +{ + if (p == whoami) + return; + if (heartbeat_from.count(p)) + return; + + heartbeat_from[p] = osdmap->get_epoch(); + Connection *con = hbin_messenger->get_connection(osdmap->get_hb_inst(p)); + heartbeat_from_con[p] = con; + if (old_from_stamp.count(p) && old_from.count(p) && old_con[p] == con) { + // have a stamp _AND_ i'm not new to the set + heartbeat_from_stamp[p] = old_from_stamp[p]; + } else { + dout(10) << "update_heartbeat_peers: new _from osd" << p + << " " << con->get_peer_addr() << dendl; + heartbeat_from_stamp[p] = ceph_clock_now(g_ceph_context); + MOSDPing *m = new MOSDPing(osdmap->get_fsid(), 0, heartbeat_epoch, + MOSDPing::START_HEARTBEAT); + hbin_messenger->send_message(m, con); + } +} + void OSD::update_heartbeat_peers() { assert(osd_lock.is_locked()); @@ -1330,29 +1354,15 @@ void OSD::update_heartbeat_peers() i++) { PG *pg = i->second; - // replicas ping primary. + // replicas (new and old) ping primary. 
if (pg->get_role() == 0) { assert(pg->acting[0] == whoami); - for (unsigned i=1; i<pg->acting.size(); i++) { - int p = pg->acting[i]; // peer - assert(p != whoami); - if (heartbeat_from.count(p)) - continue; - heartbeat_from[p] = osdmap->get_epoch(); - Connection *con = hbin_messenger->get_connection(osdmap->get_hb_inst(p)); - heartbeat_from_con[p] = con; - if (old_from_stamp.count(p) && old_from.count(p) && old_con[p] == con) { - // have a stamp _AND_ i'm not new to the set - heartbeat_from_stamp[p] = old_from_stamp[p]; - } else { - dout(10) << "update_heartbeat_peers: new _from osd" << p - << " " << con->get_peer_addr() << dendl; - heartbeat_from_stamp[p] = now; - MOSDPing *m = new MOSDPing(osdmap->get_fsid(), 0, heartbeat_epoch, - MOSDPing::START_HEARTBEAT); - hbin_messenger->send_message(m, con); - } - } + for (unsigned i=0; i<pg->acting.size(); i++) + _add_heartbeat_source(pg->acting[i], old_from, old_from_stamp, old_con); + for (unsigned i=0; i<pg->up.size(); i++) + _add_heartbeat_source(pg->up[i], old_from, old_from_stamp, old_con); + for (map<int,PG::Info>::iterator p = pg->peer_info.begin(); p != pg->peer_info.end(); ++p) + _add_heartbeat_source(p->first, old_from, old_from_stamp, old_con); } } diff --git a/src/osd/OSD.h b/src/osd/OSD.h index b2bb3ec5e52e..54fd46c34c50 100644 --- a/src/osd/OSD.h +++ b/src/osd/OSD.h @@ -252,6 +252,8 @@ private: utime_t last_mon_heartbeat; Messenger *hbin_messenger, *hbout_messenger; + void _add_heartbeat_source(int p, map<int, epoch_t>& old_from, map<int, utime_t>& old_from_stamp, + map<int, Connection*>& old_con); void update_heartbeat_peers(); void reset_heartbeat_peers(); void heartbeat();