From: Sage Weil Date: Wed, 4 Aug 2010 16:51:12 +0000 (-0700) Subject: osd: fix heartbeat to/from map updates X-Git-Tag: v0.21.1~9 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=ef711e2eead039b9819b8380f7b1ea6ebd84160d;p=ceph.git osd: fix heartbeat to/from map updates - always make _inst match _to and _from (don't adjust in note_down_osd) - only mark down old hb connection in update_heartbeat_peers - if old peer isn't down, send them a map - don't print dup new/old messages Signed-off-by: Sage Weil --- diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index 76b9c7360d87..2e12390c1dfe 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -1156,6 +1156,15 @@ void OSD::update_heartbeat_peers() assert(osd_lock.is_locked()); heartbeat_lock.Lock(); + /* + for (map<int,epoch_t>::iterator p = heartbeat_to.begin(); p != heartbeat_to.end(); p++) + if (heartbeat_inst.count(p->first) == 0) + dout(0) << " no inst for _to " << p->first << dendl; + for (map<int,epoch_t>::iterator p = heartbeat_from.begin(); p != heartbeat_from.end(); p++) + if (heartbeat_inst.count(p->first) == 0) + dout(0) << " no inst for _from " << p->first << dendl; + */ + // filter heartbeat_from_stamp to only include osds that remain in // heartbeat_from. 
map<int,utime_t> old_from_stamp; @@ -1177,6 +1186,8 @@ void OSD::update_heartbeat_peers() if (pg->get_role() > 0) { assert(pg->acting.size() > 1); int p = pg->acting[0]; + if (heartbeat_to.count(p)) + continue; heartbeat_to[p] = osdmap->get_epoch(); heartbeat_inst[p] = osdmap->get_hb_inst(p); if (old_to.count(p) == 0 || old_inst[p] != heartbeat_inst[p]) @@ -1187,6 +1198,8 @@ void OSD::update_heartbeat_peers() for (unsigned i=1; i<pg->acting.size(); i++) { int p = pg->acting[i]; // peer assert(p != whoami); + if (heartbeat_from.count(p)) + continue; heartbeat_from[p] = osdmap->get_epoch(); heartbeat_inst[p] = osdmap->get_hb_inst(p); if (old_from_stamp.count(p) && old_from.count(p) && @@ -1202,15 +1215,21 @@ void OSD::update_heartbeat_peers() for (map<int,epoch_t>::iterator p = old_to.begin(); p != old_to.end(); p++) { + assert(old_inst.count(p->first)); + if (heartbeat_to.count(p->first)) + continue; if (p->second > osdmap->get_epoch()) { dout(10) << "update_heartbeat_peers: keeping newer _to peer " << old_inst[p->first] << " as of " << p->second << dendl; heartbeat_to[p->first] = p->second; heartbeat_inst[p->first] = old_inst[p->first]; - } else if (p->second < osdmap->get_epoch() && - (!osdmap->is_up(p->first) || - osdmap->get_hb_inst(p->first) != old_inst[p->first])) { - dout(10) << "update_heartbeat_peers: marking down old _to peer " << old_inst[p->first] + } else if (osdmap->is_down(p->first) || + osdmap->get_hb_inst(p->first) != old_inst[p->first]) { + dout(10) << "update_heartbeat_peers: marking down old down _to peer " << old_inst[p->first] + << " as of " << p->second << dendl; + heartbeat_messenger->mark_down(old_inst[p->first].addr); + } else { + dout(10) << "update_heartbeat_peers: sharing map with old _to peer " << old_inst[p->first] << " as of " << p->second << dendl; // share latest map with this peer, so they know not to expect // heartbeats from us. otherwise they may mark us down! 
@@ -1231,6 +1250,15 @@ void OSD::update_heartbeat_peers() dout(10) << "update_heartbeat_peers: hb to: " << heartbeat_to << dendl; dout(10) << "update_heartbeat_peers: hb from: " << heartbeat_from << dendl; + /* + for (map<int,epoch_t>::iterator p = heartbeat_to.begin(); p != heartbeat_to.end(); p++) + if (heartbeat_inst.count(p->first) == 0) + dout(0) << " no inst for _to " << p->first << dendl; + for (map<int,epoch_t>::iterator p = heartbeat_from.begin(); p != heartbeat_from.end(); p++) + if (heartbeat_inst.count(p->first) == 0) + dout(0) << " no inst for _from " << p->first << dendl; + */ + heartbeat_lock.Unlock(); } @@ -2109,23 +2137,11 @@ void OSD::note_down_osd(int osd) heartbeat_lock.Lock(); - heartbeat_messenger->mark_down(osdmap->get_hb_addr(osd)); - - if (heartbeat_inst.count(osd)) { - if (heartbeat_inst[osd] == osdmap->get_hb_inst(osd)) { - dout(10) << "note_down_osd removing heartbeat_inst " << heartbeat_inst[osd] << dendl; - heartbeat_inst.erase(osd); - } else { - dout(10) << "note_down_osd leaving heartbeat_inst " << heartbeat_inst[osd] - << " != " << osdmap->get_hb_inst(osd) << dendl; - } - } else - dout(10) << "note_down_osd no heartbeat_inst for osd" << osd << dendl; + // note: update_heartbeat_peers will mark down the heartbeat connection. peer_map_epoch.erase(entity_name_t::OSD(osd)); failure_queue.erase(osd); failure_pending.erase(osd); - heartbeat_from_stamp.erase(osd); heartbeat_lock.Unlock(); }