From 7a574d88aeb41db2aed233b72ca96bc9915846a8 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 20 May 2011 10:42:16 -0700 Subject: [PATCH] osd: do not clobber explicitly requested heartbeat_to target addresss Consider peer P. - P does down in, say, epoch 60, and back up in epoch 70 - P and requests a heartbeat, as_of 70 - We update to map 50, and coincidentally add the same peer as a target - We set the heartbeat_to[P] = 50 and start sending to the _old_ address - P marks us down because we stop sending to the new addr - We eventually get map 70, but it's too late! Make sure we preserve any _to targets _and_ their epoch+inst. Signed-off-by: Sage Weil --- src/osd/OSD.cc | 50 ++++++++++++++++++++++++++++---------------------- 1 file changed, 28 insertions(+), 22 deletions(-) diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index 369e138d553..7a3ae4fbeed 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -1415,6 +1415,18 @@ void OSD::update_heartbeat_peers() heartbeat_epoch = osdmap->get_epoch(); + // grandfather newer _to peers + for (map::iterator p = old_to.begin(); + p != old_to.end(); + p++) { + if (p->second > osdmap->get_epoch()) { + dout(10) << "update_heartbeat_peers: keeping newer _to peer " << old_inst[p->first] + << " as of " << p->second << dendl; + heartbeat_to[p->first] = p->second; + heartbeat_inst[p->first] = old_inst[p->first]; + } + } + // build heartbeat to/from set for (hash_map::iterator i = pg_map.begin(); i != pg_map.end(); @@ -1461,30 +1473,24 @@ void OSD::update_heartbeat_peers() assert(old_inst.count(p->first)); if (heartbeat_to.count(p->first)) continue; - if (p->second > osdmap->get_epoch()) { - dout(10) << "update_heartbeat_peers: keeping newer _to peer " << old_inst[p->first] - << " as of " << p->second << dendl; - heartbeat_to[p->first] = p->second; - heartbeat_inst[p->first] = old_inst[p->first]; + assert(p->second <= osdmap->get_epoch()); + if (heartbeat_from.count(p->first) && old_inst[p->first] == heartbeat_inst[p->first]) { + dout(10) << "update_heartbeat_peers: old _to peer " << old_inst[p->first] + << " is still a _from peer, not marking down" << dendl; } else { - if (heartbeat_from.count(p->first) && old_inst[p->first] == heartbeat_inst[p->first]) { - dout(10) << "update_heartbeat_peers: old _to peer " << old_inst[p->first] - << " is still a _from peer, not marking down" << dendl; - } else { - dout(10) << "update_heartbeat_peers: marking down old _to peer " << old_inst[p->first] - << " as of " << p->second << dendl; - heartbeat_messenger->mark_down(old_inst[p->first].addr); - } + dout(10) << "update_heartbeat_peers: marking down old _to peer " << old_inst[p->first] + << " as of " << p->second << dendl; + heartbeat_messenger->mark_down(old_inst[p->first].addr); + } - if (!osdmap->is_down(p->first) && - osdmap->get_hb_inst(p->first) == old_inst[p->first]) { - dout(10) << "update_heartbeat_peers: sharing map with old _to peer " << old_inst[p->first] - << " as of " << p->second << dendl; - // share latest map with this peer (via the cluster link, NOT - // the heartbeat link), so they know not to expect heartbeats - // from us. otherwise they may mark us down! - _share_map_outgoing(osdmap->get_cluster_inst(p->first)); - } + if (!osdmap->is_down(p->first) && + osdmap->get_hb_inst(p->first) == old_inst[p->first]) { + dout(10) << "update_heartbeat_peers: sharing map with old _to peer " << old_inst[p->first] + << " as of " << p->second << dendl; + // share latest map with this peer (via the cluster link, NOT + // the heartbeat link), so they know not to expect heartbeats + // from us. otherwise they may mark us down! + _share_map_outgoing(osdmap->get_cluster_inst(p->first)); } } for (map::iterator p = old_from.begin(); -- 2.47.3