From: Sage Weil
Date: Wed, 29 May 2013 03:39:30 +0000 (-0700)
Subject: osd: fix hb con failure handler
X-Git-Tag: v0.64~42^2~1
X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=45b84f39ba6c0e0b7cbbeef386105a0a8fe3a14b;p=ceph.git

osd: fix hb con failure handler

Fix a few bugs introduced by 27381c0c6259ac89f5f9c592b4bfb585937a1cfc:

- check against both front and back cons; either one may have failed.
- close *both* front and back before reopening either. this is overkill,
  but slightly simpler code.
- fix leak of con when marking down
- handle race against osdmap update and note_down_osd

Fixes: #5172
Signed-off-by: Sage Weil
---

diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc
index e725e97e8221..0915a08190c7 100644
--- a/src/osd/OSD.cc
+++ b/src/osd/OSD.cc
@@ -2622,29 +2622,37 @@ bool OSD::heartbeat_reset(Connection *con)
   }
   map::iterator p = heartbeat_peers.find(s->peer);
   if (p != heartbeat_peers.end() &&
-      p->second.con_back == con) {
-    pair newcon = service.get_con_osd_hb(p->second.peer, p->second.epoch);
-    if (!newcon.first) {
-      dout(10) << "heartbeat_reset reopen failed hb con " << con << " but failed to reopen" << dendl;
-    } else {
-      dout(10) << "heartbeat_reset reopen failed hb con " << con << dendl;
+      (p->second.con_back == con ||
+       p->second.con_front == con)) {
+    dout(10) << "heartbeat_reset failed hb con " << con << " for osd." << p->second.peer
+             << ", reopening" << dendl;
+    if (con != p->second.con_back) {
       hbclient_messenger->mark_down(p->second.con_back);
+      p->second.con_back->put();
+    }
+    p->second.con_back = NULL;
+    if (p->second.con_front && con != p->second.con_front) {
+      hbclient_messenger->mark_down(p->second.con_front);
+      p->second.con_front->put();
+    }
+    p->second.con_front = NULL;
+    pair newcon = service.get_con_osd_hb(p->second.peer, p->second.epoch);
+    if (newcon.first) {
       p->second.con_back = newcon.first.get();
       p->second.con_back->get();
       p->second.con_back->set_priv(s);
-      if (p->second.con_front)
-        hbclient_messenger->mark_down(p->second.con_front);
       if (newcon.second) {
         p->second.con_front = newcon.second.get();
         p->second.con_front->get();
         p->second.con_front->set_priv(s->get());
-      } else {
-        p->second.con_front = NULL;
       }
+    } else {
+      dout(10) << "heartbeat_reset failed hb con " << con << " for osd." << p->second.peer
+               << ", raced with osdmap update, closing out peer" << dendl;
+      heartbeat_peers.erase(p);
     }
   } else {
     dout(10) << "heartbeat_reset closing (old) failed hb con " << con << dendl;
-    hbclient_messenger->mark_down(con);
   }
   heartbeat_lock.Unlock();
   s->put();