From: Sage Weil Date: Tue, 25 Jan 2011 23:28:49 +0000 (-0800) Subject: osd: restart if the osdmap client, heartbeat, OR cluster addrs don't match X-Git-Tag: v0.24.3~18 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=7d65f6eabe916cc920da5684627ab1690b0df724;p=ceph.git osd: restart if the osdmap client, heartbeat, OR cluster addrs don't match If we somehow get ourselves into a situation where the OSDMap addresses do not match our actual addresses, restart and try again. This is still possible if multiple MOSDBoot messages end up in flight in the monitor, say due to a monitor disconnect/reconnect, and we race with something that marks us down in the map. Signed-off-by: Sage Weil --- diff --git a/src/msg/msg_types.h b/src/msg/msg_types.h index c4adf943799f1..2588aea23af2f 100644 --- a/src/msg/msg_types.h +++ b/src/msg/msg_types.h @@ -202,6 +202,18 @@ struct entity_addr_t { return a; } + bool probably_equals(const entity_addr_t &o) const { + if (get_port() != o.get_port()) + return false; + if (get_nonce() != o.get_nonce()) + return false; + if (is_blank_addr() || o.is_blank_addr()) + return true; + if (memcmp(&addr, &o.addr, sizeof(addr)) == 0) + return true; + return false; + } + bool is_same_host(const entity_addr_t &o) const { if (addr.ss_family != o.addr.ss_family) return false; @@ -214,7 +226,7 @@ struct entity_addr_t { return false; } - bool is_blank_addr() { + bool is_blank_addr() const { switch (addr.ss_family) { case AF_INET: return addr4.sin_addr.s_addr == INADDR_ANY; diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index 5c9e33cfde191..4cdd8eff1a300 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -2747,9 +2747,22 @@ void OSD::handle_osd_map(MOSDMap *m) dout(0) << "map says i do not exist. shutting down." 
<< dendl; do_shutdown = true; // don't call shutdown() while we have everything paused } else if (!osdmap->is_up(whoami) || - osdmap->get_addr(whoami) != client_messenger->get_myaddr()) { + !osdmap->get_addr(whoami).probably_equals(client_messenger->get_myaddr()) || + !osdmap->get_cluster_addr(whoami).probably_equals(cluster_messenger->get_myaddr()) || + !osdmap->get_hb_addr(whoami).probably_equals(heartbeat_messenger->get_myaddr())) { stringstream ss; - ss << "map e" << osdmap->get_epoch() << " wrongly marked me down"; + ss << "map e" << osdmap->get_epoch(); /* then name the first check that failed, in the same order as the condition above */ + if (!osdmap->is_up(whoami)) + ss << " wrongly marked me down or wrong addr"; + else if (!osdmap->get_addr(whoami).probably_equals(client_messenger->get_myaddr())) + ss << " had wrong client addr (" << osdmap->get_addr(whoami) + << " != my " << client_messenger->get_myaddr() << ")"; + else if (!osdmap->get_cluster_addr(whoami).probably_equals(cluster_messenger->get_myaddr())) + ss << " had wrong cluster addr (" << osdmap->get_cluster_addr(whoami) + << " != my " << cluster_messenger->get_myaddr() << ")"; + else if (!osdmap->get_hb_addr(whoami).probably_equals(heartbeat_messenger->get_myaddr())) + ss << " had wrong heartbeat addr (" << osdmap->get_hb_addr(whoami) + << " != my " << heartbeat_messenger->get_myaddr() << ")"; logclient.log(LOG_WARN, ss); state = STATE_BOOTING;