From 9061988ec7eaa922e2b303d9eece86e7c8ee0fa1 Mon Sep 17 00:00:00 2001 From: "Ma, Jianpeng" Date: Mon, 14 Jul 2014 03:17:14 +0000 Subject: [PATCH] osd: init local_connection for fast_dispatch in _send_boot() We were not properly setting up Sessions on the local_connection for fast_dispatch'ed Messages if the cluster_addr was set explicitly: the OSD was not in the dispatch list at bind() time (in ceph_osd.cc), and nothing called it later on. This issue was missed in testing because Inktank only uses unified NICs. That led to errors like the following: When doing an EC read, I hit a bug that occurred 100% of the time. The messages are: 2014-07-14 10:03:07.318681 7f7654f6e700 -1 osd/OSD.cc: In function 'virtual void OSD::ms_fast_dispatch(Message*)' thread 7f7654f6e700 time 2014-07-14 10:03:07.316782 osd/OSD.cc: 5019: FAILED assert(session) ceph version 0.82-585-g79f3f67 (79f3f6749122ce2944baa70541949d7ca75525e6) 1: (OSD::ms_fast_dispatch(Message*)+0x286) [0x6544b6] 2: (DispatchQueue::fast_dispatch(Message*)+0x56) [0xb059d6] 3: (DispatchQueue::run_local_delivery()+0x6b) [0xb08e0b] 4: (DispatchQueue::LocalDeliveryThread::entry()+0xd) [0xa4a5fd] 5: (()+0x8182) [0x7f7665670182] 6: (clone()+0x6d) [0x7f7663a1130d] NOTE: a copy of the executable, or `objdump -rdS <executable>` is needed to interpret this. To resolve this, we have the OSD invoke ms_handle_fast_connect() explicitly in _send_boot(). It's not really an appropriate location, but we're already doing a bunch of messenger twiddling there, so it's acceptable for now. 
Signed-off-by: Ma Jianpeng Reviewed-by: Greg Farnum --- src/osd/OSD.cc | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index f211557cab4ca..1a5a395cc9d90 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -3852,29 +3852,37 @@ void OSD::_send_boot() { dout(10) << "_send_boot" << dendl; entity_addr_t cluster_addr = cluster_messenger->get_myaddr(); + Connection *local_connection = cluster_messenger->get_loopback_connection().get(); if (cluster_addr.is_blank_ip()) { int port = cluster_addr.get_port(); cluster_addr = client_messenger->get_myaddr(); cluster_addr.set_port(port); cluster_messenger->set_addr_unknowns(cluster_addr); dout(10) << " assuming cluster_addr ip matches client_addr" << dendl; - } + } else if (local_connection->get_priv() == NULL) + cluster_messenger->ms_deliver_handle_fast_connect(local_connection); + entity_addr_t hb_back_addr = hb_back_server_messenger->get_myaddr(); + local_connection = hb_back_server_messenger->get_loopback_connection().get(); if (hb_back_addr.is_blank_ip()) { int port = hb_back_addr.get_port(); hb_back_addr = cluster_addr; hb_back_addr.set_port(port); hb_back_server_messenger->set_addr_unknowns(hb_back_addr); dout(10) << " assuming hb_back_addr ip matches cluster_addr" << dendl; - } + } else if (local_connection->get_priv() == NULL) + hb_back_server_messenger->ms_deliver_handle_fast_connect(local_connection); + entity_addr_t hb_front_addr = hb_front_server_messenger->get_myaddr(); + local_connection = hb_front_server_messenger->get_loopback_connection().get(); if (hb_front_addr.is_blank_ip()) { int port = hb_front_addr.get_port(); hb_front_addr = client_messenger->get_myaddr(); hb_front_addr.set_port(port); hb_front_server_messenger->set_addr_unknowns(hb_front_addr); dout(10) << " assuming hb_front_addr ip matches client_addr" << dendl; - } + } else if (local_connection->get_priv() == NULL) + 
hb_front_server_messenger->ms_deliver_handle_fast_connect(local_connection); MOSDBoot *mboot = new MOSDBoot(superblock, service.get_boot_epoch(), hb_back_addr, hb_front_addr, cluster_addr); -- 2.39.5