From 991354f17c061ef1a186e4858c6fb892128d4c6d Mon Sep 17 00:00:00 2001 From: xie xingguo Date: Mon, 3 Feb 2020 14:56:38 +0800 Subject: [PATCH] osd/OSD: choose more heartbeat peers from different subtrees We want to avoid a situation like: - host.A consists of OSDs from 0 to 10 - cut off the network of host.A from the rest of the cluster - osd.1 is marked down when enough votes have been collected by the mon - osd.1 re-selects osd.0,2,3,..., and two extra osds from two different hosts as heartbeat peers - osd.1 finds more than 1/3 of its heartbeat peers pingable, e.g., because they belong to the same host.A, and will try to mark itself up again, which in turn may cause longer client op latency. Fix by (always) trying to select as many heartbeat peers from different subtrees as possible instead. Signed-off-by: xie xingguo --- src/osd/OSD.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index 54aa67074f3e..3c731b329b58 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -5144,8 +5144,9 @@ void OSD::maybe_update_heartbeat_peers() // subtree level (e.g., hosts) for fast failure detection. auto min_down = cct->_conf.get_val("mon_osd_min_down_reporters"); auto subtree = cct->_conf.get_val("mon_osd_reporter_subtree_level"); + auto limit = std::max(min_down, (uint64_t)cct->_conf->osd_heartbeat_min_peers); osdmap->get_random_up_osds_by_subtree( - whoami, subtree, min_down, want, &want); + whoami, subtree, limit, want, &want); for (set::iterator p = want.begin(); p != want.end(); ++p) { dout(10) << " adding neighbor peer osd." << *p << dendl; -- 2.47.3