From: Hu-Yuxuan <1024252105@qq.com> Date: Mon, 9 Oct 2023 09:49:54 +0000 (+0800) Subject: osd/OSDMap: skip upmap-items drop when um_from would become equally overfull X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=2ba9e1da7c3b767e3d34b5238508bb78e2f064f5;p=ceph.git osd/OSDMap: skip upmap-items drop when um_from would become equally overfull Improves pg-upmap balancer convergence speed on clusters where many OSDs are mutually overfull and linked by pg_upmap_items -- a common pattern after a scale-up that increases replica count. When try_drop_remap_overfull finds an incoming [um_from -> osd] pair on an overfull osd, it called test_change to check whether dropping the pair would improve distribution. If um_from carries the same excess load as osd, though, dropping just returns the PG to an equally crowded OSD -- test_change rejects the move anyway. With many such mutual pairs, the wasted calls add up to minutes before the balancer gives up. This adds an early check: skip test_change when um_from would end up at least as overfull as osd after receiving the PG back. This eliminates O(n_cohort * R) wasted test_change calls per balancer pass and reduces calc_pg_upmaps from minutes to milliseconds in the affected scenario. Fixes: https://tracker.ceph.com/issues/63137 Signed-off-by: Hu-Yuxuan <1024252105@qq.com> Co-authored-by: Kefu Chai --- diff --git a/src/osd/OSDMap.cc b/src/osd/OSDMap.cc index 5583369c0080..6ac703039ecc 100644 --- a/src/osd/OSDMap.cc +++ b/src/osd/OSDMap.cc @@ -5901,7 +5901,7 @@ int OSDMap::calc_pg_upmaps( } // look for remaps we can un-remap if (try_drop_remap_overfull(cct, pgs, tmp_osd_map, osd, - temp_pgs_by_osd, to_unmap, to_upmap)) + temp_pgs_by_osd, to_unmap, to_upmap, osd_deviation)) goto test_change; // try upmap @@ -6352,23 +6352,27 @@ bool OSDMap::try_drop_remap_overfull( int osd, map>& temp_pgs_by_osd, set& to_unmap, - map>>& to_upmap) + map>>& to_upmap, + const map& osd_deviation) { // // This function tries to drop existimg upmap items which map data to overfull // OSDs. It updates temp_pgs_by_osd, to_unmap and to_upmap and rerturns true // if it found an item that can be dropped, false if not. // + const float osd_dev = osd_deviation.at(osd); for (auto pg : pgs) { auto p = tmp_osd_map.pg_upmap_items.find(pg); if (p == tmp_osd_map.pg_upmap_items.end()) continue; mempool::osdmap::vector> new_upmap_items; auto& pg_upmap_items = p->second; - for (auto um_pair : pg_upmap_items) { + for (auto& um_pair : pg_upmap_items) { auto& um_from = um_pair.first; auto& um_to = um_pair.second; - if (um_to == osd) { + // +1: dropping this pair moves one PG from um_to back to um_from. Skip + // the drop if it would leave um_from at least as overfull as osd is today. + if (um_to == osd && osd_deviation.at(um_from) + 1 < osd_dev) { ldout(cct, 10) << " will try dropping existing" << " remapping pair " << um_from << " -> " << um_to diff --git a/src/osd/OSDMap.h b/src/osd/OSDMap.h index 896407235ec3..bbed6cf70abd 100644 --- a/src/osd/OSDMap.h +++ b/src/osd/OSDMap.h @@ -1631,7 +1631,8 @@ private: // Bunch of internal functions used only by calc_pg_upmaps (result of c int osd, std::map>& temp_pgs_by_osd, std::set& to_unmap, - std::map>>& to_upmap + std::map>>& to_upmap, + const std::map& osd_deviation ); typedef std::vector>>> diff --git a/src/test/osd/TestOSDMap.cc b/src/test/osd/TestOSDMap.cc index 4d36d592a94c..db9a3c210024 100644 --- a/src/test/osd/TestOSDMap.cc +++ b/src/test/osd/TestOSDMap.cc @@ -1843,6 +1843,88 @@ TEST_F(OSDMapTest, BUG_40104) { } } +TEST_F(OSDMapTest, TryDropRemapOverfullBothOverfull) { + // https://tracker.ceph.com/issues/63137 + // + // try_drop_remap_overfull must not drop a [um_from -> osd] pair when + // um_from is itself overfull: the drop returns the PG to an OSD that + // already carries too many PGs, providing no net improvement. + // + // Setup (size=1, 12 PGs, 3 OSDs, target=4 each): + // pg_upmap: pg0..pg6 -> osd.0 (7 PGs), pg7..pg11 -> osd.1 (5 PGs) + // pg_upmap_items: pg6: [osd.0 -> osd.1] + // pgs_by_osd: osd.0=6 (+2), osd.1=6 (+2), osd.2=0 (-4) + // + // Dropping pg6's entry would return it to osd.0, raising osd.0's deviation + // from +2 to +3 while lowering osd.1's from +2 to +1 -- a net wash that + // leaves one OSD worse off. + set_up_map(3, true); + + OSDMap::Incremental pool_inc(osdmap.get_epoch() + 1); + pool_inc.new_pool_max = osdmap.get_pool_max(); + pg_pool_t empty; + int64_t pool_id = ++pool_inc.new_pool_max; + pg_pool_t *p = pool_inc.get_new_pool(pool_id, &empty); + p->size = 1; + p->min_size = 1; + p->set_pg_num(12); + p->set_pgp_num(12); + p->type = pg_pool_t::TYPE_REPLICATED; + p->crush_rule = 0; + p->set_flag(pg_pool_t::FLAG_HASHPSPOOL); + pool_inc.new_pool_names[pool_id] = "testpool"; + osdmap.apply_incremental(pool_inc); + + // Force placement via pg_upmap: pg0..pg6 -> osd.0, pg7..pg11 -> osd.1 + { + OSDMap::Incremental inc(osdmap.get_epoch() + 1); + for (int i = 0; i <= 6; ++i) { + pg_t pg = osdmap.raw_pg_to_pg(pg_t(i, pool_id)); + inc.new_pg_upmap[pg] = mempool::osdmap::vector{0}; + } + for (int i = 7; i <= 11; ++i) { + pg_t pg = osdmap.raw_pg_to_pg(pg_t(i, pool_id)); + inc.new_pg_upmap[pg] = mempool::osdmap::vector{1}; + } + osdmap.apply_incremental(inc); + } + + // Add pg_upmap_items pg6: [osd.0 -> osd.1] + // pgs_by_osd after all upmaps: osd.0=6 (+2), osd.1=6 (+2), osd.2=0 (-4) + pg_t pg6 = osdmap.raw_pg_to_pg(pg_t(6, pool_id)); + { + OSDMap::Incremental inc(osdmap.get_epoch() + 1); + inc.new_pg_upmap_items[pg6] = + mempool::osdmap::vector>{{0, 1}}; + osdmap.apply_incremental(inc); + } + + // Verify pg6 acts from osd.1 before the balancer runs + { + vector up; + int up_primary; + osdmap.pg_to_up_acting_osds(pg6, &up, &up_primary, nullptr, nullptr); + ASSERT_EQ(up[0], 1); + } + + set only_pools = {pool_id}; + OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1); + osdmap.calc_pg_upmaps(g_ceph_context, 1, 100, only_pools, &pending_inc); + + OSDMap newmap; + newmap.deepish_copy_from(osdmap); + newmap.apply_incremental(pending_inc); + + // pg6 must still act from osd.1: returning it to osd.0 would not improve + // the distribution (osd.0 deviation would rise from +2 to +3). + { + vector up; + int up_primary; + newmap.pg_to_up_acting_osds(pg6, &up, &up_primary, nullptr, nullptr); + ASSERT_EQ(up[0], 1); + } +} + TEST_F(OSDMapTest, BUG_42052) { // https://tracker.ceph.com/issues/42052 set_up_map(6, true);