From 9a5d55c104a18d554c4208f8380aa0db9f4127af Mon Sep 17 00:00:00 2001
From: Bill Scales <156200352+bill-scales@users.noreply.github.com>
Date: Thu, 6 Mar 2025 12:20:52 +0000
Subject: [PATCH] osd: Restrict choice of primary shard for ec_optimizations
 pools

Pools with ec_optimizations enabled have restrictions on which shards
are permitted to become the primary, because not all shards are updated
for every I/O.

To preserve backwards compatibility with downlevel clients, pg_temp is
used as the method to override the selection of primary by OSDMap.
Directly changing the logic in OSDMap would have meant that all clients
needed to be upgraded to tentacle before using optimized EC pools, so
that approach was discounted. Using primary_temp to set the primary for
an EC pool is not reliable because under error conditions an OSD can
store multiple shards for the same PG, and primary_temp cannot define
which of these shards will be chosen.

For optimized EC pools pg_temp is shuffled so that the non-primary
shards are listed last. This means that the existing logic in OSDMap
that picks the first available shard as the primary will avoid
selecting a non-primary shard. OSDMonitor applies the shuffle when
pg_temp is set; this is then reverted in PeeringState when initializing
the acting set after OSDMap has selected the primary.

PeeringState::choose_acting is modified to set pg_temp if OSDMap has
selected a non-primary shard; this will cause a new OSDMap to be
published, which will persuade OSDMap to select a primary shard
instead.

Signed-off-by: Bill Scales
---
 src/mon/OSDMonitor.cc   | 12 +++++++-
 src/osd/OSDMap.cc       | 67 +++++++++++++++++++++++++++++++++++++++++
 src/osd/OSDMap.h        |  7 +++++
 src/osd/PeeringState.cc | 26 +++++++++++++---
 4 files changed, 106 insertions(+), 6 deletions(-)

diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc
index 495a78e4293..81716ccbd95 100644
--- a/src/mon/OSDMonitor.cc
+++ b/src/mon/OSDMonitor.cc
@@ -4187,8 +4187,18 @@ bool OSDMonitor::prepare_pgtemp(MonOpRequestRef op)
 		<< ": pool has been removed" << dendl;
       continue;
     }
+    // Pools with allow_ec_optimizations set store pg_temp in a different
+    // order to change the primary selection algorithm without breaking
+    // old clients. If necessary re-order the new pg_temp now
+    pg_pool_t pg_pool;
+    if (pending_inc.new_pools.count(pool))
+      pg_pool = pending_inc.new_pools[pool];
+    else
+      pg_pool = *osdmap.get_pg_pool(pool);
+
+    std::vector<int> pg_temp = osdmap.pgtemp_primaryfirst(pg_pool, p->second);
     pending_inc.new_pg_temp[p->first] =
-      mempool::osdmap::vector<int>(p->second.begin(), p->second.end());
+      mempool::osdmap::vector<int>(pg_temp.begin(), pg_temp.end());

     // unconditionally clear pg_primary (until this message can encode
     // a change for that, too.. at which point we need to also fix
diff --git a/src/osd/OSDMap.cc b/src/osd/OSDMap.cc
index ffdb03fec4f..cff5a615384 100644
--- a/src/osd/OSDMap.cc
+++ b/src/osd/OSDMap.cc
@@ -2852,6 +2852,73 @@ void OSDMap::_apply_primary_affinity(ps_t seed,
   }
 }

+/* EC pools with allow_ec_optimizations set have some shards that cannot
+ * become the primary because they are not updated on every I/O. To avoid
+ * requiring clients to be upgraded to use these new pools, the logic in
+ * OSDMap which selects a primary cannot be changed. Instead choose_acting
+ * is modified to set pg_temp when it is necessary to override the choice
+ * of primary, and this vector is reordered so that shards that are
+ * permitted to be the primary are listed first.
+ * The existing OSDMap code will then choose a suitable shard as primary,
+ * except when the pg is incomplete and the choice of primary doesn't
+ * matter. This function is called by OSDMonitor when setting pg_temp to
+ * transform the vector.
+ *
+ * Example: Optimized EC pool 4+2
+ * acting_set = {NONE, 6, 7, 8, 9, 10}
+ * non_primary_shards = {1, 2, 3} # data shards other than shard 0
+ * pg_temp = {NONE, 9, 10, 6, 7, 8} # non-primary shards at end
+ * primary will be OSD 9(1)
+ */
+const std::vector<int> OSDMap::pgtemp_primaryfirst(const pg_pool_t& pool,
+    const std::vector<int>& pg_temp) const
+{
+  // Only perform the transform for pools with allow_ec_optimizations set
+  if (pool.allows_ecoptimizations()) {
+    std::vector<int> result;
+    std::vector<int> nonprimary;
+    int shard = 0;
+    for (auto osd : pg_temp) {
+      if (pool.is_nonprimary_shard(shard_id_t(shard))) {
+        nonprimary.emplace_back(osd);
+      } else {
+        result.emplace_back(osd);
+      }
+      shard++;
+    }
+    result.insert(result.end(), nonprimary.begin(), nonprimary.end());
+    return result;
+  }
+  return pg_temp;
+}
+
+/* The function above reorders the pg_temp vector. This transformation
+ * needs to be reversed by OSDs (but not clients) and is called by
+ * PeeringState when initializing the acting set.
+ */
+const std::vector<int> OSDMap::pgtemp_undo_primaryfirst(const pg_pool_t& pool,
+    const pg_t pg, const std::vector<int>& acting) const
+{
+  // Only perform the transform for pools with allow_ec_optimizations set
+  // that also have pg_temp set
+  if (pool.allows_ecoptimizations()) {
+    if (pg_temp->find(pool.raw_pg_to_pg(pg)) != pg_temp->end()) {
+      std::vector<int> result;
+      int primaryshard = 0;
+      int nonprimaryshard = pool.size - pool.nonprimary_shards.size();
+      assert(acting.size() == pool.size);
+      for (auto shard = 0; shard < pool.size; shard++) {
+        if (pool.is_nonprimary_shard(shard_id_t(shard))) {
+          result.emplace_back(acting[nonprimaryshard++]);
+        } else {
+          result.emplace_back(acting[primaryshard++]);
+        }
+      }
+      return result;
+    }
+  }
+  return acting;
+}
+
 void OSDMap::_get_temp_osds(const pg_pool_t& pool, pg_t pg,
 			    vector<int> *temp_pg, int *temp_primary) const
 {
diff --git a/src/osd/OSDMap.h b/src/osd/OSDMap.h
index 3a4c56a46f3..81f3d914eda 100644
--- a/src/osd/OSDMap.h
+++ b/src/osd/OSDMap.h
@@ -590,6 +590,7 @@ private:
   mempool::osdmap::vector<__u32> osd_weight;   // 16.16 fixed point, 0x10000 = "in", 0 = "out"
   mempool::osdmap::vector<osd_info_t> osd_info;
+  // Optimized EC pools re-order pg_temp, see pgtemp_primaryfirst
   std::shared_ptr<PGTempMap> pg_temp;  // temp pg mapping (e.g. while we rebuild)
   std::shared_ptr< mempool::osdmap::map<pg_t,int32_t> > primary_temp;  // temp primary mapping (e.g. while we rebuild)
   std::shared_ptr< mempool::osdmap::vector<__u32> > osd_primary_affinity; ///< 16.16 fixed point, 0x10000 = baseline
@@ -1357,6 +1358,12 @@ public:
     return false;
   }

+  const std::vector<int> pgtemp_primaryfirst(const pg_pool_t& pool,
+                                             const std::vector<int>& pg_temp) const;
+  const std::vector<int> pgtemp_undo_primaryfirst(const pg_pool_t& pool,
+                                                  const pg_t pg,
+                                                  const std::vector<int>& acting) const;
+
   bool in_removed_snaps_queue(int64_t pool, snapid_t snap) const {
     auto p = removed_snaps_queue.find(pool);
     if (p == removed_snaps_queue.end()) {
diff --git a/src/osd/PeeringState.cc b/src/osd/PeeringState.cc
index 7a3c911840d..4dd7543e10a 100644
--- a/src/osd/PeeringState.cc
+++ b/src/osd/PeeringState.cc
@@ -538,10 +538,14 @@ bool PeeringState::should_restart_peering(
   int newupprimary,
   int newactingprimary,
   const vector<int>& newup,
-  const vector<int>& newacting,
+  const vector<int>& _newacting,
   OSDMapRef lastmap,
   OSDMapRef osdmap)
 {
+  const vector<int> newacting = osdmap->pgtemp_undo_primaryfirst(
+    pool.info,
+    info.pgid.pgid,
+    _newacting);
   if (PastIntervals::is_new_interval(
 	primary.osd,
 	newactingprimary,
@@ -820,7 +824,9 @@ void PeeringState::init_primary_up_acting(
   int new_acting_primary)
 {
   actingset.clear();
-  acting = newacting;
+  acting = get_osdmap()->pgtemp_undo_primaryfirst(pool.info,
+                                                  info.pgid.pgid,
+                                                  newacting);
   for (uint8_t i = 0; i < acting.size(); ++i) {
     if (acting[i] != CRUSH_ITEM_NONE)
       actingset.insert(
@@ -2445,13 +2451,23 @@ bool PeeringState::choose_acting(pg_shard_t &auth_log_shard_id,
 	     << " from oversized want " << want << dendl;
     want.pop_back();
   }
-  if (want != acting) {
-    psdout(10) << "want " << want << " != acting " << acting
+  if ((want != acting) ||
+      pool.info.is_nonprimary_shard(pg_whoami.shard)) {
+    if (pool.info.is_nonprimary_shard(pg_whoami.shard)) {
+      psdout(10) << "shard " << pg_whoami.shard << " cannot be primary, want "
+                 << pg_vector_string(want)
+                 << " acting " << pg_vector_string(acting)
 	       << ", requesting pg_temp change" << dendl;
+    } else {
+      psdout(10) << "want " << pg_vector_string(want)
+                 << " != acting " << pg_vector_string(acting)
+                 << ", requesting pg_temp change" << dendl;
+    }
     want_acting = want;
     if (!cct->_conf->osd_debug_no_acting_change) {
-      if (want_acting == up) {
+      if ((want_acting == up) &&
+          !pool.info.is_nonprimary_shard(pg_whoami.shard)) {
 	// There can't be any pending backfill if
 	// want is the same as crush map up OSDs.
 	ceph_assert(want_backfill.empty());
-- 
2.47.3
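
Appendix (not part of the patch): the pg_temp shuffle and its inverse are
simple enough to model outside of Ceph. Below is a minimal, self-contained
C++ sketch of the round trip using the 4+2 example from the OSDMap.cc
comment above. The helper names (primaryfirst, undo_primaryfirst) and the
plain std::set/std::vector types are illustrative stand-ins, not the Ceph
API; only the reordering rule is taken from the patch.

// Standalone model of the pg_temp shuffle described in the patch.
// NONE stands in for CRUSH_ITEM_NONE; non-primary shard membership is
// modelled as a std::set instead of pg_pool_t::is_nonprimary_shard().
#include <cassert>
#include <iostream>
#include <set>
#include <vector>

constexpr int NONE = -1;  // stand-in for CRUSH_ITEM_NONE

// Monitor side: move OSDs serving non-primary shards to the back, so the
// unchanged primary-selection code (first available entry wins) can only
// land on a primary-capable shard.
std::vector<int> primaryfirst(const std::vector<int>& pg_temp,
                              const std::set<int>& nonprimary_shards) {
  std::vector<int> result, nonprimary;
  for (int shard = 0; shard < (int)pg_temp.size(); shard++) {
    if (nonprimary_shards.count(shard))
      nonprimary.push_back(pg_temp[shard]);
    else
      result.push_back(pg_temp[shard]);
  }
  result.insert(result.end(), nonprimary.begin(), nonprimary.end());
  return result;
}

// OSD side: primary-capable shards were written first, so read the two
// regions back in shard order to recover the true acting set.
std::vector<int> undo_primaryfirst(const std::vector<int>& acting,
                                   const std::set<int>& nonprimary_shards) {
  std::vector<int> result;
  size_t primary_at = 0;
  size_t nonprimary_at = acting.size() - nonprimary_shards.size();
  for (int shard = 0; shard < (int)acting.size(); shard++) {
    if (nonprimary_shards.count(shard))
      result.push_back(acting[nonprimary_at++]);
    else
      result.push_back(acting[primary_at++]);
  }
  return result;
}

int main() {
  // 4+2 example from the OSDMap.cc comment: shard 0 is missing, shards
  // 1..5 are on OSDs 6..10, and shards {1, 2, 3} cannot be primary.
  std::set<int> nonprimary = {1, 2, 3};
  std::vector<int> acting = {NONE, 6, 7, 8, 9, 10};

  std::vector<int> shuffled = primaryfirst(acting, nonprimary);
  // shuffled == {NONE, 9, 10, 6, 7, 8}: the first available OSD is now 9,
  // so OSDMap cannot pick any of the non-primary shards as primary.
  assert((shuffled == std::vector<int>{NONE, 9, 10, 6, 7, 8}));

  // PeeringState reverses the transform to recover true shard order.
  assert(undo_primaryfirst(shuffled, nonprimary) == acting);
  std::cout << "round trip ok\n";
  return 0;
}

The invariant the sketch checks is that the two transforms compose to the
identity: OSDMonitor can publish the shuffled order for old clients while
upgraded OSDs still recover the real shard positions, which is why the
patch can avoid touching OSDMap's primary-selection logic at all.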