From: Bill Scales
Date: Thu, 29 May 2025 11:53:27 +0000 (+0100)
Subject: osd: EC optimizations rework for pg_temp
X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=6c8b0297aaafeb0cff7350e52212140c85435afe;p=ceph.git

osd: EC optimizations rework for pg_temp

Bug fixes for how pg_temp is used with optimized EC pools. For these
pools, pg_temp is re-ordered with the non-primary shards last. The
acting set was undoing this re-ordering in PeeringState, but this is
too late and results in code getting the shard id wrong. One
consequence of this was an OSD refusing to create a PG because of an
incorrect shard id. This commit moves the re-ordering earlier, into
OSDMap::_get_temp_osds; some changes are then required in
OSDMap::clean_temps.

Signed-off-by: Bill Scales
---

diff --git a/src/osd/OSDMap.cc b/src/osd/OSDMap.cc
index ee95ee22fcbd2..714f5c41977e9 100644
--- a/src/osd/OSDMap.cc
+++ b/src/osd/OSDMap.cc
@@ -2034,13 +2034,9 @@ void OSDMap::clean_temps(CephContext *cct,
       // force a change of primary shard - do not remove pg_temp
       // if it is being used for this purpose
       if (pool->allows_ecoptimizations()) {
-        for (uint8_t i = 0; i < acting_set.size(); ++i) {
-          if (acting_set[i] == primary) {
-            if (pool->is_nonprimary_shard(shard_id_t(i))) {
-              // pg_temp still required
-              keep = true;
-            }
-          }
+        if (nextmap._pick_primary(pg.second) != primary) {
+          // pg_temp still required
+          keep = true;
         }
       }
       if (!keep) {
@@ -2919,7 +2915,7 @@ const std::vector<int> OSDMap::pgtemp_undo_primaryfirst(const pg_pool_t& pool,
   // Only perform the transform for pools with allow_ec_optimizations set
   // that also have pg_temp set
   if (pool.allows_ecoptimizations()) {
-    if (pg_temp->find(pool.raw_pg_to_pg(pg)) != pg_temp->end()) {
+    if (has_pgtemp(pool.raw_pg_to_pg(pg))) {
       std::vector<int> result;
       int primaryshard = 0;
       int nonprimaryshard = pool.size - pool.nonprimary_shards.size();
@@ -2946,7 +2942,7 @@ const shard_id_t OSDMap::pgtemp_primaryfirst(const pg_pool_t& pool,
   }
   shard_id_t result = shard;
   if (pool.allows_ecoptimizations()) {
-    if (pg_temp->find(pool.raw_pg_to_pg(pg)) != pg_temp->end()) {
+    if (has_pgtemp(pool.raw_pg_to_pg(pg))) {
       int num_parity_shards = pool.size - pool.nonprimary_shards.size() - 1;
       if (shard >= pool.size - num_parity_shards) {
         result = shard_id_t(result + num_parity_shards + 1 - pool.size);
@@ -2967,7 +2963,7 @@ shard_id_t OSDMap::pgtemp_undo_primaryfirst(const pg_pool_t& pool,
   }
   shard_id_t result = shard;
   if (pool.allows_ecoptimizations()) {
-    if (pg_temp->find(pool.raw_pg_to_pg(pg)) != pg_temp->end()) {
+    if (has_pgtemp(pool.raw_pg_to_pg(pg))) {
       int num_parity_shards = pool.size - pool.nonprimary_shards.size() - 1;
       if (shard > num_parity_shards) {
         result = shard_id_t(result - num_parity_shards);
@@ -2982,6 +2978,7 @@ shard_id_t OSDMap::pgtemp_undo_primaryfirst(const pg_pool_t& pool,
 void OSDMap::_get_temp_osds(const pg_pool_t& pool, pg_t pg,
                             vector<int> *temp_pg, int *temp_primary) const
 {
+  vector<int> temp;
   pg = pool.raw_pg_to_pg(pg);
   const auto p = pg_temp->find(pg);
   temp_pg->clear();
@@ -2991,21 +2988,22 @@ void OSDMap::_get_temp_osds(const pg_pool_t& pool, pg_t pg,
         if (pool.can_shift_osds()) {
           continue;
         } else {
-          temp_pg->push_back(CRUSH_ITEM_NONE);
+          temp.push_back(CRUSH_ITEM_NONE);
         }
       } else {
-        temp_pg->push_back(p->second[i]);
+        temp.push_back(p->second[i]);
       }
     }
+    *temp_pg = pgtemp_undo_primaryfirst(pool, pg, temp);
   }
   const auto &pp = primary_temp->find(pg);
   *temp_primary = -1;
   if (pp != primary_temp->end()) {
     *temp_primary = pp->second;
-  } else if (!temp_pg->empty()) { // apply pg_temp's primary
-    for (unsigned i = 0; i < temp_pg->size(); ++i) {
-      if ((*temp_pg)[i] != CRUSH_ITEM_NONE) {
-        *temp_primary = (*temp_pg)[i];
+  } else if (!temp.empty()) { // apply pg_temp's primary
+    for (unsigned i = 0; i < temp.size(); ++i) {
+      if (temp[i] != CRUSH_ITEM_NONE) {
+        *temp_primary = temp[i];
         break;
       }
     }
diff --git a/src/osd/OSDMap.h b/src/osd/OSDMap.h
index ca03e16b6e7c7..ac13f6e2113f0 100644
--- a/src/osd/OSDMap.h
+++ b/src/osd/OSDMap.h
@@ -1358,6 +1358,9 @@ public:
     return false;
   }
+  bool has_pgtemp(const pg_t pg) const {
+    return (pg_temp->find(pg) != pg_temp->end());
+  }
   const std::vector<int> pgtemp_primaryfirst(const pg_pool_t& pool,
                                              const std::vector<int>& pg_temp) const;
   const std::vector<int> pgtemp_undo_primaryfirst(const pg_pool_t& pool,
diff --git a/src/osd/PeeringState.cc b/src/osd/PeeringState.cc
index a1a821bef26c0..55743361ce963 100644
--- a/src/osd/PeeringState.cc
+++ b/src/osd/PeeringState.cc
@@ -611,14 +611,10 @@ bool PeeringState::should_restart_peering(
   int newupprimary,
   int newactingprimary,
   const vector<int>& newup,
-  const vector<int>& _newacting,
+  const vector<int>& newacting,
   OSDMapRef lastmap,
   OSDMapRef osdmap)
 {
-  const vector<int> newacting = osdmap->pgtemp_undo_primaryfirst(
-    pool.info,
-    info.pgid.pgid,
-    _newacting);
   if (PastIntervals::is_new_interval(
     primary.osd,
     newactingprimary,
@@ -898,9 +894,7 @@ void PeeringState::init_primary_up_acting(
   int new_acting_primary)
 {
   actingset.clear();
-  acting = get_osdmap()->pgtemp_undo_primaryfirst(pool.info,
-                                                  info.pgid.pgid,
-                                                  newacting);
+  acting = newacting;
   for (uint8_t i = 0; i < acting.size(); ++i) {
     if (acting[i] != CRUSH_ITEM_NONE)
       actingset.insert(
@@ -931,9 +925,26 @@ void PeeringState::init_primary_up_acting(
       break;
     }
   }
+  // Calculating the shard of the acting_primary is tricky because in
+  // error conditions the same osd can be in multiple positions in
+  // the acting_set. Use pgtemp ordering (which places shards which
+  // can become the primary first) to match the code in
+  // OSDMap::_get_temp_osds
+  const OSDMapRef osdmap = get_osdmap();
+  bool has_pgtemp = osdmap->has_pgtemp(spgid.pgid);
+  std::vector<int> pg_temp = acting;
+  if (has_pgtemp) {
+    pg_temp = osdmap->pgtemp_primaryfirst(pool.info, acting);
+  }
   for (uint8_t i = 0; i < acting.size(); ++i) {
-    if (acting[i] == new_acting_primary) {
-      primary = pg_shard_t(acting[i], shard_id_t(i));
+    if (pg_temp[i] == new_acting_primary) {
+      if (has_pgtemp) {
+        primary = pg_shard_t(new_acting_primary,
+                             osdmap->pgtemp_undo_primaryfirst(
+                               pool.info, spgid.pgid, shard_id_t(i)));
+      } else {
+        primary = pg_shard_t(new_acting_primary, shard_id_t(i));
+      }
       break;
     }
   }
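
For readers unfamiliar with the "primary first" ordering the commit message
refers to, the following standalone sketch illustrates the idea only; it is
not the OSDMap implementation, and PoolInfo, primaryfirst and
undo_primaryfirst are simplified stand-ins invented for this example. Shards
that are allowed to become the primary are listed first and the non-primary
(data-only) shards are moved to the end, with the inverse mapping restoring
the original shard positions, as OSDMap::pgtemp_undo_primaryfirst does for
the real acting set.

// Illustrative sketch only -- simplified stand-ins, not the Ceph OSDMap code.
#include <cstddef>
#include <iostream>
#include <set>
#include <vector>

// Hypothetical pool metadata: total shard count and the shard ids that can
// never act as the primary (the data shards of an optimized EC pool).
struct PoolInfo {
  std::size_t size;
  std::set<std::size_t> nonprimary_shards;
};

// Reorder an acting set so primary-capable shards come first and the
// non-primary shards are placed last (the pg_temp ordering).
std::vector<int> primaryfirst(const PoolInfo& pool, const std::vector<int>& acting)
{
  std::vector<int> primary_capable, nonprimary;
  for (std::size_t i = 0; i < acting.size(); ++i) {
    if (pool.nonprimary_shards.count(i))
      nonprimary.push_back(acting[i]);
    else
      primary_capable.push_back(acting[i]);
  }
  primary_capable.insert(primary_capable.end(),
                         nonprimary.begin(), nonprimary.end());
  return primary_capable;
}

// Undo the reordering, restoring each OSD to its original shard position.
std::vector<int> undo_primaryfirst(const PoolInfo& pool, const std::vector<int>& reordered)
{
  std::vector<int> result(reordered.size());
  std::size_t primaryshard = 0;
  std::size_t nonprimaryshard = reordered.size() - pool.nonprimary_shards.size();
  for (std::size_t i = 0; i < reordered.size(); ++i) {
    if (pool.nonprimary_shards.count(i))
      result[i] = reordered[nonprimaryshard++];
    else
      result[i] = reordered[primaryshard++];
  }
  return result;
}

int main()
{
  // 4+2 EC profile: shard 0 and the two parity shards (4, 5) may be primary,
  // shards 1-3 are data-only and therefore non-primary.
  PoolInfo pool{6, {1, 2, 3}};
  std::vector<int> acting{10, 11, 12, 13, 14, 15};
  std::vector<int> reordered = primaryfirst(pool, acting);  // 10 14 15 11 12 13
  std::vector<int> restored = undo_primaryfirst(pool, reordered);
  for (int osd : restored)
    std::cout << osd << ' ';                                // 10 11 12 13 14 15
  std::cout << '\n';
}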