From 63636624ec5eb2d767ba721e66cb7521b4136f32 Mon Sep 17 00:00:00 2001
From: Bill Scales
Date: Mon, 19 May 2025 17:01:39 +0100
Subject: [PATCH] osd: EC Optimizations fix routing of requests to non-zero
 shard id

Pools with EC optimizations can use pg_temp to modify the selection of
the primary. When this happens the clients route requests to the correct
OSD, but to the wrong shard, which causes hung or misdirected I/Os.

Fix Objecter to select the correct shard when sending requests to EC
optimized pools.

Fix OSD to modify the shard when receiving requests from legacy clients.

Add new unit tests for the new shard id remapping functions.

Signed-off-by: Bill Scales
(cherry picked from commit f66cd3e860093380a7f97023bfd1f06312779383)
---
 src/osd/OSD.cc             | 29 ++++++++++++++++++++++----
 src/osd/OSDMap.cc          | 42 ++++++++++++++++++++++++++++++++++++++
 src/osd/OSDMap.h           | 14 +++++++++----
 src/osdc/Objecter.cc       |  2 +-
 src/test/osd/TestOSDMap.cc | 20 ++++++++++++++++--
 5 files changed, 96 insertions(+), 11 deletions(-)

diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc
index 23b028fc8fd9e..aa3a2e9c7f658 100644
--- a/src/osd/OSD.cc
+++ b/src/osd/OSD.cc
@@ -7661,7 +7661,8 @@ void OSD::dispatch_session_waiting(const ceph::ref_t<Session>& session, OSDMapRe
     op->put();
 
     spg_t pgid;
-    if (m->get_type() == CEPH_MSG_OSD_OP) {
+    if (m->get_type() == CEPH_MSG_OSD_OP &&
+        !m->get_connection()->has_features(CEPH_FEATUREMASK_RESEND_ON_SPLIT)) {
       pg_t actual_pgid = osdmap->raw_pg_to_pg(
         static_cast<const MOSDOp*>(m)->get_pg());
       if (!osdmap->get_primary_shard(actual_pgid, &pgid)) {
@@ -7669,6 +7670,12 @@
       }
     } else {
       pgid = m->get_spg();
+      // Pre-tentacle clients encode the shard_id_t incorrectly for optimized
+      // EC pools with pg_temp set. Correct the mistake here.
+      if (!m->get_connection()->has_features(CEPH_FEATUREMASK_SERVER_TENTACLE)) {
+        auto pi = osdmap->get_pg_pool(pgid.pool());
+        pgid.reset_shard(osdmap->pgtemp_undo_primaryfirst(*pi, pgid.pgid, pgid.shard));
+      }
     }
     enqueue_op(pgid, std::move(op), m->get_map_epoch());
   }
@@ -7754,11 +7761,25 @@ void OSD::ms_fast_dispatch(Message *m)
 
   service.maybe_inject_dispatch_delay();
 
-  if (m->get_connection()->has_features(CEPH_FEATUREMASK_RESEND_ON_SPLIT) ||
-      m->get_type() != CEPH_MSG_OSD_OP) {
+  // Pre-tentacle clients sending requests to EC shards other than 0 may
+  // set the shard incorrectly because of how pg_temp encodes primary
+  // shards first. These requests need to be routed through
+  // dispatch_session_waiting which uses the OSDMap to correct the shard.
+  bool legacy = !m->get_connection()->has_features(CEPH_FEATUREMASK_SERVER_TENTACLE);
+  spg_t spg = static_cast<MOSDFastDispatchOp*>(m)->get_spg();
+  if (legacy) {
+    // Optimization - replica pools and EC shard 0 are never remapped
+    if ((spg.shard == shard_id_t::NO_SHARD) ||
+        (spg.shard == shard_id_t(0))) {
+      legacy = false;
+    }
+  }
+  if (!legacy &&
+      (m->get_connection()->has_features(CEPH_FEATUREMASK_RESEND_ON_SPLIT) ||
+       m->get_type() != CEPH_MSG_OSD_OP)) {
     // queue it directly
     enqueue_op(
-      static_cast<MOSDFastDispatchOp*>(m)->get_spg(),
+      spg,
       std::move(op),
       static_cast<MOSDFastDispatchOp*>(m)->get_map_epoch());
   } else {
diff --git a/src/osd/OSDMap.cc b/src/osd/OSDMap.cc
index cff5a6153842b..fd5982180cf7d 100644
--- a/src/osd/OSDMap.cc
+++ b/src/osd/OSDMap.cc
@@ -2919,6 +2919,48 @@ const std::vector<int> OSDMap::pgtemp_undo_primaryfirst(const pg_pool_t& pool,
   return acting;
 }
 
+const shard_id_t OSDMap::pgtemp_primaryfirst(const pg_pool_t& pool,
+                                             const pg_t pg, const shard_id_t shard) const
+{
+  if ((shard == shard_id_t::NO_SHARD) ||
+      (shard == shard_id_t(0))) {
+    return shard;
+  }
+  shard_id_t result = shard;
+  if (pool.allows_ecoptimizations()) {
+    if (pg_temp->find(pool.raw_pg_to_pg(pg)) != pg_temp->end()) {
+      int num_parity_shards = pool.size - pool.nonprimary_shards.size() - 1;
+      if (shard >= pool.size - num_parity_shards) {
+        result = shard_id_t(result + num_parity_shards + 1 - pool.size);
+      } else {
+        result = shard_id_t(result + num_parity_shards);
+      }
+    }
+  }
+  return result;
+}
+
+shard_id_t OSDMap::pgtemp_undo_primaryfirst(const pg_pool_t& pool,
+                                            const pg_t pg, const shard_id_t shard) const
+{
+  if ((shard == shard_id_t::NO_SHARD) ||
+      (shard == shard_id_t(0))) {
+    return shard;
+  }
+  shard_id_t result = shard;
+  if (pool.allows_ecoptimizations()) {
+    if (pg_temp->find(pool.raw_pg_to_pg(pg)) != pg_temp->end()) {
+      int num_parity_shards = pool.size - pool.nonprimary_shards.size() - 1;
+      if (shard > num_parity_shards) {
+        result = shard_id_t(result - num_parity_shards);
+      } else {
+        result = shard_id_t(result + pool.size - num_parity_shards - 1);
+      }
+    }
+  }
+  return result;
+}
+
 void OSDMap::_get_temp_osds(const pg_pool_t& pool, pg_t pg,
                             vector<int> *temp_pg, int *temp_primary) const
 {
diff --git a/src/osd/OSDMap.h b/src/osd/OSDMap.h
index 81f3d914edab4..ca03e16b6e7c7 100644
--- a/src/osd/OSDMap.h
+++ b/src/osd/OSDMap.h
@@ -1338,16 +1338,16 @@ public:
     return false;
   }
   bool get_primary_shard(const pg_t& pgid, int *primary, spg_t *out) const {
-    auto i = get_pools().find(pgid.pool());
-    if (i == get_pools().end()) {
+    auto poolit = get_pools().find(pgid.pool());
+    if (poolit == get_pools().end()) {
       return false;
     }
     std::vector<int> acting;
     pg_to_acting_osds(pgid, &acting, primary);
-    if (i->second.is_erasure()) {
+    if (poolit->second.is_erasure()) {
       for (uint8_t i = 0; i < acting.size(); ++i) {
         if (acting[i] == *primary) {
-          *out = spg_t(pgid, shard_id_t(i));
+          *out = spg_t(pgid, pgtemp_undo_primaryfirst(poolit->second, pgid, shard_id_t(i)));
           return true;
         }
       }
@@ -1363,6 +1363,12 @@ public:
   const std::vector<int> pgtemp_undo_primaryfirst(const pg_pool_t& pool,
                                                   const pg_t pg,
                                                   const std::vector<int>& acting) const;
+  const shard_id_t pgtemp_primaryfirst(const pg_pool_t& pool,
+                                       const pg_t pg,
+                                       const shard_id_t shard) const;
+  shard_id_t pgtemp_undo_primaryfirst(const pg_pool_t& pool,
+                                      const pg_t pg,
+                                      const shard_id_t shard) const;
 
   bool in_removed_snaps_queue(int64_t pool, snapid_t snap) const {
     auto p = removed_snaps_queue.find(pool);
diff --git a/src/osdc/Objecter.cc b/src/osdc/Objecter.cc
index adc03ec4c71c3..203eab51d225f 100644
--- a/src/osdc/Objecter.cc
+++ b/src/osdc/Objecter.cc
@@ -3007,7 +3007,7 @@ int Objecter::_calc_target(op_target_t *t, Connection *con, bool any_change)
     if (pi->is_erasure()) {
       for (uint8_t i = 0; i < t->acting.size(); ++i) {
         if (t->acting[i] == acting_primary) {
-          spgid.reset_shard(shard_id_t(i));
+          spgid.reset_shard(osdmap->pgtemp_undo_primaryfirst(*pi, actual_pgid, shard_id_t(i)));
           break;
         }
       }
diff --git a/src/test/osd/TestOSDMap.cc b/src/test/osd/TestOSDMap.cc
index 33371c6d944c9..538819e81105a 100644
--- a/src/test/osd/TestOSDMap.cc
+++ b/src/test/osd/TestOSDMap.cc
@@ -3063,8 +3063,8 @@ TEST_F(OSDMapTest, pgtemp_primaryfirst) {
   decoded = osdmap.pgtemp_undo_primaryfirst(pool, pgid, encoded);
   ASSERT_EQ(set, decoded);
 
-  // With nonprimary_shards
-  for (int seed = 1; seed < 64; seed++) {
+  // With nonprimary_shards, shard 0 is never a non-primary
+  for (int seed = 2; seed < 64; seed += 2) {
     for (int osd = 0; osd < 6; osd++ ) {
       if (seed & (1 << osd)) {
         pool.nonprimary_shards.insert(shard_id_t(osd));
@@ -3082,6 +3082,22 @@ TEST_F(OSDMapTest, pgtemp_primaryfirst) {
         // non-primary shards last
        ASSERT_TRUE(pool.is_nonprimary_shard(shard_id_t(encoded[osd])));
      }
+     std::cout << osd << " " << seed << " " << osdmap.pgtemp_primaryfirst(pool, pgid, shard_id_t(osd)) << std::endl;
+     // Encode and decode should be equivalent
+     ASSERT_EQ(osdmap.pgtemp_undo_primaryfirst(pool, pgid,
+                   osdmap.pgtemp_primaryfirst(pool, pgid, shard_id_t(osd))),
+               shard_id_t(osd));
+     // Shard 0 never changes, seed 62 is a special case because all the other
+     // shards are non-primary
+     if ((osd != 0) && (seed != 62)) {
+       // Encode should be different
+       ASSERT_NE(osdmap.pgtemp_primaryfirst(pool, pgid, shard_id_t(osd)),
+                 shard_id_t(osd));
+     } else {
+       // Encode should not change
+       ASSERT_EQ(osdmap.pgtemp_primaryfirst(pool, pgid, shard_id_t(osd)),
+                 shard_id_t(osd));
+     }
    }
    decoded = osdmap.pgtemp_undo_primaryfirst(pool, pgid, encoded);
    ASSERT_EQ(set, decoded);
-- 
2.39.5
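
Illustration (not part of the patch): the primary-first remapping that
pgtemp_primaryfirst()/pgtemp_undo_primaryfirst() implement can be modelled
in isolation. The standalone sketch below assumes a hypothetical 4+2 EC
pool (k=4 data shards, m=2 parity shards) where shards 1..k-1 are the
non-primary data shards and shards k..k+m-1 are parity; pg_temp lists the
primary-capable shards (shard 0, then the parity shards) first and the
remaining data shards after them. The helper names are made up for this
example; only the arithmetic mirrors the patch.

// primaryfirst_sketch.cc - hypothetical standalone model of the patch's
// shard remapping arithmetic; not Ceph code.
#include <cassert>
#include <iostream>

constexpr int k = 4;                       // data shards (assumed profile)
constexpr int m = 2;                       // parity shards (assumed profile)
constexpr int size = k + m;                // pool size
constexpr int nonprimary = k - 1;          // data shards 1..k-1 can't be primary
constexpr int num_parity = size - nonprimary - 1;  // same formula as the patch

// Mirrors pgtemp_primaryfirst(): OSDMap-order shard -> pg_temp position.
int primaryfirst(int shard) {
  if (shard == 0)
    return shard;                          // shard 0 is never remapped
  if (shard >= size - num_parity)          // parity shards move to the front,
    return shard + num_parity + 1 - size;  // right after shard 0
  return shard + num_parity;               // data shards shift to the back
}

// Mirrors pgtemp_undo_primaryfirst(): pg_temp position -> OSDMap-order shard.
int undo_primaryfirst(int pos) {
  if (pos == 0)
    return pos;
  if (pos > num_parity)
    return pos - num_parity;
  return pos + size - num_parity - 1;
}

int main() {
  // For 4+2 the pg_temp order is: shard 0, parity 4, 5, then data 1, 2, 3.
  for (int shard = 0; shard < size; ++shard) {
    int pos = primaryfirst(shard);
    std::cout << "shard " << shard << " -> position " << pos << "\n";
    assert(undo_primaryfirst(pos) == shard);  // round trip is the identity
  }
}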