]> git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
osd: EC Optimizations fix routing of requests to non-zero shard id
authorBill Scales <bill_scales@uk.ibm.com>
Mon, 19 May 2025 16:01:39 +0000 (17:01 +0100)
committerAlex Ainscow <aainscow@uk.ibm.com>
Wed, 25 Jun 2025 22:36:40 +0000 (23:36 +0100)
Pools with EC optimizations can use pg_temp to modify the selection
of the primary. When this happens, clients route requests to the
correct OSD but to the wrong shard, which causes hung I/Os or
misdirected I/Os.

Fix Objecter to select the correct shard when sending requests to
EC optimized pools. Fix OSD to modify the shard when receiving
requests from legacy clients.

Add new unit tests covering the new functions that remap the shard id.

Signed-off-by: Bill Scales <bill_scales@uk.ibm.com>
src/osd/OSD.cc
src/osd/OSDMap.cc
src/osd/OSDMap.h
src/osdc/Objecter.cc
src/test/osd/TestOSDMap.cc

index f55898cac6d1ab15616c5439f9bffa8bff339a7c..98271bf9a98e85fef1246f797d868e62d5332445 100644 (file)
@@ -7672,7 +7672,8 @@ void OSD::dispatch_session_waiting(const ceph::ref_t<Session>& session, OSDMapRe
     op->put();
 
     spg_t pgid;
-    if (m->get_type() == CEPH_MSG_OSD_OP) {
+    if (m->get_type() == CEPH_MSG_OSD_OP &&
+       !m->get_connection()->has_features(CEPH_FEATUREMASK_RESEND_ON_SPLIT)) {
       pg_t actual_pgid = osdmap->raw_pg_to_pg(
        static_cast<const MOSDOp*>(m)->get_pg());
       if (!osdmap->get_primary_shard(actual_pgid, &pgid)) {
@@ -7680,6 +7681,12 @@ void OSD::dispatch_session_waiting(const ceph::ref_t<Session>& session, OSDMapRe
       }
     } else {
       pgid = m->get_spg();
+      //Pre-tentacle clients encode the shard_id_t incorrectly for optimized EC
+      //pools with pg_temp set. Correct the mistake here.
+      if (!m->get_connection()->has_features(CEPH_FEATUREMASK_SERVER_TENTACLE)) {
+        auto pi = osdmap->get_pg_pool(pgid.pool());
+       pgid.reset_shard(osdmap->pgtemp_undo_primaryfirst(*pi, pgid.pgid, pgid.shard));
+      }
     }
     enqueue_op(pgid, std::move(op), m->get_map_epoch());
   }
@@ -7765,11 +7772,25 @@ void OSD::ms_fast_dispatch(Message *m)
 
   service.maybe_inject_dispatch_delay();
 
-  if (m->get_connection()->has_features(CEPH_FEATUREMASK_RESEND_ON_SPLIT) ||
-      m->get_type() != CEPH_MSG_OSD_OP) {
+  // Pre-tentacle clients sending requests to EC shards other than 0 may
+  // set the shard incorrectly because of how pg_temp encodes primary
+  // shards first. These requests need to be routed through
+  // dispatch_session_waiting which uses the OSDMap to correct the shard.
+  bool legacy = !m->get_connection()->has_features(CEPH_FEATUREMASK_SERVER_TENTACLE);
+  spg_t spg = static_cast<MOSDFastDispatchOp*>(m)->get_spg();
+  if (legacy) {
+    // Optimization - replica pools and EC shard 0 are never remapped
+    if ((spg.shard == shard_id_t::NO_SHARD) ||
+       (spg.shard == shard_id_t(0))) {
+      legacy = false;
+    }
+  }
+  if (!legacy &&
+      (m->get_connection()->has_features(CEPH_FEATUREMASK_RESEND_ON_SPLIT) ||
+       m->get_type() != CEPH_MSG_OSD_OP)) {
     // queue it directly
     enqueue_op(
-      static_cast<MOSDFastDispatchOp*>(m)->get_spg(),
+      spg,
       std::move(op),
       static_cast<MOSDFastDispatchOp*>(m)->get_map_epoch());
   } else {
index cff5a6153842b1922cf761ebc97e8b6ff3569f2d..fd5982180cf7dc94c1403cd4c96ab7ed2bd0929a 100644 (file)
@@ -2919,6 +2919,48 @@ const std::vector<int> OSDMap::pgtemp_undo_primaryfirst(const pg_pool_t& pool,
   return acting;
 }
 
+const shard_id_t OSDMap::pgtemp_primaryfirst(const pg_pool_t& pool,
+       const pg_t pg, const shard_id_t shard) const
+{
+  if ((shard == shard_id_t::NO_SHARD) ||
+      (shard == shard_id_t(0))) {
+    return shard;
+  }
+  shard_id_t result = shard;
+  if (pool.allows_ecoptimizations()) {
+    if (pg_temp->find(pool.raw_pg_to_pg(pg)) != pg_temp->end()) {
+      int num_parity_shards = pool.size - pool.nonprimary_shards.size() - 1;
+      if (shard >= pool.size - num_parity_shards) {
+       result = shard_id_t(result + num_parity_shards + 1 - pool.size);
+      } else {
+       result = shard_id_t(result + num_parity_shards);
+      }
+    }
+  }
+  return result;
+}
+
+shard_id_t OSDMap::pgtemp_undo_primaryfirst(const pg_pool_t& pool,
+       const pg_t pg, const shard_id_t shard) const
+{
+  if ((shard == shard_id_t::NO_SHARD) ||
+      (shard == shard_id_t(0))) {
+    return shard;
+  }
+  shard_id_t result = shard;
+  if (pool.allows_ecoptimizations()) {
+    if (pg_temp->find(pool.raw_pg_to_pg(pg)) != pg_temp->end()) {
+      int num_parity_shards = pool.size - pool.nonprimary_shards.size() - 1;
+      if (shard > num_parity_shards) {
+       result = shard_id_t(result - num_parity_shards);
+      } else {
+       result = shard_id_t(result + pool.size - num_parity_shards - 1);
+      }
+    }
+  }
+  return result;
+}
+
 void OSDMap::_get_temp_osds(const pg_pool_t& pool, pg_t pg,
                             vector<int> *temp_pg, int *temp_primary) const
 {
index 81f3d914edab432c62d756e78f946fb24e82847d..ca03e16b6e7c78d9586bc30240831221da7d5777 100644 (file)
@@ -1338,16 +1338,16 @@ public:
     return false;
   }
   bool get_primary_shard(const pg_t& pgid, int *primary, spg_t *out) const {
-    auto i = get_pools().find(pgid.pool());
-    if (i == get_pools().end()) {
+    auto poolit = get_pools().find(pgid.pool());
+    if (poolit == get_pools().end()) {
       return false;
     }
     std::vector<int> acting;
     pg_to_acting_osds(pgid, &acting, primary);
-    if (i->second.is_erasure()) {
+    if (poolit->second.is_erasure()) {
       for (uint8_t i = 0; i < acting.size(); ++i) {
        if (acting[i] == *primary) {
-         *out = spg_t(pgid, shard_id_t(i));
+         *out = spg_t(pgid, pgtemp_undo_primaryfirst(poolit->second, pgid, shard_id_t(i)));
          return true;
        }
       }
@@ -1363,6 +1363,12 @@ public:
   const std::vector<int> pgtemp_undo_primaryfirst(const pg_pool_t& pool,
                           const pg_t pg,
                           const std::vector<int>& acting) const;
+  const shard_id_t pgtemp_primaryfirst(const pg_pool_t& pool,
+                                      const pg_t pg,
+                                      const shard_id_t shard) const;
+  shard_id_t pgtemp_undo_primaryfirst(const pg_pool_t& pool,
+                                           const pg_t pg,
+                                           const shard_id_t shard) const;
 
   bool in_removed_snaps_queue(int64_t pool, snapid_t snap) const {
     auto p = removed_snaps_queue.find(pool);
index 78f7bbce5c6f067290ae0b957501f8446a92070e..7ba8875683f401cfbc0f3608bd2de48d9908ccfe 100644 (file)
@@ -3020,7 +3020,7 @@ int Objecter::_calc_target(op_target_t *t, Connection *con, bool any_change)
     if (pi->is_erasure()) {
       for (uint8_t i = 0; i < t->acting.size(); ++i) {
         if (t->acting[i] == acting_primary) {
-          spgid.reset_shard(shard_id_t(i));
+         spgid.reset_shard(osdmap->pgtemp_undo_primaryfirst(*pi, actual_pgid, shard_id_t(i)));
           break;
         }
       }
index 33371c6d944c9772022842fcc390d60c188582ab..538819e81105a080bc04a7b61200bde4b7c76613 100644 (file)
@@ -3063,8 +3063,8 @@ TEST_F(OSDMapTest, pgtemp_primaryfirst) {
   decoded = osdmap.pgtemp_undo_primaryfirst(pool, pgid, encoded);
   ASSERT_EQ(set, decoded);
 
-  // With nonprimary_shards
-  for (int seed = 1; seed < 64; seed++) {
+  // With nonprimary_shards, shard 0 is never a non-primary
+  for (int seed = 2; seed < 64; seed += 2) {
     for (int osd = 0; osd < 6; osd++ ) {
       if (seed & (1 << osd)) {
        pool.nonprimary_shards.insert(shard_id_t(osd));
@@ -3082,6 +3082,22 @@ TEST_F(OSDMapTest, pgtemp_primaryfirst) {
        // non-primary shards last
        ASSERT_TRUE(pool.is_nonprimary_shard(shard_id_t(encoded[osd])));
       }
+      std::cout << osd << " " << seed << " " << osdmap.pgtemp_primaryfirst(pool, pgid, shard_id_t(osd)) << std::endl;
+      // Encode and decode should be equivalent
+      ASSERT_EQ(osdmap.pgtemp_undo_primaryfirst(pool, pgid,
+                 osdmap.pgtemp_primaryfirst(pool, pgid, shard_id_t(osd))),
+               shard_id_t(osd));
+      // Shard 0 never changes, seed 62 is a special case because all the other
+      // shards are non-primary
+      if ((osd != 0) && (seed != 62)) {
+       // Encode should be different
+       ASSERT_NE(osdmap.pgtemp_primaryfirst(pool, pgid, shard_id_t(osd)),
+                 shard_id_t(osd));
+      } else {
+       // Encode should not change
+       ASSERT_EQ(osdmap.pgtemp_primaryfirst(pool, pgid, shard_id_t(osd)),
+                 shard_id_t(osd));
+      }
     }
     decoded = osdmap.pgtemp_undo_primaryfirst(pool, pgid, encoded);
     ASSERT_EQ(set, decoded);