]> git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
osd: EC Optimizations fix routing of requests to non-zero shard id
authorBill Scales <bill_scales@uk.ibm.com>
Mon, 19 May 2025 16:01:39 +0000 (17:01 +0100)
committerAlex Ainscow <aainscow@uk.ibm.com>
Wed, 25 Jun 2025 22:36:40 +0000 (23:36 +0100)
Pools with EC optimizations can use pg_temp to modify the selection
of the primary. When this happens, clients route requests to the
correct OSD but to the wrong shard, which causes hung I/Os or
misdirected I/Os.

Fix Objecter to select the correct shard when sending requests to
EC optimized pools. Fix OSD to modify the shard when receiving
requests from legacy clients.

Add new unit tests covering the new functions that remap the shard id.

Signed-off-by: Bill Scales <bill_scales@uk.ibm.com>
src/osd/OSD.cc
src/osd/OSDMap.cc
src/osd/OSDMap.h
src/osdc/Objecter.cc
src/test/osd/TestOSDMap.cc

index f55898cac6d1ab15616c5439f9bffa8bff339a7c..98271bf9a98e85fef1246f797d868e62d5332445 100644 (file)
@@ -7672,7 +7672,8 @@ void OSD::dispatch_session_waiting(const ceph::ref_t<Session>& session, OSDMapRe
     op->put();
 
     spg_t pgid;
-    if (m->get_type() == CEPH_MSG_OSD_OP) {
+    if (m->get_type() == CEPH_MSG_OSD_OP &&
+       !m->get_connection()->has_features(CEPH_FEATUREMASK_RESEND_ON_SPLIT)) {
       pg_t actual_pgid = osdmap->raw_pg_to_pg(
        static_cast<const MOSDOp*>(m)->get_pg());
       if (!osdmap->get_primary_shard(actual_pgid, &pgid)) {
@@ -7680,6 +7681,12 @@ void OSD::dispatch_session_waiting(const ceph::ref_t<Session>& session, OSDMapRe
       }
     } else {
       pgid = m->get_spg();
+      //Pre-tentacle clients encode the shard_id_t incorrectly for optimized EC
+      //pools with pg_temp set. Correct the mistake here.
+      if (!m->get_connection()->has_features(CEPH_FEATUREMASK_SERVER_TENTACLE)) {
+        auto pi = osdmap->get_pg_pool(pgid.pool());
+       pgid.reset_shard(osdmap->pgtemp_undo_primaryfirst(*pi, pgid.pgid, pgid.shard));
+      }
     }
     enqueue_op(pgid, std::move(op), m->get_map_epoch());
   }
@@ -7765,11 +7772,25 @@ void OSD::ms_fast_dispatch(Message *m)
 
   service.maybe_inject_dispatch_delay();
 
-  if (m->get_connection()->has_features(CEPH_FEATUREMASK_RESEND_ON_SPLIT) ||
-      m->get_type() != CEPH_MSG_OSD_OP) {
+  // Pre-tentacle clients sending requests to EC shards other than 0 may
+  // set the shard incorrectly because of how pg_temp encodes primary
+  // shards first. These requests need to be routed through
+  // dispatch_session_waiting which uses the OSDMap to correct the shard.
+  bool legacy = !m->get_connection()->has_features(CEPH_FEATUREMASK_SERVER_TENTACLE);
+  spg_t spg = static_cast<MOSDFastDispatchOp*>(m)->get_spg();
+  if (legacy) {
+    // Optimization - replica pools and EC shard 0 are never remapped
+    if ((spg.shard == shard_id_t::NO_SHARD) ||
+       (spg.shard == shard_id_t(0))) {
+      legacy = false;
+    }
+  }
+  if (!legacy &&
+      (m->get_connection()->has_features(CEPH_FEATUREMASK_RESEND_ON_SPLIT) ||
+       m->get_type() != CEPH_MSG_OSD_OP)) {
     // queue it directly
     enqueue_op(
-      static_cast<MOSDFastDispatchOp*>(m)->get_spg(),
+      spg,
       std::move(op),
       static_cast<MOSDFastDispatchOp*>(m)->get_map_epoch());
   } else {
index cff5a6153842b1922cf761ebc97e8b6ff3569f2d..fd5982180cf7dc94c1403cd4c96ab7ed2bd0929a 100644 (file)
@@ -2919,6 +2919,48 @@ const std::vector<int> OSDMap::pgtemp_undo_primaryfirst(const pg_pool_t& pool,
   return acting;
 }
 
+const shard_id_t OSDMap::pgtemp_primaryfirst(const pg_pool_t& pool,
+       const pg_t pg, const shard_id_t shard) const
+{
+  if ((shard == shard_id_t::NO_SHARD) ||
+      (shard == shard_id_t(0))) {
+    return shard;
+  }
+  shard_id_t result = shard;
+  if (pool.allows_ecoptimizations()) {
+    if (pg_temp->find(pool.raw_pg_to_pg(pg)) != pg_temp->end()) {
+      int num_parity_shards = pool.size - pool.nonprimary_shards.size() - 1;
+      if (shard >= pool.size - num_parity_shards) {
+       result = shard_id_t(result + num_parity_shards + 1 - pool.size);
+      } else {
+       result = shard_id_t(result + num_parity_shards);
+      }
+    }
+  }
+  return result;
+}
+
+shard_id_t OSDMap::pgtemp_undo_primaryfirst(const pg_pool_t& pool,
+       const pg_t pg, const shard_id_t shard) const
+{
+  if ((shard == shard_id_t::NO_SHARD) ||
+      (shard == shard_id_t(0))) {
+    return shard;
+  }
+  shard_id_t result = shard;
+  if (pool.allows_ecoptimizations()) {
+    if (pg_temp->find(pool.raw_pg_to_pg(pg)) != pg_temp->end()) {
+      int num_parity_shards = pool.size - pool.nonprimary_shards.size() - 1;
+      if (shard > num_parity_shards) {
+       result = shard_id_t(result - num_parity_shards);
+      } else {
+       result = shard_id_t(result + pool.size - num_parity_shards - 1);
+      }
+    }
+  }
+  return result;
+}
+
 void OSDMap::_get_temp_osds(const pg_pool_t& pool, pg_t pg,
                             vector<int> *temp_pg, int *temp_primary) const
 {
index 81f3d914edab432c62d756e78f946fb24e82847d..ca03e16b6e7c78d9586bc30240831221da7d5777 100644 (file)
@@ -1338,16 +1338,16 @@ public:
     return false;
   }
   bool get_primary_shard(const pg_t& pgid, int *primary, spg_t *out) const {
-    auto i = get_pools().find(pgid.pool());
-    if (i == get_pools().end()) {
+    auto poolit = get_pools().find(pgid.pool());
+    if (poolit == get_pools().end()) {
       return false;
     }
     std::vector<int> acting;
     pg_to_acting_osds(pgid, &acting, primary);
-    if (i->second.is_erasure()) {
+    if (poolit->second.is_erasure()) {
       for (uint8_t i = 0; i < acting.size(); ++i) {
        if (acting[i] == *primary) {
-         *out = spg_t(pgid, shard_id_t(i));
+         *out = spg_t(pgid, pgtemp_undo_primaryfirst(poolit->second, pgid, shard_id_t(i)));
          return true;
        }
       }
@@ -1363,6 +1363,12 @@ public:
   const std::vector<int> pgtemp_undo_primaryfirst(const pg_pool_t& pool,
                           const pg_t pg,
                           const std::vector<int>& acting) const;
+  const shard_id_t pgtemp_primaryfirst(const pg_pool_t& pool,
+                                      const pg_t pg,
+                                      const shard_id_t shard) const;
+  shard_id_t pgtemp_undo_primaryfirst(const pg_pool_t& pool,
+                                           const pg_t pg,
+                                           const shard_id_t shard) const;
 
   bool in_removed_snaps_queue(int64_t pool, snapid_t snap) const {
     auto p = removed_snaps_queue.find(pool);
index 78f7bbce5c6f067290ae0b957501f8446a92070e..7ba8875683f401cfbc0f3608bd2de48d9908ccfe 100644 (file)
@@ -3020,7 +3020,7 @@ int Objecter::_calc_target(op_target_t *t, Connection *con, bool any_change)
     if (pi->is_erasure()) {
       for (uint8_t i = 0; i < t->acting.size(); ++i) {
         if (t->acting[i] == acting_primary) {
-          spgid.reset_shard(shard_id_t(i));
+         spgid.reset_shard(osdmap->pgtemp_undo_primaryfirst(*pi, actual_pgid, shard_id_t(i)));
           break;
         }
       }
index 33371c6d944c9772022842fcc390d60c188582ab..538819e81105a080bc04a7b61200bde4b7c76613 100644 (file)
@@ -3063,8 +3063,8 @@ TEST_F(OSDMapTest, pgtemp_primaryfirst) {
   decoded = osdmap.pgtemp_undo_primaryfirst(pool, pgid, encoded);
   ASSERT_EQ(set, decoded);
 
-  // With nonprimary_shards
-  for (int seed = 1; seed < 64; seed++) {
+  // With nonprimary_shards, shard 0 is never a non-primary
+  for (int seed = 2; seed < 64; seed += 2) {
     for (int osd = 0; osd < 6; osd++ ) {
       if (seed & (1 << osd)) {
        pool.nonprimary_shards.insert(shard_id_t(osd));
@@ -3082,6 +3082,22 @@ TEST_F(OSDMapTest, pgtemp_primaryfirst) {
        // non-primary shards last
        ASSERT_TRUE(pool.is_nonprimary_shard(shard_id_t(encoded[osd])));
       }
+      std::cout << osd << " " << seed << " " << osdmap.pgtemp_primaryfirst(pool, pgid, shard_id_t(osd)) << std::endl;
+      // Encode and decode should be equivalent
+      ASSERT_EQ(osdmap.pgtemp_undo_primaryfirst(pool, pgid,
+                 osdmap.pgtemp_primaryfirst(pool, pgid, shard_id_t(osd))),
+               shard_id_t(osd));
+      // Shard 0 never changes, seed 62 is a special case because all the other
+      // shards are non-primary
+      if ((osd != 0) && (seed != 62)) {
+       // Encode should be different
+       ASSERT_NE(osdmap.pgtemp_primaryfirst(pool, pgid, shard_id_t(osd)),
+                 shard_id_t(osd));
+      } else {
+       // Encode should not change
+       ASSERT_EQ(osdmap.pgtemp_primaryfirst(pool, pgid, shard_id_t(osd)),
+                 shard_id_t(osd));
+      }
     }
     decoded = osdmap.pgtemp_undo_primaryfirst(pool, pgid, encoded);
     ASSERT_EQ(set, decoded);