git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
osd: EC optimizations rework for pg_temp
author     Bill Scales <bill_scales@uk.ibm.com>
           Thu, 29 May 2025 11:53:27 +0000 (12:53 +0100)
committer  Alex Ainscow <aainscow@uk.ibm.com>
           Wed, 25 Jun 2025 22:36:40 +0000 (23:36 +0100)
Bug fixes for how pg_temp is used with optimized EC pools. For these
pools pg_temp is re-ordered with non-primary shards last. The acting
set was undoing this re-ordering in PeeringState, but this is too
late and results in code getting the shard id wrong. One consequence
of this was an OSD refusing to create a PG because of an incorrect
shard id.

This commit moves the re-ordering earlier, into OSDMap::_get_temp_osds;
some changes are then also required in OSDMap::clean_temps.
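
For background, a minimal sketch of the re-ordering (illustration only, not
the OSDMap code; it assumes the non-primary shards are the data shards other
than shard 0, so the primary-capable shards are shard 0 plus the parity
shards):

    #include <cstddef>
    #include <vector>

    // Re-order an acting set into the "primary first" layout used by pg_temp
    // for optimized EC pools: primary-capable shards first, non-primary last.
    std::vector<int> primary_first(const std::vector<int>& acting,
                                   std::size_t num_nonprimary_shards)
    {
      const std::size_t size = acting.size();
      const std::size_t num_parity = size - num_nonprimary_shards - 1;
      std::vector<int> result;
      result.push_back(acting[0]);                    // shard 0 may be primary
      for (std::size_t i = size - num_parity; i < size; ++i)
        result.push_back(acting[i]);                  // parity shards next
      for (std::size_t i = 1; i < size - num_parity; ++i)
        result.push_back(acting[i]);                  // non-primary shards last
      return result;
    }

    // e.g. a 4+2 pool: acting {10,11,12,13,14,15} with 3 non-primary shards
    // is stored in pg_temp as {10,14,15,11,12,13}.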

Signed-off-by: Bill Scales <bill_scales@uk.ibm.com>
src/osd/OSDMap.cc
src/osd/OSDMap.h
src/osd/PeeringState.cc

index ee95ee22fcbd29021c7c5f6aa3d2c63bae72503a..714f5c41977e9f8c353ec04b4bf76f76c5d9907c 100644 (file)
@@ -2034,13 +2034,9 @@ void OSDMap::clean_temps(CephContext *cct,
       // force a change of primary shard - do not remove pg_temp
       // if it is being used for this purpose
       if (pool->allows_ecoptimizations()) {
-       for (uint8_t i = 0; i < acting_set.size(); ++i) {
-         if (acting_set[i] == primary) {
-           if (pool->is_nonprimary_shard(shard_id_t(i))) {
-             // pg_temp still required
-             keep = true;
-           }
-         }
+       if (nextmap._pick_primary(pg.second) != primary) {
+         // pg_temp still required
+         keep = true;
        }
       }
       if (!keep) {
@@ -2919,7 +2915,7 @@ const std::vector<int> OSDMap::pgtemp_undo_primaryfirst(const pg_pool_t& pool,
   // Only perform the transform for pools with allow_ec_optimizations set
   // that also have pg_temp set
   if (pool.allows_ecoptimizations()) {
-    if (pg_temp->find(pool.raw_pg_to_pg(pg)) != pg_temp->end()) {
+    if (has_pgtemp(pool.raw_pg_to_pg(pg))) {
       std::vector<int> result;
       int primaryshard = 0;
       int nonprimaryshard = pool.size - pool.nonprimary_shards.size();
@@ -2946,7 +2942,7 @@ const shard_id_t OSDMap::pgtemp_primaryfirst(const pg_pool_t& pool,
   }
   shard_id_t result = shard;
   if (pool.allows_ecoptimizations()) {
-    if (pg_temp->find(pool.raw_pg_to_pg(pg)) != pg_temp->end()) {
+    if (has_pgtemp(pool.raw_pg_to_pg(pg))) {
       int num_parity_shards = pool.size - pool.nonprimary_shards.size() - 1;
       if (shard >= pool.size - num_parity_shards) {
        result = shard_id_t(result + num_parity_shards + 1 - pool.size);
@@ -2967,7 +2963,7 @@ shard_id_t OSDMap::pgtemp_undo_primaryfirst(const pg_pool_t& pool,
   }
   shard_id_t result = shard;
   if (pool.allows_ecoptimizations()) {
-    if (pg_temp->find(pool.raw_pg_to_pg(pg)) != pg_temp->end()) {
+    if (has_pgtemp(pool.raw_pg_to_pg(pg))) {
       int num_parity_shards = pool.size - pool.nonprimary_shards.size() - 1;
       if (shard > num_parity_shards) {
        result = shard_id_t(result - num_parity_shards);
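
For illustration (derived from the two hunks above, assuming a 4+2 profile
where shards 1-3 are the non-primary shards, so num_parity_shards is 2), the
shard-id transforms map indexes as follows:

    natural shard id:      0  1  2  3  4  5
    primary-first index:   0  3  4  5  1  2

pgtemp_primaryfirst moves the parity shards (4 and 5) up to positions 1 and 2
behind shard 0, and pgtemp_undo_primaryfirst inverts that mapping.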
@@ -2982,6 +2978,7 @@ shard_id_t OSDMap::pgtemp_undo_primaryfirst(const pg_pool_t& pool,
 void OSDMap::_get_temp_osds(const pg_pool_t& pool, pg_t pg,
                             vector<int> *temp_pg, int *temp_primary) const
 {
+  vector<int> temp;
   pg = pool.raw_pg_to_pg(pg);
   const auto p = pg_temp->find(pg);
   temp_pg->clear();
@@ -2991,21 +2988,22 @@ void OSDMap::_get_temp_osds(const pg_pool_t& pool, pg_t pg,
        if (pool.can_shift_osds()) {
          continue;
        } else {
-         temp_pg->push_back(CRUSH_ITEM_NONE);
+         temp.push_back(CRUSH_ITEM_NONE);
        }
       } else {
-       temp_pg->push_back(p->second[i]);
+       temp.push_back(p->second[i]);
       }
     }
+    *temp_pg = pgtemp_undo_primaryfirst(pool, pg, temp);
   }
   const auto &pp = primary_temp->find(pg);
   *temp_primary = -1;
   if (pp != primary_temp->end()) {
     *temp_primary = pp->second;
-  } else if (!temp_pg->empty()) { // apply pg_temp's primary
-    for (unsigned i = 0; i < temp_pg->size(); ++i) {
-      if ((*temp_pg)[i] != CRUSH_ITEM_NONE) {
-       *temp_primary = (*temp_pg)[i];
+  } else if (!temp.empty()) { // apply pg_temp's primary
+    for (unsigned i = 0; i < temp.size(); ++i) {
+      if (temp[i] != CRUSH_ITEM_NONE) {
+       *temp_primary = temp[i];
        break;
       }
     }
index ca03e16b6e7c78d9586bc30240831221da7d5777..ac13f6e2113f0a0e0718234bdc856f30cc95fb38 100644 (file)
@@ -1358,6 +1358,9 @@ public:
     return false;
   }
 
+  bool has_pgtemp(const pg_t pg) const {
+    return (pg_temp->find(pg) != pg_temp->end());
+  }
   const std::vector<int> pgtemp_primaryfirst(const pg_pool_t& pool,
                           const std::vector<int>& pg_temp) const;
   const std::vector<int> pgtemp_undo_primaryfirst(const pg_pool_t& pool,
index a1a821bef26c0f900baf0bdd12dc297133871a95..55743361ce9636be358b534c9690ab2b938cd816 100644 (file)
@@ -611,14 +611,10 @@ bool PeeringState::should_restart_peering(
   int newupprimary,
   int newactingprimary,
   const vector<int>& newup,
-  const vector<int>& _newacting,
+  const vector<int>& newacting,
   OSDMapRef lastmap,
   OSDMapRef osdmap)
 {
-  const vector<int> newacting = osdmap->pgtemp_undo_primaryfirst(
-                                         pool.info,
-                                         info.pgid.pgid,
-                                         _newacting);
   if (PastIntervals::is_new_interval(
        primary.osd,
        newactingprimary,
@@ -898,9 +894,7 @@ void PeeringState::init_primary_up_acting(
   int new_acting_primary)
 {
   actingset.clear();
-  acting = get_osdmap()->pgtemp_undo_primaryfirst(pool.info,
-                                                 info.pgid.pgid,
-                                                 newacting);
+  acting = newacting;
   for (uint8_t i = 0; i < acting.size(); ++i) {
     if (acting[i] != CRUSH_ITEM_NONE)
       actingset.insert(
@@ -931,9 +925,26 @@ void PeeringState::init_primary_up_acting(
        break;
       }
     }
+    // Calculating the shard of the acting_primary is tricky because in
+    // error conditions the same osd can be in multiple positions in
+    // the acting_set. Use pgtemp ordering (which places shards which
+    // can become the primary first) to match the code in
+    // OSDMap::_get_temp_osds
+    const OSDMapRef osdmap = get_osdmap();
+    bool has_pgtemp = osdmap->has_pgtemp(spgid.pgid);
+    std::vector<int> pg_temp = acting;
+    if (has_pgtemp) {
+      pg_temp = osdmap->pgtemp_primaryfirst(pool.info, acting);
+    }
     for (uint8_t i = 0; i < acting.size(); ++i) {
-      if (acting[i] == new_acting_primary) {
-       primary = pg_shard_t(acting[i], shard_id_t(i));
+      if (pg_temp[i] == new_acting_primary) {
+       if (has_pgtemp) {
+         primary = pg_shard_t(new_acting_primary,
+                              osdmap->pgtemp_undo_primaryfirst(
+                                pool.info, spgid.pgid, shard_id_t(i)));
+       } else {
+         primary = pg_shard_t(new_acting_primary, shard_id_t(i));
+       }
        break;
       }
     }
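
Worked example (values assumed for illustration, matching the 4+2 sketch
above): if acting is {10,11,12,13,14,15} and new_acting_primary is 14, the
primary-first view built by pgtemp_primaryfirst is {10,14,15,11,12,13}, so 14
is found at position i = 1; pgtemp_undo_primaryfirst(pool.info, spgid.pgid,
shard_id_t(1)) maps that back to shard 4, giving primary = pg_shard_t(14,
shard_id_t(4)). Searching in pg_temp order means that when the same OSD
appears more than once in the acting set, the position chosen here matches
the one OSDMap::_get_temp_osds used when it picked the temp primary.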