git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
osd: EC optimizations rework for pg_temp
author     Bill Scales <bill_scales@uk.ibm.com>
           Thu, 29 May 2025 11:53:27 +0000 (12:53 +0100)
committer  Alex Ainscow <aainscow@uk.ibm.com>
           Wed, 25 Jun 2025 22:36:40 +0000 (23:36 +0100)
Bug fixes for how pg_temp is used with optimized EC pools. For these
pools pg_temp is re-ordered with non-primary shards last. The acting
set was undoing this re-ordering in PeeringState, but this is too
late and results in code getting the shard id wrong. One consequence
of this was an OSD refusing to create a PG because of an incorrect
shard id.

This commit moves the re-ordering earlier, into OSDMap::_get_temp_osds;
some changes are then also required in OSDMap::clean_temps.
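
For background, a minimal sketch of the re-ordering (illustration only, not
the OSDMap code; it assumes the non-primary shards are the data shards other
than shard 0, so the primary-capable shards are shard 0 plus the parity
shards):

    #include <cstddef>
    #include <vector>

    // Re-order an acting set into the "primary first" layout used by pg_temp
    // for optimized EC pools: primary-capable shards first, non-primary last.
    std::vector<int> primary_first(const std::vector<int>& acting,
                                   std::size_t num_nonprimary_shards)
    {
      const std::size_t size = acting.size();
      const std::size_t num_parity = size - num_nonprimary_shards - 1;
      std::vector<int> result;
      result.push_back(acting[0]);                    // shard 0 may be primary
      for (std::size_t i = size - num_parity; i < size; ++i)
        result.push_back(acting[i]);                  // parity shards next
      for (std::size_t i = 1; i < size - num_parity; ++i)
        result.push_back(acting[i]);                  // non-primary shards last
      return result;
    }

    // e.g. a 4+2 pool: acting {10,11,12,13,14,15} with 3 non-primary shards
    // is stored in pg_temp as {10,14,15,11,12,13}.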

Signed-off-by: Bill Scales <bill_scales@uk.ibm.com>
src/osd/OSDMap.cc
src/osd/OSDMap.h
src/osd/PeeringState.cc

index ee95ee22fcbd29021c7c5f6aa3d2c63bae72503a..714f5c41977e9f8c353ec04b4bf76f76c5d9907c 100644 (file)
@@ -2034,13 +2034,9 @@ void OSDMap::clean_temps(CephContext *cct,
       // force a change of primary shard - do not remove pg_temp
       // if it is being used for this purpose
       if (pool->allows_ecoptimizations()) {
-       for (uint8_t i = 0; i < acting_set.size(); ++i) {
-         if (acting_set[i] == primary) {
-           if (pool->is_nonprimary_shard(shard_id_t(i))) {
-             // pg_temp still required
-             keep = true;
-           }
-         }
+       if (nextmap._pick_primary(pg.second) != primary) {
+         // pg_temp still required
+         keep = true;
        }
       }
       if (!keep) {
@@ -2919,7 +2915,7 @@ const std::vector<int> OSDMap::pgtemp_undo_primaryfirst(const pg_pool_t& pool,
   // Only perform the transform for pools with allow_ec_optimizations set
   // that also have pg_temp set
   if (pool.allows_ecoptimizations()) {
-    if (pg_temp->find(pool.raw_pg_to_pg(pg)) != pg_temp->end()) {
+    if (has_pgtemp(pool.raw_pg_to_pg(pg))) {
       std::vector<int> result;
       int primaryshard = 0;
       int nonprimaryshard = pool.size - pool.nonprimary_shards.size();
@@ -2946,7 +2942,7 @@ const shard_id_t OSDMap::pgtemp_primaryfirst(const pg_pool_t& pool,
   }
   shard_id_t result = shard;
   if (pool.allows_ecoptimizations()) {
-    if (pg_temp->find(pool.raw_pg_to_pg(pg)) != pg_temp->end()) {
+    if (has_pgtemp(pool.raw_pg_to_pg(pg))) {
       int num_parity_shards = pool.size - pool.nonprimary_shards.size() - 1;
       if (shard >= pool.size - num_parity_shards) {
        result = shard_id_t(result + num_parity_shards + 1 - pool.size);
@@ -2967,7 +2963,7 @@ shard_id_t OSDMap::pgtemp_undo_primaryfirst(const pg_pool_t& pool,
   }
   shard_id_t result = shard;
   if (pool.allows_ecoptimizations()) {
-    if (pg_temp->find(pool.raw_pg_to_pg(pg)) != pg_temp->end()) {
+    if (has_pgtemp(pool.raw_pg_to_pg(pg))) {
       int num_parity_shards = pool.size - pool.nonprimary_shards.size() - 1;
       if (shard > num_parity_shards) {
        result = shard_id_t(result - num_parity_shards);
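
For illustration (derived from the two hunks above, assuming a 4+2 profile
where shards 1-3 are the non-primary shards, so num_parity_shards is 2), the
shard-id transforms map indexes as follows:

    natural shard id:      0  1  2  3  4  5
    primary-first index:   0  3  4  5  1  2

pgtemp_primaryfirst moves the parity shards (4 and 5) up to positions 1 and 2
behind shard 0, and pgtemp_undo_primaryfirst inverts that mapping.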
@@ -2982,6 +2978,7 @@ shard_id_t OSDMap::pgtemp_undo_primaryfirst(const pg_pool_t& pool,
 void OSDMap::_get_temp_osds(const pg_pool_t& pool, pg_t pg,
                             vector<int> *temp_pg, int *temp_primary) const
 {
+  vector<int> temp;
   pg = pool.raw_pg_to_pg(pg);
   const auto p = pg_temp->find(pg);
   temp_pg->clear();
@@ -2991,21 +2988,22 @@ void OSDMap::_get_temp_osds(const pg_pool_t& pool, pg_t pg,
        if (pool.can_shift_osds()) {
          continue;
        } else {
-         temp_pg->push_back(CRUSH_ITEM_NONE);
+         temp.push_back(CRUSH_ITEM_NONE);
        }
       } else {
-       temp_pg->push_back(p->second[i]);
+       temp.push_back(p->second[i]);
       }
     }
+    *temp_pg = pgtemp_undo_primaryfirst(pool, pg, temp);
   }
   const auto &pp = primary_temp->find(pg);
   *temp_primary = -1;
   if (pp != primary_temp->end()) {
     *temp_primary = pp->second;
-  } else if (!temp_pg->empty()) { // apply pg_temp's primary
-    for (unsigned i = 0; i < temp_pg->size(); ++i) {
-      if ((*temp_pg)[i] != CRUSH_ITEM_NONE) {
-       *temp_primary = (*temp_pg)[i];
+  } else if (!temp.empty()) { // apply pg_temp's primary
+    for (unsigned i = 0; i < temp.size(); ++i) {
+      if (temp[i] != CRUSH_ITEM_NONE) {
+       *temp_primary = temp[i];
        break;
       }
     }
index ca03e16b6e7c78d9586bc30240831221da7d5777..ac13f6e2113f0a0e0718234bdc856f30cc95fb38 100644 (file)
@@ -1358,6 +1358,9 @@ public:
     return false;
   }
 
+  bool has_pgtemp(const pg_t pg) const {
+    return (pg_temp->find(pg) != pg_temp->end());
+  }
   const std::vector<int> pgtemp_primaryfirst(const pg_pool_t& pool,
                           const std::vector<int>& pg_temp) const;
   const std::vector<int> pgtemp_undo_primaryfirst(const pg_pool_t& pool,
index a1a821bef26c0f900baf0bdd12dc297133871a95..55743361ce9636be358b534c9690ab2b938cd816 100644 (file)
@@ -611,14 +611,10 @@ bool PeeringState::should_restart_peering(
   int newupprimary,
   int newactingprimary,
   const vector<int>& newup,
-  const vector<int>& _newacting,
+  const vector<int>& newacting,
   OSDMapRef lastmap,
   OSDMapRef osdmap)
 {
-  const vector<int> newacting = osdmap->pgtemp_undo_primaryfirst(
-                                         pool.info,
-                                         info.pgid.pgid,
-                                         _newacting);
   if (PastIntervals::is_new_interval(
        primary.osd,
        newactingprimary,
@@ -898,9 +894,7 @@ void PeeringState::init_primary_up_acting(
   int new_acting_primary)
 {
   actingset.clear();
-  acting = get_osdmap()->pgtemp_undo_primaryfirst(pool.info,
-                                                 info.pgid.pgid,
-                                                 newacting);
+  acting = newacting;
   for (uint8_t i = 0; i < acting.size(); ++i) {
     if (acting[i] != CRUSH_ITEM_NONE)
       actingset.insert(
@@ -931,9 +925,26 @@ void PeeringState::init_primary_up_acting(
        break;
       }
     }
+    // Calculating the shard of the acting_primary is tricky because in
+    // error conditions the same osd can be in multiple positions in
+    // the acting_set. Use pgtemp ordering (which places shards which
+    // can become the primary first) to match the code in
+    // OSDMap::_get_temp_osds
+    const OSDMapRef osdmap = get_osdmap();
+    bool has_pgtemp = osdmap->has_pgtemp(spgid.pgid);
+    std::vector<int> pg_temp = acting;
+    if (has_pgtemp) {
+      pg_temp = osdmap->pgtemp_primaryfirst(pool.info, acting);
+    }
     for (uint8_t i = 0; i < acting.size(); ++i) {
-      if (acting[i] == new_acting_primary) {
-       primary = pg_shard_t(acting[i], shard_id_t(i));
+      if (pg_temp[i] == new_acting_primary) {
+       if (has_pgtemp) {
+         primary = pg_shard_t(new_acting_primary,
+                              osdmap->pgtemp_undo_primaryfirst(
+                                pool.info, spgid.pgid, shard_id_t(i)));
+       } else {
+         primary = pg_shard_t(new_acting_primary, shard_id_t(i));
+       }
        break;
       }
     }
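
Worked example (values assumed for illustration, matching the 4+2 sketch
above): if acting is {10,11,12,13,14,15} and new_acting_primary is 14, the
primary-first view built by pgtemp_primaryfirst is {10,14,15,11,12,13}, so 14
is found at position i = 1; pgtemp_undo_primaryfirst(pool.info, spgid.pgid,
shard_id_t(1)) maps that back to shard 4, giving primary = pg_shard_t(14,
shard_id_t(4)). Searching in pg_temp order means that when the same OSD
appears more than once in the acting set, the position chosen here matches
the one OSDMap::_get_temp_osds used when it picked the temp primary.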