osd: Restrict choice of primary shard for ec_optimizations pools
author    Bill Scales <156200352+bill-scales@users.noreply.github.com>
          Thu, 6 Mar 2025 12:20:52 +0000 (12:20 +0000)
committer Bill Scales <bill_scales@uk.ibm.com>
          Fri, 11 Apr 2025 06:31:56 +0000 (07:31 +0100)
Pools with ec_optimizations enabled have restrictions on which
shards are permitted to become the primary because not all shards
are updated for every I/O.

To preserve backwards compatibility with downlevel clients,
pg_temp is used as the method to override OSDMap's selection
of the primary. Directly changing the logic in OSDMap would
have required all clients to be upgraded to tentacle before
optimized EC pools could be used, so that approach was
discounted. Using primary_temp to set the primary for an EC
pool is not reliable because, under error conditions, an OSD
can store multiple shards for the same PG and primary_temp
cannot define which of those shards will be chosen.

For optimized EC pools pg_temp is shuffled so that the
non-primary shards are listed last. This means that the
existing logic in OSDMap that picks the first available
shard as the primary will avoid selecting a non-primary
shard. OSDMonitor applies the shuffle when pg_temp is set;
the shuffle is then reverted in PeeringState when
initializing the acting set after OSDMap has selected the
primary.
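
For illustration only, a minimal standalone sketch of the shuffle
(not the actual OSDMap code; the hard-coded 4+2 layout and the
nonprimary_shards set stand in for pg_pool_t::is_nonprimary_shard()):

#include <iostream>
#include <set>
#include <vector>

// Assumed layout: 4+2 optimized EC pool in which data shards 1-3 are
// not updated on every I/O and therefore must not become the primary.
static const std::set<int> nonprimary_shards = {1, 2, 3};

// Move the OSDs holding non-primary shards to the back of pg_temp so
// that the unmodified OSDMap logic (pick the first available OSD)
// never selects one of them as primary.
std::vector<int> pgtemp_primaryfirst(const std::vector<int>& pg_temp) {
  std::vector<int> result, nonprimary;
  for (int shard = 0; shard < (int)pg_temp.size(); ++shard) {
    if (nonprimary_shards.count(shard))
      nonprimary.push_back(pg_temp[shard]);
    else
      result.push_back(pg_temp[shard]);
  }
  result.insert(result.end(), nonprimary.begin(), nonprimary.end());
  return result;
}

int main() {
  const int NONE = -1;  // stand-in for CRUSH_ITEM_NONE
  std::vector<int> acting = {NONE, 6, 7, 8, 9, 10};
  for (int osd : pgtemp_primaryfirst(acting))
    std::cout << osd << ' ';  // prints: -1 9 10 6 7 8
  std::cout << '\n';
}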

PeeringState::choose_acting is modified to set pg_temp if
OSDMap has selected a non-primary shard; this causes a new
OSDMap to be published, which in turn steers OSDMap to
select a primary shard instead.
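
The reverse transform applied on the OSD side can be sketched the
same way (again a simplified standalone illustration that reuses the
nonprimary_shards set from the sketch above, not the real
PeeringState/OSDMap code):

// Undo the primary-first shuffle: primary-capable shards were packed
// at the front and non-primary shards at the back; rebuild the
// original shard order.
std::vector<int> pgtemp_undo_primaryfirst(const std::vector<int>& shuffled) {
  std::vector<int> result;
  size_t primary_idx = 0;
  size_t nonprimary_idx = shuffled.size() - nonprimary_shards.size();
  for (int shard = 0; shard < (int)shuffled.size(); ++shard) {
    if (nonprimary_shards.count(shard))
      result.push_back(shuffled[nonprimary_idx++]);
    else
      result.push_back(shuffled[primary_idx++]);
  }
  return result;
}

// Round trip: pgtemp_undo_primaryfirst(pgtemp_primaryfirst(v)) == v,
// e.g. {NONE, 9, 10, 6, 7, 8} maps back to {NONE, 6, 7, 8, 9, 10}.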

Signed-off-by: Bill Scales <bill_scales@uk.ibm.com>
src/mon/OSDMonitor.cc
src/osd/OSDMap.cc
src/osd/OSDMap.h
src/osd/PeeringState.cc

index 495a78e4293b21c06d2647913ab042ab6df804e8..81716ccbd958c95bed6f5fa65eac93d48e85b4fe 100644 (file)
@@ -4187,8 +4187,18 @@ bool OSDMonitor::prepare_pgtemp(MonOpRequestRef op)
                << ": pool has been removed" << dendl;
       continue;
     }
+    // Pools with allow_ec_optimizations set store pg_temp in a different
+    // order to change the primary selection algorithm without breaking
+    // old clients. If necessary re-order the new pg_temp now
+    pg_pool_t pg_pool;
+    if (pending_inc.new_pools.count(pool))
+      pg_pool = pending_inc.new_pools[pool];
+    else
+      pg_pool = *osdmap.get_pg_pool(pool);
+
+    std::vector<int> pg_temp = osdmap.pgtemp_primaryfirst(pg_pool, p->second);
     pending_inc.new_pg_temp[p->first] =
-      mempool::osdmap::vector<int>(p->second.begin(), p->second.end());
+      mempool::osdmap::vector<int>(pg_temp.begin(), pg_temp.end());
 
     // unconditionally clear pg_primary (until this message can encode
     // a change for that, too.. at which point we need to also fix
index ffdb03fec4f44702a750f95633543b3876bbd197..cff5a6153842b1922cf761ebc97e8b6ff3569f2d 100644 (file)
@@ -2852,6 +2852,73 @@ void OSDMap::_apply_primary_affinity(ps_t seed,
   }
 }
 
+/* EC pools with allow_ec_optimizations set have some shards that cannot
+ * become the primary because they are not updated on every I/O. To avoid
+ * requiring clients to be upgraded to use these new pools the logic in
+ * OSDMap which selects a primary cannot be changed. Instead choose_acting
+ * is modified to set pgtemp when it is necessary to override the choice
+ * of primary, and this vector is reordered so that shards that are
+ * permitted to be the primary are listed first. The existing OSDMap code
+ * will then choose a suitable shard as primary except when the pg is
+ * incomplete and the choice of primary doesn't matter. This function is
+ * called by OSDMonitor when setting pg_temp to transform the vector.
+ *
+ * Example: Optimized EC pool 4+2
+ * acting_set = {NONE, 6, 7, 8, 9, 10}
+ * non_primary_shards = {1, 2, 3} # data shards other than shard 0
+ * pg_temp = {NONE, 9, 10, 6, 7, 8} # non-primary shards at end
+ * primary will be OSD 9(1)
+ */
+const std::vector<int> OSDMap::pgtemp_primaryfirst(const pg_pool_t& pool,
+                        const std::vector<int>& pg_temp) const
+{
+  // Only perform the transform for pools with allow_ec_optimizations set
+  if (pool.allows_ecoptimizations()) {
+    std::vector<int> result;
+    std::vector<int> nonprimary;
+    int shard = 0;
+    for (auto osd : pg_temp) {
+      if (pool.is_nonprimary_shard(shard_id_t(shard))) {
+       nonprimary.emplace_back(osd);
+      } else {
+       result.emplace_back(osd);
+      }
+      shard++;
+    }
+    result.insert(result.end(), nonprimary.begin(), nonprimary.end());
+    return result;
+  }
+  return pg_temp;
+}
+
+/* The function above reorders the pg_temp vector. This transformation needs
+ * to be reversed by OSDs (but not clients) and is called by PeeringState
+ * when initializing the acting set.
+ */
+const std::vector<int> OSDMap::pgtemp_undo_primaryfirst(const pg_pool_t& pool,
+       const pg_t pg, const std::vector<int>& acting) const
+{
+  // Only perform the transform for pools with allow_ec_optimizations set
+  // that also have pg_temp set
+  if (pool.allows_ecoptimizations()) {
+    if (pg_temp->find(pool.raw_pg_to_pg(pg)) != pg_temp->end()) {
+      std::vector<int> result;
+      int primaryshard = 0;
+      int nonprimaryshard = pool.size - pool.nonprimary_shards.size();
+      assert(acting.size() == pool.size);
+      for (auto shard = 0; shard < pool.size; shard++) {
+       if (pool.is_nonprimary_shard(shard_id_t(shard))) {
+         result.emplace_back(acting[nonprimaryshard++]);
+       } else {
+         result.emplace_back(acting[primaryshard++]);
+       }
+      }
+      return result;
+    }
+  }
+  return acting;
+}
+
 void OSDMap::_get_temp_osds(const pg_pool_t& pool, pg_t pg,
                             vector<int> *temp_pg, int *temp_primary) const
 {
index 3a4c56a46f32c89f8bc8a39da7eb6659eb53e2b9..81f3d914edab432c62d756e78f946fb24e82847d 100644 (file)
@@ -590,6 +590,7 @@ private:
 
   mempool::osdmap::vector<__u32>   osd_weight;   // 16.16 fixed point, 0x10000 = "in", 0 = "out"
   mempool::osdmap::vector<osd_info_t> osd_info;
+  // Optimized EC pools re-order pg_temp, see pgtemp_primaryfirst
   std::shared_ptr<PGTempMap> pg_temp;  // temp pg mapping (e.g. while we rebuild)
   std::shared_ptr< mempool::osdmap::map<pg_t,int32_t > > primary_temp;  // temp primary mapping (e.g. while we rebuild)
   std::shared_ptr< mempool::osdmap::vector<__u32> > osd_primary_affinity; ///< 16.16 fixed point, 0x10000 = baseline
@@ -1357,6 +1358,12 @@ public:
     return false;
   }
 
+  const std::vector<int> pgtemp_primaryfirst(const pg_pool_t& pool,
+                          const std::vector<int>& pg_temp) const;
+  const std::vector<int> pgtemp_undo_primaryfirst(const pg_pool_t& pool,
+                          const pg_t pg,
+                          const std::vector<int>& acting) const;
+
   bool in_removed_snaps_queue(int64_t pool, snapid_t snap) const {
     auto p = removed_snaps_queue.find(pool);
     if (p == removed_snaps_queue.end()) {
index 7a3c911840d48d6624f85f19a85e4f663e02a0e8..4dd7543e10ac91714255dcbf56597bd966526b9c 100644 (file)
@@ -538,10 +538,14 @@ bool PeeringState::should_restart_peering(
   int newupprimary,
   int newactingprimary,
   const vector<int>& newup,
-  const vector<int>& newacting,
+  const vector<int>& _newacting,
   OSDMapRef lastmap,
   OSDMapRef osdmap)
 {
+  const vector<int> newacting = osdmap->pgtemp_undo_primaryfirst(
+                                         pool.info,
+                                         info.pgid.pgid,
+                                         _newacting);
   if (PastIntervals::is_new_interval(
        primary.osd,
        newactingprimary,
@@ -820,7 +824,9 @@ void PeeringState::init_primary_up_acting(
   int new_acting_primary)
 {
   actingset.clear();
-  acting = newacting;
+  acting = get_osdmap()->pgtemp_undo_primaryfirst(pool.info,
+                                                 info.pgid.pgid,
+                                                 newacting);
   for (uint8_t i = 0; i < acting.size(); ++i) {
     if (acting[i] != CRUSH_ITEM_NONE)
       actingset.insert(
@@ -2445,13 +2451,23 @@ bool PeeringState::choose_acting(pg_shard_t &auth_log_shard_id,
                << " from oversized want " << want << dendl;
     want.pop_back();
   }
-  if (want != acting) {
-    psdout(10) << "want " << want << " != acting " << acting
+  if ((want != acting) ||
+      pool.info.is_nonprimary_shard(pg_whoami.shard)) {
+    if (pool.info.is_nonprimary_shard(pg_whoami.shard)) {
+      psdout(10) << "shard " << pg_whoami.shard << " cannot be primary, want "
+              << pg_vector_string(want)
+              << " acting " << pg_vector_string(acting)
               << ", requesting pg_temp change" << dendl;
+    } else {
+      psdout(10) << "want " << pg_vector_string(want)
+              << " != acting " << pg_vector_string(acting)
+              << ", requesting pg_temp change" << dendl;
+    }
     want_acting = want;
 
     if (!cct->_conf->osd_debug_no_acting_change) {
-      if (want_acting == up) {
+      if ((want_acting == up) &&
+         !pool.info.is_nonprimary_shard(pg_whoami.shard)) {
        // There can't be any pending backfill if
        // want is the same as crush map up OSDs.
        ceph_assert(want_backfill.empty());