]> git.apps.os.sepia.ceph.com Git - ceph-ci.git/commitdiff
osd: New options for configuring new EC
authorAlex Ainscow <aainscow@uk.ibm.com>
Thu, 27 Mar 2025 15:37:57 +0000 (15:37 +0000)
committerAlex Ainscow <aainscow@uk.ibm.com>
Tue, 22 Apr 2025 07:38:15 +0000 (08:38 +0100)
Adding three new configuration options which will apply once new EC
is in place:

osd_pool_default_flag_ec_optimizations

This allows EC optimizations to be turned on by default.

ec_extent_cache_size

This allows the user to specify the size of the per-shard extent cache if
they feel that the default 10MiB is too large or too small.

The default value may well change following more extensive testing.

ec_pdw_write_mode

This is a development flag for testing the parity delta write RMW mechanism
within the EC code.  Setting to anything other than 0 will cause performance
problems.  It is provided as a test mechanism for performance and
teuthology.  Performance may wish to turn off all PDW writes for a particular
IO pattern. This will allow us to determine if the automatic mode should be
using conventional RMW writes.  The force-on mode allows testing on more
unusual scenarios and on smaller configurations.

Finally, we tweak the way optimisations are enabled, so that the same code path
is used both when enabling them on a pool and when they are enabled by default
at pool creation.

Signed-off-by: Alex Ainscow <aainscow@uk.ibm.com>
src/common/options/global.yaml.in
src/erasure-code/ErasureCodeInterface.h
src/erasure-code/isa/ErasureCodeIsa.h
src/erasure-code/jerasure/ErasureCodeJerasure.h
src/mon/OSDMonitor.cc
src/mon/OSDMonitor.h

index 279156370a514f308bff8e0bbdd14e805afab8c6..15c7c0fa26ae430de85b0ad606606c83f775539e 100644 (file)
@@ -2703,6 +2703,15 @@ options:
   services:
   - mon
   with_legacy: true
+- name: osd_pool_default_flag_ec_optimizations
+  type: bool
+  level: advanced
+  desc: Control whether to create new erasure coded pools with EC optimizations turned on by default.
+  fmt_desc: Set the ``allow_ec_optimizations`` flag on new erasure coded pools.
+  default: false
+  services:
+  - mon
+  with_legacy: true
 - name: osd_pool_default_hit_set_bloom_fpp
   type: float
   level: advanced
@@ -6765,3 +6774,15 @@ options:
     The format is ``{file}:{line} [, {file}:{line}]``
   level: dev
   with_legacy: false
+- name: ec_extent_cache_size
+  type: uint
+  level: advanced
+  desc: Size of per-shard extent cache
+  default: 10485760
+  services:
+  - osd
+- name: ec_pdw_write_mode
+  type: uint
+  level: dev
+  default: 0
+  desc: When EC writes should generate PDWs (development only) 0=optimal 1=never 2=when possible
index 8e2f5cdd1e5dacf040ecd8c0f533e45b4257b58c..695a6683afedbe6e614ad60da9ae904ed40c2683 100644 (file)
@@ -677,6 +677,10 @@ namespace ceph {
        * clay). Other plugins will not process the overhead of stub sub-chunks.
        */
       FLAG_EC_PLUGIN_REQUIRE_SUB_CHUNKS = 1<<5,
+      /* Optimized EC is supported only if this flag is set. All other flags
+       * are irrelevant if this flag is false.
+       */
+      FLAG_EC_PLUGIN_OPTIMIZED_SUPPORTED = 1<<6,
     };
     static const char *get_optimization_flag_name(const plugin_flags flag) {
       switch (flag) {
@@ -686,6 +690,7 @@ namespace ceph {
       case FLAG_EC_PLUGIN_ZERO_PADDING_OPTIMIZATION: return "zeropadding";
       case FLAG_EC_PLUGIN_PARITY_DELTA_OPTIMIZATION: return "paritydelta";
       case FLAG_EC_PLUGIN_REQUIRE_SUB_CHUNKS: return "requiresubchunks";
+      case FLAG_EC_PLUGIN_OPTIMIZED_SUPPORTED: return "optimizedsupport";
       default: return "???";
       }
     }
index e302eec0bc1806a5eb6adca700ab57abe87caf60..f68fa527d99e12b7857c55623bf104e045840e2e 100644 (file)
 #define CEPH_ERASURE_CODE_ISA_L_H
 
 // -----------------------------------------------------------------------------
+#include <string_view>
 #include "erasure-code/ErasureCode.h"
 #include "ErasureCodeIsaTableCache.h"
 // -----------------------------------------------------------------------------
 
+using namespace std::literals;
+
 #define EC_ISA_ADDRESS_ALIGNMENT 32u
 
 #define is_aligned(POINTER, BYTE_COUNT) \
@@ -51,6 +54,7 @@ public:
 
   ErasureCodeIsaTableCache &tcache;
   const char *technique;
+  uint64_t flags;
 
   ErasureCodeIsa(const char *_technique,
                  ErasureCodeIsaTableCache &_tcache) :
@@ -60,6 +64,15 @@ public:
   tcache(_tcache),
   technique(_technique)
   {
+    flags = FLAG_EC_PLUGIN_PARTIAL_READ_OPTIMIZATION |
+            FLAG_EC_PLUGIN_PARTIAL_WRITE_OPTIMIZATION |
+            FLAG_EC_PLUGIN_ZERO_INPUT_ZERO_OUTPUT_OPTIMIZATION |
+            FLAG_EC_PLUGIN_PARITY_DELTA_OPTIMIZATION;
+
+    if (technique == "reed_sol_van"sv ||
+        technique == "default"sv) {
+      flags |= FLAG_EC_PLUGIN_OPTIMIZED_SUPPORTED;
+    }
   }
 
   
@@ -68,10 +81,7 @@ public:
   }
 
   uint64_t get_supported_optimizations() const override {
-    return FLAG_EC_PLUGIN_PARTIAL_READ_OPTIMIZATION |
-      FLAG_EC_PLUGIN_PARTIAL_WRITE_OPTIMIZATION |
-      FLAG_EC_PLUGIN_ZERO_INPUT_ZERO_OUTPUT_OPTIMIZATION |
-      FLAG_EC_PLUGIN_PARITY_DELTA_OPTIMIZATION;
+    return flags;
   }
 
   unsigned int
index e14562668c42efa2bb952484e489ead64414526a..fac3c1d400e55c37bb8be30a384ab31f8b0d46a6 100644 (file)
 #ifndef CEPH_ERASURE_CODE_JERASURE_H
 #define CEPH_ERASURE_CODE_JERASURE_H
 
+#include <string_view>
+
 #include "erasure-code/ErasureCode.h"
 
+using namespace std::literals;
+
 class ErasureCodeJerasure : public ceph::ErasureCode {
 public:
   int k;
@@ -32,28 +36,33 @@ public:
   std::string rule_root;
   std::string rule_failure_domain;
   bool per_chunk_alignment;
+  uint64_t flags;
+
+  explicit ErasureCodeJerasure(const char *_technique)
+      : k(0),
+        DEFAULT_K("2"),
+        m(0),
+        DEFAULT_M("1"),
+        w(0),
+        DEFAULT_W("8"),
+        technique(_technique),
+        per_chunk_alignment(false) {
+    flags = FLAG_EC_PLUGIN_PARTIAL_READ_OPTIMIZATION |
+      FLAG_EC_PLUGIN_PARTIAL_WRITE_OPTIMIZATION |
+      FLAG_EC_PLUGIN_ZERO_INPUT_ZERO_OUTPUT_OPTIMIZATION |
+      FLAG_EC_PLUGIN_PARITY_DELTA_OPTIMIZATION;
 
-  explicit ErasureCodeJerasure(const char *_technique) :
-    k(0),
-    DEFAULT_K("2"),
-    m(0),
-    DEFAULT_M("1"),
-    w(0),
-    DEFAULT_W("8"),
-    technique(_technique),
-    per_chunk_alignment(false)
-  {}
+    if (technique == "reed_sol_van"sv) {
+      flags |= FLAG_EC_PLUGIN_OPTIMIZED_SUPPORTED;
+    }
+  }
 
   ~ErasureCodeJerasure() override {}
 
   uint64_t get_supported_optimizations() const override {
-    return FLAG_EC_PLUGIN_PARTIAL_READ_OPTIMIZATION |
-      FLAG_EC_PLUGIN_PARTIAL_WRITE_OPTIMIZATION |
-      FLAG_EC_PLUGIN_ZERO_INPUT_ZERO_OUTPUT_OPTIMIZATION |
-      FLAG_EC_PLUGIN_PARITY_DELTA_OPTIMIZATION;
+    return flags;
   }
 
-  
   unsigned int get_chunk_count() const override {
     return k + m;
   }
index 81716ccbd958c95bed6f5fa65eac93d48e85b4fe..d9f190f41a9dbf7baacfca796f809d291cfe2c5a 100644 (file)
@@ -8226,12 +8226,12 @@ int OSDMonitor::prepare_new_pool(string& name,
   pg_pool_t *pi = pending_inc.get_new_pool(pool, &empty);
   pi->create_time = ceph_clock_now();
   pi->type = pool_type;
-  pi->fast_read = fread; 
+  pi->fast_read = fread;
   pi->flags = g_conf()->osd_pool_default_flags;
   if (bulk) {
     pi->set_flag(pg_pool_t::FLAG_BULK);
   } else if (g_conf()->osd_pool_default_flag_bulk) {
-      pi->set_flag(pg_pool_t::FLAG_BULK);
+    pi->set_flag(pg_pool_t::FLAG_BULK);
   }
   if (g_conf()->osd_pool_default_flag_hashpspool)
     pi->set_flag(pg_pool_t::FLAG_HASHPSPOOL);
@@ -8331,6 +8331,11 @@ int OSDMonitor::prepare_new_pool(string& name,
   pi->cache_min_flush_age = g_conf()->osd_pool_default_cache_min_flush_age;
   pi->cache_min_evict_age = g_conf()->osd_pool_default_cache_min_evict_age;
 
+  if (cct->_conf.get_val<bool>("osd_pool_default_flag_ec_optimizations")) {
+    // This will fail if the pool cannot support ec optimizations.
+    enable_pool_ec_optimizations(*pi, nullptr, true);
+  }
+
   pending_inc.new_pool_names[pool] = name;
   return 0;
 }
@@ -8361,6 +8366,70 @@ bool OSDMonitor::prepare_unset_flag(MonOpRequestRef op, int flag)
   return true;
 }
 
+int OSDMonitor::enable_pool_ec_optimizations(pg_pool_t &p,  // validate, then apply, an allow_ec_optimizations request for pool p
+    stringstream *ss, bool enable) {  // ss may be null (error text is then dropped); returns 0 or -EINVAL
+  if (!p.is_erasure()) {  // checked for enable and disable requests alike
+    if (ss) {
+      *ss << "allow_ec_optimizations can only be enabled for an erasure coded pool";
+    }
+    return -EINVAL;
+  }
+  if (osdmap.require_osd_release < ceph_release_t::tentacle) {  // likewise checked regardless of 'enable'
+    if (ss) {
+      *ss << "All OSDs must be upgraded to tentacle or "
+           << "later before setting allow_ec_optimizations";
+    }
+    return -EINVAL;
+  }
+  if (enable) {
+    ErasureCodeInterfaceRef erasure_code;
+    unsigned int k, m;  // only assigned on the err == 0 path; the failure path returns before they are read
+    stringstream tmp;
+    int err = get_erasure_code(p.erasure_code_profile, &erasure_code, &tmp);  // tmp's contents are appended to *ss on failure
+    if (err == 0) {
+      k = erasure_code->get_data_chunk_count();
+      m = erasure_code->get_coding_chunk_count();
+    } else {
+      if (ss) {
+        *ss << "get_erasure_code failed: " << tmp.str();
+      }
+      return -EINVAL;  // the original err value is not propagated
+    }
+    if ((erasure_code->get_supported_optimizations() &
+        ErasureCodeInterface::FLAG_EC_PLUGIN_OPTIMIZED_SUPPORTED) == 0) {  // plugin must advertise optimized-EC support
+      if (ss) {
+        *ss << "ec optimizations not currently supported for pool profile.";
+      }
+      return -EINVAL;
+    }
+    // Restrict the set of shards that can be a primary to the 1st data
+    // raw_shard (raw_shard 0) and the coding parity raw_shards because
+    // the other shards (including local parity for LRC) may not have
+    // up to date copies of xattrs including OI
+    p.nonprimary_shards.clear();  // the set is rebuilt from scratch on every enable
+    for (raw_shard_id_t raw_shard; raw_shard < k + m; ++raw_shard) {  // NOTE(review): starts from raw_shard_id_t's default value, presumably 0 - confirm
+      if (raw_shard > 0 && raw_shard < k) {  // i.e. every data raw_shard except the first
+       shard_id_t shard;
+       if (erasure_code->get_chunk_mapping().size() > raw_shard ) {  // use the plugin's chunk mapping when it has an entry at this index
+         shard = shard_id_t(erasure_code->get_chunk_mapping().at(int(raw_shard)));
+       } else {  // otherwise the shard id is the raw_shard index itself
+         shard = shard_id_t(int(raw_shard));
+       }
+        p.nonprimary_shards.insert(shard);
+      }
+    }
+    p.flags |= pg_pool_t::FLAG_EC_OPTIMIZATIONS;  // reached only after every check above has passed
+  } else {  // disable request
+    if ((p.flags & pg_pool_t::FLAG_EC_OPTIMIZATIONS) != 0) {  // refused when the flag is set; this function never clears it
+      if (ss) {
+        *ss << "allow_ec_optimizations cannot be disabled once enabled";
+      }
+      return -EINVAL;
+    }
+  }
+  return 0;  // every rejection above returned -EINVAL
+}
+
 int OSDMonitor::prepare_command_pool_set(const cmdmap_t& cmdmap,
                                          stringstream& ss)
 {
@@ -8828,26 +8897,34 @@ int OSDMonitor::prepare_command_pool_set(const cmdmap_t& cmdmap,
       return -EINVAL;
     }
   } else if (var == "allow_ec_optimizations") {
-    if (!p.is_erasure()) {
-      ss << "allow_ec_optimizations can only be enabled for an erasure coded pool";
-      return -EINVAL;
-    }
-    if (osdmap.require_osd_release < ceph_release_t::tentacle) {
-      ss << "All OSDs must be upgraded to tentacle or "
-           << "later before setting allow_ec_optimizations";
-        return -EINVAL;
-      }
+    bool enable = false;
     if (val == "true" || (interr.empty() && n == 1)) {
-      p.flags |= pg_pool_t::FLAG_EC_OPTIMIZATIONS;
+      enable = true;
     } else if (val == "false" || (interr.empty() && n == 0)) {
-      if ((p.flags & pg_pool_t::FLAG_EC_OPTIMIZATIONS) != 0) {
-       ss << "allow_ec_optimizations cannot be disabled once enabled";
-       return -EINVAL;
-      }
+      enable = false;
     } else {
       ss << "expecting value 'true', 'false', '0', or '1'";
       return -EINVAL;
     }
+    bool was_enabled = p.allows_ecoptimizations();
+    // Hand the command's own stream to the helper so that the reason for a
+    // rejection (non-EC pool, OSDs older than tentacle, unsupported profile,
+    // attempt to disable) is reported to the caller instead of being dropped.
+    int r = enable_pool_ec_optimizations(p, &ss, enable);
+    if (r != 0) {
+      return r;
+    }
+    if (!was_enabled && p.allows_ecoptimizations()) {
+      // Pools with allow_ec_optimizations set store pg_temp in a different
+      // order to change the primary selection algorithm without breaking
+      // old clients. Modify any existing pg_temp for the pool now.
+      // This is only needed when switching on optimisations after creation.
+      for (auto pg_temp = osdmap.pg_temp->begin();
+           pg_temp != osdmap.pg_temp->end();
+           ++pg_temp) {
+        if (pg_temp->first.pool() == pool) {
+          std::vector<int> new_pg_temp = osdmap.pgtemp_primaryfirst(p, pg_temp->second);
+          pending_inc.new_pg_temp[pg_temp->first] = mempool::osdmap::vector<int>(new_pg_temp.begin(), new_pg_temp.end());
+        }
+      }
+    }
   } else if (var == "target_max_objects") {
     if (interr.length()) {
       ss << "error parsing int '" << val << "': " << interr;
index 4d1172932de65f1c48b83f94520fd0e719eb4528..83dffb35ba729938dd3e6c811ebdd187c630e403 100644 (file)
@@ -741,6 +741,9 @@ public:
       std::stringstream &ss,
       ceph::Formatter *f);
 
+  int enable_pool_ec_optimizations(pg_pool_t &pool,
+                                   std::stringstream *ss,
+                                   bool enable);
   int prepare_command_pool_set(const cmdmap_t& cmdmap,
                                std::stringstream& ss);