From f3966cafe4758082d7e8f49414f8018735946a37 Mon Sep 17 00:00:00 2001 From: Alex Ainscow Date: Thu, 27 Mar 2025 15:37:57 +0000 Subject: [PATCH] osd: New options for configuring new EC Adding three new configuration options which will apply once new EC is in place: osd_pool_default_flag_ec_optimizations This allows EC optimizations to be turned on by default. ec_extent_cache_size This allows the user to specify the size of the per-shard extent cache if they feel that the default 10MiB is too large or too small. The default value may well change following more extensive testing. ec_pdw_write_mode This is a development flag for testing the parity delta write RMW mechanism within the EC code. Setting to anything other than 0 will cause performance problems. It is provided as a test mechanism for performance and teuthology. Performance may wish to turn off all PDW writes for a particular IO pattern. This will allow us to determine if the automatic mode should be using conventional RMW writes. The force-on mode allows testing on more unusual scenarios and on smaller configurations. Finally, we tweak the way optimisations are enabled, so as to be common between enabling and default-enabled. Signed-off-by: Alex Ainscow --- src/common/options/global.yaml.in | 21 ++++ src/erasure-code/ErasureCodeInterface.h | 5 + src/erasure-code/isa/ErasureCodeIsa.h | 18 ++- .../jerasure/ErasureCodeJerasure.h | 39 ++++--- src/mon/OSDMonitor.cc | 109 +++++++++++++++--- src/mon/OSDMonitor.h | 3 + 6 files changed, 160 insertions(+), 35 deletions(-) diff --git a/src/common/options/global.yaml.in b/src/common/options/global.yaml.in index 279156370a514..15c7c0fa26ae4 100644 --- a/src/common/options/global.yaml.in +++ b/src/common/options/global.yaml.in @@ -2703,6 +2703,15 @@ options: services: - mon with_legacy: true +- name: osd_pool_default_flag_ec_optimizations + type: bool + level: advanced + desc: Control whether to create new erasure coded pools with EC optimizations turned on by default. 
+ fmt_desc: Set the ``allow_ec_optimizations`` flag on new erasure coded pools. + default: false + services: + - mon + with_legacy: true - name: osd_pool_default_hit_set_bloom_fpp type: float level: advanced @@ -6765,3 +6774,15 @@ options: The format is ``{file}:{line} [, {file}:{line}]`` level: dev with_legacy: false +- name: ec_extent_cache_size + type: uint + level: advanced + desc: Size of per-shard extent cache + default: 10485760 + services: + - osd +- name: ec_pdw_write_mode + type: uint + level: dev + default: 0 + desc: When EC writes should generate PDWs (development only) 0=optimal 1=never 2=when possible diff --git a/src/erasure-code/ErasureCodeInterface.h b/src/erasure-code/ErasureCodeInterface.h index 8e2f5cdd1e5da..695a6683afedb 100644 --- a/src/erasure-code/ErasureCodeInterface.h +++ b/src/erasure-code/ErasureCodeInterface.h @@ -677,6 +677,10 @@ namespace ceph { * clay). Other plugins will not process the overhead of stub sub-chunks. */ FLAG_EC_PLUGIN_REQUIRE_SUB_CHUNKS = 1<<5, + /* Optimized EC is supported only if this flag is set. All other flags + * are irrelevant if this flag is false. 
+ */ + FLAG_EC_PLUGIN_OPTIMIZED_SUPPORTED = 1<<6, }; static const char *get_optimization_flag_name(const plugin_flags flag) { switch (flag) { @@ -686,6 +690,7 @@ namespace ceph { case FLAG_EC_PLUGIN_ZERO_PADDING_OPTIMIZATION: return "zeropadding"; case FLAG_EC_PLUGIN_PARITY_DELTA_OPTIMIZATION: return "paritydelta"; case FLAG_EC_PLUGIN_REQUIRE_SUB_CHUNKS: return "requiresubchunks"; + case FLAG_EC_PLUGIN_OPTIMIZED_SUPPORTED: return "optimizedsupport"; default: return "???"; } } diff --git a/src/erasure-code/isa/ErasureCodeIsa.h b/src/erasure-code/isa/ErasureCodeIsa.h index e302eec0bc180..f68fa527d99e1 100644 --- a/src/erasure-code/isa/ErasureCodeIsa.h +++ b/src/erasure-code/isa/ErasureCodeIsa.h @@ -26,10 +26,13 @@ #define CEPH_ERASURE_CODE_ISA_L_H // ----------------------------------------------------------------------------- +#include #include "erasure-code/ErasureCode.h" #include "ErasureCodeIsaTableCache.h" // ----------------------------------------------------------------------------- +using namespace std::literals; + #define EC_ISA_ADDRESS_ALIGNMENT 32u #define is_aligned(POINTER, BYTE_COUNT) \ @@ -51,6 +54,7 @@ public: ErasureCodeIsaTableCache &tcache; const char *technique; + uint64_t flags; ErasureCodeIsa(const char *_technique, ErasureCodeIsaTableCache &_tcache) : @@ -60,6 +64,15 @@ public: tcache(_tcache), technique(_technique) { + flags = FLAG_EC_PLUGIN_PARTIAL_READ_OPTIMIZATION | + FLAG_EC_PLUGIN_PARTIAL_WRITE_OPTIMIZATION | + FLAG_EC_PLUGIN_ZERO_INPUT_ZERO_OUTPUT_OPTIMIZATION | + FLAG_EC_PLUGIN_PARITY_DELTA_OPTIMIZATION; + + if (technique == "reed_sol_van"sv || + technique == "default"sv) { + flags |= FLAG_EC_PLUGIN_OPTIMIZED_SUPPORTED; + } } @@ -68,10 +81,7 @@ public: } uint64_t get_supported_optimizations() const override { - return FLAG_EC_PLUGIN_PARTIAL_READ_OPTIMIZATION | - FLAG_EC_PLUGIN_PARTIAL_WRITE_OPTIMIZATION | - FLAG_EC_PLUGIN_ZERO_INPUT_ZERO_OUTPUT_OPTIMIZATION | - FLAG_EC_PLUGIN_PARITY_DELTA_OPTIMIZATION; + return flags; } unsigned int 
diff --git a/src/erasure-code/jerasure/ErasureCodeJerasure.h b/src/erasure-code/jerasure/ErasureCodeJerasure.h index e14562668c42e..fac3c1d400e55 100644 --- a/src/erasure-code/jerasure/ErasureCodeJerasure.h +++ b/src/erasure-code/jerasure/ErasureCodeJerasure.h @@ -18,8 +18,12 @@ #ifndef CEPH_ERASURE_CODE_JERASURE_H #define CEPH_ERASURE_CODE_JERASURE_H +#include + #include "erasure-code/ErasureCode.h" +using namespace std::literals; + class ErasureCodeJerasure : public ceph::ErasureCode { public: int k; @@ -32,28 +36,33 @@ public: std::string rule_root; std::string rule_failure_domain; bool per_chunk_alignment; + uint64_t flags; + + explicit ErasureCodeJerasure(const char *_technique) + : k(0), + DEFAULT_K("2"), + m(0), + DEFAULT_M("1"), + w(0), + DEFAULT_W("8"), + technique(_technique), + per_chunk_alignment(false) { + flags = FLAG_EC_PLUGIN_PARTIAL_READ_OPTIMIZATION | + FLAG_EC_PLUGIN_PARTIAL_WRITE_OPTIMIZATION | + FLAG_EC_PLUGIN_ZERO_INPUT_ZERO_OUTPUT_OPTIMIZATION | + FLAG_EC_PLUGIN_PARITY_DELTA_OPTIMIZATION; - explicit ErasureCodeJerasure(const char *_technique) : - k(0), - DEFAULT_K("2"), - m(0), - DEFAULT_M("1"), - w(0), - DEFAULT_W("8"), - technique(_technique), - per_chunk_alignment(false) - {} + if (technique == "reed_sol_van"sv) { + flags |= FLAG_EC_PLUGIN_OPTIMIZED_SUPPORTED; + } + } ~ErasureCodeJerasure() override {} uint64_t get_supported_optimizations() const override { - return FLAG_EC_PLUGIN_PARTIAL_READ_OPTIMIZATION | - FLAG_EC_PLUGIN_PARTIAL_WRITE_OPTIMIZATION | - FLAG_EC_PLUGIN_ZERO_INPUT_ZERO_OUTPUT_OPTIMIZATION | - FLAG_EC_PLUGIN_PARITY_DELTA_OPTIMIZATION; + return flags; } - unsigned int get_chunk_count() const override { return k + m; } diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index 81716ccbd958c..d9f190f41a9db 100644 --- a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -8226,12 +8226,12 @@ int OSDMonitor::prepare_new_pool(string& name, pg_pool_t *pi = pending_inc.get_new_pool(pool, &empty); pi->create_time = 
ceph_clock_now(); pi->type = pool_type; - pi->fast_read = fread; + pi->fast_read = fread; pi->flags = g_conf()->osd_pool_default_flags; if (bulk) { pi->set_flag(pg_pool_t::FLAG_BULK); } else if (g_conf()->osd_pool_default_flag_bulk) { - pi->set_flag(pg_pool_t::FLAG_BULK); + pi->set_flag(pg_pool_t::FLAG_BULK); } if (g_conf()->osd_pool_default_flag_hashpspool) pi->set_flag(pg_pool_t::FLAG_HASHPSPOOL); @@ -8331,6 +8331,11 @@ int OSDMonitor::prepare_new_pool(string& name, pi->cache_min_flush_age = g_conf()->osd_pool_default_cache_min_flush_age; pi->cache_min_evict_age = g_conf()->osd_pool_default_cache_min_evict_age; + if (cct->_conf.get_val("osd_pool_default_flag_ec_optimizations")) { + // Return value is ignored: this fails, leaving the flag unset, if the pool cannot support ec optimizations. + enable_pool_ec_optimizations(*pi, nullptr, true); + } + pending_inc.new_pool_names[pool] = name; return 0; } @@ -8361,6 +8366,70 @@ bool OSDMonitor::prepare_unset_flag(MonOpRequestRef op, int flag) return true; } +int OSDMonitor::enable_pool_ec_optimizations(pg_pool_t &p, + stringstream *ss, bool enable) { + if (!p.is_erasure()) { + if (ss) { + *ss << "allow_ec_optimizations can only be enabled for an erasure coded pool"; + } + return -EINVAL; + } + if (osdmap.require_osd_release < ceph_release_t::tentacle) { + if (ss) { + *ss << "All OSDs must be upgraded to tentacle or " + << "later before setting allow_ec_optimizations"; + } + return -EINVAL; + } + if (enable) { + ErasureCodeInterfaceRef erasure_code; + unsigned int k, m; + stringstream tmp; + int err = get_erasure_code(p.erasure_code_profile, &erasure_code, &tmp); + if (err == 0) { + k = erasure_code->get_data_chunk_count(); + m = erasure_code->get_coding_chunk_count(); + } else { + if (ss) { + *ss << "get_erasure_code failed: " << tmp.str(); + } + return -EINVAL; + } + if ((erasure_code->get_supported_optimizations() & + ErasureCodeInterface::FLAG_EC_PLUGIN_OPTIMIZED_SUPPORTED) == 0) { + if (ss) { + *ss << "ec optimizations not currently supported for pool profile."; 
+ } + return -EINVAL; + } + // Restrict the set of shards that can be a primary to the 1st data + // raw_shard (raw_shard 0) and the coding parity raw_shards because + // the other shards (including local parity for LRC) may not have + // up to date copies of xattrs including OI + p.nonprimary_shards.clear(); + for (raw_shard_id_t raw_shard; raw_shard < k + m; ++raw_shard) { + if (raw_shard > 0 && raw_shard < k) { + shard_id_t shard; + if (erasure_code->get_chunk_mapping().size() > raw_shard ) { + shard = shard_id_t(erasure_code->get_chunk_mapping().at(int(raw_shard))); + } else { + shard = shard_id_t(int(raw_shard)); + } + p.nonprimary_shards.insert(shard); + } + } + p.flags |= pg_pool_t::FLAG_EC_OPTIMIZATIONS; + } else { + if ((p.flags & pg_pool_t::FLAG_EC_OPTIMIZATIONS) != 0) { + if (ss) { + *ss << "allow_ec_optimizations cannot be disabled once enabled"; + } + return -EINVAL; + } + } + return 0; +} + int OSDMonitor::prepare_command_pool_set(const cmdmap_t& cmdmap, stringstream& ss) { @@ -8828,26 +8897,34 @@ int OSDMonitor::prepare_command_pool_set(const cmdmap_t& cmdmap, return -EINVAL; } } else if (var == "allow_ec_optimizations") { - if (!p.is_erasure()) { - ss << "allow_ec_optimizations can only be enabled for an erasure coded pool"; - return -EINVAL; - } - if (osdmap.require_osd_release < ceph_release_t::tentacle) { - ss << "All OSDs must be upgraded to tentacle or " - << "later before setting allow_ec_optimizations"; - return -EINVAL; - } + bool enable = false; if (val == "true" || (interr.empty() && n == 1)) { - p.flags |= pg_pool_t::FLAG_EC_OPTIMIZATIONS; + enable = true; } else if (val == "false" || (interr.empty() && n == 0)) { - if ((p.flags & pg_pool_t::FLAG_EC_OPTIMIZATIONS) != 0) { - ss << "allow_ec_optimizations cannot be disabled once enabled"; - return -EINVAL; - } + enable = false; } else { ss << "expecting value 'true', 'false', '0', or '1'"; return -EINVAL; } + bool was_enabled = p.allows_ecoptimizations(); + int r = 
enable_pool_ec_optimizations(p, &ss, enable); /* pass ss so the failure reason is reported to the command caller, as the removed inline checks did */ + if (r != 0) { + return r; + } + if (!was_enabled && p.allows_ecoptimizations()) { + // Pools with allow_ec_optimizations set store pg_temp in a different + // order to change the primary selection algorithm without breaking + // old clients. Modify any existing pg_temp for the pool now. + // This is only needed when switching on optimisations after creation. + for (auto pg_temp = osdmap.pg_temp->begin(); + pg_temp != osdmap.pg_temp->end(); + ++pg_temp) { + if (pg_temp->first.pool() == pool) { + std::vector new_pg_temp = osdmap.pgtemp_primaryfirst(p, pg_temp->second); + pending_inc.new_pg_temp[pg_temp->first] = mempool::osdmap::vector(new_pg_temp.begin(), new_pg_temp.end()); + } + } + } } else if (var == "target_max_objects") { if (interr.length()) { ss << "error parsing int '" << val << "': " << interr; diff --git a/src/mon/OSDMonitor.h b/src/mon/OSDMonitor.h index 4d1172932de65..83dffb35ba729 100644 --- a/src/mon/OSDMonitor.h +++ b/src/mon/OSDMonitor.h @@ -741,6 +741,9 @@ public: std::stringstream &ss, ceph::Formatter *f); + int enable_pool_ec_optimizations(pg_pool_t &pool, + std::stringstream *ss, + bool enable); int prepare_command_pool_set(const cmdmap_t& cmdmap, std::stringstream& ss); -- 2.39.5