From 5c6f34603ec7db5eb794600ba4f04bfce0712005 Mon Sep 17 00:00:00 2001 From: Bill Scales <156200352+bill-scales@users.noreply.github.com> Date: Thu, 6 Mar 2025 07:53:02 +0000 Subject: [PATCH] osd: EC optimizations: add nonprimary_shards set to pg_pool_t Pools with EC optimizations enabled do not update every shard on every I/O. The primary must have a complete log and requires objects to have up-to-date object attributes, so the choice of primary has to be restricted. Shards that cannot become a primary are listed in the nonprimary_shards set. For a K+M EC pool with optimizations enabled, the first data shard and all M coding parity shards are always updated and can become a primary; the other shards are marked as nonprimary. The new set nonprimary_shards stores shards that cannot become the primary; by default it is an empty set, which retains existing behavior. When optimizations are enabled on an EC pool, this set is filled in to restrict the choice of primary. Signed-off-by: Bill Scales --- src/osd/osd_types.cc | 14 ++++++++++++-- src/osd/osd_types.h | 7 ++++++- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/src/osd/osd_types.cc b/src/osd/osd_types.cc index ec2083d8bbd4f..6cece3471aab2 100644 --- a/src/osd/osd_types.cc +++ b/src/osd/osd_types.cc @@ -1642,6 +1642,7 @@ void pg_pool_t::dump(Formatter *f) const f->dump_unsigned("stripe_width", get_stripe_width()); f->dump_unsigned("expected_num_objects", expected_num_objects); f->dump_bool("fast_read", fast_read); + f->dump_stream("nonprimary_shards") << nonprimary_shards; f->open_object_section("options"); opts.dump(f); f->close_section(); // options @@ -1961,7 +1962,7 @@ void pg_pool_t::encode(ceph::buffer::list& bl, uint64_t features) const return; } - uint8_t v = 31; + uint8_t v = 32; // NOTE: any new encoding dependencies must be reflected by // SIGNIFICANT_FEATURES if (!HAVE_FEATURE(features, SERVER_TENTACLE)) { @@ -2080,12 +2081,15 @@ void pg_pool_t::encode(ceph::buffer::list& bl, uint64_t features) 
const auto maybe_peering_crush_data1 = maybe_peering_crush_data(); encode(maybe_peering_crush_data1, bl); } + if (v >= 32) { + encode(nonprimary_shards, bl); + } ENCODE_FINISH(bl); } void pg_pool_t::decode(ceph::buffer::list::const_iterator& bl) { - DECODE_START_LEGACY_COMPAT_LEN(31, 5, 5, bl); + DECODE_START_LEGACY_COMPAT_LEN(32, 5, 5, bl); decode(type, bl); decode(size, bl); decode(crush_rule, bl); @@ -2276,6 +2280,11 @@ void pg_pool_t::decode(ceph::buffer::list::const_iterator& bl) peering_crush_mandatory_member) = *peering_crush_data; } } + if (struct_v >= 32) { + decode(nonprimary_shards, bl); + } else { + nonprimary_shards.clear(); + } DECODE_FINISH(bl); calc_pg_masks(); calc_grade_table(); @@ -2377,6 +2386,7 @@ void pg_pool_t::generate_test_instances(list& o) a.erasure_code_profile = "profile in osdmap"; a.expected_num_objects = 123456; a.fast_read = false; + a.nonprimary_shards.clear(); a.application_metadata = {{"rbd", {{"key", "value"}}}}; o.push_back(new pg_pool_t(a)); diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h index 88f76c609a28e..aa5ea26e9637e 100644 --- a/src/osd/osd_types.h +++ b/src/osd/osd_types.h @@ -1626,7 +1626,7 @@ public: uint64_t expected_num_objects = 0; ///< expected number of objects on this pool, a value of 0 indicates ///< user does not specify any expected value bool fast_read = false; ///< whether turn on fast read on the pool or not - + shard_id_set nonprimary_shards; ///< EC partial writes: shards that cannot become a primary pool_opts_t opts; ///< options typedef enum { @@ -1931,6 +1931,11 @@ public: /// choose a random hash position within a pg uint32_t get_random_pg_position(pg_t pgid, uint32_t seed) const; + /// EC partial writes: test if a shard is a non-primary + bool is_nonprimary_shard(const shard_id_t shard) const { + return !nonprimary_shards.empty() && nonprimary_shards.contains(shard); + } + void encode(ceph::buffer::list& bl, uint64_t features) const; void decode(ceph::buffer::list::const_iterator& bl); -- 
2.39.5