From 3327cc0500c33fea8989af6f83d1d33f10654a7d Mon Sep 17 00:00:00 2001 From: Jaya Prakash Date: Wed, 5 Feb 2025 19:31:47 +0530 Subject: [PATCH] os/bluestore: Refactoring Allocator Class Fixes: https://tracker.ceph.com/issues/69314 Signed-off-by: Jaya Prakash --- src/crimson/os/alienstore/CMakeLists.txt | 1 + src/os/CMakeLists.txt | 1 + src/os/bluestore/Allocator.cc | 177 +----------- src/os/bluestore/Allocator.h | 268 +----------------- src/os/bluestore/AllocatorBase.cc | 8 +- src/test/objectstore/Allocator_bench.cc | 3 +- src/test/objectstore/allocator_replay_test.cc | 3 +- 7 files changed, 15 insertions(+), 446 deletions(-) diff --git a/src/crimson/os/alienstore/CMakeLists.txt b/src/crimson/os/alienstore/CMakeLists.txt index 389e2ec0f22..efd70b1c76e 100644 --- a/src/crimson/os/alienstore/CMakeLists.txt +++ b/src/crimson/os/alienstore/CMakeLists.txt @@ -48,6 +48,7 @@ set(alien_store_srcs alien_log.cc ${PROJECT_SOURCE_DIR}/src/os/ObjectStore.cc ${PROJECT_SOURCE_DIR}/src/os/bluestore/Allocator.cc + ${PROJECT_SOURCE_DIR}/src/os/bluestore/AllocatorBase.cc ${PROJECT_SOURCE_DIR}/src/os/bluestore/AvlAllocator.cc ${PROJECT_SOURCE_DIR}/src/os/bluestore/BtreeAllocator.cc ${PROJECT_SOURCE_DIR}/src/os/bluestore/Btree2Allocator.cc diff --git a/src/os/CMakeLists.txt b/src/os/CMakeLists.txt index a27656697df..71452e542bb 100644 --- a/src/os/CMakeLists.txt +++ b/src/os/CMakeLists.txt @@ -10,6 +10,7 @@ set(libos_srcs if(WITH_BLUESTORE) list(APPEND libos_srcs bluestore/Allocator.cc + bluestore/AllocatorBase.cc bluestore/BitmapFreelistManager.cc bluestore/BlueFS.cc bluestore/bluefs_types.cc diff --git a/src/os/bluestore/Allocator.cc b/src/os/bluestore/Allocator.cc index 603f698e382..67cd72e8369 100644 --- a/src/os/bluestore/Allocator.cc +++ b/src/os/bluestore/Allocator.cc @@ -2,6 +2,7 @@ // vim: ts=8 sw=2 smarttab #include "Allocator.h" +#include "AllocatorBase.h" #include #include "StupidAllocator.h" #include "BitmapAllocator.h" @@ -21,153 +22,18 @@ using std::to_string; using 
ceph::bufferlist; using ceph::Formatter; -class Allocator::SocketHook : public AdminSocketHook { - Allocator *alloc; - friend class Allocator; - std::string name; -public: - SocketHook(Allocator *alloc, std::string_view _name) : - alloc(alloc), name(_name) - { - AdminSocket *admin_socket = g_ceph_context->get_admin_socket(); - if (name.empty()) { - name = to_string((uintptr_t)this); - } - if (admin_socket) { - int r = admin_socket->register_command( - ("bluestore allocator dump " + name).c_str(), - this, - "dump allocator free regions"); - if (r != 0) - alloc = nullptr; //some collision, disable - if (alloc) { - r = admin_socket->register_command( - ("bluestore allocator score " + name).c_str(), - this, - "give score on allocator fragmentation (0-no fragmentation, 1-absolute fragmentation)"); - ceph_assert(r == 0); - r = admin_socket->register_command( - ("bluestore allocator fragmentation " + name).c_str(), - this, - "give allocator fragmentation (0-no fragmentation, 1-absolute fragmentation)"); - ceph_assert(r == 0); - r = admin_socket->register_command( - ("bluestore allocator fragmentation histogram " + name + - " name=alloc_unit,type=CephInt,req=false" + - " name=num_buckets,type=CephInt,req=false").c_str(), - this, - "build allocator free regions state histogram"); - ceph_assert(r == 0); - } - } - } - ~SocketHook() - { - AdminSocket *admin_socket = g_ceph_context->get_admin_socket(); - if (admin_socket && alloc) { - admin_socket->unregister_commands(this); - } - } - - int call(std::string_view command, - const cmdmap_t& cmdmap, - const bufferlist&, - Formatter *f, - std::ostream& ss, - bufferlist& out) override { - int r = 0; - if (command == "bluestore allocator dump " + name) { - f->open_object_section("allocator_dump"); - f->dump_unsigned("capacity", alloc->get_capacity()); - f->dump_unsigned("alloc_unit", alloc->get_block_size()); - f->dump_string("alloc_type", alloc->get_type()); - f->dump_string("alloc_name", name); - - f->open_array_section("extents"); 
- auto iterated_allocation = [&](size_t off, size_t len) { - ceph_assert(len > 0); - f->open_object_section("free"); - char off_hex[30]; - char len_hex[30]; - snprintf(off_hex, sizeof(off_hex) - 1, "0x%zx", off); - snprintf(len_hex, sizeof(len_hex) - 1, "0x%zx", len); - f->dump_string("offset", off_hex); - f->dump_string("length", len_hex); - f->close_section(); - }; - alloc->foreach(iterated_allocation); - f->close_section(); - f->close_section(); - } else if (command == "bluestore allocator score " + name) { - f->open_object_section("fragmentation_score"); - f->dump_float("fragmentation_rating", alloc->get_fragmentation_score()); - f->close_section(); - } else if (command == "bluestore allocator fragmentation " + name) { - f->open_object_section("fragmentation"); - f->dump_float("fragmentation_rating", alloc->get_fragmentation()); - f->close_section(); - } else if (command == "bluestore allocator fragmentation histogram " + name) { - int64_t alloc_unit = alloc->get_block_size(); - cmd_getval(cmdmap, "alloc_unit", alloc_unit); - if (alloc_unit <= 0 || - p2align(alloc_unit, alloc->get_block_size()) != alloc_unit) { - ss << "Invalid allocation unit: '" << alloc_unit - << "', to be aligned with: '" << alloc->get_block_size() - << "'" << std::endl; - return -EINVAL; - } - int64_t num_buckets = 8; - cmd_getval(cmdmap, "num_buckets", num_buckets); - if (num_buckets < 2) { - ss << "Invalid amount of buckets (min=2): '" << num_buckets - << "'" << std::endl; - return -EINVAL; - } - - Allocator::FreeStateHistogram hist(num_buckets); - alloc->foreach( - [&](size_t off, size_t len) { - hist.record_extent(uint64_t(alloc_unit), off, len); - }); - f->open_array_section("extent_counts"); - hist.foreach( - [&](uint64_t max_len, uint64_t total, uint64_t aligned, uint64_t units) { - f->open_object_section("c"); - f->dump_unsigned("max_len", max_len); - f->dump_unsigned("total", total); - f->dump_unsigned("aligned", aligned); - f->dump_unsigned("units", units); - f->close_section(); 
- } - ); - f->close_section(); - } else { - ss << "Invalid command" << std::endl; - r = -ENOSYS; - } - return r; - } - -}; Allocator::Allocator(std::string_view name, int64_t _capacity, int64_t _block_size) : device_size(_capacity), block_size(_block_size) -{ - asok_hook = new SocketHook(this, name); -} +{} Allocator::~Allocator() -{ - delete asok_hook; -} +{} -const string& Allocator::get_name() const { - return asok_hook->name; -} Allocator *Allocator::create( CephContext* cct, @@ -275,40 +141,3 @@ double Allocator::get_fragmentation_score() return (ideal - score_sum) / (ideal - terrible); } -/************* -* Allocator::FreeStateHistogram -*************/ -using std::function; - -void Allocator::FreeStateHistogram::record_extent(uint64_t alloc_unit, - uint64_t off, - uint64_t len) -{ - size_t idx = myTraits._get_bucket(len); - ceph_assert(idx < buckets.size()); - ++buckets[idx].total; - - // now calculate the bucket for the chunk after alignment, - // resulting chunks shorter than alloc_unit are discarded - auto delta = p2roundup(off, alloc_unit) - off; - if (len >= delta + alloc_unit) { - len -= delta; - idx = myTraits._get_bucket(len); - ceph_assert(idx < buckets.size()); - ++buckets[idx].aligned; - buckets[idx].alloc_units += len / alloc_unit; - } -} -void Allocator::FreeStateHistogram::foreach( - function cb) -{ - size_t i = 0; - for (const auto& b : buckets) { - cb(myTraits._get_bucket_max(i), - b.total, b.aligned, b.alloc_units); - ++i; - } -} diff --git a/src/os/bluestore/Allocator.h b/src/os/bluestore/Allocator.h index e2763077650..9d8c9feef1c 100644 --- a/src/os/bluestore/Allocator.h +++ b/src/os/bluestore/Allocator.h @@ -22,233 +22,6 @@ typedef interval_set release_set_t; typedef release_set_t::value_type release_set_entry_t; class Allocator { -protected: - - /** - * This is a base set of traits for logical placing entries - * into limited collection of buckets depending on their sizes. 
- * Descandants should implement get_bucket(len) method to obtain - * bucket index using entry length. - */ - struct LenPartitionedSetTraits { - size_t num_buckets; - size_t base_bits; // bits in min entry size - size_t base; // min entry size - size_t factor; // additional factor to be applied - // to entry size when calculating - // target bucket - - - LenPartitionedSetTraits(size_t _num_buckets, - size_t _base_bits = 12, //= 4096 bytes - size_t _factor = 1) : - num_buckets(_num_buckets), - base_bits(_base_bits), - base(1ull << base_bits), - factor(_factor) - { - ceph_assert(factor); - } - }; - - /** - * This extends LenPartitionedSetTraits to implement linear bucket indexing: - * bucket index to be determined as entry's size divided by (base * factor), - * i.e. buckets are: - * [0..base) - * [base, base+base*factor) - * [base+base*factor, base+base*factor*2) - * [base+base*factor*2, base+base*factor*3) - * ... - */ - struct LenPartitionedSetTraitsLinear : public LenPartitionedSetTraits { - using LenPartitionedSetTraits::LenPartitionedSetTraits; - /* - * Determines bucket index for a given extent's length in a bucket set - * with linear (len / base / factor) indexing. - * The first bucket is targeted for lengths < base, - * the last bucket is used for lengths above the maximum - * detemined by bucket count. - */ - inline size_t _get_bucket(uint64_t len) const { - size_t idx = (len / factor) >> base_bits; - idx = idx < num_buckets ? idx : num_buckets - 1; - return idx; - } - /* - * returns upper bound of a specific bucket - */ - inline size_t _get_bucket_max(size_t bucket) const { - return - bucket < num_buckets - 1 ? - base * factor * (1 + bucket) : - std::numeric_limits::max(); - } - }; - - /** - * This extends LenPartitionedSetTraits to implement exponential bucket indexing: - * target bucket bounds are determined as - * [0, base] - * (base, base*2^factor] - * (base*2^factor, base*2^(factor*2)] - * (base*2^(factor*2), base*2^(factor*3)] - * ... 
- * - */ - struct LenPartitionedSetTraitsPow2 : public LenPartitionedSetTraits { - /* - * Determines bucket index for a given extent's length in a bucket collection - * with log2(len) indexing. - * The first bucket is targeted for lengths < base, - * The last bucket index is used for lengths above the maximum - * detemined by bucket count. - */ - using LenPartitionedSetTraits::LenPartitionedSetTraits; - inline size_t _get_bucket(uint64_t len) const { - size_t idx; - const size_t len_p2_max = - base << ((factor * (num_buckets - 2))); - if (len <= base) { - idx = 0; - } else if (len > len_p2_max) { - idx = num_buckets - 1; - } else { - size_t most_bit = cbits(uint64_t(len - 1)) - 1; - idx = 1 + ((most_bit - base_bits) / factor); - } - ceph_assert(idx < num_buckets); - return idx; - } - /* - * returns upper bound of the bucket with log2(len) indexing. - */ - inline size_t _get_bucket_max(size_t bucket) const { - return - bucket < num_buckets - 1 ? - base << (factor * bucket) : - std::numeric_limits::max(); - } - }; - - /* - * Lockless stack implementation - * that permits put/get operation exclusively - * if no waiting is needed. - * Conflicting operations are omitted. - */ - class LocklessOpportunisticStack { - std::atomic ref = 0; - std::atomic count = 0; - std::vector data; - public: - void init(size_t size) { - data.resize(size); - } - bool try_put(uint64_t& v) { - bool done = ++ref == 1 && count < data.size(); - if (done) { - data[count++] = v; - } - --ref; - return done; - } - bool try_get(uint64_t& v) { - bool done = ++ref == 1 && count > 0; - if (done) { - v = data[--count]; - } - --ref; - return done; - } - void foreach(std::function notify) { - for (size_t i = 0; i < count; i++) { - notify(data[i]); - } - } - }; - /* - * Concurrently accessed extent (offset,length) cache - * which permits put/get operation exclusively if no waiting is needed. - * Implemented via a set of independent buckets (aka LocklessOpportunisticStack). 
- * Each bucket keeps extents of specific size only: 4K, 8K, 12K...64K - * which allows to avoid individual extent size tracking. - * Each bucket permits a single operation at a given time only, - * additional operations against the bucket are rejected meaning relevant - * extents aren't not cached. - */ - class OpportunisticExtentCache { - const LenPartitionedSetTraitsLinear myTraits; - enum { - BUCKET_COUNT = 16, - EXTENTS_PER_BUCKET = 16, // amount of entries per single bucket, - // total amount of entries will be - // BUCKET_COUNT * EXTENTS_PER_BUCKET. - }; - - std::vector buckets; - std::atomic hits = 0; - ceph::shared_mutex lock{ - ceph::make_shared_mutex(std::string(), false, false, false) - }; - public: - OpportunisticExtentCache() : - myTraits(BUCKET_COUNT + 1), // 16 regular buckets + 1 "catch-all" pseudo - // one to be used for out-of-bound checking - // since _get_*_size_bucket() methods imply - // the last bucket usage for the entries - // exceeding the max length. - buckets(BUCKET_COUNT) - { - //buckets.resize(BUCKET_COUNT); - for(auto& b : buckets) { - b.init(EXTENTS_PER_BUCKET); - } - } - bool try_put(uint64_t offset, uint64_t len) { - if (!lock.try_lock_shared()) { - return false; - } - bool ret = false; - ceph_assert(p2aligned(offset, myTraits.base)); - ceph_assert(p2aligned(len, myTraits.base)); - auto idx = myTraits._get_bucket(len); - if (idx < buckets.size()) - ret = buckets[idx].try_put(offset); - lock.unlock_shared(); - return ret; - } - bool try_get(uint64_t* offset, uint64_t len) { - if (!lock.try_lock_shared()) { - return false; - } - bool ret = false; - ceph_assert(offset); - ceph_assert(p2aligned(len, myTraits.base)); - size_t idx = len >> myTraits.base_bits; - if (idx < buckets.size()) { - ret = buckets[idx].try_get(*offset); - if (ret) { - ++hits; - } - } - lock.unlock_shared(); - return ret; - } - size_t get_hit_count() const { - return hits.load(); - } - void foreach(std::function notify) { - std::unique_lock _lock(lock); - for 
(uint64_t i = 0; i < buckets.size(); i++) { - auto cb = [&](uint64_t o) { - notify(o, i << myTraits.base_bits); - }; - buckets[i].foreach(cb); - } - } - }; - public: Allocator(std::string_view name, int64_t _capacity, @@ -307,7 +80,7 @@ public: ); - const std::string& get_name() const; + virtual const std::string& get_name() const = 0; int64_t get_capacity() const { return device_size; @@ -317,46 +90,9 @@ public: return block_size; } - // The following class implements Allocator's free extents histogram. - // Which is a set of N buckets to track extents layout. - // Extent matches a bucket depending on its length using the following - // length spans: - // [0..4K] (4K..16K] (16K..64K] .. (4M..16M] (16M..] - // Each bucket tracks: - // - total amount of extents of specific lengths - // - amount of extents aligned with allocation boundary - // - amount of allocation units in aligned extents - // - class FreeStateHistogram { - const LenPartitionedSetTraitsPow2 myTraits; - enum { - BASE_BITS = 12, // 4096 bytes - FACTOR = 2, - }; - struct free_state_hist_bucket { - size_t total = 0; - size_t aligned = 0; - size_t alloc_units = 0; - }; - std::vector buckets; - public: - - FreeStateHistogram(size_t num_buckets) - : myTraits(num_buckets, BASE_BITS, FACTOR) { - buckets.resize(num_buckets); - } - - void record_extent(uint64_t alloc_unit, uint64_t off, uint64_t len); - void foreach( - std::function cb); - }; - -private: - class SocketHook; - SocketHook* asok_hook = nullptr; protected: const int64_t device_size = 0; const int64_t block_size = 0; }; -#endif \ No newline at end of file +#endif diff --git a/src/os/bluestore/AllocatorBase.cc b/src/os/bluestore/AllocatorBase.cc index 99777cdf7ff..9e146f32257 100644 --- a/src/os/bluestore/AllocatorBase.cc +++ b/src/os/bluestore/AllocatorBase.cc @@ -109,20 +109,20 @@ public: f->dump_float("fragmentation_rating", alloc->get_fragmentation()); f->close_section(); } else if (command == "bluestore allocator fragmentation histogram " + 
name) { - int64_t alloc_unit = 4096; + int64_t alloc_unit = alloc->get_block_size(); cmd_getval(cmdmap, "alloc_unit", alloc_unit); if (alloc_unit <= 0 || p2align(alloc_unit, alloc->get_block_size()) != alloc_unit) { ss << "Invalid allocation unit: '" << alloc_unit - << ", to be aligned with: '" << alloc->get_block_size() - << std::endl; + << "', to be aligned with: '" << alloc->get_block_size() + << "'" << std::endl; return -EINVAL; } int64_t num_buckets = 8; cmd_getval(cmdmap, "num_buckets", num_buckets); if (num_buckets < 2) { ss << "Invalid amount of buckets (min=2): '" << num_buckets - << std::endl; + << "'" << std::endl; return -EINVAL; } diff --git a/src/test/objectstore/Allocator_bench.cc b/src/test/objectstore/Allocator_bench.cc index 0c577f4fe1b..d557f6168cc 100644 --- a/src/test/objectstore/Allocator_bench.cc +++ b/src/test/objectstore/Allocator_bench.cc @@ -13,6 +13,7 @@ #include "include/stringify.h" #include "include/Context.h" #include "os/bluestore/Allocator.h" +#include "os/bluestore/AllocatorBase.h" #include #include @@ -294,7 +295,7 @@ struct OverwriteTextContext : public Thread { void build_histogram() { const size_t num_buckets = 8; - Allocator::FreeStateHistogram hist(num_buckets); + AllocatorBase::FreeStateHistogram hist(num_buckets); alloc->foreach( [&](size_t off, size_t len) { hist.record_extent(uint64_t(alloc_unit), off, len); diff --git a/src/test/objectstore/allocator_replay_test.cc b/src/test/objectstore/allocator_replay_test.cc index 874a172b50c..b76fd7f7514 100644 --- a/src/test/objectstore/allocator_replay_test.cc +++ b/src/test/objectstore/allocator_replay_test.cc @@ -16,6 +16,7 @@ #include "include/denc.h" #include "global/global_init.h" #include "os/bluestore/Allocator.h" +#include "os/bluestore/AllocatorBase.h" using namespace std; @@ -773,7 +774,7 @@ int main(int argc, char **argv) std::cout << "Allocation unit:" << alloc_unit << std::endl; - Allocator::FreeStateHistogram hist(num_buckets); + AllocatorBase::FreeStateHistogram 
hist(num_buckets); a->foreach( [&](size_t off, size_t len) { hist.record_extent(uint64_t(alloc_unit), off, len); -- 2.47.3