From a3fa9daa22abf72fed6322447a915759b16e9c4d Mon Sep 17 00:00:00 2001
From: Igor Fedotov <igor.fedotov@croit.io>
Date: Mon, 29 Jan 2024 21:49:48 +0300
Subject: [PATCH] os/bluestore: move and rename ExtentCache to Allocator class.

Signed-off-by: Igor Fedotov <igor.fedotov@croit.io>
---
 src/os/bluestore/Allocator.h        | 174 ++++++++++++++++++++++++++++
 src/os/bluestore/Btree2Allocator.cc |  18 +--
 src/os/bluestore/Btree2Allocator.h  | 142 ++---------------------
 3 files changed, 193 insertions(+), 141 deletions(-)

diff --git a/src/os/bluestore/Allocator.h b/src/os/bluestore/Allocator.h
index 04377e43e3a..e3d719a3005 100644
--- a/src/os/bluestore/Allocator.h
+++ b/src/os/bluestore/Allocator.h
@@ -16,11 +16,185 @@
 #include <ostream>
 #include "include/ceph_assert.h"
 #include "bluestore_types.h"
+#include "common/ceph_mutex.h"
 
 typedef interval_set<uint64_t> release_set_t;
 typedef release_set_t::value_type release_set_entry_t;
 
 class Allocator {
+protected:
+
+  struct ExtentCollectionTraits {
+    size_t num_buckets;
+    size_t base_bits; // min extent size
+    size_t base = 1ull << base_bits;
+    size_t factor;    // single bucket size range to be
+                      // determined as [len, len * factor * 2)
+                      // for log2(len) indexing and
+                      // [len, len + factor * base)
+                      // for linear indexing.
+
+    ExtentCollectionTraits(size_t _num_buckets,
+                           size_t _base_bits = 12, //= 4096 bytes
+                           size_t _factor = 1) :
+      num_buckets(_num_buckets),
+      base_bits(_base_bits),
+      base(1ull << base_bits),
+      factor(_factor)
+    {
+      ceph_assert(factor);
+    }
+
+    /*
+     * Determines bucket index for a given extent's length in a bucket
+     * collection with log2(len) indexing.
+     * The last bucket index is returned for lengths above the maximum.
+     */
+    inline size_t _get_p2_size_bucket(uint64_t len) const {
+      size_t idx;
+      const size_t len_p2_max =
+        base << (factor * (num_buckets - 2));
+      if (len <= base) {
+        idx = 0;
+      } else if (len > len_p2_max) {
+        idx = num_buckets - 1;
+      } else {
+        size_t most_bit = cbits(uint64_t(len - 1)) - 1;
+        idx = 1 + ((most_bit - base_bits) / factor);
+      }
+      ceph_assert(idx < num_buckets);
+      return idx;
+    }
+
+    /*
+     * Determines bucket index for a given extent's length in a bucket
+     * collection with linear (len / min_extent_size) indexing.
+     * The last bucket index is returned for lengths above the maximum.
+     */
+    inline size_t _get_linear_size_bucket(uint64_t len) const {
+      size_t idx = (len / factor) >> base_bits;
+      idx = idx < num_buckets ? idx : num_buckets - 1;
+      return idx;
+    }
+  };
+
+  /*
+   * Lockless stack implementation which permits a put/get operation
+   * only when it can be served without waiting.
+   * Conflicting operations are rejected.
+   */
+  class LocklessOpportunisticStack {
+    std::atomic<size_t> ref = 0;
+    std::atomic<size_t> count = 0;
+    std::vector<uint64_t> data;
+  public:
+    void init(size_t size) {
+      data.resize(size);
+    }
+    bool try_put(uint64_t& v) {
+      bool done = ++ref == 1 && count < data.size();
+      if (done) {
+        data[count++] = v;
+      }
+      --ref;
+      return done;
+    }
+    bool try_get(uint64_t& v) {
+      bool done = ++ref == 1 && count > 0;
+      if (done) {
+        v = data[--count];
+      }
+      --ref;
+      return done;
+    }
+    void foreach(std::function<void(uint64_t)> notify) {
+      for (size_t i = 0; i < count; i++) {
+        notify(data[i]);
+      }
+    }
+  };
+
+  /*
+   * Concurrently accessed extent (offset, length) cache which permits
+   * a put/get operation only when it can be served without waiting.
+   * Implemented as a set of independent buckets (aka
+   * LocklessOpportunisticStack). Each bucket keeps extents of a specific
+   * size only: 4K, 8K, 12K ... 64K, which avoids tracking each extent's
+   * size individually.
+   * Each bucket permits a single operation at a given time only;
+   * additional operations against the bucket are rejected, meaning the
+   * relevant extents simply aren't cached.
+   */
+  class OpportunisticExtentCache {
+    const Allocator::ExtentCollectionTraits myTraits;
+    enum {
+      BUCKET_COUNT = 16,
+      EXTENTS_PER_BUCKET = 16, // number of entries per bucket; the total
+                               // number of entries is
+                               // BUCKET_COUNT * EXTENTS_PER_BUCKET.
+    };
+
+    std::vector<LocklessOpportunisticStack> buckets;
+    std::atomic<size_t> hits = 0;
+    ceph::shared_mutex lock{
+      ceph::make_shared_mutex(std::string(), false, false, false)
+    };
+  public:
+    OpportunisticExtentCache() :
+      myTraits(BUCKET_COUNT + 1), // 16 regular buckets + 1 "catch-all"
+                                  // pseudo bucket used for out-of-bounds
+                                  // checking, since _get_*_size_bucket()
+                                  // methods return the last bucket for
+                                  // entries exceeding the max length.
+      buckets(BUCKET_COUNT)
+    {
+      for (auto& b : buckets) {
+        b.init(EXTENTS_PER_BUCKET);
+      }
+    }
+    bool try_put(uint64_t offset, uint64_t len) {
+      if (!lock.try_lock_shared()) {
+        return false;
+      }
+      bool ret = false;
+      ceph_assert(p2aligned(offset, myTraits.base));
+      ceph_assert(p2aligned(len, myTraits.base));
+      auto idx = myTraits._get_linear_size_bucket(len);
+      if (idx < buckets.size())
+        ret = buckets[idx].try_put(offset);
+      lock.unlock_shared();
+      return ret;
+    }
+    bool try_get(uint64_t* offset, uint64_t len) {
+      if (!lock.try_lock_shared()) {
+        return false;
+      }
+      bool ret = false;
+      ceph_assert(offset);
+      ceph_assert(p2aligned(len, myTraits.base));
+      size_t idx = len >> myTraits.base_bits;
+      if (idx < buckets.size()) {
+        ret = buckets[idx].try_get(*offset);
+        if (ret) {
+          ++hits;
+        }
+      }
+      lock.unlock_shared();
+      return ret;
+    }
+    size_t get_hit_count() const {
+      return hits.load();
+    }
+    void foreach(std::function<void(uint64_t, uint64_t)> notify) {
+      std::unique_lock _lock(lock);
+      for (uint64_t i = 0; i < buckets.size(); i++) {
+        auto cb = [&](uint64_t o) {
+          notify(o, i << myTraits.base_bits);
+        };
+        buckets[i].foreach(cb);
+      }
+    }
+  };
 
 public:
   Allocator(std::string_view name,
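For a feel of the bucket-index math introduced above, the standalone sketch below (not part of the patch) reproduces the logic of _get_p2_size_bucket() and _get_linear_size_bucket() for the default traits, base_bits = 12 (4 KiB) and factor = 1, substituting C++20 std::bit_width() for Ceph's cbits():

#include <bit>
#include <cstddef>
#include <cstdint>
#include <cstdio>

int main() {
  const size_t base_bits = 12;            // 4 KiB minimum extent
  const uint64_t base = 1ull << base_bits;
  const size_t factor = 1;
  const size_t p2_buckets = 14;           // RANGE_SIZE_BUCKET_COUNT below
  const size_t linear_buckets = 16 + 1;   // cache: BUCKET_COUNT + catch-all

  auto p2_bucket = [&](uint64_t len) -> size_t {
    // largest length served by a dedicated bucket; anything bigger
    // falls into the last ("catch-all") bucket
    const uint64_t len_p2_max = base << (factor * (p2_buckets - 2));
    if (len <= base) return 0;
    if (len > len_p2_max) return p2_buckets - 1;
    size_t most_bit = size_t(std::bit_width(len - 1)) - 1; // cbits(len-1)-1
    return 1 + (most_bit - base_bits) / factor;
  };

  auto linear_bucket = [&](uint64_t len) -> size_t {
    size_t idx = size_t((len / factor) >> base_bits);
    return idx < linear_buckets ? idx : linear_buckets - 1;
  };

  // note: the two schemes disagree at exact multiples of the base,
  // e.g. a 4 KiB extent maps to p2 bucket 0 but linear bucket 1
  for (uint64_t len : {4096ull, 4097ull, 8192ull, 65536ull, 1ull << 40}) {
    std::printf("len=%-14llu p2_bucket=%-2zu linear_bucket=%zu\n",
                (unsigned long long)len, p2_bucket(len), linear_bucket(len));
  }
  return 0;
}

With these defaults, p2 bucket 0 serves lengths up to 4 KiB, bucket i serves (4 KiB * 2^(i-1), 4 KiB * 2^i], and anything above 16 MiB (base << (num_buckets - 2)) lands in the catch-all bucket 13.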
diff --git a/src/os/bluestore/Btree2Allocator.cc b/src/os/bluestore/Btree2Allocator.cc
index d0a2763ac2f..33ee6348c8e 100644
--- a/src/os/bluestore/Btree2Allocator.cc
+++ b/src/os/bluestore/Btree2Allocator.cc
@@ -24,13 +24,15 @@ Btree2Allocator::Btree2Allocator(CephContext* _cct,
   bool with_cache,
   std::string_view name) :
   Allocator(name, device_size, block_size),
+  myTraits(RANGE_SIZE_BUCKET_COUNT),
   cct(_cct),
   range_count_cap(max_mem / sizeof(range_seg_t))
 {
   set_weight_factor(_rweight_factor);
   if (with_cache) {
-    cache = new ChunkCache();
+    cache = new OpportunisticExtentCache();
   }
+  range_size_set.resize(myTraits.num_buckets);
 }
 
 void Btree2Allocator::init_add_free(uint64_t offset, uint64_t length)
@@ -195,7 +197,7 @@ int64_t Btree2Allocator::_allocate(
         continue;
       }
     }
-    size_t bucket0 = MyTraits::_get_size_bucket(want_now);
+    size_t bucket0 = myTraits._get_p2_size_bucket(want_now);
     int64_t r = __allocate(bucket0, want_now,
       unit, extents);
     if (r < 0) {
@@ -329,7 +331,7 @@ int64_t Btree2Allocator::__allocate(
 
   auto rs_p = _pick_block(0, rs_tree, size);
   if (rs_p == rs_tree->end()) {
-    auto bucket_center = MyTraits::_get_size_bucket(weight_center);
+    auto bucket_center = myTraits._get_p2_size_bucket(weight_center);
 
     // requested size is to the left of weight center
     // hence we try to search up toward it first
@@ -356,7 +358,7 @@ int64_t Btree2Allocator::__allocate(
     bucket = dir < 0 ? bucket0 : bucket_center + 1;
     do {
       // try spilled over or different direction if bucket index is out of bounds
-      if (bucket >= MyTraits::num_size_buckets) {
+      if (bucket >= myTraits.num_buckets) {
        if (dir < 0) {
          // reached the bottom while going downhill,
          // time to try spilled over extents
@@ -376,7 +378,7 @@ int64_t Btree2Allocator::__allocate(
         dir = -dir;
         bucket = dir < 0 ? bucket0 : bucket_center + 1; // See above on new bucket
                                                         // selection rationales
-        ceph_assert(bucket < MyTraits::num_size_buckets); // this should never happen
+        ceph_assert(bucket < myTraits.num_buckets); // this should never happen
         if (dir == dir0) {
           // stop if both directions already attempted
           return -ENOSPC;
@@ -465,7 +467,7 @@ void Btree2Allocator::_remove_from_tree(
   uint64_t end)
 {
   range_seg_t rs(rt_p->first, rt_p->second);
-  size_t bucket = MyTraits::_get_size_bucket(rs.length());
+  size_t bucket = myTraits._get_p2_size_bucket(rs.length());
   range_size_tree_t* rs_tree = &range_size_set[bucket];
   auto rs_p = rs_tree->find(rs);
   ceph_assert(rs_p != rs_tree->end());
@@ -569,7 +571,7 @@ bool Btree2Allocator::__try_insert_range(
 void Btree2Allocator::_range_size_tree_add(const range_seg_t& rs) {
   auto l = rs.length();
   ceph_assert(rs.end > rs.start);
-  size_t bucket = MyTraits::_get_size_bucket(l);
+  size_t bucket = myTraits._get_p2_size_bucket(l);
   range_size_set[bucket].insert(rs);
 
   num_free += l;
@@ -582,7 +584,7 @@ void Btree2Allocator::_range_size_tree_add(const range_seg_t& rs) {
 }
 
 void Btree2Allocator::_range_size_tree_rm(const range_seg_t& rs) {
-  size_t bucket = MyTraits::_get_size_bucket(rs.length());
+  size_t bucket = myTraits._get_p2_size_bucket(rs.length());
   range_size_tree_t* rs_tree = &range_size_set[bucket];
   ceph_assert(rs_tree->size() > 0);
   auto rs_p = rs_tree->find(rs);
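All the renamed call sites above feed the same search strategy in __allocate(): walk the range_size_set buckets in one direction, then restart in the other before giving up with -ENOSPC. The model below (assumed names, simplified to one fixed ordering of the two passes) captures only the scan shape; the real routine picks the initial direction from where the request sits relative to the weight center, flips direction at a boundary, and also consults spilled-over extents between passes:

#include <cstdint>
#include <cstdio>
#include <iterator>
#include <optional>
#include <set>
#include <vector>

using bucket_set = std::multiset<uint64_t>; // stand-in for range_size_tree_t

std::optional<uint64_t> pick(std::vector<bucket_set>& buckets,
                             size_t bucket0, size_t bucket_center) {
  // Pass 1: downhill from the bucket matching the request toward bucket 0.
  for (size_t b = bucket0 + 1; b-- > 0; ) {
    if (!buckets[b].empty()) {
      auto it = std::prev(buckets[b].end()); // largest extent in this bucket
      uint64_t off = *it;
      buckets[b].erase(it);
      return off;
    }
  }
  // Pass 2: uphill from just above the weight center toward larger buckets.
  for (size_t b = bucket_center + 1; b < buckets.size(); ++b) {
    if (!buckets[b].empty()) {
      auto it = buckets[b].begin();          // smallest adequate extent
      uint64_t off = *it;
      buckets[b].erase(it);
      return off;
    }
  }
  return std::nullopt; // both directions exhausted: -ENOSPC in the patch
}

int main() {
  std::vector<bucket_set> buckets(14);       // RANGE_SIZE_BUCKET_COUNT
  buckets[5].insert(0x100000);               // one free extent in bucket 5
  auto r = pick(buckets, /*bucket0=*/3, /*bucket_center=*/4);
  std::printf("%s\n", r ? "hit in uphill pass" : "ENOSPC");
  return 0;
}

Keeping one tree per size class is what makes each pass cheap: a bucket either holds a suitable extent or is skipped in O(1).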
diff --git a/src/os/bluestore/Btree2Allocator.h b/src/os/bluestore/Btree2Allocator.h
index f950999adc4..bccdeb6ad05 100644
--- a/src/os/bluestore/Btree2Allocator.h
+++ b/src/os/bluestore/Btree2Allocator.h
@@ -14,146 +14,23 @@
 #include "include/mempool.h"
 #include "common/ceph_mutex.h"
 
-template <size_t BASE_BITS, size_t NUM_BUCKETS, size_t FACTOR = 1>
-struct SizeBucketCollectionTraits {
-  static const size_t base_bits = BASE_BITS;
-  static const size_t base = 1ull << base_bits;
-  static const size_t size_factor = FACTOR; // single bucket size range to be
-                                            // determined as [n, n * 2 * FACTOR)
-  static const size_t num_size_buckets = NUM_BUCKETS;
-  static const size_t bucket_max =
-    base << ((size_factor * (num_size_buckets - 2)));
-
-  static inline size_t _get_size_bucket(uint64_t len) {
-    size_t idx;
-    ceph_assert(size_factor);
-    if (len <= base) {
-      idx = 0;
-    } else if (len > bucket_max) {
-      idx = num_size_buckets - 1;
-    } else {
-      size_t most_bit = cbits(uint64_t(len - 1)) - 1;
-      idx = 1 + ((most_bit - base_bits) / size_factor);
-    }
-    ceph_assert(idx < num_size_buckets);
-    return idx;
-  }
-};
-
-class ChunkCache {
-public:
-  class LocklessBuf {
-    std::atomic<size_t> ref = 0;
-    size_t count = 0;
-    std::vector<uint32_t> data;
-  public:
-    void init(size_t size) {
-      ceph_assert(data.size() == 0);
-      data.resize(size);
-    }
-    bool try_put(uint32_t& v) {
-      bool done = ++ref == 1 && count < data.size();
-      if (done) {
-        data[count++] = v;
-      }
-      --ref;
-      return done;
-    }
-    bool try_get(uint32_t& v) {
-      bool done = ++ref == 1 && count > 0;
-      if (done) {
-        v = data[--count];
-      }
-      --ref;
-      return done;
-    }
-    void foreach(std::function<void(uint32_t)> notify) {
-      for (size_t i = 0; i < count; i++) {
-        notify(data[i]);
-      }
-    }
-  };
-
-  static const size_t base_bits = 12; // 4K
-  static const size_t base = 1ull << base_bits;
-  static const size_t num_buckets = 16; // [4K, 8K, 12K, ... 64K] - 16 buckets total
-  static const size_t num_chunks = 16;
-  ChunkCache() {
-    for (size_t i = 0; i < buckets.size(); i++) {
-      buckets[i].init(num_chunks);
-    }
-  }
-  bool try_put(uint64_t offset, uint64_t len) {
-    if (!lock.try_lock_shared()) {
-      return false;
-    }
-    bool ret = false;
-    ceph_assert(p2aligned(offset, base));
-    ceph_assert(p2aligned(len, base));
-    // permit caching chunks with offsets fitting (without LSBs)
-    // into 32-bit value
-    offset = offset >> base_bits;
-    size_t idx = len >> base_bits;
-    if (offset <= std::numeric_limits<uint32_t>::max() &&
-        idx < buckets.size()) {
-      uint32_t o = offset;
-      ret = buckets[idx].try_put(o);
-    }
-    lock.unlock_shared();
-    return ret;
-  }
-  bool try_get(uint64_t* offset, uint64_t len) {
-    if (!lock.try_lock_shared()) {
-      return false;
-    }
-    bool ret = false;
-    ceph_assert(offset);
-    ceph_assert(p2aligned(len, base));
-    size_t idx = len >> base_bits;
-    if (idx < buckets.size()) {
-      uint32_t o = 0;
-      ret = buckets[idx].try_get(o);
-      if (ret) {
-        *offset = uint64_t(o) << base_bits;
-        ++hits;
-      }
-    }
-    lock.unlock_shared();
-    return ret;
-  }
-  size_t get_hit_count() const {
-    return hits.load();
-  }
-  void foreach(std::function<void(uint64_t, uint64_t)> notify) {
-    std::unique_lock _lock(lock);
-    for (uint64_t i = 0; i < buckets.size(); i++) {
-      auto cb = [&](uint32_t o) {
-        notify(uint64_t(o) << base_bits, i << base_bits);
-      };
-      buckets[i].foreach(cb);
-    }
-  }
-private:
-  std::array<LocklessBuf, num_buckets> buckets;
-  std::atomic<size_t> hits = 0;
-  ceph::shared_mutex lock{
-    ceph::make_shared_mutex(std::string(), false, false, false)
-  };
-};
-
 /*
  * class Btree2Allocator
  *
  *
  */
 class Btree2Allocator : public Allocator {
-  typedef SizeBucketCollectionTraits<12, 14> MyTraits;
+  enum {
+    RANGE_SIZE_BUCKET_COUNT = 14,
+  };
+  const ExtentCollectionTraits myTraits;
+
 public:
   // Making public to share with mempools
   struct range_seg_t {
     MEMPOOL_CLASS_HELPERS();  ///< memory monitoring
-    uint64_t start;   ///< starting offset of this segment
-    uint64_t end;     ///< ending offset (non-inclusive)
+    uint64_t start; ///< starting offset of this segment
+    uint64_t end;   ///< ending offset (non-inclusive)
 
     // Tree is sorted by offset, greater offsets at the end of the tree.
     struct before_t {
@@ -252,7 +129,7 @@ public:
 
 private:
   CephContext* cct = nullptr;
-  ChunkCache* cache = nullptr;
+  Allocator::OpportunisticExtentCache* cache = nullptr;
 
   std::mutex lock;
   template <typename T>
@@ -276,8 +153,7 @@ private:
     range_seg_t,
     range_seg_t::shorter_t,
    pool_allocator<range_seg_t>>;
-  std::array<
-    range_size_tree_t, MyTraits::num_size_buckets> range_size_set;
+  std::vector<range_size_tree_t> range_size_set;
 
   std::atomic<uint64_t> num_free = 0; ///< total bytes in freelist
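The header now delegates caching to Allocator::OpportunisticExtentCache, whose buckets follow the admission protocol shown below. This is a dependency-free restatement of LocklessOpportunisticStack (ceph_assert(), p2aligned() and the shared_mutex wrapper trimmed away), meant to illustrate the semantics rather than replace the patch code:

#include <atomic>
#include <cstdint>
#include <cstdio>
#include <vector>

class OpportunisticStack {
  std::atomic<size_t> ref{0};
  std::atomic<size_t> count{0};
  std::vector<uint64_t> data;
public:
  explicit OpportunisticStack(size_t capacity) : data(capacity) {}
  bool try_put(uint64_t v) {
    // Succeed only when this call is the sole active visitor (ref: 0 -> 1)
    // and there is room; otherwise reject and let the caller keep the extent.
    bool done = ++ref == 1 && count < data.size();
    if (done) data[count++] = v;
    --ref;
    return done;
  }
  bool try_get(uint64_t& v) {
    bool done = ++ref == 1 && count > 0;
    if (done) v = data[--count];
    --ref;
    return done;
  }
};

int main() {
  OpportunisticStack s(2);                // tiny capacity to show rejection
  std::printf("%d\n", s.try_put(0x1000)); // 1: accepted
  std::printf("%d\n", s.try_put(0x2000)); // 1: accepted
  std::printf("%d\n", s.try_put(0x3000)); // 0: full, rejected without blocking
  uint64_t off = 0;
  s.try_get(off);                         // LIFO: returns 0x2000
  std::printf("0x%llx\n", (unsigned long long)off);
  return 0;
}

Rejection instead of waiting is the point of the design: a conflict or a full bucket simply returns false and the caller falls through to the locked B-tree path, so neither the release nor the allocate fast path ever blocks.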
-- 
2.39.5
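As a closing illustration, a quick two-thread stress of the same admission idea (standalone sketch, not part of the patch) shows why the structure needs no lock for its own consistency: only the visitor that moves ref from 0 to 1 may touch the bucket, so concurrent callers reject each other instead of corrupting state, and accepted entries can never exceed capacity:

#include <atomic>
#include <cstdint>
#include <cstdio>
#include <thread>
#include <vector>

// Put-only fragment of the bucket stack, just enough to watch the
// admission protocol under contention.
struct Bucket {
  std::atomic<size_t> ref{0};
  std::atomic<size_t> count{0};
  std::vector<uint64_t> data;
  explicit Bucket(size_t cap) : data(cap) {}
  bool try_put(uint64_t v) {
    bool done = ++ref == 1 && count < data.size();
    if (done) data[count++] = v; // safe: exclusive access was just proven
    --ref;
    return done;
  }
};

int main() {
  Bucket b(16); // EXTENTS_PER_BUCKET in the patch
  std::atomic<size_t> accepted{0}, rejected{0};
  auto worker = [&] {
    for (uint64_t i = 0; i < 100000; ++i) {
      if (b.try_put(i << 12)) {
        ++accepted; // bounded by the bucket capacity of 16
      } else {
        ++rejected; // full or contended: the caller just moves on
      }
    }
  };
  std::thread t1(worker), t2(worker);
  t1.join();
  t2.join();
  std::printf("accepted=%zu rejected=%zu\n", accepted.load(), rejected.load());
  return 0;
}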