#include <ostream>
#include "include/ceph_assert.h"
#include "bluestore_types.h"
+#include "common/ceph_mutex.h"
typedef interval_set<uint64_t> release_set_t;
typedef release_set_t::value_type release_set_entry_t;
class Allocator {
+protected:
+
+ struct ExtentCollectionTraits {
+ size_t num_buckets;
+ size_t base_bits; // log2 of the minimal extent size
+ size_t base = 1ull << base_bits; // minimal extent size, bytes
+ size_t factor; // bucket sizing step: a single bucket covers
+ // the size range [len, len << factor)
+ // for log2(len) indexing and
+ // [len, len + factor * base)
+ // for linear indexing.
+
+ ExtentCollectionTraits(size_t _num_buckets,
+ size_t _base_bits = 12, //= 4096 bytes
+ size_t _factor = 1) :
+ num_buckets(_num_buckets),
+ base_bits(_base_bits),
+ base(1ull << base_bits),
+ factor(_factor)
+ {
+ ceph_assert(factor);
+ }
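+ // E.g. ExtentCollectionTraits(14), as used by Btree2Allocator below,
+ // yields 14 buckets over a 4 KiB base with factor = 1.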
+
+ /*
+ * Determines bucket index for a given extent's length in a bucket collection
+ * with log2(len) indexing.
+ * The last bucket index is returned for lengths above the maximum.
+ */
+ inline size_t _get_p2_size_bucket(uint64_t len) const {
+ size_t idx;
+ const size_t len_p2_max =
+ base << ((factor * (num_buckets - 2)));
+ if (len <= base) {
+ idx = 0;
+ } else if (len > len_p2_max) {
+ idx = num_buckets - 1;
+ } else {
+ size_t most_bit = cbits(uint64_t(len - 1)) - 1;
+ idx = 1 + ((most_bit - base_bits) / factor);
+ }
+ ceph_assert(idx < num_buckets);
+ return idx;
+ }
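+ // Illustration only (with the default base_bits = 12, i.e. 4 KiB,
+ // and factor = 1): len <= 4K -> 0, (4K, 8K] -> 1, (8K, 16K] -> 2,
+ // (16K, 32K] -> 3, ...; lengths above the covered maximum land in
+ // the last bucket.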
+ /*
+ * Determines bucket index for a given extent's length in a bucket collection
+ * with linear (len / min_extent_size) indexing.
+ * The last bucket index is returned for lengths above the maximum.
+ */
+ inline size_t _get_linear_size_bucket(uint64_t len) const {
+ size_t idx = (len / factor) >> base_bits;
+ idx = idx < num_buckets ? idx : num_buckets - 1;
+ return idx;
+ }
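+ // Illustration only (with base_bits = 12 and factor = 1):
+ // 4K -> 1, 8K -> 2, ... 60K -> 15; anything larger is clamped
+ // to the last bucket (num_buckets - 1).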
+ };
+
+ /*
+ * Lock-free, opportunistic stack.
+ * A put/get is performed only when it can complete without waiting;
+ * conflicting concurrent operations are simply rejected rather than
+ * serialized.
+ */
+ class LocklessOpportunisticStack {
+ std::atomic<size_t> ref = 0;
+ std::atomic<size_t> count = 0;
+ std::vector<uint64_t> data;
+ public:
+ void init(size_t size) {
+ data.resize(size);
+ }
+ bool try_put(uint64_t& v) {
+ bool done = ++ref == 1 && count < data.size();
+ if (done) {
+ data[count++] = v;
+ }
+ --ref;
+ return done;
+ }
+ bool try_get(uint64_t& v) {
+ bool done = ++ref == 1 && count > 0;
+ if (done) {
+ v = data[--count];
+ }
+ --ref;
+ return done;
+ }
+ void foreach(std::function<void(uint64_t)> notify) {
+ for (size_t i = 0; i < count; i++) {
+ notify(data[i]);
+ }
+ }
+ };
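+ // Hypothetical usage sketch (illustration only, not part of this change):
+ // LocklessOpportunisticStack s;
+ // s.init(16);
+ // uint64_t off = 0x1000;
+ // if (!s.try_put(off)) { /* busy or full: caller keeps the extent */ }
+ // uint64_t got = 0;
+ // if (s.try_get(got)) { /* reuse the cached offset */ }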
+ /*
+ * Concurrently accessed extent (offset, length) cache
+ * which permits a put/get operation only if no waiting is needed.
+ * Implemented as a set of independent buckets (aka LocklessOpportunisticStack).
+ * Each bucket keeps extents of a single size only (4K, 8K, 12K ... 64K),
+ * which avoids tracking the size of each cached extent.
+ * Each bucket permits a single operation at a time; additional
+ * concurrent operations against that bucket are rejected, meaning
+ * the relevant extents simply aren't cached.
+ */
+ class OpportunisticExtentCache {
+ const Allocator::ExtentCollectionTraits myTraits;
+ enum {
+ BUCKET_COUNT = 16,
+ EXTENTS_PER_BUCKET = 16, // amount of entries per single bucket,
+ // total amount of entries will be
+ // BUCKET_COUNT * EXTENTS_PER_BUCKET.
+ };
+
+ std::vector<LocklessOpportunisticStack> buckets;
+ std::atomic<size_t> hits = 0;
+ ceph::shared_mutex lock{
+ ceph::make_shared_mutex(std::string(), false, false, false)
+ };
+ public:
+ OpportunisticExtentCache() :
+ myTraits(BUCKET_COUNT + 1), // 16 regular buckets + 1 "catch-all"
+ // pseudo bucket used for out-of-bounds
+ // detection, since _get_*_size_bucket()
+ // returns the last bucket's index for
+ // lengths above the maximum.
+ buckets(BUCKET_COUNT)
+ {
+ for(auto& b : buckets) {
+ b.init(EXTENTS_PER_BUCKET);
+ }
+ }
+ bool try_put(uint64_t offset, uint64_t len) {
+ if (!lock.try_lock_shared()) {
+ return false;
+ }
+ bool ret = false;
+ ceph_assert(p2aligned(offset, myTraits.base));
+ ceph_assert(p2aligned(len, myTraits.base));
+ auto idx = myTraits._get_linear_size_bucket(len);
+ if (idx < buckets.size())
+ ret = buckets[idx].try_put(offset);
+ lock.unlock_shared();
+ return ret;
+ }
+ bool try_get(uint64_t* offset, uint64_t len) {
+ if (!lock.try_lock_shared()) {
+ return false;
+ }
+ bool ret = false;
+ ceph_assert(offset);
+ ceph_assert(p2aligned(len, myTraits.base));
+ auto idx = myTraits._get_linear_size_bucket(len); // same bucketing as try_put()
+ if (idx < buckets.size()) {
+ ret = buckets[idx].try_get(*offset);
+ if (ret) {
+ ++hits;
+ }
+ }
+ lock.unlock_shared();
+ return ret;
+ }
+ size_t get_hit_count() const {
+ return hits.load();
+ }
+ void foreach(std::function<void(uint64_t offset, uint64_t length)> notify) {
+ std::unique_lock _lock(lock);
+ for (uint64_t i = 0; i < buckets.size(); i++) {
+ auto cb = [&](uint64_t o) {
+ notify(o, i << myTraits.base_bits);
+ };
+ buckets[i].foreach(cb);
+ }
+ }
+ };
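+ // Hypothetical usage sketch (illustration only): the owning allocator is
+ // expected to offer small aligned extents to the cache on release and to
+ // probe it first on allocation; a 'false' return simply means falling
+ // back to the regular, lock-protected path.
+ // OpportunisticExtentCache c;
+ // if (!c.try_put(0x10000, 0x2000)) { /* cache busy/full: release normally */ }
+ // uint64_t off = 0;
+ // if (c.try_get(&off, 0x2000)) { /* reuse a cached 8 KiB extent at 'off' */ }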
public:
Btree2Allocator(CephContext* _cct,
  int64_t device_size,
  int64_t block_size,
  uint64_t max_mem,
  double _rweight_factor,
  bool with_cache,
  std::string_view name) :
Allocator(name, device_size, block_size),
+ myTraits(RANGE_SIZE_BUCKET_COUNT),
cct(_cct),
range_count_cap(max_mem / sizeof(range_seg_t))
{
set_weight_factor(_rweight_factor);
if (with_cache) {
- cache = new ChunkCache();
+ cache = new OpportunisticExtentCache();
}
+ range_size_set.resize(myTraits.num_buckets);
}
void Btree2Allocator::init_add_free(uint64_t offset, uint64_t length)
continue;
}
}
- size_t bucket0 = MyTraits::_get_size_bucket(want_now);
+ size_t bucket0 = myTraits._get_p2_size_bucket(want_now);
int64_t r = __allocate(bucket0, want_now,
unit, extents);
if (r < 0) {
auto rs_p = _pick_block(0, rs_tree, size);
if (rs_p == rs_tree->end()) {
- auto bucket_center = MyTraits::_get_size_bucket(weight_center);
+ auto bucket_center = myTraits._get_p2_size_bucket(weight_center);
// requested size is to the left of weight center
// hence we try to search up toward it first
bucket = dir < 0 ? bucket0 : bucket_center + 1;
do {
// try spilled over or different direction if bucket index is out of bounds
- if (bucket >= MyTraits::num_size_buckets) {
+ if (bucket >= myTraits.num_buckets) {
if (dir < 0) {
// reached the bottom while going downhill,
// time to try spilled over extents
dir = -dir;
bucket = dir < 0 ? bucket0 : bucket_center + 1; // See above on new bucket
// selection rationales
- ceph_assert(bucket < MyTraits::num_size_buckets); // this should never happen
+ ceph_assert(bucket < myTraits.num_buckets); // this should never happen
if (dir == dir0 ) {
// stop if both directions already attempted
return -ENOSPC;
uint64_t end)
{
range_seg_t rs(rt_p->first, rt_p->second);
- size_t bucket = MyTraits::_get_size_bucket(rs.length());
+ size_t bucket = myTraits._get_p2_size_bucket(rs.length());
range_size_tree_t* rs_tree = &range_size_set[bucket];
auto rs_p = rs_tree->find(rs);
ceph_assert(rs_p != rs_tree->end());
void Btree2Allocator::_range_size_tree_add(const range_seg_t& rs) {
auto l = rs.length();
ceph_assert(rs.end > rs.start);
- size_t bucket = MyTraits::_get_size_bucket(l);
+ size_t bucket = myTraits._get_p2_size_bucket(l);
range_size_set[bucket].insert(rs);
num_free += l;
}
void Btree2Allocator::_range_size_tree_rm(const range_seg_t& rs)
{
- size_t bucket = MyTraits::_get_size_bucket(rs.length());
+ size_t bucket = myTraits._get_p2_size_bucket(rs.length());
range_size_tree_t* rs_tree = &range_size_set[bucket];
ceph_assert(rs_tree->size() > 0);
auto rs_p = rs_tree->find(rs);
#include "include/mempool.h"
#include "common/ceph_mutex.h"
-template <size_t BASE_BITS, size_t NUM_BUCKETS, size_t FACTOR = 1>
-struct SizeBucketCollectionTraits {
- static const size_t base_bits = BASE_BITS;
- static const size_t base = 1ull << base_bits;
- static const size_t size_factor = FACTOR; // single bucket size range to be
- // determined as [n, n * 2 * FACTOR)
- static const size_t num_size_buckets = NUM_BUCKETS;
- static const size_t bucket_max =
- base << ((size_factor * (num_size_buckets - 2)));
-
- static inline size_t _get_size_bucket(uint64_t len) {
- size_t idx;
- ceph_assert(size_factor);
- if (len <= base) {
- idx = 0;
- } else if (len > bucket_max) {
- idx = num_size_buckets - 1;
- } else {
- size_t most_bit = cbits(uint64_t(len - 1)) - 1;
- idx = 1 + ((most_bit - base_bits) / size_factor);
- }
- ceph_assert(idx < num_size_buckets);
- return idx;
- }
-};
-
-class ChunkCache {
-public:
- class LocklessBuf {
- std::atomic<size_t> ref = 0;
- size_t count = 0;
- std::vector<uint32_t> data;
- public:
- void init(size_t size) {
- ceph_assert(data.size() == 0);
- data.resize(size);
- }
- bool try_put(uint32_t& v) {
- bool done = ++ref == 1 && count < data.size();
- if (done) {
- data[count++] = v;
- }
- --ref;
- return done;
- }
- bool try_get(uint32_t& v) {
- bool done = ++ref == 1 && count > 0;
- if (done) {
- v = data[--count];
- }
- --ref;
- return done;
- }
- void foreach(std::function<void(uint32_t)> notify) {
- for (size_t i = 0; i < count; i++) {
- notify(data[i]);
- }
- }
- };
-
- static const size_t base_bits = 12; // 4K
- static const size_t base = 1ull << base_bits;
- static const size_t num_buckets = 16; // [4K, 8K, 12K, ... 64K] - 16 buckets total
- static const size_t num_chunks = 16;
- ChunkCache() {
- for (size_t i = 0; i < buckets.size(); i++) {
- buckets[i].init(num_chunks);
- }
- }
- bool try_put(uint64_t offset, uint64_t len) {
- if (!lock.try_lock_shared()) {
- return false;
- }
- bool ret = false;
- ceph_assert(p2aligned(offset, base));
- ceph_assert(p2aligned(len, base));
- // permit caching chunks with offsets fitting (without LSBs)
- // into 32 - bit value
- offset = offset >> base_bits;
- size_t idx = len >> base_bits;
- if (offset <= std::numeric_limits<uint32_t>::max() &&
- idx < buckets.size()) {
- uint32_t o = offset;
- ret = buckets[idx].try_put(o);
- }
- lock.unlock_shared();
- return ret;
- }
- bool try_get(uint64_t* offset, uint64_t len) {
- if (!lock.try_lock_shared()) {
- return false;
- }
- bool ret = false;
- ceph_assert(offset);
- ceph_assert(p2aligned(len, base));
- size_t idx = len >> base_bits;
- if (idx < buckets.size()) {
- uint32_t o = 0;
- ret = buckets[idx].try_get(o);
- if (ret) {
- *offset = uint64_t(o) << base_bits;
- ++hits;
- }
- }
- lock.unlock_shared();
- return ret;
- }
- size_t get_hit_count() const {
- return hits.load();
- }
- void foreach(std::function<void(uint64_t offset, uint64_t length)> notify) {
- std::unique_lock _lock(lock);
- for (uint64_t i = 0; i < buckets.size(); i++) {
- auto cb = [&](uint32_t o) {
- notify(uint64_t(o) << base_bits, i << base_bits);
- };
- buckets[i].foreach(cb);
- }
- }
-private:
- std::array<LocklessBuf, num_buckets> buckets;
- std::atomic<size_t> hits = 0;
- ceph::shared_mutex lock{
- ceph::make_shared_mutex(std::string(), false, false, false)
- };
-};
-
/*
* class Btree2Allocator
*
*
*/
class Btree2Allocator : public Allocator {
- typedef SizeBucketCollectionTraits<12, 14> MyTraits;
+ enum {
+ RANGE_SIZE_BUCKET_COUNT = 14,
+ };
+ const ExtentCollectionTraits myTraits;
+
public:
// Making public to share with mempools
struct range_seg_t {
MEMPOOL_CLASS_HELPERS(); ///< memory monitoring
- uint64_t start; ///< starting offset of this segment
- uint64_t end; ///< ending offset (non-inclusive)
+ uint64_t start; ///< starting offset of this segment
+ uint64_t end; ///< ending offset (non-inclusive)
// Tree is sorted by offset, greater offsets at the end of the tree.
struct before_t {
private:
CephContext* cct = nullptr;
- ChunkCache* cache = nullptr;
+ Allocator::OpportunisticExtentCache* cache = nullptr;
std::mutex lock;
template<class T>
range_seg_t,
range_seg_t::shorter_t,
pool_allocator<range_seg_t>>;
- std::array<
- range_size_tree_t, MyTraits::num_size_buckets> range_size_set;
+ std::vector<range_size_tree_t> range_size_set;
std::atomic<uint64_t> num_free = 0; ///< total bytes in freelist