- avl
- btree
- hybrid
+ - hybrid_btree2
with_legacy: true
- name: bluefs_log_replay_check_allocations
type: bool
- btree
- hybrid
- zoned
+ - hybrid_btree2
with_legacy: true
- name: bluestore_freelist_blocks_per_key
type: size
level: dev
desc: Maximum RAM hybrid allocator should use before enabling bitmap supplement
default: 64_M
+- name: bluestore_btree2_alloc_weight_factor
+ type: float
+ level: dev
+ desc: Weight factor for large contiguous extents
+ default: 2
- name: bluestore_volume_selection_policy
type: str
level: dev
${PROJECT_SOURCE_DIR}/src/os/bluestore/Allocator.cc
${PROJECT_SOURCE_DIR}/src/os/bluestore/AvlAllocator.cc
${PROJECT_SOURCE_DIR}/src/os/bluestore/BtreeAllocator.cc
+ ${PROJECT_SOURCE_DIR}/src/os/bluestore/Btree2Allocator.cc
${PROJECT_SOURCE_DIR}/src/os/bluestore/BitmapFreelistManager.cc
${PROJECT_SOURCE_DIR}/src/os/bluestore/BlueFS.cc
${PROJECT_SOURCE_DIR}/src/os/bluestore/bluefs_types.cc
bluestore/BitmapAllocator.cc
bluestore/AvlAllocator.cc
bluestore/BtreeAllocator.cc
+ bluestore/Btree2Allocator.cc
bluestore/HybridAllocator.cc
)
endif(WITH_BLUESTORE)
#include "BitmapAllocator.h"
#include "AvlAllocator.h"
#include "BtreeAllocator.h"
+#include "Btree2Allocator.h"
#include "HybridAllocator.h"
#ifdef HAVE_LIBZBD
#include "ZonedAllocator.h"
return new ZonedAllocator(cct, size, block_size, zone_size, first_sequential_zone,
name);
#endif
+ } else if (type == "hybrid_btree2") {
+ return new HybridBtree2Allocator(cct, size, block_size,
+ cct->_conf.get_val<uint64_t>("bluestore_hybrid_alloc_mem_cap"),
+ cct->_conf.get_val<double>("bluestore_btree2_alloc_weight_factor"),
+ name);
}
if (alloc == nullptr) {
lderr(cct) << "Allocator::" << __func__ << " unknown alloc type "
uint64_t _lowest_size_available() {
auto rs = range_size_tree.begin();
- return rs != range_size_tree.end() ? rs->length() : std::numeric_limits<uint16_t>::max();
+ return rs != range_size_tree.end() ?
+ rs->length() :
+ std::numeric_limits<uint64_t>::max();
}
int64_t _allocate(
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "Btree2Allocator.h"
+#include "common/debug.h"
+
+#define dout_context (get_context())
+#define dout_subsys ceph_subsys_bluestore
+#undef dout_prefix
+#define dout_prefix *_dout << (std::string(this->get_type()) + "::").c_str()
+
+/*
+ * class Btree2Allocator
+ *
+ *
+ */
+MEMPOOL_DEFINE_OBJECT_FACTORY(Btree2Allocator::range_seg_t, btree2_range_seg_t, bluestore_alloc);
+
+Btree2Allocator::Btree2Allocator(CephContext* _cct,
+ int64_t device_size,
+ int64_t block_size,
+ uint64_t max_mem,
+ double _rweight_factor,
+ bool with_cache,
+ std::string_view name) :
+ Allocator(name, device_size, block_size),
+ cct(_cct),
+ range_count_cap(max_mem / sizeof(range_seg_t))
+{
+ set_weight_factor(_rweight_factor);
+ if (with_cache) {
+ cache = new ChunkCache();
+ }
+}
+
+void Btree2Allocator::init_add_free(uint64_t offset, uint64_t length)
+{
+ ldout(cct, 10) << __func__ << std::hex
+ << " offset 0x" << offset
+ << " length 0x" << length
+ << std::dec << dendl;
+ if (!length)
+ return;
+ std::lock_guard l(lock);
+ ceph_assert(offset + length <= uint64_t(device_size));
+ _add_to_tree(offset, length);
+}
+
+void Btree2Allocator::init_rm_free(uint64_t offset, uint64_t length)
+{
+ ldout(cct, 10) << __func__ << std::hex
+ << " offset 0x" << offset
+ << " length 0x" << length
+ << std::dec << dendl;
+ if (!length)
+ return;
+ std::lock_guard l(lock);
+ ceph_assert(offset + length <= uint64_t(device_size));
+ _remove_from_tree(offset, length);
+}
+
+int64_t Btree2Allocator::allocate(
+ uint64_t want,
+ uint64_t unit,
+ uint64_t max_alloc_size,
+ int64_t hint, // unused, for now!
+ PExtentVector* extents)
+{
+ ldout(cct, 10) << __func__ << std::hex
+ << " want 0x" << want
+ << " unit 0x" << unit
+ << " max_alloc_size 0x" << max_alloc_size
+ << " hint 0x" << hint
+ << std::dec << dendl;
+ ceph_assert(std::has_single_bit(unit));
+ ceph_assert(want % unit == 0);
+
+ if (max_alloc_size == 0) {
+ max_alloc_size = want;
+ }
+ if (constexpr auto cap = std::numeric_limits<decltype(bluestore_pextent_t::length)>::max();
+ max_alloc_size >= cap) {
+ max_alloc_size = p2align(uint64_t(cap), (uint64_t)block_size);
+ }
+ uint64_t cached_chunk_offs = 0;
+ if (cache && cache->try_get(&cached_chunk_offs, want)) {
+ num_free -= want;
+ extents->emplace_back(cached_chunk_offs, want);
+ return want;
+ }
+ std::lock_guard l(lock);
+ return _allocate(want, unit, max_alloc_size, hint, extents);
+}
+
+void Btree2Allocator::release(const release_set_t& release_set)
+{
+ if (!cache || release_set.num_intervals() >= pextent_array_size) {
+ std::lock_guard l(lock);
+ _release(release_set);
+ return;
+ }
+ PExtentArray to_release;
+ size_t count = 0;
+ auto p = release_set.begin();
+ while (p != release_set.end()) {
+ if (cache->try_put(p.get_start(), p.get_len())) {
+ num_free += p.get_len();
+ } else {
+ to_release[count++] = &*p;
+ }
+ ++p;
+ }
+ if (count > 0) {
+ std::lock_guard l(lock);
+ _release(count, &to_release.at(0));
+ }
+}
+
+void Btree2Allocator::_shutdown()
+{
+ if (cache) {
+ delete cache;
+ cache = nullptr;
+ }
+ for (auto& tree : range_size_set) {
+ tree.clear();
+ }
+ range_tree.clear();
+}
+
+void Btree2Allocator::_dump(bool full) const
+{
+ if (full) {
+ ldout(cct, 0) << " >>>range_tree: " << dendl;
+ for (auto& rs : range_tree) {
+ ldout(cct, 0) << std::hex
+ << "0x" << rs.first << "~" << (rs.second - rs.first)
+ << std::dec
+ << dendl;
+ }
+ ldout(cct, 0) << " >>>range_size_trees: " << dendl;
+ size_t i = 0;
+ for (auto& rs_tree : range_size_set) {
+ ldout(cct, 0) << " >>>bucket[" << i << "]" << dendl;
+ ++i;
+ for (auto& rs : rs_tree)
+ ldout(cct, 0) << std::hex
+ << "0x" << rs.start << "~" << (rs.end - rs.start)
+ << std::dec << dendl;
+ }
+ if (cache) {
+ ldout(cct, 0) << " >>>cache: " << dendl;
+ auto cb = [&](uint64_t offset, uint64_t length) {
+ ldout(cct, 0) << std::hex
+ << "0x" << offset << "~" << length
+ << std::dec << dendl;
+ };
+ cache->foreach(cb);
+ }
+ ldout(cct, 0) << " >>>>>>>>>>>" << dendl;
+ }
+ if (cache) {
+ ldout(cct, 0) << " >>>cache stats: " << dendl;
+ ldout(cct, 0) << " hits: " << cache->get_hit_count() << dendl;
+ }
+}
+
+void Btree2Allocator::_foreach(
+ std::function<void(uint64_t offset, uint64_t length)> notify)
+{
+ for (auto& rs : range_tree) {
+ notify(rs.first, rs.second - rs.first);
+ }
+ if (cache) {
+ cache->foreach(notify);
+ }
+}
+
+int64_t Btree2Allocator::_allocate(
+ uint64_t want,
+ uint64_t unit,
+ uint64_t max_alloc_size,
+ int64_t hint, // unused, for now!
+ PExtentVector* extents)
+{
+ uint64_t allocated = 0;
+ while (allocated < want) {
+ auto want_now = std::min(max_alloc_size, want - allocated);
+ if (cache && want_now != want) {
+ uint64_t cached_chunk_offs = 0;
+ if (cache->try_get(&cached_chunk_offs, want_now)) {
+ num_free -= want_now;
+ extents->emplace_back(cached_chunk_offs, want_now);
+ allocated += want_now;
+ continue;
+ }
+ }
+ size_t bucket0 = MyTraits::_get_size_bucket(want_now);
+ int64_t r = __allocate(bucket0, want_now,
+ unit, extents);
+ if (r < 0) {
+ // Allocation failed.
+ break;
+ }
+ allocated += r;
+ }
+ return allocated ? allocated : -ENOSPC;
+}
+
+void Btree2Allocator::_release(const release_set_t& release_set)
+{
+ for (auto p = release_set.begin(); p != release_set.end(); ++p) {
+ const auto offset = p.get_start();
+ const auto length = p.get_len();
+ ceph_assert(offset + length <= uint64_t(device_size));
+ ldout(cct, 10) << __func__ << std::hex
+ << " offset 0x" << offset
+ << " length 0x" << length
+ << std::dec << dendl;
+ _add_to_tree(offset, length);
+ }
+}
+
+void Btree2Allocator::_release(const PExtentVector& release_set)
+{
+ for (auto& e : release_set) {
+ ldout(cct, 10) << __func__ << std::hex
+ << " offset 0x" << e.offset
+ << " length 0x" << e.length
+ << std::dec << dendl;
+ _add_to_tree(e.offset, e.length);
+ }
+}
+
+void Btree2Allocator::_release(size_t count, const release_set_entry_t** to_release)
+{
+ for (size_t i = 0; i < count; i++) {
+ auto* e = to_release[i];
+ ldout(cct, 10) << __func__ << std::hex
+ << " offset 0x" << e->first
+ << " length 0x" << e->second
+ << std::dec << dendl;
+ _add_to_tree(e->first, e->second);
+ }
+}
+
+void Btree2Allocator::_add_to_tree(uint64_t start, uint64_t size)
+{
+ ceph_assert(size != 0);
+
+ uint64_t end = start + size;
+
+ // Make sure we don't overlap with either of our neighbors
+ auto rt_p_after = range_tree.upper_bound(start);
+ auto rt_p_before = range_tree.end();
+ if (rt_p_after != range_tree.begin()) {
+ rt_p_before = std::prev(rt_p_after);
+ }
+ bool merge_before = (rt_p_before != range_tree.end() && rt_p_before->second == start);
+ bool merge_after = (rt_p_after != range_tree.end() && rt_p_after->first == end);
+
+ range_seg_t rs(start, end);
+ if (merge_before && merge_after) {
+ rs = range_seg_t{ rt_p_before->first, rt_p_after->second};
+ _range_size_tree_rm(range_seg_t{ rt_p_before->first, rt_p_before->second });
+ _range_size_tree_rm(range_seg_t{ rt_p_after->first, rt_p_after->second });
+ rt_p_after = range_tree.erase(rt_p_before);
+ rt_p_after = range_tree.erase(rt_p_after);
+ } else if (merge_before) {
+ rs = range_seg_t(rt_p_before->first, end);
+ _range_size_tree_rm(range_seg_t{ rt_p_before->first, rt_p_before->second });
+ rt_p_after = range_tree.erase(rt_p_before);
+ } else if (merge_after) {
+ rs = range_seg_t(start, rt_p_after->second);
+ _range_size_tree_rm(range_seg_t{ rt_p_after->first, rt_p_after->second });
+ rt_p_after = range_tree.erase(rt_p_after);
+ }
+ // create new entry at both range_tree and range_size_set
+ __try_insert_range(rs, &rt_p_after);
+}
+
+void Btree2Allocator::_try_remove_from_tree(uint64_t start, uint64_t size,
+ std::function<void(uint64_t, uint64_t, bool)> cb)
+{
+ uint64_t end = start + size;
+
+ ceph_assert(size != 0);
+
+ auto rt_p = range_tree.lower_bound(start);
+ if ((rt_p == range_tree.end() || rt_p->first > start) && rt_p != range_tree.begin()) {
+ --rt_p;
+ }
+
+ if (rt_p == range_tree.end() || rt_p->first >= end) {
+ cb(start, size, false);
+ return;
+ }
+ do {
+ auto next_p = rt_p;
+ ++next_p;
+
+ if (start < rt_p->first) {
+ cb(start, rt_p->first - start, false);
+ start = rt_p->first;
+ }
+ auto range_end = std::min(rt_p->second, end);
+
+ _remove_from_tree(rt_p, start, range_end);
+
+ cb(start, range_end - start, true);
+ start = range_end;
+
+ rt_p = next_p;
+ } while (rt_p != range_tree.end() && rt_p->first < end && start < end);
+ if (start < end) {
+ cb(start, end - start, false);
+ }
+}
+
+int64_t Btree2Allocator::__allocate(
+ size_t bucket0,
+ uint64_t size,
+ uint64_t unit,
+ PExtentVector* extents)
+{
+ int64_t allocated = 0;
+
+ auto rs_tree = &range_size_set[bucket0];
+ auto rs_p = _pick_block(0, rs_tree, size);
+
+ if (rs_p == rs_tree->end()) {
+ auto bucket_center = MyTraits::_get_size_bucket(weight_center);
+
+ // requested size is to the left of weight center
+ // hence we try to search up toward it first
+ //
+    size_t bucket = bucket0; // using unsigned for bucket is crucial:
+                             // the bounds check when walking downward
+                             // relies on the unsigned wrap-around that
+                             // happens when decrementing below zero
+ while (++bucket <= bucket_center) {
+ rs_tree = &range_size_set[bucket];
+ rs_p = _pick_block(1, rs_tree, size);
+ if (rs_p != rs_tree->end()) {
+ goto found;
+ }
+ }
+
+ int dir = left_weight() > right_weight() ? -1 : 1; // lookup direction
+ int dir0 = dir;
+
+    // Start from the initial bucket when searching downhill so that this
+    // bucket is scanned again - extents shorter than the requested size,
+    // if any, haven't been checked yet.
+    // Start from bucket_center + 1 when walking uphill.
+ bucket = dir < 0 ? bucket0 : bucket_center + 1;
+ do {
+      // try spilled-over extents or switch direction when the bucket index is out of bounds
+ if (bucket >= MyTraits::num_size_buckets) {
+ if (dir < 0) {
+ // reached the bottom while going downhill,
+ // time to try spilled over extents
+          uint64_t r = _spillover_allocate(size,
+                                           unit,
+                                           size,
+                                           0,
+                                           extents); // 0 is returned if an error was detected.
+ allocated += r;
+ ceph_assert(size >= (uint64_t)r);
+ size -= r;
+ if (size == 0) {
+ return allocated;
+ }
+ }
+ // change direction
+ dir = -dir;
+ bucket = dir < 0 ? bucket0 : bucket_center + 1; // See above on new bucket
+ // selection rationales
+ ceph_assert(bucket < MyTraits::num_size_buckets); // this should never happen
+ if (dir == dir0 ) {
+ // stop if both directions already attempted
+ return -ENOSPC;
+ }
+ }
+ rs_tree = &range_size_set[bucket];
+ rs_p = _pick_block(dir, rs_tree, size);
+ bucket += dir; // this might wrap over zero and trigger direction
+ // change on the next loop iteration.
+ } while (rs_p == rs_tree->end());
+ }
+
+found:
+ if (rs_p != rs_tree->end()) {
+ auto o = rs_p->start;
+ auto l = std::min(size, rs_p->length());
+ auto rt_p = range_tree.find(o);
+ ceph_assert(rt_p != range_tree.end());
+ _remove_from_tree(rs_tree, rs_p, rt_p, o, o + l);
+ extents->emplace_back(o, l);
+ allocated += l;
+ return allocated;
+ }
+ // should never get here
+ ceph_assert(false);
+ return -EFAULT;
+}
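
The comments above describe the bucket search strategy; as an illustration only (not part of the patch), here is a minimal standalone sketch of the order in which __allocate() probes the size buckets: an upward pass from the requested bucket toward the weight-center bucket, then a scan on the heavier side (with spillover attempted once the bottom is reached), and finally the opposite side. The helper name probe_order and the concrete bucket numbers are assumptions made for the example.

// Illustration only: bucket visiting order of Btree2Allocator::__allocate().
#include <cstdio>
#include <vector>

static std::vector<int> probe_order(int bucket0, int bucket_center,
                                    bool left_heavier, int num_buckets = 14) {
  std::vector<int> order;
  // 1) walk up from the requested bucket toward the weight-center bucket
  for (int b = bucket0 + 1; b <= bucket_center; ++b) {
    order.push_back(b);
  }
  // 2) continue on the heavier side first, then flip to the other one;
  //    downhill passes restart at bucket0, uphill passes at bucket_center + 1
  int dir = left_heavier ? -1 : 1;
  int b = dir < 0 ? bucket0 : bucket_center + 1;
  for (int pass = 0; pass < 2; ++pass) {
    while (b >= 0 && b < num_buckets) {
      order.push_back(b);
      b += dir;
    }
    dir = -dir;                   // out of bounds: change direction
    b = dir < 0 ? bucket0 : bucket_center + 1;
  }
  return order;
}

int main() {
  // e.g. the request lands in bucket 2, the weight center sits in bucket 9
  // and the right side is heavier: probes 3..9, then 10..13, then 2..0
  for (int b : probe_order(2, 9, false)) {
    std::printf("%d ", b);
  }
  std::printf("\n");
  return 0;
}
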
+
+Btree2Allocator::range_size_tree_t::iterator
+Btree2Allocator::_pick_block(int dir,
+ Btree2Allocator::range_size_tree_t* tree,
+ uint64_t size)
+{
+ if (dir < 0) {
+    // use the largest available chunk from the 'shorter chunks' buckets;
+    // given the preliminary check before this function is called,
+    // we should get the result in a single step
+ auto p = tree->end();
+ if (p != tree->begin()) {
+ --p;
+ return p;
+ }
+ } else if (dir > 0) {
+ auto p = tree->begin();
+ if (p != tree->end()) {
+ return p;
+ }
+ } else if (tree->size()) {
+    // For the 'best matching' bucket
+    // look for a chunk with the best size match.
+    // Start checking from the last item to
+    // avoid a lookup if possible.
+ auto p = tree->end();
+ --p;
+ if (p->length() > size) {
+ p = tree->lower_bound(range_seg_t{ 0, size });
+ ceph_assert(p != tree->end());
+ }
+ return p;
+ }
+ return tree->end();
+}
+
+void Btree2Allocator::_remove_from_tree(uint64_t start, uint64_t size)
+{
+ ceph_assert(size != 0);
+ ceph_assert(size <= num_free);
+ auto end = start + size;
+
+ // Find chunk we completely overlap with
+ auto rt_p = range_tree.lower_bound(start);
+ if ((rt_p == range_tree.end() || rt_p->first > start) && rt_p != range_tree.begin()) {
+ --rt_p;
+ }
+
+ /// Make sure we completely overlap with someone
+ ceph_assert(rt_p != range_tree.end());
+ ceph_assert(rt_p->first <= start);
+ ceph_assert(rt_p->second >= end);
+ _remove_from_tree(rt_p, start, end);
+}
+
+void Btree2Allocator::_remove_from_tree(
+ Btree2Allocator::range_tree_t::iterator rt_p,
+ uint64_t start,
+ uint64_t end)
+{
+ range_seg_t rs(rt_p->first, rt_p->second);
+ size_t bucket = MyTraits::_get_size_bucket(rs.length());
+ range_size_tree_t* rs_tree = &range_size_set[bucket];
+ auto rs_p = rs_tree->find(rs);
+ ceph_assert(rs_p != rs_tree->end());
+
+ _remove_from_tree(rs_tree, rs_p, rt_p, start, end);
+}
+
+void Btree2Allocator::_remove_from_tree(
+ Btree2Allocator::range_size_tree_t* rs_tree,
+ Btree2Allocator::range_size_tree_t::iterator rs_p,
+ Btree2Allocator::range_tree_t::iterator rt_p,
+ uint64_t start,
+ uint64_t end)
+{
+ bool left_over = (rt_p->first != start);
+ bool right_over = (rt_p->second != end);
+
+ range_seg_t rs = *rs_p;
+ _range_size_tree_rm(rs_tree, rs_p);
+ rt_p = range_tree.erase(rt_p); // It's suboptimal to do the removal every time.
+ // Some paths might avoid that but coupled with
+ // spillover handling this looks non-trivial.
+ // Hence leaving as-is.
+ if (left_over && right_over) {
+ range_seg_t rs_before(rs.start, start);
+ range_seg_t rs_after(end, rs.end);
+ // insert rs_after first since this updates rt_p which starts to point to
+ // newly inserted item.
+ __try_insert_range(rs_after, &rt_p);
+ __try_insert_range(rs_before, &rt_p);
+ } else if (left_over) {
+ rs.end = start;
+ __try_insert_range(rs, &rt_p);
+ } else if (right_over) {
+ rs.start = end;
+ __try_insert_range(rs, &rt_p);
+ }
+}
+
+void Btree2Allocator::_try_insert_range(const range_seg_t& rs)
+{
+ if (__try_insert_range(rs, nullptr)) {
+ _range_size_tree_add(rs);
+ }
+ else {
+ range_tree.erase(rs.start);
+ }
+}
+
+bool Btree2Allocator::__try_insert_range(
+ const Btree2Allocator::range_seg_t& rs,
+ Btree2Allocator::range_tree_t::iterator* rt_p_insert)
+{
+ ceph_assert(rs.end > rs.start);
+  // Check whether the number of range_seg_t entries is above the threshold;
+  // the doubled range_tree size is used as the estimate since every segment
+  // is kept in both the range and the size trees.
+ bool spillover_input = range_count_cap && uint64_t(2 * range_tree.size()) >= range_count_cap;
+ range_size_tree_t* lowest_rs_tree = nullptr;
+ range_size_tree_t::iterator lowest_rs_p;
+ if (spillover_input) {
+ lowest_rs_tree = _get_lowest(&lowest_rs_p);
+ if (lowest_rs_tree) {
+ ceph_assert(lowest_rs_p != lowest_rs_tree->end());
+ spillover_input = rs.length() <= lowest_rs_p->length();
+ }
+ // make sure we never spill over chunks larger than weight_center
+ if (spillover_input) {
+ spillover_input = rs.length() <= weight_center;
+ lowest_rs_tree = nullptr;
+ } else if (lowest_rs_tree && lowest_rs_p->length () > weight_center) {
+ lowest_rs_tree = nullptr;
+ }
+ }
+ if (spillover_input) {
+ _spillover_range(rs.start, rs.end);
+ } else {
+    // NB: we should be careful about iterators (lowest_rs_p & rt_p_insert):
+    // updates to the relevant trees might invalidate either of them,
+    // hence the somewhat convoluted logic below.
+ range_seg_t lowest_rs;
+ range_tree_t new_rt_p;
+ if (lowest_rs_tree) {
+ lowest_rs = *lowest_rs_p;
+ _range_size_tree_rm(lowest_rs_tree, lowest_rs_p);
+ range_tree.erase(lowest_rs.start);
+ // refresh the iterator to be returned
+ // due to the above range_tree removal invalidated it.
+ if (rt_p_insert) {
+ *rt_p_insert = range_tree.lower_bound(rs.start);
+ }
+ _spillover_range(lowest_rs.start, lowest_rs.end);
+ }
+ if (rt_p_insert) {
+ *rt_p_insert = range_tree.emplace_hint(*rt_p_insert, rs.start, rs.end);
+ _range_size_tree_add(rs);
+ }
+ }
+ return !spillover_input;
+}
+
+void Btree2Allocator::_range_size_tree_add(const range_seg_t& rs) {
+ auto l = rs.length();
+ ceph_assert(rs.end > rs.start);
+ size_t bucket = MyTraits::_get_size_bucket(l);
+ range_size_set[bucket].insert(rs);
+ num_free += l;
+
+ if (l > weight_center) {
+ rsum += l;
+ } else {
+ lsum += l;
+ }
+}
+
+void Btree2Allocator::_range_size_tree_rm(const range_seg_t& rs)
+{
+ size_t bucket = MyTraits::_get_size_bucket(rs.length());
+ range_size_tree_t* rs_tree = &range_size_set[bucket];
+ ceph_assert(rs_tree->size() > 0);
+ auto rs_p = rs_tree->find(rs);
+ ceph_assert(rs_p != rs_tree->end());
+ _range_size_tree_rm(rs_tree, rs_p);
+}
+
+void Btree2Allocator::_range_size_tree_rm(
+ Btree2Allocator::range_size_tree_t* rs_tree,
+ Btree2Allocator::range_size_tree_t::iterator rs_p)
+{
+ auto l = rs_p->length();
+ ceph_assert(num_free >= l);
+ num_free -= l;
+ if (l > weight_center) {
+ ceph_assert(rsum >= l);
+ rsum -= l;
+ } else {
+ ceph_assert(lsum >= l);
+ lsum -= l;
+ }
+ rs_tree->erase(rs_p);
+}
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <mutex>
+
+#include "include/cpp-btree/btree_map.h"
+#include "include/cpp-btree/btree_set.h"
+
+#include "Allocator.h"
+
+#include "os/bluestore/bluestore_types.h"
+#include "include/mempool.h"
+#include "common/ceph_mutex.h"
+
+template <size_t BASE_BITS, size_t NUM_BUCKETS, size_t FACTOR = 1>
+struct SizeBucketCollectionTraits {
+ static const size_t base_bits = BASE_BITS;
+ static const size_t base = 1ull << base_bits;
+ static const size_t size_factor = FACTOR; // single bucket size range to be
+ // determined as [n, n * 2 * FACTOR)
+ static const size_t num_size_buckets = NUM_BUCKETS;
+ static const size_t bucket_max =
+ base << ((size_factor * (num_size_buckets - 2)));
+
+ static inline size_t _get_size_bucket(uint64_t len) {
+ size_t idx;
+ ceph_assert(size_factor);
+ if (len <= base) {
+ idx = 0;
+ } else if (len > bucket_max) {
+ idx = num_size_buckets - 1;
+ } else {
+ size_t most_bit = cbits(uint64_t(len - 1)) - 1;
+ idx = 1 + ((most_bit - base_bits) / size_factor);
+ }
+ ceph_assert(idx < num_size_buckets);
+ return idx;
+ }
+};
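
To make the bucket arithmetic above concrete, here is a minimal sketch (illustration only, not part of the patch) of how the SizeBucketCollectionTraits<12, 14> instantiation used by Btree2Allocator maps extent lengths to bucket indices; std::bit_width stands in for Ceph's cbits() and the helper name bucket_for is an assumption for the example.

// Illustration only: length-to-bucket mapping for base_bits = 12, 14 buckets.
#include <bit>
#include <cassert>
#include <cstdint>

static size_t bucket_for(uint64_t len) {
  constexpr size_t base_bits = 12;                   // 4 KiB granularity
  constexpr size_t num_size_buckets = 14;
  constexpr uint64_t base = 1ull << base_bits;       // 4 KiB
  constexpr uint64_t bucket_max = base << (num_size_buckets - 2); // 16 MiB
  if (len <= base) {
    return 0;
  }
  if (len > bucket_max) {
    return num_size_buckets - 1;
  }
  size_t most_bit = std::bit_width(len - 1) - 1;
  return 1 + (most_bit - base_bits);
}

int main() {
  assert(bucket_for(4096) == 0);          // <= 4 KiB -> bucket 0
  assert(bucket_for(8192) == 1);          // (4 KiB, 8 KiB]
  assert(bucket_for(64 * 1024) == 4);     // (32 KiB, 64 KiB]
  assert(bucket_for(16ull << 20) == 12);  // (8 MiB, 16 MiB]
  assert(bucket_for(1ull << 30) == 13);   // anything larger -> last bucket
  return 0;
}
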
+
+class ChunkCache {
+public:
+ class LocklessBuf {
+ std::atomic<size_t> ref = 0;
+ size_t count = 0;
+ std::vector<uint32_t> data;
+ public:
+ void init(size_t size) {
+ ceph_assert(data.size() == 0);
+ data.resize(size);
+ }
+ bool try_put(uint32_t& v) {
+ bool done = ++ref == 1 && count < data.size();
+ if (done) {
+ data[count++] = v;
+ }
+ --ref;
+ return done;
+ }
+ bool try_get(uint32_t& v) {
+ bool done = ++ref == 1 && count > 0;
+ if (done) {
+ v = data[--count];
+ }
+ --ref;
+ return done;
+ }
+ void foreach(std::function<void(uint32_t)> notify) {
+ for (size_t i = 0; i < count; i++) {
+ notify(data[i]);
+ }
+ }
+ };
+
+ static const size_t base_bits = 12; // 4K
+ static const size_t base = 1ull << base_bits;
+ static const size_t num_buckets = 16; // [4K, 8K, 12K, ... 64K] - 16 buckets total
+ static const size_t num_chunks = 16;
+ ChunkCache() {
+ for (size_t i = 0; i < buckets.size(); i++) {
+ buckets[i].init(num_chunks);
+ }
+ }
+ bool try_put(uint64_t offset, uint64_t len) {
+ if (!lock.try_lock_shared()) {
+ return false;
+ }
+ bool ret = false;
+ ceph_assert(p2aligned(offset, base));
+ ceph_assert(p2aligned(len, base));
+    // permit caching only chunks whose offsets, with the low base_bits
+    // stripped, fit into a 32-bit value
+ offset = offset >> base_bits;
+ size_t idx = len >> base_bits;
+ if (offset <= std::numeric_limits<uint32_t>::max() &&
+ idx < buckets.size()) {
+ uint32_t o = offset;
+ ret = buckets[idx].try_put(o);
+ }
+ lock.unlock_shared();
+ return ret;
+ }
+ bool try_get(uint64_t* offset, uint64_t len) {
+ if (!lock.try_lock_shared()) {
+ return false;
+ }
+ bool ret = false;
+ ceph_assert(offset);
+ ceph_assert(p2aligned(len, base));
+ size_t idx = len >> base_bits;
+ if (idx < buckets.size()) {
+ uint32_t o = 0;
+ ret = buckets[idx].try_get(o);
+ if (ret) {
+ *offset = uint64_t(o) << base_bits;
+ ++hits;
+ }
+ }
+ lock.unlock_shared();
+ return ret;
+ }
+ size_t get_hit_count() const {
+ return hits.load();
+ }
+ void foreach(std::function<void(uint64_t offset, uint64_t length)> notify) {
+ std::unique_lock _lock(lock);
+ for (uint64_t i = 0; i < buckets.size(); i++) {
+ auto cb = [&](uint32_t o) {
+ notify(uint64_t(o) << base_bits, i << base_bits);
+ };
+ buckets[i].foreach(cb);
+ }
+ }
+private:
+ std::array<LocklessBuf, num_buckets> buckets;
+ std::atomic<size_t> hits = 0;
+ ceph::shared_mutex lock{
+ ceph::make_shared_mutex(std::string(), false, false, false)
+ };
+};
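
As a quick illustration (not part of the patch) of the encoding the cache relies on: offsets and lengths are multiples of 4 KiB, an offset is stored as a 32-bit value with the low base_bits stripped (so it round-trips for devices below 16 TiB), and the length in 4 KiB units selects the bucket directly. The variable names below are assumptions for the example.

// Illustration only: ChunkCache's offset/length encoding.
#include <cassert>
#include <cstdint>

int main() {
  constexpr uint64_t base_bits = 12;                 // 4 KiB granularity
  uint64_t offset = 0x1234000, len = 0x8000;         // both 4 KiB aligned
  uint32_t stored = uint32_t(offset >> base_bits);   // fits while offset < 16 TiB
  uint64_t bucket = len >> base_bits;                // 0x8 -> the 32 KiB bucket
  assert((uint64_t(stored) << base_bits) == offset); // lossless round-trip
  assert(bucket == 8);
  return 0;
}
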
+
+/*
+ * class Btree2Allocator
+ *
+ *
+ */
+class Btree2Allocator : public Allocator {
+ typedef SizeBucketCollectionTraits<12, 14> MyTraits;
+public:
+ // Making public to share with mempools
+ struct range_seg_t {
+ MEMPOOL_CLASS_HELPERS(); ///< memory monitoring
+ uint64_t start; ///< starting offset of this segment
+ uint64_t end; ///< ending offset (non-inclusive)
+
+ // Tree is sorted by offset, greater offsets at the end of the tree.
+ struct before_t {
+ template<typename KeyLeft, typename KeyRight>
+ bool operator()(const KeyLeft& lhs, const KeyRight& rhs) const {
+ return lhs.end <= rhs.start;
+ }
+ };
+
+ // Tree is sorted by size, larger chunks are at the end of the tree.
+ struct shorter_t {
+ template<typename KeyType1, typename KeyType2>
+ int compare(const KeyType1& lhs, const KeyType2& rhs) const {
+ int64_t delta =
+ int64_t(lhs.end) - int64_t(lhs.start) - int64_t(rhs.end) + int64_t(rhs.start);
+ if (delta == 0) {
+ delta = (int64_t)lhs.start - (int64_t)rhs.start;
+ }
+ return delta ? delta / std::abs(delta) : 0;
+ }
+ template<typename KeyType>
+ int operator()(const range_seg_t& lhs, const KeyType& rhs) const {
+ return compare(lhs, rhs);
+ }
+ };
+
+ range_seg_t(uint64_t start = 0, uint64_t end = 0)
+ : start{ start }, end{ end }
+ {}
+ inline uint64_t length() const {
+ return end - start;
+ }
+ };
+
+public:
+ Btree2Allocator(CephContext* cct, int64_t device_size, int64_t block_size,
+ uint64_t max_mem,
+ double _rweight_factor,
+ bool with_cache,
+ std::string_view name);
+
+  //
+  // ctor intended for use by descendant (aka hybrid) class(es)
+  // which provide handling for spilled-over entries
+  //
+ Btree2Allocator(CephContext* cct, int64_t device_size, int64_t block_size,
+ uint64_t max_mem,
+ std::string_view name) :
+ Btree2Allocator(cct, device_size, block_size, max_mem, 1.0, true, name) {
+ }
+
+ ~Btree2Allocator() override {
+ shutdown();
+ }
+
+ const char* get_type() const override
+ {
+ return "btree_v2";
+ }
+ void init_add_free(uint64_t offset, uint64_t length) override;
+ void init_rm_free(uint64_t offset, uint64_t length) override;
+
+ int64_t allocate(
+ uint64_t want,
+ uint64_t unit,
+ uint64_t max_alloc_size,
+ int64_t hint,
+ PExtentVector* extents) override;
+
+ void release(const release_set_t& release_set) override;
+
+ uint64_t get_free() override {
+ return num_free;
+ }
+ double get_fragmentation() override {
+ std::lock_guard l(lock);
+ return _get_fragmentation();
+ }
+ size_t get_cache_hit_count() const {
+ return cache ? cache->get_hit_count() : 0;
+ }
+
+ void dump() override {
+ std::lock_guard l(lock);
+ _dump();
+ }
+ void foreach(
+ std::function<void(uint64_t offset, uint64_t length)> notify) override {
+ std::lock_guard l(lock);
+ _foreach(notify);
+ }
+ void shutdown() override {
+ std::lock_guard l(lock);
+ _shutdown();
+ }
+
+private:
+ CephContext* cct = nullptr;
+ ChunkCache* cache = nullptr;
+ std::mutex lock;
+
+ template<class T>
+ using pool_allocator = mempool::bluestore_alloc::pool_allocator<T>;
+ using range_tree_t =
+ btree::btree_map<
+ uint64_t, // start
+ uint64_t, // end
+ std::less<uint64_t>,
+ pool_allocator<std::pair<uint64_t, uint64_t>>>;
+ range_tree_t range_tree; ///< main range tree
+
+ //
+ // The range_size_tree should always contain the
+ // same number of segments as the range_tree.
+ // The only difference is that the range_size_tree
+ // is ordered by segment sizes.
+ //
+ using range_size_tree_t =
+ btree::btree_set<
+ range_seg_t,
+ range_seg_t::shorter_t,
+ pool_allocator<range_seg_t>>;
+ std::array<
+ range_size_tree_t, MyTraits::num_size_buckets> range_size_set;
+
+ std::atomic<uint64_t> num_free = 0; ///< total bytes in freelist
+
+  //
+  // Max number of range entries allowed; 0 means unlimited.
+  //
+ uint64_t range_count_cap = 0;
+
+ const uint64_t weight_center = 1ull << 20; // 1M
+ uint64_t lsum = 0;
+ uint64_t rsum = 0;
+ double rweight_factor = 0;
+ uint64_t left_weight() const {
+ return lsum + _get_spilled_over();
+ }
+ uint64_t right_weight() const {
+ return rsum * rweight_factor;
+ }
+protected:
+ static const uint64_t pextent_array_size = 64;
+ typedef std::array <const release_set_t::value_type*, pextent_array_size> PExtentArray;
+
+ void set_weight_factor(double _rweight_factor) {
+ rweight_factor = _rweight_factor;
+ }
+
+ CephContext* get_context() {
+ return cct;
+ }
+ std::mutex& get_lock() {
+ return lock;
+ }
+ range_size_tree_t* _get_lowest(range_size_tree_t::iterator* rs_p) {
+ for (auto& t : range_size_set) {
+ if (t.begin() != t.end()) {
+ *rs_p = t.begin();
+ return &t;
+ }
+ }
+ return nullptr;
+ }
+ uint64_t _lowest_size_available() {
+ range_size_tree_t::iterator rs_p;
+ if (_get_lowest(&rs_p) != nullptr) {
+ return rs_p->length();
+ }
+ return std::numeric_limits<uint64_t>::max();
+ }
+ uint64_t _get_free() const {
+ return num_free;
+ }
+ double _get_fragmentation() const {
+ auto free_blocks = p2align(num_free.load(), (uint64_t)block_size) / block_size;
+ if (free_blocks <= 1) {
+ return .0;
+ }
+ return (static_cast<double>(range_tree.size() - 1) / (free_blocks - 1));
+ }
+
+ void _shutdown();
+
+ void _dump(bool full = true) const;
+ void _foreach(std::function<void(uint64_t offset, uint64_t length)>);
+
+ int64_t _allocate(
+ uint64_t want,
+ uint64_t unit,
+ uint64_t max_alloc_size,
+ int64_t hint,
+ PExtentVector* extents);
+
+ void _release(const release_set_t& release_set);
+ void _release(const PExtentVector& release_set);
+ void _release(size_t count, const release_set_entry_t** to_release);
+
+ /*
+ * overridables for HybridAllocator
+ */
+ // called when extent to be released/marked free
+ virtual void _add_to_tree(uint64_t start, uint64_t size);
+ virtual void _spillover_range(uint64_t start, uint64_t end) {
+    // this should be overridden when a range count cap is present,
+ // i.e. (range_count_cap > 0)
+ ceph_assert(false);
+ }
+  // to be overridden by the Hybrid wrapper
+ virtual uint64_t _get_spilled_over() const {
+ return 0;
+ }
+ virtual uint64_t _spillover_allocate(uint64_t want,
+ uint64_t unit,
+ uint64_t max_alloc_size,
+ int64_t hint,
+ PExtentVector* extents) {
+    // this should be overridden when a range count cap is present,
+ // i.e. (range_count_cap > 0)
+ ceph_assert(false);
+ return 0;
+ }
+
+ /*
+ * Used exclusively from HybridAllocator
+ */
+ void _try_remove_from_tree(uint64_t start, uint64_t size,
+ std::function<void(uint64_t offset, uint64_t length, bool found)> cb);
+ bool has_cache() const {
+ return !!cache;
+ }
+ bool try_put_cache(uint64_t start, uint64_t len) {
+ bool ret = cache && cache->try_put(start, len);
+ if (ret) {
+ num_free += len;
+ }
+ return ret;
+ }
+ bool try_get_from_cache(uint64_t* res_offset, uint64_t want) {
+ bool ret = cache && cache->try_get(res_offset, want);
+ if (ret) {
+ num_free -= want;
+ }
+ return ret;
+ }
+
+private:
+ int64_t __allocate(size_t bucket0,
+ uint64_t size,
+ uint64_t unit,
+ PExtentVector* extents);
+
+ inline range_size_tree_t::iterator _pick_block(int distance,
+ range_size_tree_t* tree, uint64_t size);
+
+ inline void _remove_from_tree(uint64_t start, uint64_t size);
+ inline void _remove_from_tree(range_tree_t::iterator rt_p,
+ uint64_t start, uint64_t end);
+ inline void _remove_from_tree(range_size_tree_t* rs_tree,
+ range_size_tree_t::iterator rs,
+ range_tree_t::iterator rt_p,
+ uint64_t start, uint64_t end);
+
+ inline void _try_insert_range(const range_seg_t& rs);
+ inline bool __try_insert_range(const range_seg_t& rs,
+ range_tree_t::iterator* insert_pos);
+
+ inline void _range_size_tree_add(const range_seg_t& r);
+ inline void _range_size_tree_rm(const range_seg_t& rs);
+ inline void _range_size_tree_rm(range_size_tree_t* rs_tree,
+ range_size_tree_t::iterator rs);
+};
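
For reference, a small worked instance (illustration only, not part of the patch) of the fragmentation estimate computed by _get_fragmentation() above: with F free blocks spread over S contiguous extents the score is (S - 1) / (F - 1), i.e. 0 for a single extent and 1 when every free block stands alone.

// Illustration only: the fragmentation metric used by _get_fragmentation().
#include <cassert>
#include <cstdint>

static double fragmentation(uint64_t extents, uint64_t free_blocks) {
  if (free_blocks <= 1) {
    return 0.0;
  }
  return double(extents - 1) / double(free_blocks - 1);
}

int main() {
  // 100 free 4 KiB blocks in one contiguous extent: no fragmentation
  assert(fragmentation(1, 100) == 0.0);
  // the same 100 blocks chopped into 10 extents: ~0.09
  assert(fragmentation(10, 100) > 0.09 && fragmentation(10, 100) < 0.1);
  // worst case: every free block is an extent of its own
  assert(fragmentation(100, 100) == 1.0);
  return 0;
}
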
uint64_t _lowest_size_available() const {
auto rs = range_size_tree.begin();
- return rs != range_size_tree.end() ? rs->size : 0;
+ return rs != range_size_tree.end() ?
+ rs->size :
+ std::numeric_limits<uint64_t>::max();
}
int64_t _allocate(
return "hybrid";
}
+/*
+ * class HybridBtree2Allocator
+ *
+ *
+ */
+const char* HybridBtree2Allocator::get_type() const
+{
+ return "hybrid_btree2";
+}
+
+int64_t HybridBtree2Allocator::allocate(
+ uint64_t want,
+ uint64_t unit,
+ uint64_t max_alloc_size,
+ int64_t hint,
+ PExtentVector* extents)
+{
+ ldout(get_context(), 10) << __func__ << std::hex
+ << " want 0x" << want
+ << " unit 0x" << unit
+ << " max_alloc_size 0x" << max_alloc_size
+ << " hint 0x" << hint
+ << std::dec << dendl;
+ ceph_assert(std::has_single_bit(unit));
+ ceph_assert(want % unit == 0);
+
+ uint64_t cached_chunk_offs = 0;
+ if (try_get_from_cache(&cached_chunk_offs, want)) {
+ extents->emplace_back(cached_chunk_offs, want);
+ return want;
+ }
+ return HybridAllocatorBase<Btree2Allocator>::allocate(want,
+ unit, max_alloc_size, hint, extents);
+}
+void HybridBtree2Allocator::release(const release_set_t& release_set)
+{
+ if (!has_cache() || release_set.num_intervals() >= pextent_array_size) {
+ HybridAllocatorBase<Btree2Allocator>::release(release_set);
+ return;
+ }
+ PExtentArray to_release;
+ size_t count = 0;
+ auto p = release_set.begin();
+ while (p != release_set.end()) {
+ if (!try_put_cache(p.get_start(), p.get_len())) {
+ to_release[count++] = &*p;// bluestore_pextent_t(p.get_start(), p.get_len());
+ }
+ ++p;
+ }
+ if (count > 0) {
+ std::lock_guard l(get_lock());
+ _release(count, &to_release.at(0));
+ }
+}
+
#include "HybridAllocator_impl.h"
#include <mutex>
#include "AvlAllocator.h"
+#include "Btree2Allocator.h"
#include "BitmapAllocator.h"
template <typename PrimaryAllocator>
return bmap_alloc;
}
private:
-
void _spillover_range(uint64_t start, uint64_t end) override;
uint64_t _spillover_allocate(uint64_t want,
uint64_t unit,
const char* get_type() const override;
};
+
+class HybridBtree2Allocator : public HybridAllocatorBase<Btree2Allocator>
+{
+public:
+ HybridBtree2Allocator(CephContext* cct,
+ int64_t device_size,
+ int64_t _block_size,
+ uint64_t max_mem,
+ double weight_factor,
+ std::string_view name) :
+ HybridAllocatorBase<Btree2Allocator>(cct,
+ device_size,
+ _block_size,
+ max_mem,
+ name) {
+ set_weight_factor(weight_factor);
+ }
+ const char* get_type() const override;
+
+ int64_t allocate(
+ uint64_t want,
+ uint64_t unit,
+ uint64_t max_alloc_size,
+ int64_t hint,
+ PExtentVector* extents) override;
+ void release(const release_set_t& release_set) override;
+};
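
For orientation, a hedged usage sketch (illustration only, not part of the patch) that constructs the new allocator directly and exercises it through the base interface, much like the unit tests below do; the include path, the CephContext* argument and the function name smoke_test are assumptions, and the mem cap (64 MiB) and weight factor (2) mirror the new config option defaults.

// Illustration only: direct use of HybridBtree2Allocator, mirroring the tests.
#include "os/bluestore/HybridAllocator.h"

static void smoke_test(CephContext* cct) {
  uint64_t capacity = 1ull << 30;    // 1 GiB
  uint64_t block_size = 4096;
  HybridBtree2Allocator hb2(cct, capacity, block_size,
                            64 << 20 /* mem cap */,
                            2.0 /* weight factor */,
                            "smoke");
  Allocator& alloc = hb2;            // use the base interface,
                                     // as the unit tests below do
  alloc.init_add_free(0, capacity);  // the whole device is free
  PExtentVector extents;
  int64_t got = alloc.allocate(64 * 1024, block_size, 0, 0, &extents);
  ceph_assert(got == 64 * 1024);
  interval_set<uint64_t> to_release;
  for (auto& e : extents) {
    to_release.insert(e.offset, e.length);
  }
  alloc.release(to_release);         // short chunks may land in the chunk cache
}
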
// try bitmap first to avoid unneeded contiguous extents split if
// desired amount is less than shortes range in AVL
- bool primary_first = !(bmap_alloc && bmap_alloc->get_free() &&
- want < T::_lowest_size_available());
+ bool primary_first = !(bmap_alloc &&
+ bmap_alloc->get_free() &&
+ want < T::_lowest_size_available());
int64_t res = _allocate_or_rollback(primary_first,
want, unit, max_alloc_size, hint, extents);
}
void doOverwriteTest(uint64_t capacity, uint64_t prefill,
uint64_t overwrite);
+ void doOverwriteMPCTest(size_t thread_count,
+ uint64_t capacity, uint64_t prefill,
+ uint64_t overwrite);
+ void doOverwriteMPC2Test(size_t thread_count,
+ uint64_t capacity, uint64_t prefill,
+ uint64_t overwrite,
+ float extra = 0.05);
};
const uint64_t _1m = 1024 * 1024;
uint64_t head = 0;
uint64_t tail = 0;
uint64_t size = 0;
- boost::uniform_int<> u1;
+ boost::uniform_int<uint64_t> u1;
public:
AllocTracker(uint64_t capacity, uint64_t alloc_unit)
if (size == 0)
return false;
- uint64_t pos = (u1(rng) % size) + tail;
+ size_t pos = (u1(rng) % size) + tail;
pos %= allocations.size();
uint64_t val = allocations[pos];
*len = uint64_t((val & 0xffffff) << 8);
<< capacity / 1024 / 1024 << std::endl;
}
}
+ std::cout << "Executed in " << ceph_clock_now() - start << std::endl;
std::cout << "releasing..." << std::endl;
for (size_t i = 0; i < capacity; i += want_size)
dump_mempools();
}
+TEST_P(AllocTest, test_alloc_bench_seq_interleaving)
+{
+  // By adjusting the capacity and analyzing the mempool dump output one can
+  // estimate the maximum RAM usage of a specific allocator implementation
+  // with a real-life disk size, e.g. 8 TB or 16 TB.
+  // The current capacity is kept pretty small to avoid huge RAM utilization
+  // with inefficient allocators (e.g. AVL) and hence lets the test case
+  // run fine on weak H/W.
+ uint64_t capacity = uint64_t(1024) * 1024 * 1024 * 128; //128GB
+ uint64_t alloc_unit = 4096;
+ uint64_t want_size = alloc_unit;
+ PExtentVector allocated, tmp;
+
+ init_alloc(capacity, alloc_unit);
+ alloc->init_add_free(0, capacity);
+
+ utime_t start = ceph_clock_now();
+ alloc->init_rm_free(0, capacity);
+ std::cout << "Executed in " << ceph_clock_now() - start << std::endl;
+
+ std::cout << "releasing..." << std::endl;
+ for (size_t i = 0; i < capacity; i += want_size * 2)
+ {
+ interval_set<uint64_t> release_set;
+ release_set.insert(i, want_size);
+ alloc->release(release_set);
+ if (0 == (i % (1 * 1024 * _1m))) {
+ std::cout << "release " << i / 1024 / 1024 << " mb of "
+ << capacity / 1024 / 1024 << std::endl;
+ }
+ }
+ std::cout << "Executed in " << ceph_clock_now() - start << std::endl;
+ dump_mempools();
+}
+
TEST_P(AllocTest, test_alloc_bench)
{
uint64_t capacity = uint64_t(1024) * 1024 * 1024 * 1024;
dump_mempools();
}
+struct OverwriteTextContext : public Thread {
+ size_t idx = 0;
+ AllocTracker* tracker;
+ Allocator* alloc = nullptr;
+ size_t r_count = 0;
+ size_t a_count = 0;
+ uint64_t ae_count = 0;
+ uint64_t re_count = 0;
+
+ uint64_t how_many = 0;
+ uint64_t alloc_unit = 0;
+ timespan r_time;
+ timespan a_time;
+
+ OverwriteTextContext(size_t _idx,
+ AllocTracker* at,
+ Allocator* a,
+ uint64_t want,
+ uint64_t unit) :
+ idx(_idx), tracker(at), alloc(a), how_many(want), alloc_unit(unit)
+ {
+ }
+
+ void build_histogram() {
+ size_t num_buckets = 8;
+ Allocator::FreeStateHistogram hist;
+ hist.resize(num_buckets);
+ alloc->build_free_state_histogram(alloc_unit, hist);
+ for (size_t i = 0; i < num_buckets; i++) {
+ uint64_t a_bytes = (hist[i].alloc_units * alloc_unit);
+ std::cout << "<=" << hist[i].get_max(i, num_buckets)
+ << " -> " << hist[i].total << "/" << hist[i].aligned
+ << " a_bytes " << a_bytes
+ << " " << ((float)a_bytes / alloc->get_capacity() * 100) << "%"
+ << std::endl;
+ }
+ }
+
+ void* entry() override {
+ PExtentVector allocated, tmp;
+ gen_type rng(time(NULL));
+ boost::uniform_int<> u1(0, 9); // 4K-2M
+ boost::uniform_int<> u2(0, 9); // 4K-2M
+
+ r_time = ceph::make_timespan(0);
+ a_time = ceph::make_timespan(0);
+
+ for (uint64_t i = 0; i < how_many; )
+ {
+ uint64_t want_release = alloc_unit << u2(rng);
+ uint64_t released = 0;
+ interval_set<uint64_t> release_set;
+ do {
+ uint64_t o = 0;
+ uint32_t l = 0;
+ if (!tracker->pop_random(rng, &o, &l, want_release - released)) {
+ break;
+ }
+ release_set.insert(o, l);
+ released += l;
+ } while (released < want_release);
+
+ uint32_t want = alloc_unit << u1(rng);
+ tmp.clear();
+ auto t0 = mono_clock::now();
+ auto r = alloc->allocate(want, alloc_unit, 0, 0, &tmp);
+ a_count++;
+ ae_count += tmp.size();
+ a_time += mono_clock::now() - t0;
+ if (r != want) {
+ std::cout << "Can't allocate more space, stopping." << std::endl;
+ break;
+ }
+ i += r;
+
+ for (auto a : tmp) {
+ bool full = !tracker->push(a.offset, a.length);
+ EXPECT_EQ(full, false);
+ }
+ {
+ auto t0 = mono_clock::now();
+ alloc->release(release_set);
+ r_count++;
+ r_time += mono_clock::now() - t0;
+ re_count += release_set.num_intervals();
+ }
+ if (0 == (i % (1 * 1024 * _1m))) {
+ std::cout << idx << ">> reuse " << i / 1024 / 1024 << " mb of "
+ << how_many / 1024 / 1024 << std::endl;
+ }
+ }
+ return nullptr;
+ }
+};
+
void AllocTest::doOverwriteTest(uint64_t capacity, uint64_t prefill,
uint64_t overwrite)
{
i += r;
for(auto a : tmp) {
+
bool full = !at.push(a.offset, a.length);
EXPECT_EQ(full, false);
}
<< cap / 1024 / 1024 << std::endl;
}
}
-
+ std::cout << "Executed prefill in " << ceph_clock_now() - start << std::endl;
cap = overwrite;
+ OverwriteTextContext ctx(0, &at, alloc.get(), cap, alloc_unit);
+
+ start = ceph_clock_now();
+ ctx.entry();
+
+ std::cout << "Executed in " << ceph_clock_now() - start
+ << " alloc:" << ctx.a_count << "/" << ctx.ae_count << " in " << ctx.a_time
+ << " release:" << ctx.r_count << "/" << ctx.re_count << " in " << ctx.r_time
+ << std::endl;
+ std::cout<<"Avail "<< alloc->get_free() / _1m << " MB" << std::endl;
+
+ dump_mempools();
+}
+
+void AllocTest::doOverwriteMPCTest(size_t thread_count,
+ uint64_t capacity, uint64_t prefill,
+ uint64_t overwrite)
+{
+ uint64_t alloc_unit = 4096;
+ PExtentVector tmp;
+ std::vector<AllocTracker*> at;
+ std::vector<OverwriteTextContext*> ctx;
+
+ init_alloc(capacity, alloc_unit);
+ alloc->init_add_free(0, capacity);
+
+ at.resize(thread_count);
+ ctx.resize(thread_count);
+ for (size_t i = 0; i < thread_count; i++) {
+ at[i] = new AllocTracker(capacity, alloc_unit);
+ ctx[i] = new OverwriteTextContext(i, at[i], alloc.get(), overwrite, alloc_unit);
+ }
+
+ gen_type rng(time(NULL));
+ boost::uniform_int<> u1(0, 9); // 4K-2M
+
+ utime_t start = ceph_clock_now();
+  // allocate the requested prefill amount plus an extra 10% of it
+ float extra = 0.1;
+ auto cap = prefill * (1 + extra);
+
+ uint64_t idx = 0;
for (uint64_t i = 0; i < cap; )
{
- uint64_t want_release = alloc_unit << u2(rng);
+ uint32_t want = alloc_unit << u1(rng);
+ tmp.clear();
+ auto r = alloc->allocate(want, alloc_unit, 0, 0, &tmp);
+ if (r < want) {
+ break;
+ }
+ i += r;
+
+ for (auto a : tmp) {
+ bool full = !at[idx]->push(a.offset, a.length);
+ EXPECT_EQ(full, false);
+ }
+ if (0 == (i % (1 * 1024 * _1m))) {
+ std::cout << "alloc " << i / 1024 / 1024 << " mb of "
+ << cap / 1024 / 1024 << std::endl;
+ }
+ idx = (idx + 1) % thread_count;
+ }
+
+  // now release the extra space to introduce some fragmentation
+ cap = prefill * extra;
+ idx = 0;
+ for (uint64_t i = 0; i < cap; )
+ {
+ uint64_t want_release = alloc_unit << u1(rng);
uint64_t released = 0;
+ interval_set<uint64_t> release_set;
do {
uint64_t o = 0;
uint32_t l = 0;
- interval_set<uint64_t> release_set;
- if (!at.pop_random(rng, &o, &l, want_release - released)) {
+ if (!at[idx]->pop_random(rng, &o, &l, want_release - released)) {
break;
}
release_set.insert(o, l);
- alloc->release(release_set);
released += l;
} while (released < want_release);
+ alloc->release(release_set);
+ i += released;
+ if (0 == (i % (1 * 1024 * _1m))) {
+ std::cout << "release " << i / 1024 / 1024 << " mb of "
+ << cap / 1024 / 1024 << std::endl;
+ }
+ idx = (idx + 1) % thread_count;
+ }
+ std::cout << "Executed prefill in " << ceph_clock_now() - start
+ << " Fragmentation:" << alloc->get_fragmentation_score()
+ << std::endl;
+ ctx[0]->build_histogram();
+
+ start = ceph_clock_now();
+ for (size_t i = 0; i < thread_count; i++) {
+ ctx.at(i)->create(stringify(i).c_str());
+ }
+
+ for (size_t i = 0; i < thread_count; i++) {
+ ctx.at(i)->join();
+ }
+ std::cout << "Executed in " << ceph_clock_now() - start
+ << std::endl;
+ std::cout << "Avail " << alloc->get_free() / _1m << " MB"
+ << " Fragmentation:" << alloc->get_fragmentation_score()
+ << std::endl;
+ for (size_t i = 0; i < thread_count; i++) {
+ std::cout << "alloc/release stats for " << i
+ << " alloc:" << ctx.at(i)->a_count << "/" << ctx.at(i)->ae_count << " in " << ctx.at(i)->a_time
+ << " release:" << ctx.at(i)->r_count << "/" << ctx.at(i)->re_count << " in " << ctx.at(i)->r_time
+ << std::endl;
+ }
+ ctx[0]->build_histogram();
+ dump_mempools();
+ for (size_t i = 0; i < thread_count; i++) {
+ delete at[i];
+ delete ctx[i];
+ }
+}
+struct OverwriteTextContext2 : public OverwriteTextContext {
+
+ using OverwriteTextContext::OverwriteTextContext;
+ void* entry() override {
+ PExtentVector allocated, tmp;
+ gen_type rng(time(NULL));
+ boost::uniform_int<> u1(1, 16); // alloc_unit * u1 => 4K-64K
+
+ r_time = ceph::make_timespan(0);
+ a_time = ceph::make_timespan(0);
+ uint64_t processed = 0;
+ auto t00 = ceph_clock_now();
+ for (uint64_t i = 0; i < how_many; )
+ {
+ int64_t want = alloc_unit * u1(rng);
+ int64_t released = 0;
+ interval_set<uint64_t> release_set;
+ do {
+ uint64_t o = 0;
+ uint32_t l = 0;
+ if (!tracker->pop_random(rng, &o, &l, want - released)) {
+ break;
+ }
+ release_set.insert(o, l);
+ released += l;
+ } while (released < want);
+ tmp.clear();
+ auto t0 = mono_clock::now();
+ auto r = alloc->allocate(want, alloc_unit, 0, 0, &tmp);
+ a_count++;
+ ae_count += tmp.size();
+ a_time += mono_clock::now() - t0;
+ if (r != want) {
+ std::cout << "Can't allocate more space, stopping." << std::endl;
+ break;
+ }
+ i += r;
+
+ for (auto a : tmp) {
+ bool full = !tracker->push(a.offset, a.length);
+ EXPECT_EQ(full, false);
+ }
+ {
+ auto t0 = mono_clock::now();
+ alloc->release(release_set);
+ r_count++;
+ r_time += mono_clock::now() - t0;
+ re_count += release_set.num_intervals();
+ }
+ auto processed0 = processed;
+ processed += want;
+ auto _1g = 1024 * _1m;
+ if (processed / _1g != processed0 / _1g) {
+ std::cout << idx << ">> reuse " << i / 1024 / 1024 << " mb of "
+ << how_many / 1024 / 1024 << std::endl;
+
+ }
+ auto c = alloc->get_capacity();
+ bool capacity_written = (processed / c) != (processed0 / c);
+ if (capacity_written) {
+ std::cout << "> Single iteration writing completed in " << (ceph_clock_now() - t00)
+ << " alloc/release stats for " << idx
+ << " alloc:" << a_count << "/" << ae_count << " in " << a_time
+ << " release:" << r_count << "/" << re_count << " in " << r_time
+ << std::endl;
+ a_count = 0;
+ ae_count = 0;
+ r_count = 0;
+ re_count = 0;
+ r_time = ceph::make_timespan(0);
+ a_time = ceph::make_timespan(0);
+ if (idx == 0) {
+ std::cout << " Fragmentation: " << alloc->get_fragmentation_score()
+ << std::endl;
+ build_histogram();
+ }
+ t00 = ceph_clock_now();
+ }
+ }
+ return nullptr;
+ }
+};
+
+void AllocTest::doOverwriteMPC2Test(size_t thread_count,
+ uint64_t capacity, uint64_t prefill,
+ uint64_t overwrite,
+ float extra)
+{
+ uint64_t alloc_unit = 4096;
+ PExtentVector tmp;
+ std::vector<AllocTracker*> at;
+ std::vector<OverwriteTextContext2*> ctx;
+
+ init_alloc(capacity, alloc_unit);
+ alloc->init_add_free(0, capacity);
+
+ at.resize(thread_count);
+ ctx.resize(thread_count);
+ for (size_t i = 0; i < thread_count; i++) {
+ at[i] = new AllocTracker(capacity, alloc_unit);
+ ctx[i] = new OverwriteTextContext2(i, at[i], alloc.get(), overwrite, alloc_unit);
+ }
+
+ gen_type rng(time(NULL));
+ boost::uniform_int<> u1(8, 10); // 4096 << u1 => 1-4M chunks used for prefill
+ boost::uniform_int<> u2(1, 512); // 4096 * u2 => 4K-2M chunks used for overwrite
+
+ utime_t start = ceph_clock_now();
+  // allocate the requested prefill amount plus an additional 'extra' fraction of the capacity
+ float cap = prefill + capacity * extra;
+
+ uint64_t idx = 0;
+ for (uint64_t i = 0; i < cap; )
+ {
uint32_t want = alloc_unit << u1(rng);
tmp.clear();
auto r = alloc->allocate(want, alloc_unit, 0, 0, &tmp);
- if (r != want) {
- std::cout<<"Can't allocate more space, stopping."<< std::endl;
+ if (r < want) {
break;
}
i += r;
- for(auto a : tmp) {
- bool full = !at.push(a.offset, a.length);
+ for (auto a : tmp) {
+ bool full = !at[idx]->push(a.offset, a.length);
EXPECT_EQ(full, false);
}
-
if (0 == (i % (1 * 1024 * _1m))) {
- std::cout << "reuse " << i / 1024 / 1024 << " mb of "
- << cap / 1024 / 1024 << std::endl;
+ std::cout << "alloc " << i / 1024 / 1024 << " mb of "
+ << cap / 1024 / 1024 << std::endl;
}
+ idx = (idx + 1) % thread_count;
}
- std::cout<<"Executed in "<< ceph_clock_now() - start << std::endl;
- std::cout<<"Avail "<< alloc->get_free() / _1m << " MB" << std::endl;
+  // now release the extra space to introduce some fragmentation
+ cap = capacity * extra;
+ idx = 0;
+ for (uint64_t i = 0; i < (uint64_t)cap; )
+ {
+ uint64_t want_release = alloc_unit * u2(rng);
+ uint64_t released = 0;
+ interval_set<uint64_t> release_set;
+ do {
+ uint64_t o = 0;
+ uint32_t l = 0;
+ if (!at[idx]->pop_random(rng, &o, &l, want_release - released)) {
+ break;
+ }
+ release_set.insert(o, l);
+ released += l;
+ } while (released < want_release);
+ alloc->release(release_set);
+ i += released;
+ if (0 == (i % (1 * 1024 * _1m))) {
+ std::cout << "release " << i / 1024 / 1024 << " mb of "
+ << cap / 1024 / 1024 << std::endl;
+ }
+ idx = (idx + 1) % thread_count;
+ }
+
+ std::cout << "Executed prefill in " << ceph_clock_now() - start
+ << " Fragmentation:" << alloc->get_fragmentation_score()
+ << std::endl;
+ ctx[0]->build_histogram();
+
+ start = ceph_clock_now();
+ for (size_t i = 0; i < thread_count; i++) {
+ ctx[i]->create(stringify(i).c_str());
+ }
+
+ for (size_t i = 0; i < thread_count; i++) {
+ ctx.at(i)->join();
+ }
+ std::cout << "Executed in " << ceph_clock_now() - start
+ << std::endl;
+ std::cout << "Avail " << alloc->get_free() / _1m << " MB"
+ << " Fragmentation:" << alloc->get_fragmentation_score()
+ << std::endl;
+ ctx[0]->build_histogram();
dump_mempools();
+ for (size_t i = 0; i < thread_count; i++) {
+ delete at[i];
+ delete ctx[i];
+ }
}
TEST_P(AllocTest, test_alloc_bench_90_300)
doOverwriteTest(capacity, prefill, overwrite);
}
+TEST_P(AllocTest, test_alloc_bench_50_300_x2)
+{
+ // skipping for legacy and slow code
+ if ((GetParam() == string("stupid"))) {
+ GTEST_SKIP() << "skipping for specific allocators";
+ }
+ uint64_t capacity = uint64_t(1024) * 1024 * 1024 * 128;
+ auto prefill = capacity / 2;
+ auto overwrite = capacity * 3;
+ doOverwriteMPCTest(2, capacity, prefill, overwrite);
+}
+
+/*
+* The following benchmark test simulates small-block overwrites over highly
+* utilized disk space prefilled with large extents. Overwrites are performed
+* from two concurrent threads accessing the same allocator.
+* Detailed scenario:
+* 1. Prefill (single threaded):
+* 1.1. Fill 95% of the space with 1M-4M chunk allocations
+* 1.2. Release 5% of the allocated space using random extents of 4K-2M bytes
+* 2. Random overwrite using 2 threads
+* 2.1. Deallocate a random sub-extent of a size randomly selected within the [4K-64K] range
+* 2.2. Allocate a random extent of a size within the [4K-64K] range.
+* 2.3. Repeat 2.1-2.2 until the total of newly allocated bytes equals 500% of
+* the original disk space
+*
+* Pay attention to the resulting fragmentation score and histogram, the time taken and
+* the bluestore_alloc mempool stats
+*
+*/
+TEST_P(AllocTest, test_alloc_bench2_90_500_x2)
+{
+ // skipping for legacy and slow code
+ if ((GetParam() == string("stupid"))) {
+ GTEST_SKIP() << "skipping for specific allocators";
+ }
+ uint64_t capacity = uint64_t(1024) * 1024 * 1024 * 128;
+ auto prefill = capacity * 9 / 10;
+ auto overwrite = capacity * 5;
+ doOverwriteMPC2Test(2, capacity, prefill, overwrite);
+}
+
+TEST_P(AllocTest, test_alloc_bench2_20_500_x2)
+{
+ // skipping for legacy and slow code
+ if ((GetParam() == string("stupid"))) {
+ GTEST_SKIP() << "skipping for specific allocators";
+ }
+ uint64_t capacity = uint64_t(1024) * 1024 * 1024 * 128;
+ auto prefill = capacity * 2 / 10;
+ auto overwrite = capacity * 5;
+ doOverwriteMPC2Test(2, capacity, prefill, overwrite, 0.05);
+}
+
+TEST_P(AllocTest, test_alloc_bench2_50_500_x2)
+{
+ // skipping for legacy and slow code
+ if ((GetParam() == string("stupid"))) {
+ GTEST_SKIP() << "skipping for specific allocators";
+ }
+ uint64_t capacity = uint64_t(1024) * 1024 * 1024 * 128;
+ auto prefill = capacity * 5 / 10;
+ auto overwrite = capacity * 5;
+ doOverwriteMPC2Test(2, capacity, prefill, overwrite, 0.05);
+}
+
+TEST_P(AllocTest, test_alloc_bench2_75_500_x2)
+{
+ // skipping for legacy and slow code
+ if ((GetParam() == string("stupid"))) {
+ GTEST_SKIP() << "skipping for specific allocators";
+ }
+ uint64_t capacity = uint64_t(1024) * 1024 * 1024 * 128;
+ auto prefill = capacity * 75 / 100;
+ auto overwrite = capacity * 15;
+ doOverwriteMPC2Test(2, capacity, prefill, overwrite, 0.05);
+}
+
TEST_P(AllocTest, mempoolAccounting)
{
uint64_t bytes = mempool::bluestore_alloc::allocated_bytes();
INSTANTIATE_TEST_SUITE_P(
Allocator,
AllocTest,
- ::testing::Values("stupid", "bitmap", "avl", "hybrid", "btree"));
+ ::testing::Values("stupid", "bitmap", "avl", "hybrid", "btree", "hybrid_btree2"));
void init_close() {
alloc.reset(0);
}
+ void dump_alloc() {
+ alloc->dump();
+ }
};
TEST_P(AllocTest, test_alloc_init)
TEST_P(AllocTest, test_alloc_min_alloc)
{
- int64_t block_size = 1024;
- int64_t capacity = 4 * 1024 * block_size;
+ int64_t block_size = 4096;
+ int64_t capacity = 1024 * block_size;
{
init_alloc(capacity, block_size);
alloc->init_add_free(block_size, block_size);
+ dump_alloc();
PExtentVector extents;
EXPECT_EQ(block_size, alloc->allocate(block_size, block_size,
0, (int64_t) 0, &extents));
TEST_P(AllocTest, test_alloc_min_max_alloc)
{
- int64_t block_size = 1024;
+ int64_t block_size = 4096;
- int64_t capacity = 4 * 1024 * block_size;
+ int64_t capacity = 1024 * block_size;
init_alloc(capacity, block_size);
/*
TEST_P(AllocTest, test_alloc_failure)
{
- int64_t block_size = 1024;
- int64_t capacity = 4 * 1024 * block_size;
+ if (!(GetParam() == string("stupid") ||
+ GetParam() == string("avl") ||
+ GetParam() == string("bitmap") ||
+ GetParam() == string("hybrid"))) {
+ // new generation allocator(s) don't care about other-than-4K alignment
+ // hence the test case is not applicable
+ GTEST_SKIP() << "skipping for 'unaligned' allocators";
+ }
+
+ int64_t block_size = 4096;
+ int64_t capacity = 1024 * block_size;
{
init_alloc(capacity, block_size);
uint64_t alloc_unit = 4096;
uint64_t want_size = alloc_unit;
PExtentVector allocated, tmp;
-
+
init_alloc(capacity, alloc_unit);
alloc->init_add_free(0, capacity);
bool bitmap_alloc = GetParam() == std::string("bitmap");
-
+
EXPECT_EQ(0.0, alloc->get_fragmentation());
for (size_t i = 0; i < capacity / alloc_unit; ++i)
{
tmp.clear();
EXPECT_EQ(static_cast<int64_t>(want_size),
- alloc->allocate(want_size, alloc_unit, 0, 0, &tmp));
+ alloc->allocate(want_size, alloc_unit, 0, 0, &tmp));
allocated.insert(allocated.end(), tmp.begin(), tmp.end());
// bitmap fragmentation calculation doesn't provide such constant
tmp.clear();
EXPECT_EQ(-ENOSPC, alloc->allocate(want_size, alloc_unit, 0, 0, &tmp));
- if (GetParam() == string("avl")) {
- // AVL allocator uses a different allocating strategy
- GTEST_SKIP() << "skipping for AVL allocator";
- } else if (GetParam() == string("hybrid")) {
- // AVL allocator uses a different allocating strategy
- GTEST_SKIP() << "skipping for Hybrid allocator";
+ if (!(GetParam() == string("stupid") || GetParam() == string("bitmap"))) {
+ GTEST_SKIP() << "skipping for specific allocators";
}
for (size_t i = 0; i < allocated.size(); i += 2)
TEST_P(AllocTest, test_alloc_47883)
{
+ if (!(GetParam() == string("stupid") ||
+ GetParam() == string("avl") ||
+ GetParam() == string("bitmap") ||
+ GetParam() == string("hybrid"))) {
+ // new generation allocator(s) don't care about other-than-4K alignment
+ // hence the test case is not applicable
+ GTEST_SKIP() << "skipping for 'unaligned' allocators";
+ }
uint64_t block = 0x1000;
uint64_t size = 1599858540544ul;
INSTANTIATE_TEST_SUITE_P(
Allocator,
AllocTest,
- ::testing::Values("stupid", "bitmap", "avl", "hybrid", "btree"));
+ ::testing::Values("stupid", "bitmap", "avl", "hybrid", "btree", "hybrid_btree2"));