From dd95fca5af5b5faed5d5c73ee9f5998411765a24 Mon Sep 17 00:00:00 2001 From: Igor Fedotov Date: Fri, 14 Jul 2023 17:48:52 +0300 Subject: [PATCH] os/bluestore: introduce new allocator hybrid_btree2 Signed-off-by: Igor Fedotov --- src/common/options/global.yaml.in | 7 + src/crimson/os/alienstore/CMakeLists.txt | 1 + src/os/CMakeLists.txt | 1 + src/os/bluestore/Allocator.cc | 6 + src/os/bluestore/AvlAllocator.h | 4 +- src/os/bluestore/Btree2Allocator.cc | 608 +++++++++++++++++++++++ src/os/bluestore/Btree2Allocator.h | 429 ++++++++++++++++ src/os/bluestore/BtreeAllocator.h | 4 +- src/os/bluestore/HybridAllocator.cc | 55 ++ src/os/bluestore/HybridAllocator.h | 29 +- src/os/bluestore/HybridAllocator_impl.h | 5 +- src/test/objectstore/Allocator_bench.cc | 515 ++++++++++++++++++- src/test/objectstore/Allocator_test.cc | 49 +- 13 files changed, 1675 insertions(+), 38 deletions(-) create mode 100644 src/os/bluestore/Btree2Allocator.cc create mode 100644 src/os/bluestore/Btree2Allocator.h diff --git a/src/common/options/global.yaml.in b/src/common/options/global.yaml.in index f529bd1cc4ba8..3a66298ccd26f 100644 --- a/src/common/options/global.yaml.in +++ b/src/common/options/global.yaml.in @@ -4193,6 +4193,7 @@ options: - avl - btree - hybrid + - hybrid_btree2 with_legacy: true - name: bluefs_log_replay_check_allocations type: bool @@ -4974,6 +4975,7 @@ options: - btree - hybrid - zoned + - hybrid_btree2 with_legacy: true - name: bluestore_freelist_blocks_per_key type: size @@ -5499,6 +5501,11 @@ options: level: dev desc: Maximum RAM hybrid allocator should use before enabling bitmap supplement default: 64_M +- name: bluestore_btree2_alloc_weight_factor + type: float + level: dev + desc: Large continuous extents weight factor + default: 2 - name: bluestore_volume_selection_policy type: str level: dev diff --git a/src/crimson/os/alienstore/CMakeLists.txt b/src/crimson/os/alienstore/CMakeLists.txt index c881f4fbccbc7..2822a2fb0830f 100644 --- a/src/crimson/os/alienstore/CMakeLists.txt +++ b/src/crimson/os/alienstore/CMakeLists.txt @@ -50,6 +50,7 @@ set(alien_store_srcs ${PROJECT_SOURCE_DIR}/src/os/bluestore/Allocator.cc ${PROJECT_SOURCE_DIR}/src/os/bluestore/AvlAllocator.cc ${PROJECT_SOURCE_DIR}/src/os/bluestore/BtreeAllocator.cc + ${PROJECT_SOURCE_DIR}/src/os/bluestore/Btree2Allocator.cc ${PROJECT_SOURCE_DIR}/src/os/bluestore/BitmapFreelistManager.cc ${PROJECT_SOURCE_DIR}/src/os/bluestore/BlueFS.cc ${PROJECT_SOURCE_DIR}/src/os/bluestore/bluefs_types.cc diff --git a/src/os/CMakeLists.txt b/src/os/CMakeLists.txt index 55415fb37228c..876a6675f80b2 100644 --- a/src/os/CMakeLists.txt +++ b/src/os/CMakeLists.txt @@ -23,6 +23,7 @@ if(WITH_BLUESTORE) bluestore/BitmapAllocator.cc bluestore/AvlAllocator.cc bluestore/BtreeAllocator.cc + bluestore/Btree2Allocator.cc bluestore/HybridAllocator.cc ) endif(WITH_BLUESTORE) diff --git a/src/os/bluestore/Allocator.cc b/src/os/bluestore/Allocator.cc index 748506ae25181..082e5fea5ef93 100644 --- a/src/os/bluestore/Allocator.cc +++ b/src/os/bluestore/Allocator.cc @@ -7,6 +7,7 @@ #include "BitmapAllocator.h" #include "AvlAllocator.h" #include "BtreeAllocator.h" +#include "Btree2Allocator.h" #include "HybridAllocator.h" #ifdef HAVE_LIBZBD #include "ZonedAllocator.h" @@ -196,6 +197,11 @@ Allocator *Allocator::create( return new ZonedAllocator(cct, size, block_size, zone_size, first_sequential_zone, name); #endif + } else if (type == "hybrid_btree2") { + return new HybridBtree2Allocator(cct, size, block_size, + cct->_conf.get_val("bluestore_hybrid_alloc_mem_cap"), + 
cct->_conf.get_val("bluestore_btree2_alloc_weight_factor"), + name); } if (alloc == nullptr) { lderr(cct) << "Allocator::" << __func__ << " unknown alloc type " diff --git a/src/os/bluestore/AvlAllocator.h b/src/os/bluestore/AvlAllocator.h index 91bac9b0619ca..37d65c727826f 100644 --- a/src/os/bluestore/AvlAllocator.h +++ b/src/os/bluestore/AvlAllocator.h @@ -266,7 +266,9 @@ protected: uint64_t _lowest_size_available() { auto rs = range_size_tree.begin(); - return rs != range_size_tree.end() ? rs->length() : std::numeric_limits::max(); + return rs != range_size_tree.end() ? + rs->length() : + std::numeric_limits::max(); } int64_t _allocate( diff --git a/src/os/bluestore/Btree2Allocator.cc b/src/os/bluestore/Btree2Allocator.cc new file mode 100644 index 0000000000000..d0a2763ac2fdb --- /dev/null +++ b/src/os/bluestore/Btree2Allocator.cc @@ -0,0 +1,608 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "Btree2Allocator.h" +#include "common/debug.h" + +#define dout_context (get_context()) +#define dout_subsys ceph_subsys_bluestore +#undef dout_prefix +#define dout_prefix *_dout << (std::string(this->get_type()) + "::").c_str() + +/* + * class Btree2Allocator + * + * + */ +MEMPOOL_DEFINE_OBJECT_FACTORY(Btree2Allocator::range_seg_t, btree2_range_seg_t, bluestore_alloc); + +Btree2Allocator::Btree2Allocator(CephContext* _cct, + int64_t device_size, + int64_t block_size, + uint64_t max_mem, + double _rweight_factor, + bool with_cache, + std::string_view name) : + Allocator(name, device_size, block_size), + cct(_cct), + range_count_cap(max_mem / sizeof(range_seg_t)) +{ + set_weight_factor(_rweight_factor); + if (with_cache) { + cache = new ChunkCache(); + } +} + +void Btree2Allocator::init_add_free(uint64_t offset, uint64_t length) +{ + ldout(cct, 10) << __func__ << std::hex + << " offset 0x" << offset + << " length 0x" << length + << std::dec << dendl; + if (!length) + return; + std::lock_guard l(lock); + ceph_assert(offset + length <= uint64_t(device_size)); + _add_to_tree(offset, length); +} + +void Btree2Allocator::init_rm_free(uint64_t offset, uint64_t length) +{ + ldout(cct, 10) << __func__ << std::hex + << " offset 0x" << offset + << " length 0x" << length + << std::dec << dendl; + if (!length) + return; + std::lock_guard l(lock); + ceph_assert(offset + length <= uint64_t(device_size)); + _remove_from_tree(offset, length); +} + +int64_t Btree2Allocator::allocate( + uint64_t want, + uint64_t unit, + uint64_t max_alloc_size, + int64_t hint, // unused, for now! 
+ PExtentVector* extents) +{ + ldout(cct, 10) << __func__ << std::hex + << " want 0x" << want + << " unit 0x" << unit + << " max_alloc_size 0x" << max_alloc_size + << " hint 0x" << hint + << std::dec << dendl; + ceph_assert(std::has_single_bit(unit)); + ceph_assert(want % unit == 0); + + if (max_alloc_size == 0) { + max_alloc_size = want; + } + if (constexpr auto cap = std::numeric_limits::max(); + max_alloc_size >= cap) { + max_alloc_size = p2align(uint64_t(cap), (uint64_t)block_size); + } + uint64_t cached_chunk_offs = 0; + if (cache && cache->try_get(&cached_chunk_offs, want)) { + num_free -= want; + extents->emplace_back(cached_chunk_offs, want); + return want; + } + std::lock_guard l(lock); + return _allocate(want, unit, max_alloc_size, hint, extents); +} + +void Btree2Allocator::release(const release_set_t& release_set) +{ + if (!cache || release_set.num_intervals() >= pextent_array_size) { + std::lock_guard l(lock); + _release(release_set); + return; + } + PExtentArray to_release; + size_t count = 0; + auto p = release_set.begin(); + while (p != release_set.end()) { + if (cache->try_put(p.get_start(), p.get_len())) { + num_free += p.get_len(); + } else { + to_release[count++] = &*p; + } + ++p; + } + if (count > 0) { + std::lock_guard l(lock); + _release(count, &to_release.at(0)); + } +} + +void Btree2Allocator::_shutdown() +{ + if (cache) { + delete cache; + cache = nullptr; + } + for (auto& tree : range_size_set) { + tree.clear(); + } + range_tree.clear(); +} + +void Btree2Allocator::_dump(bool full) const +{ + if (full) { + ldout(cct, 0) << " >>>range_tree: " << dendl; + for (auto& rs : range_tree) { + ldout(cct, 0) << std::hex + << "0x" << rs.first << "~" << (rs.second - rs.first) + << std::dec + << dendl; + } + ldout(cct, 0) << " >>>range_size_trees: " << dendl; + size_t i = 0; + for (auto& rs_tree : range_size_set) { + ldout(cct, 0) << " >>>bucket[" << i << "]" << dendl; + ++i; + for (auto& rs : rs_tree) + ldout(cct, 0) << std::hex + << "0x" << rs.start << "~" << (rs.end - rs.start) + << std::dec << dendl; + } + if (cache) { + ldout(cct, 0) << " >>>cache: " << dendl; + auto cb = [&](uint64_t offset, uint64_t length) { + ldout(cct, 0) << std::hex + << "0x" << offset << "~" << length + << std::dec << dendl; + }; + cache->foreach(cb); + } + ldout(cct, 0) << " >>>>>>>>>>>" << dendl; + } + if (cache) { + ldout(cct, 0) << " >>>cache stats: " << dendl; + ldout(cct, 0) << " hits: " << cache->get_hit_count() << dendl; + } +} + +void Btree2Allocator::_foreach( + std::function notify) +{ + for (auto& rs : range_tree) { + notify(rs.first, rs.second - rs.first); + } + if (cache) { + cache->foreach(notify); + } +} + +int64_t Btree2Allocator::_allocate( + uint64_t want, + uint64_t unit, + uint64_t max_alloc_size, + int64_t hint, // unused, for now! + PExtentVector* extents) +{ + uint64_t allocated = 0; + while (allocated < want) { + auto want_now = std::min(max_alloc_size, want - allocated); + if (cache && want_now != want) { + uint64_t cached_chunk_offs = 0; + if (cache->try_get(&cached_chunk_offs, want_now)) { + num_free -= want_now; + extents->emplace_back(cached_chunk_offs, want_now); + allocated += want_now; + continue; + } + } + size_t bucket0 = MyTraits::_get_size_bucket(want_now); + int64_t r = __allocate(bucket0, want_now, + unit, extents); + if (r < 0) { + // Allocation failed. + break; + } + allocated += r; + } + return allocated ? 
allocated : -ENOSPC; +} + +void Btree2Allocator::_release(const release_set_t& release_set) +{ + for (auto p = release_set.begin(); p != release_set.end(); ++p) { + const auto offset = p.get_start(); + const auto length = p.get_len(); + ceph_assert(offset + length <= uint64_t(device_size)); + ldout(cct, 10) << __func__ << std::hex + << " offset 0x" << offset + << " length 0x" << length + << std::dec << dendl; + _add_to_tree(offset, length); + } +} + +void Btree2Allocator::_release(const PExtentVector& release_set) +{ + for (auto& e : release_set) { + ldout(cct, 10) << __func__ << std::hex + << " offset 0x" << e.offset + << " length 0x" << e.length + << std::dec << dendl; + _add_to_tree(e.offset, e.length); + } +} + +void Btree2Allocator::_release(size_t count, const release_set_entry_t** to_release) +{ + for (size_t i = 0; i < count; i++) { + auto* e = to_release[i]; + ldout(cct, 10) << __func__ << std::hex + << " offset 0x" << e->first + << " length 0x" << e->second + << std::dec << dendl; + _add_to_tree(e->first, e->second); + } +} + +void Btree2Allocator::_add_to_tree(uint64_t start, uint64_t size) +{ + ceph_assert(size != 0); + + uint64_t end = start + size; + + // Make sure we don't overlap with either of our neighbors + auto rt_p_after = range_tree.upper_bound(start); + auto rt_p_before = range_tree.end(); + if (rt_p_after != range_tree.begin()) { + rt_p_before = std::prev(rt_p_after); + } + bool merge_before = (rt_p_before != range_tree.end() && rt_p_before->second == start); + bool merge_after = (rt_p_after != range_tree.end() && rt_p_after->first == end); + + range_seg_t rs(start, end); + if (merge_before && merge_after) { + rs = range_seg_t{ rt_p_before->first, rt_p_after->second}; + _range_size_tree_rm(range_seg_t{ rt_p_before->first, rt_p_before->second }); + _range_size_tree_rm(range_seg_t{ rt_p_after->first, rt_p_after->second }); + rt_p_after = range_tree.erase(rt_p_before); + rt_p_after = range_tree.erase(rt_p_after); + } else if (merge_before) { + rs = range_seg_t(rt_p_before->first, end); + _range_size_tree_rm(range_seg_t{ rt_p_before->first, rt_p_before->second }); + rt_p_after = range_tree.erase(rt_p_before); + } else if (merge_after) { + rs = range_seg_t(start, rt_p_after->second); + _range_size_tree_rm(range_seg_t{ rt_p_after->first, rt_p_after->second }); + rt_p_after = range_tree.erase(rt_p_after); + } + // create new entry at both range_tree and range_size_set + __try_insert_range(rs, &rt_p_after); +} + +void Btree2Allocator::_try_remove_from_tree(uint64_t start, uint64_t size, + std::function cb) +{ + uint64_t end = start + size; + + ceph_assert(size != 0); + + auto rt_p = range_tree.lower_bound(start); + if ((rt_p == range_tree.end() || rt_p->first > start) && rt_p != range_tree.begin()) { + --rt_p; + } + + if (rt_p == range_tree.end() || rt_p->first >= end) { + cb(start, size, false); + return; + } + do { + auto next_p = rt_p; + ++next_p; + + if (start < rt_p->first) { + cb(start, rt_p->first - start, false); + start = rt_p->first; + } + auto range_end = std::min(rt_p->second, end); + + _remove_from_tree(rt_p, start, range_end); + + cb(start, range_end - start, true); + start = range_end; + + rt_p = next_p; + } while (rt_p != range_tree.end() && rt_p->first < end && start < end); + if (start < end) { + cb(start, end - start, false); + } +} + +int64_t Btree2Allocator::__allocate( + size_t bucket0, + uint64_t size, + uint64_t unit, + PExtentVector* extents) +{ + int64_t allocated = 0; + + auto rs_tree = &range_size_set[bucket0]; + auto rs_p = _pick_block(0, 
rs_tree, size); + + if (rs_p == rs_tree->end()) { + auto bucket_center = MyTraits::_get_size_bucket(weight_center); + + // requested size is to the left of weight center + // hence we try to search up toward it first + // + size_t bucket = bucket0; // using unsigned for bucket is crucial + // as bounds checking when walking downsize + // depends on signed->unsigned value + // transformation + while (++bucket <= bucket_center) { + rs_tree = &range_size_set[bucket]; + rs_p = _pick_block(1, rs_tree, size); + if (rs_p != rs_tree->end()) { + goto found; + } + } + + int dir = left_weight() > right_weight() ? -1 : 1; // lookup direction + int dir0 = dir; + + // Start from the initial bucket if searching downhill to + // run through that bucket again as we haven't check extents shorter + // than requested size if any. + // Start from bucket_center + 1 if walking uphill. + bucket = dir < 0 ? bucket0 : bucket_center + 1; + do { + // try spilled over or different direction if bucket index is out of bounds + if (bucket >= MyTraits::num_size_buckets) { + if (dir < 0) { + // reached the bottom while going downhill, + // time to try spilled over extents + uint64_t r = _spillover_allocate(size, + unit, + size, + 0, + extents); // 0 is returned if error were detected. + allocated += r; + ceph_assert(size >= (uint64_t)r); + size -= r; + if (size == 0) { + return allocated; + } + } + // change direction + dir = -dir; + bucket = dir < 0 ? bucket0 : bucket_center + 1; // See above on new bucket + // selection rationales + ceph_assert(bucket < MyTraits::num_size_buckets); // this should never happen + if (dir == dir0 ) { + // stop if both directions already attempted + return -ENOSPC; + } + } + rs_tree = &range_size_set[bucket]; + rs_p = _pick_block(dir, rs_tree, size); + bucket += dir; // this might wrap over zero and trigger direction + // change on the next loop iteration. + } while (rs_p == rs_tree->end()); + } + +found: + if (rs_p != rs_tree->end()) { + auto o = rs_p->start; + auto l = std::min(size, rs_p->length()); + auto rt_p = range_tree.find(o); + ceph_assert(rt_p != range_tree.end()); + _remove_from_tree(rs_tree, rs_p, rt_p, o, o + l); + extents->emplace_back(o, l); + allocated += l; + return allocated; + } + // should never get here + ceph_assert(false); + return -EFAULT; +} + +Btree2Allocator::range_size_tree_t::iterator +Btree2Allocator::_pick_block(int dir, + Btree2Allocator::range_size_tree_t* tree, + uint64_t size) +{ + if (dir < 0) { + // use the largest available chunk from 'shorter chunks' buckets + // given the preliminary check before the function's call + // we should get the result in a single step + auto p = tree->end(); + if (p != tree->begin()) { + --p; + return p; + } + } else if (dir > 0) { + auto p = tree->begin(); + if (p != tree->end()) { + return p; + } + } else if (tree->size()) { + //For the 'best matching' bucket + //look for a chunk with the best size matching. 
+ // Start checking from the last item to + // avoid lookup if possible + auto p = tree->end(); + --p; + if (p->length() > size) { + p = tree->lower_bound(range_seg_t{ 0, size }); + ceph_assert(p != tree->end()); + } + return p; + } + return tree->end(); +} + +void Btree2Allocator::_remove_from_tree(uint64_t start, uint64_t size) +{ + ceph_assert(size != 0); + ceph_assert(size <= num_free); + auto end = start + size; + + // Find chunk we completely overlap with + auto rt_p = range_tree.lower_bound(start); + if ((rt_p == range_tree.end() || rt_p->first > start) && rt_p != range_tree.begin()) { + --rt_p; + } + + /// Make sure we completely overlap with someone + ceph_assert(rt_p != range_tree.end()); + ceph_assert(rt_p->first <= start); + ceph_assert(rt_p->second >= end); + _remove_from_tree(rt_p, start, end); +} + +void Btree2Allocator::_remove_from_tree( + Btree2Allocator::range_tree_t::iterator rt_p, + uint64_t start, + uint64_t end) +{ + range_seg_t rs(rt_p->first, rt_p->second); + size_t bucket = MyTraits::_get_size_bucket(rs.length()); + range_size_tree_t* rs_tree = &range_size_set[bucket]; + auto rs_p = rs_tree->find(rs); + ceph_assert(rs_p != rs_tree->end()); + + _remove_from_tree(rs_tree, rs_p, rt_p, start, end); +} + +void Btree2Allocator::_remove_from_tree( + Btree2Allocator::range_size_tree_t* rs_tree, + Btree2Allocator::range_size_tree_t::iterator rs_p, + Btree2Allocator::range_tree_t::iterator rt_p, + uint64_t start, + uint64_t end) +{ + bool left_over = (rt_p->first != start); + bool right_over = (rt_p->second != end); + + range_seg_t rs = *rs_p; + _range_size_tree_rm(rs_tree, rs_p); + rt_p = range_tree.erase(rt_p); // It's suboptimal to do the removal every time. + // Some paths might avoid that but coupled with + // spillover handling this looks non-trivial. + // Hence leaving as-is. + if (left_over && right_over) { + range_seg_t rs_before(rs.start, start); + range_seg_t rs_after(end, rs.end); + // insert rs_after first since this updates rt_p which starts to point to + // newly inserted item. 
+ __try_insert_range(rs_after, &rt_p); + __try_insert_range(rs_before, &rt_p); + } else if (left_over) { + rs.end = start; + __try_insert_range(rs, &rt_p); + } else if (right_over) { + rs.start = end; + __try_insert_range(rs, &rt_p); + } +} + +void Btree2Allocator::_try_insert_range(const range_seg_t& rs) +{ + if (__try_insert_range(rs, nullptr)) { + _range_size_tree_add(rs); + } + else { + range_tree.erase(rs.start); + } +} + +bool Btree2Allocator::__try_insert_range( + const Btree2Allocator::range_seg_t& rs, + Btree2Allocator::range_tree_t::iterator* rt_p_insert) +{ + ceph_assert(rs.end > rs.start); + // Check if amount of range_seg_t entries isn't above the threshold, + // use doubled range_tree's size for the estimation + bool spillover_input = range_count_cap && uint64_t(2 * range_tree.size()) >= range_count_cap; + range_size_tree_t* lowest_rs_tree = nullptr; + range_size_tree_t::iterator lowest_rs_p; + if (spillover_input) { + lowest_rs_tree = _get_lowest(&lowest_rs_p); + if (lowest_rs_tree) { + ceph_assert(lowest_rs_p != lowest_rs_tree->end()); + spillover_input = rs.length() <= lowest_rs_p->length(); + } + // make sure we never spill over chunks larger than weight_center + if (spillover_input) { + spillover_input = rs.length() <= weight_center; + lowest_rs_tree = nullptr; + } else if (lowest_rs_tree && lowest_rs_p->length () > weight_center) { + lowest_rs_tree = nullptr; + } + } + if (spillover_input) { + _spillover_range(rs.start, rs.end); + } else { + // NB: we should be careful about iterators (lowest_rs_p & rt_p_insert) + // Relevant trees' updates might invalidate any of them hence + // a bit convoluted logic below. + range_seg_t lowest_rs; + range_tree_t new_rt_p; + if (lowest_rs_tree) { + lowest_rs = *lowest_rs_p; + _range_size_tree_rm(lowest_rs_tree, lowest_rs_p); + range_tree.erase(lowest_rs.start); + // refresh the iterator to be returned + // due to the above range_tree removal invalidated it. 
+ if (rt_p_insert) { + *rt_p_insert = range_tree.lower_bound(rs.start); + } + _spillover_range(lowest_rs.start, lowest_rs.end); + } + if (rt_p_insert) { + *rt_p_insert = range_tree.emplace_hint(*rt_p_insert, rs.start, rs.end); + _range_size_tree_add(rs); + } + } + return !spillover_input; +} + +void Btree2Allocator::_range_size_tree_add(const range_seg_t& rs) { + auto l = rs.length(); + ceph_assert(rs.end > rs.start); + size_t bucket = MyTraits::_get_size_bucket(l); + range_size_set[bucket].insert(rs); + num_free += l; + + if (l > weight_center) { + rsum += l; + } else { + lsum += l; + } + +} +void Btree2Allocator::_range_size_tree_rm(const range_seg_t& rs) +{ + size_t bucket = MyTraits::_get_size_bucket(rs.length()); + range_size_tree_t* rs_tree = &range_size_set[bucket]; + ceph_assert(rs_tree->size() > 0); + auto rs_p = rs_tree->find(rs); + ceph_assert(rs_p != rs_tree->end()); + _range_size_tree_rm(rs_tree, rs_p); +} + +void Btree2Allocator::_range_size_tree_rm( + Btree2Allocator::range_size_tree_t* rs_tree, + Btree2Allocator::range_size_tree_t::iterator rs_p) +{ + auto l = rs_p->length(); + ceph_assert(num_free >= l); + num_free -= l; + if (l > weight_center) { + ceph_assert(rsum >= l); + rsum -= l; + } else { + ceph_assert(lsum >= l); + lsum -= l; + } + rs_tree->erase(rs_p); +} diff --git a/src/os/bluestore/Btree2Allocator.h b/src/os/bluestore/Btree2Allocator.h new file mode 100644 index 0000000000000..f950999adc45d --- /dev/null +++ b/src/os/bluestore/Btree2Allocator.h @@ -0,0 +1,429 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include + +#include "include/cpp-btree/btree_map.h" +#include "include/cpp-btree/btree_set.h" + +#include "Allocator.h" + +#include "os/bluestore/bluestore_types.h" +#include "include/mempool.h" +#include "common/ceph_mutex.h" + +template +struct SizeBucketCollectionTraits { + static const size_t base_bits = BASE_BITS; + static const size_t base = 1ull << base_bits; + static const size_t size_factor = FACTOR; // single bucket size range to be + // determined as [n, n * 2 * FACTOR) + static const size_t num_size_buckets = NUM_BUCKETS; + static const size_t bucket_max = + base << ((size_factor * (num_size_buckets - 2))); + + static inline size_t _get_size_bucket(uint64_t len) { + size_t idx; + ceph_assert(size_factor); + if (len <= base) { + idx = 0; + } else if (len > bucket_max) { + idx = num_size_buckets - 1; + } else { + size_t most_bit = cbits(uint64_t(len - 1)) - 1; + idx = 1 + ((most_bit - base_bits) / size_factor); + } + ceph_assert(idx < num_size_buckets); + return idx; + } +}; + +class ChunkCache { +public: + class LocklessBuf { + std::atomic ref = 0; + size_t count = 0; + std::vector data; + public: + void init(size_t size) { + ceph_assert(data.size() == 0); + data.resize(size); + } + bool try_put(uint32_t& v) { + bool done = ++ref == 1 && count < data.size(); + if (done) { + data[count++] = v; + } + --ref; + return done; + } + bool try_get(uint32_t& v) { + bool done = ++ref == 1 && count > 0; + if (done) { + v = data[--count]; + } + --ref; + return done; + } + void foreach(std::function notify) { + for (size_t i = 0; i < count; i++) { + notify(data[i]); + } + } + }; + + static const size_t base_bits = 12; // 4K + static const size_t base = 1ull << base_bits; + static const size_t num_buckets = 16; // [4K, 8K, 12K, ... 
64K] - 16 buckets total + static const size_t num_chunks = 16; + ChunkCache() { + for (size_t i = 0; i < buckets.size(); i++) { + buckets[i].init(num_chunks); + } + } + bool try_put(uint64_t offset, uint64_t len) { + if (!lock.try_lock_shared()) { + return false; + } + bool ret = false; + ceph_assert(p2aligned(offset, base)); + ceph_assert(p2aligned(len, base)); + // permit caching chunks with offsets fitting (without LSBs) + // into 32 - bit value + offset = offset >> base_bits; + size_t idx = len >> base_bits; + if (offset <= std::numeric_limits::max() && + idx < buckets.size()) { + uint32_t o = offset; + ret = buckets[idx].try_put(o); + } + lock.unlock_shared(); + return ret; + } + bool try_get(uint64_t* offset, uint64_t len) { + if (!lock.try_lock_shared()) { + return false; + } + bool ret = false; + ceph_assert(offset); + ceph_assert(p2aligned(len, base)); + size_t idx = len >> base_bits; + if (idx < buckets.size()) { + uint32_t o = 0; + ret = buckets[idx].try_get(o); + if (ret) { + *offset = uint64_t(o) << base_bits; + ++hits; + } + } + lock.unlock_shared(); + return ret; + } + size_t get_hit_count() const { + return hits.load(); + } + void foreach(std::function notify) { + std::unique_lock _lock(lock); + for (uint64_t i = 0; i < buckets.size(); i++) { + auto cb = [&](uint32_t o) { + notify(uint64_t(o) << base_bits, i << base_bits); + }; + buckets[i].foreach(cb); + } + } +private: + std::array buckets; + std::atomic hits = 0; + ceph::shared_mutex lock{ + ceph::make_shared_mutex(std::string(), false, false, false) + }; +}; + +/* + * class Btree2Allocator + * + * + */ +class Btree2Allocator : public Allocator { + typedef SizeBucketCollectionTraits<12, 14> MyTraits; +public: + // Making public to share with mempools + struct range_seg_t { + MEMPOOL_CLASS_HELPERS(); ///< memory monitoring + uint64_t start; ///< starting offset of this segment + uint64_t end; ///< ending offset (non-inclusive) + + // Tree is sorted by offset, greater offsets at the end of the tree. + struct before_t { + template + bool operator()(const KeyLeft& lhs, const KeyRight& rhs) const { + return lhs.end <= rhs.start; + } + }; + + // Tree is sorted by size, larger chunks are at the end of the tree. + struct shorter_t { + template + int compare(const KeyType1& lhs, const KeyType2& rhs) const { + int64_t delta = + int64_t(lhs.end) - int64_t(lhs.start) - int64_t(rhs.end) + int64_t(rhs.start); + if (delta == 0) { + delta = (int64_t)lhs.start - (int64_t)rhs.start; + } + return delta ? 
delta / std::abs(delta) : 0; + } + template + int operator()(const range_seg_t& lhs, const KeyType& rhs) const { + return compare(lhs, rhs); + } + }; + + range_seg_t(uint64_t start = 0, uint64_t end = 0) + : start{ start }, end{ end } + {} + inline uint64_t length() const { + return end - start; + } + }; + +public: + Btree2Allocator(CephContext* cct, int64_t device_size, int64_t block_size, + uint64_t max_mem, + double _rweight_factor, + bool with_cache, + std::string_view name); + + // + //ctor intended for the usage from descendant (aka hybrid) class(es) + // which provide handling for spilled over entries + // + Btree2Allocator(CephContext* cct, int64_t device_size, int64_t block_size, + uint64_t max_mem, + std::string_view name) : + Btree2Allocator(cct, device_size, block_size, max_mem, 1.0, true, name) { + } + + ~Btree2Allocator() override { + shutdown(); + } + + const char* get_type() const override + { + return "btree_v2"; + } + void init_add_free(uint64_t offset, uint64_t length) override; + void init_rm_free(uint64_t offset, uint64_t length) override; + + int64_t allocate( + uint64_t want, + uint64_t unit, + uint64_t max_alloc_size, + int64_t hint, + PExtentVector* extents) override; + + void release(const release_set_t& release_set) override; + + uint64_t get_free() override { + return num_free; + } + double get_fragmentation() override { + std::lock_guard l(lock); + return _get_fragmentation(); + } + size_t get_cache_hit_count() const { + return cache ? cache->get_hit_count() : 0; + } + + void dump() override { + std::lock_guard l(lock); + _dump(); + } + void foreach( + std::function notify) override { + std::lock_guard l(lock); + _foreach(notify); + } + void shutdown() override { + std::lock_guard l(lock); + _shutdown(); + } + +private: + CephContext* cct = nullptr; + ChunkCache* cache = nullptr; + std::mutex lock; + + template + using pool_allocator = mempool::bluestore_alloc::pool_allocator; + using range_tree_t = + btree::btree_map< + uint64_t, // start + uint64_t, // end + std::less, + pool_allocator>>; + range_tree_t range_tree; ///< main range tree + + // + // The range_size_tree should always contain the + // same number of segments as the range_tree. + // The only difference is that the range_size_tree + // is ordered by segment sizes. + // + using range_size_tree_t = + btree::btree_set< + range_seg_t, + range_seg_t::shorter_t, + pool_allocator>; + std::array< + range_size_tree_t, MyTraits::num_size_buckets> range_size_set; + + std::atomic num_free = 0; ///< total bytes in freelist + + // + // Max amount of range entries allowed. 
0 - unlimited + // + uint64_t range_count_cap = 0; + + const uint64_t weight_center = 1ull << 20; // 1M + uint64_t lsum = 0; + uint64_t rsum = 0; + double rweight_factor = 0; + uint64_t left_weight() const { + return lsum + _get_spilled_over(); + } + uint64_t right_weight() const { + return rsum * rweight_factor; + } +protected: + static const uint64_t pextent_array_size = 64; + typedef std::array PExtentArray; + + void set_weight_factor(double _rweight_factor) { + rweight_factor = _rweight_factor; + } + + CephContext* get_context() { + return cct; + } + std::mutex& get_lock() { + return lock; + } + range_size_tree_t* _get_lowest(range_size_tree_t::iterator* rs_p) { + for (auto& t : range_size_set) { + if (t.begin() != t.end()) { + *rs_p = t.begin(); + return &t; + } + } + return nullptr; + } + uint64_t _lowest_size_available() { + range_size_tree_t::iterator rs_p; + if (_get_lowest(&rs_p) != nullptr) { + return rs_p->length(); + } + return std::numeric_limits::max(); + } + uint64_t _get_free() const { + return num_free; + } + double _get_fragmentation() const { + auto free_blocks = p2align(num_free.load(), (uint64_t)block_size) / block_size; + if (free_blocks <= 1) { + return .0; + } + return (static_cast(range_tree.size() - 1) / (free_blocks - 1)); + } + + void _shutdown(); + + void _dump(bool full = true) const; + void _foreach(std::function); + + int64_t _allocate( + uint64_t want, + uint64_t unit, + uint64_t max_alloc_size, + int64_t hint, + PExtentVector* extents); + + void _release(const release_set_t& release_set); + void _release(const PExtentVector& release_set); + void _release(size_t count, const release_set_entry_t** to_release); + + /* + * overridables for HybridAllocator + */ + // called when extent to be released/marked free + virtual void _add_to_tree(uint64_t start, uint64_t size); + virtual void _spillover_range(uint64_t start, uint64_t end) { + // this should be overriden when range count cap is present, + // i.e. (range_count_cap > 0) + ceph_assert(false); + } + // to be overriden by Hybrid wrapper + virtual uint64_t _get_spilled_over() const { + return 0; + } + virtual uint64_t _spillover_allocate(uint64_t want, + uint64_t unit, + uint64_t max_alloc_size, + int64_t hint, + PExtentVector* extents) { + // this should be overriden when range count cap is present, + // i.e. 
(range_count_cap > 0) + ceph_assert(false); + return 0; + } + + /* + * Used exclusively from HybridAllocator + */ + void _try_remove_from_tree(uint64_t start, uint64_t size, + std::function cb); + bool has_cache() const { + return !!cache; + } + bool try_put_cache(uint64_t start, uint64_t len) { + bool ret = cache && cache->try_put(start, len); + if (ret) { + num_free += len; + } + return ret; + } + bool try_get_from_cache(uint64_t* res_offset, uint64_t want) { + bool ret = cache && cache->try_get(res_offset, want); + if (ret) { + num_free -= want; + } + return ret; + } + +private: + int64_t __allocate(size_t bucket0, + uint64_t size, + uint64_t unit, + PExtentVector* extents); + + inline range_size_tree_t::iterator _pick_block(int distance, + range_size_tree_t* tree, uint64_t size); + + inline void _remove_from_tree(uint64_t start, uint64_t size); + inline void _remove_from_tree(range_tree_t::iterator rt_p, + uint64_t start, uint64_t end); + inline void _remove_from_tree(range_size_tree_t* rs_tree, + range_size_tree_t::iterator rs, + range_tree_t::iterator rt_p, + uint64_t start, uint64_t end); + + inline void _try_insert_range(const range_seg_t& rs); + inline bool __try_insert_range(const range_seg_t& rs, + range_tree_t::iterator* insert_pos); + + inline void _range_size_tree_add(const range_seg_t& r); + inline void _range_size_tree_rm(const range_seg_t& rs); + inline void _range_size_tree_rm(range_size_tree_t* rs_tree, + range_size_tree_t::iterator rs); +}; diff --git a/src/os/bluestore/BtreeAllocator.h b/src/os/bluestore/BtreeAllocator.h index 4561d9f4c45a1..5f9a9e955feba 100644 --- a/src/os/bluestore/BtreeAllocator.h +++ b/src/os/bluestore/BtreeAllocator.h @@ -173,7 +173,9 @@ private: uint64_t _lowest_size_available() const { auto rs = range_size_tree.begin(); - return rs != range_size_tree.end() ? rs->size : 0; + return rs != range_size_tree.end() ? 
+ rs->size : + std::numeric_limits::max(); } int64_t _allocate( diff --git a/src/os/bluestore/HybridAllocator.cc b/src/os/bluestore/HybridAllocator.cc index 1cfab7e5ac3dc..e99e8bf932e5f 100644 --- a/src/os/bluestore/HybridAllocator.cc +++ b/src/os/bluestore/HybridAllocator.cc @@ -18,4 +18,59 @@ const char* HybridAvlAllocator::get_type() const return "hybrid"; } +/* + * class HybridBtree2Allocator + * + * + */ +const char* HybridBtree2Allocator::get_type() const +{ + return "hybrid_btree2"; +} + +int64_t HybridBtree2Allocator::allocate( + uint64_t want, + uint64_t unit, + uint64_t max_alloc_size, + int64_t hint, + PExtentVector* extents) +{ + ldout(get_context(), 10) << __func__ << std::hex + << " want 0x" << want + << " unit 0x" << unit + << " max_alloc_size 0x" << max_alloc_size + << " hint 0x" << hint + << std::dec << dendl; + ceph_assert(std::has_single_bit(unit)); + ceph_assert(want % unit == 0); + + uint64_t cached_chunk_offs = 0; + if (try_get_from_cache(&cached_chunk_offs, want)) { + extents->emplace_back(cached_chunk_offs, want); + return want; + } + return HybridAllocatorBase::allocate(want, + unit, max_alloc_size, hint, extents); +} +void HybridBtree2Allocator::release(const release_set_t& release_set) +{ + if (!has_cache() || release_set.num_intervals() >= pextent_array_size) { + HybridAllocatorBase::release(release_set); + return; + } + PExtentArray to_release; + size_t count = 0; + auto p = release_set.begin(); + while (p != release_set.end()) { + if (!try_put_cache(p.get_start(), p.get_len())) { + to_release[count++] = &*p;// bluestore_pextent_t(p.get_start(), p.get_len()); + } + ++p; + } + if (count > 0) { + std::lock_guard l(get_lock()); + _release(count, &to_release.at(0)); + } +} + #include "HybridAllocator_impl.h" diff --git a/src/os/bluestore/HybridAllocator.h b/src/os/bluestore/HybridAllocator.h index bca6e7af9e5ed..d9ee7dce5ce15 100644 --- a/src/os/bluestore/HybridAllocator.h +++ b/src/os/bluestore/HybridAllocator.h @@ -6,6 +6,7 @@ #include #include "AvlAllocator.h" +#include "Btree2Allocator.h" #include "BitmapAllocator.h" template @@ -73,7 +74,6 @@ protected: return bmap_alloc; } private: - void _spillover_range(uint64_t start, uint64_t end) override; uint64_t _spillover_allocate(uint64_t want, uint64_t unit, @@ -122,3 +122,30 @@ public: const char* get_type() const override; }; + +class HybridBtree2Allocator : public HybridAllocatorBase +{ +public: + HybridBtree2Allocator(CephContext* cct, + int64_t device_size, + int64_t _block_size, + uint64_t max_mem, + double weight_factor, + std::string_view name) : + HybridAllocatorBase(cct, + device_size, + _block_size, + max_mem, + name) { + set_weight_factor(weight_factor); + } + const char* get_type() const override; + + int64_t allocate( + uint64_t want, + uint64_t unit, + uint64_t max_alloc_size, + int64_t hint, + PExtentVector* extents) override; + void release(const release_set_t& release_set) override; +}; diff --git a/src/os/bluestore/HybridAllocator_impl.h b/src/os/bluestore/HybridAllocator_impl.h index 4542319b07a67..8a41e86a3cf1c 100644 --- a/src/os/bluestore/HybridAllocator_impl.h +++ b/src/os/bluestore/HybridAllocator_impl.h @@ -39,8 +39,9 @@ int64_t HybridAllocatorBase::allocate( // try bitmap first to avoid unneeded contiguous extents split if // desired amount is less than shortes range in AVL - bool primary_first = !(bmap_alloc && bmap_alloc->get_free() && - want < T::_lowest_size_available()); + bool primary_first = !(bmap_alloc && + bmap_alloc->get_free() && + want < T::_lowest_size_available()); int64_t 
res = _allocate_or_rollback(primary_first, want, unit, max_alloc_size, hint, extents); diff --git a/src/test/objectstore/Allocator_bench.cc b/src/test/objectstore/Allocator_bench.cc index 76598d2a1b263..ec0c806a55948 100644 --- a/src/test/objectstore/Allocator_bench.cc +++ b/src/test/objectstore/Allocator_bench.cc @@ -39,6 +39,13 @@ public: } void doOverwriteTest(uint64_t capacity, uint64_t prefill, uint64_t overwrite); + void doOverwriteMPCTest(size_t thread_count, + uint64_t capacity, uint64_t prefill, + uint64_t overwrite); + void doOverwriteMPC2Test(size_t thread_count, + uint64_t capacity, uint64_t prefill, + uint64_t overwrite, + float extra = 0.05); }; const uint64_t _1m = 1024 * 1024; @@ -62,7 +69,7 @@ class AllocTracker uint64_t head = 0; uint64_t tail = 0; uint64_t size = 0; - boost::uniform_int<> u1; + boost::uniform_int u1; public: AllocTracker(uint64_t capacity, uint64_t alloc_unit) @@ -116,7 +123,7 @@ public: if (size == 0) return false; - uint64_t pos = (u1(rng) % size) + tail; + size_t pos = (u1(rng) % size) + tail; pos %= allocations.size(); uint64_t val = allocations[pos]; *len = uint64_t((val & 0xffffff) << 8); @@ -155,6 +162,7 @@ TEST_P(AllocTest, test_alloc_bench_seq) << capacity / 1024 / 1024 << std::endl; } } + std::cout << "Executed in " << ceph_clock_now() - start << std::endl; std::cout << "releasing..." << std::endl; for (size_t i = 0; i < capacity; i += want_size) @@ -171,6 +179,41 @@ TEST_P(AllocTest, test_alloc_bench_seq) dump_mempools(); } +TEST_P(AllocTest, test_alloc_bench_seq_interleaving) +{ + // by adjusting capacity and mempool dump output analysis one can + // estimate max RAM usage for specific allocator's implementation with + // real-life disk size, e.g. 8TB or 16 TB. + // The current capacity is left pretty small to avoid huge RAM utilization + // when using non-effective allocators (e.g. AVL) and hence let the test + // case run perfectly on week H/W. + uint64_t capacity = uint64_t(1024) * 1024 * 1024 * 128; //128GB + uint64_t alloc_unit = 4096; + uint64_t want_size = alloc_unit; + PExtentVector allocated, tmp; + + init_alloc(capacity, alloc_unit); + alloc->init_add_free(0, capacity); + + utime_t start = ceph_clock_now(); + alloc->init_rm_free(0, capacity); + std::cout << "Executed in " << ceph_clock_now() - start << std::endl; + + std::cout << "releasing..." 
<< std::endl; + for (size_t i = 0; i < capacity; i += want_size * 2) + { + interval_set release_set; + release_set.insert(i, want_size); + alloc->release(release_set); + if (0 == (i % (1 * 1024 * _1m))) { + std::cout << "release " << i / 1024 / 1024 << " mb of " + << capacity / 1024 / 1024 << std::endl; + } + } + std::cout << "Executed in " << ceph_clock_now() - start << std::endl; + dump_mempools(); +} + TEST_P(AllocTest, test_alloc_bench) { uint64_t capacity = uint64_t(1024) * 1024 * 1024 * 1024; @@ -225,6 +268,101 @@ TEST_P(AllocTest, test_alloc_bench) dump_mempools(); } +struct OverwriteTextContext : public Thread { + size_t idx = 0; + AllocTracker* tracker; + Allocator* alloc = nullptr; + size_t r_count = 0; + size_t a_count = 0; + uint64_t ae_count = 0; + uint64_t re_count = 0; + + uint64_t how_many = 0; + uint64_t alloc_unit = 0; + timespan r_time; + timespan a_time; + + OverwriteTextContext(size_t _idx, + AllocTracker* at, + Allocator* a, + uint64_t want, + uint64_t unit) : + idx(_idx), tracker(at), alloc(a), how_many(want), alloc_unit(unit) + { + } + + void build_histogram() { + size_t num_buckets = 8; + Allocator::FreeStateHistogram hist; + hist.resize(num_buckets); + alloc->build_free_state_histogram(alloc_unit, hist); + for (size_t i = 0; i < num_buckets; i++) { + uint64_t a_bytes = (hist[i].alloc_units * alloc_unit); + std::cout << "<=" << hist[i].get_max(i, num_buckets) + << " -> " << hist[i].total << "/" << hist[i].aligned + << " a_bytes " << a_bytes + << " " << ((float)a_bytes / alloc->get_capacity() * 100) << "%" + << std::endl; + } + } + + void* entry() override { + PExtentVector allocated, tmp; + gen_type rng(time(NULL)); + boost::uniform_int<> u1(0, 9); // 4K-2M + boost::uniform_int<> u2(0, 9); // 4K-2M + + r_time = ceph::make_timespan(0); + a_time = ceph::make_timespan(0); + + for (uint64_t i = 0; i < how_many; ) + { + uint64_t want_release = alloc_unit << u2(rng); + uint64_t released = 0; + interval_set release_set; + do { + uint64_t o = 0; + uint32_t l = 0; + if (!tracker->pop_random(rng, &o, &l, want_release - released)) { + break; + } + release_set.insert(o, l); + released += l; + } while (released < want_release); + + uint32_t want = alloc_unit << u1(rng); + tmp.clear(); + auto t0 = mono_clock::now(); + auto r = alloc->allocate(want, alloc_unit, 0, 0, &tmp); + a_count++; + ae_count += tmp.size(); + a_time += mono_clock::now() - t0; + if (r != want) { + std::cout << "Can't allocate more space, stopping." 
<< std::endl; + break; + } + i += r; + + for (auto a : tmp) { + bool full = !tracker->push(a.offset, a.length); + EXPECT_EQ(full, false); + } + { + auto t0 = mono_clock::now(); + alloc->release(release_set); + r_count++; + r_time += mono_clock::now() - t0; + re_count += release_set.num_intervals(); + } + if (0 == (i % (1 * 1024 * _1m))) { + std::cout << idx << ">> reuse " << i / 1024 / 1024 << " mb of " + << how_many / 1024 / 1024 << std::endl; + } + } + return nullptr; + } +}; + void AllocTest::doOverwriteTest(uint64_t capacity, uint64_t prefill, uint64_t overwrite) { @@ -253,6 +391,7 @@ void AllocTest::doOverwriteTest(uint64_t capacity, uint64_t prefill, i += r; for(auto a : tmp) { + bool full = !at.push(a.offset, a.length); EXPECT_EQ(full, false); } @@ -261,47 +400,311 @@ void AllocTest::doOverwriteTest(uint64_t capacity, uint64_t prefill, << cap / 1024 / 1024 << std::endl; } } - + std::cout << "Executed prefill in " << ceph_clock_now() - start << std::endl; cap = overwrite; + OverwriteTextContext ctx(0, &at, alloc.get(), cap, alloc_unit); + + start = ceph_clock_now(); + ctx.entry(); + + std::cout << "Executed in " << ceph_clock_now() - start + << " alloc:" << ctx.a_count << "/" << ctx.ae_count << " in " << ctx.a_time + << " release:" << ctx.r_count << "/" << ctx.re_count << " in " << ctx.r_time + << std::endl; + std::cout<<"Avail "<< alloc->get_free() / _1m << " MB" << std::endl; + + dump_mempools(); +} + +void AllocTest::doOverwriteMPCTest(size_t thread_count, + uint64_t capacity, uint64_t prefill, + uint64_t overwrite) +{ + uint64_t alloc_unit = 4096; + PExtentVector tmp; + std::vector at; + std::vector ctx; + + init_alloc(capacity, alloc_unit); + alloc->init_add_free(0, capacity); + + at.resize(thread_count); + ctx.resize(thread_count); + for (size_t i = 0; i < thread_count; i++) { + at[i] = new AllocTracker(capacity, alloc_unit); + ctx[i] = new OverwriteTextContext(i, at[i], alloc.get(), overwrite, alloc_unit); + } + + gen_type rng(time(NULL)); + boost::uniform_int<> u1(0, 9); // 4K-2M + + utime_t start = ceph_clock_now(); + // allocate %% + 10% of the capacity + float extra = 0.1; + auto cap = prefill * (1 + extra); + + uint64_t idx = 0; for (uint64_t i = 0; i < cap; ) { - uint64_t want_release = alloc_unit << u2(rng); + uint32_t want = alloc_unit << u1(rng); + tmp.clear(); + auto r = alloc->allocate(want, alloc_unit, 0, 0, &tmp); + if (r < want) { + break; + } + i += r; + + for (auto a : tmp) { + bool full = !at[idx]->push(a.offset, a.length); + EXPECT_EQ(full, false); + } + if (0 == (i % (1 * 1024 * _1m))) { + std::cout << "alloc " << i / 1024 / 1024 << " mb of " + << cap / 1024 / 1024 << std::endl; + } + idx = (idx + 1) % thread_count; + } + + // do release extra space to introduce some fragmentation + cap = prefill * extra; + idx = 0; + for (uint64_t i = 0; i < cap; ) + { + uint64_t want_release = alloc_unit << u1(rng); uint64_t released = 0; + interval_set release_set; do { uint64_t o = 0; uint32_t l = 0; - interval_set release_set; - if (!at.pop_random(rng, &o, &l, want_release - released)) { + if (!at[idx]->pop_random(rng, &o, &l, want_release - released)) { break; } release_set.insert(o, l); - alloc->release(release_set); released += l; } while (released < want_release); + alloc->release(release_set); + i += released; + if (0 == (i % (1 * 1024 * _1m))) { + std::cout << "release " << i / 1024 / 1024 << " mb of " + << cap / 1024 / 1024 << std::endl; + } + idx = (idx + 1) % thread_count; + } + std::cout << "Executed prefill in " << ceph_clock_now() - start + << " 
Fragmentation:" << alloc->get_fragmentation_score() + << std::endl; + ctx[0]->build_histogram(); + + start = ceph_clock_now(); + for (size_t i = 0; i < thread_count; i++) { + ctx.at(i)->create(stringify(i).c_str()); + } + + for (size_t i = 0; i < thread_count; i++) { + ctx.at(i)->join(); + } + std::cout << "Executed in " << ceph_clock_now() - start + << std::endl; + std::cout << "Avail " << alloc->get_free() / _1m << " MB" + << " Fragmentation:" << alloc->get_fragmentation_score() + << std::endl; + for (size_t i = 0; i < thread_count; i++) { + std::cout << "alloc/release stats for " << i + << " alloc:" << ctx.at(i)->a_count << "/" << ctx.at(i)->ae_count << " in " << ctx.at(i)->a_time + << " release:" << ctx.at(i)->r_count << "/" << ctx.at(i)->re_count << " in " << ctx.at(i)->r_time + << std::endl; + } + ctx[0]->build_histogram(); + dump_mempools(); + for (size_t i = 0; i < thread_count; i++) { + delete at[i]; + delete ctx[i]; + } +} +struct OverwriteTextContext2 : public OverwriteTextContext { + + using OverwriteTextContext::OverwriteTextContext; + void* entry() override { + PExtentVector allocated, tmp; + gen_type rng(time(NULL)); + boost::uniform_int<> u1(1, 16); // alloc_unit * u1 => 4K-64K + + r_time = ceph::make_timespan(0); + a_time = ceph::make_timespan(0); + uint64_t processed = 0; + auto t00 = ceph_clock_now(); + for (uint64_t i = 0; i < how_many; ) + { + int64_t want = alloc_unit * u1(rng); + int64_t released = 0; + interval_set release_set; + do { + uint64_t o = 0; + uint32_t l = 0; + if (!tracker->pop_random(rng, &o, &l, want - released)) { + break; + } + release_set.insert(o, l); + released += l; + } while (released < want); + tmp.clear(); + auto t0 = mono_clock::now(); + auto r = alloc->allocate(want, alloc_unit, 0, 0, &tmp); + a_count++; + ae_count += tmp.size(); + a_time += mono_clock::now() - t0; + if (r != want) { + std::cout << "Can't allocate more space, stopping." 
<< std::endl; + break; + } + i += r; + + for (auto a : tmp) { + bool full = !tracker->push(a.offset, a.length); + EXPECT_EQ(full, false); + } + { + auto t0 = mono_clock::now(); + alloc->release(release_set); + r_count++; + r_time += mono_clock::now() - t0; + re_count += release_set.num_intervals(); + } + auto processed0 = processed; + processed += want; + auto _1g = 1024 * _1m; + if (processed / _1g != processed0 / _1g) { + std::cout << idx << ">> reuse " << i / 1024 / 1024 << " mb of " + << how_many / 1024 / 1024 << std::endl; + + } + auto c = alloc->get_capacity(); + bool capacity_written = (processed / c) != (processed0 / c); + if (capacity_written) { + std::cout << "> Single iteration writing completed in " << (ceph_clock_now() - t00) + << " alloc/release stats for " << idx + << " alloc:" << a_count << "/" << ae_count << " in " << a_time + << " release:" << r_count << "/" << re_count << " in " << r_time + << std::endl; + a_count = 0; + ae_count = 0; + r_count = 0; + re_count = 0; + r_time = ceph::make_timespan(0); + a_time = ceph::make_timespan(0); + if (idx == 0) { + std::cout << " Fragmentation: " << alloc->get_fragmentation_score() + << std::endl; + build_histogram(); + } + t00 = ceph_clock_now(); + } + } + return nullptr; + } +}; + +void AllocTest::doOverwriteMPC2Test(size_t thread_count, + uint64_t capacity, uint64_t prefill, + uint64_t overwrite, + float extra) +{ + uint64_t alloc_unit = 4096; + PExtentVector tmp; + std::vector at; + std::vector ctx; + + init_alloc(capacity, alloc_unit); + alloc->init_add_free(0, capacity); + + at.resize(thread_count); + ctx.resize(thread_count); + for (size_t i = 0; i < thread_count; i++) { + at[i] = new AllocTracker(capacity, alloc_unit); + ctx[i] = new OverwriteTextContext2(i, at[i], alloc.get(), overwrite, alloc_unit); + } + + gen_type rng(time(NULL)); + boost::uniform_int<> u1(8, 10); // 4096 << u1 => 1-4M chunks used for prefill + boost::uniform_int<> u2(1, 512); // 4096 * u2 => 4K-2M chunks used for overwrite + + utime_t start = ceph_clock_now(); + // allocate %% + extra% of the capacity + float cap = prefill + capacity * extra; + + uint64_t idx = 0; + for (uint64_t i = 0; i < cap; ) + { uint32_t want = alloc_unit << u1(rng); tmp.clear(); auto r = alloc->allocate(want, alloc_unit, 0, 0, &tmp); - if (r != want) { - std::cout<<"Can't allocate more space, stopping."<< std::endl; + if (r < want) { break; } i += r; - for(auto a : tmp) { - bool full = !at.push(a.offset, a.length); + for (auto a : tmp) { + bool full = !at[idx]->push(a.offset, a.length); EXPECT_EQ(full, false); } - if (0 == (i % (1 * 1024 * _1m))) { - std::cout << "reuse " << i / 1024 / 1024 << " mb of " - << cap / 1024 / 1024 << std::endl; + std::cout << "alloc " << i / 1024 / 1024 << " mb of " + << cap / 1024 / 1024 << std::endl; } + idx = (idx + 1) % thread_count; } - std::cout<<"Executed in "<< ceph_clock_now() - start << std::endl; - std::cout<<"Avail "<< alloc->get_free() / _1m << " MB" << std::endl; + // do release extra space to introduce some fragmentation + cap = capacity * extra; + idx = 0; + for (uint64_t i = 0; i < (uint64_t)cap; ) + { + uint64_t want_release = alloc_unit * u2(rng); + uint64_t released = 0; + interval_set release_set; + do { + uint64_t o = 0; + uint32_t l = 0; + if (!at[idx]->pop_random(rng, &o, &l, want_release - released)) { + break; + } + release_set.insert(o, l); + released += l; + } while (released < want_release); + alloc->release(release_set); + i += released; + if (0 == (i % (1 * 1024 * _1m))) { + std::cout << "release " << i / 1024 / 1024 << 
" mb of " + << cap / 1024 / 1024 << std::endl; + } + idx = (idx + 1) % thread_count; + } + + std::cout << "Executed prefill in " << ceph_clock_now() - start + << " Fragmentation:" << alloc->get_fragmentation_score() + << std::endl; + ctx[0]->build_histogram(); + + start = ceph_clock_now(); + for (size_t i = 0; i < thread_count; i++) { + ctx[i]->create(stringify(i).c_str()); + } + + for (size_t i = 0; i < thread_count; i++) { + ctx.at(i)->join(); + } + std::cout << "Executed in " << ceph_clock_now() - start + << std::endl; + std::cout << "Avail " << alloc->get_free() / _1m << " MB" + << " Fragmentation:" << alloc->get_fragmentation_score() + << std::endl; + ctx[0]->build_histogram(); dump_mempools(); + for (size_t i = 0; i < thread_count; i++) { + delete at[i]; + delete ctx[i]; + } } TEST_P(AllocTest, test_alloc_bench_90_300) @@ -328,6 +731,84 @@ TEST_P(AllocTest, test_alloc_bench_10_300) doOverwriteTest(capacity, prefill, overwrite); } +TEST_P(AllocTest, test_alloc_bench_50_300_x2) +{ + // skipping for legacy and slow code + if ((GetParam() == string("stupid"))) { + GTEST_SKIP() << "skipping for specific allocators"; + } + uint64_t capacity = uint64_t(1024) * 1024 * 1024 * 128; + auto prefill = capacity / 2; + auto overwrite = capacity * 3; + doOverwriteMPCTest(2, capacity, prefill, overwrite); +} + +/* +* The following benchmark test simulates small block overwrites over highly +* utilized disk space prefilled with large extents. Overwrites are performed +* from two concurring threads accessing the same allocator. +* Detailed scenario: +* 1. Prefill (single threaded): +* 1.1. Fill 95% of the space with 1M-4M chunk allocations +* 1.2. Release 5% of the allcoated space using random extents of 4K-2M bytes +* 2. Random overwrite using 2 threads +* 2.1 Deallocate random sub-extent of size randomly selected within [4K-64K] range +* 2.2. Allocate random extent of size rwithin [4K-64K] range. +* 2.3. 
Repeat 2.1-2.2 until total newly allocated bytes are equal to 500% of +* the original disk space +* +* Pay attention to the resulting fragmentation score and histogram, time taken and +* bluestore_alloc mempool stats +* +*/ +TEST_P(AllocTest, test_alloc_bench2_90_500_x2) +{ + // skipping for legacy and slow code + if ((GetParam() == string("stupid"))) { + GTEST_SKIP() << "skipping for specific allocators"; + } + uint64_t capacity = uint64_t(1024) * 1024 * 1024 * 128; + auto prefill = capacity * 9 / 10; + auto overwrite = capacity * 5; + doOverwriteMPC2Test(2, capacity, prefill, overwrite); +} + +TEST_P(AllocTest, test_alloc_bench2_20_500_x2) +{ + // skipping for legacy and slow code + if ((GetParam() == string("stupid"))) { + GTEST_SKIP() << "skipping for specific allocators"; + } + uint64_t capacity = uint64_t(1024) * 1024 * 1024 * 128; + auto prefill = capacity * 2 / 10; + auto overwrite = capacity * 5; + doOverwriteMPC2Test(2, capacity, prefill, overwrite, 0.05); +} + +TEST_P(AllocTest, test_alloc_bench2_50_500_x2) +{ + // skipping for legacy and slow code + if ((GetParam() == string("stupid"))) { + GTEST_SKIP() << "skipping for specific allocators"; + } + uint64_t capacity = uint64_t(1024) * 1024 * 1024 * 128; + auto prefill = capacity * 5 / 10; + auto overwrite = capacity * 5; + doOverwriteMPC2Test(2, capacity, prefill, overwrite, 0.05); +} + +TEST_P(AllocTest, test_alloc_bench2_75_500_x2) +{ + // skipping for legacy and slow code + if ((GetParam() == string("stupid"))) { + GTEST_SKIP() << "skipping for specific allocators"; + } + uint64_t capacity = uint64_t(1024) * 1024 * 1024 * 128; + auto prefill = capacity * 75 / 100; + auto overwrite = capacity * 15; + doOverwriteMPC2Test(2, capacity, prefill, overwrite, 0.05); +} + TEST_P(AllocTest, mempoolAccounting) { uint64_t bytes = mempool::bluestore_alloc::allocated_bytes(); @@ -365,4 +846,4 @@ TEST_P(AllocTest, mempoolAccounting) INSTANTIATE_TEST_SUITE_P( Allocator, AllocTest, - ::testing::Values("stupid", "bitmap", "avl", "hybrid", "btree")); + ::testing::Values("stupid", "bitmap", "avl", "hybrid", "btree", "hybrid_btree2")); diff --git a/src/test/objectstore/Allocator_test.cc b/src/test/objectstore/Allocator_test.cc index 0e76c479002ad..0dce80d6e8f2d 100644 --- a/src/test/objectstore/Allocator_test.cc +++ b/src/test/objectstore/Allocator_test.cc @@ -33,6 +33,9 @@ public: void init_close() { alloc.reset(0); } + void dump_alloc() { + alloc->dump(); + } }; TEST_P(AllocTest, test_alloc_init) @@ -69,13 +72,14 @@ TEST_P(AllocTest, test_init_add_free) TEST_P(AllocTest, test_alloc_min_alloc) { - int64_t block_size = 1024; - int64_t capacity = 4 * 1024 * block_size; + int64_t block_size = 4096; + int64_t capacity = 1024 * block_size; { init_alloc(capacity, block_size); alloc->init_add_free(block_size, block_size); + dump_alloc(); PExtentVector extents; EXPECT_EQ(block_size, alloc->allocate(block_size, block_size, 0, (int64_t) 0, &extents)); @@ -116,9 +120,9 @@ TEST_P(AllocTest, test_alloc_min_alloc) TEST_P(AllocTest, test_alloc_min_max_alloc) { - int64_t block_size = 1024; + int64_t block_size = 4096; - int64_t capacity = 4 * 1024 * block_size; + int64_t capacity = 1024 * block_size; init_alloc(capacity, block_size); /* @@ -193,8 +197,17 @@ TEST_P(AllocTest, test_alloc_min_max_alloc) TEST_P(AllocTest, test_alloc_failure) { - int64_t block_size = 1024; - int64_t capacity = 4 * 1024 * block_size; + if (!(GetParam() == string("stupid") || + GetParam() == string("avl") || + GetParam() == string("bitmap") || + GetParam() == string("hybrid"))) { + // 
new generation allocator(s) don't care about other-than-4K alignment + // hence the test case is not applicable + GTEST_SKIP() << "skipping for 'unaligned' allocators"; + } + + int64_t block_size = 4096; + int64_t capacity = 1024 * block_size; { init_alloc(capacity, block_size); @@ -263,18 +276,18 @@ TEST_P(AllocTest, test_alloc_fragmentation) uint64_t alloc_unit = 4096; uint64_t want_size = alloc_unit; PExtentVector allocated, tmp; - + init_alloc(capacity, alloc_unit); alloc->init_add_free(0, capacity); bool bitmap_alloc = GetParam() == std::string("bitmap"); - + EXPECT_EQ(0.0, alloc->get_fragmentation()); for (size_t i = 0; i < capacity / alloc_unit; ++i) { tmp.clear(); EXPECT_EQ(static_cast(want_size), - alloc->allocate(want_size, alloc_unit, 0, 0, &tmp)); + alloc->allocate(want_size, alloc_unit, 0, 0, &tmp)); allocated.insert(allocated.end(), tmp.begin(), tmp.end()); // bitmap fragmentation calculation doesn't provide such constant @@ -286,12 +299,8 @@ TEST_P(AllocTest, test_alloc_fragmentation) tmp.clear(); EXPECT_EQ(-ENOSPC, alloc->allocate(want_size, alloc_unit, 0, 0, &tmp)); - if (GetParam() == string("avl")) { - // AVL allocator uses a different allocating strategy - GTEST_SKIP() << "skipping for AVL allocator"; - } else if (GetParam() == string("hybrid")) { - // AVL allocator uses a different allocating strategy - GTEST_SKIP() << "skipping for Hybrid allocator"; + if (!(GetParam() == string("stupid") || GetParam() == string("bitmap"))) { + GTEST_SKIP() << "skipping for specific allocators"; } for (size_t i = 0; i < allocated.size(); i += 2) @@ -575,6 +584,14 @@ TEST_P(AllocTest, test_alloc_contiguous) TEST_P(AllocTest, test_alloc_47883) { + if (!(GetParam() == string("stupid") || + GetParam() == string("avl") || + GetParam() == string("bitmap") || + GetParam() == string("hybrid"))) { + // new generation allocator(s) don't care about other-than-4K alignment + // hence the test case is not applicable + GTEST_SKIP() << "skipping for 'unaligned' allocators"; + } uint64_t block = 0x1000; uint64_t size = 1599858540544ul; @@ -656,4 +673,4 @@ TEST_P(AllocTest, test_init_rm_free_unbound) INSTANTIATE_TEST_SUITE_P( Allocator, AllocTest, - ::testing::Values("stupid", "bitmap", "avl", "hybrid", "btree")); + ::testing::Values("stupid", "bitmap", "avl", "hybrid", "btree", "hybrid_btree2")); -- 2.39.5
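
A note for anyone trying the new allocator out: the two enum hunks in global.yaml.in extend what appear to be the bluefs_allocator and bluestore_allocator value lists (the option names themselves sit outside the quoted hunk context), so after this patch an OSD can be switched over with a ceph.conf snippet along the following lines. The weight-factor line is optional; 2 is already the default introduced above.

[osd]
    bluestore_allocator = hybrid_btree2
    bluestore_btree2_alloc_weight_factor = 2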
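
For readers following the bucket arithmetic in SizeBucketCollectionTraits::_get_size_bucket(): with MyTraits = SizeBucketCollectionTraits<12, 14> (4K base, 14 buckets, and a size_factor assumed to default to 1), bucket 0 collects everything up to 4K, buckets 1..12 each cover one power-of-two range (4K << (i-1), 4K << i], and bucket 13 collects everything above 16M. Below is a minimal standalone sketch of that mapping, not the patch code itself; cbits() is re-implemented here under the assumption that it returns the number of significant bits, as Ceph's intarith.h helper does.

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstdio>

// assumed semantics: number of significant bits (cbits(0)==0, cbits(4096)==13)
static inline unsigned cbits(uint64_t v) {
  unsigned n = 0;
  while (v) { ++n; v >>= 1; }
  return n;
}

// mirrors SizeBucketCollectionTraits<12, 14> (aka MyTraits) for illustration
struct BucketTraitsSketch {
  static constexpr size_t base_bits = 12;
  static constexpr size_t base = 1ull << base_bits;        // 4K
  static constexpr size_t size_factor = 1;
  static constexpr size_t num_size_buckets = 14;
  static constexpr size_t bucket_max =
    base << (size_factor * (num_size_buckets - 2));        // 16M

  static size_t get_size_bucket(uint64_t len) {
    if (len <= base) {
      return 0;                                            // everything <= 4K
    }
    if (len > bucket_max) {
      return num_size_buckets - 1;                         // everything > 16M
    }
    size_t most_bit = cbits(len - 1) - 1;                  // index of top bit
    return 1 + (most_bit - base_bits) / size_factor;
  }
};

int main() {
  // bucket i (1..12) holds lengths in (4K << (i-1), 4K << i]
  assert(BucketTraitsSketch::get_size_bucket(4096) == 0);
  assert(BucketTraitsSketch::get_size_bucket(4097) == 1);
  assert(BucketTraitsSketch::get_size_bucket(8192) == 1);
  assert(BucketTraitsSketch::get_size_bucket(1ull << 20) == 8);    // 1M
  assert(BucketTraitsSketch::get_size_bucket(16ull << 20) == 12);  // 16M
  assert(BucketTraitsSketch::get_size_bucket((16ull << 20) + 1) == 13);
  std::puts("bucket mapping as expected");
  return 0;
}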
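
The ChunkCache fast path relies on LocklessBuf admitting at most one thread at a time without taking a mutex: an operation only proceeds when the atomic entry counter transitions 0 -> 1, and under any contention it simply reports failure so the caller drops back to the regular lock-protected tree path. A condensed sketch of that guard follows (hypothetical names, illustration only; the real class additionally sits behind ChunkCache's try_lock_shared and stores offsets shifted down by base_bits).

#include <atomic>
#include <cstddef>
#include <cstdint>
#include <vector>

class LocklessStackSketch {                  // hypothetical stand-in for LocklessBuf
  std::atomic<uint32_t> ref{0};
  size_t count = 0;
  std::vector<uint32_t> data;
public:
  explicit LocklessStackSketch(size_t capacity) : data(capacity) {}

  bool try_put(uint32_t v) {
    // succeeds only if we are the sole entrant and there is room
    bool done = ++ref == 1 && count < data.size();
    if (done) {
      data[count++] = v;                     // safe: no other thread is inside
    }
    --ref;
    return done;                             // false => caller uses the slow path
  }

  bool try_get(uint32_t& v) {
    bool done = ++ref == 1 && count > 0;
    if (done) {
      v = data[--count];
    }
    --ref;
    return done;
  }
};

int main() {
  LocklessStackSketch cache(4);
  uint32_t v = 0;
  bool stored = cache.try_put(42);           // uncontended: expected to succeed
  bool fetched = cache.try_get(v);           // expected to return 42
  return (stored && fetched && v == 42) ? 0 : 1;
}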