From: Igor Fedotov Date: Mon, 27 May 2024 15:00:05 +0000 (+0300) Subject: os/bluestore: introduce allocator lookup policy X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=01cf6db70acfd36f4a68728821403ff41399f583;p=ceph.git os/bluestore: introduce allocator lookup policy This allows having different free space lookup approaches for ssd and hdd drives. Signed-off-by: Igor Fedotov --- diff --git a/src/common/options/global.yaml.in b/src/common/options/global.yaml.in index fdafd3206e3..c7a707c3f21 100644 --- a/src/common/options/global.yaml.in +++ b/src/common/options/global.yaml.in @@ -5646,6 +5646,23 @@ options: - hdd - ssd with_legacy: true +- name: bluestore_allocator_lookup_policy + type: str + level: advanced + desc: Determines how to perform the next free extent lookup. + long_desc: When set to 'hdd_optimized' the allocator searches from the last + location found. This may facilitate contiguous disk writes. It may similarly + be beneficial for large-IU QLC SSDs to enable firmware coalescing of sub-IU + writes. + When set to 'ssd_optimized' the allocator will search from the beginning of + the device. This may facilitate SSD firmware housekeeping. + When set to 'auto' the value will be derived from the detected device type + (rotational or non-rotational). + default: auto + enum_values: + - hdd_optimized + - ssd_optimized + - auto - name: bluestore_avl_alloc_ff_max_search_count + type: uint + level: dev diff --git a/src/os/bluestore/AvlAllocator.cc b/src/os/bluestore/AvlAllocator.cc index c97a88325f6..c59a8c9c9f7 100644 --- a/src/os/bluestore/AvlAllocator.cc +++ b/src/os/bluestore/AvlAllocator.cc @@ -239,14 +239,14 @@ int64_t AvlAllocator::_allocate( uint64_t want, uint64_t unit, uint64_t max_alloc_size, - int64_t hint, // unused, for now! 
+ int64_t hint, PExtentVector* extents) { uint64_t allocated = 0; while (allocated < want) { uint64_t offset, length; int r = _allocate(std::min(max_alloc_size, want - allocated), - unit, &offset, &length); + unit, hint, &offset, &length); if (r < 0) { // Allocation failed. break; @@ -260,6 +260,7 @@ int64_t AvlAllocator::_allocate( int AvlAllocator::_allocate( uint64_t size, uint64_t unit, + int64_t hint, uint64_t *offset, uint64_t *length) { @@ -296,7 +297,9 @@ int AvlAllocator::_allocate( */ uint64_t align = size & -size; ceph_assert(align != 0); - uint64_t* cursor = &lbas[cbits(align) - 1]; + uint64_t dummy_cursor = (uint64_t)hint; + uint64_t* cursor = + hint == -1 ? &lbas[cbits(align) - 1] : &dummy_cursor; start = _pick_block_after(cursor, size, unit); dout(20) << __func__ << std::hex << " first fit params: 0x" << start << "~" << size @@ -399,7 +402,7 @@ int64_t AvlAllocator::allocate( uint64_t want, uint64_t unit, uint64_t max_alloc_size, - int64_t hint, // unused, for now! + int64_t hint, PExtentVector* extents) { ldout(cct, 10) << __func__ << std::hex diff --git a/src/os/bluestore/AvlAllocator.h b/src/os/bluestore/AvlAllocator.h index 44547842a92..1ee2a9a76a6 100644 --- a/src/os/bluestore/AvlAllocator.h +++ b/src/os/bluestore/AvlAllocator.h @@ -109,6 +109,7 @@ private: int _allocate( uint64_t size, uint64_t unit, + int64_t hint, uint64_t *offset, uint64_t *length); diff --git a/src/os/bluestore/BlueFS.cc b/src/os/bluestore/BlueFS.cc index 4972518d4b6..195dced51f3 100644 --- a/src/os/bluestore/BlueFS.cc +++ b/src/os/bluestore/BlueFS.cc @@ -4387,7 +4387,7 @@ int BlueFS::_allocate(uint8_t id, uint64_t len, ceph_assert(id < alloc.size()); int64_t alloc_len = 0; PExtentVector extents; - uint64_t hint = 0; + int64_t hint = -1; int64_t need = len; bool shared = is_shared_alloc(id); auto shared_unit = shared_alloc ? 
shared_alloc->alloc_unit : 0; @@ -4414,7 +4414,7 @@ int BlueFS::_allocate(uint8_t id, uint64_t len, need = round_up_to(len, alloc_unit); if (!node->extents.empty() && node->extents.back().bdev == id) { hint = node->extents.back().end(); - } + } ++alloc_attempts; extents.reserve(4); // 4 should be (more than) enough for most allocations auto t0 = mono_clock::now(); diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc index 5b6ff47ef15..5e8d245929b 100644 --- a/src/os/bluestore/BlueStore.cc +++ b/src/os/bluestore/BlueStore.cc @@ -5742,7 +5742,8 @@ std::vector BlueStore::get_tracked_keys() const noexcept "bluestore_warn_on_no_per_pool_omap"s, "bluestore_warn_on_no_per_pg_omap"s, "bluestore_max_defer_interval"s, - "bluestore_onode_segment_size"s + "bluestore_onode_segment_size"s, + "bluestore_allocator_lookup_policy"s }; } @@ -5814,6 +5815,9 @@ void BlueStore::handle_conf_change(const ConfigProxy& conf, changed.count("osd_memory_expected_fragmentation")) { _update_osd_memory_options(); } + if (changed.count("bluestore_allocator_lookup_policy")) { + _update_allocator_lookup_policy(); + } } void BlueStore::_set_compression() @@ -5948,6 +5952,24 @@ void BlueStore::_update_osd_memory_options() << dendl; } + +void BlueStore::_update_allocator_lookup_policy() +{ + auto policy = cct->_conf.get_val("bluestore_allocator_lookup_policy"); + if (policy == "hdd_optimized") { + use_last_allocator_lookup_position = true; + } else if (policy == "ssd_optimized") { + use_last_allocator_lookup_position = false; + } else { + // Apply "auto" policy for everything else. + // Which means reusing last lookup position for hdds. 
+ use_last_allocator_lookup_position = _use_rotational_settings(); + } + dout(5) << __func__ + << " use_last_lookup_position " << use_last_allocator_lookup_position + << dendl; +} + int BlueStore::_set_cache_sizes() { ceph_assert(bdev); @@ -11240,7 +11262,7 @@ int BlueStore::_fsck_on_open(BlueStore::FSCKDepth depth, bool repair) dout(5) << __func__ << "::NCB::(F)alloc=" << alloc << ", length=" << e->length << dendl; int64_t alloc_len = alloc->allocate(e->length, min_alloc_size, - 0, 0, &exts); + 0, -1, &exts); if (alloc_len < 0 || alloc_len < (int64_t)e->length) { derr << __func__ << " failed to allocate 0x" << std::hex << e->length @@ -11694,7 +11716,7 @@ void BlueStore::inject_leaked(uint64_t len) { PExtentVector exts; int64_t alloc_len = alloc->allocate(len, min_alloc_size, - min_alloc_size * 256, 0, &exts); + min_alloc_size * 256, -1, &exts); ceph_assert(alloc_len >= 0); // generally we do not expect any errors if (fm->is_null_manager()) { return; @@ -14087,6 +14109,7 @@ int BlueStore::_open_super_meta() _set_csum(); _set_compression(); _set_blob_size(); + _update_allocator_lookup_policy(); _validate_bdev(); return 0; @@ -17019,7 +17042,8 @@ int BlueStore::_do_alloc_write( auto start = mono_clock::now(); prealloc_left = alloc->allocate( need, min_alloc_size, need, - 0, &prealloc); + use_last_allocator_lookup_position ? 
-1 : 0, + &prealloc); log_latency("allocator@_do_alloc_write", l_bluestore_allocator_lat, mono_clock::now() - start, diff --git a/src/os/bluestore/BlueStore.h b/src/os/bluestore/BlueStore.h index a95dbca34c0..81be4617919 100644 --- a/src/os/bluestore/BlueStore.h +++ b/src/os/bluestore/BlueStore.h @@ -2536,6 +2536,8 @@ private: friend class SocketHook; AdminSocketHook* asok_hook = nullptr; + bool use_last_allocator_lookup_position = true; + struct MempoolThread : public Thread { public: BlueStore *store; @@ -2793,6 +2795,7 @@ private: void _set_finisher_num(); void _set_per_pool_omap(); void _update_osd_memory_options(); + void _update_allocator_lookup_policy(); int _open_bdev(bool create); // Verifies if disk space is enough for reserved + min bluefs diff --git a/src/os/bluestore/Btree2Allocator.cc b/src/os/bluestore/Btree2Allocator.cc index 98cdeade2c7..3425b49ba29 100644 --- a/src/os/bluestore/Btree2Allocator.cc +++ b/src/os/bluestore/Btree2Allocator.cc @@ -65,7 +65,7 @@ int64_t Btree2Allocator::allocate( uint64_t want, uint64_t unit, uint64_t max_alloc_size, - int64_t hint, // unused, for now! + int64_t hint, // unused and likely unneeded PExtentVector* extents) { ldout(cct, 10) << __func__ << std::hex @@ -182,7 +182,7 @@ int64_t Btree2Allocator::_allocate( uint64_t want, uint64_t unit, uint64_t max_alloc_size, - int64_t hint, // unused, for now! + int64_t hint, // unused and likely unneeded PExtentVector* extents) { uint64_t allocated = 0; diff --git a/src/os/bluestore/BtreeAllocator.cc b/src/os/bluestore/BtreeAllocator.cc index 89eb1d3f897..acf664e696c 100644 --- a/src/os/bluestore/BtreeAllocator.cc +++ b/src/os/bluestore/BtreeAllocator.cc @@ -226,14 +226,14 @@ int64_t BtreeAllocator::_allocate( uint64_t want, uint64_t unit, uint64_t max_alloc_size, - int64_t hint, // unused, for now! 
+ int64_t hint, PExtentVector* extents) { uint64_t allocated = 0; while (allocated < want) { uint64_t offset, length; int r = _allocate(std::min(max_alloc_size, want - allocated), - unit, &offset, &length); + unit, hint, &offset, &length); if (r < 0) { // Allocation failed. break; @@ -248,6 +248,7 @@ int64_t BtreeAllocator::_allocate( int BtreeAllocator::_allocate( uint64_t size, uint64_t unit, + int64_t hint, uint64_t *offset, uint64_t *length) { @@ -294,7 +295,8 @@ int BtreeAllocator::_allocate( * not guarantee that other allocations sizes may exist in the same * region. */ - uint64_t* cursor = &lbas[cbits(size) - 1]; + uint64_t dummy_cursor = (uint64_t)hint; + uint64_t* cursor = hint == -1 ? &lbas[cbits(size) - 1] : &dummy_cursor; start = _pick_block_after(cursor, size, unit); dout(20) << __func__ << " first fit=" << start << " size=" << size << dendl; if (start != uint64_t(-1ULL)) { @@ -376,7 +378,7 @@ int64_t BtreeAllocator::allocate( uint64_t want, uint64_t unit, uint64_t max_alloc_size, - int64_t hint, // unused, for now! 
+ int64_t hint, PExtentVector* extents) { ldout(cct, 10) << __func__ << std::hex diff --git a/src/os/bluestore/BtreeAllocator.h b/src/os/bluestore/BtreeAllocator.h index dc5c1d49f7a..93b68506a7c 100644 --- a/src/os/bluestore/BtreeAllocator.h +++ b/src/os/bluestore/BtreeAllocator.h @@ -103,6 +103,7 @@ private: int _allocate( uint64_t size, uint64_t unit, + int64_t hint, uint64_t *offset, uint64_t *length); diff --git a/src/os/bluestore/HybridAllocator.cc b/src/os/bluestore/HybridAllocator.cc index e99e8bf932e..ce0b039558a 100644 --- a/src/os/bluestore/HybridAllocator.cc +++ b/src/os/bluestore/HybridAllocator.cc @@ -32,7 +32,7 @@ int64_t HybridBtree2Allocator::allocate( uint64_t want, uint64_t unit, uint64_t max_alloc_size, - int64_t hint, + int64_t hint, // unused and likely unneeded for btree2 allocator PExtentVector* extents) { ldout(get_context(), 10) << __func__ << std::hex diff --git a/src/os/bluestore/StupidAllocator.cc b/src/os/bluestore/StupidAllocator.cc index 27bf9e5b80b..ef6bf229c64 100644 --- a/src/os/bluestore/StupidAllocator.cc +++ b/src/os/bluestore/StupidAllocator.cc @@ -67,59 +67,59 @@ int64_t StupidAllocator::allocate_int( auto p = free[0].begin(); - if (!hint) + if (hint < 0) hint = last_alloc; // search up (from hint) - if (hint) { - for (bin = orig_bin; bin < (int)free.size(); ++bin) { - p = free[bin].lower_bound(hint); - while (p != free[bin].end()) { - if (p.get_len() >= want_size) { - goto found; - } - ++p; - } - } - } - - // search up (from origin, and skip searched extents by hint) for (bin = orig_bin; bin < (int)free.size(); ++bin) { - p = free[bin].begin(); - auto end = hint ? 
free[bin].lower_bound(hint) : free[bin].end(); - while (p != end) { + p = free[bin].lower_bound(hint); + while (p != free[bin].end()) { if (p.get_len() >= want_size) { - goto found; + goto found; } ++p; } } - // search down (hint) + // search up (from origin, and skip searched extents by hint) if (hint) { - for (bin = orig_bin; bin >= 0; --bin) { - p = free[bin].lower_bound(hint); - while (p != free[bin].end()) { - if (p.get_len() >= alloc_unit) { + for (bin = orig_bin; bin < (int)free.size(); ++bin) { + p = free[bin].begin(); + auto end = free[bin].lower_bound(hint); + while (p != end) { + if (p.get_len() >= want_size) { goto found; - } - ++p; + } + ++p; } } } - // search down (from origin, and skip searched extents by hint) + // search down (hint) for (bin = orig_bin; bin >= 0; --bin) { - p = free[bin].begin(); - auto end = hint ? free[bin].lower_bound(hint) : free[bin].end(); - while (p != end) { + p = free[bin].lower_bound(hint); + while (p != free[bin].end()) { if (p.get_len() >= alloc_unit) { - goto found; + goto found; } ++p; } } + // search down (from origin, and skip searched extents by hint) + if (hint) { + for (bin = orig_bin; bin >= 0; --bin) { + p = free[bin].begin(); + auto end = free[bin].lower_bound(hint); + while (p != end) { + if (p.get_len() >= alloc_unit) { + goto found; + } + ++p; + } + } + } + return -ENOSPC; found: diff --git a/src/os/bluestore/fastbmap_allocator_impl.h b/src/os/bluestore/fastbmap_allocator_impl.h index 550214b62a8..8293e349999 100644 --- a/src/os/bluestore/fastbmap_allocator_impl.h +++ b/src/os/bluestore/fastbmap_allocator_impl.h @@ -699,7 +699,7 @@ protected: void _allocate_l2(uint64_t length, uint64_t min_length, uint64_t max_length, - uint64_t hint, + int64_t hint, uint64_t* allocated, interval_vector_t* res) @@ -724,7 +724,7 @@ protected: if (available < min_length) { return; } - if (hint != 0) { + if (hint != -1) { last_pos = (hint / (d * l2_granularity)) < l2.size() ? 
p2align(hint / l2_granularity, d) : 0; } auto l2_pos = last_pos; diff --git a/src/test/objectstore/Allocator_bench.cc b/src/test/objectstore/Allocator_bench.cc index d557f6168cc..5a4aa391192 100644 --- a/src/test/objectstore/Allocator_bench.cc +++ b/src/test/objectstore/Allocator_bench.cc @@ -158,7 +158,7 @@ TEST_P(AllocTest, test_alloc_bench_seq) { tmp.clear(); EXPECT_EQ(static_cast(want_size), - alloc->allocate(want_size, alloc_unit, 0, 0, &tmp)); + alloc->allocate(want_size, alloc_unit, 0, -1, &tmp)); if (0 == (i % (1 * 1024 * _1m))) { std::cout << "alloc " << i / 1024 / 1024 << " mb of " << capacity / 1024 / 1024 << std::endl; @@ -236,7 +236,7 @@ TEST_P(AllocTest, test_alloc_bench) uint32_t want = alloc_unit << u1(rng); tmp.clear(); - auto r = alloc->allocate(want, alloc_unit, 0, 0, &tmp); + auto r = alloc->allocate(want, alloc_unit, 0, -1, &tmp); if (r < want) { break; } @@ -390,7 +390,7 @@ void AllocTest::doOverwriteTest(uint64_t capacity, uint64_t prefill, { uint32_t want = alloc_unit << u1(rng); tmp.clear(); - auto r = alloc->allocate(want, alloc_unit, 0, 0, &tmp); + auto r = alloc->allocate(want, alloc_unit, 0, -1, &tmp); if (r < want) { break; } @@ -830,10 +830,10 @@ TEST_P(AllocTest, mempoolAccounting) std::map all_allocs; for (size_t i = 0; i < 10000; i++) { PExtentVector tmp; - alloc->allocate(alloc_size, alloc_size, 0, 0, &tmp); + alloc->allocate(alloc_size, alloc_size, 0, -1, &tmp); all_allocs[rand()] = tmp; tmp.clear(); - alloc->allocate(alloc_size, alloc_size, 0, 0, &tmp); + alloc->allocate(alloc_size, alloc_size, 0, -1, &tmp); all_allocs[rand()] = tmp; tmp.clear(); diff --git a/src/test/objectstore/Allocator_test.cc b/src/test/objectstore/Allocator_test.cc index 1a66303add3..34dcfbcfc9c 100644 --- a/src/test/objectstore/Allocator_test.cc +++ b/src/test/objectstore/Allocator_test.cc @@ -82,7 +82,7 @@ TEST_P(AllocTest, test_alloc_min_alloc) dump_alloc(); PExtentVector extents; EXPECT_EQ(block_size, alloc->allocate(block_size, block_size, - 0, 
(int64_t) 0, &extents)); + 0, (int64_t) -1, &extents)); } /* @@ -94,7 +94,7 @@ TEST_P(AllocTest, test_alloc_min_alloc) PExtentVector extents; EXPECT_EQ(4*block_size, alloc->allocate(4 * (uint64_t)block_size, (uint64_t) block_size, - 0, (int64_t) 0, &extents)); + 0, (int64_t) -1, &extents)); EXPECT_EQ(1u, extents.size()); EXPECT_EQ(extents[0].length, 4 * block_size); } @@ -110,7 +110,7 @@ TEST_P(AllocTest, test_alloc_min_alloc) EXPECT_EQ(4*block_size, alloc->allocate(4 * (uint64_t)block_size, (uint64_t) block_size, - 0, (int64_t) 0, &extents)); + 0, (int64_t) -1, &extents)); EXPECT_EQ(2u, extents.size()); EXPECT_EQ(extents[0].length, 2 * block_size); EXPECT_EQ(extents[1].length, 2 * block_size); @@ -135,7 +135,7 @@ TEST_P(AllocTest, test_alloc_min_max_alloc) PExtentVector extents; EXPECT_EQ(4*block_size, alloc->allocate(4 * (uint64_t)block_size, (uint64_t) block_size, - block_size, (int64_t) 0, &extents)); + block_size, (int64_t) -1, &extents)); for (auto e : extents) { EXPECT_EQ(e.length, block_size); } @@ -153,7 +153,7 @@ TEST_P(AllocTest, test_alloc_min_max_alloc) PExtentVector extents; EXPECT_EQ(4*block_size, alloc->allocate(4 * (uint64_t)block_size, (uint64_t) block_size, - 2 * block_size, (int64_t) 0, &extents)); + 2 * block_size, (int64_t) -1, &extents)); EXPECT_EQ(2u, extents.size()); for (auto& e : extents) { EXPECT_EQ(e.length, block_size * 2); @@ -170,7 +170,7 @@ TEST_P(AllocTest, test_alloc_min_max_alloc) EXPECT_EQ(1024 * block_size, alloc->allocate(1024 * (uint64_t)block_size, (uint64_t) block_size * 4, - block_size * 4, (int64_t) 0, &extents)); + block_size * 4, (int64_t) -1, &extents)); for (auto& e : extents) { EXPECT_EQ(e.length, block_size * 4); } @@ -186,7 +186,7 @@ TEST_P(AllocTest, test_alloc_min_max_alloc) PExtentVector extents; EXPECT_EQ(16 * block_size, alloc->allocate(16 * (uint64_t)block_size, (uint64_t) block_size, - 2 * block_size, (int64_t) 0, &extents)); + 2 * block_size, (int64_t) -1, &extents)); EXPECT_EQ(extents.size(), 8u); for 
(auto& e : extents) { @@ -218,14 +218,14 @@ TEST_P(AllocTest, test_alloc_failure) EXPECT_EQ(512 * block_size, alloc->allocate(512 * (uint64_t)block_size, (uint64_t) block_size * 256, - block_size * 256, (int64_t) 0, &extents)); + block_size * 256, (int64_t) -1, &extents)); alloc->init_add_free(0, block_size * 256); alloc->init_add_free(block_size * 512, block_size * 256); extents.clear(); EXPECT_EQ(-ENOSPC, alloc->allocate(512 * (uint64_t)block_size, (uint64_t) block_size * 512, - block_size * 512, (int64_t) 0, &extents)); + block_size * 512, (int64_t) -1, &extents)); } } @@ -240,7 +240,7 @@ TEST_P(AllocTest, test_alloc_big) cout << big << std::endl; PExtentVector extents; EXPECT_EQ(big, - alloc->allocate(big, mas, 0, &extents)); + alloc->allocate(big, mas, -1, &extents)); } } @@ -257,7 +257,7 @@ TEST_P(AllocTest, test_alloc_non_aligned_len) alloc->init_add_free(3670016, 2097152); PExtentVector extents; - EXPECT_EQ(want_size, alloc->allocate(want_size, alloc_unit, 0, &extents)); + EXPECT_EQ(want_size, alloc->allocate(want_size, alloc_unit, -1, &extents)); } TEST_P(AllocTest, test_alloc_39334) @@ -287,7 +287,7 @@ TEST_P(AllocTest, test_alloc_fragmentation) { tmp.clear(); EXPECT_EQ(static_cast(want_size), - alloc->allocate(want_size, alloc_unit, 0, 0, &tmp)); + alloc->allocate(want_size, alloc_unit, 0, -1, &tmp)); allocated.insert(allocated.end(), tmp.begin(), tmp.end()); // bitmap fragmentation calculation doesn't provide such constant @@ -297,7 +297,7 @@ TEST_P(AllocTest, test_alloc_fragmentation) } } tmp.clear(); - EXPECT_EQ(-ENOSPC, alloc->allocate(want_size, alloc_unit, 0, 0, &tmp)); + EXPECT_EQ(-ENOSPC, alloc->allocate(want_size, alloc_unit, 0, -1, &tmp)); if (!(GetParam() == string("stupid") || GetParam() == string("bitmap"))) { GTEST_SKIP() << "skipping for specific allocators"; @@ -441,7 +441,7 @@ TEST_P(AllocTest, test_dump_fragmentation_score) //allocate want_size = ( rng() % one_alloc_max ) / alloc_unit * alloc_unit + alloc_unit; tmp.clear(); - int64_t r 
= alloc->allocate(want_size, alloc_unit, 0, 0, &tmp); + int64_t r = alloc->allocate(want_size, alloc_unit, 0, -1, &tmp); if (r > 0) { for (auto& t: tmp) { if (t.length > 0) @@ -506,7 +506,7 @@ TEST_P(AllocTest, test_alloc_bug_24598) alloc->init_add_free(0x4b00000, 0x200000); EXPECT_EQ(static_cast(want_size), - alloc->allocate(want_size, 0x100000, 0, 0, &tmp)); + alloc->allocate(want_size, 0x100000, 0, -1, &tmp)); EXPECT_EQ(1u, tmp.size()); EXPECT_EQ(0x4b00000u, tmp[0].offset); EXPECT_EQ(0x200000u, tmp[0].length); @@ -526,11 +526,11 @@ TEST_P(AllocTest, test_alloc_big2) PExtentVector extents; uint64_t need = block_size * blocks / 4; // 2GB EXPECT_EQ(need, - alloc->allocate(need, mas, 0, &extents)); + alloc->allocate(need, mas, -1, &extents)); need = block_size * blocks / 4; // 2GB extents.clear(); EXPECT_EQ(need, - alloc->allocate(need, mas, 0, &extents)); + alloc->allocate(need, mas, -1, &extents)); EXPECT_TRUE(extents[0].length > 0); } @@ -548,7 +548,7 @@ TEST_P(AllocTest, test_alloc_big3) PExtentVector extents; uint64_t need = block_size * blocks / 2; // 4GB EXPECT_EQ(need, - alloc->allocate(need, mas, 0, &extents)); + alloc->allocate(need, mas, -1, &extents)); EXPECT_TRUE(extents[0].length > 0); } @@ -565,7 +565,7 @@ TEST_P(AllocTest, test_alloc_contiguous) uint64_t need = 4 * block_size; EXPECT_EQ(need, alloc->allocate(need, need, - 0, (int64_t)0, &extents)); + 0, (int64_t)-1, &extents)); EXPECT_EQ(1u, extents.size()); EXPECT_EQ(extents[0].offset, 0); EXPECT_EQ(extents[0].length, 4 * block_size); @@ -573,7 +573,7 @@ TEST_P(AllocTest, test_alloc_contiguous) extents.clear(); EXPECT_EQ(need, alloc->allocate(need, need, - 0, (int64_t)0, &extents)); + 0, (int64_t)-1, &extents)); EXPECT_EQ(1u, extents.size()); EXPECT_EQ(extents[0].offset, 4 * block_size); EXPECT_EQ(extents[0].length, 4 * block_size); @@ -603,7 +603,7 @@ TEST_P(AllocTest, test_alloc_47883) PExtentVector extents; auto need = 0x3f980000; - auto got = alloc->allocate(need, 0x10000, 0, (int64_t)0, 
&extents); + auto got = alloc->allocate(need, 0x10000, 0, (int64_t)-1, &extents); EXPECT_GE(got, 0x630000); } @@ -623,7 +623,7 @@ TEST_P(AllocTest, test_alloc_50656_best_fit) PExtentVector extents; auto need = 0x400000; - auto got = alloc->allocate(need, 0x10000, 0, (int64_t)0, &extents); + auto got = alloc->allocate(need, 0x10000, 0, (int64_t)-1, &extents); EXPECT_GT(got, 0); EXPECT_EQ(got, 0x400000); } @@ -643,7 +643,7 @@ TEST_P(AllocTest, test_alloc_50656_first_fit) PExtentVector extents; auto need = 0x400000; - auto got = alloc->allocate(need, 0x10000, 0, (int64_t)0, &extents); + auto got = alloc->allocate(need, 0x10000, 0, (int64_t)-1, &extents); EXPECT_GT(got, 0); EXPECT_EQ(got, 0x400000); } @@ -670,6 +670,130 @@ TEST_P(AllocTest, test_init_rm_free_unbound) } } +TEST_P(AllocTest, test_alloc_spatial_locality) +{ + if (GetParam() == string("hybrid_btree2")) { + // new generation allocator doesn't support legacy + // spatial locality approach. Which is being able to start searching + // free extent from the previous lookup success (or externally hinted) + // position. + // This looks generally useless on a fragmented volume, it might cause + // excessive fragmentation and misguide PTL level on SSD drives. + // Hence the test case is not applicable. 
+ GTEST_SKIP() << "skipping for hybrid_btree2 allocator"; + } + + int64_t block_size = 0x1000; + int64_t capacity = 128 * (1ull << 30); // 128GB + + // do allocations with no hint provided hence enabling internal spatial locality + { + init_alloc(capacity, block_size); + + alloc->init_add_free(0, capacity); + + PExtentVector extents1; + PExtentVector extents2; + PExtentVector extents3; + + uint64_t need = 0x1000; + EXPECT_EQ(need, + alloc->allocate(need, need, + 0, (int64_t)-1, &extents1)); + EXPECT_EQ(1u, extents1.size()); + EXPECT_EQ(extents1[0].offset, 0); + EXPECT_EQ(extents1[0].length, need); + + // mark a large extent allocated + // to work around bitmap cursor tracking which uses + // l2 granularity (equal to 32GB for 4K unit). + uint64_t skip; + if (GetParam() == string("bitmap")) { + skip = 32 * (1ull << 30) - need; + alloc->init_rm_free(need, skip); + } else { + skip = 0; + } + + EXPECT_EQ(need, + alloc->allocate(need, need, + 0, (int64_t)-1, &extents2)); + EXPECT_EQ(1u, extents2.size()); + EXPECT_EQ(extents2[0].offset, need + skip); + EXPECT_EQ(extents2[0].length, need); + + { + // now release the very first 4K extent + interval_set release_set; + release_set.insert(extents1[0].offset, block_size); + alloc->release(release_set); + } + // and now allocate once again, this will get the following LBA, + // not zero one + EXPECT_EQ(need, + alloc->allocate(need, need, + 0, (int64_t)-1, &extents3)); + EXPECT_EQ(1u, extents3.size()); + EXPECT_EQ(extents3[0].offset, need + need + skip); + EXPECT_EQ(extents3[0].length, need); + alloc->shutdown(); + } + + // do allocations with zero hint provided hence disabling internal spatial locality + { + init_alloc(capacity, block_size); + + alloc->init_add_free(0, capacity); + + PExtentVector extents1; + PExtentVector extents2; + PExtentVector extents3; + + uint64_t need = 0x1000; + EXPECT_EQ(need, + alloc->allocate(need, need, + 0, (int64_t)0, &extents1)); + EXPECT_EQ(1u, extents1.size()); + EXPECT_EQ(extents1[0].offset, 
0); + EXPECT_EQ(extents1[0].length, need); + + // mark a large extent allocated + // to work around bitmap cursor tracking which uses + // l2 granularity (equal to 32GB for 4K unit). + uint64_t skip; + if (GetParam() == string("bitmap")) { + skip = 32 * (1ull << 30) - need; + alloc->init_rm_free(need, skip); + } else { + skip = 0; + } + + EXPECT_EQ(need, + alloc->allocate(need, need, + 0, (int64_t)0, &extents2)); + EXPECT_EQ(1u, extents2.size()); + EXPECT_EQ(extents2[0].offset, need + skip); + EXPECT_EQ(extents2[0].length, need); + + { + // now release the very first 4K extent + interval_set release_set; + release_set.insert(extents1[0].offset, block_size); + alloc->release(release_set); + } + // and allocate once again, this will get the extent at LBA = 0 + // which just has been released + EXPECT_EQ(need, + alloc->allocate(need, need, + 0, (int64_t)0, &extents3)); + EXPECT_EQ(1u, extents3.size()); + EXPECT_EQ(extents3[0].offset, 0); + EXPECT_EQ(extents3[0].length, need); + alloc->shutdown(); + } +} + + INSTANTIATE_TEST_SUITE_P( Allocator, AllocTest, diff --git a/src/test/objectstore/allocator_replay_test.cc b/src/test/objectstore/allocator_replay_test.cc index b76fd7f7514..c98e854ab83 100644 --- a/src/test/objectstore/allocator_replay_test.cc +++ b/src/test/objectstore/allocator_replay_test.cc @@ -185,7 +185,7 @@ int replay_and_check_for_duplicate(char* fname) return -1; } tmp.clear(); - auto allocated = alloc->allocate(want, alloc_unit, 0, 0, &tmp); + auto allocated = alloc->allocate(want, alloc_unit, 0, -1, &tmp); std::cout << "allocated TOTAL: " << allocated << std::endl; for (auto& ee : tmp) { std::cerr << "dump extent: " << std::hex @@ -628,7 +628,7 @@ int main(int argc, char **argv) for(size_t i = 0; i < count; i++) { extents.clear(); auto t0 = ceph::mono_clock::now(); - auto r = a->allocate(want, alloc_unit, 0, &extents); + auto r = a->allocate(want, alloc_unit, -1, &extents); std::cout << "Duration (ns): " << (ceph::mono_clock::now() - t0).count() << 
std::endl; if (r < 0) { std::cerr << "Error: allocation failure at step:" << i + 1 @@ -695,7 +695,8 @@ int main(int argc, char **argv) for (auto i = 0; i < replay_count; ++i) { while (fgets(s, sizeof(s), f_alloc_list) != nullptr) { /* parse allocation request */ - uint64_t want = 0, unit = 0, max = 0, hint = 0; + uint64_t want = 0, unit = 0, max = 0; + int64_t hint = -1; if (std::sscanf(s, "%ji %ji %ji %ji", &want, &unit, &max, &hint) < 2) { diff --git a/src/test/objectstore/hybrid_allocator_test.cc b/src/test/objectstore/hybrid_allocator_test.cc index f103b3644d6..77397382698 100755 --- a/src/test/objectstore/hybrid_allocator_test.cc +++ b/src/test/objectstore/hybrid_allocator_test.cc @@ -113,7 +113,7 @@ TEST(HybridAllocator, basic) PExtentVector extents; // allocate 4K, to be served from bitmap EXPECT_EQ(block_size, ha.allocate(block_size, block_size, - 0, (int64_t)0, &extents)); + 0, (int64_t)-1, &extents)); ASSERT_EQ(1, extents.size()); ASSERT_EQ(0, extents[0].offset); @@ -253,7 +253,7 @@ TEST(HybridAllocator, basic) // allocate 12M using 2M chunks. 10M to be returned PExtentVector extents; EXPECT_EQ(10 * _1m, ha.allocate(12 * _1m, 2 * _1m, - 0, (int64_t)0, &extents)); + 0, (int64_t)-1, &extents)); // release everything allocated for (auto& e : extents) { diff --git a/src/vstart.sh b/src/vstart.sh index adeeab9bb9a..a50389fcf4d 100755 --- a/src/vstart.sh +++ b/src/vstart.sh @@ -1003,6 +1003,9 @@ $DAEMONOPTS bluestore fsck on mount = true bluestore block create = true + bluestore allocator = bitmap + bluestore alloc favor spatial locality = false + $BLUESTORE_OPTS ; kstore