From: Igor Fedotov Date: Thu, 26 Mar 2026 21:35:21 +0000 (+0300) Subject: os/bluestore: print effective extra in 'bluefs stats' report X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=1fa1b3ed7e169eaa92b8aa1d38bb6599ec45adf4;p=ceph.git os/bluestore: print effective extra in 'bluefs stats' report I.e. the one which takes historic maximum into account. Signed-off-by: Igor Fedotov --- diff --git a/src/os/bluestore/BlueFS.cc b/src/os/bluestore/BlueFS.cc index 7bfd975f6675..19032f5ecbce 100644 --- a/src/os/bluestore/BlueFS.cc +++ b/src/os/bluestore/BlueFS.cc @@ -5425,6 +5425,27 @@ void FitToFastVolumeSelector::get_paths(const std::string& base, paths& res) con res.emplace_back(base, 1); // size of the last db_path has no effect } +uint64_t RocksDBBlueFSVolumeSelector::get_effective_extra() const { + // considering statically available db space vs. + // - observed maximums on DB dev for DB/WAL/UNSORTED data + // - observed maximum spillovers + + // max db usage we potentially observed + uint64_t max_db_use = 0; + max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_DB, LEVEL_LOG - LEVEL_FIRST); + max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_DB, LEVEL_WAL - LEVEL_FIRST); + max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_DB, LEVEL_DB - LEVEL_FIRST); + // this could go to db hence using it in the estimation + max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_SLOW, LEVEL_DB - LEVEL_FIRST); + + auto db_total = l_totals[LEVEL_DB - LEVEL_FIRST]; + uint64_t avail = std::min( + db_avail4slow, + max_db_use < db_total ? db_total - max_db_use : 0); + + return avail; +} + uint8_t RocksDBBlueFSVolumeSelector::select_prefer_bdev(void* h) { ceph_assert(h != nullptr); uint64_t hint = reinterpret_cast<uint64_t>(h); @@ -5433,21 +5454,8 @@ uint8_t RocksDBBlueFSVolumeSelector::select_prefer_bdev(void* h) { case LEVEL_SLOW: res = BlueFS::BDEV_SLOW; if (db_avail4slow > 0) { - // considering statically available db space vs. 
- // - observed maximums on DB dev for DB/WAL/UNSORTED data - // - observed maximum spillovers - uint64_t max_db_use = 0; // max db usage we potentially observed - max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_DB, LEVEL_LOG - LEVEL_FIRST); - max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_DB, LEVEL_WAL - LEVEL_FIRST); - max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_DB, LEVEL_DB - LEVEL_FIRST); - // this could go to db hence using it in the estimation - max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_SLOW, LEVEL_DB - LEVEL_FIRST); - - auto db_total = l_totals[LEVEL_DB - LEVEL_FIRST]; - uint64_t avail = std::min( - db_avail4slow, - max_db_use < db_total ? db_total - max_db_use : 0); + auto avail = get_effective_extra(); // considering current DB dev usage for SLOW data if (avail > per_level_per_dev_usage.at(BlueFS::BDEV_DB, LEVEL_SLOW - LEVEL_FIRST)) { res = BlueFS::BDEV_DB; @@ -5504,8 +5512,9 @@ void RocksDBBlueFSVolumeSelector::dump(ostream& sout) { auto max_y = per_level_per_dev_usage.get_max_y(); sout << "RocksDBBlueFSVolumeSelector " << std::endl; - sout << ">>Settings<<" - << " extra=" << byte_u_t(db_avail4slow) + sout << ">>Parameters<<" + << " effective extra=" << byte_u_t(get_effective_extra()) + << " max extra=" << byte_u_t(db_avail4slow) << ", extra level=" << extra_level << ", l0_size=" << byte_u_t(level0_size) << ", l_base=" << byte_u_t(level_base) diff --git a/src/os/bluestore/BlueFS.h b/src/os/bluestore/BlueFS.h index 450d6766acba..7295fd47869b 100644 --- a/src/os/bluestore/BlueFS.h +++ b/src/os/bluestore/BlueFS.h @@ -1152,12 +1152,27 @@ public: } } - uint64_t get_available_extra() const { - return db_avail4slow; - } + // Returns a static value (based on disk layout and store's settings) + // of the first RocksDB level which doesn't fully fit into "fast" volume. 
+ // So with the selector's help data belonging to this level could be + // [partially] allocated at that volume if get_*_extra() methods below + // indicate non-zero values. uint64_t get_extra_level() const { return extra_level; } + // Returns a static value (based on disk layout and store's settings) + // of the extra space at DB volume which + // this volume selector permits data from "slow" levels to use. + // Historical usage maximums are NOT taken into account here, + // see get_effective_extra() for that. + uint64_t get_max_extra() const { + return db_avail4slow; + } + // Calculates the effective amount of extra space at "fast" volume which + // this volume selector permits data from "slow" levels to use. + // Takes both the maximum historical observations and the value from + // get_max_extra() into account to get the final result. + uint64_t get_effective_extra() const; void* get_hint_for_log() const override { return reinterpret_cast<void*>(LEVEL_LOG); } diff --git a/src/test/objectstore/test_bluestore_vselector.cc b/src/test/objectstore/test_bluestore_vselector.cc index 274318b0dd37..1fb20ba78a17 100644 --- a/src/test/objectstore/test_bluestore_vselector.cc +++ b/src/test/objectstore/test_bluestore_vselector.cc @@ -34,7 +34,8 @@ TEST(rocksdb_bluefs_vselector, basic) { bluefs_extent_t e; ASSERT_EQ(4, selector.get_extra_level()); - ASSERT_EQ(30ull << 30, selector.get_available_extra()); // 168GB - 1GB (L0) - 1GB (L1) - 8GB (L2) - 2*64GB (L3) + ASSERT_EQ(30ull << 30, selector.get_max_extra()); // 168GB - 1GB (L0) - 1GB (L1) - 8GB (L2) - 2*64GB (L3) + ASSERT_EQ(30ull << 30, selector.get_effective_extra()); // no history so we get max extra ASSERT_EQ(0, selector.select_prefer_bdev((void*)log_bdev)); ASSERT_EQ(0, selector.select_prefer_bdev((void*)wal_bdev)); @@ -46,13 +47,16 @@ TEST(rocksdb_bluefs_vselector, basic) { e.length = 1ull * (1 << 30); selector.add_usage((void*)db_bdev, e); } + + ASSERT_EQ(30ull << 30, selector.get_effective_extra()); // still 30GB is available 
+ ASSERT_EQ(0, selector.select_prefer_bdev((void*)log_bdev)); ASSERT_EQ(0, selector.select_prefer_bdev((void*)wal_bdev)); ASSERT_EQ(1, selector.select_prefer_bdev((void*)db_bdev)); ASSERT_EQ(1, selector.select_prefer_bdev((void*)slow_bdev)); - // 'Use' 30GB Slow level data at DB vol - for (size_t i = 0; i < 30; i++) { + // 'Use' 20GB Slow level data at DB vol + for (size_t i = 0; i < 20; i++) { e.bdev = 1; // DB dev e.length = 1ull * (1 << 30); ASSERT_EQ(1, selector.select_prefer_bdev((void*)slow_bdev)); @@ -61,6 +65,19 @@ TEST(rocksdb_bluefs_vselector, basic) { ASSERT_EQ(0, selector.select_prefer_bdev((void*)log_bdev)); ASSERT_EQ(0, selector.select_prefer_bdev((void*)wal_bdev)); ASSERT_EQ(1, selector.select_prefer_bdev((void*)db_bdev)); + ASSERT_EQ(1, selector.select_prefer_bdev((void*)slow_bdev)); + + // 'Use' 10GB more Slow level data at DB vol + for (size_t i = 0; i < 10; i++) { + e.bdev = 1; // DB dev + e.length = 1ull * (1 << 30); + ASSERT_EQ(1, selector.select_prefer_bdev((void*)slow_bdev)); + selector.add_usage((void*)slow_bdev, e); + } + // now slow data is to be targeted to slow dev + ASSERT_EQ(0, selector.select_prefer_bdev((void*)log_bdev)); + ASSERT_EQ(0, selector.select_prefer_bdev((void*)wal_bdev)); + ASSERT_EQ(1, selector.select_prefer_bdev((void*)db_bdev)); ASSERT_EQ(2, selector.select_prefer_bdev((void*)slow_bdev)); // 'Unuse' 10GB DB level data at DB vol, slow data still wouldn't fit @@ -93,6 +110,9 @@ TEST(rocksdb_bluefs_vselector, basic) { ASSERT_EQ(1, selector.select_prefer_bdev((void*)slow_bdev)); selector.add_usage((void*)db_bdev, e); } + ASSERT_EQ(10ull << 30, selector.get_effective_extra()); // reduced extra due to historic usage + + // 'Use' 10GB more Slow level data at DB vol for (size_t i = 0; i < 10; i++) { e.bdev = 1; // DB dev e.length = 1ull * (1 << 30); @@ -100,6 +120,7 @@ TEST(rocksdb_bluefs_vselector, basic) { selector.add_usage((void*)slow_bdev, e); } ASSERT_EQ(2, selector.select_prefer_bdev((void*)slow_bdev)); + ASSERT_EQ(10ull << 
30, selector.get_effective_extra()); // reduced extra due to historic usage // 'Unuse' remaining 10GB Slow level data at DB vol for (size_t i = 0; i < 10; i++) { @@ -115,6 +136,7 @@ TEST(rocksdb_bluefs_vselector, basic) { selector.add_usage((void*)db_bdev, e); } ASSERT_EQ(2, selector.select_prefer_bdev((void*)slow_bdev)); + ASSERT_EQ(0, selector.get_effective_extra()); // historic usage results in no extra // 'Unuse' 50GB DB level data, thi s wouldn't let slow data use DB volume anyway // due to updated historic maximum @@ -124,6 +146,7 @@ TEST(rocksdb_bluefs_vselector, basic) { selector.sub_usage((void*)db_bdev, e); } ASSERT_EQ(2, selector.select_prefer_bdev((void*)slow_bdev)); + ASSERT_EQ(0, selector.get_effective_extra()); // historic usage results in no extra { std::stringstream ss; @@ -137,7 +160,9 @@ TEST(rocksdb_bluefs_vselector, basic) { ASSERT_EQ(0, selector.get_max_db_total()); selector.dump(ss); std::cout << ss.str() << std::endl; + ASSERT_EQ(30ull << 30, selector.get_effective_extra()); // no history any more } + } int main(int argc, char **argv) {