From: Igor Fedotov Date: Wed, 11 Feb 2026 16:34:30 +0000 (+0300) Subject: os/bluestore: move RocksDBBlueFSVolumeSelector to BlueFS.cc X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=42b25e52e19014206b26be7ea7fa1e5538ecae08;p=ceph.git os/bluestore: move RocksDBBlueFSVolumeSelector to BlueFS.cc We'll need it from test_bluefs.cc, hence it is better to move it out of BlueStore.cc so that BlueStore itself does not have to be exposed to the tests. Signed-off-by: Igor Fedotov --- diff --git a/src/os/bluestore/BlueFS.cc b/src/os/bluestore/BlueFS.cc index 3d855b467bd..dc151c4a7f8 100644 --- a/src/os/bluestore/BlueFS.cc +++ b/src/os/bluestore/BlueFS.cc @@ -16,6 +16,7 @@ #include "Allocator.h" #include "include/buffer_fwd.h" #include "include/ceph_assert.h" +#include "include/stringify.h" #include "common/admin_socket.h" #include "os/bluestore/bluefs_types.h" @@ -5397,3 +5398,190 @@ void OriginalVolumeSelector::dump(ostream& sout) { void FitToFastVolumeSelector::get_paths(const std::string& base, paths& res) const { res.emplace_back(base, 1); // size of the last db_path has no effect } + +uint8_t RocksDBBlueFSVolumeSelector::select_prefer_bdev(void* h) { + ceph_assert(h != nullptr); + uint64_t hint = reinterpret_cast<uint64_t>(h); + uint8_t res; + switch (hint) { + case LEVEL_SLOW: + res = BlueFS::BDEV_SLOW; + if (db_avail4slow > 0) { + // considering statically available db space vs.
+ // - observed maximums on DB dev for DB/WAL/UNSORTED data + // - observed maximum spillovers + uint64_t max_db_use = 0; // max db usage we potentially observed + max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_DB, LEVEL_LOG - LEVEL_FIRST); + max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_DB, LEVEL_WAL - LEVEL_FIRST); + max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_DB, LEVEL_DB - LEVEL_FIRST); + // this could go to db hence using it in the estimation + max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_SLOW, LEVEL_DB - LEVEL_FIRST); + + auto db_total = l_totals[LEVEL_DB - LEVEL_FIRST]; + uint64_t avail = std::min( + db_avail4slow, + max_db_use < db_total ? db_total - max_db_use : 0); + + // considering current DB dev usage for SLOW data + if (avail > per_level_per_dev_usage.at(BlueFS::BDEV_DB, LEVEL_SLOW - LEVEL_FIRST)) { + res = BlueFS::BDEV_DB; + } + } + break; + case LEVEL_LOG: + case LEVEL_WAL: + res = BlueFS::BDEV_WAL; + break; + case LEVEL_DB: + default: + res = BlueFS::BDEV_DB; + break; + } + return res; +} + +void RocksDBBlueFSVolumeSelector::get_paths(const std::string& base, paths& res) const +{ + auto db_size = l_totals[LEVEL_DB - LEVEL_FIRST]; + res.emplace_back(base, db_size); + auto slow_size = l_totals[LEVEL_SLOW - LEVEL_FIRST]; + if (slow_size == 0) { + slow_size = db_size; + } + res.emplace_back(base + ".slow", slow_size); +} + +void* RocksDBBlueFSVolumeSelector::get_hint_by_dir(std::string_view dirname) const { + uint8_t res = LEVEL_DB; + if (dirname.length() > 5) { + // the "db.slow" and "db.wal" directory names are hard-coded at + // match up with bluestore. the slow device is always the second + // one (when a dedicated block.db device is present and used at + // bdev 0). the wal device is always last. 
+ if (boost::algorithm::ends_with(dirname, ".slow")) { + res = LEVEL_SLOW; + } + else if (boost::algorithm::ends_with(dirname, ".wal")) { + res = LEVEL_WAL; + } + } + return reinterpret_cast<void*>(res); +} + +void RocksDBBlueFSVolumeSelector::dump(ostream& sout) { + auto max_x = per_level_per_dev_usage.get_max_x(); + auto max_y = per_level_per_dev_usage.get_max_y(); + + sout << "RocksDBBlueFSVolumeSelector " << std::endl; + sout << ">>Settings<<" + << " extra=" << byte_u_t(db_avail4slow) + << ", extra level=" << extra_level + << ", l0_size=" << byte_u_t(level0_size) + << ", l_base=" << byte_u_t(level_base) + << ", l_multi=" << byte_u_t(level_multiplier) + << std::endl; + constexpr std::array<const char*, 8> names{ { + "LEV/DEV", + "WAL", + "DB", + "SLOW", + "*", + "*", + "REAL", + "FILES", + } }; + const size_t width = 12; + for (size_t i = 0; i < names.size(); ++i) { + sout.setf(std::ios::left, std::ios::adjustfield); + sout.width(width); + sout << names[i]; + } + sout << std::endl; + for (size_t l = 0; l < max_y; l++) { + sout.setf(std::ios::left, std::ios::adjustfield); + sout.width(width); + switch (l + LEVEL_FIRST) { + case LEVEL_LOG: + sout << "log"; break; + case LEVEL_WAL: + sout << "db.wal"; break; + case LEVEL_DB: + sout << "db"; break; + case LEVEL_SLOW: + sout << "db.slow"; break; + case LEVEL_MAX: + sout << "TOTAL"; break; + } + for (size_t d = 0; d < max_x; d++) { + sout.setf(std::ios::left, std::ios::adjustfield); + sout.width(width); + sout << stringify(byte_u_t(per_level_per_dev_usage.at(d, l))); + } + sout.setf(std::ios::left, std::ios::adjustfield); + sout.width(width); + sout << stringify(per_level_files[l]) << std::endl; + } + ceph_assert(max_x == per_level_per_dev_max.get_max_x()); + ceph_assert(max_y == per_level_per_dev_max.get_max_y()); + sout << "MAXIMUMS:" << std::endl; + for (size_t l = 0; l < max_y; l++) { + sout.setf(std::ios::left, std::ios::adjustfield); + sout.width(width); + switch (l + LEVEL_FIRST) { + case LEVEL_LOG: + sout << "log"; break; + case
LEVEL_WAL: + sout << "db.wal"; break; + case LEVEL_DB: + sout << "db"; break; + case LEVEL_SLOW: + sout << "db.slow"; break; + case LEVEL_MAX: + sout << "TOTAL"; break; + } + for (size_t d = 0; d < max_x - 1; d++) { + sout.setf(std::ios::left, std::ios::adjustfield); + sout.width(width); + sout << stringify(byte_u_t(per_level_per_dev_max.at(d, l))); + } + sout.setf(std::ios::left, std::ios::adjustfield); + sout.width(width); + sout << stringify(byte_u_t(per_level_per_dev_max.at(max_x - 1, l))); + sout << std::endl; + } + string sizes[] = { + ">> SIZE <<", + stringify(byte_u_t(l_totals[LEVEL_WAL - LEVEL_FIRST])), + stringify(byte_u_t(l_totals[LEVEL_DB - LEVEL_FIRST])), + stringify(byte_u_t(l_totals[LEVEL_SLOW - LEVEL_FIRST])), + }; + for (size_t i = 0; i < (sizeof(sizes) / sizeof(sizes[0])); i++) { + sout.setf(std::ios::left, std::ios::adjustfield); + sout.width(width); + sout << sizes[i]; + } + sout << std::endl; +} + +BlueFSVolumeSelector* RocksDBBlueFSVolumeSelector::clone_empty() const { + RocksDBBlueFSVolumeSelector* ns = + new RocksDBBlueFSVolumeSelector(0, 0, 0, 0, 0, 0, false); + return ns; +} + +bool RocksDBBlueFSVolumeSelector::compare(BlueFSVolumeSelector* other) { + RocksDBBlueFSVolumeSelector* o = dynamic_cast<RocksDBBlueFSVolumeSelector*>(other); + ceph_assert(o); + bool equal = true; + for (size_t x = 0; x < BlueFS::MAX_BDEV + 1; x++) { + for (size_t y = 0; y < LEVEL_MAX - LEVEL_FIRST + 1; y++) { + equal &= (per_level_per_dev_usage.at(x, y) == o->per_level_per_dev_usage.at(x, y)); + } + } + for (size_t t = 0; t < LEVEL_MAX - LEVEL_FIRST + 1; t++) { + equal &= (per_level_files[t] == o->per_level_files[t]); + } + return equal; +} + +// ======================================================= diff --git a/src/os/bluestore/BlueFS.h b/src/os/bluestore/BlueFS.h index a889a19122a..5f676aa27f1 100644 --- a/src/os/bluestore/BlueFS.h +++ b/src/os/bluestore/BlueFS.h @@ -1000,6 +1000,218 @@ public: void get_paths(const std::string& base, paths& res) const override; }; + +class
RocksDBBlueFSVolumeSelector : public BlueFSVolumeSelector +{ + template <class T, size_t MaxX, size_t MaxY> + class matrix_2d { + T values[MaxX][MaxY]; + public: + matrix_2d() { + clear(); + } + T& at(size_t x, size_t y) { + ceph_assert(x < MaxX); + ceph_assert(y < MaxY); + + return values[x][y]; + } + size_t get_max_x() const { + return MaxX; + } + size_t get_max_y() const { + return MaxY; + } + void clear() { + memset(values, 0, sizeof(values)); + } + }; + + enum { + // use 0/nullptr as unset indication + LEVEL_FIRST = 1, + LEVEL_LOG = LEVEL_FIRST, // BlueFS log + LEVEL_WAL, + LEVEL_DB, + LEVEL_SLOW, + LEVEL_MAX + }; + // add +1 row for per-level actual (taken from file size) total + // add +1 column for corresponding per-device totals + typedef matrix_2d<std::atomic<uint64_t>, BlueFS::MAX_BDEV + 1, LEVEL_MAX - LEVEL_FIRST + 1> per_level_per_dev_usage_t; + + per_level_per_dev_usage_t per_level_per_dev_usage; + // file count per level, add +1 to keep total file count + std::atomic<uint64_t> per_level_files[LEVEL_MAX - LEVEL_FIRST + 1] = { 0 }; + + // Note: maximum per-device totals below might be smaller than corresponding + // perf counters by up to a single alloc unit (1M) due to superblock extent. + // The later is not accounted here.
+ per_level_per_dev_usage_t per_level_per_dev_max; + + uint64_t l_totals[LEVEL_MAX - LEVEL_FIRST]; + uint64_t db_avail4slow = 0; + uint64_t level0_size = 0; + uint64_t level_base = 0; + uint64_t level_multiplier = 0; + bool new_pol = false; + size_t extra_level = 0; + enum { + OLD_POLICY, + USE_SOME_EXTRA + }; + +public: + RocksDBBlueFSVolumeSelector( + uint64_t _wal_total, + uint64_t _db_total, + uint64_t _slow_total, + uint64_t _level0_size, + uint64_t _level_base, + uint64_t _level_multiplier, + bool _new_pol) { + + l_totals[LEVEL_LOG - LEVEL_FIRST] = 0; // not used at the moment + l_totals[LEVEL_WAL - LEVEL_FIRST] = _wal_total; + l_totals[LEVEL_DB - LEVEL_FIRST] = _db_total; + l_totals[LEVEL_SLOW - LEVEL_FIRST] = _slow_total; + + level0_size = _level0_size; + level_base = _level_base; + level_multiplier = _level_multiplier; + + new_pol = _new_pol; + } + + void update_from_config(CephContext* cct) override + { + if (!new_pol) { + return; + } + + db_avail4slow = 0; + extra_level = 0; + double reserved_factor = + cct->_conf->bluestore_volume_selection_reserved_factor; + uint64_t reserved = cct->_conf->bluestore_volume_selection_reserved; + + auto db_total = l_totals[LEVEL_DB - LEVEL_FIRST]; + // Calculating how much extra space is available at DB volume. + // Depending on the presence of explicit reserved size specification it might be either + // * DB volume size - reserved + // or + // * DB volume size - sum_max_level_size(0, L-1) - max_level_size(L) * reserved_factor + if (!reserved) { + uint64_t prev_levels = level0_size; + uint64_t cur_level = level_base; + extra_level = 1; + do { + uint64_t next_level = cur_level * level_multiplier; + uint64_t next_threshold = prev_levels + cur_level + next_level; + ++extra_level; + if (db_total <= next_threshold) { + uint64_t cur_threshold = prev_levels + cur_level * reserved_factor; + db_avail4slow = cur_threshold < db_total ? 
db_total - cur_threshold : 0; + break; + } + else { + prev_levels += cur_level; + cur_level = next_level; + } + } while (true); + } + else { + db_avail4slow = reserved < db_total ? db_total - reserved : 0; + extra_level = 0; + } + } + + uint64_t get_available_extra() const { + return db_avail4slow; + } + uint64_t get_extra_level() const { + return extra_level; + } + void* get_hint_for_log() const override { + return reinterpret_cast<void*>(LEVEL_LOG); + } + void* get_hint_by_dir(std::string_view dirname) const override; + + void add_usage(void* hint, const bluefs_extent_t& extent) override { + if (hint == nullptr) + return; + size_t pos = (size_t)hint - LEVEL_FIRST; + auto& cur = per_level_per_dev_usage.at(extent.bdev, pos); + auto& max = per_level_per_dev_max.at(extent.bdev, pos); + uint64_t v = cur.fetch_add(extent.length) + extent.length; + while (v > max) { + max.exchange(v); + } + { + //update per-device totals + auto& cur = per_level_per_dev_usage.at(extent.bdev, LEVEL_MAX - LEVEL_FIRST); + auto& max = per_level_per_dev_max.at(extent.bdev, LEVEL_MAX - LEVEL_FIRST); + uint64_t v = cur.fetch_add(extent.length) + extent.length; + while (v > max) { + max.exchange(v); + } + } + } + void sub_usage(void* hint, const bluefs_extent_t& extent) override { + if (hint == nullptr) + return; + size_t pos = (size_t)hint - LEVEL_FIRST; + auto& cur = per_level_per_dev_usage.at(extent.bdev, pos); + ceph_assert(cur >= extent.length); + cur -= extent.length; + + //update per-device totals + auto& cur2 = per_level_per_dev_usage.at(extent.bdev, LEVEL_MAX - LEVEL_FIRST); + ceph_assert(cur2 >= extent.length); + cur2 -= extent.length; + } + void add_usage(void* hint, uint64_t size_more, bool upd_files) override { + if (hint == nullptr) + return; + size_t pos = (size_t)hint - LEVEL_FIRST; + //update per-level actual totals + auto& cur = per_level_per_dev_usage.at(BlueFS::MAX_BDEV, pos); + auto& max = per_level_per_dev_max.at(BlueFS::MAX_BDEV, pos); + uint64_t v = cur.fetch_add(size_more) +
size_more; + while (v > max) { + max.exchange(v); + } + if (upd_files) { + ++per_level_files[pos]; + ++per_level_files[LEVEL_MAX - LEVEL_FIRST]; + } + } + void sub_usage(void* hint, uint64_t size_less, bool upd_files) override { + if (hint == nullptr) + return; + size_t pos = (size_t)hint - LEVEL_FIRST; + //update per-level actual totals + auto& cur = per_level_per_dev_usage.at(BlueFS::MAX_BDEV, pos); + ceph_assert(cur >= size_less); + cur -= size_less; + if (upd_files) { + ceph_assert(per_level_files[pos] > 0); + --per_level_files[pos]; + ceph_assert(per_level_files[LEVEL_MAX - LEVEL_FIRST] > 0); + --per_level_files[LEVEL_MAX - LEVEL_FIRST]; + } + } + + uint8_t select_prefer_bdev(void* h) override; + void get_paths( + const std::string& base, + BlueFSVolumeSelector::paths& res) const override; + + void dump(std::ostream& sout) override; + BlueFSVolumeSelector* clone_empty() const override; + bool compare(BlueFSVolumeSelector* other) override; +}; + /** * Directional graph of locks. * Vertices - Locks. Edges (directed) - locking progression. diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc index 97fc9dd68de..c0ce8cc4eba 100644 --- a/src/os/bluestore/BlueStore.cc +++ b/src/os/bluestore/BlueStore.cc @@ -19610,196 +19610,6 @@ unsigned BlueStoreRepairer::apply(KeyValueDB* db) return repaired; } -// ======================================================= -// RocksDBBlueFSVolumeSelector - -uint8_t RocksDBBlueFSVolumeSelector::select_prefer_bdev(void* h) { - ceph_assert(h != nullptr); - uint64_t hint = reinterpret_cast<uint64_t>(h); - uint8_t res; - switch (hint) { - case LEVEL_SLOW: - res = BlueFS::BDEV_SLOW; - if (db_avail4slow > 0) { - // considering statically available db space vs.
- // - observed maximums on DB dev for DB/WAL/UNSORTED data - // - observed maximum spillovers - uint64_t max_db_use = 0; // max db usage we potentially observed - max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_DB, LEVEL_LOG - LEVEL_FIRST); - max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_DB, LEVEL_WAL - LEVEL_FIRST); - max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_DB, LEVEL_DB - LEVEL_FIRST); - // this could go to db hence using it in the estimation - max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_SLOW, LEVEL_DB - LEVEL_FIRST); - - auto db_total = l_totals[LEVEL_DB - LEVEL_FIRST]; - uint64_t avail = min( - db_avail4slow, - max_db_use < db_total ? db_total - max_db_use : 0); - - // considering current DB dev usage for SLOW data - if (avail > per_level_per_dev_usage.at(BlueFS::BDEV_DB, LEVEL_SLOW - LEVEL_FIRST)) { - res = BlueFS::BDEV_DB; - } - } - break; - case LEVEL_LOG: - case LEVEL_WAL: - res = BlueFS::BDEV_WAL; - break; - case LEVEL_DB: - default: - res = BlueFS::BDEV_DB; - break; - } - return res; -} - -void RocksDBBlueFSVolumeSelector::get_paths(const std::string& base, paths& res) const -{ - auto db_size = l_totals[LEVEL_DB - LEVEL_FIRST]; - res.emplace_back(base, db_size); - auto slow_size = l_totals[LEVEL_SLOW - LEVEL_FIRST]; - if (slow_size == 0) { - slow_size = db_size; - } - res.emplace_back(base + ".slow", slow_size); -} - -void* RocksDBBlueFSVolumeSelector::get_hint_by_dir(std::string_view dirname) const { - uint8_t res = LEVEL_DB; - if (dirname.length() > 5) { - // the "db.slow" and "db.wal" directory names are hard-coded at - // match up with bluestore. the slow device is always the second - // one (when a dedicated block.db device is present and used at - // bdev 0). the wal device is always last. 
- if (boost::algorithm::ends_with(dirname, ".slow")) { - res = LEVEL_SLOW; - } - else if (boost::algorithm::ends_with(dirname, ".wal")) { - res = LEVEL_WAL; - } - } - return reinterpret_cast<void*>(res); -} - -void RocksDBBlueFSVolumeSelector::dump(ostream& sout) { - auto max_x = per_level_per_dev_usage.get_max_x(); - auto max_y = per_level_per_dev_usage.get_max_y(); - - sout << "RocksDBBlueFSVolumeSelector " << std::endl; - sout << ">>Settings<<" - << " extra=" << byte_u_t(db_avail4slow) - << ", extra level=" << extra_level - << ", l0_size=" << byte_u_t(level0_size) - << ", l_base=" << byte_u_t(level_base) - << ", l_multi=" << byte_u_t(level_multiplier) - << std::endl; - constexpr std::array<const char*, 8> names{ { - "LEV/DEV", - "WAL", - "DB", - "SLOW", - "*", - "*", - "REAL", - "FILES", - } }; - const size_t width = 12; - for (size_t i = 0; i < names.size(); ++i) { - sout.setf(std::ios::left, std::ios::adjustfield); - sout.width(width); - sout << names[i]; - } - sout << std::endl; - for (size_t l = 0; l < max_y; l++) { - sout.setf(std::ios::left, std::ios::adjustfield); - sout.width(width); - switch (l + LEVEL_FIRST) { - case LEVEL_LOG: - sout << "log"; break; - case LEVEL_WAL: - sout << "db.wal"; break; - case LEVEL_DB: - sout << "db"; break; - case LEVEL_SLOW: - sout << "db.slow"; break; - case LEVEL_MAX: - sout << "TOTAL"; break; - } - for (size_t d = 0; d < max_x; d++) { - sout.setf(std::ios::left, std::ios::adjustfield); - sout.width(width); - sout << stringify(byte_u_t(per_level_per_dev_usage.at(d, l))); - } - sout.setf(std::ios::left, std::ios::adjustfield); - sout.width(width); - sout << stringify(per_level_files[l]) << std::endl; - } - ceph_assert(max_x == per_level_per_dev_max.get_max_x()); - ceph_assert(max_y == per_level_per_dev_max.get_max_y()); - sout << "MAXIMUMS:" << std::endl; - for (size_t l = 0; l < max_y; l++) { - sout.setf(std::ios::left, std::ios::adjustfield); - sout.width(width); - switch (l + LEVEL_FIRST) { - case LEVEL_LOG: - sout << "log"; break; - case
LEVEL_WAL: - sout << "db.wal"; break; - case LEVEL_DB: - sout << "db"; break; - case LEVEL_SLOW: - sout << "db.slow"; break; - case LEVEL_MAX: - sout << "TOTAL"; break; - } - for (size_t d = 0; d < max_x - 1; d++) { - sout.setf(std::ios::left, std::ios::adjustfield); - sout.width(width); - sout << stringify(byte_u_t(per_level_per_dev_max.at(d, l))); - } - sout.setf(std::ios::left, std::ios::adjustfield); - sout.width(width); - sout << stringify(byte_u_t(per_level_per_dev_max.at(max_x - 1, l))); - sout << std::endl; - } - string sizes[] = { - ">> SIZE <<", - stringify(byte_u_t(l_totals[LEVEL_WAL - LEVEL_FIRST])), - stringify(byte_u_t(l_totals[LEVEL_DB - LEVEL_FIRST])), - stringify(byte_u_t(l_totals[LEVEL_SLOW - LEVEL_FIRST])), - }; - for (size_t i = 0; i < (sizeof(sizes) / sizeof(sizes[0])); i++) { - sout.setf(std::ios::left, std::ios::adjustfield); - sout.width(width); - sout << sizes[i]; - } - sout << std::endl; -} - -BlueFSVolumeSelector* RocksDBBlueFSVolumeSelector::clone_empty() const { - RocksDBBlueFSVolumeSelector* ns = - new RocksDBBlueFSVolumeSelector(0, 0, 0, 0, 0, 0, false); - return ns; -} - -bool RocksDBBlueFSVolumeSelector::compare(BlueFSVolumeSelector* other) { - RocksDBBlueFSVolumeSelector* o = dynamic_cast<RocksDBBlueFSVolumeSelector*>(other); - ceph_assert(o); - bool equal = true; - for (size_t x = 0; x < BlueFS::MAX_BDEV + 1; x++) { - for (size_t y = 0; y <LEVEL_MAX - LEVEL_FIRST + 1; y++) { - equal &= (per_level_per_dev_usage.at(x, y) == o->per_level_per_dev_usage.at(x, y)); - } - } - for (size_t t = 0; t < LEVEL_MAX - LEVEL_FIRST + 1; t++) { - equal &= (per_level_files[t] == o->per_level_files[t]); - } - return equal; -} - -// ======================================================= - //================================================================================================================ // BlueStore is committing all allocation information (alloc/release) into RocksDB before the client Write is performed. // This cause a delay in write path and add significant load to the CPU/Memory/Disk.
diff --git a/src/os/bluestore/BlueStore.h b/src/os/bluestore/BlueStore.h index 38bea331c09..7bcc04e40de 100644 --- a/src/os/bluestore/BlueStore.h +++ b/src/os/bluestore/BlueStore.h @@ -4396,213 +4396,4 @@ private: }; -class RocksDBBlueFSVolumeSelector : public BlueFSVolumeSelector -{ - template <class T, size_t MaxX, size_t MaxY> - class matrix_2d { - T values[MaxX][MaxY]; - public: - matrix_2d() { - clear(); - } - T& at(size_t x, size_t y) { - ceph_assert(x < MaxX); - ceph_assert(y < MaxY); - - return values[x][y]; - } - size_t get_max_x() const { - return MaxX; - } - size_t get_max_y() const { - return MaxY; - } - void clear() { - memset(values, 0, sizeof(values)); - } - }; - - enum { - // use 0/nullptr as unset indication - LEVEL_FIRST = 1, - LEVEL_LOG = LEVEL_FIRST, // BlueFS log - LEVEL_WAL, - LEVEL_DB, - LEVEL_SLOW, - LEVEL_MAX - }; - // add +1 row for per-level actual (taken from file size) total - // add +1 column for corresponding per-device totals - typedef matrix_2d<std::atomic<uint64_t>, BlueFS::MAX_BDEV + 1, LEVEL_MAX - LEVEL_FIRST + 1> per_level_per_dev_usage_t; - - per_level_per_dev_usage_t per_level_per_dev_usage; - // file count per level, add +1 to keep total file count - std::atomic<uint64_t> per_level_files[LEVEL_MAX - LEVEL_FIRST + 1] = { 0 }; - - // Note: maximum per-device totals below might be smaller than corresponding - // perf counters by up to a single alloc unit (1M) due to superblock extent. - // The later is not accounted here.
- per_level_per_dev_usage_t per_level_per_dev_max; - - uint64_t l_totals[LEVEL_MAX - LEVEL_FIRST]; - uint64_t db_avail4slow = 0; - uint64_t level0_size = 0; - uint64_t level_base = 0; - uint64_t level_multiplier = 0; - bool new_pol = false; - size_t extra_level = 0; - enum { - OLD_POLICY, - USE_SOME_EXTRA - }; - -public: - RocksDBBlueFSVolumeSelector( - uint64_t _wal_total, - uint64_t _db_total, - uint64_t _slow_total, - uint64_t _level0_size, - uint64_t _level_base, - uint64_t _level_multiplier, - bool _new_pol) { - - l_totals[LEVEL_LOG - LEVEL_FIRST] = 0; // not used at the moment - l_totals[LEVEL_WAL - LEVEL_FIRST] = _wal_total; - l_totals[LEVEL_DB - LEVEL_FIRST] = _db_total; - l_totals[LEVEL_SLOW - LEVEL_FIRST] = _slow_total; - - level0_size = _level0_size; - level_base = _level_base; - level_multiplier = _level_multiplier; - - new_pol = _new_pol; - } - - void update_from_config(CephContext* cct) override - { - if (!new_pol) { - return; - } - - db_avail4slow = 0; - extra_level = 0; - double reserved_factor = - cct->_conf->bluestore_volume_selection_reserved_factor; - uint64_t reserved = cct->_conf->bluestore_volume_selection_reserved; - - auto db_total = l_totals[LEVEL_DB - LEVEL_FIRST]; - // Calculating how much extra space is available at DB volume. - // Depending on the presence of explicit reserved size specification it might be either - // * DB volume size - reserved - // or - // * DB volume size - sum_max_level_size(0, L-1) - max_level_size(L) * reserved_factor - if (!reserved) { - uint64_t prev_levels = level0_size; - uint64_t cur_level = level_base; - extra_level = 1; - do { - uint64_t next_level = cur_level * level_multiplier; - uint64_t next_threshold = prev_levels + cur_level + next_level; - ++extra_level; - if (db_total <= next_threshold) { - uint64_t cur_threshold = prev_levels + cur_level * reserved_factor; - db_avail4slow = cur_threshold < db_total ? 
db_total - cur_threshold : 0; - break; - } else { - prev_levels += cur_level; - cur_level = next_level; - } - } while (true); - } else { - db_avail4slow = reserved < db_total ? db_total - reserved : 0; - extra_level = 0; - } - } - - uint64_t get_available_extra() const { - return db_avail4slow; - } - uint64_t get_extra_level() const { - return extra_level; - } - void* get_hint_for_log() const override { - return reinterpret_cast<void*>(LEVEL_LOG); - } - void* get_hint_by_dir(std::string_view dirname) const override; - - void add_usage(void* hint, const bluefs_extent_t& extent) override { - if (hint == nullptr) - return; - size_t pos = (size_t)hint - LEVEL_FIRST; - auto& cur = per_level_per_dev_usage.at(extent.bdev, pos); - auto& max = per_level_per_dev_max.at(extent.bdev, pos); - uint64_t v = cur.fetch_add(extent.length) + extent.length; - while (v > max) { - max.exchange(v); - } - { - //update per-device totals - auto& cur = per_level_per_dev_usage.at(extent.bdev, LEVEL_MAX - LEVEL_FIRST); - auto& max = per_level_per_dev_max.at(extent.bdev, LEVEL_MAX - LEVEL_FIRST); - uint64_t v = cur.fetch_add(extent.length) + extent.length; - while (v > max) { - max.exchange(v); - } - } - } - void sub_usage(void* hint, const bluefs_extent_t& extent) override { - if (hint == nullptr) - return; - size_t pos = (size_t)hint - LEVEL_FIRST; - auto& cur = per_level_per_dev_usage.at(extent.bdev, pos); - ceph_assert(cur >= extent.length); - cur -= extent.length; - - //update per-device totals - auto& cur2 = per_level_per_dev_usage.at(extent.bdev, LEVEL_MAX - LEVEL_FIRST); - ceph_assert(cur2 >= extent.length); - cur2 -= extent.length; - } - void add_usage(void* hint, uint64_t size_more, bool upd_files) override { - if (hint == nullptr) - return; - size_t pos = (size_t)hint - LEVEL_FIRST; - //update per-level actual totals - auto& cur = per_level_per_dev_usage.at(BlueFS::MAX_BDEV, pos); - auto& max = per_level_per_dev_max.at(BlueFS::MAX_BDEV, pos); - uint64_t v = cur.fetch_add(size_more) +
size_more; - while (v > max) { - max.exchange(v); - } - if (upd_files) { - ++per_level_files[pos]; - ++per_level_files[LEVEL_MAX - LEVEL_FIRST]; - } - } - void sub_usage(void* hint, uint64_t size_less, bool upd_files) override { - if (hint == nullptr) - return; - size_t pos = (size_t)hint - LEVEL_FIRST; - //update per-level actual totals - auto& cur = per_level_per_dev_usage.at(BlueFS::MAX_BDEV, pos); - ceph_assert(cur >= size_less); - cur -= size_less; - if (upd_files) { - ceph_assert(per_level_files[pos] > 0); - --per_level_files[pos]; - ceph_assert(per_level_files[LEVEL_MAX - LEVEL_FIRST] > 0); - --per_level_files[LEVEL_MAX - LEVEL_FIRST]; - } - } - - uint8_t select_prefer_bdev(void* h) override; - void get_paths( - const std::string& base, - BlueFSVolumeSelector::paths& res) const override; - - void dump(std::ostream& sout) override; - BlueFSVolumeSelector* clone_empty() const override; - bool compare(BlueFSVolumeSelector* other) override; -}; - #endif