#include "Allocator.h"
#include "include/buffer_fwd.h"
#include "include/ceph_assert.h"
+#include "include/stringify.h"
#include "common/admin_socket.h"
#include "os/bluestore/bluefs_types.h"
void FitToFastVolumeSelector::get_paths(const std::string& base, paths& res) const {
res.emplace_back(base, 1); // size of the last db_path has no effect
}
+
+// Select the preferred block device for a BlueFS file given its level hint.
+// The hint is a LEVEL_* enum value carried through a void* (see
+// get_hint_by_dir / get_hint_for_log); callers must not pass nullptr.
+uint8_t RocksDBBlueFSVolumeSelector::select_prefer_bdev(void* h) {
+ ceph_assert(h != nullptr);
+ uint64_t hint = reinterpret_cast<uint64_t>(h);
+ uint8_t res;
+ switch (hint) {
+ case LEVEL_SLOW:
+ res = BlueFS::BDEV_SLOW;
+ if (db_avail4slow > 0) {
+ // considering statically available db space vs.
+ // - observed maximums on DB dev for DB/WAL/UNSORTED data
+ // - observed maximum spillovers
+ uint64_t max_db_use = 0; // max db usage we potentially observed
+ max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_DB, LEVEL_LOG - LEVEL_FIRST);
+ max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_DB, LEVEL_WAL - LEVEL_FIRST);
+ max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_DB, LEVEL_DB - LEVEL_FIRST);
+ // this could go to db hence using it in the estimation
+ max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_SLOW, LEVEL_DB - LEVEL_FIRST);
+
+ auto db_total = l_totals[LEVEL_DB - LEVEL_FIRST];
+ uint64_t avail = std::min(
+ db_avail4slow,
+ max_db_use < db_total ? db_total - max_db_use : 0);
+
+ // considering current DB dev usage for SLOW data
+ if (avail > per_level_per_dev_usage.at(BlueFS::BDEV_DB, LEVEL_SLOW - LEVEL_FIRST)) {
+ res = BlueFS::BDEV_DB;
+ }
+ }
+ break;
+ case LEVEL_LOG:
+ case LEVEL_WAL:
+ // BlueFS log and WAL data prefer the WAL device.
+ res = BlueFS::BDEV_WAL;
+ break;
+ case LEVEL_DB:
+ default:
+ // DB data (and any unrecognized hint) goes to the DB device.
+ res = BlueFS::BDEV_DB;
+ break;
+ }
+ return res;
+}
+
+// Report BlueFS mount paths and their capacities for RocksDB:
+// <base> maps to the DB volume, <base>.slow to the spillover volume.
+void RocksDBBlueFSVolumeSelector::get_paths(const std::string& base, paths& res) const
+{
+ auto db_size = l_totals[LEVEL_DB - LEVEL_FIRST];
+ res.emplace_back(base, db_size);
+ auto slow_size = l_totals[LEVEL_SLOW - LEVEL_FIRST];
+ if (slow_size == 0) {
+ // no dedicated slow device configured - fall back to the db size,
+ // presumably so the .slow path still advertises a non-zero capacity
+ slow_size = db_size;
+ }
+ res.emplace_back(base + ".slow", slow_size);
+}
+
+// Map a BlueFS directory name to its LEVEL_* hint (returned as void*).
+// Unrecognized names default to LEVEL_DB.
+void* RocksDBBlueFSVolumeSelector::get_hint_by_dir(std::string_view dirname) const {
+ uint8_t res = LEVEL_DB;
+ if (dirname.length() > 5) {
+ // the "db.slow" and "db.wal" directory names are hard-coded to
+ // match up with bluestore. the slow device is always the second
+ // one (when a dedicated block.db device is present and used at
+ // bdev 0). the wal device is always last.
+ if (boost::algorithm::ends_with(dirname, ".slow")) {
+ res = LEVEL_SLOW;
+ }
+ else if (boost::algorithm::ends_with(dirname, ".wal")) {
+ res = LEVEL_WAL;
+ }
+ }
+ return reinterpret_cast<void*>(res);
+}
+
+// Render a human-readable report: current per-level/per-device usage,
+// observed per-level/per-device maximums, and the static volume sizes.
+void RocksDBBlueFSVolumeSelector::dump(ostream& sout) {
+ auto max_x = per_level_per_dev_usage.get_max_x();
+ auto max_y = per_level_per_dev_usage.get_max_y();
+
+ sout << "RocksDBBlueFSVolumeSelector " << std::endl;
+ sout << ">>Settings<<"
+ << " extra=" << byte_u_t(db_avail4slow)
+ << ", extra level=" << extra_level
+ << ", l0_size=" << byte_u_t(level0_size)
+ << ", l_base=" << byte_u_t(level_base)
+ << ", l_multi=" << byte_u_t(level_multiplier)
+ << std::endl;
+ // Column headers: one per device slot, then "REAL" (per-level actual
+ // totals kept in the extra MAX_BDEV column) and the per-level file count.
+ constexpr std::array<const char*, 8> names{ {
+ "LEV/DEV",
+ "WAL",
+ "DB",
+ "SLOW",
+ "*",
+ "*",
+ "REAL",
+ "FILES",
+ } };
+ const size_t width = 12;
+ for (size_t i = 0; i < names.size(); ++i) {
+ sout.setf(std::ios::left, std::ios::adjustfield);
+ sout.width(width);
+ sout << names[i];
+ }
+ sout << std::endl;
+ // Current usage table: one row per level plus a trailing TOTAL row.
+ for (size_t l = 0; l < max_y; l++) {
+ sout.setf(std::ios::left, std::ios::adjustfield);
+ sout.width(width);
+ switch (l + LEVEL_FIRST) {
+ case LEVEL_LOG:
+ sout << "log"; break;
+ case LEVEL_WAL:
+ sout << "db.wal"; break;
+ case LEVEL_DB:
+ sout << "db"; break;
+ case LEVEL_SLOW:
+ sout << "db.slow"; break;
+ case LEVEL_MAX:
+ sout << "TOTAL"; break;
+ }
+ for (size_t d = 0; d < max_x; d++) {
+ sout.setf(std::ios::left, std::ios::adjustfield);
+ sout.width(width);
+ sout << stringify(byte_u_t(per_level_per_dev_usage.at(d, l)));
+ }
+ sout.setf(std::ios::left, std::ios::adjustfield);
+ sout.width(width);
+ sout << stringify(per_level_files[l]) << std::endl;
+ }
+ ceph_assert(max_x == per_level_per_dev_max.get_max_x());
+ ceph_assert(max_y == per_level_per_dev_max.get_max_y());
+ sout << "MAXIMUMS:" << std::endl;
+ // Observed maximums table: same layout, but no FILES column.
+ for (size_t l = 0; l < max_y; l++) {
+ sout.setf(std::ios::left, std::ios::adjustfield);
+ sout.width(width);
+ switch (l + LEVEL_FIRST) {
+ case LEVEL_LOG:
+ sout << "log"; break;
+ case LEVEL_WAL:
+ sout << "db.wal"; break;
+ case LEVEL_DB:
+ sout << "db"; break;
+ case LEVEL_SLOW:
+ sout << "db.slow"; break;
+ case LEVEL_MAX:
+ sout << "TOTAL"; break;
+ }
+ for (size_t d = 0; d < max_x - 1; d++) {
+ sout.setf(std::ios::left, std::ios::adjustfield);
+ sout.width(width);
+ sout << stringify(byte_u_t(per_level_per_dev_max.at(d, l)));
+ }
+ sout.setf(std::ios::left, std::ios::adjustfield);
+ sout.width(width);
+ sout << stringify(byte_u_t(per_level_per_dev_max.at(max_x - 1, l)));
+ sout << std::endl;
+ }
+ // Static volume sizes (WAL/DB/SLOW).
+ string sizes[] = {
+ ">> SIZE <<",
+ stringify(byte_u_t(l_totals[LEVEL_WAL - LEVEL_FIRST])),
+ stringify(byte_u_t(l_totals[LEVEL_DB - LEVEL_FIRST])),
+ stringify(byte_u_t(l_totals[LEVEL_SLOW - LEVEL_FIRST])),
+ };
+ for (size_t i = 0; i < (sizeof(sizes) / sizeof(sizes[0])); i++) {
+ sout.setf(std::ios::left, std::ios::adjustfield);
+ sout.width(width);
+ sout << sizes[i];
+ }
+ sout << std::endl;
+}
+
+// Create a fresh selector of the same type with all sizes/counters zeroed
+// and the legacy policy (new_pol == false).
+BlueFSVolumeSelector* RocksDBBlueFSVolumeSelector::clone_empty() const {
+ RocksDBBlueFSVolumeSelector* ns =
+ new RocksDBBlueFSVolumeSelector(0, 0, 0, 0, 0, 0, false);
+ return ns;
+}
+
+// Compare per-level/per-device usage counters and per-level file counts
+// against another selector; returns true only if all of them match.
+// per_level_per_dev_max is not part of the comparison.
+bool RocksDBBlueFSVolumeSelector::compare(BlueFSVolumeSelector* other) {
+ RocksDBBlueFSVolumeSelector* o = dynamic_cast<RocksDBBlueFSVolumeSelector*>(other);
+ ceph_assert(o);
+ bool equal = true;
+ for (size_t x = 0; x < BlueFS::MAX_BDEV + 1; x++) {
+ for (size_t y = 0; y < LEVEL_MAX - LEVEL_FIRST + 1; y++) {
+ equal &= (per_level_per_dev_usage.at(x, y) == o->per_level_per_dev_usage.at(x, y));
+ }
+ }
+ for (size_t t = 0; t < LEVEL_MAX - LEVEL_FIRST + 1; t++) {
+ equal &= (per_level_files[t] == o->per_level_files[t]);
+ }
+ return equal;
+}
+
+// =======================================================
void get_paths(const std::string& base, paths& res) const override;
};
+
+// BlueFS volume selector used with RocksDB: tracks per-level/per-device
+// space usage (and observed maximums) and decides which block device
+// (WAL/DB/SLOW) each BlueFS file should be placed on.
+class RocksDBBlueFSVolumeSelector : public BlueFSVolumeSelector
+{
+ // Fixed-size 2D array of counters, zero-initialized on construction.
+ template <class T, size_t MaxX, size_t MaxY>
+ class matrix_2d {
+ T values[MaxX][MaxY];
+ public:
+ matrix_2d() {
+ clear();
+ }
+ T& at(size_t x, size_t y) {
+ ceph_assert(x < MaxX);
+ ceph_assert(y < MaxY);
+
+ return values[x][y];
+ }
+ size_t get_max_x() const {
+ return MaxX;
+ }
+ size_t get_max_y() const {
+ return MaxY;
+ }
+ void clear() {
+ memset(values, 0, sizeof(values));
+ }
+ };
+
+ // Logical data levels, passed around as void* hints (hence LEVEL_FIRST = 1).
+ enum {
+ // use 0/nullptr as unset indication
+ LEVEL_FIRST = 1,
+ LEVEL_LOG = LEVEL_FIRST, // BlueFS log
+ LEVEL_WAL,
+ LEVEL_DB,
+ LEVEL_SLOW,
+ LEVEL_MAX
+ };
+ // add +1 row for per-level actual (taken from file size) total
+ // add +1 column for corresponding per-device totals
+ typedef matrix_2d<std::atomic<uint64_t>, BlueFS::MAX_BDEV + 1, LEVEL_MAX - LEVEL_FIRST + 1> per_level_per_dev_usage_t;
+
+ per_level_per_dev_usage_t per_level_per_dev_usage;
+ // file count per level, add +1 to keep total file count
+ std::atomic<uint64_t> per_level_files[LEVEL_MAX - LEVEL_FIRST + 1] = { 0 };
+
+ // Note: maximum per-device totals below might be smaller than corresponding
+ // perf counters by up to a single alloc unit (1M) due to superblock extent.
+ // The latter is not accounted here.
+ per_level_per_dev_usage_t per_level_per_dev_max;
+
+ // Static capacity of each level's volume, indexed by LEVEL_* - LEVEL_FIRST.
+ uint64_t l_totals[LEVEL_MAX - LEVEL_FIRST];
+ uint64_t db_avail4slow = 0; // DB-volume bytes usable for SLOW spillover
+ uint64_t level0_size = 0;
+ uint64_t level_base = 0;
+ uint64_t level_multiplier = 0;
+ bool new_pol = false;
+ size_t extra_level = 0;
+ // NOTE(review): this enum appears unused within the class - confirm callers.
+ enum {
+ OLD_POLICY,
+ USE_SOME_EXTRA
+ };
+
+public:
+ RocksDBBlueFSVolumeSelector(
+ uint64_t _wal_total,
+ uint64_t _db_total,
+ uint64_t _slow_total,
+ uint64_t _level0_size,
+ uint64_t _level_base,
+ uint64_t _level_multiplier,
+ bool _new_pol) {
+
+ l_totals[LEVEL_LOG - LEVEL_FIRST] = 0; // not used at the moment
+ l_totals[LEVEL_WAL - LEVEL_FIRST] = _wal_total;
+ l_totals[LEVEL_DB - LEVEL_FIRST] = _db_total;
+ l_totals[LEVEL_SLOW - LEVEL_FIRST] = _slow_total;
+
+ level0_size = _level0_size;
+ level_base = _level_base;
+ level_multiplier = _level_multiplier;
+
+ new_pol = _new_pol;
+ }
+
+ // Recompute db_avail4slow and extra_level from current config values;
+ // no-op unless the new policy is enabled.
+ void update_from_config(CephContext* cct) override
+ {
+ if (!new_pol) {
+ return;
+ }
+
+ db_avail4slow = 0;
+ extra_level = 0;
+ double reserved_factor =
+ cct->_conf->bluestore_volume_selection_reserved_factor;
+ uint64_t reserved = cct->_conf->bluestore_volume_selection_reserved;
+
+ auto db_total = l_totals[LEVEL_DB - LEVEL_FIRST];
+ // Calculating how much extra space is available at DB volume.
+ // Depending on the presence of explicit reserved size specification it might be either
+ // * DB volume size - reserved
+ // or
+ // * DB volume size - sum_max_level_size(0, L-1) - max_level_size(L) * reserved_factor
+ if (!reserved) {
+ uint64_t prev_levels = level0_size;
+ uint64_t cur_level = level_base;
+ extra_level = 1;
+ do {
+ uint64_t next_level = cur_level * level_multiplier;
+ uint64_t next_threshold = prev_levels + cur_level + next_level;
+ ++extra_level;
+ if (db_total <= next_threshold) {
+ uint64_t cur_threshold = prev_levels + cur_level * reserved_factor;
+ db_avail4slow = cur_threshold < db_total ? db_total - cur_threshold : 0;
+ break;
+ }
+ else {
+ prev_levels += cur_level;
+ cur_level = next_level;
+ }
+ } while (true);
+ }
+ else {
+ db_avail4slow = reserved < db_total ? db_total - reserved : 0;
+ extra_level = 0;
+ }
+ }
+
+ // Bytes of the DB volume currently considered usable for SLOW data.
+ uint64_t get_available_extra() const {
+ return db_avail4slow;
+ }
+ uint64_t get_extra_level() const {
+ return extra_level;
+ }
+ void* get_hint_for_log() const override {
+ return reinterpret_cast<void*>(LEVEL_LOG);
+ }
+ void* get_hint_by_dir(std::string_view dirname) const override;
+
+ // Account a newly allocated extent against its level/device and keep
+ // the observed maximums up to date.
+ void add_usage(void* hint, const bluefs_extent_t& extent) override {
+ if (hint == nullptr)
+ return;
+ size_t pos = (size_t)hint - LEVEL_FIRST;
+ auto& cur = per_level_per_dev_usage.at(extent.bdev, pos);
+ auto& max = per_level_per_dev_max.at(extent.bdev, pos);
+ uint64_t v = cur.fetch_add(extent.length) + extent.length;
+ // NOTE(review): an unconditional exchange() may overwrite a larger value
+ // stored by a concurrent thread; a compare_exchange loop would be
+ // race-free - confirm this approximation is acceptable here.
+ while (v > max) {
+ max.exchange(v);
+ }
+ {
+ //update per-device totals
+ auto& cur = per_level_per_dev_usage.at(extent.bdev, LEVEL_MAX - LEVEL_FIRST);
+ auto& max = per_level_per_dev_max.at(extent.bdev, LEVEL_MAX - LEVEL_FIRST);
+ uint64_t v = cur.fetch_add(extent.length) + extent.length;
+ while (v > max) {
+ max.exchange(v);
+ }
+ }
+ }
+ // Release accounting for a freed extent; maximums are left untouched.
+ void sub_usage(void* hint, const bluefs_extent_t& extent) override {
+ if (hint == nullptr)
+ return;
+ size_t pos = (size_t)hint - LEVEL_FIRST;
+ auto& cur = per_level_per_dev_usage.at(extent.bdev, pos);
+ ceph_assert(cur >= extent.length);
+ cur -= extent.length;
+
+ //update per-device totals
+ auto& cur2 = per_level_per_dev_usage.at(extent.bdev, LEVEL_MAX - LEVEL_FIRST);
+ ceph_assert(cur2 >= extent.length);
+ cur2 -= extent.length;
+ }
+ // Account size_more bytes of actual file data for the level (stored in the
+ // extra MAX_BDEV column); optionally bumps the per-level file counters.
+ void add_usage(void* hint, uint64_t size_more, bool upd_files) override {
+ if (hint == nullptr)
+ return;
+ size_t pos = (size_t)hint - LEVEL_FIRST;
+ //update per-level actual totals
+ auto& cur = per_level_per_dev_usage.at(BlueFS::MAX_BDEV, pos);
+ auto& max = per_level_per_dev_max.at(BlueFS::MAX_BDEV, pos);
+ uint64_t v = cur.fetch_add(size_more) + size_more;
+ while (v > max) {
+ max.exchange(v);
+ }
+ if (upd_files) {
+ ++per_level_files[pos];
+ ++per_level_files[LEVEL_MAX - LEVEL_FIRST];
+ }
+ }
+ void sub_usage(void* hint, uint64_t size_less, bool upd_files) override {
+ if (hint == nullptr)
+ return;
+ size_t pos = (size_t)hint - LEVEL_FIRST;
+ //update per-level actual totals
+ auto& cur = per_level_per_dev_usage.at(BlueFS::MAX_BDEV, pos);
+ ceph_assert(cur >= size_less);
+ cur -= size_less;
+ if (upd_files) {
+ ceph_assert(per_level_files[pos] > 0);
+ --per_level_files[pos];
+ ceph_assert(per_level_files[LEVEL_MAX - LEVEL_FIRST] > 0);
+ --per_level_files[LEVEL_MAX - LEVEL_FIRST];
+ }
+ }
+
+ uint8_t select_prefer_bdev(void* h) override;
+ void get_paths(
+ const std::string& base,
+ BlueFSVolumeSelector::paths& res) const override;
+
+ void dump(std::ostream& sout) override;
+ BlueFSVolumeSelector* clone_empty() const override;
+ bool compare(BlueFSVolumeSelector* other) override;
+};
+
/**
* Directional graph of locks.
* Vertices - Locks. Edges (directed) - locking progression.
return repaired;
}
-// =======================================================
-// RocksDBBlueFSVolumeSelector
-
-uint8_t RocksDBBlueFSVolumeSelector::select_prefer_bdev(void* h) {
- ceph_assert(h != nullptr);
- uint64_t hint = reinterpret_cast<uint64_t>(h);
- uint8_t res;
- switch (hint) {
- case LEVEL_SLOW:
- res = BlueFS::BDEV_SLOW;
- if (db_avail4slow > 0) {
- // considering statically available db space vs.
- // - observed maximums on DB dev for DB/WAL/UNSORTED data
- // - observed maximum spillovers
- uint64_t max_db_use = 0; // max db usage we potentially observed
- max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_DB, LEVEL_LOG - LEVEL_FIRST);
- max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_DB, LEVEL_WAL - LEVEL_FIRST);
- max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_DB, LEVEL_DB - LEVEL_FIRST);
- // this could go to db hence using it in the estimation
- max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_SLOW, LEVEL_DB - LEVEL_FIRST);
-
- auto db_total = l_totals[LEVEL_DB - LEVEL_FIRST];
- uint64_t avail = min(
- db_avail4slow,
- max_db_use < db_total ? db_total - max_db_use : 0);
-
- // considering current DB dev usage for SLOW data
- if (avail > per_level_per_dev_usage.at(BlueFS::BDEV_DB, LEVEL_SLOW - LEVEL_FIRST)) {
- res = BlueFS::BDEV_DB;
- }
- }
- break;
- case LEVEL_LOG:
- case LEVEL_WAL:
- res = BlueFS::BDEV_WAL;
- break;
- case LEVEL_DB:
- default:
- res = BlueFS::BDEV_DB;
- break;
- }
- return res;
-}
-
-void RocksDBBlueFSVolumeSelector::get_paths(const std::string& base, paths& res) const
-{
- auto db_size = l_totals[LEVEL_DB - LEVEL_FIRST];
- res.emplace_back(base, db_size);
- auto slow_size = l_totals[LEVEL_SLOW - LEVEL_FIRST];
- if (slow_size == 0) {
- slow_size = db_size;
- }
- res.emplace_back(base + ".slow", slow_size);
-}
-
-void* RocksDBBlueFSVolumeSelector::get_hint_by_dir(std::string_view dirname) const {
- uint8_t res = LEVEL_DB;
- if (dirname.length() > 5) {
- // the "db.slow" and "db.wal" directory names are hard-coded at
- // match up with bluestore. the slow device is always the second
- // one (when a dedicated block.db device is present and used at
- // bdev 0). the wal device is always last.
- if (boost::algorithm::ends_with(dirname, ".slow")) {
- res = LEVEL_SLOW;
- }
- else if (boost::algorithm::ends_with(dirname, ".wal")) {
- res = LEVEL_WAL;
- }
- }
- return reinterpret_cast<void*>(res);
-}
-
-void RocksDBBlueFSVolumeSelector::dump(ostream& sout) {
- auto max_x = per_level_per_dev_usage.get_max_x();
- auto max_y = per_level_per_dev_usage.get_max_y();
-
- sout << "RocksDBBlueFSVolumeSelector " << std::endl;
- sout << ">>Settings<<"
- << " extra=" << byte_u_t(db_avail4slow)
- << ", extra level=" << extra_level
- << ", l0_size=" << byte_u_t(level0_size)
- << ", l_base=" << byte_u_t(level_base)
- << ", l_multi=" << byte_u_t(level_multiplier)
- << std::endl;
- constexpr std::array<const char*, 8> names{ {
- "LEV/DEV",
- "WAL",
- "DB",
- "SLOW",
- "*",
- "*",
- "REAL",
- "FILES",
- } };
- const size_t width = 12;
- for (size_t i = 0; i < names.size(); ++i) {
- sout.setf(std::ios::left, std::ios::adjustfield);
- sout.width(width);
- sout << names[i];
- }
- sout << std::endl;
- for (size_t l = 0; l < max_y; l++) {
- sout.setf(std::ios::left, std::ios::adjustfield);
- sout.width(width);
- switch (l + LEVEL_FIRST) {
- case LEVEL_LOG:
- sout << "log"; break;
- case LEVEL_WAL:
- sout << "db.wal"; break;
- case LEVEL_DB:
- sout << "db"; break;
- case LEVEL_SLOW:
- sout << "db.slow"; break;
- case LEVEL_MAX:
- sout << "TOTAL"; break;
- }
- for (size_t d = 0; d < max_x; d++) {
- sout.setf(std::ios::left, std::ios::adjustfield);
- sout.width(width);
- sout << stringify(byte_u_t(per_level_per_dev_usage.at(d, l)));
- }
- sout.setf(std::ios::left, std::ios::adjustfield);
- sout.width(width);
- sout << stringify(per_level_files[l]) << std::endl;
- }
- ceph_assert(max_x == per_level_per_dev_max.get_max_x());
- ceph_assert(max_y == per_level_per_dev_max.get_max_y());
- sout << "MAXIMUMS:" << std::endl;
- for (size_t l = 0; l < max_y; l++) {
- sout.setf(std::ios::left, std::ios::adjustfield);
- sout.width(width);
- switch (l + LEVEL_FIRST) {
- case LEVEL_LOG:
- sout << "log"; break;
- case LEVEL_WAL:
- sout << "db.wal"; break;
- case LEVEL_DB:
- sout << "db"; break;
- case LEVEL_SLOW:
- sout << "db.slow"; break;
- case LEVEL_MAX:
- sout << "TOTAL"; break;
- }
- for (size_t d = 0; d < max_x - 1; d++) {
- sout.setf(std::ios::left, std::ios::adjustfield);
- sout.width(width);
- sout << stringify(byte_u_t(per_level_per_dev_max.at(d, l)));
- }
- sout.setf(std::ios::left, std::ios::adjustfield);
- sout.width(width);
- sout << stringify(byte_u_t(per_level_per_dev_max.at(max_x - 1, l)));
- sout << std::endl;
- }
- string sizes[] = {
- ">> SIZE <<",
- stringify(byte_u_t(l_totals[LEVEL_WAL - LEVEL_FIRST])),
- stringify(byte_u_t(l_totals[LEVEL_DB - LEVEL_FIRST])),
- stringify(byte_u_t(l_totals[LEVEL_SLOW - LEVEL_FIRST])),
- };
- for (size_t i = 0; i < (sizeof(sizes) / sizeof(sizes[0])); i++) {
- sout.setf(std::ios::left, std::ios::adjustfield);
- sout.width(width);
- sout << sizes[i];
- }
- sout << std::endl;
-}
-
-BlueFSVolumeSelector* RocksDBBlueFSVolumeSelector::clone_empty() const {
- RocksDBBlueFSVolumeSelector* ns =
- new RocksDBBlueFSVolumeSelector(0, 0, 0, 0, 0, 0, false);
- return ns;
-}
-
-bool RocksDBBlueFSVolumeSelector::compare(BlueFSVolumeSelector* other) {
- RocksDBBlueFSVolumeSelector* o = dynamic_cast<RocksDBBlueFSVolumeSelector*>(other);
- ceph_assert(o);
- bool equal = true;
- for (size_t x = 0; x < BlueFS::MAX_BDEV + 1; x++) {
- for (size_t y = 0; y <LEVEL_MAX - LEVEL_FIRST + 1; y++) {
- equal &= (per_level_per_dev_usage.at(x, y) == o->per_level_per_dev_usage.at(x, y));
- }
- }
- for (size_t t = 0; t < LEVEL_MAX - LEVEL_FIRST + 1; t++) {
- equal &= (per_level_files[t] == o->per_level_files[t]);
- }
- return equal;
-}
-
-// =======================================================
-
//================================================================================================================
// BlueStore is committing all allocation information (alloc/release) into RocksDB before the client Write is performed.
// This cause a delay in write path and add significant load to the CPU/Memory/Disk.
};
-class RocksDBBlueFSVolumeSelector : public BlueFSVolumeSelector
-{
- template <class T, size_t MaxX, size_t MaxY>
- class matrix_2d {
- T values[MaxX][MaxY];
- public:
- matrix_2d() {
- clear();
- }
- T& at(size_t x, size_t y) {
- ceph_assert(x < MaxX);
- ceph_assert(y < MaxY);
-
- return values[x][y];
- }
- size_t get_max_x() const {
- return MaxX;
- }
- size_t get_max_y() const {
- return MaxY;
- }
- void clear() {
- memset(values, 0, sizeof(values));
- }
- };
-
- enum {
- // use 0/nullptr as unset indication
- LEVEL_FIRST = 1,
- LEVEL_LOG = LEVEL_FIRST, // BlueFS log
- LEVEL_WAL,
- LEVEL_DB,
- LEVEL_SLOW,
- LEVEL_MAX
- };
- // add +1 row for per-level actual (taken from file size) total
- // add +1 column for corresponding per-device totals
- typedef matrix_2d<std::atomic<uint64_t>, BlueFS::MAX_BDEV + 1, LEVEL_MAX - LEVEL_FIRST + 1> per_level_per_dev_usage_t;
-
- per_level_per_dev_usage_t per_level_per_dev_usage;
- // file count per level, add +1 to keep total file count
- std::atomic<uint64_t> per_level_files[LEVEL_MAX - LEVEL_FIRST + 1] = { 0 };
-
- // Note: maximum per-device totals below might be smaller than corresponding
- // perf counters by up to a single alloc unit (1M) due to superblock extent.
- // The later is not accounted here.
- per_level_per_dev_usage_t per_level_per_dev_max;
-
- uint64_t l_totals[LEVEL_MAX - LEVEL_FIRST];
- uint64_t db_avail4slow = 0;
- uint64_t level0_size = 0;
- uint64_t level_base = 0;
- uint64_t level_multiplier = 0;
- bool new_pol = false;
- size_t extra_level = 0;
- enum {
- OLD_POLICY,
- USE_SOME_EXTRA
- };
-
-public:
- RocksDBBlueFSVolumeSelector(
- uint64_t _wal_total,
- uint64_t _db_total,
- uint64_t _slow_total,
- uint64_t _level0_size,
- uint64_t _level_base,
- uint64_t _level_multiplier,
- bool _new_pol) {
-
- l_totals[LEVEL_LOG - LEVEL_FIRST] = 0; // not used at the moment
- l_totals[LEVEL_WAL - LEVEL_FIRST] = _wal_total;
- l_totals[LEVEL_DB - LEVEL_FIRST] = _db_total;
- l_totals[LEVEL_SLOW - LEVEL_FIRST] = _slow_total;
-
- level0_size = _level0_size;
- level_base = _level_base;
- level_multiplier = _level_multiplier;
-
- new_pol = _new_pol;
- }
-
- void update_from_config(CephContext* cct) override
- {
- if (!new_pol) {
- return;
- }
-
- db_avail4slow = 0;
- extra_level = 0;
- double reserved_factor =
- cct->_conf->bluestore_volume_selection_reserved_factor;
- uint64_t reserved = cct->_conf->bluestore_volume_selection_reserved;
-
- auto db_total = l_totals[LEVEL_DB - LEVEL_FIRST];
- // Calculating how much extra space is available at DB volume.
- // Depending on the presence of explicit reserved size specification it might be either
- // * DB volume size - reserved
- // or
- // * DB volume size - sum_max_level_size(0, L-1) - max_level_size(L) * reserved_factor
- if (!reserved) {
- uint64_t prev_levels = level0_size;
- uint64_t cur_level = level_base;
- extra_level = 1;
- do {
- uint64_t next_level = cur_level * level_multiplier;
- uint64_t next_threshold = prev_levels + cur_level + next_level;
- ++extra_level;
- if (db_total <= next_threshold) {
- uint64_t cur_threshold = prev_levels + cur_level * reserved_factor;
- db_avail4slow = cur_threshold < db_total ? db_total - cur_threshold : 0;
- break;
- } else {
- prev_levels += cur_level;
- cur_level = next_level;
- }
- } while (true);
- } else {
- db_avail4slow = reserved < db_total ? db_total - reserved : 0;
- extra_level = 0;
- }
- }
-
- uint64_t get_available_extra() const {
- return db_avail4slow;
- }
- uint64_t get_extra_level() const {
- return extra_level;
- }
- void* get_hint_for_log() const override {
- return reinterpret_cast<void*>(LEVEL_LOG);
- }
- void* get_hint_by_dir(std::string_view dirname) const override;
-
- void add_usage(void* hint, const bluefs_extent_t& extent) override {
- if (hint == nullptr)
- return;
- size_t pos = (size_t)hint - LEVEL_FIRST;
- auto& cur = per_level_per_dev_usage.at(extent.bdev, pos);
- auto& max = per_level_per_dev_max.at(extent.bdev, pos);
- uint64_t v = cur.fetch_add(extent.length) + extent.length;
- while (v > max) {
- max.exchange(v);
- }
- {
- //update per-device totals
- auto& cur = per_level_per_dev_usage.at(extent.bdev, LEVEL_MAX - LEVEL_FIRST);
- auto& max = per_level_per_dev_max.at(extent.bdev, LEVEL_MAX - LEVEL_FIRST);
- uint64_t v = cur.fetch_add(extent.length) + extent.length;
- while (v > max) {
- max.exchange(v);
- }
- }
- }
- void sub_usage(void* hint, const bluefs_extent_t& extent) override {
- if (hint == nullptr)
- return;
- size_t pos = (size_t)hint - LEVEL_FIRST;
- auto& cur = per_level_per_dev_usage.at(extent.bdev, pos);
- ceph_assert(cur >= extent.length);
- cur -= extent.length;
-
- //update per-device totals
- auto& cur2 = per_level_per_dev_usage.at(extent.bdev, LEVEL_MAX - LEVEL_FIRST);
- ceph_assert(cur2 >= extent.length);
- cur2 -= extent.length;
- }
- void add_usage(void* hint, uint64_t size_more, bool upd_files) override {
- if (hint == nullptr)
- return;
- size_t pos = (size_t)hint - LEVEL_FIRST;
- //update per-level actual totals
- auto& cur = per_level_per_dev_usage.at(BlueFS::MAX_BDEV, pos);
- auto& max = per_level_per_dev_max.at(BlueFS::MAX_BDEV, pos);
- uint64_t v = cur.fetch_add(size_more) + size_more;
- while (v > max) {
- max.exchange(v);
- }
- if (upd_files) {
- ++per_level_files[pos];
- ++per_level_files[LEVEL_MAX - LEVEL_FIRST];
- }
- }
- void sub_usage(void* hint, uint64_t size_less, bool upd_files) override {
- if (hint == nullptr)
- return;
- size_t pos = (size_t)hint - LEVEL_FIRST;
- //update per-level actual totals
- auto& cur = per_level_per_dev_usage.at(BlueFS::MAX_BDEV, pos);
- ceph_assert(cur >= size_less);
- cur -= size_less;
- if (upd_files) {
- ceph_assert(per_level_files[pos] > 0);
- --per_level_files[pos];
- ceph_assert(per_level_files[LEVEL_MAX - LEVEL_FIRST] > 0);
- --per_level_files[LEVEL_MAX - LEVEL_FIRST];
- }
- }
-
- uint8_t select_prefer_bdev(void* h) override;
- void get_paths(
- const std::string& base,
- BlueFSVolumeSelector::paths& res) const override;
-
- void dump(std::ostream& sout) override;
- BlueFSVolumeSelector* clone_empty() const override;
- bool compare(BlueFSVolumeSelector* other) override;
-};
-
#endif