It allows higher DB levels to use extra space at the DB device.
Signed-off-by: Igor Fedotov <ifedotov@suse.com>
OPTION(bluestore_log_omap_iterator_age, OPT_DOUBLE)
OPTION(bluestore_log_collection_list_age, OPT_DOUBLE)
OPTION(bluestore_debug_enforce_settings, OPT_STR)
+OPTION(bluestore_volume_selection_policy, OPT_STR)
+OPTION(bluestore_volume_selection_reserved_factor, OPT_DOUBLE)
OPTION(kstore_max_ops, OPT_U64)
OPTION(kstore_max_bytes, OPT_U64)
.set_default(4)
.set_description(""),
+ Option("bluestore_volume_selection_policy", Option::TYPE_STR, Option::LEVEL_DEV)
+ .set_default("rocksdb_original")
+ .set_enum_allowed({ "rocksdb_original", "use_some_extra" })
+ .set_description("Determines bluefs volume selection policy")
+  .set_long_description("Determines bluefs volume selection policy. The 'use_some_extra' policy allows overriding RocksDB level granularity and placing a higher level's data on the faster device even when that level doesn't completely fit there"),
+
+ Option("bluestore_volume_selection_reserved_factor", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_flag(Option::FLAG_STARTUP)
+ .set_default(2.0)
+  .set_description("DB level size multiplier. Determines how much space at the DB device to bar from use when the 'use some extra' policy is in action. The reserved size is computed as sum(L_max_size[0], ..., L_max_size[L-1]) + L_max_size[L] * this_factor"),
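+  // An illustrative ceph.conf snippet (assumed typical values) enabling the
+  // new selection policy:
+  //   bluestore_volume_selection_policy = use_some_extra
+  //   bluestore_volume_selection_reserved_factor = 2.0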
+
// -----------------------------------------
// kstore
discard_cb[BDEV_DB] = db_discard_cb;
discard_cb[BDEV_SLOW] = slow_discard_cb;
asok_hook = SocketHook::create(this);
- // set default volume selector
- vselector.reset(new OriginalVolumeSelector(this));
}
BlueFS::~BlueFS()
<< " osd_uuid " << osd_uuid
<< dendl;
+  // set the default volume selector if one was not provided externally
+ if (vselector == nullptr) {
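+    // device sizes are passed at ~95% here, preserving the headroom the
+    // previous hard-coded db_paths sizing applied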
+ vselector.reset(
+ new OriginalVolumeSelector(
+ get_block_device_size(BlueFS::BDEV_WAL) * 95 / 100,
+ get_block_device_size(BlueFS::BDEV_DB) * 95 / 100,
+ get_block_device_size(BlueFS::BDEV_SLOW) * 95 / 100));
+ }
+
_init_alloc();
_init_logger();
_close_writer(log_writer);
log_writer = NULL;
block_all.clear();
+ vselector.reset(nullptr);
_stop_alloc();
_shutdown_logger();
goto out;
}
+  // set the default volume selector if one was not provided externally
+ if (vselector == nullptr) {
+ vselector.reset(
+ new OriginalVolumeSelector(
+ get_block_device_size(BlueFS::BDEV_WAL) * 95 / 100,
+ get_block_device_size(BlueFS::BDEV_DB) * 95 / 100,
+ get_block_device_size(BlueFS::BDEV_SLOW) * 95 / 100));
+ }
+
block_all.clear();
block_all.resize(MAX_BDEV);
_init_alloc();
_close_writer(log_writer);
log_writer = NULL;
+ vselector.reset(nullptr);
_stop_alloc();
file_map.clear();
dir_map.clear();
void OriginalVolumeSelector::get_paths(const std::string& base, paths& res) const
{
- // we have both block.db and block; tell rocksdb!
- // note: the second (last) size value doesn't really matter
- uint64_t db_size = bluefs->get_block_device_size(BlueFS::BDEV_DB);
- uint64_t slow_size = bluefs->get_block_device_size(BlueFS::BDEV_SLOW);
- res.emplace_back(base, (uint64_t)(db_size * 95 / 100));
- res.emplace_back(base + ".slow", (uint64_t)(slow_size * 95 / 100));
+ res.emplace_back(base, db_total);
+ res.emplace_back(base + ".slow", slow_total);
}
#undef dout_prefix
#define dout_prefix *_dout << "OriginalVolumeSelector: "
void OriginalVolumeSelector::dump(CephContext* c) {
- ldout(c, 1) << "OriginalVolumeSelector" << dendl;
+ ldout(c, 1) << "wal_total:" << wal_total
+ << ", db_total:" << db_total
+ << ", slow_total:" << slow_total
+ << dendl;
}
virtual void dump(CephContext* cct) = 0;
};
class BlueFS;
-class OriginalVolumeSelector : public BlueFSVolumeSelector {
- BlueFS* bluefs = nullptr;
-public:
- OriginalVolumeSelector(BlueFS* _bluefs) : bluefs(_bluefs) {}
-
- void* get_hint_by_device(uint8_t dev) const override;
- void* get_hint_by_dir(const string& dirname) const override;
-
- void add_usage(void* file_hint, const bluefs_fnode_t& fnode) override {
- // do nothing
- }
- void sub_usage(void* file_hint, const bluefs_fnode_t& fnode) override {
- // do nothing
- }
- void add_usage(void* file_hint, uint64_t fsize) override {
- // do nothing
- }
- void sub_usage(void* file_hint, uint64_t fsize) override {
- // do nothing
- }
- uint8_t select_prefer_bdev(void* hint) override;
- void get_paths(const std::string& base, paths& res) const override;
- void dump(CephContext* cct) override;
-
-};
class BlueFS {
public:
void debug_inject_duplicate_gift(unsigned bdev, uint64_t offset, uint64_t len);
};
+class OriginalVolumeSelector : public BlueFSVolumeSelector {
+ uint64_t wal_total;
+ uint64_t db_total;
+ uint64_t slow_total;
+
+public:
+ OriginalVolumeSelector(
+ uint64_t _wal_total,
+ uint64_t _db_total,
+ uint64_t _slow_total)
+ : wal_total(_wal_total), db_total(_db_total), slow_total(_slow_total) {}
+
+ void* get_hint_by_device(uint8_t dev) const override;
+ void* get_hint_by_dir(const string& dirname) const override;
+
+  void add_usage(void* hint, const bluefs_fnode_t& fnode) override {
+    // do nothing
+  }
+  void sub_usage(void* hint, const bluefs_fnode_t& fnode) override {
+    // do nothing
+  }
+  void add_usage(void* hint, uint64_t fsize) override {
+    // do nothing
+  }
+  void sub_usage(void* hint, uint64_t fsize) override {
+    // do nothing
+  }
+
+ uint8_t select_prefer_bdev(void* hint) override;
+ void get_paths(const std::string& base, paths& res) const override;
+ void dump(CephContext* cct) override;
+};
+
#endif
#include <memory>
#include <string>
+#include "rocksdb/options.h"
#include "rocksdb/status.h"
#include "rocksdb/utilities/env_mirror.h"
#include "include/ceph_assert.h"
+#include "kv/RocksDBStore.h"
class BlueFS;
#include <fcntl.h>
#include <boost/container/flat_set.hpp>
+#include "boost/algorithm/string.hpp"
#include "include/cpp-btree/btree_set.h"
if (r < 0) {
return r;
}
+ RocksDBBlueFSVolumeSelector* vselector = nullptr;
+ if (bluefs_layout.shared_bdev == BlueFS::BDEV_SLOW) {
+
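+    // parse the configured RocksDB options so the selector can reuse
+    // RocksDB's own level sizing (max_bytes_for_level_base/multiplier)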
+ string options = cct->_conf->bluestore_rocksdb_options;
+
+ rocksdb::Options rocks_opts;
+ int r = RocksDBStore::ParseOptionsFromStringStatic(
+ cct,
+ options,
+ rocks_opts,
+ nullptr);
+ if (r < 0) {
+ return r;
+ }
+
+ double reserved_factor = cct->_conf->bluestore_volume_selection_reserved_factor;
+ vselector =
+ new RocksDBBlueFSVolumeSelector(
+ bluefs->get_block_device_size(BlueFS::BDEV_WAL) * 95 / 100,
+ bluefs->get_block_device_size(BlueFS::BDEV_DB) * 95 / 100,
+ bluefs->get_block_device_size(BlueFS::BDEV_SLOW) * 95 / 100,
+ 1024 * 1024 * 1024, //FIXME: set expected l0 size here
+ rocks_opts.max_bytes_for_level_base,
+ rocks_opts.max_bytes_for_level_multiplier,
+ reserved_factor,
+ cct->_conf->bluestore_volume_selection_policy != "rocksdb_original");
+ }
if (create) {
bluefs->mkfs(fsid, bluefs_layout);
}
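+  // vselector may still be nullptr here (e.g. when there is no dedicated
+  // block.db); in that case BlueFS falls back to OriginalVolumeSelector on
+  // mkfs/mount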
+ bluefs->set_volume_selector(vselector);
r = bluefs->mount();
if (r < 0) {
derr << __func__ << " failed bluefs mount: " << cpp_strerror(r) << dendl;
if (r < 0) {
return r;
}
- bluefs->set_slow_device_expander(this);
if (cct->_conf->bluestore_bluefs_env_mirror) {
- rocksdb::Env *a = new BlueRocksEnv(bluefs);
- rocksdb::Env *b = rocksdb::Env::Default();
+ rocksdb::Env* a = new BlueRocksEnv(bluefs);
+ rocksdb::Env* b = rocksdb::Env::Default();
if (create) {
- string cmd = "rm -rf " + path + "/db " +
- path + "/db.slow " +
- path + "/db.wal";
- int r = system(cmd.c_str());
- (void)r;
+ string cmd = "rm -rf " + path + "/db " +
+ path + "/db.slow " +
+ path + "/db.wal";
+ int r = system(cmd.c_str());
+ (void)r;
}
env = new rocksdb::EnvMirror(b, a, false, true);
  } else {
env = new BlueRocksEnv(bluefs);
// simplify the dir names, too, as "seen" by rocksdb
fn = "db";
}
+ bluefs->set_slow_device_expander(this);
+ BlueFSVolumeSelector::paths paths;
+ bluefs->get_vselector_paths(fn, paths);
if (bluefs_layout.shared_bdev == BlueFS::BDEV_SLOW) {
// we have both block.db and block; tell rocksdb!
// note: the second (last) size value doesn't really matter
ostringstream db_paths;
- uint64_t db_size = bluefs->get_block_device_size(BlueFS::BDEV_DB);
- uint64_t slow_size = bluefs->get_block_device_size(BlueFS::BDEV_SLOW);
- db_paths << fn << ","
- << (uint64_t)(db_size * 95 / 100) << " "
- << fn + ".slow" << ","
- << (uint64_t)(slow_size * 95 / 100);
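+    // db_paths takes the form "<path>,<size> <path>,<size> ...", one entry per
+    // selector-provided path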
+ bool first = true;
+ for (auto& p : paths) {
+ if (!first) {
+ db_paths << " ";
+ }
+ first = false;
+      db_paths << p.first << "," << p.second;
+    }
kv_options["db_paths"] = db_paths.str();
- dout(10) << __func__ << " set db_paths to " << db_paths.str() << dendl;
+ dout(1) << __func__ << " set db_paths to " << db_paths.str() << dendl;
}
if (create) {
- env->CreateDir(fn);
+ for (auto& p : paths) {
+ env->CreateDir(p.first);
+ }
+    // selectors don't provide the WAL path so far, hence create it explicitly
env->CreateDir(fn + ".wal");
- env->CreateDir(fn + ".slow");
} else {
std::vector<std::string> res;
// check for dir presence
}
// =======================================================
+// RocksDBBlueFSVolumeSelector
+
+uint8_t RocksDBBlueFSVolumeSelector::select_prefer_bdev(void* h) {
+ ceph_assert(h != nullptr);
+ uint64_t hint = reinterpret_cast<uint64_t>(h);
+ uint8_t res;
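+  // map the level hint onto a target device; SLOW-level data may be placed on
+  // the DB device while spare (reserved-factor bounded) capacity remains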
+ switch (hint) {
+ case LEVEL_SLOW:
+ res = BlueFS::BDEV_SLOW;
+ if (db_avail4slow > 0) {
+ // considering statically available db space vs.
+ // - observed maximums on DB dev for DB/WAL/UNSORTED data
+ // - observed maximum spillovers
+ uint64_t max_db_use = 0; // max db usage we potentially observed
+ max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_DB, LEVEL_WAL - LEVEL_FIRST);
+ max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_DB, LEVEL_DB - LEVEL_FIRST);
+      // DB-level data that spilled to slow could reside on db as well,
+      // hence include it in the estimate
+ max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_SLOW, LEVEL_DB - LEVEL_FIRST);
+
+ auto db_total = l_totals[LEVEL_DB - LEVEL_FIRST];
+ uint64_t avail = min(
+ db_avail4slow,
+ max_db_use < db_total ? db_total - max_db_use : 0);
+
+ // considering current DB dev usage for SLOW data
+ if (avail > per_level_per_dev_usage.at(BlueFS::BDEV_DB, LEVEL_SLOW - LEVEL_FIRST)) {
+ res = BlueFS::BDEV_DB;
+ }
+ }
+ break;
+ case LEVEL_WAL:
+ res = BlueFS::BDEV_WAL;
+ break;
+ case LEVEL_DB:
+ default:
+ res = BlueFS::BDEV_DB;
+ break;
+ }
+ return res;
+}
+void RocksDBBlueFSVolumeSelector::get_paths(const std::string& base, paths& res) const
+{
+ res.emplace_back(base, l_totals[LEVEL_DB - LEVEL_FIRST]);
+ res.emplace_back(base + ".slow", l_totals[LEVEL_SLOW - LEVEL_FIRST]);
+}
+
+void* RocksDBBlueFSVolumeSelector::get_hint_by_dir(const string& dirname) const {
+ uint8_t res = LEVEL_DB;
+ if (dirname.length() > 5) {
+ // the "db.slow" and "db.wal" directory names are hard-coded at
+ // match up with bluestore. the slow device is always the second
+ // one (when a dedicated block.db device is present and used at
+ // bdev 0). the wal device is always last.
+ if (boost::algorithm::ends_with(dirname, ".slow")) {
+ res = LEVEL_SLOW;
+    } else if (boost::algorithm::ends_with(dirname, ".wal")) {
+ res = LEVEL_WAL;
+ }
+ }
+ return reinterpret_cast<void*>(res);
+}
+
+#undef dout_prefix
+#define dout_prefix *_dout << "RocksDBBlueFSVolumeSelector: "
+
+void RocksDBBlueFSVolumeSelector::dump(CephContext* c) {
+ stringstream matrix_output;
+ auto max_x = per_level_per_dev_usage.get_max_x();
+ auto max_y = per_level_per_dev_usage.get_max_y();
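+  // one column per BlueFS device (the "****" slots correspond to bdevs unused
+  // here, presumably NEWWAL/NEWDB) plus the per-level "REAL" (file-size) total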
+ matrix_output << "LEVEL, WAL, DB, SLOW, ****, ****, REAL" << std::endl;
+ for (size_t l = 0; l < max_y; l++) {
+ switch (l + LEVEL_FIRST) {
+ case LEVEL_WAL:
+ matrix_output << "WAL "; break;
+ case LEVEL_DB:
+ matrix_output << "DB "; break;
+ case LEVEL_SLOW:
+ matrix_output << "SLOW" << " "; break;
+ case LEVEL_MAX:
+ matrix_output << "TOTALS "; break;
+ }
+ for (size_t d = 0; d < max_x - 1; d++) {
+ matrix_output << per_level_per_dev_usage.at(d, l) << ",";
+ }
+ matrix_output << per_level_per_dev_usage.at(max_x - 1, l) << std::endl;
+ }
+ ceph_assert(max_x == per_level_per_dev_max.get_max_x());
+ ceph_assert(max_y == per_level_per_dev_max.get_max_y());
+ matrix_output << "MAXIMUMS:" << std::endl;
+ for (size_t l = 0; l < max_y; l++) {
+ switch (l + LEVEL_FIRST) {
+ case LEVEL_WAL:
+ matrix_output << "WAL "; break;
+ case LEVEL_DB:
+ matrix_output << "DB "; break;
+ case LEVEL_SLOW:
+ matrix_output << "SLOW" << " "; break;
+ case LEVEL_MAX:
+ matrix_output << "TOTALS "; break;
+ }
+ for (size_t d = 0; d < max_x - 1; d++) {
+ matrix_output << per_level_per_dev_max.at(d, l) << ",";
+ }
+ matrix_output << per_level_per_dev_max.at(max_x - 1, l);
+ if (l < max_y - 1) {
+ matrix_output << std::endl;
+ }
+ }
+ ldout(c, 1)
+ << "wal_total:" << l_totals[LEVEL_WAL - LEVEL_FIRST]
+ << ", db_total:" << l_totals[LEVEL_DB - LEVEL_FIRST]
+ << ", slow_total:" << l_totals[LEVEL_SLOW - LEVEL_FIRST]
+ << ", db_avail:" << db_avail4slow
+ << " usage matrix:" << std::endl
+ << matrix_output.str()
+ << dendl;
+}
+
+// =======================================================
fsck_interval misreferenced_extents;
};
+
+class RocksDBBlueFSVolumeSelector : public BlueFSVolumeSelector
+{
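+  // small fixed-size 2D counter table, addressed as (device, level)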
+ template <class T, size_t MaxX, size_t MaxY>
+ class matrix_2d {
+ T values[MaxX][MaxY];
+ public:
+ matrix_2d() {
+ clear();
+ }
+ T& at(size_t x, size_t y) {
+ ceph_assert(x < MaxX);
+ ceph_assert(y < MaxY);
+
+ return values[x][y];
+ }
+ size_t get_max_x() const {
+ return MaxX;
+ }
+ size_t get_max_y() const {
+ return MaxY;
+ }
+ void clear() {
+ memset(values, 0, sizeof(values));
+ }
+ };
+
+ enum {
+ // use 0/nullptr as unset indication
+ LEVEL_FIRST = 1,
+ LEVEL_WAL = LEVEL_FIRST,
+ LEVEL_DB,
+ LEVEL_SLOW,
+ LEVEL_MAX
+ };
+ // add +1 row for corresponding per-device totals
+ // add +1 column for per-level actual (taken from file size) total
+ typedef matrix_2d<uint64_t, BlueFS::MAX_BDEV + 1, LEVEL_MAX - LEVEL_FIRST + 1> per_level_per_dev_usage_t;
+
+ per_level_per_dev_usage_t per_level_per_dev_usage;
+
+ // Note: maximum per-device totals below might be smaller than corresponding
+ // perf counters by up to a single alloc unit (1M) due to superblock extent.
+  // The latter is not accounted for here.
+ per_level_per_dev_usage_t per_level_per_dev_max;
+
+ uint64_t l_totals[LEVEL_MAX - LEVEL_FIRST];
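+  // spare DB-device capacity that select_prefer_bdev() may hand out to
+  // SLOW-level data; stays zero under the original policy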
+ uint64_t db_avail4slow = 0;
+ enum {
+ OLD_POLICY,
+ USE_SOME_EXTRA
+ };
+
+public:
+ RocksDBBlueFSVolumeSelector(
+ uint64_t _wal_total,
+ uint64_t _db_total,
+ uint64_t _slow_total,
+ uint64_t _level0_size,
+ uint64_t _level_base,
+ uint64_t _level_multiplier,
+ double reserved_factor,
+ bool new_pol)
+ {
+ l_totals[LEVEL_WAL - LEVEL_FIRST] = _wal_total;
+ l_totals[LEVEL_DB - LEVEL_FIRST] = _db_total;
+ l_totals[LEVEL_SLOW - LEVEL_FIRST] = _slow_total;
+
+ if (!new_pol) {
+ return;
+ }
+
+ // Calculating how much extra space is available at DB volume.
+ // Which is equal to
+ // DB size - sum_max_level_size(0, L-1) - max_level_size(L) * reserved_factor
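+    // A hypothetical example (figures are illustrative only): with level0 = 1 GiB,
+    // level_base = 0.25 GiB, multiplier = 10 and reserved_factor = 2 the loop
+    // below yields thresholds 6.25 GiB and 53.75 GiB; for a 60 GiB DB volume it
+    // stops at the following threshold (528.75 GiB) and leaves
+    // db_avail4slow = 60 - 53.75 = 6.25 GiB for SLOW data.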
+ uint64_t prev_levels = _level0_size;
+ uint64_t cur_level = _level_base;
+ uint64_t cur_threshold = 0;
+ do {
+ uint64_t next_level = cur_level * _level_multiplier;
+ uint64_t next_threshold = prev_levels + cur_level + next_level * reserved_factor;
+ if (_db_total <= next_threshold) {
+ db_avail4slow = cur_threshold ? _db_total - cur_threshold : 0;
+ break;
+ } else {
+ prev_levels += cur_level;
+ cur_level = next_level;
+ cur_threshold = next_threshold;
+ }
+ } while (true);
+ }
+
+ void* get_hint_by_device(uint8_t dev) const override {
+ ceph_assert(dev == BlueFS::BDEV_WAL); // others aren't used atm
+ return reinterpret_cast<void*>(LEVEL_WAL);
+ }
+ void* get_hint_by_dir(const string& dirname) const override;
+
+ void add_usage(void* hint, const bluefs_fnode_t& fnode) override {
+ if (hint == nullptr)
+ return;
+ size_t pos = (size_t)hint - LEVEL_FIRST;
+ for (auto& p : fnode.extents) {
+ auto& cur = per_level_per_dev_usage.at(p.bdev, pos);
+ auto& max = per_level_per_dev_max.at(p.bdev, pos);
+ cur += p.length;
+ if (cur > max) {
+ max = cur;
+ }
+ {
+ //update per-device totals
+ auto& cur = per_level_per_dev_usage.at(p.bdev, LEVEL_MAX - LEVEL_FIRST);
+ auto& max = per_level_per_dev_max.at(p.bdev, LEVEL_MAX - LEVEL_FIRST);
+ cur += p.length;
+ if (cur > max) {
+ max = cur;
+ }
+ }
+ }
+ {
+ //update per-level actual totals
+ auto& cur = per_level_per_dev_usage.at(BlueFS::MAX_BDEV, pos);
+ auto& max = per_level_per_dev_max.at(BlueFS::MAX_BDEV, pos);
+ cur += fnode.size;
+ if (cur > max) {
+ max = cur;
+ }
+ }
+ }
+ void sub_usage(void* hint, const bluefs_fnode_t& fnode) override {
+ if (hint == nullptr)
+ return;
+ size_t pos = (size_t)hint - LEVEL_FIRST;
+ for (auto& p : fnode.extents) {
+ auto& cur = per_level_per_dev_usage.at(p.bdev, pos);
+ ceph_assert(cur >= p.length);
+ cur -= p.length;
+
+ //update per-device totals
+ auto& cur2 = per_level_per_dev_usage.at(p.bdev, LEVEL_MAX - LEVEL_FIRST);
+ ceph_assert(cur2 >= p.length);
+ cur2 -= p.length;
+ }
+ //update per-level actual totals
+ auto& cur = per_level_per_dev_usage.at(BlueFS::MAX_BDEV, pos);
+ ceph_assert(cur >= fnode.size);
+ cur -= fnode.size;
+ }
+ void add_usage(void* hint, uint64_t fsize) override {
+ if (hint == nullptr)
+ return;
+ size_t pos = (size_t)hint - LEVEL_FIRST;
+ //update per-level actual totals
+ auto& cur = per_level_per_dev_usage.at(BlueFS::MAX_BDEV, pos);
+ auto& max = per_level_per_dev_max.at(BlueFS::MAX_BDEV, pos);
+ cur += fsize;
+ if (cur > max) {
+ max = cur;
+ }
+ }
+ void sub_usage(void* hint, uint64_t fsize) override {
+ if (hint == nullptr)
+ return;
+ size_t pos = (size_t)hint - LEVEL_FIRST;
+ //update per-level actual totals
+ auto& cur = per_level_per_dev_usage.at(BlueFS::MAX_BDEV, pos);
+ ceph_assert(cur >= fsize);
+    cur -= fsize;
+ }
+
+ uint8_t select_prefer_bdev(void* h) override;
+ void get_paths(
+ const std::string& base,
+ BlueFSVolumeSelector::paths& res) const override;
+
+ void dump(CephContext* cct) override;
+};
+
#endif