]> git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
os/bluestore: implement more intelligent DB volume space management for BlueFS.
author Igor Fedotov <ifedotov@suse.com>
Mon, 8 Jul 2019 14:40:20 +0000 (17:40 +0300)
committer Igor Fedotov <ifedotov@suse.com>
Tue, 26 Nov 2019 18:28:34 +0000 (21:28 +0300)
It allows excessive space usage for higher DB levels.

Signed-off-by: Igor Fedotov <ifedotov@suse.com>
src/common/legacy_config_opts.h
src/common/options.cc
src/os/bluestore/BlueFS.cc
src/os/bluestore/BlueFS.h
src/os/bluestore/BlueRocksEnv.h
src/os/bluestore/BlueStore.cc
src/os/bluestore/BlueStore.h

index 6dfeddc8f527d6234abcec379af622000cc99a6b..bb0e5b87987158dee8fbb6cc9a1a9da5d1687ea7 100644 (file)
@@ -1057,6 +1057,8 @@ OPTION(bluestore_log_op_age, OPT_DOUBLE)
 OPTION(bluestore_log_omap_iterator_age, OPT_DOUBLE)
 OPTION(bluestore_log_collection_list_age, OPT_DOUBLE)
 OPTION(bluestore_debug_enforce_settings, OPT_STR)
+OPTION(bluestore_volume_selection_policy, OPT_STR)
+OPTION(bluestore_volume_selection_reserved_factor, OPT_DOUBLE)
 
 OPTION(kstore_max_ops, OPT_U64)
 OPTION(kstore_max_bytes, OPT_U64)
index 957923df580bd6066425116d0d47d1abdda07b30..24f33a4832d1020b4d70525ef2b79c4eacf5deec 100644 (file)
@@ -4584,6 +4584,17 @@ std::vector<Option> get_global_options() {
     .set_default(4)
     .set_description(""),
 
+    Option("bluestore_volume_selection_policy", Option::TYPE_STR, Option::LEVEL_DEV)
+    .set_default("rocksdb_original")
+    .set_enum_allowed({ "rocksdb_original", "use_some_extra" })
+    .set_description("Determines bluefs volume selection policy")
+    .set_long_description("Determines bluefs volume selection policy. 'use_some_extra' policy allows to override RocksDB level granularity and put high level's data to faster device even when the level doesn't completely fit there"),
+
+    Option("bluestore_volume_selection_reserved_factor", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+      .set_flag(Option::FLAG_STARTUP)
+      .set_default(2.0)
+      .set_description("DB level size multiplier. Determines amount of space at DB device to bar from the usage when 'use some extra' policy is in action. Reserved size is determined as sum(L_max_size[0], L_max_size[L-1]) + L_max_size[L] * this_factor"),
+
     // -----------------------------------------
     // kstore
 
index a8dee6fccc7603d0d9d9f277b6d1cbbb07ecfacd..a39abd86eff72f8862beb5a7de50e7a6e6a2de10 100644 (file)
@@ -121,8 +121,6 @@ BlueFS::BlueFS(CephContext* cct)
   discard_cb[BDEV_DB] = db_discard_cb;
   discard_cb[BDEV_SLOW] = slow_discard_cb;
   asok_hook = SocketHook::create(this);
-  // set default volume selector
-  vselector.reset(new OriginalVolumeSelector(this));
 }
 
 BlueFS::~BlueFS()
@@ -452,6 +450,15 @@ int BlueFS::mkfs(uuid_d osd_uuid, const bluefs_layout_t& layout)
          << " osd_uuid " << osd_uuid
          << dendl;
 
+  // set volume selector if not provided before/outside
+  if (vselector == nullptr) {
+    vselector.reset(
+      new OriginalVolumeSelector(
+        get_block_device_size(BlueFS::BDEV_WAL) * 95 / 100,
+        get_block_device_size(BlueFS::BDEV_DB) * 95 / 100,
+        get_block_device_size(BlueFS::BDEV_SLOW) * 95 / 100));
+  }
+
   _init_alloc();
   _init_logger();
 
@@ -499,6 +506,7 @@ int BlueFS::mkfs(uuid_d osd_uuid, const bluefs_layout_t& layout)
   _close_writer(log_writer);
   log_writer = NULL;
   block_all.clear();
+  vselector.reset(nullptr);
   _stop_alloc();
   _shutdown_logger();
 
@@ -582,6 +590,15 @@ int BlueFS::mount()
     goto out;
   }
 
+  // set volume selector if not provided before/outside
+  if (vselector == nullptr) {
+    vselector.reset(
+      new OriginalVolumeSelector(
+        get_block_device_size(BlueFS::BDEV_WAL) * 95 / 100,
+        get_block_device_size(BlueFS::BDEV_DB) * 95 / 100,
+        get_block_device_size(BlueFS::BDEV_SLOW) * 95 / 100));
+  }
+
   block_all.clear();
   block_all.resize(MAX_BDEV);
   _init_alloc();
@@ -644,6 +661,7 @@ void BlueFS::umount()
   _close_writer(log_writer);
   log_writer = NULL;
 
+  vselector.reset(nullptr);
   _stop_alloc();
   file_map.clear();
   dir_map.clear();
@@ -3473,17 +3491,16 @@ uint8_t OriginalVolumeSelector::select_prefer_bdev(void* hint)
 
 void OriginalVolumeSelector::get_paths(const std::string& base, paths& res) const
 {
-  // we have both block.db and block; tell rocksdb!
-  // note: the second (last) size value doesn't really matter
-  uint64_t db_size = bluefs->get_block_device_size(BlueFS::BDEV_DB);
-  uint64_t slow_size = bluefs->get_block_device_size(BlueFS::BDEV_SLOW);
-  res.emplace_back(base, (uint64_t)(db_size * 95 / 100));
-  res.emplace_back(base + ".slow", (uint64_t)(slow_size * 95 / 100));
+  res.emplace_back(base, db_total);
+  res.emplace_back(base + ".slow", slow_total);
 }
 
 #undef dout_prefix
 #define dout_prefix *_dout << "OriginalVolumeSelector: "
 
 void OriginalVolumeSelector::dump(CephContext* c) {
-  ldout(c, 1) << "OriginalVolumeSelector" << dendl;
+  ldout(c, 1) << "wal_total:" << wal_total
+    << ", db_total:" << db_total
+    << ", slow_total:" << slow_total
+    << dendl;
 }
index 7ad3eae316d490c0e9dbc0d87c35e39a319c7d26..9d8b8e48f2bc7b1b5182af25e179796fd51ec33c 100644 (file)
@@ -93,31 +93,6 @@ public:
   virtual void dump(CephContext* cct) = 0;
 };
 class BlueFS;
-class OriginalVolumeSelector : public BlueFSVolumeSelector {
-  BlueFS* bluefs = nullptr;
-public:
-  OriginalVolumeSelector(BlueFS* _bluefs) : bluefs(_bluefs) {}
-
-  void* get_hint_by_device(uint8_t dev) const override;
-  void* get_hint_by_dir(const string& dirname) const override;
-
-  void add_usage(void* file_hint, const bluefs_fnode_t& fnode) override {
-    // do nothing
-  }
-  void sub_usage(void* file_hint, const bluefs_fnode_t& fnode) override {
-    // do nothing
-  }
-  void add_usage(void* file_hint, uint64_t fsize) override {
-    // do nothing
-  }
-  void sub_usage(void* file_hint, uint64_t fsize) override {
-    // do nothing
-  }
-  uint8_t select_prefer_bdev(void* hint) override;
-  void get_paths(const std::string& base, paths& res) const override;
-  void dump(CephContext* cct) override;
-
-};
 
 class BlueFS {
 public:
@@ -609,4 +584,41 @@ public:
   void debug_inject_duplicate_gift(unsigned bdev, uint64_t offset, uint64_t len);
 };
 
+class OriginalVolumeSelector : public BlueFSVolumeSelector {
+  uint64_t wal_total;
+  uint64_t db_total;
+  uint64_t slow_total;
+
+public:
+  OriginalVolumeSelector(
+    uint64_t _wal_total,
+    uint64_t _db_total,
+    uint64_t _slow_total)
+    : wal_total(_wal_total), db_total(_db_total), slow_total(_slow_total) {}
+
+  void* get_hint_by_device(uint8_t dev) const override;
+  void* get_hint_by_dir(const string& dirname) const override;
+
+  void add_usage(void* hint, const bluefs_fnode_t& fnode) override {
+    // do nothing
+    return;
+  }
+  void sub_usage(void* hint, const bluefs_fnode_t& fnode) override {
+    // do nothing
+    return;
+  }
+  void add_usage(void* hint, uint64_t fsize) override {
+    // do nothing
+    return;
+  }
+  void sub_usage(void* hint, uint64_t fsize) override {
+    // do nothing
+    return;
+  }
+
+  uint8_t select_prefer_bdev(void* hint) override;
+  void get_paths(const std::string& base, paths& res) const override;
+  void dump(CephContext* cct) override;
+};
+
 #endif
index 486d16432062964ef4721eae83430aeda7e99157..82cffcd809bdd27d7313c38b3a6c715cdc576ec4 100644 (file)
@@ -6,10 +6,12 @@
 #include <memory>
 #include <string>
 
+#include "rocksdb/options.h"
 #include "rocksdb/status.h"
 #include "rocksdb/utilities/env_mirror.h"
 
 #include "include/ceph_assert.h"
+#include "kv/RocksDBStore.h"
 
 class BlueFS;
 
index 935119d9521f9066cdbb5340906a1abd62457301..e146cf600f8e531478f65f4514db8bfefd83a9cc 100644 (file)
@@ -19,6 +19,7 @@
 #include <fcntl.h>
 
 #include <boost/container/flat_set.hpp>
+#include "boost/algorithm/string.hpp"
 
 #include "include/cpp-btree/btree_set.h"
 
@@ -5393,9 +5394,37 @@ int BlueStore::_open_bluefs(bool create)
   if (r < 0) {
     return r;
   }
+  RocksDBBlueFSVolumeSelector* vselector = nullptr;
+  if (bluefs_layout.shared_bdev == BlueFS::BDEV_SLOW) {
+
+    string options = cct->_conf->bluestore_rocksdb_options;
+
+    rocksdb::Options rocks_opts;
+    int r = RocksDBStore::ParseOptionsFromStringStatic(
+      cct,
+      options,
+      rocks_opts,
+      nullptr);
+    if (r < 0) {
+      return r;
+    }
+
+    double reserved_factor = cct->_conf->bluestore_volume_selection_reserved_factor;
+    vselector =
+      new RocksDBBlueFSVolumeSelector(
+        bluefs->get_block_device_size(BlueFS::BDEV_WAL) * 95 / 100,
+        bluefs->get_block_device_size(BlueFS::BDEV_DB) * 95 / 100,
+        bluefs->get_block_device_size(BlueFS::BDEV_SLOW) * 95 / 100,
+        1024 * 1024 * 1024, //FIXME: set expected l0 size here
+        rocks_opts.max_bytes_for_level_base,
+        rocks_opts.max_bytes_for_level_multiplier,
+        reserved_factor,
+        cct->_conf->bluestore_volume_selection_policy != "rocksdb_original");
+  }
   if (create) {
     bluefs->mkfs(fsid, bluefs_layout);
   }
+  bluefs->set_volume_selector(vselector);
   r = bluefs->mount();
   if (r < 0) {
     derr << __func__ << " failed bluefs mount: " << cpp_strerror(r) << dendl;
@@ -5608,44 +5637,52 @@ int BlueStore::_open_db(bool create, bool to_repair_db, bool read_only)
     if (r < 0) {
       return r;
     }
-    bluefs->set_slow_device_expander(this);
 
     if (cct->_conf->bluestore_bluefs_env_mirror) {
-      rocksdb::Env *a = new BlueRocksEnv(bluefs);
-      rocksdb::Env *b = rocksdb::Env::Default();
+      rocksdb::Env* a = new BlueRocksEnv(bluefs);
+      rocksdb::Env* b = rocksdb::Env::Default();
       if (create) {
-       string cmd = "rm -rf " + path + "/db " +
-         path + "/db.slow " +
-         path + "/db.wal";
-       int r = system(cmd.c_str());
-       (void)r;
+        string cmd = "rm -rf " + path + "/db " +
+          path + "/db.slow " +
+          path + "/db.wal";
+        int r = system(cmd.c_str());
+        (void)r;
       }
       env = new rocksdb::EnvMirror(b, a, false, true);
-    } else {
+    }
+    else {
       env = new BlueRocksEnv(bluefs);
 
       // simplify the dir names, too, as "seen" by rocksdb
       fn = "db";
     }
+    bluefs->set_slow_device_expander(this);
+    BlueFSVolumeSelector::paths paths;
+    bluefs->get_vselector_paths(fn, paths);
 
     if (bluefs_layout.shared_bdev == BlueFS::BDEV_SLOW) {
       // we have both block.db and block; tell rocksdb!
       // note: the second (last) size value doesn't really matter
       ostringstream db_paths;
-      uint64_t db_size = bluefs->get_block_device_size(BlueFS::BDEV_DB);
-      uint64_t slow_size = bluefs->get_block_device_size(BlueFS::BDEV_SLOW);
-      db_paths << fn << ","
-               << (uint64_t)(db_size * 95 / 100) << " "
-               << fn + ".slow" << ","
-               << (uint64_t)(slow_size * 95 / 100);
+      bool first = true;
+      for (auto& p : paths) {
+        if (!first) {
+          db_paths << " ";
+        }
+        first = false;
+        db_paths << p.first << "," << p.second;
+
+      }
       kv_options["db_paths"] = db_paths.str();
-      dout(10) << __func__ << " set db_paths to " << db_paths.str() << dendl;
+      dout(1) << __func__ << " set db_paths to " << db_paths.str() << dendl;
     }
 
     if (create) {
-      env->CreateDir(fn);
+      for (auto& p : paths) {
+        env->CreateDir(p.first);
+      }
+      // Selectors don't provide wal path so far hence create explicitly
       env->CreateDir(fn + ".wal");
-      env->CreateDir(fn + ".slow");
     } else {
       std::vector<std::string> res;
       // check for dir presence
@@ -15496,4 +15533,124 @@ unsigned BlueStoreRepairer::apply(KeyValueDB* db)
 }
 
 // =======================================================
+// RocksDBBlueFSVolumeSelector
+
+uint8_t RocksDBBlueFSVolumeSelector::select_prefer_bdev(void* h) {
+  ceph_assert(h != nullptr);
+  uint64_t hint = reinterpret_cast<uint64_t>(h);
+  uint8_t res;
+  switch (hint) {
+  case LEVEL_SLOW:
+    res = BlueFS::BDEV_SLOW;
+    if (db_avail4slow > 0) {
+      // considering statically available db space vs.
+      // - observed maximums on DB dev for DB/WAL/UNSORTED data
+      // - observed maximum spillovers
+      uint64_t max_db_use = 0; // max db usage we potentially observed
+      max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_DB, LEVEL_WAL - LEVEL_FIRST);
+      max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_DB, LEVEL_DB - LEVEL_FIRST);
+      // this could go to db hence using it in the estimation
+      max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_SLOW, LEVEL_DB - LEVEL_FIRST);
+
+      auto db_total = l_totals[LEVEL_DB - LEVEL_FIRST];
+      uint64_t avail = min(
+        db_avail4slow,
+        max_db_use < db_total ? db_total - max_db_use : 0);
+
+      // considering current DB dev usage for SLOW data
+      if (avail > per_level_per_dev_usage.at(BlueFS::BDEV_DB, LEVEL_SLOW - LEVEL_FIRST)) {
+        res = BlueFS::BDEV_DB;
+      }
+    }
+    break;
+  case LEVEL_WAL:
+    res = BlueFS::BDEV_WAL;
+    break;
+  case LEVEL_DB:
+  default:
+    res = BlueFS::BDEV_DB;
+    break;
+  }
+  return res;
+}
 
+void RocksDBBlueFSVolumeSelector::get_paths(const std::string& base, paths& res) const
+{
+  res.emplace_back(base, l_totals[LEVEL_DB - LEVEL_FIRST]);
+  res.emplace_back(base + ".slow", l_totals[LEVEL_SLOW - LEVEL_FIRST]);
+}
+
+void* RocksDBBlueFSVolumeSelector::get_hint_by_dir(const string& dirname) const {
+  uint8_t res = LEVEL_DB;
+  if (dirname.length() > 5) {
+    // the "db.slow" and "db.wal" directory names are hard-coded to
+    // match up with bluestore.  the slow device is always the second
+    // one (when a dedicated block.db device is present and used at
+    // bdev 0).  the wal device is always last.
+    if (boost::algorithm::ends_with(dirname, ".slow")) {
+      res = LEVEL_SLOW;
+    }
+    else if (boost::algorithm::ends_with(dirname, ".wal")) {
+      res = LEVEL_WAL;
+    }
+  }
+  return reinterpret_cast<void*>(res);
+}
+
+#undef dout_prefix
+#define dout_prefix *_dout << "RocksDBBlueFSVolumeSelector: "
+
+void RocksDBBlueFSVolumeSelector::dump(CephContext* c) {
+  stringstream matrix_output;
+  auto max_x = per_level_per_dev_usage.get_max_x();
+  auto max_y = per_level_per_dev_usage.get_max_y();
+  matrix_output << "LEVEL, WAL, DB, SLOW, ****, ****, REAL" << std::endl;
+  for (size_t l = 0; l < max_y; l++) {
+    switch (l + LEVEL_FIRST) {
+    case LEVEL_WAL:
+      matrix_output << "WAL "; break;
+    case LEVEL_DB:
+      matrix_output << "DB "; break;
+    case LEVEL_SLOW:
+      matrix_output << "SLOW" << " "; break;
+    case LEVEL_MAX:
+      matrix_output << "TOTALS "; break;
+    }
+    for (size_t d = 0; d < max_x - 1; d++) {
+      matrix_output << per_level_per_dev_usage.at(d, l) << ",";
+    }
+    matrix_output << per_level_per_dev_usage.at(max_x - 1, l) << std::endl;
+  }
+  ceph_assert(max_x == per_level_per_dev_max.get_max_x());
+  ceph_assert(max_y == per_level_per_dev_max.get_max_y());
+  matrix_output << "MAXIMUMS:" << std::endl;
+  for (size_t l = 0; l < max_y; l++) {
+    switch (l + LEVEL_FIRST) {
+    case LEVEL_WAL:
+      matrix_output << "WAL "; break;
+    case LEVEL_DB:
+      matrix_output << "DB "; break;
+    case LEVEL_SLOW:
+      matrix_output << "SLOW" << " "; break;
+    case LEVEL_MAX:
+      matrix_output << "TOTALS "; break;
+    }
+    for (size_t d = 0; d < max_x - 1; d++) {
+      matrix_output << per_level_per_dev_max.at(d, l) << ",";
+    }
+    matrix_output << per_level_per_dev_max.at(max_x - 1, l);
+    if (l < max_y - 1) {
+      matrix_output << std::endl;
+    }
+  }
+  ldout(c, 1)
+    << "wal_total:" << l_totals[LEVEL_WAL - LEVEL_FIRST]
+    << ", db_total:" << l_totals[LEVEL_DB - LEVEL_FIRST]
+    << ", slow_total:" << l_totals[LEVEL_SLOW - LEVEL_FIRST]
+    << ", db_avail:" << db_avail4slow
+    << " usage matrix:" << std::endl
+    << matrix_output.str()
+    << dendl;
+}
+
+// =======================================================
index 56a0740f74633a102ecfe72afc142f57c77c8583..3652f5e5a8a4dfd8357f407f6b62528356b82aa9 100644 (file)
@@ -3511,4 +3511,182 @@ private:
   fsck_interval misreferenced_extents;
 
 };
+
+class RocksDBBlueFSVolumeSelector : public BlueFSVolumeSelector
+{
+  template <class T, size_t MaxX, size_t MaxY>
+  class matrix_2d {
+    T values[MaxX][MaxY];
+  public:
+    matrix_2d() {
+      clear();
+    }
+    T& at(size_t x, size_t y) {
+      ceph_assert(x < MaxX);
+      ceph_assert(y < MaxY);
+
+      return values[x][y];
+    }
+    size_t get_max_x() const {
+      return MaxX;
+    }
+    size_t get_max_y() const {
+      return MaxY;
+    }
+    void clear() {
+      memset(values, 0, sizeof(values));
+    }
+  };
+
+  enum {
+    // use 0/nullptr as unset indication
+    LEVEL_FIRST = 1,
+    LEVEL_WAL = LEVEL_FIRST,
+    LEVEL_DB,
+    LEVEL_SLOW,
+    LEVEL_MAX
+  };
+  // add +1 row for corresponding per-device totals
+  // add +1 column for per-level actual (taken from file size) total
+  typedef matrix_2d<uint64_t, BlueFS::MAX_BDEV + 1, LEVEL_MAX - LEVEL_FIRST + 1> per_level_per_dev_usage_t;
+
+  per_level_per_dev_usage_t per_level_per_dev_usage;
+
+  // Note: maximum per-device totals below might be smaller than corresponding
+  // perf counters by up to a single alloc unit (1M) due to superblock extent.
+  // The latter is not accounted for here.
+  per_level_per_dev_usage_t per_level_per_dev_max;
+
+  uint64_t l_totals[LEVEL_MAX - LEVEL_FIRST];
+  uint64_t db_avail4slow = 0;
+  enum {
+    OLD_POLICY,
+    USE_SOME_EXTRA
+  };
+
+public:
+  RocksDBBlueFSVolumeSelector(
+    uint64_t _wal_total,
+    uint64_t _db_total,
+    uint64_t _slow_total,
+    uint64_t _level0_size,
+    uint64_t _level_base,
+    uint64_t _level_multiplier,
+    double reserved_factor,
+    bool new_pol)
+  {
+    l_totals[LEVEL_WAL - LEVEL_FIRST] = _wal_total;
+    l_totals[LEVEL_DB - LEVEL_FIRST] = _db_total;
+    l_totals[LEVEL_SLOW - LEVEL_FIRST] = _slow_total;
+
+    if (!new_pol) {
+      return;
+    }
+
+    // Calculating how much extra space is available at DB volume.
+    // Which is equal to
+    // DB size - sum_max_level_size(0, L-1) - max_level_size(L) * reserved_factor
+    uint64_t prev_levels = _level0_size;
+    uint64_t cur_level = _level_base;
+    uint64_t cur_threshold = 0;
+    do {
+      uint64_t next_level = cur_level * _level_multiplier;
+      uint64_t next_threshold = prev_levels + cur_level + next_level * reserved_factor;
+      if (_db_total <= next_threshold) {
+        db_avail4slow = cur_threshold ? _db_total - cur_threshold : 0;
+        break;
+      } else {
+        prev_levels += cur_level;
+        cur_level = next_level;
+        cur_threshold = next_threshold;
+      }
+    } while (true);
+  }
+
+  void* get_hint_by_device(uint8_t dev) const override {
+    ceph_assert(dev == BlueFS::BDEV_WAL); // others aren't used atm
+    return  reinterpret_cast<void*>(LEVEL_WAL);
+  }
+  void* get_hint_by_dir(const string& dirname) const override;
+
+  void add_usage(void* hint, const bluefs_fnode_t& fnode) override {
+    if (hint == nullptr)
+      return;
+    size_t pos = (size_t)hint - LEVEL_FIRST;
+    for (auto& p : fnode.extents) {
+      auto& cur = per_level_per_dev_usage.at(p.bdev, pos);
+      auto& max = per_level_per_dev_max.at(p.bdev, pos);
+      cur += p.length;
+      if (cur > max) {
+        max = cur;
+      }
+      {
+        //update per-device totals
+        auto& cur = per_level_per_dev_usage.at(p.bdev, LEVEL_MAX - LEVEL_FIRST);
+        auto& max = per_level_per_dev_max.at(p.bdev, LEVEL_MAX - LEVEL_FIRST);
+        cur += p.length;
+        if (cur > max) {
+          max = cur;
+        }
+      }
+    }
+    {
+      //update per-level actual totals
+      auto& cur = per_level_per_dev_usage.at(BlueFS::MAX_BDEV, pos);
+      auto& max = per_level_per_dev_max.at(BlueFS::MAX_BDEV, pos);
+      cur += fnode.size;
+      if (cur > max) {
+        max = cur;
+      }
+    }
+  }
+  void sub_usage(void* hint, const bluefs_fnode_t& fnode) override {
+    if (hint == nullptr)
+      return;
+    size_t pos = (size_t)hint - LEVEL_FIRST;
+    for (auto& p : fnode.extents) {
+      auto& cur = per_level_per_dev_usage.at(p.bdev, pos);
+      ceph_assert(cur >= p.length);
+      cur -= p.length;
+
+      //update per-device totals
+      auto& cur2 = per_level_per_dev_usage.at(p.bdev, LEVEL_MAX - LEVEL_FIRST);
+      ceph_assert(cur2 >= p.length);
+      cur2 -= p.length;
+    }
+    //update per-level actual totals
+    auto& cur = per_level_per_dev_usage.at(BlueFS::MAX_BDEV, pos);
+    ceph_assert(cur >= fnode.size);
+    cur -= fnode.size;
+  }
+  void add_usage(void* hint, uint64_t fsize) override {
+    if (hint == nullptr)
+      return;
+    size_t pos = (size_t)hint - LEVEL_FIRST;
+    //update per-level actual totals
+    auto& cur = per_level_per_dev_usage.at(BlueFS::MAX_BDEV, pos);
+    auto& max = per_level_per_dev_max.at(BlueFS::MAX_BDEV, pos);
+    cur += fsize;
+    if (cur > max) {
+      max = cur;
+    }
+  }
+  void sub_usage(void* hint, uint64_t fsize) override {
+    if (hint == nullptr)
+      return;
+    size_t pos = (size_t)hint - LEVEL_FIRST;
+    //update per-level actual totals
+    auto& cur = per_level_per_dev_usage.at(BlueFS::MAX_BDEV, pos);
+    ceph_assert(cur >= fsize);
+    per_level_per_dev_usage.at(BlueFS::MAX_BDEV, pos) -= fsize;
+  }
+
+  uint8_t select_prefer_bdev(void* h) override;
+  void get_paths(
+    const std::string& base,
+    BlueFSVolumeSelector::paths& res) const override;
+
+  void dump(CephContext* cct) override;
+};
+
 #endif