git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
os/bluestore/bluefs: Code for volume selector check
author: Adam Kupczyk <akupczyk@redhat.com>
Thu, 20 Jan 2022 12:44:35 +0000 (13:44 +0100)
committer: Igor Fedotov <igor.fedotov@croit.io>
Thu, 21 Sep 2023 18:42:52 +0000 (21:42 +0300)
Adds the ability to verify that the volume selector properly tracks disk usage.
Creates options:
- bluefs_check_volume_selector_on_umount
- bluefs_check_volume_selector_often
that can be used to validate that the vselector does not diverge from
the values it should have.

Signed-off-by: Adam Kupczyk <akupczyk@redhat.com>
(cherry picked from commit d233e3b1d23c135f0ec8d808c0961ddce8526bc8)

 Conflicts:
src/common/options/global.yaml.in
(new yaml config settings aren't present in Pacific)

src/common/legacy_config_opts.h
src/common/options.cc
src/os/bluestore/BlueFS.cc
src/os/bluestore/BlueFS.h
src/os/bluestore/BlueStore.cc
src/os/bluestore/BlueStore.h

index 07fc657444fc3fea2898ae5dfd048a2f6e531571..99d0aad95b214bf3c30663fd6c10507314dcef7f 100644 (file)
@@ -924,6 +924,8 @@ OPTION(bluefs_replay_recovery, OPT_BOOL)
 OPTION(bluefs_replay_recovery_disable_compact, OPT_BOOL)
 OPTION(bluefs_check_for_zeros, OPT_BOOL)
 
+OPTION(bluefs_check_volume_selector_on_umount, OPT_BOOL)
+OPTION(bluefs_check_volume_selector_often, OPT_BOOL)
 OPTION(bluestore_bluefs, OPT_BOOL)
 OPTION(bluestore_bluefs_env_mirror, OPT_BOOL) // mirror to normal Env for debug
 // how often (sec) to dump allocator on allocation failure
index 546908b55bba2393f2d7545114489574e22fc174..3dd1a7f73e88a8c5de9a042f659626d2dcd1d689 100644 (file)
@@ -4355,6 +4355,21 @@ std::vector<Option> get_global_options() {
                          "If this happens, we re-read data. If there is difference, we print error to log.")
     .add_see_also("bluestore_retry_disk_reads"),
 
+    Option("bluefs_check_volume_selector_on_umount", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false)
+    .set_flag(Option::FLAG_RUNTIME)
+    .set_description("Check validity of volume selector on umount")
+    .set_long_description("Checks if volume selector did not diverge from the state it should be in. "
+                          "Reference is constructed from bluefs inode table. Asserts on inconsistency."),
+    Option("bluefs_check_volume_selector_often", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false)
+    .set_flag(Option::FLAG_STARTUP)
+    .set_description("Periodically check validity of volume selector")
+    .set_long_description("Periodically checks if current volume selector does not diverge from the valid state. "
+                          "Reference is constructed from bluefs inode table. Asserts on inconsistency. "
+                          " This is debug feature.")
+    .add_see_also("bluefs_check_volume_selector_on_umount"),
+
     Option("bluestore_bluefs", Option::TYPE_BOOL, Option::LEVEL_DEV)
     .set_default(true)
     .set_flag(Option::FLAG_CREATE)
index 9596b7f5b112d0e0349af9729c67c1e935baf324..55abc0b5b0119c6c302bb292ab6d33cc6bd1043f 100644 (file)
@@ -198,7 +198,6 @@ BlueFS::BlueFS(CephContext* cct)
   discard_cb[BDEV_DB] = db_discard_cb;
   discard_cb[BDEV_SLOW] = slow_discard_cb;
   asok_hook = SocketHook::create(this);
-
 }
 
 BlueFS::~BlueFS()
@@ -860,7 +859,9 @@ void BlueFS::umount(bool avoid_compact)
   dout(1) << __func__ << dendl;
 
   sync_metadata(avoid_compact);
-
+  if (cct->_conf->bluefs_check_volume_selector_on_umount) {
+    _check_vselector_LNF();
+  }
   _close_writer(log.writer);
   log.writer = NULL;
   log.t.clear();
@@ -3210,6 +3211,7 @@ int BlueFS::_signal_dirty_to_log_D(FileWriter *h)
 
 void BlueFS::flush_range(FileWriter *h, uint64_t offset, uint64_t length)/*_WF*/
 {
+  _maybe_check_vselector_LNF();  // optional vselector sanity check; runs before h->lock is taken
   std::unique_lock hl(h->lock);  // hold the writer's lock for the duration of the flush
   _flush_range_F(h, offset, length);
 }
@@ -3516,17 +3518,18 @@ int BlueFS::truncate(FileWriter *h, uint64_t offset)/*_WF_L*/
   }
   ceph_assert(h->file->fnode.size >= offset);
   _flush_bdev(h);
+
+  std::lock_guard ll(log.lock);
   vselector->sub_usage(h->file->vselector_hint, h->file->fnode.size);
   h->file->fnode.size = offset;
   vselector->add_usage(h->file->vselector_hint, h->file->fnode.size);
-
-  std::lock_guard ll(log.lock);
   log.t.op_file_update_inc(h->file->fnode);
   return 0;
 }
 
 int BlueFS::fsync(FileWriter *h)/*_WF_WD_WLD_WLNF_WNF*/
 {
+  _maybe_check_vselector_LNF();
   std::unique_lock hl(h->lock);
   uint64_t old_dirty_seq = 0;
   {
@@ -3552,6 +3555,7 @@ int BlueFS::fsync(FileWriter *h)/*_WF_WD_WLD_WLNF_WNF*/
     _flush_and_sync_log_LD(old_dirty_seq);
   }
   _maybe_compact_log_LNF_NF_LD_D();
+
   return 0;
 }
 
@@ -3806,6 +3810,7 @@ int BlueFS::open_for_write(
   FileWriter **h,
   bool overwrite)/*_N_LD*/
 {
+  _maybe_check_vselector_LNF();
   FileRef file;
   bool create = false;
   bool truncate = false;
@@ -3960,6 +3965,7 @@ int BlueFS::open_for_read(
   FileReader **h,
   bool random)/*_N*/
 {
+  _maybe_check_vselector_LNF();
   std::lock_guard nl(nodes.lock);
   dout(10) << __func__ << " " << dirname << "/" << filename
           << (random ? " (random)":" (sequential)") << dendl;
@@ -4414,6 +4420,35 @@ int BlueFS::_do_replay_recovery_read(FileReader *log_reader,
   return 0;
 }
 
+void BlueFS::_check_vselector_LNF() {  // Rebuilds usage from nodes.file_map into an empty clone and asserts that vselector matches it.
+  BlueFSVolumeSelector* vs = vselector->clone_empty();
+  if (!vs) {  // clone_empty() returned null: there is no reference to compare against, so skip the check
+    return;
+  }
+  std::lock_guard ll(log.lock);  // log.lock is acquired before nodes.lock
+  std::lock_guard nl(nodes.lock);
+  // The check runs while holding log.lock, nodes.lock and the lock of every file,
+  // so any code that modifies vselector must hold at least one of those locks.
+  for (auto& f : nodes.file_map) {
+    f.second->lock.lock();  // stays locked until the unlock loop at the end of this function
+    vs->add_usage(f.second->vselector_hint, f.second->fnode);  // account this file's fnode in the reference selector
+  }
+  bool res = vselector->compare(vs);
+  if (!res) {  // mismatch: log both the live and the recomputed state before asserting
+    dout(0) << "Current:";
+    vselector->dump(*_dout);
+    *_dout << dendl;
+    dout(0) << "Expected:";
+    vs->dump(*_dout);
+    *_dout << dendl;
+  }
+  ceph_assert(res);  // evaluated while all locks are still held
+  for (auto& f : nodes.file_map) {
+    f.second->lock.unlock();
+  }
+  delete vs;  // release the reference selector obtained from clone_empty()
+}
+
 size_t BlueFS::probe_alloc_avail(int dev, uint64_t alloc_size)
 {
   size_t total = 0;
index ec1b4c8ba1382208c788d8ae38c67e42b97fc7af..3d5e20ca4cc488b0aded05026e0b9ace0fc8a2d2 100644 (file)
@@ -75,6 +75,10 @@ public:
   virtual uint8_t select_prefer_bdev(void* hint) = 0;
   virtual void get_paths(const std::string& base, paths& res) const = 0;
   virtual void dump(std::ostream& sout) = 0;
+
+  /* used for sanity checking of vselector */
+  virtual BlueFSVolumeSelector* clone_empty() const { return nullptr; }  // default: no clone available; returns nullptr
+  virtual bool compare(BlueFSVolumeSelector* other) { return true; };  // default: ignores other and always reports a match
 };
 
 struct bluefs_shared_alloc_context_t {
@@ -512,7 +516,11 @@ private:
   unsigned get_super_length() {  // returns a fixed length; NOTE(review): presumably the superblock size in bytes — confirm
     return 4096;
   }
-
+  void _maybe_check_vselector_LNF() {  // runs _check_vselector_LNF() only when bluefs_check_volume_selector_often is set
+    if (cct->_conf->bluefs_check_volume_selector_often) {  // config value is read on every call
+      _check_vselector_LNF();
+    }
+  }
 public:
   BlueFS(CephContext* cct);
   ~BlueFS();
@@ -658,6 +666,7 @@ private:
                               size_t read_offset,
                               size_t read_len,
                               bufferlist* bl);
+  void _check_vselector_LNF();
 };
 
 class OriginalVolumeSelector : public BlueFSVolumeSelector {
index 8040fe0ac9a7370e02d481a7571b82b23902e049..bad7a0bb2fd286fb849efa534f38859164e2124d 100644 (file)
@@ -16942,5 +16942,28 @@ void RocksDBBlueFSVolumeSelector::dump(ostream& sout) {
   sout << std::endl;
 }
 
+BlueFSVolumeSelector* RocksDBBlueFSVolumeSelector::clone_empty() const {  // returns a freshly allocated selector built with all-zero/false constructor arguments
+  RocksDBBlueFSVolumeSelector* ns =
+    new RocksDBBlueFSVolumeSelector(0, 0, 0,
+                                   0, 0, 0,
+                                   0, 0, false);
+  return ns;  // allocated with new; BlueFS::_check_vselector_LNF deletes the pointer it receives
+}
+
+bool RocksDBBlueFSVolumeSelector::compare(BlueFSVolumeSelector* other) {  // true when all compared counters equal those of other
+  RocksDBBlueFSVolumeSelector* o = dynamic_cast<RocksDBBlueFSVolumeSelector*>(other);
+  ceph_assert(o);  // other must be a non-null RocksDBBlueFSVolumeSelector
+  bool equal = true;
+  for (size_t x = 0; x < BlueFS::MAX_BDEV + 1; x++) {  // every cell of per_level_per_dev_usage
+    for (size_t y = 0; y <LEVEL_MAX - LEVEL_FIRST + 1; y++) {
+      equal &= (per_level_per_dev_usage.at(x, y) == o->per_level_per_dev_usage.at(x, y));  // accumulates; does not stop at the first mismatch
+    }
+  }
+  for (size_t t = 0; t < LEVEL_MAX - LEVEL_FIRST + 1; t++) {  // every entry of per_level_files
+    equal &= (per_level_files[t] == o->per_level_files[t]);
+  }
+  return equal;
+}
+
 // =======================================================
 // =======================================================
index 50ad3a85edc9b544c5a302751b01759bde7dfb9d..023a6b9d52d581655c5a9135959010ef77031a86 100644 (file)
@@ -3912,6 +3912,8 @@ public:
     BlueFSVolumeSelector::paths& res) const override;
 
   void dump(std::ostream& sout) override;
+  BlueFSVolumeSelector* clone_empty() const override;
+  bool compare(BlueFSVolumeSelector* other) override;
 };
 
 #endif