git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
os/bluestore/bluefs: Code for volume selector check
author Adam Kupczyk <akupczyk@redhat.com>
Thu, 20 Jan 2022 12:44:35 +0000 (13:44 +0100)
committer Adam Kupczyk <akupczyk@redhat.com>
Wed, 9 Feb 2022 10:37:32 +0000 (11:37 +0100)
Adds ability to verify that volume selector properly tracks disk usage.
Creates options:
- bluefs_check_volume_selector_on_umount
- bluefs_check_volume_selector_often
that can be used to validate that vselector does not diverge from
values it should have.

Signed-off-by: Adam Kupczyk <akupczyk@redhat.com>
(cherry picked from commit d233e3b1d23c135f0ec8d808c0961ddce8526bc8)

src/common/options/global.yaml.in
src/os/bluestore/BlueFS.cc
src/os/bluestore/BlueFS.h
src/os/bluestore/BlueStore.cc
src/os/bluestore/BlueStore.h

index 9d4ab120ea63daff7967ca4ba2cb71efa68baa5a..c0e3b014d9a6495bf80ca70127a05963b2b9d27a 100644 (file)
@@ -4103,6 +4103,28 @@ options:
   flags:
   - runtime
   with_legacy: true
+- name: bluefs_check_volume_selector_on_umount  # read in BlueFS::umount() right after sync_metadata(); when true, _check_vselector_LNF() is run
+  type: bool
+  level: dev
+  desc: Check validity of volume selector on umount
+  long_desc: Checks if volume selector did not diverge from the state it should be in.
+    Reference is constructed from bluefs inode table. Asserts on inconsistency.
+  default: false  # the check is skipped unless explicitly enabled
+  flags:
+  - runtime
+  with_legacy: true
+- name: bluefs_check_volume_selector_often  # consulted at the start of BlueFS::flush_range, fsync, open_for_write and open_for_read
+  type: bool
+  level: dev
+  desc: Periodically check validity of volume selector
+  long_desc: Periodically checks if current volume selector does not diverge from the valid state.
+    Reference is constructed from bluefs inode table. Asserts on inconsistency. This is debug feature.
+  default: false  # the check is skipped unless explicitly enabled
+  see_also:
+  - bluefs_check_volume_selector_on_umount
+  flags:
+  - startup  # NOTE(review): BlueFS::_maybe_check_vselector_LNF() reads this value on every call; confirm startup (rather than runtime) is intended
+  with_legacy: true
 - name: bluestore_bluefs
   type: bool
   level: dev
index 5e52d81e9016b983cb3dd46c92fe8d7349d55eb8..e74cbc6308501a725c4d3ae8901e87db6bf9c877 100644 (file)
@@ -196,7 +196,6 @@ BlueFS::BlueFS(CephContext* cct)
   discard_cb[BDEV_DB] = db_discard_cb;
   discard_cb[BDEV_SLOW] = slow_discard_cb;
   asok_hook = SocketHook::create(this);
-
 }
 
 BlueFS::~BlueFS()
@@ -955,7 +954,9 @@ void BlueFS::umount(bool avoid_compact)
   dout(1) << __func__ << dendl;
 
   sync_metadata(avoid_compact);
-
+  if (cct->_conf->bluefs_check_volume_selector_on_umount) {
+    _check_vselector_LNF();
+  }
   _close_writer(log.writer);
   log.writer = NULL;
   log.t.clear();
@@ -3082,6 +3083,7 @@ int BlueFS::_signal_dirty_to_log_D(FileWriter *h)
 
 void BlueFS::flush_range(FileWriter *h, uint64_t offset, uint64_t length)/*_WF*/
 {
+  _maybe_check_vselector_LNF();  // optional debug check; called before h->lock is acquired because the check itself locks every file in nodes.file_map
   std::unique_lock hl(h->lock);  // writer lock is held for the duration of the flush
   _flush_range_F(h, offset, length);
 }
@@ -3386,17 +3388,18 @@ int BlueFS::truncate(FileWriter *h, uint64_t offset)/*_WF_L*/
   }
   ceph_assert(h->file->fnode.size >= offset);
   _flush_bdev(h);
+
+  std::lock_guard ll(log.lock);
   vselector->sub_usage(h->file->vselector_hint, h->file->fnode.size);
   h->file->fnode.size = offset;
   vselector->add_usage(h->file->vselector_hint, h->file->fnode.size);
-
-  std::lock_guard ll(log.lock);
   log.t.op_file_update_inc(h->file->fnode);
   return 0;
 }
 
 int BlueFS::fsync(FileWriter *h)/*_WF_WD_WLD_WLNF_WNF*/
 {
+  _maybe_check_vselector_LNF();
   std::unique_lock hl(h->lock);
   uint64_t old_dirty_seq = 0;
   {
@@ -3422,6 +3425,7 @@ int BlueFS::fsync(FileWriter *h)/*_WF_WD_WLD_WLNF_WNF*/
     _flush_and_sync_log_LD(old_dirty_seq);
   }
   _maybe_compact_log_LNF_NF_LD_D();
+
   return 0;
 }
 
@@ -3650,6 +3654,7 @@ int BlueFS::open_for_write(
   FileWriter **h,
   bool overwrite)/*_N_LD*/
 {
+  _maybe_check_vselector_LNF();
   FileRef file;
   bool create = false;
   bool truncate = false;
@@ -3804,6 +3809,7 @@ int BlueFS::open_for_read(
   FileReader **h,
   bool random)/*_N*/
 {
+  _maybe_check_vselector_LNF();
   std::lock_guard nl(nodes.lock);
   dout(10) << __func__ << " " << dirname << "/" << filename
           << (random ? " (random)":" (sequential)") << dendl;
@@ -4260,6 +4266,35 @@ int BlueFS::_do_replay_recovery_read(FileReader *log_reader,
   return 0;
 }
 
+void BlueFS::_check_vselector_LNF() {  // Sanity check: rebuilds expected usage from nodes.file_map into an empty selector and asserts that vselector matches it.
+  BlueFSVolumeSelector* vs = vselector->clone_empty();  // reference selector, filled below and freed with delete at the end of this function
+  if (!vs) {
+    return;  // no reference object available (the BlueFSVolumeSelector default clone_empty() returns nullptr), so nothing to compare
+  }
+  std::lock_guard ll(log.lock);  // lock order used here: log, then nodes, then each file
+  std::lock_guard nl(nodes.lock);
+  // Checking vselector is under log, nodes and file(s) locks,
+  // so any modification of vselector must be under at least one of those locks.
+  for (auto& f : nodes.file_map) {
+    f.second->lock.lock();  // taken manually and kept held across compare(); released in the loop after ceph_assert
+    vs->add_usage(f.second->vselector_hint, f.second->fnode);  // account this file in the reference selector
+  }
+  bool res = vselector->compare(vs);
+  if (!res) {
+    dout(0) << "Current:";  // on mismatch, log the live selector state...
+    vselector->dump(*_dout);
+    *_dout << dendl;
+    dout(0) << "Expected:";  // ...and the state rebuilt from the file map, before asserting
+    vs->dump(*_dout);
+    *_dout << dendl;
+  }
+  ceph_assert(res);  // divergence between vselector and the rebuilt reference is treated as fatal
+  for (auto& f : nodes.file_map) {
+    f.second->lock.unlock();  // release the per-file locks taken in the first loop
+  }
+  delete vs;
+}
+
 size_t BlueFS::probe_alloc_avail(int dev, uint64_t alloc_size)
 {
   size_t total = 0;
index c942327bf805230c3c5737011864f655d06458d0..20dfbb5c4cfc13e6f40ff4afed6475b1dcf5f9ef 100644 (file)
@@ -83,6 +83,10 @@ public:
   virtual uint8_t select_prefer_bdev(void* hint) = 0;
   virtual void get_paths(const std::string& base, paths& res) const = 0;
   virtual void dump(std::ostream& sout) = 0;
+
+  /* used for sanity checking of vselector */
+  virtual BlueFSVolumeSelector* clone_empty() const { return nullptr; }  // default: no reference selector; BlueFS::_check_vselector_LNF() returns early when it gets nullptr
+  virtual bool compare(BlueFSVolumeSelector* other) { return true; };  // default: always reports a match; `other` is ignored
 };
 
 struct bluefs_shared_alloc_context_t {
@@ -511,7 +515,11 @@ private:
   unsigned get_super_length() {
     return 4096;
   }
-
+  void _maybe_check_vselector_LNF() {  // runs the volume selector check only when bluefs_check_volume_selector_often is enabled
+    if (cct->_conf->bluefs_check_volume_selector_often) {  // option value is read on every call
+      _check_vselector_LNF();
+    }
+  }
 public:
   BlueFS(CephContext* cct);
   ~BlueFS();
@@ -661,6 +669,7 @@ private:
                               size_t read_offset,
                               size_t read_len,
                               bufferlist* bl);
+  void _check_vselector_LNF();
 };
 
 class OriginalVolumeSelector : public BlueFSVolumeSelector {
index 5f08051d7e2f5b33fa31dfe7aab78f51a534747f..6d67ca57c34e764a07b262d5b5858f108089f65b 100644 (file)
@@ -17945,6 +17945,29 @@ void RocksDBBlueFSVolumeSelector::dump(ostream& sout) {
   }
 }
 
+BlueFSVolumeSelector* RocksDBBlueFSVolumeSelector::clone_empty() const {  // returns a newly allocated selector built with all-zero/false constructor arguments
+  RocksDBBlueFSVolumeSelector* ns =
+    new RocksDBBlueFSVolumeSelector(0, 0, 0,  // NOTE(review): meaning of the individual constructor arguments is not visible here
+                                   0, 0, 0,
+                                   0, 0, false);
+  return ns;  // raw owning pointer: the caller must delete it (BlueFS::_check_vselector_LNF does)
+}
+
+bool RocksDBBlueFSVolumeSelector::compare(BlueFSVolumeSelector* other) {  // true iff every per_level_per_dev_usage cell and every per_level_files entry equals the one in `other`
+  RocksDBBlueFSVolumeSelector* o = dynamic_cast<RocksDBBlueFSVolumeSelector*>(other);
+  ceph_assert(o);  // `other` must be a RocksDBBlueFSVolumeSelector; a failed dynamic_cast yields nullptr
+  bool equal = true;
+  for (size_t x = 0; x < BlueFS::MAX_BDEV + 1; x++) {  // first index: MAX_BDEV + 1 entries
+    for (size_t y = 0; y <LEVEL_MAX - LEVEL_FIRST + 1; y++) {  // second index: LEVEL_MAX - LEVEL_FIRST + 1 entries
+      equal &= (per_level_per_dev_usage.at(x, y) == o->per_level_per_dev_usage.at(x, y));  // accumulates without early exit, so every cell is visited
+    }
+  }
+  for (size_t t = 0; t < LEVEL_MAX - LEVEL_FIRST + 1; t++) {
+    equal &= (per_level_files[t] == o->per_level_files[t]);  // per-level file counters must match as well
+  }
+  return equal;
+}
+
 // =======================================================
 
 //================================================================================================================
index f98efc8fbd5f1d9979b7b668fe0696b1f5de1419..13edcae5a681aba2d33fcb6d2e38cf7ad6545359 100644 (file)
@@ -4190,6 +4190,8 @@ public:
     BlueFSVolumeSelector::paths& res) const override;
 
   void dump(std::ostream& sout) override;
+  BlueFSVolumeSelector* clone_empty() const override;
+  bool compare(BlueFSVolumeSelector* other) override;
 };
 
 #endif