From: Adam Kupczyk Date: Thu, 20 Jan 2022 12:44:35 +0000 (+0100) Subject: os/bluestore/bluefs: Code for volume selector check X-Git-Tag: v17.1.0~14^2~1 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=2c3472db540545887536825285a08710e2e5212b;p=ceph.git os/bluestore/bluefs: Code for volume selector check Adds ability to verify that volume selector properly tracks disk usage. Creates options: - bluefs_check_volume_selector_on_umount - bluefs_check_volume_selector_often that can be used to validate that vselector does not diverge from values it should have. Signed-off-by: Adam Kupczyk (cherry picked from commit d233e3b1d23c135f0ec8d808c0961ddce8526bc8) --- diff --git a/src/common/options/global.yaml.in b/src/common/options/global.yaml.in index 9d4ab120ea63..c0e3b014d9a6 100644 --- a/src/common/options/global.yaml.in +++ b/src/common/options/global.yaml.in @@ -4103,6 +4103,28 @@ options: flags: - runtime with_legacy: true +- name: bluefs_check_volume_selector_on_umount + type: bool + level: dev + desc: Check validity of volume selector on umount + long_desc: Checks if volume selector did not diverge from the state it should be in. + Reference is constructed from bluefs inode table. Asserts on inconsistency. + default: false + flags: + - runtime + with_legacy: true +- name: bluefs_check_volume_selector_often + type: bool + level: dev + desc: Periodically check validity of volume selector + long_desc: Periodically checks if current volume selector does not diverge from the valid state. + Reference is constructed from bluefs inode table. Asserts on inconsistency. This is debug feature. 
+ default: false + see_also: + - bluefs_check_volume_selector_on_umount + flags: + - startup + with_legacy: true - name: bluestore_bluefs type: bool level: dev diff --git a/src/os/bluestore/BlueFS.cc b/src/os/bluestore/BlueFS.cc index 5e52d81e9016..e74cbc630850 100644 --- a/src/os/bluestore/BlueFS.cc +++ b/src/os/bluestore/BlueFS.cc @@ -196,7 +196,6 @@ BlueFS::BlueFS(CephContext* cct) discard_cb[BDEV_DB] = db_discard_cb; discard_cb[BDEV_SLOW] = slow_discard_cb; asok_hook = SocketHook::create(this); - } BlueFS::~BlueFS() @@ -955,7 +954,9 @@ void BlueFS::umount(bool avoid_compact) dout(1) << __func__ << dendl; sync_metadata(avoid_compact); - + if (cct->_conf->bluefs_check_volume_selector_on_umount) { + _check_vselector_LNF(); + } _close_writer(log.writer); log.writer = NULL; log.t.clear(); @@ -3082,6 +3083,7 @@ int BlueFS::_signal_dirty_to_log_D(FileWriter *h) void BlueFS::flush_range(FileWriter *h, uint64_t offset, uint64_t length)/*_WF*/ { + _maybe_check_vselector_LNF(); std::unique_lock hl(h->lock); _flush_range_F(h, offset, length); } @@ -3386,17 +3388,18 @@ int BlueFS::truncate(FileWriter *h, uint64_t offset)/*_WF_L*/ } ceph_assert(h->file->fnode.size >= offset); _flush_bdev(h); + + std::lock_guard ll(log.lock); vselector->sub_usage(h->file->vselector_hint, h->file->fnode.size); h->file->fnode.size = offset; vselector->add_usage(h->file->vselector_hint, h->file->fnode.size); - - std::lock_guard ll(log.lock); log.t.op_file_update_inc(h->file->fnode); return 0; } int BlueFS::fsync(FileWriter *h)/*_WF_WD_WLD_WLNF_WNF*/ { + _maybe_check_vselector_LNF(); std::unique_lock hl(h->lock); uint64_t old_dirty_seq = 0; { @@ -3422,6 +3425,7 @@ int BlueFS::fsync(FileWriter *h)/*_WF_WD_WLD_WLNF_WNF*/ _flush_and_sync_log_LD(old_dirty_seq); } _maybe_compact_log_LNF_NF_LD_D(); + return 0; } @@ -3650,6 +3654,7 @@ int BlueFS::open_for_write( FileWriter **h, bool overwrite)/*_N_LD*/ { + _maybe_check_vselector_LNF(); FileRef file; bool create = false; bool truncate = false; @@ -3804,6 
+3809,7 @@ int BlueFS::open_for_read( FileReader **h, bool random)/*_N*/ { + _maybe_check_vselector_LNF(); std::lock_guard nl(nodes.lock); dout(10) << __func__ << " " << dirname << "/" << filename << (random ? " (random)":" (sequential)") << dendl; @@ -4260,6 +4266,35 @@ int BlueFS::_do_replay_recovery_read(FileReader *log_reader, return 0; } +void BlueFS::_check_vselector_LNF() { + BlueFSVolumeSelector* vs = vselector->clone_empty(); + if (!vs) { + return; + } + std::lock_guard ll(log.lock); + std::lock_guard nl(nodes.lock); + // Checking vselector is under log, nodes and file(s) locks, + // so any modification of vselector must be under at least one of those locks. + for (auto& f : nodes.file_map) { + f.second->lock.lock(); + vs->add_usage(f.second->vselector_hint, f.second->fnode); + } + bool res = vselector->compare(vs); + if (!res) { + dout(0) << "Current:"; + vselector->dump(*_dout); + *_dout << dendl; + dout(0) << "Expected:"; + vs->dump(*_dout); + *_dout << dendl; + } + ceph_assert(res); + for (auto& f : nodes.file_map) { + f.second->lock.unlock(); + } + delete vs; +} + size_t BlueFS::probe_alloc_avail(int dev, uint64_t alloc_size) { size_t total = 0; diff --git a/src/os/bluestore/BlueFS.h b/src/os/bluestore/BlueFS.h index c942327bf805..20dfbb5c4cfc 100644 --- a/src/os/bluestore/BlueFS.h +++ b/src/os/bluestore/BlueFS.h @@ -83,6 +83,10 @@ public: virtual uint8_t select_prefer_bdev(void* hint) = 0; virtual void get_paths(const std::string& base, paths& res) const = 0; virtual void dump(std::ostream& sout) = 0; + + /* used for sanity checking of vselector */ + virtual BlueFSVolumeSelector* clone_empty() const { return nullptr; } + virtual bool compare(BlueFSVolumeSelector* other) { return true; }; }; struct bluefs_shared_alloc_context_t { @@ -511,7 +515,11 @@ private: unsigned get_super_length() { return 4096; } - + void _maybe_check_vselector_LNF() { + if (cct->_conf->bluefs_check_volume_selector_often) { + _check_vselector_LNF(); + } + } public: 
BlueFS(CephContext* cct); ~BlueFS(); @@ -661,6 +669,7 @@ private: size_t read_offset, size_t read_len, bufferlist* bl); + void _check_vselector_LNF(); }; class OriginalVolumeSelector : public BlueFSVolumeSelector { diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc index 5f08051d7e2f..6d67ca57c34e 100644 --- a/src/os/bluestore/BlueStore.cc +++ b/src/os/bluestore/BlueStore.cc @@ -17945,6 +17945,29 @@ void RocksDBBlueFSVolumeSelector::dump(ostream& sout) { } } +BlueFSVolumeSelector* RocksDBBlueFSVolumeSelector::clone_empty() const { + RocksDBBlueFSVolumeSelector* ns = + new RocksDBBlueFSVolumeSelector(0, 0, 0, + 0, 0, 0, + 0, 0, false); + return ns; +} + +bool RocksDBBlueFSVolumeSelector::compare(BlueFSVolumeSelector* other) { + RocksDBBlueFSVolumeSelector* o = dynamic_cast<RocksDBBlueFSVolumeSelector*>(other); + ceph_assert(o); + bool equal = true; + for (size_t x = 0; x < BlueFS::MAX_BDEV + 1; x++) { + for (size_t y = 0; y < LEVEL_MAX - LEVEL_FIRST + 1; y++) { + equal &= (per_level_per_dev_usage.at(x, y) == o->per_level_per_dev_usage.at(x, y)); + } + } + for (size_t t = 0; t < LEVEL_MAX - LEVEL_FIRST + 1; t++) { + equal &= (per_level_files[t] == o->per_level_files[t]); + } + return equal; +} + // ======================================================= //================================================================================================================ diff --git a/src/os/bluestore/BlueStore.h b/src/os/bluestore/BlueStore.h index f98efc8fbd5f..13edcae5a681 100644 --- a/src/os/bluestore/BlueStore.h +++ b/src/os/bluestore/BlueStore.h @@ -4190,6 +4190,8 @@ public: BlueFSVolumeSelector::paths& res) const override; void dump(std::ostream& sout) override; + BlueFSVolumeSelector* clone_empty() const override; + bool compare(BlueFSVolumeSelector* other) override; }; #endif