From 24ec5c7ee22860130e720c85b1953db8c70f16e7 Mon Sep 17 00:00:00 2001 From: Adam Kupczyk Date: Mon, 17 Mar 2025 15:16:48 +0000 Subject: [PATCH] os/bluestore: Make wal_v2 selection more robust Changed the logic for selecting the bluefs_super_t wal_version field. It is now more a result of the data that is present in the bluefs log and is less directly controlled. Signed-off-by: Adam Kupczyk --- src/os/bluestore/BlueFS.cc | 38 ++++++++++++++++++++++---------------- src/os/bluestore/BlueFS.h | 30 ++++++++++-------------------- 2 files changed, 32 insertions(+), 36 deletions(-) diff --git a/src/os/bluestore/BlueFS.cc b/src/os/bluestore/BlueFS.cc index 08bae808c87..000d89c7616 100644 --- a/src/os/bluestore/BlueFS.cc +++ b/src/os/bluestore/BlueFS.cc @@ -1081,8 +1081,13 @@ int BlueFS::mount() goto out; } + selected_wal_v2 = cct->_conf.get_val("bluefs_wal_v2"); + log.use_wal_v2 = selected_wal_v2; // init freelist for (auto& p : nodes.file_map) { + if (p.second->is_new_wal()) { + log.use_wal_v2 = true; + } dout(20) << __func__ << " noting alloc for " << p.second->fnode << dendl; for (auto& q : p.second->fnode.extents) { bool is_shared = is_shared_alloc(q.bdev); @@ -1236,15 +1241,10 @@ int BlueFS::fsck() return 0; } -int BlueFS::_write_super(int dev, uint8_t wal_version) +int BlueFS::_write_super(int dev) { ++super.seq; - if (wal_version > 0) { - super.wal_version = wal_version; - } else { - bool use_wal_v2 = cct->_conf.get_val("bluefs_wal_v2"); - super.wal_version = use_wal_v2 ? 2 : 1; - } + super.wal_version = log.use_wal_v2 ? 2 : 1; // build superblock bufferlist bl; encode(super, bl); @@ -2270,10 +2270,11 @@ int BlueFS::downgrade_wal_to_v1() } } + selected_wal_v2 = false; // Ensure no dangling wal v2 files are inside transactions.
_compact_log_sync_LNF_LD(); - // TODO assert on presence of wal_v2 - _write_super(BDEV_DB, 1); + ceph_assert(!log.use_wal_v2); + _write_super(BDEV_DB); dout(5) << fmt::format("{} success moving data", __func__) << dendl; @@ -2878,7 +2879,7 @@ void BlueFS::_compact_log_dump_metadata_NF(uint64_t start_seq, t->uuid = super.uuid; std::lock_guard nl(nodes.lock); - + bool all_wal_is_v1 = true; for (auto& [ino, file_ref] : nodes.file_map) { if (ino == 1) continue; @@ -2915,6 +2916,13 @@ void BlueFS::_compact_log_dump_metadata_NF(uint64_t start_seq, << file_ref->dirty_seq << " " << file_ref->fnode << dendl; } t->op_file_update(file_ref->fnode); + if (file_ref->is_new_wal()) { + all_wal_is_v1 = false; + } + } + if (all_wal_is_v1) { + // we are free to select now + log.use_wal_v2 = selected_wal_v2; } for (auto& [path, dir_ref] : nodes.dir_map) { dout(20) << __func__ << " op_dir_create " << path << dendl; @@ -4626,12 +4634,10 @@ int BlueFS::open_for_write( file->fnode.mtime = ceph_clock_now(); dout(20) << __func__ << " mapping " << dirname << "/" << filename - << " vsel_hint " << file->vselector_hint - << dendl; - *h = _create_writer(file); - if (boost::algorithm::ends_with(filename, ".log")) { - bool use_wal_v2 = cct->_conf.get_val("bluefs_wal_v2"); - if (use_wal_v2) { + << " vsel_hint " << file->vselector_hint << dendl; + *h = _create_writer(file); + if (boost::algorithm::ends_with(filename, ".log")) { + if (selected_wal_v2) { file->fnode.type = WAL_V2; file->is_wal_read_loaded = false; file->wal_marker = File::wal_flush_t::generate_hashed_marker(super.uuid, file->fnode.ino); diff --git a/src/os/bluestore/BlueFS.h b/src/os/bluestore/BlueFS.h index 875252b43be..012ceef308a 100644 --- a/src/os/bluestore/BlueFS.h +++ b/src/os/bluestore/BlueFS.h @@ -301,33 +301,21 @@ public: return sizeof(wal_marker_t); } - uint64_t end_offset() { - return get_marker_offset() + tail_size(); - } - - uint64_t get_payload_offset() { - return wal_offset + header_size(); - } - - uint64_t 
get_marker_offset() { - return get_payload_offset() + wal_length; - } - - static constexpr uint64_t extra_envelope_size_on_front_and_tail() { - return header_size() + tail_size(); - } - static wal_marker_t generate_hashed_marker(uuid_d uuid, uint64_t ino) { wal_marker_t m; - uint8_t uuid_copy[16]; - memcpy(uuid_copy, uuid.bytes(), 16); + const char* uuid_bytes = uuid.bytes(); uint64_t hashed_ino = ino; hashed_ino ^= hashed_ino << 5; hashed_ino ^= hashed_ino << 11; hashed_ino ^= hashed_ino << 23; // use hashed ino in a endiness-agnostic way + // U0 U1 U2 U3 U4 U5 U6 U7 + // U8 U9 U10 U11 U12 U13 U14 U15 + // H0 H1 H2 H3 H4 H5 H6 H7 + // ^ ^ ^ ^ ^ ^ ^ ^ + // m0 m1 m2 m3 m4 m5 m6 m7 for (int i = 0; i < 8; i++) { - m.v[i] = uuid_copy[i] ^ uuid_copy[8 + i] ^ (hashed_ino >> (8 * i)); + m.v[i] = uuid_bytes[i] ^ uuid_bytes[8 + i] ^ (hashed_ino >> (8 * i)); } return m; } @@ -573,12 +561,14 @@ private: bluefs_super_t super; ///< latest superblock (as last written) uint64_t ino_last = 0; ///< last assigned ino (this one is in use) + bool selected_wal_v2 = false; ///< conf "bluefs_wal_v2" at mount struct { ceph::mutex lock = ceph::make_mutex("BlueFS::log.lock"); uint64_t seq_live = 1; //seq that log is currently writing to; mirrors dirty.seq_live FileWriter *writer = 0; bluefs_transaction_t t; + bool use_wal_v2 = false; //version of log currently in force } log; struct { @@ -755,7 +745,7 @@ private: char *out); ///< [out] optional: or copy it here int _open_super(); - int _write_super(int dev, uint8_t wal_version = 1); + int _write_super(int dev); int _check_allocations(const bluefs_fnode_t& fnode, boost::dynamic_bitset* used_blocks, bool is_alloc, //true when allocating, false when deallocating -- 2.39.5