From: Adam Kupczyk Date: Thu, 27 Mar 2025 13:17:36 +0000 (+0000) Subject: os/bluestore: Generalize WAL_v2 => envelope_mode X-Git-Tag: testing/wip-vshankar-testing-20250411.090237-debug~28^2~3 X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=613ff6c025734922490bb3460c3ddde661f5bfeb;p=ceph-ci.git os/bluestore: Generalize WAL_v2 => envelope_mode This refactor turns WAL_v2 into a globally available ENVELOPE_MODE. RocksDB WAL files now simply use this general feature. Adding new file encoding schemes will now be easier. Signed-off-by: Adam Kupczyk --- diff --git a/src/common/options/global.yaml.in b/src/common/options/global.yaml.in index 92a545fb748..e6c10714548 100644 --- a/src/common/options/global.yaml.in +++ b/src/common/options/global.yaml.in @@ -6718,12 +6718,12 @@ options: desc: Enables exception throwing instead of process abort on transaction submission error. default: false with_legacy: false -- name: bluefs_wal_v2 +- name: bluefs_wal_envelope_mode type: bool level: advanced desc: Enables a faster backend in BlueFS for WAL writes. - long_desc: Enabling this feature will reduce ~50% the amount of fdatasync syscalls issued by WAL writes. This happens because we embed metadata - with the data itself. Downgrading from a version that uses v2 to v1 will require running `ceph-bluestore-tool --command downgrade-wal-to-v1` - to move wal files to previous format. + long_desc: In envelope mode BlueFS files do not need to update metadata. When applied to RocksDB WAL files, + it reduces by ~50% the amount of fdatasync syscalls. + Downgrading from an envelope mode to legacy mode requires `ceph-bluestore-tool --command downgrade-wal-to-v1`. 
default: true with_legacy: false diff --git a/src/os/bluestore/BlueFS.cc b/src/os/bluestore/BlueFS.cc index 000d89c7616..440cba38ff7 100644 --- a/src/os/bluestore/BlueFS.cc +++ b/src/os/bluestore/BlueFS.cc @@ -1081,12 +1081,12 @@ int BlueFS::mount() goto out; } - selected_wal_v2 = cct->_conf.get_val("bluefs_wal_v2"); - log.use_wal_v2 = selected_wal_v2; + conf_wal_envelope_mode = cct->_conf.get_val("bluefs_wal_envelope_mode"); + log.uses_envelope_mode = conf_wal_envelope_mode; // init freelist for (auto& p : nodes.file_map) { - if (p.second->is_new_wal()) { - log.use_wal_v2 = true; + if (p.second->envelope_mode()) { + log.uses_envelope_mode = true; } dout(20) << __func__ << " noting alloc for " << p.second->fnode << dendl; for (auto& q : p.second->fnode.extents) { @@ -1244,7 +1244,8 @@ int BlueFS::fsck() int BlueFS::_write_super(int dev) { ++super.seq; - super.wal_version = log.use_wal_v2 ? 2 : 1; + super._version = log.uses_envelope_mode ? + bluefs_super_t::ENVELOPE_MODE_ENABLED : bluefs_super_t::BASELINE; // build superblock bufferlist bl; encode(super, bl); @@ -1708,9 +1709,9 @@ int BlueFS::_replay(bool noop, bool to_stdout) { bluefs_fnode_t fnode; decode(fnode, p); - ceph_assert(fnode.type == bluefs_node_type::REGULAR || - fnode.type == bluefs_node_type::WAL_V2 || - fnode.type == bluefs_node_type::WAL_V2_FIN); + ceph_assert(fnode.encoding == bluefs_node_encoding::PLAIN || + fnode.encoding == bluefs_node_encoding::ENVELOPE || + fnode.encoding == bluefs_node_encoding::ENVELOPE_FIN); dout(20) << __func__ << " 0x" << std::hex << pos << std::dec << ": op_file_update " << " " << fnode << " " << dendl; if (unlikely(to_stdout)) { @@ -1784,8 +1785,8 @@ int BlueFS::_replay(bool noop, bool to_stdout) } fnode.claim_extents(delta.extents); fnode.size = delta.size; - fnode.type = delta.type; - fnode.wal_size = delta.wal_size; + fnode.encoding = delta.encoding; + fnode.content_size = delta.content_size; dout(20) << __func__ << " 0x" << std::hex << pos << std::dec << ": 
op_file_update_inc produced " << " " << fnode << " " << dendl; @@ -1863,8 +1864,8 @@ int BlueFS::_replay(bool noop, bool to_stdout) dirty.seq_stable = log_seq; for (const auto &[filename, file] : nodes.file_map) { - if (file->is_new_wal()) { - _wal_index_file(file); + if (file->envelope_mode()) { + _envmode_index_file(file); } } } @@ -2225,7 +2226,7 @@ int BlueFS::downgrade_wal_to_v1( // we use dir for wals and name like wal; should get proper hint r = open_for_write(dir, tmp_name, &writer, false); // use normal v1 write path by marking node type to legacy - writer->file->fnode.type = bluefs_node_type::REGULAR; + writer->file->fnode.encoding = bluefs_node_encoding::PLAIN; ceph_assert(r == 0); r = open_for_read(dir, name, &reader); ceph_assert(r == 0); @@ -2261,7 +2262,7 @@ int BlueFS::downgrade_wal_to_v1() // copy, so it does not change auto dir_copy = dir_it->second->file_map; for (const auto& [file_name, file] : dir_copy) { - if(file->is_new_wal()) { + if(file->envelope_mode()) { downgrade_wal_to_v1(wal_dir, file_name); sync_metadata(true); dout(10) << __func__ << fmt::format(" {} v2=>v1", file_name) << dendl; @@ -2270,10 +2271,10 @@ int BlueFS::downgrade_wal_to_v1() } } - selected_wal_v2 = false; + conf_wal_envelope_mode = false; // Ensure no dangling wal v2 files are inside transactions. 
_compact_log_sync_LNF_LD(); - ceph_assert(!log.use_wal_v2); + ceph_assert(!log.uses_envelope_mode); _write_super(BDEV_DB); dout(5) << fmt::format("{} success moving data", __func__) << dendl; @@ -2446,41 +2447,41 @@ int64_t BlueFS::_read_random( std::ostream& operator<<( std::ostream& out, - const BlueFS::File::wal_flush_t& w) { + const BlueFS::File::envelope_t& w) { out << fmt::format("[wal:{:#x}~{:x} -> file:{:#x}~{:x}/{:x}/{:x}]", - w.wal_offset, w.wal_length, - w.file_offset, w.header_len, w.wal_length, w.tailer_len); + w.content_offset, w.content_length, + w.file_offset, w.head_len, w.content_length, w.tail_len); return out; } std::ostream& operator<<( std::ostream& out, - const BlueFS::File::wal_flush_t::wal_marker_t& m) { + const BlueFS::File::envelope_t::stamp_t& m) { out << fmt::format("0x{:02x}{:02x}{:02x}{:02x}{:02x}{:02x}{:02x}{:02x}", m.v[0], m.v[1], m.v[2], m.v[3], m.v[4], m.v[5], m.v[6], m.v[7]); return out; } -void BlueFS::_wal_index_file( +void BlueFS::_envmode_index_file( FileRef file) { - file->wal_marker = File::wal_flush_t::generate_hashed_marker(super.uuid, file->fnode.ino); - dout(10) << file << " required marker=#" << std::hex << file->wal_marker << std::dec << dendl; + file->stamp = File::envelope_t::generate_stamp(super.uuid, file->fnode.ino); + dout(10) << file << " required stamp=#" << std::hex << file->stamp << std::dec << dendl; uint64_t scan_ofs = 0; - uint64_t wal_ofs = 0; - File::wal_flush_t flush; + uint64_t env_ofs = 0; + File::envelope_t flush; bool envelope_good; uint64_t file_size = file->fnode.size; FileReader *h = new FileReader(file, 4096, true); ceph_assert(h); while(scan_ofs < file->fnode.allocated) { - envelope_good = _read_wal_flush(h, scan_ofs, wal_ofs, &flush); + envelope_good = _read_envelope(h, scan_ofs, env_ofs, &flush); dout(20) << "envelope " << (envelope_good ? 
"good " : "bad ") << flush << dendl; if (envelope_good) { - h->file->wal_flushes.push_back(flush); - wal_ofs += flush.wal_length; - scan_ofs += flush.wal_length + flush.header_len + flush.tailer_len; - if (file->fnode.type == WAL_V2_FIN) { + h->file->envelopes.push_back(flush); + env_ofs += flush.content_length; + scan_ofs += flush.content_length + flush.head_len + flush.tail_len; + if (file->fnode.encoding == ENVELOPE_FIN) { if (scan_ofs > file_size) { dout(5) << "valid envelope but exceeds file size." << dendl; } @@ -2491,18 +2492,18 @@ void BlueFS::_wal_index_file( // more envelopes expected continue; } else { - // WAL_V2, might continue + // ENVELOPE, not finalized, might continue continue; } } else { if (scan_ofs < file_size) { // we want to accept failed envelope if we are below confirmed wal size - h->file->wal_flushes.push_back(flush); + h->file->envelopes.push_back(flush); } break; } } - file->is_wal_read_loaded = true; + file->envelopes_indexed = true; delete h; } @@ -2511,80 +2512,80 @@ void BlueFS::_wal_index_file( // In case that some flush envelopes are corrupted, construct replacement; // make it up so the total content in flushes matches file size. // Replacement flush are considered valid in context of this function. 
-int BlueFS::_wal_seek_to( +int BlueFS::_envmode_seek_to( FileReader *h, uint64_t lookup_ofs, - File::wal_flush_t* fl_out) + File::envelope_t* fl_out) { //1st, lets check if we already have such offset - uint64_t wal_ofs = 0; + uint64_t env_ofs = 0; uint64_t file_ofs = 0; - for (const auto& fl: h->file->wal_flushes) { - ceph_assert(fl.wal_offset == wal_ofs); + for (const auto& fl: h->file->envelopes) { + ceph_assert(fl.content_offset == env_ofs); ceph_assert(fl.file_offset == file_ofs); - if (fl.wal_offset + fl.wal_length <= lookup_ofs) { - wal_ofs += fl.wal_length; - file_ofs += fl.header_len + fl.wal_length + fl.tailer_len; + if (fl.content_offset + fl.content_length <= lookup_ofs) { + env_ofs += fl.content_length; + file_ofs += fl.head_len + fl.content_length + fl.tail_len; continue; } - ceph_assert(fl.wal_offset <= lookup_ofs); + ceph_assert(fl.content_offset <= lookup_ofs); *fl_out = fl; - return fl.wal_offset + fl.wal_length - lookup_ofs; + return fl.content_offset + fl.content_length - lookup_ofs; } - if (wal_ofs == lookup_ofs) { + if (env_ofs == lookup_ofs) { // asking exactly for EOF return 0; } return -1; } -bool BlueFS::_read_wal_flush( +bool BlueFS::_read_envelope( FileReader *h, uint64_t file_ofs, - uint64_t wal_ofs, - File::wal_flush_t* fl) + uint64_t env_ofs, + File::envelope_t* fl) { ceph_le64 flush_length_le; - File::wal_flush_t::wal_marker_t wal_marker; - static_assert(File::wal_flush_t::header_size() == sizeof(flush_length_le)); - int64_t r = _read(h, file_ofs, File::wal_flush_t::header_size(), nullptr, (char*)&flush_length_le); - if (r != File::wal_flush_t::header_size()) goto fail; - r = _read(h, file_ofs + File::wal_flush_t::header_size() + flush_length_le, - File::wal_flush_t::tail_size(), nullptr, (char*)&wal_marker); - if (r != File::wal_flush_t::tail_size()) goto fail; - if (0 != memcmp(&wal_marker, &h->file->wal_marker, 8)) goto fail; - dout(20) << __func__ << " read.len=0x" << std::hex << flush_length_le << " read.marker=#" - << 
wal_marker << std::dec << dendl; - fl->wal_offset = wal_ofs; + File::envelope_t::stamp_t stamp; + static_assert(File::envelope_t::head_size() == sizeof(flush_length_le)); + int64_t r = _read(h, file_ofs, File::envelope_t::head_size(), nullptr, (char*)&flush_length_le); + if (r != File::envelope_t::head_size()) goto fail; + r = _read(h, file_ofs + File::envelope_t::head_size() + flush_length_le, + File::envelope_t::tail_size(), nullptr, (char*)&stamp); + if (r != File::envelope_t::tail_size()) goto fail; + if (0 != memcmp(&stamp.v, &h->file->stamp.v, sizeof(stamp.v))) goto fail; + dout(20) << __func__ << " read.len=0x" << std::hex << flush_length_le << " read.stamp=#" + << stamp << std::dec << dendl; + fl->content_offset = env_ofs; fl->file_offset = file_ofs; - fl->header_len = File::wal_flush_t::header_size(); - fl->tailer_len = File::wal_flush_t::tail_size(); - fl->wal_length = flush_length_le; + fl->head_len = File::envelope_t::head_size(); + fl->tail_len = File::envelope_t::tail_size(); + fl->content_length = flush_length_le; dout(20) << __func__ << " envelope: " << *fl << dendl; return true; fail: - dout(20) << __func__ << "read.len=0x" << std::hex << flush_length_le << " read.marker=#" << wal_marker - << " required marker=#" << h->file->wal_marker << std::dec << dendl; + dout(20) << __func__ << "read.len=0x" << std::hex << flush_length_le << " read.stamp=#" << stamp + << " required stamp=#" << h->file->stamp << std::dec << dendl; // technically, we could scan for missing flushes.... - fl->wal_offset = wal_ofs; + fl->content_offset = env_ofs; fl->file_offset = file_ofs; - fl->header_len = 0; - fl->tailer_len = 0; - fl->wal_length = file_ofs < h->file->fnode.size + fl->head_len = 0; + fl->tail_len = 0; + fl->content_length = file_ofs < h->file->fnode.size ? 
h->file->fnode.size - file_ofs : h->file->fnode.allocated - file_ofs; dout(10) << __func__ << " failed to find envelope, created artificial: " << *fl << dendl; return false; } -int64_t BlueFS::_read_wal( +int64_t BlueFS::_read_envmode( FileReader *h, ///< [in] read from here uint64_t off_req, ///< [in] offset size_t len_req, ///< [in] this many bytes bufferlist *outbl, ///< [out] optional: reference the result here char *out) ///< [out] optional: or copy it here { - ceph_assert(h->file->is_wal_read_loaded); + ceph_assert(h->file->envelopes_indexed); dout(10) << __func__ << " h " << h << " offset: 0x" << off_req << std::hex << "~" << len_req << std::hex << dendl; if (outbl) { @@ -2593,8 +2594,8 @@ int64_t BlueFS::_read_wal( uint64_t off = off_req; int64_t r = 0; while (off < off_req + len_req) { - File::wal_flush_t fl; - int64_t readable = _wal_seek_to(h, off, &fl); + File::envelope_t fl; + int64_t readable = _envmode_seek_to(h, off, &fl); if (readable == 0) { // we apparently read everything break; @@ -2605,9 +2606,9 @@ int64_t BlueFS::_read_wal( } readable = std::min(uint64_t(readable), off_req + len_req - off); dout(20) << fmt::format("{} wal:{:#x}~{:#x} -> file:{:#x}~{:#x}/{:#x}/{:#x}", - __func__, fl.wal_offset, fl.wal_length, fl.file_offset, fl.header_len, fl.wal_length, fl.tailer_len) << dendl; - ceph_assert(fl.wal_offset <= off && off < fl.wal_offset + fl.wal_length); - uint64_t file_off = fl.file_offset + fl.header_len + (off - fl.wal_offset); + __func__, fl.content_offset, fl.content_length, fl.file_offset, fl.head_len, fl.content_length, fl.tail_len) << dendl; + ceph_assert(fl.content_offset <= off && off < fl.content_offset + fl.content_length); + uint64_t file_off = fl.file_offset + fl.head_len + (off - fl.content_offset); bufferlist res; r = _read(h, file_off, readable, outbl ? &res : nullptr, out ? 
out + (off - off_req) : nullptr); @@ -2879,7 +2880,7 @@ void BlueFS::_compact_log_dump_metadata_NF(uint64_t start_seq, t->uuid = super.uuid; std::lock_guard nl(nodes.lock); - bool all_wal_is_v1 = true; + bool all_files_plain = true; for (auto& [ino, file_ref] : nodes.file_map) { if (ino == 1) continue; @@ -2916,13 +2917,13 @@ void BlueFS::_compact_log_dump_metadata_NF(uint64_t start_seq, << file_ref->dirty_seq << " " << file_ref->fnode << dendl; } t->op_file_update(file_ref->fnode); - if (file_ref->is_new_wal()) { - all_wal_is_v1 = false; + if (file_ref->envelope_mode()) { + all_files_plain = false; } } - if (all_wal_is_v1) { + if (all_files_plain) { // we are free to select now - log.use_wal_v2 = selected_wal_v2; + log.uses_envelope_mode = conf_wal_envelope_mode; } for (auto& [path, dir_ref] : nodes.dir_map) { dout(20) << __func__ << " op_dir_create " << path << dendl; @@ -3839,26 +3840,26 @@ void BlueFS::flush_range(FileWriter *h, uint64_t offset, uint64_t length)/*_WF*/ { _maybe_check_vselector_LNF(); std::unique_lock hl(h->lock); - if (h->file->is_new_wal()) { - //For wal files disregard offset and length and just flush current envelope. - _flush_wal_F(h); + if (h->file->envelope_mode()) { + // For envelope files disregard offset and length and just flush current envelope. 
+ _flush_envelope_F(h); } else { _flush_range_F(h, offset, length); } } -int BlueFS::_flush_wal_F(FileWriter *h) +int BlueFS::_flush_envelope_F(FileWriter *h) { ceph_assert(ceph_mutex_is_locked(h->lock)); - ceph_assert(h->file->is_new_wal()); - uint64_t content_length = h->get_buffer_length() - File::wal_flush_t::header_size(); - h->append((char*)&h->file->wal_marker.v[0], File::wal_flush_t::tail_size()); + ceph_assert(h->file->envelope_mode()); + uint64_t content_length = h->get_buffer_length() - File::envelope_t::head_size(); + h->append((char*)&h->file->stamp.v[0], File::envelope_t::tail_size()); uint64_t offset = h->pos; - h->file->fnode.wal_size += content_length; + h->file->fnode.content_size += content_length; ceph_le64 flush_length_le(content_length); - h->wal_header_filler.copy_in(File::wal_flush_t::header_size(), (char*)&flush_length_le); - uint64_t length = File::wal_flush_t::header_size() + content_length + File::wal_flush_t::tail_size(); + h->envelope_head_filler.copy_in(File::envelope_t::head_size(), (char*)&flush_length_le); + uint64_t length = File::envelope_t::head_size() + content_length + File::envelope_t::tail_size(); return _flush_range_F(h, offset, length); } @@ -3886,8 +3887,8 @@ int BlueFS::_flush_range_F(FileWriter *h, uint64_t offset, uint64_t length) if (end <= h->pos) return 0; if (offset < h->pos) { - // NOTE: let's assume that we do not overwrite wal - ceph_assert(!h->file->is_new_wal()); + // NOTE: for envelope files we flush existing data + ceph_assert(!h->file->envelope_mode()); length -= h->pos - offset; offset = h->pos; dout(10) << " still need 0x" @@ -3922,8 +3923,9 @@ int BlueFS::_flush_range_F(FileWriter *h, uint64_t offset, uint64_t length) if (h->file->fnode.size < end) { vselector->add_usage(h->file->vselector_hint, end - h->file->fnode.size); h->file->fnode.size = end; - // Don't mark regular appends as dirty on WAL_V2. Note that allocations are marked as dirty. 
- if (!h->file->is_new_wal()) { + // Don't mark regular appends as dirty on envelope files. + // Note that allocations are marked as dirty. + if (!h->file->envelope_mode()) { h->file->is_dirty = true; } } @@ -4057,8 +4059,8 @@ void BlueFS::append_try_flush(FileWriter *h, const char* buf, size_t len)/*_WF_L bool flushed_sum = false; { std::unique_lock hl(h->lock); - if (h->file->is_new_wal() && h->get_buffer_length() == 0) { - h->wal_header_filler = h->append_hole(File::wal_flush_t::header_size()); + if (h->file->envelope_mode() && h->get_buffer_length() == 0) { + h->envelope_head_filler = h->append_hole(File::envelope_t::head_size()); } size_t max_size = 1ull << 30; // cap to 1GB while (len > 0) { @@ -4126,8 +4128,8 @@ int BlueFS::_flush_F(FileWriter *h, bool force, bool *flushed) << " to " << h->file->fnode << dendl; ceph_assert(h->pos <= h->file->fnode.size); int r; - if (h->file->is_new_wal()) { - r = _flush_wal_F(h); + if (h->file->envelope_mode()) { + r = _flush_envelope_F(h); } else { r = _flush_range_F(h, offset, length); } @@ -4185,11 +4187,11 @@ int BlueFS::truncate(FileWriter *h, uint64_t offset)/*_WF_L*/ if (r < 0) return r; } - if (h->file->is_new_wal()) { + if (h->file->envelope_mode()) { // We cannot do random truncation. // But it is observed that RocksDB truncates WALs right at the end of written data. 
- ceph_assert(offset == h->file->fnode.wal_size || offset == 0); - if (offset == h->file->fnode.wal_size) { + ceph_assert(offset == h->file->fnode.content_size || offset == 0); + if (offset == h->file->fnode.content_size) { offset = h->file->fnode.size; } } @@ -4637,11 +4639,11 @@ int BlueFS::open_for_write( << " vsel_hint " << file->vselector_hint << dendl; *h = _create_writer(file); if (boost::algorithm::ends_with(filename, ".log")) { - if (selected_wal_v2) { - file->fnode.type = WAL_V2; - file->is_wal_read_loaded = false; - file->wal_marker = File::wal_flush_t::generate_hashed_marker(super.uuid, file->fnode.ino); - dout(20) << "wal v2 marker=#" << std::hex << file->wal_marker << std::dec << dendl; + if (conf_wal_envelope_mode) { + file->fnode.encoding = ENVELOPE; + file->envelopes_indexed = false; + file->stamp = File::envelope_t::generate_stamp(super.uuid, file->fnode.ino); + dout(20) << " stamp=#" << std::hex << file->stamp << std::dec << dendl; } (*h)->writer_type = BlueFS::WRITER_WAL; if (logger && !overwrite) { @@ -4709,9 +4711,9 @@ void BlueFS::close_writer(FileWriter *h) { std::unique_lock hl(h->lock); bool force_dirty = false; - if (h->file->is_new_wal()) { - ceph_assert(h->file->fnode.type == WAL_V2); - h->file->fnode.type = WAL_V2_FIN; + if (h->file->envelope_mode()) { + ceph_assert(h->file->fnode.encoding == ENVELOPE); + h->file->fnode.encoding = ENVELOPE_FIN; // we force fsync by forcing dirty flag force_dirty = true; } @@ -4768,11 +4770,11 @@ int BlueFS::open_for_read( return -ENOENT; } File *file = q->second.get(); - if (file->is_new_wal() && !file->is_wal_read_loaded) { - _wal_index_file(file); + if (file->envelope_mode() && !file->envelopes_indexed) { + _envmode_index_file(file); } *h = new FileReader(file, random ? 
4096 : cct->_conf->bluefs_max_prefetch, - file->is_new_wal()); + file->envelope_mode()); dout(10) << __func__ << " h " << *h << " on " << file->fnode << dendl; return 0; } @@ -4894,8 +4896,8 @@ int BlueFS::stat(std::string_view dirname, std::string_view filename, << " " << file->fnode << dendl; if (size) { - if (file->is_new_wal()) { - *size = file->fnode.wal_size; + if (file->envelope_mode()) { + *size = file->fnode.content_size; } else { *size = file->fnode.size; } diff --git a/src/os/bluestore/BlueFS.h b/src/os/bluestore/BlueFS.h index 012ceef308a..a7459b6cf89 100644 --- a/src/os/bluestore/BlueFS.h +++ b/src/os/bluestore/BlueFS.h @@ -270,39 +270,38 @@ public: MEMPOOL_CLASS_HELPERS(); /* - * WAL v2 files in bluefs have a different format from normal ones. In order to not flush metadata + * Envelope mode files in bluefs have a different format from normal ones. In order to not flush metadata * for every write we make to data extents, we create a package/envelope around the real data - * that includes length of the data we want to flush and a marker that identifies the flush. + * that includes length of the data we want to flush and a unique stamp. 
* * The format on disk is: - * legend: l = length of flush, d = data, m = marker; each character represents one byte + * legend: l = length of envelope, d = data, s = stamp * * flush 0 l==24 flush 1 l==4 flush 2 l==12 * v v v - * llll llll dddd dddd dddd dddd dddd dddd mmmm mmmm llll llll dddd mmmm mmmm llll llll dddd dddd dddd mmmm mmmm - * + * llll llll dddd dddd dddd dddd dddd dddd ssss ssss llll llll dddd ssss ssss llll llll dddd dddd dddd ssss ssss */ - struct wal_flush_t { - typedef struct wal_marker_t { + struct envelope_t { + typedef struct stamp_t { uint8_t v[8] = {0}; - } wal_marker_t; - typedef uint64_t wal_length_t; + } stamp_t; + typedef uint64_t envelope_len_t; uint64_t file_offset = 0; - uint64_t wal_offset = 0; // offset of start of flush, it should be length offset - uint32_t header_len = 0; - uint32_t tailer_len = 0; - uint64_t wal_length = 0; + uint64_t content_offset = 0; // offset of start of flush, it should be length offset + uint32_t head_len = 0; + uint32_t tail_len = 0; + uint64_t content_length = 0; - static constexpr size_t header_size() { - return sizeof(wal_length_t); + static constexpr size_t head_size() { + return sizeof(envelope_len_t); } static constexpr size_t tail_size() { - return sizeof(wal_marker_t); + return sizeof(stamp_t); } - static wal_marker_t generate_hashed_marker(uuid_d uuid, uint64_t ino) { - wal_marker_t m; + static stamp_t generate_stamp(uuid_d uuid, uint64_t ino) { + stamp_t m; const char* uuid_bytes = uuid.bytes(); uint64_t hashed_ino = ino; hashed_ino ^= hashed_ino << 5; @@ -339,13 +338,11 @@ public: _replay, device_migrate_to_existing, device_migrate_to_new */ ceph::mutex lock = ceph::make_mutex("BlueFS::File::lock"); - bool is_wal_read_loaded; // Before reading from WALv2 all flush envelopes must be located. - // The flag indicates whether `wal_flushes` is initialized. 
- std::vector wal_flushes; // to keep track of the amount of flushes we performed on a WAL file - // so that we can easily recalculate real data offsets. - // On "replay" this should be refilled in order to append data - // correctly. Nevertheless, replayed wal file most probably won't be reused - wal_flush_t::wal_marker_t wal_marker; + bool envelopes_indexed; // Before reading from enveloped file all envelopes must be located. + // The flag indicates whether `envelopes` is initialized. + std::vector envelopes; // Reading from enveloped file requires having indexed envelopes. + // Its filled either on _replay() or when file is opened for read. + envelope_t::stamp_t stamp; private: FRIEND_MAKE_REF(File); @@ -360,7 +357,7 @@ public: num_writers(0), num_reading(0), vselector_hint(nullptr), - is_wal_read_loaded(false) + envelopes_indexed(false) {} ~File() override { ceph_assert(num_readers.load() == 0); @@ -370,9 +367,8 @@ public: } public: - bool is_new_wal() { - // checks for both WAL_V2 and WAL_V2_FIN - return (fnode.type == WAL_V2 || fnode.type == WAL_V2_FIN); + bool envelope_mode() { + return (fnode.encoding == ENVELOPE || fnode.encoding == ENVELOPE_FIN); } }; @@ -414,7 +410,7 @@ public: const unsigned length, const bluefs_super_t& super); ceph::buffer::list::page_aligned_appender buffer_appender; //< for const char* only - bufferlist::contiguous_filler wal_header_filler; + bufferlist::contiguous_filler envelope_head_filler; public: int writer_type = 0; ///< WRITER_* int write_hint = WRITE_LIFE_NOT_SET; @@ -426,7 +422,7 @@ public: FileWriter(FileRef f) : file(std::move(f)), buffer_appender(buffer.get_page_aligned_appender( - g_conf()->bluefs_alloc_size / CEPH_PAGE_SIZE)), wal_header_filler() { + g_conf()->bluefs_alloc_size / CEPH_PAGE_SIZE)), envelope_head_filler() { ++file->num_writers; iocv.fill(nullptr); dirty_devs.fill(false); @@ -561,14 +557,14 @@ private: bluefs_super_t super; ///< latest superblock (as last written) uint64_t ino_last = 0; ///< last assigned 
ino (this one is in use) - bool selected_wal_v2 = false; ///< conf "bluefs_wal_v2" at mount + bool conf_wal_envelope_mode = false; ///< conf "bluefs_wal_envelope_mode" at mount struct { ceph::mutex lock = ceph::make_mutex("BlueFS::log.lock"); uint64_t seq_live = 1; //seq that log is currently writing to; mirrors dirty.seq_live FileWriter *writer = 0; bluefs_transaction_t t; - bool use_wal_v2 = false; //version of log currently in force + bool uses_envelope_mode = false; // true if any file is in envelope mode } log; struct { @@ -660,7 +656,7 @@ private: int _flush_range_F(FileWriter *h, uint64_t offset, uint64_t length); int _flush_data(FileWriter *h, uint64_t offset, uint64_t length, bool buffered); int _flush_F(FileWriter *h, bool force, bool *flushed = nullptr); - int _flush_wal_F(FileWriter *h); + int _flush_envelope_F(FileWriter *h); int _fsync(FileWriter *h, bool force_dirty); uint64_t _flush_special(FileWriter *h); @@ -715,23 +711,23 @@ private: void _flush_bdev(); // this is safe to call without a lock void _flush_bdev(std::array& dirty_bdevs); // this is safe to call without a lock - int64_t _read_wal( + int64_t _read_envmode( FileReader *h, ///< [in] read from here uint64_t offset, ///< [in] offset size_t len, ///< [in] this many bytes ceph::buffer::list *outbl, ///< [out] optional: reference the result here char *out); ///< [out] optional: or copy it here - void _wal_index_file( + void _envmode_index_file( FileRef file); - int _wal_seek_to( + int _envmode_seek_to( FileReader *h, ///< [in] wal-file to read uint64_t off, ///< [in] offset in wal datastream - File::wal_flush_t* fl);///< [out] set wal envelope params - bool _read_wal_flush( + File::envelope_t* fl);///< [out] set wal envelope params + bool _read_envelope( FileReader *h, ///< [in] wal-file to read uint64_t file_ofs, ///< [in] offset to expect envelope - uint64_t wal_ofs, ///< [in] respective offset in wal datastream - File::wal_flush_t* fl);///< [out] set wal envelope params + uint64_t env_ofs, 
///< [in] respective offset in wal datastream + File::envelope_t* fl);///< [out] set wal envelope params int64_t _read( FileReader *h, ///< [in] read from here uint64_t offset, ///< [in] offset @@ -888,8 +884,8 @@ public: // no need to hold the global lock here; we only touch h and // h->file, and read vs write or delete is already protected (via // atomics and asserts). - if (h->file->is_new_wal()) { - return _read_wal(h, offset, len, outbl, out); + if (h->file->envelope_mode()) { + return _read_envmode(h, offset, len, outbl, out); } return _read(h, offset, len, outbl, out); } diff --git a/src/os/bluestore/bluefs_types.cc b/src/os/bluestore/bluefs_types.cc index 6940b092fbe..80e948a72f2 100644 --- a/src/os/bluestore/bluefs_types.cc +++ b/src/os/bluestore/bluefs_types.cc @@ -173,16 +173,15 @@ void bluefs_layout_t::generate_test_instances(list& ls) } // bluefs_super_t -bluefs_super_t::bluefs_super_t() : seq(0), block_size(4096), wal_version(1) { +bluefs_super_t::bluefs_super_t() : seq(0), block_size(4096) { } void bluefs_super_t::encode(bufferlist& bl) const { - __u8 _version = 2; + ceph_assert(_version >= BASELINE); __u8 _compat = 1; - if (wal_version >= 2) { - _version = 3; - _compat = 3; + if (_version == ENVELOPE_MODE_ENABLED) { + _compat = ENVELOPE_MODE_ENABLED; } ENCODE_START(_version, _compat, bl); encode(uuid, bl); @@ -281,8 +280,8 @@ bluefs_fnode_delta_t* bluefs_fnode_t::make_delta(bluefs_fnode_delta_t* delta) { delta->offset = allocated_commited; delta->extents.clear(); - delta->type = type; - delta->wal_size = wal_size; + delta->encoding = encoding; + delta->content_size = content_size; if (allocated_commited < allocated) { uint64_t x_off = 0; auto p = seek(allocated_commited, &x_off); @@ -320,7 +319,7 @@ void bluefs_fnode_t::generate_test_instances(list& ls) ls.back()->mtime = utime_t(123,45); ls.back()->extents.push_back(bluefs_extent_t(0, 1048576, 4096)); ls.back()->__unused__ = 1; - ls.back()->type = 0; + ls.back()->encoding = 0; } ostream& 
operator<<(ostream& out, const bluefs_fnode_t& file) @@ -331,12 +330,12 @@ ostream& operator<<(ostream& out, const bluefs_fnode_t& file) << " allocated " << std::hex << file.allocated << std::dec << " alloc_commit " << std::hex << file.allocated_commited << std::dec << " extents " << file.extents; - if (file.type == WAL_V2 || file.type == WAL_V2_FIN) { - out << " wal_size 0x" << std::hex << file.wal_size << std::dec << std::hex; - if (file.type == WAL_V2) - out << " type WAL_V2 " << std::dec; - if (file.type == WAL_V2_FIN) - out << " type WAL_V2_FIN " << std::dec; + if (file.encoding == ENVELOPE || file.encoding == ENVELOPE_FIN) { + out << " content-size 0x" << std::hex << file.content_size << std::dec << std::hex; + if (file.encoding == ENVELOPE) + out << " ENVELOPE " << std::dec; + if (file.encoding == ENVELOPE_FIN) + out << " ENVELOPE-FIN " << std::dec; } out << ")"; return out; @@ -351,12 +350,12 @@ std::ostream& operator<<(std::ostream& out, const bluefs_fnode_delta_t& delta) << " mtime " << delta.mtime << " offset " << std::hex << delta.offset << std::dec << " extents " << delta.extents; - if (delta.type == WAL_V2 || delta.type == WAL_V2_FIN) { - out << " wal_size 0x" << std::hex << delta.wal_size << std::dec << std::hex; - if (delta.type == WAL_V2) - out << " type WAL_V2" << std::dec; - if (delta.type == WAL_V2_FIN) - out << " type WAL_V2_FIN" << std::dec; + if (delta.encoding == ENVELOPE || delta.encoding == ENVELOPE_FIN) { + out << " content-size 0x" << std::hex << delta.content_size << std::dec << std::hex; + if (delta.encoding == ENVELOPE) + out << " ENVELOPE" << std::dec; + if (delta.encoding == ENVELOPE_FIN) + out << " ENVELOPE-FIN" << std::dec; } out << ")"; return out; diff --git a/src/os/bluestore/bluefs_types.h b/src/os/bluestore/bluefs_types.h index ad1e5a31f3c..767d0cf1968 100644 --- a/src/os/bluestore/bluefs_types.h +++ b/src/os/bluestore/bluefs_types.h @@ -34,11 +34,12 @@ public: }; WRITE_CLASS_DENC(bluefs_extent_t) -enum bluefs_node_type { - 
REGULAR = 0, - WAL_V2 = 1, // WAL_V2 that is open for write - WAL_V2_FIN = 2, // WAL_V2 that we are done writing to; there is no data in [onode.size ... allocated) - NODE_TYPE_END = 0x100, +enum bluefs_node_encoding { + PLAIN = 0, ///< Normal; legacy mode. + ENVELOPE = 1, ///< Data flushed to file is wrapped in envelope - no size update needed. + /// Without shutdown, range [fnode.size ... fnode.allocated) may contain envelopes. + ENVELOPE_FIN = 2, ///< Same as envelope but file orderly closed. Fnode.size reflects actual end. + ENCODING_MAX = 3 }; std::ostream& operator<<(std::ostream& out, const bluefs_extent_t& e); @@ -82,9 +83,9 @@ struct bluefs_fnode_delta_t { // Equal to 'allocated' when created. // Used for consistency checking. - // only relevant in case of wal node - uint8_t type = REGULAR; - uint64_t wal_size; // The size of payload in the file; size = wal_size + n * envelope_size + uint8_t encoding = PLAIN; + // For envelope mode only. + uint64_t content_size; // The size of payload in the file; size = wal_size + n * envelope_size mempool::bluefs::vector extents; @@ -92,7 +93,7 @@ struct bluefs_fnode_delta_t { void bound_encode(size_t& p) const { uint8_t version = 1, compat = 1; - if (type == WAL_V2 || type == WAL_V2_FIN) { + if (encoding == ENVELOPE || encoding == ENVELOPE_FIN) { version = 2; compat = 2; } @@ -103,7 +104,7 @@ struct bluefs_fnode_delta_t { void encode(ceph::buffer::list::contiguous_appender& p) const { DENC_DUMP_PRE(bluefs_fnode_t); uint8_t version = 1, compat = 1; - if (type == WAL_V2 || type == WAL_V2_FIN) { + if (encoding == ENVELOPE || encoding == ENVELOPE_FIN) { version = 2; compat = 2; } @@ -126,8 +127,8 @@ struct bluefs_fnode_delta_t { denc(v.offset, p); denc(v.extents, p); if (struct_v >= 2) { - denc_varint(v.type, p); - denc_varint(v.wal_size, p); + denc_varint(v.encoding, p); + denc_varint(v.content_size, p); } } }; @@ -148,18 +149,19 @@ struct bluefs_fnode_t { uint64_t allocated; uint64_t allocated_commited; - uint8_t type = 
REGULAR; - uint64_t wal_size; // Amount of payload bytes in WAL(not including envelope data), there could be more on power off instances, in range of fnode.size~wal_limit - - bluefs_fnode_t() : ino(0), size(0), allocated(0), allocated_commited(0), wal_size(0) {} + uint8_t encoding = PLAIN; + // envelope mode only + uint64_t content_size; ///< Payload bytes inside envelopes. + /// When encoding == ENVELOPE indexing might update the value. + bluefs_fnode_t() : ino(0), size(0), allocated(0), allocated_commited(0), content_size(0) {} bluefs_fnode_t(uint64_t _ino, uint64_t _size, utime_t _mtime) : - ino(_ino), size(_size), mtime(_mtime), allocated(0), allocated_commited(0), wal_size(0) {} + ino(_ino), size(_size), mtime(_mtime), allocated(0), allocated_commited(0), content_size(0) {} bluefs_fnode_t(const bluefs_fnode_t& other) : ino(other.ino), size(other.size), mtime(other.mtime), allocated(other.allocated), allocated_commited(other.allocated_commited), - type(other.type), - wal_size(other.wal_size) { + encoding(other.encoding), + content_size(other.content_size) { clone_extents(other); } @@ -181,7 +183,7 @@ struct bluefs_fnode_t { DENC_HELPERS void bound_encode(size_t& p) const { uint8_t version = 1, compat = 1; - if (type == WAL_V2 || type == WAL_V2_FIN) { + if (encoding == ENVELOPE || encoding == ENVELOPE_FIN) { version = 2; compat = 2; } @@ -192,7 +194,7 @@ struct bluefs_fnode_t { void encode(ceph::buffer::list::contiguous_appender& p) const { DENC_DUMP_PRE(bluefs_fnode_t); uint8_t version = 1, compat = 1; - if (type == WAL_V2 || type == WAL_V2_FIN) { + if (encoding == ENVELOPE || encoding == ENVELOPE_FIN) { version = 2; compat = 2; } @@ -217,8 +219,8 @@ struct bluefs_fnode_t { denc(v.__unused__, p); denc(v.extents, p); if (struct_v >= 2) { - denc_varint(v.type, p); - denc_varint(v.wal_size, p); + denc_varint(v.encoding, p); + denc_varint(v.content_size, p); } } void reset_delta() { @@ -312,6 +314,11 @@ struct bluefs_layout_t { WRITE_CLASS_ENCODER(bluefs_layout_t) 
struct bluefs_super_t { + static constexpr uint8_t BASELINE = 2; + static constexpr uint8_t ENVELOPE_MODE_ENABLED = 3; + + uint8_t _version = BASELINE; ///< usually we hide encoding version, + /// but we need to tie features to it uuid_d uuid; ///< unique to this bluefs instance uuid_d osd_uuid; ///< matches the osd that owns us uint64_t seq; ///< sequence counter