From 18968bc07f8cd4d30901dd0b3ee6efdbb04d271e Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Wed, 31 Jul 2024 12:13:06 +0200 Subject: [PATCH] os/bluestore: Introduce custom format of BlueFS file - WAL WAL file mode allows to cut by 50% of fdatasync(). For regular files, we independently sync file data and file metadata. In WAL mode we are able to recover most metadata from file bytestream. Hence - we only sync file data. Signed-off-by: Pere Diaz Bou (cherry picked from commit 38519fffc0e196a0b9c83c54a9142d7eade7febb) Amended-by: Adam Kupczyk --- src/common/options/global.yaml.in | 9 + src/include/denc.h | 10 + src/include/encoding.h | 27 +++ src/os/bluestore/BlueFS.cc | 340 +++++++++++++++++++++++++++--- src/os/bluestore/BlueFS.h | 121 ++++++++++- src/os/bluestore/bluefs_types.cc | 64 +++++- src/os/bluestore/bluefs_types.h | 121 ++++++++++- 7 files changed, 644 insertions(+), 48 deletions(-) diff --git a/src/common/options/global.yaml.in b/src/common/options/global.yaml.in index e44838fc00c6b..92a545fb748fb 100644 --- a/src/common/options/global.yaml.in +++ b/src/common/options/global.yaml.in @@ -6718,3 +6718,12 @@ options: desc: Enables exception throwing instead of process abort on transaction submission error. default: false with_legacy: false +- name: bluefs_wal_v2 + type: bool + level: advanced + desc: Enables a faster backend in BlueFS for WAL writes. + long_desc: Enabling this feature will reduce ~50% the amount of fdatasync syscalls issued by WAL writes. This happens because we embed metadata + with the data itself. Downgrading from a version that uses v2 to v1 will require running `ceph-bluestore-tool --command downgrade-wal-to-v1` + to move wal files to previous format. 
+ default: true + with_legacy: false diff --git a/src/include/denc.h b/src/include/denc.h index d945e2646896e..81a302f044226 100644 --- a/src/include/denc.h +++ b/src/include/denc.h @@ -1912,6 +1912,16 @@ struct StructVChecker _denc_start(p, &struct_v.v, &struct_compat, &_denc_pchar, &_denc_u32); \ do { +// This variant is unsafe, because older versions will not even catch incompatibility. +// The ability to decode must be verified by other means. +#define DENC_START_UNSAFE(v, compat, p) \ + __u8 struct_v = v; \ + __u8 struct_compat = compat; \ + char *_denc_pchar; \ + uint32_t _denc_u32; \ + _denc_start(p, &struct_v, &struct_compat, &_denc_pchar, &_denc_u32); \ + do { + // For osd_reqid_t which cannot be upgraded at all. // We used it to communicate with clients and now we cannot safely upgrade. #define DENC_START_OSD_REQID(_v, compat, p) \ diff --git a/src/include/encoding.h b/src/include/encoding.h index fc8825c6492c9..362143a60325c 100644 --- a/src/include/encoding.h +++ b/src/include/encoding.h @@ -1447,6 +1447,21 @@ decode(std::array& v, bufferlist::const_iterator& p) using ::ceph::encode; \ do { +#define ENCODE_START_FILLER(v, compat, filler_in) \ + __u8 struct_v = v; \ + __u8 struct_compat = compat; \ + ceph_le32 struct_len; \ + auto& filler = filler_in; \ + filler.copy_in(sizeof(struct_v), (char *)&struct_v); \ + filler.copy_in(sizeof(struct_compat), \ + (char *)&struct_compat); \ + char* struct_len_ptr = filler.c_str(); \ + filler.advance(sizeof(struct_len)); \ + const auto starting_bl_len = filler.c_str(); \ + using ::ceph::encode; \ + do { + + /** * finish encoding block * @@ -1464,6 +1479,18 @@ decode(std::array& v, bufferlist::const_iterator& p) (char *)&struct_compat); \ filler.copy_in(sizeof(struct_len), (char *)&struct_len); + +/** + * finish encoding block with filler + * + * Takes no arguments: stores the number of bytes written after the length slot + * into the struct_len slot that ENCODE_START_FILLER reserved in the filler. + */ +#define ENCODE_FINISH_FILLER() \ + } while (false); \ + struct_len = 
filler.c_str() - starting_bl_len; \ + *((ceph_le32*)struct_len_ptr) = struct_len; + #define ENCODE_FINISH(bl) ENCODE_FINISH_NEW_COMPAT(bl, 0) #define DECODE_ERR_OLDVERSION(func, v, compatv) \ diff --git a/src/os/bluestore/BlueFS.cc b/src/os/bluestore/BlueFS.cc index da6dea8931537..d95694226c817 100644 --- a/src/os/bluestore/BlueFS.cc +++ b/src/os/bluestore/BlueFS.cc @@ -1,6 +1,8 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab +#include #include +#include #include "boost/algorithm/string.hpp" #include "bluestore_common.h" #include "BlueFS.h" @@ -10,8 +12,10 @@ #include "common/errno.h" #include "common/perf_counters.h" #include "Allocator.h" +#include "include/buffer_fwd.h" #include "include/ceph_assert.h" #include "common/admin_socket.h" +#include "os/bluestore/bluefs_types.h" #ifdef WITH_SEASTAR #include "crimson/common/perf_counters_collection.h" @@ -1228,9 +1232,15 @@ int BlueFS::fsck() return 0; } -int BlueFS::_write_super(int dev) +int BlueFS::_write_super(int dev, uint8_t wal_version) { ++super.seq; + if (wal_version > 0) { + super.wal_version = wal_version; + } else { + bool use_wal_v2 = cct->_conf.get_val("bluefs_wal_v2"); + super.wal_version = use_wal_v2 ? 
2 : 1; + } // build superblock bufferlist bl; encode(super, bl); @@ -1617,6 +1627,7 @@ int BlueFS::_replay(bool noop, bool to_stdout) vselector->get_hint_by_dir(dirname); vselector->add_usage(file->vselector_hint, file->fnode); + q->second->file_map[filename] = file; ++file->refs; } @@ -1642,8 +1653,10 @@ int BlueFS::_replay(bool noop, bool to_stdout) ceph_assert(q != nodes.dir_map.end()); map::iterator r = q->second->file_map.find(filename); ceph_assert(r != q->second->file_map.end()); - ceph_assert(r->second->refs > 0); - --r->second->refs; + + FileRef file = r->second; + ceph_assert(file->refs > 0); + --file->refs; q->second->file_map.erase(r); } } @@ -1692,6 +1705,7 @@ int BlueFS::_replay(bool noop, bool to_stdout) { bluefs_fnode_t fnode; decode(fnode, p); + ceph_assert(fnode.type == bluefs_node_type::REGULAR || fnode.type == bluefs_node_type::WAL_V2); dout(20) << __func__ << " 0x" << std::hex << pos << std::dec << ": op_file_update " << " " << fnode << " " << dendl; if (unlikely(to_stdout)) { @@ -1763,8 +1777,10 @@ int BlueFS::_replay(bool noop, bool to_stdout) if (fnode.ino != 1) { vselector->sub_usage(f->vselector_hint, fnode); } - fnode.size = delta.size; fnode.claim_extents(delta.extents); + fnode.size = delta.size; + fnode.wal_limit = delta.wal_limit; + fnode.wal_size = delta.wal_size; dout(20) << __func__ << " 0x" << std::hex << pos << std::dec << ": op_file_update_inc produced " << " " << fnode << " " << dendl; @@ -1840,8 +1856,19 @@ int BlueFS::_replay(bool noop, bool to_stdout) dirty.seq_live = log_seq + 1; log.t.seq = log.seq_live; dirty.seq_stable = log_seq; + + for (const auto &[filename, file] : nodes.file_map) { + if (file->is_new_wal()) { + dout(5) << __func__ << " " << file << " " << file->refs << dendl; + if (file->refs == 0) { + continue; + } + _wal_update_size(file, file->fnode.size); + } + } } + dout(10) << __func__ << " log file size was 0x" << std::hex << log_file->fnode.size << std::dec << dendl; if (unlikely(to_stdout)) { @@ -2186,6 
+2213,7 @@ int BlueFS::device_migrate_to_new( return 0; } + BlueFS::FileRef BlueFS::_get_file(uint64_t ino) { auto p = nodes.file_map.find(ino); @@ -2349,6 +2377,185 @@ int64_t BlueFS::_read_random( return ret; } +void BlueFS::_wal_update_size(FileRef file, uint64_t increment) { + using WALLength = File::WALFlush::WALLength; + + file->is_wal_read_loaded = true; + file->wal_flushes.clear(); + + uint64_t flush_offset = 0; + dout(20) + << fmt::format( + "{} updating WAL file {} for range {:#x}~{:#x} limit is {:#x}", + __func__, file->fnode.ino, flush_offset, increment, file->fnode.wal_limit) + << dendl; + ceph_assert(file->wal_flushes.empty()); + + FileReader *h = new FileReader(file, cct->_conf->bluefs_max_prefetch, false, true); + + size_t header_size = File::WALFlush::header_size(); + + uint64_t flush_end = flush_offset + increment; + while (flush_offset < file->fnode.wal_limit) { + // read first part of wal flush + bufferlist bl; + bluefs_wal_header_t header; + + uint64_t read_result = (uint64_t)_read(h, flush_offset, header_size, &bl, nullptr); + if (read_result < header_size) { + dout(20) << fmt::format("{} cannot read wal header, most likely we are out of bounds. 
flush_offset={:#X}", __func__, flush_offset) << dendl; + break; + } + + dout(30) << __func__ << " result \n"; + bl.hexdump(*_dout); + *_dout << dendl; + auto buffer_iterator = bl.cbegin(); + try { + decode(header, buffer_iterator); + } catch(ceph::buffer::error& e) { + // EOF or corruption + dout(30) << fmt::format("couldn't decode wal flush header at offset {:#x}: {}", flush_offset, e.what()) << dendl; + break; + } + + WALLength flush_length = header.flush_length; + dout(20) << __func__ << " flush_length " << flush_length << dendl; + File::WALFlush new_flush(flush_offset, flush_length); + + // read marker + bl.clear(); + uint64_t marker_offset = new_flush.get_marker_offset(); + read_result = _read(h, marker_offset, new_flush.tail_size(), &bl, nullptr); + if (read_result < new_flush.tail_size()) { + dout(20) << fmt::format("{} cannot read marker, most likely we are out of bounds. flush_offset={:#X}, marker_offset={:#X}", __func__, flush_offset, marker_offset) << dendl; + break; + } + uint64_t marker; + buffer_iterator = bl.cbegin(); + decode(marker, buffer_iterator); + if (marker != File::WALFlush::generate_hashed_marker(super.osd_uuid, file->fnode.ino)) { + // EOF or corruption + dout(30) << fmt::format("reached eof or marker corruption {:#x}", flush_offset) << dendl; + break; + } + + uint64_t increase = new_flush.end_offset() - new_flush.offset; + dout(20) << fmt::format("{} adding flush {:#x}~{:#x}", __func__, flush_offset, new_flush.length) << dendl; + file->wal_flushes.push_back(new_flush); + if (flush_offset >= flush_end) { + dout(20) << fmt::format("{} recovering flush {:#x}~{:#x}", __func__, flush_offset, new_flush.length) << dendl; + file->fnode.wal_size += new_flush.length; + file->fnode.size += increase; + vselector->add_usage(file->vselector_hint, increase); + } + + flush_offset += increase; + } + + // if we read less it might mean corruption + if (flush_offset < flush_end) { + dout(20) << fmt::format("{} read less than expected {:#x} bytes", __func__, 
flush_offset) << dendl; + } + ceph_assert(flush_offset >= flush_end); + + delete h; +} + +int64_t BlueFS::_read_wal( + FileReader *h, ///< [in] read from here + uint64_t off, ///< [in] offset + size_t len, ///< [in] this many bytes + bufferlist *outbl, ///< [out] optional: reference the result here + char *out) ///< [out] optional: or copy it here +{ + ceph_assert(h->file->is_wal_read_loaded); + dout(20) << __func__ << " h " << h << " offset: 0x" + << off << std::hex << "~" << len << std::hex << dendl; + if (outbl) { + outbl->clear(); + } + + int64_t ret = 0; + + // WAL data is wrapped in an envelope that has a format of [length of flush, payload, hashed marker] + // wal_data_logical_offset points to the offset of the payload we are currently in. + uint64_t wal_data_logical_offset = 0; + + + // save previous position as buffer pos is treated differently on regular files + uint64_t previous_pos = h->buf.pos; + + uint64_t remaining_len = len; + auto flush_iterator = h->file->wal_flushes.begin(); + while (remaining_len > 0 && flush_iterator != h->file->wal_flushes.end()) { + uint64_t flush_offset = flush_iterator->offset; + uint64_t flush_length = flush_iterator->length; + dout(25) << fmt::format("{} flush_offset={:#x} flush_length={:#x}", __func__, flush_offset, flush_length) << dendl; + + if (flush_length == 0) { + if (remaining_len > 0) { + dout(5) << __func__ << " flush_length 0: reading less then required " + << ret << "<" << len - ret << dendl; + } + break; + } + // if we won't find offset here, go ahead + bool in_range = wal_data_logical_offset < off + len && wal_data_logical_offset + flush_length > off; + ceph_assert(wal_data_logical_offset < off+len); + if (!in_range) { + if (off >= wal_data_logical_offset + flush_length) { + // move to next flush + // TODO(pere): do we check "ino" here too? 
+ wal_data_logical_offset += flush_length; + flush_iterator++; + continue; + } + } + + uint64_t payload_offset = flush_iterator->get_payload_offset(); + + uint64_t skip_front = 0; + if(wal_data_logical_offset < off) { + // offset is in this flush chunk so if we are before we move forward + skip_front = off - wal_data_logical_offset; + } + payload_offset += skip_front; + wal_data_logical_offset += skip_front; + + dout(20) << fmt::format("{} payload_offset is {:#X} after skipping {:#X} bytes", __func__, payload_offset, skip_front) << dendl; + + uint64_t data_to_read_from_flush = std::min(flush_length-skip_front, remaining_len); + bufferlist payload; + dout(25) << fmt::format("{} data to read from flush = {:#X}", __func__, data_to_read_from_flush) << dendl; + _read(h, payload_offset, data_to_read_from_flush, &payload, nullptr); + + if (out) { + auto p = payload.begin(); + p.copy(data_to_read_from_flush, out); + out += data_to_read_from_flush; + } + if (outbl) { + outbl->claim_append(payload); + } + flush_iterator++; + + remaining_len -= data_to_read_from_flush; + wal_data_logical_offset += data_to_read_from_flush; + ret += data_to_read_from_flush; + } + if (remaining_len > 0) { + dout(20) << __func__ << " reading less than required, missing: " << remaining_len << dendl; + } + + dout(20) << __func__ << std::hex + << " got 0x" << ret + << std::dec << dendl; + ceph_assert(!outbl || (int)outbl->length() == ret); + h->buf.pos = previous_pos + ret; + return ret; +} + int64_t BlueFS::_read( FileReader *h, ///< [in] read from here uint64_t off, ///< [in] offset @@ -3478,14 +3685,19 @@ ceph::bufferlist BlueFS::FileWriter::flush_buffer( if (partial) { tail_block.splice(0, tail_block.length(), &bl); } + dout(20) << __func__ << " tail is" << std::hex << bl.length() << dendl; + ceph_assert(length >= bl.length()); const auto remaining_len = length - bl.length(); buffer.splice(0, remaining_len, &bl); if (buffer.length()) { dout(20) << " leaving 0x" << std::hex << buffer.length() << 
std::dec << " unflushed" << dendl; } - if (const unsigned tail = bl.length() & ~super.block_mask(); tail) { - const auto padding_len = super.block_size - tail; + unsigned padding_len = 0; + // Append padding to fill block + const unsigned tail = bl.length() & ~super.block_mask(); + if (tail) { + padding_len = super.block_size - tail; dout(20) << __func__ << " caching tail of 0x" << std::hex << tail << " and padding block with 0x" << padding_len @@ -3496,6 +3708,7 @@ ceph::bufferlist BlueFS::FileWriter::flush_buffer( // Otherwise a costly rebuild could happen in e.g. `KernelDevice`. buffer_appender.append_zero(padding_len); buffer.splice(buffer.length() - padding_len, padding_len, &bl); + // Deep copy the tail here. This allows to avoid costlier copy on // bufferlist rebuild in e.g. `KernelDevice` and minimizes number // of memory allocations. @@ -3507,6 +3720,7 @@ ceph::bufferlist BlueFS::FileWriter::flush_buffer( } else { tail_block.clear(); } + return bl; } @@ -3558,6 +3772,13 @@ int BlueFS::_flush_range_F(FileWriter *h, uint64_t offset, uint64_t length) ceph_assert(h->file->num_readers.load() == 0); ceph_assert(h->file->fnode.ino > 1); + if (h->file->is_new_wal()) { + // WALFlush::WALLength is already appended at the start of first append_try_flush + // update length, offset is already updated with correct position + length += File::WALFlush::tail_size(); + } + uint64_t end = offset + length; + dout(10) << __func__ << " " << h << " pos 0x" << std::hex << h->pos << " 0x" << offset << "~" << length << std::dec << " to " << h->file->fnode @@ -3570,9 +3791,11 @@ int BlueFS::_flush_range_F(FileWriter *h, uint64_t offset, uint64_t length) bool buffered = cct->_conf->bluefs_buffered_io; - if (offset + length <= h->pos) + if (end <= h->pos) return 0; if (offset < h->pos) { + // NOTE: let's assume that we do not overwrite wal + ceph_assert(!h->file->is_new_wal()); length -= h->pos - offset; offset = h->pos; dout(10) << " still need 0x" @@ -3585,11 +3808,11 @@ int 
BlueFS::_flush_range_F(FileWriter *h, uint64_t offset, uint64_t length) uint64_t allocated = h->file->fnode.get_allocated(); // do not bother to dirty the file if we are overwriting // previously allocated extents. - if (allocated < offset + length) { + if (allocated < end) { // we should never run out of log space here; see the min runway check // in _flush_and_sync_log. int r = _allocate(vselector->select_prefer_bdev(h->file->vselector_hint), - offset + length - allocated, + end - allocated, 0, &h->file->fnode, [&](const bluefs_extent_t& e) { @@ -3604,11 +3827,27 @@ int BlueFS::_flush_range_F(FileWriter *h, uint64_t offset, uint64_t length) } h->file->is_dirty = true; } - if (h->file->fnode.size < offset + length) { - vselector->add_usage(h->file->vselector_hint, offset + length - h->file->fnode.size); - h->file->fnode.size = offset + length; - h->file->is_dirty = true; + if (h->file->fnode.size < end) { + vselector->add_usage(h->file->vselector_hint, end - h->file->fnode.size); + h->file->fnode.size = end; + // Don't mark regular appends as dirty on WAL_V2. Note that allocations are marked as dirty. 
+ if (!h->file->is_new_wal()) { + h->file->is_dirty = true; + } + } + + if (h->file->is_new_wal()) { + // create WAL flush envelope + uint64_t flush_size = length - File::WALFlush::extra_envelope_size_on_front_and_tail(); + ceph_assert(h->get_wal_header_filler() != nullptr); + bluefs_wal_header_t(flush_size).encode(*h->get_wal_header_filler()); + h->set_wal_header_filler(nullptr); + + h->append(h->file->wal_marker); + h->file->fnode.wal_size += flush_size; + h->file->fnode.wal_limit = h->file->fnode.get_allocated(); } + dout(20) << __func__ << " file now, unflushed " << h->file->fnode << dendl; int res = _flush_data(h, offset, length, buffered); logger->tinc(l_bluefs_flush_lat, mono_clock::now() - t0); @@ -3698,6 +3937,7 @@ int BlueFS::_flush_data(FileWriter *h, uint64_t offset, uint64_t length, bool bu } } } + dout(20) << __func__ << " h " << h << " pos now 0x" << std::hex << h->pos << std::dec << dendl; return 0; @@ -3738,6 +3978,14 @@ void BlueFS::append_try_flush(FileWriter *h, const char* buf, size_t len)/*_WF_L bool flushed_sum = false; { std::unique_lock hl(h->lock); + + if (h->file->is_new_wal() && h->get_buffer_length() == 0) { + size_t size = 0; + bluefs_wal_header_t().bound_encode(size); + bufferlist::contiguous_filler filler = h->append_hole(size); + h->set_wal_header_filler(std::make_unique(bufferlist::contiguous_filler(filler))); + } + size_t max_size = 1ull << 30; // cap to 1GB while (len > 0) { bool need_flush = true; @@ -3866,6 +4114,16 @@ int BlueFS::truncate(FileWriter *h, uint64_t offset)/*_WF_L*/ { std::lock_guard ll(log.lock); std::lock_guard dl(dirty.lock); + if (h->file->is_new_wal()) { + // This assumption comes from reading logs of rocksdb+bluefs where a WAL file follows this pattern: + // 1. create wal + // 2. open_for_write + // 3. close_writer + // 4. truncate -> fnode.size + // 5. 
unlink + ceph_assert(h->file->fnode.size == offset || offset == 0); + h->file->fnode.wal_limit = offset; + } bool changed_extents = false; vselector->sub_usage(h->file->vselector_hint, fnode); uint64_t x_off = 0; @@ -3913,7 +4171,7 @@ int BlueFS::truncate(FileWriter *h, uint64_t offset)/*_WF_L*/ return 0; } -int BlueFS::fsync(FileWriter *h)/*_WF_WD_WLD_WLNF_WNF*/ +int BlueFS::fsync(FileWriter *h, bool force_dirty)/*_WF_WD_WLD_WLNF_WNF*/ { auto t0 = mono_clock::now(); _maybe_check_vselector_LNF(); @@ -3926,7 +4184,7 @@ int BlueFS::fsync(FileWriter *h)/*_WF_WD_WLD_WLNF_WNF*/ if (r < 0) return r; _flush_bdev(h); - if (h->file->is_dirty) { + if (h->file->is_dirty || force_dirty) { _signal_dirty_to_log_D(h); h->file->is_dirty = false; } @@ -4184,8 +4442,11 @@ int BlueFS::preallocate(FileRef f, uint64_t off, uint64_t len)/*_LF*/ }); if (r < 0) return r; - + if (f->is_new_wal()) { + f->fnode.wal_limit = f->fnode.get_allocated(); + } log.t.op_file_update_inc(f->fnode); + f->is_dirty = true; } return 0; } @@ -4297,18 +4558,15 @@ int BlueFS::open_for_write( << " vsel_hint " << file->vselector_hint << dendl; - log.t.op_file_update(file->fnode); - if (create) - log.t.op_dir_link(dirname, filename, file->fnode.ino); - - std::lock_guard dl(dirty.lock); - for (auto& p : pending_release_extents) { - dirty.pending_release[p.bdev].insert(p.offset, p.length); - } - } - *h = _create_writer(file); + *h = _create_writer(file); - if (boost::algorithm::ends_with(filename, ".log")) { + if (boost::algorithm::ends_with(filename, ".log")) { + bool use_wal_v2 = cct->_conf.get_val("bluefs_wal_v2"); + if (use_wal_v2) { + file->fnode.type = WAL_V2; + file->is_wal_read_loaded = false; + file->wal_marker = File::WALFlush::generate_hashed_marker(super.osd_uuid, file->fnode.ino); + } (*h)->writer_type = BlueFS::WRITER_WAL; if (logger && !overwrite) { logger->inc(l_bluefs_files_written_wal); @@ -4320,6 +4578,18 @@ int BlueFS::open_for_write( } } + log.t.op_file_update(file->fnode); + if (create) { + 
log.t.op_dir_link(dirname, filename, file->fnode.ino); + } + + std::lock_guard dl(dirty.lock); + for (auto& p : pending_release_extents) { + dirty.pending_release[p.bdev].insert(p.offset, p.length); + } + } + + dout(10) << __func__ << " h " << *h << " on " << file->fnode << dendl; return 0; } @@ -4360,6 +4630,11 @@ void BlueFS::_close_writer(FileWriter *h) } void BlueFS::close_writer(FileWriter *h) { + if (h->file->is_new_wal()) { + // we force fsync by forcing dirty flag + fsync(h, true); + } + { std::lock_guard l(h->lock); _drain_writer(h); @@ -4535,8 +4810,15 @@ int BlueFS::stat(std::string_view dirname, std::string_view filename, File *file = q->second.get(); dout(10) << __func__ << " " << dirname << "/" << filename << " " << file->fnode << dendl; - if (size) - *size = file->fnode.size; + + if (size) { + if (file->is_new_wal()) { + *size = file->fnode.wal_size; + } else { + *size = file->fnode.size; + } + } + if (mtime) *mtime = file->fnode.mtime; return 0; diff --git a/src/os/bluestore/BlueFS.h b/src/os/bluestore/BlueFS.h index f21c20869fb65..4c365eb497b4a 100644 --- a/src/os/bluestore/BlueFS.h +++ b/src/os/bluestore/BlueFS.h @@ -6,6 +6,7 @@ #include #include #include +#include #include "bluefs_types.h" #include "blk/BlockDevice.h" @@ -13,10 +14,13 @@ #include "common/RefCountedObj.h" #include "common/ceph_context.h" #include "global/global_context.h" +#include "include/byteorder.h" +#include "include/ceph_hash.h" #include "include/common_fwd.h" #include "boost/intrusive/list.hpp" #include "boost/dynamic_bitset.hpp" +#include "include/hash.h" class Allocator; @@ -265,6 +269,65 @@ public: struct File : public RefCountedObject { MEMPOOL_CLASS_HELPERS(); + /* + * WAL files in bluefs have a different format from normal ones. In order to not flush metadata + * for every write we make to data extents, we create a package/envelope around the real data + * that includes Length of the data we want to flush and a marker that identifies the flush. 
+ * + * The format on disk will look something like: + * legend = l = length of flush, d = data, m = marker, x=unused/used by other file, each character will be a byte + * + * flush 0 l==24 flush 1 l==4 flush 2 l==12 + * v v v + * llll llll dddd dddd dddd dddd dddd dddd mmmm mmmm xxxx xxx xxxx llll llll dddd mmmm mmmm xxxx llll llll dddd dddd dddd mmmm mmmm + * + */ + struct WALFlush { + typedef uint64_t WALMarker; + typedef uint64_t WALLength; + + uint64_t offset = 0; // offset of the start of the flush, i.e. of its length header + uint64_t length = 0; + + WALFlush(uint64_t offset, uint64_t length) : offset(offset), length(length) {} + + + static constexpr size_t header_size() { + return bluefs_wal_header_t::size(); + } + + static constexpr size_t tail_size() { + return sizeof(WALMarker); + } + + uint64_t end_offset() { + return get_marker_offset() + tail_size(); + } + + uint64_t get_payload_offset() { + return offset + header_size(); + } + + uint64_t get_marker_offset() { + return get_payload_offset() + length; + } + + static constexpr uint64_t extra_envelope_size_on_front_and_tail() { + return header_size() + tail_size(); + } + + static uint64_t generate_hashed_marker(uuid_d uuid, uint64_t ino) { + char uuid_copy[16]; + memcpy(uuid_copy, uuid.bytes(), 16); + uint64_t* blocks_of_64 = (uint64_t*)&uuid_copy[0]; + for (size_t i = 0; i < (sizeof(uuid_copy) / sizeof(uint64_t)); i++) { + blocks_of_64[i] ^= ino; + } + return ceph_str_hash(CEPH_STR_HASH_RJENKINS, &uuid_copy[0], sizeof(uuid_copy)); + } + }; + + bluefs_fnode_t fnode; int refs; uint64_t dirty_seq; @@ -283,6 +346,13 @@ public: _replay, device_migrate_to_existing, device_migrate_to_new */ ceph::mutex lock = ceph::make_mutex("BlueFS::File::lock"); + bool is_wal_read_loaded; // mark whether the WAL file is ready to be read, i.e. _wal_update_size was called + std::vector wal_flushes; // to keep track of the amount of flushes we performed on a WAL file + // so that we can easily recalculate real data offsets. 
+ // On "replay" this should be refilled in order to append data + // correctly. Nevertheless, replayed wal file most probably won't be reused + uint64_t wal_marker; + private: FRIEND_MAKE_REF(File); File() @@ -295,7 +365,8 @@ public: num_readers(0), num_writers(0), num_reading(0), - vselector_hint(nullptr) + vselector_hint(nullptr), + is_wal_read_loaded(false) {} ~File() override { ceph_assert(num_readers.load() == 0); @@ -303,6 +374,12 @@ public: ceph_assert(num_reading.load() == 0); ceph_assert(!locked); } + + public: + bool is_new_wal() { + return fnode.type == WAL_V2; + } + }; using FileRef = ceph::ref_t; @@ -329,8 +406,8 @@ public: FileRef file; uint64_t pos = 0; ///< start offset for buffer - private: ceph::buffer::list buffer; ///< new data to write (at end of file) + private: ceph::buffer::list tail_block; ///< existing partial block at end of file, if any public: unsigned get_buffer_length() const { @@ -342,6 +419,7 @@ public: const unsigned length, const bluefs_super_t& super); ceph::buffer::list::page_aligned_appender buffer_appender; //< for const char* only + std::unique_ptr wal_header_filler; // To encode bluefs_wal_header_t we need to save the location of the header we want to fill public: int writer_type = 0; ///< WRITER_* int write_hint = WRITE_LIFE_NOT_SET; @@ -353,7 +431,7 @@ public: FileWriter(FileRef f) : file(std::move(f)), buffer_appender(buffer.get_page_aligned_appender( - g_conf()->bluefs_alloc_size / CEPH_PAGE_SIZE)) { + g_conf()->bluefs_alloc_size / CEPH_PAGE_SIZE)), wal_header_filler(nullptr) { ++file->num_writers; iocv.fill(nullptr); dirty_devs.fill(false); @@ -393,9 +471,30 @@ public: buffer_appender.append_zero(len); } + void append(uint64_t value) { + uint64_t l0 = get_buffer_length(); + ceph_assert(l0 + sizeof(value) <= std::numeric_limits::max()); + bufferlist encoded; + encode(value, encoded); + buffer_appender.append(encoded); + } + + bufferlist::contiguous_filler append_hole(uint64_t len) { + return buffer.append_hole(len); + } 
+ + void set_wal_header_filler(std::unique_ptr filler) { + wal_header_filler.swap(filler); + } + + bufferlist::contiguous_filler* get_wal_header_filler() { + return wal_header_filler.get(); + } + uint64_t get_effective_write_pos() { return pos + buffer.length(); } + }; struct FileReaderBuffer { @@ -636,6 +735,14 @@ private: void _flush_bdev(); // this is safe to call without a lock void _flush_bdev(std::array& dirty_bdevs); // this is safe to call without a lock + void _wal_update_size(FileRef file, uint64_t increment); + + int64_t _read_wal( + FileReader *h, ///< [in] read from here + uint64_t offset, ///< [in] offset + size_t len, ///< [in] this many bytes + ceph::buffer::list *outbl, ///< [out] optional: reference the result here + char *out); ///< [out] optional: or copy it here int64_t _read( FileReader *h, ///< [in] read from here uint64_t offset, ///< [in] offset @@ -649,7 +756,7 @@ private: char *out); ///< [out] optional: or copy it here int _open_super(); - int _write_super(int dev); + int _write_super(int dev, uint8_t wal_version = 1); int _check_allocations(const bluefs_fnode_t& fnode, boost::dynamic_bitset* used_blocks, bool is_alloc, //true when allocating, false when deallocating @@ -677,6 +784,7 @@ private: _check_vselector_LNF(); } } + public: BlueFS(CephContext* cct); ~BlueFS(); @@ -784,12 +892,15 @@ public: void append_try_flush(FileWriter *h, const char* buf, size_t len); void flush_range(FileWriter *h, uint64_t offset, uint64_t length); - int fsync(FileWriter *h); + int fsync(FileWriter *h, bool force_dirty = false); int64_t read(FileReader *h, uint64_t offset, size_t len, ceph::buffer::list *outbl, char *out) { // no need to hold the global lock here; we only touch h and // h->file, and read vs write or delete is already protected (via // atomics and asserts). 
+ if (h->file->is_new_wal()) { + return _read_wal(h, offset, len, outbl, out); + } return _read(h, offset, len, outbl, out); } int64_t read_random(FileReader *h, uint64_t offset, size_t len, diff --git a/src/os/bluestore/bluefs_types.cc b/src/os/bluestore/bluefs_types.cc index 554ebcfc24709..892c157e30d68 100644 --- a/src/os/bluestore/bluefs_types.cc +++ b/src/os/bluestore/bluefs_types.cc @@ -5,7 +5,9 @@ #include "bluefs_types.h" #include "BlueFS.h" #include "common/Formatter.h" +#include "include/byteorder.h" #include "include/denc.h" +#include "include/encoding.h" #include "include/uuid.h" #include "include/stringify.h" @@ -171,24 +173,34 @@ void bluefs_layout_t::generate_test_instances(list& ls) } // bluefs_super_t -bluefs_super_t::bluefs_super_t() : seq(0), block_size(4096) { +bluefs_super_t::bluefs_super_t() : seq(0), block_size(4096), wal_version(1) { } void bluefs_super_t::encode(bufferlist& bl) const { - ENCODE_START(2, 1, bl); + __u8 _version = 2; + __u8 _compat = 1; + if (wal_version >= 2) { + _version = 3; + _compat = 3; + } + ENCODE_START(_version, _compat, bl); encode(uuid, bl); encode(osd_uuid, bl); encode(seq, bl); encode(block_size, bl); encode(log_fnode, bl); encode(memorized_layout, bl); + if (_version >= 3) { + encode(wal_version, bl); + } ENCODE_FINISH(bl); } void bluefs_super_t::decode(bufferlist::const_iterator& p) { - DECODE_START(2, p); + + DECODE_START(3, p); decode(uuid, p); decode(osd_uuid, p); decode(seq, p); @@ -197,6 +209,9 @@ void bluefs_super_t::decode(bufferlist::const_iterator& p) if (struct_v >= 2) { decode(memorized_layout, p); } + if (struct_v >= 3) { + decode(wal_version, p); + } DECODE_FINISH(p); } @@ -265,6 +280,10 @@ bluefs_fnode_delta_t* bluefs_fnode_t::make_delta(bluefs_fnode_delta_t* delta) { delta->mtime = mtime; delta->offset = allocated_commited; delta->extents.clear(); + + delta->type = type; + delta->wal_limit = wal_limit; + delta->wal_size = wal_size; if (allocated_commited < allocated) { uint64_t x_off = 0; auto p 
= seek(allocated_commited, &x_off); @@ -302,17 +321,24 @@ void bluefs_fnode_t::generate_test_instances(list& ls) ls.back()->mtime = utime_t(123,45); ls.back()->extents.push_back(bluefs_extent_t(0, 1048576, 4096)); ls.back()->__unused__ = 1; + ls.back()->type = 0; } ostream& operator<<(ostream& out, const bluefs_fnode_t& file) { - return out << "file(ino " << file.ino + out << "file(ino " << file.ino << " size 0x" << std::hex << file.size << std::dec << " mtime " << file.mtime << " allocated " << std::hex << file.allocated << std::dec << " alloc_commit " << std::hex << file.allocated_commited << std::dec - << " extents " << file.extents - << ")"; + << " extents " << file.extents; + if (file.type == WAL_V2) { + out << " wal_limit " << file.wal_limit << std::hex; + out << " wal_size " << file.wal_size << std::hex; + out << " type WAL_V2 " << std::dec; + } + out << ")"; + return out; } // bluefs_fnode_delta_t @@ -405,3 +431,29 @@ ostream& operator<<(ostream& out, const bluefs_transaction_t& t) << std::dec << ")"; } +void bluefs_wal_header_t::bound_encode(size_t &s) const { + s += 1; // version + s += 1; // compat + s += 4; // size + denc(flush_length, s); +} + +void bluefs_wal_header_t::encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(flush_length, bl); + ENCODE_FINISH(bl); +} + +void bluefs_wal_header_t::encode(bufferlist::contiguous_filler& filler_in) const { + ENCODE_START_FILLER(1, 1, filler_in); + ceph_le64 flush_length_le(flush_length); + filler_in.copy_in(sizeof(flush_length_le), (char*)&flush_length_le); + ENCODE_FINISH_FILLER(); +} + +void bluefs_wal_header_t::decode(bufferlist::const_iterator& p) +{ + DECODE_START(1, p); + decode(flush_length, p); + DECODE_FINISH(p); +} diff --git a/src/os/bluestore/bluefs_types.h b/src/os/bluestore/bluefs_types.h index 75fc22fb9eaf9..610ba35f2406f 100644 --- a/src/os/bluestore/bluefs_types.h +++ b/src/os/bluestore/bluefs_types.h @@ -4,6 +4,7 @@ #define CEPH_OS_BLUESTORE_BLUEFS_TYPES_H #include +#include 
#include "bluestore_types.h" #include "include/utime.h" @@ -33,6 +34,12 @@ public: }; WRITE_CLASS_DENC(bluefs_extent_t) +enum bluefs_node_type { + REGULAR = 0, + WAL_V2 = 1, + NODE_TYPE_END = 0x100, +}; + std::ostream& operator<<(std::ostream& out, const bluefs_extent_t& e); struct bluefs_locked_extents_t { @@ -73,16 +80,65 @@ struct bluefs_fnode_delta_t { uint64_t offset; // Contains offset in file of extents. // Equal to 'allocated' when created. // Used for consistency checking. + + // only relevant in case of wal node; zero-initialized because decode() does not assign them when struct_v < 2 + uint64_t wal_limit = 0; + uint64_t wal_size = 0; + uint8_t type = REGULAR; + + mempool::bluefs::vector extents; - DENC(bluefs_fnode_delta_t, v, p) { - DENC_START(1, 1, p); + DENC_HELPERS + + void bound_encode(size_t& p) const { + _denc_friend(*this, p); + } + void encode(ceph::buffer::list::contiguous_appender& p) const { + DENC_DUMP_PRE(bluefs_fnode_t); + _denc_friend(*this, p); + } + void decode(ceph::buffer::ptr::const_iterator& p) { + DENC_START_UNSAFE(2, 2, p); + denc_varint(ino, p); + denc_varint(size, p); + denc(mtime, p); + denc(offset, p); + denc(extents, p); + + if (struct_v >= 2) { + denc(type, p); + denc(wal_limit, p); + denc(wal_size, p); + } + DENC_FINISH(p); + + } + + template + friend std::enable_if_t>> + _denc_friend(T& v, P& p) { + uint8_t version = 1, compat = 1; + if (v.type == WAL_V2) { + version = 2; + compat = 2; + } + DENC_START_UNSAFE(version, compat, p); + denc_varint(v.ino, p); denc_varint(v.size, p); denc(v.mtime, p); denc(v.offset, p); denc(v.extents, p); + + if (struct_v >= 2) { + denc(v.type, p); + denc(v.wal_limit, p); + denc(v.wal_size, p); + } + DENC_FINISH(p); + } }; WRITE_CLASS_DENC(bluefs_fnode_delta_t) @@ -102,14 +158,19 @@ struct bluefs_fnode_t { uint64_t allocated; uint64_t allocated_commited; + uint8_t type = REGULAR; + uint64_t wal_limit; // EOF of wal, this limit represents upper limit of fnode.size, not upper limit of wal_size + uint64_t wal_size; // Amount of payload bytes in WAL(not including envelope data),
there could be more on power off instances, in range of wal_size~wal_limit - bluefs_fnode_t() : ino(0), size(0), allocated(0), allocated_commited(0) {} + bluefs_fnode_t() : ino(0), size(0), allocated(0), allocated_commited(0), wal_limit(0), wal_size(0) {} bluefs_fnode_t(uint64_t _ino, uint64_t _size, utime_t _mtime) : - ino(_ino), size(_size), mtime(_mtime), allocated(0), allocated_commited(0) {} + ino(_ino), size(_size), mtime(_mtime), allocated(0), allocated_commited(0), wal_limit(0), wal_size(0) {} bluefs_fnode_t(const bluefs_fnode_t& other) : ino(other.ino), size(other.size), mtime(other.mtime), allocated(other.allocated), - allocated_commited(other.allocated_commited) { + allocated_commited(other.allocated_commited), type(other.type), /* type has a default member initializer (REGULAR); leaving it out of this list made every copy REGULAR even though the WAL counters were copied */ + wal_limit(other.wal_limit), + wal_size(other.wal_size) { clone_extents(other); } @@ -136,19 +197,46 @@ struct bluefs_fnode_t { DENC_DUMP_PRE(bluefs_fnode_t); _denc_friend(*this, p); } + void decode(ceph::buffer::ptr::const_iterator& p) { - _denc_friend(*this, p); + DENC_START_COMPAT_2(2, 2, p); + denc_varint(ino, p); + denc_varint(size, p); + denc(mtime, p); + denc(__unused__, p); + denc(extents, p); + if (struct_v >= 2) { + denc(type, p); + denc(wal_limit, p); + denc(wal_size, p); + } + if (struct_v == 1) { + type = REGULAR; + } + DENC_FINISH(p); recalc_allocated(); } + template friend std::enable_if_t>> _denc_friend(T& v, P& p) { - DENC_START(1, 1, p); + + uint8_t version = 1, compat = 1; + if (v.type == WAL_V2) { + version = 2; + compat = 2; + } + DENC_START_UNSAFE(version, compat, p); denc_varint(v.ino, p); denc_varint(v.size, p); denc(v.mtime, p); denc(v.__unused__, p); denc(v.extents, p); + if (struct_v >= 2) { + denc(v.type, p); + denc(v.wal_limit, p); + denc(v.wal_size, p); + } DENC_FINISH(p); } void reset_delta() { @@ -251,6 +339,8 @@ struct bluefs_super_t { std::optional memorized_layout; + uint8_t wal_version; + bluefs_super_t(); uint64_t block_mask() const { @@ -287,7 +377,6 @@ struct bluefs_transaction_t { uuid_d uuid; ///< fs uuid
uint64_t seq; ///< sequence number ceph::buffer::list op_bl; ///< encoded transaction ops - bluefs_transaction_t() : seq(0) {} void clear() { @@ -369,4 +458,20 @@ struct bluefs_transaction_t { WRITE_CLASS_ENCODER(bluefs_transaction_t) std::ostream& operator<<(std::ostream& out, const bluefs_transaction_t& t); + + + +struct bluefs_wal_header_t { /* WAL header record holding a single 64-bit flush_length */ + uint64_t flush_length; + + bluefs_wal_header_t() : flush_length(0) {} + bluefs_wal_header_t(uint64_t flush_length) : flush_length(flush_length) {} + static constexpr size_t size() { return (sizeof(__u8)*2) + sizeof(uint32_t) + sizeof(uint64_t); } /* two __u8 + one uint32_t + one uint64_t; presumably the version, compat and length fields followed by flush_length - confirm against bound_encode() */ + void bound_encode(size_t &s) const; + void encode(ceph::buffer::list& bl) const; + void encode(ceph::buffer::list::contiguous_filler& filler_in) const; + void decode(ceph::buffer::list::const_iterator& p); +}; +WRITE_CLASS_ENCODER(bluefs_wal_header_t) + #endif -- 2.39.5