From eed03125c4a7c5ee558ae72e73060475c406dc5f Mon Sep 17 00:00:00 2001 From: liangmingyuan Date: Sun, 21 Apr 2024 20:32:09 +0800 Subject: [PATCH] bluefs: bluefs alloc unit should only be shrink The alloc unit has already forbidden changed for bluestore, what's more, it should forbidden increased in bluefs. Otherwise, it can leads to coredump or bad data. Let's explain it use Bitmap Allocater, it has two presentations: a) in BitmapAllocator::init_rm_free(offset, length), (offset + length) should bigger than offs. But when get_min_alloc_size() changed bigger, this can not be guaranteed. b) if init_rm_free() are successfully in luck, then in rocksdb compact, when release() be called, it release a small extent but may leads to larger extents be released to Bitmap. As a result, rocksdb data is corrupted, and the osd can not be booted again. Signed-off-by: Mingyuan Liang --- src/os/bluestore/BlueFS.cc | 42 +++++++++++++++++++++++++++++++- src/os/bluestore/bluefs_types.cc | 17 +++++++++++-- src/os/bluestore/bluefs_types.h | 6 ++--- 3 files changed, 59 insertions(+), 6 deletions(-) diff --git a/src/os/bluestore/BlueFS.cc b/src/os/bluestore/BlueFS.cc index a8b1fb25ee8..b3715a05d8b 100644 --- a/src/os/bluestore/BlueFS.cc +++ b/src/os/bluestore/BlueFS.cc @@ -712,10 +712,27 @@ void BlueFS::_init_alloc() { dout(20) << __func__ << dendl; + // 'changed' should keep its previous value if no actual modification occurred + auto change_alloc_size = [this](uint64_t& max_alloc_size, + uint64_t new_alloc, bool& changed) { + if (max_alloc_size == 0 || + (max_alloc_size > new_alloc && ((new_alloc & (new_alloc -1)) == 0))) { + max_alloc_size = new_alloc; + changed = true; + dout(5) << " changed alloc_size to 0x" << std::hex << new_alloc << dendl; + } else if (max_alloc_size != new_alloc) { + derr << " can not change current alloc_size 0x" << std::hex + << max_alloc_size << " to new alloc_size 0x" << new_alloc << dendl; + } + }; + + bool alloc_size_changed = false; size_t wal_alloc_size = 0; if (bdev[BDEV_WAL]) { wal_alloc_size = cct->_conf->bluefs_alloc_size; alloc_size[BDEV_WAL] = wal_alloc_size; + change_alloc_size(super.bluefs_max_alloc_size[BDEV_WAL], + wal_alloc_size, alloc_size_changed); } logger->set(l_bluefs_wal_alloc_unit, wal_alloc_size); @@ -731,18 +748,38 @@ void BlueFS::_init_alloc() if (bdev[BDEV_SLOW]) { alloc_size[BDEV_DB] = cct->_conf->bluefs_alloc_size; alloc_size[BDEV_SLOW] = shared_alloc_size; + change_alloc_size(super.bluefs_max_alloc_size[BDEV_DB], + cct->_conf->bluefs_alloc_size, alloc_size_changed); + change_alloc_size(super.bluefs_max_alloc_size[BDEV_SLOW], + shared_alloc_size, alloc_size_changed); } else { alloc_size[BDEV_DB] = shared_alloc_size; alloc_size[BDEV_SLOW] = 0; + change_alloc_size(super.bluefs_max_alloc_size[BDEV_DB], + shared_alloc_size, alloc_size_changed); } logger->set(l_bluefs_db_alloc_unit, alloc_size[BDEV_DB]); logger->set(l_bluefs_slow_alloc_unit, alloc_size[BDEV_SLOW]); // new wal and db devices are never shared if (bdev[BDEV_NEWWAL]) { alloc_size[BDEV_NEWWAL] = cct->_conf->bluefs_alloc_size; + change_alloc_size(super.bluefs_max_alloc_size[BDEV_NEWWAL], + cct->_conf->bluefs_alloc_size, alloc_size_changed); } + if (alloc_size_changed) { + dout(1) << __func__ << " alloc_size changed, the new super is:" << super << dendl; + _write_super(BDEV_DB); + } + + alloc_size_changed = false; if (bdev[BDEV_NEWDB]) { alloc_size[BDEV_NEWDB] = cct->_conf->bluefs_alloc_size; + change_alloc_size(super.bluefs_max_alloc_size[BDEV_NEWDB], + cct->_conf->bluefs_alloc_size, alloc_size_changed); + } + if (alloc_size_changed) { + dout(1) << __func__ << " alloc_size changed, the new super is:" << super << dendl; + _write_super(BDEV_NEWDB); } for (unsigned id = 0; id < bdev.size(); ++id) { @@ -750,6 +787,7 @@ void BlueFS::_init_alloc() continue; } ceph_assert(bdev[id]->get_size()); + ceph_assert(super.bluefs_max_alloc_size[id]); if (is_shared_alloc(id)) { dout(1) << __func__ << " shared, id " << id << std::hex << ", capacity 0x" << bdev[id]->get_size() @@ -769,10 +807,11 @@ void BlueFS::_init_alloc() << ", capacity 0x" << bdev[id]->get_size() << ", reserved 0x" << block_reserved[id] << ", block size 0x" << alloc_size[id] + << ", max alloc size 0x" << super.bluefs_max_alloc_size[id] << std::dec << dendl; alloc[id] = Allocator::create(cct, cct->_conf->bluefs_allocator, bdev[id]->get_size(), - alloc_size[id], + super.bluefs_max_alloc_size[id], name); alloc[id]->init_add_free( block_reserved[id], @@ -992,6 +1031,7 @@ int BlueFS::mount() _init_alloc(); + dout(5) << __func__ << " super: " << super << dendl; r = _replay(false, false); if (r < 0) { derr << __func__ << " failed to replay log: " << cpp_strerror(r) << dendl; diff --git a/src/os/bluestore/bluefs_types.cc b/src/os/bluestore/bluefs_types.cc index 5b2281a9ffd..e18dd490140 100644 --- a/src/os/bluestore/bluefs_types.cc +++ b/src/os/bluestore/bluefs_types.cc @@ -3,6 +3,7 @@ #include #include "bluefs_types.h" +#include "BlueFS.h" #include "common/Formatter.h" #include "include/denc.h" #include "include/uuid.h" @@ -74,22 +75,26 @@ void bluefs_layout_t::generate_test_instances(list& ls) } // bluefs_super_t +bluefs_super_t::bluefs_super_t() : version(0), block_size(4096) { + bluefs_max_alloc_size.resize(BlueFS::MAX_BDEV, 0); +} void bluefs_super_t::encode(bufferlist& bl) const { - ENCODE_START(2, 1, bl); + ENCODE_START(3, 1, bl); encode(uuid, bl); encode(osd_uuid, bl); encode(version, bl); encode(block_size, bl); encode(log_fnode, bl); encode(memorized_layout, bl); + encode(bluefs_max_alloc_size, bl); ENCODE_FINISH(bl); } void bluefs_super_t::decode(bufferlist::const_iterator& p) { - DECODE_START(2, p); + DECODE_START(3, p); decode(uuid, p); decode(osd_uuid, p); decode(version, p); @@ -98,6 +103,11 @@ void bluefs_super_t::decode(bufferlist::const_iterator& p) if (struct_v >= 2) { decode(memorized_layout, p); } + if (struct_v >= 3) { + decode(bluefs_max_alloc_size, p); + } else { + std::fill(bluefs_max_alloc_size.begin(), bluefs_max_alloc_size.end(), 0); + } DECODE_FINISH(p); } @@ -108,6 +118,8 @@ void bluefs_super_t::dump(Formatter *f) const f->dump_unsigned("version", version); f->dump_unsigned("block_size", block_size); f->dump_object("log_fnode", log_fnode); + for (auto& p : bluefs_max_alloc_size) + f->dump_unsigned("max_alloc_size", p); } void bluefs_super_t::generate_test_instances(list& ls) @@ -125,6 +137,7 @@ ostream& operator<<(ostream& out, const bluefs_super_t& s) << " v " << s.version << " block_size 0x" << std::hex << s.block_size << " log_fnode 0x" << s.log_fnode + << " max_alloc_size " << s.bluefs_max_alloc_size << std::dec << ")"; } diff --git a/src/os/bluestore/bluefs_types.h b/src/os/bluestore/bluefs_types.h index 6516f404e12..627118c12f8 100644 --- a/src/os/bluestore/bluefs_types.h +++ b/src/os/bluestore/bluefs_types.h @@ -219,9 +219,9 @@ struct bluefs_super_t { std::optional memorized_layout; - bluefs_super_t() - : version(0), - block_size(4096) { } + std::vector bluefs_max_alloc_size; + + bluefs_super_t(); uint64_t block_mask() const { return ~((uint64_t)block_size - 1); -- 2.39.5