From: Igor Fedotov Date: Fri, 28 Feb 2025 09:40:33 +0000 (+0300) Subject: os/bluestore: use dev's block size as a minimal BlueFS allocation unit. X-Git-Tag: v19.2.3~166^2~6 X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=47091d1732b98881ca4a75fb2cf096418ea9eb7b;p=ceph.git os/bluestore: use dev's block size as a minimal BlueFS allocation unit. Additionally, this locks the tail of DB/WAL volumes which is unaligned to the configured (not minimal!!) BlueFS allocation unit. Effectively replaces changes from https://github.com/ceph/ceph/pull/57015 Fixes: https://tracker.ceph.com/issues/68772 Signed-off-by: Igor Fedotov (cherry picked from commit effaa686f38b9eff2f7b9c8df2ffaf76c9a49aff) --- diff --git a/src/os/bluestore/BlueFS.cc b/src/os/bluestore/BlueFS.cc index b7cfe352c8872..ebc49cff73526 100644 --- a/src/os/bluestore/BlueFS.cc +++ b/src/os/bluestore/BlueFS.cc @@ -187,9 +187,9 @@ BlueFS::BlueFS(CephContext* cct) : cct(cct), bdev(MAX_BDEV), ioc(MAX_BDEV), - block_reserved(MAX_BDEV), alloc(MAX_BDEV), - alloc_size(MAX_BDEV, 0) + alloc_size(MAX_BDEV, 0), + locked_alloc(MAX_BDEV) { dirty.pending_release.resize(MAX_BDEV); discard_cb[BDEV_WAL] = wal_discard_cb; @@ -482,33 +482,28 @@ void BlueFS::_update_logger_stats() int BlueFS::add_block_device(unsigned id, const string& path, bool trim, bluefs_shared_alloc_context_t* _shared_alloc) { - uint64_t reserved; string dev_name; switch(id) { case BDEV_WAL: case BDEV_NEWWAL: - reserved = BDEV_LABEL_BLOCK_SIZE; dev_name = "wal"; break; case BDEV_DB: case BDEV_NEWDB: - reserved = SUPER_RESERVED; dev_name = "db"; break; case BDEV_SLOW: - reserved = 0; dev_name = "slow"; break; default: ceph_assert(false); } dout(10) << __func__ << " bdev " << id << " path " << path << " " - << " reserved " << reserved << dendl; + << dendl; ceph_assert(id < bdev.size()); ceph_assert(bdev[id] == NULL); BlockDevice *b = BlockDevice::create(cct, path, NULL, NULL, discard_cb[id], static_cast(this), dev_name.c_str()); - block_reserved[id] = 
reserved; if (_shared_alloc) { b->set_no_exclusive_lock(); } @@ -614,6 +609,35 @@ uint64_t BlueFS::get_free(unsigned id) return alloc[id]->get_free(); } +uint64_t BlueFS::_get_minimal_reserved(unsigned id) const +{ + uint64_t reserved = 0; + switch(id) { + case BDEV_WAL: + case BDEV_NEWWAL: + reserved = BDEV_LABEL_BLOCK_SIZE; + break; + case BDEV_DB: + case BDEV_NEWDB: + reserved = SUPER_RESERVED; + break; + case BDEV_SLOW: + reserved = 0; + break; + default: + ceph_assert(false); + } + return reserved; +} + +uint64_t BlueFS::get_full_reserved(unsigned id) +{ + if (!is_shared_alloc(id)) { + return locked_alloc[id].length + _get_minimal_reserved(id); + } + return 0; +} + void BlueFS::dump_perf_counters(Formatter *f) { f->open_object_section("bluefs_perf_counters"); @@ -670,13 +694,13 @@ int BlueFS::mkfs(uuid_d osd_uuid, const bluefs_layout_t& layout) } _init_logger(); - _init_alloc(); super.version = 0; super.block_size = bdev[BDEV_DB]->get_block_size(); super.osd_uuid = osd_uuid; super.uuid.generate_random(); - dout(1) << __func__ << " uuid " << super.uuid << dendl; + + _init_alloc(); // init log FileRef log_file = ceph::make_ref(); @@ -701,6 +725,7 @@ int BlueFS::mkfs(uuid_d osd_uuid, const bluefs_layout_t& layout) super.log_fnode = log_file->fnode; super.memorized_layout = layout; _write_super(BDEV_DB); + dout(1) << __func__ << " super " << super << dendl; _flush_bdev(); // clean up @@ -761,6 +786,8 @@ void BlueFS::_init_alloc() continue; } ceph_assert(bdev[id]->get_size()); + locked_alloc[id] = bluefs_extent_t(); + if (is_shared_alloc(id)) { dout(1) << __func__ << " shared, id " << id << std::hex << ", capacity 0x" << bdev[id]->get_size() @@ -774,21 +801,39 @@ void BlueFS::_init_alloc() name += devnames[id]; else name += to_string(uintptr_t(this)); - string alloc_type = cct->_conf->bluefs_allocator; + auto reserved = _get_minimal_reserved(id); + uint64_t locked_offs = 0; + { + // Try to lock tailing space at device if allocator controlled space + // isn't 
aligned with recommended alloc unit. + // Final decision whether locked tail to be maintained is made after + // BlueFS replay depending on existing allocations. + uint64_t size0 = _get_total(id); + uint64_t size = size0 - reserved; + size = p2align(size, alloc_size[id]) + reserved; + if (size < size0) { + locked_offs = size; + locked_alloc[id] = bluefs_extent_t(id, locked_offs, uint32_t(size0 - size)); + } + } + string alloc_type = cct->_conf->bluefs_allocator; dout(1) << __func__ << " new, id " << id << std::hex << ", allocator name " << name << ", allocator type " << alloc_type << ", capacity 0x" << bdev[id]->get_size() - << ", reserved 0x" << block_reserved[id] - << ", block size 0x" << alloc_size[id] + << ", reserved 0x" << reserved + << ", locked 0x" << locked_alloc[id].offset + << "~" << locked_alloc[id].length + << ", block size 0x" << bdev[id]->get_block_size() + << ", alloc unit 0x" << alloc_size[id] << std::dec << dendl; alloc[id] = Allocator::create(cct, alloc_type, bdev[id]->get_size(), - alloc_size[id], + bdev[id]->get_block_size(), name); - auto reserved = block_reserved[id]; - alloc[id]->init_add_free(reserved, _get_total(id) - reserved); + uint64_t free_len = locked_offs ? locked_offs : _get_total(id) - reserved; + alloc[id]->init_add_free(reserved, free_len); } } } @@ -992,6 +1037,7 @@ int BlueFS::mount() derr << __func__ << " failed to open super: " << cpp_strerror(r) << dendl; goto out; } + dout(5) << __func__ << " super: " << super << dendl; // set volume selector if not provided before/outside if (vselector == nullptr) { @@ -1021,6 +1067,20 @@ int BlueFS::mount() shared_alloc->bluefs_used += q.length; alloc[q.bdev]->init_rm_free(q.offset, q.length); } else if (!is_shared) { + if (locked_alloc[q.bdev].length) { + auto locked_offs = locked_alloc[q.bdev].offset; + if (q.offset + q.length > locked_offs) { + // we already have allocated extents in locked range, + // do not enforce this lock then. 
+ bluefs_extent_t dummy; + std::swap(locked_alloc[q.bdev], dummy); + alloc[q.bdev]->init_add_free(dummy.offset, dummy.length); + dout(1) << __func__ << std::hex + << " unlocked at " << q.bdev + << " 0x" << dummy.offset << "~" << dummy.length + << std::dec << dendl; + } + } alloc[q.bdev]->init_rm_free(q.offset, q.length); } } @@ -1283,9 +1343,10 @@ int BlueFS::_replay(bool noop, bool to_stdout) bool seen_recs = false; boost::dynamic_bitset used_blocks[MAX_BDEV]; + bool check_allocations = cct->_conf->bluefs_log_replay_check_allocations; if (!noop) { - if (cct->_conf->bluefs_log_replay_check_allocations) { + if (check_allocations) { for (size_t i = 0; i < MAX_BDEV; ++i) { if (bdev[i] != nullptr) { // let's use minimal allocation unit we can have @@ -1617,7 +1678,7 @@ int BlueFS::_replay(bool noop, bool to_stdout) } if (!noop) { FileRef f = _get_file(fnode.ino); - if (cct->_conf->bluefs_log_replay_check_allocations) { + if (check_allocations) { int r = _check_allocations(f->fnode, used_blocks, false, "OP_FILE_UPDATE"); if (r < 0) { @@ -1633,7 +1694,7 @@ int BlueFS::_replay(bool noop, bool to_stdout) if (fnode.ino > ino_last) { ino_last = fnode.ino; } - if (cct->_conf->bluefs_log_replay_check_allocations) { + if (check_allocations) { int r = _check_allocations(f->fnode, used_blocks, true, "OP_FILE_UPDATE"); if (r < 0) { @@ -1667,7 +1728,7 @@ int BlueFS::_replay(bool noop, bool to_stdout) // be leanient, if there is no extents just produce error message ceph_assert(delta.offset == fnode.allocated || delta.extents.empty()); } - if (cct->_conf->bluefs_log_replay_check_allocations) { + if (check_allocations) { int r = _check_allocations(fnode, used_blocks, false, "OP_FILE_UPDATE_INC"); if (r < 0) { @@ -1692,7 +1753,7 @@ int BlueFS::_replay(bool noop, bool to_stdout) if (fnode.ino > ino_last) { ino_last = fnode.ino; } - if (cct->_conf->bluefs_log_replay_check_allocations) { + if (check_allocations) { int r = _check_allocations(f->fnode, used_blocks, true, 
"OP_FILE_UPDATE_INC"); if (r < 0) { @@ -1726,7 +1787,7 @@ int BlueFS::_replay(bool noop, bool to_stdout) auto p = nodes.file_map.find(ino); ceph_assert(p != nodes.file_map.end()); vselector->sub_usage(p->second->vselector_hint, p->second->fnode); - if (cct->_conf->bluefs_log_replay_check_allocations) { + if (check_allocations) { int r = _check_allocations(p->second->fnode, used_blocks, false, "OP_FILE_REMOVE"); if (r < 0) { diff --git a/src/os/bluestore/BlueFS.h b/src/os/bluestore/BlueFS.h index c185b25301d14..67a262aeaa11f 100644 --- a/src/os/bluestore/BlueFS.h +++ b/src/os/bluestore/BlueFS.h @@ -521,9 +521,12 @@ private: */ std::vector bdev; ///< block devices we can use std::vector ioc; ///< IOContexts for bdevs - std::vector block_reserved; ///< starting reserve extent per device std::vector alloc; ///< allocators for bdevs std::vector alloc_size; ///< alloc size for each device + std::vector locked_alloc; ///< candidate extents for locked alocations, + ///< no alloc/release reqs matching these space + ///< to be issued to allocator. 
+ //std::vector> block_unused_too_granular; @@ -555,7 +558,7 @@ private: uint64_t _get_used(unsigned id) const; uint64_t _get_total(unsigned id) const; - + uint64_t _get_minimal_reserved(unsigned id) const; FileRef _get_file(uint64_t ino); void _drop_link_DF(FileRef f); @@ -711,6 +714,7 @@ public: uint64_t get_total(unsigned id); uint64_t get_free(unsigned id); uint64_t get_used(unsigned id); + uint64_t get_full_reserved(unsigned id); void dump_perf_counters(ceph::Formatter *f); void dump_block_extents(std::ostream& out); diff --git a/src/os/bluestore/bluefs_types.cc b/src/os/bluestore/bluefs_types.cc index 8a574e9c3d6c2..0ba3a70d2e9ff 100644 --- a/src/os/bluestore/bluefs_types.cc +++ b/src/os/bluestore/bluefs_types.cc @@ -74,6 +74,8 @@ void bluefs_layout_t::generate_test_instances(list& ls) } // bluefs_super_t +bluefs_super_t::bluefs_super_t() : version(0), block_size(4096) { +} void bluefs_super_t::encode(bufferlist& bl) const { diff --git a/src/os/bluestore/bluefs_types.h b/src/os/bluestore/bluefs_types.h index 99ce1c3c1461e..2d293d2a9ee4e 100644 --- a/src/os/bluestore/bluefs_types.h +++ b/src/os/bluestore/bluefs_types.h @@ -220,9 +220,7 @@ struct bluefs_super_t { std::optional memorized_layout; - bluefs_super_t() - : version(0), - block_size(4096) { } + bluefs_super_t(); uint64_t block_mask() const { return ~((uint64_t)block_size - 1);