From bcb95ee0f5522cc2c5f2d423210b1eb4e0f34bc5 Mon Sep 17 00:00:00 2001 From: Igor Fedotov Date: Thu, 19 Oct 2023 14:57:23 +0300 Subject: [PATCH] os/bluestore: rework vselector calls We can provide fnode delta to vselector now, which is a bit more efficient. Signed-off-by: Igor Fedotov (cherry picked from commit 3e04ac48779f62394ebd9587298823bf6b20a570) Conflicts: src/os/bluestore/BlueFS.cc - Unsplit _maybe_extend_log method, which prevented smooth cherry-picking --- src/os/bluestore/BlueFS.cc | 67 +++++++++++++------------ src/os/bluestore/BlueFS.h | 31 +++++++++--- src/os/bluestore/BlueStore.cc | 10 +++- src/os/bluestore/BlueStore.h | 93 ++++++++++++++++------------------- 4 files changed, 109 insertions(+), 92 deletions(-) diff --git a/src/os/bluestore/BlueFS.cc b/src/os/bluestore/BlueFS.cc index 710021f0787..1f1ced92b4a 100644 --- a/src/os/bluestore/BlueFS.cc +++ b/src/os/bluestore/BlueFS.cc @@ -1549,11 +1549,9 @@ int BlueFS::_replay(bool noop, bool to_stdout) } if (fnode.ino != 1) { vselector->sub_usage(f->vselector_hint, f->fnode); - } + vselector->add_usage(f->vselector_hint, fnode); + } f->fnode = fnode; - if (fnode.ino != 1) { - vselector->add_usage(f->vselector_hint, f->fnode); - } if (fnode.ino > ino_last) { ino_last = fnode.ino; @@ -1795,7 +1793,7 @@ int BlueFS::device_migrate_to_existing( // write entire file auto l = _allocate(dev_target, bl.length(), 0, - &file_ref->fnode, 0, false); + &file_ref->fnode, nullptr, 0, false); if (l < 0) { derr << __func__ << " unable to allocate len 0x" << std::hex << bl.length() << std::dec << " from " << (int)dev_target @@ -1935,7 +1933,7 @@ int BlueFS::device_migrate_to_new( // write entire file auto l = _allocate(dev_target, bl.length(), 0, - &file_ref->fnode, 0, false); + &file_ref->fnode, nullptr, 0, false); if (l < 0) { derr << __func__ << " unable to allocate len 0x" << std::hex << bl.length() << std::dec << " from " << (int)dev_target @@ -2525,7 +2523,6 @@ void BlueFS::_rewrite_log_and_layout_sync_LNF_LD(bool 
permit_dev_fallback, auto t0 = mono_clock::now(); File *log_file = log.writer->file.get(); - bluefs_fnode_t fnode_tail; // log.t.seq is always set to current live seq ceph_assert(log.t.seq == log.seq_live); // Capturing entire state. Dump anything that has been stored there. @@ -2580,7 +2577,8 @@ void BlueFS::_rewrite_log_and_layout_sync_LNF_LD(bool permit_dev_fallback, dout(20) << __func__ << " compacted_meta_need " << compacted_meta_need << dendl; - int r = _allocate(log_dev, compacted_meta_need, 0, &fnode_tail, 0, + bluefs_fnode_t fnode_tail; + int r = _allocate(log_dev, compacted_meta_need, 0, &fnode_tail, nullptr, 0, permit_dev_fallback); ceph_assert(r == 0); @@ -2591,7 +2589,7 @@ void BlueFS::_rewrite_log_and_layout_sync_LNF_LD(bool permit_dev_fallback, uint64_t starter_need = _make_initial_transaction(starter_seq, fnode_tail, 0, nullptr); bluefs_fnode_t fnode_starter(log_file->fnode.ino, 0, mtime); - r = _allocate(log_dev, starter_need, 0, &fnode_starter, 0, + r = _allocate(log_dev, starter_need, 0, &fnode_starter, nullptr, 0, permit_dev_fallback); ceph_assert(r == 0); @@ -3092,14 +3090,15 @@ int64_t BlueFS::_maybe_extend_log() if (log_forbidden_to_expand.load() == true) { return -EWOULDBLOCK; } - vselector->sub_usage(log.writer->file->vselector_hint, log.writer->file->fnode); int r = _allocate( vselector->select_prefer_bdev(log.writer->file->vselector_hint), cct->_conf->bluefs_max_log_runway, 0, - &log.writer->file->fnode); + &log.writer->file->fnode, + [&](const bluefs_extent_t& e) { + vselector->add_usage(log.writer->file->vselector_hint, e); + }); ceph_assert(r == 0); - vselector->add_usage(log.writer->file->vselector_hint, log.writer->file->fnode); log.t.op_file_update_inc(log.writer->file->fnode); } return runway; @@ -3365,7 +3364,9 @@ int BlueFS::_flush_range_F(FileWriter *h, uint64_t offset, uint64_t length) dout(10) << __func__ << " " << h << " pos 0x" << std::hex << h->pos << " 0x" << offset << "~" << length << std::dec - << " to " << 
h->file->fnode << dendl; + << " to " << h->file->fnode + << " hint " << h->file->vselector_hint + << dendl; if (h->file->deleted) { dout(10) << __func__ << " deleted, no-op" << dendl; return 0; @@ -3386,7 +3387,6 @@ int BlueFS::_flush_range_F(FileWriter *h, uint64_t offset, uint64_t length) ceph_assert(offset <= h->file->fnode.size); uint64_t allocated = h->file->fnode.get_allocated(); - vselector->sub_usage(h->file->vselector_hint, h->file->fnode); // do not bother to dirty the file if we are overwriting // previously allocated extents. if (allocated < offset + length) { @@ -3395,25 +3395,26 @@ int BlueFS::_flush_range_F(FileWriter *h, uint64_t offset, uint64_t length) int r = _allocate(vselector->select_prefer_bdev(h->file->vselector_hint), offset + length - allocated, 0, - &h->file->fnode); + &h->file->fnode, + [&](const bluefs_extent_t& e) { + vselector->add_usage(h->file->vselector_hint, e); + }); if (r < 0) { derr << __func__ << " allocated: 0x" << std::hex << allocated << " offset: 0x" << offset << " length: 0x" << length << std::dec << dendl; - vselector->add_usage(h->file->vselector_hint, h->file->fnode); // undo ceph_abort_msg("bluefs enospc"); return r; } h->file->is_dirty = true; } if (h->file->fnode.size < offset + length) { + vselector->add_usage(h->file->vselector_hint, offset + length - h->file->fnode.size); h->file->fnode.size = offset + length; h->file->is_dirty = true; } - dout(20) << __func__ << " file now, unflushed " << h->file->fnode << dendl; int res = _flush_data(h, offset, length, buffered); - vselector->add_usage(h->file->vselector_hint, h->file->fnode); return res; } @@ -3668,10 +3669,9 @@ int BlueFS::truncate(FileWriter *h, uint64_t offset)/*_WF_L*/ _flush_bdev(h); std::lock_guard ll(log.lock); - vselector->sub_usage(h->file->vselector_hint, h->file->fnode.size); + vselector->sub_usage(h->file->vselector_hint, h->file->fnode.size - offset); h->file->fnode.size = offset; h->file->is_dirty = true; - 
vselector->add_usage(h->file->vselector_hint, h->file->fnode.size); log.t.op_file_update_inc(h->file->fnode); return 0; } @@ -3765,6 +3765,7 @@ const char* BlueFS::get_device_name(unsigned id) int BlueFS::_allocate(uint8_t id, uint64_t len, uint64_t alloc_unit, bluefs_fnode_t* node, + update_fn_t cb, size_t alloc_attempts, bool permit_dev_fallback) { @@ -3848,6 +3849,7 @@ int BlueFS::_allocate(uint8_t id, uint64_t len, len, alloc_unit, node, + cb, alloc_attempts, permit_dev_fallback); } else if (permit_dev_fallback && id != BDEV_SLOW && alloc[id + 1]) { @@ -3861,6 +3863,7 @@ int BlueFS::_allocate(uint8_t id, uint64_t len, len, 0, // back to default alloc unit node, + cb, alloc_attempts, permit_dev_fallback); } else { @@ -3880,9 +3883,12 @@ int BlueFS::_allocate(uint8_t id, uint64_t len, } for (auto& p : extents) { - node->append_extent(bluefs_extent_t(id, p.offset, p.length)); + bluefs_extent_t e(id, p.offset, p.length); + node->append_extent(e); + if (cb) { + cb(e); + } } - return 0; } @@ -3901,12 +3907,13 @@ int BlueFS::preallocate(FileRef f, uint64_t off, uint64_t len)/*_LF*/ if (off + len > allocated) { uint64_t want = off + len - allocated; - vselector->sub_usage(f->vselector_hint, f->fnode); int r = _allocate(vselector->select_prefer_bdev(f->vselector_hint), want, 0, - &f->fnode); - vselector->add_usage(f->vselector_hint, f->fnode); + &f->fnode, + [&](const bluefs_extent_t& e) { + vselector->add_usage(f->vselector_hint, e); + }); if (r < 0) return r; @@ -3963,7 +3970,6 @@ int BlueFS::open_for_write( _maybe_check_vselector_LNF(); FileRef file; bool create = false; - bool truncate = false; mempool::bluefs::vector pending_release_extents; { std::lock_guard ll(log.lock); @@ -3990,10 +3996,12 @@ int BlueFS::open_for_write( } file = ceph::make_ref(); file->fnode.ino = ++ino_last; + file->vselector_hint = vselector->get_hint_by_dir(dirname); nodes.file_map[ino_last] = file; dir->file_map[string{filename}] = file; ++file->refs; create = true; + 
vselector->add_usage(file->vselector_hint, file->fnode.size, true); // update file count logger->set(l_bluefs_num_files, nodes.file_map.size()); } else { // overwrite existing file? @@ -4008,8 +4016,8 @@ int BlueFS::open_for_write( << " already exists, truncate + overwrite" << dendl; vselector->sub_usage(file->vselector_hint, file->fnode); file->fnode.size = 0; + vselector->add_usage(file->vselector_hint, file->fnode.size, true); // restore file count pending_release_extents.swap(file->fnode.extents); - truncate = true; file->fnode.clear_extents(); } @@ -4017,11 +4025,6 @@ int BlueFS::open_for_write( ceph_assert(file->fnode.ino > 1); file->fnode.mtime = ceph_clock_now(); - file->vselector_hint = vselector->get_hint_by_dir(dirname); - if (create || truncate) { - vselector->add_usage(file->vselector_hint, file->fnode); // update file count - } - dout(20) << __func__ << " mapping " << dirname << "/" << filename << " vsel_hint " << file->vselector_hint << dendl; diff --git a/src/os/bluestore/BlueFS.h b/src/os/bluestore/BlueFS.h index adfc8eb0a23..5926c0c81ec 100644 --- a/src/os/bluestore/BlueFS.h +++ b/src/os/bluestore/BlueFS.h @@ -85,10 +85,22 @@ public: virtual void* get_hint_for_log() const = 0; virtual void* get_hint_by_dir(std::string_view dirname) const = 0; - virtual void add_usage(void* file_hint, const bluefs_fnode_t& fnode) = 0; - virtual void sub_usage(void* file_hint, const bluefs_fnode_t& fnode) = 0; - virtual void add_usage(void* file_hint, uint64_t fsize) = 0; - virtual void sub_usage(void* file_hint, uint64_t fsize) = 0; + void add_usage(void* hint, const bluefs_fnode_t& fnode) { + for (auto& e : fnode.extents) { + add_usage(hint, e); + } + add_usage(hint, fnode.size, true); + } + void sub_usage(void* hint, const bluefs_fnode_t& fnode) { + for (auto& e : fnode.extents) { + sub_usage(hint, e); + } + sub_usage(hint, fnode.size, true); + } + virtual void add_usage(void* hint, const bluefs_extent_t& extent) = 0; + virtual void sub_usage(void* hint, const 
bluefs_extent_t& extent) = 0; + virtual void add_usage(void* hint, uint64_t fsize, bool upd_files = false) = 0; + virtual void sub_usage(void* hint, uint64_t fsize, bool upd_files = false) = 0; virtual uint8_t select_prefer_bdev(void* hint) = 0; virtual void get_paths(const std::string& base, paths& res) const = 0; virtual void dump(std::ostream& sout) = 0; @@ -433,9 +445,12 @@ private: return bdev[BDEV_SLOW] ? BDEV_SLOW : BDEV_DB; } const char* get_device_name(unsigned id); + + typedef std::function update_fn_t; int _allocate(uint8_t bdev, uint64_t len, uint64_t alloc_unit, bluefs_fnode_t* node, + update_fn_t cb = nullptr, size_t alloc_attempts = 0, bool permit_dev_fallback = true); @@ -713,19 +728,19 @@ public: void* get_hint_for_log() const override; void* get_hint_by_dir(std::string_view dirname) const override; - void add_usage(void* hint, const bluefs_fnode_t& fnode) override { + void add_usage(void* hint, const bluefs_extent_t& extent) override { // do nothing return; } - void sub_usage(void* hint, const bluefs_fnode_t& fnode) override { + void sub_usage(void* hint, const bluefs_extent_t& extent) override { // do nothing return; } - void add_usage(void* hint, uint64_t fsize) override { + void add_usage(void*, uint64_t, bool) override { // do nothing return; } - void sub_usage(void* hint, uint64_t fsize) override { + void sub_usage(void*, uint64_t, bool) override { // do nothing return; } diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc index aa14d0204f7..8ca72d2aec9 100644 --- a/src/os/bluestore/BlueStore.cc +++ b/src/os/bluestore/BlueStore.cc @@ -6311,7 +6311,7 @@ int BlueStore::_open_bluefs(bool create, bool read_only) bluefs->get_block_device_size(BlueFS::BDEV_WAL) * 95 / 100, bluefs->get_block_device_size(BlueFS::BDEV_DB) * 95 / 100, bluefs->get_block_device_size(BlueFS::BDEV_SLOW) * 95 / 100, - 1024 * 1024 * 1024, //FIXME: set expected l0 size here + rocks_opts.write_buffer_size * rocks_opts.max_write_buffer_number, 
rocks_opts.max_bytes_for_level_base, rocks_opts.max_bytes_for_level_multiplier, reserved_factor, @@ -18102,7 +18102,13 @@ void RocksDBBlueFSVolumeSelector::dump(ostream& sout) { auto max_x = per_level_per_dev_usage.get_max_x(); auto max_y = per_level_per_dev_usage.get_max_y(); - sout << "RocksDBBlueFSVolumeSelector Usage Matrix:" << std::endl; + sout << "RocksDBBlueFSVolumeSelector " << std::endl; + sout << ">>Settings<<" + << " extra=" << byte_u_t(db_avail4slow) + << ", l0_size=" << byte_u_t(level0_size) + << ", l_base=" << byte_u_t(level_base) + << ", l_multi=" << byte_u_t(level_multiplier) + << std::endl; constexpr std::array names{ { "DEV/LEV", "WAL", diff --git a/src/os/bluestore/BlueStore.h b/src/os/bluestore/BlueStore.h index c3d014dc938..2b8268ea8a9 100644 --- a/src/os/bluestore/BlueStore.h +++ b/src/os/bluestore/BlueStore.h @@ -4142,6 +4142,9 @@ class RocksDBBlueFSVolumeSelector : public BlueFSVolumeSelector uint64_t l_totals[LEVEL_MAX - LEVEL_FIRST]; uint64_t db_avail4slow = 0; + uint64_t level0_size = 0; + uint64_t level_base = 0; + uint64_t level_multiplier = 0; enum { OLD_POLICY, USE_SOME_EXTRA @@ -4167,21 +4170,24 @@ public: if (!new_pol) { return; } - // Calculating how much extra space is available at DB volume. 
// Depending on the presence of explicit reserved size specification it might be either // * DB volume size - reserved // or // * DB volume size - sum_max_level_size(0, L-1) - max_level_size(L) * reserved_factor if (!reserved) { + level0_size = _level0_size; + level_base = _level_base; + level_multiplier = _level_multiplier; uint64_t prev_levels = _level0_size; uint64_t cur_level = _level_base; - uint64_t cur_threshold = 0; + uint64_t cur_threshold = prev_levels + cur_level; do { - uint64_t next_level = cur_level * _level_multiplier; - uint64_t next_threshold = prev_levels + cur_level + next_level * reserved_factor; + uint64_t next_level = cur_level * _level_multiplier; + uint64_t next_threshold = prev_levels + cur_level + next_level; if (_db_total <= next_threshold) { - db_avail4slow = cur_threshold ? _db_total - cur_threshold : 0; + cur_threshold *= reserved_factor; + db_avail4slow = cur_threshold < _db_total ? _db_total - cur_threshold : 0; break; } else { prev_levels += cur_level; @@ -4190,7 +4196,7 @@ public: } } while (true); } else { - db_avail4slow = _db_total - reserved; + db_avail4slow = reserved < _db_total ? 
_db_total - reserved : 0; } } @@ -4199,63 +4205,40 @@ public: } void* get_hint_by_dir(std::string_view dirname) const override; - void add_usage(void* hint, const bluefs_fnode_t& fnode) override { + void add_usage(void* hint, const bluefs_extent_t& extent) override { if (hint == nullptr) return; size_t pos = (size_t)hint - LEVEL_FIRST; - for (auto& p : fnode.extents) { - auto& cur = per_level_per_dev_usage.at(p.bdev, pos); - auto& max = per_level_per_dev_max.at(p.bdev, pos); - uint64_t v = cur.fetch_add(p.length) + p.length; - while (v > max) { - max.exchange(v); - } - { - //update per-device totals - auto& cur = per_level_per_dev_usage.at(p.bdev, LEVEL_MAX - LEVEL_FIRST); - auto& max = per_level_per_dev_max.at(p.bdev, LEVEL_MAX - LEVEL_FIRST); - uint64_t v = cur.fetch_add(p.length) + p.length; - while (v > max) { - max.exchange(v); - } - } + auto& cur = per_level_per_dev_usage.at(extent.bdev, pos); + auto& max = per_level_per_dev_max.at(extent.bdev, pos); + uint64_t v = cur.fetch_add(extent.length) + extent.length; + while (v > max) { + max.exchange(v); } { - //update per-level actual totals - auto& cur = per_level_per_dev_usage.at(BlueFS::MAX_BDEV, pos); - auto& max = per_level_per_dev_max.at(BlueFS::MAX_BDEV, pos); - uint64_t v = cur.fetch_add(fnode.size) + fnode.size; + //update per-device totals + auto& cur = per_level_per_dev_usage.at(extent.bdev, LEVEL_MAX - LEVEL_FIRST); + auto& max = per_level_per_dev_max.at(extent.bdev, LEVEL_MAX - LEVEL_FIRST); + uint64_t v = cur.fetch_add(extent.length) + extent.length; while (v > max) { max.exchange(v); } } - ++per_level_files[pos]; - ++per_level_files[LEVEL_MAX - LEVEL_FIRST]; } - void sub_usage(void* hint, const bluefs_fnode_t& fnode) override { + void sub_usage(void* hint, const bluefs_extent_t& extent) override { if (hint == nullptr) return; size_t pos = (size_t)hint - LEVEL_FIRST; - for (auto& p : fnode.extents) { - auto& cur = per_level_per_dev_usage.at(p.bdev, pos); - ceph_assert(cur >= p.length); - cur -= 
p.length; - - //update per-device totals - auto& cur2 = per_level_per_dev_usage.at(p.bdev, LEVEL_MAX - LEVEL_FIRST); - ceph_assert(cur2 >= p.length); - cur2 -= p.length; - } - //update per-level actual totals - auto& cur = per_level_per_dev_usage.at(BlueFS::MAX_BDEV, pos); - ceph_assert(cur >= fnode.size); - cur -= fnode.size; - ceph_assert(per_level_files[pos] > 0); - --per_level_files[pos]; - ceph_assert(per_level_files[LEVEL_MAX - LEVEL_FIRST] > 0); - --per_level_files[LEVEL_MAX - LEVEL_FIRST]; + auto& cur = per_level_per_dev_usage.at(extent.bdev, pos); + ceph_assert(cur >= extent.length); + cur -= extent.length; + + //update per-device totals + auto& cur2 = per_level_per_dev_usage.at(extent.bdev, LEVEL_MAX - LEVEL_FIRST); + ceph_assert(cur2 >= extent.length); + cur2 -= extent.length; } - void add_usage(void* hint, uint64_t size_more) override { + void add_usage(void* hint, uint64_t size_more, bool upd_files) override { if (hint == nullptr) return; size_t pos = (size_t)hint - LEVEL_FIRST; @@ -4266,8 +4249,12 @@ public: while (v > max) { max.exchange(v); } + if (upd_files) { + ++per_level_files[pos]; + ++per_level_files[LEVEL_MAX - LEVEL_FIRST]; + } } - void sub_usage(void* hint, uint64_t size_less) override { + void sub_usage(void* hint, uint64_t size_less, bool upd_files) override { if (hint == nullptr) return; size_t pos = (size_t)hint - LEVEL_FIRST; @@ -4275,6 +4262,12 @@ public: auto& cur = per_level_per_dev_usage.at(BlueFS::MAX_BDEV, pos); ceph_assert(cur >= size_less); cur -= size_less; + if (upd_files) { + ceph_assert(per_level_files[pos] > 0); + --per_level_files[pos]; + ceph_assert(per_level_files[LEVEL_MAX - LEVEL_FIRST] > 0); + --per_level_files[LEVEL_MAX - LEVEL_FIRST]; + } } uint8_t select_prefer_bdev(void* h) override; -- 2.47.3