From: Igor Fedotov Date: Thu, 19 Oct 2023 11:57:23 +0000 (+0300) Subject: os/bluestore: rework vselector calls X-Git-Tag: v17.2.8~39^2~2 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=1a5653001d94a1d76d4d5a895c02e6ac9201181b;p=ceph.git os/bluestore: rework vselector calls We can now provide the fnode delta to vselector, which is a bit more efficient. Signed-off-by: Igor Fedotov (cherry picked from commit 3e04ac48779f62394ebd9587298823bf6b20a570) Conflicts: src/os/bluestore/BlueFS.cc * trivial --- diff --git a/src/os/bluestore/BlueFS.cc b/src/os/bluestore/BlueFS.cc index 4c6a2042ab88..1c14cd24d1d2 100644 --- a/src/os/bluestore/BlueFS.cc +++ b/src/os/bluestore/BlueFS.cc @@ -1545,11 +1545,9 @@ int BlueFS::_replay(bool noop, bool to_stdout) } if (fnode.ino != 1) { vselector->sub_usage(f->vselector_hint, f->fnode); - } + vselector->add_usage(f->vselector_hint, fnode); + } f->fnode = fnode; - if (fnode.ino != 1) { - vselector->add_usage(f->vselector_hint, f->fnode); - } if (fnode.ino > ino_last) { ino_last = fnode.ino; @@ -1791,7 +1789,7 @@ int BlueFS::device_migrate_to_existing( // write entire file auto l = _allocate(dev_target, bl.length(), 0, - &file_ref->fnode, 0, false); + &file_ref->fnode, nullptr, 0, false); if (l < 0) { derr << __func__ << " unable to allocate len 0x" << std::hex << bl.length() << std::dec << " from " << (int)dev_target @@ -1931,7 +1929,7 @@ int BlueFS::device_migrate_to_new( // write entire file auto l = _allocate(dev_target, bl.length(), 0, - &file_ref->fnode, 0, false); + &file_ref->fnode, nullptr, 0, false); if (l < 0) { derr << __func__ << " unable to allocate len 0x" << std::hex << bl.length() << std::dec << " from " << (int)dev_target @@ -2521,7 +2519,6 @@ void BlueFS::_rewrite_log_and_layout_sync_LNF_LD(bool permit_dev_fallback, auto t0 = mono_clock::now(); File *log_file = log.writer->file.get(); - bluefs_fnode_t fnode_tail; // log.t.seq is always set to current live seq ceph_assert(log.t.seq == 
log.seq_live); // Capturing entire state. Dump anything that has been stored there. @@ -2576,7 +2573,8 @@ void BlueFS::_rewrite_log_and_layout_sync_LNF_LD(bool permit_dev_fallback, dout(20) << __func__ << " compacted_meta_need " << compacted_meta_need << dendl; - int r = _allocate(log_dev, compacted_meta_need, 0, &fnode_tail, 0, + bluefs_fnode_t fnode_tail; + int r = _allocate(log_dev, compacted_meta_need, 0, &fnode_tail, nullptr, 0, permit_dev_fallback); ceph_assert(r == 0); @@ -2587,7 +2585,7 @@ void BlueFS::_rewrite_log_and_layout_sync_LNF_LD(bool permit_dev_fallback, uint64_t starter_need = _make_initial_transaction(starter_seq, fnode_tail, 0, nullptr); bluefs_fnode_t fnode_starter(log_file->fnode.ino, 0, mtime); - r = _allocate(log_dev, starter_need, 0, &fnode_starter, 0, + r = _allocate(log_dev, starter_need, 0, &fnode_starter, nullptr, 0, permit_dev_fallback); ceph_assert(r == 0); @@ -3087,16 +3085,17 @@ void BlueFS::_extend_log(uint64_t amount) { } ll.release(); uint64_t allocated_before_extension = log.writer->file->fnode.get_allocated(); - vselector->sub_usage(log.writer->file->vselector_hint, log.writer->file->fnode); amount = round_up_to(amount, super.block_size); int r = _allocate( vselector->select_prefer_bdev(log.writer->file->vselector_hint), amount, 0, - &log.writer->file->fnode); + &log.writer->file->fnode, + [&](const bluefs_extent_t& e) { + vselector->add_usage(log.writer->file->vselector_hint, e); + }); ceph_assert(r == 0); dout(10) << "extended log by 0x" << std::hex << amount << " bytes " << dendl; - vselector->add_usage(log.writer->file->vselector_hint, log.writer->file->fnode); bluefs_transaction_t log_extend_transaction; log_extend_transaction.seq = log.t.seq; @@ -3369,7 +3368,9 @@ int BlueFS::_flush_range_F(FileWriter *h, uint64_t offset, uint64_t length) dout(10) << __func__ << " " << h << " pos 0x" << std::hex << h->pos << " 0x" << offset << "~" << length << std::dec - << " to " << h->file->fnode << dendl; + << " to " << h->file->fnode 
+ << " hint " << h->file->vselector_hint + << dendl; if (h->file->deleted) { dout(10) << __func__ << " deleted, no-op" << dendl; return 0; @@ -3390,7 +3391,6 @@ int BlueFS::_flush_range_F(FileWriter *h, uint64_t offset, uint64_t length) ceph_assert(offset <= h->file->fnode.size); uint64_t allocated = h->file->fnode.get_allocated(); - vselector->sub_usage(h->file->vselector_hint, h->file->fnode); // do not bother to dirty the file if we are overwriting // previously allocated extents. if (allocated < offset + length) { @@ -3399,25 +3399,26 @@ int BlueFS::_flush_range_F(FileWriter *h, uint64_t offset, uint64_t length) int r = _allocate(vselector->select_prefer_bdev(h->file->vselector_hint), offset + length - allocated, 0, - &h->file->fnode); + &h->file->fnode, + [&](const bluefs_extent_t& e) { + vselector->add_usage(h->file->vselector_hint, e); + }); if (r < 0) { derr << __func__ << " allocated: 0x" << std::hex << allocated << " offset: 0x" << offset << " length: 0x" << length << std::dec << dendl; - vselector->add_usage(h->file->vselector_hint, h->file->fnode); // undo ceph_abort_msg("bluefs enospc"); return r; } h->file->is_dirty = true; } if (h->file->fnode.size < offset + length) { + vselector->add_usage(h->file->vselector_hint, offset + length - h->file->fnode.size); h->file->fnode.size = offset + length; h->file->is_dirty = true; } - dout(20) << __func__ << " file now, unflushed " << h->file->fnode << dendl; int res = _flush_data(h, offset, length, buffered); - vselector->add_usage(h->file->vselector_hint, h->file->fnode); return res; } @@ -3672,10 +3673,9 @@ int BlueFS::truncate(FileWriter *h, uint64_t offset)/*_WF_L*/ _flush_bdev(h); std::lock_guard ll(log.lock); - vselector->sub_usage(h->file->vselector_hint, h->file->fnode.size); + vselector->sub_usage(h->file->vselector_hint, h->file->fnode.size - offset); h->file->fnode.size = offset; h->file->is_dirty = true; - vselector->add_usage(h->file->vselector_hint, h->file->fnode.size); 
log.t.op_file_update_inc(h->file->fnode); return 0; } @@ -3769,6 +3769,7 @@ const char* BlueFS::get_device_name(unsigned id) int BlueFS::_allocate(uint8_t id, uint64_t len, uint64_t alloc_unit, bluefs_fnode_t* node, + update_fn_t cb, size_t alloc_attempts, bool permit_dev_fallback) { @@ -3852,6 +3853,7 @@ int BlueFS::_allocate(uint8_t id, uint64_t len, len, alloc_unit, node, + cb, alloc_attempts, permit_dev_fallback); } else if (permit_dev_fallback && id != BDEV_SLOW && alloc[id + 1]) { @@ -3865,6 +3867,7 @@ int BlueFS::_allocate(uint8_t id, uint64_t len, len, 0, // back to default alloc unit node, + cb, alloc_attempts, permit_dev_fallback); } else { @@ -3884,9 +3887,12 @@ int BlueFS::_allocate(uint8_t id, uint64_t len, } for (auto& p : extents) { - node->append_extent(bluefs_extent_t(id, p.offset, p.length)); + bluefs_extent_t e(id, p.offset, p.length); + node->append_extent(e); + if (cb) { + cb(e); + } } - return 0; } @@ -3905,12 +3911,13 @@ int BlueFS::preallocate(FileRef f, uint64_t off, uint64_t len)/*_LF*/ if (off + len > allocated) { uint64_t want = off + len - allocated; - vselector->sub_usage(f->vselector_hint, f->fnode); int r = _allocate(vselector->select_prefer_bdev(f->vselector_hint), want, 0, - &f->fnode); - vselector->add_usage(f->vselector_hint, f->fnode); + &f->fnode, + [&](const bluefs_extent_t& e) { + vselector->add_usage(f->vselector_hint, e); + }); if (r < 0) return r; @@ -3967,7 +3974,6 @@ int BlueFS::open_for_write( _maybe_check_vselector_LNF(); FileRef file; bool create = false; - bool truncate = false; mempool::bluefs::vector pending_release_extents; { std::lock_guard ll(log.lock); @@ -3994,10 +4000,12 @@ int BlueFS::open_for_write( } file = ceph::make_ref(); file->fnode.ino = ++ino_last; + file->vselector_hint = vselector->get_hint_by_dir(dirname); nodes.file_map[ino_last] = file; dir->file_map[string{filename}] = file; ++file->refs; create = true; + vselector->add_usage(file->vselector_hint, file->fnode.size, true); // update file count 
logger->set(l_bluefs_num_files, nodes.file_map.size()); } else { // overwrite existing file? @@ -4012,8 +4020,8 @@ int BlueFS::open_for_write( << " already exists, truncate + overwrite" << dendl; vselector->sub_usage(file->vselector_hint, file->fnode); file->fnode.size = 0; + vselector->add_usage(file->vselector_hint, file->fnode.size, true); // restore file count pending_release_extents.swap(file->fnode.extents); - truncate = true; file->fnode.clear_extents(); } @@ -4021,11 +4029,6 @@ int BlueFS::open_for_write( ceph_assert(file->fnode.ino > 1); file->fnode.mtime = ceph_clock_now(); - file->vselector_hint = vselector->get_hint_by_dir(dirname); - if (create || truncate) { - vselector->add_usage(file->vselector_hint, file->fnode); // update file count - } - dout(20) << __func__ << " mapping " << dirname << "/" << filename << " vsel_hint " << file->vselector_hint << dendl; diff --git a/src/os/bluestore/BlueFS.h b/src/os/bluestore/BlueFS.h index 615ea49143e9..aa43db8946f4 100644 --- a/src/os/bluestore/BlueFS.h +++ b/src/os/bluestore/BlueFS.h @@ -85,10 +85,22 @@ public: virtual void* get_hint_for_log() const = 0; virtual void* get_hint_by_dir(std::string_view dirname) const = 0; - virtual void add_usage(void* file_hint, const bluefs_fnode_t& fnode) = 0; - virtual void sub_usage(void* file_hint, const bluefs_fnode_t& fnode) = 0; - virtual void add_usage(void* file_hint, uint64_t fsize) = 0; - virtual void sub_usage(void* file_hint, uint64_t fsize) = 0; + void add_usage(void* hint, const bluefs_fnode_t& fnode) { + for (auto& e : fnode.extents) { + add_usage(hint, e); + } + add_usage(hint, fnode.size, true); + } + void sub_usage(void* hint, const bluefs_fnode_t& fnode) { + for (auto& e : fnode.extents) { + sub_usage(hint, e); + } + sub_usage(hint, fnode.size, true); + } + virtual void add_usage(void* hint, const bluefs_extent_t& extent) = 0; + virtual void sub_usage(void* hint, const bluefs_extent_t& extent) = 0; + virtual void add_usage(void* hint, uint64_t fsize, bool 
upd_files = false) = 0; + virtual void sub_usage(void* hint, uint64_t fsize, bool upd_files = false) = 0; virtual uint8_t select_prefer_bdev(void* hint) = 0; virtual void get_paths(const std::string& base, paths& res) const = 0; virtual void dump(std::ostream& sout) = 0; @@ -433,9 +445,12 @@ private: return bdev[BDEV_SLOW] ? BDEV_SLOW : BDEV_DB; } const char* get_device_name(unsigned id); + + typedef std::function update_fn_t; int _allocate(uint8_t bdev, uint64_t len, uint64_t alloc_unit, bluefs_fnode_t* node, + update_fn_t cb = nullptr, size_t alloc_attempts = 0, bool permit_dev_fallback = true); @@ -710,19 +725,19 @@ public: void* get_hint_for_log() const override; void* get_hint_by_dir(std::string_view dirname) const override; - void add_usage(void* hint, const bluefs_fnode_t& fnode) override { + void add_usage(void* hint, const bluefs_extent_t& extent) override { // do nothing return; } - void sub_usage(void* hint, const bluefs_fnode_t& fnode) override { + void sub_usage(void* hint, const bluefs_extent_t& extent) override { // do nothing return; } - void add_usage(void* hint, uint64_t fsize) override { + void add_usage(void*, uint64_t, bool) override { // do nothing return; } - void sub_usage(void* hint, uint64_t fsize) override { + void sub_usage(void*, uint64_t, bool) override { // do nothing return; } diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc index 1eba5dcdd988..aa7c7eb5c1d7 100644 --- a/src/os/bluestore/BlueStore.cc +++ b/src/os/bluestore/BlueStore.cc @@ -6299,7 +6299,7 @@ int BlueStore::_open_bluefs(bool create, bool read_only) bluefs->get_block_device_size(BlueFS::BDEV_WAL) * 95 / 100, bluefs->get_block_device_size(BlueFS::BDEV_DB) * 95 / 100, bluefs->get_block_device_size(BlueFS::BDEV_SLOW) * 95 / 100, - 1024 * 1024 * 1024, //FIXME: set expected l0 size here + rocks_opts.write_buffer_size * rocks_opts.max_write_buffer_number, rocks_opts.max_bytes_for_level_base, rocks_opts.max_bytes_for_level_multiplier, reserved_factor, 
@@ -18126,7 +18126,13 @@ void RocksDBBlueFSVolumeSelector::dump(ostream& sout) { auto max_x = per_level_per_dev_usage.get_max_x(); auto max_y = per_level_per_dev_usage.get_max_y(); - sout << "RocksDBBlueFSVolumeSelector Usage Matrix:" << std::endl; + sout << "RocksDBBlueFSVolumeSelector " << std::endl; + sout << ">>Settings<<" + << " extra=" << byte_u_t(db_avail4slow) + << ", l0_size=" << byte_u_t(level0_size) + << ", l_base=" << byte_u_t(level_base) + << ", l_multi=" << byte_u_t(level_multiplier) + << std::endl; constexpr std::array names{ { "DEV/LEV", "WAL", diff --git a/src/os/bluestore/BlueStore.h b/src/os/bluestore/BlueStore.h index 6613e301756f..cf60ece0ca99 100644 --- a/src/os/bluestore/BlueStore.h +++ b/src/os/bluestore/BlueStore.h @@ -4140,6 +4140,9 @@ class RocksDBBlueFSVolumeSelector : public BlueFSVolumeSelector uint64_t l_totals[LEVEL_MAX - LEVEL_FIRST]; uint64_t db_avail4slow = 0; + uint64_t level0_size = 0; + uint64_t level_base = 0; + uint64_t level_multiplier = 0; enum { OLD_POLICY, USE_SOME_EXTRA @@ -4165,21 +4168,24 @@ public: if (!new_pol) { return; } - // Calculating how much extra space is available at DB volume. // Depending on the presence of explicit reserved size specification it might be either // * DB volume size - reserved // or // * DB volume size - sum_max_level_size(0, L-1) - max_level_size(L) * reserved_factor if (!reserved) { + level0_size = _level0_size; + level_base = _level_base; + level_multiplier = _level_multiplier; uint64_t prev_levels = _level0_size; uint64_t cur_level = _level_base; - uint64_t cur_threshold = 0; + uint64_t cur_threshold = prev_levels + cur_level; do { - uint64_t next_level = cur_level * _level_multiplier; - uint64_t next_threshold = prev_levels + cur_level + next_level * reserved_factor; + uint64_t next_level = cur_level * _level_multiplier; + uint64_t next_threshold = prev_levels + cur_level + next_level; if (_db_total <= next_threshold) { - db_avail4slow = cur_threshold ? 
_db_total - cur_threshold : 0; + cur_threshold *= reserved_factor; + db_avail4slow = cur_threshold < _db_total ? _db_total - cur_threshold : 0; break; } else { prev_levels += cur_level; @@ -4188,7 +4194,7 @@ public: } } while (true); } else { - db_avail4slow = _db_total - reserved; + db_avail4slow = reserved < _db_total ? _db_total - reserved : 0; } } @@ -4197,63 +4203,40 @@ public: } void* get_hint_by_dir(std::string_view dirname) const override; - void add_usage(void* hint, const bluefs_fnode_t& fnode) override { + void add_usage(void* hint, const bluefs_extent_t& extent) override { if (hint == nullptr) return; size_t pos = (size_t)hint - LEVEL_FIRST; - for (auto& p : fnode.extents) { - auto& cur = per_level_per_dev_usage.at(p.bdev, pos); - auto& max = per_level_per_dev_max.at(p.bdev, pos); - uint64_t v = cur.fetch_add(p.length) + p.length; - while (v > max) { - max.exchange(v); - } - { - //update per-device totals - auto& cur = per_level_per_dev_usage.at(p.bdev, LEVEL_MAX - LEVEL_FIRST); - auto& max = per_level_per_dev_max.at(p.bdev, LEVEL_MAX - LEVEL_FIRST); - uint64_t v = cur.fetch_add(p.length) + p.length; - while (v > max) { - max.exchange(v); - } - } + auto& cur = per_level_per_dev_usage.at(extent.bdev, pos); + auto& max = per_level_per_dev_max.at(extent.bdev, pos); + uint64_t v = cur.fetch_add(extent.length) + extent.length; + while (v > max) { + max.exchange(v); } { - //update per-level actual totals - auto& cur = per_level_per_dev_usage.at(BlueFS::MAX_BDEV, pos); - auto& max = per_level_per_dev_max.at(BlueFS::MAX_BDEV, pos); - uint64_t v = cur.fetch_add(fnode.size) + fnode.size; + //update per-device totals + auto& cur = per_level_per_dev_usage.at(extent.bdev, LEVEL_MAX - LEVEL_FIRST); + auto& max = per_level_per_dev_max.at(extent.bdev, LEVEL_MAX - LEVEL_FIRST); + uint64_t v = cur.fetch_add(extent.length) + extent.length; while (v > max) { max.exchange(v); } } - ++per_level_files[pos]; - ++per_level_files[LEVEL_MAX - LEVEL_FIRST]; } - void 
sub_usage(void* hint, const bluefs_fnode_t& fnode) override { + void sub_usage(void* hint, const bluefs_extent_t& extent) override { if (hint == nullptr) return; size_t pos = (size_t)hint - LEVEL_FIRST; - for (auto& p : fnode.extents) { - auto& cur = per_level_per_dev_usage.at(p.bdev, pos); - ceph_assert(cur >= p.length); - cur -= p.length; - - //update per-device totals - auto& cur2 = per_level_per_dev_usage.at(p.bdev, LEVEL_MAX - LEVEL_FIRST); - ceph_assert(cur2 >= p.length); - cur2 -= p.length; - } - //update per-level actual totals - auto& cur = per_level_per_dev_usage.at(BlueFS::MAX_BDEV, pos); - ceph_assert(cur >= fnode.size); - cur -= fnode.size; - ceph_assert(per_level_files[pos] > 0); - --per_level_files[pos]; - ceph_assert(per_level_files[LEVEL_MAX - LEVEL_FIRST] > 0); - --per_level_files[LEVEL_MAX - LEVEL_FIRST]; + auto& cur = per_level_per_dev_usage.at(extent.bdev, pos); + ceph_assert(cur >= extent.length); + cur -= extent.length; + + //update per-device totals + auto& cur2 = per_level_per_dev_usage.at(extent.bdev, LEVEL_MAX - LEVEL_FIRST); + ceph_assert(cur2 >= extent.length); + cur2 -= extent.length; } - void add_usage(void* hint, uint64_t size_more) override { + void add_usage(void* hint, uint64_t size_more, bool upd_files) override { if (hint == nullptr) return; size_t pos = (size_t)hint - LEVEL_FIRST; @@ -4264,8 +4247,12 @@ public: while (v > max) { max.exchange(v); } + if (upd_files) { + ++per_level_files[pos]; + ++per_level_files[LEVEL_MAX - LEVEL_FIRST]; + } } - void sub_usage(void* hint, uint64_t size_less) override { + void sub_usage(void* hint, uint64_t size_less, bool upd_files) override { if (hint == nullptr) return; size_t pos = (size_t)hint - LEVEL_FIRST; @@ -4273,6 +4260,12 @@ public: auto& cur = per_level_per_dev_usage.at(BlueFS::MAX_BDEV, pos); ceph_assert(cur >= size_less); cur -= size_less; + if (upd_files) { + ceph_assert(per_level_files[pos] > 0); + --per_level_files[pos]; + ceph_assert(per_level_files[LEVEL_MAX - LEVEL_FIRST] > 
0); + --per_level_files[LEVEL_MAX - LEVEL_FIRST]; + } } uint8_t select_prefer_bdev(void* h) override;