From: Xuehan Xu Date: Thu, 7 Nov 2024 01:41:18 +0000 (+0800) Subject: crimson/os/seastore: move the root meta out of the root block X-Git-Tag: v20.0.0~659^2~3 X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=4cb1243d0301739e9440cb7be437d531bbcb455b;p=ceph.git crimson/os/seastore: move the root meta out of the root block During massive data backfilling, new osdmaps keep being created due to frequent pg status changes, which can lead to frequent osd meta updates. Those updates are translated into "SeaStore::write_meta" calls, each of which modifies the root block's meta field and invalidates all inflight transactions. Since the osd meta updates can be very frequent, long-running transactions may be invalidated over and over, and the corresponding IO requests hang. This commit moves the root meta out of the root block, so that updates to it won't invalidate unrelated transactions. Signed-off-by: Xuehan Xu --- diff --git a/src/crimson/os/seastore/cache.cc b/src/crimson/os/seastore/cache.cc index 5dcb7514ee1ab..70fec7caca48a 100644 --- a/src/crimson/os/seastore/cache.cc +++ b/src/crimson/os/seastore/cache.cc @@ -172,6 +172,7 @@ void Cache::register_metrics() {extent_types_t::LADDR_INTERNAL, sm::label_instance("ext", "LADDR_INTERNAL")}, {extent_types_t::LADDR_LEAF, sm::label_instance("ext", "LADDR_LEAF")}, {extent_types_t::DINK_LADDR_LEAF, sm::label_instance("ext", "DINK_LADDR_LEAF")}, + {extent_types_t::ROOT_META, sm::label_instance("ext", "ROOT_META")}, {extent_types_t::OMAP_INNER, sm::label_instance("ext", "OMAP_INNER")}, {extent_types_t::OMAP_LEAF, sm::label_instance("ext", "OMAP_LEAF")}, {extent_types_t::ONODE_BLOCK_STAGED, sm::label_instance("ext", "ONODE_BLOCK_STAGED")}, @@ -1093,6 +1094,9 @@ CachedExtentRef Cache::alloc_new_extent_by_type( case extent_types_t::LADDR_LEAF: return alloc_new_non_data_extent( t, length, hint, gen); + case extent_types_t::ROOT_META: + return alloc_new_non_data_extent( + t, length, hint, gen); case extent_types_t::ONODE_BLOCK_STAGED: return 
alloc_new_non_data_extent( t, length, hint, gen); @@ -2193,6 +2197,12 @@ Cache::do_get_caching_extent_by_type( ).safe_then([](auto extent) { return CachedExtentRef(extent.detach(), false /* add_ref */); }); + case extent_types_t::ROOT_META: + return do_get_caching_extent( + offset, length, std::move(extent_init_func), std::move(on_cache) + ).safe_then([](auto extent) { + return CachedExtentRef(extent.detach(), false /* add_ref */); + }); case extent_types_t::OMAP_INNER: return do_get_caching_extent( offset, length, std::move(extent_init_func), std::move(on_cache) diff --git a/src/crimson/os/seastore/root_meta.h b/src/crimson/os/seastore/root_meta.h new file mode 100644 index 0000000000000..edf082f1e383f --- /dev/null +++ b/src/crimson/os/seastore/root_meta.h @@ -0,0 +1,76 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "crimson/os/seastore/cached_extent.h" + +namespace crimson::os::seastore { + +struct RootMetaBlock : LogicalCachedExtent { + using meta_t = std::map; + using Ref = TCachedExtentRef; + static constexpr size_t SIZE = 4096; + static constexpr int MAX_META_LENGTH = 1024; + + explicit RootMetaBlock(ceph::bufferptr &&ptr) + : LogicalCachedExtent(std::move(ptr)) {} + explicit RootMetaBlock(extent_len_t length) + : LogicalCachedExtent(length) {} + RootMetaBlock(const RootMetaBlock &rhs) + : LogicalCachedExtent(rhs) {} + + CachedExtentRef duplicate_for_write(Transaction&) final { + return CachedExtentRef(new RootMetaBlock(*this)); + } + + static constexpr extent_types_t TYPE = extent_types_t::ROOT_META; + extent_types_t get_type() const final { + return extent_types_t::ROOT_META; + } + + /// dumps root meta as delta + ceph::bufferlist get_delta() final { + ceph::bufferlist bl; + ceph::buffer::ptr bptr(get_bptr(), 0, MAX_META_LENGTH); + bl.append(bptr); + return bl; + } + + /// overwrites root + void apply_delta(const ceph::bufferlist &_bl) final + { + assert(_bl.length() == 
MAX_META_LENGTH); + ceph::bufferlist bl = _bl; + bl.rebuild(); + get_bptr().copy_in(0, MAX_META_LENGTH, bl.front().c_str()); + } + + meta_t get_meta() const { + bufferlist bl; + bl.append(get_bptr()); + meta_t ret; + auto iter = bl.cbegin(); + decode(ret, iter); + return ret; + } + + void set_meta(const meta_t &m) { + ceph::bufferlist bl; + encode(m, bl); + ceph_assert(bl.length() <= MAX_META_LENGTH); + bl.rebuild(); + get_bptr().zero(0, MAX_META_LENGTH); + get_bptr().copy_in(0, bl.length(), bl.front().c_str()); + } + +}; +using RootMetaBlockRef = RootMetaBlock::Ref; + +} // crimson::os::seastore + + +#if FMT_VERSION >= 90000 +template <> struct fmt::formatter + : fmt::ostream_formatter {}; +#endif diff --git a/src/crimson/os/seastore/seastore_types.cc b/src/crimson/os/seastore/seastore_types.cc index f379dd0117c8d..450118e5e7570 100644 --- a/src/crimson/os/seastore/seastore_types.cc +++ b/src/crimson/os/seastore/seastore_types.cc @@ -246,6 +246,8 @@ std::ostream &operator<<(std::ostream &out, extent_types_t t) return out << "LADDR_LEAF"; case extent_types_t::ONODE_BLOCK_STAGED: return out << "ONODE_BLOCK_STAGED"; + case extent_types_t::ROOT_META: + return out << "ROOT_META"; case extent_types_t::OMAP_INNER: return out << "OMAP_INNER"; case extent_types_t::OMAP_LEAF: diff --git a/src/crimson/os/seastore/seastore_types.h b/src/crimson/os/seastore/seastore_types.h index df5c184e7ab0c..65cad878fbadc 100644 --- a/src/crimson/os/seastore/seastore_types.h +++ b/src/crimson/os/seastore/seastore_types.h @@ -1378,23 +1378,24 @@ enum class extent_types_t : uint8_t { LADDR_INTERNAL = 1, LADDR_LEAF = 2, DINK_LADDR_LEAF = 3, // should only be used for unitttests - OMAP_INNER = 4, - OMAP_LEAF = 5, - ONODE_BLOCK_STAGED = 6, - COLL_BLOCK = 7, - OBJECT_DATA_BLOCK = 8, - RETIRED_PLACEHOLDER = 9, + ROOT_META = 4, + OMAP_INNER = 5, + OMAP_LEAF = 6, + ONODE_BLOCK_STAGED = 7, + COLL_BLOCK = 8, + OBJECT_DATA_BLOCK = 9, + RETIRED_PLACEHOLDER = 10, // the following two types are not extent 
types, // they are just used to indicates paddr allocation deltas - ALLOC_INFO = 10, - JOURNAL_TAIL = 11, + ALLOC_INFO = 11, + JOURNAL_TAIL = 12, // Test Block Types - TEST_BLOCK = 12, - TEST_BLOCK_PHYSICAL = 13, - BACKREF_INTERNAL = 14, - BACKREF_LEAF = 15, + TEST_BLOCK = 13, + TEST_BLOCK_PHYSICAL = 14, + BACKREF_INTERNAL = 15, + BACKREF_LEAF = 16, // None and the number of valid extent_types_t - NONE = 16, + NONE = 17, }; using extent_types_le_t = uint8_t; constexpr auto EXTENT_TYPES_MAX = static_cast(extent_types_t::NONE); @@ -1409,12 +1410,12 @@ constexpr bool is_data_type(extent_types_t type) { } constexpr bool is_logical_metadata_type(extent_types_t type) { - return type >= extent_types_t::OMAP_INNER && + return type >= extent_types_t::ROOT_META && type <= extent_types_t::COLL_BLOCK; } constexpr bool is_logical_type(extent_types_t type) { - if ((type >= extent_types_t::OMAP_INNER && + if ((type >= extent_types_t::ROOT_META && type <= extent_types_t::OBJECT_DATA_BLOCK) || type == extent_types_t::TEST_BLOCK) { assert(is_logical_metadata_type(type) || @@ -1926,44 +1927,18 @@ using backref_root_t = phy_tree_root_t; * TODO: generalize this to permit more than one lba_manager implementation */ struct __attribute__((packed)) root_t { - using meta_t = std::map; - - static constexpr int MAX_META_LENGTH = 1024; - backref_root_t backref_root; lba_root_t lba_root; laddr_le_t onode_root; coll_root_le_t collection_root; + laddr_le_t meta; - char meta[MAX_META_LENGTH]; - - root_t() { - set_meta(meta_t{}); - } + root_t() = default; void adjust_addrs_from_base(paddr_t base) { lba_root.adjust_addrs_from_base(base); backref_root.adjust_addrs_from_base(base); } - - meta_t get_meta() { - bufferlist bl; - bl.append(ceph::buffer::create_static(MAX_META_LENGTH, meta)); - meta_t ret; - auto iter = bl.cbegin(); - decode(ret, iter); - return ret; - } - - void set_meta(const meta_t &m) { - ceph::bufferlist bl; - encode(m, bl); - ceph_assert(bl.length() < MAX_META_LENGTH); - 
bl.rebuild(); - auto &bptr = bl.front(); - ::memset(meta, 0, MAX_META_LENGTH); - ::memcpy(meta, bptr.c_str(), bl.length()); - } }; struct alloc_blk_t { diff --git a/src/crimson/os/seastore/transaction_manager.cc b/src/crimson/os/seastore/transaction_manager.cc index f4e3b0858f2f1..717c3822db951 100644 --- a/src/crimson/os/seastore/transaction_manager.cc +++ b/src/crimson/os/seastore/transaction_manager.cc @@ -74,6 +74,8 @@ TransactionManager::mkfs_ertr::future<> TransactionManager::mkfs() return lba_manager->mkfs(t); }).si_then([this, &t] { return backref_manager->mkfs(t); + }).si_then([this, &t] { + return init_root_meta(t); }).si_then([this, FNAME, &t] { INFOT("submitting mkfs transaction", t); return submit_transaction_direct(t); diff --git a/src/crimson/os/seastore/transaction_manager.h b/src/crimson/os/seastore/transaction_manager.h index c7a94a9ef1132..dd03fca3c0f66 100644 --- a/src/crimson/os/seastore/transaction_manager.h +++ b/src/crimson/os/seastore/transaction_manager.h @@ -23,6 +23,7 @@ #include "crimson/os/seastore/logging.h" #include "crimson/os/seastore/seastore_types.h" #include "crimson/os/seastore/cache.h" +#include "crimson/os/seastore/root_meta.h" #include "crimson/os/seastore/lba_manager.h" #include "crimson/os/seastore/backref_manager.h" #include "crimson/os/seastore/journal.h" @@ -690,9 +691,11 @@ public: const std::string &key) { return cache->get_root( t - ).si_then([&key, &t](auto root) { + ).si_then([&t, this](auto root) { + return read_extent(t, root->root.meta); + }).si_then([key, &t](auto mblock) { LOG_PREFIX(TransactionManager::read_root_meta); - auto meta = root->root.get_meta(); + auto meta = mblock->get_meta(); auto iter = meta.find(key); if (iter == meta.end()) { SUBDEBUGT(seastore_tm, "{} -> nullopt", t, key); @@ -701,7 +704,35 @@ public: SUBDEBUGT(seastore_tm, "{} -> {}", t, key, iter->second); return seastar::make_ready_future(iter->second); } - }); + }).handle_error_interruptible( + 
crimson::ct_error::input_output_error::pass_further{}, + crimson::ct_error::assert_all{"unexpected error!"} + ); + } + + /** + * init_root_meta + * + * create the root meta block + */ + using init_root_meta_iertr = base_iertr; + using init_root_meta_ret = init_root_meta_iertr::future<>; + init_root_meta_ret init_root_meta(Transaction &t) { + return alloc_non_data_extent( + t, L_ADDR_MIN, RootMetaBlock::SIZE + ).si_then([this, &t](auto meta) { + meta->set_meta(RootMetaBlock::meta_t{}); + return cache->get_root(t + ).si_then([this, &t, meta](auto root) { + auto mroot = cache->duplicate_for_write( + t, root)->template cast(); + mroot->root.meta = meta->get_laddr(); + return seastar::now(); + }); + }).handle_error_interruptible( + crimson::ct_error::input_output_error::pass_further{}, + crimson::ct_error::assert_all{"unexpected error!"} + ); } /** @@ -719,15 +750,21 @@ public: SUBDEBUGT(seastore_tm, "seastore_tm, {} -> {} ...", t, key, value); return cache->get_root( t - ).si_then([this, &t, &key, &value](RootBlockRef root) { - root = cache->duplicate_for_write(t, root)->cast(); + ).si_then([this, &t](RootBlockRef root) { + return read_extent(t, root->root.meta); + }).si_then([this, key, value, &t](auto mblock) { + mblock = get_mutable_extent(t, mblock + )->template cast(); - auto meta = root->root.get_meta(); + auto meta = mblock->get_meta(); meta[key] = value; - root->root.set_meta(meta); + mblock->set_meta(meta); return seastar::now(); - }); + }).handle_error_interruptible( + crimson::ct_error::input_output_error::pass_further{}, + crimson::ct_error::assert_all{"unexpected error!"} + ); } /**