From: Samuel Just Date: Wed, 17 Jun 2020 18:46:26 +0000 (-0700) Subject: crimson/os/seastore: embed the root pointers directly in the root delta X-Git-Tag: v16.1.0~1882^2~2 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=4252b87c1e163382424e14c7406bc08527f271df;p=ceph.git crimson/os/seastore: embed the root pointers directly in the root delta Previously, I set it up so that there was a "magic" ROOT_LOCATION delta associated with no block type which would point at the root block. As long as replaying the journal yielded at least one of those, we'd therefore be able to find the most recent root block. However, in practice, the entire root block is unlikely to ever weigh in at more than 100 bytes or so. Thus, this patch does a few things: - ROOT_LOCATION deltas are removed - ROOT deltas contain an encoding of the entire root block - ROOT blocks are never actually written down except in these deltas - ROOT blocks do not have a physical address - ROOT blocks never show up in the transaction fresh or mutated lists nor do they show up in the cache extent set or dirty list This adds a little bit of special handling, but the root was already special due to a need for cache interface users to discover it without knowing the physical address. Signed-off-by: Samuel Just --- diff --git a/src/crimson/os/seastore/CMakeLists.txt b/src/crimson/os/seastore/CMakeLists.txt index 1d4998bbb36d..4acc48f05d47 100644 --- a/src/crimson/os/seastore/CMakeLists.txt +++ b/src/crimson/os/seastore/CMakeLists.txt @@ -14,7 +14,6 @@ add_library(crimson-seastore onode_manager/simple-fltree/onode_delta.cc onode_manager/simple-fltree/onode_node.cc seastore.cc - root_block.cc ) target_link_libraries(crimson-seastore crimson) diff --git a/src/crimson/os/seastore/cache.cc b/src/crimson/os/seastore/cache.cc index 2b5790aa58c5..25a337633c78 100644 --- a/src/crimson/os/seastore/cache.cc +++ b/src/crimson/os/seastore/cache.cc @@ -17,6 +17,9 @@ namespace { namespace crimson::os::seastore { +Cache::Cache(SegmentManager &segment_manager) : + segment_manager(segment_manager) {} + Cache::~Cache() { for (auto &i: extents) { @@ -76,17 +79,16 @@ CachedExtentRef Cache::duplicate_for_write( return i; auto ret = i->duplicate_for_write(); - ret->version++; - ret->last_committed_crc = i->last_committed_crc; - ret->state = CachedExtent::extent_state_t::MUTATION_PENDING; - if (ret->get_type() == extent_types_t::ROOT) { t.root = ret->cast(); + } else { + ret->last_committed_crc = i->last_committed_crc; + t.add_to_retired_set(i); + t.add_mutated_extent(ret); } - t.add_to_retired_set(i); - t.add_mutated_extent(ret); - + ret->version++; + ret->state = CachedExtent::extent_state_t::MUTATION_PENDING; return ret; } @@ -134,6 +136,19 @@ std::optional Cache::try_construct_record(Transaction &t) i->last_committed_crc = final_crc; } + if (t.root) { + record.deltas.push_back( + delta_info_t{ + extent_types_t::ROOT, + paddr_t{}, + 0, + 0, + 0, + t.root->get_version() - 1, + t.root->get_delta() + }); + } + record.extents.reserve(t.fresh_block_list.size()); for (auto &i: t.fresh_block_list) { logger().debug("try_construct_record: fresh block {}", *i); @@ -141,16 +156,7 @@ std::optional Cache::try_construct_record(Transaction &t) i->prepare_write(); bl.append(i->get_bptr()); if (i->get_type() == extent_types_t::ROOT) { - record.deltas.push_back( - delta_info_t{ - extent_types_t::ROOT_LOCATION, - i->get_paddr(), - 0, - 0, - 0, - 0, - bufferlist() - }); + assert(0 == "ROOT never gets written as a fresh block"); } record.extents.push_back(extent_t{std::move(bl)}); } @@ -163,8 +169,12 @@ void Cache::complete_commit( Transaction &t, paddr_t final_block_start) { - if (t.root) + if (t.root) { root = t.root; + root->on_delta_write(final_block_start); + root->state = CachedExtent::extent_state_t::DIRTY; + logger().debug("complete_commit: new root {}", *t.root); + } paddr_t cur = final_block_start; for (auto &i: t.fresh_block_list) { @@ -189,15 +199,19 @@ void Cache::complete_commit( } } +void Cache::init() { + root = new RootBlock(); + root->state = CachedExtent::extent_state_t::DIRTY; +} + Cache::mkfs_ertr::future<> Cache::mkfs(Transaction &t) { - t.root = alloc_new_extent(t, RootBlock::SIZE); + duplicate_for_write(t, root); return mkfs_ertr::now(); } Cache::close_ertr::future<> Cache::close() { - retire_extent(root); root.reset(); for (auto i = dirty.begin(); i != dirty.end(); ) { auto ptr = &*i; @@ -210,44 +224,30 @@ Cache::close_ertr::future<> Cache::close() Cache::replay_delta_ret Cache::replay_delta(paddr_t record_base, const delta_info_t &delta) { - if (delta.type == extent_types_t::ROOT_LOCATION) { - auto root_location = delta.paddr.is_relative() - ? record_base.add_record_relative(delta.paddr) - : delta.paddr; - logger().debug("replay_delta: found root addr {}", root_location); + if (delta.type == extent_types_t::ROOT) { + logger().debug("replay_delta: found root delta"); root->apply_delta_and_adjust_crc(record_base, delta.bl); - return get_extent( - root_location, - RootBlock::SIZE - ).safe_then([this, root_location](auto ref) { - logger().debug("replay_delta: finished reading root at {}", root_location); - root = ref; - return root->complete_load(); - }).safe_then([root_location] { - logger().debug("replay_delta: finished loading root at {}", root_location); - return replay_delta_ret(replay_delta_ertr::ready_future_marker{}); - }); + return replay_delta_ertr::now(); + } else { + return get_extent_by_type( + delta.type, + delta.paddr, + delta.length).safe_then([this, record_base, delta](auto extent) { + logger().debug( + "replay_delta: replaying {} on {}", + *extent, + delta); + + assert(extent->version == delta.pversion); + + assert(extent->last_committed_crc == delta.prev_crc); + extent->apply_delta_and_adjust_crc(record_base, delta.bl); + assert(extent->last_committed_crc == delta.final_crc); + + extent->version++; + mark_dirty(extent); + }); } - - return get_extent_by_type( - delta.type, - delta.paddr, - delta.length).safe_then([this, record_base, delta](auto extent) { - /* TODO asserts about version */ - logger().debug( - "replay_delta: replaying {} on {}", - *extent, - delta); - - assert(extent->version == delta.pversion); - - assert(extent->last_committed_crc == delta.prev_crc); - extent->apply_delta_and_adjust_crc(record_base, delta.bl); - assert(extent->last_committed_crc == delta.final_crc); - - extent->version++; - mark_dirty(extent); - }); } Cache::get_root_ret Cache::get_root(Transaction &t) @@ -272,15 +272,9 @@ Cache::get_extent_ertr::future Cache::get_extent_by_type( segment_off_t length) { switch (type) { - case extent_types_t::ROOT_LOCATION: { - ceph_assert(0 == "root location deltas are handled specially"); - return get_extent_ertr::make_ready_future(); - } case extent_types_t::ROOT: - return get_extent(offset, length - ).safe_then([](auto extent) { - return CachedExtentRef(extent.detach(), false /* add_ref */); - }); + assert(0 == "ROOT is never directly read"); + return get_extent_ertr::make_ready_future(); case extent_types_t::LADDR_INTERNAL: return get_extent(offset, length ).safe_then([](auto extent) { diff --git a/src/crimson/os/seastore/cache.h b/src/crimson/os/seastore/cache.h index 13a34a746a30..622478368adf 100644 --- a/src/crimson/os/seastore/cache.h +++ b/src/crimson/os/seastore/cache.h @@ -142,7 +142,7 @@ using TransactionRef = std::unique_ptr; */ class Cache { public: - Cache(SegmentManager &segment_manager) : segment_manager(segment_manager) {} + Cache(SegmentManager &segment_manager); ~Cache(); TransactionRef get_transaction() { @@ -325,6 +325,11 @@ public: paddr_t final_block_start ///< [in] offset of initial block ); + /** + * init + */ + void init(); + /** * mkfs * @@ -350,8 +355,6 @@ public: * Intended for use in Journal::delta. For each delta, should decode delta, * read relevant block from disk or cache (using correct type), and call * CachedExtent::apply_delta marking the extent dirty. - * - * TODO: currently only handles the ROOT_LOCATION delta. */ using replay_delta_ertr = crimson::errorator< crimson::ct_error::input_output_error>; diff --git a/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.cc b/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.cc index 9fcb03eda847..a578f9aca2fd 100644 --- a/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.cc +++ b/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.cc @@ -23,17 +23,17 @@ BtreeLBAManager::mkfs_ret BtreeLBAManager::mkfs( Transaction &t) { logger().debug("BtreeLBAManager::mkfs"); - return cache.get_root(t).safe_then([this, &t](auto root) { + return cache.get_root(t).safe_then([this, &t](auto croot) { auto root_leaf = cache.alloc_new_extent( t, LBA_BLOCK_SIZE); root_leaf->set_size(0); - root->get_lba_root() = - btree_lba_root_t{ + croot->get_lba_root() = + root_t{ 0, 0, - root_leaf->get_paddr() - root->get_paddr(), - paddr_t{}}; + root_leaf->get_paddr(), + make_record_relative_paddr(0)}; return mkfs_ertr::now(); }); } @@ -41,19 +41,17 @@ BtreeLBAManager::mkfs_ret BtreeLBAManager::mkfs( BtreeLBAManager::get_root_ret BtreeLBAManager::get_root(Transaction &t) { - return cache.get_root(t).safe_then([this, &t](auto root) { - paddr_t root_addr = root->get_lba_root().lba_root_addr; - root_addr = root_addr.maybe_relative_to(root->get_paddr()); + return cache.get_root(t).safe_then([this, &t](auto croot) { logger().debug( "BtreeLBAManager::get_root: reading root at {} depth {}", - root->get_lba_root().lba_root_addr, - root->get_lba_root().lba_depth); + paddr_t{croot->get_lba_root().lba_root_addr}, + unsigned(croot->get_lba_root().lba_depth)); return get_lba_btree_extent( cache, t, - root->get_lba_root().lba_depth, - root->get_lba_root().lba_root_addr, - root->get_paddr()); + croot->get_lba_root().lba_depth, + croot->get_lba_root().lba_root_addr, + paddr_t()); }); } diff --git a/src/crimson/os/seastore/root_block.cc b/src/crimson/os/seastore/root_block.cc deleted file mode 100644 index 7b32fa4fff6b..000000000000 --- a/src/crimson/os/seastore/root_block.cc +++ /dev/null @@ -1,48 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab - -#include "crimson/os/seastore/root_block.h" - -namespace crimson::os::seastore { - -void RootBlock::prepare_write() -{ - bufferlist tmp; - ::encode(root, tmp); - auto bpiter = tmp.begin(); - bpiter.copy(tmp.length(), get_bptr().c_str()); -} - -RootBlock::complete_load_ertr::future<> RootBlock::complete_load() -{ - auto biter = get_bptr().cbegin(); - root.decode(biter); - if (root.lba_root.lba_root_addr.is_relative()) { - root.lba_root.lba_root_addr = get_paddr().add_block_relative( - root.lba_root.lba_root_addr); - } - return complete_load_ertr::now(); -} - -void RootBlock::on_delta_write(paddr_t record_block_offset) -{ - if (root.lba_root.lba_root_addr.is_relative()) { - root.lba_root.lba_root_addr = record_block_offset.add_record_relative( - root.lba_root.lba_root_addr); - } -} - -void RootBlock::on_initial_write() -{ - if (root.lba_root.lba_root_addr.is_relative()) { - root.lba_root.lba_root_addr = get_paddr().add_block_relative( - root.lba_root.lba_root_addr); - } -} - -btree_lba_root_t &RootBlock::get_lba_root() -{ - return root.lba_root; -} - -} diff --git a/src/crimson/os/seastore/root_block.h b/src/crimson/os/seastore/root_block.h index 5a74f10d0f86..acc31eefe954 100644 --- a/src/crimson/os/seastore/root_block.h +++ b/src/crimson/os/seastore/root_block.h @@ -9,32 +9,22 @@ namespace crimson::os::seastore { using depth_t = uint32_t; -/* Belongs in lba_manager/btree, TODO: generalize this to - * permit more than one lba_manager implementation +/** + * root_t + * + * Contains information required to find metadata roots. + * TODO: generalize this to permit more than one lba_manager implementation */ -struct btree_lba_root_t { +struct __attribute__((aligned(8), packed)) root_t { depth_t lba_depth = 0; depth_t segment_depth = 0; paddr_t lba_root_addr; paddr_t segment_root; - DENC(btree_lba_root_t, v, p) { - DENC_START(1, 1, p); - denc(v.lba_depth, p); - denc(v.segment_depth, p); - denc(v.lba_root_addr, p); - denc(v.segment_root, p); - DENC_FINISH(p); - } -}; - -struct root_block_t { - btree_lba_root_t lba_root; - - DENC(root_block_t, v, p) { - DENC_START(1, 1, p); - denc(v.lba_root, p); - DENC_FINISH(p); + void adjust_addrs_from_base(paddr_t base) { + if (lba_root_addr.is_relative()) { + lba_root_addr = base.add_record_relative(lba_root_addr); + } } }; @@ -45,26 +35,31 @@ struct root_block_t { * In-memory values may be * - absolute: reference to block which predates the current transaction * - record_relative: reference to block updated in this transaction - * if !is_initial_pending() - * - block_relative: reference to block updated in this transaction - * if is_initial_pending() + * if !pending() * - * Upon initial commit, on_initial_write checks physical references and updates - * based on newly discovered address (relative ones must be block_relative). + * Journal replay only considers deltas and must always discover the most + * recent value for the RootBlock. Because the contents of root_t above are + * very small, it's simplest to stash the entire root_t value into the delta + * and never actually write the RootBlock to a physical location (safe since + * nothing references the location of the RootBlock). * - * complete_load also updates addresses in memory post load based on block addr. + * As a result, Cache treats the root differently in a few ways including: + * - state will only ever be DIRTY or MUTATION_PENDING + * - RootBlock's never show up in the transaction fresh or dirty lists -- + * there's a special Transaction::root member for when the root needs to + * be mutated. * - * Upon delta commit, on_delta_write uses record addr to update in-memory values. - * apply_delta will do the same once implemented (TODO). + * TODO: Journal trimming will need to be aware of the most recent RootBlock + * delta location, or, even easier, just always write one out with the + * mutation which changes the journal trim bound. */ struct RootBlock : CachedExtent { constexpr static segment_off_t SIZE = 4<<10; using Ref = TCachedExtentRef; - root_block_t root; + root_t root; - template - RootBlock(T&&... t) : CachedExtent(std::forward(t)...) {} + RootBlock() : CachedExtent(0) {} RootBlock(const RootBlock &rhs) = default; @@ -72,41 +67,44 @@ struct RootBlock : CachedExtent { return CachedExtentRef(new RootBlock(*this)); }; - /** - * prepare_write - * - * For RootBlock, serializes RootBlock::root into the bptr. - */ - void prepare_write() final; - static constexpr extent_types_t TYPE = extent_types_t::ROOT; extent_types_t get_type() const final { return extent_types_t::ROOT; } + /// dumps root as delta ceph::bufferlist get_delta() final { - return ceph::bufferlist(); + ceph::bufferlist bl; + ceph::buffer::ptr bptr(sizeof(root_t)); + *reinterpret_cast(bptr.c_str()) = root; + bl.append(bptr); + return bl; } + /// overwrites root void apply_delta_and_adjust_crc(paddr_t base, const ceph::bufferlist &_bl) final { - ceph_assert(0 == "TODO"); + assert(_bl.length() == sizeof(root_t)); + ceph::bufferlist bl = _bl; + bl.rebuild(); + root = *reinterpret_cast(bl.front().c_str()); + root.adjust_addrs_from_base(base); } - /// Patches relative addrs in memory based on actual address - complete_load_ertr::future<> complete_load() final; - /// Patches relative addrs in memory based on record commit addr - void on_delta_write(paddr_t record_block_offset) final; + void on_delta_write(paddr_t record_block_offset) final { + root.adjust_addrs_from_base(record_block_offset); + } - /// Patches relative addrs in memory based on record addr - void on_initial_write() final; + complete_load_ertr::future<> complete_load() final { + assert(0 == "Root is only written via deltas"); + } - btree_lba_root_t &get_lba_root(); + void on_initial_write() final { + assert(0 == "Root is only written via deltas"); + } + root_t &get_lba_root() { return root; } }; using RootBlockRef = RootBlock::Ref; } - -WRITE_CLASS_DENC(crimson::os::seastore::btree_lba_root_t) -WRITE_CLASS_DENC(crimson::os::seastore::root_block_t) diff --git a/src/crimson/os/seastore/seastore_types.cc b/src/crimson/os/seastore/seastore_types.cc index 8532a7dd71aa..85914241caa2 100644 --- a/src/crimson/os/seastore/seastore_types.cc +++ b/src/crimson/os/seastore/seastore_types.cc @@ -37,8 +37,6 @@ std::ostream &operator<<(std::ostream &out, const paddr_t &rhs) std::ostream &operator<<(std::ostream &out, extent_types_t t) { switch (t) { - case extent_types_t::ROOT_LOCATION: - return out << "ROOT_LOCATION"; case extent_types_t::ROOT: return out << "ROOT"; case extent_types_t::LADDR_INTERNAL: diff --git a/src/crimson/os/seastore/seastore_types.h b/src/crimson/os/seastore/seastore_types.h index ba5112e21882..a0e61b16ab94 100644 --- a/src/crimson/os/seastore/seastore_types.h +++ b/src/crimson/os/seastore/seastore_types.h @@ -208,11 +208,10 @@ std::ostream &operator<<(std::ostream &out, const paddr_list_t &rhs); * Cache::get_extent_by_type in cache.cc */ enum class extent_types_t : uint8_t { - ROOT_LOCATION = 0, // delta only - ROOT = 1, - LADDR_INTERNAL = 2, - LADDR_LEAF = 3, - ONODE_BLOCK = 4, + ROOT = 0, + LADDR_INTERNAL = 1, + LADDR_LEAF = 2, + ONODE_BLOCK = 3, // Test Block Types TEST_BLOCK = 0xF0, diff --git a/src/crimson/os/seastore/transaction_manager.cc b/src/crimson/os/seastore/transaction_manager.cc index 89281c9550d4..785eaf1c0fad 100644 --- a/src/crimson/os/seastore/transaction_manager.cc +++ b/src/crimson/os/seastore/transaction_manager.cc @@ -39,6 +39,7 @@ TransactionManager::mkfs_ertr::future<> TransactionManager::mkfs() lba_manager.create_transaction(), [this](auto &transaction) { logger().debug("TransactionManager::mkfs: about to cache.mkfs"); + cache.init(); return cache.mkfs(*transaction ).safe_then([this, &transaction] { return lba_manager.mkfs(*transaction); @@ -60,6 +61,7 @@ TransactionManager::mkfs_ertr::future<> TransactionManager::mkfs() TransactionManager::mount_ertr::future<> TransactionManager::mount() { + cache.init(); return journal.replay([this](auto paddr, const auto &e) { return cache.replay_delta(paddr, e); }).safe_then([this] { diff --git a/src/test/crimson/seastore/test_btree_lba_manager.cc b/src/test/crimson/seastore/test_btree_lba_manager.cc index c31c3f5e4b6a..a1abff02427e 100644 --- a/src/test/crimson/seastore/test_btree_lba_manager.cc +++ b/src/test/crimson/seastore/test_btree_lba_manager.cc @@ -76,6 +76,7 @@ struct btree_lba_manager_test : return seastar::do_with( lba_manager->create_transaction(), [this](auto &transaction) { + cache.init(); return cache.mkfs(*transaction ).safe_then([this, &transaction] { return lba_manager->mkfs(*transaction); diff --git a/src/test/crimson/seastore/test_seastore_cache.cc b/src/test/crimson/seastore/test_seastore_cache.cc index e5dda0d23237..6091fd67edc7 100644 --- a/src/test/crimson/seastore/test_seastore_cache.cc +++ b/src/test/crimson/seastore/test_seastore_cache.cc @@ -74,6 +74,7 @@ struct cache_test_t : public seastar_test_suite_t { return seastar::do_with( TransactionRef(new Transaction()), [this](auto &transaction) { + cache.init(); return cache.mkfs(*transaction).safe_then( [this, &transaction] { return submit_transaction(std::move(transaction)).then( @@ -90,7 +91,8 @@ struct cache_test_t : public seastar_test_suite_t { } seastar::future<> tear_down_fut() final { - return seastar::now(); + return cache.close().handle_error( + Cache::close_ertr::assert_all{}); } };