From c64d6f5539b253267fbddd39f14fd8c0bf6d7b43 Mon Sep 17 00:00:00 2001 From: Xuehan Xu Date: Thu, 10 Mar 2022 17:46:37 +0800 Subject: [PATCH] crimson/os/seastore: record replay_from info for dirty extents and alloc infos in segment header/tail Signed-off-by: Xuehan Xu --- src/crimson/os/seastore/cache.cc | 19 +++ src/crimson/os/seastore/cache.h | 27 ++++ src/crimson/os/seastore/journal.h | 2 + .../os/seastore/journal/segment_allocator.cc | 8 + .../os/seastore/journal/segmented_journal.cc | 1 + src/crimson/os/seastore/seastore_types.h | 15 +- src/crimson/os/seastore/segment_cleaner.cc | 138 ++++++++++++------ src/crimson/os/seastore/segment_cleaner.h | 25 +++- src/crimson/os/seastore/transaction.h | 4 + .../os/seastore/transaction_manager.cc | 14 +- .../seastore/test_btree_lba_manager.cc | 8 + .../crimson/seastore/test_seastore_journal.cc | 13 +- 12 files changed, 217 insertions(+), 57 deletions(-) diff --git a/src/crimson/os/seastore/cache.cc b/src/crimson/os/seastore/cache.cc index 002d948da6e..7bf9ba776f7 100644 --- a/src/crimson/os/seastore/cache.cc +++ b/src/crimson/os/seastore/cache.cc @@ -1134,6 +1134,15 @@ record_t Cache::prepare_record( record.push_back(std::move(delta)); } + if (t.is_cleaner_transaction()) { + bufferlist bl; + encode(get_oldest_backref_dirty_from().value_or(JOURNAL_SEQ_NULL), bl); + delta_info_t delta; + delta.type = extent_types_t::ALLOC_TAIL; + delta.bl = bl; + record.push_back(std::move(delta)); + } + ceph_assert(t.get_fresh_block_stats().num == t.inline_block_list.size() + t.ool_block_list.size() + @@ -1416,9 +1425,11 @@ Cache::replay_delta( journal_seq_t journal_seq, paddr_t record_base, const delta_info_t &delta, + const journal_seq_t &alloc_replay_from, seastar::lowres_system_clock::time_point& last_modified) { LOG_PREFIX(Cache::replay_delta); + assert(alloc_replay_from != JOURNAL_SEQ_NULL); if (delta.type == extent_types_t::ROOT) { TRACE("replay root delta at {} {}, remove extent ... 
-- {}, prv_root={}", journal_seq, record_base, delta, *root); @@ -1432,6 +1443,11 @@ Cache::replay_delta( add_extent(root); return replay_delta_ertr::now(); } else if (delta.type == extent_types_t::ALLOC_INFO) { + if (journal_seq < alloc_replay_from) { + DEBUG("journal_seq {} < alloc_replay_from {}, don't replay {}", + journal_seq, alloc_replay_from, delta); + return replay_delta_ertr::now(); + } may_roll_backref_buffer(journal_seq.offset); alloc_delta_t alloc_delta; decode(alloc_delta, delta.bl); @@ -1454,6 +1470,9 @@ Cache::replay_delta( if (!backref_list.empty()) backref_batch_update(std::move(backref_list), journal_seq); return replay_delta_ertr::now(); + } else if (delta.type == extent_types_t::ALLOC_TAIL) { + // this delta should have been dealt with during segment cleaner mounting + return replay_delta_ertr::now(); } else { auto _get_extent_if_cached = [this](paddr_t addr) -> get_extent_ertr::future { diff --git a/src/crimson/os/seastore/cache.h b/src/crimson/os/seastore/cache.h index 7270a4fe379..b16bdaff7b9 100644 --- a/src/crimson/os/seastore/cache.h +++ b/src/crimson/os/seastore/cache.h @@ -784,6 +784,8 @@ public: journal_seq_t seq, paddr_t record_block_base, const delta_info_t &delta, + const journal_seq_t &, // journal seq from which alloc + // delta should be replayed seastar::lowres_system_clock::time_point& last_modified); /** @@ -906,15 +908,40 @@ public: journal_seq_t seq, size_t max_bytes); + std::optional get_oldest_backref_dirty_from() const { + LOG_PREFIX(Cache::get_oldest_backref_dirty_from); + journal_seq_t backref_oldest = JOURNAL_SEQ_NULL; + if (backref_bufs_to_flush.empty()) { + if (backref_buffer && !backref_buffer->backrefs.empty()) { + backref_oldest = backref_buffer->backrefs.begin()->first; + } + } else { + auto &oldest_buf = backref_bufs_to_flush.front(); + backref_oldest = oldest_buf->backrefs.begin()->first; + } + if (backref_oldest == JOURNAL_SEQ_NULL) { + SUBDEBUG(seastore_cache, "backref_oldest: null"); + return std::nullopt; + 
} else { + SUBDEBUG(seastore_cache, "backref_oldest: {}", + backref_oldest); + return backref_oldest; + } + } + /// returns std::nullopt if no dirty extents or get_dirty_from() for oldest std::optional get_oldest_dirty_from() const { + LOG_PREFIX(Cache::get_oldest_dirty_from); if (dirty.empty()) { + SUBDEBUG(seastore_cache, "oldest: null"); return std::nullopt; } else { auto oldest = dirty.begin()->get_dirty_from(); if (oldest == JOURNAL_SEQ_NULL) { + SUBDEBUG(seastore_cache, "oldest: null"); return std::nullopt; } else { + SUBDEBUG(seastore_cache, "oldest: {}", oldest); return oldest; } } diff --git a/src/crimson/os/seastore/journal.h b/src/crimson/os/seastore/journal.h index c135cf551f6..a33a5468684 100644 --- a/src/crimson/os/seastore/journal.h +++ b/src/crimson/os/seastore/journal.h @@ -80,6 +80,8 @@ public: using delta_handler_t = std::function< replay_ret(const record_locator_t&, const delta_info_t&, + const journal_seq_t, // journal seq from which + // alloc delta should be replayed seastar::lowres_system_clock::time_point last_modified)>; virtual replay_ret replay( delta_handler_t &&delta_handler) = 0; diff --git a/src/crimson/os/seastore/journal/segment_allocator.cc b/src/crimson/os/seastore/journal/segment_allocator.cc index e27900561b9..015d31286f3 100644 --- a/src/crimson/os/seastore/journal/segment_allocator.cc +++ b/src/crimson/os/seastore/journal/segment_allocator.cc @@ -51,16 +51,20 @@ SegmentAllocator::do_open() ).safe_then([this, FNAME, new_segment_seq](auto sref) { // initialize new segment journal_seq_t new_journal_tail; + journal_seq_t new_alloc_replay_from; if (type == segment_type_t::JOURNAL) { new_journal_tail = segment_provider.get_journal_tail_target(); + new_alloc_replay_from = segment_provider.get_alloc_info_replay_from(); } else { // OOL new_journal_tail = NO_DELTAS; + new_alloc_replay_from = NO_DELTAS; } segment_id_t segment_id = sref->get_segment_id(); auto header = segment_header_t{ new_segment_seq, segment_id, new_journal_tail, + 
new_alloc_replay_from, current_segment_nonce, type}; INFO("{} writing header to new segment ... -- {}", @@ -210,15 +214,19 @@ SegmentAllocator::close_segment(bool is_rolling) } auto close_seg_info = segment_provider.get_seg_info(close_segment_id); journal_seq_t cur_journal_tail; + journal_seq_t new_alloc_replay_from; if (type == segment_type_t::JOURNAL) { cur_journal_tail = segment_provider.get_journal_tail_target(); + new_alloc_replay_from = segment_provider.get_alloc_info_replay_from(); } else { // OOL cur_journal_tail = NO_DELTAS; + new_alloc_replay_from = NO_DELTAS; } auto tail = segment_tail_t{ close_seg_info.seq, close_segment_id, cur_journal_tail, + new_alloc_replay_from, current_segment_nonce, type, close_seg_info.last_modified.time_since_epoch().count(), diff --git a/src/crimson/os/seastore/journal/segmented_journal.cc b/src/crimson/os/seastore/journal/segmented_journal.cc index 2413ba14c13..f9c979df05d 100644 --- a/src/crimson/os/seastore/journal/segmented_journal.cc +++ b/src/crimson/os/seastore/journal/segmented_journal.cc @@ -223,6 +223,7 @@ SegmentedJournal::replay_segment( return handler( locator, delta, + segment_provider.get_alloc_info_replay_from(), seastar::lowres_system_clock::time_point( seastar::lowres_system_clock::duration(commit_time))); }); diff --git a/src/crimson/os/seastore/seastore_types.h b/src/crimson/os/seastore/seastore_types.h index 404a4f48126..839b2bf8836 100644 --- a/src/crimson/os/seastore/seastore_types.h +++ b/src/crimson/os/seastore/seastore_types.h @@ -862,13 +862,14 @@ enum class extent_types_t : uint8_t { // the following two types are not extent types, // they are just used to indicates paddr allocation deltas ALLOC_INFO = 9, + ALLOC_TAIL = 10, // Test Block Types - TEST_BLOCK = 10, - TEST_BLOCK_PHYSICAL = 11, - BACKREF_INTERNAL = 12, - BACKREF_LEAF = 13, + TEST_BLOCK = 11, + TEST_BLOCK_PHYSICAL = 12, + BACKREF_INTERNAL = 13, + BACKREF_LEAF = 14, // None and the number of valid extent_types_t - NONE = 14, + NONE = 15, 
}; using extent_types_le_t = uint8_t; constexpr auto EXTENT_TYPES_MAX = static_cast(extent_types_t::NONE); @@ -1348,6 +1349,7 @@ struct segment_header_t { segment_id_t physical_segment_id; // debugging journal_seq_t journal_tail; + journal_seq_t alloc_replay_from; segment_nonce_t segment_nonce; segment_type_t type; @@ -1361,6 +1363,7 @@ struct segment_header_t { denc(v.segment_seq, p); denc(v.physical_segment_id, p); denc(v.journal_tail, p); + denc(v.alloc_replay_from, p); denc(v.segment_nonce, p); denc(v.type, p); DENC_FINISH(p); @@ -1373,6 +1376,7 @@ struct segment_tail_t { segment_id_t physical_segment_id; // debugging journal_seq_t journal_tail; + journal_seq_t alloc_replay_from; segment_nonce_t segment_nonce; segment_type_t type; @@ -1389,6 +1393,7 @@ struct segment_tail_t { denc(v.segment_seq, p); denc(v.physical_segment_id, p); denc(v.journal_tail, p); + denc(v.alloc_replay_from, p); denc(v.segment_nonce, p); denc(v.type, p); denc(v.last_modified, p); diff --git a/src/crimson/os/seastore/segment_cleaner.cc b/src/crimson/os/seastore/segment_cleaner.cc index a5c11fb76a3..1d0ed0a48f4 100644 --- a/src/crimson/os/seastore/segment_cleaner.cc +++ b/src/crimson/os/seastore/segment_cleaner.cc @@ -482,14 +482,28 @@ segment_id_t SegmentCleaner::allocate_segment( return NULL_SEG_ID; } -void SegmentCleaner::update_journal_tail_target(journal_seq_t target) +void SegmentCleaner::update_journal_tail_target( + journal_seq_t dirty_replay_from, + journal_seq_t alloc_replay_from) { + logger().debug( + "{}: {}, current dirty_extents_replay_from {}", + __func__, + dirty_replay_from, + dirty_extents_replay_from); + if (dirty_extents_replay_from == JOURNAL_SEQ_NULL + || dirty_replay_from > dirty_extents_replay_from) { + dirty_extents_replay_from = dirty_replay_from; + } + + update_alloc_info_replay_from(alloc_replay_from); + + journal_seq_t target = std::min(dirty_replay_from, alloc_replay_from); logger().debug( "{}: {}, current tail target {}", __func__, target, 
journal_tail_target); - assert(journal_tail_target == JOURNAL_SEQ_NULL || target >= journal_tail_target); if (journal_tail_target == JOURNAL_SEQ_NULL || target > journal_tail_target) { journal_tail_target = target; } @@ -497,6 +511,20 @@ void SegmentCleaner::update_journal_tail_target(journal_seq_t target) maybe_wake_gc_blocked_io(); } +void SegmentCleaner::update_alloc_info_replay_from( + journal_seq_t alloc_replay_from) +{ + logger().debug( + "{}: {}, current alloc_info_replay_from {}", + __func__, + alloc_replay_from, + alloc_info_replay_from); + if (alloc_info_replay_from == JOURNAL_SEQ_NULL + || alloc_replay_from > alloc_info_replay_from) { + alloc_info_replay_from = alloc_replay_from; + } +} + void SegmentCleaner::update_journal_tail_committed(journal_seq_t committed) { if (journal_tail_committed == JOURNAL_SEQ_NULL || @@ -876,6 +904,8 @@ SegmentCleaner::mount_ret SegmentCleaner::mount() journal_tail_target = JOURNAL_SEQ_NULL; journal_tail_committed = JOURNAL_SEQ_NULL; journal_head = JOURNAL_SEQ_NULL; + dirty_extents_replay_from = JOURNAL_SEQ_NULL; + alloc_info_replay_from = JOURNAL_SEQ_NULL; space_tracker.reset( detailed ? 
@@ -924,6 +954,12 @@ SegmentCleaner::mount_ret SegmentCleaner::mount() time_point last_rewritten(duration(tail.last_rewritten)); segments.update_last_modified_rewritten( segment_id, last_modified, last_rewritten); + if (tail.get_type() == segment_type_t::JOURNAL) { + update_journal_tail_committed(tail.journal_tail); + update_journal_tail_target( + tail.journal_tail, + tail.alloc_replay_from); + } init_mark_segment_closed( segment_id, header.segment_seq, @@ -955,23 +991,23 @@ SegmentCleaner::scan_extents_ret SegmentCleaner::scan_nonfull_segment( scan_extents_ret_bare& segment_set, segment_id_t segment_id) { - if (header.get_type() == segment_type_t::OOL) { - logger().info( - "SegmentCleaner::scan_nonfull_segment: out-of-line segment {}", - segment_id); + return seastar::do_with( + scan_valid_records_cursor({ + segments[segment_id].seq, + paddr_t::make_seg_paddr(segment_id, 0)}), + [this, segment_id, segment_header=header](auto& cursor) { return seastar::do_with( - scan_valid_records_cursor({ - segments[segment_id].seq, - paddr_t::make_seg_paddr(segment_id, 0)}), - [this, segment_id, header](auto& cursor) { - return seastar::do_with( - SegmentManagerGroup::found_record_handler_t([this, segment_id]( - record_locator_t locator, - const record_group_header_t& header, - const bufferlist& mdbuf - ) mutable -> SegmentManagerGroup::scan_valid_records_ertr::future<> { - LOG_PREFIX(SegmentCleaner::scan_nonfull_segment); - DEBUG("decodeing {} records", header.records); + SegmentManagerGroup::found_record_handler_t( + [this, segment_id, segment_header]( + record_locator_t locator, + const record_group_header_t& header, + const bufferlist& mdbuf + ) mutable -> SegmentManagerGroup::scan_valid_records_ertr::future<> { + LOG_PREFIX(SegmentCleaner::scan_nonfull_segment); + if (segment_header.get_type() == segment_type_t::OOL) { + DEBUG("out-of-line segment {}, decodeing {} records", + segment_id, + header.records); auto maybe_headers = try_decode_record_headers(header, mdbuf); if 
(!maybe_headers) { ERROR("unable to decode record headers for record group {}", @@ -997,36 +1033,44 @@ SegmentCleaner::scan_extents_ret SegmentCleaner::scan_nonfull_segment( segments.update_last_modified_rewritten(segment_id, {}, commit_time); } } - return seastar::now(); - }), - [&cursor, header, this](auto& handler) { - return sm_group->scan_valid_records( - cursor, - header.segment_nonce, - segments.get_segment_size(), - handler); + } else { + DEBUG("inline segment {}, decodeing {} records", + segment_id, + header.records); + auto maybe_record_deltas_list = try_decode_deltas( + header, mdbuf, locator.record_block_base); + if (!maybe_record_deltas_list) { + ERROR("unable to decode deltas for record {} at {}", + header, locator); + return crimson::ct_error::input_output_error::make(); + } + for (auto &record_deltas : *maybe_record_deltas_list) { + for (auto &[ctime, delta] : record_deltas.deltas) { + if (delta.type == extent_types_t::ALLOC_TAIL) { + journal_seq_t seq; + decode(seq, delta.bl); + update_alloc_info_replay_from(seq); + } + } + } } - ); - }).safe_then([this, segment_id, header](auto) { - init_mark_segment_closed( - segment_id, - header.segment_seq, - header.type); - return seastar::now(); - }); - } else if (header.get_type() == segment_type_t::JOURNAL) { - logger().info( - "SegmentCleaner::scan_nonfull_segment: journal segment {}", - segment_id); - segment_set.emplace_back(std::make_pair(segment_id, std::move(header))); - } else { - ceph_abort("unexpected segment type"); - } - init_mark_segment_closed( - segment_id, - header.segment_seq, - header.type); - return seastar::now(); + return seastar::now(); + }), + [&cursor, segment_header, this](auto& handler) { + return sm_group->scan_valid_records( + cursor, + segment_header.segment_nonce, + segments.get_segment_size(), + handler); + } + ); + }).safe_then([this, segment_id, header](auto) { + init_mark_segment_closed( + segment_id, + header.segment_seq, + header.type); + return seastar::now(); + }); } 
SegmentCleaner::release_ertr::future<> diff --git a/src/crimson/os/seastore/segment_cleaner.h b/src/crimson/os/seastore/segment_cleaner.h index 357b4bf7851..0070527b2b2 100644 --- a/src/crimson/os/seastore/segment_cleaner.h +++ b/src/crimson/os/seastore/segment_cleaner.h @@ -193,6 +193,10 @@ public: virtual segment_id_t allocate_segment( segment_seq_t seq, segment_type_t type) = 0; + virtual journal_seq_t get_dirty_extents_replay_from() const = 0; + + virtual journal_seq_t get_alloc_info_replay_from() const = 0; + virtual void close_segment(segment_id_t) = 0; virtual void update_journal_tail_committed(journal_seq_t tail_committed) = 0; @@ -583,6 +587,12 @@ private: /// target journal_tail for next fresh segment journal_seq_t journal_tail_target; + /// target replay_from for dirty extents + journal_seq_t dirty_extents_replay_from; + + /// target replay_from for alloc infos + journal_seq_t alloc_info_replay_from; + /// most recently committed journal_tail journal_seq_t journal_tail_committed; @@ -639,7 +649,20 @@ public: return sm_group.get(); } - void update_journal_tail_target(journal_seq_t target); + journal_seq_t get_dirty_extents_replay_from() const final { + return dirty_extents_replay_from; + } + + journal_seq_t get_alloc_info_replay_from() const final { + return alloc_info_replay_from; + } + + void update_journal_tail_target( + journal_seq_t dirty_replay_from, + journal_seq_t alloc_replay_from); + + void update_alloc_info_replay_from( + journal_seq_t alloc_replay_from); void init_mkfs(journal_seq_t head) { journal_tail_target = head; diff --git a/src/crimson/os/seastore/transaction.h b/src/crimson/os/seastore/transaction.h index 17479795df1..466f0b4a995 100644 --- a/src/crimson/os/seastore/transaction.h +++ b/src/crimson/os/seastore/transaction.h @@ -260,6 +260,10 @@ public: return src; } + bool is_cleaner_transaction() const { + return src >= Transaction::src_t::CLEANER_TRIM; + } + bool is_weak() const { return weak; } diff --git 
a/src/crimson/os/seastore/transaction_manager.cc b/src/crimson/os/seastore/transaction_manager.cc index 7251b07fac9..308a76e51cd 100644 --- a/src/crimson/os/seastore/transaction_manager.cc +++ b/src/crimson/os/seastore/transaction_manager.cc @@ -89,14 +89,21 @@ TransactionManager::mount_ertr::future<> TransactionManager::mount() return segment_cleaner->mount( ).safe_then([this] { return journal->replay( - [this](const auto &offsets, const auto &e, auto last_modified) { + [this]( + const auto &offsets, + const auto &e, + const journal_seq_t alloc_replay_from, + auto last_modified) + { auto start_seq = offsets.write_result.start_seq; segment_cleaner->update_journal_tail_target( - cache->get_oldest_dirty_from().value_or(start_seq)); + cache->get_oldest_dirty_from().value_or(start_seq), + cache->get_oldest_backref_dirty_from().value_or(start_seq)); return cache->replay_delta( start_seq, offsets.record_block_base, e, + alloc_replay_from, last_modified); }); }).safe_then([this] { @@ -405,7 +412,8 @@ TransactionManager::submit_transaction_direct( backref_manager->complete_transaction(tref, backref_to_clear, backref_to_link); segment_cleaner->update_journal_tail_target( - cache->get_oldest_dirty_from().value_or(start_seq)); + cache->get_oldest_dirty_from().value_or(start_seq), + cache->get_oldest_backref_dirty_from().value_or(start_seq)); return segment_cleaner->maybe_release_segment(tref); }).safe_then([FNAME, &tref] { SUBTRACET(seastore_t, "completed", tref); diff --git a/src/test/crimson/seastore/test_btree_lba_manager.cc b/src/test/crimson/seastore/test_btree_lba_manager.cc index 8e21f0affe8..bde23e84f35 100644 --- a/src/test/crimson/seastore/test_btree_lba_manager.cc +++ b/src/test/crimson/seastore/test_btree_lba_manager.cc @@ -79,6 +79,14 @@ struct btree_test_base : SegmentManagerGroup* get_segment_manager_group() final { return sms.get(); } + journal_seq_t get_dirty_extents_replay_from() const final { + return JOURNAL_SEQ_NULL; + } + + journal_seq_t 
get_alloc_info_replay_from() const final { + return JOURNAL_SEQ_NULL; + } + virtual void complete_commit(Transaction &t) {} seastar::future<> submit_transaction(TransactionRef t) { diff --git a/src/test/crimson/seastore/test_seastore_journal.cc b/src/test/crimson/seastore/test_seastore_journal.cc index e33211fd590..4641987810f 100644 --- a/src/test/crimson/seastore/test_seastore_journal.cc +++ b/src/test/crimson/seastore/test_seastore_journal.cc @@ -99,6 +99,14 @@ struct journal_test_t : seastar_test_suite_t, SegmentProvider { return tmp_info; } + journal_seq_t get_dirty_extents_replay_from() const final { + return JOURNAL_SEQ_NULL; + } + + journal_seq_t get_alloc_info_replay_from() const final { + return JOURNAL_SEQ_NULL; + } + segment_id_t allocate_segment( segment_seq_t seq, segment_type_t type @@ -184,7 +192,10 @@ struct journal_test_t : seastar_test_suite_t, SegmentProvider { replay( [&advance, &delta_checker] - (const auto &offsets, const auto &di, auto t) mutable { + (const auto &offsets, + const auto &di, + const journal_seq_t, + auto t) mutable { if (!delta_checker) { EXPECT_FALSE("No Deltas Left"); } -- 2.39.5