From 4fdc95b39ffe31c0301eeeef0e14b802638f7ec6 Mon Sep 17 00:00:00 2001 From: Yingxin Cheng Date: Thu, 21 Jul 2022 17:16:50 +0800 Subject: [PATCH] crimson/os/seastore: simplify journal tails * append latest dirty tail as journal delta, in order to simplify and drop journal_tail_committed in the cleaner. * simplify misc journal tails into alloc_tail and dirty_tail, with proper renaming. * move journal-tail recovery logic from cleaner to journal. Signed-off-by: Yingxin Cheng --- src/crimson/os/seastore/async_cleaner.cc | 142 +++++----------- src/crimson/os/seastore/async_cleaner.h | 86 ++++------ src/crimson/os/seastore/cache.cc | 25 +-- .../os/seastore/extent_placement_manager.cc | 2 +- .../os/seastore/journal/segment_allocator.cc | 32 ++-- .../os/seastore/journal/segmented_journal.cc | 157 +++++++++++++----- .../os/seastore/journal/segmented_journal.h | 10 +- src/crimson/os/seastore/seastore_types.cc | 10 +- src/crimson/os/seastore/seastore_types.h | 39 ++++- .../os/seastore/transaction_manager.cc | 8 +- .../seastore/test_btree_lba_manager.cc | 16 +- .../crimson/seastore/test_seastore_journal.cc | 16 +- 12 files changed, 288 insertions(+), 255 deletions(-) diff --git a/src/crimson/os/seastore/async_cleaner.cc b/src/crimson/os/seastore/async_cleaner.cc index df146d39075..c64adcd88c3 100644 --- a/src/crimson/os/seastore/async_cleaner.cc +++ b/src/crimson/os/seastore/async_cleaner.cc @@ -625,75 +625,42 @@ segment_id_t AsyncCleaner::allocate_segment( return NULL_SEG_ID; } -void AsyncCleaner::update_journal_tail_target( - journal_seq_t dirty_replay_from, - journal_seq_t alloc_replay_from) +void AsyncCleaner::update_journal_tails( + journal_seq_t dirty_tail, + journal_seq_t alloc_tail) { - LOG_PREFIX(AsyncCleaner::update_journal_tail_target); + LOG_PREFIX(AsyncCleaner::update_journal_tails); if (disable_trim) return; - assert(dirty_replay_from.offset.get_addr_type() != addr_types_t::RANDOM_BLOCK); - assert(alloc_replay_from.offset.get_addr_type() != addr_types_t::RANDOM_BLOCK); - if (dirty_extents_replay_from == JOURNAL_SEQ_NULL - || dirty_replay_from > dirty_extents_replay_from) { - DEBUG("dirty_extents_replay_from={} => {}", - dirty_extents_replay_from, dirty_replay_from); - dirty_extents_replay_from = dirty_replay_from; - } - - update_alloc_info_replay_from(alloc_replay_from); + assert(dirty_tail.offset.get_addr_type() != addr_types_t::RANDOM_BLOCK); + assert(alloc_tail.offset.get_addr_type() != addr_types_t::RANDOM_BLOCK); - journal_seq_t target = std::min(dirty_replay_from, alloc_replay_from); - ceph_assert(target != JOURNAL_SEQ_NULL); + ceph_assert(dirty_tail != JOURNAL_SEQ_NULL); + ceph_assert(alloc_tail != JOURNAL_SEQ_NULL); ceph_assert(journal_head == JOURNAL_SEQ_NULL || - journal_head >= target); - if (journal_tail_target == JOURNAL_SEQ_NULL || - target > journal_tail_target) { - if (!init_complete || - journal_tail_target.segment_seq == target.segment_seq) { - DEBUG("journal_tail_target={} => {}", journal_tail_target, target); + (journal_head >= dirty_tail && journal_head >= alloc_tail)); + + if (journal_dirty_tail == JOURNAL_SEQ_NULL || + dirty_tail > journal_dirty_tail) { + if (journal_dirty_tail.segment_seq == dirty_tail.segment_seq) { + DEBUG("journal_dirty_tail {} => {}", journal_dirty_tail, dirty_tail); } else { - INFO("journal_tail_target={} => {}", journal_tail_target, target); + INFO("journal_dirty_tail {} => {}", journal_dirty_tail, dirty_tail); } - journal_tail_target = target; + journal_dirty_tail = dirty_tail; } - gc_process.maybe_wake_on_space_used(); - maybe_wake_gc_blocked_io(); -} -void AsyncCleaner::update_alloc_info_replay_from( - journal_seq_t alloc_replay_from) -{ - LOG_PREFIX(AsyncCleaner::update_alloc_info_replay_from); - if (alloc_info_replay_from == JOURNAL_SEQ_NULL - || alloc_replay_from > alloc_info_replay_from) { - DEBUG("alloc_info_replay_from={} => {}", - alloc_info_replay_from, alloc_replay_from); - alloc_info_replay_from = alloc_replay_from; - } -} - -void AsyncCleaner::update_journal_tail_committed(journal_seq_t committed) -{ - LOG_PREFIX(AsyncCleaner::update_journal_tail_committed); - assert(committed.offset.get_addr_type() != addr_types_t::RANDOM_BLOCK); - if (committed == JOURNAL_SEQ_NULL) { - return; + if (journal_alloc_tail == JOURNAL_SEQ_NULL || + alloc_tail > journal_alloc_tail) { + if (journal_alloc_tail.segment_seq == alloc_tail.segment_seq) { + DEBUG("journal_alloc_tail {} => {}", journal_alloc_tail, alloc_tail); + } else { + INFO("journal_alloc_tail {} => {}", journal_alloc_tail, alloc_tail); + } + journal_alloc_tail = alloc_tail; } - ceph_assert(journal_head == JOURNAL_SEQ_NULL || - journal_head >= committed); - if (journal_tail_committed == JOURNAL_SEQ_NULL || - committed > journal_tail_committed) { - DEBUG("update journal_tail_committed={} => {}", - journal_tail_committed, committed); - journal_tail_committed = committed; - } - if (journal_tail_target == JOURNAL_SEQ_NULL || - committed > journal_tail_target) { - DEBUG("update journal_tail_target={} => {}", - journal_tail_target, committed); - journal_tail_target = committed; - } + gc_process.maybe_wake_on_space_used(); + maybe_wake_gc_blocked_io(); } void AsyncCleaner::close_segment(segment_id_t segment) @@ -819,7 +786,7 @@ AsyncCleaner::gc_cycle_ret AsyncCleaner::GCProcess::run() AsyncCleaner::gc_cycle_ret AsyncCleaner::do_gc_cycle() { if (gc_should_trim_backref()) { - return gc_trim_backref(get_backref_tail() + return gc_trim_backref(get_alloc_tail_target() ).safe_then([](auto) { return seastar::now(); }).handle_error( @@ -884,7 +851,7 @@ AsyncCleaner::gc_trim_backref(journal_seq_t limit) { AsyncCleaner::gc_trim_journal_ret AsyncCleaner::gc_trim_journal() { - return gc_trim_backref(get_dirty_tail() + return gc_trim_backref(get_dirty_tail_target() ).safe_then([this](auto seq) { return repeat_eagain([this, seq=std::move(seq)]() mutable { return ecb->with_transaction_intr( @@ -1086,10 +1053,8 @@ AsyncCleaner::mount_ret AsyncCleaner::mount() INFO("{} segment managers", sms.size()); init_complete = false; stats = {}; - journal_tail_target = JOURNAL_SEQ_NULL; - journal_tail_committed = JOURNAL_SEQ_NULL; - dirty_extents_replay_from = JOURNAL_SEQ_NULL; - alloc_info_replay_from = JOURNAL_SEQ_NULL; + journal_alloc_tail = JOURNAL_SEQ_NULL; + journal_dirty_tail = JOURNAL_SEQ_NULL; space_tracker.reset( detailed ? @@ -1184,31 +1149,8 @@ AsyncCleaner::scan_extents_ret AsyncCleaner::scan_no_tail_segment( ) mutable -> SegmentManagerGroup::scan_valid_records_ertr::future<> { LOG_PREFIX(AsyncCleaner::scan_no_tail_segment); - if (segment_header.get_type() == segment_type_t::OOL) { - DEBUG("out-of-line segment {}, decodeing {} records", - segment_id, - record_group_header.records); - } else { - DEBUG("inline segment {}, decodeing {} records", - segment_id, - record_group_header.records); - auto maybe_record_deltas_list = try_decode_deltas( - record_group_header, mdbuf, locator.record_block_base); - if (!maybe_record_deltas_list) { - ERROR("unable to decode deltas for record {} at {}", - record_group_header, locator); - return crimson::ct_error::input_output_error::make(); - } - for (auto &record_deltas : *maybe_record_deltas_list) { - for (auto &[ctime, delta] : record_deltas.deltas) { - if (delta.type == extent_types_t::ALLOC_TAIL) { - journal_seq_t seq; - decode(seq, delta.bl); - update_alloc_info_replay_from(seq); - } - } - } - } + DEBUG("{} {}, decoding {} records", + segment_id, segment_header.get_type(), record_group_header.records); auto maybe_headers = try_decode_record_headers( record_group_header, mdbuf); @@ -1389,7 +1331,7 @@ segment_id_t AsyncCleaner::get_next_reclaim_segment() const } for (auto& [_id, segment_info] : segments) { if (segment_info.is_closed() && - !segment_info.is_in_journal(journal_tail_committed)) { + !segment_info.is_in_journal(get_journal_tail())) { double benefit_cost = calc_gc_benefit_cost(_id, now_time, bound_time); if (benefit_cost > max_benefit_cost) { id = _id; @@ -1431,10 +1373,11 @@ void AsyncCleaner::log_gc_state(const char *caller) const "should_block_on_reclaim {}, " "gc_should_reclaim_space {}, " "journal_head {}, " - "journal_tail_target {}, " - "journal_tail_commit {}, " - "dirty_tail {}, " - "dirty_tail_limit {}, " + "journal_alloc_tail {}, " + "journal_dirty_tail {}, " + "alloc_tail_target {}, " + "dirty_tail_target {}, " + "tail_limit {}, " "gc_should_trim_journal {}, ", caller, segments.get_num_empty(), @@ -1452,10 +1395,11 @@ void AsyncCleaner::log_gc_state(const char *caller) const should_block_on_reclaim(), gc_should_reclaim_space(), journal_head, - journal_tail_target, - journal_tail_committed, - get_dirty_tail(), - get_dirty_tail_limit(), + journal_alloc_tail, + journal_dirty_tail, + get_alloc_tail_target(), + get_dirty_tail_target(), + get_tail_limit(), gc_should_trim_journal() ); } diff --git a/src/crimson/os/seastore/async_cleaner.h b/src/crimson/os/seastore/async_cleaner.h index 23e939a2dc0..ba44faef840 100644 --- a/src/crimson/os/seastore/async_cleaner.h +++ b/src/crimson/os/seastore/async_cleaner.h @@ -271,21 +271,24 @@ class SegmentProvider { public: virtual void set_journal_head(journal_seq_t) = 0; - virtual journal_seq_t get_journal_tail_target() const = 0; + journal_seq_t get_journal_tail() const { + return std::min(get_alloc_tail(), get_dirty_tail()); + } + + virtual journal_seq_t get_dirty_tail() const = 0; + + virtual journal_seq_t get_alloc_tail() const = 0; + + virtual void update_journal_tails( + journal_seq_t dirty_tail, journal_seq_t alloc_tail) = 0; virtual const segment_info_t& get_seg_info(segment_id_t id) const = 0; virtual segment_id_t allocate_segment( segment_seq_t, segment_type_t, data_category_t, reclaim_gen_t) = 0; - virtual journal_seq_t get_dirty_extents_replay_from() const = 0; - - virtual journal_seq_t get_alloc_info_replay_from() const = 0; - virtual void close_segment(segment_id_t) = 0; - virtual void update_journal_tail_committed(journal_seq_t tail_committed) = 0; - virtual void update_segment_avail_bytes(segment_type_t, paddr_t) = 0; virtual void update_modify_time( @@ -721,17 +724,9 @@ private: seastar::metrics::metric_group metrics; void register_metrics(); - /// target journal_tail for next fresh segment - journal_seq_t journal_tail_target; + journal_seq_t journal_alloc_tail; - /// target replay_from for dirty extents - journal_seq_t dirty_extents_replay_from; - - /// target replay_from for alloc infos - journal_seq_t alloc_info_replay_from; - - /// most recently committed journal_tail - journal_seq_t journal_tail_committed; + journal_seq_t journal_dirty_tail; /// head of journal journal_seq_t journal_head; @@ -770,10 +765,6 @@ public: /* * SegmentProvider interfaces */ - journal_seq_t get_journal_tail_target() const final { - return journal_tail_target; - } - const segment_info_t& get_seg_info(segment_id_t id) const final { return segments[id]; } @@ -791,8 +782,6 @@ public: void close_segment(segment_id_t segment) final; - void update_journal_tail_committed(journal_seq_t committed) final; - void update_segment_avail_bytes(segment_type_t type, paddr_t offset) final { segments.update_written_to(type, offset); if (type == segment_type_t::JOURNAL) { @@ -812,25 +801,21 @@ public: return sm_group.get(); } - journal_seq_t get_dirty_extents_replay_from() const final { - return dirty_extents_replay_from; + journal_seq_t get_dirty_tail() const final { + return journal_dirty_tail; } - journal_seq_t get_alloc_info_replay_from() const final { - return alloc_info_replay_from; + journal_seq_t get_alloc_tail() const final { + return journal_alloc_tail; } - void update_journal_tail_target( - journal_seq_t dirty_replay_from, - journal_seq_t alloc_replay_from); - - void update_alloc_info_replay_from( - journal_seq_t alloc_replay_from); + void update_journal_tails( + journal_seq_t dirty_tail, journal_seq_t alloc_tail) final; void init_mkfs() { ceph_assert(disable_trim || journal_head != JOURNAL_SEQ_NULL); - journal_tail_target = journal_head; - journal_tail_committed = journal_head; + journal_alloc_tail = journal_head; + journal_dirty_tail = journal_head; } using release_ertr = SegmentManagerGroup::release_ertr; @@ -947,7 +932,7 @@ private: Transaction &t, journal_seq_t limit); - journal_seq_t get_dirty_tail() const { + journal_seq_t get_dirty_tail_target() const { auto ret = journal_head; ceph_assert(ret != JOURNAL_SEQ_NULL); if (ret.segment_seq >= config.target_journal_segments) { @@ -959,7 +944,7 @@ private: return ret; } - journal_seq_t get_dirty_tail_limit() const { + journal_seq_t get_tail_limit() const { auto ret = journal_head; ceph_assert(ret != JOURNAL_SEQ_NULL); if (ret.segment_seq >= config.max_journal_segments) { @@ -971,7 +956,7 @@ private: return ret; } - journal_seq_t get_backref_tail() const { + journal_seq_t get_alloc_tail_target() const { auto ret = journal_head; ceph_assert(ret != JOURNAL_SEQ_NULL); if (ret.segment_seq >= config.target_backref_inflight_segments) { @@ -1149,12 +1134,13 @@ private: if (!init_complete) { return 0; } - if (journal_tail_committed == JOURNAL_SEQ_NULL) { + auto journal_tail = get_journal_tail(); + if (journal_tail == JOURNAL_SEQ_NULL) { return segments.get_num_type_journal(); } assert(journal_head != JOURNAL_SEQ_NULL); - assert(journal_head.segment_seq >= journal_tail_committed.segment_seq); - return journal_head.segment_seq + 1 - journal_tail_committed.segment_seq; + assert(journal_head.segment_seq >= journal_tail.segment_seq); + return journal_head.segment_seq + 1 - journal_tail.segment_seq; } std::size_t get_segments_in_journal_closed() const { auto in_journal = get_segments_in_journal(); @@ -1217,26 +1203,26 @@ private: */ std::size_t get_dirty_journal_size() const { if (journal_head == JOURNAL_SEQ_NULL || - dirty_extents_replay_from == JOURNAL_SEQ_NULL) { + journal_dirty_tail == JOURNAL_SEQ_NULL) { return 0; } - return (journal_head.segment_seq - dirty_extents_replay_from.segment_seq) * + return (journal_head.segment_seq - journal_dirty_tail.segment_seq) * segments.get_segment_size() + journal_head.offset.as_seg_paddr().get_segment_off() - segments.get_segment_size() - - dirty_extents_replay_from.offset.as_seg_paddr().get_segment_off(); + journal_dirty_tail.offset.as_seg_paddr().get_segment_off(); } std::size_t get_alloc_journal_size() const { if (journal_head == JOURNAL_SEQ_NULL || - alloc_info_replay_from == JOURNAL_SEQ_NULL) { + journal_alloc_tail == JOURNAL_SEQ_NULL) { return 0; } - return (journal_head.segment_seq - alloc_info_replay_from.segment_seq) * + return (journal_head.segment_seq - journal_alloc_tail.segment_seq) * segments.get_segment_size() + journal_head.offset.as_seg_paddr().get_segment_off() - segments.get_segment_size() - - alloc_info_replay_from.offset.as_seg_paddr().get_segment_off(); + journal_alloc_tail.offset.as_seg_paddr().get_segment_off(); } /** @@ -1246,7 +1232,7 @@ private: */ bool should_block_on_trim() const { if (disable_trim) return false; - return get_dirty_tail_limit() > journal_tail_target; + return get_tail_limit() > get_journal_tail(); } bool should_block_on_reclaim() const { @@ -1311,11 +1297,11 @@ private: * Encapsulates logic for whether gc should be reclaiming segment space. */ bool gc_should_trim_journal() const { - return get_dirty_tail() > journal_tail_target; + return get_dirty_tail_target() > journal_dirty_tail; } bool gc_should_trim_backref() const { - return get_backref_tail() > alloc_info_replay_from; + return get_alloc_tail_target() > journal_alloc_tail; } /** * gc_should_run diff --git a/src/crimson/os/seastore/cache.cc b/src/crimson/os/seastore/cache.cc index 12282a30396..be7bd75cd0c 100644 --- a/src/crimson/os/seastore/cache.cc +++ b/src/crimson/os/seastore/cache.cc @@ -150,7 +150,7 @@ void Cache::register_metrics() {extent_types_t::OBJECT_DATA_BLOCK, sm::label_instance("ext", "OBJECT_DATA_BLOCK")}, {extent_types_t::RETIRED_PLACEHOLDER, sm::label_instance("ext", "RETIRED_PLACEHOLDER")}, {extent_types_t::ALLOC_INFO, sm::label_instance("ext", "ALLOC_INFO")}, - {extent_types_t::ALLOC_TAIL, sm::label_instance("ext", "ALLOC_TAIL")}, + {extent_types_t::JOURNAL_TAIL, sm::label_instance("ext", "JOURNAL_TAIL")}, {extent_types_t::TEST_BLOCK, sm::label_instance("ext", "TEST_BLOCK")}, {extent_types_t::TEST_BLOCK_PHYSICAL, sm::label_instance("ext", "TEST_BLOCK_PHYSICAL")}, {extent_types_t::BACKREF_INTERNAL, sm::label_instance("ext", "BACKREF_INTERNAL")}, @@ -1045,7 +1045,7 @@ record_t Cache::prepare_record( t.read_set.clear(); t.write_set.clear(); - record_t record; + record_t record(trans_src); auto commit_time = seastar::lowres_system_clock::now(); // Add new copy of mutated blocks, set_io_wait to block until written @@ -1248,10 +1248,15 @@ record_t Cache::prepare_record( } if (is_cleaner_transaction(trans_src)) { + assert(cleaner != nullptr); + auto tails = journal_tail_delta_t{ + get_oldest_backref_dirty_from().value_or(JOURNAL_SEQ_NULL), + get_oldest_dirty_from().value_or(JOURNAL_SEQ_NULL) + }; bufferlist bl; - encode(get_oldest_backref_dirty_from().value_or(JOURNAL_SEQ_NULL), bl); + encode(tails, bl); delta_info_t delta; - delta.type = extent_types_t::ALLOC_TAIL; + delta.type = extent_types_t::JOURNAL_TAIL; delta.bl = bl; record.push_back(std::move(delta)); } @@ -1593,11 +1598,11 @@ Cache::replay_delta( journal_seq_t journal_seq, paddr_t record_base, const delta_info_t &delta, - const journal_seq_t &alloc_replay_from, + const journal_seq_t &alloc_tail, sea_time_point &modify_time) { LOG_PREFIX(Cache::replay_delta); - assert(alloc_replay_from != JOURNAL_SEQ_NULL); + assert(alloc_tail != JOURNAL_SEQ_NULL); ceph_assert(modify_time != NULL_TIME); if (delta.type == extent_types_t::ROOT) { TRACE("replay root delta at {} {}, remove extent ... -- {}, prv_root={}", @@ -1612,9 +1617,9 @@ Cache::replay_delta( add_extent(root); return replay_delta_ertr::now(); } else if (delta.type == extent_types_t::ALLOC_INFO) { - if (journal_seq < alloc_replay_from) { - DEBUG("journal_seq {} < alloc_replay_from {}, don't replay {}", - journal_seq, alloc_replay_from, delta); + if (journal_seq < alloc_tail) { + DEBUG("journal_seq {} < alloc_tail {}, don't replay {}", + journal_seq, alloc_tail, delta); return replay_delta_ertr::now(); } alloc_delta_t alloc_delta; @@ -1638,7 +1643,7 @@ Cache::replay_delta( if (!backref_list.empty()) backref_batch_update(std::move(backref_list), journal_seq); return replay_delta_ertr::now(); - } else if (delta.type == extent_types_t::ALLOC_TAIL) { + } else if (delta.type == extent_types_t::JOURNAL_TAIL) { // this delta should have been dealt with during segment cleaner mounting return replay_delta_ertr::now(); } else { diff --git a/src/crimson/os/seastore/extent_placement_manager.cc b/src/crimson/os/seastore/extent_placement_manager.cc index 48017f8e39c..78c707bb713 100644 --- a/src/crimson/os/seastore/extent_placement_manager.cc +++ b/src/crimson/os/seastore/extent_placement_manager.cc @@ -81,7 +81,7 @@ SegmentedOolWriter::do_write( return do_write(t, extents); }); } - record_t record; + record_t record(TRANSACTION_TYPE_NULL); std::list pending_extents; auto commit_time = seastar::lowres_system_clock::now(); diff --git a/src/crimson/os/seastore/journal/segment_allocator.cc b/src/crimson/os/seastore/journal/segment_allocator.cc index 6ad2ab1a53e..1d11155f948 100644 --- a/src/crimson/os/seastore/journal/segment_allocator.cc +++ b/src/crimson/os/seastore/journal/segment_allocator.cc @@ -54,34 +54,34 @@ SegmentAllocator::do_open(bool is_mkfs) ).safe_then([this, is_mkfs, FNAME, new_segment_seq](auto sref) { // initialize new segment segment_id_t segment_id = sref->get_segment_id(); - journal_seq_t new_journal_tail; - journal_seq_t new_alloc_replay_from; + journal_seq_t dirty_tail; + journal_seq_t alloc_tail; if (type == segment_type_t::JOURNAL) { - new_journal_tail = segment_provider.get_journal_tail_target(); - new_alloc_replay_from = segment_provider.get_alloc_info_replay_from(); + dirty_tail = segment_provider.get_dirty_tail(); + alloc_tail = segment_provider.get_alloc_tail(); if (is_mkfs) { - ceph_assert(new_journal_tail == JOURNAL_SEQ_NULL); - ceph_assert(new_alloc_replay_from == JOURNAL_SEQ_NULL); + ceph_assert(dirty_tail == JOURNAL_SEQ_NULL); + ceph_assert(alloc_tail == JOURNAL_SEQ_NULL); auto mkfs_seq = journal_seq_t{ new_segment_seq, paddr_t::make_seg_paddr(segment_id, 0) }; - new_journal_tail = mkfs_seq; - new_alloc_replay_from = mkfs_seq; + dirty_tail = mkfs_seq; + alloc_tail = mkfs_seq; } else { - ceph_assert(new_journal_tail != JOURNAL_SEQ_NULL); - ceph_assert(new_alloc_replay_from != JOURNAL_SEQ_NULL); + ceph_assert(dirty_tail != JOURNAL_SEQ_NULL); + ceph_assert(alloc_tail != JOURNAL_SEQ_NULL); } } else { // OOL ceph_assert(!is_mkfs); - new_journal_tail = NO_DELTAS; - new_alloc_replay_from = NO_DELTAS; + dirty_tail = NO_DELTAS; + alloc_tail = NO_DELTAS; } auto header = segment_header_t{ new_segment_seq, segment_id, - new_journal_tail, - new_alloc_replay_from, + dirty_tail, + alloc_tail, current_segment_nonce, type, category, @@ -116,13 +116,9 @@ SegmentAllocator::do_open(bool is_mkfs) ).safe_then([this, FNAME, new_journal_seq, - new_journal_tail, sref=std::move(sref)]() mutable { ceph_assert(!current_segment); current_segment = std::move(sref); - if (type == segment_type_t::JOURNAL) { - segment_provider.update_journal_tail_committed(new_journal_tail); - } DEBUG("{} rolled new segment id={}", print_name, current_segment->get_segment_id()); ceph_assert(new_journal_seq.segment_seq == diff --git a/src/crimson/os/seastore/journal/segmented_journal.cc b/src/crimson/os/seastore/journal/segmented_journal.cc index 9fc3ec7bdcb..24ada40f736 100644 --- a/src/crimson/os/seastore/journal/segmented_journal.cc +++ b/src/crimson/os/seastore/journal/segmented_journal.cc @@ -100,43 +100,126 @@ SegmentedJournal::prep_replay_segments( } }); - auto journal_tail = segments.rbegin()->second.journal_tail; - segment_provider.update_journal_tail_committed(journal_tail); - auto journal_tail_paddr = journal_tail.offset; - ceph_assert(journal_tail != JOURNAL_SEQ_NULL); - ceph_assert(journal_tail_paddr != P_ADDR_NULL); - auto from = std::find_if( - segments.begin(), - segments.end(), - [&journal_tail_paddr](const auto &seg) -> bool { - auto& seg_addr = journal_tail_paddr.as_seg_paddr(); - return seg.first == seg_addr.get_segment_id(); - }); - if (from->second.segment_seq != journal_tail.segment_seq) { - ERROR("journal_tail {} does not match {}", - journal_tail, from->second); - ceph_abort(); - } + auto last_segment_id = segments.rbegin()->first; + auto last_header = segments.rbegin()->second; + return scan_last_segment(last_segment_id, last_header + ).safe_then([this, FNAME, segments=std::move(segments)] { + auto journal_tail = segment_provider.get_journal_tail(); + auto journal_tail_paddr = journal_tail.offset; + ceph_assert(journal_tail != JOURNAL_SEQ_NULL); + ceph_assert(journal_tail_paddr != P_ADDR_NULL); + auto from = std::find_if( + segments.begin(), + segments.end(), + [&journal_tail_paddr](const auto &seg) -> bool { + auto& seg_addr = journal_tail_paddr.as_seg_paddr(); + return seg.first == seg_addr.get_segment_id(); + }); + if (from->second.segment_seq != journal_tail.segment_seq) { + ERROR("journal_tail {} does not match {}", + journal_tail, from->second); + ceph_abort(); + } - auto num_segments = segments.end() - from; - INFO("{} segments to replay from {}", - num_segments, journal_tail); - auto ret = replay_segments_t(num_segments); - std::transform( - from, segments.end(), ret.begin(), - [this](const auto &p) { - auto ret = journal_seq_t{ - p.second.segment_seq, - paddr_t::make_seg_paddr( - p.first, - sm_group.get_block_size()) - }; - return std::make_pair(ret, p.second); - }); - ret[0].first.offset = journal_tail_paddr; - return prep_replay_segments_fut( - prep_replay_segments_ertr::ready_future_marker{}, - std::move(ret)); + auto num_segments = segments.end() - from; + INFO("{} segments to replay from {}", + num_segments, journal_tail); + auto ret = replay_segments_t(num_segments); + std::transform( + from, segments.end(), ret.begin(), + [this](const auto &p) { + auto ret = journal_seq_t{ + p.second.segment_seq, + paddr_t::make_seg_paddr( + p.first, + sm_group.get_block_size()) + }; + return std::make_pair(ret, p.second); + }); + ret[0].first.offset = journal_tail_paddr; + return prep_replay_segments_fut( + replay_ertr::ready_future_marker{}, + std::move(ret)); + }); +} + +SegmentedJournal::scan_last_segment_ertr::future<> +SegmentedJournal::scan_last_segment( + const segment_id_t &segment_id, + const segment_header_t &segment_header) +{ + LOG_PREFIX(SegmentedJournal::scan_last_segment); + assert(segment_id == segment_header.physical_segment_id); + segment_provider.update_journal_tails( + segment_header.dirty_tail, segment_header.alloc_tail); + auto seq = journal_seq_t{ + segment_header.segment_seq, + paddr_t::make_seg_paddr(segment_id, 0) + }; + INFO("scanning {} for journal tail deltas", seq); + return seastar::do_with( + scan_valid_records_cursor(seq), + SegmentManagerGroup::found_record_handler_t( + [FNAME, this]( + record_locator_t locator, + const record_group_header_t& record_group_header, + const bufferlist& mdbuf + ) -> SegmentManagerGroup::scan_valid_records_ertr::future<> + { + DEBUG("decoding {} at {}", record_group_header, locator); + bool has_tail_delta = false; + auto maybe_headers = try_decode_record_headers( + record_group_header, mdbuf); + if (!maybe_headers) { + // This should be impossible, we did check the crc on the mdbuf + ERROR("unable to decode headers from {} at {}", + record_group_header, locator); + ceph_abort(); + } + for (auto &record_header : *maybe_headers) { + ceph_assert(is_valid_transaction(record_header.type)); + if (is_cleaner_transaction(record_header.type)) { + has_tail_delta = true; + } + } + if (has_tail_delta) { + bool found_delta = false; + auto maybe_record_deltas_list = try_decode_deltas( + record_group_header, mdbuf, locator.record_block_base); + if (!maybe_record_deltas_list) { + ERROR("unable to decode deltas from {} at {}", + record_group_header, locator); + ceph_abort(); + } + for (auto &record_deltas : *maybe_record_deltas_list) { + for (auto &[ctime, delta] : record_deltas.deltas) { + if (delta.type == extent_types_t::JOURNAL_TAIL) { + found_delta = true; + journal_tail_delta_t tail_delta; + decode(tail_delta, delta.bl); + if (tail_delta.alloc_tail == JOURNAL_SEQ_NULL) { + tail_delta.alloc_tail = locator.write_result.start_seq; + } + if (tail_delta.dirty_tail == JOURNAL_SEQ_NULL) { + tail_delta.dirty_tail = locator.write_result.start_seq; + } + segment_provider.update_journal_tails( + tail_delta.dirty_tail, tail_delta.alloc_tail); + } + } + } + ceph_assert(found_delta); + } + return seastar::now(); + }), + [this, nonce=segment_header.segment_nonce](auto &cursor, auto &handler) + { + return sm_group.scan_valid_records( + cursor, + nonce, + std::numeric_limits::max(), + handler).discard_result(); + }); } SegmentedJournal::replay_ertr::future<> @@ -226,7 +309,7 @@ SegmentedJournal::replay_segment( return handler( locator, delta, - segment_provider.get_alloc_info_replay_from(), + segment_provider.get_alloc_tail(), modify_time); }); }); diff --git a/src/crimson/os/seastore/journal/segmented_journal.h b/src/crimson/os/seastore/journal/segmented_journal.h index 8c6485a1302..3cf935382b6 100644 --- a/src/crimson/os/seastore/journal/segmented_journal.h +++ b/src/crimson/os/seastore/journal/segmented_journal.h @@ -65,14 +65,16 @@ private: /// return ordered vector of segments to replay using replay_segments_t = std::vector< std::pair>; - using prep_replay_segments_ertr = crimson::errorator< - crimson::ct_error::input_output_error - >; - using prep_replay_segments_fut = prep_replay_segments_ertr::future< + using prep_replay_segments_fut = replay_ertr::future< replay_segments_t>; prep_replay_segments_fut prep_replay_segments( std::vector> segments); + /// scan the last segment for tail deltas + using scan_last_segment_ertr = replay_ertr; + scan_last_segment_ertr::future<> scan_last_segment( + const segment_id_t&, const segment_header_t&); + /// replays records starting at start through end of segment replay_ertr::future<> replay_segment( diff --git a/src/crimson/os/seastore/seastore_types.cc b/src/crimson/os/seastore/seastore_types.cc index 6ef15f6fba6..a6191f7dfbc 100644 --- a/src/crimson/os/seastore/seastore_types.cc +++ b/src/crimson/os/seastore/seastore_types.cc @@ -269,7 +269,8 @@ std::ostream &operator<<(std::ostream &out, const segment_header_t &header) return out << "segment_header_t(" << "segment_seq=" << segment_seq_printer_t{header.segment_seq} << ", segment_id=" << header.physical_segment_id - << ", journal_tail=" << header.journal_tail + << ", dirty_tail=" << header.dirty_tail + << ", alloc_tail=" << header.alloc_tail << ", segment_nonce=" << header.segment_nonce << ", type=" << header.type << ", category=" << header.category @@ -341,7 +342,8 @@ std::ostream &operator<<(std::ostream& out, const record_size_t& rsize) std::ostream &operator<<(std::ostream& out, const record_t& r) { return out << "record_t(" - << "num_extents=" << r.extents.size() + << "type=" << r.type + << ", num_extents=" << r.extents.size() << ", num_deltas=" << r.deltas.size() << ", modify_time=" << sea_time_point_printer_t{r.modify_time} << ")"; @@ -350,7 +352,8 @@ std::ostream &operator<<(std::ostream& out, const record_t& r) std::ostream &operator<<(std::ostream& out, const record_header_t& r) { return out << "record_header_t(" - << "num_extents=" << r.extents + << "type=" << r.type + << ", num_extents=" << r.extents << ", num_deltas=" << r.deltas << ", modify_time=" << mod_time_point_printer_t{r.modify_time} << ")"; @@ -450,6 +453,7 @@ ceph::bufferlist encode_records( for (auto& r: record_group.records) { record_header_t rheader{ + r.type, (extent_len_t)r.deltas.size(), (extent_len_t)r.extents.size(), timepoint_to_mod(r.modify_time) diff --git a/src/crimson/os/seastore/seastore_types.h b/src/crimson/os/seastore/seastore_types.h index 9e5dbc0543e..c76c85172c6 100644 --- a/src/crimson/os/seastore/seastore_types.h +++ b/src/crimson/os/seastore/seastore_types.h @@ -931,7 +931,7 @@ enum class extent_types_t : uint8_t { // the following two types are not extent types, // they are just used to indicates paddr allocation deltas ALLOC_INFO = 9, - ALLOC_TAIL = 10, + JOURNAL_TAIL = 10, // Test Block Types TEST_BLOCK = 11, TEST_BLOCK_PHYSICAL = 12, @@ -1107,6 +1107,19 @@ struct delta_info_t { std::ostream &operator<<(std::ostream &out, const delta_info_t &delta); +/* contains the latest journal tail information */ +struct journal_tail_delta_t { + journal_seq_t alloc_tail; + journal_seq_t dirty_tail; + + DENC(journal_tail_delta_t, v, p) { + DENC_START(1, 1, p); + denc(v.alloc_tail, p); + denc(v.dirty_tail, p); + DENC_FINISH(p); + } +}; + class object_data_t { laddr_t reserved_data_base = L_ADDR_NULL; extent_len_t reserved_data_len = 0; @@ -1475,14 +1488,16 @@ using segment_nonce_t = uint32_t; * Every segment contains and encode segment_header_t in the first block. * Our strategy for finding the journal replay point is: * 1) Find the segment with the highest journal_segment_seq - * 2) Replay starting at record located at that segment's journal_tail + * 2) Get dirty_tail and alloc_tail from the segment header + * 3) Scan forward to update tails from journal_tail_delta_t + * 4) Replay from the latest tails */ struct segment_header_t { segment_seq_t segment_seq; segment_id_t physical_segment_id; // debugging - journal_seq_t journal_tail; - journal_seq_t alloc_replay_from; + journal_seq_t dirty_tail; + journal_seq_t alloc_tail; segment_nonce_t segment_nonce; segment_type_t type; @@ -1498,8 +1513,8 @@ struct segment_header_t { DENC_START(1, 1, p); denc(v.segment_seq, p); denc(v.physical_segment_id, p); - denc(v.journal_tail, p); - denc(v.alloc_replay_from, p); + denc(v.dirty_tail, p); + denc(v.alloc_tail, p); denc(v.segment_nonce, p); denc(v.type, p); denc(v.category, p); @@ -1585,12 +1600,18 @@ WRITE_EQ_OPERATORS_2(record_size_t, plain_mdlength, dlength); std::ostream &operator<<(std::ostream&, const record_size_t&); struct record_t { + transaction_type_t type = TRANSACTION_TYPE_NULL; std::vector extents; std::vector deltas; record_size_t size; sea_time_point modify_time = NULL_TIME; - record_t() = default; + record_t(transaction_type_t type) : type{type} { } + + // unit test only + record_t() { + type = transaction_type_t::MUTATE; + } // unit test only record_t(std::vector&& _extents, @@ -1602,6 +1623,7 @@ struct record_t { for (auto& d: _deltas) { push_back(std::move(d)); } + type = transaction_type_t::MUTATE; } bool is_empty() const { @@ -1639,12 +1661,14 @@ struct record_t { std::ostream &operator<<(std::ostream&, const record_t&); struct record_header_t { + transaction_type_t type; uint32_t deltas; // number of deltas uint32_t extents; // number of extents mod_time_point_t modify_time; DENC(record_header_t, v, p) { DENC_START(1, 1, p); + denc(v.type, p); denc(v.deltas, p); denc(v.extents, p); denc(v.modify_time, p); @@ -1945,6 +1969,7 @@ WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::segment_id_t) WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::paddr_t) WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::journal_seq_t) WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::delta_info_t) +WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::journal_tail_delta_t) WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::record_header_t) WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::record_group_header_t) WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::extent_info_t) diff --git a/src/crimson/os/seastore/transaction_manager.cc b/src/crimson/os/seastore/transaction_manager.cc index f9a082634e1..4a70b501570 100644 --- a/src/crimson/os/seastore/transaction_manager.cc +++ b/src/crimson/os/seastore/transaction_manager.cc @@ -92,18 +92,18 @@ TransactionManager::mount_ertr::future<> TransactionManager::mount() [this]( const auto &offsets, const auto &e, - const journal_seq_t alloc_replay_from, + const journal_seq_t alloc_tail, auto modify_time) { auto start_seq = offsets.write_result.start_seq; - async_cleaner->update_journal_tail_target( + async_cleaner->update_journal_tails( cache->get_oldest_dirty_from().value_or(start_seq), cache->get_oldest_backref_dirty_from().value_or(start_seq)); return cache->replay_delta( start_seq, offsets.record_block_base, e, - alloc_replay_from, + alloc_tail, modify_time); }); }).safe_then([this] { @@ -416,7 +416,7 @@ TransactionManager::submit_transaction_direct( lba_manager->complete_transaction(tref, lba_to_clear, lba_to_link); backref_manager->complete_transaction(tref, backref_to_clear, backref_to_link); - async_cleaner->update_journal_tail_target( + async_cleaner->update_journal_tails( cache->get_oldest_dirty_from().value_or(start_seq), cache->get_oldest_backref_dirty_from().value_or(start_seq)); return async_cleaner->maybe_release_segment(tref); diff --git a/src/test/crimson/seastore/test_btree_lba_manager.cc b/src/test/crimson/seastore/test_btree_lba_manager.cc index c782531680a..1bf7c5050f4 100644 --- a/src/test/crimson/seastore/test_btree_lba_manager.cc +++ b/src/test/crimson/seastore/test_btree_lba_manager.cc @@ -53,7 +53,11 @@ struct btree_test_base : */ void set_journal_head(journal_seq_t) final {} - journal_seq_t get_journal_tail_target() const final { return dummy_tail; } + journal_seq_t get_dirty_tail() const final { return dummy_tail; } + + journal_seq_t get_alloc_tail() const final { return dummy_tail; } + + void update_journal_tails(journal_seq_t, journal_seq_t) final {} const segment_info_t& get_seg_info(segment_id_t id) const final { tmp_info = {}; @@ -79,22 +83,12 @@ struct btree_test_base : void close_segment(segment_id_t) final {} - void update_journal_tail_committed(journal_seq_t committed) final {} - void update_segment_avail_bytes(segment_type_t, paddr_t) final {} void update_modify_time(segment_id_t, sea_time_point, std::size_t) final {} SegmentManagerGroup* get_segment_manager_group() final { return sms.get(); } - journal_seq_t get_dirty_extents_replay_from() const final { - return dummy_tail; - } - - journal_seq_t get_alloc_info_replay_from() const final { - return dummy_tail; - } - virtual void complete_commit(Transaction &t) {} seastar::future<> submit_transaction(TransactionRef t) { diff --git a/src/test/crimson/seastore/test_seastore_journal.cc b/src/test/crimson/seastore/test_seastore_journal.cc index 63a61f8a6f9..3b116482ca1 100644 --- a/src/test/crimson/seastore/test_seastore_journal.cc +++ b/src/test/crimson/seastore/test_seastore_journal.cc @@ -94,7 +94,11 @@ struct journal_test_t : seastar_test_suite_t, SegmentProvider { */ void set_journal_head(journal_seq_t) final {} - journal_seq_t get_journal_tail_target() const final { return dummy_tail; } + journal_seq_t get_dirty_tail() const final { return dummy_tail; } + + journal_seq_t get_alloc_tail() const final { return dummy_tail; } + + void update_journal_tails(journal_seq_t, journal_seq_t) final {} const segment_info_t& get_seg_info(segment_id_t id) const final { tmp_info = {}; @@ -103,14 +107,6 @@ struct journal_test_t : seastar_test_suite_t, SegmentProvider { return tmp_info; } - journal_seq_t get_dirty_extents_replay_from() const final { - return dummy_tail; - } - - journal_seq_t get_alloc_info_replay_from() const final { - return dummy_tail; - } - segment_id_t allocate_segment( segment_seq_t seq, segment_type_t type, @@ -128,8 +124,6 @@ struct journal_test_t : seastar_test_suite_t, SegmentProvider { void close_segment(segment_id_t) final {} - void update_journal_tail_committed(journal_seq_t paddr) final {} - void update_segment_avail_bytes(segment_type_t, paddr_t) final {} void update_modify_time(segment_id_t, sea_time_point, std::size_t) final {} -- 2.39.5