From 84400c374d66cdd1e3d27db1e60ee4cd6e1425d1 Mon Sep 17 00:00:00 2001 From: Zhang Song Date: Mon, 13 Jun 2022 16:40:54 +0800 Subject: [PATCH] crimson/os/seastore: introduce TransactionManager::map_existing_extent Signed-off-by: Zhang Song --- src/crimson/os/seastore/cache.cc | 84 ++++++++++++++++-- src/crimson/os/seastore/seastore_types.h | 4 + src/crimson/os/seastore/transaction.h | 88 +++++++++++++++++-- .../os/seastore/transaction_manager.cc | 9 +- src/crimson/os/seastore/transaction_manager.h | 62 +++++++++++++ 5 files changed, 233 insertions(+), 14 deletions(-) diff --git a/src/crimson/os/seastore/cache.cc b/src/crimson/os/seastore/cache.cc index 7812267d4af..64ce9217807 100644 --- a/src/crimson/os/seastore/cache.cc +++ b/src/crimson/os/seastore/cache.cc @@ -979,6 +979,15 @@ CachedExtentRef Cache::duplicate_for_write( if (i->is_pending()) return i; + if (i->is_exist_clean()) { + i->version++; + i->state = CachedExtent::extent_state_t::EXIST_MUTATION_PENDING; + i->last_committed_crc = i->get_crc32c(); + t.add_mutated_extent(i); + DEBUGT("duplicate existing extent {}", t, *i); + return i; + } + auto ret = i->duplicate_for_write(); ret->prior_instance = i; t.add_mutated_extent(ret); @@ -1035,16 +1044,25 @@ record_t Cache::prepare_record( DEBUGT("invalid mutated extent -- {}", t, *i); continue; } - assert(i->prior_instance); + assert(i->is_exist_mutation_pending() || + i->prior_instance); get_by_ext(efforts.mutate_by_ext, i->get_type()).increment(i->get_length()); auto delta_bl = i->get_delta(); auto delta_length = delta_bl.length(); - DEBUGT("mutated extent with {}B delta, commit replace extent ... -- {}, prior={}", - t, delta_length, *i, *i->prior_instance); i->set_modify_time(commit_time); - commit_replace_extent(t, i, i->prior_instance); + DEBUGT("mutated extent with {}B delta -- {}", + t, delta_length, *i); + if (!i->is_exist_mutation_pending()) { + DEBUGT("commit replace extent ... -- {}, prior={}", + t, *i, *i->prior_instance); + // extent with EXIST_MUTATION_PENDING doesn't have + // prior_instance field so skip these extents. + // the existing extents should be added into Cache + // during complete_commit to sync with gc transaction. + commit_replace_extent(t, i, i->prior_instance); + } i->prepare_write(); i->set_io_wait(); @@ -1197,6 +1215,16 @@ record_t Cache::prepare_record( i->get_type()); } } + + for (auto &i: t.existing_block_list) { + if (i->is_valid()) { + alloc_delta.alloc_blk_ranges.emplace_back( + i->get_paddr(), + i->cast()->get_laddr(), + i->get_length(), + i->get_type()); + } + } alloc_deltas.emplace_back(std::move(alloc_delta)); for (auto b : alloc_deltas) { @@ -1401,6 +1429,10 @@ void Cache::complete_commit( i->get_length()); } if (is_backref_mapped_extent_node(i)) { + DEBUGT("backref_list new {} len {}", + t, + i->get_paddr(), + i->get_length()); backref_list.emplace_back( std::make_unique( i->get_paddr(), @@ -1426,7 +1458,8 @@ void Cache::complete_commit( if (!i->is_valid()) { continue; } - assert(i->prior_instance); + assert(i->is_exist_mutation_pending() || + i->prior_instance); i->on_delta_write(final_block_start); i->prior_instance = CachedExtentRef(); i->state = CachedExtent::extent_state_t::DIRTY; @@ -1445,6 +1478,13 @@ void Cache::complete_commit( i->get_paddr(), i->get_length()); } + for (auto &i: t.existing_block_list) { + if (i->is_valid()) { + cleaner->mark_space_used( + i->get_paddr(), + i->get_length()); + } + } } for (auto &i: t.mutated_block_list) { @@ -1459,6 +1499,11 @@ void Cache::complete_commit( i->dirty_from_or_retired_at = last_commit; if (is_backref_mapped_extent_node(i) || is_retired_placeholder(i->get_type())) { + DEBUGT("backref_list free {} len {} should release {}", + t, + i->get_paddr(), + i->get_length(), + t.should_record_release(i->get_paddr())); if (t.should_record_release(i->get_paddr())) { backref_list.emplace_back( std::make_unique( @@ -1475,6 +1520,35 @@ void Cache::complete_commit( ceph_abort("not possible"); } } + + auto existing_stats = t.get_existing_block_stats(); + DEBUGT("total existing blocks num: {}, exist clean num: {}, " + "exist mutation pending num: {}", + t, + existing_stats.valid_num, + existing_stats.clean_num, + existing_stats.mutated_num); + for (auto &i: t.existing_block_list) { + if (i->is_valid()) { + if (i->is_exist_clean()) { + i->state = CachedExtent::extent_state_t::CLEAN; + } else { + assert(i->state == CachedExtent::extent_state_t::DIRTY); + } + DEBUGT("backref_list new existing {} len {}", + t, + i->get_paddr(), + i->get_length()); + backref_list.emplace_back( + std::make_unique( + i->get_paddr(), + i->cast()->get_laddr(), + i->get_length(), + i->get_type(), + seq)); + add_extent(i); + } + } if (!backref_list.empty()) backref_batch_update(std::move(backref_list), seq); } diff --git a/src/crimson/os/seastore/seastore_types.h b/src/crimson/os/seastore/seastore_types.h index 3fceac7ed58..a49f2f989d5 100644 --- a/src/crimson/os/seastore/seastore_types.h +++ b/src/crimson/os/seastore/seastore_types.h @@ -552,6 +552,10 @@ public: return !is_zero() && !is_null(); } + bool is_absolute() const { + return get_device_id() <= DEVICE_ID_MAX_VALID; + } + DENC(paddr_t, v, p) { DENC_START(1, 1, p); denc(v.dev_addr, p); diff --git a/src/crimson/os/seastore/transaction.h b/src/crimson/os/seastore/transaction.h index 83c99f07bc6..78150ddb586 100644 --- a/src/crimson/os/seastore/transaction.h +++ b/src/crimson/os/seastore/transaction.h @@ -82,15 +82,20 @@ public: }; get_extent_ret get_extent(paddr_t addr, CachedExtentRef *out) { LOG_PREFIX(Transaction::get_extent); - if (retired_set.count(addr)) { - return get_extent_ret::RETIRED; - } else if (auto iter = write_set.find_offset(addr); + // it's possible that both write_set and retired_set contain + // this addr at the same time when addr is absolute and the + // corresponding extent is used to map existing extent on disk. + // So search write_set first. + if (auto iter = write_set.find_offset(addr); iter != write_set.end()) { if (out) *out = CachedExtentRef(&*iter); SUBTRACET(seastore_cache, "{} is present in write_set -- {}", *this, addr, *iter); + assert((*out)->is_valid()); return get_extent_ret::PRESENT; + } else if (retired_set.count(addr)) { + return get_extent_ret::RETIRED; } else if ( auto iter = read_set.find(addr); iter != read_set.end()) { @@ -109,7 +114,12 @@ public: void add_to_retired_set(CachedExtentRef ref) { ceph_assert(!is_weak()); - if (ref->is_initial_pending()) { + if (ref->is_exist_clean() || + ref->is_exist_mutation_pending()) { + existing_block_stats.dec(ref); + ref->state = CachedExtent::extent_state_t::INVALID; + write_set.erase(*ref); + } else if (ref->is_initial_pending()) { ref->state = CachedExtent::extent_state_t::INVALID; write_set.erase(*ref); } else if (ref->is_mutation_pending()) { @@ -137,19 +147,23 @@ public: void add_fresh_extent( CachedExtentRef ref) { ceph_assert(!is_weak()); - if (ref->get_paddr().is_delayed()) { + if (ref->is_exist_clean()) { + existing_block_stats.inc(ref); + existing_block_list.push_back(ref); + } else if (ref->get_paddr().is_delayed()) { assert(ref->get_paddr() == make_delayed_temp_paddr(0)); assert(ref->is_logical()); ref->set_paddr(make_delayed_temp_paddr(delayed_temp_offset)); delayed_temp_offset += ref->get_length(); delayed_alloc_list.emplace_back(ref->cast()); + fresh_block_stats.increment(ref->get_length()); } else { assert(ref->get_paddr() == make_record_relative_paddr(0)); ref->set_paddr(make_record_relative_paddr(offset)); offset += ref->get_length(); inline_block_list.push_back(ref); + fresh_block_stats.increment(ref->get_length()); } - fresh_block_stats.increment(ref->get_length()); write_set.insert(*ref); if (is_backref_node(ref->get_type())) fresh_backref_extents++; @@ -178,9 +192,15 @@ public: void add_mutated_extent(CachedExtentRef ref) { ceph_assert(!is_weak()); - assert(read_set.count(ref->prior_instance->get_paddr())); + assert(ref->is_exist_mutation_pending() || + read_set.count(ref->prior_instance->get_paddr())); mutated_block_list.push_back(ref); - write_set.insert(*ref); + if (!ref->is_exist_mutation_pending()) { + write_set.insert(*ref); + } else { + assert(write_set.find_offset(ref->get_paddr()) != + write_set.end()); + } } void replace_placeholder(CachedExtent& placeholder, CachedExtent& extent) { @@ -233,10 +253,31 @@ public: return mutated_block_list; } + const auto &get_existing_block_list() { + return existing_block_list; + } + const auto &get_retired_set() { return retired_set; } + bool is_retired(laddr_t laddr, extent_len_t len, paddr_t paddr) { + if (retired_set.empty()) { + return false; + } + auto iter = retired_set.lower_bound(paddr); + if (iter == retired_set.end() || + (*iter)->get_paddr() > paddr) { + assert(iter != retired_set.begin()); + --iter; + } + + auto lextent = (*iter)->cast(); + auto ext_laddr = lextent->get_laddr(); + return ext_laddr <= laddr && + ext_laddr + lextent->get_length() >= laddr + len; + } + bool should_record_release(paddr_t addr) { auto count = no_release_delta_retired_set.count(addr); #ifndef NDEBUG @@ -337,6 +378,8 @@ public: ool_block_list.clear(); retired_set.clear(); no_release_delta_retired_set.clear(); + existing_block_list.clear(); + existing_block_stats = {}; onode_tree_stats = {}; omap_tree_stats = {}; lba_tree_stats = {}; @@ -404,6 +447,31 @@ public: return rewrite_version_stats; } + struct existing_block_stats_t { + uint64_t valid_num = 0; + uint64_t clean_num = 0; + uint64_t mutated_num = 0; + void inc(const CachedExtentRef &ref) { + valid_num++; + if (ref->is_exist_clean()) { + clean_num++; + } else { + mutated_num++; + } + } + void dec(const CachedExtentRef &ref) { + valid_num--; + if (ref->is_exist_clean()) { + clean_num--; + } else { + mutated_num--; + } + } + }; + existing_block_stats_t& get_existing_block_stats() { + return existing_block_stats; + } + private: friend class Cache; friend Ref make_test_transaction(); @@ -455,6 +523,10 @@ private: /// list of mutated blocks, holds refcounts, subset of write_set std::list mutated_block_list; + /// partial blocks of extents on disk, with data and refcounts + std::list existing_block_list; + existing_block_stats_t existing_block_stats; + /** * retire_set * diff --git a/src/crimson/os/seastore/transaction_manager.cc b/src/crimson/os/seastore/transaction_manager.cc index 434daf8dfc2..14813a66848 100644 --- a/src/crimson/os/seastore/transaction_manager.cc +++ b/src/crimson/os/seastore/transaction_manager.cc @@ -389,7 +389,8 @@ TransactionManager::submit_transaction_direct( // ...but add_pin from parent->leaf std::vector lba_to_link; std::vector backref_to_link; - lba_to_link.reserve(tref.get_fresh_block_stats().num); + lba_to_link.reserve(tref.get_fresh_block_stats().num + + tref.get_existing_block_stats().valid_num); backref_to_link.reserve(tref.get_fresh_block_stats().num); tref.for_each_fresh_block([&](auto &e) { if (e->is_valid()) { @@ -400,6 +401,12 @@ TransactionManager::submit_transaction_direct( } }); + for (auto &e: tref.get_existing_block_list()) { + if (e->is_valid()) { + lba_to_link.push_back(e); + } + } + lba_manager->complete_transaction(tref, lba_to_clear, lba_to_link); backref_manager->complete_transaction(tref, backref_to_clear, backref_to_link); diff --git a/src/crimson/os/seastore/transaction_manager.h b/src/crimson/os/seastore/transaction_manager.h index de7bb3fee4d..823b1abcef9 100644 --- a/src/crimson/os/seastore/transaction_manager.h +++ b/src/crimson/os/seastore/transaction_manager.h @@ -362,6 +362,68 @@ public: }); } + /** + * map_existing_extent + * + * Allocates a new extent at given existing_paddr that must be absolute and + * reads disk to fill the extent. + * The common usage is that remove the LogicalCachedExtent (laddr~length at paddr) + * and map extent to multiple new extents. + * placement_hint and generation should follow the original extent. + */ + using map_existing_extent_iertr = + alloc_extent_iertr::extend_ertr; + template + using map_existing_extent_ret = + map_existing_extent_iertr::future>; + template + map_existing_extent_ret map_existing_extent( + Transaction &t, + laddr_t laddr_hint, + paddr_t existing_paddr, + extent_len_t length, + placement_hint_t placement_hint = placement_hint_t::HOT, + reclaim_gen_t gen = DIRTY_GENERATION) { + LOG_PREFIX(TransactionManager::map_existing_extent); + ceph_assert(existing_paddr.is_absolute()); + assert(t.is_retired(laddr_hint, length, existing_paddr)); + + auto bp = ceph::bufferptr(buffer::create_page_aligned(length)); + bp.zero(); + + // ExtentPlacementManager::alloc_new_extent will make a new + // (relative/temp) paddr, so make extent directly + auto ext = CachedExtent::make_cached_extent_ref(std::move(bp)); + + ext->init(CachedExtent::extent_state_t::EXIST_CLEAN, + existing_paddr, + placement_hint, + gen); + + t.add_fresh_extent(ext); + + return lba_manager->alloc_extent( + t, + laddr_hint, + length, + existing_paddr + ).si_then([ext=std::move(ext), laddr_hint, &t, this, FNAME](auto &&ref) { + SUBDEBUGT(seastore_tm, "map existing extent: {}, laddr_hint: {} pin: {}", + t, *ext, laddr_hint, *ref); + ceph_assert(laddr_hint == ref->get_key()); + ext->set_pin(std::move(ref)); + return epm->read( + ext->get_paddr(), + ext->get_length(), + ext->get_bptr() + ).safe_then([ext=std::move(ext)] { + return map_existing_extent_iertr::make_ready_future> + (std::move(ext)); + }); + }); + } + + using reserve_extent_iertr = alloc_extent_iertr; using reserve_extent_ret = reserve_extent_iertr::future; reserve_extent_ret reserve_region( -- 2.39.5