From: Yingxin Cheng Date: Mon, 7 Nov 2022 08:42:58 +0000 (+0800) Subject: crimson/os/seastore: define the usage of generation X-Git-Tag: v18.1.0~903^2~1 X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=5f4cb5b5a4d945cd0812f6de333089217b27464a;p=ceph-ci.git crimson/os/seastore: define the usage of generation Unify the definition of the reclaim generation, cleanup and explain its intentions and usages. Please refer to the comments in seastore_types.h. Signed-off-by: Yingxin Cheng --- diff --git a/src/crimson/os/seastore/async_cleaner.cc b/src/crimson/os/seastore/async_cleaner.cc index 322f54ee3d9..819f69e5e2b 100644 --- a/src/crimson/os/seastore/async_cleaner.cc +++ b/src/crimson/os/seastore/async_cleaner.cc @@ -31,7 +31,7 @@ void segment_info_t::set_open( ceph_assert(_seq != NULL_SEG_SEQ); ceph_assert(_type != segment_type_t::NULL_SEG); ceph_assert(_category != data_category_t::NUM); - ceph_assert(_generation < RECLAIM_GENERATIONS); + ceph_assert(is_reclaim_generation(_generation)); state = Segment::segment_state_t::OPEN; seq = _seq; type = _type; @@ -66,7 +66,7 @@ void segment_info_t::init_closed( ceph_assert(_seq != NULL_SEG_SEQ); ceph_assert(_type != segment_type_t::NULL_SEG); ceph_assert(_category != data_category_t::NUM); - ceph_assert(_generation < RECLAIM_GENERATIONS); + ceph_assert(is_reclaim_generation(_generation)); state = Segment::segment_state_t::CLOSED; seq = _seq; type = _type; @@ -612,7 +612,7 @@ JournalTrimmerImpl::trim_dirty() dirty_list, [this, &t](auto &e) { return extent_callback->rewrite_extent( - t, e, DIRTY_GENERATION, NULL_TIME); + t, e, INIT_GENERATION, NULL_TIME); }); }); }).si_then([this, &t] { diff --git a/src/crimson/os/seastore/async_cleaner.h b/src/crimson/os/seastore/async_cleaner.h index ded2a3da8ac..eb09464e6e4 100644 --- a/src/crimson/os/seastore/async_cleaner.h +++ b/src/crimson/os/seastore/async_cleaner.h @@ -1085,10 +1085,20 @@ private: segment_id_t segment_id, reclaim_gen_t generation, segment_off_t 
segment_size) { - ceph_assert(generation < RECLAIM_GENERATIONS); + ceph_assert(is_reclaim_generation(generation)); + + reclaim_gen_t target_gen; + if (generation < MIN_REWRITE_GENERATION) { + target_gen = MIN_REWRITE_GENERATION; + } else { + // tolerate the target_gen to exceed MAX_REWRITE_GENERATION to make EPM + // aware of its original generation for the decisions. + target_gen = generation + 1; + } + + assert(is_target_reclaim_generation(target_gen)); return {generation, - (reclaim_gen_t)(generation == RECLAIM_GENERATIONS - 1 ? - generation : generation + 1), + target_gen, segment_size, P_ADDR_NULL, paddr_t::make_seg_paddr(segment_id, 0)}; diff --git a/src/crimson/os/seastore/btree/fixed_kv_btree.h b/src/crimson/os/seastore/btree/fixed_kv_btree.h index 1ea09d0e598..ccf90caf06b 100644 --- a/src/crimson/os/seastore/btree/fixed_kv_btree.h +++ b/src/crimson/os/seastore/btree/fixed_kv_btree.h @@ -315,7 +315,7 @@ public: c.trans, node_size, placement_hint_t::HOT, - 0); + INIT_GENERATION); root_leaf->set_size(0); fixed_kv_node_meta_t meta{min_max_t::min, min_max_t::max, 1}; root_leaf->set_meta(meta); @@ -818,6 +818,7 @@ public: c.trans, fixed_kv_extent.get_length(), fixed_kv_extent.get_user_hint(), + // get target reclaim generation fixed_kv_extent.get_reclaim_generation()); fixed_kv_extent.get_bptr().copy_out( 0, @@ -1406,7 +1407,7 @@ private: if (split_from == iter.get_depth()) { auto nroot = c.cache.template alloc_new_extent( - c.trans, node_size, placement_hint_t::HOT, 0); + c.trans, node_size, placement_hint_t::HOT, INIT_GENERATION); fixed_kv_node_meta_t meta{ min_max_t::min, min_max_t::max, iter.get_depth() + 1}; nroot->set_meta(meta); diff --git a/src/crimson/os/seastore/btree/fixed_kv_node.h b/src/crimson/os/seastore/btree/fixed_kv_node.h index f193509f50e..1aed9fb200c 100644 --- a/src/crimson/os/seastore/btree/fixed_kv_node.h +++ b/src/crimson/os/seastore/btree/fixed_kv_node.h @@ -154,9 +154,9 @@ struct FixedKVInternalNode std::tuple 
make_split_children(op_context_t c) { auto left = c.cache.template alloc_new_extent( - c.trans, node_size, placement_hint_t::HOT, 0); + c.trans, node_size, placement_hint_t::HOT, INIT_GENERATION); auto right = c.cache.template alloc_new_extent( - c.trans, node_size, placement_hint_t::HOT, 0); + c.trans, node_size, placement_hint_t::HOT, INIT_GENERATION); auto pivot = this->split_into(*left, *right); left->pin.set_range(left->get_meta()); right->pin.set_range(right->get_meta()); @@ -170,7 +170,7 @@ struct FixedKVInternalNode op_context_t c, Ref &right) { auto replacement = c.cache.template alloc_new_extent( - c.trans, node_size, placement_hint_t::HOT, 0); + c.trans, node_size, placement_hint_t::HOT, INIT_GENERATION); replacement->merge_from(*this, *right->template cast()); replacement->pin.set_range(replacement->get_meta()); return replacement; @@ -184,9 +184,9 @@ struct FixedKVInternalNode ceph_assert(_right->get_type() == this->get_type()); auto &right = *_right->template cast(); auto replacement_left = c.cache.template alloc_new_extent( - c.trans, node_size, placement_hint_t::HOT, 0); + c.trans, node_size, placement_hint_t::HOT, INIT_GENERATION); auto replacement_right = c.cache.template alloc_new_extent( - c.trans, node_size, placement_hint_t::HOT, 0); + c.trans, node_size, placement_hint_t::HOT, INIT_GENERATION); auto pivot = this->balance_into_new_nodes( *this, @@ -355,9 +355,9 @@ struct FixedKVLeafNode std::tuple make_split_children(op_context_t c) { auto left = c.cache.template alloc_new_extent( - c.trans, node_size, placement_hint_t::HOT, 0); + c.trans, node_size, placement_hint_t::HOT, INIT_GENERATION); auto right = c.cache.template alloc_new_extent( - c.trans, node_size, placement_hint_t::HOT, 0); + c.trans, node_size, placement_hint_t::HOT, INIT_GENERATION); auto pivot = this->split_into(*left, *right); left->pin.set_range(left->get_meta()); right->pin.set_range(right->get_meta()); @@ -371,7 +371,7 @@ struct FixedKVLeafNode op_context_t c, Ref &right) { 
auto replacement = c.cache.template alloc_new_extent( - c.trans, node_size, placement_hint_t::HOT, 0); + c.trans, node_size, placement_hint_t::HOT, INIT_GENERATION); replacement->merge_from(*this, *right->template cast()); replacement->pin.set_range(replacement->get_meta()); return replacement; @@ -385,9 +385,9 @@ struct FixedKVLeafNode ceph_assert(_right->get_type() == this->get_type()); auto &right = *_right->template cast(); auto replacement_left = c.cache.template alloc_new_extent( - c.trans, node_size, placement_hint_t::HOT, 0); + c.trans, node_size, placement_hint_t::HOT, INIT_GENERATION); auto replacement_right = c.cache.template alloc_new_extent( - c.trans, node_size, placement_hint_t::HOT, 0); + c.trans, node_size, placement_hint_t::HOT, INIT_GENERATION); auto pivot = this->balance_into_new_nodes( *this, diff --git a/src/crimson/os/seastore/cache.cc b/src/crimson/os/seastore/cache.cc index 80d09889b1b..1fe91306ba5 100644 --- a/src/crimson/os/seastore/cache.cc +++ b/src/crimson/os/seastore/cache.cc @@ -700,6 +700,8 @@ void Cache::add_extent( const Transaction::src_t* p_src=nullptr) { assert(ref->is_valid()); + assert(ref->user_hint == PLACEMENT_HINT_NULL); + assert(ref->reclaim_generation == NULL_GENERATION); extents.insert(*ref); if (ref->is_dirty()) { add_to_dirty(ref); @@ -1012,7 +1014,6 @@ CachedExtentRef Cache::duplicate_for_write( ret->version++; ret->state = CachedExtent::extent_state_t::MUTATION_PENDING; - ret->set_reclaim_generation(DIRTY_GENERATION); DEBUGT("{} -> {}", t, *i, *ret); return ret; } @@ -1440,6 +1441,7 @@ void Cache::complete_commit( DEBUGT("add extent as fresh, inline={} -- {}", t, is_inline, *i); const auto t_src = t.get_src(); + i->invalidate_hints(); add_extent(i, &t_src); epm.mark_space_used(i->get_paddr(), i->get_length()); if (is_backref_mapped_extent_node(i)) { diff --git a/src/crimson/os/seastore/cached_extent.h b/src/crimson/os/seastore/cached_extent.h index 05cf5638664..2024e8115f8 100644 --- 
a/src/crimson/os/seastore/cached_extent.h +++ b/src/crimson/os/seastore/cached_extent.h @@ -118,6 +118,7 @@ public: paddr_t paddr, placement_hint_t hint, reclaim_gen_t gen) { + assert(gen == NULL_GENERATION || is_reclaim_generation(gen)); state = _state; set_paddr(paddr); user_hint = hint; @@ -402,8 +403,10 @@ public: reclaim_generation = NULL_GENERATION; } - void set_reclaim_generation(reclaim_gen_t gen) { - assert(gen < RECLAIM_GENERATIONS); + /// assign the target reclaim generation for the followup rewrite + void set_target_reclaim_generation(reclaim_gen_t gen) { + assert(is_target_reclaim_generation(gen)); + user_hint = placement_hint_t::REWRITE; reclaim_generation = gen; } @@ -485,10 +488,11 @@ private: read_set_item_t::list transactions; - placement_hint_t user_hint; + placement_hint_t user_hint = PLACEMENT_HINT_NULL; - /// > 0 and not null means the extent is under reclaimming - reclaim_gen_t reclaim_generation; + // the target reclaim generation for the followup rewrite + // or the reclaim generation for the fresh write + reclaim_gen_t reclaim_generation = NULL_GENERATION; protected: CachedExtent(CachedExtent &&other) = delete; diff --git a/src/crimson/os/seastore/extent_placement_manager.cc b/src/crimson/os/seastore/extent_placement_manager.cc index 8e77a680426..a39b69c3200 100644 --- a/src/crimson/os/seastore/extent_placement_manager.cc +++ b/src/crimson/os/seastore/extent_placement_manager.cc @@ -57,7 +57,6 @@ SegmentedOolWriter::write_record( TRACET("{} ool extent written at {} -- {}", t, segment_allocator.get_name(), extent_addr, *extent); - extent->invalidate_hints(); t.mark_delayed_extent_ool(extent, extent_addr); extent_addr = extent_addr.as_seg_paddr().add_offset( extent->get_length()); @@ -179,23 +178,23 @@ void ExtentPlacementManager::init( { writer_refs.clear(); - ceph_assert(RECLAIM_GENERATIONS > 0); auto segment_cleaner = dynamic_cast(cleaner.get()); ceph_assert(segment_cleaner != nullptr); - data_writers_by_gen.resize(RECLAIM_GENERATIONS, 
{}); - for (reclaim_gen_t gen = 0; gen < RECLAIM_GENERATIONS; ++gen) { + auto num_writers = generation_to_writer(REWRITE_GENERATIONS); + data_writers_by_gen.resize(num_writers, {}); + for (reclaim_gen_t gen = OOL_GENERATION; gen < REWRITE_GENERATIONS; ++gen) { writer_refs.emplace_back(std::make_unique( data_category_t::DATA, gen, *segment_cleaner, segment_cleaner->get_ool_segment_seq_allocator())); - data_writers_by_gen[gen] = writer_refs.back().get(); + data_writers_by_gen[generation_to_writer(gen)] = writer_refs.back().get(); } - md_writers_by_gen.resize(RECLAIM_GENERATIONS - 1, {}); - for (reclaim_gen_t gen = 1; gen < RECLAIM_GENERATIONS; ++gen) { + md_writers_by_gen.resize(num_writers, {}); + for (reclaim_gen_t gen = OOL_GENERATION; gen < REWRITE_GENERATIONS; ++gen) { writer_refs.emplace_back(std::make_unique( data_category_t::METADATA, gen, *segment_cleaner, segment_cleaner->get_ool_segment_seq_allocator())); - md_writers_by_gen[gen - 1] = writer_refs.back().get(); + md_writers_by_gen[generation_to_writer(gen)] = writer_refs.back().get(); } for (auto *device : segment_cleaner->get_segment_manager_group() diff --git a/src/crimson/os/seastore/extent_placement_manager.h b/src/crimson/os/seastore/extent_placement_manager.h index 0804e7d1c25..d17732e8340 100644 --- a/src/crimson/os/seastore/extent_placement_manager.h +++ b/src/crimson/os/seastore/extent_placement_manager.h @@ -146,8 +146,8 @@ public: reclaim_gen_t gen ) { assert(hint < placement_hint_t::NUM_HINTS); - assert(gen < RECLAIM_GENERATIONS); - assert(gen == 0 || hint == placement_hint_t::REWRITE); + assert(is_target_reclaim_generation(gen)); + assert(gen == INIT_GENERATION || hint == placement_hint_t::REWRITE); // XXX: bp might be extended to point to differnt memory (e.g. PMem) // according to the allocator. @@ -159,31 +159,37 @@ public: // TODO: implement out-of-line strategy for physical extent. 
return {make_record_relative_paddr(0), std::move(bp), - 0}; + INLINE_GENERATION}; } if (hint == placement_hint_t::COLD) { - assert(gen == 0); + assert(gen == INIT_GENERATION); return {make_delayed_temp_paddr(0), std::move(bp), - COLD_GENERATION}; + MIN_REWRITE_GENERATION}; } if (get_extent_category(type) == data_category_t::METADATA && - gen == 0) { - // gen 0 METADATA writer is the journal writer + gen == INIT_GENERATION) { if (prefer_ool) { return {make_delayed_temp_paddr(0), std::move(bp), - 1}; + OOL_GENERATION}; } else { + // default not to ool metadata extents to reduce padding overhead. + // TODO: improve padding so we can default to the prefer_ool path. return {make_record_relative_paddr(0), std::move(bp), - 0}; + INLINE_GENERATION}; } } else { assert(get_extent_category(type) == data_category_t::DATA || - gen > 0); + gen >= MIN_REWRITE_GENERATION); + if (gen > MAX_REWRITE_GENERATION) { + gen = MAX_REWRITE_GENERATION; + } else if (gen == INIT_GENERATION) { + gen = OOL_GENERATION; + } return {make_delayed_temp_paddr(0), std::move(bp), gen}; @@ -261,14 +267,13 @@ private: data_category_t category, reclaim_gen_t gen) { assert(hint < placement_hint_t::NUM_HINTS); - assert(gen < RECLAIM_GENERATIONS); + assert(is_reclaim_generation(gen)); + assert(gen != INLINE_GENERATION); if (category == data_category_t::DATA) { - return data_writers_by_gen[gen]; + return data_writers_by_gen[generation_to_writer(gen)]; } else { assert(category == data_category_t::METADATA); - // gen 0 METADATA writer is the journal writer - assert(gen > 0); - return md_writers_by_gen[gen - 1]; + return md_writers_by_gen[generation_to_writer(gen)]; } } diff --git a/src/crimson/os/seastore/journal/segmented_journal.cc b/src/crimson/os/seastore/journal/segmented_journal.cc index f8b8539738d..58df9137493 100644 --- a/src/crimson/os/seastore/journal/segmented_journal.cc +++ b/src/crimson/os/seastore/journal/segmented_journal.cc @@ -33,7 +33,7 @@ SegmentedJournal::SegmentedJournal( new 
SegmentSeqAllocator(segment_type_t::JOURNAL)), journal_segment_allocator(&trimmer, data_category_t::METADATA, - 0, // generation + INLINE_GENERATION, segment_provider, *segment_seq_allocator), record_submitter(crimson::common::get_conf( diff --git a/src/crimson/os/seastore/seastore_types.cc b/src/crimson/os/seastore/seastore_types.cc index bb41a9f78d3..1b8bab23625 100644 --- a/src/crimson/os/seastore/seastore_types.cc +++ b/src/crimson/os/seastore/seastore_types.cc @@ -249,9 +249,15 @@ std::ostream &operator<<(std::ostream &out, extent_types_t t) std::ostream &operator<<(std::ostream &out, reclaim_gen_printer_t gen) { if (gen.gen == NULL_GENERATION) { - return out << "NULL_GEN"; - } else if (gen.gen >= RECLAIM_GENERATIONS) { - return out << "INVALID_GEN(" << (unsigned)gen.gen << ")"; + return out << "GEN_NULL"; + } else if (gen.gen == INIT_GENERATION) { + return out << "GEN_INIT"; + } else if (gen.gen == INLINE_GENERATION) { + return out << "GEN_INL"; + } else if (gen.gen == OOL_GENERATION) { + return out << "GEN_OOL"; + } else if (gen.gen > REWRITE_GENERATIONS) { + return out << "GEN_INVALID(" << (unsigned)gen.gen << ")!"; } else { return out << "GEN(" << (unsigned)gen.gen << ")"; } diff --git a/src/crimson/os/seastore/seastore_types.h b/src/crimson/os/seastore/seastore_types.h index 28d12ebb957..490622a67ff 100644 --- a/src/crimson/os/seastore/seastore_types.h +++ b/src/crimson/os/seastore/seastore_types.h @@ -1117,11 +1117,44 @@ constexpr bool is_backref_node(extent_types_t type) std::ostream &operator<<(std::ostream &out, extent_types_t t); +/** + * reclaim_gen_t + * + * The goal is to group the similar aged extents in the same segment for better + * bimodal utilization distribution, and also to the same device tier. For EPM, + * it has the flexibility to make placement decisions by re-assigning the + * generation. And each non-inline generation will be statically mapped to a + * writer in EPM. 
+ * + * All the fresh and dirty extents start with INIT_GENERATION upon allocation, + * and they will be assigned to INLINE/OOL generation by EPM before the initial + * writes. After that, the generation can only be increased upon rewrite. + * + * Note, although EPM can re-assign the generations according to the tiering + * status, it cannot decrease the generation for the correctness of space + * reservation. It may choose to assign a larger generation if the extent is + * hinted cold, or if it wants to evict extents to the cold tier. And it may choose + * to not increase the generation if it wants to keep the hot tier as filled as + * possible. + */ using reclaim_gen_t = uint8_t; -constexpr reclaim_gen_t DIRTY_GENERATION = 1; -constexpr reclaim_gen_t COLD_GENERATION = 1; -constexpr reclaim_gen_t RECLAIM_GENERATIONS = 3; +// INIT_GENERATION requires EPM decision to INLINE/OOL_GENERATION +constexpr reclaim_gen_t INIT_GENERATION = 0; +constexpr reclaim_gen_t INLINE_GENERATION = 1; // to the journal +constexpr reclaim_gen_t OOL_GENERATION = 2; + +// All the rewritten extents start with MIN_REWRITE_GENERATION +constexpr reclaim_gen_t MIN_REWRITE_GENERATION = 3; +constexpr reclaim_gen_t MAX_REWRITE_GENERATION = 4; + +/** + * TODO: + * For tiering, might introduce 5 and 6 for the cold tier, and 1 ~ 4 for the + * hot tier. 
+ */ + +constexpr reclaim_gen_t REWRITE_GENERATIONS = MAX_REWRITE_GENERATION + 1; constexpr reclaim_gen_t NULL_GENERATION = std::numeric_limits<reclaim_gen_t>::max(); @@ -1131,6 +1164,24 @@ struct reclaim_gen_printer_t { std::ostream &operator<<(std::ostream &out, reclaim_gen_printer_t gen); +constexpr std::size_t generation_to_writer(reclaim_gen_t gen) { + // caller to assert the gen is in the reasonable range + return gen - OOL_GENERATION; +} + +// before EPM decision +constexpr bool is_target_reclaim_generation(reclaim_gen_t gen) { + return gen == INIT_GENERATION || + (gen >= MIN_REWRITE_GENERATION && + gen <= REWRITE_GENERATIONS); +} + +// after EPM decision +constexpr bool is_reclaim_generation(reclaim_gen_t gen) { + return gen >= INLINE_GENERATION && + gen < REWRITE_GENERATIONS; +} + enum class data_category_t : uint8_t { METADATA = 0, DATA, diff --git a/src/crimson/os/seastore/transaction_manager.cc b/src/crimson/os/seastore/transaction_manager.cc index 29d2c4873a0..f4ad79803e9 100644 --- a/src/crimson/os/seastore/transaction_manager.cc +++ b/src/crimson/os/seastore/transaction_manager.cc @@ -451,6 +451,7 @@ TransactionManager::rewrite_logical_extent( lextent->get_type(), lextent->get_length(), lextent->get_user_hint(), + // get target reclaim generation lextent->get_reclaim_generation())->cast(); lextent->get_bptr().copy_out( 0, @@ -493,9 +494,9 @@ TransactionManager::rewrite_extent_ret TransactionManager::rewrite_extent( assert(extent->is_valid() && !extent->is_initial_pending()); if (extent->is_dirty()) { - extent->set_reclaim_generation(DIRTY_GENERATION); + extent->set_target_reclaim_generation(INIT_GENERATION); } else { - extent->set_reclaim_generation(target_generation); + extent->set_target_reclaim_generation(target_generation); ceph_assert(modify_time != NULL_TIME); extent->set_modify_time(modify_time); } diff --git a/src/crimson/os/seastore/transaction_manager.h b/src/crimson/os/seastore/transaction_manager.h index 3945537708c..6e6eb45f73c 100644 --- 
a/src/crimson/os/seastore/transaction_manager.h +++ b/src/crimson/os/seastore/transaction_manager.h @@ -320,7 +320,7 @@ public: t, len, placement_hint, - 0); + INIT_GENERATION); return lba_manager->alloc_extent( t, laddr_hint,