From: Zhang Song Date: Wed, 3 Sep 2025 08:05:38 +0000 (+0800) Subject: crimson/os/seastore: add write through policy X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=69778531eae58ed0d4e1618e8d9d2fb9b956ce33;p=ceph-ci.git crimson/os/seastore: add write through policy Signed-off-by: Zhang Song Signed-off-by: Xuehan Xu --- diff --git a/src/common/options/crimson.yaml.in b/src/common/options/crimson.yaml.in index 4772368dc83..5c31100954c 100644 --- a/src/common/options/crimson.yaml.in +++ b/src/common/options/crimson.yaml.in @@ -343,3 +343,8 @@ options: level: advanced desc: Size in bytes of extents to be demoted from logical bucket default: 2_M +- name: seastore_write_through_size + type: size + level: dev + desc: Select the write-through policy for data extents whose length is greater than or equal to this value (only effective when a cold tier exists). + default: 512_K diff --git a/src/crimson/os/seastore/cache.cc b/src/crimson/os/seastore/cache.cc index 555b81c14ce..55c855bdc2d 100644 --- a/src/crimson/os/seastore/cache.cc +++ b/src/crimson/os/seastore/cache.cc @@ -1272,14 +1272,16 @@ std::vector Cache::alloc_new_data_extents_by_type( case extent_types_t::OBJECT_DATA_BLOCK: { auto extents = alloc_new_data_extents< - ObjectDataBlock>(t, length, {hint, gen, is_tracked}); + ObjectDataBlock>(t, length, {hint, gen, is_tracked, + epm.get_write_policy(type, length)}); res.insert(res.begin(), extents.begin(), extents.end()); } return res; case extent_types_t::TEST_BLOCK: { auto extents = alloc_new_data_extents< - TestBlock>(t, length, {hint, gen, is_tracked}); + TestBlock>(t, length, {hint, gen, is_tracked, + epm.get_write_policy(type, length)}); res.insert(res.begin(), extents.begin(), extents.end()); } return res; @@ -2197,7 +2199,8 @@ void Cache::init() P_ADDR_ROOT, PLACEMENT_HINT_NULL, NULL_GENERATION, - TRANS_ID_NULL); + TRANS_ID_NULL, + write_policy_t::WRITE_BACK); root->set_modify_time(seastar::lowres_system_clock::now()); INFO("init root -- {}", *root); add_extent(root); @@ -2621,7 +2624,8 @@ 
Cache::_get_absent_extent_by_type( offset, PLACEMENT_HINT_NULL, NULL_GENERATION, - TRANS_ID_NULL); + TRANS_ID_NULL, + write_policy_t::WRITE_BACK); DEBUGT("{} length=0x{:x} is absent, add extent ... -- {}", t, type, length, *ret); add_extent(ret); diff --git a/src/crimson/os/seastore/cache.h b/src/crimson/os/seastore/cache.h index a5ba78ffae4..5317e14bbb8 100644 --- a/src/crimson/os/seastore/cache.h +++ b/src/crimson/os/seastore/cache.h @@ -402,7 +402,8 @@ public: offset, PLACEMENT_HINT_NULL, NULL_GENERATION, - TRANS_ID_NULL); + TRANS_ID_NULL, + write_policy_t::WRITE_BACK); SUBDEBUGT(seastore_cache, "{} {}~0x{:x} is absent, add extent and reading range 0x{:x}~0x{:x} ... -- {}", t, T::TYPE, offset, length, partial_off, partial_len, *ret); @@ -843,7 +844,8 @@ private: offset, PLACEMENT_HINT_NULL, NULL_GENERATION, - TRANS_ID_NULL); + TRANS_ID_NULL, + write_policy_t::WRITE_BACK); SUBDEBUG(seastore_cache, "{} {}~0x{:x} is absent, add extent and reading range 0x{:x}~0x{:x} ... -- {}", T::TYPE, offset, length, partial_off, partial_len, *ret); @@ -1122,7 +1124,8 @@ public: result->paddr, opt.hint, result->gen, - t.get_trans_id()); + t.get_trans_id(), + write_policy_t::WRITE_BACK); t.add_fresh_extent(ret); SUBDEBUGT(seastore_cache, "allocated {} 0x{:x}B extent at {}, hint={}, gen={} -- {}", @@ -1160,7 +1163,8 @@ public: result.paddr, opt.hint, result.gen, - t.get_trans_id()); + t.get_trans_id(), + opt.write_policy); t.add_fresh_extent(ret); SUBDEBUGT(seastore_cache, "allocated {} 0x{:x}B extent at {}, hint={}, gen={} -- {}", @@ -1202,7 +1206,8 @@ public: remap_paddr, PLACEMENT_HINT_NULL, NULL_GENERATION, - t.get_trans_id()); + t.get_trans_id(), + write_policy_t::WRITE_BACK); auto extent = ext->template cast(); extent->set_laddr(remap_laddr); diff --git a/src/crimson/os/seastore/cached_extent.h b/src/crimson/os/seastore/cached_extent.h index d2358399986..9371c984544 100644 --- a/src/crimson/os/seastore/cached_extent.h +++ b/src/crimson/os/seastore/cached_extent.h @@ -361,12 
+361,14 @@ public: paddr_t paddr, placement_hint_t hint, rewrite_gen_t gen, - transaction_id_t trans_id) { + transaction_id_t trans_id, + write_policy_t policy) { state = _state; set_paddr(paddr); user_hint = hint; rewrite_generation = gen; pending_for_transaction = trans_id; + write_policy = policy; } void set_modify_time(sea_time_point t) { @@ -538,6 +540,7 @@ public: << ", last_committed_crc=" << last_committed_crc << ", refcount=" << use_count() << ", user_hint=" << user_hint + << ", write_policy=" << write_policy << ", rewrite_gen=" << rewrite_gen_printer_t{rewrite_generation} << ", pending_io="; if (is_pending_io()) { @@ -917,6 +920,18 @@ public: is_shadow = b; } + write_policy_t get_write_policy() const { + return write_policy; + } + + void set_write_policy(write_policy_t w) { + write_policy = w; + } + + void reset_write_policy() { + write_policy = write_policy_t::WRITE_BACK; + } + private: template friend class read_set_item_t; @@ -1028,6 +1043,8 @@ private: placement_hint_t user_hint = PLACEMENT_HINT_NULL; + write_policy_t write_policy = write_policy_t::WRITE_BACK; + // the target rewrite generation for the followup rewrite // or the rewrite generation for the fresh write rewrite_gen_t rewrite_generation = NULL_GENERATION; diff --git a/src/crimson/os/seastore/extent_placement_manager.h b/src/crimson/os/seastore/extent_placement_manager.h index 716211678b6..3027f645a0f 100644 --- a/src/crimson/os/seastore/extent_placement_manager.h +++ b/src/crimson/os/seastore/extent_placement_manager.h @@ -351,7 +351,9 @@ public: ool_segment_seq_allocator( std::make_unique(segment_type_t::OOL)), max_data_allocation_size(crimson::common::get_conf( - "seastore_max_data_allocation_size")) + "seastore_max_data_allocation_size")), + write_through_size(crimson::common::get_conf( + "seastore_write_through_size")) { LOG_PREFIX(ExtentPlacementManager::ExtentPlacementManager); devices_by_id.resize(DEVICE_ID_MAX, nullptr); @@ -429,6 +431,7 @@ public: placement_hint_t hint; 
rewrite_gen_t gen; bool is_tracked; + write_policy_t write_policy = write_policy_t::WRITE_BACK; #ifdef UNIT_TESTS_BUILT std::optional external_paddr = std::nullopt; #endif @@ -449,7 +452,8 @@ public: assert(opt.gen == INIT_GENERATION || opt.hint == placement_hint_t::REWRITE); data_category_t category = get_extent_category(type); - opt.gen = adjust_generation(category, type, opt.hint, opt.gen, opt.is_tracked); + opt.gen = adjust_generation( + category, type, opt.hint, opt.gen, opt.write_policy, opt.is_tracked); paddr_t addr; #ifdef UNIT_TESTS_BUILT @@ -489,7 +493,8 @@ public: assert(opt.gen == INIT_GENERATION || opt.hint == placement_hint_t::REWRITE); data_category_t category = get_extent_category(type); - opt.gen = adjust_generation(category, type, opt.hint, opt.gen, opt.is_tracked); + opt.gen = adjust_generation( + category, type, opt.hint, opt.gen, opt.write_policy, opt.is_tracked); assert(opt.gen != INLINE_GENERATION); // XXX: bp might be extended to point to different memory (e.g. PMem) @@ -527,6 +532,13 @@ public: return allocs; } + write_policy_t get_write_policy(extent_types_t type, extent_len_t length) const { + if (has_cold_tier() && length >= write_through_size && is_data_type(type)) { + return write_policy_t::WRITE_THROUGH; + } + return write_policy_t::WRITE_BACK; + } + #ifdef UNIT_TESTS_BUILT void prefill_fragmented_devices() { LOG_PREFIX(ExtentPlacementManager::prefill_fragmented_devices); @@ -696,6 +708,7 @@ private: extent_types_t type, placement_hint_t hint, rewrite_gen_t gen, + write_policy_t policy, bool is_tracked) { assert(is_real_type(type)); if (is_root_type(type)) { @@ -725,10 +738,20 @@ private: } } else { assert(category == data_category_t::DATA); - gen = OOL_GENERATION; + if (background_process.has_cold_tier() && + policy == write_policy_t::WRITE_THROUGH) { + gen = hot_tier_generations; + } else { + assert(policy != write_policy_t::WRITE_THROUGH); + gen = OOL_GENERATION; + } } } else if (background_process.has_cold_tier()) { gen = 
background_process.adjust_generation(gen); + if (gen <= hot_tier_generations && + policy == write_policy_t::WRITE_THROUGH) { + gen = hot_tier_generations; + } } if (is_tracked && gen >= hot_tier_generations && @@ -1283,6 +1306,7 @@ private: // TODO: drop once paddr->journal_seq_t is introduced SegmentSeqAllocatorRef ool_segment_seq_allocator; extent_len_t max_data_allocation_size = 0; + std::size_t write_through_size; friend class ::transaction_manager_test_t; friend class Cache; diff --git a/src/crimson/os/seastore/seastore_types.cc b/src/crimson/os/seastore/seastore_types.cc index df83d1e5825..65f9a39b12a 100644 --- a/src/crimson/os/seastore/seastore_types.cc +++ b/src/crimson/os/seastore/seastore_types.cc @@ -1224,6 +1224,18 @@ std::ostream& operator<<(std::ostream& out, placement_hint_t h) } } +std::ostream& operator<<(std::ostream& out, write_policy_t w) +{ + switch(w) { + case write_policy_t::WRITE_BACK: + return out << "WRITE_BACK"; + case write_policy_t::WRITE_THROUGH: + return out << "WRITE_THROUGH"; + } + // not reachable for valid enumerators; never flow off the end of a non-void function (UB) + return out << "INVALID_WRITE_POLICY(" << static_cast<int>(w) << ")"; +} + bool can_delay_allocation(device_type_t type) { // Some types of device may not support delayed allocation, for example PMEM. // All types of device currently support delayed allocation. 
diff --git a/src/crimson/os/seastore/seastore_types.h b/src/crimson/os/seastore/seastore_types.h index 8472775a1dd..a000f2ee089 100644 --- a/src/crimson/os/seastore/seastore_types.h +++ b/src/crimson/os/seastore/seastore_types.h @@ -2069,6 +2069,13 @@ constexpr bool is_real_type(extent_types_t type) { std::ostream &operator<<(std::ostream &out, extent_types_t t); +enum class write_policy_t { + WRITE_BACK, + WRITE_THROUGH +}; + +std::ostream& operator<<(std::ostream& out, write_policy_t w); + /** * rewrite_gen_t * @@ -3684,6 +3691,7 @@ template <> struct fmt::formatter : f template <> struct fmt::formatter : fmt::ostream_formatter {}; template <> struct fmt::formatter : fmt::ostream_formatter {}; template <> struct fmt::formatter : fmt::ostream_formatter {}; +template <> struct fmt::formatter : fmt::ostream_formatter {}; #endif template <> diff --git a/src/crimson/os/seastore/transaction_manager.h b/src/crimson/os/seastore/transaction_manager.h index 7919f09689a..7dc7735e155 100644 --- a/src/crimson/os/seastore/transaction_manager.h +++ b/src/crimson/os/seastore/transaction_manager.h @@ -591,7 +591,11 @@ public: SUBDEBUGT(seastore_tm, "{} hint {}~0x{:x} phint={} ...", t, T::TYPE, laddr_hint, len, placement_hint); auto exts = cache->alloc_new_data_extents( - t, len, {placement_hint, INIT_GENERATION}); + t, len, + { + placement_hint, INIT_GENERATION, false, + epm->get_write_policy(T::TYPE, len) + }); // user must initialize the logical extent themselves assert(is_user_transaction(t.get_src())); for (auto& ext : exts) { diff --git a/src/test/crimson/seastore/test_btree_lba_manager.cc b/src/test/crimson/seastore/test_btree_lba_manager.cc index b5541aaaf48..291c845a18e 100644 --- a/src/test/crimson/seastore/test_btree_lba_manager.cc +++ b/src/test/crimson/seastore/test_btree_lba_manager.cc @@ -327,7 +327,9 @@ struct lba_btree_test : btree_test_base { check.emplace(addr, get_map_val(len, TestBlock::TYPE)); lba_btree_update([=, this](auto &btree, auto &t) { auto extents = 
cache->alloc_new_data_extents( - t, TestBlock::SIZE, {placement_hint_t::HOT, 0, false, get_paddr()}); + t, TestBlock::SIZE, + {placement_hint_t::HOT, 0, false, + write_policy_t::WRITE_BACK, get_paddr()}); return seastar::do_with( std::move(extents), [this, addr, &t, len, &btree](auto &extents) { @@ -544,7 +546,9 @@ struct btree_lba_manager_test : btree_test_base { *t.t, [=, this](auto &t) { auto extents = cache->alloc_new_data_extents( - t, TestBlock::SIZE, {placement_hint_t::HOT, 0, false, get_paddr()}); + t, TestBlock::SIZE, + {placement_hint_t::HOT, 0, false, + write_policy_t::WRITE_BACK, get_paddr()}); return seastar::do_with( std::vector( extents.begin(), extents.end()), diff --git a/src/test/crimson/seastore/test_transaction_manager.cc b/src/test/crimson/seastore/test_transaction_manager.cc index d07579abcce..85d4ff2bee3 100644 --- a/src/test/crimson/seastore/test_transaction_manager.cc +++ b/src/test/crimson/seastore/test_transaction_manager.cc @@ -1141,6 +1141,7 @@ struct transaction_manager_test_t : t, placement_hint_t::HOT, gen, + write_policy_t::WRITE_BACK, false); if (expected_generations[t][gen] != epm_gen) { logger().error("caller: {}, extent type: {}, input generation: {}, "