From: Zhang Song Date: Thu, 3 Jul 2025 07:41:43 +0000 (+0800) Subject: crimson/os/seastore: support promote extents purged from pinboard X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=4696565e22e45e631fdbb3a58d4afd7d7617c30b;p=ceph-ci.git crimson/os/seastore: support promote extents purged from pinboard Signed-off-by: Zhang Song Signed-off-by: Xuehan Xu --- diff --git a/src/common/options/crimson.yaml.in b/src/common/options/crimson.yaml.in index 89727905862..92e513e4579 100644 --- a/src/common/options/crimson.yaml.in +++ b/src/common/options/crimson.yaml.in @@ -298,3 +298,8 @@ options: desc: disable osd shards changes upon restart. flags: - startup +- name: seastore_cache_promotion_size + type: size + level: advanced + desc: Size in bytes of extents to be promoted from cold devices, set 0 to disable promotion + default: 2_M diff --git a/src/crimson/os/seastore/async_cleaner.h b/src/crimson/os/seastore/async_cleaner.h index a0ca2d1a056..623a1ccbfa0 100644 --- a/src/crimson/os/seastore/async_cleaner.h +++ b/src/crimson/os/seastore/async_cleaner.h @@ -351,6 +351,22 @@ public: rewrite_gen_t target_generation, sea_time_point modify_time) = 0; + /** + * promote_extent + * + * Promote an extent located on slow devices to the faster devices. + * When the type of extent is ObjectDataBlock, the original extent won't + * be retired, so that this extent is located in two different devices + * at the same time, which is helpful to reduce the cost of cleaner process. + * Other extent types follow the normal rewrite process, but their rewrite generation + * will be INIT_GENERATION(XXX: or the maximum generation of the hot tier?). 
+ */ + using promote_extent_iertr = base_iertr; + using promote_extent_ret = promote_extent_iertr::future<>; + virtual promote_extent_ret promote_extent( + Transaction &t, + CachedExtentRef extent) = 0; + /** * get_extents_if_live * diff --git a/src/crimson/os/seastore/cache.cc b/src/crimson/os/seastore/cache.cc index 1b08b006ce5..7c7c0714ff9 100644 --- a/src/crimson/os/seastore/cache.cc +++ b/src/crimson/os/seastore/cache.cc @@ -35,7 +35,8 @@ Cache::Cache( : epm(epm), pinboard(create_extent_pinboard( crimson::common::get_conf( - "seastore_cachepin_size_pershard"))) + "seastore_cachepin_size_pershard"), + &epm)) { register_metrics(store_index); segment_providers_by_device_id.resize(DEVICE_ID_MAX, nullptr); diff --git a/src/crimson/os/seastore/cached_extent.cc b/src/crimson/os/seastore/cached_extent.cc index e8ff0c428bc..4fd253e27e5 100644 --- a/src/crimson/os/seastore/cached_extent.cc +++ b/src/crimson/os/seastore/cached_extent.cc @@ -70,6 +70,8 @@ std::ostream &operator<<(std::ostream &out, const extent_pin_state_t &s) { switch (s) { case extent_pin_state_t::Fresh: return out << "Fresh"; + case extent_pin_state_t::PendingPromote: + return out << "PendingPromote"; case extent_pin_state_t::WarmIn: return out << "WarmIn"; case extent_pin_state_t::Hot: diff --git a/src/crimson/os/seastore/cached_extent.h b/src/crimson/os/seastore/cached_extent.h index 8785c183d45..3f70bb8df46 100644 --- a/src/crimson/os/seastore/cached_extent.h +++ b/src/crimson/os/seastore/cached_extent.h @@ -269,6 +269,7 @@ private: enum class extent_pin_state_t : uint8_t { // shared state between LRU and 2Q impl Fresh = 0, + PendingPromote, // 2Q impl only WarmIn, Hot, @@ -874,7 +875,7 @@ public: using crimson::common::get_conf; auto type = get_conf("seastore_cachepin_type"); if (type == "LRU") { - assert(pin_state == extent_pin_state_t::Fresh); + assert(pin_state <= extent_pin_state_t::PendingPromote); } else if (type == "2Q") { assert(pin_state < extent_pin_state_t::Max); } else { @@ -889,7 
+890,7 @@ public: using crimson::common::get_conf; auto type = get_conf("seastore_cachepin_type"); if (type == "LRU") { - assert(pin_state == extent_pin_state_t::Fresh); + assert(pin_state <= extent_pin_state_t::PendingPromote); } else if (type == "2Q") { assert(pin_state < extent_pin_state_t::Max); } else { @@ -1116,6 +1117,7 @@ protected: friend class ExtentQueue; friend class ExtentPinboardLRU; friend class ExtentPinboardTwoQ; + friend class ExtentPromoter; template static TCachedExtentRef make_cached_extent_ref( Args&&... args) { diff --git a/src/crimson/os/seastore/extent_pinboard.cc b/src/crimson/os/seastore/extent_pinboard.cc index c75786685be..a0773ca12d0 100644 --- a/src/crimson/os/seastore/extent_pinboard.cc +++ b/src/crimson/os/seastore/extent_pinboard.cc @@ -3,6 +3,7 @@ #include "crimson/os/seastore/extent_pinboard.h" #include "crimson/os/seastore/transaction.h" +#include "crimson/os/seastore/transaction_manager.h" #include @@ -266,8 +267,134 @@ void ExtentQueue::get_stats( last_overall_io = overall_io; } +class ExtentPromoter { +public: + ExtentPromoter(size_t promotion_size, ExtentPlacementManager &epm) + : promotion_size(promotion_size), epm(epm) {} + + ~ExtentPromoter() { + clear(); + } + + bool enabled() const { + return ecb != nullptr; + } + + bool should_promote_extent(const CachedExtent &extent) { + return enabled() && epm.is_cold_device(extent.get_paddr().get_device_id()); + } + + size_t get_promotion_size() const { + return current_contents; + } + + void set_background_callback(BackgroundListener *l) { + listener = l; + } + + void set_extent_callback(ExtentCallbackInterface *cb) { + ecb = cb; + } + + bool should_run_promote() const { + return enabled() && current_contents >= promotion_size; + } + + std::size_t get_promoted_size() const { + return promoted_size; + } + + std::size_t get_promoted_count() const { + return promoted_count; + } + + void add_extent(CachedExtent &extent) { + assert(!extent.is_linked_to_list()); + 
assert(extent.is_stable_clean()); + extent.set_pin_state(extent_pin_state_t::PendingPromote); + list.push_back(extent); + current_contents += extent.get_length(); + intrusive_ptr_add_ref(&extent); + while (current_contents > promotion_size) { + remove_extent(list.front(), extent_pin_state_t::Fresh); + } + if (should_run_promote()) { + // TODO: wake promote background process + } + } + + void remove_extent(CachedExtent &extent, extent_pin_state_t new_state) { + assert(extent.is_linked_to_list()); + assert(extent.get_pin_state() == extent_pin_state_t::PendingPromote); + assert(current_contents >= extent.get_length()); + extent.set_pin_state(new_state); + list.erase(list.s_iterator_to(extent)); + current_contents -= extent.get_length(); + intrusive_ptr_release(&extent); + } + + void clear() { + for (auto iter = list.begin(); iter != list.end();) { + remove_extent(*(iter++), extent_pin_state_t::Fresh); + } + } + + using run_promote_ret = base_iertr::future<>; + run_promote_ret run_promote(Transaction &t) { + LOG_PREFIX(ExtentPromoter::run_promote); + std::size_t promote_size = 0; + std::list extents; + DEBUGT("start promote", t); + for (auto &extent : list) { + DEBUGT("promote {} to the hot tier", t, extent); + ceph_assert(extent.is_stable_clean()); + ceph_assert(extent.get_pin_state() == extent_pin_state_t::PendingPromote); + promote_size += extent.get_length(); + t.add_to_read_set(&extent); + extents.emplace_back(&extent); + } + for (auto &extent : extents) { + remove_extent(*extent, extent_pin_state_t::Fresh); + } + for (auto &extent : extents) { + co_await trans_intr::make_interruptible(extent->wait_io()); + co_await ecb->promote_extent(t, extent); + } + // existing extents in lru will be retired after transaction submitted + co_await ecb->submit_transaction_direct(t); + promoted_count += extents.size(); + promoted_size += promote_size; + DEBUGT("finish promoting {} {}B extents", t, extents.size(), promote_size); + co_return; + } + + seastar::future<> promote() { + 
assert(enabled()); + return repeat_eagain([this] { + return ecb->with_transaction_intr( + Transaction::src_t::PROMOTE, + "promote", cache_hint_t::get_nocache(), + [this](auto &t) { + return run_promote(t); + }); + }).handle_error(crimson::ct_error::assert_all{"error occupied during promotion"}); + } + +private: + const size_t promotion_size; + ExtentPlacementManager &epm; + ExtentCallbackInterface *ecb = nullptr; + BackgroundListener *listener = nullptr; + CachedExtent::primary_ref_list list; + size_t current_contents; + + size_t promoted_count; + size_t promoted_size; +}; + class ExtentPinboardLRU : public ExtentPinboard { ExtentQueue lru; + ExtentPromoter promoter; seastar::metrics::metric_group metrics; // hit and miss indicates if an extent is linked when touching it @@ -275,9 +402,11 @@ class ExtentPinboardLRU : public ExtentPinboard { uint64_t miss = 0; public: - ExtentPinboardLRU(std::size_t capacity) : lru(capacity) { + ExtentPinboardLRU(std::size_t capacity, size_t promotion_size, ExtentPlacementManager &epm) + : lru(capacity), promoter(promotion_size, epm) { LOG_PREFIX(ExtentPinboardLRU::ExtentPinboardLRU); - INFO("created, lru_capacity=0x{:x}B", capacity); + INFO("created, lru_capacity=0x{:x}B, promotion_size=0x{:x}B", + capacity, promotion_size); } std::size_t get_capacity_bytes() const { @@ -325,6 +454,24 @@ public: ), } ); + if (promoter.enabled()) { + metrics.add_group( + "cache", + { + sm::make_counter( + "promoted_size", + [this] { + return promoter.get_promoted_size(); + }, + sm::description("total bytes promoted by the lru")), + sm::make_counter( + "promoted_count", + [this] { + return promoter.get_promoted_count(); + }, + sm::description("total extents promoted by the lru")), + }); + } } void get_stats( @@ -335,8 +482,28 @@ public: } void remove(CachedExtent &extent) final { + auto s = extent.get_pin_state(); if (extent.is_linked_to_list()) { - lru.remove(extent); + if (s == extent_pin_state_t::Fresh) { + lru.remove(extent); + } else { + 
promoter.remove_extent(extent, extent_pin_state_t::Fresh); + } + } else { + ceph_assert(s == extent_pin_state_t::Fresh); + } + } + + void add_to_top( + CachedExtent &extent, + const Transaction::src_t* p_src) { + auto trimmed = lru.add_to_top(extent, p_src); + if (promoter.enabled()) { + for (auto &extent : trimmed) { + if (promoter.should_promote_extent(*extent)) { + promoter.add_extent(*extent); + } + } } } @@ -346,10 +513,17 @@ public: extent_len_t /*load_start*/, extent_len_t /*load_length*/) final { if (extent.is_linked_to_list()) { - lru.move_to_top(extent, p_src); + auto s = extent.get_pin_state(); + assert(s <= extent_pin_state_t::PendingPromote); + if (s == extent_pin_state_t::Fresh) { + lru.move_to_top(extent, p_src); + } else { + promoter.remove_extent(extent, extent_pin_state_t::Fresh); + add_to_top(extent, p_src); + } hit++; } else { - lru.add_to_top(extent, p_src); + add_to_top(extent, p_src); miss++; } } @@ -358,13 +532,38 @@ public: CachedExtent &extent, extent_len_t increased_length, const Transaction::src_t* p_src) final { - if (extent.is_linked_to_list()) { + if (extent.is_linked_to_list() && + extent.get_pin_state() == extent_pin_state_t::Fresh) { lru.increase_cached_size(extent, increased_length, p_src); + } else { + // the promoter takes the complete extent size for content size calculation + assert(extent.get_pin_state() <= extent_pin_state_t::PendingPromote); } } void clear() final { lru.clear(); + promoter.clear(); + } + + void set_background_callback(BackgroundListener *listener) final { + promoter.set_background_callback(listener); + } + + void set_extent_callback(ExtentCallbackInterface *cb) final { + promoter.set_extent_callback(cb); + } + + std::size_t get_promotion_size() const final { + return promoter.get_promotion_size(); + } + + bool should_promote() const final { + return promoter.should_run_promote(); + } + + seastar::future<> promote() final { + return promoter.promote(); } ~ExtentPinboardLRU() { @@ -482,15 +681,18 @@ public: 
ExtentPinboardTwoQ( std::size_t warm_in_capacity, std::size_t warm_out_capacity, - std::size_t hot_capacity) + std::size_t hot_capacity, + std::size_t promotion_size, + ExtentPlacementManager &epm) : warm_in(warm_in_capacity), warm_out(warm_out_capacity), - hot(hot_capacity) + hot(hot_capacity), + promoter(promotion_size, epm) { LOG_PREFIX(ExtentPinboardTwoQ::ExtentPinboardTwoQ); - INFO("created, warm_in_capacity=0x{:x}B, " - "warm_out_capacity=0x{:x}B, hot_capacity=0x{:x}B", - warm_in_capacity, warm_out_capacity, hot_capacity); + INFO("created, warm_in_capacity=0x{:x}B, warm_out_capacity=0x{:x}B, " + "hot_capacity=0x{:x}B, promotion_size=0x{:x}B", + warm_in_capacity, warm_out_capacity, hot_capacity, promotion_size); } std::size_t get_capacity_bytes() const { @@ -517,6 +719,8 @@ public: if (extent.is_linked_to_list()) { if (s == extent_pin_state_t::WarmIn) { warm_in.remove(extent); + } else if (s == extent_pin_state_t::PendingPromote) { + promoter.remove_extent(extent, extent_pin_state_t::Fresh); } else { ceph_assert(s == extent_pin_state_t::Hot); hot.remove(extent); @@ -538,6 +742,11 @@ public: if (state == extent_pin_state_t::Hot) { hot.move_to_top(extent, p_src); hit_queue(overall_hits.hot_hits, p_src, type); + } else if (state == extent_pin_state_t::PendingPromote) { + promoter.remove_extent(extent, extent_pin_state_t::Hot); + auto trimmed_extents = hot.add_to_top(extent, p_src); + on_update_hot(trimmed_extents); + hit_queue(overall_hits.hot_hits, p_src, type); } else { ceph_assert(state == extent_pin_state_t::WarmIn); hit_queue(overall_hits.warm_in_hits, p_src, type); @@ -598,11 +807,13 @@ public: auto trimmed_extents = warm_in.increase_cached_size( extent, increased_length, p_src); on_update_warm_in(trimmed_extents); - } else { - ceph_assert(state == extent_pin_state_t::Hot); + } else if (state == extent_pin_state_t::Hot) { auto trimmed_extents = hot.increase_cached_size( extent, increased_length, p_src); on_update_hot(trimmed_extents); + } else { + 
ceph_assert(extent.get_pin_state() == + extent_pin_state_t::PendingPromote); } } } @@ -616,6 +827,23 @@ public: warm_in.clear(); warm_out.clear(); hot.clear(); + promoter.clear(); + } + + void set_background_callback(BackgroundListener *listener) final { + promoter.set_background_callback(listener); + } + void set_extent_callback(ExtentCallbackInterface *cb) final { + promoter.set_extent_callback(cb); + } + std::size_t get_promotion_size() const final { + return promoter.get_promotion_size(); + } + bool should_promote() const final { + return promoter.should_run_promote(); + } + seastar::future<> promote() final { + return promoter.promote(); } ~ExtentPinboardTwoQ() { @@ -624,7 +852,11 @@ public: private: void on_update_hot(std::list &extents) { for (auto extent : extents) { - extent->set_pin_state(extent_pin_state_t::Fresh); + if (promoter.should_promote_extent(*extent)) { + promoter.add_extent(*extent); + } else { + extent->set_pin_state(extent_pin_state_t::Fresh); + } } } void on_update_warm_in(std::list &extents) { @@ -663,6 +895,7 @@ private: ExtentQueue warm_in; IndexedFifoQueue warm_out; ExtentQueue hot; + ExtentPromoter promoter; seastar::metrics::metric_group metrics; struct QueueCounter { @@ -852,13 +1085,32 @@ void ExtentPinboardTwoQ::register_metrics(store_index_t store_index) { ), } ); + if (promoter.enabled()) { + metrics.add_group( + "cache", + { + sm::make_counter( + "promoted_size", + [this] { + return promoter.get_promoted_size(); + }, + sm::description("total bytes promoted by the lru")), + sm::make_counter( + "promoted_count", + [this] { + return promoter.get_promoted_count(); + }, + sm::description("total extents promoted by the lru")), + }); + } } -ExtentPinboardRef create_extent_pinboard(std::size_t capacity) { +ExtentPinboardRef create_extent_pinboard(std::size_t capacity, ExtentPlacementManager *epm) { using crimson::common::get_conf; + size_t promotion_size = get_conf("seastore_cache_promotion_size"); auto algorithm = 
get_conf("seastore_cachepin_type"); if (algorithm == "LRU") { - return std::make_unique(capacity); + return std::make_unique(capacity, promotion_size, *epm); } else if (algorithm == "2Q") { auto warm_in_ratio = get_conf("seastore_cachepin_2q_in_ratio"); auto warm_out_ratio = get_conf("seastore_cachepin_2q_out_ratio"); @@ -867,7 +1119,8 @@ ExtentPinboardRef create_extent_pinboard(std::size_t capacity) { return std::make_unique( capacity * warm_in_ratio, capacity * warm_out_ratio, - capacity * (1 - warm_in_ratio)); + capacity * (1 - warm_in_ratio), + promotion_size, *epm); } else { ceph_abort("invalid seastore_cachepin_type(LRU or 2Q)"); return nullptr; diff --git a/src/crimson/os/seastore/extent_pinboard.h b/src/crimson/os/seastore/extent_pinboard.h index 3d2dd542b26..a766e486d4f 100644 --- a/src/crimson/os/seastore/extent_pinboard.h +++ b/src/crimson/os/seastore/extent_pinboard.h @@ -7,6 +7,9 @@ #include "crimson/os/seastore/transaction.h" namespace crimson::os::seastore { +class BackgroundListener; +class ExtentCallbackInterface; +class ExtentPlacementManager; struct ExtentPinboard { virtual ~ExtentPinboard() = default; @@ -28,8 +31,14 @@ struct ExtentPinboard { extent_len_t increased_length, const Transaction::src_t *p_src) = 0; virtual void clear() = 0; + virtual void set_background_callback(BackgroundListener *listener) = 0; + virtual void set_extent_callback(ExtentCallbackInterface *cb) = 0; + virtual std::size_t get_promotion_size() const = 0; + virtual bool should_promote() const = 0; + virtual seastar::future<> promote() = 0; }; using ExtentPinboardRef = std::unique_ptr; -ExtentPinboardRef create_extent_pinboard(std::size_t capacity); +ExtentPinboardRef create_extent_pinboard( + std::size_t capacity, ExtentPlacementManager *epm); } // namespace crimson::os::seastore diff --git a/src/crimson/os/seastore/extent_placement_manager.h b/src/crimson/os/seastore/extent_placement_manager.h index 123826f16be..b7ad1b06a9e 100644 --- 
a/src/crimson/os/seastore/extent_placement_manager.h +++ b/src/crimson/os/seastore/extent_placement_manager.h @@ -571,13 +571,20 @@ public: return primary_device->get_backend_type(); } - bool is_pure_rbm() const { return get_main_backend_type() == backend_type_t::RANDOM_BLOCK && // as of now, cold tier can only be segmented. !background_process.has_cold_tier(); } + bool has_cold_tier() const { + return background_process.has_cold_tier(); + } + + bool is_cold_device(device_id_t id) const { + return background_process.is_cold_device(id); + } + // Testing interfaces void test_init_no_background(Device *test_device) { @@ -770,6 +777,14 @@ private: return cold_cleaner.get() != nullptr; } + bool is_cold_device(device_id_t id) const { + if (!has_cold_tier()) { + return false; + } + assert(cleaners_by_device_id[id]); + return cleaners_by_device_id[id] != main_cleaner.get(); + } + void set_extent_callback(ExtentCallbackInterface *cb) { trimmer->set_extent_callback(cb); main_cleaner->set_extent_callback(cb); diff --git a/src/crimson/os/seastore/transaction_manager.cc b/src/crimson/os/seastore/transaction_manager.cc index 8dd915f988b..2af3d2d2a1a 100644 --- a/src/crimson/os/seastore/transaction_manager.cc +++ b/src/crimson/os/seastore/transaction_manager.cc @@ -992,6 +992,15 @@ TransactionManager::move_region( co_return; } +TransactionManager::promote_extent_ret +TransactionManager::promote_extent( + Transaction &t, + CachedExtentRef extent) +{ + // TODO + return rewrite_extent_iertr::make_ready_future(); +} + TransactionManager::get_extents_if_live_ret TransactionManager::get_extents_if_live( Transaction &t, diff --git a/src/crimson/os/seastore/transaction_manager.h b/src/crimson/os/seastore/transaction_manager.h index 2e59c701ea3..1c94df8e927 100644 --- a/src/crimson/os/seastore/transaction_manager.h +++ b/src/crimson/os/seastore/transaction_manager.h @@ -900,6 +900,11 @@ public: rewrite_gen_t target_generation, sea_time_point modify_time) final; + using 
ExtentCallbackInterface::promote_extent_ret; + promote_extent_ret promote_extent( + Transaction &t, + CachedExtentRef extent); + using ExtentCallbackInterface::get_extents_if_live_ret; get_extents_if_live_ret get_extents_if_live( Transaction &t,