]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph-ci.git/commitdiff
crimson/os/seastore: support promote extents purged from pinboard
authorZhang Song <zhangsong02@qianxin.com>
Thu, 3 Jul 2025 07:41:43 +0000 (15:41 +0800)
committerXuehan Xu <xuxuehan@qianxin.com>
Sat, 23 May 2026 07:54:05 +0000 (15:54 +0800)
Signed-off-by: Zhang Song <zhangsong02@qianxin.com>
Signed-off-by: Xuehan Xu <xuxuehan@qianxin.com>
src/common/options/crimson.yaml.in
src/crimson/os/seastore/async_cleaner.h
src/crimson/os/seastore/cache.cc
src/crimson/os/seastore/cached_extent.cc
src/crimson/os/seastore/cached_extent.h
src/crimson/os/seastore/extent_pinboard.cc
src/crimson/os/seastore/extent_pinboard.h
src/crimson/os/seastore/extent_placement_manager.h
src/crimson/os/seastore/transaction_manager.cc
src/crimson/os/seastore/transaction_manager.h

index 8972790586214bf812c0e5add6aa8e1d19cb7c56..92e513e457963ab499bd27a97592b4f469630617 100644 (file)
@@ -298,3 +298,8 @@ options:
   desc: disable osd shards changes upon restart.
   flags:
   - startup
+- name: seastore_cache_promotion_size
+  type: size
+  level: advanced
+  desc: Size in bytes of extents to be promoted from cold devices, set 0 to disable promotion
+  default: 2_M
index a0ca2d1a056f2693b9fbc38d25dd495da3d2dcde..623a1ccbfa0436c76a974ff9b9162e384e856fb4 100644 (file)
@@ -351,6 +351,22 @@ public:
     rewrite_gen_t target_generation,
     sea_time_point modify_time) = 0;
 
+  /**
+   * promote_extent
+   *
+   * Promote an extent located on slow devices to the faster devices.
+   * When the type of extent is ObjectDataBlock, the original extent won't
+   * be retired, so that this extent is located on two different devices
+   * at the same time, which helps reduce the cost of the cleaner process.
+   * The others follow the normal rewrite process, but their rewrite generation
+   * will be INIT_GENERATION (XXX: or the maximum generation of the hot tier?).
+   */
+  using promote_extent_iertr = base_iertr;
+  using promote_extent_ret = promote_extent_iertr::future<>;
+  virtual promote_extent_ret promote_extent(
+    Transaction &t,
+    CachedExtentRef extent) = 0;
+
   /**
    * get_extents_if_live
    *
index 1b08b006ce5f3997b2197b0bbb8fde76f2bfdbf1..7c7c0714ff9f050ae8842d0f15409d6d013be813 100644 (file)
@@ -35,7 +35,8 @@ Cache::Cache(
   : epm(epm),
     pinboard(create_extent_pinboard(
       crimson::common::get_conf<Option::size_t>(
-       "seastore_cachepin_size_pershard")))
+       "seastore_cachepin_size_pershard"),
+      &epm))
 {
   register_metrics(store_index);
   segment_providers_by_device_id.resize(DEVICE_ID_MAX, nullptr);
index e8ff0c428bc6d93c3b341d42c8c8be6307482660..4fd253e27e54e6c2528d428b7e486233c3249b63 100644 (file)
@@ -70,6 +70,8 @@ std::ostream &operator<<(std::ostream &out, const extent_pin_state_t &s) {
   switch (s) {
   case extent_pin_state_t::Fresh:
     return out << "Fresh";
+  case extent_pin_state_t::PendingPromote:
+    return out << "PendingPromote";
   case extent_pin_state_t::WarmIn:
     return out << "WarmIn";
   case extent_pin_state_t::Hot:
index 8785c183d455e81be3d5d3f7bb652564cd2543b6..3f70bb8df464914216593a47fe27dc8daed1200f 100644 (file)
@@ -269,6 +269,7 @@ private:
 enum class extent_pin_state_t : uint8_t {
   // shared state between LRU and 2Q impl
   Fresh = 0,
+  PendingPromote,
   // 2Q impl only
   WarmIn,
   Hot,
@@ -874,7 +875,7 @@ public:
     using crimson::common::get_conf;
     auto type = get_conf<std::string>("seastore_cachepin_type");
     if (type == "LRU") {
-      assert(pin_state == extent_pin_state_t::Fresh);
+      assert(pin_state <= extent_pin_state_t::PendingPromote);
     } else if (type == "2Q") {
       assert(pin_state < extent_pin_state_t::Max);
     } else {
@@ -889,7 +890,7 @@ public:
     using crimson::common::get_conf;
     auto type = get_conf<std::string>("seastore_cachepin_type");
     if (type == "LRU") {
-      assert(pin_state == extent_pin_state_t::Fresh);
+      assert(pin_state <= extent_pin_state_t::PendingPromote);
     } else if (type == "2Q") {
       assert(pin_state < extent_pin_state_t::Max);
     } else {
@@ -1116,6 +1117,7 @@ protected:
   friend class ExtentQueue;
   friend class ExtentPinboardLRU;
   friend class ExtentPinboardTwoQ;
+  friend class ExtentPromoter;
   template <typename T, typename... Args>
   static TCachedExtentRef<T> make_cached_extent_ref(
     Args&&... args) {
index c75786685be8cc5dad7fd92c70416e5f1d3c946d..a0773ca12d08a55ba35f8936bb355f1f5d0fba81 100644 (file)
@@ -3,6 +3,7 @@
 
 #include "crimson/os/seastore/extent_pinboard.h"
 #include "crimson/os/seastore/transaction.h"
+#include "crimson/os/seastore/transaction_manager.h"
 
 #include <boost/unordered/unordered_flat_map.hpp>
 
@@ -266,8 +267,134 @@ void ExtentQueue::get_stats(
   last_overall_io = overall_io;
 }
 
+/**
+ * ExtentPromoter
+ *
+ * Queue of extents waiting to be promoted from a cold device. Extents are
+ * linked into `list` in the PendingPromote pin state, and an extra reference
+ * is held on each of them while linked. `current_contents` tracks the total
+ * length of the linked extents and is kept at or below `promotion_size` by
+ * evicting from the front of the list.
+ *
+ * Promotion is only active once an extent callback has been installed and
+ * the configured promotion size is non-zero.
+ */
+class ExtentPromoter {
+public:
+  ExtentPromoter(size_t promotion_size, ExtentPlacementManager &epm)
+      : promotion_size(promotion_size), epm(epm) {}
+
+  ~ExtentPromoter() {
+    clear();
+  }
+
+  // A promotion size of 0 disables promotion, as documented for the
+  // seastore_cache_promotion_size option; without the size check,
+  // should_run_promote() would be permanently true for an empty queue.
+  bool enabled() const {
+    return ecb != nullptr && promotion_size > 0;
+  }
+
+  // Only extents whose current paddr is on a cold device are candidates.
+  bool should_promote_extent(const CachedExtent &extent) {
+    return enabled() && epm.is_cold_device(extent.get_paddr().get_device_id());
+  }
+
+  // NOTE: despite the name, this returns the number of bytes currently
+  // queued for promotion, not the configured promotion_size.
+  size_t get_promotion_size() const {
+    return current_contents;
+  }
+
+  void set_background_callback(BackgroundListener *l) {
+    listener = l;
+  }
+
+  void set_extent_callback(ExtentCallbackInterface *cb) {
+    ecb = cb;
+  }
+
+  // True when promotion is enabled and the queued bytes have reached the
+  // configured promotion size.
+  bool should_run_promote() const {
+    return enabled() && current_contents >= promotion_size;
+  }
+
+  // Total bytes accounted by completed run_promote() calls.
+  std::size_t get_promoted_size() const {
+    return promoted_size;
+  }
+
+  // Total extents accounted by completed run_promote() calls.
+  std::size_t get_promoted_count() const {
+    return promoted_count;
+  }
+
+  // Link a stable-clean, currently unlinked extent into the pending queue
+  // and take a reference on it for as long as it stays linked.
+  void add_extent(CachedExtent &extent) {
+    assert(!extent.is_linked_to_list());
+    assert(extent.is_stable_clean());
+    extent.set_pin_state(extent_pin_state_t::PendingPromote);
+    list.push_back(extent);
+    current_contents += extent.get_length();
+    intrusive_ptr_add_ref(&extent);
+    // evict the oldest pending extents until the queue fits the budget again
+    while (current_contents > promotion_size) {
+      remove_extent(list.front(), extent_pin_state_t::Fresh);
+    }
+    // NOTE(review): the loop above leaves current_contents at or below
+    // promotion_size, so should_run_promote() can only hold here when the
+    // queue is exactly full -- confirm this is the intended trigger.
+    if (should_run_promote()) {
+      // TODO: wake promote background process
+    }
+  }
+
+  // Unlink a pending extent, move it to new_state and drop the reference
+  // taken by add_extent().
+  void remove_extent(CachedExtent &extent, extent_pin_state_t new_state) {
+    assert(extent.is_linked_to_list());
+    assert(extent.get_pin_state() == extent_pin_state_t::PendingPromote);
+    assert(current_contents >= extent.get_length());
+    extent.set_pin_state(new_state);
+    list.erase(list.s_iterator_to(extent));
+    current_contents -= extent.get_length();
+    intrusive_ptr_release(&extent);
+  }
+
+  // Unlink every pending extent, returning each of them to the Fresh state.
+  void clear() {
+    for (auto iter = list.begin(); iter != list.end();) {
+      // advance before erasing so the iterator stays valid
+      remove_extent(*(iter++), extent_pin_state_t::Fresh);
+    }
+  }
+
+  /**
+   * run_promote
+   *
+   * Promote every extent currently queued within transaction t and submit
+   * the transaction. The promoted counters are only updated after the
+   * submission has completed.
+   */
+  using run_promote_ret = base_iertr::future<>;
+  run_promote_ret run_promote(Transaction &t) {
+    LOG_PREFIX(ExtentPromoter::run_promote);
+    std::size_t promote_size = 0;
+    std::list<CachedExtentRef> extents;
+    DEBUGT("start promote", t);
+    // snapshot the queue into owning references before unlinking, so the
+    // extents stay alive after remove_extent() drops the queue's reference
+    for (auto &extent : list) {
+      DEBUGT("promote {} to the hot tier", t, extent);
+      ceph_assert(extent.is_stable_clean());
+      ceph_assert(extent.get_pin_state() == extent_pin_state_t::PendingPromote);
+      promote_size += extent.get_length();
+      t.add_to_read_set(&extent);
+      extents.emplace_back(&extent);
+    }
+    // NOTE(review): the extents are unlinked before the transaction is
+    // submitted; if promote() re-invokes this function for a retry, the
+    // queue is presumably already empty -- confirm dropping them is intended.
+    for (auto &extent : extents) {
+      remove_extent(*extent, extent_pin_state_t::Fresh);
+    }
+    for (auto &extent : extents) {
+      co_await trans_intr::make_interruptible(extent->wait_io());
+      co_await ecb->promote_extent(t, extent);
+    }
+    // existing extents in lru will be retired after transaction submitted
+    co_await ecb->submit_transaction_direct(t);
+    promoted_count += extents.size();
+    promoted_size += promote_size;
+    DEBUGT("finish promoting {} {}B extents", t, extents.size(), promote_size);
+    co_return;
+  }
+
+  // Run run_promote() in a PROMOTE transaction through repeat_eagain; any
+  // remaining error is treated as fatal.
+  seastar::future<> promote() {
+    assert(enabled());
+    return repeat_eagain([this] {
+      return ecb->with_transaction_intr(
+        Transaction::src_t::PROMOTE,
+        "promote", cache_hint_t::get_nocache(),
+        [this](auto &t) {
+          return run_promote(t);
+        });
+    }).handle_error(crimson::ct_error::assert_all{"error occurred during promotion"});
+  }
+
+private:
+  const size_t promotion_size;
+  ExtentPlacementManager &epm;
+  ExtentCallbackInterface *ecb = nullptr;
+  BackgroundListener *listener = nullptr;
+  CachedExtent::primary_ref_list list;
+  // bytes currently linked into `list`; must start at 0 because it is only
+  // ever adjusted incrementally
+  size_t current_contents = 0;
+
+  // cumulative statistics exported as metrics
+  size_t promoted_count = 0;
+  size_t promoted_size = 0;
+};
+
 class ExtentPinboardLRU : public ExtentPinboard {
   ExtentQueue lru;
+  ExtentPromoter promoter;
   seastar::metrics::metric_group metrics;
 
   // hit and miss indicates if an extent is linked when touching it
@@ -275,9 +402,11 @@ class ExtentPinboardLRU : public ExtentPinboard {
   uint64_t miss = 0;
 
 public:
-  ExtentPinboardLRU(std::size_t capacity) : lru(capacity) {
+  ExtentPinboardLRU(std::size_t capacity, size_t promotion_size, ExtentPlacementManager &epm)
+      : lru(capacity), promoter(promotion_size, epm) {
     LOG_PREFIX(ExtentPinboardLRU::ExtentPinboardLRU);
-    INFO("created, lru_capacity=0x{:x}B", capacity);
+    INFO("created, lru_capacity=0x{:x}B, promotion_size=0x{:x}B",
+        capacity, promotion_size);
   }
 
   std::size_t get_capacity_bytes() const {
@@ -325,6 +454,24 @@ public:
         ),
       }
     );
+    if (promoter.enabled()) {
+      metrics.add_group(
+       "cache",
+       {
+         sm::make_counter(
+           "promoted_size",
+           [this] {
+             return promoter.get_promoted_size();
+           },
+           sm::description("total bytes promoted by the lru")),
+         sm::make_counter(
+           "promoted_count",
+           [this] {
+             return promoter.get_promoted_count();
+           },
+           sm::description("total extents promoted by the lru")),
+       });
+    }
   }
 
   void get_stats(
@@ -335,8 +482,28 @@ public:
   }
 
   void remove(CachedExtent &extent) final {
+    // A linked extent lives either in the lru (Fresh) or in the promoter's
+    // pending queue (any other state); unlink it from whichever holds it.
+    auto s = extent.get_pin_state();
     if (extent.is_linked_to_list()) {
-      lru.remove(extent);
+      if (s == extent_pin_state_t::Fresh) {
+       lru.remove(extent);
+      } else {
+       promoter.remove_extent(extent, extent_pin_state_t::Fresh);
+      }
+    } else {
+      // an unlinked extent must not be left in the PendingPromote state
+      ceph_assert(s == extent_pin_state_t::Fresh);
+    }
+  }
+
+  // Insert extent at the top of the lru, then offer every extent returned
+  // by lru.add_to_top() (presumably the ones trimmed out by the insertion)
+  // to the promoter if it qualifies for promotion.
+  void add_to_top(
+    CachedExtent &extent,
+    const Transaction::src_t* p_src) {
+    auto trimmed = lru.add_to_top(extent, p_src);
+    if (promoter.enabled()) {
+      // named distinctly so it does not shadow the `extent` parameter
+      for (auto &trimmed_extent : trimmed) {
+        if (promoter.should_promote_extent(*trimmed_extent)) {
+          promoter.add_extent(*trimmed_extent);
+        }
+      }
     }
   }
 
@@ -346,10 +513,17 @@ public:
     extent_len_t /*load_start*/,
     extent_len_t /*load_length*/) final {
     if (extent.is_linked_to_list()) {
-      lru.move_to_top(extent, p_src);
+      auto s = extent.get_pin_state();
+      assert(s <= extent_pin_state_t::PendingPromote);
+      if (s == extent_pin_state_t::Fresh) {
+       lru.move_to_top(extent, p_src);
+      } else {
+       promoter.remove_extent(extent, extent_pin_state_t::Fresh);
+       add_to_top(extent, p_src);
+      }
       hit++;
     } else {
-      lru.add_to_top(extent, p_src);
+      add_to_top(extent, p_src);
       miss++;
     }
   }
@@ -358,13 +532,38 @@ public:
     CachedExtent &extent,
     extent_len_t increased_length,
     const Transaction::src_t* p_src) final {
-    if (extent.is_linked_to_list()) {
+    if (extent.is_linked_to_list() &&
+       extent.get_pin_state() == extent_pin_state_t::Fresh) {
       lru.increase_cached_size(extent, increased_length, p_src);
+    } else {
+      // promoter take the complete extent size for content size calculation
+      assert(extent.get_pin_state() <= extent_pin_state_t::PendingPromote);
     }
   }
 
   void clear() final {
     lru.clear();
+    // also unlink every extent still waiting in the promoter's queue
+    promoter.clear();
+  }
+
+  // Forward the background listener to the promoter.
+  void set_background_callback(BackgroundListener *listener) final {
+    promoter.set_background_callback(listener);
+  }
+
+  // Forward the extent callback to the promoter; the promoter reports
+  // itself as disabled while no callback is installed.
+  void set_extent_callback(ExtentCallbackInterface *cb) final {
+    promoter.set_extent_callback(cb);
+  }
+
+  // Returns the promoter's currently queued bytes (not the configured
+  // promotion size limit).
+  std::size_t get_promotion_size() const final {
+    return promoter.get_promotion_size();
+  }
+
+  // Whether the promoter has queued enough bytes to run a promotion.
+  bool should_promote() const final {
+    return promoter.should_run_promote();
+  }
+
+  // Run one promotion pass through the promoter.
+  seastar::future<> promote() final {
+    return promoter.promote();
   }
 
   ~ExtentPinboardLRU() {
@@ -482,15 +681,18 @@ public:
   ExtentPinboardTwoQ(
     std::size_t warm_in_capacity,
     std::size_t warm_out_capacity,
-    std::size_t hot_capacity)
+    std::size_t hot_capacity,
+    std::size_t promotion_size,
+    ExtentPlacementManager &epm)
       : warm_in(warm_in_capacity),
        warm_out(warm_out_capacity),
-       hot(hot_capacity)
+       hot(hot_capacity),
+       promoter(promotion_size, epm)
   {
     LOG_PREFIX(ExtentPinboardTwoQ::ExtentPinboardTwoQ);
-    INFO("created, warm_in_capacity=0x{:x}B, "
-        "warm_out_capacity=0x{:x}B, hot_capacity=0x{:x}B",
-        warm_in_capacity, warm_out_capacity, hot_capacity);
+    INFO("created, warm_in_capacity=0x{:x}B, warm_out_capacity=0x{:x}B, "
+        "hot_capacity=0x{:x}B, promotion_size=0x{:x}B",
+        warm_in_capacity, warm_out_capacity, hot_capacity, promotion_size);
   }
 
   std::size_t get_capacity_bytes() const {
@@ -517,6 +719,8 @@ public:
     if (extent.is_linked_to_list()) {
       if (s == extent_pin_state_t::WarmIn) {
        warm_in.remove(extent);
+      } else if (s == extent_pin_state_t::PendingPromote) {
+       promoter.remove_extent(extent, extent_pin_state_t::Fresh);
       } else {
        ceph_assert(s == extent_pin_state_t::Hot);
        hot.remove(extent);
@@ -538,6 +742,11 @@ public:
       if (state == extent_pin_state_t::Hot) {
        hot.move_to_top(extent, p_src);
        hit_queue(overall_hits.hot_hits, p_src, type);
+      } else if (state == extent_pin_state_t::PendingPromote) {
+       promoter.remove_extent(extent, extent_pin_state_t::Hot);
+       auto trimmed_extents = hot.add_to_top(extent, p_src);
+       on_update_hot(trimmed_extents);
+       hit_queue(overall_hits.hot_hits, p_src, type);
       } else {
        ceph_assert(state == extent_pin_state_t::WarmIn);
        hit_queue(overall_hits.warm_in_hits, p_src, type);
@@ -598,11 +807,13 @@ public:
        auto trimmed_extents = warm_in.increase_cached_size(
          extent, increased_length, p_src);
        on_update_warm_in(trimmed_extents);
-      } else {
-       ceph_assert(state == extent_pin_state_t::Hot);
+      } else if (state == extent_pin_state_t::Hot) {
        auto trimmed_extents = hot.increase_cached_size(
          extent, increased_length, p_src);
        on_update_hot(trimmed_extents);
+      } else {
+       ceph_assert(extent.get_pin_state() ==
+                   extent_pin_state_t::PendingPromote);
       }
     }
   }
@@ -616,6 +827,23 @@ public:
     warm_in.clear();
     warm_out.clear();
     hot.clear();
+    promoter.clear();
+  }
+
+  // Forward the background listener to the promoter.
+  void set_background_callback(BackgroundListener *listener) final {
+    promoter.set_background_callback(listener);
+  }
+  // Forward the extent callback to the promoter; the promoter reports
+  // itself as disabled while no callback is installed.
+  void set_extent_callback(ExtentCallbackInterface *cb) final {
+    promoter.set_extent_callback(cb);
+  }
+  // Returns the promoter's currently queued bytes (not the configured
+  // promotion size limit).
+  std::size_t get_promotion_size() const final {
+    return promoter.get_promotion_size();
+  }
+  // Whether the promoter has queued enough bytes to run a promotion.
+  bool should_promote() const final {
+    return promoter.should_run_promote();
+  }
+  // Run one promotion pass through the promoter.
+  seastar::future<> promote() final {
+    return promoter.promote();
   }
 
   ~ExtentPinboardTwoQ() {
@@ -624,7 +852,11 @@ public:
 private:
   void on_update_hot(std::list<CachedExtentRef> &extents) {
     for (auto extent : extents) {
-      extent->set_pin_state(extent_pin_state_t::Fresh);
+      // extents on a cold device are queued for promotion (add_extent sets
+      // PendingPromote); all others go back to Fresh
+      if (promoter.should_promote_extent(*extent)) {
+       promoter.add_extent(*extent);
+      } else {
+       extent->set_pin_state(extent_pin_state_t::Fresh);
+      }
     }
   }
   void on_update_warm_in(std::list<CachedExtentRef> &extents) {
@@ -663,6 +895,7 @@ private:
   ExtentQueue warm_in;
   IndexedFifoQueue warm_out;
   ExtentQueue hot;
+  ExtentPromoter promoter;
   seastar::metrics::metric_group metrics;
 
   struct QueueCounter {
@@ -852,13 +1085,32 @@ void ExtentPinboardTwoQ::register_metrics(store_index_t store_index) {
       ),
     }
   );
+  if (promoter.enabled()) {
+    metrics.add_group(
+      "cache",
+      {
+       sm::make_counter(
+         "promoted_size",
+         [this] {
+           return promoter.get_promoted_size();
+         },
+         sm::description("total bytes promoted by the lru")),
+       sm::make_counter(
+         "promoted_count",
+         [this] {
+           return promoter.get_promoted_count();
+         },
+         sm::description("total extents promoted by the lru")),
+      });
+  }
 }
 
-ExtentPinboardRef create_extent_pinboard(std::size_t capacity) {
+ExtentPinboardRef create_extent_pinboard(std::size_t capacity, ExtentPlacementManager *epm) {
   using crimson::common::get_conf;
+  size_t promotion_size = get_conf<Option::size_t>("seastore_cache_promotion_size");
   auto algorithm = get_conf<std::string>("seastore_cachepin_type");
   if (algorithm == "LRU") {
-    return std::make_unique<ExtentPinboardLRU>(capacity);
+    return std::make_unique<ExtentPinboardLRU>(capacity, promotion_size, *epm);
   } else if (algorithm == "2Q") {
     auto warm_in_ratio = get_conf<double>("seastore_cachepin_2q_in_ratio");
     auto warm_out_ratio = get_conf<double>("seastore_cachepin_2q_out_ratio");
@@ -867,7 +1119,8 @@ ExtentPinboardRef create_extent_pinboard(std::size_t capacity) {
     return std::make_unique<ExtentPinboardTwoQ>(
       capacity * warm_in_ratio,
       capacity * warm_out_ratio,
-      capacity * (1 - warm_in_ratio));
+      capacity * (1 - warm_in_ratio),
+      promotion_size, *epm);
   } else {
     ceph_abort("invalid seastore_cachepin_type(LRU or 2Q)");
     return nullptr;
index 3d2dd542b26d36b1b923efbc6331a3367392b3c9..a766e486d4fb7fb153aea8a212739953e68ab6b3 100644 (file)
@@ -7,6 +7,9 @@
 #include "crimson/os/seastore/transaction.h"
 
 namespace crimson::os::seastore {
+class BackgroundListener;
+class ExtentCallbackInterface;
+class ExtentPlacementManager;
 
 struct ExtentPinboard {
   virtual ~ExtentPinboard() = default;
@@ -28,8 +31,14 @@ struct ExtentPinboard {
     extent_len_t increased_length,
     const Transaction::src_t *p_src) = 0;
   virtual void clear() = 0;
+  virtual void set_background_callback(BackgroundListener *listener) = 0;
+  virtual void set_extent_callback(ExtentCallbackInterface *cb) = 0;
+  virtual std::size_t get_promotion_size() const = 0;
+  virtual bool should_promote() const = 0;
+  virtual seastar::future<> promote() = 0;
 };
 using ExtentPinboardRef = std::unique_ptr<ExtentPinboard>;
-ExtentPinboardRef create_extent_pinboard(std::size_t capacity);
+ExtentPinboardRef create_extent_pinboard(
+  std::size_t capacity, ExtentPlacementManager *epm);
 
 } // namespace crimson::os::seastore
index 123826f16be77d5f7f28200e5b64259934b28c2e..b7ad1b06a9e5cf7edb0dcef7b7209141cfdd4f19 100644 (file)
@@ -571,13 +571,20 @@ public:
     return primary_device->get_backend_type();
   }
 
-
   bool is_pure_rbm() const {
     return get_main_backend_type() == backend_type_t::RANDOM_BLOCK &&
       // as of now, cold tier can only be segmented.
       !background_process.has_cold_tier();
   }
 
+  // Whether the background process has a cold tier.
+  bool has_cold_tier() const {
+    return background_process.has_cold_tier();
+  }
+
+  // Whether device `id` is a cold device, as decided by the background
+  // process.
+  bool is_cold_device(device_id_t id) const {
+    return background_process.is_cold_device(id);
+  }
+
   // Testing interfaces
 
   void test_init_no_background(Device *test_device) {
@@ -770,6 +777,14 @@ private:
       return cold_cleaner.get() != nullptr;
     }
 
+    // A device counts as cold when a cold tier exists and the cleaner
+    // registered for the device is not the main cleaner.
+    // NOTE(review): assumes `id` is a valid index with a registered
+    // cleaner -- only checked by the assert below.
+    bool is_cold_device(device_id_t id) const {
+      if (!has_cold_tier()) {
+        return false;
+      }
+      assert(cleaners_by_device_id[id]);
+      return cleaners_by_device_id[id] != main_cleaner.get();
+    }
+
     void set_extent_callback(ExtentCallbackInterface *cb) {
       trimmer->set_extent_callback(cb);
       main_cleaner->set_extent_callback(cb);
index 8dd915f988bb0fe9fe58f3bee853d2d3d3934074..2af3d2d2a1a646cb6bc4d9a3b9f94cd6fefbdfee 100644 (file)
@@ -992,6 +992,15 @@ TransactionManager::move_region(
   co_return;
 }
 
+// Placeholder for promoting `extent` to the faster devices within `t`:
+// currently performs no work and returns a ready future.
+TransactionManager::promote_extent_ret
+TransactionManager::promote_extent(
+  Transaction &t,
+  CachedExtentRef extent)
+{
+  // TODO
+  // use this function's own error type rather than rewrite_extent's
+  return promote_extent_iertr::make_ready_future();
+}
+
 TransactionManager::get_extents_if_live_ret
 TransactionManager::get_extents_if_live(
   Transaction &t,
index 2e59c701ea3938015fade80edd45cf4545732b21..1c94df8e927cac4b49e40bce87b0dfa4cff6a181 100644 (file)
@@ -900,6 +900,11 @@ public:
     rewrite_gen_t target_generation,
     sea_time_point modify_time) final;
 
+  // Implements ExtentCallbackInterface::promote_extent; marked final like
+  // the other callback overrides of this class.
+  using ExtentCallbackInterface::promote_extent_ret;
+  promote_extent_ret promote_extent(
+    Transaction &t,
+    CachedExtentRef extent) final;
+
   using ExtentCallbackInterface::get_extents_if_live_ret;
   get_extents_if_live_ret get_extents_if_live(
     Transaction &t,