]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph-ci.git/commitdiff
crimson/os/seastore: add write through policy
author Zhang Song <zhangsong02@qianxin.com>
Wed, 3 Sep 2025 08:05:38 +0000 (16:05 +0800)
committer Xuehan Xu <xuxuehan@qianxin.com>
Sat, 23 May 2026 09:11:36 +0000 (17:11 +0800)
Signed-off-by: Zhang Song <zhangsong02@qianxin.com>
Signed-off-by: Xuehan Xu <xuxuehan@qianxin.com>
src/common/options/crimson.yaml.in
src/crimson/os/seastore/cache.cc
src/crimson/os/seastore/cache.h
src/crimson/os/seastore/cached_extent.h
src/crimson/os/seastore/extent_placement_manager.h
src/crimson/os/seastore/seastore_types.cc
src/crimson/os/seastore/seastore_types.h
src/crimson/os/seastore/transaction_manager.h
src/test/crimson/seastore/test_btree_lba_manager.cc
src/test/crimson/seastore/test_transaction_manager.cc

index 4772368dc8301ea0f347045541550850ed3e047f..5c31100954c0820885c66809b52230c87e710909 100644 (file)
@@ -343,3 +343,8 @@ options:
   level: advanced
   desc: Size in bytes of extents to be demoted from logical bucket
   default: 2_M
+- name: seastore_write_through_size
+  type: size
+  level: dev
+  desc: Select the write-through policy for data extents whose length is greater than or equal to this value; only takes effect when a cold tier is present.
+  default: 512_K
index 555b81c14ce18e75c4d9a352ed5ccbab60fedf15..55c855bdc2dce5789dce93d4984e23fdc335c61a 100644 (file)
@@ -1272,14 +1272,16 @@ std::vector<CachedExtentRef> Cache::alloc_new_data_extents_by_type(
   case extent_types_t::OBJECT_DATA_BLOCK:
     {
       auto extents = alloc_new_data_extents<
-       ObjectDataBlock>(t, length, {hint, gen, is_tracked});
+       ObjectDataBlock>(t, length, {hint, gen, is_tracked,
+           epm.get_write_policy(type, length)});
       res.insert(res.begin(), extents.begin(), extents.end());
     }
     return res;
   case extent_types_t::TEST_BLOCK:
     {
       auto extents = alloc_new_data_extents<
-       TestBlock>(t, length, {hint, gen, is_tracked});
+       TestBlock>(t, length, {hint, gen, is_tracked,
+         epm.get_write_policy(type, length)});
       res.insert(res.begin(), extents.begin(), extents.end());
     }
     return res;
@@ -2197,7 +2199,8 @@ void Cache::init()
              P_ADDR_ROOT,
              PLACEMENT_HINT_NULL,
              NULL_GENERATION,
-             TRANS_ID_NULL);
+             TRANS_ID_NULL,
+            write_policy_t::WRITE_BACK);
   root->set_modify_time(seastar::lowres_system_clock::now());
   INFO("init root -- {}", *root);
   add_extent(root);
@@ -2621,7 +2624,8 @@ Cache::_get_absent_extent_by_type(
            offset,
            PLACEMENT_HINT_NULL,
            NULL_GENERATION,
-           TRANS_ID_NULL);
+           TRANS_ID_NULL,
+           write_policy_t::WRITE_BACK);
   DEBUGT("{} length=0x{:x} is absent, add extent ... -- {}",
     t, type, length, *ret);
   add_extent(ret);
index a5ba78ffae45cf6fb2b38cbcf5eaea66dfd3391e..5317e14bbb8965076834d47646bac6e1da99dfe6 100644 (file)
@@ -402,7 +402,8 @@ public:
              offset,
              PLACEMENT_HINT_NULL,
              NULL_GENERATION,
-             TRANS_ID_NULL);
+             TRANS_ID_NULL,
+             write_policy_t::WRITE_BACK);
     SUBDEBUGT(seastore_cache,
        "{} {}~0x{:x} is absent, add extent and reading range 0x{:x}~0x{:x} ... -- {}",
        t, T::TYPE, offset, length, partial_off, partial_len, *ret);
@@ -843,7 +844,8 @@ private:
                 offset,
                 PLACEMENT_HINT_NULL,
                 NULL_GENERATION,
-               TRANS_ID_NULL);
+               TRANS_ID_NULL,
+               write_policy_t::WRITE_BACK);
       SUBDEBUG(seastore_cache,
           "{} {}~0x{:x} is absent, add extent and reading range 0x{:x}~0x{:x} ... -- {}",
           T::TYPE, offset, length, partial_off, partial_len, *ret);
@@ -1122,7 +1124,8 @@ public:
               result->paddr,
               opt.hint,
               result->gen,
-             t.get_trans_id());
+             t.get_trans_id(),
+             write_policy_t::WRITE_BACK);
     t.add_fresh_extent(ret);
     SUBDEBUGT(seastore_cache,
               "allocated {} 0x{:x}B extent at {}, hint={}, gen={} -- {}",
@@ -1160,7 +1163,8 @@ public:
                 result.paddr,
                 opt.hint,
                 result.gen,
-                t.get_trans_id());
+                t.get_trans_id(),
+               opt.write_policy);
       t.add_fresh_extent(ret);
       SUBDEBUGT(seastore_cache,
                 "allocated {} 0x{:x}B extent at {}, hint={}, gen={} -- {}",
@@ -1202,7 +1206,8 @@ public:
              remap_paddr,
              PLACEMENT_HINT_NULL,
              NULL_GENERATION,
-              t.get_trans_id());
+              t.get_trans_id(),
+             write_policy_t::WRITE_BACK);
 
     auto extent = ext->template cast<T>();
     extent->set_laddr(remap_laddr);
index d2358399986e0cecd5a5bd409a2d6715741727cb..9371c984544b9ff779f6ae1b7846fab15838fcc5 100644 (file)
@@ -361,12 +361,14 @@ public:
             paddr_t paddr,
             placement_hint_t hint,
             rewrite_gen_t gen,
-           transaction_id_t trans_id) {
+            transaction_id_t trans_id,
+            write_policy_t policy) {
     state = _state;
     set_paddr(paddr);
     user_hint = hint;
     rewrite_generation = gen;
     pending_for_transaction = trans_id;
+    write_policy = policy;
   }
 
   void set_modify_time(sea_time_point t) {
@@ -538,6 +540,7 @@ public:
        << ", last_committed_crc=" << last_committed_crc
        << ", refcount=" << use_count()
        << ", user_hint=" << user_hint
+       << ", write_policy=" << write_policy
        << ", rewrite_gen=" << rewrite_gen_printer_t{rewrite_generation}
        << ", pending_io=";
     if (is_pending_io()) {
@@ -917,6 +920,18 @@ public:
     is_shadow = b;
   }
 
+  /// Returns the write policy currently recorded on this extent.
+  /// The member defaults to write_policy_t::WRITE_BACK and is assigned by
+  /// init() and set_write_policy().
+  write_policy_t get_write_policy() const {
+    return write_policy;
+  }
+
+  /// Overwrites the write policy recorded on this extent with w.
+  /// No other extent state is touched.
+  void set_write_policy(write_policy_t w) {
+    write_policy = w;
+  }
+
+  /// Restores the extent's write policy to write_policy_t::WRITE_BACK,
+  /// the same value the member is default-initialized with.
+  void reset_write_policy() {
+    set_write_policy(write_policy_t::WRITE_BACK);
+  }
+
 private:
   template <typename T>
   friend class read_set_item_t;
@@ -1028,6 +1043,8 @@ private:
 
   placement_hint_t user_hint = PLACEMENT_HINT_NULL;
 
+  write_policy_t write_policy = write_policy_t::WRITE_BACK;
+
   // the target rewrite generation for the followup rewrite
   // or the rewrite generation for the fresh write
   rewrite_gen_t rewrite_generation = NULL_GENERATION;
index 716211678b6dc0780eb219c11fb4ce1625870f8e..3027f645a0fa0c63df57f72c410c9db305bd542d 100644 (file)
@@ -351,7 +351,9 @@ public:
       ool_segment_seq_allocator(
           std::make_unique<SegmentSeqAllocator>(segment_type_t::OOL)),
       max_data_allocation_size(crimson::common::get_conf<Option::size_t>(
-         "seastore_max_data_allocation_size"))
+         "seastore_max_data_allocation_size")),
+      write_through_size(crimson::common::get_conf<Option::size_t>(
+         "seastore_write_through_size"))
   {
     LOG_PREFIX(ExtentPlacementManager::ExtentPlacementManager);
     devices_by_id.resize(DEVICE_ID_MAX, nullptr);
@@ -429,6 +431,7 @@ public:
     placement_hint_t hint;
     rewrite_gen_t gen;
     bool is_tracked;
+    write_policy_t write_policy = write_policy_t::WRITE_BACK;
 #ifdef UNIT_TESTS_BUILT
     std::optional<paddr_t> external_paddr = std::nullopt;
 #endif
@@ -449,7 +452,8 @@ public:
     assert(opt.gen == INIT_GENERATION || opt.hint == placement_hint_t::REWRITE);
 
     data_category_t category = get_extent_category(type);
-    opt.gen = adjust_generation(category, type, opt.hint, opt.gen, opt.is_tracked);
+    opt.gen = adjust_generation(
+      category, type, opt.hint, opt.gen, opt.write_policy, opt.is_tracked);
 
     paddr_t addr;
 #ifdef UNIT_TESTS_BUILT
@@ -489,7 +493,8 @@ public:
     assert(opt.gen == INIT_GENERATION || opt.hint == placement_hint_t::REWRITE);
 
     data_category_t category = get_extent_category(type);
-    opt.gen = adjust_generation(category, type, opt.hint, opt.gen, opt.is_tracked);
+    opt.gen = adjust_generation(
+      category, type, opt.hint, opt.gen, opt.write_policy, opt.is_tracked);
     assert(opt.gen != INLINE_GENERATION);
 
     // XXX: bp might be extended to point to different memory (e.g. PMem)
@@ -527,6 +532,13 @@ public:
     return allocs;
   }
 
+  /// Picks the write policy for a new extent of the given type and length.
+  ///
+  /// @param type   extent type being allocated
+  /// @param length extent length in bytes
+  /// @return write_policy_t::WRITE_THROUGH only when all of the following
+  ///         hold: a cold tier exists, length is at least
+  ///         write_through_size (read from seastore_write_through_size),
+  ///         and type is a data type; write_policy_t::WRITE_BACK otherwise.
+  write_policy_t get_write_policy(extent_types_t type, extent_len_t length) const {
+    // Keep the original short-circuit order: cold tier, then size, then type.
+    const bool use_write_through =
+      has_cold_tier() && length >= write_through_size && is_data_type(type);
+    return use_write_through
+      ? write_policy_t::WRITE_THROUGH
+      : write_policy_t::WRITE_BACK;
+  }
+
 #ifdef UNIT_TESTS_BUILT
   void prefill_fragmented_devices() {
     LOG_PREFIX(ExtentPlacementManager::prefill_fragmented_devices);
@@ -696,6 +708,7 @@ private:
       extent_types_t type,
       placement_hint_t hint,
       rewrite_gen_t gen,
+      write_policy_t policy,
       bool is_tracked) {
     assert(is_real_type(type));
     if (is_root_type(type)) {
@@ -725,10 +738,20 @@ private:
         }
       } else {
         assert(category == data_category_t::DATA);
-        gen = OOL_GENERATION;
+        if (background_process.has_cold_tier() &&
+            policy == write_policy_t::WRITE_THROUGH) {
+          gen = hot_tier_generations;
+        } else {
+          assert(policy != write_policy_t::WRITE_THROUGH);
+          gen = OOL_GENERATION;
+        }
       }
     } else if (background_process.has_cold_tier()) {
       gen = background_process.adjust_generation(gen);
+      if (gen <= hot_tier_generations &&
+          policy == write_policy_t::WRITE_THROUGH) {
+        gen = hot_tier_generations;
+      }
     }
 
     if (is_tracked && gen >= hot_tier_generations &&
@@ -1283,6 +1306,7 @@ private:
   // TODO: drop once paddr->journal_seq_t is introduced
   SegmentSeqAllocatorRef ool_segment_seq_allocator;
   extent_len_t max_data_allocation_size = 0;
+  std::size_t write_through_size;
 
   friend class ::transaction_manager_test_t;
   friend class Cache;
index df83d1e5825da042453f5762563500b9f46e31e5..65f9a39b12adaae8a64470b2e0e00e86d3c40b72 100644 (file)
@@ -1224,6 +1224,16 @@ std::ostream& operator<<(std::ostream& out, placement_hint_t h)
   }
 }
 
+/// Streams a human-readable name for a write_policy_t value.
+///
+/// @param out destination stream
+/// @param w   policy to print
+/// @return out, to allow chaining
+///
+/// The switch intentionally has no default label so that -Wswitch can flag
+/// an enumerator that is added later without a matching case.
+std::ostream& operator<<(std::ostream& out, write_policy_t w)
+{
+  switch (w) {
+  case write_policy_t::WRITE_BACK:
+    return out << "WRITE_BACK";
+  case write_policy_t::WRITE_THROUGH:
+    return out << "WRITE_THROUGH";
+  }
+  // Only reachable when w holds a value that is not a declared enumerator
+  // (e.g. a miscast or corrupted value). Flowing off the end of a non-void
+  // function is undefined behavior, so print the raw value instead.
+  return out << "INVALID_WRITE_POLICY(" << static_cast<int>(w) << ")";
+}
+
 bool can_delay_allocation(device_type_t type) {
   // Some types of device may not support delayed allocation, for example PMEM.
   // All types of device currently support delayed allocation.
index 8472775a1dd91b785e01a2ab45d450113d34ee4d..a000f2ee089d9ed0ff317aecd55bf1de4f464444 100644 (file)
@@ -2069,6 +2069,13 @@ constexpr bool is_real_type(extent_types_t type) {
 
 std::ostream &operator<<(std::ostream &out, extent_types_t t);
 
+/// Write policy attached to an extent at allocation time.
+enum class write_policy_t {
+  // Default policy: CachedExtent and the allocation options both start
+  // out with this value.
+  WRITE_BACK,
+  // Returned by ExtentPlacementManager::get_write_policy() for data-type
+  // extents of at least seastore_write_through_size bytes when a cold tier
+  // exists.
+  // NOTE(review): presumably steers the extent straight to the cold tier
+  // via adjust_generation() -- confirm.
+  WRITE_THROUGH
+};
+
+// Prints the enumerator name of w to out (defined in seastore_types.cc).
+std::ostream& operator<<(std::ostream& out, write_policy_t w);
+
 /**
  * rewrite_gen_t
  *
@@ -3684,6 +3691,7 @@ template <> struct fmt::formatter<crimson::os::seastore::transaction_type_t> : f
 template <> struct fmt::formatter<crimson::os::seastore::write_result_t> : fmt::ostream_formatter {};
 template <> struct fmt::formatter<crimson::os::seastore::omap_type_t> : fmt::ostream_formatter {};
 template <> struct fmt::formatter<ceph::buffer::list> : fmt::ostream_formatter {};
+template <> struct fmt::formatter<crimson::os::seastore::write_policy_t> : fmt::ostream_formatter {};
 #endif
 
 template <>
index 7919f09689a503bef17b8935be9ca10b40334a0b..7dc7735e15515555614dfcb4e973df8be95c0cfd 100644 (file)
@@ -591,7 +591,11 @@ public:
     SUBDEBUGT(seastore_tm, "{} hint {}~0x{:x} phint={} ...",
               t, T::TYPE, laddr_hint, len, placement_hint);
     auto exts = cache->alloc_new_data_extents<T>(
-      t, len, {placement_hint, INIT_GENERATION});
+      t, len,
+      {
+        placement_hint, INIT_GENERATION, false,
+        epm->get_write_policy(T::TYPE, len)
+      });
     // user must initialize the logical extent themselves
     assert(is_user_transaction(t.get_src()));
     for (auto& ext : exts) {
index b5541aaaf48729646e9b473a9ef3920d96f995b7..291c845a18edaa40982bd3d464b03652b7ce9f8d 100644 (file)
@@ -327,7 +327,9 @@ struct lba_btree_test : btree_test_base {
     check.emplace(addr, get_map_val(len, TestBlock::TYPE));
     lba_btree_update([=, this](auto &btree, auto &t) {
       auto extents = cache->alloc_new_data_extents<TestBlock>(
-         t, TestBlock::SIZE, {placement_hint_t::HOT, 0, false, get_paddr()});
+         t, TestBlock::SIZE,
+         {placement_hint_t::HOT, 0, false,
+          write_policy_t::WRITE_BACK, get_paddr()});
       return seastar::do_with(
        std::move(extents),
        [this, addr, &t, len, &btree](auto &extents) {
@@ -544,7 +546,9 @@ struct btree_lba_manager_test : btree_test_base {
       *t.t,
       [=, this](auto &t) {
        auto extents = cache->alloc_new_data_extents<TestBlock>(
-           t, TestBlock::SIZE, {placement_hint_t::HOT, 0, false, get_paddr()});
+           t, TestBlock::SIZE,
+           {placement_hint_t::HOT, 0, false,
+            write_policy_t::WRITE_BACK, get_paddr()});
        return seastar::do_with(
          std::vector<LogicalChildNodeRef>(
            extents.begin(), extents.end()),
index d07579abcceb814752f0a46330a68b0a744df10c..85d4ff2bee3cc46631ff1d0b43b82e51a1f44c8a 100644 (file)
@@ -1141,6 +1141,7 @@ struct transaction_manager_test_t :
               t,
               placement_hint_t::HOT,
               gen,
+             write_policy_t::WRITE_BACK,
              false);
             if (expected_generations[t][gen] != epm_gen) {
               logger().error("caller: {}, extent type: {}, input generation: {}, "