git-server-git.apps.pok.os.sepia.ceph.com Git - ceph-ci.git/commitdiff
crimson/os/seastore: demote/promote background processes are also
author Xuehan Xu <xuxuehan@qianxin.com>
Mon, 18 May 2026 09:55:35 +0000 (17:55 +0800)
committer Xuehan Xu <xuxuehan@qianxin.com>
Sun, 24 May 2026 09:40:13 +0000 (17:40 +0800)
rewrite transactions

Signed-off-by: Xuehan Xu <xuxuehan@qianxin.com>
src/crimson/os/seastore/cache.cc
src/crimson/os/seastore/cache.h
src/crimson/os/seastore/cached_extent.cc
src/crimson/os/seastore/cached_extent.h
src/crimson/os/seastore/lba/btree_lba_manager.cc
src/crimson/os/seastore/lba/lba_btree_node.h
src/crimson/os/seastore/logical_child_node.h
src/crimson/os/seastore/seastore_types.h
src/crimson/os/seastore/transaction.h
src/crimson/os/seastore/transaction_manager.cc
src/crimson/os/seastore/transaction_manager.h

index 0399763193355733f7cae6206eadaa2b983e4c7c..e82cf96080f24cb4ccd0ce4d0ef3afdf78c94d27 100644 (file)
@@ -1804,7 +1804,7 @@ record_t Cache::prepare_record(
 
     if (i->is_exist_clean()) {
       assert(i->version == 0);
-      assert(!i->prior_instance);
+      assert(!i->prior_instance || t.get_src() == transaction_type_t::DEMOTE);
       // no set_io_wait(), skip complete_commit()
       assert(!i->is_pending_io());
       i->pending_for_transaction = TRANS_ID_NULL;
@@ -1815,13 +1815,26 @@ record_t Cache::prepare_record(
                      should_use_no_conflict_publish(t.get_src(), i->get_type()));
     }
 
-    // exist mutation pending extents must be in t.mutated_block_list
-    add_extent(i);
-    const auto t_src = t.get_src();
-    if (i->is_stable_dirty()) {
-      add_to_dirty(i, &t_src);
+    assert(i->is_logical());
+    if (t.get_src() == transaction_type_t::DEMOTE) {
+      assert(!i->committer);
+      assert(!i->get_prior_instance()->committer);
+      i->new_committer(t);
+      assert(i->committer);
+      i->get_prior_instance()->committer = i->committer;
+      auto &committer = *i->committer;
+      committer.block_trans(t);
+      i->get_prior_instance()->set_io_wait(
+        CachedExtent::extent_state_t::CLEAN, true);
     } else {
-      touch_extent_fully(*i, &t_src, t.get_cache_hint());
+      // exist mutation pending extents must be in t.mutated_block_list
+      add_extent(i);
+      const auto t_src = t.get_src();
+      if (i->is_stable_dirty()) {
+        add_to_dirty(i, &t_src);
+      } else {
+        touch_extent_fully(*i, &t_src, t.get_cache_hint());
+      }
     }
 
     alloc_delta.alloc_blk_ranges.emplace_back(
@@ -2096,6 +2109,10 @@ void Cache::complete_commit(
       if (is_lba_backref_node(i->get_type())) {
         committer.commit_data();
       }
+      if (i->is_logical() &&
+          t.get_src() == transaction_type_t::PROMOTE) {
+        committer.commit_shadow_promote(t);
+      }
       touch_extent_fully(prior, &t_src, t.get_cache_hint());
       committer.sync_version();
       committer.unblock_trans(t);
@@ -2205,6 +2222,32 @@ void Cache::complete_commit(
       continue;
     }
     epm.mark_space_used(i->get_paddr(), i->get_length());
+    assert(i->is_logical());
+    auto t_src = t.get_src();
+    if (t.get_src() == transaction_type_t::DEMOTE) {
+      assert(i->committer);
+      auto &committer = *i->committer;
+      auto &prior = static_cast<LogicalChildNode&>(
+        *i->get_prior_instance());
+      ceph_assert(prior.is_valid());
+      TRACET("committing rewritten extent into "
+             "existing -- {}, prior={}",
+             t, *i, prior);
+      prior.pending_for_transaction = TRANS_ID_NULL;
+      if (auto shadow = prior.get_shadow(); shadow) {
+        committer.commit_shadow_demote(t);
+        prior.reset_shadow();
+      }
+      committer.commit_state();
+      committer.sync_checksum();
+      committer.commit_and_share_paddr();
+      touch_extent_fully(prior, &t_src, t.get_cache_hint());
+      committer.sync_version();
+      committer.unblock_trans(t);
+      prior.complete_io();
+      i->committer.reset();
+      prior.committer.reset();
+    }
   }
   for (auto &i: t.pre_alloc_list) {
     if (!i->is_valid()) {
index 0b3d216ffb6308de67e7ff716d559989c9b9ae31..93651568f0b05a33bad643611acdae4d45ee7fb8 100644 (file)
@@ -1682,6 +1682,9 @@ public:
       read_extent_futs, [](auto &fut) { return std::move(fut); });
   }
 
+  /// Report whether `paddr` lives on a cold device, as answered by `epm`
+  /// for the device id carried in the address.
+  bool is_on_cold_tier(paddr_t paddr) const {
+    const auto device = paddr.get_device_id();
+    return epm.is_cold_device(device);
+  }
 private:
   void touch_extent_fully(
       CachedExtent &ext,
index 7ee7c8980f63b293ca05604685935d23fec5139a..e15a5618ac5b2a1b4675b39234ed400c15a3732d 100644 (file)
@@ -542,4 +542,43 @@ void ExtentCommitter::unblock_trans(Transaction &t) {
   }
 }
 
+void ExtentCommitter::commit_shadow_demote(Transaction &t) {
+  LOG_PREFIX(ExtentCommitter::commit_shadow_demote);
+  assert(t.get_src() == transaction_type_t::DEMOTE);
+  auto &prior = *extent.prior_instance->template cast<LogicalChildNode>();
+  auto shadow = prior.get_shadow();
+  assert(shadow);
+  for (auto &trans_view : prior.retired_transactions) {
+    assert(trans_view.t != nullptr);
+    auto view_tid = trans_view.t->get_trans_id();
+    if (view_tid == t.get_trans_id()) {
+      continue;
+    }
+    TRACET("removing shadow {} from retired_set of t.{}", t, *shadow, view_tid);
+    [[maybe_unused]] bool removed =
+      trans_view.t->remove_from_retired_set(*shadow);
+    assert(removed);
+  }
+}
+
+void ExtentCommitter::commit_shadow_promote(Transaction &t) {
+  LOG_PREFIX(ExtentCommitter::commit_shadow_promote);
+  assert(t.get_src() == transaction_type_t::PROMOTE);
+  assert(extent.is_logical());
+  auto &lprior = static_cast<LogicalChildNode&>(*extent.prior_instance);
+  auto &lext = static_cast<LogicalChildNode&>(extent);
+  auto shadow = lext.get_shadow();
+  assert(shadow);
+  lprior.set_shadow(shadow);
+  for (auto &trans_view : lprior.retired_transactions) {
+    assert(trans_view.t != nullptr);
+    auto view_tid = trans_view.t->get_trans_id();
+    if (view_tid == t.get_trans_id()) {
+      continue;
+    }
+    TRACET("adding shadow {} from t.{}", t, *shadow, view_tid);
+    trans_view.t->add_absent_to_retired_set(shadow);
+  }
+}
+
 }
index 9371c984544b9ff779f6ae1b7846fab15838fcc5..89db7219b7ada0b49d2b6885216184a23c76f6cb 100644 (file)
@@ -300,6 +300,8 @@ public:
 
   void commit_and_share_paddr();
 
+  void commit_shadow_demote(Transaction&);
+  void commit_shadow_promote(Transaction&);
 private:
   // the rewritten extent
   CachedExtent &extent;
index fff5c5bf1646f8c151ee79954b4f6a96c593acc7..ee4f5e719d5c0743ec150cd9b6d2d893ea8c2d4e 100644 (file)
@@ -951,13 +951,22 @@ BtreeLBAManager::update_mappings(
          return this->_update_mapping(
            c.trans,
            *cursor,
-           [prev_addr, addr, len, checksum](
+           [prev_addr, addr, len, checksum, extent, c](
              const lba_map_val_t &in) {
              lba_map_val_t ret = in;
              ceph_assert(in.pladdr.is_paddr());
-             ceph_assert(in.pladdr.get_paddr() == prev_addr);
              ceph_assert(in.len == len);
-             ret.pladdr = addr;
+             if (likely(in.pladdr.get_paddr() == prev_addr)) {
+                ret.pladdr = addr;
+              } else {
+                // this can only happen when the extent is EXIST_CLEAN
+                // and is demoted onto the cold tier by a DEMOTE trans.
+                assert(in.shadow_paddr == P_ADDR_NULL);
+                assert(extent->is_exist_clean());
+                assert(extent->get_paddr() == in.pladdr.get_paddr());
+                assert(c.cache.is_on_cold_tier(extent->get_paddr()));
+                assert(!c.cache.is_on_cold_tier(prev_addr));
+              }
              ret.checksum = checksum;
              return ret;
            },
index 447a6caa9c0ea1501e640046ba4e9f7537556b0a..6c95e43419109a8ef8f6ecebeb3cf3ba1bb7f02e 100644 (file)
@@ -294,6 +294,7 @@ struct LBALeafNode
     iterator &iter)
   {
     LOG_PREFIX(LBALeafNode::merge_content_to);
+    SUBTRACET(seastore_lba, "merging with {}", t, pending_version);
     std::map<laddr_t, pladdr_t> modified;
     auto it = pending_version.begin();
     while (it != pending_version.end() && iter != this->end()) {
@@ -315,11 +316,17 @@ struct LBALeafNode
         ceph_abort();
       }
       if (is_valid_child_ptr(child) &&
-          (child->_is_mutable() || child->_is_pending_io())) {
-        // skip the ones that the pending version is also modifying
+          (// skip the ones that the pending version is also modifying
+           (child->_is_mutable() || child->_is_pending_io()) ||
+           // EXIST_CLEAN extents created by DEMOTE transactions also
+           // update their paddrs, so they should also be skipped.
+           (pending_version.t->get_src() == transaction_type_t::DEMOTE))) {
+        SUBTRACET(seastore_lba, "skipping {}~{}", t, it->get_key(), it->get_val());
         it++;
         continue;
       }
+      SUBTRACET(seastore_lba, "examing v2: {}~{}, v1: {}~{}",
+        t, it->get_key(), it->get_val(), iter->get_key(), iter->get_val());
       auto pending_key = it->get_key();
       auto stable_key = iter->get_key();
       auto stable_end = stable_key + v1.len;
@@ -338,6 +345,11 @@ struct LBALeafNode
           auto paddr = v1.pladdr.get_paddr();
           paddr = paddr + off;
           m_v2.pladdr = paddr;
+          if (v1.shadow_paddr == P_ADDR_NULL) {
+            m_v2.shadow_paddr = P_ADDR_NULL;
+          } else {
+            m_v2.shadow_paddr = (v1.shadow_paddr + off);
+          }
           SUBTRACET(seastore_lba, "merging to {}, paddr: {} -> {}",
             t, pending_version, m_v2.pladdr, paddr);
           if (!is_valid_child_ptr(child) ||
index ea4bb9c6a763be72b1357cd52ff0dbd2a8eda3f3..977b96aaf21d8ea7c4d6246f1c6556aac0c59029 100644 (file)
@@ -44,6 +44,20 @@ public:
   laddr_t get_end() const {
     return (get_laddr() + get_length()).checked_to_laddr();
   }
+
+  /// Returns a copy of the reference to this node's shadow extent; callers
+  /// test the result for emptiness before using it.
+  TCachedExtentRef<LogicalChildNode> get_shadow() const {
+    return this->shadow;
+  }
+
+  /// Attach `s` as this node's shadow extent. A shadow must not already be
+  /// set (asserted). The argument is only copied, so it is taken by
+  /// reference-to-const; this keeps every existing lvalue call site working
+  /// and additionally accepts temporaries and const references.
+  void set_shadow(const TCachedExtentRef<LogicalChildNode> &s) {
+    assert(!shadow);
+    shadow = s;
+  }
+  
+  /// Drop this node's reference to its shadow extent.
+  void reset_shadow() {
+    this->shadow.reset();
+  }
+
 protected:
   void on_replace_prior(Transaction &t) final {
     assert(is_seen_by_users());
@@ -56,6 +70,8 @@ protected:
   void on_data_commit() final {
     ceph_abort("impossible");
   }
+private:
+  TCachedExtentRef<LogicalChildNode> shadow;
 };
 using LogicalChildNodeRef = TCachedExtentRef<LogicalChildNode>;
 } // namespace crimson::os::seastore
index a000f2ee089d9ed0ff317aecd55bf1de4f464444..03dcd08c505ce9773e0bfebf46d19174a4a6b14b 100644 (file)
@@ -2756,7 +2756,9 @@ constexpr bool is_background_transaction(transaction_type_t type) {
 constexpr bool is_rewrite_transaction(transaction_type_t type) {
   return type == transaction_type_t::TRIM_DIRTY ||
     type == transaction_type_t::CLEANER_MAIN ||
-    type == transaction_type_t::CLEANER_COLD;
+    type == transaction_type_t::CLEANER_COLD ||
+    type == transaction_type_t::DEMOTE ||
+    type == transaction_type_t::PROMOTE;
 }
 
 constexpr bool is_trim_transaction(transaction_type_t type) {
index b11f505c79edac95324cdf53bcb89e9423b683f5..4688c1057c2f36c033b4fcccf2a0a3a3744446b0 100644 (file)
@@ -363,6 +363,21 @@ public:
     }
   }
 
+  bool remove_from_retired_set(CachedExtent &ext) {
+    auto it = retired_set.find(ext.get_paddr());
+    if (it == retired_set.end()) {
+      return false;
+    }
+    auto &extent = it->extent;
+    if (extent->get_paddr() != ext.get_paddr()) {
+      return false;
+    } else {
+      assert(ext.get_length() == extent->get_length());
+      retired_set.erase(it);
+      return true;
+    }
+  }
+
   std::pair<bool, bool> pre_stable_extent_paddr_mod(
     read_set_item_t<Transaction> &item)
   {
index f7b5aa8af44456cc78face79276289999caf94e4..54184acfbf68d0eaabf421255639522f3957fd77 100644 (file)
@@ -259,13 +259,14 @@ TransactionManager::ref_ret TransactionManager::remove(
         auto laddr = ref->get_laddr();
         cache->retire_absent_extent_addr_by_type(
           t, laddr, shadow_addr, length, ref->get_type(),
-          [laddr](auto &extent) {
+          [ref, laddr](auto &extent) {
             auto lextent = extent.template cast<LogicalChildNode>();
             assert(extent.is_logical());
             assert(!lextent->has_laddr());
             assert(!extent.has_been_invalidated());
             lextent->set_laddr(laddr);
             extent.set_shadow_extent(true);
+            ref->set_shadow(lextent);
           });
       }
     }
@@ -332,11 +333,32 @@ TransactionManager::_remove(
        LogicalChildNode
        >();
       ceph_assert(extent);
-      cache->retire_extent(t, std::move(extent));
+      cache->retire_extent(t, extent);
+      if (mapping.has_shadow_val()) {
+        if (auto shadow = extent->get_shadow(); shadow) {
+          t.add_absent_to_retired_set(shadow);
+        } else {
+          auto laddr = mapping.get_intermediate_base();
+          std::ignore = cache->retire_absent_extent_addr_by_type(
+            t, laddr,
+            mapping.get_shadow_val(),
+            mapping.get_intermediate_length(),
+            mapping.get_extent_type(),
+            [extent, laddr](auto &ext) {
+              auto lextent = ext.template cast<LogicalChildNode>();
+              assert(ext.is_logical());
+              assert(!lextent->has_laddr());
+              assert(!ext.has_been_invalidated());
+              lextent->set_laddr(laddr);
+              ext.set_shadow_extent(true);
+              extent->set_shadow(lextent);
+            });
+        }
+      }
     } else {
       auto &child_pos = maybe_mapped_extent.get_child_pos();
       auto laddr = mapping.get_intermediate_base();
-      std::ignore = cache->retire_absent_extent_addr_by_type(
+      auto ext = cache->retire_absent_extent_addr_by_type(
        t, laddr,
        mapping.get_val(),
        mapping.get_intermediate_length(),
@@ -349,13 +371,23 @@ TransactionManager::_remove(
           child_pos.link_child(lextent.get());
           lextent->set_laddr(laddr);
         }
-      );
-    }
-    if (mapping.has_shadow_val()) {
-      cache->retire_absent_extent_addr(
-        t, mapping.get_intermediate_base(),
-        mapping.get_shadow_val(),
-        mapping.get_intermediate_length());
+      )->template cast<LogicalChildNode>();
+      if (mapping.has_shadow_val()) {
+        std::ignore = cache->retire_absent_extent_addr_by_type(
+          t, mapping.get_intermediate_base(),
+          mapping.get_shadow_val(),
+          mapping.get_intermediate_length(),
+          mapping.get_extent_type(),
+          [laddr, ext](auto &extent) {
+            auto lextent = extent.template cast<LogicalChildNode>();
+            assert(extent.is_logical());
+            assert(!lextent->has_laddr());
+            assert(!extent.has_been_invalidated());
+            lextent->set_laddr(laddr);
+            extent.set_shadow_extent(true);
+            ext->set_shadow(lextent);
+          });
+      }
     }
   }
 
@@ -501,8 +533,9 @@ TransactionManager::relocate_shadow_extent(
   assert(mapping.has_shadow_val());
   assert(!mapping.is_zero_reserved());
   assert(mapping.is_viewable());
+  assert(t.get_src() == transaction_type_t::DEMOTE);
   auto v = mapping.get_logical_extent(t);
-  CachedExtentRef extent;
+  LogicalChildNodeRef extent;
   auto laddr = mapping.get_key();
   if (!v.has_child()) {
     auto &child_pos = v.get_child_pos();
@@ -520,26 +553,34 @@ TransactionManager::relocate_shadow_extent(
         child_pos.link_child(lextent.get());
         lextent->set_laddr(laddr);
       }
-    );
+    )->template cast<LogicalChildNode>();
   } else {
-    auto extent = co_await std::move(v.get_child_fut());
+    extent = co_await std::move(v.get_child_fut());
     cache->retire_extent(t, extent);
   }
-  auto shadow_paddr = mapping.get_shadow_val();
-  std::ignore = cache->retire_absent_extent_addr_by_type(
-    t, laddr, shadow_paddr, mapping.get_length(), mapping.get_extent_type(),
-    [laddr](auto &ext) {
-      auto lextent = ext.template cast<LogicalChildNode>();
-      assert(ext.is_logical());
-      assert(!lextent->has_laddr());
-      assert(!ext.has_been_invalidated());
-      lextent->set_laddr(laddr);
-    }
-  );
-  co_return cache->alloc_remapped_extent_by_type(
+  if (auto shadow = extent->get_shadow(); shadow) {
+    t.add_absent_to_retired_set(shadow);
+  } else {
+    auto shadow_paddr = mapping.get_shadow_val();
+    std::ignore = cache->retire_absent_extent_addr_by_type(
+      t, laddr, shadow_paddr, mapping.get_length(), mapping.get_extent_type(),
+      [laddr, extent](auto &ext) {
+        auto lextent = ext.template cast<LogicalChildNode>();
+        assert(ext.is_logical());
+        assert(!lextent->has_laddr());
+        assert(!ext.has_been_invalidated());
+        lextent->set_laddr(laddr);
+        ext.set_shadow_extent(true);
+        extent->set_shadow(lextent);
+      }
+    );
+  }
+  auto nextent =  cache->alloc_remapped_extent_by_type(
     t, mapping.get_extent_type(), laddr,
     mapping.get_shadow_val(), 0, mapping.get_length(), std::nullopt
   )->cast<LogicalChildNode>();
+  nextent->set_prior_instance(extent);
+  co_return nextent;
 }
 
 TransactionManager::submit_transaction_iertr::future<>
@@ -1194,6 +1235,8 @@ TransactionManager::promote_extent(
         slice_length,
         std::nullopt);
       remapped_cold_extent->set_shadow_extent(true);
+      auto lremapped = remapped_cold_extent->template cast<LogicalChildNode>();
+      lext->set_shadow(lremapped);
 
       offset += slice_length;
     }
@@ -1235,8 +1278,8 @@ TransactionManager::promote_extent(
       orig_ext->get_length(),
       std::nullopt);
     remapped_cold_extent->set_shadow_extent(true);
-
-    remapped_cold_extent->set_shadow_extent(true);
+    auto lremapped = remapped_cold_extent->template cast<LogicalChildNode>();
+    lext->set_shadow(lremapped);
   }
 
   auto cursor = co_await lba_manager->get_cursor(
index 64ac9c009dea922bfa9e9c79a143941c6ac98699..e549572bd1491172dca8325d5a89920236ad30de 100644 (file)
@@ -1543,7 +1543,7 @@ private:
           SUBTRACET(seastore_tm, "retire extent place holder...", t);
           auto &child_pos = ret.get_child_pos();
           auto laddr = pin.get_key();
-          std::ignore = cache->retire_absent_extent_addr_by_type(
+          auto ext = cache->retire_absent_extent_addr_by_type(
             t, laddr, original_paddr, original_len, pin.get_extent_type(),
             [&child_pos, laddr](auto &extent) mutable {
               auto lextent = extent.template cast<LogicalChildNode>();
@@ -1553,17 +1553,18 @@ private:
               child_pos.link_child(lextent.get());
               lextent->set_laddr(laddr);
             }
-          );
+          )->template cast<LogicalChildNode>();
           if (pin.has_shadow_val()) {
             cache->retire_absent_extent_addr_by_type(
               t, pin.get_key(), pin.get_shadow_val(),
               original_len, pin.get_extent_type(),
-              [laddr](auto &extent) {
+              [laddr, ext](auto &extent) {
                 auto lextent = extent.template cast<LogicalChildNode>();
                 assert(extent.is_logical());
                 assert(!lextent->has_laddr());
                 assert(!extent.has_been_invalidated());
                 lextent->set_laddr(laddr);
+                ext->set_shadow(lextent);
               }
             );
           }