]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
crimson/os/seastore/transaction: turn Transaction::read_set into an intrusive set
authorXuehan Xu <xuxuehan@qianxin.com>
Tue, 22 Apr 2025 13:47:58 +0000 (21:47 +0800)
committerXuehan Xu <xuxuehan@qianxin.com>
Wed, 28 May 2025 02:30:18 +0000 (10:30 +0800)
Fixes: https://tracker.ceph.com/issues/70976
Signed-off-by: Xuehan Xu <xuxuehan@qianxin.com>
src/crimson/os/seastore/cache.cc
src/crimson/os/seastore/cached_extent.h
src/crimson/os/seastore/transaction.h

index 472c1d57dc3d46280ff31a9e40c08b02e80a8771..4fd6ee40be72c1b4402f3702c027f5c057eb306b 100644 (file)
@@ -1231,6 +1231,7 @@ record_t Cache::prepare_record(
   auto trans_src = t.get_src();
   assert(!t.is_weak());
   assert(trans_src != Transaction::src_t::READ);
+  assert(t.read_set.size() + t.num_replace_placeholder == t.read_items.size());
 
   auto& efforts = get_by_src(stats.committed_efforts_by_src,
                              trans_src);
@@ -1247,7 +1248,7 @@ record_t Cache::prepare_record(
                i.ref->get_type()).increment(i.ref->get_length());
     read_stat.increment(i.ref->get_length());
   }
-  t.read_set.clear();
+  t.clear_read_set();
   t.write_set.clear();
 
   record_t record(record_type_t::JOURNAL, trans_src);
index 6f8b73bb2ad5bb2651a609c5a601fc13cb34e7b6..a65bf46bd181de88b6563ee3c3be8b9b3e4284d5 100644 (file)
@@ -67,11 +67,16 @@ class read_set_item_t {
   using set_hook_t = boost::intrusive::set_member_hook<
     boost::intrusive::link_mode<
       boost::intrusive::auto_unlink>>;
-  set_hook_t trans_hook;
-  using set_hook_options = boost::intrusive::member_hook<
+  set_hook_t trans_hook; // used to attach transactions to extents
+  set_hook_t extent_hook; // used to attach extents to transactions
+  using trans_hook_options = boost::intrusive::member_hook<
     read_set_item_t,
     set_hook_t,
     &read_set_item_t::trans_hook>;
+  using extent_hook_options = boost::intrusive::member_hook<
+    read_set_item_t,
+    set_hook_t,
+    &read_set_item_t::extent_hook>;
 
 public:
   struct extent_cmp_t {
@@ -101,9 +106,14 @@ public:
 
   using trans_set_t =  boost::intrusive::set<
     read_set_item_t,
-    set_hook_options,
+    trans_hook_options,
     boost::intrusive::constant_time_size<false>,
     boost::intrusive::compare<trans_cmp_t>>;
+  using extent_set_t =  boost::intrusive::set<
+    read_set_item_t,
+    extent_hook_options,
+    boost::intrusive::constant_time_size<false>,
+    boost::intrusive::compare<extent_cmp_t>>;
 
   T *t = nullptr;
   CachedExtentRef ref;
@@ -115,9 +125,7 @@ public:
 };
 
 template <typename T>
-using read_extent_set_t = std::set<
-  read_set_item_t<T>,
-  typename read_set_item_t<T>::extent_cmp_t>;
+using read_extent_set_t = typename read_set_item_t<T>::extent_set_t;
 
 template <typename T>
 using read_trans_set_t = typename read_set_item_t<T>::trans_set_t;
index c6e4545bfa83253876a54278354789637e5265fa..ddd42531a03854447a62bc73e10bb20807410279 100644 (file)
@@ -126,6 +126,7 @@ public:
     add_present_to_retired_set(ref);
   }
 
+  using extent_cmp_t = read_set_item_t<Transaction>::extent_cmp_t;
   void add_present_to_retired_set(CachedExtentRef ref) {
     assert(ref->get_paddr().is_real_location());
     assert(!is_weak());
@@ -147,7 +148,7 @@ public:
       write_set.erase(*ref);
       assert(ref->prior_instance);
       retired_set.emplace(ref->prior_instance, trans_id);
-      assert(read_set.count(ref->prior_instance->get_paddr()));
+      assert(read_set.count(ref->prior_instance->get_paddr(), extent_cmp_t{}));
       ref->prior_instance.reset();
     } else {
       // && retired_set.count(ref->get_paddr()) == 0
@@ -269,7 +270,7 @@ public:
     assert(ref->get_paddr().is_absolute() ||
            ref->get_paddr().is_root());
     assert(ref->is_exist_mutation_pending() ||
-          read_set.count(ref->prior_instance->get_paddr()));
+          read_set.count(ref->prior_instance->get_paddr(), extent_cmp_t{}));
     mutated_block_list.push_back(ref);
     if (!ref->is_exist_mutation_pending()) {
       write_set.insert(*ref);
@@ -288,12 +289,19 @@ public:
     assert(extent.get_paddr() == placeholder.get_paddr());
     assert(extent.get_paddr().is_absolute());
     {
-      auto where = read_set.find(placeholder.get_paddr());
+      auto where = read_set.find(placeholder.get_paddr(), extent_cmp_t{});
       assert(where != read_set.end());
       assert(where->ref.get() == &placeholder);
       where = read_set.erase(where);
-      auto it = read_set.emplace_hint(where, this, &extent);
+      placeholder.read_transactions.erase(
+       read_trans_set_t<Transaction>::s_iterator_to(*where));
+      // Note, the retired-placeholder is not removed from read_items after replace.
+      read_items.emplace_back(this, &extent);
+      auto it = read_set.insert_before(where, read_items.back());
       extent.read_transactions.insert(const_cast<read_set_item_t<Transaction>&>(*it));
+#ifndef NDEBUG
+      num_replace_placeholder++;
+#endif
     }
     {
       auto where = retired_set.find(&placeholder);
@@ -428,7 +436,7 @@ public:
     root.reset();
     offset = 0;
     delayed_temp_offset = 0;
-    read_set.clear();
+    read_items.clear();
     fresh_backref_extents = 0;
     invalidate_clear_write_set();
     mutated_block_list.clear();
@@ -587,7 +595,7 @@ private:
     } else if (retired_set.count(addr)) {
       return {get_extent_ret::RETIRED, nullptr};
     } else if (
-      auto iter = read_set.find(addr);
+      auto iter = read_set.find(addr, extent_cmp_t{});
       iter != read_set.end()) {
       auto ret = iter->ref;
       SUBTRACET(seastore_cache, "{} is present in read_set -- {}",
@@ -617,7 +625,8 @@ private:
       return false;
     }
 
-    auto [iter, inserted] = read_set.emplace(this, ref);
+    read_items.emplace_back(this, ref);
+    auto [iter, inserted] = read_set.insert(read_items.back());
     ceph_assert(inserted);
     ref->read_transactions.insert_before(
       it, const_cast<read_set_item_t<Transaction>&>(*iter));
@@ -652,6 +661,10 @@ private:
    * invalidate *this.
    */
   read_extent_set_t<Transaction> read_set; ///< set of extents read by paddr
+  std::list<read_set_item_t<Transaction>> read_items;
+#ifndef NDEBUG
+  size_t num_replace_placeholder = 0;
+#endif
 
   uint64_t fresh_backref_extents = 0; // counter of new backref extents