]> git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
crimson/os/seastore/async_cleaner: set different threshold for starting
authorXuehan Xu <xuxuehan@qianxin.com>
Tue, 27 May 2025 07:54:58 +0000 (15:54 +0800)
committerXuehan Xu <xuxuehan@qianxin.com>
Tue, 10 Jun 2025 01:12:22 +0000 (09:12 +0800)
and stopping trimming dirty

At present, the trim_dirty starts and stops at the same threshold
*target_dirty_bytes*, which leads to very frequent *trim_dirty*
operations that further invalidate transactions.

We've observed that long read transactions kept being invalidated by
*trim_dirty*, which led to starvations.

Fixes: https://tracker.ceph.com/issues/71443
Signed-off-by: Xuehan Xu <xuxuehan@qianxin.com>
src/crimson/os/seastore/async_cleaner.cc
src/crimson/os/seastore/async_cleaner.h

index 1ccdfea40c62e049582f8e6d5ff967c95df729f0..890adfdd22f66ffb22f0b713bcd5632aa5c5f803 100644 (file)
@@ -342,8 +342,12 @@ void JournalTrimmerImpl::config_t::validate() const
 {
   ceph_assert(max_journal_bytes <= DEVICE_OFF_MAX);
   ceph_assert(max_journal_bytes > target_journal_dirty_bytes);
+  ceph_assert(target_journal_dirty_bytes >= min_journal_bytes);
+  ceph_assert(target_journal_dirty_bytes > rewrite_dirty_bytes_per_cycle);
   ceph_assert(max_journal_bytes > target_journal_alloc_bytes);
-  ceph_assert(rewrite_dirty_bytes_per_cycle > 0);
+  ceph_assert(rewrite_dirty_bytes_per_trans > 0);
+  ceph_assert(rewrite_dirty_bytes_per_cycle >=
+    rewrite_dirty_bytes_per_trans);
   ceph_assert(rewrite_backref_bytes_per_cycle > 0);
 }
 
@@ -354,22 +358,27 @@ JournalTrimmerImpl::config_t::get_default(
   assert(roll_size);
   std::size_t target_dirty_bytes = 0;
   std::size_t target_alloc_bytes = 0;
+  std::size_t min_journal_bytes = 0;
   std::size_t max_journal_bytes = 0;
   if (type == backend_type_t::SEGMENTED) {
-    target_dirty_bytes = 12 * roll_size;
+    min_journal_bytes = 12 * roll_size;
     target_alloc_bytes = 2 * roll_size;
+    target_dirty_bytes = 14 * roll_size;
     max_journal_bytes = 16 * roll_size;
   } else {
     assert(type == backend_type_t::RANDOM_BLOCK);
-    target_dirty_bytes = roll_size / 4;
+    min_journal_bytes = roll_size / 4;
     target_alloc_bytes = roll_size / 4;
+    target_dirty_bytes = roll_size / 3;
     max_journal_bytes = roll_size / 2;
   }
   return config_t{
     target_dirty_bytes,
     target_alloc_bytes,
+    min_journal_bytes,
     max_journal_bytes,
-    1<<17,// rewrite_dirty_bytes_per_cycle
+    1<<26,// rewrite_dirty_bytes_per_cycle
+    1<<17,// rewrite_dirty_bytes_per_trans
     1<<24 // rewrite_backref_bytes_per_cycle
   };
 }
@@ -381,22 +390,29 @@ JournalTrimmerImpl::config_t::get_test(
   assert(roll_size);
   std::size_t target_dirty_bytes = 0;
   std::size_t target_alloc_bytes = 0;
+  std::size_t min_journal_bytes = 0;
   std::size_t max_journal_bytes = 0;
   if (type == backend_type_t::SEGMENTED) {
-    target_dirty_bytes = 2 * roll_size;
+    min_journal_bytes = 2 * roll_size;
     target_alloc_bytes = 2 * roll_size;
+    target_dirty_bytes = 3 * roll_size;
     max_journal_bytes = 4 * roll_size;
   } else {
     assert(type == backend_type_t::RANDOM_BLOCK);
-    target_dirty_bytes = roll_size / 36;
+    min_journal_bytes = roll_size / 36;
     target_alloc_bytes = roll_size / 4;
+    target_dirty_bytes = roll_size / 24;
     max_journal_bytes = roll_size / 2;
   }
   return config_t{
     target_dirty_bytes,
     target_alloc_bytes,
+    min_journal_bytes,
     max_journal_bytes,
-    1<<17,// rewrite_dirty_bytes_per_cycle
+    (target_dirty_bytes > 1<<26)
+      ? 1<<25
+      : target_dirty_bytes / 2,// rewrite_dirty_bytes_per_cycle
+    1<<17,// rewrite_dirty_bytes_per_trans
     1<<24 // rewrite_backref_bytes_per_cycle
   };
 }
@@ -514,6 +530,31 @@ journal_seq_t JournalTrimmerImpl::get_tail_limit() const
   return ret;
 }
 
+journal_seq_t JournalTrimmerImpl::get_dirty_tail_min_target() const
+{
+  assert(background_callback->is_ready());
+  auto ret = journal_head.add_offset(
+      backend_type,
+      -static_cast<device_off_t>(config.min_journal_bytes),
+      roll_start,
+      roll_size);
+  return ret;
+}
+
+journal_seq_t JournalTrimmerImpl::get_dirty_tail_target_per_cycle() const
+{
+  assert(background_callback->is_ready());
+  auto journal_dirty_bytes = get_journal_dirty_bytes();
+  assert(journal_dirty_bytes >= get_dirty_bytes_to_trim());
+  auto ret = journal_head.add_offset(
+      backend_type,
+      -static_cast<device_off_t>(
+       journal_dirty_bytes - get_dirty_bytes_to_trim()),
+      roll_start,
+      roll_size);
+  return ret;
+}
+
 journal_seq_t JournalTrimmerImpl::get_dirty_tail_target() const
 {
   assert(background_callback->is_ready());
@@ -579,7 +620,7 @@ seastar::future<> JournalTrimmerImpl::trim() {
       }
     },
     [this] {
-      if (should_trim_dirty()) {
+      if (should_start_trim_dirty()) {
         return trim_dirty(
         ).handle_error(
           crimson::ct_error::assert_all{
@@ -646,46 +687,55 @@ JournalTrimmerImpl::trim_dirty()
 
   auto& shard_stats = extent_callback->get_shard_stats();
   ++(shard_stats.trim_dirty_num);
-  ++(shard_stats.pending_bg_num);
 
-  return repeat_eagain([this, FNAME, &shard_stats] {
-    ++(shard_stats.repeat_trim_dirty_num);
+  auto target = get_dirty_tail_target_per_cycle();
+  return ::crimson::repeat([this, FNAME, &shard_stats, target] {
+    if (should_stop_trim_dirty(target)) {
+      return trim_ertr::make_ready_future<
+       seastar::stop_iteration>(seastar::stop_iteration::yes);
+    }
+    ++(shard_stats.pending_bg_num);
+    return repeat_eagain([this, FNAME, &shard_stats, target] {
+      ++(shard_stats.repeat_trim_dirty_num);
 
-    return extent_callback->with_transaction_intr(
-      Transaction::src_t::TRIM_DIRTY,
-      "trim_dirty",
-      CACHE_HINT_NOCACHE,
-      [this, FNAME](auto &t)
-    {
-      auto target = get_dirty_tail_target();
-      DEBUGT("start, dirty_tail={}, target={}",
-             t, journal_dirty_tail, target);
-      return extent_callback->get_next_dirty_extents(
-        t,
-        target,
-        config.rewrite_dirty_bytes_per_cycle
-      ).si_then([this, FNAME, &t](auto dirty_list) {
-        DEBUGT("rewrite {} dirty extents", t, dirty_list.size());
-        return seastar::do_with(
-          std::move(dirty_list),
-          [this, &t](auto &dirty_list)
-        {
-          return trans_intr::do_for_each(
-            dirty_list,
-            [this, &t](auto &e) {
-            return extent_callback->rewrite_extent(
-                t, e, INIT_GENERATION, NULL_TIME);
-          });
-        });
-      }).si_then([this, &t] {
-        return extent_callback->submit_transaction_direct(t);
+      return extent_callback->with_transaction_intr(
+       Transaction::src_t::TRIM_DIRTY,
+       "trim_dirty",
+       CACHE_HINT_NOCACHE,
+       [this, FNAME, target](auto &t)
+      {
+       DEBUGT("start, dirty_tail={}, target={}",
+              t, journal_dirty_tail, target);
+       return extent_callback->get_next_dirty_extents(
+         t,
+         target,
+         config.rewrite_dirty_bytes_per_trans
+       ).si_then([this, FNAME, &t](auto dirty_list) {
+         DEBUGT("rewrite {} dirty extents", t, dirty_list.size());
+         return seastar::do_with(
+           std::move(dirty_list),
+           [this, &t](auto &dirty_list)
+         {
+           return trans_intr::do_for_each(
+             dirty_list,
+             [this, &t](auto &e) {
+             return extent_callback->rewrite_extent(
+                 t, e, INIT_GENERATION, NULL_TIME);
+           });
+         });
+       }).si_then([this, &t] {
+         return extent_callback->submit_transaction_direct(t);
+       });
       });
+    }).safe_then([] {
+      return seastar::stop_iteration::no;
+    }).finally([this, FNAME, &shard_stats] {
+      DEBUG("finish, dirty_tail={}", journal_dirty_tail);
+
+      assert(shard_stats.pending_bg_num);
+      --(shard_stats.pending_bg_num);
+      return seastar::stop_iteration::no;
     });
-  }).finally([this, FNAME, &shard_stats] {
-    DEBUG("finish, dirty_tail={}", journal_dirty_tail);
-
-    assert(shard_stats.pending_bg_num);
-    --(shard_stats.pending_bg_num);
   });
 }
 
@@ -708,7 +758,7 @@ std::ostream &operator<<(
   os << "JournalTrimmer(";
   if (stats.trimmer.background_callback->is_ready()) {
     os << "should_block_io_on_trim=" << stats.trimmer.should_block_io_on_trim()
-       << ", should_(trim_dirty=" << stats.trimmer.should_trim_dirty()
+       << ", should_(trim_dirty=" << stats.trimmer.should_start_trim_dirty()
        << ", trim_alloc=" << stats.trimmer.should_trim_alloc() << ")";
   } else {
     os << "not-ready";
@@ -719,7 +769,8 @@ std::ostream &operator<<(
        << ", dirty_tail=" << stats.trimmer.get_dirty_tail();
     if (stats.trimmer.background_callback->is_ready()) {
       os << ", alloc_tail_target=" << stats.trimmer.get_alloc_tail_target()
-         << ", dirty_tail_target=" << stats.trimmer.get_dirty_tail_target()
+         << ", dirty_tail_min_target=" << stats.trimmer.get_dirty_tail_min_target()
+        << ", dirty_tail_target=" << stats.trimmer.get_dirty_tail_target()
          << ", tail_limit=" << stats.trimmer.get_tail_limit();
     }
   }
index 4f7bf8d7f8e5150afe6f3fd43b50733368f1e522..6c4cfe4b2d956d2266a71be30d947a36cd980448 100644 (file)
@@ -502,15 +502,21 @@ using JournalTrimmerImplRef = std::unique_ptr<JournalTrimmerImpl>;
 class JournalTrimmerImpl : public JournalTrimmer {
 public:
   struct config_t {
-    /// Number of minimum bytes to stop trimming dirty.
+    /// Number of minimum bytes to start trimming dirty,
+    // this must be larger than or equal to min_journal_bytes,
+    // otherwise trim_dirty may never happen.
     std::size_t target_journal_dirty_bytes = 0;
     /// Number of minimum bytes to stop trimming allocation
     /// (having the corresponding backrefs unmerged)
     std::size_t target_journal_alloc_bytes = 0;
+    /// Number of minimum dirty bytes of the journal.
+    std::size_t min_journal_bytes = 0;
     /// Number of maximum bytes to block user transactions.
     std::size_t max_journal_bytes = 0;
     /// Number of bytes to rewrite dirty per cycle
     std::size_t rewrite_dirty_bytes_per_cycle = 0;
+    /// Number of bytes to rewrite dirty per transaction
+    std::size_t rewrite_dirty_bytes_per_trans = 0;
     /// Number of bytes to rewrite backref per cycle
     std::size_t rewrite_backref_bytes_per_cycle = 0;
 
@@ -561,7 +567,7 @@ public:
 
   std::size_t get_trim_size_per_cycle() const final {
     return config.rewrite_backref_bytes_per_cycle +
-      config.rewrite_dirty_bytes_per_cycle;
+      get_dirty_bytes_to_trim();
   }
 
   backend_type_t get_backend_type() const {
@@ -584,7 +590,7 @@ public:
   }
 
   bool should_trim() const {
-    return should_trim_alloc() || should_trim_dirty();
+    return should_trim_alloc() || should_start_trim_dirty();
   }
 
   bool should_block_io_on_trim() const {
@@ -627,10 +633,14 @@ public:
   friend std::ostream &operator<<(std::ostream &, const stat_printer_t &);
 
 private:
-  bool should_trim_dirty() const {
+  bool should_start_trim_dirty() const {
     return get_dirty_tail_target() > journal_dirty_tail;
   }
 
+  bool should_stop_trim_dirty(const journal_seq_t &target) const {
+    return target <= journal_dirty_tail;
+  }
+
   bool should_trim_alloc() const {
     return get_alloc_tail_target() > journal_alloc_tail;
   }
@@ -642,10 +652,30 @@ private:
   trim_ertr::future<> trim_alloc();
 
   journal_seq_t get_tail_limit() const;
+  journal_seq_t get_dirty_tail_min_target() const;
   journal_seq_t get_dirty_tail_target() const;
+  journal_seq_t get_dirty_tail_target_per_cycle() const;
   journal_seq_t get_alloc_tail_target() const;
   std::size_t get_dirty_journal_size() const;
   std::size_t get_alloc_journal_size() const;
+  std::size_t get_journal_dirty_bytes() const {
+    return journal_head.relative_to(
+      backend_type,
+      journal_dirty_tail,
+      roll_start,
+      roll_size);
+  }
+  std::size_t get_max_dirty_bytes_to_trim() const {
+    auto journal_dirty_bytes = get_journal_dirty_bytes();
+    if (journal_dirty_bytes <= config.min_journal_bytes) {
+      return 0;
+    }
+    return journal_dirty_bytes - config.min_journal_bytes;
+  }
+  std::size_t get_dirty_bytes_to_trim() const {
+    return std::min(get_max_dirty_bytes_to_trim(),
+                   config.rewrite_dirty_bytes_per_cycle);
+  }
   void register_metrics();
 
   ExtentCallbackInterface *extent_callback = nullptr;