git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
crimson/os/seastore: adaptive cleaner hard_limit from observed open-segment peak
author: Shai Fultheim <shai.fultheim@gmail.com>
Tue, 19 May 2026 10:55:02 +0000 (13:55 +0300)
committer: Shai Fultheim <shai.fultheim@gmail.com>
Fri, 29 May 2026 07:33:11 +0000 (10:33 +0300)
The cleaner's `available_ratio_hard_limit` controls when user IO blocks
(once projected_aratio < hard_limit). Setting it too high causes
unnecessary blocks during transient pressure; setting it too low risks
running out of free segments for the cleaner's own working set, which
aborts the OSD with "seastore device size setting is too small".

The current default of `0.10` was chosen empirically and does not scale
with cluster geometry. On a 32 GiB OSD with default 64 MiB segments,
`0.10` reserves ~3.2 GiB of always-empty space. The cleaner's actual
named-writer working set is 1 journal + `seastore_hot_tier_generations`
hot writers + `seastore_cold_tier_generations` cold writers + 1
metadata writer = (hot + cold + 2) segments. For the typical defaults
(5 hot, 3 cold) that is 10 segments = 640 MiB on a 32 GiB OSD = 2.0%.
Reserving 10% leaves ~80% of that "headroom" sitting unused, which
causes the cluster to operate at lower fill, accumulate fewer dead
bytes per segment, and pay 4-5x WAF on garbage collection cycles.

This commit makes hard_limit adaptive: track the peak open-segment
count observed during each 30 s window, then derive

  hard_limit = (max(observed_peak, named_writers) + 1) / segments_in_cluster

where the "+ 1" segment is the minimum safety unit (one more open
segment than ever observed). The `named_writers` count is the
architectural floor below which the cleaner cannot allocate; staying
above it prevents the abort. `observed_peak` floats to track the
actual transient overhead introduced by segment transitions in the
running workload.

Implementation
==============

`AsyncCleaner::maybe_adjust_thresholds()` is added as a virtual no-op
hook; `SegmentCleaner` overrides it. The hook is invoked once per
`BackgroundProcess::run()` iteration that takes the non-blocking
branch, for the main cleaner and (if present) the cold cleaner. Each
call samples the current open-segment count into the rolling window
peak. Every 30 s, the window's peak is consumed to recompute
hard_limit, and the window resets to the current open-segment count.

`config_t config` loses its `const` qualifier; the only mutation is
this hook, which is the single writer in the cleaner's shard.

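This commit only adapts `hard_limit`. `gc_max` remains at its existing
default (0.15) unless the computed hard_limit reaches it, in which case
`gc_max` is raised to `hard_limit + 0.001` to stay strictly above it. A
follow-up commit will add adaptive `gc_max` driven by observed
user-burst and cleaner-cycle peaks; that is where the remaining WAF
reduction lives.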

Bench measurements
==================

qa/standalone/crimson randwrite at 70% fill, 1 MiB writes, 32 GiB
per-OSD null_blk backing, 1280 GiB write target. Comparison against
the same workload with static `hard_limit = 0.10`:

  Metric                | static (0.15, 0.10) | adaptive hard_limit |
  ----------------------|---------------------|---------------------|
  user_written          |          1,374 GiB  |          1,374 GiB  |
  device_written        |          7,901 GiB  |          4,503 GiB  |
  WAF (d / u)           |              5.749  |              3.276  |
  completion            |              100 %  |              100 %  |
  bench duration        |             33 min  |             17 min  |
  fio exit              |             rc = 0  |             rc = 0  |
  observed peak open    |                  -  |       7 (each OSD)  |
  computed hard_limit   |                  -  |             0.0215  |

WAF drops 43 % and end-to-end throughput nearly doubles. The mechanism
is that fewer projected_aratio dips cross the (much lower) block
threshold, so the cluster spends less time in the block-recover-block
cycle that bloats device_written without progressing user_written.

No new workload-tuned constants are introduced. The literal numbers in
the algorithm are the 30 s recompute interval (time scale of the
feedback loop, not workload-specific), the `+ 1 segment` safety unit
(the smallest possible buffer in units the cleaner can allocate), and
a 0.001 margin that only keeps `gc_max` strictly above `hard_limit`.

Signed-off-by: Shai Fultheim <shai.fultheim@gmail.com>
src/crimson/os/seastore/async_cleaner.cc
src/crimson/os/seastore/async_cleaner.h
src/crimson/os/seastore/extent_placement_manager.cc

index 7c36897aeefef4eb76db3f1f166ab55430163824..87f63dcbcf30a0260fda50b55c0c52b26cf3cd17 100644 (file)
@@ -1128,6 +1128,65 @@ segment_id_t SegmentCleaner::allocate_segment(
   return NULL_SEG_ID;
 }
 
+// Samples the open-segment count on every call and, at most once per 30s,
+// re-derives available_ratio_hard_limit (raising gc_max if it would collide).
+void SegmentCleaner::maybe_adjust_thresholds()
+{
+  using namespace std::chrono_literals;
+  constexpr auto RECOMPUTE_INTERVAL = 30s;
+  constexpr double GC_MAX_MARGIN = 0.001;  // min gc_max - hard_limit gap
+  LOG_PREFIX(SegmentCleaner::maybe_adjust_thresholds);
+
+  // Sample current open-segment count into the window peak each call.
+  peak_open_segments_window = std::max(
+      peak_open_segments_window, segments.get_num_open());
+
+  // A default time_point means "never adjusted": recompute immediately.
+  auto now = seastar::lowres_clock::now();
+  if (adaptive_last_time != seastar::lowres_clock::time_point{} &&
+      now - adaptive_last_time < RECOMPUTE_INTERVAL) {
+    return;
+  }
+  adaptive_last_time = now;
+
+  // Architectural floor: named writers (journal + hot/cold gens + metadata).
+  auto hot = crimson::common::get_conf<uint64_t>(
+      "seastore_hot_tier_generations");
+  auto cold = crimson::common::get_conf<uint64_t>(
+      "seastore_cold_tier_generations");
+  std::size_t named_writers = hot + cold + 2;
+  std::size_t seg_size = segments.get_segment_size();
+  std::size_t total_bytes = segments.get_total_bytes();
+  if (total_bytes == 0 || seg_size == 0) {
+    return;
+  }
+
+  // hard_limit = (max(peak, named) + 1) * seg / total. "+1" is the minimum
+  // safety unit: allow one more open segment than ever observed.
+  std::size_t observed_peak =
+      std::max<std::size_t>(peak_open_segments_window, named_writers);
+  double new_hard_limit = static_cast<double>(observed_peak + 1) *
+      static_cast<double>(seg_size) / static_cast<double>(total_bytes);
+  // Logged only: observed_peak >= named_writers, so new_hard_limit is
+  // already above this floor and needs no extra clamp against it.
+  double crash_floor = static_cast<double>(named_writers) *
+      static_cast<double>(seg_size) / static_cast<double>(total_bytes);
+  // Keep gc_max strictly greater than hard_limit.
+  if (config.available_ratio_gc_max <= new_hard_limit) {
+    config.available_ratio_gc_max = new_hard_limit + GC_MAX_MARGIN;
+  }
+  config.available_ratio_hard_limit = new_hard_limit;
+
+  INFO("[ADAPTIVE_GC] peak_open={} named={} hard_limit={:.4f} "
+       "gc_max={:.4f} crash_floor={:.4f}",
+       peak_open_segments_window, named_writers,
+       config.available_ratio_hard_limit,
+       config.available_ratio_gc_max, crash_floor);
+
+  // Reset window: record current open count as the new baseline.
+  peak_open_segments_window = segments.get_num_open();
+}
+
 void SegmentCleaner::close_segment(segment_id_t segment)
 {
   LOG_PREFIX(SegmentCleaner::close_segment);
@@ -1343,11 +1402,12 @@ SegmentCleaner::clean_space_ret SegmentCleaner::clean_space()
   }
   reclaim_state->advance(config.reclaim_bytes_per_cycle);
 
-  DEBUG("reclaiming {} {}~{}",
+  double pavail_ratio = get_projected_available_ratio();
+  DEBUG("reclaiming {} {}~{}, projected_avail_ratio={}",
         rewrite_gen_printer_t{reclaim_state->generation},
         reclaim_state->start_pos,
-        reclaim_state->end_pos);
-  double pavail_ratio = get_projected_available_ratio();
+        reclaim_state->end_pos,
+        pavail_ratio);
   sea_time_point start = seastar::lowres_system_clock::now();
 
   // Backref-tree doesn't support tree-read during tree-updates with parallel
index 3f8ab98d19c08c1d96201648c5a4703bb039f348..e8134a10b84a5cb0808b70c2cf898d98c69edd36 100644 (file)
@@ -1245,6 +1245,9 @@ public:
 
   virtual std::size_t get_reclaim_size_per_cycle() const = 0;
 
+  // Threshold-tuning hook called from BackgroundProcess::run(); no-op here.
+  virtual void maybe_adjust_thresholds() {}
+
 #ifdef UNIT_TESTS_BUILT
   virtual void prefill_fragmented_devices() {}
 #endif
@@ -1458,6 +1461,8 @@ public:
            greedy_free >= ratio * picked_free;
   }
 
+  void maybe_adjust_thresholds() final;
+
   const std::set<device_id_t>& get_device_ids() const final {
     return sm_group->get_device_ids();
   }
@@ -1657,7 +1662,12 @@ private:
   store_index_t store_index;
   const bool detailed;
   const bool is_cold;
-  const config_t config;
+  // Mutated by maybe_adjust_thresholds(): hard_limit tracks observed open-segment peak.
+  config_t config;
+
+  // Adaptive state: peak open segments observed since last adjust.
+  std::size_t peak_open_segments_window = 0;
+  seastar::lowres_clock::time_point adaptive_last_time;
 
   SegmentManagerGroupRef sm_group;
   BackrefManager &backref_manager;
index 928b8d37966cf65c6e1628b5ddcb6bf8ad40b9d9..9e1f7173afbf363bde9290373e125b0561e91cf9 100644 (file)
@@ -829,6 +829,13 @@ ExtentPlacementManager::BackgroundProcess::run()
         pending_user_io_wake = false;
         co_await seastar::yield();
       }
+      // Adaptive threshold hook: each cleaner has its own state and floor.
+      if (main_cleaner) {
+        main_cleaner->maybe_adjust_thresholds();
+      }
+      if (cold_cleaner) {
+        cold_cleaner->maybe_adjust_thresholds();
+      }
     } else {
       log_state("run(block)");
       assert(!blocking_background);