]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
rgw/lc: add per-bucket lifecycle performance monitoring
authorMatthew N. Heler <matthew.heler@hotmail.com>
Sun, 5 Oct 2025 15:43:26 +0000 (10:43 -0500)
committerMatthew N. Heler <matthew.heler@hotmail.com>
Mon, 27 Apr 2026 11:36:15 +0000 (06:36 -0500)
Implements real-time per-bucket lifecycle (LC) monitoring via performance
counters exposed through the admin socket.

Signed-off-by: Matthew N. Heler <matthew.heler@hotmail.com>
doc/radosgw/metrics.rst
qa/suites/rgw/lifecycle/overrides.yaml
qa/suites/rgw/verify/overrides.yaml
src/common/options/rgw.yaml.in
src/rgw/rgw_lc.cc
src/rgw/rgw_lc.h
src/rgw/rgw_perf_counters.cc
src/rgw/rgw_perf_counters.h

index 4b8c2dad4edd17c0a9335fdc32593d36f03147b1..b78ce681c91bf6c40814f1fcd39a13b125ee6878 100644 (file)
@@ -191,6 +191,345 @@ Cache sizing can depend on a number of factors. These factors include:
 
 To help calculate the Ceph Object Gateway's memory usage of a cache, it should be noted that each cache entry, encompassing all of the op metrics, is 1360 bytes. This is an estimate and subject to change if metrics are added or removed from the op metrics list.
 
+Lifecycle Metrics
+=================
+
+The following metrics related to lifecycle (LC) processing are tracked per bucket by the Ceph Object Gateway.
+
+.. list-table:: Ceph Object Gateway Lifecycle Metrics
+   :widths: 25 25 75
+   :header-rows: 1
+
+   * - Name
+     - Type
+     - Description
+   * - ``start_time``
+     - Gauge
+     - LC processing start timestamp (Unix epoch seconds)
+   * - ``end_time``
+     - Gauge
+     - LC processing end timestamp (Unix epoch seconds)
+   * - ``objects_scanned``
+     - Counter
+     - Total objects scanned for lifecycle rules (cumulative across all runs since RGW start)
+   * - ``objects_pending``
+     - Gauge
+     - Objects currently pending lifecycle processing
+   * - ``objects_expired``
+     - Counter
+     - Current-version objects expired by lifecycle in current run
+   * - ``objects_noncurrent_expired``
+     - Counter
+     - Noncurrent-version objects expired by lifecycle in current run
+   * - ``objects_dm_expired``
+     - Counter
+     - Delete markers expired by lifecycle in current run
+   * - ``objects_transitioned``
+     - Counter
+     - Objects transitioned to another storage class by lifecycle in current run
+   * - ``objects_mpu_aborted``
+     - Counter
+     - Multipart uploads aborted by lifecycle in current run
+
+Lifecycle metrics are labeled per-bucket in the ``rgw_lc_per_bucket`` section.
+
+The action metrics (``objects_expired``, ``objects_noncurrent_expired``,
+``objects_dm_expired``, ``objects_transitioned``, ``objects_mpu_aborted``) and
+``objects_scanned`` are monotonic counters that accumulate across every LC run
+for the lifetime of the Ceph Object Gateway process.
+
+To compute per-run deltas in Prometheus, use ``increase()`` over a window that
+covers a single LC run (for example, ``increase(rgw_lc_per_bucket_objects_scanned[24h])``
+on the default daily LC schedule). To detect a new LC run, look for a change
+in ``start_time``.
+
+Information about lifecycle metrics can be seen in the ``rgw_lc_per_bucket``
+section from the output of the ``counter schema`` command.
+
+To retrieve lifecycle metrics from a ``radosgw`` daemon's admin socket, see the
+``rgw_lc_per_bucket`` section in the output of the ``counter dump`` command::
+
+    "rgw_lc_per_bucket": [
+        {
+            "labels": {
+                "bucket": "mybucket",
+                "tenant": ""
+            },
+            "counters": {
+                "start_time": 1728139406,
+                "end_time": 0,
+                "objects_scanned": 15000,
+                "objects_pending": 2300,
+                "objects_expired": 1200,
+                "objects_noncurrent_expired": 300,
+                "objects_dm_expired": 50,
+                "objects_transitioned": 800,
+                "objects_mpu_aborted": 10
+            }
+        },
+        ...
+    ]
+
+Lifecycle Counter Cache
+-----------------------
+
+To track lifecycle metrics per bucket, set :confval:`rgw_lc_counters_cache` to ``true``. The default value is ``false``.
+
+Lifecycle metrics are stored as labeled performance counters in memory. All counters are lost when the Ceph Object Gateway restarts or crashes.
+
+Lifecycle Counter Cache Size & Eviction
+----------------------------------------
+
+The :confval:`rgw_lc_counters_cache_size` option can be used to set number of entries in the cache.
+
+When the number of cached counters exceeds this value, the least recently used (LRU) counters are evicted.
+
+Lifecycle Counter Batching
+---------------------------
+
+To minimize performance impact, lifecycle counter updates are batched. The :confval:`rgw_lc_counters_batch_size` option controls how often counter updates are flushed to the cache.
+
+Lower values provide more frequent updates but with slightly higher overhead. Higher values reduce overhead but updates appear less frequently.
+
+Lifecycle Metric Usage Examples
+-------------------------------
+
+The following examples show how to use each per-bucket lifecycle metric.
+PromQL examples assume that the ``ceph-exporter`` is being scraped by
+Prometheus. Admin-socket examples use ``ceph daemon`` directly against a
+specific ``radosgw`` daemon instance.
+
+In a multi-RGW deployment only one RGW processes a given bucket per LC cycle. To find which daemon ran (or is running) LC for a bucket, take the series with the highest ``start_time`` for that bucket label.
+
+``start_time`` (gauge, Unix epoch seconds)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The timestamp when the most recent LC run started for the bucket. ``0`` means LC has not run for this bucket since the RGW started.
+
+* Find buckets whose LC run started in the last hour:
+
+  .. code-block:: promql
+
+      time() - rgw_lc_per_bucket_start_time < 3600 and rgw_lc_per_bucket_start_time > 0
+
+* Identify which RGW most recently ran LC for ``mybucket``:
+
+  .. code-block:: promql
+
+      topk(1, rgw_lc_per_bucket_start_time{bucket="mybucket"})
+
+* Detect missed LC runs (no run in over 36 hours on a daily LC schedule):
+
+  .. code-block:: promql
+
+      time() - max by (bucket, tenant) (rgw_lc_per_bucket_start_time) > 36 * 3600
+
+``end_time`` (gauge, Unix epoch seconds)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The timestamp when the most recent LC run finished for the bucket. ``0`` while ``start_time > 0`` means LC is currently in progress on that RGW.
+
+* Buckets currently being processed:
+
+  .. code-block:: promql
+
+      rgw_lc_per_bucket_start_time > 0 and rgw_lc_per_bucket_end_time == 0
+
+* Per-run LC duration (seconds):
+
+  .. code-block:: promql
+
+      rgw_lc_per_bucket_end_time - rgw_lc_per_bucket_start_time
+        and rgw_lc_per_bucket_end_time > 0
+
+* Buckets whose last LC run took longer than 30 minutes:
+
+  .. code-block:: promql
+
+      (rgw_lc_per_bucket_end_time - rgw_lc_per_bucket_start_time > 1800)
+        and rgw_lc_per_bucket_end_time > 0
+
+``objects_scanned`` (counter)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Cumulative count of objects examined by LC for this bucket since the RGW process started. Use ``increase()`` to compute the per-run scan total.
+
+* Objects scanned per bucket in the last 24 hours:
+
+  .. code-block:: promql
+
+      increase(rgw_lc_per_bucket_objects_scanned[24h])
+
+* Top 10 buckets by LC scan rate over the last hour:
+
+  .. code-block:: promql
+
+      topk(10, rate(rgw_lc_per_bucket_objects_scanned[1h]))
+
+* Live scan progress for an in-flight run via admin socket:
+
+  .. code-block:: bash
+
+      ceph daemon radosgw.<id>.asok counter dump \
+        | jq '.rgw_lc_per_bucket[]
+              | select(.labels.bucket == "mybucket")
+              | .counters.objects_scanned'
+
+``objects_pending`` (gauge)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Number of objects currently queued for LC action evaluation. Returns to 0 once the run finishes.
+
+* Buckets with significant in-flight LC work:
+
+  .. code-block:: promql
+
+      rgw_lc_per_bucket_objects_pending > 1000
+
+* Aggregate pending work across the cluster:
+
+  .. code-block:: promql
+
+      sum(rgw_lc_per_bucket_objects_pending)
+
+* Detect a stalled run (pending > 0 but no scan progress in the last 5 minutes):
+
+  .. code-block:: promql
+
+      rgw_lc_per_bucket_objects_pending > 0
+        and rate(rgw_lc_per_bucket_objects_scanned[5m]) == 0
+
+``objects_expired`` (counter)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Cumulative current-version objects expired by LC for this bucket since RGW start.
+
+* Objects expired per bucket in the last 24 hours:
+
+  .. code-block:: promql
+
+      increase(rgw_lc_per_bucket_objects_expired[24h])
+
+* Cluster-wide expiration rate (objects/sec):
+
+  .. code-block:: promql
+
+      sum(rate(rgw_lc_per_bucket_objects_expired[5m]))
+
+* Buckets with no expirations in the last 7 days (possible rule misconfiguration on a bucket where you expect deletes):
+
+  .. code-block:: promql
+
+      increase(rgw_lc_per_bucket_objects_expired[7d]) == 0
+        and rgw_lc_per_bucket_start_time > 0
+
+``objects_noncurrent_expired`` (counter)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Cumulative noncurrent-version objects expired by LC. Useful only on versioned buckets.
+
+* Noncurrent-version objects removed per run:
+
+  .. code-block:: promql
+
+      increase(rgw_lc_per_bucket_objects_noncurrent_expired[24h])
+
+* Ratio of noncurrent vs current expirations (high ratio indicates a versioning-heavy workload):
+
+  .. code-block:: promql
+
+      sum by (bucket) (increase(rgw_lc_per_bucket_objects_noncurrent_expired[24h]))
+        /
+      (sum by (bucket) (increase(rgw_lc_per_bucket_objects_expired[24h])) > 0)
+
+``objects_dm_expired`` (counter)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Cumulative delete markers expired by LC. A non-zero value means LC is reclaiming dangling delete markers on a versioned bucket.
+
+* Delete markers cleaned per run:
+
+  .. code-block:: promql
+
+      increase(rgw_lc_per_bucket_objects_dm_expired[24h])
+
+* Buckets accumulating delete markers without cleanup (no DM expiration in 7 days but recent runs occurred):
+
+  .. code-block:: promql
+
+      increase(rgw_lc_per_bucket_objects_dm_expired[7d]) == 0
+        and increase(rgw_lc_per_bucket_objects_scanned[7d]) > 0
+
+``objects_transitioned`` (counter)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Cumulative objects transitioned to another storage class (including cloud tiers) by LC.
+
+* Objects transitioned per bucket in the last 24 hours:
+
+  .. code-block:: promql
+
+      increase(rgw_lc_per_bucket_objects_transitioned[24h])
+
+* Cluster-wide transition throughput (objects/sec):
+
+  .. code-block:: promql
+
+      sum(rate(rgw_lc_per_bucket_objects_transitioned[5m]))
+
+* Buckets with a configured transition rule but zero transitions in the last 24h (likely investigation target):
+
+  .. code-block:: promql
+
+      increase(rgw_lc_per_bucket_objects_transitioned[24h]) == 0
+        and increase(rgw_lc_per_bucket_objects_scanned[24h]) > 0
+
+``objects_mpu_aborted`` (counter)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Cumulative incomplete multipart uploads aborted by LC. Driven by ``AbortIncompleteMultipartUpload`` rules.
+
+* Aborted multipart uploads per bucket per run:
+
+  .. code-block:: promql
+
+      increase(rgw_lc_per_bucket_objects_mpu_aborted[24h])
+
+* Cluster-wide rate of MPU cleanup:
+
+  .. code-block:: promql
+
+      sum(rate(rgw_lc_per_bucket_objects_mpu_aborted[5m]))
+
+Combined Examples
+^^^^^^^^^^^^^^^^^
+
+* Total LC work done per bucket in the last 24 hours (scan + all action types):
+
+  .. code-block:: promql
+
+      sum by (bucket, tenant) (
+        increase(rgw_lc_per_bucket_objects_expired[24h])
+        + increase(rgw_lc_per_bucket_objects_noncurrent_expired[24h])
+        + increase(rgw_lc_per_bucket_objects_dm_expired[24h])
+        + increase(rgw_lc_per_bucket_objects_transitioned[24h])
+        + increase(rgw_lc_per_bucket_objects_mpu_aborted[24h])
+      )
+
+* Per-bucket LC hit rate (fraction of scanned objects acted on):
+
+  .. code-block:: promql
+
+      (
+        increase(rgw_lc_per_bucket_objects_expired[24h])
+        + increase(rgw_lc_per_bucket_objects_noncurrent_expired[24h])
+        + increase(rgw_lc_per_bucket_objects_dm_expired[24h])
+        + increase(rgw_lc_per_bucket_objects_transitioned[24h])
+        + increase(rgw_lc_per_bucket_objects_mpu_aborted[24h])
+      )
+      /
+      increase(rgw_lc_per_bucket_objects_scanned[24h])
+
 Sending Metrics to Prometheus
 =============================
 
index 335b6dcf83492608ae078e5831dd6ae4e00d565c..72ea52c3da69f00398c6b29a0e92bd4c98b164a7 100644 (file)
@@ -10,6 +10,7 @@ overrides:
         rgw crypt require ssl: false
         rgw sts key: abcdefghijklmnop
         rgw s3 auth use sts: true
+        rgw lc counters cache: true
         rgw lc debug interval: 10
   rgw:
     storage classes:
index ba1f6942d5a2b6a45365d04eecf51dfcdc850fae..899c084a0ed7d3f713989f498fd7b180ba5e22c3 100644 (file)
@@ -12,6 +12,7 @@ overrides:
         rgw torrent flag: true
         rgw user counters cache: true
         rgw bucket counters cache: true
+        rgw lc counters cache: true
         rgw sts key: abcdefghijklmnop
         rgw s3 auth use sts: true
         rgw reshard progress judge interval: 10
index 57000c5154b53618b4ddd61a8a75e90d3fa7e87c..08e56e87209eba7a808eb7293ce8704fddcef38d 100644 (file)
@@ -525,6 +525,38 @@ options:
   services:
   - rgw
   with_legacy: true
+- name: rgw_lc_counters_cache
+  type: bool
+  level: advanced
+  desc: Enable per-bucket lifecycle performance counters cache
+  long_desc: When enabled, RGW will create and update per-bucket lifecycle performance
+    counters and expose them via admin socket perf dump.
+  default: false
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_lc_counters_cache_size
+  type: uint
+  level: advanced
+  desc: Target size for lifecycle counters cache
+  long_desc: Maximum number of per-bucket LC counter entries to maintain in cache
+  default: 10000
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_lc_counters_batch_size
+  type: uint
+  level: advanced
+  desc: Batch size for flushing LC per-bucket counters
+  long_desc: LC per-bucket counters are flushed to cache every N objects processed.
+    Lower values provide more frequent updates, higher values reduce overhead.
+  default: 5000
+  min: 1
+  services:
+  - rgw
+  see_also:
+  - rgw_lc_counters_cache
+  with_legacy: true
 - name: rgw_restore_debug_interval
   type: int
   level: dev
index 4f7168810b8f7bb77efccd28477a0ec22ba5c150..1bd8cbf91f27e784b17132fce7431ca3ecd335da 100644 (file)
@@ -8,6 +8,7 @@
 #include <algorithm>
 #include <tuple>
 #include <functional>
+#include <atomic>
 
 #include <boost/algorithm/string/split.hpp>
 #include <boost/algorithm/string.hpp>
@@ -44,6 +45,100 @@ constexpr int32_t secs_in_a_day = hours_in_a_day * 60 * 60;
 
 using namespace std;
 
+/*
+ * Batching accumulator for LC counter updates.
+ *
+ * obj_scanned is touched only by the producer thread (the listing loop),
+ * so it is non-atomic. All other staged counters are touched from worker
+ * coroutines via record_*() and must be atomic.
+ */
+struct LCBatchCounters {
+  PerfCounters* perf_counters;
+  uint64_t obj_scanned{0};
+  std::atomic<uint64_t> obj_completed{0};
+  std::atomic<uint64_t> obj_expired{0};
+  std::atomic<uint64_t> obj_noncur_expired{0};
+  std::atomic<uint64_t> obj_dm_expired{0};
+  std::atomic<uint64_t> obj_transitioned{0};
+  std::atomic<uint64_t> obj_mpu_aborted{0};
+  uint64_t flush_threshold;
+
+  LCBatchCounters(PerfCounters* pc, uint64_t threshold)
+    : perf_counters(pc), flush_threshold(threshold) {}
+
+  void increment_scanned() {
+    if (!perf_counters) return;
+    ++obj_scanned;
+  }
+
+  void decrement_pending() {
+    if (!perf_counters) return;
+    obj_completed.fetch_add(1, std::memory_order_relaxed);
+  }
+
+  void record_expired() {
+    if (!perf_counters) return;
+    obj_expired.fetch_add(1, std::memory_order_relaxed);
+  }
+
+  void record_noncur_expired() {
+    if (!perf_counters) return;
+    obj_noncur_expired.fetch_add(1, std::memory_order_relaxed);
+  }
+
+  void record_dm_expired() {
+    if (!perf_counters) return;
+    obj_dm_expired.fetch_add(1, std::memory_order_relaxed);
+  }
+
+  void record_transitioned() {
+    if (!perf_counters) return;
+    obj_transitioned.fetch_add(1, std::memory_order_relaxed);
+  }
+
+  void record_mpu_aborted() {
+    if (!perf_counters) return;
+    obj_mpu_aborted.fetch_add(1, std::memory_order_relaxed);
+  }
+
+  void flush_scanned() {
+    if (!perf_counters) return;
+    if (obj_scanned > 0) {
+      perf_counters->inc(l_rgw_lc_per_bucket_obj_scanned, obj_scanned);
+      perf_counters->inc(l_rgw_lc_per_bucket_obj_pending, obj_scanned);
+      obj_scanned = 0;
+    }
+  }
+
+  void flush_completed() {
+    if (!perf_counters) return;
+    uint64_t completed = obj_completed.exchange(0, std::memory_order_relaxed);
+    if (completed > 0) {
+      perf_counters->dec(l_rgw_lc_per_bucket_obj_pending, completed);
+    }
+  }
+
+  void flush_actions() {
+    if (!perf_counters) return;
+    uint64_t v;
+    if ((v = obj_expired.exchange(0, std::memory_order_relaxed)) > 0) {
+      perf_counters->inc(l_rgw_lc_per_bucket_obj_expired, v);
+    }
+    if ((v = obj_noncur_expired.exchange(0, std::memory_order_relaxed)) > 0) {
+      perf_counters->inc(l_rgw_lc_per_bucket_obj_noncur_expired, v);
+    }
+    if ((v = obj_dm_expired.exchange(0, std::memory_order_relaxed)) > 0) {
+      perf_counters->inc(l_rgw_lc_per_bucket_obj_dm_expired, v);
+    }
+    if ((v = obj_transitioned.exchange(0, std::memory_order_relaxed)) > 0) {
+      perf_counters->inc(l_rgw_lc_per_bucket_obj_transitioned, v);
+    }
+    if ((v = obj_mpu_aborted.exchange(0, std::memory_order_relaxed)) > 0) {
+      perf_counters->inc(l_rgw_lc_per_bucket_obj_mpu_aborted, v);
+    }
+  }
+};
+
 const char* LC_STATUS[] = {
       "UNINITIAL",
       "PROCESSING",
@@ -508,6 +603,7 @@ struct lc_op_ctx {
 
   std::unique_ptr<rgw::sal::Object> obj;
   const DoutPrefixProvider *dpp;
+  LCBatchCounters* batch_counters;
 
   std::unique_ptr<rgw::sal::PlacementTier> tier;
   const RGWObjTags* cached_tags{nullptr};
@@ -516,11 +612,12 @@ struct lc_op_ctx {
            boost::optional<std::string> next_key_name,
            uint64_t num_noncurrent,
            ceph::real_time effective_mtime,
-           const DoutPrefixProvider *dpp)
+           const DoutPrefixProvider *dpp,
+            LCBatchCounters* batch_counters)
     : cct(env.driver->ctx()), env(env), o(o), next_key_name(next_key_name),
       num_noncurrent(num_noncurrent), effective_mtime(effective_mtime),
       driver(env.driver), bucket(env.bucket), op(env.op), ol(env.ol),
-      dpp(dpp)
+      dpp(dpp), batch_counters(batch_counters)
     {
       obj = bucket->get_object(o.key);
       /* once bucket versioning is enabled, the non-current entries with
@@ -771,7 +868,8 @@ public:
   void build();
   void update();
   int process(rgw_bucket_dir_entry& o, const DoutPrefixProvider *dpp,
-             optional_yield y, const RGWObjTags* cached_tags = nullptr);
+              LCBatchCounters* batch_counters, optional_yield y,
+              const RGWObjTags* cached_tags = nullptr);
 }; /* LCOpRule */
 
 RGWLC::LCWorker::LCWorker(const DoutPrefixProvider* dpp, CephContext *cct,
@@ -789,7 +887,8 @@ int RGWLC::handle_multipart_expiration(rgw::sal::Bucket* target,
                                       const multimap<string, lc_op>& prefix_map,
                                       ceph::async::spawn_throttle& workpool,
                                       boost::asio::yield_context yield,
-                                      LCWorker* worker, time_t stop_at, bool once)
+                                      LCWorker* worker, LCBatchCounters* batch_counters,
+                                      time_t stop_at, bool once)
 {
   int ret;
   rgw::sal::Bucket::ListParams params_base;
@@ -807,7 +906,7 @@ int RGWLC::handle_multipart_expiration(rgw::sal::Bucket* target,
   params_base.ns = RGW_OBJ_NS_MULTIPART;
   params_base.access_list_filter = MultipartMetaFilter;
 
-  auto pf = [this, target] (optional_yield y, const lc_op& rule,
+  auto pf = [this, target, batch_counters] (optional_yield y, const lc_op& rule,
                             const rgw_bucket_dir_entry& obj) {
     int ret{0};
 
@@ -835,6 +934,9 @@ int RGWLC::handle_multipart_expiration(rgw::sal::Bucket* target,
         if (perfcounter) {
           perfcounter->inc(l_rgw_lc_abort_mpu, 1);
         }
+        if (batch_counters) {
+          batch_counters->record_mpu_aborted();
+        }
       } else {
         if (ret == -ERR_NO_SUCH_UPLOAD) {
           ldpp_dout(this, 5) << "ERROR: abort_multipart_upload failed, ret="
@@ -845,9 +947,27 @@ int RGWLC::handle_multipart_expiration(rgw::sal::Bucket* target,
         }
       } /* abort failed */
     }   /* expired */
-               return ret;
+
+    return ret;
   };
 
+  /*
+   * mpu_count tracks total objects for the modulo flush pattern;
+   * obj_scanned resets on each flush so it can't be used for modulo.
+   */
+  uint64_t mpu_count = 0;
+  uint64_t batch_threshold = batch_counters->flush_threshold;
+
+  // Flush counters on all exit paths
+  auto flush_guard = make_scope_guard(
+    [batch_counters]
+      {
+        batch_counters->flush_scanned();
+        batch_counters->flush_completed();
+        batch_counters->flush_actions();
+      }
+    );
+
   std::map<std::string, std::vector<const lc_op*>> grouped_mp_ops;
   for (auto& prefix_entry : prefix_map) {
     if (!prefix_entry.second.status || prefix_entry.second.mp_expiration <= 0) {
@@ -881,24 +1001,41 @@ int RGWLC::handle_multipart_expiration(rgw::sal::Bucket* target,
 
       for (auto obj_iter = results.objs.begin(); obj_iter != results.objs.end(); ++obj_iter, ++offset) {
         const auto obj = *obj_iter;
-        for (auto* op : prefix_iter->second) {
-          workpool.spawn([pf, op, obj]
-                         (boost::asio::yield_context yield) mutable {
+        workpool.spawn([pf, ops = prefix_iter->second, obj, batch_counters]
+                       (boost::asio::yield_context yield) mutable {
+            for (auto* op : ops) {
               pf(yield, *op, obj);
-            });
+            }
+            batch_counters->decrement_pending();
+          });
+
+        batch_counters->increment_scanned();
+        mpu_count++;
+
+        /*
+         * Flush counters every batch_threshold multipart objects.
+         * Flush scanned first, then completed; obj_scanned is incremented
+         * synchronously above before any worker decrement, so
+         * obj_scanned >= obj_completed always holds at flush time.
+         */
+        if (batch_threshold > 0 && (mpu_count % batch_threshold) == 0) {
+          batch_counters->flush_scanned();
+          batch_counters->flush_completed();
+          batch_counters->flush_actions();
+        }
+
+        if (going_down()) {
+          return 0;
         }
-       if (going_down()) {
-         return 0;
-       }
       } /* for objs */
 
       if ((offset % 100) == 0) {
-       if (worker_should_stop(stop_at, once)) {
-         ldpp_dout(this, 5) << __func__ << " interval budget EXPIRED worker="
-                            << worker->ix << " bucket=" << target->get_name()
-                            << dendl;
-         return 0;
-       }
+        if (worker_should_stop(stop_at, once)) {
+          ldpp_dout(this, 5) << __func__ << " interval budget EXPIRED worker="
+                             << worker->ix << " bucket=" << target->get_name()
+                             << dendl;
+          return 0;
+        }
       }
     } while(results.is_truncated);
   } /* for grouped_mp_ops */
@@ -1098,6 +1235,9 @@ public:
       }
       ldpp_dout(oc.dpp, 2) << "DELETED: current is-dm "
                       << oc.bucket << ":" << o.key << dendl;
+      if (oc.batch_counters) {
+        oc.batch_counters->record_dm_expired();
+      }
     } else {
       /* ! o.is_delete_marker() */
       r = remove_expired_obj(oc.dpp, y, oc, !oc.bucket->versioning_enabled(),
@@ -1114,6 +1254,9 @@ public:
       }
       ldpp_dout(oc.dpp, 2) << "DELETED:" << oc.bucket << ":" << o.key
                       << dendl;
+      if (oc.batch_counters) {
+        oc.batch_counters->record_expired();
+      }
     }
     return 0;
   }
@@ -1166,6 +1309,9 @@ public:
     }
     ldpp_dout(oc.dpp, 2) << "DELETED:" << oc.bucket << ":" << o.key
                     << " (non-current expiration)" << dendl;
+    if (oc.batch_counters) {
+      oc.batch_counters->record_noncur_expired();
+    }
     return 0;
   }
 };
@@ -1211,6 +1357,9 @@ public:
     }
     ldpp_dout(oc.dpp, 2) << "DELETED:" << oc.bucket << ":" << o.key
                     << " (delete marker expiration)" << dendl;
+    if (oc.batch_counters) {
+      oc.batch_counters->record_dm_expired();
+    }
     return 0;
   }
 };
@@ -1464,6 +1613,9 @@ public:
     ldpp_dout(oc.dpp, 2) << "TRANSITIONED:" << oc.bucket
                         << ":" << o.key << " -> "
                         << transition.storage_class << dendl;
+    if (oc.batch_counters) {
+      oc.batch_counters->record_transitioned();
+    }
     return 0;
   }
 };
@@ -1553,10 +1705,11 @@ void LCOpRule::update()
 
 int LCOpRule::process(rgw_bucket_dir_entry& o,
                      const DoutPrefixProvider *dpp,
-                     optional_yield y,
+                     LCBatchCounters* batch_counters,
+                      optional_yield y,
                      const RGWObjTags* cached_tags)
 {
-  lc_op_ctx ctx(env, o, next_key_name, num_noncurrent, effective_mtime, dpp);
+  lc_op_ctx ctx(env, o, next_key_name, num_noncurrent, effective_mtime, dpp, batch_counters);
   ctx.cached_tags = cached_tags;
   shared_ptr<LCOpAction> *selected = nullptr; // n.b., req'd by sharing
   real_time exp;
@@ -1640,16 +1793,15 @@ int RGWLC::bucket_lc_process(string& shard_id, LCWorker* worker,
     return ret;
   }
 
-  // use a limited number of coroutines for concurrent processing
-  size_t limit = cct->_conf.get_val<int64_t>("rgw_lc_max_wp_worker");
-  auto workpool = ceph::async::spawn_throttle{yield, limit};
-  auto stack_guard = make_scope_guard(
-    [&workpool]
-      {
-       workpool.wait();
-      }
-    );
-
+  /*
+   * Validate the bucket before initializing per-bucket LC counters or
+   * spawning a workpool. Stamping start_time/end_time for stale index
+   * entries or buckets whose lifecycle configuration has been removed
+   * would make those skipped entries appear as completed LC runs in
+   * the metrics, and for a recreated same-name bucket would overwrite
+   * the visible run timestamps without any LC processing actually
+   * happening.
+   */
   if (bucket->get_marker() != bucket_marker) {
     ldpp_dout(this, 1) << "LC: deleting stale entry found for bucket="
                       << bucket_tenant << ":" << bucket_name
@@ -1676,15 +1828,63 @@ int RGWLC::bucket_lc_process(string& shard_id, LCWorker* worker,
       return -1;
     }
 
+  /*
+   * Past this point the bucket is valid and we will actually process
+   * LC for it: create the workpool, set up per-bucket counters, and
+   * install the scope guards.
+   */
+  size_t limit = cct->_conf.get_val<int64_t>("rgw_lc_max_wp_worker");
+  auto workpool = ceph::async::spawn_throttle{yield, limit};
+
+  auto perf_counters = rgw::lc_counters::get(bucket->get_name(), bucket_tenant);
+  uint64_t batch_threshold = cct->_conf->rgw_lc_counters_batch_size;
+  LCBatchCounters batch_counters(perf_counters.get(), batch_threshold);
+
+  if (perf_counters) {
+    perf_counters->set(l_rgw_lc_per_bucket_start_time, ceph_clock_now().sec());
+    perf_counters->set(l_rgw_lc_per_bucket_end_time, 0);
+  }
+
+  /*
+   * total_objects_scanned tracks total objects for the modulo flush
+   * pattern; obj_scanned resets on each flush so it can't be used
+   * for modulo.
+   */
+  uint64_t total_objects_scanned = 0;
+
+  /*
+   * Flush counters after workers complete on all exit paths.
+   * flush_guard is declared first so it is destroyed last (after
+   * stack_guard waits for workers to finish).
+   */
+  auto flush_guard = make_scope_guard(
+    [&batch_counters, &perf_counters]
+      {
+        batch_counters.flush_scanned();
+        batch_counters.flush_completed();
+        batch_counters.flush_actions();
+        if (perf_counters) {
+          perf_counters->set(l_rgw_lc_per_bucket_end_time,
+                             ceph_clock_now().sec());
+        }
+      }
+    );
+  auto stack_guard = make_scope_guard(
+    [&workpool]
+      {
+        workpool.wait();
+      }
+    );
+
   /* fetch information for zone checks */
   rgw::sal::Zone* zone = driver->get_zone();
 
-  auto pf = [&bucket_name](const DoutPrefixProvider* dpp, optional_yield y,
+  auto pf = [&bucket_name, &batch_counters](const DoutPrefixProvider* dpp, optional_yield y,
                            LCOpRule& op_rule, rgw_bucket_dir_entry& o,
                            const RGWObjTags* cached_tags) {
     ldpp_dout(dpp, 20)
       << __func__ << "(): key=" << o.key << dendl;
-    int ret = op_rule.process(o, dpp, y, cached_tags);
+    int ret = op_rule.process(o, dpp, &batch_counters, y, cached_tags);
     if (ret < 0) {
       ldpp_dout(dpp, 20)
        << "ERROR: orule.process() returned ret=" << ret
@@ -1772,7 +1972,7 @@ int RGWLC::bucket_lc_process(string& shard_id, LCWorker* worker,
       }
 
       // Spawn one coroutine per object to process all rules
-      workpool.spawn([&pf, dpp=this, rules_copy=rules, obj, bucket=bucket.get()]
+      workpool.spawn([&pf, &batch_counters, dpp=this, rules_copy=rules, obj, bucket=bucket.get()]
                      (boost::asio::yield_context yield) mutable {
         // Check if any rule needs tags so we only fetch once per object
         bool any_rule_needs_tags = std::any_of(rules_copy.begin(), rules_copy.end(),
@@ -1821,8 +2021,27 @@ int RGWLC::bucket_lc_process(string& shard_id, LCWorker* worker,
           pf(dpp, yield, rule, const_cast<rgw_bucket_dir_entry&>(obj),
              cached_tags_ptr);
         }
+
+        // Decrement pending once per object, after all rules are evaluated
+        batch_counters.decrement_pending();
       });
 
+      total_objects_scanned++;
+
+      batch_counters.increment_scanned();
+
+      /*
+       * Flush counters every batch_threshold objects.
+       * Flush scanned first, then completed; obj_scanned is incremented
+       * synchronously above before any worker decrement, so
+       * obj_scanned >= obj_completed always holds at flush time.
+       */
+      if (batch_threshold > 0 && (total_objects_scanned % batch_threshold) == 0) {
+        batch_counters.flush_scanned();
+        batch_counters.flush_completed();
+        batch_counters.flush_actions();
+      }
+
       if ((offset % 100) == 0) {
        if (worker_should_stop(stop_at, once)) {
          ldpp_dout(this, 5) << __func__ << " interval budget EXPIRED worker="
@@ -1835,7 +2054,8 @@ int RGWLC::bucket_lc_process(string& shard_id, LCWorker* worker,
   }
 
   ret = handle_multipart_expiration(bucket.get(), prefix_map, workpool,
-                                    yield, worker, stop_at, once);
+                                    yield, worker, &batch_counters, stop_at, once);
+
   return ret;
 }
 
index a0b7f3eef88f6e0a98c390fe4ec174d979cd2ee3..b4cab41916d5429e2fd80f4c194d721702da45de 100644 (file)
@@ -31,6 +31,9 @@ static std::string lc_index_lock_name = "lc_process";
 
 extern const char* LC_STATUS[];
 
+// Forward declaration
+struct LCBatchCounters;
+
 typedef enum {
   lc_uninitial = 0,
   lc_processing,
@@ -675,7 +678,8 @@ public:
                                  const std::multimap<std::string, lc_op>& prefix_map,
                                  ceph::async::spawn_throttle& workpool,
                                  boost::asio::yield_context yield,
-                                 LCWorker* worker, time_t stop_at, bool once);
+                                 LCWorker* worker, LCBatchCounters* batch_counters,
+                                 time_t stop_at, bool once);
 };
 
 namespace rgw::lc {
index 0eba41d8081448129f95908ae37134a10f5fad77..dab47012c5158f40176281b8e5f45907e72a4c5a 100644 (file)
@@ -232,6 +232,61 @@ CountersManager::~CountersManager() {
 
 } // namespace rgw::persistent_topic_counters
 
+namespace rgw::lc_counters {
+
+ceph::perf_counters::PerfCountersCache *lc_counters_cache = nullptr;
+const std::string rgw_lc_counters_key = "rgw_lc_per_bucket";
+
+void add_lc_counters(PerfCountersBuilder *pcb) {
+  pcb->set_prio_default(PerfCountersBuilder::PRIO_USEFUL);
+
+  pcb->add_u64(l_rgw_lc_per_bucket_start_time, "start_time",
+               "LC processing start timestamp (Unix epoch seconds)");
+  pcb->add_u64(l_rgw_lc_per_bucket_end_time, "end_time",
+               "LC processing end timestamp (Unix epoch seconds)");
+  pcb->add_u64_counter(l_rgw_lc_per_bucket_obj_scanned, "objects_scanned",
+                       "Objects scanned by LC (cumulative across runs)");
+  pcb->add_u64(l_rgw_lc_per_bucket_obj_pending, "objects_pending",
+               "Objects currently pending LC processing");
+  pcb->add_u64_counter(l_rgw_lc_per_bucket_obj_expired, "objects_expired",
+                       "Current-version objects expired by LC");
+  pcb->add_u64_counter(l_rgw_lc_per_bucket_obj_noncur_expired,
+                       "objects_noncurrent_expired",
+                       "Noncurrent-version objects expired by LC");
+  pcb->add_u64_counter(l_rgw_lc_per_bucket_obj_dm_expired,
+                       "objects_dm_expired",
+                       "Delete markers expired by LC");
+  pcb->add_u64_counter(l_rgw_lc_per_bucket_obj_transitioned,
+                       "objects_transitioned",
+                       "Objects transitioned by LC");
+  pcb->add_u64_counter(l_rgw_lc_per_bucket_obj_mpu_aborted,
+                       "objects_mpu_aborted",
+                       "Multipart uploads aborted by LC");
+}
+
+std::shared_ptr<PerfCounters> create_lc_counters(const std::string& name, CephContext *cct) {
+  PerfCountersBuilder pcb(cct, name, l_rgw_lc_per_bucket_first, l_rgw_lc_per_bucket_last);
+  add_lc_counters(&pcb);
+  std::shared_ptr<PerfCounters> new_counters(pcb.create_perf_counters());
+  cct->get_perfcounters_collection()->add(new_counters.get());
+  return new_counters;
+}
+
+std::shared_ptr<PerfCounters> get(const std::string& bucket_name,
+                                  const std::string& tenant) {
+  if (!lc_counters_cache) {
+    return nullptr;
+  }
+  std::string key = ceph::perf_counters::key_create(rgw_lc_counters_key,
+                                                    {{"bucket", bucket_name}});
+  if (!tenant.empty()) {
+    key = ceph::perf_counters::key_insert(key, {{"tenant", tenant}});
+  }
+  return lc_counters_cache->get(key);
+}
+
+} // namespace rgw::lc_counters
+
 int rgw_perf_start(CephContext *cct)
 {
   frontend_counters_init(cct);
@@ -248,6 +303,12 @@ int rgw_perf_start(CephContext *cct)
     bucket_counters_cache = new PerfCountersCache(cct, target_size, create_rgw_op_counters);
   }
 
+  bool lc_counters_cache_enabled = cct->_conf.get_val<bool>("rgw_lc_counters_cache");
+  if (lc_counters_cache_enabled) {
+    uint64_t target_size = cct->_conf.get_val<uint64_t>("rgw_lc_counters_cache_size");
+    rgw::lc_counters::lc_counters_cache = new PerfCountersCache(cct, target_size, rgw::lc_counters::create_lc_counters);
+  }
+
   global_op_counters_init(cct);
   return 0;
 }
@@ -262,4 +323,5 @@ void rgw_perf_stop(CephContext *cct)
   delete global_op_counters;
   delete user_counters_cache;
   delete bucket_counters_cache;
+  delete rgw::lc_counters::lc_counters_cache;
 }
index 162c55bc07512add6d897616cd399aa1937ba348..e93b4fe36f91c30939328afb92855ce22f395034 100644 (file)
@@ -96,6 +96,22 @@ enum {
   l_rgw_topic_last
 };
 
+enum {
+  l_rgw_lc_per_bucket_first = 18000,
+
+  l_rgw_lc_per_bucket_start_time,
+  l_rgw_lc_per_bucket_end_time,
+  l_rgw_lc_per_bucket_obj_scanned,
+  l_rgw_lc_per_bucket_obj_pending,
+  l_rgw_lc_per_bucket_obj_expired,
+  l_rgw_lc_per_bucket_obj_noncur_expired,
+  l_rgw_lc_per_bucket_obj_dm_expired,
+  l_rgw_lc_per_bucket_obj_transitioned,
+  l_rgw_lc_per_bucket_obj_mpu_aborted,
+
+  l_rgw_lc_per_bucket_last
+};
+
 namespace rgw::op_counters {
 
 struct CountersContainer {
@@ -129,3 +145,10 @@ public:
 };
 
 } // namespace rgw::persistent_topic_counters
+
+namespace rgw::lc_counters {
+
+std::shared_ptr<PerfCounters> get(const std::string& bucket_name,
+                                  const std::string& tenant);
+
+} // namespace rgw::lc_counters