git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
os/bluestore: add health warning for oversized BlueFS usage 67192/head
author Indira Sawant <indira@example.com>
Tue, 11 Nov 2025 17:51:43 +0000 (11:51 -0600)
committer Indira Sawant <indira.sawant@ibm.com>
Fri, 13 Mar 2026 19:02:02 +0000 (14:02 -0500)
Add a BLUESTORE_BLUEFS_OVERSIZED health warning when total BlueFS usage
(DB, WAL, and spillover on the slow device) exceeds a configurable ratio
of the main device size.

The threshold is controlled by the new configuration option
`bluestore_bluefs_warn_ratio` (default 0.06).

Fixes: https://tracker.ceph.com/issues/73826
Signed-off-by: Indira Sawant <indira.sawant@ibm.com>
PendingReleaseNotes
src/common/options/global.yaml.in
src/mon/PGMap.cc
src/os/bluestore/BlueFS.cc
src/os/bluestore/BlueStore.cc

index 446297dad6179e1a3f8f3e387069f606ca6bd150..bac7711f12e85cd86f2281f8ed1d0113ac5504b1 100644 (file)
   scan_extents, scan_inodes, and other state-changing operations.
   Related Tracker: https://tracker.ceph.com/issues/63191
 
+* OSD: A health warning is reported when BlueFS usage exceeds the
+  configured ratio of the main OSD data device size. This warning is
+  informational and can be muted with:
+  ``ceph health mute BLUESTORE_BLUEFS_OVERSIZED``
+
 >=20.0.0
 
 * RADOS: The lead Monitor and stretch mode status are now displayed by `ceph status`.
index f3ebf1b1a8e2e8c6c24fa28af5545eea29ea5cd2..0de100d4798f8c6c869538658b6019cf793ef7a0 100644 (file)
@@ -5865,6 +5865,15 @@ options:
   flags:
   - runtime
   with_legacy: true
+- name: bluestore_bluefs_warn_ratio
+  type: float
+  level: basic
+  desc: The ratio at which BlueFS usage relative to the main device raises a
+    health warning. Set to "1" to disable.
+  default: 0.06
+  with_legacy: false
+  flags:
+  - runtime
 # rocksdb options that will be used for omap(if omap_backend is rocksdb)
 - name: filestore_rocksdb_options
   type: str
index 3a500d5f35fc350f9cf7b179aa0c0347cdc1d0e7..3eef838e826cda8218ef331115e3e2da4de822a0 100644 (file)
@@ -3314,6 +3314,8 @@ void PGMap::get_health_checks(
         summary += " experiencing stalled read in block device of BlueStore";
       } else if (asum.first == "WAL_DEVICE_STALLED_READ_ALERT") {
         summary += " experiencing stalled read in wal device of BlueFS";
+      } else if (asum.first == "BLUESTORE_BLUEFS_OVERSIZED") {
+        summary += " have BlueFS usage exceeding configured ratio of main device size";
       } else if (asum.first == "DB_DEVICE_STALLED_READ_ALERT") {
         summary += " experiencing stalled read in db device of BlueFS";
       } else if (asum.first.find("_DISCARD_QUEUE") != std::string::npos) {
index dbc9ce6d784c36b437201391ac5a4a27cb37a971..a38f40267f93197c955676a62c08ba3759529c38 100644 (file)
@@ -621,7 +621,6 @@ uint64_t BlueFS::_get_used(unsigned id) const
 uint64_t BlueFS::get_used(unsigned id)
 {
   ceph_assert(id < alloc.size());
-  ceph_assert(alloc[id]);
   return _get_used(id);
 }
 
index 28ee6a4f87a44a0a1639ab14f18889b0f38be1e7..af324a0acd09df50b923221d1f368eba5839101b 100644 (file)
@@ -19274,6 +19274,32 @@ void BlueStore::_log_alerts(osd_alert_list_t& alerts)
   } else if (!spillover_alert.empty()){
     spillover_alert.clear();
   }
+  // CHECK: BlueFS usage relative to main device size
+  if (bluefs) {
+    uint64_t db_used = bluefs->get_used(BlueFS::BDEV_DB);
+    uint64_t wal_used = bluefs->get_used(BlueFS::BDEV_WAL);
+    uint64_t slow_used = bluefs->get_used(BlueFS::BDEV_SLOW);
+    uint64_t main_size = bdev->get_size();
+
+    if (main_size > 0) {
+      uint64_t total_bluefs_usage = db_used + wal_used + slow_used;
+      double ratio = static_cast<double>(total_bluefs_usage) /
+                     static_cast<double>(main_size);
+      double warn_ratio =
+        cct->_conf.get_val<double>("bluestore_bluefs_warn_ratio");
+
+      if (ratio > warn_ratio) {
+        ostringstream ss;
+        ss << "BlueFS usage (" << byte_u_t(total_bluefs_usage)
+           << ") exceeds " << std::fixed << std::setprecision(4)
+           << (warn_ratio * 100.0) << "% of main device ("
+           << byte_u_t(main_size) << ", "
+           << std::fixed << std::setprecision(2)
+           << ratio * 100.0 << "%)";
+        alerts.emplace("BLUESTORE_BLUEFS_OVERSIZED", ss.str());
+      }
+    }
+  }
   if (cct->_conf->bluestore_slow_ops_warn_threshold) {
     size_t qsize = _trim_slow_op_event_queue(mono_clock::now());
     if (qsize >= cct->_conf->bluestore_slow_ops_warn_threshold) {