git-server-git.apps.pok.os.sepia.ceph.com Git - ceph-ci.git/commitdiff
os/bluestore: add health warn for shared DB/WAL ratio wip-isawant-bz2387783
author Indira Sawant <indira@example.com>
Tue, 27 Jan 2026 19:40:20 +0000 (13:40 -0600)
committer Indira Sawant <indira@example.com>
Tue, 27 Jan 2026 19:40:20 +0000 (13:40 -0600)
BlueStore now emits a HEALTH_WARN alert when the BlueFS DB/WAL device,
which shares the main device, exceeds 6% of the main device size. This
helps administrators identify potentially oversized metadata that could
impact performance.

Changes include:
- Added logic in BlueStore::_log_alerts() to check the shared DB/WAL
  size relative to the main device.
- Introduced a new alert key "BLUESTORE_SHARED_DB_RATIO" for reporting
  via `ceph health detail`.
- Updated the OSD health summary aggregation code to display a human-
  readable description of this alert.

Fixes: https://tracker.ceph.com/issues/73826
Signed-off-by: Indira Sawant <indira@example.com>
PendingReleaseNotes
src/mon/PGMap.cc
src/os/bluestore/BlueStore.cc

index 446297dad6179e1a3f8f3e387069f606ca6bd150..18d21fe4056bd6840341f1f58923409713685092 100644 (file)
   scan_extents, scan_inodes, and other state-changing operations.
   Related Tracker: https://tracker.ceph.com/issues/63191
 
+* OSD: A health warning (``BLUESTORE_SHARED_DB_RATIO``) is reported when the BlueStore
+  DB/WAL size exceeds 6% of the main OSD data device size.
+  This warning is informational and does not impact OSD functionality, but it highlights
+  a condition that can lead to uneven OSD utilization and cause some OSDs to reach
+  *full states sooner than others. Administrators may review and take action if needed.
+  Users can temporarily mute it with:
+  ``ceph health mute BLUESTORE_SHARED_DB_RATIO``
+
 >=20.0.0
 
 * RADOS: The lead Monitor and stretch mode status are now displayed by `ceph status`.
index 3a500d5f35fc350f9cf7b179aa0c0347cdc1d0e7..071a85f1e4885894007c469ab7ad7c6d501e16bd 100644 (file)
@@ -3314,6 +3314,8 @@ void PGMap::get_health_checks(
         summary += " experiencing stalled read in block device of BlueStore";
       } else if (asum.first == "WAL_DEVICE_STALLED_READ_ALERT") {
         summary += " experiencing stalled read in wal device of BlueFS";
+      } else if (asum.first == "BLUESTORE_SHARED_DB_RATIO") {
+        summary += " have shared DB/WAL device exceeding 6% of main device size";
       } else if (asum.first == "DB_DEVICE_STALLED_READ_ALERT") {
         summary += " experiencing stalled read in db device of BlueFS";
       } else if (asum.first.find("_DISCARD_QUEUE") != std::string::npos) {
index 28ee6a4f87a44a0a1639ab14f18889b0f38be1e7..f57648b7e931f259546245cbea82036aeb122478 100644 (file)
@@ -19274,6 +19274,23 @@ void BlueStore::_log_alerts(osd_alert_list_t& alerts)
   } else if (!spillover_alert.empty()){
     spillover_alert.clear();
   }
+  // CHECK: shared DB/WAL ratio with main device size
+  if (bluefs) {
+    uint64_t db_size = bluefs->get_block_device_size(BlueFS::BDEV_DB);
+    uint64_t block_size = bluefs->get_block_device_size(BlueFS::BDEV_SLOW);
+
+    if (block_size > 0 && db_size >0) {
+      double ratio = static_cast<double>(db_size) / static_cast<double>(block_size);
+      if (ratio > 0.06) {
+        ostringstream ss;
+        ss << "BlueStore shared DB/WAL device (" << byte_u_t(db_size)
+           << ") exceeds 6% of main device (" << byte_u_t(block_size)
+           << ", " << std::fixed << std::setprecision(2)
+           << ratio * 100.0 << "%)";
+        alerts.emplace("BLUESTORE_SHARED_DB_RATIO", ss.str());
+      }
+    }
+  }
   if (cct->_conf->bluestore_slow_ops_warn_threshold) {
     size_t qsize = _trim_slow_op_event_queue(mono_clock::now());
     if (qsize >= cct->_conf->bluestore_slow_ops_warn_threshold) {