]> git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
Warning added for slow operations and stalled read in BlueStore. User can control... 59468/head
authorsajibreadd <sajibreadd@gmail.com>
Mon, 27 May 2024 07:30:06 +0000 (13:30 +0600)
committersajibreadd <sajibreadd@gmail.com>
Thu, 19 Sep 2024 11:03:58 +0000 (17:03 +0600)
Fixes: https://tracker.ceph.com/issues/62500
Signed-off-by: Md Mahamudur Rahaman Sajib <mahamudur.sajib@croit.io>
(cherry picked from commit 73b80a9a2c38259346fb646f85fa2ba4dcbb1329)

Conflicts:
src/blk/BlockDevice.h
src/common/options/global.yaml.in
src/os/bluestore/BlueStore.cc
src/os/bluestore/BlueStore.h
        doc/rados/operations/health-checks.rst

src/blk/BlockDevice.cc
src/blk/BlockDevice.h
src/blk/kernel/KernelDevice.cc
src/common/options/global.yaml.in
src/mon/PGMap.cc
src/os/bluestore/BlueFS.cc
src/os/bluestore/BlueFS.h
src/os/bluestore/BlueStore.cc
src/os/bluestore/BlueStore.h

index fd07e443c1365f470861edef5be391ebafb0936b..d878afc388e9073a1a087982ee5caed660b4507f 100644 (file)
@@ -46,6 +46,7 @@
 #define dout_prefix *_dout << "bdev "
 
 using std::string;
+using ceph::mono_clock;
 
 
 blk_access_mode_t buffermode(bool buffered) 
@@ -209,3 +210,39 @@ bool BlockDevice::is_valid_io(uint64_t off, uint64_t len) const {
   }
   return ret;
 }
+
+size_t BlockDevice::trim_stalled_read_event_queue(mono_clock::time_point cur_time) {
+  std::lock_guard lock(stalled_read_event_queue_lock);
+  auto warn_duration = std::chrono::seconds(cct->_conf->bdev_stalled_read_warn_lifetime);
+  while (!stalled_read_event_queue.empty() && 
+    ((stalled_read_event_queue.front() < cur_time - warn_duration) ||
+      (stalled_read_event_queue.size() > cct->_conf->bdev_stalled_read_warn_threshold))) {
+      stalled_read_event_queue.pop();
+  }
+  return stalled_read_event_queue.size();
+}
+
+void BlockDevice::add_stalled_read_event() {
+  if (!cct->_conf->bdev_stalled_read_warn_threshold) {
+    return;
+  }
+  auto cur_time = mono_clock::now();
+  {
+    std::lock_guard lock(stalled_read_event_queue_lock);
+    stalled_read_event_queue.push(cur_time);
+  }
+  trim_stalled_read_event_queue(cur_time);
+}
+
+void BlockDevice::collect_alerts(osd_alert_list_t& alerts, const std::string& device_name) {
+  if (cct->_conf->bdev_stalled_read_warn_threshold) {
+    size_t qsize = trim_stalled_read_event_queue(mono_clock::now());
+    if (qsize >= cct->_conf->bdev_stalled_read_warn_threshold) {
+      std::ostringstream ss;
+      ss << "observed stalled read indications in "
+        << device_name << " device";
+      alerts.emplace(device_name + "_DEVICE_STALLED_READ_ALERT", ss.str());
+    }
+  }
+}
+
index 67b4b330b6c45fd970e36f6a85f60e304db47d85..295dfd1e7ee413c0be389fa1ebd8c12cd9c68d96 100644 (file)
 #include <set>
 #include <string>
 #include <vector>
+#include <queue>
 
 #include "acconfig.h"
 #include "common/ceph_mutex.h"
 #include "include/common_fwd.h"
+#include "osd/osd_types.h"
 
 #if defined(HAVE_LIBAIO) || defined(HAVE_POSIXAIO)
 #include "aio/aio.h"
@@ -147,6 +149,8 @@ class BlockDevice {
 public:
   CephContext* cct;
   typedef void (*aio_callback_t)(void *handle, void *aio);
+  void collect_alerts(osd_alert_list_t& alerts, const std::string& device_name);
+
 private:
   ceph::mutex ioc_reap_lock = ceph::make_mutex("BlockDevice::ioc_reap_lock");
   std::vector<IOContext*> ioc_reap_queue;
@@ -166,12 +170,14 @@ private:
     pmem,
 #endif
   };
+  std::queue <ceph::mono_clock::time_point> stalled_read_event_queue;
+  ceph::mutex stalled_read_event_queue_lock = ceph::make_mutex("BlockDevice::stalled_read_event_queue_lock");
+  size_t trim_stalled_read_event_queue(mono_clock::time_point cur_time);
   static block_device_t detect_device_type(const std::string& path);
   static block_device_t device_type_from_name(const std::string& blk_dev_name);
   static BlockDevice *create_with_type(block_device_t device_type,
     CephContext* cct, const std::string& path, aio_callback_t cb,
     void *cbpriv, aio_callback_t d_cb, void *d_cbpriv);
-
 protected:
   uint64_t size = 0;
   uint64_t block_size = 0;
@@ -189,6 +195,7 @@ protected:
   // of the drive.  The zones 524-52155 are sequential zones.
   uint64_t conventional_region_size = 0;
   uint64_t zone_size = 0;
+  void add_stalled_read_event();
 
 public:
   aio_callback_t aio_callback;
index b5830b51d6c213884929868befe214573bbcf3f4..f1bca417a68718d3d999cc6ddf75fc27e01773a0 100644 (file)
@@ -1262,6 +1262,7 @@ int KernelDevice::read(uint64_t off, uint64_t len, bufferlist *pbl,
         << " since " << start1 << ", timeout is "
         << age
         << "s" << dendl;
+    add_stalled_read_event();
   }
   if (r < 0) {
     if (ioc->allow_eio && is_expected_ioerr(-errno)) {
@@ -1335,6 +1336,7 @@ int KernelDevice::direct_read_unaligned(uint64_t off, uint64_t len, char *buf)
         << " since " << start1 << ", timeout is "
         << age
         << "s" << dendl;
+    add_stalled_read_event();
   }
 
   if (r < 0) {
@@ -1398,6 +1400,7 @@ int KernelDevice::read_random(uint64_t off, uint64_t len, char *buf,
            << " (buffered) since " << start1 << ", timeout is "
           << age
           << "s" << dendl;
+      add_stalled_read_event();
     }
   } else {
     //direct and aligned read
@@ -1408,6 +1411,7 @@ int KernelDevice::read_random(uint64_t off, uint64_t len, char *buf,
            << " (direct) since " << start1 << ", timeout is "
           << age
           << "s" << dendl;
+      add_stalled_read_event();
     }
     if (r < 0) {
       r = -errno;
index 3999a1a625fccd3c410dda52d31d0ef865d03ba1..a8a00325bde38fe327ccf3f74ccd6d8a4e9a3d3e 100644 (file)
@@ -5310,6 +5310,18 @@ options:
   desc: Enable health indication when spurious read errors are observed by OSD
   default: true
   with_legacy: true
+- name: bluestore_slow_ops_warn_lifetime
+  type: uint
+  level: advanced
+  desc: A configurable duration for slow ops warning to be appeared if number of occurence pass `bluestore_slow_ops_warn_threshold` in `bluestore_slow_ops_warn_lifetime` seconds
+  default: 86400
+  with_legacy: true
+- name: bluestore_slow_ops_warn_threshold
+  type: uint
+  level: advanced
+  desc: A configurable number for slow ops warning to be appeared if number of occurence pass `bluestore_slow_ops_warn_threshold` in `bluestore_slow_ops_warn_lifetime` seconds
+  default: 1
+  with_legacy: true
 - name: bluestore_fsck_error_on_no_per_pool_omap
   type: bool
   level: advanced
@@ -6345,6 +6357,18 @@ options:
   - spdk
   - pmem
   - hm_smr
+- name: bdev_stalled_read_warn_lifetime
+  type: uint
+  level: advanced
+  desc: A configurable duration for stalled read warning to be appeared if number of stalled read occurence pass `bdev_stalled_read_warn_threshold` in `bdev_stalled_read_warn_lifetime` seconds
+  default: 86400
+  with_legacy: true
+- name: bdev_stalled_read_warn_threshold
+  type: uint
+  level: advanced
+  desc: A configurable number for stalled read warning to be appeared if number of stalled read occurence pass `bdev_stalled_read_warn_threshold` in `bdev_stalled_read_warn_lifetime` seconds
+  default: 1
+  with_legacy: true
 - name: bluestore_cleaner_sleep_interval
   type: float
   level: advanced
index 326d589b705ad820ee16724e6416bb3db3044182..7ccafa384520092eb116b222c1381f114fd6a1a1 100644 (file)
@@ -3238,6 +3238,14 @@ void PGMap::get_health_checks(
        summary += " reporting legacy (not per-pool) BlueStore omap usage stats";
       } else if (asum.first == "BLUESTORE_SPURIOUS_READ_ERRORS") {
         summary += " have spurious read errors";
+      } else if (asum.first == "BLUESTORE_SLOW_OP_ALERT") {
+        summary += " experiencing slow operations in BlueStore";
+      } else if (asum.first == "BLOCK_DEVICE_STALLED_READ_ALERT") {
+        summary += " experiencing stalled read in block device of BlueStore";
+      } else if (asum.first == "WAL_DEVICE_STALLED_READ_ALERT") {
+        summary += " experiencing stalled read in wal device of BlueFS";
+      } else if (asum.first == "DB_DEVICE_STALLED_READ_ALERT") {
+        summary += " experiencing stalled read in db device of BlueFS";
       }
 
       auto& d = checks->add(asum.first, HEALTH_WARN, summary, asum.second.first);
index 04f7adfc71ad701eb14e7d82ae15d6ba0eae54ce..4c6a2042ab8829a9d9ca70bc6fd01b91f7f5a23c 100644 (file)
@@ -4112,6 +4112,15 @@ bool BlueFS::debug_get_is_dev_dirty(FileWriter *h, uint8_t dev)
   return h->dirty_devs[dev];
 }
 
+void BlueFS::collect_alerts(osd_alert_list_t& alerts) {
+  if (bdev[BDEV_DB]) {
+    bdev[BDEV_DB]->collect_alerts(alerts, "DB");
+  }
+  if (bdev[BDEV_WAL]) {
+    bdev[BDEV_WAL]->collect_alerts(alerts, "WAL");
+  }
+}
+
 int BlueFS::open_for_read(
   std::string_view dirname,
   std::string_view filename,
index 9c08bc377ffc0f3c9d215f789e7c9ceb36727c61..615ea49143e9804b268bdf88b954f4528a4184e7 100644 (file)
@@ -132,6 +132,7 @@ public:
     WRITER_WAL,
     WRITER_SST,
   };
+  void collect_alerts(osd_alert_list_t& alerts);
 
   struct File : public RefCountedObject {
     MEMPOOL_CLASS_HELPERS();
@@ -334,7 +335,6 @@ public:
     FileRef file;
     explicit FileLock(FileRef f) : file(std::move(f)) {}
   };
-
 private:
   PerfCounters *logger = nullptr;
 
index 3ca91ed41d5b0da299b92ba9e69ecc47c0827ba6..fd0c97455031d32c4ada8cc0a73c4e6421018b68 100644 (file)
@@ -11230,7 +11230,7 @@ int BlueStore::_do_read(
 int BlueStore::_verify_csum(OnodeRef& o,
                            const bluestore_blob_t* blob, uint64_t blob_xoffset,
                            const bufferlist& bl,
-                           uint64_t logical_offset) const
+                           uint64_t logical_offset)
 {
   int bad;
   uint64_t bad_csum;
@@ -17254,12 +17254,33 @@ int BlueStore::_merge_collection(
   return r;
 }
 
+size_t BlueStore::_trim_slow_op_event_queue(mono_clock::time_point cur_time) {
+  ceph_assert(ceph_mutex_is_locked(qlock));
+  auto warn_duration = std::chrono::seconds(cct->_conf->bluestore_slow_ops_warn_lifetime);
+  while (!slow_op_event_queue.empty() && 
+    ((slow_op_event_queue.front() < cur_time - warn_duration) ||
+      (slow_op_event_queue.size() > cct->_conf->bluestore_slow_ops_warn_threshold))) {
+      slow_op_event_queue.pop();
+  }
+  return slow_op_event_queue.size();
+}
+
+void BlueStore::_add_slow_op_event() {
+  if (!cct->_conf->bluestore_slow_ops_warn_threshold) {
+    return;
+  }
+  std::lock_guard lock(qlock);
+  auto cur_time = mono_clock::now();
+  slow_op_event_queue.push(cur_time);
+  _trim_slow_op_event_queue(cur_time);
+}
+
 void BlueStore::log_latency(
   const char* name,
   int idx,
   const ceph::timespan& l,
   double lat_threshold,
-  const char* info) const
+  const char* info)
 {
   logger->tinc(idx, l);
   if (lat_threshold > 0.0 &&
@@ -17268,6 +17289,7 @@ void BlueStore::log_latency(
       << ", latency = " << l
       << info
       << dendl;
+    _add_slow_op_event();
   }
 }
 
@@ -17276,7 +17298,7 @@ void BlueStore::log_latency_fn(
   int idx,
   const ceph::timespan& l,
   double lat_threshold,
-  std::function<string (const ceph::timespan& lat)> fn) const
+  std::function<string (const ceph::timespan& lat)> fn)
 {
   logger->tinc(idx, l);
   if (lat_threshold > 0.0 &&
@@ -17285,6 +17307,7 @@ void BlueStore::log_latency_fn(
       << ", latency = " << l
       << fn(l)
       << dendl;
+    _add_slow_op_event();
   }
 }
 
@@ -17680,7 +17703,16 @@ void BlueStore::_log_alerts(osd_alert_list_t& alerts)
   } else if (!spillover_alert.empty()){
     spillover_alert.clear();
   }
-
+  if (cct->_conf->bluestore_slow_ops_warn_threshold) {
+    size_t qsize = _trim_slow_op_event_queue(mono_clock::now());
+    if (qsize >= cct->_conf->bluestore_slow_ops_warn_threshold) {
+      ostringstream ss;
+      ss << "observed slow operation indications in BlueStore";
+      alerts.emplace("BLUESTORE_SLOW_OP_ALERT", ss.str());
+    }
+  }
+  bdev->collect_alerts(alerts, "BLOCK");
+  bluefs->collect_alerts(alerts);
   if (!spurious_read_errors_alert.empty() &&
       cct->_conf->bluestore_warn_on_spurious_read_errors) {
     alerts.emplace(
index 643b32d8a6dd8757ddecb244f49ff7dce1b68217..65dbcef81f25e5780fc5693f4ca102c7f2e18f64 100644 (file)
@@ -23,6 +23,7 @@
 #include <chrono>
 #include <ratio>
 #include <mutex>
+#include <queue>
 #include <condition_variable>
 
 #include <boost/intrusive/list.hpp>
@@ -3290,13 +3291,13 @@ public:
     int idx,
     const ceph::timespan& lat,
     double lat_threshold,
-    const char* info = "") const;
+    const char* info = "");
 
   inline void log_latency_fn(const char* name,
     int idx,
     const ceph::timespan& lat,
     double lat_threshold,
-    std::function<std::string (const ceph::timespan& lat)> fn) const;
+    std::function<std::string (const ceph::timespan& lat)> fn);
 
 private:
   bool _debug_data_eio(const ghobject_t& o) {
@@ -3330,7 +3331,10 @@ private:
   std::string no_per_pg_omap_alert;
   std::string disk_size_mismatch_alert;
   std::string spurious_read_errors_alert;
+  std::queue <ceph::mono_clock::time_point> slow_op_event_queue;
 
+  size_t _trim_slow_op_event_queue(ceph::mono_clock::time_point cur_time);
+  void _add_slow_op_event();
   void _log_alerts(osd_alert_list_t& alerts);
   bool _set_compression_alert(bool cmode, const char* s) {
     std::lock_guard l(qlock);
@@ -3367,7 +3371,7 @@ private:
     const bluestore_blob_t* blob,
     uint64_t blob_xoffset,
     const ceph::buffer::list& bl,
-    uint64_t logical_offset) const;
+    uint64_t logical_offset);
   int _decompress(ceph::buffer::list& source, ceph::buffer::list* result);