]> git.apps.os.sepia.ceph.com Git - ceph-ci.git/commitdiff
Warning added for slow operations and stalled read in BlueStore. User can control...
authorsajibreadd <sajibreadd@gmail.com>
Mon, 27 May 2024 07:30:06 +0000 (13:30 +0600)
committersajibreadd <sajibreadd@gmail.com>
Thu, 28 Nov 2024 12:05:13 +0000 (13:05 +0100)
Fixes: https://tracker.ceph.com/issues/62500
Signed-off-by: Md Mahamudur Rahaman Sajib <mahamudur.sajib@croit.io>
(cherry picked from commit 73b80a9a2c38259346fb646f85fa2ba4dcbb1329)

Conflicts:
src/common/options/global.yaml.in
src/os/bluestore/BlueStore.cc
src/os/bluestore/BlueStore.h
        doc/rados/operations/health-checks.rst

src/blk/BlockDevice.cc
src/blk/BlockDevice.h
src/blk/kernel/KernelDevice.cc
src/common/options/global.yaml.in
src/mon/PGMap.cc
src/os/bluestore/BlueFS.cc
src/os/bluestore/BlueFS.h
src/os/bluestore/BlueStore.cc
src/os/bluestore/BlueStore.h

index fd07e443c1365f470861edef5be391ebafb0936b..d878afc388e9073a1a087982ee5caed660b4507f 100644 (file)
@@ -46,6 +46,7 @@
 #define dout_prefix *_dout << "bdev "
 
 using std::string;
+using ceph::mono_clock;
 
 
 blk_access_mode_t buffermode(bool buffered) 
@@ -209,3 +210,39 @@ bool BlockDevice::is_valid_io(uint64_t off, uint64_t len) const {
   }
   return ret;
 }
+
+size_t BlockDevice::trim_stalled_read_event_queue(mono_clock::time_point cur_time) {
+  std::lock_guard lock(stalled_read_event_queue_lock);
+  auto warn_duration = std::chrono::seconds(cct->_conf->bdev_stalled_read_warn_lifetime);
+  while (!stalled_read_event_queue.empty() && 
+    ((stalled_read_event_queue.front() < cur_time - warn_duration) ||
+      (stalled_read_event_queue.size() > cct->_conf->bdev_stalled_read_warn_threshold))) {
+      stalled_read_event_queue.pop();
+  }
+  return stalled_read_event_queue.size();
+}
+
+void BlockDevice::add_stalled_read_event() {
+  if (!cct->_conf->bdev_stalled_read_warn_threshold) {
+    return;
+  }
+  auto cur_time = mono_clock::now();
+  {
+    std::lock_guard lock(stalled_read_event_queue_lock);
+    stalled_read_event_queue.push(cur_time);
+  }
+  trim_stalled_read_event_queue(cur_time);
+}
+
+void BlockDevice::collect_alerts(osd_alert_list_t& alerts, const std::string& device_name) {
+  if (cct->_conf->bdev_stalled_read_warn_threshold) {
+    size_t qsize = trim_stalled_read_event_queue(mono_clock::now());
+    if (qsize >= cct->_conf->bdev_stalled_read_warn_threshold) {
+      std::ostringstream ss;
+      ss << "observed stalled read indications in "
+        << device_name << " device";
+      alerts.emplace(device_name + "_DEVICE_STALLED_READ_ALERT", ss.str());
+    }
+  }
+}
+
index ad4ed285f60497e2703753447ec004978c8cfda6..ebb23f366615ac08ec980a6dd4b3cd4b14b3f08a 100644 (file)
 #include <set>
 #include <string>
 #include <vector>
+#include <queue>
 
 #include "acconfig.h"
 #include "common/ceph_mutex.h"
 #include "include/common_fwd.h"
 #include "extblkdev/ExtBlkDevInterface.h"
+#include "osd/osd_types.h"
 
 #if defined(HAVE_LIBAIO) || defined(HAVE_POSIXAIO)
 #include "aio/aio.h"
@@ -148,6 +150,8 @@ class BlockDevice {
 public:
   CephContext* cct;
   typedef void (*aio_callback_t)(void *handle, void *aio);
+  void collect_alerts(osd_alert_list_t& alerts, const std::string& device_name);
+
 private:
   ceph::mutex ioc_reap_lock = ceph::make_mutex("BlockDevice::ioc_reap_lock");
   std::vector<IOContext*> ioc_reap_queue;
@@ -167,12 +171,14 @@ private:
     pmem,
 #endif
   };
+  std::queue <ceph::mono_clock::time_point> stalled_read_event_queue;
+  ceph::mutex stalled_read_event_queue_lock = ceph::make_mutex("BlockDevice::stalled_read_event_queue_lock");
+  size_t trim_stalled_read_event_queue(mono_clock::time_point cur_time);
   static block_device_t detect_device_type(const std::string& path);
   static block_device_t device_type_from_name(const std::string& blk_dev_name);
   static BlockDevice *create_with_type(block_device_t device_type,
     CephContext* cct, const std::string& path, aio_callback_t cb,
     void *cbpriv, aio_callback_t d_cb, void *d_cbpriv);
-
 protected:
   uint64_t size = 0;
   uint64_t block_size = 0;
@@ -190,6 +196,7 @@ protected:
   // of the drive.  The zones 524-52155 are sequential zones.
   uint64_t conventional_region_size = 0;
   uint64_t zone_size = 0;
+  void add_stalled_read_event();
 
 public:
   aio_callback_t aio_callback;
index ad7bfc4cf94670db5fddb087e6e6886f03b21a53..d870b5691b050f1e41fecf778060d5dfea122a6f 100644 (file)
@@ -1331,6 +1331,7 @@ int KernelDevice::read(uint64_t off, uint64_t len, bufferlist *pbl,
         << " since " << start1 << ", timeout is "
         << age
         << "s" << dendl;
+    add_stalled_read_event();
   }
   if (r < 0) {
     if (ioc->allow_eio && is_expected_ioerr(-errno)) {
@@ -1404,6 +1405,7 @@ int KernelDevice::direct_read_unaligned(uint64_t off, uint64_t len, char *buf)
         << " since " << start1 << ", timeout is "
         << age
         << "s" << dendl;
+    add_stalled_read_event();
   }
 
   if (r < 0) {
@@ -1467,6 +1469,7 @@ int KernelDevice::read_random(uint64_t off, uint64_t len, char *buf,
            << " (buffered) since " << start1 << ", timeout is "
           << age
           << "s" << dendl;
+      add_stalled_read_event();
     }
   } else {
     //direct and aligned read
@@ -1477,6 +1480,7 @@ int KernelDevice::read_random(uint64_t off, uint64_t len, char *buf,
            << " (direct) since " << start1 << ", timeout is "
           << age
           << "s" << dendl;
+      add_stalled_read_event();
     }
     if (r < 0) {
       r = -errno;
index c84c2c7bdde6fc98c2507ee946cb4d6879a6e1bb..8c876897a062b8277e99796b53768ca7a95a1f99 100644 (file)
@@ -5354,6 +5354,18 @@ options:
   desc: Enable health indication when spurious read errors are observed by OSD
   default: true
   with_legacy: true
+- name: bluestore_slow_ops_warn_lifetime
+  type: uint
+  level: advanced
+  desc: A configurable duration for slow ops warning to be appeared if number of occurence pass `bluestore_slow_ops_warn_threshold` in `bluestore_slow_ops_warn_lifetime` seconds
+  default: 86400
+  with_legacy: true
+- name: bluestore_slow_ops_warn_threshold
+  type: uint
+  level: advanced
+  desc: A configurable number for slow ops warning to be appeared if number of occurence pass `bluestore_slow_ops_warn_threshold` in `bluestore_slow_ops_warn_lifetime` seconds
+  default: 1
+  with_legacy: true
 - name: bluestore_fsck_error_on_no_per_pool_omap
   type: bool
   level: advanced
@@ -6390,6 +6402,18 @@ options:
   - spdk
   - pmem
   - hm_smr
+- name: bdev_stalled_read_warn_lifetime
+  type: uint
+  level: advanced
+  desc: A configurable duration for stalled read warning to be appeared if number of stalled read occurence pass `bdev_stalled_read_warn_threshold` in `bdev_stalled_read_warn_lifetime` seconds
+  default: 86400
+  with_legacy: true
+- name: bdev_stalled_read_warn_threshold
+  type: uint
+  level: advanced
+  desc: A configurable number for stalled read warning to be appeared if number of stalled read occurence pass `bdev_stalled_read_warn_threshold` in `bdev_stalled_read_warn_lifetime` seconds
+  default: 1
+  with_legacy: true
 - name: bluestore_cleaner_sleep_interval
   type: float
   level: advanced
index 16fa71c84c069b2eecb741b9767aa083879ea7f1..138826aebc17f6c463eb0743d44c9e9bae6aac4d 100644 (file)
@@ -3238,6 +3238,14 @@ void PGMap::get_health_checks(
        summary += " reporting legacy (not per-pool) BlueStore omap usage stats";
       } else if (asum.first == "BLUESTORE_SPURIOUS_READ_ERRORS") {
         summary += " have spurious read errors";
+      } else if (asum.first == "BLUESTORE_SLOW_OP_ALERT") {
+        summary += " experiencing slow operations in BlueStore";
+      } else if (asum.first == "BLOCK_DEVICE_STALLED_READ_ALERT") {
+        summary += " experiencing stalled read in block device of BlueStore";
+      } else if (asum.first == "WAL_DEVICE_STALLED_READ_ALERT") {
+        summary += " experiencing stalled read in wal device of BlueFS";
+      } else if (asum.first == "DB_DEVICE_STALLED_READ_ALERT") {
+        summary += " experiencing stalled read in db device of BlueFS";
       }
 
       auto& d = checks->add(asum.first, HEALTH_WARN, summary, asum.second.first);
index 4dcf42f23041d08fa1306ab5f1c879a43177c450..60d07bfb51ca0f18ae26226a511291c3a1c45746 100644 (file)
@@ -4163,6 +4163,15 @@ bool BlueFS::debug_get_is_dev_dirty(FileWriter *h, uint8_t dev)
   return h->dirty_devs[dev];
 }
 
+void BlueFS::collect_alerts(osd_alert_list_t& alerts) {
+  if (bdev[BDEV_DB]) {
+    bdev[BDEV_DB]->collect_alerts(alerts, "DB");
+  }
+  if (bdev[BDEV_WAL]) {
+    bdev[BDEV_WAL]->collect_alerts(alerts, "WAL");
+  }
+}
+
 int BlueFS::open_for_read(
   std::string_view dirname,
   std::string_view filename,
index 6674190a55c5f6e6c6cfa103587bdad18dbb2a88..2d2e9b066e9e57dc9de8ad34802ba309622c834b 100644 (file)
@@ -225,6 +225,7 @@ public:
     WRITER_WAL,
     WRITER_SST,
   };
+  void collect_alerts(osd_alert_list_t& alerts);
 
   struct File : public RefCountedObject {
     MEMPOOL_CLASS_HELPERS();
@@ -427,7 +428,6 @@ public:
     FileRef file;
     explicit FileLock(FileRef f) : file(std::move(f)) {}
   };
-
 private:
   PerfCounters *logger = nullptr;
 
index 849e93ab475965bccd5babba1f46889b2e717443..beb04434f861a15f8dd2ca328581291832da91c2 100644 (file)
@@ -11300,7 +11300,7 @@ int BlueStore::_do_read(
 int BlueStore::_verify_csum(OnodeRef& o,
                            const bluestore_blob_t* blob, uint64_t blob_xoffset,
                            const bufferlist& bl,
-                           uint64_t logical_offset) const
+                           uint64_t logical_offset)
 {
   int bad;
   uint64_t bad_csum;
@@ -17346,13 +17346,34 @@ int BlueStore::_merge_collection(
   return r;
 }
 
+size_t BlueStore::_trim_slow_op_event_queue(mono_clock::time_point cur_time) {
+  ceph_assert(ceph_mutex_is_locked(qlock));
+  auto warn_duration = std::chrono::seconds(cct->_conf->bluestore_slow_ops_warn_lifetime);
+  while (!slow_op_event_queue.empty() && 
+    ((slow_op_event_queue.front() < cur_time - warn_duration) ||
+      (slow_op_event_queue.size() > cct->_conf->bluestore_slow_ops_warn_threshold))) {
+      slow_op_event_queue.pop();
+  }
+  return slow_op_event_queue.size();
+}
+
+void BlueStore::_add_slow_op_event() {
+  if (!cct->_conf->bluestore_slow_ops_warn_threshold) {
+    return;
+  }
+  std::lock_guard lock(qlock);
+  auto cur_time = mono_clock::now();
+  slow_op_event_queue.push(cur_time);
+  _trim_slow_op_event_queue(cur_time);
+}
+
 void BlueStore::log_latency(
   const char* name,
   int idx,
   const ceph::timespan& l,
   double lat_threshold,
   const char* info,
-  int idx2) const
+  int idx2)
 {
   logger->tinc(idx, l);
   if (lat_threshold > 0.0 &&
@@ -17364,6 +17385,7 @@ void BlueStore::log_latency(
     if (idx2 > l_bluestore_first && idx2 < l_bluestore_last) {
       logger->inc(idx2);
     }
+    _add_slow_op_event();
   }
 }
 
@@ -17373,7 +17395,7 @@ void BlueStore::log_latency_fn(
   const ceph::timespan& l,
   double lat_threshold,
   std::function<string (const ceph::timespan& lat)> fn,
-  int idx2) const
+  int idx2)
 {
   logger->tinc(idx, l);
   if (lat_threshold > 0.0 &&
@@ -17385,6 +17407,7 @@ void BlueStore::log_latency_fn(
     if (idx2 > l_bluestore_first && idx2 < l_bluestore_last) {
       logger->inc(idx2);
     }
+    _add_slow_op_event();
   }
 }
 
@@ -17780,7 +17803,16 @@ void BlueStore::_log_alerts(osd_alert_list_t& alerts)
   } else if (!spillover_alert.empty()){
     spillover_alert.clear();
   }
-
+  if (cct->_conf->bluestore_slow_ops_warn_threshold) {
+    size_t qsize = _trim_slow_op_event_queue(mono_clock::now());
+    if (qsize >= cct->_conf->bluestore_slow_ops_warn_threshold) {
+      ostringstream ss;
+      ss << "observed slow operation indications in BlueStore";
+      alerts.emplace("BLUESTORE_SLOW_OP_ALERT", ss.str());
+    }
+  }
+  bdev->collect_alerts(alerts, "BLOCK");
+  bluefs->collect_alerts(alerts);
   if (!spurious_read_errors_alert.empty() &&
       cct->_conf->bluestore_warn_on_spurious_read_errors) {
     alerts.emplace(
index bacfcba84ec2dc95c9ab05137e094ba3f46ba04a..7f20f3eaefc232ae27148909d24fa37d499410b3 100644 (file)
@@ -24,6 +24,7 @@
 #include <chrono>
 #include <ratio>
 #include <mutex>
+#include <queue>
 #include <condition_variable>
 
 #include <boost/intrusive/list.hpp>
@@ -3317,14 +3318,14 @@ public:
     const ceph::timespan& lat,
     double lat_threshold,
     const char* info = "",
-    int idx2 = l_bluestore_first) const;
+    int idx2 = l_bluestore_first);
 
   inline void log_latency_fn(const char* name,
     int idx,
     const ceph::timespan& lat,
     double lat_threshold,
     std::function<std::string (const ceph::timespan& lat)> fn,
-    int idx2 = l_bluestore_first) const;
+    int idx2 = l_bluestore_first);
 
 private:
   bool _debug_data_eio(const ghobject_t& o) {
@@ -3358,7 +3359,10 @@ private:
   std::string no_per_pg_omap_alert;
   std::string disk_size_mismatch_alert;
   std::string spurious_read_errors_alert;
+  std::queue <ceph::mono_clock::time_point> slow_op_event_queue;
 
+  size_t _trim_slow_op_event_queue(ceph::mono_clock::time_point cur_time);
+  void _add_slow_op_event();
   void _log_alerts(osd_alert_list_t& alerts);
   bool _set_compression_alert(bool cmode, const char* s) {
     std::lock_guard l(qlock);
@@ -3395,7 +3399,7 @@ private:
     const bluestore_blob_t* blob,
     uint64_t blob_xoffset,
     const ceph::buffer::list& bl,
-    uint64_t logical_offset) const;
+    uint64_t logical_offset);
   int _decompress(ceph::buffer::list& source, ceph::buffer::list* result);