]> git.apps.os.sepia.ceph.com Git - ceph-ci.git/commitdiff
Warning added for slow operations and stalled read in BlueStore. User can control...
authorsajibreadd <sajibreadd@gmail.com>
Mon, 27 May 2024 07:30:06 +0000 (13:30 +0600)
committersajibreadd <sajibreadd@gmail.com>
Wed, 26 Jun 2024 08:47:03 +0000 (14:47 +0600)
Fixes: https://tracker.ceph.com/issues/62500
Signed-off-by: Md Mahamudur Rahaman Sajib <mahamudur.sajib@croit.io>
doc/rados/operations/health-checks.rst
src/blk/BlockDevice.cc
src/blk/BlockDevice.h
src/blk/kernel/KernelDevice.cc
src/common/options/global.yaml.in
src/mon/PGMap.cc
src/os/bluestore/BlueFS.cc
src/os/bluestore/BlueFS.h
src/os/bluestore/BlueStore.cc
src/os/bluestore/BlueStore.h

index 6f3ca281bbfa69af6847047a0cfb32ec9f7fba8c..fd8b8b00803a5a37a4d50a05cb96c8c0d2b214a6 100644 (file)
@@ -801,6 +801,98 @@ Or, to disable this alert on a specific OSD, run the following command:
 
    ceph config set osd.123 bluestore_warn_on_spurious_read_errors false
 
+BLOCK_DEVICE_STALLED_READ_ALERT
+_______________________________
+
+There are certain BlueStore log messages that surface storage drive issues 
+that can lead to performance degradation and can cause bad disk.
+
+``read stalled read 0x29f40370000~100000 (buffered) since 63410177.290546s, timeout is 5.000000s``
+
+However, this is difficult to spot as there's no discernible warning (a
+health warning or info in ``ceph health detail`` for example). More observations
+can be found here: https://tracker.ceph.com/issues/62500
+
+As there can be false positive ``stalled read`` instances, a mechanism
+has been added for more reliability. If in last ``bdev_stalled_read_warn_lifetime``
+duration ``stalled read`` indications are found more than or equal to
+``bdev_stalled_read_warn_threshold`` for a given BlueStore block device, this
+warning will be reported in ``ceph health detail``.
+
+By default value of ``bdev_stalled_read_warn_lifetime = 86400s`` and
+``bdev_stalled_read_warn_threshold = 1``. But user can configure it for
+individual OSDs.
+
+To change this, run the following command:
+
+.. prompt:: bash $
+
+   ceph config set global bdev_stalled_read_warn_lifetime 10
+   ceph config set global bdev_stalled_read_warn_threshold 5
+
+this may be done surgically for individual OSDs or a given mask
+
+.. prompt:: bash $
+
+   ceph config set osd.123 bdev_stalled_read_warn_lifetime 10
+   ceph config set osd.123 bdev_stalled_read_warn_threshold 5
+   ceph config set class:ssd bdev_stalled_read_warn_lifetime 10
+   ceph config set class:ssd bdev_stalled_read_warn_threshold 5
+
+WAL_DEVICE_STALLED_READ_ALERT
+_____________________________
+
+A similar warning like ``BLOCK_DEVICE_STALLED_READ_ALERT`` will be raised to
+identify ``stalled read`` instances on a given BlueStore OSD's ``WAL_DEVICE``.
+This warning can be configured via ``bdev_stalled_read_warn_lifetime`` and
+``bdev_stalled_read_warn_threshold`` parameters similarly described in the
+``BLOCK_DEVICE_STALLED_READ_ALERT`` warning section.
+
+DB_DEVICE_STALLED_READ_ALERT
+____________________________
+
+A similar warning like ``BLOCK_DEVICE_STALLED_READ_ALERT`` will be raised to
+identify ``stalled read`` instances on a given BlueStore OSD's ``WAL_DEVICE``.
+This warning can be configured via ``bdev_stalled_read_warn_lifetime`` and
+``bdev_stalled_read_warn_threshold`` parameters similarly described in the
+``BLOCK_DEVICE_STALLED_READ_ALERT`` warning section.
+
+BLUESTORE_SLOW_OP_ALERT
+_______________________
+
+There are certain BlueStore log messages that surface storage drive issues 
+that can lead to performance degradation and can cause bad disk.
+
+``log_latency_fn slow operation observed for _txc_committed_kv, latency = 12.028621219s, txc = 0x55a107c30f00``
+``log_latency_fn slow operation observed for upper_bound, latency = 6.25955s``
+``log_latency slow operation observed for submit_transaction..``
+
+As there can be false positive ``slow ops`` instances, a mechanism has
+been added for more reliability. If in last ``bluestore_slow_ops_warn_lifetime``
+duration ``slow ops`` indications are found more than or equal to
+``bluestore_slow_ops_warn_threshold`` for a given BlueStore OSD, this warning
+will be reported in ``ceph health detail``.
+
+By default value of ``bluestore_slow_ops_warn_lifetime = 86400s`` and
+``bluestore_slow_ops_warn_threshold = 1``. But user can configure it for
+individual OSDs.
+
+To change this, run the following command:
+
+.. prompt:: bash $
+
+   ceph config set global bluestore_slow_ops_warn_lifetime 10
+   ceph config set global bluestore_slow_ops_warn_threshold 5
+
+this may be done surgically for individual OSDs or a given mask
+
+.. prompt:: bash $
+
+   ceph config set osd.123 bluestore_slow_ops_warn_lifetime 10
+   ceph config set osd.123 bluestore_slow_ops_warn_threshold 5
+   ceph config set class:ssd bluestore_slow_ops_warn_lifetime 10
+   ceph config set class:ssd bluestore_slow_ops_warn_threshold 5
+
 Device health
 -------------
 
index 33ceacc8909eb2fe13fe91a2166f40834363317a..7510fb89c8b7c92c6a9153561a968db6ac4e4fe1 100644 (file)
@@ -42,6 +42,7 @@
 #define dout_prefix *_dout << "bdev "
 
 using std::string;
+using ceph::mono_clock;
 
 
 blk_access_mode_t buffermode(bool buffered) 
@@ -191,3 +192,39 @@ bool BlockDevice::is_valid_io(uint64_t off, uint64_t len) const {
   }
   return ret;
 }
+
+size_t BlockDevice::trim_stalled_read_event_queue(mono_clock::time_point cur_time) {
+  std::lock_guard lock(stalled_read_event_queue_lock);
+  auto warn_duration = std::chrono::seconds(cct->_conf->bdev_stalled_read_warn_lifetime);
+  while (!stalled_read_event_queue.empty() && 
+    ((stalled_read_event_queue.front() < cur_time - warn_duration) ||
+      (stalled_read_event_queue.size() > cct->_conf->bdev_stalled_read_warn_threshold))) {
+      stalled_read_event_queue.pop();
+  }
+  return stalled_read_event_queue.size();
+}
+
+void BlockDevice::add_stalled_read_event() {
+  if (!cct->_conf->bdev_stalled_read_warn_threshold) {
+    return;
+  }
+  auto cur_time = mono_clock::now();
+  {
+    std::lock_guard lock(stalled_read_event_queue_lock);
+    stalled_read_event_queue.push(cur_time);
+  }
+  trim_stalled_read_event_queue(cur_time);
+}
+
+void BlockDevice::collect_alerts(osd_alert_list_t& alerts, const std::string& device_name) {
+  if (cct->_conf->bdev_stalled_read_warn_threshold) {
+    size_t qsize = trim_stalled_read_event_queue(mono_clock::now());
+    if (qsize >= cct->_conf->bdev_stalled_read_warn_threshold) {
+      std::ostringstream ss;
+      ss << "observed stalled read indications in "
+        << device_name << " device";
+      alerts.emplace(device_name + "_DEVICE_STALLED_READ_ALERT", ss.str());
+    }
+  }
+}
+
index 6c55646fc76df8d5594bbb66d1a7729e9a0966c5..115f7e6c1b5387d1e9f24ba39a54af18ba708a2f 100644 (file)
 #include <set>
 #include <string>
 #include <vector>
+#include <queue>
 
 #include "acconfig.h"
 #include "common/ceph_mutex.h"
 #include "include/common_fwd.h"
 #include "extblkdev/ExtBlkDevInterface.h"
+#include "osd/osd_types.h"
 
 #if defined(HAVE_LIBAIO) || defined(HAVE_POSIXAIO)
 #include "aio/aio.h"
@@ -148,6 +150,8 @@ class BlockDevice {
 public:
   CephContext* cct;
   typedef void (*aio_callback_t)(void *handle, void *aio);
+  void collect_alerts(osd_alert_list_t& alerts, const std::string& device_name);
+
 private:
   ceph::mutex ioc_reap_lock = ceph::make_mutex("BlockDevice::ioc_reap_lock");
   std::vector<IOContext*> ioc_reap_queue;
@@ -164,12 +168,14 @@ private:
     pmem,
 #endif
   };
+  std::queue <ceph::mono_clock::time_point> stalled_read_event_queue;
+  ceph::mutex stalled_read_event_queue_lock = ceph::make_mutex("BlockDevice::stalled_read_event_queue_lock");
+  size_t trim_stalled_read_event_queue(mono_clock::time_point cur_time);
   static block_device_t detect_device_type(const std::string& path);
   static block_device_t device_type_from_name(const std::string& blk_dev_name);
   static BlockDevice *create_with_type(block_device_t device_type,
     CephContext* cct, const std::string& path, aio_callback_t cb,
     void *cbpriv, aio_callback_t d_cb, void *d_cbpriv);
-
 protected:
   uint64_t size = 0;
   uint64_t block_size = 0;
@@ -187,6 +193,7 @@ protected:
   // of the drive.  The zones 524-52155 are sequential zones.
   uint64_t conventional_region_size = 0;
   uint64_t zone_size = 0;
+  void add_stalled_read_event();
 
 public:
   aio_callback_t aio_callback;
index 6337292f5dec25e95766e5db79b1e9cc9dcff8ad..f6edaf98e8304b673481137bd03a434cbbc6f945 100644 (file)
@@ -1310,6 +1310,7 @@ int KernelDevice::read(uint64_t off, uint64_t len, bufferlist *pbl,
         << " since " << start1 << ", timeout is "
         << age
         << "s" << dendl;
+    add_stalled_read_event();
   }
   if (r < 0) {
     if (ioc->allow_eio && is_expected_ioerr(-errno)) {
@@ -1383,6 +1384,7 @@ int KernelDevice::direct_read_unaligned(uint64_t off, uint64_t len, char *buf)
         << " since " << start1 << ", timeout is "
         << age
         << "s" << dendl;
+    add_stalled_read_event();
   }
 
   if (r < 0) {
@@ -1446,6 +1448,7 @@ int KernelDevice::read_random(uint64_t off, uint64_t len, char *buf,
            << " (buffered) since " << start1 << ", timeout is "
           << age
           << "s" << dendl;
+      add_stalled_read_event();
     }
   } else {
     //direct and aligned read
@@ -1456,6 +1459,7 @@ int KernelDevice::read_random(uint64_t off, uint64_t len, char *buf,
            << " (direct) since " << start1 << ", timeout is "
           << age
           << "s" << dendl;
+      add_stalled_read_event();
     }
     if (r < 0) {
       r = -errno;
index 7366bbb31c645b68caf500ed0b9bb17cf6a6c1d6..716276648dcf2c9eb0697a0843cd8c193d135dd2 100644 (file)
@@ -5365,6 +5365,18 @@ options:
   desc: Enable health indication when spurious read errors are observed by OSD
   default: true
   with_legacy: true
+- name: bluestore_slow_ops_warn_lifetime
+  type: uint
+  level: advanced
+  desc: A configurable duration for slow ops warning to be appeared if number of occurence pass `bluestore_slow_ops_warn_threshold` in `bluestore_slow_ops_warn_lifetime` seconds
+  default: 86400
+  with_legacy: true
+- name: bluestore_slow_ops_warn_threshold
+  type: uint
+  level: advanced
+  desc: A configurable number for slow ops warning to be appeared if number of occurence pass `bluestore_slow_ops_warn_threshold` in `bluestore_slow_ops_warn_lifetime` seconds
+  default: 1
+  with_legacy: true
 - name: bluestore_fsck_error_on_no_per_pool_omap
   type: bool
   level: advanced
@@ -6442,6 +6454,18 @@ options:
   - aio
   - spdk
   - pmem
+- name: bdev_stalled_read_warn_lifetime
+  type: uint
+  level: advanced
+  desc: A configurable duration for stalled read warning to be appeared if number of stalled read occurence pass `bdev_stalled_read_warn_threshold` in `bdev_stalled_read_warn_lifetime` seconds
+  default: 86400
+  with_legacy: true
+- name: bdev_stalled_read_warn_threshold
+  type: uint
+  level: advanced
+  desc: A configurable number for stalled read warning to be appeared if number of stalled read occurence pass `bdev_stalled_read_warn_threshold` in `bdev_stalled_read_warn_lifetime` seconds
+  default: 1
+  with_legacy: true
 - name: bluestore_cleaner_sleep_interval
   type: float
   level: advanced
index 16fa71c84c069b2eecb741b9767aa083879ea7f1..138826aebc17f6c463eb0743d44c9e9bae6aac4d 100644 (file)
@@ -3238,6 +3238,14 @@ void PGMap::get_health_checks(
        summary += " reporting legacy (not per-pool) BlueStore omap usage stats";
       } else if (asum.first == "BLUESTORE_SPURIOUS_READ_ERRORS") {
         summary += " have spurious read errors";
+      } else if (asum.first == "BLUESTORE_SLOW_OP_ALERT") {
+        summary += " experiencing slow operations in BlueStore";
+      } else if (asum.first == "BLOCK_DEVICE_STALLED_READ_ALERT") {
+        summary += " experiencing stalled read in block device of BlueStore";
+      } else if (asum.first == "WAL_DEVICE_STALLED_READ_ALERT") {
+        summary += " experiencing stalled read in wal device of BlueFS";
+      } else if (asum.first == "DB_DEVICE_STALLED_READ_ALERT") {
+        summary += " experiencing stalled read in db device of BlueFS";
       }
 
       auto& d = checks->add(asum.first, HEALTH_WARN, summary, asum.second.first);
index 9f419c195851ac2963d5ff306dbf4412ea512f6f..83fcb8bc02d2bffa02d6ca1c24f93dbdcc78e165 100644 (file)
@@ -4214,6 +4214,15 @@ bool BlueFS::debug_get_is_dev_dirty(FileWriter *h, uint8_t dev)
   return h->dirty_devs[dev];
 }
 
+void BlueFS::collect_alerts(osd_alert_list_t& alerts) {
+  if (bdev[BDEV_DB]) {
+    bdev[BDEV_DB]->collect_alerts(alerts, "DB");
+  }
+  if (bdev[BDEV_WAL]) {
+    bdev[BDEV_WAL]->collect_alerts(alerts, "WAL");
+  }
+}
+
 int BlueFS::open_for_read(
   std::string_view dirname,
   std::string_view filename,
index 86fb3cc882e1e4c2c5cd9d98d0d98e246734ea7a..4a11494771edfea432626d4903b7d454ca97b819 100644 (file)
@@ -231,6 +231,7 @@ public:
     WRITER_WAL,
     WRITER_SST,
   };
+  void collect_alerts(osd_alert_list_t& alerts);
 
   struct File : public RefCountedObject {
     MEMPOOL_CLASS_HELPERS();
@@ -433,7 +434,6 @@ public:
     FileRef file;
     explicit FileLock(FileRef f) : file(std::move(f)) {}
   };
-
 private:
   PerfCounters *logger = nullptr;
 
index 2f6cf3efb899bed8f04ecc9f95043fe34e0589aa..14bce505a6d44243a4df9a9f00ece671a265a9db 100644 (file)
@@ -12185,7 +12185,7 @@ int BlueStore::_do_read(
 int BlueStore::_verify_csum(OnodeRef& o,
                            const bluestore_blob_t* blob, uint64_t blob_xoffset,
                            const bufferlist& bl,
-                           uint64_t logical_offset) const
+                           uint64_t logical_offset)
 {
   int bad;
   uint64_t bad_csum;
@@ -17865,13 +17865,34 @@ int BlueStore::_merge_collection(
   return r;
 }
 
+size_t BlueStore::_trim_slow_op_event_queue(mono_clock::time_point cur_time) {
+  ceph_assert(ceph_mutex_is_locked(qlock));
+  auto warn_duration = std::chrono::seconds(cct->_conf->bluestore_slow_ops_warn_lifetime);
+  while (!slow_op_event_queue.empty() && 
+    ((slow_op_event_queue.front() < cur_time - warn_duration) ||
+      (slow_op_event_queue.size() > cct->_conf->bluestore_slow_ops_warn_threshold))) {
+      slow_op_event_queue.pop();
+  }
+  return slow_op_event_queue.size();
+}
+
+void BlueStore::_add_slow_op_event() {
+  if (!cct->_conf->bluestore_slow_ops_warn_threshold) {
+    return;
+  }
+  std::lock_guard lock(qlock);
+  auto cur_time = mono_clock::now();
+  slow_op_event_queue.push(cur_time);
+  _trim_slow_op_event_queue(cur_time);
+}
+
 void BlueStore::log_latency(
   const char* name,
   int idx,
   const ceph::timespan& l,
   double lat_threshold,
   const char* info,
-  int idx2) const
+  int idx2)
 {
   logger->tinc(idx, l);
   if (lat_threshold > 0.0 &&
@@ -17880,6 +17901,7 @@ void BlueStore::log_latency(
       << ", latency = " << l
       << info
       << dendl;
+    _add_slow_op_event();
     if (idx2 > l_bluestore_first && idx2 < l_bluestore_last) {
       logger->inc(idx2);
     }
@@ -17892,7 +17914,7 @@ void BlueStore::log_latency_fn(
   const ceph::timespan& l,
   double lat_threshold,
   std::function<string (const ceph::timespan& lat)> fn,
-  int idx2) const
+  int idx2)
 {
   logger->tinc(idx, l);
   if (lat_threshold > 0.0 &&
@@ -17901,6 +17923,7 @@ void BlueStore::log_latency_fn(
       << ", latency = " << l
       << fn(l)
       << dendl;
+    _add_slow_op_event();
     if (idx2 > l_bluestore_first && idx2 < l_bluestore_last) {
       logger->inc(idx2);
     }
@@ -18299,7 +18322,16 @@ void BlueStore::_log_alerts(osd_alert_list_t& alerts)
   } else if (!spillover_alert.empty()){
     spillover_alert.clear();
   }
-
+  if (cct->_conf->bluestore_slow_ops_warn_threshold) {
+    size_t qsize = _trim_slow_op_event_queue(mono_clock::now());
+    if (qsize >= cct->_conf->bluestore_slow_ops_warn_threshold) {
+      ostringstream ss;
+      ss << "observed slow operation indications in BlueStore";
+      alerts.emplace("BLUESTORE_SLOW_OP_ALERT", ss.str());
+    }
+  }
+  bdev->collect_alerts(alerts, "BLOCK");
+  bluefs->collect_alerts(alerts);
   if (!spurious_read_errors_alert.empty() &&
       cct->_conf->bluestore_warn_on_spurious_read_errors) {
     alerts.emplace(
index a9b510e162e00f285bdbe2e788094ead18200a29..f75da4dc7be41aa7a015400de73665ef31aab2ff 100644 (file)
@@ -25,6 +25,7 @@
 #include <chrono>
 #include <ratio>
 #include <mutex>
+#include <queue>
 #include <condition_variable>
 
 #include <boost/intrusive/list.hpp>
@@ -3428,14 +3429,14 @@ public:
     const ceph::timespan& lat,
     double lat_threshold,
     const char* info = "",
-    int idx2 = l_bluestore_first) const;
+    int idx2 = l_bluestore_first);
 
   inline void log_latency_fn(const char* name,
     int idx,
     const ceph::timespan& lat,
     double lat_threshold,
     std::function<std::string (const ceph::timespan& lat)> fn,
-    int idx2 = l_bluestore_first) const;
+    int idx2 = l_bluestore_first);
 
 private:
   bool _debug_data_eio(const ghobject_t& o) {
@@ -3469,7 +3470,10 @@ private:
   std::string no_per_pg_omap_alert;
   std::string disk_size_mismatch_alert;
   std::string spurious_read_errors_alert;
+  std::queue <ceph::mono_clock::time_point> slow_op_event_queue;
 
+  size_t _trim_slow_op_event_queue(ceph::mono_clock::time_point cur_time);
+  void _add_slow_op_event();
   void _log_alerts(osd_alert_list_t& alerts);
   bool _set_compression_alert(bool cmode, const char* s) {
     std::lock_guard l(qlock);
@@ -3506,7 +3510,7 @@ private:
     const bluestore_blob_t* blob,
     uint64_t blob_xoffset,
     const ceph::buffer::list& bl,
-    uint64_t logical_offset) const;
+    uint64_t logical_offset);
   int _decompress(ceph::buffer::list& source, ceph::buffer::list* result);