From e7866873abf58733efb6c455328dcd15a409fc64 Mon Sep 17 00:00:00 2001 From: sajibreadd Date: Mon, 27 May 2024 13:30:06 +0600 Subject: [PATCH] Warning added for slow operations and stalled read in BlueStore. User can control how much time the warning should persist after last occurence and maximum number of operations as a threshold will be considered for the warning. Fixes: https://tracker.ceph.com/issues/62500 Signed-off-by: Md Mahamudur Rahaman Sajib (cherry picked from commit 73b80a9a2c38259346fb646f85fa2ba4dcbb1329) Conflicts: src/blk/BlockDevice.h src/common/options/global.yaml.in src/os/bluestore/BlueStore.cc src/os/bluestore/BlueStore.h doc/rados/operations/health-checks.rst --- src/blk/BlockDevice.cc | 37 ++++++++++++++++++++++++++++ src/blk/BlockDevice.h | 9 ++++++- src/blk/kernel/KernelDevice.cc | 4 ++++ src/common/options/global.yaml.in | 24 +++++++++++++++++++ src/mon/PGMap.cc | 8 +++++++ src/os/bluestore/BlueFS.cc | 9 +++++++ src/os/bluestore/BlueFS.h | 2 +- src/os/bluestore/BlueStore.cc | 40 +++++++++++++++++++++++++++---- src/os/bluestore/BlueStore.h | 10 +++++--- 9 files changed, 134 insertions(+), 9 deletions(-) diff --git a/src/blk/BlockDevice.cc b/src/blk/BlockDevice.cc index fd07e443c1365..d878afc388e90 100644 --- a/src/blk/BlockDevice.cc +++ b/src/blk/BlockDevice.cc @@ -46,6 +46,7 @@ #define dout_prefix *_dout << "bdev " using std::string; +using ceph::mono_clock; blk_access_mode_t buffermode(bool buffered) @@ -209,3 +210,39 @@ bool BlockDevice::is_valid_io(uint64_t off, uint64_t len) const { } return ret; } + +size_t BlockDevice::trim_stalled_read_event_queue(mono_clock::time_point cur_time) { + std::lock_guard lock(stalled_read_event_queue_lock); + auto warn_duration = std::chrono::seconds(cct->_conf->bdev_stalled_read_warn_lifetime); + while (!stalled_read_event_queue.empty() && + ((stalled_read_event_queue.front() < cur_time - warn_duration) || + (stalled_read_event_queue.size() > cct->_conf->bdev_stalled_read_warn_threshold))) { + stalled_read_event_queue.pop(); + } + return stalled_read_event_queue.size(); +} + +void BlockDevice::add_stalled_read_event() { + if (!cct->_conf->bdev_stalled_read_warn_threshold) { + return; + } + auto cur_time = mono_clock::now(); + { + std::lock_guard lock(stalled_read_event_queue_lock); + stalled_read_event_queue.push(cur_time); + } + trim_stalled_read_event_queue(cur_time); +} + +void BlockDevice::collect_alerts(osd_alert_list_t& alerts, const std::string& device_name) { + if (cct->_conf->bdev_stalled_read_warn_threshold) { + size_t qsize = trim_stalled_read_event_queue(mono_clock::now()); + if (qsize >= cct->_conf->bdev_stalled_read_warn_threshold) { + std::ostringstream ss; + ss << "observed stalled read indications in " + << device_name << " device"; + alerts.emplace(device_name + "_DEVICE_STALLED_READ_ALERT", ss.str()); + } + } +} + diff --git a/src/blk/BlockDevice.h b/src/blk/BlockDevice.h index 67b4b330b6c45..295dfd1e7ee41 100644 --- a/src/blk/BlockDevice.h +++ b/src/blk/BlockDevice.h @@ -25,10 +25,12 @@ #include #include #include +#include #include "acconfig.h" #include "common/ceph_mutex.h" #include "include/common_fwd.h" +#include "osd/osd_types.h" #if defined(HAVE_LIBAIO) || defined(HAVE_POSIXAIO) #include "aio/aio.h" @@ -147,6 +149,8 @@ class BlockDevice { public: CephContext* cct; typedef void (*aio_callback_t)(void *handle, void *aio); + void collect_alerts(osd_alert_list_t& alerts, const std::string& device_name); + private: ceph::mutex ioc_reap_lock = ceph::make_mutex("BlockDevice::ioc_reap_lock"); std::vector ioc_reap_queue; @@ -166,12 +170,14 @@ private: pmem, #endif }; + std::queue stalled_read_event_queue; + ceph::mutex stalled_read_event_queue_lock = ceph::make_mutex("BlockDevice::stalled_read_event_queue_lock"); + size_t trim_stalled_read_event_queue(mono_clock::time_point cur_time); static block_device_t detect_device_type(const std::string& path); static block_device_t device_type_from_name(const std::string& blk_dev_name); static BlockDevice *create_with_type(block_device_t device_type, CephContext* cct, const std::string& path, aio_callback_t cb, void *cbpriv, aio_callback_t d_cb, void *d_cbpriv); - protected: uint64_t size = 0; uint64_t block_size = 0; @@ -189,6 +195,7 @@ protected: // of the drive. The zones 524-52155 are sequential zones. uint64_t conventional_region_size = 0; uint64_t zone_size = 0; + void add_stalled_read_event(); public: aio_callback_t aio_callback; diff --git a/src/blk/kernel/KernelDevice.cc b/src/blk/kernel/KernelDevice.cc index b5830b51d6c21..f1bca417a6871 100644 --- a/src/blk/kernel/KernelDevice.cc +++ b/src/blk/kernel/KernelDevice.cc @@ -1262,6 +1262,7 @@ int KernelDevice::read(uint64_t off, uint64_t len, bufferlist *pbl, << " since " << start1 << ", timeout is " << age << "s" << dendl; + add_stalled_read_event(); } if (r < 0) { if (ioc->allow_eio && is_expected_ioerr(-errno)) { @@ -1335,6 +1336,7 @@ int KernelDevice::direct_read_unaligned(uint64_t off, uint64_t len, char *buf) << " since " << start1 << ", timeout is " << age << "s" << dendl; + add_stalled_read_event(); } if (r < 0) { @@ -1398,6 +1400,7 @@ int KernelDevice::read_random(uint64_t off, uint64_t len, char *buf, << " (buffered) since " << start1 << ", timeout is " << age << "s" << dendl; + add_stalled_read_event(); } } else { //direct and aligned read @@ -1408,6 +1411,7 @@ int KernelDevice::read_random(uint64_t off, uint64_t len, char *buf, << " (direct) since " << start1 << ", timeout is " << age << "s" << dendl; + add_stalled_read_event(); } if (r < 0) { r = -errno; diff --git a/src/common/options/global.yaml.in b/src/common/options/global.yaml.in index 3999a1a625fcc..a8a00325bde38 100644 --- a/src/common/options/global.yaml.in +++ b/src/common/options/global.yaml.in @@ -5310,6 +5310,18 @@ options: desc: Enable health indication when spurious read errors are observed by OSD default: true with_legacy: true +- name: bluestore_slow_ops_warn_lifetime + type: uint + level: advanced + desc: A configurable duration for slow ops warning to be appeared if number of occurence pass `bluestore_slow_ops_warn_threshold` in `bluestore_slow_ops_warn_lifetime` seconds + default: 86400 + with_legacy: true +- name: bluestore_slow_ops_warn_threshold + type: uint + level: advanced + desc: A configurable number for slow ops warning to be appeared if number of occurence pass `bluestore_slow_ops_warn_threshold` in `bluestore_slow_ops_warn_lifetime` seconds + default: 1 + with_legacy: true - name: bluestore_fsck_error_on_no_per_pool_omap type: bool level: advanced @@ -6345,6 +6357,18 @@ options: - spdk - pmem - hm_smr +- name: bdev_stalled_read_warn_lifetime + type: uint + level: advanced + desc: A configurable duration for stalled read warning to be appeared if number of stalled read occurence pass `bdev_stalled_read_warn_threshold` in `bdev_stalled_read_warn_lifetime` seconds + default: 86400 + with_legacy: true +- name: bdev_stalled_read_warn_threshold + type: uint + level: advanced + desc: A configurable number for stalled read warning to be appeared if number of stalled read occurence pass `bdev_stalled_read_warn_threshold` in `bdev_stalled_read_warn_lifetime` seconds + default: 1 + with_legacy: true - name: bluestore_cleaner_sleep_interval type: float level: advanced diff --git a/src/mon/PGMap.cc b/src/mon/PGMap.cc index 326d589b705ad..7ccafa3845200 100644 --- a/src/mon/PGMap.cc +++ b/src/mon/PGMap.cc @@ -3238,6 +3238,14 @@ void PGMap::get_health_checks( summary += " reporting legacy (not per-pool) BlueStore omap usage stats"; } else if (asum.first == "BLUESTORE_SPURIOUS_READ_ERRORS") { summary += " have spurious read errors"; + } else if (asum.first == "BLUESTORE_SLOW_OP_ALERT") { + summary += " experiencing slow operations in BlueStore"; + } else if (asum.first == "BLOCK_DEVICE_STALLED_READ_ALERT") { + summary += " experiencing stalled read in block device of BlueStore"; + } else if (asum.first == "WAL_DEVICE_STALLED_READ_ALERT") { + summary += " experiencing stalled read in wal device of BlueFS"; + } else if (asum.first == "DB_DEVICE_STALLED_READ_ALERT") { + summary += " experiencing stalled read in db device of BlueFS"; } auto& d = checks->add(asum.first, HEALTH_WARN, summary, asum.second.first); diff --git a/src/os/bluestore/BlueFS.cc b/src/os/bluestore/BlueFS.cc index 04f7adfc71ad7..4c6a2042ab882 100644 --- a/src/os/bluestore/BlueFS.cc +++ b/src/os/bluestore/BlueFS.cc @@ -4112,6 +4112,15 @@ bool BlueFS::debug_get_is_dev_dirty(FileWriter *h, uint8_t dev) return h->dirty_devs[dev]; } +void BlueFS::collect_alerts(osd_alert_list_t& alerts) { + if (bdev[BDEV_DB]) { + bdev[BDEV_DB]->collect_alerts(alerts, "DB"); + } + if (bdev[BDEV_WAL]) { + bdev[BDEV_WAL]->collect_alerts(alerts, "WAL"); + } +} + int BlueFS::open_for_read( std::string_view dirname, std::string_view filename, diff --git a/src/os/bluestore/BlueFS.h b/src/os/bluestore/BlueFS.h index 9c08bc377ffc0..615ea49143e98 100644 --- a/src/os/bluestore/BlueFS.h +++ b/src/os/bluestore/BlueFS.h @@ -132,6 +132,7 @@ public: WRITER_WAL, WRITER_SST, }; + void collect_alerts(osd_alert_list_t& alerts); struct File : public RefCountedObject { MEMPOOL_CLASS_HELPERS(); @@ -334,7 +335,6 @@ public: FileRef file; explicit FileLock(FileRef f) : file(std::move(f)) {} }; - private: PerfCounters *logger = nullptr; diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc index 3ca91ed41d5b0..fd0c97455031d 100644 --- a/src/os/bluestore/BlueStore.cc +++ b/src/os/bluestore/BlueStore.cc @@ -11230,7 +11230,7 @@ int BlueStore::_do_read( int BlueStore::_verify_csum(OnodeRef& o, const bluestore_blob_t* blob, uint64_t blob_xoffset, const bufferlist& bl, - uint64_t logical_offset) const + uint64_t logical_offset) { int bad; uint64_t bad_csum; @@ -17254,12 +17254,33 @@ int BlueStore::_merge_collection( return r; } +size_t BlueStore::_trim_slow_op_event_queue(mono_clock::time_point cur_time) { + ceph_assert(ceph_mutex_is_locked(qlock)); + auto warn_duration = std::chrono::seconds(cct->_conf->bluestore_slow_ops_warn_lifetime); + while (!slow_op_event_queue.empty() && + ((slow_op_event_queue.front() < cur_time - warn_duration) || + (slow_op_event_queue.size() > cct->_conf->bluestore_slow_ops_warn_threshold))) { + slow_op_event_queue.pop(); + } + return slow_op_event_queue.size(); +} + +void BlueStore::_add_slow_op_event() { + if (!cct->_conf->bluestore_slow_ops_warn_threshold) { + return; + } + std::lock_guard lock(qlock); + auto cur_time = mono_clock::now(); + slow_op_event_queue.push(cur_time); + _trim_slow_op_event_queue(cur_time); +} + void BlueStore::log_latency( const char* name, int idx, const ceph::timespan& l, double lat_threshold, - const char* info) const + const char* info) { logger->tinc(idx, l); if (lat_threshold > 0.0 && @@ -17268,6 +17289,7 @@ void BlueStore::log_latency( << ", latency = " << l << info << dendl; + _add_slow_op_event(); } } @@ -17276,7 +17298,7 @@ void BlueStore::log_latency_fn( int idx, const ceph::timespan& l, double lat_threshold, - std::function fn) const + std::function fn) { logger->tinc(idx, l); if (lat_threshold > 0.0 && @@ -17285,6 +17307,7 @@ void BlueStore::log_latency_fn( << ", latency = " << l << fn(l) << dendl; + _add_slow_op_event(); } } @@ -17680,7 +17703,16 @@ void BlueStore::_log_alerts(osd_alert_list_t& alerts) } else if (!spillover_alert.empty()){ spillover_alert.clear(); } - + if (cct->_conf->bluestore_slow_ops_warn_threshold) { + size_t qsize = _trim_slow_op_event_queue(mono_clock::now()); + if (qsize >= cct->_conf->bluestore_slow_ops_warn_threshold) { + ostringstream ss; + ss << "observed slow operation indications in BlueStore"; + alerts.emplace("BLUESTORE_SLOW_OP_ALERT", ss.str()); + } + } + bdev->collect_alerts(alerts, "BLOCK"); + bluefs->collect_alerts(alerts); if (!spurious_read_errors_alert.empty() && cct->_conf->bluestore_warn_on_spurious_read_errors) { alerts.emplace( diff --git a/src/os/bluestore/BlueStore.h b/src/os/bluestore/BlueStore.h index 643b32d8a6dd8..65dbcef81f25e 100644 --- a/src/os/bluestore/BlueStore.h +++ b/src/os/bluestore/BlueStore.h @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -3290,13 +3291,13 @@ public: int idx, const ceph::timespan& lat, double lat_threshold, - const char* info = "") const; + const char* info = ""); inline void log_latency_fn(const char* name, int idx, const ceph::timespan& lat, double lat_threshold, - std::function fn) const; + std::function fn); private: bool _debug_data_eio(const ghobject_t& o) { @@ -3330,7 +3331,10 @@ private: std::string no_per_pg_omap_alert; std::string disk_size_mismatch_alert; std::string spurious_read_errors_alert; + std::queue slow_op_event_queue; + size_t _trim_slow_op_event_queue(ceph::mono_clock::time_point cur_time); + void _add_slow_op_event(); void _log_alerts(osd_alert_list_t& alerts); bool _set_compression_alert(bool cmode, const char* s) { std::lock_guard l(qlock); @@ -3367,7 +3371,7 @@ private: const bluestore_blob_t* blob, uint64_t blob_xoffset, const ceph::buffer::list& bl, - uint64_t logical_offset) const; + uint64_t logical_offset); int _decompress(ceph::buffer::list& source, ceph::buffer::list* result); -- 2.39.5