From e33fa85f2a729d1d98a5737f9677d5a65db61d67 Mon Sep 17 00:00:00 2001 From: sajibreadd Date: Mon, 27 May 2024 13:30:06 +0600 Subject: [PATCH] Warning added for slow operations and stalled read in BlueStore. User can control how much time the warning should persist after last occurence and maximum number of operations as a threshold will be considered for the warning. Fixes: https://tracker.ceph.com/issues/62500 Signed-off-by: Md Mahamudur Rahaman Sajib (cherry picked from commit 73b80a9a2c38259346fb646f85fa2ba4dcbb1329) Conflicts: src/common/options/global.yaml.in src/os/bluestore/BlueStore.cc src/os/bluestore/BlueStore.h doc/rados/operations/health-checks.rst --- src/blk/BlockDevice.cc | 37 ++++++++++++++++++++++++++++ src/blk/BlockDevice.h | 9 ++++++- src/blk/kernel/KernelDevice.cc | 4 ++++ src/common/options/global.yaml.in | 24 +++++++++++++++++++ src/mon/PGMap.cc | 8 +++++++ src/os/bluestore/BlueFS.cc | 9 +++++++ src/os/bluestore/BlueFS.h | 2 +- src/os/bluestore/BlueStore.cc | 40 +++++++++++++++++++++++++++---- src/os/bluestore/BlueStore.h | 10 +++++--- 9 files changed, 134 insertions(+), 9 deletions(-) diff --git a/src/blk/BlockDevice.cc b/src/blk/BlockDevice.cc index fd07e443c1365..d878afc388e90 100644 --- a/src/blk/BlockDevice.cc +++ b/src/blk/BlockDevice.cc @@ -46,6 +46,7 @@ #define dout_prefix *_dout << "bdev " using std::string; +using ceph::mono_clock; blk_access_mode_t buffermode(bool buffered) @@ -209,3 +210,39 @@ bool BlockDevice::is_valid_io(uint64_t off, uint64_t len) const { } return ret; } + +size_t BlockDevice::trim_stalled_read_event_queue(mono_clock::time_point cur_time) { + std::lock_guard lock(stalled_read_event_queue_lock); + auto warn_duration = std::chrono::seconds(cct->_conf->bdev_stalled_read_warn_lifetime); + while (!stalled_read_event_queue.empty() && + ((stalled_read_event_queue.front() < cur_time - warn_duration) || + (stalled_read_event_queue.size() > cct->_conf->bdev_stalled_read_warn_threshold))) { + stalled_read_event_queue.pop(); + } + return stalled_read_event_queue.size(); +} + +void BlockDevice::add_stalled_read_event() { + if (!cct->_conf->bdev_stalled_read_warn_threshold) { + return; + } + auto cur_time = mono_clock::now(); + { + std::lock_guard lock(stalled_read_event_queue_lock); + stalled_read_event_queue.push(cur_time); + } + trim_stalled_read_event_queue(cur_time); +} + +void BlockDevice::collect_alerts(osd_alert_list_t& alerts, const std::string& device_name) { + if (cct->_conf->bdev_stalled_read_warn_threshold) { + size_t qsize = trim_stalled_read_event_queue(mono_clock::now()); + if (qsize >= cct->_conf->bdev_stalled_read_warn_threshold) { + std::ostringstream ss; + ss << "observed stalled read indications in " + << device_name << " device"; + alerts.emplace(device_name + "_DEVICE_STALLED_READ_ALERT", ss.str()); + } + } +} + diff --git a/src/blk/BlockDevice.h b/src/blk/BlockDevice.h index ad4ed285f6049..ebb23f366615a 100644 --- a/src/blk/BlockDevice.h +++ b/src/blk/BlockDevice.h @@ -25,11 +25,13 @@ #include #include #include +#include #include "acconfig.h" #include "common/ceph_mutex.h" #include "include/common_fwd.h" #include "extblkdev/ExtBlkDevInterface.h" +#include "osd/osd_types.h" #if defined(HAVE_LIBAIO) || defined(HAVE_POSIXAIO) #include "aio/aio.h" @@ -148,6 +150,8 @@ class BlockDevice { public: CephContext* cct; typedef void (*aio_callback_t)(void *handle, void *aio); + void collect_alerts(osd_alert_list_t& alerts, const std::string& device_name); + private: ceph::mutex ioc_reap_lock = ceph::make_mutex("BlockDevice::ioc_reap_lock"); std::vector ioc_reap_queue; @@ -167,12 +171,14 @@ private: pmem, #endif }; + std::queue stalled_read_event_queue; + ceph::mutex stalled_read_event_queue_lock = ceph::make_mutex("BlockDevice::stalled_read_event_queue_lock"); + size_t trim_stalled_read_event_queue(mono_clock::time_point cur_time); static block_device_t detect_device_type(const std::string& path); static block_device_t device_type_from_name(const std::string& blk_dev_name); static BlockDevice *create_with_type(block_device_t device_type, CephContext* cct, const std::string& path, aio_callback_t cb, void *cbpriv, aio_callback_t d_cb, void *d_cbpriv); - protected: uint64_t size = 0; uint64_t block_size = 0; @@ -190,6 +196,7 @@ protected: // of the drive. The zones 524-52155 are sequential zones. uint64_t conventional_region_size = 0; uint64_t zone_size = 0; + void add_stalled_read_event(); public: aio_callback_t aio_callback; diff --git a/src/blk/kernel/KernelDevice.cc b/src/blk/kernel/KernelDevice.cc index ad7bfc4cf9467..d870b5691b050 100644 --- a/src/blk/kernel/KernelDevice.cc +++ b/src/blk/kernel/KernelDevice.cc @@ -1331,6 +1331,7 @@ int KernelDevice::read(uint64_t off, uint64_t len, bufferlist *pbl, << " since " << start1 << ", timeout is " << age << "s" << dendl; + add_stalled_read_event(); } if (r < 0) { if (ioc->allow_eio && is_expected_ioerr(-errno)) { @@ -1404,6 +1405,7 @@ int KernelDevice::direct_read_unaligned(uint64_t off, uint64_t len, char *buf) << " since " << start1 << ", timeout is " << age << "s" << dendl; + add_stalled_read_event(); } if (r < 0) { @@ -1467,6 +1469,7 @@ int KernelDevice::read_random(uint64_t off, uint64_t len, char *buf, << " (buffered) since " << start1 << ", timeout is " << age << "s" << dendl; + add_stalled_read_event(); } } else { //direct and aligned read @@ -1477,6 +1480,7 @@ int KernelDevice::read_random(uint64_t off, uint64_t len, char *buf, << " (direct) since " << start1 << ", timeout is " << age << "s" << dendl; + add_stalled_read_event(); } if (r < 0) { r = -errno; diff --git a/src/common/options/global.yaml.in b/src/common/options/global.yaml.in index c84c2c7bdde6f..8c876897a062b 100644 --- a/src/common/options/global.yaml.in +++ b/src/common/options/global.yaml.in @@ -5354,6 +5354,18 @@ options: desc: Enable health indication when spurious read errors are observed by OSD default: true with_legacy: true +- name: bluestore_slow_ops_warn_lifetime + type: uint + level: advanced + desc: A configurable duration for slow ops warning to be appeared if number of occurence pass `bluestore_slow_ops_warn_threshold` in `bluestore_slow_ops_warn_lifetime` seconds + default: 86400 + with_legacy: true +- name: bluestore_slow_ops_warn_threshold + type: uint + level: advanced + desc: A configurable number for slow ops warning to be appeared if number of occurence pass `bluestore_slow_ops_warn_threshold` in `bluestore_slow_ops_warn_lifetime` seconds + default: 1 + with_legacy: true - name: bluestore_fsck_error_on_no_per_pool_omap type: bool level: advanced @@ -6390,6 +6402,18 @@ options: - spdk - pmem - hm_smr +- name: bdev_stalled_read_warn_lifetime + type: uint + level: advanced + desc: A configurable duration for stalled read warning to be appeared if number of stalled read occurence pass `bdev_stalled_read_warn_threshold` in `bdev_stalled_read_warn_lifetime` seconds + default: 86400 + with_legacy: true +- name: bdev_stalled_read_warn_threshold + type: uint + level: advanced + desc: A configurable number for stalled read warning to be appeared if number of stalled read occurence pass `bdev_stalled_read_warn_threshold` in `bdev_stalled_read_warn_lifetime` seconds + default: 1 + with_legacy: true - name: bluestore_cleaner_sleep_interval type: float level: advanced diff --git a/src/mon/PGMap.cc b/src/mon/PGMap.cc index 16fa71c84c069..138826aebc17f 100644 --- a/src/mon/PGMap.cc +++ b/src/mon/PGMap.cc @@ -3238,6 +3238,14 @@ void PGMap::get_health_checks( summary += " reporting legacy (not per-pool) BlueStore omap usage stats"; } else if (asum.first == "BLUESTORE_SPURIOUS_READ_ERRORS") { summary += " have spurious read errors"; + } else if (asum.first == "BLUESTORE_SLOW_OP_ALERT") { + summary += " experiencing slow operations in BlueStore"; + } else if (asum.first == "BLOCK_DEVICE_STALLED_READ_ALERT") { + summary += " experiencing stalled read in block device of BlueStore"; + } else if (asum.first == "WAL_DEVICE_STALLED_READ_ALERT") { + summary += " experiencing stalled read in wal device of BlueFS"; + } else if (asum.first == "DB_DEVICE_STALLED_READ_ALERT") { + summary += " experiencing stalled read in db device of BlueFS"; } auto& d = checks->add(asum.first, HEALTH_WARN, summary, asum.second.first); diff --git a/src/os/bluestore/BlueFS.cc b/src/os/bluestore/BlueFS.cc index 4dcf42f23041d..60d07bfb51ca0 100644 --- a/src/os/bluestore/BlueFS.cc +++ b/src/os/bluestore/BlueFS.cc @@ -4163,6 +4163,15 @@ bool BlueFS::debug_get_is_dev_dirty(FileWriter *h, uint8_t dev) return h->dirty_devs[dev]; } +void BlueFS::collect_alerts(osd_alert_list_t& alerts) { + if (bdev[BDEV_DB]) { + bdev[BDEV_DB]->collect_alerts(alerts, "DB"); + } + if (bdev[BDEV_WAL]) { + bdev[BDEV_WAL]->collect_alerts(alerts, "WAL"); + } +} + int BlueFS::open_for_read( std::string_view dirname, std::string_view filename, diff --git a/src/os/bluestore/BlueFS.h b/src/os/bluestore/BlueFS.h index 6674190a55c5f..2d2e9b066e9e5 100644 --- a/src/os/bluestore/BlueFS.h +++ b/src/os/bluestore/BlueFS.h @@ -225,6 +225,7 @@ public: WRITER_WAL, WRITER_SST, }; + void collect_alerts(osd_alert_list_t& alerts); struct File : public RefCountedObject { MEMPOOL_CLASS_HELPERS(); @@ -427,7 +428,6 @@ public: FileRef file; explicit FileLock(FileRef f) : file(std::move(f)) {} }; - private: PerfCounters *logger = nullptr; diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc index 849e93ab47596..beb04434f861a 100644 --- a/src/os/bluestore/BlueStore.cc +++ b/src/os/bluestore/BlueStore.cc @@ -11300,7 +11300,7 @@ int BlueStore::_do_read( int BlueStore::_verify_csum(OnodeRef& o, const bluestore_blob_t* blob, uint64_t blob_xoffset, const bufferlist& bl, - uint64_t logical_offset) const + uint64_t logical_offset) { int bad; uint64_t bad_csum; @@ -17346,13 +17346,34 @@ int BlueStore::_merge_collection( return r; } +size_t BlueStore::_trim_slow_op_event_queue(mono_clock::time_point cur_time) { + ceph_assert(ceph_mutex_is_locked(qlock)); + auto warn_duration = std::chrono::seconds(cct->_conf->bluestore_slow_ops_warn_lifetime); + while (!slow_op_event_queue.empty() && + ((slow_op_event_queue.front() < cur_time - warn_duration) || + (slow_op_event_queue.size() > cct->_conf->bluestore_slow_ops_warn_threshold))) { + slow_op_event_queue.pop(); + } + return slow_op_event_queue.size(); +} + +void BlueStore::_add_slow_op_event() { + if (!cct->_conf->bluestore_slow_ops_warn_threshold) { + return; + } + std::lock_guard lock(qlock); + auto cur_time = mono_clock::now(); + slow_op_event_queue.push(cur_time); + _trim_slow_op_event_queue(cur_time); +} + void BlueStore::log_latency( const char* name, int idx, const ceph::timespan& l, double lat_threshold, const char* info, - int idx2) const + int idx2) { logger->tinc(idx, l); if (lat_threshold > 0.0 && @@ -17364,6 +17385,7 @@ void BlueStore::log_latency( if (idx2 > l_bluestore_first && idx2 < l_bluestore_last) { logger->inc(idx2); } + _add_slow_op_event(); } } @@ -17373,7 +17395,7 @@ void BlueStore::log_latency_fn( const ceph::timespan& l, double lat_threshold, std::function fn, - int idx2) const + int idx2) { logger->tinc(idx, l); if (lat_threshold > 0.0 && @@ -17385,6 +17407,7 @@ void BlueStore::log_latency_fn( if (idx2 > l_bluestore_first && idx2 < l_bluestore_last) { logger->inc(idx2); } + _add_slow_op_event(); } } @@ -17780,7 +17803,16 @@ void BlueStore::_log_alerts(osd_alert_list_t& alerts) } else if (!spillover_alert.empty()){ spillover_alert.clear(); } - + if (cct->_conf->bluestore_slow_ops_warn_threshold) { + size_t qsize = _trim_slow_op_event_queue(mono_clock::now()); + if (qsize >= cct->_conf->bluestore_slow_ops_warn_threshold) { + ostringstream ss; + ss << "observed slow operation indications in BlueStore"; + alerts.emplace("BLUESTORE_SLOW_OP_ALERT", ss.str()); + } + } + bdev->collect_alerts(alerts, "BLOCK"); + bluefs->collect_alerts(alerts); if (!spurious_read_errors_alert.empty() && cct->_conf->bluestore_warn_on_spurious_read_errors) { alerts.emplace( diff --git a/src/os/bluestore/BlueStore.h b/src/os/bluestore/BlueStore.h index bacfcba84ec2d..7f20f3eaefc23 100644 --- a/src/os/bluestore/BlueStore.h +++ b/src/os/bluestore/BlueStore.h @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -3317,14 +3318,14 @@ public: const ceph::timespan& lat, double lat_threshold, const char* info = "", - int idx2 = l_bluestore_first) const; + int idx2 = l_bluestore_first); inline void log_latency_fn(const char* name, int idx, const ceph::timespan& lat, double lat_threshold, std::function fn, - int idx2 = l_bluestore_first) const; + int idx2 = l_bluestore_first); private: bool _debug_data_eio(const ghobject_t& o) { @@ -3358,7 +3359,10 @@ private: std::string no_per_pg_omap_alert; std::string disk_size_mismatch_alert; std::string spurious_read_errors_alert; + std::queue slow_op_event_queue; + size_t _trim_slow_op_event_queue(ceph::mono_clock::time_point cur_time); + void _add_slow_op_event(); void _log_alerts(osd_alert_list_t& alerts); bool _set_compression_alert(bool cmode, const char* s) { std::lock_guard l(qlock); @@ -3395,7 +3399,7 @@ private: const bluestore_blob_t* blob, uint64_t blob_xoffset, const ceph::buffer::list& bl, - uint64_t logical_offset) const; + uint64_t logical_offset); int _decompress(ceph::buffer::list& source, ceph::buffer::list* result); -- 2.39.5