From 3b1b5a9c7c53dec29bc98b2a0f563a65ee90cdd9 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 7 Nov 2016 17:57:01 -0500 Subject: [PATCH] os/bluestore/KernelDevice: debug support to detect stalled aios This is additional bookkeeping to track the oldest aio in flight, so we make it optional, and disabled by default. It is useful for identifying buggy kernel aio or bad devices. Signed-off-by: Sage Weil --- src/common/config_opts.h | 2 ++ src/os/bluestore/KernelDevice.cc | 50 ++++++++++++++++++++++++++++++ src/os/bluestore/KernelDevice.h | 8 +++++ src/test/objectstore/store_test.cc | 2 ++ 4 files changed, 62 insertions(+) diff --git a/src/common/config_opts.h b/src/common/config_opts.h index 28e87f736aa..16962524262 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -937,6 +937,8 @@ OPTION(bdev_aio, OPT_BOOL, true) OPTION(bdev_aio_poll_ms, OPT_INT, 250) // milliseconds OPTION(bdev_aio_max_queue_depth, OPT_INT, 32) OPTION(bdev_block_size, OPT_INT, 4096) +OPTION(bdev_debug_aio, OPT_BOOL, false) +OPTION(bdev_debug_aio_suicide_timeout, OPT_FLOAT, 60.0) // if yes, osd will unbind all NVMe devices from kernel driver and bind them // to the uio_pci_generic driver. The purpose is to prevent the case where diff --git a/src/os/bluestore/KernelDevice.cc b/src/os/bluestore/KernelDevice.cc index a24b8b751e9..e5e322c28c6 100644 --- a/src/os/bluestore/KernelDevice.cc +++ b/src/os/bluestore/KernelDevice.cc @@ -259,6 +259,10 @@ void KernelDevice::_aio_thread() for (int i = 0; i < r; ++i) { IOContext *ioc = static_cast(aio[i]->priv); _aio_log_finish(ioc, aio[i]->offset, aio[i]->length); + if (aio[i]->queue_item.is_linked()) { + std::lock_guard l(debug_queue_lock); + debug_aio_unlink(*aio[i]); + } int left = --ioc->num_running; int r = aio[i]->get_return_value(); dout(10) << __func__ << " finished aio " << aio[i] << " r " << r @@ -275,6 +279,25 @@ void KernelDevice::_aio_thread() } } } + if (g_conf->bdev_debug_aio) { + utime_t now = ceph_clock_now(NULL); + std::lock_guard l(debug_queue_lock); + if (debug_oldest) { + if (debug_stall_since == utime_t()) { + debug_stall_since = now; + } else { + utime_t cutoff = now; + cutoff -= g_conf->bdev_debug_aio_suicide_timeout; + if (debug_stall_since < cutoff) { + derr << __func__ << " stalled aio " << debug_oldest + << " since " << debug_stall_since << ", timeout is " + << g_conf->bdev_debug_aio_suicide_timeout + << "s, suicide" << dendl; + assert(0 == "stalled aio... buggy kernel or bad device?"); + } + } + } + } reap_ioc(); if (g_conf->bdev_inject_crash) { ++inject_crash_count; @@ -310,6 +333,29 @@ void KernelDevice::_aio_log_start( } } +void KernelDevice::debug_aio_link(FS::aio_t& aio) +{ + if (debug_queue.empty()) { + debug_oldest = &aio; + } + debug_queue.push_back(aio); +} + +void KernelDevice::debug_aio_unlink(FS::aio_t& aio) +{ + if (aio.queue_item.is_linked()) { + debug_queue.erase(debug_queue.iterator_to(aio)); + if (debug_oldest == &aio) { + if (debug_queue.empty()) { + debug_oldest = nullptr; + } else { + debug_oldest = &debug_queue.front(); + } + debug_stall_since = utime_t(); + } + } +} + void KernelDevice::_aio_log_finish( IOContext *ioc, uint64_t offset, @@ -365,6 +411,10 @@ void KernelDevice::aio_submit(IOContext *ioc) // do not dereference txc (or it's contents) after we submit (if // done == true and we don't loop) int retries = 0; + if (g_conf->bdev_debug_aio) { + std::lock_guard l(debug_queue_lock); + debug_aio_link(*cur); + } int r = aio_queue.submit(*cur, &retries); if (retries) derr << __func__ << " retries " << retries << dendl; diff --git a/src/os/bluestore/KernelDevice.h b/src/os/bluestore/KernelDevice.h index 48cb0d44d93..9e512f93b04 100644 --- a/src/os/bluestore/KernelDevice.h +++ b/src/os/bluestore/KernelDevice.h @@ -64,6 +64,14 @@ class KernelDevice : public BlockDevice { int direct_read_unaligned(uint64_t off, uint64_t len, char *buf); + // stalled aio debugging + FS::aio_list_t debug_queue; + std::mutex debug_queue_lock; + FS::aio_t *debug_oldest; + utime_t debug_stall_since; + void debug_aio_link(FS::aio_t& aio); + void debug_aio_unlink(FS::aio_t& aio); + public: KernelDevice(aio_callback_t cb, void *cbpriv); diff --git a/src/test/objectstore/store_test.cc b/src/test/objectstore/store_test.cc index c242c7045a0..b7a8c661adc 100644 --- a/src/test/objectstore/store_test.cc +++ b/src/test/objectstore/store_test.cc @@ -5745,6 +5745,8 @@ int main(int argc, char **argv) { g_ceph_context->_conf->set_val("bluestore_debug_randomize_serial_transaction", "10"); + g_ceph_context->_conf->set_val("bdev_debug_aio", "true"); + // specify device size g_ceph_context->_conf->set_val("bluestore_block_size", "10240000000"); -- 2.39.5