From 3d4a899d6c5f0c3c4c663d5d6dbb78285fbd7672 Mon Sep 17 00:00:00 2001 From: Joshua Baergen Date: Tue, 2 Jul 2024 13:57:15 -0600 Subject: [PATCH] blk/KernelDevice: Fix several issues with stopping discard threads 1. In _discard_stop(), the wait for !discard_threads.empty() was there from a prior implementation where there could theoretically be a race between _discard_start() and _discard_stop(). If that race does exist, this check won't help; not only is _discard_stop() not called if discard_threads is empty, but discard_threads won't be populated until _discard_start() runs and thus this won't detect such a race. 2. Calling _discard_stop() from handle_conf_change() is a guaranteed deadlock because discard_lock is already held, so don't do that. Use the same flow whether we're stopping a subset of threads or all threads. 3. Asking a subset of discard threads to stop was not guaranteed to take effect, since if they continued to find contents in discard_queued then they would continue to run indefinitely. Add additional logic to _discard_thread() to have threads stop if they have been requested to stop and other threads exist to continue draining discard_queued. 4. Make the flow of _discard_stop() and handle_conf_change() more similar. 
Fixes: https://tracker.ceph.com/issues/66817 Signed-off-by: Joshua Baergen --- src/blk/kernel/KernelDevice.cc | 37 ++++++++++++++++------------------ 1 file changed, 17 insertions(+), 20 deletions(-) diff --git a/src/blk/kernel/KernelDevice.cc b/src/blk/kernel/KernelDevice.cc index 057dd04cba898..4eb9702b6110b 100644 --- a/src/blk/kernel/KernelDevice.cc +++ b/src/blk/kernel/KernelDevice.cc @@ -562,21 +562,17 @@ void KernelDevice::_discard_stop() // Signal threads to stop, then wait for them to join { std::unique_lock l(discard_lock); - while (discard_threads.empty()) { - discard_cond.wait(l); - } for(auto &t : discard_threads) { t->stop = true; + t->detach(); } + discard_threads.clear(); discard_cond.notify_all(); } - // Threads are shared pointers and are cleaned up for us - for(auto &t : discard_threads) - t->join(); - discard_threads.clear(); + discard_drain(); dout(10) << __func__ << " stopped" << dendl; } @@ -761,6 +757,13 @@ void KernelDevice::_discard_thread(uint64_t tid) discard_cond.wait(l); dout(20) << __func__ << " wake" << dendl; } else { + // If there are non-stopped discard threads and we have been requested + // to stop, do so now. Otherwise, we need to proceed because + // discard_queued is non-empty and at least one thread is needed to + // drain it. + if (thr->stop && !discard_threads.empty()) + break; + // Limit local processing to MAX_LOCAL_DISCARD items. // This will allow threads to work in parallel // instead of a single thread taking over the whole discard_queued. @@ -1547,19 +1550,13 @@ void KernelDevice::handle_conf_change(const ConfigProxy& conf, } else if (newval < oldval) { dout(10) << __func__ << " stopping " << (oldval - newval) << " existing discard threads" << dendl; - // Decreasing to zero is exactly the same as disabling async discard. 
- // Signal all threads to stop - if(newval == 0) { - _discard_stop(); - } else { - // Signal the last threads to quit, and stop tracking them - for(uint64_t i = oldval - 1; i >= newval; i--) - { - // Also detach the thread so we no longer need to join - discard_threads[i]->stop = true; - discard_threads[i]->detach(); - discard_threads.erase(discard_threads.begin() + i); - } + // Signal the last threads to quit, and stop tracking them + for(uint64_t i = oldval - 1; i >= newval && i != UINT64_MAX; i--) + { + // Also detach the thread so we no longer need to join + discard_threads[i]->stop = true; + discard_threads[i]->detach(); + discard_threads.erase(discard_threads.begin() + i); } discard_cond.notify_all(); -- 2.39.5