From 52d15521eb75f9b521744db675bee61025d2fa52 Mon Sep 17 00:00:00 2001 From: Yicong Yang Date: Fri, 27 Jun 2025 11:54:20 +0800 Subject: [PATCH] sched/deadline: Don't count nr_running for dl_server proxy tasks On CPU offline the kernel stalled with below call trace: INFO: task kworker/0:1:11 blocked for more than 120 seconds. cpuhp hold the cpu hotplug lock endless and stalled vmstat_shepherd. This is because we count nr_running twice on cpuhp enqueuing and failed the wait condition of cpuhp: enqueue_task_fair() // pick cpuhp from idle, rq->nr_running = 0 dl_server_start() [...] add_nr_running() // rq->nr_running = 1 add_nr_running() // rq->nr_running = 2 [switch to cpuhp, waiting on balance_hotplug_wait()] rcuwait_wait_event(rq->nr_running == 1 && ...) // failed, rq->nr_running=2 schedule() // wait again It doesn't make sense to count the dl_server towards runnable tasks, since it runs other tasks. Fixes: 63ba8422f876 ("sched/deadline: Introduce deadline servers") Signed-off-by: Yicong Yang Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20250627035420.37712-1-yangyicong@huawei.com --- kernel/sched/deadline.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 88c3bd64a8a05..f25301267e471 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1851,7 +1851,9 @@ void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) u64 deadline = dl_se->deadline; dl_rq->dl_nr_running++; - add_nr_running(rq_of_dl_rq(dl_rq), 1); + + if (!dl_server(dl_se)) + add_nr_running(rq_of_dl_rq(dl_rq), 1); inc_dl_deadline(dl_rq, deadline); } @@ -1861,7 +1863,9 @@ void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) { WARN_ON(!dl_rq->dl_nr_running); dl_rq->dl_nr_running--; - sub_nr_running(rq_of_dl_rq(dl_rq), 1); + + if (!dl_server(dl_se)) + sub_nr_running(rq_of_dl_rq(dl_rq), 1); dec_dl_deadline(dl_rq, dl_se->deadline); } -- 2.39.5