sched/fair: Fix zero_vruntime tracking

author Peter Zijlstra <peterz@infradead.org>

Mon, 9 Feb 2026 14:28:16 +0000 (15:28 +0100)

committer Peter Zijlstra <peterz@infradead.org>

Mon, 23 Feb 2026 10:19:17 +0000 (11:19 +0100)
author Peter Zijlstra <peterz@infradead.org>
Mon, 9 Feb 2026 14:28:16 +0000 (15:28 +0100)
committer Peter Zijlstra <peterz@infradead.org>
Mon, 23 Feb 2026 10:19:17 +0000 (11:19 +0100)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c

index eea99ec01a3fb25f2fc9f2a863c494c1f360e0a5..56dddd4fd20893a764049096b7e9ff3af99bbbf3 100644 (file)
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -589,6 +589,21 @@ static inline bool entity_before(const struct sched_entity *a,
         return vruntime_cmp(a->deadline, "<", b->deadline);
  }
  
+/*
+ * Per avg_vruntime() below, cfs_rq::zero_vruntime is only slightly stale
+ * and this value should be no more than two lag bounds. Which puts it in the
+ * general order of:
+ *
+ *     (slice + TICK_NSEC) << NICE_0_LOAD_SHIFT
+ *
+ * which is around 44 bits in size (on 64bit); that is 20 for
+ * NICE_0_LOAD_SHIFT, another 20 for NSEC_PER_MSEC and then a handful for
+ * however many msec the actual slice+tick ends up begin.
+ *
+ * (disregarding the actual divide-by-weight part makes for the worst case
+ * weight of 2, which nicely cancels vs the fuzz in zero_vruntime not actually
+ * being the zero-lag point).
+ */
  static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
  {
         return vruntime_op(se->vruntime, "-", cfs_rq->zero_vruntime);
@@ -676,39 +691,61 @@ sum_w_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se)
  }
  
  static inline
-void sum_w_vruntime_update(struct cfs_rq *cfs_rq, s64 delta)
+void update_zero_vruntime(struct cfs_rq *cfs_rq, s64 delta)
  {
         /*
-        * v' = v + d ==> sum_w_vruntime' = sum_runtime - d*sum_weight
+        * v' = v + d ==> sum_w_vruntime' = sum_w_vruntime - d*sum_weight
          */
         cfs_rq->sum_w_vruntime -= cfs_rq->sum_weight * delta;
+       cfs_rq->zero_vruntime += delta;
  }
  
  /*
- * Specifically: avg_runtime() + 0 must result in entity_eligible() := true
+ * Specifically: avg_vruntime() + 0 must result in entity_eligible() := true
   * For this to be so, the result of this function must have a left bias.
+ *
+ * Called in:
+ *  - place_entity()      -- before enqueue
+ *  - update_entity_lag() -- before dequeue
+ *  - entity_tick()
+ *
+ * This means it is one entry 'behind' but that puts it close enough to where
+ * the bound on entity_key() is at most two lag bounds.
   */
  u64 avg_vruntime(struct cfs_rq *cfs_rq)
  {
         struct sched_entity *curr = cfs_rq->curr;
-       s64 avg = cfs_rq->sum_w_vruntime;
-       long load = cfs_rq->sum_weight;
+       long weight = cfs_rq->sum_weight;
+       s64 delta = 0;
  
-       if (curr && curr->on_rq) {
-               unsigned long weight = scale_load_down(curr->load.weight);
+       if (curr && !curr->on_rq)
+               curr = NULL;
  
-               avg += entity_key(cfs_rq, curr) * weight;
-               load += weight;
-       }
+       if (weight) {
+               s64 runtime = cfs_rq->sum_w_vruntime;
+
+               if (curr) {
+                       unsigned long w = scale_load_down(curr->load.weight);
+
+                       runtime += entity_key(cfs_rq, curr) * w;
+                       weight += w;
+               }
  
-       if (load) {
                 /* sign flips effective floor / ceiling */
-               if (avg < 0)
-                       avg -= (load - 1);
-               avg = div_s64(avg, load);
+               if (runtime < 0)
+                       runtime -= (weight - 1);
+
+               delta = div_s64(runtime, weight);
+       } else if (curr) {
+               /*
+                * When there is but one element, it is the average.
+                */
+               delta = curr->vruntime - cfs_rq->zero_vruntime;
         }
  
-       return cfs_rq->zero_vruntime + avg;
+       update_zero_vruntime(cfs_rq, delta);
+
+       return cfs_rq->zero_vruntime;
  }
  
  /*
@@ -777,16 +814,6 @@ int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se)
         return vruntime_eligible(cfs_rq, se->vruntime);
  }
  
-static void update_zero_vruntime(struct cfs_rq *cfs_rq)
-{
-       u64 vruntime = avg_vruntime(cfs_rq);
-       s64 delta = vruntime_op(vruntime, "-", cfs_rq->zero_vruntime);
-
-       sum_w_vruntime_update(cfs_rq, delta);
-
-       cfs_rq->zero_vruntime = vruntime;
-}
-
  static inline u64 cfs_rq_min_slice(struct cfs_rq *cfs_rq)
  {
         struct sched_entity *root = __pick_root_entity(cfs_rq);
@@ -856,7 +883,6 @@ RB_DECLARE_CALLBACKS(static, min_vruntime_cb, struct sched_entity,
  static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
  {
         sum_w_vruntime_add(cfs_rq, se);
-       update_zero_vruntime(cfs_rq);
         se->min_vruntime = se->vruntime;
         se->min_slice = se->slice;
         rb_add_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline,
@@ -868,7 +894,6 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
         rb_erase_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline,
                                   &min_vruntime_cb);
         sum_w_vruntime_sub(cfs_rq, se);
-       update_zero_vruntime(cfs_rq);
  }
  
  struct sched_entity *__pick_root_entity(struct cfs_rq *cfs_rq)
@@ -5524,6 +5549,11 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
         update_load_avg(cfs_rq, curr, UPDATE_TG);
         update_cfs_group(curr);
  
+       /*
+        * Pulls along cfs_rq::zero_vruntime.
+        */
+       avg_vruntime(cfs_rq);
+
  #ifdef CONFIG_SCHED_HRTICK
         /*
          * queued ticks are scheduled to match the slice, so don't bother
author	Peter Zijlstra <peterz@infradead.org>
	Mon, 9 Feb 2026 14:28:16 +0000 (15:28 +0100)
committer	Peter Zijlstra <peterz@infradead.org>
	Mon, 23 Feb 2026 10:19:17 +0000 (11:19 +0100)