git: 4808bab7fa6c - stable/13 - sched_ule(4): Improve long-term load balancer.
- Go to: [ bottom of page ] [ top of archives ] [ this month ]
Date: Thu, 21 Oct 2021 22:31:33 UTC
The branch stable/13 has been updated by mav: URL: https://cgit.FreeBSD.org/src/commit/?id=4808bab7fa6c3ec49b49476b8326d7a0250a03fa commit 4808bab7fa6c3ec49b49476b8326d7a0250a03fa Author: Alexander Motin <mav@FreeBSD.org> AuthorDate: 2021-09-21 22:14:22 +0000 Commit: Alexander Motin <mav@FreeBSD.org> CommitDate: 2021-10-21 22:24:35 +0000 sched_ule(4): Improve long-term load balancer. Before this change long-term load balancer was unable to migrate running threads, only ones waiting on run queues. But with growing number of CPU cores it is quite typical now for system to not have many waiting threads. But same time if due to some coincidence two long-running CPU-bound threads ended up sharing same physical CPU core, they could suffer from the SMT penalty indefinitely, and the load balancer couldn't help. Improve that by teaching the load balancer to hint running threads to migrate by marking them with TDF_NEEDRESCHED and new TDF_PICKCPU flag, making sched_pickcpu() to search for better CPU later, when it is convenient. Fix CPU search logic when balancing to limit round-robin migrations in case of almost equal load to the group of physical cores. The previous code bounced threads across all the system, that should be pretty bad for caches and NUMA affinity, while additional fairness was almost invisible, diminishing with number of cores in the group. MFC after: 1 month (cherry picked from commit e745d729be60a47b49eb19c02a6864a747fb2744) --- sys/kern/sched_ule.c | 119 ++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 93 insertions(+), 26 deletions(-) diff --git a/sys/kern/sched_ule.c b/sys/kern/sched_ule.c index 53d5a59a3605..e7de4a50bb93 100644 --- a/sys/kern/sched_ule.c +++ b/sys/kern/sched_ule.c @@ -196,6 +196,7 @@ _Static_assert(sizeof(struct thread) + sizeof(struct td_sched) <= #define SCHED_SLICE_MIN_DIVISOR 6 /* DEFAULT/MIN = ~16 ms. */ /* Flags kept in td_flags. */ +#define TDF_PICKCPU TDF_SCHED0 /* Thread should pick new CPU. */ #define TDF_SLICEEND TDF_SCHED2 /* Thread time slice is over. */ /* @@ -633,15 +634,16 @@ sched_random(void) } struct cpu_search { - cpuset_t *cs_mask; - u_int cs_prefer; + cpuset_t *cs_mask; /* The mask of allowed CPUs to choose from. */ + int cs_prefer; /* Prefer this CPU and groups including it. */ + int cs_running; /* The thread is now running at cs_prefer. */ int cs_pri; /* Min priority for low. */ int cs_limit; /* Max load for low, min load for high. */ }; struct cpu_search_res { - int cs_cpu; - int cs_load; + int cs_cpu; /* The best CPU found. */ + int cs_load; /* The load of cs_cpu. */ }; /* @@ -657,7 +659,7 @@ cpu_search_lowest(const struct cpu_group *cg, const struct cpu_search *s, { struct cpu_search_res lr; struct tdq *tdq; - int c, bload, l, load, total; + int c, bload, l, load, p, total; total = 0; bload = INT_MAX; @@ -668,6 +670,17 @@ cpu_search_lowest(const struct cpu_group *cg, const struct cpu_search *s, for (c = cg->cg_children - 1; c >= 0; c--) { load = cpu_search_lowest(&cg->cg_child[c], s, &lr); total += load; + + /* + * When balancing do not prefer SMT groups with load >1. + * It allows round-robin between SMT groups with equal + * load within parent group for more fair scheduling. + */ + if (__predict_false(s->cs_running) && + (cg->cg_child[c].cg_flags & CG_FLAG_THREAD) && + load >= 128 && (load & 128) != 0) + load += 128; + if (lr.cs_cpu >= 0 && (load < bload || (load == bload && lr.cs_load < r->cs_load))) { bload = load; @@ -684,20 +697,40 @@ cpu_search_lowest(const struct cpu_group *cg, const struct cpu_search *s, continue; tdq = TDQ_CPU(c); l = tdq->tdq_load; + if (c == s->cs_prefer) { + if (__predict_false(s->cs_running)) + l--; + p = 128; + } else + p = 0; load = l * 256; - if (c == s->cs_prefer) - load -= 128; - total += load; - if (l > s->cs_limit || tdq->tdq_lowpri <= s->cs_pri || + total += load - p; + + /* + * Check this CPU is acceptable. + * If the threads is already on the CPU, don't look on the TDQ + * priority, since it can be the priority of the thread itself. + */ + if (l > s->cs_limit || (tdq->tdq_lowpri <= s->cs_pri && + (!s->cs_running || c != s->cs_prefer)) || !CPU_ISSET(c, s->cs_mask)) continue; + + /* + * When balancing do not prefer CPUs with load > 1. + * It allows round-robin between CPUs with equal load + * within the CPU group for more fair scheduling. + */ + if (__predict_false(s->cs_running) && l > 0) + p = 0; + load -= sched_random() % 128; - if (load < bload) { - bload = load; + if (bload > load - p) { + bload = load - p; r->cs_cpu = c; + r->cs_load = load; } } - r->cs_load = bload; return (total); } @@ -736,9 +769,17 @@ cpu_search_highest(const struct cpu_group *cg, const struct cpu_search *s, l = tdq->tdq_load; load = l * 256; total += load; - if (l < s->cs_limit || !tdq->tdq_transferable || + + /* + * Check this CPU is acceptable. + * If requested minimum load is 1, then caller must know how + * to handle running threads, not counted in tdq_transferable. + */ + if (l < s->cs_limit || (tdq->tdq_transferable == 0 && + (s->cs_limit > 1 || l > 1)) || !CPU_ISSET(c, s->cs_mask)) continue; + load -= sched_random() % 256; if (load > bload) { bload = load; @@ -756,12 +797,13 @@ cpu_search_highest(const struct cpu_group *cg, const struct cpu_search *s, */ static inline int sched_lowest(const struct cpu_group *cg, cpuset_t *mask, int pri, int maxload, - int prefer) + int prefer, int running) { struct cpu_search s; struct cpu_search_res r; s.cs_prefer = prefer; + s.cs_running = running; s.cs_mask = mask; s.cs_pri = pri; s.cs_limit = maxload; @@ -788,12 +830,13 @@ static void sched_balance_group(struct cpu_group *cg) { struct tdq *tdq; + struct thread *td; cpuset_t hmask, lmask; int high, low, anylow; CPU_FILL(&hmask); for (;;) { - high = sched_highest(cg, &hmask, 2); + high = sched_highest(cg, &hmask, 1); /* Stop if there is no more CPU with transferrable threads. */ if (high == -1) break; @@ -802,10 +845,28 @@ sched_balance_group(struct cpu_group *cg) /* Stop if there is no more CPU left for low. */ if (CPU_EMPTY(&lmask)) break; - anylow = 1; tdq = TDQ_CPU(high); + if (tdq->tdq_load == 1) { + /* + * There is only one running thread. We can't move + * it from here, so tell it to pick new CPU by itself. + */ + TDQ_LOCK(tdq); + td = pcpu_find(high)->pc_curthread; + if ((td->td_flags & TDF_IDLETD) == 0 && + THREAD_CAN_MIGRATE(td)) { + td->td_flags |= TDF_NEEDRESCHED | TDF_PICKCPU; + if (high != curcpu) + ipi_cpu(high, IPI_AST); + } + TDQ_UNLOCK(tdq); + break; + } + anylow = 1; nextlow: - low = sched_lowest(cg, &lmask, -1, tdq->tdq_load - 1, high); + if (tdq->tdq_transferable == 0) + continue; + low = sched_lowest(cg, &lmask, -1, tdq->tdq_load - 1, high, 1); /* Stop if we looked well and found no less loaded CPU. */ if (anylow && low == -1) break; @@ -1227,7 +1288,7 @@ sched_pickcpu(struct thread *td, int flags) struct td_sched *ts; struct tdq *tdq; cpuset_t *mask; - int cpu, pri, self, intr; + int cpu, pri, r, self, intr; self = PCPU_GET(cpuid); ts = td_get_sched(td); @@ -1305,32 +1366,33 @@ llc: cpu = -1; mask = &td->td_cpuset->cs_mask; pri = td->td_priority; + r = TD_IS_RUNNING(td); /* * Try hard to keep interrupts within found LLC. Search the LLC for * the least loaded CPU we can run now. For NUMA systems it should * be within target domain, and it also reduces scheduling overhead. */ if (ccg != NULL && intr) { - cpu = sched_lowest(ccg, mask, pri, INT_MAX, ts->ts_cpu); + cpu = sched_lowest(ccg, mask, pri, INT_MAX, ts->ts_cpu, r); if (cpu >= 0) SCHED_STAT_INC(pickcpu_intrbind); } else /* Search the LLC for the least loaded idle CPU we can run now. */ if (ccg != NULL) { cpu = sched_lowest(ccg, mask, max(pri, PRI_MAX_TIMESHARE), - INT_MAX, ts->ts_cpu); + INT_MAX, ts->ts_cpu, r); if (cpu >= 0) SCHED_STAT_INC(pickcpu_affinity); } /* Search globally for the least loaded CPU we can run now. */ if (cpu < 0) { - cpu = sched_lowest(cpu_top, mask, pri, INT_MAX, ts->ts_cpu); + cpu = sched_lowest(cpu_top, mask, pri, INT_MAX, ts->ts_cpu, r); if (cpu >= 0) SCHED_STAT_INC(pickcpu_lowest); } /* Search globally for the least loaded CPU. */ if (cpu < 0) { - cpu = sched_lowest(cpu_top, mask, -1, INT_MAX, ts->ts_cpu); + cpu = sched_lowest(cpu_top, mask, -1, INT_MAX, ts->ts_cpu, r); if (cpu >= 0) SCHED_STAT_INC(pickcpu_lowest); } @@ -2056,7 +2118,7 @@ sched_switch(struct thread *td, int flags) struct td_sched *ts; struct mtx *mtx; int srqflag; - int cpuid, preempted; + int cpuid, pickcpu, preempted; THREAD_LOCK_ASSERT(td, MA_OWNED); @@ -2064,11 +2126,15 @@ sched_switch(struct thread *td, int flags) tdq = TDQ_SELF(); ts = td_get_sched(td); sched_pctcpu_update(ts, 1); - ts->ts_rltick = ticks; + pickcpu = (td->td_flags & TDF_PICKCPU) != 0; + if (pickcpu) + ts->ts_rltick = ticks - affinity * MAX_CACHE_LEVELS; + else + ts->ts_rltick = ticks; td->td_lastcpu = td->td_oncpu; preempted = (td->td_flags & TDF_SLICEEND) == 0 && (flags & SW_PREEMPT) != 0; - td->td_flags &= ~(TDF_NEEDRESCHED | TDF_SLICEEND); + td->td_flags &= ~(TDF_NEEDRESCHED | TDF_PICKCPU | TDF_SLICEEND); td->td_owepreempt = 0; tdq->tdq_owepreempt = 0; if (!TD_IS_IDLETHREAD(td)) @@ -2088,7 +2154,8 @@ sched_switch(struct thread *td, int flags) SRQ_OURSELF|SRQ_YIELDING|SRQ_PREEMPTED : SRQ_OURSELF|SRQ_YIELDING; #ifdef SMP - if (THREAD_CAN_MIGRATE(td) && !THREAD_CAN_SCHED(td, ts->ts_cpu)) + if (THREAD_CAN_MIGRATE(td) && (!THREAD_CAN_SCHED(td, ts->ts_cpu) + || pickcpu)) ts->ts_cpu = sched_pickcpu(td, 0); #endif if (ts->ts_cpu == cpuid)