git: 74cf7cae4d22 - main - softclock: Use dedicated ithreads for running callouts.

From: John Baldwin <jhb_at_FreeBSD.org>
Date: Thu, 30 Dec 2021 22:55:24 UTC
The branch main has been updated by jhb:

URL: https://cgit.FreeBSD.org/src/commit/?id=74cf7cae4d2238ae6d1c949b2bbd077e1ab33634

commit 74cf7cae4d2238ae6d1c949b2bbd077e1ab33634
Author:     John Baldwin <jhb@FreeBSD.org>
AuthorDate: 2021-12-30 22:54:29 +0000
Commit:     John Baldwin <jhb@FreeBSD.org>
CommitDate: 2021-12-30 22:55:08 +0000

    softclock: Use dedicated ithreads for running callouts.
    
    Rather than using the swi infrastructure, rewrite softclock() as a
    thread loop (softclock_thread()) and use it as the main routine of
    dedicated per-CPU softclock threads.  While idle, each thread uses
    its callout_cpu's cc_lock as its thread lock.
    
    Reviewed by:    mav, imp, kib
    Sponsored by:   Netflix
    Differential Revision:  https://reviews.freebsd.org/D33683
---
 sys/kern/kern_timeout.c | 115 ++++++++++++++++++++++++++++++++----------------
 sys/sys/systm.h         |   1 -
 2 files changed, 76 insertions(+), 40 deletions(-)

diff --git a/sys/kern/kern_timeout.c b/sys/kern/kern_timeout.c
index 992a093d30ab..3923c214be8d 100644
--- a/sys/kern/kern_timeout.c
+++ b/sys/kern/kern_timeout.c
@@ -52,14 +52,17 @@ __FBSDID("$FreeBSD$");
 #include <sys/interrupt.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
+#include <sys/kthread.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
+#include <sys/sched.h>
 #include <sys/sdt.h>
 #include <sys/sleepqueue.h>
 #include <sys/sysctl.h>
 #include <sys/smp.h>
+#include <sys/unistd.h>
 
 #ifdef DDB
 #include <ddb/ddb.h>
@@ -77,6 +80,8 @@ SDT_PROVIDER_DEFINE(callout_execute);
 SDT_PROBE_DEFINE1(callout_execute, , , callout__start, "struct callout *");
 SDT_PROBE_DEFINE1(callout_execute, , , callout__end, "struct callout *");
 
+static void	softclock_thread(void *arg);
+
 #ifdef CALLOUT_PROFILING
 static int avg_depth;
 SYSCTL_INT(_debug, OID_AUTO, to_avg_depth, CTLFLAG_RD, &avg_depth, 0,
@@ -166,7 +171,7 @@ struct callout_cpu {
 	struct callout_tailq	cc_expireq;
 	sbintime_t		cc_firstevent;
 	sbintime_t		cc_lastscan;
-	void			*cc_cookie;
+	struct thread		*cc_thread;
 	u_int			cc_bucket;
 	u_int			cc_inited;
 #ifdef KTR
@@ -222,7 +227,7 @@ static MALLOC_DEFINE(M_CALLOUT, "callout", "Callout datastructures");
  *                     relevant callout completes.
  *   cc_cancel       - Changing to 1 with both callout_lock and cc_lock held
  *                     guarantees that the current callout will not run.
- *                     The softclock() function sets this to 0 before it
+ *                     The softclock_call_cc() function sets this to 0 before it
  *                     drops callout_lock to acquire c_lock, and it calls
  *                     the handler only if curr_cancelled is still 0 after
  *                     cc_lock is successfully acquired.
@@ -316,7 +321,7 @@ callout_cpu_init(struct callout_cpu *cc, int cpu)
 {
 	int i;
 
-	mtx_init(&cc->cc_lock, "callout", NULL, MTX_SPIN | MTX_RECURSE);
+	mtx_init(&cc->cc_lock, "callout", NULL, MTX_SPIN);
 	cc->cc_inited = 1;
 	cc->cc_callwheel = malloc_domainset(sizeof(struct callout_list) *
 	    callwheelsize, M_CALLOUT,
@@ -369,28 +374,38 @@ callout_cpu_switch(struct callout *c, struct callout_cpu *cc, int new_cpu)
 static void
 start_softclock(void *dummy)
 {
+	struct proc *p;
+	struct thread *td;
 	struct callout_cpu *cc;
-	char name[MAXCOMLEN];
-	int cpu;
+	int cpu, error;
 	bool pin_swi;
-	struct intr_event *ie;
 
+	p = NULL;
 	CPU_FOREACH(cpu) {
 		cc = CC_CPU(cpu);
-		snprintf(name, sizeof(name), "clock (%d)", cpu);
-		ie = NULL;
-		if (swi_add(&ie, name, softclock, cc, SWI_CLOCK,
-		    INTR_MPSAFE, &cc->cc_cookie))
-			panic("died while creating standard software ithreads");
+		error = kproc_kthread_add(softclock_thread, cc, &p, &td,
+		    RFSTOPPED, 0, "clock", "clock (%d)", cpu);
+		if (error != 0)
+			panic("failed to create softclock thread for cpu %d: %d",
+			    cpu, error);
+		CC_LOCK(cc);
+		cc->cc_thread = td;
+		thread_lock(td);
+		sched_class(td, PRI_ITHD);
+		sched_prio(td, PI_SWI(SWI_CLOCK));
+		TD_SET_IWAIT(td);
+		thread_lock_set(td, (struct mtx *)&cc->cc_lock);
+		thread_unlock(td);
 		if (cpu == cc_default_cpu)
 			pin_swi = pin_default_swi;
 		else
 			pin_swi = pin_pcpu_swi;
-		if (pin_swi && (intr_event_bind(ie, cpu) != 0)) {
-			printf("%s: %s clock couldn't be pinned to cpu %d\n",
-			    __func__,
-			    cpu == cc_default_cpu ? "default" : "per-cpu",
-			    cpu);
+		if (pin_swi) {
+			error = cpuset_setithread(td->td_tid, cpu);
+			if (error != 0)
+				printf("%s: %s clock couldn't be pinned to cpu %d: %d\n",
+				    __func__, cpu == cc_default_cpu ?
+				    "default" : "per-cpu", cpu, error);
 		}
 	}
 }
@@ -418,6 +433,7 @@ callout_process(sbintime_t now)
 	struct callout *tmp, *tmpn;
 	struct callout_cpu *cc;
 	struct callout_list *sc;
+	struct thread *td;
 	sbintime_t first, last, max, tmp_max;
 	uint32_t lookahead;
 	u_int firstb, lastb, nowb;
@@ -529,13 +545,15 @@ next:
 	avg_mpcalls_dir += (mpcalls_dir * 1000 - avg_mpcalls_dir) >> 8;
 	avg_lockcalls_dir += (lockcalls_dir * 1000 - avg_lockcalls_dir) >> 8;
 #endif
-	mtx_unlock_spin_flags(&cc->cc_lock, MTX_QUIET);
-	/*
-	 * swi_sched acquires the thread lock, so we don't want to call it
-	 * with cc_lock held; incorrect locking order.
-	 */
-	if (!TAILQ_EMPTY(&cc->cc_expireq))
-		swi_sched(cc->cc_cookie, 0);
+	if (!TAILQ_EMPTY(&cc->cc_expireq)) {
+		td = cc->cc_thread;
+		if (TD_AWAITING_INTR(td)) {
+			TD_CLR_IWAIT(td);
+			sched_add(td, SRQ_INTR);
+		} else
+			mtx_unlock_spin_flags(&cc->cc_lock, MTX_QUIET);
+	} else
+		mtx_unlock_spin_flags(&cc->cc_lock, MTX_QUIET);
 }
 
 static struct callout_cpu *
@@ -797,38 +815,57 @@ skip:
  */
 
 /*
- * Software (low priority) clock interrupt.
+ * Software (low priority) clock interrupt thread handler.
  * Run periodic events from timeout queue.
  */
-void
-softclock(void *arg)
+static void
+softclock_thread(void *arg)
 {
+	struct thread *td = curthread;
 	struct callout_cpu *cc;
 	struct callout *c;
 #ifdef CALLOUT_PROFILING
-	int depth = 0, gcalls = 0, lockcalls = 0, mpcalls = 0;
+	int depth, gcalls, lockcalls, mpcalls;
 #endif
 
 	cc = (struct callout_cpu *)arg;
 	CC_LOCK(cc);
-	while ((c = TAILQ_FIRST(&cc->cc_expireq)) != NULL) {
-		TAILQ_REMOVE(&cc->cc_expireq, c, c_links.tqe);
-		softclock_call_cc(c, cc,
+	for (;;) {
+		while (TAILQ_EMPTY(&cc->cc_expireq)) {
+			/*
+			 * Use CC_LOCK(cc) as the thread_lock while
+			 * idle.
+			 */
+			thread_lock(td);
+			thread_lock_set(td, (struct mtx *)&cc->cc_lock);
+			TD_SET_IWAIT(td);
+			mi_switch(SW_VOL | SWT_IWAIT);
+
+			/* mi_switch() drops thread_lock(). */
+			CC_LOCK(cc);
+		}
+
 #ifdef CALLOUT_PROFILING
-		    &mpcalls, &lockcalls, &gcalls,
+		depth = gcalls = lockcalls = mpcalls = 0;
 #endif
-		    0);
+		while ((c = TAILQ_FIRST(&cc->cc_expireq)) != NULL) {
+			TAILQ_REMOVE(&cc->cc_expireq, c, c_links.tqe);
+			softclock_call_cc(c, cc,
 #ifdef CALLOUT_PROFILING
-		++depth;
+			    &mpcalls, &lockcalls, &gcalls,
 #endif
-	}
+			    0);
 #ifdef CALLOUT_PROFILING
-	avg_depth += (depth * 1000 - avg_depth) >> 8;
-	avg_mpcalls += (mpcalls * 1000 - avg_mpcalls) >> 8;
-	avg_lockcalls += (lockcalls * 1000 - avg_lockcalls) >> 8;
-	avg_gcalls += (gcalls * 1000 - avg_gcalls) >> 8;
+			++depth;
 #endif
-	CC_UNLOCK(cc);
+		}
+#ifdef CALLOUT_PROFILING
+		avg_depth += (depth * 1000 - avg_depth) >> 8;
+		avg_mpcalls += (mpcalls * 1000 - avg_mpcalls) >> 8;
+		avg_lockcalls += (lockcalls * 1000 - avg_lockcalls) >> 8;
+		avg_gcalls += (gcalls * 1000 - avg_gcalls) >> 8;
+#endif
+	}
 }
 
 void
diff --git a/sys/sys/systm.h b/sys/sys/systm.h
index ffe014eb8b42..25a2d0c41e82 100644
--- a/sys/sys/systm.h
+++ b/sys/sys/systm.h
@@ -471,7 +471,6 @@ int	sysbeep(int hertz, sbintime_t duration);
 
 void	hardclock(int cnt, int usermode);
 void	hardclock_sync(int cpu);
-void	softclock(void *);
 void	statclock(int cnt, int usermode);
 void	profclock(int cnt, int usermode, uintfptr_t pc);