PERFORCE change 60246 for review

Julian Elischer julian at FreeBSD.org
Sun Aug 22 01:14:08 PDT 2004


http://perforce.freebsd.org/chv.cgi?CH=60246

Change 60246 by julian at julian_ref on 2004/08/22 08:13:44

	put "e.diff" into nsched branch of p4

Affected files ...

.. //depot/projects/nsched/sys/alpha/alpha/machdep.c#4 edit
.. //depot/projects/nsched/sys/amd64/amd64/machdep.c#7 edit
.. //depot/projects/nsched/sys/arm/sa11x0/assabet_machdep.c#3 edit
.. //depot/projects/nsched/sys/ddb/db_ps.c#5 edit
.. //depot/projects/nsched/sys/i386/i386/machdep.c#13 edit
.. //depot/projects/nsched/sys/ia64/ia64/machdep.c#5 edit
.. //depot/projects/nsched/sys/kern/init_main.c#11 edit
.. //depot/projects/nsched/sys/kern/kern_exec.c#10 edit
.. //depot/projects/nsched/sys/kern/kern_exit.c#17 edit
.. //depot/projects/nsched/sys/kern/kern_fork.c#9 edit
.. //depot/projects/nsched/sys/kern/kern_kse.c#23 edit
.. //depot/projects/nsched/sys/kern/kern_proc.c#11 edit
.. //depot/projects/nsched/sys/kern/kern_switch.c#5 edit
.. //depot/projects/nsched/sys/kern/kern_synch.c#9 edit
.. //depot/projects/nsched/sys/kern/kern_thr.c#12 edit
.. //depot/projects/nsched/sys/kern/kern_thread.c#32 edit
.. //depot/projects/nsched/sys/kern/sched_4bsd.c#31 edit
.. //depot/projects/nsched/sys/kern/sched_ule.c#19 edit
.. //depot/projects/nsched/sys/pc98/i386/machdep.c#8 edit
.. //depot/projects/nsched/sys/powerpc/powerpc/machdep.c#4 edit
.. //depot/projects/nsched/sys/sparc64/sparc64/machdep.c#5 edit
.. //depot/projects/nsched/sys/sys/proc.h#22 edit
.. //depot/projects/nsched/sys/sys/sched.h#11 edit

Differences ...

==== //depot/projects/nsched/sys/alpha/alpha/machdep.c#4 (text+ko) ====

@@ -846,7 +846,7 @@
 
 	}
 
-	proc_linkup(&proc0, &ksegrp0, &kse0, &thread0);
+	proc_linkup(&proc0, &ksegrp0, &thread0);
 	/*
 	 * Init mapping for u page(s) for proc 0
 	 */
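
The proc_linkup() callers in each machdep.c lose their struct kse argument, since KSE allocation now happens behind the scheduler's concurrency interface. Based only on the call sites in this change (the sys/proc.h hunk is not visible in this mail), the prototype presumably changes roughly as in this sketch:

	/* before (sketch) */
	void	proc_linkup(struct proc *p, struct ksegrp *kg,
		    struct kse *ke, struct thread *td);
	/* after (sketch) */
	void	proc_linkup(struct proc *p, struct ksegrp *kg, struct thread *td);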

==== //depot/projects/nsched/sys/amd64/amd64/machdep.c#7 (text+ko) ====

@@ -1112,7 +1112,7 @@
  	 * This may be done better later if it gets more high level
  	 * components in it. If so just link td->td_proc here.
 	 */
-	proc_linkup(&proc0, &ksegrp0, &kse0, &thread0);
+	proc_linkup(&proc0, &ksegrp0, &thread0);
 
 	preload_metadata = (caddr_t)(uintptr_t)(modulep + KERNBASE);
 	preload_bootstrap_relocate(KERNBASE);

==== //depot/projects/nsched/sys/arm/sa11x0/assabet_machdep.c#3 (text+ko) ====

@@ -370,7 +370,7 @@
 
 	/* Set stack for exception handlers */
 	
-	proc_linkup(&proc0, &ksegrp0, &kse0, &thread0);
+	proc_linkup(&proc0, &ksegrp0, &thread0);
 	proc0.p_uarea = (struct user *) proc0_uarea.pv_va;
 	thread0.td_kstack = kernelstack.pv_va;
 	thread0.td_pcb = (struct pcb *)

==== //depot/projects/nsched/sys/ddb/db_ps.c#5 (text+ko) ====


==== //depot/projects/nsched/sys/i386/i386/machdep.c#13 (text+ko) ====

@@ -1943,17 +1943,15 @@
 	int gsel_tss, metadata_missing, off, x;
 	struct pcpu *pc;
 
-	/* 
-	 * Set up things that proc0 would have associated with it already 
-	 * if it were taken from the process allocation cache.
-	 * This includes a ksegrp, a thread, and a stack and uarea to go
-	 * with the thread. The pcb is deliniated ready for use.
-	 * Note that the stack for proc0 has no guard page.
-	 */
 	proc0.p_uarea = proc0uarea;
 	thread0.td_kstack = proc0kstack;
 	thread0.td_pcb = (struct pcb *)
 	   (thread0.td_kstack + KSTACK_PAGES * PAGE_SIZE) - 1;
+
+	/*
+ 	 * This may be done better later if it gets more high level
+ 	 * components in it. If so just link td->td_proc here.
+	 */
 	proc_linkup(&proc0, &ksegrp0, &thread0);
 
 	metadata_missing = 0;

==== //depot/projects/nsched/sys/ia64/ia64/machdep.c#5 (text+ko) ====

@@ -724,7 +724,7 @@
 	msgbufp = (struct msgbuf *)pmap_steal_memory(MSGBUF_SIZE);
 	msgbufinit(msgbufp, MSGBUF_SIZE);
 
-	proc_linkup(&proc0, &ksegrp0, &kse0, &thread0);
+	proc_linkup(&proc0, &ksegrp0, &thread0);
 	/*
 	 * Init mapping for u page(s) for proc 0
 	 */

==== //depot/projects/nsched/sys/kern/init_main.c#11 (text+ko) ====

@@ -338,11 +338,11 @@
 	kseinit();	/* set up kse specific stuff  e.g. upcall zone*/
 
 	/*
-	 * initialise scheduler resources.
+	 * Initialise scheduler resources.
 	 * Add scheduler specific parts to proc, ksegrp, thread as needed.
 	 */
 	schedinit();	/* scheduler gets its house in order */
-
+	schedinit2();	/* temporary */
 	/*
 	 * Initialize sleep queue hash table
 	 */
@@ -370,8 +370,6 @@
 	session0.s_leader = p;
 
 	p->p_sysent = &null_sysvec;
-
-
 	p->p_flag = P_SYSTEM;
 	p->p_sflag = PS_INMEM;
 	p->p_state = PRS_NORMAL;

==== //depot/projects/nsched/sys/kern/kern_exec.c#10 (text+ko) ====

@@ -53,7 +53,6 @@
 #include <sys/namei.h>
 #include <sys/sf_buf.h>
 #include <sys/sysent.h>
-#include <sys/sched.h>
 #include <sys/shm.h>
 #include <sys/sysctl.h>
 #include <sys/user.h>
@@ -255,7 +254,7 @@
 	PROC_LOCK(p);
 	KASSERT((p->p_flag & P_INEXEC) == 0,
 	    ("%s(): process already has P_INEXEC flag", __func__));
-	if (p->p_flag & P_SA || p->p_numthreads > 1) {
+	if (p->p_flag & P_HADTHREADS) {
 		if (thread_single(SINGLE_EXIT)) {
 			PROC_UNLOCK(p);
 			mtx_unlock(&Giant);
@@ -263,18 +262,8 @@
 		}
 		/*
 		 * If we get here all other threads are dead,
-		 * so unset the associated flags and lose KSE mode.
-		 * This meams that we must get rid of any extra
-		 * upcalls and kses we may have picked up along the way.
+		 * and threading mode has been turned off.
 		 */
-		mtx_lock_spin(&sched_lock);
-		sched_set_concurrency(td->td_ksegrp, 1);
-		upcall_remove(td);
-		mtx_unlock_spin(&sched_lock);
-		p->p_flag &= ~(P_SA|P_HADTHREADS);
-		td->td_mailbox = NULL;
-		td->td_pflags &= ~TDP_SA;
-		thread_single_end();
 	}
 	p->p_flag |= P_INEXEC;
 	PROC_UNLOCK(p);
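
Here and in kern_exit.c below, the old "P_SA || p_numthreads > 1" test collapses to a single P_HADTHREADS check, and the hand-rolled teardown (sched_set_concurrency(kg, 1), upcall_remove(), flag clearing, thread_single_end()) disappears from the caller. The working assumption is that thread_single(SINGLE_EXIT) now leaves the process single-threaded with threading mode already turned off, so the caller pattern reduces to something like this sketch:

	PROC_LOCK(p);
	if (p->p_flag & P_HADTHREADS) {
		if (thread_single(SINGLE_EXIT)) {
			/* could not single-thread the process; back out */
			PROC_UNLOCK(p);
			return (ERESTART);
		}
		/* all other threads are dead; threading mode is off */
	}
	p->p_flag |= P_INEXEC;
	PROC_UNLOCK(p);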

==== //depot/projects/nsched/sys/kern/kern_exit.c#17 (text+ko) ====

@@ -134,7 +134,7 @@
 	 * MUST abort all other threads before proceeding past here.
 	 */
 	PROC_LOCK(p);
-	if (p->p_flag & P_SA || p->p_numthreads > 1) {
+	if (p->p_flag & P_HADTHREADS) {
 retry:
 		/*
 		 * First check if some other thread got here before us..
@@ -164,18 +164,8 @@
 			goto retry;
 		/*
 		 * All other activity in this process is now stopped.
-		 * Remove excess KSEs and KSEGRPS. XXXKSE (when we have them)
-		 * ...
-		 * Turn off threading support.
+		 * Threading support has been turned off.
 		 */
-		mtx_lock_spin(&sched_lock);
-		sched_set_concurrency(td->td_ksegrp, 1);
-		upcall_remove(td);
-		mtx_unlock_spin(&sched_lock);
-		p->p_flag &= ~(P_SA|P_HADTHREADS);
-		td->td_mailbox = NULL;
-		td->td_pflags &= ~TDP_SA;
-		thread_single_end();
 	}
 
 	p->p_flag |= P_WEXIT;
@@ -488,7 +478,7 @@
 	 * Finally, call machine-dependent code to release the remaining
 	 * resources including address space.
 	 * The address space is released by "vmspace_exitfree(p)" in
-	 * vm_waitproc(). To be called BEFORE taking the final schedlock.
+	 * vm_waitproc().
 	 */
 	cpu_exit(td);
 
@@ -533,7 +523,8 @@
 	knlist_destroy(&p->p_klist);
 
 	/*
-	 * Make sure the system takes this thread out of its tables etc.
+	 * Make sure the scheduler takes this thread out of its tables etc.
+	 * This will also release this thread's reference to the ucred.
 	 * Other thread parts to release include pcb bits and such.
 	 */
 	thread_exit();
@@ -681,8 +672,6 @@
 
 			/*
 			 * do any thread-system specific cleanups
-			 * like freeing the thread's ucred,
-			 * and any spare threads, etc.
 			 */
 			thread_wait(p);
 
@@ -695,6 +684,8 @@
 #ifdef MAC
 			mac_destroy_proc(p);
 #endif
+			KASSERT(FIRST_THREAD_IN_PROC(p),
+			    ("kern_wait: no residual thread!"));
 			uma_zfree(proc_zone, p);
 			sx_xlock(&allproc_lock);
 			nprocs--;

==== //depot/projects/nsched/sys/kern/kern_fork.c#9 (text+ko) ====

@@ -508,7 +508,7 @@
 	 * Allow the scheduler to adjust the priority of the child and
 	 * parent while we hold the sched_lock.
 	 */
-	sched_fork(td, p2);
+	sched_fork(td, td2);
 
 	mtx_unlock_spin(&sched_lock);
 	p2->p_ucred = crhold(td->td_ucred);
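
sched_fork() is now handed the new thread (td2) rather than the new process (p2), matching the thread-granular sched_fork_thread()/sched_fork_kse() calls used elsewhere in this change. The sys/sched.h hunk is not visible here, but the prototypes are presumably along these lines (a sketch inferred from the call sites):

	void	sched_fork(struct thread *td, struct thread *childtd);
	void	sched_fork_kse(struct thread *td, struct kse *childke);
	void	sched_fork_thread(struct thread *td, struct thread *childtd);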

==== //depot/projects/nsched/sys/kern/kern_kse.c#23 (text+ko) ====

@@ -325,9 +325,8 @@
 		psignal(p, SIGSEGV);
 	mtx_lock_spin(&sched_lock);
 	upcall_remove(td);
-
 	if (p->p_numthreads != 1) {
-		/* 
+		/*
 		 * If we are not the last thread, but we are the last
 		 * thread in this ksegrp, then by definition this is not
 		 * the last group and we need to clean it up as well.
@@ -337,21 +336,21 @@
 		thread_exit();
 		/* NOTREACHED */
 	}
-	/* 
+	/*
 	 * This is the last thread. Just return to the user.
 	 * We know that there is only one ksegrp too, as any others
 	 * would have been discarded in previous calls to thread_exit().
 	 * Effectively we have left threading mode..
-	 * The only real thing left to do is ensure that the 
-	 * scheduler sets out concurrency back to 1 as that may be a 
+	 * The only real thing left to do is ensure that the
+	 * scheduler sets our concurrency back to 1 as that may be a
 	 * resource leak otherwise.
 	 * This is an A[PB]I issue.. what SHOULD we do?
 	 * One possibility is to return to the user. It may not cope well.
 	 * The other possibility would be to let the process exit.
 	 */
-	p->p_flag &= ~P_SA;
+	p->p_flag &= ~(P_SA|P_HADTHREADS);
+	sched_reset_concurrency(td->td_ksegrp);
 	mtx_unlock_spin(&sched_lock);
-	sched_set_concurrency(td->td_ksegrp, 1);
 	PROC_UNLOCK(p);
 #if 1
 	return (0);
@@ -501,6 +500,10 @@
 /*
  * No new KSEG: first call: use current KSE, don't schedule an upcall
  * All other situations, do allocate max new KSEs and schedule an upcall.
+ *
+ * XXX should be changed so that the 'first' behaviour lasts for as long
+ * as no kse has been made in this ksegrp, i.e. as long as we do not have
+ * a mailbox.
  */
 /* struct kse_create_args {
 	struct kse_mailbox *mbx;
@@ -521,6 +524,13 @@
 	if ((err = copyin(uap->mbx, &mbx, sizeof(mbx))))
 		return (err);
 
+	/*
+	 * Processes using the other threading model can't
+	 * suddenly start calling this one.
+	 */
+	if ((p->p_flag & (P_SA|P_HADTHREADS)) == P_HADTHREADS)
+		return (EINVAL);
+
 	ncpus = mp_ncpus;
 	if (virtual_cpu != 0)
 		ncpus = virtual_cpu;
@@ -605,15 +615,17 @@
 		 * Initialize KSE group with the appropriate
 		 * concurrency.
 		 *
-		 * For multiplxed group, set concurrency equal to physical
-		 * cpus. This increases concurrent even if userland
-		 * is not MP safe and can only run on single CPU.
-		 * In ideal world, every physical cpu should execute a thread.
-		 * If there is enough KSEs, threads in kernel can be
-		 * executed parallel on different cpus with full speed,
-		 * Concurrent in kernel shouldn't be restricted by number of
-		 * upcalls userland provides. Adding more upcall structures
-		 * only increases concurrent in userland.
+		 * For a multiplexed group, create as much concurrency
+		 * as the number of physical cpus.
+		 * This increases concurrency in the kernel even if the
+		 * userland is not MP safe and can only run on a single CPU.
+		 * In an ideal world, every physical cpu should execute a
+		 * thread.  If there is enough concurrency, threads in the
+		 * kernel can be executed in parallel on different cpus at
+		 * full speed without being restricted by the number of
+		 * upcalls the userland provides.
+		 * Adding more upcall structures only increases concurrency
+		 * in userland.
 		 *
 		 * For a bound thread group, because there is only one thread
 		 * in the group, we only set the concurrency for the group 
@@ -767,7 +779,6 @@
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_CACHE, 0);
 }
 
-
 /*
  * Stash an embarasingly extra upcall into the zombie upcall queue.
  */
@@ -1002,7 +1013,8 @@
 /*
  * This function is intended to be used to initialize a spare thread
  * for upcall. Initialize thread's large data area outside sched_lock
- * for thread_schedule_upcall().
+ * for thread_schedule_upcall(). The crhold is also done here to keep it
+ * out from under the schedlock, as it is itself a mutex operation.
  */
 void
 thread_alloc_spare(struct thread *td)
@@ -1011,7 +1023,8 @@
 
 	if (td->td_standin)
 		return;
-	td->td_standin = spare =  thread_alloc();
+	spare = thread_alloc();
+	td->td_standin = spare;
 	bzero(&spare->td_startzero,
 	    (unsigned) RANGEOF(struct thread, td_startzero, td_endzero));
 	spare->td_proc = td->td_proc;
@@ -1056,11 +1069,13 @@
 	ku->ku_owner   = td2;
 	td2->td_upcall = ku;
 	td2->td_flags  = 0;
-	td2->td_pflags = (TDP_SA | TDP_UPCALLING);
+	td2->td_pflags = TDP_SA|TDP_UPCALLING;
+	td2->td_kse    = NULL;
 	td2->td_state  = TDS_CAN_RUN;
 	td2->td_inhibitors = 0;
 	SIGFILLSET(td2->td_sigmask);
 	SIG_CANTMASK(td2->td_sigmask);
+	sched_fork_thread(td, td2);
 	return (td2);	/* bogus.. should be a void function */
 }
 
@@ -1181,8 +1196,6 @@
 	    (ku->ku_mflags & KMF_NOUPCALL)) {
 		td->td_mailbox = NULL;
 	} else {
-		if (td->td_standin == NULL)
-			thread_alloc_spare(td);
 		flags = fuword32(&tmbx->tm_flags);
 		/*
 		 * On some architectures, TP register points to thread
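
The new EINVAL check near the top of kse_create() keys off the combination of P_SA and P_HADTHREADS. Reading the (p->p_flag & (P_SA|P_HADTHREADS)) == P_HADTHREADS test as a table (an interpretation of the check, not text from the change):

	/*
	 *  P_SA  P_HADTHREADS   process state                    kse_create()
	 *   0         0         has never been threaded          proceeds
	 *   0         1         was threaded via thr_*() only    EINVAL
	 *   1         1         already in KSE threading mode    proceeds
	 */

In other words, a process that has already committed to the thr_*() model cannot switch to KSE mode mid-flight.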

==== //depot/projects/nsched/sys/kern/kern_proc.c#11 (text+ko) ====

@@ -100,8 +100,6 @@
 SYSCTL_INT(_kern, OID_AUTO, kstack_pages, CTLFLAG_RD, &kstack_pages, 0, "");
 SYSCTL_INT(_kern, OID_AUTO, uarea_pages, CTLFLAG_RD, &uarea_pages, 0, "");
 
-#define RANGEOF(type, start, end) (offsetof(type, end) - offsetof(type, start))
-
 CTASSERT(sizeof(struct kinfo_proc) == KINFO_PROC_SIZE);
 
 /*
@@ -127,7 +125,6 @@
 
 /*
  * Prepare a proc for use.
- * cache->used
  */
 static int
 proc_ctor(void *mem, int size, void *arg, int flags)
@@ -140,7 +137,6 @@
 
 /*
  * Reclaim a proc after use.
- * used -> cache
  */
 static void
 proc_dtor(void *mem, int size, void *arg)
@@ -151,7 +147,7 @@
 
 	/* INVARIANTS checks go here */
 	p = (struct proc *)mem;
-	td = FIRST_THREAD_IN_PROC(p);
+        td = FIRST_THREAD_IN_PROC(p);
 #ifdef INVARIANTS
 	KASSERT((p->p_numthreads == 1),
 	    ("bad number of threads in exiting process"));
@@ -171,7 +167,6 @@
 
 /*
  * Initialize type-stable parts of a proc (when newly created).
- * raw memory -> cache
  */
 static int
 proc_init(void *mem, int size, int flags)
@@ -185,15 +180,15 @@
 	vm_proc_new(p);
 	td = thread_alloc();
 	kg = ksegrp_alloc();
+	bzero(&p->p_mtx, sizeof(struct mtx));
 	mtx_init(&p->p_mtx, "process lock", NULL, MTX_DEF | MTX_DUPOK);
 	proc_linkup(p, kg, td);
-	sched_newproc(p, kg, td); /* err? */
+	sched_newproc(p, kg, td);
 	return (0);
 }
 
 /*
  * Tear down type-stable parts of a proc (just before being discarded)
- * cache -> free memeory
  */
 static void
 proc_fini(void *mem, int size)
@@ -765,8 +760,8 @@
 		kp->ki_kstack = (void *)td->td_kstack;
 		kp->ki_pctcpu = sched_pctcpu(td);
 
+		/* Things in the kse */
 #if 0
-		/* Things in the kse */
 		if (ke)
 			kp->ki_rqindex = ke->ke_rqindex;
 		else
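
The dropped one-line comments on proc_ctor()/proc_dtor()/proc_init()/proc_fini() described the UMA state transitions (cache->used, used->cache, raw memory->cache, cache->free memory). The lifecycle itself is unchanged; the callbacks are registered when the proc zone is created, roughly as in the sketch below (procinit() itself is not part of the hunks shown, so treat the exact flags as an assumption):

	proc_zone = uma_zcreate("PROC", sched_sizeof_proc(),
	    proc_ctor,	/* cache -> used */
	    proc_dtor,	/* used -> cache */
	    proc_init,	/* raw memory -> cache; calls sched_newproc() */
	    proc_fini,	/* cache -> free memory; calls sched_destroyproc() */
	    UMA_ALIGN_PTR, UMA_ZONE_NOFREE);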

==== //depot/projects/nsched/sys/kern/kern_switch.c#5 (text+ko) ====

@@ -90,6 +90,7 @@
 
 #include "opt_full_preemption.h"
 
+#ifndef KERN_SWITCH_INCLUDE
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kdb.h>
@@ -100,6 +101,7 @@
 #include <sys/proc.h>
 #include <sys/queue.h>
 #include <sys/sched.h>
+#else  /* KERN_SWITCH_INCLUDE */
 #if defined(SMP) && (defined(__i386__) || defined(__amd64__))
 #include <sys/smp.h>
 #endif
@@ -223,6 +225,7 @@
 	TAILQ_INSERT_TAIL(&kg->kg_iq, ke, ke_kgrlist);
 	kg->kg_idle_kses++;
 	CTR1(KTR_RUNQ, "kse_reassign: ke%p on idle queue", ke);
+	sched_check_concurrency(kg); /* could implement directly */
 	return;
 }
 
@@ -837,3 +840,348 @@
 }
 #endif
 
+/****** functions that are temporarily here ***********/
+#include <vm/uma.h>
+#define RANGEOF(type, start, end) (offsetof(type, end) - offsetof(type, start))
+static uma_zone_t kse_zone;
+TAILQ_HEAD(, kse) zombie_kses = TAILQ_HEAD_INITIALIZER(zombie_kses);
+extern struct mtx kse_zombie_lock;
+
+/*
+ * Initialize type-stable parts of a kse (when newly created).
+ */
+static int
+kse_init(void *mem, int size, int flags)
+{
+	struct kse	*ke;
+
+	ke = (struct kse *)mem;
+	ke->ke_sched = (struct ke_sched *)&ke[1];
+	return (0);
+}
+
+/*
+ * Allocate a kse.
+ */
+static struct kse *
+kse_alloc(void)
+{
+	return (uma_zalloc(kse_zone, M_WAITOK));
+}
+
+/*
+ * Deallocate a kse.
+ */
+void
+kse_free(struct kse *ke)
+{
+	uma_zfree(kse_zone, ke);
+}
+
+/*
+ * KSE is linked into kse group.
+ * If we know the thread at this time attach to it,
+ * otherwise put it on the idle kse queue.
+ * Called from:
+ *  sched_init_concurrency()  schedlock
+ *  sched_set_concurrency()  schedlock
+ *  schedinit2()  NO schedlock (too early)
+ */
+static void
+kse_link(struct kse *ke, struct ksegrp *kg, struct thread *td)
+{
+	struct proc *p = kg->kg_proc;
+
+	TAILQ_INSERT_HEAD(&kg->kg_kseq, ke, ke_kglist);
+	kg->kg_kses++;
+	ke->ke_proc	= p;
+	ke->ke_ksegrp	= kg;
+	ke->ke_oncpu	= NOCPU;
+	ke->ke_flags	= 0;
+	if (td) {
+		ke->ke_state	= KES_THREAD;
+		td->td_kse = ke;
+		ke->ke_thread	= td;
+	} else {
+		TAILQ_INSERT_TAIL(&kg->kg_iq, ke, ke_kgrlist);
+		kg->kg_idle_kses++;
+		ke->ke_state	= KES_IDLE;
+		ke->ke_thread	= NULL;
+	}
+}
+
+/*
+ * Stash an embarrassingly extra kse into the zombie kse queue.
+ */
+static void
+kse_stash(struct kse *ke)
+{
+	mtx_lock_spin(&kse_zombie_lock);
+	TAILQ_INSERT_HEAD(&zombie_kses, ke, ke_procq);
+	mtx_unlock_spin(&kse_zombie_lock);
+}
+
+/* 
+ * Called from:
+ *  sched_set_concurrency()
+ *  sched_reset_concurrency()
+ *  sched_check_concurrency()
+ */
+static void
+kse_unlink(struct kse *ke)
+{
+	struct ksegrp *kg;
+
+	mtx_assert(&sched_lock, MA_OWNED);
+	kg = ke->ke_ksegrp;
+	TAILQ_REMOVE(&kg->kg_kseq, ke, ke_kglist);
+	if (ke->ke_state == KES_IDLE) {
+		TAILQ_REMOVE(&kg->kg_iq, ke, ke_kgrlist);
+		kg->kg_idle_kses--;
+	}
+	--kg->kg_kses;
+	/*
+	 * Aggregate stats from the KSE
+	 */
+	kse_stash(ke);
+}
+
+
+/*
+ * Concurrency is implemented using the number of KSEs.
+ * This will be re-implemented using another method, so
+ * isolate the details with a simple API.
+ * Once the API has been implemented, we can switch out the
+ * underlying implementation.
+ */
+
+/*
+ * Allocate scheduler-specific per-process resources.
+ * The thread and ksegrp have already been linked in.
+ * Called from:
+ *  proc_init() (UMA init method)
+ */
+void
+sched_newproc(struct proc *p, struct ksegrp *kg, struct thread *td)
+{
+
+	sched_init_concurrency(kg, td);
+}
+
+/*
+ * Called by the uma process fini routine.
+ * Undo anything we may have done in the uma_init method.
+ * Panic if it's not all 1:1:1:1.
+ * Called from:
+ *  proc_fini() (UMA method)
+ */
+void
+sched_destroyproc(struct proc *p)
+{
+	struct ksegrp *kg;
+	
+	KASSERT((p->p_numthreads == 1), ("Cached proc with > 1 thread "));
+	KASSERT((p->p_numksegrps == 1), ("Cached proc with > 1 ksegrp "));
+	kg = FIRST_KSEGRP_IN_PROC(p);
+	KASSERT((kg->kg_kses == 1), ("Cached proc with > 1 kse "));
+	kse_free(TAILQ_FIRST(&kg->kg_kseq));
+}
+
+/*
+ * A thread is being either created or recycled.
+ * Fix up the per-scheduler resources associated with it.
+ * Called from:
+ *  thread_dtor()
+ *  thread_init()
+ */
+void
+sched_newthread(struct thread *td)
+{
+	td->td_last_kse = NULL;
+	td->td_kse      = NULL;
+}
+
+/*
+ * (Re) assign resources to allow the ksegrp to implement
+ * the requested concurrency. At this time it means allocating
+ * or freeing KSE structures.
+ * We may not remove all the KSEs if there are enough threads in the
+ * ksegrp to justify them. They will eventually go away as they are added
+ * to the free kse queue and threads exit.
+ */
+
+/*
+ * Set up an initial concurrency of 1
+ * and set the given thread (if given) to be using that
+ * concurrency slot.
+ * May be used "offline", i.e. before the ksegrp is attached to the world,
+ * and thus wouldn't need schedlock in that case.
+ * Called from:
+ *  thr_create()
+ *  proc_init() (UMA) via sched_newproc() 
+ */
+void
+sched_init_concurrency(struct ksegrp *kg, struct thread *td)
+{
+	struct kse *newke;
+
+	newke = kse_alloc();
+	if (newke == NULL)
+		panic("sched_init_concurrency: no kse allocated");
+	kg->kg_concurrency = 1;
+	mtx_lock_spin(&sched_lock);
+	kse_link(newke, kg, td);
+	mtx_unlock_spin(&sched_lock);
+}
+
+/*
+ * Change the concurrency of an existing ksegrp to N
+ * Called from:
+ *  kse_create()
+ */
+void
+sched_set_concurrency(struct ksegrp *kg, int concurrency)
+{
+	struct kse *newke;
+
+	kg->kg_concurrency = concurrency;
+	/* Handle the case for a declining concurrency */
+	while ((kg->kg_concurrency < kg->kg_kses) &&
+    	    (kg->kg_idle_kses > 0) &&
+	    (kg->kg_kses > kg->kg_numthreads)) {
+		kse_unlink(TAILQ_FIRST(&kg->kg_iq));
+	}
+	while (kg->kg_kses < kg->kg_concurrency) {
+		newke = kse_alloc();
+		if (newke == NULL)
+			panic("sched_set_concurrency: no kse allocated");
+		bzero(&newke->ke_startzero, RANGEOF(struct kse,
+		      ke_startzero, ke_endzero));
+		mtx_lock_spin(&sched_lock);
+		kse_link(newke, kg, NULL);
+		sched_fork_kse(curthread, newke);
+		mtx_unlock_spin(&sched_lock);
+	}
+}
+
+/* 
+ * This is a temporary function to remove the KSE(s) from a ksegrp when the
+ * ksegrp is being destroyed, so we need not keep it consistent.
+ * Called from:
+ *  thread_exit()
+ *  Not called from sched_destroyproc() which does it itself.
+ */
+void
+sched_remove_concurrency(struct ksegrp *kg)
+{
+	struct kse *ke, *nextke;
+	ke = TAILQ_FIRST(&kg->kg_kseq);
+	while (ke != NULL) {
+		nextke = TAILQ_NEXT(ke, ke_kglist);
+		kse_stash(ke);
+		ke = nextke;
+	}
+	/* set these down just so later kasserts don't trigger */
+	kg->kg_kses = 0;
+	kg->kg_idle_kses = 0;
+	kg->kg_concurrency = 0;
+}
+
+/* 
+ * Whenever we have idle KSEs and there are too many for the concurrency,
+ * then free as many as we can. Don't free too many if we have threads
+ * to run/kill.
+ * Called from:
+ *  kse_reassign().
+ */
+void
+sched_check_concurrency(struct ksegrp *kg)
+{
+	while ((kg->kg_concurrency < kg->kg_kses) &&
+    	    (kg->kg_idle_kses > 0) &&
+	    (kg->kg_kses > kg->kg_numthreads)) {
+		kse_unlink(TAILQ_FIRST(&kg->kg_iq));
+	}
+}
+
+/*
+ * Reset the concurrency back to 1, whatever that means to this scheduler.
+ * In this case, free all but one KSE.
+ * Called from:
+ *  thread_single(SINGLE_EXIT)  (e.g. in execve and exit)
+ *  kse_exit()
+ */
+void
+sched_reset_concurrency(struct ksegrp *kg)
+{
+	KASSERT((kg->kg_numthreads == 1),
+	    ("sched_reset_concurrency: Nthread != 1"));
+	mtx_assert(&sched_lock, MA_OWNED);
+	kg->kg_concurrency = 1;
+	while ((kg->kg_concurrency < kg->kg_kses) &&
+    	    (kg->kg_idle_kses > 0) &&
+	    (kg->kg_kses > kg->kg_numthreads)) {
+		kse_unlink(TAILQ_FIRST(&kg->kg_iq));
+	}
+}
+
+/*
+ * Very early in the boot some setup of scheduler-specific
+ * parts of proc0 and of some scheduler resources needs to be done.
+ * This is temporary; it will merge with schedinit() in each scheduler
+ * soon.
+ * Called from:
+ *  proc0_init()
+ */
+void
+schedinit2(void)
+{
+	/*
+	 * Put our per-scheduler struct (kse0) on the initial ksegrp/thread;
+	 * as thread0 is supplied, kse_link() attaches it there directly.
+	 */
+	kse_link(&kse0, &ksegrp0, &thread0);
+
+	kse_zone = uma_zcreate("KSE", sched_sizeof_kse(),
+	    NULL, NULL, kse_init, NULL, UMA_ALIGN_CACHE, 0);
+}
+
+void
+sched_thread_exit(struct thread *td)
+{
+	struct kse *ke;
+
+	ke = td->td_kse;
+
+	if ((td->td_proc->p_flag & P_SA) && ke != NULL) {
+		ke->ke_thread = NULL;
+		td->td_kse = NULL;
+		kse_reassign(ke);
+	}
+}
+
+/*
+ * Reap zombie kse resource.
+ */
+void
+sched_GC(void)
+{
+	struct kse *ke_first, *ke_next;
+
+	/*
+	 * Don't even bother to lock if none at this instant,
+	 * we really don't care about the next instant..
+	 */
+	if (!TAILQ_EMPTY(&zombie_kses)) {
+		mtx_lock_spin(&kse_zombie_lock);
+		ke_first = TAILQ_FIRST(&zombie_kses);
+		if (ke_first)
+			TAILQ_INIT(&zombie_kses);
+		mtx_unlock_spin(&kse_zombie_lock);
+		while (ke_first) {
+			ke_next = TAILQ_NEXT(ke_first, ke_procq);
+			kse_free(ke_first);
+			ke_first = ke_next;
+		}
+	}
+}
+#endif /* KERN_SWITCH_INCLUDE */
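
The new #ifndef/#else/#endif structure means the bulk of kern_switch.c, including the block of "functions that are temporarily here" above, is only compiled when KERN_SWITCH_INCLUDE is defined, i.e. when the file is textually included by a scheduler; the #ifndef side keeps just the plain header includes for a standalone compile. The sched_4bsd.c and sched_ule.c hunks are not shown in this mail, but each presumably pulls the file in with something like:

	#define	KERN_SWITCH_INCLUDE 1
	#include "kern_switch.c"

This keeps the KSE bookkeeping next to the run-queue code without forcing both schedulers to duplicate it.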

==== //depot/projects/nsched/sys/kern/kern_synch.c#9 (text+ko) ====


==== //depot/projects/nsched/sys/kern/kern_thr.c#12 (text+ko) ====

@@ -44,69 +44,13 @@
 
 #include <machine/frame.h>
 
-extern int virtual_cpu;
+extern int max_threads_per_proc;
+extern int max_groups_per_proc;
+
 /*
  * Back end support functions.
  */
 
-void
-thr_exit1(void)
-{
-	struct ksegrp *kg;
-	struct thread *td;
-	struct proc *p;
-
-	td = curthread;
-	p = td->td_proc;
-	kg = td->td_ksegrp;
-
-	mtx_assert(&sched_lock, MA_OWNED);
-	PROC_LOCK_ASSERT(p, MA_OWNED);
-	KASSERT(!mtx_owned(&Giant), ("dying thread owns giant"));
-
-	/*
-	 * Shutting down last thread in the proc.  This will actually
-	 * call exit() in the trampoline when it returns.
-	 */
-	if (p->p_numthreads == 1) {
-		PROC_UNLOCK(p);
-		return;
-	}
-
-	/*
-	 * XXX Undelivered process wide signals should be reposted to the
-	 * proc.
-	 */
-
-	/* Clean up cpu resources. */
-	cpu_thread_exit(td);
-
-	/* let the scheduler know we are dying.. */
-	/* Lots in common with sched_thread_exit.. merge one day */
-	sched_thr_exit(td);
-
-	/* Unlink the thread from the process and kseg. */
-	thread_unlink(td);
-
-	/*
-	 * If we were stopped while waiting for all threads to exit and this
-	 * is the last thread wakeup the exiting thread.
-	 */
-	if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE)
-		if (p->p_numthreads == 1)
-			thread_unsuspend_one(p->p_singlethread);
-
-	PROC_UNLOCK(p);
-	td->td_state = TDS_INACTIVE;
-#if 0
-	td->td_proc = NULL;
-#endif
-	td->td_ksegrp = NULL;
-	thread_stash(td);
-
-	cpu_throw(td, choosethread(SW_VOL));
-}
-
 #define	RANGEOF(type, start, end) (offsetof(type, end) - offsetof(type, start))
 
 /*
@@ -116,57 +60,82 @@
 thr_create(struct thread *td, struct thr_create_args *uap)
     /* ucontext_t *ctx, long *id, int flags */
 {
-	struct thread *td0;
+	struct thread *newtd;
 	ucontext_t ctx;
 	long id;
 	int error;
-	int ncpus;
+	struct ksegrp *kg, *newkg;
+	struct proc *p;
 
+	p = td->td_proc;
+	kg = td->td_ksegrp;
 	if ((error = copyin(uap->ctx, &ctx, sizeof(ctx))))
 		return (error);
 
-	/* Initialize our td. */
-	td0 = thread_alloc();
-
+	/* There is a race here, but it is cheap. */
+	if ((p->p_numksegrps >= max_groups_per_proc) ||
+	    (p->p_numthreads >= max_threads_per_proc)) {
+		return (EPROCLIM);
+	}
+	/* Initialize our td and new ksegrp. */
+	newtd = thread_alloc();
+	newkg = ksegrp_alloc();
 	/*
 	 * Try the copyout as soon as we allocate the td so we don't have to
 	 * tear things down in a failure case below.
 	 */
-	id = td0->td_tid;
+	id = newtd->td_tid;
 	if ((error = copyout(&id, uap->id, sizeof(long)))) {
-		thread_free(td0);
+		ksegrp_free(newkg);
+		thread_free(newtd);
 		return (error);
 	}
 
-	bzero(&td0->td_startzero,
-	    (unsigned)RANGEOF(struct thread, td_startzero, td_endzero));
-	bcopy(&td->td_startcopy, &td0->td_startcopy,
+	bzero(&newtd->td_startzero,
+	    (unsigned) RANGEOF(struct thread, td_startzero, td_endzero));
+	bcopy(&td->td_startcopy, &newtd->td_startcopy,
 	    (unsigned) RANGEOF(struct thread, td_startcopy, td_endcopy));
 
-	td0->td_proc = td->td_proc;
-	PROC_LOCK(td->td_proc);
-	td0->td_sigmask = td->td_sigmask;
-	/* First time through? */
-	if ((td->td_proc->p_flag & P_HADTHREADS) == 0) {
-		ncpus = mp_ncpus;
-		if (virtual_cpu != 0)
-			ncpus = virtual_cpu;
-		sched_set_concurrency(td->td_ksegrp, ncpus);
-		td->td_proc->p_flag |= P_HADTHREADS;
-	}
-	PROC_UNLOCK(td->td_proc);
-	td0->td_ucred = crhold(td->td_ucred);
+	bzero(&newkg->kg_startzero, 
+	    (unsigned) RANGEOF(struct ksegrp, kg_startzero, kg_endzero));
+	bcopy(&kg->kg_startcopy, &newkg->kg_startcopy,
+	    (unsigned) RANGEOF(struct ksegrp, kg_startcopy, kg_endcopy));
+
+	newtd->td_proc = td->td_proc;
+	newtd->td_ucred = crhold(td->td_ucred);
 
 	/* Set up our machine context. */

>>> TRUNCATED FOR MAIL (1000 lines) <<<

