cvs commit: src/sys/kern kern_fork.c kern_resource.c

Don Lewis truckman at FreeBSD.org
Sun Mar 11 19:15:19 UTC 2007


On 26 Feb, Xin LI wrote:
> delphij     2007-02-26 03:38:10 UTC
> 
>   FreeBSD src repository
> 
>   Modified files:
>     sys/kern             kern_fork.c kern_resource.c 
>   Log:
>   Close race conditions between fork() and [sg]etpriority()'s
>   PRIO_USER case, possibly also other places that deferences
>   p_ucred.
>   
>   In the past, we insert a new process into the allproc list right
>   after PID allocation, and release the allproc_lock sx.  Because
>   most content in new proc's structure is not yet initialized,
>   this could lead to undefined result if we do not handle PRS_NEW
>   with care.
>   
>   The problem with PRS_NEW state is that it does not provide fine
>   grained information about how much initialization is done for a
>   new process.  By defination, after PRIO_USER setpriority(), all
>   processes that belongs to given user should have their nice value
>   set to the specified value.  Therefore, if p_{start,end}copy
>   section was done for a PRS_NEW process, we can not safely ignore
>   it because p_nice is in this area.  On the other hand, we should
>   be careful on PRS_NEW processes because we do not allow non-root
>   users to lower their nice values, and without a successful copy
>   of the copy section, we can get stale values that is inherted
>   from the uninitialized area of the process structure.
>   
>   This commit tries to close the race condition by grabbing proc
>   mutex *before* we release allproc_lock xlock, and do copy as
>   well as zero immediately after the allproc_lock xunlock.  This
>   guarantees that the new process would have its p_copy and p_zero
>   sections, as well as user credential informaion initialized.  In
>   getpriority() case, instead of grabbing PROC_LOCK for a PRS_NEW
>   process, we just skip the process in question, because it does
>   not affect the final result of the call, as the p_nice value
>   would be copied from its parent, and we will see it during
>   allproc traverse.
>   
>   Other potential solutions are still under evaluation.
>   
>   Discussed with: davidxu, jhb, rwatson
>   PR:             kern/108071
>   MFC after:      2 weeks
>   
>   Revision  Changes    Path
>   1.267     +14 -5     src/sys/kern/kern_fork.c
>   1.167     +3 -0      src/sys/kern/kern_resource.c

Quite some time ago I rototilled fork1() to separate the process limit
check from pid allocation and the insertion of the new process in to the
proc tree, so that the process could be more fully initialized before
making it visible, and hopefully elimininating the need for PRS_NEW.

I thought I had sent it out for review at the time, but now I am not
able to find any evidence that I actually did.

In any case, here is a diff versus kern_fork.c 1.266.  I've been
successfully running it for quite some time, even though I don't know
that I have properly resolved all the patch conflicts that have cropped
up since I first put this code in my local tree.  It needs to be
reviewed to see if it avoids the bug that 1.267 fixes, and to see if
there are any other problems.  Unfortunately, I currently have no
bandwidth to look at it.

Index: kern_fork.c
===================================================================
RCS file: /home/ncvs/src/sys/kern/kern_fork.c,v
retrieving revision 1.266
diff -u -r1.266 kern_fork.c
--- kern_fork.c	23 Jan 2007 08:46:50 -0000	1.266
+++ kern_fork.c	11 Mar 2007 18:27:07 -0000
@@ -197,15 +197,11 @@
 	int pages;
 	struct proc **procp;
 {
-	struct proc *p1, *p2, *pptr;
-	struct proc *newproc;
+	struct proc *p1, *p2, *p3, *pptr;
 	int ok, trypid;
 	static int curfail, pidchecked = 0;
 	static struct timeval lastfail;
-	struct filedesc *fd;
-	struct filedesc_to_leader *fdtol;
 	struct thread *td2;
-	struct sigacts *newsigacts;
 	int error;
 
 	/* Can't copy and clear. */
@@ -286,18 +282,7 @@
 	}
 
 	/* Allocate new proc. */
-	newproc = uma_zalloc(proc_zone, M_WAITOK);
-#ifdef MAC
-	mac_init_proc(newproc);
-#endif
-#ifdef AUDIT
-	audit_proc_alloc(newproc);
-#endif
-	knlist_init(&newproc->p_klist, &newproc->p_mtx, NULL, NULL, NULL);
-	STAILQ_INIT(&newproc->p_ktr);
-
-	/* We have to lock the process tree while we look for a pid. */
-	sx_slock(&proctree_lock);
+	p2 = uma_zalloc(proc_zone, M_WAITOK);
 
 	/*
 	 * Although process entries are dynamically created, we still keep
@@ -340,110 +325,46 @@
 	 * are hard-limits as to the number of processes that can run.
 	 */
 	nprocs++;
+	sx_xunlock(&allproc_lock);
 
-	/*
-	 * Find an unused process ID.  We remember a range of unused IDs
-	 * ready to use (from lastpid+1 through pidchecked-1).
-	 *
-	 * If RFHIGHPID is set (used during system boot), do not allocate
-	 * low-numbered pids.
-	 */
-	trypid = lastpid + 1;
-	if (flags & RFHIGHPID) {
-		if (trypid < 10)
-			trypid = 10;
-	} else {
-		if (randompid)
-			trypid += arc4random() % randompid;
-	}
-retry:
-	/*
-	 * If the process ID prototype has wrapped around,
-	 * restart somewhat above 0, as the low-numbered procs
-	 * tend to include daemons that don't exit.
-	 */
-	if (trypid >= PID_MAX) {
-		trypid = trypid % PID_MAX;
-		if (trypid < 100)
-			trypid += 100;
-		pidchecked = 0;
-	}
-	if (trypid >= pidchecked) {
-		int doingzomb = 0;
-
-		pidchecked = PID_MAX;
-		/*
-		 * Scan the active and zombie procs to check whether this pid
-		 * is in use.  Remember the lowest pid that's greater
-		 * than trypid, so we can avoid checking for a while.
-		 */
-		p2 = LIST_FIRST(&allproc);
-again:
-		for (; p2 != NULL; p2 = LIST_NEXT(p2, p_list)) {
-			while (p2->p_pid == trypid ||
-			    (p2->p_pgrp != NULL &&
-			    (p2->p_pgrp->pg_id == trypid ||
-			    (p2->p_session != NULL &&
-			    p2->p_session->s_sid == trypid)))) {
-				trypid++;
-				if (trypid >= pidchecked)
-					goto retry;
-			}
-			if (p2->p_pid > trypid && pidchecked > p2->p_pid)
-				pidchecked = p2->p_pid;
-			if (p2->p_pgrp != NULL) {
-				if (p2->p_pgrp->pg_id > trypid &&
-				    pidchecked > p2->p_pgrp->pg_id)
-					pidchecked = p2->p_pgrp->pg_id;
-				if (p2->p_session != NULL &&
-				    p2->p_session->s_sid > trypid &&
-				    pidchecked > p2->p_session->s_sid)
-					pidchecked = p2->p_session->s_sid;
-			}
-		}
-		if (!doingzomb) {
-			doingzomb = 1;
-			p2 = LIST_FIRST(&zombproc);
-			goto again;
-		}
-	}
-	sx_sunlock(&proctree_lock);
+	/* Zero the section of proc that is zero-initialized. */
+	td2 = FIRST_THREAD_IN_PROC(p2);
+	bzero(&p2->p_startzero,
+	    __rangeof(struct proc, p_startzero, p_endzero));
+	bzero(&td2->td_startzero,
+	    __rangeof(struct thread, td_startzero, td_endzero));
+	p2->p_flag = 0;
+	knlist_init(&p2->p_klist, &p2->p_mtx, NULL, NULL, NULL);
+	STAILQ_INIT(&p2->p_ktr);
 
-	/*
-	 * RFHIGHPID does not mess with the lastpid counter during boot.
-	 */
-	if (flags & RFHIGHPID)
-		pidchecked = 0;
+	if (flags & RFLINUXTHPN) 
+	        p2->p_sigparent = SIGUSR1;
 	else
-		lastpid = trypid;
-
-	p2 = newproc;
-	p2->p_state = PRS_NEW;		/* protect against others */
-	p2->p_pid = trypid;
-	AUDIT_ARG(pid, p2->p_pid);
-	LIST_INSERT_HEAD(&allproc, p2, p_list);
-	LIST_INSERT_HEAD(PIDHASH(p2->p_pid), p2, p_hash);
-	sx_xunlock(&allproc_lock);
+	        p2->p_sigparent = SIGCHLD;
 
 	/*
 	 * Malloc things while we don't hold any locks.
 	 */
-	if (flags & RFSIGSHARE)
-		newsigacts = NULL;
-	else
-		newsigacts = sigacts_alloc();
+	if ((flags & RFSIGSHARE) == 0)
+		p2->p_sigacts = sigacts_alloc();
+#ifdef MAC
+	mac_init_proc(p2);
+#endif
+#ifdef AUDIT
+	audit_proc_alloc(p2);
+#endif
 
 	/*
 	 * Copy filedesc.
 	 */
 	if (flags & RFCFDG) {
-		fd = fdinit(p1->p_fd);
-		fdtol = NULL;
+		p2->p_fd = fdinit(p1->p_fd);
+		p2->p_fdtol = NULL;
 	} else if (flags & RFFDG) {
-		fd = fdcopy(p1->p_fd);
-		fdtol = NULL;
+		p2->p_fd = fdcopy(p1->p_fd);
+		p2->p_fdtol = NULL;
 	} else {
-		fd = fdshare(p1->p_fd);
+		p2->p_fd = fdshare(p1->p_fd);
 		if (p1->p_fdtol == NULL)
 			p1->p_fdtol =
 				filedesc_to_leader_alloc(NULL,
@@ -454,39 +375,30 @@
 			 * Shared file descriptor table and
 			 * shared process leaders.
 			 */
-			fdtol = p1->p_fdtol;
+			p2->p_fdtol = p1->p_fdtol;
 			FILEDESC_LOCK_FAST(p1->p_fd);
-			fdtol->fdl_refcount++;
+			p2->p_fdtol->fdl_refcount++;
 			FILEDESC_UNLOCK_FAST(p1->p_fd);
 		} else {
 			/* 
 			 * Shared file descriptor table, and
 			 * different process leaders 
 			 */
-			fdtol = filedesc_to_leader_alloc(p1->p_fdtol,
+			p2->p_fdtol = filedesc_to_leader_alloc(p1->p_fdtol,
 							 p1->p_fd,
 							 p2);
 		}
 	}
-	/*
-	 * Make a proc table entry for the new process.
-	 * Start by zeroing the section of proc that is zero-initialized,
-	 * then copy the section that is copied directly from the parent.
-	 */
-	td2 = FIRST_THREAD_IN_PROC(p2);
-
 	/* Allocate and switch to an alternate kstack if specified. */
 	if (pages != 0)
 		vm_thread_new_altkstack(td2, pages);
 
+	/*
+	 * Copy the section of the proc table entry that is copied directly
+         * from the parent.
+	 */
 	PROC_LOCK(p2);
 	PROC_LOCK(p1);
-
-	bzero(&p2->p_startzero,
-	    __rangeof(struct proc, p_startzero, p_endzero));
-	bzero(&td2->td_startzero,
-	    __rangeof(struct thread, td_startzero, td_endzero));
-
 	bcopy(&p1->p_startcopy, &p2->p_startcopy,
 	    __rangeof(struct proc, p_startcopy, p_endcopy));
 	bcopy(&td->td_startcopy, &td2->td_startcopy,
@@ -499,18 +411,6 @@
 	 * Duplicate sub-structures as needed.
 	 * Increase reference counts on shared objects.
 	 */
-	p2->p_flag = 0;
-	if (p1->p_flag & P_PROFIL)
-		startprofclock(p2);
-	mtx_lock_spin(&sched_lock);
-	p2->p_sflag = PS_INMEM;
-	/*
-	 * Allow the scheduler to adjust the priority of the child and
-	 * parent while we hold the sched_lock.
-	 */
-	sched_fork(td, td2);
-
-	mtx_unlock_spin(&sched_lock);
 	p2->p_ucred = crhold(td->td_ucred);
 	td2->td_ucred = crhold(p2->p_ucred);
 #ifdef AUDIT
@@ -518,20 +418,12 @@
 #endif
 	pargs_hold(p2->p_args);
 
-	if (flags & RFSIGSHARE) {
+	if (flags & RFSIGSHARE)
 		p2->p_sigacts = sigacts_hold(p1->p_sigacts);
-	} else {
-		sigacts_copy(newsigacts, p1->p_sigacts);
-		p2->p_sigacts = newsigacts;
-	}
-	if (flags & RFLINUXTHPN) 
-	        p2->p_sigparent = SIGUSR1;
 	else
-	        p2->p_sigparent = SIGCHLD;
+		sigacts_copy(p2->p_sigacts, p1->p_sigacts);
 
 	p2->p_textvp = p1->p_textvp;
-	p2->p_fd = fd;
-	p2->p_fdtol = fdtol;
 
 	/*
 	 * p_limit is copy-on-write.  Bump its refcount.
@@ -547,42 +439,106 @@
 	if (p2->p_textvp)
 		vref(p2->p_textvp);
 
+	sx_xlock(&proctree_lock);
+	/* We have to lock the allproc list while we look for a pid. */
+	sx_xlock(&allproc_lock);
+
 	/*
-	 * Set up linkage for kernel based threading.
+	 * Find an unused process ID.  We remember a range of unused IDs
+	 * ready to use (from lastpid+1 through pidchecked-1).
+	 *
+	 * If RFHIGHPID is set (used during system boot), do not allocate
+	 * low-numbered pids.
 	 */
-	if ((flags & RFTHREAD) != 0) {
-		mtx_lock(&ppeers_lock);
-		p2->p_peers = p1->p_peers;
-		p1->p_peers = p2;
-		p2->p_leader = p1->p_leader;
-		mtx_unlock(&ppeers_lock);
-		PROC_LOCK(p1->p_leader);
-		if ((p1->p_leader->p_flag & P_WEXIT) != 0) {
-			PROC_UNLOCK(p1->p_leader);
-			/*
-			 * The task leader is exiting, so process p1 is
-			 * going to be killed shortly.  Since p1 obviously
-			 * isn't dead yet, we know that the leader is either
-			 * sending SIGKILL's to all the processes in this
-			 * task or is sleeping waiting for all the peers to
-			 * exit.  We let p1 complete the fork, but we need
-			 * to go ahead and kill the new process p2 since
-			 * the task leader may not get a chance to send
-			 * SIGKILL to it.  We leave it on the list so that
-			 * the task leader will wait for this new process
-			 * to commit suicide.
-			 */
-			PROC_LOCK(p2);
-			psignal(p2, SIGKILL);
-			PROC_UNLOCK(p2);
-		} else
-			PROC_UNLOCK(p1->p_leader);
+	trypid = lastpid + 1;
+	if (flags & RFHIGHPID) {
+		if (trypid < 10)
+			trypid = 10;
 	} else {
-		p2->p_peers = NULL;
-		p2->p_leader = p2;
+		if (randompid)
+			trypid += arc4random() % randompid;
 	}
+retry:
+	/*
+	 * If the process ID prototype has wrapped around,
+	 * restart somewhat above 0, as the low-numbered procs
+	 * tend to include daemons that don't exit.
+	 */
+	if (trypid >= PID_MAX) {
+		trypid = trypid % PID_MAX;
+		if (trypid < 100)
+			trypid += 100;
+		pidchecked = 0;
+	}
+	if (trypid >= pidchecked) {
+		int doingzomb = 0;
+
+		pidchecked = PID_MAX;
+		/*
+		 * Scan the active and zombie procs to check whether this pid
+		 * is in use.  Remember the lowest pid that's greater
+		 * than trypid, so we can avoid checking for a while.
+		 */
+		p3 = LIST_FIRST(&allproc);
+again:
+		for (; p3 != NULL; p3 = LIST_NEXT(p3, p_list)) {
+			while (p3->p_pid == trypid ||
+			    (p3->p_pgrp != NULL &&
+			    (p3->p_pgrp->pg_id == trypid ||
+			    (p3->p_session != NULL &&
+			    p3->p_session->s_sid == trypid)))) {
+				trypid++;
+				if (trypid >= pidchecked)
+					goto retry;
+			}
+			if (p3->p_pid > trypid && pidchecked > p3->p_pid)
+				pidchecked = p3->p_pid;
+			if (p3->p_pgrp != NULL) {
+				if (p3->p_pgrp->pg_id > trypid &&
+				    pidchecked > p3->p_pgrp->pg_id)
+					pidchecked = p3->p_pgrp->pg_id;
+				if (p3->p_session != NULL &&
+				    p3->p_session->s_sid > trypid &&
+				    pidchecked > p3->p_session->s_sid)
+					pidchecked = p3->p_session->s_sid;
+			}
+		}
+		if (!doingzomb) {
+			doingzomb = 1;
+			p3 = LIST_FIRST(&zombproc);
+			goto again;
+		}
+	}
+
+	/*
+	 * RFHIGHPID does not mess with the lastpid counter during boot.
+	 */
+	if (flags & RFHIGHPID)
+		pidchecked = 0;
+	else
+		lastpid = trypid;
+
+	p2->p_state = PRS_NEW;		/* protect against others */
+	p2->p_pid = trypid;
+	AUDIT_ARG(pid, p2->p_pid);
+	LIST_INSERT_HEAD(&allproc, p2, p_list);
+	LIST_INSERT_HEAD(PIDHASH(p2->p_pid), p2, p_hash);
+	sx_xunlock(&allproc_lock);
+
+	/*
+	 * Attach the new process to its parent.
+	 *
+	 * If RFNOWAIT is set, the newly created process becomes a child
+	 * of init.  This effectively disassociates the child from the
+	 * parent.
+	 */
+	if (flags & RFNOWAIT)
+		pptr = initproc;
+	else
+		pptr = p1;
+	p2->p_pptr = pptr;
+	LIST_INSERT_HEAD(&pptr->p_children, p2, p_sibling);
 
-	sx_xlock(&proctree_lock);
 	PGRP_LOCK(p1->p_pgrp);
 	PROC_LOCK(p2);
 	PROC_LOCK(p1);
@@ -641,20 +597,42 @@
 	_PHOLD(p1);
 	PROC_UNLOCK(p1);
 
+	sx_xunlock(&proctree_lock);
+
 	/*
-	 * Attach the new process to its parent.
-	 *
-	 * If RFNOWAIT is set, the newly created process becomes a child
-	 * of init.  This effectively disassociates the child from the
-	 * parent.
+	 * Set up linkage for kernel based threading.
 	 */
-	if (flags & RFNOWAIT)
-		pptr = initproc;
-	else
-		pptr = p1;
-	p2->p_pptr = pptr;
-	LIST_INSERT_HEAD(&pptr->p_children, p2, p_sibling);
-	sx_xunlock(&proctree_lock);
+	if ((flags & RFTHREAD) != 0) {
+		mtx_lock(&ppeers_lock);
+		p2->p_peers = p1->p_peers;
+		p1->p_peers = p2;
+		p2->p_leader = p1->p_leader;
+		mtx_unlock(&ppeers_lock);
+		PROC_LOCK(p1->p_leader);
+		if ((p1->p_leader->p_flag & P_WEXIT) != 0) {
+			PROC_UNLOCK(p1->p_leader);
+			/*
+			 * The task leader is exiting, so process p1 is
+			 * going to be killed shortly.  Since p1 obviously
+			 * isn't dead yet, we know that the leader is either
+			 * sending SIGKILL's to all the processes in this
+			 * task or is sleeping waiting for all the peers to
+			 * exit.  We let p1 complete the fork, but we need
+			 * to go ahead and kill the new process p2 since
+			 * the task leader may not get a chance to send
+			 * SIGKILL to it.  We leave it on the list so that
+			 * the task leader will wait for this new process
+			 * to commit suicide.
+			 */
+			PROC_LOCK(p2);
+			psignal(p2, SIGKILL);
+			PROC_UNLOCK(p2);
+		} else
+			PROC_UNLOCK(p1->p_leader);
+	} else {
+		p2->p_peers = NULL;
+		p2->p_leader = p2;
+	}
 
 	/* Inform accounting that we have forked. */
 	p2->p_acflag = AFORK;
@@ -694,11 +672,22 @@
 	/*
 	 * Set the child start time and mark the process as being complete.
 	 */
+	PROC_LOCK(p2);
+	PROC_LOCK(p1);
 	microuptime(&p2->p_stats->p_start);
 	mtx_lock_spin(&sched_lock);
+	p2->p_sflag = PS_INMEM;
 	p2->p_state = PRS_NORMAL;
+	if (p1->p_flag & P_PROFIL)
+		startprofclock(p2);
 
 	/*
+	 * Allow the scheduler to adjust the priority of the child and
+	 * parent while we hold the sched_lock.
+	 */
+	sched_fork(td, td2);
+	PROC_UNLOCK(p2);
+	/*
 	 * If RFSTOPPED not requested, make child runnable and add to
 	 * run queue.
 	 */
@@ -711,7 +700,6 @@
 	/*
 	 * Now can be swapped.
 	 */
-	PROC_LOCK(p1);
 	_PRELE(p1);
 
 	/*
@@ -746,18 +734,11 @@
 	*procp = p2;
 	return (0);
 fail:
-	sx_sunlock(&proctree_lock);
 	if (ppsratecheck(&lastfail, &curfail, 1))
 		printf("maxproc limit exceeded by uid %i, please see tuning(7) and login.conf(5).\n",
 		    td->td_ucred->cr_ruid);
 	sx_xunlock(&allproc_lock);
-#ifdef MAC
-	mac_destroy_proc(newproc);
-#endif
-#ifdef AUDIT
-	audit_proc_free(newproc);
-#endif
-	uma_zfree(proc_zone, newproc);
+	uma_zfree(proc_zone, p2);
 	if (p1->p_flag & P_HADTHREADS) {
 		PROC_LOCK(p1);
 		thread_single_end();






More information about the cvs-all mailing list