git: 3360b48525fc - main - killpg(2): close a race with fork(2), part1

From: Konstantin Belousov <kib_at_FreeBSD.org>
Date: Tue, 04 Jul 2023 03:44:10 UTC
The branch main has been updated by kib:

URL: https://cgit.FreeBSD.org/src/commit/?id=3360b48525fc966894e77b8cd9c124669664472d

commit 3360b48525fc966894e77b8cd9c124669664472d
Author:     Konstantin Belousov <kib@FreeBSD.org>
AuthorDate: 2023-06-12 07:33:43 +0000
Commit:     Konstantin Belousov <kib@FreeBSD.org>
CommitDate: 2023-07-04 03:21:53 +0000

    killpg(2): close a race with fork(2), part1
    
    If the process group member performs fork(), the child could escape
    signalling from killpg(). Prevent it by introducing an sx process group
    lock pg_killsx which is taken interruptibly shared around fork. If there
    is a pending signal, do the trip through userspace with ERESTART to
    handle signal ASTs. The lock is taken exclusively during killpg().
    
    The lock is also locked exclusive when the process changes group
    membership, to avoid escaping a signal by this means, by ensuring that
    the process group is stable during fork.
    
    Note that the new lock is before proctree lock, so in some situations we
    could only do trylocking to obtain it.
    
    This relatively simple approach cannot work for REAP_KILL, because
    process potentially belongs to more than one reaper tree by having
    sub-reapers.
    
    Reported by:    dchagin
    Tested by:      dchagin, pho
    Reviewed by:    markj
    Sponsored by:   The FreeBSD Foundation
    MFC after:      2 weeks
    Differential revision:  https://reviews.freebsd.org/D40493
---
 sys/kern/init_main.c |  1 +
 sys/kern/kern_fork.c | 28 ++++++++++++++++++++++++++++
 sys/kern/kern_proc.c | 20 ++++++++++++++++++++
 sys/kern/kern_prot.c | 17 +++++++++++++----
 sys/kern/kern_sig.c  |  8 ++++++++
 sys/sys/proc.h       |  2 ++
 6 files changed, 72 insertions(+), 4 deletions(-)

diff --git a/sys/kern/init_main.c b/sys/kern/init_main.c
index 77a0f5478eb6..abc6b3f6e2f2 100644
--- a/sys/kern/init_main.c
+++ b/sys/kern/init_main.c
@@ -495,6 +495,7 @@ proc0_init(void *dummy __unused)
 	LIST_INSERT_HEAD(&allproc, p, p_list);
 	LIST_INSERT_HEAD(PIDHASH(0), p, p_hash);
 	mtx_init(&pgrp0.pg_mtx, "process group", NULL, MTX_DEF | MTX_DUPOK);
+	sx_init(&pgrp0.pg_killsx, "killpg racer");
 	p->p_pgrp = &pgrp0;
 	LIST_INSERT_HEAD(PGRPHASH(0), &pgrp0, pg_hash);
 	LIST_INIT(&pgrp0.pg_members);
diff --git a/sys/kern/kern_fork.c b/sys/kern/kern_fork.c
index 944ecf494736..81bee99fa1ca 100644
--- a/sys/kern/kern_fork.c
+++ b/sys/kern/kern_fork.c
@@ -856,11 +856,13 @@ fork1(struct thread *td, struct fork_req *fr)
 	struct vmspace *vm2;
 	struct ucred *cred;
 	struct file *fp_procdesc;
+	struct pgrp *pg;
 	vm_ooffset_t mem_charged;
 	int error, nprocs_new;
 	static int curfail;
 	static struct timeval lastfail;
 	int flags, pages;
+	bool killsx_locked;
 
 	flags = fr->fr_flags;
 	pages = fr->fr_pages;
@@ -917,6 +919,7 @@ fork1(struct thread *td, struct fork_req *fr)
 	fp_procdesc = NULL;
 	newproc = NULL;
 	vm2 = NULL;
+	killsx_locked = false;
 
 	/*
 	 * Increment the nprocs resource before allocations occur.
@@ -946,6 +949,28 @@ fork1(struct thread *td, struct fork_req *fr)
 		}
 	}
 
+	/*
+	 * Atomically check for signals and block threads from sending
+	 * a signal to our process group until the child is visible.
+	 */
+	pg = p1->p_pgrp;
+	if (sx_slock_sig(&pg->pg_killsx) != 0) {
+		error = ERESTART;
+		goto fail2;
+	} else if (__predict_false(p1->p_pgrp != pg || sig_intr() != 0)) {
+		/*
+		 * Either the process was moved to other process
+		 * group, or there is pending signal.  sx_slock_sig()
+		 * does not check for signals if not sleeping for the
+		 * lock.
+		 */
+		sx_sunlock(&pg->pg_killsx);
+		error = ERESTART;
+		goto fail2;
+	} else {
+		killsx_locked = true;
+	}
+
 	/*
 	 * If required, create a process descriptor in the parent first; we
 	 * will abandon it if something goes wrong. We don't finit() until
@@ -1037,6 +1062,7 @@ fork1(struct thread *td, struct fork_req *fr)
 	}
 
 	do_fork(td, fr, newproc, td2, vm2, fp_procdesc);
+	sx_sunlock(&pg->pg_killsx);
 	return (0);
 fail0:
 	error = EAGAIN;
@@ -1055,6 +1081,8 @@ fail2:
 		fdrop(fp_procdesc, td);
 	}
 	atomic_add_int(&nprocs, -1);
+	if (killsx_locked)
+		sx_sunlock(&pg->pg_killsx);
 	pause("fork", hz / 2);
 	return (error);
 }
diff --git a/sys/kern/kern_proc.c b/sys/kern/kern_proc.c
index 3983e536e70e..19b3b41aa6f3 100644
--- a/sys/kern/kern_proc.c
+++ b/sys/kern/kern_proc.c
@@ -310,6 +310,7 @@ pgrp_init(void *mem, int size, int flags)
 
 	pg = mem;
 	mtx_init(&pg->pg_mtx, "process group", NULL, MTX_DEF | MTX_DUPOK);
+	sx_init(&pg->pg_killsx, "killpg racer");
 	return (0);
 }
 
@@ -573,6 +574,7 @@ errout:
 int
 enterpgrp(struct proc *p, pid_t pgid, struct pgrp *pgrp, struct session *sess)
 {
+	struct pgrp *old_pgrp;
 
 	sx_assert(&proctree_lock, SX_XLOCKED);
 
@@ -584,6 +586,11 @@ enterpgrp(struct proc *p, pid_t pgid, struct pgrp *pgrp, struct session *sess)
 	KASSERT(!SESS_LEADER(p),
 	    ("enterpgrp: session leader attempted setpgrp"));
 
+	old_pgrp = p->p_pgrp;
+	if (!sx_try_xlock(&old_pgrp->pg_killsx))
+		return (ERESTART);
+	MPASS(old_pgrp == p->p_pgrp);
+
 	if (sess != NULL) {
 		/*
 		 * new session
@@ -625,6 +632,7 @@ enterpgrp(struct proc *p, pid_t pgid, struct pgrp *pgrp, struct session *sess)
 
 	doenterpgrp(p, pgrp);
 
+	sx_xunlock(&old_pgrp->pg_killsx);
 	return (0);
 }
 
@@ -634,6 +642,7 @@ enterpgrp(struct proc *p, pid_t pgid, struct pgrp *pgrp, struct session *sess)
 int
 enterthispgrp(struct proc *p, struct pgrp *pgrp)
 {
+	struct pgrp *old_pgrp;
 
 	sx_assert(&proctree_lock, SX_XLOCKED);
 	PROC_LOCK_ASSERT(p, MA_NOTOWNED);
@@ -646,8 +655,19 @@ enterthispgrp(struct proc *p, struct pgrp *pgrp)
 	KASSERT(pgrp != p->p_pgrp,
 	    ("%s: p %p belongs to pgrp %p", __func__, p, pgrp));
 
+	old_pgrp = p->p_pgrp;
+	if (!sx_try_xlock(&old_pgrp->pg_killsx))
+		return (ERESTART);
+	MPASS(old_pgrp == p->p_pgrp);
+	if (!sx_try_xlock(&pgrp->pg_killsx)) {
+		sx_xunlock(&old_pgrp->pg_killsx);
+		return (ERESTART);
+	}
+
 	doenterpgrp(p, pgrp);
 
+	sx_xunlock(&pgrp->pg_killsx);
+	sx_xunlock(&old_pgrp->pg_killsx);
 	return (0);
 }
 
diff --git a/sys/kern/kern_prot.c b/sys/kern/kern_prot.c
index 3d8f3e222b4c..733a92839f53 100644
--- a/sys/kern/kern_prot.c
+++ b/sys/kern/kern_prot.c
@@ -332,12 +332,13 @@ sys_setsid(struct thread *td, struct setsid_args *uap)
 	struct pgrp *newpgrp;
 	struct session *newsess;
 
-	error = 0;
 	pgrp = NULL;
 
 	newpgrp = uma_zalloc(pgrp_zone, M_WAITOK);
 	newsess = malloc(sizeof(struct session), M_SESSION, M_WAITOK | M_ZERO);
 
+again:
+	error = 0;
 	sx_xlock(&proctree_lock);
 
 	if (p->p_pgid == p->p_pid || (pgrp = pgfind(p->p_pid)) != NULL) {
@@ -345,7 +346,12 @@ sys_setsid(struct thread *td, struct setsid_args *uap)
 			PGRP_UNLOCK(pgrp);
 		error = EPERM;
 	} else {
-		(void)enterpgrp(p, p->p_pid, newpgrp, newsess);
+		error = enterpgrp(p, p->p_pid, newpgrp, newsess);
+		if (error == ERESTART) {
+			sx_xunlock(&proctree_lock);
+			goto again;
+		}
+		MPASS(error == 0);
 		td->td_retval[0] = p->p_pid;
 		newpgrp = NULL;
 		newsess = NULL;
@@ -391,10 +397,11 @@ sys_setpgid(struct thread *td, struct setpgid_args *uap)
 	if (uap->pgid < 0)
 		return (EINVAL);
 
-	error = 0;
-
 	newpgrp = uma_zalloc(pgrp_zone, M_WAITOK);
 
+again:
+	error = 0;
+
 	sx_xlock(&proctree_lock);
 	if (uap->pid != 0 && uap->pid != curp->p_pid) {
 		if ((targp = pfind(uap->pid)) == NULL) {
@@ -456,6 +463,8 @@ done:
 	sx_xunlock(&proctree_lock);
 	KASSERT((error == 0) || (newpgrp != NULL),
 	    ("setpgid failed and newpgrp is NULL"));
+	if (error == ERESTART)
+		goto again;
 	uma_zfree(pgrp_zone, newpgrp);
 	return (error);
 }
diff --git a/sys/kern/kern_sig.c b/sys/kern/kern_sig.c
index 167ebc1be4f3..948ef97c0715 100644
--- a/sys/kern/kern_sig.c
+++ b/sys/kern/kern_sig.c
@@ -1847,6 +1847,7 @@ killpg1(struct thread *td, int sig, int pgid, int all, ksiginfo_t *ksi)
 		prison_proc_iterate(td->td_ucred->cr_prison,
 		    kill_processes_prison_cb, &arg);
 	} else {
+again:
 		sx_slock(&proctree_lock);
 		if (pgid == 0) {
 			/*
@@ -1862,10 +1863,17 @@ killpg1(struct thread *td, int sig, int pgid, int all, ksiginfo_t *ksi)
 			}
 		}
 		sx_sunlock(&proctree_lock);
+		if (!sx_try_xlock(&pgrp->pg_killsx)) {
+			PGRP_UNLOCK(pgrp);
+			sx_xlock(&pgrp->pg_killsx);
+			sx_xunlock(&pgrp->pg_killsx);
+			goto again;
+		}
 		LIST_FOREACH(p, &pgrp->pg_members, p_pglist) {
 			killpg1_sendsig(p, false, &arg);
 		}
 		PGRP_UNLOCK(pgrp);
+		sx_xunlock(&pgrp->pg_killsx);
 	}
 	MPASS(arg.ret != 0 || arg.found || !arg.sent);
 	if (arg.ret == 0 && !arg.sent)
diff --git a/sys/sys/proc.h b/sys/sys/proc.h
index ca3911e9db66..d79b7a440168 100644
--- a/sys/sys/proc.h
+++ b/sys/sys/proc.h
@@ -113,6 +113,8 @@ struct pgrp {
 	pid_t		pg_id;		/* (c) Process group id. */
 	struct mtx	pg_mtx;		/* Mutex to protect members */
 	int		pg_flags;	/* (m) PGRP_ flags */
+	struct sx	pg_killsx;	/* Mutual exclusion between group member
+					 * fork() and killpg() */
 };
 
 #define	PGRP_ORPHANED	0x00000001	/* Group is orphaned */