git: 81a37995c757 - main - killpg(): close a race with fork(), part 2

From: Konstantin Belousov <kib_at_FreeBSD.org>
Date: Tue, 04 Jul 2023 03:44:11 UTC
The branch main has been updated by kib:

URL: https://cgit.FreeBSD.org/src/commit/?id=81a37995c757b4e3ad8a5c699864197fd1ebdcf5

commit 81a37995c757b4e3ad8a5c699864197fd1ebdcf5
Author:     Konstantin Belousov <kib@FreeBSD.org>
AuthorDate: 2023-06-16 09:52:10 +0000
Commit:     Konstantin Belousov <kib@FreeBSD.org>
CommitDate: 2023-07-04 03:43:16 +0000

    killpg(): close a race with fork(), part 2
    
    When we are sending a terminating signal to the group, killpg() needs
    to guarantee that all group members will be terminated (it does not
    need to ensure that they have already terminated by the time killpg()
    returns).  The
    pg_killsx change eliminates the largest window there, but still, if a
    multithreaded process is signalled, the following could happen:
    - thread 1 is selected for the signal delivery and gets descheduled
    - thread 2 waits for pg_killsx lock, obtains it and forks
    - thread 1 continues executing and terminates the process
    This scenario still allows the child to escape the termination.
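    
    As an illustration only (not part of the commit, and with no control
    over kernel scheduling, so it merely opens the window rather than
    hitting it deterministically), a userland stress sketch of that
    scenario could look like this:
    
        #include <sys/types.h>
        #include <sys/wait.h>
        #include <pthread.h>
        #include <signal.h>
        #include <unistd.h>
    
        /* "Thread 2": keep forking children into the victim's process group. */
        static void *
        forker(void *arg)
        {
                (void)arg;
                for (;;) {
                        pid_t pid = fork();
    
                        if (pid == 0) {
                                sleep(30);      /* a child surviving killpg() has "escaped" */
                                _exit(0);
                        }
                        usleep(1000);
                }
        }
    
        int
        main(void)
        {
                pid_t victim = fork();
    
                if (victim == 0) {
                        /* Victim: own process group, one thread forking in a loop. */
                        pthread_t thr;
    
                        setpgid(0, 0);
                        pthread_create(&thr, NULL, forker, NULL);
                        for (;;)
                                pause();        /* "thread 1" will take the signal */
                }
                setpgid(victim, victim);        /* no-op if the child got there first */
                sleep(1);
                killpg(victim, SIGTERM);        /* no group member should survive this */
                waitpid(victim, NULL, 0);
                return (0);
        }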
    
    To fix it, count the number of signals sent to the process with
    killpg(2) in the p_killpg_cnt variable, which is incremented in
    killpg() and decremented after the signal handler frame is created,
    or in exit1() after single-threading.  This way we avoid forking
    while the termination is pending.
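    
    In simplified outline (the actual code is in the diff below), the
    counter's lifetime is:
    
        /* killpg() path (tdsendsignal()): queue the signal and account for it. */
        if ((ksi->ksi_flags & KSI_KILLPG) != 0)
                atomic_add_int(&p->p_killpg_cnt, 1);    /* pg_killsx held */
    
        /* fork1(): refuse to create a child while the termination is pending. */
        if (atomic_load_int(&p1->p_killpg_cnt) != 0)
                error = ERESTART;                       /* retry the syscall */
    
        /* postsig()/sigexit()/exit1(): the killpg() signal has taken effect. */
        atomic_add_int(&p->p_killpg_cnt, -1);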
    
    Noted and reviewed by:  markj (previous version)
    Tested by:      pho
    Sponsored by:   The FreeBSD Foundation
    MFC after:      2 weeks
    Differential revision:  https://reviews.freebsd.org/D40493
---
 sys/kern/kern_exit.c   | 13 ++++++++++++-
 sys/kern/kern_fork.c   |  3 ++-
 sys/kern/kern_sig.c    | 37 ++++++++++++++++++++++++++++++++-----
 sys/kern/kern_thread.c |  6 +++---
 sys/sys/proc.h         |  2 ++
 sys/sys/signalvar.h    |  3 ++-
 6 files changed, 53 insertions(+), 11 deletions(-)

diff --git a/sys/kern/kern_exit.c b/sys/kern/kern_exit.c
index a92d5cc0f642..a6f3ca2a2d66 100644
--- a/sys/kern/kern_exit.c
+++ b/sys/kern/kern_exit.c
@@ -220,13 +220,19 @@ proc_set_p2_wexit(struct proc *p)
 	p->p_flag2 |= P2_WEXIT;
 }
 
+void
+exit1(struct thread *td, int rval, int signo)
+{
+	exit2(td, rval, signo, false);
+}
+
 /*
  * Exit: deallocate address space and other resources, change proc state to
  * zombie, and unlink proc from allproc and parent's lists.  Save exit status
  * and rusage for wait().  Check for child processes and orphan them.
  */
 void
-exit1(struct thread *td, int rval, int signo)
+exit2(struct thread *td, int rval, int signo, bool dec_killpg_cnt)
 {
 	struct proc *p, *nq, *q, *t;
 	struct thread *tdt;
@@ -304,6 +310,11 @@ exit1(struct thread *td, int rval, int signo)
 	    ("exit1: proc %p exiting with %d threads", p, p->p_numthreads));
 	racct_sub(p, RACCT_NTHR, 1);
 
+	if (dec_killpg_cnt) {
+		MPASS(atomic_load_int(&p->p_killpg_cnt) > 0);
+		atomic_add_int(&p->p_killpg_cnt, -1);
+	}
+
 	/* Let event handler change exit status */
 	p->p_xexit = rval;
 	p->p_xsig = signo;
diff --git a/sys/kern/kern_fork.c b/sys/kern/kern_fork.c
index 81bee99fa1ca..180c96ae33ef 100644
--- a/sys/kern/kern_fork.c
+++ b/sys/kern/kern_fork.c
@@ -957,7 +957,8 @@ fork1(struct thread *td, struct fork_req *fr)
 	if (sx_slock_sig(&pg->pg_killsx) != 0) {
 		error = ERESTART;
 		goto fail2;
-	} else if (__predict_false(p1->p_pgrp != pg || sig_intr() != 0)) {
+	} else if (__predict_false(p1->p_pgrp != pg || sig_intr() != 0 ||
+	    atomic_load_int(&p1->p_killpg_cnt) != 0)) {
 		/*
 		 * Either the process was moved to other process
 		 * group, or there is pending signal.  sx_slock_sig()
diff --git a/sys/kern/kern_sig.c b/sys/kern/kern_sig.c
index 948ef97c0715..18756d53e98c 100644
--- a/sys/kern/kern_sig.c
+++ b/sys/kern/kern_sig.c
@@ -121,6 +121,7 @@ static int	filt_signal(struct knote *kn, long hint);
 static struct thread *sigtd(struct proc *p, int sig, bool fast_sigblock);
 static void	sigqueue_start(void);
 static void	sigfastblock_setpend(struct thread *td, bool resched);
+static void	sigexit1(struct thread *td, int sig, ksiginfo_t *ksi) __dead2;
 
 static uma_zone_t	ksiginfo_zone = NULL;
 struct filterops sig_filtops = {
@@ -371,6 +372,15 @@ sigqueue_start(void)
 	    TDP_OLDMASK, ast_sigsuspend);
 }
 
+static void
+sig_handle_killpg(struct proc *p, ksiginfo_t *ksi)
+{
+	if ((ksi->ksi_flags & KSI_KILLPG) != 0 && p != NULL) {
+		MPASS(atomic_load_int(&p->p_killpg_cnt) > 0);
+		atomic_add_int(&p->p_killpg_cnt, -1);
+	}
+}
+
 ksiginfo_t *
 ksiginfo_alloc(int mwait)
 {
@@ -470,6 +480,7 @@ sigqueue_take(ksiginfo_t *ksi)
 	p = sq->sq_proc;
 	TAILQ_REMOVE(&sq->sq_list, ksi, ksi_link);
 	ksi->ksi_sigq = NULL;
+	sig_handle_killpg(p, ksi);
 	if (!(ksi->ksi_flags & KSI_EXT) && p != NULL)
 		p->p_pendingcnt--;
 
@@ -567,6 +578,7 @@ sigqueue_flush(sigqueue_t *sq)
 	while ((ksi = TAILQ_FIRST(&sq->sq_list)) != NULL) {
 		TAILQ_REMOVE(&sq->sq_list, ksi, ksi_link);
 		ksi->ksi_sigq = NULL;
+		sig_handle_killpg(p, ksi);
 		if (ksiginfo_tryfree(ksi) && p != NULL)
 			p->p_pendingcnt--;
 	}
@@ -642,6 +654,7 @@ sigqueue_delete_set(sigqueue_t *sq, const sigset_t *set)
 		if (SIGISMEMBER(*set, ksi->ksi_signo)) {
 			TAILQ_REMOVE(&sq->sq_list, ksi, ksi_link);
 			ksi->ksi_sigq = NULL;
+			sig_handle_killpg(p, ksi);
 			if (ksiginfo_tryfree(ksi) && p != NULL)
 				p->p_pendingcnt--;
 		}
@@ -1458,7 +1471,7 @@ kern_sigtimedwait(struct thread *td, sigset_t waitset, ksiginfo_t *ksi,
 #endif
 		if (sig == SIGKILL) {
 			proc_td_siginfo_capture(td, &ksi->ksi_info);
-			sigexit(td, sig);
+			sigexit1(td, sig, ksi);
 		}
 	}
 	PROC_UNLOCK(p);
@@ -1936,8 +1949,10 @@ kern_kill(struct thread *td, pid_t pid, int signum)
 	case -1:		/* broadcast signal */
 		return (killpg1(td, signum, 0, 1, &ksi));
 	case 0:			/* signal own process group */
+		ksi.ksi_flags |= KSI_KILLPG;
 		return (killpg1(td, signum, 0, 0, &ksi));
 	default:		/* negative explicit process group */
+		ksi.ksi_flags |= KSI_KILLPG;
 		return (killpg1(td, signum, -pid, 0, &ksi));
 	}
 	/* NOTREACHED */
@@ -1988,6 +2003,7 @@ okillpg(struct thread *td, struct okillpg_args *uap)
 	ksi.ksi_code = SI_USER;
 	ksi.ksi_pid = td->td_proc->p_pid;
 	ksi.ksi_uid = td->td_ucred->cr_ruid;
+	ksi.ksi_flags |= KSI_KILLPG;
 	return (killpg1(td, uap->signum, uap->pgid, 0, &ksi));
 }
 #endif /* COMPAT_43 */
@@ -2375,6 +2391,10 @@ tdsendsignal(struct proc *p, struct thread *td, int sig, ksiginfo_t *ksi)
 	ret = sigqueue_add(sigqueue, sig, ksi);
 	if (ret != 0)
 		return (ret);
+	if ((ksi->ksi_flags & KSI_KILLPG) != 0) {
+		sx_assert(&p->p_pgrp->pg_killsx, SX_XLOCKED);
+		atomic_add_int(&p->p_killpg_cnt, 1);
+	}
 	signotify(td);
 	/*
 	 * Defer further processing for signals which are held,
@@ -3425,7 +3445,7 @@ postsig(int sig)
 		 */
 		mtx_unlock(&ps->ps_mtx);
 		proc_td_siginfo_capture(td, &ksi.ksi_info);
-		sigexit(td, sig);
+		sigexit1(td, sig, &ksi);
 		/* NOTREACHED */
 	} else {
 		/*
@@ -3453,6 +3473,7 @@ postsig(int sig)
 		if (p->p_sig == sig) {
 			p->p_sig = 0;
 		}
+		sig_handle_killpg(p, &ksi);
 		(*p->p_sysent->sv_sendsig)(action, &ksi, &returnmask);
 		postsig_done(sig, td, ps);
 	}
@@ -3610,8 +3631,8 @@ killproc(struct proc *p, const char *why)
  * If dumping core, save the signal number for the debugger.  Calls exit and
  * does not return.
  */
-void
-sigexit(struct thread *td, int sig)
+static void
+sigexit1(struct thread *td, int sig, ksiginfo_t *ksi)
 {
 	struct proc *p = td->td_proc;
 
@@ -3650,10 +3671,16 @@ sigexit(struct thread *td, int sig)
 			    sig & WCOREFLAG ? " (core dumped)" : "");
 	} else
 		PROC_UNLOCK(p);
-	exit1(td, 0, sig);
+	exit2(td, 0, sig, ksi != NULL && (ksi->ksi_flags & KSI_KILLPG) != 0);
 	/* NOTREACHED */
 }
 
+void
+sigexit(struct thread *td, int sig)
+{
+	sigexit1(td, sig, NULL);
+}
+
 /*
  * Send queued SIGCHLD to parent when child process's state
  * is changed.
diff --git a/sys/kern/kern_thread.c b/sys/kern/kern_thread.c
index 67712450c128..fad1abd1be6c 100644
--- a/sys/kern/kern_thread.c
+++ b/sys/kern/kern_thread.c
@@ -99,7 +99,7 @@ _Static_assert(offsetof(struct proc, p_pid) == 0xc4,
     "struct proc KBI p_pid");
 _Static_assert(offsetof(struct proc, p_filemon) == 0x3c8,
     "struct proc KBI p_filemon");
-_Static_assert(offsetof(struct proc, p_comm) == 0x3e0,
+_Static_assert(offsetof(struct proc, p_comm) == 0x3e4,
     "struct proc KBI p_comm");
 _Static_assert(offsetof(struct proc, p_emuldata) == 0x4d0,
     "struct proc KBI p_emuldata");
@@ -119,9 +119,9 @@ _Static_assert(offsetof(struct proc, p_pid) == 0x78,
     "struct proc KBI p_pid");
 _Static_assert(offsetof(struct proc, p_filemon) == 0x270,
     "struct proc KBI p_filemon");
-_Static_assert(offsetof(struct proc, p_comm) == 0x284,
+_Static_assert(offsetof(struct proc, p_comm) == 0x288,
     "struct proc KBI p_comm");
-_Static_assert(offsetof(struct proc, p_emuldata) == 0x318,
+_Static_assert(offsetof(struct proc, p_emuldata) == 0x31c,
     "struct proc KBI p_emuldata");
 #endif
 
diff --git a/sys/sys/proc.h b/sys/sys/proc.h
index d79b7a440168..5c77c2d683c1 100644
--- a/sys/sys/proc.h
+++ b/sys/sys/proc.h
@@ -722,6 +722,7 @@ struct proc {
 	int		p_pendingexits; /* (c) Count of pending thread exits. */
 	struct filemon	*p_filemon;	/* (c) filemon-specific data. */
 	int		p_pdeathsig;	/* (c) Signal from parent on exit. */
+	int		p_killpg_cnt;
 /* End area that is zeroed on creation. */
 #define	p_endzero	p_magic
 
@@ -1236,6 +1237,7 @@ void	userret(struct thread *, struct trapframe *);
 
 void	cpu_exit(struct thread *);
 void	exit1(struct thread *, int, int) __dead2;
+void	exit2(struct thread *, int, int, bool) __dead2;
 void	cpu_copy_thread(struct thread *td, struct thread *td0);
 bool	cpu_exec_vmspace_reuse(struct proc *p, struct vm_map *map);
 int	cpu_fetch_syscall_args(struct thread *td);
diff --git a/sys/sys/signalvar.h b/sys/sys/signalvar.h
index 59b26105effd..611eb11a8629 100644
--- a/sys/sys/signalvar.h
+++ b/sys/sys/signalvar.h
@@ -240,7 +240,8 @@ typedef struct ksiginfo {
 #define	KSI_SIGQ	0x08	/* Generated by sigqueue, might ret EAGAIN. */
 #define	KSI_HEAD	0x10	/* Insert into head, not tail. */
 #define	KSI_PTRACE	0x20	/* Generated by ptrace. */
-#define	KSI_COPYMASK	(KSI_TRAP | KSI_SIGQ | KSI_PTRACE)
+#define	KSI_KILLPG	0x40	/* killpg - update p_killpg_cnt */
+#define	KSI_COPYMASK	(KSI_TRAP | KSI_SIGQ | KSI_PTRACE | KSI_KILLPG)
 
 #define	KSI_ONQ(ksi)	((ksi)->ksi_sigq != NULL)