git: 05fe82455f26 - main - linuxkpi: races between linux_queue_delayed_work_on() and linux_cancel_delayed_work_sync()

From: Konstantin Belousov <kib@FreeBSD.org>
Date: Tue, 07 Nov 2023 11:23:03 UTC
The branch main has been updated by kib:

URL: https://cgit.FreeBSD.org/src/commit/?id=05fe82455f263ad107a860ce20dd89e1a5c1619c

commit 05fe82455f263ad107a860ce20dd89e1a5c1619c
Author:     Konstantin Belousov <kib@FreeBSD.org>
AuthorDate: 2023-11-04 07:45:48 +0000
Commit:     Konstantin Belousov <kib@FreeBSD.org>
CommitDate: 2023-11-07 10:58:04 +0000

    linuxkpi: races between linux_queue_delayed_work_on() and linux_cancel_delayed_work_sync()
    
    1. Suppose linux_queue_delayed_work_on() is called with a non-zero
       delay and finds work.state == WORK_ST_IDLE.  It moves the state
       to WORK_ST_TIMER and then locks timer.mtx.  Now, if
       linux_cancel_delayed_work_sync() was also called in the
       meantime, read the state as WORK_ST_TIMER, and took the mutex
       first, it executes callout_stop() on a callout that is not
       armed yet.  linux_queue_delayed_work_on() then continues and
       schedules the callout, but the return value from cancel() is
       false, which makes it possible for a requeue from the callback
       to slip in.
    
    2. If linux_cancel_delayed_work_sync() returned true, we need to
       cancel again: the requeue from the callback could have revived
       the work.
    
    The end result is that we may schedule a callout that might already
    be freed, since cancel_delayed_work_sync() claims that everything
    was stopped.  This contradicts the way the KPI is used in Linux,
    where consumers expect cancel_delayed_work_sync() to be reliable on
    its own.  Illustrative sketches of the consumer patterns involved
    follow below.
    
    Reviewed by:    markj
    Discussed with: bz
    Sponsored by:   NVidia networking
    MFC after:      1 week
    Differential revision:  https://reviews.freebsd.org/D42468
---
 sys/compat/linuxkpi/common/src/linux_work.c | 36 ++++++++++++++++++++---------
 1 file changed, 25 insertions(+), 11 deletions(-)
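
For illustration only, and not part of the patch: a minimal sketch of
the self-rearming consumer pattern that point 2 above refers to.  The
work callback re-queues itself, so a single successful cancel pass can
be undone by the callback before everything is drained.  The driver
name, softc layout, and poll interval are hypothetical.

#include <linux/kernel.h>	/* container_of() */
#include <linux/workqueue.h>	/* provided by LinuxKPI on FreeBSD */
#include <linux/jiffies.h>

struct mydrv_softc {			/* hypothetical driver state */
	struct delayed_work poll_work;
	/* ... device state ... */
};

static void
mydrv_poll_fn(struct work_struct *work)
{
	struct delayed_work *dwork = to_delayed_work(work);
	struct mydrv_softc *sc = container_of(dwork, struct mydrv_softc,
	    poll_work);

	(void)sc;		/* a real driver would poll its hardware here */

	/*
	 * Re-arm ourselves.  This is the "requeue from the callback"
	 * that a single cancel pass can race against.
	 */
	schedule_delayed_work(dwork, msecs_to_jiffies(100));
}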

diff --git a/sys/compat/linuxkpi/common/src/linux_work.c b/sys/compat/linuxkpi/common/src/linux_work.c
index 990ba5d20fd5..888ac97dbff6 100644
--- a/sys/compat/linuxkpi/common/src/linux_work.c
+++ b/sys/compat/linuxkpi/common/src/linux_work.c
@@ -221,16 +221,19 @@ linux_queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
 		[WORK_ST_EXEC] = WORK_ST_TIMER,		/* start timeout */
 		[WORK_ST_CANCEL] = WORK_ST_TIMER,	/* start timeout */
 	};
+	bool res;
 
 	if (atomic_read(&wq->draining) != 0)
 		return (!work_pending(&dwork->work));
 
+	mtx_lock(&dwork->timer.mtx);
 	switch (linux_update_state(&dwork->work.state, states)) {
 	case WORK_ST_EXEC:
 	case WORK_ST_CANCEL:
 		if (delay == 0 && linux_work_exec_unblock(&dwork->work) != 0) {
 			dwork->timer.expires = jiffies;
-			return (true);
+			res = true;
+			goto out;
 		}
 		/* FALLTHROUGH */
 	case WORK_ST_IDLE:
@@ -240,20 +243,21 @@ linux_queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
 		if (delay == 0) {
 			linux_delayed_work_enqueue(dwork);
 		} else if (unlikely(cpu != WORK_CPU_UNBOUND)) {
-			mtx_lock(&dwork->timer.mtx);
 			callout_reset_on(&dwork->timer.callout, delay,
 			    &linux_delayed_work_timer_fn, dwork, cpu);
-			mtx_unlock(&dwork->timer.mtx);
 		} else {
-			mtx_lock(&dwork->timer.mtx);
 			callout_reset(&dwork->timer.callout, delay,
 			    &linux_delayed_work_timer_fn, dwork);
-			mtx_unlock(&dwork->timer.mtx);
 		}
-		return (true);
+		res = true;
+		break;
 	default:
-		return (false);		/* already on a queue */
+		res = false;
+		break;
 	}
+out:
+	mtx_unlock(&dwork->timer.mtx);
+	return (res);
 }
 
 void
@@ -467,8 +471,8 @@ linux_cancel_delayed_work(struct delayed_work *dwork)
  * fashion. It returns non-zero if the work was successfully
  * cancelled. Else the work was already cancelled.
  */
-bool
-linux_cancel_delayed_work_sync(struct delayed_work *dwork)
+static bool
+linux_cancel_delayed_work_sync_int(struct delayed_work *dwork)
 {
 	static const uint8_t states[WORK_ST_MAX] __aligned(8) = {
 		[WORK_ST_IDLE] = WORK_ST_IDLE,		/* NOP */
@@ -478,7 +482,6 @@ linux_cancel_delayed_work_sync(struct delayed_work *dwork)
 		[WORK_ST_CANCEL] = WORK_ST_IDLE,	/* cancel and drain */
 	};
 	struct taskqueue *tq;
-	bool retval = false;
 	int ret, state;
 	bool cancelled;
 
@@ -490,7 +493,7 @@ linux_cancel_delayed_work_sync(struct delayed_work *dwork)
 	switch (state) {
 	case WORK_ST_IDLE:
 		mtx_unlock(&dwork->timer.mtx);
-		return (retval);
+		return (false);
 	case WORK_ST_TIMER:
 	case WORK_ST_CANCEL:
 		cancelled = (callout_stop(&dwork->timer.callout) == 1);
@@ -512,6 +515,17 @@ linux_cancel_delayed_work_sync(struct delayed_work *dwork)
 	}
 }
 
+bool
+linux_cancel_delayed_work_sync(struct delayed_work *dwork)
+{
+	bool res;
+
+	res = false;
+	while (linux_cancel_delayed_work_sync_int(dwork))
+		res = true;
+	return (res);
+}
+
 /*
  * This function waits until the given work structure is completed.
  * It returns non-zero if the work was successfully
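
With the fix in place, cancel_delayed_work_sync() can again be relied
on by itself during teardown.  Continuing the hypothetical sketch from
above (same mydrv_softc and self-rearming poll_work; kfree() comes
from <linux/slab.h>), a minimal teardown could look like this:

static void
mydrv_detach(struct mydrv_softc *sc)
{
	/*
	 * After this returns, the timer is not armed, the callback is
	 * not running, and the callback cannot have re-queued the work,
	 * so freeing the containing structure is safe.  Without the fix,
	 * the single internal cancel pass could miss a concurrent
	 * linux_queue_delayed_work_on() or a requeue from the callback,
	 * turning the kfree() below into a use-after-free.
	 */
	cancel_delayed_work_sync(&sc->poll_work);
	kfree(sc);
}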