git: b899a02ad733 - main - ena: Move ena_copy_eni_metrics into separate task

From: Marcin Wojtas <mw_at_FreeBSD.org>
Date: Thu, 30 Jun 2022 16:15:37 UTC
The branch main has been updated by mw:

URL: https://cgit.FreeBSD.org/src/commit/?id=b899a02ad7330cae3c9bb08ad7975601dc3b9551

commit b899a02ad7330cae3c9bb08ad7975601dc3b9551
Author:     Dawid Gorecki <dgr@semihalf.com>
AuthorDate: 2022-06-10 09:18:10 +0000
Commit:     Marcin Wojtas <mw@FreeBSD.org>
CommitDate: 2022-06-30 15:31:53 +0000

    ena: Move ena_copy_eni_metrics into separate task
    
    Copying ENI metrics was done in callout context, this caused the driver
    to panic when sample_interval was set to a value other than 0, as the
    admin queue call which was executed could sleep while waiting on
    a condition variable. Taskqueue, unlike callout, allows for sleeping, so
    moving the function to a separate taskqueue fixes the problem.
    ena_timer_service is still responsible for scheduling the taskqueue.
    
    Stop draining the callout during ena_up/ena_down. This was done to
    prevent a race between ena_up/down and ena_copy_eni_metrics admin queue
    calls. Since ena_metrics_task is protected by ENA_LOCK there is no
    possibility of a race between ena_up/down and ena_metrics_task.
    
    Remove a comment about locking in ena_timer_service. With ENI metrics
    in a separate task this comment became obsolete.
    
    Obtained from: Semihalf
    MFC after: 2 weeks
    Sponsored by: Amazon, Inc.
---
 sys/dev/ena/ena.c | 57 ++++++++++++++++++++++---------------------------------
 sys/dev/ena/ena.h |  2 ++
 2 files changed, 25 insertions(+), 34 deletions(-)

diff --git a/sys/dev/ena/ena.c b/sys/dev/ena/ena.c
index ab9bb177ee25..e5854dc4d140 100644
--- a/sys/dev/ena/ena.c
+++ b/sys/dev/ena/ena.c
@@ -2104,13 +2104,6 @@ ena_up(struct ena_adapter *adapter)
 
 	ena_log(adapter->pdev, INFO, "device is going UP\n");
 
-	/*
-	 * ena_timer_service can use functions, which write to the admin queue.
-	 * Those calls are not protected by ENA_LOCK, and because of that, the
-	 * timer should be stopped when bringing the device up or down.
-	 */
-	ENA_TIMER_DRAIN(adapter);
-
 	/* setup interrupts for IO queues */
 	rc = ena_setup_io_intr(adapter);
 	if (unlikely(rc != 0)) {
@@ -2157,8 +2150,6 @@ ena_up(struct ena_adapter *adapter)
 
 	ena_unmask_all_io_irqs(adapter);
 
-	ENA_TIMER_RESET(adapter);
-
 	return (0);
 
 err_up_complete:
@@ -2168,8 +2159,6 @@ err_up_complete:
 err_create_queues_with_backoff:
 	ena_free_io_irq(adapter);
 error:
-	ENA_TIMER_RESET(adapter);
-
 	return (rc);
 }
 
@@ -2474,9 +2463,6 @@ ena_down(struct ena_adapter *adapter)
 	if (!ENA_FLAG_ISSET(ENA_FLAG_DEV_UP, adapter))
 		return;
 
-	/* Drain timer service to avoid admin queue race condition. */
-	ENA_TIMER_DRAIN(adapter);
-
 	ena_log(adapter->pdev, INFO, "device is going DOWN\n");
 
 	ENA_FLAG_CLEAR_ATOMIC(ENA_FLAG_DEV_UP, adapter);
@@ -2501,8 +2487,6 @@ ena_down(struct ena_adapter *adapter)
 	ena_free_all_rx_resources(adapter);
 
 	counter_u64_add(adapter->dev_stats.interface_down, 1);
-
-	ENA_TIMER_RESET(adapter);
 }
 
 static uint32_t
@@ -3275,24 +3259,7 @@ ena_timer_service(void *data)
 	if ((adapter->eni_metrics_sample_interval != 0) &&
 	    (++adapter->eni_metrics_sample_interval_cnt >=
 	     adapter->eni_metrics_sample_interval)) {
-		/*
-		 * There is no race with other admin queue calls, as:
-		 *   - Timer service runs after attach function ends, so all
-		 *     configuration calls to the admin queue are finished.
-		 *   - Timer service is temporarily stopped when bringing
-		 *     the interface up or down.
-		 *   - After interface is up, the driver doesn't use (at least
-		 *     for now) other functions writing to the admin queue.
-		 *
-		 * It may change in the future, so in that situation, the lock
-		 * will be needed. ENA_LOCK_*() cannot be used for that purpose,
-		 * as callout ena_timer_service is protected by them. It could
-		 * lead to the deadlock if callout_drain() would hold the lock
-		 * before ena_copy_eni_metrics() was executed. It's advised to
-		 * use separate lock in that situation which will be used only
-		 * for the admin queue.
-		 */
-		(void)ena_copy_eni_metrics(adapter);
+		taskqueue_enqueue(adapter->metrics_tq, &adapter->metrics_task);
 		adapter->eni_metrics_sample_interval_cnt = 0;
 	}
 
@@ -3499,6 +3466,16 @@ err:
 	return (rc);
 }
 
+static void
+ena_metrics_task(void *arg, int pending)
+{
+	struct ena_adapter *adapter = (struct ena_adapter *)arg;
+
+	ENA_LOCK_LOCK();
+	(void)ena_copy_eni_metrics(adapter);
+	ENA_LOCK_UNLOCK();
+}
+
 static void
 ena_reset_task(void *arg, int pending)
 {
@@ -3719,6 +3696,13 @@ ena_attach(device_t pdev)
 	taskqueue_start_threads(&adapter->reset_tq, 1, PI_NET,
 	    "%s rstq", device_get_nameunit(adapter->pdev));
 
+	/* Initialize metrics task queue */
+	TASK_INIT(&adapter->metrics_task, 0, ena_metrics_task, adapter);
+	adapter->metrics_tq = taskqueue_create("ena_metrics_enqueue",
+	    M_WAITOK | M_ZERO, taskqueue_thread_enqueue, &adapter->metrics_tq);
+	taskqueue_start_threads(&adapter->metrics_tq, 1, PI_NET,
+	    "%s metricsq", device_get_nameunit(adapter->pdev));
+
 	/* Initialize statistics */
 	ena_alloc_counters((counter_u64_t *)&adapter->dev_stats,
 	    sizeof(struct ena_stats_dev));
@@ -3797,6 +3781,11 @@ ena_detach(device_t pdev)
 	ENA_TIMER_DRAIN(adapter);
 	ENA_LOCK_UNLOCK();
 
+	/* Release metrics task */
+	while (taskqueue_cancel(adapter->metrics_tq, &adapter->metrics_task, NULL))
+		taskqueue_drain(adapter->metrics_tq, &adapter->metrics_task);
+	taskqueue_free(adapter->metrics_tq);
+
 	/* Release reset task */
 	while (taskqueue_cancel(adapter->reset_tq, &adapter->reset_task, NULL))
 		taskqueue_drain(adapter->reset_tq, &adapter->reset_task);
diff --git a/sys/dev/ena/ena.h b/sys/dev/ena/ena.h
index 56e14bd800ff..43fbaad17b78 100644
--- a/sys/dev/ena/ena.h
+++ b/sys/dev/ena/ena.h
@@ -470,6 +470,8 @@ struct ena_adapter {
 	uint32_t next_monitored_tx_qid;
 	struct task reset_task;
 	struct taskqueue *reset_tq;
+	struct task metrics_task;
+	struct taskqueue *metrics_tq;
 	int wd_active;
 	sbintime_t keep_alive_timeout;
 	sbintime_t missing_tx_timeout;