git: 89940eed9118 - stable/14 - ena: Improve reset reason statistics

From: Osama Abboud <osamaabb_at_FreeBSD.org>
Date: Thu, 31 Oct 2024 16:01:25 UTC
The branch stable/14 has been updated by osamaabb:

URL: https://cgit.FreeBSD.org/src/commit/?id=89940eed91182f4dbb20c14bdbb689fc622dad9b

commit 89940eed91182f4dbb20c14bdbb689fc622dad9b
Author:     Osama Abboud <osamaabb@amazon.com>
AuthorDate: 2024-08-07 06:24:19 +0000
Commit:     Osama Abboud <osamaabb@FreeBSD.org>
CommitDate: 2024-10-31 14:54:10 +0000

    ena: Improve reset reason statistics
    
    The driver can trigger a reset for a number of different reasons.
    Only some of them are counted and presented in the driver statistics.
    In some cases the statistics are counted at the ring level,
    but those per-ring counters are zeroed after a reset takes place.
    
    This commit makes the following changes:
    1. Add statistics for the reset reasons that had no counter.
    2. Count the reset reasons which are counted at the ring level
    at the global (device) level as well, for better tracking.
    
    Approved by: cperciva (mentor)
    Sponsored by: Amazon, Inc.
    
    (cherry picked from commit 89ce3f6314f6feba0e6626be51832d44df611218)
---
 sys/dev/ena/ena.c        |  2 --
 sys/dev/ena/ena.h        | 42 ++++++++++++++++++++++++++++++++++++++++++
 sys/dev/ena/ena_sysctl.c | 16 ++++++++++++++++
 3 files changed, 58 insertions(+), 2 deletions(-)

diff --git a/sys/dev/ena/ena.c b/sys/dev/ena/ena.c
index fd92b5046f84..7c86c0594daf 100644
--- a/sys/dev/ena/ena.c
+++ b/sys/dev/ena/ena.c
@@ -3014,7 +3014,6 @@ check_for_missing_keep_alive(struct ena_adapter *adapter)
 	time = getsbinuptime() - timestamp;
 	if (unlikely(time > adapter->keep_alive_timeout)) {
 		ena_log(adapter->pdev, ERR, "Keep alive watchdog timeout.\n");
-		counter_u64_add(adapter->dev_stats.wd_expired, 1);
 		ena_trigger_reset(adapter, ENA_REGS_RESET_KEEP_ALIVE_TO);
 	}
 }
@@ -3026,7 +3025,6 @@ check_for_admin_com_state(struct ena_adapter *adapter)
 	if (unlikely(ena_com_get_admin_running_state(adapter->ena_dev) == false)) {
 		ena_log(adapter->pdev, ERR,
 		    "ENA admin queue is not in running state!\n");
-		counter_u64_add(adapter->dev_stats.admin_q_pause, 1);
 		ena_trigger_reset(adapter, ENA_REGS_RESET_ADMIN_TO);
 	}
 }
diff --git a/sys/dev/ena/ena.h b/sys/dev/ena/ena.h
index be55f63bdb7b..4ac79edd0016 100644
--- a/sys/dev/ena/ena.h
+++ b/sys/dev/ena/ena.h
@@ -381,6 +381,14 @@ struct ena_stats_dev {
 	counter_u64_t interface_up;
 	counter_u64_t interface_down;
 	counter_u64_t admin_q_pause;
+	counter_u64_t total_resets;
+	counter_u64_t os_trigger;
+	counter_u64_t missing_tx_cmpl;
+	counter_u64_t bad_rx_req_id;
+	counter_u64_t bad_tx_req_id;
+	counter_u64_t bad_rx_desc_num;
+	counter_u64_t invalid_state;
+	counter_u64_t missing_intr;
 };
 
 struct ena_hw_stats {
@@ -519,6 +527,29 @@ struct ena_adapter {
 
 extern struct sx ena_global_lock;
 
+#define ENA_RESET_STATS_ENTRY(reset_reason, stat) \
+	[reset_reason] = { \
+	.stat_offset = offsetof(struct ena_stats_dev, stat) / sizeof(u64), \
+	.has_counter = true \
+}
+
+struct ena_reset_stats_offset {
+	int stat_offset;
+	bool has_counter;
+};
+
+static const struct ena_reset_stats_offset resets_to_stats_offset_map[ENA_REGS_RESET_LAST] = {
+	ENA_RESET_STATS_ENTRY(ENA_REGS_RESET_KEEP_ALIVE_TO, wd_expired),
+	ENA_RESET_STATS_ENTRY(ENA_REGS_RESET_ADMIN_TO, admin_q_pause),
+	ENA_RESET_STATS_ENTRY(ENA_REGS_RESET_OS_TRIGGER, os_trigger),
+	ENA_RESET_STATS_ENTRY(ENA_REGS_RESET_MISS_TX_CMPL, missing_tx_cmpl),
+	ENA_RESET_STATS_ENTRY(ENA_REGS_RESET_INV_RX_REQ_ID, bad_rx_req_id),
+	ENA_RESET_STATS_ENTRY(ENA_REGS_RESET_INV_TX_REQ_ID, bad_tx_req_id),
+	ENA_RESET_STATS_ENTRY(ENA_REGS_RESET_TOO_MANY_RX_DESCS, bad_rx_desc_num),
+	ENA_RESET_STATS_ENTRY(ENA_REGS_RESET_DRIVER_INVALID_STATE, invalid_state),
+	ENA_RESET_STATS_ENTRY(ENA_REGS_RESET_MISS_INTERRUPT, missing_intr),
+};
+
 int	ena_up(struct ena_adapter *adapter);
 void	ena_down(struct ena_adapter *adapter);
 int	ena_restore_device(struct ena_adapter *adapter);
@@ -547,6 +578,17 @@ ena_trigger_reset(struct ena_adapter *adapter,
     enum ena_regs_reset_reason_types reset_reason)
 {
 	if (likely(!ENA_FLAG_ISSET(ENA_FLAG_TRIGGER_RESET, adapter))) {
+		const struct ena_reset_stats_offset *ena_reset_stats_offset =
+		    &resets_to_stats_offset_map[reset_reason];
+
+		if (ena_reset_stats_offset->has_counter) {
+			uint64_t *stat_ptr = (uint64_t *)&adapter->dev_stats +
+			    ena_reset_stats_offset->stat_offset;
+
+			counter_u64_add((counter_u64_t)(*stat_ptr), 1);
+		}
+
+		counter_u64_add(adapter->dev_stats.total_resets, 1);
 		adapter->reset_reason = reset_reason;
 		ENA_FLAG_SET_ATOMIC(ENA_FLAG_TRIGGER_RESET, adapter);
 	}
diff --git a/sys/dev/ena/ena_sysctl.c b/sys/dev/ena/ena_sysctl.c
index a94bcbccdc98..6eafe2a8c052 100644
--- a/sys/dev/ena/ena_sysctl.c
+++ b/sys/dev/ena/ena_sysctl.c
@@ -280,6 +280,22 @@ ena_sysctl_add_stats(struct ena_adapter *adapter)
 	    "Network interface down count");
 	SYSCTL_ADD_COUNTER_U64(ctx, child, OID_AUTO, "admin_q_pause",
 	    CTLFLAG_RD, &dev_stats->admin_q_pause, "Admin queue pauses");
+	SYSCTL_ADD_COUNTER_U64(ctx, child, OID_AUTO, "os_trigger", CTLFLAG_RD,
+	    &dev_stats->os_trigger, "OS trigger count");
+	SYSCTL_ADD_COUNTER_U64(ctx, child, OID_AUTO, "missing_tx_cmpl", CTLFLAG_RD,
+	    &dev_stats->missing_tx_cmpl, "Missing TX completions resets count");
+	SYSCTL_ADD_COUNTER_U64(ctx, child, OID_AUTO, "bad_rx_req_id", CTLFLAG_RD,
+	    &dev_stats->bad_rx_req_id, "Bad RX req id count");
+	SYSCTL_ADD_COUNTER_U64(ctx, child, OID_AUTO, "bad_tx_req_id", CTLFLAG_RD,
+	    &dev_stats->bad_tx_req_id, "Bad TX req id count");
+	SYSCTL_ADD_COUNTER_U64(ctx, child, OID_AUTO, "bad_rx_desc_num", CTLFLAG_RD,
+	    &dev_stats->bad_rx_desc_num, "Bad RX descriptors number count");
+	SYSCTL_ADD_COUNTER_U64(ctx, child, OID_AUTO, "invalid_state", CTLFLAG_RD,
+	    &dev_stats->invalid_state, "Driver invalid state count");
+	SYSCTL_ADD_COUNTER_U64(ctx, child, OID_AUTO, "missing_intr", CTLFLAG_RD,
+	    &dev_stats->missing_intr, "Missing interrupt count");
+	SYSCTL_ADD_COUNTER_U64(ctx, child, OID_AUTO, "total_resets", CTLFLAG_RD,
+	    &dev_stats->total_resets, "Total resets count");
 
 	for (i = 0; i < adapter->num_io_queues; ++i, ++tx_ring, ++rx_ring) {
 		snprintf(namebuf, QUEUE_NAME_LEN, "queue%d", i);