git: 189bc23fd0e4 - stable/14 - ena: Add reset reason for corrupted TX cdescs

From: Osama Abboud <osamaabb_at_FreeBSD.org>
Date: Thu, 31 Oct 2024 16:01:26 UTC
The branch stable/14 has been updated by osamaabb:

URL: https://cgit.FreeBSD.org/src/commit/?id=189bc23fd0e4ddc0485dd52b2925728d321ab8ec

commit 189bc23fd0e4ddc0485dd52b2925728d321ab8ec
Author:     Osama Abboud <osamaabb@amazon.com>
AuthorDate: 2024-08-07 06:24:19 +0000
Commit:     Osama Abboud <osamaabb@FreeBSD.org>
CommitDate: 2024-10-31 14:54:10 +0000

    ena: Add reset reason for corrupted TX cdescs
    
    TX completion descriptors may sometimes contain errors due
    to corruption. Upon identifying such a case, the driver will
    trigger a reset with an explicit reset reason
    ENA_REGS_RESET_TX_DESCRIPTOR_MALFORMED.
    
    Approved by: cperciva (mentor)
    Sponsored by: Amazon, Inc.
    
    (cherry picked from commit 38727218460008a500fbc18f08c90082ed678895)
---
 sys/dev/ena/ena.c          | 26 ++++++++++++++++++++++++++
 sys/dev/ena/ena.h          |  3 +++
 sys/dev/ena/ena_datapath.c | 27 ++++++++++-----------------
 sys/dev/ena/ena_netmap.c   | 38 ++++++++++++++------------------------
 sys/dev/ena/ena_sysctl.c   |  2 ++
 5 files changed, 55 insertions(+), 41 deletions(-)

diff --git a/sys/dev/ena/ena.c b/sys/dev/ena/ena.c
index 7c86c0594daf..3f3a4946ccca 100644
--- a/sys/dev/ena/ena.c
+++ b/sys/dev/ena/ena.c
@@ -560,6 +560,32 @@ ena_free_rx_dma_tag(struct ena_adapter *adapter)
 	return (ret);
 }
 
+int
+validate_tx_req_id(struct ena_ring *tx_ring, uint16_t req_id, int tx_req_id_rc)
+{
+	struct ena_adapter *adapter = tx_ring->adapter;
+	enum ena_regs_reset_reason_types reset_reason = ENA_REGS_RESET_INV_TX_REQ_ID;
+
+	if (unlikely(tx_req_id_rc != 0)) {
+		if (tx_req_id_rc == ENA_COM_FAULT) {
+			reset_reason = ENA_REGS_RESET_TX_DESCRIPTOR_MALFORMED;
+			ena_log(adapter->pdev, ERR,
+			    "TX descriptor malformed. req_id %hu qid %hu\n",
+			    req_id, tx_ring->qid);
+		} else if (tx_req_id_rc == ENA_COM_INVAL) {
+			ena_log_nm(adapter->pdev, WARN,
+			    "Invalid req_id %hu in qid %hu\n",
+			    req_id, tx_ring->qid);
+			counter_u64_add(tx_ring->tx_stats.bad_req_id, 1);
+		}
+
+		ena_trigger_reset(adapter, reset_reason);
+		return (EFAULT);
+	}
+
+	return (0);
+}
+
 static void
 ena_release_all_tx_dmamap(struct ena_ring *tx_ring)
 {
diff --git a/sys/dev/ena/ena.h b/sys/dev/ena/ena.h
index 4ac79edd0016..22c42a9346f7 100644
--- a/sys/dev/ena/ena.h
+++ b/sys/dev/ena/ena.h
@@ -389,6 +389,7 @@ struct ena_stats_dev {
 	counter_u64_t bad_rx_desc_num;
 	counter_u64_t invalid_state;
 	counter_u64_t missing_intr;
+	counter_u64_t tx_desc_malformed;
 };
 
 struct ena_hw_stats {
@@ -548,6 +549,7 @@ static const struct ena_reset_stats_offset resets_to_stats_offset_map[ENA_REGS_R
 	ENA_RESET_STATS_ENTRY(ENA_REGS_RESET_TOO_MANY_RX_DESCS, bad_rx_desc_num),
 	ENA_RESET_STATS_ENTRY(ENA_REGS_RESET_DRIVER_INVALID_STATE, invalid_state),
 	ENA_RESET_STATS_ENTRY(ENA_REGS_RESET_MISS_INTERRUPT, missing_intr),
+	ENA_RESET_STATS_ENTRY(ENA_REGS_RESET_TX_DESCRIPTOR_MALFORMED, tx_desc_malformed),
 };
 
 int	ena_up(struct ena_adapter *adapter);
@@ -562,6 +564,7 @@ int	ena_update_queue_size(struct ena_adapter *adapter, uint32_t new_tx_size,
 int	ena_update_io_queue_nb(struct ena_adapter *adapter, uint32_t new_num);
 int     ena_update_base_cpu(struct ena_adapter *adapter, int new_num);
 int     ena_update_cpu_stride(struct ena_adapter *adapter, uint32_t new_num);
+int     validate_tx_req_id(struct ena_ring *tx_ring, uint16_t req_id, int tx_req_id_rc);
 static inline int
 ena_mbuf_count(struct mbuf *mbuf)
 {
diff --git a/sys/dev/ena/ena_datapath.c b/sys/dev/ena/ena_datapath.c
index 0691444f5357..c63a8914f9c2 100644
--- a/sys/dev/ena/ena_datapath.c
+++ b/sys/dev/ena/ena_datapath.c
@@ -200,29 +200,22 @@ ena_get_tx_req_id(struct ena_ring *tx_ring, struct ena_com_io_cq *io_cq,
     uint16_t *req_id)
 {
 	struct ena_adapter *adapter = tx_ring->adapter;
-	int rc;
+	int rc = ena_com_tx_comp_req_id_get(io_cq, req_id);
 
-	rc = ena_com_tx_comp_req_id_get(io_cq, req_id);
-	if (rc == ENA_COM_TRY_AGAIN)
+	if (unlikely(rc == ENA_COM_TRY_AGAIN))
 		return (EAGAIN);
 
-	if (unlikely(rc != 0)) {
-		ena_log(adapter->pdev, ERR, "Invalid req_id %hu in qid %hu\n",
+	rc = validate_tx_req_id(tx_ring, *req_id, rc);
+
+	if (unlikely(tx_ring->tx_buffer_info[*req_id].mbuf == NULL)) {
+		ena_log(adapter->pdev, ERR,
+		    "tx_info doesn't have valid mbuf. req_id %hu qid %hu\n",
 		    *req_id, tx_ring->qid);
-		counter_u64_add(tx_ring->tx_stats.bad_req_id, 1);
-		goto err;
+		ena_trigger_reset(adapter, ENA_REGS_RESET_INV_TX_REQ_ID);
+		rc = EFAULT;
 	}
 
-	if (tx_ring->tx_buffer_info[*req_id].mbuf != NULL)
-		return (0);
-
-	ena_log(adapter->pdev, ERR,
-	    "tx_info doesn't have valid mbuf. req_id %hu qid %hu\n",
-	    *req_id, tx_ring->qid);
-err:
-	ena_trigger_reset(adapter, ENA_REGS_RESET_INV_TX_REQ_ID);
-
-	return (EFAULT);
+	return (rc);
 }
 
 /**
diff --git a/sys/dev/ena/ena_netmap.c b/sys/dev/ena/ena_netmap.c
index ae2140d1217c..618d25a07f67 100644
--- a/sys/dev/ena/ena_netmap.c
+++ b/sys/dev/ena/ena_netmap.c
@@ -71,7 +71,6 @@ static void ena_netmap_unmap_last_socket_chain(struct ena_netmap_ctx *,
     struct ena_tx_buffer *);
 static void ena_netmap_tx_cleanup(struct ena_netmap_ctx *);
 static uint16_t ena_netmap_tx_clean_one(struct ena_netmap_ctx *, uint16_t);
-static inline int validate_tx_req_id(struct ena_ring *, uint16_t);
 static int ena_netmap_rx_frames(struct ena_netmap_ctx *);
 static int ena_netmap_rx_frame(struct ena_netmap_ctx *);
 static int ena_netmap_rx_load_desc(struct ena_netmap_ctx *, uint16_t, int *);
@@ -795,25 +794,33 @@ ena_netmap_unmap_last_socket_chain(struct ena_netmap_ctx *ctx,
 static void
 ena_netmap_tx_cleanup(struct ena_netmap_ctx *ctx)
 {
+	struct ena_ring *tx_ring = ctx->ring;
+	int rc;
 	uint16_t req_id;
 	uint16_t total_tx_descs = 0;
 
 	ctx->nm_i = ctx->kring->nr_hwtail;
-	ctx->nt = ctx->ring->next_to_clean;
+	ctx->nt = tx_ring->next_to_clean;
 
 	/* Reclaim buffers for completed transmissions */
-	while (ena_com_tx_comp_req_id_get(ctx->io_cq, &req_id) >= 0) {
-		if (validate_tx_req_id(ctx->ring, req_id) != 0)
+	do {
+		rc = ena_com_tx_comp_req_id_get(ctx->io_cq, &req_id);
+		if(unlikely(rc == ENA_COM_TRY_AGAIN))
 			break;
+
+		rc = validate_tx_req_id(tx_ring, req_id, rc);
+		if(unlikely(rc != 0))
+			break;
+
 		total_tx_descs += ena_netmap_tx_clean_one(ctx, req_id);
-	}
+	} while (1);
 
 	ctx->kring->nr_hwtail = ctx->nm_i;
 
 	if (total_tx_descs > 0) {
 		/* acknowledge completion of sent packets */
-		ctx->ring->next_to_clean = ctx->nt;
-		ena_com_comp_ack(ctx->ring->ena_com_io_sq, total_tx_descs);
+		tx_ring->next_to_clean = ctx->nt;
+		ena_com_comp_ack(tx_ring->ena_com_io_sq, total_tx_descs);
 	}
 }
 
@@ -856,23 +863,6 @@ ena_netmap_tx_clean_one(struct ena_netmap_ctx *ctx, uint16_t req_id)
 	return tx_info->tx_descs;
 }
 
-static inline int
-validate_tx_req_id(struct ena_ring *tx_ring, uint16_t req_id)
-{
-	struct ena_adapter *adapter = tx_ring->adapter;
-
-	if (likely(req_id < tx_ring->ring_size))
-		return (0);
-
-	ena_log_nm(adapter->pdev, WARN, "Invalid req_id %hu in qid %hu\n",
-	    req_id, tx_ring->qid);
-	counter_u64_add(tx_ring->tx_stats.bad_req_id, 1);
-
-	ena_trigger_reset(adapter, ENA_REGS_RESET_INV_TX_REQ_ID);
-
-	return (EFAULT);
-}
-
 static int
 ena_netmap_rxsync(struct netmap_kring *kring, int flags)
 {
diff --git a/sys/dev/ena/ena_sysctl.c b/sys/dev/ena/ena_sysctl.c
index 6eafe2a8c052..71c25dd5e443 100644
--- a/sys/dev/ena/ena_sysctl.c
+++ b/sys/dev/ena/ena_sysctl.c
@@ -294,6 +294,8 @@ ena_sysctl_add_stats(struct ena_adapter *adapter)
 	    &dev_stats->invalid_state, "Driver invalid state count");
 	SYSCTL_ADD_COUNTER_U64(ctx, child, OID_AUTO, "missing_intr", CTLFLAG_RD,
 	    &dev_stats->missing_intr, "Missing interrupt count");
+	SYSCTL_ADD_COUNTER_U64(ctx, child, OID_AUTO, "tx_desc_malformed", CTLFLAG_RD,
+	    &dev_stats->tx_desc_malformed, "TX descriptors malformed count");
 	SYSCTL_ADD_COUNTER_U64(ctx, child, OID_AUTO, "total_resets", CTLFLAG_RD,
 	    &dev_stats->total_resets, "Total resets count");