git: bc531a1faa99 - main - mlx5en: Improve CQE error debugging.

From: Hans Petter Selasky <hselasky_at_FreeBSD.org>
Date: Thu, 17 Feb 2022 12:13:51 UTC
The branch main has been updated by hselasky:

URL: https://cgit.FreeBSD.org/src/commit/?id=bc531a1faa99b94b7b7761f1640304dd815eec5d

commit bc531a1faa99b94b7b7761f1640304dd815eec5d
Author:     Hans Petter Selasky <hselasky@FreeBSD.org>
AuthorDate: 2022-02-17 11:50:22 +0000
Commit:     Hans Petter Selasky <hselasky@FreeBSD.org>
CommitDate: 2022-02-17 12:13:09 +0000

    mlx5en: Improve CQE error debugging.
    
    MFC after:      1 week
    Sponsored by:   NVIDIA Networking
---
 sys/dev/mlx5/mlx5_en/en.h           |  2 ++
 sys/dev/mlx5/mlx5_en/mlx5_en_rx.c   |  1 +
 sys/dev/mlx5/mlx5_en/mlx5_en_tx.c   |  4 +++-
 sys/dev/mlx5/mlx5_en/mlx5_en_txrx.c | 25 +++++++++++++++++++++++++
 4 files changed, 31 insertions(+), 1 deletion(-)

diff --git a/sys/dev/mlx5/mlx5_en/en.h b/sys/dev/mlx5/mlx5_en/en.h
index fa355c68831e..36a55ff5c4d0 100644
--- a/sys/dev/mlx5/mlx5_en/en.h
+++ b/sys/dev/mlx5/mlx5_en/en.h
@@ -1205,6 +1205,8 @@ int	mlx5e_open_locked(struct ifnet *);
 int	mlx5e_close_locked(struct ifnet *);
 
 void	mlx5e_cq_error_event(struct mlx5_core_cq *mcq, int event);
+void	mlx5e_dump_err_cqe(struct mlx5e_cq *, u32, const struct mlx5_err_cqe *);
+
 mlx5e_cq_comp_t mlx5e_rx_cq_comp;
 mlx5e_cq_comp_t mlx5e_tx_cq_comp;
 struct mlx5_cqe64 *mlx5e_get_cqe(struct mlx5e_cq *cq);
diff --git a/sys/dev/mlx5/mlx5_en/mlx5_en_rx.c b/sys/dev/mlx5/mlx5_en/mlx5_en_rx.c
index 0e3a3b3917f4..3c8813190f76 100644
--- a/sys/dev/mlx5/mlx5_en/mlx5_en_rx.c
+++ b/sys/dev/mlx5/mlx5_en/mlx5_en_rx.c
@@ -495,6 +495,7 @@ mlx5e_poll_rx_cq(struct mlx5e_rq *rq, int budget)
 		    BUS_DMASYNC_POSTREAD);
 
 		if (unlikely((cqe->op_own >> 4) != MLX5_CQE_RESP_SEND)) {
+			mlx5e_dump_err_cqe(&rq->cq, rq->rqn, (const void *)cqe);
 			rq->stats.wqe_err++;
 			goto wq_ll_pop;
 		}
diff --git a/sys/dev/mlx5/mlx5_en/mlx5_en_tx.c b/sys/dev/mlx5/mlx5_en/mlx5_en_tx.c
index 9e0837a76393..78458ab69f13 100644
--- a/sys/dev/mlx5/mlx5_en/mlx5_en_tx.c
+++ b/sys/dev/mlx5/mlx5_en/mlx5_en_tx.c
@@ -1045,8 +1045,10 @@ mlx5e_poll_tx_cq(struct mlx5e_sq *sq, int budget)
 		mlx5_cqwq_pop(&sq->cq.wq);
 
 		/* check if the completion event indicates an error */
-		if (unlikely(get_cqe_opcode(cqe) != MLX5_CQE_REQ))
+		if (unlikely(get_cqe_opcode(cqe) != MLX5_CQE_REQ)) {
+			mlx5e_dump_err_cqe(&sq->cq, sq->sqn, (const void *)cqe);
 			sq->stats.cqe_err++;
+		}
 
 		/* setup local variables */
 		sqcc_this = be16toh(cqe->wqe_counter);
diff --git a/sys/dev/mlx5/mlx5_en/mlx5_en_txrx.c b/sys/dev/mlx5/mlx5_en/mlx5_en_txrx.c
index 9f5e17ad864e..aff247f5aea2 100644
--- a/sys/dev/mlx5/mlx5_en/mlx5_en_txrx.c
+++ b/sys/dev/mlx5/mlx5_en/mlx5_en_txrx.c
@@ -28,6 +28,8 @@
 #include "opt_rss.h"
 #include "opt_ratelimit.h"
 
+#include <linux/printk.h>
+
 #include <dev/mlx5/mlx5_en/en.h>
 
 struct mlx5_cqe64 *
@@ -54,3 +56,26 @@ mlx5e_cq_error_event(struct mlx5_core_cq *mcq, int event)
 	mlx5_en_err(cq->priv->ifp, "cqn=0x%.6x event=0x%.2x\n",
 	    mcq->cqn, event);
 }
+
+void
+mlx5e_dump_err_cqe(struct mlx5e_cq *cq, u32 qn, const struct mlx5_err_cqe *err_cqe)
+{
+	u32 ci;	/* logged as "CI" below */
+
+	/* Don't print "flushed in error" syndromes. */
+	if (err_cqe->vendor_err_synd == 0xf9 && err_cqe->syndrome == 0x05)
+		return;
+	/* Don't print when the queue is set to error state by software. */
+	if (err_cqe->vendor_err_synd == 0xf5 && err_cqe->syndrome == 0x05)
+		return;
+
+	ci = (cq->wq.cc - 1) & cq->wq.sz_m1;	/* entry before cc, wrapped */
+
+	mlx5_en_err(cq->priv->ifp,
+	    "Error CQE on CQN 0x%x, CI 0x%x, QN 0x%x, OPCODE 0x%x, SYNDROME 0x%x, VENDOR SYNDROME 0x%x\n",
+	    cq->mcq.cqn, ci, qn, err_cqe->op_own >> 4,
+	    err_cqe->syndrome, err_cqe->vendor_err_synd);
+
+	print_hex_dump(NULL, NULL, DUMP_PREFIX_OFFSET,
+	    16, 1, err_cqe, sizeof(*err_cqe), false);	/* raw CQE bytes */
+}