git: 1bda36a393c2 - stable/14 - gve: Add DQO QPL support

From: Mark Johnston <markj_at_FreeBSD.org>
Date: Wed, 20 Nov 2024 21:41:17 UTC
The branch stable/14 has been updated by markj:

URL: https://cgit.FreeBSD.org/src/commit/?id=1bda36a393c24a05581522a9dc56b4f6f5b53e10

commit 1bda36a393c24a05581522a9dc56b4f6f5b53e10
Author:     Shailend Chand <shailend@google.com>
AuthorDate: 2024-11-05 19:38:30 +0000
Commit:     Mark Johnston <markj@FreeBSD.org>
CommitDate: 2024-11-20 21:41:08 +0000

    gve: Add DQO QPL support
    
    DQO is the descriptor format for our next generation virtual NIC.
    It is necessary to make full use of the hardware bandwidth on many
    newer GCP VM shapes.
    
    This patch extends the previously introduced DQO descriptor format
    with a "QPL" mode. QPL stands for Queue Page List and refers to
    the fact that the hardware cannot access arbitrary regions of the
    host memory and instead expects a fixed bounce buffer comprising
    a list of pages.
    
    The QPL aspects are similar to those of the existing GQI queue
    format: the mbufs input in the Rx path have external storage in
    the form of vm pages attached to them, and in the Tx path the
    mbuf payload is always copied into QPL pages.
    
    Signed-off-by: Shailend Chand <shailend@google.com>
    Reviewed by: markj
    MFC after: 2 weeks
    Differential Revision: https://reviews.freebsd.org/D46691
    
    (cherry picked from commit 2348ac893d10f06d2d84e1e4bd5ca9f1c5da92d8)
---
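
For readers new to the QPL model described above, the following is a minimal
userspace sketch of the idea: a fixed pool of pages is carved into 2 KiB
buffers, free buffers are chained through an int32_t index array (as the
driver's tx->dqo.qpl_bufs array does), and transmitting a packet means copying
its payload into a chain of those buffers. All names below (qpl_pool,
qpl_copy_payload, and so on) are illustrative, not the driver's actual API.

/*
 * Illustrative userspace model of the QPL bounce-buffer scheme; the names
 * here are made up for the sketch and are not the driver's API.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define QPL_PAGE_SIZE		4096
#define QPL_BUF_SIZE		2048	/* cf. GVE_TX_BUF_SIZE_DQO */
#define QPL_NUM_PAGES		4
#define QPL_BUFS_PER_PAGE	(QPL_PAGE_SIZE / QPL_BUF_SIZE)
#define QPL_NUM_BUFS		(QPL_NUM_PAGES * QPL_BUFS_PER_PAGE)

struct qpl_pool {
	uint8_t pages[QPL_NUM_PAGES][QPL_PAGE_SIZE];	/* pages registered with the NIC */
	int32_t next[QPL_NUM_BUFS];	/* chains buffers into lists; -1 terminates */
	int32_t free_head;		/* cf. tx->dqo.free_qpl_bufs_csm */
};

static void
qpl_pool_init(struct qpl_pool *p)
{
	for (int i = 0; i < QPL_NUM_BUFS; i++)
		p->next[i] = (i + 1 < QPL_NUM_BUFS) ? i + 1 : -1;
	p->free_head = 0;
}

static uint8_t *
qpl_buf_addr(struct qpl_pool *p, int32_t buf)
{
	/* A buffer index selects a page and a fixed offset within it. */
	return (&p->pages[buf / QPL_BUFS_PER_PAGE]
	    [(buf % QPL_BUFS_PER_PAGE) * QPL_BUF_SIZE]);
}

/*
 * Copy a payload into a chain of QPL buffers and return the head index of
 * the chain, or -1 if the pool ran dry. This mirrors what the Tx path must
 * do in QPL mode: the NIC only ever reads the registered pages, never the
 * original mbuf.
 */
static int32_t
qpl_copy_payload(struct qpl_pool *p, const void *data, size_t len)
{
	const uint8_t *src = data;
	int32_t head = -1, tail = -1;

	while (len > 0) {
		int32_t buf = p->free_head;

		if (buf == -1) {
			/* Give back what we took; a driver would defer the packet. */
			if (head != -1) {
				p->next[tail] = p->free_head;
				p->free_head = head;
			}
			return (-1);
		}
		p->free_head = p->next[buf];
		p->next[buf] = -1;

		size_t chunk = len < QPL_BUF_SIZE ? len : QPL_BUF_SIZE;
		memcpy(qpl_buf_addr(p, buf), src, chunk);
		src += chunk;
		len -= chunk;

		if (head == -1)
			head = buf;
		else
			p->next[tail] = buf;
		tail = buf;
	}
	return (head);
}

int
main(void)
{
	struct qpl_pool pool;
	uint8_t pkt[5000];

	qpl_pool_init(&pool);
	memset(pkt, 0xab, sizeof(pkt));
	printf("chain head: %d\n", (int)qpl_copy_payload(&pool, pkt, sizeof(pkt)));
	return (0);
}

When the completion for a packet arrives, the driver returns its buffer chain
to a producer free list that the transmit taskqueue later steals; that is the
role of the free_qpl_bufs_prd/free_qpl_bufs_csm pair added in the diff below.
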
 share/man/man4/gve.4     |   2 +
 sys/dev/gve/gve.h        | 101 ++++++++++-
 sys/dev/gve/gve_adminq.c |  38 +++-
 sys/dev/gve/gve_adminq.h |  14 +-
 sys/dev/gve/gve_dqo.h    |  17 +-
 sys/dev/gve/gve_main.c   |  20 ++-
 sys/dev/gve/gve_qpl.c    |  32 +++-
 sys/dev/gve/gve_rx.c     |  18 --
 sys/dev/gve/gve_rx_dqo.c | 449 +++++++++++++++++++++++++++++++++++++++++++----
 sys/dev/gve/gve_sysctl.c |   8 +
 sys/dev/gve/gve_tx.c     |   5 +-
 sys/dev/gve/gve_tx_dqo.c | 425 ++++++++++++++++++++++++++++++++++++--------
 12 files changed, 983 insertions(+), 146 deletions(-)

diff --git a/share/man/man4/gve.4 b/share/man/man4/gve.4
index 82db2af548ae..8c2b3cabbd38 100644
--- a/share/man/man4/gve.4
+++ b/share/man/man4/gve.4
@@ -239,6 +239,8 @@ The D in "DQO" refers to a newer generation of hardware, and the "QO"
 stands for "Queue Out-of-order" referring to the fact that the NIC might
 send Tx and Rx completions in an order different from the one in which
 the corresponding descriptors were posted by the driver.
+.It
+DQO_QPL: The next generation descriptor format in the "QPL" mode.
 .El
 .Sh SUPPORT
 Please email gvnic-drivers@google.com with the specifics of the issue encountered.
diff --git a/sys/dev/gve/gve.h b/sys/dev/gve/gve.h
index 98f1139c6bc2..43082d64ba95 100644
--- a/sys/dev/gve/gve.h
+++ b/sys/dev/gve/gve.h
@@ -105,6 +105,7 @@ enum gve_queue_format {
 	GVE_GQI_RDA_FORMAT		= 0x1,
 	GVE_GQI_QPL_FORMAT		= 0x2,
 	GVE_DQO_RDA_FORMAT		= 0x3,
+	GVE_DQO_QPL_FORMAT		= 0x4,
 };
 
 enum gve_state_flags_bit {
@@ -226,6 +227,7 @@ struct gve_rxq_stats {
 	counter_u64_t rx_frag_flip_cnt;
 	counter_u64_t rx_frag_copy_cnt;
 	counter_u64_t rx_dropped_pkt_desc_err;
+	counter_u64_t rx_dropped_pkt_buf_post_fail;
 	counter_u64_t rx_dropped_pkt_mbuf_alloc_fail;
 	counter_u64_t rx_mbuf_dmamap_err;
 	counter_u64_t rx_mbuf_mclget_null;
@@ -233,11 +235,34 @@ struct gve_rxq_stats {
 
 #define NUM_RX_STATS (sizeof(struct gve_rxq_stats) / sizeof(counter_u64_t))
 
+union gve_rx_qpl_buf_id_dqo {
+	struct {
+		uint16_t buf_id:11; /* Index into rx->dqo.bufs */
+		uint8_t frag_num:5; /* Which frag in the QPL page */
+	};
+	uint16_t all;
+} __packed;
+_Static_assert(sizeof(union gve_rx_qpl_buf_id_dqo) == 2,
+    "gve: bad dqo qpl rx buf id length");
+
 struct gve_rx_buf_dqo {
-	struct mbuf *mbuf;
-	bus_dmamap_t dmamap;
-	uint64_t addr;
-	bool mapped;
+	union {
+		/* RDA */
+		struct {
+			struct mbuf *mbuf;
+			bus_dmamap_t dmamap;
+			uint64_t addr;
+			bool mapped;
+		};
+		/* QPL */
+		struct {
+			uint8_t num_nic_frags; /* number of pending completions */
+			uint8_t next_idx;  /* index of the next frag to post */
+			/* for chaining rx->dqo.used_bufs */
+			STAILQ_ENTRY(gve_rx_buf_dqo) stailq_entry;
+		};
+	};
+	/* for chaining rx->dqo.free_bufs */
 	SLIST_ENTRY(gve_rx_buf_dqo) slist_entry;
 };
 
@@ -276,6 +301,13 @@ struct gve_rx_ring {
 			uint32_t tail; /* The index at which to receive the next compl at */
 			uint8_t cur_gen_bit; /* Gets flipped on every cycle of the compl ring */
 			SLIST_HEAD(, gve_rx_buf_dqo) free_bufs;
+
+			/*
+			 * Only used in QPL mode. Pages referred to by if_input-ed mbufs
+			 * stay parked here till their wire count comes back to 1.
+			 * Pages are moved here once they have no pending completions left.
+			 */
+			STAILQ_HEAD(, gve_rx_buf_dqo) used_bufs;
 		} dqo;
 	};
 
@@ -313,6 +345,7 @@ struct gve_txq_stats {
 	counter_u64_t tx_dropped_pkt_nospace_bufring;
 	counter_u64_t tx_delayed_pkt_nospace_descring;
 	counter_u64_t tx_delayed_pkt_nospace_compring;
+	counter_u64_t tx_delayed_pkt_nospace_qpl_bufs;
 	counter_u64_t tx_delayed_pkt_tsoerr;
 	counter_u64_t tx_dropped_pkt_vlan;
 	counter_u64_t tx_mbuf_collapse;
@@ -326,7 +359,19 @@ struct gve_txq_stats {
 
 struct gve_tx_pending_pkt_dqo {
 	struct mbuf *mbuf;
-	bus_dmamap_t dmamap;
+	union {
+		/* RDA */
+		bus_dmamap_t dmamap;
+		/* QPL */
+		struct {
+			/*
+			 * A linked list of entries from qpl_bufs that served
+			 * as the bounce buffer for this packet.
+			 */
+			int32_t qpl_buf_head;
+			uint32_t num_qpl_bufs;
+		};
+	};
 	uint8_t state; /* the gve_packet_state enum */
 	int next; /* To chain the free_pending_pkts lists */
 };
@@ -377,7 +422,20 @@ struct gve_tx_ring {
 				 */
 				int32_t free_pending_pkts_csm;
 
-				bus_dma_tag_t buf_dmatag; /* DMA params for mapping Tx mbufs */
+				/*
+				 * The head index of a singly linked list representing QPL page fragments
+				 * to copy mbuf payload into for the NIC to see. Once this list is depleted,
+				 * the "_prd" suffixed producer list, grown by the completion taskqueue,
+				 * is stolen.
+				 *
+				 * Only used in QPL mode. int32_t because atomic_swap_16 doesn't exist.
+				 */
+				int32_t free_qpl_bufs_csm;
+				uint32_t qpl_bufs_consumed; /* Allows quickly checking for buf availability */
+				uint32_t qpl_bufs_produced_cached; /* Cached value of qpl_bufs_produced */
+
+				/* DMA params for mapping Tx mbufs. Only used in RDA mode. */
+				bus_dma_tag_t buf_dmatag;
 			} __aligned(CACHE_LINE_SIZE);
 
 			/* Accessed when processing completions */
@@ -395,6 +453,18 @@ struct gve_tx_ring {
 				 * its consumer list, with the "_csm" suffix, is depleted.
 				 */
 				int32_t free_pending_pkts_prd;
+
+				/*
+				 * The completion taskqueue moves the QPL pages corresponding to a
+				 * completed packet into this list. It is only used in QPL mode.
+				 * The "_prd" denotes that this is a producer list. The transmit
+				 * taskqueue steals this list once its consumer list, with the "_csm"
+				 * suffix, is depleted.
+				 *
+				 * Only used in QPL mode. int32_t because atomic_swap_16 doesn't exist.
+				 */
+				int32_t free_qpl_bufs_prd;
+				uint32_t qpl_bufs_produced;
 			} __aligned(CACHE_LINE_SIZE);
 
 			/* Accessed by both the completion and xmit loops */
@@ -402,6 +472,16 @@ struct gve_tx_ring {
 				/* completion tags index into this array */
 				struct gve_tx_pending_pkt_dqo *pending_pkts;
 				uint16_t num_pending_pkts;
+
+				/*
+				 * Represents QPL page fragments. An index into this array
+				 * always represents the same QPL page fragment. The value
+				 * is also an index into this array and serves as a means
+				 * to chain buffers into linked lists whose heads are
+				 * either free_qpl_bufs_prd or free_qpl_bufs_csm or
+				 * a pending packet's qpl_buf_head.
+				 */
+				int32_t *qpl_bufs;
 			} __aligned(CACHE_LINE_SIZE);
 		} dqo;
 	};
@@ -531,6 +611,13 @@ gve_is_gqi(struct gve_priv *priv)
 	return (priv->queue_format == GVE_GQI_QPL_FORMAT);
 }
 
+static inline bool
+gve_is_qpl(struct gve_priv *priv)
+{
+	return (priv->queue_format == GVE_GQI_QPL_FORMAT ||
+	    priv->queue_format == GVE_DQO_QPL_FORMAT);
+}
+
 /* Defined in gve_main.c */
 void gve_schedule_reset(struct gve_priv *priv);
 
@@ -545,6 +632,7 @@ int gve_alloc_qpls(struct gve_priv *priv);
 void gve_free_qpls(struct gve_priv *priv);
 int gve_register_qpls(struct gve_priv *priv);
 int gve_unregister_qpls(struct gve_priv *priv);
+void gve_mextadd_free(struct mbuf *mbuf);
 
 /* TX functions defined in gve_tx.c */
 int gve_alloc_tx_rings(struct gve_priv *priv);
@@ -563,6 +651,7 @@ void gve_tx_free_ring_dqo(struct gve_priv *priv, int i);
 void gve_clear_tx_ring_dqo(struct gve_priv *priv, int i);
 int gve_tx_intr_dqo(void *arg);
 int gve_xmit_dqo(struct gve_tx_ring *tx, struct mbuf **mbuf_ptr);
+int gve_xmit_dqo_qpl(struct gve_tx_ring *tx, struct mbuf *mbuf);
 void gve_tx_cleanup_tq_dqo(void *arg, int pending);
 
 /* RX functions defined in gve_rx.c */
diff --git a/sys/dev/gve/gve_adminq.c b/sys/dev/gve/gve_adminq.c
index 7865b979888b..dd03f817f45a 100644
--- a/sys/dev/gve/gve_adminq.c
+++ b/sys/dev/gve/gve_adminq.c
@@ -58,6 +58,7 @@ void gve_parse_device_option(struct gve_priv *priv,
     struct gve_device_option *option,
     struct gve_device_option_gqi_qpl **dev_op_gqi_qpl,
     struct gve_device_option_dqo_rda **dev_op_dqo_rda,
+    struct gve_device_option_dqo_qpl **dev_op_dqo_qpl,
     struct gve_device_option_jumbo_frames **dev_op_jumbo_frames)
 {
 	uint32_t req_feat_mask = be32toh(option->required_features_mask);
@@ -103,6 +104,23 @@ void gve_parse_device_option(struct gve_priv *priv,
 		*dev_op_dqo_rda = (void *)(option + 1);
 		break;
 
+	case GVE_DEV_OPT_ID_DQO_QPL:
+		if (option_length < sizeof(**dev_op_dqo_qpl) ||
+		    req_feat_mask != GVE_DEV_OPT_REQ_FEAT_MASK_DQO_QPL) {
+			device_printf(priv->dev, GVE_DEVICE_OPTION_ERROR_FMT,
+			    "DQO QPL", (int)sizeof(**dev_op_dqo_qpl),
+			    GVE_DEV_OPT_REQ_FEAT_MASK_DQO_QPL,
+			    option_length, req_feat_mask);
+			break;
+		}
+
+		if (option_length > sizeof(**dev_op_dqo_qpl)) {
+			device_printf(priv->dev, GVE_DEVICE_OPTION_TOO_BIG_FMT,
+			    "DQO QPL");
+		}
+		*dev_op_dqo_qpl = (void *)(option + 1);
+		break;
+
 	case GVE_DEV_OPT_ID_JUMBO_FRAMES:
 		if (option_length < sizeof(**dev_op_jumbo_frames) ||
 		    req_feat_mask != GVE_DEV_OPT_REQ_FEAT_MASK_JUMBO_FRAMES) {
@@ -136,6 +154,7 @@ gve_process_device_options(struct gve_priv *priv,
     struct gve_device_descriptor *descriptor,
     struct gve_device_option_gqi_qpl **dev_op_gqi_qpl,
     struct gve_device_option_dqo_rda **dev_op_dqo_rda,
+    struct gve_device_option_dqo_qpl **dev_op_dqo_qpl,
     struct gve_device_option_jumbo_frames **dev_op_jumbo_frames)
 {
 	char *desc_end = (char *)descriptor + be16toh(descriptor->total_length);
@@ -154,7 +173,10 @@ gve_process_device_options(struct gve_priv *priv,
 		}
 
 		gve_parse_device_option(priv, descriptor, dev_opt,
-		    dev_op_gqi_qpl, dev_op_dqo_rda, dev_op_jumbo_frames);
+		    dev_op_gqi_qpl,
+		    dev_op_dqo_rda,
+		    dev_op_dqo_qpl,
+		    dev_op_jumbo_frames);
 		dev_opt = (void *)((char *)(dev_opt + 1) + be16toh(dev_opt->option_length));
 	}
 
@@ -387,6 +409,7 @@ gve_adminq_describe_device(struct gve_priv *priv)
 	struct gve_dma_handle desc_mem;
 	struct gve_device_option_gqi_qpl *dev_op_gqi_qpl = NULL;
 	struct gve_device_option_dqo_rda *dev_op_dqo_rda = NULL;
+	struct gve_device_option_dqo_qpl *dev_op_dqo_qpl = NULL;
 	struct gve_device_option_jumbo_frames *dev_op_jumbo_frames = NULL;
 	uint32_t supported_features_mask = 0;
 	int rc;
@@ -416,7 +439,9 @@ gve_adminq_describe_device(struct gve_priv *priv)
 	bus_dmamap_sync(desc_mem.tag, desc_mem.map, BUS_DMASYNC_POSTREAD);
 
 	rc = gve_process_device_options(priv, desc,
-	    &dev_op_gqi_qpl, &dev_op_dqo_rda,
+	    &dev_op_gqi_qpl,
+	    &dev_op_dqo_rda,
+	    &dev_op_dqo_qpl,
 	    &dev_op_jumbo_frames);
 	if (rc != 0)
 		goto free_device_descriptor;
@@ -430,6 +455,15 @@ gve_adminq_describe_device(struct gve_priv *priv)
 		if (bootverbose)
 			device_printf(priv->dev,
 			    "Driver is running with DQO RDA queue format.\n");
+	} else if (dev_op_dqo_qpl != NULL) {
+		snprintf(gve_queue_format, sizeof(gve_queue_format),
+		    "%s", "DQO QPL");
+		priv->queue_format = GVE_DQO_QPL_FORMAT;
+		supported_features_mask = be32toh(
+		    dev_op_dqo_qpl->supported_features_mask);
+		if (bootverbose)
+			device_printf(priv->dev,
+			    "Driver is running with DQO QPL queue format.\n");
 	} else if (dev_op_gqi_qpl != NULL) {
 		snprintf(gve_queue_format, sizeof(gve_queue_format),
 		    "%s", "GQI QPL");
diff --git a/sys/dev/gve/gve_adminq.h b/sys/dev/gve/gve_adminq.h
index b5d512331d42..37a7cb3ecbb8 100644
--- a/sys/dev/gve/gve_adminq.h
+++ b/sys/dev/gve/gve_adminq.h
@@ -144,6 +144,15 @@ struct gve_device_option_dqo_rda {
 _Static_assert(sizeof(struct gve_device_option_dqo_rda) == 8,
     "gve: bad admin queue struct length");
 
+struct gve_device_option_dqo_qpl {
+	__be32 supported_features_mask;
+	__be16 tx_comp_ring_entries;
+	__be16 rx_buff_ring_entries;
+};
+
+_Static_assert(sizeof(struct gve_device_option_dqo_qpl) == 8,
+    "gve: bad admin queue struct length");
+
 struct gve_device_option_modify_ring {
 	__be32 supported_features_mask;
 	__be16 max_rx_ring_size;
@@ -168,6 +177,7 @@ enum gve_dev_opt_id {
 	GVE_DEV_OPT_ID_GQI_QPL = 0x3,
 	GVE_DEV_OPT_ID_DQO_RDA = 0x4,
 	GVE_DEV_OPT_ID_MODIFY_RING = 0x6,
+	GVE_DEV_OPT_ID_DQO_QPL = 0x7,
 	GVE_DEV_OPT_ID_JUMBO_FRAMES = 0x8,
 };
 
@@ -182,6 +192,7 @@ enum gve_dev_opt_req_feat_mask {
 	GVE_DEV_OPT_REQ_FEAT_MASK_GQI_RDA = 0x0,
 	GVE_DEV_OPT_REQ_FEAT_MASK_GQI_QPL = 0x0,
 	GVE_DEV_OPT_REQ_FEAT_MASK_DQO_RDA = 0x0,
+	GVE_DEV_OPT_REQ_FEAT_MASK_DQO_QPL = 0x0,
 	GVE_DEV_OPT_REQ_FEAT_MASK_MODIFY_RING = 0x0,
 	GVE_DEV_OPT_REQ_FEAT_MASK_JUMBO_FRAMES = 0x0,
 };
@@ -196,7 +207,7 @@ enum gve_sup_feature_mask {
 enum gve_driver_capability {
 	gve_driver_capability_gqi_qpl = 0,
 	gve_driver_capability_gqi_rda = 1,
-	gve_driver_capability_dqo_qpl = 2, /* reserved for future use */
+	gve_driver_capability_dqo_qpl = 2,
 	gve_driver_capability_dqo_rda = 3,
 };
 
@@ -212,6 +223,7 @@ enum gve_driver_capability {
  */
 #define GVE_DRIVER_CAPABILITY_FLAGS1 \
 	(GVE_CAP1(gve_driver_capability_gqi_qpl) | \
+	 GVE_CAP1(gve_driver_capability_dqo_qpl) | \
 	 GVE_CAP1(gve_driver_capability_dqo_rda))
 #define GVE_DRIVER_CAPABILITY_FLAGS2 0x0
 #define GVE_DRIVER_CAPABILITY_FLAGS3 0x0
diff --git a/sys/dev/gve/gve_dqo.h b/sys/dev/gve/gve_dqo.h
index 5f3f36d2245f..214138303a77 100644
--- a/sys/dev/gve/gve_dqo.h
+++ b/sys/dev/gve/gve_dqo.h
@@ -57,7 +57,22 @@
  * Start dropping RX fragments if at least these many
  * buffers cannot be posted to the NIC.
  */
-#define GVE_RX_DQO_MIN_PENDING_BUFS 32
+#define GVE_RX_DQO_MIN_PENDING_BUFS 128
+
+#define GVE_DQ_NUM_FRAGS_IN_PAGE (PAGE_SIZE / GVE_DEFAULT_RX_BUFFER_SIZE)
+
+/*
+ * gve_rx_qpl_buf_id_dqo's 11 bit wide buf_id field limits the total
+ * number of pages per QPL to 2048.
+ */
+#define GVE_RX_NUM_QPL_PAGES_DQO 2048
+
+/* 2K TX buffers for DQO-QPL */
+#define GVE_TX_BUF_SHIFT_DQO 11
+#define GVE_TX_BUF_SIZE_DQO BIT(GVE_TX_BUF_SHIFT_DQO)
+#define GVE_TX_BUFS_PER_PAGE_DQO (PAGE_SIZE >> GVE_TX_BUF_SHIFT_DQO)
+
+#define GVE_TX_NUM_QPL_PAGES_DQO 512
 
 /* Basic TX descriptor (DTYPE 0x0C) */
 struct gve_tx_pkt_desc_dqo {
diff --git a/sys/dev/gve/gve_main.c b/sys/dev/gve/gve_main.c
index 3fa8fec51910..5575c82f0681 100644
--- a/sys/dev/gve/gve_main.c
+++ b/sys/dev/gve/gve_main.c
@@ -32,9 +32,9 @@
 #include "gve_adminq.h"
 #include "gve_dqo.h"
 
-#define GVE_DRIVER_VERSION "GVE-FBSD-1.2.0\n"
+#define GVE_DRIVER_VERSION "GVE-FBSD-1.3.0\n"
 #define GVE_VERSION_MAJOR 1
-#define GVE_VERSION_MINOR 2
+#define GVE_VERSION_MINOR 3
 #define GVE_VERSION_SUB 0
 
 #define GVE_DEFAULT_RX_COPYBREAK 256
@@ -125,7 +125,7 @@ gve_up(struct gve_priv *priv)
 	if (if_getcapenable(ifp) & IFCAP_TSO6)
 		if_sethwassistbits(ifp, CSUM_IP6_TSO, 0);
 
-	if (gve_is_gqi(priv)) {
+	if (gve_is_qpl(priv)) {
 		err = gve_register_qpls(priv);
 		if (err != 0)
 			goto reset;
@@ -177,7 +177,7 @@ gve_down(struct gve_priv *priv)
 	if (gve_destroy_tx_rings(priv) != 0)
 		goto reset;
 
-	if (gve_is_gqi(priv)) {
+	if (gve_is_qpl(priv)) {
 		if (gve_unregister_qpls(priv) != 0)
 			goto reset;
 	}
@@ -375,13 +375,15 @@ gve_setup_ifnet(device_t dev, struct gve_priv *priv)
 
 	/*
 	 * Set TSO limits, must match the arguments to bus_dma_tag_create
-	 * when creating tx->dqo.buf_dmatag
+	 * when creating tx->dqo.buf_dmatag. Only applies to the RDA mode
+	 * because in QPL we copy the entire packet into the bounce buffer
+	 * and thus it does not matter how fragmented the mbuf is.
 	 */
-	if (!gve_is_gqi(priv)) {
-		if_sethwtsomax(ifp, GVE_TSO_MAXSIZE_DQO);
+	if (!gve_is_gqi(priv) && !gve_is_qpl(priv)) {
 		if_sethwtsomaxsegcount(ifp, GVE_TX_MAX_DATA_DESCS_DQO);
 		if_sethwtsomaxsegsize(ifp, GVE_TX_MAX_BUF_SIZE_DQO);
 	}
+	if_sethwtsomax(ifp, GVE_TSO_MAXSIZE_DQO);
 
 #if __FreeBSD_version >= 1400086
 	if_setflags(ifp, IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST);
@@ -465,7 +467,7 @@ gve_free_rings(struct gve_priv *priv)
 	gve_free_irqs(priv);
 	gve_free_tx_rings(priv);
 	gve_free_rx_rings(priv);
-	if (gve_is_gqi(priv))
+	if (gve_is_qpl(priv))
 		gve_free_qpls(priv);
 }
 
@@ -474,7 +476,7 @@ gve_alloc_rings(struct gve_priv *priv)
 {
 	int err;
 
-	if (gve_is_gqi(priv)) {
+	if (gve_is_qpl(priv)) {
 		err = gve_alloc_qpls(priv);
 		if (err != 0)
 			goto abort;
diff --git a/sys/dev/gve/gve_qpl.c b/sys/dev/gve/gve_qpl.c
index 9a946a2a2f2d..1fcc2b5365c9 100644
--- a/sys/dev/gve/gve_qpl.c
+++ b/sys/dev/gve/gve_qpl.c
@@ -32,13 +32,14 @@
 
 #include "gve.h"
 #include "gve_adminq.h"
+#include "gve_dqo.h"
 
 static MALLOC_DEFINE(M_GVE_QPL, "gve qpl", "gve qpl allocations");
 
 static uint32_t
 gve_num_tx_qpls(struct gve_priv *priv)
 {
-	if (priv->queue_format != GVE_GQI_QPL_FORMAT)
+	if (!gve_is_qpl(priv))
 		return (0);
 
 	return (priv->tx_cfg.max_queues);
@@ -47,7 +48,7 @@ gve_num_tx_qpls(struct gve_priv *priv)
 static uint32_t
 gve_num_rx_qpls(struct gve_priv *priv)
 {
-	if (priv->queue_format != GVE_GQI_QPL_FORMAT)
+	if (!gve_is_qpl(priv))
 		return (0);
 
 	return (priv->rx_cfg.max_queues);
@@ -189,6 +190,7 @@ gve_free_qpls(struct gve_priv *priv)
 int gve_alloc_qpls(struct gve_priv *priv)
 {
 	int num_qpls = gve_num_tx_qpls(priv) + gve_num_rx_qpls(priv);
+	int num_pages;
 	int err;
 	int i;
 
@@ -198,15 +200,19 @@ int gve_alloc_qpls(struct gve_priv *priv)
 	priv->qpls = malloc(num_qpls * sizeof(*priv->qpls), M_GVE_QPL,
 	    M_WAITOK | M_ZERO);
 
+	num_pages = gve_is_gqi(priv) ?
+	    priv->tx_desc_cnt / GVE_QPL_DIVISOR :
+	    GVE_TX_NUM_QPL_PAGES_DQO;
 	for (i = 0; i < gve_num_tx_qpls(priv); i++) {
-		err = gve_alloc_qpl(priv, i, priv->tx_desc_cnt / GVE_QPL_DIVISOR,
+		err = gve_alloc_qpl(priv, i, num_pages,
 		    /*single_kva=*/true);
 		if (err != 0)
 			goto abort;
 	}
 
+	num_pages = gve_is_gqi(priv) ? priv->rx_desc_cnt : GVE_RX_NUM_QPL_PAGES_DQO;
 	for (; i < num_qpls; i++) {
-		err = gve_alloc_qpl(priv, i, priv->rx_desc_cnt, /*single_kva=*/false);
+		err = gve_alloc_qpl(priv, i, num_pages, /*single_kva=*/false);
 		if (err != 0)
 			goto abort;
 	}
@@ -283,3 +289,21 @@ gve_unregister_qpls(struct gve_priv *priv)
 	gve_clear_state_flag(priv, GVE_STATE_FLAG_QPLREG_OK);
 	return (0);
 }
+
+void
+gve_mextadd_free(struct mbuf *mbuf)
+{
+	vm_page_t page = (vm_page_t)mbuf->m_ext.ext_arg1;
+	vm_offset_t va = (vm_offset_t)mbuf->m_ext.ext_arg2;
+
+	/*
+	 * Free the page only if this is the last ref.
+	 * The interface might no longer exist by the time
+	 * this callback is called, see gve_free_qpl.
+	 */
+	if (__predict_false(vm_page_unwire_noq(page))) {
+		pmap_qremove(va, 1);
+		kva_free(va, PAGE_SIZE);
+		vm_page_free(page);
+	}
+}
diff --git a/sys/dev/gve/gve_rx.c b/sys/dev/gve/gve_rx.c
index 69bb13fc56c6..35f22f2308f0 100644
--- a/sys/dev/gve/gve_rx.c
+++ b/sys/dev/gve/gve_rx.c
@@ -409,24 +409,6 @@ gve_set_rss_type(__be16 flag, struct mbuf *mbuf)
 	}
 }
 
-static void
-gve_mextadd_free(struct mbuf *mbuf)
-{
-	vm_page_t page = (vm_page_t)mbuf->m_ext.ext_arg1;
-	vm_offset_t va = (vm_offset_t)mbuf->m_ext.ext_arg2;
-
-	/*
-	 * Free the page only if this is the last ref.
-	 * The interface might no longer exist by the time
-	 * this callback is called, see gve_free_qpl.
-	 */
-	if (__predict_false(vm_page_unwire_noq(page))) {
-		pmap_qremove(va, 1);
-		kva_free(va, PAGE_SIZE);
-		vm_page_free(page);
-	}
-}
-
 static void
 gve_rx_flip_buff(struct gve_rx_slot_page_info *page_info, __be64 *slot_addr)
 {
diff --git a/sys/dev/gve/gve_rx_dqo.c b/sys/dev/gve/gve_rx_dqo.c
index b391449328e1..6c5d656aaa04 100644
--- a/sys/dev/gve/gve_rx_dqo.c
+++ b/sys/dev/gve/gve_rx_dqo.c
@@ -38,6 +38,9 @@ gve_free_rx_mbufs_dqo(struct gve_rx_ring *rx)
 	struct gve_rx_buf_dqo *buf;
 	int i;
 
+	if (gve_is_qpl(rx->com.priv))
+		return;
+
 	for (i = 0; i < rx->dqo.buf_cnt; i++) {
 		buf = &rx->dqo.bufs[i];
 		if (!buf->mbuf)
@@ -70,7 +73,7 @@ gve_rx_free_ring_dqo(struct gve_priv *priv, int i)
 	if (rx->dqo.bufs != NULL) {
 		gve_free_rx_mbufs_dqo(rx);
 
-		if (rx->dqo.buf_dmatag) {
+		if (!gve_is_qpl(priv) && rx->dqo.buf_dmatag) {
 			for (j = 0; j < rx->dqo.buf_cnt; j++)
 				if (rx->dqo.bufs[j].mapped)
 					bus_dmamap_destroy(rx->dqo.buf_dmatag,
@@ -81,7 +84,7 @@ gve_rx_free_ring_dqo(struct gve_priv *priv, int i)
 		rx->dqo.bufs = NULL;
 	}
 
-	if (rx->dqo.buf_dmatag)
+	if (!gve_is_qpl(priv) && rx->dqo.buf_dmatag)
 		bus_dma_tag_destroy(rx->dqo.buf_dmatag);
 }
 
@@ -103,6 +106,31 @@ gve_rx_alloc_ring_dqo(struct gve_priv *priv, int i)
 	rx->dqo.desc_ring = rx->desc_ring_mem.cpu_addr;
 	rx->dqo.mask = priv->rx_desc_cnt - 1;
 
+	err = gve_dma_alloc_coherent(priv,
+	    sizeof(struct gve_rx_compl_desc_dqo) * priv->rx_desc_cnt,
+	    CACHE_LINE_SIZE, &rx->dqo.compl_ring_mem);
+	if (err != 0) {
+		device_printf(priv->dev,
+		    "Failed to alloc compl ring for rx ring %d", i);
+		goto abort;
+	}
+	rx->dqo.compl_ring = rx->dqo.compl_ring_mem.cpu_addr;
+	rx->dqo.mask = priv->rx_desc_cnt - 1;
+
+	rx->dqo.buf_cnt = gve_is_qpl(priv) ? GVE_RX_NUM_QPL_PAGES_DQO :
+	    priv->rx_desc_cnt;
+	rx->dqo.bufs = malloc(rx->dqo.buf_cnt * sizeof(struct gve_rx_buf_dqo),
+	    M_GVE, M_WAITOK | M_ZERO);
+
+	if (gve_is_qpl(priv)) {
+		rx->com.qpl = &priv->qpls[priv->tx_cfg.max_queues + i];
+		if (rx->com.qpl == NULL) {
+			device_printf(priv->dev, "No QPL left for rx ring %d", i);
+			return (ENOMEM);
+		}
+		return (0);
+	}
+
 	err = bus_dma_tag_create(
 	    bus_get_dma_tag(priv->dev),	/* parent */
 	    1, 0,			/* alignment, bounds */
@@ -123,9 +151,6 @@ gve_rx_alloc_ring_dqo(struct gve_priv *priv, int i)
 		goto abort;
 	}
 
-	rx->dqo.buf_cnt = priv->rx_desc_cnt;
-	rx->dqo.bufs = malloc(rx->dqo.buf_cnt * sizeof(struct gve_rx_buf_dqo),
-	    M_GVE, M_WAITOK | M_ZERO);
 	for (j = 0; j < rx->dqo.buf_cnt; j++) {
 		err = bus_dmamap_create(rx->dqo.buf_dmatag, 0,
 		    &rx->dqo.bufs[j].dmamap);
@@ -138,17 +163,6 @@ gve_rx_alloc_ring_dqo(struct gve_priv *priv, int i)
 		rx->dqo.bufs[j].mapped = true;
 	}
 
-	err = gve_dma_alloc_coherent(priv,
-	    sizeof(struct gve_rx_compl_desc_dqo) * priv->rx_desc_cnt,
-	    CACHE_LINE_SIZE, &rx->dqo.compl_ring_mem);
-	if (err != 0) {
-		device_printf(priv->dev,
-		    "Failed to alloc compl ring for rx ring %d", i);
-		goto abort;
-	}
-	rx->dqo.compl_ring = rx->dqo.compl_ring_mem.cpu_addr;
-	rx->dqo.mask = priv->rx_desc_cnt - 1;
-
 	return (0);
 
 abort:
@@ -202,10 +216,36 @@ gve_clear_rx_ring_dqo(struct gve_priv *priv, int i)
 
 	gve_free_rx_mbufs_dqo(rx);
 
-	SLIST_INIT(&rx->dqo.free_bufs);
-	for (j = 0; j < rx->dqo.buf_cnt; j++)
-		SLIST_INSERT_HEAD(&rx->dqo.free_bufs,
-		    &rx->dqo.bufs[j], slist_entry);
+	if (gve_is_qpl(priv)) {
+		SLIST_INIT(&rx->dqo.free_bufs);
+		STAILQ_INIT(&rx->dqo.used_bufs);
+
+		for (j = 0; j < rx->dqo.buf_cnt; j++) {
+			struct gve_rx_buf_dqo *buf = &rx->dqo.bufs[j];
+
+			vm_page_t page = rx->com.qpl->pages[buf - rx->dqo.bufs];
+			u_int ref_count = atomic_load_int(&page->ref_count);
+
+			/*
+			 * An ifconfig down+up might see pages still in flight
+			 * from the previous innings.
+			 */
+			if (VPRC_WIRE_COUNT(ref_count) == 1)
+				SLIST_INSERT_HEAD(&rx->dqo.free_bufs,
+				    buf, slist_entry);
+			else
+				STAILQ_INSERT_TAIL(&rx->dqo.used_bufs,
+				    buf, stailq_entry);
+
+			buf->num_nic_frags = 0;
+			buf->next_idx = 0;
+		}
+	} else {
+		SLIST_INIT(&rx->dqo.free_bufs);
+		for (j = 0; j < rx->dqo.buf_cnt; j++)
+			SLIST_INSERT_HEAD(&rx->dqo.free_bufs,
+			    &rx->dqo.bufs[j], slist_entry);
+	}
 }
 
 int
@@ -223,6 +263,20 @@ gve_rx_intr_dqo(void *arg)
 	return (FILTER_HANDLED);
 }
 
+static void
+gve_rx_advance_head_dqo(struct gve_rx_ring *rx)
+{
+	rx->dqo.head = (rx->dqo.head + 1) & rx->dqo.mask;
+	rx->fill_cnt++; /* rx->fill_cnt is just a sysctl counter */
+
+	if ((rx->dqo.head & (GVE_RX_BUF_THRESH_DQO - 1)) == 0) {
+		bus_dmamap_sync(rx->desc_ring_mem.tag, rx->desc_ring_mem.map,
+		    BUS_DMASYNC_PREWRITE);
+		gve_db_bar_dqo_write_4(rx->com.priv, rx->com.db_offset,
+		    rx->dqo.head);
+	}
+}
+
 static void
 gve_rx_post_buf_dqo(struct gve_rx_ring *rx, struct gve_rx_buf_dqo *buf)
 {
@@ -235,15 +289,7 @@ gve_rx_post_buf_dqo(struct gve_rx_ring *rx, struct gve_rx_buf_dqo *buf)
 	desc->buf_id = htole16(buf - rx->dqo.bufs);
 	desc->buf_addr = htole64(buf->addr);
 
-	rx->dqo.head = (rx->dqo.head + 1) & rx->dqo.mask;
-	rx->fill_cnt++; /* rx->fill_cnt is just a sysctl counter */
-
-	if ((rx->dqo.head & (GVE_RX_BUF_THRESH_DQO - 1)) == 0) {
-		bus_dmamap_sync(rx->desc_ring_mem.tag, rx->desc_ring_mem.map,
-		    BUS_DMASYNC_PREWRITE);
-		gve_db_bar_dqo_write_4(rx->com.priv, rx->com.db_offset,
-		    rx->dqo.head);
-	}
+	gve_rx_advance_head_dqo(rx);
 }
 
 static int
@@ -294,6 +340,103 @@ abort_with_buf:
 	return (err);
 }
 
+static struct gve_dma_handle *
+gve_get_page_dma_handle(struct gve_rx_ring *rx, struct gve_rx_buf_dqo *buf)
+{
+	return (&(rx->com.qpl->dmas[buf - rx->dqo.bufs]));
+}
+
+static void
+gve_rx_post_qpl_buf_dqo(struct gve_rx_ring *rx, struct gve_rx_buf_dqo *buf,
+    uint8_t frag_num)
+{
+	struct gve_rx_desc_dqo *desc = &rx->dqo.desc_ring[rx->dqo.head];
+	union gve_rx_qpl_buf_id_dqo composed_id;
+	struct gve_dma_handle *page_dma_handle;
+
+	composed_id.buf_id = buf - rx->dqo.bufs;
+	composed_id.frag_num = frag_num;
+	desc->buf_id = htole16(composed_id.all);
+
+	page_dma_handle = gve_get_page_dma_handle(rx, buf);
+	bus_dmamap_sync(page_dma_handle->tag, page_dma_handle->map,
+	    BUS_DMASYNC_PREREAD);
+	desc->buf_addr = htole64(page_dma_handle->bus_addr +
+	    frag_num * GVE_DEFAULT_RX_BUFFER_SIZE);
+
+	buf->num_nic_frags++;
+	gve_rx_advance_head_dqo(rx);
+}
+
+static void
+gve_rx_maybe_extract_from_used_bufs(struct gve_rx_ring *rx, bool just_one)
+{
+	struct gve_rx_buf_dqo *hol_blocker = NULL;
+	struct gve_rx_buf_dqo *buf;
+	u_int ref_count;
+	vm_page_t page;
+
+	while (true) {
+		buf = STAILQ_FIRST(&rx->dqo.used_bufs);
+		if (__predict_false(buf == NULL))
+			break;
+
+		page = rx->com.qpl->pages[buf - rx->dqo.bufs];
+		ref_count = atomic_load_int(&page->ref_count);
+
+		if (VPRC_WIRE_COUNT(ref_count) != 1) {
+			/* Account for one head-of-line blocker */
+			if (hol_blocker != NULL)
+				break;
+			hol_blocker = buf;
+			STAILQ_REMOVE_HEAD(&rx->dqo.used_bufs,
+			    stailq_entry);
+			continue;
+		}
+
+		STAILQ_REMOVE_HEAD(&rx->dqo.used_bufs,
+		    stailq_entry);
+		SLIST_INSERT_HEAD(&rx->dqo.free_bufs,
+		    buf, slist_entry);
+		if (just_one)
+			break;
+	}
+
+	if (hol_blocker != NULL)
+		STAILQ_INSERT_HEAD(&rx->dqo.used_bufs,
+		    hol_blocker, stailq_entry);
+}
+
+static int
+gve_rx_post_new_dqo_qpl_buf(struct gve_rx_ring *rx)
+{
+	struct gve_rx_buf_dqo *buf;
+
+	buf = SLIST_FIRST(&rx->dqo.free_bufs);
+	if (__predict_false(buf == NULL)) {
+		gve_rx_maybe_extract_from_used_bufs(rx, /*just_one=*/true);
+		buf = SLIST_FIRST(&rx->dqo.free_bufs);
+		if (__predict_false(buf == NULL))
+			return (ENOBUFS);
+	}
+
+	gve_rx_post_qpl_buf_dqo(rx, buf, buf->next_idx);
+	if (buf->next_idx == GVE_DQ_NUM_FRAGS_IN_PAGE - 1)
+		buf->next_idx = 0;
+	else
+		buf->next_idx++;
+
+	/*
+	 * We have posted all the frags in this buf to the NIC.
+	 * - buf will enter used_bufs once the last completion arrives.
+	 * - It will re-enter free_bufs in gve_rx_maybe_extract_from_used_bufs
+	 *   when its wire count drops back to 1.
+	 */
+	if (buf->next_idx == 0)
+		SLIST_REMOVE_HEAD(&rx->dqo.free_bufs, slist_entry);
+	return (0);
+}
+
 static void
 gve_rx_post_buffers_dqo(struct gve_rx_ring *rx, int how)
 {
@@ -306,7 +449,10 @@ gve_rx_post_buffers_dqo(struct gve_rx_ring *rx, int how)
 	num_to_post = rx->dqo.mask - num_pending_bufs;
 
 	for (i = 0; i < num_to_post; i++) {
-		err = gve_rx_post_new_mbuf_dqo(rx, how);
+		if (gve_is_qpl(rx->com.priv))
+			err = gve_rx_post_new_dqo_qpl_buf(rx);
+		else
+			err = gve_rx_post_new_mbuf_dqo(rx, how);
 		if (err)
 			break;
 	}
@@ -427,7 +573,7 @@ gve_rx_input_mbuf_dqo(struct gve_rx_ring *rx,
 }
 
 static int
-gve_rx_copybreak_dqo(struct gve_rx_ring *rx, struct gve_rx_buf_dqo *buf,
+gve_rx_copybreak_dqo(struct gve_rx_ring *rx, void *va,
     struct gve_rx_compl_desc_dqo *compl_desc, uint16_t frag_len)
 {
 	struct mbuf *mbuf;
@@ -440,14 +586,13 @@ gve_rx_copybreak_dqo(struct gve_rx_ring *rx, struct gve_rx_buf_dqo *buf,
 	counter_u64_add_protected(rx->stats.rx_copybreak_cnt, 1);
 	counter_exit();
 
-	m_copyback(mbuf, 0, frag_len, mtod(buf->mbuf, char*));
+	m_copyback(mbuf, 0, frag_len, va);
 	mbuf->m_len = frag_len;
 
 	rx->ctx.mbuf_head = mbuf;
 	rx->ctx.mbuf_tail = mbuf;
 	rx->ctx.total_size += frag_len;
 
-	gve_rx_post_buf_dqo(rx, buf);
 	gve_rx_input_mbuf_dqo(rx, compl_desc);
 	return (0);
 }
@@ -495,10 +640,12 @@ gve_rx_dqo(struct gve_priv *priv, struct gve_rx_ring *rx,
 
 	frag_len = compl_desc->packet_len;
 	if (frag_len <= priv->rx_copybreak && !ctx->mbuf_head && is_last_frag) {
-		err = gve_rx_copybreak_dqo(rx, buf, compl_desc, frag_len);
+		err = gve_rx_copybreak_dqo(rx, mtod(buf->mbuf, char*),
+		    compl_desc, frag_len);
 		if (__predict_false(err != 0))
 			goto drop_frag;
 		(*work_done)++;
+		gve_rx_post_buf_dqo(rx, buf);
 		return;
 	}
 
@@ -579,6 +726,233 @@ drop_frag_clear_ctx:
 	rx->ctx = (struct gve_rx_ctx){};
 }
 
+static void *
+gve_get_cpu_addr_for_qpl_buf(struct gve_rx_ring *rx,
+    struct gve_rx_buf_dqo *buf, uint8_t buf_frag_num)
+{
+	int page_idx = buf - rx->dqo.bufs;
+	void *va = rx->com.qpl->dmas[page_idx].cpu_addr;
+
+	va = (char *)va + (buf_frag_num * GVE_DEFAULT_RX_BUFFER_SIZE);
+	return (va);
+}
+
+static int
+gve_rx_add_clmbuf_to_ctx(struct gve_rx_ring *rx,
+    struct gve_rx_ctx *ctx, struct gve_rx_buf_dqo *buf,
+    uint8_t buf_frag_num, uint16_t frag_len)
+{
+	void *va = gve_get_cpu_addr_for_qpl_buf(rx, buf, buf_frag_num);
+	struct mbuf *mbuf;
+
+	if (ctx->mbuf_tail == NULL) {
+		mbuf = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
+		if (mbuf == NULL)
+			return (ENOMEM);
+		ctx->mbuf_head = mbuf;
+		ctx->mbuf_tail = mbuf;
+	} else {
+		mbuf = m_getcl(M_NOWAIT, MT_DATA, 0);
+		if (mbuf == NULL)
+			return (ENOMEM);
+		ctx->mbuf_tail->m_next = mbuf;
+		ctx->mbuf_tail = mbuf;
+	}
+
+	mbuf->m_len = frag_len;
+	ctx->total_size += frag_len;
+
+	m_copyback(mbuf, 0, frag_len, va);
+	counter_enter();
+	counter_u64_add_protected(rx->stats.rx_frag_copy_cnt, 1);
+	counter_exit();
+	return (0);
+}
+
+static int
+gve_rx_add_extmbuf_to_ctx(struct gve_rx_ring *rx,
+    struct gve_rx_ctx *ctx, struct gve_rx_buf_dqo *buf,
+    uint8_t buf_frag_num, uint16_t frag_len)
+{
+	struct mbuf *mbuf;
+	void *page_addr;
+	vm_page_t page;
+	int page_idx;
+	void *va;
*** 765 LINES SKIPPED ***