git: 54dfc97b0bd9 - main - Add gve, the driver for Google Virtual NIC (gVNIC)

From: Xin LI <delphij_at_FreeBSD.org>
Date: Fri, 02 Jun 2023 21:35:37 UTC
The branch main has been updated by delphij:

URL: https://cgit.FreeBSD.org/src/commit/?id=54dfc97b0bd99f1c3bcbb37357cf28cd81a7cf00

commit 54dfc97b0bd99f1c3bcbb37357cf28cd81a7cf00
Author:     Shailend Chand <shailend@google.com>
AuthorDate: 2023-06-02 18:58:24 +0000
Commit:     Xin LI <delphij@FreeBSD.org>
CommitDate: 2023-06-02 21:31:54 +0000

    Add gve, the driver for Google Virtual NIC (gVNIC)
    
    gVNIC is a virtual network interface designed specifically for
    Google Compute Engine (GCE). It is required to support per-VM Tier_1
    networking performance, and for using certain VM shapes on GCE.
    
    The NIC supports TSO, Rx and Tx checksum offloads, and RSS.
    It does not currently do hardware LRO, and thus the software-LRO
    in the host is used instead. It also supports jumbo frames.
    
    For each queue, the driver negotiates a set of pages with the NIC to
    serve as a fixed bounce buffer, this precludes the use of iflib.
    
    Reviewed-by:            markj
    MFC-after:              2 weeks
    Differential Revision: https://reviews.freebsd.org/D39873
---
 share/man/man4/Makefile    |   5 +
 share/man/man4/gve.4       | 215 ++++++++++++
 sys/conf/files             |   7 +
 sys/dev/gve/gve.h          | 459 ++++++++++++++++++++++++
 sys/dev/gve/gve_adminq.c   | 803 ++++++++++++++++++++++++++++++++++++++++++
 sys/dev/gve/gve_adminq.h   | 394 +++++++++++++++++++++
 sys/dev/gve/gve_desc.h     | 151 ++++++++
 sys/dev/gve/gve_main.c     | 853 +++++++++++++++++++++++++++++++++++++++++++++
 sys/dev/gve/gve_plat.h     |  94 +++++
 sys/dev/gve/gve_qpl.c      | 284 +++++++++++++++
 sys/dev/gve/gve_register.h |  54 +++
 sys/dev/gve/gve_rx.c       | 684 ++++++++++++++++++++++++++++++++++++
 sys/dev/gve/gve_sysctl.c   | 261 ++++++++++++++
 sys/dev/gve/gve_tx.c       | 806 ++++++++++++++++++++++++++++++++++++++++++
 sys/dev/gve/gve_utils.c    | 405 +++++++++++++++++++++
 sys/modules/Makefile       |   5 +
 sys/modules/gve/Makefile   |  36 ++
 17 files changed, 5516 insertions(+)

diff --git a/share/man/man4/Makefile b/share/man/man4/Makefile
index 71d12055f4db..9149a562383f 100644
--- a/share/man/man4/Makefile
+++ b/share/man/man4/Makefile
@@ -176,6 +176,7 @@ MAN=	aac.4 \
 	geom_map.4 \
 	geom_uzip.4 \
 	gif.4 \
+	${_gve.4} \
 	gpio.4 \
 	gpioiic.4 \
 	gpiokeys.4 \
@@ -895,6 +896,10 @@ _linux.4=	linux.4
 _ossl.4=	ossl.4
 .endif
 
+.if ${MACHINE_CPUARCH} == "amd64" || ${MACHINE_CPUARCH} == "aarch64"
+_gve.4=		gve.4
+.endif
+
 .if ${MACHINE_CPUARCH} == "arm" || ${MACHINE_CPUARCH} == "aarch64" || \
 	 ${MACHINE_CPUARCH} == "riscv"
 _cgem.4=	cgem.4
diff --git a/share/man/man4/gve.4 b/share/man/man4/gve.4
new file mode 100644
index 000000000000..9bb1be1b9a53
--- /dev/null
+++ b/share/man/man4/gve.4
@@ -0,0 +1,215 @@
+.\" SPDX-License-Identifier: BSD-3-Clause
+.\"
+.\" Copyright (c) 2023 Google LLC
+.\"
+.\" Redistribution and use in source and binary forms, with or without modification,
+.\" are permitted provided that the following conditions are met:
+.\"
+.\" 1. Redistributions of source code must retain the above copyright notice, this
+.\"    list of conditions and the following disclaimer.
+.\"
+.\" 2. Redistributions in binary form must reproduce the above copyright notice,
+.\"    this list of conditions and the following disclaimer in the documentation
+.\"    and/or other materials provided with the distribution.
+.\"
+.\" 3. Neither the name of the copyright holder nor the names of its contributors
+.\"    may be used to endorse or promote products derived from this software without
+.\"    specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+.\" WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+.\" DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+.\" ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+.\" (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+.\" LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+.\" ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+.\" (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+.\" SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+.Dd April 26, 2023
+.Dt GVE 4
+.Os
+.Sh NAME
+.Nm gve
+.Nd "Ethernet driver for Google Virtual NIC (gVNIC)"
+.Sh SYNOPSIS
+To compile this driver into the kernel,
+place the following lines in your
+kernel configuration file:
+.Bd -ragged -offset indent
+.Cd "device gve"
+.Ed
+.Pp
+Alternatively, to load the driver as a
+module at boot time, place the following line in
+.Xr loader.conf 5 :
+.Bd -literal -offset indent
+if_gve_load="YES"
+.Ed
+.Sh DESCRIPTION
+gVNIC is a virtual network interface designed specifically for Google Compute Engine (GCE).
+It is required to support per-VM Tier-1 networking performance, and for using certain VM shapes on GCE.
+.Pp
+.Nm
+is the driver for gVNIC.
+It supports the following features:
+.Pp
+.Bl -bullet -compact
+.It
+RX checksum offload
+.It
+TX chesksum offload
+.It
+TCP Segmentation Offload (TSO)
+.It
+Large Receive Offload (LRO) in software
+.It
+Jumbo frames
+.It
+Receive Side Scaling (RSS)
+.El
+.Pp
+For more information on configuring this device, see
+.Xr ifconfig 8 .
+.Sh HARDWARE
+.Nm
+binds to a single PCI device ID presented by gVNIC:
+.Pp
+.Bl -bullet -compact
+.It
+0x1AE0:0x0042
+.El
+.Sh DIAGNOSTICS
+The following messages are recorded during driver initialization:
+.Bl -diag
+.It "Enabled MSIX with %d vectors"
+.It "Configured device resources"
+.It "Successfully attached %s"
+.It "Deconfigured device resources"
+.El
+.Pp
+These messages are seen if driver initialization fails.
+Global (across-queues) allocation failures:
+.Bl -diag
+.It "Failed to configure device resources: err=%d"
+.It "No compatible queue formats"
+.It "Failed to allocate ifnet struct"
+.It "Failed to allocate admin queue mem"
+.It "Failed to alloc DMA mem for DescribeDevice"
+.It "Failed to allocate QPL page"
+.El
+.Pp
+irq and BAR allocation failures:
+.Bl -diag
+.It "Failed to acquire any msix vectors"
+.It "Tried to acquire %d msix vectors, got only %d"
+.It "Failed to setup irq %d for Tx queue %d "
+.It "Failed to setup irq %d for Rx queue %d "
+.It "Failed to allocate irq %d for mgmnt queue"
+.It "Failed to setup irq %d for mgmnt queue, err: %d"
+.It "Failed to allocate BAR0"
+.It "Failed to allocate BAR2"
+.It "Failed to allocate msix table"
+.El
+.Pp
+Rx queue-specific allocation failures:
+.Bl -diag
+.It "No QPL left for rx ring %d"
+.It "Failed to alloc queue resources for rx ring %d"
+.It "Failed to alloc desc ring for rx ring %d"
+.It "Failed to alloc data ring for rx ring %d"
+.El
+.Pp
+Tx queue-specific allocation failures:
+.Bl -diag
+.It "No QPL left for tx ring %d"
+.It "Failed to alloc queue resources for tx ring %d"
+.It "Failed to alloc desc ring for tx ring %d"
+.It "Failed to vmap fifo, qpl_id = %d"
+.El
+.El
+.Pp
+The following messages are recorded when the interface detach fails:
+.Bl -diag
+.It "Failed to deconfigure device resources: err=%d"
+.El
+.Pp
+If bootverbose is on, the following messages are recorded when the interface is being brought up:
+.Bl -diag
+.It "Created %d rx queues"
+.It "Created %d tx queues"
+.It "MTU set to %d"
+.El
+.Pp
+The following messages are recorded when the interface is being brought down:
+.Bl -diag
+.It "Destroyed %d rx queues"
+.It "Destroyed %d tx queues"
+.El
+.Pp
+These messages are seen if errors are encountered when bringing the interface up or down:
+.Bl -diag
+.It "Failed to destroy rxq %d, err: %d"
+.It "Failed to destroy txq %d, err: %d"
+.It "Failed to create rxq %d, err: %d"
+.It "Failed to create txq %d, err: %d"
+.It "Failed to set MTU to %d"
+.It "Invalid new MTU setting. new mtu: %d max mtu: %d min mtu: %d"
+.It "Cannot bring the iface up when detached"
+.It "Reached max number of registered pages %lu > %lu"
+.It "Failed to init lro for rx ring %d"
+.El
+.Pp
+These messages are seen if any admin queue command fails:
+.Bl -diag
+.It "AQ command(%u): failed with status %d"
+.It "AQ command(%u): unknown status code %d"
+.It "AQ commands timed out, need to reset AQ"
+.It "Unknown AQ command opcode %d"
+.El
+.Pp
+These messages are recorded when the device is being reset due to an error:
+.Bl -diag
+.It "Scheduling reset task!"
+.It "Waiting until admin queue is released."
+.It "Admin queue released"
+.El
+.Pp
+If it was the NIC that requested the reset, this message is recorded:
+.Bl -diag
+.It "Device requested reset"
+.El
+.Pp
+If the reset fails during the reinitialization phase, this message is recorded:
+.Bl -diag
+.It "Restore failed!"
+.El
+.Pp
+These two messages correspoond to the NIC alerting the driver to link state changes:
+.Bl -diag
+.It "Device link is up."
+.It "Device link is down."
+.El
+.Pp
+Apart from these messages, the driver exposes per-queue packet and error counters as sysctl nodes.
+Global (across queues) counters can be read using
+.Xr netstat 8 .
+.Sh LIMITATIONS
+.Nm
+does not support the transmission of VLAN-tagged packets.
+All VLAN-tagged traffic is dropped.
+.Sh SUPPORT
+Please email gvnic-drivers@google.com with the specifics of the issue encountered.
+.El
+.Sh SEE ALSO
+.Xr ifconfig 8 ,
+.Xr netstat 8
+.Sh HISTORY
+The
+.Nm
+device driver first appeared in
+.Fx 14.0 .
+.Sh AUTHORS
+The
+.Nm
+driver was written by Google.
diff --git a/sys/conf/files b/sys/conf/files
index c0728504da5a..a5f55b49451c 100644
--- a/sys/conf/files
+++ b/sys/conf/files
@@ -1748,6 +1748,13 @@ dev/fxp/if_fxp.c		optional fxp
 dev/fxp/inphy.c			optional fxp
 dev/gem/if_gem.c		optional gem
 dev/gem/if_gem_pci.c		optional gem pci
+dev/gve/gve_adminq.c		optional gve
+dev/gve/gve_main.c		optional gve
+dev/gve/gve_qpl.c		optional gve
+dev/gve/gve_rx.c		optional gve
+dev/gve/gve_sysctl.c		optional gve
+dev/gve/gve_tx.c		optional gve
+dev/gve/gve_utils.c		optional gve
 dev/goldfish/goldfish_rtc.c	optional goldfish_rtc fdt
 dev/gpio/dwgpio/dwgpio.c	optional gpio dwgpio fdt
 dev/gpio/dwgpio/dwgpio_bus.c	optional gpio dwgpio fdt
diff --git a/sys/dev/gve/gve.h b/sys/dev/gve/gve.h
new file mode 100644
index 000000000000..61781cddee94
--- /dev/null
+++ b/sys/dev/gve/gve.h
@@ -0,0 +1,459 @@
+/*-
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Copyright (c) 2023 Google LLC
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software without
+ *    specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _GVE_FBSD_H
+#define _GVE_FBSD_H
+
+#include "gve_desc.h"
+#include "gve_plat.h"
+#include "gve_register.h"
+
+#ifndef PCI_VENDOR_ID_GOOGLE
+#define PCI_VENDOR_ID_GOOGLE	0x1ae0
+#endif
+
+#define PCI_DEV_ID_GVNIC	0x0042
+#define GVE_REGISTER_BAR	0
+#define GVE_DOORBELL_BAR	2
+
+/* Driver can alloc up to 2 segments for the header and 2 for the payload. */
+#define GVE_TX_MAX_DESCS	4
+#define GVE_TX_BUFRING_ENTRIES	4096
+
+#define ADMINQ_SIZE PAGE_SIZE
+
+#define GVE_DEFAULT_RX_BUFFER_SIZE 2048
+/* Each RX bounce buffer page can fit two packet buffers. */
+#define GVE_DEFAULT_RX_BUFFER_OFFSET (PAGE_SIZE / 2)
+
+/*
+ * Number of descriptors per queue page list.
+ * Page count AKA QPL size can be derived by dividing the number of elements in
+ * a page by the number of descriptors available.
+ */
+#define GVE_QPL_DIVISOR	16
+
+static MALLOC_DEFINE(M_GVE, "gve", "gve allocations");
+
+struct gve_dma_handle {
+	bus_addr_t	bus_addr;
+	void		*cpu_addr;
+	bus_dma_tag_t	tag;
+	bus_dmamap_t	map;
+};
+
+union gve_tx_desc {
+	struct gve_tx_pkt_desc pkt; /* first desc for a packet */
+	struct gve_tx_mtd_desc mtd; /* optional metadata descriptor */
+	struct gve_tx_seg_desc seg; /* subsequent descs for a packet */
+};
+
+/* Tracks the memory in the fifo occupied by a segment of a packet */
+struct gve_tx_iovec {
+	uint32_t iov_offset; /* offset into this segment */
+	uint32_t iov_len; /* length */
+	uint32_t iov_padding; /* padding associated with this segment */
+};
+
+/* Tracks allowed and current queue settings */
+struct gve_queue_config {
+	uint16_t max_queues;
+	uint16_t num_queues; /* current */
+};
+
+struct gve_irq_db {
+	__be32 index;
+} __aligned(CACHE_LINE_SIZE);
+
+/*
+ * GVE_QUEUE_FORMAT_UNSPECIFIED must be zero since 0 is the default value
+ * when the entire configure_device_resources command is zeroed out and the
+ * queue_format is not specified.
+ */
+enum gve_queue_format {
+	GVE_QUEUE_FORMAT_UNSPECIFIED	= 0x0,
+	GVE_GQI_RDA_FORMAT		= 0x1,
+	GVE_GQI_QPL_FORMAT		= 0x2,
+	GVE_DQO_RDA_FORMAT		= 0x3,
+};
+
+enum gve_state_flags_bit {
+	GVE_STATE_FLAG_ADMINQ_OK,
+	GVE_STATE_FLAG_RESOURCES_OK,
+	GVE_STATE_FLAG_QPLREG_OK,
+	GVE_STATE_FLAG_RX_RINGS_OK,
+	GVE_STATE_FLAG_TX_RINGS_OK,
+	GVE_STATE_FLAG_QUEUES_UP,
+	GVE_STATE_FLAG_LINK_UP,
+	GVE_STATE_FLAG_DO_RESET,
+	GVE_STATE_FLAG_IN_RESET,
+	GVE_NUM_STATE_FLAGS /* Not part of the enum space */
+};
+
+BITSET_DEFINE(gve_state_flags, GVE_NUM_STATE_FLAGS);
+
+#define GVE_DEVICE_STATUS_RESET (0x1 << 1)
+#define GVE_DEVICE_STATUS_LINK_STATUS (0x1 << 2)
+
+#define GVE_RING_LOCK(ring)	mtx_lock(&(ring)->ring_mtx)
+#define GVE_RING_TRYLOCK(ring)	mtx_trylock(&(ring)->ring_mtx)
+#define GVE_RING_UNLOCK(ring)	mtx_unlock(&(ring)->ring_mtx)
+#define GVE_RING_ASSERT(ring)	mtx_assert(&(ring)->ring_mtx, MA_OWNED)
+
+#define GVE_IFACE_LOCK_INIT(lock)     sx_init(&lock, "gve interface lock")
+#define GVE_IFACE_LOCK_DESTROY(lock)  sx_destroy(&lock)
+#define GVE_IFACE_LOCK_LOCK(lock)     sx_xlock(&lock)
+#define GVE_IFACE_LOCK_UNLOCK(lock)   sx_unlock(&lock)
+#define GVE_IFACE_LOCK_ASSERT(lock)   sx_assert(&lock, SA_XLOCKED)
+
+struct gve_queue_page_list {
+	uint32_t id;
+	uint32_t num_dmas;
+	uint32_t num_pages;
+	vm_offset_t kva;
+	vm_page_t *pages;
+	struct gve_dma_handle *dmas;
+};
+
+struct gve_irq {
+	struct resource *res;
+	void *cookie;
+};
+
+struct gve_rx_slot_page_info {
+	void *page_address;
+	vm_page_t page;
+	uint32_t page_offset;
+	uint16_t pad;
+};
+
+/*
+ * A single received packet split across multiple buffers may be
+ * reconstructed using the information in this structure.
+ */
+struct gve_rx_ctx {
+	/* head and tail of mbuf chain for the current packet */
+	struct mbuf *mbuf_head;
+	struct mbuf *mbuf_tail;
+	uint32_t total_size;
+	uint8_t frag_cnt;
+	bool drop_pkt;
+};
+
+struct gve_ring_com {
+	struct gve_priv *priv;
+	uint32_t id;
+
+	/*
+	 * BAR2 offset for this ring's doorbell and the
+	 * counter-array offset for this ring's counter.
+	 * Acquired from the device individually for each
+	 * queue in the queue_create adminq command.
+	 */
+	struct gve_queue_resources *q_resources;
+	struct gve_dma_handle q_resources_mem;
+
+	/* Byte offset into BAR2 where this ring's 4-byte irq doorbell lies. */
+	uint32_t irq_db_offset;
+	/* Byte offset into BAR2 where this ring's 4-byte doorbell lies. */
+	uint32_t db_offset;
+	/*
+	 * Index, not byte-offset, into the counter array where this ring's
+	 * 4-byte counter lies.
+	 */
+	uint32_t counter_idx;
+
+	/*
+	 * The index of the MSIX vector that was assigned to
+	 * this ring in `gve_alloc_irqs`.
+	 *
+	 * It is passed to the device in the queue_create adminq
+	 * command.
+	 *
+	 * Additionally, this also serves as the index into
+	 * `priv->irq_db_indices` where this ring's irq doorbell's
+	 * BAR2 offset, `irq_db_idx`, can be found.
+	 */
+	int ntfy_id;
+
+	/*
+	 * The fixed bounce buffer for this ring.
+	 * Once allocated, has to be offered to the device
+	 * over the register-page-list adminq command.
+	 */
+	struct gve_queue_page_list *qpl;
+
+	struct task cleanup_task;
+	struct taskqueue *cleanup_tq;
+} __aligned(CACHE_LINE_SIZE);
+
+struct gve_rxq_stats {
+	counter_u64_t rbytes;
+	counter_u64_t rpackets;
+	counter_u64_t rx_dropped_pkt;
+	counter_u64_t rx_copybreak_cnt;
+	counter_u64_t rx_frag_flip_cnt;
+	counter_u64_t rx_frag_copy_cnt;
+	counter_u64_t rx_dropped_pkt_desc_err;
+	counter_u64_t rx_dropped_pkt_mbuf_alloc_fail;
+};
+
+#define NUM_RX_STATS (sizeof(struct gve_rxq_stats) / sizeof(counter_u64_t))
+
+/* power-of-2 sized receive ring */
+struct gve_rx_ring {
+	struct gve_ring_com com;
+	struct gve_dma_handle desc_ring_mem;
+	struct gve_dma_handle data_ring_mem;
+
+	/* accessed in the receive hot path */
+	struct {
+		struct gve_rx_desc *desc_ring;
+		union gve_rx_data_slot *data_ring;
+		struct gve_rx_slot_page_info *page_info;
+
+		struct gve_rx_ctx ctx;
+		struct lro_ctrl lro;
+		uint8_t seq_no; /* helps traverse the descriptor ring */
+		uint32_t cnt; /* free-running total number of completed packets */
+		uint32_t fill_cnt; /* free-running total number of descs and buffs posted */
+		uint32_t mask; /* masks the cnt and fill_cnt to the size of the ring */
+		struct gve_rxq_stats stats;
+	} __aligned(CACHE_LINE_SIZE);
+
+} __aligned(CACHE_LINE_SIZE);
+
+/*
+ * A contiguous representation of the pages composing the Tx bounce buffer.
+ * The xmit taskqueue and the completion taskqueue both simultaneously use it.
+ * Both operate on `available`: the xmit tq lowers it and the completion tq
+ * raises it. `head` is the last location written at and so only the xmit tq
+ * uses it.
+ */
+struct gve_tx_fifo {
+	vm_offset_t base; /* address of base of FIFO */
+	uint32_t size; /* total size */
+	volatile int available; /* how much space is still available */
+	uint32_t head; /* offset to write at */
+};
+
+struct gve_tx_buffer_state {
+	struct mbuf *mbuf;
+	struct gve_tx_iovec iov[GVE_TX_MAX_DESCS];
+};
+
+struct gve_txq_stats {
+	counter_u64_t tbytes;
+	counter_u64_t tpackets;
+	counter_u64_t tso_packet_cnt;
+	counter_u64_t tx_dropped_pkt;
+	counter_u64_t tx_dropped_pkt_nospace_device;
+	counter_u64_t tx_dropped_pkt_nospace_bufring;
+	counter_u64_t tx_dropped_pkt_vlan;
+};
+
+#define NUM_TX_STATS (sizeof(struct gve_txq_stats) / sizeof(counter_u64_t))
+
+/* power-of-2 sized transmit ring */
+struct gve_tx_ring {
+	struct gve_ring_com com;
+	struct gve_dma_handle desc_ring_mem;
+
+	struct task xmit_task;
+	struct taskqueue *xmit_tq;
+
+	/* accessed in the transmit hot path */
+	struct {
+		union gve_tx_desc *desc_ring;
+		struct gve_tx_buffer_state *info;
+		struct buf_ring *br;
+
+		struct gve_tx_fifo fifo;
+		struct mtx ring_mtx;
+
+		uint32_t req; /* free-running total number of packets written to the nic */
+		uint32_t done; /* free-running total number of completed packets */
+		uint32_t mask; /* masks the req and done to the size of the ring */
+		struct gve_txq_stats stats;
+	} __aligned(CACHE_LINE_SIZE);
+
+} __aligned(CACHE_LINE_SIZE);
+
+struct gve_priv {
+	if_t ifp;
+	device_t dev;
+	struct ifmedia media;
+
+	uint8_t mac[ETHER_ADDR_LEN];
+
+	struct gve_dma_handle aq_mem;
+
+	struct resource *reg_bar; /* BAR0 */
+	struct resource *db_bar; /* BAR2 */
+	struct resource *msix_table;
+
+	uint32_t mgmt_msix_idx;
+	uint32_t rx_copybreak;
+
+	uint16_t num_event_counters;
+	uint16_t default_num_queues;
+	uint16_t tx_desc_cnt;
+	uint16_t rx_desc_cnt;
+	uint16_t rx_pages_per_qpl;
+	uint64_t max_registered_pages;
+	uint64_t num_registered_pages;
+	uint32_t supported_features;
+	uint16_t max_mtu;
+
+	struct gve_dma_handle counter_array_mem;
+	__be32 *counters;
+	struct gve_dma_handle irqs_db_mem;
+	struct gve_irq_db *irq_db_indices;
+
+	enum gve_queue_format queue_format;
+	struct gve_queue_page_list *qpls;
+	struct gve_queue_config tx_cfg;
+	struct gve_queue_config rx_cfg;
+	uint32_t num_queues;
+
+	struct gve_irq *irq_tbl;
+	struct gve_tx_ring *tx;
+	struct gve_rx_ring *rx;
+
+	/*
+	 * Admin queue - see gve_adminq.h
+	 * Since AQ cmds do not run in steady state, 32 bit counters suffice
+	 */
+	struct gve_adminq_command *adminq;
+	vm_paddr_t adminq_bus_addr;
+	uint32_t adminq_mask; /* masks prod_cnt to adminq size */
+	uint32_t adminq_prod_cnt; /* free-running count of AQ cmds executed */
+	uint32_t adminq_cmd_fail; /* free-running count of AQ cmds failed */
+	uint32_t adminq_timeouts; /* free-running count of AQ cmds timeouts */
+	/* free-running count of each distinct AQ cmd executed */
+	uint32_t adminq_describe_device_cnt;
+	uint32_t adminq_cfg_device_resources_cnt;
+	uint32_t adminq_register_page_list_cnt;
+	uint32_t adminq_unregister_page_list_cnt;
+	uint32_t adminq_create_tx_queue_cnt;
+	uint32_t adminq_create_rx_queue_cnt;
+	uint32_t adminq_destroy_tx_queue_cnt;
+	uint32_t adminq_destroy_rx_queue_cnt;
+	uint32_t adminq_dcfg_device_resources_cnt;
+	uint32_t adminq_set_driver_parameter_cnt;
+	uint32_t adminq_verify_driver_compatibility_cnt;
+
+	uint32_t interface_up_cnt;
+	uint32_t interface_down_cnt;
+	uint32_t reset_cnt;
+
+	struct task service_task;
+	struct taskqueue *service_tq;
+
+	struct gve_state_flags state_flags;
+	struct sx gve_iface_lock;
+};
+
+static inline bool
+gve_get_state_flag(struct gve_priv *priv, int pos)
+{
+	return (BIT_ISSET(GVE_NUM_STATE_FLAGS, pos, &priv->state_flags));
+}
+
+static inline void
+gve_set_state_flag(struct gve_priv *priv, int pos)
+{
+	BIT_SET_ATOMIC(GVE_NUM_STATE_FLAGS, pos, &priv->state_flags);
+}
+
+static inline void
+gve_clear_state_flag(struct gve_priv *priv, int pos)
+{
+	BIT_CLR_ATOMIC(GVE_NUM_STATE_FLAGS, pos, &priv->state_flags);
+}
+
+/* Defined in gve_main.c */
+void gve_schedule_reset(struct gve_priv *priv);
+
+/* Register access functions defined in gve_utils.c */
+uint32_t gve_reg_bar_read_4(struct gve_priv *priv, bus_size_t offset);
+void gve_reg_bar_write_4(struct gve_priv *priv, bus_size_t offset, uint32_t val);
+void gve_db_bar_write_4(struct gve_priv *priv, bus_size_t offset, uint32_t val);
+
+/* QPL (Queue Page List) functions defined in gve_qpl.c */
+int gve_alloc_qpls(struct gve_priv *priv);
+void gve_free_qpls(struct gve_priv *priv);
+int gve_register_qpls(struct gve_priv *priv);
+int gve_unregister_qpls(struct gve_priv *priv);
+
+/* TX functions defined in gve_tx.c */
+int gve_alloc_tx_rings(struct gve_priv *priv);
+void gve_free_tx_rings(struct gve_priv *priv);
+int gve_create_tx_rings(struct gve_priv *priv);
+int gve_destroy_tx_rings(struct gve_priv *priv);
+int gve_tx_intr(void *arg);
+int gve_xmit_ifp(if_t ifp, struct mbuf *mbuf);
+void gve_qflush(if_t ifp);
+void gve_xmit_tq(void *arg, int pending);
+void gve_tx_cleanup_tq(void *arg, int pending);
+
+/* RX functions defined in gve_rx.c */
+int gve_alloc_rx_rings(struct gve_priv *priv);
+void gve_free_rx_rings(struct gve_priv *priv);
+int gve_create_rx_rings(struct gve_priv *priv);
+int gve_destroy_rx_rings(struct gve_priv *priv);
+int gve_rx_intr(void *arg);
+void gve_rx_cleanup_tq(void *arg, int pending);
+
+/* DMA functions defined in gve_utils.c */
+int gve_dma_alloc_coherent(struct gve_priv *priv, int size, int align,
+    struct gve_dma_handle *dma);
+void gve_dma_free_coherent(struct gve_dma_handle *dma);
+int gve_dmamap_create(struct gve_priv *priv, int size, int align,
+    struct gve_dma_handle *dma);
+void gve_dmamap_destroy(struct gve_dma_handle *dma);
+
+/* IRQ functions defined in gve_utils.c */
+void gve_free_irqs(struct gve_priv *priv);
+int gve_alloc_irqs(struct gve_priv *priv);
+void gve_unmask_all_queue_irqs(struct gve_priv *priv);
+void gve_mask_all_queue_irqs(struct gve_priv *priv);
+
+/* Systcl functions defined in gve_sysctl.c*/
+void gve_setup_sysctl(struct gve_priv *priv);
+void gve_accum_stats(struct gve_priv *priv, uint64_t *rpackets,
+    uint64_t *rbytes, uint64_t *rx_dropped_pkt, uint64_t *tpackets,
+    uint64_t *tbytes, uint64_t *tx_dropped_pkt);
+
+/* Stats functions defined in gve_utils.c */
+void gve_alloc_counters(counter_u64_t *stat, int num_stats);
+void gve_free_counters(counter_u64_t *stat, int num_stats);
+
+#endif /* _GVE_FBSD_H_ */
diff --git a/sys/dev/gve/gve_adminq.c b/sys/dev/gve/gve_adminq.c
new file mode 100644
index 000000000000..3c332607ebd4
--- /dev/null
+++ b/sys/dev/gve/gve_adminq.c
@@ -0,0 +1,803 @@
+/*-
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Copyright (c) 2023 Google LLC
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software without
+ *    specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <sys/endian.h>
+#include <sys/socket.h>
+#include <sys/time.h>
+
+#include <net/ethernet.h>
+#include <net/if.h>
+#include <net/if_var.h>
+
+#include "gve.h"
+#include "gve_adminq.h"
+
+#define GVE_ADMINQ_SLEEP_LEN_MS 20
+#define GVE_MAX_ADMINQ_EVENT_COUNTER_CHECK 10
+#define GVE_ADMINQ_DEVICE_DESCRIPTOR_VERSION 1
+#define GVE_REG_ADMINQ_ADDR 16
+#define ADMINQ_SLOTS (ADMINQ_SIZE / sizeof(struct gve_adminq_command))
+
+#define GVE_DEVICE_OPTION_ERROR_FMT "%s option error:\n" \
+    "Expected: length=%d, feature_mask=%x.\n" \
+    "Actual: length=%d, feature_mask=%x.\n"
+
+#define GVE_DEVICE_OPTION_TOO_BIG_FMT "Length of %s option larger than expected." \
+    " Possible older version of guest driver.\n"
+
+static
+void gve_parse_device_option(struct gve_priv *priv,
+    struct gve_device_descriptor *device_descriptor,
+    struct gve_device_option *option,
+    struct gve_device_option_gqi_qpl **dev_op_gqi_qpl,
+    struct gve_device_option_jumbo_frames **dev_op_jumbo_frames)
+{
+	uint32_t req_feat_mask = be32toh(option->required_features_mask);
+	uint16_t option_length = be16toh(option->option_length);
+	uint16_t option_id = be16toh(option->option_id);
+
+	/*
+	 * If the length or feature mask doesn't match, continue without
+	 * enabling the feature.
+	 */
+	switch (option_id) {
+	case GVE_DEV_OPT_ID_GQI_QPL:
+		if (option_length < sizeof(**dev_op_gqi_qpl) ||
+		    req_feat_mask != GVE_DEV_OPT_REQ_FEAT_MASK_GQI_QPL) {
+			device_printf(priv->dev, GVE_DEVICE_OPTION_ERROR_FMT,
+			    "GQI QPL", (int)sizeof(**dev_op_gqi_qpl),
+			    GVE_DEV_OPT_REQ_FEAT_MASK_GQI_QPL,
+			    option_length, req_feat_mask);
+			break;
+		}
+
+		if (option_length > sizeof(**dev_op_gqi_qpl)) {
+			device_printf(priv->dev, GVE_DEVICE_OPTION_TOO_BIG_FMT,
+			    "GQI QPL");
+		}
+		*dev_op_gqi_qpl = (void *)(option + 1);
+		break;
+
+	case GVE_DEV_OPT_ID_JUMBO_FRAMES:
+		if (option_length < sizeof(**dev_op_jumbo_frames) ||
+		    req_feat_mask != GVE_DEV_OPT_REQ_FEAT_MASK_JUMBO_FRAMES) {
+			device_printf(priv->dev, GVE_DEVICE_OPTION_ERROR_FMT,
+			    "Jumbo Frames", (int)sizeof(**dev_op_jumbo_frames),
+			    GVE_DEV_OPT_REQ_FEAT_MASK_JUMBO_FRAMES,
+			    option_length, req_feat_mask);
+			break;
+		}
+
+		if (option_length > sizeof(**dev_op_jumbo_frames)) {
+			device_printf(priv->dev,
+			    GVE_DEVICE_OPTION_TOO_BIG_FMT, "Jumbo Frames");
+		}
+		*dev_op_jumbo_frames = (void *)(option + 1);
+		break;
+
+	default:
+		/*
+		 * If we don't recognize the option just continue
+		 * without doing anything.
+		 */
+		device_printf(priv->dev, "Unrecognized device option 0x%hx not enabled.\n",
+		    option_id);
+	}
+}
+
+/* Process all device options for a given describe device call. */
+static int
+gve_process_device_options(struct gve_priv *priv,
+    struct gve_device_descriptor *descriptor,
+    struct gve_device_option_gqi_qpl **dev_op_gqi_qpl,
+    struct gve_device_option_jumbo_frames **dev_op_jumbo_frames)
+{
+	char *desc_end = (char *)descriptor + be16toh(descriptor->total_length);
+	const int num_options = be16toh(descriptor->num_device_options);
+	struct gve_device_option *dev_opt;
+	int i;
+
+	/* The options struct directly follows the device descriptor. */
+	dev_opt = (void *)(descriptor + 1);
+	for (i = 0; i < num_options; i++) {
+		if ((char *)(dev_opt + 1) > desc_end ||
+		    (char *)(dev_opt + 1) + be16toh(dev_opt->option_length) > desc_end) {
+			device_printf(priv->dev,
+			    "options exceed device_descriptor's total length.\n");
+			return (EINVAL);
+		}
+
+		gve_parse_device_option(priv, descriptor, dev_opt,
+		    dev_op_gqi_qpl, dev_op_jumbo_frames);
+		dev_opt = (void *)((char *)(dev_opt + 1) + be16toh(dev_opt->option_length));
+	}
+
+	return (0);
+}
+
+static int gve_adminq_execute_cmd(struct gve_priv *priv,
+    struct gve_adminq_command *cmd);
+
+static int
+gve_adminq_destroy_tx_queue(struct gve_priv *priv, uint32_t id)
+{
+	struct gve_adminq_command cmd = (struct gve_adminq_command){};
+
+	cmd.opcode = htobe32(GVE_ADMINQ_DESTROY_TX_QUEUE);
+	cmd.destroy_tx_queue.queue_id = htobe32(id);
+
+	return (gve_adminq_execute_cmd(priv, &cmd));
+}
+
+static int
+gve_adminq_destroy_rx_queue(struct gve_priv *priv, uint32_t id)
+{
+	struct gve_adminq_command cmd = (struct gve_adminq_command){};
+
+	cmd.opcode = htobe32(GVE_ADMINQ_DESTROY_RX_QUEUE);
+	cmd.destroy_rx_queue.queue_id = htobe32(id);
+
+	return (gve_adminq_execute_cmd(priv, &cmd));
+}
+
+int
+gve_adminq_destroy_rx_queues(struct gve_priv *priv, uint32_t num_queues)
+{
+	int err;
+	int i;
+
+	for (i = 0; i < num_queues; i++) {
+		err = gve_adminq_destroy_rx_queue(priv, i);
+		if (err != 0) {
+			device_printf(priv->dev, "Failed to destroy rxq %d, err: %d\n",
+			    i, err);
+		}
+	}
+
+	if (err != 0)
+		return (err);
+
+	device_printf(priv->dev, "Destroyed %d rx queues\n", num_queues);
+	return (0);
+}
+
+int
+gve_adminq_destroy_tx_queues(struct gve_priv *priv, uint32_t num_queues)
+{
+	int err;
+	int i;
+
+	for (i = 0; i < num_queues; i++) {
+		err = gve_adminq_destroy_tx_queue(priv, i);
+		if (err != 0) {
+			device_printf(priv->dev, "Failed to destroy txq %d, err: %d\n",
+			    i, err);
+		}
+	}
+
+	if (err != 0)
+		return (err);
+
+	device_printf(priv->dev, "Destroyed %d tx queues\n", num_queues);
*** 4706 LINES SKIPPED ***