git: 0f5116d7efe3 - main - AMD IOMMU driver

From: Konstantin Belousov <kib_at_FreeBSD.org>
Date: Sat, 02 Nov 2024 23:46:40 UTC
The branch main has been updated by kib:

URL: https://cgit.FreeBSD.org/src/commit/?id=0f5116d7efe33c81f0b24b56eec78af37898f500

commit 0f5116d7efe33c81f0b24b56eec78af37898f500
Author:     Konstantin Belousov <kib@FreeBSD.org>
AuthorDate: 2024-05-12 10:20:11 +0000
Commit:     Konstantin Belousov <kib@FreeBSD.org>
CommitDate: 2024-11-02 23:45:40 +0000

    AMD IOMMU driver
    
    This driver is functionally equivalent to the in-tree Intel DMAR code.
    It handles busdma and interrupt remapping from the host OS.  There is no
    integration with bhyve, and the stub iommu drivers in bhyve code cannot
    coexist with this driver (planned).
    
    The biggest architectural problem with the code is that the AMD IOMMU
    units are enumerated as PCIe-attached security devices, which happens
    much later than when the HPET and IOAPIC drivers attach and activate
    their interrupts.
    Because of this, HPET FSB interrupts and IOAPIC interrupts are always
    identity-mapped.
    
    The code is of late alpha quality.  By default the driver is disabled.
    To enable for testing, set in loader.conf:
    hw.amdiommu.enable=1
    hw.iommu.dma=1 <- to enable iommu busdma
    hw.iommu.ir=1 <- to enable interrupt remapping
    
    Discussed with: emaste
    Sponsored by:   Advanced Micro Devices (AMD)
    Sponsored by:   The FreeBSD Foundation
    MFC after:      1 week
    Differential revision:  https://reviews.freebsd.org/D47256
---
 sys/conf/files.x86          |    6 +
 sys/x86/iommu/amd_cmd.c     |  360 +++++++++++++
 sys/x86/iommu/amd_ctx.c     |  639 +++++++++++++++++++++++
 sys/x86/iommu/amd_drv.c     | 1205 +++++++++++++++++++++++++++++++++++++++++++
 sys/x86/iommu/amd_event.c   |  323 ++++++++++++
 sys/x86/iommu/amd_idpgtbl.c |  396 ++++++++++++++
 sys/x86/iommu/amd_intrmap.c |  391 ++++++++++++++
 sys/x86/iommu/amd_iommu.h   |  243 +++++++++
 8 files changed, 3563 insertions(+)

diff --git a/sys/conf/files.x86 b/sys/conf/files.x86
index 7c05544f03da..df206b314b38 100644
--- a/sys/conf/files.x86
+++ b/sys/conf/files.x86
@@ -344,6 +344,12 @@ x86/cpufreq/hwpstate_amd.c	optional	cpufreq
 x86/cpufreq/hwpstate_intel.c	optional	cpufreq
 x86/cpufreq/p4tcc.c		optional	cpufreq
 x86/cpufreq/powernow.c		optional	cpufreq
+x86/iommu/amd_cmd.c		optional	acpi iommu pci
+x86/iommu/amd_ctx.c		optional	acpi iommu pci
+x86/iommu/amd_drv.c		optional	acpi iommu pci
+x86/iommu/amd_event.c		optional	acpi iommu pci
+x86/iommu/amd_idpgtbl.c		optional	acpi iommu pci
+x86/iommu/amd_intrmap.c		optional	acpi iommu pci
 x86/iommu/intel_ctx.c		optional	acpi iommu pci
 x86/iommu/intel_drv.c		optional	acpi iommu pci
 x86/iommu/intel_fault.c		optional	acpi iommu pci
diff --git a/sys/x86/iommu/amd_cmd.c b/sys/x86/iommu/amd_cmd.c
new file mode 100644
index 000000000000..bbc2a8e0ad9f
--- /dev/null
+++ b/sys/x86/iommu/amd_cmd.c
@@ -0,0 +1,360 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2024 The FreeBSD Foundation
+ *
+ * This software was developed by Konstantin Belousov <kib@FreeBSD.org>
+ * under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "opt_acpi.h"
+
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/memdesc.h>
+#include <sys/module.h>
+#include <sys/rman.h>
+#include <sys/taskqueue.h>
+#include <sys/time.h>
+#include <sys/tree.h>
+#include <sys/vmem.h>
+#include <vm/vm.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_page.h>
+#include <vm/vm_map.h>
+#include <contrib/dev/acpica/include/acpi.h>
+#include <contrib/dev/acpica/include/accommon.h>
+#include <dev/acpica/acpivar.h>
+#include <dev/pci/pcireg.h>
+#include <machine/bus.h>
+#include <machine/cpu.h>
+#include <x86/include/busdma_impl.h>
+#include <dev/iommu/busdma_iommu.h>
+#include <x86/iommu/amd_reg.h>
+#include <x86/iommu/x86_iommu.h>
+#include <x86/iommu/amd_iommu.h>
+
+/*
+ * Enable command-buffer processing: set AMDIOMMU_CTRL_CMDBUF_EN in the
+ * software copy of the control register and write it back to hardware.
+ * The unit lock serializes read-modify-write cycles on unit->hw_ctrl.
+ */
+static void
+amdiommu_enable_cmdbuf(struct amdiommu_unit *unit)
+{
+	AMDIOMMU_ASSERT_LOCKED(unit);
+
+	unit->hw_ctrl |= AMDIOMMU_CTRL_CMDBUF_EN;
+	amdiommu_write8(unit, AMDIOMMU_CTRL, unit->hw_ctrl);
+}
+
+/*
+ * Disable command-buffer processing: clear AMDIOMMU_CTRL_CMDBUF_EN in
+ * the cached control value and push it to the hardware control register.
+ * Caller must hold the unit lock.
+ */
+static void
+amdiommu_disable_cmdbuf(struct amdiommu_unit *unit)
+{
+	AMDIOMMU_ASSERT_LOCKED(unit);
+
+	unit->hw_ctrl &= ~AMDIOMMU_CTRL_CMDBUF_EN;
+	amdiommu_write8(unit, AMDIOMMU_CTRL, unit->hw_ctrl);
+}
+
+
+/*
+ * Enable the completion-wait interrupt for the unit and write
+ * AMDIOMMU_CMDEVS_COMWAITINT to the command/event status register
+ * (NOTE(review): presumably write-1-to-clear, dropping any interrupt
+ * latched while it was disabled -- confirm against the AMD IOMMU spec).
+ * Caller must hold the unit lock.
+ */
+static void
+amdiommu_enable_qi_intr(struct iommu_unit *iommu)
+{
+	struct amdiommu_unit *unit;
+
+	unit = IOMMU2AMD(iommu);
+	AMDIOMMU_ASSERT_LOCKED(unit);
+	unit->hw_ctrl |= AMDIOMMU_CTRL_COMWINT_EN;
+	amdiommu_write8(unit, AMDIOMMU_CTRL, unit->hw_ctrl);
+	amdiommu_write8(unit, AMDIOMMU_CMDEV_STATUS,
+	    AMDIOMMU_CMDEVS_COMWAITINT);
+}
+
+/*
+ * Disable the completion-wait interrupt by clearing
+ * AMDIOMMU_CTRL_COMWINT_EN in the cached control register value and
+ * writing it back.  Caller must hold the unit lock.
+ */
+static void
+amdiommu_disable_qi_intr(struct iommu_unit *iommu)
+{
+	struct amdiommu_unit *unit;
+
+	unit = IOMMU2AMD(iommu);
+	AMDIOMMU_ASSERT_LOCKED(unit);
+	unit->hw_ctrl &= ~AMDIOMMU_CTRL_COMWINT_EN;
+	amdiommu_write8(unit, AMDIOMMU_CTRL, unit->hw_ctrl);
+}
+
+/*
+ * Publish the software tail pointer to the hardware tail register,
+ * letting the command streamer fetch the descriptors queued so far.
+ * Caller must hold the unit lock.
+ */
+static void
+amdiommu_cmd_advance_tail(struct iommu_unit *iommu)
+{
+	struct amdiommu_unit *unit;
+
+	unit = IOMMU2AMD(iommu);
+	AMDIOMMU_ASSERT_LOCKED(unit);
+	amdiommu_write8(unit, AMDIOMMU_CMDBUF_TAIL, unit->x86c.inv_queue_tail);
+}
+
+/*
+ * Reserve room in the command ring for descr_count descriptors,
+ * busy-waiting until the hardware has consumed enough entries.  The
+ * cached free-byte count (inv_queue_avail) is decremented on success;
+ * the caller emits exactly that many descriptors afterwards.
+ * Caller must hold the unit lock.
+ */
+static void
+amdiommu_cmd_ensure(struct iommu_unit *iommu, int descr_count)
+{
+	struct amdiommu_unit *unit;
+	uint64_t head;
+	int bytes;
+
+	unit = IOMMU2AMD(iommu);
+	AMDIOMMU_ASSERT_LOCKED(unit);
+	bytes = descr_count << AMDIOMMU_CMD_SZ_SHIFT;
+	for (;;) {
+		if (bytes <= unit->x86c.inv_queue_avail)
+			break;
+		/* refill */
+		head = amdiommu_read8(unit, AMDIOMMU_CMDBUF_HEAD);
+		head &= AMDIOMMU_CMDPTR_MASK;
+		/*
+		 * Free space is the gap between tail and head, minus
+		 * one descriptor so tail never catches head (which
+		 * would read as an empty ring).  Add the ring size
+		 * back when the gap computation wrapped.
+		 */
+		unit->x86c.inv_queue_avail = head - unit->x86c.inv_queue_tail -
+		    AMDIOMMU_CMD_SZ;
+		if (head <= unit->x86c.inv_queue_tail)
+			unit->x86c.inv_queue_avail += unit->x86c.inv_queue_size;
+		if (bytes <= unit->x86c.inv_queue_avail)
+			break;
+
+		/*
+		 * No space in the queue, do busy wait.  Hardware must
+		 * make a progress.  But first advance the tail to
+		 * inform the descriptor streamer about entries we
+		 * might have already filled, otherwise they could
+		 * clog the whole queue..
+		 *
+		 * See dmar_qi_invalidate_locked() for a discussion
+		 * about data race prevention.
+		 */
+		amdiommu_cmd_advance_tail(iommu);
+		unit->x86c.inv_queue_full++;
+		cpu_spinwait();
+	}
+	unit->x86c.inv_queue_avail -= bytes;
+}
+
+/*
+ * Copy one descriptor into the command ring at the software tail and
+ * advance the tail, wrapping with a mask (inv_queue_size must be a
+ * power of two).  The caller is responsible for having reserved space
+ * via amdiommu_cmd_ensure() and for eventually advancing the hardware
+ * tail.  Caller must hold the unit lock.
+ */
+static void
+amdiommu_cmd_emit(struct amdiommu_unit *unit, const struct
+    amdiommu_cmd_generic *cmd)
+{
+	AMDIOMMU_ASSERT_LOCKED(unit);
+
+	memcpy(unit->x86c.inv_queue + unit->x86c.inv_queue_tail, cmd,
+	    sizeof(*cmd));
+	unit->x86c.inv_queue_tail += AMDIOMMU_CMD_SZ;
+	KASSERT(unit->x86c.inv_queue_tail <= unit->x86c.inv_queue_size,
+	    ("tail overflow 0x%x 0x%jx", unit->x86c.inv_queue_tail,
+	    (uintmax_t)unit->x86c.inv_queue_size));
+	unit->x86c.inv_queue_tail &= unit->x86c.inv_queue_size - 1;
+}
+
+/*
+ * Emit a COMPLETION_WAIT descriptor.  With memw set, the hardware
+ * stores 'seq' to the wait-descriptor sequence word at physical
+ * address inv_waitd_seq_hw_phys; address0 carries bits 31:3 of that
+ * address (hence the >> 3) and address1 the upper 32 bits.  The f
+ * (fence) bit orders the wait after preceding commands, and i raises
+ * the completion-wait interrupt.  Caller must hold the unit lock.
+ */
+static void
+amdiommu_cmd_emit_wait_descr(struct iommu_unit *iommu, uint32_t seq,
+    bool intr, bool memw, bool fence)
+{
+	struct amdiommu_unit *unit;
+	struct amdiommu_cmd_completion_wait c;
+
+	unit = IOMMU2AMD(iommu);
+	AMDIOMMU_ASSERT_LOCKED(unit);
+
+	bzero(&c, sizeof(c));
+	c.op = AMDIOMMU_CMD_COMPLETION_WAIT;
+	if (memw) {
+		uint32_t x;
+
+		c.s = 1;
+		x = unit->x86c.inv_waitd_seq_hw_phys;
+		x >>= 3;
+		c.address0 = x;
+		x = unit->x86c.inv_waitd_seq_hw_phys >> 32;
+		c.address1 = x;
+		c.data0 = seq;
+	}
+	if (fence)
+		c.f = 1;
+	if (intr)
+		c.i = 1;
+	amdiommu_cmd_emit(unit, (struct amdiommu_cmd_generic *)&c);
+}
+
+/*
+ * Emit INVALIDATE_IOMMU_PAGES descriptors covering [base, base + size)
+ * for the domain, one page at a time, followed by a wait-sequence
+ * descriptor via iommu_qi_emit_wait_seq().  Caller must hold the unit
+ * lock; completion is not waited for here.
+ */
+static void
+amdiommu_qi_invalidate_emit(struct iommu_domain *adomain, iommu_gaddr_t base,
+    iommu_gaddr_t size, struct iommu_qi_genseq *pseq, bool emit_wait)
+{
+	struct amdiommu_domain *domain;
+	struct amdiommu_unit *unit;
+	struct amdiommu_cmd_invalidate_iommu_pages c;
+	u_int isize;
+
+	domain = IODOM2DOM(adomain);
+	unit = domain->unit;
+	AMDIOMMU_ASSERT_LOCKED(unit);
+	bzero(&c, sizeof(c));
+	c.op = AMDIOMMU_CMD_INVALIDATE_IOMMU_PAGES;
+	c.domainid = domain->domain;
+	isize = IOMMU_PAGE_SIZE; /* XXXKIB handle superpages */
+
+	for (; size > 0; base += isize, size -= isize) {
+		amdiommu_cmd_ensure(AMD2IOMMU(unit), 1);
+		c.s = 0;	/* single-page invalidation */
+		c.pde = 1;
+		c.address = base >> IOMMU_PAGE_SHIFT;
+		amdiommu_cmd_emit(unit, (struct amdiommu_cmd_generic *)&c);
+	}
+	iommu_qi_emit_wait_seq(AMD2IOMMU(unit), pseq, emit_wait);
+}
+
+/*
+ * Queue a single INVALIDATE_IOMMU_PAGES descriptor that flushes every
+ * translation cached for the domain, without waiting for completion.
+ * Caller must hold the unit lock.
+ */
+void
+amdiommu_qi_invalidate_all_pages_locked_nowait(struct amdiommu_domain *domain)
+{
+	struct amdiommu_unit *unit;
+	struct amdiommu_cmd_invalidate_iommu_pages c;
+
+	unit = domain->unit;
+	AMDIOMMU_ASSERT_LOCKED(unit);
+	bzero(&c, sizeof(c));
+	c.op = AMDIOMMU_CMD_INVALIDATE_IOMMU_PAGES;
+	c.domainid = domain->domain;
+
+	/*
+	 * The magic specified in the note for INVALIDATE_IOMMU_PAGES
+	 * description: s = 1 with this all-ones address pattern means
+	 * "invalidate all pages of the domain".
+	 */
+	c.s = 1;
+	c.pde = 1;
+	c.address = 0x7ffffffffffff;
+
+	amdiommu_cmd_ensure(AMD2IOMMU(unit), 1);
+	amdiommu_cmd_emit(unit, (struct amdiommu_cmd_generic *)&c);
+}
+
+/*
+ * Emit a wait descriptor and synchronously wait until the hardware
+ * reaches it, i.e. all previously queued invalidations have completed.
+ * The waiter count is bumped before the tail is pushed so the qi task
+ * cannot miss this thread in its wakeup.  Caller must hold the unit
+ * lock (asserted by the callees).
+ */
+void
+amdiommu_qi_invalidate_wait_sync(struct iommu_unit *iommu)
+{
+	struct iommu_qi_genseq gseq;
+
+	amdiommu_cmd_ensure(iommu, 1);
+	iommu_qi_emit_wait_seq(iommu, &gseq, true);
+	IOMMU2AMD(iommu)->x86c.inv_seq_waiters++;
+	amdiommu_cmd_advance_tail(iommu);
+	iommu_qi_wait_for_seq(iommu, &gseq, true);
+}
+
+/*
+ * Queue an INVALIDATE_DEVTAB_ENTRY descriptor for the context's RID
+ * without waiting for it to complete.  Caller must hold the unit lock
+ * (asserted by the emit helpers).
+ */
+void
+amdiommu_qi_invalidate_ctx_locked_nowait(struct amdiommu_ctx *ctx)
+{
+	struct amdiommu_cmd_invalidate_devtab_entry cmd;
+	struct amdiommu_unit *unit;
+
+	unit = CTX2AMD(ctx);
+	amdiommu_cmd_ensure(AMD2IOMMU(unit), 1);
+	bzero(&cmd, sizeof(cmd));
+	cmd.op = AMDIOMMU_CMD_INVALIDATE_DEVTAB_ENTRY;
+	cmd.devid = ctx->context.rid;
+	amdiommu_cmd_emit(unit, (struct amdiommu_cmd_generic *)&cmd);
+}
+
+
+/*
+ * Invalidate the context's device table entry and synchronously wait
+ * for the invalidation to finish.  Caller must hold the unit lock.
+ */
+void
+amdiommu_qi_invalidate_ctx_locked(struct amdiommu_ctx *ctx)
+{
+	struct iommu_unit *iommu;
+
+	iommu = AMD2IOMMU(CTX2AMD(ctx));
+	amdiommu_qi_invalidate_ctx_locked_nowait(ctx);
+	amdiommu_qi_invalidate_wait_sync(iommu);
+}
+
+/*
+ * Queue an INVALIDATE_INTERRUPT_TABLE descriptor for the given device
+ * id, flushing cached interrupt-remapping entries, without waiting for
+ * completion.  Caller must hold the unit lock.
+ */
+void
+amdiommu_qi_invalidate_ir_locked_nowait(struct amdiommu_unit *unit,
+    uint16_t devid)
+{
+	struct amdiommu_cmd_invalidate_interrupt_table c;
+
+	AMDIOMMU_ASSERT_LOCKED(unit);
+
+	amdiommu_cmd_ensure(AMD2IOMMU(unit), 1);
+	bzero(&c, sizeof(c));
+	c.op = AMDIOMMU_CMD_INVALIDATE_INTERRUPT_TABLE;
+	c.devid = devid;
+	amdiommu_cmd_emit(unit, (struct amdiommu_cmd_generic *)&c);
+}
+
+/*
+ * Flush cached interrupt-remapping state for devid and synchronously
+ * wait for the invalidation to complete.  Caller must hold the unit
+ * lock.
+ */
+void
+amdiommu_qi_invalidate_ir_locked(struct amdiommu_unit *unit, uint16_t devid)
+{
+	struct iommu_unit *iommu;
+
+	iommu = AMD2IOMMU(unit);
+	amdiommu_qi_invalidate_ir_locked_nowait(unit, devid);
+	amdiommu_qi_invalidate_wait_sync(iommu);
+}
+
+/*
+ * Taskqueue handler run when a completion-wait interrupt fires: drain
+ * the deferred TLB-flush queue, then wake up any threads sleeping in
+ * iommu_qi_wait_for_seq() on this unit.
+ */
+static void
+amdiommu_qi_task(void *arg, int pending __unused)
+{
+	struct amdiommu_unit *unit;
+
+	unit = IOMMU2AMD(arg);
+	iommu_qi_drain_tlb_flush(AMD2IOMMU(unit));
+
+	AMDIOMMU_LOCK(unit);
+	if (unit->x86c.inv_seq_waiters > 0)
+		wakeup(&unit->x86c.inv_seq_waiters);
+	AMDIOMMU_UNLOCK(unit);
+}
+
+/*
+ * Set up the command (invalidation) queue for the unit: allocate the
+ * shared ring state via iommu_qi_common_init(), install the AMD
+ * implementations of the generic qi hooks, program the command buffer
+ * base/size register, and finally enable the command buffer and the
+ * completion-wait interrupt.  Returns 0 (no failure paths yet).
+ */
+int
+amdiommu_init_cmd(struct amdiommu_unit *unit)
+{
+	uint64_t qi_sz, rv;
+
+	unit->x86c.qi_buf_maxsz = ilog2(AMDIOMMU_CMDBUF_MAX / PAGE_SIZE);
+	unit->x86c.qi_cmd_sz = AMDIOMMU_CMD_SZ;
+	iommu_qi_common_init(AMD2IOMMU(unit), amdiommu_qi_task);
+	get_x86_iommu()->qi_ensure = amdiommu_cmd_ensure;
+	get_x86_iommu()->qi_emit_wait_descr = amdiommu_cmd_emit_wait_descr;
+	get_x86_iommu()->qi_advance_tail = amdiommu_cmd_advance_tail;
+	get_x86_iommu()->qi_invalidate_emit = amdiommu_qi_invalidate_emit;
+
+	/* Physical address of the ring becomes the register base field. */
+	rv = pmap_kextract((uintptr_t)unit->x86c.inv_queue);
+
+	/*
+	 * See the description of the ComLen encoding for Command
+	 * buffer Base Address Register.
+	 */
+	qi_sz = ilog2(unit->x86c.inv_queue_size / PAGE_SIZE) + 8;
+	rv |= qi_sz << AMDIOMMU_CMDBUF_BASE_SZSHIFT;
+
+	AMDIOMMU_LOCK(unit);
+	amdiommu_write8(unit, AMDIOMMU_CMDBUF_BASE, rv);
+	amdiommu_enable_cmdbuf(unit);
+	amdiommu_enable_qi_intr(AMD2IOMMU(unit));
+	AMDIOMMU_UNLOCK(unit);
+
+	return (0);
+}
+
+/*
+ * Teardown callback passed to iommu_qi_common_fini(): disable the
+ * command buffer and the completion-wait interrupt before the shared
+ * qi state is freed.
+ */
+static void
+amdiommu_fini_cmd_helper(struct iommu_unit *iommu)
+{
+	amdiommu_disable_cmdbuf(IOMMU2AMD(iommu));
+	amdiommu_disable_qi_intr(iommu);
+}
+
+/*
+ * Tear down the unit's command queue; the helper quiesces the hardware
+ * while iommu_qi_common_fini() releases the shared ring resources.
+ */
+void
+amdiommu_fini_cmd(struct amdiommu_unit *unit)
+{
+	iommu_qi_common_fini(AMD2IOMMU(unit), amdiommu_fini_cmd_helper);
+}
diff --git a/sys/x86/iommu/amd_ctx.c b/sys/x86/iommu/amd_ctx.c
new file mode 100644
index 000000000000..b3e85350a995
--- /dev/null
+++ b/sys/x86/iommu/amd_ctx.c
@@ -0,0 +1,639 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2024 The FreeBSD Foundation
+ *
+ * This software was developed by Konstantin Belousov <kib@FreeBSD.org>
+ * under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/bus.h>
+#include <sys/interrupt.h>
+#include <sys/kernel.h>
+#include <sys/ktr.h>
+#include <sys/limits.h>
+#include <sys/lock.h>
+#include <sys/memdesc.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/rwlock.h>
+#include <sys/rman.h>
+#include <sys/sysctl.h>
+#include <sys/taskqueue.h>
+#include <sys/tree.h>
+#include <sys/uio.h>
+#include <sys/vmem.h>
+#include <vm/vm.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_object.h>
+#include <vm/vm_page.h>
+#include <vm/vm_pager.h>
+#include <vm/vm_map.h>
+#include <contrib/dev/acpica/include/acpi.h>
+#include <contrib/dev/acpica/include/accommon.h>
+#include <dev/pci/pcireg.h>
+#include <dev/pci/pcivar.h>
+#include <machine/atomic.h>
+#include <machine/bus.h>
+#include <machine/md_var.h>
+#include <machine/specialreg.h>
+#include <x86/include/busdma_impl.h>
+#include <dev/iommu/busdma_iommu.h>
+#include <x86/iommu/amd_reg.h>
+#include <x86/iommu/x86_iommu.h>
+#include <x86/iommu/amd_iommu.h>
+
+/* malloc(9) tags for context and domain allocations in this file. */
+static MALLOC_DEFINE(M_AMDIOMMU_CTX, "amdiommu_ctx", "AMD IOMMU Context");
+static MALLOC_DEFINE(M_AMDIOMMU_DOMAIN, "amdiommu_dom", "AMD IOMMU Domain");
+
+static void amdiommu_unref_domain_locked(struct amdiommu_unit *unit,
+    struct amdiommu_domain *domain);
+
+/*
+ * Return a pointer to the device table entry for the context's RID on
+ * its owning unit.
+ */
+static struct amdiommu_dte *
+amdiommu_get_dtep(struct amdiommu_ctx *ctx)
+{
+	struct amdiommu_unit *unit;
+
+	unit = CTX2AMD(ctx);
+	return (&unit->dev_tbl[ctx->context.rid]);
+}
+
+/*
+ * Unload a single map entry: invalidate its IOTLB range and, when
+ * 'free' is set, hand the entry to the asynchronous qi machinery which
+ * frees it after the invalidation completes.  'cansleep' selects
+ * whether the synchronous path may sleep while waiting.
+ */
+void
+amdiommu_domain_unload_entry(struct iommu_map_entry *entry, bool free,
+    bool cansleep)
+{
+	struct amdiommu_domain *domain;
+	struct amdiommu_unit *unit;
+
+	domain = IODOM2DOM(entry->domain);
+	unit = DOM2AMD(domain);
+
+	/*
+	 * If "free" is false, then the IOTLB invalidation must be performed
+	 * synchronously.  Otherwise, the caller might free the entry before
+	 * dmar_qi_task() is finished processing it.
+	 */
+	if (free) {
+		AMDIOMMU_LOCK(unit);
+		iommu_qi_invalidate_locked(&domain->iodom, entry, true);
+		AMDIOMMU_UNLOCK(unit);
+	} else {
+		iommu_qi_invalidate_sync(&domain->iodom, entry->start,
+		    entry->end - entry->start, cansleep);
+		iommu_domain_free_entry(entry, false);
+	}
+}
+
+/*
+ * Decide whether a wait descriptor should follow the invalidation for
+ * this entry.  Currently always true; batching heuristics are a TODO
+ * (XXXKIB below).
+ */
+static bool
+amdiommu_domain_unload_emit_wait(struct amdiommu_domain *domain,
+    struct iommu_map_entry *entry)
+{
+	return (true); /* XXXKIB */
+}
+
+/*
+ * Unmap every entry on 'entries' from the domain's page table, then
+ * queue the corresponding IOTLB invalidations under the unit lock.
+ * The entries are consumed (removed from the tailq); the qi machinery
+ * frees each one after its invalidation completes.
+ */
+void
+amdiommu_domain_unload(struct iommu_domain *iodom,
+    struct iommu_map_entries_tailq *entries, bool cansleep)
+{
+	struct amdiommu_domain *domain;
+	struct amdiommu_unit *unit;
+	struct iommu_map_entry *entry, *entry1;
+	int error __diagused;
+
+	domain = IODOM2DOM(iodom);
+	unit = DOM2AMD(domain);
+
+	TAILQ_FOREACH_SAFE(entry, entries, dmamap_link, entry1) {
+		KASSERT((entry->flags & IOMMU_MAP_ENTRY_MAP) != 0,
+		    ("not mapped entry %p %p", domain, entry));
+		error = iodom->ops->unmap(iodom, entry,
+		    cansleep ? IOMMU_PGF_WAITOK : 0);
+		KASSERT(error == 0, ("unmap %p error %d", domain, error));
+	}
+	if (TAILQ_EMPTY(entries))
+		return;
+
+	AMDIOMMU_LOCK(unit);
+	while ((entry = TAILQ_FIRST(entries)) != NULL) {
+		TAILQ_REMOVE(entries, entry, dmamap_link);
+		iommu_qi_invalidate_locked(&domain->iodom, entry,
+		    amdiommu_domain_unload_emit_wait(domain, entry));
+	}
+	AMDIOMMU_UNLOCK(unit);
+}
+
+/*
+ * Final destruction of a domain that has no remaining contexts,
+ * references, or pending unloads (all asserted).  Releases the GAS
+ * state and page tables if they were initialized, the generic domain
+ * state, the domain id, and the structure itself.
+ */
+static void
+amdiommu_domain_destroy(struct amdiommu_domain *domain)
+{
+	struct iommu_domain *iodom;
+	struct amdiommu_unit *unit;
+
+	iodom = DOM2IODOM(domain);
+
+	KASSERT(TAILQ_EMPTY(&domain->iodom.unload_entries),
+	    ("unfinished unloads %p", domain));
+	KASSERT(LIST_EMPTY(&iodom->contexts),
+	    ("destroying dom %p with contexts", domain));
+	KASSERT(domain->ctx_cnt == 0,
+	    ("destroying dom %p with ctx_cnt %d", domain, domain->ctx_cnt));
+	KASSERT(domain->refs == 0,
+	    ("destroying dom %p with refs %d", domain, domain->refs));
+
+	if ((domain->iodom.flags & IOMMU_DOMAIN_GAS_INITED) != 0) {
+		AMDIOMMU_DOMAIN_LOCK(domain);
+		iommu_gas_fini_domain(iodom);
+		AMDIOMMU_DOMAIN_UNLOCK(domain);
+	}
+	if ((domain->iodom.flags & IOMMU_DOMAIN_PGTBL_INITED) != 0) {
+		/* free_pgtbl expects the pglock held when a pgtbl exists. */
+		if (domain->pgtbl_obj != NULL)
+			AMDIOMMU_DOMAIN_PGLOCK(domain);
+		amdiommu_domain_free_pgtbl(domain);
+	}
+	iommu_domain_fini(iodom);
+	unit = DOM2AMD(domain);
+	free_unr(unit->domids, domain->domain);
+	free(domain, M_AMDIOMMU_DOMAIN);
+}
+
+/*
+ * Return the first guest address NOT covered by a page table of 'lvl'
+ * translation levels, i.e. 1 << (IOMMU_PAGE_SHIFT +
+ * IOMMU_NPTEPGSHIFT * lvl), saturating to -1ull once the shift would
+ * reach the width of uint64_t.
+ *
+ * Bug fix: the original returned (1ull < (1ull << x)) -- a comparison,
+ * not a shift -- which evaluates to 1 for every level and broke both
+ * page-level selection in amdiommu_domain_init_pglvl() and the
+ * domain's iodom.end limit.
+ */
+static iommu_gaddr_t
+lvl2addr(int lvl)
+{
+	int x;
+
+	x = IOMMU_PAGE_SHIFT + IOMMU_NPTEPGSHIFT * lvl;
+	/* Level 6 has only 8 bits for page table index */
+	if (x >= NBBY * sizeof(uint64_t))
+		return (-1ull);
+	return (1ull << x);
+}
+
+/*
+ * Choose the number of page table levels for the domain: the smallest
+ * level count whose address span covers the domain's end address,
+ * then clamp it to the host address translation size (HATS) the unit
+ * reports in its extended feature register, shrinking iodom.end to
+ * match when clamped.
+ */
+static void
+amdiommu_domain_init_pglvl(struct amdiommu_unit *unit,
+    struct amdiommu_domain *domain)
+{
+	iommu_gaddr_t end;
+	int hats, i;
+	uint64_t efr_hats;
+
+	end = DOM2IODOM(domain)->end;
+	for (i = AMDIOMMU_PGTBL_MAXLVL; i > 1; i--) {
+		if (lvl2addr(i) >= end && lvl2addr(i - 1) < end)
+			break;
+	}
+	domain->pglvl = i;
+
+	efr_hats = unit->efr & AMDIOMMU_EFR_HATS_MASK;
+	switch (efr_hats) {
+	case AMDIOMMU_EFR_HATS_6LVL:
+		hats = 6;
+		break;
+	case AMDIOMMU_EFR_HATS_5LVL:
+		hats = 5;
+		break;
+	case AMDIOMMU_EFR_HATS_4LVL:
+		hats = 4;
+		break;
+	default:
+		printf("amdiommu%d: HATS %#jx (reserved) ignoring\n",
+		    unit->iommu.unit, (uintmax_t)efr_hats);
+		return;
+	}
+	if (hats >= domain->pglvl)
+		return;
+
+	printf("amdiommu%d: domain %d HATS %d pglvl %d reducing to HATS\n",
+	    unit->iommu.unit, domain->domain, hats, domain->pglvl);
+	domain->pglvl = hats;
+	domain->iodom.end = lvl2addr(hats);
+}
+
+/*
+ * Allocate and initialize a new domain on the unit.  id_mapped selects
+ * an identity-mapped domain (no page table); otherwise a page table is
+ * allocated and the local APIC MSI window is reserved so it can never
+ * be handed out to devices.  Sleeps for memory; returns NULL when a
+ * domain id or a later initialization step cannot be obtained, undoing
+ * partial work via amdiommu_domain_destroy().
+ */
+static struct amdiommu_domain *
+amdiommu_domain_alloc(struct amdiommu_unit *unit, bool id_mapped)
+{
+	struct amdiommu_domain *domain;
+	struct iommu_domain *iodom;
+	int error, id;
+
+	id = alloc_unr(unit->domids);
+	if (id == -1)
+		return (NULL);
+	domain = malloc(sizeof(*domain), M_AMDIOMMU_DOMAIN, M_WAITOK | M_ZERO);
+	iodom = DOM2IODOM(domain);
+	domain->domain = id;
+	LIST_INIT(&iodom->contexts);
+	iommu_domain_init(AMD2IOMMU(unit), iodom, &amdiommu_domain_map_ops);
+
+	domain->unit = unit;
+
+	domain->iodom.end = id_mapped ? ptoa(Maxmem) : BUS_SPACE_MAXADDR;
+	amdiommu_domain_init_pglvl(unit, domain);
+	iommu_gas_init_domain(DOM2IODOM(domain));
+
+	if (id_mapped) {
+		domain->iodom.flags |= IOMMU_DOMAIN_IDMAP;
+	} else {
+		error = amdiommu_domain_alloc_pgtbl(domain);
+		if (error != 0)
+			goto fail;
+		/* Disable local apic region access */
+		error = iommu_gas_reserve_region(iodom, 0xfee00000,
+		    0xfeefffff + 1, &iodom->msi_entry);
+		if (error != 0)
+			goto fail;
+	}
+
+	return (domain);
+
+fail:
+	amdiommu_domain_destroy(domain);
+	return (NULL);
+}
+
+/*
+ * Allocate and minimally initialize a context for the given RID on the
+ * domain.  The caller is responsible for linking it into the domain
+ * and programming the device table entry.  Sleeps for memory.
+ */
+static struct amdiommu_ctx *
+amdiommu_ctx_alloc(struct amdiommu_domain *domain, uint16_t rid)
+{
+	struct amdiommu_ctx *res;
+
+	res = malloc(sizeof(*res), M_AMDIOMMU_CTX, M_WAITOK | M_ZERO);
+	res->context.tag = malloc(sizeof(struct bus_dma_tag_iommu),
+	    M_AMDIOMMU_CTX, M_WAITOK | M_ZERO);
+	res->context.domain = DOM2IODOM(domain);
+	res->context.rid = rid;
+	res->context.refs = 1;
+	return (res);
+}
+
+/*
+ * Link the context onto its domain's context list, taking a domain
+ * reference for it and bumping the context count.  Caller must hold
+ * the unit lock.
+ */
+static void
+amdiommu_ctx_link(struct amdiommu_ctx *ctx)
+{
+	struct amdiommu_domain *domain;
+
+	domain = CTX2DOM(ctx);
+	IOMMU_ASSERT_LOCKED(domain->iodom.iommu);
+	KASSERT(domain->refs >= domain->ctx_cnt,
+	    ("dom %p ref underflow %d %d", domain, domain->refs,
+	    domain->ctx_cnt));
+	domain->refs++;
+	domain->ctx_cnt++;
+	LIST_INSERT_HEAD(&domain->iodom.contexts, &ctx->context, link);
+}
+
+/*
+ * Remove the context from its domain's list, dropping the domain
+ * reference it held and decrementing the context count.  Caller must
+ * hold the unit lock.
+ */
+static void
+amdiommu_ctx_unlink(struct amdiommu_ctx *ctx)
+{
+	struct amdiommu_domain *domain;
+
+	domain = CTX2DOM(ctx);
+	IOMMU_ASSERT_LOCKED(domain->iodom.iommu);
+	KASSERT(domain->refs > 0,
+	    ("domain %p ctx dtr refs %d", domain, domain->refs));
+	KASSERT(domain->ctx_cnt >= domain->refs,
+	    ("domain %p ctx dtr refs %d ctx_cnt %d", domain,
+	    domain->refs, domain->ctx_cnt));
+	domain->refs--;
+	domain->ctx_cnt--;
+	LIST_REMOVE(&ctx->context, link);
+}
+
+/*
+ * Search every domain of the unit for a context with the given RID.
+ * Returns the context, or NULL if the RID is unknown.  Caller must
+ * hold the unit lock.
+ */
+struct amdiommu_ctx *
+amdiommu_find_ctx_locked(struct amdiommu_unit *unit, uint16_t rid)
+{
+	struct amdiommu_domain *d;
+	struct iommu_ctx *ioctx;
+
+	AMDIOMMU_ASSERT_LOCKED(unit);
+
+	LIST_FOREACH(d, &unit->domains, link) {
+		LIST_FOREACH(ioctx, &d->iodom.contexts, link) {
+			if (ioctx->rid == rid)
+				return (IOCTX2CTX(ioctx));
+		}
+	}
+	return (NULL);
+}
+
+/*
+ * Return the domain containing a context with the given RID, or NULL
+ * when no such context exists on the unit.  Takes and releases the
+ * unit lock internally.
+ *
+ * Bug fix: the original inner "break" only terminated the inner
+ * LIST_FOREACH, so the outer loop kept iterating past a match; a RID
+ * found in any domain but the last yielded NULL (LIST_FOREACH leaves
+ * the iterator NULL on normal termination).  Use goto to exit both
+ * loops with 'domain' still pointing at the matching domain.
+ */
+struct amdiommu_domain *
+amdiommu_find_domain(struct amdiommu_unit *unit, uint16_t rid)
+{
+	struct amdiommu_domain *domain;
+	struct iommu_ctx *ctx;
+
+	AMDIOMMU_LOCK(unit);
+	LIST_FOREACH(domain, &unit->domains, link) {
+		LIST_FOREACH(ctx, &domain->iodom.contexts, link) {
+			if (ctx->rid == rid)
+				goto found;
+		}
+	}
+found:
+	AMDIOMMU_UNLOCK(unit);
+	return (domain);
+}
+
+/*
+ * Drop a reference on the context.  On the last reference the device
+ * table entry is torn down (valid bit cleared first, release fence,
+ * then the whole entry scrubbed), all related hardware caches are
+ * invalidated synchronously, interrupt remapping state is released,
+ * and the context memory is freed.  The unit lock is held on entry
+ * and released on every path (either here or by
+ * amdiommu_unref_domain_locked()).
+ */
+static void
+amdiommu_free_ctx_locked(struct amdiommu_unit *unit, struct amdiommu_ctx *ctx)
+{
+	struct amdiommu_dte *dtep;
+	struct amdiommu_domain *domain;
+
+	AMDIOMMU_ASSERT_LOCKED(unit);
+	KASSERT(ctx->context.refs >= 1,
+	    ("amdiommu %p ctx %p refs %u", unit, ctx, ctx->context.refs));
+
+	/*
+	 * If our reference is not last, only the dereference should
+	 * be performed.
+	 */
+	if (ctx->context.refs > 1) {
+		ctx->context.refs--;
+		AMDIOMMU_UNLOCK(unit);
+		return;
+	}
+
+	KASSERT((ctx->context.flags & IOMMU_CTX_DISABLED) == 0,
+	    ("lost ref on disabled ctx %p", ctx));
+
+	/*
+	 * Otherwise, the device table entry must be cleared before
+	 * the page table is destroyed.  Clear the valid bit first so
+	 * the hardware stops walking the entry, fence, then zero the
+	 * rest.
+	 */
+	dtep = amdiommu_get_dtep(ctx);
+	dtep->v = 0;
+	atomic_thread_fence_rel();
+	memset(dtep, 0, sizeof(*dtep));
+
+	domain = CTX2DOM(ctx);
+	amdiommu_qi_invalidate_ctx_locked_nowait(ctx);
+	amdiommu_qi_invalidate_ir_locked_nowait(unit, ctx->context.rid);
+	amdiommu_qi_invalidate_all_pages_locked_nowait(domain);
+	amdiommu_qi_invalidate_wait_sync(AMD2IOMMU(CTX2AMD(ctx)));
+
+	if (unit->irte_enabled)
+		amdiommu_ctx_fini_irte(ctx);
+
+	amdiommu_ctx_unlink(ctx);
+	free(ctx->context.tag, M_AMDIOMMU_CTX);
+	free(ctx, M_AMDIOMMU_CTX);
+	amdiommu_unref_domain_locked(unit, domain);
+}
+
+/*
+ * Unlocked wrapper around amdiommu_free_ctx_locked(); the callee
+ * releases the unit lock on all of its paths.
+ */
+static void
+amdiommu_free_ctx(struct amdiommu_ctx *ctx)
+{
+	struct amdiommu_unit *unit;
+
+	unit = CTX2AMD(ctx);
+	AMDIOMMU_LOCK(unit);
+	amdiommu_free_ctx_locked(unit, ctx);
+}
+
+/*
+ * Drop a reference on the domain.  The last reference unlinks it from
+ * the unit, drains any pending unload task, and destroys it.  The
+ * unit lock is held on entry and released on every path.
+ */
+static void
+amdiommu_unref_domain_locked(struct amdiommu_unit *unit,
+    struct amdiommu_domain *domain)
+{
+	AMDIOMMU_ASSERT_LOCKED(unit);
+	KASSERT(domain->refs >= 1,
+	    ("amdiommu%d domain %p refs %u", unit->iommu.unit, domain,
+	    domain->refs));
+	KASSERT(domain->refs > domain->ctx_cnt,
+	    ("amdiommu%d domain %p refs %d ctx_cnt %d", unit->iommu.unit,
+	    domain, domain->refs, domain->ctx_cnt));
+
+	if (domain->refs > 1) {
+		domain->refs--;
+		AMDIOMMU_UNLOCK(unit);
+		return;
+	}
+
+	LIST_REMOVE(domain, link);
+	AMDIOMMU_UNLOCK(unit);
+
+	/* Drain outside the lock; the task may take it. */
+	taskqueue_drain(unit->iommu.delayed_taskqueue,
+	    &domain->iodom.unload_task);
+	amdiommu_domain_destroy(domain);
+}
+
+/*
+ * Fill one device table entry for the context: translation and
+ * read/write enables, domain id, interrupt-passing hints from the
+ * IVHD 'dte'/'edte' flags, interrupt remapping root (when enabled for
+ * the unit), and the page table root (or 1:1 mode for identity-mapped
+ * domains).  The release fence orders all field stores before the
+ * valid bit is set, so the hardware never observes a half-built entry.
+ */
+static void
+dte_entry_init_one(struct amdiommu_dte *dtep, struct amdiommu_ctx *ctx,
+    vm_page_t pgtblr, uint8_t dte, uint32_t edte)
+{
+	struct amdiommu_domain *domain;
+	struct amdiommu_unit *unit;
+
+	domain = CTX2DOM(ctx);
+	unit = DOM2AMD(domain);
+
+	dtep->tv = 1;
+	/* dtep->had not used for now */
+	dtep->ir = 1;
+	dtep->iw = 1;
+	dtep->domainid = domain->domain;
+	dtep->pioctl = AMDIOMMU_DTE_PIOCTL_DIS;
+
+	/* fill device interrupt passing hints from IVHD. */
+	dtep->initpass = (dte & ACPI_IVHD_INIT_PASS) != 0;
+	dtep->eintpass = (dte & ACPI_IVHD_EINT_PASS) != 0;
+	dtep->nmipass = (dte & ACPI_IVHD_NMI_PASS) != 0;
+	dtep->sysmgt = (dte & ACPI_IVHD_SYSTEM_MGMT) >> 4;
+	dtep->lint0pass = (dte & ACPI_IVHD_LINT0_PASS) != 0;
+	dtep->lint1pass = (dte & ACPI_IVHD_LINT1_PASS) != 0;
+
+	if (unit->irte_enabled) {
+		dtep->iv = 1;
+		dtep->i = 0;
+		dtep->inttablen = ilog2(unit->irte_nentries);
+		/* Table must be 128-byte aligned; field holds addr >> 6. */
+		dtep->intrroot = pmap_kextract(unit->irte_x2apic ?
+		    (vm_offset_t)ctx->irtx2 :
+		    (vm_offset_t)ctx->irtb) >> 6;
+
+		dtep->intctl = AMDIOMMU_DTE_INTCTL_MAP;
+	}
+
+	if ((DOM2IODOM(domain)->flags & IOMMU_DOMAIN_IDMAP) != 0) {
+		dtep->pgmode = AMDIOMMU_DTE_PGMODE_1T1;
+	} else {
+		MPASS(domain->pglvl > 0 && domain->pglvl <=
+		    AMDIOMMU_PGTBL_MAXLVL);
+		dtep->pgmode = domain->pglvl;
+		dtep->ptroot = VM_PAGE_TO_PHYS(pgtblr) >> 12;
+	}
+
+	/* Make all fields visible before setting the valid bit. */
+	atomic_thread_fence_rel();
+	dtep->v = 1;
+}
+
+/*
+ * Initialize the device table entry (or entries) for the context.  A
+ * bus-wide context covers every RID on its bus, so all PCI_BUSMAX + 1
+ * consecutive entries are programmed identically; otherwise only the
+ * context's own entry is written.  The entry must currently be
+ * invalid (asserted); 'move' is not supported for bus-wide contexts.
+ */
+static void
+dte_entry_init(struct amdiommu_ctx *ctx, bool move, uint8_t dte, uint32_t edte)
+{
+	struct amdiommu_dte *dtep;
+	struct amdiommu_unit *unit;
+	struct amdiommu_domain *domain;
+	int i;
+
+	domain = CTX2DOM(ctx);
+	unit = DOM2AMD(domain);
+
+	dtep = amdiommu_get_dtep(ctx);
+	KASSERT(dtep->v == 0,
+	    ("amdiommu%d initializing valid dte @%p %#jx",
+	    CTX2AMD(ctx)->iommu.unit, dtep, (uintmax_t)(*(uint64_t *)dtep)));
+
+	if (iommu_is_buswide_ctx(AMD2IOMMU(unit),
+	    PCI_RID2BUS(ctx->context.rid))) {
+		MPASS(!move);
+		for (i = 0; i <= PCI_BUSMAX; i++) {
+			dte_entry_init_one(&dtep[i], ctx, domain->pgtblr,
+			    dte, edte);
+		}
+	} else {
+		dte_entry_init_one(dtep, ctx, domain->pgtblr, dte, edte);
+	}
+}
+
+struct amdiommu_ctx *
+amdiommu_get_ctx_for_dev(struct amdiommu_unit *unit, device_t dev, uint16_t rid,
+    int dev_domain, bool id_mapped, bool rmrr_init, uint8_t dte, uint32_t edte)
+{
+	struct amdiommu_domain *domain, *domain1;
+	struct amdiommu_ctx *ctx, *ctx1;
+	int bus, slot, func;
+
+	if (dev != NULL) {
+		bus = pci_get_bus(dev);
+		slot = pci_get_slot(dev);
+		func = pci_get_function(dev);
+	} else {
+		bus = PCI_RID2BUS(rid);
+		slot = PCI_RID2SLOT(rid);
+		func = PCI_RID2FUNC(rid);
+	}
+	AMDIOMMU_LOCK(unit);
+	KASSERT(!iommu_is_buswide_ctx(AMD2IOMMU(unit), bus) ||
+	    (slot == 0 && func == 0),
+	    ("iommu%d pci%d:%d:%d get_ctx for buswide", AMD2IOMMU(unit)->unit,
+	    bus, slot, func));
+	ctx = amdiommu_find_ctx_locked(unit, rid);
+	if (ctx == NULL) {
+		/*
+		 * Perform the allocations which require sleep or have
+		 * higher chance to succeed if the sleep is allowed.
+		 */
+		AMDIOMMU_UNLOCK(unit);
+		domain1 = amdiommu_domain_alloc(unit, id_mapped);
+		if (domain1 == NULL)
+			return (NULL);
+		if (!id_mapped) {
+			/*
+			 * XXXKIB IVMD seems to be less significant
+			 * and less used on AMD than RMRR on Intel.
+			 * Not implemented for now.
+			 */
+		}
+		ctx1 = amdiommu_ctx_alloc(domain1, rid);
*** 2674 LINES SKIPPED ***