git: ef2a572bf6bd - main - ipsec_offload: kernel infrastructure

From: Konstantin Belousov <kib_at_FreeBSD.org>
Date: Fri, 12 Jul 2024 11:25:08 UTC
The branch main has been updated by kib:

URL: https://cgit.FreeBSD.org/src/commit/?id=ef2a572bf6bdcac97ef29ce631d2f50f938e1ec8

commit ef2a572bf6bdcac97ef29ce631d2f50f938e1ec8
Author:     Konstantin Belousov <kib@FreeBSD.org>
AuthorDate: 2021-08-22 19:38:04 +0000
Commit:     Konstantin Belousov <kib@FreeBSD.org>
CommitDate: 2024-07-12 04:27:58 +0000

    ipsec_offload: kernel infrastructure
    
    Inline IPSEC offload moves almost the whole of IPSEC processing from the
    CPU/MCU and possibly crypto accelerator, to the network card.
    
    The transmitted packet content is not touched by CPU during TX
    operations, kernel only does the required policy and security
    association lookups to find out that given flow is offloaded, and then
    packet is transmitted as plain text to the card. For driver convenience,
    metadata is attached to the packet identifying the SA which must process
    the packet. Card does encryption of the payload, padding, calculates
    authentication, and does the reformat according to the policy.
    
    Similarly, on receive, card does the decapsulation, decryption, and
    authentication.  Kernel receives the identifier of SA that was
    used to process the packet, together with the plain-text packet.
    
    Overall, payload octets are only read or written by card DMA engine,
    removing a lot of memory subsystem overhead, and saving CPU time because
    IPSEC algos calculations are avoided.
    
    If driver declares support for inline IPSEC offload (with the
    IFCAP2_IPSEC_OFFLOAD capability set and registering method table struct
    if_ipsec_accel_methods), kernel offers the SPD and SAD to driver.
    Driver decides which policies and SAs can be offloaded based on
    hardware capacity, and acks/nacks each SA for given interface to
    kernel.  Kernel needs to keep this information to make a decision to
    skip software processing on TX, and to assume processing already done
    on RX.  This shadow SPD/SAD database of offloads is rooted from
    policies (struct secpolicy accel_ifps, struct ifp_handle_sp) and SAs
    (struct secasvar accel_ifps, struct ifp_handle_sav).
    
    Some extensions to the PF_KEY socket allow to limit interfaces for
    which given SP/SA could be offloaded (proposed for offload).  Also,
    additional statistics extensions allow to observe allocation/octet/use
    counters for specific SA.
    
    Since SPs and SAs are typically instantiated in non-sleepable context,
    while offloading them into card is expected to require costly async
    manipulations of the card state, calls to the driver for offload and
    termination are executed in the threaded taskqueue.  It also solves
    the issue of allocating resources needed for the offload database.
    Neither ifp_handle_sp nor ifp_handle_sav adds a reference to the
    owning SP/SA; the offload must be terminated before last reference is
    dropped.  ipsec_accel only adds transient references to ensure safe
    pointer ownership by taskqueue.
    
    Maintaining the SA counters for hardware-accelerated packets is the
    duty of the driver.  The helper ipsec_accel_drv_sa_lifetime_update()
    is provided to hide accel infrastructure from drivers which would use
    expected callout to query hardware periodically for updates.
    
    Reviewed by:    rscheff (transport, stack integration), np
    Sponsored by:   NVIDIA networking
    Differential revision:  https://reviews.freebsd.org/D44219
---
 sys/conf/files               |    2 +
 sys/conf/options             |    1 +
 sys/modules/ipsec/Makefile   |    5 +-
 sys/netipsec/ipsec.c         |   17 +
 sys/netipsec/ipsec.h         |   11 +
 sys/netipsec/ipsec_input.c   |   11 +
 sys/netipsec/ipsec_offload.c | 1061 ++++++++++++++++++++++++++++++++++++++++++
 sys/netipsec/ipsec_offload.h |  191 ++++++++
 sys/netipsec/ipsec_output.c  |   15 +
 sys/netipsec/ipsec_pcb.c     |   38 +-
 sys/netipsec/key.c           |  270 ++++++++++-
 sys/netipsec/key.h           |    6 +
 sys/netipsec/key_debug.c     |    5 +
 sys/netipsec/keydb.h         |   14 +
 14 files changed, 1628 insertions(+), 19 deletions(-)

diff --git a/sys/conf/files b/sys/conf/files
index 609ac407d400..1f99c3586b86 100644
--- a/sys/conf/files
+++ b/sys/conf/files
@@ -4464,6 +4464,8 @@ netipsec/ipsec.c		optional ipsec inet | ipsec inet6
 netipsec/ipsec_input.c		optional ipsec inet | ipsec inet6
 netipsec/ipsec_mbuf.c		optional ipsec inet | ipsec inet6
 netipsec/ipsec_mod.c		optional ipsec inet | ipsec inet6
+netipsec/ipsec_offload.c	optional ipsec ipsec_offload inet | \
+	ipsec ipsec_offload inet6
 netipsec/ipsec_output.c		optional ipsec inet | ipsec inet6
 netipsec/ipsec_pcb.c		optional ipsec inet | ipsec inet6 | \
 	ipsec_support inet | ipsec_support inet6
diff --git a/sys/conf/options b/sys/conf/options
index f50d009987bc..928927fe99df 100644
--- a/sys/conf/options
+++ b/sys/conf/options
@@ -466,6 +466,7 @@ IPFIREWALL_PMOD		opt_ipfw.h
 IPSEC			opt_ipsec.h
 IPSEC_DEBUG		opt_ipsec.h
 IPSEC_SUPPORT		opt_ipsec.h
+IPSEC_OFFLOAD		opt_ipsec.h
 IPSTEALTH
 KERN_TLS
 KRPC
diff --git a/sys/modules/ipsec/Makefile b/sys/modules/ipsec/Makefile
index 08a2e88d5794..8979508375a4 100644
--- a/sys/modules/ipsec/Makefile
+++ b/sys/modules/ipsec/Makefile
@@ -2,8 +2,9 @@
 .PATH: ${SRCTOP}/sys/net ${SRCTOP}/sys/netipsec
 
 KMOD=	ipsec
-SRCS=	if_ipsec.c ipsec.c ipsec_input.c ipsec_mbuf.c ipsec_mod.c \
-	ipsec_output.c xform_ah.c xform_esp.c xform_ipcomp.c \
+SRCS=	if_ipsec.c ipsec.c ipsec_input.c ipsec_mbuf.c \
+	ipsec_mod.c ipsec_offload.c ipsec_output.c \
+	xform_ah.c xform_esp.c xform_ipcomp.c \
 	opt_inet.h opt_inet6.h opt_ipsec.h opt_kern_tls.h opt_sctp.h
 .if "${MK_INET}" != "no" || "${MK_INET6}" != "no"
 SRCS+=	udpencap.c
diff --git a/sys/netipsec/ipsec.c b/sys/netipsec/ipsec.c
index 0ca33424bca8..e22a3872d48d 100644
--- a/sys/netipsec/ipsec.c
+++ b/sys/netipsec/ipsec.c
@@ -85,6 +85,7 @@
 #ifdef INET6
 #include <netipsec/ipsec6.h>
 #endif
+#include <netipsec/ipsec_offload.h>
 #include <netipsec/ah_var.h>
 #include <netipsec/esp_var.h>
 #include <netipsec/ipcomp.h>		/*XXX*/
@@ -636,8 +637,16 @@ int
 ipsec4_in_reject(const struct mbuf *m, struct inpcb *inp)
 {
 	struct secpolicy *sp;
+#ifdef IPSEC_OFFLOAD
+	struct ipsec_accel_in_tag *tag;
+#endif
 	int result;
 
+#ifdef IPSEC_OFFLOAD
+	tag = ipsec_accel_input_tag_lookup(m);
+	if (tag != NULL)
+		return (0);
+#endif
 	sp = ipsec4_getpolicy(m, inp, IPSEC_DIR_INBOUND, 0);
 	result = ipsec_in_reject(sp, inp, m);
 	key_freesp(&sp);
@@ -802,8 +811,16 @@ int
 ipsec6_in_reject(const struct mbuf *m, struct inpcb *inp)
 {
 	struct secpolicy *sp;
+#ifdef IPSEC_OFFLOAD
+	struct ipsec_accel_in_tag *tag;
+#endif
 	int result;
 
+#ifdef IPSEC_OFFLOAD
+	tag = ipsec_accel_input_tag_lookup(m);
+	if (tag != NULL)
+		return (0);
+#endif
 	sp = ipsec6_getpolicy(m, inp, IPSEC_DIR_INBOUND, 0);
 	result = ipsec_in_reject(sp, inp, m);
 	key_freesp(&sp);
diff --git a/sys/netipsec/ipsec.h b/sys/netipsec/ipsec.h
index 2a1dcb8bb77b..55cc0839eab9 100644
--- a/sys/netipsec/ipsec.h
+++ b/sys/netipsec/ipsec.h
@@ -71,6 +71,12 @@ struct ipsecrequest {
 	u_int level;		/* IPsec level defined below. */
 };
 
+/* Deferred SPD offload add/delete request, embedded in struct secpolicy. */
+struct ipsec_accel_adddel_sp_tq {
+	struct vnet *adddel_vnet;	/* vnet the queued task runs in */
+	struct task adddel_task;	/* task handed to the taskqueue */
+	int adddel_scheduled;		/* set to 1 once the task is queued */
+};
+
 /* Security Policy Data Base */
 struct secpolicy {
 	TAILQ_ENTRY(secpolicy) chain;
@@ -102,6 +108,11 @@ struct secpolicy {
 	time_t lastused;	/* updated every when kernel sends a packet */
 	long lifetime;		/* duration of the lifetime of this policy */
 	long validtime;		/* duration this policy is valid without use */
+	CK_LIST_HEAD(, ifp_handle_sp) accel_ifps;
+	struct ipsec_accel_adddel_sp_tq accel_add_tq;
+	struct ipsec_accel_adddel_sp_tq accel_del_tq;
+	struct inpcb *ipsec_accel_add_sp_inp;
+	const char *accel_ifname;
 };
 
 /*
diff --git a/sys/netipsec/ipsec_input.c b/sys/netipsec/ipsec_input.c
index 1150f3f470d3..dbb20748cf45 100644
--- a/sys/netipsec/ipsec_input.c
+++ b/sys/netipsec/ipsec_input.c
@@ -90,6 +90,7 @@
 #include <netipsec/esp.h>
 #include <netipsec/esp_var.h>
 #include <netipsec/ipcomp_var.h>
+#include <netipsec/ipsec_offload.h>
 
 #include <netipsec/key.h>
 #include <netipsec/keydb.h>
@@ -237,6 +238,11 @@ ipsec_common_input(struct mbuf *m, int skip, int protoff, int af, int sproto)
 int
 ipsec4_input(struct mbuf *m, int offset, int proto)
 {
+	int error;
+
+	error = ipsec_accel_input(m, offset, proto);
+	if (error != ENXIO)
+		return (error);
 
 	switch (proto) {
 	case IPPROTO_AH:
@@ -536,7 +542,12 @@ ipsec6_lasthdr(int proto)
 int
 ipsec6_input(struct mbuf *m, int offset, int proto)
 {
+	int error;
 
+	error = ipsec_accel_input(m, offset, proto);
+	if (error != ENXIO)
+		return (error);
+		
 	switch (proto) {
 	case IPPROTO_AH:
 	case IPPROTO_ESP:
diff --git a/sys/netipsec/ipsec_offload.c b/sys/netipsec/ipsec_offload.c
new file mode 100644
index 000000000000..851bacaf4ea1
--- /dev/null
+++ b/sys/netipsec/ipsec_offload.c
@@ -0,0 +1,1061 @@
+/*-
+ * Copyright (c) 2021,2022 NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "opt_inet.h"
+#include "opt_inet6.h"
+#include "opt_ipsec.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/ck.h>
+#include <sys/kernel.h>
+#include <sys/mbuf.h>
+#include <sys/pctrie.h>
+#include <sys/proc.h>
+#include <sys/socket.h>
+#include <sys/protosw.h>
+#include <sys/taskqueue.h>
+
+#include <net/if.h>
+#include <net/if_var.h>
+#include <net/vnet.h>
+#include <netinet/in.h>
+#include <netinet/ip.h>
+#include <netinet/ip_var.h>
+#include <netinet/ip6.h>
+#include <netinet6/ip6_var.h>
+#include <netinet/in_pcb.h>
+
+#include <netipsec/key.h>
+#include <netipsec/keydb.h>
+#include <netipsec/key_debug.h>
+#include <netipsec/xform.h>
+#include <netipsec/ipsec.h>
+#include <netipsec/ipsec_offload.h>
+#include <netipsec/ah_var.h>
+#include <netipsec/esp.h>
+#include <netipsec/esp_var.h>
+#include <netipsec/ipcomp_var.h>
+
+#ifdef IPSEC_OFFLOAD
+
+/*
+ * ipsec_accel_sav_tmp is taken around updates of the SA/SP handle lists
+ * and of drv_spi_pctrie in this file; it is also the mutex handed to the
+ * drv_spi_unr unit-number allocator.
+ */
+static struct mtx ipsec_accel_sav_tmp;
+static struct unrhdr *drv_spi_unr;	/* allocator of driver SPI values */
+static struct mtx ipsec_accel_cnt_lock;	/* NOTE(review): users not visible here */
+
+/* Context of a queued "offer this SA to the drivers" task. */
+struct ipsec_accel_install_newkey_tq {
+	struct secasvar *sav;		/* SA to install; a reference is held */
+	struct vnet *install_vnet;	/* vnet the task runs in */
+	struct task install_task;
+};
+
+/* Context of a queued "remove this SA from the drivers" task. */
+struct ipsec_accel_forget_tq {
+	struct vnet *forget_vnet;	/* vnet the task runs in */
+	struct task forget_task;
+	struct secasvar *sav;		/* SA to forget; a reference is held */
+};
+
+/*
+ * Per-(SA, interface) offload record.  It is linked on the owning SA's
+ * accel_ifps list, on the global ipsec_accel_all_sav_handles list, and is
+ * indexed in drv_spi_pctrie by drv_spi.
+ */
+struct ifp_handle_sav {
+	CK_LIST_ENTRY(ifp_handle_sav) sav_link;		/* on sav->accel_ifps */
+	CK_LIST_ENTRY(ifp_handle_sav) sav_allh_link;	/* on the global list */
+	struct secasvar *sav;
+	struct ifnet *ifp;
+	void *ifdata;		/* driver private pointer from if_sa_newkey */
+	uint64_t drv_spi;	/* pctrie key, allocated from drv_spi_unr */
+	uint32_t flags;		/* IFP_HS_* */
+	size_t hdr_ext_size;	/* esp_hdrsiz(sav), set for IFP_HS_OUTPUT */
+	uint64_t cnt_octets;
+	uint64_t cnt_allocs;
+};
+
+/* Values for ifp_handle_sav.flags. */
+#define	IFP_HS_HANDLED	0x00000001
+#define	IFP_HS_REJECTED	0x00000002
+#define	IFP_HS_INPUT	0x00000004
+#define	IFP_HS_OUTPUT	0x00000008
+#define	IFP_HS_MARKER	0x00000010
+
+/* All SA handles, regardless of owning SA. */
+static CK_LIST_HEAD(, ifp_handle_sav) ipsec_accel_all_sav_handles;
+
+/*
+ * Per-(SP, interface) offload record.  It is linked on the owning policy's
+ * accel_ifps list and on the global ipsec_accel_all_sp_handles list.
+ */
+struct ifp_handle_sp {
+	CK_LIST_ENTRY(ifp_handle_sp) sp_link;		/* on sp->accel_ifps */
+	CK_LIST_ENTRY(ifp_handle_sp) sp_allh_link;	/* on the global list */
+	struct secpolicy *sp;
+	struct ifnet *ifp;
+	void *ifdata;		/* driver private pointer from if_spdadd */
+	uint32_t flags;		/* IFP_HP_* */
+};
+
+/* Values for ifp_handle_sp.flags. */
+#define	IFP_HP_HANDLED	0x00000001
+#define	IFP_HP_REJECTED	0x00000002
+#define	IFP_HP_MARKER	0x00000004
+
+/* All SP handles, regardless of owning policy. */
+static CK_LIST_HEAD(, ifp_handle_sp) ipsec_accel_all_sp_handles;
+
+/*
+ * Node allocator for the driver-SPI pctrie: returns a zeroed, initialized
+ * node, or NULL when memory cannot be obtained without sleeping.
+ */
+static void *
+drvspi_sa_trie_alloc(struct pctrie *ptree)
+{
+	void *node;
+
+	node = malloc(pctrie_node_size(), M_IPSEC_MISC, M_NOWAIT | M_ZERO);
+	if (node == NULL)
+		return (NULL);
+	pctrie_zone_init(node, 0, 0);
+	return (node);
+}
+
+/* Node release routine for the driver-SPI pctrie. */
+static void
+drvspi_sa_trie_free(struct pctrie *ptree, void *node)
+{
+	free(node, M_IPSEC_MISC);
+}
+
+/*
+ * Generate the DRVSPI_SA_PCTRIE_*() accessors for a trie of
+ * struct ifp_handle_sav keyed by the drv_spi member.
+ */
+PCTRIE_DEFINE(DRVSPI_SA, ifp_handle_sav, drv_spi,
+    drvspi_sa_trie_alloc, drvspi_sa_trie_free);
+static struct pctrie drv_spi_pctrie;	/* driver SPI -> SA handle */
+
+/* Forward declarations of file-local functions. */
+static void ipsec_accel_sa_newkey_impl(struct secasvar *sav);
+static int ipsec_accel_handle_sav(struct secasvar *sav, struct ifnet *ifp,
+    u_int drv_spi, void *priv, uint32_t flags, struct ifp_handle_sav **ires);
+static void ipsec_accel_forget_sav_clear(struct secasvar *sav);
+static struct ifp_handle_sav *ipsec_accel_is_accel_sav_ptr(struct secasvar *sav,
+    struct ifnet *ifp);
+static int ipsec_accel_sa_lifetime_op_impl(struct secasvar *sav,
+    struct seclifetime *lft_c, if_t ifp, enum IF_SA_CNT_WHICH op,
+    struct rm_priotracker *sahtree_trackerp);
+static void ipsec_accel_sa_recordxfer(struct secasvar *sav, struct mbuf *m);
+static void ipsec_accel_sync_imp(void);
+static bool ipsec_accel_is_accel_sav_impl(struct secasvar *sav);
+static struct mbuf *ipsec_accel_key_setaccelif_impl(struct secasvar *sav);
+
+/*
+ * SYSINIT handler: set up the locks, the driver-SPI unit allocator (which
+ * uses ipsec_accel_sav_tmp as its mutex) and the driver-SPI pctrie, and
+ * point the ipsec_accel_*_p hooks at the implementations in this file.
+ */
+static void
+ipsec_accel_init(void *arg)
+{
+	mtx_init(&ipsec_accel_sav_tmp, "ipasat", MTX_DEF, 0);
+	mtx_init(&ipsec_accel_cnt_lock, "ipascn", MTX_DEF, 0);
+	drv_spi_unr = new_unrhdr(IPSEC_ACCEL_DRV_SPI_MIN,
+	    IPSEC_ACCEL_DRV_SPI_MAX, &ipsec_accel_sav_tmp);
+	ipsec_accel_sa_newkey_p = ipsec_accel_sa_newkey_impl;
+	ipsec_accel_forget_sav_p = ipsec_accel_forget_sav_impl;
+	ipsec_accel_spdadd_p = ipsec_accel_spdadd_impl;
+	ipsec_accel_spddel_p = ipsec_accel_spddel_impl;
+	ipsec_accel_sa_lifetime_op_p = ipsec_accel_sa_lifetime_op_impl;
+	ipsec_accel_sync_p = ipsec_accel_sync_imp;
+	ipsec_accel_is_accel_sav_p = ipsec_accel_is_accel_sav_impl;
+	ipsec_accel_key_setaccelif_p = ipsec_accel_key_setaccelif_impl;
+	pctrie_init(&drv_spi_pctrie);
+}
+SYSINIT(ipsec_accel_init, SI_SUB_VNET_DONE, SI_ORDER_ANY,
+    ipsec_accel_init, NULL);
+
+/*
+ * SYSUNINIT handler: clear the ipsec_accel_*_p hooks, call
+ * ipsec_accel_sync_imp(), then destroy the driver-SPI unit allocator and
+ * both mutexes.
+ */
+static void
+ipsec_accel_fini(void *arg)
+{
+	ipsec_accel_sa_newkey_p = NULL;
+	ipsec_accel_forget_sav_p = NULL;
+	ipsec_accel_spdadd_p = NULL;
+	ipsec_accel_spddel_p = NULL;
+	ipsec_accel_sa_lifetime_op_p = NULL;
+	ipsec_accel_sync_p = NULL;
+	ipsec_accel_is_accel_sav_p = NULL;
+	ipsec_accel_key_setaccelif_p = NULL;
+	ipsec_accel_sync_imp();
+	clean_unrhdr(drv_spi_unr);	/* avoid panic, should go later */
+	clear_unrhdr(drv_spi_unr);
+	delete_unrhdr(drv_spi_unr);
+	mtx_destroy(&ipsec_accel_sav_tmp);
+	mtx_destroy(&ipsec_accel_cnt_lock);
+}
+SYSUNINIT(ipsec_accel_fini, SI_SUB_VNET_DONE, SI_ORDER_ANY,
+    ipsec_accel_fini, NULL);
+
+/*
+ * Make sure the SA has a preallocated forget-task context.  The slot is
+ * filled with a compare-and-set, so a racing allocation that loses simply
+ * discards its buffer.  May sleep (M_WAITOK).
+ */
+static void
+ipsec_accel_alloc_forget_tq(struct secasvar *sav)
+{
+	struct ipsec_accel_forget_tq *ftq;
+
+	if (sav->accel_forget_tq == 0) {
+		ftq = malloc(sizeof(*ftq), M_TEMP, M_WAITOK);
+		if (atomic_cmpset_ptr(&sav->accel_forget_tq, 0,
+		    (uintptr_t)ftq) == 0)
+			free(ftq, M_TEMP);
+	}
+}
+
+/*
+ * if_foreach_sleep() filter: accept only interfaces that have IPSEC
+ * offload enabled and provide an if_sa_newkey method.  A capable interface
+ * without the method is reported as a driver bug and skipped.
+ */
+static bool
+ipsec_accel_sa_install_match(if_t ifp, void *arg)
+{
+	if ((ifp->if_capenable2 & IFCAP2_BIT(IFCAP2_IPSEC_OFFLOAD)) != 0) {
+		if (ifp->if_ipsec_accel_m->if_sa_newkey != NULL)
+			return (true);
+		printf("driver bug ifp %s if_sa_newkey NULL\n",
+		    if_name(ifp));
+	}
+	return (false);
+}
+
+/*
+ * if_foreach_sleep() callback: offer the SA carried in arg (a struct
+ * ipsec_accel_install_newkey_tq) to one offload-capable interface.
+ *
+ * A driver SPI is allocated and validated before it is used anywhere.
+ * The outcome is recorded with ipsec_accel_handle_sav(): IFP_HS_HANDLED
+ * when the driver accepted the SA, IFP_HS_REJECTED when the SA is bound
+ * to a different interface name or the driver answered EOPNOTSUPP.
+ *
+ * Returns 0 on success, ENOMEM when no driver SPI could be allocated,
+ * otherwise the error from the driver or from ipsec_accel_handle_sav().
+ */
+static int
+ipsec_accel_sa_newkey_cb(if_t ifp, void *arg)
+{
+	struct ipsec_accel_install_newkey_tq *tq;
+	void *priv;
+	u_int drv_spi;
+	int error;
+
+	tq = arg;
+
+	printf("ipsec_accel_sa_newkey_act: ifp %s h %p spi %#x "
+	    "flags %#x seq %d\n",
+	    if_name(ifp), ifp->if_ipsec_accel_m->if_sa_newkey,
+	    be32toh(tq->sav->spi), tq->sav->flags, tq->sav->seq);
+	priv = NULL;
+	drv_spi = alloc_unr(drv_spi_unr);
+	/*
+	 * Validate the allocation before any use.  A handle stores drv_spi
+	 * as its pctrie key and later passes it to free_unr(), so the
+	 * failure value must never reach ipsec_accel_handle_sav().
+	 */
+	if (drv_spi == -1) {
+		/* XXXKIB */
+		printf("ipsec_accel_sa_install_newkey: cannot alloc "
+		    "drv_spi if %s spi %#x\n", if_name(ifp),
+		    be32toh(tq->sav->spi));
+		return (ENOMEM);
+	}
+	if (tq->sav->accel_ifname != NULL &&
+	    strcmp(tq->sav->accel_ifname, if_name(ifp)) != 0) {
+		/* SA is restricted to another interface: remember the nack. */
+		error = ipsec_accel_handle_sav(tq->sav,
+		    ifp, drv_spi, priv, IFP_HS_REJECTED, NULL);
+		goto out;
+	}
+	error = ifp->if_ipsec_accel_m->if_sa_newkey(ifp, tq->sav,
+	    drv_spi, &priv);
+	if (error != 0) {
+		if (error == EOPNOTSUPP) {
+			printf("ipsec_accel_sa_newkey: driver "
+			    "refused sa if %s spi %#x\n",
+			    if_name(ifp), be32toh(tq->sav->spi));
+			error = ipsec_accel_handle_sav(tq->sav,
+			    ifp, drv_spi, priv, IFP_HS_REJECTED, NULL);
+			/* XXXKIB */
+		} else {
+			printf("ipsec_accel_sa_newkey: driver "
+			    "error %d if %s spi %#x\n",
+			    error, if_name(ifp), be32toh(tq->sav->spi));
+			/*
+			 * No handle was created to own drv_spi on this
+			 * path, so give the unit back instead of leaking it.
+			 */
+			free_unr(drv_spi_unr, drv_spi);
+			/* XXXKIB */
+		}
+	} else {
+		error = ipsec_accel_handle_sav(tq->sav, ifp,
+		    drv_spi, priv, IFP_HS_HANDLED, NULL);
+		if (error != 0) {
+			/* XXXKIB */
+			printf("ipsec_accel_sa_newkey: handle_sav "
+			    "err %d if %s spi %#x\n", error,
+			    if_name(ifp), be32toh(tq->sav->spi));
+		}
+	}
+out:
+	return (error);
+}
+
+/*
+ * Taskqueue handler for SA installation.  context is the struct
+ * ipsec_accel_install_newkey_tq queued by ipsec_accel_sa_newkey_impl().
+ *
+ * If the SA is mature and neither install nor deinstall was flagged yet,
+ * mark it SADB_KEY_ACCEL_INST, offer it to every matching interface with
+ * the lock dropped, and preallocate its forget-task context.  The SA
+ * reference taken at scheduling time and the task context are released
+ * on every path.
+ */
+static void
+ipsec_accel_sa_newkey_act(void *context, int pending)
+{
+	struct ipsec_accel_install_newkey_tq *tq;
+	void *tqf;
+	struct secasvar *sav;
+
+	tq = context;
+	tqf = NULL;
+	sav = tq->sav;
+	CURVNET_SET(tq->install_vnet);
+	mtx_lock(&ipsec_accel_sav_tmp);
+	if ((sav->accel_flags & (SADB_KEY_ACCEL_INST |
+	    SADB_KEY_ACCEL_DEINST)) == 0 &&
+	    sav->state == SADB_SASTATE_MATURE) {
+		sav->accel_flags |= SADB_KEY_ACCEL_INST;
+		/* The per-interface callbacks and the allocation may sleep. */
+		mtx_unlock(&ipsec_accel_sav_tmp);
+		if_foreach_sleep(ipsec_accel_sa_install_match, context,
+		    ipsec_accel_sa_newkey_cb, context);
+		ipsec_accel_alloc_forget_tq(sav);
+		mtx_lock(&ipsec_accel_sav_tmp);
+
+		/*
+		 * If ipsec_accel_forget_sav() raced with us and set
+		 * the flag, do its work.  Its task cannot execute in
+		 * parallel since taskqueue_thread is single-threaded.
+		 */
+		if ((sav->accel_flags & SADB_KEY_ACCEL_DEINST) != 0) {
+			/* Take over the forget context; freed below. */
+			tqf = (void *)sav->accel_forget_tq;
+			sav->accel_forget_tq = 0;
+			ipsec_accel_forget_sav_clear(sav);
+		}
+	}
+	mtx_unlock(&ipsec_accel_sav_tmp);
+	key_freesav(&tq->sav);
+	CURVNET_RESTORE();
+	free(tq, M_TEMP);
+	free(tqf, M_TEMP);
+}
+
+/*
+ * Schedule offload installation of an SA.  Does nothing when the SA is
+ * already flagged for install or deinstall.  The task context is
+ * allocated with M_NOWAIT; on allocation failure the request is dropped
+ * after logging.  An SA reference is taken for the queued task.
+ */
+static void
+ipsec_accel_sa_newkey_impl(struct secasvar *sav)
+{
+	struct ipsec_accel_install_newkey_tq *tq;
+
+	if ((sav->accel_flags & (SADB_KEY_ACCEL_INST |
+	    SADB_KEY_ACCEL_DEINST)) != 0)
+		return;
+
+	printf(
+	    "ipsec_accel_sa_install_newkey: spi %#x flags %#x seq %d\n",
+	    be32toh(sav->spi), sav->flags, sav->seq);
+
+	tq = malloc(sizeof(*tq), M_TEMP, M_NOWAIT);
+	if (tq == NULL) {
+		printf("ipsec_accel_sa_install_newkey: no memory for tq, "
+		    "spi %#x\n", be32toh(sav->spi));
+		/* XXXKIB */
+		return;
+	}
+
+	/* Dropped by ipsec_accel_sa_newkey_act() via key_freesav(). */
+	refcount_acquire(&sav->refcnt);
+
+	TASK_INIT(&tq->install_task, 0, ipsec_accel_sa_newkey_act, tq);
+	tq->sav = sav;
+	tq->install_vnet = curthread->td_vnet;	/* XXXKIB liveness */
+	taskqueue_enqueue(taskqueue_thread, &tq->install_task);
+}
+
+/*
+ * Record the offload decision for (sav, ifp).
+ *
+ * flags must contain exactly one of IFP_HS_HANDLED and IFP_HS_REJECTED.
+ * A new handle is allocated (may sleep), indexed in the driver-SPI pctrie
+ * and linked on both the SA's and the global handle lists; an interface
+ * reference is taken for it.
+ *
+ * Returns 0 and stores the handle in *ires (when ires is not NULL).
+ * Returns EALREADY if the SA already has a handle for ifp, or the pctrie
+ * insertion error; in these cases the handle is freed and *ires is set
+ * to NULL.
+ */
+static int
+ipsec_accel_handle_sav(struct secasvar *sav, struct ifnet *ifp,
+    u_int drv_spi, void *priv, uint32_t flags, struct ifp_handle_sav **ires)
+{
+	struct ifp_handle_sav *ihs, *i;
+	int error;
+
+	MPASS(__bitcount(flags & (IFP_HS_HANDLED | IFP_HS_REJECTED)) == 1);
+
+	ihs = malloc(sizeof(*ihs), M_IPSEC_MISC, M_WAITOK | M_ZERO);
+	ihs->ifp = ifp;
+	ihs->sav = sav;
+	ihs->drv_spi = drv_spi;
+	ihs->ifdata = priv;
+	ihs->flags = flags;
+	if ((flags & IFP_HS_OUTPUT) != 0)
+		ihs->hdr_ext_size = esp_hdrsiz(sav);
+	mtx_lock(&ipsec_accel_sav_tmp);
+	/* At most one handle per (SA, interface) pair. */
+	CK_LIST_FOREACH(i, &sav->accel_ifps, sav_link) {
+		if (i->ifp == ifp) {
+			error = EALREADY;
+			goto errout;
+		}
+	}
+	error = DRVSPI_SA_PCTRIE_INSERT(&drv_spi_pctrie, ihs);
+	if (error != 0)
+		goto errout;
+	if_ref(ihs->ifp);
+	CK_LIST_INSERT_HEAD(&sav->accel_ifps, ihs, sav_link);
+	CK_LIST_INSERT_HEAD(&ipsec_accel_all_sav_handles, ihs, sav_allh_link);
+	mtx_unlock(&ipsec_accel_sav_tmp);
+	if (ires != NULL)
+		*ires = ihs;
+	return (0);
+errout:
+	mtx_unlock(&ipsec_accel_sav_tmp);
+	free(ihs, M_IPSEC_MISC);
+	if (ires != NULL)
+		*ires = NULL;
+	return (error);
+}
+
+/*
+ * Tear down one SA handle.
+ *
+ * Must be called with ipsec_accel_sav_tmp held.  The lock is dropped
+ * after the handle is unlinked and is reacquired before return, so the
+ * caller must not assume the handle lists are unchanged afterwards.
+ *
+ * With the lock dropped, the function waits for the network epoch, calls
+ * the driver's if_sa_deinstall only for handles that are HANDLED and not
+ * REJECTED, releases the interface reference and the driver SPI, and
+ * frees the handle.  When freesav is true, one SA reference is dropped
+ * as well.
+ */
+static void
+ipsec_accel_forget_handle_sav(struct ifp_handle_sav *i, bool freesav)
+{
+	struct ifnet *ifp;
+	struct secasvar *sav;
+
+	mtx_assert(&ipsec_accel_sav_tmp, MA_OWNED);
+
+	CK_LIST_REMOVE(i, sav_link);
+	CK_LIST_REMOVE(i, sav_allh_link);
+	DRVSPI_SA_PCTRIE_REMOVE(&drv_spi_pctrie, i->drv_spi);
+	mtx_unlock(&ipsec_accel_sav_tmp);
+	NET_EPOCH_WAIT();
+	ifp = i->ifp;
+	sav = i->sav;
+	if ((i->flags & (IFP_HS_HANDLED | IFP_HS_REJECTED)) ==
+	    IFP_HS_HANDLED) {
+		printf("sa deinstall %s %p spi %#x ifl %#x\n",
+		    if_name(ifp), sav, be32toh(sav->spi), i->flags);
+		ifp->if_ipsec_accel_m->if_sa_deinstall(ifp,
+		    i->drv_spi, i->ifdata);
+	}
+	if_rele(ifp);
+	free_unr(drv_spi_unr, i->drv_spi);
+	free(i, M_IPSEC_MISC);
+	if (freesav)
+		key_freesav(&sav);
+	mtx_lock(&ipsec_accel_sav_tmp);
+}
+
+/*
+ * Tear down every offload handle hanging off the SA.  The list head is
+ * re-read on each iteration because ipsec_accel_forget_handle_sav()
+ * drops and retakes the lock.
+ */
+static void
+ipsec_accel_forget_sav_clear(struct secasvar *sav)
+{
+	struct ifp_handle_sav *ihs;
+
+	while ((ihs = CK_LIST_FIRST(&sav->accel_ifps)) != NULL)
+		ipsec_accel_forget_handle_sav(ihs, false);
+}
+
+/*
+ * Taskqueue handler for SA removal.  arg is the struct
+ * ipsec_accel_forget_tq queued by ipsec_accel_forget_sav_impl().  Tears
+ * down all handles of the SA, then drops the SA reference taken at
+ * scheduling time and frees the task context.
+ */
+static void
+ipsec_accel_forget_sav_act(void *arg, int pending)
+{
+	struct ipsec_accel_forget_tq *tq;
+	struct secasvar *sav;
+
+	tq = arg;
+	sav = tq->sav;
+	CURVNET_SET(tq->forget_vnet);
+	mtx_lock(&ipsec_accel_sav_tmp);
+	ipsec_accel_forget_sav_clear(sav);
+	mtx_unlock(&ipsec_accel_sav_tmp);
+	key_freesav(&sav);
+	CURVNET_RESTORE();
+	free(tq, M_TEMP);
+}
+
+/*
+ * Schedule removal of an SA from all interfaces.
+ *
+ * Always marks the SA with SADB_KEY_ACCEL_DEINST.  The preallocated
+ * forget-task context is claimed with a compare-and-set; if there is none
+ * (or the claim is lost) nothing is queued here.  Otherwise an SA
+ * reference is taken and the forget task is enqueued.
+ */
+void
+ipsec_accel_forget_sav_impl(struct secasvar *sav)
+{
+	struct ipsec_accel_forget_tq *tq;
+
+	mtx_lock(&ipsec_accel_sav_tmp);
+	sav->accel_flags |= SADB_KEY_ACCEL_DEINST;
+	tq = (void *)atomic_load_ptr(&sav->accel_forget_tq);
+	if (tq == NULL || !atomic_cmpset_ptr(&sav->accel_forget_tq,
+	    (uintptr_t)tq, 0)) {
+		mtx_unlock(&ipsec_accel_sav_tmp);
+		return;
+	}
+	mtx_unlock(&ipsec_accel_sav_tmp);
+
+	/* Dropped by ipsec_accel_forget_sav_act() via key_freesav(). */
+	refcount_acquire(&sav->refcnt);
+	TASK_INIT(&tq->forget_task, 0, ipsec_accel_forget_sav_act, tq);
+	tq->forget_vnet = curthread->td_vnet;
+	tq->sav = sav;
+	taskqueue_enqueue(taskqueue_thread, &tq->forget_task);
+}
+
+/*
+ * Tear down every SA handle that belongs to ifp.
+ *
+ * The global handle list is walked with a private marker element that is
+ * moved past each visited entry.  This keeps the walk position valid even
+ * though ipsec_accel_forget_handle_sav() drops and retakes the lock.  The
+ * marker's ifp is NULL (zeroed allocation), so it is never torn down as
+ * a real handle by a walk for an actual interface.
+ */
+static void
+ipsec_accel_on_ifdown_sav(struct ifnet *ifp)
+{
+	struct ifp_handle_sav *i, *marker;
+
+	marker = malloc(sizeof(*marker), M_IPSEC_MISC, M_WAITOK | M_ZERO);
+	marker->flags = IFP_HS_MARKER;
+
+	mtx_lock(&ipsec_accel_sav_tmp);
+	CK_LIST_INSERT_HEAD(&ipsec_accel_all_sav_handles, marker,
+	    sav_allh_link);
+	for (;;) {
+		i = CK_LIST_NEXT(marker, sav_allh_link);
+		if (i == NULL)
+			break;
+		/* Advance the marker past i before possibly dropping the lock. */
+		CK_LIST_REMOVE(marker, sav_allh_link);
+		CK_LIST_INSERT_AFTER(i, marker, sav_allh_link);
+		if (i->ifp == ifp) {
+			refcount_acquire(&i->sav->refcnt); /* XXXKIB wrap ? */
+			ipsec_accel_forget_handle_sav(i, true);
+		}
+	}
+	CK_LIST_REMOVE(marker, sav_allh_link);
+	mtx_unlock(&ipsec_accel_sav_tmp);
+	free(marker, M_IPSEC_MISC);
+}
+
+/*
+ * Find the SA's handle for ifp.  Returns NULL when the interface does not
+ * have IPSEC offload enabled or the SA has no handle for it.
+ */
+static struct ifp_handle_sav *
+ipsec_accel_is_accel_sav_ptr_raw(struct secasvar *sav, struct ifnet *ifp)
+{
+	struct ifp_handle_sav *ihs;
+
+	if ((ifp->if_capenable2 & IFCAP2_BIT(IFCAP2_IPSEC_OFFLOAD)) == 0)
+		return (NULL);
+	/* ihs is NULL after the loop when no entry matched. */
+	CK_LIST_FOREACH(ihs, &sav->accel_ifps, sav_link) {
+		if (ihs->ifp == ifp)
+			break;
+	}
+	return (ihs);
+}
+
+/*
+ * Same lookup as ipsec_accel_is_accel_sav_ptr_raw(), but asserts that the
+ * caller is inside the network epoch.
+ */
+static struct ifp_handle_sav *
+ipsec_accel_is_accel_sav_ptr(struct secasvar *sav, struct ifnet *ifp)
+{
+	NET_EPOCH_ASSERT();
+	return (ipsec_accel_is_accel_sav_ptr_raw(sav, ifp));
+}
+
+/*
+ * Report whether the SA has at least one offload handle.  Handles are
+ * created for rejected offers as well, so this does not by itself mean
+ * that some interface accepted the SA.
+ */
+static bool
+ipsec_accel_is_accel_sav_impl(struct secasvar *sav)
+{
+	return (!CK_LIST_EMPTY(&sav->accel_ifps));
+}
+
+/*
+ * Translate a driver SPI into the SA recorded for it in the pctrie, or
+ * NULL when the SPI is unknown.
+ */
+static struct secasvar *
+ipsec_accel_drvspi_to_sa(u_int drv_spi)
+{
+	struct ifp_handle_sav *ihs;
+
+	ihs = DRVSPI_SA_PCTRIE_LOOKUP(&drv_spi_pctrie, drv_spi);
+	return (ihs != NULL ? ihs->sav : NULL);
+}
+
+/* Find the policy's handle for ifp, or NULL when there is none. */
+static struct ifp_handle_sp *
+ipsec_accel_find_accel_sp(struct secpolicy *sp, if_t ifp)
+{
+	struct ifp_handle_sp *hsp;
+
+	/* hsp is NULL after the loop when no entry matched. */
+	CK_LIST_FOREACH(hsp, &sp->accel_ifps, sp_link) {
+		if (hsp->ifp == ifp)
+			break;
+	}
+	return (hsp);
+}
+
+/* Report whether the policy already has a handle for ifp. */
+static bool
+ipsec_accel_is_accel_sp(struct secpolicy *sp, if_t ifp)
+{
+	struct ifp_handle_sp *hsp;
+
+	hsp = ipsec_accel_find_accel_sp(sp, ifp);
+	return (hsp != NULL);
+}
+
+/*
+ * Create a handle for (sp, ifp), marked IFP_HP_HANDLED, and link it on
+ * the policy's and the global SP handle lists.  An interface reference is
+ * taken for the handle.  May sleep.  Always returns 0 and stores the new
+ * handle in *ip.
+ */
+static int
+ipsec_accel_remember_sp(struct secpolicy *sp, if_t ifp,
+    struct ifp_handle_sp **ip)
+{
+	struct ifp_handle_sp *hsp;
+
+	hsp = malloc(sizeof(*hsp), M_IPSEC_MISC, M_WAITOK | M_ZERO);
+	if_ref(ifp);
+	hsp->ifp = ifp;
+	hsp->sp = sp;
+	hsp->flags = IFP_HP_HANDLED;
+
+	mtx_lock(&ipsec_accel_sav_tmp);
+	CK_LIST_INSERT_HEAD(&sp->accel_ifps, hsp, sp_link);
+	CK_LIST_INSERT_HEAD(&ipsec_accel_all_sp_handles, hsp, sp_allh_link);
+	mtx_unlock(&ipsec_accel_sav_tmp);
+
+	*ip = hsp;
+	return (0);
+}
+
+/*
+ * if_foreach_sleep() filter for policy installation.  An interface
+ * qualifies when it has IPSEC offload enabled, provides if_spdadd,
+ * matches the policy's interface-name restriction (if any), and does not
+ * already have a handle for the policy.
+ */
+static bool
+ipsec_accel_spdadd_match(if_t ifp, void *arg)
+{
+	struct secpolicy *sp = arg;
+
+	if ((ifp->if_capenable2 & IFCAP2_BIT(IFCAP2_IPSEC_OFFLOAD)) == 0)
+		return (false);
+	if (ifp->if_ipsec_accel_m->if_spdadd == NULL)
+		return (false);
+	if (sp->accel_ifname != NULL &&
+	    strcmp(sp->accel_ifname, if_name(ifp)) != 0)
+		return (false);
+	return (!ipsec_accel_is_accel_sp(sp, ifp));
+}
+
+/*
+ * if_foreach_sleep() callback: offer the policy in arg to one interface.
+ * The handle is created before the driver is called so that the driver
+ * can store its private pointer directly into i->ifdata.  If the driver
+ * fails, the handle is kept and additionally flagged IFP_HP_REJECTED.
+ * Returns the error from ipsec_accel_remember_sp() or from the driver.
+ */
+static int
+ipsec_accel_spdadd_cb(if_t ifp, void *arg)
+{
+	struct secpolicy *sp;
+	struct inpcb *inp;
+	struct ifp_handle_sp *i;
+	int error;
+
+	sp = arg;
+	inp = sp->ipsec_accel_add_sp_inp;
+	printf("ipsec_accel_spdadd_cb: ifp %s m %p sp %p inp %p\n",
+	    if_name(ifp), ifp->if_ipsec_accel_m->if_spdadd, sp, inp);
+	error = ipsec_accel_remember_sp(sp, ifp, &i);
+	if (error != 0) {
+		printf("ipsec_accel_spdadd: %s if_spdadd %p remember res %d\n",
+		    if_name(ifp), sp, error);
+		return (error);
+	}
+	error = ifp->if_ipsec_accel_m->if_spdadd(ifp, sp, inp, &i->ifdata);
+	if (error != 0) {
+		i->flags |= IFP_HP_REJECTED;
+		printf("ipsec_accel_spdadd: %s if_spdadd %p res %d\n",
+		    if_name(ifp), sp, error);
+	}
+	return (error);
+}
+
+/*
+ * Taskqueue handler for policy installation.  arg is the policy queued by
+ * ipsec_accel_spdadd_impl().  Offers the policy to all matching
+ * interfaces, then releases the inpcb reference (if one was taken) and
+ * the policy reference taken at scheduling time.
+ */
+static void
+ipsec_accel_spdadd_act(void *arg, int pending)
+{
+	struct secpolicy *sp;
+	struct inpcb *inp;
+
+	sp = arg;
+	CURVNET_SET(sp->accel_add_tq.adddel_vnet);
+	if_foreach_sleep(ipsec_accel_spdadd_match, arg,
+	    ipsec_accel_spdadd_cb, arg);
+	inp = sp->ipsec_accel_add_sp_inp;
+	if (inp != NULL) {
+		INP_WLOCK(inp);
+		/* Unlock only when the release call reports false. */
+		if (!in_pcbrele_wlocked(inp))
+			INP_WUNLOCK(inp);
+		sp->ipsec_accel_add_sp_inp = NULL;
+	}
+	CURVNET_RESTORE();
+	key_freesp(&sp);
+}
+
+/*
+ * Schedule offload installation of a policy.
+ *
+ * Nothing is done when sp is NULL, when the policy has no transforms
+ * (tcount == 0) and no inpcb was supplied, or when the add task has been
+ * scheduled before (adddel_scheduled is flipped 0 -> 1 atomically and is
+ * not reset in this function).  References on the inpcb (if any) and on
+ * the policy are taken for the queued task.
+ */
+void
+ipsec_accel_spdadd_impl(struct secpolicy *sp, struct inpcb *inp)
+{
+	struct ipsec_accel_adddel_sp_tq *tq;
+
+	if (sp == NULL)
+		return;
+	if (sp->tcount == 0 && inp == NULL)
+		return;
+	tq = &sp->accel_add_tq;
+	if (atomic_cmpset_int(&tq->adddel_scheduled, 0, 1) == 0)
+		return;
+	tq->adddel_vnet = curthread->td_vnet;
+	sp->ipsec_accel_add_sp_inp = inp;
+	if (inp != NULL)
+		in_pcbref(inp);
+	TASK_INIT(&tq->adddel_task, 0, ipsec_accel_spdadd_act, sp);
+	key_addref(sp);
+	taskqueue_enqueue(taskqueue_thread, &tq->adddel_task);
+}
+
+/*
+ * Taskqueue handler for policy removal.  arg is the policy queued by
+ * ipsec_accel_spddel_impl().
+ *
+ * Each handle of the policy is unlinked under the lock; then, with the
+ * lock dropped, the function waits for the network epoch, calls the
+ * driver's if_spddel for handles that are HANDLED and not REJECTED,
+ * releases the interface reference and frees the handle.  Finally the
+ * policy reference taken at scheduling time is dropped.
+ */
+static void
+ipsec_accel_spddel_act(void *arg, int pending)
+{
+	struct ifp_handle_sp *i;
+	struct secpolicy *sp;
+	int error;
+
+	sp = arg;
+	CURVNET_SET(sp->accel_del_tq.adddel_vnet);
+	mtx_lock(&ipsec_accel_sav_tmp);
+	for (;;) {
+		/* Re-read the head: the lock is dropped inside the loop. */
+		i = CK_LIST_FIRST(&sp->accel_ifps);
+		if (i == NULL)
+			break;
+		CK_LIST_REMOVE(i, sp_link);
+		CK_LIST_REMOVE(i, sp_allh_link);
+		mtx_unlock(&ipsec_accel_sav_tmp);
+		NET_EPOCH_WAIT();
+		if ((i->flags & (IFP_HP_HANDLED | IFP_HP_REJECTED)) ==
+		    IFP_HP_HANDLED) {
+			printf("spd deinstall %s %p\n", if_name(i->ifp), sp);
+			error = i->ifp->if_ipsec_accel_m->if_spddel(i->ifp,
+			    sp, i->ifdata);
+			if (error != 0) {
+				printf(
+		    "ipsec_accel_spddel: %s if_spddel %p res %d\n",
+				    if_name(i->ifp), sp, error);
+			}
+		}
+		if_rele(i->ifp);
+		free(i, M_IPSEC_MISC);
+		mtx_lock(&ipsec_accel_sav_tmp);
+	}
+	mtx_unlock(&ipsec_accel_sav_tmp);
+	key_freesp(&sp);
+	CURVNET_RESTORE();
+}
+
+/*
+ * Schedule removal of a policy from all interfaces.  Nothing is done when
+ * sp is NULL or when the delete task has been scheduled before
+ * (adddel_scheduled is flipped 0 -> 1 atomically).  A policy reference is
+ * taken for the queued task.
+ */
+void
+ipsec_accel_spddel_impl(struct secpolicy *sp)
+{
+	struct ipsec_accel_adddel_sp_tq *tq;
+
+	if (sp == NULL)
+		return;
+
+	tq = &sp->accel_del_tq;
+	if (atomic_cmpset_int(&tq->adddel_scheduled, 0, 1) == 0)
+		return;
+	tq->adddel_vnet = curthread->td_vnet;
+	TASK_INIT(&tq->adddel_task, 0, ipsec_accel_spddel_act, sp);
+	key_addref(sp);
+	taskqueue_enqueue(taskqueue_thread, &tq->adddel_task);
+}
+
+static void
+ipsec_accel_on_ifdown_sp(struct ifnet *ifp)
+{
+	struct ifp_handle_sp *i, *marker;
+	struct secpolicy *sp;
+	int error;
+
+	marker = malloc(sizeof(*marker), M_IPSEC_MISC, M_WAITOK | M_ZERO);
+	marker->flags = IFP_HS_MARKER;
+
+	mtx_lock(&ipsec_accel_sav_tmp);
+	CK_LIST_INSERT_HEAD(&ipsec_accel_all_sp_handles, marker,
+	    sp_allh_link);
+	for (;;) {
+		i = CK_LIST_NEXT(marker, sp_allh_link);
+		if (i == NULL)
+			break;
+		CK_LIST_REMOVE(marker, sp_allh_link);
+		CK_LIST_INSERT_AFTER(i, marker, sp_allh_link);
+		if (i->ifp != ifp)
+			continue;
+
+		sp = i->sp;
+		key_addref(sp);
+		CK_LIST_REMOVE(i, sp_link);
+		CK_LIST_REMOVE(i, sp_allh_link);
+		mtx_unlock(&ipsec_accel_sav_tmp);
+		NET_EPOCH_WAIT();
+		if ((i->flags & (IFP_HP_HANDLED | IFP_HP_REJECTED)) ==
+		    IFP_HP_HANDLED) {
+			printf("spd deinstall %s %p\n", if_name(ifp), sp);
+			error = ifp->if_ipsec_accel_m->if_spddel(ifp,
+			    sp, i->ifdata);
+		}
*** 1421 LINES SKIPPED ***