git: ebf4e2da2c59 - stable/13 - cxgbe(4): Overhaul CLIP (Compressed Local IPv6) table management.

From: Navdeep Parhar <np_at_FreeBSD.org>
Date: Wed, 20 Oct 2021 17:27:04 UTC
The branch stable/13 has been updated by np:

URL: https://cgit.FreeBSD.org/src/commit/?id=ebf4e2da2c598e41656a187c4bf5cd547c6c4a65

commit ebf4e2da2c598e41656a187c4bf5cd547c6c4a65
Author:     Navdeep Parhar <np@FreeBSD.org>
AuthorDate: 2021-05-23 21:58:29 +0000
Commit:     Navdeep Parhar <np@FreeBSD.org>
CommitDate: 2021-10-20 17:05:02 +0000

    cxgbe(4): Overhaul CLIP (Compressed Local IPv6) table management.
    
    - Process the list of local IPs once instead of once per adapter.  Add
      addresses from all VNETs to the driver's list but leave hardware
      updates for later when the global VNET/IFADDR list locks have been
      released.
    
    - Add address to the hardware table synchronously when a CLIP entry is
      requested for an address that's not already in there.
    
    - Provide ioctls that allow userspace tools to manage addresses in the
      CLIP table.
    
    - Add a knob (hw.cxgbe.clip_db_auto) that controls whether local IPs are
      automatically added to the CLIP table or not.
    
    Sponsored by:   Chelsio Communications
    
    (cherry picked from commit 24b98f288d11750f2cdfbfe360be1c92a9c2ee1d)
---
 sys/dev/cxgbe/adapter.h            |  16 +-
 sys/dev/cxgbe/crypto/t4_kern_tls.c |   4 +-
 sys/dev/cxgbe/t4_clip.c            | 842 ++++++++++++++++++++++++++++---------
 sys/dev/cxgbe/t4_clip.h            |  15 +-
 sys/dev/cxgbe/t4_ioctl.h           |  10 +
 sys/dev/cxgbe/t4_main.c            |  37 ++
 sys/dev/cxgbe/tom/t4_connect.c     |   4 +-
 sys/dev/cxgbe/tom/t4_listen.c      |  18 +-
 sys/dev/cxgbe/tom/t4_tom.c         |   2 +-
 9 files changed, 735 insertions(+), 213 deletions(-)

diff --git a/sys/dev/cxgbe/adapter.h b/sys/dev/cxgbe/adapter.h
index 2f4619b1180f..630e9c4ac1b9 100644
--- a/sys/dev/cxgbe/adapter.h
+++ b/sys/dev/cxgbe/adapter.h
@@ -50,6 +50,7 @@
 #include <machine/bus.h>
 #include <sys/socket.h>
 #include <sys/sysctl.h>
+#include <sys/taskqueue.h>
 #include <net/ethernet.h>
 #include <net/if.h>
 #include <net/if_var.h>
@@ -68,6 +69,15 @@ MALLOC_DECLARE(M_CXGBE);
 #define CXGBE_UNIMPLEMENTED(s) \
     panic("%s (%s, line %d) not implemented yet.", s, __FILE__, __LINE__)
 
+/*
+ * Same as LIST_HEAD from queue.h.  This is to avoid conflict with LinuxKPI's
+ * LIST_HEAD when building iw_cxgbe.
+ */
+#define	CXGBE_LIST_HEAD(name, type)					\
+struct name {								\
+	struct type *lh_first;	/* first element */			\
+}
+
 #ifndef SYSCTL_ADD_UQUAD
 #define SYSCTL_ADD_UQUAD SYSCTL_ADD_QUAD
 #define sysctl_handle_64 sysctl_handle_quad
@@ -881,9 +891,11 @@ struct adapter {
 	struct port_info *port[MAX_NPORTS];
 	uint8_t chan_map[MAX_NCHAN];		/* channel -> port */
 
-	struct mtx clip_table_lock;
-	TAILQ_HEAD(, clip_entry) clip_table;
+	CXGBE_LIST_HEAD(, clip_entry) *clip_table;
+	TAILQ_HEAD(, clip_entry) clip_pending;	/* these need hw update. */
+	u_long clip_mask;
 	int clip_gen;
+	struct timeout_task clip_task;
 
 	void *tom_softc;	/* (struct tom_data *) */
 	struct tom_tunables tt;
diff --git a/sys/dev/cxgbe/crypto/t4_kern_tls.c b/sys/dev/cxgbe/crypto/t4_kern_tls.c
index 957d0202fa3f..99d0d33cf128 100644
--- a/sys/dev/cxgbe/crypto/t4_kern_tls.c
+++ b/sys/dev/cxgbe/crypto/t4_kern_tls.c
@@ -379,7 +379,7 @@ send_ktls_act_open_req(struct adapter *sc, struct vi_info *vi,
 
 	isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
 	if (isipv6) {
-		tlsp->ce = t4_hold_lip(sc, &inp->in6p_laddr, NULL);
+		tlsp->ce = t4_get_clip_entry(sc, &inp->in6p_laddr, true);
 		if (tlsp->ce == NULL)
 			return (ENOENT);
 	}
@@ -2333,7 +2333,7 @@ cxgbe_tls_tag_free(struct m_snd_tag *mst)
 	if (tlsp->tid >= 0)
 		release_tid(sc, tlsp->tid, tlsp->ctrlq);
 	if (tlsp->ce)
-		t4_release_lip(sc, tlsp->ce);
+		t4_release_clip_entry(sc, tlsp->ce);
 	if (tlsp->tx_key_addr >= 0)
 		free_keyid(tlsp, tlsp->tx_key_addr);
 
diff --git a/sys/dev/cxgbe/t4_clip.c b/sys/dev/cxgbe/t4_clip.c
index ad26d212315e..18d78a9e830b 100644
--- a/sys/dev/cxgbe/t4_clip.c
+++ b/sys/dev/cxgbe/t4_clip.c
@@ -1,7 +1,7 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
- * Copyright (c) 2012 Chelsio Communications, Inc.
+ * Copyright (c) 2012-2021 Chelsio Communications, Inc.
  * All rights reserved.
  * Written by: Navdeep Parhar <np@FreeBSD.org>
  *
@@ -50,112 +50,345 @@ __FBSDID("$FreeBSD$");
 #include "common/common.h"
 #include "t4_clip.h"
 
+/*
+ * Code to deal with the Compressed Local IPv6 (CLIP) table in the ASIC.
+ *
+ * The driver maintains a global CLIP database (clip_db) of IPv6 addresses and a
+ * per-adapter CLIP table (sc->clip_table) with entries that point to an IPv6 in
+ * the clip_db.  All access is protected by a single global lock (clip_db_lock).
+ * The correct lock order is clip lock before synchronized op.
+ *
+ * By default (hw.cxgbe.clip_db_auto=1) all local IPv6 addresses are added to
+ * the db.  Addresses are also added on-demand when the driver allocates an
+ * entry for a filter, TOE tid, etc.  krn_ref counts the number of times an
+ * address appears in the system.  adp_ref counts the number of adapters that
+ * have that address in their CLIP table.  If both are 0 then the entry is
+ * evicted from the db.  Consumers of the CLIP table entry (filters, TOE tids)
+ * are tracked in ce->refcount.  Driver ioctls let external consumers add/remove
+ * addresses from the CLIP table.
+ */
+
 #if defined(INET6)
-static int add_lip(struct adapter *, struct in6_addr *);
-static int delete_lip(struct adapter *, struct in6_addr *);
-static struct clip_entry *search_lip(struct adapter *, struct in6_addr *);
-static void update_clip(struct adapter *, void *);
-static void t4_clip_task(void *, int);
-static void update_clip_table(struct adapter *);
+struct clip_db_entry {
+	LIST_ENTRY(clip_db_entry) link;	/* clip_db hash linkage */
+	struct in6_addr lip;
+	u_int krn_ref;	/* # of times this IP6 appears in list of all IP6 */
+	u_int adp_ref;	/* # of adapters with this IP6 in their CLIP */
+	u_int tmp_ref;	/* Used only during refresh */
+};
+
+struct clip_entry {
+	LIST_ENTRY(clip_entry) link;	/* clip_table hash linkage */
+	TAILQ_ENTRY(clip_entry) plink;	/* clip_pending linkage */
+	struct clip_db_entry *cde;
+	int16_t clip_idx;		/* index in the hw table */
+	bool pending;			/* in clip_pending list */
+	int refcount;
+};
 
-static int in6_ifaddr_gen;
 static eventhandler_tag ifaddr_evhandler;
-static struct timeout_task clip_task;
+static struct mtx clip_db_lock;
+static LIST_HEAD(, clip_db_entry) *clip_db;
+static u_long clip_db_mask;
+static int clip_db_gen;
+static struct task clip_db_task;
+
+static int add_lip(struct adapter *, struct in6_addr *, int16_t *);
+static int del_lip(struct adapter *, struct in6_addr *);
+static void t4_clip_db_task(void *, int);
+static void t4_clip_task(void *, int);
+static void update_clip_db(void);
+static int update_sw_clip_table(struct adapter *);
+static int update_hw_clip_table(struct adapter *);
+static void update_clip_table(struct adapter *, void *);
+static int sysctl_clip_db(SYSCTL_HANDLER_ARGS);
+static int sysctl_clip_db_auto(SYSCTL_HANDLER_ARGS);
+static struct clip_db_entry *lookup_clip_db_entry(struct in6_addr *, bool);
+static struct clip_entry *lookup_clip_entry(struct adapter *, struct in6_addr *,
+    bool);
+
+SYSCTL_PROC(_hw_cxgbe, OID_AUTO, clip_db, CTLTYPE_STRING | CTLFLAG_RD |
+    CTLFLAG_SKIP | CTLFLAG_MPSAFE, NULL, 0, sysctl_clip_db, "A",
+    "CLIP database");
+
+int t4_clip_db_auto = 1;
+SYSCTL_PROC(_hw_cxgbe, OID_AUTO, clip_db_auto, CTLTYPE_INT | CTLFLAG_RWTUN |
+    CTLFLAG_MPSAFE, NULL, 0, sysctl_clip_db_auto, "I",
+    "Add local IPs to CLIP db automatically (0 = no, 1 = yes)");
+
+/*
+ * Hash an IPv6 address (FNV-1 over the raw address bytes) down to a bucket
+ * index.  The result is masked with clip_db_mask; t4_init_clip_table asserts
+ * that each adapter's clip_mask equals clip_db_mask, so the same index is
+ * used for the global clip_db and for the per-adapter clip_table.
+ */
+static inline uint32_t
+clip_hashfn(struct in6_addr *addr)
+{
+	return (fnv_32_buf(addr, sizeof(*addr), FNV1_32_INIT) & clip_db_mask);
+}
+
+/*
+ * Allocate a zeroed CLIP db entry for the given IPv6 address.  The allocation
+ * does not sleep (M_NOWAIT); NULL is returned if no memory is available.  The
+ * new entry is not linked into clip_db and all its reference counts are 0.
+ */
+static inline struct clip_db_entry *
+alloc_clip_db_entry(struct in6_addr *in6)
+{
+	struct clip_db_entry *new_cde;
+
+	new_cde = malloc(sizeof(*new_cde), M_CXGBE, M_NOWAIT | M_ZERO);
+	if (__predict_false(new_cde == NULL))
+		return (NULL);
+
+	new_cde->lip = *in6;
+	return (new_cde);
+}
+
+/*
+ * Allocate a zeroed per-adapter CLIP entry that points at the given db entry.
+ * On success the db entry's adp_ref is bumped and the new entry is marked as
+ * not yet in the hardware (clip_idx = -1).  Returns NULL, leaving cde
+ * untouched, if the non-sleeping allocation fails.  The caller must hold
+ * clip_db_lock and is responsible for linking the entry into a table.
+ */
+static inline struct clip_entry *
+alloc_clip_entry(struct clip_db_entry *cde)
+{
+	struct clip_entry *new_ce;
+
+	mtx_assert(&clip_db_lock, MA_OWNED);
+
+	new_ce = malloc(sizeof(*new_ce), M_CXGBE, M_NOWAIT | M_ZERO);
+	if (__predict_false(new_ce == NULL))
+		return (NULL);
+
+	new_ce->clip_idx = -1;
+	new_ce->cde = cde;
+	cde->adp_ref++;
+
+	return (new_ce);
+}
+
+/*
+ * Find the CLIP db entry for an IP6 address.  When the address is not in the
+ * db and add is set, a fresh entry is allocated and linked into the db.
+ * Returns NULL if there is no entry and none was (or could be) created.  The
+ * caller must hold clip_db_lock.
+ */
+static struct clip_db_entry *
+lookup_clip_db_entry(struct in6_addr *in6, bool add)
+{
+	struct clip_db_entry *cde;
+	const int bucket = clip_hashfn(in6);
+
+	mtx_assert(&clip_db_lock, MA_OWNED);
+
+	LIST_FOREACH(cde, &clip_db[bucket], link) {
+		if (IN6_ARE_ADDR_EQUAL(&cde->lip, in6))
+			return (cde);
+	}
+
+	/* Not in the db and the caller did not ask for it to be created. */
+	if (!add)
+		return (NULL);
+
+	cde = alloc_clip_db_entry(in6);
+	if (cde == NULL)
+		return (NULL);
+	LIST_INSERT_HEAD(&clip_db[bucket], cde, link);
+
+	return (cde);
+}
+
+/*
+ * Look up the IP6 address in the adapter's CLIP table.  If add is set then an
+ * entry for the IP6 will be added to the table (and to the CLIP db if the
+ * address isn't there already) and queued on the adapter's pending list.
+ * Returns NULL if there is no entry and none was (or could be) created.  The
+ * caller must hold clip_db_lock.
+ */
+static struct clip_entry *
+lookup_clip_entry(struct adapter *sc, struct in6_addr *in6, bool add)
+{
+	struct clip_db_entry *cde;
+	struct clip_entry *ce;
+	const int bucket = clip_hashfn(in6);
+
+	mtx_assert(&clip_db_lock, MA_OWNED);
+
+	cde = lookup_clip_db_entry(in6, add);
+	if (cde == NULL)
+		return (NULL);
+
+	LIST_FOREACH(ce, &sc->clip_table[bucket], link) {
+		if (ce->cde == cde)
+			return (ce);
+	}
+
+	/* Not found.  Create a new entry if requested. */
+	if (add) {
+		ce = alloc_clip_entry(cde);
+		if (ce != NULL) {
+			LIST_INSERT_HEAD(&sc->clip_table[bucket], ce, link);
+			TAILQ_INSERT_TAIL(&sc->clip_pending, ce, plink);
+			ce->pending = true;
+		} else if (cde->krn_ref == 0 && cde->adp_ref == 0) {
+			/*
+			 * No adapter entry could be allocated and nothing else
+			 * refers to the db entry (it was most likely created
+			 * by the lookup above).  Evict it so that it is not
+			 * left behind in the db with both counts at 0.
+			 */
+			LIST_REMOVE(cde, link);
+			free(cde, M_CXGBE);
+		}
+	}
+
+	return (ce);
+}
 
 static int
-add_lip(struct adapter *sc, struct in6_addr *lip)
+add_lip(struct adapter *sc, struct in6_addr *lip, int16_t *idx)
 {
-        struct fw_clip_cmd c;
+	struct fw_clip_cmd c;
+	int rc;
 
 	ASSERT_SYNCHRONIZED_OP(sc);
-	mtx_assert(&sc->clip_table_lock, MA_OWNED);
 
-        memset(&c, 0, sizeof(c));
+	memset(&c, 0, sizeof(c));
 	c.op_to_write = htonl(V_FW_CMD_OP(FW_CLIP_CMD) | F_FW_CMD_REQUEST |
 	    F_FW_CMD_WRITE);
-        c.alloc_to_len16 = htonl(F_FW_CLIP_CMD_ALLOC | FW_LEN16(c));
-        c.ip_hi = *(uint64_t *)&lip->s6_addr[0];
-        c.ip_lo = *(uint64_t *)&lip->s6_addr[8];
+	c.alloc_to_len16 = htonl(F_FW_CLIP_CMD_ALLOC | FW_LEN16(c));
+	c.ip_hi = *(uint64_t *)&lip->s6_addr[0];
+	c.ip_lo = *(uint64_t *)&lip->s6_addr[8];
 
-	return (-t4_wr_mbox_ns(sc, sc->mbox, &c, sizeof(c), &c));
+	rc = -t4_wr_mbox_ns(sc, sc->mbox, &c, sizeof(c), &c);
+	if (rc == 0 && idx != NULL)
+		*idx = G_FW_CLIP_CMD_INDEX(ntohl(c.alloc_to_len16));
+	return (rc);
 }
 
 static int
-delete_lip(struct adapter *sc, struct in6_addr *lip)
+del_lip(struct adapter *sc, struct in6_addr *lip)
 {
 	struct fw_clip_cmd c;
 
 	ASSERT_SYNCHRONIZED_OP(sc);
-	mtx_assert(&sc->clip_table_lock, MA_OWNED);
 
 	memset(&c, 0, sizeof(c));
 	c.op_to_write = htonl(V_FW_CMD_OP(FW_CLIP_CMD) | F_FW_CMD_REQUEST |
 	    F_FW_CMD_READ);
-        c.alloc_to_len16 = htonl(F_FW_CLIP_CMD_FREE | FW_LEN16(c));
-        c.ip_hi = *(uint64_t *)&lip->s6_addr[0];
-        c.ip_lo = *(uint64_t *)&lip->s6_addr[8];
+	c.alloc_to_len16 = htonl(F_FW_CLIP_CMD_FREE | FW_LEN16(c));
+	c.ip_hi = *(uint64_t *)&lip->s6_addr[0];
+	c.ip_lo = *(uint64_t *)&lip->s6_addr[8];
 
 	return (-t4_wr_mbox_ns(sc, sc->mbox, &c, sizeof(c), &c));
 }
+#endif
 
-static struct clip_entry *
-search_lip(struct adapter *sc, struct in6_addr *lip)
+struct clip_entry *
+t4_get_clip_entry(struct adapter *sc, struct in6_addr *in6, bool add)
 {
+#ifdef INET6
 	struct clip_entry *ce;
+	bool schedule = false;
 
-	mtx_assert(&sc->clip_table_lock, MA_OWNED);
-
-	TAILQ_FOREACH(ce, &sc->clip_table, link) {
-		if (IN6_ARE_ADDR_EQUAL(&ce->lip, lip))
-			return (ce);
+	mtx_lock(&clip_db_lock);
+	ce = lookup_clip_entry(sc, in6, add);
+	if (ce != NULL) {
+		MPASS(ce->cde->adp_ref > 0);
+		if (++ce->refcount == 1 && ce->pending && ce->clip_idx != -1) {
+			/*
+			 * Valid entry that was waiting to be deleted.  It is in
+			 * use now so take it off the pending list.
+			 */
+			TAILQ_REMOVE(&sc->clip_pending, ce, plink);
+			ce->pending = false;
+		}
+		if (ce->clip_idx == -1 && update_hw_clip_table(sc) != 0)
+			schedule = true;
 	}
+	mtx_unlock(&clip_db_lock);
+	if (schedule)
+		taskqueue_enqueue_timeout(taskqueue_thread, &sc->clip_task, 0);
 
+	return (ce);
+#else
 	return (NULL);
-}
 #endif
+}
 
-struct clip_entry *
-t4_hold_lip(struct adapter *sc, struct in6_addr *lip, struct clip_entry *ce)
+void
+t4_hold_clip_entry(struct adapter *sc, struct clip_entry *ce)
 {
-
 #ifdef INET6
-	mtx_lock(&sc->clip_table_lock);
-	if (ce == NULL)
-		ce = search_lip(sc, lip);
-	if (ce != NULL)
-		ce->refcount++;
-	mtx_unlock(&sc->clip_table_lock);
+	MPASS(ce != NULL);
+	MPASS(ce->cde->adp_ref > 0);
 
-	return (ce);
-#else
-	return (NULL);
+	mtx_lock(&clip_db_lock);
+	MPASS(ce->refcount > 0); /* Caller should already have a reference */
+	ce->refcount++;
+	mtx_unlock(&clip_db_lock);
 #endif
 }
 
+#ifdef INET6
+/*
+ * Drop one consumer reference on a CLIP entry.  The caller must hold
+ * clip_db_lock and must own a reference (refcount > 0 is asserted).
+ *
+ * Nothing else happens unless this was the last reference and the address is
+ * no longer a local address (krn_ref == 0).  In that case an entry that never
+ * made it to the hardware is freed right here, along with its db entry if
+ * this adapter held the last adp_ref.  An entry that is in the hardware is
+ * only queued at the head of the pending list; the hardware is not touched
+ * by this function.
+ */
+static void
+release_clip_entry_locked(struct adapter *sc, struct clip_entry *ce)
+{
+	struct clip_db_entry *cde;
+
+	mtx_assert(&clip_db_lock, MA_OWNED);
+	MPASS(ce->refcount > 0);
+	cde = ce->cde;
+	MPASS(cde->adp_ref > 0);
+	if (--ce->refcount == 0 && cde->krn_ref == 0) {
+		if (ce->clip_idx == -1) {
+			/* Was never written to the hardware. */
+			MPASS(ce->pending);
+			TAILQ_REMOVE(&sc->clip_pending, ce, plink);
+			LIST_REMOVE(ce, link);
+			free(ce, M_CXGBE);
+			/* ce is gone; only cde may be used from here on. */
+			if (--cde->adp_ref == 0) {
+				LIST_REMOVE(cde, link);
+				free(cde, M_CXGBE);
+			}
+		} else {
+			/*
+			 * Valid entry is now unused, add to the pending list
+			 * for deletion.  Its refcount was 1 on entry so it
+			 * can't already be pending.
+			 */
+			MPASS(!ce->pending);
+			TAILQ_INSERT_HEAD(&sc->clip_pending, ce, plink);
+			ce->pending = true;
+		}
+	}
+}
+#endif
+
 void
-t4_release_lip(struct adapter *sc, struct clip_entry *ce)
+t4_release_clip_entry(struct adapter *sc, struct clip_entry *ce)
 {
+#ifdef INET6
+	MPASS(ce != NULL);
+
+	mtx_lock(&clip_db_lock);
+	release_clip_entry_locked(sc, ce);
+	/*
+	 * This isn't a manual release via the ioctl.  No need to update the
+	 * hw right now even if the release resulted in the entry being queued
+	 * for deletion.
+	 */
+	mtx_unlock(&clip_db_lock);
+#endif
+}
 
+int
+t4_release_clip_addr(struct adapter *sc, struct in6_addr *in6)
+{
+	int rc = ENOTSUP;
 #ifdef INET6
-	mtx_lock(&sc->clip_table_lock);
-	KASSERT(search_lip(sc, &ce->lip) == ce,
-	    ("%s: CLIP entry %p p not in CLIP table.", __func__, ce));
-	KASSERT(ce->refcount > 0,
-	    ("%s: CLIP entry %p has refcount 0", __func__, ce));
-	--ce->refcount;
-	mtx_unlock(&sc->clip_table_lock);
+	struct clip_entry *ce;
+	bool schedule = false;
+
+	mtx_lock(&clip_db_lock);
+	ce = lookup_clip_entry(sc, in6, false);
+	if (ce == NULL)
+		rc = ENOENT;
+	else if (ce->refcount == 0)
+		rc = EIO;
+	else {
+		release_clip_entry_locked(sc, ce);
+		if (update_hw_clip_table(sc) != 0)
+			schedule = true;
+		rc = 0;
+	}
+	mtx_unlock(&clip_db_lock);
+	if (schedule)
+		taskqueue_enqueue_timeout(taskqueue_thread, &sc->clip_task, 0);
 #endif
+	return (rc);
 }
 
 #ifdef INET6
 void
 t4_init_clip_table(struct adapter *sc)
 {
-
-	mtx_init(&sc->clip_table_lock, "CLIP table lock", NULL, MTX_DEF);
-	TAILQ_INIT(&sc->clip_table);
+	TAILQ_INIT(&sc->clip_pending);
+	TIMEOUT_TASK_INIT(taskqueue_thread, &sc->clip_task, 0, t4_clip_task, sc);
 	sc->clip_gen = -1;
+	sc->clip_table = hashinit(CLIP_HASH_SIZE, M_CXGBE, &sc->clip_mask);
 
+	/* Both the hashes must use the same bucket for the same key. */
+	if (sc->clip_table != NULL)
+		MPASS(sc->clip_mask == clip_db_mask);
 	/*
 	 * Don't bother forcing an update of the clip table when the
 	 * adapter is initialized.  Before an interface can be used it
@@ -164,194 +397,344 @@ t4_init_clip_table(struct adapter *sc)
 	 */
 }
 
+/*
+ * Refresh krn_ref in the CLIP db from the local IPv6 addresses of all VNETs and
+ * bump clip_db_gen (nothing is returned) if adapters need to resync with the db.
+ */
 static void
-update_clip(struct adapter *sc, void *arg __unused)
+update_clip_db(void)
 {
+	VNET_ITERATOR_DECL(vnet_iter);
+	struct rm_priotracker in6_ifa_tracker;
+	struct in6_addr *in6, tin6;
+	struct in6_ifaddr *ia;
+	struct clip_db_entry *cde, *cde_tmp;
+	int i, addel;
 
-	if (begin_synchronized_op(sc, NULL, HOLD_LOCK, "t4clip"))
-		return;
+	VNET_LIST_RLOCK();
+	IN6_IFADDR_RLOCK(&in6_ifa_tracker);
+	mtx_lock(&clip_db_lock);
+	VNET_FOREACH(vnet_iter) {
+		CURVNET_SET_QUIET(vnet_iter);
+		CK_STAILQ_FOREACH(ia, &V_in6_ifaddrhead, ia_link) {
+			if (ia->ia_ifp->if_flags & IFF_LOOPBACK)
+				continue;
+			in6 = &ia->ia_addr.sin6_addr;
+			KASSERT(!IN6_IS_ADDR_MULTICAST(in6),
+			    ("%s: mcast address in in6_ifaddr list", __func__));
+			if (IN6_IS_ADDR_LOOPBACK(in6))
+				continue;
 
-	if (mtx_initialized(&sc->clip_table_lock) && !hw_off_limits(sc))
-		update_clip_table(sc);
+			if (IN6_IS_SCOPE_EMBED(in6)) {
+				tin6 = *in6;
+				in6 = &tin6;
+				in6_clearscope(in6);
+			}
+			cde = lookup_clip_db_entry(in6, true);
+			if (cde == NULL)
+				continue;
+			cde->tmp_ref++;
+		}
+		CURVNET_RESTORE();
+	}
+
+	addel = 0;
+	for (i = 0; i <= clip_db_mask; i++) {
+		LIST_FOREACH_SAFE(cde, &clip_db[i], link, cde_tmp) {
+			if (cde->krn_ref == 0 && cde->tmp_ref > 0) {
+				addel++;	/* IP6 addr added. */
+			} else if (cde->krn_ref > 0 && cde->tmp_ref == 0) {
+				if (cde->adp_ref == 0) {
+					LIST_REMOVE(cde, link);
+					free(cde, M_CXGBE);
+					continue;
+				}
+				addel++;	/* IP6 addr deleted. */
+			}
+			cde->krn_ref = cde->tmp_ref;
+			cde->tmp_ref = 0;
+		}
+	}
+	if (addel > 0)
+		clip_db_gen++;
+	mtx_unlock(&clip_db_lock);
+	IN6_IFADDR_RUNLOCK(&in6_ifa_tracker);
+	VNET_LIST_RUNLOCK();
 
-	end_synchronized_op(sc, LOCK_HELD);
 }
 
+/*
+ * Update the CLIP db and then update the CLIP tables on all the adapters.
+ */
 static void
-t4_clip_task(void *arg, int count)
+t4_clip_db_task(void *arg, int count)
 {
-
-	t4_iterate(update_clip, NULL);
+	update_clip_db();
+	t4_iterate(update_clip_table, NULL);
 }
 
-static void
-update_clip_table(struct adapter *sc)
+/*
+ * Refresh the sw CLIP table for this adapter from the global CLIP db.  Entries
+ * that need to be added or deleted from the hardware CLIP table are placed on a
+ * pending list but the hardware is not touched.  The pending list is something
+ * reasonable even if this fails so it's ok to apply that to the hardware.
+ */
+static int
+update_sw_clip_table(struct adapter *sc)
 {
-	struct rm_priotracker in6_ifa_tracker;
-	struct in6_ifaddr *ia;
-	struct in6_addr *lip, tlip;
-	TAILQ_HEAD(, clip_entry) stale;
+	struct clip_db_entry *cde;
 	struct clip_entry *ce, *ce_temp;
-	struct vi_info *vi;
-	int rc, gen, i, j;
-	uintptr_t last_vnet;
-
-	ASSERT_SYNCHRONIZED_OP(sc);
+	int i;
+	bool found;
 
-	IN6_IFADDR_RLOCK(&in6_ifa_tracker);
-	mtx_lock(&sc->clip_table_lock);
-
-	gen = atomic_load_acq_int(&in6_ifaddr_gen);
-	if (gen == sc->clip_gen)
-		goto done;
-
-	TAILQ_INIT(&stale);
-	TAILQ_CONCAT(&stale, &sc->clip_table, link);
+	mtx_assert(&clip_db_lock, MA_OWNED);
 
 	/*
-	 * last_vnet optimizes the common cases where all if_vnet = NULL (no
-	 * VIMAGE) or all if_vnet = vnet0.
+	 * We are about to rebuild the pending list from scratch.  Deletions are
+	 * placed before additions because that's how we want to submit them to
+	 * the hardware.
 	 */
-	last_vnet = (uintptr_t)(-1);
-	for_each_port(sc, i)
-	for_each_vi(sc->port[i], j, vi) {
-		if (IS_DOOMED(vi))
-			continue;
-
-		if (last_vnet == (uintptr_t)vi->ifp->if_vnet)
-			continue;
+	TAILQ_INIT(&sc->clip_pending);
 
-		/* XXX: races with if_vmove */
-		CURVNET_SET(vi->ifp->if_vnet);
-		CK_STAILQ_FOREACH(ia, &V_in6_ifaddrhead, ia_link) {
-			lip = &ia->ia_addr.sin6_addr;
-
-			KASSERT(!IN6_IS_ADDR_MULTICAST(lip),
-			    ("%s: mcast address in in6_ifaddr list", __func__));
-
-			if (IN6_IS_ADDR_LOOPBACK(lip))
+	/*
+	 * Walk the sw CLIP table first.  We want to reset every entry's pending
+	 * status as we're rebuilding the pending list.
+	 */
+	for (i = 0; i <= clip_db_mask; i++) {
+		LIST_FOREACH_SAFE(ce, &sc->clip_table[i], link, ce_temp) {
+			cde = ce->cde;
+			MPASS(cde->adp_ref > 0);
+			if (ce->refcount != 0 || cde->krn_ref != 0) {
+				/*
+				 * Entry should stay in the CLIP.
+				 */
+
+				if (ce->clip_idx != -1) {
+					ce->pending = false;
+				} else {
+					/* Was never added, carry forward. */
+					MPASS(ce->pending);
+					TAILQ_INSERT_TAIL(&sc->clip_pending, ce,
+					    plink);
+				}
 				continue;
-			if (IN6_IS_SCOPE_EMBED(lip)) {
-				/* Remove the embedded scope */
-				tlip = *lip;
-				lip = &tlip;
-				in6_clearscope(lip);
 			}
-			/*
-			 * XXX: how to weed out the link local address for the
-			 * loopback interface?  It's fe80::1 usually (always?).
-			 */
 
 			/*
-			 * If it's in the main list then we already know it's
-			 * not stale.
+			 * Entry should be removed from the CLIP.
 			 */
-			TAILQ_FOREACH(ce, &sc->clip_table, link) {
-				if (IN6_ARE_ADDR_EQUAL(&ce->lip, lip))
-					goto next;
-			}
 
-			/*
-			 * If it's in the stale list we should move it to the
-			 * main list.
-			 */
-			TAILQ_FOREACH(ce, &stale, link) {
-				if (IN6_ARE_ADDR_EQUAL(&ce->lip, lip)) {
-					TAILQ_REMOVE(&stale, ce, link);
-					TAILQ_INSERT_TAIL(&sc->clip_table, ce,
-					    link);
-					goto next;
+			if (ce->clip_idx != -1) {
+				ce->pending = true;
+				TAILQ_INSERT_HEAD(&sc->clip_pending, ce, plink);
+			} else {
+				/* Was never added, free right now. */
+				MPASS(ce->pending);
+				LIST_REMOVE(ce, link);
+				free(ce, M_CXGBE);
+				if (--cde->adp_ref == 0) {
+					LIST_REMOVE(cde, link);
+					free(cde, M_CXGBE);
 				}
 			}
+		}
+	}
+
+	for (i = 0; i <= clip_db_mask; i++) {
+		LIST_FOREACH(cde, &clip_db[i], link) {
+			if (cde->krn_ref == 0)
+				continue;
 
-			/* A new IP6 address; add it to the CLIP table */
-			ce = malloc(sizeof(*ce), M_CXGBE, M_NOWAIT);
-			memcpy(&ce->lip, lip, sizeof(ce->lip));
-			ce->refcount = 0;
-			rc = add_lip(sc, lip);
-			if (rc == 0)
-				TAILQ_INSERT_TAIL(&sc->clip_table, ce, link);
-			else {
-				char ip[INET6_ADDRSTRLEN];
-
-				inet_ntop(AF_INET6, &ce->lip, &ip[0],
-				    sizeof(ip));
-				if (sc->flags & KERN_TLS_ON ||
-				    sc->active_ulds != 0) {
-					log(LOG_ERR,
-					    "%s: could not add %s (%d)\n",
-					    __func__, ip, rc);
+			found = false;
+			LIST_FOREACH(ce, &sc->clip_table[i], link) {
+				if (ce->cde == cde) {
+					found = true;
+					break;
 				}
-				free(ce, M_CXGBE);
 			}
-next:
-			continue;
+			if (found)
+				continue;
+			ce = alloc_clip_entry(cde);
+			if (ce == NULL)
+				return (ENOMEM);
+			LIST_INSERT_HEAD(&sc->clip_table[i], ce, link);
+			TAILQ_INSERT_TAIL(&sc->clip_pending, ce, plink);
+			ce->pending = true;
 		}
-		CURVNET_RESTORE();
-		last_vnet = (uintptr_t)vi->ifp->if_vnet;
 	}
 
-	/*
-	 * Remove stale addresses (those no longer in V_in6_ifaddrhead) that are
-	 * no longer referenced by the driver.
-	 */
-	TAILQ_FOREACH_SAFE(ce, &stale, link, ce_temp) {
-		if (ce->refcount == 0) {
-			rc = delete_lip(sc, &ce->lip);
-			if (rc == 0) {
-				TAILQ_REMOVE(&stale, ce, link);
+	sc->clip_gen = clip_db_gen;
+	return (0);
+}
+
+/*
+ * Apply the adapter's pending list to the hardware CLIP table.  Entries with
+ * clip_idx == -1 were queued for addition, all others for deletion; an entry
+ * whose reference counts say the queued operation is no longer needed is
+ * simply dropped from the list (or freed, if it never reached the hardware).
+ *
+ * The caller must hold clip_db_lock.  Returns 0 if the list was processed, if
+ * the hardware is off limits, or if the hardware CLIP is full.  Returns
+ * non-zero if the synchronized op could not be started or a firmware command
+ * failed; callers treat that as a request to try again later.
+ */
+static int
+update_hw_clip_table(struct adapter *sc)
+{
+	struct clip_db_entry *cde;
+	struct clip_entry *ce;
+	int rc;
+	char ip[INET6_ADDRSTRLEN];
+
+	mtx_assert(&clip_db_lock, MA_OWNED);
+	rc = begin_synchronized_op(sc, NULL, HOLD_LOCK, "t4clip");
+	if (rc != 0)
+		return (rc);
+	if (hw_off_limits(sc))
+		goto done;	/* with rc = 0, we don't want to reschedule. */
+	while (!TAILQ_EMPTY(&sc->clip_pending)) {
+		ce = TAILQ_FIRST(&sc->clip_pending);
+		MPASS(ce->pending);
+		cde = ce->cde;
+		MPASS(cde->adp_ref > 0);
+
+		if (ce->clip_idx == -1) {
+			/*
+			 * Entry was queued for addition to the HW CLIP.
+			 */
+
+			if (ce->refcount == 0 && cde->krn_ref == 0) {
+				/* No need to add to HW CLIP. */
+				TAILQ_REMOVE(&sc->clip_pending, ce, plink);
+				LIST_REMOVE(ce, link);
 				free(ce, M_CXGBE);
+				if (--cde->adp_ref == 0) {
+					LIST_REMOVE(cde, link);
+					free(cde, M_CXGBE);
+				}
 			} else {
-				char ip[INET6_ADDRSTRLEN];
+				/* Add to the HW CLIP. */
+				rc = add_lip(sc, &cde->lip, &ce->clip_idx);
+				if (rc == FW_ENOMEM) {
+					/* CLIP full, no point in retrying. */
+					rc = 0;
+					goto done;
+				}
+				if (rc != 0) {
+					inet_ntop(AF_INET6, &cde->lip, &ip[0],
+					    sizeof(ip));
+					CH_ERR(sc, "add_lip(%s) failed: %d\n",
+					    ip, rc);
+					goto done;
+				}
+				MPASS(ce->clip_idx != -1);
+				TAILQ_REMOVE(&sc->clip_pending, ce, plink);
+				ce->pending = false;
+			}
+		} else {
+			/*
+			 * Entry was queued for deletion from the HW CLIP.
+			 */
 
-				inet_ntop(AF_INET6, &ce->lip, &ip[0],
-				    sizeof(ip));
-				log(LOG_ERR, "%s: could not delete %s (%d)\n",
-				    __func__, ip, rc);
+			if (ce->refcount == 0 && cde->krn_ref == 0) {
+				/*
+				 * Delete from the HW CLIP.  Delete should never
+				 * fail so we always log an error.  But if the
+				 * failure is that the entry wasn't found in the
+				 * CLIP then we carry on as if it was deleted.
+				 */
+				rc = del_lip(sc, &cde->lip);
+				if (rc != 0) {
+					/* ip[] is not valid until formatted. */
+					inet_ntop(AF_INET6, &cde->lip, &ip[0],
+					    sizeof(ip));
+					CH_ERR(sc, "del_lip(%s) failed: %d\n",
+					    ip, rc);
+				}
+				if (rc == FW_EPROTO)
+					rc = 0;
+				if (rc != 0)
+					goto done;
+
+				TAILQ_REMOVE(&sc->clip_pending, ce, plink);
+				LIST_REMOVE(ce, link);
+				free(ce, M_CXGBE);
+				if (--cde->adp_ref == 0) {
+					LIST_REMOVE(cde, link);
+					free(cde, M_CXGBE);
+				}
+			} else {
+				/* No need to delete from HW CLIP. */
+				TAILQ_REMOVE(&sc->clip_pending, ce, plink);
+				ce->pending = false;
 			}
 		}
 	}
-	/* The ones that are still referenced need to stay in the CLIP table */
-	TAILQ_CONCAT(&sc->clip_table, &stale, link);
-
-	sc->clip_gen = gen;
 done:
-	mtx_unlock(&sc->clip_table_lock);
-	IN6_IFADDR_RUNLOCK(&in6_ifa_tracker);
+	end_synchronized_op(sc, LOCK_HELD);
+	return (rc);
+}
+
+/*
+ * Bring one adapter's CLIP table up to date.  The sw table is refreshed from
+ * the CLIP db if the adapter's generation is stale, and then anything on the
+ * pending list is pushed to the hardware.  If either step reports a failure
+ * the adapter's clip_task is queued to try again.  Adapters whose CLIP table
+ * was never set up are skipped.
+ */
+static void
+update_clip_table(struct adapter *sc, void *arg __unused)
+{
+	int sw_rc, hw_rc;
+
+	if (sc->clip_table == NULL)
+		return;
+
+	sw_rc = 0;
+	hw_rc = 0;
+	mtx_lock(&clip_db_lock);
+	if (sc->clip_gen != clip_db_gen)
+		sw_rc = update_sw_clip_table(sc);
+	if (!TAILQ_EMPTY(&sc->clip_pending))
+		hw_rc = update_hw_clip_table(sc);
+	mtx_unlock(&clip_db_lock);
+
+	if (sw_rc == 0 && hw_rc == 0)
+		return;
+	taskqueue_enqueue_timeout(taskqueue_thread, &sc->clip_task, -hz / 4);
+}
+
+/*
+ * Update the CLIP table of the specified adapter.
+ */
+static void
+t4_clip_task(void *sc, int count)
+{
+	update_clip_table(sc, NULL);
 }
 
 void
 t4_destroy_clip_table(struct adapter *sc)
 {
 	struct clip_entry *ce, *ce_temp;
-
-	if (mtx_initialized(&sc->clip_table_lock)) {
-		mtx_lock(&sc->clip_table_lock);
-		TAILQ_FOREACH_SAFE(ce, &sc->clip_table, link, ce_temp) {
-			KASSERT(ce->refcount == 0,
-			    ("%s: CLIP entry %p still in use (%d)", __func__,
-			    ce, ce->refcount));
-			TAILQ_REMOVE(&sc->clip_table, ce, link);
+	int i;
+
+	mtx_lock(&clip_db_lock);
+	if (sc->clip_table == NULL)
+		goto done;		/* CLIP was never initialized. */
+	for (i = 0; i <= sc->clip_mask; i++) {
+		LIST_FOREACH_SAFE(ce, &sc->clip_table[i], link, ce_temp) {
+			MPASS(ce->refcount == 0);
+			MPASS(ce->cde->adp_ref > 0);
 #if 0
-			delete_lip(sc, &ce->lip);
+			del_lip(sc, &ce->lip);
 #endif
+			LIST_REMOVE(ce, link);
+			if (--ce->cde->adp_ref == 0 && ce->cde->krn_ref == 0) {
+				LIST_REMOVE(ce->cde, link);
+				free(ce->cde, M_CXGBE);
+			}
 			free(ce, M_CXGBE);
 		}
-		mtx_unlock(&sc->clip_table_lock);
-		mtx_destroy(&sc->clip_table_lock);
 	}
+	hashdestroy(&sc->clip_table, M_CXGBE, sc->clip_mask);
+	sc->clip_table = NULL;
+done:
+	mtx_unlock(&clip_db_lock);
 }
 
 static void
 t4_ifaddr_event(void *arg __unused, struct ifnet *ifp, struct ifaddr *ifa,
     int event)
 {
+	struct in6_addr *in6;
 
+	if (t4_clip_db_auto == 0)
+		return;		/* Automatic updates not allowed. */
 	if (ifa->ifa_addr->sa_family != AF_INET6)
 		return;
+	if (ifp->if_flags & IFF_LOOPBACK)
+		return;
+	in6 = &((struct in6_ifaddr *)ifa)->ia_addr.sin6_addr;
+	if (IN6_IS_ADDR_LOOPBACK(in6) || IN6_IS_ADDR_MULTICAST(in6))
+		return;
 
-	atomic_add_rel_int(&in6_ifaddr_gen, 1);
-	taskqueue_enqueue_timeout(taskqueue_thread, &clip_task, -hz / 4);
+	taskqueue_enqueue(taskqueue_thread, &clip_db_task);
 }
*** 337 LINES SKIPPED ***