git: bc9402abdd11 - main - igc: Add AIM

From: Kevin Bowling <kbowling_at_FreeBSD.org>
Date: Sat, 12 Oct 2024 07:37:45 UTC
The branch main has been updated by kbowling:

URL: https://cgit.FreeBSD.org/src/commit/?id=bc9402abdd11630ced33cbecb79b0d72f203f08a

commit bc9402abdd11630ced33cbecb79b0d72f203f08a
Author:     Kevin Bowling <kbowling@FreeBSD.org>
AuthorDate: 2024-10-11 01:20:13 +0000
Commit:     Kevin Bowling <kbowling@FreeBSD.org>
CommitDate: 2024-10-12 07:36:39 +0000

    igc: Add AIM
    
    igc is derived from igb and has never had an AIM implementation. The
    same algorithm from e1000 is appropriate here.
    
    Upon more detailed study of the Linux driver which has a newer AIM
    implementation, it finally became clear to me this is actually a
    holdoff timer and not an interrupt limit as it is conventionally
    (statically) programmed and displayed as an interrupt rate. The data
    sheets also make this somewhat clear.
    
    Thus, AIM accomplishes two beneficial things for a wide variety of
    workloads[1]:
    
    1. At low throughput/packet rates, it will significantly lower latency
    (by counter-intuitively "increasing" the interrupt rate.. better
    thought of as decreasing the holdoff timer because you will modulate
    down before coming anywhere near these interrupt rates).
    2. At bulk data rates, it is tuned to achieve a lower interrupt rate
    (by increasing the holdoff timer) than the current static 8000/s. This
    decreases processing overhead and yields more headroom for other work
    such as packet filters or userland.
    
    For a single NIC this might be worth a few sys% on common CPUs, but may
    be meaningful when multiplied such as if_lagg, if_bridge and forwarding
    setups.
    
    The AIM algorithm was re-introduced from the older igb or out of tree
    driver, and then modernized with permission to use Intel code from other
    drivers.
    
    [1]: http://iommu.com/datasheets/ethernet/controllers-nics/intel/e1000/gbe-controllers-interrupt-moderation-appl-note.pdf
    
    MFC after:      1 week
    Relnotes:       yes
    Sponsored by:   Rubicon Communications, LLC ("Netgate")
    Sponsored by:   BBOX.io
    Differential Revision:  https://reviews.freebsd.org/D47053
---
 sys/dev/igc/if_igc.c   | 232 +++++++++++++++++++++++++++++++++++++++++++++++--
 sys/dev/igc/if_igc.h   |  27 +++++-
 sys/dev/igc/igc_txrx.c |   4 +
 3 files changed, 256 insertions(+), 7 deletions(-)

diff --git a/sys/dev/igc/if_igc.c b/sys/dev/igc/if_igc.c
index 006fecdab7b0..c4f5e82ff8c8 100644
--- a/sys/dev/igc/if_igc.c
+++ b/sys/dev/igc/if_igc.c
@@ -1,9 +1,9 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause
  *
+ * Copyright (c) 2001-2024, Intel Corporation
  * Copyright (c) 2016 Nicole Graziano <nicole@nextbsd.org>
- * All rights reserved.
- * Copyright (c) 2021 Rubicon Communications, LLC (Netgate)
+ * Copyright (c) 2021-2024 Rubicon Communications, LLC (Netgate)
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -125,6 +125,8 @@ static int	igc_sysctl_debug_info(SYSCTL_HANDLER_ARGS);
 static int	igc_get_rs(SYSCTL_HANDLER_ARGS);
 static void	igc_print_debug_info(struct igc_adapter *);
 static int 	igc_is_valid_ether_addr(u8 *);
+static void	igc_neweitr(struct igc_adapter *, struct igc_rx_queue *,
+    struct tx_ring *, struct rx_ring *);
 /* Management and WOL Support */
 static void	igc_get_hw_control(struct igc_adapter *);
 static void	igc_release_hw_control(struct igc_adapter *);
@@ -238,10 +240,19 @@ static int igc_eee_setting = 1;
 SYSCTL_INT(_hw_igc, OID_AUTO, eee_setting, CTLFLAG_RDTUN, &igc_eee_setting, 0,
     "Enable Energy Efficient Ethernet");
 
+/*
+ * AIM: Adaptive Interrupt Moderation
+ * which means that the interrupt rate is varied over time based on the
+ * traffic for that interrupt vector
+ */
+static int igc_enable_aim = 1;
+SYSCTL_INT(_hw_igc, OID_AUTO, enable_aim, CTLFLAG_RWTUN, &igc_enable_aim,
+    0, "Enable adaptive interrupt moderation (1=normal, 2=lowlatency)");
+
 /*
 ** Tuneable Interrupt rate
 */
-static int igc_max_interrupt_rate = 20000;
+static int igc_max_interrupt_rate = IGC_INTS_DEFAULT;
 SYSCTL_INT(_hw_igc, OID_AUTO, max_interrupt_rate, CTLFLAG_RDTUN,
     &igc_max_interrupt_rate, 0, "Maximum interrupts per second");
 
@@ -444,6 +455,13 @@ igc_if_attach_pre(if_ctx_t ctx)
 	    OID_AUTO, "nvm", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
 	    adapter, 0, igc_sysctl_nvm_info, "I", "NVM Information");
 
+	adapter->enable_aim = igc_enable_aim;
+	SYSCTL_ADD_INT(device_get_sysctl_ctx(dev),
+	    SYSCTL_CHILDREN(device_get_sysctl_tree(dev)),
+	    OID_AUTO, "enable_aim", CTLFLAG_RW,
+	    &adapter->enable_aim, 0,
+	    "Interrupt Moderation (1=normal, 2=lowlatency)");
+
 	SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev),
 	    SYSCTL_CHILDREN(device_get_sysctl_tree(dev)),
 	    OID_AUTO, "fw_version", CTLTYPE_STRING | CTLFLAG_RD,
@@ -816,6 +834,142 @@ igc_if_init(if_ctx_t ctx)
 	igc_set_eee_i225(&adapter->hw, true, true, true);
 }
 
+enum eitr_latency_target {
+	eitr_latency_disabled = 0,
+	eitr_latency_lowest = 1,
+	eitr_latency_low = 2,
+	eitr_latency_bulk = 3
+};
+/*********************************************************************
+ *
+ *  Helper to calculate next EITR value for AIM
+ *
+ *********************************************************************/
+static void
+igc_neweitr(struct igc_adapter *sc, struct igc_rx_queue *que,
+    struct tx_ring *txr, struct rx_ring *rxr)
+{
+	struct igc_hw *hw = &sc->hw;
+	u32 neweitr;
+	u32 bytes;
+	u32 bytes_packets;
+	u32 packets;
+	u8 nextlatency;
+
+	/* Idle, do nothing */
+	if ((txr->tx_bytes == 0) && (rxr->rx_bytes == 0))
+		return;
+
+	neweitr = 0;
+
+	if (sc->enable_aim) {
+		nextlatency = rxr->rx_nextlatency;
+
+		/* Use half default (4K) ITR if sub-gig */
+		if (sc->link_speed != 1000) {
+			neweitr = IGC_INTS_4K;
+			goto igc_set_next_eitr;
+		}
+		/* Want at least enough packet buffer for two frames to AIM */
+		if (sc->shared->isc_max_frame_size * 2 > (sc->pba << 10)) {
+			neweitr = igc_max_interrupt_rate;
+			sc->enable_aim = 0;
+			goto igc_set_next_eitr;
+		}
+
+		/* Get the largest values from the associated tx and rx ring */
+		if (txr->tx_bytes && txr->tx_packets) {
+			bytes = txr->tx_bytes;
+			bytes_packets = txr->tx_bytes/txr->tx_packets;
+			packets = txr->tx_packets;
+		}
+		if (rxr->rx_bytes && rxr->rx_packets) {
+			bytes = max(bytes, rxr->rx_bytes);
+			bytes_packets = max(bytes_packets, rxr->rx_bytes/rxr->rx_packets);
+			packets = max(packets, rxr->rx_packets);
+		}
+
+		/* Latency state machine */
+		switch (nextlatency) {
+		case eitr_latency_disabled: /* Bootstrapping */
+			nextlatency = eitr_latency_low;
+			break;
+		case eitr_latency_lowest: /* 70k ints/s */
+			/* TSO and jumbo frames */
+			if (bytes_packets > 8000)
+				nextlatency = eitr_latency_bulk;
+			else if ((packets < 5) && (bytes > 512))
+				nextlatency = eitr_latency_low;
+			break;
+		case eitr_latency_low: /* 20k ints/s */
+			if (bytes > 10000) {
+				/* Handle TSO */
+				if (bytes_packets > 8000)
+					nextlatency = eitr_latency_bulk;
+				else if ((packets < 10) || (bytes_packets > 1200))
+					nextlatency = eitr_latency_bulk;
+				else if (packets > 35)
+					nextlatency = eitr_latency_lowest;
+			} else if (bytes_packets > 2000) {
+				nextlatency = eitr_latency_bulk;
+			} else if (packets < 3 && bytes < 512) {
+				nextlatency = eitr_latency_lowest;
+			}
+			break;
+		case eitr_latency_bulk: /* 4k ints/s */
+			if (bytes > 25000) {
+				if (packets > 35)
+					nextlatency = eitr_latency_low;
+			} else if (bytes < 1500)
+				nextlatency = eitr_latency_low;
+			break;
+		default:
+			nextlatency = eitr_latency_low;
+			device_printf(sc->dev, "Unexpected neweitr transition %d\n",
+			    nextlatency);
+			break;
+		}
+
+		/* Trim itr_latency_lowest for default AIM setting */
+		if (sc->enable_aim == 1 && nextlatency == eitr_latency_lowest)
+			nextlatency = eitr_latency_low;
+
+		/* Request new latency */
+		rxr->rx_nextlatency = nextlatency;
+	} else {
+		/* We may have toggled to AIM disabled */
+		nextlatency = eitr_latency_disabled;
+		rxr->rx_nextlatency = nextlatency;
+	}
+
+	/* ITR state machine */
+	switch(nextlatency) {
+	case eitr_latency_lowest:
+		neweitr = IGC_INTS_70K;
+		break;
+	case eitr_latency_low:
+		neweitr = IGC_INTS_20K;
+		break;
+	case eitr_latency_bulk:
+		neweitr = IGC_INTS_4K;
+		break;
+	case eitr_latency_disabled:
+	default:
+		neweitr = igc_max_interrupt_rate;
+		break;
+	}
+
+igc_set_next_eitr:
+	neweitr = IGC_INTS_TO_EITR(neweitr);
+
+	neweitr |= IGC_EITR_CNT_IGNR;
+
+	if (neweitr != que->eitr_setting) {
+		que->eitr_setting = neweitr;
+		IGC_WRITE_REG(hw, IGC_EITR(que->msix), que->eitr_setting);
+	}
+}
+
 /*********************************************************************
  *
  *  Fast Legacy/MSI Combined Interrupt Service routine
@@ -825,10 +979,14 @@ int
 igc_intr(void *arg)
 {
 	struct igc_adapter *adapter = arg;
+	struct igc_hw *hw = &adapter->hw;
+	struct igc_rx_queue *que = &adapter->rx_queues[0];
+	struct tx_ring *txr = &adapter->tx_queues[0].txr;
+	struct rx_ring *rxr = &que->rxr;
 	if_ctx_t ctx = adapter->ctx;
 	u32 reg_icr;
 
-	reg_icr = IGC_READ_REG(&adapter->hw, IGC_ICR);
+	reg_icr = IGC_READ_REG(hw, IGC_ICR);
 
 	/* Hot eject? */
 	if (reg_icr == 0xffffffff)
@@ -856,6 +1014,14 @@ igc_intr(void *arg)
 	if (reg_icr & IGC_ICR_RXO)
 		adapter->rx_overruns++;
 
+	igc_neweitr(adapter, que, txr, rxr);
+
+	/* Reset state */
+	txr->tx_bytes = 0;
+	txr->tx_packets = 0;
+	rxr->rx_bytes = 0;
+	rxr->rx_packets = 0;
+
 	return (FILTER_SCHEDULE_THREAD);
 }
 
@@ -888,9 +1054,20 @@ static int
 igc_msix_que(void *arg)
 {
 	struct igc_rx_queue *que = arg;
+	struct igc_adapter *sc = que->adapter;
+	struct tx_ring *txr = &sc->tx_queues[que->msix].txr;
+	struct rx_ring *rxr = &que->rxr;
 
 	++que->irqs;
 
+	igc_neweitr(sc, que, txr, rxr);
+
+	/* Reset state */
+	txr->tx_bytes = 0;
+	txr->tx_packets = 0;
+	rxr->rx_bytes = 0;
+	rxr->rx_packets = 0;
+
 	return (FILTER_SCHEDULE_THREAD);
 }
 
@@ -1395,7 +1572,7 @@ igc_configure_queues(struct igc_adapter *adapter)
 
 	/* Set the starting interrupt rate */
 	if (igc_max_interrupt_rate > 0)
-		newitr = (4000000 / igc_max_interrupt_rate) & 0x7FFC;
+		newitr = IGC_INTS_TO_EITR(igc_max_interrupt_rate);
 
 	newitr |= IGC_EITR_CNT_IGNR;
 
@@ -1608,6 +1785,9 @@ igc_reset(if_ctx_t ctx)
 	/* Setup DMA Coalescing */
 	igc_init_dmac(adapter, pba);
 
+	/* Save the final PBA off if it needs to be used elsewhere i.e. AIM */
+	adapter->pba = pba;
+
 	IGC_WRITE_REG(hw, IGC_VET, ETHERTYPE_VLAN);
 	igc_get_phy_info(hw);
 	igc_check_for_link(hw);
@@ -2380,6 +2560,40 @@ igc_sysctl_reg_handler(SYSCTL_HANDLER_ARGS)
 	return (sysctl_handle_int(oidp, &val, 0, req));
 }
 
+/* Per queue holdoff interrupt rate handler */
+static int
+igc_sysctl_interrupt_rate_handler(SYSCTL_HANDLER_ARGS)
+{
+	struct igc_rx_queue *rque;
+	struct igc_tx_queue *tque;
+	struct igc_hw *hw;
+	int error;
+	u32 reg, usec, rate;
+
+	bool tx = oidp->oid_arg2;
+
+	if (tx) {
+		tque = oidp->oid_arg1;
+		hw = &tque->adapter->hw;
+		reg = IGC_READ_REG(hw, IGC_EITR(tque->me));
+	} else {
+		rque = oidp->oid_arg1;
+		hw = &rque->adapter->hw;
+		reg = IGC_READ_REG(hw, IGC_EITR(rque->msix));
+	}
+
+	usec = (reg & IGC_QVECTOR_MASK);
+	if (usec > 0)
+		rate = IGC_INTS_TO_EITR(usec);
+	else
+		rate = 0;
+
+	error = sysctl_handle_int(oidp, &rate, 0, req);
+	if (error || !req->newptr)
+		return error;
+	return 0;
+}
+
 /*
  * Add sysctl variables, one per statistic, to the system.
  */
@@ -2436,6 +2650,10 @@ igc_add_hw_stats(struct igc_adapter *adapter)
 		    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "TX Queue Name");
 		queue_list = SYSCTL_CHILDREN(queue_node);
 
+		SYSCTL_ADD_PROC(ctx, queue_list, OID_AUTO, "interrupt_rate",
+		    CTLTYPE_UINT | CTLFLAG_RD, tx_que,
+		    true, igc_sysctl_interrupt_rate_handler, "IU",
+		    "Interrupt Rate");
 		SYSCTL_ADD_PROC(ctx, queue_list, OID_AUTO, "txd_head",
 		    CTLTYPE_UINT | CTLFLAG_RD | CTLFLAG_NEEDGIANT, adapter,
 		    IGC_TDH(txr->me), igc_sysctl_reg_handler, "IU",
@@ -2456,6 +2674,10 @@ igc_add_hw_stats(struct igc_adapter *adapter)
 		    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "RX Queue Name");
 		queue_list = SYSCTL_CHILDREN(queue_node);
 
+		SYSCTL_ADD_PROC(ctx, queue_list, OID_AUTO, "interrupt_rate",
+		    CTLTYPE_UINT | CTLFLAG_RD, rx_que,
+			false, igc_sysctl_interrupt_rate_handler, "IU",
+			"Interrupt Rate");
 		SYSCTL_ADD_PROC(ctx, queue_list, OID_AUTO, "rxd_head",
 		    CTLTYPE_UINT | CTLFLAG_RD | CTLFLAG_NEEDGIANT, adapter,
 		    IGC_RDH(rxr->me), igc_sysctl_reg_handler, "IU",
diff --git a/sys/dev/igc/if_igc.h b/sys/dev/igc/if_igc.h
index 727699baee5f..57949c3ad38a 100644
--- a/sys/dev/igc/if_igc.h
+++ b/sys/dev/igc/if_igc.h
@@ -1,8 +1,8 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause
  *
+ * Copyright (c) 2001-2024, Intel Corporation
  * Copyright (c) 2016 Nicole Graziano <nicole@nextbsd.org>
- * All rights reserved.
  * Copyright (c) 2021 Rubicon Communications, LLC (Netgate)
  *
  * Redistribution and use in source and binary forms, with or without
@@ -163,6 +163,17 @@
 #define IGC_TX_PTHRESH			8
 #define IGC_TX_HTHRESH			1
 
+/* Define the interrupt rates and EITR helpers */
+#define IGC_INTS_4K		4000
+#define IGC_INTS_20K		20000
+#define IGC_INTS_70K		70000
+#define IGC_INTS_DEFAULT	8000
+#define IGC_EITR_DIVIDEND	1000000
+#define IGC_EITR_SHIFT		2
+#define IGC_QVECTOR_MASK	0x7FFC
+#define IGC_INTS_TO_EITR(i)	(((IGC_EITR_DIVIDEND/i) & IGC_QVECTOR_MASK) << \
+				    IGC_EITR_SHIFT)
+
 /*
  * TDBA/RDBA should be aligned on 16 byte boundary. But TDLEN/RDLEN should be
  * multiple of 128 bytes. So we align TDBA/RDBA on 128 byte boundary. This will
@@ -218,7 +229,12 @@ struct tx_ring {
 	/* Interrupt resources */
 	void                    *tag;
 	struct resource         *res;
-        unsigned long		tx_irq;
+
+	/* Soft stats */
+	unsigned long		tx_irq;
+	unsigned long		tx_packets;
+	unsigned long		tx_bytes;
+
 
 	/* Saved csum offloading context information */
 	int			csum_flags;
@@ -253,6 +269,9 @@ struct rx_ring {
         unsigned long		rx_discarded;
         unsigned long		rx_packets;
         unsigned long		rx_bytes;
+
+        /* Next requested ITR latency */
+        u8			rx_nextlatency;
 };
 
 struct igc_tx_queue {
@@ -268,6 +287,7 @@ struct igc_rx_queue {
 	u32                    me;
 	u32                    msix;
 	u32                    eims;
+	u32                    eitr_setting;
 	struct rx_ring         rxr;
 	u64                    irqs;
 	struct if_irq          que_irq;
@@ -315,6 +335,8 @@ struct igc_adapter {
 
 	u32		rx_mbuf_sz;
 
+	int		enable_aim;
+
 	/* Management and WOL features */
 	u32		wol;
 
@@ -328,6 +350,7 @@ struct igc_adapter {
 	u16		link_duplex;
 	u32		smartspeed;
 	u32		dmac;
+	u32		pba;
 	int		link_mask;
 
 	u64		que_mask;
diff --git a/sys/dev/igc/igc_txrx.c b/sys/dev/igc/igc_txrx.c
index 7601513a709e..cd7175f45f34 100644
--- a/sys/dev/igc/igc_txrx.c
+++ b/sys/dev/igc/igc_txrx.c
@@ -316,6 +316,10 @@ igc_isc_txd_encap(void *arg, if_pkt_info_t pi)
 	txd->read.cmd_type_len |= htole32(IGC_ADVTXD_DCMD_EOP | txd_flags);
 	pi->ipi_new_pidx = i;
 
+	/* Sent data accounting for AIM */
+	txr->tx_bytes += pi->ipi_len;
+	++txr->tx_packets;
+
 	return (0);
 }