svn commit: r228387 - head/sys/dev/e1000

Jack F Vogel jfv at FreeBSD.org
Sat Dec 10 07:08:52 UTC 2011


Author: jfv
Date: Sat Dec 10 07:08:52 2011
New Revision: 228387
URL: http://svn.freebsd.org/changeset/base/228387

Log:
  Part 2 of 2  New deltas for the 1G drivers.
  
  There have still been intermittent problems with apparent TX
  hangs for some customers. These have been problematic to reproduce,
  but I believe these changes will address them. Testing on a number
  of fronts has been positive.
  
  EM: there is an important 'chicken bit' fix for 82574 in the shared
  code; this is supported in the core here.
      - The TX path has been tightened up to improve performance. In
        particular UDP with jumbo frames was having problems, and the
        changes here have improved that.
      - OACTIVE has been used more carefully on the theory that some
        hangs may be due to a problem in this interaction
      - Problems with the RX init code: the "lazy" allocation and
        ring initialization have been found to cause problems in some
        newer client systems, and as it really is not that big a win
        (it's not in a hot path) it seems best to remove it.
      - HWTSO was broken when VLAN HWTAGGING or HWFILTER was used; I
        found this was due to an error in setting up the descriptors
        in em_xmit.
  
  IGB:
      - TX is also improved here. With multiqueue I realized it's very
        important to handle OACTIVE only under the CORE lock so there
        are no races between the queues.
      - Flow Control handling was broken in a couple of ways; I have
        changed, and I hope improved, that in this delta.
      - UDP also had a problem in the TX path here; it was changed to
        improve that.
      - On some hardware, with the driver static, a weird stray interrupt
        seems to sometimes fire and cause a panic in the RX mbuf refresh
        code. This is addressed by setting interrupts late in the init
        path, and also by setting all interrupt bits off at the start of that.

Modified:
  head/sys/dev/e1000/if_em.c
  head/sys/dev/e1000/if_em.h
  head/sys/dev/e1000/if_igb.c
  head/sys/dev/e1000/if_igb.h
  head/sys/dev/e1000/if_lem.c
  head/sys/dev/e1000/if_lem.h

Modified: head/sys/dev/e1000/if_em.c
==============================================================================
--- head/sys/dev/e1000/if_em.c	Sat Dec 10 06:55:02 2011	(r228386)
+++ head/sys/dev/e1000/if_em.c	Sat Dec 10 07:08:52 2011	(r228387)
@@ -35,6 +35,7 @@
 #ifdef HAVE_KERNEL_OPTION_HEADERS
 #include "opt_device_polling.h"
 #include "opt_inet.h"
+#include "opt_inet6.h"
 #endif
 
 #include <sys/param.h>
@@ -93,7 +94,7 @@ int	em_display_debug_stats = 0;
 /*********************************************************************
  *  Driver version:
  *********************************************************************/
-char em_driver_version[] = "7.2.3";
+char em_driver_version[] = "7.3.2";
 
 /*********************************************************************
  *  PCI Device ID Table
@@ -286,6 +287,7 @@ static void	em_handle_link(void *context
 
 static void	em_set_sysctl_value(struct adapter *, const char *,
 		    const char *, int *, int);
+static int	em_set_flowcntl(SYSCTL_HANDLER_ARGS);
 
 static __inline void em_rx_discard(struct rx_ring *, int);
 
@@ -382,13 +384,8 @@ static int em_rx_process_limit = 100;
 TUNABLE_INT("hw.em.rx_process_limit", &em_rx_process_limit);
 SYSCTL_INT(_hw_em, OID_AUTO, rx_process_limit, CTLFLAG_RDTUN,
     &em_rx_process_limit, 0,
-    "Maximum number of received packets to process at a time, -1 means unlimited");
-
-/* Flow control setting - default to FULL */
-static int em_fc_setting = e1000_fc_full;
-TUNABLE_INT("hw.em.fc_setting", &em_fc_setting);
-SYSCTL_INT(_hw_em, OID_AUTO, fc_setting, CTLFLAG_RDTUN, &em_fc_setting, 0,
-    "Flow control");
+    "Maximum number of received packets to process "
+    "at a time, -1 means unlimited");
 
 /* Energy efficient ethernet - default to OFF */
 static int eee_setting = 0;
@@ -473,6 +470,11 @@ em_attach(device_t dev)
 
 	INIT_DEBUGOUT("em_attach: begin");
 
+	if (resource_disabled("em", device_get_unit(dev))) {
+		device_printf(dev, "Disabled by device hint\n");
+		return (ENXIO);
+	}
+
 	adapter = device_get_softc(dev);
 	adapter->dev = adapter->osdep.dev = dev;
 	hw = &adapter->hw;
@@ -489,6 +491,11 @@ em_attach(device_t dev)
 	    OID_AUTO, "debug", CTLTYPE_INT|CTLFLAG_RW, adapter, 0,
 	    em_sysctl_debug_info, "I", "Debug Information");
 
+	SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev),
+	    SYSCTL_CHILDREN(device_get_sysctl_tree(dev)),
+	    OID_AUTO, "fc", CTLTYPE_INT|CTLFLAG_RW, adapter, 0,
+	    em_set_flowcntl, "I", "Flow Control");
+
 	callout_init_mtx(&adapter->timer, &adapter->core_mtx, 0);
 
 	/* Determine hardware and mac info */
@@ -560,11 +567,6 @@ em_attach(device_t dev)
 	    "max number of rx packets to process", &adapter->rx_process_limit,
 	    em_rx_process_limit);
 
-	/* Sysctl for setting the interface flow control */
-	em_set_sysctl_value(adapter, "flow_control",
-	    "configure flow control",
-	    &adapter->fc_setting, em_fc_setting);
-
 	/*
 	 * Validate number of transmit and receive descriptors. It
 	 * must not exceed hardware maximum, and must be multiple
@@ -714,7 +716,8 @@ em_attach(device_t dev)
 		em_get_hw_control(adapter);
 
 	/* Tell the stack that the interface is not active */
-	adapter->ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE);
+	adapter->ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
+	adapter->ifp->if_drv_flags |= IFF_DRV_OACTIVE;
 
 	adapter->led_dev = led_create(em_led_func, adapter,
 	    device_get_nameunit(dev));
@@ -847,6 +850,8 @@ em_resume(device_t dev)
 	struct ifnet *ifp = adapter->ifp;
 
 	EM_CORE_LOCK(adapter);
+	if (adapter->hw.mac.type == e1000_pch2lan)
+		e1000_resume_workarounds_pchlan(&adapter->hw);
 	em_init_locked(adapter);
 	em_init_manageability(adapter);
 	EM_CORE_UNLOCK(adapter);
@@ -856,17 +861,15 @@ em_resume(device_t dev)
 }
 
 
+#ifdef EM_MULTIQUEUE
 /*********************************************************************
- *  Transmit entry point
+ *  Multiqueue Transmit routines 
  *
- *  em_start is called by the stack to initiate a transmit.
- *  The driver will remain in this routine as long as there are
- *  packets to transmit and transmit resources are available.
- *  In case resources are not available stack is notified and
- *  the packet is requeued.
+ *  em_mq_start is called by the stack to initiate a transmit.
+ *  however, if busy the driver can queue the request rather
+ *  than do an immediate send. It is this that is an advantage
+ *  in this driver, rather than also having multiple tx queues.
  **********************************************************************/
-
-#ifdef EM_MULTIQUEUE
 static int
 em_mq_start_locked(struct ifnet *ifp, struct tx_ring *txr, struct mbuf *m)
 {
@@ -881,10 +884,6 @@ em_mq_start_locked(struct ifnet *ifp, st
 		return (err);
 	}
 
-        /* Call cleanup if number of TX descriptors low */
-	if (txr->tx_avail <= EM_TX_CLEANUP_THRESHOLD)
-		em_txeof(txr);
-
 	enq = 0;
 	if (m == NULL) {
 		next = drbr_dequeue(ifp, txr->br);
@@ -907,10 +906,6 @@ em_mq_start_locked(struct ifnet *ifp, st
 		ETHER_BPF_MTAP(ifp, next);
 		if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
                         break;
-		if (txr->tx_avail < EM_MAX_SCATTER) {
-			ifp->if_drv_flags |= IFF_DRV_OACTIVE;
-			break;
-		}
 		next = drbr_dequeue(ifp, txr->br);
 	}
 
@@ -919,6 +914,11 @@ em_mq_start_locked(struct ifnet *ifp, st
                 txr->queue_status = EM_QUEUE_WORKING;
 		txr->watchdog_time = ticks;
 	}
+
+	if (txr->tx_avail < EM_MAX_SCATTER)
+		em_txeof(txr);
+	if (txr->tx_avail < EM_MAX_SCATTER)
+		ifp->if_drv_flags |= IFF_DRV_OACTIVE;
 	return (err);
 }
 
@@ -959,7 +959,6 @@ em_qflush(struct ifnet *ifp)
 	}
 	if_qflush(ifp);
 }
-
 #endif /* EM_MULTIQUEUE */
 
 static void
@@ -995,7 +994,6 @@ em_start_locked(struct ifnet *ifp, struc
 		if (em_xmit(txr, &m_head)) {
 			if (m_head == NULL)
 				break;
-			ifp->if_drv_flags |= IFF_DRV_OACTIVE;
 			IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
 			break;
 		}
@@ -1022,6 +1020,12 @@ em_start(struct ifnet *ifp)
 		em_start_locked(ifp, txr);
 		EM_TX_UNLOCK(txr);
 	}
+	/*
+	** If we went inactive schedule
+	** a task to clean up.
+	*/
+	if (ifp->if_drv_flags & IFF_DRV_OACTIVE)
+		taskqueue_enqueue(txr->tq, &txr->tx_task);
 	return;
 }
 
@@ -1038,11 +1042,12 @@ static int
 em_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
 {
 	struct adapter	*adapter = ifp->if_softc;
-	struct ifreq *ifr = (struct ifreq *)data;
-#ifdef INET
-	struct ifaddr *ifa = (struct ifaddr *)data;
+	struct ifreq	*ifr = (struct ifreq *)data;
+#if defined(INET) || defined(INET6)
+	struct ifaddr	*ifa = (struct ifaddr *)data;
 #endif
-	int error = 0;
+	bool		avoid_reset = FALSE;
+	int		error = 0;
 
 	if (adapter->in_detach)
 		return (error);
@@ -1050,23 +1055,22 @@ em_ioctl(struct ifnet *ifp, u_long comma
 	switch (command) {
 	case SIOCSIFADDR:
 #ifdef INET
-		if (ifa->ifa_addr->sa_family == AF_INET) {
-			/*
-			 * XXX
-			 * Since resetting hardware takes a very long time
-			 * and results in link renegotiation we only
-			 * initialize the hardware only when it is absolutely
-			 * required.
-			 */
+		if (ifa->ifa_addr->sa_family == AF_INET)
+			avoid_reset = TRUE;
+#endif
+		/*
+		** Calling init results in link renegotiation,
+		** so we avoid doing it when possible.
+		*/
+		if (avoid_reset) {
 			ifp->if_flags |= IFF_UP;
-			if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) {
-				EM_CORE_LOCK(adapter);
-				em_init_locked(adapter);
-				EM_CORE_UNLOCK(adapter);
-			}
-			arp_ifinit(ifp, ifa);
-		} else
+			if (!(ifp->if_drv_flags & IFF_DRV_RUNNING))
+				em_init(adapter);
+#ifdef INET
+			if (!(ifp->if_flags & IFF_NOARP))
+				arp_ifinit(ifp, ifa);
 #endif
+		} else
 			error = ether_ioctl(ifp, command, data);
 		break;
 	case SIOCSIFMTU:
@@ -1083,6 +1087,7 @@ em_ioctl(struct ifnet *ifp, u_long comma
 		case e1000_ich10lan:
 		case e1000_pch2lan:
 		case e1000_82574:
+		case e1000_82583:
 		case e1000_80003es2lan:	/* 9K Jumbo Frame size */
 			max_frame_size = 9234;
 			break;
@@ -1090,7 +1095,6 @@ em_ioctl(struct ifnet *ifp, u_long comma
 			max_frame_size = 4096;
 			break;
 			/* Adapters that do not support jumbo frames */
-		case e1000_82583:
 		case e1000_ich8lan:
 			max_frame_size = ETHER_MAX_LEN;
 			break;
@@ -1145,11 +1149,6 @@ em_ioctl(struct ifnet *ifp, u_long comma
 		}
 		break;
 	case SIOCSIFMEDIA:
-		/*
-		** As the speed/duplex settings are being
-		** changed, we need to reset the PHY.
-		*/
-		adapter->hw.phy.reset_disable = FALSE;
 		/* Check SOL/IDER usage */
 		EM_CORE_LOCK(adapter);
 		if (e1000_check_reset_block(&adapter->hw)) {
@@ -1208,6 +1207,10 @@ em_ioctl(struct ifnet *ifp, u_long comma
 			ifp->if_capenable ^= IFCAP_VLAN_HWFILTER;
 			reinit = 1;
 		}
+		if (mask & IFCAP_VLAN_HWTSO) {
+			ifp->if_capenable ^= IFCAP_VLAN_HWTSO;
+			reinit = 1;
+		}
 		if ((mask & IFCAP_WOL) &&
 		    (ifp->if_capabilities & IFCAP_WOL) != 0) {
 			if (mask & IFCAP_WOL_MCAST)
@@ -1246,7 +1249,6 @@ em_init_locked(struct adapter *adapter)
 {
 	struct ifnet	*ifp = adapter->ifp;
 	device_t	dev = adapter->dev;
-	u32		pba;
 
 	INIT_DEBUGOUT("em_init: begin");
 
@@ -1255,46 +1257,6 @@ em_init_locked(struct adapter *adapter)
 	em_disable_intr(adapter);
 	callout_stop(&adapter->timer);
 
-	/*
-	 * Packet Buffer Allocation (PBA)
-	 * Writing PBA sets the receive portion of the buffer
-	 * the remainder is used for the transmit buffer.
-	 */
-	switch (adapter->hw.mac.type) {
-	/* Total Packet Buffer on these is 48K */
-	case e1000_82571:
-	case e1000_82572:
-	case e1000_80003es2lan:
-			pba = E1000_PBA_32K; /* 32K for Rx, 16K for Tx */
-		break;
-	case e1000_82573: /* 82573: Total Packet Buffer is 32K */
-			pba = E1000_PBA_12K; /* 12K for Rx, 20K for Tx */
-		break;
-	case e1000_82574:
-	case e1000_82583:
-			pba = E1000_PBA_20K; /* 20K for Rx, 20K for Tx */
-		break;
-	case e1000_ich8lan:
-		pba = E1000_PBA_8K;
-		break;
-	case e1000_ich9lan:
-	case e1000_ich10lan:
-		pba = E1000_PBA_10K;
-		break;
-	case e1000_pchlan:
-	case e1000_pch2lan:
-		pba = E1000_PBA_26K;
-		break;
-	default:
-		if (adapter->max_frame_size > 8192)
-			pba = E1000_PBA_40K; /* 40K for Rx, 24K for Tx */
-		else
-			pba = E1000_PBA_48K; /* 48K for Rx, 16K for Tx */
-	}
-
-	INIT_DEBUGOUT1("em_init: pba=%dK",pba);
-	E1000_WRITE_REG(&adapter->hw, E1000_PBA, pba);
-	
 	/* Get the latest mac address, User can use a LAA */
         bcopy(IF_LLADDR(adapter->ifp), adapter->hw.mac.addr,
               ETHER_ADDR_LEN);
@@ -1373,6 +1335,7 @@ em_init_locked(struct adapter *adapter)
 	/* Don't lose promiscuous settings */
 	em_set_promisc(adapter);
 
+	/* Set the interface as ACTIVE */
 	ifp->if_drv_flags |= IFF_DRV_RUNNING;
 	ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
 
@@ -1403,9 +1366,6 @@ em_init_locked(struct adapter *adapter)
 	/* AMT based hardware can now take control from firmware */
 	if (adapter->has_manage && adapter->has_amt)
 		em_get_hw_control(adapter);
-
-	/* Don't reset the phy next time init gets called */
-	adapter->hw.phy.reset_disable = TRUE;
 }
 
 static void
@@ -1995,6 +1955,14 @@ retry:
 		em_transmit_checksum_setup(txr, m_head,
 		    ip_off, ip, &txd_upper, &txd_lower);
 
+	if (m_head->m_flags & M_VLANTAG) {
+		/* Set the vlan id. */
+		txd_upper |=
+		    (htole16(m_head->m_pkthdr.ether_vtag) << 16);
+                /* Tell hardware to add tag */
+                txd_lower |= htole32(E1000_TXD_CMD_VLE);
+        }
+
 	i = txr->next_avail_desc;
 
 	/* Set up our transmit descriptors */
@@ -2052,15 +2020,13 @@ retry:
 	if (tso_desc) /* TSO used an extra for sentinel */
 		txr->tx_avail -= txd_used;
 
-	if (m_head->m_flags & M_VLANTAG) {
-		/* Set the vlan id. */
-		ctxd->upper.fields.special =
-		    htole16(m_head->m_pkthdr.ether_vtag);
-                /* Tell hardware to add tag */
-                ctxd->lower.data |= htole32(E1000_TXD_CMD_VLE);
-        }
-
         tx_buffer->m_head = m_head;
+	/*
+	** Here we swap the map so the last descriptor,
+	** which gets the completion interrupt has the
+	** real map, and the first descriptor gets the
+	** unused map from this descriptor.
+	*/
 	tx_buffer_mapped->map = tx_buffer->map;
 	tx_buffer->map = map;
         bus_dmamap_sync(txr->txtag, map, BUS_DMASYNC_PREWRITE);
@@ -2230,22 +2196,21 @@ em_local_timer(void *arg)
 	else
 		trigger = E1000_ICS_RXDMT0;
 
-	/* 
-	** Don't do TX watchdog check if we've been paused
-	*/
-	if (adapter->pause_frames) {
-		adapter->pause_frames = 0;
-		goto out;
-	}
 	/*
 	** Check on the state of the TX queue(s), this 
 	** can be done without the lock because its RO
 	** and the HUNG state will be static if set.
 	*/
-	for (int i = 0; i < adapter->num_queues; i++, txr++)
-		if (txr->queue_status == EM_QUEUE_HUNG)
+	for (int i = 0; i < adapter->num_queues; i++, txr++) {
+		if ((txr->queue_status == EM_QUEUE_HUNG) &&
+		    (adapter->pause_frames == 0))
 			goto hung;
-out:
+		/* Schedule a TX tasklet if needed */
+		if (txr->tx_avail <= EM_MAX_SCATTER)
+			taskqueue_enqueue(txr->tq, &txr->tx_task);
+	}
+	
+	adapter->pause_frames = 0;
 	callout_reset(&adapter->timer, hz, em_local_timer, adapter);
 #ifndef DEVICE_POLLING
 	/* Trigger an RX interrupt to guarantee mbuf refresh */
@@ -2264,6 +2229,7 @@ hung:
 	    txr->me, txr->tx_avail, txr->next_to_clean);
 	ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
 	adapter->watchdog_events++;
+	adapter->pause_frames = 0;
 	em_init_locked(adapter);
 }
 
@@ -2362,7 +2328,8 @@ em_stop(void *arg)
 	callout_stop(&adapter->timer);
 
 	/* Tell the stack that the interface is no longer active */
-	ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE);
+	ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
+	ifp->if_drv_flags |= IFF_DRV_OACTIVE;
 
         /* Unarm watchdog timer. */
 	for (int i = 0; i < adapter->num_queues; i++, txr++) {
@@ -2458,6 +2425,7 @@ int
 em_allocate_legacy(struct adapter *adapter)
 {
 	device_t dev = adapter->dev;
+	struct tx_ring	*txr = adapter->tx_rings;
 	int error, rid = 0;
 
 	/* Manually turn off all interrupts */
@@ -2479,11 +2447,17 @@ em_allocate_legacy(struct adapter *adapt
 	 * deferred processing contexts.
 	 */
 	TASK_INIT(&adapter->que_task, 0, em_handle_que, adapter);
-	TASK_INIT(&adapter->link_task, 0, em_handle_link, adapter);
 	adapter->tq = taskqueue_create_fast("em_taskq", M_NOWAIT,
 	    taskqueue_thread_enqueue, &adapter->tq);
-	taskqueue_start_threads(&adapter->tq, 1, PI_NET, "%s taskq",
+	taskqueue_start_threads(&adapter->tq, 1, PI_NET, "%s que",
+	    device_get_nameunit(adapter->dev));
+	/* Use a TX only tasklet for local timer */
+	TASK_INIT(&txr->tx_task, 0, em_handle_tx, txr);
+	txr->tq = taskqueue_create_fast("em_txq", M_NOWAIT,
+	    taskqueue_thread_enqueue, &txr->tq);
+	taskqueue_start_threads(&txr->tq, 1, PI_NET, "%s txq",
 	    device_get_nameunit(adapter->dev));
+	TASK_INIT(&adapter->link_task, 0, em_handle_link, adapter);
 	if ((error = bus_setup_intr(dev, adapter->res, INTR_TYPE_NET,
 	    em_irq_fast, NULL, adapter, &adapter->tag)) != 0) {
 		device_printf(dev, "Failed to register fast interrupt "
@@ -2500,7 +2474,8 @@ em_allocate_legacy(struct adapter *adapt
  *
  *  Setup the MSIX Interrupt handlers
  *   This is not really Multiqueue, rather
- *   its just multiple interrupt vectors.
+ *   its just seperate interrupt vectors
+ *   for TX, RX, and Link.
  *
  **********************************************************************/
 int
@@ -2692,7 +2667,6 @@ em_setup_msix(struct adapter *adapter)
 	device_t dev = adapter->dev;
 	int val = 0;
 
-
 	/*
 	** Setup MSI/X for Hartwell: tests have shown
 	** use of two queues to be unstable, and to
@@ -2712,16 +2686,18 @@ em_setup_msix(struct adapter *adapter)
 			goto msi;
        		}
 		val = pci_msix_count(dev); 
-		if (val < 3) {
+		/* We only need 3 vectors */
+		if (val > 3)
+			val = 3;
+		if ((val != 3) && (val != 5)) {
 			bus_release_resource(dev, SYS_RES_MEMORY,
 			    PCIR_BAR(EM_MSIX_BAR), adapter->msix_mem);
 			adapter->msix_mem = NULL;
                		device_printf(adapter->dev,
-			    "MSIX: insufficient vectors, using MSI\n");
+			    "MSIX: incorrect vectors, using MSI\n");
 			goto msi;
 		}
-		val = 3;
-		adapter->num_queues = 1;
+
 		if (pci_alloc_msix(dev, &val) == 0) {
 			device_printf(adapter->dev,
 			    "Using MSIX interrupts "
@@ -2756,6 +2732,7 @@ em_reset(struct adapter *adapter)
 	struct ifnet	*ifp = adapter->ifp;
 	struct e1000_hw	*hw = &adapter->hw;
 	u16		rx_buffer_size;
+	u32		pba;
 
 	INIT_DEBUGOUT("em_reset: begin");
 
@@ -2771,6 +2748,48 @@ em_reset(struct adapter *adapter)
 	}
 
 	/*
+	 * Packet Buffer Allocation (PBA)
+	 * Writing PBA sets the receive portion of the buffer
+	 * the remainder is used for the transmit buffer.
+	 */
+	switch (hw->mac.type) {
+	/* Total Packet Buffer on these is 48K */
+	case e1000_82571:
+	case e1000_82572:
+	case e1000_80003es2lan:
+			pba = E1000_PBA_32K; /* 32K for Rx, 16K for Tx */
+		break;
+	case e1000_82573: /* 82573: Total Packet Buffer is 32K */
+			pba = E1000_PBA_12K; /* 12K for Rx, 20K for Tx */
+		break;
+	case e1000_82574:
+	case e1000_82583:
+			pba = E1000_PBA_20K; /* 20K for Rx, 20K for Tx */
+		break;
+	case e1000_ich8lan:
+		pba = E1000_PBA_8K;
+		break;
+	case e1000_ich9lan:
+	case e1000_ich10lan:
+		/* Boost Receive side for jumbo frames */
+		if (adapter->max_frame_size > 4096)
+			pba = E1000_PBA_14K;
+		else
+			pba = E1000_PBA_10K;
+		break;
+	case e1000_pchlan:
+	case e1000_pch2lan:
+		pba = E1000_PBA_26K;
+		break;
+	default:
+		if (adapter->max_frame_size > 8192)
+			pba = E1000_PBA_40K; /* 40K for Rx, 24K for Tx */
+		else
+			pba = E1000_PBA_48K; /* 48K for Rx, 16K for Tx */
+	}
+	E1000_WRITE_REG(&adapter->hw, E1000_PBA, pba);
+
+	/*
 	 * These parameters control the automatic generation (Tx) and
 	 * response (Rx) to Ethernet PAUSE frames.
 	 * - High water mark should allow for at least two frames to be
@@ -2785,11 +2804,15 @@ em_reset(struct adapter *adapter)
 	 * - The pause time is fairly large at 1000 x 512ns = 512 usec.
 	 */
 	rx_buffer_size = ((E1000_READ_REG(hw, E1000_PBA) & 0xffff) << 10 );
-
 	hw->fc.high_water = rx_buffer_size -
 	    roundup2(adapter->max_frame_size, 1024);
 	hw->fc.low_water = hw->fc.high_water - 1500;
 
+	if (adapter->fc) /* locally set flow control value? */
+		hw->fc.requested_mode = adapter->fc;
+	else
+		hw->fc.requested_mode = e1000_fc_full;
+
 	if (hw->mac.type == e1000_80003es2lan)
 		hw->fc.pause_time = 0xFFFF;
 	else
@@ -2797,15 +2820,22 @@ em_reset(struct adapter *adapter)
 
 	hw->fc.send_xon = TRUE;
 
-        /* Set Flow control, use the tunable location if sane */
-	hw->fc.requested_mode = adapter->fc_setting;
-
-	/* Workaround: no TX flow ctrl for PCH */
-	if (hw->mac.type == e1000_pchlan)
+	/* Device specific overrides/settings */
+	switch (hw->mac.type) {
+	case e1000_pchlan:
+		/* Workaround: no TX flow ctrl for PCH */
                 hw->fc.requested_mode = e1000_fc_rx_pause;
-
-	/* Override - settings for PCH2LAN, ya its magic :) */
-	if (hw->mac.type == e1000_pch2lan) {
+		hw->fc.pause_time = 0xFFFF; /* override */
+		if (ifp->if_mtu > ETHERMTU) {
+			hw->fc.high_water = 0x3500;
+			hw->fc.low_water = 0x1500;
+		} else {
+			hw->fc.high_water = 0x5000;
+			hw->fc.low_water = 0x3000;
+		}
+		hw->fc.refresh_time = 0x1000;
+		break;
+	case e1000_pch2lan:
 		hw->fc.high_water = 0x5C20;
 		hw->fc.low_water = 0x5048;
 		hw->fc.pause_time = 0x0650;
@@ -2815,13 +2845,26 @@ em_reset(struct adapter *adapter)
 			E1000_WRITE_REG(hw, E1000_PBA, 12);
 		else
 			E1000_WRITE_REG(hw, E1000_PBA, 26);
+		break;
+        case e1000_ich9lan:
+        case e1000_ich10lan:
+		if (ifp->if_mtu > ETHERMTU) {
+			hw->fc.high_water = 0x2800;
+			hw->fc.low_water = hw->fc.high_water - 8;
+			break;
+		} 
+		/* else fall thru */
+	default:
+		if (hw->mac.type == e1000_80003es2lan)
+			hw->fc.pause_time = 0xFFFF;
+		break;
 	}
 
 	/* Issue a global reset */
 	e1000_reset_hw(hw);
 	E1000_WRITE_REG(hw, E1000_WUC, 0);
 	em_disable_aspm(adapter);
-
+	/* and a re-init */
 	if (e1000_init_hw(hw) < 0) {
 		device_printf(dev, "Hardware Initialization Failed\n");
 		return;
@@ -2866,28 +2909,25 @@ em_setup_interface(device_t dev, struct 
 	ifp->if_capabilities = ifp->if_capenable = 0;
 
 #ifdef EM_MULTIQUEUE
-	/* Multiqueue tx functions */
+	/* Multiqueue stack interface */
 	ifp->if_transmit = em_mq_start;
 	ifp->if_qflush = em_qflush;
 #endif	
 
 	ifp->if_capabilities |= IFCAP_HWCSUM | IFCAP_VLAN_HWCSUM;
-	ifp->if_capenable |= IFCAP_HWCSUM | IFCAP_VLAN_HWCSUM;
-
-	/* Enable TSO by default, can disable with ifconfig */
 	ifp->if_capabilities |= IFCAP_TSO4;
-	ifp->if_capenable |= IFCAP_TSO4;
-
 	/*
 	 * Tell the upper layer(s) we
 	 * support full VLAN capability
 	 */
 	ifp->if_data.ifi_hdrlen = sizeof(struct ether_vlan_header);
-	ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU;
-	ifp->if_capenable |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU;
+	ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING
+			     |  IFCAP_VLAN_HWTSO
+			     |  IFCAP_VLAN_MTU;
+	ifp->if_capenable = ifp->if_capabilities;
 
 	/*
-	** Dont turn this on by default, if vlans are
+	** Don't turn this on by default, if vlans are
 	** created on another pseudo device (eg. lagg)
 	** then vlan events are not passed thru, breaking
 	** operation, but with HW FILTER off it works. If
@@ -3339,11 +3379,6 @@ em_initialize_transmit_unit(struct adapt
 
 	/* Set the default values for the Tx Inter Packet Gap timer */
 	switch (adapter->hw.mac.type) {
-	case e1000_82542:
-		tipg = DEFAULT_82542_TIPG_IPGT;
-		tipg |= DEFAULT_82542_TIPG_IPGR1 << E1000_TIPG_IPGR1_SHIFT;
-		tipg |= DEFAULT_82542_TIPG_IPGR2 << E1000_TIPG_IPGR2_SHIFT;
-		break;
 	case e1000_80003es2lan:
 		tipg = DEFAULT_82543_TIPG_IPGR1;
 		tipg |= DEFAULT_80003ES2LAN_TIPG_IPGR2 <<
@@ -3813,9 +3848,12 @@ em_txeof(struct tx_ring *txr)
         /*
          * If we have a minimum free, clear IFF_DRV_OACTIVE
          * to tell the stack that it is OK to send packets.
+	 * Notice that all writes of OACTIVE happen under the
+	 * TX lock which, with a single queue, guarantees 
+	 * sanity.
          */
-        if (txr->tx_avail > EM_MAX_SCATTER)
-                ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
+        if (txr->tx_avail >= EM_MAX_SCATTER)
+		ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
 
 	/* Disable watchdog if all clean */
 	if (txr->tx_avail == adapter->num_tx_desc) {
@@ -3978,26 +4016,31 @@ em_setup_receive_ring(struct rx_ring *rx
 	struct	adapter 	*adapter = rxr->adapter;
 	struct em_buffer	*rxbuf;
 	bus_dma_segment_t	seg[1];
-	int			i, j, nsegs, error = 0;
+	int			rsize, nsegs, error;
 
 
 	/* Clear the ring contents */
 	EM_RX_LOCK(rxr);
+	rsize = roundup2(adapter->num_rx_desc *
+	    sizeof(struct e1000_rx_desc), EM_DBA_ALIGN);
+	bzero((void *)rxr->rx_base, rsize);
 
-	/* Invalidate all descriptors */
-	for (i = 0; i < adapter->num_rx_desc; i++) {
-		struct e1000_rx_desc* cur;
-		cur = &rxr->rx_base[i];
-		cur->status = 0;
+	/*
+	** Free current RX buffer structs and their mbufs
+	*/
+	for (int i = 0; i < adapter->num_rx_desc; i++) {
+		rxbuf = &rxr->rx_buffers[i];
+		if (rxbuf->m_head != NULL) {
+			bus_dmamap_sync(rxr->rxtag, rxbuf->map,
+			    BUS_DMASYNC_POSTREAD);
+			bus_dmamap_unload(rxr->rxtag, rxbuf->map);
+			m_freem(rxbuf->m_head);
+		}
 	}
 
 	/* Now replenish the mbufs */
-	i = j = rxr->next_to_refresh;
-	if (++j == adapter->num_rx_desc)
-		j = 0;
-
-	while (j != rxr->next_to_check) {
-		rxbuf = &rxr->rx_buffers[i];
+        for (int j = 0; j != adapter->num_rx_desc; ++j) {
+		rxbuf = &rxr->rx_buffers[j];
 		rxbuf->m_head = m_getjcl(M_DONTWAIT, MT_DATA,
 		    M_PKTHDR, adapter->rx_mbuf_sz);
 		if (rxbuf->m_head == NULL) {
@@ -4021,11 +4064,13 @@ em_setup_receive_ring(struct rx_ring *rx
 		    rxbuf->map, BUS_DMASYNC_PREREAD);
 
 		/* Update descriptor */
-		rxr->rx_base[i].buffer_addr = htole64(seg[0].ds_addr);
-		i = j;
-		if (++j == adapter->num_rx_desc)
-			j = 0;
+		rxr->rx_base[j].buffer_addr = htole64(seg[0].ds_addr);
 	}
+	rxr->next_to_check = 0;
+	rxr->next_to_refresh = 0;
+	bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map,
+	    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
+
 #ifdef DEV_NETMAP
     {
 	/*
@@ -4079,9 +4124,6 @@ em_setup_receive_ring(struct rx_ring *rx
 #endif /* DEV_NETMAP */
 
 fail:
-	rxr->next_to_refresh = i;
-	bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map,
-	    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
 	EM_RX_UNLOCK(rxr);
 	return (error);
 }
@@ -4108,10 +4150,9 @@ fail:
 	 * the rings that completed, the failing case will have
 	 * cleaned up for itself. 'q' failed, so its the terminus.
 	 */
-	for (int i = 0, n = 0; i < q; ++i) {
+	for (int i = 0; i < q; ++i) {
 		rxr = &adapter->rx_rings[i];
-		n = rxr->next_to_check;
-		while(n != rxr->next_to_refresh) {
+		for (int n = 0; n < adapter->num_rx_desc; n++) {
 			struct em_buffer *rxbuf;
 			rxbuf = &rxr->rx_buffers[n];
 			if (rxbuf->m_head != NULL) {
@@ -4121,8 +4162,6 @@ fail:
 				m_freem(rxbuf->m_head);
 				rxbuf->m_head = NULL;
 			}
-			if (++n == adapter->num_rx_desc)
-				n = 0;
 		}
 		rxr->next_to_check = 0;
 		rxr->next_to_refresh = 0;
@@ -4166,8 +4205,7 @@ em_free_receive_buffers(struct rx_ring *
 	INIT_DEBUGOUT("free_receive_buffers: begin");
 
 	if (rxr->rx_buffers != NULL) {
-		int i = rxr->next_to_check;
-		while(i != rxr->next_to_refresh) {
+		for (int i = 0; i < adapter->num_rx_desc; i++) {
 			rxbuf = &rxr->rx_buffers[i];
 			if (rxbuf->map != NULL) {
 				bus_dmamap_sync(rxr->rxtag, rxbuf->map,
@@ -4179,8 +4217,6 @@ em_free_receive_buffers(struct rx_ring *
 				m_freem(rxbuf->m_head);
 				rxbuf->m_head = NULL;
 			}
-			if (++i == adapter->num_rx_desc)
-				i = 0;
 		}
 		free(rxr->rx_buffers, M_DEVBUF);
 		rxr->rx_buffers = NULL;
@@ -4221,7 +4257,9 @@ em_initialize_receive_unit(struct adapte
 	 * up the descriptor ring
 	 */
 	rctl = E1000_READ_REG(hw, E1000_RCTL);
-	E1000_WRITE_REG(hw, E1000_RCTL, rctl & ~E1000_RCTL_EN);
+	/* Do not disable if ever enabled on this hardware */
+	if ((hw->mac.type != e1000_82574) && (hw->mac.type != e1000_82583))
+		E1000_WRITE_REG(hw, E1000_RCTL, rctl & ~E1000_RCTL_EN);
 
 	E1000_WRITE_REG(&adapter->hw, E1000_RADV,
 	    adapter->rx_abs_int_delay.value);
@@ -4235,14 +4273,13 @@ em_initialize_receive_unit(struct adapte
 	** When using MSIX interrupts we need to throttle
 	** using the EITR register (82574 only)
 	*/
-	if (hw->mac.type == e1000_82574)
+	if (hw->mac.type == e1000_82574) {
 		for (int i = 0; i < 4; i++)
 			E1000_WRITE_REG(hw, E1000_EITR_82574(i),
 			    DEFAULT_ITR);
-
-	/* Disable accelerated ackknowledge */
-	if (adapter->hw.mac.type == e1000_82574)
+		/* Disable accelerated acknowledge */
 		E1000_WRITE_REG(hw, E1000_RFCTL, E1000_RFCTL_ACK_DIS);
+	}
 
 	if (ifp->if_capenable & IFCAP_RXCSUM) {
 		rxcsum = E1000_READ_REG(hw, E1000_RXCSUM);
@@ -4268,7 +4305,8 @@ em_initialize_receive_unit(struct adapte
 		E1000_WRITE_REG(hw, E1000_RDBAH(i), (u32)(bus_addr >> 32));
 		E1000_WRITE_REG(hw, E1000_RDBAL(i), (u32)bus_addr);
 		/* Setup the Head and Tail Descriptor Pointers */
-		E1000_WRITE_REG(hw, E1000_RDH(i), rxr->next_to_check);
+		E1000_WRITE_REG(hw, E1000_RDH(i), 0);
+		E1000_WRITE_REG(hw, E1000_RDT(i), adapter->num_rx_desc - 1);
 #ifdef DEV_NETMAP
 		/*
 		 * an init() while a netmap client is active must
@@ -4286,17 +4324,16 @@ em_initialize_receive_unit(struct adapte
 			E1000_WRITE_REG(hw, E1000_RDT(i), t);
 		} else
 #endif /* DEV_NETMAP */
-		E1000_WRITE_REG(hw, E1000_RDT(i), rxr->next_to_refresh);
+		E1000_WRITE_REG(hw, E1000_RDT(i), adapter->num_rx_desc - 1);
 	}
 
-	/* Set early receive threshold on appropriate hw */
+	/* Set PTHRESH for improved jumbo performance */
 	if (((adapter->hw.mac.type == e1000_ich9lan) ||
 	    (adapter->hw.mac.type == e1000_pch2lan) ||
 	    (adapter->hw.mac.type == e1000_ich10lan)) &&
 	    (ifp->if_mtu > ETHERMTU)) {
 		u32 rxdctl = E1000_READ_REG(hw, E1000_RXDCTL(0));
 		E1000_WRITE_REG(hw, E1000_RXDCTL(0), rxdctl | 3);
-		E1000_WRITE_REG(hw, E1000_ERT, 0x100 | (1 << 13));
 	}
 		
 	if (adapter->hw.mac.type == e1000_pch2lan) {
@@ -4443,10 +4480,6 @@ em_rxeof(struct rx_ring *rxr, int count,
 				    E1000_RXD_SPC_VLAN_MASK);
 				sendmp->m_flags |= M_VLANTAG;
 			}
-#ifdef EM_MULTIQUEUE
-			sendmp->m_pkthdr.flowid = rxr->msix;
-			sendmp->m_flags |= M_FLOWID;
-#endif
 #ifndef __NO_STRICT_ALIGNMENT
 skip:
 #endif
@@ -4921,7 +4954,7 @@ em_enable_wakeup(device_t dev)
 	    (adapter->hw.mac.type == e1000_pchlan) ||
 	    (adapter->hw.mac.type == e1000_ich9lan) ||
 	    (adapter->hw.mac.type == e1000_ich10lan))
-		e1000_disable_gig_wol_ich8lan(&adapter->hw);
+		e1000_suspend_workarounds_ich8lan(&adapter->hw);
 
 	/* Keep the laser running on Fiber adapters */
 	if (adapter->hw.phy.media_type == e1000_media_type_fiber ||
@@ -5518,7 +5551,7 @@ em_add_hw_stats(struct adapter *adapter)
 static int
 em_sysctl_nvm_info(SYSCTL_HANDLER_ARGS)
 {
-	struct adapter *adapter;
+	struct adapter *adapter = (struct adapter *)arg1;
 	int error;
 	int result;
 
@@ -5533,10 +5566,8 @@ em_sysctl_nvm_info(SYSCTL_HANDLER_ARGS)
 	 * first 32 16-bit words of the EEPROM to
 	 * the screen.
 	 */
-	if (result == 1) {
-		adapter = (struct adapter *)arg1;
+	if (result == 1)
 		em_print_nvm_info(adapter);
-        }
 
 	return (error);
 }
@@ -5626,6 +5657,49 @@ em_set_sysctl_value(struct adapter *adap
 	    OID_AUTO, name, CTLTYPE_INT|CTLFLAG_RW, limit, value, description);
 }
 
+
+/*
+** Set flow control using sysctl:
+** Flow control values:
+**      0 - off
+**      1 - rx pause
+**      2 - tx pause
+**      3 - full
+*/
+static int
+em_set_flowcntl(SYSCTL_HANDLER_ARGS)
+{       
+        int		error;
+	static int	input = 3; /* default is full */
+        struct adapter	*adapter = (struct adapter *) arg1;
+                    
+        error = sysctl_handle_int(oidp, &input, 0, req);
+    
+        if ((error) || (req->newptr == NULL))
+                return (error);
+                
+	if (input == adapter->fc) /* no change? */
+		return (error);
+
+        switch (input) {
+                case e1000_fc_rx_pause:
+                case e1000_fc_tx_pause:
+                case e1000_fc_full:
+                case e1000_fc_none:
+                        adapter->hw.fc.requested_mode = input;
+			adapter->fc = input;
+                        break;
+                default:
+			/* Do nothing */
+			return (error);
+        }
+
+        adapter->hw.fc.current_mode = adapter->hw.fc.requested_mode;
+        e1000_force_mac_fc(&adapter->hw);
+        return (error);
+}
+
+
 static int
 em_sysctl_debug_info(SYSCTL_HANDLER_ARGS)
 {
@@ -5662,10 +5736,11 @@ em_print_debug_info(struct adapter *adap
 		printf("Interface is RUNNING ");
 	else
 		printf("Interface is NOT RUNNING\n");
+
 	if (adapter->ifp->if_drv_flags & IFF_DRV_OACTIVE)
-		printf("and ACTIVE\n");
-	else
 		printf("and INACTIVE\n");
+	else
+		printf("and ACTIVE\n");
 
 	device_printf(dev, "hw tdh = %d, hw tdt = %d\n",
 	    E1000_READ_REG(&adapter->hw, E1000_TDH(0)),

Modified: head/sys/dev/e1000/if_em.h
==============================================================================
--- head/sys/dev/e1000/if_em.h	Sat Dec 10 06:55:02 2011	(r228386)
+++ head/sys/dev/e1000/if_em.h	Sat Dec 10 07:08:52 2011	(r228387)
@@ -212,7 +212,8 @@
 #define EM_BAR_MEM_TYPE_64BIT	0x00000004
 #define EM_MSIX_BAR		3	/* On 82575 */
 
-#if !defined(SYSCTL_ADD_UQUAD)
+/* More backward compatibility */
+#if __FreeBSD_version < 900000
 #define SYSCTL_ADD_UQUAD SYSCTL_ADD_QUAD
 #endif
 
@@ -418,11 +419,11 @@ struct adapter {
 	u32		shadow_vfta[EM_VFTA_SIZE];
 
 	/* Info about the interface */
-	u8		link_active;
+	u16		link_active;
+	u16		fc;
 	u16		link_speed;

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***


More information about the svn-src-head mailing list