svn commit: r257529 - in head: share/man/man4 sys/conf sys/dev/e1000 sys/dev/ixgbe sys/dev/netmap sys/dev/re sys/net tools/tools/netmap
Luigi Rizzo
luigi at FreeBSD.org
Fri Nov 1 21:21:17 UTC 2013
Author: luigi
Date: Fri Nov 1 21:21:14 2013
New Revision: 257529
URL: http://svnweb.freebsd.org/changeset/base/257529
Log:
update to the latest netmap snapshot.
This includes the following:
- use separate memory regions for VALE ports
- locking fixes
- some simplifications in the NIC-specific routines
- performance improvements for the VALE switch
- some new features in the pkt-gen test program
- documentation updates
There are small API changes that require programs to be recompiled
(NETMAP_API has been bumped so you will detect old binaries at runtime).
In particular:
- struct netmap_slot now is 16 bytes to support an extra pointer,
which may save one data copy when using VALE ports or VMs;
- the struct netmap_if has two extra fields;
MFC after: 3 days
Modified:
head/share/man/man4/netmap.4
head/sys/conf/files
head/sys/dev/e1000/if_em.c
head/sys/dev/e1000/if_igb.c
head/sys/dev/e1000/if_lem.c
head/sys/dev/e1000/if_lem.h
head/sys/dev/ixgbe/ixgbe.c
head/sys/dev/netmap/if_em_netmap.h
head/sys/dev/netmap/if_igb_netmap.h
head/sys/dev/netmap/if_lem_netmap.h
head/sys/dev/netmap/if_re_netmap.h
head/sys/dev/netmap/ixgbe_netmap.h
head/sys/dev/netmap/netmap.c
head/sys/dev/netmap/netmap_kern.h
head/sys/dev/netmap/netmap_mem2.c
head/sys/dev/re/if_re.c
head/sys/net/netmap.h
head/tools/tools/netmap/nm_util.c
head/tools/tools/netmap/pkt-gen.c
Modified: head/share/man/man4/netmap.4
==============================================================================
--- head/share/man/man4/netmap.4 Fri Nov 1 21:17:45 2013 (r257528)
+++ head/share/man/man4/netmap.4 Fri Nov 1 21:21:14 2013 (r257529)
@@ -1,4 +1,4 @@
-.\" Copyright (c) 2011 Matteo Landi, Luigi Rizzo, Universita` di Pisa
+.\" Copyright (c) 2011-2013 Matteo Landi, Luigi Rizzo, Universita` di Pisa
.\" All rights reserved.
.\"
.\" Redistribution and use in source and binary forms, with or without
@@ -21,14 +21,13 @@
.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
.\" SUCH DAMAGE.
-.\"
+.\"
.\" This document is derived in part from the enet man page (enet.4)
.\" distributed with 4.3BSD Unix.
.\"
.\" $FreeBSD$
-.\" $Id: netmap.4 11563 2012-08-02 08:59:12Z luigi $: stable/8/share/man/man4/bpf.4 181694 2008-08-13 17:45:06Z ed $
.\"
-.Dd September 23, 2013
+.Dd October 18, 2013
.Dt NETMAP 4
.Os
.Sh NAME
@@ -38,101 +37,230 @@
.Cd device netmap
.Sh DESCRIPTION
.Nm
-is a framework for fast and safe access to network devices
-(reaching 14.88 Mpps at less than 1 GHz).
-.Nm
-uses memory mapped buffers and metadata
-(buffer indexes and lengths) to communicate with the kernel,
-which is in charge of validating information through
-.Pa ioctl()
-and
-.Pa select()/poll().
+is a framework for extremely fast and efficient packet I/O
+(reaching 14.88 Mpps with a single core at less than 1 GHz)
+for both userspace and kernel clients.
+Userspace clients can use the netmap API
+to send and receive raw packets through physical interfaces
+or ports of the
+.Xr VALE 4
+switch.
+.Pp
+.Nm VALE
+is a very fast (reaching 20 Mpps per port)
+and modular software switch,
+implemented within the kernel, which can interconnect
+virtual ports, physical devices, and the native host stack.
+.Pp
+.Nm
+uses a memory mapped region to share packet buffers,
+descriptors and queues with the kernel.
+Simple
+.Pa ioctl()s
+are used to bind interfaces/ports to file descriptors and
+implement non-blocking I/O, whereas blocking I/O uses
+.Pa select()/poll() .
.Nm
can exploit the parallelism in multiqueue devices and
multicore systems.
.Pp
+For the best performance,
+.Nm
+requires explicit support in device drivers;
+a generic emulation layer is available to implement the
.Nm
-requires explicit support in device drivers.
-For a list of supported devices, see the end of this manual page.
-.Sh OPERATION
+API on top of unmodified device drivers,
+at the price of reduced performance
+(but still better than what can be achieved with
+sockets or BPF/pcap).
+.Pp
+For a list of devices with native
.Nm
-clients must first open the
+support, see the end of this manual page.
+.Pp
+.Sh OPERATION - THE NETMAP API
+.Nm
+clients must first
.Pa open("/dev/netmap") ,
and then issue an
-.Pa ioctl(...,NIOCREGIF,...)
-to bind the file descriptor to a network device.
-.Pp
-When a device is put in
+.Pa ioctl(fd, NIOCREGIF, (struct nmreq *)arg)
+to bind the file descriptor to a specific interface or port.
.Nm
-mode, its data path is disconnected from the host stack.
-The processes owning the file descriptor
-can exchange packets with the device, or with the host stack,
-through an mmapped memory region that contains pre-allocated
-buffers and metadata.
+has multiple modes of operation controlled by the
+content of the
+.Pa struct nmreq
+passed to the
+.Pa ioctl() .
+In particular, the
+.Em nr_name
+field specifies whether the client operates on a physical network
+interface or on a port of a
+.Nm VALE
+switch, as indicated below. Additional fields in the
+.Pa struct nmreq
+control the details of operation.
+.Pp
+.Bl -tag -width XXXX
+.It Dv Interface name (e.g. 'em0', 'eth1', ... )
+The data path of the interface is disconnected from the host stack.
+Depending on additional arguments,
+the file descriptor is bound to the NIC (one or all queues),
+or to the host stack.
+.It Dv valeXXX:YYY (arbitrary XXX and YYY)
+The file descriptor is bound to port YYY of a VALE switch called XXX,
+where XXX and YYY are arbitrary alphanumeric strings.
+The string cannot exceed IFNAMSIZ characters, and YYY cannot
+match the name of any existing interface.
+.Pp
+The switch and the port are created if not existing.
+.It Dv valeXXX:ifname (ifname is an existing interface)
+Flags in the argument control whether the physical interface
+(and optionally the corresponding host stack endpoint)
+are connected or disconnected from the VALE switch named XXX.
.Pp
+In this case the
+.Pa ioctl()
+is used only for configuring the VALE switch, typically through the
+.Nm vale-ctl
+command.
+The file descriptor cannot be used for I/O, and should be
+.Pa close()d
+after issuing the
+.Pa ioctl() .
+.El
+.Pp
+The binding can be removed (and the interface returns to
+regular operation, or the virtual port destroyed) with a
+.Pa close()
+on the file descriptor.
+.Pp
+The processes owning the file descriptor can then
+.Pa mmap()
+the memory region that contains pre-allocated
+buffers, descriptors and queues, and use them to
+read/write raw packets.
Non blocking I/O is done with special
.Pa ioctl()'s ,
whereas the file descriptor can be passed to
.Pa select()/poll()
to be notified about incoming packet or available transmit buffers.
-.Ss Data structures
-All data structures for all devices in
+.Ss DATA STRUCTURES
+The data structures in the mmapped memory are described below
+(see
+.Xr sys/net/netmap.h
+for reference).
+All physical devices operating in
.Nm
-mode are in a memory
-region shared by the kernel and all processes
-who open
+mode use the same memory region,
+shared by the kernel and all processes who own
.Pa /dev/netmap
+descriptors bound to those devices
(NOTE: visibility may be restricted in future implementations).
+Virtual ports instead use separate memory regions,
+shared only with the kernel.
+.Pp
All references between the shared data structure
are relative (offsets or indexes). Some macros help converting
them into actual pointers.
-.Pp
-The data structures in shared memory are the following:
.Bl -tag -width XXX
.It Dv struct netmap_if (one per interface)
indicates the number of rings supported by an interface, their
sizes, and the offsets of the
.Pa netmap_rings
associated to the interface.
-The offset of a
+.Pp
.Pa struct netmap_if
-in the shared memory region is indicated by the
+is at offset
.Pa nr_offset
+in the shared memory region, as indicated by the
field in the structure returned by the
.Pa NIOCREGIF
(see below).
.Bd -literal
struct netmap_if {
- char ni_name[IFNAMSIZ]; /* name of the interface. */
- const u_int ni_num_queues; /* number of hw ring pairs */
- const ssize_t ring_ofs[]; /* offset of tx and rx rings */
+ char ni_name[IFNAMSIZ]; /* name of the interface. */
+ const u_int ni_version; /* API version */
+ const u_int ni_rx_rings; /* number of rx ring pairs */
+ const u_int ni_tx_rings; /* if 0, same as ni_rx_rings */
+ const ssize_t ring_ofs[]; /* offset of tx and rx rings */
};
.Ed
.It Dv struct netmap_ring (one per ring)
-contains the index of the current read or write slot (cur),
-the number of slots available for reception or transmission (avail),
+Contains the positions in the transmit and receive rings to
+synchronize the kernel and the application,
and an array of
.Pa slots
describing the buffers.
-There is one ring pair for each of the N hardware ring pairs
-supported by the card (numbered 0..N-1), plus
-one ring pair (numbered N) for packets from/to the host stack.
+'reserved' is used in receive rings to tell the kernel the
+number of slots before 'cur' that are still in use
+by the application and therefore cannot yet be
+released (see the description of receive rings below).
+.Pp
+Each physical interface has one
+.Pa netmap_ring
+for each hardware transmit and receive ring,
+plus one extra transmit and one receive structure
+that connect to the host stack.
.Bd -literal
struct netmap_ring {
- const ssize_t buf_ofs;
- const uint32_t num_slots; /* number of slots in the ring. */
- uint32_t avail; /* number of usable slots */
- uint32_t cur; /* 'current' index for the user side */
- uint32_t reserved; /* not refilled before current */
+ const ssize_t buf_ofs; /* see details */
+ const uint32_t num_slots; /* number of slots in the ring */
+ uint32_t avail; /* number of usable slots */
+ uint32_t cur; /* 'current' read/write index */
+ uint32_t reserved; /* not refilled before current */
const uint16_t nr_buf_size;
- uint16_t flags;
- struct netmap_slot slot[0]; /* array of slots. */
+ uint16_t flags;
+#define NR_TIMESTAMP 0x0002 /* set timestamp on *sync() */
+#define NR_FORWARD 0x0004 /* enable NS_FORWARD for ring */
+#define NR_RX_TSTMP 0x0008 /* set rx timestamp in slots */
+ struct timeval ts;
+ struct netmap_slot slot[0]; /* array of slots */
}
.Ed
+.Pp
+In transmit rings, after a system call 'cur' indicates
+the first slot that can be used for transmissions,
+and 'avail' reports how many of them are available.
+Before the next netmap-related system call on the file
+descriptor, the application should fill buffers and
+slots with data, and update 'cur' and 'avail'
+accordingly, as shown in the figure below:
+.Bd -literal
+
+ cur
+ |----- avail ---| (after syscall)
+ v
+ TX [*****aaaaaaaaaaaaaaaaa**]
+ TX [*****TTTTTaaaaaaaaaaaa**]
+ ^
+ |-- avail --| (before syscall)
+ cur
+.Ed
+
+In receive rings, after a system call 'cur' indicates
+the first slot that contains a valid packet,
+and 'avail' reports how many of them are available.
+Before the next netmap-related system call on the file
+descriptor, the application can process buffers and
+release them to the kernel updating
+'cur' and 'avail' accordingly, as shown in the figure below.
+Receive rings have an additional field called 'reserved'
+to indicate how many buffers before 'cur' are still
+under processing and cannot be released.
+.Bd -literal
+ cur
+ |-res-|-- avail --| (after syscall)
+ v
+ RX [**rrrrrrRRRRRRRRRRRR******]
+ RX [**...........rrrrRRR******]
+ |res|--|<avail (before syscall)
+ ^
+ cur
+
+.Ed
.It Dv struct netmap_slot (one per packet)
-contains the metadata for a packet: a buffer index (buf_idx),
-a buffer length (len), and some flags.
+contains the metadata for a packet:
.Bd -literal
struct netmap_slot {
uint32_t buf_idx; /* buffer index */
@@ -142,23 +270,94 @@ struct netmap_slot {
#define NS_REPORT 0x0002 /* tell hw to report results
* e.g. by generating an interrupt
*/
+#define NS_FORWARD 0x0004 /* pass packet to the other endpoint
+ * (host stack or device)
+ */
+#define NS_NO_LEARN 0x0008
+#define NS_INDIRECT 0x0010
+#define NS_MOREFRAG 0x0020
+#define NS_PORT_SHIFT 8
+#define NS_PORT_MASK (0xff << NS_PORT_SHIFT)
+#define NS_RFRAGS(_slot) ( ((_slot)->flags >> 8) & 0xff)
+ uint64_t ptr; /* buffer address (indirect buffers) */
};
.Ed
+The flags control how the buffer associated to the slot
+should be managed.
.It Dv packet buffers
-are fixed size (approximately 2k) buffers allocated by the kernel
+are normally fixed size (2 Kbyte) buffers allocated by the kernel
that contain packet data. Buffers addresses are computed through
macros.
.El
.Pp
+.Bl -tag -width XXX
Some macros support the access to objects in the shared memory
-region. In particular:
-.Bd -literal
-struct netmap_if *nifp;
-struct netmap_ring *txring = NETMAP_TXRING(nifp, i);
-struct netmap_ring *rxring = NETMAP_RXRING(nifp, i);
-int i = txring->slot[txring->cur].buf_idx;
-char *buf = NETMAP_BUF(txring, i);
-.Ed
+region. In particular,
+.It NETMAP_TXRING(nifp, i)
+.It NETMAP_RXRING(nifp, i)
+return the address of the i-th transmit and receive ring,
+respectively, whereas
+.It NETMAP_BUF(ring, buf_idx)
+returns the address of the buffer with index buf_idx
+(which can be part of any ring for the given interface).
+.El
+.Pp
+Normally, buffers are associated to slots when interfaces are bound,
+and one packet is fully contained in a single buffer.
+Clients can however modify the mapping using the
+following flags:
+.Ss FLAGS
+.Bl -tag -width XXX
+.It NS_BUF_CHANGED
+indicates that the buf_idx in the slot has changed.
+This can be useful if the client wants to implement
+some form of zero-copy forwarding (e.g. by passing buffers
+from an input interface to an output interface), or
+needs to process packets out of order.
+.Pp
+The flag MUST be used whenever the buffer index is changed.
+.It NS_REPORT
+indicates that we want to be woken up when this buffer
+has been transmitted. This reduces performance but ensures
+a prompt notification when a buffer has been sent.
+Normally,
+.Nm
+notifies transmit completions in batches, hence signals
+can be delayed indefinitely. However, we need such notifications
+before closing a descriptor.
+.It NS_FORWARD
+When the device is open in 'transparent' mode,
+the client can mark slots in receive rings with this flag.
+Packets in marked slots are forwarded to
+the other endpoint at the next system call, thus restoring
+(in a selective way) the connection between the NIC and the
+host stack.
+.It NS_NO_LEARN
+tells the forwarding code that the SRC MAC address for this
+packet should not be used in the learning bridge.
+.It NS_INDIRECT
+indicates that the packet's payload is not in the netmap
+supplied buffer, but in a user-supplied buffer whose
+user virtual address is in the 'ptr' field of the slot.
+The size can reach 65535 bytes.
+.Em This is only supported on the transmit ring of virtual ports
+.It NS_MOREFRAG
+indicates that the packet continues with subsequent buffers;
+the last buffer in a packet must have the flag clear.
+The maximum length of a chain is 64 buffers.
+.Em This is only supported on virtual ports
+.It ns_ctr
+on receive rings, contains the number of remaining buffers
+in a packet, including this one.
+Slots with a value greater than 1 also have NS_MOREFRAG set.
+The length refers to the individual buffer; there is no
+field for the total length.
+XXX maybe put it in the ptr field ?
+.Pp
+On transmit rings, if NS_DST is set, it is passed to the lookup
+function, which can use it e.g. as the index of the destination
+port instead of doing an address lookup.
+.El
.Sh IOCTLS
.Nm
supports some ioctl() to synchronize the state of the rings
@@ -166,13 +365,13 @@ between the kernel and the user processe
to query and configure the interface.
The former do not require any argument, whereas the latter
use a
-.Pa struct netmap_req
+.Pa struct nmreq
defined as follows:
.Bd -literal
struct nmreq {
char nr_name[IFNAMSIZ];
uint32_t nr_version; /* API version */
-#define NETMAP_API 3 /* current version */
+#define NETMAP_API 4 /* current version */
uint32_t nr_offset; /* nifp offset in the shared region */
uint32_t nr_memsize; /* size of the shared region */
uint32_t nr_tx_slots; /* slots in tx rings */
@@ -184,8 +383,14 @@ struct nmreq {
#define NETMAP_SW_RING 0x2000 /* we process the sw ring */
#define NETMAP_NO_TX_POLL 0x1000 /* no gratuitous txsync on poll */
#define NETMAP_RING_MASK 0xfff /* the actual ring number */
- uint16_t spare1;
- uint32_t spare2[4];
+ uint16_t nr_cmd;
+#define NETMAP_BDG_ATTACH 1 /* attach the NIC */
+#define NETMAP_BDG_DETACH 2 /* detach the NIC */
+#define NETMAP_BDG_LOOKUP_REG 3 /* register lookup function */
+#define NETMAP_BDG_LIST 4 /* get bridge's info */
+ uint16_t nr_arg1;
+ uint16_t nr_arg2;
+ uint32_t spare2[3];
};
.Ed
@@ -200,15 +405,27 @@ command codes below are defined in
and are:
.Bl -tag -width XXXX
.It Dv NIOCGINFO
-returns information about the interface named in nr_name.
-On return, nr_memsize indicates the size of the shared netmap
-memory region (this is device-independent),
-nr_tx_slots and nr_rx_slots indicates how many buffers are in a
-transmit and receive ring,
-nr_tx_rings and nr_rx_rings indicates the number of transmit
-and receive rings supported by the hardware.
-.Pp
-If the device does not support netmap, the ioctl returns EINVAL.
+returns EINVAL if the named device does not support netmap.
+Otherwise, it returns 0 and (advisory) information
+about the interface.
+Note that all the information below can change before the
+interface is actually put in netmap mode.
+.Pp
+.Pa nr_memsize
+indicates the size of the netmap
+memory region. Physical devices all share the same memory region,
+whereas VALE ports may have independent regions for each port.
+These sizes can be set through system-wide sysctl variables.
+.Pa nr_tx_slots, nr_rx_slots
+indicate the size of transmit and receive rings.
+.Pa nr_tx_rings, nr_rx_rings
+indicate the number of transmit
+and receive rings.
+Both ring number and sizes may be configured at runtime
+using interface-specific functions (e.g.
+.Pa sysctl
+or
+.Pa ethtool ) .
.It Dv NIOCREGIF
puts the interface named in nr_name into netmap mode, disconnecting
it from the host stack, and/or defines which rings are controlled
@@ -243,8 +460,11 @@ or the send queue is full.
.Pa NIOCREGIF
can be used multiple times to change the association of a
file descriptor to a ring pair, always within the same device.
-.It Dv NIOCUNREGIF
-brings an interface back to normal mode.
+.Pp
+When registering a virtual interface that is dynamically created to a
+.Xr vale 4
+switch, we can specify the desired number of rings (1 by default,
+and currently up to 16) on it using nr_tx_rings and nr_rx_rings fields.
.It Dv NIOCTXSYNC
tells the hardware of new packets to transmit, and updates the
number of slots available for transmission.
@@ -255,10 +475,20 @@ packets.
.Sh SYSTEM CALLS
.Nm
uses
-.Nm select
+.Xr select 2
and
-.Nm poll
-to wake up processes when significant events occur.
+.Xr poll 2
+to wake up processes when significant events occur, and
+.Xr mmap 2
+to map memory.
+.Pp
+Applications may need to create threads and bind them to
+specific cores to improve performance, using standard
+OS primitives, see
+.Xr pthread 3 .
+In particular,
+.Xr pthread_setaffinity_np 3
+may be of use.
.Sh EXAMPLES
The following code implements a traffic generator
.Pp
@@ -272,10 +502,10 @@ struct nmreq nmr;
fd = open("/dev/netmap", O_RDWR);
bzero(&nmr, sizeof(nmr));
strcpy(nmr.nr_name, "ix0");
-nmr.nr_version = NETMAP_API;
-ioctl(fd, NIOCREG, &nmr);
+nmr.nr_version = NETMAP_API;
+ioctl(fd, NIOCREGIF, &nmr);
p = mmap(0, nmr.nr_memsize, fd);
-nifp = NETMAP_IF(p, nmr.offset);
+nifp = NETMAP_IF(p, nmr.nr_offset);
ring = NETMAP_TXRING(nifp, 0);
fds.fd = fd;
fds.events = POLLOUT;
@@ -312,13 +542,17 @@ Usenix ATC'12, June 2012, Boston
.An -nosplit
The
.Nm
-framework has been designed and implemented at the
+framework was originally designed and implemented at the
Universita` di Pisa in 2011 by
.An Luigi Rizzo ,
-with help from
+and further extended with help from
.An Matteo Landi ,
.An Gaetano Catalli ,
-.An Giuseppe Lettieri .
+.An Giuseppe Lettieri ,
+.An Vincenzo Maffione .
.Pp
.Nm
-has been funded by the European Commission within FP7 Project CHANGE (257422).
+and
+.Nm VALE
+have been funded by the European Commission within FP7 Projects
+CHANGE (257422) and OPENLAB (287581).
Modified: head/sys/conf/files
==============================================================================
--- head/sys/conf/files Fri Nov 1 21:17:45 2013 (r257528)
+++ head/sys/conf/files Fri Nov 1 21:21:14 2013 (r257529)
@@ -1881,6 +1881,8 @@ dev/nand/nfc_if.m optional nand
dev/ncv/ncr53c500.c optional ncv
dev/ncv/ncr53c500_pccard.c optional ncv pccard
dev/netmap/netmap.c optional netmap
+dev/netmap/netmap_mem2.c optional netmap
+# compile-with "${NORMAL_C} -Wconversion -Wextra"
dev/nge/if_nge.c optional nge
dev/nxge/if_nxge.c optional nxge \
compile-with "${NORMAL_C} ${NO_WSELF_ASSIGN}"
Modified: head/sys/dev/e1000/if_em.c
==============================================================================
--- head/sys/dev/e1000/if_em.c Fri Nov 1 21:17:45 2013 (r257528)
+++ head/sys/dev/e1000/if_em.c Fri Nov 1 21:21:14 2013 (r257529)
@@ -3836,8 +3836,7 @@ em_txeof(struct tx_ring *txr)
EM_TX_LOCK_ASSERT(txr);
#ifdef DEV_NETMAP
- if (netmap_tx_irq(ifp, txr->me |
- (NETMAP_LOCKED_ENTER | NETMAP_LOCKED_EXIT)))
+ if (netmap_tx_irq(ifp, txr->me))
return;
#endif /* DEV_NETMAP */
@@ -4101,7 +4100,7 @@ em_setup_receive_ring(struct rx_ring *rx
sizeof(struct e1000_rx_desc), EM_DBA_ALIGN);
bzero((void *)rxr->rx_base, rsize);
#ifdef DEV_NETMAP
- slot = netmap_reset(na, NR_RX, 0, 0);
+ slot = netmap_reset(na, NR_RX, rxr->me, 0);
#endif
/*
@@ -4433,8 +4432,10 @@ em_rxeof(struct rx_ring *rxr, int count,
EM_RX_LOCK(rxr);
#ifdef DEV_NETMAP
- if (netmap_rx_irq(ifp, rxr->me | NETMAP_LOCKED_ENTER, &processed))
+ if (netmap_rx_irq(ifp, rxr->me, &processed)) {
+ EM_RX_UNLOCK(rxr);
return (FALSE);
+ }
#endif /* DEV_NETMAP */
for (i = rxr->next_to_check, processed = 0; count != 0;) {
Modified: head/sys/dev/e1000/if_igb.c
==============================================================================
--- head/sys/dev/e1000/if_igb.c Fri Nov 1 21:17:45 2013 (r257528)
+++ head/sys/dev/e1000/if_igb.c Fri Nov 1 21:21:14 2013 (r257529)
@@ -3962,8 +3962,7 @@ igb_txeof(struct tx_ring *txr)
mtx_assert(&txr->tx_mtx, MA_OWNED);
#ifdef DEV_NETMAP
- if (netmap_tx_irq(ifp, txr->me |
- (NETMAP_LOCKED_ENTER|NETMAP_LOCKED_EXIT)))
+ if (netmap_tx_irq(ifp, txr->me))
return (FALSE);
#endif /* DEV_NETMAP */
@@ -4829,8 +4828,10 @@ igb_rxeof(struct igb_queue *que, int cou
BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
#ifdef DEV_NETMAP
- if (netmap_rx_irq(ifp, rxr->me | NETMAP_LOCKED_ENTER, &processed))
+ if (netmap_rx_irq(ifp, rxr->me, &processed)) {
+ IGB_RX_UNLOCK(rxr);
return (FALSE);
+ }
#endif /* DEV_NETMAP */
/* Main clean loop */
Modified: head/sys/dev/e1000/if_lem.c
==============================================================================
--- head/sys/dev/e1000/if_lem.c Fri Nov 1 21:17:45 2013 (r257528)
+++ head/sys/dev/e1000/if_lem.c Fri Nov 1 21:21:14 2013 (r257529)
@@ -2986,7 +2986,7 @@ lem_txeof(struct adapter *adapter)
EM_TX_LOCK_ASSERT(adapter);
#ifdef DEV_NETMAP
- if (netmap_tx_irq(ifp, 0 | (NETMAP_LOCKED_ENTER|NETMAP_LOCKED_EXIT)))
+ if (netmap_tx_irq(ifp, 0))
return;
#endif /* DEV_NETMAP */
if (adapter->num_tx_desc_avail == adapter->num_tx_desc)
@@ -3455,8 +3455,10 @@ lem_rxeof(struct adapter *adapter, int c
BUS_DMASYNC_POSTREAD);
#ifdef DEV_NETMAP
- if (netmap_rx_irq(ifp, 0 | NETMAP_LOCKED_ENTER, &rx_sent))
+ if (netmap_rx_irq(ifp, 0, &rx_sent)) {
+ EM_RX_UNLOCK(adapter);
return (FALSE);
+ }
#endif /* DEV_NETMAP */
if (!((current_desc->status) & E1000_RXD_STAT_DD)) {
Modified: head/sys/dev/e1000/if_lem.h
==============================================================================
--- head/sys/dev/e1000/if_lem.h Fri Nov 1 21:17:45 2013 (r257528)
+++ head/sys/dev/e1000/if_lem.h Fri Nov 1 21:21:14 2013 (r257529)
@@ -265,6 +265,13 @@
#define PICOSECS_PER_TICK 20833
#define TSYNC_PORT 319 /* UDP port for the protocol */
+#ifdef NIC_PARAVIRT
+#define E1000_PARA_SUBDEV 0x1101 /* special id */
+#define E1000_CSBAL 0x02830 /* csb phys. addr. low */
+#define E1000_CSBAH 0x02834 /* csb phys. addr. hi */
+#include <net/paravirt.h>
+#endif /* NIC_PARAVIRT */
+
/*
* Bus dma allocation structure used by
* e1000_dma_malloc and e1000_dma_free.
@@ -437,6 +444,26 @@ struct adapter {
boolean_t pcix_82544;
boolean_t in_detach;
+#ifdef NIC_SEND_COMBINING
+ /* 0 = idle; 1xxxx int-pending; 3xxxx int + d pending + tdt */
+#define MIT_PENDING_INT 0x10000 /* pending interrupt */
+#define MIT_PENDING_TDT 0x30000 /* both intr and tdt write are pending */
+ uint32_t shadow_tdt;
+ uint32_t sc_enable;
+#endif /* NIC_SEND_COMBINING */
+#ifdef BATCH_DISPATCH
+ uint32_t batch_enable;
+#endif /* BATCH_DISPATCH */
+
+#ifdef NIC_PARAVIRT
+ struct em_dma_alloc csb_mem; /* phys address */
+ struct paravirt_csb *csb; /* virtual addr */
+ uint32_t rx_retries; /* optimize rx loop */
+ uint32_t tdt_csb_count;// XXX stat
+ uint32_t tdt_reg_count;// XXX stat
+ uint32_t tdt_int_count;// XXX stat
+ uint32_t guest_need_kick_count;// XXX stat
+#endif /* NIC_PARAVIRT */
struct e1000_hw_stats stats;
};
Modified: head/sys/dev/ixgbe/ixgbe.c
==============================================================================
--- head/sys/dev/ixgbe/ixgbe.c Fri Nov 1 21:17:45 2013 (r257528)
+++ head/sys/dev/ixgbe/ixgbe.c Fri Nov 1 21:21:14 2013 (r257529)
@@ -3621,16 +3621,11 @@ ixgbe_txeof(struct tx_ring *txr)
* means the user thread should not be woken up);
* - the driver ignores tx interrupts unless netmap_mitigate=0
* or the slot has the DD bit set.
- *
- * When the driver has separate locks, we need to
- * release and re-acquire txlock to avoid deadlocks.
- * XXX see if we can find a better way.
*/
if (!netmap_mitigate ||
(kring->nr_kflags < kring->nkr_num_slots &&
txd[kring->nr_kflags].wb.status & IXGBE_TXD_STAT_DD)) {
- netmap_tx_irq(ifp, txr->me |
- (NETMAP_LOCKED_ENTER|NETMAP_LOCKED_EXIT));
+ netmap_tx_irq(ifp, txr->me);
}
return;
}
@@ -4422,8 +4417,10 @@ ixgbe_rxeof(struct ix_queue *que)
#ifdef DEV_NETMAP
/* Same as the txeof routine: wakeup clients on intr. */
- if (netmap_rx_irq(ifp, rxr->me | NETMAP_LOCKED_ENTER, &processed))
+ if (netmap_rx_irq(ifp, rxr->me, &processed)) {
+ IXGBE_RX_UNLOCK(rxr);
return (FALSE);
+ }
#endif /* DEV_NETMAP */
for (i = rxr->next_to_check; count != 0;) {
Modified: head/sys/dev/netmap/if_em_netmap.h
==============================================================================
--- head/sys/dev/netmap/if_em_netmap.h Fri Nov 1 21:17:45 2013 (r257528)
+++ head/sys/dev/netmap/if_em_netmap.h Fri Nov 1 21:21:14 2013 (r257529)
@@ -43,35 +43,6 @@ static void em_netmap_block_tasks(struct
static void em_netmap_unblock_tasks(struct adapter *);
-static void
-em_netmap_lock_wrapper(struct ifnet *ifp, int what, u_int queueid)
-{
- struct adapter *adapter = ifp->if_softc;
-
- ASSERT(queueid < adapter->num_queues);
- switch (what) {
- case NETMAP_CORE_LOCK:
- EM_CORE_LOCK(adapter);
- break;
- case NETMAP_CORE_UNLOCK:
- EM_CORE_UNLOCK(adapter);
- break;
- case NETMAP_TX_LOCK:
- EM_TX_LOCK(&adapter->tx_rings[queueid]);
- break;
- case NETMAP_TX_UNLOCK:
- EM_TX_UNLOCK(&adapter->tx_rings[queueid]);
- break;
- case NETMAP_RX_LOCK:
- EM_RX_LOCK(&adapter->rx_rings[queueid]);
- break;
- case NETMAP_RX_UNLOCK:
- EM_RX_UNLOCK(&adapter->rx_rings[queueid]);
- break;
- }
-}
-
-
// XXX do we need to block/unblock the tasks ?
static void
em_netmap_block_tasks(struct adapter *adapter)
@@ -137,7 +108,7 @@ em_netmap_reg(struct ifnet *ifp, int ono
ifp->if_capenable |= IFCAP_NETMAP;
na->if_transmit = ifp->if_transmit;
- ifp->if_transmit = netmap_start;
+ ifp->if_transmit = netmap_transmit;
em_init_locked(adapter);
if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) == 0) {
@@ -160,7 +131,7 @@ fail:
* Reconcile kernel and user view of the transmit ring.
*/
static int
-em_netmap_txsync(struct ifnet *ifp, u_int ring_nr, int do_lock)
+em_netmap_txsync(struct ifnet *ifp, u_int ring_nr, int flags)
{
struct adapter *adapter = ifp->if_softc;
struct tx_ring *txr = &adapter->tx_rings[ring_nr];
@@ -176,8 +147,6 @@ em_netmap_txsync(struct ifnet *ifp, u_in
if (k > lim)
return netmap_ring_reinit(kring);
- if (do_lock)
- EM_TX_LOCK(txr);
bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map,
BUS_DMASYNC_POSTREAD);
@@ -202,8 +171,6 @@ em_netmap_txsync(struct ifnet *ifp, u_in
u_int len = slot->len;
if (addr == netmap_buffer_base || len > NETMAP_BUF_SIZE) {
- if (do_lock)
- EM_TX_UNLOCK(txr);
return netmap_ring_reinit(kring);
}
@@ -252,8 +219,6 @@ em_netmap_txsync(struct ifnet *ifp, u_in
/* update avail to what the kernel knows */
ring->avail = kring->nr_hwavail;
- if (do_lock)
- EM_TX_UNLOCK(txr);
return 0;
}
@@ -262,7 +227,7 @@ em_netmap_txsync(struct ifnet *ifp, u_in
* Reconcile kernel and user view of the receive ring.
*/
static int
-em_netmap_rxsync(struct ifnet *ifp, u_int ring_nr, int do_lock)
+em_netmap_rxsync(struct ifnet *ifp, u_int ring_nr, int flags)
{
struct adapter *adapter = ifp->if_softc;
struct rx_ring *rxr = &adapter->rx_rings[ring_nr];
@@ -270,16 +235,13 @@ em_netmap_rxsync(struct ifnet *ifp, u_in
struct netmap_kring *kring = &na->rx_rings[ring_nr];
struct netmap_ring *ring = kring->ring;
u_int j, l, n, lim = kring->nkr_num_slots - 1;
- int force_update = do_lock || kring->nr_kflags & NKR_PENDINTR;
+ int force_update = (flags & NAF_FORCE_READ) || kring->nr_kflags & NKR_PENDINTR;
u_int k = ring->cur, resvd = ring->reserved;
k = ring->cur;
if (k > lim)
return netmap_ring_reinit(kring);
- if (do_lock)
- EM_RX_LOCK(rxr);
-
/* XXX check sync modes */
bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map,
BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
@@ -334,8 +296,6 @@ em_netmap_rxsync(struct ifnet *ifp, u_in
void *addr = PNMB(slot, &paddr);
if (addr == netmap_buffer_base) { /* bad buf */
- if (do_lock)
- EM_RX_UNLOCK(rxr);
return netmap_ring_reinit(kring);
}
@@ -364,8 +324,6 @@ em_netmap_rxsync(struct ifnet *ifp, u_in
}
/* tell userspace that there are new packets */
ring->avail = kring->nr_hwavail - resvd;
- if (do_lock)
- EM_RX_UNLOCK(rxr);
return 0;
}
@@ -378,12 +336,11 @@ em_netmap_attach(struct adapter *adapter
bzero(&na, sizeof(na));
na.ifp = adapter->ifp;
- na.separate_locks = 1;
+ na.na_flags = NAF_BDG_MAYSLEEP;
na.num_tx_desc = adapter->num_tx_desc;
na.num_rx_desc = adapter->num_rx_desc;
na.nm_txsync = em_netmap_txsync;
na.nm_rxsync = em_netmap_rxsync;
- na.nm_lock = em_netmap_lock_wrapper;
na.nm_register = em_netmap_reg;
netmap_attach(&na, adapter->num_queues);
}
Modified: head/sys/dev/netmap/if_igb_netmap.h
==============================================================================
--- head/sys/dev/netmap/if_igb_netmap.h Fri Nov 1 21:17:45 2013 (r257528)
+++ head/sys/dev/netmap/if_igb_netmap.h Fri Nov 1 21:21:14 2013 (r257529)
@@ -39,38 +39,6 @@
/*
- * wrapper to export locks to the generic code
- */
-static void
-igb_netmap_lock_wrapper(struct ifnet *ifp, int what, u_int queueid)
-{
- struct adapter *adapter = ifp->if_softc;
-
- ASSERT(queueid < adapter->num_queues);
- switch (what) {
- case NETMAP_CORE_LOCK:
- IGB_CORE_LOCK(adapter);
- break;
- case NETMAP_CORE_UNLOCK:
- IGB_CORE_UNLOCK(adapter);
- break;
- case NETMAP_TX_LOCK:
- IGB_TX_LOCK(&adapter->tx_rings[queueid]);
- break;
- case NETMAP_TX_UNLOCK:
- IGB_TX_UNLOCK(&adapter->tx_rings[queueid]);
- break;
- case NETMAP_RX_LOCK:
- IGB_RX_LOCK(&adapter->rx_rings[queueid]);
- break;
- case NETMAP_RX_UNLOCK:
- IGB_RX_UNLOCK(&adapter->rx_rings[queueid]);
- break;
- }
-}
-
-
-/*
* register-unregister routine
*/
static int
@@ -92,7 +60,7 @@ igb_netmap_reg(struct ifnet *ifp, int on
ifp->if_capenable |= IFCAP_NETMAP;
na->if_transmit = ifp->if_transmit;
- ifp->if_transmit = netmap_start;
+ ifp->if_transmit = netmap_transmit;
igb_init_locked(adapter);
if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) == 0) {
@@ -114,7 +82,7 @@ fail:
* Reconcile kernel and user view of the transmit ring.
*/
static int
-igb_netmap_txsync(struct ifnet *ifp, u_int ring_nr, int do_lock)
+igb_netmap_txsync(struct ifnet *ifp, u_int ring_nr, int flags)
{
struct adapter *adapter = ifp->if_softc;
struct tx_ring *txr = &adapter->tx_rings[ring_nr];
@@ -130,8 +98,6 @@ igb_netmap_txsync(struct ifnet *ifp, u_i
if (k > lim)
return netmap_ring_reinit(kring);
- if (do_lock)
- IGB_TX_LOCK(txr);
bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map,
BUS_DMASYNC_POSTREAD);
@@ -153,6 +119,13 @@ igb_netmap_txsync(struct ifnet *ifp, u_i
/* curr is the current slot in the nic ring */
union e1000_adv_tx_desc *curr =
(union e1000_adv_tx_desc *)&txr->tx_base[l];
+#ifndef IGB_MEDIA_RESET
+/* at the same time as IGB_MEDIA_RESET was defined, the
+ * tx buffer descriptor was renamed, so use this to revert
+ * back to the old name.
+ */
+#define igb_tx_buf igb_tx_buffer
+#endif
struct igb_tx_buf *txbuf = &txr->tx_buffers[l];
int flags = ((slot->flags & NS_REPORT) ||
j == 0 || j == report_frequency) ?
@@ -162,8 +135,6 @@ igb_netmap_txsync(struct ifnet *ifp, u_i
u_int len = slot->len;
if (addr == netmap_buffer_base || len > NETMAP_BUF_SIZE) {
- if (do_lock)
- IGB_TX_UNLOCK(txr);
return netmap_ring_reinit(kring);
}
@@ -223,8 +194,6 @@ igb_netmap_txsync(struct ifnet *ifp, u_i
/* update avail to what the kernel knows */
ring->avail = kring->nr_hwavail;
- if (do_lock)
- IGB_TX_UNLOCK(txr);
return 0;
}
@@ -233,7 +202,7 @@ igb_netmap_txsync(struct ifnet *ifp, u_i
* Reconcile kernel and user view of the receive ring.
*/
static int
-igb_netmap_rxsync(struct ifnet *ifp, u_int ring_nr, int do_lock)
+igb_netmap_rxsync(struct ifnet *ifp, u_int ring_nr, int flags)
{
struct adapter *adapter = ifp->if_softc;
struct rx_ring *rxr = &adapter->rx_rings[ring_nr];
@@ -241,16 +210,13 @@ igb_netmap_rxsync(struct ifnet *ifp, u_i
struct netmap_kring *kring = &na->rx_rings[ring_nr];
struct netmap_ring *ring = kring->ring;
u_int j, l, n, lim = kring->nkr_num_slots - 1;
- int force_update = do_lock || kring->nr_kflags & NKR_PENDINTR;
+ int force_update = (flags & NAF_FORCE_READ) || kring->nr_kflags & NKR_PENDINTR;
u_int k = ring->cur, resvd = ring->reserved;
k = ring->cur;
if (k > lim)
return netmap_ring_reinit(kring);
- if (do_lock)
- IGB_RX_LOCK(rxr);
-
/* XXX check sync modes */
bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map,
BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
@@ -303,8 +269,6 @@ igb_netmap_rxsync(struct ifnet *ifp, u_i
void *addr = PNMB(slot, &paddr);
if (addr == netmap_buffer_base) { /* bad buf */
- if (do_lock)
- IGB_RX_UNLOCK(rxr);
return netmap_ring_reinit(kring);
}
@@ -332,8 +296,6 @@ igb_netmap_rxsync(struct ifnet *ifp, u_i
}
/* tell userspace that there are new packets */
ring->avail = kring->nr_hwavail - resvd;
*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
More information about the svn-src-all
mailing list