MFC of UDP socket enhancement for BIND?

Robert Watson rwatson at FreeBSD.org
Thu Mar 1 11:30:45 UTC 2007


On Sun, 18 Feb 2007, Lars Erik Gullerud wrote:

> On Fri, 16 Feb 2007, Robert Watson wrote:
>
>> I can certainly investigate doing this -- since 6.2 is safely out the door 
>> it's a good time to do so.  I'll follow up by e-mail in a few days -- would 
>> it be possible for you to help with testing?
>
> We would of course be most happy to test any patches you come up with, and 
> run performance benchmarks on our systems.

It turns out this change comes in two parts:

(1) In the first part, the structure of the socket send routine, sosend(), is
     simplified by breaking out the code that copies data from user space from
     the code that transmits via the protocol.

(2) In the second part, a version of sosend() specific to datagram protocols
     (where the socket send buffer isn't ever used) is added.

I'm going to attach two patches against RELENG_6 from today -- the first 
performs only the first step (sosend_copyin.diff), and the second performs 
both (sosend_dgram.diff), so the second must be applied against a fresh copy 
of uipc_socket.c rather than on top of the first patch.  The first change 
requires heavy stability testing, and the second requires both performance and 
stability testing.  Any assistance from you in helping to make this a reliable 
MFC would be much appreciated.

For reference, the sosend_copyin.diff applies these changes:

   src/sys/kern/uipc_socket.c:1.253, 1.254, 1.255

The sosend_dgram.diff patch incrementally also applies these changes on top of 
sosend_copyin.diff:

   src/sys/kern/uipc_socket.c:1.256
   src/sys/netinet/udp_usrreq.c:1.188

I've CC'd the performance list as there is a relevant thread going on there 
right now, and other people might also be interested in reviewing and testing 
these changes.  The short description is that this eliminates a large number 
of socket buffer interactions in the UDP send path--one of the effects is to 
avoid locking the socket buffer for an extended period, as it's largely unused 
in the datagram transmit path.  Per the commit comments, this idea was 
suggested by Jinmei Tatsuya at ISC as a result of their performance analysis; 
this change has been in 7-CURRENT since May of last year and has seen some bug 
fixes but no substantial changes in that time, so has been moderately burned 
in.

Robert N M Watson
Computer Laboratory
University of Cambridge
-------------- next part --------------
Index: kern/uipc_socket.c
===================================================================
RCS file: /home/ncvs/src/sys/kern/uipc_socket.c,v
retrieving revision 1.242.2.8
diff -u -r1.242.2.8 uipc_socket.c
--- kern/uipc_socket.c	3 Feb 2007 04:01:22 -0000	1.242.2.8
+++ kern/uipc_socket.c	1 Mar 2007 11:18:35 -0000
@@ -584,7 +584,149 @@
 	return (error);
 }
 
+#ifdef ZERO_COPY_SOCKETS
+struct so_zerocopy_stats{
+	int size_ok;
+	int align_ok;
+	int found_ifp;
+};
+struct so_zerocopy_stats so_zerocp_stats = {0,0,0};
+#include <netinet/in.h>
+#include <net/route.h>
+#include <netinet/in_pcb.h>
+#include <vm/vm.h>
+#include <vm/vm_page.h>
+#include <vm/vm_object.h>
+#endif /*ZERO_COPY_SOCKETS*/
+
+/*
+ * sosend_copyin() copies data described by uio into a new mbuf chain returned
+ * via *retmp (zero-copy under ZERO_COPY_SOCKETS when eligible) and decrements
+ * *space by the bytes taken.  If atomic is set it loops until uio is drained
+ * or *space runs out; otherwise it fills one mbuf.  Returns 0 or an errno.
+ *
+ * NB: If atomic I/O is requested, the caller must already have checked that
+ * space can hold resid bytes.
+ *
+ * NB: On error the caller may need to free the partial chain left in *retmp;
+ * both *uio and *space may have been modified even in the case of an error.
+ */
+static int
+sosend_copyin(struct uio *uio, struct mbuf **retmp, int atomic, long *space,
+    int flags)
+{
+	struct mbuf *m, **mp, *top;
+	long len, resid;
+	int error;
+#ifdef ZERO_COPY_SOCKETS
+	int cow_send;
+#endif
+
+	*retmp = top = NULL;
+	mp = &top;
+	len = 0;
+	resid = uio->uio_resid;
+	error = 0;
+	do {
+#ifdef ZERO_COPY_SOCKETS
+		cow_send = 0;
+#endif /* ZERO_COPY_SOCKETS */
+		if (resid >= MINCLSIZE) {
+#ifdef ZERO_COPY_SOCKETS
+			if (top == NULL) {
+				MGETHDR(m, M_TRYWAIT, MT_DATA);
+				if (m == NULL) {
+					error = ENOBUFS;
+					goto out;
+				}
+				m->m_pkthdr.len = 0;
+				m->m_pkthdr.rcvif = NULL; 
+			} else {
+				MGET(m, M_TRYWAIT, MT_DATA);
+				if (m == NULL) {
+					error = ENOBUFS;
+					goto out;
+				}
+			}
+			if (so_zero_copy_send &&
+			    resid>=PAGE_SIZE &&
+			    *space>=PAGE_SIZE &&
+			    uio->uio_iov->iov_len>=PAGE_SIZE) {
+				so_zerocp_stats.size_ok++;
+				so_zerocp_stats.align_ok++;
+				cow_send = socow_setup(m, uio);
+				len = cow_send;
+			}
+			if (!cow_send) {
+				MCLGET(m, M_TRYWAIT);
+				if ((m->m_flags & M_EXT) == 0) {
+					m_free(m);
+					m = NULL;
+				} else {
+					len = min(min(MCLBYTES, resid),
+					    *space);
+				}
+			}
+#else /* ZERO_COPY_SOCKETS */
+			if (top == NULL) {
+				m = m_getcl(M_TRYWAIT, MT_DATA, M_PKTHDR);
+				m->m_pkthdr.len = 0;
+				m->m_pkthdr.rcvif = NULL;
+			} else
+				m = m_getcl(M_TRYWAIT, MT_DATA, 0);
+			len = min(min(MCLBYTES, resid), *space);
+#endif /* ZERO_COPY_SOCKETS */
+		} else {
+			if (top == NULL) {
+				m = m_gethdr(M_TRYWAIT, MT_DATA);
+				m->m_pkthdr.len = 0;
+				m->m_pkthdr.rcvif = NULL;
+
+				len = min(min(MHLEN, resid), *space);
+				/*
+				 * For datagram protocols, leave room
+				 * for protocol headers in first mbuf.
+				 */
+				if (atomic && m && len < MHLEN)
+					MH_ALIGN(m, len);
+			} else {
+				m = m_get(M_TRYWAIT, MT_DATA);
+				len = min(min(MLEN, resid), *space);
+			}
+		}
+		if (m == NULL) {
+			error = ENOBUFS;
+			goto out;
+		}
+
+		*space -= len;
+#ifdef ZERO_COPY_SOCKETS
+		if (cow_send)
+			error = 0;
+		else
+#endif /* ZERO_COPY_SOCKETS */
+		error = uiomove(mtod(m, void *), (int)len, uio);
+		resid = uio->uio_resid;
+		m->m_len = len;
+		*mp = m;		/* append m to the tail of the chain */
+		top->m_pkthdr.len += len;
+		if (error)		/* m is already linked into the chain */
+			goto out;
+		mp = &m->m_next;
+		if (resid <= 0) {
+			if (flags & MSG_EOR)
+				top->m_flags |= M_EOR;
+			break;
+		}
+	} while (*space > 0 && atomic);	/* !atomic: one pass only */
+out:
+	*retmp = top;			/* may be a partial chain on error */
+	return (error);
+}
+
 #define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
+#define	snderr(errno)	{ error = (errno); goto release; }
+
 /*
  * Send on a socket.
  * If send must go all at once and message is larger than
@@ -603,21 +745,6 @@
  * Data and control buffers are freed on return.
  */
 
-#ifdef ZERO_COPY_SOCKETS
-struct so_zerocopy_stats{
-	int size_ok;
-	int align_ok;
-	int found_ifp;
-};
-struct so_zerocopy_stats so_zerocp_stats = {0,0,0};
-#include <netinet/in.h>
-#include <net/route.h>
-#include <netinet/in_pcb.h>
-#include <vm/vm.h>
-#include <vm/vm_page.h>
-#include <vm/vm_object.h>
-#endif /*ZERO_COPY_SOCKETS*/
-
 int
 sosend(so, addr, uio, top, control, flags, td)
 	struct socket *so;
@@ -628,14 +755,9 @@
 	int flags;
 	struct thread *td;
 {
-	struct mbuf **mp;
-	struct mbuf *m;
-	long space, len = 0, resid;
+	long space, resid;
 	int clen = 0, error, dontroute;
 	int atomic = sosendallatonce(so) || top;
-#ifdef ZERO_COPY_SOCKETS
-	int cow_send;
-#endif /* ZERO_COPY_SOCKETS */
 
 	if (uio != NULL)
 		resid = uio->uio_resid;
@@ -663,7 +785,6 @@
 		td->td_proc->p_stats->p_ru.ru_msgsnd++;
 	if (control != NULL)
 		clen = control->m_len;
-#define	snderr(errno)	{ error = (errno); goto release; }
 
 	SOCKBUF_LOCK(&so->so_snd);
 restart:
@@ -713,153 +834,61 @@
 			goto restart;
 		}
 		SOCKBUF_UNLOCK(&so->so_snd);
-		mp = &top;
 		space -= clen;
 		do {
-		    if (uio == NULL) {
-			/*
-			 * Data is prepackaged in "top".
-			 */
-			resid = 0;
-			if (flags & MSG_EOR)
-				top->m_flags |= M_EOR;
-		    } else do {
-#ifdef ZERO_COPY_SOCKETS
-			cow_send = 0;
-#endif /* ZERO_COPY_SOCKETS */
-			if (resid >= MINCLSIZE) {
-#ifdef ZERO_COPY_SOCKETS
-				if (top == NULL) {
-					MGETHDR(m, M_TRYWAIT, MT_DATA);
-					if (m == NULL) {
-						error = ENOBUFS;
-						SOCKBUF_LOCK(&so->so_snd);
-						goto release;
-					}
-					m->m_pkthdr.len = 0;
-					m->m_pkthdr.rcvif = NULL; 
-				} else {
-					MGET(m, M_TRYWAIT, MT_DATA);
-					if (m == NULL) {
-						error = ENOBUFS;
-						SOCKBUF_LOCK(&so->so_snd);
-						goto release;
-					}
-				}
-				if (so_zero_copy_send &&
-				    resid>=PAGE_SIZE &&
-				    space>=PAGE_SIZE &&
-				    uio->uio_iov->iov_len>=PAGE_SIZE) {
-					so_zerocp_stats.size_ok++;
-					so_zerocp_stats.align_ok++;
-					cow_send = socow_setup(m, uio);
-					len = cow_send;
-				}
-				if (!cow_send) {
-					MCLGET(m, M_TRYWAIT);
-					if ((m->m_flags & M_EXT) == 0) {
-						m_free(m);
-						m = NULL;
-					} else {
-						len = min(min(MCLBYTES, resid), space);
-					}
-				}
-#else /* ZERO_COPY_SOCKETS */
-				if (top == NULL) {
-					m = m_getcl(M_TRYWAIT, MT_DATA, M_PKTHDR);
-					m->m_pkthdr.len = 0;
-					m->m_pkthdr.rcvif = NULL;
-				} else
-					m = m_getcl(M_TRYWAIT, MT_DATA, 0);
-				len = min(min(MCLBYTES, resid), space);
-#endif /* ZERO_COPY_SOCKETS */
+			if (uio == NULL) {
+				resid = 0;
+				if (flags & MSG_EOR)
+					top->m_flags |= M_EOR;
 			} else {
-				if (top == NULL) {
-					m = m_gethdr(M_TRYWAIT, MT_DATA);
-					m->m_pkthdr.len = 0;
-					m->m_pkthdr.rcvif = NULL;
-
-					len = min(min(MHLEN, resid), space);
-					/*
-					 * For datagram protocols, leave room
-					 * for protocol headers in first mbuf.
-					 */
-					if (atomic && m && len < MHLEN)
-						MH_ALIGN(m, len);
-				} else {
-					m = m_get(M_TRYWAIT, MT_DATA);
-					len = min(min(MLEN, resid), space);
+				error = sosend_copyin(uio, &top, atomic,
+				    &space, flags);
+				if (error != 0) {
+					SOCKBUF_LOCK(&so->so_snd);
+					goto release;
 				}
+				resid = uio->uio_resid;
 			}
-			if (m == NULL) {
-				error = ENOBUFS;
-				SOCKBUF_LOCK(&so->so_snd);
-				goto release;
+			if (dontroute) {
+				SOCK_LOCK(so);
+				so->so_options |= SO_DONTROUTE;
+				SOCK_UNLOCK(so);
 			}
-
-			space -= len;
-#ifdef ZERO_COPY_SOCKETS
-			if (cow_send)
-				error = 0;
-			else
-#endif /* ZERO_COPY_SOCKETS */
-			error = uiomove(mtod(m, void *), (int)len, uio);
-			resid = uio->uio_resid;
-			m->m_len = len;
-			*mp = m;
-			top->m_pkthdr.len += len;
-			if (error) {
-				SOCKBUF_LOCK(&so->so_snd);
-				goto release;
-			}
-			mp = &m->m_next;
-			if (resid <= 0) {
-				if (flags & MSG_EOR)
-					top->m_flags |= M_EOR;
-				break;
-			}
-		    } while (space > 0 && atomic);
-		    if (dontroute) {
-			    SOCK_LOCK(so);
-			    so->so_options |= SO_DONTROUTE;
-			    SOCK_UNLOCK(so);
-		    }
-		    /*
-		     * XXX all the SBS_CANTSENDMORE checks previously
-		     * done could be out of date.  We could have recieved
-		     * a reset packet in an interrupt or maybe we slept
-		     * while doing page faults in uiomove() etc. We could
-		     * probably recheck again inside the locking protection
-		     * here, but there are probably other places that this
-		     * also happens.  We must rethink this.
-		     */
-		    error = (*so->so_proto->pr_usrreqs->pru_send)(so,
-			(flags & MSG_OOB) ? PRUS_OOB :
+			/*
+			 * XXX all the SBS_CANTSENDMORE checks previously
+			 * done could be out of date.  We could have recieved
+			 * a reset packet in an interrupt or maybe we slept
+			 * while doing page faults in uiomove() etc. We could
+			 * probably recheck again inside the locking protection
+			 * here, but there are probably other places that this
+			 * also happens.  We must rethink this.
+			 */
+			error = (*so->so_proto->pr_usrreqs->pru_send)(so,
+			    (flags & MSG_OOB) ? PRUS_OOB :
 			/*
 			 * If the user set MSG_EOF, the protocol
 			 * understands this flag and nothing left to
 			 * send then use PRU_SEND_EOF instead of PRU_SEND.
 			 */
-			((flags & MSG_EOF) &&
-			 (so->so_proto->pr_flags & PR_IMPLOPCL) &&
-			 (resid <= 0)) ?
+			    ((flags & MSG_EOF) &&
+			     (so->so_proto->pr_flags & PR_IMPLOPCL) &&
+			     (resid <= 0)) ?
 				PRUS_EOF :
 			/* If there is more to send set PRUS_MORETOCOME */
-			(resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
-			top, addr, control, td);
-		    if (dontroute) {
-			    SOCK_LOCK(so);
-			    so->so_options &= ~SO_DONTROUTE;
-			    SOCK_UNLOCK(so);
-		    }
-		    clen = 0;
-		    control = NULL;
-		    top = NULL;
-		    mp = &top;
-		    if (error) {
-			SOCKBUF_LOCK(&so->so_snd);
-			goto release;
-		    }
+			    (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
+			    top, addr, control, td);
+			if (dontroute) {
+				SOCK_LOCK(so);
+				so->so_options &= ~SO_DONTROUTE;
+				SOCK_UNLOCK(so);
+			}
+			clen = 0;
+			control = NULL;
+			top = NULL;
+			if (error) {
+				SOCKBUF_LOCK(&so->so_snd);
+				goto release;
+			}
 		} while (resid && space > 0);
 		SOCKBUF_LOCK(&so->so_snd);
 	} while (resid);
-------------- next part --------------
Index: kern/uipc_socket.c
===================================================================
RCS file: /home/ncvs/src/sys/kern/uipc_socket.c,v
retrieving revision 1.242.2.8
diff -u -r1.242.2.8 uipc_socket.c
--- kern/uipc_socket.c	3 Feb 2007 04:01:22 -0000	1.242.2.8
+++ kern/uipc_socket.c	1 Mar 2007 11:27:11 -0000
@@ -584,7 +584,301 @@
 	return (error);
 }
 
+#ifdef ZERO_COPY_SOCKETS
+struct so_zerocopy_stats{
+	int size_ok;
+	int align_ok;
+	int found_ifp;
+};
+struct so_zerocopy_stats so_zerocp_stats = {0,0,0};
+#include <netinet/in.h>
+#include <net/route.h>
+#include <netinet/in_pcb.h>
+#include <vm/vm.h>
+#include <vm/vm_page.h>
+#include <vm/vm_object.h>
+#endif /*ZERO_COPY_SOCKETS*/
+
+/*
+ * sosend_copyin() copies data described by uio into a new mbuf chain returned
+ * via *retmp (zero-copy under ZERO_COPY_SOCKETS when eligible) and decrements
+ * *space by the bytes taken.  If atomic is set it loops until uio is drained
+ * or *space runs out; otherwise it fills one mbuf.  Returns 0 or an errno.
+ *
+ * NB: If atomic I/O is requested, the caller must already have checked that
+ * space can hold resid bytes.
+ *
+ * NB: On error the caller may need to free the partial chain left in *retmp;
+ * both *uio and *space may have been modified even in the case of an error.
+ */
+static int
+sosend_copyin(struct uio *uio, struct mbuf **retmp, int atomic, long *space,
+    int flags)
+{
+	struct mbuf *m, **mp, *top;
+	long len, resid;
+	int error;
+#ifdef ZERO_COPY_SOCKETS
+	int cow_send;
+#endif
+
+	*retmp = top = NULL;
+	mp = &top;
+	len = 0;
+	resid = uio->uio_resid;
+	error = 0;
+	do {
+#ifdef ZERO_COPY_SOCKETS
+		cow_send = 0;
+#endif /* ZERO_COPY_SOCKETS */
+		if (resid >= MINCLSIZE) {
+#ifdef ZERO_COPY_SOCKETS
+			if (top == NULL) {
+				MGETHDR(m, M_TRYWAIT, MT_DATA);
+				if (m == NULL) {
+					error = ENOBUFS;
+					goto out;
+				}
+				m->m_pkthdr.len = 0;
+				m->m_pkthdr.rcvif = NULL; 
+			} else {
+				MGET(m, M_TRYWAIT, MT_DATA);
+				if (m == NULL) {
+					error = ENOBUFS;
+					goto out;
+				}
+			}
+			if (so_zero_copy_send &&
+			    resid>=PAGE_SIZE &&
+			    *space>=PAGE_SIZE &&
+			    uio->uio_iov->iov_len>=PAGE_SIZE) {
+				so_zerocp_stats.size_ok++;
+				so_zerocp_stats.align_ok++;
+				cow_send = socow_setup(m, uio);
+				len = cow_send;
+			}
+			if (!cow_send) {
+				MCLGET(m, M_TRYWAIT);
+				if ((m->m_flags & M_EXT) == 0) {
+					m_free(m);
+					m = NULL;
+				} else {
+					len = min(min(MCLBYTES, resid),
+					    *space);
+				}
+			}
+#else /* ZERO_COPY_SOCKETS */
+			if (top == NULL) {
+				m = m_getcl(M_TRYWAIT, MT_DATA, M_PKTHDR);
+				m->m_pkthdr.len = 0;
+				m->m_pkthdr.rcvif = NULL;
+			} else
+				m = m_getcl(M_TRYWAIT, MT_DATA, 0);
+			len = min(min(MCLBYTES, resid), *space);
+#endif /* ZERO_COPY_SOCKETS */
+		} else {
+			if (top == NULL) {
+				m = m_gethdr(M_TRYWAIT, MT_DATA);
+				m->m_pkthdr.len = 0;
+				m->m_pkthdr.rcvif = NULL;
+
+				len = min(min(MHLEN, resid), *space);
+				/*
+				 * For datagram protocols, leave room
+				 * for protocol headers in first mbuf.
+				 */
+				if (atomic && m && len < MHLEN)
+					MH_ALIGN(m, len);
+			} else {
+				m = m_get(M_TRYWAIT, MT_DATA);
+				len = min(min(MLEN, resid), *space);
+			}
+		}
+		if (m == NULL) {
+			error = ENOBUFS;
+			goto out;
+		}
+
+		*space -= len;
+#ifdef ZERO_COPY_SOCKETS
+		if (cow_send)
+			error = 0;
+		else
+#endif /* ZERO_COPY_SOCKETS */
+		error = uiomove(mtod(m, void *), (int)len, uio);
+		resid = uio->uio_resid;
+		m->m_len = len;
+		*mp = m;		/* append m to the tail of the chain */
+		top->m_pkthdr.len += len;
+		if (error)		/* m is already linked into the chain */
+			goto out;
+		mp = &m->m_next;
+		if (resid <= 0) {
+			if (flags & MSG_EOR)
+				top->m_flags |= M_EOR;
+			break;
+		}
+	} while (*space > 0 && atomic);	/* !atomic: one pass only */
+out:
+	*retmp = top;			/* may be a partial chain on error */
+	return (error);
+}
+
 #define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
+
+/*
+ * sosend_dgram() is a send routine for SOCK_DGRAM, PR_ATOMIC protocols.  It
+ * never sleeps waiting for send buffer space: a datagram larger than the
+ * space reported by sbspace() fails with EMSGSIZE.  Data is copied in from
+ * uio, or arrives prepackaged in top when uio is NULL.  top and control are
+ * either passed on to pru_send or freed here on the error paths.  Returns 0
+ * or an errno value.
+ */
+int
+sosend_dgram(so, addr, uio, top, control, flags, td)
+	struct socket *so;
+	struct sockaddr *addr;
+	struct uio *uio;
+	struct mbuf *top;
+	struct mbuf *control;
+	int flags;
+	struct thread *td;
+{
+	long space, resid;
+	int clen = 0, error, dontroute;
+	int atomic = sosendallatonce(so) || top;
+
+	KASSERT(so->so_type == SOCK_DGRAM, ("sosend_dgram: !SOCK_DGRAM"));
+	KASSERT(so->so_proto->pr_flags & PR_ATOMIC,
+	    ("sosend_dgram: !PR_ATOMIC"));
+
+	if (uio != NULL)
+		resid = uio->uio_resid;
+	else
+		resid = top->m_pkthdr.len;
+	/*
+	 * resid and space are deliberately signed: space may go negative
+	 * if the buffer is over-committed, and the comparison below must
+	 * be signed.  Reject a negative resid outright.
+	 */
+	if (resid < 0) {
+		error = EINVAL;
+		goto out;
+	}
+
+	dontroute =
+	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0;
+	if (td != NULL)
+		td->td_proc->p_stats->p_ru.ru_msgsnd++;
+	if (control != NULL)
+		clen = control->m_len;
+
+	SOCKBUF_LOCK(&so->so_snd);
+	if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
+		SOCKBUF_UNLOCK(&so->so_snd);
+		error = EPIPE;
+		goto out;
+	}
+	if (so->so_error) {
+		error = so->so_error;
+		so->so_error = 0;
+		SOCKBUF_UNLOCK(&so->so_snd);
+		goto out;
+	}
+	if ((so->so_state & SS_ISCONNECTED) == 0) {
+		/*
+		 * `sendto' and `sendmsg' is allowed on a connection-
+		 * based socket if it supports implied connect.
+		 * Return ENOTCONN if not connected and no address is
+		 * supplied.
+		 */
+		if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
+		    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
+			if ((so->so_state & SS_ISCONFIRMING) == 0 &&
+			    !(resid == 0 && clen != 0)) {
+				SOCKBUF_UNLOCK(&so->so_snd);
+				error = ENOTCONN;
+				goto out;
+			}
+		} else if (addr == NULL) {
+			if (so->so_proto->pr_flags & PR_CONNREQUIRED)
+				error = ENOTCONN;
+			else
+				error = EDESTADDRREQ;
+			SOCKBUF_UNLOCK(&so->so_snd);
+			goto out;
+		}
+	}
+
+	/*
+	 * Do we need MSG_OOB support in SOCK_DGRAM?  Signs here may be a
+	 * problem and need fixing.  Drop the socket buffer lock before the
+	 * size check so that the EMSGSIZE path does not leave it held.
+	 */
+	space = sbspace(&so->so_snd);
+	if (flags & MSG_OOB)
+		space += 1024;
+	space -= clen;
+	SOCKBUF_UNLOCK(&so->so_snd);
+	if (resid > space) {
+		error = EMSGSIZE;
+		goto out;
+	}
+	if (uio == NULL) {
+		resid = 0;
+		if (flags & MSG_EOR)
+			top->m_flags |= M_EOR;
+	} else {
+		error = sosend_copyin(uio, &top, atomic, &space, flags);
+		if (error)
+			goto out;
+		resid = uio->uio_resid;
+	}
+	KASSERT(resid == 0, ("sosend_dgram: resid != 0"));
+	/*
+	 * XXXRW: Frobbing SO_DONTROUTE here is even worse without sblock
+	 * than with.
+	 */
+	if (dontroute) {
+		SOCK_LOCK(so);
+		so->so_options |= SO_DONTROUTE;
+		SOCK_UNLOCK(so);
+	}
+	/*
+	 * XXX: The SBS_CANTSENDMORE and so_error checks above were made
+	 * before the socket buffer lock was dropped and uiomove() may have
+	 * slept, so they can be out of date by the time pru_send runs.
+	 */
+	error = (*so->so_proto->pr_usrreqs->pru_send)(so,
+	    (flags & MSG_OOB) ? PRUS_OOB :
+	/*
+	 * If the user set MSG_EOF, the protocol
+	 * understands this flag and nothing left to
+	 * send then use PRU_SEND_EOF instead of PRU_SEND.
+	 */
+	    ((flags & MSG_EOF) &&
+	     (so->so_proto->pr_flags & PR_IMPLOPCL) &&
+	     (resid <= 0)) ?
+		PRUS_EOF :
+		/* If there is more to send set PRUS_MORETOCOME */
+		(resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
+		top, addr, control, td);
+	if (dontroute) {
+		SOCK_LOCK(so);
+		so->so_options &= ~SO_DONTROUTE;
+		SOCK_UNLOCK(so);
+	}
+	clen = 0;
+	control = NULL;
+	top = NULL;
+out:
+	if (top != NULL)
+		m_freem(top);
+	if (control != NULL)
+		m_freem(control);
+	return (error);
+}
+
 /*
  * Send on a socket.
  * If send must go all at once and message is larger than
@@ -602,22 +896,7 @@
  * must check for short counts if EINTR/ERESTART are returned.
  * Data and control buffers are freed on return.
  */
-
-#ifdef ZERO_COPY_SOCKETS
-struct so_zerocopy_stats{
-	int size_ok;
-	int align_ok;
-	int found_ifp;
-};
-struct so_zerocopy_stats so_zerocp_stats = {0,0,0};
-#include <netinet/in.h>
-#include <net/route.h>
-#include <netinet/in_pcb.h>
-#include <vm/vm.h>
-#include <vm/vm_page.h>
-#include <vm/vm_object.h>
-#endif /*ZERO_COPY_SOCKETS*/
-
+#define	snderr(errno)	{ error = (errno); goto release; }
 int
 sosend(so, addr, uio, top, control, flags, td)
 	struct socket *so;
@@ -628,14 +907,9 @@
 	int flags;
 	struct thread *td;
 {
-	struct mbuf **mp;
-	struct mbuf *m;
-	long space, len = 0, resid;
+	long space, resid;
 	int clen = 0, error, dontroute;
 	int atomic = sosendallatonce(so) || top;
-#ifdef ZERO_COPY_SOCKETS
-	int cow_send;
-#endif /* ZERO_COPY_SOCKETS */
 
 	if (uio != NULL)
 		resid = uio->uio_resid;
@@ -663,7 +937,6 @@
 		td->td_proc->p_stats->p_ru.ru_msgsnd++;
 	if (control != NULL)
 		clen = control->m_len;
-#define	snderr(errno)	{ error = (errno); goto release; }
 
 	SOCKBUF_LOCK(&so->so_snd);
 restart:
@@ -713,153 +986,61 @@
 			goto restart;
 		}
 		SOCKBUF_UNLOCK(&so->so_snd);
-		mp = &top;
 		space -= clen;
 		do {
-		    if (uio == NULL) {
-			/*
-			 * Data is prepackaged in "top".
-			 */
-			resid = 0;
-			if (flags & MSG_EOR)
-				top->m_flags |= M_EOR;
-		    } else do {
-#ifdef ZERO_COPY_SOCKETS
-			cow_send = 0;
-#endif /* ZERO_COPY_SOCKETS */
-			if (resid >= MINCLSIZE) {
-#ifdef ZERO_COPY_SOCKETS
-				if (top == NULL) {
-					MGETHDR(m, M_TRYWAIT, MT_DATA);
-					if (m == NULL) {
-						error = ENOBUFS;
-						SOCKBUF_LOCK(&so->so_snd);
-						goto release;
-					}
-					m->m_pkthdr.len = 0;
-					m->m_pkthdr.rcvif = NULL; 
-				} else {
-					MGET(m, M_TRYWAIT, MT_DATA);
-					if (m == NULL) {
-						error = ENOBUFS;
-						SOCKBUF_LOCK(&so->so_snd);
-						goto release;
-					}
-				}
-				if (so_zero_copy_send &&
-				    resid>=PAGE_SIZE &&
-				    space>=PAGE_SIZE &&
-				    uio->uio_iov->iov_len>=PAGE_SIZE) {
-					so_zerocp_stats.size_ok++;
-					so_zerocp_stats.align_ok++;
-					cow_send = socow_setup(m, uio);
-					len = cow_send;
-				}
-				if (!cow_send) {
-					MCLGET(m, M_TRYWAIT);
-					if ((m->m_flags & M_EXT) == 0) {
-						m_free(m);
-						m = NULL;
-					} else {
-						len = min(min(MCLBYTES, resid), space);
-					}
-				}
-#else /* ZERO_COPY_SOCKETS */
-				if (top == NULL) {
-					m = m_getcl(M_TRYWAIT, MT_DATA, M_PKTHDR);
-					m->m_pkthdr.len = 0;
-					m->m_pkthdr.rcvif = NULL;
-				} else
-					m = m_getcl(M_TRYWAIT, MT_DATA, 0);
-				len = min(min(MCLBYTES, resid), space);
-#endif /* ZERO_COPY_SOCKETS */
+			if (uio == NULL) {
+				resid = 0;
+				if (flags & MSG_EOR)
+					top->m_flags |= M_EOR;
 			} else {
-				if (top == NULL) {
-					m = m_gethdr(M_TRYWAIT, MT_DATA);
-					m->m_pkthdr.len = 0;
-					m->m_pkthdr.rcvif = NULL;
-
-					len = min(min(MHLEN, resid), space);
-					/*
-					 * For datagram protocols, leave room
-					 * for protocol headers in first mbuf.
-					 */
-					if (atomic && m && len < MHLEN)
-						MH_ALIGN(m, len);
-				} else {
-					m = m_get(M_TRYWAIT, MT_DATA);
-					len = min(min(MLEN, resid), space);
+				error = sosend_copyin(uio, &top, atomic,
+				    &space, flags);
+				if (error != 0) {
+					SOCKBUF_LOCK(&so->so_snd);
+					goto release;
 				}
+				resid = uio->uio_resid;
 			}
-			if (m == NULL) {
-				error = ENOBUFS;
-				SOCKBUF_LOCK(&so->so_snd);
-				goto release;
-			}
-
-			space -= len;
-#ifdef ZERO_COPY_SOCKETS
-			if (cow_send)
-				error = 0;
-			else
-#endif /* ZERO_COPY_SOCKETS */
-			error = uiomove(mtod(m, void *), (int)len, uio);
-			resid = uio->uio_resid;
-			m->m_len = len;
-			*mp = m;
-			top->m_pkthdr.len += len;
-			if (error) {
-				SOCKBUF_LOCK(&so->so_snd);
-				goto release;
-			}
-			mp = &m->m_next;
-			if (resid <= 0) {
-				if (flags & MSG_EOR)
-					top->m_flags |= M_EOR;
-				break;
+			if (dontroute) {
+				SOCK_LOCK(so);
+				so->so_options |= SO_DONTROUTE;
+				SOCK_UNLOCK(so);
 			}
-		    } while (space > 0 && atomic);
-		    if (dontroute) {
-			    SOCK_LOCK(so);
-			    so->so_options |= SO_DONTROUTE;
-			    SOCK_UNLOCK(so);
-		    }
-		    /*
-		     * XXX all the SBS_CANTSENDMORE checks previously
-		     * done could be out of date.  We could have recieved
-		     * a reset packet in an interrupt or maybe we slept
-		     * while doing page faults in uiomove() etc. We could
-		     * probably recheck again inside the locking protection
-		     * here, but there are probably other places that this
-		     * also happens.  We must rethink this.
-		     */
-		    error = (*so->so_proto->pr_usrreqs->pru_send)(so,
-			(flags & MSG_OOB) ? PRUS_OOB :
+			/*
+			 * XXX all the SBS_CANTSENDMORE checks previously
+			 * done could be out of date.  We could have recieved
+			 * a reset packet in an interrupt or maybe we slept
+			 * while doing page faults in uiomove() etc. We could
+			 * probably recheck again inside the locking protection
+			 * here, but there are probably other places that this
+			 * also happens.  We must rethink this.
+			 */
+			error = (*so->so_proto->pr_usrreqs->pru_send)(so,
+			    (flags & MSG_OOB) ? PRUS_OOB :
 			/*
 			 * If the user set MSG_EOF, the protocol
 			 * understands this flag and nothing left to
 			 * send then use PRU_SEND_EOF instead of PRU_SEND.
 			 */
-			((flags & MSG_EOF) &&
-			 (so->so_proto->pr_flags & PR_IMPLOPCL) &&
-			 (resid <= 0)) ?
+			    ((flags & MSG_EOF) &&
+			     (so->so_proto->pr_flags & PR_IMPLOPCL) &&
+			     (resid <= 0)) ?
 				PRUS_EOF :
 			/* If there is more to send set PRUS_MORETOCOME */
-			(resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
-			top, addr, control, td);
-		    if (dontroute) {
-			    SOCK_LOCK(so);
-			    so->so_options &= ~SO_DONTROUTE;
-			    SOCK_UNLOCK(so);
-		    }
-		    clen = 0;
-		    control = NULL;
-		    top = NULL;
-		    mp = &top;
-		    if (error) {
-			SOCKBUF_LOCK(&so->so_snd);
-			goto release;
-		    }
+			    (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
+			    top, addr, control, td);
+			if (dontroute) {
+				SOCK_LOCK(so);
+				so->so_options &= ~SO_DONTROUTE;
+				SOCK_UNLOCK(so);
+			}
+			clen = 0;
+			control = NULL;
+			top = NULL;
+			if (error) {
+				SOCKBUF_LOCK(&so->so_snd);
+				goto release;
+			}
 		} while (resid && space > 0);
 		SOCKBUF_LOCK(&so->so_snd);
 	} while (resid);
@@ -877,6 +1058,7 @@
 		m_freem(control);
 	return (error);
 }
+#undef snderr
 
 /*
  * The part of soreceive() that implements reading non-inline out-of-band
Index: netinet/udp_usrreq.c
===================================================================
RCS file: /home/ncvs/src/sys/netinet/udp_usrreq.c,v
retrieving revision 1.175.2.9
diff -u -r1.175.2.9 udp_usrreq.c
--- netinet/udp_usrreq.c	29 Dec 2006 19:25:49 -0000	1.175.2.9
+++ netinet/udp_usrreq.c	1 Mar 2007 11:27:34 -0000
@@ -1150,6 +1150,7 @@
 	.pru_disconnect =	udp_disconnect,
 	.pru_peeraddr =		udp_peeraddr,
 	.pru_send =		udp_send,
+	.pru_sosend =		sosend_dgram,
 	.pru_shutdown =		udp_shutdown,
 	.pru_sockaddr =		udp_sockaddr,
 	.pru_sosetlabel =	in_pcbsosetlabel


More information about the freebsd-performance mailing list