git: 07285bb4c22c - main - tcp: utilize new solisten_clone() and solisten_enqueue()

From: Gleb Smirnoff <glebius_at_FreeBSD.org>
Date: Wed, 10 Aug 2022 18:10:28 UTC
The branch main has been updated by glebius:

URL: https://cgit.FreeBSD.org/src/commit/?id=07285bb4c22c026a50f69149d5dae03169b15fe4

commit 07285bb4c22c026a50f69149d5dae03169b15fe4
Author:     Gleb Smirnoff <glebius@FreeBSD.org>
AuthorDate: 2022-08-10 18:09:34 +0000
Commit:     Gleb Smirnoff <glebius@FreeBSD.org>
CommitDate: 2022-08-10 18:09:34 +0000

    tcp: utilize new solisten_clone() and solisten_enqueue()
    
    This streamlines cloning of a socket from a listener.  Now we do not
    drop the inpcb lock during creation of a new socket, do not do useless
    state transitions, and put a fully initialized socket+inpcb+tcpcb into
    the listen queue.
    
    Before this change, first we would allocate the socket and inpcb+tcpcb via
    tcp_usr_attach() as TCPS_CLOSED, link them into global list of pcbs, unlock
    pcb and put this onto incomplete queue (see 6f3caa6d815).  Then, after
    sonewconn() we would lock it again, transition into TCPS_SYN_RECEIVED,
    insert into inpcb hash, finalize initialization of tcpcb.  And then, in
    call into tcp_do_segment() and upon transition to TCPS_ESTABLISHED call
    soisconnected().  This call would lock the listening socket once again
    with a LOR protection sequence and then we would relocate the socket onto
    the complete queue and only now it is ready for accept(2).
    
    Reviewed by:            rrs, tuexen
    Differential revision:  https://reviews.freebsd.org/D36064
---
 sys/kern/uipc_socket.c     |   2 +-
 sys/netinet/tcp_syncache.c | 130 ++++++++++++++++++++-------------------------
 sys/netinet/tcp_usrreq.c   |   8 ++-
 sys/sys/socketvar.h        |   1 +
 4 files changed, 64 insertions(+), 77 deletions(-)

diff --git a/sys/kern/uipc_socket.c b/sys/kern/uipc_socket.c
index 73725efceb2d..e9fe4dfd7feb 100644
--- a/sys/kern/uipc_socket.c
+++ b/sys/kern/uipc_socket.c
@@ -455,7 +455,7 @@ soalloc(struct vnet *vnet)
  * locks, labels, etc.  All protocol state is assumed already to have been
  * torn down (and possibly never set up) by the caller.
  */
-static void
+void
 sodealloc(struct socket *so)
 {
 
diff --git a/sys/netinet/tcp_syncache.c b/sys/netinet/tcp_syncache.c
index ed5981bf85d7..df900280e3df 100644
--- a/sys/netinet/tcp_syncache.c
+++ b/sys/netinet/tcp_syncache.c
@@ -804,44 +804,25 @@ syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m)
 	/*
 	 * Ok, create the full blown connection, and set things up
 	 * as they would have been set up if we had created the
-	 * connection when the SYN arrived.  If we can't create
-	 * the connection, abort it.
+	 * connection when the SYN arrived.
 	 */
-	so = sonewconn(lso, 0);
-	if (so == NULL) {
-		/*
-		 * Drop the connection; we will either send a RST or
-		 * have the peer retransmit its SYN again after its
-		 * RTO and try again.
-		 */
-		TCPSTAT_INC(tcps_listendrop);
-		if ((s = tcp_log_addrs(&sc->sc_inc, NULL, NULL, NULL))) {
-			log(LOG_DEBUG, "%s; %s: Socket create failed "
-			    "due to limits or memory shortage\n",
-			    s, __func__);
-			free(s, M_TCPLOG);
-		}
-		goto abort2;
-	}
+	if ((so = solisten_clone(lso)) == NULL)
+		goto allocfail;
 #ifdef MAC
 	mac_socketpeer_set_from_mbuf(m, so);
 #endif
-
+	error = in_pcballoc(so, &V_tcbinfo);
+	if (error) {
+		sodealloc(so);
+		goto allocfail;
+	}
 	inp = sotoinpcb(so);
-	inp->inp_inc.inc_fibnum = so->so_fibnum;
-	INP_WLOCK(inp);
-	/*
-	 * Exclusive pcbinfo lock is not required in syncache socket case even
-	 * if two inpcb locks can be acquired simultaneously:
-	 *  - the inpcb in LISTEN state,
-	 *  - the newly created inp.
-	 *
-	 * In this case, an inp cannot be at same time in LISTEN state and
-	 * just created by an accept() call.
-	 */
-	INP_HASH_WLOCK(&V_tcbinfo);
-
-	/* Insert new socket into PCB hash list. */
+	if ((tp = tcp_newtcpcb(inp)) == NULL) {
+		in_pcbdetach(inp);
+		in_pcbfree(inp);
+		sodealloc(so);
+		goto allocfail;
+	}
 	inp->inp_inc.inc_flags = sc->sc_inc.inc_flags;
 #ifdef INET6
 	if (sc->sc_inc.inc_flags & INC_ISIPV6) {
@@ -904,16 +885,12 @@ syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m)
 		laddr6 = inp->in6p_laddr;
 		if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr))
 			inp->in6p_laddr = sc->sc_inc.inc6_laddr;
-		if ((error = in6_pcbconnect_mbuf(inp, (struct sockaddr *)&sin6,
-		    thread0.td_ucred, m, false)) != 0) {
+		INP_HASH_WLOCK(&V_tcbinfo);
+		error = in6_pcbconnect_mbuf(inp, (struct sockaddr *)&sin6,
+		    thread0.td_ucred, m, false);
+		INP_HASH_WUNLOCK(&V_tcbinfo);
+		if (error != 0) {
 			inp->in6p_laddr = laddr6;
-			if ((s = tcp_log_addrs(&sc->sc_inc, NULL, NULL, NULL))) {
-				log(LOG_DEBUG, "%s; %s: in6_pcbconnect failed "
-				    "with error %i\n",
-				    s, __func__, error);
-				free(s, M_TCPLOG);
-			}
-			INP_HASH_WUNLOCK(&V_tcbinfo);
 			goto abort;
 		}
 		/* Override flowlabel from in6_pcbconnect. */
@@ -944,16 +921,12 @@ syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m)
 		laddr = inp->inp_laddr;
 		if (inp->inp_laddr.s_addr == INADDR_ANY)
 			inp->inp_laddr = sc->sc_inc.inc_laddr;
-		if ((error = in_pcbconnect(inp, (struct sockaddr *)&sin,
-		    thread0.td_ucred, false)) != 0) {
+		INP_HASH_WLOCK(&V_tcbinfo);
+		error = in_pcbconnect(inp, (struct sockaddr *)&sin,
+		    thread0.td_ucred, false);
+		INP_HASH_WUNLOCK(&V_tcbinfo);
+		if (error != 0) {
 			inp->inp_laddr = laddr;
-			if ((s = tcp_log_addrs(&sc->sc_inc, NULL, NULL, NULL))) {
-				log(LOG_DEBUG, "%s; %s: in_pcbconnect failed "
-				    "with error %i\n",
-				    s, __func__, error);
-				free(s, M_TCPLOG);
-			}
-			INP_HASH_WUNLOCK(&V_tcbinfo);
 			goto abort;
 		}
 	}
@@ -963,9 +936,7 @@ syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m)
 	if (ipsec_copy_pcbpolicy(sotoinpcb(lso), inp) != 0)
 		printf("syncache_socket: could not copy policy\n");
 #endif
-	INP_HASH_WUNLOCK(&V_tcbinfo);
-	tp = intotcpcb(inp);
-	tcp_state_change(tp, TCPS_SYN_RECEIVED);
+	tp->t_state = TCPS_SYN_RECEIVED;
 	tp->iss = sc->sc_iss;
 	tp->irs = sc->sc_irs;
 	tp->t_port = sc->sc_port;
@@ -1066,13 +1037,37 @@ syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m)
 	tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp));
 
 	TCPSTAT_INC(tcps_accepts);
+	TCP_PROBE6(state__change, NULL, tp, NULL, tp, NULL, TCPS_LISTEN);
+
+	solisten_enqueue(so, SS_ISCONNECTED);
+
 	return (so);
 
+allocfail:
+	/*
+	 * Drop the connection; we will either send a RST or have the peer
+	 * retransmit its SYN again after its RTO and try again.
+	 */
+	if ((s = tcp_log_addrs(&sc->sc_inc, NULL, NULL, NULL))) {
+		log(LOG_DEBUG, "%s; %s: Socket create failed "
+		    "due to limits or memory shortage\n",
+		    s, __func__);
+		free(s, M_TCPLOG);
+	}
+	TCPSTAT_INC(tcps_listendrop);
+	return (NULL);
+
 abort:
-	INP_WUNLOCK(inp);
-abort2:
-	if (so != NULL)
-		soabort(so);
+	in_pcbdetach(inp);
+	in_pcbfree(inp);
+	sodealloc(so);
+	if ((s = tcp_log_addrs(&sc->sc_inc, NULL, NULL, NULL))) {
+		log(LOG_DEBUG, "%s; %s: in%s_pcbconnect failed with error %i\n",
+		    s, __func__, (sc->sc_inc.inc_flags & INC_ISIPV6) ? "6" : "",
+		    error);
+		free(s, M_TCPLOG);
+	}
+	TCPSTAT_INC(tcps_listendrop);
 	return (NULL);
 }
 
@@ -1176,6 +1171,7 @@ syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th,
 			return (-1); /* Do not send RST */
 		}
 #endif /* TCP_SIGNATURE */
+		TCPSTATES_INC(TCPS_SYN_RECEIVED);
 	} else {
 		if (sc->sc_port != port) {
 			SCH_UNLOCK(sch);
@@ -1282,17 +1278,6 @@ syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th,
 				return (-1);  /* Do not send RST */
 			}
 		}
-
-		/*
-		 * Pull out the entry to unlock the bucket row.
-		 *
-		 * NOTE: We must decrease TCPS_SYN_RECEIVED count here, not
-		 * tcp_state_change().  The tcpcb is not existent at this
-		 * moment.  A new one will be allocated via syncache_socket->
-		 * sonewconn->tcp_usr_attach in TCPS_CLOSED state, then
-		 * syncache_socket() will change it to TCPS_SYN_RECEIVED.
-		 */
-		TCPSTATES_DEC(TCPS_SYN_RECEIVED);
 		TAILQ_REMOVE(&sch->sch_bucket, sc, sc_hash);
 		sch->sch_length--;
 #ifdef TCP_OFFLOAD
@@ -1340,8 +1325,11 @@ syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th,
 		syncache_free(sc);
 	return (1);
 failed:
-	if (sc != NULL && sc != &scs)
-		syncache_free(sc);
+	if (sc != NULL) {
+		TCPSTATES_DEC(TCPS_SYN_RECEIVED);
+		if (sc != &scs)
+			syncache_free(sc);
+	}
 	if (s != NULL)
 		free(s, M_TCPLOG);
 	*lsop = NULL;
diff --git a/sys/netinet/tcp_usrreq.c b/sys/netinet/tcp_usrreq.c
index 9280e4310bc3..d77f37efad7c 100644
--- a/sys/netinet/tcp_usrreq.c
+++ b/sys/netinet/tcp_usrreq.c
@@ -172,11 +172,9 @@ tcp_usr_attach(struct socket *so, int proto, struct thread *td)
 	KASSERT(inp == NULL, ("tcp_usr_attach: inp != NULL"));
 	TCPDEBUG1();
 
-	if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
-		error = soreserve(so, V_tcp_sendspace, V_tcp_recvspace);
-		if (error)
-			goto out;
-	}
+	error = soreserve(so, V_tcp_sendspace, V_tcp_recvspace);
+	if (error)
+		goto out;
 
 	so->so_rcv.sb_flags |= SB_AUTOSIZE;
 	so->so_snd.sb_flags |= SB_AUTOSIZE;
diff --git a/sys/sys/socketvar.h b/sys/sys/socketvar.h
index ed797c6d6239..678642eeff6d 100644
--- a/sys/sys/socketvar.h
+++ b/sys/sys/socketvar.h
@@ -498,6 +498,7 @@ int	soreceive_generic(struct socket *so, struct sockaddr **paddr,
 	    struct uio *uio, struct mbuf **mp0, struct mbuf **controlp,
 	    int *flagsp);
 void	sorele_locked(struct socket *so);
+void	sodealloc(struct socket *);
 int	soreserve(struct socket *so, u_long sndcc, u_long rcvcc);
 void	sorflush(struct socket *so);
 int	sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,