git: d8596171c5d6 - main - sockets: use only soref()/sorele() as socket reference count

From: Gleb Smirnoff <glebius_at_FreeBSD.org>
Date: Mon, 04 Jul 2022 19:41:17 UTC
The branch main has been updated by glebius:

URL: https://cgit.FreeBSD.org/src/commit/?id=d8596171c5d68c8dda1853e1243ad08944812400

commit d8596171c5d68c8dda1853e1243ad08944812400
Author:     Gleb Smirnoff <glebius@FreeBSD.org>
AuthorDate: 2022-07-04 19:40:51 +0000
Commit:     Gleb Smirnoff <glebius@FreeBSD.org>
CommitDate: 2022-07-04 19:40:51 +0000

    sockets: use only soref()/sorele() as socket reference count
    
    o Retire SS_FDREF as it is basically a debug flag on top of already
      existing soref()/sorele().
    o Convert SS_PROTOREF into soref()/sorele().
    o Change reference model for the listen queues, see below.
    o Make sofree() private.  The correct KPI to use is only sorele().
    o Make soabort() respect the model and sorele() instead of sofree().
    
    Note on listening queues.  Until now the sockets on a queue had zero
    reference count.  And the reference were given only upon accept(2).  The
    assumption was that there is no way to see the queued socket from anywhere
    except its head.  This is not true, since queued sockets already have pcbs,
    which are linked at least into the global pcb lists.  With this change we
    put the reference right in the sonewconn() and on accept(2) path we just
    hand the existing reference to the file descriptor.
    
    Differential revision:  https://reviews.freebsd.org/D35679
---
 sys/kern/uipc_debug.c                          |   8 --
 sys/kern/uipc_socket.c                         | 133 +++++--------------------
 sys/netinet/tcp_subr.c                         |   6 +-
 sys/netinet/tcp_timewait.c                     |  12 +--
 sys/netinet/tcp_usrreq.c                       |   8 +-
 sys/ofed/drivers/infiniband/ulp/sdp/sdp_main.c |  10 +-
 sys/sys/socketvar.h                            |  11 --
 7 files changed, 30 insertions(+), 158 deletions(-)

diff --git a/sys/kern/uipc_debug.c b/sys/kern/uipc_debug.c
index dee2b12e7cb7..0abb5352bed5 100644
--- a/sys/kern/uipc_debug.c
+++ b/sys/kern/uipc_debug.c
@@ -158,10 +158,6 @@ db_print_sostate(short so_state)
 	int comma;
 
 	comma = 0;
-	if (so_state & SS_FDREF) {
-		db_printf("%sSS_FDREF", comma ? ", " : "");
-		comma = 1;
-	}
 	if (so_state & SS_ISCONNECTED) {
 		db_printf("%sSS_ISCONNECTED", comma ? ", " : "");
 		comma = 1;
@@ -186,10 +182,6 @@ db_print_sostate(short so_state)
 		db_printf("%sSS_ISCONFIRMING", comma ? ", " : "");
 		comma = 1;
 	}
-	if (so_state & SS_PROTOREF) {
-		db_printf("%sSS_PROTOREF", comma ? ", " : "");
-		comma = 1;
-	}
 }
 
 static void
diff --git a/sys/kern/uipc_socket.c b/sys/kern/uipc_socket.c
index aa0470a51a5f..f25db7be1fd8 100644
--- a/sys/kern/uipc_socket.c
+++ b/sys/kern/uipc_socket.c
@@ -540,7 +540,6 @@ socreate(int dom, struct socket **aso, int type, int proto,
 		return (ENOBUFS);
 
 	so->so_type = type;
-	so->so_state = SS_FDREF;
 	so->so_cred = crhold(cred);
 	if ((prp->pr_domain->dom_family == PF_INET) ||
 	    (prp->pr_domain->dom_family == PF_INET6) ||
@@ -741,7 +740,7 @@ sonewconn(struct socket *head, int connstatus)
 	so->so_type = head->so_type;
 	so->so_options = head->so_options & ~SO_ACCEPTCONN;
 	so->so_linger = head->so_linger;
-	so->so_state = head->so_state & ~SS_FDREF;
+	so->so_state = head->so_state;
 	so->so_fibnum = head->so_fibnum;
 	so->so_proto = head->so_proto;
 	so->so_cred = crhold(head->so_cred);
@@ -763,7 +762,10 @@ sonewconn(struct socket *head, int connstatus)
 		so->so_snd.sb_mtx = &so->so_snd_mtx;
 		so->so_rcv.sb_mtx = &so->so_rcv_mtx;
 	}
+	/* Socket starts with a reference from the listen queue. */
+	refcount_init(&so->so_count, 1);
 	if ((*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) {
+		MPASS(refcount_release(&so->so_count));
 		sodealloc(so);
 		log(LOG_DEBUG, "%s: pcb %p: pru_attach() failed\n",
 		    __func__, head->so_pcb);
@@ -1062,7 +1064,8 @@ solisten_wakeup(struct socket *sol)
 /*
  * Return single connection off a listening socket queue.  Main consumer of
  * the function is kern_accept4().  Some modules, that do their own accept
- * management also use the function.
+ * management also use the function.  The socket reference held by the
+ * listen queue is handed to the caller.
  *
  * Listening socket must be locked on entry and is returned unlocked on
  * return.
@@ -1100,7 +1103,6 @@ solisten_dequeue(struct socket *head, struct socket **ret, int flags)
 	SOCK_LOCK(so);
 	KASSERT(so->so_qstate == SQ_COMP,
 	    ("%s: so %p not SQ_COMP", __func__, so));
-	soref(so);
 	head->sol_qlen--;
 	so->so_qstate = SQ_NONE;
 	so->so_listen = NULL;
@@ -1117,86 +1119,19 @@ solisten_dequeue(struct socket *head, struct socket **ret, int flags)
 }
 
 /*
- * Evaluate the reference count and named references on a socket; if no
- * references remain, free it.  This should be called whenever a reference is
- * released, such as in sorele(), but also when named reference flags are
- * cleared in socket or protocol code.
- *
- * sofree() will free the socket if:
- *
- * - There are no outstanding file descriptor references or related consumers
- *   (so_count == 0).
- *
- * - The socket has been closed by user space, if ever open (no SS_FDREF).
- *
- * - The protocol does not have an outstanding strong reference on the socket
- *   (SS_PROTOREF).
- *
- * - The socket is not in a completed connection queue, so a process has been
- *   notified that it is present.  If it is removed, the user process may
- *   block in accept() despite select() saying the socket was ready.
+ * Free socket upon release of the very last reference.
  */
-void
+static void
 sofree(struct socket *so)
 {
 	struct protosw *pr = so->so_proto;
-	bool last __diagused;
 
 	SOCK_LOCK_ASSERT(so);
+	KASSERT(refcount_load(&so->so_count) == 0,
+	    ("%s: so %p has references", __func__, so));
+	KASSERT(SOLISTENING(so) || so->so_qstate == SQ_NONE,
+	    ("%s: so %p is on listen queue", __func__, so));
 
-	if ((so->so_state & (SS_FDREF | SS_PROTOREF)) != 0 ||
-	    refcount_load(&so->so_count) != 0 || so->so_qstate == SQ_COMP) {
-		SOCK_UNLOCK(so);
-		return;
-	}
-
-	if (!SOLISTENING(so) && so->so_qstate == SQ_INCOMP) {
-		struct socket *sol;
-
-		sol = so->so_listen;
-		KASSERT(sol, ("%s: so %p on incomp of NULL", __func__, so));
-
-		/*
-		 * To solve race between close of a listening socket and
-		 * a socket on its incomplete queue, we need to lock both.
-		 * The order is first listening socket, then regular.
-		 * Since we don't have SS_FDREF neither SS_PROTOREF, this
-		 * function and the listening socket are the only pointers
-		 * to so.  To preserve so and sol, we reference both and then
-		 * relock.
-		 * After relock the socket may not move to so_comp since it
-		 * doesn't have PCB already, but it may be removed from
-		 * so_incomp. If that happens, we share responsiblity on
-		 * freeing the socket, but soclose() has already removed
-		 * it from queue.
-		 */
-		soref(sol);
-		soref(so);
-		SOCK_UNLOCK(so);
-		SOLISTEN_LOCK(sol);
-		SOCK_LOCK(so);
-		if (so->so_qstate == SQ_INCOMP) {
-			KASSERT(so->so_listen == sol,
-			    ("%s: so %p migrated out of sol %p",
-			    __func__, so, sol));
-			TAILQ_REMOVE(&sol->sol_incomp, so, so_list);
-			sol->sol_incqlen--;
-			last = refcount_release(&sol->so_count);
-			KASSERT(!last, ("%s: released last reference for %p",
-			    __func__, sol));
-			so->so_qstate = SQ_NONE;
-			so->so_listen = NULL;
-		} else
-			KASSERT(so->so_listen == NULL,
-			    ("%s: so %p not on (in)comp with so_listen",
-			    __func__, so));
-		sorele_locked(sol);
-		KASSERT(refcount_load(&so->so_count) == 1,
-		    ("%s: so %p count %u", __func__, so, so->so_count));
-		so->so_count = 0;
-	}
-	if (SOLISTENING(so))
-		so->so_error = ECONNABORTED;
 	SOCK_UNLOCK(so);
 
 	if (so->so_dtor != NULL)
@@ -1255,9 +1190,6 @@ soclose(struct socket *so)
 	int error = 0;
 	bool listening, last __diagused;
 
-	KASSERT(so->so_state & SS_FDREF,
-	    ("%s: %p no SS_FDREF on enter", __func__, so));
-
 	CURVNET_SET(so->so_vnet);
 	funsetown(&so->so_sigio);
 	if (so->so_state & SS_ISCONNECTED) {
@@ -1308,24 +1240,12 @@ drop:
 			    __func__, so));
 		}
 	}
-	KASSERT(so->so_state & SS_FDREF,
-	    ("%s: %p no SS_FDREF upon lock", __func__, so));
-	so->so_state &= ~SS_FDREF;
 	sorele_locked(so);
 	if (listening) {
 		struct socket *sp, *tsp;
 
-		TAILQ_FOREACH_SAFE(sp, &lqueue, so_list, tsp) {
-			SOCK_LOCK(sp);
-			if (refcount_load(&sp->so_count) == 0) {
-				SOCK_UNLOCK(sp);
-				soabort(sp);
-			} else {
-				/* See the handling of queued sockets
-				   in sofree(). */
-				SOCK_UNLOCK(sp);
-			}
-		}
+		TAILQ_FOREACH_SAFE(sp, &lqueue, so_list, tsp)
+			soabort(sp);
 	}
 	CURVNET_RESTORE();
 	return (error);
@@ -1338,31 +1258,30 @@ drop:
  *
  * This interface is tricky, because it is called on an unreferenced socket,
  * and must be called only by a thread that has actually removed the socket
- * from the listen queue it was on, or races with other threads are risked.
+ * from the listen queue it was on.  Likely this thread holds the last
+ * reference on the socket and soabort() will proceed with sofree().  But
+ * it might be not the last, as the sockets on the listen queues are seen
+ * from the protocol side.
  *
  * This interface will call into the protocol code, so must not be called
  * with any socket locks held.  Protocols do call it while holding their own
  * recursible protocol mutexes, but this is something that should be subject
  * to review in the future.
+ *
+ * Usually socket should have a single reference left, but this is not a
+ * requirement.  In the past, when we have had named references for file
+ * descriptor and protocol, we asserted that none of them are being held.
  */
 void
 soabort(struct socket *so)
 {
 
-	/*
-	 * In as much as is possible, assert that no references to this
-	 * socket are held.  This is not quite the same as asserting that the
-	 * current thread is responsible for arranging for no references, but
-	 * is as close as we can get for now.
-	 */
-	KASSERT((so->so_state & (SS_FDREF | SS_PROTOREF)) == 0 &&
-	    so->so_count == 0, ("%s: so %p has references", __func__, so));
 	VNET_SO_ASSERT(so);
 
 	if (so->so_proto->pr_usrreqs->pru_abort != NULL)
 		(*so->so_proto->pr_usrreqs->pru_abort)(so);
 	SOCK_LOCK(so);
-	sofree(so);
+	sorele_locked(so);
 }
 
 int
@@ -1370,12 +1289,6 @@ soaccept(struct socket *so, struct sockaddr **nam)
 {
 	int error;
 
-	SOCK_LOCK(so);
-	KASSERT((so->so_state & SS_FDREF) == 0,
-	    ("%s: %p has SS_FDREF", __func__, so));
-	so->so_state |= SS_FDREF;
-	SOCK_UNLOCK(so);
-
 	CURVNET_SET(so->so_vnet);
 	error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
 	CURVNET_RESTORE();
diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c
index 3c5e6f2db9ec..8a1c862f9249 100644
--- a/sys/netinet/tcp_subr.c
+++ b/sys/netinet/tcp_subr.c
@@ -2501,13 +2501,9 @@ tcp_close(struct tcpcb *tp)
 	so = inp->inp_socket;
 	soisdisconnected(so);
 	if (inp->inp_flags & INP_SOCKREF) {
-		KASSERT(so->so_state & SS_PROTOREF,
-		    ("tcp_close: !SS_PROTOREF"));
 		inp->inp_flags &= ~INP_SOCKREF;
 		INP_WUNLOCK(inp);
-		SOCK_LOCK(so);
-		so->so_state &= ~SS_PROTOREF;
-		sofree(so);
+		sorele(so);
 		return (NULL);
 	}
 	return (tp);
diff --git a/sys/netinet/tcp_timewait.c b/sys/netinet/tcp_timewait.c
index 3a87821108c4..2ea57e0b3397 100644
--- a/sys/netinet/tcp_timewait.c
+++ b/sys/netinet/tcp_timewait.c
@@ -370,13 +370,9 @@ tcp_twstart(struct tcpcb *tp)
 	 * detach and free the socket as it is not needed in time wait.
 	 */
 	if (inp->inp_flags & INP_SOCKREF) {
-		KASSERT(so->so_state & SS_PROTOREF,
-		    ("tcp_twstart: !SS_PROTOREF"));
 		inp->inp_flags &= ~INP_SOCKREF;
 		INP_WUNLOCK(inp);
-		SOCK_LOCK(so);
-		so->so_state &= ~SS_PROTOREF;
-		sofree(so);
+		sorele(so);
 	} else
 		INP_WUNLOCK(inp);
 }
@@ -573,11 +569,7 @@ tcp_twclose(struct tcptw *tw, int reuse)
 		if (inp->inp_flags & INP_SOCKREF) {
 			inp->inp_flags &= ~INP_SOCKREF;
 			INP_WUNLOCK(inp);
-			SOCK_LOCK(so);
-			KASSERT(so->so_state & SS_PROTOREF,
-			    ("tcp_twclose: INP_SOCKREF && !SS_PROTOREF"));
-			so->so_state &= ~SS_PROTOREF;
-			sofree(so);
+			sorele(so);
 		} else {
 			/*
 			 * If we don't own the only reference, the socket and
diff --git a/sys/netinet/tcp_usrreq.c b/sys/netinet/tcp_usrreq.c
index 6e4171bd1264..cdeb2cbcd98e 100644
--- a/sys/netinet/tcp_usrreq.c
+++ b/sys/netinet/tcp_usrreq.c
@@ -1370,9 +1370,7 @@ tcp_usr_abort(struct socket *so)
 		TCP_PROBE2(debug__user, tp, PRU_ABORT);
 	}
 	if (!(inp->inp_flags & INP_DROPPED)) {
-		SOCK_LOCK(so);
-		so->so_state |= SS_PROTOREF;
-		SOCK_UNLOCK(so);
+		soref(so);
 		inp->inp_flags |= INP_SOCKREF;
 	}
 	INP_WUNLOCK(inp);
@@ -1413,9 +1411,7 @@ tcp_usr_close(struct socket *so)
 		TCP_PROBE2(debug__user, tp, PRU_CLOSE);
 	}
 	if (!(inp->inp_flags & INP_DROPPED)) {
-		SOCK_LOCK(so);
-		so->so_state |= SS_PROTOREF;
-		SOCK_UNLOCK(so);
+		soref(so);
 		inp->inp_flags |= INP_SOCKREF;
 	}
 	INP_WUNLOCK(inp);
diff --git a/sys/ofed/drivers/infiniband/ulp/sdp/sdp_main.c b/sys/ofed/drivers/infiniband/ulp/sdp/sdp_main.c
index def5edb98983..4305b42f1c00 100644
--- a/sys/ofed/drivers/infiniband/ulp/sdp/sdp_main.c
+++ b/sys/ofed/drivers/infiniband/ulp/sdp/sdp_main.c
@@ -308,13 +308,9 @@ sdp_closed(struct sdp_sock *ssk)
 	so = ssk->socket;
 	soisdisconnected(so);
 	if (ssk->flags & SDP_SOCKREF) {
-		KASSERT(so->so_state & SS_PROTOREF,
-		    ("sdp_closed: !SS_PROTOREF"));
 		ssk->flags &= ~SDP_SOCKREF;
 		SDP_WUNLOCK(ssk);
-		SOCK_LOCK(so);
-		so->so_state &= ~SS_PROTOREF;
-		sofree(so);
+		sorele(so);
 		return (NULL);
 	}
 	return (ssk);
@@ -1472,10 +1468,8 @@ sdp_close(struct socket *so)
 	 * holding on to the socket and pcb for a while.
 	 */
 	if (!(ssk->flags & SDP_DROPPED)) {
-		SOCK_LOCK(so);
-		so->so_state |= SS_PROTOREF;
-		SOCK_UNLOCK(so);
 		ssk->flags |= SDP_SOCKREF;
+		soref(so);
 	}
 	SDP_WUNLOCK(ssk);
 }
diff --git a/sys/sys/socketvar.h b/sys/sys/socketvar.h
index d1ce687ac956..c496239f92bc 100644
--- a/sys/sys/socketvar.h
+++ b/sys/sys/socketvar.h
@@ -210,7 +210,6 @@ struct socket {
  * Many fields will be read without locks to improve performance and avoid
  * lock order issues.  However, this approach must be used with caution.
  */
-#define	SS_FDREF		0x0001	/* strong file descriptor reference */
 #define	SS_ISCONNECTED		0x0002	/* socket connected to a peer */
 #define	SS_ISCONNECTING		0x0004	/* in process of connecting to peer */
 #define	SS_ISDISCONNECTING	0x0008	/* in process of disconnecting */
@@ -219,15 +218,6 @@ struct socket {
 #define	SS_ISCONFIRMING		0x0400	/* deciding to accept connection req */
 #define	SS_ISDISCONNECTED	0x2000	/* socket disconnected from peer */
 
-/*
- * Protocols can mark a socket as SS_PROTOREF to indicate that, following
- * pru_detach, they still want the socket to persist, and will free it
- * themselves when they are done.  Protocols should only ever call sofree()
- * following setting this flag in pru_detach(), and never otherwise, as
- * sofree() bypasses socket reference counting.
- */
-#define	SS_PROTOREF		0x4000	/* strong protocol reference */
-
 #ifdef _KERNEL
 
 #define	SOCK_MTX(so)		(&(so)->so_lock)
@@ -479,7 +469,6 @@ int	socreate(int dom, struct socket **aso, int type, int proto,
 int	sodisconnect(struct socket *so);
 void	sodtor_set(struct socket *, so_dtor_t *);
 struct	sockaddr *sodupsockaddr(const struct sockaddr *sa, int mflags);
-void	sofree(struct socket *so);
 void	sohasoutofband(struct socket *so);
 int	solisten(struct socket *so, int backlog, struct thread *td);
 void	solisten_proto(struct socket *so, int backlog);