git: 032bf749fd44 - main - [tcp] Keep socket buffer locked until upcall
Richard Scheffenegger
rscheff at FreeBSD.org
Fri May 21 09:08:42 UTC 2021
The branch main has been updated by rscheff:
URL: https://cgit.FreeBSD.org/src/commit/?id=032bf749fd44ac5ff20aab2c3d8e3c05491778ea
commit 032bf749fd44ac5ff20aab2c3d8e3c05491778ea
Author: Richard Scheffenegger <rscheff at FreeBSD.org>
AuthorDate: 2021-05-21 09:00:53 +0000
Commit: Richard Scheffenegger <rscheff at FreeBSD.org>
CommitDate: 2021-05-21 09:07:51 +0000
[tcp] Keep socket buffer locked until upcall
r367492 would unlock the socket buffer before eventually calling the upcall.
This leads to problematic interaction with NFS kernel server/client components
(MP threads) accessing the socket buffer with potentially not correctly updated
state.
Reported by: rmacklem
Reviewed By: tuexen, #transport
Tested by: rmacklem, otis
MFC after: 2 weeks
Sponsored By: NetApp, Inc.
Differential Revision: https://reviews.freebsd.org/D29690
---
sys/netinet/tcp_input.c | 38 ++++++++++++++------------------
sys/netinet/tcp_reass.c | 2 --
sys/netinet/tcp_stacks/bbr.c | 20 ++++++++---------
sys/netinet/tcp_stacks/rack.c | 20 ++++++++---------
sys/netinet/tcp_stacks/rack_bbr_common.c | 1 -
sys/netinet/tcp_var.h | 2 +-
6 files changed, 37 insertions(+), 46 deletions(-)
diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c
index 49db8cc63cb3..18ef52959c15 100644
--- a/sys/netinet/tcp_input.c
+++ b/sys/netinet/tcp_input.c
@@ -1509,18 +1509,17 @@ tcp_handle_wakeup(struct tcpcb *tp, struct socket *so)
* the TIME_WAIT state before coming here, we need
* to check if the socket is still connected.
*/
- if ((so->so_state & SS_ISCONNECTED) == 0)
+ if (tp == NULL) {
return;
+ }
+ if (so == NULL) {
+ return;
+ }
INP_LOCK_ASSERT(tp->t_inpcb);
if (tp->t_flags & TF_WAKESOR) {
tp->t_flags &= ~TF_WAKESOR;
- SOCKBUF_UNLOCK_ASSERT(&so->so_rcv);
- sorwakeup(so);
- }
- if (tp->t_flags & TF_WAKESOW) {
- tp->t_flags &= ~TF_WAKESOW;
- SOCKBUF_UNLOCK_ASSERT(&so->so_snd);
- sowwakeup(so);
+ SOCKBUF_LOCK_ASSERT(&so->so_rcv);
+ sorwakeup_locked(so);
}
}
@@ -1898,7 +1897,7 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
else if (!tcp_timer_active(tp, TT_PERSIST))
tcp_timer_activate(tp, TT_REXMT,
tp->t_rxtcur);
- tp->t_flags |= TF_WAKESOW;
+ sowwakeup(so);
if (sbavail(&so->so_snd))
(void) tp->t_fb->tfb_tcp_output(tp);
goto check_delack;
@@ -1963,8 +1962,8 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
m_adj(m, drop_hdrlen); /* delayed header drop */
sbappendstream_locked(&so->so_rcv, m, 0);
}
- SOCKBUF_UNLOCK(&so->so_rcv);
- tp->t_flags |= TF_WAKESOR;
+ /* NB: sorwakeup_locked() does an implicit unlock. */
+ sorwakeup_locked(so);
if (DELAY_ACK(tp, tlen)) {
tp->t_flags |= TF_DELACK;
} else {
@@ -2502,9 +2501,11 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
* If segment contains data or ACK, will call tcp_reass()
* later; if not, do so now to pass queued data to user.
*/
- if (tlen == 0 && (thflags & TH_FIN) == 0)
+ if (tlen == 0 && (thflags & TH_FIN) == 0) {
(void) tcp_reass(tp, (struct tcphdr *)0, NULL, 0,
(struct mbuf *)0);
+ tcp_handle_wakeup(tp, so);
+ }
tp->snd_wl1 = th->th_seq - 1;
/* FALLTHROUGH */
@@ -2959,8 +2960,8 @@ process_ACK:
tp->snd_wnd = 0;
ourfinisacked = 0;
}
- SOCKBUF_UNLOCK(&so->so_snd);
- tp->t_flags |= TF_WAKESOW;
+ /* NB: sowwakeup_locked() does an implicit unlock. */
+ sowwakeup_locked(so);
m_freem(mfree);
/* Detect una wraparound. */
if (!IN_RECOVERY(tp->t_flags) &&
@@ -3181,7 +3182,6 @@ dodata: /* XXX */
m_freem(m);
else
sbappendstream_locked(&so->so_rcv, m, 0);
- SOCKBUF_UNLOCK(&so->so_rcv);
tp->t_flags |= TF_WAKESOR;
} else {
/*
@@ -3227,6 +3227,7 @@ dodata: /* XXX */
save_start + tlen);
}
}
+ tcp_handle_wakeup(tp, so);
#if 0
/*
* Note the amount of data that peer has sent into
@@ -3250,9 +3251,8 @@ dodata: /* XXX */
*/
if (thflags & TH_FIN) {
if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
- socantrcvmore(so);
/* The socket upcall is handled by socantrcvmore. */
- tp->t_flags &= ~TF_WAKESOR;
+ socantrcvmore(so);
/*
* If connection is half-synchronized
* (ie NEEDSYN flag on) then delay ACK,
@@ -3316,7 +3316,6 @@ check_delack:
tp->t_flags &= ~TF_DELACK;
tcp_timer_activate(tp, TT_DELACK, tcp_delacktime);
}
- tcp_handle_wakeup(tp, so);
INP_WUNLOCK(tp->t_inpcb);
return;
@@ -3350,7 +3349,6 @@ dropafterack:
TCP_PROBE3(debug__input, tp, th, m);
tp->t_flags |= TF_ACKNOW;
(void) tp->t_fb->tfb_tcp_output(tp);
- tcp_handle_wakeup(tp, so);
INP_WUNLOCK(tp->t_inpcb);
m_freem(m);
return;
@@ -3358,7 +3356,6 @@ dropafterack:
dropwithreset:
if (tp != NULL) {
tcp_dropwithreset(m, th, tp, tlen, rstreason);
- tcp_handle_wakeup(tp, so);
INP_WUNLOCK(tp->t_inpcb);
} else
tcp_dropwithreset(m, th, NULL, tlen, rstreason);
@@ -3375,7 +3372,6 @@ drop:
#endif
TCP_PROBE3(debug__input, tp, th, m);
if (tp != NULL) {
- tcp_handle_wakeup(tp, so);
INP_WUNLOCK(tp->t_inpcb);
}
m_freem(m);
diff --git a/sys/netinet/tcp_reass.c b/sys/netinet/tcp_reass.c
index 310d7d540507..8e24e7412473 100644
--- a/sys/netinet/tcp_reass.c
+++ b/sys/netinet/tcp_reass.c
@@ -959,7 +959,6 @@ new_entry:
} else {
sbappendstream_locked(&so->so_rcv, m, 0);
}
- SOCKBUF_UNLOCK(&so->so_rcv);
tp->t_flags |= TF_WAKESOR;
return (flags);
}
@@ -1108,7 +1107,6 @@ present:
#ifdef TCP_REASS_LOGGING
tcp_reass_log_dump(tp);
#endif
- SOCKBUF_UNLOCK(&so->so_rcv);
tp->t_flags |= TF_WAKESOR;
return (flags);
}
diff --git a/sys/netinet/tcp_stacks/bbr.c b/sys/netinet/tcp_stacks/bbr.c
index 22762d21c289..dd4cbfd7dc53 100644
--- a/sys/netinet/tcp_stacks/bbr.c
+++ b/sys/netinet/tcp_stacks/bbr.c
@@ -7825,8 +7825,8 @@ bbr_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so,
acked_amount = min(acked, (int)sbavail(&so->so_snd));
tp->snd_wnd -= acked_amount;
mfree = sbcut_locked(&so->so_snd, acked_amount);
- SOCKBUF_UNLOCK(&so->so_snd);
- tp->t_flags |= TF_WAKESOW;
+ /* NB: sowwakeup_locked() does an implicit unlock. */
+ sowwakeup_locked(so);
m_freem(mfree);
if (SEQ_GT(th->th_ack, tp->snd_una)) {
bbr_collapse_rtt(tp, bbr, TCP_REXMTVAL(tp));
@@ -8302,7 +8302,6 @@ bbr_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so,
appended =
#endif
sbappendstream_locked(&so->so_rcv, m, 0);
- SOCKBUF_UNLOCK(&so->so_rcv);
tp->t_flags |= TF_WAKESOR;
#ifdef NETFLIX_SB_LIMITS
if (so->so_rcv.sb_shlim && appended != mcnt)
@@ -8353,6 +8352,7 @@ bbr_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so,
save_start + tlen);
}
}
+ tcp_handle_wakeup(tp, so);
} else {
m_freem(m);
thflags &= ~TH_FIN;
@@ -8364,9 +8364,8 @@ bbr_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so,
*/
if (thflags & TH_FIN) {
if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
- socantrcvmore(so);
/* The socket upcall is handled by socantrcvmore. */
- tp->t_flags &= ~TF_WAKESOR;
+ socantrcvmore(so);
/*
* If connection is half-synchronized (ie NEEDSYN
* flag on) then delay ACK, so it may be piggybacked
@@ -8557,8 +8556,8 @@ bbr_do_fastnewdata(struct mbuf *m, struct tcphdr *th, struct socket *so,
sbappendstream_locked(&so->so_rcv, m, 0);
ctf_calc_rwin(so, tp);
}
- SOCKBUF_UNLOCK(&so->so_rcv);
- tp->t_flags |= TF_WAKESOR;
+ /* NB: sorwakeup_locked() does an implicit unlock. */
+ sorwakeup_locked(so);
#ifdef NETFLIX_SB_LIMITS
if (so->so_rcv.sb_shlim && mcnt != appended)
counter_fo_release(so->so_rcv.sb_shlim, mcnt - appended);
@@ -8749,7 +8748,7 @@ bbr_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
&tcp_savetcp, 0);
#endif
/* Wake up the socket if we have room to write more */
- tp->t_flags |= TF_WAKESOW;
+ sowwakeup(so);
if (tp->snd_una == tp->snd_max) {
/* Nothing left outstanding */
bbr_log_progress_event(bbr, tp, ticks, PROGRESS_CLEAR, __LINE__);
@@ -9157,9 +9156,11 @@ bbr_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so,
* If segment contains data or ACK, will call tcp_reass() later; if
* not, do so now to pass queued data to user.
*/
- if (tlen == 0 && (thflags & TH_FIN) == 0)
+ if (tlen == 0 && (thflags & TH_FIN) == 0) {
(void)tcp_reass(tp, (struct tcphdr *)0, NULL, 0,
(struct mbuf *)0);
+ tcp_handle_wakeup(tp, so);
+ }
tp->snd_wl1 = th->th_seq - 1;
if (bbr_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
return (ret_val);
@@ -11711,7 +11712,6 @@ bbr_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
retval = bbr_do_segment_nounlock(m, th, so, tp,
drop_hdrlen, tlen, iptos, 0, &tv);
if (retval == 0) {
- tcp_handle_wakeup(tp, so);
INP_WUNLOCK(tp->t_inpcb);
}
}
diff --git a/sys/netinet/tcp_stacks/rack.c b/sys/netinet/tcp_stacks/rack.c
index 73b47745cbac..efab90a2e5bd 100644
--- a/sys/netinet/tcp_stacks/rack.c
+++ b/sys/netinet/tcp_stacks/rack.c
@@ -9857,8 +9857,8 @@ rack_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so,
if (acked_amount && sbavail(&so->so_snd))
rack_adjust_sendmap(rack, &so->so_snd, tp->snd_una);
rack_log_wakeup(tp,rack, &so->so_snd, acked, 2);
- SOCKBUF_UNLOCK(&so->so_snd);
- tp->t_flags |= TF_WAKESOW;
+ /* NB: sowwakeup_locked() does an implicit unlock. */
+ sowwakeup_locked(so);
m_freem(mfree);
if (SEQ_GT(tp->snd_una, tp->snd_recover))
tp->snd_recover = tp->snd_una;
@@ -10208,7 +10208,6 @@ rack_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so,
sbappendstream_locked(&so->so_rcv, m, 0);
rack_log_wakeup(tp,rack, &so->so_rcv, tlen, 1);
- SOCKBUF_UNLOCK(&so->so_rcv);
tp->t_flags |= TF_WAKESOR;
#ifdef NETFLIX_SB_LIMITS
if (so->so_rcv.sb_shlim && appended != mcnt)
@@ -10265,6 +10264,7 @@ rack_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so,
save_start + tlen);
}
}
+ tcp_handle_wakeup(tp, so);
} else {
m_freem(m);
thflags &= ~TH_FIN;
@@ -10276,9 +10276,8 @@ rack_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so,
*/
if (thflags & TH_FIN) {
if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
- socantrcvmore(so);
/* The socket upcall is handled by socantrcvmore. */
- tp->t_flags &= ~TF_WAKESOR;
+ socantrcvmore(so);
/*
* If connection is half-synchronized (ie NEEDSYN
* flag on) then delay ACK, so it may be piggybacked
@@ -10471,7 +10470,6 @@ rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th, struct socket *so,
ctf_calc_rwin(so, tp);
}
rack_log_wakeup(tp,rack, &so->so_rcv, tlen, 1);
- SOCKBUF_UNLOCK(&so->so_rcv);
tp->t_flags |= TF_WAKESOR;
#ifdef NETFLIX_SB_LIMITS
if (so->so_rcv.sb_shlim && mcnt != appended)
@@ -10631,8 +10629,7 @@ rack_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
rack_adjust_sendmap(rack, &so->so_snd, tp->snd_una);
/* Wake up the socket if we have room to write more */
rack_log_wakeup(tp,rack, &so->so_snd, acked, 2);
- SOCKBUF_UNLOCK(&so->so_snd);
- tp->t_flags |= TF_WAKESOW;
+ sowwakeup(so);
m_freem(mfree);
tp->t_rxtshift = 0;
RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp),
@@ -11070,9 +11067,11 @@ rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so,
* If segment contains data or ACK, will call tcp_reass() later; if
* not, do so now to pass queued data to user.
*/
- if (tlen == 0 && (thflags & TH_FIN) == 0)
+ if (tlen == 0 && (thflags & TH_FIN) == 0) {
(void) tcp_reass(tp, (struct tcphdr *)0, NULL, 0,
(struct mbuf *)0);
+ tcp_handle_wakeup(tp, so);
+ }
tp->snd_wl1 = th->th_seq - 1;
/* For syn-recv we need to possibly update the rtt */
if ((to->to_flags & TOF_TS) != 0 && to->to_tsecr) {
@@ -13155,8 +13154,7 @@ rack_do_compressed_ack_processing(struct tcpcb *tp, struct socket *so, struct mb
rack_adjust_sendmap(rack, &so->so_snd, tp->snd_una);
/* Wake up the socket if we have room to write more */
rack_log_wakeup(tp,rack, &so->so_snd, acked, 2);
- SOCKBUF_UNLOCK(&so->so_snd);
- tp->t_flags |= TF_WAKESOW;
+ sowwakeup(so);
m_freem(mfree);
}
/* update progress */
diff --git a/sys/netinet/tcp_stacks/rack_bbr_common.c b/sys/netinet/tcp_stacks/rack_bbr_common.c
index 6475a1a01c26..501d29ac48da 100644
--- a/sys/netinet/tcp_stacks/rack_bbr_common.c
+++ b/sys/netinet/tcp_stacks/rack_bbr_common.c
@@ -533,7 +533,6 @@ ctf_do_queued_segments(struct socket *so, struct tcpcb *tp, int have_pkt)
/* We lost the tcpcb (maybe a RST came in)? */
return(1);
}
- tcp_handle_wakeup(tp, so);
}
return (0);
}
diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h
index b80746b1ede4..1742b3b1c514 100644
--- a/sys/netinet/tcp_var.h
+++ b/sys/netinet/tcp_var.h
@@ -408,7 +408,7 @@ TAILQ_HEAD(tcp_funchead, tcp_function);
#define TF_FORCEDATA 0x00800000 /* force out a byte */
#define TF_TSO 0x01000000 /* TSO enabled on this connection */
#define TF_TOE 0x02000000 /* this connection is offloaded */
-#define TF_WAKESOW 0x04000000 /* wake up send socket */
+#define TF_UNUSED0 0x04000000 /* unused */
#define TF_UNUSED1 0x08000000 /* unused */
#define TF_LRD 0x10000000 /* Lost Retransmission Detection */
#define TF_CONGRECOVERY 0x20000000 /* congestion recovery mode */
More information about the dev-commits-src-main
mailing list