TCP stack lock contention with short-lived connections
Julien Charbon
jch at freebsd.org
Fri Oct 3 13:17:02 UTC 2014
Hi,
On 23/05/14 14:06, Julien Charbon wrote:
> On 27/02/14 11:32, Julien Charbon wrote:
>> On 07/11/13 14:55, Julien Charbon wrote:
>>> On Mon, 04 Nov 2013 22:21:04 +0100, Julien Charbon
>>> <jcharbon at verisign.com> wrote:
>>>> I have put technical and how-to-repeat details in below PR:
>>>>
>>>> kern/183659: TCP stack lock contention with short-lived connections
>>>> https://bugs.freebsd.org/bugzilla/show_bug.cgi?id=183659
>>>>
>>>> We are currently working on this performance improvement effort; it
>>>> will impact only the TCP locking strategy not the TCP stack logic
>>>> itself. We will share on freebsd-net the patches we made for
>>>> reviewing and improvement propositions; anyway this change might also
>>>> require enough eyeballs to avoid tricky race conditions introduction
>>>> in TCP stack.
As usual, first a follow-up on TCP short-lived connections improvement
progress:
Thanks to jhb (committer) and adrian, hiren, jhb, Mike Bentkofsky
(reviewers) the SYN reception optimization has been pushed in HEAD:
"In tcp_input(), don't acquire the pcbinfo global write lock for SYN
packets targeting a listening socket"
http://svnweb.freebsd.org/base?view=revision&revision=271119
Next, two related patches are remaining:
- The first one is actually a race condition fix, thanks to Marc for
spotting it. This race has been introduced with our first TCP timewait
change:
http://svnweb.freebsd.org/base?view=revision&revision=264321
The proposed fix is currently under review (see also joined patch):
https://reviews.freebsd.org/D826
Note: The original change has not been MFC'ed, thus this race
condition is only in HEAD.
- The second one being (see also joined patch):
https://github.com/verisign/freebsd/commit/f4c11c6b678195515bbab8bb2825fa5222ed3a58
Nothing new here (just read previous emails in this thread for
details). The patch just becomes easier to read with time. As we
didn't find any issues with it, following a proposition from Marc we are
going to start a specific code review process:
#1 Write down all the current INP_INFO_WLOCK locking rules, especially
all the non-obvious ones
#2 Check that all these rules are still respected with the proposed
improvement
It will permit an in depth code review, and to get all pcbinfo/inpcb
related locking rules well described.
--
Julien
-------------- next part --------------
From 91a1a400f50aad0e2304d655a78286d9285efbc5 Mon Sep 17 00:00:00 2001
From: Julien Charbon <jcharbon at verisign.com>
Date: Thu, 4 Sep 2014 09:40:21 +0200
Subject: [PATCH] [tcp-scale] Fix a race condition in TCP timewait between
tcp_tw_2msl_reuse() and tcp_tw_2msl_scan() that drives (at least) timewait
timeout cancellation. Also simplify implementation by holding inpcb
reference and removing tw_pcbref() and tw_pcbrele().
---
sys/netinet/tcp_timewait.c | 133 ++++++++++++++++++++++++++++-----------------
sys/netinet/tcp_usrreq.c | 15 +++++
sys/netinet/tcp_var.h | 1 -
3 files changed, 99 insertions(+), 50 deletions(-)
diff --git a/sys/netinet/tcp_timewait.c b/sys/netinet/tcp_timewait.c
index 33555d9..9c17655 100644
--- a/sys/netinet/tcp_timewait.c
+++ b/sys/netinet/tcp_timewait.c
@@ -49,7 +49,6 @@ __FBSDID("$FreeBSD$");
#include <sys/socketvar.h>
#include <sys/protosw.h>
#include <sys/random.h>
-#include <sys/refcount.h>
#include <vm/uma.h>
@@ -101,6 +100,11 @@ static int maxtcptw;
* currently in the TIME_WAIT state. The queue pointers, including the
* queue pointers in each tcptw structure, are protected using the global
* timewait lock, which must be held over queue iteration and modification.
+ *
+ * Rules on tcptw usage:
+ * - a inpcb is always freed _after_ its tcptw
+ * - a tcptw relies on its inpcb reference counting for memory stability
+ * - a tcptw is valid only under its inpcb locked
*/
static VNET_DEFINE(TAILQ_HEAD(, tcptw), twq_2msl);
#define V_twq_2msl VNET(twq_2msl)
@@ -124,32 +128,6 @@ static void tcp_tw_2msl_reset(struct tcptw *, int);
static void tcp_tw_2msl_stop(struct tcptw *, int);
static int tcp_twrespond(struct tcptw *, int);
-/*
- * tw_pcbref() bumps the reference count on an tw in order to maintain
- * stability of an tw pointer despite the tw lock being released.
- */
-static void
-tw_pcbref(struct tcptw *tw)
-{
-
- KASSERT(tw->tw_refcount > 0, ("%s: refcount 0", __func__));
- refcount_acquire(&tw->tw_refcount);
-}
-
-/*
- * Drop a refcount on an tw elevated using tw_pcbref().
- */
-static int
-tw_pcbrele(struct tcptw *tw)
-{
-
- KASSERT(tw->tw_refcount > 0, ("%s: refcount 0", __func__));
- if (!refcount_release(&tw->tw_refcount))
- return (0);
- uma_zfree(V_tcptw_zone, tw);
- return (1);
-}
-
static int
tcptw_auto_size(void)
{
@@ -289,7 +267,11 @@ tcp_twstart(struct tcpcb *tp)
}
}
tw->tw_inpcb = inp;
- refcount_init(&tw->tw_refcount, 1);
+ /*
+ * The tcptw will hold a reference on its inpcb until tcp_twclose
+ * is called
+ */
+ in_pcbref(inp); /* Reference from tw */
/*
* Recover last window size sent.
@@ -479,7 +461,6 @@ tcp_twclose(struct tcptw *tw, int reuse)
INP_INFO_WLOCK_ASSERT(&V_tcbinfo); /* in_pcbfree() */
INP_WLOCK_ASSERT(inp);
- tw->tw_inpcb = NULL;
tcp_tw_2msl_stop(tw, reuse);
inp->inp_ppcb = NULL;
in_pcbdrop(inp);
@@ -509,8 +490,13 @@ tcp_twclose(struct tcptw *tw, int reuse)
*/
INP_WUNLOCK(inp);
}
- } else
+ } else {
+ /*
+ * The socket has been already cleaned-up for us, only free the
+ * inpcb.
+ */
in_pcbfree(inp);
+ }
TCPSTAT_INC(tcps_closed);
}
@@ -641,36 +627,70 @@ tcp_tw_2msl_reset(struct tcptw *tw, int rearm)
static void
tcp_tw_2msl_stop(struct tcptw *tw, int reuse)
{
+ struct ucred *cred;
+ struct inpcb *inp;
+ int released;
INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
TW_WLOCK(V_tw_lock);
+ inp = tw->tw_inpcb;
+ tw->tw_inpcb = NULL;
+
TAILQ_REMOVE(&V_twq_2msl, tw, tw_2msl);
- crfree(tw->tw_cred);
+ cred = tw->tw_cred;
tw->tw_cred = NULL;
TW_WUNLOCK(V_tw_lock);
+ if (cred != NULL)
+ crfree(cred);
+
+ released = in_pcbrele_wlocked(inp);
+ KASSERT(!released, ("%s: inp should not be released here", __func__));
+
if (!reuse)
- tw_pcbrele(tw);
+ uma_zfree(V_tcptw_zone, tw);
}
struct tcptw *
tcp_tw_2msl_reuse(void)
{
struct tcptw *tw;
+ struct inpcb *inp;
INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
- TW_WLOCK(V_tw_lock);
- tw = TAILQ_FIRST(&V_twq_2msl);
- if (tw == NULL) {
- TW_WUNLOCK(V_tw_lock);
- return NULL;
- }
- TW_WUNLOCK(V_tw_lock);
+ for (;;) {
+ TW_RLOCK(V_tw_lock);
+ tw = TAILQ_FIRST(&V_twq_2msl);
+ if (tw == NULL) {
+ TW_RUNLOCK(V_tw_lock);
+ break;
+ }
+ KASSERT(tw->tw_inpcb != NULL, ("%s: tw->tw_inpcb == NULL",
+ __func__));
- INP_WLOCK(tw->tw_inpcb);
- tcp_twclose(tw, 1);
+ inp = tw->tw_inpcb;
+ in_pcbref(inp);
+ TW_RUNLOCK(V_tw_lock);
+
+ INP_WLOCK(inp);
+ tw = intotw(inp);
+ if (in_pcbrele_wlocked(inp)) {
+ KASSERT(tw == NULL, ("%s: held last inp reference but "
+ "tw not NULL", __func__));
+ continue;
+ }
+
+ if (tw == NULL) {
+ /* tcp_twclose() has already been called */
+ INP_WUNLOCK(inp);
+ continue;
+ }
+
+ tcp_twclose(tw, 1);
+ break;
+ }
return (tw);
}
@@ -679,6 +699,7 @@ void
tcp_tw_2msl_scan(void)
{
struct tcptw *tw;
+ struct inpcb *inp;
for (;;) {
TW_RLOCK(V_tw_lock);
@@ -687,24 +708,38 @@ tcp_tw_2msl_scan(void)
TW_RUNLOCK(V_tw_lock);
break;
}
- tw_pcbref(tw);
+ KASSERT(tw->tw_inpcb != NULL, ("%s: tw->tw_inpcb == NULL",
+ __func__));
+
+ inp = tw->tw_inpcb;
+ in_pcbref(inp);
TW_RUNLOCK(V_tw_lock);
- /* Close timewait state */
if (INP_INFO_TRY_WLOCK(&V_tcbinfo)) {
- if (tw_pcbrele(tw)) {
+
+ INP_WLOCK(inp);
+ tw = intotw(inp);
+ if (in_pcbrele_wlocked(inp)) {
+ KASSERT(tw == NULL, ("%s: held last inp "
+ "reference but tw not NULL", __func__));
+ INP_INFO_WUNLOCK(&V_tcbinfo);
+ continue;
+ }
+
+ if (tw == NULL) {
+ /* tcp_twclose() has already been called */
+ INP_WUNLOCK(inp);
INP_INFO_WUNLOCK(&V_tcbinfo);
continue;
}
- KASSERT(tw->tw_inpcb != NULL,
- ("%s: tw->tw_inpcb == NULL", __func__));
- INP_WLOCK(tw->tw_inpcb);
tcp_twclose(tw, 0);
INP_INFO_WUNLOCK(&V_tcbinfo);
} else {
- /* INP_INFO lock is busy; continue later. */
- tw_pcbrele(tw);
+ /* INP_INFO lock is busy, continue later. */
+ INP_WLOCK(inp);
+ if (!in_pcbrele_wlocked(inp))
+ INP_WUNLOCK(inp);
break;
}
}
diff --git a/sys/netinet/tcp_usrreq.c b/sys/netinet/tcp_usrreq.c
index 22a160c..908afa2 100644
--- a/sys/netinet/tcp_usrreq.c
+++ b/sys/netinet/tcp_usrreq.c
@@ -183,6 +183,21 @@ tcp_detach(struct socket *so, struct inpcb *inp)
* present until timewait ends.
*
* XXXRW: Would it be cleaner to free the tcptw here?
+ *
+ * Astute question indeed, from twtcp perspective there are
+ * three cases to consider:
+ *
+ * #1 tcp_detach is called at tcptw creation time by
+ * tcp_twstart, then do not discard the newly created tcptw
+ * and leave inpcb present until timewait ends
+ * #2 tcp_detach is called at timewait end (or reuse) by
+ * tcp_twclose, then the tcptw has already been discarded
+ * and inpcb is freed here
+ * #3 tcp_detach is called() after timewait ends (or reuse)
+ * (e.g. by soclose), then tcptw has already been discarded
+ * and inpcb is freed here
+ *
+ * In all three cases the tcptw should not be freed here.
*/
if (inp->inp_flags & INP_DROPPED) {
KASSERT(tp == NULL, ("tcp_detach: INP_TIMEWAIT && "
diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h
index c2298fc..93e1b62 100644
--- a/sys/netinet/tcp_var.h
+++ b/sys/netinet/tcp_var.h
@@ -349,7 +349,6 @@ struct tcptw {
u_int t_starttime;
int tw_time;
TAILQ_ENTRY(tcptw) tw_2msl;
- u_int tw_refcount; /* refcount */
};
#define intotcpcb(ip) ((struct tcpcb *)(ip)->inp_ppcb)
-------------- next part --------------
From f4c11c6b678195515bbab8bb2825fa5222ed3a58 Mon Sep 17 00:00:00 2001
From: Julien Charbon <jcharbon at verisign.com>
Date: Fri, 28 Mar 2014 15:36:52 +0100
Subject: [PATCH] [tcp-scale] Introduce the INP_LIST global mutex for
protecting pcbinfo global structures. Then use INP_INFO_RLOCK in critical
paths to increase TCP processing parallelism, and use INP_INFO_WLOCK in full
INPs iteration loops.
Julien's review:
- Fix INP_INFO_WLOCK assertions
- Fixing comments
- Rebased on svn path=/head/; revision=272099
---
sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c | 28 +++++-----
sys/dev/cxgb/ulp/tom/cxgb_listen.c | 14 ++---
sys/dev/cxgbe/tom/t4_connect.c | 4 +-
sys/dev/cxgbe/tom/t4_cpl_io.c | 20 +++----
sys/dev/cxgbe/tom/t4_listen.c | 10 ++--
sys/netinet/in_pcb.c | 43 ++++++++++++---
sys/netinet/in_pcb.h | 73 ++++++++++++++++++-------
sys/netinet/tcp_input.c | 108 ++++++++++++++++++-------------------
sys/netinet/tcp_subr.c | 40 +++++++-------
sys/netinet/tcp_syncache.c | 4 +-
sys/netinet/tcp_timer.c | 40 +++++++-------
sys/netinet/tcp_timewait.c | 24 ++++-----
sys/netinet/tcp_usrreq.c | 40 +++++++-------
sys/netinet/toecore.c | 6 +--
sys/netinet6/in6_pcb.c | 4 +-
15 files changed, 261 insertions(+), 197 deletions(-)
diff --git a/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c b/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c
index a86bf72..f28c83d 100644
--- a/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c
+++ b/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c
@@ -639,7 +639,7 @@ t3_send_fin(struct toedev *tod, struct tcpcb *tp)
unsigned int tid = toep->tp_tid;
#endif
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
INP_WLOCK_ASSERT(inp);
CTR4(KTR_CXGB, "%s: tid %d, toep %p, flags %x", __func__, tid, toep,
@@ -925,12 +925,12 @@ do_act_open_rpl(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
rc = act_open_rpl_status_to_errno(s);
if (rc != EAGAIN)
- INP_INFO_WLOCK(&V_tcbinfo);
+ INP_INFO_RLOCK(&V_tcbinfo);
INP_WLOCK(inp);
toe_connect_failed(tod, inp, rc);
toepcb_release(toep); /* unlocks inp */
if (rc != EAGAIN)
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
m_freem(m);
return (0);
@@ -1061,7 +1061,7 @@ send_reset(struct toepcb *toep)
struct adapter *sc = tod->tod_softc;
struct mbuf *m;
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
INP_WLOCK_ASSERT(inp);
CTR4(KTR_CXGB, "%s: tid %d, toep %p (%x)", __func__, tid, toep,
@@ -1172,12 +1172,12 @@ do_rx_data(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
SOCKBUF_UNLOCK(so_rcv);
INP_WUNLOCK(inp);
- INP_INFO_WLOCK(&V_tcbinfo);
+ INP_INFO_RLOCK(&V_tcbinfo);
INP_WLOCK(inp);
tp = tcp_drop(tp, ECONNRESET);
if (tp)
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
m_freem(m);
return (0);
@@ -1222,7 +1222,7 @@ do_peer_close(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
struct tcpcb *tp;
struct socket *so;
- INP_INFO_WLOCK(&V_tcbinfo);
+ INP_INFO_RLOCK(&V_tcbinfo);
INP_WLOCK(inp);
tp = intotcpcb(inp);
@@ -1250,7 +1250,7 @@ do_peer_close(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
case TCPS_FIN_WAIT_2:
tcp_twstart(tp);
INP_UNLOCK_ASSERT(inp); /* safe, we have a ref on the inp */
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
INP_WLOCK(inp);
toepcb_release(toep); /* no more CPLs expected */
@@ -1264,7 +1264,7 @@ do_peer_close(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
done:
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
m_freem(m);
return (0);
@@ -1285,7 +1285,7 @@ do_close_con_rpl(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
struct tcpcb *tp;
struct socket *so;
- INP_INFO_WLOCK(&V_tcbinfo);
+ INP_INFO_RLOCK(&V_tcbinfo);
INP_WLOCK(inp);
tp = intotcpcb(inp);
@@ -1303,7 +1303,7 @@ do_close_con_rpl(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
tcp_twstart(tp);
release:
INP_UNLOCK_ASSERT(inp); /* safe, we have a ref on the inp */
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
INP_WLOCK(inp);
toepcb_release(toep); /* no more CPLs expected */
@@ -1328,7 +1328,7 @@ do_close_con_rpl(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
done:
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
m_freem(m);
return (0);
@@ -1489,7 +1489,7 @@ do_abort_req(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
return (do_abort_req_synqe(qs, r, m));
inp = toep->tp_inp;
- INP_INFO_WLOCK(&V_tcbinfo); /* for tcp_close */
+ INP_INFO_RLOCK(&V_tcbinfo); /* for tcp_close */
INP_WLOCK(inp);
tp = intotcpcb(inp);
@@ -1523,7 +1523,7 @@ do_abort_req(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
INP_WLOCK(inp); /* re-acquire */
toepcb_release(toep); /* no more CPLs expected */
}
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
send_abort_rpl(tod, tid, qset);
m_freem(m);
diff --git a/sys/dev/cxgb/ulp/tom/cxgb_listen.c b/sys/dev/cxgb/ulp/tom/cxgb_listen.c
index 94a219b..631899d 100644
--- a/sys/dev/cxgb/ulp/tom/cxgb_listen.c
+++ b/sys/dev/cxgb/ulp/tom/cxgb_listen.c
@@ -554,11 +554,11 @@ do_pass_accept_req(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
REJECT_PASS_ACCEPT(); /* no l2te, or ifp mismatch */
}
- INP_INFO_WLOCK(&V_tcbinfo);
+ INP_INFO_RLOCK(&V_tcbinfo);
/* Don't offload if the 4-tuple is already in use */
if (toe_4tuple_check(&inc, &th, ifp) != 0) {
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
REJECT_PASS_ACCEPT();
}
@@ -571,7 +571,7 @@ do_pass_accept_req(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
* resources tied to this listen context.
*/
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
REJECT_PASS_ACCEPT();
}
so = inp->inp_socket;
@@ -713,7 +713,7 @@ do_pass_establish(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
KASSERT(qs->idx == synqe->qset,
("%s qset mismatch %d %d", __func__, qs->idx, synqe->qset));
- INP_INFO_WLOCK(&V_tcbinfo); /* for syncache_expand */
+ INP_INFO_RLOCK(&V_tcbinfo); /* for syncache_expand */
INP_WLOCK(inp);
if (__predict_false(inp->inp_flags & INP_DROPPED)) {
@@ -727,7 +727,7 @@ do_pass_establish(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
("%s: listen socket dropped but tid %u not aborted.",
__func__, tid));
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
m_freem(m);
return (0);
}
@@ -743,7 +743,7 @@ do_pass_establish(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
reset:
t3_send_reset_synqe(tod, synqe);
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
m_freem(m);
return (0);
}
@@ -775,7 +775,7 @@ do_pass_establish(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
inp = release_lctx(td, lctx);
if (inp)
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
release_synqe(synqe);
m_freem(m);
diff --git a/sys/dev/cxgbe/tom/t4_connect.c b/sys/dev/cxgbe/tom/t4_connect.c
index 9973fa5..718f62a 100644
--- a/sys/dev/cxgbe/tom/t4_connect.c
+++ b/sys/dev/cxgbe/tom/t4_connect.c
@@ -208,12 +208,12 @@ do_act_open_rpl(struct sge_iq *iq, const struct rss_header *rss,
rc = act_open_rpl_status_to_errno(status);
if (rc != EAGAIN)
- INP_INFO_WLOCK(&V_tcbinfo);
+ INP_INFO_RLOCK(&V_tcbinfo);
INP_WLOCK(inp);
toe_connect_failed(tod, inp, rc);
final_cpl_received(toep); /* unlocks inp */
if (rc != EAGAIN)
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
return (0);
}
diff --git a/sys/dev/cxgbe/tom/t4_cpl_io.c b/sys/dev/cxgbe/tom/t4_cpl_io.c
index f18e0c7..f0e9b0a 100644
--- a/sys/dev/cxgbe/tom/t4_cpl_io.c
+++ b/sys/dev/cxgbe/tom/t4_cpl_io.c
@@ -1059,7 +1059,7 @@ do_peer_close(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));
- INP_INFO_WLOCK(&V_tcbinfo);
+ INP_INFO_RLOCK(&V_tcbinfo);
INP_WLOCK(inp);
tp = intotcpcb(inp);
@@ -1113,7 +1113,7 @@ do_peer_close(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
case TCPS_FIN_WAIT_2:
tcp_twstart(tp);
INP_UNLOCK_ASSERT(inp); /* safe, we have a ref on the inp */
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
INP_WLOCK(inp);
final_cpl_received(toep);
@@ -1125,7 +1125,7 @@ do_peer_close(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
}
done:
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
return (0);
}
@@ -1152,7 +1152,7 @@ do_close_con_rpl(struct sge_iq *iq, const struct rss_header *rss,
KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));
- INP_INFO_WLOCK(&V_tcbinfo);
+ INP_INFO_RLOCK(&V_tcbinfo);
INP_WLOCK(inp);
tp = intotcpcb(inp);
@@ -1170,7 +1170,7 @@ do_close_con_rpl(struct sge_iq *iq, const struct rss_header *rss,
tcp_twstart(tp);
release:
INP_UNLOCK_ASSERT(inp); /* safe, we have a ref on the inp */
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
INP_WLOCK(inp);
final_cpl_received(toep); /* no more CPLs expected */
@@ -1194,7 +1194,7 @@ do_close_con_rpl(struct sge_iq *iq, const struct rss_header *rss,
}
done:
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
return (0);
}
@@ -1353,7 +1353,7 @@ do_abort_req(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
}
inp = toep->inp;
- INP_INFO_WLOCK(&V_tcbinfo); /* for tcp_close */
+ INP_INFO_RLOCK(&V_tcbinfo); /* for tcp_close */
INP_WLOCK(inp);
tp = intotcpcb(inp);
@@ -1387,7 +1387,7 @@ do_abort_req(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
final_cpl_received(toep);
done:
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
send_abort_rpl(sc, ofld_txq, tid, CPL_ABORT_NO_RST);
return (0);
}
@@ -1501,12 +1501,12 @@ do_rx_data(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
SOCKBUF_UNLOCK(sb);
INP_WUNLOCK(inp);
- INP_INFO_WLOCK(&V_tcbinfo);
+ INP_INFO_RLOCK(&V_tcbinfo);
INP_WLOCK(inp);
tp = tcp_drop(tp, ECONNRESET);
if (tp)
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
return (0);
}
diff --git a/sys/dev/cxgbe/tom/t4_listen.c b/sys/dev/cxgbe/tom/t4_listen.c
index 4380c9e..0faac12 100644
--- a/sys/dev/cxgbe/tom/t4_listen.c
+++ b/sys/dev/cxgbe/tom/t4_listen.c
@@ -1311,15 +1311,15 @@ do_pass_accept_req(struct sge_iq *iq, const struct rss_header *rss,
REJECT_PASS_ACCEPT();
rpl = wrtod(wr);
- INP_INFO_WLOCK(&V_tcbinfo); /* for 4-tuple check */
+ INP_INFO_RLOCK(&V_tcbinfo); /* for 4-tuple check */
/* Don't offload if the 4-tuple is already in use */
if (toe_4tuple_check(&inc, &th, ifp) != 0) {
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
free(wr, M_CXGBE);
REJECT_PASS_ACCEPT();
}
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
inp = lctx->inp; /* listening socket, not owned by TOE */
INP_WLOCK(inp);
@@ -1511,7 +1511,7 @@ do_pass_establish(struct sge_iq *iq, const struct rss_header *rss,
KASSERT(synqe->flags & TPF_SYNQE,
("%s: tid %u (ctx %p) not a synqe", __func__, tid, synqe));
- INP_INFO_WLOCK(&V_tcbinfo); /* for syncache_expand */
+ INP_INFO_RLOCK(&V_tcbinfo); /* for syncache_expand */
INP_WLOCK(inp);
CTR6(KTR_CXGBE,
@@ -1609,7 +1609,7 @@ do_pass_establish(struct sge_iq *iq, const struct rss_header *rss,
inp = release_lctx(sc, lctx);
if (inp != NULL)
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
release_synqe(synqe);
return (0);
diff --git a/sys/netinet/in_pcb.c b/sys/netinet/in_pcb.c
index a0a1b30..b91c5ad 100644
--- a/sys/netinet/in_pcb.c
+++ b/sys/netinet/in_pcb.c
@@ -218,6 +218,7 @@ in_pcbinfo_init(struct inpcbinfo *pcbinfo, const char *name,
INP_INFO_LOCK_INIT(pcbinfo, name);
INP_HASH_LOCK_INIT(pcbinfo, "pcbinfohash"); /* XXXRW: argument? */
+ INP_LIST_LOCK_INIT(pcbinfo, "pcbinfolist");
#ifdef VIMAGE
pcbinfo->ipi_vnet = curvnet;
#endif
@@ -256,6 +257,7 @@ in_pcbinfo_destroy(struct inpcbinfo *pcbinfo)
in_pcbgroup_destroy(pcbinfo);
#endif
uma_zdestroy(pcbinfo->ipi_zone);
+ INP_LIST_LOCK_DESTROY(pcbinfo);
INP_HASH_LOCK_DESTROY(pcbinfo);
INP_INFO_LOCK_DESTROY(pcbinfo);
}
@@ -270,7 +272,14 @@ in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo)
struct inpcb *inp;
int error;
- INP_INFO_WLOCK_ASSERT(pcbinfo);
+#ifdef INVARIANTS
+ if (pcbinfo == &V_tcbinfo) {
+ INP_INFO_RLOCK_ASSERT(pcbinfo);
+ } else {
+ INP_INFO_WLOCK_ASSERT(pcbinfo);
+ }
+#endif
+
error = 0;
inp = uma_zalloc(pcbinfo->ipi_zone, M_NOWAIT);
if (inp == NULL)
@@ -302,6 +311,8 @@ in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo)
inp->inp_flags |= IN6P_IPV6_V6ONLY;
}
#endif
+ INP_WLOCK(inp);
+ INP_LIST_WLOCK(pcbinfo);
LIST_INSERT_HEAD(pcbinfo->ipi_listhead, inp, inp_list);
pcbinfo->ipi_count++;
so->so_pcb = (caddr_t)inp;
@@ -309,9 +320,9 @@ in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo)
if (V_ip6_auto_flowlabel)
inp->inp_flags |= IN6P_AUTOFLOWLABEL;
#endif
- INP_WLOCK(inp);
inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
refcount_init(&inp->inp_refcount, 1); /* Reference from inpcbinfo */
+ INP_LIST_WUNLOCK(pcbinfo);
#if defined(IPSEC) || defined(MAC)
out:
if (error != 0) {
@@ -1239,7 +1250,13 @@ in_pcbfree(struct inpcb *inp)
KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__));
- INP_INFO_WLOCK_ASSERT(pcbinfo);
+#ifdef INVARIANTS
+ if (pcbinfo == &V_tcbinfo) {
+ INP_INFO_RLOCK_ASSERT(pcbinfo);
+ } else {
+ INP_INFO_WLOCK_ASSERT(pcbinfo);
+ }
+#endif
INP_WLOCK_ASSERT(inp);
/* XXXRW: Do as much as possible here. */
@@ -1247,8 +1264,10 @@ in_pcbfree(struct inpcb *inp)
if (inp->inp_sp != NULL)
ipsec_delete_pcbpolicy(inp);
#endif
+ INP_LIST_WLOCK(pcbinfo);
inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
in_pcbremlists(inp);
+ INP_LIST_WUNLOCK(pcbinfo);
#ifdef INET6
if (inp->inp_vflag & INP_IPV6PROTO) {
ip6_freepcbopts(inp->in6p_outputopts);
@@ -1405,7 +1424,7 @@ in_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp)
struct ip_moptions *imo;
int i, gap;
- INP_INFO_RLOCK(pcbinfo);
+ INP_INFO_WLOCK(pcbinfo);
LIST_FOREACH(inp, pcbinfo->ipi_listhead, inp_list) {
INP_WLOCK(inp);
imo = inp->inp_moptions;
@@ -1435,7 +1454,7 @@ in_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp)
}
INP_WUNLOCK(inp);
}
- INP_INFO_RUNLOCK(pcbinfo);
+ INP_INFO_WUNLOCK(pcbinfo);
}
/*
@@ -2171,8 +2190,16 @@ in_pcbremlists(struct inpcb *inp)
{
struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
- INP_INFO_WLOCK_ASSERT(pcbinfo);
+#ifdef INVARIANTS
+ if (pcbinfo == &V_tcbinfo) {
+ INP_INFO_RLOCK_ASSERT(pcbinfo);
+ } else {
+ INP_INFO_WLOCK_ASSERT(pcbinfo);
+ }
+#endif
+
INP_WLOCK_ASSERT(inp);
+ INP_LIST_WLOCK_ASSERT(pcbinfo);
inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
if (inp->inp_flags & INP_INHASHLIST) {
@@ -2317,13 +2344,13 @@ inp_apply_all(void (*func)(struct inpcb *, void *), void *arg)
{
struct inpcb *inp;
- INP_INFO_RLOCK(&V_tcbinfo);
+ INP_INFO_WLOCK(&V_tcbinfo);
LIST_FOREACH(inp, V_tcbinfo.ipi_listhead, inp_list) {
INP_WLOCK(inp);
func(inp, arg);
INP_WUNLOCK(inp);
}
- INP_INFO_RUNLOCK(&V_tcbinfo);
+ INP_INFO_WUNLOCK(&V_tcbinfo);
}
struct socket *
diff --git a/sys/netinet/in_pcb.h b/sys/netinet/in_pcb.h
index 185bcfb..661d7b8 100644
--- a/sys/netinet/in_pcb.h
+++ b/sys/netinet/in_pcb.h
@@ -134,19 +134,20 @@ struct icmp6_filter;
* and IPv6 sockets. In the case of TCP, further per-connection state is
* hung off of inp_ppcb most of the time. Almost all fields of struct inpcb
* are static after creation or protected by a per-inpcb rwlock, inp_lock. A
- * few fields also require the global pcbinfo lock for the inpcb to be held,
- * when modified, such as the global connection lists and hashes, as well as
- * binding information (which affects which hash a connection is on). This
- * model means that connections can be looked up without holding the
- * per-connection lock, which is important for performance when attempting to
- * find the connection for a packet given its IP and port tuple. Writing to
- * these fields that write locks be held on both the inpcb and global locks.
+ * few fields also require the global pcblist lock for the inpcb to be held,
+ * when modified, such as the global connection lists. This model means that
+ * connections can be looked up without holding the per-connection lock, which
+ * is important for performance when attempting to find the connection for a
+ * packet given its IP and port tuple. Writing to these fields that write
+ * locks be held on both the inpcb and global locks.
*
* Key:
* (c) - Constant after initialization
* (g) - Protected by the pcbgroup lock
* (i) - Protected by the inpcb lock
* (p) - Protected by the pcbinfo lock for the inpcb
+ * (l) - Protected by the pcblist lock for the inpcb
+ * (h) - Protected by the pcbhash lock for the inpcb
* (s) - Protected by another subsystem's locks
* (x) - Undefined locking
*
@@ -163,13 +164,13 @@ struct icmp6_filter;
* The inp_vflag field is overloaded, and would otherwise ideally be (c).
*/
struct inpcb {
- LIST_ENTRY(inpcb) inp_hash; /* (i/p) hash list */
+ LIST_ENTRY(inpcb) inp_hash; /* (i/h) hash list */
LIST_ENTRY(inpcb) inp_pcbgrouphash; /* (g/i) hash list */
- LIST_ENTRY(inpcb) inp_list; /* (i/p) list for all PCBs for proto */
+ LIST_ENTRY(inpcb) inp_list; /* (i/l) list for all PCBs for proto */
void *inp_ppcb; /* (i) pointer to per-protocol pcb */
struct inpcbinfo *inp_pcbinfo; /* (c) PCB list info */
struct inpcbgroup *inp_pcbgroup; /* (g/i) PCB group list */
- LIST_ENTRY(inpcb) inp_pcbgroup_wild; /* (g/i/p) group wildcard entry */
+ LIST_ENTRY(inpcb) inp_pcbgroup_wild; /* (g/i/h) group wildcard entry */
struct socket *inp_socket; /* (i) back pointer to socket */
struct ucred *inp_cred; /* (c) cache of socket cred */
u_int32_t inp_flow; /* (i) IPv6 flow information */
@@ -188,7 +189,7 @@ struct inpcb {
* general use */
/* Local and foreign ports, local and foreign addr. */
- struct in_conninfo inp_inc; /* (i/p) list for PCB's local port */
+ struct in_conninfo inp_inc; /* (i) list for PCB's local port */
/* MAC and IPSEC policy information. */
struct label *inp_label; /* (i) MAC label */
@@ -213,8 +214,8 @@ struct inpcb {
int inp6_cksum;
short inp6_hops;
} inp_depend6;
- LIST_ENTRY(inpcb) inp_portlist; /* (i/p) */
- struct inpcbport *inp_phd; /* (i/p) head of this list */
+ LIST_ENTRY(inpcb) inp_portlist; /* (i/h) */
+ struct inpcbport *inp_phd; /* (i/h) head of this list */
#define inp_zero_size offsetof(struct inpcb, inp_gencnt)
inp_gen_t inp_gencnt; /* (c) generation count */
struct llentry *inp_lle; /* cached L2 information */
@@ -279,16 +280,24 @@ struct inpcbport {
* Global data structure for each high-level protocol (UDP, TCP, ...) in both
* IPv4 and IPv6. Holds inpcb lists and information for managing them.
*
- * Each pcbinfo is protected by two locks: ipi_lock and ipi_hash_lock,
- * the former covering mutable global fields (such as the global pcb list),
- * and the latter covering the hashed lookup tables. The lock order is:
+ * Each pcbinfo is protected by three locks: ipi_lock, ipi_hash_lock and
+ * ipi_list_lock:
+ * - ipi_lock covering the global pcb list stability during loop iteration,
+ * - ipi_hash_lock covering the hashed lookup tables,
+ * - ipi_list_lock covering mutable global fields (such as the global
+ * pcb list)
*
- * ipi_lock (before) inpcb locks (before) {ipi_hash_lock, pcbgroup locks}
+ * The lock order is:
+ *
+ * ipi_lock (before)
+ * inpcb locks (before)
+ * {ipi_hash_lock, ipi_list_lock, pcbgroup locks}
*
* Locking key:
*
* (c) Constant or nearly constant after initialisation
* (g) Locked by ipi_lock
+ * (l) Locked by ipi_list_lock
* (h) Read using either ipi_hash_lock or inpcb lock; write requires both
* (p) Protected by one or more pcbgroup locks
* (x) Synchronisation properties poorly defined
@@ -302,14 +311,14 @@ struct inpcbinfo {
/*
* Global list of inpcbs on the protocol.
*/
- struct inpcbhead *ipi_listhead; /* (g) */
- u_int ipi_count; /* (g) */
+ struct inpcbhead *ipi_listhead; /* (g/l) */
+ u_int ipi_count; /* (g/l) */
/*
* Generation count -- incremented each time a connection is allocated
* or freed.
*/
- u_quad_t ipi_gencnt; /* (g) */
+ u_quad_t ipi_gencnt; /* (g/l) */
/*
* Fields associated with port lookup and allocation.
@@ -367,6 +376,11 @@ struct inpcbinfo {
* general use 2
*/
void *ipi_pspare[2];
+
+ /*
+ * Global lock protecting global inpcb list, inpcb count, etc.
+ */
+ struct rwlock ipi_list_lock;
};
#ifdef _KERNEL
@@ -466,6 +480,25 @@ short inp_so_options(const struct inpcb *inp);
#define INP_INFO_WLOCK_ASSERT(ipi) rw_assert(&(ipi)->ipi_lock, RA_WLOCKED)
#define INP_INFO_UNLOCK_ASSERT(ipi) rw_assert(&(ipi)->ipi_lock, RA_UNLOCKED)
+#define INP_LIST_LOCK_INIT(ipi, d) \
+ rw_init_flags(&(ipi)->ipi_list_lock, (d), 0)
+#define INP_LIST_LOCK_DESTROY(ipi) rw_destroy(&(ipi)->ipi_list_lock)
+#define INP_LIST_RLOCK(ipi) rw_rlock(&(ipi)->ipi_list_lock)
+#define INP_LIST_WLOCK(ipi) rw_wlock(&(ipi)->ipi_list_lock)
+#define INP_LIST_TRY_RLOCK(ipi) rw_try_rlock(&(ipi)->ipi_list_lock)
+#define INP_LIST_TRY_WLOCK(ipi) rw_try_wlock(&(ipi)->ipi_list_lock)
+#define INP_LIST_TRY_UPGRADE(ipi) rw_try_upgrade(&(ipi)->ipi_list_lock)
+#define INP_LIST_RUNLOCK(ipi) rw_runlock(&(ipi)->ipi_list_lock)
+#define INP_LIST_WUNLOCK(ipi) rw_wunlock(&(ipi)->ipi_list_lock)
+#define INP_LIST_LOCK_ASSERT(ipi) \
+ rw_assert(&(ipi)->ipi_list_lock, RA_LOCKED)
+#define INP_LIST_RLOCK_ASSERT(ipi) \
+ rw_assert(&(ipi)->ipi_list_lock, RA_RLOCKED)
+#define INP_LIST_WLOCK_ASSERT(ipi) \
+ rw_assert(&(ipi)->ipi_list_lock, RA_WLOCKED)
+#define INP_LIST_UNLOCK_ASSERT(ipi) \
+ rw_assert(&(ipi)->ipi_list_lock, RA_UNLOCKED)
+
#define INP_HASH_LOCK_INIT(ipi, d) \
rw_init_flags(&(ipi)->ipi_hash_lock, (d), 0)
#define INP_HASH_LOCK_DESTROY(ipi) rw_destroy(&(ipi)->ipi_hash_lock)
diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c
index e338b1e..8090303 100644
--- a/sys/netinet/tcp_input.c
+++ b/sys/netinet/tcp_input.c
@@ -571,7 +571,7 @@ tcp_input(struct mbuf **mp, int *offp, int proto)
char *s = NULL; /* address and port logging */
int ti_locked;
#define TI_UNLOCKED 1
-#define TI_WLOCKED 2
+#define TI_RLOCKED 2
#ifdef TCPDEBUG
/*
@@ -760,8 +760,8 @@ tcp_input(struct mbuf **mp, int *offp, int proto)
* connection in TIMEWAIT and SYNs not targeting a listening socket.
*/
if ((thflags & (TH_FIN | TH_RST)) != 0) {
- INP_INFO_WLOCK(&V_tcbinfo);
- ti_locked = TI_WLOCKED;
+ INP_INFO_RLOCK(&V_tcbinfo);
+ ti_locked = TI_RLOCKED;
} else
ti_locked = TI_UNLOCKED;
@@ -783,8 +783,8 @@ tcp_input(struct mbuf **mp, int *offp, int proto)
findpcb:
#ifdef INVARIANTS
- if (ti_locked == TI_WLOCKED) {
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ if (ti_locked == TI_RLOCKED) {
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
} else {
INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
}
@@ -936,20 +936,20 @@ tcp_input(struct mbuf **mp, int *offp, int proto)
relocked:
if (inp->inp_flags & INP_TIMEWAIT) {
if (ti_locked == TI_UNLOCKED) {
- if (INP_INFO_TRY_WLOCK(&V_tcbinfo) == 0) {
+ if (INP_INFO_TRY_RLOCK(&V_tcbinfo) == 0) {
in_pcbref(inp);
INP_WUNLOCK(inp);
- INP_INFO_WLOCK(&V_tcbinfo);
- ti_locked = TI_WLOCKED;
+ INP_INFO_RLOCK(&V_tcbinfo);
+ ti_locked = TI_RLOCKED;
INP_WLOCK(inp);
if (in_pcbrele_wlocked(inp)) {
inp = NULL;
goto findpcb;
}
} else
- ti_locked = TI_WLOCKED;
+ ti_locked = TI_RLOCKED;
}
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
if (thflags & TH_SYN)
tcp_dooptions(&to, optp, optlen, TO_SYN);
@@ -958,7 +958,7 @@ tcp_input(struct mbuf **mp, int *offp, int proto)
*/
if (tcp_twcheck(inp, &to, th, m, tlen))
goto findpcb;
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
return (IPPROTO_DONE);
}
/*
@@ -989,16 +989,16 @@ tcp_input(struct mbuf **mp, int *offp, int proto)
*/
#ifdef INVARIANTS
if ((thflags & (TH_FIN | TH_RST)) != 0)
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
#endif
if (!((tp->t_state == TCPS_ESTABLISHED && (thflags & TH_SYN) == 0) ||
(tp->t_state == TCPS_LISTEN && (thflags & TH_SYN)))) {
if (ti_locked == TI_UNLOCKED) {
- if (INP_INFO_TRY_WLOCK(&V_tcbinfo) == 0) {
+ if (INP_INFO_TRY_RLOCK(&V_tcbinfo) == 0) {
in_pcbref(inp);
INP_WUNLOCK(inp);
- INP_INFO_WLOCK(&V_tcbinfo);
- ti_locked = TI_WLOCKED;
+ INP_INFO_RLOCK(&V_tcbinfo);
+ ti_locked = TI_RLOCKED;
INP_WLOCK(inp);
if (in_pcbrele_wlocked(inp)) {
inp = NULL;
@@ -1006,9 +1006,9 @@ tcp_input(struct mbuf **mp, int *offp, int proto)
}
goto relocked;
} else
- ti_locked = TI_WLOCKED;
+ ti_locked = TI_RLOCKED;
}
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
}
#ifdef MAC
@@ -1063,7 +1063,7 @@ tcp_input(struct mbuf **mp, int *offp, int proto)
*/
if ((thflags & (TH_RST|TH_ACK|TH_SYN)) == TH_ACK) {
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
/*
* Parse the TCP options here because
* syncookies need access to the reflected
@@ -1346,8 +1346,8 @@ tcp_input(struct mbuf **mp, int *offp, int proto)
* Entry added to syncache and mbuf consumed.
* Only the listen socket is unlocked by syncache_add().
*/
- if (ti_locked == TI_WLOCKED) {
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ if (ti_locked == TI_RLOCKED) {
+ INP_INFO_RUNLOCK(&V_tcbinfo);
ti_locked = TI_UNLOCKED;
}
INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
@@ -1396,8 +1396,8 @@ tcp_input(struct mbuf **mp, int *offp, int proto)
dropwithreset:
TCP_PROBE5(receive, NULL, tp, mtod(m, const char *), tp, th);
- if (ti_locked == TI_WLOCKED) {
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ if (ti_locked == TI_RLOCKED) {
+ INP_INFO_RUNLOCK(&V_tcbinfo);
ti_locked = TI_UNLOCKED;
}
#ifdef INVARIANTS
@@ -1420,8 +1420,8 @@ tcp_input(struct mbuf **mp, int *offp, int proto)
if (m != NULL)
TCP_PROBE5(receive, NULL, tp, mtod(m, const char *), tp, th);
- if (ti_locked == TI_WLOCKED) {
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ if (ti_locked == TI_RLOCKED) {
+ INP_INFO_RUNLOCK(&V_tcbinfo);
ti_locked = TI_UNLOCKED;
}
#ifdef INVARIANTS
@@ -1478,13 +1478,13 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
*/
if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 ||
tp->t_state != TCPS_ESTABLISHED) {
- KASSERT(ti_locked == TI_WLOCKED, ("%s ti_locked %d for "
+ KASSERT(ti_locked == TI_RLOCKED, ("%s ti_locked %d for "
"SYN/FIN/RST/!EST", __func__, ti_locked));
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
} else {
#ifdef INVARIANTS
- if (ti_locked == TI_WLOCKED)
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ if (ti_locked == TI_RLOCKED)
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
else {
KASSERT(ti_locked == TI_UNLOCKED, ("%s: EST "
"ti_locked: %d", __func__, ti_locked));
@@ -1652,8 +1652,8 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
/*
* This is a pure ack for outstanding data.
*/
- if (ti_locked == TI_WLOCKED)
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ if (ti_locked == TI_RLOCKED)
+ INP_INFO_RUNLOCK(&V_tcbinfo);
ti_locked = TI_UNLOCKED;
TCPSTAT_INC(tcps_predack);
@@ -1756,8 +1756,8 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
* nothing on the reassembly queue and we have enough
* buffer space to take it.
*/
- if (ti_locked == TI_WLOCKED)
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ if (ti_locked == TI_RLOCKED)
+ INP_INFO_RUNLOCK(&V_tcbinfo);
ti_locked = TI_UNLOCKED;
/* Clean receiver SACK report if present */
@@ -1992,9 +1992,9 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
tcp_state_change(tp, TCPS_SYN_RECEIVED);
}
- KASSERT(ti_locked == TI_WLOCKED, ("%s: trimthenstep6: "
+ KASSERT(ti_locked == TI_RLOCKED, ("%s: trimthenstep6: "
"ti_locked %d", __func__, ti_locked));
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
INP_WLOCK_ASSERT(tp->t_inpcb);
/*
@@ -2067,8 +2067,8 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) ||
(tp->rcv_wnd == 0 && tp->last_ack_sent == th->th_seq)) {
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
- KASSERT(ti_locked == TI_WLOCKED,
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
+ KASSERT(ti_locked == TI_RLOCKED,
("%s: TH_RST ti_locked %d, th %p tp %p",
__func__, ti_locked, th, tp));
KASSERT(tp->t_state != TCPS_SYN_SENT,
@@ -2111,9 +2111,9 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
* Send challenge ACK for any SYN in synchronized state.
*/
if ((thflags & TH_SYN) && tp->t_state != TCPS_SYN_SENT) {
- KASSERT(ti_locked == TI_WLOCKED,
+ KASSERT(ti_locked == TI_RLOCKED,
("tcp_do_segment: TH_SYN ti_locked %d", ti_locked));
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
TCPSTAT_INC(tcps_badsyn);
if (V_tcp_insecure_syn &&
@@ -2226,9 +2226,9 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
*/
if ((so->so_state & SS_NOFDREF) &&
tp->t_state > TCPS_CLOSE_WAIT && tlen) {
- KASSERT(ti_locked == TI_WLOCKED, ("%s: SS_NOFDEREF && "
+ KASSERT(ti_locked == TI_RLOCKED, ("%s: SS_NOFDEREF && "
"CLOSE_WAIT && tlen ti_locked %d", __func__, ti_locked));
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
log(LOG_DEBUG, "%s; %s: %s: Received %d bytes of data "
@@ -2729,9 +2729,9 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
*/
case TCPS_CLOSING:
if (ourfinisacked) {
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
tcp_twstart(tp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
m_freem(m);
return;
}
@@ -2745,7 +2745,7 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
*/
case TCPS_LAST_ACK:
if (ourfinisacked) {
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
tp = tcp_close(tp);
goto drop;
}
@@ -2959,18 +2959,18 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
* standard timers.
*/
case TCPS_FIN_WAIT_2:
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
- KASSERT(ti_locked == TI_WLOCKED, ("%s: dodata "
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
+ KASSERT(ti_locked == TI_RLOCKED, ("%s: dodata "
"TCP_FIN_WAIT_2 ti_locked: %d", __func__,
ti_locked));
tcp_twstart(tp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
return;
}
}
- if (ti_locked == TI_WLOCKED)
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ if (ti_locked == TI_RLOCKED)
+ INP_INFO_RUNLOCK(&V_tcbinfo);
ti_locked = TI_UNLOCKED;
#ifdef TCPDEBUG
@@ -3025,8 +3025,8 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen,
&tcp_savetcp, 0);
#endif
- if (ti_locked == TI_WLOCKED)
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ if (ti_locked == TI_RLOCKED)
+ INP_INFO_RUNLOCK(&V_tcbinfo);
ti_locked = TI_UNLOCKED;
tp->t_flags |= TF_ACKNOW;
@@ -3036,8 +3036,8 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
return;
dropwithreset:
- if (ti_locked == TI_WLOCKED)
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ if (ti_locked == TI_RLOCKED)
+ INP_INFO_RUNLOCK(&V_tcbinfo);
ti_locked = TI_UNLOCKED;
if (tp != NULL) {
@@ -3048,8 +3048,8 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
return;
drop:
- if (ti_locked == TI_WLOCKED) {
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ if (ti_locked == TI_RLOCKED) {
+ INP_INFO_RUNLOCK(&V_tcbinfo);
ti_locked = TI_UNLOCKED;
}
#ifdef INVARIANTS
diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c
index 7adda33..ff2153e 100644
--- a/sys/netinet/tcp_subr.c
+++ b/sys/netinet/tcp_subr.c
@@ -849,11 +849,11 @@ tcp_ccalgounload(struct cc_algo *unload_algo)
VNET_LIST_RLOCK();
VNET_FOREACH(vnet_iter) {
CURVNET_SET(vnet_iter);
- INP_INFO_RLOCK(&V_tcbinfo);
+ INP_INFO_WLOCK(&V_tcbinfo);
/*
* New connections already part way through being initialised
* with the CC algo we're removing will not race with this code
- * because the INP_INFO_WLOCK is held during initialisation. We
+ * because the INP_INFO_RLOCK is held during initialisation. We
* therefore don't enter the loop below until the connection
* list has stabilised.
*/
@@ -879,7 +879,7 @@ tcp_ccalgounload(struct cc_algo *unload_algo)
}
INP_WUNLOCK(inp);
}
- INP_INFO_RUNLOCK(&V_tcbinfo);
+ INP_INFO_WUNLOCK(&V_tcbinfo);
CURVNET_RESTORE();
}
VNET_LIST_RUNLOCK();
@@ -897,7 +897,7 @@ tcp_drop(struct tcpcb *tp, int errno)
{
struct socket *so = tp->t_inpcb->inp_socket;
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
INP_WLOCK_ASSERT(tp->t_inpcb);
if (TCPS_HAVERCVDSYN(tp->t_state)) {
@@ -1033,7 +1033,7 @@ tcp_close(struct tcpcb *tp)
struct inpcb *inp = tp->t_inpcb;
struct socket *so;
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
INP_WLOCK_ASSERT(inp);
#ifdef TCP_OFFLOAD
@@ -1081,7 +1081,7 @@ tcp_drain(void)
* where we're really low on mbufs, this is potentially
* useful.
*/
- INP_INFO_RLOCK(&V_tcbinfo);
+ INP_INFO_WLOCK(&V_tcbinfo);
LIST_FOREACH(inpb, V_tcbinfo.ipi_listhead, inp_list) {
if (inpb->inp_flags & INP_TIMEWAIT)
continue;
@@ -1092,7 +1092,7 @@ tcp_drain(void)
}
INP_WUNLOCK(inpb);
}
- INP_INFO_RUNLOCK(&V_tcbinfo);
+ INP_INFO_WUNLOCK(&V_tcbinfo);
CURVNET_RESTORE();
}
VNET_LIST_RUNLOCK_NOSLEEP();
@@ -1176,8 +1176,10 @@ tcp_pcblist(SYSCTL_HANDLER_ARGS)
* OK, now we're committed to doing something.
*/
INP_INFO_RLOCK(&V_tcbinfo);
+ INP_LIST_RLOCK(&V_tcbinfo);
gencnt = V_tcbinfo.ipi_gencnt;
n = V_tcbinfo.ipi_count;
+ INP_LIST_RUNLOCK(&V_tcbinfo);
INP_INFO_RUNLOCK(&V_tcbinfo);
m = syncache_pcbcount();
@@ -1203,7 +1205,7 @@ tcp_pcblist(SYSCTL_HANDLER_ARGS)
if (inp_list == NULL)
return (ENOMEM);
- INP_INFO_RLOCK(&V_tcbinfo);
+ INP_INFO_WLOCK(&V_tcbinfo);
for (inp = LIST_FIRST(V_tcbinfo.ipi_listhead), i = 0;
inp != NULL && i < n; inp = LIST_NEXT(inp, inp_list)) {
INP_WLOCK(inp);
@@ -1228,7 +1230,7 @@ tcp_pcblist(SYSCTL_HANDLER_ARGS)
}
INP_WUNLOCK(inp);
}
- INP_INFO_RUNLOCK(&V_tcbinfo);
+ INP_INFO_WUNLOCK(&V_tcbinfo);
n = i;
error = 0;
@@ -1266,14 +1268,14 @@ tcp_pcblist(SYSCTL_HANDLER_ARGS)
} else
INP_RUNLOCK(inp);
}
- INP_INFO_WLOCK(&V_tcbinfo);
+ INP_INFO_RLOCK(&V_tcbinfo);
for (i = 0; i < n; i++) {
inp = inp_list[i];
INP_RLOCK(inp);
if (!in_pcbrele_rlocked(inp))
INP_RUNLOCK(inp);
}
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
if (!error) {
/*
@@ -1284,9 +1286,11 @@ tcp_pcblist(SYSCTL_HANDLER_ARGS)
* might be necessary to retry.
*/
INP_INFO_RLOCK(&V_tcbinfo);
+ INP_LIST_RLOCK(&V_tcbinfo);
xig.xig_gen = V_tcbinfo.ipi_gencnt;
xig.xig_sogen = so_gencnt;
xig.xig_count = V_tcbinfo.ipi_count + pcb_count;
+ INP_LIST_RUNLOCK(&V_tcbinfo);
INP_INFO_RUNLOCK(&V_tcbinfo);
error = SYSCTL_OUT(req, &xig, sizeof xig);
}
@@ -1448,7 +1452,7 @@ tcp_ctlinput(int cmd, struct sockaddr *sa, void *vip)
- offsetof(struct icmp, icmp_ip));
th = (struct tcphdr *)((caddr_t)ip
+ (ip->ip_hl << 2));
- INP_INFO_WLOCK(&V_tcbinfo);
+ INP_INFO_RLOCK(&V_tcbinfo);
inp = in_pcblookup(&V_tcbinfo, faddr, th->th_dport,
ip->ip_src, th->th_sport, INPLOOKUP_WLOCKPCB, NULL);
if (inp != NULL) {
@@ -1508,7 +1512,7 @@ tcp_ctlinput(int cmd, struct sockaddr *sa, void *vip)
inc.inc_laddr = ip->ip_src;
syncache_unreach(&inc, th);
}
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
} else
in_pcbnotifyall(&V_tcbinfo, faddr, inetctlerrmap[cmd], notify);
}
@@ -1581,9 +1585,9 @@ tcp6_ctlinput(int cmd, struct sockaddr *sa, void *d)
inc.inc6_faddr = ((struct sockaddr_in6 *)sa)->sin6_addr;
inc.inc6_laddr = ip6cp->ip6c_src->sin6_addr;
inc.inc_flags |= INC_ISIPV6;
- INP_INFO_WLOCK(&V_tcbinfo);
+ INP_INFO_RLOCK(&V_tcbinfo);
syncache_unreach(&inc, &th);
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
} else
in6_pcbnotify(&V_tcbinfo, sa, 0, (const struct sockaddr *)sa6_src,
0, cmd, NULL, notify);
@@ -1716,7 +1720,7 @@ tcp_drop_syn_sent(struct inpcb *inp, int errno)
{
struct tcpcb *tp;
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
INP_WLOCK_ASSERT(inp);
if ((inp->inp_flags & INP_TIMEWAIT) ||
@@ -2239,7 +2243,7 @@ sysctl_drop(SYSCTL_HANDLER_ARGS)
default:
return (EINVAL);
}
- INP_INFO_WLOCK(&V_tcbinfo);
+ INP_INFO_RLOCK(&V_tcbinfo);
switch (addrs[0].ss_family) {
#ifdef INET6
case AF_INET6:
@@ -2278,7 +2282,7 @@ sysctl_drop(SYSCTL_HANDLER_ARGS)
INP_WUNLOCK(inp);
} else
error = ESRCH;
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
return (error);
}
diff --git a/sys/netinet/tcp_syncache.c b/sys/netinet/tcp_syncache.c
index 55a5044..197c788 100644
--- a/sys/netinet/tcp_syncache.c
+++ b/sys/netinet/tcp_syncache.c
@@ -662,7 +662,7 @@ syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m)
int error;
char *s;
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
/*
* Ok, create the full blown connection, and set things up
@@ -944,7 +944,7 @@ syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th,
* Global TCP locks are held because we manipulate the PCB lists
* and create a new socket.
*/
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
KASSERT((th->th_flags & (TH_RST|TH_ACK|TH_SYN)) == TH_ACK,
("%s: can handle only ACK", __func__));
diff --git a/sys/netinet/tcp_timer.c b/sys/netinet/tcp_timer.c
index 1767e1e..997638c 100644
--- a/sys/netinet/tcp_timer.c
+++ b/sys/netinet/tcp_timer.c
@@ -269,7 +269,7 @@ tcp_timer_2msl(void *xtp)
/*
* XXXRW: Does this actually happen?
*/
- INP_INFO_WLOCK(&V_tcbinfo);
+ INP_INFO_RLOCK(&V_tcbinfo);
inp = tp->t_inpcb;
/*
* XXXRW: While this assert is in fact correct, bugs in the tcpcb
@@ -280,7 +280,7 @@ tcp_timer_2msl(void *xtp)
*/
if (inp == NULL) {
tcp_timer_race++;
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
CURVNET_RESTORE();
return;
}
@@ -289,14 +289,14 @@ tcp_timer_2msl(void *xtp)
if (callout_pending(&tp->t_timers->tt_2msl) ||
!callout_active(&tp->t_timers->tt_2msl)) {
INP_WUNLOCK(tp->t_inpcb);
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
CURVNET_RESTORE();
return;
}
callout_deactivate(&tp->t_timers->tt_2msl);
if ((inp->inp_flags & INP_DROPPED) != 0) {
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
CURVNET_RESTORE();
return;
}
@@ -332,7 +332,7 @@ tcp_timer_2msl(void *xtp)
#endif
if (tp != NULL)
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
CURVNET_RESTORE();
}
@@ -348,7 +348,7 @@ tcp_timer_keep(void *xtp)
ostate = tp->t_state;
#endif
- INP_INFO_WLOCK(&V_tcbinfo);
+ INP_INFO_RLOCK(&V_tcbinfo);
inp = tp->t_inpcb;
/*
* XXXRW: While this assert is in fact correct, bugs in the tcpcb
@@ -359,7 +359,7 @@ tcp_timer_keep(void *xtp)
*/
if (inp == NULL) {
tcp_timer_race++;
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
CURVNET_RESTORE();
return;
}
@@ -367,14 +367,14 @@ tcp_timer_keep(void *xtp)
if (callout_pending(&tp->t_timers->tt_keep) ||
!callout_active(&tp->t_timers->tt_keep)) {
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
CURVNET_RESTORE();
return;
}
callout_deactivate(&tp->t_timers->tt_keep);
if ((inp->inp_flags & INP_DROPPED) != 0) {
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
CURVNET_RESTORE();
return;
}
@@ -421,7 +421,7 @@ tcp_timer_keep(void *xtp)
PRU_SLOWTIMO);
#endif
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
CURVNET_RESTORE();
return;
@@ -436,7 +436,7 @@ tcp_timer_keep(void *xtp)
#endif
if (tp != NULL)
INP_WUNLOCK(tp->t_inpcb);
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
CURVNET_RESTORE();
}
@@ -451,7 +451,7 @@ tcp_timer_persist(void *xtp)
ostate = tp->t_state;
#endif
- INP_INFO_WLOCK(&V_tcbinfo);
+ INP_INFO_RLOCK(&V_tcbinfo);
inp = tp->t_inpcb;
/*
* XXXRW: While this assert is in fact correct, bugs in the tcpcb
@@ -462,7 +462,7 @@ tcp_timer_persist(void *xtp)
*/
if (inp == NULL) {
tcp_timer_race++;
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
CURVNET_RESTORE();
return;
}
@@ -470,14 +470,14 @@ tcp_timer_persist(void *xtp)
if (callout_pending(&tp->t_timers->tt_persist) ||
!callout_active(&tp->t_timers->tt_persist)) {
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
CURVNET_RESTORE();
return;
}
callout_deactivate(&tp->t_timers->tt_persist);
if ((inp->inp_flags & INP_DROPPED) != 0) {
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
CURVNET_RESTORE();
return;
}
@@ -522,7 +522,7 @@ tcp_timer_persist(void *xtp)
#endif
if (tp != NULL)
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
CURVNET_RESTORE();
}
@@ -581,16 +581,16 @@ tcp_timer_rexmt(void * xtp)
in_pcbref(inp);
INP_INFO_RUNLOCK(&V_tcbinfo);
INP_WUNLOCK(inp);
- INP_INFO_WLOCK(&V_tcbinfo);
+ INP_INFO_RLOCK(&V_tcbinfo);
INP_WLOCK(inp);
if (in_pcbrele_wlocked(inp)) {
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
CURVNET_RESTORE();
return;
}
if (inp->inp_flags & INP_DROPPED) {
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
CURVNET_RESTORE();
return;
}
@@ -688,7 +688,7 @@ tcp_timer_rexmt(void * xtp)
if (tp != NULL)
INP_WUNLOCK(inp);
if (headlocked)
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
CURVNET_RESTORE();
}
diff --git a/sys/netinet/tcp_timewait.c b/sys/netinet/tcp_timewait.c
index 9c17655..7687058 100644
--- a/sys/netinet/tcp_timewait.c
+++ b/sys/netinet/tcp_timewait.c
@@ -202,10 +202,10 @@ tcp_tw_destroy(void)
{
struct tcptw *tw;
- INP_INFO_WLOCK(&V_tcbinfo);
+ INP_INFO_RLOCK(&V_tcbinfo);
while ((tw = TAILQ_FIRST(&V_twq_2msl)) != NULL)
tcp_twclose(tw, 0);
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
TW_LOCK_DESTROY(V_tw_lock);
uma_zdestroy(V_tcptw_zone);
@@ -228,7 +228,7 @@ tcp_twstart(struct tcpcb *tp)
int isipv6 = inp->inp_inc.inc_flags & INC_ISIPV6;
#endif
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
INP_WLOCK_ASSERT(inp);
if (V_nolocaltimewait) {
@@ -357,7 +357,7 @@ tcp_twcheck(struct inpcb *inp, struct tcpopt *to __unused, struct tcphdr *th,
int thflags;
tcp_seq seq;
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
INP_WLOCK_ASSERT(inp);
/*
@@ -458,7 +458,7 @@ tcp_twclose(struct tcptw *tw, int reuse)
inp = tw->tw_inpcb;
KASSERT((inp->inp_flags & INP_TIMEWAIT), ("tcp_twclose: !timewait"));
KASSERT(intotw(inp) == tw, ("tcp_twclose: inp_ppcb != tw"));
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo); /* in_pcbfree() */
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo); /* in_pcbfree() */
INP_WLOCK_ASSERT(inp);
tcp_tw_2msl_stop(tw, reuse);
@@ -613,7 +613,7 @@ static void
tcp_tw_2msl_reset(struct tcptw *tw, int rearm)
{
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
INP_WLOCK_ASSERT(tw->tw_inpcb);
TW_WLOCK(V_tw_lock);
@@ -631,7 +631,7 @@ tcp_tw_2msl_stop(struct tcptw *tw, int reuse)
struct inpcb *inp;
int released;
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
TW_WLOCK(V_tw_lock);
inp = tw->tw_inpcb;
@@ -658,7 +658,7 @@ tcp_tw_2msl_reuse(void)
struct tcptw *tw;
struct inpcb *inp;
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
for (;;) {
TW_RLOCK(V_tw_lock);
@@ -715,26 +715,26 @@ tcp_tw_2msl_scan(void)
in_pcbref(inp);
TW_RUNLOCK(V_tw_lock);
- if (INP_INFO_TRY_WLOCK(&V_tcbinfo)) {
+ if (INP_INFO_TRY_RLOCK(&V_tcbinfo)) {
INP_WLOCK(inp);
tw = intotw(inp);
if (in_pcbrele_wlocked(inp)) {
KASSERT(tw == NULL, ("%s: held last inp "
"reference but tw not NULL", __func__));
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
continue;
}
if (tw == NULL) {
/* tcp_twclose() has already been called */
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
continue;
}
tcp_twclose(tw, 0);
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
} else {
/* INP_INFO lock is busy, continue later. */
INP_WLOCK(inp);
diff --git a/sys/netinet/tcp_usrreq.c b/sys/netinet/tcp_usrreq.c
index 2820ffb..faa2d45 100644
--- a/sys/netinet/tcp_usrreq.c
+++ b/sys/netinet/tcp_usrreq.c
@@ -163,7 +163,7 @@ tcp_detach(struct socket *so, struct inpcb *inp)
{
struct tcpcb *tp;
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
INP_WLOCK_ASSERT(inp);
KASSERT(so->so_pcb == inp, ("tcp_detach: so_pcb != inp"));
@@ -244,12 +244,12 @@ tcp_usr_detach(struct socket *so)
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("tcp_usr_detach: inp == NULL"));
- INP_INFO_WLOCK(&V_tcbinfo);
+ INP_INFO_RLOCK(&V_tcbinfo);
INP_WLOCK(inp);
KASSERT(inp->inp_socket != NULL,
("tcp_usr_detach: inp_socket == NULL"));
tcp_detach(so, inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
}
#ifdef INET
@@ -603,7 +603,7 @@ tcp_usr_disconnect(struct socket *so)
int error = 0;
TCPDEBUG0;
- INP_INFO_WLOCK(&V_tcbinfo);
+ INP_INFO_RLOCK(&V_tcbinfo);
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("tcp_usr_disconnect: inp == NULL"));
INP_WLOCK(inp);
@@ -619,7 +619,7 @@ tcp_usr_disconnect(struct socket *so)
out:
TCPDEBUG2(PRU_DISCONNECT);
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
return (error);
}
@@ -734,7 +734,7 @@ tcp_usr_shutdown(struct socket *so)
struct tcpcb *tp = NULL;
TCPDEBUG0;
- INP_INFO_WLOCK(&V_tcbinfo);
+ INP_INFO_RLOCK(&V_tcbinfo);
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("inp == NULL"));
INP_WLOCK(inp);
@@ -752,7 +752,7 @@ tcp_usr_shutdown(struct socket *so)
out:
TCPDEBUG2(PRU_SHUTDOWN);
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
return (error);
}
@@ -814,7 +814,7 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m,
* this call.
*/
if (flags & PRUS_EOF)
- INP_INFO_WLOCK(&V_tcbinfo);
+ INP_INFO_RLOCK(&V_tcbinfo);
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("tcp_usr_send: inp == NULL"));
INP_WLOCK(inp);
@@ -871,7 +871,7 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m,
* Close the send side of the connection after
* the data is sent.
*/
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
socantsendmore(so);
tcp_usrclosed(tp);
}
@@ -935,7 +935,7 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m,
((flags & PRUS_EOF) ? PRU_SEND_EOF : PRU_SEND));
INP_WUNLOCK(inp);
if (flags & PRUS_EOF)
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
return (error);
}
@@ -952,7 +952,7 @@ tcp_usr_abort(struct socket *so)
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("tcp_usr_abort: inp == NULL"));
- INP_INFO_WLOCK(&V_tcbinfo);
+ INP_INFO_RLOCK(&V_tcbinfo);
INP_WLOCK(inp);
KASSERT(inp->inp_socket != NULL,
("tcp_usr_abort: inp_socket == NULL"));
@@ -974,7 +974,7 @@ tcp_usr_abort(struct socket *so)
inp->inp_flags |= INP_SOCKREF;
}
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
}
/*
@@ -990,7 +990,7 @@ tcp_usr_close(struct socket *so)
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("tcp_usr_close: inp == NULL"));
- INP_INFO_WLOCK(&V_tcbinfo);
+ INP_INFO_RLOCK(&V_tcbinfo);
INP_WLOCK(inp);
KASSERT(inp->inp_socket != NULL,
("tcp_usr_close: inp_socket == NULL"));
@@ -1013,7 +1013,7 @@ tcp_usr_close(struct socket *so)
inp->inp_flags |= INP_SOCKREF;
}
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
}
/*
@@ -1611,10 +1611,10 @@ tcp_attach(struct socket *so)
}
so->so_rcv.sb_flags |= SB_AUTOSIZE;
so->so_snd.sb_flags |= SB_AUTOSIZE;
- INP_INFO_WLOCK(&V_tcbinfo);
+ INP_INFO_RLOCK(&V_tcbinfo);
error = in_pcballoc(so, &V_tcbinfo);
if (error) {
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
return (error);
}
inp = sotoinpcb(so);
@@ -1630,12 +1630,12 @@ tcp_attach(struct socket *so)
if (tp == NULL) {
in_pcbdetach(inp);
in_pcbfree(inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
return (ENOBUFS);
}
tp->t_state = TCPS_CLOSED;
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
return (0);
}
@@ -1653,7 +1653,7 @@ tcp_disconnect(struct tcpcb *tp)
struct inpcb *inp = tp->t_inpcb;
struct socket *so = inp->inp_socket;
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
INP_WLOCK_ASSERT(inp);
/*
@@ -1691,7 +1691,7 @@ static void
tcp_usrclosed(struct tcpcb *tp)
{
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
INP_WLOCK_ASSERT(tp->t_inpcb);
switch (tp->t_state) {
diff --git a/sys/netinet/toecore.c b/sys/netinet/toecore.c
index 1ab6c73..5887576 100644
--- a/sys/netinet/toecore.c
+++ b/sys/netinet/toecore.c
@@ -339,7 +339,7 @@ toe_syncache_expand(struct in_conninfo *inc, struct tcpopt *to,
struct tcphdr *th, struct socket **lsop)
{
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
return (syncache_expand(inc, to, th, lsop, NULL));
}
@@ -370,7 +370,7 @@ toe_4tuple_check(struct in_conninfo *inc, struct tcphdr *th, struct ifnet *ifp)
if ((inp->inp_flags & INP_TIMEWAIT) && th != NULL) {
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo); /* for twcheck */
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo); /* for twcheck */
if (!tcp_twcheck(inp, NULL, th, NULL, 0))
return (EADDRINUSE);
} else {
@@ -574,7 +574,7 @@ toe_connect_failed(struct toedev *tod, struct inpcb *inp, int err)
(void) tcp_output(tp);
} else {
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
tp = tcp_drop(tp, err);
if (tp == NULL)
INP_WLOCK(inp); /* re-acquire */
diff --git a/sys/netinet6/in6_pcb.c b/sys/netinet6/in6_pcb.c
index 2be2e83..0fcf091 100644
--- a/sys/netinet6/in6_pcb.c
+++ b/sys/netinet6/in6_pcb.c
@@ -795,7 +795,7 @@ in6_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp)
struct ip6_moptions *im6o;
int i, gap;
- INP_INFO_RLOCK(pcbinfo);
+ INP_INFO_WLOCK(pcbinfo);
LIST_FOREACH(in6p, pcbinfo->ipi_listhead, inp_list) {
INP_WLOCK(in6p);
im6o = in6p->in6p_moptions;
@@ -826,7 +826,7 @@ in6_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp)
}
INP_WUNLOCK(in6p);
}
- INP_INFO_RUNLOCK(pcbinfo);
+ INP_INFO_WUNLOCK(pcbinfo);
}
/*
-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 536 bytes
Desc: OpenPGP digital signature
URL: <http://lists.freebsd.org/pipermail/freebsd-net/attachments/20141003/02c7fcd4/attachment.sig>
More information about the freebsd-net
mailing list