svn commit: r286027 - in head/sys: netinet sys
Patrick Kelsey
pkelsey at FreeBSD.org
Wed Jul 29 17:59:16 UTC 2015
Author: pkelsey
Date: Wed Jul 29 17:59:13 2015
New Revision: 286027
URL: https://svnweb.freebsd.org/changeset/base/286027
Log:
Revert r265338, r271089 and r271123 as those changes do not handle
non-inline urgent data and introduce an mbuf exhaustion attack vector
similar to FreeBSD-SA-15:15.tcp, but not requiring VNETs.
Address the issue described in FreeBSD-SA-15:15.tcp.
Reviewed by: glebius
Approved by: so
Approved by: jmallett (mentor)
Security: FreeBSD-SA-15:15.tcp
Sponsored by: Norse Corp, Inc.
Modified:
head/sys/netinet/tcp_input.c
head/sys/netinet/tcp_reass.c
head/sys/netinet/tcp_subr.c
head/sys/netinet/tcp_usrreq.c
head/sys/netinet/tcp_var.h
head/sys/sys/mbuf.h
Modified: head/sys/netinet/tcp_input.c
==============================================================================
--- head/sys/netinet/tcp_input.c Wed Jul 29 17:50:14 2015 (r286026)
+++ head/sys/netinet/tcp_input.c Wed Jul 29 17:59:13 2015 (r286027)
@@ -1665,7 +1665,8 @@ tcp_do_segment(struct mbuf *m, struct tc
tp->snd_nxt == tp->snd_max &&
tiwin && tiwin == tp->snd_wnd &&
((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) &&
- tp->t_segq == NULL && ((to.to_flags & TOF_TS) == 0 ||
+ LIST_EMPTY(&tp->t_segq) &&
+ ((to.to_flags & TOF_TS) == 0 ||
TSTMP_GEQ(to.to_tsval, tp->ts_recent)) ) {
/*
@@ -2903,7 +2904,8 @@ dodata: /* XXX */
* immediately when segments are out of order (so
* fast retransmit can work).
*/
- if (th->th_seq == tp->rcv_nxt && tp->t_segq == NULL &&
+ if (th->th_seq == tp->rcv_nxt &&
+ LIST_EMPTY(&tp->t_segq) &&
TCPS_HAVEESTABLISHED(tp->t_state)) {
if (DELAY_ACK(tp, tlen))
tp->t_flags |= TF_DELACK;
Modified: head/sys/netinet/tcp_reass.c
==============================================================================
--- head/sys/netinet/tcp_reass.c Wed Jul 29 17:50:14 2015 (r286026)
+++ head/sys/netinet/tcp_reass.c Wed Jul 29 17:59:13 2015 (r286027)
@@ -71,33 +71,80 @@ __FBSDID("$FreeBSD$");
#include <netinet/tcp_var.h>
#include <netinet6/tcp6_var.h>
#include <netinet/tcpip.h>
+#ifdef TCPDEBUG
+#include <netinet/tcp_debug.h>
+#endif /* TCPDEBUG */
+
+static SYSCTL_NODE(_net_inet_tcp, OID_AUTO, reass, CTLFLAG_RW, 0,
+ "TCP Segment Reassembly Queue");
+
+static int tcp_reass_maxseg = 0;
+SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, maxsegments, CTLFLAG_RDTUN,
+ &tcp_reass_maxseg, 0,
+ "Global maximum number of TCP Segments in Reassembly Queue");
+
+static uma_zone_t tcp_reass_zone;
+SYSCTL_UMA_CUR(_net_inet_tcp_reass, OID_AUTO, cursegments, CTLFLAG_VNET,
+ &tcp_reass_zone,
+ "Global number of TCP Segments currently in Reassembly Queue");
+
+/* Initialize TCP reassembly queue */
+static void
+tcp_reass_zone_change(void *tag)
+{
+
+ /* Set the zone limit and read back the effective value. */
+ tcp_reass_maxseg = nmbclusters / 16;
+ tcp_reass_maxseg = uma_zone_set_max(tcp_reass_zone,
+ tcp_reass_maxseg);
+}
+
+void
+tcp_reass_global_init(void)
+{
+
+ tcp_reass_maxseg = nmbclusters / 16;
+ TUNABLE_INT_FETCH("net.inet.tcp.reass.maxsegments",
+ &tcp_reass_maxseg);
+ tcp_reass_zone = uma_zcreate("tcpreass", sizeof (struct tseg_qent),
+ NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
+ /* Set the zone limit and read back the effective value. */
+ tcp_reass_maxseg = uma_zone_set_max(tcp_reass_zone,
+ tcp_reass_maxseg);
+ EVENTHANDLER_REGISTER(nmbclusters_change,
+ tcp_reass_zone_change, NULL, EVENTHANDLER_PRI_ANY);
+}
void
tcp_reass_flush(struct tcpcb *tp)
{
- struct mbuf *m;
+ struct tseg_qent *qe;
INP_WLOCK_ASSERT(tp->t_inpcb);
- while ((m = tp->t_segq) != NULL) {
- tp->t_segq = m->m_nextpkt;
- tp->t_segqlen -= m->m_pkthdr.len;
- m_freem(m);
+ while ((qe = LIST_FIRST(&tp->t_segq)) != NULL) {
+ LIST_REMOVE(qe, tqe_q);
+ m_freem(qe->tqe_m);
+ uma_zfree(tcp_reass_zone, qe);
+ tp->t_segqlen--;
}
KASSERT((tp->t_segqlen == 0),
- ("TCP reass queue %p length is %d instead of 0 after flush.",
+ ("TCP reass queue %p segment count is %d instead of 0 after flush.",
tp, tp->t_segqlen));
}
-#define M_TCPHDR(m) ((struct tcphdr *)((m)->m_pkthdr.pkt_tcphdr))
-
int
tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m)
{
+ struct tseg_qent *q;
+ struct tseg_qent *p = NULL;
+ struct tseg_qent *nq;
+ struct tseg_qent *te = NULL;
struct socket *so = tp->t_inpcb->inp_socket;
- struct mbuf *mq, *mp;
- int flags, wakeup;
+ char *s = NULL;
+ int flags;
+ struct tseg_qent tqs;
INP_WLOCK_ASSERT(tp->t_inpcb);
@@ -113,10 +160,6 @@ tcp_reass(struct tcpcb *tp, struct tcphd
if (th == NULL)
goto present;
- M_ASSERTPKTHDR(m);
- KASSERT(*tlenp == m->m_pkthdr.len, ("%s: tlenp %u len %u", __func__,
- *tlenp, m->m_pkthdr.len));
-
/*
* Limit the number of segments that can be queued to reduce the
* potential for mbuf exhaustion. For best performance, we want to be
@@ -127,15 +170,17 @@ tcp_reass(struct tcpcb *tp, struct tcphd
* Always let the missing segment through which caused this queue.
* NB: Access to the socket buffer is left intentionally unlocked as we
* can tolerate stale information here.
+ *
+ * XXXLAS: Using sbspace(so->so_rcv) instead of so->so_rcv.sb_hiwat
+ * should work but causes packets to be dropped when they shouldn't.
+ * Investigate why and re-evaluate the below limit after the behaviour
+ * is understood.
*/
if ((th->th_seq != tp->rcv_nxt || !TCPS_HAVEESTABLISHED(tp->t_state)) &&
- tp->t_segqlen + m->m_pkthdr.len >= sbspace(&so->so_rcv)) {
- char *s;
-
+ tp->t_segqlen >= (so->so_rcv.sb_hiwat / tp->t_maxseg) + 1) {
TCPSTAT_INC(tcps_rcvreassfull);
*tlenp = 0;
- if ((s = tcp_log_addrs(&tp->t_inpcb->inp_inc, th, NULL,
- NULL))) {
+ if ((s = tcp_log_addrs(&tp->t_inpcb->inp_inc, th, NULL, NULL))) {
log(LOG_DEBUG, "%s; %s: queue limit reached, "
"segment dropped\n", s, __func__);
free(s, M_TCPLOG);
@@ -145,13 +190,46 @@ tcp_reass(struct tcpcb *tp, struct tcphd
}
/*
+ * Allocate a new queue entry. If we can't, or hit the zone limit
+ * just drop the pkt.
+ *
+ * Use a temporary structure on the stack for the missing segment
+ * when the zone is exhausted. Otherwise we may get stuck.
+ */
+ te = uma_zalloc(tcp_reass_zone, M_NOWAIT);
+ if (te == NULL) {
+ if (th->th_seq != tp->rcv_nxt || !TCPS_HAVEESTABLISHED(tp->t_state)) {
+ TCPSTAT_INC(tcps_rcvmemdrop);
+ m_freem(m);
+ *tlenp = 0;
+ if ((s = tcp_log_addrs(&tp->t_inpcb->inp_inc, th, NULL,
+ NULL))) {
+ log(LOG_DEBUG, "%s; %s: global zone limit "
+ "reached, segment dropped\n", s, __func__);
+ free(s, M_TCPLOG);
+ }
+ return (0);
+ } else {
+ bzero(&tqs, sizeof(struct tseg_qent));
+ te = &tqs;
+ if ((s = tcp_log_addrs(&tp->t_inpcb->inp_inc, th, NULL,
+ NULL))) {
+ log(LOG_DEBUG,
+ "%s; %s: global zone limit reached, using "
+ "stack for missing segment\n", s, __func__);
+ free(s, M_TCPLOG);
+ }
+ }
+ }
+ tp->t_segqlen++;
+
+ /*
* Find a segment which begins after this one does.
*/
- mp = NULL;
- for (mq = tp->t_segq; mq != NULL; mq = mq->m_nextpkt) {
- if (SEQ_GT(M_TCPHDR(mq)->th_seq, th->th_seq))
+ LIST_FOREACH(q, &tp->t_segq, tqe_q) {
+ if (SEQ_GT(q->tqe_th->th_seq, th->th_seq))
break;
- mp = mq;
+ p = q;
}
/*
@@ -159,16 +237,18 @@ tcp_reass(struct tcpcb *tp, struct tcphd
* our data already. If so, drop the data from the incoming
* segment. If it provides all of our data, drop us.
*/
- if (mp != NULL) {
+ if (p != NULL) {
int i;
-
/* conversion to int (in i) handles seq wraparound */
- i = M_TCPHDR(mp)->th_seq + mp->m_pkthdr.len - th->th_seq;
+ i = p->tqe_th->th_seq + p->tqe_len - th->th_seq;
if (i > 0) {
if (i >= *tlenp) {
TCPSTAT_INC(tcps_rcvduppack);
TCPSTAT_ADD(tcps_rcvdupbyte, *tlenp);
m_freem(m);
+ if (te != &tqs)
+ uma_zfree(tcp_reass_zone, te);
+ tp->t_segqlen--;
/*
* Try to present any queued data
* at the left window edge to the user.
@@ -190,54 +270,37 @@ tcp_reass(struct tcpcb *tp, struct tcphd
* While we overlap succeeding segments trim them or,
* if they are completely covered, dequeue them.
*/
- while (mq) {
- struct mbuf *nq;
- int i;
-
- i = (th->th_seq + *tlenp) - M_TCPHDR(mq)->th_seq;
+ while (q) {
+ int i = (th->th_seq + *tlenp) - q->tqe_th->th_seq;
if (i <= 0)
break;
- if (i < mq->m_pkthdr.len) {
- M_TCPHDR(mq)->th_seq += i;
- m_adj(mq, i);
- tp->t_segqlen -= i;
+ if (i < q->tqe_len) {
+ q->tqe_th->th_seq += i;
+ q->tqe_len -= i;
+ m_adj(q->tqe_m, i);
break;
}
- nq = mq->m_nextpkt;
- tp->t_segqlen -= mq->m_pkthdr.len;
- m_freem(mq);
- if (mp)
- mp->m_nextpkt = nq;
- else
- tp->t_segq = nq;
- mq = nq;
+ nq = LIST_NEXT(q, tqe_q);
+ LIST_REMOVE(q, tqe_q);
+ m_freem(q->tqe_m);
+ uma_zfree(tcp_reass_zone, q);
+ tp->t_segqlen--;
+ q = nq;
}
- /*
- * Insert the new segment queue entry into place. Try to collapse
- * mbuf chains if segments are adjacent.
- */
- if (mp) {
- if (M_TCPHDR(mp)->th_seq + mp->m_pkthdr.len == th->th_seq)
- m_catpkt(mp, m);
- else {
- m->m_nextpkt = mp->m_nextpkt;
- mp->m_nextpkt = m;
- m->m_pkthdr.pkt_tcphdr = th;
- }
+ /* Insert the new segment queue entry into place. */
+ te->tqe_m = m;
+ te->tqe_th = th;
+ te->tqe_len = *tlenp;
+
+ if (p == NULL) {
+ LIST_INSERT_HEAD(&tp->t_segq, te, tqe_q);
} else {
- mq = tp->t_segq;
- tp->t_segq = m;
- if (mq && th->th_seq + *tlenp == M_TCPHDR(mq)->th_seq) {
- m->m_nextpkt = mq->m_nextpkt;
- mq->m_nextpkt = NULL;
- m_catpkt(m, mq);
- } else
- m->m_nextpkt = mq;
- m->m_pkthdr.pkt_tcphdr = th;
+ KASSERT(te != &tqs, ("%s: temporary stack based entry not "
+ "first element in queue", __func__));
+ LIST_INSERT_AFTER(p, te, tqe_q);
}
- tp->t_segqlen += *tlenp;
present:
/*
@@ -246,30 +309,25 @@ present:
*/
if (!TCPS_HAVEESTABLISHED(tp->t_state))
return (0);
-
- flags = 0;
- wakeup = 0;
+ q = LIST_FIRST(&tp->t_segq);
+ if (!q || q->tqe_th->th_seq != tp->rcv_nxt)
+ return (0);
SOCKBUF_LOCK(&so->so_rcv);
- while ((mq = tp->t_segq) != NULL &&
- M_TCPHDR(mq)->th_seq == tp->rcv_nxt) {
- tp->t_segq = mq->m_nextpkt;
-
- tp->rcv_nxt += mq->m_pkthdr.len;
- tp->t_segqlen -= mq->m_pkthdr.len;
- flags = M_TCPHDR(mq)->th_flags & TH_FIN;
-
+ do {
+ tp->rcv_nxt += q->tqe_len;
+ flags = q->tqe_th->th_flags & TH_FIN;
+ nq = LIST_NEXT(q, tqe_q);
+ LIST_REMOVE(q, tqe_q);
if (so->so_rcv.sb_state & SBS_CANTRCVMORE)
- m_freem(mq);
- else {
- mq->m_nextpkt = NULL;
- sbappendstream_locked(&so->so_rcv, mq, 0);
- wakeup = 1;
- }
- }
+ m_freem(q->tqe_m);
+ else
+ sbappendstream_locked(&so->so_rcv, q->tqe_m, 0);
+ if (q != &tqs)
+ uma_zfree(tcp_reass_zone, q);
+ tp->t_segqlen--;
+ q = nq;
+ } while (q && q->tqe_th->th_seq == tp->rcv_nxt);
ND6_HINT(tp);
- if (wakeup)
- sorwakeup_locked(so);
- else
- SOCKBUF_UNLOCK(&so->so_rcv);
+ sorwakeup_locked(so);
return (flags);
}
Modified: head/sys/netinet/tcp_subr.c
==============================================================================
--- head/sys/netinet/tcp_subr.c Wed Jul 29 17:50:14 2015 (r286026)
+++ head/sys/netinet/tcp_subr.c Wed Jul 29 17:59:13 2015 (r286027)
@@ -385,6 +385,8 @@ tcp_init(void)
if (!IS_DEFAULT_VNET(curvnet))
return;
+ tcp_reass_global_init();
+
/* XXX virtualize those bellow? */
tcp_delacktime = TCPTV_DELACK;
tcp_keepinit = TCPTV_KEEP_INIT;
Modified: head/sys/netinet/tcp_usrreq.c
==============================================================================
--- head/sys/netinet/tcp_usrreq.c Wed Jul 29 17:50:14 2015 (r286026)
+++ head/sys/netinet/tcp_usrreq.c Wed Jul 29 17:59:13 2015 (r286027)
@@ -1977,7 +1977,7 @@ db_print_tcpcb(struct tcpcb *tp, const c
db_print_indent(indent);
db_printf("t_segq first: %p t_segqlen: %d t_dupacks: %d\n",
- tp->t_segq, tp->t_segqlen, tp->t_dupacks);
+ LIST_FIRST(&tp->t_segq), tp->t_segqlen, tp->t_dupacks);
db_print_indent(indent);
db_printf("tt_rexmt: %p tt_persist: %p tt_keep: %p\n",
Modified: head/sys/netinet/tcp_var.h
==============================================================================
--- head/sys/netinet/tcp_var.h Wed Jul 29 17:50:14 2015 (r286026)
+++ head/sys/netinet/tcp_var.h Wed Jul 29 17:59:13 2015 (r286027)
@@ -46,6 +46,15 @@ VNET_DECLARE(int, tcp_do_rfc1323);
#endif /* _KERNEL */
+/* TCP segment queue entry */
+struct tseg_qent {
+ LIST_ENTRY(tseg_qent) tqe_q;
+ int tqe_len; /* TCP segment data length */
+ struct tcphdr *tqe_th; /* a pointer to tcp header */
+ struct mbuf *tqe_m; /* mbuf contains packet */
+};
+LIST_HEAD(tsegqe_head, tseg_qent);
+
struct sackblk {
tcp_seq start; /* start seq no. of sack block */
tcp_seq end; /* end seq no. */
@@ -91,7 +100,7 @@ do { \
* Organized for 16 byte cacheline efficiency.
*/
struct tcpcb {
- struct mbuf *t_segq; /* segment reassembly queue */
+ struct tsegqe_head t_segq; /* segment reassembly queue */
void *t_pspare[2]; /* new reassembly queue */
int t_segqlen; /* segment reassembly queue length */
int t_dupacks; /* consecutive dup acks recd */
@@ -667,6 +676,7 @@ char *tcp_log_addrs(struct in_conninfo *
char *tcp_log_vain(struct in_conninfo *, struct tcphdr *, void *,
const void *);
int tcp_reass(struct tcpcb *, struct tcphdr *, int *, struct mbuf *);
+void tcp_reass_global_init(void);
void tcp_reass_flush(struct tcpcb *);
int tcp_input(struct mbuf **, int *, int);
u_long tcp_maxmtu(struct in_conninfo *, struct tcp_ifcap *);
Modified: head/sys/sys/mbuf.h
==============================================================================
--- head/sys/sys/mbuf.h Wed Jul 29 17:50:14 2015 (r286026)
+++ head/sys/sys/mbuf.h Wed Jul 29 17:59:13 2015 (r286027)
@@ -150,7 +150,6 @@ struct pkthdr {
#define tso_segsz PH_per.sixteen[1]
#define csum_phsum PH_per.sixteen[2]
#define csum_data PH_per.thirtytwo[1]
-#define pkt_tcphdr PH_loc.ptr
/*
* Description of external storage mapped into mbuf; valid only if M_EXT is
More information about the svn-src-all mailing list