svn commit: r184025 - in projects/tcp_cc_8.x/sys: conf netinet
Lawrence Stewart
lstewart at FreeBSD.org
Sat Oct 18 07:20:46 UTC 2008
Author: lstewart
Date: Sat Oct 18 07:20:45 2008
New Revision: 184025
URL: http://svn.freebsd.org/changeset/base/184025
Log:
Initial import of the TCP modular congestion control framework from my private
repository. See http://caia.swin.edu.au/urp/newtcp/ for more details.
Patch is currently in good shape and defaults to running with the regular New
Reno congestion control algorithm.
Todo:
- KPI man page
- Integrate properly with ECN
- Integrate my currently private congestion control algorithm modules
- Test that vimage changes have not functionally changed anything
Added:
projects/tcp_cc_8.x/sys/netinet/cc.c (contents, props changed)
projects/tcp_cc_8.x/sys/netinet/cc.h (contents, props changed)
Modified:
projects/tcp_cc_8.x/sys/conf/files
projects/tcp_cc_8.x/sys/netinet/tcp_input.c
projects/tcp_cc_8.x/sys/netinet/tcp_output.c
projects/tcp_cc_8.x/sys/netinet/tcp_subr.c
projects/tcp_cc_8.x/sys/netinet/tcp_timer.c
projects/tcp_cc_8.x/sys/netinet/tcp_usrreq.c
projects/tcp_cc_8.x/sys/netinet/tcp_var.h
Modified: projects/tcp_cc_8.x/sys/conf/files
==============================================================================
--- projects/tcp_cc_8.x/sys/conf/files Sat Oct 18 06:56:07 2008 (r184024)
+++ projects/tcp_cc_8.x/sys/conf/files Sat Oct 18 07:20:45 2008 (r184025)
@@ -1960,6 +1960,7 @@ netinet/ip_mroute.c optional mrouting i
netinet/ip_options.c optional inet
netinet/ip_output.c optional inet
netinet/raw_ip.c optional inet
+netinet/cc.c optional inet
netinet/sctp_asconf.c optional inet sctp
netinet/sctp_auth.c optional inet sctp
netinet/sctp_bsd_addr.c optional inet sctp
Added: projects/tcp_cc_8.x/sys/netinet/cc.c
==============================================================================
--- /dev/null 00:00:00 1970 (empty, because file is newly added)
+++ projects/tcp_cc_8.x/sys/netinet/cc.c Sat Oct 18 07:20:45 2008 (r184025)
@@ -0,0 +1,451 @@
+/*-
+ * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
+ * The Regents of the University of California.
+ * Copyright (c) 2008 Swinburne University of Technology, Melbourne, Australia
+ * All rights reserved.
+ *
+ * The majority of this software was developed at the Centre for
+ * Advanced Internet Architectures, Swinburne University, by Lawrence Stewart
+ * and James Healy, made possible in part by a grant from the Cisco University
+ * Research Program Fund at Community Foundation Silicon Valley.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/libkern.h>
+#include <sys/lock.h>
+#include <sys/rwlock.h>
+#include <sys/mutex.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/sysctl.h>
+#include <sys/malloc.h>
+#include <sys/kernel.h>
+#include <sys/sbuf.h>
+
+#include <netinet/in.h>
+#include <netinet/in_pcb.h>
+#include <netinet/tcp.h>
+#include <netinet/tcp_seq.h>
+#include <netinet/tcp_fsm.h>
+#include <netinet/cc.h>
+
+
+/* list of available cc algorithms on the current system */
+struct cc_head cc_list = STAILQ_HEAD_INITIALIZER(cc_list);
+
+/* rwlock protecting cc_list; manipulated via the CC_LIST_* macros in cc.h */
+struct rwlock cc_list_lock;
+
+/*
+ * NOTE(review): M_STRING is not referenced anywhere in this file's
+ * visible code -- confirm this malloc type is actually needed.
+ */
+MALLOC_DECLARE(M_STRING);
+MALLOC_DEFINE(M_STRING, "string", "a string");
+
+/* function pointer table for the built-in NewReno algorithm (deinit unused) */
+struct cc_algo newreno_cc_algo = {
+ .name = "newreno",
+ .init = newreno_init,
+ .deinit = NULL,
+ .cwnd_init = newreno_cwnd_init,
+ .ack_received = newreno_ack_received,
+ .pre_fr = newreno_pre_fr,
+ .post_fr = newreno_post_fr,
+ .after_idle = newreno_after_idle,
+ .after_timeout = newreno_after_timeout
+};
+
+/* name of the system-wide default cc algorithm (kept at the head of cc_list) */
+char cc_algorithm[TCP_CA_NAME_MAX];
+
+/*
+ * Sysctl handler for net.inet.tcp.cc.algorithm.  Reports the current
+ * system default congestion control algorithm and allows it to be
+ * changed to any algorithm currently registered in cc_list.  The chosen
+ * algorithm is moved to the head of cc_list, since tcp_newtcpcb()
+ * assigns STAILQ_FIRST(&cc_list) to new connections.
+ *
+ * Fixes over the previous version:
+ * - sysctl_handle_string() now performs the transfer, so we never
+ *   dereference req->newptr (a userland pointer) directly in the kernel;
+ * - an unknown name returns ESRCH (the old code returned the bare
+ *   constant 1, i.e. EPERM, which is misleading);
+ * - the write lock is held across the whole lookup + reorder, closing
+ *   the window the old runlock-then-wlock sequence left in which the
+ *   matched entry could be deregistered.
+ */
+static int
+cc_default_algorithm(SYSCTL_HANDLER_ARGS)
+{
+ struct cc_algo *funcs;
+ char saved_algorithm[TCP_CA_NAME_MAX];
+ int error;
+
+ /*
+ * Snapshot the current default so we can roll back if the
+ * requested name does not match any registered algorithm.
+ */
+ strlcpy(saved_algorithm, cc_algorithm, sizeof(saved_algorithm));
+
+ /* Copyin/copyout is done safely by the string handler. */
+ error = sysctl_handle_string(oidp, arg1, arg2, req);
+ if (error || req->newptr == NULL)
+ return error;
+
+ CC_LIST_WLOCK();
+ STAILQ_FOREACH(funcs, &cc_list, entries) {
+ if (strncmp(cc_algorithm, funcs->name, TCP_CA_NAME_MAX) == 0) {
+ /*
+ * Make the selected system default cc algorithm
+ * the first element in the list if it isn't already.
+ */
+ if (funcs != STAILQ_FIRST(&cc_list)) {
+ STAILQ_REMOVE(&cc_list, funcs, cc_algo, entries);
+ STAILQ_INSERT_HEAD(&cc_list, funcs, entries);
+ }
+ CC_LIST_WUNLOCK();
+ return 0;
+ }
+ }
+ CC_LIST_WUNLOCK();
+
+ /* Unknown algorithm: restore the previous default and report it. */
+ strlcpy(cc_algorithm, saved_algorithm, sizeof(cc_algorithm));
+ return ESRCH;
+}
+
+/*
+ * Sysctl handler for net.inet.tcp.cc.available.  Returns a read-only,
+ * comma-separated list of the congestion control algorithms currently
+ * registered in cc_list.
+ */
+static int
+cc_list_available(SYSCTL_HANDLER_ARGS)
+{
+ struct cc_algo *algo;
+ struct sbuf *s;
+ int error = 0, first = 1;
+
+ s = sbuf_new(NULL, NULL, TCP_CA_NAME_MAX, SBUF_AUTOEXTEND);
+ if (s == NULL)
+ return ENOMEM; /* sysctl handlers return errno values, not -1 */
+
+ /* build "name1, name2, ..." while holding the list read lock */
+ CC_LIST_RLOCK();
+ STAILQ_FOREACH(algo, &cc_list, entries) {
+ error = sbuf_printf(s, first ? "%s" : ", %s", algo->name);
+ if (error != 0)
+ break;
+ first = 0;
+ }
+ CC_LIST_RUNLOCK();
+
+ if (error == 0) {
+ sbuf_finish(s);
+ error = sysctl_handle_string(oidp, sbuf_data(s), 1, req);
+ }
+
+ sbuf_delete(s);
+ return error;
+}
+
+/*
+ * Initialise the congestion control framework at system boot: set up
+ * the list lock and the algorithm list, register the built-in NewReno
+ * algorithm and make it the system default.  Called from tcp_init().
+ */
+void
+cc_init(void)
+{
+ /* initialise the lock that will protect read/write access to our linked list */
+ CC_LIST_LOCK_INIT();
+
+ /* initialise the list of cc algorithms */
+ STAILQ_INIT(&cc_list);
+
+ /* add newreno to the list of available algorithms */
+ cc_register_algorithm(&newreno_cc_algo);
+
+ /*
+ * Set newreno as the system default.  strlcpy (from libkern,
+ * already included) guarantees NUL termination, which strncpy
+ * does not when the source fills the buffer.
+ */
+ strlcpy(cc_algorithm, newreno_cc_algo.name, sizeof(cc_algorithm));
+}
+
+/*
+ * Deregister a congestion control algorithm: remove it from cc_list and
+ * switch every connection currently using it back to NewReno.
+ *
+ * Returns 1 on success, 0 on failure (the algorithm was not found in
+ * cc_list, or the read lock could not be upgraded to a write lock).
+ */
+int
+cc_deregister_algorithm(struct cc_algo *remove_cc)
+{
+ struct cc_algo *funcs, *tmpfuncs;
+ register struct tcpcb *tp = NULL;
+ register struct inpcb *inp = NULL;
+ int success = 0;
+
+ /* remove the algorithm from the list available to the system */
+ CC_LIST_RLOCK();
+ STAILQ_FOREACH_SAFE(funcs, &cc_list, entries, tmpfuncs) {
+ if (funcs == remove_cc) {
+ /*
+ * Upgrade to a write lock in place; rw_try_upgrade
+ * fails rather than blocks if other readers hold the
+ * lock.  NOTE(review): on a failed upgrade the whole
+ * deregistration silently fails (success stays 0) --
+ * callers must be prepared to retry.
+ */
+ if (CC_LIST_TRY_WLOCK()) {
+ /* if this algorithm is the system default, reset the default to newreno */
+ if (strncmp(cc_algorithm, remove_cc->name, TCP_CA_NAME_MAX) == 0)
+ snprintf(cc_algorithm, TCP_CA_NAME_MAX, "%s", newreno_cc_algo.name);
+
+ STAILQ_REMOVE(&cc_list, funcs, cc_algo, entries);
+ success = 1;
+ /* downgrade so the exit path's RUNLOCK is balanced */
+ CC_LIST_W2RLOCK();
+ }
+ break;
+ }
+ }
+ CC_LIST_RUNLOCK();
+
+ if (success) {
+ /*
+ * check all active control blocks and change any that are using this
+ * algorithm back to newreno. If the algorithm that was in use requires
+ * deinit code to be run, call it.
+ * NOTE(review): other files in this patch use the virtualised
+ * V_tcbinfo/V_tcb names -- confirm the bare tcbinfo/tcb below
+ * resolve under VIMAGE builds.
+ */
+ INP_INFO_RLOCK(&tcbinfo);
+ LIST_FOREACH(inp, &tcb, inp_list) {
+ /* skip tcptw structs */
+ if (inp->inp_vflag & INP_TIMEWAIT)
+ continue;
+ INP_WLOCK(inp);
+ if ((tp = intotcpcb(inp)) != NULL) {
+ if (strncmp(CC_ALGO(tp)->name, remove_cc->name, TCP_CA_NAME_MAX) == 0 ) {
+ tmpfuncs = CC_ALGO(tp);
+ CC_ALGO(tp) = &newreno_cc_algo;
+ /*
+ * XXX: We should stall here until
+ * we're sure the tcb has stopped
+ * using the deregistered algo's functions...
+ * Not sure how to do that yet!
+ */
+ if(CC_ALGO(tp)->init)
+ CC_ALGO(tp)->init(tp);
+ if (tmpfuncs->deinit)
+ tmpfuncs->deinit(tp);
+ }
+ }
+ INP_WUNLOCK(inp);
+ }
+ INP_INFO_RUNLOCK(&tcbinfo);
+ }
+
+ return success;
+}
+
+/*
+ * Register a congestion control algorithm: append it to the global list
+ * of algorithms available for use by connections.  Always returns 1.
+ * NOTE(review): no duplicate-name check is performed; registering two
+ * algorithms with the same name would confuse the sysctl name lookup.
+ */
+int
+cc_register_algorithm(struct cc_algo *add_cc)
+{
+ CC_LIST_WLOCK();
+ STAILQ_INSERT_TAIL(&cc_list, add_cc, entries);
+ CC_LIST_WUNLOCK();
+ return 1;
+}
+
+/*
+ * NEW RENO
+ */
+
+/*
+ * Per-connection initialisation hook for NewReno.  NewReno keeps no
+ * per-connection state, so this always returns 0 (success; a return
+ * value > 0 makes tcp_newtcpcb() abort building the control block).
+ * NOTE(review): the printf fires for every new connection -- debug
+ * output that should be removed or put behind a debug knob.
+ */
+int
+newreno_init(struct tcpcb *tp)
+{
+ printf("initialising tcp connection with newreno congestion control\n");
+ return 0;
+}
+
+/*
+ * Update ssthresh to approximately half the current flight size:
+ * min(snd_wnd, snd_cwnd) / 2, rounded down to a whole number of
+ * segments and floored at 2 segments so slow start can still grow.
+ */
+void
+newreno_ssthresh_update(struct tcpcb *tp)
+{
+ u_int win;
+
+ /* reset ssthresh */
+ win = min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_maxseg;
+
+ if (win < 2)
+ win = 2;
+
+ tp->snd_ssthresh = win * tp->t_maxseg;
+}
+
+/*
+ * Set the initial cwnd at the start of a connection:
+ * - if there is a hostcache entry for the foreign host, base cwnd on it;
+ * - else if rfc3390 is enabled, set cwnd to approx 4 MSS as recommended;
+ * - otherwise use the sysctl variables configured by the administrator.
+ */
+void
+newreno_cwnd_init(struct tcpcb *tp)
+{
+ struct hc_metrics_lite metrics;
+ struct inpcb *inp = tp->t_inpcb;
+ struct socket *so = inp->inp_socket;
+
+ /*
+ * Set the slow-start flight size depending on whether this
+ * is a local network or not.
+ *
+ * Extend this so we cache the cwnd too and retrieve it here.
+ * Make cwnd even bigger than RFC3390 suggests but only if we
+ * have previous experience with the remote host. Be careful
+ * not to make cwnd bigger than the remote receive window or our
+ * own send socket buffer. Maybe put some additional upper bound
+ * on the retrieved cwnd. Should do incremental updates to
+ * hostcache when cwnd collapses so the next connection doesn't
+ * overload the path again.
+ *
+ * RFC3390 says only do this if SYN or SYN/ACK didn't get lost.
+ * We currently check only in syncache_socket for that.
+ */
+
+ tcp_hc_get(&inp->inp_inc, &metrics);
+
+ /*
+ * NOTE(review): isipv6 is not declared anywhere in this file, so the
+ * INET6 branch below cannot compile as-is -- verify against an INET6
+ * kernel build.  tcp_do_rfc3390/ss_fltsz/ss_fltsz_local are also the
+ * non-virtualised names (the code this replaces in tcp_mss() used
+ * V_tcp_do_rfc3390 etc.) -- verify under VIMAGE builds.
+ */
+#define TCP_METRICS_CWND
+#ifdef TCP_METRICS_CWND
+ if (metrics.rmx_cwnd)
+ tp->snd_cwnd = max(tp->t_maxseg,
+ min(metrics.rmx_cwnd / 2,
+ min(tp->snd_wnd, so->so_snd.sb_hiwat)));
+ else
+#endif
+ if (tcp_do_rfc3390)
+ tp->snd_cwnd = min(4 * tp->t_maxseg, max(2 * tp->t_maxseg, 4380));
+#ifdef INET6
+ else if ((isipv6 && in6_localaddr(&inp->in6p_faddr)) ||
+ (!isipv6 && in_localaddr(inp->inp_faddr)))
+#else
+ else if (in_localaddr(inp->inp_faddr))
+#endif
+ tp->snd_cwnd = tp->t_maxseg * ss_fltsz_local;
+ else
+ tp->snd_cwnd = tp->t_maxseg * ss_fltsz;
+}
+
+/*
+ * Increase cwnd on receipt of a valid ACK:
+ * - if cwnd <= ssthresh (slow start), open by 1 MSS per ACK;
+ * - if cwnd > ssthresh (congestion avoidance), open by about
+ *   maxseg^2/cwnd per ACK, i.e. roughly 1 MSS per RTT.
+ */
+void
+newreno_ack_received(struct tcpcb *tp, struct tcphdr *th)
+{
+ u_int cw = tp->snd_cwnd;
+ u_int incr = tp->t_maxseg;
+
+ if (cw > tp->snd_ssthresh)
+ /*
+ * Fix the increment at a minimum of 1 byte so that cwnd
+ * cannot stall once cw > maxseg^2, where the integer
+ * division would otherwise yield 0.  This matches the
+ * RFC2581-suggested clamp in the tcp_do_segment() code
+ * this function replaces.
+ */
+ incr = max(incr * incr / cw, 1);
+
+ tp->snd_cwnd = min(cw+incr, TCP_MAXWIN<<tp->snd_scale);
+}
+
+/*
+ * Hook run before entering fast recovery: record the new ssthresh
+ * (approximately half the current flight size).  th is unused by
+ * NewReno but is part of the cc_algo pre_fr hook signature.
+ */
+void
+newreno_pre_fr(struct tcpcb *tp, struct tcphdr *th)
+{
+ newreno_ssthresh_update(tp);
+}
+
+/*
+ * Set cwnd on exit from fast recovery.  th may be NULL, in which case
+ * cwnd is simply deflated to ssthresh (plain Reno behaviour) instead of
+ * NewReno's burst-avoiding calculation based on the ACKed sequence.
+ */
+void
+newreno_post_fr(struct tcpcb *tp, struct tcphdr *th)
+{
+ /*
+ * Out of fast recovery.
+ * Window inflation should have left us
+ * with approximately snd_ssthresh
+ * outstanding data.
+ * But in case we would be inclined to
+ * send a burst, better to do it via
+ * the slow start mechanism.
+ */
+ if (th && SEQ_GT(th->th_ack + tp->snd_ssthresh, tp->snd_max))
+ tp->snd_cwnd = tp->snd_max - th->th_ack + tp->t_maxseg;
+ else
+ tp->snd_cwnd = tp->snd_ssthresh;
+}
+
+/*
+ * If a connection has been idle for a while and more data is ready to
+ * be sent, reset cwnd so transmission restarts in slow start.
+ */
+void
+newreno_after_idle(struct tcpcb *tp)
+{
+ /*
+ * We have been idle for "a while" and no acks are
+ * expected to clock out any data we send --
+ * slow start to get ack "clock" running again.
+ *
+ * Set the slow-start flight size depending on whether
+ * this is a local network or not.
+ */
+ int ss = ss_fltsz;
+
+ /*
+ * NOTE(review): isipv6 is not declared in this file, so the INET6
+ * branch below cannot compile as-is.  ss_fltsz/ss_fltsz_local are
+ * the non-virtualised names (tcp_output.c uses V_ss_fltsz) --
+ * verify under VIMAGE builds.
+ */
+#ifdef INET6
+ if (isipv6) {
+ if (in6_localaddr(&tp->t_inpcb->in6p_faddr))
+ ss = ss_fltsz_local;
+ } else
+#endif /* INET6 */
+
+ if (in_localaddr(tp->t_inpcb->inp_faddr))
+ ss = ss_fltsz_local;
+
+ tp->snd_cwnd = tp->t_maxseg * ss;
+}
+
+/*
+ * Reset cwnd (and, via newreno_ssthresh_update, ssthresh) after a
+ * transmission timeout.
+ */
+void
+newreno_after_timeout(struct tcpcb *tp)
+{
+ newreno_ssthresh_update(tp);
+
+ /*
+ * Close the congestion window down to one segment
+ * (we'll open it by one segment for each ack we get).
+ * Since we probably have a window's worth of unacked
+ * data accumulated, this "slow start" keeps us from
+ * dumping all that data as back-to-back packets (which
+ * might overwhelm an intermediate gateway).
+ *
+ * There are two phases to the opening: Initially we
+ * open by one mss on each ack. This makes the window
+ * size increase exponentially with time. If the
+ * window is larger than the path can handle, this
+ * exponential growth results in dropped packet(s)
+ * almost immediately. To get more time between
+ * drops but still "push" the network to take advantage
+ * of improving conditions, we switch from exponential
+ * to linear window opening at some threshold size.
+ * For a threshold, we use half the current window
+ * size, truncated to a multiple of the mss.
+ *
+ * (the minimum cwnd that will give us exponential
+ * growth is 2 mss. We don't allow the threshold
+ * to go below this.)
+ */
+ tp->snd_cwnd = tp->t_maxseg;
+}
+
+/* net.inet.tcp.cc sysctl tree (declared to other files via SYSCTL_DECL in cc.h) */
+SYSCTL_NODE(_net_inet_tcp, OID_AUTO, cc, CTLFLAG_RW, NULL,
+ "congestion control related settings");
+
+/* read/write: name of the default algorithm assigned to new connections */
+SYSCTL_PROC(_net_inet_tcp_cc, OID_AUTO, algorithm, CTLTYPE_STRING|CTLFLAG_RW,
+ &cc_algorithm, sizeof(cc_algorithm), cc_default_algorithm, "A",
+ "default congestion control algorithm");
+
+/* read-only: comma-separated list of registered algorithms */
+SYSCTL_PROC(_net_inet_tcp_cc, OID_AUTO, available, CTLTYPE_STRING|CTLFLAG_RD,
+ NULL, 0, cc_list_available, "A",
+ "list available congestion control algorithms");
Added: projects/tcp_cc_8.x/sys/netinet/cc.h
==============================================================================
--- /dev/null 00:00:00 1970 (empty, because file is newly added)
+++ projects/tcp_cc_8.x/sys/netinet/cc.h Sat Oct 18 07:20:45 2008 (r184025)
@@ -0,0 +1,119 @@
+/*-
+ * Copyright (c) 2008 Swinburne University of Technology, Melbourne, Australia
+ * All rights reserved.
+ *
+ * This software was developed at the Centre for Advanced Internet
+ * Architectures, Swinburne University, by Lawrence Stewart and James Healy,
+ * made possible in part by a grant from the Cisco University Research Program
+ * Fund at Community Foundation Silicon Valley.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _NETINET_CC_H_
+#define _NETINET_CC_H_
+
+#include <sys/queue.h>
+#include <netinet/tcp_var.h>
+
+/*
+ * Global CC vars
+ */
+extern STAILQ_HEAD(cc_head, cc_algo) cc_list;
+extern char cc_algorithm[];
+extern const int tcprexmtthresh;
+extern struct cc_algo newreno_cc_algo;
+
+/*
+ * Define the new net.inet.tcp.cc sysctl tree
+ */
+SYSCTL_DECL(_net_inet_tcp_cc);
+
+/*
+ * CC housekeeping functions
+ */
+void cc_init(void);
+int cc_register_algorithm(struct cc_algo *add_cc);
+int cc_deregister_algorithm(struct cc_algo *remove_cc);
+
+/*
+ * NewReno CC functions
+ */
+int newreno_init(struct tcpcb *tp);
+void newreno_cwnd_init(struct tcpcb *tp);
+void newreno_ack_received(struct tcpcb *tp, struct tcphdr *th);
+void newreno_pre_fr(struct tcpcb *tp, struct tcphdr *th);
+void newreno_post_fr(struct tcpcb *tp, struct tcphdr *th);
+void newreno_after_idle(struct tcpcb *tp);
+void newreno_after_timeout(struct tcpcb *tp);
+void newreno_ssthresh_update(struct tcpcb *tp);
+
+/*
+ * Structure to hold function pointers to the functions responsible
+ * for congestion control. Based on similar structure in the SCTP stack.
+ * All hooks other than init are tested for NULL before being called;
+ * init is invoked unconditionally from tcp_newtcpcb() and must be set.
+ */
+struct cc_algo {
+ char name[TCP_CA_NAME_MAX];
+
+ /* init the congestion algorithm for the specified control block */
+ int (*init) (struct tcpcb *tp);
+
+ /* deinit the congestion algorithm for the specified control block */
+ void (*deinit) (struct tcpcb *tp);
+
+ /* initialise cwnd at the start of a connection */
+ void (*cwnd_init) (struct tcpcb *tp);
+
+ /* called on the receipt of a valid ack */
+ void (*ack_received) (struct tcpcb *tp, struct tcphdr *th);
+
+ /* called before entering FR */
+ void (*pre_fr) (struct tcpcb *tp, struct tcphdr *th);
+
+ /* after exiting FR */
+ void (*post_fr) (struct tcpcb *tp, struct tcphdr *th);
+
+ /* perform tasks when data transfer resumes after an idle period */
+ void (*after_idle) (struct tcpcb *tp);
+
+ /* perform tasks when the connection's retransmit timer expires */
+ void (*after_timeout) (struct tcpcb *tp);
+
+ /* linkage into the global cc_list */
+ STAILQ_ENTRY(cc_algo) entries;
+};
+
+/* convenience accessors for the cc hooks/state hung off a tcpcb */
+#define CC_ALGO(tp) ((tp)->cc_algo)
+#define CC_DATA(tp) ((tp)->cc_data)
+
+/* lock protecting cc_list; read-mostly, hence an rwlock */
+extern struct rwlock cc_list_lock;
+#define CC_LIST_LOCK_INIT() rw_init(&cc_list_lock, "cc_list")
+#define CC_LIST_LOCK_DESTROY() rw_destroy(&cc_list_lock)
+#define CC_LIST_RLOCK() rw_rlock(&cc_list_lock)
+#define CC_LIST_RUNLOCK() rw_runlock(&cc_list_lock)
+#define CC_LIST_WLOCK() rw_wlock(&cc_list_lock)
+#define CC_LIST_WUNLOCK() rw_wunlock(&cc_list_lock)
+/* in-place upgrade/downgrade; TRY_WLOCK returns 0 if the upgrade fails */
+#define CC_LIST_TRY_WLOCK() rw_try_upgrade(&cc_list_lock)
+#define CC_LIST_W2RLOCK() rw_downgrade(&cc_list_lock)
+
+#endif /* _NETINET_CC_H_ */
Modified: projects/tcp_cc_8.x/sys/netinet/tcp_input.c
==============================================================================
--- projects/tcp_cc_8.x/sys/netinet/tcp_input.c Sat Oct 18 06:56:07 2008 (r184024)
+++ projects/tcp_cc_8.x/sys/netinet/tcp_input.c Sat Oct 18 07:20:45 2008 (r184025)
@@ -84,6 +84,7 @@ __FBSDID("$FreeBSD$");
#include <netinet6/tcp6_var.h>
#include <netinet/tcpip.h>
#include <netinet/tcp_syncache.h>
+#include <netinet/cc.h>
#ifdef TCPDEBUG
#include <netinet/tcp_debug.h>
#endif /* TCPDEBUG */
@@ -97,7 +98,7 @@ __FBSDID("$FreeBSD$");
#include <security/mac/mac_framework.h>
-static const int tcprexmtthresh = 3;
+const int tcprexmtthresh = 3;
struct tcpstat tcpstat;
SYSCTL_V_STRUCT(V_NET, vnet_inet, _net_inet_tcp, TCPCTL_STATS, stats,
@@ -125,7 +126,7 @@ static int tcp_do_rfc3042 = 1;
SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, rfc3042, CTLFLAG_RW,
tcp_do_rfc3042, 0, "Enable RFC 3042 (Limited Transmit)");
-static int tcp_do_rfc3390 = 1;
+int tcp_do_rfc3390 = 1;
SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, rfc3390, CTLFLAG_RW,
tcp_do_rfc3390, 0,
"Enable RFC 3390 (Increasing TCP's Initial Congestion Window)");
@@ -1096,14 +1097,9 @@ tcp_do_segment(struct mbuf *m, struct tc
if (SEQ_GT(th->th_ack, tp->snd_una) &&
SEQ_LEQ(th->th_ack, tp->snd_max) &&
tp->snd_cwnd >= tp->snd_wnd &&
- ((!V_tcp_do_newreno &&
- !(tp->t_flags & TF_SACK_PERMIT) &&
- tp->t_dupacks < tcprexmtthresh) ||
- ((V_tcp_do_newreno ||
- (tp->t_flags & TF_SACK_PERMIT)) &&
- !IN_FASTRECOVERY(tp) &&
- (to.to_flags & TOF_SACK) == 0 &&
- TAILQ_EMPTY(&tp->snd_holes)))) {
+ !IN_FASTRECOVERY(tp) &&
+ (to.to_flags & TOF_SACK) == 0 &&
+ TAILQ_EMPTY(&tp->snd_holes)) {
KASSERT(headlocked,
("%s: headlocked", __func__));
INP_INFO_WUNLOCK(&V_tcbinfo);
@@ -1870,9 +1866,7 @@ tcp_do_segment(struct mbuf *m, struct tc
th->th_ack != tp->snd_una)
tp->t_dupacks = 0;
else if (++tp->t_dupacks > tcprexmtthresh ||
- ((V_tcp_do_newreno ||
- (tp->t_flags & TF_SACK_PERMIT)) &&
- IN_FASTRECOVERY(tp))) {
+ IN_FASTRECOVERY(tp)) {
if ((tp->t_flags & TF_SACK_PERMIT) &&
IN_FASTRECOVERY(tp)) {
int awnd;
@@ -1909,14 +1903,24 @@ tcp_do_segment(struct mbuf *m, struct tc
tp->t_dupacks = 0;
break;
}
- } else if (V_tcp_do_newreno ||
- V_tcp_do_ecn) {
+ } else {
if (SEQ_LEQ(th->th_ack,
tp->snd_recover)) {
tp->t_dupacks = 0;
break;
}
}
+
+ /*
+ * If the current tcp cc module has
+ * defined a hook for tasks to run
+ * before entering FR, call it
+ */
+ if (CC_ALGO(tp)->pre_fr)
+ CC_ALGO(tp)->pre_fr(tp, th);
+
+ ENTER_FASTRECOVERY(tp);
+ tp->snd_recover = tp->snd_max;
tcp_congestion_exp(tp);
tcp_timer_activate(tp, TT_REXMT, 0);
tp->t_rtttime = 0;
@@ -1981,37 +1985,16 @@ tcp_do_segment(struct mbuf *m, struct tc
* If the congestion window was inflated to account
* for the other side's cached packets, retract it.
*/
- if (V_tcp_do_newreno || (tp->t_flags & TF_SACK_PERMIT)) {
- if (IN_FASTRECOVERY(tp)) {
- if (SEQ_LT(th->th_ack, tp->snd_recover)) {
- if (tp->t_flags & TF_SACK_PERMIT)
- tcp_sack_partialack(tp, th);
- else
- tcp_newreno_partial_ack(tp, th);
- } else {
- /*
- * Out of fast recovery.
- * Window inflation should have left us
- * with approximately snd_ssthresh
- * outstanding data.
- * But in case we would be inclined to
- * send a burst, better to do it via
- * the slow start mechanism.
- */
- if (SEQ_GT(th->th_ack +
- tp->snd_ssthresh,
- tp->snd_max))
- tp->snd_cwnd = tp->snd_max -
- th->th_ack +
- tp->t_maxseg;
- else
- tp->snd_cwnd = tp->snd_ssthresh;
- }
+ if (IN_FASTRECOVERY(tp)) {
+ if (SEQ_LT(th->th_ack, tp->snd_recover)) {
+ if (tp->t_flags & TF_SACK_PERMIT)
+ tcp_sack_partialack(tp, th);
+ else
+ tcp_newreno_partial_ack(tp, th);
+ } else {
+ if (CC_ALGO(tp)->post_fr)
+ CC_ALGO(tp)->post_fr(tp, th);
}
- } else {
- if (tp->t_dupacks >= tcprexmtthresh &&
- tp->snd_cwnd > tp->snd_ssthresh)
- tp->snd_cwnd = tp->snd_ssthresh;
}
tp->t_dupacks = 0;
/*
@@ -2117,13 +2100,9 @@ process_ACK:
* If cwnd > maxseg^2, fix the cwnd increment at 1 byte
* to avoid capping cwnd (as suggested in RFC2581).
*/
- if ((!V_tcp_do_newreno && !(tp->t_flags & TF_SACK_PERMIT)) ||
- !IN_FASTRECOVERY(tp)) {
- u_int cw = tp->snd_cwnd;
- u_int incr = tp->t_maxseg;
- if (cw > tp->snd_ssthresh)
- incr = max((incr * incr / cw), 1);
- tp->snd_cwnd = min(cw+incr, TCP_MAXWIN<<tp->snd_scale);
+ if (!IN_FASTRECOVERY(tp)) {
+ if (CC_ALGO(tp)->ack_received)
+ CC_ALGO(tp)->ack_received(tp, th);
}
SOCKBUF_LOCK(&so->so_snd);
if (acked > so->so_snd.sb_cc) {
@@ -2138,14 +2117,11 @@ process_ACK:
/* NB: sowwakeup_locked() does an implicit unlock. */
sowwakeup_locked(so);
/* Detect una wraparound. */
- if ((V_tcp_do_newreno || (tp->t_flags & TF_SACK_PERMIT)) &&
- !IN_FASTRECOVERY(tp) &&
+ if (!IN_FASTRECOVERY(tp) &&
SEQ_GT(tp->snd_una, tp->snd_recover) &&
SEQ_LEQ(th->th_ack, tp->snd_recover))
tp->snd_recover = th->th_ack - 1;
- if ((V_tcp_do_newreno || (tp->t_flags & TF_SACK_PERMIT)) &&
- IN_FASTRECOVERY(tp) &&
- SEQ_GEQ(th->th_ack, tp->snd_recover))
+ if (IN_FASTRECOVERY(tp) && SEQ_GEQ(th->th_ack, tp->snd_recover))
EXIT_FASTRECOVERY(tp);
tp->snd_una = th->th_ack;
if (tp->t_flags & TF_SACK_PERMIT) {
@@ -3072,41 +3048,9 @@ tcp_mss(struct tcpcb *tp, int offer)
if (metrics.rmx_bandwidth)
tp->snd_bandwidth = metrics.rmx_bandwidth;
- /*
- * Set the slow-start flight size depending on whether this
- * is a local network or not.
- *
- * Extend this so we cache the cwnd too and retrieve it here.
- * Make cwnd even bigger than RFC3390 suggests but only if we
- * have previous experience with the remote host. Be careful
- * not make cwnd bigger than remote receive window or our own
- * send socket buffer. Maybe put some additional upper bound
- * on the retrieved cwnd. Should do incremental updates to
- * hostcache when cwnd collapses so next connection doesn't
- * overloads the path again.
- *
- * RFC3390 says only do this if SYN or SYN/ACK didn't got lost.
- * We currently check only in syncache_socket for that.
- */
-#define TCP_METRICS_CWND
-#ifdef TCP_METRICS_CWND
- if (metrics.rmx_cwnd)
- tp->snd_cwnd = max(mss,
- min(metrics.rmx_cwnd / 2,
- min(tp->snd_wnd, so->so_snd.sb_hiwat)));
- else
-#endif
- if (V_tcp_do_rfc3390)
- tp->snd_cwnd = min(4 * mss, max(2 * mss, 4380));
-#ifdef INET6
- else if ((isipv6 && in6_localaddr(&inp->in6p_faddr)) ||
- (!isipv6 && in_localaddr(inp->inp_faddr)))
-#else
- else if (in_localaddr(inp->inp_faddr))
-#endif
- tp->snd_cwnd = mss * V_ss_fltsz_local;
- else
- tp->snd_cwnd = mss * V_ss_fltsz;
+ /* set the initial cwnd value */
+ if (CC_ALGO(tp)->cwnd_init)
+ CC_ALGO(tp)->cwnd_init(tp);
}
/*
Modified: projects/tcp_cc_8.x/sys/netinet/tcp_output.c
==============================================================================
--- projects/tcp_cc_8.x/sys/netinet/tcp_output.c Sat Oct 18 06:56:07 2008 (r184024)
+++ projects/tcp_cc_8.x/sys/netinet/tcp_output.c Sat Oct 18 07:20:45 2008 (r184025)
@@ -71,6 +71,7 @@ __FBSDID("$FreeBSD$");
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcpip.h>
+#include <netinet/cc.h>
#ifdef TCPDEBUG
#include <netinet/tcp_debug.h>
#endif
@@ -101,10 +102,6 @@ SYSCTL_V_INT(V_NET, vnet_inet, _net_inet
local_slowstart_flightsize, CTLFLAG_RW,
ss_fltsz_local, 1, "Slow start flight size for local networks");
-int tcp_do_newreno = 1;
-SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, newreno, CTLFLAG_RW,
- tcp_do_newreno, 0, "Enable NewReno Algorithms");
-
int tcp_do_tso = 1;
SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, tso, CTLFLAG_RW,
tcp_do_tso, 0, "Enable TCP Segmentation Offload");
@@ -169,24 +166,9 @@ tcp_output(struct tcpcb *tp)
*/
idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una);
if (idle && (ticks - tp->t_rcvtime) >= tp->t_rxtcur) {
- /*
- * We have been idle for "a while" and no acks are
- * expected to clock out any data we send --
- * slow start to get ack "clock" running again.
- *
- * Set the slow-start flight size depending on whether
- * this is a local network or not.
- */
- int ss = V_ss_fltsz;
-#ifdef INET6
- if (isipv6) {
- if (in6_localaddr(&tp->t_inpcb->in6p_faddr))
- ss = V_ss_fltsz_local;
- } else
-#endif /* INET6 */
- if (in_localaddr(tp->t_inpcb->inp_faddr))
- ss = V_ss_fltsz_local;
- tp->snd_cwnd = tp->t_maxseg * ss;
+ /* reset cwnd after a period of idleness */
+ if (CC_ALGO(tp)->after_idle)
+ CC_ALGO(tp)->after_idle(tp);
}
tp->t_flags &= ~TF_LASTIDLE;
if (idle) {
Modified: projects/tcp_cc_8.x/sys/netinet/tcp_subr.c
==============================================================================
--- projects/tcp_cc_8.x/sys/netinet/tcp_subr.c Sat Oct 18 06:56:07 2008 (r184024)
+++ projects/tcp_cc_8.x/sys/netinet/tcp_subr.c Sat Oct 18 07:20:45 2008 (r184025)
@@ -49,6 +49,8 @@ __FBSDID("$FreeBSD$");
#ifdef INET6
#include <sys/domain.h>
#endif
+#include <sys/lock.h>
+#include <sys/rwlock.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/socket.h>
@@ -87,6 +89,7 @@ __FBSDID("$FreeBSD$");
#include <netinet/tcp_var.h>
#include <netinet/tcp_syncache.h>
#include <netinet/tcp_offload.h>
+#include <netinet/cc.h>
#ifdef INET6
#include <netinet6/tcp6_var.h>
#endif
@@ -312,6 +315,8 @@ tcp_init(void)
V_tcp_inflight_rttthresh = TCPTV_INFLIGHT_RTTTHRESH;
tcp_finwait2_timeout = TCPTV_FINWAIT2_TIMEOUT;
+ cc_init();
+
INP_INFO_LOCK_INIT(&V_tcbinfo, "tcp");
LIST_INIT(&V_tcb);
V_tcbinfo.ipi_listhead = &V_tcb;
@@ -638,6 +643,21 @@ tcp_newtcpcb(struct inpcb *inp)
if (tm == NULL)
return (NULL);
tp = &tm->tcb;
+
+ /*
+ * use the current system default cc algorithm, which is always
+ * the first algorithm in cc_list
+ */
+ CC_LIST_RLOCK();
+ CC_ALGO(tp) = STAILQ_FIRST(&cc_list);
+ CC_LIST_RUNLOCK();
+
+ /* if the cc module fails to init, stop building the control block */
+ if (CC_ALGO(tp)->init(tp) > 0) {
+ uma_zfree(tcpcb_zone, tp);
+ return NULL;
+ }
+
tp->t_timers = &tm->tt;
/* LIST_INIT(&tp->t_segq); */ /* XXX covered by M_ZERO */
tp->t_maxseg = tp->t_maxopd =
@@ -800,8 +820,13 @@ tcp_discardcb(struct tcpcb *tp)
}
/* Disconnect offload device, if any. */
tcp_offload_detach(tp);
-
tcp_free_sackholes(tp);
+
+ /* Allow the cc algorithm in use for this cb to clean up after itself */
+ if (CC_ALGO(tp)->deinit)
+ CC_ALGO(tp)->deinit(tp);
+
+ CC_ALGO(tp) = NULL;
inp->inp_ppcb = NULL;
tp->t_inpcb = NULL;
uma_zfree(tcpcb_zone, tp);
Modified: projects/tcp_cc_8.x/sys/netinet/tcp_timer.c
==============================================================================
--- projects/tcp_cc_8.x/sys/netinet/tcp_timer.c Sat Oct 18 06:56:07 2008 (r184024)
+++ projects/tcp_cc_8.x/sys/netinet/tcp_timer.c Sat Oct 18 07:20:45 2008 (r184025)
@@ -61,6 +61,7 @@ __FBSDID("$FreeBSD$");
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcpip.h>
+#include <netinet/cc.h>
#ifdef TCPDEBUG
#include <netinet/tcp_debug.h>
#endif
@@ -552,38 +553,11 @@ tcp_timer_rexmt(void * xtp)
* If timing a segment in this window, stop the timer.
*/
tp->t_rtttime = 0;
- /*
- * Close the congestion window down to one segment
- * (we'll open it by one segment for each ack we get).
- * Since we probably have a window's worth of unacked
- * data accumulated, this "slow start" keeps us from
- * dumping all that data as back-to-back packets (which
- * might overwhelm an intermediate gateway).
- *
- * There are two phases to the opening: Initially we
- * open by one mss on each ack. This makes the window
- * size increase exponentially with time. If the
- * window is larger than the path can handle, this
- * exponential growth results in dropped packet(s)
- * almost immediately. To get more time between
- * drops but still "push" the network to take advantage
- * of improving conditions, we switch from exponential
- * to linear window opening at some threshhold size.
- * For a threshhold, we use half the current window
- * size, truncated to a multiple of the mss.
- *
- * (the minimum cwnd that will give us exponential
- * growth is 2 mss. We don't allow the threshhold
- * to go below this.)
- */
- {
- u_int win = min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_maxseg;
- if (win < 2)
- win = 2;
- tp->snd_cwnd = tp->t_maxseg;
- tp->snd_ssthresh = win * tp->t_maxseg;
- tp->t_dupacks = 0;
- }
+
+ if (CC_ALGO(tp)->after_timeout)
+ CC_ALGO(tp)->after_timeout(tp);
+
+ tp->t_dupacks = 0;
EXIT_FASTRECOVERY(tp);
(void) tcp_output(tp);
Modified: projects/tcp_cc_8.x/sys/netinet/tcp_usrreq.c
==============================================================================
--- projects/tcp_cc_8.x/sys/netinet/tcp_usrreq.c Sat Oct 18 06:56:07 2008 (r184024)
+++ projects/tcp_cc_8.x/sys/netinet/tcp_usrreq.c Sat Oct 18 07:20:45 2008 (r184025)
@@ -83,6 +83,7 @@ __FBSDID("$FreeBSD$");
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcpip.h>
+#include <netinet/cc.h>
#ifdef TCPDEBUG
#include <netinet/tcp_debug.h>
#endif
@@ -1281,6 +1282,8 @@ tcp_ctloutput(struct socket *so, struct
struct inpcb *inp;
struct tcpcb *tp;
struct tcp_info ti;
+ char buf[TCP_CA_NAME_MAX];
+ struct cc_algo *cc_algo;
error = 0;
inp = sotoinpcb(so);
@@ -1390,6 +1393,58 @@ tcp_ctloutput(struct socket *so, struct
error = EINVAL;
break;
+ case TCP_CONGESTION:
+ INP_WUNLOCK(inp);
+ bzero(buf, sizeof(buf));
+ error = sooptcopyin(sopt, &buf, sizeof(buf), 1);
+ if (error)
+ break;
+ INP_WLOCK_RECHECK(inp);
+ /*
+ * We return EINVAL if we can't find the requested cc
+ * algo. We set error here and reset to 0 if found to
+ * simplify the error checking,
+ */
+ error = EINVAL;
*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
More information about the svn-src-projects
mailing list