svn commit: r184025 - in projects/tcp_cc_8.x/sys: conf netinet
Lawrence Stewart
lstewart at FreeBSD.org
Sat Oct 18 07:20:46 UTC 2008
Author: lstewart
Date: Sat Oct 18 07:20:45 2008
New Revision: 184025
URL: http://svn.freebsd.org/changeset/base/184025
Log:
Initial import of the TCP modular congestion control framework from my private
repository. See http://caia.swin.edu.au/urp/newtcp/ for more details.
Patch is currently in good shape and defaults to running with the regular New
Reno congestion control algorithm.
Todo:
- KPI man page
- Integrate properly with ECN
- Integrate my currently private congestion control algorithm modules
- Test that vimage changes have not functionally changed anything
Added:
projects/tcp_cc_8.x/sys/netinet/cc.c (contents, props changed)
projects/tcp_cc_8.x/sys/netinet/cc.h (contents, props changed)
Modified:
projects/tcp_cc_8.x/sys/conf/files
projects/tcp_cc_8.x/sys/netinet/tcp_input.c
projects/tcp_cc_8.x/sys/netinet/tcp_output.c
projects/tcp_cc_8.x/sys/netinet/tcp_subr.c
projects/tcp_cc_8.x/sys/netinet/tcp_timer.c
projects/tcp_cc_8.x/sys/netinet/tcp_usrreq.c
projects/tcp_cc_8.x/sys/netinet/tcp_var.h
Modified: projects/tcp_cc_8.x/sys/conf/files
==============================================================================
--- projects/tcp_cc_8.x/sys/conf/files Sat Oct 18 06:56:07 2008 (r184024)
+++ projects/tcp_cc_8.x/sys/conf/files Sat Oct 18 07:20:45 2008 (r184025)
@@ -1960,6 +1960,7 @@ netinet/ip_mroute.c optional mrouting i
netinet/ip_options.c optional inet
netinet/ip_output.c optional inet
netinet/raw_ip.c optional inet
+netinet/cc.c optional inet
netinet/sctp_asconf.c optional inet sctp
netinet/sctp_auth.c optional inet sctp
netinet/sctp_bsd_addr.c optional inet sctp
Added: projects/tcp_cc_8.x/sys/netinet/cc.c
==============================================================================
--- /dev/null 00:00:00 1970 (empty, because file is newly added)
+++ projects/tcp_cc_8.x/sys/netinet/cc.c Sat Oct 18 07:20:45 2008 (r184025)
@@ -0,0 +1,451 @@
+/*-
+ * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
+ * The Regents of the University of California.
+ * Copyright (c) 2008 Swinburne University of Technology, Melbourne, Australia
+ * All rights reserved.
+ *
+ * The majority of this software was developed at the Centre for
+ * Advanced Internet Architectures, Swinburne University, by Lawrence Stewart
+ * and James Healy, made possible in part by a grant from the Cisco University
+ * Research Program Fund at Community Foundation Silicon Valley.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/libkern.h>
+#include <sys/lock.h>
+#include <sys/rwlock.h>
+#include <sys/mutex.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/sysctl.h>
+#include <sys/malloc.h>
+#include <sys/kernel.h>
+#include <sys/sbuf.h>
+
+#include <netinet/in.h>
+#include <netinet/in_pcb.h>
+#include <netinet/tcp.h>
+#include <netinet/tcp_seq.h>
+#include <netinet/tcp_fsm.h>
+#include <netinet/cc.h>
+
+
+/* list of available cc algorithms on the current system */
+struct cc_head cc_list = STAILQ_HEAD_INITIALIZER(cc_list);
+
+/* rwlock protecting cc_list; manipulated via the CC_LIST_* macros in cc.h */
+struct rwlock cc_list_lock;
+
+/*
+ * NOTE(review): M_STRING is not referenced anywhere in this file's
+ * visible code -- confirm this malloc type is actually needed.
+ */
+MALLOC_DECLARE(M_STRING);
+MALLOC_DEFINE(M_STRING, "string", "a string");
+
+/* function pointer table for the built-in NewReno algorithm (deinit unused) */
+struct cc_algo newreno_cc_algo = {
+ .name = "newreno",
+ .init = newreno_init,
+ .deinit = NULL,
+ .cwnd_init = newreno_cwnd_init,
+ .ack_received = newreno_ack_received,
+ .pre_fr = newreno_pre_fr,
+ .post_fr = newreno_post_fr,
+ .after_idle = newreno_after_idle,
+ .after_timeout = newreno_after_timeout
+};
+
+/* name of the system-wide default cc algorithm (kept at the head of cc_list) */
+char cc_algorithm[TCP_CA_NAME_MAX];
+
+/*
+ * Sysctl handler for net.inet.tcp.cc.algorithm.  Reports the current
+ * system default congestion control algorithm and allows it to be
+ * changed to any algorithm currently registered in cc_list.  The chosen
+ * algorithm is moved to the head of cc_list, since tcp_newtcpcb()
+ * assigns STAILQ_FIRST(&cc_list) to new connections.
+ *
+ * Fixes over the previous version:
+ * - sysctl_handle_string() now performs the transfer, so we never
+ *   dereference req->newptr (a userland pointer) directly in the kernel;
+ * - an unknown name returns ESRCH (the old code returned the bare
+ *   constant 1, i.e. EPERM, which is misleading);
+ * - the write lock is held across the whole lookup + reorder, closing
+ *   the window the old runlock-then-wlock sequence left in which the
+ *   matched entry could be deregistered.
+ */
+static int
+cc_default_algorithm(SYSCTL_HANDLER_ARGS)
+{
+ struct cc_algo *funcs;
+ char saved_algorithm[TCP_CA_NAME_MAX];
+ int error;
+
+ /*
+ * Snapshot the current default so we can roll back if the
+ * requested name does not match any registered algorithm.
+ */
+ strlcpy(saved_algorithm, cc_algorithm, sizeof(saved_algorithm));
+
+ /* Copyin/copyout is done safely by the string handler. */
+ error = sysctl_handle_string(oidp, arg1, arg2, req);
+ if (error || req->newptr == NULL)
+ return error;
+
+ CC_LIST_WLOCK();
+ STAILQ_FOREACH(funcs, &cc_list, entries) {
+ if (strncmp(cc_algorithm, funcs->name, TCP_CA_NAME_MAX) == 0) {
+ /*
+ * Make the selected system default cc algorithm
+ * the first element in the list if it isn't already.
+ */
+ if (funcs != STAILQ_FIRST(&cc_list)) {
+ STAILQ_REMOVE(&cc_list, funcs, cc_algo, entries);
+ STAILQ_INSERT_HEAD(&cc_list, funcs, entries);
+ }
+ CC_LIST_WUNLOCK();
+ return 0;
+ }
+ }
+ CC_LIST_WUNLOCK();
+
+ /* Unknown algorithm: restore the previous default and report it. */
+ strlcpy(cc_algorithm, saved_algorithm, sizeof(cc_algorithm));
+ return ESRCH;
+}
+
+/*
+ * Sysctl handler for net.inet.tcp.cc.available.  Returns a read-only,
+ * comma-separated list of the congestion control algorithms currently
+ * registered in cc_list.
+ */
+static int
+cc_list_available(SYSCTL_HANDLER_ARGS)
+{
+ struct cc_algo *algo;
+ struct sbuf *s;
+ int error = 0, first = 1;
+
+ s = sbuf_new(NULL, NULL, TCP_CA_NAME_MAX, SBUF_AUTOEXTEND);
+ if (s == NULL)
+ return ENOMEM; /* sysctl handlers return errno values, not -1 */
+
+ /* build "name1, name2, ..." while holding the list read lock */
+ CC_LIST_RLOCK();
+ STAILQ_FOREACH(algo, &cc_list, entries) {
+ error = sbuf_printf(s, first ? "%s" : ", %s", algo->name);
+ if (error != 0)
+ break;
+ first = 0;
+ }
+ CC_LIST_RUNLOCK();
+
+ if (error == 0) {
+ sbuf_finish(s);
+ error = sysctl_handle_string(oidp, sbuf_data(s), 1, req);
+ }
+
+ sbuf_delete(s);
+ return error;
+}
+
+/*
+ * Initialise the congestion control framework at system boot: set up
+ * the list lock and the algorithm list, register the built-in NewReno
+ * algorithm and make it the system default.  Called from tcp_init().
+ */
+void
+cc_init(void)
+{
+ /* initialise the lock that will protect read/write access to our linked list */
+ CC_LIST_LOCK_INIT();
+
+ /* initialise the list of cc algorithms */
+ STAILQ_INIT(&cc_list);
+
+ /* add newreno to the list of available algorithms */
+ cc_register_algorithm(&newreno_cc_algo);
+
+ /*
+ * Set newreno as the system default.  strlcpy (from libkern,
+ * already included) guarantees NUL termination, which strncpy
+ * does not when the source fills the buffer.
+ */
+ strlcpy(cc_algorithm, newreno_cc_algo.name, sizeof(cc_algorithm));
+}
+
+/*
+ * Deregister a congestion control algorithm: remove it from cc_list and
+ * switch every connection currently using it back to NewReno.
+ *
+ * Returns 1 on success, 0 on failure (the algorithm was not found in
+ * cc_list, or the read lock could not be upgraded to a write lock).
+ */
+int
+cc_deregister_algorithm(struct cc_algo *remove_cc)
+{
+ struct cc_algo *funcs, *tmpfuncs;
+ register struct tcpcb *tp = NULL;
+ register struct inpcb *inp = NULL;
+ int success = 0;
+
+ /* remove the algorithm from the list available to the system */
+ CC_LIST_RLOCK();
+ STAILQ_FOREACH_SAFE(funcs, &cc_list, entries, tmpfuncs) {
+ if (funcs == remove_cc) {
+ /*
+ * Upgrade to a write lock in place; rw_try_upgrade
+ * fails rather than blocks if other readers hold the
+ * lock.  NOTE(review): on a failed upgrade the whole
+ * deregistration silently fails (success stays 0) --
+ * callers must be prepared to retry.
+ */
+ if (CC_LIST_TRY_WLOCK()) {
+ /* if this algorithm is the system default, reset the default to newreno */
+ if (strncmp(cc_algorithm, remove_cc->name, TCP_CA_NAME_MAX) == 0)
+ snprintf(cc_algorithm, TCP_CA_NAME_MAX, "%s", newreno_cc_algo.name);
+
+ STAILQ_REMOVE(&cc_list, funcs, cc_algo, entries);
+ success = 1;
+ /* downgrade so the exit path's RUNLOCK is balanced */
+ CC_LIST_W2RLOCK();
+ }
+ break;
+ }
+ }
+ CC_LIST_RUNLOCK();
+
+ if (success) {
+ /*
+ * check all active control blocks and change any that are using this
+ * algorithm back to newreno. If the algorithm that was in use requires
+ * deinit code to be run, call it.
+ * NOTE(review): other files in this patch use the virtualised
+ * V_tcbinfo/V_tcb names -- confirm the bare tcbinfo/tcb below
+ * resolve under VIMAGE builds.
+ */
+ INP_INFO_RLOCK(&tcbinfo);
+ LIST_FOREACH(inp, &tcb, inp_list) {
+ /* skip tcptw structs */
+ if (inp->inp_vflag & INP_TIMEWAIT)
+ continue;
+ INP_WLOCK(inp);
+ if ((tp = intotcpcb(inp)) != NULL) {
+ if (strncmp(CC_ALGO(tp)->name, remove_cc->name, TCP_CA_NAME_MAX) == 0 ) {
+ tmpfuncs = CC_ALGO(tp);
+ CC_ALGO(tp) = &newreno_cc_algo;
+ /*
+ * XXX: We should stall here until
+ * we're sure the tcb has stopped
+ * using the deregistered algo's functions...
+ * Not sure how to do that yet!
+ */
+ if(CC_ALGO(tp)->init)
+ CC_ALGO(tp)->init(tp);
+ if (tmpfuncs->deinit)
+ tmpfuncs->deinit(tp);
+ }
+ }
+ INP_WUNLOCK(inp);
+ }
+ INP_INFO_RUNLOCK(&tcbinfo);
+ }
+
+ return success;
+}
+
+/*
+ * Register a congestion control algorithm: append it to the global list
+ * of algorithms available for use by connections.  Always returns 1.
+ * NOTE(review): no duplicate-name check is performed; registering two
+ * algorithms with the same name would confuse the sysctl name lookup.
+ */
+int
+cc_register_algorithm(struct cc_algo *add_cc)
+{
+ CC_LIST_WLOCK();
+ STAILQ_INSERT_TAIL(&cc_list, add_cc, entries);
+ CC_LIST_WUNLOCK();
+ return 1;
+}
+
+/*
+ * NEW RENO
+ */
+
+/*
+ * Per-connection initialisation hook for NewReno.  NewReno keeps no
+ * per-connection state, so this always returns 0 (success; a return
+ * value > 0 makes tcp_newtcpcb() abort building the control block).
+ * NOTE(review): the printf fires for every new connection -- debug
+ * output that should be removed or put behind a debug knob.
+ */
+int
+newreno_init(struct tcpcb *tp)
+{
+ printf("initialising tcp connection with newreno congestion control\n");
+ return 0;
+}
+
+/*
+ * Update ssthresh to approximately half the current flight size:
+ * min(snd_wnd, snd_cwnd) / 2, rounded down to a whole number of
+ * segments and floored at 2 segments so slow start can still grow.
+ */
+void
+newreno_ssthresh_update(struct tcpcb *tp)
+{
+ u_int win;
+
+ /* reset ssthresh */
+ win = min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_maxseg;
+
+ if (win < 2)
+ win = 2;
+
+ tp->snd_ssthresh = win * tp->t_maxseg;
+}
+
+/*
+ * Set the initial cwnd at the start of a connection:
+ * - if there is a hostcache entry for the foreign host, base cwnd on it;
+ * - else if rfc3390 is enabled, set cwnd to approx 4 MSS as recommended;
+ * - otherwise use the sysctl variables configured by the administrator.
+ */
+void
+newreno_cwnd_init(struct tcpcb *tp)
+{
+ struct hc_metrics_lite metrics;
+ struct inpcb *inp = tp->t_inpcb;
+ struct socket *so = inp->inp_socket;
+
+ /*
+ * Set the slow-start flight size depending on whether this
+ * is a local network or not.
+ *
+ * Extend this so we cache the cwnd too and retrieve it here.
+ * Make cwnd even bigger than RFC3390 suggests but only if we
+ * have previous experience with the remote host. Be careful
+ * not to make cwnd bigger than the remote receive window or our
+ * own send socket buffer. Maybe put some additional upper bound
+ * on the retrieved cwnd. Should do incremental updates to
+ * hostcache when cwnd collapses so the next connection doesn't
+ * overload the path again.
+ *
+ * RFC3390 says only do this if SYN or SYN/ACK didn't get lost.
+ * We currently check only in syncache_socket for that.
+ */
+
+ tcp_hc_get(&inp->inp_inc, &metrics);
+
+ /*
+ * NOTE(review): isipv6 is not declared anywhere in this file, so the
+ * INET6 branch below cannot compile as-is -- verify against an INET6
+ * kernel build.  tcp_do_rfc3390/ss_fltsz/ss_fltsz_local are also the
+ * non-virtualised names (the code this replaces in tcp_mss() used
+ * V_tcp_do_rfc3390 etc.) -- verify under VIMAGE builds.
+ */
+#define TCP_METRICS_CWND
+#ifdef TCP_METRICS_CWND
+ if (metrics.rmx_cwnd)
+ tp->snd_cwnd = max(tp->t_maxseg,
+ min(metrics.rmx_cwnd / 2,
+ min(tp->snd_wnd, so->so_snd.sb_hiwat)));
+ else
+#endif
+ if (tcp_do_rfc3390)
+ tp->snd_cwnd = min(4 * tp->t_maxseg, max(2 * tp->t_maxseg, 4380));
+#ifdef INET6
+ else if ((isipv6 && in6_localaddr(&inp->in6p_faddr)) ||
+ (!isipv6 && in_localaddr(inp->inp_faddr)))
+#else
+ else if (in_localaddr(inp->inp_faddr))
+#endif
+ tp->snd_cwnd = tp->t_maxseg * ss_fltsz_local;
+ else
+ tp->snd_cwnd = tp->t_maxseg * ss_fltsz;
+}
+
+/*
+ * Increase cwnd on receipt of a valid ACK:
+ * - if cwnd <= ssthresh (slow start), open by 1 MSS per ACK;
+ * - if cwnd > ssthresh (congestion avoidance), open by about
+ *   maxseg^2/cwnd per ACK, i.e. roughly 1 MSS per RTT.
+ */
+void
+newreno_ack_received(struct tcpcb *tp, struct tcphdr *th)
+{
+ u_int cw = tp->snd_cwnd;
+ u_int incr = tp->t_maxseg;
+
+ if (cw > tp->snd_ssthresh)
+ /*
+ * Fix the increment at a minimum of 1 byte so that cwnd
+ * cannot stall once cw > maxseg^2, where the integer
+ * division would otherwise yield 0.  This matches the
+ * RFC2581-suggested clamp in the tcp_do_segment() code
+ * this function replaces.
+ */
+ incr = max(incr * incr / cw, 1);
+
+ tp->snd_cwnd = min(cw+incr, TCP_MAXWIN<<tp->snd_scale);
+}
+
+/*
+ * Hook run before entering fast recovery: record the new ssthresh
+ * (approximately half the current flight size).  th is unused by
+ * NewReno but is part of the cc_algo pre_fr hook signature.
+ */
+void
+newreno_pre_fr(struct tcpcb *tp, struct tcphdr *th)
+{
+ newreno_ssthresh_update(tp);
+}
+
+/*
+ * Set cwnd on exit from fast recovery.  th may be NULL, in which case
+ * cwnd is simply deflated to ssthresh (plain Reno behaviour) instead of
+ * NewReno's burst-avoiding calculation based on the ACKed sequence.
+ */
+void
+newreno_post_fr(struct tcpcb *tp, struct tcphdr *th)
+{
+ /*
+ * Out of fast recovery.
+ * Window inflation should have left us
+ * with approximately snd_ssthresh
+ * outstanding data.
+ * But in case we would be inclined to
+ * send a burst, better to do it via
+ * the slow start mechanism.
+ */
+ if (th && SEQ_GT(th->th_ack + tp->snd_ssthresh, tp->snd_max))
+ tp->snd_cwnd = tp->snd_max - th->th_ack + tp->t_maxseg;
+ else
+ tp->snd_cwnd = tp->snd_ssthresh;
+}
+
+/*
+ * If a connection has been idle for a while and more data is ready to
+ * be sent, reset cwnd so transmission restarts in slow start.
+ */
+void
+newreno_after_idle(struct tcpcb *tp)
+{
+ /*
+ * We have been idle for "a while" and no acks are
+ * expected to clock out any data we send --
+ * slow start to get ack "clock" running again.
+ *
+ * Set the slow-start flight size depending on whether
+ * this is a local network or not.
+ */
+ int ss = ss_fltsz;
+
+ /*
+ * NOTE(review): isipv6 is not declared in this file, so the INET6
+ * branch below cannot compile as-is.  ss_fltsz/ss_fltsz_local are
+ * the non-virtualised names (tcp_output.c uses V_ss_fltsz) --
+ * verify under VIMAGE builds.
+ */
+#ifdef INET6
+ if (isipv6) {
+ if (in6_localaddr(&tp->t_inpcb->in6p_faddr))
+ ss = ss_fltsz_local;
+ } else
+#endif /* INET6 */
+
+ if (in_localaddr(tp->t_inpcb->inp_faddr))
+ ss = ss_fltsz_local;
+
+ tp->snd_cwnd = tp->t_maxseg * ss;
+}
+
+/*
+ * Reset cwnd (and, via newreno_ssthresh_update, ssthresh) after a
+ * transmission timeout.
+ */
+void
+newreno_after_timeout(struct tcpcb *tp)
+{
+ newreno_ssthresh_update(tp);
+
+ /*
+ * Close the congestion window down to one segment
+ * (we'll open it by one segment for each ack we get).
+ * Since we probably have a window's worth of unacked
+ * data accumulated, this "slow start" keeps us from
+ * dumping all that data as back-to-back packets (which
+ * might overwhelm an intermediate gateway).
+ *
+ * There are two phases to the opening: Initially we
+ * open by one mss on each ack. This makes the window
+ * size increase exponentially with time. If the
+ * window is larger than the path can handle, this
+ * exponential growth results in dropped packet(s)
+ * almost immediately. To get more time between
+ * drops but still "push" the network to take advantage
+ * of improving conditions, we switch from exponential
+ * to linear window opening at some threshold size.
+ * For a threshold, we use half the current window
+ * size, truncated to a multiple of the mss.
+ *
+ * (the minimum cwnd that will give us exponential
+ * growth is 2 mss. We don't allow the threshold
+ * to go below this.)
+ */
+ tp->snd_cwnd = tp->t_maxseg;
+}
+
+/* net.inet.tcp.cc sysctl tree (declared to other files via SYSCTL_DECL in cc.h) */
+SYSCTL_NODE(_net_inet_tcp, OID_AUTO, cc, CTLFLAG_RW, NULL,
+ "congestion control related settings");
+
+/* read/write: name of the default algorithm assigned to new connections */
+SYSCTL_PROC(_net_inet_tcp_cc, OID_AUTO, algorithm, CTLTYPE_STRING|CTLFLAG_RW,
+ &cc_algorithm, sizeof(cc_algorithm), cc_default_algorithm, "A",
+ "default congestion control algorithm");
+
+/* read-only: comma-separated list of registered algorithms */
+SYSCTL_PROC(_net_inet_tcp_cc, OID_AUTO, available, CTLTYPE_STRING|CTLFLAG_RD,
+ NULL, 0, cc_list_available, "A",
+ "list available congestion control algorithms");
Added: projects/tcp_cc_8.x/sys/netinet/cc.h
==============================================================================
--- /dev/null 00:00:00 1970 (empty, because file is newly added)
+++ projects/tcp_cc_8.x/sys/netinet/cc.h Sat Oct 18 07:20:45 2008 (r184025)
@@ -0,0 +1,119 @@
+/*-
+ * Copyright (c) 2008 Swinburne University of Technology, Melbourne, Australia
+ * All rights reserved.
+ *
+ * This software was developed at the Centre for Advanced Internet
+ * Architectures, Swinburne University, by Lawrence Stewart and James Healy,
+ * made possible in part by a grant from the Cisco University Research Program
+ * Fund at Community Foundation Silicon Valley.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _NETINET_CC_H_
+#define _NETINET_CC_H_
+
+#include <sys/queue.h>
+#include <netinet/tcp_var.h>
+
+/*
+ * Global CC vars
+ */
+extern STAILQ_HEAD(cc_head, cc_algo) cc_list;
+extern char cc_algorithm[];
+extern const int tcprexmtthresh;
+extern struct cc_algo newreno_cc_algo;
+
+/*
+ * Define the new net.inet.tcp.cc sysctl tree
+ */
+SYSCTL_DECL(_net_inet_tcp_cc);
+
+/*
+ * CC housekeeping functions
+ */
+void cc_init(void);
+int cc_register_algorithm(struct cc_algo *add_cc);
+int cc_deregister_algorithm(struct cc_algo *remove_cc);
+
+/*
+ * NewReno CC functions
+ */
+int newreno_init(struct tcpcb *tp);
+void newreno_cwnd_init(struct tcpcb *tp);
+void newreno_ack_received(struct tcpcb *tp, struct tcphdr *th);
+void newreno_pre_fr(struct tcpcb *tp, struct tcphdr *th);
+void newreno_post_fr(struct tcpcb *tp, struct tcphdr *th);
+void newreno_after_idle(struct tcpcb *tp);
+void newreno_after_timeout(struct tcpcb *tp);
+void newreno_ssthresh_update(struct tcpcb *tp);
+
+/*
+ * Structure to hold function pointers to the functions responsible
+ * for congestion control. Based on similar structure in the SCTP stack.
+ * All hooks other than init are tested for NULL before being called;
+ * init is invoked unconditionally from tcp_newtcpcb() and must be set.
+ */
+struct cc_algo {
+ char name[TCP_CA_NAME_MAX];
+
+ /* init the congestion algorithm for the specified control block */
+ int (*init) (struct tcpcb *tp);
+
+ /* deinit the congestion algorithm for the specified control block */
+ void (*deinit) (struct tcpcb *tp);
+
+ /* initialise cwnd at the start of a connection */
+ void (*cwnd_init) (struct tcpcb *tp);
+
+ /* called on the receipt of a valid ack */
+ void (*ack_received) (struct tcpcb *tp, struct tcphdr *th);
+
+ /* called before entering FR */
+ void (*pre_fr) (struct tcpcb *tp, struct tcphdr *th);
+
+ /* after exiting FR */
+ void (*post_fr) (struct tcpcb *tp, struct tcphdr *th);
+
+ /* perform tasks when data transfer resumes after an idle period */
+ void (*after_idle) (struct tcpcb *tp);
+
+ /* perform tasks when the connection's retransmit timer expires */
+ void (*after_timeout) (struct tcpcb *tp);
+
+ /* linkage into the global cc_list */
+ STAILQ_ENTRY(cc_algo) entries;
+};
+
+/* convenience accessors for the cc hooks/state hung off a tcpcb */
+#define CC_ALGO(tp) ((tp)->cc_algo)
+#define CC_DATA(tp) ((tp)->cc_data)
+
+/* lock protecting cc_list; read-mostly, hence an rwlock */
+extern struct rwlock cc_list_lock;
+#define CC_LIST_LOCK_INIT() rw_init(&cc_list_lock, "cc_list")
+#define CC_LIST_LOCK_DESTROY() rw_destroy(&cc_list_lock)
+#define CC_LIST_RLOCK() rw_rlock(&cc_list_lock)
+#define CC_LIST_RUNLOCK() rw_runlock(&cc_list_lock)
+#define CC_LIST_WLOCK() rw_wlock(&cc_list_lock)
+#define CC_LIST_WUNLOCK() rw_wunlock(&cc_list_lock)
+/* in-place upgrade/downgrade; TRY_WLOCK returns 0 if the upgrade fails */
+#define CC_LIST_TRY_WLOCK() rw_try_upgrade(&cc_list_lock)
+#define CC_LIST_W2RLOCK() rw_downgrade(&cc_list_lock)
+
+#endif /* _NETINET_CC_H_ */
Modified: projects/tcp_cc_8.x/sys/netinet/tcp_input.c
==============================================================================
--- projects/tcp_cc_8.x/sys/netinet/tcp_input.c Sat Oct 18 06:56:07 2008 (r184024)
+++ projects/tcp_cc_8.x/sys/netinet/tcp_input.c Sat Oct 18 07:20:45 2008 (r184025)
@@ -84,6 +84,7 @@ __FBSDID("$FreeBSD$");
#include <netinet6/tcp6_var.h>
#include <netinet/tcpip.h>
#include <netinet/tcp_syncache.h>
+#include <netinet/cc.h>
#ifdef TCPDEBUG
#include <netinet/tcp_debug.h>
#endif /* TCPDEBUG */
@@ -97,7 +98,7 @@ __FBSDID("$FreeBSD$");
#include <security/mac/mac_framework.h>
-static const int tcprexmtthresh = 3;
+const int tcprexmtthresh = 3;
struct tcpstat tcpstat;
SYSCTL_V_STRUCT(V_NET, vnet_inet, _net_inet_tcp, TCPCTL_STATS, stats,
@@ -125,7 +126,7 @@ static int tcp_do_rfc3042 = 1;
SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, rfc3042, CTLFLAG_RW,
tcp_do_rfc3042, 0, "Enable RFC 3042 (Limited Transmit)");
-static int tcp_do_rfc3390 = 1;
+int tcp_do_rfc3390 = 1;
SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, rfc3390, CTLFLAG_RW,
tcp_do_rfc3390, 0,
"Enable RFC 3390 (Increasing TCP's Initial Congestion Window)");
@@ -1096,14 +1097,9 @@ tcp_do_segment(struct mbuf *m, struct tc
if (SEQ_GT(th->th_ack, tp->snd_una) &&
SEQ_LEQ(th->th_ack, tp->snd_max) &&
tp->snd_cwnd >= tp->snd_wnd &&
- ((!V_tcp_do_newreno &&
- !(tp->t_flags & TF_SACK_PERMIT) &&
- tp->t_dupacks < tcprexmtthresh) ||
- ((V_tcp_do_newreno ||
- (tp->t_flags & TF_SACK_PERMIT)) &&
- !IN_FASTRECOVERY(tp) &&
- (to.to_flags & TOF_SACK) == 0 &&
- TAILQ_EMPTY(&tp->snd_holes)))) {
+ !IN_FASTRECOVERY(tp) &&
+ (to.to_flags & TOF_SACK) == 0 &&
+ TAILQ_EMPTY(&tp->snd_holes)) {
KASSERT(headlocked,
("%s: headlocked", __func__));
INP_INFO_WUNLOCK(&V_tcbinfo);
@@ -1870,9 +1866,7 @@ tcp_do_segment(struct mbuf *m, struct tc
th->th_ack != tp->snd_una)
tp->t_dupacks = 0;
else if (++tp->t_dupacks > tcprexmtthresh ||
- ((V_tcp_do_newreno ||
- (tp->t_flags & TF_SACK_PERMIT)) &&
- IN_FASTRECOVERY(tp))) {
+ IN_FASTRECOVERY(tp)) {
if ((tp->t_flags & TF_SACK_PERMIT) &&
IN_FASTRECOVERY(tp)) {
int awnd;
@@ -1909,14 +1903,24 @@ tcp_do_segment(struct mbuf *m, struct tc
tp->t_dupacks = 0;
break;
}
- } else if (V_tcp_do_newreno ||
- V_tcp_do_ecn) {
+ } else {
if (SEQ_LEQ(th->th_ack,
tp->snd_recover)) {
tp->t_dupacks = 0;
break;
}
}
+
+ /*
+ * If the current tcp cc module has
+ * defined a hook for tasks to run
+ * before entering FR, call it
+ */
+ if (CC_ALGO(tp)->pre_fr)
+ CC_ALGO(tp)->pre_fr(tp, th);
+
+ ENTER_FASTRECOVERY(tp);
+ tp->snd_recover = tp->snd_max;
tcp_congestion_exp(tp);
tcp_timer_activate(tp, TT_REXMT, 0);
tp->t_rtttime = 0;
@@ -1981,37 +1985,16 @@ tcp_do_segment(struct mbuf *m, struct tc
* If the congestion window was inflated to account
* for the other side's cached packets, retract it.
*/
- if (V_tcp_do_newreno || (tp->t_flags & TF_SACK_PERMIT)) {
- if (IN_FASTRECOVERY(tp)) {
- if (SEQ_LT(th->th_ack, tp->snd_recover)) {
- if (tp->t_flags & TF_SACK_PERMIT)
- tcp_sack_partialack(tp, th);
- else
- tcp_newreno_partial_ack(tp, th);
- } else {
- /*
- * Out of fast recovery.
- * Window inflation should have left us
- * with approximately snd_ssthresh
- * outstanding data.
- * But in case we would be inclined to
- * send a burst, better to do it via
- * the slow start mechanism.
- */
- if (SEQ_GT(th->th_ack +
- tp->snd_ssthresh,
- tp->snd_max))
- tp->snd_cwnd = tp->snd_max -
- th->th_ack +
- tp->t_maxseg;
- else
- tp->snd_cwnd = tp->snd_ssthresh;
- }
+ if (IN_FASTRECOVERY(tp)) {
+ if (SEQ_LT(th->th_ack, tp->snd_recover)) {
+ if (tp->t_flags & TF_SACK_PERMIT)
+ tcp_sack_partialack(tp, th);
+ else
+ tcp_newreno_partial_ack(tp, th);
+ } else {
+ if (CC_ALGO(tp)->post_fr)
+ CC_ALGO(tp)->post_fr(tp, th);
}
- } else {
- if (tp->t_dupacks >= tcprexmtthresh &&
- tp->snd_cwnd > tp->snd_ssthresh)
- tp->snd_cwnd = tp->snd_ssthresh;
}
tp->t_dupacks = 0;
/*
@@ -2117,13 +2100,9 @@ process_ACK:
* If cwnd > maxseg^2, fix the cwnd increment at 1 byte
* to avoid capping cwnd (as suggested in RFC2581).
*/
- if ((!V_tcp_do_newreno && !(tp->t_flags & TF_SACK_PERMIT)) ||
- !IN_FASTRECOVERY(tp)) {
- u_int cw = tp->snd_cwnd;
- u_int incr = tp->t_maxseg;
- if (cw > tp->snd_ssthresh)
- incr = max((incr * incr / cw), 1);
- tp->snd_cwnd = min(cw+incr, TCP_MAXWIN<<tp->snd_scale);
+ if (!IN_FASTRECOVERY(tp)) {
+ if (CC_ALGO(tp)->ack_received)
+ CC_ALGO(tp)->ack_received(tp, th);
}
SOCKBUF_LOCK(&so->so_snd);
if (acked > so->so_snd.sb_cc) {
@@ -2138,14 +2117,11 @@ process_ACK:
/* NB: sowwakeup_locked() does an implicit unlock. */
sowwakeup_locked(so);
/* Detect una wraparound. */
- if ((V_tcp_do_newreno || (tp->t_flags & TF_SACK_PERMIT)) &&
- !IN_FASTRECOVERY(tp) &&
+ if (!IN_FASTRECOVERY(tp) &&
SEQ_GT(tp->snd_una, tp->snd_recover) &&
SEQ_LEQ(th->th_ack, tp->snd_recover))
tp->snd_recover = th->th_ack - 1;
- if ((V_tcp_do_newreno || (tp->t_flags & TF_SACK_PERMIT)) &&
- IN_FASTRECOVERY(tp) &&
- SEQ_GEQ(th->th_ack, tp->snd_recover))
+ if (IN_FASTRECOVERY(tp) && SEQ_GEQ(th->th_ack, tp->snd_recover))
EXIT_FASTRECOVERY(tp);
tp->snd_una = th->th_ack;
if (tp->t_flags & TF_SACK_PERMIT) {
@@ -3072,41 +3048,9 @@ tcp_mss(struct tcpcb *tp, int offer)
if (metrics.rmx_bandwidth)
tp->snd_bandwidth = metrics.rmx_bandwidth;
- /*
- * Set the slow-start flight size depending on whether this
- * is a local network or not.
- *
- * Extend this so we cache the cwnd too and retrieve it here.
- * Make cwnd even bigger than RFC3390 suggests but only if we
- * have previous experience with the remote host. Be careful
- * not make cwnd bigger than remote receive window or our own
- * send socket buffer. Maybe put some additional upper bound
- * on the retrieved cwnd. Should do incremental updates to
- * hostcache when cwnd collapses so next connection doesn't
- * overloads the path again.
- *
- * RFC3390 says only do this if SYN or SYN/ACK didn't got lost.
- * We currently check only in syncache_socket for that.
- */
-#define TCP_METRICS_CWND
-#ifdef TCP_METRICS_CWND
- if (metrics.rmx_cwnd)
- tp->snd_cwnd = max(mss,
- min(metrics.rmx_cwnd / 2,
- min(tp->snd_wnd, so->so_snd.sb_hiwat)));
- else
-#endif
- if (V_tcp_do_rfc3390)
- tp->snd_cwnd = min(4 * mss, max(2 * mss, 4380));
-#ifdef INET6
- else if ((isipv6 && in6_localaddr(&inp->in6p_faddr)) ||
- (!isipv6 && in_localaddr(inp->inp_faddr)))
-#else
- else if (in_localaddr(inp->inp_faddr))
-#endif
- tp->snd_cwnd = mss * V_ss_fltsz_local;
- else
- tp->snd_cwnd = mss * V_ss_fltsz;
+ /* set the initial cwnd value */
+ if (CC_ALGO(tp)->cwnd_init)
+ CC_ALGO(tp)->cwnd_init(tp);
}
/*
Modified: projects/tcp_cc_8.x/sys/netinet/tcp_output.c
==============================================================================
--- projects/tcp_cc_8.x/sys/netinet/tcp_output.c Sat Oct 18 06:56:07 2008 (r184024)
+++ projects/tcp_cc_8.x/sys/netinet/tcp_output.c Sat Oct 18 07:20:45 2008 (r184025)
@@ -71,6 +71,7 @@ __FBSDID("$FreeBSD$");
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcpip.h>
+#include <netinet/cc.h>
#ifdef TCPDEBUG
#include <netinet/tcp_debug.h>
#endif
@@ -101,10 +102,6 @@ SYSCTL_V_INT(V_NET, vnet_inet, _net_inet
local_slowstart_flightsize, CTLFLAG_RW,
ss_fltsz_local, 1, "Slow start flight size for local networks");
-int tcp_do_newreno = 1;
-SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, newreno, CTLFLAG_RW,
- tcp_do_newreno, 0, "Enable NewReno Algorithms");
-
int tcp_do_tso = 1;
SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, tso, CTLFLAG_RW,
tcp_do_tso, 0, "Enable TCP Segmentation Offload");
@@ -169,24 +166,9 @@ tcp_output(struct tcpcb *tp)
*/
idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una);
if (idle && (ticks - tp->t_rcvtime) >= tp->t_rxtcur) {
- /*
- * We have been idle for "a while" and no acks are
- * expected to clock out any data we send --
- * slow start to get ack "clock" running again.
- *
- * Set the slow-start flight size depending on whether
- * this is a local network or not.
- */
- int ss = V_ss_fltsz;
-#ifdef INET6
- if (isipv6) {
- if (in6_localaddr(&tp->t_inpcb->in6p_faddr))
- ss = V_ss_fltsz_local;
- } else
-#endif /* INET6 */
- if (in_localaddr(tp->t_inpcb->inp_faddr))
- ss = V_ss_fltsz_local;
- tp->snd_cwnd = tp->t_maxseg * ss;
+ /* reset cwnd after a period of idleness */
+ if (CC_ALGO(tp)->after_idle)
+ CC_ALGO(tp)->after_idle(tp);
}
tp->t_flags &= ~TF_LASTIDLE;
if (idle) {
Modified: projects/tcp_cc_8.x/sys/netinet/tcp_subr.c
==============================================================================
--- projects/tcp_cc_8.x/sys/netinet/tcp_subr.c Sat Oct 18 06:56:07 2008 (r184024)
+++ projects/tcp_cc_8.x/sys/netinet/tcp_subr.c Sat Oct 18 07:20:45 2008 (r184025)
@@ -49,6 +49,8 @@ __FBSDID("$FreeBSD$");
#ifdef INET6
#include <sys/domain.h>
#endif
+#include <sys/lock.h>
+#include <sys/rwlock.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/socket.h>
@@ -87,6 +89,7 @@ __FBSDID("$FreeBSD$");
#include <netinet/tcp_var.h>
#include <netinet/tcp_syncache.h>
#include <netinet/tcp_offload.h>
+#include <netinet/cc.h>
#ifdef INET6
#include <netinet6/tcp6_var.h>
#endif
@@ -312,6 +315,8 @@ tcp_init(void)
V_tcp_inflight_rttthresh = TCPTV_INFLIGHT_RTTTHRESH;
tcp_finwait2_timeout = TCPTV_FINWAIT2_TIMEOUT;
+ cc_init();
+
INP_INFO_LOCK_INIT(&V_tcbinfo, "tcp");
LIST_INIT(&V_tcb);
V_tcbinfo.ipi_listhead = &V_tcb;
@@ -638,6 +643,21 @@ tcp_newtcpcb(struct inpcb *inp)
if (tm == NULL)
return (NULL);
tp = &tm->tcb;
+
+ /*
+ * use the current system default cc algorithm, which is always
+ * the first algorithm in cc_list
+ */
+ CC_LIST_RLOCK();
+ CC_ALGO(tp) = STAILQ_FIRST(&cc_list);
+ CC_LIST_RUNLOCK();
+
+ /* if the cc module fails to init, stop building the control block */
+ if (CC_ALGO(tp)->init(tp) > 0) {
+ uma_zfree(tcpcb_zone, tp);
+ return NULL;
+ }
+
tp->t_timers = &tm->tt;
/* LIST_INIT(&tp->t_segq); */ /* XXX covered by M_ZERO */
tp->t_maxseg = tp->t_maxopd =
@@ -800,8 +820,13 @@ tcp_discardcb(struct tcpcb *tp)
}
/* Disconnect offload device, if any. */
tcp_offload_detach(tp);
-
tcp_free_sackholes(tp);
+
+ /* Allow the cc algorithm in use for this cb to clean up after itself */
+ if (CC_ALGO(tp)->deinit)
+ CC_ALGO(tp)->deinit(tp);
+
+ CC_ALGO(tp) = NULL;
inp->inp_ppcb = NULL;
tp->t_inpcb = NULL;
uma_zfree(tcpcb_zone, tp);
Modified: projects/tcp_cc_8.x/sys/netinet/tcp_timer.c
==============================================================================
--- projects/tcp_cc_8.x/sys/netinet/tcp_timer.c Sat Oct 18 06:56:07 2008 (r184024)
+++ projects/tcp_cc_8.x/sys/netinet/tcp_timer.c Sat Oct 18 07:20:45 2008 (r184025)
@@ -61,6 +61,7 @@ __FBSDID("$FreeBSD$");
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcpip.h>
+#include <netinet/cc.h>
#ifdef TCPDEBUG
#include <netinet/tcp_debug.h>
#endif
@@ -552,38 +553,11 @@ tcp_timer_rexmt(void * xtp)
* If timing a segment in this window, stop the timer.
*/
tp->t_rtttime = 0;
- /*
- * Close the congestion window down to one segment
- * (we'll open it by one segment for each ack we get).
- * Since we probably have a window's worth of unacked
- * data accumulated, this "slow start" keeps us from
- * dumping all that data as back-to-back packets (which
- * might overwhelm an intermediate gateway).
- *
- * There are two phases to the opening: Initially we
- * open by one mss on each ack. This makes the window
- * size increase exponentially with time. If the
- * window is larger than the path can handle, this
- * exponential growth results in dropped packet(s)
- * almost immediately. To get more time between
- * drops but still "push" the network to take advantage
- * of improving conditions, we switch from exponential
- * to linear window opening at some threshhold size.
- * For a threshhold, we use half the current window
- * size, truncated to a multiple of the mss.
- *
- * (the minimum cwnd that will give us exponential
- * growth is 2 mss. We don't allow the threshhold
- * to go below this.)
- */
- {
- u_int win = min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_maxseg;
- if (win < 2)
- win = 2;
- tp->snd_cwnd = tp->t_maxseg;
- tp->snd_ssthresh = win * tp->t_maxseg;
- tp->t_dupacks = 0;
- }
+
+ if (CC_ALGO(tp)->after_timeout)
+ CC_ALGO(tp)->after_timeout(tp);
+
+ tp->t_dupacks = 0;
EXIT_FASTRECOVERY(tp);
(void) tcp_output(tp);
Modified: projects/tcp_cc_8.x/sys/netinet/tcp_usrreq.c
==============================================================================
--- projects/tcp_cc_8.x/sys/netinet/tcp_usrreq.c Sat Oct 18 06:56:07 2008 (r184024)
+++ projects/tcp_cc_8.x/sys/netinet/tcp_usrreq.c Sat Oct 18 07:20:45 2008 (r184025)
@@ -83,6 +83,7 @@ __FBSDID("$FreeBSD$");
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcpip.h>
+#include <netinet/cc.h>
#ifdef TCPDEBUG
#include <netinet/tcp_debug.h>
#endif
@@ -1281,6 +1282,8 @@ tcp_ctloutput(struct socket *so, struct
struct inpcb *inp;
struct tcpcb *tp;
struct tcp_info ti;
+ char buf[TCP_CA_NAME_MAX];
+ struct cc_algo *cc_algo;
error = 0;
inp = sotoinpcb(so);
@@ -1390,6 +1393,58 @@ tcp_ctloutput(struct socket *so, struct
error = EINVAL;
break;
+ case TCP_CONGESTION:
+ INP_WUNLOCK(inp);
+ bzero(buf, sizeof(buf));
+ error = sooptcopyin(sopt, &buf, sizeof(buf), 1);
+ if (error)
+ break;
+ INP_WLOCK_RECHECK(inp);
+ /*
+ * We return EINVAL if we can't find the requested cc
+ * algo. We set error here and reset to 0 if found to
+ * simplify the error checking,
+ */
+ error = EINVAL;
*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
More information about the svn-src-projects
mailing list