git: ea9017fb25ff - main - tcp: Congestion control move to using reference counting.
- Go to: [ bottom of page ] [ top of archives ] [ this month ]
Date: Mon, 21 Feb 2022 11:31:11 UTC
The branch main has been updated by rrs: URL: https://cgit.FreeBSD.org/src/commit/?id=ea9017fb25ff51fcb022a5835b1e472e57490d34 commit ea9017fb25ff51fcb022a5835b1e472e57490d34 Author: Randall Stewart <rrs@FreeBSD.org> AuthorDate: 2022-02-21 11:30:17 +0000 Commit: Randall Stewart <rrs@FreeBSD.org> CommitDate: 2022-02-21 11:30:17 +0000 tcp: Congestion control move to using reference counting. In the transport call on 12/3 Gleb asked to move the CC modules towards using reference counting to prevent folks from unloading a module in use. It was also agreed that Michael would do a user space utility like tcp_drop that could be used to move all connections that are using a specific CC to some other CC. This is the half I committed to doing, making it so that we maintain a refcount on a cc module every time a pcb refers to it and decrementing that every time a pcb no longer uses a cc module. This also helps us simplify the whole unloading process by getting rid of tcp_ccunload() which munged through all the tcb's. Instead we mark a module as being removed and prevent further references to it. We also make sure that if a module is marked as being removed it cannot be made as the default and also the opposite of that, if its a default it fails and does not mark it as being removed. Reviewed by: Michael Tuexen, Gleb Smirnoff Sponsored by: Netflix Inc. Differential Revision: https://reviews.freebsd.org/D33249 --- sys/netinet/cc/cc.c | 196 +++++++++++++++++++++++++++++++---------------- sys/netinet/cc/cc.h | 11 +++ sys/netinet/tcp_subr.c | 95 +---------------------- sys/netinet/tcp_usrreq.c | 48 ++++++------ sys/netinet/tcp_var.h | 1 - 5 files changed, 171 insertions(+), 180 deletions(-) diff --git a/sys/netinet/cc/cc.c b/sys/netinet/cc/cc.c index 647ecb47f18d..55a5f6ef652e 100644 --- a/sys/netinet/cc/cc.c +++ b/sys/netinet/cc/cc.c @@ -107,6 +107,45 @@ VNET_DEFINE(struct cc_algo *, default_cc_ptr) = NULL; VNET_DEFINE(uint32_t, newreno_beta) = 50; #define V_newreno_beta VNET(newreno_beta) +void +cc_refer(struct cc_algo *algo) +{ + CC_LIST_LOCK_ASSERT(); + refcount_acquire(&algo->cc_refcount); +} + +void +cc_release(struct cc_algo *algo) +{ + CC_LIST_LOCK_ASSERT(); + refcount_release(&algo->cc_refcount); +} + + +void +cc_attach(struct tcpcb *tp, struct cc_algo *algo) +{ + /* + * Attach the tcpcb to the algorithm. + */ + CC_LIST_RLOCK(); + CC_ALGO(tp) = algo; + cc_refer(algo); + CC_LIST_RUNLOCK(); +} + +void +cc_detach(struct tcpcb *tp) +{ + struct cc_algo *algo; + + CC_LIST_RLOCK(); + algo = CC_ALGO(tp); + CC_ALGO(tp) = NULL; + cc_release(algo); + CC_LIST_RUNLOCK(); +} + /* * Sysctl handler to show and change the default CC algorithm. */ @@ -137,6 +176,10 @@ cc_default_algo(SYSCTL_HANDLER_ARGS) STAILQ_FOREACH(funcs, &cc_list, entries) { if (strncmp(default_cc, funcs->name, sizeof(default_cc))) continue; + if (funcs->flags & CC_MODULE_BEING_REMOVED) { + /* Its being removed, its not eligible */ + continue; + } V_default_cc_ptr = funcs; error = 0; break; @@ -153,12 +196,12 @@ static int cc_list_available(SYSCTL_HANDLER_ARGS) { struct cc_algo *algo; - struct sbuf *s; - int err, first, nalgos; - - err = nalgos = 0; - first = 1; + int error, nalgos; + int linesz; + char *buffer, *cp; + size_t bufsz, outsz; + error = nalgos = 0; CC_LIST_RLOCK(); STAILQ_FOREACH(algo, &cc_list, entries) { nalgos++; @@ -167,37 +210,34 @@ cc_list_available(SYSCTL_HANDLER_ARGS) if (nalgos == 0) { return (ENOENT); } - s = sbuf_new(NULL, NULL, nalgos * TCP_CA_NAME_MAX, SBUF_FIXEDLEN); - - if (s == NULL) - return (ENOMEM); - - /* - * It is theoretically possible for the CC list to have grown in size - * since the call to sbuf_new() and therefore for the sbuf to be too - * small. If this were to happen (incredibly unlikely), the sbuf will - * reach an overflow condition, sbuf_printf() will return an error and - * the sysctl will fail gracefully. - */ + bufsz = (nalgos+2) * ((TCP_CA_NAME_MAX + 13) + 1); + buffer = malloc(bufsz, M_TEMP, M_WAITOK); + cp = buffer; + + linesz = snprintf(cp, bufsz, "\n%-16s%c %s\n", "CCmod", 'D', + "PCB count"); + cp += linesz; + bufsz -= linesz; + outsz = linesz; CC_LIST_RLOCK(); STAILQ_FOREACH(algo, &cc_list, entries) { - err = sbuf_printf(s, first ? "%s" : ", %s", algo->name); - if (err) { - /* Sbuf overflow condition. */ - err = EOVERFLOW; + linesz = snprintf(cp, bufsz, "%-16s%c %u\n", + algo->name, + (algo == CC_DEFAULT_ALGO()) ? '*' : ' ', + algo->cc_refcount); + if (linesz >= bufsz) { + error = EOVERFLOW; break; } - first = 0; + cp += linesz; + bufsz -= linesz; + outsz += linesz; } CC_LIST_RUNLOCK(); - - if (!err) { - sbuf_finish(s); - err = sysctl_handle_string(oidp, sbuf_data(s), 0, req); - } - - sbuf_delete(s); - return (err); + if (error == 0) + error = sysctl_handle_string(oidp, buffer, outsz + 1, req); + free(buffer, M_TEMP); + return (error); } /* @@ -243,41 +283,36 @@ cc_init(void) int cc_deregister_algo(struct cc_algo *remove_cc) { - struct cc_algo *funcs, *tmpfuncs; - int err; - - err = ENOENT; + struct cc_algo *funcs; + int found = 0; /* Remove algo from cc_list so that new connections can't use it. */ CC_LIST_WLOCK(); - STAILQ_FOREACH_SAFE(funcs, &cc_list, entries, tmpfuncs) { - if (funcs == remove_cc) { - if (cc_check_default(remove_cc)) { - CC_LIST_WUNLOCK(); - return(EBUSY); - } - break; - } + + /* This is unlikely to fail */ + STAILQ_FOREACH(funcs, &cc_list, entries) { + if (funcs == remove_cc) + found = 1; } - remove_cc->flags |= CC_MODULE_BEING_REMOVED; - CC_LIST_WUNLOCK(); - err = tcp_ccalgounload(remove_cc); - /* - * Now back through and we either remove the temp flag - * or pull the registration. - */ - CC_LIST_WLOCK(); - STAILQ_FOREACH_SAFE(funcs, &cc_list, entries, tmpfuncs) { - if (funcs == remove_cc) { - if (err == 0) - STAILQ_REMOVE(&cc_list, funcs, cc_algo, entries); - else - funcs->flags &= ~CC_MODULE_BEING_REMOVED; - break; - } + if (found == 0) { + /* Nothing to remove? */ + CC_LIST_WUNLOCK(); + return (ENOENT); + } + /* We assert it should have been MOD_QUIESCE'd */ + KASSERT((remove_cc->flags & CC_MODULE_BEING_REMOVED), + ("remove_cc:%p does not have CC_MODULE_BEING_REMOVED flag", remove_cc)); + if (cc_check_default(remove_cc)) { + CC_LIST_WUNLOCK(); + return(EBUSY); + } + if (remove_cc->cc_refcount != 0) { + CC_LIST_WUNLOCK(); + return (EBUSY); } + STAILQ_REMOVE(&cc_list, remove_cc, cc_algo, entries); CC_LIST_WUNLOCK(); - return (err); + return (0); } /* @@ -304,6 +339,9 @@ cc_register_algo(struct cc_algo *add_cc) break; } } + /* Init its reference count */ + if (err == 0) + refcount_init(&add_cc->cc_refcount, 0); /* * The first loaded congestion control module will become * the default until we find the "CC_DEFAULT" defined in @@ -526,6 +564,20 @@ newreno_cc_ack_received(struct cc_var *ccv, uint16_t type) } } +static int +cc_stop_new_assignments(struct cc_algo *algo) +{ + CC_LIST_WLOCK(); + if (cc_check_default(algo)) { + /* A default cannot be removed */ + CC_LIST_WUNLOCK(); + return (EBUSY); + } + algo->flags |= CC_MODULE_BEING_REMOVED; + CC_LIST_WUNLOCK(); + return (0); +} + /* * Handles kld related events. Returns 0 on success, non-zero on failure. */ @@ -555,16 +607,32 @@ cc_modevent(module_t mod, int event_type, void *data) err = cc_register_algo(algo); break; - case MOD_QUIESCE: case MOD_SHUTDOWN: + break; + case MOD_QUIESCE: + /* Stop any new assigments */ + err = cc_stop_new_assignments(algo); + break; case MOD_UNLOAD: + /* + * Deregister and remove the module from the list + */ + CC_LIST_WLOCK(); + /* Even with -f we can't unload if its the default */ + if (cc_check_default(algo)) { + /* A default cannot be removed */ + CC_LIST_WUNLOCK(); + return (EBUSY); + } + /* + * If -f was used and users are still attached to + * the algorithm things are going to go boom. + */ err = cc_deregister_algo(algo); - if (!err && algo->mod_destroy != NULL) + if ((err == 0) && (algo->mod_destroy != NULL)) { algo->mod_destroy(); - if (err == ENOENT) - err = 0; + } break; - default: err = EINVAL; break; diff --git a/sys/netinet/cc/cc.h b/sys/netinet/cc/cc.h index 6f942da7aa83..1906207bddb4 100644 --- a/sys/netinet/cc/cc.h +++ b/sys/netinet/cc/cc.h @@ -200,6 +200,7 @@ struct cc_algo { int (*ctl_output)(struct cc_var *, struct sockopt *, void *); STAILQ_ENTRY (cc_algo) entries; + u_int cc_refcount; uint8_t flags; }; @@ -236,5 +237,15 @@ void newreno_cc_after_idle(struct cc_var *); void newreno_cc_cong_signal(struct cc_var *, uint32_t ); void newreno_cc_ack_received(struct cc_var *, uint16_t); +/* Called to temporarily keep an algo from going away during change */ +void cc_refer(struct cc_algo *algo); +/* Called to release the temporary hold */ +void cc_release(struct cc_algo *algo); + +/* Called to attach a CC algorithm to a tcpcb */ +void cc_attach(struct tcpcb *, struct cc_algo *); +/* Called to detach a CC algorithm from a tcpcb */ +void cc_detach(struct tcpcb *); + #endif /* _KERNEL */ #endif /* _NETINET_CC_CC_H_ */ diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c index 298ae38d5b27..95c34c581e59 100644 --- a/sys/netinet/tcp_subr.c +++ b/sys/netinet/tcp_subr.c @@ -2171,10 +2171,7 @@ tcp_newtcpcb(struct inpcb *inp) /* * Use the current system default CC algorithm. */ - CC_LIST_RLOCK(); - KASSERT(!STAILQ_EMPTY(&cc_list), ("cc_list is empty!")); - CC_ALGO(tp) = CC_DEFAULT_ALGO(); - CC_LIST_RUNLOCK(); + cc_attach(tp, CC_DEFAULT_ALGO()); /* * The tcpcb will hold a reference on its inpcb until tcp_discardcb() @@ -2185,6 +2182,7 @@ tcp_newtcpcb(struct inpcb *inp) if (CC_ALGO(tp)->cb_init != NULL) if (CC_ALGO(tp)->cb_init(tp->ccv, NULL) > 0) { + cc_detach(tp); if (tp->t_fb->tfb_tcp_fb_fini) (*tp->t_fb->tfb_tcp_fb_fini)(tp, 1); in_pcbrele_wlocked(inp); @@ -2276,93 +2274,6 @@ tcp_newtcpcb(struct inpcb *inp) return (tp); /* XXX */ } -/* - * Switch the congestion control algorithm back to Vnet default for any active - * control blocks using an algorithm which is about to go away. If the algorithm - * has a cb_init function and it fails (no memory) then the operation fails and - * the unload will not succeed. - * - */ -int -tcp_ccalgounload(struct cc_algo *unload_algo) -{ - struct cc_algo *oldalgo, *newalgo; - struct inpcb *inp; - struct tcpcb *tp; - VNET_ITERATOR_DECL(vnet_iter); - - /* - * Check all active control blocks across all network stacks and change - * any that are using "unload_algo" back to its default. If "unload_algo" - * requires cleanup code to be run, call it. - */ - VNET_LIST_RLOCK(); - VNET_FOREACH(vnet_iter) { - CURVNET_SET(vnet_iter); - struct inpcb_iterator inpi = INP_ALL_ITERATOR(&V_tcbinfo, - INPLOOKUP_WLOCKPCB); - /* - * XXXGL: would new accept(2)d connections use algo being - * unloaded? - */ - newalgo = CC_DEFAULT_ALGO(); - while ((inp = inp_next(&inpi)) != NULL) { - /* Important to skip tcptw structs. */ - if (!(inp->inp_flags & INP_TIMEWAIT) && - (tp = intotcpcb(inp)) != NULL) { - /* - * By holding INP_WLOCK here, we are assured - * that the connection is not currently - * executing inside the CC module's functions. - * We attempt to switch to the Vnets default, - * if the init fails then we fail the whole - * operation and the module unload will fail. - */ - if (CC_ALGO(tp) == unload_algo) { - struct cc_var cc_mem; - int err; - - oldalgo = CC_ALGO(tp); - memset(&cc_mem, 0, sizeof(cc_mem)); - cc_mem.ccvc.tcp = tp; - if (newalgo->cb_init == NULL) { - /* - * No init we can skip the - * dance around a possible failure. - */ - CC_DATA(tp) = NULL; - goto proceed; - } - err = (newalgo->cb_init)(&cc_mem, NULL); - if (err) { - /* - * Presumably no memory the caller will - * need to try again. - */ - INP_WUNLOCK(inp); - CURVNET_RESTORE(); - VNET_LIST_RUNLOCK(); - return (err); - } -proceed: - if (oldalgo->cb_destroy != NULL) - oldalgo->cb_destroy(tp->ccv); - CC_ALGO(tp) = newalgo; - memcpy(tp->ccv, &cc_mem, sizeof(struct cc_var)); - if (TCPS_HAVEESTABLISHED(tp->t_state) && - (CC_ALGO(tp)->conn_init != NULL)) { - /* Yep run the connection init for the new CC */ - CC_ALGO(tp)->conn_init(tp->ccv); - } - } - } - } - CURVNET_RESTORE(); - } - VNET_LIST_RUNLOCK(); - return (0); -} - /* * Drop a TCP connection, reporting * the specified error. If connection is synchronized, @@ -2443,6 +2354,8 @@ tcp_discardcb(struct tcpcb *tp) if (CC_ALGO(tp)->cb_destroy != NULL) CC_ALGO(tp)->cb_destroy(tp->ccv); CC_DATA(tp) = NULL; + /* Detach from the CC algorithm */ + cc_detach(tp); #ifdef TCP_HHOOK khelp_destroy_osd(tp->osd); diff --git a/sys/netinet/tcp_usrreq.c b/sys/netinet/tcp_usrreq.c index ac9e79aed917..8c8cba8ea5ad 100644 --- a/sys/netinet/tcp_usrreq.c +++ b/sys/netinet/tcp_usrreq.c @@ -2004,7 +2004,7 @@ copyin_tls_enable(struct sockopt *sopt, struct tls_enable *tls) extern struct cc_algo newreno_cc_algo; static int -tcp_congestion(struct inpcb *inp, struct sockopt *sopt) +tcp_set_cc_mod(struct inpcb *inp, struct sockopt *sopt) { struct cc_algo *algo; void *ptr = NULL; @@ -2020,7 +2020,7 @@ tcp_congestion(struct inpcb *inp, struct sockopt *sopt) return(error); buf[sopt->sopt_valsize] = '\0'; CC_LIST_RLOCK(); - STAILQ_FOREACH(algo, &cc_list, entries) + STAILQ_FOREACH(algo, &cc_list, entries) { if (strncmp(buf, algo->name, TCP_CA_NAME_MAX) == 0) { if (algo->flags & CC_MODULE_BEING_REMOVED) { @@ -2029,30 +2029,24 @@ tcp_congestion(struct inpcb *inp, struct sockopt *sopt) } break; } + } if (algo == NULL) { CC_LIST_RUNLOCK(); return(ESRCH); } -do_over: + /* + * With a reference the algorithm cannot be removed + * so we hold a reference through the change process. + */ + cc_refer(algo); + CC_LIST_RUNLOCK(); if (algo->cb_init != NULL) { /* We can now pre-get the memory for the CC */ mem_sz = (*algo->cc_data_sz)(); if (mem_sz == 0) { goto no_mem_needed; } - CC_LIST_RUNLOCK(); ptr = malloc(mem_sz, M_CC_MEM, M_WAITOK); - CC_LIST_RLOCK(); - STAILQ_FOREACH(algo, &cc_list, entries) - if (strncmp(buf, algo->name, - TCP_CA_NAME_MAX) == 0) - break; - if (algo == NULL) { - if (ptr) - free(ptr, M_CC_MEM); - CC_LIST_RUNLOCK(); - return(ESRCH); - } } else { no_mem_needed: mem_sz = 0; @@ -2063,22 +2057,20 @@ no_mem_needed: * back the inplock. */ memset(&cc_mem, 0, sizeof(cc_mem)); - if (mem_sz != (*algo->cc_data_sz)()) { - if (ptr) - free(ptr, M_CC_MEM); - goto do_over; - } INP_WLOCK(inp); if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { INP_WUNLOCK(inp); + if (ptr) + free(ptr, M_CC_MEM); + /* Release our temp reference */ + CC_LIST_RLOCK(); + cc_release(algo); CC_LIST_RUNLOCK(); - free(ptr, M_CC_MEM); return (ECONNRESET); } tp = intotcpcb(inp); if (ptr != NULL) memset(ptr, 0, mem_sz); - CC_LIST_RUNLOCK(); cc_mem.ccvc.tcp = tp; /* * We once again hold a write lock over the tcb so it's @@ -2103,8 +2095,12 @@ no_mem_needed: */ if (CC_ALGO(tp)->cb_destroy != NULL) CC_ALGO(tp)->cb_destroy(tp->ccv); + /* Detach the old CC from the tcpcb */ + cc_detach(tp); + /* Copy in our temp memory that was inited */ memcpy(tp->ccv, &cc_mem, sizeof(struct cc_var)); - tp->cc_algo = algo; + /* Now attach the new, which takes a reference */ + cc_attach(tp, algo); /* Ok now are we where we have gotten past any conn_init? */ if (TCPS_HAVEESTABLISHED(tp->t_state) && (CC_ALGO(tp)->conn_init != NULL)) { /* Yep run the connection init for the new CC */ @@ -2113,6 +2109,10 @@ no_mem_needed: } else if (ptr) free(ptr, M_CC_MEM); INP_WUNLOCK(inp); + /* Now lets release our temp reference */ + CC_LIST_RLOCK(); + cc_release(algo); + CC_LIST_RUNLOCK(); return (error); } @@ -2336,7 +2336,7 @@ unlock_and_done: break; case TCP_CONGESTION: - error = tcp_congestion(inp, sopt); + error = tcp_set_cc_mod(inp, sopt); break; case TCP_REUSPORT_LB_NUMA: diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h index 6b1f79a0b9ed..b9d37471771c 100644 --- a/sys/netinet/tcp_var.h +++ b/sys/netinet/tcp_var.h @@ -1076,7 +1076,6 @@ VNET_DECLARE(struct hhook_head *, tcp_hhh[HHOOK_TCP_LAST + 1]); #endif int tcp_addoptions(struct tcpopt *, u_char *); -int tcp_ccalgounload(struct cc_algo *unload_algo); struct tcpcb * tcp_close(struct tcpcb *); void tcp_discardcb(struct tcpcb *);