From nobody Sat Oct 09 11:36:39 2021 X-Original-To: dev-commits-src-main@mlmmj.nyi.freebsd.org Received: from mx1.freebsd.org (mx1.freebsd.org [IPv6:2610:1c1:1:606c::19:1]) by mlmmj.nyi.freebsd.org (Postfix) with ESMTP id 2103A17F70A9; Sat, 9 Oct 2021 11:36:40 +0000 (UTC) (envelope-from git@FreeBSD.org) Received: from mxrelay.nyi.freebsd.org (mxrelay.nyi.freebsd.org [IPv6:2610:1c1:1:606c::19:3]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange X25519 server-signature RSA-PSS (4096 bits) server-digest SHA256 client-signature RSA-PSS (4096 bits) client-digest SHA256) (Client CN "mxrelay.nyi.freebsd.org", Issuer "R3" (verified OK)) by mx1.freebsd.org (Postfix) with ESMTPS id 4HRNLc0QYbz4d2R; Sat, 9 Oct 2021 11:36:40 +0000 (UTC) (envelope-from git@FreeBSD.org) Received: from gitrepo.freebsd.org (gitrepo.freebsd.org [IPv6:2610:1c1:1:6068::e6a:5]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange X25519 server-signature RSA-PSS (4096 bits) server-digest SHA256) (Client did not present a certificate) by mxrelay.nyi.freebsd.org (Postfix) with ESMTPS id E281819B6A; Sat, 9 Oct 2021 11:36:39 +0000 (UTC) (envelope-from git@FreeBSD.org) Received: from gitrepo.freebsd.org ([127.0.1.44]) by gitrepo.freebsd.org (8.16.1/8.16.1) with ESMTP id 199Bad0S060124; Sat, 9 Oct 2021 11:36:39 GMT (envelope-from git@gitrepo.freebsd.org) Received: (from git@localhost) by gitrepo.freebsd.org (8.16.1/8.16.1/Submit) id 199Bad0K060123; Sat, 9 Oct 2021 11:36:39 GMT (envelope-from git) Date: Sat, 9 Oct 2021 11:36:39 GMT Message-Id: <202110091136.199Bad0K060123@gitrepo.freebsd.org> To: src-committers@FreeBSD.org, dev-commits-src-all@FreeBSD.org, dev-commits-src-main@FreeBSD.org From: Marko Zec Subject: git: 1549575f22d1 - main - [fib_algo][dxr] Improve incremental updating strategy List-Id: Commit messages for the main branch of the src repository List-Archive: https://lists.freebsd.org/archives/dev-commits-src-main List-Help: List-Post: List-Subscribe: List-Unsubscribe: Sender: owner-dev-commits-src-main@freebsd.org X-BeenThere: dev-commits-src-main@freebsd.org MIME-Version: 1.0 Content-Type: text/plain; charset=utf-8 Content-Transfer-Encoding: 8bit X-Git-Committer: zec X-Git-Repository: src X-Git-Refname: refs/heads/main X-Git-Reftype: branch X-Git-Commit: 1549575f22d14b3ac89a73627618a63132217460 Auto-Submitted: auto-generated X-ThisMailContainsUnwantedMimeParts: N The branch main has been updated by zec: URL: https://cgit.FreeBSD.org/src/commit/?id=1549575f22d14b3ac89a73627618a63132217460 commit 1549575f22d14b3ac89a73627618a63132217460 Author: Marko Zec AuthorDate: 2021-10-09 11:22:27 +0000 Commit: Marko Zec CommitDate: 2021-10-09 11:22:27 +0000 [fib_algo][dxr] Improve incremental updating strategy Tracking the number of unused holes in the trie and the range table was a bad metric based on which full trie and / or range rebuilds were triggered, which would happen in vain by far too frequently, particularly with live BGP feeds. Instead, track the total unused space inside the trie and range table structures, and trigger rebuilds if the percentage of unused space exceeds a sysctl-tunable threshold. MFC after: 3 days PR: 257965 --- sys/netinet/in_fib_dxr.c | 103 ++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 84 insertions(+), 19 deletions(-) diff --git a/sys/netinet/in_fib_dxr.c b/sys/netinet/in_fib_dxr.c index 6a9f414c3ab0..d832a66ee2cc 100644 --- a/sys/netinet/in_fib_dxr.c +++ b/sys/netinet/in_fib_dxr.c @@ -49,6 +49,7 @@ __FBSDID("$FreeBSD$"); #include #include +#include #include #include #include @@ -195,6 +196,7 @@ struct dxr_aux { uint32_t updates_high; uint32_t all_chunks_cnt; uint32_t unused_chunks_cnt; + uint32_t unused_chunks_size; uint32_t xtbl_size; uint32_t all_trie_cnt; uint32_t unused_trie_cnt; @@ -232,21 +234,48 @@ static MALLOC_DEFINE(M_DXRAUX, "dxr aux", "DXR auxiliary"); uma_zone_t chunk_zone; uma_zone_t trie_zone; +VNET_DEFINE_STATIC(int, frag_limit) = 100; +#define V_frag_limit VNET(frag_limit) + SYSCTL_DECL(_net_route_algo); SYSCTL_NODE(_net_route_algo, OID_AUTO, dxr, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "DXR tunables"); -VNET_DEFINE_STATIC(int, max_trie_holes) = 8; -#define V_max_trie_holes VNET(max_trie_holes) -SYSCTL_INT(_net_route_algo_dxr, OID_AUTO, max_trie_holes, - CTLFLAG_RW | CTLFLAG_VNET, &VNET_NAME(max_trie_holes), 0, - "Trie fragmentation threshold before triggering a full rebuild"); +static int +sysctl_dxr_frag_limit(SYSCTL_HANDLER_ARGS) +{ + char buf[8]; + int error, new, i; + + snprintf(buf, sizeof(buf), "%d.%02d%%", V_frag_limit / 100, + V_frag_limit % 100); + error = sysctl_handle_string(oidp, buf, sizeof(buf), req); + if (error != 0 || req->newptr == NULL) + return (error); + if (!isdigit(*buf) && *buf != '.') + return (EINVAL); + for (i = 0, new = 0; isdigit(buf[i]) && i < sizeof(buf); i++) + new = new * 10 + buf[i] - '0'; + new *= 100; + if (buf[i++] == '.') { + if (!isdigit(buf[i])) + return (EINVAL); + new += (buf[i++] - '0') * 10; + if (isdigit(buf[i])) + new += buf[i++] - '0'; + } + if (new > 1000) + return (EINVAL); + V_frag_limit = new; + snprintf(buf, sizeof(buf), "%d.%02d%%", V_frag_limit / 100, + V_frag_limit % 100); + return (0); +} -VNET_DEFINE_STATIC(int, max_range_holes) = 16; -#define V_max_range_holes VNET(max_range_holes) -SYSCTL_INT(_net_route_algo_dxr, OID_AUTO, max_range_holes, - CTLFLAG_RW | CTLFLAG_VNET, &VNET_NAME(max_range_holes), 0, - "Range table fragmentation threshold before triggering a full rebuild"); +SYSCTL_PROC(_net_route_algo_dxr, OID_AUTO, frag_limit, + CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_VNET, + 0, 0, sysctl_dxr_frag_limit, "A", + "Fragmentation threshold to full rebuild"); /* Binary search for a matching address range */ #define DXR_LOOKUP_STAGE \ @@ -424,6 +453,7 @@ chunk_ref(struct dxr_aux *da, uint32_t chunk) fdesc->base = cdp->cd_base; da->rtbl_top -= size; da->unused_chunks_cnt--; + da->unused_chunks_size -= cdp->cd_max_size; if (cdp->cd_max_size > size) { /* Split the range in two, need a new descriptor */ empty_cdp = uma_zalloc(chunk_zone, M_NOWAIT); @@ -442,6 +472,7 @@ chunk_ref(struct dxr_aux *da, uint32_t chunk) da->all_chunks_cnt++; da->unused_chunks_cnt++; + da->unused_chunks_size += empty_cdp->cd_max_size; cdp->cd_max_size = size; } LIST_REMOVE(cdp, cd_hash_le); @@ -471,9 +502,9 @@ chunk_ref(struct dxr_aux *da, uint32_t chunk) return (1); } da->rtbl_size += RTBL_SIZE_INCR; - if (da->rtbl_top >= BASE_MAX / 4) - FIB_PRINTF(LOG_WARNING, da->fd, "range table at %d%%", - da->rtbl_top * 100 / BASE_MAX); + i = (BASE_MAX - da->rtbl_top) * LOG_DEBUG / BASE_MAX; + FIB_PRINTF(i, da->fd, "range table at %d%% structural limit", + da->rtbl_top * 100 / BASE_MAX); da->range_tbl = realloc(da->range_tbl, sizeof(*da->range_tbl) * da->rtbl_size + FRAGS_PREF_SHORT, M_DXRAUX, M_NOWAIT); @@ -508,6 +539,7 @@ chunk_unref(struct dxr_aux *da, uint32_t chunk) LIST_REMOVE(cdp, cd_hash_le); da->unused_chunks_cnt++; + da->unused_chunks_size += cdp->cd_max_size; cdp->cd_cur_size = 0; /* Attempt to merge with the preceding chunk, if empty */ @@ -546,6 +578,7 @@ chunk_unref(struct dxr_aux *da, uint32_t chunk) da->all_chunks_cnt--; da->unused_chunks_cnt--; da->rtbl_top -= cdp->cd_max_size; + da->unused_chunks_size -= cdp->cd_max_size; LIST_REMOVE(cdp, cd_all_le); uma_zfree(chunk_zone, cdp); return; @@ -846,10 +879,12 @@ dxr_build(struct dxr *dxr) struct timeval t0, t1, t2, t3; uint32_t r_size, dxr_tot_size; uint32_t i, m, range_rebuild = 0; + uint32_t range_frag; #ifdef DXR2 struct trie_desc *tp; uint32_t d_tbl_size, dxr_x, d_size, x_size; uint32_t ti, trie_rebuild = 0, prev_size = 0; + uint32_t trie_frag; #endif KASSERT(dxr->d == NULL, ("dxr: d not free")); @@ -897,9 +932,10 @@ dxr_build(struct dxr *dxr) dxr->nh_tbl = fib_get_nhop_array(da->fd); fib_get_rtable_info(fib_get_rh(da->fd), &rinfo); - if (da->updates_low > da->updates_high || - da->unused_chunks_cnt > V_max_range_holes) + if (da->updates_low > da->updates_high) range_rebuild = 1; + +range_build: if (range_rebuild) { /* Bulk cleanup */ bzero(da->chunk_hashtbl, sizeof(da->chunk_hashtbl)); @@ -910,6 +946,7 @@ dxr_build(struct dxr *dxr) for (i = 0; i < UNUSED_BUCKETS; i++) LIST_INIT(&da->unused_chunks[i]); da->all_chunks_cnt = da->unused_chunks_cnt = 0; + da->unused_chunks_size = 0; da->rtbl_top = 0; da->updates_low = 0; da->updates_high = DIRECT_TBL_SIZE - 1; @@ -929,18 +966,32 @@ dxr_build(struct dxr *dxr) else if (m & 1 && update_chunk(da, i) != 0) return; } + + range_frag = 0; + if (da->rtbl_top) + range_frag = da->unused_chunks_size * 10000ULL / da->rtbl_top; + if (range_frag > V_frag_limit) { + range_rebuild = 1; + goto range_build; + } + r_size = sizeof(*da->range_tbl) * da->rtbl_top; microuptime(&t1); #ifdef DXR2 - if (range_rebuild || da->unused_trie_cnt > V_max_trie_holes || + if (range_rebuild || abs(fls(da->prefixes) - fls(da->trie_rebuilt_prefixes)) > 1) trie_rebuild = 1; + +trie_build: if (trie_rebuild) { da->trie_rebuilt_prefixes = da->prefixes; da->d_bits = DXR_D; da->updates_low = 0; da->updates_high = DIRECT_TBL_SIZE - 1; + if (!range_rebuild) + memset(da->updates_mask, 0xff, + sizeof(da->updates_mask)); } dxr2_try_squeeze: @@ -976,6 +1027,14 @@ dxr2_try_squeeze: da->d_tbl[i] = ti; } + trie_frag = 0; + if (da->all_trie_cnt) + trie_frag = da->unused_trie_cnt * 10000ULL / da->all_trie_cnt; + if (trie_frag > V_frag_limit) { + trie_rebuild = 1; + goto trie_build; + } + d_size = sizeof(*da->d_tbl) * d_tbl_size; x_size = sizeof(*da->x_tbl) * DIRECT_TBL_SIZE / d_tbl_size * da->all_trie_cnt; @@ -1035,6 +1094,15 @@ dxr2_try_squeeze: FIB_PRINTF(LOG_INFO, da->fd, "%d.%02d KBytes, %d.%02d Bytes/prefix", dxr_tot_size / 1024, dxr_tot_size * 100 / 1024 % 100, i / 100, i % 100); +#ifdef DXR2 + FIB_PRINTF(LOG_INFO, da->fd, + "%d.%02d%% trie, %d.%02d%% range fragmentation", + trie_frag / 100, trie_frag % 100, + range_frag / 100, range_frag % 100); +#else + FIB_PRINTF(LOG_INFO, da->fd, "%d.%01d%% range fragmentation", + range_frag / 100, range_frag % 100); +#endif i = (t1.tv_sec - t0.tv_sec) * 1000000 + t1.tv_usec - t0.tv_usec; FIB_PRINTF(LOG_INFO, da->fd, "range table %s in %u.%03u ms", range_rebuild ? "rebuilt" : "updated", i / 1000, i % 1000); @@ -1046,9 +1114,6 @@ dxr2_try_squeeze: i = (t3.tv_sec - t2.tv_sec) * 1000000 + t3.tv_usec - t2.tv_usec; FIB_PRINTF(LOG_INFO, da->fd, "snapshot forked in %u.%03u ms", i / 1000, i % 1000); - FIB_PRINTF(LOG_INFO, da->fd, "range table: %d%%, %d chunks, %d holes", - da->rtbl_top * 100 / BASE_MAX, da->all_chunks_cnt, - da->unused_chunks_cnt); } /*