Re: git: d3f96f661050 - main - Fix O(n^2) behavior in sysctl
- Reply: Alan Somers : "Re: git: d3f96f661050 - main - Fix O(n^2) behavior in sysctl"
- Reply: Andriy Gapon : "Re: git: d3f96f661050 - main - Fix O(n^2) behavior in sysctl"
- In reply to: Alan Somers : "git: d3f96f661050 - main - Fix O(n^2) behavior in sysctl"
- Go to: [ bottom of page ] [ top of archives ] [ this month ]
Date: Wed, 28 Sep 2022 16:18:57 UTC
This also brings a question as to whether sysctl is the right interface to pull this data from the kernel in the first place? From my somewhat ignorant look this approach is likely to be poised with all sorts of race conditions, such so if configuration changes while you are pulling it out you'd get some inconsistent view that is not here not there. Wouldn't it be easier to use some other mechanism to pull configuration of all 1,000 datasets as one blob in one or few system calls? Like read(2) from /dev/zfsstats or something like that? Then you can iterate over it as much as you need in userland. -Max On Tue, Sep 27, 2022, 3:04 AM Alan Somers <asomers@freebsd.org> wrote: > The branch main has been updated by asomers: > > URL: > https://cgit.FreeBSD.org/src/commit/?id=d3f96f661050e9bd21fe29931992a8b9e67ff189 > > commit d3f96f661050e9bd21fe29931992a8b9e67ff189 > Author: Alan Somers <asomers@FreeBSD.org> > AuthorDate: 2022-09-07 14:12:49 +0000 > Commit: Alan Somers <asomers@FreeBSD.org> > CommitDate: 2022-09-27 00:03:34 +0000 > > Fix O(n^2) behavior in sysctl > > Sysctl OIDs were internally stored in linked lists, triggering O(n^2) > behavior when userland iterates over many of them. The slowdown is > noticeable for MIBs that have > 100 children (for example, vm.uma). > But > it's unignorable for kstat.zfs when a pool has > 1000 datasets. > > Convert the linked lists into RB trees. This produces a ~25x speedup > for listing kstat.zfs with 4100 datasets, and no measurable penalty for > small dataset counts. > > Bump __FreeBSD_version for the KPI change. > > Sponsored by: Axcient > Reviewed by: mjg > Differential Revision: https://reviews.freebsd.org/D36500 > --- > sys/compat/linuxkpi/common/include/linux/sysfs.h | 2 +- > sys/kern/kern_sysctl.c | 149 > +++++++++++------------ > sys/kern/vfs_init.c | 2 +- > sys/sys/param.h | 2 +- > sys/sys/sysctl.h | 31 ++++- > 5 files changed, 99 insertions(+), 87 deletions(-) > > diff --git a/sys/compat/linuxkpi/common/include/linux/sysfs.h > b/sys/compat/linuxkpi/common/include/linux/sysfs.h > index 0b6b479d9362..881a72e62ed9 100644 > --- a/sys/compat/linuxkpi/common/include/linux/sysfs.h > +++ b/sys/compat/linuxkpi/common/include/linux/sysfs.h > @@ -246,7 +246,7 @@ sysfs_unmerge_group(struct kobject *kobj, const struct > attribute_group *grp) > struct attribute **attr; > struct sysctl_oid *oidp; > > - SLIST_FOREACH(oidp, SYSCTL_CHILDREN(kobj->oidp), oid_link) { > + RB_FOREACH(oidp, sysctl_oid_list, SYSCTL_CHILDREN(kobj->oidp)) { > if (strcmp(oidp->oid_name, grp->name) != 0) > continue; > for (attr = grp->attrs; *attr != NULL; attr++) { > diff --git a/sys/kern/kern_sysctl.c b/sys/kern/kern_sysctl.c > index 9bc595f111cc..e1cd6ea4bd61 100644 > --- a/sys/kern/kern_sysctl.c > +++ b/sys/kern/kern_sysctl.c > @@ -84,6 +84,8 @@ static MALLOC_DEFINE(M_SYSCTL, "sysctl", "sysctl > internal magic"); > static MALLOC_DEFINE(M_SYSCTLOID, "sysctloid", "sysctl dynamic oids"); > static MALLOC_DEFINE(M_SYSCTLTMP, "sysctltmp", "sysctl temp output > buffer"); > > +RB_GENERATE(sysctl_oid_list, sysctl_oid, oid_link, cmp_sysctl_oid); > + > /* > * The sysctllock protects the MIB tree. It also protects sysctl > * contexts used with dynamic sysctls. The sysctl_register_oid() and > @@ -120,7 +122,7 @@ static struct sx sysctlstringlock; > static int sysctl_root(SYSCTL_HANDLER_ARGS); > > /* Root list */ > -struct sysctl_oid_list sysctl__children = > SLIST_HEAD_INITIALIZER(&sysctl__children); > +struct sysctl_oid_list sysctl__children = > RB_INITIALIZER(&sysctl__children); > > static char* sysctl_escape_name(const char*); > static int sysctl_remove_oid_locked(struct sysctl_oid *oidp, int del, > @@ -134,7 +136,7 @@ sysctl_find_oidname(const char *name, struct > sysctl_oid_list *list) > struct sysctl_oid *oidp; > > SYSCTL_ASSERT_LOCKED(); > - SLIST_FOREACH(oidp, list, oid_link) { > + RB_FOREACH(oidp, sysctl_oid_list, list) { > if (strcmp(oidp->oid_name, name) == 0) { > return (oidp); > } > @@ -356,11 +358,14 @@ sysctl_search_oid(struct sysctl_oid **nodes, struct > sysctl_oid *needle) > indx = 0; > while (indx < CTL_MAXNAME && indx >= 0) { > if (nodes[indx] == NULL && indx == 0) > - nodes[indx] = SLIST_FIRST(&sysctl__children); > + nodes[indx] = RB_MIN(sysctl_oid_list, > + &sysctl__children); > else if (nodes[indx] == NULL) > - nodes[indx] = SLIST_FIRST(&nodes[indx - > 1]->oid_children); > + nodes[indx] = RB_MIN(sysctl_oid_list, > + &nodes[indx - 1]->oid_children); > else > - nodes[indx] = SLIST_NEXT(nodes[indx], oid_link); > + nodes[indx] = RB_NEXT(sysctl_oid_list, > + &nodes[indx - 1]->oid_children, nodes[indx]); > > if (nodes[indx] == needle) > return (indx + 1); > @@ -425,8 +430,7 @@ void > sysctl_register_oid(struct sysctl_oid *oidp) > { > struct sysctl_oid_list *parent = oidp->oid_parent; > - struct sysctl_oid *p; > - struct sysctl_oid *q; > + struct sysctl_oid *p, key; > int oid_number; > int timeout = 2; > > @@ -476,25 +480,21 @@ sysctl_register_oid(struct sysctl_oid *oidp) > * Insert the OID into the parent's list sorted by OID number. > */ > retry: > - q = NULL; > - SLIST_FOREACH(p, parent, oid_link) { > - /* check if the current OID number is in use */ > - if (oid_number == p->oid_number) { > - /* get the next valid OID number */ > - if (oid_number < CTL_AUTO_START || > - oid_number == 0x7fffffff) { > - /* wraparound - restart */ > - oid_number = CTL_AUTO_START; > - /* don't loop forever */ > - if (!timeout--) > - panic("sysctl: Out of OID > numbers\n"); > - goto retry; > - } else { > - oid_number++; > - } > - } else if (oid_number < p->oid_number) > - break; > - q = p; > + key.oid_number = oid_number; > + p = RB_FIND(sysctl_oid_list, parent, &key); > + if (p) { > + /* get the next valid OID number */ > + if (oid_number < CTL_AUTO_START || > + oid_number == 0x7fffffff) { > + /* wraparound - restart */ > + oid_number = CTL_AUTO_START; > + /* don't loop forever */ > + if (!timeout--) > + panic("sysctl: Out of OID numbers\n"); > + goto retry; > + } else { > + oid_number++; > + } > } > /* check for non-auto OID number collision */ > if (oidp->oid_number >= 0 && oidp->oid_number < CTL_AUTO_START && > @@ -504,10 +504,7 @@ retry: > } > /* update the OID number, if any */ > oidp->oid_number = oid_number; > - if (q != NULL) > - SLIST_INSERT_AFTER(q, oidp, oid_link); > - else > - SLIST_INSERT_HEAD(parent, oidp, oid_link); > + RB_INSERT(sysctl_oid_list, parent, oidp); > > if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE && > #ifdef VIMAGE > @@ -556,7 +553,6 @@ sysctl_enable_oid(struct sysctl_oid *oidp) > void > sysctl_unregister_oid(struct sysctl_oid *oidp) > { > - struct sysctl_oid *p; > int error; > > SYSCTL_ASSERT_WLOCKED(); > @@ -564,14 +560,8 @@ sysctl_unregister_oid(struct sysctl_oid *oidp) > error = EINVAL; > } else { > error = ENOENT; > - SLIST_FOREACH(p, oidp->oid_parent, oid_link) { > - if (p == oidp) { > - SLIST_REMOVE(oidp->oid_parent, oidp, > - sysctl_oid, oid_link); > - error = 0; > - break; > - } > - } > + if (RB_REMOVE(sysctl_oid_list, oidp->oid_parent, oidp)) > + error = 0; > } > > /* > @@ -732,17 +722,14 @@ int > sysctl_remove_name(struct sysctl_oid *parent, const char *name, > int del, int recurse) > { > - struct sysctl_oid *p, *tmp; > + struct sysctl_oid *p; > int error; > > error = ENOENT; > SYSCTL_WLOCK(); > - SLIST_FOREACH_SAFE(p, SYSCTL_CHILDREN(parent), oid_link, tmp) { > - if (strcmp(p->oid_name, name) == 0) { > - error = sysctl_remove_oid_locked(p, del, recurse); > - break; > - } > - } > + p = sysctl_find_oidname(name, &parent->oid_children); > + if (p) > + error = sysctl_remove_oid_locked(p, del, recurse); > SYSCTL_WUNLOCK(); > > return (error); > @@ -811,14 +798,16 @@ sysctl_remove_oid_locked(struct sysctl_oid *oidp, > int del, int recurse) > */ > if ((oidp->oid_kind & CTLTYPE) == CTLTYPE_NODE) { > if (oidp->oid_refcnt == 1) { > - SLIST_FOREACH_SAFE(p, > - SYSCTL_CHILDREN(oidp), oid_link, tmp) { > + for(p = RB_MIN(sysctl_oid_list, > &oidp->oid_children); > + p != NULL; p = tmp) { > if (!recurse) { > printf("Warning: failed attempt to > " > "remove oid %s with child > %s\n", > oidp->oid_name, p->oid_name); > return (ENOTEMPTY); > } > + tmp = RB_NEXT(sysctl_oid_list, > + &oidp->oid_children, p); > error = sysctl_remove_oid_locked(p, del, > recurse); > if (error) > @@ -895,7 +884,7 @@ sysctl_add_oid(struct sysctl_ctx_list *clist, struct > sysctl_oid_list *parent, > } > oidp = malloc(sizeof(struct sysctl_oid), M_SYSCTLOID, > M_WAITOK|M_ZERO); > oidp->oid_parent = parent; > - SLIST_INIT(&oidp->oid_children); > + RB_INIT(&oidp->oid_children); > oidp->oid_number = number; > oidp->oid_refcnt = 1; > oidp->oid_name = escaped; > @@ -1016,7 +1005,7 @@ sysctl_sysctl_debug_dump_node(struct sysctl_oid_list > *l, int i) > struct sysctl_oid *oidp; > > SYSCTL_ASSERT_LOCKED(); > - SLIST_FOREACH(oidp, l, oid_link) { > + RB_FOREACH(oidp, sysctl_oid_list, l) { > for (k=0; k<i; k++) > printf(" "); > > @@ -1081,7 +1070,7 @@ sysctl_sysctl_name(SYSCTL_HANDLER_ARGS) > int *name = (int *) arg1; > u_int namelen = arg2; > int error; > - struct sysctl_oid *oid; > + struct sysctl_oid *oid, key; > struct sysctl_oid_list *lsp = &sysctl__children, *lsp2; > struct rm_priotracker tracker; > char buf[10]; > @@ -1105,10 +1094,9 @@ sysctl_sysctl_name(SYSCTL_HANDLER_ARGS) > continue; > } > lsp2 = NULL; > - SLIST_FOREACH(oid, lsp, oid_link) { > - if (oid->oid_number != *name) > - continue; > - > + key.oid_number = *name; > + oid = RB_FIND(sysctl_oid_list, lsp, &key); > + if (oid) { > if (req->oldidx) > error = SYSCTL_OUT(req, ".", 1); > if (!error) > @@ -1120,14 +1108,9 @@ sysctl_sysctl_name(SYSCTL_HANDLER_ARGS) > namelen--; > name++; > > - if ((oid->oid_kind & CTLTYPE) != CTLTYPE_NODE) > - break; > - > - if (oid->oid_handler) > - break; > - > - lsp2 = SYSCTL_CHILDREN(oid); > - break; > + if ((oid->oid_kind & CTLTYPE) == CTLTYPE_NODE && > + !oid->oid_handler) > + lsp2 = SYSCTL_CHILDREN(oid); > } > lsp = lsp2; > } > @@ -1239,13 +1222,25 @@ static bool > sysctl_sysctl_next_action(struct sysctl_oid_list *lsp, int *name, u_int > namelen, > int *next, int *len, int level, bool honor_skip) > { > - struct sysctl_oid *oidp; > + struct sysctl_oid_list *next_lsp; > + struct sysctl_oid *oidp = NULL, key; > bool success = false; > enum sysctl_iter_action action; > > SYSCTL_ASSERT_LOCKED(); > - SLIST_FOREACH(oidp, lsp, oid_link) { > - action = sysctl_sysctl_next_node(oidp, name, namelen, > honor_skip); > + /* > + * Start the search at the requested oid. But if not found, then > scan > + * through all children. > + */ > + if (namelen > 0) { > + key.oid_number = *name; > + oidp = RB_FIND(sysctl_oid_list, lsp, &key); > + } > + if (!oidp) > + oidp = RB_MIN(sysctl_oid_list, lsp); > + for(; oidp != NULL; oidp = RB_NEXT(sysctl_oid_list, lsp, oidp)) { > + action = sysctl_sysctl_next_node(oidp, name, namelen, > + honor_skip); > if (action == ITER_SIBLINGS) > continue; > if (action == ITER_FOUND) { > @@ -1254,13 +1249,13 @@ sysctl_sysctl_next_action(struct sysctl_oid_list > *lsp, int *name, u_int namelen, > } > KASSERT((action== ITER_CHILDREN), > ("ret(%d)!=ITER_CHILDREN", action)); > > - lsp = SYSCTL_CHILDREN(oidp); > + next_lsp = SYSCTL_CHILDREN(oidp); > if (namelen == 0) { > - success = sysctl_sysctl_next_action(lsp, NULL, 0, > + success = sysctl_sysctl_next_action(next_lsp, > NULL, 0, > next + 1, len, level + 1, honor_skip); > } else { > - success = sysctl_sysctl_next_action(lsp, name + 1, > namelen - 1, > - next + 1, len, level + 1, honor_skip); > + success = sysctl_sysctl_next_action(next_lsp, name > + 1, > + namelen - 1, next + 1, len, level + 1, > honor_skip); > if (!success) { > > /* > @@ -1332,13 +1327,12 @@ name2oid(char *name, int *oid, int *len, struct > sysctl_oid **oidpp) > for (*len = 0; *len < CTL_MAXNAME;) { > p = strsep(&name, "."); > > - oidp = SLIST_FIRST(lsp); > - for (;; oidp = SLIST_NEXT(oidp, oid_link)) { > - if (oidp == NULL) > - return (ENOENT); > + RB_FOREACH(oidp, sysctl_oid_list, lsp) { > if (strcmp(p, oidp->oid_name) == 0) > break; > } > + if (oidp == NULL) > + return (ENOENT); > *oid++ = oidp->oid_number; > (*len)++; > > @@ -2162,16 +2156,15 @@ sysctl_find_oid(int *name, u_int namelen, struct > sysctl_oid **noid, > { > struct sysctl_oid_list *lsp; > struct sysctl_oid *oid; > + struct sysctl_oid key; > int indx; > > SYSCTL_ASSERT_LOCKED(); > lsp = &sysctl__children; > indx = 0; > while (indx < CTL_MAXNAME) { > - SLIST_FOREACH(oid, lsp, oid_link) { > - if (oid->oid_number == name[indx]) > - break; > - } > + key.oid_number = name[indx]; > + oid = RB_FIND(sysctl_oid_list, lsp, &key); > if (oid == NULL) > return (ENOENT); > > diff --git a/sys/kern/vfs_init.c b/sys/kern/vfs_init.c > index 6572a8e362c2..6e2e78aaf597 100644 > --- a/sys/kern/vfs_init.c > +++ b/sys/kern/vfs_init.c > @@ -525,7 +525,7 @@ vfs_register(struct vfsconf *vfc) > * number. > */ > sysctl_wlock(); > - SLIST_FOREACH(oidp, SYSCTL_CHILDREN(&sysctl___vfs), oid_link) { > + RB_FOREACH(oidp, sysctl_oid_list, SYSCTL_CHILDREN(&sysctl___vfs)) { > if (strcmp(oidp->oid_name, vfc->vfc_name) == 0) { > sysctl_unregister_oid(oidp); > oidp->oid_number = vfc->vfc_typenum; > diff --git a/sys/sys/param.h b/sys/sys/param.h > index f875d839d41f..3f5da06ef951 100644 > --- a/sys/sys/param.h > +++ b/sys/sys/param.h > @@ -76,7 +76,7 @@ > * cannot include sys/param.h and should only be updated here. > */ > #undef __FreeBSD_version > -#define __FreeBSD_version 1400070 > +#define __FreeBSD_version 1400071 > > /* > * __FreeBSD_kernel__ indicates that this system uses the kernel of > FreeBSD, > diff --git a/sys/sys/sysctl.h b/sys/sys/sysctl.h > index 451d83bbe125..3bd77cf87243 100644 > --- a/sys/sys/sysctl.h > +++ b/sys/sys/sysctl.h > @@ -39,7 +39,8 @@ > #define _SYS_SYSCTL_H_ > > #ifdef _KERNEL > -#include <sys/queue.h> > +#include <sys/tree.h> > +#include <sys/systm.h> > #endif > > /* > @@ -173,20 +174,25 @@ struct sysctl_req { > int flags; > }; > > -SLIST_HEAD(sysctl_oid_list, sysctl_oid); > +struct sysctl_oid; > + > +/* RB Tree handling */ > +RB_HEAD(sysctl_oid_list, sysctl_oid); > > /* > * This describes one "oid" in the MIB tree. Potentially more nodes can > * be hidden behind it, expanded by the handler. > */ > struct sysctl_oid { > - struct sysctl_oid_list oid_children; > - struct sysctl_oid_list *oid_parent; > - SLIST_ENTRY(sysctl_oid) oid_link; > + struct sysctl_oid_list oid_children; > + struct sysctl_oid_list* oid_parent; > + RB_ENTRY(sysctl_oid) oid_link; > + /* Sort key for all siblings, and lookup key for userland */ > int oid_number; > u_int oid_kind; > void *oid_arg1; > intmax_t oid_arg2; > + /* Must be unique amongst all siblings. */ > const char *oid_name; > int (*oid_handler)(SYSCTL_HANDLER_ARGS); > const char *oid_fmt; > @@ -196,6 +202,19 @@ struct sysctl_oid { > const char *oid_label; > }; > > +static inline int > +cmp_sysctl_oid(struct sysctl_oid *a, struct sysctl_oid *b) > +{ > + if (a->oid_number > b->oid_number) > + return (1); > + else if (a->oid_number < b->oid_number) > + return (-1); > + else > + return (0); > +} > + > +RB_PROTOTYPE(sysctl_oid_list, sysctl_oid, oid_link, cmp_sysctl_oid); > + > #define SYSCTL_IN(r, p, l) (r->newfunc)(r, p, l) > #define SYSCTL_OUT(r, p, l) (r->oldfunc)(r, p, l) > #define SYSCTL_OUT_STR(r, p) (r->oldfunc)(r, p, strlen(p) + 1) > @@ -275,7 +294,7 @@ TAILQ_HEAD(sysctl_ctx_list, sysctl_ctx_entry); > #define SYSCTL_OID_RAW(id, parent_child_head, nbr, name, kind, a1, > a2, handler, fmt, descr, label) \ > struct sysctl_oid id = { \ > .oid_parent = (parent_child_head), \ > - .oid_children = SLIST_HEAD_INITIALIZER(&id.oid_children), \ > + .oid_children = RB_INITIALIZER(&id.oid_children), \ > .oid_number = (nbr), \ > .oid_kind = (kind), \ > .oid_arg1 = (a1), \ > >