git: f35093f8d6d8 - main - Use Linux semantics for the thread affinity syscalls.
- Go to: [ bottom of page ] [ top of archives ] [ this month ]
Date: Wed, 11 May 2022 07:41:18 UTC
The branch main has been updated by dchagin: URL: https://cgit.FreeBSD.org/src/commit/?id=f35093f8d6d8155ab2e56c11ee03d474688b16a2 commit f35093f8d6d8155ab2e56c11ee03d474688b16a2 Author: Dmitry Chagin <dchagin@FreeBSD.org> AuthorDate: 2022-05-11 07:36:01 +0000 Commit: Dmitry Chagin <dchagin@FreeBSD.org> CommitDate: 2022-05-11 07:36:01 +0000 Use Linux semantics for the thread affinity syscalls. Linux has more tolerant checks of user-supplied cpuset_t's. The minimum cpuset_t size that the Linux kernel permits for getaffinity() is the maximum CPU id present in the system divided by NBBY; the maximum size is not limited. For setaffinity(), Linux does not limit the size of the user-provided cpuset_t, internally using only the meaningful part of the set, where the upper bound is the maximum CPU id present in the system and is no larger than the size of the kernel cpuset_t. Unlike FreeBSD, Linux ignores high bits if they are set in a setaffinity() call, so clear them in sched_setaffinity() and in the Linuxulator itself. 
Reviewed by: Pau Amma (man pages) In collaboration with: jhb Differential revision: https://reviews.freebsd.org/D34849 MFC after: 2 weeks --- lib/libc/gen/sched_getaffinity.c | 27 +++----- lib/libc/gen/sched_setaffinity.c | 29 ++++++-- lib/libc/sys/cpuset_getaffinity.2 | 19 ++++-- share/man/man3/pthread_attr_affinity_np.3 | 25 +++---- sys/compat/freebsd32/freebsd32_misc.c | 2 +- sys/compat/linux/linux_misc.c | 43 ++++++++---- sys/kern/kern_cpuset.c | 109 ++++++++++++++++++++---------- sys/sys/syscallsubr.h | 2 + 8 files changed, 162 insertions(+), 94 deletions(-) diff --git a/lib/libc/gen/sched_getaffinity.c b/lib/libc/gen/sched_getaffinity.c index 7d345eb82a3b..92135109156c 100644 --- a/lib/libc/gen/sched_getaffinity.c +++ b/lib/libc/gen/sched_getaffinity.c @@ -33,24 +33,15 @@ int sched_getaffinity(pid_t pid, size_t cpusetsz, cpuset_t *cpuset) { - /* - * Be more Linux-compatible: - * - return EINVAL in passed size is less than size of cpuset_t - * in advance, instead of ERANGE from the syscall - * - if passed size is larger than the size of cpuset_t, be - * permissive by claming it back to sizeof(cpuset_t) and - * zeroing the rest. - */ - if (cpusetsz < sizeof(cpuset_t)) { + int error; + + error = cpuset_getaffinity(CPU_LEVEL_WHICH, CPU_WHICH_PID, + pid == 0 ? -1 : pid, cpusetsz, cpuset); + if (error == -1 && errno == ERANGE) errno = EINVAL; - return (-1); - } - if (cpusetsz > sizeof(cpuset_t)) { - memset((char *)cpuset + sizeof(cpuset_t), 0, - cpusetsz - sizeof(cpuset_t)); - cpusetsz = sizeof(cpuset_t); - } + if (error == 0) + return (cpusetsz < sizeof(cpuset_t) ? cpusetsz : + sizeof(cpuset_t)); - return (cpuset_getaffinity(CPU_LEVEL_WHICH, CPU_WHICH_PID, - pid == 0 ? -1 : pid, cpusetsz, cpuset)); + return (error); } diff --git a/lib/libc/gen/sched_setaffinity.c b/lib/libc/gen/sched_setaffinity.c index 09e2b9097d5a..0052521cd081 100644 --- a/lib/libc/gen/sched_setaffinity.c +++ b/lib/libc/gen/sched_setaffinity.c @@ -26,6 +26,8 @@ * SUCH DAMAGE. 
*/ +#include <sys/param.h> +#include <sys/sysctl.h> #include <errno.h> #include <sched.h> #include <string.h> @@ -33,15 +35,28 @@ int sched_setaffinity(pid_t pid, size_t cpusetsz, const cpuset_t *cpuset) { + static int mp_maxid; cpuset_t c; - int error; + int error, lbs, cpu; + size_t len, sz; - if (cpusetsz > sizeof(cpuset_t)) { - errno = EINVAL; - return (-1); - } else { - memset(&c, 0, sizeof(c)); - memcpy(&c, cpuset, cpusetsz); + sz = cpusetsz > sizeof(cpuset_t) ? sizeof(cpuset_t) : cpusetsz; + memset(&c, 0, sizeof(c)); + memcpy(&c, cpuset, sz); + + /* Linux ignores high bits */ + if (mp_maxid == 0) { + len = sizeof(mp_maxid); + error = sysctlbyname("kern.smp.maxid", &mp_maxid, &len, + NULL, 0); + if (error == -1) + return (error); + } + lbs = CPU_FLS(&c) - 1; + if (lbs > mp_maxid) { + CPU_FOREACH_ISSET(cpu, &c) + if (cpu > mp_maxid) + CPU_CLR(cpu, &c); } error = cpuset_setaffinity(CPU_LEVEL_WHICH, CPU_WHICH_PID, pid == 0 ? -1 : pid, sizeof(cpuset_t), &c); diff --git a/lib/libc/sys/cpuset_getaffinity.2 b/lib/libc/sys/cpuset_getaffinity.2 index bce9161a1880..f7ac3873a9be 100644 --- a/lib/libc/sys/cpuset_getaffinity.2 +++ b/lib/libc/sys/cpuset_getaffinity.2 @@ -25,7 +25,7 @@ .\" .\" $FreeBSD$ .\" -.Dd May 23, 2017 +.Dd April 27, 2022 .Dt CPUSET_GETAFFINITY 2 .Os .Sh NAME @@ -71,14 +71,19 @@ Masks of type are composed using the .Dv CPU_SET macros. -The kernel tolerates large sets as long as all CPUs specified -in the set exist. -Sets smaller than the kernel uses generate an error on calls to +If the user-supplied mask is not large enough to fit all of the matching CPUs, .Fn cpuset_getaffinity -even if the result set would fit within the user supplied set. +fails with +.Er ERANGE . Calls to .Fn cpuset_setaffinity -tolerate small sets with no restrictions. +tolerate masks of any size with no restrictions. +The kernel uses the meaningful part of the mask, where the upper bound is +the maximum CPU id present in the system. 
+If bits for non-existing CPUs are set, calls to +.Fn cpuset_setaffinity +fails with +.Er EINVAL . .Pp The supplied mask should have a size of .Fa setsize @@ -144,7 +149,7 @@ arguments could not be found. .It Bq Er ERANGE The .Fa cpusetsize -was either preposterously large or smaller than the kernel set size. +was smaller than needed to fit all of the matching CPUs. .It Bq Er EPERM The calling process did not have the credentials required to complete the operation. diff --git a/share/man/man3/pthread_attr_affinity_np.3 b/share/man/man3/pthread_attr_affinity_np.3 index 7b1cd3dea0d9..2c85aee9ac19 100644 --- a/share/man/man3/pthread_attr_affinity_np.3 +++ b/share/man/man3/pthread_attr_affinity_np.3 @@ -24,7 +24,7 @@ .\" .\" $FreeBSD$ .\" -.Dd October 12, 2021 +.Dd April 27, 2022 .Dt PTHREAD_ATTR_AFFINITY_NP 3 .Os .Sh NAME @@ -51,14 +51,19 @@ Masks of type are composed using the .Dv CPU_SET macros. -The kernel tolerates large sets as long as all CPUs specified -in the set exist. -Sets smaller than the kernel uses generate an error on calls to -.Fn pthread_attr_getaffinity_np -even if the result set would fit within the user supplied set. +If the user-supplied mask is not large enough to fit all of the matching CPUs, +.Fn cpuset_getaffinity +fails with +.Er ERANGE . Calls to -.Fn pthread_attr_setaffinity_np -tolerate small sets with no restrictions. +.Fn cpuset_setaffinity +tolerate masks of any size with no restrictions. +The kernel uses the meaningful part of the mask, where the upper bound is +the maximum CPU id present in the system. +If bits for non-existing CPUs are set, calls to +.Fn cpuset_setaffinity +fails with +.Er EINVAL . .Pp The supplied mask should have a size of .Fa cpusetsize @@ -119,10 +124,6 @@ or the attribute specified by it is The .Fa cpusetp specified a CPU that was outside the set supported by the kernel. -.It Bq Er ERANGE -The -.Fa cpusetsize -is too small. .It Bq Er ENOMEM Insufficient memory exists to store the cpuset mask. 
.El diff --git a/sys/compat/freebsd32/freebsd32_misc.c b/sys/compat/freebsd32/freebsd32_misc.c index 7be1dd8602a0..25c7a4e17dc7 100644 --- a/sys/compat/freebsd32/freebsd32_misc.c +++ b/sys/compat/freebsd32/freebsd32_misc.c @@ -3324,7 +3324,7 @@ freebsd32_cpuset_setaffinity(struct thread *td, struct freebsd32_cpuset_setaffinity_args *uap) { - return (kern_cpuset_setaffinity(td, uap->level, uap->which, + return (user_cpuset_setaffinity(td, uap->level, uap->which, PAIR32TO64(id_t,uap->id), uap->cpusetsize, uap->mask)); } diff --git a/sys/compat/linux/linux_misc.c b/sys/compat/linux/linux_misc.c index 54fbc9e2b938..e9d29ec57436 100644 --- a/sys/compat/linux/linux_misc.c +++ b/sys/compat/linux/linux_misc.c @@ -61,6 +61,7 @@ __FBSDID("$FreeBSD$"); #include <sys/sched.h> #include <sys/sdt.h> #include <sys/signalvar.h> +#include <sys/smp.h> #include <sys/stat.h> #include <sys/syscallsubr.h> #include <sys/sysctl.h> @@ -2256,22 +2257,22 @@ int linux_sched_getaffinity(struct thread *td, struct linux_sched_getaffinity_args *args) { - int error; struct thread *tdt; - - if (args->len < sizeof(cpuset_t)) - return (EINVAL); + int error; + id_t tid; tdt = linux_tdfind(td, args->pid, -1); if (tdt == NULL) return (ESRCH); - + tid = tdt->td_tid; PROC_UNLOCK(tdt->td_proc); error = kern_cpuset_getaffinity(td, CPU_LEVEL_WHICH, CPU_WHICH_TID, - tdt->td_tid, sizeof(cpuset_t), (cpuset_t *)args->user_mask_ptr); + tid, args->len, (cpuset_t *)args->user_mask_ptr); + if (error == ERANGE) + error = EINVAL; if (error == 0) - td->td_retval[0] = sizeof(cpuset_t); + td->td_retval[0] = min(args->len, sizeof(cpuset_t)); return (error); } @@ -2284,18 +2285,34 @@ linux_sched_setaffinity(struct thread *td, struct linux_sched_setaffinity_args *args) { struct thread *tdt; - - if (args->len < sizeof(cpuset_t)) - return (EINVAL); + cpuset_t *mask; + int cpu, error; + size_t len; + id_t tid; tdt = linux_tdfind(td, args->pid, -1); if (tdt == NULL) return (ESRCH); - + tid = tdt->td_tid; 
PROC_UNLOCK(tdt->td_proc); - return (kern_cpuset_setaffinity(td, CPU_LEVEL_WHICH, CPU_WHICH_TID, - tdt->td_tid, sizeof(cpuset_t), (cpuset_t *) args->user_mask_ptr)); + len = min(args->len, sizeof(cpuset_t)); + mask = malloc(sizeof(cpuset_t), M_TEMP, M_WAITOK | M_ZERO);; + error = copyin(args->user_mask_ptr, mask, len); + if (error != 0) + goto out; + /* Linux ignore high bits */ + CPU_FOREACH_ISSET(cpu, mask) + if (cpu > mp_maxid) + CPU_CLR(cpu, mask); + + error = kern_cpuset_setaffinity(td, CPU_LEVEL_WHICH, CPU_WHICH_TID, + tid, mask); + if (error == EDEADLK) + error = EINVAL; +out: + free(mask, M_TEMP); + return (error); } struct linux_rlimit64 { diff --git a/sys/kern/kern_cpuset.c b/sys/kern/kern_cpuset.c index 962e7abb44b7..b02f33dddf1b 100644 --- a/sys/kern/kern_cpuset.c +++ b/sys/kern/kern_cpuset.c @@ -1896,13 +1896,10 @@ kern_cpuset_getaffinity(struct thread *td, cpulevel_t level, cpuwhich_t which, int error; size_t size; - if (cpusetsize < sizeof(cpuset_t) || cpusetsize > CPU_MAXSIZE / NBBY) - return (ERANGE); error = cpuset_check_capabilities(td, level, which, id); if (error != 0) return (error); - size = cpusetsize; - mask = malloc(size, M_TEMP, M_WAITOK | M_ZERO); + mask = malloc(sizeof(cpuset_t), M_TEMP, M_WAITOK | M_ZERO); error = cpuset_which(which, id, &p, &ttd, &set); if (error) goto out; @@ -1972,8 +1969,33 @@ kern_cpuset_getaffinity(struct thread *td, cpulevel_t level, cpuwhich_t which, cpuset_rel(set); if (p) PROC_UNLOCK(p); - if (error == 0) + if (error == 0) { + if (cpusetsize < howmany(CPU_FLS(mask), NBBY)) { + error = ERANGE; + goto out; + } + size = min(cpusetsize, sizeof(cpuset_t)); error = copyout(mask, maskp, size); + if (error != 0) + goto out; + if (cpusetsize > size) { + char *end; + char *cp; + int rv; + + end = cp = (char *)&maskp->__bits; + end += cpusetsize; + cp += size; + while (cp != end) { + rv = subyte(cp, 0); + if (rv == -1) { + error = EFAULT; + goto out; + } + cp++; + } + } + } out: free(mask, M_TEMP); return (error); @@ 
-1992,50 +2014,25 @@ int sys_cpuset_setaffinity(struct thread *td, struct cpuset_setaffinity_args *uap) { - return (kern_cpuset_setaffinity(td, uap->level, uap->which, + return (user_cpuset_setaffinity(td, uap->level, uap->which, uap->id, uap->cpusetsize, uap->mask)); } int kern_cpuset_setaffinity(struct thread *td, cpulevel_t level, cpuwhich_t which, - id_t id, size_t cpusetsize, const cpuset_t *maskp) + id_t id, cpuset_t *mask) { struct cpuset *nset; struct cpuset *set; struct thread *ttd; struct proc *p; - cpuset_t *mask; int error; - if (cpusetsize < sizeof(cpuset_t) || cpusetsize > CPU_MAXSIZE / NBBY) - return (ERANGE); error = cpuset_check_capabilities(td, level, which, id); if (error != 0) return (error); - mask = malloc(cpusetsize, M_TEMP, M_WAITOK | M_ZERO); - error = copyin(maskp, mask, cpusetsize); - if (error) - goto out; - /* - * Verify that no high bits are set. - */ - if (cpusetsize > sizeof(cpuset_t)) { - char *end; - char *cp; - - end = cp = (char *)&mask->__bits; - end += cpusetsize; - cp += sizeof(cpuset_t); - while (cp != end) - if (*cp++ != 0) { - error = EINVAL; - goto out; - } - } - if (CPU_EMPTY(mask)) { - error = EDEADLK; - goto out; - } + if (CPU_EMPTY(mask)) + return (EDEADLK); switch (level) { case CPU_LEVEL_ROOT: case CPU_LEVEL_CPUSET: @@ -2057,8 +2054,7 @@ kern_cpuset_setaffinity(struct thread *td, cpulevel_t level, cpuwhich_t which, case CPU_WHICH_INTRHANDLER: case CPU_WHICH_ITHREAD: case CPU_WHICH_DOMAIN: - error = EINVAL; - goto out; + return (EINVAL); } if (level == CPU_LEVEL_ROOT) nset = cpuset_refroot(set); @@ -2098,6 +2094,47 @@ kern_cpuset_setaffinity(struct thread *td, cpulevel_t level, cpuwhich_t which, error = EINVAL; break; } + return (error); +} + +int +user_cpuset_setaffinity(struct thread *td, cpulevel_t level, cpuwhich_t which, + id_t id, size_t cpusetsize, const cpuset_t *maskp) +{ + cpuset_t *mask; + int error; + size_t size; + + size = min(cpusetsize, sizeof(cpuset_t)); + mask = malloc(sizeof(cpuset_t), M_TEMP, 
M_WAITOK | M_ZERO); + error = copyin(maskp, mask, size); + if (error) + goto out; + /* + * Verify that no high bits are set. + */ + if (cpusetsize > sizeof(cpuset_t)) { + const char *end, *cp; + int val; + end = cp = (const char *)&maskp->__bits; + end += cpusetsize; + cp += sizeof(cpuset_t); + + while (cp != end) { + val = fubyte(cp); + if (val == -1) { + error = EFAULT; + goto out; + } + if (val != 0) { + error = EINVAL; + goto out; + } + cp++; + } + } + error = kern_cpuset_setaffinity(td, level, which, id, mask); + out: free(mask, M_TEMP); return (error); diff --git a/sys/sys/syscallsubr.h b/sys/sys/syscallsubr.h index 1f5f4cd2369d..294539c6593c 100644 --- a/sys/sys/syscallsubr.h +++ b/sys/sys/syscallsubr.h @@ -121,6 +121,8 @@ int kern_copy_file_range(struct thread *td, int infd, off_t *inoffp, int kern_cpuset_getaffinity(struct thread *td, cpulevel_t level, cpuwhich_t which, id_t id, size_t cpusetsize, cpuset_t *maskp); int kern_cpuset_setaffinity(struct thread *td, cpulevel_t level, + cpuwhich_t which, id_t id, cpuset_t *maskp); +int user_cpuset_setaffinity(struct thread *td, cpulevel_t level, cpuwhich_t which, id_t id, size_t cpusetsize, const cpuset_t *maskp); int kern_cpuset_getdomain(struct thread *td, cpulevel_t level,