git: 0dc332bff200 - main - Add fspacectl(2), vn_deallocate(9) and VOP_DEALLOCATE(9).
Ka Ho Ng
khng at FreeBSD.org
Thu Aug 5 15:23:37 UTC 2021
The branch main has been updated by khng:
URL: https://cgit.FreeBSD.org/src/commit/?id=0dc332bff200c940edc36c4715b629a2e1e9f9ae
commit 0dc332bff200c940edc36c4715b629a2e1e9f9ae
Author: Ka Ho Ng <khng at FreeBSD.org>
AuthorDate: 2021-08-05 15:20:42 +0000
Commit: Ka Ho Ng <khng at FreeBSD.org>
CommitDate: 2021-08-05 15:20:42 +0000
Add fspacectl(2), vn_deallocate(9) and VOP_DEALLOCATE(9).
fspacectl(2) is a system call to provide space management support to
userspace applications. VOP_DEALLOCATE(9) is a VOP call to perform the
deallocation. vn_deallocate(9) is a public KPI for kmods' use.
The purpose of proposing a new system call, a KPI and a VOP call is to
allow bhyve or other hypervisor monitors to emulate the behavior of SCSI
UNMAP/NVMe DEALLOCATE on a plain file.
fspacectl(2) takes cmd and flags parameters to specify the
space management operation to be performed. Currently cmd has to be
SPACECTL_DEALLOC, and flags has to be 0.
fo_fspacectl is added to fileops.
VOP_DEALLOCATE(9) is added as a new VOP call. A trivial implementation
of VOP_DEALLOCATE(9) is provided.
Sponsored by: The FreeBSD Foundation
Reviewed by: kib
Differential Revision: https://reviews.freebsd.org/D28347
---
lib/libc/sys/Makefile.inc | 1 +
lib/libc/sys/Symbol.map | 1 +
lib/libc/sys/fspacectl.2 | 189 +++++++++++++++++++
lib/libc/sys/pathconf.2 | 3 +
share/man/man9/Makefile | 2 +
share/man/man9/VOP_DEALLOCATE.9 | 101 ++++++++++
share/man/man9/vn_deallocate.9 | 103 +++++++++++
sys/bsm/audit_kevents.h | 1 +
sys/compat/freebsd32/freebsd32.h | 4 +
sys/compat/freebsd32/freebsd32_misc.c | 34 ++++
sys/compat/freebsd32/syscalls.master | 5 +
sys/kern/capabilities.conf | 5 +
sys/kern/sys_generic.c | 70 +++++++
sys/kern/syscalls.master | 9 +
sys/kern/vfs_default.c | 122 ++++++++++++
sys/kern/vfs_vnops.c | 110 +++++++++++
sys/kern/vnode_if.src | 11 ++
sys/security/audit/audit_bsm.c | 12 ++
sys/sys/fcntl.h | 20 ++
sys/sys/file.h | 15 ++
sys/sys/syscallsubr.h | 3 +
sys/sys/unistd.h | 1 +
sys/sys/vnode.h | 2 +
tests/sys/file/Makefile | 1 +
tests/sys/file/fspacectl_test.c | 338 ++++++++++++++++++++++++++++++++++
25 files changed, 1163 insertions(+)
diff --git a/lib/libc/sys/Makefile.inc b/lib/libc/sys/Makefile.inc
index a1eb9567a380..29e914872a8d 100644
--- a/lib/libc/sys/Makefile.inc
+++ b/lib/libc/sys/Makefile.inc
@@ -190,6 +190,7 @@ MAN+= abort2.2 \
fhreadlink.2 \
flock.2 \
fork.2 \
+ fspacectl.2 \
fsync.2 \
getdirentries.2 \
getdtablesize.2 \
diff --git a/lib/libc/sys/Symbol.map b/lib/libc/sys/Symbol.map
index 80bb2c236191..93fbc947a7e1 100644
--- a/lib/libc/sys/Symbol.map
+++ b/lib/libc/sys/Symbol.map
@@ -419,6 +419,7 @@ FBSD_1.6 {
FBSD_1.7 {
_Fork;
+ fspacectl;
};
FBSDprivate_1.0 {
diff --git a/lib/libc/sys/fspacectl.2 b/lib/libc/sys/fspacectl.2
new file mode 100644
index 000000000000..2f581d1c1fb8
--- /dev/null
+++ b/lib/libc/sys/fspacectl.2
@@ -0,0 +1,189 @@
+.\"
+.\" SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+.\"
+.\" Copyright (c) 2021 The FreeBSD Foundation
+.\"
+.\" This manual page was written by Ka Ho Ng under sponsorship from
+.\" the FreeBSD Foundation.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.Dd August 4, 2021
+.Dt FSPACECTL 2
+.Os
+.Sh NAME
+.Nm fspacectl
+.Nd space management in a file
+.Sh LIBRARY
+.Lb libc
+.Sh SYNOPSIS
+.In fcntl.h
+.Ft int
+.Fo fspacectl
+.Fa "int fd"
+.Fa "int cmd"
+.Fa "const struct spacectl_range *rqsr"
+.Fa "int flags"
+.Fa "struct spacectl_range *rmsr"
+.Fc
+.Sh DESCRIPTION
+.Nm
+is a system call performing space management over a file.
+The
+.Fa fd
+argument specifies the file descriptor to be operated on by the
+.Fa cmd
+argument.
+The
+.Fa rqsr
+argument points to a
+.Fa spacectl_range
+structure that contains the requested operation range.
+The
+.Fa flags
+argument controls the behavior of the operation to take place.
+If the
+.Fa rmsr
+argument is non-NULL, the
+.Fa spacectl_range
+structure it points to is updated to contain the unprocessed operation range
+after the system call returns.
+Both
+.Fa rqsr
+and
+.Fa rmsr
+arguments can point to the same structure.
+.Pp
+The
+.Fa spacectl_range
+structure is defined as:
+.Bd -literal
+struct spacectl_range {
+ off_t r_offset;
+ off_t r_len;
+};
+.Ed
+.Pp
+The operation specified by the
+.Fa cmd
+argument may be one of:
+.Bl -tag -width SPACECTL_DEALLOC
+.It Dv SPACECTL_DEALLOC
+Zero a region in the file specified by the
+.Fa rqsr
+argument.
+The
+.Va "rqsr->r_offset"
+has to be a value greater than or equal to 0, and the
+.Va "rqsr->r_len"
+has to be a value greater than 0.
+.Pp
+If the file system supports hole-punching,
+file system space deallocation may be performed in the given region.
+.El
+.Pp
+The
+.Fa flags
+argument needs to be the value 0 currently.
+.Sh RETURN VALUES
+Upon successful completion, the value 0 is returned;
+otherwise the value -1 is returned and
+.Va errno
+is set to indicate the error.
+.Sh ERRORS
+Possible failure conditions:
+.Bl -tag -width Er
+.It Bq Er EBADF
+The
+.Fa fd
+argument is not a valid file descriptor.
+.It Bq Er EBADF
+The
+.Fa fd
+argument references a file that was opened without write permission.
+.It Bq Er EINTR
+A signal was caught during execution.
+.It Bq Er EINVAL
+The
+.Fa cmd
+argument is not valid.
+.It Bq Er EINVAL
+If the
+.Fa cmd
+argument is
+.Dv SPACECTL_DEALLOC ,
+either the
+.Fa "rqsr->r_offset"
+argument was less than zero, or the
+.Fa "rqsr->r_len"
+argument was less than or equal to zero.
+.It Bq Er EINVAL
+An invalid or unsupported flag is included in
+.Fa flags .
+.It Bq Er EINVAL
+A flag included in
+.Fa flags
+is not supported by the operation specified by the
+.Fa cmd
+argument.
+.It Bq Er EFAULT
+The
+.Fa rqsr
+or a non-NULL
+.Fa rmsr
+argument points outside the process' allocated address space.
+.It Bq Er EIO
+An I/O error occurred while reading from or writing to a file system.
+.It Bq Er EINTEGRITY
+Corrupted data was detected while reading from the file system.
+.It Bq Er ENODEV
+The
+.Fa fd
+argument does not refer to a file that supports
+.Nm .
+.It Bq Er ENOSPC
+There is insufficient free space remaining on the file system storage
+media.
+.It Bq Er ENOTCAPABLE
+The file descriptor
+.Fa fd
+has insufficient rights.
+.It Bq Er ESPIPE
+The
+.Fa fd
+argument is associated with a pipe or FIFO.
+.El
+.Sh SEE ALSO
+.Xr creat 2 ,
+.Xr ftruncate 2 ,
+.Xr open 2 ,
+.Xr unlink 2
+.Sh HISTORY
+The
+.Nm
+system call appeared in
+.Fx 14.0 .
+.Sh AUTHORS
+.Nm
+and this manual page were written by
+.An Ka Ho Ng Aq Mt khng at FreeBSD.org
+under sponsorship from the FreeBSD Foundation.
diff --git a/lib/libc/sys/pathconf.2 b/lib/libc/sys/pathconf.2
index 62ec532705ef..c5a7ba1be3c5 100644
--- a/lib/libc/sys/pathconf.2
+++ b/lib/libc/sys/pathconf.2
@@ -166,6 +166,10 @@ specified file, otherwise 0.
+.It Li _PC_DEALLOC_PRESENT
+Return 1 if a file system supports hole-punching (see
+.Xr fspacectl 2 ) ,
+otherwise 0.
.It Li _PC_MIN_HOLE_SIZE
If a file system supports the reporting of holes (see
.Xr lseek 2 ) ,
.Fn pathconf
and
.Fn fpathconf
diff --git a/share/man/man9/Makefile b/share/man/man9/Makefile
index d0012301d889..b2f1451a79d7 100644
--- a/share/man/man9/Makefile
+++ b/share/man/man9/Makefile
@@ -404,6 +404,7 @@ MAN= accept_filter.9 \
vm_page_wire.9 \
vm_set_page_size.9 \
vmem.9 \
+ vn_deallocate.9 \
vn_fullpath.9 \
vn_isdisk.9 \
vnet.9 \
@@ -420,6 +421,7 @@ MAN= accept_filter.9 \
VOP_BWRITE.9 \
VOP_COPY_FILE_RANGE.9 \
VOP_CREATE.9 \
+ VOP_DEALLOCATE.9 \
VOP_FSYNC.9 \
VOP_GETACL.9 \
VOP_GETEXTATTR.9 \
diff --git a/share/man/man9/VOP_DEALLOCATE.9 b/share/man/man9/VOP_DEALLOCATE.9
new file mode 100644
index 000000000000..1c7f80cfbc6c
--- /dev/null
+++ b/share/man/man9/VOP_DEALLOCATE.9
@@ -0,0 +1,101 @@
+.\"
+.\" SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+.\"
+.\" Copyright (c) 2021 The FreeBSD Foundation
+.\"
+.\" This manual page was written by Ka Ho Ng under sponsorship from
+.\" the FreeBSD Foundation.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.Dd May 11, 2021
+.Dt VOP_DEALLOCATE 9
+.Os
+.Sh NAME
+.Nm VOP_DEALLOCATE
+.Nd zero and/or deallocate storage from a file
+.Sh SYNOPSIS
+.In sys/param.h
+.In sys/vnode.h
+.Ft int
+.Fo VOP_DEALLOCATE
+.Fa "struct vnode *vp"
+.Fa "off_t *offset"
+.Fa "off_t *len"
+.Fa "int flags"
+.Fa "struct ucred *cred"
+.Fc
+.Sh DESCRIPTION
+This VOP call zeroes/deallocates storage for an offset range in a file.
+It is used to implement the
+.Xr fspacectl 2
+system call.
+.Pp
+Its arguments are:
+.Bl -tag -width offset
+.It Fa vp
+The vnode of the file.
+.It Fa offset
+The start of the range to deallocate storage in the file.
+.It Fa len
+The length of the range to deallocate storage in the file.
+.It Fa flags
+The flags of this call.
+This should be set to 0 for now.
+.It Fa cred
+The credentials of the caller.
+.El
+.Pp
+.Fa *offset
+and
+.Fa *len
+are updated to reflect the portion of the range that
+still needs to be zeroed/deallocated on return.
+A partial result is considered a successful operation.
+.Sh LOCKS
+The vnode should be locked on entry and will still be locked on exit.
+.Sh RETURN VALUES
+Zero is returned if the call is successful, otherwise an appropriate
+error code is returned.
+.Sh ERRORS
+.Bl -tag -width Er
+.It Bq Er EINVAL
+Invalid
+.Fa offset , len
+or
+.Fa flags
+parameters are passed into this VOP call.
+.It Bq Er ENODEV
+The vnode type is not supported by this VOP call.
+.It Bq Er ENOSPC
+The file system is full.
+.It Bq Er EPERM
+An append-only flag is set on the file, but the caller is attempting to
+zero before the current end of file.
+.El
+.Sh SEE ALSO
+.Xr vnode 9
+.Sh AUTHORS
+.Nm
+and this manual page were written by
+.An Ka Ho Ng Aq Mt khng at FreeBSD.org
+under sponsorship from the FreeBSD Foundation.
diff --git a/share/man/man9/vn_deallocate.9 b/share/man/man9/vn_deallocate.9
new file mode 100644
index 000000000000..415a8941ca68
--- /dev/null
+++ b/share/man/man9/vn_deallocate.9
@@ -0,0 +1,103 @@
+.\"
+.\" SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+.\"
+.\" Copyright (c) 2021 The FreeBSD Foundation
+.\"
+.\" This manual page was written by Ka Ho Ng under sponsorship from
+.\" the FreeBSD Foundation.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.Dd July 30, 2021
+.Dt VN_DEALLOCATE 9
+.Os
+.Sh NAME
+.Nm vn_deallocate
+.Nd zero and/or deallocate storage from a file
+.Sh SYNOPSIS
+.In sys/param.h
+.In sys/vnode.h
+.Ft int
+.Fo vn_deallocate
+.Fa "struct vnode *vp"
+.Fa "off_t *offset"
+.Fa "off_t *length"
+.Fa "int flags"
+.Fa "int ioflg"
+.Fa "struct ucred *active_cred"
+.Fa "struct ucred *file_cred"
+.Fc
+.Sh DESCRIPTION
+The
+.Fn vn_deallocate
+function zeros and/or deallocates backing storage space from a file.
+This function only works on vnodes with
+.Dv VREG
+type.
+.Pp
+The arguments are:
+.Bl -tag -width active_cred
+.It Fa vp
+The vnode of the file.
+.It Fa offset
+The starting offset of the operation range.
+.It Fa length
+The length of the operation range.
+This must be greater than 0.
+.It Fa flags
+The control flags of the operation.
+This should be set to 0 for now.
+.It Fa ioflg
+The control flags of vnode locking.
+.It Fa active_cred
+The user credentials of the calling thread.
+.It Fa file_cred
+The credentials installed on the file description pointing to the vnode or NOCRED.
+.El
+.Pp
+The
+.Fa ioflg
+argument may be one or more of the following flags:
+.Bl -tag -width IO_RANGELOCKED
+.It Dv IO_NODELOCKED
+The vnode was locked before the call.
+.It Dv IO_RANGELOCKED
+Rangelock was owned around the call.
+.It Dv IO_NOMACCHECK
+Skip MAC checking in the call.
+.El
+.Pp
+.Fa *offset
+and
+.Fa *length
+are updated to reflect the unprocessed operation range of the call.
+.Sh RETURN VALUES
+Upon successful completion, the value 0 is returned; otherwise the
+appropriate error is returned.
+.Sh SEE ALSO
+.Xr vnode 9 ,
+.Xr VOP_DEALLOCATE 9
+.Sh AUTHORS
+.Nm
+and this manual page were written by
+.An Ka Ho Ng Aq Mt khng at FreeBSD.org
+under sponsorship from the FreeBSD Foundation.
diff --git a/sys/bsm/audit_kevents.h b/sys/bsm/audit_kevents.h
index eeb928ecafdc..0da82de1fbcb 100644
--- a/sys/bsm/audit_kevents.h
+++ b/sys/bsm/audit_kevents.h
@@ -662,6 +662,7 @@
#define AUE_SPECIALFD 43266 /* FreeBSD-specific. */
#define AUE_AIO_WRITEV 43267 /* FreeBSD-specific. */
#define AUE_AIO_READV 43268 /* FreeBSD-specific. */
+#define AUE_FSPACECTL 43269 /* FreeBSD-specific. */
/*
* Darwin BSM uses a number of AUE_O_* definitions, which are aliased to the
diff --git a/sys/compat/freebsd32/freebsd32.h b/sys/compat/freebsd32/freebsd32.h
index 2e4f5155cbf4..8a14a42db813 100644
--- a/sys/compat/freebsd32/freebsd32.h
+++ b/sys/compat/freebsd32/freebsd32.h
@@ -435,5 +435,9 @@ struct ptrace_coredump32 {
uint32_t pc_limit1, pc_limit2;
};
/*
 * 32-bit compat layout of a space-control range: the offset and the length
 * are each carried as a pair of 32-bit words (r_*1, r_*2).  Which word holds
 * the high half depends on BYTE_ORDER; see freebsd32_fspacectl().
 */
+struct spacectl_range32 {
+ uint32_t r_offset1, r_offset2;
+ uint32_t r_len1, r_len2;
+};
#endif /* !_COMPAT_FREEBSD32_FREEBSD32_H_ */
diff --git a/sys/compat/freebsd32/freebsd32_misc.c b/sys/compat/freebsd32/freebsd32_misc.c
index 736fd1123d53..c417a64d286a 100644
--- a/sys/compat/freebsd32/freebsd32_misc.c
+++ b/sys/compat/freebsd32/freebsd32_misc.c
@@ -3857,3 +3857,37 @@ freebsd32_ntp_adjtime(struct thread *td, struct freebsd32_ntp_adjtime_args *uap)
}
return (error);
}
+
/*
 * 32-bit compat entry point for fspacectl(2).
 *
 * Copies in the caller's struct spacectl_range32, widens it to a native
 * struct spacectl_range, runs kern_fspacectl() and, when uap->rmsr is
 * non-NULL, copies the remaining range back out in the 32-bit layout.
 *
 * Returns 0 or an errno value.  An error from kern_fspacectl() takes
 * precedence over an error from the final copyout().
 */
+int
+freebsd32_fspacectl(struct thread *td, struct freebsd32_fspacectl_args *uap)
+{
+ struct spacectl_range rqsr, rmsr;
+ struct spacectl_range32 rqsr32, rmsr32;
+ int error, cerror;
+
+ error = copyin(uap->rqsr, &rqsr32, sizeof(rqsr32));
+ if (error != 0)
+ return (error);
/*
 * PAIR32TO64 is defined outside this diff; it is expected to combine the
 * r_*1/r_*2 word pairs into a single off_t.
 */
+ rqsr.r_offset = PAIR32TO64(off_t, rqsr32.r_offset);
+ rqsr.r_len = PAIR32TO64(off_t, rqsr32.r_len);
+
+ error = kern_fspacectl(td, uap->fd, uap->cmd, &rqsr, uap->flags,
+ &rmsr);
/*
 * The remaining range is copied out even if kern_fspacectl() failed, so the
 * caller can still see how much of the request was left unprocessed.
 */
+ if (uap->rmsr != NULL) {
/* Little-endian: word 1 is the low 32 bits, word 2 the high 32 bits. */
+#if BYTE_ORDER == LITTLE_ENDIAN
+ rmsr32.r_offset1 = rmsr.r_offset;
+ rmsr32.r_offset2 = rmsr.r_offset >> 32;
+ rmsr32.r_len1 = rmsr.r_len;
+ rmsr32.r_len2 = rmsr.r_len >> 32;
/* Otherwise the halves are swapped: word 1 is the high 32 bits. */
+#else
+ rmsr32.r_offset1 = rmsr.r_offset >> 32;
+ rmsr32.r_offset2 = rmsr.r_offset;
+ rmsr32.r_len1 = rmsr.r_len >> 32;
+ rmsr32.r_len2 = rmsr.r_len;
+#endif
+ cerror = copyout(&rmsr32, uap->rmsr, sizeof(rmsr32));
/* Report a copyout failure only when the operation itself succeeded. */
+ if (error == 0)
+ error = cerror;
+ }
+ return (error);
+}
diff --git a/sys/compat/freebsd32/syscalls.master b/sys/compat/freebsd32/syscalls.master
index aac788bf3956..3e53de2dc966 100644
--- a/sys/compat/freebsd32/syscalls.master
+++ b/sys/compat/freebsd32/syscalls.master
@@ -1176,5 +1176,10 @@
struct aiocb32 *aiocbp); }
579 AUE_AIO_READV STD { int freebsd32_aio_readv( \
struct aiocb32 *aiocbp); }
+580 AUE_FSPACECTL STD { int freebsd32_fspacectl(int fd, \
+ int cmd, \
+ const struct spacectl_range32 *rqsr, \
+ int flags, \
+ struct spacectl_range32 *rmsr); }
; vim: syntax=off
diff --git a/sys/kern/capabilities.conf b/sys/kern/capabilities.conf
index 602ec7088fc6..f53530eb7fa7 100644
--- a/sys/kern/capabilities.conf
+++ b/sys/kern/capabilities.conf
@@ -228,6 +228,11 @@ freebsd6_mmap
freebsd6_pread
freebsd6_pwrite
+##
+## Allow I/O-related file operations, subject to capability rights.
+##
+fspacectl
+
##
## Allow querying file and file system state with fstat(2) and fstatfs(2),
## subject to capability rights.
diff --git a/sys/kern/sys_generic.c b/sys/kern/sys_generic.c
index f86d494400e2..e6b2cba27a04 100644
--- a/sys/kern/sys_generic.c
+++ b/sys/kern/sys_generic.c
@@ -861,6 +861,76 @@ kern_posix_fallocate(struct thread *td, int fd, off_t offset, off_t len)
return (error);
}
/*
 * Native system call entry point for fspacectl(2).
 *
 * Copies the requested range in from uap->rqsr, hands it to
 * kern_fspacectl(), and copies the remaining (unprocessed) range out to
 * uap->rmsr when that pointer is non-NULL.
 *
 * Returns 0 or an errno value.  An error from kern_fspacectl() takes
 * precedence over an error from the final copyout().
 */
+int
+sys_fspacectl(struct thread *td, struct fspacectl_args *uap)
+{
+ struct spacectl_range rqsr, rmsr;
+ int error, cerror;
+
+ error = copyin(uap->rqsr, &rqsr, sizeof(rqsr));
+ if (error != 0)
+ return (error);
+
+ error = kern_fspacectl(td, uap->fd, uap->cmd, &rqsr, uap->flags,
+ &rmsr);
/*
 * The copyout happens regardless of the kern_fspacectl() result; its own
 * failure is reported only if the operation itself succeeded.
 */
+ if (uap->rmsr != NULL) {
+ cerror = copyout(&rmsr, uap->rmsr, sizeof(rmsr));
+ if (error == 0)
+ error = cerror;
+ }
+ return (error);
+}
+
/*
 * In-kernel implementation shared by the fspacectl(2) entry points.
 *
 * td     calling thread
 * fd     file descriptor to operate on
 * cmd    operation; only SPACECTL_DEALLOC is accepted
 * rqsr   requested range (kernel pointer, must not be NULL)
 * flags  operation flags; bits outside SPACECTL_F_SUPPORTED are rejected
 * rmsrp  optional; receives the range that was left unprocessed
 *
 * Returns 0 or an errno value: EINVAL for a bad cmd/range/flags, ESPIPE if
 * the file is not seekable, EBADF if it is not open for writing, or
 * whatever fget_write()/fo_fspacectl() return.
 */
+int
+kern_fspacectl(struct thread *td, int fd, int cmd,
+ const struct spacectl_range *rqsr, int flags, struct spacectl_range *rmsrp)
+{
+ struct file *fp;
+ struct spacectl_range rmsr;
+ int error;
+
+ AUDIT_ARG_FD(fd);
+ AUDIT_ARG_CMD(cmd);
+ AUDIT_ARG_FFLAGS(flags);
+
+ if (rqsr == NULL)
+ return (EINVAL);
/*
 * Seed the remaining range with the whole request so that every early
 * error return below reports the request as entirely unprocessed.
 */
+ rmsr = *rqsr;
+ if (rmsrp != NULL)
+ *rmsrp = rmsr;
+
/*
 * Reject unknown commands, negative offsets, non-positive lengths, ranges
 * whose end would exceed OFF_MAX, and unsupported flag bits.
 */
+ if (cmd != SPACECTL_DEALLOC ||
+ rqsr->r_offset < 0 || rqsr->r_len <= 0 ||
+ rqsr->r_offset > OFF_MAX - rqsr->r_len ||
+ (flags & ~SPACECTL_F_SUPPORTED) != 0)
+ return (EINVAL);
+
+ error = fget_write(td, fd, &cap_pwrite_rights, &fp);
+ if (error != 0)
+ return (error);
+ AUDIT_ARG_FILE(td->td_proc, fp);
+ if ((fp->f_ops->fo_flags & DFLAG_SEEKABLE) == 0) {
+ error = ESPIPE;
+ goto out;
+ }
/*
 * NOTE(review): the file was obtained through fget_write(), so this FWRITE
 * check looks redundant -- confirm before relying on or removing it.
 */
+ if ((fp->f_flag & FWRITE) == 0) {
+ error = EBADF;
+ goto out;
+ }
+
+ error = fo_fspacectl(fp, cmd, &rmsr.r_offset, &rmsr.r_len, flags,
+ td->td_ucred, td);
+ /* fspacectl is not restarted after signals if the file is modified. */
/*
 * That is: if the remaining length changed (some progress was made), an
 * ERESTART/EINTR/EWOULDBLOCK result is turned into success and the caller
 * learns about the partial result through the remaining range.
 */
+ if (rmsr.r_len != rqsr->r_len && (error == ERESTART ||
+ error == EINTR || error == EWOULDBLOCK))
+ error = 0;
+ if (rmsrp != NULL)
+ *rmsrp = rmsr;
+out:
+ fdrop(fp, td);
+ return (error);
+}
+
int
kern_specialfd(struct thread *td, int type, void *arg)
{
diff --git a/sys/kern/syscalls.master b/sys/kern/syscalls.master
index af787908451a..11247aed8fd6 100644
--- a/sys/kern/syscalls.master
+++ b/sys/kern/syscalls.master
@@ -3250,6 +3250,15 @@
_Inout_ struct aiocb *aiocbp
);
}
+580 AUE_FSPACECTL STD {
+ int fspacectl(
+ int fd,
+ int cmd,
+ _In_ const struct spacectl_range *rqsr,
+ int flags,
+ _Out_opt_ struct spacectl_range *rmsr,
+ );
+ }
; Please copy any additions and changes to the following compatability tables:
; sys/compat/freebsd32/syscalls.master
diff --git a/sys/kern/vfs_default.c b/sys/kern/vfs_default.c
index 63bca7810847..c42d5a795935 100644
--- a/sys/kern/vfs_default.c
+++ b/sys/kern/vfs_default.c
@@ -93,6 +93,7 @@ static int vop_stdgetpages_async(struct vop_getpages_async_args *ap);
static int vop_stdread_pgcache(struct vop_read_pgcache_args *ap);
static int vop_stdstat(struct vop_stat_args *ap);
static int vop_stdvput_pair(struct vop_vput_pair_args *ap);
+static int vop_stddeallocate(struct vop_deallocate_args *ap);
/*
* This vnode table stores what we want to do if the filesystem doesn't
@@ -117,6 +118,7 @@ struct vop_vector default_vnodeops = {
.vop_advlockasync = vop_stdadvlockasync,
.vop_advlockpurge = vop_stdadvlockpurge,
.vop_allocate = vop_stdallocate,
+ .vop_deallocate = vop_stddeallocate,
.vop_bmap = vop_stdbmap,
.vop_close = VOP_NULL,
.vop_fsync = VOP_NULL,
@@ -518,6 +520,7 @@ vop_stdpathconf(ap)
case _PC_ACL_EXTENDED:
case _PC_ACL_NFS4:
case _PC_CAP_PRESENT:
+ case _PC_DEALLOC_PRESENT:
case _PC_INF_PRESENT:
case _PC_MAC_PRESENT:
*ap->a_retval = 0;
@@ -1069,6 +1072,125 @@ vop_stdallocate(struct vop_allocate_args *ap)
return (error);
}
/*
 * Overwrite the byte range [*offsetp, *offsetp + *lenp) of vp with zeroes by
 * issuing VOP_WRITE() calls sourced from zero_region.
 *
 * vap supplies va_blocksize, which determines the write chunk size.
 * On return *offsetp and *lenp describe the part of the range that has not
 * been written yet (length 0 when everything was written).
 *
 * Returns 0 on success or the error from the failing VOP_WRITE().
 */
+static int
+vp_zerofill(struct vnode *vp, struct vattr *vap, off_t *offsetp, off_t *lenp,
+ struct ucred *cred)
+{
+ int iosize;
+ int error = 0;
+ struct iovec aiov;
+ struct uio auio;
+ struct thread *td;
+ off_t offset, len;
+
+ iosize = vap->va_blocksize;
+ td = curthread;
+ offset = *offsetp;
+ len = *lenp;
+
/* Fall back to BLKDEV_IOSIZE when the file system reports no block size. */
+ if (iosize == 0)
+ iosize = BLKDEV_IOSIZE;
+ /* If va_blocksize is 512 bytes, iosize will be 4 kilobytes */
/* The chunk may never exceed the size of the zero_region source buffer. */
+ iosize = min(iosize * 8, ZERO_REGION_SIZE);
+
+ while (len > 0) {
+ int xfersize = iosize;
/*
 * Shorten a chunk that starts off an iosize boundary so that it ends on
 * one; subsequent chunks then start aligned.
 */
+ if (offset % iosize != 0)
+ xfersize -= offset % iosize;
+ if (xfersize > len)
+ xfersize = len;
+
/* Kernel-space write of xfersize zero bytes at the current offset. */
+ aiov.iov_base = __DECONST(void *, zero_region);
+ aiov.iov_len = xfersize;
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_offset = offset;
+ auio.uio_resid = xfersize;
+ auio.uio_segflg = UIO_SYSSPACE;
+ auio.uio_rw = UIO_WRITE;
+ auio.uio_td = td;
+
+ error = VOP_WRITE(vp, &auio, 0, cred);
/* On failure, still account for the bytes consumed per uio_resid. */
+ if (error != 0) {
+ len -= xfersize - auio.uio_resid;
+ offset += xfersize - auio.uio_resid;
+ break;
+ }
+
+ len -= xfersize;
+ offset += xfersize;
+ }
+
+ *offsetp = offset;
+ *lenp = len;
+ return (error);
+}
+
/*
 * Default VOP_DEALLOCATE() implementation installed in default_vnodeops.
 *
 * It does not release storage.  It walks the requested range with
 * FIOSEEKDATA/FIOSEEKHOLE, skips regions that are already holes, and
 * overwrites data regions with zeroes through vp_zerofill().
 *
 * *ap->a_offset and *ap->a_len are updated to the part of the range that is
 * still unprocessed.  The loop stops early with a zero return (a partial
 * result) when should_yield() says so.
 *
 * Returns 0 or an error from VOP_GETATTR(), vn_bmap_seekhole_locked() or
 * vp_zerofill().
 */
+static int
+vop_stddeallocate(struct vop_deallocate_args *ap)
+{
+ struct vnode *vp;
+ off_t offset, len;
+ struct ucred *cred;
+ int error;
+ struct vattr va;
+ off_t noff, xfersize, rem;
+
+ vp = ap->a_vp;
+ offset = *ap->a_offset;
/* This value is overwritten by the clamped assignment further down. */
+ len = *ap->a_len;
+ cred = ap->a_cred;
+
+ error = VOP_GETATTR(vp, &va, cred);
+ if (error)
+ return (error);
+
/* Clamp the length so that offset + len cannot exceed OFF_MAX. */
+ len = omin(OFF_MAX - offset, *ap->a_len);
+ while (len > 0) {
/* Find the start of the next data region at or after offset. */
+ noff = offset;
+ error = vn_bmap_seekhole_locked(vp, FIOSEEKDATA, &noff, cred);
+ if (error) {
+ if (error != ENXIO)
+ /* XXX: Is it okay to fallback further? */
+ goto out;
+
+ /*
+ * No more data region to be filled
+ */
+ len = 0;
+ error = 0;
+ break;
+ }
+ KASSERT(noff >= offset, ("FIOSEEKDATA going backward"));
/* [offset, noff) is a hole already: skip it without writing. */
+ if (noff != offset) {
+ xfersize = omin(noff - offset, len);
+ len -= xfersize;
+ offset += xfersize;
+ if (len == 0)
+ break;
+ }
/* Find where the data region that starts at offset ends. */
+ error = vn_bmap_seekhole_locked(vp, FIOSEEKHOLE, &noff, cred);
+ if (error)
+ goto out;
+
+ /* Fill zeroes */
/* vp_zerofill() advances offset itself; rem is what it left unwritten. */
+ xfersize = rem = omin(noff - offset, len);
+ error = vp_zerofill(vp, &va, &offset, &rem, cred);
+ if (error) {
+ len -= xfersize - rem;
+ goto out;
+ }
+
+ len -= xfersize;
/* Return a partial result instead of looping on; error is 0 here. */
+ if (should_yield())
+ break;
+ }
+out:
+ *ap->a_offset = offset;
+ *ap->a_len = len;
+ return (error);
+}
+
int
vop_stdadvise(struct vop_advise_args *ap)
{
diff --git a/sys/kern/vfs_vnops.c b/sys/kern/vfs_vnops.c
index ccc468d71737..c54f55a99036 100644
--- a/sys/kern/vfs_vnops.c
+++ b/sys/kern/vfs_vnops.c
@@ -106,6 +106,7 @@ static fo_kqfilter_t vn_kqfilter;
static fo_close_t vn_closefile;
static fo_mmap_t vn_mmap;
static fo_fallocate_t vn_fallocate;
+static fo_fspacectl_t vn_fspacectl;
struct fileops vnops = {
.fo_read = vn_io_fault,
@@ -123,6 +124,7 @@ struct fileops vnops = {
.fo_fill_kinfo = vn_fill_kinfo,
.fo_mmap = vn_mmap,
.fo_fallocate = vn_fallocate,
+ .fo_fspacectl = vn_fspacectl,
.fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE
};
@@ -3439,6 +3441,114 @@ vn_fallocate(struct file *fp, off_t offset, off_t len, struct thread *td)
return (error);
}
/*
 * Worker behind vn_deallocate(): repeatedly calls VOP_DEALLOCATE() until the
 * whole range is processed or an error occurs.
 *
 * ioflg controls locking and checks:
 *   IO_NODELOCKED   the vnode is not locked/unlocked here, and no write is
 *                   started or finished on the mount point
 *   IO_RANGELOCKED  (or IO_NODELOCKED) no range lock is taken here
 *   IO_NOMACCHECK   skip mac_vnode_check_write() (MAC kernels only)
 *
 * *offset and *length are updated to the unprocessed remainder.
 * Returns 0 or the first error encountered.
 */
+static int
+vn_deallocate_impl(struct vnode *vp, off_t *offset, off_t *length, int flags,
+ int ioflg, struct ucred *active_cred, struct ucred *file_cred)
+{
+ struct mount *mp;
+ void *rl_cookie;
+ off_t off, len;
+ int error;
+#ifdef AUDIT
+ bool audited_vnode1 = false;
+#endif
+
+ rl_cookie = NULL;
+ error = 0;
+ mp = NULL;
+ off = *offset;
+ len = *length;
+
/* The write range lock on [off, off + len) is held across all passes. */
+ if ((ioflg & (IO_NODELOCKED|IO_RANGELOCKED)) == 0)
+ rl_cookie = vn_rangelock_wlock(vp, off, off + len);
+ while (len > 0 && error == 0) {
+ /*
+ * Try to deallocate the longest range in one pass.
+ * In case a pass takes too long to be executed, it returns
+ * partial result. The residue will be processed in the next
+ * pass.
+ */
+
/* The vnode lock is acquired and dropped separately for every pass. */
+ if ((ioflg & IO_NODELOCKED) == 0) {
+ bwillwrite();
+ if ((error = vn_start_write(vp, &mp,
+ V_WAIT | PCATCH)) != 0)
+ goto out;
+ vn_lock(vp, vn_lktype_write(mp, vp) | LK_RETRY);
+ }
/* Record the vnode for audit on the first pass only. */
+#ifdef AUDIT
+ if (!audited_vnode1) {
+ AUDIT_ARG_VNODE1(vp);
+ audited_vnode1 = true;
+ }
+#endif
+
+#ifdef MAC
+ if ((ioflg & IO_NOMACCHECK) == 0)
+ error = mac_vnode_check_write(active_cred, file_cred,
+ vp);
+#endif
/* VOP_DEALLOCATE() advances off and shrinks len to the remainder. */
+ if (error == 0)
+ error = VOP_DEALLOCATE(vp, &off, &len, flags,
+ active_cred);
+
+ if ((ioflg & IO_NODELOCKED) == 0) {
+ VOP_UNLOCK(vp);
+ if (mp != NULL) {
+ vn_finished_write(mp);
+ mp = NULL;
+ }
+ }
+ }
+out:
+ if (rl_cookie != NULL)
+ vn_rangelock_unlock(vp, rl_cookie);
+ *offset = off;
+ *length = len;
+ return (error);
+}
+
/*
 * Public KPI: zero and/or deallocate the range [*offset, *offset + *length)
 * of a regular file.
 *
 * Returns EINVAL if the offset is negative, the length is not positive, the
 * end of the range would exceed OFF_MAX, or flags is non-zero; ENODEV if vp
 * is not a VREG vnode.  Otherwise returns the result of
 * vn_deallocate_impl(), which also updates *offset and *length to the
 * unprocessed remainder.
 */
+int
+vn_deallocate(struct vnode *vp, off_t *offset, off_t *length, int flags,
+ int ioflg, struct ucred *active_cred, struct ucred *file_cred)
+{
+ if (*offset < 0 || *length <= 0 || *length > OFF_MAX - *offset ||
+ flags != 0)
+ return (EINVAL);
+ if (vp->v_type != VREG)
+ return (ENODEV);
+
+ return (vn_deallocate_impl(vp, offset, length, flags, ioflg,
+ active_cred, file_cred));
+}
+
*** 562 LINES SKIPPED ***
More information about the dev-commits-src-all
mailing list