git: 3d8450db4c60 - main - vfs: vn_dir_next_dirent(): Simplify interface and harden

From: Konstantin Belousov <kib_at_FreeBSD.org>
Date: Fri, 28 Apr 2023 01:29:55 UTC
The branch main has been updated by kib:

URL: https://cgit.FreeBSD.org/src/commit/?id=3d8450db4c603d18aa45422159170e133c95214d

commit 3d8450db4c603d18aa45422159170e133c95214d
Author:     Olivier Certner <olce.freebsd@certner.fr>
AuthorDate: 2023-04-24 08:25:15 +0000
Commit:     Konstantin Belousov <kib@FreeBSD.org>
CommitDate: 2023-04-28 01:27:54 +0000

    vfs: vn_dir_next_dirent(): Simplify interface and harden
    
    Simplify the old interface (one less argument, simpler termination test)
    and add documentation about it. Add more sanity checks (mostly under
    INVARIANTS, but also in the general case to prevent infinite
    loops). Drop the explicit test on minimum directory entry size (without
    INVARIANTS).
    
    Deal with the impacts in callers (dirent_exists() and vop_stdvptocnp()).
    dirent_exists() has been simplified a bit, preserving the exact same
    semantics but for the return code whose meaning has been reversed (0 now
    means the entry exists, ENOENT that it doesn't and other values are
    genuine errors). While here, suppress gratuitous casts of malloc return
    values.
    
    vn_dir_next_dirent() has been tested by a 'make -j4 buildkernel' with a
    temporary modification to the VFS cache causing vn_vptocnp() to always
    call VOP_VPTOCNP() and finally vop_stdvptocnp() (observed with temporary
    debug counters).
    
    Export new _GENERIC_MINDIRSIZ and _GENERIC_MAXDIRSIZ on __BSD_VISIBLE,
    and GENERIC_MINDIRSIZ and GENERIC_MAXDIRSIZ on _KERNEL.
    
    Reviewed by:    kib
    MFC after:      1 week
    Differential Revision:  https://reviews.freebsd.org/D39764
---
 sys/kern/vfs_default.c |  94 ++++++++++++++++-------------
 sys/kern/vfs_vnops.c   | 156 ++++++++++++++++++++++++++++++++++++++++---------
 sys/sys/dirent.h       |   5 +-
 sys/sys/vnode.h        |   6 +-
 4 files changed, 187 insertions(+), 74 deletions(-)

diff --git a/sys/kern/vfs_default.c b/sys/kern/vfs_default.c
index 7da4f7b62cff..87ba1e0d728f 100644
--- a/sys/kern/vfs_default.c
+++ b/sys/kern/vfs_default.c
@@ -276,49 +276,56 @@ vop_nostrategy (struct vop_strategy_args *ap)
 }
 
 /*
- * Check if a named file exists in a given directory vnode.
+ * Check if a named file exists in a given directory vnode
+ *
+ * Returns 0 if the file exists, ENOENT if it doesn't, or errors returned by
+ * vn_dir_next_dirent().
  */
 static int
 dirent_exists(struct vnode *vp, const char *dirname, struct thread *td)
 {
-	char *dirbuf, *cpos;
-	int error, eofflag, dirbuflen, len, found;
+	char *dirbuf;
+	int error, eofflag;
+	size_t dirbuflen, len;
 	off_t off;
 	struct dirent *dp;
 	struct vattr va;
 
-	KASSERT(VOP_ISLOCKED(vp), ("vp %p is not locked", vp));
+	ASSERT_VOP_LOCKED(vp, "vnode not locked");
 	KASSERT(vp->v_type == VDIR, ("vp %p is not a directory", vp));
 
-	found = 0;
-
 	error = VOP_GETATTR(vp, &va, td->td_ucred);
-	if (error)
-		return (found);
+	if (error != 0)
+		return (error);
 
-	dirbuflen = DEV_BSIZE;
+	dirbuflen = MAX(DEV_BSIZE, GENERIC_MAXDIRSIZ);
 	if (dirbuflen < va.va_blocksize)
 		dirbuflen = va.va_blocksize;
-	dirbuf = (char *)malloc(dirbuflen, M_TEMP, M_WAITOK);
+	dirbuf = malloc(dirbuflen, M_TEMP, M_WAITOK);
 
-	off = 0;
 	len = 0;
-	do {
-		error = vn_dir_next_dirent(vp, &dp, dirbuf, dirbuflen, &off,
-		    &cpos, &len, &eofflag, td);
-		if (error)
+	off = 0;
+	eofflag = 0;
+
+	for (;;) {
+		error = vn_dir_next_dirent(vp, td, dirbuf, dirbuflen,
+		    &dp, &len, &off, &eofflag);
+		if (error != 0)
 			goto out;
 
+		if (len == 0)
+			break;
+
 		if (dp->d_type != DT_WHT && dp->d_fileno != 0 &&
-		    strcmp(dp->d_name, dirname) == 0) {
-			found = 1;
+		    strcmp(dp->d_name, dirname) == 0)
 			goto out;
-		}
-	} while (len > 0 || !eofflag);
+	}
+
+	error = ENOENT;
 
 out:
 	free(dirbuf, M_TEMP);
-	return (found);
+	return (error);
 }
 
 int
@@ -672,27 +679,24 @@ vop_stdvptofh(struct vop_vptofh_args *ap)
 int
 vop_stdvptocnp(struct vop_vptocnp_args *ap)
 {
-	struct vnode *vp = ap->a_vp;
-	struct vnode **dvp = ap->a_vpp;
-	struct ucred *cred;
+	struct vnode *const vp = ap->a_vp;
+	struct vnode **const dvp = ap->a_vpp;
 	char *buf = ap->a_buf;
 	size_t *buflen = ap->a_buflen;
-	char *dirbuf, *cpos;
-	int i, error, eofflag, dirbuflen, flags, locked, len, covered;
+	char *dirbuf;
+	int i = *buflen;
+	int error = 0, covered = 0;
+	int eofflag, flags, locked;
+	size_t dirbuflen, len;
 	off_t off;
 	ino_t fileno;
 	struct vattr va;
 	struct nameidata nd;
-	struct thread *td;
+	struct thread *const td = curthread;
+	struct ucred *const cred = td->td_ucred;
 	struct dirent *dp;
 	struct vnode *mvp;
 
-	i = *buflen;
-	error = 0;
-	covered = 0;
-	td = curthread;
-	cred = td->td_ucred;
-
 	if (vp->v_type != VDIR)
 		return (ENOENT);
 
@@ -729,31 +733,38 @@ vop_stdvptocnp(struct vop_vptocnp_args *ap)
 
 	fileno = va.va_fileid;
 
-	dirbuflen = DEV_BSIZE;
+	dirbuflen = MAX(DEV_BSIZE, GENERIC_MAXDIRSIZ);
 	if (dirbuflen < va.va_blocksize)
 		dirbuflen = va.va_blocksize;
-	dirbuf = (char *)malloc(dirbuflen, M_TEMP, M_WAITOK);
+	dirbuf = malloc(dirbuflen, M_TEMP, M_WAITOK);
 
 	if ((*dvp)->v_type != VDIR) {
 		error = ENOENT;
 		goto out;
 	}
 
-	off = 0;
 	len = 0;
-	do {
+	off = 0;
+	eofflag = 0;
+
+	for (;;) {
 		/* call VOP_READDIR of parent */
-		error = vn_dir_next_dirent(*dvp, &dp, dirbuf, dirbuflen, &off,
-		    &cpos, &len, &eofflag, td);
-		if (error)
+		error = vn_dir_next_dirent(*dvp, td,
+		    dirbuf, dirbuflen, &dp, &len, &off, &eofflag);
+		if (error != 0)
 			goto out;
 
+		if (len == 0) {
+			error = ENOENT;
+			goto out;
+		}
+
 		if ((dp->d_type != DT_WHT) &&
 		    (dp->d_fileno == fileno)) {
 			if (covered) {
 				VOP_UNLOCK(*dvp);
 				vn_lock(mvp, LK_SHARED | LK_RETRY);
-				if (dirent_exists(mvp, dp->d_name, td)) {
+				if (dirent_exists(mvp, dp->d_name, td) == 0) {
 					error = ENOENT;
 					VOP_UNLOCK(mvp);
 					vn_lock(*dvp, LK_SHARED | LK_RETRY);
@@ -776,8 +787,7 @@ vop_stdvptocnp(struct vop_vptocnp_args *ap)
 			}
 			goto out;
 		}
-	} while (len > 0 || !eofflag);
-	error = ENOENT;
+	}
 
 out:
 	free(dirbuf, M_TEMP);
diff --git a/sys/kern/vfs_vnops.c b/sys/kern/vfs_vnops.c
index ad9b0c8864b0..415ed158b5a3 100644
--- a/sys/kern/vfs_vnops.c
+++ b/sys/kern/vfs_vnops.c
@@ -3737,22 +3737,112 @@ vn_fspacectl(struct file *fp, int cmd, off_t *offset, off_t *length, int flags,
 	return (error);
 }
 
-#define DIRENT_MINSIZE (sizeof(struct dirent) - (MAXNAMLEN+1) + 4)
+/*
+ * Keep this assert as long as sizeof(struct dirent) is used as the maximum
+ * entry size.
+ */
+_Static_assert(_GENERIC_MAXDIRSIZ == sizeof(struct dirent),
+    "'struct dirent' size must be a multiple of its alignment "
+    "(see _GENERIC_DIRLEN())");
 
+/*
+ * Returns successive directory entries through some caller's provided buffer
+ *
+ * This function automatically refills the provided buffer with calls to
+ * VOP_READDIR() (after MAC permission checks).
+ *
+ * 'td' is used for credentials and passed to uiomove(). 'dirbuf' is the
+ * caller's buffer to fill and 'dirbuflen' its allocated size. 'dirbuf' must be
+ * properly aligned to access 'struct dirent' structures and 'dirbuflen' must
+ * be greater than GENERIC_MAXDIRSIZ to avoid VOP_READDIR() returning EINVAL
+ * (the latter is not a strong guarantee (yet); but EINVAL will always be
+ * returned if this requirement is not verified). '*dpp' points to the current
+ * directory entry in the buffer and '*len' contains the remaining valid bytes
+ * in 'dirbuf' after 'dpp' (including the pointed entry).
+ *
+ * At first call (or when restarting the read), '*len' must have been set to 0,
+ * '*off' to 0 (or any valid start offset) and '*eofflag' to 0. There are no
+ * more entries as soon as '*len' is 0 after a call that returned 0. Calling
+ * again this function after such a condition is considered an error and EINVAL
+ * will be returned. Other possible error codes are those of VOP_READDIR(),
+ * EINTEGRITY if the returned entries do not pass coherency tests, or EINVAL
+ * (bad call). All errors are unrecoverable, i.e., the state ('*len', '*off'
+ * and '*eofflag') must be re-initialized before a subsequent call. On error or
+ * at end of directory, '*dpp' is reset to NULL.
+ *
+ * '*len', '*off' and '*eofflag' are internal state the caller should not
+ * tamper with except as explained above. '*off' is the next directory offset
+ * to read from to refill the buffer. '*eofflag' is set to 0 or 1 by the last
+ * internal call to VOP_READDIR() that returned without error, indicating
+ * whether it reached the end of the directory, and to 2 by this function after
+ * all entries have been read.
+ */
 int
-vn_dir_next_dirent(struct vnode *vp, struct dirent **dpp, char *dirbuf,
-    int dirbuflen, off_t *off, char **cpos, int *len,
-    int *eofflag, struct thread *td)
+vn_dir_next_dirent(struct vnode *vp, struct thread *td,
+    char *dirbuf, size_t dirbuflen,
+    struct dirent **dpp, size_t *len, off_t *off, int *eofflag)
 {
-	int error, reclen;
+	struct dirent *dp = NULL;
+	int reclen;
+	int error;
 	struct uio uio;
 	struct iovec iov;
-	struct dirent *dp;
 
-	KASSERT(VOP_ISLOCKED(vp), ("vp %p is not locked", vp));
-	KASSERT(vp->v_type == VDIR, ("vp %p is not a directory", vp));
+	ASSERT_VOP_LOCKED(vp, "vnode not locked");
+	VNASSERT(vp->v_type == VDIR, vp, ("vnode is not a directory"));
+	MPASS2((uintptr_t)dirbuf < (uintptr_t)dirbuf + dirbuflen,
+	    "Address space overflow");
+
+	if (__predict_false(dirbuflen < GENERIC_MAXDIRSIZ)) {
+		/* Don't take any chances in this case */
+		error = EINVAL;
+		goto out;
+	}
+
+	if (*len != 0) {
+		dp = *dpp;
+
+		/*
+		 * The caller continued to call us after an error (we set dp to
+		 * NULL in a previous iteration). Bail out right now.
+		 */
+		if (__predict_false(dp == NULL))
+			return (EINVAL);
+
+		MPASS(*len <= dirbuflen);
+		MPASS2((uintptr_t)dirbuf <= (uintptr_t)dp &&
+		    (uintptr_t)dp + *len <= (uintptr_t)dirbuf + dirbuflen,
+		    "Filled range not inside buffer");
+
+		reclen = dp->d_reclen;
+		if (reclen >= *len) {
+			/* End of buffer reached */
+			*len = 0;
+		} else {
+			dp = (struct dirent *)((char *)dp + reclen);
+			*len -= reclen;
+		}
+	}
 
 	if (*len == 0) {
+		dp = NULL;
+
+		/* Have to refill. */
+		switch (*eofflag) {
+		case 0:
+			break;
+
+		case 1:
+			/* Nothing more to read. */
+			*eofflag = 2; /* Remember the caller reached EOF. */
+			goto success;
+
+		default:
+			/* The caller didn't test for EOF. */
+			error = EINVAL;
+			goto out;
+		}
+
 		iov.iov_base = dirbuf;
 		iov.iov_len = dirbuflen;
 
@@ -3764,40 +3854,50 @@ vn_dir_next_dirent(struct vnode *vp, struct dirent **dpp, char *dirbuf,
 		uio.uio_rw = UIO_READ;
 		uio.uio_td = td;
 
-		*eofflag = 0;
-
 #ifdef MAC
 		error = mac_vnode_check_readdir(td->td_ucred, vp);
 		if (error == 0)
 #endif
 			error = VOP_READDIR(vp, &uio, td->td_ucred, eofflag,
-		    		NULL, NULL);
-		if (error)
-			return (error);
+			    NULL, NULL);
+		if (error != 0)
+			goto out;
 
+		*len = dirbuflen - uio.uio_resid;
 		*off = uio.uio_offset;
 
-		*cpos = dirbuf;
-		*len = (dirbuflen - uio.uio_resid);
-
-		if (*len == 0)
-			return (ENOENT);
-	}
+		if (*len == 0) {
+			/* Sanity check on INVARIANTS. */
+			MPASS(*eofflag != 0);
+			*eofflag = 1;
+			goto success;
+		}
 
-	dp = (struct dirent *)(*cpos);
-	reclen = dp->d_reclen;
-	*dpp = dp;
+		/*
+		 * Normalize the flag returned by VOP_READDIR(), since we use 2
+		 * as a sentinel value.
+		 */
+		if (*eofflag != 0)
+			*eofflag = 1;
 
-	/* check for malformed directory.. */
-	if (reclen < DIRENT_MINSIZE)
-		return (EINVAL);
+		dp = (struct dirent *)dirbuf;
+	}
 
-	*cpos += reclen;
-	*len -= reclen;
+	if (__predict_false(*len < GENERIC_MINDIRSIZ ||
+	    dp->d_reclen < GENERIC_MINDIRSIZ)) {
+		error = EINTEGRITY;
+		dp = NULL;
+		goto out;
+	}
 
-	return (0);
+success:
+	error = 0;
+out:
+	*dpp = dp;
+	return (error);
 }
 
+
 static u_long vn_lock_pair_pause_cnt;
 SYSCTL_ULONG(_debug, OID_AUTO, vn_lock_pair_pause, CTLFLAG_RD,
     &vn_lock_pair_pause_cnt, 0,
diff --git a/sys/sys/dirent.h b/sys/sys/dirent.h
index b3c5e00cd9ad..9087b01fa597 100644
--- a/sys/sys/dirent.h
+++ b/sys/sys/dirent.h
@@ -122,11 +122,14 @@ struct freebsd11_dirent {
 #define	_GENERIC_DIRLEN(namlen)					\
 	((__offsetof(struct dirent, d_name) + (namlen) + 1 + 7) & ~7)
 #define	_GENERIC_DIRSIZ(dp)	_GENERIC_DIRLEN((dp)->d_namlen)
+#define	_GENERIC_MINDIRSIZ	_GENERIC_DIRLEN(1) /* Name must not be empty */
+#define	_GENERIC_MAXDIRSIZ	_GENERIC_DIRLEN(MAXNAMLEN)
 #endif /* __BSD_VISIBLE */
 
 #ifdef _KERNEL
 #define	GENERIC_DIRSIZ(dp)	_GENERIC_DIRSIZ(dp)
-
+#define	GENERIC_MINDIRSIZ	_GENERIC_MINDIRSIZ
+#define	GENERIC_MAXDIRSIZ	_GENERIC_MAXDIRSIZ
 /*
  * Ensure that padding bytes are zeroed and that the name is NUL-terminated.
  */
diff --git a/sys/sys/vnode.h b/sys/sys/vnode.h
index 282f75bcdd71..8403dd8c035b 100644
--- a/sys/sys/vnode.h
+++ b/sys/sys/vnode.h
@@ -1099,9 +1099,9 @@ void vfs_hash_remove(struct vnode *vp);
 
 int vfs_kqfilter(struct vop_kqfilter_args *);
 struct dirent;
-int vn_dir_next_dirent(struct vnode *vp, struct dirent **dpp, char *dirbuf,
-    int dirbuflen, off_t *off, char **cpos, int *len,
-    int *eofflag, struct thread *td);
+int vn_dir_next_dirent(struct vnode *vp, struct thread *td,
+    char *dirbuf, size_t dirbuflen,
+    struct dirent **dpp, size_t *len, off_t *off, int *eofflag);
 int vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off);
 int vfs_emptydir(struct vnode *vp);