git: 3b6056204dd8 - main - FIOSEEKHOLE/FIOSEEKDATA: correct consistency for bmap-based implementation

From: Konstantin Belousov <kib_at_FreeBSD.org>
Date: Sat, 04 Feb 2023 18:32:45 UTC
The branch main has been updated by kib:

URL: https://cgit.FreeBSD.org/src/commit/?id=3b6056204dd80cc866b7998ef0776247ebc42ce4

commit 3b6056204dd80cc866b7998ef0776247ebc42ce4
Author:     Konstantin Belousov <kib@FreeBSD.org>
AuthorDate: 2023-02-04 01:20:19 +0000
Commit:     Konstantin Belousov <kib@FreeBSD.org>
CommitDate: 2023-02-04 18:32:07 +0000

    FIOSEEKHOLE/FIOSEEKDATA: correct consistency for bmap-based implementation
    
    Writes on UFS through a mapped region do not allocate disk blocks in
    holes immediately. The blocks are allocated when the pages are paged out
    first time.
    
    This breaks the algorithm in vn_bmap_seekhole() and ufs_bmap_seekdata(),
    because VOP_BMAP() reports hole for the place which already contains a
    valid data.
    
    Clean the pages before doing VOP_BMAP() in the affected functions.  In
    principle, we could clean less by only requesting clean starting from
    the offset, but it is probably not very important.
    
    PR:     269261
    Reported by:    asomers
    Reviewed by:    asomers, markj
    Sponsored by:   The FreeBSD Foundation
    MFC after:      1 week
    Differential revision:  https://reviews.freebsd.org/D38379
---
 sys/kern/vfs_vnops.c    | 14 ++++++++++++--
 sys/ufs/ufs/ufs_bmap.c  | 18 ++++++++++++++++++
 sys/ufs/ufs/ufs_vnops.c |  2 +-
 3 files changed, 31 insertions(+), 3 deletions(-)

diff --git a/sys/kern/vfs_vnops.c b/sys/kern/vfs_vnops.c
index a49070b6060d..1d90f76f9cd6 100644
--- a/sys/kern/vfs_vnops.c
+++ b/sys/kern/vfs_vnops.c
@@ -2556,6 +2556,7 @@ int
 vn_bmap_seekhole_locked(struct vnode *vp, u_long cmd, off_t *off,
     struct ucred *cred)
 {
+	vm_object_t obj;
 	off_t size;
 	daddr_t bn, bnp;
 	uint64_t bsize;
@@ -2564,7 +2565,7 @@ vn_bmap_seekhole_locked(struct vnode *vp, u_long cmd, off_t *off,
 
 	KASSERT(cmd == FIOSEEKHOLE || cmd == FIOSEEKDATA,
 	    ("%s: Wrong command %lu", __func__, cmd));
-	ASSERT_VOP_LOCKED(vp, "vn_bmap_seekhole_locked");
+	ASSERT_VOP_ELOCKED(vp, "vn_bmap_seekhole_locked");
 
 	if (vp->v_type != VREG) {
 		error = ENOTTY;
@@ -2578,6 +2579,15 @@ vn_bmap_seekhole_locked(struct vnode *vp, u_long cmd, off_t *off,
 		error = ENXIO;
 		goto out;
 	}
+
+	/* See the comment in ufs_bmap_seekdata(). */
+	obj = vp->v_object;
+	if (obj != NULL) {
+		VM_OBJECT_WLOCK(obj);
+		vm_object_page_clean(obj, 0, 0, OBJPC_SYNC);
+		VM_OBJECT_WUNLOCK(obj);
+	}
+
 	bsize = vp->v_mount->mnt_stat.f_iosize;
 	for (bn = noff / bsize; noff < size; bn++, noff += bsize -
 	    noff % bsize) {
@@ -2613,7 +2623,7 @@ vn_bmap_seekhole(struct vnode *vp, u_long cmd, off_t *off, struct ucred *cred)
 	KASSERT(cmd == FIOSEEKHOLE || cmd == FIOSEEKDATA,
 	    ("%s: Wrong command %lu", __func__, cmd));
 
-	if (vn_lock(vp, LK_SHARED) != 0)
+	if (vn_lock(vp, LK_EXCLUSIVE) != 0)
 		return (EBADF);
 	error = vn_bmap_seekhole_locked(vp, cmd, off, cred);
 	VOP_UNLOCK(vp);
diff --git a/sys/ufs/ufs/ufs_bmap.c b/sys/ufs/ufs/ufs_bmap.c
index 4ac8ca149279..acdd334f6c7b 100644
--- a/sys/ufs/ufs/ufs_bmap.c
+++ b/sys/ufs/ufs/ufs_bmap.c
@@ -44,12 +44,16 @@ __FBSDID("$FreeBSD$");
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/proc.h>
+#include <sys/rwlock.h>
 #include <sys/vnode.h>
 #include <sys/mount.h>
 #include <sys/racct.h>
 #include <sys/resourcevar.h>
 #include <sys/stat.h>
 
+#include <vm/vm.h>
+#include <vm/vm_object.h>
+
 #include <ufs/ufs/extattr.h>
 #include <ufs/ufs/quota.h>
 #include <ufs/ufs/inode.h>
@@ -348,6 +352,7 @@ ufs_bmap_seekdata(struct vnode *vp, off_t *offp)
 	struct inode *ip;
 	struct mount *mp;
 	struct ufsmount *ump;
+	vm_object_t obj;
 	ufs2_daddr_t bn, daddr, nextbn;
 	uint64_t bsize;
 	off_t numblks;
@@ -364,6 +369,19 @@ ufs_bmap_seekdata(struct vnode *vp, off_t *offp)
 	if (*offp < 0 || *offp >= ip->i_size)
 		return (ENXIO);
 
+	/*
+	 * We could have pages on the vnode' object queue which still
+	 * do not have the data blocks allocated.  Convert all dirty
+	 * pages into buffer writes to ensure that we see all
+	 * allocated data.
+	 */
+	obj = vp->v_object;
+	if (obj != NULL) {
+		VM_OBJECT_WLOCK(obj);
+		vm_object_page_clean(obj, 0, 0, OBJPC_SYNC);
+		VM_OBJECT_WUNLOCK(obj);
+	}
+
 	bsize = mp->mnt_stat.f_iosize;
 	for (bn = *offp / bsize, numblks = howmany(ip->i_size, bsize);
 	    bn < numblks; bn = nextbn) {
diff --git a/sys/ufs/ufs/ufs_vnops.c b/sys/ufs/ufs/ufs_vnops.c
index 93a5b173b785..ae6d963920f3 100644
--- a/sys/ufs/ufs/ufs_vnops.c
+++ b/sys/ufs/ufs/ufs_vnops.c
@@ -2944,7 +2944,7 @@ ufs_ioctl(struct vop_ioctl_args *ap)
 	vp = ap->a_vp;
 	switch (ap->a_command) {
 	case FIOSEEKDATA:
-		error = vn_lock(vp, LK_SHARED);
+		error = vn_lock(vp, LK_EXCLUSIVE);
 		if (error == 0) {
 			error = ufs_bmap_seekdata(vp, (off_t *)ap->a_data);
 			VOP_UNLOCK(vp);