svn commit: r330991 - in stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs: . sys
Andriy Gapon
avg at FreeBSD.org
Thu Mar 15 12:05:19 UTC 2018
Author: avg
Date: Thu Mar 15 12:05:17 2018
New Revision: 330991
URL: https://svnweb.freebsd.org/changeset/base/330991
Log:
MFC r329363: read-behind / read-ahead support for zfs_getpages()
Modified:
stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h
stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c
Directory Properties:
stable/11/ (props changed)
Modified: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
==============================================================================
--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c Thu Mar 15 11:06:04 2018 (r330990)
+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c Thu Mar 15 12:05:17 2018 (r330991)
@@ -1518,6 +1518,188 @@ dmu_write_pages(objset_t *os, uint64_t object, uint64_
dmu_buf_rele_array(dbp, numbufs, FTAG);
return (err);
}
+
+int
+dmu_read_pages(objset_t *os, uint64_t object, vm_page_t *ma, int count,
+ int *rbehind, int *rahead, int last_size)
+{
+ struct sf_buf *sf;
+ vm_object_t vmobj;
+ vm_page_t m;
+ dmu_buf_t **dbp;
+ dmu_buf_t *db;
+ caddr_t va;
+ int numbufs, i;
+ int bufoff, pgoff, tocpy;
+ int mi, di;
+ int err;
+
+ ASSERT3U(ma[0]->pindex + count - 1, ==, ma[count - 1]->pindex);
+ ASSERT(last_size <= PAGE_SIZE);
+
+ err = dmu_buf_hold_array(os, object, IDX_TO_OFF(ma[0]->pindex),
+ IDX_TO_OFF(count - 1) + last_size, TRUE, FTAG, &numbufs, &dbp);
+ if (err != 0)
+ return (err);
+
+#ifdef DEBUG
+ IMPLY(last_size < PAGE_SIZE, *rahead == 0);
+ if (dbp[0]->db_offset != 0 || numbufs > 1) {
+ for (i = 0; i < numbufs; i++) {
+ ASSERT(ISP2(dbp[i]->db_size));
+ ASSERT((dbp[i]->db_offset % dbp[i]->db_size) == 0);
+ ASSERT3U(dbp[i]->db_size, ==, dbp[0]->db_size);
+ }
+ }
+#endif
+
+ vmobj = ma[0]->object;
+ zfs_vmobject_wlock(vmobj);
+
+ db = dbp[0];
+ for (i = 0; i < *rbehind; i++) {
+ m = vm_page_grab(vmobj, ma[0]->pindex - 1 - i,
+ VM_ALLOC_NORMAL | VM_ALLOC_NOWAIT | VM_ALLOC_NOBUSY);
+ if (m == NULL)
+ break;
+ if (m->valid != 0) {
+ ASSERT3U(m->valid, ==, VM_PAGE_BITS_ALL);
+ break;
+ }
+ ASSERT(m->dirty == 0);
+ ASSERT(!pmap_page_is_mapped(m));
+
+ ASSERT(db->db_size > PAGE_SIZE);
+ bufoff = IDX_TO_OFF(m->pindex) % db->db_size;
+ va = zfs_map_page(m, &sf);
+ bcopy((char *)db->db_data + bufoff, va, PAGESIZE);
+ zfs_unmap_page(sf);
+ m->valid = VM_PAGE_BITS_ALL;
+ vm_page_lock(m);
+ if ((m->busy_lock & VPB_BIT_WAITERS) != 0)
+ vm_page_activate(m);
+ else
+ vm_page_deactivate(m);
+ vm_page_unlock(m);
+ }
+ *rbehind = i;
+
+ bufoff = IDX_TO_OFF(ma[0]->pindex) % db->db_size;
+ pgoff = 0;
+ for (mi = 0, di = 0; mi < count && di < numbufs; ) {
+ if (pgoff == 0) {
+ m = ma[mi];
+ vm_page_assert_xbusied(m);
+ ASSERT(m->valid == 0);
+ ASSERT(m->dirty == 0);
+ ASSERT(!pmap_page_is_mapped(m));
+ va = zfs_map_page(m, &sf);
+ }
+ if (bufoff == 0)
+ db = dbp[di];
+
+ ASSERT3U(IDX_TO_OFF(m->pindex) + pgoff, ==,
+ db->db_offset + bufoff);
+
+ /*
+ * We do not need to clamp the copy size by the file
+ * size as the last block is zero-filled beyond the
+ * end of file anyway.
+ */
+ tocpy = MIN(db->db_size - bufoff, PAGESIZE - pgoff);
+ bcopy((char *)db->db_data + bufoff, va + pgoff, tocpy);
+
+ pgoff += tocpy;
+ ASSERT(pgoff <= PAGESIZE);
+ if (pgoff == PAGESIZE) {
+ zfs_unmap_page(sf);
+ m->valid = VM_PAGE_BITS_ALL;
+ ASSERT(mi < count);
+ mi++;
+ pgoff = 0;
+ }
+
+ bufoff += tocpy;
+ ASSERT(bufoff <= db->db_size);
+ if (bufoff == db->db_size) {
+ ASSERT(di < numbufs);
+ di++;
+ bufoff = 0;
+ }
+ }
+
+#ifdef DEBUG
+ /*
+ * Three possibilities:
+ * - last requested page ends at a buffer boundary and, thus,
+ * all pages and buffers have been iterated;
+ * - all requested pages are filled, but the last buffer
+ * has not been exhausted;
+ * the read-ahead is possible only in this case;
+ * - all buffers have been read, but the last page has not been
+ * fully filled;
+ * this is only possible if the file has only a single buffer
+ * with a size that is not a multiple of the page size.
+ */
+ if (mi == count) {
+ ASSERT(di >= numbufs - 1);
+ IMPLY(*rahead != 0, di == numbufs - 1);
+ IMPLY(*rahead != 0, bufoff != 0);
+ ASSERT(pgoff == 0);
+ }
+ if (di == numbufs) {
+ ASSERT(mi >= count - 1);
+ ASSERT(*rahead == 0);
+ IMPLY(pgoff == 0, mi == count);
+ if (pgoff != 0) {
+ ASSERT(mi == count - 1);
+ ASSERT((dbp[0]->db_size & PAGE_MASK) != 0);
+ }
+ }
+#endif
+ if (pgoff != 0) {
+ bzero(va + pgoff, PAGESIZE - pgoff);
+ zfs_unmap_page(sf);
+ m->valid = VM_PAGE_BITS_ALL;
+ }
+
+ for (i = 0; i < *rahead; i++) {
+ m = vm_page_grab(vmobj, ma[count - 1]->pindex + 1 + i,
+ VM_ALLOC_NORMAL | VM_ALLOC_NOWAIT | VM_ALLOC_NOBUSY);
+ if (m == NULL)
+ break;
+ if (m->valid != 0) {
+ ASSERT3U(m->valid, ==, VM_PAGE_BITS_ALL);
+ break;
+ }
+ ASSERT(m->dirty == 0);
+ ASSERT(!pmap_page_is_mapped(m));
+
+ ASSERT(db->db_size > PAGE_SIZE);
+ bufoff = IDX_TO_OFF(m->pindex) % db->db_size;
+ tocpy = MIN(db->db_size - bufoff, PAGESIZE);
+ va = zfs_map_page(m, &sf);
+ bcopy((char *)db->db_data + bufoff, va, tocpy);
+ if (tocpy < PAGESIZE) {
+ ASSERT(i == *rahead - 1);
+ ASSERT((db->db_size & PAGE_MASK) != 0);
+ bzero(va + tocpy, PAGESIZE - tocpy);
+ }
+ zfs_unmap_page(sf);
+ m->valid = VM_PAGE_BITS_ALL;
+ vm_page_lock(m);
+ if ((m->busy_lock & VPB_BIT_WAITERS) != 0)
+ vm_page_activate(m);
+ else
+ vm_page_deactivate(m);
+ vm_page_unlock(m);
+ }
+ *rahead = i;
+ zfs_vmobject_wunlock(vmobj);
+
+ dmu_buf_rele_array(dbp, numbufs, FTAG);
+ return (0);
+}
#endif /* illumos */
#endif /* _KERNEL */
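For readers tracing the new dmu_read_pages() above: its core is a two-cursor loop that walks the requested pages (mi/pgoff) and the held DMU buffers (di/bufoff) in lockstep, copying whichever span is smaller at each step, so it handles both blocks larger than a page and small files whose single block is smaller than a page. Below is a minimal userland sketch of that loop under assumed simplifications (plain byte arrays instead of vm_page_t/dmu_buf_t, memcpy instead of sf_buf mappings, 4 KB pages); it is not code from the tree.

#include <stddef.h>
#include <string.h>

#define PG	4096			/* stand-in for PAGE_SIZE */

/*
 * Copy "nbufs" source buffers (sizes in bufsz[], starting at byte
 * "bufoff" of the first one) into "npages" destination pages, the way
 * dmu_read_pages() fills the requested pages from the held dbufs.
 */
static void
copy_bufs_to_pages(char *const *pages, int npages,
    char *const *bufs, const size_t *bufsz, int nbufs, size_t bufoff)
{
	int mi = 0, di = 0;
	size_t pgoff = 0, tocpy;

	while (mi < npages && di < nbufs) {
		/* Smaller of what remains in the current buffer and page. */
		tocpy = bufsz[di] - bufoff;
		if (tocpy > PG - pgoff)
			tocpy = PG - pgoff;
		memcpy(pages[mi] + pgoff, bufs[di] + bufoff, tocpy);

		pgoff += tocpy;
		if (pgoff == PG) {		/* page filled, advance page cursor */
			mi++;
			pgoff = 0;
		}
		bufoff += tocpy;
		if (bufoff == bufsz[di]) {	/* buffer drained, advance it */
			di++;
			bufoff = 0;
		}
	}
	/* As in the diff, a partially filled last page is zero-filled. */
	if (pgoff != 0 && mi < npages)
		memset(pages[mi] + pgoff, 0, PG - pgoff);
}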
Modified: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h
==============================================================================
--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h Thu Mar 15 11:06:04 2018 (r330990)
+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h Thu Mar 15 12:05:17 2018 (r330991)
@@ -753,6 +753,8 @@ int dmu_write_pages(objset_t *os, uint64_t object, uin
#else
int dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset,
uint64_t size, struct vm_page **ppa, dmu_tx_t *tx);
+int dmu_read_pages(objset_t *os, uint64_t object, vm_page_t *ma, int count,
+ int *rbehind, int *rahead, int last_size);
#endif
#endif
struct arc_buf *dmu_request_arcbuf(dmu_buf_t *handle, int size);
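As the new prototype above and the dmu.c change suggest, *rbehind and *rahead are in/out parameters: on entry they carry the maximum number of optional pages the caller allows before and after the required run ma[0..count-1], and on return they carry how many optional pages were actually grabbed and filled; last_size is the number of bytes expected to be valid in the last required page. A caller-side fragment for illustration only (the surrounding variables are assumed, not taken from the tree):

	int rbehind = 8, rahead = 16;		/* optional-page budget */
	int last_size = PAGE_SIZE;		/* valid bytes in the last page */
	int error;

	error = dmu_read_pages(os, object, ma, count,
	    &rbehind, &rahead, last_size);
	/* On success, rbehind/rahead hold the counts actually populated. */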
Modified: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c
==============================================================================
--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c Thu Mar 15 11:06:04 2018 (r330990)
+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c Thu Mar 15 12:05:17 2018 (r330991)
@@ -4515,81 +4515,85 @@ zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int fla
}
static int
-zfs_getpages(struct vnode *vp, vm_page_t *m, int count, int *rbehind,
+zfs_getpages(struct vnode *vp, vm_page_t *ma, int count, int *rbehind,
int *rahead)
{
znode_t *zp = VTOZ(vp);
zfsvfs_t *zfsvfs = zp->z_zfsvfs;
objset_t *os = zp->z_zfsvfs->z_os;
- vm_page_t mlast;
+ rl_t *rl;
vm_object_t object;
- caddr_t va;
- struct sf_buf *sf;
- off_t startoff, endoff;
- int i, error;
- vm_pindex_t reqstart, reqend;
- int lsize, size;
+ off_t start, end, obj_size;
+ uint_t blksz;
+ int pgsin_b, pgsin_a;
+ int error;
- object = m[0]->object;
- error = 0;
-
ZFS_ENTER(zfsvfs);
ZFS_VERIFY_ZP(zp);
- zfs_vmobject_wlock(object);
- if (m[count - 1]->valid != 0 && --count == 0) {
- zfs_vmobject_wunlock(object);
- goto out;
+ start = IDX_TO_OFF(ma[0]->pindex);
+ end = IDX_TO_OFF(ma[count - 1]->pindex + 1);
+
+ /*
+ * Lock a range covering all required and optional pages.
+ * Note that we need to handle the case of the block size growing.
+ */
+ for (;;) {
+ blksz = zp->z_blksz;
+ rl = zfs_range_lock(zp, rounddown(start, blksz),
+ roundup(end, blksz) - rounddown(start, blksz), RL_READER);
+ if (blksz == zp->z_blksz)
+ break;
+ zfs_range_unlock(rl);
}
- mlast = m[count - 1];
-
- if (IDX_TO_OFF(mlast->pindex) >=
- object->un_pager.vnp.vnp_size) {
- zfs_vmobject_wunlock(object);
+ object = ma[0]->object;
+ zfs_vmobject_wlock(object);
+ obj_size = object->un_pager.vnp.vnp_size;
+ zfs_vmobject_wunlock(object);
+ if (IDX_TO_OFF(ma[count - 1]->pindex) >= obj_size) {
+ zfs_range_unlock(rl);
ZFS_EXIT(zfsvfs);
return (zfs_vm_pagerret_bad);
}
- PCPU_INC(cnt.v_vnodein);
- PCPU_ADD(cnt.v_vnodepgsin, count);
+ pgsin_b = 0;
+ if (rbehind != NULL) {
+ pgsin_b = OFF_TO_IDX(start - rounddown(start, blksz));
+ pgsin_b = MIN(*rbehind, pgsin_b);
+ }
- lsize = PAGE_SIZE;
- if (IDX_TO_OFF(mlast->pindex) + lsize > object->un_pager.vnp.vnp_size)
- lsize = object->un_pager.vnp.vnp_size -
- IDX_TO_OFF(mlast->pindex);
- zfs_vmobject_wunlock(object);
-
- for (i = 0; i < count; i++) {
- size = PAGE_SIZE;
- if (i == count - 1)
- size = lsize;
- va = zfs_map_page(m[i], &sf);
- error = dmu_read(os, zp->z_id, IDX_TO_OFF(m[i]->pindex),
- size, va, DMU_READ_PREFETCH);
- if (size != PAGE_SIZE)
- bzero(va + size, PAGE_SIZE - size);
- zfs_unmap_page(sf);
- if (error != 0)
- goto out;
+ pgsin_a = 0;
+ if (rahead != NULL) {
+ pgsin_a = OFF_TO_IDX(roundup(end, blksz) - end);
+ if (end + IDX_TO_OFF(pgsin_a) >= obj_size)
+ pgsin_a = OFF_TO_IDX(round_page(obj_size) - end);
+ pgsin_a = MIN(*rahead, pgsin_a);
}
- zfs_vmobject_wlock(object);
- for (i = 0; i < count; i++)
- m[i]->valid = VM_PAGE_BITS_ALL;
- zfs_vmobject_wunlock(object);
+ /*
+ * NB: we need to pass the exact byte size of the data that we expect
+ * to read after accounting for the file size. This is required because
+ * ZFS will panic if we request DMU to read beyond the end of the last
+ * allocated block.
+ */
+ error = dmu_read_pages(os, zp->z_id, ma, count, &pgsin_b, &pgsin_a,
+ MIN(end, obj_size) - (end - PAGE_SIZE));
-out:
+ zfs_range_unlock(rl);
ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
ZFS_EXIT(zfsvfs);
- if (error == 0) {
- if (rbehind)
- *rbehind = 0;
- if (rahead)
- *rahead = 0;
- return (zfs_vm_pagerret_ok);
- } else
+
+ if (error != 0)
return (zfs_vm_pagerret_error);
+
+ PCPU_INC(cnt.v_vnodein);
+ PCPU_ADD(cnt.v_vnodepgsin, count + pgsin_b + pgsin_a);
+ if (rbehind != NULL)
+ *rbehind = pgsin_b;
+ if (rahead != NULL)
+ *rahead = pgsin_a;
+ return (zfs_vm_pagerret_ok);
}
static int
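To make the new read-behind/read-ahead sizing concrete, here is a standalone arithmetic sketch under assumed values (4 KB pages, a 128 KB block size, a file much larger than one block, and a request for the two pages at byte offsets 20 KB and 24 KB); the file-size clamp on pgsin_a is omitted for brevity and the helper macros are simplified stand-ins for the kernel ones.

#include <stdio.h>

#define PGSZ		4096UL
#define OFF_TO_IDX(o)	((int)((o) / PGSZ))
#define ROUNDDOWN(x, y)	(((x) / (y)) * (y))
#define ROUNDUP(x, y)	((((x) + (y) - 1) / (y)) * (y))

int
main(void)
{
	unsigned long blksz = 128 * 1024UL;	/* zp->z_blksz */
	unsigned long start = 20 * 1024UL;	/* IDX_TO_OFF(ma[0]->pindex) */
	unsigned long end = 28 * 1024UL;	/* one past the last requested page */
	int rbehind = 8, rahead = 64;		/* caller's optional-page budget */
	int pgsin_b, pgsin_a;

	/* Pages between the start of the block and the first requested page. */
	pgsin_b = OFF_TO_IDX(start - ROUNDDOWN(start, blksz));
	if (pgsin_b > rbehind)
		pgsin_b = rbehind;

	/* Pages between the end of the request and the end of the block. */
	pgsin_a = OFF_TO_IDX(ROUNDUP(end, blksz) - end);
	if (pgsin_a > rahead)
		pgsin_a = rahead;

	/* Prints: read-behind 5, read-ahead 25. */
	printf("read-behind %d, read-ahead %d\n", pgsin_b, pgsin_a);
	return (0);
}

The point of the change is visible in those numbers: the 128 KB block backing the two requested pages has to be read from the DMU in full anyway, so up to 30 additional resident pages can be populated from it at little extra cost, whereas the old per-page dmu_read() loop always reported zero read-behind and read-ahead pages.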