git: fdb1dbb1cc06 - main - vm: read-locked fault handling for backing objects

From: Mateusz Guzik <mjg_at_FreeBSD.org>
Date: Sat, 11 Mar 2023 11:08:43 UTC
The branch main has been updated by mjg:

URL: https://cgit.FreeBSD.org/src/commit/?id=fdb1dbb1cc0608edd54451050fa56b84a303c8a6

commit fdb1dbb1cc0608edd54451050fa56b84a303c8a6
Author:     Mateusz Guzik <mjg@FreeBSD.org>
AuthorDate: 2023-03-07 20:56:54 +0000
Commit:     Mateusz Guzik <mjg@FreeBSD.org>
CommitDate: 2023-03-11 11:08:21 +0000

    vm: read-locked fault handling for backing objects
    
    This is close to the simplest patch that manages to avoid write
    locking for backing objects, and as a result it mostly fixes vm
    object contention problems.
    
    What is not fixed:
    1. cacheline ping pong due to read-locks
    2. cacheline ping pong due to pip
    3. cacheline ping pong due to object busying
    4. write locking on first object
    
    On top of that, using VM_OBJECT_UNLOCK instead of explicitly
    tracking which lock type is held is slower multithreaded than it
    needs to be, as the macro has to determine at runtime whether the
    lock is read- or write-owned; this is done for simplicity for the
    time being.
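
    Since multiple readers can hold the object lock at once, an upgrade
    to the write lock can only succeed for a sole reader; this is why a
    restart path is needed at all.  Below is a minimal userland toy
    demonstrating that constraint (illustrative only, not the kernel
    primitives; the real code uses VM_OBJECT_TRYUPGRADE and, on
    failure, clears fs->can_read_lock, unlocks everything and retries
    the fault write-locked):

        #include <stdatomic.h>
        #include <stdbool.h>
        #include <stdio.h>

        /*
         * Toy rwlock: state < 0 means write-locked, 0 is free, > 0
         * counts readers.  An upgrade succeeds only for a sole reader.
         */
        static atomic_int state;

        static void
        rlock(void)
        {
                int s;

                do {
                        s = atomic_load(&state);
                } while (s < 0 ||
                    !atomic_compare_exchange_weak(&state, &s, s + 1));
        }

        static bool
        try_upgrade(void)
        {
                int one = 1;

                return (atomic_compare_exchange_strong(&state, &one, -1));
        }

        int
        main(void)
        {
                rlock();
                printf("sole reader: %s\n",
                    try_upgrade() ? "upgraded" : "failed");
                atomic_store(&state, 0);        /* write-unlock */

                rlock();
                rlock();        /* a second reader is in the way */
                printf("two readers: %s\n",
                    try_upgrade() ? "upgraded" : "failed");
                /*
                 * This is the case where vm_fault_next() gives up,
                 * clears can_read_lock and restarts the fault with
                 * write locks.
                 */
                return (0);
        }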
    
    Sample lock profiling results from doing -j 104 buildkernel on tmpfs:
    before:
    71446200 (rw:vmobject)
    14689706 (sx:vm map (user))
    4166251 (rw:pmap pv list)
    2799924 (spin mutex:turnstile chain)
    
    after:
    19940411 (rw:vmobject)
    8166012 (rw:pmap pv list)
    6017608 (sx:vm map (user))
    1151416 (sleep mutex:pipe mutex)
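
    For reference, profiles like the above come from the kernel lock
    profiling facility and can be driven by hand with sysctl(8).  A
    minimal C sketch, assuming a kernel built with
    "options LOCK_PROFILING":

        #include <sys/types.h>
        #include <sys/sysctl.h>

        #include <err.h>

        static void
        set_prof(const char *knob, int val)
        {
                if (sysctlbyname(knob, NULL, NULL, &val,
                    sizeof(val)) != 0)
                        err(1, "%s", knob);
        }

        int
        main(void)
        {
                set_prof("debug.lock.prof.reset", 1);
                set_prof("debug.lock.prof.enable", 1);
                /* ... run the workload, e.g. -j 104 buildkernel ... */
                set_prof("debug.lock.prof.enable", 0);
                /* Read the results via: sysctl debug.lock.prof.stats */
                return (0);
        }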
    
    Reviewed by:    kib
    Reviewed by:    markj
    Tested by:      pho
    Differential Revision:  https://reviews.freebsd.org/D38964
---
 sys/vm/vm_fault.c | 81 +++++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 67 insertions(+), 14 deletions(-)

diff --git a/sys/vm/vm_fault.c b/sys/vm/vm_fault.c
index 2afe5a19d2d7..5df667052615 100644
--- a/sys/vm/vm_fault.c
+++ b/sys/vm/vm_fault.c
@@ -132,6 +132,7 @@ struct faultstate {
 	struct timeval	oom_start_time;
 	bool		oom_started;
 	int		nera;
+	bool		can_read_lock;
 
 	/* Page reference for cow. */
 	vm_page_t m_cow;
@@ -170,6 +171,12 @@ enum fault_status {
 	FAULT_PROTECTION_FAILURE, /* Invalid access. */
 };
 
+enum fault_next_status {
+	FAULT_NEXT_GOTOBJ = 1,
+	FAULT_NEXT_NOOBJ,
+	FAULT_NEXT_RESTART,
+};
+
 static void vm_fault_dontneed(const struct faultstate *fs, vm_offset_t vaddr,
 	    int ahead);
 static void vm_fault_prefault(const struct faultstate *fs, vm_offset_t addra,
@@ -278,7 +285,7 @@ static void
 unlock_and_deallocate(struct faultstate *fs)
 {
 
-	VM_OBJECT_WUNLOCK(fs->object);
+	VM_OBJECT_UNLOCK(fs->object);
 	fault_deallocate(fs);
 }
 
@@ -736,6 +743,26 @@ vm_fault_trap(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type,
 	return (result);
 }
 
+static bool
+vm_fault_object_ensure_wlocked(struct faultstate *fs)
+{
+	if (fs->object == fs->first_object)
+		VM_OBJECT_ASSERT_WLOCKED(fs->object);
+
+	if (!fs->can_read_lock)  {
+		VM_OBJECT_ASSERT_WLOCKED(fs->object);
+		return (true);
+	}
+
+	if (VM_OBJECT_WOWNED(fs->object))
+		return (true);
+
+	if (VM_OBJECT_TRYUPGRADE(fs->object))
+		return (true);
+
+	return (false);
+}
+
 static enum fault_status
 vm_fault_lock_vnode(struct faultstate *fs, bool objlocked)
 {
@@ -1042,12 +1069,15 @@ vm_fault_cow(struct faultstate *fs)
 	curthread->td_cow++;
 }
 
-static bool
+static enum fault_next_status
 vm_fault_next(struct faultstate *fs)
 {
 	vm_object_t next_object;
 
-	VM_OBJECT_ASSERT_WLOCKED(fs->object);
+	if (fs->object == fs->first_object || !fs->can_read_lock)
+		VM_OBJECT_ASSERT_WLOCKED(fs->object);
+	else
+		VM_OBJECT_ASSERT_LOCKED(fs->object);
 
 	/*
 	 * The requested page does not exist at this object/
@@ -1062,8 +1092,14 @@ vm_fault_next(struct faultstate *fs)
 	if (fs->object == fs->first_object) {
 		fs->first_m = fs->m;
 		fs->m = NULL;
-	} else
+	} else {
+		if (!vm_fault_object_ensure_wlocked(fs)) {
+			fs->can_read_lock = false;
+			unlock_and_deallocate(fs);
+			return (FAULT_NEXT_RESTART);
+		}
 		fault_page_free(&fs->m);
+	}
 
 	/*
 	 * Move on to the next object.  Lock the next object before
@@ -1071,18 +1107,21 @@ vm_fault_next(struct faultstate *fs)
 	 */
 	next_object = fs->object->backing_object;
 	if (next_object == NULL)
-		return (false);
+		return (FAULT_NEXT_NOOBJ);
 	MPASS(fs->first_m != NULL);
 	KASSERT(fs->object != next_object, ("object loop %p", next_object));
-	VM_OBJECT_WLOCK(next_object);
+	if (fs->can_read_lock)
+		VM_OBJECT_RLOCK(next_object);
+	else
+		VM_OBJECT_WLOCK(next_object);
 	vm_object_pip_add(next_object, 1);
 	if (fs->object != fs->first_object)
 		vm_object_pip_wakeup(fs->object);
 	fs->pindex += OFF_TO_IDX(fs->object->backing_object_offset);
-	VM_OBJECT_WUNLOCK(fs->object);
+	VM_OBJECT_UNLOCK(fs->object);
 	fs->object = next_object;
 
-	return (true);
+	return (FAULT_NEXT_GOTOBJ);
 }
 
 static void
@@ -1364,7 +1403,7 @@ vm_fault_busy_sleep(struct faultstate *fs)
 	unlock_map(fs);
 	if (fs->m != vm_page_lookup(fs->object, fs->pindex) ||
 	    !vm_page_busy_sleep(fs->m, "vmpfw", 0))
-		VM_OBJECT_WUNLOCK(fs->object);
+		VM_OBJECT_UNLOCK(fs->object);
 	VM_CNT_INC(v_intrans);
 	vm_object_deallocate(fs->first_object);
 }
@@ -1383,7 +1422,10 @@ vm_fault_object(struct faultstate *fs, int *behindp, int *aheadp)
 	enum fault_status res;
 	bool dead;
 
-	VM_OBJECT_ASSERT_WLOCKED(fs->object);
+	if (fs->object == fs->first_object || !fs->can_read_lock)
+		VM_OBJECT_ASSERT_WLOCKED(fs->object);
+	else
+		VM_OBJECT_ASSERT_LOCKED(fs->object);
 
 	/*
 	 * If the object is marked for imminent termination, we retry
@@ -1415,7 +1457,7 @@ vm_fault_object(struct faultstate *fs, int *behindp, int *aheadp)
 		 * done.
 		 */
 		if (vm_page_all_valid(fs->m)) {
-			VM_OBJECT_WUNLOCK(fs->object);
+			VM_OBJECT_UNLOCK(fs->object);
 			return (FAULT_SOFT);
 		}
 	}
@@ -1427,6 +1469,11 @@ vm_fault_object(struct faultstate *fs, int *behindp, int *aheadp)
 	 */
 	if (fs->m == NULL && (fault_object_needs_getpages(fs->object) ||
 	    fs->object == fs->first_object)) {
+		if (!vm_fault_object_ensure_wlocked(fs)) {
+			fs->can_read_lock = false;
+			unlock_and_deallocate(fs);
+			return (FAULT_RESTART);
+		}
 		res = vm_fault_allocate(fs);
 		if (res != FAULT_CONTINUE)
 			return (res);
@@ -1448,7 +1495,7 @@ vm_fault_object(struct faultstate *fs, int *behindp, int *aheadp)
 		 * prevents simultaneous faults and collapses while
 		 * the object lock is dropped.
 		 */
-		VM_OBJECT_WUNLOCK(fs->object);
+		VM_OBJECT_UNLOCK(fs->object);
 		res = vm_fault_getpages(fs, behindp, aheadp);
 		if (res == FAULT_CONTINUE)
 			VM_OBJECT_WLOCK(fs->object);
@@ -1465,6 +1512,7 @@ vm_fault(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type,
 	struct faultstate fs;
 	int ahead, behind, faultcount, rv;
 	enum fault_status res;
+	enum fault_next_status res_next;
 	bool hardfault;
 
 	VM_CNT_INC(v_vm_faults);
@@ -1480,6 +1528,7 @@ vm_fault(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type,
 	fs.lookup_still_valid = false;
 	fs.oom_started = false;
 	fs.nera = -1;
+	fs.can_read_lock = true;
 	faultcount = 0;
 	hardfault = false;
 
@@ -1590,15 +1639,19 @@ RetryFault:
 		 * traverse into a backing object or zero fill if none is
 		 * found.
 		 */
-		if (vm_fault_next(&fs))
+		res_next = vm_fault_next(&fs);
+		if (res_next == FAULT_NEXT_RESTART)
+			goto RetryFault;
+		else if (res_next == FAULT_NEXT_GOTOBJ)
 			continue;
+		MPASS(res_next == FAULT_NEXT_NOOBJ);
 		if ((fs.fault_flags & VM_FAULT_NOFILL) != 0) {
 			if (fs.first_object == fs.object)
 				fault_page_free(&fs.first_m);
 			unlock_and_deallocate(&fs);
 			return (KERN_OUT_OF_BOUNDS);
 		}
-		VM_OBJECT_WUNLOCK(fs.object);
+		VM_OBJECT_UNLOCK(fs.object);
 		vm_fault_zerofill(&fs);
 		/* Don't try to prefault neighboring pages. */
 		faultcount = 1;