mmap() question

Dmitry Sivachenko trtrmitya at gmail.com
Wed Oct 8 15:14:51 UTC 2014


On Oct 12, 2013, at 13:59, Konstantin Belousov <kostikbel at gmail.com> wrote:
> 
> I was not able to reproduce the situation locally. I even tried to start
> a lot of threads accessing the mapped regions, to try to outrun the
> pagedaemon. The user threads sleep on the disk read, while pagedaemon
> has a lot of time to rebalance the queues. It might be a case when SSD
> indeed makes a difference.
> 
> Still, I see how this situation could appear. The code, which triggers
> OOM, never fires if there is a free space in the swapfile, so the
> absence of swap is a necessary condition to trigger the bug.  Next, OOM
> calculation does not account for a possibility that almost all pages on
> the queues can be reused. It just fires if free pages depleted too much
> or free target cannot be reached.
> 
> IMO one of the possible solution is to account the queued pages in
> addition to the swap space.  This is not entirely accurate, since some
> pages on the queues cannot be reused, at least transiently.  Most precise
> algorithm would count the hold and busy pages globally, and subtract
> this count from queues length, but it is probably too costly.
> 
> Instead, I think we could rely on the numbers which are counted by
> pagedaemon threads during the passes.  Due to the transient nature of the
> pagedaemon failures, this should be fine.
> 
> Below is the prototype patch, against HEAD.  It is not applicable to
> stable, please use HEAD kernel for test.



Hello,

any chance to commit this patch?

Thanks!



> 
> diff --git a/sys/sys/vmmeter.h b/sys/sys/vmmeter.h
> index d2ad920..ee5159a 100644
> --- a/sys/sys/vmmeter.h
> +++ b/sys/sys/vmmeter.h
> @@ -93,9 +93,10 @@ struct vmmeter {
> 	u_int v_free_min;	/* (c) pages desired free */
> 	u_int v_free_count;	/* (f) pages free */
> 	u_int v_wire_count;	/* (a) pages wired down */
> -	u_int v_active_count;	/* (q) pages active */
> +	u_int v_active_count;	/* (a) pages active */
> 	u_int v_inactive_target; /* (c) pages desired inactive */
> -	u_int v_inactive_count;	/* (q) pages inactive */
> +	u_int v_inactive_count;	/* (a) pages inactive */
> +	u_int v_queue_sticky;	/* (a) pages on queues but cannot process */
> 	u_int v_cache_count;	/* (f) pages on cache queue */
> 	u_int v_cache_min;	/* (c) min pages desired on cache queue */
> 	u_int v_cache_max;	/* (c) max pages in cached obj (unused) */
> diff --git a/sys/vm/vm_meter.c b/sys/vm/vm_meter.c
> index 713a2be..4bb1f1f 100644
> --- a/sys/vm/vm_meter.c
> +++ b/sys/vm/vm_meter.c
> @@ -316,6 +316,7 @@ VM_STATS_VM(v_active_count, "Active pages");
> VM_STATS_VM(v_inactive_target, "Desired inactive pages");
> VM_STATS_VM(v_inactive_count, "Inactive pages");
> VM_STATS_VM(v_cache_count, "Pages on cache queue");
> +VM_STATS_VM(v_queue_sticky, "Pages which cannot be moved from queues");
> VM_STATS_VM(v_cache_min, "Min pages on cache queue");
> VM_STATS_VM(v_cache_max, "Max pages on cached queue");
> VM_STATS_VM(v_pageout_free_min, "Min pages reserved for kernel");
> diff --git a/sys/vm/vm_page.h b/sys/vm/vm_page.h
> index 7846702..6943a0e 100644
> --- a/sys/vm/vm_page.h
> +++ b/sys/vm/vm_page.h
> @@ -226,6 +226,7 @@ struct vm_domain {
> 	long vmd_segs;	/* bitmask of the segments */
> 	boolean_t vmd_oom;
> 	int vmd_pass;	/* local pagedaemon pass */
> +	int vmd_queue_sticky;	/* pages on queues which cannot be processed */
> 	struct vm_page vmd_marker; /* marker for pagedaemon private use */
> };
> 
> diff --git a/sys/vm/vm_pageout.c b/sys/vm/vm_pageout.c
> index 5660b56..a62cf97 100644
> --- a/sys/vm/vm_pageout.c
> +++ b/sys/vm/vm_pageout.c
> @@ -896,7 +896,7 @@ vm_pageout_scan(struct vm_domain *vmd, int pass)
> {
> 	vm_page_t m, next;
> 	struct vm_pagequeue *pq;
> -	int page_shortage, maxscan, pcount;
> +	int failed_scan, page_shortage, maxscan, pcount;
> 	int addl_page_shortage;
> 	vm_object_t object;
> 	int act_delta;
> @@ -960,6 +960,7 @@ vm_pageout_scan(struct vm_domain *vmd, int pass)
> 	 */
> 	pq = &vmd->vmd_pagequeues[PQ_INACTIVE];
> 	maxscan = pq->pq_cnt;
> +	failed_scan = 0;
> 	vm_pagequeue_lock(pq);
> 	queues_locked = TRUE;
> 	for (m = TAILQ_FIRST(&pq->pq_pl);
> @@ -1012,6 +1013,7 @@ vm_pageout_scan(struct vm_domain *vmd, int pass)
> 			vm_page_unlock(m);
> 			VM_OBJECT_WUNLOCK(object);
> 			addl_page_shortage++;
> +			failed_scan++;
> 			continue;
> 		}
> 
> @@ -1075,6 +1077,7 @@ vm_pageout_scan(struct vm_domain *vmd, int pass)
> 			 * loop over the active queue below.
> 			 */
> 			addl_page_shortage++;
> +			failed_scan++;
> 			goto relock_queues;
> 		}
> 
> @@ -1229,6 +1232,7 @@ vm_pageout_scan(struct vm_domain *vmd, int pass)
> 				 */
> 				if (vm_page_busied(m)) {
> 					vm_page_unlock(m);
> +					failed_scan++;
> 					goto unlock_and_continue;
> 				}
> 
> @@ -1241,6 +1245,7 @@ vm_pageout_scan(struct vm_domain *vmd, int pass)
> 					vm_page_requeue_locked(m);
> 					if (object->flags & OBJ_MIGHTBEDIRTY)
> 						vnodes_skipped++;
> +					failed_scan++;
> 					goto unlock_and_continue;
> 				}
> 				vm_pagequeue_unlock(pq);
> @@ -1386,6 +1391,11 @@ relock_queues:
> 		m = next;
> 	}
> 	vm_pagequeue_unlock(pq);
> +
> +	atomic_add_int(&cnt.v_queue_sticky, failed_scan -
> +	    vmd->vmd_queue_sticky);
> +	vmd->vmd_queue_sticky = failed_scan;
> +
> #if !defined(NO_SWAPPING)
> 	/*
> 	 * Idle process swapout -- run once per second.
> @@ -1433,10 +1443,15 @@ static int vm_pageout_oom_vote;
> static void
> vm_pageout_mightbe_oom(struct vm_domain *vmd, int pass)
> {
> +	u_int queues_count;
> 	int old_vote;
> 
> -	if (pass <= 1 || !((swap_pager_avail < 64 && vm_page_count_min()) ||
> -	    (swap_pager_full && vm_paging_target() > 0))) {
> +	queues_count = cnt.v_active_count + cnt.v_inactive_count -
> +	    cnt.v_queue_sticky;
> +	if (pass <= 1 || !((swap_pager_avail < 64 && vm_page_count_min() &&
> +	    queues_count <= cnt.v_free_min) ||
> +	    (swap_pager_full && vm_paging_target() > 0 &&
> +	    queues_count <= vm_paging_target()))) {
> 		if (vmd->vmd_oom) {
> 			vmd->vmd_oom = FALSE;
> 			atomic_subtract_int(&vm_pageout_oom_vote, 1);



More information about the freebsd-hackers mailing list