mmap MAP_NOSYNC regression in 10.x
Alan Cox
alc at rice.edu
Sun Sep 7 00:17:40 UTC 2014
On 09/05/2014 07:38, Konstantin Belousov wrote:
> On Fri, Sep 05, 2014 at 01:56:40PM +0200, Pieter de Goeje wrote:
>> Thanks, works for me!
> I realized that the patch contains yet another bug. The oflags page
> flags update is protected by the exclusive vm object lock, which is only
> held in shared mode on the fast path. Below is the fixed patch, where
> I take the page lock around setting VPO_NOSYNC (exclusive lock owners
> cannot race with fast path since we own shared lock, and two parallel
> fast path execution would be handled by the page lock).
Suppose that the page is clean and two threads are executing this code
concurrently. One's map entry has MAP_NOSYNC set, and the other's
doesn't. Let's call these threads NOSYNC and SYNC, respectively.
Suppose that the thread SYNC is slightly ahead. It has already
performed "m->oflags &= ~VPO_NOSYNC;" and now it's about to perform
"vm_page_dirty(fs.m);". However, just before the thread SYNC calls
vm_page_dirty(), the thread NOSYNC evaluates "m->dirty == 0", which is
still true, and it performs "m->oflags |= VPO_NOSYNC; "
This can't happen on the slow path. That is, a fault by a thread
without MAP_NOSYNC set on its map entry will reliably clear VPO_NOSYNC.
The best course of action may be to fall back to the slow path if you
actually need to change VPO_NOSYNC's state. Usually, you won't need to.
> diff --git a/sys/vm/vm_fault.c b/sys/vm/vm_fault.c
> index 30b0456..f327ab0 100644
> --- a/sys/vm/vm_fault.c
> +++ b/sys/vm/vm_fault.c
> @@ -174,6 +174,54 @@ unlock_and_deallocate(struct faultstate *fs)
> }
> }
>
> +static void
> +vm_fault_dirty(vm_map_entry_t entry, vm_page_t m, vm_prot_t prot,
> + vm_prot_t fault_type, int fault_flags, boolean_t set_wd)
> +{
> +
> + if (((prot & VM_PROT_WRITE) != 0 ||
> + (fault_flags & VM_FAULT_DIRTY) != 0) &&
> + (m->oflags & VPO_UNMANAGED) == 0) {
> + if (set_wd)
> + vm_object_set_writeable_dirty(m->object);
> +
> + /*
> + * If this is a NOSYNC mmap we do not want to set VPO_NOSYNC
> + * if the page is already dirty to prevent data written with
> + * the expectation of being synced from not being synced.
> + * Likewise if this entry does not request NOSYNC then make
> + * sure the page isn't marked NOSYNC. Applications sharing
> + * data should use the same flags to avoid ping ponging.
> + */
> + if (entry->eflags & MAP_ENTRY_NOSYNC) {
> + if (m->dirty == 0) {
> + if (!set_wd)
> + vm_page_lock(m);
> + m->oflags |= VPO_NOSYNC;
> + if (!set_wd)
> + vm_page_unlock(m);
> + }
> + } else {
> + m->oflags &= ~VPO_NOSYNC;
> + }
> +
> + /*
> + * If the fault is a write, we know that this page is being
> + * written NOW so dirty it explicitly to save on
> + * pmap_is_modified() calls later.
> + *
> + * Also tell the backing pager, if any, that it should remove
> + * any swap backing since the page is now dirty.
> + */
> + if (((fault_type & VM_PROT_WRITE) != 0 &&
> + (fault_flags & VM_FAULT_CHANGE_WIRING) == 0) ||
> + (fault_flags & VM_FAULT_DIRTY) != 0) {
> + vm_page_dirty(m);
> + vm_pager_page_unswapped(m);
> + }
> + }
> +}
> +
> /*
> * TRYPAGER - used by vm_fault to calculate whether the pager for the
> * current object *might* contain the page.
> @@ -321,11 +369,8 @@ RetryFault:;
> vm_page_hold(m);
> vm_page_unlock(m);
> }
> - if ((fault_type & VM_PROT_WRITE) != 0 &&
> - (m->oflags & VPO_UNMANAGED) == 0) {
> - vm_page_dirty(m);
> - vm_pager_page_unswapped(m);
> - }
> + vm_fault_dirty(fs.entry, m, prot, fault_type, fault_flags,
> + FALSE);
> VM_OBJECT_RUNLOCK(fs.first_object);
> if (!wired)
> vm_fault_prefault(&fs, vaddr, 0, 0);
> @@ -898,42 +943,7 @@ vnode_locked:
> if (hardfault)
> fs.entry->next_read = fs.pindex + faultcount - reqpage;
>
> - if (((prot & VM_PROT_WRITE) != 0 ||
> - (fault_flags & VM_FAULT_DIRTY) != 0) &&
> - (fs.m->oflags & VPO_UNMANAGED) == 0) {
> - vm_object_set_writeable_dirty(fs.object);
> -
> - /*
> - * If this is a NOSYNC mmap we do not want to set VPO_NOSYNC
> - * if the page is already dirty to prevent data written with
> - * the expectation of being synced from not being synced.
> - * Likewise if this entry does not request NOSYNC then make
> - * sure the page isn't marked NOSYNC. Applications sharing
> - * data should use the same flags to avoid ping ponging.
> - */
> - if (fs.entry->eflags & MAP_ENTRY_NOSYNC) {
> - if (fs.m->dirty == 0)
> - fs.m->oflags |= VPO_NOSYNC;
> - } else {
> - fs.m->oflags &= ~VPO_NOSYNC;
> - }
> -
> - /*
> - * If the fault is a write, we know that this page is being
> - * written NOW so dirty it explicitly to save on
> - * pmap_is_modified() calls later.
> - *
> - * Also tell the backing pager, if any, that it should remove
> - * any swap backing since the page is now dirty.
> - */
> - if (((fault_type & VM_PROT_WRITE) != 0 &&
> - (fault_flags & VM_FAULT_CHANGE_WIRING) == 0) ||
> - (fault_flags & VM_FAULT_DIRTY) != 0) {
> - vm_page_dirty(fs.m);
> - vm_pager_page_unswapped(fs.m);
> - }
> - }
> -
> + vm_fault_dirty(fs.entry, fs.m, prot, fault_type, fault_flags, TRUE);
> vm_page_assert_xbusied(fs.m);
>
> /*
> diff --git a/sys/vm/vm_page.h b/sys/vm/vm_page.h
> index f12b76c..a45648d 100644
> --- a/sys/vm/vm_page.h
> +++ b/sys/vm/vm_page.h
> @@ -147,7 +147,7 @@ struct vm_page {
> uint16_t hold_count; /* page hold count (P) */
> uint16_t flags; /* page PG_* flags (P) */
> uint8_t aflags; /* access is atomic */
> - uint8_t oflags; /* page VPO_* flags (O) */
> + uint8_t oflags; /* page VPO_* flags (OM) */
> uint8_t queue; /* page queue index (P,Q) */
> int8_t psind; /* pagesizes[] index (O) */
> int8_t segind;
> @@ -163,8 +163,9 @@ struct vm_page {
> /*
> * Page flags stored in oflags:
> *
> - * Access to these page flags is synchronized by the lock on the object
> - * containing the page (O).
> + * Access to these page flags is synchronized by the exclusive lock on
> + * the object containing the page, or combination of shared object
> + * lock and the page lock (OM).
> *
> * Note: VPO_UNMANAGED (used by OBJT_DEVICE, OBJT_PHYS and OBJT_SG)
> * indicates that the page is not under PV management but
More information about the freebsd-hackers
mailing list