svn commit: r262199 - in projects/numa/sys: amd64/amd64 i386/i386 ia64/ia64 kern mips/mips powerpc/aim sparc64/sparc64 sys vm

Jeff Roberson jeff at FreeBSD.org
Wed Feb 19 00:35:31 UTC 2014


Author: jeff
Date: Wed Feb 19 00:35:27 2014
New Revision: 262199
URL: http://svnweb.freebsd.org/changeset/base/262199

Log:
   - Push NUMA domain selection into the object and out of the physical page
     layer.
   - Make UMA zones NUMA domain aware.
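
Illustrative sketch, not part of the commit itself: the backend page-allocator
hook now carries a preferred NUMA domain, and the per-architecture
uma_small_alloc() implementations pass that domain through to the physical
layer.  A condensed, hypothetical rendering of the amd64 variant shown in the
diff below (error handling and M_ZERO handling omitted):

typedef void *(*uma_alloc)(uma_zone_t zone, int size, int domain,
    uint8_t *pflag, int wait);

void *
uma_small_alloc(uma_zone_t zone, int bytes, int domain, u_int8_t *flags,
    int wait)
{
	vm_page_t m;
	int pflags;

	*flags = UMA_SLAB_PRIV;
	pflags = malloc2vm_flags(wait) | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED;
	for (;;) {
		/* Prefer a page from the requested domain. */
		m = vm_page_alloc_domain(NULL, 0, domain, pflags);
		if (m != NULL)
			break;
		if (wait & M_NOWAIT)
			return (NULL);
		VM_WAIT;
	}
	/* Map through the direct map and return, as before. */
	return ((void *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)));
}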

Modified:
  projects/numa/sys/amd64/amd64/uma_machdep.c
  projects/numa/sys/i386/i386/pmap.c
  projects/numa/sys/ia64/ia64/uma_machdep.c
  projects/numa/sys/kern/kern_mbuf.c
  projects/numa/sys/kern/subr_busdma_bufalloc.c
  projects/numa/sys/kern/subr_vmem.c
  projects/numa/sys/mips/mips/pmap.c
  projects/numa/sys/mips/mips/uma_machdep.c
  projects/numa/sys/powerpc/aim/mmu_oea64.c
  projects/numa/sys/powerpc/aim/slb.c
  projects/numa/sys/powerpc/aim/uma_machdep.c
  projects/numa/sys/sparc64/sparc64/vm_machdep.c
  projects/numa/sys/sys/busdma_bufalloc.h
  projects/numa/sys/sys/proc.h
  projects/numa/sys/vm/uma.h
  projects/numa/sys/vm/uma_core.c
  projects/numa/sys/vm/uma_int.h
  projects/numa/sys/vm/vm_object.c
  projects/numa/sys/vm/vm_object.h
  projects/numa/sys/vm/vm_page.c
  projects/numa/sys/vm/vm_page.h
  projects/numa/sys/vm/vm_phys.c
  projects/numa/sys/vm/vm_phys.h
  projects/numa/sys/vm/vm_reserv.c
  projects/numa/sys/vm/vm_reserv.h

Modified: projects/numa/sys/amd64/amd64/uma_machdep.c
==============================================================================
--- projects/numa/sys/amd64/amd64/uma_machdep.c	Tue Feb 18 23:22:54 2014	(r262198)
+++ projects/numa/sys/amd64/amd64/uma_machdep.c	Wed Feb 19 00:35:27 2014	(r262199)
@@ -41,7 +41,8 @@ __FBSDID("$FreeBSD$");
 #include <machine/vmparam.h>
 
 void *
-uma_small_alloc(uma_zone_t zone, int bytes, u_int8_t *flags, int wait)
+uma_small_alloc(uma_zone_t zone, int bytes, int domain, u_int8_t *flags,
+    int wait)
 {
 	vm_page_t m;
 	vm_paddr_t pa;
@@ -51,7 +52,7 @@ uma_small_alloc(uma_zone_t zone, int byt
 	*flags = UMA_SLAB_PRIV;
 	pflags = malloc2vm_flags(wait) | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED;
 	for (;;) {
-		m = vm_page_alloc(NULL, 0, pflags);
+		m = vm_page_alloc_domain(NULL, 0, domain, pflags);
 		if (m == NULL) {
 			if (wait & M_NOWAIT)
 				return (NULL);

Modified: projects/numa/sys/i386/i386/pmap.c
==============================================================================
--- projects/numa/sys/i386/i386/pmap.c	Tue Feb 18 23:22:54 2014	(r262198)
+++ projects/numa/sys/i386/i386/pmap.c	Wed Feb 19 00:35:27 2014	(r262199)
@@ -339,7 +339,8 @@ static pt_entry_t *pmap_pte_quick(pmap_t
 static void pmap_pte_release(pt_entry_t *pte);
 static int pmap_unuse_pt(pmap_t, vm_offset_t, struct spglist *);
 #ifdef PAE
-static void *pmap_pdpt_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait);
+static void *pmap_pdpt_allocf(uma_zone_t zone, int bytes, int domain,
+    u_int8_t *flags, int wait);
 #endif
 static void pmap_set_pg(void);
 
@@ -648,7 +649,8 @@ pmap_page_init(vm_page_t m)
 
 #ifdef PAE
 static void *
-pmap_pdpt_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait)
+pmap_pdpt_allocf(uma_zone_t zone, int bytes, int domain, u_int8_t *flags,
+    int wait)
 {
 
 	/* Inform UMA that this allocator uses kernel_map/object. */

Modified: projects/numa/sys/ia64/ia64/uma_machdep.c
==============================================================================
--- projects/numa/sys/ia64/ia64/uma_machdep.c	Tue Feb 18 23:22:54 2014	(r262198)
+++ projects/numa/sys/ia64/ia64/uma_machdep.c	Wed Feb 19 00:35:27 2014	(r262199)
@@ -40,7 +40,8 @@ __FBSDID("$FreeBSD$");
 #include <machine/vmparam.h>
 
 void *
-uma_small_alloc(uma_zone_t zone, int bytes, u_int8_t *flags, int wait)
+uma_small_alloc(uma_zone_t zone, int bytes, int domain, u_int8_t *flags,
+    int wait)
 {
 	void *va;
 	vm_page_t m;

Modified: projects/numa/sys/kern/kern_mbuf.c
==============================================================================
--- projects/numa/sys/kern/kern_mbuf.c	Tue Feb 18 23:22:54 2014	(r262198)
+++ projects/numa/sys/kern/kern_mbuf.c	Wed Feb 19 00:35:27 2014	(r262199)
@@ -284,7 +284,7 @@ static int	mb_zinit_pack(void *, int, in
 static void	mb_zfini_pack(void *, int);
 
 static void	mb_reclaim(void *);
-static void    *mbuf_jumbo_alloc(uma_zone_t, int, uint8_t *, int);
+static void    *mbuf_jumbo_alloc(uma_zone_t, int, int, uint8_t *, int);
 
 /* Ensure that MSIZE is a power of 2. */
 CTASSERT((((MSIZE - 1) ^ MSIZE) + 1) >> 1 == MSIZE);
@@ -389,7 +389,8 @@ SYSINIT(mbuf, SI_SUB_MBUF, SI_ORDER_FIRS
  * pages.
  */
 static void *
-mbuf_jumbo_alloc(uma_zone_t zone, int bytes, uint8_t *flags, int wait)
+mbuf_jumbo_alloc(uma_zone_t zone, int bytes, int domain, uint8_t *flags,
+    int wait)
 {
 
 	/* Inform UMA that this allocator uses kernel_map/object. */

Modified: projects/numa/sys/kern/subr_busdma_bufalloc.c
==============================================================================
--- projects/numa/sys/kern/subr_busdma_bufalloc.c	Tue Feb 18 23:22:54 2014	(r262198)
+++ projects/numa/sys/kern/subr_busdma_bufalloc.c	Wed Feb 19 00:35:27 2014	(r262199)
@@ -147,8 +147,8 @@ busdma_bufalloc_findzone(busdma_bufalloc
 }
 
 void *
-busdma_bufalloc_alloc_uncacheable(uma_zone_t zone, int size, u_int8_t *pflag,
-    int wait)
+busdma_bufalloc_alloc_uncacheable(uma_zone_t zone, int size, int domain,
+    u_int8_t *pflag, int wait)
 {
 #ifdef VM_MEMATTR_UNCACHEABLE
 

Modified: projects/numa/sys/kern/subr_vmem.c
==============================================================================
--- projects/numa/sys/kern/subr_vmem.c	Tue Feb 18 23:22:54 2014	(r262198)
+++ projects/numa/sys/kern/subr_vmem.c	Wed Feb 19 00:35:27 2014	(r262199)
@@ -495,7 +495,7 @@ bt_insfree(vmem_t *vm, bt_t *bt)
  * Import from the arena into the quantum cache in UMA.
  */
 static int
-qc_import(void *arg, void **store, int cnt, int flags)
+qc_import(void *arg, void **store, int cnt, int domain, int flags)
 {
 	qcache_t *qc;
 	vmem_addr_t addr;
@@ -608,7 +608,7 @@ static struct mtx_padalign vmem_bt_lock;
  * we are really out of KVA.
  */
 static void *
-vmem_bt_alloc(uma_zone_t zone, int bytes, uint8_t *pflag, int wait)
+vmem_bt_alloc(uma_zone_t zone, int bytes, int domain, uint8_t *pflag, int wait)
 {
 	vmem_addr_t addr;
 

Modified: projects/numa/sys/mips/mips/pmap.c
==============================================================================
--- projects/numa/sys/mips/mips/pmap.c	Tue Feb 18 23:22:54 2014	(r262198)
+++ projects/numa/sys/mips/mips/pmap.c	Wed Feb 19 00:35:27 2014	(r262199)
@@ -1047,7 +1047,7 @@ pmap_alloc_direct_page(unsigned int inde
 {
 	vm_page_t m;
 
-	m = vm_page_alloc_freelist(VM_FREELIST_DIRECT, req | VM_ALLOC_WIRED |
+	m = vm_page_alloc_freelist(0, VM_FREELIST_DIRECT, req | VM_ALLOC_WIRED |
 	    VM_ALLOC_ZERO);
 	if (m == NULL)
 		return (NULL);
@@ -1581,7 +1581,7 @@ retry:
 		}
 	}
 	/* No free items, allocate another chunk */
-	m = vm_page_alloc_freelist(VM_FREELIST_DIRECT, VM_ALLOC_NORMAL |
+	m = vm_page_alloc_freelist(0, VM_FREELIST_DIRECT, VM_ALLOC_NORMAL |
 	    VM_ALLOC_WIRED);
 	if (m == NULL) {
 		if (try) {

Modified: projects/numa/sys/mips/mips/uma_machdep.c
==============================================================================
--- projects/numa/sys/mips/mips/uma_machdep.c	Tue Feb 18 23:22:54 2014	(r262198)
+++ projects/numa/sys/mips/mips/uma_machdep.c	Wed Feb 19 00:35:27 2014	(r262199)
@@ -41,7 +41,8 @@ __FBSDID("$FreeBSD$");
 #include <machine/vmparam.h>
 
 void *
-uma_small_alloc(uma_zone_t zone, int bytes, u_int8_t *flags, int wait)
+uma_small_alloc(uma_zone_t zone, int bytes, int domain, u_int8_t *flags,
+    int wait)
 {
 	vm_paddr_t pa;
 	vm_page_t m;
@@ -52,7 +53,7 @@ uma_small_alloc(uma_zone_t zone, int byt
 	pflags = malloc2vm_flags(wait) | VM_ALLOC_WIRED;
 
 	for (;;) {
-		m = vm_page_alloc_freelist(VM_FREELIST_DIRECT, pflags);
+		m = vm_page_alloc_freelist(domain, VM_FREELIST_DIRECT, pflags);
 		if (m == NULL) {
 			if (wait & M_NOWAIT)
 				return (NULL);

Modified: projects/numa/sys/powerpc/aim/mmu_oea64.c
==============================================================================
--- projects/numa/sys/powerpc/aim/mmu_oea64.c	Tue Feb 18 23:22:54 2014	(r262198)
+++ projects/numa/sys/powerpc/aim/mmu_oea64.c	Wed Feb 19 00:35:27 2014	(r262199)
@@ -1446,7 +1446,8 @@ retry:
 static mmu_t installed_mmu;
 
 static void *
-moea64_uma_page_alloc(uma_zone_t zone, int bytes, u_int8_t *flags, int wait) 
+moea64_uma_page_alloc(uma_zone_t zone, int bytes, int domain, u_int8_t *flags,
+    int wait) 
 {
 	/*
 	 * This entire routine is a horrible hack to avoid bothering kmem

Modified: projects/numa/sys/powerpc/aim/slb.c
==============================================================================
--- projects/numa/sys/powerpc/aim/slb.c	Tue Feb 18 23:22:54 2014	(r262198)
+++ projects/numa/sys/powerpc/aim/slb.c	Wed Feb 19 00:35:27 2014	(r262199)
@@ -473,7 +473,8 @@ slb_insert_user(pmap_t pm, struct slb *s
 }
 
 static void *
-slb_uma_real_alloc(uma_zone_t zone, int bytes, u_int8_t *flags, int wait)
+slb_uma_real_alloc(uma_zone_t zone, int bytes, int domain, u_int8_t *flags,
+    int wait)
 {
 	static vm_offset_t realmax = 0;
 	void *va;

Modified: projects/numa/sys/powerpc/aim/uma_machdep.c
==============================================================================
--- projects/numa/sys/powerpc/aim/uma_machdep.c	Tue Feb 18 23:22:54 2014	(r262198)
+++ projects/numa/sys/powerpc/aim/uma_machdep.c	Wed Feb 19 00:35:27 2014	(r262199)
@@ -50,7 +50,8 @@ SYSCTL_INT(_hw, OID_AUTO, uma_mdpages, C
 	   "UMA MD pages in use");
 
 void *
-uma_small_alloc(uma_zone_t zone, int bytes, u_int8_t *flags, int wait)
+uma_small_alloc(uma_zone_t zone, int bytes, int domain, u_int8_t *flags,
+    int wait)
 {
 	void *va;
 	vm_page_t m;

Modified: projects/numa/sys/sparc64/sparc64/vm_machdep.c
==============================================================================
--- projects/numa/sys/sparc64/sparc64/vm_machdep.c	Tue Feb 18 23:22:54 2014	(r262198)
+++ projects/numa/sys/sparc64/sparc64/vm_machdep.c	Wed Feb 19 00:35:27 2014	(r262199)
@@ -502,7 +502,8 @@ swi_vm(void *v)
 }
 
 void *
-uma_small_alloc(uma_zone_t zone, int bytes, u_int8_t *flags, int wait)
+uma_small_alloc(uma_zone_t zone, int bytes, int domain, u_int8_t *flags,
+    int wait)
 {
 	vm_paddr_t pa;
 	vm_page_t m;

Modified: projects/numa/sys/sys/busdma_bufalloc.h
==============================================================================
--- projects/numa/sys/sys/busdma_bufalloc.h	Tue Feb 18 23:22:54 2014	(r262198)
+++ projects/numa/sys/sys/busdma_bufalloc.h	Wed Feb 19 00:35:27 2014	(r262199)
@@ -111,7 +111,7 @@ struct busdma_bufzone * busdma_bufalloc_
  * you can probably use these when you need uncacheable buffers.
  */
 void * busdma_bufalloc_alloc_uncacheable(uma_zone_t zone, int size, 
-    u_int8_t *pflag, int wait);
+    int domain, u_int8_t *pflag, int wait);
 void  busdma_bufalloc_free_uncacheable(void *item, int size, u_int8_t pflag);
 
 #endif	/* _MACHINE_BUSDMA_BUFALLOC_H_ */

Modified: projects/numa/sys/sys/proc.h
==============================================================================
--- projects/numa/sys/sys/proc.h	Tue Feb 18 23:22:54 2014	(r262198)
+++ projects/numa/sys/sys/proc.h	Wed Feb 19 00:35:27 2014	(r262199)
@@ -274,7 +274,6 @@ struct thread {
 	pid_t		td_dbg_forked;	/* (c) Child pid for debugger. */
 	u_int		td_vp_reserv;	/* (k) Count of reserved vnodes. */
 	int		td_no_sleeping;	/* (k) Sleeping disabled count. */
-	int		td_dom_rr_idx;	/* (k) RR Numa domain selection. */
 #define	td_endzero td_sigmask
 
 /* Copied during fork1() or create_thread(). */

Modified: projects/numa/sys/vm/uma.h
==============================================================================
--- projects/numa/sys/vm/uma.h	Tue Feb 18 23:22:54 2014	(r262198)
+++ projects/numa/sys/vm/uma.h	Wed Feb 19 00:35:27 2014	(r262199)
@@ -126,7 +126,8 @@ typedef void (*uma_fini)(void *mem, int 
 /*
  * Import new memory into a cache zone.
  */
-typedef int (*uma_import)(void *arg, void **store, int count, int flags);
+typedef int (*uma_import)(void *arg, void **store, int count, int domain,
+    int flags);
 
 /*
  * Free memory from a cache zone.
@@ -279,6 +280,11 @@ uma_zone_t uma_zcache_create(char *name,
 					 * Allocates mp_ncpus slabs sized to
 					 * sizeof(struct pcpu).
 					 */
+#define	UMA_ZONE_NUMA		0x10000	/*
+					 * Zone is NUMA aware.  Implements
+					 * a best effort first-touch
+					 * allocation policy.
+					 */
 
 /*
  * These flags are shared between the keg and zone.  In zones wishing to add
@@ -365,16 +371,12 @@ uma_zfree(uma_zone_t zone, void *item)
 }
 
 /*
- * XXX The rest of the prototypes in this header are h0h0 magic for the VM.
- * If you think you need to use it for a normal zone you're probably incorrect.
- */
-
-/*
  * Backend page supplier routines
  *
  * Arguments:
  *	zone  The zone that is requesting pages.
  *	size  The number of bytes being requested.
+ *      domain The NUMA domain we prefer for this allocation.
  *	pflag Flags for these memory pages, see below.
  *	wait  Indicates our willingness to block.
  *
@@ -382,7 +384,8 @@ uma_zfree(uma_zone_t zone, void *item)
  *	A pointer to the allocated memory or NULL on failure.
  */
 
-typedef void *(*uma_alloc)(uma_zone_t zone, int size, uint8_t *pflag, int wait);
+typedef void *(*uma_alloc)(uma_zone_t zone, int size, int domain,
+    uint8_t *pflag, int wait);
 
 /*
  * Backend page free routines
@@ -397,8 +400,6 @@ typedef void *(*uma_alloc)(uma_zone_t zo
  */
 typedef void (*uma_free)(void *item, int size, uint8_t pflag);
 
-
-
 /*
  * Sets up the uma allocator. (Called by vm_mem_init)
  *

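Usage sketch (illustrative, not from the commit): a zone opts into the
first-touch policy by passing the new flag at creation time; uma_zalloc() and
uma_zfree() are unchanged, and the domain is chosen internally from the
allocating CPU.  The zone name and item type here are hypothetical:

	/* Create a NUMA-aware zone; allocations prefer the local domain. */
	foo_zone = uma_zcreate("foodata", sizeof(struct foodata),
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NUMA);

	item = uma_zalloc(foo_zone, M_WAITOK);	/* filled from local domain */
	uma_zfree(foo_zone, item);
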
Modified: projects/numa/sys/vm/uma_core.c
==============================================================================
--- projects/numa/sys/vm/uma_core.c	Tue Feb 18 23:22:54 2014	(r262198)
+++ projects/numa/sys/vm/uma_core.c	Wed Feb 19 00:35:27 2014	(r262199)
@@ -84,6 +84,7 @@ __FBSDID("$FreeBSD$");
 #include <vm/vm_page.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_param.h>
+#include <vm/vm_phys.h>
 #include <vm/vm_map.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_extern.h>
@@ -226,11 +227,11 @@ enum zfreeskip { SKIP_NONE = 0, SKIP_DTO
 
 /* Prototypes.. */
 
-static void *noobj_alloc(uma_zone_t, int, uint8_t *, int);
-static void *page_alloc(uma_zone_t, int, uint8_t *, int);
-static void *startup_alloc(uma_zone_t, int, uint8_t *, int);
+static void *noobj_alloc(uma_zone_t, int, int, uint8_t *, int);
+static void *page_alloc(uma_zone_t, int, int, uint8_t *, int);
+static void *startup_alloc(uma_zone_t, int, int, uint8_t *, int);
 static void page_free(void *, int, uint8_t);
-static uma_slab_t keg_alloc_slab(uma_keg_t, uma_zone_t, int);
+static uma_slab_t keg_alloc_slab(uma_keg_t, uma_zone_t, int, int);
 static void cache_drain(uma_zone_t);
 static void bucket_drain(uma_zone_t, uma_bucket_t);
 static void bucket_cache_drain(uma_zone_t zone);
@@ -248,23 +249,23 @@ static int hash_expand(struct uma_hash *
 static void hash_free(struct uma_hash *hash);
 static void uma_timeout(void *);
 static void uma_startup3(void);
-static void *zone_alloc_item(uma_zone_t, void *, int);
+static void *zone_alloc_item(uma_zone_t, void *, int, int);
 static void zone_free_item(uma_zone_t, void *, void *, enum zfreeskip);
 static void bucket_enable(void);
 static void bucket_init(void);
 static uma_bucket_t bucket_alloc(uma_zone_t zone, void *, int);
 static void bucket_free(uma_zone_t zone, uma_bucket_t, void *);
 static void bucket_zone_drain(void);
-static uma_bucket_t zone_alloc_bucket(uma_zone_t zone, void *, int flags);
-static uma_slab_t zone_fetch_slab(uma_zone_t zone, uma_keg_t last, int flags);
-static uma_slab_t zone_fetch_slab_multi(uma_zone_t zone, uma_keg_t last, int flags);
+static uma_bucket_t zone_alloc_bucket(uma_zone_t, void *, int, int);
+static uma_slab_t zone_fetch_slab(uma_zone_t, uma_keg_t, int, int);
+static uma_slab_t zone_fetch_slab_multi(uma_zone_t, uma_keg_t, int, int);
 static void *slab_alloc_item(uma_keg_t keg, uma_slab_t slab);
 static void slab_free_item(uma_keg_t keg, uma_slab_t slab, void *item);
 static uma_keg_t uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit,
     uma_fini fini, int align, uint32_t flags);
-static int zone_import(uma_zone_t zone, void **bucket, int max, int flags);
-static void zone_release(uma_zone_t zone, void **bucket, int cnt);
-static void uma_zero_item(void *item, uma_zone_t zone);
+static int zone_import(uma_zone_t, void **, int, int, int);
+static void zone_release(uma_zone_t, void **, int);
+static void uma_zero_item(void *, uma_zone_t);
 
 void uma_print_zone(uma_zone_t);
 void uma_print_stats(void);
@@ -311,7 +312,7 @@ bucket_init(void)
 		size += sizeof(void *) * ubz->ubz_entries;
 		ubz->ubz_zone = uma_zcreate(ubz->ubz_name, size,
 		    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
-		    UMA_ZONE_MTXCLASS | UMA_ZFLAG_BUCKET);
+		    UMA_ZONE_MTXCLASS | UMA_ZFLAG_BUCKET | UMA_ZONE_NUMA);
 	}
 }
 
@@ -539,7 +540,7 @@ hash_alloc(struct uma_hash *hash)
 	} else {
 		alloc = sizeof(hash->uh_slab_hash[0]) * UMA_HASH_SIZE_INIT;
 		hash->uh_slab_hash = zone_alloc_item(hashzone, NULL,
-		    M_WAITOK);
+		    UMA_ANYDOMAIN, M_WAITOK);
 		hash->uh_hashsize = UMA_HASH_SIZE_INIT;
 	}
 	if (hash->uh_slab_hash) {
@@ -705,17 +706,23 @@ cache_drain_safe_cpu(uma_zone_t zone)
 {
 	uma_cache_t cache;
 	uma_bucket_t b1, b2;
+	int domain;
 
 	if (zone->uz_flags & UMA_ZFLAG_INTERNAL)
 		return;
 
 	b1 = b2 = NULL;
+
 	ZONE_LOCK(zone);
 	critical_enter();
+	if (zone->uz_flags & UMA_ZONE_NUMA)
+		domain = PCPU_GET(domain);
+	else
+		domain = 0;
 	cache = &zone->uz_cpu[curcpu];
 	if (cache->uc_allocbucket) {
 		if (cache->uc_allocbucket->ub_cnt != 0)
-			LIST_INSERT_HEAD(&zone->uz_buckets,
+			LIST_INSERT_HEAD(&zone->uz_domain[domain].uzd_buckets,
 			    cache->uc_allocbucket, ub_link);
 		else
 			b1 = cache->uc_allocbucket;
@@ -723,7 +730,7 @@ cache_drain_safe_cpu(uma_zone_t zone)
 	}
 	if (cache->uc_freebucket) {
 		if (cache->uc_freebucket->ub_cnt != 0)
-			LIST_INSERT_HEAD(&zone->uz_buckets,
+			LIST_INSERT_HEAD(&zone->uz_domain[domain].uzd_buckets,
 			    cache->uc_freebucket, ub_link);
 		else
 			b2 = cache->uc_freebucket;
@@ -778,18 +785,22 @@ cache_drain_safe(uma_zone_t zone)
 static void
 bucket_cache_drain(uma_zone_t zone)
 {
+	uma_zone_domain_t zdom;
 	uma_bucket_t bucket;
+	int i;
 
 	/*
-	 * Drain the bucket queues and free the buckets, we just keep two per
-	 * cpu (alloc/free).
+	 * Drain the bucket queues and free the buckets.
 	 */
-	while ((bucket = LIST_FIRST(&zone->uz_buckets)) != NULL) {
-		LIST_REMOVE(bucket, ub_link);
-		ZONE_UNLOCK(zone);
-		bucket_drain(zone, bucket);
-		bucket_free(zone, bucket, NULL);
-		ZONE_LOCK(zone);
+	for (i = 0; i < vm_ndomains; i++) {
+		zdom = &zone->uz_domain[i];
+		while ((bucket = LIST_FIRST(&zdom->uzd_buckets)) != NULL) {
+			LIST_REMOVE(bucket, ub_link);
+			ZONE_UNLOCK(zone);
+			bucket_drain(zone, bucket);
+			bucket_free(zone, bucket, NULL);
+			ZONE_LOCK(zone);
+		}
 	}
 
 	/*
@@ -834,8 +845,10 @@ static void
 keg_drain(uma_keg_t keg)
 {
 	struct slabhead freeslabs = { 0 };
+	uma_domain_t dom;
 	uma_slab_t slab;
 	uma_slab_t n;
+	int i;
 
 	/*
 	 * We don't want to take pages from statically allocated kegs at this
@@ -851,26 +864,30 @@ keg_drain(uma_keg_t keg)
 	if (keg->uk_free == 0)
 		goto finished;
 
-	slab = LIST_FIRST(&keg->uk_free_slab);
-	while (slab) {
-		n = LIST_NEXT(slab, us_link);
-
-		/* We have no where to free these to */
-		if (slab->us_flags & UMA_SLAB_BOOT) {
-			slab = n;
-			continue;
-		}
-
-		LIST_REMOVE(slab, us_link);
-		keg->uk_pages -= keg->uk_ppera;
-		keg->uk_free -= keg->uk_ipers;
+	for (i = 0; i < vm_ndomains; i++) {
+		dom = &keg->uk_domain[i];
+		slab = LIST_FIRST(&dom->ud_free_slab);
+		while (slab) {
+			n = LIST_NEXT(slab, us_link);
+
+			/* We have no where to free these to */
+			if (slab->us_flags & UMA_SLAB_BOOT) {
+				slab = n;
+				continue;
+			}
 
-		if (keg->uk_flags & UMA_ZONE_HASH)
-			UMA_HASH_REMOVE(&keg->uk_hash, slab, slab->us_data);
+			LIST_REMOVE(slab, us_link);
+			keg->uk_pages -= keg->uk_ppera;
+			keg->uk_free -= keg->uk_ipers;
+
+			if (keg->uk_flags & UMA_ZONE_HASH)
+				UMA_HASH_REMOVE(&keg->uk_hash, slab,
+				    slab->us_data);
 
-		SLIST_INSERT_HEAD(&freeslabs, slab, us_hlink);
+			SLIST_INSERT_HEAD(&freeslabs, slab, us_hlink);
 
-		slab = n;
+			slab = n;
+		}
 	}
 finished:
 	KEG_UNLOCK(keg);
@@ -933,7 +950,7 @@ zone_drain(uma_zone_t zone)
  *	caller specified M_NOWAIT.
  */
 static uma_slab_t
-keg_alloc_slab(uma_keg_t keg, uma_zone_t zone, int wait)
+keg_alloc_slab(uma_keg_t keg, uma_zone_t zone, int domain, int wait)
 {
 	uma_slabrefcnt_t slabref;
 	uma_alloc allocf;
@@ -942,6 +959,8 @@ keg_alloc_slab(uma_keg_t keg, uma_zone_t
 	uint8_t flags;
 	int i;
 
+	KASSERT(domain >= 0 && domain < vm_ndomains,
+	    ("keg_alloc_slab: domain %d out of range", domain));
 	mtx_assert(&keg->uk_lock, MA_OWNED);
 	slab = NULL;
 	mem = NULL;
@@ -953,7 +972,7 @@ keg_alloc_slab(uma_keg_t keg, uma_zone_t
 	KEG_UNLOCK(keg);
 
 	if (keg->uk_flags & UMA_ZONE_OFFPAGE) {
-		slab = zone_alloc_item(keg->uk_slabzone, NULL, wait);
+		slab = zone_alloc_item(keg->uk_slabzone, NULL, domain, wait);
 		if (slab == NULL)
 			goto out;
 	}
@@ -974,7 +993,7 @@ keg_alloc_slab(uma_keg_t keg, uma_zone_t
 		wait |= M_NODUMP;
 
 	/* zone is passed for legacy reasons. */
-	mem = allocf(zone, keg->uk_ppera * PAGE_SIZE, &flags, wait);
+	mem = allocf(zone, keg->uk_ppera * PAGE_SIZE, domain, &flags, wait);
 	if (mem == NULL) {
 		if (keg->uk_flags & UMA_ZONE_OFFPAGE)
 			zone_free_item(keg->uk_slabzone, slab, NULL, SKIP_NONE);
@@ -998,6 +1017,18 @@ keg_alloc_slab(uma_keg_t keg, uma_zone_t
 #ifdef INVARIANTS
 	BIT_ZERO(SLAB_SETSIZE, &slab->us_debugfree);
 #endif
+	/*
+	 * Set the domain based on the first page.  This may be incorrect
+	 * for multi-page allocations depending on the numa_policy specified.
+	 */
+#if MAXMEMDOM > 1
+	if ((flags & UMA_SLAB_BOOT) == 0)
+		slab->us_domain = vm_phys_domain(PHYS_TO_VM_PAGE(
+		    pmap_kextract((vm_offset_t)mem)));
+	else
+#endif
+		slab->us_domain = 0;
+
 	if (keg->uk_flags & UMA_ZONE_REFCNT) {
 		slabref = (uma_slabrefcnt_t)slab;
 		for (i = 0; i < keg->uk_ipers; i++)
@@ -1035,7 +1066,7 @@ out:
  * the VM is ready.
  */
 static void *
-startup_alloc(uma_zone_t zone, int bytes, uint8_t *pflag, int wait)
+startup_alloc(uma_zone_t zone, int bytes, int domain, uint8_t *pflag, int wait)
 {
 	uma_keg_t keg;
 	uma_slab_t tmps;
@@ -1080,7 +1111,7 @@ startup_alloc(uma_zone_t zone, int bytes
 #else
 	keg->uk_allocf = page_alloc;
 #endif
-	return keg->uk_allocf(zone, bytes, pflag, wait);
+	return keg->uk_allocf(zone, bytes, domain, pflag, wait);
 }
 
 /*
@@ -1095,7 +1126,7 @@ startup_alloc(uma_zone_t zone, int bytes
  *	NULL if M_NOWAIT is set.
  */
 static void *
-page_alloc(uma_zone_t zone, int bytes, uint8_t *pflag, int wait)
+page_alloc(uma_zone_t zone, int bytes, int domain, uint8_t *pflag, int wait)
 {
 	void *p;	/* Returned page */
 
@@ -1117,7 +1148,7 @@ page_alloc(uma_zone_t zone, int bytes, u
  *	NULL if M_NOWAIT is set.
  */
 static void *
-noobj_alloc(uma_zone_t zone, int bytes, uint8_t *flags, int wait)
+noobj_alloc(uma_zone_t zone, int bytes, int domain, uint8_t *flags, int wait)
 {
 	TAILQ_HEAD(, vm_page) alloctail;
 	u_long npages;
@@ -1130,7 +1161,7 @@ noobj_alloc(uma_zone_t zone, int bytes, 
 
 	npages = howmany(bytes, PAGE_SIZE);
 	while (npages > 0) {
-		p = vm_page_alloc(NULL, 0, VM_ALLOC_INTERRUPT |
+		p = vm_page_alloc_domain(NULL, 0, domain, VM_ALLOC_INTERRUPT |
 		    VM_ALLOC_WIRED | VM_ALLOC_NOOBJ);
 		if (p != NULL) {
 			/*
@@ -1410,6 +1441,7 @@ keg_ctor(void *mem, int size, void *udat
 	keg->uk_init = arg->uminit;
 	keg->uk_fini = arg->fini;
 	keg->uk_align = arg->align;
+	keg->uk_cursor = 0;
 	keg->uk_free = 0;
 	keg->uk_reserve = 0;
 	keg->uk_pages = 0;
@@ -1910,7 +1942,7 @@ uma_kcreate(uma_zone_t zone, size_t size
 	args.align = (align == UMA_ALIGN_CACHE) ? uma_align_cache : align;
 	args.flags = flags;
 	args.zone = zone;
-	return (zone_alloc_item(kegs, &args, M_WAITOK));
+	return (zone_alloc_item(kegs, &args, UMA_ANYDOMAIN, M_WAITOK));
 }
 
 /* See uma.h */
@@ -1942,7 +1974,7 @@ uma_zcreate(const char *name, size_t siz
 	args.flags = flags;
 	args.keg = NULL;
 
-	return (zone_alloc_item(zones, &args, M_WAITOK));
+	return (zone_alloc_item(zones, &args, UMA_ANYDOMAIN, M_WAITOK));
 }
 
 /* See uma.h */
@@ -1966,7 +1998,7 @@ uma_zsecond_create(char *name, uma_ctor 
 	args.keg = keg;
 
 	/* XXX Attaches only one keg of potentially many. */
-	return (zone_alloc_item(zones, &args, M_WAITOK));
+	return (zone_alloc_item(zones, &args, UMA_ANYDOMAIN, M_WAITOK));
 }
 
 /* See uma.h */
@@ -1990,7 +2022,7 @@ uma_zcache_create(char *name, int size, 
 	args.align = 0;
 	args.flags = flags;
 
-	return (zone_alloc_item(zones, &args, M_WAITOK));
+	return (zone_alloc_item(zones, &args, UMA_ANYDOMAIN, M_WAITOK));
 }
 
 static void
@@ -2091,10 +2123,12 @@ uma_zdestroy(uma_zone_t zone)
 void *
 uma_zalloc_arg(uma_zone_t zone, void *udata, int flags)
 {
-	void *item;
-	uma_cache_t cache;
+	uma_zone_domain_t zdom;
 	uma_bucket_t bucket;
+	uma_cache_t cache;
+	void *item;
 	int lockfail;
+	int domain;
 	int cpu;
 
 	/* This is the fast path allocation */
@@ -2196,8 +2230,10 @@ zalloc_start:
 		bucket_free(zone, bucket, udata);
 
 	/* Short-circuit for zones without buckets and low memory. */
-	if (zone->uz_count == 0 || bucketdisable)
+	if (zone->uz_count == 0 || bucketdisable) {
+		domain = UMA_ANYDOMAIN;
 		goto zalloc_item;
+	}
 
 	/*
 	 * Attempt to retrieve the item from the per-CPU cache has failed, so
@@ -2232,10 +2268,19 @@ zalloc_start:
 		goto zalloc_start;
 	}
 
+	/* Get the domain according to zone flags. */
+	if (zone->uz_flags & UMA_ZONE_NUMA) {
+		domain = PCPU_GET(domain);
+		zdom = &zone->uz_domain[domain];
+	} else {
+		domain = UMA_ANYDOMAIN; 
+		zdom = &zone->uz_domain[0];
+	}
+
 	/*
 	 * Check the zone's cache of buckets.
 	 */
-	if ((bucket = LIST_FIRST(&zone->uz_buckets)) != NULL) {
+	if ((bucket = LIST_FIRST(&zdom->uzd_buckets)) != NULL) {
 		KASSERT(bucket->ub_cnt != 0,
 		    ("uma_zalloc_arg: Returning an empty bucket."));
 
@@ -2260,7 +2305,7 @@ zalloc_start:
 	 * works we'll restart the allocation from the begining and it
 	 * will use the just filled bucket.
 	 */
-	bucket = zone_alloc_bucket(zone, udata, flags);
+	bucket = zone_alloc_bucket(zone, udata, domain, flags);
 	if (bucket != NULL) {
 		ZONE_LOCK(zone);
 		critical_enter();
@@ -2271,10 +2316,11 @@ zalloc_start:
 		 * initialized bucket to make this less likely or claim
 		 * the memory directly.
 		 */
-		if (cache->uc_allocbucket == NULL)
-			cache->uc_allocbucket = bucket;
+		if (cache->uc_allocbucket != NULL ||
+		    (domain != UMA_ANYDOMAIN && domain != PCPU_GET(domain)))
+			LIST_INSERT_HEAD(&zdom->uzd_buckets, bucket, ub_link);
 		else
-			LIST_INSERT_HEAD(&zone->uz_buckets, bucket, ub_link);
+			cache->uc_allocbucket = bucket;
 		ZONE_UNLOCK(zone);
 		goto zalloc_start;
 	}
@@ -2287,16 +2333,44 @@ zalloc_start:
 #endif
 
 zalloc_item:
-	item = zone_alloc_item(zone, udata, flags);
+	item = zone_alloc_item(zone, udata, domain, flags);
 
 	return (item);
 }
 
+/*
+ * Find a slab with some space.  Prefer slabs that are partially
+ * used over those that are totally full.  This helps to reduce
+ * fragmentation.
+ */
+static uma_slab_t
+keg_first_slab(uma_keg_t keg, int domain)
+{
+	uma_domain_t dom;
+	uma_slab_t slab;
+
+	KASSERT(domain >= 0 && domain < vm_ndomains,
+	    ("keg_first_slab: Domain %d out of range", domain));
+	dom = &keg->uk_domain[domain];
+	if (!LIST_EMPTY(&dom->ud_part_slab))
+		return LIST_FIRST(&dom->ud_part_slab);
+	if (LIST_EMPTY(&dom->ud_free_slab))
+		return (NULL);
+	slab = LIST_FIRST(&dom->ud_free_slab);
+	LIST_REMOVE(slab, us_link);
+	LIST_INSERT_HEAD(&dom->ud_part_slab, slab, us_link);
+
+	return (slab);
+}
+
 static uma_slab_t
-keg_fetch_slab(uma_keg_t keg, uma_zone_t zone, int flags)
+keg_fetch_slab(uma_keg_t keg, uma_zone_t zone, int rdomain, int flags)
 {
+	uma_domain_t dom;
 	uma_slab_t slab;
 	int reserve;
+	int domain;
+	int start;
 
 	mtx_assert(&keg->uk_lock, MA_OWNED);
 	slab = NULL;
@@ -2304,21 +2378,14 @@ keg_fetch_slab(uma_keg_t keg, uma_zone_t
 	if ((flags & M_USE_RESERVE) == 0)
 		reserve = keg->uk_reserve;
 
-	for (;;) {
-		/*
-		 * Find a slab with some space.  Prefer slabs that are partially
-		 * used over those that are totally full.  This helps to reduce
-		 * fragmentation.
-		 */
-		if (keg->uk_free > reserve) {
-			if (!LIST_EMPTY(&keg->uk_part_slab)) {
-				slab = LIST_FIRST(&keg->uk_part_slab);
-			} else {
-				slab = LIST_FIRST(&keg->uk_free_slab);
-				LIST_REMOVE(slab, us_link);
-				LIST_INSERT_HEAD(&keg->uk_part_slab, slab,
-				    us_link);
-			}
+	if (rdomain == UMA_ANYDOMAIN) {
+		keg->uk_cursor = (keg->uk_cursor + 1) % vm_ndomains;
+		domain = start = keg->uk_cursor;
+	} else
+		domain = start = rdomain;
+	do {
+		if (keg->uk_free > reserve &&
+		    (slab = keg_first_slab(keg, domain)) != NULL) {
 			MPASS(slab->us_keg == keg);
 			return (slab);
 		}
@@ -2345,7 +2412,7 @@ keg_fetch_slab(uma_keg_t keg, uma_zone_t
 			msleep(keg, &keg->uk_lock, PVM, "keglimit", 0);
 			continue;
 		}
-		slab = keg_alloc_slab(keg, zone, flags);
+		slab = keg_alloc_slab(keg, zone, domain, flags);
 		/*
 		 * If we got a slab here it's safe to mark it partially used
 		 * and return.  We assume that the caller is going to remove
@@ -2353,7 +2420,8 @@ keg_fetch_slab(uma_keg_t keg, uma_zone_t
 		 */
 		if (slab) {
 			MPASS(slab->us_keg == keg);
-			LIST_INSERT_HEAD(&keg->uk_part_slab, slab, us_link);
+			dom = &keg->uk_domain[slab->us_domain];
+			LIST_INSERT_HEAD(&dom->ud_part_slab, slab, us_link);
 			return (slab);
 		}
 		/*
@@ -2361,13 +2429,21 @@ keg_fetch_slab(uma_keg_t keg, uma_zone_t
 		 * could have while we were unlocked.  Check again before we
 		 * fail.
 		 */
-		flags |= M_NOVM;
-	}
-	return (slab);
+		if ((slab = keg_first_slab(keg, domain)) != NULL) {
+			MPASS(slab->us_keg == keg);
+			return (slab);
+		}
+		if (rdomain == UMA_ANYDOMAIN) {
+			keg->uk_cursor = (keg->uk_cursor + 1) % vm_ndomains;
+			domain = keg->uk_cursor;
+		}
+	} while (domain != start);
+
+	return (NULL);
 }
 
 static uma_slab_t
-zone_fetch_slab(uma_zone_t zone, uma_keg_t keg, int flags)
+zone_fetch_slab(uma_zone_t zone, uma_keg_t keg, int domain, int flags)
 {
 	uma_slab_t slab;
 
@@ -2377,7 +2453,7 @@ zone_fetch_slab(uma_zone_t zone, uma_keg
 	}
 
 	for (;;) {
-		slab = keg_fetch_slab(keg, zone, flags);
+		slab = keg_fetch_slab(keg, zone, domain, flags);
 		if (slab)
 			return (slab);
 		if (flags & (M_NOWAIT | M_NOVM))
@@ -2394,7 +2470,7 @@ zone_fetch_slab(uma_zone_t zone, uma_keg
  * The last pointer is used to seed the search.  It is not required.
  */
 static uma_slab_t
-zone_fetch_slab_multi(uma_zone_t zone, uma_keg_t last, int rflags)
+zone_fetch_slab_multi(uma_zone_t zone, uma_keg_t last, int domain, int rflags)
 {
 	uma_klink_t klink;
 	uma_slab_t slab;
@@ -2414,7 +2490,7 @@ zone_fetch_slab_multi(uma_zone_t zone, u
 	 * the search.
 	 */
 	if (last != NULL) {
-		slab = keg_fetch_slab(last, zone, flags);
+		slab = keg_fetch_slab(last, zone, domain, flags);
 		if (slab)
 			return (slab);
 		KEG_UNLOCK(last);
@@ -2435,7 +2511,7 @@ zone_fetch_slab_multi(uma_zone_t zone, u
 			keg = klink->kl_keg;
 			KEG_LOCK(keg);
 			if ((keg->uk_flags & UMA_ZFLAG_FULL) == 0) {
-				slab = keg_fetch_slab(keg, zone, flags);
+				slab = keg_fetch_slab(keg, zone, domain, flags);
 				if (slab)
 					return (slab);
 			}
@@ -2470,6 +2546,7 @@ zone_fetch_slab_multi(uma_zone_t zone, u
 static void *
 slab_alloc_item(uma_keg_t keg, uma_slab_t slab)
 {
+	uma_domain_t dom;
 	void *item;
 	uint8_t freei;
 
@@ -2485,14 +2562,15 @@ slab_alloc_item(uma_keg_t keg, uma_slab_
 	/* Move this slab to the full list */
 	if (slab->us_freecount == 0) {
 		LIST_REMOVE(slab, us_link);
-		LIST_INSERT_HEAD(&keg->uk_full_slab, slab, us_link);
+		dom = &keg->uk_domain[slab->us_domain];
+		LIST_INSERT_HEAD(&dom->ud_full_slab, slab, us_link);
 	}
 
 	return (item);
 }
 
 static int
-zone_import(uma_zone_t zone, void **bucket, int max, int flags)
+zone_import(uma_zone_t zone, void **bucket, int max, int domain, int flags)
 {
 	uma_slab_t slab;
 	uma_keg_t keg;
@@ -2502,13 +2580,25 @@ zone_import(uma_zone_t zone, void **buck
 	keg = NULL;
 	/* Try to keep the buckets totally full */
 	for (i = 0; i < max; ) {
-		if ((slab = zone->uz_slab(zone, keg, flags)) == NULL)
+		if ((slab = zone->uz_slab(zone, keg, domain, flags)) == NULL)
 			break;
 		keg = slab->us_keg;
 		while (slab->us_freecount && i < max) { 
 			bucket[i++] = slab_alloc_item(keg, slab);
 			if (keg->uk_free <= keg->uk_reserve)
 				break;
+#if MAXMEMDOM > 1
+			/*
+			 * If the zone is striped we pick a new slab for
+			 * every allocation.  Eliminating this conditional
+			 * will instead pick a new domain for each bucket
+			 * rather than stripe within each bucket.  The
+			 * current options produces more fragmentation but
+			 * yields better distribution.
+			 */
+			if (domain == UMA_ANYDOMAIN && vm_ndomains > 1)
+				break;
+#endif
 		}
 		/* Don't grab more than one slab at a time. */
 		flags &= ~M_WAITOK;
@@ -2521,7 +2611,7 @@ zone_import(uma_zone_t zone, void **buck
 }
 
 static uma_bucket_t
-zone_alloc_bucket(uma_zone_t zone, void *udata, int flags)
+zone_alloc_bucket(uma_zone_t zone, void *udata, int domain, int flags)
 {
 	uma_bucket_t bucket;
 	int max;
@@ -2533,7 +2623,7 @@ zone_alloc_bucket(uma_zone_t zone, void 
 
 	max = MIN(bucket->ub_entries, zone->uz_count);
 	bucket->ub_cnt = zone->uz_import(zone->uz_arg, bucket->ub_bucket,
-	    max, flags);
+	    max, domain, flags);
 
 	/*
 	 * Initialize the memory if necessary.
@@ -2583,7 +2673,7 @@ zone_alloc_bucket(uma_zone_t zone, void 
  */
 
 static void *
-zone_alloc_item(uma_zone_t zone, void *udata, int flags)
+zone_alloc_item(uma_zone_t zone, void *udata, int domain, int flags)
 {
 	void *item;
 
@@ -2592,7 +2682,7 @@ zone_alloc_item(uma_zone_t zone, void *u
 #ifdef UMA_DEBUG_ALLOC
 	printf("INTERNAL: Allocating one item from %s(%p)\n", zone->uz_name, zone);
 #endif
-	if (zone->uz_import(zone->uz_arg, &item, 1, flags) != 1)
+	if (zone->uz_import(zone->uz_arg, &item, 1, domain, flags) != 1)
 		goto fail;
 	atomic_add_long(&zone->uz_allocs, 1);
 
@@ -2633,7 +2723,9 @@ uma_zfree_arg(uma_zone_t zone, void *ite
 {
 	uma_cache_t cache;
 	uma_bucket_t bucket;
+	uma_zone_domain_t zdom;
 	int lockfail;
+	int domain;
 	int cpu;
 
 #ifdef UMA_DEBUG_ALLOC_1
@@ -2743,6 +2835,15 @@ zfree_start:
 	}
 	cache->uc_freebucket = NULL;
 
+	/* Get the domain according to zone flags. */
+	if (zone->uz_flags & UMA_ZONE_NUMA) {
+		domain = PCPU_GET(domain);
+		zdom = &zone->uz_domain[domain];
+	} else {
+		zdom = &zone->uz_domain[0];
+		domain = UMA_ANYDOMAIN;
+	}
+
 	/* Can we throw this on the zone full list? */
 	if (bucket != NULL) {
 #ifdef UMA_DEBUG_ALLOC
@@ -2751,7 +2852,7 @@ zfree_start:
 		/* ub_cnt is pointing to the last free item */
 		KASSERT(bucket->ub_cnt != 0,
 		    ("uma_zfree: Attempting to insert an empty bucket onto the full list.\n"));
-		LIST_INSERT_HEAD(&zone->uz_buckets, bucket, ub_link);
+		LIST_INSERT_HEAD(&zdom->uzd_buckets, bucket, ub_link);
 	}
 
 	/* We are no longer associated with this CPU. */
@@ -2773,7 +2874,8 @@ zfree_start:
 		critical_enter();
 		cpu = curcpu;
 		cache = &zone->uz_cpu[cpu];
-		if (cache->uc_freebucket == NULL) {
+		if (cache->uc_freebucket == NULL &&
+		    (domain == UMA_ANYDOMAIN || domain == PCPU_GET(domain))) {
 			cache->uc_freebucket = bucket;
 			goto zfree_start;
 		}
@@ -2798,18 +2900,20 @@ zfree_item:
 static void
 slab_free_item(uma_keg_t keg, uma_slab_t slab, void *item)
 {
+	uma_domain_t dom;
 	uint8_t freei;
 
 	mtx_assert(&keg->uk_lock, MA_OWNED);
 	MPASS(keg == slab->us_keg);
 
+	dom = &keg->uk_domain[slab->us_domain];
 	/* Do we need to remove from any lists? */
 	if (slab->us_freecount+1 == keg->uk_ipers) {

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
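
For readers following keg_fetch_slab() and zone_import() above: with
UMA_ANYDOMAIN the keg advances its uk_cursor round-robin over vm_ndomains and
sweeps each domain at most once before failing, while a fixed-domain request
restricts the search to that domain.  A standalone userland sketch of the
cursor walk (hypothetical names, for illustration only):

#include <stdio.h>

#define	ANYDOMAIN	(-1)

/* Hypothetical stand-ins for vm_ndomains and keg->uk_cursor. */
static int ndomains = 4;
static int cursor;

/*
 * Sweep every domain at most once, starting either at the requested
 * domain or at the next round-robin position, mirroring the loop in
 * keg_fetch_slab().
 */
static int
pick_domain(int rdomain, int (*have_slab)(int))
{
	int domain, start;

	if (rdomain == ANYDOMAIN) {
		cursor = (cursor + 1) % ndomains;
		domain = start = cursor;
	} else
		domain = start = rdomain;
	do {
		if (have_slab(domain))
			return (domain);
		if (rdomain == ANYDOMAIN) {
			cursor = (cursor + 1) % ndomains;
			domain = cursor;
		}
	} while (domain != start);
	return (-1);		/* Nothing available in any domain. */
}

static int
have_slab(int domain)
{

	return (domain == 2);	/* Pretend only domain 2 has free slabs. */
}

int
main(void)
{

	printf("UMA_ANYDOMAIN -> %d\n", pick_domain(ANYDOMAIN, have_slab));
	printf("domain 0      -> %d\n", pick_domain(0, have_slab));
	return (0);
}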

