svn commit: r305004 - in head/sys: amd64/amd64 amd64/include i386/i386
Bruce Evans
bde at FreeBSD.org
Mon Aug 29 13:07:23 UTC 2016
Author: bde
Date: Mon Aug 29 13:07:21 2016
New Revision: 305004
URL: https://svnweb.freebsd.org/changeset/base/305004
Log:
On amd64, declare sse2_pagezero() and start using it again, but only
for zeroing pages in idle where nontemporal writes are clearly best.
This is almost a no-op since zeroing in idle does nothing good
and is off by default. Also fix an END() statement forgotten in a
previous commit.
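For readers unfamiliar with nontemporal stores, here is a minimal
userland C sketch of the same idea. It is only illustrative: the
real routine is hand-written assembly in support.S, and the name
nt_pagezero() is hypothetical.

#include <emmintrin.h>	/* SSE2: _mm_stream_si64(), _mm_sfence() */

#define	PAGE_SIZE	4096	/* assumption: 4K x86 pages */

/*
 * Zero a page with nontemporal stores so the zeros go straight to
 * memory instead of evicting useful cache lines.  That is exactly
 * what is wanted when zeroing pages in idle that nothing will
 * touch soon.
 */
void
nt_pagezero(void *addr)
{
	long long *p = addr;
	long long *end = p + PAGE_SIZE / sizeof(*p);

	for (; p < end; p += 2) {
		_mm_stream_si64(p, 0);		/* movnti %rax,(%rdi) */
		_mm_stream_si64(p + 1, 0);	/* unrolled once */
	}
	_mm_sfence();	/* order the weakly-ordered stores on return */
}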
Align the loop in sse2_pagezero(). Since it writes directly to main
memory, the loop only has to keep up with memory bandwidth, so it
doesn't have to be written very carefully. Unrolling it was thought
to be useless or harmful, so it was not done on i386, but that was
too careless.
Timing for i386: the loop was not unrolled at all, and moved only 4
bytes/iteration. So on a 2GHz CPU, it needed to run at 2 cycles/
iteration to keep up with a memory speed of just 4GB/sec. But when
it crossed a 16-byte boundary, on old CPUs it ran at 3 cycles/
iteration, so it gave a maximum speed of 2.67GB/sec and couldn't even
keep up with PC3200 memory. Fix the alignment so that it keeps up
with 4GB/sec memory, and unroll it once to get nearer to 8GB/sec.
Further unrolling might be useless or harmful since it would prevent
the loop from fitting in 16 bytes. My test system with an old CPU and
old DDR1 only needs 5+ GB/sec. My test system with a new CPU and DDR3
doesn't need any changes to keep up with ~16GB/sec memory.
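A quick sanity check of those numbers in C (illustrative only; the
2GHz clock is the one assumed in the paragraph above):

#include <stdio.h>

int
main(void)
{
	double hz = 2e9;	/* assumed 2GHz CPU, as above */

	/* old loop: one 4-byte movnti per iteration */
	printf("4 B/iter @ 2 cycles: %.2f GB/s\n", 4 * hz / 2 / 1e9);
	printf("4 B/iter @ 3 cycles: %.2f GB/s\n", 4 * hz / 3 / 1e9);
	/* unrolled loop: two 4-byte movnti per iteration */
	printf("8 B/iter @ 2 cycles: %.2f GB/s\n", 8 * hz / 2 / 1e9);
	return (0);
}

This prints 4.00, 2.67 and 8.00 GB/s, matching the 4GB/sec,
2.67GB/sec and ~8GB/sec figures above.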
Timing for amd64: with 8-byte accesses and newer, faster CPUs it is
easy to reach 16GB/sec but not so easy to go much faster. The
alignment doesn't matter much unless the CPU is very old. The loop
was already unrolled 4 times, but needs 32 bytes and uses a fancy
method that doesn't work for 2-way unrolling in 16 bytes. Just
align it to 32 bytes.
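The corresponding arithmetic for amd64, again with an assumed 2GHz
clock for illustration: 4 movnti of 8 bytes each move 32
bytes/iteration, so the loop keeps up with 16GB/sec memory even at a
leisurely 4 cycles/iteration:

#include <stdio.h>

int
main(void)
{
	double hz = 2e9;	/* assumed clock, illustration only */
	double bytes = 4 * 8;	/* 4 movnti of 8 bytes per iteration */
	double target = 16e9;	/* ~16GB/sec memory from the log */

	/* cycle budget per iteration while still keeping up */
	printf("%.1f cycles/iter\n", bytes * hz / target);
	return (0);
}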
Modified:
head/sys/amd64/amd64/pmap.c
head/sys/amd64/amd64/support.S
head/sys/amd64/include/md_var.h
head/sys/i386/i386/support.s
Modified: head/sys/amd64/amd64/pmap.c
==============================================================================
--- head/sys/amd64/amd64/pmap.c Mon Aug 29 12:57:28 2016 (r305003)
+++ head/sys/amd64/amd64/pmap.c Mon Aug 29 13:07:21 2016 (r305004)
@@ -5192,7 +5192,7 @@ pmap_zero_page_idle(vm_page_t m)
{
vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
- pagezero((void *)va);
+ sse2_pagezero((void *)va);
}
/*
Modified: head/sys/amd64/amd64/support.S
==============================================================================
--- head/sys/amd64/amd64/support.S Mon Aug 29 12:57:28 2016 (r305003)
+++ head/sys/amd64/amd64/support.S Mon Aug 29 13:07:21 2016 (r305004)
@@ -78,6 +78,12 @@ ENTRY(sse2_pagezero)
movq $-PAGE_SIZE,%rdx
subq %rdx,%rdi
xorl %eax,%eax
+ jmp 1f
+ /*
+ * The loop takes 29 bytes. Ensure that it doesn't cross a 32-byte
+ * cache line.
+ */
+ .p2align 5,0x90
1:
movnti %rax,(%rdi,%rdx)
movnti %rax,8(%rdi,%rdx)
@@ -88,7 +94,7 @@ ENTRY(sse2_pagezero)
sfence
POP_FRAME_POINTER
ret
-END(pagezero)
+END(sse2_pagezero)
ENTRY(bcmp)
PUSH_FRAME_POINTER
Modified: head/sys/amd64/include/md_var.h
==============================================================================
--- head/sys/amd64/include/md_var.h Mon Aug 29 12:57:28 2016 (r305003)
+++ head/sys/amd64/include/md_var.h Mon Aug 29 13:07:21 2016 (r305004)
@@ -57,6 +57,7 @@ void gsbase_load_fault(void) __asm(__STR
void fpstate_drop(struct thread *td);
void pagezero(void *addr);
void setidt(int idx, alias_for_inthand_t *func, int typ, int dpl, int ist);
+void sse2_pagezero(void *addr);
struct savefpu *get_pcb_user_save_td(struct thread *td);
struct savefpu *get_pcb_user_save_pcb(struct pcb *pcb);
Modified: head/sys/i386/i386/support.s
==============================================================================
--- head/sys/i386/i386/support.s Mon Aug 29 12:57:28 2016 (r305003)
+++ head/sys/i386/i386/support.s Mon Aug 29 13:07:21 2016 (r305004)
@@ -69,9 +69,16 @@ ENTRY(sse2_pagezero)
movl %ecx,%eax
addl $4096,%eax
xor %ebx,%ebx
+ jmp 1f
+ /*
+ * The loop takes 14 bytes. Ensure that it doesn't cross a 16-byte
+ * cache line.
+ */
+ .p2align 4,0x90
1:
movnti %ebx,(%ecx)
- addl $4,%ecx
+ movnti %ebx,4(%ecx)
+ addl $8,%ecx
cmpl %ecx,%eax
jne 1b
sfence