svn commit: r230673 - stable/9/sys/sparc64/include

Marius Strobl marius at FreeBSD.org
Sat Jan 28 23:24:03 UTC 2012


Author: marius
Date: Sat Jan 28 23:24:03 2012
New Revision: 230673
URL: http://svn.freebsd.org/changeset/base/230673

Log:
  MFC: r225889, r228222
  
  In total store order, which we use for running the kernel and all of the
  userland, atomic operations behave as if they were followed by a CPU
  memory barrier, so there's no need to include one in the acquire variants
  of atomic(9); compiler memory barriers are sufficient to satisfy the
  requirements of atomic(9). Removing the CPU memory barriers results in a
  small performance improvement, specifically one sufficient to compensate
  for the performance loss seen in the worldstone benchmark when using
  SCHED_ULE instead of SCHED_4BSD.
  This change is inspired by Linux, which did the equivalent thing even
  more radically some time ago.
  Thanks go to Peter Jeremy for additional testing.
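  [Editor's note: a minimal sketch of the idea above, not the FreeBSD macros
  themselves. Under total store order the hardware already orders a load
  against later loads and stores, so an acquire load only needs a compiler
  barrier to stop the compiler from reordering; the names here are
  hypothetical.]

```c
#include <stdint.h>

/* Compiler-only barrier: the empty asm with a "memory" clobber forbids
 * the compiler from moving memory accesses across it, but emits no
 * instruction. */
#define	compiler_barrier()	__asm __volatile("" : : : "memory")

/* Acquire load sketch for a TSO machine: the CPU provides the ordering,
 * the barrier only constrains the compiler. */
static inline uint32_t
load_acq_32(volatile uint32_t *p)
{
	uint32_t v;

	v = *p;			/* the load itself */
	compiler_barrier();	/* no membar needed under TSO */
	return (v);
}
```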

Modified:
  stable/9/sys/sparc64/include/atomic.h
Directory Properties:
  stable/9/sys/   (props changed)
  stable/9/sys/amd64/include/xen/   (props changed)
  stable/9/sys/boot/   (props changed)
  stable/9/sys/boot/i386/efi/   (props changed)
  stable/9/sys/boot/ia64/efi/   (props changed)
  stable/9/sys/boot/ia64/ski/   (props changed)
  stable/9/sys/boot/powerpc/boot1.chrp/   (props changed)
  stable/9/sys/boot/powerpc/ofw/   (props changed)
  stable/9/sys/cddl/contrib/opensolaris/   (props changed)
  stable/9/sys/conf/   (props changed)
  stable/9/sys/contrib/dev/acpica/   (props changed)
  stable/9/sys/contrib/octeon-sdk/   (props changed)
  stable/9/sys/contrib/pf/   (props changed)
  stable/9/sys/contrib/x86emu/   (props changed)

Modified: stable/9/sys/sparc64/include/atomic.h
==============================================================================
--- stable/9/sys/sparc64/include/atomic.h	Sat Jan 28 23:18:02 2012	(r230672)
+++ stable/9/sys/sparc64/include/atomic.h	Sat Jan 28 23:24:03 2012	(r230673)
@@ -74,12 +74,16 @@
  *
  * the return value of cas is used to avoid the extra reload.
  *
- * The memory barriers provided by the acq and rel variants are intended
- * to be sufficient for use of relaxed memory ordering.  Due to the
- * suggested assembly syntax of the membar operands containing a #
- * character, they cannot be used in macros.  The cmask and mmask bits
+ * We only include a memory barrier in the rel variants as in total store
+ * order which we use for running the kernel and all of the userland atomic
+ * loads and stores behave as if they were followed by a membar with a mask
+ * of #LoadLoad | #LoadStore | #StoreStore.  In order to be also sufficient
+ * for use of relaxed memory ordering, the atomic_cas() in the acq variants
+ * additionally would have to be followed by a membar #LoadLoad | #LoadStore.
+ * Due to the suggested assembly syntax of the membar operands containing a
+ * # character, they cannot be used in macros.  The cmask and mmask bits thus
  * are hard coded in machine/cpufunc.h and used here through macros.
- * Hopefully sun will choose not to change the bit numbers.
+ * Hopefully the bit numbers won't change in the future.
  */
 
 #define	itype(sz)	uint ## sz ## _t
@@ -93,7 +97,7 @@
 #define	atomic_cas_acq(p, e, s, sz) ({					\
 	itype(sz) v;							\
 	v = atomic_cas(p, e, s, sz);					\
-	membar(LoadLoad | LoadStore);					\
+	__asm __volatile("" : : : "memory");				\
 	v;								\
 })
 
@@ -118,7 +122,7 @@
 #define	atomic_op_acq(p, op, v, sz) ({					\
 	itype(sz) t;							\
 	t = atomic_op(p, op, v, sz);					\
-	membar(LoadLoad | LoadStore);					\
+	__asm __volatile("" : : : "memory");				\
 	t;								\
 })
 
@@ -135,7 +139,7 @@
 #define	atomic_load_acq(p, sz) ({					\
 	itype(sz) v;							\
 	v = atomic_load(p, sz);						\
-	membar(LoadLoad | LoadStore);					\
+	__asm __volatile("" : : : "memory");				\
 	v;								\
 })
 


More information about the svn-src-stable-9 mailing list