Diffstat (limited to 'sys/i386/include/atomic.h')
-rw-r--r--  sys/i386/include/atomic.h  | 132
1 file changed, 83 insertions(+), 49 deletions(-)
diff --git a/sys/i386/include/atomic.h b/sys/i386/include/atomic.h
index 0156b5b..9d365bc 100644
--- a/sys/i386/include/atomic.h
+++ b/sys/i386/include/atomic.h
@@ -87,7 +87,7 @@ int atomic_cmpset_int(volatile u_int *dst, u_int expect, u_int src);
u_int atomic_fetchadd_int(volatile u_int *p, u_int v);
int atomic_testandset_int(volatile u_int *p, u_int v);
-#define ATOMIC_LOAD(TYPE, LOP) \
+#define ATOMIC_LOAD(TYPE) \
u_##TYPE atomic_load_acq_##TYPE(volatile u_##TYPE *p)
#define ATOMIC_STORE(TYPE) \
void atomic_store_rel_##TYPE(volatile u_##TYPE *p, u_##TYPE v)
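As a small illustration (not part of the patch), the declaration form of the revised macros expands to the following prototypes for TYPE = int:

u_int	atomic_load_acq_int(volatile u_int *p);
void	atomic_store_rel_int(volatile u_int *p, u_int v);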
@@ -228,53 +228,87 @@ atomic_testandset_int(volatile u_int *p, u_int v)
* We assume that a = b will do atomic loads and stores. Due to the
* IA32 memory model, a simple store guarantees release semantics.
*
- * However, loads may pass stores, so for atomic_load_acq we have to
- * ensure a Store/Load barrier to do the load in SMP kernels. We use
- * "lock cmpxchg" as recommended by the AMD Software Optimization
- * Guide, and not mfence. For UP kernels, however, the cache of the
- * single processor is always consistent, so we only need to take care
- * of the compiler.
+ * However, a load may pass a store if they are performed on distinct
+ * addresses, so for atomic_load_acq we introduce a Store/Load barrier
+ * before the load in SMP kernels. We use "lock addl $0,mem", as
+ * recommended by the AMD Software Optimization Guide, and not mfence.
+ * In the kernel, we use a private per-cpu cache line as the target
+ * for the locked addition, to avoid introducing false data
+ * dependencies. In userspace, a word at the top of the stack is
+ * utilized.
+ *
+ * For UP kernels, however, the memory of the single processor is
+ * always consistent, so we only need to stop the compiler from
+ * reordering accesses in a way that violates the semantics of acquire
+ * and release.
*/
-#define ATOMIC_STORE(TYPE) \
-static __inline void \
-atomic_store_rel_##TYPE(volatile u_##TYPE *p, u_##TYPE v)\
-{ \
- __compiler_membar(); \
- *p = v; \
-} \
-struct __hack
+#if defined(_KERNEL)
-#if defined(_KERNEL) && !defined(SMP)
+/*
+ * OFFSETOF_MONITORBUF == __pcpu_offset(pc_monitorbuf).
+ *
+ * The open-coded number is used instead of the symbolic expression to
+ * avoid a dependency on sys/pcpu.h in machine/atomic.h consumers.
+ * An assertion in i386/vm_machdep.c ensures that the value is correct.
+ */
+#define OFFSETOF_MONITORBUF 0x180
-#define ATOMIC_LOAD(TYPE, LOP) \
-static __inline u_##TYPE \
-atomic_load_acq_##TYPE(volatile u_##TYPE *p) \
-{ \
- u_##TYPE tmp; \
- \
- tmp = *p; \
- __compiler_membar(); \
- return (tmp); \
-} \
-struct __hack
+#if defined(SMP)
+static __inline void
+__storeload_barrier(void)
+{
-#else /* !(_KERNEL && !SMP) */
+ __asm __volatile("lock; addl $0,%%fs:%0"
+ : "+m" (*(u_int *)OFFSETOF_MONITORBUF) : : "memory", "cc");
+}
+#else /* _KERNEL && UP */
+static __inline void
+__storeload_barrier(void)
+{
-#define ATOMIC_LOAD(TYPE, LOP) \
-static __inline u_##TYPE \
-atomic_load_acq_##TYPE(volatile u_##TYPE *p) \
-{ \
- u_##TYPE res; \
- \
- __asm __volatile(MPLOCKED LOP \
- : "=a" (res), /* 0 */ \
- "+m" (*p) /* 1 */ \
- : : "memory", "cc"); \
- return (res); \
-} \
+ __compiler_membar();
+}
+#endif /* SMP */
+#else /* !_KERNEL */
+static __inline void
+__storeload_barrier(void)
+{
+
+ __asm __volatile("lock; addl $0,(%%esp)" : : : "memory", "cc");
+}
+#endif /* _KERNEL */
+
+/*
+ * C11-standard acq/rel semantics only apply when the variable in the
+ * call is the same for acq as it is for rel. However, our previous
+ * (x86) implementations provided much stronger ordering than required
+ * (essentially what is called seq_cst order in C11). This
+ * implementation provides the historical strong ordering since some
+ * callers depend on it.
+ */
+
+#define ATOMIC_LOAD(TYPE) \
+static __inline u_##TYPE \
+atomic_load_acq_##TYPE(volatile u_##TYPE *p) \
+{ \
+ u_##TYPE res; \
+ \
+ __storeload_barrier(); \
+ res = *p; \
+ __compiler_membar(); \
+ return (res); \
+} \
struct __hack
-#endif /* _KERNEL && !SMP */
+#define ATOMIC_STORE(TYPE) \
+static __inline void \
+atomic_store_rel_##TYPE(volatile u_##TYPE *p, u_##TYPE v) \
+{ \
+ \
+ __compiler_membar(); \
+ *p = v; \
+} \
+struct __hack
#ifdef _KERNEL
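Taken together, the new macros expand (in the !_KERNEL case, for TYPE = int) to roughly the self-contained sketch below. This is an illustration of the construction described in the comments above, not a verbatim copy of the header; __compiler_membar() is assumed to be the usual empty asm with a "memory" clobber provided by <sys/cdefs.h>, which <sys/types.h> pulls in.

/*
 * Sketch only: roughly what ATOMIC_LOAD(int)/ATOMIC_STORE(int) produce
 * for userspace after this change.
 */
#include <sys/types.h>

static __inline void
__storeload_barrier(void)
{

	/* Locked RMW on a stack word: a full (Store/Load) fence on x86. */
	__asm __volatile("lock; addl $0,(%%esp)" : : : "memory", "cc");
}

static __inline u_int
atomic_load_acq_int(volatile u_int *p)
{
	u_int res;

	__storeload_barrier();		/* earlier stores cannot pass the load */
	res = *p;
	__compiler_membar();		/* later accesses stay after the load */
	return (res);
}

static __inline void
atomic_store_rel_int(volatile u_int *p, u_int v)
{

	__compiler_membar();		/* earlier accesses stay before the store */
	*p = v;				/* a plain store is release on x86 */
}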
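Separately, the OFFSETOF_MONITORBUF comment above refers to an assertion in i386/vm_machdep.c. A sketch of the kind of compile-time check meant (an assumption about its spelling, not the actual vm_machdep.c code) could look like this:

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/pcpu.h>
#include <machine/atomic.h>

/* Verify that the open-coded 0x180 is really the pc_monitorbuf offset. */
CTASSERT(OFFSETOF_MONITORBUF == __offsetof(struct pcpu, pc_monitorbuf));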
@@ -511,19 +545,19 @@ ATOMIC_ASM(clear, long, "andl %1,%0", "ir", ~v);
ATOMIC_ASM(add, long, "addl %1,%0", "ir", v);
ATOMIC_ASM(subtract, long, "subl %1,%0", "ir", v);
-ATOMIC_LOAD(char, "cmpxchgb %b0,%1");
-ATOMIC_LOAD(short, "cmpxchgw %w0,%1");
-ATOMIC_LOAD(int, "cmpxchgl %0,%1");
-ATOMIC_LOAD(long, "cmpxchgl %0,%1");
+#define ATOMIC_LOADSTORE(TYPE) \
+ ATOMIC_LOAD(TYPE); \
+ ATOMIC_STORE(TYPE)
-ATOMIC_STORE(char);
-ATOMIC_STORE(short);
-ATOMIC_STORE(int);
-ATOMIC_STORE(long);
+ATOMIC_LOADSTORE(char);
+ATOMIC_LOADSTORE(short);
+ATOMIC_LOADSTORE(int);
+ATOMIC_LOADSTORE(long);
#undef ATOMIC_ASM
#undef ATOMIC_LOAD
#undef ATOMIC_STORE
+#undef ATOMIC_LOADSTORE
#ifndef WANT_FUNCTIONS
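As a usage note, the acquire/release pair generated above is what lets ordinary data be published safely between CPUs. The following hypothetical producer/consumer example (the names data, ready, produce and consume are made up for illustration and are not part of the patch) shows the intended pairing:

#include <sys/types.h>
#include <machine/atomic.h>

static u_int data;
static volatile u_int ready;

/*
 * Producer: the release store ensures the write to data is visible
 * before the write to ready.
 */
static void
produce(u_int value)
{

	data = value;
	atomic_store_rel_int(&ready, 1);
}

/*
 * Consumer: the acquire load ensures the read of data happens after
 * ready is observed set; the Store/Load barrier added by this change
 * also keeps the load from passing the consumer's own earlier stores.
 */
static u_int
consume(void)
{

	while (atomic_load_acq_int(&ready) == 0)
		;	/* spin until the producer publishes */
	return (data);
}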