path: root/sys/amd64/include/atomic.h
author		kib <kib@FreeBSD.org>	2015-06-28 05:04:08 +0000
committer	kib <kib@FreeBSD.org>	2015-06-28 05:04:08 +0000
commit		6279b7c930802b0ec654cf9d6c051c6a919d7bd0 (patch)
tree		2e41a3d37eee6a7b024d2845a13f9a891c21c2f7 /sys/amd64/include/atomic.h
parent		9c6d6e9137dfbc13f2677060c04dad121320aaaf (diff)
Remove an unneeded data dependency, currently imposed by atomic_load_acq(9) on its source, for x86.

Right now, atomic_load_acq() on x86 is sequentially consistent with the other atomics; the code ensures this by issuing a Store/Load barrier in the form of a locked nop performed on the source. Provide a separate primitive, __storeload_barrier(), implemented as a locked nop on a cpu-private variable, and place __storeload_barrier() before the load. This keeps the seq_cst semantic but avoids introducing a false dependency on the non-modification of the source for its later use.

Note that the seq_cst property of x86 atomic_load_acq() is not documented and is not provided by the atomics implementations on other architectures, although some kernel code relies on the behaviour. This commit does not intend to change that.

Reviewed by:	alc
Discussed with:	bde
Tested by:	pho
Sponsored by:	The FreeBSD Foundation
MFC after:	2 weeks
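To make the mechanism concrete, the following is a minimal userspace-style sketch, assuming GCC/Clang inline assembly on amd64, of the two strategies the message contrasts. The names (scratch, load_acq_old, load_acq_new) are invented for the illustration; the kernel uses a per-CPU cache line rather than a global word.

/* barrier_sketch.c: cc -O2 -c barrier_sketch.c (amd64 only) */
static unsigned int scratch;	/* stand-in for the per-CPU monitor buffer */

/*
 * Old approach: the locked read-modify-write runs on the loaded location
 * itself, so the barrier also ties the load to a write of *p.
 */
static inline unsigned int
load_acq_old(volatile unsigned int *p)
{
	unsigned int res;

	__asm__ __volatile__("lock; cmpxchgl %0,%1"
	    : "=a" (res), "+m" (*p) : : "memory", "cc");
	return (res);
}

/*
 * New approach: the locked no-op targets a private word and serves purely
 * as a Store/Load barrier; the value is then fetched with a plain load.
 */
static inline unsigned int
load_acq_new(volatile unsigned int *p)
{
	unsigned int res;

	__asm__ __volatile__("lock; addl $0,%0"
	    : "+m" (scratch) : : "memory", "cc");
	res = *p;
	__asm__ __volatile__("" : : : "memory");	/* compiler barrier */
	return (res);
}

Either version returns the current value of *p while ordering the load after earlier stores; only the new one leaves *p untouched, so later uses of the variable do not depend on the barrier's locked write.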
Diffstat (limited to 'sys/amd64/include/atomic.h')
-rw-r--r--	sys/amd64/include/atomic.h	125
1 file changed, 75 insertions, 50 deletions
diff --git a/sys/amd64/include/atomic.h b/sys/amd64/include/atomic.h
index 9110dc5..dceb3dc 100644
--- a/sys/amd64/include/atomic.h
+++ b/sys/amd64/include/atomic.h
@@ -85,7 +85,7 @@ u_long atomic_fetchadd_long(volatile u_long *p, u_long v);
int atomic_testandset_int(volatile u_int *p, u_int v);
int atomic_testandset_long(volatile u_long *p, u_int v);
-#define ATOMIC_LOAD(TYPE, LOP) \
+#define ATOMIC_LOAD(TYPE) \
u_##TYPE atomic_load_acq_##TYPE(volatile u_##TYPE *p)
#define ATOMIC_STORE(TYPE) \
void atomic_store_rel_##TYPE(volatile u_##TYPE *p, u_##TYPE v)
@@ -245,53 +245,79 @@ atomic_testandset_long(volatile u_long *p, u_int v)
* We assume that a = b will do atomic loads and stores. Due to the
* IA32 memory model, a simple store guarantees release semantics.
*
- * However, loads may pass stores, so for atomic_load_acq we have to
- * ensure a Store/Load barrier to do the load in SMP kernels. We use
- * "lock cmpxchg" as recommended by the AMD Software Optimization
- * Guide, and not mfence. For UP kernels, however, the cache of the
- * single processor is always consistent, so we only need to take care
- * of the compiler.
+ * However, a load may pass a store if they are performed on distinct
+ * addresses, so for atomic_load_acq we introduce a Store/Load barrier
+ * before the load in SMP kernels. We use "lock addl $0,mem", as
+ * recommended by the AMD Software Optimization Guide, and not mfence.
+ * In the kernel, we use a private per-cpu cache line as the target
+ * for the locked addition, to avoid introducing false data
+ * dependencies. In userspace, a word in the red zone on the stack
+ * (-8(%rsp)) is utilized.
+ *
+ * For UP kernels, however, the memory of the single processor is
+ * always consistent, so we only need to stop the compiler from
+ * reordering accesses in a way that violates the semantics of acquire
+ * and release.
*/
-#define ATOMIC_STORE(TYPE) \
-static __inline void \
-atomic_store_rel_##TYPE(volatile u_##TYPE *p, u_##TYPE v)\
-{ \
- __compiler_membar(); \
- *p = v; \
-} \
-struct __hack
-#if defined(_KERNEL) && !defined(SMP)
+#if defined(_KERNEL)
-#define ATOMIC_LOAD(TYPE, LOP) \
-static __inline u_##TYPE \
-atomic_load_acq_##TYPE(volatile u_##TYPE *p) \
-{ \
- u_##TYPE tmp; \
- \
- tmp = *p; \
- __compiler_membar(); \
- return (tmp); \
-} \
-struct __hack
+/*
+ * OFFSETOF_MONITORBUF == __pcpu_offset(pc_monitorbuf).
+ *
+ * The open-coded number is used instead of the symbolic expression to
+ * avoid a dependency on sys/pcpu.h in machine/atomic.h consumers.
+ * An assertion in amd64/vm_machdep.c ensures that the value is correct.
+ */
+#define OFFSETOF_MONITORBUF 0x180
+
+#if defined(SMP)
+static __inline void
+__storeload_barrier(void)
+{
-#else /* !(_KERNEL && !SMP) */
+ __asm __volatile("lock; addl $0,%%gs:%0"
+ : "+m" (*(u_int *)OFFSETOF_MONITORBUF) : : "memory", "cc");
+}
+#else /* _KERNEL && UP */
+static __inline void
+__storeload_barrier(void)
+{
-#define ATOMIC_LOAD(TYPE, LOP) \
-static __inline u_##TYPE \
-atomic_load_acq_##TYPE(volatile u_##TYPE *p) \
-{ \
- u_##TYPE res; \
- \
- __asm __volatile(MPLOCKED LOP \
- : "=a" (res), /* 0 */ \
- "+m" (*p) /* 1 */ \
- : : "memory", "cc"); \
- return (res); \
-} \
+ __compiler_membar();
+}
+#endif /* SMP */
+#else /* !_KERNEL */
+static __inline void
+__storeload_barrier(void)
+{
+
+ __asm __volatile("lock; addl $0,-8(%%rsp)" : : : "memory", "cc");
+}
+#endif /* _KERNEL*/
+
+#define ATOMIC_LOAD(TYPE) \
+static __inline u_##TYPE \
+atomic_load_acq_##TYPE(volatile u_##TYPE *p) \
+{ \
+ u_##TYPE res; \
+ \
+ __storeload_barrier(); \
+ res = *p; \
+ __compiler_membar(); \
+ return (res); \
+} \
struct __hack
-#endif /* _KERNEL && !SMP */
+#define ATOMIC_STORE(TYPE) \
+static __inline void \
+atomic_store_rel_##TYPE(volatile u_##TYPE *p, u_##TYPE v) \
+{ \
+ \
+ __compiler_membar(); \
+ *p = v; \
+} \
+struct __hack
#endif /* KLD_MODULE || !__GNUCLIKE_ASM */
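For readers tracing the hunk above, this is roughly what the new macros expand to for the int type in an SMP kernel build; the functions below are reconstructed by hand from the diff, not taken from compiler output.

static __inline u_int
atomic_load_acq_int(volatile u_int *p)
{
	u_int res;

	__storeload_barrier();	/* lock addl $0 on the per-CPU monitor buffer */
	res = *p;
	__compiler_membar();
	return (res);
}

static __inline void
atomic_store_rel_int(volatile u_int *p, u_int v)
{

	__compiler_membar();
	*p = v;			/* a plain store has release semantics on x86 */
}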
@@ -315,20 +341,19 @@ ATOMIC_ASM(clear, long, "andq %1,%0", "ir", ~v);
ATOMIC_ASM(add, long, "addq %1,%0", "ir", v);
ATOMIC_ASM(subtract, long, "subq %1,%0", "ir", v);
-ATOMIC_LOAD(char, "cmpxchgb %b0,%1");
-ATOMIC_LOAD(short, "cmpxchgw %w0,%1");
-ATOMIC_LOAD(int, "cmpxchgl %0,%1");
-ATOMIC_LOAD(long, "cmpxchgq %0,%1");
+#define ATOMIC_LOADSTORE(TYPE) \
+ ATOMIC_LOAD(TYPE); \
+ ATOMIC_STORE(TYPE)
-ATOMIC_STORE(char);
-ATOMIC_STORE(short);
-ATOMIC_STORE(int);
-ATOMIC_STORE(long);
+ATOMIC_LOADSTORE(char);
+ATOMIC_LOADSTORE(short);
+ATOMIC_LOADSTORE(int);
+ATOMIC_LOADSTORE(long);
#undef ATOMIC_ASM
#undef ATOMIC_LOAD
#undef ATOMIC_STORE
-
+#undef ATOMIC_LOADSTORE
#ifndef WANT_FUNCTIONS
/* Read the current value and store a new value in the destination. */
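As a usage sketch, and assuming a FreeBSD/amd64 userspace build where machine/atomic.h provides these primitives (the !_KERNEL path above), the acquire/release pair orders a simple hand-off; the file name and the payload/ready variables are invented for the example.

/* handoff.c: cc -O2 -pthread handoff.c */
#include <sys/types.h>
#include <machine/atomic.h>

#include <pthread.h>
#include <stdio.h>

static volatile u_int ready;
static int payload;

static void *
producer(void *arg)
{
	(void)arg;
	payload = 42;			/* ordinary store */
	atomic_store_rel_int(&ready, 1);/* release: payload visible before flag */
	return (NULL);
}

static void *
consumer(void *arg)
{
	(void)arg;
	while (atomic_load_acq_int(&ready) == 0)
		;			/* acquire: the payload read is not hoisted */
	printf("payload = %d\n", payload);
	return (NULL);
}

int
main(void)
{
	pthread_t p, c;

	pthread_create(&c, NULL, consumer, NULL);
	pthread_create(&p, NULL, producer, NULL);
	pthread_join(p, NULL);
	pthread_join(c, NULL);
	return (0);
}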