summaryrefslogtreecommitdiffstats
path: root/arch/sparc64/lib
diff options
context:
space:
mode:
authorDavid S. Miller <davem@davemloft.net>2005-06-27 15:42:04 -0700
committerDavid S. Miller <davem@davemloft.net>2005-06-27 15:42:04 -0700
commitb445e26cbf784cdba10f2b6c3e2cd3ee7bab360a (patch)
tree8c8c377a7e5cbf608d730ec45e091e4f2b826a82 /arch/sparc64/lib
parent020f46a39eb7b99a575b9f4d105fce2b142acdf1 (diff)
downloadop-kernel-dev-b445e26cbf784cdba10f2b6c3e2cd3ee7bab360a.zip
op-kernel-dev-b445e26cbf784cdba10f2b6c3e2cd3ee7bab360a.tar.gz
[SPARC64]: Avoid membar instructions in delay slots.
In particular, avoid membar instructions in the delay slot of a jmpl instruction. UltraSPARC-I, II, IIi, and IIe have a bug, documented in the UltraSPARC-IIi User's Manual, Appendix K, Erratum 51 The long and short of it is that if the IMU unit misses on a branch or jmpl, and there is a store buffer synchronizing membar in the delay slot, the chip can stop fetching instructions. If interrupts are enabled or some other trap is enabled, the chip will unwedge itself, but performance will suffer. We already had a workaround for this bug in a few spots, but it's better to have the entire tree sanitized for this rule. Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'arch/sparc64/lib')
-rw-r--r--arch/sparc64/lib/U1memcpy.S103
-rw-r--r--arch/sparc64/lib/VISsave.S15
-rw-r--r--arch/sparc64/lib/atomic.S42
-rw-r--r--arch/sparc64/lib/bitops.S31
-rw-r--r--arch/sparc64/lib/debuglocks.c6
-rw-r--r--arch/sparc64/lib/dec_and_lock.S6
-rw-r--r--arch/sparc64/lib/rwsem.S15
7 files changed, 130 insertions, 88 deletions
diff --git a/arch/sparc64/lib/U1memcpy.S b/arch/sparc64/lib/U1memcpy.S
index da9b520..bafd2fc 100644
--- a/arch/sparc64/lib/U1memcpy.S
+++ b/arch/sparc64/lib/U1memcpy.S
@@ -87,14 +87,17 @@
#define LOOP_CHUNK3(src, dest, len, branch_dest) \
MAIN_LOOP_CHUNK(src, dest, f32, f48, len, branch_dest)
+#define DO_SYNC membar #Sync;
#define STORE_SYNC(dest, fsrc) \
EX_ST(STORE_BLK(%fsrc, %dest)); \
- add %dest, 0x40, %dest;
+ add %dest, 0x40, %dest; \
+ DO_SYNC
#define STORE_JUMP(dest, fsrc, target) \
EX_ST(STORE_BLK(%fsrc, %dest)); \
add %dest, 0x40, %dest; \
- ba,pt %xcc, target;
+ ba,pt %xcc, target; \
+ nop;
#define FINISH_VISCHUNK(dest, f0, f1, left) \
subcc %left, 8, %left;\
@@ -239,17 +242,17 @@ FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */
ba,pt %xcc, 1b+4
faligndata %f0, %f2, %f48
1: FREG_FROB(f16,f18,f20,f22,f24,f26,f28,f30,f32)
- STORE_SYNC(o0, f48) membar #Sync
+ STORE_SYNC(o0, f48)
FREG_FROB(f32,f34,f36,f38,f40,f42,f44,f46,f0)
- STORE_JUMP(o0, f48, 40f) membar #Sync
+ STORE_JUMP(o0, f48, 40f)
2: FREG_FROB(f32,f34,f36,f38,f40,f42,f44,f46,f0)
- STORE_SYNC(o0, f48) membar #Sync
+ STORE_SYNC(o0, f48)
FREG_FROB(f0, f2, f4, f6, f8, f10,f12,f14,f16)
- STORE_JUMP(o0, f48, 48f) membar #Sync
+ STORE_JUMP(o0, f48, 48f)
3: FREG_FROB(f0, f2, f4, f6, f8, f10,f12,f14,f16)
- STORE_SYNC(o0, f48) membar #Sync
+ STORE_SYNC(o0, f48)
FREG_FROB(f16,f18,f20,f22,f24,f26,f28,f30,f32)
- STORE_JUMP(o0, f48, 56f) membar #Sync
+ STORE_JUMP(o0, f48, 56f)
1: FREG_FROB(f2, f4, f6, f8, f10,f12,f14,f16,f18)
LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f)
@@ -260,17 +263,17 @@ FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */
ba,pt %xcc, 1b+4
faligndata %f2, %f4, %f48
1: FREG_FROB(f18,f20,f22,f24,f26,f28,f30,f32,f34)
- STORE_SYNC(o0, f48) membar #Sync
+ STORE_SYNC(o0, f48)
FREG_FROB(f34,f36,f38,f40,f42,f44,f46,f0, f2)
- STORE_JUMP(o0, f48, 41f) membar #Sync
+ STORE_JUMP(o0, f48, 41f)
2: FREG_FROB(f34,f36,f38,f40,f42,f44,f46,f0, f2)
- STORE_SYNC(o0, f48) membar #Sync
+ STORE_SYNC(o0, f48)
FREG_FROB(f2, f4, f6, f8, f10,f12,f14,f16,f18)
- STORE_JUMP(o0, f48, 49f) membar #Sync
+ STORE_JUMP(o0, f48, 49f)
3: FREG_FROB(f2, f4, f6, f8, f10,f12,f14,f16,f18)
- STORE_SYNC(o0, f48) membar #Sync
+ STORE_SYNC(o0, f48)
FREG_FROB(f18,f20,f22,f24,f26,f28,f30,f32,f34)
- STORE_JUMP(o0, f48, 57f) membar #Sync
+ STORE_JUMP(o0, f48, 57f)
1: FREG_FROB(f4, f6, f8, f10,f12,f14,f16,f18,f20)
LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f)
@@ -281,17 +284,17 @@ FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */
ba,pt %xcc, 1b+4
faligndata %f4, %f6, %f48
1: FREG_FROB(f20,f22,f24,f26,f28,f30,f32,f34,f36)
- STORE_SYNC(o0, f48) membar #Sync
+ STORE_SYNC(o0, f48)
FREG_FROB(f36,f38,f40,f42,f44,f46,f0, f2, f4)
- STORE_JUMP(o0, f48, 42f) membar #Sync
+ STORE_JUMP(o0, f48, 42f)
2: FREG_FROB(f36,f38,f40,f42,f44,f46,f0, f2, f4)
- STORE_SYNC(o0, f48) membar #Sync
+ STORE_SYNC(o0, f48)
FREG_FROB(f4, f6, f8, f10,f12,f14,f16,f18,f20)
- STORE_JUMP(o0, f48, 50f) membar #Sync
+ STORE_JUMP(o0, f48, 50f)
3: FREG_FROB(f4, f6, f8, f10,f12,f14,f16,f18,f20)
- STORE_SYNC(o0, f48) membar #Sync
+ STORE_SYNC(o0, f48)
FREG_FROB(f20,f22,f24,f26,f28,f30,f32,f34,f36)
- STORE_JUMP(o0, f48, 58f) membar #Sync
+ STORE_JUMP(o0, f48, 58f)
1: FREG_FROB(f6, f8, f10,f12,f14,f16,f18,f20,f22)
LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f)
@@ -302,17 +305,17 @@ FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */
ba,pt %xcc, 1b+4
faligndata %f6, %f8, %f48
1: FREG_FROB(f22,f24,f26,f28,f30,f32,f34,f36,f38)
- STORE_SYNC(o0, f48) membar #Sync
+ STORE_SYNC(o0, f48)
FREG_FROB(f38,f40,f42,f44,f46,f0, f2, f4, f6)
- STORE_JUMP(o0, f48, 43f) membar #Sync
+ STORE_JUMP(o0, f48, 43f)
2: FREG_FROB(f38,f40,f42,f44,f46,f0, f2, f4, f6)
- STORE_SYNC(o0, f48) membar #Sync
+ STORE_SYNC(o0, f48)
FREG_FROB(f6, f8, f10,f12,f14,f16,f18,f20,f22)
- STORE_JUMP(o0, f48, 51f) membar #Sync
+ STORE_JUMP(o0, f48, 51f)
3: FREG_FROB(f6, f8, f10,f12,f14,f16,f18,f20,f22)
- STORE_SYNC(o0, f48) membar #Sync
+ STORE_SYNC(o0, f48)
FREG_FROB(f22,f24,f26,f28,f30,f32,f34,f36,f38)
- STORE_JUMP(o0, f48, 59f) membar #Sync
+ STORE_JUMP(o0, f48, 59f)
1: FREG_FROB(f8, f10,f12,f14,f16,f18,f20,f22,f24)
LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f)
@@ -323,17 +326,17 @@ FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */
ba,pt %xcc, 1b+4
faligndata %f8, %f10, %f48
1: FREG_FROB(f24,f26,f28,f30,f32,f34,f36,f38,f40)
- STORE_SYNC(o0, f48) membar #Sync
+ STORE_SYNC(o0, f48)
FREG_FROB(f40,f42,f44,f46,f0, f2, f4, f6, f8)
- STORE_JUMP(o0, f48, 44f) membar #Sync
+ STORE_JUMP(o0, f48, 44f)
2: FREG_FROB(f40,f42,f44,f46,f0, f2, f4, f6, f8)
- STORE_SYNC(o0, f48) membar #Sync
+ STORE_SYNC(o0, f48)
FREG_FROB(f8, f10,f12,f14,f16,f18,f20,f22,f24)
- STORE_JUMP(o0, f48, 52f) membar #Sync
+ STORE_JUMP(o0, f48, 52f)
3: FREG_FROB(f8, f10,f12,f14,f16,f18,f20,f22,f24)
- STORE_SYNC(o0, f48) membar #Sync
+ STORE_SYNC(o0, f48)
FREG_FROB(f24,f26,f28,f30,f32,f34,f36,f38,f40)
- STORE_JUMP(o0, f48, 60f) membar #Sync
+ STORE_JUMP(o0, f48, 60f)
1: FREG_FROB(f10,f12,f14,f16,f18,f20,f22,f24,f26)
LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f)
@@ -344,17 +347,17 @@ FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */
ba,pt %xcc, 1b+4
faligndata %f10, %f12, %f48
1: FREG_FROB(f26,f28,f30,f32,f34,f36,f38,f40,f42)
- STORE_SYNC(o0, f48) membar #Sync
+ STORE_SYNC(o0, f48)
FREG_FROB(f42,f44,f46,f0, f2, f4, f6, f8, f10)
- STORE_JUMP(o0, f48, 45f) membar #Sync
+ STORE_JUMP(o0, f48, 45f)
2: FREG_FROB(f42,f44,f46,f0, f2, f4, f6, f8, f10)
- STORE_SYNC(o0, f48) membar #Sync
+ STORE_SYNC(o0, f48)
FREG_FROB(f10,f12,f14,f16,f18,f20,f22,f24,f26)
- STORE_JUMP(o0, f48, 53f) membar #Sync
+ STORE_JUMP(o0, f48, 53f)
3: FREG_FROB(f10,f12,f14,f16,f18,f20,f22,f24,f26)
- STORE_SYNC(o0, f48) membar #Sync
+ STORE_SYNC(o0, f48)
FREG_FROB(f26,f28,f30,f32,f34,f36,f38,f40,f42)
- STORE_JUMP(o0, f48, 61f) membar #Sync
+ STORE_JUMP(o0, f48, 61f)
1: FREG_FROB(f12,f14,f16,f18,f20,f22,f24,f26,f28)
LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f)
@@ -365,17 +368,17 @@ FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */
ba,pt %xcc, 1b+4
faligndata %f12, %f14, %f48
1: FREG_FROB(f28,f30,f32,f34,f36,f38,f40,f42,f44)
- STORE_SYNC(o0, f48) membar #Sync
+ STORE_SYNC(o0, f48)
FREG_FROB(f44,f46,f0, f2, f4, f6, f8, f10,f12)
- STORE_JUMP(o0, f48, 46f) membar #Sync
+ STORE_JUMP(o0, f48, 46f)
2: FREG_FROB(f44,f46,f0, f2, f4, f6, f8, f10,f12)
- STORE_SYNC(o0, f48) membar #Sync
+ STORE_SYNC(o0, f48)
FREG_FROB(f12,f14,f16,f18,f20,f22,f24,f26,f28)
- STORE_JUMP(o0, f48, 54f) membar #Sync
+ STORE_JUMP(o0, f48, 54f)
3: FREG_FROB(f12,f14,f16,f18,f20,f22,f24,f26,f28)
- STORE_SYNC(o0, f48) membar #Sync
+ STORE_SYNC(o0, f48)
FREG_FROB(f28,f30,f32,f34,f36,f38,f40,f42,f44)
- STORE_JUMP(o0, f48, 62f) membar #Sync
+ STORE_JUMP(o0, f48, 62f)
1: FREG_FROB(f14,f16,f18,f20,f22,f24,f26,f28,f30)
LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f)
@@ -386,17 +389,17 @@ FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */
ba,pt %xcc, 1b+4
faligndata %f14, %f16, %f48
1: FREG_FROB(f30,f32,f34,f36,f38,f40,f42,f44,f46)
- STORE_SYNC(o0, f48) membar #Sync
+ STORE_SYNC(o0, f48)
FREG_FROB(f46,f0, f2, f4, f6, f8, f10,f12,f14)
- STORE_JUMP(o0, f48, 47f) membar #Sync
+ STORE_JUMP(o0, f48, 47f)
2: FREG_FROB(f46,f0, f2, f4, f6, f8, f10,f12,f14)
- STORE_SYNC(o0, f48) membar #Sync
+ STORE_SYNC(o0, f48)
FREG_FROB(f14,f16,f18,f20,f22,f24,f26,f28,f30)
- STORE_JUMP(o0, f48, 55f) membar #Sync
+ STORE_JUMP(o0, f48, 55f)
3: FREG_FROB(f14,f16,f18,f20,f22,f24,f26,f28,f30)
- STORE_SYNC(o0, f48) membar #Sync
+ STORE_SYNC(o0, f48)
FREG_FROB(f30,f32,f34,f36,f38,f40,f42,f44,f46)
- STORE_JUMP(o0, f48, 63f) membar #Sync
+ STORE_JUMP(o0, f48, 63f)
40: FINISH_VISCHUNK(o0, f0, f2, g3)
41: FINISH_VISCHUNK(o0, f2, f4, g3)
diff --git a/arch/sparc64/lib/VISsave.S b/arch/sparc64/lib/VISsave.S
index 65e328d..4e18989 100644
--- a/arch/sparc64/lib/VISsave.S
+++ b/arch/sparc64/lib/VISsave.S
@@ -72,7 +72,11 @@ vis1: ldub [%g6 + TI_FPSAVED], %g3
stda %f48, [%g3 + %g1] ASI_BLK_P
5: membar #Sync
- jmpl %g7 + %g0, %g0
+ ba,pt %xcc, 80f
+ nop
+
+ .align 32
+80: jmpl %g7 + %g0, %g0
nop
6: ldub [%g3 + TI_FPSAVED], %o5
@@ -87,8 +91,11 @@ vis1: ldub [%g6 + TI_FPSAVED], %g3
stda %f32, [%g2 + %g1] ASI_BLK_P
stda %f48, [%g3 + %g1] ASI_BLK_P
membar #Sync
- jmpl %g7 + %g0, %g0
+ ba,pt %xcc, 80f
+ nop
+ .align 32
+80: jmpl %g7 + %g0, %g0
nop
.align 32
@@ -126,6 +133,10 @@ VISenterhalf:
stda %f0, [%g2 + %g1] ASI_BLK_P
stda %f16, [%g3 + %g1] ASI_BLK_P
membar #Sync
+ ba,pt %xcc, 4f
+ nop
+
+ .align 32
4: and %o5, FPRS_DU, %o5
jmpl %g7 + %g0, %g0
wr %o5, FPRS_FEF, %fprs
diff --git a/arch/sparc64/lib/atomic.S b/arch/sparc64/lib/atomic.S
index e528b8d..faf87c3 100644
--- a/arch/sparc64/lib/atomic.S
+++ b/arch/sparc64/lib/atomic.S
@@ -7,18 +7,6 @@
#include <linux/config.h>
#include <asm/asi.h>
- /* On SMP we need to use memory barriers to ensure
- * correct memory operation ordering, nop these out
- * for uniprocessor.
- */
-#ifdef CONFIG_SMP
-#define ATOMIC_PRE_BARRIER membar #StoreLoad | #LoadLoad
-#define ATOMIC_POST_BARRIER membar #StoreLoad | #StoreStore
-#else
-#define ATOMIC_PRE_BARRIER nop
-#define ATOMIC_POST_BARRIER nop
-#endif
-
.text
/* Two versions of the atomic routines, one that
@@ -52,6 +40,24 @@ atomic_sub: /* %o0 = decrement, %o1 = atomic_ptr */
nop
.size atomic_sub, .-atomic_sub
+ /* On SMP we need to use memory barriers to ensure
+ * correct memory operation ordering, nop these out
+ * for uniprocessor.
+ */
+#ifdef CONFIG_SMP
+
+#define ATOMIC_PRE_BARRIER membar #StoreLoad | #LoadLoad;
+#define ATOMIC_POST_BARRIER \
+ ba,pt %xcc, 80b; \
+ membar #StoreLoad | #StoreStore
+
+80: retl
+ nop
+#else
+#define ATOMIC_PRE_BARRIER
+#define ATOMIC_POST_BARRIER
+#endif
+
.globl atomic_add_ret
.type atomic_add_ret,#function
atomic_add_ret: /* %o0 = increment, %o1 = atomic_ptr */
@@ -62,9 +68,10 @@ atomic_add_ret: /* %o0 = increment, %o1 = atomic_ptr */
cmp %g1, %g7
bne,pn %icc, 1b
add %g7, %o0, %g7
+ sra %g7, 0, %o0
ATOMIC_POST_BARRIER
retl
- sra %g7, 0, %o0
+ nop
.size atomic_add_ret, .-atomic_add_ret
.globl atomic_sub_ret
@@ -77,9 +84,10 @@ atomic_sub_ret: /* %o0 = decrement, %o1 = atomic_ptr */
cmp %g1, %g7
bne,pn %icc, 1b
sub %g7, %o0, %g7
+ sra %g7, 0, %o0
ATOMIC_POST_BARRIER
retl
- sra %g7, 0, %o0
+ nop
.size atomic_sub_ret, .-atomic_sub_ret
.globl atomic64_add
@@ -118,9 +126,10 @@ atomic64_add_ret: /* %o0 = increment, %o1 = atomic_ptr */
cmp %g1, %g7
bne,pn %xcc, 1b
add %g7, %o0, %g7
+ mov %g7, %o0
ATOMIC_POST_BARRIER
retl
- mov %g7, %o0
+ nop
.size atomic64_add_ret, .-atomic64_add_ret
.globl atomic64_sub_ret
@@ -133,7 +142,8 @@ atomic64_sub_ret: /* %o0 = decrement, %o1 = atomic_ptr */
cmp %g1, %g7
bne,pn %xcc, 1b
sub %g7, %o0, %g7
+ mov %g7, %o0
ATOMIC_POST_BARRIER
retl
- mov %g7, %o0
+ nop
.size atomic64_sub_ret, .-atomic64_sub_ret
diff --git a/arch/sparc64/lib/bitops.S b/arch/sparc64/lib/bitops.S
index 886dcd2..31afbfe 100644
--- a/arch/sparc64/lib/bitops.S
+++ b/arch/sparc64/lib/bitops.S
@@ -7,20 +7,26 @@
#include <linux/config.h>
#include <asm/asi.h>
+ .text
+
/* On SMP we need to use memory barriers to ensure
* correct memory operation ordering, nop these out
* for uniprocessor.
*/
+
#ifdef CONFIG_SMP
#define BITOP_PRE_BARRIER membar #StoreLoad | #LoadLoad
-#define BITOP_POST_BARRIER membar #StoreLoad | #StoreStore
+#define BITOP_POST_BARRIER \
+ ba,pt %xcc, 80b; \
+ membar #StoreLoad | #StoreStore
+
+80: retl
+ nop
#else
-#define BITOP_PRE_BARRIER nop
-#define BITOP_POST_BARRIER nop
+#define BITOP_PRE_BARRIER
+#define BITOP_POST_BARRIER
#endif
- .text
-
.globl test_and_set_bit
.type test_and_set_bit,#function
test_and_set_bit: /* %o0=nr, %o1=addr */
@@ -37,10 +43,11 @@ test_and_set_bit: /* %o0=nr, %o1=addr */
cmp %g7, %g1
bne,pn %xcc, 1b
and %g7, %o2, %g2
- BITOP_POST_BARRIER
clr %o0
+ movrne %g2, 1, %o0
+ BITOP_POST_BARRIER
retl
- movrne %g2, 1, %o0
+ nop
.size test_and_set_bit, .-test_and_set_bit
.globl test_and_clear_bit
@@ -59,10 +66,11 @@ test_and_clear_bit: /* %o0=nr, %o1=addr */
cmp %g7, %g1
bne,pn %xcc, 1b
and %g7, %o2, %g2
- BITOP_POST_BARRIER
clr %o0
+ movrne %g2, 1, %o0
+ BITOP_POST_BARRIER
retl
- movrne %g2, 1, %o0
+ nop
.size test_and_clear_bit, .-test_and_clear_bit
.globl test_and_change_bit
@@ -81,10 +89,11 @@ test_and_change_bit: /* %o0=nr, %o1=addr */
cmp %g7, %g1
bne,pn %xcc, 1b
and %g7, %o2, %g2
- BITOP_POST_BARRIER
clr %o0
+ movrne %g2, 1, %o0
+ BITOP_POST_BARRIER
retl
- movrne %g2, 1, %o0
+ nop
.size test_and_change_bit, .-test_and_change_bit
.globl set_bit
diff --git a/arch/sparc64/lib/debuglocks.c b/arch/sparc64/lib/debuglocks.c
index c421e0c..f03344c 100644
--- a/arch/sparc64/lib/debuglocks.c
+++ b/arch/sparc64/lib/debuglocks.c
@@ -252,8 +252,9 @@ wlock_again:
" andn %%g1, %%g3, %%g7\n"
" casx [%0], %%g1, %%g7\n"
" cmp %%g1, %%g7\n"
+" membar #StoreLoad | #StoreStore\n"
" bne,pn %%xcc, 1b\n"
-" membar #StoreLoad | #StoreStore"
+" nop"
: /* no outputs */
: "r" (&(rw->lock))
: "g3", "g1", "g7", "cc", "memory");
@@ -351,8 +352,9 @@ int _do_write_trylock (rwlock_t *rw, char *str)
" andn %%g1, %%g3, %%g7\n"
" casx [%0], %%g1, %%g7\n"
" cmp %%g1, %%g7\n"
+" membar #StoreLoad | #StoreStore\n"
" bne,pn %%xcc, 1b\n"
-" membar #StoreLoad | #StoreStore"
+" nop"
: /* no outputs */
: "r" (&(rw->lock))
: "g3", "g1", "g7", "cc", "memory");
diff --git a/arch/sparc64/lib/dec_and_lock.S b/arch/sparc64/lib/dec_and_lock.S
index 7e6fdae..8ee288d 100644
--- a/arch/sparc64/lib/dec_and_lock.S
+++ b/arch/sparc64/lib/dec_and_lock.S
@@ -48,8 +48,9 @@ start_to_zero:
#endif
to_zero:
ldstub [%o1], %g3
+ membar #StoreLoad | #StoreStore
brnz,pn %g3, spin_on_lock
- membar #StoreLoad | #StoreStore
+ nop
loop2: cas [%o0], %g2, %g7 /* ASSERT(g7 == 0) */
cmp %g2, %g7
@@ -71,8 +72,9 @@ loop2: cas [%o0], %g2, %g7 /* ASSERT(g7 == 0) */
nop
spin_on_lock:
ldub [%o1], %g3
+ membar #LoadLoad
brnz,pt %g3, spin_on_lock
- membar #LoadLoad
+ nop
ba,pt %xcc, to_zero
nop
nop
diff --git a/arch/sparc64/lib/rwsem.S b/arch/sparc64/lib/rwsem.S
index 174ff7b..75f0e6b 100644
--- a/arch/sparc64/lib/rwsem.S
+++ b/arch/sparc64/lib/rwsem.S
@@ -17,8 +17,9 @@ __down_read:
bne,pn %icc, 1b
add %g7, 1, %g7
cmp %g7, 0
+ membar #StoreLoad | #StoreStore
bl,pn %icc, 3f
- membar #StoreLoad | #StoreStore
+ nop
2:
retl
nop
@@ -57,8 +58,9 @@ __down_write:
cmp %g3, %g7
bne,pn %icc, 1b
cmp %g7, 0
+ membar #StoreLoad | #StoreStore
bne,pn %icc, 3f
- membar #StoreLoad | #StoreStore
+ nop
2: retl
nop
3:
@@ -97,8 +99,9 @@ __up_read:
cmp %g1, %g7
bne,pn %icc, 1b
cmp %g7, 0
+ membar #StoreLoad | #StoreStore
bl,pn %icc, 3f
- membar #StoreLoad | #StoreStore
+ nop
2: retl
nop
3: sethi %hi(RWSEM_ACTIVE_MASK), %g1
@@ -126,8 +129,9 @@ __up_write:
bne,pn %icc, 1b
sub %g7, %g1, %g7
cmp %g7, 0
+ membar #StoreLoad | #StoreStore
bl,pn %icc, 3f
- membar #StoreLoad | #StoreStore
+ nop
2:
retl
nop
@@ -151,8 +155,9 @@ __downgrade_write:
bne,pn %icc, 1b
sub %g7, %g1, %g7
cmp %g7, 0
+ membar #StoreLoad | #StoreStore
bl,pn %icc, 3f
- membar #StoreLoad | #StoreStore
+ nop
2:
retl
nop
OpenPOWER on IntegriCloud