From 6b999dff3e159c3cb1559a30c663435c3265807b Mon Sep 17 00:00:00 2001
From: ian
Date: Mon, 11 Aug 2014 02:20:24 +0000
Subject: MFC r269403, r269405, r269410, r269414:

Add 64-bit atomic ops for armv6, and also for armv4 only in kernel code.
Use the new ops in the cddl code (and avoid defining functions with the
same names locally).
---
 sys/arm/include/atomic.h                           | 321 +++++++++++++++++++++
 .../compat/opensolaris/kern/opensolaris_atomic.c   |   2 +-
 sys/cddl/compat/opensolaris/sys/atomic.h           |   4 +-
 3 files changed, 324 insertions(+), 3 deletions(-)

diff --git a/sys/arm/include/atomic.h b/sys/arm/include/atomic.h
index 02be1bd..a62ca00 100644
--- a/sys/arm/include/atomic.h
+++ b/sys/arm/include/atomic.h
@@ -88,6 +88,8 @@
         defined (__ARM_ARCH_6T2__) || \
         defined (__ARM_ARCH_6Z__) || \
         defined (__ARM_ARCH_6ZK__)
+#define ARM_HAVE_ATOMIC64
+
 static __inline void
 __do_dmb(void)
 {
@@ -146,6 +148,28 @@ atomic_set_32(volatile uint32_t *address, uint32_t setmask)
 }
 
 static __inline void
+atomic_set_64(volatile uint64_t *p, uint64_t val)
+{
+        uint64_t tmp;
+        uint32_t exflag;
+
+        __asm __volatile(
+            "1:          \n"
+            "   ldrexd   %[tmp], [%[ptr]]\n"
+            "   orr      %Q[tmp], %Q[val]\n"
+            "   orr      %R[tmp], %R[val]\n"
+            "   strexd   %[exf], %[tmp], [%[ptr]]\n"
+            "   teq      %[exf], #0\n"
+            "   it       ne    \n"
+            "   bne      1b\n"
+            :   [exf]    "=&r"  (exflag),
+                [tmp]    "=&r"  (tmp)
+            :   [ptr]    "r"    (p),
+                [val]    "r"    (val)
+            :   "cc", "memory");
+}
+
+static __inline void
 atomic_set_long(volatile u_long *address, u_long setmask)
 {
         u_long tmp = 0, tmp2 = 0;
@@ -177,6 +201,28 @@ atomic_clear_32(volatile uint32_t *address, uint32_t setmask)
 }
 
 static __inline void
+atomic_clear_64(volatile uint64_t *p, uint64_t val)
+{
+        uint64_t tmp;
+        uint32_t exflag;
+
+        __asm __volatile(
+            "1:          \n"
+            "   ldrexd   %[tmp], [%[ptr]]\n"
+            "   bic      %Q[tmp], %Q[val]\n"
+            "   bic      %R[tmp], %R[val]\n"
+            "   strexd   %[exf], %[tmp], [%[ptr]]\n"
+            "   teq      %[exf], #0\n"
+            "   it       ne    \n"
+            "   bne      1b\n"
+            :   [exf]    "=&r"  (exflag),
+                [tmp]    "=&r"  (tmp)
+            :   [ptr]    "r"    (p),
+                [val]    "r"    (val)
+            :   "cc", "memory");
+}
+
+static __inline void
 atomic_clear_long(volatile u_long *address, u_long setmask)
 {
         u_long tmp = 0, tmp2 = 0;
@@ -213,6 +259,35 @@ atomic_cmpset_32(volatile u_int32_t *p, volatile u_int32_t cmpval, volatile u_in
         return (ret);
 }
 
+static __inline int
+atomic_cmpset_64(volatile uint64_t *p, uint64_t cmpval, uint64_t newval)
+{
+        uint64_t tmp;
+        uint32_t ret;
+
+        __asm __volatile(
+            "1:          \n"
+            "   ldrexd   %[tmp], [%[ptr]]\n"
+            "   teq      %Q[tmp], %Q[cmp]\n"
+            "   itee     eq    \n"
+            "   teqeq    %R[tmp], %R[cmp]\n"
+            "   movne    %[ret], #0\n"
+            "   bne      2f\n"
+            "   strexd   %[ret], %[new], [%[ptr]]\n"
+            "   teq      %[ret], #0\n"
+            "   it       ne    \n"
+            "   bne      1b\n"
+            "   mov      %[ret], #1\n"
+            "2:          \n"
+            :   [ret]    "=&r"  (ret),
+                [tmp]    "=&r"  (tmp)
+            :   [ptr]    "r"    (p),
+                [cmp]    "r"    (cmpval),
+                [new]    "r"    (newval)
+            :   "cc", "memory");
+        return (ret);
+}
+
 static __inline u_long
 atomic_cmpset_long(volatile u_long *p, volatile u_long cmpval, volatile u_long newval)
 {
@@ -244,6 +319,15 @@ atomic_cmpset_acq_32(volatile u_int32_t *p, volatile u_int32_t cmpval, volatile
         return (ret);
 }
 
+static __inline uint64_t
+atomic_cmpset_acq_64(volatile uint64_t *p, volatile uint64_t cmpval, volatile uint64_t newval)
+{
+        uint64_t ret = atomic_cmpset_64(p, cmpval, newval);
+
+        __do_dmb();
+        return (ret);
+}
+
 static __inline u_long
 atomic_cmpset_acq_long(volatile u_long *p, volatile u_long cmpval, volatile u_long newval)
 {
@@ -261,6 +345,14 @@ atomic_cmpset_rel_32(volatile u_int32_t *p, volatile u_int32_t cmpval, volatile
         return (atomic_cmpset_32(p, cmpval, newval));
 }
 
+static __inline uint64_t
+atomic_cmpset_rel_64(volatile uint64_t *p, volatile uint64_t cmpval, volatile uint64_t newval)
+{
+
+        __do_dmb();
+        return (atomic_cmpset_64(p, cmpval, newval));
+}
+
 static __inline u_long
 atomic_cmpset_rel_long(volatile u_long *p, volatile u_long cmpval, volatile u_long newval)
 {
@@ -286,6 +378,28 @@ atomic_add_32(volatile u_int32_t *p, u_int32_t val)
 }
 
 static __inline void
+atomic_add_64(volatile uint64_t *p, uint64_t val)
+{
+        uint64_t tmp;
+        uint32_t exflag;
+
+        __asm __volatile(
+            "1:          \n"
+            "   ldrexd   %[tmp], [%[ptr]]\n"
+            "   adds     %Q[tmp], %Q[val]\n"
+            "   adc      %R[tmp], %R[val]\n"
+            "   strexd   %[exf], %[tmp], [%[ptr]]\n"
+            "   teq      %[exf], #0\n"
+            "   it       ne    \n"
+            "   bne      1b\n"
+            :   [exf]    "=&r"  (exflag),
+                [tmp]    "=&r"  (tmp)
+            :   [ptr]    "r"    (p),
+                [val]    "r"    (val)
+            :   "cc", "memory");
+}
+
+static __inline void
 atomic_add_long(volatile u_long *p, u_long val)
 {
         u_long tmp = 0, tmp2 = 0;
@@ -316,6 +430,28 @@ atomic_subtract_32(volatile u_int32_t *p, u_int32_t val)
 }
 
 static __inline void
+atomic_subtract_64(volatile uint64_t *p, uint64_t val)
+{
+        uint64_t tmp;
+        uint32_t exflag;
+
+        __asm __volatile(
+            "1:          \n"
+            "   ldrexd   %[tmp], [%[ptr]]\n"
+            "   subs     %Q[tmp], %Q[val]\n"
+            "   sbc      %R[tmp], %R[val]\n"
+            "   strexd   %[exf], %[tmp], [%[ptr]]\n"
+            "   teq      %[exf], #0\n"
+            "   it       ne    \n"
+            "   bne      1b\n"
+            :   [exf]    "=&r"  (exflag),
+                [tmp]    "=&r"  (tmp)
+            :   [ptr]    "r"    (p),
+                [val]    "r"    (val)
+            :   "cc", "memory");
+}
+
+static __inline void
 atomic_subtract_long(volatile u_long *p, u_long val)
 {
         u_long tmp = 0, tmp2 = 0;
@@ -334,6 +470,10 @@ ATOMIC_ACQ_REL(clear, 32)
 ATOMIC_ACQ_REL(add, 32)
 ATOMIC_ACQ_REL(subtract, 32)
 ATOMIC_ACQ_REL(set, 32)
+ATOMIC_ACQ_REL(clear, 64)
+ATOMIC_ACQ_REL(add, 64)
+ATOMIC_ACQ_REL(subtract, 64)
+ATOMIC_ACQ_REL(set, 64)
 ATOMIC_ACQ_REL_LONG(clear)
 ATOMIC_ACQ_REL_LONG(add)
 ATOMIC_ACQ_REL_LONG(subtract)
@@ -392,6 +532,116 @@ atomic_store_rel_32(volatile uint32_t *p, uint32_t v)
         *p = v;
 }
 
+static __inline uint64_t
+atomic_fetchadd_64(volatile uint64_t *p, uint64_t val)
+{
+        uint64_t ret, tmp;
+        uint32_t exflag;
+
+        __asm __volatile(
+            "1:          \n"
+            "   ldrexd   %[ret], [%[ptr]]\n"
+            "   adds     %Q[tmp], %Q[ret], %Q[val]\n"
+            "   adc      %R[tmp], %R[ret], %R[val]\n"
+            "   strexd   %[exf], %[tmp], [%[ptr]]\n"
+            "   teq      %[exf], #0\n"
+            "   it       ne    \n"
+            "   bne      1b\n"
+            :   [ret]    "=&r"  (ret),
+                [exf]    "=&r"  (exflag),
+                [tmp]    "=&r"  (tmp)
+            :   [ptr]    "r"    (p),
+                [val]    "r"    (val)
+            :   "cc", "memory");
+        return (ret);
+}
+
+static __inline uint64_t
+atomic_readandclear_64(volatile uint64_t *p)
+{
+        uint64_t ret, tmp;
+        uint32_t exflag;
+
+        __asm __volatile(
+            "1:          \n"
+            "   ldrexd   %[ret], [%[ptr]]\n"
+            "   mov      %Q[tmp], #0\n"
+            "   mov      %R[tmp], #0\n"
+            "   strexd   %[exf], %[tmp], [%[ptr]]\n"
+            "   teq      %[exf], #0\n"
+            "   it       ne    \n"
+            "   bne      1b\n"
+            :   [ret]    "=&r"  (ret),
+                [exf]    "=&r"  (exflag),
+                [tmp]    "=&r"  (tmp)
+            :   [ptr]    "r"    (p)
+            :   "cc", "memory");
+        return (ret);
+}
+
+static __inline uint64_t
+atomic_load_64(volatile uint64_t *p)
+{
+        uint64_t ret;
+
+        /*
+         * The only way to atomically load 64 bits is with LDREXD which puts the
+         * exclusive monitor into the open state, so reset it with CLREX because
+         * we don't actually need to store anything.
+         */
+        __asm __volatile(
+            "1:          \n"
+            "   ldrexd   %[ret], [%[ptr]]\n"
+            "   clrex    \n"
+            :   [ret]    "=&r"  (ret)
+            :   [ptr]    "r"    (p)
+            :   "cc", "memory");
+        return (ret);
+}
+
+static __inline uint64_t
+atomic_load_acq_64(volatile uint64_t *p)
+{
+        uint64_t ret;
+
+        ret = atomic_load_64(p);
+        __do_dmb();
+        return (ret);
+}
+
+static __inline void
+atomic_store_64(volatile uint64_t *p, uint64_t val)
+{
+        uint64_t tmp;
+        uint32_t exflag;
+
+        /*
+         * The only way to atomically store 64 bits is with STREXD, which will
+         * succeed only if paired up with a preceding LDREXD using the same
+         * address, so we read and discard the existing value before storing.
+         */
+        __asm __volatile(
+            "1:          \n"
+            "   ldrexd   %[tmp], [%[ptr]]\n"
+            "   strexd   %[exf], %[val], [%[ptr]]\n"
+            "   teq      %[exf], #0\n"
+            "   it       ne    \n"
+            "   bne      1b\n"
+            :   [tmp]    "=&r"  (tmp),
+                [exf]    "=&r"  (exflag)
+            :   [ptr]    "r"    (p),
+                [val]    "r"    (val)
+            :   "cc", "memory");
+}
+
+static __inline void
+atomic_store_rel_64(volatile uint64_t *p, uint64_t val)
+{
+
+        __do_dmb();
+        atomic_store_64(p, val);
+}
+
 static __inline u_long
 atomic_fetchadd_long(volatile u_long *p, u_long val)
 {
@@ -474,6 +724,8 @@ __swp(uint32_t val, volatile uint32_t *ptr)
 
 #ifdef _KERNEL
 
+#define ARM_HAVE_ATOMIC64
+
 static __inline void
 atomic_set_32(volatile uint32_t *address, uint32_t setmask)
 {
@@ -481,11 +733,23 @@ atomic_set_32(volatile uint32_t *address, uint32_t setmask)
 }
 
 static __inline void
+atomic_set_64(volatile uint64_t *address, uint64_t setmask)
+{
+        __with_interrupts_disabled(*address |= setmask);
+}
+
+static __inline void
 atomic_clear_32(volatile uint32_t *address, uint32_t clearmask)
 {
         __with_interrupts_disabled(*address &= ~clearmask);
 }
 
+static __inline void
+atomic_clear_64(volatile uint64_t *address, uint64_t clearmask)
+{
+        __with_interrupts_disabled(*address &= ~clearmask);
+}
+
 static __inline u_int32_t
 atomic_cmpset_32(volatile u_int32_t *p, volatile u_int32_t cmpval, volatile u_int32_t newval)
 {
@@ -503,6 +767,23 @@ atomic_cmpset_32(volatile u_int32_t *p, volatile u_int32_t cmpval, volatile u_in
         return (ret);
 }
 
+static __inline u_int64_t
+atomic_cmpset_64(volatile u_int64_t *p, volatile u_int64_t cmpval, volatile u_int64_t newval)
+{
+        int ret;
+
+        __with_interrupts_disabled(
+         {
+                if (*p == cmpval) {
+                        *p = newval;
+                        ret = 1;
+                } else {
+                        ret = 0;
+                }
+         });
+        return (ret);
+}
+
 static __inline void
 atomic_add_32(volatile u_int32_t *p, u_int32_t val)
 {
@@ -510,11 +791,23 @@ atomic_add_32(volatile u_int32_t *p, u_int32_t val)
 }
 
 static __inline void
+atomic_add_64(volatile u_int64_t *p, u_int64_t val)
+{
+        __with_interrupts_disabled(*p += val);
+}
+
+static __inline void
 atomic_subtract_32(volatile u_int32_t *p, u_int32_t val)
 {
         __with_interrupts_disabled(*p -= val);
 }
 
+static __inline void
+atomic_subtract_64(volatile u_int64_t *p, u_int64_t val)
+{
+        __with_interrupts_disabled(*p -= val);
+}
+
 static __inline uint32_t
 atomic_fetchadd_32(volatile uint32_t *p, uint32_t v)
 {
@@ -528,6 +821,34 @@ atomic_fetchadd_32(volatile uint32_t *p, uint32_t v)
         return (value);
 }
 
+static __inline uint64_t
+atomic_fetchadd_64(volatile uint64_t *p, uint64_t v)
+{
+        uint64_t value;
+
+        __with_interrupts_disabled(
+         {
+                value = *p;
+                *p += v;
+         });
+        return (value);
+}
+
+static __inline uint64_t
+atomic_load_64(volatile uint64_t *p)
+{
+        uint64_t value;
+
+        __with_interrupts_disabled(value = *p);
+        return (value);
+}
+
+static __inline void
+atomic_store_64(volatile uint64_t *p, uint64_t value)
+{
+        __with_interrupts_disabled(*p = value);
+}
+
 #else /* !_KERNEL */
 
 static __inline u_int32_t
diff --git a/sys/cddl/compat/opensolaris/kern/opensolaris_atomic.c b/sys/cddl/compat/opensolaris/kern/opensolaris_atomic.c
index 4aba9ea..8a0fdf3 100644
--- a/sys/cddl/compat/opensolaris/kern/opensolaris_atomic.c
+++ b/sys/cddl/compat/opensolaris/kern/opensolaris_atomic.c
@@ -52,7 +52,7 @@ atomic_init(void)
 }
 #endif
 
-#if !defined(__LP64__) && !defined(__mips_n32)
+#if !defined(__LP64__) && !defined(__mips_n32) && !defined(ARM_HAVE_ATOMIC64)
 void
 atomic_add_64(volatile uint64_t *target, int64_t delta)
 {
diff --git a/sys/cddl/compat/opensolaris/sys/atomic.h b/sys/cddl/compat/opensolaris/sys/atomic.h
index f34d77e..65c9e71 100644
--- a/sys/cddl/compat/opensolaris/sys/atomic.h
+++ b/sys/cddl/compat/opensolaris/sys/atomic.h
@@ -36,7 +36,7 @@
         atomic_cmpset_ptr((volatile uintptr_t *)(_a), (uintptr_t)(_b), (uintptr_t) (_c))
 #define cas32 atomic_cmpset_32
 
-#if !defined(__LP64__) && !defined(__mips_n32)
+#if !defined(__LP64__) && !defined(__mips_n32) && !defined(ARM_HAVE_ATOMIC64)
 extern void atomic_add_64(volatile uint64_t *target, int64_t delta);
 extern void atomic_dec_64(volatile uint64_t *target);
 #endif
@@ -85,7 +85,7 @@ atomic_dec_32_nv(volatile uint32_t *target)
         return (atomic_fetchadd_32(target, -1) - 1);
 }
 
-#if defined(__LP64__) || defined(__mips_n32)
+#if defined(__LP64__) || defined(__mips_n32) || defined(ARM_HAVE_ATOMIC64)
 static __inline void
 atomic_dec_64(volatile uint64_t *target)
 {
--
cgit v1.1
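Usage sketch (not part of the commit): once <machine/atomic.h> defines ARM_HAVE_ATOMIC64, kernel code on armv6 can manipulate 64-bit counters with these primitives directly, which is what lets the cddl wrappers above stop relying on the opensolaris_atomic.c fallbacks. The consumer below is hypothetical (rx_bytes and both helper functions are illustrative names); atomic_add_64() and atomic_readandclear_64() are operations added by this patch, and note that atomic_readandclear_64() exists only in the armv6 LDREXD/STREXD path, not in the armv4 interrupts-disabled kernel fallback.

#include <sys/types.h>
#include <machine/atomic.h>

/* Hypothetical 64-bit statistics counter updated from multiple contexts. */
static uint64_t rx_bytes;

static __inline void
rx_account(uint32_t len)
{
        /* 64-bit add: LDREXD/STREXD retry loop on armv6, interrupts-disabled on armv4 kernel. */
        atomic_add_64(&rx_bytes, len);
}

static __inline uint64_t
rx_collect(void)
{
        /* Atomically fetch the running total and reset it to zero (armv6 path only). */
        return (atomic_readandclear_64(&rx_bytes));
}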