summaryrefslogtreecommitdiffstats
path: root/lib/arm/udivsi3.S
diff options
context:
space:
mode:
Diffstat (limited to 'lib/arm/udivsi3.S')
-rw-r--r--lib/arm/udivsi3.S80
1 files changed, 80 insertions, 0 deletions
diff --git a/lib/arm/udivsi3.S b/lib/arm/udivsi3.S
new file mode 100644
index 0000000..6d89665
--- /dev/null
+++ b/lib/arm/udivsi3.S
@@ -0,0 +1,80 @@
+/*===-- udivsi3.S - 32-bit unsigned integer divide ------------------------===//
+ *
+ * The LLVM Compiler Infrastructure
+ *
+ * This file is dual licensed under the MIT and the University of Illinois Open
+ * Source Licenses. See LICENSE.TXT for details.
+ *
+ *===----------------------------------------------------------------------===//
+ *
+ * This file implements the __udivsi3 (32-bit unsigned integer divide)
+ * function for the ARM architecture. A naive digit-by-digit computation is
+ * employed for simplicity.
+ *
+ *===----------------------------------------------------------------------===*/
+
+#include "../assembly.h"
+
+#define ESTABLISH_FRAME \
+ push {r7, lr} ;\
+ mov r7, sp
+#define CLEAR_FRAME_AND_RETURN \
+ pop {r7, pc}
+
+#define a r0
+#define b r1
+#define r r2
+#define i r3
+#define q ip
+#define one lr
+
+.syntax unified
+.align 3
+// Ok, APCS and AAPCS agree on 32 bit args, so it's safe to use the same routine.
+DEFINE_AEABI_FUNCTION_ALIAS(__aeabi_uidiv, __udivsi3)
+DEFINE_COMPILERRT_FUNCTION(__udivsi3)
+// We use a simple digit by digit algorithm; before we get into the actual
+// divide loop, we must calculate the left-shift amount necessary to align
+// the MSB of the divisor with that of the dividend (If this shift is
+// negative, then the result is zero, and we early out). We also conjure a
+// bit mask of 1 to use in constructing the quotient, and initialize the
+// quotient to zero.
+ ESTABLISH_FRAME
+ clz r2, a
+ tst b, b // detect divide-by-zero
+ clz r3, b
+ mov q, #0
+ beq LOCAL_LABEL(return) // return 0 if b is zero.
+ mov one, #1
+ subs i, r3, r2
+ blt LOCAL_LABEL(return) // return 0 if MSB(a) < MSB(b)
+
+LOCAL_LABEL(mainLoop):
+// This loop basically implements the following:
+//
+// do {
+// if (a >= b << i) {
+// a -= b << i;
+// q |= 1 << i;
+// if (a == 0) break;
+// }
+// } while (--i)
+//
+// Note that this does not perform the final iteration (i == 0); by doing it
+// this way, we can merge the two branches which is a substantial win for
+// such a tight loop on current ARM architectures.
+ subs r, a, b, lsl i
+ orrhs q, q,one, lsl i
+ movhs a, r
+ subsne i, i, #1
+ bhi LOCAL_LABEL(mainLoop)
+
+// Do the final test subtraction and update of quotient (i == 0), as it is
+// not performed in the main loop.
+ subs r, a, b
+ orrhs q, #1
+
+LOCAL_LABEL(return):
+// Move the quotient to r0 and return.
+ mov r0, q
+ CLEAR_FRAME_AND_RETURN
OpenPOWER on IntegriCloud