1 files changed, 80 insertions, 0 deletions
diff --git a/lib/arm/udivsi3.S b/lib/arm/udivsi3.S
new file mode 100644
index 0000000..6d89665
--- /dev/null
+++ b/lib/arm/udivsi3.S
@@ -0,0 +1,80 @@
+/*===-- udivsi3.S - 32-bit unsigned integer divide ------------------------===//
+ *
+ *                     The LLVM Compiler Infrastructure
+ *
+ * This file is dual licensed under the MIT and the University of Illinois Open
+ * Source Licenses. See LICENSE.TXT for details.
+ *
+ *===----------------------------------------------------------------------===//
+ *
+ * This file implements the __udivsi3 (32-bit unsigned integer divide) 
+ * function for the ARM architecture.  A naive digit-by-digit computation is
+ * employed for simplicity.
+ *
+ *===----------------------------------------------------------------------===*/
+
+#include "../assembly.h"
+
+#define ESTABLISH_FRAME \
+    push   {r7, lr}    ;\
+    mov     r7,     sp
+#define CLEAR_FRAME_AND_RETURN \
+    pop    {r7, pc}
+
+#define a r0
+#define b r1
+#define r r2
+#define i r3
+#define q ip
+#define one lr
+
+.syntax unified
+.align 3
+// Ok, APCS and AAPCS agree on 32 bit args, so it's safe to use the same routine.
+DEFINE_AEABI_FUNCTION_ALIAS(__aeabi_uidiv, __udivsi3)
+DEFINE_COMPILERRT_FUNCTION(__udivsi3)
+//  We use a simple digit by digit algorithm; before we get into the actual 
+//  divide loop, we must calculate the left-shift amount necessary to align
+//  the MSB of the divisor with that of the dividend (If this shift is
+//  negative, then the result is zero, and we early out). We also conjure a
+//  bit mask of 1 to use in constructing the quotient, and initialize the
+//  quotient to zero.
+    ESTABLISH_FRAME
+    clz     r2,     a
+    tst     b,      b   // detect divide-by-zero
+    clz     r3,     b
+    mov     q,      #0
+    beq     LOCAL_LABEL(return)    // return 0 if b is zero.
+    mov     one,    #1
+    subs    i,      r3, r2
+    blt     LOCAL_LABEL(return)    // return 0 if MSB(a) < MSB(b)
+
+LOCAL_LABEL(mainLoop):
+//  This loop basically implements the following:
+//
+//  do {
+//      if (a >= b << i) {
+//          a -= b << i;
+//          q |= 1 << i;
+//          if (a == 0) break;
+//      }
+//  } while (--i)
+//
+//  Note that this does not perform the final iteration (i == 0); by doing it
+//  this way, we can merge the two branches which is a substantial win for
+//  such a tight loop on current ARM architectures.
+    subs    r,      a,  b, lsl i
+    orrhs   q,      q,one, lsl i
+    movhs   a,      r
+    subsne  i,      i, #1
+    bhi     LOCAL_LABEL(mainLoop)
+
+//  Do the final test subtraction and update of quotient (i == 0), as it is
+//  not performed in the main loop.
+    subs    r,      a,  b
+    orrhs   q,      #1
+
+LOCAL_LABEL(return):
+//  Move the quotient to r0 and return.
+    mov     r0,     q
+    CLEAR_FRAME_AND_RETURN