Diffstat (limited to 'sys/libkern/arm')
-rw-r--r--	sys/libkern/arm/divsi3.S	387
-rw-r--r--	sys/libkern/arm/ffs.S	77
-rw-r--r--	sys/libkern/arm/muldi3.c	249
3 files changed, 713 insertions, 0 deletions
diff --git a/sys/libkern/arm/divsi3.S b/sys/libkern/arm/divsi3.S
new file mode 100644
index 0000000..baab96b
--- /dev/null
+++ b/sys/libkern/arm/divsi3.S
@@ -0,0 +1,387 @@
+/* $NetBSD: divsi3.S,v 1.4 2003/04/05 23:27:15 bjh21 Exp $ */
+
+/*
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <machine/asm.h>
+__FBSDID("$FreeBSD$");
+
+/*
+ * stack is aligned as there's a possibility of branching to L_overflow
+ * which makes a C call
+ */
+
+ENTRY(__umodsi3)
+	stmfd	sp!, {lr}
+	sub	sp, sp, #4	/* align stack */
+	bl	.L_udivide
+	add	sp, sp, #4	/* unalign stack */
+	mov	r0, r1
+	ldmfd	sp!, {pc}
+
+ENTRY(__modsi3)
+	stmfd	sp!, {lr}
+	sub	sp, sp, #4	/* align stack */
+	bl	.L_divide
+	add	sp, sp, #4	/* unalign stack */
+	mov	r0, r1
+	ldmfd	sp!, {pc}
+
+.L_overflow:
+#if !defined(_KERNEL) && !defined(_STANDALONE)
+	mov	r0, #8			/* SIGFPE */
+	bl	PIC_SYM(_C_LABEL(raise), PLT)	/* raise it */
+	mov	r0, #0
+#else
+	/* XXX should cause a fatal error */
+	mvn	r0, #0
+#endif
+	mov	pc, lr
+
+ENTRY(__udivsi3)
+.L_udivide:				/* r0 = r0 / r1; r1 = r0 % r1 */
+	eor	r0, r1, r0
+	eor	r1, r0, r1
+	eor	r0, r1, r0
+	/* r0 = r1 / r0; r1 = r1 % r0 */
+	cmp	r0, #1
+	bcc	.L_overflow
+	beq	.L_divide_l0
+	mov	ip, #0
+	movs	r1, r1
+	bpl	.L_divide_l1
+	orr	ip, ip, #0x20000000	/* ip bit 0x20000000 = -ve r1 */
+	movs	r1, r1, lsr #1
+	orrcs	ip, ip, #0x10000000	/* ip bit 0x10000000 = bit 0 of r1 */
+	b	.L_divide_l1
+
+.L_divide_l0:				/* r0 == 1 */
+	mov	r0, r1
+	mov	r1, #0
+	mov	pc, lr
+
+ENTRY(__divsi3)
+.L_divide:				/* r0 = r0 / r1; r1 = r0 % r1 */
+	eor	r0, r1, r0
+	eor	r1, r0, r1
+	eor	r0, r1, r0
+	/* r0 = r1 / r0; r1 = r1 % r0 */
+	cmp	r0, #1
+	bcc	.L_overflow
+	beq	.L_divide_l0
+	ands	ip, r0, #0x80000000
+	rsbmi	r0, r0, #0
+	ands	r2, r1, #0x80000000
+	eor	ip, ip, r2
+	rsbmi	r1, r1, #0
+	orr	ip, r2, ip, lsr #1	/* ip bit 0x40000000 = -ve division */
+					/* ip bit 0x80000000 = -ve remainder */
+
+.L_divide_l1:
+	mov	r2, #1
+	mov	r3, #0
+
+	/*
+	 * If the highest bit of the dividend is set, we have to be
+	 * careful when shifting the divisor. Test this.
+	 */
+	movs	r1,r1
+	bpl	.L_old_code
+
+	/*
+	 * At this point, the highest bit of r1 is known to be set.
+	 * We abuse this below in the tst instructions.
+	 */
+	tst	r1, r0 /*, lsl #0 */
+	bmi	.L_divide_b1
+	tst	r1, r0, lsl #1
+	bmi	.L_divide_b2
+	tst	r1, r0, lsl #2
+	bmi	.L_divide_b3
+	tst	r1, r0, lsl #3
+	bmi	.L_divide_b4
+	tst	r1, r0, lsl #4
+	bmi	.L_divide_b5
+	tst	r1, r0, lsl #5
+	bmi	.L_divide_b6
+	tst	r1, r0, lsl #6
+	bmi	.L_divide_b7
+	tst	r1, r0, lsl #7
+	bmi	.L_divide_b8
+	tst	r1, r0, lsl #8
+	bmi	.L_divide_b9
+	tst	r1, r0, lsl #9
+	bmi	.L_divide_b10
+	tst	r1, r0, lsl #10
+	bmi	.L_divide_b11
+	tst	r1, r0, lsl #11
+	bmi	.L_divide_b12
+	tst	r1, r0, lsl #12
+	bmi	.L_divide_b13
+	tst	r1, r0, lsl #13
+	bmi	.L_divide_b14
+	tst	r1, r0, lsl #14
+	bmi	.L_divide_b15
+	tst	r1, r0, lsl #15
+	bmi	.L_divide_b16
+	tst	r1, r0, lsl #16
+	bmi	.L_divide_b17
+	tst	r1, r0, lsl #17
+	bmi	.L_divide_b18
+	tst	r1, r0, lsl #18
+	bmi	.L_divide_b19
+	tst	r1, r0, lsl #19
+	bmi	.L_divide_b20
+	tst	r1, r0, lsl #20
+	bmi	.L_divide_b21
+	tst	r1, r0, lsl #21
+	bmi	.L_divide_b22
+	tst	r1, r0, lsl #22
+	bmi	.L_divide_b23
+	tst	r1, r0, lsl #23
+	bmi	.L_divide_b24
+	tst	r1, r0, lsl #24
+	bmi	.L_divide_b25
+	tst	r1, r0, lsl #25
+	bmi	.L_divide_b26
+	tst	r1, r0, lsl #26
+	bmi	.L_divide_b27
+	tst	r1, r0, lsl #27
+	bmi	.L_divide_b28
+	tst	r1, r0, lsl #28
+	bmi	.L_divide_b29
+	tst	r1, r0, lsl #29
+	bmi	.L_divide_b30
+	tst	r1, r0, lsl #30
+	bmi	.L_divide_b31
+/*
+ * instead of:
+ *	tst	r1, r0, lsl #31
+ *	bmi	.L_divide_b32
+ */
+	b	.L_divide_b32
+
+.L_old_code:
+	cmp	r1, r0
+	bcc	.L_divide_b0
+	cmp	r1, r0, lsl #1
+	bcc	.L_divide_b1
+	cmp	r1, r0, lsl #2
+	bcc	.L_divide_b2
+	cmp	r1, r0, lsl #3
+	bcc	.L_divide_b3
+	cmp	r1, r0, lsl #4
+	bcc	.L_divide_b4
+	cmp	r1, r0, lsl #5
+	bcc	.L_divide_b5
+	cmp	r1, r0, lsl #6
+	bcc	.L_divide_b6
+	cmp	r1, r0, lsl #7
+	bcc	.L_divide_b7
+	cmp	r1, r0, lsl #8
+	bcc	.L_divide_b8
+	cmp	r1, r0, lsl #9
+	bcc	.L_divide_b9
+	cmp	r1, r0, lsl #10
+	bcc	.L_divide_b10
+	cmp	r1, r0, lsl #11
+	bcc	.L_divide_b11
+	cmp	r1, r0, lsl #12
+	bcc	.L_divide_b12
+	cmp	r1, r0, lsl #13
+	bcc	.L_divide_b13
+	cmp	r1, r0, lsl #14
+	bcc	.L_divide_b14
+	cmp	r1, r0, lsl #15
+	bcc	.L_divide_b15
+	cmp	r1, r0, lsl #16
+	bcc	.L_divide_b16
+	cmp	r1, r0, lsl #17
+	bcc	.L_divide_b17
+	cmp	r1, r0, lsl #18
+	bcc	.L_divide_b18
+	cmp	r1, r0, lsl #19
+	bcc	.L_divide_b19
+	cmp	r1, r0, lsl #20
+	bcc	.L_divide_b20
+	cmp	r1, r0, lsl #21
+	bcc	.L_divide_b21
+	cmp	r1, r0, lsl #22
+	bcc	.L_divide_b22
+	cmp	r1, r0, lsl #23
+	bcc	.L_divide_b23
+	cmp	r1, r0, lsl #24
+	bcc	.L_divide_b24
+	cmp	r1, r0, lsl #25
+	bcc	.L_divide_b25
+	cmp	r1, r0, lsl #26
+	bcc	.L_divide_b26
+	cmp	r1, r0, lsl #27
+	bcc	.L_divide_b27
+	cmp	r1, r0, lsl #28
+	bcc	.L_divide_b28
+	cmp	r1, r0, lsl #29
+	bcc	.L_divide_b29
+	cmp	r1, r0, lsl #30
+	bcc	.L_divide_b30
+.L_divide_b32:
+	cmp	r1, r0, lsl #31
+	subhs	r1, r1,r0, lsl #31
+	addhs	r3, r3,r2, lsl #31
+.L_divide_b31:
+	cmp	r1, r0, lsl #30
+	subhs	r1, r1,r0, lsl #30
+	addhs	r3, r3,r2, lsl #30
+.L_divide_b30:
+	cmp	r1, r0, lsl #29
+	subhs	r1, r1,r0, lsl #29
+	addhs	r3, r3,r2, lsl #29
+.L_divide_b29:
+	cmp	r1, r0, lsl #28
+	subhs	r1, r1,r0, lsl #28
+	addhs	r3, r3,r2, lsl #28
+.L_divide_b28:
+	cmp	r1, r0, lsl #27
+	subhs	r1, r1,r0, lsl #27
+	addhs	r3, r3,r2, lsl #27
+.L_divide_b27:
+	cmp	r1, r0, lsl #26
+	subhs	r1, r1,r0, lsl #26
+	addhs	r3, r3,r2, lsl #26
+.L_divide_b26:
+	cmp	r1, r0, lsl #25
+	subhs	r1, r1,r0, lsl #25
+	addhs	r3, r3,r2, lsl #25
+.L_divide_b25:
+	cmp	r1, r0, lsl #24
+	subhs	r1, r1,r0, lsl #24
+	addhs	r3, r3,r2, lsl #24
+.L_divide_b24:
+	cmp	r1, r0, lsl #23
+	subhs	r1, r1,r0, lsl #23
+	addhs	r3, r3,r2, lsl #23
+.L_divide_b23:
+	cmp	r1, r0, lsl #22
+	subhs	r1, r1,r0, lsl #22
+	addhs	r3, r3,r2, lsl #22
+.L_divide_b22:
+	cmp	r1, r0, lsl #21
+	subhs	r1, r1,r0, lsl #21
+	addhs	r3, r3,r2, lsl #21
+.L_divide_b21:
+	cmp	r1, r0, lsl #20
+	subhs	r1, r1,r0, lsl #20
+	addhs	r3, r3,r2, lsl #20
+.L_divide_b20:
+	cmp	r1, r0, lsl #19
+	subhs	r1, r1,r0, lsl #19
+	addhs	r3, r3,r2, lsl #19
+.L_divide_b19:
+	cmp	r1, r0, lsl #18
+	subhs	r1, r1,r0, lsl #18
+	addhs	r3, r3,r2, lsl #18
+.L_divide_b18:
+	cmp	r1, r0, lsl #17
+	subhs	r1, r1,r0, lsl #17
+	addhs	r3, r3,r2, lsl #17
+.L_divide_b17:
+	cmp	r1, r0, lsl #16
+	subhs	r1, r1,r0, lsl #16
+	addhs	r3, r3,r2, lsl #16
+.L_divide_b16:
+	cmp	r1, r0, lsl #15
+	subhs	r1, r1,r0, lsl #15
+	addhs	r3, r3,r2, lsl #15
+.L_divide_b15:
+	cmp	r1, r0, lsl #14
+	subhs	r1, r1,r0, lsl #14
+	addhs	r3, r3,r2, lsl #14
+.L_divide_b14:
+	cmp	r1, r0, lsl #13
+	subhs	r1, r1,r0, lsl #13
+	addhs	r3, r3,r2, lsl #13
+.L_divide_b13:
+	cmp	r1, r0, lsl #12
+	subhs	r1, r1,r0, lsl #12
+	addhs	r3, r3,r2, lsl #12
+.L_divide_b12:
+	cmp	r1, r0, lsl #11
+	subhs	r1, r1,r0, lsl #11
+	addhs	r3, r3,r2, lsl #11
+.L_divide_b11:
+	cmp	r1, r0, lsl #10
+	subhs	r1, r1,r0, lsl #10
+	addhs	r3, r3,r2, lsl #10
+.L_divide_b10:
+	cmp	r1, r0, lsl #9
+	subhs	r1, r1,r0, lsl #9
+	addhs	r3, r3,r2, lsl #9
+.L_divide_b9:
+	cmp	r1, r0, lsl #8
+	subhs	r1, r1,r0, lsl #8
+	addhs	r3, r3,r2, lsl #8
+.L_divide_b8:
+	cmp	r1, r0, lsl #7
+	subhs	r1, r1,r0, lsl #7
+	addhs	r3, r3,r2, lsl #7
+.L_divide_b7:
+	cmp	r1, r0, lsl #6
+	subhs	r1, r1,r0, lsl #6
+	addhs	r3, r3,r2, lsl #6
+.L_divide_b6:
+	cmp	r1, r0, lsl #5
+	subhs	r1, r1,r0, lsl #5
+	addhs	r3, r3,r2, lsl #5
+.L_divide_b5:
+	cmp	r1, r0, lsl #4
+	subhs	r1, r1,r0, lsl #4
+	addhs	r3, r3,r2, lsl #4
+.L_divide_b4:
+	cmp	r1, r0, lsl #3
+	subhs	r1, r1,r0, lsl #3
+	addhs	r3, r3,r2, lsl #3
+.L_divide_b3:
+	cmp	r1, r0, lsl #2
+	subhs	r1, r1,r0, lsl #2
+	addhs	r3, r3,r2, lsl #2
+.L_divide_b2:
+	cmp	r1, r0, lsl #1
+	subhs	r1, r1,r0, lsl #1
+	addhs	r3, r3,r2, lsl #1
+.L_divide_b1:
+	cmp	r1, r0
+	subhs	r1, r1, r0
+	addhs	r3, r3, r2
+.L_divide_b0:
+
+	tst	ip, #0x20000000
+	bne	.L_udivide_l1
+	mov	r0, r3
+	cmp	ip, #0
+	rsbmi	r1, r1, #0
+	movs	ip, ip, lsl #1
+	bicmi	r0, r0, #0x80000000	/* Fix in case we divided 0x80000000 */
+	rsbmi	r0, r0, #0
+	mov	pc, lr
+
+.L_udivide_l1:
+	tst	ip, #0x10000000
+	mov	r1, r1, lsl #1
+	orrne	r1, r1, #1
+	mov	r3, r3, lsl #1
+	cmp	r1, r0
+	subhs	r1, r1, r0
+	addhs	r3, r3, r2
+	mov	r0, r3
+	mov	pc, lr
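The unrolled tst/cmp ladder in __udivsi3/__divsi3 above is a restoring (shift-and-subtract) divider: the divisor is aligned under the most significant usable bit of the dividend and then subtracted back out one shift at a time while the matching quotient bit is set. A minimal C sketch of that loop follows, assuming a non-zero divisor (the real code branches to .L_overflow for that case and also handles signs and the 0x80000000 corner cases); the function name and types here are illustrative and are not part of the committed file.

#include <stdint.h>

/*
 * Restoring division model: for each shift s from high to low, subtract
 * (den << s) from the running remainder whenever it fits and set the
 * corresponding quotient bit.  udiv32_model is a hypothetical name.
 */
static uint32_t
udiv32_model(uint32_t num, uint32_t den, uint32_t *rem)
{
	uint32_t quot = 0;
	int s;

	for (s = 31; s >= 0; s--) {
		/* (num >> s) >= den  <=>  num >= (den << s), without overflow */
		if ((num >> s) >= den) {
			num -= den << s;
			quot |= 1u << s;
		}
	}
	*rem = num;
	return (quot);
}

As in the assembly, the quotient and remainder fall out of the same pass, which is why __udivsi3 leaves the quotient in r0 and the remainder in r1, and __umodsi3/__modsi3 simply call the shared code and move r1 into r0.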
diff --git a/sys/libkern/arm/ffs.S b/sys/libkern/arm/ffs.S
new file mode 100644
index 0000000..4e6fe3d
--- /dev/null
+++ b/sys/libkern/arm/ffs.S
@@ -0,0 +1,77 @@
+/* $NetBSD: ffs.S,v 1.3 2003/04/05 23:27:15 bjh21 Exp $ */
+/*
+ * Copyright (c) 2001 Christopher Gilbert
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The name of the company nor the name of the author may be used to
+ *    endorse or promote products derived from this software without specific
+ *    prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <machine/asm.h>
+__FBSDID("$FreeBSD$");
+
+
+/*
+ * ffs - find first set bit.  This algorithm isolates the first set
+ * bit, then multiplies the number by 0x0450fbaf, which leaves the top
+ * 6 bits as an index into the table.  This algorithm should be a win
+ * over checking each bit in turn, as per the C compiled version.
+ *
+ * Under ARMv5 there's an instruction called CLZ (count leading zeros)
+ * that could be used instead.
+ *
+ * This is the ffs algorithm devised by d.seal and posted to comp.sys.arm on
+ * 16 Feb 1994.
+ */
+
+ENTRY(ffs)
+	/* Standard trick to isolate bottom bit in r0 or 0 if r0 = 0 on entry */
+	rsb	r1, r0, #0
+	ands	r0, r0, r1
+	/*
+	 * now r0 has at most one set bit, call this X
+	 * if X = 0, all further instructions are skipped
+	 */
+	adrne	r2, .L_ffs_table
+	orrne	r0, r0, r0, lsl #4	/* r0 = X * 0x11 */
+	orrne	r0, r0, r0, lsl #6	/* r0 = X * 0x451 */
+	rsbne	r0, r0, r0, lsl #16	/* r0 = X * 0x0450fbaf */
+
+	/* now lookup in table indexed on top 6 bits of r0 */
+	ldrneb	r0, [ r2, r0, lsr #26 ]
+
+	mov	pc, lr
+.text;
+.type .L_ffs_table, _ASM_TYPE_OBJECT;
+.L_ffs_table:
+/*               0   1   2   3   4   5   6   7           */
+	.byte	 0,  1,  2, 13,  3,  7,  0, 14	/*  0- 7 */
+	.byte	 4,  0,  8,  0,  0,  0,  0, 15	/*  8-15 */
+	.byte	11,  5,  0,  0,  9,  0,  0, 26	/* 16-23 */
+	.byte	 0,  0,  0,  0,  0, 22, 28, 16	/* 24-31 */
+	.byte	32, 12,  6,  0,  0,  0,  0,  0	/* 32-39 */
+	.byte	10,  0,  0, 25,  0,  0, 21, 27	/* 40-47 */
+	.byte	31,  0,  0,  0,  0, 24,  0, 20	/* 48-55 */
+	.byte	30,  0, 23, 19, 29, 18, 17,  0	/* 56-63 */
+
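The multiply-and-lookup trick used by ffs above works because x & -x isolates the lowest set bit (or yields 0), and multiplying that single-bit value by 0x0450fbaf places a distinct 6-bit pattern in the top bits for each of the 32 powers of two; the three orr/rsb instructions build that multiply incrementally (X*0x11, then X*0x451, then X*0x4510000 - X*0x451 = X*0x0450fbaf). A C model of the same computation is sketched below, using the table and constant taken from the assembly; the function and array names are illustrative, not part of the committed file.

#include <stdint.h>

/* Same 64-byte table as .L_ffs_table above. */
static const unsigned char ffs_table[64] = {
	 0,  1,  2, 13,  3,  7,  0, 14,   4,  0,  8,  0,  0,  0,  0, 15,
	11,  5,  0,  0,  9,  0,  0, 26,   0,  0,  0,  0,  0, 22, 28, 16,
	32, 12,  6,  0,  0,  0,  0,  0,  10,  0,  0, 25,  0,  0, 21, 27,
	31,  0,  0,  0,  0, 24,  0, 20,  30,  0, 23, 19, 29, 18, 17,  0,
};

/* ffs_model is a hypothetical name; returns 1-based bit index, 0 if x == 0. */
static int
ffs_model(uint32_t x)
{
	x &= -x;			/* isolate the lowest set bit, or 0 */
	if (x == 0)
		return (0);
	return (ffs_table[(x * 0x0450fbafu) >> 26]);
}

For example, ffs_model(4) multiplies 4 by 0x0450fbaf, takes the top 6 bits (index 4) and returns ffs_table[4] == 3, matching ffs(3) semantics for the value 4.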
diff --git a/sys/libkern/arm/muldi3.c b/sys/libkern/arm/muldi3.c
new file mode 100644
index 0000000..8cdb478
--- /dev/null
+++ b/sys/libkern/arm/muldi3.c
@@ -0,0 +1,249 @@
+/* $NetBSD: muldi3.c,v 1.8 2003/08/07 16:32:09 agc Exp $ */
+
+/*-
+ * Copyright (c) 1992, 1993
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * This software was developed by the Computer Systems Engineering group
+ * at Lawrence Berkeley Laboratory under DARPA contract BG 91-66 and
+ * contributed to Berkeley.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+#if defined(LIBC_SCCS) && !defined(lint)
+#if 0
+static char sccsid[] = "@(#)muldi3.c	8.1 (Berkeley) 6/4/93";
+#else
+__FBSDID("$FreeBSD$");
+#endif
+#endif /* LIBC_SCCS and not lint */
+
+#include <libkern/quad.h>
+
+/*
+ * Multiply two quads.
+ *
+ * Our algorithm is based on the following.  Split incoming quad values
+ * u and v (where u,v >= 0) into
+ *
+ *	u = 2^n u1 + u0	(n = number of bits in `u_int', usu. 32)
+ *
+ * and
+ *
+ *	v = 2^n v1 + v0
+ *
+ * Then
+ *
+ *	uv = 2^2n u1 v1  +  2^n u1 v0  +  2^n v1 u0  +  u0 v0
+ *	   = 2^2n u1 v1  +     2^n (u1 v0 + v1 u0)   +  u0 v0
+ *
+ * Now add 2^n u1 v1 to the first term and subtract it from the middle,
+ * and add 2^n u0 v0 to the last term and subtract it from the middle.
+ * This gives:
+ *
+ *	uv = (2^2n + 2^n) (u1 v1)  +
+ *	         (2^n)    (u1 v0 - u1 v1 + u0 v1 - u0 v0)  +
+ *	       (2^n + 1)  (u0 v0)
+ *
+ * Factoring the middle a bit gives us:
+ *
+ *	uv = (2^2n + 2^n) (u1 v1)  +			[u1v1 = high]
+ *	         (2^n)    (u1 - u0) (v0 - v1)  +	[(u1-u0)... = mid]
+ *	       (2^n + 1)  (u0 v0)			[u0v0 = low]
+ *
+ * The terms (u1 v1), (u1 - u0) (v0 - v1), and (u0 v0) can all be done
+ * in just half the precision of the original.  (Note that either or both
+ * of (u1 - u0) or (v0 - v1) may be negative.)
+ *
+ * This algorithm is from Knuth vol. 2 (2nd ed), section 4.3.3, p. 278.
+ *
+ * Since C does not give us an `int * int = quad' operator, we split
+ * our input quads into two ints, then split the two ints into two
+ * shorts.  We can then calculate `short * short = int' in native
+ * arithmetic.
+ *
+ * Our product should, strictly speaking, be a `long quad', with 128
+ * bits, but we are going to discard the upper 64.  In other words,
+ * we are not interested in uv, but rather in (uv mod 2^2n).  This
+ * makes some of the terms above vanish, and we get:
+ *
+ *	(2^n)(high) + (2^n)(mid) + (2^n + 1)(low)
+ *
+ * or
+ *
+ *	(2^n)(high + mid + low) + low
+ *
+ * Furthermore, `high' and `mid' can be computed mod 2^n, as any factor
+ * of 2^n in either one will also vanish.  Only `low' need be computed
+ * mod 2^2n, and only because of the final term above.
+ */
+static quad_t __lmulq(u_int, u_int);
+
+quad_t __muldi3(quad_t, quad_t);
+quad_t
+__muldi3(quad_t a, quad_t b)
+{
+	union uu u, v, low, prod;
+	u_int high, mid, udiff, vdiff;
+	int negall, negmid;
+#define	u1	u.ul[H]
+#define	u0	u.ul[L]
+#define	v1	v.ul[H]
+#define	v0	v.ul[L]
+
+	/*
+	 * Get u and v such that u, v >= 0.  When this is finished,
+	 * u1, u0, v1, and v0 will be directly accessible through the
+	 * int fields.
+	 */
+	if (a >= 0)
+		u.q = a, negall = 0;
+	else
+		u.q = -a, negall = 1;
+	if (b >= 0)
+		v.q = b;
+	else
+		v.q = -b, negall ^= 1;
+
+	if (u1 == 0 && v1 == 0) {
+		/*
+		 * An (I hope) important optimization occurs when u1 and v1
+		 * are both 0.  This should be common since most numbers
+		 * are small.  Here the product is just u0*v0.
+		 */
+		prod.q = __lmulq(u0, v0);
+	} else {
+		/*
+		 * Compute the three intermediate products, remembering
+		 * whether the middle term is negative.  We can discard
+		 * any upper bits in high and mid, so we can use native
+		 * u_int * u_int => u_int arithmetic.
+		 */
+		low.q = __lmulq(u0, v0);
+
+		if (u1 >= u0)
+			negmid = 0, udiff = u1 - u0;
+		else
+			negmid = 1, udiff = u0 - u1;
+		if (v0 >= v1)
+			vdiff = v0 - v1;
+		else
+			vdiff = v1 - v0, negmid ^= 1;
+		mid = udiff * vdiff;
+
+		high = u1 * v1;
+
+		/*
+		 * Assemble the final product.
+		 */
+		prod.ul[H] = high + (negmid ? -mid : mid) + low.ul[L] +
+		    low.ul[H];
+		prod.ul[L] = low.ul[L];
+	}
+	return (negall ? -prod.q : prod.q);
+#undef u1
+#undef u0
+#undef v1
+#undef v0
+}
+
+/*
+ * Multiply two 2N-bit ints to produce a 4N-bit quad, where N is half
+ * the number of bits in an int (whatever that is---the code below
+ * does not care as long as quad.h does its part of the bargain---but
+ * typically N==16).
+ *
+ * We use the same algorithm from Knuth, but this time the modulo refinement
+ * does not apply.  On the other hand, since N is half the size of an int,
+ * we can get away with native multiplication---none of our input terms
+ * exceeds (UINT_MAX >> 1).
+ *
+ * Note that, for u_int l, the quad-precision result
+ *
+ *	l << N
+ *
+ * splits into high and low ints as HHALF(l) and LHUP(l) respectively.
+ */
+static quad_t
+__lmulq(u_int u, u_int v)
+{
+	u_int u1, u0, v1, v0, udiff, vdiff, high, mid, low;
+	u_int prodh, prodl, was;
+	union uu prod;
+	int neg;
+
+	u1 = HHALF(u);
+	u0 = LHALF(u);
+	v1 = HHALF(v);
+	v0 = LHALF(v);
+
+	low = u0 * v0;
+
+	/* This is the same small-number optimization as before. */
+	if (u1 == 0 && v1 == 0)
+		return (low);
+
+	if (u1 >= u0)
+		udiff = u1 - u0, neg = 0;
+	else
+		udiff = u0 - u1, neg = 1;
+	if (v0 >= v1)
+		vdiff = v0 - v1;
+	else
+		vdiff = v1 - v0, neg ^= 1;
+	mid = udiff * vdiff;
+
+	high = u1 * v1;
+
+	/* prod = (high << 2N) + (high << N); */
+	prodh = high + HHALF(high);
+	prodl = LHUP(high);
+
+	/* if (neg) prod -= mid << N; else prod += mid << N; */
+	if (neg) {
+		was = prodl;
+		prodl -= LHUP(mid);
+		prodh -= HHALF(mid) + (prodl > was);
+	} else {
+		was = prodl;
+		prodl += LHUP(mid);
+		prodh += HHALF(mid) + (prodl < was);
+	}
+
+	/* prod += low << N */
+	was = prodl;
+	prodl += LHUP(low);
+	prodh += HHALF(low) + (prodl < was);
+	/* ... + low; */
+	if ((prodl += low) < low)
+		prodh++;
+
+	/* return 4N-bit product */
+	prod.ul[H] = prodh;
+	prod.ul[L] = prodl;
+	return (prod.q);
+}
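The three-product identity that __muldi3 and __lmulq rely on can be checked in isolation with ordinary 64-bit arithmetic. The sketch below redoes the N=16 split from __lmulq using <stdint.h> types and compares the reconstruction against a native 32x32 -> 64 multiply; all names are illustrative and nothing here is part of the committed file.

#include <assert.h>
#include <stdint.h>

/*
 * uv = (2^32 + 2^16) high + 2^16 mid + (2^16 + 1) low, with
 * high = u1*v1, mid = (u1 - u0)*(v0 - v1), low = u0*v0.
 */
static uint64_t
mul32_by_halves(uint32_t u, uint32_t v)
{
	uint32_t u1 = u >> 16, u0 = u & 0xffff;
	uint32_t v1 = v >> 16, v0 = v & 0xffff;
	uint64_t high = (uint64_t)u1 * v1;
	uint64_t low = (uint64_t)u0 * v0;
	/* mid may be negative, so carry its sign separately */
	int64_t mid = ((int64_t)u1 - u0) * ((int64_t)v0 - v1);
	uint64_t r = (high << 32) + (high << 16) + (low << 16) + low;

	/* unsigned wrap-around is harmless: everything is mod 2^64 and the
	 * true product fits in 64 bits */
	if (mid >= 0)
		r += (uint64_t)mid << 16;
	else
		r -= (uint64_t)-mid << 16;
	return (r);
}

int
main(void)
{
	/* spot checks against the native product */
	assert(mul32_by_halves(0xdeadbeefu, 0xcafebabeu) ==
	    (uint64_t)0xdeadbeefu * 0xcafebabeu);
	assert(mul32_by_halves(0xffffffffu, 0xffffffffu) ==
	    (uint64_t)0xffffffffu * 0xffffffffu);
	return (0);
}

The kernel code applies the same idea twice: once at N=16 inside __lmulq with explicit carry propagation (the `was` comparisons), and once at N=32 in __muldi3, where everything above 64 bits is simply discarded.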