diff options
author | jkim <jkim@FreeBSD.org> | 2015-10-23 19:46:02 +0000 |
---|---|---|
committer | jkim <jkim@FreeBSD.org> | 2015-10-23 19:46:02 +0000 |
commit | 64cb0c902e312216cdc4c826fc0be9ba9e1bf4da (patch) | |
tree | ae816a5a768ec78af3610e509ca39507b33aa9f7 /crypto/bn | |
parent | e5911a7a89e76432a8d4607068e9171b30272e08 (diff) | |
download | FreeBSD-src-64cb0c902e312216cdc4c826fc0be9ba9e1bf4da.zip FreeBSD-src-64cb0c902e312216cdc4c826fc0be9ba9e1bf4da.tar.gz |
Import OpenSSL 1.0.2d.
Diffstat (limited to 'crypto/bn')
-rw-r--r-- | crypto/bn/Makefile | 20 | ||||
-rwxr-xr-x | crypto/bn/asm/armv4-gf2m.pl | 169 | ||||
-rwxr-xr-x | crypto/bn/asm/armv4-mont.pl | 484 | ||||
-rwxr-xr-x | crypto/bn/asm/mips-mont.pl | 2 | ||||
-rwxr-xr-x | crypto/bn/asm/mips.pl | 2 | ||||
-rw-r--r-- | crypto/bn/asm/mips3.s | 2201 | ||||
-rwxr-xr-x | crypto/bn/asm/modexp512-x86_64.pl | 1497 | ||||
-rwxr-xr-x | crypto/bn/asm/ppc-mont.pl | 1 | ||||
-rw-r--r-- | crypto/bn/asm/ppc.pl | 10 | ||||
-rwxr-xr-x | crypto/bn/asm/ppc64-mont.pl | 660 | ||||
-rwxr-xr-x | crypto/bn/asm/rsaz-avx2.pl | 1898 | ||||
-rwxr-xr-x | crypto/bn/asm/rsaz-x86_64.pl | 2144 | ||||
-rwxr-xr-x | crypto/bn/asm/sparct4-mont.pl | 1222 | ||||
-rwxr-xr-x | crypto/bn/asm/sparcv9-gf2m.pl | 190 | ||||
-rwxr-xr-x | crypto/bn/asm/vis3-mont.pl | 373 | ||||
-rw-r--r-- | crypto/bn/asm/x86_64-gcc.c | 110 | ||||
-rwxr-xr-x | crypto/bn/asm/x86_64-mont.pl | 1289 | ||||
-rwxr-xr-x | crypto/bn/asm/x86_64-mont5.pl | 2908 | ||||
-rw-r--r-- | crypto/bn/bn.h | 18 | ||||
-rw-r--r-- | crypto/bn/bn_asm.c | 243 | ||||
-rw-r--r-- | crypto/bn/bn_exp.c | 326 | ||||
-rw-r--r-- | crypto/bn/bn_gf2m.c | 3 | ||||
-rw-r--r-- | crypto/bn/bn_lcl.h | 27 | ||||
-rw-r--r-- | crypto/bn/bntest.c | 3 | ||||
-rw-r--r-- | crypto/bn/rsaz_exp.c | 346 | ||||
-rw-r--r-- | crypto/bn/rsaz_exp.h | 56 |
26 files changed, 13314 insertions, 2888 deletions
diff --git a/crypto/bn/Makefile b/crypto/bn/Makefile index 3d0158c..215855e 100644 --- a/crypto/bn/Makefile +++ b/crypto/bn/Makefile @@ -77,6 +77,12 @@ sparcv9a-mont.s: asm/sparcv9a-mont.pl $(PERL) asm/sparcv9a-mont.pl $(CFLAGS) > $@ sparcv9-mont.s: asm/sparcv9-mont.pl $(PERL) asm/sparcv9-mont.pl $(CFLAGS) > $@ +vis3-mont.s: asm/vis3-mont.pl + $(PERL) asm/vis3-mont.pl $(CFLAGS) > $@ +sparct4-mont.S: asm/sparct4-mont.pl + $(PERL) asm/sparct4-mont.pl $(CFLAGS) > $@ +sparcv9-gf2m.S: asm/sparcv9-gf2m.pl + $(PERL) asm/sparcv9-gf2m.pl $(CFLAGS) > $@ bn-mips3.o: asm/mips3.s @if [ "$(CC)" = "gcc" ]; then \ @@ -102,8 +108,10 @@ x86_64-mont5.s: asm/x86_64-mont5.pl $(PERL) asm/x86_64-mont5.pl $(PERLASM_SCHEME) > $@ x86_64-gf2m.s: asm/x86_64-gf2m.pl $(PERL) asm/x86_64-gf2m.pl $(PERLASM_SCHEME) > $@ -modexp512-x86_64.s: asm/modexp512-x86_64.pl - $(PERL) asm/modexp512-x86_64.pl $(PERLASM_SCHEME) > $@ +rsaz-x86_64.s: asm/rsaz-x86_64.pl + $(PERL) asm/rsaz-x86_64.pl $(PERLASM_SCHEME) > $@ +rsaz-avx2.s: asm/rsaz-avx2.pl + $(PERL) asm/rsaz-avx2.pl $(PERLASM_SCHEME) > $@ bn-ia64.s: asm/ia64.S $(CC) $(CFLAGS) -E asm/ia64.S > $@ @@ -125,14 +133,15 @@ ppc-mont.s: asm/ppc-mont.pl;$(PERL) asm/ppc-mont.pl $(PERLASM_SCHEME) $@ ppc64-mont.s: asm/ppc64-mont.pl;$(PERL) asm/ppc64-mont.pl $(PERLASM_SCHEME) $@ alpha-mont.s: asm/alpha-mont.pl - (preproc=/tmp/$$$$.$@; trap "rm $$preproc" INT; \ + (preproc=$$$$.$@.S; trap "rm $$preproc" INT; \ $(PERL) asm/alpha-mont.pl > $$preproc && \ - $(CC) -E $$preproc > $@ && rm $$preproc) + $(CC) -E -P $$preproc > $@ && rm $$preproc) # GNU make "catch all" -%-mont.s: asm/%-mont.pl; $(PERL) $< $(PERLASM_SCHEME) $@ +%-mont.S: asm/%-mont.pl; $(PERL) $< $(PERLASM_SCHEME) $@ %-gf2m.S: asm/%-gf2m.pl; $(PERL) $< $(PERLASM_SCHEME) $@ +armv4-mont.o: armv4-mont.S armv4-gf2m.o: armv4-gf2m.S files: @@ -244,6 +253,7 @@ bn_exp.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h bn_exp.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h bn_exp.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h bn_exp.o: ../../include/openssl/symhacks.h ../cryptlib.h bn_exp.c bn_lcl.h +bn_exp.o: rsaz_exp.h bn_exp2.o: ../../e_os.h ../../include/openssl/bio.h ../../include/openssl/bn.h bn_exp2.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h bn_exp2.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h diff --git a/crypto/bn/asm/armv4-gf2m.pl b/crypto/bn/asm/armv4-gf2m.pl index c52e0b7..8f529c9 100755 --- a/crypto/bn/asm/armv4-gf2m.pl +++ b/crypto/bn/asm/armv4-gf2m.pl @@ -20,48 +20,26 @@ # length, more for longer keys. Even though NEON 1x1 multiplication # runs in even less cycles, ~30, improvement is measurable only on # longer keys. One has to optimize code elsewhere to get NEON glow... +# +# April 2014 +# +# Double bn_GF2m_mul_2x2 performance by using algorithm from paper +# referred below, which improves ECDH and ECDSA verify benchmarks +# by 18-40%. +# +# Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software +# Polynomial Multiplication on ARM Processors using the NEON Engine. +# +# http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} open STDOUT,">$output"; -sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; } -sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; } -sub Q() { shift=~m|d([1-3]?[02468])|?"q".($1/2):""; } - $code=<<___; #include "arm_arch.h" .text .code 32 - -#if __ARM_ARCH__>=7 -.fpu neon - -.type mul_1x1_neon,%function -.align 5 -mul_1x1_neon: - vshl.u64 `&Dlo("q1")`,d16,#8 @ q1-q3 are slided $a - vmull.p8 `&Q("d0")`,d16,d17 @ a·bb - vshl.u64 `&Dlo("q2")`,d16,#16 - vmull.p8 q1,`&Dlo("q1")`,d17 @ a<<8·bb - vshl.u64 `&Dlo("q3")`,d16,#24 - vmull.p8 q2,`&Dlo("q2")`,d17 @ a<<16·bb - vshr.u64 `&Dlo("q1")`,#8 - vmull.p8 q3,`&Dlo("q3")`,d17 @ a<<24·bb - vshl.u64 `&Dhi("q1")`,#24 - veor d0,`&Dlo("q1")` - vshr.u64 `&Dlo("q2")`,#16 - veor d0,`&Dhi("q1")` - vshl.u64 `&Dhi("q2")`,#16 - veor d0,`&Dlo("q2")` - vshr.u64 `&Dlo("q3")`,#24 - veor d0,`&Dhi("q2")` - vshl.u64 `&Dhi("q3")`,#8 - veor d0,`&Dlo("q3")` - veor d0,`&Dhi("q3")` - bx lr -.size mul_1x1_neon,.-mul_1x1_neon -#endif ___ ################ # private interface to mul_1x1_ialu @@ -159,56 +137,17 @@ ___ # void bn_GF2m_mul_2x2(BN_ULONG *r, # BN_ULONG a1,BN_ULONG a0, # BN_ULONG b1,BN_ULONG b0); # r[3..0]=a1a0·b1b0 - -($A1,$B1,$A0,$B0,$A1B1,$A0B0)=map("d$_",(18..23)); - +{ $code.=<<___; .global bn_GF2m_mul_2x2 .type bn_GF2m_mul_2x2,%function .align 5 bn_GF2m_mul_2x2: -#if __ARM_ARCH__>=7 +#if __ARM_MAX_ARCH__>=7 ldr r12,.LOPENSSL_armcap .Lpic: ldr r12,[pc,r12] tst r12,#1 - beq .Lialu - - veor $A1,$A1 - vmov.32 $B1,r3,r3 @ two copies of b1 - vmov.32 ${A1}[0],r1 @ a1 - - veor $A0,$A0 - vld1.32 ${B0}[],[sp,:32] @ two copies of b0 - vmov.32 ${A0}[0],r2 @ a0 - mov r12,lr - - vmov d16,$A1 - vmov d17,$B1 - bl mul_1x1_neon @ a1·b1 - vmov $A1B1,d0 - - vmov d16,$A0 - vmov d17,$B0 - bl mul_1x1_neon @ a0·b0 - vmov $A0B0,d0 - - veor d16,$A0,$A1 - veor d17,$B0,$B1 - veor $A0,$A0B0,$A1B1 - bl mul_1x1_neon @ (a0+a1)·(b0+b1) - - veor d0,$A0 @ (a0+a1)·(b0+b1)-a0·b0-a1·b1 - vshl.u64 d1,d0,#32 - vshr.u64 d0,d0,#32 - veor $A0B0,d1 - veor $A1B1,d0 - vst1.32 {${A0B0}[0]},[r0,:32]! - vst1.32 {${A0B0}[1]},[r0,:32]! - vst1.32 {${A1B1}[0]},[r0,:32]! - vst1.32 {${A1B1}[1]},[r0,:32] - bx r12 -.align 4 -.Lialu: + bne .LNEON #endif ___ $ret="r10"; # reassigned 1st argument @@ -260,8 +199,72 @@ $code.=<<___; moveq pc,lr @ be binary compatible with V4, yet bx lr @ interoperable with Thumb ISA:-) #endif +___ +} +{ +my ($r,$t0,$t1,$t2,$t3)=map("q$_",(0..3,8..12)); +my ($a,$b,$k48,$k32,$k16)=map("d$_",(26..31)); + +$code.=<<___; +#if __ARM_MAX_ARCH__>=7 +.arch armv7-a +.fpu neon + +.align 5 +.LNEON: + ldr r12, [sp] @ 5th argument + vmov.32 $a, r2, r1 + vmov.32 $b, r12, r3 + vmov.i64 $k48, #0x0000ffffffffffff + vmov.i64 $k32, #0x00000000ffffffff + vmov.i64 $k16, #0x000000000000ffff + + vext.8 $t0#lo, $a, $a, #1 @ A1 + vmull.p8 $t0, $t0#lo, $b @ F = A1*B + vext.8 $r#lo, $b, $b, #1 @ B1 + vmull.p8 $r, $a, $r#lo @ E = A*B1 + vext.8 $t1#lo, $a, $a, #2 @ A2 + vmull.p8 $t1, $t1#lo, $b @ H = A2*B + vext.8 $t3#lo, $b, $b, #2 @ B2 + vmull.p8 $t3, $a, $t3#lo @ G = A*B2 + vext.8 $t2#lo, $a, $a, #3 @ A3 + veor $t0, $t0, $r @ L = E + F + vmull.p8 $t2, $t2#lo, $b @ J = A3*B + vext.8 $r#lo, $b, $b, #3 @ B3 + veor $t1, $t1, $t3 @ M = G + H + vmull.p8 $r, $a, $r#lo @ I = A*B3 + veor $t0#lo, $t0#lo, $t0#hi @ t0 = (L) (P0 + P1) << 8 + vand $t0#hi, $t0#hi, $k48 + vext.8 $t3#lo, $b, $b, #4 @ B4 + veor $t1#lo, $t1#lo, $t1#hi @ t1 = (M) (P2 + P3) << 16 + vand $t1#hi, $t1#hi, $k32 + vmull.p8 $t3, $a, $t3#lo @ K = A*B4 + veor $t2, $t2, $r @ N = I + J + veor $t0#lo, $t0#lo, $t0#hi + veor $t1#lo, $t1#lo, $t1#hi + veor $t2#lo, $t2#lo, $t2#hi @ t2 = (N) (P4 + P5) << 24 + vand $t2#hi, $t2#hi, $k16 + vext.8 $t0, $t0, $t0, #15 + veor $t3#lo, $t3#lo, $t3#hi @ t3 = (K) (P6 + P7) << 32 + vmov.i64 $t3#hi, #0 + vext.8 $t1, $t1, $t1, #14 + veor $t2#lo, $t2#lo, $t2#hi + vmull.p8 $r, $a, $b @ D = A*B + vext.8 $t3, $t3, $t3, #12 + vext.8 $t2, $t2, $t2, #13 + veor $t0, $t0, $t1 + veor $t2, $t2, $t3 + veor $r, $r, $t0 + veor $r, $r, $t2 + + vst1.32 {$r}, [r0] + ret @ bx lr +#endif +___ +} +$code.=<<___; .size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2 -#if __ARM_ARCH__>=7 +#if __ARM_MAX_ARCH__>=7 .align 5 .LOPENSSL_armcap: .word OPENSSL_armcap_P-(.Lpic+8) @@ -269,10 +272,18 @@ $code.=<<___; .asciz "GF(2^m) Multiplication for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>" .align 5 +#if __ARM_MAX_ARCH__>=7 .comm OPENSSL_armcap_P,4,4 +#endif ___ -$code =~ s/\`([^\`]*)\`/eval $1/gem; -$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4 -print $code; +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval $1/geo; + + s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or + s/\bret\b/bx lr/go or + s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4 + + print $_,"\n"; +} close STDOUT; # enforce flush diff --git a/crypto/bn/asm/armv4-mont.pl b/crypto/bn/asm/armv4-mont.pl index f78a8b5..1d330e9 100755 --- a/crypto/bn/asm/armv4-mont.pl +++ b/crypto/bn/asm/armv4-mont.pl @@ -1,7 +1,7 @@ #!/usr/bin/env perl # ==================================================================== -# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL # project. The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. For further # details see http://www.openssl.org/~appro/cryptogams/. @@ -23,6 +23,21 @@ # than 1/2KB. Windows CE port would be trivial, as it's exclusively # about decorations, ABI and instruction syntax are identical. +# November 2013 +# +# Add NEON code path, which handles lengths divisible by 8. RSA/DSA +# performance improvement on Cortex-A8 is ~45-100% depending on key +# length, more for longer keys. On Cortex-A15 the span is ~10-105%. +# On Snapdragon S4 improvement was measured to vary from ~70% to +# incredible ~380%, yes, 4.8x faster, for RSA4096 sign. But this is +# rather because original integer-only code seems to perform +# suboptimally on S4. Situation on Cortex-A9 is unfortunately +# different. It's being looked into, but the trouble is that +# performance for vectors longer than 256 bits is actually couple +# of percent worse than for integer-only code. The code is chosen +# for execution on all NEON-capable processors, because gain on +# others outweighs the marginal loss on Cortex-A9. + while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} open STDOUT,">$output"; @@ -52,16 +67,40 @@ $_n0="$num,#14*4"; $_num="$num,#15*4"; $_bpend=$_num; $code=<<___; +#include "arm_arch.h" + .text +.code 32 + +#if __ARM_MAX_ARCH__>=7 +.align 5 +.LOPENSSL_armcap: +.word OPENSSL_armcap_P-bn_mul_mont +#endif .global bn_mul_mont .type bn_mul_mont,%function -.align 2 +.align 5 bn_mul_mont: + ldr ip,[sp,#4] @ load num stmdb sp!,{r0,r2} @ sp points at argument block - ldr $num,[sp,#3*4] @ load num - cmp $num,#2 +#if __ARM_MAX_ARCH__>=7 + tst ip,#7 + bne .Lialu + adr r0,bn_mul_mont + ldr r2,.LOPENSSL_armcap + ldr r0,[r0,r2] + tst r0,#1 @ NEON available? + ldmia sp, {r0,r2} + beq .Lialu + add sp,sp,#8 + b bn_mul8x_mont_neon +.align 4 +.Lialu: +#endif + cmp ip,#2 + mov $num,ip @ load num movlt r0,#0 addlt sp,sp,#2*4 blt .Labrt @@ -191,14 +230,447 @@ bn_mul_mont: ldmia sp!,{r4-r12,lr} @ restore registers add sp,sp,#2*4 @ skip over {r0,r2} mov r0,#1 -.Labrt: tst lr,#1 +.Labrt: +#if __ARM_ARCH__>=5 + ret @ bx lr +#else + tst lr,#1 moveq pc,lr @ be binary compatible with V4, yet bx lr @ interoperable with Thumb ISA:-) +#endif .size bn_mul_mont,.-bn_mul_mont -.asciz "Montgomery multiplication for ARMv4, CRYPTOGAMS by <appro\@openssl.org>" +___ +{ +sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; } +sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; } + +my ($A0,$A1,$A2,$A3)=map("d$_",(0..3)); +my ($N0,$N1,$N2,$N3)=map("d$_",(4..7)); +my ($Z,$Temp)=("q4","q5"); +my ($A0xB,$A1xB,$A2xB,$A3xB,$A4xB,$A5xB,$A6xB,$A7xB)=map("q$_",(6..13)); +my ($Bi,$Ni,$M0)=map("d$_",(28..31)); +my $zero=&Dlo($Z); +my $temp=&Dlo($Temp); + +my ($rptr,$aptr,$bptr,$nptr,$n0,$num)=map("r$_",(0..5)); +my ($tinptr,$toutptr,$inner,$outer)=map("r$_",(6..9)); + +$code.=<<___; +#if __ARM_MAX_ARCH__>=7 +.arch armv7-a +.fpu neon + +.type bn_mul8x_mont_neon,%function +.align 5 +bn_mul8x_mont_neon: + mov ip,sp + stmdb sp!,{r4-r11} + vstmdb sp!,{d8-d15} @ ABI specification says so + ldmia ip,{r4-r5} @ load rest of parameter block + + sub $toutptr,sp,#16 + vld1.32 {${Bi}[0]}, [$bptr,:32]! + sub $toutptr,$toutptr,$num,lsl#4 + vld1.32 {$A0-$A3}, [$aptr]! @ can't specify :32 :-( + and $toutptr,$toutptr,#-64 + vld1.32 {${M0}[0]}, [$n0,:32] + mov sp,$toutptr @ alloca + veor $zero,$zero,$zero + subs $inner,$num,#8 + vzip.16 $Bi,$zero + + vmull.u32 $A0xB,$Bi,${A0}[0] + vmull.u32 $A1xB,$Bi,${A0}[1] + vmull.u32 $A2xB,$Bi,${A1}[0] + vshl.i64 $temp,`&Dhi("$A0xB")`,#16 + vmull.u32 $A3xB,$Bi,${A1}[1] + + vadd.u64 $temp,$temp,`&Dlo("$A0xB")` + veor $zero,$zero,$zero + vmul.u32 $Ni,$temp,$M0 + + vmull.u32 $A4xB,$Bi,${A2}[0] + vld1.32 {$N0-$N3}, [$nptr]! + vmull.u32 $A5xB,$Bi,${A2}[1] + vmull.u32 $A6xB,$Bi,${A3}[0] + vzip.16 $Ni,$zero + vmull.u32 $A7xB,$Bi,${A3}[1] + + bne .LNEON_1st + + @ special case for num=8, everything is in register bank... + + vmlal.u32 $A0xB,$Ni,${N0}[0] + sub $outer,$num,#1 + vmlal.u32 $A1xB,$Ni,${N0}[1] + vmlal.u32 $A2xB,$Ni,${N1}[0] + vmlal.u32 $A3xB,$Ni,${N1}[1] + + vmlal.u32 $A4xB,$Ni,${N2}[0] + vmov $Temp,$A0xB + vmlal.u32 $A5xB,$Ni,${N2}[1] + vmov $A0xB,$A1xB + vmlal.u32 $A6xB,$Ni,${N3}[0] + vmov $A1xB,$A2xB + vmlal.u32 $A7xB,$Ni,${N3}[1] + vmov $A2xB,$A3xB + vmov $A3xB,$A4xB + vshr.u64 $temp,$temp,#16 + vmov $A4xB,$A5xB + vmov $A5xB,$A6xB + vadd.u64 $temp,$temp,`&Dhi("$Temp")` + vmov $A6xB,$A7xB + veor $A7xB,$A7xB + vshr.u64 $temp,$temp,#16 + + b .LNEON_outer8 + +.align 4 +.LNEON_outer8: + vld1.32 {${Bi}[0]}, [$bptr,:32]! + veor $zero,$zero,$zero + vzip.16 $Bi,$zero + vadd.u64 `&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp + + vmlal.u32 $A0xB,$Bi,${A0}[0] + vmlal.u32 $A1xB,$Bi,${A0}[1] + vmlal.u32 $A2xB,$Bi,${A1}[0] + vshl.i64 $temp,`&Dhi("$A0xB")`,#16 + vmlal.u32 $A3xB,$Bi,${A1}[1] + + vadd.u64 $temp,$temp,`&Dlo("$A0xB")` + veor $zero,$zero,$zero + subs $outer,$outer,#1 + vmul.u32 $Ni,$temp,$M0 + + vmlal.u32 $A4xB,$Bi,${A2}[0] + vmlal.u32 $A5xB,$Bi,${A2}[1] + vmlal.u32 $A6xB,$Bi,${A3}[0] + vzip.16 $Ni,$zero + vmlal.u32 $A7xB,$Bi,${A3}[1] + + vmlal.u32 $A0xB,$Ni,${N0}[0] + vmlal.u32 $A1xB,$Ni,${N0}[1] + vmlal.u32 $A2xB,$Ni,${N1}[0] + vmlal.u32 $A3xB,$Ni,${N1}[1] + + vmlal.u32 $A4xB,$Ni,${N2}[0] + vmov $Temp,$A0xB + vmlal.u32 $A5xB,$Ni,${N2}[1] + vmov $A0xB,$A1xB + vmlal.u32 $A6xB,$Ni,${N3}[0] + vmov $A1xB,$A2xB + vmlal.u32 $A7xB,$Ni,${N3}[1] + vmov $A2xB,$A3xB + vmov $A3xB,$A4xB + vshr.u64 $temp,$temp,#16 + vmov $A4xB,$A5xB + vmov $A5xB,$A6xB + vadd.u64 $temp,$temp,`&Dhi("$Temp")` + vmov $A6xB,$A7xB + veor $A7xB,$A7xB + vshr.u64 $temp,$temp,#16 + + bne .LNEON_outer8 + + vadd.u64 `&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp + mov $toutptr,sp + vshr.u64 $temp,`&Dlo("$A0xB")`,#16 + mov $inner,$num + vadd.u64 `&Dhi("$A0xB")`,`&Dhi("$A0xB")`,$temp + add $tinptr,sp,#16 + vshr.u64 $temp,`&Dhi("$A0xB")`,#16 + vzip.16 `&Dlo("$A0xB")`,`&Dhi("$A0xB")` + + b .LNEON_tail2 + +.align 4 +.LNEON_1st: + vmlal.u32 $A0xB,$Ni,${N0}[0] + vld1.32 {$A0-$A3}, [$aptr]! + vmlal.u32 $A1xB,$Ni,${N0}[1] + subs $inner,$inner,#8 + vmlal.u32 $A2xB,$Ni,${N1}[0] + vmlal.u32 $A3xB,$Ni,${N1}[1] + + vmlal.u32 $A4xB,$Ni,${N2}[0] + vld1.32 {$N0-$N1}, [$nptr]! + vmlal.u32 $A5xB,$Ni,${N2}[1] + vst1.64 {$A0xB-$A1xB}, [$toutptr,:256]! + vmlal.u32 $A6xB,$Ni,${N3}[0] + vmlal.u32 $A7xB,$Ni,${N3}[1] + vst1.64 {$A2xB-$A3xB}, [$toutptr,:256]! + + vmull.u32 $A0xB,$Bi,${A0}[0] + vld1.32 {$N2-$N3}, [$nptr]! + vmull.u32 $A1xB,$Bi,${A0}[1] + vst1.64 {$A4xB-$A5xB}, [$toutptr,:256]! + vmull.u32 $A2xB,$Bi,${A1}[0] + vmull.u32 $A3xB,$Bi,${A1}[1] + vst1.64 {$A6xB-$A7xB}, [$toutptr,:256]! + + vmull.u32 $A4xB,$Bi,${A2}[0] + vmull.u32 $A5xB,$Bi,${A2}[1] + vmull.u32 $A6xB,$Bi,${A3}[0] + vmull.u32 $A7xB,$Bi,${A3}[1] + + bne .LNEON_1st + + vmlal.u32 $A0xB,$Ni,${N0}[0] + add $tinptr,sp,#16 + vmlal.u32 $A1xB,$Ni,${N0}[1] + sub $aptr,$aptr,$num,lsl#2 @ rewind $aptr + vmlal.u32 $A2xB,$Ni,${N1}[0] + vld1.64 {$Temp}, [sp,:128] + vmlal.u32 $A3xB,$Ni,${N1}[1] + sub $outer,$num,#1 + + vmlal.u32 $A4xB,$Ni,${N2}[0] + vst1.64 {$A0xB-$A1xB}, [$toutptr,:256]! + vmlal.u32 $A5xB,$Ni,${N2}[1] + vshr.u64 $temp,$temp,#16 + vld1.64 {$A0xB}, [$tinptr, :128]! + vmlal.u32 $A6xB,$Ni,${N3}[0] + vst1.64 {$A2xB-$A3xB}, [$toutptr,:256]! + vmlal.u32 $A7xB,$Ni,${N3}[1] + + vst1.64 {$A4xB-$A5xB}, [$toutptr,:256]! + vadd.u64 $temp,$temp,`&Dhi("$Temp")` + veor $Z,$Z,$Z + vst1.64 {$A6xB-$A7xB}, [$toutptr,:256]! + vld1.64 {$A1xB-$A2xB}, [$tinptr, :256]! + vst1.64 {$Z}, [$toutptr,:128] + vshr.u64 $temp,$temp,#16 + + b .LNEON_outer + +.align 4 +.LNEON_outer: + vld1.32 {${Bi}[0]}, [$bptr,:32]! + sub $nptr,$nptr,$num,lsl#2 @ rewind $nptr + vld1.32 {$A0-$A3}, [$aptr]! + veor $zero,$zero,$zero + mov $toutptr,sp + vzip.16 $Bi,$zero + sub $inner,$num,#8 + vadd.u64 `&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp + + vmlal.u32 $A0xB,$Bi,${A0}[0] + vld1.64 {$A3xB-$A4xB},[$tinptr,:256]! + vmlal.u32 $A1xB,$Bi,${A0}[1] + vmlal.u32 $A2xB,$Bi,${A1}[0] + vld1.64 {$A5xB-$A6xB},[$tinptr,:256]! + vmlal.u32 $A3xB,$Bi,${A1}[1] + + vshl.i64 $temp,`&Dhi("$A0xB")`,#16 + veor $zero,$zero,$zero + vadd.u64 $temp,$temp,`&Dlo("$A0xB")` + vld1.64 {$A7xB},[$tinptr,:128]! + vmul.u32 $Ni,$temp,$M0 + + vmlal.u32 $A4xB,$Bi,${A2}[0] + vld1.32 {$N0-$N3}, [$nptr]! + vmlal.u32 $A5xB,$Bi,${A2}[1] + vmlal.u32 $A6xB,$Bi,${A3}[0] + vzip.16 $Ni,$zero + vmlal.u32 $A7xB,$Bi,${A3}[1] + +.LNEON_inner: + vmlal.u32 $A0xB,$Ni,${N0}[0] + vld1.32 {$A0-$A3}, [$aptr]! + vmlal.u32 $A1xB,$Ni,${N0}[1] + subs $inner,$inner,#8 + vmlal.u32 $A2xB,$Ni,${N1}[0] + vmlal.u32 $A3xB,$Ni,${N1}[1] + vst1.64 {$A0xB-$A1xB}, [$toutptr,:256]! + + vmlal.u32 $A4xB,$Ni,${N2}[0] + vld1.64 {$A0xB}, [$tinptr, :128]! + vmlal.u32 $A5xB,$Ni,${N2}[1] + vst1.64 {$A2xB-$A3xB}, [$toutptr,:256]! + vmlal.u32 $A6xB,$Ni,${N3}[0] + vld1.64 {$A1xB-$A2xB}, [$tinptr, :256]! + vmlal.u32 $A7xB,$Ni,${N3}[1] + vst1.64 {$A4xB-$A5xB}, [$toutptr,:256]! + + vmlal.u32 $A0xB,$Bi,${A0}[0] + vld1.64 {$A3xB-$A4xB}, [$tinptr, :256]! + vmlal.u32 $A1xB,$Bi,${A0}[1] + vst1.64 {$A6xB-$A7xB}, [$toutptr,:256]! + vmlal.u32 $A2xB,$Bi,${A1}[0] + vld1.64 {$A5xB-$A6xB}, [$tinptr, :256]! + vmlal.u32 $A3xB,$Bi,${A1}[1] + vld1.32 {$N0-$N3}, [$nptr]! + + vmlal.u32 $A4xB,$Bi,${A2}[0] + vld1.64 {$A7xB}, [$tinptr, :128]! + vmlal.u32 $A5xB,$Bi,${A2}[1] + vmlal.u32 $A6xB,$Bi,${A3}[0] + vmlal.u32 $A7xB,$Bi,${A3}[1] + + bne .LNEON_inner + + vmlal.u32 $A0xB,$Ni,${N0}[0] + add $tinptr,sp,#16 + vmlal.u32 $A1xB,$Ni,${N0}[1] + sub $aptr,$aptr,$num,lsl#2 @ rewind $aptr + vmlal.u32 $A2xB,$Ni,${N1}[0] + vld1.64 {$Temp}, [sp,:128] + vmlal.u32 $A3xB,$Ni,${N1}[1] + subs $outer,$outer,#1 + + vmlal.u32 $A4xB,$Ni,${N2}[0] + vst1.64 {$A0xB-$A1xB}, [$toutptr,:256]! + vmlal.u32 $A5xB,$Ni,${N2}[1] + vld1.64 {$A0xB}, [$tinptr, :128]! + vshr.u64 $temp,$temp,#16 + vst1.64 {$A2xB-$A3xB}, [$toutptr,:256]! + vmlal.u32 $A6xB,$Ni,${N3}[0] + vld1.64 {$A1xB-$A2xB}, [$tinptr, :256]! + vmlal.u32 $A7xB,$Ni,${N3}[1] + + vst1.64 {$A4xB-$A5xB}, [$toutptr,:256]! + vadd.u64 $temp,$temp,`&Dhi("$Temp")` + vst1.64 {$A6xB-$A7xB}, [$toutptr,:256]! + vshr.u64 $temp,$temp,#16 + + bne .LNEON_outer + + mov $toutptr,sp + mov $inner,$num + +.LNEON_tail: + vadd.u64 `&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp + vld1.64 {$A3xB-$A4xB}, [$tinptr, :256]! + vshr.u64 $temp,`&Dlo("$A0xB")`,#16 + vadd.u64 `&Dhi("$A0xB")`,`&Dhi("$A0xB")`,$temp + vld1.64 {$A5xB-$A6xB}, [$tinptr, :256]! + vshr.u64 $temp,`&Dhi("$A0xB")`,#16 + vld1.64 {$A7xB}, [$tinptr, :128]! + vzip.16 `&Dlo("$A0xB")`,`&Dhi("$A0xB")` + +.LNEON_tail2: + vadd.u64 `&Dlo("$A1xB")`,`&Dlo("$A1xB")`,$temp + vst1.32 {`&Dlo("$A0xB")`[0]}, [$toutptr, :32]! + vshr.u64 $temp,`&Dlo("$A1xB")`,#16 + vadd.u64 `&Dhi("$A1xB")`,`&Dhi("$A1xB")`,$temp + vshr.u64 $temp,`&Dhi("$A1xB")`,#16 + vzip.16 `&Dlo("$A1xB")`,`&Dhi("$A1xB")` + + vadd.u64 `&Dlo("$A2xB")`,`&Dlo("$A2xB")`,$temp + vst1.32 {`&Dlo("$A1xB")`[0]}, [$toutptr, :32]! + vshr.u64 $temp,`&Dlo("$A2xB")`,#16 + vadd.u64 `&Dhi("$A2xB")`,`&Dhi("$A2xB")`,$temp + vshr.u64 $temp,`&Dhi("$A2xB")`,#16 + vzip.16 `&Dlo("$A2xB")`,`&Dhi("$A2xB")` + + vadd.u64 `&Dlo("$A3xB")`,`&Dlo("$A3xB")`,$temp + vst1.32 {`&Dlo("$A2xB")`[0]}, [$toutptr, :32]! + vshr.u64 $temp,`&Dlo("$A3xB")`,#16 + vadd.u64 `&Dhi("$A3xB")`,`&Dhi("$A3xB")`,$temp + vshr.u64 $temp,`&Dhi("$A3xB")`,#16 + vzip.16 `&Dlo("$A3xB")`,`&Dhi("$A3xB")` + + vadd.u64 `&Dlo("$A4xB")`,`&Dlo("$A4xB")`,$temp + vst1.32 {`&Dlo("$A3xB")`[0]}, [$toutptr, :32]! + vshr.u64 $temp,`&Dlo("$A4xB")`,#16 + vadd.u64 `&Dhi("$A4xB")`,`&Dhi("$A4xB")`,$temp + vshr.u64 $temp,`&Dhi("$A4xB")`,#16 + vzip.16 `&Dlo("$A4xB")`,`&Dhi("$A4xB")` + + vadd.u64 `&Dlo("$A5xB")`,`&Dlo("$A5xB")`,$temp + vst1.32 {`&Dlo("$A4xB")`[0]}, [$toutptr, :32]! + vshr.u64 $temp,`&Dlo("$A5xB")`,#16 + vadd.u64 `&Dhi("$A5xB")`,`&Dhi("$A5xB")`,$temp + vshr.u64 $temp,`&Dhi("$A5xB")`,#16 + vzip.16 `&Dlo("$A5xB")`,`&Dhi("$A5xB")` + + vadd.u64 `&Dlo("$A6xB")`,`&Dlo("$A6xB")`,$temp + vst1.32 {`&Dlo("$A5xB")`[0]}, [$toutptr, :32]! + vshr.u64 $temp,`&Dlo("$A6xB")`,#16 + vadd.u64 `&Dhi("$A6xB")`,`&Dhi("$A6xB")`,$temp + vld1.64 {$A0xB}, [$tinptr, :128]! + vshr.u64 $temp,`&Dhi("$A6xB")`,#16 + vzip.16 `&Dlo("$A6xB")`,`&Dhi("$A6xB")` + + vadd.u64 `&Dlo("$A7xB")`,`&Dlo("$A7xB")`,$temp + vst1.32 {`&Dlo("$A6xB")`[0]}, [$toutptr, :32]! + vshr.u64 $temp,`&Dlo("$A7xB")`,#16 + vadd.u64 `&Dhi("$A7xB")`,`&Dhi("$A7xB")`,$temp + vld1.64 {$A1xB-$A2xB}, [$tinptr, :256]! + vshr.u64 $temp,`&Dhi("$A7xB")`,#16 + vzip.16 `&Dlo("$A7xB")`,`&Dhi("$A7xB")` + subs $inner,$inner,#8 + vst1.32 {`&Dlo("$A7xB")`[0]}, [$toutptr, :32]! + + bne .LNEON_tail + + vst1.32 {${temp}[0]}, [$toutptr, :32] @ top-most bit + sub $nptr,$nptr,$num,lsl#2 @ rewind $nptr + subs $aptr,sp,#0 @ clear carry flag + add $bptr,sp,$num,lsl#2 + +.LNEON_sub: + ldmia $aptr!, {r4-r7} + ldmia $nptr!, {r8-r11} + sbcs r8, r4,r8 + sbcs r9, r5,r9 + sbcs r10,r6,r10 + sbcs r11,r7,r11 + teq $aptr,$bptr @ preserves carry + stmia $rptr!, {r8-r11} + bne .LNEON_sub + + ldr r10, [$aptr] @ load top-most bit + veor q0,q0,q0 + sub r11,$bptr,sp @ this is num*4 + veor q1,q1,q1 + mov $aptr,sp + sub $rptr,$rptr,r11 @ rewind $rptr + mov $nptr,$bptr @ second 3/4th of frame + sbcs r10,r10,#0 @ result is carry flag + +.LNEON_copy_n_zap: + ldmia $aptr!, {r4-r7} + ldmia $rptr, {r8-r11} + movcc r8, r4 + vst1.64 {q0-q1}, [$nptr,:256]! @ wipe + movcc r9, r5 + movcc r10,r6 + vst1.64 {q0-q1}, [$nptr,:256]! @ wipe + movcc r11,r7 + ldmia $aptr, {r4-r7} + stmia $rptr!, {r8-r11} + sub $aptr,$aptr,#16 + ldmia $rptr, {r8-r11} + movcc r8, r4 + vst1.64 {q0-q1}, [$aptr,:256]! @ wipe + movcc r9, r5 + movcc r10,r6 + vst1.64 {q0-q1}, [$nptr,:256]! @ wipe + movcc r11,r7 + teq $aptr,$bptr @ preserves carry + stmia $rptr!, {r8-r11} + bne .LNEON_copy_n_zap + + sub sp,ip,#96 + vldmia sp!,{d8-d15} + ldmia sp!,{r4-r11} + ret @ bx lr +.size bn_mul8x_mont_neon,.-bn_mul8x_mont_neon +#endif +___ +} +$code.=<<___; +.asciz "Montgomery multiplication for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>" .align 2 +#if __ARM_MAX_ARCH__>=7 +.comm OPENSSL_armcap_P,4,4 +#endif ___ +$code =~ s/\`([^\`]*)\`/eval $1/gem; $code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4 +$code =~ s/\bret\b/bx lr/gm; print $code; close STDOUT; diff --git a/crypto/bn/asm/mips-mont.pl b/crypto/bn/asm/mips-mont.pl index caae04e..a33cdf4 100755 --- a/crypto/bn/asm/mips-mont.pl +++ b/crypto/bn/asm/mips-mont.pl @@ -46,7 +46,7 @@ # ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23)); # ($gp,$sp,$fp,$ra)=map("\$$_",(28..31)); # -$flavour = shift; # supported flavours are o32,n32,64,nubi32,nubi64 +$flavour = shift || "o32"; # supported flavours are o32,n32,64,nubi32,nubi64 if ($flavour =~ /64|n32/i) { $PTR_ADD="dadd"; # incidentally works even on n32 diff --git a/crypto/bn/asm/mips.pl b/crypto/bn/asm/mips.pl index 215c9a7..acafde5 100755 --- a/crypto/bn/asm/mips.pl +++ b/crypto/bn/asm/mips.pl @@ -48,7 +48,7 @@ # has to content with 40-85% improvement depending on benchmark and # key length, more for longer keys. -$flavour = shift; +$flavour = shift || "o32"; while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} open STDOUT,">$output"; diff --git a/crypto/bn/asm/mips3.s b/crypto/bn/asm/mips3.s new file mode 100644 index 0000000..dca4105 --- /dev/null +++ b/crypto/bn/asm/mips3.s @@ -0,0 +1,2201 @@ +.rdata +.asciiz "mips3.s, Version 1.1" +.asciiz "MIPS III/IV ISA artwork by Andy Polyakov <appro@fy.chalmers.se>" + +/* + * ==================================================================== + * Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL + * project. + * + * Rights for redistribution and usage in source and binary forms are + * granted according to the OpenSSL license. Warranty of any kind is + * disclaimed. + * ==================================================================== + */ + +/* + * This is my modest contributon to the OpenSSL project (see + * http://www.openssl.org/ for more information about it) and is + * a drop-in MIPS III/IV ISA replacement for crypto/bn/bn_asm.c + * module. For updates see http://fy.chalmers.se/~appro/hpe/. + * + * The module is designed to work with either of the "new" MIPS ABI(5), + * namely N32 or N64, offered by IRIX 6.x. It's not ment to work under + * IRIX 5.x not only because it doesn't support new ABIs but also + * because 5.x kernels put R4x00 CPU into 32-bit mode and all those + * 64-bit instructions (daddu, dmultu, etc.) found below gonna only + * cause illegal instruction exception:-( + * + * In addition the code depends on preprocessor flags set up by MIPSpro + * compiler driver (either as or cc) and therefore (probably?) can't be + * compiled by the GNU assembler. GNU C driver manages fine though... + * I mean as long as -mmips-as is specified or is the default option, + * because then it simply invokes /usr/bin/as which in turn takes + * perfect care of the preprocessor definitions. Another neat feature + * offered by the MIPSpro assembler is an optimization pass. This gave + * me the opportunity to have the code looking more regular as all those + * architecture dependent instruction rescheduling details were left to + * the assembler. Cool, huh? + * + * Performance improvement is astonishing! 'apps/openssl speed rsa dsa' + * goes way over 3 times faster! + * + * <appro@fy.chalmers.se> + */ +#include <asm.h> +#include <regdef.h> + +#if _MIPS_ISA>=4 +#define MOVNZ(cond,dst,src) \ + movn dst,src,cond +#else +#define MOVNZ(cond,dst,src) \ + .set noreorder; \ + bnezl cond,.+8; \ + move dst,src; \ + .set reorder +#endif + +.text + +.set noat +.set reorder + +#define MINUS4 v1 + +.align 5 +LEAF(bn_mul_add_words) + .set noreorder + bgtzl a2,.L_bn_mul_add_words_proceed + ld t0,0(a1) + jr ra + move v0,zero + .set reorder + +.L_bn_mul_add_words_proceed: + li MINUS4,-4 + and ta0,a2,MINUS4 + move v0,zero + beqz ta0,.L_bn_mul_add_words_tail + +.L_bn_mul_add_words_loop: + dmultu t0,a3 + ld t1,0(a0) + ld t2,8(a1) + ld t3,8(a0) + ld ta0,16(a1) + ld ta1,16(a0) + daddu t1,v0 + sltu v0,t1,v0 /* All manuals say it "compares 32-bit + * values", but it seems to work fine + * even on 64-bit registers. */ + mflo AT + mfhi t0 + daddu t1,AT + daddu v0,t0 + sltu AT,t1,AT + sd t1,0(a0) + daddu v0,AT + + dmultu t2,a3 + ld ta2,24(a1) + ld ta3,24(a0) + daddu t3,v0 + sltu v0,t3,v0 + mflo AT + mfhi t2 + daddu t3,AT + daddu v0,t2 + sltu AT,t3,AT + sd t3,8(a0) + daddu v0,AT + + dmultu ta0,a3 + subu a2,4 + PTR_ADD a0,32 + PTR_ADD a1,32 + daddu ta1,v0 + sltu v0,ta1,v0 + mflo AT + mfhi ta0 + daddu ta1,AT + daddu v0,ta0 + sltu AT,ta1,AT + sd ta1,-16(a0) + daddu v0,AT + + + dmultu ta2,a3 + and ta0,a2,MINUS4 + daddu ta3,v0 + sltu v0,ta3,v0 + mflo AT + mfhi ta2 + daddu ta3,AT + daddu v0,ta2 + sltu AT,ta3,AT + sd ta3,-8(a0) + daddu v0,AT + .set noreorder + bgtzl ta0,.L_bn_mul_add_words_loop + ld t0,0(a1) + + bnezl a2,.L_bn_mul_add_words_tail + ld t0,0(a1) + .set reorder + +.L_bn_mul_add_words_return: + jr ra + +.L_bn_mul_add_words_tail: + dmultu t0,a3 + ld t1,0(a0) + subu a2,1 + daddu t1,v0 + sltu v0,t1,v0 + mflo AT + mfhi t0 + daddu t1,AT + daddu v0,t0 + sltu AT,t1,AT + sd t1,0(a0) + daddu v0,AT + beqz a2,.L_bn_mul_add_words_return + + ld t0,8(a1) + dmultu t0,a3 + ld t1,8(a0) + subu a2,1 + daddu t1,v0 + sltu v0,t1,v0 + mflo AT + mfhi t0 + daddu t1,AT + daddu v0,t0 + sltu AT,t1,AT + sd t1,8(a0) + daddu v0,AT + beqz a2,.L_bn_mul_add_words_return + + ld t0,16(a1) + dmultu t0,a3 + ld t1,16(a0) + daddu t1,v0 + sltu v0,t1,v0 + mflo AT + mfhi t0 + daddu t1,AT + daddu v0,t0 + sltu AT,t1,AT + sd t1,16(a0) + daddu v0,AT + jr ra +END(bn_mul_add_words) + +.align 5 +LEAF(bn_mul_words) + .set noreorder + bgtzl a2,.L_bn_mul_words_proceed + ld t0,0(a1) + jr ra + move v0,zero + .set reorder + +.L_bn_mul_words_proceed: + li MINUS4,-4 + and ta0,a2,MINUS4 + move v0,zero + beqz ta0,.L_bn_mul_words_tail + +.L_bn_mul_words_loop: + dmultu t0,a3 + ld t2,8(a1) + ld ta0,16(a1) + ld ta2,24(a1) + mflo AT + mfhi t0 + daddu v0,AT + sltu t1,v0,AT + sd v0,0(a0) + daddu v0,t1,t0 + + dmultu t2,a3 + subu a2,4 + PTR_ADD a0,32 + PTR_ADD a1,32 + mflo AT + mfhi t2 + daddu v0,AT + sltu t3,v0,AT + sd v0,-24(a0) + daddu v0,t3,t2 + + dmultu ta0,a3 + mflo AT + mfhi ta0 + daddu v0,AT + sltu ta1,v0,AT + sd v0,-16(a0) + daddu v0,ta1,ta0 + + + dmultu ta2,a3 + and ta0,a2,MINUS4 + mflo AT + mfhi ta2 + daddu v0,AT + sltu ta3,v0,AT + sd v0,-8(a0) + daddu v0,ta3,ta2 + .set noreorder + bgtzl ta0,.L_bn_mul_words_loop + ld t0,0(a1) + + bnezl a2,.L_bn_mul_words_tail + ld t0,0(a1) + .set reorder + +.L_bn_mul_words_return: + jr ra + +.L_bn_mul_words_tail: + dmultu t0,a3 + subu a2,1 + mflo AT + mfhi t0 + daddu v0,AT + sltu t1,v0,AT + sd v0,0(a0) + daddu v0,t1,t0 + beqz a2,.L_bn_mul_words_return + + ld t0,8(a1) + dmultu t0,a3 + subu a2,1 + mflo AT + mfhi t0 + daddu v0,AT + sltu t1,v0,AT + sd v0,8(a0) + daddu v0,t1,t0 + beqz a2,.L_bn_mul_words_return + + ld t0,16(a1) + dmultu t0,a3 + mflo AT + mfhi t0 + daddu v0,AT + sltu t1,v0,AT + sd v0,16(a0) + daddu v0,t1,t0 + jr ra +END(bn_mul_words) + +.align 5 +LEAF(bn_sqr_words) + .set noreorder + bgtzl a2,.L_bn_sqr_words_proceed + ld t0,0(a1) + jr ra + move v0,zero + .set reorder + +.L_bn_sqr_words_proceed: + li MINUS4,-4 + and ta0,a2,MINUS4 + move v0,zero + beqz ta0,.L_bn_sqr_words_tail + +.L_bn_sqr_words_loop: + dmultu t0,t0 + ld t2,8(a1) + ld ta0,16(a1) + ld ta2,24(a1) + mflo t1 + mfhi t0 + sd t1,0(a0) + sd t0,8(a0) + + dmultu t2,t2 + subu a2,4 + PTR_ADD a0,64 + PTR_ADD a1,32 + mflo t3 + mfhi t2 + sd t3,-48(a0) + sd t2,-40(a0) + + dmultu ta0,ta0 + mflo ta1 + mfhi ta0 + sd ta1,-32(a0) + sd ta0,-24(a0) + + + dmultu ta2,ta2 + and ta0,a2,MINUS4 + mflo ta3 + mfhi ta2 + sd ta3,-16(a0) + sd ta2,-8(a0) + + .set noreorder + bgtzl ta0,.L_bn_sqr_words_loop + ld t0,0(a1) + + bnezl a2,.L_bn_sqr_words_tail + ld t0,0(a1) + .set reorder + +.L_bn_sqr_words_return: + move v0,zero + jr ra + +.L_bn_sqr_words_tail: + dmultu t0,t0 + subu a2,1 + mflo t1 + mfhi t0 + sd t1,0(a0) + sd t0,8(a0) + beqz a2,.L_bn_sqr_words_return + + ld t0,8(a1) + dmultu t0,t0 + subu a2,1 + mflo t1 + mfhi t0 + sd t1,16(a0) + sd t0,24(a0) + beqz a2,.L_bn_sqr_words_return + + ld t0,16(a1) + dmultu t0,t0 + mflo t1 + mfhi t0 + sd t1,32(a0) + sd t0,40(a0) + jr ra +END(bn_sqr_words) + +.align 5 +LEAF(bn_add_words) + .set noreorder + bgtzl a3,.L_bn_add_words_proceed + ld t0,0(a1) + jr ra + move v0,zero + .set reorder + +.L_bn_add_words_proceed: + li MINUS4,-4 + and AT,a3,MINUS4 + move v0,zero + beqz AT,.L_bn_add_words_tail + +.L_bn_add_words_loop: + ld ta0,0(a2) + subu a3,4 + ld t1,8(a1) + and AT,a3,MINUS4 + ld t2,16(a1) + PTR_ADD a2,32 + ld t3,24(a1) + PTR_ADD a0,32 + ld ta1,-24(a2) + PTR_ADD a1,32 + ld ta2,-16(a2) + ld ta3,-8(a2) + daddu ta0,t0 + sltu t8,ta0,t0 + daddu t0,ta0,v0 + sltu v0,t0,ta0 + sd t0,-32(a0) + daddu v0,t8 + + daddu ta1,t1 + sltu t9,ta1,t1 + daddu t1,ta1,v0 + sltu v0,t1,ta1 + sd t1,-24(a0) + daddu v0,t9 + + daddu ta2,t2 + sltu t8,ta2,t2 + daddu t2,ta2,v0 + sltu v0,t2,ta2 + sd t2,-16(a0) + daddu v0,t8 + + daddu ta3,t3 + sltu t9,ta3,t3 + daddu t3,ta3,v0 + sltu v0,t3,ta3 + sd t3,-8(a0) + daddu v0,t9 + + .set noreorder + bgtzl AT,.L_bn_add_words_loop + ld t0,0(a1) + + bnezl a3,.L_bn_add_words_tail + ld t0,0(a1) + .set reorder + +.L_bn_add_words_return: + jr ra + +.L_bn_add_words_tail: + ld ta0,0(a2) + daddu ta0,t0 + subu a3,1 + sltu t8,ta0,t0 + daddu t0,ta0,v0 + sltu v0,t0,ta0 + sd t0,0(a0) + daddu v0,t8 + beqz a3,.L_bn_add_words_return + + ld t1,8(a1) + ld ta1,8(a2) + daddu ta1,t1 + subu a3,1 + sltu t9,ta1,t1 + daddu t1,ta1,v0 + sltu v0,t1,ta1 + sd t1,8(a0) + daddu v0,t9 + beqz a3,.L_bn_add_words_return + + ld t2,16(a1) + ld ta2,16(a2) + daddu ta2,t2 + sltu t8,ta2,t2 + daddu t2,ta2,v0 + sltu v0,t2,ta2 + sd t2,16(a0) + daddu v0,t8 + jr ra +END(bn_add_words) + +.align 5 +LEAF(bn_sub_words) + .set noreorder + bgtzl a3,.L_bn_sub_words_proceed + ld t0,0(a1) + jr ra + move v0,zero + .set reorder + +.L_bn_sub_words_proceed: + li MINUS4,-4 + and AT,a3,MINUS4 + move v0,zero + beqz AT,.L_bn_sub_words_tail + +.L_bn_sub_words_loop: + ld ta0,0(a2) + subu a3,4 + ld t1,8(a1) + and AT,a3,MINUS4 + ld t2,16(a1) + PTR_ADD a2,32 + ld t3,24(a1) + PTR_ADD a0,32 + ld ta1,-24(a2) + PTR_ADD a1,32 + ld ta2,-16(a2) + ld ta3,-8(a2) + sltu t8,t0,ta0 + dsubu t0,ta0 + dsubu ta0,t0,v0 + sd ta0,-32(a0) + MOVNZ (t0,v0,t8) + + sltu t9,t1,ta1 + dsubu t1,ta1 + dsubu ta1,t1,v0 + sd ta1,-24(a0) + MOVNZ (t1,v0,t9) + + + sltu t8,t2,ta2 + dsubu t2,ta2 + dsubu ta2,t2,v0 + sd ta2,-16(a0) + MOVNZ (t2,v0,t8) + + sltu t9,t3,ta3 + dsubu t3,ta3 + dsubu ta3,t3,v0 + sd ta3,-8(a0) + MOVNZ (t3,v0,t9) + + .set noreorder + bgtzl AT,.L_bn_sub_words_loop + ld t0,0(a1) + + bnezl a3,.L_bn_sub_words_tail + ld t0,0(a1) + .set reorder + +.L_bn_sub_words_return: + jr ra + +.L_bn_sub_words_tail: + ld ta0,0(a2) + subu a3,1 + sltu t8,t0,ta0 + dsubu t0,ta0 + dsubu ta0,t0,v0 + MOVNZ (t0,v0,t8) + sd ta0,0(a0) + beqz a3,.L_bn_sub_words_return + + ld t1,8(a1) + subu a3,1 + ld ta1,8(a2) + sltu t9,t1,ta1 + dsubu t1,ta1 + dsubu ta1,t1,v0 + MOVNZ (t1,v0,t9) + sd ta1,8(a0) + beqz a3,.L_bn_sub_words_return + + ld t2,16(a1) + ld ta2,16(a2) + sltu t8,t2,ta2 + dsubu t2,ta2 + dsubu ta2,t2,v0 + MOVNZ (t2,v0,t8) + sd ta2,16(a0) + jr ra +END(bn_sub_words) + +#undef MINUS4 + +.align 5 +LEAF(bn_div_3_words) + .set reorder + move a3,a0 /* we know that bn_div_words doesn't + * touch a3, ta2, ta3 and preserves a2 + * so that we can save two arguments + * and return address in registers + * instead of stack:-) + */ + ld a0,(a3) + move ta2,a1 + ld a1,-8(a3) + bne a0,a2,.L_bn_div_3_words_proceed + li v0,-1 + jr ra +.L_bn_div_3_words_proceed: + move ta3,ra + bal bn_div_words + move ra,ta3 + dmultu ta2,v0 + ld t2,-16(a3) + move ta0,zero + mfhi t1 + mflo t0 + sltu t8,t1,v1 +.L_bn_div_3_words_inner_loop: + bnez t8,.L_bn_div_3_words_inner_loop_done + sgeu AT,t2,t0 + seq t9,t1,v1 + and AT,t9 + sltu t3,t0,ta2 + daddu v1,a2 + dsubu t1,t3 + dsubu t0,ta2 + sltu t8,t1,v1 + sltu ta0,v1,a2 + or t8,ta0 + .set noreorder + beqzl AT,.L_bn_div_3_words_inner_loop + dsubu v0,1 + .set reorder +.L_bn_div_3_words_inner_loop_done: + jr ra +END(bn_div_3_words) + +.align 5 +LEAF(bn_div_words) + .set noreorder + bnezl a2,.L_bn_div_words_proceed + move v1,zero + jr ra + li v0,-1 /* I'd rather signal div-by-zero + * which can be done with 'break 7' */ + +.L_bn_div_words_proceed: + bltz a2,.L_bn_div_words_body + move t9,v1 + dsll a2,1 + bgtz a2,.-4 + addu t9,1 + + .set reorder + negu t1,t9 + li t2,-1 + dsll t2,t1 + and t2,a0 + dsrl AT,a1,t1 + .set noreorder + bnezl t2,.+8 + break 6 /* signal overflow */ + .set reorder + dsll a0,t9 + dsll a1,t9 + or a0,AT + +#define QT ta0 +#define HH ta1 +#define DH v1 +.L_bn_div_words_body: + dsrl DH,a2,32 + sgeu AT,a0,a2 + .set noreorder + bnezl AT,.+8 + dsubu a0,a2 + .set reorder + + li QT,-1 + dsrl HH,a0,32 + dsrl QT,32 /* q=0xffffffff */ + beq DH,HH,.L_bn_div_words_skip_div1 + ddivu zero,a0,DH + mflo QT +.L_bn_div_words_skip_div1: + dmultu a2,QT + dsll t3,a0,32 + dsrl AT,a1,32 + or t3,AT + mflo t0 + mfhi t1 +.L_bn_div_words_inner_loop1: + sltu t2,t3,t0 + seq t8,HH,t1 + sltu AT,HH,t1 + and t2,t8 + sltu v0,t0,a2 + or AT,t2 + .set noreorder + beqz AT,.L_bn_div_words_inner_loop1_done + dsubu t1,v0 + dsubu t0,a2 + b .L_bn_div_words_inner_loop1 + dsubu QT,1 + .set reorder +.L_bn_div_words_inner_loop1_done: + + dsll a1,32 + dsubu a0,t3,t0 + dsll v0,QT,32 + + li QT,-1 + dsrl HH,a0,32 + dsrl QT,32 /* q=0xffffffff */ + beq DH,HH,.L_bn_div_words_skip_div2 + ddivu zero,a0,DH + mflo QT +.L_bn_div_words_skip_div2: +#undef DH + dmultu a2,QT + dsll t3,a0,32 + dsrl AT,a1,32 + or t3,AT + mflo t0 + mfhi t1 +.L_bn_div_words_inner_loop2: + sltu t2,t3,t0 + seq t8,HH,t1 + sltu AT,HH,t1 + and t2,t8 + sltu v1,t0,a2 + or AT,t2 + .set noreorder + beqz AT,.L_bn_div_words_inner_loop2_done + dsubu t1,v1 + dsubu t0,a2 + b .L_bn_div_words_inner_loop2 + dsubu QT,1 + .set reorder +.L_bn_div_words_inner_loop2_done: +#undef HH + + dsubu a0,t3,t0 + or v0,QT + dsrl v1,a0,t9 /* v1 contains remainder if anybody wants it */ + dsrl a2,t9 /* restore a2 */ + jr ra +#undef QT +END(bn_div_words) + +#define a_0 t0 +#define a_1 t1 +#define a_2 t2 +#define a_3 t3 +#define b_0 ta0 +#define b_1 ta1 +#define b_2 ta2 +#define b_3 ta3 + +#define a_4 s0 +#define a_5 s2 +#define a_6 s4 +#define a_7 a1 /* once we load a[7] we don't need a anymore */ +#define b_4 s1 +#define b_5 s3 +#define b_6 s5 +#define b_7 a2 /* once we load b[7] we don't need b anymore */ + +#define t_1 t8 +#define t_2 t9 + +#define c_1 v0 +#define c_2 v1 +#define c_3 a3 + +#define FRAME_SIZE 48 + +.align 5 +LEAF(bn_mul_comba8) + .set noreorder + PTR_SUB sp,FRAME_SIZE + .frame sp,64,ra + .set reorder + ld a_0,0(a1) /* If compiled with -mips3 option on + * R5000 box assembler barks on this + * line with "shouldn't have mult/div + * as last instruction in bb (R10K + * bug)" warning. If anybody out there + * has a clue about how to circumvent + * this do send me a note. + * <appro@fy.chalmers.se> + */ + ld b_0,0(a2) + ld a_1,8(a1) + ld a_2,16(a1) + ld a_3,24(a1) + ld b_1,8(a2) + ld b_2,16(a2) + ld b_3,24(a2) + dmultu a_0,b_0 /* mul_add_c(a[0],b[0],c1,c2,c3); */ + sd s0,0(sp) + sd s1,8(sp) + sd s2,16(sp) + sd s3,24(sp) + sd s4,32(sp) + sd s5,40(sp) + mflo c_1 + mfhi c_2 + + dmultu a_0,b_1 /* mul_add_c(a[0],b[1],c2,c3,c1); */ + ld a_4,32(a1) + ld a_5,40(a1) + ld a_6,48(a1) + ld a_7,56(a1) + ld b_4,32(a2) + ld b_5,40(a2) + mflo t_1 + mfhi t_2 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu c_3,t_2,AT + dmultu a_1,b_0 /* mul_add_c(a[1],b[0],c2,c3,c1); */ + ld b_6,48(a2) + ld b_7,56(a2) + sd c_1,0(a0) /* r[0]=c1; */ + mflo t_1 + mfhi t_2 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu c_1,c_3,t_2 + sd c_2,8(a0) /* r[1]=c2; */ + + dmultu a_2,b_0 /* mul_add_c(a[2],b[0],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + dmultu a_1,b_1 /* mul_add_c(a[1],b[1],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sltu c_2,c_1,t_2 + dmultu a_0,b_2 /* mul_add_c(a[0],b[2],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sltu AT,c_1,t_2 + daddu c_2,AT + sd c_3,16(a0) /* r[2]=c3; */ + + dmultu a_0,b_3 /* mul_add_c(a[0],b[3],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sltu c_3,c_2,t_2 + dmultu a_1,b_2 /* mul_add_c(a[1],b[2],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sltu AT,c_2,t_2 + daddu c_3,AT + dmultu a_2,b_1 /* mul_add_c(a[2],b[1],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sltu AT,c_2,t_2 + daddu c_3,AT + dmultu a_3,b_0 /* mul_add_c(a[3],b[0],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sltu AT,c_2,t_2 + daddu c_3,AT + sd c_1,24(a0) /* r[3]=c1; */ + + dmultu a_4,b_0 /* mul_add_c(a[4],b[0],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu c_1,c_3,t_2 + dmultu a_3,b_1 /* mul_add_c(a[3],b[1],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu AT,c_3,t_2 + daddu c_1,AT + dmultu a_2,b_2 /* mul_add_c(a[2],b[2],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu AT,c_3,t_2 + daddu c_1,AT + dmultu a_1,b_3 /* mul_add_c(a[1],b[3],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu AT,c_3,t_2 + daddu c_1,AT + dmultu a_0,b_4 /* mul_add_c(a[0],b[4],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu AT,c_3,t_2 + daddu c_1,AT + sd c_2,32(a0) /* r[4]=c2; */ + + dmultu a_0,b_5 /* mul_add_c(a[0],b[5],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sltu c_2,c_1,t_2 + dmultu a_1,b_4 /* mul_add_c(a[1],b[4],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sltu AT,c_1,t_2 + daddu c_2,AT + dmultu a_2,b_3 /* mul_add_c(a[2],b[3],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sltu AT,c_1,t_2 + daddu c_2,AT + dmultu a_3,b_2 /* mul_add_c(a[3],b[2],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sltu AT,c_1,t_2 + daddu c_2,AT + dmultu a_4,b_1 /* mul_add_c(a[4],b[1],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sltu AT,c_1,t_2 + daddu c_2,AT + dmultu a_5,b_0 /* mul_add_c(a[5],b[0],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sltu AT,c_1,t_2 + daddu c_2,AT + sd c_3,40(a0) /* r[5]=c3; */ + + dmultu a_6,b_0 /* mul_add_c(a[6],b[0],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sltu c_3,c_2,t_2 + dmultu a_5,b_1 /* mul_add_c(a[5],b[1],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sltu AT,c_2,t_2 + daddu c_3,AT + dmultu a_4,b_2 /* mul_add_c(a[4],b[2],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sltu AT,c_2,t_2 + daddu c_3,AT + dmultu a_3,b_3 /* mul_add_c(a[3],b[3],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sltu AT,c_2,t_2 + daddu c_3,AT + dmultu a_2,b_4 /* mul_add_c(a[2],b[4],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sltu AT,c_2,t_2 + daddu c_3,AT + dmultu a_1,b_5 /* mul_add_c(a[1],b[5],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sltu AT,c_2,t_2 + daddu c_3,AT + dmultu a_0,b_6 /* mul_add_c(a[0],b[6],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sltu AT,c_2,t_2 + daddu c_3,AT + sd c_1,48(a0) /* r[6]=c1; */ + + dmultu a_0,b_7 /* mul_add_c(a[0],b[7],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu c_1,c_3,t_2 + dmultu a_1,b_6 /* mul_add_c(a[1],b[6],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu AT,c_3,t_2 + daddu c_1,AT + dmultu a_2,b_5 /* mul_add_c(a[2],b[5],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu AT,c_3,t_2 + daddu c_1,AT + dmultu a_3,b_4 /* mul_add_c(a[3],b[4],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu AT,c_3,t_2 + daddu c_1,AT + dmultu a_4,b_3 /* mul_add_c(a[4],b[3],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu AT,c_3,t_2 + daddu c_1,AT + dmultu a_5,b_2 /* mul_add_c(a[5],b[2],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu AT,c_3,t_2 + daddu c_1,AT + dmultu a_6,b_1 /* mul_add_c(a[6],b[1],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu AT,c_3,t_2 + daddu c_1,AT + dmultu a_7,b_0 /* mul_add_c(a[7],b[0],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu AT,c_3,t_2 + daddu c_1,AT + sd c_2,56(a0) /* r[7]=c2; */ + + dmultu a_7,b_1 /* mul_add_c(a[7],b[1],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sltu c_2,c_1,t_2 + dmultu a_6,b_2 /* mul_add_c(a[6],b[2],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sltu AT,c_1,t_2 + daddu c_2,AT + dmultu a_5,b_3 /* mul_add_c(a[5],b[3],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sltu AT,c_1,t_2 + daddu c_2,AT + dmultu a_4,b_4 /* mul_add_c(a[4],b[4],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sltu AT,c_1,t_2 + daddu c_2,AT + dmultu a_3,b_5 /* mul_add_c(a[3],b[5],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sltu AT,c_1,t_2 + daddu c_2,AT + dmultu a_2,b_6 /* mul_add_c(a[2],b[6],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sltu AT,c_1,t_2 + daddu c_2,AT + dmultu a_1,b_7 /* mul_add_c(a[1],b[7],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sltu AT,c_1,t_2 + daddu c_2,AT + sd c_3,64(a0) /* r[8]=c3; */ + + dmultu a_2,b_7 /* mul_add_c(a[2],b[7],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sltu c_3,c_2,t_2 + dmultu a_3,b_6 /* mul_add_c(a[3],b[6],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sltu AT,c_2,t_2 + daddu c_3,AT + dmultu a_4,b_5 /* mul_add_c(a[4],b[5],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sltu AT,c_2,t_2 + daddu c_3,AT + dmultu a_5,b_4 /* mul_add_c(a[5],b[4],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sltu AT,c_2,t_2 + daddu c_3,AT + dmultu a_6,b_3 /* mul_add_c(a[6],b[3],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sltu AT,c_2,t_2 + daddu c_3,AT + dmultu a_7,b_2 /* mul_add_c(a[7],b[2],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sltu AT,c_2,t_2 + daddu c_3,AT + sd c_1,72(a0) /* r[9]=c1; */ + + dmultu a_7,b_3 /* mul_add_c(a[7],b[3],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu c_1,c_3,t_2 + dmultu a_6,b_4 /* mul_add_c(a[6],b[4],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu AT,c_3,t_2 + daddu c_1,AT + dmultu a_5,b_5 /* mul_add_c(a[5],b[5],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu AT,c_3,t_2 + daddu c_1,AT + dmultu a_4,b_6 /* mul_add_c(a[4],b[6],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu AT,c_3,t_2 + daddu c_1,AT + dmultu a_3,b_7 /* mul_add_c(a[3],b[7],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu AT,c_3,t_2 + daddu c_1,AT + sd c_2,80(a0) /* r[10]=c2; */ + + dmultu a_4,b_7 /* mul_add_c(a[4],b[7],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sltu c_2,c_1,t_2 + dmultu a_5,b_6 /* mul_add_c(a[5],b[6],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sltu AT,c_1,t_2 + daddu c_2,AT + dmultu a_6,b_5 /* mul_add_c(a[6],b[5],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sltu AT,c_1,t_2 + daddu c_2,AT + dmultu a_7,b_4 /* mul_add_c(a[7],b[4],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sltu AT,c_1,t_2 + daddu c_2,AT + sd c_3,88(a0) /* r[11]=c3; */ + + dmultu a_7,b_5 /* mul_add_c(a[7],b[5],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sltu c_3,c_2,t_2 + dmultu a_6,b_6 /* mul_add_c(a[6],b[6],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sltu AT,c_2,t_2 + daddu c_3,AT + dmultu a_5,b_7 /* mul_add_c(a[5],b[7],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sltu AT,c_2,t_2 + daddu c_3,AT + sd c_1,96(a0) /* r[12]=c1; */ + + dmultu a_6,b_7 /* mul_add_c(a[6],b[7],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu c_1,c_3,t_2 + dmultu a_7,b_6 /* mul_add_c(a[7],b[6],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu AT,c_3,t_2 + daddu c_1,AT + sd c_2,104(a0) /* r[13]=c2; */ + + dmultu a_7,b_7 /* mul_add_c(a[7],b[7],c3,c1,c2); */ + ld s0,0(sp) + ld s1,8(sp) + ld s2,16(sp) + ld s3,24(sp) + ld s4,32(sp) + ld s5,40(sp) + mflo t_1 + mfhi t_2 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sd c_3,112(a0) /* r[14]=c3; */ + sd c_1,120(a0) /* r[15]=c1; */ + + PTR_ADD sp,FRAME_SIZE + + jr ra +END(bn_mul_comba8) + +.align 5 +LEAF(bn_mul_comba4) + .set reorder + ld a_0,0(a1) + ld b_0,0(a2) + ld a_1,8(a1) + ld a_2,16(a1) + dmultu a_0,b_0 /* mul_add_c(a[0],b[0],c1,c2,c3); */ + ld a_3,24(a1) + ld b_1,8(a2) + ld b_2,16(a2) + ld b_3,24(a2) + mflo c_1 + mfhi c_2 + sd c_1,0(a0) + + dmultu a_0,b_1 /* mul_add_c(a[0],b[1],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu c_3,t_2,AT + dmultu a_1,b_0 /* mul_add_c(a[1],b[0],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu c_1,c_3,t_2 + sd c_2,8(a0) + + dmultu a_2,b_0 /* mul_add_c(a[2],b[0],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + dmultu a_1,b_1 /* mul_add_c(a[1],b[1],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sltu c_2,c_1,t_2 + dmultu a_0,b_2 /* mul_add_c(a[0],b[2],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sltu AT,c_1,t_2 + daddu c_2,AT + sd c_3,16(a0) + + dmultu a_0,b_3 /* mul_add_c(a[0],b[3],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sltu c_3,c_2,t_2 + dmultu a_1,b_2 /* mul_add_c(a[1],b[2],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sltu AT,c_2,t_2 + daddu c_3,AT + dmultu a_2,b_1 /* mul_add_c(a[2],b[1],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sltu AT,c_2,t_2 + daddu c_3,AT + dmultu a_3,b_0 /* mul_add_c(a[3],b[0],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sltu AT,c_2,t_2 + daddu c_3,AT + sd c_1,24(a0) + + dmultu a_3,b_1 /* mul_add_c(a[3],b[1],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu c_1,c_3,t_2 + dmultu a_2,b_2 /* mul_add_c(a[2],b[2],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu AT,c_3,t_2 + daddu c_1,AT + dmultu a_1,b_3 /* mul_add_c(a[1],b[3],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu AT,c_3,t_2 + daddu c_1,AT + sd c_2,32(a0) + + dmultu a_2,b_3 /* mul_add_c(a[2],b[3],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sltu c_2,c_1,t_2 + dmultu a_3,b_2 /* mul_add_c(a[3],b[2],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sltu AT,c_1,t_2 + daddu c_2,AT + sd c_3,40(a0) + + dmultu a_3,b_3 /* mul_add_c(a[3],b[3],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sd c_1,48(a0) + sd c_2,56(a0) + + jr ra +END(bn_mul_comba4) + +#undef a_4 +#undef a_5 +#undef a_6 +#undef a_7 +#define a_4 b_0 +#define a_5 b_1 +#define a_6 b_2 +#define a_7 b_3 + +.align 5 +LEAF(bn_sqr_comba8) + .set reorder + ld a_0,0(a1) + ld a_1,8(a1) + ld a_2,16(a1) + ld a_3,24(a1) + + dmultu a_0,a_0 /* mul_add_c(a[0],b[0],c1,c2,c3); */ + ld a_4,32(a1) + ld a_5,40(a1) + ld a_6,48(a1) + ld a_7,56(a1) + mflo c_1 + mfhi c_2 + sd c_1,0(a0) + + dmultu a_0,a_1 /* mul_add_c2(a[0],b[1],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + slt c_1,t_2,zero + dsll t_2,1 + slt a2,t_1,zero + daddu t_2,a2 + dsll t_1,1 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu c_3,t_2,AT + sd c_2,8(a0) + + dmultu a_2,a_0 /* mul_add_c2(a[2],b[0],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + slt c_2,t_2,zero + dsll t_2,1 + slt a2,t_1,zero + daddu t_2,a2 + dsll t_1,1 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sltu AT,c_1,t_2 + daddu c_2,AT + dmultu a_1,a_1 /* mul_add_c(a[1],b[1],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sltu AT,c_1,t_2 + daddu c_2,AT + sd c_3,16(a0) + + dmultu a_0,a_3 /* mul_add_c2(a[0],b[3],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + slt c_3,t_2,zero + dsll t_2,1 + slt a2,t_1,zero + daddu t_2,a2 + dsll t_1,1 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sltu AT,c_2,t_2 + daddu c_3,AT + dmultu a_1,a_2 /* mul_add_c2(a[1],b[2],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + slt AT,t_2,zero + daddu c_3,AT + dsll t_2,1 + slt a2,t_1,zero + daddu t_2,a2 + dsll t_1,1 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sltu AT,c_2,t_2 + daddu c_3,AT + sd c_1,24(a0) + + dmultu a_4,a_0 /* mul_add_c2(a[4],b[0],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + slt c_1,t_2,zero + dsll t_2,1 + slt a2,t_1,zero + daddu t_2,a2 + dsll t_1,1 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu AT,c_3,t_2 + daddu c_1,AT + dmultu a_3,a_1 /* mul_add_c2(a[3],b[1],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + slt AT,t_2,zero + daddu c_1,AT + dsll t_2,1 + slt a2,t_1,zero + daddu t_2,a2 + dsll t_1,1 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu AT,c_3,t_2 + daddu c_1,AT + dmultu a_2,a_2 /* mul_add_c(a[2],b[2],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu AT,c_3,t_2 + daddu c_1,AT + sd c_2,32(a0) + + dmultu a_0,a_5 /* mul_add_c2(a[0],b[5],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + slt c_2,t_2,zero + dsll t_2,1 + slt a2,t_1,zero + daddu t_2,a2 + dsll t_1,1 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sltu AT,c_1,t_2 + daddu c_2,AT + dmultu a_1,a_4 /* mul_add_c2(a[1],b[4],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + slt AT,t_2,zero + daddu c_2,AT + dsll t_2,1 + slt a2,t_1,zero + daddu t_2,a2 + dsll t_1,1 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sltu AT,c_1,t_2 + daddu c_2,AT + dmultu a_2,a_3 /* mul_add_c2(a[2],b[3],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + slt AT,t_2,zero + daddu c_2,AT + dsll t_2,1 + slt a2,t_1,zero + daddu t_2,a2 + dsll t_1,1 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sltu AT,c_1,t_2 + daddu c_2,AT + sd c_3,40(a0) + + dmultu a_6,a_0 /* mul_add_c2(a[6],b[0],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + slt c_3,t_2,zero + dsll t_2,1 + slt a2,t_1,zero + daddu t_2,a2 + dsll t_1,1 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sltu AT,c_2,t_2 + daddu c_3,AT + dmultu a_5,a_1 /* mul_add_c2(a[5],b[1],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + slt AT,t_2,zero + daddu c_3,AT + dsll t_2,1 + slt a2,t_1,zero + daddu t_2,a2 + dsll t_1,1 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sltu AT,c_2,t_2 + daddu c_3,AT + dmultu a_4,a_2 /* mul_add_c2(a[4],b[2],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + slt AT,t_2,zero + daddu c_3,AT + dsll t_2,1 + slt a2,t_1,zero + daddu t_2,a2 + dsll t_1,1 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sltu AT,c_2,t_2 + daddu c_3,AT + dmultu a_3,a_3 /* mul_add_c(a[3],b[3],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sltu AT,c_2,t_2 + daddu c_3,AT + sd c_1,48(a0) + + dmultu a_0,a_7 /* mul_add_c2(a[0],b[7],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + slt c_1,t_2,zero + dsll t_2,1 + slt a2,t_1,zero + daddu t_2,a2 + dsll t_1,1 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu AT,c_3,t_2 + daddu c_1,AT + dmultu a_1,a_6 /* mul_add_c2(a[1],b[6],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + slt AT,t_2,zero + daddu c_1,AT + dsll t_2,1 + slt a2,t_1,zero + daddu t_2,a2 + dsll t_1,1 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu AT,c_3,t_2 + daddu c_1,AT + dmultu a_2,a_5 /* mul_add_c2(a[2],b[5],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + slt AT,t_2,zero + daddu c_1,AT + dsll t_2,1 + slt a2,t_1,zero + daddu t_2,a2 + dsll t_1,1 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu AT,c_3,t_2 + daddu c_1,AT + dmultu a_3,a_4 /* mul_add_c2(a[3],b[4],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + slt AT,t_2,zero + daddu c_1,AT + dsll t_2,1 + slt a2,t_1,zero + daddu t_2,a2 + dsll t_1,1 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu AT,c_3,t_2 + daddu c_1,AT + sd c_2,56(a0) + + dmultu a_7,a_1 /* mul_add_c2(a[7],b[1],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + slt c_2,t_2,zero + dsll t_2,1 + slt a2,t_1,zero + daddu t_2,a2 + dsll t_1,1 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sltu AT,c_1,t_2 + daddu c_2,AT + dmultu a_6,a_2 /* mul_add_c2(a[6],b[2],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + slt AT,t_2,zero + daddu c_2,AT + dsll t_2,1 + slt a2,t_1,zero + daddu t_2,a2 + dsll t_1,1 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sltu AT,c_1,t_2 + daddu c_2,AT + dmultu a_5,a_3 /* mul_add_c2(a[5],b[3],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + slt AT,t_2,zero + daddu c_2,AT + dsll t_2,1 + slt a2,t_1,zero + daddu t_2,a2 + dsll t_1,1 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sltu AT,c_1,t_2 + daddu c_2,AT + dmultu a_4,a_4 /* mul_add_c(a[4],b[4],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sltu AT,c_1,t_2 + daddu c_2,AT + sd c_3,64(a0) + + dmultu a_2,a_7 /* mul_add_c2(a[2],b[7],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + slt c_3,t_2,zero + dsll t_2,1 + slt a2,t_1,zero + daddu t_2,a2 + dsll t_1,1 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sltu AT,c_2,t_2 + daddu c_3,AT + dmultu a_3,a_6 /* mul_add_c2(a[3],b[6],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + slt AT,t_2,zero + daddu c_3,AT + dsll t_2,1 + slt a2,t_1,zero + daddu t_2,a2 + dsll t_1,1 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sltu AT,c_2,t_2 + daddu c_3,AT + dmultu a_4,a_5 /* mul_add_c2(a[4],b[5],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + slt AT,t_2,zero + daddu c_3,AT + dsll t_2,1 + slt a2,t_1,zero + daddu t_2,a2 + dsll t_1,1 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sltu AT,c_2,t_2 + daddu c_3,AT + sd c_1,72(a0) + + dmultu a_7,a_3 /* mul_add_c2(a[7],b[3],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + slt c_1,t_2,zero + dsll t_2,1 + slt a2,t_1,zero + daddu t_2,a2 + dsll t_1,1 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu AT,c_3,t_2 + daddu c_1,AT + dmultu a_6,a_4 /* mul_add_c2(a[6],b[4],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + slt AT,t_2,zero + daddu c_1,AT + dsll t_2,1 + slt a2,t_1,zero + daddu t_2,a2 + dsll t_1,1 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu AT,c_3,t_2 + daddu c_1,AT + dmultu a_5,a_5 /* mul_add_c(a[5],b[5],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu AT,c_3,t_2 + daddu c_1,AT + sd c_2,80(a0) + + dmultu a_4,a_7 /* mul_add_c2(a[4],b[7],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + slt c_2,t_2,zero + dsll t_2,1 + slt a2,t_1,zero + daddu t_2,a2 + dsll t_1,1 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sltu AT,c_1,t_2 + daddu c_2,AT + dmultu a_5,a_6 /* mul_add_c2(a[5],b[6],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + slt AT,t_2,zero + daddu c_2,AT + dsll t_2,1 + slt a2,t_1,zero + daddu t_2,a2 + dsll t_1,1 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sltu AT,c_1,t_2 + daddu c_2,AT + sd c_3,88(a0) + + dmultu a_7,a_5 /* mul_add_c2(a[7],b[5],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + slt c_3,t_2,zero + dsll t_2,1 + slt a2,t_1,zero + daddu t_2,a2 + dsll t_1,1 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sltu AT,c_2,t_2 + daddu c_3,AT + dmultu a_6,a_6 /* mul_add_c(a[6],b[6],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sltu AT,c_2,t_2 + daddu c_3,AT + sd c_1,96(a0) + + dmultu a_6,a_7 /* mul_add_c2(a[6],b[7],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + slt c_1,t_2,zero + dsll t_2,1 + slt a2,t_1,zero + daddu t_2,a2 + dsll t_1,1 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu AT,c_3,t_2 + daddu c_1,AT + sd c_2,104(a0) + + dmultu a_7,a_7 /* mul_add_c(a[7],b[7],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sd c_3,112(a0) + sd c_1,120(a0) + + jr ra +END(bn_sqr_comba8) + +.align 5 +LEAF(bn_sqr_comba4) + .set reorder + ld a_0,0(a1) + ld a_1,8(a1) + ld a_2,16(a1) + ld a_3,24(a1) + dmultu a_0,a_0 /* mul_add_c(a[0],b[0],c1,c2,c3); */ + mflo c_1 + mfhi c_2 + sd c_1,0(a0) + + dmultu a_0,a_1 /* mul_add_c2(a[0],b[1],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + slt c_1,t_2,zero + dsll t_2,1 + slt a2,t_1,zero + daddu t_2,a2 + dsll t_1,1 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu c_3,t_2,AT + sd c_2,8(a0) + + dmultu a_2,a_0 /* mul_add_c2(a[2],b[0],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + slt c_2,t_2,zero + dsll t_2,1 + slt a2,t_1,zero + daddu t_2,a2 + dsll t_1,1 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sltu AT,c_1,t_2 + daddu c_2,AT + dmultu a_1,a_1 /* mul_add_c(a[1],b[1],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sltu AT,c_1,t_2 + daddu c_2,AT + sd c_3,16(a0) + + dmultu a_0,a_3 /* mul_add_c2(a[0],b[3],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + slt c_3,t_2,zero + dsll t_2,1 + slt a2,t_1,zero + daddu t_2,a2 + dsll t_1,1 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sltu AT,c_2,t_2 + daddu c_3,AT + dmultu a_1,a_2 /* mul_add_c(a2[1],b[2],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + slt AT,t_2,zero + daddu c_3,AT + dsll t_2,1 + slt a2,t_1,zero + daddu t_2,a2 + dsll t_1,1 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sltu AT,c_2,t_2 + daddu c_3,AT + sd c_1,24(a0) + + dmultu a_3,a_1 /* mul_add_c2(a[3],b[1],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + slt c_1,t_2,zero + dsll t_2,1 + slt a2,t_1,zero + daddu t_2,a2 + dsll t_1,1 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu AT,c_3,t_2 + daddu c_1,AT + dmultu a_2,a_2 /* mul_add_c(a[2],b[2],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu AT,c_3,t_2 + daddu c_1,AT + sd c_2,32(a0) + + dmultu a_2,a_3 /* mul_add_c2(a[2],b[3],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + slt c_2,t_2,zero + dsll t_2,1 + slt a2,t_1,zero + daddu t_2,a2 + dsll t_1,1 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sltu AT,c_1,t_2 + daddu c_2,AT + sd c_3,40(a0) + + dmultu a_3,a_3 /* mul_add_c(a[3],b[3],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sd c_1,48(a0) + sd c_2,56(a0) + + jr ra +END(bn_sqr_comba4) diff --git a/crypto/bn/asm/modexp512-x86_64.pl b/crypto/bn/asm/modexp512-x86_64.pl deleted file mode 100755 index bfd6e97..0000000 --- a/crypto/bn/asm/modexp512-x86_64.pl +++ /dev/null @@ -1,1497 +0,0 @@ -#!/usr/bin/env perl -# -# Copyright (c) 2010-2011 Intel Corp. -# Author: Vinodh.Gopal@intel.com -# Jim Guilford -# Erdinc.Ozturk@intel.com -# Maxim.Perminov@intel.com -# -# More information about algorithm used can be found at: -# http://www.cse.buffalo.edu/srds2009/escs2009_submission_Gopal.pdf -# -# ==================================================================== -# Copyright (c) 2011 The OpenSSL Project. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# -# 1. Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in -# the documentation and/or other materials provided with the -# distribution. -# -# 3. All advertising materials mentioning features or use of this -# software must display the following acknowledgment: -# "This product includes software developed by the OpenSSL Project -# for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" -# -# 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to -# endorse or promote products derived from this software without -# prior written permission. For written permission, please contact -# licensing@OpenSSL.org. -# -# 5. Products derived from this software may not be called "OpenSSL" -# nor may "OpenSSL" appear in their names without prior written -# permission of the OpenSSL Project. -# -# 6. Redistributions of any form whatsoever must retain the following -# acknowledgment: -# "This product includes software developed by the OpenSSL Project -# for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" -# -# THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY -# EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR -# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT -# NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) -# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, -# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED -# OF THE POSSIBILITY OF SUCH DAMAGE. -# ==================================================================== - -$flavour = shift; -$output = shift; -if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } - -my $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); - -$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; -( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or -( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or -die "can't locate x86_64-xlate.pl"; - -open OUT,"| \"$^X\" $xlate $flavour $output"; -*STDOUT=*OUT; - -use strict; -my $code=".text\n\n"; -my $m=0; - -# -# Define x512 macros -# - -#MULSTEP_512_ADD MACRO x7, x6, x5, x4, x3, x2, x1, x0, dst, src1, src2, add_src, tmp1, tmp2 -# -# uses rax, rdx, and args -sub MULSTEP_512_ADD -{ - my ($x, $DST, $SRC2, $ASRC, $OP, $TMP)=@_; - my @X=@$x; # make a copy -$code.=<<___; - mov (+8*0)($SRC2), %rax - mul $OP # rdx:rax = %OP * [0] - mov ($ASRC), $X[0] - add %rax, $X[0] - adc \$0, %rdx - mov $X[0], $DST -___ -for(my $i=1;$i<8;$i++) { -$code.=<<___; - mov %rdx, $TMP - - mov (+8*$i)($SRC2), %rax - mul $OP # rdx:rax = %OP * [$i] - mov (+8*$i)($ASRC), $X[$i] - add %rax, $X[$i] - adc \$0, %rdx - add $TMP, $X[$i] - adc \$0, %rdx -___ -} -$code.=<<___; - mov %rdx, $X[0] -___ -} - -#MULSTEP_512 MACRO x7, x6, x5, x4, x3, x2, x1, x0, dst, src2, src1_val, tmp -# -# uses rax, rdx, and args -sub MULSTEP_512 -{ - my ($x, $DST, $SRC2, $OP, $TMP)=@_; - my @X=@$x; # make a copy -$code.=<<___; - mov (+8*0)($SRC2), %rax - mul $OP # rdx:rax = %OP * [0] - add %rax, $X[0] - adc \$0, %rdx - mov $X[0], $DST -___ -for(my $i=1;$i<8;$i++) { -$code.=<<___; - mov %rdx, $TMP - - mov (+8*$i)($SRC2), %rax - mul $OP # rdx:rax = %OP * [$i] - add %rax, $X[$i] - adc \$0, %rdx - add $TMP, $X[$i] - adc \$0, %rdx -___ -} -$code.=<<___; - mov %rdx, $X[0] -___ -} - -# -# Swizzle Macros -# - -# macro to copy data from flat space to swizzled table -#MACRO swizzle pDst, pSrc, tmp1, tmp2 -# pDst and pSrc are modified -sub swizzle -{ - my ($pDst, $pSrc, $cnt, $d0)=@_; -$code.=<<___; - mov \$8, $cnt -loop_$m: - mov ($pSrc), $d0 - mov $d0#w, ($pDst) - shr \$16, $d0 - mov $d0#w, (+64*1)($pDst) - shr \$16, $d0 - mov $d0#w, (+64*2)($pDst) - shr \$16, $d0 - mov $d0#w, (+64*3)($pDst) - lea 8($pSrc), $pSrc - lea 64*4($pDst), $pDst - dec $cnt - jnz loop_$m -___ - - $m++; -} - -# macro to copy data from swizzled table to flat space -#MACRO unswizzle pDst, pSrc, tmp*3 -sub unswizzle -{ - my ($pDst, $pSrc, $cnt, $d0, $d1)=@_; -$code.=<<___; - mov \$4, $cnt -loop_$m: - movzxw (+64*3+256*0)($pSrc), $d0 - movzxw (+64*3+256*1)($pSrc), $d1 - shl \$16, $d0 - shl \$16, $d1 - mov (+64*2+256*0)($pSrc), $d0#w - mov (+64*2+256*1)($pSrc), $d1#w - shl \$16, $d0 - shl \$16, $d1 - mov (+64*1+256*0)($pSrc), $d0#w - mov (+64*1+256*1)($pSrc), $d1#w - shl \$16, $d0 - shl \$16, $d1 - mov (+64*0+256*0)($pSrc), $d0#w - mov (+64*0+256*1)($pSrc), $d1#w - mov $d0, (+8*0)($pDst) - mov $d1, (+8*1)($pDst) - lea 256*2($pSrc), $pSrc - lea 8*2($pDst), $pDst - sub \$1, $cnt - jnz loop_$m -___ - - $m++; -} - -# -# Data Structures -# - -# Reduce Data -# -# -# Offset Value -# 0C0 Carries -# 0B8 X2[10] -# 0B0 X2[9] -# 0A8 X2[8] -# 0A0 X2[7] -# 098 X2[6] -# 090 X2[5] -# 088 X2[4] -# 080 X2[3] -# 078 X2[2] -# 070 X2[1] -# 068 X2[0] -# 060 X1[12] P[10] -# 058 X1[11] P[9] Z[8] -# 050 X1[10] P[8] Z[7] -# 048 X1[9] P[7] Z[6] -# 040 X1[8] P[6] Z[5] -# 038 X1[7] P[5] Z[4] -# 030 X1[6] P[4] Z[3] -# 028 X1[5] P[3] Z[2] -# 020 X1[4] P[2] Z[1] -# 018 X1[3] P[1] Z[0] -# 010 X1[2] P[0] Y[2] -# 008 X1[1] Q[1] Y[1] -# 000 X1[0] Q[0] Y[0] - -my $X1_offset = 0; # 13 qwords -my $X2_offset = $X1_offset + 13*8; # 11 qwords -my $Carries_offset = $X2_offset + 11*8; # 1 qword -my $Q_offset = 0; # 2 qwords -my $P_offset = $Q_offset + 2*8; # 11 qwords -my $Y_offset = 0; # 3 qwords -my $Z_offset = $Y_offset + 3*8; # 9 qwords - -my $Red_Data_Size = $Carries_offset + 1*8; # (25 qwords) - -# -# Stack Frame -# -# -# offset value -# ... <old stack contents> -# ... -# 280 Garray - -# 278 tmp16[15] -# ... ... -# 200 tmp16[0] - -# 1F8 tmp[7] -# ... ... -# 1C0 tmp[0] - -# 1B8 GT[7] -# ... ... -# 180 GT[0] - -# 178 Reduce Data -# ... ... -# 0B8 Reduce Data -# 0B0 reserved -# 0A8 reserved -# 0A0 reserved -# 098 reserved -# 090 reserved -# 088 reduce result addr -# 080 exp[8] - -# ... -# 048 exp[1] -# 040 exp[0] - -# 038 reserved -# 030 loop_idx -# 028 pg -# 020 i -# 018 pData ; arg 4 -# 010 pG ; arg 2 -# 008 pResult ; arg 1 -# 000 rsp ; stack pointer before subtract - -my $rsp_offset = 0; -my $pResult_offset = 8*1 + $rsp_offset; -my $pG_offset = 8*1 + $pResult_offset; -my $pData_offset = 8*1 + $pG_offset; -my $i_offset = 8*1 + $pData_offset; -my $pg_offset = 8*1 + $i_offset; -my $loop_idx_offset = 8*1 + $pg_offset; -my $reserved1_offset = 8*1 + $loop_idx_offset; -my $exp_offset = 8*1 + $reserved1_offset; -my $red_result_addr_offset= 8*9 + $exp_offset; -my $reserved2_offset = 8*1 + $red_result_addr_offset; -my $Reduce_Data_offset = 8*5 + $reserved2_offset; -my $GT_offset = $Red_Data_Size + $Reduce_Data_offset; -my $tmp_offset = 8*8 + $GT_offset; -my $tmp16_offset = 8*8 + $tmp_offset; -my $garray_offset = 8*16 + $tmp16_offset; -my $mem_size = 8*8*32 + $garray_offset; - -# -# Offsets within Reduce Data -# -# -# struct MODF_2FOLD_MONT_512_C1_DATA { -# UINT64 t[8][8]; -# UINT64 m[8]; -# UINT64 m1[8]; /* 2^768 % m */ -# UINT64 m2[8]; /* 2^640 % m */ -# UINT64 k1[2]; /* (- 1/m) % 2^128 */ -# }; - -my $T = 0; -my $M = 512; # = 8 * 8 * 8 -my $M1 = 576; # = 8 * 8 * 9 /* += 8 * 8 */ -my $M2 = 640; # = 8 * 8 * 10 /* += 8 * 8 */ -my $K1 = 704; # = 8 * 8 * 11 /* += 8 * 8 */ - -# -# FUNCTIONS -# - -{{{ -# -# MULADD_128x512 : Function to multiply 128-bits (2 qwords) by 512-bits (8 qwords) -# and add 512-bits (8 qwords) -# to get 640 bits (10 qwords) -# Input: 128-bit mul source: [rdi+8*1], rbp -# 512-bit mul source: [rsi+8*n] -# 512-bit add source: r15, r14, ..., r9, r8 -# Output: r9, r8, r15, r14, r13, r12, r11, r10, [rcx+8*1], [rcx+8*0] -# Clobbers all regs except: rcx, rsi, rdi -$code.=<<___; -.type MULADD_128x512,\@abi-omnipotent -.align 16 -MULADD_128x512: -___ - &MULSTEP_512([map("%r$_",(8..15))], "(+8*0)(%rcx)", "%rsi", "%rbp", "%rbx"); -$code.=<<___; - mov (+8*1)(%rdi), %rbp -___ - &MULSTEP_512([map("%r$_",(9..15,8))], "(+8*1)(%rcx)", "%rsi", "%rbp", "%rbx"); -$code.=<<___; - ret -.size MULADD_128x512,.-MULADD_128x512 -___ -}}} - -{{{ -#MULADD_256x512 MACRO pDst, pA, pB, OP, TMP, X7, X6, X5, X4, X3, X2, X1, X0 -# -# Inputs: pDst: Destination (768 bits, 12 qwords) -# pA: Multiplicand (1024 bits, 16 qwords) -# pB: Multiplicand (512 bits, 8 qwords) -# Dst = Ah * B + Al -# where Ah is (in qwords) A[15:12] (256 bits) and Al is A[7:0] (512 bits) -# Results in X3 X2 X1 X0 X7 X6 X5 X4 Dst[3:0] -# Uses registers: arguments, RAX, RDX -sub MULADD_256x512 -{ - my ($pDst, $pA, $pB, $OP, $TMP, $X)=@_; -$code.=<<___; - mov (+8*12)($pA), $OP -___ - &MULSTEP_512_ADD($X, "(+8*0)($pDst)", $pB, $pA, $OP, $TMP); - push(@$X,shift(@$X)); - -$code.=<<___; - mov (+8*13)($pA), $OP -___ - &MULSTEP_512($X, "(+8*1)($pDst)", $pB, $OP, $TMP); - push(@$X,shift(@$X)); - -$code.=<<___; - mov (+8*14)($pA), $OP -___ - &MULSTEP_512($X, "(+8*2)($pDst)", $pB, $OP, $TMP); - push(@$X,shift(@$X)); - -$code.=<<___; - mov (+8*15)($pA), $OP -___ - &MULSTEP_512($X, "(+8*3)($pDst)", $pB, $OP, $TMP); - push(@$X,shift(@$X)); -} - -# -# mont_reduce(UINT64 *x, /* 1024 bits, 16 qwords */ -# UINT64 *m, /* 512 bits, 8 qwords */ -# MODF_2FOLD_MONT_512_C1_DATA *data, -# UINT64 *r) /* 512 bits, 8 qwords */ -# Input: x (number to be reduced): tmp16 (Implicit) -# m (modulus): [pM] (Implicit) -# data (reduce data): [pData] (Implicit) -# Output: r (result): Address in [red_res_addr] -# result also in: r9, r8, r15, r14, r13, r12, r11, r10 - -my @X=map("%r$_",(8..15)); - -$code.=<<___; -.type mont_reduce,\@abi-omnipotent -.align 16 -mont_reduce: -___ - -my $STACK_DEPTH = 8; - # - # X1 = Xh * M1 + Xl -$code.=<<___; - lea (+$Reduce_Data_offset+$X1_offset+$STACK_DEPTH)(%rsp), %rdi # pX1 (Dst) 769 bits, 13 qwords - mov (+$pData_offset+$STACK_DEPTH)(%rsp), %rsi # pM1 (Bsrc) 512 bits, 8 qwords - add \$$M1, %rsi - lea (+$tmp16_offset+$STACK_DEPTH)(%rsp), %rcx # X (Asrc) 1024 bits, 16 qwords - -___ - - &MULADD_256x512("%rdi", "%rcx", "%rsi", "%rbp", "%rbx", \@X); # rotates @X 4 times - # results in r11, r10, r9, r8, r15, r14, r13, r12, X1[3:0] - -$code.=<<___; - xor %rax, %rax - # X1 += xl - add (+8*8)(%rcx), $X[4] - adc (+8*9)(%rcx), $X[5] - adc (+8*10)(%rcx), $X[6] - adc (+8*11)(%rcx), $X[7] - adc \$0, %rax - # X1 is now rax, r11-r8, r15-r12, tmp16[3:0] - - # - # check for carry ;; carry stored in rax - mov $X[4], (+8*8)(%rdi) # rdi points to X1 - mov $X[5], (+8*9)(%rdi) - mov $X[6], %rbp - mov $X[7], (+8*11)(%rdi) - - mov %rax, (+$Reduce_Data_offset+$Carries_offset+$STACK_DEPTH)(%rsp) - - mov (+8*0)(%rdi), $X[4] - mov (+8*1)(%rdi), $X[5] - mov (+8*2)(%rdi), $X[6] - mov (+8*3)(%rdi), $X[7] - - # X1 is now stored in: X1[11], rbp, X1[9:8], r15-r8 - # rdi -> X1 - # rsi -> M1 - - # - # X2 = Xh * M2 + Xl - # do first part (X2 = Xh * M2) - add \$8*10, %rdi # rdi -> pXh ; 128 bits, 2 qwords - # Xh is actually { [rdi+8*1], rbp } - add \$`$M2-$M1`, %rsi # rsi -> M2 - lea (+$Reduce_Data_offset+$X2_offset+$STACK_DEPTH)(%rsp), %rcx # rcx -> pX2 ; 641 bits, 11 qwords -___ - unshift(@X,pop(@X)); unshift(@X,pop(@X)); -$code.=<<___; - - call MULADD_128x512 # args in rcx, rdi / rbp, rsi, r15-r8 - # result in r9, r8, r15, r14, r13, r12, r11, r10, X2[1:0] - mov (+$Reduce_Data_offset+$Carries_offset+$STACK_DEPTH)(%rsp), %rax - - # X2 += Xl - add (+8*8-8*10)(%rdi), $X[6] # (-8*10) is to adjust rdi -> Xh to Xl - adc (+8*9-8*10)(%rdi), $X[7] - mov $X[6], (+8*8)(%rcx) - mov $X[7], (+8*9)(%rcx) - - adc %rax, %rax - mov %rax, (+$Reduce_Data_offset+$Carries_offset+$STACK_DEPTH)(%rsp) - - lea (+$Reduce_Data_offset+$Q_offset+$STACK_DEPTH)(%rsp), %rdi # rdi -> pQ ; 128 bits, 2 qwords - add \$`$K1-$M2`, %rsi # rsi -> pK1 ; 128 bits, 2 qwords - - # MUL_128x128t128 rdi, rcx, rsi ; Q = X2 * K1 (bottom half) - # B1:B0 = rsi[1:0] = K1[1:0] - # A1:A0 = rcx[1:0] = X2[1:0] - # Result = rdi[1],rbp = Q[1],rbp - mov (%rsi), %r8 # B0 - mov (+8*1)(%rsi), %rbx # B1 - - mov (%rcx), %rax # A0 - mul %r8 # B0 - mov %rax, %rbp - mov %rdx, %r9 - - mov (+8*1)(%rcx), %rax # A1 - mul %r8 # B0 - add %rax, %r9 - - mov (%rcx), %rax # A0 - mul %rbx # B1 - add %rax, %r9 - - mov %r9, (+8*1)(%rdi) - # end MUL_128x128t128 - - sub \$`$K1-$M`, %rsi - - mov (%rcx), $X[6] - mov (+8*1)(%rcx), $X[7] # r9:r8 = X2[1:0] - - call MULADD_128x512 # args in rcx, rdi / rbp, rsi, r15-r8 - # result in r9, r8, r15, r14, r13, r12, r11, r10, X2[1:0] - - # load first half of m to rdx, rdi, rbx, rax - # moved this here for efficiency - mov (+8*0)(%rsi), %rax - mov (+8*1)(%rsi), %rbx - mov (+8*2)(%rsi), %rdi - mov (+8*3)(%rsi), %rdx - - # continue with reduction - mov (+$Reduce_Data_offset+$Carries_offset+$STACK_DEPTH)(%rsp), %rbp - - add (+8*8)(%rcx), $X[6] - adc (+8*9)(%rcx), $X[7] - - #accumulate the final carry to rbp - adc %rbp, %rbp - - # Add in overflow corrections: R = (X2>>128) += T[overflow] - # R = {r9, r8, r15, r14, ..., r10} - shl \$3, %rbp - mov (+$pData_offset+$STACK_DEPTH)(%rsp), %rcx # rsi -> Data (and points to T) - add %rcx, %rbp # pT ; 512 bits, 8 qwords, spread out - - # rsi will be used to generate a mask after the addition - xor %rsi, %rsi - - add (+8*8*0)(%rbp), $X[0] - adc (+8*8*1)(%rbp), $X[1] - adc (+8*8*2)(%rbp), $X[2] - adc (+8*8*3)(%rbp), $X[3] - adc (+8*8*4)(%rbp), $X[4] - adc (+8*8*5)(%rbp), $X[5] - adc (+8*8*6)(%rbp), $X[6] - adc (+8*8*7)(%rbp), $X[7] - - # if there is a carry: rsi = 0xFFFFFFFFFFFFFFFF - # if carry is clear: rsi = 0x0000000000000000 - sbb \$0, %rsi - - # if carry is clear, subtract 0. Otherwise, subtract 256 bits of m - and %rsi, %rax - and %rsi, %rbx - and %rsi, %rdi - and %rsi, %rdx - - mov \$1, %rbp - sub %rax, $X[0] - sbb %rbx, $X[1] - sbb %rdi, $X[2] - sbb %rdx, $X[3] - - # if there is a borrow: rbp = 0 - # if there is no borrow: rbp = 1 - # this is used to save the borrows in between the first half and the 2nd half of the subtraction of m - sbb \$0, %rbp - - #load second half of m to rdx, rdi, rbx, rax - - add \$$M, %rcx - mov (+8*4)(%rcx), %rax - mov (+8*5)(%rcx), %rbx - mov (+8*6)(%rcx), %rdi - mov (+8*7)(%rcx), %rdx - - # use the rsi mask as before - # if carry is clear, subtract 0. Otherwise, subtract 256 bits of m - and %rsi, %rax - and %rsi, %rbx - and %rsi, %rdi - and %rsi, %rdx - - # if rbp = 0, there was a borrow before, it is moved to the carry flag - # if rbp = 1, there was not a borrow before, carry flag is cleared - sub \$1, %rbp - - sbb %rax, $X[4] - sbb %rbx, $X[5] - sbb %rdi, $X[6] - sbb %rdx, $X[7] - - # write R back to memory - - mov (+$red_result_addr_offset+$STACK_DEPTH)(%rsp), %rsi - mov $X[0], (+8*0)(%rsi) - mov $X[1], (+8*1)(%rsi) - mov $X[2], (+8*2)(%rsi) - mov $X[3], (+8*3)(%rsi) - mov $X[4], (+8*4)(%rsi) - mov $X[5], (+8*5)(%rsi) - mov $X[6], (+8*6)(%rsi) - mov $X[7], (+8*7)(%rsi) - - ret -.size mont_reduce,.-mont_reduce -___ -}}} - -{{{ -#MUL_512x512 MACRO pDst, pA, pB, x7, x6, x5, x4, x3, x2, x1, x0, tmp*2 -# -# Inputs: pDst: Destination (1024 bits, 16 qwords) -# pA: Multiplicand (512 bits, 8 qwords) -# pB: Multiplicand (512 bits, 8 qwords) -# Uses registers rax, rdx, args -# B operand in [pB] and also in x7...x0 -sub MUL_512x512 -{ - my ($pDst, $pA, $pB, $x, $OP, $TMP, $pDst_o)=@_; - my ($pDst, $pDst_o) = ($pDst =~ m/([^+]*)\+?(.*)?/); - my @X=@$x; # make a copy - -$code.=<<___; - mov (+8*0)($pA), $OP - - mov $X[0], %rax - mul $OP # rdx:rax = %OP * [0] - mov %rax, (+$pDst_o+8*0)($pDst) - mov %rdx, $X[0] -___ -for(my $i=1;$i<8;$i++) { -$code.=<<___; - mov $X[$i], %rax - mul $OP # rdx:rax = %OP * [$i] - add %rax, $X[$i-1] - adc \$0, %rdx - mov %rdx, $X[$i] -___ -} - -for(my $i=1;$i<8;$i++) { -$code.=<<___; - mov (+8*$i)($pA), $OP -___ - - &MULSTEP_512(\@X, "(+$pDst_o+8*$i)($pDst)", $pB, $OP, $TMP); - push(@X,shift(@X)); -} - -$code.=<<___; - mov $X[0], (+$pDst_o+8*8)($pDst) - mov $X[1], (+$pDst_o+8*9)($pDst) - mov $X[2], (+$pDst_o+8*10)($pDst) - mov $X[3], (+$pDst_o+8*11)($pDst) - mov $X[4], (+$pDst_o+8*12)($pDst) - mov $X[5], (+$pDst_o+8*13)($pDst) - mov $X[6], (+$pDst_o+8*14)($pDst) - mov $X[7], (+$pDst_o+8*15)($pDst) -___ -} - -# -# mont_mul_a3b : subroutine to compute (Src1 * Src2) % M (all 512-bits) -# Input: src1: Address of source 1: rdi -# src2: Address of source 2: rsi -# Output: dst: Address of destination: [red_res_addr] -# src2 and result also in: r9, r8, r15, r14, r13, r12, r11, r10 -# Temp: Clobbers [tmp16], all registers -$code.=<<___; -.type mont_mul_a3b,\@abi-omnipotent -.align 16 -mont_mul_a3b: - # - # multiply tmp = src1 * src2 - # For multiply: dst = rcx, src1 = rdi, src2 = rsi - # stack depth is extra 8 from call -___ - &MUL_512x512("%rsp+$tmp16_offset+8", "%rdi", "%rsi", [map("%r$_",(10..15,8..9))], "%rbp", "%rbx"); -$code.=<<___; - # - # Dst = tmp % m - # Call reduce(tmp, m, data, dst) - - # tail recursion optimization: jmp to mont_reduce and return from there - jmp mont_reduce - # call mont_reduce - # ret -.size mont_mul_a3b,.-mont_mul_a3b -___ -}}} - -{{{ -#SQR_512 MACRO pDest, pA, x7, x6, x5, x4, x3, x2, x1, x0, tmp*4 -# -# Input in memory [pA] and also in x7...x0 -# Uses all argument registers plus rax and rdx -# -# This version computes all of the off-diagonal terms into memory, -# and then it adds in the diagonal terms - -sub SQR_512 -{ - my ($pDst, $pA, $x, $A, $tmp, $x7, $x6, $pDst_o)=@_; - my ($pDst, $pDst_o) = ($pDst =~ m/([^+]*)\+?(.*)?/); - my @X=@$x; # make a copy -$code.=<<___; - # ------------------ - # first pass 01...07 - # ------------------ - mov $X[0], $A - - mov $X[1],%rax - mul $A - mov %rax, (+$pDst_o+8*1)($pDst) -___ -for(my $i=2;$i<8;$i++) { -$code.=<<___; - mov %rdx, $X[$i-2] - mov $X[$i],%rax - mul $A - add %rax, $X[$i-2] - adc \$0, %rdx -___ -} -$code.=<<___; - mov %rdx, $x7 - - mov $X[0], (+$pDst_o+8*2)($pDst) - - # ------------------ - # second pass 12...17 - # ------------------ - - mov (+8*1)($pA), $A - - mov (+8*2)($pA),%rax - mul $A - add %rax, $X[1] - adc \$0, %rdx - mov $X[1], (+$pDst_o+8*3)($pDst) - - mov %rdx, $X[0] - mov (+8*3)($pA),%rax - mul $A - add %rax, $X[2] - adc \$0, %rdx - add $X[0], $X[2] - adc \$0, %rdx - mov $X[2], (+$pDst_o+8*4)($pDst) - - mov %rdx, $X[0] - mov (+8*4)($pA),%rax - mul $A - add %rax, $X[3] - adc \$0, %rdx - add $X[0], $X[3] - adc \$0, %rdx - - mov %rdx, $X[0] - mov (+8*5)($pA),%rax - mul $A - add %rax, $X[4] - adc \$0, %rdx - add $X[0], $X[4] - adc \$0, %rdx - - mov %rdx, $X[0] - mov $X[6],%rax - mul $A - add %rax, $X[5] - adc \$0, %rdx - add $X[0], $X[5] - adc \$0, %rdx - - mov %rdx, $X[0] - mov $X[7],%rax - mul $A - add %rax, $x7 - adc \$0, %rdx - add $X[0], $x7 - adc \$0, %rdx - - mov %rdx, $X[1] - - # ------------------ - # third pass 23...27 - # ------------------ - mov (+8*2)($pA), $A - - mov (+8*3)($pA),%rax - mul $A - add %rax, $X[3] - adc \$0, %rdx - mov $X[3], (+$pDst_o+8*5)($pDst) - - mov %rdx, $X[0] - mov (+8*4)($pA),%rax - mul $A - add %rax, $X[4] - adc \$0, %rdx - add $X[0], $X[4] - adc \$0, %rdx - mov $X[4], (+$pDst_o+8*6)($pDst) - - mov %rdx, $X[0] - mov (+8*5)($pA),%rax - mul $A - add %rax, $X[5] - adc \$0, %rdx - add $X[0], $X[5] - adc \$0, %rdx - - mov %rdx, $X[0] - mov $X[6],%rax - mul $A - add %rax, $x7 - adc \$0, %rdx - add $X[0], $x7 - adc \$0, %rdx - - mov %rdx, $X[0] - mov $X[7],%rax - mul $A - add %rax, $X[1] - adc \$0, %rdx - add $X[0], $X[1] - adc \$0, %rdx - - mov %rdx, $X[2] - - # ------------------ - # fourth pass 34...37 - # ------------------ - - mov (+8*3)($pA), $A - - mov (+8*4)($pA),%rax - mul $A - add %rax, $X[5] - adc \$0, %rdx - mov $X[5], (+$pDst_o+8*7)($pDst) - - mov %rdx, $X[0] - mov (+8*5)($pA),%rax - mul $A - add %rax, $x7 - adc \$0, %rdx - add $X[0], $x7 - adc \$0, %rdx - mov $x7, (+$pDst_o+8*8)($pDst) - - mov %rdx, $X[0] - mov $X[6],%rax - mul $A - add %rax, $X[1] - adc \$0, %rdx - add $X[0], $X[1] - adc \$0, %rdx - - mov %rdx, $X[0] - mov $X[7],%rax - mul $A - add %rax, $X[2] - adc \$0, %rdx - add $X[0], $X[2] - adc \$0, %rdx - - mov %rdx, $X[5] - - # ------------------ - # fifth pass 45...47 - # ------------------ - mov (+8*4)($pA), $A - - mov (+8*5)($pA),%rax - mul $A - add %rax, $X[1] - adc \$0, %rdx - mov $X[1], (+$pDst_o+8*9)($pDst) - - mov %rdx, $X[0] - mov $X[6],%rax - mul $A - add %rax, $X[2] - adc \$0, %rdx - add $X[0], $X[2] - adc \$0, %rdx - mov $X[2], (+$pDst_o+8*10)($pDst) - - mov %rdx, $X[0] - mov $X[7],%rax - mul $A - add %rax, $X[5] - adc \$0, %rdx - add $X[0], $X[5] - adc \$0, %rdx - - mov %rdx, $X[1] - - # ------------------ - # sixth pass 56...57 - # ------------------ - mov (+8*5)($pA), $A - - mov $X[6],%rax - mul $A - add %rax, $X[5] - adc \$0, %rdx - mov $X[5], (+$pDst_o+8*11)($pDst) - - mov %rdx, $X[0] - mov $X[7],%rax - mul $A - add %rax, $X[1] - adc \$0, %rdx - add $X[0], $X[1] - adc \$0, %rdx - mov $X[1], (+$pDst_o+8*12)($pDst) - - mov %rdx, $X[2] - - # ------------------ - # seventh pass 67 - # ------------------ - mov $X[6], $A - - mov $X[7],%rax - mul $A - add %rax, $X[2] - adc \$0, %rdx - mov $X[2], (+$pDst_o+8*13)($pDst) - - mov %rdx, (+$pDst_o+8*14)($pDst) - - # start finalize (add in squares, and double off-terms) - mov (+$pDst_o+8*1)($pDst), $X[0] - mov (+$pDst_o+8*2)($pDst), $X[1] - mov (+$pDst_o+8*3)($pDst), $X[2] - mov (+$pDst_o+8*4)($pDst), $X[3] - mov (+$pDst_o+8*5)($pDst), $X[4] - mov (+$pDst_o+8*6)($pDst), $X[5] - - mov (+8*3)($pA), %rax - mul %rax - mov %rax, $x6 - mov %rdx, $X[6] - - add $X[0], $X[0] - adc $X[1], $X[1] - adc $X[2], $X[2] - adc $X[3], $X[3] - adc $X[4], $X[4] - adc $X[5], $X[5] - adc \$0, $X[6] - - mov (+8*0)($pA), %rax - mul %rax - mov %rax, (+$pDst_o+8*0)($pDst) - mov %rdx, $A - - mov (+8*1)($pA), %rax - mul %rax - - add $A, $X[0] - adc %rax, $X[1] - adc \$0, %rdx - - mov %rdx, $A - mov $X[0], (+$pDst_o+8*1)($pDst) - mov $X[1], (+$pDst_o+8*2)($pDst) - - mov (+8*2)($pA), %rax - mul %rax - - add $A, $X[2] - adc %rax, $X[3] - adc \$0, %rdx - - mov %rdx, $A - - mov $X[2], (+$pDst_o+8*3)($pDst) - mov $X[3], (+$pDst_o+8*4)($pDst) - - xor $tmp, $tmp - add $A, $X[4] - adc $x6, $X[5] - adc \$0, $tmp - - mov $X[4], (+$pDst_o+8*5)($pDst) - mov $X[5], (+$pDst_o+8*6)($pDst) - - # %%tmp has 0/1 in column 7 - # %%A6 has a full value in column 7 - - mov (+$pDst_o+8*7)($pDst), $X[0] - mov (+$pDst_o+8*8)($pDst), $X[1] - mov (+$pDst_o+8*9)($pDst), $X[2] - mov (+$pDst_o+8*10)($pDst), $X[3] - mov (+$pDst_o+8*11)($pDst), $X[4] - mov (+$pDst_o+8*12)($pDst), $X[5] - mov (+$pDst_o+8*13)($pDst), $x6 - mov (+$pDst_o+8*14)($pDst), $x7 - - mov $X[7], %rax - mul %rax - mov %rax, $X[7] - mov %rdx, $A - - add $X[0], $X[0] - adc $X[1], $X[1] - adc $X[2], $X[2] - adc $X[3], $X[3] - adc $X[4], $X[4] - adc $X[5], $X[5] - adc $x6, $x6 - adc $x7, $x7 - adc \$0, $A - - add $tmp, $X[0] - - mov (+8*4)($pA), %rax - mul %rax - - add $X[6], $X[0] - adc %rax, $X[1] - adc \$0, %rdx - - mov %rdx, $tmp - - mov $X[0], (+$pDst_o+8*7)($pDst) - mov $X[1], (+$pDst_o+8*8)($pDst) - - mov (+8*5)($pA), %rax - mul %rax - - add $tmp, $X[2] - adc %rax, $X[3] - adc \$0, %rdx - - mov %rdx, $tmp - - mov $X[2], (+$pDst_o+8*9)($pDst) - mov $X[3], (+$pDst_o+8*10)($pDst) - - mov (+8*6)($pA), %rax - mul %rax - - add $tmp, $X[4] - adc %rax, $X[5] - adc \$0, %rdx - - mov $X[4], (+$pDst_o+8*11)($pDst) - mov $X[5], (+$pDst_o+8*12)($pDst) - - add %rdx, $x6 - adc $X[7], $x7 - adc \$0, $A - - mov $x6, (+$pDst_o+8*13)($pDst) - mov $x7, (+$pDst_o+8*14)($pDst) - mov $A, (+$pDst_o+8*15)($pDst) -___ -} - -# -# sqr_reduce: subroutine to compute Result = reduce(Result * Result) -# -# input and result also in: r9, r8, r15, r14, r13, r12, r11, r10 -# -$code.=<<___; -.type sqr_reduce,\@abi-omnipotent -.align 16 -sqr_reduce: - mov (+$pResult_offset+8)(%rsp), %rcx -___ - &SQR_512("%rsp+$tmp16_offset+8", "%rcx", [map("%r$_",(10..15,8..9))], "%rbx", "%rbp", "%rsi", "%rdi"); -$code.=<<___; - # tail recursion optimization: jmp to mont_reduce and return from there - jmp mont_reduce - # call mont_reduce - # ret -.size sqr_reduce,.-sqr_reduce -___ -}}} - -# -# MAIN FUNCTION -# - -#mod_exp_512(UINT64 *result, /* 512 bits, 8 qwords */ -# UINT64 *g, /* 512 bits, 8 qwords */ -# UINT64 *exp, /* 512 bits, 8 qwords */ -# struct mod_ctx_512 *data) - -# window size = 5 -# table size = 2^5 = 32 -#table_entries equ 32 -#table_size equ table_entries * 8 -$code.=<<___; -.globl mod_exp_512 -.type mod_exp_512,\@function,4 -mod_exp_512: - push %rbp - push %rbx - push %r12 - push %r13 - push %r14 - push %r15 - - # adjust stack down and then align it with cache boundary - mov %rsp, %r8 - sub \$$mem_size, %rsp - and \$-64, %rsp - - # store previous stack pointer and arguments - mov %r8, (+$rsp_offset)(%rsp) - mov %rdi, (+$pResult_offset)(%rsp) - mov %rsi, (+$pG_offset)(%rsp) - mov %rcx, (+$pData_offset)(%rsp) -.Lbody: - # transform g into montgomery space - # GT = reduce(g * C2) = reduce(g * (2^256)) - # reduce expects to have the input in [tmp16] - pxor %xmm4, %xmm4 - movdqu (+16*0)(%rsi), %xmm0 - movdqu (+16*1)(%rsi), %xmm1 - movdqu (+16*2)(%rsi), %xmm2 - movdqu (+16*3)(%rsi), %xmm3 - movdqa %xmm4, (+$tmp16_offset+16*0)(%rsp) - movdqa %xmm4, (+$tmp16_offset+16*1)(%rsp) - movdqa %xmm4, (+$tmp16_offset+16*6)(%rsp) - movdqa %xmm4, (+$tmp16_offset+16*7)(%rsp) - movdqa %xmm0, (+$tmp16_offset+16*2)(%rsp) - movdqa %xmm1, (+$tmp16_offset+16*3)(%rsp) - movdqa %xmm2, (+$tmp16_offset+16*4)(%rsp) - movdqa %xmm3, (+$tmp16_offset+16*5)(%rsp) - - # load pExp before rdx gets blown away - movdqu (+16*0)(%rdx), %xmm0 - movdqu (+16*1)(%rdx), %xmm1 - movdqu (+16*2)(%rdx), %xmm2 - movdqu (+16*3)(%rdx), %xmm3 - - lea (+$GT_offset)(%rsp), %rbx - mov %rbx, (+$red_result_addr_offset)(%rsp) - call mont_reduce - - # Initialize tmp = C - lea (+$tmp_offset)(%rsp), %rcx - xor %rax, %rax - mov %rax, (+8*0)(%rcx) - mov %rax, (+8*1)(%rcx) - mov %rax, (+8*3)(%rcx) - mov %rax, (+8*4)(%rcx) - mov %rax, (+8*5)(%rcx) - mov %rax, (+8*6)(%rcx) - mov %rax, (+8*7)(%rcx) - mov %rax, (+$exp_offset+8*8)(%rsp) - movq \$1, (+8*2)(%rcx) - - lea (+$garray_offset)(%rsp), %rbp - mov %rcx, %rsi # pTmp - mov %rbp, %rdi # Garray[][0] -___ - - &swizzle("%rdi", "%rcx", "%rax", "%rbx"); - - # for (rax = 31; rax != 0; rax--) { - # tmp = reduce(tmp * G) - # swizzle(pg, tmp); - # pg += 2; } -$code.=<<___; - mov \$31, %rax - mov %rax, (+$i_offset)(%rsp) - mov %rbp, (+$pg_offset)(%rsp) - # rsi -> pTmp - mov %rsi, (+$red_result_addr_offset)(%rsp) - mov (+8*0)(%rsi), %r10 - mov (+8*1)(%rsi), %r11 - mov (+8*2)(%rsi), %r12 - mov (+8*3)(%rsi), %r13 - mov (+8*4)(%rsi), %r14 - mov (+8*5)(%rsi), %r15 - mov (+8*6)(%rsi), %r8 - mov (+8*7)(%rsi), %r9 -init_loop: - lea (+$GT_offset)(%rsp), %rdi - call mont_mul_a3b - lea (+$tmp_offset)(%rsp), %rsi - mov (+$pg_offset)(%rsp), %rbp - add \$2, %rbp - mov %rbp, (+$pg_offset)(%rsp) - mov %rsi, %rcx # rcx = rsi = addr of tmp -___ - - &swizzle("%rbp", "%rcx", "%rax", "%rbx"); -$code.=<<___; - mov (+$i_offset)(%rsp), %rax - sub \$1, %rax - mov %rax, (+$i_offset)(%rsp) - jne init_loop - - # - # Copy exponent onto stack - movdqa %xmm0, (+$exp_offset+16*0)(%rsp) - movdqa %xmm1, (+$exp_offset+16*1)(%rsp) - movdqa %xmm2, (+$exp_offset+16*2)(%rsp) - movdqa %xmm3, (+$exp_offset+16*3)(%rsp) - - - # - # Do exponentiation - # Initialize result to G[exp{511:507}] - mov (+$exp_offset+62)(%rsp), %eax - mov %rax, %rdx - shr \$11, %rax - and \$0x07FF, %edx - mov %edx, (+$exp_offset+62)(%rsp) - lea (+$garray_offset)(%rsp,%rax,2), %rsi - mov (+$pResult_offset)(%rsp), %rdx -___ - - &unswizzle("%rdx", "%rsi", "%rbp", "%rbx", "%rax"); - - # - # Loop variables - # rcx = [loop_idx] = index: 510-5 to 0 by 5 -$code.=<<___; - movq \$505, (+$loop_idx_offset)(%rsp) - - mov (+$pResult_offset)(%rsp), %rcx - mov %rcx, (+$red_result_addr_offset)(%rsp) - mov (+8*0)(%rcx), %r10 - mov (+8*1)(%rcx), %r11 - mov (+8*2)(%rcx), %r12 - mov (+8*3)(%rcx), %r13 - mov (+8*4)(%rcx), %r14 - mov (+8*5)(%rcx), %r15 - mov (+8*6)(%rcx), %r8 - mov (+8*7)(%rcx), %r9 - jmp sqr_2 - -main_loop_a3b: - call sqr_reduce - call sqr_reduce - call sqr_reduce -sqr_2: - call sqr_reduce - call sqr_reduce - - # - # Do multiply, first look up proper value in Garray - mov (+$loop_idx_offset)(%rsp), %rcx # bit index - mov %rcx, %rax - shr \$4, %rax # rax is word pointer - mov (+$exp_offset)(%rsp,%rax,2), %edx - and \$15, %rcx - shrq %cl, %rdx - and \$0x1F, %rdx - - lea (+$garray_offset)(%rsp,%rdx,2), %rsi - lea (+$tmp_offset)(%rsp), %rdx - mov %rdx, %rdi -___ - - &unswizzle("%rdx", "%rsi", "%rbp", "%rbx", "%rax"); - # rdi = tmp = pG - - # - # Call mod_mul_a1(pDst, pSrc1, pSrc2, pM, pData) - # result result pG M Data -$code.=<<___; - mov (+$pResult_offset)(%rsp), %rsi - call mont_mul_a3b - - # - # finish loop - mov (+$loop_idx_offset)(%rsp), %rcx - sub \$5, %rcx - mov %rcx, (+$loop_idx_offset)(%rsp) - jge main_loop_a3b - - # - -end_main_loop_a3b: - # transform result out of Montgomery space - # result = reduce(result) - mov (+$pResult_offset)(%rsp), %rdx - pxor %xmm4, %xmm4 - movdqu (+16*0)(%rdx), %xmm0 - movdqu (+16*1)(%rdx), %xmm1 - movdqu (+16*2)(%rdx), %xmm2 - movdqu (+16*3)(%rdx), %xmm3 - movdqa %xmm4, (+$tmp16_offset+16*4)(%rsp) - movdqa %xmm4, (+$tmp16_offset+16*5)(%rsp) - movdqa %xmm4, (+$tmp16_offset+16*6)(%rsp) - movdqa %xmm4, (+$tmp16_offset+16*7)(%rsp) - movdqa %xmm0, (+$tmp16_offset+16*0)(%rsp) - movdqa %xmm1, (+$tmp16_offset+16*1)(%rsp) - movdqa %xmm2, (+$tmp16_offset+16*2)(%rsp) - movdqa %xmm3, (+$tmp16_offset+16*3)(%rsp) - call mont_reduce - - # If result > m, subract m - # load result into r15:r8 - mov (+$pResult_offset)(%rsp), %rax - mov (+8*0)(%rax), %r8 - mov (+8*1)(%rax), %r9 - mov (+8*2)(%rax), %r10 - mov (+8*3)(%rax), %r11 - mov (+8*4)(%rax), %r12 - mov (+8*5)(%rax), %r13 - mov (+8*6)(%rax), %r14 - mov (+8*7)(%rax), %r15 - - # subtract m - mov (+$pData_offset)(%rsp), %rbx - add \$$M, %rbx - - sub (+8*0)(%rbx), %r8 - sbb (+8*1)(%rbx), %r9 - sbb (+8*2)(%rbx), %r10 - sbb (+8*3)(%rbx), %r11 - sbb (+8*4)(%rbx), %r12 - sbb (+8*5)(%rbx), %r13 - sbb (+8*6)(%rbx), %r14 - sbb (+8*7)(%rbx), %r15 - - # if Carry is clear, replace result with difference - mov (+8*0)(%rax), %rsi - mov (+8*1)(%rax), %rdi - mov (+8*2)(%rax), %rcx - mov (+8*3)(%rax), %rdx - cmovnc %r8, %rsi - cmovnc %r9, %rdi - cmovnc %r10, %rcx - cmovnc %r11, %rdx - mov %rsi, (+8*0)(%rax) - mov %rdi, (+8*1)(%rax) - mov %rcx, (+8*2)(%rax) - mov %rdx, (+8*3)(%rax) - - mov (+8*4)(%rax), %rsi - mov (+8*5)(%rax), %rdi - mov (+8*6)(%rax), %rcx - mov (+8*7)(%rax), %rdx - cmovnc %r12, %rsi - cmovnc %r13, %rdi - cmovnc %r14, %rcx - cmovnc %r15, %rdx - mov %rsi, (+8*4)(%rax) - mov %rdi, (+8*5)(%rax) - mov %rcx, (+8*6)(%rax) - mov %rdx, (+8*7)(%rax) - - mov (+$rsp_offset)(%rsp), %rsi - mov 0(%rsi),%r15 - mov 8(%rsi),%r14 - mov 16(%rsi),%r13 - mov 24(%rsi),%r12 - mov 32(%rsi),%rbx - mov 40(%rsi),%rbp - lea 48(%rsi),%rsp -.Lepilogue: - ret -.size mod_exp_512, . - mod_exp_512 -___ - -if ($win64) { -# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, -# CONTEXT *context,DISPATCHER_CONTEXT *disp) -my $rec="%rcx"; -my $frame="%rdx"; -my $context="%r8"; -my $disp="%r9"; - -$code.=<<___; -.extern __imp_RtlVirtualUnwind -.type mod_exp_512_se_handler,\@abi-omnipotent -.align 16 -mod_exp_512_se_handler: - push %rsi - push %rdi - push %rbx - push %rbp - push %r12 - push %r13 - push %r14 - push %r15 - pushfq - sub \$64,%rsp - - mov 120($context),%rax # pull context->Rax - mov 248($context),%rbx # pull context->Rip - - lea .Lbody(%rip),%r10 - cmp %r10,%rbx # context->Rip<prologue label - jb .Lin_prologue - - mov 152($context),%rax # pull context->Rsp - - lea .Lepilogue(%rip),%r10 - cmp %r10,%rbx # context->Rip>=epilogue label - jae .Lin_prologue - - mov $rsp_offset(%rax),%rax # pull saved Rsp - - mov 32(%rax),%rbx - mov 40(%rax),%rbp - mov 24(%rax),%r12 - mov 16(%rax),%r13 - mov 8(%rax),%r14 - mov 0(%rax),%r15 - lea 48(%rax),%rax - mov %rbx,144($context) # restore context->Rbx - mov %rbp,160($context) # restore context->Rbp - mov %r12,216($context) # restore context->R12 - mov %r13,224($context) # restore context->R13 - mov %r14,232($context) # restore context->R14 - mov %r15,240($context) # restore context->R15 - -.Lin_prologue: - mov 8(%rax),%rdi - mov 16(%rax),%rsi - mov %rax,152($context) # restore context->Rsp - mov %rsi,168($context) # restore context->Rsi - mov %rdi,176($context) # restore context->Rdi - - mov 40($disp),%rdi # disp->ContextRecord - mov $context,%rsi # context - mov \$154,%ecx # sizeof(CONTEXT) - .long 0xa548f3fc # cld; rep movsq - - mov $disp,%rsi - xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER - mov 8(%rsi),%rdx # arg2, disp->ImageBase - mov 0(%rsi),%r8 # arg3, disp->ControlPc - mov 16(%rsi),%r9 # arg4, disp->FunctionEntry - mov 40(%rsi),%r10 # disp->ContextRecord - lea 56(%rsi),%r11 # &disp->HandlerData - lea 24(%rsi),%r12 # &disp->EstablisherFrame - mov %r10,32(%rsp) # arg5 - mov %r11,40(%rsp) # arg6 - mov %r12,48(%rsp) # arg7 - mov %rcx,56(%rsp) # arg8, (NULL) - call *__imp_RtlVirtualUnwind(%rip) - - mov \$1,%eax # ExceptionContinueSearch - add \$64,%rsp - popfq - pop %r15 - pop %r14 - pop %r13 - pop %r12 - pop %rbp - pop %rbx - pop %rdi - pop %rsi - ret -.size mod_exp_512_se_handler,.-mod_exp_512_se_handler - -.section .pdata -.align 4 - .rva .LSEH_begin_mod_exp_512 - .rva .LSEH_end_mod_exp_512 - .rva .LSEH_info_mod_exp_512 - -.section .xdata -.align 8 -.LSEH_info_mod_exp_512: - .byte 9,0,0,0 - .rva mod_exp_512_se_handler -___ -} - -sub reg_part { -my ($reg,$conv)=@_; - if ($reg =~ /%r[0-9]+/) { $reg .= $conv; } - elsif ($conv eq "b") { $reg =~ s/%[er]([^x]+)x?/%$1l/; } - elsif ($conv eq "w") { $reg =~ s/%[er](.+)/%$1/; } - elsif ($conv eq "d") { $reg =~ s/%[er](.+)/%e$1/; } - return $reg; -} - -$code =~ s/(%[a-z0-9]+)#([bwd])/reg_part($1,$2)/gem; -$code =~ s/\`([^\`]*)\`/eval $1/gem; -$code =~ s/(\(\+[^)]+\))/eval $1/gem; -print $code; -close STDOUT; diff --git a/crypto/bn/asm/ppc-mont.pl b/crypto/bn/asm/ppc-mont.pl index f9b6992..da69c6a 100755 --- a/crypto/bn/asm/ppc-mont.pl +++ b/crypto/bn/asm/ppc-mont.pl @@ -325,6 +325,7 @@ Lcopy: ; copy or in-place refresh .long 0 .byte 0,12,4,0,0x80,12,6,0 .long 0 +.size .bn_mul_mont_int,.-.bn_mul_mont_int .asciz "Montgomery Multiplication for PPC, CRYPTOGAMS by <appro\@openssl.org>" ___ diff --git a/crypto/bn/asm/ppc.pl b/crypto/bn/asm/ppc.pl index 1249ce2..04df1fe 100644 --- a/crypto/bn/asm/ppc.pl +++ b/crypto/bn/asm/ppc.pl @@ -392,6 +392,7 @@ $data=<<EOF; .long 0 .byte 0,12,0x14,0,0,0,2,0 .long 0 +.size .bn_sqr_comba4,.-.bn_sqr_comba4 # # NOTE: The following label name should be changed to @@ -819,6 +820,7 @@ $data=<<EOF; .long 0 .byte 0,12,0x14,0,0,0,2,0 .long 0 +.size .bn_sqr_comba8,.-.bn_sqr_comba8 # # NOTE: The following label name should be changed to @@ -972,6 +974,7 @@ $data=<<EOF; .long 0 .byte 0,12,0x14,0,0,0,3,0 .long 0 +.size .bn_mul_comba4,.-.bn_mul_comba4 # # NOTE: The following label name should be changed to @@ -1510,6 +1513,7 @@ $data=<<EOF; .long 0 .byte 0,12,0x14,0,0,0,3,0 .long 0 +.size .bn_mul_comba8,.-.bn_mul_comba8 # # NOTE: The following label name should be changed to @@ -1560,6 +1564,7 @@ Lppcasm_sub_adios: .long 0 .byte 0,12,0x14,0,0,0,4,0 .long 0 +.size .bn_sub_words,.-.bn_sub_words # # NOTE: The following label name should be changed to @@ -1605,6 +1610,7 @@ Lppcasm_add_adios: .long 0 .byte 0,12,0x14,0,0,0,4,0 .long 0 +.size .bn_add_words,.-.bn_add_words # # NOTE: The following label name should be changed to @@ -1720,6 +1726,7 @@ Lppcasm_div9: .long 0 .byte 0,12,0x14,0,0,0,3,0 .long 0 +.size .bn_div_words,.-.bn_div_words # # NOTE: The following label name should be changed to @@ -1761,6 +1768,7 @@ Lppcasm_sqr_adios: .long 0 .byte 0,12,0x14,0,0,0,3,0 .long 0 +.size .bn_sqr_words,.-.bn_sqr_words # # NOTE: The following label name should be changed to @@ -1866,6 +1874,7 @@ Lppcasm_mw_OVER: .long 0 .byte 0,12,0x14,0,0,0,4,0 .long 0 +.size bn_mul_words,.-bn_mul_words # # NOTE: The following label name should be changed to @@ -1991,6 +2000,7 @@ Lppcasm_maw_adios: .long 0 .byte 0,12,0x14,0,0,0,4,0 .long 0 +.size .bn_mul_add_words,.-.bn_mul_add_words .align 4 EOF $data =~ s/\`([^\`]*)\`/eval $1/gem; diff --git a/crypto/bn/asm/ppc64-mont.pl b/crypto/bn/asm/ppc64-mont.pl index a14e769..68e3733 100755 --- a/crypto/bn/asm/ppc64-mont.pl +++ b/crypto/bn/asm/ppc64-mont.pl @@ -1,7 +1,7 @@ #!/usr/bin/env perl # ==================================================================== -# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL # project. The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. For further # details see http://www.openssl.org/~appro/cryptogams/. @@ -65,6 +65,14 @@ # others alternative would be to break dependence on upper halves of # GPRs by sticking to 32-bit integer operations... +# December 2012 + +# Remove above mentioned dependence on GPRs' upper halves in 32-bit +# build. No signal masking overhead, but integer instructions are +# *more* numerous... It's still "universally" faster than 32-bit +# ppc-mont.pl, but improvement coefficient is not as impressive +# for longer keys... + $flavour = shift; if ($flavour =~ /32/) { @@ -110,6 +118,9 @@ $tp="r10"; $j="r11"; $i="r12"; # non-volatile registers +$c1="r19"; +$n1="r20"; +$a1="r21"; $nap_d="r22"; # interleaved ap and np in double format $a0="r23"; # ap[0] $t0="r24"; # temporary registers @@ -180,8 +191,8 @@ $T3a="f30"; $T3b="f31"; # . . # +-------------------------------+ # . . -# -12*size_t +-------------------------------+ -# | 10 saved gpr, r22-r31 | +# -13*size_t +-------------------------------+ +# | 13 saved gpr, r19-r31 | # . . # . . # -12*8 +-------------------------------+ @@ -215,6 +226,9 @@ $code=<<___; mr $i,$sp $STUX $sp,$sp,$tp ; alloca + $PUSH r19,`-12*8-13*$SIZE_T`($i) + $PUSH r20,`-12*8-12*$SIZE_T`($i) + $PUSH r21,`-12*8-11*$SIZE_T`($i) $PUSH r22,`-12*8-10*$SIZE_T`($i) $PUSH r23,`-12*8-9*$SIZE_T`($i) $PUSH r24,`-12*8-8*$SIZE_T`($i) @@ -237,40 +251,26 @@ $code=<<___; stfd f29,`-3*8`($i) stfd f30,`-2*8`($i) stfd f31,`-1*8`($i) -___ -$code.=<<___ if ($SIZE_T==8); - ld $a0,0($ap) ; pull ap[0] value - ld $n0,0($n0) ; pull n0[0] value - ld $t3,0($bp) ; bp[0] -___ -$code.=<<___ if ($SIZE_T==4); - mr $t1,$n0 - lwz $a0,0($ap) ; pull ap[0,1] value - lwz $t0,4($ap) - lwz $n0,0($t1) ; pull n0[0,1] value - lwz $t1,4($t1) - lwz $t3,0($bp) ; bp[0,1] - lwz $t2,4($bp) - insrdi $a0,$t0,32,0 - insrdi $n0,$t1,32,0 - insrdi $t3,$t2,32,0 -___ -$code.=<<___; + addi $tp,$sp,`$FRAME+$TRANSFER+8+64` li $i,-64 add $nap_d,$tp,$num and $nap_d,$nap_d,$i ; align to 64 bytes - - mulld $t7,$a0,$t3 ; ap[0]*bp[0] ; nap_d is off by 1, because it's used with stfdu/lfdu addi $nap_d,$nap_d,-8 srwi $j,$num,`3+1` ; counter register, num/2 - mulld $t7,$t7,$n0 ; tp[0]*n0 addi $j,$j,-1 addi $tp,$sp,`$FRAME+$TRANSFER-8` li $carry,0 mtctr $j +___ + +$code.=<<___ if ($SIZE_T==8); + ld $a0,0($ap) ; pull ap[0] value + ld $t3,0($bp) ; bp[0] + ld $n0,0($n0) ; pull n0[0] value + mulld $t7,$a0,$t3 ; ap[0]*bp[0] ; transfer bp[0] to FPU as 4x16-bit values extrdi $t0,$t3,16,48 extrdi $t1,$t3,16,32 @@ -280,6 +280,8 @@ $code.=<<___; std $t1,`$FRAME+8`($sp) std $t2,`$FRAME+16`($sp) std $t3,`$FRAME+24`($sp) + + mulld $t7,$t7,$n0 ; tp[0]*n0 ; transfer (ap[0]*bp[0])*n0 to FPU as 4x16-bit values extrdi $t4,$t7,16,48 extrdi $t5,$t7,16,32 @@ -289,21 +291,61 @@ $code.=<<___; std $t5,`$FRAME+40`($sp) std $t6,`$FRAME+48`($sp) std $t7,`$FRAME+56`($sp) -___ -$code.=<<___ if ($SIZE_T==8); - lwz $t0,4($ap) ; load a[j] as 32-bit word pair - lwz $t1,0($ap) - lwz $t2,12($ap) ; load a[j+1] as 32-bit word pair + + extrdi $t0,$a0,32,32 ; lwz $t0,4($ap) + extrdi $t1,$a0,32,0 ; lwz $t1,0($ap) + lwz $t2,12($ap) ; load a[1] as 32-bit word pair lwz $t3,8($ap) - lwz $t4,4($np) ; load n[j] as 32-bit word pair + lwz $t4,4($np) ; load n[0] as 32-bit word pair lwz $t5,0($np) - lwz $t6,12($np) ; load n[j+1] as 32-bit word pair + lwz $t6,12($np) ; load n[1] as 32-bit word pair lwz $t7,8($np) ___ $code.=<<___ if ($SIZE_T==4); - lwz $t0,0($ap) ; load a[j..j+3] as 32-bit word pairs - lwz $t1,4($ap) - lwz $t2,8($ap) + lwz $a0,0($ap) ; pull ap[0,1] value + mr $n1,$n0 + lwz $a1,4($ap) + li $c1,0 + lwz $t1,0($bp) ; bp[0,1] + lwz $t3,4($bp) + lwz $n0,0($n1) ; pull n0[0,1] value + lwz $n1,4($n1) + + mullw $t4,$a0,$t1 ; mulld ap[0]*bp[0] + mulhwu $t5,$a0,$t1 + mullw $t6,$a1,$t1 + mullw $t7,$a0,$t3 + add $t5,$t5,$t6 + add $t5,$t5,$t7 + ; transfer bp[0] to FPU as 4x16-bit values + extrwi $t0,$t1,16,16 + extrwi $t1,$t1,16,0 + extrwi $t2,$t3,16,16 + extrwi $t3,$t3,16,0 + std $t0,`$FRAME+0`($sp) ; yes, std in 32-bit build + std $t1,`$FRAME+8`($sp) + std $t2,`$FRAME+16`($sp) + std $t3,`$FRAME+24`($sp) + + mullw $t0,$t4,$n0 ; mulld tp[0]*n0 + mulhwu $t1,$t4,$n0 + mullw $t2,$t5,$n0 + mullw $t3,$t4,$n1 + add $t1,$t1,$t2 + add $t1,$t1,$t3 + ; transfer (ap[0]*bp[0])*n0 to FPU as 4x16-bit values + extrwi $t4,$t0,16,16 + extrwi $t5,$t0,16,0 + extrwi $t6,$t1,16,16 + extrwi $t7,$t1,16,0 + std $t4,`$FRAME+32`($sp) ; yes, std in 32-bit build + std $t5,`$FRAME+40`($sp) + std $t6,`$FRAME+48`($sp) + std $t7,`$FRAME+56`($sp) + + mr $t0,$a0 ; lwz $t0,0($ap) + mr $t1,$a1 ; lwz $t1,4($ap) + lwz $t2,8($ap) ; load a[j..j+3] as 32-bit word pairs lwz $t3,12($ap) lwz $t4,0($np) ; load n[j..j+3] as 32-bit word pairs lwz $t5,4($np) @@ -319,7 +361,7 @@ $code.=<<___; lfd $nb,`$FRAME+40`($sp) lfd $nc,`$FRAME+48`($sp) lfd $nd,`$FRAME+56`($sp) - std $t0,`$FRAME+64`($sp) + std $t0,`$FRAME+64`($sp) ; yes, std even in 32-bit build std $t1,`$FRAME+72`($sp) std $t2,`$FRAME+80`($sp) std $t3,`$FRAME+88`($sp) @@ -441,7 +483,7 @@ $code.=<<___ if ($SIZE_T==4); lwz $t7,12($np) ___ $code.=<<___; - std $t0,`$FRAME+64`($sp) + std $t0,`$FRAME+64`($sp) ; yes, std even in 32-bit build std $t1,`$FRAME+72`($sp) std $t2,`$FRAME+80`($sp) std $t3,`$FRAME+88`($sp) @@ -449,6 +491,9 @@ $code.=<<___; std $t5,`$FRAME+104`($sp) std $t6,`$FRAME+112`($sp) std $t7,`$FRAME+120`($sp) +___ +if ($SIZE_T==8 or $flavour =~ /osx/) { +$code.=<<___; ld $t0,`$FRAME+0`($sp) ld $t1,`$FRAME+8`($sp) ld $t2,`$FRAME+16`($sp) @@ -457,6 +502,20 @@ $code.=<<___; ld $t5,`$FRAME+40`($sp) ld $t6,`$FRAME+48`($sp) ld $t7,`$FRAME+56`($sp) +___ +} else { +$code.=<<___; + lwz $t1,`$FRAME+0`($sp) + lwz $t0,`$FRAME+4`($sp) + lwz $t3,`$FRAME+8`($sp) + lwz $t2,`$FRAME+12`($sp) + lwz $t5,`$FRAME+16`($sp) + lwz $t4,`$FRAME+20`($sp) + lwz $t7,`$FRAME+24`($sp) + lwz $t6,`$FRAME+28`($sp) +___ +} +$code.=<<___; lfd $A0,`$FRAME+64`($sp) lfd $A1,`$FRAME+72`($sp) lfd $A2,`$FRAME+80`($sp) @@ -488,7 +547,9 @@ $code.=<<___; fmadd $T0b,$A0,$bb,$dotb stfd $A2,24($nap_d) ; save a[j+1] in double format stfd $A3,32($nap_d) - +___ +if ($SIZE_T==8 or $flavour =~ /osx/) { +$code.=<<___; fmadd $T1a,$A0,$bc,$T1a fmadd $T1b,$A0,$bd,$T1b fmadd $T2a,$A1,$bc,$T2a @@ -561,11 +622,123 @@ $code.=<<___; stfd $T3b,`$FRAME+56`($sp) std $t0,8($tp) ; tp[j-1] stdu $t4,16($tp) ; tp[j] +___ +} else { +$code.=<<___; + fmadd $T1a,$A0,$bc,$T1a + fmadd $T1b,$A0,$bd,$T1b + addc $t0,$t0,$carry + adde $t1,$t1,$c1 + srwi $carry,$t0,16 + fmadd $T2a,$A1,$bc,$T2a + fmadd $T2b,$A1,$bd,$T2b + stfd $N0,40($nap_d) ; save n[j] in double format + stfd $N1,48($nap_d) + srwi $c1,$t1,16 + insrwi $carry,$t1,16,0 + fmadd $T3a,$A2,$bc,$T3a + fmadd $T3b,$A2,$bd,$T3b + addc $t2,$t2,$carry + adde $t3,$t3,$c1 + srwi $carry,$t2,16 + fmul $dota,$A3,$bc + fmul $dotb,$A3,$bd + stfd $N2,56($nap_d) ; save n[j+1] in double format + stfdu $N3,64($nap_d) + insrwi $t0,$t2,16,0 ; 0..31 bits + srwi $c1,$t3,16 + insrwi $carry,$t3,16,0 + + fmadd $T1a,$N1,$na,$T1a + fmadd $T1b,$N1,$nb,$T1b + lwz $t3,`$FRAME+32`($sp) ; permuted $t1 + lwz $t2,`$FRAME+36`($sp) ; permuted $t0 + addc $t4,$t4,$carry + adde $t5,$t5,$c1 + srwi $carry,$t4,16 + fmadd $T2a,$N2,$na,$T2a + fmadd $T2b,$N2,$nb,$T2b + srwi $c1,$t5,16 + insrwi $carry,$t5,16,0 + fmadd $T3a,$N3,$na,$T3a + fmadd $T3b,$N3,$nb,$T3b + addc $t6,$t6,$carry + adde $t7,$t7,$c1 + srwi $carry,$t6,16 + fmadd $T0a,$N0,$na,$T0a + fmadd $T0b,$N0,$nb,$T0b + insrwi $t4,$t6,16,0 ; 32..63 bits + srwi $c1,$t7,16 + insrwi $carry,$t7,16,0 + + fmadd $T1a,$N0,$nc,$T1a + fmadd $T1b,$N0,$nd,$T1b + lwz $t7,`$FRAME+40`($sp) ; permuted $t3 + lwz $t6,`$FRAME+44`($sp) ; permuted $t2 + addc $t2,$t2,$carry + adde $t3,$t3,$c1 + srwi $carry,$t2,16 + fmadd $T2a,$N1,$nc,$T2a + fmadd $T2b,$N1,$nd,$T2b + stw $t0,12($tp) ; tp[j-1] + stw $t4,8($tp) + srwi $c1,$t3,16 + insrwi $carry,$t3,16,0 + fmadd $T3a,$N2,$nc,$T3a + fmadd $T3b,$N2,$nd,$T3b + lwz $t1,`$FRAME+48`($sp) ; permuted $t5 + lwz $t0,`$FRAME+52`($sp) ; permuted $t4 + addc $t6,$t6,$carry + adde $t7,$t7,$c1 + srwi $carry,$t6,16 + fmadd $dota,$N3,$nc,$dota + fmadd $dotb,$N3,$nd,$dotb + insrwi $t2,$t6,16,0 ; 64..95 bits + srwi $c1,$t7,16 + insrwi $carry,$t7,16,0 + + fctid $T0a,$T0a + fctid $T0b,$T0b + lwz $t5,`$FRAME+56`($sp) ; permuted $t7 + lwz $t4,`$FRAME+60`($sp) ; permuted $t6 + addc $t0,$t0,$carry + adde $t1,$t1,$c1 + srwi $carry,$t0,16 + fctid $T1a,$T1a + fctid $T1b,$T1b + srwi $c1,$t1,16 + insrwi $carry,$t1,16,0 + fctid $T2a,$T2a + fctid $T2b,$T2b + addc $t4,$t4,$carry + adde $t5,$t5,$c1 + srwi $carry,$t4,16 + fctid $T3a,$T3a + fctid $T3b,$T3b + insrwi $t0,$t4,16,0 ; 96..127 bits + srwi $c1,$t5,16 + insrwi $carry,$t5,16,0 + + stfd $T0a,`$FRAME+0`($sp) + stfd $T0b,`$FRAME+8`($sp) + stfd $T1a,`$FRAME+16`($sp) + stfd $T1b,`$FRAME+24`($sp) + stfd $T2a,`$FRAME+32`($sp) + stfd $T2b,`$FRAME+40`($sp) + stfd $T3a,`$FRAME+48`($sp) + stfd $T3b,`$FRAME+56`($sp) + stw $t2,20($tp) ; tp[j] + stwu $t0,16($tp) +___ +} +$code.=<<___; bdnz- L1st fctid $dota,$dota fctid $dotb,$dotb - +___ +if ($SIZE_T==8 or $flavour =~ /osx/) { +$code.=<<___; ld $t0,`$FRAME+0`($sp) ld $t1,`$FRAME+8`($sp) ld $t2,`$FRAME+16`($sp) @@ -611,33 +784,117 @@ $code.=<<___; insrdi $t6,$t7,48,0 srdi $ovf,$t7,48 std $t6,8($tp) ; tp[num-1] +___ +} else { +$code.=<<___; + lwz $t1,`$FRAME+0`($sp) + lwz $t0,`$FRAME+4`($sp) + lwz $t3,`$FRAME+8`($sp) + lwz $t2,`$FRAME+12`($sp) + lwz $t5,`$FRAME+16`($sp) + lwz $t4,`$FRAME+20`($sp) + lwz $t7,`$FRAME+24`($sp) + lwz $t6,`$FRAME+28`($sp) + stfd $dota,`$FRAME+64`($sp) + stfd $dotb,`$FRAME+72`($sp) + addc $t0,$t0,$carry + adde $t1,$t1,$c1 + srwi $carry,$t0,16 + insrwi $carry,$t1,16,0 + srwi $c1,$t1,16 + addc $t2,$t2,$carry + adde $t3,$t3,$c1 + srwi $carry,$t2,16 + insrwi $t0,$t2,16,0 ; 0..31 bits + insrwi $carry,$t3,16,0 + srwi $c1,$t3,16 + addc $t4,$t4,$carry + adde $t5,$t5,$c1 + srwi $carry,$t4,16 + insrwi $carry,$t5,16,0 + srwi $c1,$t5,16 + addc $t6,$t6,$carry + adde $t7,$t7,$c1 + srwi $carry,$t6,16 + insrwi $t4,$t6,16,0 ; 32..63 bits + insrwi $carry,$t7,16,0 + srwi $c1,$t7,16 + stw $t0,12($tp) ; tp[j-1] + stw $t4,8($tp) + + lwz $t3,`$FRAME+32`($sp) ; permuted $t1 + lwz $t2,`$FRAME+36`($sp) ; permuted $t0 + lwz $t7,`$FRAME+40`($sp) ; permuted $t3 + lwz $t6,`$FRAME+44`($sp) ; permuted $t2 + lwz $t1,`$FRAME+48`($sp) ; permuted $t5 + lwz $t0,`$FRAME+52`($sp) ; permuted $t4 + lwz $t5,`$FRAME+56`($sp) ; permuted $t7 + lwz $t4,`$FRAME+60`($sp) ; permuted $t6 + + addc $t2,$t2,$carry + adde $t3,$t3,$c1 + srwi $carry,$t2,16 + insrwi $carry,$t3,16,0 + srwi $c1,$t3,16 + addc $t6,$t6,$carry + adde $t7,$t7,$c1 + srwi $carry,$t6,16 + insrwi $t2,$t6,16,0 ; 64..95 bits + insrwi $carry,$t7,16,0 + srwi $c1,$t7,16 + addc $t0,$t0,$carry + adde $t1,$t1,$c1 + srwi $carry,$t0,16 + insrwi $carry,$t1,16,0 + srwi $c1,$t1,16 + addc $t4,$t4,$carry + adde $t5,$t5,$c1 + srwi $carry,$t4,16 + insrwi $t0,$t4,16,0 ; 96..127 bits + insrwi $carry,$t5,16,0 + srwi $c1,$t5,16 + stw $t2,20($tp) ; tp[j] + stwu $t0,16($tp) + + lwz $t7,`$FRAME+64`($sp) + lwz $t6,`$FRAME+68`($sp) + lwz $t5,`$FRAME+72`($sp) + lwz $t4,`$FRAME+76`($sp) + + addc $t6,$t6,$carry + adde $t7,$t7,$c1 + srwi $carry,$t6,16 + insrwi $carry,$t7,16,0 + srwi $c1,$t7,16 + addc $t4,$t4,$carry + adde $t5,$t5,$c1 + + insrwi $t6,$t4,16,0 + srwi $t4,$t4,16 + insrwi $t4,$t5,16,0 + srwi $ovf,$t5,16 + stw $t6,12($tp) ; tp[num-1] + stw $t4,8($tp) +___ +} +$code.=<<___; slwi $t7,$num,2 subf $nap_d,$t7,$nap_d ; rewind pointer li $i,8 ; i=1 .align 5 Louter: -___ -$code.=<<___ if ($SIZE_T==8); - ldx $t3,$bp,$i ; bp[i] -___ -$code.=<<___ if ($SIZE_T==4); - add $t0,$bp,$i - lwz $t3,0($t0) ; bp[i,i+1] - lwz $t0,4($t0) - insrdi $t3,$t0,32,0 -___ -$code.=<<___; - ld $t6,`$FRAME+$TRANSFER+8`($sp) ; tp[0] - mulld $t7,$a0,$t3 ; ap[0]*bp[i] - addi $tp,$sp,`$FRAME+$TRANSFER` - add $t7,$t7,$t6 ; ap[0]*bp[i]+tp[0] li $carry,0 - mulld $t7,$t7,$n0 ; tp[0]*n0 mtctr $j +___ +$code.=<<___ if ($SIZE_T==8); + ldx $t3,$bp,$i ; bp[i] + ld $t6,`$FRAME+$TRANSFER+8`($sp) ; tp[0] + mulld $t7,$a0,$t3 ; ap[0]*bp[i] + add $t7,$t7,$t6 ; ap[0]*bp[i]+tp[0] ; transfer bp[i] to FPU as 4x16-bit values extrdi $t0,$t3,16,48 extrdi $t1,$t3,16,32 @@ -647,6 +904,8 @@ $code.=<<___; std $t1,`$FRAME+8`($sp) std $t2,`$FRAME+16`($sp) std $t3,`$FRAME+24`($sp) + + mulld $t7,$t7,$n0 ; tp[0]*n0 ; transfer (ap[0]*bp[i]+tp[0])*n0 to FPU as 4x16-bit values extrdi $t4,$t7,16,48 extrdi $t5,$t7,16,32 @@ -656,7 +915,50 @@ $code.=<<___; std $t5,`$FRAME+40`($sp) std $t6,`$FRAME+48`($sp) std $t7,`$FRAME+56`($sp) +___ +$code.=<<___ if ($SIZE_T==4); + add $t0,$bp,$i + li $c1,0 + lwz $t1,0($t0) ; bp[i,i+1] + lwz $t3,4($t0) + + mullw $t4,$a0,$t1 ; ap[0]*bp[i] + lwz $t0,`$FRAME+$TRANSFER+8+4`($sp) ; tp[0] + mulhwu $t5,$a0,$t1 + lwz $t2,`$FRAME+$TRANSFER+8`($sp) ; tp[0] + mullw $t6,$a1,$t1 + mullw $t7,$a0,$t3 + add $t5,$t5,$t6 + add $t5,$t5,$t7 + addc $t4,$t4,$t0 ; ap[0]*bp[i]+tp[0] + adde $t5,$t5,$t2 + ; transfer bp[i] to FPU as 4x16-bit values + extrwi $t0,$t1,16,16 + extrwi $t1,$t1,16,0 + extrwi $t2,$t3,16,16 + extrwi $t3,$t3,16,0 + std $t0,`$FRAME+0`($sp) ; yes, std in 32-bit build + std $t1,`$FRAME+8`($sp) + std $t2,`$FRAME+16`($sp) + std $t3,`$FRAME+24`($sp) + mullw $t0,$t4,$n0 ; mulld tp[0]*n0 + mulhwu $t1,$t4,$n0 + mullw $t2,$t5,$n0 + mullw $t3,$t4,$n1 + add $t1,$t1,$t2 + add $t1,$t1,$t3 + ; transfer (ap[0]*bp[i]+tp[0])*n0 to FPU as 4x16-bit values + extrwi $t4,$t0,16,16 + extrwi $t5,$t0,16,0 + extrwi $t6,$t1,16,16 + extrwi $t7,$t1,16,0 + std $t4,`$FRAME+32`($sp) ; yes, std in 32-bit build + std $t5,`$FRAME+40`($sp) + std $t6,`$FRAME+48`($sp) + std $t7,`$FRAME+56`($sp) +___ +$code.=<<___; lfd $A0,8($nap_d) ; load a[j] in double format lfd $A1,16($nap_d) lfd $A2,24($nap_d) ; load a[j+1] in double format @@ -769,7 +1071,9 @@ Linner: fmul $dotb,$A3,$bd lfd $A2,24($nap_d) ; load a[j+1] in double format lfd $A3,32($nap_d) - +___ +if ($SIZE_T==8 or $flavour =~ /osx/) { +$code.=<<___; fmadd $T1a,$N1,$na,$T1a fmadd $T1b,$N1,$nb,$T1b ld $t0,`$FRAME+0`($sp) @@ -856,10 +1160,131 @@ $code.=<<___; addze $carry,$carry std $t3,-16($tp) ; tp[j-1] std $t5,-8($tp) ; tp[j] +___ +} else { +$code.=<<___; + fmadd $T1a,$N1,$na,$T1a + fmadd $T1b,$N1,$nb,$T1b + lwz $t1,`$FRAME+0`($sp) + lwz $t0,`$FRAME+4`($sp) + fmadd $T2a,$N2,$na,$T2a + fmadd $T2b,$N2,$nb,$T2b + lwz $t3,`$FRAME+8`($sp) + lwz $t2,`$FRAME+12`($sp) + fmadd $T3a,$N3,$na,$T3a + fmadd $T3b,$N3,$nb,$T3b + lwz $t5,`$FRAME+16`($sp) + lwz $t4,`$FRAME+20`($sp) + addc $t0,$t0,$carry + adde $t1,$t1,$c1 + srwi $carry,$t0,16 + fmadd $T0a,$N0,$na,$T0a + fmadd $T0b,$N0,$nb,$T0b + lwz $t7,`$FRAME+24`($sp) + lwz $t6,`$FRAME+28`($sp) + srwi $c1,$t1,16 + insrwi $carry,$t1,16,0 + + fmadd $T1a,$N0,$nc,$T1a + fmadd $T1b,$N0,$nd,$T1b + addc $t2,$t2,$carry + adde $t3,$t3,$c1 + srwi $carry,$t2,16 + fmadd $T2a,$N1,$nc,$T2a + fmadd $T2b,$N1,$nd,$T2b + insrwi $t0,$t2,16,0 ; 0..31 bits + srwi $c1,$t3,16 + insrwi $carry,$t3,16,0 + fmadd $T3a,$N2,$nc,$T3a + fmadd $T3b,$N2,$nd,$T3b + lwz $t2,12($tp) ; tp[j] + lwz $t3,8($tp) + addc $t4,$t4,$carry + adde $t5,$t5,$c1 + srwi $carry,$t4,16 + fmadd $dota,$N3,$nc,$dota + fmadd $dotb,$N3,$nd,$dotb + srwi $c1,$t5,16 + insrwi $carry,$t5,16,0 + + fctid $T0a,$T0a + addc $t6,$t6,$carry + adde $t7,$t7,$c1 + srwi $carry,$t6,16 + fctid $T0b,$T0b + insrwi $t4,$t6,16,0 ; 32..63 bits + srwi $c1,$t7,16 + insrwi $carry,$t7,16,0 + fctid $T1a,$T1a + addc $t0,$t0,$t2 + adde $t4,$t4,$t3 + lwz $t3,`$FRAME+32`($sp) ; permuted $t1 + lwz $t2,`$FRAME+36`($sp) ; permuted $t0 + fctid $T1b,$T1b + addze $carry,$carry + addze $c1,$c1 + stw $t0,4($tp) ; tp[j-1] + stw $t4,0($tp) + fctid $T2a,$T2a + addc $t2,$t2,$carry + adde $t3,$t3,$c1 + srwi $carry,$t2,16 + lwz $t7,`$FRAME+40`($sp) ; permuted $t3 + lwz $t6,`$FRAME+44`($sp) ; permuted $t2 + fctid $T2b,$T2b + srwi $c1,$t3,16 + insrwi $carry,$t3,16,0 + lwz $t1,`$FRAME+48`($sp) ; permuted $t5 + lwz $t0,`$FRAME+52`($sp) ; permuted $t4 + fctid $T3a,$T3a + addc $t6,$t6,$carry + adde $t7,$t7,$c1 + srwi $carry,$t6,16 + lwz $t5,`$FRAME+56`($sp) ; permuted $t7 + lwz $t4,`$FRAME+60`($sp) ; permuted $t6 + fctid $T3b,$T3b + + insrwi $t2,$t6,16,0 ; 64..95 bits + insrwi $carry,$t7,16,0 + srwi $c1,$t7,16 + lwz $t6,20($tp) + lwzu $t7,16($tp) + addc $t0,$t0,$carry + stfd $T0a,`$FRAME+0`($sp) + adde $t1,$t1,$c1 + srwi $carry,$t0,16 + stfd $T0b,`$FRAME+8`($sp) + insrwi $carry,$t1,16,0 + srwi $c1,$t1,16 + addc $t4,$t4,$carry + stfd $T1a,`$FRAME+16`($sp) + adde $t5,$t5,$c1 + srwi $carry,$t4,16 + insrwi $t0,$t4,16,0 ; 96..127 bits + stfd $T1b,`$FRAME+24`($sp) + insrwi $carry,$t5,16,0 + srwi $c1,$t5,16 + + addc $t2,$t2,$t6 + stfd $T2a,`$FRAME+32`($sp) + adde $t0,$t0,$t7 + stfd $T2b,`$FRAME+40`($sp) + addze $carry,$carry + stfd $T3a,`$FRAME+48`($sp) + addze $c1,$c1 + stfd $T3b,`$FRAME+56`($sp) + stw $t2,-4($tp) ; tp[j] + stw $t0,-8($tp) +___ +} +$code.=<<___; bdnz- Linner fctid $dota,$dota fctid $dotb,$dotb +___ +if ($SIZE_T==8 or $flavour =~ /osx/) { +$code.=<<___; ld $t0,`$FRAME+0`($sp) ld $t1,`$FRAME+8`($sp) ld $t2,`$FRAME+16`($sp) @@ -926,7 +1351,116 @@ $code.=<<___; insrdi $t6,$t7,48,0 srdi $ovf,$t7,48 std $t6,0($tp) ; tp[num-1] +___ +} else { +$code.=<<___; + lwz $t1,`$FRAME+0`($sp) + lwz $t0,`$FRAME+4`($sp) + lwz $t3,`$FRAME+8`($sp) + lwz $t2,`$FRAME+12`($sp) + lwz $t5,`$FRAME+16`($sp) + lwz $t4,`$FRAME+20`($sp) + lwz $t7,`$FRAME+24`($sp) + lwz $t6,`$FRAME+28`($sp) + stfd $dota,`$FRAME+64`($sp) + stfd $dotb,`$FRAME+72`($sp) + addc $t0,$t0,$carry + adde $t1,$t1,$c1 + srwi $carry,$t0,16 + insrwi $carry,$t1,16,0 + srwi $c1,$t1,16 + addc $t2,$t2,$carry + adde $t3,$t3,$c1 + srwi $carry,$t2,16 + insrwi $t0,$t2,16,0 ; 0..31 bits + lwz $t2,12($tp) ; tp[j] + insrwi $carry,$t3,16,0 + srwi $c1,$t3,16 + lwz $t3,8($tp) + addc $t4,$t4,$carry + adde $t5,$t5,$c1 + srwi $carry,$t4,16 + insrwi $carry,$t5,16,0 + srwi $c1,$t5,16 + addc $t6,$t6,$carry + adde $t7,$t7,$c1 + srwi $carry,$t6,16 + insrwi $t4,$t6,16,0 ; 32..63 bits + insrwi $carry,$t7,16,0 + srwi $c1,$t7,16 + + addc $t0,$t0,$t2 + adde $t4,$t4,$t3 + addze $carry,$carry + addze $c1,$c1 + stw $t0,4($tp) ; tp[j-1] + stw $t4,0($tp) + + lwz $t3,`$FRAME+32`($sp) ; permuted $t1 + lwz $t2,`$FRAME+36`($sp) ; permuted $t0 + lwz $t7,`$FRAME+40`($sp) ; permuted $t3 + lwz $t6,`$FRAME+44`($sp) ; permuted $t2 + lwz $t1,`$FRAME+48`($sp) ; permuted $t5 + lwz $t0,`$FRAME+52`($sp) ; permuted $t4 + lwz $t5,`$FRAME+56`($sp) ; permuted $t7 + lwz $t4,`$FRAME+60`($sp) ; permuted $t6 + + addc $t2,$t2,$carry + adde $t3,$t3,$c1 + srwi $carry,$t2,16 + insrwi $carry,$t3,16,0 + srwi $c1,$t3,16 + addc $t6,$t6,$carry + adde $t7,$t7,$c1 + srwi $carry,$t6,16 + insrwi $t2,$t6,16,0 ; 64..95 bits + lwz $t6,20($tp) + insrwi $carry,$t7,16,0 + srwi $c1,$t7,16 + lwzu $t7,16($tp) + addc $t0,$t0,$carry + adde $t1,$t1,$c1 + srwi $carry,$t0,16 + insrwi $carry,$t1,16,0 + srwi $c1,$t1,16 + addc $t4,$t4,$carry + adde $t5,$t5,$c1 + srwi $carry,$t4,16 + insrwi $t0,$t4,16,0 ; 96..127 bits + insrwi $carry,$t5,16,0 + srwi $c1,$t5,16 + + addc $t2,$t2,$t6 + adde $t0,$t0,$t7 + lwz $t7,`$FRAME+64`($sp) + lwz $t6,`$FRAME+68`($sp) + addze $carry,$carry + addze $c1,$c1 + lwz $t5,`$FRAME+72`($sp) + lwz $t4,`$FRAME+76`($sp) + + addc $t6,$t6,$carry + adde $t7,$t7,$c1 + stw $t2,-4($tp) ; tp[j] + stw $t0,-8($tp) + addc $t6,$t6,$ovf + addze $t7,$t7 + srwi $carry,$t6,16 + insrwi $carry,$t7,16,0 + srwi $c1,$t7,16 + addc $t4,$t4,$carry + adde $t5,$t5,$c1 + + insrwi $t6,$t4,16,0 + srwi $t4,$t4,16 + insrwi $t4,$t5,16,0 + srwi $ovf,$t5,16 + stw $t6,4($tp) ; tp[num-1] + stw $t4,0($tp) +___ +} +$code.=<<___; slwi $t7,$num,2 addi $i,$i,8 subf $nap_d,$t7,$nap_d ; rewind pointer @@ -994,14 +1528,14 @@ $code.=<<___ if ($SIZE_T==4); mtctr $j .align 4 -Lsub: ld $t0,8($tp) ; load tp[j..j+3] in 64-bit word order - ldu $t2,16($tp) +Lsub: lwz $t0,12($tp) ; load tp[j..j+3] in 64-bit word order + lwz $t1,8($tp) + lwz $t2,20($tp) + lwzu $t3,16($tp) lwz $t4,4($np) ; load np[j..j+3] in 32-bit word order lwz $t5,8($np) lwz $t6,12($np) lwzu $t7,16($np) - extrdi $t1,$t0,32,0 - extrdi $t3,$t2,32,0 subfe $t4,$t4,$t0 ; tp[j]-np[j] stw $t0,4($ap) ; save tp[j..j+3] in 32-bit word order subfe $t5,$t5,$t1 ; tp[j+1]-np[j+1] @@ -1052,6 +1586,9 @@ ___ $code.=<<___; $POP $i,0($sp) li r3,1 ; signal "handled" + $POP r19,`-12*8-13*$SIZE_T`($i) + $POP r20,`-12*8-12*$SIZE_T`($i) + $POP r21,`-12*8-11*$SIZE_T`($i) $POP r22,`-12*8-10*$SIZE_T`($i) $POP r23,`-12*8-9*$SIZE_T`($i) $POP r24,`-12*8-8*$SIZE_T`($i) @@ -1077,8 +1614,9 @@ $code.=<<___; mr $sp,$i blr .long 0 - .byte 0,12,4,0,0x8c,10,6,0 + .byte 0,12,4,0,0x8c,13,6,0 .long 0 +.size .$fname,.-.$fname .asciz "Montgomery Multiplication for PPC64, CRYPTOGAMS by <appro\@openssl.org>" ___ diff --git a/crypto/bn/asm/rsaz-avx2.pl b/crypto/bn/asm/rsaz-avx2.pl new file mode 100755 index 0000000..3b6ccf8 --- /dev/null +++ b/crypto/bn/asm/rsaz-avx2.pl @@ -0,0 +1,1898 @@ +#!/usr/bin/env perl + +############################################################################## +# # +# Copyright (c) 2012, Intel Corporation # +# # +# All rights reserved. # +# # +# Redistribution and use in source and binary forms, with or without # +# modification, are permitted provided that the following conditions are # +# met: # +# # +# * Redistributions of source code must retain the above copyright # +# notice, this list of conditions and the following disclaimer. # +# # +# * Redistributions in binary form must reproduce the above copyright # +# notice, this list of conditions and the following disclaimer in the # +# documentation and/or other materials provided with the # +# distribution. # +# # +# * Neither the name of the Intel Corporation nor the names of its # +# contributors may be used to endorse or promote products derived from # +# this software without specific prior written permission. # +# # +# # +# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY # +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR # +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR # +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, # +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, # +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR # +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF # +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING # +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS # +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # +# # +############################################################################## +# Developers and authors: # +# Shay Gueron (1, 2), and Vlad Krasnov (1) # +# (1) Intel Corporation, Israel Development Center, Haifa, Israel # +# (2) University of Haifa, Israel # +############################################################################## +# Reference: # +# [1] S. Gueron, V. Krasnov: "Software Implementation of Modular # +# Exponentiation, Using Advanced Vector Instructions Architectures", # +# F. Ozbudak and F. Rodriguez-Henriquez (Eds.): WAIFI 2012, LNCS 7369, # +# pp. 119?135, 2012. Springer-Verlag Berlin Heidelberg 2012 # +# [2] S. Gueron: "Efficient Software Implementations of Modular # +# Exponentiation", Journal of Cryptographic Engineering 2:31-43 (2012). # +# [3] S. Gueron, V. Krasnov: "Speeding up Big-numbers Squaring",IEEE # +# Proceedings of 9th International Conference on Information Technology: # +# New Generations (ITNG 2012), pp.821-823 (2012) # +# [4] S. Gueron, V. Krasnov: "[PATCH] Efficient and side channel analysis # +# resistant 1024-bit modular exponentiation, for optimizing RSA2048 # +# on AVX2 capable x86_64 platforms", # +# http://rt.openssl.org/Ticket/Display.html?id=2850&user=guest&pass=guest# +############################################################################## +# +# +13% improvement over original submission by <appro@openssl.org> +# +# rsa2048 sign/sec OpenSSL 1.0.1 scalar(*) this +# 2.3GHz Haswell 621 765/+23% 1113/+79% +# 2.3GHz Broadwell(**) 688 1200(***)/+74% 1120/+63% +# +# (*) if system doesn't support AVX2, for reference purposes; +# (**) scaled to 2.3GHz to simplify comparison; +# (***) scalar AD*X code is faster than AVX2 and is preferred code +# path for Broadwell; + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` + =~ /GNU assembler version ([2-9]\.[0-9]+)/) { + $avx = ($1>=2.19) + ($1>=2.22); + $addx = ($1>=2.23); +} + +if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && + `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { + $avx = ($1>=2.09) + ($1>=2.10); + $addx = ($1>=2.10); +} + +if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && + `ml64 2>&1` =~ /Version ([0-9]+)\./) { + $avx = ($1>=10) + ($1>=11); + $addx = ($1>=11); +} + +if (!$avx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9])\.([0-9]+)/) { + my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10 + $avx = ($ver>=3.0) + ($ver>=3.01); + $addx = ($ver>=3.03); +} + +open OUT,"| \"$^X\" $xlate $flavour $output"; +*STDOUT = *OUT; + +if ($avx>1) {{{ +{ # void AMS_WW( +my $rp="%rdi"; # BN_ULONG *rp, +my $ap="%rsi"; # const BN_ULONG *ap, +my $np="%rdx"; # const BN_ULONG *np, +my $n0="%ecx"; # const BN_ULONG n0, +my $rep="%r8d"; # int repeat); + +# The registers that hold the accumulated redundant result +# The AMM works on 1024 bit operands, and redundant word size is 29 +# Therefore: ceil(1024/29)/4 = 9 +my $ACC0="%ymm0"; +my $ACC1="%ymm1"; +my $ACC2="%ymm2"; +my $ACC3="%ymm3"; +my $ACC4="%ymm4"; +my $ACC5="%ymm5"; +my $ACC6="%ymm6"; +my $ACC7="%ymm7"; +my $ACC8="%ymm8"; +my $ACC9="%ymm9"; +# Registers that hold the broadcasted words of bp, currently used +my $B1="%ymm10"; +my $B2="%ymm11"; +# Registers that hold the broadcasted words of Y, currently used +my $Y1="%ymm12"; +my $Y2="%ymm13"; +# Helper registers +my $TEMP1="%ymm14"; +my $AND_MASK="%ymm15"; +# alu registers that hold the first words of the ACC +my $r0="%r9"; +my $r1="%r10"; +my $r2="%r11"; +my $r3="%r12"; + +my $i="%r14d"; # loop counter +my $tmp = "%r15"; + +my $FrameSize=32*18+32*8; # place for A^2 and 2*A + +my $aap=$r0; +my $tp0="%rbx"; +my $tp1=$r3; +my $tpa=$tmp; + +$np="%r13"; # reassigned argument + +$code.=<<___; +.text + +.globl rsaz_1024_sqr_avx2 +.type rsaz_1024_sqr_avx2,\@function,5 +.align 64 +rsaz_1024_sqr_avx2: # 702 cycles, 14% faster than rsaz_1024_mul_avx2 + lea (%rsp), %rax + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + vzeroupper +___ +$code.=<<___ if ($win64); + lea -0xa8(%rsp),%rsp + vmovaps %xmm6,-0xd8(%rax) + vmovaps %xmm7,-0xc8(%rax) + vmovaps %xmm8,-0xb8(%rax) + vmovaps %xmm9,-0xa8(%rax) + vmovaps %xmm10,-0x98(%rax) + vmovaps %xmm11,-0x88(%rax) + vmovaps %xmm12,-0x78(%rax) + vmovaps %xmm13,-0x68(%rax) + vmovaps %xmm14,-0x58(%rax) + vmovaps %xmm15,-0x48(%rax) +.Lsqr_1024_body: +___ +$code.=<<___; + mov %rax,%rbp + mov %rdx, $np # reassigned argument + sub \$$FrameSize, %rsp + mov $np, $tmp + sub \$-128, $rp # size optimization + sub \$-128, $ap + sub \$-128, $np + + and \$4095, $tmp # see if $np crosses page + add \$32*10, $tmp + shr \$12, $tmp + vpxor $ACC9,$ACC9,$ACC9 + jz .Lsqr_1024_no_n_copy + + # unaligned 256-bit load that crosses page boundary can + # cause >2x performance degradation here, so if $np does + # cross page boundary, copy it to stack and make sure stack + # frame doesn't... + sub \$32*10,%rsp + vmovdqu 32*0-128($np), $ACC0 + and \$-2048, %rsp + vmovdqu 32*1-128($np), $ACC1 + vmovdqu 32*2-128($np), $ACC2 + vmovdqu 32*3-128($np), $ACC3 + vmovdqu 32*4-128($np), $ACC4 + vmovdqu 32*5-128($np), $ACC5 + vmovdqu 32*6-128($np), $ACC6 + vmovdqu 32*7-128($np), $ACC7 + vmovdqu 32*8-128($np), $ACC8 + lea $FrameSize+128(%rsp),$np + vmovdqu $ACC0, 32*0-128($np) + vmovdqu $ACC1, 32*1-128($np) + vmovdqu $ACC2, 32*2-128($np) + vmovdqu $ACC3, 32*3-128($np) + vmovdqu $ACC4, 32*4-128($np) + vmovdqu $ACC5, 32*5-128($np) + vmovdqu $ACC6, 32*6-128($np) + vmovdqu $ACC7, 32*7-128($np) + vmovdqu $ACC8, 32*8-128($np) + vmovdqu $ACC9, 32*9-128($np) # $ACC9 is zero + +.Lsqr_1024_no_n_copy: + and \$-1024, %rsp + + vmovdqu 32*1-128($ap), $ACC1 + vmovdqu 32*2-128($ap), $ACC2 + vmovdqu 32*3-128($ap), $ACC3 + vmovdqu 32*4-128($ap), $ACC4 + vmovdqu 32*5-128($ap), $ACC5 + vmovdqu 32*6-128($ap), $ACC6 + vmovdqu 32*7-128($ap), $ACC7 + vmovdqu 32*8-128($ap), $ACC8 + + lea 192(%rsp), $tp0 # 64+128=192 + vpbroadcastq .Land_mask(%rip), $AND_MASK + jmp .LOOP_GRANDE_SQR_1024 + +.align 32 +.LOOP_GRANDE_SQR_1024: + lea 32*18+128(%rsp), $aap # size optimization + lea 448(%rsp), $tp1 # 64+128+256=448 + + # the squaring is performed as described in Variant B of + # "Speeding up Big-Number Squaring", so start by calculating + # the A*2=A+A vector + vpaddq $ACC1, $ACC1, $ACC1 + vpbroadcastq 32*0-128($ap), $B1 + vpaddq $ACC2, $ACC2, $ACC2 + vmovdqa $ACC1, 32*0-128($aap) + vpaddq $ACC3, $ACC3, $ACC3 + vmovdqa $ACC2, 32*1-128($aap) + vpaddq $ACC4, $ACC4, $ACC4 + vmovdqa $ACC3, 32*2-128($aap) + vpaddq $ACC5, $ACC5, $ACC5 + vmovdqa $ACC4, 32*3-128($aap) + vpaddq $ACC6, $ACC6, $ACC6 + vmovdqa $ACC5, 32*4-128($aap) + vpaddq $ACC7, $ACC7, $ACC7 + vmovdqa $ACC6, 32*5-128($aap) + vpaddq $ACC8, $ACC8, $ACC8 + vmovdqa $ACC7, 32*6-128($aap) + vpxor $ACC9, $ACC9, $ACC9 + vmovdqa $ACC8, 32*7-128($aap) + + vpmuludq 32*0-128($ap), $B1, $ACC0 + vpbroadcastq 32*1-128($ap), $B2 + vmovdqu $ACC9, 32*9-192($tp0) # zero upper half + vpmuludq $B1, $ACC1, $ACC1 + vmovdqu $ACC9, 32*10-448($tp1) + vpmuludq $B1, $ACC2, $ACC2 + vmovdqu $ACC9, 32*11-448($tp1) + vpmuludq $B1, $ACC3, $ACC3 + vmovdqu $ACC9, 32*12-448($tp1) + vpmuludq $B1, $ACC4, $ACC4 + vmovdqu $ACC9, 32*13-448($tp1) + vpmuludq $B1, $ACC5, $ACC5 + vmovdqu $ACC9, 32*14-448($tp1) + vpmuludq $B1, $ACC6, $ACC6 + vmovdqu $ACC9, 32*15-448($tp1) + vpmuludq $B1, $ACC7, $ACC7 + vmovdqu $ACC9, 32*16-448($tp1) + vpmuludq $B1, $ACC8, $ACC8 + vpbroadcastq 32*2-128($ap), $B1 + vmovdqu $ACC9, 32*17-448($tp1) + + mov $ap, $tpa + mov \$4, $i + jmp .Lsqr_entry_1024 +___ +$TEMP0=$Y1; +$TEMP2=$Y2; +$code.=<<___; +.align 32 +.LOOP_SQR_1024: + vpbroadcastq 32*1-128($tpa), $B2 + vpmuludq 32*0-128($ap), $B1, $ACC0 + vpaddq 32*0-192($tp0), $ACC0, $ACC0 + vpmuludq 32*0-128($aap), $B1, $ACC1 + vpaddq 32*1-192($tp0), $ACC1, $ACC1 + vpmuludq 32*1-128($aap), $B1, $ACC2 + vpaddq 32*2-192($tp0), $ACC2, $ACC2 + vpmuludq 32*2-128($aap), $B1, $ACC3 + vpaddq 32*3-192($tp0), $ACC3, $ACC3 + vpmuludq 32*3-128($aap), $B1, $ACC4 + vpaddq 32*4-192($tp0), $ACC4, $ACC4 + vpmuludq 32*4-128($aap), $B1, $ACC5 + vpaddq 32*5-192($tp0), $ACC5, $ACC5 + vpmuludq 32*5-128($aap), $B1, $ACC6 + vpaddq 32*6-192($tp0), $ACC6, $ACC6 + vpmuludq 32*6-128($aap), $B1, $ACC7 + vpaddq 32*7-192($tp0), $ACC7, $ACC7 + vpmuludq 32*7-128($aap), $B1, $ACC8 + vpbroadcastq 32*2-128($tpa), $B1 + vpaddq 32*8-192($tp0), $ACC8, $ACC8 +.Lsqr_entry_1024: + vmovdqu $ACC0, 32*0-192($tp0) + vmovdqu $ACC1, 32*1-192($tp0) + + vpmuludq 32*1-128($ap), $B2, $TEMP0 + vpaddq $TEMP0, $ACC2, $ACC2 + vpmuludq 32*1-128($aap), $B2, $TEMP1 + vpaddq $TEMP1, $ACC3, $ACC3 + vpmuludq 32*2-128($aap), $B2, $TEMP2 + vpaddq $TEMP2, $ACC4, $ACC4 + vpmuludq 32*3-128($aap), $B2, $TEMP0 + vpaddq $TEMP0, $ACC5, $ACC5 + vpmuludq 32*4-128($aap), $B2, $TEMP1 + vpaddq $TEMP1, $ACC6, $ACC6 + vpmuludq 32*5-128($aap), $B2, $TEMP2 + vpaddq $TEMP2, $ACC7, $ACC7 + vpmuludq 32*6-128($aap), $B2, $TEMP0 + vpaddq $TEMP0, $ACC8, $ACC8 + vpmuludq 32*7-128($aap), $B2, $ACC0 + vpbroadcastq 32*3-128($tpa), $B2 + vpaddq 32*9-192($tp0), $ACC0, $ACC0 + + vmovdqu $ACC2, 32*2-192($tp0) + vmovdqu $ACC3, 32*3-192($tp0) + + vpmuludq 32*2-128($ap), $B1, $TEMP2 + vpaddq $TEMP2, $ACC4, $ACC4 + vpmuludq 32*2-128($aap), $B1, $TEMP0 + vpaddq $TEMP0, $ACC5, $ACC5 + vpmuludq 32*3-128($aap), $B1, $TEMP1 + vpaddq $TEMP1, $ACC6, $ACC6 + vpmuludq 32*4-128($aap), $B1, $TEMP2 + vpaddq $TEMP2, $ACC7, $ACC7 + vpmuludq 32*5-128($aap), $B1, $TEMP0 + vpaddq $TEMP0, $ACC8, $ACC8 + vpmuludq 32*6-128($aap), $B1, $TEMP1 + vpaddq $TEMP1, $ACC0, $ACC0 + vpmuludq 32*7-128($aap), $B1, $ACC1 + vpbroadcastq 32*4-128($tpa), $B1 + vpaddq 32*10-448($tp1), $ACC1, $ACC1 + + vmovdqu $ACC4, 32*4-192($tp0) + vmovdqu $ACC5, 32*5-192($tp0) + + vpmuludq 32*3-128($ap), $B2, $TEMP0 + vpaddq $TEMP0, $ACC6, $ACC6 + vpmuludq 32*3-128($aap), $B2, $TEMP1 + vpaddq $TEMP1, $ACC7, $ACC7 + vpmuludq 32*4-128($aap), $B2, $TEMP2 + vpaddq $TEMP2, $ACC8, $ACC8 + vpmuludq 32*5-128($aap), $B2, $TEMP0 + vpaddq $TEMP0, $ACC0, $ACC0 + vpmuludq 32*6-128($aap), $B2, $TEMP1 + vpaddq $TEMP1, $ACC1, $ACC1 + vpmuludq 32*7-128($aap), $B2, $ACC2 + vpbroadcastq 32*5-128($tpa), $B2 + vpaddq 32*11-448($tp1), $ACC2, $ACC2 + + vmovdqu $ACC6, 32*6-192($tp0) + vmovdqu $ACC7, 32*7-192($tp0) + + vpmuludq 32*4-128($ap), $B1, $TEMP0 + vpaddq $TEMP0, $ACC8, $ACC8 + vpmuludq 32*4-128($aap), $B1, $TEMP1 + vpaddq $TEMP1, $ACC0, $ACC0 + vpmuludq 32*5-128($aap), $B1, $TEMP2 + vpaddq $TEMP2, $ACC1, $ACC1 + vpmuludq 32*6-128($aap), $B1, $TEMP0 + vpaddq $TEMP0, $ACC2, $ACC2 + vpmuludq 32*7-128($aap), $B1, $ACC3 + vpbroadcastq 32*6-128($tpa), $B1 + vpaddq 32*12-448($tp1), $ACC3, $ACC3 + + vmovdqu $ACC8, 32*8-192($tp0) + vmovdqu $ACC0, 32*9-192($tp0) + lea 8($tp0), $tp0 + + vpmuludq 32*5-128($ap), $B2, $TEMP2 + vpaddq $TEMP2, $ACC1, $ACC1 + vpmuludq 32*5-128($aap), $B2, $TEMP0 + vpaddq $TEMP0, $ACC2, $ACC2 + vpmuludq 32*6-128($aap), $B2, $TEMP1 + vpaddq $TEMP1, $ACC3, $ACC3 + vpmuludq 32*7-128($aap), $B2, $ACC4 + vpbroadcastq 32*7-128($tpa), $B2 + vpaddq 32*13-448($tp1), $ACC4, $ACC4 + + vmovdqu $ACC1, 32*10-448($tp1) + vmovdqu $ACC2, 32*11-448($tp1) + + vpmuludq 32*6-128($ap), $B1, $TEMP0 + vpaddq $TEMP0, $ACC3, $ACC3 + vpmuludq 32*6-128($aap), $B1, $TEMP1 + vpbroadcastq 32*8-128($tpa), $ACC0 # borrow $ACC0 for $B1 + vpaddq $TEMP1, $ACC4, $ACC4 + vpmuludq 32*7-128($aap), $B1, $ACC5 + vpbroadcastq 32*0+8-128($tpa), $B1 # for next iteration + vpaddq 32*14-448($tp1), $ACC5, $ACC5 + + vmovdqu $ACC3, 32*12-448($tp1) + vmovdqu $ACC4, 32*13-448($tp1) + lea 8($tpa), $tpa + + vpmuludq 32*7-128($ap), $B2, $TEMP0 + vpaddq $TEMP0, $ACC5, $ACC5 + vpmuludq 32*7-128($aap), $B2, $ACC6 + vpaddq 32*15-448($tp1), $ACC6, $ACC6 + + vpmuludq 32*8-128($ap), $ACC0, $ACC7 + vmovdqu $ACC5, 32*14-448($tp1) + vpaddq 32*16-448($tp1), $ACC7, $ACC7 + vmovdqu $ACC6, 32*15-448($tp1) + vmovdqu $ACC7, 32*16-448($tp1) + lea 8($tp1), $tp1 + + dec $i + jnz .LOOP_SQR_1024 +___ +$ZERO = $ACC9; +$TEMP0 = $B1; +$TEMP2 = $B2; +$TEMP3 = $Y1; +$TEMP4 = $Y2; +$code.=<<___; + #we need to fix indexes 32-39 to avoid overflow + vmovdqu 32*8(%rsp), $ACC8 # 32*8-192($tp0), + vmovdqu 32*9(%rsp), $ACC1 # 32*9-192($tp0) + vmovdqu 32*10(%rsp), $ACC2 # 32*10-192($tp0) + lea 192(%rsp), $tp0 # 64+128=192 + + vpsrlq \$29, $ACC8, $TEMP1 + vpand $AND_MASK, $ACC8, $ACC8 + vpsrlq \$29, $ACC1, $TEMP2 + vpand $AND_MASK, $ACC1, $ACC1 + + vpermq \$0x93, $TEMP1, $TEMP1 + vpxor $ZERO, $ZERO, $ZERO + vpermq \$0x93, $TEMP2, $TEMP2 + + vpblendd \$3, $ZERO, $TEMP1, $TEMP0 + vpblendd \$3, $TEMP1, $TEMP2, $TEMP1 + vpaddq $TEMP0, $ACC8, $ACC8 + vpblendd \$3, $TEMP2, $ZERO, $TEMP2 + vpaddq $TEMP1, $ACC1, $ACC1 + vpaddq $TEMP2, $ACC2, $ACC2 + vmovdqu $ACC1, 32*9-192($tp0) + vmovdqu $ACC2, 32*10-192($tp0) + + mov (%rsp), %rax + mov 8(%rsp), $r1 + mov 16(%rsp), $r2 + mov 24(%rsp), $r3 + vmovdqu 32*1(%rsp), $ACC1 + vmovdqu 32*2-192($tp0), $ACC2 + vmovdqu 32*3-192($tp0), $ACC3 + vmovdqu 32*4-192($tp0), $ACC4 + vmovdqu 32*5-192($tp0), $ACC5 + vmovdqu 32*6-192($tp0), $ACC6 + vmovdqu 32*7-192($tp0), $ACC7 + + mov %rax, $r0 + imull $n0, %eax + and \$0x1fffffff, %eax + vmovd %eax, $Y1 + + mov %rax, %rdx + imulq -128($np), %rax + vpbroadcastq $Y1, $Y1 + add %rax, $r0 + mov %rdx, %rax + imulq 8-128($np), %rax + shr \$29, $r0 + add %rax, $r1 + mov %rdx, %rax + imulq 16-128($np), %rax + add $r0, $r1 + add %rax, $r2 + imulq 24-128($np), %rdx + add %rdx, $r3 + + mov $r1, %rax + imull $n0, %eax + and \$0x1fffffff, %eax + + mov \$9, $i + jmp .LOOP_REDUCE_1024 + +.align 32 +.LOOP_REDUCE_1024: + vmovd %eax, $Y2 + vpbroadcastq $Y2, $Y2 + + vpmuludq 32*1-128($np), $Y1, $TEMP0 + mov %rax, %rdx + imulq -128($np), %rax + vpaddq $TEMP0, $ACC1, $ACC1 + add %rax, $r1 + vpmuludq 32*2-128($np), $Y1, $TEMP1 + mov %rdx, %rax + imulq 8-128($np), %rax + vpaddq $TEMP1, $ACC2, $ACC2 + vpmuludq 32*3-128($np), $Y1, $TEMP2 + .byte 0x67 + add %rax, $r2 + .byte 0x67 + mov %rdx, %rax + imulq 16-128($np), %rax + shr \$29, $r1 + vpaddq $TEMP2, $ACC3, $ACC3 + vpmuludq 32*4-128($np), $Y1, $TEMP0 + add %rax, $r3 + add $r1, $r2 + vpaddq $TEMP0, $ACC4, $ACC4 + vpmuludq 32*5-128($np), $Y1, $TEMP1 + mov $r2, %rax + imull $n0, %eax + vpaddq $TEMP1, $ACC5, $ACC5 + vpmuludq 32*6-128($np), $Y1, $TEMP2 + and \$0x1fffffff, %eax + vpaddq $TEMP2, $ACC6, $ACC6 + vpmuludq 32*7-128($np), $Y1, $TEMP0 + vpaddq $TEMP0, $ACC7, $ACC7 + vpmuludq 32*8-128($np), $Y1, $TEMP1 + vmovd %eax, $Y1 + #vmovdqu 32*1-8-128($np), $TEMP2 # moved below + vpaddq $TEMP1, $ACC8, $ACC8 + #vmovdqu 32*2-8-128($np), $TEMP0 # moved below + vpbroadcastq $Y1, $Y1 + + vpmuludq 32*1-8-128($np), $Y2, $TEMP2 # see above + vmovdqu 32*3-8-128($np), $TEMP1 + mov %rax, %rdx + imulq -128($np), %rax + vpaddq $TEMP2, $ACC1, $ACC1 + vpmuludq 32*2-8-128($np), $Y2, $TEMP0 # see above + vmovdqu 32*4-8-128($np), $TEMP2 + add %rax, $r2 + mov %rdx, %rax + imulq 8-128($np), %rax + vpaddq $TEMP0, $ACC2, $ACC2 + add $r3, %rax + shr \$29, $r2 + vpmuludq $Y2, $TEMP1, $TEMP1 + vmovdqu 32*5-8-128($np), $TEMP0 + add $r2, %rax + vpaddq $TEMP1, $ACC3, $ACC3 + vpmuludq $Y2, $TEMP2, $TEMP2 + vmovdqu 32*6-8-128($np), $TEMP1 + .byte 0x67 + mov %rax, $r3 + imull $n0, %eax + vpaddq $TEMP2, $ACC4, $ACC4 + vpmuludq $Y2, $TEMP0, $TEMP0 + .byte 0xc4,0x41,0x7e,0x6f,0x9d,0x58,0x00,0x00,0x00 # vmovdqu 32*7-8-128($np), $TEMP2 + and \$0x1fffffff, %eax + vpaddq $TEMP0, $ACC5, $ACC5 + vpmuludq $Y2, $TEMP1, $TEMP1 + vmovdqu 32*8-8-128($np), $TEMP0 + vpaddq $TEMP1, $ACC6, $ACC6 + vpmuludq $Y2, $TEMP2, $TEMP2 + vmovdqu 32*9-8-128($np), $ACC9 + vmovd %eax, $ACC0 # borrow ACC0 for Y2 + imulq -128($np), %rax + vpaddq $TEMP2, $ACC7, $ACC7 + vpmuludq $Y2, $TEMP0, $TEMP0 + vmovdqu 32*1-16-128($np), $TEMP1 + vpbroadcastq $ACC0, $ACC0 + vpaddq $TEMP0, $ACC8, $ACC8 + vpmuludq $Y2, $ACC9, $ACC9 + vmovdqu 32*2-16-128($np), $TEMP2 + add %rax, $r3 + +___ +($ACC0,$Y2)=($Y2,$ACC0); +$code.=<<___; + vmovdqu 32*1-24-128($np), $ACC0 + vpmuludq $Y1, $TEMP1, $TEMP1 + vmovdqu 32*3-16-128($np), $TEMP0 + vpaddq $TEMP1, $ACC1, $ACC1 + vpmuludq $Y2, $ACC0, $ACC0 + vpmuludq $Y1, $TEMP2, $TEMP2 + .byte 0xc4,0x41,0x7e,0x6f,0xb5,0xf0,0xff,0xff,0xff # vmovdqu 32*4-16-128($np), $TEMP1 + vpaddq $ACC1, $ACC0, $ACC0 + vpaddq $TEMP2, $ACC2, $ACC2 + vpmuludq $Y1, $TEMP0, $TEMP0 + vmovdqu 32*5-16-128($np), $TEMP2 + .byte 0x67 + vmovq $ACC0, %rax + vmovdqu $ACC0, (%rsp) # transfer $r0-$r3 + vpaddq $TEMP0, $ACC3, $ACC3 + vpmuludq $Y1, $TEMP1, $TEMP1 + vmovdqu 32*6-16-128($np), $TEMP0 + vpaddq $TEMP1, $ACC4, $ACC4 + vpmuludq $Y1, $TEMP2, $TEMP2 + vmovdqu 32*7-16-128($np), $TEMP1 + vpaddq $TEMP2, $ACC5, $ACC5 + vpmuludq $Y1, $TEMP0, $TEMP0 + vmovdqu 32*8-16-128($np), $TEMP2 + vpaddq $TEMP0, $ACC6, $ACC6 + vpmuludq $Y1, $TEMP1, $TEMP1 + shr \$29, $r3 + vmovdqu 32*9-16-128($np), $TEMP0 + add $r3, %rax + vpaddq $TEMP1, $ACC7, $ACC7 + vpmuludq $Y1, $TEMP2, $TEMP2 + #vmovdqu 32*2-24-128($np), $TEMP1 # moved below + mov %rax, $r0 + imull $n0, %eax + vpaddq $TEMP2, $ACC8, $ACC8 + vpmuludq $Y1, $TEMP0, $TEMP0 + and \$0x1fffffff, %eax + vmovd %eax, $Y1 + vmovdqu 32*3-24-128($np), $TEMP2 + .byte 0x67 + vpaddq $TEMP0, $ACC9, $ACC9 + vpbroadcastq $Y1, $Y1 + + vpmuludq 32*2-24-128($np), $Y2, $TEMP1 # see above + vmovdqu 32*4-24-128($np), $TEMP0 + mov %rax, %rdx + imulq -128($np), %rax + mov 8(%rsp), $r1 + vpaddq $TEMP1, $ACC2, $ACC1 + vpmuludq $Y2, $TEMP2, $TEMP2 + vmovdqu 32*5-24-128($np), $TEMP1 + add %rax, $r0 + mov %rdx, %rax + imulq 8-128($np), %rax + .byte 0x67 + shr \$29, $r0 + mov 16(%rsp), $r2 + vpaddq $TEMP2, $ACC3, $ACC2 + vpmuludq $Y2, $TEMP0, $TEMP0 + vmovdqu 32*6-24-128($np), $TEMP2 + add %rax, $r1 + mov %rdx, %rax + imulq 16-128($np), %rax + vpaddq $TEMP0, $ACC4, $ACC3 + vpmuludq $Y2, $TEMP1, $TEMP1 + vmovdqu 32*7-24-128($np), $TEMP0 + imulq 24-128($np), %rdx # future $r3 + add %rax, $r2 + lea ($r0,$r1), %rax + vpaddq $TEMP1, $ACC5, $ACC4 + vpmuludq $Y2, $TEMP2, $TEMP2 + vmovdqu 32*8-24-128($np), $TEMP1 + mov %rax, $r1 + imull $n0, %eax + vpmuludq $Y2, $TEMP0, $TEMP0 + vpaddq $TEMP2, $ACC6, $ACC5 + vmovdqu 32*9-24-128($np), $TEMP2 + and \$0x1fffffff, %eax + vpaddq $TEMP0, $ACC7, $ACC6 + vpmuludq $Y2, $TEMP1, $TEMP1 + add 24(%rsp), %rdx + vpaddq $TEMP1, $ACC8, $ACC7 + vpmuludq $Y2, $TEMP2, $TEMP2 + vpaddq $TEMP2, $ACC9, $ACC8 + vmovq $r3, $ACC9 + mov %rdx, $r3 + + dec $i + jnz .LOOP_REDUCE_1024 +___ +($ACC0,$Y2)=($Y2,$ACC0); +$code.=<<___; + lea 448(%rsp), $tp1 # size optimization + vpaddq $ACC9, $Y2, $ACC0 + vpxor $ZERO, $ZERO, $ZERO + + vpaddq 32*9-192($tp0), $ACC0, $ACC0 + vpaddq 32*10-448($tp1), $ACC1, $ACC1 + vpaddq 32*11-448($tp1), $ACC2, $ACC2 + vpaddq 32*12-448($tp1), $ACC3, $ACC3 + vpaddq 32*13-448($tp1), $ACC4, $ACC4 + vpaddq 32*14-448($tp1), $ACC5, $ACC5 + vpaddq 32*15-448($tp1), $ACC6, $ACC6 + vpaddq 32*16-448($tp1), $ACC7, $ACC7 + vpaddq 32*17-448($tp1), $ACC8, $ACC8 + + vpsrlq \$29, $ACC0, $TEMP1 + vpand $AND_MASK, $ACC0, $ACC0 + vpsrlq \$29, $ACC1, $TEMP2 + vpand $AND_MASK, $ACC1, $ACC1 + vpsrlq \$29, $ACC2, $TEMP3 + vpermq \$0x93, $TEMP1, $TEMP1 + vpand $AND_MASK, $ACC2, $ACC2 + vpsrlq \$29, $ACC3, $TEMP4 + vpermq \$0x93, $TEMP2, $TEMP2 + vpand $AND_MASK, $ACC3, $ACC3 + vpermq \$0x93, $TEMP3, $TEMP3 + + vpblendd \$3, $ZERO, $TEMP1, $TEMP0 + vpermq \$0x93, $TEMP4, $TEMP4 + vpblendd \$3, $TEMP1, $TEMP2, $TEMP1 + vpaddq $TEMP0, $ACC0, $ACC0 + vpblendd \$3, $TEMP2, $TEMP3, $TEMP2 + vpaddq $TEMP1, $ACC1, $ACC1 + vpblendd \$3, $TEMP3, $TEMP4, $TEMP3 + vpaddq $TEMP2, $ACC2, $ACC2 + vpblendd \$3, $TEMP4, $ZERO, $TEMP4 + vpaddq $TEMP3, $ACC3, $ACC3 + vpaddq $TEMP4, $ACC4, $ACC4 + + vpsrlq \$29, $ACC0, $TEMP1 + vpand $AND_MASK, $ACC0, $ACC0 + vpsrlq \$29, $ACC1, $TEMP2 + vpand $AND_MASK, $ACC1, $ACC1 + vpsrlq \$29, $ACC2, $TEMP3 + vpermq \$0x93, $TEMP1, $TEMP1 + vpand $AND_MASK, $ACC2, $ACC2 + vpsrlq \$29, $ACC3, $TEMP4 + vpermq \$0x93, $TEMP2, $TEMP2 + vpand $AND_MASK, $ACC3, $ACC3 + vpermq \$0x93, $TEMP3, $TEMP3 + + vpblendd \$3, $ZERO, $TEMP1, $TEMP0 + vpermq \$0x93, $TEMP4, $TEMP4 + vpblendd \$3, $TEMP1, $TEMP2, $TEMP1 + vpaddq $TEMP0, $ACC0, $ACC0 + vpblendd \$3, $TEMP2, $TEMP3, $TEMP2 + vpaddq $TEMP1, $ACC1, $ACC1 + vmovdqu $ACC0, 32*0-128($rp) + vpblendd \$3, $TEMP3, $TEMP4, $TEMP3 + vpaddq $TEMP2, $ACC2, $ACC2 + vmovdqu $ACC1, 32*1-128($rp) + vpblendd \$3, $TEMP4, $ZERO, $TEMP4 + vpaddq $TEMP3, $ACC3, $ACC3 + vmovdqu $ACC2, 32*2-128($rp) + vpaddq $TEMP4, $ACC4, $ACC4 + vmovdqu $ACC3, 32*3-128($rp) +___ +$TEMP5=$ACC0; +$code.=<<___; + vpsrlq \$29, $ACC4, $TEMP1 + vpand $AND_MASK, $ACC4, $ACC4 + vpsrlq \$29, $ACC5, $TEMP2 + vpand $AND_MASK, $ACC5, $ACC5 + vpsrlq \$29, $ACC6, $TEMP3 + vpermq \$0x93, $TEMP1, $TEMP1 + vpand $AND_MASK, $ACC6, $ACC6 + vpsrlq \$29, $ACC7, $TEMP4 + vpermq \$0x93, $TEMP2, $TEMP2 + vpand $AND_MASK, $ACC7, $ACC7 + vpsrlq \$29, $ACC8, $TEMP5 + vpermq \$0x93, $TEMP3, $TEMP3 + vpand $AND_MASK, $ACC8, $ACC8 + vpermq \$0x93, $TEMP4, $TEMP4 + + vpblendd \$3, $ZERO, $TEMP1, $TEMP0 + vpermq \$0x93, $TEMP5, $TEMP5 + vpblendd \$3, $TEMP1, $TEMP2, $TEMP1 + vpaddq $TEMP0, $ACC4, $ACC4 + vpblendd \$3, $TEMP2, $TEMP3, $TEMP2 + vpaddq $TEMP1, $ACC5, $ACC5 + vpblendd \$3, $TEMP3, $TEMP4, $TEMP3 + vpaddq $TEMP2, $ACC6, $ACC6 + vpblendd \$3, $TEMP4, $TEMP5, $TEMP4 + vpaddq $TEMP3, $ACC7, $ACC7 + vpaddq $TEMP4, $ACC8, $ACC8 + + vpsrlq \$29, $ACC4, $TEMP1 + vpand $AND_MASK, $ACC4, $ACC4 + vpsrlq \$29, $ACC5, $TEMP2 + vpand $AND_MASK, $ACC5, $ACC5 + vpsrlq \$29, $ACC6, $TEMP3 + vpermq \$0x93, $TEMP1, $TEMP1 + vpand $AND_MASK, $ACC6, $ACC6 + vpsrlq \$29, $ACC7, $TEMP4 + vpermq \$0x93, $TEMP2, $TEMP2 + vpand $AND_MASK, $ACC7, $ACC7 + vpsrlq \$29, $ACC8, $TEMP5 + vpermq \$0x93, $TEMP3, $TEMP3 + vpand $AND_MASK, $ACC8, $ACC8 + vpermq \$0x93, $TEMP4, $TEMP4 + + vpblendd \$3, $ZERO, $TEMP1, $TEMP0 + vpermq \$0x93, $TEMP5, $TEMP5 + vpblendd \$3, $TEMP1, $TEMP2, $TEMP1 + vpaddq $TEMP0, $ACC4, $ACC4 + vpblendd \$3, $TEMP2, $TEMP3, $TEMP2 + vpaddq $TEMP1, $ACC5, $ACC5 + vmovdqu $ACC4, 32*4-128($rp) + vpblendd \$3, $TEMP3, $TEMP4, $TEMP3 + vpaddq $TEMP2, $ACC6, $ACC6 + vmovdqu $ACC5, 32*5-128($rp) + vpblendd \$3, $TEMP4, $TEMP5, $TEMP4 + vpaddq $TEMP3, $ACC7, $ACC7 + vmovdqu $ACC6, 32*6-128($rp) + vpaddq $TEMP4, $ACC8, $ACC8 + vmovdqu $ACC7, 32*7-128($rp) + vmovdqu $ACC8, 32*8-128($rp) + + mov $rp, $ap + dec $rep + jne .LOOP_GRANDE_SQR_1024 + + vzeroall + mov %rbp, %rax +___ +$code.=<<___ if ($win64); + movaps -0xd8(%rax),%xmm6 + movaps -0xc8(%rax),%xmm7 + movaps -0xb8(%rax),%xmm8 + movaps -0xa8(%rax),%xmm9 + movaps -0x98(%rax),%xmm10 + movaps -0x88(%rax),%xmm11 + movaps -0x78(%rax),%xmm12 + movaps -0x68(%rax),%xmm13 + movaps -0x58(%rax),%xmm14 + movaps -0x48(%rax),%xmm15 +___ +$code.=<<___; + mov -48(%rax),%r15 + mov -40(%rax),%r14 + mov -32(%rax),%r13 + mov -24(%rax),%r12 + mov -16(%rax),%rbp + mov -8(%rax),%rbx + lea (%rax),%rsp # restore %rsp +.Lsqr_1024_epilogue: + ret +.size rsaz_1024_sqr_avx2,.-rsaz_1024_sqr_avx2 +___ +} + +{ # void AMM_WW( +my $rp="%rdi"; # BN_ULONG *rp, +my $ap="%rsi"; # const BN_ULONG *ap, +my $bp="%rdx"; # const BN_ULONG *bp, +my $np="%rcx"; # const BN_ULONG *np, +my $n0="%r8d"; # unsigned int n0); + +# The registers that hold the accumulated redundant result +# The AMM works on 1024 bit operands, and redundant word size is 29 +# Therefore: ceil(1024/29)/4 = 9 +my $ACC0="%ymm0"; +my $ACC1="%ymm1"; +my $ACC2="%ymm2"; +my $ACC3="%ymm3"; +my $ACC4="%ymm4"; +my $ACC5="%ymm5"; +my $ACC6="%ymm6"; +my $ACC7="%ymm7"; +my $ACC8="%ymm8"; +my $ACC9="%ymm9"; + +# Registers that hold the broadcasted words of multiplier, currently used +my $Bi="%ymm10"; +my $Yi="%ymm11"; + +# Helper registers +my $TEMP0=$ACC0; +my $TEMP1="%ymm12"; +my $TEMP2="%ymm13"; +my $ZERO="%ymm14"; +my $AND_MASK="%ymm15"; + +# alu registers that hold the first words of the ACC +my $r0="%r9"; +my $r1="%r10"; +my $r2="%r11"; +my $r3="%r12"; + +my $i="%r14d"; +my $tmp="%r15"; + +$bp="%r13"; # reassigned argument + +$code.=<<___; +.globl rsaz_1024_mul_avx2 +.type rsaz_1024_mul_avx2,\@function,5 +.align 64 +rsaz_1024_mul_avx2: + lea (%rsp), %rax + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 +___ +$code.=<<___ if ($win64); + vzeroupper + lea -0xa8(%rsp),%rsp + vmovaps %xmm6,-0xd8(%rax) + vmovaps %xmm7,-0xc8(%rax) + vmovaps %xmm8,-0xb8(%rax) + vmovaps %xmm9,-0xa8(%rax) + vmovaps %xmm10,-0x98(%rax) + vmovaps %xmm11,-0x88(%rax) + vmovaps %xmm12,-0x78(%rax) + vmovaps %xmm13,-0x68(%rax) + vmovaps %xmm14,-0x58(%rax) + vmovaps %xmm15,-0x48(%rax) +.Lmul_1024_body: +___ +$code.=<<___; + mov %rax,%rbp + vzeroall + mov %rdx, $bp # reassigned argument + sub \$64,%rsp + + # unaligned 256-bit load that crosses page boundary can + # cause severe performance degradation here, so if $ap does + # cross page boundary, swap it with $bp [meaning that caller + # is advised to lay down $ap and $bp next to each other, so + # that only one can cross page boundary]. + .byte 0x67,0x67 + mov $ap, $tmp + and \$4095, $tmp + add \$32*10, $tmp + shr \$12, $tmp + mov $ap, $tmp + cmovnz $bp, $ap + cmovnz $tmp, $bp + + mov $np, $tmp + sub \$-128,$ap # size optimization + sub \$-128,$np + sub \$-128,$rp + + and \$4095, $tmp # see if $np crosses page + add \$32*10, $tmp + .byte 0x67,0x67 + shr \$12, $tmp + jz .Lmul_1024_no_n_copy + + # unaligned 256-bit load that crosses page boundary can + # cause severe performance degradation here, so if $np does + # cross page boundary, copy it to stack and make sure stack + # frame doesn't... + sub \$32*10,%rsp + vmovdqu 32*0-128($np), $ACC0 + and \$-512, %rsp + vmovdqu 32*1-128($np), $ACC1 + vmovdqu 32*2-128($np), $ACC2 + vmovdqu 32*3-128($np), $ACC3 + vmovdqu 32*4-128($np), $ACC4 + vmovdqu 32*5-128($np), $ACC5 + vmovdqu 32*6-128($np), $ACC6 + vmovdqu 32*7-128($np), $ACC7 + vmovdqu 32*8-128($np), $ACC8 + lea 64+128(%rsp),$np + vmovdqu $ACC0, 32*0-128($np) + vpxor $ACC0, $ACC0, $ACC0 + vmovdqu $ACC1, 32*1-128($np) + vpxor $ACC1, $ACC1, $ACC1 + vmovdqu $ACC2, 32*2-128($np) + vpxor $ACC2, $ACC2, $ACC2 + vmovdqu $ACC3, 32*3-128($np) + vpxor $ACC3, $ACC3, $ACC3 + vmovdqu $ACC4, 32*4-128($np) + vpxor $ACC4, $ACC4, $ACC4 + vmovdqu $ACC5, 32*5-128($np) + vpxor $ACC5, $ACC5, $ACC5 + vmovdqu $ACC6, 32*6-128($np) + vpxor $ACC6, $ACC6, $ACC6 + vmovdqu $ACC7, 32*7-128($np) + vpxor $ACC7, $ACC7, $ACC7 + vmovdqu $ACC8, 32*8-128($np) + vmovdqa $ACC0, $ACC8 + vmovdqu $ACC9, 32*9-128($np) # $ACC9 is zero after vzeroall +.Lmul_1024_no_n_copy: + and \$-64,%rsp + + mov ($bp), %rbx + vpbroadcastq ($bp), $Bi + vmovdqu $ACC0, (%rsp) # clear top of stack + xor $r0, $r0 + .byte 0x67 + xor $r1, $r1 + xor $r2, $r2 + xor $r3, $r3 + + vmovdqu .Land_mask(%rip), $AND_MASK + mov \$9, $i + vmovdqu $ACC9, 32*9-128($rp) # $ACC9 is zero after vzeroall + jmp .Loop_mul_1024 + +.align 32 +.Loop_mul_1024: + vpsrlq \$29, $ACC3, $ACC9 # correct $ACC3(*) + mov %rbx, %rax + imulq -128($ap), %rax + add $r0, %rax + mov %rbx, $r1 + imulq 8-128($ap), $r1 + add 8(%rsp), $r1 + + mov %rax, $r0 + imull $n0, %eax + and \$0x1fffffff, %eax + + mov %rbx, $r2 + imulq 16-128($ap), $r2 + add 16(%rsp), $r2 + + mov %rbx, $r3 + imulq 24-128($ap), $r3 + add 24(%rsp), $r3 + vpmuludq 32*1-128($ap),$Bi,$TEMP0 + vmovd %eax, $Yi + vpaddq $TEMP0,$ACC1,$ACC1 + vpmuludq 32*2-128($ap),$Bi,$TEMP1 + vpbroadcastq $Yi, $Yi + vpaddq $TEMP1,$ACC2,$ACC2 + vpmuludq 32*3-128($ap),$Bi,$TEMP2 + vpand $AND_MASK, $ACC3, $ACC3 # correct $ACC3 + vpaddq $TEMP2,$ACC3,$ACC3 + vpmuludq 32*4-128($ap),$Bi,$TEMP0 + vpaddq $TEMP0,$ACC4,$ACC4 + vpmuludq 32*5-128($ap),$Bi,$TEMP1 + vpaddq $TEMP1,$ACC5,$ACC5 + vpmuludq 32*6-128($ap),$Bi,$TEMP2 + vpaddq $TEMP2,$ACC6,$ACC6 + vpmuludq 32*7-128($ap),$Bi,$TEMP0 + vpermq \$0x93, $ACC9, $ACC9 # correct $ACC3 + vpaddq $TEMP0,$ACC7,$ACC7 + vpmuludq 32*8-128($ap),$Bi,$TEMP1 + vpbroadcastq 8($bp), $Bi + vpaddq $TEMP1,$ACC8,$ACC8 + + mov %rax,%rdx + imulq -128($np),%rax + add %rax,$r0 + mov %rdx,%rax + imulq 8-128($np),%rax + add %rax,$r1 + mov %rdx,%rax + imulq 16-128($np),%rax + add %rax,$r2 + shr \$29, $r0 + imulq 24-128($np),%rdx + add %rdx,$r3 + add $r0, $r1 + + vpmuludq 32*1-128($np),$Yi,$TEMP2 + vmovq $Bi, %rbx + vpaddq $TEMP2,$ACC1,$ACC1 + vpmuludq 32*2-128($np),$Yi,$TEMP0 + vpaddq $TEMP0,$ACC2,$ACC2 + vpmuludq 32*3-128($np),$Yi,$TEMP1 + vpaddq $TEMP1,$ACC3,$ACC3 + vpmuludq 32*4-128($np),$Yi,$TEMP2 + vpaddq $TEMP2,$ACC4,$ACC4 + vpmuludq 32*5-128($np),$Yi,$TEMP0 + vpaddq $TEMP0,$ACC5,$ACC5 + vpmuludq 32*6-128($np),$Yi,$TEMP1 + vpaddq $TEMP1,$ACC6,$ACC6 + vpmuludq 32*7-128($np),$Yi,$TEMP2 + vpblendd \$3, $ZERO, $ACC9, $ACC9 # correct $ACC3 + vpaddq $TEMP2,$ACC7,$ACC7 + vpmuludq 32*8-128($np),$Yi,$TEMP0 + vpaddq $ACC9, $ACC3, $ACC3 # correct $ACC3 + vpaddq $TEMP0,$ACC8,$ACC8 + + mov %rbx, %rax + imulq -128($ap),%rax + add %rax,$r1 + vmovdqu -8+32*1-128($ap),$TEMP1 + mov %rbx, %rax + imulq 8-128($ap),%rax + add %rax,$r2 + vmovdqu -8+32*2-128($ap),$TEMP2 + + mov $r1, %rax + imull $n0, %eax + and \$0x1fffffff, %eax + + imulq 16-128($ap),%rbx + add %rbx,$r3 + vpmuludq $Bi,$TEMP1,$TEMP1 + vmovd %eax, $Yi + vmovdqu -8+32*3-128($ap),$TEMP0 + vpaddq $TEMP1,$ACC1,$ACC1 + vpmuludq $Bi,$TEMP2,$TEMP2 + vpbroadcastq $Yi, $Yi + vmovdqu -8+32*4-128($ap),$TEMP1 + vpaddq $TEMP2,$ACC2,$ACC2 + vpmuludq $Bi,$TEMP0,$TEMP0 + vmovdqu -8+32*5-128($ap),$TEMP2 + vpaddq $TEMP0,$ACC3,$ACC3 + vpmuludq $Bi,$TEMP1,$TEMP1 + vmovdqu -8+32*6-128($ap),$TEMP0 + vpaddq $TEMP1,$ACC4,$ACC4 + vpmuludq $Bi,$TEMP2,$TEMP2 + vmovdqu -8+32*7-128($ap),$TEMP1 + vpaddq $TEMP2,$ACC5,$ACC5 + vpmuludq $Bi,$TEMP0,$TEMP0 + vmovdqu -8+32*8-128($ap),$TEMP2 + vpaddq $TEMP0,$ACC6,$ACC6 + vpmuludq $Bi,$TEMP1,$TEMP1 + vmovdqu -8+32*9-128($ap),$ACC9 + vpaddq $TEMP1,$ACC7,$ACC7 + vpmuludq $Bi,$TEMP2,$TEMP2 + vpaddq $TEMP2,$ACC8,$ACC8 + vpmuludq $Bi,$ACC9,$ACC9 + vpbroadcastq 16($bp), $Bi + + mov %rax,%rdx + imulq -128($np),%rax + add %rax,$r1 + vmovdqu -8+32*1-128($np),$TEMP0 + mov %rdx,%rax + imulq 8-128($np),%rax + add %rax,$r2 + vmovdqu -8+32*2-128($np),$TEMP1 + shr \$29, $r1 + imulq 16-128($np),%rdx + add %rdx,$r3 + add $r1, $r2 + + vpmuludq $Yi,$TEMP0,$TEMP0 + vmovq $Bi, %rbx + vmovdqu -8+32*3-128($np),$TEMP2 + vpaddq $TEMP0,$ACC1,$ACC1 + vpmuludq $Yi,$TEMP1,$TEMP1 + vmovdqu -8+32*4-128($np),$TEMP0 + vpaddq $TEMP1,$ACC2,$ACC2 + vpmuludq $Yi,$TEMP2,$TEMP2 + vmovdqu -8+32*5-128($np),$TEMP1 + vpaddq $TEMP2,$ACC3,$ACC3 + vpmuludq $Yi,$TEMP0,$TEMP0 + vmovdqu -8+32*6-128($np),$TEMP2 + vpaddq $TEMP0,$ACC4,$ACC4 + vpmuludq $Yi,$TEMP1,$TEMP1 + vmovdqu -8+32*7-128($np),$TEMP0 + vpaddq $TEMP1,$ACC5,$ACC5 + vpmuludq $Yi,$TEMP2,$TEMP2 + vmovdqu -8+32*8-128($np),$TEMP1 + vpaddq $TEMP2,$ACC6,$ACC6 + vpmuludq $Yi,$TEMP0,$TEMP0 + vmovdqu -8+32*9-128($np),$TEMP2 + vpaddq $TEMP0,$ACC7,$ACC7 + vpmuludq $Yi,$TEMP1,$TEMP1 + vpaddq $TEMP1,$ACC8,$ACC8 + vpmuludq $Yi,$TEMP2,$TEMP2 + vpaddq $TEMP2,$ACC9,$ACC9 + + vmovdqu -16+32*1-128($ap),$TEMP0 + mov %rbx,%rax + imulq -128($ap),%rax + add $r2,%rax + + vmovdqu -16+32*2-128($ap),$TEMP1 + mov %rax,$r2 + imull $n0, %eax + and \$0x1fffffff, %eax + + imulq 8-128($ap),%rbx + add %rbx,$r3 + vpmuludq $Bi,$TEMP0,$TEMP0 + vmovd %eax, $Yi + vmovdqu -16+32*3-128($ap),$TEMP2 + vpaddq $TEMP0,$ACC1,$ACC1 + vpmuludq $Bi,$TEMP1,$TEMP1 + vpbroadcastq $Yi, $Yi + vmovdqu -16+32*4-128($ap),$TEMP0 + vpaddq $TEMP1,$ACC2,$ACC2 + vpmuludq $Bi,$TEMP2,$TEMP2 + vmovdqu -16+32*5-128($ap),$TEMP1 + vpaddq $TEMP2,$ACC3,$ACC3 + vpmuludq $Bi,$TEMP0,$TEMP0 + vmovdqu -16+32*6-128($ap),$TEMP2 + vpaddq $TEMP0,$ACC4,$ACC4 + vpmuludq $Bi,$TEMP1,$TEMP1 + vmovdqu -16+32*7-128($ap),$TEMP0 + vpaddq $TEMP1,$ACC5,$ACC5 + vpmuludq $Bi,$TEMP2,$TEMP2 + vmovdqu -16+32*8-128($ap),$TEMP1 + vpaddq $TEMP2,$ACC6,$ACC6 + vpmuludq $Bi,$TEMP0,$TEMP0 + vmovdqu -16+32*9-128($ap),$TEMP2 + vpaddq $TEMP0,$ACC7,$ACC7 + vpmuludq $Bi,$TEMP1,$TEMP1 + vpaddq $TEMP1,$ACC8,$ACC8 + vpmuludq $Bi,$TEMP2,$TEMP2 + vpbroadcastq 24($bp), $Bi + vpaddq $TEMP2,$ACC9,$ACC9 + + vmovdqu -16+32*1-128($np),$TEMP0 + mov %rax,%rdx + imulq -128($np),%rax + add %rax,$r2 + vmovdqu -16+32*2-128($np),$TEMP1 + imulq 8-128($np),%rdx + add %rdx,$r3 + shr \$29, $r2 + + vpmuludq $Yi,$TEMP0,$TEMP0 + vmovq $Bi, %rbx + vmovdqu -16+32*3-128($np),$TEMP2 + vpaddq $TEMP0,$ACC1,$ACC1 + vpmuludq $Yi,$TEMP1,$TEMP1 + vmovdqu -16+32*4-128($np),$TEMP0 + vpaddq $TEMP1,$ACC2,$ACC2 + vpmuludq $Yi,$TEMP2,$TEMP2 + vmovdqu -16+32*5-128($np),$TEMP1 + vpaddq $TEMP2,$ACC3,$ACC3 + vpmuludq $Yi,$TEMP0,$TEMP0 + vmovdqu -16+32*6-128($np),$TEMP2 + vpaddq $TEMP0,$ACC4,$ACC4 + vpmuludq $Yi,$TEMP1,$TEMP1 + vmovdqu -16+32*7-128($np),$TEMP0 + vpaddq $TEMP1,$ACC5,$ACC5 + vpmuludq $Yi,$TEMP2,$TEMP2 + vmovdqu -16+32*8-128($np),$TEMP1 + vpaddq $TEMP2,$ACC6,$ACC6 + vpmuludq $Yi,$TEMP0,$TEMP0 + vmovdqu -16+32*9-128($np),$TEMP2 + vpaddq $TEMP0,$ACC7,$ACC7 + vpmuludq $Yi,$TEMP1,$TEMP1 + vmovdqu -24+32*1-128($ap),$TEMP0 + vpaddq $TEMP1,$ACC8,$ACC8 + vpmuludq $Yi,$TEMP2,$TEMP2 + vmovdqu -24+32*2-128($ap),$TEMP1 + vpaddq $TEMP2,$ACC9,$ACC9 + + add $r2, $r3 + imulq -128($ap),%rbx + add %rbx,$r3 + + mov $r3, %rax + imull $n0, %eax + and \$0x1fffffff, %eax + + vpmuludq $Bi,$TEMP0,$TEMP0 + vmovd %eax, $Yi + vmovdqu -24+32*3-128($ap),$TEMP2 + vpaddq $TEMP0,$ACC1,$ACC1 + vpmuludq $Bi,$TEMP1,$TEMP1 + vpbroadcastq $Yi, $Yi + vmovdqu -24+32*4-128($ap),$TEMP0 + vpaddq $TEMP1,$ACC2,$ACC2 + vpmuludq $Bi,$TEMP2,$TEMP2 + vmovdqu -24+32*5-128($ap),$TEMP1 + vpaddq $TEMP2,$ACC3,$ACC3 + vpmuludq $Bi,$TEMP0,$TEMP0 + vmovdqu -24+32*6-128($ap),$TEMP2 + vpaddq $TEMP0,$ACC4,$ACC4 + vpmuludq $Bi,$TEMP1,$TEMP1 + vmovdqu -24+32*7-128($ap),$TEMP0 + vpaddq $TEMP1,$ACC5,$ACC5 + vpmuludq $Bi,$TEMP2,$TEMP2 + vmovdqu -24+32*8-128($ap),$TEMP1 + vpaddq $TEMP2,$ACC6,$ACC6 + vpmuludq $Bi,$TEMP0,$TEMP0 + vmovdqu -24+32*9-128($ap),$TEMP2 + vpaddq $TEMP0,$ACC7,$ACC7 + vpmuludq $Bi,$TEMP1,$TEMP1 + vpaddq $TEMP1,$ACC8,$ACC8 + vpmuludq $Bi,$TEMP2,$TEMP2 + vpbroadcastq 32($bp), $Bi + vpaddq $TEMP2,$ACC9,$ACC9 + add \$32, $bp # $bp++ + + vmovdqu -24+32*1-128($np),$TEMP0 + imulq -128($np),%rax + add %rax,$r3 + shr \$29, $r3 + + vmovdqu -24+32*2-128($np),$TEMP1 + vpmuludq $Yi,$TEMP0,$TEMP0 + vmovq $Bi, %rbx + vmovdqu -24+32*3-128($np),$TEMP2 + vpaddq $TEMP0,$ACC1,$ACC0 # $ACC0==$TEMP0 + vpmuludq $Yi,$TEMP1,$TEMP1 + vmovdqu $ACC0, (%rsp) # transfer $r0-$r3 + vpaddq $TEMP1,$ACC2,$ACC1 + vmovdqu -24+32*4-128($np),$TEMP0 + vpmuludq $Yi,$TEMP2,$TEMP2 + vmovdqu -24+32*5-128($np),$TEMP1 + vpaddq $TEMP2,$ACC3,$ACC2 + vpmuludq $Yi,$TEMP0,$TEMP0 + vmovdqu -24+32*6-128($np),$TEMP2 + vpaddq $TEMP0,$ACC4,$ACC3 + vpmuludq $Yi,$TEMP1,$TEMP1 + vmovdqu -24+32*7-128($np),$TEMP0 + vpaddq $TEMP1,$ACC5,$ACC4 + vpmuludq $Yi,$TEMP2,$TEMP2 + vmovdqu -24+32*8-128($np),$TEMP1 + vpaddq $TEMP2,$ACC6,$ACC5 + vpmuludq $Yi,$TEMP0,$TEMP0 + vmovdqu -24+32*9-128($np),$TEMP2 + mov $r3, $r0 + vpaddq $TEMP0,$ACC7,$ACC6 + vpmuludq $Yi,$TEMP1,$TEMP1 + add (%rsp), $r0 + vpaddq $TEMP1,$ACC8,$ACC7 + vpmuludq $Yi,$TEMP2,$TEMP2 + vmovq $r3, $TEMP1 + vpaddq $TEMP2,$ACC9,$ACC8 + + dec $i + jnz .Loop_mul_1024 +___ + +# (*) Original implementation was correcting ACC1-ACC3 for overflow +# after 7 loop runs, or after 28 iterations, or 56 additions. +# But as we underutilize resources, it's possible to correct in +# each iteration with marginal performance loss. But then, as +# we do it in each iteration, we can correct less digits, and +# avoid performance penalties completely. Also note that we +# correct only three digits out of four. This works because +# most significant digit is subjected to less additions. + +$TEMP0 = $ACC9; +$TEMP3 = $Bi; +$TEMP4 = $Yi; +$code.=<<___; + vpermq \$0, $AND_MASK, $AND_MASK + vpaddq (%rsp), $TEMP1, $ACC0 + + vpsrlq \$29, $ACC0, $TEMP1 + vpand $AND_MASK, $ACC0, $ACC0 + vpsrlq \$29, $ACC1, $TEMP2 + vpand $AND_MASK, $ACC1, $ACC1 + vpsrlq \$29, $ACC2, $TEMP3 + vpermq \$0x93, $TEMP1, $TEMP1 + vpand $AND_MASK, $ACC2, $ACC2 + vpsrlq \$29, $ACC3, $TEMP4 + vpermq \$0x93, $TEMP2, $TEMP2 + vpand $AND_MASK, $ACC3, $ACC3 + + vpblendd \$3, $ZERO, $TEMP1, $TEMP0 + vpermq \$0x93, $TEMP3, $TEMP3 + vpblendd \$3, $TEMP1, $TEMP2, $TEMP1 + vpermq \$0x93, $TEMP4, $TEMP4 + vpaddq $TEMP0, $ACC0, $ACC0 + vpblendd \$3, $TEMP2, $TEMP3, $TEMP2 + vpaddq $TEMP1, $ACC1, $ACC1 + vpblendd \$3, $TEMP3, $TEMP4, $TEMP3 + vpaddq $TEMP2, $ACC2, $ACC2 + vpblendd \$3, $TEMP4, $ZERO, $TEMP4 + vpaddq $TEMP3, $ACC3, $ACC3 + vpaddq $TEMP4, $ACC4, $ACC4 + + vpsrlq \$29, $ACC0, $TEMP1 + vpand $AND_MASK, $ACC0, $ACC0 + vpsrlq \$29, $ACC1, $TEMP2 + vpand $AND_MASK, $ACC1, $ACC1 + vpsrlq \$29, $ACC2, $TEMP3 + vpermq \$0x93, $TEMP1, $TEMP1 + vpand $AND_MASK, $ACC2, $ACC2 + vpsrlq \$29, $ACC3, $TEMP4 + vpermq \$0x93, $TEMP2, $TEMP2 + vpand $AND_MASK, $ACC3, $ACC3 + vpermq \$0x93, $TEMP3, $TEMP3 + + vpblendd \$3, $ZERO, $TEMP1, $TEMP0 + vpermq \$0x93, $TEMP4, $TEMP4 + vpblendd \$3, $TEMP1, $TEMP2, $TEMP1 + vpaddq $TEMP0, $ACC0, $ACC0 + vpblendd \$3, $TEMP2, $TEMP3, $TEMP2 + vpaddq $TEMP1, $ACC1, $ACC1 + vpblendd \$3, $TEMP3, $TEMP4, $TEMP3 + vpaddq $TEMP2, $ACC2, $ACC2 + vpblendd \$3, $TEMP4, $ZERO, $TEMP4 + vpaddq $TEMP3, $ACC3, $ACC3 + vpaddq $TEMP4, $ACC4, $ACC4 + + vmovdqu $ACC0, 0-128($rp) + vmovdqu $ACC1, 32-128($rp) + vmovdqu $ACC2, 64-128($rp) + vmovdqu $ACC3, 96-128($rp) +___ + +$TEMP5=$ACC0; +$code.=<<___; + vpsrlq \$29, $ACC4, $TEMP1 + vpand $AND_MASK, $ACC4, $ACC4 + vpsrlq \$29, $ACC5, $TEMP2 + vpand $AND_MASK, $ACC5, $ACC5 + vpsrlq \$29, $ACC6, $TEMP3 + vpermq \$0x93, $TEMP1, $TEMP1 + vpand $AND_MASK, $ACC6, $ACC6 + vpsrlq \$29, $ACC7, $TEMP4 + vpermq \$0x93, $TEMP2, $TEMP2 + vpand $AND_MASK, $ACC7, $ACC7 + vpsrlq \$29, $ACC8, $TEMP5 + vpermq \$0x93, $TEMP3, $TEMP3 + vpand $AND_MASK, $ACC8, $ACC8 + vpermq \$0x93, $TEMP4, $TEMP4 + + vpblendd \$3, $ZERO, $TEMP1, $TEMP0 + vpermq \$0x93, $TEMP5, $TEMP5 + vpblendd \$3, $TEMP1, $TEMP2, $TEMP1 + vpaddq $TEMP0, $ACC4, $ACC4 + vpblendd \$3, $TEMP2, $TEMP3, $TEMP2 + vpaddq $TEMP1, $ACC5, $ACC5 + vpblendd \$3, $TEMP3, $TEMP4, $TEMP3 + vpaddq $TEMP2, $ACC6, $ACC6 + vpblendd \$3, $TEMP4, $TEMP5, $TEMP4 + vpaddq $TEMP3, $ACC7, $ACC7 + vpaddq $TEMP4, $ACC8, $ACC8 + + vpsrlq \$29, $ACC4, $TEMP1 + vpand $AND_MASK, $ACC4, $ACC4 + vpsrlq \$29, $ACC5, $TEMP2 + vpand $AND_MASK, $ACC5, $ACC5 + vpsrlq \$29, $ACC6, $TEMP3 + vpermq \$0x93, $TEMP1, $TEMP1 + vpand $AND_MASK, $ACC6, $ACC6 + vpsrlq \$29, $ACC7, $TEMP4 + vpermq \$0x93, $TEMP2, $TEMP2 + vpand $AND_MASK, $ACC7, $ACC7 + vpsrlq \$29, $ACC8, $TEMP5 + vpermq \$0x93, $TEMP3, $TEMP3 + vpand $AND_MASK, $ACC8, $ACC8 + vpermq \$0x93, $TEMP4, $TEMP4 + + vpblendd \$3, $ZERO, $TEMP1, $TEMP0 + vpermq \$0x93, $TEMP5, $TEMP5 + vpblendd \$3, $TEMP1, $TEMP2, $TEMP1 + vpaddq $TEMP0, $ACC4, $ACC4 + vpblendd \$3, $TEMP2, $TEMP3, $TEMP2 + vpaddq $TEMP1, $ACC5, $ACC5 + vpblendd \$3, $TEMP3, $TEMP4, $TEMP3 + vpaddq $TEMP2, $ACC6, $ACC6 + vpblendd \$3, $TEMP4, $TEMP5, $TEMP4 + vpaddq $TEMP3, $ACC7, $ACC7 + vpaddq $TEMP4, $ACC8, $ACC8 + + vmovdqu $ACC4, 128-128($rp) + vmovdqu $ACC5, 160-128($rp) + vmovdqu $ACC6, 192-128($rp) + vmovdqu $ACC7, 224-128($rp) + vmovdqu $ACC8, 256-128($rp) + vzeroupper + + mov %rbp, %rax +___ +$code.=<<___ if ($win64); + movaps -0xd8(%rax),%xmm6 + movaps -0xc8(%rax),%xmm7 + movaps -0xb8(%rax),%xmm8 + movaps -0xa8(%rax),%xmm9 + movaps -0x98(%rax),%xmm10 + movaps -0x88(%rax),%xmm11 + movaps -0x78(%rax),%xmm12 + movaps -0x68(%rax),%xmm13 + movaps -0x58(%rax),%xmm14 + movaps -0x48(%rax),%xmm15 +___ +$code.=<<___; + mov -48(%rax),%r15 + mov -40(%rax),%r14 + mov -32(%rax),%r13 + mov -24(%rax),%r12 + mov -16(%rax),%rbp + mov -8(%rax),%rbx + lea (%rax),%rsp # restore %rsp +.Lmul_1024_epilogue: + ret +.size rsaz_1024_mul_avx2,.-rsaz_1024_mul_avx2 +___ +} +{ +my ($out,$inp) = $win64 ? ("%rcx","%rdx") : ("%rdi","%rsi"); +my @T = map("%r$_",(8..11)); + +$code.=<<___; +.globl rsaz_1024_red2norm_avx2 +.type rsaz_1024_red2norm_avx2,\@abi-omnipotent +.align 32 +rsaz_1024_red2norm_avx2: + sub \$-128,$inp # size optimization + xor %rax,%rax +___ + +for ($j=0,$i=0; $i<16; $i++) { + my $k=0; + while (29*$j<64*($i+1)) { # load data till boundary + $code.=" mov `8*$j-128`($inp), @T[0]\n"; + $j++; $k++; push(@T,shift(@T)); + } + $l=$k; + while ($k>1) { # shift loaded data but last value + $code.=" shl \$`29*($j-$k)`,@T[-$k]\n"; + $k--; + } + $code.=<<___; # shift last value + mov @T[-1], @T[0] + shl \$`29*($j-1)`, @T[-1] + shr \$`-29*($j-1)`, @T[0] +___ + while ($l) { # accumulate all values + $code.=" add @T[-$l], %rax\n"; + $l--; + } + $code.=<<___; + adc \$0, @T[0] # consume eventual carry + mov %rax, 8*$i($out) + mov @T[0], %rax +___ + push(@T,shift(@T)); +} +$code.=<<___; + ret +.size rsaz_1024_red2norm_avx2,.-rsaz_1024_red2norm_avx2 + +.globl rsaz_1024_norm2red_avx2 +.type rsaz_1024_norm2red_avx2,\@abi-omnipotent +.align 32 +rsaz_1024_norm2red_avx2: + sub \$-128,$out # size optimization + mov ($inp),@T[0] + mov \$0x1fffffff,%eax +___ +for ($j=0,$i=0; $i<16; $i++) { + $code.=" mov `8*($i+1)`($inp),@T[1]\n" if ($i<15); + $code.=" xor @T[1],@T[1]\n" if ($i==15); + my $k=1; + while (29*($j+1)<64*($i+1)) { + $code.=<<___; + mov @T[0],@T[-$k] + shr \$`29*$j`,@T[-$k] + and %rax,@T[-$k] # &0x1fffffff + mov @T[-$k],`8*$j-128`($out) +___ + $j++; $k++; + } + $code.=<<___; + shrd \$`29*$j`,@T[1],@T[0] + and %rax,@T[0] + mov @T[0],`8*$j-128`($out) +___ + $j++; + push(@T,shift(@T)); +} +$code.=<<___; + mov @T[0],`8*$j-128`($out) # zero + mov @T[0],`8*($j+1)-128`($out) + mov @T[0],`8*($j+2)-128`($out) + mov @T[0],`8*($j+3)-128`($out) + ret +.size rsaz_1024_norm2red_avx2,.-rsaz_1024_norm2red_avx2 +___ +} +{ +my ($out,$inp,$power) = $win64 ? ("%rcx","%rdx","%r8d") : ("%rdi","%rsi","%edx"); + +$code.=<<___; +.globl rsaz_1024_scatter5_avx2 +.type rsaz_1024_scatter5_avx2,\@abi-omnipotent +.align 32 +rsaz_1024_scatter5_avx2: + vzeroupper + vmovdqu .Lscatter_permd(%rip),%ymm5 + shl \$4,$power + lea ($out,$power),$out + mov \$9,%eax + jmp .Loop_scatter_1024 + +.align 32 +.Loop_scatter_1024: + vmovdqu ($inp),%ymm0 + lea 32($inp),$inp + vpermd %ymm0,%ymm5,%ymm0 + vmovdqu %xmm0,($out) + lea 16*32($out),$out + dec %eax + jnz .Loop_scatter_1024 + + vzeroupper + ret +.size rsaz_1024_scatter5_avx2,.-rsaz_1024_scatter5_avx2 + +.globl rsaz_1024_gather5_avx2 +.type rsaz_1024_gather5_avx2,\@abi-omnipotent +.align 32 +rsaz_1024_gather5_avx2: +___ +$code.=<<___ if ($win64); + lea -0x88(%rsp),%rax + vzeroupper +.LSEH_begin_rsaz_1024_gather5: + # I can't trust assembler to use specific encoding:-( + .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax),%rsp + .byte 0xc5,0xf8,0x29,0x70,0xe0 #vmovaps %xmm6,-0x20(%rax) + .byte 0xc5,0xf8,0x29,0x78,0xf0 #vmovaps %xmm7,-0x10(%rax) + .byte 0xc5,0x78,0x29,0x40,0x00 #vmovaps %xmm8,0(%rax) + .byte 0xc5,0x78,0x29,0x48,0x10 #vmovaps %xmm9,0x10(%rax) + .byte 0xc5,0x78,0x29,0x50,0x20 #vmovaps %xmm10,0x20(%rax) + .byte 0xc5,0x78,0x29,0x58,0x30 #vmovaps %xmm11,0x30(%rax) + .byte 0xc5,0x78,0x29,0x60,0x40 #vmovaps %xmm12,0x40(%rax) + .byte 0xc5,0x78,0x29,0x68,0x50 #vmovaps %xmm13,0x50(%rax) + .byte 0xc5,0x78,0x29,0x70,0x60 #vmovaps %xmm14,0x60(%rax) + .byte 0xc5,0x78,0x29,0x78,0x70 #vmovaps %xmm15,0x70(%rax) +___ +$code.=<<___; + lea .Lgather_table(%rip),%r11 + mov $power,%eax + and \$3,$power + shr \$2,%eax # cache line number + shl \$4,$power # offset within cache line + + vmovdqu -32(%r11),%ymm7 # .Lgather_permd + vpbroadcastb 8(%r11,%rax), %xmm8 + vpbroadcastb 7(%r11,%rax), %xmm9 + vpbroadcastb 6(%r11,%rax), %xmm10 + vpbroadcastb 5(%r11,%rax), %xmm11 + vpbroadcastb 4(%r11,%rax), %xmm12 + vpbroadcastb 3(%r11,%rax), %xmm13 + vpbroadcastb 2(%r11,%rax), %xmm14 + vpbroadcastb 1(%r11,%rax), %xmm15 + + lea 64($inp,$power),$inp + mov \$64,%r11 # size optimization + mov \$9,%eax + jmp .Loop_gather_1024 + +.align 32 +.Loop_gather_1024: + vpand -64($inp), %xmm8,%xmm0 + vpand ($inp), %xmm9,%xmm1 + vpand 64($inp), %xmm10,%xmm2 + vpand ($inp,%r11,2), %xmm11,%xmm3 + vpor %xmm0,%xmm1,%xmm1 + vpand 64($inp,%r11,2), %xmm12,%xmm4 + vpor %xmm2,%xmm3,%xmm3 + vpand ($inp,%r11,4), %xmm13,%xmm5 + vpor %xmm1,%xmm3,%xmm3 + vpand 64($inp,%r11,4), %xmm14,%xmm6 + vpor %xmm4,%xmm5,%xmm5 + vpand -128($inp,%r11,8), %xmm15,%xmm2 + lea ($inp,%r11,8),$inp + vpor %xmm3,%xmm5,%xmm5 + vpor %xmm2,%xmm6,%xmm6 + vpor %xmm5,%xmm6,%xmm6 + vpermd %ymm6,%ymm7,%ymm6 + vmovdqu %ymm6,($out) + lea 32($out),$out + dec %eax + jnz .Loop_gather_1024 + + vpxor %ymm0,%ymm0,%ymm0 + vmovdqu %ymm0,($out) + vzeroupper +___ +$code.=<<___ if ($win64); + movaps (%rsp),%xmm6 + movaps 0x10(%rsp),%xmm7 + movaps 0x20(%rsp),%xmm8 + movaps 0x30(%rsp),%xmm9 + movaps 0x40(%rsp),%xmm10 + movaps 0x50(%rsp),%xmm11 + movaps 0x60(%rsp),%xmm12 + movaps 0x70(%rsp),%xmm13 + movaps 0x80(%rsp),%xmm14 + movaps 0x90(%rsp),%xmm15 + lea 0xa8(%rsp),%rsp +.LSEH_end_rsaz_1024_gather5: +___ +$code.=<<___; + ret +.size rsaz_1024_gather5_avx2,.-rsaz_1024_gather5_avx2 +___ +} + +$code.=<<___; +.extern OPENSSL_ia32cap_P +.globl rsaz_avx2_eligible +.type rsaz_avx2_eligible,\@abi-omnipotent +.align 32 +rsaz_avx2_eligible: + mov OPENSSL_ia32cap_P+8(%rip),%eax +___ +$code.=<<___ if ($addx); + mov \$`1<<8|1<<19`,%ecx + mov \$0,%edx + and %eax,%ecx + cmp \$`1<<8|1<<19`,%ecx # check for BMI2+AD*X + cmove %edx,%eax +___ +$code.=<<___; + and \$`1<<5`,%eax + shr \$5,%eax + ret +.size rsaz_avx2_eligible,.-rsaz_avx2_eligible + +.align 64 +.Land_mask: + .quad 0x1fffffff,0x1fffffff,0x1fffffff,-1 +.Lscatter_permd: + .long 0,2,4,6,7,7,7,7 +.Lgather_permd: + .long 0,7,1,7,2,7,3,7 +.Lgather_table: + .byte 0,0,0,0,0,0,0,0, 0xff,0,0,0,0,0,0,0 +.align 64 +___ + +if ($win64) { +$rec="%rcx"; +$frame="%rdx"; +$context="%r8"; +$disp="%r9"; + +$code.=<<___ +.extern __imp_RtlVirtualUnwind +.type rsaz_se_handler,\@abi-omnipotent +.align 16 +rsaz_se_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp + + mov 120($context),%rax # pull context->Rax + mov 248($context),%rbx # pull context->Rip + + mov 8($disp),%rsi # disp->ImageBase + mov 56($disp),%r11 # disp->HandlerData + + mov 0(%r11),%r10d # HandlerData[0] + lea (%rsi,%r10),%r10 # prologue label + cmp %r10,%rbx # context->Rip<prologue label + jb .Lcommon_seh_tail + + mov 152($context),%rax # pull context->Rsp + + mov 4(%r11),%r10d # HandlerData[1] + lea (%rsi,%r10),%r10 # epilogue label + cmp %r10,%rbx # context->Rip>=epilogue label + jae .Lcommon_seh_tail + + mov 160($context),%rax # pull context->Rbp + + mov -48(%rax),%r15 + mov -40(%rax),%r14 + mov -32(%rax),%r13 + mov -24(%rax),%r12 + mov -16(%rax),%rbp + mov -8(%rax),%rbx + mov %r15,240($context) + mov %r14,232($context) + mov %r13,224($context) + mov %r12,216($context) + mov %rbp,160($context) + mov %rbx,144($context) + + lea -0xd8(%rax),%rsi # %xmm save area + lea 512($context),%rdi # & context.Xmm6 + mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) + .long 0xa548f3fc # cld; rep movsq + +.Lcommon_seh_tail: + mov 8(%rax),%rdi + mov 16(%rax),%rsi + mov %rax,152($context) # restore context->Rsp + mov %rsi,168($context) # restore context->Rsi + mov %rdi,176($context) # restore context->Rdi + + mov 40($disp),%rdi # disp->ContextRecord + mov $context,%rsi # context + mov \$154,%ecx # sizeof(CONTEXT) + .long 0xa548f3fc # cld; rep movsq + + mov $disp,%rsi + xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER + mov 8(%rsi),%rdx # arg2, disp->ImageBase + mov 0(%rsi),%r8 # arg3, disp->ControlPc + mov 16(%rsi),%r9 # arg4, disp->FunctionEntry + mov 40(%rsi),%r10 # disp->ContextRecord + lea 56(%rsi),%r11 # &disp->HandlerData + lea 24(%rsi),%r12 # &disp->EstablisherFrame + mov %r10,32(%rsp) # arg5 + mov %r11,40(%rsp) # arg6 + mov %r12,48(%rsp) # arg7 + mov %rcx,56(%rsp) # arg8, (NULL) + call *__imp_RtlVirtualUnwind(%rip) + + mov \$1,%eax # ExceptionContinueSearch + add \$64,%rsp + popfq + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + pop %rdi + pop %rsi + ret +.size rsaz_se_handler,.-rsaz_se_handler + +.section .pdata +.align 4 + .rva .LSEH_begin_rsaz_1024_sqr_avx2 + .rva .LSEH_end_rsaz_1024_sqr_avx2 + .rva .LSEH_info_rsaz_1024_sqr_avx2 + + .rva .LSEH_begin_rsaz_1024_mul_avx2 + .rva .LSEH_end_rsaz_1024_mul_avx2 + .rva .LSEH_info_rsaz_1024_mul_avx2 + + .rva .LSEH_begin_rsaz_1024_gather5 + .rva .LSEH_end_rsaz_1024_gather5 + .rva .LSEH_info_rsaz_1024_gather5 +.section .xdata +.align 8 +.LSEH_info_rsaz_1024_sqr_avx2: + .byte 9,0,0,0 + .rva rsaz_se_handler + .rva .Lsqr_1024_body,.Lsqr_1024_epilogue +.LSEH_info_rsaz_1024_mul_avx2: + .byte 9,0,0,0 + .rva rsaz_se_handler + .rva .Lmul_1024_body,.Lmul_1024_epilogue +.LSEH_info_rsaz_1024_gather5: + .byte 0x01,0x33,0x16,0x00 + .byte 0x36,0xf8,0x09,0x00 #vmovaps 0x90(rsp),xmm15 + .byte 0x31,0xe8,0x08,0x00 #vmovaps 0x80(rsp),xmm14 + .byte 0x2c,0xd8,0x07,0x00 #vmovaps 0x70(rsp),xmm13 + .byte 0x27,0xc8,0x06,0x00 #vmovaps 0x60(rsp),xmm12 + .byte 0x22,0xb8,0x05,0x00 #vmovaps 0x50(rsp),xmm11 + .byte 0x1d,0xa8,0x04,0x00 #vmovaps 0x40(rsp),xmm10 + .byte 0x18,0x98,0x03,0x00 #vmovaps 0x30(rsp),xmm9 + .byte 0x13,0x88,0x02,0x00 #vmovaps 0x20(rsp),xmm8 + .byte 0x0e,0x78,0x01,0x00 #vmovaps 0x10(rsp),xmm7 + .byte 0x09,0x68,0x00,0x00 #vmovaps 0x00(rsp),xmm6 + .byte 0x04,0x01,0x15,0x00 #sub rsp,0xa8 +___ +} + +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval($1)/ge; + + s/\b(sh[rl]d?\s+\$)(-?[0-9]+)/$1.$2%64/ge or + + s/\b(vmov[dq])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or + s/\b(vmovdqu)\b(.+)%x%ymm([0-9]+)/$1$2%xmm$3/go or + s/\b(vpinsr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or + s/\b(vpextr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or + s/\b(vpbroadcast[qd]\s+)%ymm([0-9]+)/$1%xmm$2/go; + print $_,"\n"; +} + +}}} else {{{ +print <<___; # assembler is too old +.text + +.globl rsaz_avx2_eligible +.type rsaz_avx2_eligible,\@abi-omnipotent +rsaz_avx2_eligible: + xor %eax,%eax + ret +.size rsaz_avx2_eligible,.-rsaz_avx2_eligible + +.globl rsaz_1024_sqr_avx2 +.globl rsaz_1024_mul_avx2 +.globl rsaz_1024_norm2red_avx2 +.globl rsaz_1024_red2norm_avx2 +.globl rsaz_1024_scatter5_avx2 +.globl rsaz_1024_gather5_avx2 +.type rsaz_1024_sqr_avx2,\@abi-omnipotent +rsaz_1024_sqr_avx2: +rsaz_1024_mul_avx2: +rsaz_1024_norm2red_avx2: +rsaz_1024_red2norm_avx2: +rsaz_1024_scatter5_avx2: +rsaz_1024_gather5_avx2: + .byte 0x0f,0x0b # ud2 + ret +.size rsaz_1024_sqr_avx2,.-rsaz_1024_sqr_avx2 +___ +}}} + +close STDOUT; diff --git a/crypto/bn/asm/rsaz-x86_64.pl b/crypto/bn/asm/rsaz-x86_64.pl new file mode 100755 index 0000000..3bd45db --- /dev/null +++ b/crypto/bn/asm/rsaz-x86_64.pl @@ -0,0 +1,2144 @@ +#!/usr/bin/env perl + +############################################################################## +# # +# Copyright (c) 2012, Intel Corporation # +# # +# All rights reserved. # +# # +# Redistribution and use in source and binary forms, with or without # +# modification, are permitted provided that the following conditions are # +# met: # +# # +# * Redistributions of source code must retain the above copyright # +# notice, this list of conditions and the following disclaimer. # +# # +# * Redistributions in binary form must reproduce the above copyright # +# notice, this list of conditions and the following disclaimer in the # +# documentation and/or other materials provided with the # +# distribution. # +# # +# * Neither the name of the Intel Corporation nor the names of its # +# contributors may be used to endorse or promote products derived from # +# this software without specific prior written permission. # +# # +# # +# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY # +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR # +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR # +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, # +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, # +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR # +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF # +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING # +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS # +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # +# # +############################################################################## +# Developers and authors: # +# Shay Gueron (1, 2), and Vlad Krasnov (1) # +# (1) Intel Architecture Group, Microprocessor and Chipset Development, # +# Israel Development Center, Haifa, Israel # +# (2) University of Haifa # +############################################################################## +# Reference: # +# [1] S. Gueron, "Efficient Software Implementations of Modular # +# Exponentiation", http://eprint.iacr.org/2011/239 # +# [2] S. Gueron, V. Krasnov. "Speeding up Big-Numbers Squaring". # +# IEEE Proceedings of 9th International Conference on Information # +# Technology: New Generations (ITNG 2012), 821-823 (2012). # +# [3] S. Gueron, Efficient Software Implementations of Modular Exponentiation# +# Journal of Cryptographic Engineering 2:31-43 (2012). # +# [4] S. Gueron, V. Krasnov: "[PATCH] Efficient and side channel analysis # +# resistant 512-bit and 1024-bit modular exponentiation for optimizing # +# RSA1024 and RSA2048 on x86_64 platforms", # +# http://rt.openssl.org/Ticket/Display.html?id=2582&user=guest&pass=guest# +############################################################################## + +# While original submission covers 512- and 1024-bit exponentiation, +# this module is limited to 512-bit version only (and as such +# accelerates RSA1024 sign). This is because improvement for longer +# keys is not high enough to justify the effort, highest measured +# was ~5% on Westmere. [This is relative to OpenSSL 1.0.2, upcoming +# for the moment of this writing!] Nor does this module implement +# "monolithic" complete exponentiation jumbo-subroutine, but adheres +# to more modular mixture of C and assembly. And it's optimized even +# for processors other than Intel Core family (see table below for +# improvement coefficients). +# <appro@openssl.org> +# +# RSA1024 sign/sec this/original |this/rsax(*) this/fips(*) +# ----------------+--------------------------- +# Opteron +13% |+5% +20% +# Bulldozer -0% |-1% +10% +# P4 +11% |+7% +8% +# Westmere +5% |+14% +17% +# Sandy Bridge +2% |+12% +29% +# Ivy Bridge +1% |+11% +35% +# Haswell(**) -0% |+12% +39% +# Atom +13% |+11% +4% +# VIA Nano +70% |+9% +25% +# +# (*) rsax engine and fips numbers are presented for reference +# purposes; +# (**) MULX was attempted, but found to give only marginal improvement; + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open OUT,"| \"$^X\" $xlate $flavour $output"; +*STDOUT=*OUT; + +if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` + =~ /GNU assembler version ([2-9]\.[0-9]+)/) { + $addx = ($1>=2.23); +} + +if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && + `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { + $addx = ($1>=2.10); +} + +if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && + `ml64 2>&1` =~ /Version ([0-9]+)\./) { + $addx = ($1>=12); +} + +if (!$addx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9])\.([0-9]+)/) { + my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10 + $addx = ($ver>=3.03); +} + +($out, $inp, $mod) = ("%rdi", "%rsi", "%rbp"); # common internal API +{ +my ($out,$inp,$mod,$n0,$times) = ("%rdi","%rsi","%rdx","%rcx","%r8d"); + +$code.=<<___; +.text + +.extern OPENSSL_ia32cap_P + +.globl rsaz_512_sqr +.type rsaz_512_sqr,\@function,5 +.align 32 +rsaz_512_sqr: # 25-29% faster than rsaz_512_mul + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + + subq \$128+24, %rsp +.Lsqr_body: + movq $mod, %rbp # common argument + movq ($inp), %rdx + movq 8($inp), %rax + movq $n0, 128(%rsp) +___ +$code.=<<___ if ($addx); + movl \$0x80100,%r11d + andl OPENSSL_ia32cap_P+8(%rip),%r11d + cmpl \$0x80100,%r11d # check for MULX and ADO/CX + je .Loop_sqrx +___ +$code.=<<___; + jmp .Loop_sqr + +.align 32 +.Loop_sqr: + movl $times,128+8(%rsp) +#first iteration + movq %rdx, %rbx + mulq %rdx + movq %rax, %r8 + movq 16($inp), %rax + movq %rdx, %r9 + + mulq %rbx + addq %rax, %r9 + movq 24($inp), %rax + movq %rdx, %r10 + adcq \$0, %r10 + + mulq %rbx + addq %rax, %r10 + movq 32($inp), %rax + movq %rdx, %r11 + adcq \$0, %r11 + + mulq %rbx + addq %rax, %r11 + movq 40($inp), %rax + movq %rdx, %r12 + adcq \$0, %r12 + + mulq %rbx + addq %rax, %r12 + movq 48($inp), %rax + movq %rdx, %r13 + adcq \$0, %r13 + + mulq %rbx + addq %rax, %r13 + movq 56($inp), %rax + movq %rdx, %r14 + adcq \$0, %r14 + + mulq %rbx + addq %rax, %r14 + movq %rbx, %rax + movq %rdx, %r15 + adcq \$0, %r15 + + addq %r8, %r8 #shlq \$1, %r8 + movq %r9, %rcx + adcq %r9, %r9 #shld \$1, %r8, %r9 + + mulq %rax + movq %rax, (%rsp) + addq %rdx, %r8 + adcq \$0, %r9 + + movq %r8, 8(%rsp) + shrq \$63, %rcx + +#second iteration + movq 8($inp), %r8 + movq 16($inp), %rax + mulq %r8 + addq %rax, %r10 + movq 24($inp), %rax + movq %rdx, %rbx + adcq \$0, %rbx + + mulq %r8 + addq %rax, %r11 + movq 32($inp), %rax + adcq \$0, %rdx + addq %rbx, %r11 + movq %rdx, %rbx + adcq \$0, %rbx + + mulq %r8 + addq %rax, %r12 + movq 40($inp), %rax + adcq \$0, %rdx + addq %rbx, %r12 + movq %rdx, %rbx + adcq \$0, %rbx + + mulq %r8 + addq %rax, %r13 + movq 48($inp), %rax + adcq \$0, %rdx + addq %rbx, %r13 + movq %rdx, %rbx + adcq \$0, %rbx + + mulq %r8 + addq %rax, %r14 + movq 56($inp), %rax + adcq \$0, %rdx + addq %rbx, %r14 + movq %rdx, %rbx + adcq \$0, %rbx + + mulq %r8 + addq %rax, %r15 + movq %r8, %rax + adcq \$0, %rdx + addq %rbx, %r15 + movq %rdx, %r8 + movq %r10, %rdx + adcq \$0, %r8 + + add %rdx, %rdx + lea (%rcx,%r10,2), %r10 #shld \$1, %rcx, %r10 + movq %r11, %rbx + adcq %r11, %r11 #shld \$1, %r10, %r11 + + mulq %rax + addq %rax, %r9 + adcq %rdx, %r10 + adcq \$0, %r11 + + movq %r9, 16(%rsp) + movq %r10, 24(%rsp) + shrq \$63, %rbx + +#third iteration + movq 16($inp), %r9 + movq 24($inp), %rax + mulq %r9 + addq %rax, %r12 + movq 32($inp), %rax + movq %rdx, %rcx + adcq \$0, %rcx + + mulq %r9 + addq %rax, %r13 + movq 40($inp), %rax + adcq \$0, %rdx + addq %rcx, %r13 + movq %rdx, %rcx + adcq \$0, %rcx + + mulq %r9 + addq %rax, %r14 + movq 48($inp), %rax + adcq \$0, %rdx + addq %rcx, %r14 + movq %rdx, %rcx + adcq \$0, %rcx + + mulq %r9 + movq %r12, %r10 + lea (%rbx,%r12,2), %r12 #shld \$1, %rbx, %r12 + addq %rax, %r15 + movq 56($inp), %rax + adcq \$0, %rdx + addq %rcx, %r15 + movq %rdx, %rcx + adcq \$0, %rcx + + mulq %r9 + shrq \$63, %r10 + addq %rax, %r8 + movq %r9, %rax + adcq \$0, %rdx + addq %rcx, %r8 + movq %rdx, %r9 + adcq \$0, %r9 + + movq %r13, %rcx + leaq (%r10,%r13,2), %r13 #shld \$1, %r12, %r13 + + mulq %rax + addq %rax, %r11 + adcq %rdx, %r12 + adcq \$0, %r13 + + movq %r11, 32(%rsp) + movq %r12, 40(%rsp) + shrq \$63, %rcx + +#fourth iteration + movq 24($inp), %r10 + movq 32($inp), %rax + mulq %r10 + addq %rax, %r14 + movq 40($inp), %rax + movq %rdx, %rbx + adcq \$0, %rbx + + mulq %r10 + addq %rax, %r15 + movq 48($inp), %rax + adcq \$0, %rdx + addq %rbx, %r15 + movq %rdx, %rbx + adcq \$0, %rbx + + mulq %r10 + movq %r14, %r12 + leaq (%rcx,%r14,2), %r14 #shld \$1, %rcx, %r14 + addq %rax, %r8 + movq 56($inp), %rax + adcq \$0, %rdx + addq %rbx, %r8 + movq %rdx, %rbx + adcq \$0, %rbx + + mulq %r10 + shrq \$63, %r12 + addq %rax, %r9 + movq %r10, %rax + adcq \$0, %rdx + addq %rbx, %r9 + movq %rdx, %r10 + adcq \$0, %r10 + + movq %r15, %rbx + leaq (%r12,%r15,2),%r15 #shld \$1, %r14, %r15 + + mulq %rax + addq %rax, %r13 + adcq %rdx, %r14 + adcq \$0, %r15 + + movq %r13, 48(%rsp) + movq %r14, 56(%rsp) + shrq \$63, %rbx + +#fifth iteration + movq 32($inp), %r11 + movq 40($inp), %rax + mulq %r11 + addq %rax, %r8 + movq 48($inp), %rax + movq %rdx, %rcx + adcq \$0, %rcx + + mulq %r11 + addq %rax, %r9 + movq 56($inp), %rax + adcq \$0, %rdx + movq %r8, %r12 + leaq (%rbx,%r8,2), %r8 #shld \$1, %rbx, %r8 + addq %rcx, %r9 + movq %rdx, %rcx + adcq \$0, %rcx + + mulq %r11 + shrq \$63, %r12 + addq %rax, %r10 + movq %r11, %rax + adcq \$0, %rdx + addq %rcx, %r10 + movq %rdx, %r11 + adcq \$0, %r11 + + movq %r9, %rcx + leaq (%r12,%r9,2), %r9 #shld \$1, %r8, %r9 + + mulq %rax + addq %rax, %r15 + adcq %rdx, %r8 + adcq \$0, %r9 + + movq %r15, 64(%rsp) + movq %r8, 72(%rsp) + shrq \$63, %rcx + +#sixth iteration + movq 40($inp), %r12 + movq 48($inp), %rax + mulq %r12 + addq %rax, %r10 + movq 56($inp), %rax + movq %rdx, %rbx + adcq \$0, %rbx + + mulq %r12 + addq %rax, %r11 + movq %r12, %rax + movq %r10, %r15 + leaq (%rcx,%r10,2), %r10 #shld \$1, %rcx, %r10 + adcq \$0, %rdx + shrq \$63, %r15 + addq %rbx, %r11 + movq %rdx, %r12 + adcq \$0, %r12 + + movq %r11, %rbx + leaq (%r15,%r11,2), %r11 #shld \$1, %r10, %r11 + + mulq %rax + addq %rax, %r9 + adcq %rdx, %r10 + adcq \$0, %r11 + + movq %r9, 80(%rsp) + movq %r10, 88(%rsp) + +#seventh iteration + movq 48($inp), %r13 + movq 56($inp), %rax + mulq %r13 + addq %rax, %r12 + movq %r13, %rax + movq %rdx, %r13 + adcq \$0, %r13 + + xorq %r14, %r14 + shlq \$1, %rbx + adcq %r12, %r12 #shld \$1, %rbx, %r12 + adcq %r13, %r13 #shld \$1, %r12, %r13 + adcq %r14, %r14 #shld \$1, %r13, %r14 + + mulq %rax + addq %rax, %r11 + adcq %rdx, %r12 + adcq \$0, %r13 + + movq %r11, 96(%rsp) + movq %r12, 104(%rsp) + +#eighth iteration + movq 56($inp), %rax + mulq %rax + addq %rax, %r13 + adcq \$0, %rdx + + addq %rdx, %r14 + + movq %r13, 112(%rsp) + movq %r14, 120(%rsp) + + movq (%rsp), %r8 + movq 8(%rsp), %r9 + movq 16(%rsp), %r10 + movq 24(%rsp), %r11 + movq 32(%rsp), %r12 + movq 40(%rsp), %r13 + movq 48(%rsp), %r14 + movq 56(%rsp), %r15 + + call __rsaz_512_reduce + + addq 64(%rsp), %r8 + adcq 72(%rsp), %r9 + adcq 80(%rsp), %r10 + adcq 88(%rsp), %r11 + adcq 96(%rsp), %r12 + adcq 104(%rsp), %r13 + adcq 112(%rsp), %r14 + adcq 120(%rsp), %r15 + sbbq %rcx, %rcx + + call __rsaz_512_subtract + + movq %r8, %rdx + movq %r9, %rax + movl 128+8(%rsp), $times + movq $out, $inp + + decl $times + jnz .Loop_sqr +___ +if ($addx) { +$code.=<<___; + jmp .Lsqr_tail + +.align 32 +.Loop_sqrx: + movl $times,128+8(%rsp) + movq $out, %xmm0 # off-load + movq %rbp, %xmm1 # off-load +#first iteration + mulx %rax, %r8, %r9 + + mulx 16($inp), %rcx, %r10 + xor %rbp, %rbp # cf=0, of=0 + + mulx 24($inp), %rax, %r11 + adcx %rcx, %r9 + + mulx 32($inp), %rcx, %r12 + adcx %rax, %r10 + + mulx 40($inp), %rax, %r13 + adcx %rcx, %r11 + + .byte 0xc4,0x62,0xf3,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($inp), %rcx, %r14 + adcx %rax, %r12 + adcx %rcx, %r13 + + .byte 0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00 # mulx 56($inp), %rax, %r15 + adcx %rax, %r14 + adcx %rbp, %r15 # %rbp is 0 + + mov %r9, %rcx + shld \$1, %r8, %r9 + shl \$1, %r8 + + xor %ebp, %ebp + mulx %rdx, %rax, %rdx + adcx %rdx, %r8 + mov 8($inp), %rdx + adcx %rbp, %r9 + + mov %rax, (%rsp) + mov %r8, 8(%rsp) + +#second iteration + mulx 16($inp), %rax, %rbx + adox %rax, %r10 + adcx %rbx, %r11 + + .byte 0xc4,0x62,0xc3,0xf6,0x86,0x18,0x00,0x00,0x00 # mulx 24($inp), $out, %r8 + adox $out, %r11 + adcx %r8, %r12 + + mulx 32($inp), %rax, %rbx + adox %rax, %r12 + adcx %rbx, %r13 + + mulx 40($inp), $out, %r8 + adox $out, %r13 + adcx %r8, %r14 + + .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rbx + adox %rax, %r14 + adcx %rbx, %r15 + + .byte 0xc4,0x62,0xc3,0xf6,0x86,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r8 + adox $out, %r15 + adcx %rbp, %r8 + adox %rbp, %r8 + + mov %r11, %rbx + shld \$1, %r10, %r11 + shld \$1, %rcx, %r10 + + xor %ebp,%ebp + mulx %rdx, %rax, %rcx + mov 16($inp), %rdx + adcx %rax, %r9 + adcx %rcx, %r10 + adcx %rbp, %r11 + + mov %r9, 16(%rsp) + .byte 0x4c,0x89,0x94,0x24,0x18,0x00,0x00,0x00 # mov %r10, 24(%rsp) + +#third iteration + .byte 0xc4,0x62,0xc3,0xf6,0x8e,0x18,0x00,0x00,0x00 # mulx 24($inp), $out, %r9 + adox $out, %r12 + adcx %r9, %r13 + + mulx 32($inp), %rax, %rcx + adox %rax, %r13 + adcx %rcx, %r14 + + mulx 40($inp), $out, %r9 + adox $out, %r14 + adcx %r9, %r15 + + .byte 0xc4,0xe2,0xfb,0xf6,0x8e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rcx + adox %rax, %r15 + adcx %rcx, %r8 + + .byte 0xc4,0x62,0xc3,0xf6,0x8e,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r9 + adox $out, %r8 + adcx %rbp, %r9 + adox %rbp, %r9 + + mov %r13, %rcx + shld \$1, %r12, %r13 + shld \$1, %rbx, %r12 + + xor %ebp, %ebp + mulx %rdx, %rax, %rdx + adcx %rax, %r11 + adcx %rdx, %r12 + mov 24($inp), %rdx + adcx %rbp, %r13 + + mov %r11, 32(%rsp) + .byte 0x4c,0x89,0xa4,0x24,0x28,0x00,0x00,0x00 # mov %r12, 40(%rsp) + +#fourth iteration + .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x20,0x00,0x00,0x00 # mulx 32($inp), %rax, %rbx + adox %rax, %r14 + adcx %rbx, %r15 + + mulx 40($inp), $out, %r10 + adox $out, %r15 + adcx %r10, %r8 + + mulx 48($inp), %rax, %rbx + adox %rax, %r8 + adcx %rbx, %r9 + + mulx 56($inp), $out, %r10 + adox $out, %r9 + adcx %rbp, %r10 + adox %rbp, %r10 + + .byte 0x66 + mov %r15, %rbx + shld \$1, %r14, %r15 + shld \$1, %rcx, %r14 + + xor %ebp, %ebp + mulx %rdx, %rax, %rdx + adcx %rax, %r13 + adcx %rdx, %r14 + mov 32($inp), %rdx + adcx %rbp, %r15 + + mov %r13, 48(%rsp) + mov %r14, 56(%rsp) + +#fifth iteration + .byte 0xc4,0x62,0xc3,0xf6,0x9e,0x28,0x00,0x00,0x00 # mulx 40($inp), $out, %r11 + adox $out, %r8 + adcx %r11, %r9 + + mulx 48($inp), %rax, %rcx + adox %rax, %r9 + adcx %rcx, %r10 + + mulx 56($inp), $out, %r11 + adox $out, %r10 + adcx %rbp, %r11 + adox %rbp, %r11 + + mov %r9, %rcx + shld \$1, %r8, %r9 + shld \$1, %rbx, %r8 + + xor %ebp, %ebp + mulx %rdx, %rax, %rdx + adcx %rax, %r15 + adcx %rdx, %r8 + mov 40($inp), %rdx + adcx %rbp, %r9 + + mov %r15, 64(%rsp) + mov %r8, 72(%rsp) + +#sixth iteration + .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rbx + adox %rax, %r10 + adcx %rbx, %r11 + + .byte 0xc4,0x62,0xc3,0xf6,0xa6,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r12 + adox $out, %r11 + adcx %rbp, %r12 + adox %rbp, %r12 + + mov %r11, %rbx + shld \$1, %r10, %r11 + shld \$1, %rcx, %r10 + + xor %ebp, %ebp + mulx %rdx, %rax, %rdx + adcx %rax, %r9 + adcx %rdx, %r10 + mov 48($inp), %rdx + adcx %rbp, %r11 + + mov %r9, 80(%rsp) + mov %r10, 88(%rsp) + +#seventh iteration + .byte 0xc4,0x62,0xfb,0xf6,0xae,0x38,0x00,0x00,0x00 # mulx 56($inp), %rax, %r13 + adox %rax, %r12 + adox %rbp, %r13 + + xor %r14, %r14 + shld \$1, %r13, %r14 + shld \$1, %r12, %r13 + shld \$1, %rbx, %r12 + + xor %ebp, %ebp + mulx %rdx, %rax, %rdx + adcx %rax, %r11 + adcx %rdx, %r12 + mov 56($inp), %rdx + adcx %rbp, %r13 + + .byte 0x4c,0x89,0x9c,0x24,0x60,0x00,0x00,0x00 # mov %r11, 96(%rsp) + .byte 0x4c,0x89,0xa4,0x24,0x68,0x00,0x00,0x00 # mov %r12, 104(%rsp) + +#eighth iteration + mulx %rdx, %rax, %rdx + adox %rax, %r13 + adox %rbp, %rdx + + .byte 0x66 + add %rdx, %r14 + + movq %r13, 112(%rsp) + movq %r14, 120(%rsp) + movq %xmm0, $out + movq %xmm1, %rbp + + movq 128(%rsp), %rdx # pull $n0 + movq (%rsp), %r8 + movq 8(%rsp), %r9 + movq 16(%rsp), %r10 + movq 24(%rsp), %r11 + movq 32(%rsp), %r12 + movq 40(%rsp), %r13 + movq 48(%rsp), %r14 + movq 56(%rsp), %r15 + + call __rsaz_512_reducex + + addq 64(%rsp), %r8 + adcq 72(%rsp), %r9 + adcq 80(%rsp), %r10 + adcq 88(%rsp), %r11 + adcq 96(%rsp), %r12 + adcq 104(%rsp), %r13 + adcq 112(%rsp), %r14 + adcq 120(%rsp), %r15 + sbbq %rcx, %rcx + + call __rsaz_512_subtract + + movq %r8, %rdx + movq %r9, %rax + movl 128+8(%rsp), $times + movq $out, $inp + + decl $times + jnz .Loop_sqrx + +.Lsqr_tail: +___ +} +$code.=<<___; + + leaq 128+24+48(%rsp), %rax + movq -48(%rax), %r15 + movq -40(%rax), %r14 + movq -32(%rax), %r13 + movq -24(%rax), %r12 + movq -16(%rax), %rbp + movq -8(%rax), %rbx + leaq (%rax), %rsp +.Lsqr_epilogue: + ret +.size rsaz_512_sqr,.-rsaz_512_sqr +___ +} +{ +my ($out,$ap,$bp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8"); +$code.=<<___; +.globl rsaz_512_mul +.type rsaz_512_mul,\@function,5 +.align 32 +rsaz_512_mul: + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + + subq \$128+24, %rsp +.Lmul_body: + movq $out, %xmm0 # off-load arguments + movq $mod, %xmm1 + movq $n0, 128(%rsp) +___ +$code.=<<___ if ($addx); + movl \$0x80100,%r11d + andl OPENSSL_ia32cap_P+8(%rip),%r11d + cmpl \$0x80100,%r11d # check for MULX and ADO/CX + je .Lmulx +___ +$code.=<<___; + movq ($bp), %rbx # pass b[0] + movq $bp, %rbp # pass argument + call __rsaz_512_mul + + movq %xmm0, $out + movq %xmm1, %rbp + + movq (%rsp), %r8 + movq 8(%rsp), %r9 + movq 16(%rsp), %r10 + movq 24(%rsp), %r11 + movq 32(%rsp), %r12 + movq 40(%rsp), %r13 + movq 48(%rsp), %r14 + movq 56(%rsp), %r15 + + call __rsaz_512_reduce +___ +$code.=<<___ if ($addx); + jmp .Lmul_tail + +.align 32 +.Lmulx: + movq $bp, %rbp # pass argument + movq ($bp), %rdx # pass b[0] + call __rsaz_512_mulx + + movq %xmm0, $out + movq %xmm1, %rbp + + movq 128(%rsp), %rdx # pull $n0 + movq (%rsp), %r8 + movq 8(%rsp), %r9 + movq 16(%rsp), %r10 + movq 24(%rsp), %r11 + movq 32(%rsp), %r12 + movq 40(%rsp), %r13 + movq 48(%rsp), %r14 + movq 56(%rsp), %r15 + + call __rsaz_512_reducex +.Lmul_tail: +___ +$code.=<<___; + addq 64(%rsp), %r8 + adcq 72(%rsp), %r9 + adcq 80(%rsp), %r10 + adcq 88(%rsp), %r11 + adcq 96(%rsp), %r12 + adcq 104(%rsp), %r13 + adcq 112(%rsp), %r14 + adcq 120(%rsp), %r15 + sbbq %rcx, %rcx + + call __rsaz_512_subtract + + leaq 128+24+48(%rsp), %rax + movq -48(%rax), %r15 + movq -40(%rax), %r14 + movq -32(%rax), %r13 + movq -24(%rax), %r12 + movq -16(%rax), %rbp + movq -8(%rax), %rbx + leaq (%rax), %rsp +.Lmul_epilogue: + ret +.size rsaz_512_mul,.-rsaz_512_mul +___ +} +{ +my ($out,$ap,$bp,$mod,$n0,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d"); +$code.=<<___; +.globl rsaz_512_mul_gather4 +.type rsaz_512_mul_gather4,\@function,6 +.align 32 +rsaz_512_mul_gather4: + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + + mov $pwr, $pwr + subq \$128+24, %rsp +.Lmul_gather4_body: +___ +$code.=<<___ if ($addx); + movl \$0x80100,%r11d + andl OPENSSL_ia32cap_P+8(%rip),%r11d + cmpl \$0x80100,%r11d # check for MULX and ADO/CX + je .Lmulx_gather +___ +$code.=<<___; + movl 64($bp,$pwr,4), %eax + movq $out, %xmm0 # off-load arguments + movl ($bp,$pwr,4), %ebx + movq $mod, %xmm1 + movq $n0, 128(%rsp) + + shlq \$32, %rax + or %rax, %rbx + movq ($ap), %rax + movq 8($ap), %rcx + leaq 128($bp,$pwr,4), %rbp + mulq %rbx # 0 iteration + movq %rax, (%rsp) + movq %rcx, %rax + movq %rdx, %r8 + + mulq %rbx + movd (%rbp), %xmm4 + addq %rax, %r8 + movq 16($ap), %rax + movq %rdx, %r9 + adcq \$0, %r9 + + mulq %rbx + movd 64(%rbp), %xmm5 + addq %rax, %r9 + movq 24($ap), %rax + movq %rdx, %r10 + adcq \$0, %r10 + + mulq %rbx + pslldq \$4, %xmm5 + addq %rax, %r10 + movq 32($ap), %rax + movq %rdx, %r11 + adcq \$0, %r11 + + mulq %rbx + por %xmm5, %xmm4 + addq %rax, %r11 + movq 40($ap), %rax + movq %rdx, %r12 + adcq \$0, %r12 + + mulq %rbx + addq %rax, %r12 + movq 48($ap), %rax + movq %rdx, %r13 + adcq \$0, %r13 + + mulq %rbx + leaq 128(%rbp), %rbp + addq %rax, %r13 + movq 56($ap), %rax + movq %rdx, %r14 + adcq \$0, %r14 + + mulq %rbx + movq %xmm4, %rbx + addq %rax, %r14 + movq ($ap), %rax + movq %rdx, %r15 + adcq \$0, %r15 + + leaq 8(%rsp), %rdi + movl \$7, %ecx + jmp .Loop_mul_gather + +.align 32 +.Loop_mul_gather: + mulq %rbx + addq %rax, %r8 + movq 8($ap), %rax + movq %r8, (%rdi) + movq %rdx, %r8 + adcq \$0, %r8 + + mulq %rbx + movd (%rbp), %xmm4 + addq %rax, %r9 + movq 16($ap), %rax + adcq \$0, %rdx + addq %r9, %r8 + movq %rdx, %r9 + adcq \$0, %r9 + + mulq %rbx + movd 64(%rbp), %xmm5 + addq %rax, %r10 + movq 24($ap), %rax + adcq \$0, %rdx + addq %r10, %r9 + movq %rdx, %r10 + adcq \$0, %r10 + + mulq %rbx + pslldq \$4, %xmm5 + addq %rax, %r11 + movq 32($ap), %rax + adcq \$0, %rdx + addq %r11, %r10 + movq %rdx, %r11 + adcq \$0, %r11 + + mulq %rbx + por %xmm5, %xmm4 + addq %rax, %r12 + movq 40($ap), %rax + adcq \$0, %rdx + addq %r12, %r11 + movq %rdx, %r12 + adcq \$0, %r12 + + mulq %rbx + addq %rax, %r13 + movq 48($ap), %rax + adcq \$0, %rdx + addq %r13, %r12 + movq %rdx, %r13 + adcq \$0, %r13 + + mulq %rbx + addq %rax, %r14 + movq 56($ap), %rax + adcq \$0, %rdx + addq %r14, %r13 + movq %rdx, %r14 + adcq \$0, %r14 + + mulq %rbx + movq %xmm4, %rbx + addq %rax, %r15 + movq ($ap), %rax + adcq \$0, %rdx + addq %r15, %r14 + movq %rdx, %r15 + adcq \$0, %r15 + + leaq 128(%rbp), %rbp + leaq 8(%rdi), %rdi + + decl %ecx + jnz .Loop_mul_gather + + movq %r8, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + movq %r12, 32(%rdi) + movq %r13, 40(%rdi) + movq %r14, 48(%rdi) + movq %r15, 56(%rdi) + + movq %xmm0, $out + movq %xmm1, %rbp + + movq (%rsp), %r8 + movq 8(%rsp), %r9 + movq 16(%rsp), %r10 + movq 24(%rsp), %r11 + movq 32(%rsp), %r12 + movq 40(%rsp), %r13 + movq 48(%rsp), %r14 + movq 56(%rsp), %r15 + + call __rsaz_512_reduce +___ +$code.=<<___ if ($addx); + jmp .Lmul_gather_tail + +.align 32 +.Lmulx_gather: + mov 64($bp,$pwr,4), %eax + movq $out, %xmm0 # off-load arguments + lea 128($bp,$pwr,4), %rbp + mov ($bp,$pwr,4), %edx + movq $mod, %xmm1 + mov $n0, 128(%rsp) + + shl \$32, %rax + or %rax, %rdx + mulx ($ap), %rbx, %r8 # 0 iteration + mov %rbx, (%rsp) + xor %edi, %edi # cf=0, of=0 + + mulx 8($ap), %rax, %r9 + movd (%rbp), %xmm4 + + mulx 16($ap), %rbx, %r10 + movd 64(%rbp), %xmm5 + adcx %rax, %r8 + + mulx 24($ap), %rax, %r11 + pslldq \$4, %xmm5 + adcx %rbx, %r9 + + mulx 32($ap), %rbx, %r12 + por %xmm5, %xmm4 + adcx %rax, %r10 + + mulx 40($ap), %rax, %r13 + adcx %rbx, %r11 + + mulx 48($ap), %rbx, %r14 + lea 128(%rbp), %rbp + adcx %rax, %r12 + + mulx 56($ap), %rax, %r15 + movq %xmm4, %rdx + adcx %rbx, %r13 + adcx %rax, %r14 + mov %r8, %rbx + adcx %rdi, %r15 # %rdi is 0 + + mov \$-7, %rcx + jmp .Loop_mulx_gather + +.align 32 +.Loop_mulx_gather: + mulx ($ap), %rax, %r8 + adcx %rax, %rbx + adox %r9, %r8 + + mulx 8($ap), %rax, %r9 + .byte 0x66,0x0f,0x6e,0xa5,0x00,0x00,0x00,0x00 # movd (%rbp), %xmm4 + adcx %rax, %r8 + adox %r10, %r9 + + mulx 16($ap), %rax, %r10 + movd 64(%rbp), %xmm5 + lea 128(%rbp), %rbp + adcx %rax, %r9 + adox %r11, %r10 + + .byte 0xc4,0x62,0xfb,0xf6,0x9e,0x18,0x00,0x00,0x00 # mulx 24($ap), %rax, %r11 + pslldq \$4, %xmm5 + por %xmm5, %xmm4 + adcx %rax, %r10 + adox %r12, %r11 + + mulx 32($ap), %rax, %r12 + adcx %rax, %r11 + adox %r13, %r12 + + mulx 40($ap), %rax, %r13 + adcx %rax, %r12 + adox %r14, %r13 + + .byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($ap), %rax, %r14 + adcx %rax, %r13 + adox %r15, %r14 + + mulx 56($ap), %rax, %r15 + movq %xmm4, %rdx + mov %rbx, 64(%rsp,%rcx,8) + adcx %rax, %r14 + adox %rdi, %r15 + mov %r8, %rbx + adcx %rdi, %r15 # cf=0 + + inc %rcx # of=0 + jnz .Loop_mulx_gather + + mov %r8, 64(%rsp) + mov %r9, 64+8(%rsp) + mov %r10, 64+16(%rsp) + mov %r11, 64+24(%rsp) + mov %r12, 64+32(%rsp) + mov %r13, 64+40(%rsp) + mov %r14, 64+48(%rsp) + mov %r15, 64+56(%rsp) + + movq %xmm0, $out + movq %xmm1, %rbp + + mov 128(%rsp), %rdx # pull $n0 + mov (%rsp), %r8 + mov 8(%rsp), %r9 + mov 16(%rsp), %r10 + mov 24(%rsp), %r11 + mov 32(%rsp), %r12 + mov 40(%rsp), %r13 + mov 48(%rsp), %r14 + mov 56(%rsp), %r15 + + call __rsaz_512_reducex + +.Lmul_gather_tail: +___ +$code.=<<___; + addq 64(%rsp), %r8 + adcq 72(%rsp), %r9 + adcq 80(%rsp), %r10 + adcq 88(%rsp), %r11 + adcq 96(%rsp), %r12 + adcq 104(%rsp), %r13 + adcq 112(%rsp), %r14 + adcq 120(%rsp), %r15 + sbbq %rcx, %rcx + + call __rsaz_512_subtract + + leaq 128+24+48(%rsp), %rax + movq -48(%rax), %r15 + movq -40(%rax), %r14 + movq -32(%rax), %r13 + movq -24(%rax), %r12 + movq -16(%rax), %rbp + movq -8(%rax), %rbx + leaq (%rax), %rsp +.Lmul_gather4_epilogue: + ret +.size rsaz_512_mul_gather4,.-rsaz_512_mul_gather4 +___ +} +{ +my ($out,$ap,$mod,$n0,$tbl,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d"); +$code.=<<___; +.globl rsaz_512_mul_scatter4 +.type rsaz_512_mul_scatter4,\@function,6 +.align 32 +rsaz_512_mul_scatter4: + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + + mov $pwr, $pwr + subq \$128+24, %rsp +.Lmul_scatter4_body: + leaq ($tbl,$pwr,4), $tbl + movq $out, %xmm0 # off-load arguments + movq $mod, %xmm1 + movq $tbl, %xmm2 + movq $n0, 128(%rsp) + + movq $out, %rbp +___ +$code.=<<___ if ($addx); + movl \$0x80100,%r11d + andl OPENSSL_ia32cap_P+8(%rip),%r11d + cmpl \$0x80100,%r11d # check for MULX and ADO/CX + je .Lmulx_scatter +___ +$code.=<<___; + movq ($out),%rbx # pass b[0] + call __rsaz_512_mul + + movq %xmm0, $out + movq %xmm1, %rbp + + movq (%rsp), %r8 + movq 8(%rsp), %r9 + movq 16(%rsp), %r10 + movq 24(%rsp), %r11 + movq 32(%rsp), %r12 + movq 40(%rsp), %r13 + movq 48(%rsp), %r14 + movq 56(%rsp), %r15 + + call __rsaz_512_reduce +___ +$code.=<<___ if ($addx); + jmp .Lmul_scatter_tail + +.align 32 +.Lmulx_scatter: + movq ($out), %rdx # pass b[0] + call __rsaz_512_mulx + + movq %xmm0, $out + movq %xmm1, %rbp + + movq 128(%rsp), %rdx # pull $n0 + movq (%rsp), %r8 + movq 8(%rsp), %r9 + movq 16(%rsp), %r10 + movq 24(%rsp), %r11 + movq 32(%rsp), %r12 + movq 40(%rsp), %r13 + movq 48(%rsp), %r14 + movq 56(%rsp), %r15 + + call __rsaz_512_reducex + +.Lmul_scatter_tail: +___ +$code.=<<___; + addq 64(%rsp), %r8 + adcq 72(%rsp), %r9 + adcq 80(%rsp), %r10 + adcq 88(%rsp), %r11 + adcq 96(%rsp), %r12 + adcq 104(%rsp), %r13 + adcq 112(%rsp), %r14 + adcq 120(%rsp), %r15 + movq %xmm2, $inp + sbbq %rcx, %rcx + + call __rsaz_512_subtract + + movl %r8d, 64*0($inp) # scatter + shrq \$32, %r8 + movl %r9d, 64*2($inp) + shrq \$32, %r9 + movl %r10d, 64*4($inp) + shrq \$32, %r10 + movl %r11d, 64*6($inp) + shrq \$32, %r11 + movl %r12d, 64*8($inp) + shrq \$32, %r12 + movl %r13d, 64*10($inp) + shrq \$32, %r13 + movl %r14d, 64*12($inp) + shrq \$32, %r14 + movl %r15d, 64*14($inp) + shrq \$32, %r15 + movl %r8d, 64*1($inp) + movl %r9d, 64*3($inp) + movl %r10d, 64*5($inp) + movl %r11d, 64*7($inp) + movl %r12d, 64*9($inp) + movl %r13d, 64*11($inp) + movl %r14d, 64*13($inp) + movl %r15d, 64*15($inp) + + leaq 128+24+48(%rsp), %rax + movq -48(%rax), %r15 + movq -40(%rax), %r14 + movq -32(%rax), %r13 + movq -24(%rax), %r12 + movq -16(%rax), %rbp + movq -8(%rax), %rbx + leaq (%rax), %rsp +.Lmul_scatter4_epilogue: + ret +.size rsaz_512_mul_scatter4,.-rsaz_512_mul_scatter4 +___ +} +{ +my ($out,$inp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx"); +$code.=<<___; +.globl rsaz_512_mul_by_one +.type rsaz_512_mul_by_one,\@function,4 +.align 32 +rsaz_512_mul_by_one: + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + + subq \$128+24, %rsp +.Lmul_by_one_body: +___ +$code.=<<___ if ($addx); + movl OPENSSL_ia32cap_P+8(%rip),%eax +___ +$code.=<<___; + movq $mod, %rbp # reassign argument + movq $n0, 128(%rsp) + + movq ($inp), %r8 + pxor %xmm0, %xmm0 + movq 8($inp), %r9 + movq 16($inp), %r10 + movq 24($inp), %r11 + movq 32($inp), %r12 + movq 40($inp), %r13 + movq 48($inp), %r14 + movq 56($inp), %r15 + + movdqa %xmm0, (%rsp) + movdqa %xmm0, 16(%rsp) + movdqa %xmm0, 32(%rsp) + movdqa %xmm0, 48(%rsp) + movdqa %xmm0, 64(%rsp) + movdqa %xmm0, 80(%rsp) + movdqa %xmm0, 96(%rsp) +___ +$code.=<<___ if ($addx); + andl \$0x80100,%eax + cmpl \$0x80100,%eax # check for MULX and ADO/CX + je .Lby_one_callx +___ +$code.=<<___; + call __rsaz_512_reduce +___ +$code.=<<___ if ($addx); + jmp .Lby_one_tail +.align 32 +.Lby_one_callx: + movq 128(%rsp), %rdx # pull $n0 + call __rsaz_512_reducex +.Lby_one_tail: +___ +$code.=<<___; + movq %r8, ($out) + movq %r9, 8($out) + movq %r10, 16($out) + movq %r11, 24($out) + movq %r12, 32($out) + movq %r13, 40($out) + movq %r14, 48($out) + movq %r15, 56($out) + + leaq 128+24+48(%rsp), %rax + movq -48(%rax), %r15 + movq -40(%rax), %r14 + movq -32(%rax), %r13 + movq -24(%rax), %r12 + movq -16(%rax), %rbp + movq -8(%rax), %rbx + leaq (%rax), %rsp +.Lmul_by_one_epilogue: + ret +.size rsaz_512_mul_by_one,.-rsaz_512_mul_by_one +___ +} +{ # __rsaz_512_reduce + # + # input: %r8-%r15, %rbp - mod, 128(%rsp) - n0 + # output: %r8-%r15 + # clobbers: everything except %rbp and %rdi +$code.=<<___; +.type __rsaz_512_reduce,\@abi-omnipotent +.align 32 +__rsaz_512_reduce: + movq %r8, %rbx + imulq 128+8(%rsp), %rbx + movq 0(%rbp), %rax + movl \$8, %ecx + jmp .Lreduction_loop + +.align 32 +.Lreduction_loop: + mulq %rbx + movq 8(%rbp), %rax + negq %r8 + movq %rdx, %r8 + adcq \$0, %r8 + + mulq %rbx + addq %rax, %r9 + movq 16(%rbp), %rax + adcq \$0, %rdx + addq %r9, %r8 + movq %rdx, %r9 + adcq \$0, %r9 + + mulq %rbx + addq %rax, %r10 + movq 24(%rbp), %rax + adcq \$0, %rdx + addq %r10, %r9 + movq %rdx, %r10 + adcq \$0, %r10 + + mulq %rbx + addq %rax, %r11 + movq 32(%rbp), %rax + adcq \$0, %rdx + addq %r11, %r10 + movq 128+8(%rsp), %rsi + #movq %rdx, %r11 + #adcq \$0, %r11 + adcq \$0, %rdx + movq %rdx, %r11 + + mulq %rbx + addq %rax, %r12 + movq 40(%rbp), %rax + adcq \$0, %rdx + imulq %r8, %rsi + addq %r12, %r11 + movq %rdx, %r12 + adcq \$0, %r12 + + mulq %rbx + addq %rax, %r13 + movq 48(%rbp), %rax + adcq \$0, %rdx + addq %r13, %r12 + movq %rdx, %r13 + adcq \$0, %r13 + + mulq %rbx + addq %rax, %r14 + movq 56(%rbp), %rax + adcq \$0, %rdx + addq %r14, %r13 + movq %rdx, %r14 + adcq \$0, %r14 + + mulq %rbx + movq %rsi, %rbx + addq %rax, %r15 + movq 0(%rbp), %rax + adcq \$0, %rdx + addq %r15, %r14 + movq %rdx, %r15 + adcq \$0, %r15 + + decl %ecx + jne .Lreduction_loop + + ret +.size __rsaz_512_reduce,.-__rsaz_512_reduce +___ +} +if ($addx) { + # __rsaz_512_reducex + # + # input: %r8-%r15, %rbp - mod, 128(%rsp) - n0 + # output: %r8-%r15 + # clobbers: everything except %rbp and %rdi +$code.=<<___; +.type __rsaz_512_reducex,\@abi-omnipotent +.align 32 +__rsaz_512_reducex: + #movq 128+8(%rsp), %rdx # pull $n0 + imulq %r8, %rdx + xorq %rsi, %rsi # cf=0,of=0 + movl \$8, %ecx + jmp .Lreduction_loopx + +.align 32 +.Lreduction_loopx: + mov %r8, %rbx + mulx 0(%rbp), %rax, %r8 + adcx %rbx, %rax + adox %r9, %r8 + + mulx 8(%rbp), %rax, %r9 + adcx %rax, %r8 + adox %r10, %r9 + + mulx 16(%rbp), %rbx, %r10 + adcx %rbx, %r9 + adox %r11, %r10 + + mulx 24(%rbp), %rbx, %r11 + adcx %rbx, %r10 + adox %r12, %r11 + + .byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 32(%rbp), %rbx, %r12 + mov %rdx, %rax + mov %r8, %rdx + adcx %rbx, %r11 + adox %r13, %r12 + + mulx 128+8(%rsp), %rbx, %rdx + mov %rax, %rdx + + mulx 40(%rbp), %rax, %r13 + adcx %rax, %r12 + adox %r14, %r13 + + .byte 0xc4,0x62,0xfb,0xf6,0xb5,0x30,0x00,0x00,0x00 # mulx 48(%rbp), %rax, %r14 + adcx %rax, %r13 + adox %r15, %r14 + + mulx 56(%rbp), %rax, %r15 + mov %rbx, %rdx + adcx %rax, %r14 + adox %rsi, %r15 # %rsi is 0 + adcx %rsi, %r15 # cf=0 + + decl %ecx # of=0 + jne .Lreduction_loopx + + ret +.size __rsaz_512_reducex,.-__rsaz_512_reducex +___ +} +{ # __rsaz_512_subtract + # input: %r8-%r15, %rdi - $out, %rbp - $mod, %rcx - mask + # output: + # clobbers: everything but %rdi, %rsi and %rbp +$code.=<<___; +.type __rsaz_512_subtract,\@abi-omnipotent +.align 32 +__rsaz_512_subtract: + movq %r8, ($out) + movq %r9, 8($out) + movq %r10, 16($out) + movq %r11, 24($out) + movq %r12, 32($out) + movq %r13, 40($out) + movq %r14, 48($out) + movq %r15, 56($out) + + movq 0($mod), %r8 + movq 8($mod), %r9 + negq %r8 + notq %r9 + andq %rcx, %r8 + movq 16($mod), %r10 + andq %rcx, %r9 + notq %r10 + movq 24($mod), %r11 + andq %rcx, %r10 + notq %r11 + movq 32($mod), %r12 + andq %rcx, %r11 + notq %r12 + movq 40($mod), %r13 + andq %rcx, %r12 + notq %r13 + movq 48($mod), %r14 + andq %rcx, %r13 + notq %r14 + movq 56($mod), %r15 + andq %rcx, %r14 + notq %r15 + andq %rcx, %r15 + + addq ($out), %r8 + adcq 8($out), %r9 + adcq 16($out), %r10 + adcq 24($out), %r11 + adcq 32($out), %r12 + adcq 40($out), %r13 + adcq 48($out), %r14 + adcq 56($out), %r15 + + movq %r8, ($out) + movq %r9, 8($out) + movq %r10, 16($out) + movq %r11, 24($out) + movq %r12, 32($out) + movq %r13, 40($out) + movq %r14, 48($out) + movq %r15, 56($out) + + ret +.size __rsaz_512_subtract,.-__rsaz_512_subtract +___ +} +{ # __rsaz_512_mul + # + # input: %rsi - ap, %rbp - bp + # ouput: + # clobbers: everything +my ($ap,$bp) = ("%rsi","%rbp"); +$code.=<<___; +.type __rsaz_512_mul,\@abi-omnipotent +.align 32 +__rsaz_512_mul: + leaq 8(%rsp), %rdi + + movq ($ap), %rax + mulq %rbx + movq %rax, (%rdi) + movq 8($ap), %rax + movq %rdx, %r8 + + mulq %rbx + addq %rax, %r8 + movq 16($ap), %rax + movq %rdx, %r9 + adcq \$0, %r9 + + mulq %rbx + addq %rax, %r9 + movq 24($ap), %rax + movq %rdx, %r10 + adcq \$0, %r10 + + mulq %rbx + addq %rax, %r10 + movq 32($ap), %rax + movq %rdx, %r11 + adcq \$0, %r11 + + mulq %rbx + addq %rax, %r11 + movq 40($ap), %rax + movq %rdx, %r12 + adcq \$0, %r12 + + mulq %rbx + addq %rax, %r12 + movq 48($ap), %rax + movq %rdx, %r13 + adcq \$0, %r13 + + mulq %rbx + addq %rax, %r13 + movq 56($ap), %rax + movq %rdx, %r14 + adcq \$0, %r14 + + mulq %rbx + addq %rax, %r14 + movq ($ap), %rax + movq %rdx, %r15 + adcq \$0, %r15 + + leaq 8($bp), $bp + leaq 8(%rdi), %rdi + + movl \$7, %ecx + jmp .Loop_mul + +.align 32 +.Loop_mul: + movq ($bp), %rbx + mulq %rbx + addq %rax, %r8 + movq 8($ap), %rax + movq %r8, (%rdi) + movq %rdx, %r8 + adcq \$0, %r8 + + mulq %rbx + addq %rax, %r9 + movq 16($ap), %rax + adcq \$0, %rdx + addq %r9, %r8 + movq %rdx, %r9 + adcq \$0, %r9 + + mulq %rbx + addq %rax, %r10 + movq 24($ap), %rax + adcq \$0, %rdx + addq %r10, %r9 + movq %rdx, %r10 + adcq \$0, %r10 + + mulq %rbx + addq %rax, %r11 + movq 32($ap), %rax + adcq \$0, %rdx + addq %r11, %r10 + movq %rdx, %r11 + adcq \$0, %r11 + + mulq %rbx + addq %rax, %r12 + movq 40($ap), %rax + adcq \$0, %rdx + addq %r12, %r11 + movq %rdx, %r12 + adcq \$0, %r12 + + mulq %rbx + addq %rax, %r13 + movq 48($ap), %rax + adcq \$0, %rdx + addq %r13, %r12 + movq %rdx, %r13 + adcq \$0, %r13 + + mulq %rbx + addq %rax, %r14 + movq 56($ap), %rax + adcq \$0, %rdx + addq %r14, %r13 + movq %rdx, %r14 + leaq 8($bp), $bp + adcq \$0, %r14 + + mulq %rbx + addq %rax, %r15 + movq ($ap), %rax + adcq \$0, %rdx + addq %r15, %r14 + movq %rdx, %r15 + adcq \$0, %r15 + + leaq 8(%rdi), %rdi + + decl %ecx + jnz .Loop_mul + + movq %r8, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + movq %r12, 32(%rdi) + movq %r13, 40(%rdi) + movq %r14, 48(%rdi) + movq %r15, 56(%rdi) + + ret +.size __rsaz_512_mul,.-__rsaz_512_mul +___ +} +if ($addx) { + # __rsaz_512_mulx + # + # input: %rsi - ap, %rbp - bp + # ouput: + # clobbers: everything +my ($ap,$bp,$zero) = ("%rsi","%rbp","%rdi"); +$code.=<<___; +.type __rsaz_512_mulx,\@abi-omnipotent +.align 32 +__rsaz_512_mulx: + mulx ($ap), %rbx, %r8 # initial %rdx preloaded by caller + mov \$-6, %rcx + + mulx 8($ap), %rax, %r9 + movq %rbx, 8(%rsp) + + mulx 16($ap), %rbx, %r10 + adc %rax, %r8 + + mulx 24($ap), %rax, %r11 + adc %rbx, %r9 + + mulx 32($ap), %rbx, %r12 + adc %rax, %r10 + + mulx 40($ap), %rax, %r13 + adc %rbx, %r11 + + mulx 48($ap), %rbx, %r14 + adc %rax, %r12 + + mulx 56($ap), %rax, %r15 + mov 8($bp), %rdx + adc %rbx, %r13 + adc %rax, %r14 + adc \$0, %r15 + + xor $zero, $zero # cf=0,of=0 + jmp .Loop_mulx + +.align 32 +.Loop_mulx: + movq %r8, %rbx + mulx ($ap), %rax, %r8 + adcx %rax, %rbx + adox %r9, %r8 + + mulx 8($ap), %rax, %r9 + adcx %rax, %r8 + adox %r10, %r9 + + mulx 16($ap), %rax, %r10 + adcx %rax, %r9 + adox %r11, %r10 + + mulx 24($ap), %rax, %r11 + adcx %rax, %r10 + adox %r12, %r11 + + .byte 0x3e,0xc4,0x62,0xfb,0xf6,0xa6,0x20,0x00,0x00,0x00 # mulx 32($ap), %rax, %r12 + adcx %rax, %r11 + adox %r13, %r12 + + mulx 40($ap), %rax, %r13 + adcx %rax, %r12 + adox %r14, %r13 + + mulx 48($ap), %rax, %r14 + adcx %rax, %r13 + adox %r15, %r14 + + mulx 56($ap), %rax, %r15 + movq 64($bp,%rcx,8), %rdx + movq %rbx, 8+64-8(%rsp,%rcx,8) + adcx %rax, %r14 + adox $zero, %r15 + adcx $zero, %r15 # cf=0 + + inc %rcx # of=0 + jnz .Loop_mulx + + movq %r8, %rbx + mulx ($ap), %rax, %r8 + adcx %rax, %rbx + adox %r9, %r8 + + .byte 0xc4,0x62,0xfb,0xf6,0x8e,0x08,0x00,0x00,0x00 # mulx 8($ap), %rax, %r9 + adcx %rax, %r8 + adox %r10, %r9 + + .byte 0xc4,0x62,0xfb,0xf6,0x96,0x10,0x00,0x00,0x00 # mulx 16($ap), %rax, %r10 + adcx %rax, %r9 + adox %r11, %r10 + + mulx 24($ap), %rax, %r11 + adcx %rax, %r10 + adox %r12, %r11 + + mulx 32($ap), %rax, %r12 + adcx %rax, %r11 + adox %r13, %r12 + + mulx 40($ap), %rax, %r13 + adcx %rax, %r12 + adox %r14, %r13 + + .byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($ap), %rax, %r14 + adcx %rax, %r13 + adox %r15, %r14 + + .byte 0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00 # mulx 56($ap), %rax, %r15 + adcx %rax, %r14 + adox $zero, %r15 + adcx $zero, %r15 + + mov %rbx, 8+64-8(%rsp) + mov %r8, 8+64(%rsp) + mov %r9, 8+64+8(%rsp) + mov %r10, 8+64+16(%rsp) + mov %r11, 8+64+24(%rsp) + mov %r12, 8+64+32(%rsp) + mov %r13, 8+64+40(%rsp) + mov %r14, 8+64+48(%rsp) + mov %r15, 8+64+56(%rsp) + + ret +.size __rsaz_512_mulx,.-__rsaz_512_mulx +___ +} +{ +my ($out,$inp,$power)= $win64 ? ("%rcx","%rdx","%r8d") : ("%rdi","%rsi","%edx"); +$code.=<<___; +.globl rsaz_512_scatter4 +.type rsaz_512_scatter4,\@abi-omnipotent +.align 16 +rsaz_512_scatter4: + leaq ($out,$power,4), $out + movl \$8, %r9d + jmp .Loop_scatter +.align 16 +.Loop_scatter: + movq ($inp), %rax + leaq 8($inp), $inp + movl %eax, ($out) + shrq \$32, %rax + movl %eax, 64($out) + leaq 128($out), $out + decl %r9d + jnz .Loop_scatter + ret +.size rsaz_512_scatter4,.-rsaz_512_scatter4 + +.globl rsaz_512_gather4 +.type rsaz_512_gather4,\@abi-omnipotent +.align 16 +rsaz_512_gather4: + leaq ($inp,$power,4), $inp + movl \$8, %r9d + jmp .Loop_gather +.align 16 +.Loop_gather: + movl ($inp), %eax + movl 64($inp), %r8d + leaq 128($inp), $inp + shlq \$32, %r8 + or %r8, %rax + movq %rax, ($out) + leaq 8($out), $out + decl %r9d + jnz .Loop_gather + ret +.size rsaz_512_gather4,.-rsaz_512_gather4 +___ +} + +# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, +# CONTEXT *context,DISPATCHER_CONTEXT *disp) +if ($win64) { +$rec="%rcx"; +$frame="%rdx"; +$context="%r8"; +$disp="%r9"; + +$code.=<<___; +.extern __imp_RtlVirtualUnwind +.type se_handler,\@abi-omnipotent +.align 16 +se_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp + + mov 120($context),%rax # pull context->Rax + mov 248($context),%rbx # pull context->Rip + + mov 8($disp),%rsi # disp->ImageBase + mov 56($disp),%r11 # disp->HandlerData + + mov 0(%r11),%r10d # HandlerData[0] + lea (%rsi,%r10),%r10 # end of prologue label + cmp %r10,%rbx # context->Rip<end of prologue label + jb .Lcommon_seh_tail + + mov 152($context),%rax # pull context->Rsp + + mov 4(%r11),%r10d # HandlerData[1] + lea (%rsi,%r10),%r10 # epilogue label + cmp %r10,%rbx # context->Rip>=epilogue label + jae .Lcommon_seh_tail + + lea 128+24+48(%rax),%rax + + mov -8(%rax),%rbx + mov -16(%rax),%rbp + mov -24(%rax),%r12 + mov -32(%rax),%r13 + mov -40(%rax),%r14 + mov -48(%rax),%r15 + mov %rbx,144($context) # restore context->Rbx + mov %rbp,160($context) # restore context->Rbp + mov %r12,216($context) # restore context->R12 + mov %r13,224($context) # restore context->R13 + mov %r14,232($context) # restore context->R14 + mov %r15,240($context) # restore context->R15 + +.Lcommon_seh_tail: + mov 8(%rax),%rdi + mov 16(%rax),%rsi + mov %rax,152($context) # restore context->Rsp + mov %rsi,168($context) # restore context->Rsi + mov %rdi,176($context) # restore context->Rdi + + mov 40($disp),%rdi # disp->ContextRecord + mov $context,%rsi # context + mov \$154,%ecx # sizeof(CONTEXT) + .long 0xa548f3fc # cld; rep movsq + + mov $disp,%rsi + xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER + mov 8(%rsi),%rdx # arg2, disp->ImageBase + mov 0(%rsi),%r8 # arg3, disp->ControlPc + mov 16(%rsi),%r9 # arg4, disp->FunctionEntry + mov 40(%rsi),%r10 # disp->ContextRecord + lea 56(%rsi),%r11 # &disp->HandlerData + lea 24(%rsi),%r12 # &disp->EstablisherFrame + mov %r10,32(%rsp) # arg5 + mov %r11,40(%rsp) # arg6 + mov %r12,48(%rsp) # arg7 + mov %rcx,56(%rsp) # arg8, (NULL) + call *__imp_RtlVirtualUnwind(%rip) + + mov \$1,%eax # ExceptionContinueSearch + add \$64,%rsp + popfq + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + pop %rdi + pop %rsi + ret +.size sqr_handler,.-sqr_handler + +.section .pdata +.align 4 + .rva .LSEH_begin_rsaz_512_sqr + .rva .LSEH_end_rsaz_512_sqr + .rva .LSEH_info_rsaz_512_sqr + + .rva .LSEH_begin_rsaz_512_mul + .rva .LSEH_end_rsaz_512_mul + .rva .LSEH_info_rsaz_512_mul + + .rva .LSEH_begin_rsaz_512_mul_gather4 + .rva .LSEH_end_rsaz_512_mul_gather4 + .rva .LSEH_info_rsaz_512_mul_gather4 + + .rva .LSEH_begin_rsaz_512_mul_scatter4 + .rva .LSEH_end_rsaz_512_mul_scatter4 + .rva .LSEH_info_rsaz_512_mul_scatter4 + + .rva .LSEH_begin_rsaz_512_mul_by_one + .rva .LSEH_end_rsaz_512_mul_by_one + .rva .LSEH_info_rsaz_512_mul_by_one + +.section .xdata +.align 8 +.LSEH_info_rsaz_512_sqr: + .byte 9,0,0,0 + .rva se_handler + .rva .Lsqr_body,.Lsqr_epilogue # HandlerData[] +.LSEH_info_rsaz_512_mul: + .byte 9,0,0,0 + .rva se_handler + .rva .Lmul_body,.Lmul_epilogue # HandlerData[] +.LSEH_info_rsaz_512_mul_gather4: + .byte 9,0,0,0 + .rva se_handler + .rva .Lmul_gather4_body,.Lmul_gather4_epilogue # HandlerData[] +.LSEH_info_rsaz_512_mul_scatter4: + .byte 9,0,0,0 + .rva se_handler + .rva .Lmul_scatter4_body,.Lmul_scatter4_epilogue # HandlerData[] +.LSEH_info_rsaz_512_mul_by_one: + .byte 9,0,0,0 + .rva se_handler + .rva .Lmul_by_one_body,.Lmul_by_one_epilogue # HandlerData[] +___ +} + +$code =~ s/\`([^\`]*)\`/eval $1/gem; +print $code; +close STDOUT; diff --git a/crypto/bn/asm/sparct4-mont.pl b/crypto/bn/asm/sparct4-mont.pl new file mode 100755 index 0000000..71b4500 --- /dev/null +++ b/crypto/bn/asm/sparct4-mont.pl @@ -0,0 +1,1222 @@ +#!/usr/bin/env perl + +# ==================================================================== +# Written by David S. Miller <davem@devemloft.net> and Andy Polyakov +# <appro@openssl.org>. The module is licensed under 2-clause BSD +# license. November 2012. All rights reserved. +# ==================================================================== + +###################################################################### +# Montgomery squaring-n-multiplication module for SPARC T4. +# +# The module consists of three parts: +# +# 1) collection of "single-op" subroutines that perform single +# operation, Montgomery squaring or multiplication, on 512-, +# 1024-, 1536- and 2048-bit operands; +# 2) collection of "multi-op" subroutines that perform 5 squaring and +# 1 multiplication operations on operands of above lengths; +# 3) fall-back and helper VIS3 subroutines. +# +# RSA sign is dominated by multi-op subroutine, while RSA verify and +# DSA - by single-op. Special note about 4096-bit RSA verify result. +# Operands are too long for dedicated hardware and it's handled by +# VIS3 code, which is why you don't see any improvement. It's surely +# possible to improve it [by deploying 'mpmul' instruction], maybe in +# the future... +# +# Performance improvement. +# +# 64-bit process, VIS3: +# sign verify sign/s verify/s +# rsa 1024 bits 0.000628s 0.000028s 1592.4 35434.4 +# rsa 2048 bits 0.003282s 0.000106s 304.7 9438.3 +# rsa 4096 bits 0.025866s 0.000340s 38.7 2940.9 +# dsa 1024 bits 0.000301s 0.000332s 3323.7 3013.9 +# dsa 2048 bits 0.001056s 0.001233s 946.9 810.8 +# +# 64-bit process, this module: +# sign verify sign/s verify/s +# rsa 1024 bits 0.000256s 0.000016s 3904.4 61411.9 +# rsa 2048 bits 0.000946s 0.000029s 1056.8 34292.7 +# rsa 4096 bits 0.005061s 0.000340s 197.6 2940.5 +# dsa 1024 bits 0.000176s 0.000195s 5674.7 5130.5 +# dsa 2048 bits 0.000296s 0.000354s 3383.2 2827.6 +# +###################################################################### +# 32-bit process, VIS3: +# sign verify sign/s verify/s +# rsa 1024 bits 0.000665s 0.000028s 1504.8 35233.3 +# rsa 2048 bits 0.003349s 0.000106s 298.6 9433.4 +# rsa 4096 bits 0.025959s 0.000341s 38.5 2934.8 +# dsa 1024 bits 0.000320s 0.000341s 3123.3 2929.6 +# dsa 2048 bits 0.001101s 0.001260s 908.2 793.4 +# +# 32-bit process, this module: +# sign verify sign/s verify/s +# rsa 1024 bits 0.000301s 0.000017s 3317.1 60240.0 +# rsa 2048 bits 0.001034s 0.000030s 966.9 33812.7 +# rsa 4096 bits 0.005244s 0.000341s 190.7 2935.4 +# dsa 1024 bits 0.000201s 0.000205s 4976.1 4879.2 +# dsa 2048 bits 0.000328s 0.000360s 3051.1 2774.2 +# +# 32-bit code is prone to performance degradation as interrupt rate +# dispatched to CPU executing the code grows. This is because in +# standard process of handling interrupt in 32-bit process context +# upper halves of most integer registers used as input or output are +# zeroed. This renders result invalid, and operation has to be re-run. +# If CPU is "bothered" with timer interrupts only, the penalty is +# hardly measurable. But in order to mitigate this problem for higher +# interrupt rates contemporary Linux kernel recognizes biased stack +# even in 32-bit process context and preserves full register contents. +# See http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=517ffce4e1a03aea979fe3a18a3dd1761a24fafb +# for details. + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +push(@INC,"${dir}","${dir}../../perlasm"); +require "sparcv9_modes.pl"; + +$code.=<<___; +#include "sparc_arch.h" + +#ifdef __arch64__ +.register %g2,#scratch +.register %g3,#scratch +#endif + +.section ".text",#alloc,#execinstr + +#ifdef __PIC__ +SPARC_PIC_THUNK(%g1) +#endif +___ + +######################################################################## +# Register layout for mont[mul|sqr] instructions. +# For details see "Oracle SPARC Architecture 2011" manual at +# http://www.oracle.com/technetwork/server-storage/sun-sparc-enterprise/documentation/. +# +my @R=map("%f".2*$_,(0..11,30,31,12..29)); +my @N=(map("%l$_",(0..7)),map("%o$_",(0..5))); @N=(@N,@N,@N[0..3]); +my @A=(@N[0..13],@R[14..31]); +my @B=(map("%i$_",(0..5)),map("%l$_",(0..7))); @B=(@B,@B,map("%o$_",(0..3))); + +######################################################################## +# int bn_mul_mont_t4_$NUM(u64 *rp,const u64 *ap,const u64 *bp, +# const u64 *np,const BN_ULONG *n0); +# +sub generate_bn_mul_mont_t4() { +my $NUM=shift; +my ($rp,$ap,$bp,$np,$sentinel)=map("%g$_",(1..5)); + +$code.=<<___; +.globl bn_mul_mont_t4_$NUM +.align 32 +bn_mul_mont_t4_$NUM: +#ifdef __arch64__ + mov 0,$sentinel + mov -128,%g4 +#elif defined(SPARCV9_64BIT_STACK) + SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5) + ld [%g1+0],%g1 ! OPENSSL_sparcv9_P[0] + mov -2047,%g4 + and %g1,SPARCV9_64BIT_STACK,%g1 + movrz %g1,0,%g4 + mov -1,$sentinel + add %g4,-128,%g4 +#else + mov -1,$sentinel + mov -128,%g4 +#endif + sllx $sentinel,32,$sentinel + save %sp,%g4,%sp +#ifndef __arch64__ + save %sp,-128,%sp ! warm it up + save %sp,-128,%sp + save %sp,-128,%sp + save %sp,-128,%sp + save %sp,-128,%sp + save %sp,-128,%sp + restore + restore + restore + restore + restore + restore +#endif + and %sp,1,%g4 + or $sentinel,%fp,%fp + or %g4,$sentinel,$sentinel + + ! copy arguments to global registers + mov %i0,$rp + mov %i1,$ap + mov %i2,$bp + mov %i3,$np + ld [%i4+0],%f1 ! load *n0 + ld [%i4+4],%f0 + fsrc2 %f0,%f60 +___ + +# load ap[$NUM] ######################################################## +$code.=<<___; + save %sp,-128,%sp; or $sentinel,%fp,%fp +___ +for($i=0; $i<14 && $i<$NUM; $i++) { +my $lo=$i<13?@A[$i+1]:"%o7"; +$code.=<<___; + ld [$ap+$i*8+0],$lo + ld [$ap+$i*8+4],@A[$i] + sllx @A[$i],32,@A[$i] + or $lo,@A[$i],@A[$i] +___ +} +for(; $i<$NUM; $i++) { +my ($hi,$lo)=("%f".2*($i%4),"%f".(2*($i%4)+1)); +$code.=<<___; + ld [$ap+$i*8+0],$lo + ld [$ap+$i*8+4],$hi + fsrc2 $hi,@A[$i] +___ +} +# load np[$NUM] ######################################################## +$code.=<<___; + save %sp,-128,%sp; or $sentinel,%fp,%fp +___ +for($i=0; $i<14 && $i<$NUM; $i++) { +my $lo=$i<13?@N[$i+1]:"%o7"; +$code.=<<___; + ld [$np+$i*8+0],$lo + ld [$np+$i*8+4],@N[$i] + sllx @N[$i],32,@N[$i] + or $lo,@N[$i],@N[$i] +___ +} +$code.=<<___; + save %sp,-128,%sp; or $sentinel,%fp,%fp +___ +for(; $i<28 && $i<$NUM; $i++) { +my $lo=$i<27?@N[$i+1]:"%o7"; +$code.=<<___; + ld [$np+$i*8+0],$lo + ld [$np+$i*8+4],@N[$i] + sllx @N[$i],32,@N[$i] + or $lo,@N[$i],@N[$i] +___ +} +$code.=<<___; + save %sp,-128,%sp; or $sentinel,%fp,%fp +___ +for(; $i<$NUM; $i++) { +my $lo=($i<$NUM-1)?@N[$i+1]:"%o7"; +$code.=<<___; + ld [$np+$i*8+0],$lo + ld [$np+$i*8+4],@N[$i] + sllx @N[$i],32,@N[$i] + or $lo,@N[$i],@N[$i] +___ +} +$code.=<<___; + cmp $ap,$bp + be SIZE_T_CC,.Lmsquare_$NUM + nop +___ + +# load bp[$NUM] ######################################################## +$code.=<<___; + save %sp,-128,%sp; or $sentinel,%fp,%fp +___ +for($i=0; $i<14 && $i<$NUM; $i++) { +my $lo=$i<13?@B[$i+1]:"%o7"; +$code.=<<___; + ld [$bp+$i*8+0],$lo + ld [$bp+$i*8+4],@B[$i] + sllx @B[$i],32,@B[$i] + or $lo,@B[$i],@B[$i] +___ +} +$code.=<<___; + save %sp,-128,%sp; or $sentinel,%fp,%fp +___ +for(; $i<$NUM; $i++) { +my $lo=($i<$NUM-1)?@B[$i+1]:"%o7"; +$code.=<<___; + ld [$bp+$i*8+0],$lo + ld [$bp+$i*8+4],@B[$i] + sllx @B[$i],32,@B[$i] + or $lo,@B[$i],@B[$i] +___ +} +# magic ################################################################ +$code.=<<___; + .word 0x81b02920+$NUM-1 ! montmul $NUM-1 +.Lmresume_$NUM: + fbu,pn %fcc3,.Lmabort_$NUM +#ifndef __arch64__ + and %fp,$sentinel,$sentinel + brz,pn $sentinel,.Lmabort_$NUM +#endif + nop +#ifdef __arch64__ + restore + restore + restore + restore + restore +#else + restore; and %fp,$sentinel,$sentinel + restore; and %fp,$sentinel,$sentinel + restore; and %fp,$sentinel,$sentinel + restore; and %fp,$sentinel,$sentinel + brz,pn $sentinel,.Lmabort1_$NUM + restore +#endif +___ + +# save tp[$NUM] ######################################################## +for($i=0; $i<14 && $i<$NUM; $i++) { +$code.=<<___; + movxtod @A[$i],@R[$i] +___ +} +$code.=<<___; +#ifdef __arch64__ + restore +#else + and %fp,$sentinel,$sentinel + restore + and $sentinel,1,%o7 + and %fp,$sentinel,$sentinel + srl %fp,0,%fp ! just in case? + or %o7,$sentinel,$sentinel + brz,a,pn $sentinel,.Lmdone_$NUM + mov 0,%i0 ! return failure +#endif +___ +for($i=0; $i<12 && $i<$NUM; $i++) { +@R[$i] =~ /%f([0-9]+)/; +my $lo = "%f".($1+1); +$code.=<<___; + st $lo,[$rp+$i*8+0] + st @R[$i],[$rp+$i*8+4] +___ +} +for(; $i<$NUM; $i++) { +my ($hi,$lo)=("%f".2*($i%4),"%f".(2*($i%4)+1)); +$code.=<<___; + fsrc2 @R[$i],$hi + st $lo,[$rp+$i*8+0] + st $hi,[$rp+$i*8+4] +___ +} +$code.=<<___; + mov 1,%i0 ! return success +.Lmdone_$NUM: + ret + restore + +.Lmabort_$NUM: + restore + restore + restore + restore + restore +.Lmabort1_$NUM: + restore + + mov 0,%i0 ! return failure + ret + restore + +.align 32 +.Lmsquare_$NUM: + save %sp,-128,%sp; or $sentinel,%fp,%fp + save %sp,-128,%sp; or $sentinel,%fp,%fp + .word 0x81b02940+$NUM-1 ! montsqr $NUM-1 + ba .Lmresume_$NUM + nop +.type bn_mul_mont_t4_$NUM, #function +.size bn_mul_mont_t4_$NUM, .-bn_mul_mont_t4_$NUM +___ +} + +for ($i=8;$i<=32;$i+=8) { + &generate_bn_mul_mont_t4($i); +} + +######################################################################## +# +sub load_ccr { +my ($ptbl,$pwr,$ccr,$skip_wr)=@_; +$code.=<<___; + srl $pwr, 2, %o4 + and $pwr, 3, %o5 + and %o4, 7, %o4 + sll %o5, 3, %o5 ! offset within first cache line + add %o5, $ptbl, $ptbl ! of the pwrtbl + or %g0, 1, %o5 + sll %o5, %o4, $ccr +___ +$code.=<<___ if (!$skip_wr); + wr $ccr, %g0, %ccr +___ +} +sub load_b_pair { +my ($pwrtbl,$B0,$B1)=@_; + +$code.=<<___; + ldx [$pwrtbl+0*32], $B0 + ldx [$pwrtbl+8*32], $B1 + ldx [$pwrtbl+1*32], %o4 + ldx [$pwrtbl+9*32], %o5 + movvs %icc, %o4, $B0 + ldx [$pwrtbl+2*32], %o4 + movvs %icc, %o5, $B1 + ldx [$pwrtbl+10*32],%o5 + move %icc, %o4, $B0 + ldx [$pwrtbl+3*32], %o4 + move %icc, %o5, $B1 + ldx [$pwrtbl+11*32],%o5 + movneg %icc, %o4, $B0 + ldx [$pwrtbl+4*32], %o4 + movneg %icc, %o5, $B1 + ldx [$pwrtbl+12*32],%o5 + movcs %xcc, %o4, $B0 + ldx [$pwrtbl+5*32],%o4 + movcs %xcc, %o5, $B1 + ldx [$pwrtbl+13*32],%o5 + movvs %xcc, %o4, $B0 + ldx [$pwrtbl+6*32], %o4 + movvs %xcc, %o5, $B1 + ldx [$pwrtbl+14*32],%o5 + move %xcc, %o4, $B0 + ldx [$pwrtbl+7*32], %o4 + move %xcc, %o5, $B1 + ldx [$pwrtbl+15*32],%o5 + movneg %xcc, %o4, $B0 + add $pwrtbl,16*32, $pwrtbl + movneg %xcc, %o5, $B1 +___ +} +sub load_b { +my ($pwrtbl,$Bi)=@_; + +$code.=<<___; + ldx [$pwrtbl+0*32], $Bi + ldx [$pwrtbl+1*32], %o4 + ldx [$pwrtbl+2*32], %o5 + movvs %icc, %o4, $Bi + ldx [$pwrtbl+3*32], %o4 + move %icc, %o5, $Bi + ldx [$pwrtbl+4*32], %o5 + movneg %icc, %o4, $Bi + ldx [$pwrtbl+5*32], %o4 + movcs %xcc, %o5, $Bi + ldx [$pwrtbl+6*32], %o5 + movvs %xcc, %o4, $Bi + ldx [$pwrtbl+7*32], %o4 + move %xcc, %o5, $Bi + add $pwrtbl,8*32, $pwrtbl + movneg %xcc, %o4, $Bi +___ +} + +######################################################################## +# int bn_pwr5_mont_t4_$NUM(u64 *tp,const u64 *np,const BN_ULONG *n0, +# const u64 *pwrtbl,int pwr,int stride); +# +sub generate_bn_pwr5_mont_t4() { +my $NUM=shift; +my ($tp,$np,$pwrtbl,$pwr,$sentinel)=map("%g$_",(1..5)); + +$code.=<<___; +.globl bn_pwr5_mont_t4_$NUM +.align 32 +bn_pwr5_mont_t4_$NUM: +#ifdef __arch64__ + mov 0,$sentinel + mov -128,%g4 +#elif defined(SPARCV9_64BIT_STACK) + SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5) + ld [%g1+0],%g1 ! OPENSSL_sparcv9_P[0] + mov -2047,%g4 + and %g1,SPARCV9_64BIT_STACK,%g1 + movrz %g1,0,%g4 + mov -1,$sentinel + add %g4,-128,%g4 +#else + mov -1,$sentinel + mov -128,%g4 +#endif + sllx $sentinel,32,$sentinel + save %sp,%g4,%sp +#ifndef __arch64__ + save %sp,-128,%sp ! warm it up + save %sp,-128,%sp + save %sp,-128,%sp + save %sp,-128,%sp + save %sp,-128,%sp + save %sp,-128,%sp + restore + restore + restore + restore + restore + restore +#endif + and %sp,1,%g4 + or $sentinel,%fp,%fp + or %g4,$sentinel,$sentinel + + ! copy arguments to global registers + mov %i0,$tp + mov %i1,$np + ld [%i2+0],%f1 ! load *n0 + ld [%i2+4],%f0 + mov %i3,$pwrtbl + srl %i4,%g0,%i4 ! pack last arguments + sllx %i5,32,$pwr + or %i4,$pwr,$pwr + fsrc2 %f0,%f60 +___ + +# load tp[$NUM] ######################################################## +$code.=<<___; + save %sp,-128,%sp; or $sentinel,%fp,%fp +___ +for($i=0; $i<14 && $i<$NUM; $i++) { +$code.=<<___; + ldx [$tp+$i*8],@A[$i] +___ +} +for(; $i<$NUM; $i++) { +$code.=<<___; + ldd [$tp+$i*8],@A[$i] +___ +} +# load np[$NUM] ######################################################## +$code.=<<___; + save %sp,-128,%sp; or $sentinel,%fp,%fp +___ +for($i=0; $i<14 && $i<$NUM; $i++) { +$code.=<<___; + ldx [$np+$i*8],@N[$i] +___ +} +$code.=<<___; + save %sp,-128,%sp; or $sentinel,%fp,%fp +___ +for(; $i<28 && $i<$NUM; $i++) { +$code.=<<___; + ldx [$np+$i*8],@N[$i] +___ +} +$code.=<<___; + save %sp,-128,%sp; or $sentinel,%fp,%fp +___ +for(; $i<$NUM; $i++) { +$code.=<<___; + ldx [$np+$i*8],@N[$i] +___ +} +# load pwrtbl[pwr] ######################################################## +$code.=<<___; + save %sp,-128,%sp; or $sentinel,%fp,%fp + + srlx $pwr, 32, %o4 ! unpack $pwr + srl $pwr, %g0, %o5 + sub %o4, 5, %o4 + mov $pwrtbl, %o7 + sllx %o4, 32, $pwr ! re-pack $pwr + or %o5, $pwr, $pwr + srl %o5, %o4, %o5 +___ + &load_ccr("%o7","%o5","%o4"); +$code.=<<___; + b .Lstride_$NUM + nop +.align 16 +.Lstride_$NUM: +___ +for($i=0; $i<14 && $i<$NUM; $i+=2) { + &load_b_pair("%o7",@B[$i],@B[$i+1]); +} +$code.=<<___; + save %sp,-128,%sp; or $sentinel,%fp,%fp +___ +for(; $i<$NUM; $i+=2) { + &load_b_pair("%i7",@B[$i],@B[$i+1]); +} +$code.=<<___; + srax $pwr, 32, %o4 ! unpack $pwr + srl $pwr, %g0, %o5 + sub %o4, 5, %o4 + mov $pwrtbl, %i7 + sllx %o4, 32, $pwr ! re-pack $pwr + or %o5, $pwr, $pwr + srl %o5, %o4, %o5 +___ + &load_ccr("%i7","%o5","%o4",1); + +# magic ################################################################ +for($i=0; $i<5; $i++) { +$code.=<<___; + .word 0x81b02940+$NUM-1 ! montsqr $NUM-1 + fbu,pn %fcc3,.Labort_$NUM +#ifndef __arch64__ + and %fp,$sentinel,$sentinel + brz,pn $sentinel,.Labort_$NUM +#endif + nop +___ +} +$code.=<<___; + wr %o4, %g0, %ccr + .word 0x81b02920+$NUM-1 ! montmul $NUM-1 + fbu,pn %fcc3,.Labort_$NUM +#ifndef __arch64__ + and %fp,$sentinel,$sentinel + brz,pn $sentinel,.Labort_$NUM +#endif + + srax $pwr, 32, %o4 +#ifdef __arch64__ + brgez %o4,.Lstride_$NUM + restore + restore + restore + restore + restore +#else + brgez %o4,.Lstride_$NUM + restore; and %fp,$sentinel,$sentinel + restore; and %fp,$sentinel,$sentinel + restore; and %fp,$sentinel,$sentinel + restore; and %fp,$sentinel,$sentinel + brz,pn $sentinel,.Labort1_$NUM + restore +#endif +___ + +# save tp[$NUM] ######################################################## +for($i=0; $i<14 && $i<$NUM; $i++) { +$code.=<<___; + movxtod @A[$i],@R[$i] +___ +} +$code.=<<___; +#ifdef __arch64__ + restore +#else + and %fp,$sentinel,$sentinel + restore + and $sentinel,1,%o7 + and %fp,$sentinel,$sentinel + srl %fp,0,%fp ! just in case? + or %o7,$sentinel,$sentinel + brz,a,pn $sentinel,.Ldone_$NUM + mov 0,%i0 ! return failure +#endif +___ +for($i=0; $i<$NUM; $i++) { +$code.=<<___; + std @R[$i],[$tp+$i*8] +___ +} +$code.=<<___; + mov 1,%i0 ! return success +.Ldone_$NUM: + ret + restore + +.Labort_$NUM: + restore + restore + restore + restore + restore +.Labort1_$NUM: + restore + + mov 0,%i0 ! return failure + ret + restore +.type bn_pwr5_mont_t4_$NUM, #function +.size bn_pwr5_mont_t4_$NUM, .-bn_pwr5_mont_t4_$NUM +___ +} + +for ($i=8;$i<=32;$i+=8) { + &generate_bn_pwr5_mont_t4($i); +} + +{ +######################################################################## +# Fall-back subroutines +# +# copy of bn_mul_mont_vis3 adjusted for vectors of 64-bit values +# +($n0,$m0,$m1,$lo0,$hi0, $lo1,$hi1,$aj,$alo,$nj,$nlo,$tj)= + (map("%g$_",(1..5)),map("%o$_",(0..5,7))); + +# int bn_mul_mont( +$rp="%o0"; # u64 *rp, +$ap="%o1"; # const u64 *ap, +$bp="%o2"; # const u64 *bp, +$np="%o3"; # const u64 *np, +$n0p="%o4"; # const BN_ULONG *n0, +$num="%o5"; # int num); # caller ensures that num is >=3 +$code.=<<___; +.globl bn_mul_mont_t4 +.align 32 +bn_mul_mont_t4: + add %sp, STACK_BIAS, %g4 ! real top of stack + sll $num, 3, $num ! size in bytes + add $num, 63, %g1 + andn %g1, 63, %g1 ! buffer size rounded up to 64 bytes + sub %g4, %g1, %g1 + andn %g1, 63, %g1 ! align at 64 byte + sub %g1, STACK_FRAME, %g1 ! new top of stack + sub %g1, %g4, %g1 + + save %sp, %g1, %sp +___ +# +-------------------------------+<----- %sp +# . . +# +-------------------------------+<----- aligned at 64 bytes +# | __int64 tmp[0] | +# +-------------------------------+ +# . . +# . . +# +-------------------------------+<----- aligned at 64 bytes +# . . +($rp,$ap,$bp,$np,$n0p,$num)=map("%i$_",(0..5)); +($t0,$t1,$t2,$t3,$cnt,$tp,$bufsz)=map("%l$_",(0..7)); +($ovf,$i)=($t0,$t1); +$code.=<<___; + ld [$n0p+0], $t0 ! pull n0[0..1] value + ld [$n0p+4], $t1 + add %sp, STACK_BIAS+STACK_FRAME, $tp + ldx [$bp+0], $m0 ! m0=bp[0] + sllx $t1, 32, $n0 + add $bp, 8, $bp + or $t0, $n0, $n0 + + ldx [$ap+0], $aj ! ap[0] + + mulx $aj, $m0, $lo0 ! ap[0]*bp[0] + umulxhi $aj, $m0, $hi0 + + ldx [$ap+8], $aj ! ap[1] + add $ap, 16, $ap + ldx [$np+0], $nj ! np[0] + + mulx $lo0, $n0, $m1 ! "tp[0]"*n0 + + mulx $aj, $m0, $alo ! ap[1]*bp[0] + umulxhi $aj, $m0, $aj ! ahi=aj + + mulx $nj, $m1, $lo1 ! np[0]*m1 + umulxhi $nj, $m1, $hi1 + + ldx [$np+8], $nj ! np[1] + + addcc $lo0, $lo1, $lo1 + add $np, 16, $np + addxc %g0, $hi1, $hi1 + + mulx $nj, $m1, $nlo ! np[1]*m1 + umulxhi $nj, $m1, $nj ! nhi=nj + + ba .L1st + sub $num, 24, $cnt ! cnt=num-3 + +.align 16 +.L1st: + addcc $alo, $hi0, $lo0 + addxc $aj, %g0, $hi0 + + ldx [$ap+0], $aj ! ap[j] + addcc $nlo, $hi1, $lo1 + add $ap, 8, $ap + addxc $nj, %g0, $hi1 ! nhi=nj + + ldx [$np+0], $nj ! np[j] + mulx $aj, $m0, $alo ! ap[j]*bp[0] + add $np, 8, $np + umulxhi $aj, $m0, $aj ! ahi=aj + + mulx $nj, $m1, $nlo ! np[j]*m1 + addcc $lo0, $lo1, $lo1 ! np[j]*m1+ap[j]*bp[0] + umulxhi $nj, $m1, $nj ! nhi=nj + addxc %g0, $hi1, $hi1 + stxa $lo1, [$tp]0xe2 ! tp[j-1] + add $tp, 8, $tp ! tp++ + + brnz,pt $cnt, .L1st + sub $cnt, 8, $cnt ! j-- +!.L1st + addcc $alo, $hi0, $lo0 + addxc $aj, %g0, $hi0 ! ahi=aj + + addcc $nlo, $hi1, $lo1 + addxc $nj, %g0, $hi1 + addcc $lo0, $lo1, $lo1 ! np[j]*m1+ap[j]*bp[0] + addxc %g0, $hi1, $hi1 + stxa $lo1, [$tp]0xe2 ! tp[j-1] + add $tp, 8, $tp + + addcc $hi0, $hi1, $hi1 + addxc %g0, %g0, $ovf ! upmost overflow bit + stxa $hi1, [$tp]0xe2 + add $tp, 8, $tp + + ba .Louter + sub $num, 16, $i ! i=num-2 + +.align 16 +.Louter: + ldx [$bp+0], $m0 ! m0=bp[i] + add $bp, 8, $bp + + sub $ap, $num, $ap ! rewind + sub $np, $num, $np + sub $tp, $num, $tp + + ldx [$ap+0], $aj ! ap[0] + ldx [$np+0], $nj ! np[0] + + mulx $aj, $m0, $lo0 ! ap[0]*bp[i] + ldx [$tp], $tj ! tp[0] + umulxhi $aj, $m0, $hi0 + ldx [$ap+8], $aj ! ap[1] + addcc $lo0, $tj, $lo0 ! ap[0]*bp[i]+tp[0] + mulx $aj, $m0, $alo ! ap[1]*bp[i] + addxc %g0, $hi0, $hi0 + mulx $lo0, $n0, $m1 ! tp[0]*n0 + umulxhi $aj, $m0, $aj ! ahi=aj + mulx $nj, $m1, $lo1 ! np[0]*m1 + add $ap, 16, $ap + umulxhi $nj, $m1, $hi1 + ldx [$np+8], $nj ! np[1] + add $np, 16, $np + addcc $lo1, $lo0, $lo1 + mulx $nj, $m1, $nlo ! np[1]*m1 + addxc %g0, $hi1, $hi1 + umulxhi $nj, $m1, $nj ! nhi=nj + + ba .Linner + sub $num, 24, $cnt ! cnt=num-3 +.align 16 +.Linner: + addcc $alo, $hi0, $lo0 + ldx [$tp+8], $tj ! tp[j] + addxc $aj, %g0, $hi0 ! ahi=aj + ldx [$ap+0], $aj ! ap[j] + add $ap, 8, $ap + addcc $nlo, $hi1, $lo1 + mulx $aj, $m0, $alo ! ap[j]*bp[i] + addxc $nj, %g0, $hi1 ! nhi=nj + ldx [$np+0], $nj ! np[j] + add $np, 8, $np + umulxhi $aj, $m0, $aj ! ahi=aj + addcc $lo0, $tj, $lo0 ! ap[j]*bp[i]+tp[j] + mulx $nj, $m1, $nlo ! np[j]*m1 + addxc %g0, $hi0, $hi0 + umulxhi $nj, $m1, $nj ! nhi=nj + addcc $lo1, $lo0, $lo1 ! np[j]*m1+ap[j]*bp[i]+tp[j] + addxc %g0, $hi1, $hi1 + stx $lo1, [$tp] ! tp[j-1] + add $tp, 8, $tp + brnz,pt $cnt, .Linner + sub $cnt, 8, $cnt +!.Linner + ldx [$tp+8], $tj ! tp[j] + addcc $alo, $hi0, $lo0 + addxc $aj, %g0, $hi0 ! ahi=aj + addcc $lo0, $tj, $lo0 ! ap[j]*bp[i]+tp[j] + addxc %g0, $hi0, $hi0 + + addcc $nlo, $hi1, $lo1 + addxc $nj, %g0, $hi1 ! nhi=nj + addcc $lo1, $lo0, $lo1 ! np[j]*m1+ap[j]*bp[i]+tp[j] + addxc %g0, $hi1, $hi1 + stx $lo1, [$tp] ! tp[j-1] + + subcc %g0, $ovf, %g0 ! move upmost overflow to CCR.xcc + addxccc $hi1, $hi0, $hi1 + addxc %g0, %g0, $ovf + stx $hi1, [$tp+8] + add $tp, 16, $tp + + brnz,pt $i, .Louter + sub $i, 8, $i + + sub $ap, $num, $ap ! rewind + sub $np, $num, $np + sub $tp, $num, $tp + ba .Lsub + subcc $num, 8, $cnt ! cnt=num-1 and clear CCR.xcc + +.align 16 +.Lsub: + ldx [$tp], $tj + add $tp, 8, $tp + ldx [$np+0], $nj + add $np, 8, $np + subccc $tj, $nj, $t2 ! tp[j]-np[j] + srlx $tj, 32, $tj + srlx $nj, 32, $nj + subccc $tj, $nj, $t3 + add $rp, 8, $rp + st $t2, [$rp-4] ! reverse order + st $t3, [$rp-8] + brnz,pt $cnt, .Lsub + sub $cnt, 8, $cnt + + sub $np, $num, $np ! rewind + sub $tp, $num, $tp + sub $rp, $num, $rp + + subc $ovf, %g0, $ovf ! handle upmost overflow bit + and $tp, $ovf, $ap + andn $rp, $ovf, $np + or $np, $ap, $ap ! ap=borrow?tp:rp + ba .Lcopy + sub $num, 8, $cnt + +.align 16 +.Lcopy: ! copy or in-place refresh + ldx [$ap+0], $t2 + add $ap, 8, $ap + stx %g0, [$tp] ! zap + add $tp, 8, $tp + stx $t2, [$rp+0] + add $rp, 8, $rp + brnz $cnt, .Lcopy + sub $cnt, 8, $cnt + + mov 1, %o0 + ret + restore +.type bn_mul_mont_t4, #function +.size bn_mul_mont_t4, .-bn_mul_mont_t4 +___ + +# int bn_mul_mont_gather5( +$rp="%o0"; # u64 *rp, +$ap="%o1"; # const u64 *ap, +$bp="%o2"; # const u64 *pwrtbl, +$np="%o3"; # const u64 *np, +$n0p="%o4"; # const BN_ULONG *n0, +$num="%o5"; # int num, # caller ensures that num is >=3 + # int power); +$code.=<<___; +.globl bn_mul_mont_gather5_t4 +.align 32 +bn_mul_mont_gather5_t4: + add %sp, STACK_BIAS, %g4 ! real top of stack + sll $num, 3, $num ! size in bytes + add $num, 63, %g1 + andn %g1, 63, %g1 ! buffer size rounded up to 64 bytes + sub %g4, %g1, %g1 + andn %g1, 63, %g1 ! align at 64 byte + sub %g1, STACK_FRAME, %g1 ! new top of stack + sub %g1, %g4, %g1 + LDPTR [%sp+STACK_7thARG], %g4 ! load power, 7th argument + + save %sp, %g1, %sp +___ +# +-------------------------------+<----- %sp +# . . +# +-------------------------------+<----- aligned at 64 bytes +# | __int64 tmp[0] | +# +-------------------------------+ +# . . +# . . +# +-------------------------------+<----- aligned at 64 bytes +# . . +($rp,$ap,$bp,$np,$n0p,$num)=map("%i$_",(0..5)); +($t0,$t1,$t2,$t3,$cnt,$tp,$bufsz,$ccr)=map("%l$_",(0..7)); +($ovf,$i)=($t0,$t1); + &load_ccr($bp,"%g4",$ccr); + &load_b($bp,$m0,"%o7"); # m0=bp[0] + +$code.=<<___; + ld [$n0p+0], $t0 ! pull n0[0..1] value + ld [$n0p+4], $t1 + add %sp, STACK_BIAS+STACK_FRAME, $tp + sllx $t1, 32, $n0 + or $t0, $n0, $n0 + + ldx [$ap+0], $aj ! ap[0] + + mulx $aj, $m0, $lo0 ! ap[0]*bp[0] + umulxhi $aj, $m0, $hi0 + + ldx [$ap+8], $aj ! ap[1] + add $ap, 16, $ap + ldx [$np+0], $nj ! np[0] + + mulx $lo0, $n0, $m1 ! "tp[0]"*n0 + + mulx $aj, $m0, $alo ! ap[1]*bp[0] + umulxhi $aj, $m0, $aj ! ahi=aj + + mulx $nj, $m1, $lo1 ! np[0]*m1 + umulxhi $nj, $m1, $hi1 + + ldx [$np+8], $nj ! np[1] + + addcc $lo0, $lo1, $lo1 + add $np, 16, $np + addxc %g0, $hi1, $hi1 + + mulx $nj, $m1, $nlo ! np[1]*m1 + umulxhi $nj, $m1, $nj ! nhi=nj + + ba .L1st_g5 + sub $num, 24, $cnt ! cnt=num-3 + +.align 16 +.L1st_g5: + addcc $alo, $hi0, $lo0 + addxc $aj, %g0, $hi0 + + ldx [$ap+0], $aj ! ap[j] + addcc $nlo, $hi1, $lo1 + add $ap, 8, $ap + addxc $nj, %g0, $hi1 ! nhi=nj + + ldx [$np+0], $nj ! np[j] + mulx $aj, $m0, $alo ! ap[j]*bp[0] + add $np, 8, $np + umulxhi $aj, $m0, $aj ! ahi=aj + + mulx $nj, $m1, $nlo ! np[j]*m1 + addcc $lo0, $lo1, $lo1 ! np[j]*m1+ap[j]*bp[0] + umulxhi $nj, $m1, $nj ! nhi=nj + addxc %g0, $hi1, $hi1 + stxa $lo1, [$tp]0xe2 ! tp[j-1] + add $tp, 8, $tp ! tp++ + + brnz,pt $cnt, .L1st_g5 + sub $cnt, 8, $cnt ! j-- +!.L1st_g5 + addcc $alo, $hi0, $lo0 + addxc $aj, %g0, $hi0 ! ahi=aj + + addcc $nlo, $hi1, $lo1 + addxc $nj, %g0, $hi1 + addcc $lo0, $lo1, $lo1 ! np[j]*m1+ap[j]*bp[0] + addxc %g0, $hi1, $hi1 + stxa $lo1, [$tp]0xe2 ! tp[j-1] + add $tp, 8, $tp + + addcc $hi0, $hi1, $hi1 + addxc %g0, %g0, $ovf ! upmost overflow bit + stxa $hi1, [$tp]0xe2 + add $tp, 8, $tp + + ba .Louter_g5 + sub $num, 16, $i ! i=num-2 + +.align 16 +.Louter_g5: + wr $ccr, %g0, %ccr +___ + &load_b($bp,$m0); # m0=bp[i] +$code.=<<___; + sub $ap, $num, $ap ! rewind + sub $np, $num, $np + sub $tp, $num, $tp + + ldx [$ap+0], $aj ! ap[0] + ldx [$np+0], $nj ! np[0] + + mulx $aj, $m0, $lo0 ! ap[0]*bp[i] + ldx [$tp], $tj ! tp[0] + umulxhi $aj, $m0, $hi0 + ldx [$ap+8], $aj ! ap[1] + addcc $lo0, $tj, $lo0 ! ap[0]*bp[i]+tp[0] + mulx $aj, $m0, $alo ! ap[1]*bp[i] + addxc %g0, $hi0, $hi0 + mulx $lo0, $n0, $m1 ! tp[0]*n0 + umulxhi $aj, $m0, $aj ! ahi=aj + mulx $nj, $m1, $lo1 ! np[0]*m1 + add $ap, 16, $ap + umulxhi $nj, $m1, $hi1 + ldx [$np+8], $nj ! np[1] + add $np, 16, $np + addcc $lo1, $lo0, $lo1 + mulx $nj, $m1, $nlo ! np[1]*m1 + addxc %g0, $hi1, $hi1 + umulxhi $nj, $m1, $nj ! nhi=nj + + ba .Linner_g5 + sub $num, 24, $cnt ! cnt=num-3 +.align 16 +.Linner_g5: + addcc $alo, $hi0, $lo0 + ldx [$tp+8], $tj ! tp[j] + addxc $aj, %g0, $hi0 ! ahi=aj + ldx [$ap+0], $aj ! ap[j] + add $ap, 8, $ap + addcc $nlo, $hi1, $lo1 + mulx $aj, $m0, $alo ! ap[j]*bp[i] + addxc $nj, %g0, $hi1 ! nhi=nj + ldx [$np+0], $nj ! np[j] + add $np, 8, $np + umulxhi $aj, $m0, $aj ! ahi=aj + addcc $lo0, $tj, $lo0 ! ap[j]*bp[i]+tp[j] + mulx $nj, $m1, $nlo ! np[j]*m1 + addxc %g0, $hi0, $hi0 + umulxhi $nj, $m1, $nj ! nhi=nj + addcc $lo1, $lo0, $lo1 ! np[j]*m1+ap[j]*bp[i]+tp[j] + addxc %g0, $hi1, $hi1 + stx $lo1, [$tp] ! tp[j-1] + add $tp, 8, $tp + brnz,pt $cnt, .Linner_g5 + sub $cnt, 8, $cnt +!.Linner_g5 + ldx [$tp+8], $tj ! tp[j] + addcc $alo, $hi0, $lo0 + addxc $aj, %g0, $hi0 ! ahi=aj + addcc $lo0, $tj, $lo0 ! ap[j]*bp[i]+tp[j] + addxc %g0, $hi0, $hi0 + + addcc $nlo, $hi1, $lo1 + addxc $nj, %g0, $hi1 ! nhi=nj + addcc $lo1, $lo0, $lo1 ! np[j]*m1+ap[j]*bp[i]+tp[j] + addxc %g0, $hi1, $hi1 + stx $lo1, [$tp] ! tp[j-1] + + subcc %g0, $ovf, %g0 ! move upmost overflow to CCR.xcc + addxccc $hi1, $hi0, $hi1 + addxc %g0, %g0, $ovf + stx $hi1, [$tp+8] + add $tp, 16, $tp + + brnz,pt $i, .Louter_g5 + sub $i, 8, $i + + sub $ap, $num, $ap ! rewind + sub $np, $num, $np + sub $tp, $num, $tp + ba .Lsub_g5 + subcc $num, 8, $cnt ! cnt=num-1 and clear CCR.xcc + +.align 16 +.Lsub_g5: + ldx [$tp], $tj + add $tp, 8, $tp + ldx [$np+0], $nj + add $np, 8, $np + subccc $tj, $nj, $t2 ! tp[j]-np[j] + srlx $tj, 32, $tj + srlx $nj, 32, $nj + subccc $tj, $nj, $t3 + add $rp, 8, $rp + st $t2, [$rp-4] ! reverse order + st $t3, [$rp-8] + brnz,pt $cnt, .Lsub_g5 + sub $cnt, 8, $cnt + + sub $np, $num, $np ! rewind + sub $tp, $num, $tp + sub $rp, $num, $rp + + subc $ovf, %g0, $ovf ! handle upmost overflow bit + and $tp, $ovf, $ap + andn $rp, $ovf, $np + or $np, $ap, $ap ! ap=borrow?tp:rp + ba .Lcopy_g5 + sub $num, 8, $cnt + +.align 16 +.Lcopy_g5: ! copy or in-place refresh + ldx [$ap+0], $t2 + add $ap, 8, $ap + stx %g0, [$tp] ! zap + add $tp, 8, $tp + stx $t2, [$rp+0] + add $rp, 8, $rp + brnz $cnt, .Lcopy_g5 + sub $cnt, 8, $cnt + + mov 1, %o0 + ret + restore +.type bn_mul_mont_gather5_t4, #function +.size bn_mul_mont_gather5_t4, .-bn_mul_mont_gather5_t4 +___ +} + +$code.=<<___; +.globl bn_flip_t4 +.align 32 +bn_flip_t4: +.Loop_flip: + ld [%o1+0], %o4 + sub %o2, 1, %o2 + ld [%o1+4], %o5 + add %o1, 8, %o1 + st %o5, [%o0+0] + st %o4, [%o0+4] + brnz %o2, .Loop_flip + add %o0, 8, %o0 + retl + nop +.type bn_flip_t4, #function +.size bn_flip_t4, .-bn_flip_t4 + +.globl bn_flip_n_scatter5_t4 +.align 32 +bn_flip_n_scatter5_t4: + sll %o3, 3, %o3 + srl %o1, 1, %o1 + add %o3, %o2, %o2 ! &pwrtbl[pwr] + sub %o1, 1, %o1 +.Loop_flip_n_scatter5: + ld [%o0+0], %o4 ! inp[i] + ld [%o0+4], %o5 + add %o0, 8, %o0 + sllx %o5, 32, %o5 + or %o4, %o5, %o5 + stx %o5, [%o2] + add %o2, 32*8, %o2 + brnz %o1, .Loop_flip_n_scatter5 + sub %o1, 1, %o1 + retl + nop +.type bn_flip_n_scatter5_t4, #function +.size bn_flip_n_scatter5_t4, .-bn_flip_n_scatter5_t4 + +.globl bn_gather5_t4 +.align 32 +bn_gather5_t4: +___ + &load_ccr("%o2","%o3","%g1"); +$code.=<<___; + sub %o1, 1, %o1 +.Loop_gather5: +___ + &load_b("%o2","%g1"); +$code.=<<___; + stx %g1, [%o0] + add %o0, 8, %o0 + brnz %o1, .Loop_gather5 + sub %o1, 1, %o1 + + retl + nop +.type bn_gather5_t4, #function +.size bn_gather5_t4, .-bn_gather5_t4 + +.asciz "Montgomery Multiplication for SPARC T4, David S. Miller, Andy Polyakov" +.align 4 +___ + +&emit_assembler(); + +close STDOUT; diff --git a/crypto/bn/asm/sparcv9-gf2m.pl b/crypto/bn/asm/sparcv9-gf2m.pl new file mode 100755 index 0000000..ab94cd9 --- /dev/null +++ b/crypto/bn/asm/sparcv9-gf2m.pl @@ -0,0 +1,190 @@ +#!/usr/bin/env perl +# +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# October 2012 +# +# The module implements bn_GF2m_mul_2x2 polynomial multiplication used +# in bn_gf2m.c. It's kind of low-hanging mechanical port from C for +# the time being... Except that it has two code paths: one suitable +# for all SPARCv9 processors and one for VIS3-capable ones. Former +# delivers ~25-45% more, more for longer keys, heaviest DH and DSA +# verify operations on venerable UltraSPARC II. On T4 VIS3 code is +# ~100-230% faster than gcc-generated code and ~35-90% faster than +# the pure SPARCv9 code path. + +$locals=16*8; + +$tab="%l0"; + +@T=("%g2","%g3"); +@i=("%g4","%g5"); + +($a1,$a2,$a4,$a8,$a12,$a48)=map("%o$_",(0..5)); +($lo,$hi,$b)=("%g1",$a8,"%o7"); $a=$lo; + +$code.=<<___; +#include <sparc_arch.h> + +#ifdef __arch64__ +.register %g2,#scratch +.register %g3,#scratch +#endif + +#ifdef __PIC__ +SPARC_PIC_THUNK(%g1) +#endif + +.globl bn_GF2m_mul_2x2 +.align 16 +bn_GF2m_mul_2x2: + SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5) + ld [%g1+0],%g1 ! OPENSSL_sparcv9cap_P[0] + + andcc %g1, SPARCV9_VIS3, %g0 + bz,pn %icc,.Lsoftware + nop + + sllx %o1, 32, %o1 + sllx %o3, 32, %o3 + or %o2, %o1, %o1 + or %o4, %o3, %o3 + .word 0x95b262ab ! xmulx %o1, %o3, %o2 + .word 0x99b262cb ! xmulxhi %o1, %o3, %o4 + srlx %o2, 32, %o1 ! 13 cycles later + st %o2, [%o0+0] + st %o1, [%o0+4] + srlx %o4, 32, %o3 + st %o4, [%o0+8] + retl + st %o3, [%o0+12] + +.align 16 +.Lsoftware: + save %sp,-STACK_FRAME-$locals,%sp + + sllx %i1,32,$a + mov -1,$a12 + sllx %i3,32,$b + or %i2,$a,$a + srlx $a12,1,$a48 ! 0x7fff... + or %i4,$b,$b + srlx $a12,2,$a12 ! 0x3fff... + add %sp,STACK_BIAS+STACK_FRAME,$tab + + sllx $a,2,$a4 + mov $a,$a1 + sllx $a,1,$a2 + + srax $a4,63,@i[1] ! broadcast 61st bit + and $a48,$a4,$a4 ! (a<<2)&0x7fff... + srlx $a48,2,$a48 + srax $a2,63,@i[0] ! broadcast 62nd bit + and $a12,$a2,$a2 ! (a<<1)&0x3fff... + srax $a1,63,$lo ! broadcast 63rd bit + and $a48,$a1,$a1 ! (a<<0)&0x1fff... + + sllx $a1,3,$a8 + and $b,$lo,$lo + and $b,@i[0],@i[0] + and $b,@i[1],@i[1] + + stx %g0,[$tab+0*8] ! tab[0]=0 + xor $a1,$a2,$a12 + stx $a1,[$tab+1*8] ! tab[1]=a1 + stx $a2,[$tab+2*8] ! tab[2]=a2 + xor $a4,$a8,$a48 + stx $a12,[$tab+3*8] ! tab[3]=a1^a2 + xor $a4,$a1,$a1 + + stx $a4,[$tab+4*8] ! tab[4]=a4 + xor $a4,$a2,$a2 + stx $a1,[$tab+5*8] ! tab[5]=a1^a4 + xor $a4,$a12,$a12 + stx $a2,[$tab+6*8] ! tab[6]=a2^a4 + xor $a48,$a1,$a1 + stx $a12,[$tab+7*8] ! tab[7]=a1^a2^a4 + xor $a48,$a2,$a2 + + stx $a8,[$tab+8*8] ! tab[8]=a8 + xor $a48,$a12,$a12 + stx $a1,[$tab+9*8] ! tab[9]=a1^a8 + xor $a4,$a1,$a1 + stx $a2,[$tab+10*8] ! tab[10]=a2^a8 + xor $a4,$a2,$a2 + stx $a12,[$tab+11*8] ! tab[11]=a1^a2^a8 + + xor $a4,$a12,$a12 + stx $a48,[$tab+12*8] ! tab[12]=a4^a8 + srlx $lo,1,$hi + stx $a1,[$tab+13*8] ! tab[13]=a1^a4^a8 + sllx $lo,63,$lo + stx $a2,[$tab+14*8] ! tab[14]=a2^a4^a8 + srlx @i[0],2,@T[0] + stx $a12,[$tab+15*8] ! tab[15]=a1^a2^a4^a8 + + sllx @i[0],62,$a1 + sllx $b,3,@i[0] + srlx @i[1],3,@T[1] + and @i[0],`0xf<<3`,@i[0] + sllx @i[1],61,$a2 + ldx [$tab+@i[0]],@i[0] + srlx $b,4-3,@i[1] + xor @T[0],$hi,$hi + and @i[1],`0xf<<3`,@i[1] + xor $a1,$lo,$lo + ldx [$tab+@i[1]],@i[1] + xor @T[1],$hi,$hi + + xor @i[0],$lo,$lo + srlx $b,8-3,@i[0] + xor $a2,$lo,$lo + and @i[0],`0xf<<3`,@i[0] +___ +for($n=1;$n<14;$n++) { +$code.=<<___; + sllx @i[1],`$n*4`,@T[0] + ldx [$tab+@i[0]],@i[0] + srlx @i[1],`64-$n*4`,@T[1] + xor @T[0],$lo,$lo + srlx $b,`($n+2)*4`-3,@i[1] + xor @T[1],$hi,$hi + and @i[1],`0xf<<3`,@i[1] +___ + push(@i,shift(@i)); push(@T,shift(@T)); +} +$code.=<<___; + sllx @i[1],`$n*4`,@T[0] + ldx [$tab+@i[0]],@i[0] + srlx @i[1],`64-$n*4`,@T[1] + xor @T[0],$lo,$lo + + sllx @i[0],`($n+1)*4`,@T[0] + xor @T[1],$hi,$hi + srlx @i[0],`64-($n+1)*4`,@T[1] + xor @T[0],$lo,$lo + xor @T[1],$hi,$hi + + srlx $lo,32,%i1 + st $lo,[%i0+0] + st %i1,[%i0+4] + srlx $hi,32,%i2 + st $hi,[%i0+8] + st %i2,[%i0+12] + + ret + restore +.type bn_GF2m_mul_2x2,#function +.size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2 +.asciz "GF(2^m) Multiplication for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>" +.align 4 +___ + +$code =~ s/\`([^\`]*)\`/eval($1)/gem; +print $code; +close STDOUT; diff --git a/crypto/bn/asm/vis3-mont.pl b/crypto/bn/asm/vis3-mont.pl new file mode 100755 index 0000000..263ac02 --- /dev/null +++ b/crypto/bn/asm/vis3-mont.pl @@ -0,0 +1,373 @@ +#!/usr/bin/env perl + +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== + +# October 2012. +# +# SPARCv9 VIS3 Montgomery multiplicaion procedure suitable for T3 and +# onward. There are three new instructions used here: umulxhi, +# addxc[cc] and initializing store. On T3 RSA private key operations +# are 1.54/1.87/2.11/2.26 times faster for 512/1024/2048/4096-bit key +# lengths. This is without dedicated squaring procedure. On T4 +# corresponding coefficients are 1.47/2.10/2.80/2.90x, which is mostly +# for reference purposes, because T4 has dedicated Montgomery +# multiplication and squaring *instructions* that deliver even more. + +$bits=32; +for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); } +if ($bits==64) { $bias=2047; $frame=192; } +else { $bias=0; $frame=112; } + +$code.=<<___ if ($bits==64); +.register %g2,#scratch +.register %g3,#scratch +___ +$code.=<<___; +.section ".text",#alloc,#execinstr +___ + +($n0,$m0,$m1,$lo0,$hi0, $lo1,$hi1,$aj,$alo,$nj,$nlo,$tj)= + (map("%g$_",(1..5)),map("%o$_",(0..5,7))); + +# int bn_mul_mont( +$rp="%o0"; # BN_ULONG *rp, +$ap="%o1"; # const BN_ULONG *ap, +$bp="%o2"; # const BN_ULONG *bp, +$np="%o3"; # const BN_ULONG *np, +$n0p="%o4"; # const BN_ULONG *n0, +$num="%o5"; # int num); # caller ensures that num is even + # and >=6 +$code.=<<___; +.globl bn_mul_mont_vis3 +.align 32 +bn_mul_mont_vis3: + add %sp, $bias, %g4 ! real top of stack + sll $num, 2, $num ! size in bytes + add $num, 63, %g5 + andn %g5, 63, %g5 ! buffer size rounded up to 64 bytes + add %g5, %g5, %g1 + add %g5, %g1, %g1 ! 3*buffer size + sub %g4, %g1, %g1 + andn %g1, 63, %g1 ! align at 64 byte + sub %g1, $frame, %g1 ! new top of stack + sub %g1, %g4, %g1 + + save %sp, %g1, %sp +___ + +# +-------------------------------+<----- %sp +# . . +# +-------------------------------+<----- aligned at 64 bytes +# | __int64 tmp[0] | +# +-------------------------------+ +# . . +# . . +# +-------------------------------+<----- aligned at 64 bytes +# | __int64 ap[1..0] | converted ap[] +# +-------------------------------+ +# | __int64 np[1..0] | converted np[] +# +-------------------------------+ +# | __int64 ap[3..2] | +# . . +# . . +# +-------------------------------+ +($rp,$ap,$bp,$np,$n0p,$num)=map("%i$_",(0..5)); +($t0,$t1,$t2,$t3,$cnt,$tp,$bufsz,$anp)=map("%l$_",(0..7)); +($ovf,$i)=($t0,$t1); +$code.=<<___; + ld [$n0p+0], $t0 ! pull n0[0..1] value + add %sp, $bias+$frame, $tp + ld [$n0p+4], $t1 + add $tp, %g5, $anp + ld [$bp+0], $t2 ! m0=bp[0] + sllx $t1, 32, $n0 + ld [$bp+4], $t3 + or $t0, $n0, $n0 + add $bp, 8, $bp + + ld [$ap+0], $t0 ! ap[0] + sllx $t3, 32, $m0 + ld [$ap+4], $t1 + or $t2, $m0, $m0 + + ld [$ap+8], $t2 ! ap[1] + sllx $t1, 32, $aj + ld [$ap+12], $t3 + or $t0, $aj, $aj + add $ap, 16, $ap + stx $aj, [$anp] ! converted ap[0] + + mulx $aj, $m0, $lo0 ! ap[0]*bp[0] + umulxhi $aj, $m0, $hi0 + + ld [$np+0], $t0 ! np[0] + sllx $t3, 32, $aj + ld [$np+4], $t1 + or $t2, $aj, $aj + + ld [$np+8], $t2 ! np[1] + sllx $t1, 32, $nj + ld [$np+12], $t3 + or $t0, $nj, $nj + add $np, 16, $np + stx $nj, [$anp+8] ! converted np[0] + + mulx $lo0, $n0, $m1 ! "tp[0]"*n0 + stx $aj, [$anp+16] ! converted ap[1] + + mulx $aj, $m0, $alo ! ap[1]*bp[0] + umulxhi $aj, $m0, $aj ! ahi=aj + + mulx $nj, $m1, $lo1 ! np[0]*m1 + umulxhi $nj, $m1, $hi1 + + sllx $t3, 32, $nj + or $t2, $nj, $nj + stx $nj, [$anp+24] ! converted np[1] + add $anp, 32, $anp + + addcc $lo0, $lo1, $lo1 + addxc %g0, $hi1, $hi1 + + mulx $nj, $m1, $nlo ! np[1]*m1 + umulxhi $nj, $m1, $nj ! nhi=nj + + ba .L1st + sub $num, 24, $cnt ! cnt=num-3 + +.align 16 +.L1st: + ld [$ap+0], $t0 ! ap[j] + addcc $alo, $hi0, $lo0 + ld [$ap+4], $t1 + addxc $aj, %g0, $hi0 + + sllx $t1, 32, $aj + add $ap, 8, $ap + or $t0, $aj, $aj + stx $aj, [$anp] ! converted ap[j] + + ld [$np+0], $t2 ! np[j] + addcc $nlo, $hi1, $lo1 + ld [$np+4], $t3 + addxc $nj, %g0, $hi1 ! nhi=nj + + sllx $t3, 32, $nj + add $np, 8, $np + mulx $aj, $m0, $alo ! ap[j]*bp[0] + or $t2, $nj, $nj + umulxhi $aj, $m0, $aj ! ahi=aj + stx $nj, [$anp+8] ! converted np[j] + add $anp, 16, $anp ! anp++ + + mulx $nj, $m1, $nlo ! np[j]*m1 + addcc $lo0, $lo1, $lo1 ! np[j]*m1+ap[j]*bp[0] + umulxhi $nj, $m1, $nj ! nhi=nj + addxc %g0, $hi1, $hi1 + stx $lo1, [$tp] ! tp[j-1] + add $tp, 8, $tp ! tp++ + + brnz,pt $cnt, .L1st + sub $cnt, 8, $cnt ! j-- +!.L1st + addcc $alo, $hi0, $lo0 + addxc $aj, %g0, $hi0 ! ahi=aj + + addcc $nlo, $hi1, $lo1 + addxc $nj, %g0, $hi1 + addcc $lo0, $lo1, $lo1 ! np[j]*m1+ap[j]*bp[0] + addxc %g0, $hi1, $hi1 + stx $lo1, [$tp] ! tp[j-1] + add $tp, 8, $tp + + addcc $hi0, $hi1, $hi1 + addxc %g0, %g0, $ovf ! upmost overflow bit + stx $hi1, [$tp] + add $tp, 8, $tp + + ba .Louter + sub $num, 16, $i ! i=num-2 + +.align 16 +.Louter: + ld [$bp+0], $t2 ! m0=bp[i] + ld [$bp+4], $t3 + + sub $anp, $num, $anp ! rewind + sub $tp, $num, $tp + sub $anp, $num, $anp + + add $bp, 8, $bp + sllx $t3, 32, $m0 + ldx [$anp+0], $aj ! ap[0] + or $t2, $m0, $m0 + ldx [$anp+8], $nj ! np[0] + + mulx $aj, $m0, $lo0 ! ap[0]*bp[i] + ldx [$tp], $tj ! tp[0] + umulxhi $aj, $m0, $hi0 + ldx [$anp+16], $aj ! ap[1] + addcc $lo0, $tj, $lo0 ! ap[0]*bp[i]+tp[0] + mulx $aj, $m0, $alo ! ap[1]*bp[i] + addxc %g0, $hi0, $hi0 + mulx $lo0, $n0, $m1 ! tp[0]*n0 + umulxhi $aj, $m0, $aj ! ahi=aj + mulx $nj, $m1, $lo1 ! np[0]*m1 + umulxhi $nj, $m1, $hi1 + ldx [$anp+24], $nj ! np[1] + add $anp, 32, $anp + addcc $lo1, $lo0, $lo1 + mulx $nj, $m1, $nlo ! np[1]*m1 + addxc %g0, $hi1, $hi1 + umulxhi $nj, $m1, $nj ! nhi=nj + + ba .Linner + sub $num, 24, $cnt ! cnt=num-3 +.align 16 +.Linner: + addcc $alo, $hi0, $lo0 + ldx [$tp+8], $tj ! tp[j] + addxc $aj, %g0, $hi0 ! ahi=aj + ldx [$anp+0], $aj ! ap[j] + addcc $nlo, $hi1, $lo1 + mulx $aj, $m0, $alo ! ap[j]*bp[i] + addxc $nj, %g0, $hi1 ! nhi=nj + ldx [$anp+8], $nj ! np[j] + add $anp, 16, $anp + umulxhi $aj, $m0, $aj ! ahi=aj + addcc $lo0, $tj, $lo0 ! ap[j]*bp[i]+tp[j] + mulx $nj, $m1, $nlo ! np[j]*m1 + addxc %g0, $hi0, $hi0 + umulxhi $nj, $m1, $nj ! nhi=nj + addcc $lo1, $lo0, $lo1 ! np[j]*m1+ap[j]*bp[i]+tp[j] + addxc %g0, $hi1, $hi1 + stx $lo1, [$tp] ! tp[j-1] + add $tp, 8, $tp + brnz,pt $cnt, .Linner + sub $cnt, 8, $cnt +!.Linner + ldx [$tp+8], $tj ! tp[j] + addcc $alo, $hi0, $lo0 + addxc $aj, %g0, $hi0 ! ahi=aj + addcc $lo0, $tj, $lo0 ! ap[j]*bp[i]+tp[j] + addxc %g0, $hi0, $hi0 + + addcc $nlo, $hi1, $lo1 + addxc $nj, %g0, $hi1 ! nhi=nj + addcc $lo1, $lo0, $lo1 ! np[j]*m1+ap[j]*bp[i]+tp[j] + addxc %g0, $hi1, $hi1 + stx $lo1, [$tp] ! tp[j-1] + + subcc %g0, $ovf, %g0 ! move upmost overflow to CCR.xcc + addxccc $hi1, $hi0, $hi1 + addxc %g0, %g0, $ovf + stx $hi1, [$tp+8] + add $tp, 16, $tp + + brnz,pt $i, .Louter + sub $i, 8, $i + + sub $anp, $num, $anp ! rewind + sub $tp, $num, $tp + sub $anp, $num, $anp + ba .Lsub + subcc $num, 8, $cnt ! cnt=num-1 and clear CCR.xcc + +.align 16 +.Lsub: + ldx [$tp], $tj + add $tp, 8, $tp + ldx [$anp+8], $nj + add $anp, 16, $anp + subccc $tj, $nj, $t2 ! tp[j]-np[j] + srlx $tj, 32, $tj + srlx $nj, 32, $nj + subccc $tj, $nj, $t3 + add $rp, 8, $rp + st $t2, [$rp-4] ! reverse order + st $t3, [$rp-8] + brnz,pt $cnt, .Lsub + sub $cnt, 8, $cnt + + sub $anp, $num, $anp ! rewind + sub $tp, $num, $tp + sub $anp, $num, $anp + sub $rp, $num, $rp + + subc $ovf, %g0, $ovf ! handle upmost overflow bit + and $tp, $ovf, $ap + andn $rp, $ovf, $np + or $np, $ap, $ap ! ap=borrow?tp:rp + ba .Lcopy + sub $num, 8, $cnt + +.align 16 +.Lcopy: ! copy or in-place refresh + ld [$ap+0], $t2 + ld [$ap+4], $t3 + add $ap, 8, $ap + stx %g0, [$tp] ! zap + add $tp, 8, $tp + stx %g0, [$anp] ! zap + stx %g0, [$anp+8] + add $anp, 16, $anp + st $t3, [$rp+0] ! flip order + st $t2, [$rp+4] + add $rp, 8, $rp + brnz $cnt, .Lcopy + sub $cnt, 8, $cnt + + mov 1, %o0 + ret + restore +.type bn_mul_mont_vis3, #function +.size bn_mul_mont_vis3, .-bn_mul_mont_vis3 +.asciz "Montgomery Multiplication for SPARCv9 VIS3, CRYPTOGAMS by <appro\@openssl.org>" +.align 4 +___ + +# Purpose of these subroutines is to explicitly encode VIS instructions, +# so that one can compile the module without having to specify VIS +# extentions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a. +# Idea is to reserve for option to produce "universal" binary and let +# programmer detect if current CPU is VIS capable at run-time. +sub unvis3 { +my ($mnemonic,$rs1,$rs2,$rd)=@_; +my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 ); +my ($ref,$opf); +my %visopf = ( "addxc" => 0x011, + "addxccc" => 0x013, + "umulxhi" => 0x016 ); + + $ref = "$mnemonic\t$rs1,$rs2,$rd"; + + if ($opf=$visopf{$mnemonic}) { + foreach ($rs1,$rs2,$rd) { + return $ref if (!/%([goli])([0-9])/); + $_=$bias{$1}+$2; + } + + return sprintf ".word\t0x%08x !%s", + 0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2, + $ref; + } else { + return $ref; + } +} + +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval $1/ge; + + s/\b(umulxhi|addxc[c]{0,2})\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/ + &unvis3($1,$2,$3,$4) + /ge; + + print $_,"\n"; +} + +close STDOUT; diff --git a/crypto/bn/asm/x86_64-gcc.c b/crypto/bn/asm/x86_64-gcc.c index 9c5074b..d548886 100644 --- a/crypto/bn/asm/x86_64-gcc.c +++ b/crypto/bn/asm/x86_64-gcc.c @@ -55,7 +55,7 @@ * machine. */ -# ifdef _WIN64 +# if defined(_WIN64) || !defined(__LP64__) # define BN_ULONG unsigned long long # else # define BN_ULONG unsigned long @@ -63,7 +63,6 @@ # undef mul # undef mul_add -# undef sqr /*- * "m"(a), "+m"(r) is the way to favor DirectPath µ-code; @@ -99,8 +98,8 @@ : "cc"); \ (r)=carry, carry=high; \ } while (0) - -# define sqr(r0,r1,a) \ +# undef sqr +# define sqr(r0,r1,a) \ asm ("mulq %2" \ : "=a"(r0),"=d"(r1) \ : "a"(a) \ @@ -204,20 +203,22 @@ BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, int n) { - BN_ULONG ret = 0, i = 0; + BN_ULONG ret; + size_t i = 0; if (n <= 0) return 0; - asm volatile (" subq %2,%2 \n" + asm volatile (" subq %0,%0 \n" /* clear carry */ + " jmp 1f \n" ".p2align 4 \n" "1: movq (%4,%2,8),%0 \n" " adcq (%5,%2,8),%0 \n" " movq %0,(%3,%2,8) \n" - " leaq 1(%2),%2 \n" + " lea 1(%2),%2 \n" " loop 1b \n" - " sbbq %0,%0 \n":"=&a" (ret), "+c"(n), - "=&r"(i) + " sbbq %0,%0 \n":"=&r" (ret), "+c"(n), + "+r"(i) :"r"(rp), "r"(ap), "r"(bp) :"cc", "memory"); @@ -228,20 +229,22 @@ BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, BN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, int n) { - BN_ULONG ret = 0, i = 0; + BN_ULONG ret; + size_t i = 0; if (n <= 0) return 0; - asm volatile (" subq %2,%2 \n" + asm volatile (" subq %0,%0 \n" /* clear borrow */ + " jmp 1f \n" ".p2align 4 \n" "1: movq (%4,%2,8),%0 \n" " sbbq (%5,%2,8),%0 \n" " movq %0,(%3,%2,8) \n" - " leaq 1(%2),%2 \n" + " lea 1(%2),%2 \n" " loop 1b \n" - " sbbq %0,%0 \n":"=&a" (ret), "+c"(n), - "=&r"(i) + " sbbq %0,%0 \n":"=&r" (ret), "+c"(n), + "+r"(i) :"r"(rp), "r"(ap), "r"(bp) :"cc", "memory"); @@ -313,55 +316,58 @@ BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n) */ # if 0 /* original macros are kept for reference purposes */ -# define mul_add_c(a,b,c0,c1,c2) { \ - BN_ULONG ta=(a),tb=(b); \ - t1 = ta * tb; \ - t2 = BN_UMULT_HIGH(ta,tb); \ - c0 += t1; t2 += (c0<t1)?1:0; \ - c1 += t2; c2 += (c1<t2)?1:0; \ - } - -# define mul_add_c2(a,b,c0,c1,c2) { \ - BN_ULONG ta=(a),tb=(b),t0; \ - t1 = BN_UMULT_HIGH(ta,tb); \ - t0 = ta * tb; \ - c0 += t0; t2 = t1+((c0<t0)?1:0);\ - c1 += t2; c2 += (c1<t2)?1:0; \ - c0 += t0; t1 += (c0<t0)?1:0; \ - c1 += t1; c2 += (c1<t1)?1:0; \ - } +# define mul_add_c(a,b,c0,c1,c2) do { \ + BN_ULONG ta = (a), tb = (b); \ + BN_ULONG lo, hi; \ + BN_UMULT_LOHI(lo,hi,ta,tb); \ + c0 += lo; hi += (c0<lo)?1:0; \ + c1 += hi; c2 += (c1<hi)?1:0; \ + } while(0) + +# define mul_add_c2(a,b,c0,c1,c2) do { \ + BN_ULONG ta = (a), tb = (b); \ + BN_ULONG lo, hi, tt; \ + BN_UMULT_LOHI(lo,hi,ta,tb); \ + c0 += lo; tt = hi+((c0<lo)?1:0); \ + c1 += tt; c2 += (c1<tt)?1:0; \ + c0 += lo; hi += (c0<lo)?1:0; \ + c1 += hi; c2 += (c1<hi)?1:0; \ + } while(0) + +# define sqr_add_c(a,i,c0,c1,c2) do { \ + BN_ULONG ta = (a)[i]; \ + BN_ULONG lo, hi; \ + BN_UMULT_LOHI(lo,hi,ta,ta); \ + c0 += lo; hi += (c0<lo)?1:0; \ + c1 += hi; c2 += (c1<hi)?1:0; \ + } while(0) # else -# define mul_add_c(a,b,c0,c1,c2) do { \ +# define mul_add_c(a,b,c0,c1,c2) do { \ + BN_ULONG t1,t2; \ asm ("mulq %3" \ : "=a"(t1),"=d"(t2) \ : "a"(a),"m"(b) \ : "cc"); \ - asm ("addq %2,%0; adcq %3,%1" \ - : "+r"(c0),"+d"(t2) \ - : "a"(t1),"g"(0) \ - : "cc"); \ - asm ("addq %2,%0; adcq %3,%1" \ - : "+r"(c1),"+r"(c2) \ - : "d"(t2),"g"(0) \ - : "cc"); \ + asm ("addq %3,%0; adcq %4,%1; adcq %5,%2" \ + : "+r"(c0),"+r"(c1),"+r"(c2) \ + : "r"(t1),"r"(t2),"g"(0) \ + : "cc"); \ } while (0) -# define sqr_add_c(a,i,c0,c1,c2) do { \ +# define sqr_add_c(a,i,c0,c1,c2) do { \ + BN_ULONG t1,t2; \ asm ("mulq %2" \ : "=a"(t1),"=d"(t2) \ : "a"(a[i]) \ : "cc"); \ - asm ("addq %2,%0; adcq %3,%1" \ - : "+r"(c0),"+d"(t2) \ - : "a"(t1),"g"(0) \ - : "cc"); \ - asm ("addq %2,%0; adcq %3,%1" \ - : "+r"(c1),"+r"(c2) \ - : "d"(t2),"g"(0) \ - : "cc"); \ + asm ("addq %3,%0; adcq %4,%1; adcq %5,%2" \ + : "+r"(c0),"+r"(c1),"+r"(c2) \ + : "r"(t1),"r"(t2),"g"(0) \ + : "cc"); \ } while (0) -# define mul_add_c2(a,b,c0,c1,c2) do { \ +# define mul_add_c2(a,b,c0,c1,c2) do { \ + BN_ULONG t1,t2; \ asm ("mulq %3" \ : "=a"(t1),"=d"(t2) \ : "a"(a),"m"(b) \ @@ -382,7 +388,6 @@ BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n) void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) { - BN_ULONG t1, t2; BN_ULONG c1, c2, c3; c1 = 0; @@ -486,7 +491,6 @@ void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) { - BN_ULONG t1, t2; BN_ULONG c1, c2, c3; c1 = 0; @@ -526,7 +530,6 @@ void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a) { - BN_ULONG t1, t2; BN_ULONG c1, c2, c3; c1 = 0; @@ -602,7 +605,6 @@ void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a) void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a) { - BN_ULONG t1, t2; BN_ULONG c1, c2, c3; c1 = 0; diff --git a/crypto/bn/asm/x86_64-mont.pl b/crypto/bn/asm/x86_64-mont.pl index 17fb94c..2989b58 100755 --- a/crypto/bn/asm/x86_64-mont.pl +++ b/crypto/bn/asm/x86_64-mont.pl @@ -29,6 +29,16 @@ # to *initial* version of this module from 2005 is ~0%/30%/40%/45% # for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively. +# June 2013. +# +# Optimize reduction in squaring procedure and improve 1024+-bit RSA +# sign performance by 10-16% on Intel Sandy Bridge and later +# (virtually same on non-Intel processors). + +# August 2013. +# +# Add MULX/ADOX/ADCX code path. + $flavour = shift; $output = shift; if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } @@ -43,6 +53,21 @@ die "can't locate x86_64-xlate.pl"; open OUT,"| \"$^X\" $xlate $flavour $output"; *STDOUT=*OUT; +if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` + =~ /GNU assembler version ([2-9]\.[0-9]+)/) { + $addx = ($1>=2.23); +} + +if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && + `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { + $addx = ($1>=2.10); +} + +if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && + `ml64 2>&1` =~ /Version ([0-9]+)\./) { + $addx = ($1>=12); +} + # int bn_mul_mont( $rp="%rdi"; # BN_ULONG *rp, $ap="%rsi"; # const BN_ULONG *ap, @@ -61,6 +86,8 @@ $m1="%rbp"; $code=<<___; .text +.extern OPENSSL_ia32cap_P + .globl bn_mul_mont .type bn_mul_mont,\@function,6 .align 16 @@ -69,9 +96,16 @@ bn_mul_mont: jnz .Lmul_enter cmp \$8,${num}d jb .Lmul_enter +___ +$code.=<<___ if ($addx); + mov OPENSSL_ia32cap_P+8(%rip),%r11d +___ +$code.=<<___; cmp $ap,$bp jne .Lmul4x_enter - jmp .Lsqr4x_enter + test \$7,${num}d + jz .Lsqr8x_enter + jmp .Lmul4x_enter .align 16 .Lmul_enter: @@ -227,7 +261,7 @@ $code.=<<___; lea 1($i),$i # i++ cmp $num,$i - jl .Louter + jb .Louter xor $i,$i # i=0 and clear CF! mov (%rsp),%rax # tp[0] @@ -280,6 +314,13 @@ $code.=<<___; .align 16 bn_mul4x_mont: .Lmul4x_enter: +___ +$code.=<<___ if ($addx); + and \$0x80100,%r11d + cmp \$0x80100,%r11d + je .Lmulx4x_enter +___ +$code.=<<___; push %rbx push %rbp push %r12 @@ -401,7 +442,7 @@ $code.=<<___; mov $N[1],-32(%rsp,$j,8) # tp[j-1] mov %rdx,$N[0] cmp $num,$j - jl .L1st4x + jb .L1st4x mulq $m0 # ap[j]*bp[0] add %rax,$A[0] @@ -549,7 +590,7 @@ $code.=<<___; mov $N[1],-32(%rsp,$j,8) # tp[j-1] mov %rdx,$N[0] cmp $num,$j - jl .Linner4x + jb .Linner4x mulq $m0 # ap[j]*bp[i] add %rax,$A[0] @@ -595,7 +636,7 @@ $code.=<<___; mov $N[1],(%rsp,$j,8) # store upmost overflow bit cmp $num,$i - jl .Louter4x + jb .Louter4x ___ { my @ri=("%rax","%rdx",$m0,$m1); @@ -688,25 +729,30 @@ ___ }}} {{{ ###################################################################### -# void bn_sqr4x_mont( +# void bn_sqr8x_mont( my $rptr="%rdi"; # const BN_ULONG *rptr, my $aptr="%rsi"; # const BN_ULONG *aptr, my $bptr="%rdx"; # not used my $nptr="%rcx"; # const BN_ULONG *nptr, my $n0 ="%r8"; # const BN_ULONG *n0); -my $num ="%r9"; # int num, has to be divisible by 4 and - # not less than 8 +my $num ="%r9"; # int num, has to be divisible by 8 my ($i,$j,$tptr)=("%rbp","%rcx",$rptr); my @A0=("%r10","%r11"); my @A1=("%r12","%r13"); my ($a0,$a1,$ai)=("%r14","%r15","%rbx"); +$code.=<<___ if ($addx); +.extern bn_sqrx8x_internal # see x86_64-mont5 module +___ $code.=<<___; -.type bn_sqr4x_mont,\@function,6 -.align 16 -bn_sqr4x_mont: -.Lsqr4x_enter: +.extern bn_sqr8x_internal # see x86_64-mont5 module + +.type bn_sqr8x_mont,\@function,6 +.align 32 +bn_sqr8x_mont: +.Lsqr8x_enter: + mov %rsp,%rax push %rbx push %rbp push %r12 @@ -714,787 +760,445 @@ bn_sqr4x_mont: push %r14 push %r15 + mov ${num}d,%r10d shl \$3,${num}d # convert $num to bytes - xor %r10,%r10 - mov %rsp,%r11 # put aside %rsp - sub $num,%r10 # -$num - mov ($n0),$n0 # *n0 - lea -72(%rsp,%r10,2),%rsp # alloca(frame+2*$num) - and \$-1024,%rsp # minimize TLB usage - ############################################################## - # Stack layout - # - # +0 saved $num, used in reduction section - # +8 &t[2*$num], used in reduction section - # +32 saved $rptr - # +40 saved $nptr - # +48 saved *n0 - # +56 saved %rsp - # +64 t[2*$num] - # - mov $rptr,32(%rsp) # save $rptr - mov $nptr,40(%rsp) - mov $n0, 48(%rsp) - mov %r11, 56(%rsp) # save original %rsp -.Lsqr4x_body: + shl \$3+2,%r10 # 4*$num + neg $num + ############################################################## - # Squaring part: - # - # a) multiply-n-add everything but a[i]*a[i]; - # b) shift result of a) by 1 to the left and accumulate - # a[i]*a[i] products; + # ensure that stack frame doesn't alias with $aptr modulo + # 4096. this is done to allow memory disambiguation logic + # do its job. # - lea 32(%r10),$i # $i=-($num-32) - lea ($aptr,$num),$aptr # end of a[] buffer, ($aptr,$i)=&ap[2] - - mov $num,$j # $j=$num - - # comments apply to $num==8 case - mov -32($aptr,$i),$a0 # a[0] - lea 64(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num] - mov -24($aptr,$i),%rax # a[1] - lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"] - mov -16($aptr,$i),$ai # a[2] - mov %rax,$a1 - - mul $a0 # a[1]*a[0] - mov %rax,$A0[0] # a[1]*a[0] - mov $ai,%rax # a[2] - mov %rdx,$A0[1] - mov $A0[0],-24($tptr,$i) # t[1] - - xor $A0[0],$A0[0] - mul $a0 # a[2]*a[0] - add %rax,$A0[1] - mov $ai,%rax - adc %rdx,$A0[0] - mov $A0[1],-16($tptr,$i) # t[2] - - lea -16($i),$j # j=-16 - - - mov 8($aptr,$j),$ai # a[3] - mul $a1 # a[2]*a[1] - mov %rax,$A1[0] # a[2]*a[1]+t[3] - mov $ai,%rax - mov %rdx,$A1[1] - - xor $A0[1],$A0[1] - add $A1[0],$A0[0] - lea 16($j),$j - adc \$0,$A0[1] - mul $a0 # a[3]*a[0] - add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3] - mov $ai,%rax - adc %rdx,$A0[1] - mov $A0[0],-8($tptr,$j) # t[3] - jmp .Lsqr4x_1st + lea -64(%rsp,$num,4),%r11 + mov ($n0),$n0 # *n0 + sub $aptr,%r11 + and \$4095,%r11 + cmp %r11,%r10 + jb .Lsqr8x_sp_alt + sub %r11,%rsp # align with $aptr + lea -64(%rsp,$num,4),%rsp # alloca(frame+4*$num) + jmp .Lsqr8x_sp_done + +.align 32 +.Lsqr8x_sp_alt: + lea 4096-64(,$num,4),%r10 # 4096-frame-4*$num + lea -64(%rsp,$num,4),%rsp # alloca(frame+4*$num) + sub %r10,%r11 + mov \$0,%r10 + cmovc %r10,%r11 + sub %r11,%rsp +.Lsqr8x_sp_done: + and \$-64,%rsp + mov $num,%r10 + neg $num + + lea 64(%rsp,$num,2),%r11 # copy of modulus + mov $n0, 32(%rsp) + mov %rax, 40(%rsp) # save original %rsp +.Lsqr8x_body: + + mov $num,$i + movq %r11, %xmm2 # save pointer to modulus copy + shr \$3+2,$i + mov OPENSSL_ia32cap_P+8(%rip),%eax + jmp .Lsqr8x_copy_n + +.align 32 +.Lsqr8x_copy_n: + movq 8*0($nptr),%xmm0 + movq 8*1($nptr),%xmm1 + movq 8*2($nptr),%xmm3 + movq 8*3($nptr),%xmm4 + lea 8*4($nptr),$nptr + movdqa %xmm0,16*0(%r11) + movdqa %xmm1,16*1(%r11) + movdqa %xmm3,16*2(%r11) + movdqa %xmm4,16*3(%r11) + lea 16*4(%r11),%r11 + dec $i + jnz .Lsqr8x_copy_n -.align 16 -.Lsqr4x_1st: - mov ($aptr,$j),$ai # a[4] - xor $A1[0],$A1[0] - mul $a1 # a[3]*a[1] - add %rax,$A1[1] # a[3]*a[1]+t[4] - mov $ai,%rax - adc %rdx,$A1[0] - - xor $A0[0],$A0[0] - add $A1[1],$A0[1] - adc \$0,$A0[0] - mul $a0 # a[4]*a[0] - add %rax,$A0[1] # a[4]*a[0]+a[3]*a[1]+t[4] - mov $ai,%rax # a[3] - adc %rdx,$A0[0] - mov $A0[1],($tptr,$j) # t[4] - - - mov 8($aptr,$j),$ai # a[5] - xor $A1[1],$A1[1] - mul $a1 # a[4]*a[3] - add %rax,$A1[0] # a[4]*a[3]+t[5] - mov $ai,%rax - adc %rdx,$A1[1] - - xor $A0[1],$A0[1] - add $A1[0],$A0[0] - adc \$0,$A0[1] - mul $a0 # a[5]*a[2] - add %rax,$A0[0] # a[5]*a[2]+a[4]*a[3]+t[5] - mov $ai,%rax - adc %rdx,$A0[1] - mov $A0[0],8($tptr,$j) # t[5] - - mov 16($aptr,$j),$ai # a[6] - xor $A1[0],$A1[0] - mul $a1 # a[5]*a[3] - add %rax,$A1[1] # a[5]*a[3]+t[6] - mov $ai,%rax - adc %rdx,$A1[0] - - xor $A0[0],$A0[0] - add $A1[1],$A0[1] - adc \$0,$A0[0] - mul $a0 # a[6]*a[2] - add %rax,$A0[1] # a[6]*a[2]+a[5]*a[3]+t[6] - mov $ai,%rax # a[3] - adc %rdx,$A0[0] - mov $A0[1],16($tptr,$j) # t[6] - - - mov 24($aptr,$j),$ai # a[7] - xor $A1[1],$A1[1] - mul $a1 # a[6]*a[5] - add %rax,$A1[0] # a[6]*a[5]+t[7] - mov $ai,%rax - adc %rdx,$A1[1] - - xor $A0[1],$A0[1] - add $A1[0],$A0[0] - lea 32($j),$j - adc \$0,$A0[1] - mul $a0 # a[7]*a[4] - add %rax,$A0[0] # a[7]*a[4]+a[6]*a[5]+t[6] - mov $ai,%rax - adc %rdx,$A0[1] - mov $A0[0],-8($tptr,$j) # t[7] - - cmp \$0,$j - jne .Lsqr4x_1st - - xor $A1[0],$A1[0] - add $A0[1],$A1[1] - adc \$0,$A1[0] - mul $a1 # a[7]*a[5] - add %rax,$A1[1] - adc %rdx,$A1[0] - - mov $A1[1],($tptr) # t[8] - lea 16($i),$i - mov $A1[0],8($tptr) # t[9] - jmp .Lsqr4x_outer + pxor %xmm0,%xmm0 + movq $rptr,%xmm1 # save $rptr + movq %r10, %xmm3 # -$num +___ +$code.=<<___ if ($addx); + and \$0x80100,%eax + cmp \$0x80100,%eax + jne .Lsqr8x_nox -.align 16 -.Lsqr4x_outer: # comments apply to $num==6 case - mov -32($aptr,$i),$a0 # a[0] - lea 64(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num] - mov -24($aptr,$i),%rax # a[1] - lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"] - mov -16($aptr,$i),$ai # a[2] - mov %rax,$a1 - - mov -24($tptr,$i),$A0[0] # t[1] - xor $A0[1],$A0[1] - mul $a0 # a[1]*a[0] - add %rax,$A0[0] # a[1]*a[0]+t[1] - mov $ai,%rax # a[2] - adc %rdx,$A0[1] - mov $A0[0],-24($tptr,$i) # t[1] - - xor $A0[0],$A0[0] - add -16($tptr,$i),$A0[1] # a[2]*a[0]+t[2] - adc \$0,$A0[0] - mul $a0 # a[2]*a[0] - add %rax,$A0[1] - mov $ai,%rax - adc %rdx,$A0[0] - mov $A0[1],-16($tptr,$i) # t[2] - - lea -16($i),$j # j=-16 - xor $A1[0],$A1[0] - - - mov 8($aptr,$j),$ai # a[3] - xor $A1[1],$A1[1] - add 8($tptr,$j),$A1[0] - adc \$0,$A1[1] - mul $a1 # a[2]*a[1] - add %rax,$A1[0] # a[2]*a[1]+t[3] - mov $ai,%rax - adc %rdx,$A1[1] - - xor $A0[1],$A0[1] - add $A1[0],$A0[0] - adc \$0,$A0[1] - mul $a0 # a[3]*a[0] - add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3] - mov $ai,%rax - adc %rdx,$A0[1] - mov $A0[0],8($tptr,$j) # t[3] - - lea 16($j),$j - jmp .Lsqr4x_inner + call bn_sqrx8x_internal # see x86_64-mont5 module -.align 16 -.Lsqr4x_inner: - mov ($aptr,$j),$ai # a[4] - xor $A1[0],$A1[0] - add ($tptr,$j),$A1[1] - adc \$0,$A1[0] - mul $a1 # a[3]*a[1] - add %rax,$A1[1] # a[3]*a[1]+t[4] - mov $ai,%rax - adc %rdx,$A1[0] - - xor $A0[0],$A0[0] - add $A1[1],$A0[1] - adc \$0,$A0[0] - mul $a0 # a[4]*a[0] - add %rax,$A0[1] # a[4]*a[0]+a[3]*a[1]+t[4] - mov $ai,%rax # a[3] - adc %rdx,$A0[0] - mov $A0[1],($tptr,$j) # t[4] - - mov 8($aptr,$j),$ai # a[5] - xor $A1[1],$A1[1] - add 8($tptr,$j),$A1[0] - adc \$0,$A1[1] - mul $a1 # a[4]*a[3] - add %rax,$A1[0] # a[4]*a[3]+t[5] - mov $ai,%rax - adc %rdx,$A1[1] - - xor $A0[1],$A0[1] - add $A1[0],$A0[0] - lea 16($j),$j # j++ - adc \$0,$A0[1] - mul $a0 # a[5]*a[2] - add %rax,$A0[0] # a[5]*a[2]+a[4]*a[3]+t[5] - mov $ai,%rax - adc %rdx,$A0[1] - mov $A0[0],-8($tptr,$j) # t[5], "preloaded t[1]" below - - cmp \$0,$j - jne .Lsqr4x_inner - - xor $A1[0],$A1[0] - add $A0[1],$A1[1] - adc \$0,$A1[0] - mul $a1 # a[5]*a[3] - add %rax,$A1[1] - adc %rdx,$A1[0] - - mov $A1[1],($tptr) # t[6], "preloaded t[2]" below - mov $A1[0],8($tptr) # t[7], "preloaded t[3]" below - - add \$16,$i - jnz .Lsqr4x_outer - - # comments apply to $num==4 case - mov -32($aptr),$a0 # a[0] - lea 64(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num] - mov -24($aptr),%rax # a[1] - lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"] - mov -16($aptr),$ai # a[2] - mov %rax,$a1 - - xor $A0[1],$A0[1] - mul $a0 # a[1]*a[0] - add %rax,$A0[0] # a[1]*a[0]+t[1], preloaded t[1] - mov $ai,%rax # a[2] - adc %rdx,$A0[1] - mov $A0[0],-24($tptr) # t[1] - - xor $A0[0],$A0[0] - add $A1[1],$A0[1] # a[2]*a[0]+t[2], preloaded t[2] - adc \$0,$A0[0] - mul $a0 # a[2]*a[0] - add %rax,$A0[1] - mov $ai,%rax - adc %rdx,$A0[0] - mov $A0[1],-16($tptr) # t[2] - - mov -8($aptr),$ai # a[3] - mul $a1 # a[2]*a[1] - add %rax,$A1[0] # a[2]*a[1]+t[3], preloaded t[3] - mov $ai,%rax - adc \$0,%rdx - - xor $A0[1],$A0[1] - add $A1[0],$A0[0] - mov %rdx,$A1[1] - adc \$0,$A0[1] - mul $a0 # a[3]*a[0] - add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3] - mov $ai,%rax - adc %rdx,$A0[1] - mov $A0[0],-8($tptr) # t[3] - - xor $A1[0],$A1[0] - add $A0[1],$A1[1] - adc \$0,$A1[0] - mul $a1 # a[3]*a[1] - add %rax,$A1[1] - mov -16($aptr),%rax # a[2] - adc %rdx,$A1[0] - - mov $A1[1],($tptr) # t[4] - mov $A1[0],8($tptr) # t[5] - - mul $ai # a[2]*a[3] + pxor %xmm0,%xmm0 + lea 48(%rsp),%rax + lea 64(%rsp,$num,2),%rdx + shr \$3+2,$num + mov 40(%rsp),%rsi # restore %rsp + jmp .Lsqr8x_zero + +.align 32 +.Lsqr8x_nox: ___ -{ -my ($shift,$carry)=($a0,$a1); -my @S=(@A1,$ai,$n0); $code.=<<___; - add \$16,$i - xor $shift,$shift - sub $num,$i # $i=16-$num - xor $carry,$carry - - add $A1[0],%rax # t[5] - adc \$0,%rdx - mov %rax,8($tptr) # t[5] - mov %rdx,16($tptr) # t[6] - mov $carry,24($tptr) # t[7] - - mov -16($aptr,$i),%rax # a[0] - lea 64(%rsp,$num,2),$tptr - xor $A0[0],$A0[0] # t[0] - mov -24($tptr,$i,2),$A0[1] # t[1] - - lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift - shr \$63,$A0[0] - lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 | - shr \$63,$A0[1] - or $A0[0],$S[1] # | t[2*i]>>63 - mov -16($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch - mov $A0[1],$shift # shift=t[2*i+1]>>63 - mul %rax # a[i]*a[i] - neg $carry # mov $carry,cf - mov -8($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch - adc %rax,$S[0] - mov -8($aptr,$i),%rax # a[i+1] # prefetch - mov $S[0],-32($tptr,$i,2) - adc %rdx,$S[1] - - lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift - mov $S[1],-24($tptr,$i,2) - sbb $carry,$carry # mov cf,$carry - shr \$63,$A0[0] - lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 | - shr \$63,$A0[1] - or $A0[0],$S[3] # | t[2*i]>>63 - mov 0($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch - mov $A0[1],$shift # shift=t[2*i+1]>>63 - mul %rax # a[i]*a[i] - neg $carry # mov $carry,cf - mov 8($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch - adc %rax,$S[2] - mov 0($aptr,$i),%rax # a[i+1] # prefetch - mov $S[2],-16($tptr,$i,2) - adc %rdx,$S[3] - lea 16($i),$i - mov $S[3],-40($tptr,$i,2) - sbb $carry,$carry # mov cf,$carry - jmp .Lsqr4x_shift_n_add + call bn_sqr8x_internal # see x86_64-mont5 module -.align 16 -.Lsqr4x_shift_n_add: - lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift - shr \$63,$A0[0] - lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 | - shr \$63,$A0[1] - or $A0[0],$S[1] # | t[2*i]>>63 - mov -16($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch - mov $A0[1],$shift # shift=t[2*i+1]>>63 - mul %rax # a[i]*a[i] - neg $carry # mov $carry,cf - mov -8($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch - adc %rax,$S[0] - mov -8($aptr,$i),%rax # a[i+1] # prefetch - mov $S[0],-32($tptr,$i,2) - adc %rdx,$S[1] - - lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift - mov $S[1],-24($tptr,$i,2) - sbb $carry,$carry # mov cf,$carry - shr \$63,$A0[0] - lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 | - shr \$63,$A0[1] - or $A0[0],$S[3] # | t[2*i]>>63 - mov 0($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch - mov $A0[1],$shift # shift=t[2*i+1]>>63 - mul %rax # a[i]*a[i] - neg $carry # mov $carry,cf - mov 8($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch - adc %rax,$S[2] - mov 0($aptr,$i),%rax # a[i+1] # prefetch - mov $S[2],-16($tptr,$i,2) - adc %rdx,$S[3] - - lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift - mov $S[3],-8($tptr,$i,2) - sbb $carry,$carry # mov cf,$carry - shr \$63,$A0[0] - lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 | - shr \$63,$A0[1] - or $A0[0],$S[1] # | t[2*i]>>63 - mov 16($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch - mov $A0[1],$shift # shift=t[2*i+1]>>63 - mul %rax # a[i]*a[i] - neg $carry # mov $carry,cf - mov 24($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch - adc %rax,$S[0] - mov 8($aptr,$i),%rax # a[i+1] # prefetch - mov $S[0],0($tptr,$i,2) - adc %rdx,$S[1] - - lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift - mov $S[1],8($tptr,$i,2) - sbb $carry,$carry # mov cf,$carry - shr \$63,$A0[0] - lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 | - shr \$63,$A0[1] - or $A0[0],$S[3] # | t[2*i]>>63 - mov 32($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch - mov $A0[1],$shift # shift=t[2*i+1]>>63 - mul %rax # a[i]*a[i] - neg $carry # mov $carry,cf - mov 40($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch - adc %rax,$S[2] - mov 16($aptr,$i),%rax # a[i+1] # prefetch - mov $S[2],16($tptr,$i,2) - adc %rdx,$S[3] - mov $S[3],24($tptr,$i,2) - sbb $carry,$carry # mov cf,$carry - add \$32,$i - jnz .Lsqr4x_shift_n_add - - lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift - shr \$63,$A0[0] - lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 | - shr \$63,$A0[1] - or $A0[0],$S[1] # | t[2*i]>>63 - mov -16($tptr),$A0[0] # t[2*i+2] # prefetch - mov $A0[1],$shift # shift=t[2*i+1]>>63 - mul %rax # a[i]*a[i] - neg $carry # mov $carry,cf - mov -8($tptr),$A0[1] # t[2*i+2+1] # prefetch - adc %rax,$S[0] - mov -8($aptr),%rax # a[i+1] # prefetch - mov $S[0],-32($tptr) - adc %rdx,$S[1] - - lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1|shift - mov $S[1],-24($tptr) - sbb $carry,$carry # mov cf,$carry - shr \$63,$A0[0] - lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 | - shr \$63,$A0[1] - or $A0[0],$S[3] # | t[2*i]>>63 - mul %rax # a[i]*a[i] - neg $carry # mov $carry,cf - adc %rax,$S[2] - adc %rdx,$S[3] - mov $S[2],-16($tptr) - mov $S[3],-8($tptr) -___ -} -############################################################## -# Montgomery reduction part, "word-by-word" algorithm. -# -{ -my ($topbit,$nptr)=("%rbp",$aptr); -my ($m0,$m1)=($a0,$a1); -my @Ni=("%rbx","%r9"); -$code.=<<___; - mov 40(%rsp),$nptr # restore $nptr - mov 48(%rsp),$n0 # restore *n0 - xor $j,$j - mov $num,0(%rsp) # save $num - sub $num,$j # $j=-$num - mov 64(%rsp),$A0[0] # t[0] # modsched # - mov $n0,$m0 # # modsched # - lea 64(%rsp,$num,2),%rax # end of t[] buffer - lea 64(%rsp,$num),$tptr # end of t[] window - mov %rax,8(%rsp) # save end of t[] buffer - lea ($nptr,$num),$nptr # end of n[] buffer - xor $topbit,$topbit # $topbit=0 - - mov 0($nptr,$j),%rax # n[0] # modsched # - mov 8($nptr,$j),$Ni[1] # n[1] # modsched # - imulq $A0[0],$m0 # m0=t[0]*n0 # modsched # - mov %rax,$Ni[0] # # modsched # - jmp .Lsqr4x_mont_outer + pxor %xmm0,%xmm0 + lea 48(%rsp),%rax + lea 64(%rsp,$num,2),%rdx + shr \$3+2,$num + mov 40(%rsp),%rsi # restore %rsp + jmp .Lsqr8x_zero + +.align 32 +.Lsqr8x_zero: + movdqa %xmm0,16*0(%rax) # wipe t + movdqa %xmm0,16*1(%rax) + movdqa %xmm0,16*2(%rax) + movdqa %xmm0,16*3(%rax) + lea 16*4(%rax),%rax + movdqa %xmm0,16*0(%rdx) # wipe n + movdqa %xmm0,16*1(%rdx) + movdqa %xmm0,16*2(%rdx) + movdqa %xmm0,16*3(%rdx) + lea 16*4(%rdx),%rdx + dec $num + jnz .Lsqr8x_zero -.align 16 -.Lsqr4x_mont_outer: - xor $A0[1],$A0[1] - mul $m0 # n[0]*m0 - add %rax,$A0[0] # n[0]*m0+t[0] - mov $Ni[1],%rax - adc %rdx,$A0[1] - mov $n0,$m1 + mov \$1,%rax + mov -48(%rsi),%r15 + mov -40(%rsi),%r14 + mov -32(%rsi),%r13 + mov -24(%rsi),%r12 + mov -16(%rsi),%rbp + mov -8(%rsi),%rbx + lea (%rsi),%rsp +.Lsqr8x_epilogue: + ret +.size bn_sqr8x_mont,.-bn_sqr8x_mont +___ +}}} + +if ($addx) {{{ +my $bp="%rdx"; # original value - xor $A0[0],$A0[0] - add 8($tptr,$j),$A0[1] - adc \$0,$A0[0] - mul $m0 # n[1]*m0 - add %rax,$A0[1] # n[1]*m0+t[1] - mov $Ni[0],%rax - adc %rdx,$A0[0] - - imulq $A0[1],$m1 - - mov 16($nptr,$j),$Ni[0] # n[2] - xor $A1[1],$A1[1] - add $A0[1],$A1[0] - adc \$0,$A1[1] - mul $m1 # n[0]*m1 - add %rax,$A1[0] # n[0]*m1+"t[1]" - mov $Ni[0],%rax - adc %rdx,$A1[1] - mov $A1[0],8($tptr,$j) # "t[1]" - - xor $A0[1],$A0[1] - add 16($tptr,$j),$A0[0] - adc \$0,$A0[1] - mul $m0 # n[2]*m0 - add %rax,$A0[0] # n[2]*m0+t[2] - mov $Ni[1],%rax - adc %rdx,$A0[1] - - mov 24($nptr,$j),$Ni[1] # n[3] - xor $A1[0],$A1[0] - add $A0[0],$A1[1] - adc \$0,$A1[0] - mul $m1 # n[1]*m1 - add %rax,$A1[1] # n[1]*m1+"t[2]" - mov $Ni[1],%rax - adc %rdx,$A1[0] - mov $A1[1],16($tptr,$j) # "t[2]" - - xor $A0[0],$A0[0] - add 24($tptr,$j),$A0[1] - lea 32($j),$j - adc \$0,$A0[0] - mul $m0 # n[3]*m0 - add %rax,$A0[1] # n[3]*m0+t[3] - mov $Ni[0],%rax - adc %rdx,$A0[0] - jmp .Lsqr4x_mont_inner +$code.=<<___; +.type bn_mulx4x_mont,\@function,6 +.align 32 +bn_mulx4x_mont: +.Lmulx4x_enter: + mov %rsp,%rax + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 -.align 16 -.Lsqr4x_mont_inner: - mov ($nptr,$j),$Ni[0] # n[4] - xor $A1[1],$A1[1] - add $A0[1],$A1[0] - adc \$0,$A1[1] - mul $m1 # n[2]*m1 - add %rax,$A1[0] # n[2]*m1+"t[3]" - mov $Ni[0],%rax - adc %rdx,$A1[1] - mov $A1[0],-8($tptr,$j) # "t[3]" - - xor $A0[1],$A0[1] - add ($tptr,$j),$A0[0] - adc \$0,$A0[1] - mul $m0 # n[4]*m0 - add %rax,$A0[0] # n[4]*m0+t[4] - mov $Ni[1],%rax - adc %rdx,$A0[1] - - mov 8($nptr,$j),$Ni[1] # n[5] - xor $A1[0],$A1[0] - add $A0[0],$A1[1] - adc \$0,$A1[0] - mul $m1 # n[3]*m1 - add %rax,$A1[1] # n[3]*m1+"t[4]" - mov $Ni[1],%rax - adc %rdx,$A1[0] - mov $A1[1],($tptr,$j) # "t[4]" - - xor $A0[0],$A0[0] - add 8($tptr,$j),$A0[1] - adc \$0,$A0[0] - mul $m0 # n[5]*m0 - add %rax,$A0[1] # n[5]*m0+t[5] - mov $Ni[0],%rax - adc %rdx,$A0[0] - - - mov 16($nptr,$j),$Ni[0] # n[6] - xor $A1[1],$A1[1] - add $A0[1],$A1[0] - adc \$0,$A1[1] - mul $m1 # n[4]*m1 - add %rax,$A1[0] # n[4]*m1+"t[5]" - mov $Ni[0],%rax - adc %rdx,$A1[1] - mov $A1[0],8($tptr,$j) # "t[5]" - - xor $A0[1],$A0[1] - add 16($tptr,$j),$A0[0] - adc \$0,$A0[1] - mul $m0 # n[6]*m0 - add %rax,$A0[0] # n[6]*m0+t[6] - mov $Ni[1],%rax - adc %rdx,$A0[1] - - mov 24($nptr,$j),$Ni[1] # n[7] - xor $A1[0],$A1[0] - add $A0[0],$A1[1] - adc \$0,$A1[0] - mul $m1 # n[5]*m1 - add %rax,$A1[1] # n[5]*m1+"t[6]" - mov $Ni[1],%rax - adc %rdx,$A1[0] - mov $A1[1],16($tptr,$j) # "t[6]" - - xor $A0[0],$A0[0] - add 24($tptr,$j),$A0[1] - lea 32($j),$j - adc \$0,$A0[0] - mul $m0 # n[7]*m0 - add %rax,$A0[1] # n[7]*m0+t[7] - mov $Ni[0],%rax - adc %rdx,$A0[0] - cmp \$0,$j - jne .Lsqr4x_mont_inner - - sub 0(%rsp),$j # $j=-$num # modsched # - mov $n0,$m0 # # modsched # - - xor $A1[1],$A1[1] - add $A0[1],$A1[0] - adc \$0,$A1[1] - mul $m1 # n[6]*m1 - add %rax,$A1[0] # n[6]*m1+"t[7]" - mov $Ni[1],%rax - adc %rdx,$A1[1] - mov $A1[0],-8($tptr) # "t[7]" - - xor $A0[1],$A0[1] - add ($tptr),$A0[0] # +t[8] - adc \$0,$A0[1] - mov 0($nptr,$j),$Ni[0] # n[0] # modsched # - add $topbit,$A0[0] - adc \$0,$A0[1] - - imulq 16($tptr,$j),$m0 # m0=t[0]*n0 # modsched # - xor $A1[0],$A1[0] - mov 8($nptr,$j),$Ni[1] # n[1] # modsched # - add $A0[0],$A1[1] - mov 16($tptr,$j),$A0[0] # t[0] # modsched # - adc \$0,$A1[0] - mul $m1 # n[7]*m1 - add %rax,$A1[1] # n[7]*m1+"t[8]" - mov $Ni[0],%rax # # modsched # - adc %rdx,$A1[0] - mov $A1[1],($tptr) # "t[8]" - - xor $topbit,$topbit - add 8($tptr),$A1[0] # +t[9] - adc $topbit,$topbit - add $A0[1],$A1[0] - lea 16($tptr),$tptr # "t[$num]>>128" - adc \$0,$topbit - mov $A1[0],-8($tptr) # "t[9]" - cmp 8(%rsp),$tptr # are we done? - jb .Lsqr4x_mont_outer - - mov 0(%rsp),$num # restore $num - mov $topbit,($tptr) # save $topbit + shl \$3,${num}d # convert $num to bytes + .byte 0x67 + xor %r10,%r10 + sub $num,%r10 # -$num + mov ($n0),$n0 # *n0 + lea -72(%rsp,%r10),%rsp # alloca(frame+$num+8) + lea ($bp,$num),%r10 + and \$-128,%rsp + ############################################################## + # Stack layout + # +0 num + # +8 off-loaded &b[i] + # +16 end of b[num] + # +24 saved n0 + # +32 saved rp + # +40 saved %rsp + # +48 inner counter + # +56 + # +64 tmp[num+1] + # + mov $num,0(%rsp) # save $num + shr \$5,$num + mov %r10,16(%rsp) # end of b[num] + sub \$1,$num + mov $n0, 24(%rsp) # save *n0 + mov $rp, 32(%rsp) # save $rp + mov %rax,40(%rsp) # save original %rsp + mov $num,48(%rsp) # inner counter + jmp .Lmulx4x_body + +.align 32 +.Lmulx4x_body: ___ -} -############################################################## -# Post-condition, 4x unrolled copy from bn_mul_mont -# -{ -my ($tptr,$nptr)=("%rbx",$aptr); -my @ri=("%rax","%rdx","%r10","%r11"); +my ($aptr, $bptr, $nptr, $tptr, $mi, $bi, $zero, $num)= + ("%rsi","%rdi","%rcx","%rbx","%r8","%r9","%rbp","%rax"); +my $rptr=$bptr; $code.=<<___; - mov 64(%rsp,$num),@ri[0] # tp[0] - lea 64(%rsp,$num),$tptr # upper half of t[2*$num] holds result - mov 40(%rsp),$nptr # restore $nptr - shr \$5,$num # num/4 - mov 8($tptr),@ri[1] # t[1] - xor $i,$i # i=0 and clear CF! - - mov 32(%rsp),$rptr # restore $rptr - sub 0($nptr),@ri[0] - mov 16($tptr),@ri[2] # t[2] - mov 24($tptr),@ri[3] # t[3] - sbb 8($nptr),@ri[1] - lea -1($num),$j # j=num/4-1 - jmp .Lsqr4x_sub -.align 16 -.Lsqr4x_sub: - mov @ri[0],0($rptr,$i,8) # rp[i]=tp[i]-np[i] - mov @ri[1],8($rptr,$i,8) # rp[i]=tp[i]-np[i] - sbb 16($nptr,$i,8),@ri[2] - mov 32($tptr,$i,8),@ri[0] # tp[i+1] - mov 40($tptr,$i,8),@ri[1] - sbb 24($nptr,$i,8),@ri[3] - mov @ri[2],16($rptr,$i,8) # rp[i]=tp[i]-np[i] - mov @ri[3],24($rptr,$i,8) # rp[i]=tp[i]-np[i] - sbb 32($nptr,$i,8),@ri[0] - mov 48($tptr,$i,8),@ri[2] - mov 56($tptr,$i,8),@ri[3] - sbb 40($nptr,$i,8),@ri[1] - lea 4($i),$i # i++ - dec $j # doesn't affect CF! - jnz .Lsqr4x_sub - - mov @ri[0],0($rptr,$i,8) # rp[i]=tp[i]-np[i] - mov 32($tptr,$i,8),@ri[0] # load overflow bit - sbb 16($nptr,$i,8),@ri[2] - mov @ri[1],8($rptr,$i,8) # rp[i]=tp[i]-np[i] - sbb 24($nptr,$i,8),@ri[3] - mov @ri[2],16($rptr,$i,8) # rp[i]=tp[i]-np[i] - - sbb \$0,@ri[0] # handle upmost overflow bit - mov @ri[3],24($rptr,$i,8) # rp[i]=tp[i]-np[i] - xor $i,$i # i=0 - and @ri[0],$tptr - not @ri[0] - mov $rptr,$nptr - and @ri[0],$nptr - lea -1($num),$j - or $nptr,$tptr # tp=borrow?tp:rp + lea 8($bp),$bptr + mov ($bp),%rdx # b[0], $bp==%rdx actually + lea 64+32(%rsp),$tptr + mov %rdx,$bi + + mulx 0*8($aptr),$mi,%rax # a[0]*b[0] + mulx 1*8($aptr),%r11,%r14 # a[1]*b[0] + add %rax,%r11 + mov $bptr,8(%rsp) # off-load &b[i] + mulx 2*8($aptr),%r12,%r13 # ... + adc %r14,%r12 + adc \$0,%r13 + + mov $mi,$bptr # borrow $bptr + imulq 24(%rsp),$mi # "t[0]"*n0 + xor $zero,$zero # cf=0, of=0 + + mulx 3*8($aptr),%rax,%r14 + mov $mi,%rdx + lea 4*8($aptr),$aptr + adcx %rax,%r13 + adcx $zero,%r14 # cf=0 + + mulx 0*8($nptr),%rax,%r10 + adcx %rax,$bptr # discarded + adox %r11,%r10 + mulx 1*8($nptr),%rax,%r11 + adcx %rax,%r10 + adox %r12,%r11 + .byte 0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00 # mulx 2*8($nptr),%rax,%r12 + mov 48(%rsp),$bptr # counter value + mov %r10,-4*8($tptr) + adcx %rax,%r11 + adox %r13,%r12 + mulx 3*8($nptr),%rax,%r15 + mov $bi,%rdx + mov %r11,-3*8($tptr) + adcx %rax,%r12 + adox $zero,%r15 # of=0 + lea 4*8($nptr),$nptr + mov %r12,-2*8($tptr) + + jmp .Lmulx4x_1st + +.align 32 +.Lmulx4x_1st: + adcx $zero,%r15 # cf=0, modulo-scheduled + mulx 0*8($aptr),%r10,%rax # a[4]*b[0] + adcx %r14,%r10 + mulx 1*8($aptr),%r11,%r14 # a[5]*b[0] + adcx %rax,%r11 + mulx 2*8($aptr),%r12,%rax # ... + adcx %r14,%r12 + mulx 3*8($aptr),%r13,%r14 + .byte 0x67,0x67 + mov $mi,%rdx + adcx %rax,%r13 + adcx $zero,%r14 # cf=0 + lea 4*8($aptr),$aptr + lea 4*8($tptr),$tptr + + adox %r15,%r10 + mulx 0*8($nptr),%rax,%r15 + adcx %rax,%r10 + adox %r15,%r11 + mulx 1*8($nptr),%rax,%r15 + adcx %rax,%r11 + adox %r15,%r12 + mulx 2*8($nptr),%rax,%r15 + mov %r10,-5*8($tptr) + adcx %rax,%r12 + mov %r11,-4*8($tptr) + adox %r15,%r13 + mulx 3*8($nptr),%rax,%r15 + mov $bi,%rdx + mov %r12,-3*8($tptr) + adcx %rax,%r13 + adox $zero,%r15 + lea 4*8($nptr),$nptr + mov %r13,-2*8($tptr) + + dec $bptr # of=0, pass cf + jnz .Lmulx4x_1st + + mov 0(%rsp),$num # load num + mov 8(%rsp),$bptr # re-load &b[i] + adc $zero,%r15 # modulo-scheduled + add %r15,%r14 + sbb %r15,%r15 # top-most carry + mov %r14,-1*8($tptr) + jmp .Lmulx4x_outer + +.align 32 +.Lmulx4x_outer: + mov ($bptr),%rdx # b[i] + lea 8($bptr),$bptr # b++ + sub $num,$aptr # rewind $aptr + mov %r15,($tptr) # save top-most carry + lea 64+4*8(%rsp),$tptr + sub $num,$nptr # rewind $nptr + + mulx 0*8($aptr),$mi,%r11 # a[0]*b[i] + xor %ebp,%ebp # xor $zero,$zero # cf=0, of=0 + mov %rdx,$bi + mulx 1*8($aptr),%r14,%r12 # a[1]*b[i] + adox -4*8($tptr),$mi + adcx %r14,%r11 + mulx 2*8($aptr),%r15,%r13 # ... + adox -3*8($tptr),%r11 + adcx %r15,%r12 + adox $zero,%r12 + adcx $zero,%r13 + + mov $bptr,8(%rsp) # off-load &b[i] + .byte 0x67 + mov $mi,%r15 + imulq 24(%rsp),$mi # "t[0]"*n0 + xor %ebp,%ebp # xor $zero,$zero # cf=0, of=0 + + mulx 3*8($aptr),%rax,%r14 + mov $mi,%rdx + adox -2*8($tptr),%r12 + adcx %rax,%r13 + adox -1*8($tptr),%r13 + adcx $zero,%r14 + lea 4*8($aptr),$aptr + adox $zero,%r14 + + mulx 0*8($nptr),%rax,%r10 + adcx %rax,%r15 # discarded + adox %r11,%r10 + mulx 1*8($nptr),%rax,%r11 + adcx %rax,%r10 + adox %r12,%r11 + mulx 2*8($nptr),%rax,%r12 + mov %r10,-4*8($tptr) + adcx %rax,%r11 + adox %r13,%r12 + mulx 3*8($nptr),%rax,%r15 + mov $bi,%rdx + mov %r11,-3*8($tptr) + lea 4*8($nptr),$nptr + adcx %rax,%r12 + adox $zero,%r15 # of=0 + mov 48(%rsp),$bptr # counter value + mov %r12,-2*8($tptr) + + jmp .Lmulx4x_inner + +.align 32 +.Lmulx4x_inner: + mulx 0*8($aptr),%r10,%rax # a[4]*b[i] + adcx $zero,%r15 # cf=0, modulo-scheduled + adox %r14,%r10 + mulx 1*8($aptr),%r11,%r14 # a[5]*b[i] + adcx 0*8($tptr),%r10 + adox %rax,%r11 + mulx 2*8($aptr),%r12,%rax # ... + adcx 1*8($tptr),%r11 + adox %r14,%r12 + mulx 3*8($aptr),%r13,%r14 + mov $mi,%rdx + adcx 2*8($tptr),%r12 + adox %rax,%r13 + adcx 3*8($tptr),%r13 + adox $zero,%r14 # of=0 + lea 4*8($aptr),$aptr + lea 4*8($tptr),$tptr + adcx $zero,%r14 # cf=0 + + adox %r15,%r10 + mulx 0*8($nptr),%rax,%r15 + adcx %rax,%r10 + adox %r15,%r11 + mulx 1*8($nptr),%rax,%r15 + adcx %rax,%r11 + adox %r15,%r12 + mulx 2*8($nptr),%rax,%r15 + mov %r10,-5*8($tptr) + adcx %rax,%r12 + adox %r15,%r13 + mulx 3*8($nptr),%rax,%r15 + mov $bi,%rdx + mov %r11,-4*8($tptr) + mov %r12,-3*8($tptr) + adcx %rax,%r13 + adox $zero,%r15 + lea 4*8($nptr),$nptr + mov %r13,-2*8($tptr) + + dec $bptr # of=0, pass cf + jnz .Lmulx4x_inner + + mov 0(%rsp),$num # load num + mov 8(%rsp),$bptr # re-load &b[i] + adc $zero,%r15 # modulo-scheduled + sub 0*8($tptr),$zero # pull top-most carry + adc %r15,%r14 + mov -8($nptr),$mi + sbb %r15,%r15 # top-most carry + mov %r14,-1*8($tptr) + + cmp 16(%rsp),$bptr + jne .Lmulx4x_outer + + sub %r14,$mi # compare top-most words + sbb $mi,$mi + or $mi,%r15 + + neg $num + xor %rdx,%rdx + mov 32(%rsp),$rptr # restore rp + lea 64(%rsp),$tptr pxor %xmm0,%xmm0 - lea 64(%rsp,$num,8),$nptr - movdqu ($tptr),%xmm1 - lea ($nptr,$num,8),$nptr - movdqa %xmm0,64(%rsp) # zap lower half of temporary vector - movdqa %xmm0,($nptr) # zap upper half of temporary vector - movdqu %xmm1,($rptr) - jmp .Lsqr4x_copy -.align 16 -.Lsqr4x_copy: # copy or in-place refresh - movdqu 16($tptr,$i),%xmm2 - movdqu 32($tptr,$i),%xmm1 - movdqa %xmm0,80(%rsp,$i) # zap lower half of temporary vector - movdqa %xmm0,96(%rsp,$i) # zap lower half of temporary vector - movdqa %xmm0,16($nptr,$i) # zap upper half of temporary vector - movdqa %xmm0,32($nptr,$i) # zap upper half of temporary vector - movdqu %xmm2,16($rptr,$i) - movdqu %xmm1,32($rptr,$i) - lea 32($i),$i - dec $j - jnz .Lsqr4x_copy - - movdqu 16($tptr,$i),%xmm2 - movdqa %xmm0,80(%rsp,$i) # zap lower half of temporary vector - movdqa %xmm0,16($nptr,$i) # zap upper half of temporary vector - movdqu %xmm2,16($rptr,$i) -___ -} -$code.=<<___; - mov 56(%rsp),%rsi # restore %rsp + mov 0*8($nptr,$num),%r8 + mov 1*8($nptr,$num),%r9 + neg %r8 + jmp .Lmulx4x_sub_entry + +.align 32 +.Lmulx4x_sub: + mov 0*8($nptr,$num),%r8 + mov 1*8($nptr,$num),%r9 + not %r8 +.Lmulx4x_sub_entry: + mov 2*8($nptr,$num),%r10 + not %r9 + and %r15,%r8 + mov 3*8($nptr,$num),%r11 + not %r10 + and %r15,%r9 + not %r11 + and %r15,%r10 + and %r15,%r11 + + neg %rdx # mov %rdx,%cf + adc 0*8($tptr),%r8 + adc 1*8($tptr),%r9 + movdqa %xmm0,($tptr) + adc 2*8($tptr),%r10 + adc 3*8($tptr),%r11 + movdqa %xmm0,16($tptr) + lea 4*8($tptr),$tptr + sbb %rdx,%rdx # mov %cf,%rdx + + mov %r8,0*8($rptr) + mov %r9,1*8($rptr) + mov %r10,2*8($rptr) + mov %r11,3*8($rptr) + lea 4*8($rptr),$rptr + + add \$32,$num + jnz .Lmulx4x_sub + + mov 40(%rsp),%rsi # restore %rsp mov \$1,%rax - mov 0(%rsi),%r15 - mov 8(%rsi),%r14 - mov 16(%rsi),%r13 - mov 24(%rsi),%r12 - mov 32(%rsi),%rbp - mov 40(%rsi),%rbx - lea 48(%rsi),%rsp -.Lsqr4x_epilogue: + mov -48(%rsi),%r15 + mov -40(%rsi),%r14 + mov -32(%rsi),%r13 + mov -24(%rsi),%r12 + mov -16(%rsi),%rbp + mov -8(%rsi),%rbx + lea (%rsi),%rsp +.Lmulx4x_epilogue: ret -.size bn_sqr4x_mont,.-bn_sqr4x_mont +.size bn_mulx4x_mont,.-bn_mulx4x_mont ___ }}} $code.=<<___; @@ -1581,18 +1285,22 @@ sqr_handler: mov 120($context),%rax # pull context->Rax mov 248($context),%rbx # pull context->Rip - lea .Lsqr4x_body(%rip),%r10 + mov 8($disp),%rsi # disp->ImageBase + mov 56($disp),%r11 # disp->HandlerData + + mov 0(%r11),%r10d # HandlerData[0] + lea (%rsi,%r10),%r10 # end of prologue label cmp %r10,%rbx # context->Rip<.Lsqr_body jb .Lcommon_seh_tail mov 152($context),%rax # pull context->Rsp - lea .Lsqr4x_epilogue(%rip),%r10 + mov 4(%r11),%r10d # HandlerData[1] + lea (%rsi,%r10),%r10 # epilogue label cmp %r10,%rbx # context->Rip>=.Lsqr_epilogue jae .Lcommon_seh_tail - mov 56(%rax),%rax # pull saved stack pointer - lea 48(%rax),%rax + mov 40(%rax),%rax # pull saved stack pointer mov -8(%rax),%rbx mov -16(%rax),%rbp @@ -1657,10 +1365,16 @@ sqr_handler: .rva .LSEH_end_bn_mul4x_mont .rva .LSEH_info_bn_mul4x_mont - .rva .LSEH_begin_bn_sqr4x_mont - .rva .LSEH_end_bn_sqr4x_mont - .rva .LSEH_info_bn_sqr4x_mont - + .rva .LSEH_begin_bn_sqr8x_mont + .rva .LSEH_end_bn_sqr8x_mont + .rva .LSEH_info_bn_sqr8x_mont +___ +$code.=<<___ if ($addx); + .rva .LSEH_begin_bn_mulx4x_mont + .rva .LSEH_end_bn_mulx4x_mont + .rva .LSEH_info_bn_mulx4x_mont +___ +$code.=<<___; .section .xdata .align 8 .LSEH_info_bn_mul_mont: @@ -1671,9 +1385,16 @@ sqr_handler: .byte 9,0,0,0 .rva mul_handler .rva .Lmul4x_body,.Lmul4x_epilogue # HandlerData[] -.LSEH_info_bn_sqr4x_mont: +.LSEH_info_bn_sqr8x_mont: + .byte 9,0,0,0 + .rva sqr_handler + .rva .Lsqr8x_body,.Lsqr8x_epilogue # HandlerData[] +___ +$code.=<<___ if ($addx); +.LSEH_info_bn_mulx4x_mont: .byte 9,0,0,0 .rva sqr_handler + .rva .Lmulx4x_body,.Lmulx4x_epilogue # HandlerData[] ___ } diff --git a/crypto/bn/asm/x86_64-mont5.pl b/crypto/bn/asm/x86_64-mont5.pl index dae0fe2..820de3d 100755 --- a/crypto/bn/asm/x86_64-mont5.pl +++ b/crypto/bn/asm/x86_64-mont5.pl @@ -17,6 +17,13 @@ # is implemented, so that scatter-/gathering can be tuned without # bn_exp.c modifications. +# August 2013. +# +# Add MULX/AD*X code paths and additional interfaces to optimize for +# branch prediction unit. For input lengths that are multiples of 8 +# the np argument is not just modulus value, but one interleaved +# with 0. This is to optimize post-condition... + $flavour = shift; $output = shift; if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } @@ -31,6 +38,21 @@ die "can't locate x86_64-xlate.pl"; open OUT,"| \"$^X\" $xlate $flavour $output"; *STDOUT=*OUT; +if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` + =~ /GNU assembler version ([2-9]\.[0-9]+)/) { + $addx = ($1>=2.23); +} + +if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && + `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { + $addx = ($1>=2.10); +} + +if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && + `ml64 2>&1` =~ /Version ([0-9]+)\./) { + $addx = ($1>=12); +} + # int bn_mul_mont_gather5( $rp="%rdi"; # BN_ULONG *rp, $ap="%rsi"; # const BN_ULONG *ap, @@ -53,19 +75,25 @@ $m1="%rbp"; $code=<<___; .text +.extern OPENSSL_ia32cap_P + .globl bn_mul_mont_gather5 .type bn_mul_mont_gather5,\@function,6 .align 64 bn_mul_mont_gather5: - test \$3,${num}d + test \$7,${num}d jnz .Lmul_enter - cmp \$8,${num}d - jb .Lmul_enter +___ +$code.=<<___ if ($addx); + mov OPENSSL_ia32cap_P+8(%rip),%r11d +___ +$code.=<<___; jmp .Lmul4x_enter .align 16 .Lmul_enter: mov ${num}d,${num}d + mov %rsp,%rax mov `($win64?56:8)`(%rsp),%r10d # load 7th argument push %rbx push %rbp @@ -78,10 +106,8 @@ $code.=<<___ if ($win64); lea -0x28(%rsp),%rsp movaps %xmm6,(%rsp) movaps %xmm7,0x10(%rsp) -.Lmul_alloca: ___ $code.=<<___; - mov %rsp,%rax lea 2($num),%r11 neg %r11 lea (%rsp,%r11,8),%rsp # tp=alloca(8*(num+2)) @@ -287,7 +313,7 @@ $code.=<<___; lea 1($i),$i # i++ cmp $num,$i - jl .Louter + jb .Louter xor $i,$i # i=0 and clear CF! mov (%rsp),%rax # tp[0] @@ -323,18 +349,17 @@ $code.=<<___; mov \$1,%rax ___ $code.=<<___ if ($win64); - movaps (%rsi),%xmm6 - movaps 0x10(%rsi),%xmm7 - lea 0x28(%rsi),%rsi + movaps -88(%rsi),%xmm6 + movaps -72(%rsi),%xmm7 ___ $code.=<<___; - mov (%rsi),%r15 - mov 8(%rsi),%r14 - mov 16(%rsi),%r13 - mov 24(%rsi),%r12 - mov 32(%rsi),%rbp - mov 40(%rsi),%rbx - lea 48(%rsi),%rsp + mov -48(%rsi),%r15 + mov -40(%rsi),%r14 + mov -32(%rsi),%r13 + mov -24(%rsi),%r12 + mov -16(%rsi),%rbp + mov -8(%rsi),%rbx + lea (%rsi),%rsp .Lmul_epilogue: ret .size bn_mul_mont_gather5,.-bn_mul_mont_gather5 @@ -344,11 +369,18 @@ my @A=("%r10","%r11"); my @N=("%r13","%rdi"); $code.=<<___; .type bn_mul4x_mont_gather5,\@function,6 -.align 16 +.align 32 bn_mul4x_mont_gather5: .Lmul4x_enter: - mov ${num}d,${num}d - mov `($win64?56:8)`(%rsp),%r10d # load 7th argument +___ +$code.=<<___ if ($addx); + and \$0x80100,%r11d + cmp \$0x80100,%r11d + je .Lmulx4x_enter +___ +$code.=<<___; + .byte 0x67 + mov %rsp,%rax push %rbx push %rbp push %r12 @@ -360,23 +392,78 @@ $code.=<<___ if ($win64); lea -0x28(%rsp),%rsp movaps %xmm6,(%rsp) movaps %xmm7,0x10(%rsp) -.Lmul4x_alloca: ___ $code.=<<___; - mov %rsp,%rax - lea 4($num),%r11 - neg %r11 - lea (%rsp,%r11,8),%rsp # tp=alloca(8*(num+4)) - and \$-1024,%rsp # minimize TLB usage - - mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp + .byte 0x67 + mov ${num}d,%r10d + shl \$3,${num}d + shl \$3+2,%r10d # 4*$num + neg $num # -$num + + ############################################################## + # ensure that stack frame doesn't alias with $aptr+4*$num + # modulo 4096, which covers ret[num], am[num] and n[2*num] + # (see bn_exp.c). this is done to allow memory disambiguation + # logic do its magic. [excessive frame is allocated in order + # to allow bn_from_mont8x to clear it.] + # + lea -64(%rsp,$num,2),%r11 + sub $ap,%r11 + and \$4095,%r11 + cmp %r11,%r10 + jb .Lmul4xsp_alt + sub %r11,%rsp # align with $ap + lea -64(%rsp,$num,2),%rsp # alloca(128+num*8) + jmp .Lmul4xsp_done + +.align 32 +.Lmul4xsp_alt: + lea 4096-64(,$num,2),%r10 + lea -64(%rsp,$num,2),%rsp # alloca(128+num*8) + sub %r10,%r11 + mov \$0,%r10 + cmovc %r10,%r11 + sub %r11,%rsp +.Lmul4xsp_done: + and \$-64,%rsp + neg $num + + mov %rax,40(%rsp) .Lmul4x_body: - mov $rp,16(%rsp,$num,8) # tp[num+2]=$rp - mov %rdx,%r12 # reassign $bp + + call mul4x_internal + + mov 40(%rsp),%rsi # restore %rsp + mov \$1,%rax +___ +$code.=<<___ if ($win64); + movaps -88(%rsi),%xmm6 + movaps -72(%rsi),%xmm7 +___ +$code.=<<___; + mov -48(%rsi),%r15 + mov -40(%rsi),%r14 + mov -32(%rsi),%r13 + mov -24(%rsi),%r12 + mov -16(%rsi),%rbp + mov -8(%rsi),%rbx + lea (%rsi),%rsp +.Lmul4x_epilogue: + ret +.size bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5 + +.type mul4x_internal,\@abi-omnipotent +.align 32 +mul4x_internal: + shl \$5,$num + mov `($win64?56:8)`(%rax),%r10d # load 7th argument + lea 256(%rdx,$num),%r13 + shr \$5,$num # restore $num ___ $bp="%r12"; $STRIDE=2**5*8; # 5 is "window size" $N=$STRIDE/4; # should match cache line size + $tp=$i; $code.=<<___; mov %r10,%r11 shr \$`log($N/8)/log(2)`,%r10 @@ -384,459 +471,2776 @@ $code.=<<___; not %r10 lea .Lmagic_masks(%rip),%rax and \$`2**5/($N/8)-1`,%r10 # 5 is "window size" - lea 96($bp,%r11,8),$bp # pointer within 1st cache line + lea 96(%rdx,%r11,8),$bp # pointer within 1st cache line movq 0(%rax,%r10,8),%xmm4 # set of masks denoting which movq 8(%rax,%r10,8),%xmm5 # cache line contains element + add \$7,%r11 movq 16(%rax,%r10,8),%xmm6 # denoted by 7th argument movq 24(%rax,%r10,8),%xmm7 + and \$7,%r11 movq `0*$STRIDE/4-96`($bp),%xmm0 + lea $STRIDE($bp),$tp # borrow $tp movq `1*$STRIDE/4-96`($bp),%xmm1 pand %xmm4,%xmm0 movq `2*$STRIDE/4-96`($bp),%xmm2 pand %xmm5,%xmm1 movq `3*$STRIDE/4-96`($bp),%xmm3 pand %xmm6,%xmm2 + .byte 0x67 por %xmm1,%xmm0 + movq `0*$STRIDE/4-96`($tp),%xmm1 + .byte 0x67 pand %xmm7,%xmm3 + .byte 0x67 por %xmm2,%xmm0 - lea $STRIDE($bp),$bp + movq `1*$STRIDE/4-96`($tp),%xmm2 + .byte 0x67 + pand %xmm4,%xmm1 + .byte 0x67 por %xmm3,%xmm0 + movq `2*$STRIDE/4-96`($tp),%xmm3 movq %xmm0,$m0 # m0=bp[0] + movq `3*$STRIDE/4-96`($tp),%xmm0 + mov %r13,16+8(%rsp) # save end of b[num] + mov $rp, 56+8(%rsp) # save $rp + mov ($n0),$n0 # pull n0[0] value mov ($ap),%rax - - xor $i,$i # i=0 - xor $j,$j # j=0 - - movq `0*$STRIDE/4-96`($bp),%xmm0 - movq `1*$STRIDE/4-96`($bp),%xmm1 - pand %xmm4,%xmm0 - movq `2*$STRIDE/4-96`($bp),%xmm2 - pand %xmm5,%xmm1 + lea ($ap,$num),$ap # end of a[num] + neg $num mov $n0,$m1 mulq $m0 # ap[0]*bp[0] mov %rax,$A[0] mov ($np),%rax - movq `3*$STRIDE/4-96`($bp),%xmm3 - pand %xmm6,%xmm2 - por %xmm1,%xmm0 - pand %xmm7,%xmm3 + pand %xmm5,%xmm2 + pand %xmm6,%xmm3 + por %xmm2,%xmm1 imulq $A[0],$m1 # "tp[0]"*n0 + ############################################################## + # $tp is chosen so that writing to top-most element of the + # vector occurs just "above" references to powers table, + # "above" modulo cache-line size, which effectively precludes + # possibility of memory disambiguation logic failure when + # accessing the table. + # + lea 64+8(%rsp,%r11,8),$tp mov %rdx,$A[1] - por %xmm2,%xmm0 - lea $STRIDE($bp),$bp - por %xmm3,%xmm0 + pand %xmm7,%xmm0 + por %xmm3,%xmm1 + lea 2*$STRIDE($bp),$bp + por %xmm1,%xmm0 mulq $m1 # np[0]*m1 add %rax,$A[0] # discarded - mov 8($ap),%rax + mov 8($ap,$num),%rax adc \$0,%rdx mov %rdx,$N[1] mulq $m0 add %rax,$A[1] - mov 8($np),%rax + mov 16*1($np),%rax # interleaved with 0, therefore 16*n adc \$0,%rdx mov %rdx,$A[0] mulq $m1 add %rax,$N[1] - mov 16($ap),%rax + mov 16($ap,$num),%rax adc \$0,%rdx add $A[1],$N[1] - lea 4($j),$j # j++ + lea 4*8($num),$j # j=4 + lea 16*4($np),$np adc \$0,%rdx - mov $N[1],(%rsp) + mov $N[1],($tp) mov %rdx,$N[0] jmp .L1st4x -.align 16 + +.align 32 .L1st4x: mulq $m0 # ap[j]*bp[0] add %rax,$A[0] - mov -16($np,$j,8),%rax + mov -16*2($np),%rax + lea 32($tp),$tp adc \$0,%rdx mov %rdx,$A[1] mulq $m1 # np[j]*m1 add %rax,$N[0] - mov -8($ap,$j,8),%rax + mov -8($ap,$j),%rax adc \$0,%rdx add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] adc \$0,%rdx - mov $N[0],-24(%rsp,$j,8) # tp[j-1] + mov $N[0],-24($tp) # tp[j-1] mov %rdx,$N[1] mulq $m0 # ap[j]*bp[0] add %rax,$A[1] - mov -8($np,$j,8),%rax + mov -16*1($np),%rax adc \$0,%rdx mov %rdx,$A[0] mulq $m1 # np[j]*m1 add %rax,$N[1] - mov ($ap,$j,8),%rax + mov ($ap,$j),%rax adc \$0,%rdx add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] adc \$0,%rdx - mov $N[1],-16(%rsp,$j,8) # tp[j-1] + mov $N[1],-16($tp) # tp[j-1] mov %rdx,$N[0] mulq $m0 # ap[j]*bp[0] add %rax,$A[0] - mov ($np,$j,8),%rax + mov 16*0($np),%rax adc \$0,%rdx mov %rdx,$A[1] mulq $m1 # np[j]*m1 add %rax,$N[0] - mov 8($ap,$j,8),%rax + mov 8($ap,$j),%rax adc \$0,%rdx add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] adc \$0,%rdx - mov $N[0],-8(%rsp,$j,8) # tp[j-1] + mov $N[0],-8($tp) # tp[j-1] mov %rdx,$N[1] mulq $m0 # ap[j]*bp[0] add %rax,$A[1] - mov 8($np,$j,8),%rax + mov 16*1($np),%rax adc \$0,%rdx - lea 4($j),$j # j++ mov %rdx,$A[0] mulq $m1 # np[j]*m1 add %rax,$N[1] - mov -16($ap,$j,8),%rax + mov 16($ap,$j),%rax adc \$0,%rdx add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] + lea 16*4($np),$np adc \$0,%rdx - mov $N[1],-32(%rsp,$j,8) # tp[j-1] + mov $N[1],($tp) # tp[j-1] mov %rdx,$N[0] - cmp $num,$j - jl .L1st4x + + add \$32,$j # j+=4 + jnz .L1st4x mulq $m0 # ap[j]*bp[0] add %rax,$A[0] - mov -16($np,$j,8),%rax + mov -16*2($np),%rax + lea 32($tp),$tp adc \$0,%rdx mov %rdx,$A[1] mulq $m1 # np[j]*m1 add %rax,$N[0] - mov -8($ap,$j,8),%rax + mov -8($ap),%rax adc \$0,%rdx add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] adc \$0,%rdx - mov $N[0],-24(%rsp,$j,8) # tp[j-1] + mov $N[0],-24($tp) # tp[j-1] mov %rdx,$N[1] mulq $m0 # ap[j]*bp[0] add %rax,$A[1] - mov -8($np,$j,8),%rax + mov -16*1($np),%rax adc \$0,%rdx mov %rdx,$A[0] mulq $m1 # np[j]*m1 add %rax,$N[1] - mov ($ap),%rax # ap[0] + mov ($ap,$num),%rax # ap[0] adc \$0,%rdx add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] adc \$0,%rdx - mov $N[1],-16(%rsp,$j,8) # tp[j-1] + mov $N[1],-16($tp) # tp[j-1] mov %rdx,$N[0] movq %xmm0,$m0 # bp[1] + lea ($np,$num,2),$np # rewind $np xor $N[1],$N[1] add $A[0],$N[0] adc \$0,$N[1] - mov $N[0],-8(%rsp,$j,8) - mov $N[1],(%rsp,$j,8) # store upmost overflow bit + mov $N[0],-8($tp) - lea 1($i),$i # i++ -.align 4 -.Louter4x: - xor $j,$j # j=0 - movq `0*$STRIDE/4-96`($bp),%xmm0 - movq `1*$STRIDE/4-96`($bp),%xmm1 - pand %xmm4,%xmm0 - movq `2*$STRIDE/4-96`($bp),%xmm2 - pand %xmm5,%xmm1 + jmp .Louter4x - mov (%rsp),$A[0] +.align 32 +.Louter4x: + mov ($tp,$num),$A[0] mov $n0,$m1 mulq $m0 # ap[0]*bp[i] add %rax,$A[0] # ap[0]*bp[i]+tp[0] mov ($np),%rax adc \$0,%rdx + movq `0*$STRIDE/4-96`($bp),%xmm0 + movq `1*$STRIDE/4-96`($bp),%xmm1 + pand %xmm4,%xmm0 + movq `2*$STRIDE/4-96`($bp),%xmm2 + pand %xmm5,%xmm1 movq `3*$STRIDE/4-96`($bp),%xmm3 - pand %xmm6,%xmm2 - por %xmm1,%xmm0 - pand %xmm7,%xmm3 imulq $A[0],$m1 # tp[0]*n0 + .byte 0x67 mov %rdx,$A[1] + mov $N[1],($tp) # store upmost overflow bit + pand %xmm6,%xmm2 + por %xmm1,%xmm0 + pand %xmm7,%xmm3 por %xmm2,%xmm0 + lea ($tp,$num),$tp # rewind $tp lea $STRIDE($bp),$bp por %xmm3,%xmm0 mulq $m1 # np[0]*m1 add %rax,$A[0] # "$N[0]", discarded - mov 8($ap),%rax + mov 8($ap,$num),%rax adc \$0,%rdx mov %rdx,$N[1] mulq $m0 # ap[j]*bp[i] add %rax,$A[1] - mov 8($np),%rax + mov 16*1($np),%rax # interleaved with 0, therefore 16*n adc \$0,%rdx - add 8(%rsp),$A[1] # +tp[1] + add 8($tp),$A[1] # +tp[1] adc \$0,%rdx mov %rdx,$A[0] mulq $m1 # np[j]*m1 add %rax,$N[1] - mov 16($ap),%rax + mov 16($ap,$num),%rax adc \$0,%rdx add $A[1],$N[1] # np[j]*m1+ap[j]*bp[i]+tp[j] - lea 4($j),$j # j+=2 + lea 4*8($num),$j # j=4 + lea 16*4($np),$np adc \$0,%rdx mov %rdx,$N[0] jmp .Linner4x -.align 16 + +.align 32 .Linner4x: mulq $m0 # ap[j]*bp[i] add %rax,$A[0] - mov -16($np,$j,8),%rax + mov -16*2($np),%rax adc \$0,%rdx - add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j] + add 16($tp),$A[0] # ap[j]*bp[i]+tp[j] + lea 32($tp),$tp adc \$0,%rdx mov %rdx,$A[1] mulq $m1 # np[j]*m1 add %rax,$N[0] - mov -8($ap,$j,8),%rax + mov -8($ap,$j),%rax adc \$0,%rdx add $A[0],$N[0] adc \$0,%rdx - mov $N[1],-32(%rsp,$j,8) # tp[j-1] + mov $N[1],-32($tp) # tp[j-1] mov %rdx,$N[1] mulq $m0 # ap[j]*bp[i] add %rax,$A[1] - mov -8($np,$j,8),%rax + mov -16*1($np),%rax adc \$0,%rdx - add -8(%rsp,$j,8),$A[1] + add -8($tp),$A[1] adc \$0,%rdx mov %rdx,$A[0] mulq $m1 # np[j]*m1 add %rax,$N[1] - mov ($ap,$j,8),%rax + mov ($ap,$j),%rax adc \$0,%rdx add $A[1],$N[1] adc \$0,%rdx - mov $N[0],-24(%rsp,$j,8) # tp[j-1] + mov $N[0],-24($tp) # tp[j-1] mov %rdx,$N[0] mulq $m0 # ap[j]*bp[i] add %rax,$A[0] - mov ($np,$j,8),%rax + mov 16*0($np),%rax adc \$0,%rdx - add (%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j] + add ($tp),$A[0] # ap[j]*bp[i]+tp[j] adc \$0,%rdx mov %rdx,$A[1] mulq $m1 # np[j]*m1 add %rax,$N[0] - mov 8($ap,$j,8),%rax + mov 8($ap,$j),%rax adc \$0,%rdx add $A[0],$N[0] adc \$0,%rdx - mov $N[1],-16(%rsp,$j,8) # tp[j-1] + mov $N[1],-16($tp) # tp[j-1] mov %rdx,$N[1] mulq $m0 # ap[j]*bp[i] add %rax,$A[1] - mov 8($np,$j,8),%rax + mov 16*1($np),%rax adc \$0,%rdx - add 8(%rsp,$j,8),$A[1] + add 8($tp),$A[1] adc \$0,%rdx - lea 4($j),$j # j++ mov %rdx,$A[0] mulq $m1 # np[j]*m1 add %rax,$N[1] - mov -16($ap,$j,8),%rax + mov 16($ap,$j),%rax adc \$0,%rdx add $A[1],$N[1] + lea 16*4($np),$np adc \$0,%rdx - mov $N[0],-40(%rsp,$j,8) # tp[j-1] + mov $N[0],-8($tp) # tp[j-1] mov %rdx,$N[0] - cmp $num,$j - jl .Linner4x + + add \$32,$j # j+=4 + jnz .Linner4x mulq $m0 # ap[j]*bp[i] add %rax,$A[0] - mov -16($np,$j,8),%rax + mov -16*2($np),%rax adc \$0,%rdx - add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j] + add 16($tp),$A[0] # ap[j]*bp[i]+tp[j] + lea 32($tp),$tp adc \$0,%rdx mov %rdx,$A[1] mulq $m1 # np[j]*m1 add %rax,$N[0] - mov -8($ap,$j,8),%rax + mov -8($ap),%rax adc \$0,%rdx add $A[0],$N[0] adc \$0,%rdx - mov $N[1],-32(%rsp,$j,8) # tp[j-1] + mov $N[1],-32($tp) # tp[j-1] mov %rdx,$N[1] mulq $m0 # ap[j]*bp[i] add %rax,$A[1] - mov -8($np,$j,8),%rax + mov $m1,%rax + mov -16*1($np),$m1 adc \$0,%rdx - add -8(%rsp,$j,8),$A[1] + add -8($tp),$A[1] adc \$0,%rdx - lea 1($i),$i # i++ mov %rdx,$A[0] mulq $m1 # np[j]*m1 add %rax,$N[1] - mov ($ap),%rax # ap[0] + mov ($ap,$num),%rax # ap[0] adc \$0,%rdx add $A[1],$N[1] adc \$0,%rdx - mov $N[0],-24(%rsp,$j,8) # tp[j-1] + mov $N[0],-24($tp) # tp[j-1] mov %rdx,$N[0] movq %xmm0,$m0 # bp[i+1] - mov $N[1],-16(%rsp,$j,8) # tp[j-1] + mov $N[1],-16($tp) # tp[j-1] + lea ($np,$num,2),$np # rewind $np xor $N[1],$N[1] add $A[0],$N[0] adc \$0,$N[1] - add (%rsp,$num,8),$N[0] # pull upmost overflow bit - adc \$0,$N[1] - mov $N[0],-8(%rsp,$j,8) - mov $N[1],(%rsp,$j,8) # store upmost overflow bit + add ($tp),$N[0] # pull upmost overflow bit + adc \$0,$N[1] # upmost overflow bit + mov $N[0],-8($tp) - cmp $num,$i - jl .Louter4x + cmp 16+8(%rsp),$bp + jb .Louter4x ___ -{ -my @ri=("%rax","%rdx",$m0,$m1); +if (1) { $code.=<<___; - mov 16(%rsp,$num,8),$rp # restore $rp - mov 0(%rsp),@ri[0] # tp[0] - pxor %xmm0,%xmm0 - mov 8(%rsp),@ri[1] # tp[1] - shr \$2,$num # num/=4 - lea (%rsp),$ap # borrow ap for tp - xor $i,$i # i=0 and clear CF! - - sub 0($np),@ri[0] - mov 16($ap),@ri[2] # tp[2] - mov 24($ap),@ri[3] # tp[3] - sbb 8($np),@ri[1] - lea -1($num),$j # j=num/4-1 + sub $N[0],$m1 # compare top-most words + adc $j,$j # $j is zero + or $j,$N[1] + xor \$1,$N[1] + lea ($tp,$num),%rbx # tptr in .sqr4x_sub + lea ($np,$N[1],8),%rbp # nptr in .sqr4x_sub + mov %r9,%rcx + sar \$3+2,%rcx # cf=0 + mov 56+8(%rsp),%rdi # rptr in .sqr4x_sub + jmp .Lsqr4x_sub +___ +} else { +my @ri=("%rax",$bp,$m0,$m1); +my $rp="%rdx"; +$code.=<<___ + xor \$1,$N[1] + lea ($tp,$num),$tp # rewind $tp + sar \$5,$num # cf=0 + lea ($np,$N[1],8),$np + mov 56+8(%rsp),$rp # restore $rp jmp .Lsub4x -.align 16 + +.align 32 .Lsub4x: - mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i] - mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i] - sbb 16($np,$i,8),@ri[2] - mov 32($ap,$i,8),@ri[0] # tp[i+1] - mov 40($ap,$i,8),@ri[1] - sbb 24($np,$i,8),@ri[3] - mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i] - mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i] - sbb 32($np,$i,8),@ri[0] - mov 48($ap,$i,8),@ri[2] - mov 56($ap,$i,8),@ri[3] - sbb 40($np,$i,8),@ri[1] - lea 4($i),$i # i++ - dec $j # doesnn't affect CF! + .byte 0x66 + mov 8*0($tp),@ri[0] + mov 8*1($tp),@ri[1] + .byte 0x66 + sbb 16*0($np),@ri[0] + mov 8*2($tp),@ri[2] + sbb 16*1($np),@ri[1] + mov 3*8($tp),@ri[3] + lea 4*8($tp),$tp + sbb 16*2($np),@ri[2] + mov @ri[0],8*0($rp) + sbb 16*3($np),@ri[3] + lea 16*4($np),$np + mov @ri[1],8*1($rp) + mov @ri[2],8*2($rp) + mov @ri[3],8*3($rp) + lea 8*4($rp),$rp + + inc $num jnz .Lsub4x - mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i] - mov 32($ap,$i,8),@ri[0] # load overflow bit - sbb 16($np,$i,8),@ri[2] - mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i] - sbb 24($np,$i,8),@ri[3] - mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i] + ret +___ +} +$code.=<<___; +.size mul4x_internal,.-mul4x_internal +___ +}}} +{{{ +###################################################################### +# void bn_power5( +my $rptr="%rdi"; # BN_ULONG *rptr, +my $aptr="%rsi"; # const BN_ULONG *aptr, +my $bptr="%rdx"; # const void *table, +my $nptr="%rcx"; # const BN_ULONG *nptr, +my $n0 ="%r8"; # const BN_ULONG *n0); +my $num ="%r9"; # int num, has to be divisible by 8 + # int pwr + +my ($i,$j,$tptr)=("%rbp","%rcx",$rptr); +my @A0=("%r10","%r11"); +my @A1=("%r12","%r13"); +my ($a0,$a1,$ai)=("%r14","%r15","%rbx"); - sbb \$0,@ri[0] # handle upmost overflow bit - mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i] - xor $i,$i # i=0 - and @ri[0],$ap - not @ri[0] - mov $rp,$np - and @ri[0],$np - lea -1($num),$j - or $np,$ap # ap=borrow?tp:rp +$code.=<<___; +.globl bn_power5 +.type bn_power5,\@function,6 +.align 32 +bn_power5: +___ +$code.=<<___ if ($addx); + mov OPENSSL_ia32cap_P+8(%rip),%r11d + and \$0x80100,%r11d + cmp \$0x80100,%r11d + je .Lpowerx5_enter +___ +$code.=<<___; + mov %rsp,%rax + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 +___ +$code.=<<___ if ($win64); + lea -0x28(%rsp),%rsp + movaps %xmm6,(%rsp) + movaps %xmm7,0x10(%rsp) +___ +$code.=<<___; + mov ${num}d,%r10d + shl \$3,${num}d # convert $num to bytes + shl \$3+2,%r10d # 4*$num + neg $num + mov ($n0),$n0 # *n0 + + ############################################################## + # ensure that stack frame doesn't alias with $aptr+4*$num + # modulo 4096, which covers ret[num], am[num] and n[2*num] + # (see bn_exp.c). this is done to allow memory disambiguation + # logic do its magic. + # + lea -64(%rsp,$num,2),%r11 + sub $aptr,%r11 + and \$4095,%r11 + cmp %r11,%r10 + jb .Lpwr_sp_alt + sub %r11,%rsp # align with $aptr + lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num) + jmp .Lpwr_sp_done + +.align 32 +.Lpwr_sp_alt: + lea 4096-64(,$num,2),%r10 # 4096-frame-2*$num + lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num) + sub %r10,%r11 + mov \$0,%r10 + cmovc %r10,%r11 + sub %r11,%rsp +.Lpwr_sp_done: + and \$-64,%rsp + mov $num,%r10 + neg $num + + ############################################################## + # Stack layout + # + # +0 saved $num, used in reduction section + # +8 &t[2*$num], used in reduction section + # +32 saved *n0 + # +40 saved %rsp + # +48 t[2*$num] + # + mov $n0, 32(%rsp) + mov %rax, 40(%rsp) # save original %rsp +.Lpower5_body: + movq $rptr,%xmm1 # save $rptr + movq $nptr,%xmm2 # save $nptr + movq %r10, %xmm3 # -$num + movq $bptr,%xmm4 + + call __bn_sqr8x_internal + call __bn_sqr8x_internal + call __bn_sqr8x_internal + call __bn_sqr8x_internal + call __bn_sqr8x_internal + + movq %xmm2,$nptr + movq %xmm4,$bptr + mov $aptr,$rptr + mov 40(%rsp),%rax + lea 32(%rsp),$n0 + + call mul4x_internal + + mov 40(%rsp),%rsi # restore %rsp + mov \$1,%rax + mov -48(%rsi),%r15 + mov -40(%rsi),%r14 + mov -32(%rsi),%r13 + mov -24(%rsi),%r12 + mov -16(%rsi),%rbp + mov -8(%rsi),%rbx + lea (%rsi),%rsp +.Lpower5_epilogue: + ret +.size bn_power5,.-bn_power5 + +.globl bn_sqr8x_internal +.hidden bn_sqr8x_internal +.type bn_sqr8x_internal,\@abi-omnipotent +.align 32 +bn_sqr8x_internal: +__bn_sqr8x_internal: + ############################################################## + # Squaring part: + # + # a) multiply-n-add everything but a[i]*a[i]; + # b) shift result of a) by 1 to the left and accumulate + # a[i]*a[i] products; + # + ############################################################## + # a[1]a[0] + # a[2]a[0] + # a[3]a[0] + # a[2]a[1] + # a[4]a[0] + # a[3]a[1] + # a[5]a[0] + # a[4]a[1] + # a[3]a[2] + # a[6]a[0] + # a[5]a[1] + # a[4]a[2] + # a[7]a[0] + # a[6]a[1] + # a[5]a[2] + # a[4]a[3] + # a[7]a[1] + # a[6]a[2] + # a[5]a[3] + # a[7]a[2] + # a[6]a[3] + # a[5]a[4] + # a[7]a[3] + # a[6]a[4] + # a[7]a[4] + # a[6]a[5] + # a[7]a[5] + # a[7]a[6] + # a[1]a[0] + # a[2]a[0] + # a[3]a[0] + # a[4]a[0] + # a[5]a[0] + # a[6]a[0] + # a[7]a[0] + # a[2]a[1] + # a[3]a[1] + # a[4]a[1] + # a[5]a[1] + # a[6]a[1] + # a[7]a[1] + # a[3]a[2] + # a[4]a[2] + # a[5]a[2] + # a[6]a[2] + # a[7]a[2] + # a[4]a[3] + # a[5]a[3] + # a[6]a[3] + # a[7]a[3] + # a[5]a[4] + # a[6]a[4] + # a[7]a[4] + # a[6]a[5] + # a[7]a[5] + # a[7]a[6] + # a[0]a[0] + # a[1]a[1] + # a[2]a[2] + # a[3]a[3] + # a[4]a[4] + # a[5]a[5] + # a[6]a[6] + # a[7]a[7] + + lea 32(%r10),$i # $i=-($num-32) + lea ($aptr,$num),$aptr # end of a[] buffer, ($aptr,$i)=&ap[2] + + mov $num,$j # $j=$num + + # comments apply to $num==8 case + mov -32($aptr,$i),$a0 # a[0] + lea 48+8(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num] + mov -24($aptr,$i),%rax # a[1] + lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"] + mov -16($aptr,$i),$ai # a[2] + mov %rax,$a1 + + mul $a0 # a[1]*a[0] + mov %rax,$A0[0] # a[1]*a[0] + mov $ai,%rax # a[2] + mov %rdx,$A0[1] + mov $A0[0],-24($tptr,$i) # t[1] + + mul $a0 # a[2]*a[0] + add %rax,$A0[1] + mov $ai,%rax + adc \$0,%rdx + mov $A0[1],-16($tptr,$i) # t[2] + mov %rdx,$A0[0] + + + mov -8($aptr,$i),$ai # a[3] + mul $a1 # a[2]*a[1] + mov %rax,$A1[0] # a[2]*a[1]+t[3] + mov $ai,%rax + mov %rdx,$A1[1] + + lea ($i),$j + mul $a0 # a[3]*a[0] + add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3] + mov $ai,%rax + mov %rdx,$A0[1] + adc \$0,$A0[1] + add $A1[0],$A0[0] + adc \$0,$A0[1] + mov $A0[0],-8($tptr,$j) # t[3] + jmp .Lsqr4x_1st + +.align 32 +.Lsqr4x_1st: + mov ($aptr,$j),$ai # a[4] + mul $a1 # a[3]*a[1] + add %rax,$A1[1] # a[3]*a[1]+t[4] + mov $ai,%rax + mov %rdx,$A1[0] + adc \$0,$A1[0] + + mul $a0 # a[4]*a[0] + add %rax,$A0[1] # a[4]*a[0]+a[3]*a[1]+t[4] + mov $ai,%rax # a[3] + mov 8($aptr,$j),$ai # a[5] + mov %rdx,$A0[0] + adc \$0,$A0[0] + add $A1[1],$A0[1] + adc \$0,$A0[0] + + + mul $a1 # a[4]*a[3] + add %rax,$A1[0] # a[4]*a[3]+t[5] + mov $ai,%rax + mov $A0[1],($tptr,$j) # t[4] + mov %rdx,$A1[1] + adc \$0,$A1[1] + + mul $a0 # a[5]*a[2] + add %rax,$A0[0] # a[5]*a[2]+a[4]*a[3]+t[5] + mov $ai,%rax + mov 16($aptr,$j),$ai # a[6] + mov %rdx,$A0[1] + adc \$0,$A0[1] + add $A1[0],$A0[0] + adc \$0,$A0[1] + + mul $a1 # a[5]*a[3] + add %rax,$A1[1] # a[5]*a[3]+t[6] + mov $ai,%rax + mov $A0[0],8($tptr,$j) # t[5] + mov %rdx,$A1[0] + adc \$0,$A1[0] + + mul $a0 # a[6]*a[2] + add %rax,$A0[1] # a[6]*a[2]+a[5]*a[3]+t[6] + mov $ai,%rax # a[3] + mov 24($aptr,$j),$ai # a[7] + mov %rdx,$A0[0] + adc \$0,$A0[0] + add $A1[1],$A0[1] + adc \$0,$A0[0] + + + mul $a1 # a[6]*a[5] + add %rax,$A1[0] # a[6]*a[5]+t[7] + mov $ai,%rax + mov $A0[1],16($tptr,$j) # t[6] + mov %rdx,$A1[1] + adc \$0,$A1[1] + lea 32($j),$j + + mul $a0 # a[7]*a[4] + add %rax,$A0[0] # a[7]*a[4]+a[6]*a[5]+t[6] + mov $ai,%rax + mov %rdx,$A0[1] + adc \$0,$A0[1] + add $A1[0],$A0[0] + adc \$0,$A0[1] + mov $A0[0],-8($tptr,$j) # t[7] + + cmp \$0,$j + jne .Lsqr4x_1st + + mul $a1 # a[7]*a[5] + add %rax,$A1[1] + lea 16($i),$i + adc \$0,%rdx + add $A0[1],$A1[1] + adc \$0,%rdx + + mov $A1[1],($tptr) # t[8] + mov %rdx,$A1[0] + mov %rdx,8($tptr) # t[9] + jmp .Lsqr4x_outer + +.align 32 +.Lsqr4x_outer: # comments apply to $num==6 case + mov -32($aptr,$i),$a0 # a[0] + lea 48+8(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num] + mov -24($aptr,$i),%rax # a[1] + lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"] + mov -16($aptr,$i),$ai # a[2] + mov %rax,$a1 + + mul $a0 # a[1]*a[0] + mov -24($tptr,$i),$A0[0] # t[1] + add %rax,$A0[0] # a[1]*a[0]+t[1] + mov $ai,%rax # a[2] + adc \$0,%rdx + mov $A0[0],-24($tptr,$i) # t[1] + mov %rdx,$A0[1] + + mul $a0 # a[2]*a[0] + add %rax,$A0[1] + mov $ai,%rax + adc \$0,%rdx + add -16($tptr,$i),$A0[1] # a[2]*a[0]+t[2] + mov %rdx,$A0[0] + adc \$0,$A0[0] + mov $A0[1],-16($tptr,$i) # t[2] + + xor $A1[0],$A1[0] + + mov -8($aptr,$i),$ai # a[3] + mul $a1 # a[2]*a[1] + add %rax,$A1[0] # a[2]*a[1]+t[3] + mov $ai,%rax + adc \$0,%rdx + add -8($tptr,$i),$A1[0] + mov %rdx,$A1[1] + adc \$0,$A1[1] + + mul $a0 # a[3]*a[0] + add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3] + mov $ai,%rax + adc \$0,%rdx + add $A1[0],$A0[0] + mov %rdx,$A0[1] + adc \$0,$A0[1] + mov $A0[0],-8($tptr,$i) # t[3] + + lea ($i),$j + jmp .Lsqr4x_inner + +.align 32 +.Lsqr4x_inner: + mov ($aptr,$j),$ai # a[4] + mul $a1 # a[3]*a[1] + add %rax,$A1[1] # a[3]*a[1]+t[4] + mov $ai,%rax + mov %rdx,$A1[0] + adc \$0,$A1[0] + add ($tptr,$j),$A1[1] + adc \$0,$A1[0] + + .byte 0x67 + mul $a0 # a[4]*a[0] + add %rax,$A0[1] # a[4]*a[0]+a[3]*a[1]+t[4] + mov $ai,%rax # a[3] + mov 8($aptr,$j),$ai # a[5] + mov %rdx,$A0[0] + adc \$0,$A0[0] + add $A1[1],$A0[1] + adc \$0,$A0[0] + + mul $a1 # a[4]*a[3] + add %rax,$A1[0] # a[4]*a[3]+t[5] + mov $A0[1],($tptr,$j) # t[4] + mov $ai,%rax + mov %rdx,$A1[1] + adc \$0,$A1[1] + add 8($tptr,$j),$A1[0] + lea 16($j),$j # j++ + adc \$0,$A1[1] + + mul $a0 # a[5]*a[2] + add %rax,$A0[0] # a[5]*a[2]+a[4]*a[3]+t[5] + mov $ai,%rax + adc \$0,%rdx + add $A1[0],$A0[0] + mov %rdx,$A0[1] + adc \$0,$A0[1] + mov $A0[0],-8($tptr,$j) # t[5], "preloaded t[1]" below + + cmp \$0,$j + jne .Lsqr4x_inner + + .byte 0x67 + mul $a1 # a[5]*a[3] + add %rax,$A1[1] + adc \$0,%rdx + add $A0[1],$A1[1] + adc \$0,%rdx + + mov $A1[1],($tptr) # t[6], "preloaded t[2]" below + mov %rdx,$A1[0] + mov %rdx,8($tptr) # t[7], "preloaded t[3]" below + + add \$16,$i + jnz .Lsqr4x_outer + + # comments apply to $num==4 case + mov -32($aptr),$a0 # a[0] + lea 48+8(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num] + mov -24($aptr),%rax # a[1] + lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"] + mov -16($aptr),$ai # a[2] + mov %rax,$a1 + + mul $a0 # a[1]*a[0] + add %rax,$A0[0] # a[1]*a[0]+t[1], preloaded t[1] + mov $ai,%rax # a[2] + mov %rdx,$A0[1] + adc \$0,$A0[1] + + mul $a0 # a[2]*a[0] + add %rax,$A0[1] + mov $ai,%rax + mov $A0[0],-24($tptr) # t[1] + mov %rdx,$A0[0] + adc \$0,$A0[0] + add $A1[1],$A0[1] # a[2]*a[0]+t[2], preloaded t[2] + mov -8($aptr),$ai # a[3] + adc \$0,$A0[0] + + mul $a1 # a[2]*a[1] + add %rax,$A1[0] # a[2]*a[1]+t[3], preloaded t[3] + mov $ai,%rax + mov $A0[1],-16($tptr) # t[2] + mov %rdx,$A1[1] + adc \$0,$A1[1] + + mul $a0 # a[3]*a[0] + add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3] + mov $ai,%rax + mov %rdx,$A0[1] + adc \$0,$A0[1] + add $A1[0],$A0[0] + adc \$0,$A0[1] + mov $A0[0],-8($tptr) # t[3] + + mul $a1 # a[3]*a[1] + add %rax,$A1[1] + mov -16($aptr),%rax # a[2] + adc \$0,%rdx + add $A0[1],$A1[1] + adc \$0,%rdx + + mov $A1[1],($tptr) # t[4] + mov %rdx,$A1[0] + mov %rdx,8($tptr) # t[5] + + mul $ai # a[2]*a[3] +___ +{ +my ($shift,$carry)=($a0,$a1); +my @S=(@A1,$ai,$n0); +$code.=<<___; + add \$16,$i + xor $shift,$shift + sub $num,$i # $i=16-$num + xor $carry,$carry + + add $A1[0],%rax # t[5] + adc \$0,%rdx + mov %rax,8($tptr) # t[5] + mov %rdx,16($tptr) # t[6] + mov $carry,24($tptr) # t[7] + + mov -16($aptr,$i),%rax # a[0] + lea 48+8(%rsp),$tptr + xor $A0[0],$A0[0] # t[0] + mov 8($tptr),$A0[1] # t[1] + + lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift + shr \$63,$A0[0] + lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 | + shr \$63,$A0[1] + or $A0[0],$S[1] # | t[2*i]>>63 + mov 16($tptr),$A0[0] # t[2*i+2] # prefetch + mov $A0[1],$shift # shift=t[2*i+1]>>63 + mul %rax # a[i]*a[i] + neg $carry # mov $carry,cf + mov 24($tptr),$A0[1] # t[2*i+2+1] # prefetch + adc %rax,$S[0] + mov -8($aptr,$i),%rax # a[i+1] # prefetch + mov $S[0],($tptr) + adc %rdx,$S[1] + + lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift + mov $S[1],8($tptr) + sbb $carry,$carry # mov cf,$carry + shr \$63,$A0[0] + lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 | + shr \$63,$A0[1] + or $A0[0],$S[3] # | t[2*i]>>63 + mov 32($tptr),$A0[0] # t[2*i+2] # prefetch + mov $A0[1],$shift # shift=t[2*i+1]>>63 + mul %rax # a[i]*a[i] + neg $carry # mov $carry,cf + mov 40($tptr),$A0[1] # t[2*i+2+1] # prefetch + adc %rax,$S[2] + mov 0($aptr,$i),%rax # a[i+1] # prefetch + mov $S[2],16($tptr) + adc %rdx,$S[3] + lea 16($i),$i + mov $S[3],24($tptr) + sbb $carry,$carry # mov cf,$carry + lea 64($tptr),$tptr + jmp .Lsqr4x_shift_n_add + +.align 32 +.Lsqr4x_shift_n_add: + lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift + shr \$63,$A0[0] + lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 | + shr \$63,$A0[1] + or $A0[0],$S[1] # | t[2*i]>>63 + mov -16($tptr),$A0[0] # t[2*i+2] # prefetch + mov $A0[1],$shift # shift=t[2*i+1]>>63 + mul %rax # a[i]*a[i] + neg $carry # mov $carry,cf + mov -8($tptr),$A0[1] # t[2*i+2+1] # prefetch + adc %rax,$S[0] + mov -8($aptr,$i),%rax # a[i+1] # prefetch + mov $S[0],-32($tptr) + adc %rdx,$S[1] + + lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift + mov $S[1],-24($tptr) + sbb $carry,$carry # mov cf,$carry + shr \$63,$A0[0] + lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 | + shr \$63,$A0[1] + or $A0[0],$S[3] # | t[2*i]>>63 + mov 0($tptr),$A0[0] # t[2*i+2] # prefetch + mov $A0[1],$shift # shift=t[2*i+1]>>63 + mul %rax # a[i]*a[i] + neg $carry # mov $carry,cf + mov 8($tptr),$A0[1] # t[2*i+2+1] # prefetch + adc %rax,$S[2] + mov 0($aptr,$i),%rax # a[i+1] # prefetch + mov $S[2],-16($tptr) + adc %rdx,$S[3] + + lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift + mov $S[3],-8($tptr) + sbb $carry,$carry # mov cf,$carry + shr \$63,$A0[0] + lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 | + shr \$63,$A0[1] + or $A0[0],$S[1] # | t[2*i]>>63 + mov 16($tptr),$A0[0] # t[2*i+2] # prefetch + mov $A0[1],$shift # shift=t[2*i+1]>>63 + mul %rax # a[i]*a[i] + neg $carry # mov $carry,cf + mov 24($tptr),$A0[1] # t[2*i+2+1] # prefetch + adc %rax,$S[0] + mov 8($aptr,$i),%rax # a[i+1] # prefetch + mov $S[0],0($tptr) + adc %rdx,$S[1] + + lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift + mov $S[1],8($tptr) + sbb $carry,$carry # mov cf,$carry + shr \$63,$A0[0] + lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 | + shr \$63,$A0[1] + or $A0[0],$S[3] # | t[2*i]>>63 + mov 32($tptr),$A0[0] # t[2*i+2] # prefetch + mov $A0[1],$shift # shift=t[2*i+1]>>63 + mul %rax # a[i]*a[i] + neg $carry # mov $carry,cf + mov 40($tptr),$A0[1] # t[2*i+2+1] # prefetch + adc %rax,$S[2] + mov 16($aptr,$i),%rax # a[i+1] # prefetch + mov $S[2],16($tptr) + adc %rdx,$S[3] + mov $S[3],24($tptr) + sbb $carry,$carry # mov cf,$carry + lea 64($tptr),$tptr + add \$32,$i + jnz .Lsqr4x_shift_n_add + + lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift + .byte 0x67 + shr \$63,$A0[0] + lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 | + shr \$63,$A0[1] + or $A0[0],$S[1] # | t[2*i]>>63 + mov -16($tptr),$A0[0] # t[2*i+2] # prefetch + mov $A0[1],$shift # shift=t[2*i+1]>>63 + mul %rax # a[i]*a[i] + neg $carry # mov $carry,cf + mov -8($tptr),$A0[1] # t[2*i+2+1] # prefetch + adc %rax,$S[0] + mov -8($aptr),%rax # a[i+1] # prefetch + mov $S[0],-32($tptr) + adc %rdx,$S[1] + + lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1|shift + mov $S[1],-24($tptr) + sbb $carry,$carry # mov cf,$carry + shr \$63,$A0[0] + lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 | + shr \$63,$A0[1] + or $A0[0],$S[3] # | t[2*i]>>63 + mul %rax # a[i]*a[i] + neg $carry # mov $carry,cf + adc %rax,$S[2] + adc %rdx,$S[3] + mov $S[2],-16($tptr) + mov $S[3],-8($tptr) +___ +} +###################################################################### +# Montgomery reduction part, "word-by-word" algorithm. +# +# This new path is inspired by multiple submissions from Intel, by +# Shay Gueron, Vlad Krasnov, Erdinc Ozturk, James Guilford, +# Vinodh Gopal... +{ +my ($nptr,$tptr,$carry,$m0)=("%rbp","%rdi","%rsi","%rbx"); - movdqu ($ap),%xmm1 - movdqa %xmm0,(%rsp) - movdqu %xmm1,($rp) - jmp .Lcopy4x -.align 16 -.Lcopy4x: # copy or in-place refresh - movdqu 16($ap,$i),%xmm2 - movdqu 32($ap,$i),%xmm1 - movdqa %xmm0,16(%rsp,$i) - movdqu %xmm2,16($rp,$i) - movdqa %xmm0,32(%rsp,$i) - movdqu %xmm1,32($rp,$i) - lea 32($i),$i - dec $j - jnz .Lcopy4x - - shl \$2,$num - movdqu 16($ap,$i),%xmm2 - movdqa %xmm0,16(%rsp,$i) - movdqu %xmm2,16($rp,$i) +$code.=<<___; + movq %xmm2,$nptr +sqr8x_reduction: + xor %rax,%rax + lea ($nptr,$num,2),%rcx # end of n[] + lea 48+8(%rsp,$num,2),%rdx # end of t[] buffer + mov %rcx,0+8(%rsp) + lea 48+8(%rsp,$num),$tptr # end of initial t[] window + mov %rdx,8+8(%rsp) + neg $num + jmp .L8x_reduction_loop + +.align 32 +.L8x_reduction_loop: + lea ($tptr,$num),$tptr # start of current t[] window + .byte 0x66 + mov 8*0($tptr),$m0 + mov 8*1($tptr),%r9 + mov 8*2($tptr),%r10 + mov 8*3($tptr),%r11 + mov 8*4($tptr),%r12 + mov 8*5($tptr),%r13 + mov 8*6($tptr),%r14 + mov 8*7($tptr),%r15 + mov %rax,(%rdx) # store top-most carry bit + lea 8*8($tptr),$tptr + + .byte 0x67 + mov $m0,%r8 + imulq 32+8(%rsp),$m0 # n0*a[0] + mov 16*0($nptr),%rax # n[0] + mov \$8,%ecx + jmp .L8x_reduce + +.align 32 +.L8x_reduce: + mulq $m0 + mov 16*1($nptr),%rax # n[1] + neg %r8 + mov %rdx,%r8 + adc \$0,%r8 + + mulq $m0 + add %rax,%r9 + mov 16*2($nptr),%rax + adc \$0,%rdx + add %r9,%r8 + mov $m0,48-8+8(%rsp,%rcx,8) # put aside n0*a[i] + mov %rdx,%r9 + adc \$0,%r9 + + mulq $m0 + add %rax,%r10 + mov 16*3($nptr),%rax + adc \$0,%rdx + add %r10,%r9 + mov 32+8(%rsp),$carry # pull n0, borrow $carry + mov %rdx,%r10 + adc \$0,%r10 + + mulq $m0 + add %rax,%r11 + mov 16*4($nptr),%rax + adc \$0,%rdx + imulq %r8,$carry # modulo-scheduled + add %r11,%r10 + mov %rdx,%r11 + adc \$0,%r11 + + mulq $m0 + add %rax,%r12 + mov 16*5($nptr),%rax + adc \$0,%rdx + add %r12,%r11 + mov %rdx,%r12 + adc \$0,%r12 + + mulq $m0 + add %rax,%r13 + mov 16*6($nptr),%rax + adc \$0,%rdx + add %r13,%r12 + mov %rdx,%r13 + adc \$0,%r13 + + mulq $m0 + add %rax,%r14 + mov 16*7($nptr),%rax + adc \$0,%rdx + add %r14,%r13 + mov %rdx,%r14 + adc \$0,%r14 + + mulq $m0 + mov $carry,$m0 # n0*a[i] + add %rax,%r15 + mov 16*0($nptr),%rax # n[0] + adc \$0,%rdx + add %r15,%r14 + mov %rdx,%r15 + adc \$0,%r15 + + dec %ecx + jnz .L8x_reduce + + lea 16*8($nptr),$nptr + xor %rax,%rax + mov 8+8(%rsp),%rdx # pull end of t[] + cmp 0+8(%rsp),$nptr # end of n[]? + jae .L8x_no_tail + + .byte 0x66 + add 8*0($tptr),%r8 + adc 8*1($tptr),%r9 + adc 8*2($tptr),%r10 + adc 8*3($tptr),%r11 + adc 8*4($tptr),%r12 + adc 8*5($tptr),%r13 + adc 8*6($tptr),%r14 + adc 8*7($tptr),%r15 + sbb $carry,$carry # top carry + + mov 48+56+8(%rsp),$m0 # pull n0*a[0] + mov \$8,%ecx + mov 16*0($nptr),%rax + jmp .L8x_tail + +.align 32 +.L8x_tail: + mulq $m0 + add %rax,%r8 + mov 16*1($nptr),%rax + mov %r8,($tptr) # save result + mov %rdx,%r8 + adc \$0,%r8 + + mulq $m0 + add %rax,%r9 + mov 16*2($nptr),%rax + adc \$0,%rdx + add %r9,%r8 + lea 8($tptr),$tptr # $tptr++ + mov %rdx,%r9 + adc \$0,%r9 + + mulq $m0 + add %rax,%r10 + mov 16*3($nptr),%rax + adc \$0,%rdx + add %r10,%r9 + mov %rdx,%r10 + adc \$0,%r10 + + mulq $m0 + add %rax,%r11 + mov 16*4($nptr),%rax + adc \$0,%rdx + add %r11,%r10 + mov %rdx,%r11 + adc \$0,%r11 + + mulq $m0 + add %rax,%r12 + mov 16*5($nptr),%rax + adc \$0,%rdx + add %r12,%r11 + mov %rdx,%r12 + adc \$0,%r12 + + mulq $m0 + add %rax,%r13 + mov 16*6($nptr),%rax + adc \$0,%rdx + add %r13,%r12 + mov %rdx,%r13 + adc \$0,%r13 + + mulq $m0 + add %rax,%r14 + mov 16*7($nptr),%rax + adc \$0,%rdx + add %r14,%r13 + mov %rdx,%r14 + adc \$0,%r14 + + mulq $m0 + mov 48-16+8(%rsp,%rcx,8),$m0# pull n0*a[i] + add %rax,%r15 + adc \$0,%rdx + add %r15,%r14 + mov 16*0($nptr),%rax # pull n[0] + mov %rdx,%r15 + adc \$0,%r15 + + dec %ecx + jnz .L8x_tail + + lea 16*8($nptr),$nptr + mov 8+8(%rsp),%rdx # pull end of t[] + cmp 0+8(%rsp),$nptr # end of n[]? + jae .L8x_tail_done # break out of loop + + mov 48+56+8(%rsp),$m0 # pull n0*a[0] + neg $carry + mov 8*0($nptr),%rax # pull n[0] + adc 8*0($tptr),%r8 + adc 8*1($tptr),%r9 + adc 8*2($tptr),%r10 + adc 8*3($tptr),%r11 + adc 8*4($tptr),%r12 + adc 8*5($tptr),%r13 + adc 8*6($tptr),%r14 + adc 8*7($tptr),%r15 + sbb $carry,$carry # top carry + + mov \$8,%ecx + jmp .L8x_tail + +.align 32 +.L8x_tail_done: + add (%rdx),%r8 # can this overflow? + xor %rax,%rax + + neg $carry +.L8x_no_tail: + adc 8*0($tptr),%r8 + adc 8*1($tptr),%r9 + adc 8*2($tptr),%r10 + adc 8*3($tptr),%r11 + adc 8*4($tptr),%r12 + adc 8*5($tptr),%r13 + adc 8*6($tptr),%r14 + adc 8*7($tptr),%r15 + adc \$0,%rax # top-most carry + mov -16($nptr),%rcx # np[num-1] + xor $carry,$carry + + movq %xmm2,$nptr # restore $nptr + + mov %r8,8*0($tptr) # store top 512 bits + mov %r9,8*1($tptr) + movq %xmm3,$num # $num is %r9, can't be moved upwards + mov %r10,8*2($tptr) + mov %r11,8*3($tptr) + mov %r12,8*4($tptr) + mov %r13,8*5($tptr) + mov %r14,8*6($tptr) + mov %r15,8*7($tptr) + lea 8*8($tptr),$tptr + + cmp %rdx,$tptr # end of t[]? + jb .L8x_reduction_loop +___ +} +############################################################## +# Post-condition, 4x unrolled +# +{ +my ($tptr,$nptr)=("%rbx","%rbp"); +$code.=<<___; + #xor %rsi,%rsi # %rsi was $carry above + sub %r15,%rcx # compare top-most words + lea (%rdi,$num),$tptr # %rdi was $tptr above + adc %rsi,%rsi + mov $num,%rcx + or %rsi,%rax + movq %xmm1,$rptr # restore $rptr + xor \$1,%rax + movq %xmm1,$aptr # prepare for back-to-back call + lea ($nptr,%rax,8),$nptr + sar \$3+2,%rcx # cf=0 + jmp .Lsqr4x_sub + +.align 32 +.Lsqr4x_sub: + .byte 0x66 + mov 8*0($tptr),%r12 + mov 8*1($tptr),%r13 + sbb 16*0($nptr),%r12 + mov 8*2($tptr),%r14 + sbb 16*1($nptr),%r13 + mov 8*3($tptr),%r15 + lea 8*4($tptr),$tptr + sbb 16*2($nptr),%r14 + mov %r12,8*0($rptr) + sbb 16*3($nptr),%r15 + lea 16*4($nptr),$nptr + mov %r13,8*1($rptr) + mov %r14,8*2($rptr) + mov %r15,8*3($rptr) + lea 8*4($rptr),$rptr + + inc %rcx # pass %cf + jnz .Lsqr4x_sub ___ } $code.=<<___; - mov 8(%rsp,$num,8),%rsi # restore %rsp - mov \$1,%rax + mov $num,%r10 # prepare for back-to-back call + neg $num # restore $num + ret +.size bn_sqr8x_internal,.-bn_sqr8x_internal +___ +{ +$code.=<<___; +.globl bn_from_montgomery +.type bn_from_montgomery,\@abi-omnipotent +.align 32 +bn_from_montgomery: + testl \$7,`($win64?"48(%rsp)":"%r9d")` + jz bn_from_mont8x + xor %eax,%eax + ret +.size bn_from_montgomery,.-bn_from_montgomery + +.type bn_from_mont8x,\@function,6 +.align 32 +bn_from_mont8x: + .byte 0x67 + mov %rsp,%rax + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 ___ $code.=<<___ if ($win64); - movaps (%rsi),%xmm6 - movaps 0x10(%rsi),%xmm7 - lea 0x28(%rsi),%rsi + lea -0x28(%rsp),%rsp + movaps %xmm6,(%rsp) + movaps %xmm7,0x10(%rsp) ___ $code.=<<___; - mov (%rsi),%r15 - mov 8(%rsi),%r14 - mov 16(%rsi),%r13 - mov 24(%rsi),%r12 - mov 32(%rsi),%rbp - mov 40(%rsi),%rbx - lea 48(%rsi),%rsp -.Lmul4x_epilogue: + .byte 0x67 + mov ${num}d,%r10d + shl \$3,${num}d # convert $num to bytes + shl \$3+2,%r10d # 4*$num + neg $num + mov ($n0),$n0 # *n0 + + ############################################################## + # ensure that stack frame doesn't alias with $aptr+4*$num + # modulo 4096, which covers ret[num], am[num] and n[2*num] + # (see bn_exp.c). this is done to allow memory disambiguation + # logic do its magic. + # + lea -64(%rsp,$num,2),%r11 + sub $aptr,%r11 + and \$4095,%r11 + cmp %r11,%r10 + jb .Lfrom_sp_alt + sub %r11,%rsp # align with $aptr + lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num) + jmp .Lfrom_sp_done + +.align 32 +.Lfrom_sp_alt: + lea 4096-64(,$num,2),%r10 # 4096-frame-2*$num + lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num) + sub %r10,%r11 + mov \$0,%r10 + cmovc %r10,%r11 + sub %r11,%rsp +.Lfrom_sp_done: + and \$-64,%rsp + mov $num,%r10 + neg $num + + ############################################################## + # Stack layout + # + # +0 saved $num, used in reduction section + # +8 &t[2*$num], used in reduction section + # +32 saved *n0 + # +40 saved %rsp + # +48 t[2*$num] + # + mov $n0, 32(%rsp) + mov %rax, 40(%rsp) # save original %rsp +.Lfrom_body: + mov $num,%r11 + lea 48(%rsp),%rax + pxor %xmm0,%xmm0 + jmp .Lmul_by_1 + +.align 32 +.Lmul_by_1: + movdqu ($aptr),%xmm1 + movdqu 16($aptr),%xmm2 + movdqu 32($aptr),%xmm3 + movdqa %xmm0,(%rax,$num) + movdqu 48($aptr),%xmm4 + movdqa %xmm0,16(%rax,$num) + .byte 0x48,0x8d,0xb6,0x40,0x00,0x00,0x00 # lea 64($aptr),$aptr + movdqa %xmm1,(%rax) + movdqa %xmm0,32(%rax,$num) + movdqa %xmm2,16(%rax) + movdqa %xmm0,48(%rax,$num) + movdqa %xmm3,32(%rax) + movdqa %xmm4,48(%rax) + lea 64(%rax),%rax + sub \$64,%r11 + jnz .Lmul_by_1 + + movq $rptr,%xmm1 + movq $nptr,%xmm2 + .byte 0x67 + mov $nptr,%rbp + movq %r10, %xmm3 # -num +___ +$code.=<<___ if ($addx); + mov OPENSSL_ia32cap_P+8(%rip),%r11d + and \$0x80100,%r11d + cmp \$0x80100,%r11d + jne .Lfrom_mont_nox + + lea (%rax,$num),$rptr + call sqrx8x_reduction + + pxor %xmm0,%xmm0 + lea 48(%rsp),%rax + mov 40(%rsp),%rsi # restore %rsp + jmp .Lfrom_mont_zero + +.align 32 +.Lfrom_mont_nox: +___ +$code.=<<___; + call sqr8x_reduction + + pxor %xmm0,%xmm0 + lea 48(%rsp),%rax + mov 40(%rsp),%rsi # restore %rsp + jmp .Lfrom_mont_zero + +.align 32 +.Lfrom_mont_zero: + movdqa %xmm0,16*0(%rax) + movdqa %xmm0,16*1(%rax) + movdqa %xmm0,16*2(%rax) + movdqa %xmm0,16*3(%rax) + lea 16*4(%rax),%rax + sub \$32,$num + jnz .Lfrom_mont_zero + + mov \$1,%rax + mov -48(%rsi),%r15 + mov -40(%rsi),%r14 + mov -32(%rsi),%r13 + mov -24(%rsi),%r12 + mov -16(%rsi),%rbp + mov -8(%rsi),%rbx + lea (%rsi),%rsp +.Lfrom_epilogue: ret -.size bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5 +.size bn_from_mont8x,.-bn_from_mont8x ___ +} }}} + +if ($addx) {{{ +my $bp="%rdx"; # restore original value + +$code.=<<___; +.type bn_mulx4x_mont_gather5,\@function,6 +.align 32 +bn_mulx4x_mont_gather5: +.Lmulx4x_enter: + .byte 0x67 + mov %rsp,%rax + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 +___ +$code.=<<___ if ($win64); + lea -0x28(%rsp),%rsp + movaps %xmm6,(%rsp) + movaps %xmm7,0x10(%rsp) +___ +$code.=<<___; + .byte 0x67 + mov ${num}d,%r10d + shl \$3,${num}d # convert $num to bytes + shl \$3+2,%r10d # 4*$num + neg $num # -$num + mov ($n0),$n0 # *n0 + + ############################################################## + # ensure that stack frame doesn't alias with $aptr+4*$num + # modulo 4096, which covers a[num], ret[num] and n[2*num] + # (see bn_exp.c). this is done to allow memory disambiguation + # logic do its magic. [excessive frame is allocated in order + # to allow bn_from_mont8x to clear it.] + # + lea -64(%rsp,$num,2),%r11 + sub $ap,%r11 + and \$4095,%r11 + cmp %r11,%r10 + jb .Lmulx4xsp_alt + sub %r11,%rsp # align with $aptr + lea -64(%rsp,$num,2),%rsp # alloca(frame+$num) + jmp .Lmulx4xsp_done + +.align 32 +.Lmulx4xsp_alt: + lea 4096-64(,$num,2),%r10 # 4096-frame-$num + lea -64(%rsp,$num,2),%rsp # alloca(frame+$num) + sub %r10,%r11 + mov \$0,%r10 + cmovc %r10,%r11 + sub %r11,%rsp +.Lmulx4xsp_done: + and \$-64,%rsp # ensure alignment + ############################################################## + # Stack layout + # +0 -num + # +8 off-loaded &b[i] + # +16 end of b[num] + # +24 inner counter + # +32 saved n0 + # +40 saved %rsp + # +48 + # +56 saved rp + # +64 tmp[num+1] + # + mov $n0, 32(%rsp) # save *n0 + mov %rax,40(%rsp) # save original %rsp +.Lmulx4x_body: + call mulx4x_internal + + mov 40(%rsp),%rsi # restore %rsp + mov \$1,%rax +___ +$code.=<<___ if ($win64); + movaps -88(%rsi),%xmm6 + movaps -72(%rsi),%xmm7 +___ +$code.=<<___; + mov -48(%rsi),%r15 + mov -40(%rsi),%r14 + mov -32(%rsi),%r13 + mov -24(%rsi),%r12 + mov -16(%rsi),%rbp + mov -8(%rsi),%rbx + lea (%rsi),%rsp +.Lmulx4x_epilogue: + ret +.size bn_mulx4x_mont_gather5,.-bn_mulx4x_mont_gather5 + +.type mulx4x_internal,\@abi-omnipotent +.align 32 +mulx4x_internal: + .byte 0x4c,0x89,0x8c,0x24,0x08,0x00,0x00,0x00 # mov $num,8(%rsp) # save -$num + .byte 0x67 + neg $num # restore $num + shl \$5,$num + lea 256($bp,$num),%r13 + shr \$5+5,$num + mov `($win64?56:8)`(%rax),%r10d # load 7th argument + sub \$1,$num + mov %r13,16+8(%rsp) # end of b[num] + mov $num,24+8(%rsp) # inner counter + mov $rp, 56+8(%rsp) # save $rp +___ +my ($aptr, $bptr, $nptr, $tptr, $mi, $bi, $zero, $num)= + ("%rsi","%rdi","%rcx","%rbx","%r8","%r9","%rbp","%rax"); +my $rptr=$bptr; +my $STRIDE=2**5*8; # 5 is "window size" +my $N=$STRIDE/4; # should match cache line size +$code.=<<___; + mov %r10,%r11 + shr \$`log($N/8)/log(2)`,%r10 + and \$`$N/8-1`,%r11 + not %r10 + lea .Lmagic_masks(%rip),%rax + and \$`2**5/($N/8)-1`,%r10 # 5 is "window size" + lea 96($bp,%r11,8),$bptr # pointer within 1st cache line + movq 0(%rax,%r10,8),%xmm4 # set of masks denoting which + movq 8(%rax,%r10,8),%xmm5 # cache line contains element + add \$7,%r11 + movq 16(%rax,%r10,8),%xmm6 # denoted by 7th argument + movq 24(%rax,%r10,8),%xmm7 + and \$7,%r11 + + movq `0*$STRIDE/4-96`($bptr),%xmm0 + lea $STRIDE($bptr),$tptr # borrow $tptr + movq `1*$STRIDE/4-96`($bptr),%xmm1 + pand %xmm4,%xmm0 + movq `2*$STRIDE/4-96`($bptr),%xmm2 + pand %xmm5,%xmm1 + movq `3*$STRIDE/4-96`($bptr),%xmm3 + pand %xmm6,%xmm2 + por %xmm1,%xmm0 + movq `0*$STRIDE/4-96`($tptr),%xmm1 + pand %xmm7,%xmm3 + por %xmm2,%xmm0 + movq `1*$STRIDE/4-96`($tptr),%xmm2 + por %xmm3,%xmm0 + .byte 0x67,0x67 + pand %xmm4,%xmm1 + movq `2*$STRIDE/4-96`($tptr),%xmm3 + + movq %xmm0,%rdx # bp[0] + movq `3*$STRIDE/4-96`($tptr),%xmm0 + lea 2*$STRIDE($bptr),$bptr # next &b[i] + pand %xmm5,%xmm2 + .byte 0x67,0x67 + pand %xmm6,%xmm3 + ############################################################## + # $tptr is chosen so that writing to top-most element of the + # vector occurs just "above" references to powers table, + # "above" modulo cache-line size, which effectively precludes + # possibility of memory disambiguation logic failure when + # accessing the table. + # + lea 64+8*4+8(%rsp,%r11,8),$tptr + + mov %rdx,$bi + mulx 0*8($aptr),$mi,%rax # a[0]*b[0] + mulx 1*8($aptr),%r11,%r12 # a[1]*b[0] + add %rax,%r11 + mulx 2*8($aptr),%rax,%r13 # ... + adc %rax,%r12 + adc \$0,%r13 + mulx 3*8($aptr),%rax,%r14 + + mov $mi,%r15 + imulq 32+8(%rsp),$mi # "t[0]"*n0 + xor $zero,$zero # cf=0, of=0 + mov $mi,%rdx + + por %xmm2,%xmm1 + pand %xmm7,%xmm0 + por %xmm3,%xmm1 + mov $bptr,8+8(%rsp) # off-load &b[i] + por %xmm1,%xmm0 + + .byte 0x48,0x8d,0xb6,0x20,0x00,0x00,0x00 # lea 4*8($aptr),$aptr + adcx %rax,%r13 + adcx $zero,%r14 # cf=0 + + mulx 0*16($nptr),%rax,%r10 + adcx %rax,%r15 # discarded + adox %r11,%r10 + mulx 1*16($nptr),%rax,%r11 + adcx %rax,%r10 + adox %r12,%r11 + mulx 2*16($nptr),%rax,%r12 + mov 24+8(%rsp),$bptr # counter value + .byte 0x66 + mov %r10,-8*4($tptr) + adcx %rax,%r11 + adox %r13,%r12 + mulx 3*16($nptr),%rax,%r15 + .byte 0x67,0x67 + mov $bi,%rdx + mov %r11,-8*3($tptr) + adcx %rax,%r12 + adox $zero,%r15 # of=0 + .byte 0x48,0x8d,0x89,0x40,0x00,0x00,0x00 # lea 4*16($nptr),$nptr + mov %r12,-8*2($tptr) + #jmp .Lmulx4x_1st + +.align 32 +.Lmulx4x_1st: + adcx $zero,%r15 # cf=0, modulo-scheduled + mulx 0*8($aptr),%r10,%rax # a[4]*b[0] + adcx %r14,%r10 + mulx 1*8($aptr),%r11,%r14 # a[5]*b[0] + adcx %rax,%r11 + mulx 2*8($aptr),%r12,%rax # ... + adcx %r14,%r12 + mulx 3*8($aptr),%r13,%r14 + .byte 0x67,0x67 + mov $mi,%rdx + adcx %rax,%r13 + adcx $zero,%r14 # cf=0 + lea 4*8($aptr),$aptr + lea 4*8($tptr),$tptr + + adox %r15,%r10 + mulx 0*16($nptr),%rax,%r15 + adcx %rax,%r10 + adox %r15,%r11 + mulx 1*16($nptr),%rax,%r15 + adcx %rax,%r11 + adox %r15,%r12 + mulx 2*16($nptr),%rax,%r15 + mov %r10,-5*8($tptr) + adcx %rax,%r12 + mov %r11,-4*8($tptr) + adox %r15,%r13 + mulx 3*16($nptr),%rax,%r15 + mov $bi,%rdx + mov %r12,-3*8($tptr) + adcx %rax,%r13 + adox $zero,%r15 + lea 4*16($nptr),$nptr + mov %r13,-2*8($tptr) + + dec $bptr # of=0, pass cf + jnz .Lmulx4x_1st + + mov 8(%rsp),$num # load -num + movq %xmm0,%rdx # bp[1] + adc $zero,%r15 # modulo-scheduled + lea ($aptr,$num),$aptr # rewind $aptr + add %r15,%r14 + mov 8+8(%rsp),$bptr # re-load &b[i] + adc $zero,$zero # top-most carry + mov %r14,-1*8($tptr) + jmp .Lmulx4x_outer + +.align 32 +.Lmulx4x_outer: + mov $zero,($tptr) # save top-most carry + lea 4*8($tptr,$num),$tptr # rewind $tptr + mulx 0*8($aptr),$mi,%r11 # a[0]*b[i] + xor $zero,$zero # cf=0, of=0 + mov %rdx,$bi + mulx 1*8($aptr),%r14,%r12 # a[1]*b[i] + adox -4*8($tptr),$mi # +t[0] + adcx %r14,%r11 + mulx 2*8($aptr),%r15,%r13 # ... + adox -3*8($tptr),%r11 + adcx %r15,%r12 + mulx 3*8($aptr),%rdx,%r14 + adox -2*8($tptr),%r12 + adcx %rdx,%r13 + lea ($nptr,$num,2),$nptr # rewind $nptr + lea 4*8($aptr),$aptr + adox -1*8($tptr),%r13 + adcx $zero,%r14 + adox $zero,%r14 + + .byte 0x67 + mov $mi,%r15 + imulq 32+8(%rsp),$mi # "t[0]"*n0 + + movq `0*$STRIDE/4-96`($bptr),%xmm0 + .byte 0x67,0x67 + mov $mi,%rdx + movq `1*$STRIDE/4-96`($bptr),%xmm1 + .byte 0x67 + pand %xmm4,%xmm0 + movq `2*$STRIDE/4-96`($bptr),%xmm2 + .byte 0x67 + pand %xmm5,%xmm1 + movq `3*$STRIDE/4-96`($bptr),%xmm3 + add \$$STRIDE,$bptr # next &b[i] + .byte 0x67 + pand %xmm6,%xmm2 + por %xmm1,%xmm0 + pand %xmm7,%xmm3 + xor $zero,$zero # cf=0, of=0 + mov $bptr,8+8(%rsp) # off-load &b[i] + + mulx 0*16($nptr),%rax,%r10 + adcx %rax,%r15 # discarded + adox %r11,%r10 + mulx 1*16($nptr),%rax,%r11 + adcx %rax,%r10 + adox %r12,%r11 + mulx 2*16($nptr),%rax,%r12 + adcx %rax,%r11 + adox %r13,%r12 + mulx 3*16($nptr),%rax,%r15 + mov $bi,%rdx + por %xmm2,%xmm0 + mov 24+8(%rsp),$bptr # counter value + mov %r10,-8*4($tptr) + por %xmm3,%xmm0 + adcx %rax,%r12 + mov %r11,-8*3($tptr) + adox $zero,%r15 # of=0 + mov %r12,-8*2($tptr) + lea 4*16($nptr),$nptr + jmp .Lmulx4x_inner + +.align 32 +.Lmulx4x_inner: + mulx 0*8($aptr),%r10,%rax # a[4]*b[i] + adcx $zero,%r15 # cf=0, modulo-scheduled + adox %r14,%r10 + mulx 1*8($aptr),%r11,%r14 # a[5]*b[i] + adcx 0*8($tptr),%r10 + adox %rax,%r11 + mulx 2*8($aptr),%r12,%rax # ... + adcx 1*8($tptr),%r11 + adox %r14,%r12 + mulx 3*8($aptr),%r13,%r14 + mov $mi,%rdx + adcx 2*8($tptr),%r12 + adox %rax,%r13 + adcx 3*8($tptr),%r13 + adox $zero,%r14 # of=0 + lea 4*8($aptr),$aptr + lea 4*8($tptr),$tptr + adcx $zero,%r14 # cf=0 + + adox %r15,%r10 + mulx 0*16($nptr),%rax,%r15 + adcx %rax,%r10 + adox %r15,%r11 + mulx 1*16($nptr),%rax,%r15 + adcx %rax,%r11 + adox %r15,%r12 + mulx 2*16($nptr),%rax,%r15 + mov %r10,-5*8($tptr) + adcx %rax,%r12 + adox %r15,%r13 + mov %r11,-4*8($tptr) + mulx 3*16($nptr),%rax,%r15 + mov $bi,%rdx + lea 4*16($nptr),$nptr + mov %r12,-3*8($tptr) + adcx %rax,%r13 + adox $zero,%r15 + mov %r13,-2*8($tptr) + + dec $bptr # of=0, pass cf + jnz .Lmulx4x_inner + + mov 0+8(%rsp),$num # load -num + movq %xmm0,%rdx # bp[i+1] + adc $zero,%r15 # modulo-scheduled + sub 0*8($tptr),$bptr # pull top-most carry to %cf + mov 8+8(%rsp),$bptr # re-load &b[i] + mov 16+8(%rsp),%r10 + adc %r15,%r14 + lea ($aptr,$num),$aptr # rewind $aptr + adc $zero,$zero # top-most carry + mov %r14,-1*8($tptr) + + cmp %r10,$bptr + jb .Lmulx4x_outer + + mov -16($nptr),%r10 + xor %r15,%r15 + sub %r14,%r10 # compare top-most words + adc %r15,%r15 + or %r15,$zero + xor \$1,$zero + lea ($tptr,$num),%rdi # rewind $tptr + lea ($nptr,$num,2),$nptr # rewind $nptr + .byte 0x67,0x67 + sar \$3+2,$num # cf=0 + lea ($nptr,$zero,8),%rbp + mov 56+8(%rsp),%rdx # restore rp + mov $num,%rcx + jmp .Lsqrx4x_sub # common post-condition +.size mulx4x_internal,.-mulx4x_internal +___ +}{ +###################################################################### +# void bn_power5( +my $rptr="%rdi"; # BN_ULONG *rptr, +my $aptr="%rsi"; # const BN_ULONG *aptr, +my $bptr="%rdx"; # const void *table, +my $nptr="%rcx"; # const BN_ULONG *nptr, +my $n0 ="%r8"; # const BN_ULONG *n0); +my $num ="%r9"; # int num, has to be divisible by 8 + # int pwr); + +my ($i,$j,$tptr)=("%rbp","%rcx",$rptr); +my @A0=("%r10","%r11"); +my @A1=("%r12","%r13"); +my ($a0,$a1,$ai)=("%r14","%r15","%rbx"); + +$code.=<<___; +.type bn_powerx5,\@function,6 +.align 32 +bn_powerx5: +.Lpowerx5_enter: + .byte 0x67 + mov %rsp,%rax + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 +___ +$code.=<<___ if ($win64); + lea -0x28(%rsp),%rsp + movaps %xmm6,(%rsp) + movaps %xmm7,0x10(%rsp) +___ +$code.=<<___; + .byte 0x67 + mov ${num}d,%r10d + shl \$3,${num}d # convert $num to bytes + shl \$3+2,%r10d # 4*$num + neg $num + mov ($n0),$n0 # *n0 + + ############################################################## + # ensure that stack frame doesn't alias with $aptr+4*$num + # modulo 4096, which covers ret[num], am[num] and n[2*num] + # (see bn_exp.c). this is done to allow memory disambiguation + # logic do its magic. + # + lea -64(%rsp,$num,2),%r11 + sub $aptr,%r11 + and \$4095,%r11 + cmp %r11,%r10 + jb .Lpwrx_sp_alt + sub %r11,%rsp # align with $aptr + lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num) + jmp .Lpwrx_sp_done + +.align 32 +.Lpwrx_sp_alt: + lea 4096-64(,$num,2),%r10 # 4096-frame-2*$num + lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num) + sub %r10,%r11 + mov \$0,%r10 + cmovc %r10,%r11 + sub %r11,%rsp +.Lpwrx_sp_done: + and \$-64,%rsp + mov $num,%r10 + neg $num + + ############################################################## + # Stack layout + # + # +0 saved $num, used in reduction section + # +8 &t[2*$num], used in reduction section + # +16 intermediate carry bit + # +24 top-most carry bit, used in reduction section + # +32 saved *n0 + # +40 saved %rsp + # +48 t[2*$num] + # + pxor %xmm0,%xmm0 + movq $rptr,%xmm1 # save $rptr + movq $nptr,%xmm2 # save $nptr + movq %r10, %xmm3 # -$num + movq $bptr,%xmm4 + mov $n0, 32(%rsp) + mov %rax, 40(%rsp) # save original %rsp +.Lpowerx5_body: + + call __bn_sqrx8x_internal + call __bn_sqrx8x_internal + call __bn_sqrx8x_internal + call __bn_sqrx8x_internal + call __bn_sqrx8x_internal + + mov %r10,$num # -num + mov $aptr,$rptr + movq %xmm2,$nptr + movq %xmm4,$bptr + mov 40(%rsp),%rax + + call mulx4x_internal + + mov 40(%rsp),%rsi # restore %rsp + mov \$1,%rax +___ +$code.=<<___ if ($win64); + movaps -88(%rsi),%xmm6 + movaps -72(%rsi),%xmm7 +___ +$code.=<<___; + mov -48(%rsi),%r15 + mov -40(%rsi),%r14 + mov -32(%rsi),%r13 + mov -24(%rsi),%r12 + mov -16(%rsi),%rbp + mov -8(%rsi),%rbx + lea (%rsi),%rsp +.Lpowerx5_epilogue: + ret +.size bn_powerx5,.-bn_powerx5 + +.globl bn_sqrx8x_internal +.hidden bn_sqrx8x_internal +.type bn_sqrx8x_internal,\@abi-omnipotent +.align 32 +bn_sqrx8x_internal: +__bn_sqrx8x_internal: + ################################################################## + # Squaring part: + # + # a) multiply-n-add everything but a[i]*a[i]; + # b) shift result of a) by 1 to the left and accumulate + # a[i]*a[i] products; + # + ################################################################## + # a[7]a[7]a[6]a[6]a[5]a[5]a[4]a[4]a[3]a[3]a[2]a[2]a[1]a[1]a[0]a[0] + # a[1]a[0] + # a[2]a[0] + # a[3]a[0] + # a[2]a[1] + # a[3]a[1] + # a[3]a[2] + # + # a[4]a[0] + # a[5]a[0] + # a[6]a[0] + # a[7]a[0] + # a[4]a[1] + # a[5]a[1] + # a[6]a[1] + # a[7]a[1] + # a[4]a[2] + # a[5]a[2] + # a[6]a[2] + # a[7]a[2] + # a[4]a[3] + # a[5]a[3] + # a[6]a[3] + # a[7]a[3] + # + # a[5]a[4] + # a[6]a[4] + # a[7]a[4] + # a[6]a[5] + # a[7]a[5] + # a[7]a[6] + # a[7]a[7]a[6]a[6]a[5]a[5]a[4]a[4]a[3]a[3]a[2]a[2]a[1]a[1]a[0]a[0] +___ +{ +my ($zero,$carry)=("%rbp","%rcx"); +my $aaptr=$zero; +$code.=<<___; + lea 48+8(%rsp),$tptr + lea ($aptr,$num),$aaptr + mov $num,0+8(%rsp) # save $num + mov $aaptr,8+8(%rsp) # save end of $aptr + jmp .Lsqr8x_zero_start + +.align 32 +.byte 0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00 +.Lsqrx8x_zero: + .byte 0x3e + movdqa %xmm0,0*8($tptr) + movdqa %xmm0,2*8($tptr) + movdqa %xmm0,4*8($tptr) + movdqa %xmm0,6*8($tptr) +.Lsqr8x_zero_start: # aligned at 32 + movdqa %xmm0,8*8($tptr) + movdqa %xmm0,10*8($tptr) + movdqa %xmm0,12*8($tptr) + movdqa %xmm0,14*8($tptr) + lea 16*8($tptr),$tptr + sub \$64,$num + jnz .Lsqrx8x_zero + + mov 0*8($aptr),%rdx # a[0], modulo-scheduled + #xor %r9,%r9 # t[1], ex-$num, zero already + xor %r10,%r10 + xor %r11,%r11 + xor %r12,%r12 + xor %r13,%r13 + xor %r14,%r14 + xor %r15,%r15 + lea 48+8(%rsp),$tptr + xor $zero,$zero # cf=0, cf=0 + jmp .Lsqrx8x_outer_loop + +.align 32 +.Lsqrx8x_outer_loop: + mulx 1*8($aptr),%r8,%rax # a[1]*a[0] + adcx %r9,%r8 # a[1]*a[0]+=t[1] + adox %rax,%r10 + mulx 2*8($aptr),%r9,%rax # a[2]*a[0] + adcx %r10,%r9 + adox %rax,%r11 + .byte 0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00 # mulx 3*8($aptr),%r10,%rax # ... + adcx %r11,%r10 + adox %rax,%r12 + .byte 0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00 # mulx 4*8($aptr),%r11,%rax + adcx %r12,%r11 + adox %rax,%r13 + mulx 5*8($aptr),%r12,%rax + adcx %r13,%r12 + adox %rax,%r14 + mulx 6*8($aptr),%r13,%rax + adcx %r14,%r13 + adox %r15,%rax + mulx 7*8($aptr),%r14,%r15 + mov 1*8($aptr),%rdx # a[1] + adcx %rax,%r14 + adox $zero,%r15 + adc 8*8($tptr),%r15 + mov %r8,1*8($tptr) # t[1] + mov %r9,2*8($tptr) # t[2] + sbb $carry,$carry # mov %cf,$carry + xor $zero,$zero # cf=0, of=0 + + + mulx 2*8($aptr),%r8,%rbx # a[2]*a[1] + mulx 3*8($aptr),%r9,%rax # a[3]*a[1] + adcx %r10,%r8 + adox %rbx,%r9 + mulx 4*8($aptr),%r10,%rbx # ... + adcx %r11,%r9 + adox %rax,%r10 + .byte 0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00 # mulx 5*8($aptr),%r11,%rax + adcx %r12,%r10 + adox %rbx,%r11 + .byte 0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00 # mulx 6*8($aptr),%r12,%rbx + adcx %r13,%r11 + adox %r14,%r12 + .byte 0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00 # mulx 7*8($aptr),%r13,%r14 + mov 2*8($aptr),%rdx # a[2] + adcx %rax,%r12 + adox %rbx,%r13 + adcx %r15,%r13 + adox $zero,%r14 # of=0 + adcx $zero,%r14 # cf=0 + + mov %r8,3*8($tptr) # t[3] + mov %r9,4*8($tptr) # t[4] + + mulx 3*8($aptr),%r8,%rbx # a[3]*a[2] + mulx 4*8($aptr),%r9,%rax # a[4]*a[2] + adcx %r10,%r8 + adox %rbx,%r9 + mulx 5*8($aptr),%r10,%rbx # ... + adcx %r11,%r9 + adox %rax,%r10 + .byte 0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00 # mulx 6*8($aptr),%r11,%rax + adcx %r12,%r10 + adox %r13,%r11 + .byte 0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00 # mulx 7*8($aptr),%r12,%r13 + .byte 0x3e + mov 3*8($aptr),%rdx # a[3] + adcx %rbx,%r11 + adox %rax,%r12 + adcx %r14,%r12 + mov %r8,5*8($tptr) # t[5] + mov %r9,6*8($tptr) # t[6] + mulx 4*8($aptr),%r8,%rax # a[4]*a[3] + adox $zero,%r13 # of=0 + adcx $zero,%r13 # cf=0 + + mulx 5*8($aptr),%r9,%rbx # a[5]*a[3] + adcx %r10,%r8 + adox %rax,%r9 + mulx 6*8($aptr),%r10,%rax # ... + adcx %r11,%r9 + adox %r12,%r10 + mulx 7*8($aptr),%r11,%r12 + mov 4*8($aptr),%rdx # a[4] + mov 5*8($aptr),%r14 # a[5] + adcx %rbx,%r10 + adox %rax,%r11 + mov 6*8($aptr),%r15 # a[6] + adcx %r13,%r11 + adox $zero,%r12 # of=0 + adcx $zero,%r12 # cf=0 + + mov %r8,7*8($tptr) # t[7] + mov %r9,8*8($tptr) # t[8] + + mulx %r14,%r9,%rax # a[5]*a[4] + mov 7*8($aptr),%r8 # a[7] + adcx %r10,%r9 + mulx %r15,%r10,%rbx # a[6]*a[4] + adox %rax,%r10 + adcx %r11,%r10 + mulx %r8,%r11,%rax # a[7]*a[4] + mov %r14,%rdx # a[5] + adox %rbx,%r11 + adcx %r12,%r11 + #adox $zero,%rax # of=0 + adcx $zero,%rax # cf=0 + + mulx %r15,%r14,%rbx # a[6]*a[5] + mulx %r8,%r12,%r13 # a[7]*a[5] + mov %r15,%rdx # a[6] + lea 8*8($aptr),$aptr + adcx %r14,%r11 + adox %rbx,%r12 + adcx %rax,%r12 + adox $zero,%r13 + + .byte 0x67,0x67 + mulx %r8,%r8,%r14 # a[7]*a[6] + adcx %r8,%r13 + adcx $zero,%r14 + + cmp 8+8(%rsp),$aptr + je .Lsqrx8x_outer_break + + neg $carry # mov $carry,%cf + mov \$-8,%rcx + mov $zero,%r15 + mov 8*8($tptr),%r8 + adcx 9*8($tptr),%r9 # +=t[9] + adcx 10*8($tptr),%r10 # ... + adcx 11*8($tptr),%r11 + adc 12*8($tptr),%r12 + adc 13*8($tptr),%r13 + adc 14*8($tptr),%r14 + adc 15*8($tptr),%r15 + lea ($aptr),$aaptr + lea 2*64($tptr),$tptr + sbb %rax,%rax # mov %cf,$carry + + mov -64($aptr),%rdx # a[0] + mov %rax,16+8(%rsp) # offload $carry + mov $tptr,24+8(%rsp) + + #lea 8*8($tptr),$tptr # see 2*8*8($tptr) above + xor %eax,%eax # cf=0, of=0 + jmp .Lsqrx8x_loop + +.align 32 +.Lsqrx8x_loop: + mov %r8,%rbx + mulx 0*8($aaptr),%rax,%r8 # a[8]*a[i] + adcx %rax,%rbx # +=t[8] + adox %r9,%r8 + + mulx 1*8($aaptr),%rax,%r9 # ... + adcx %rax,%r8 + adox %r10,%r9 + + mulx 2*8($aaptr),%rax,%r10 + adcx %rax,%r9 + adox %r11,%r10 + + mulx 3*8($aaptr),%rax,%r11 + adcx %rax,%r10 + adox %r12,%r11 + + .byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 4*8($aaptr),%rax,%r12 + adcx %rax,%r11 + adox %r13,%r12 + + mulx 5*8($aaptr),%rax,%r13 + adcx %rax,%r12 + adox %r14,%r13 + + mulx 6*8($aaptr),%rax,%r14 + mov %rbx,($tptr,%rcx,8) # store t[8+i] + mov \$0,%ebx + adcx %rax,%r13 + adox %r15,%r14 + + .byte 0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00 # mulx 7*8($aaptr),%rax,%r15 + mov 8($aptr,%rcx,8),%rdx # a[i] + adcx %rax,%r14 + adox %rbx,%r15 # %rbx is 0, of=0 + adcx %rbx,%r15 # cf=0 + + .byte 0x67 + inc %rcx # of=0 + jnz .Lsqrx8x_loop + + lea 8*8($aaptr),$aaptr + mov \$-8,%rcx + cmp 8+8(%rsp),$aaptr # done? + je .Lsqrx8x_break + + sub 16+8(%rsp),%rbx # mov 16(%rsp),%cf + .byte 0x66 + mov -64($aptr),%rdx + adcx 0*8($tptr),%r8 + adcx 1*8($tptr),%r9 + adc 2*8($tptr),%r10 + adc 3*8($tptr),%r11 + adc 4*8($tptr),%r12 + adc 5*8($tptr),%r13 + adc 6*8($tptr),%r14 + adc 7*8($tptr),%r15 + lea 8*8($tptr),$tptr + .byte 0x67 + sbb %rax,%rax # mov %cf,%rax + xor %ebx,%ebx # cf=0, of=0 + mov %rax,16+8(%rsp) # offload carry + jmp .Lsqrx8x_loop + +.align 32 +.Lsqrx8x_break: + sub 16+8(%rsp),%r8 # consume last carry + mov 24+8(%rsp),$carry # initial $tptr, borrow $carry + mov 0*8($aptr),%rdx # a[8], modulo-scheduled + xor %ebp,%ebp # xor $zero,$zero + mov %r8,0*8($tptr) + cmp $carry,$tptr # cf=0, of=0 + je .Lsqrx8x_outer_loop + + mov %r9,1*8($tptr) + mov 1*8($carry),%r9 + mov %r10,2*8($tptr) + mov 2*8($carry),%r10 + mov %r11,3*8($tptr) + mov 3*8($carry),%r11 + mov %r12,4*8($tptr) + mov 4*8($carry),%r12 + mov %r13,5*8($tptr) + mov 5*8($carry),%r13 + mov %r14,6*8($tptr) + mov 6*8($carry),%r14 + mov %r15,7*8($tptr) + mov 7*8($carry),%r15 + mov $carry,$tptr + jmp .Lsqrx8x_outer_loop + +.align 32 +.Lsqrx8x_outer_break: + mov %r9,9*8($tptr) # t[9] + movq %xmm3,%rcx # -$num + mov %r10,10*8($tptr) # ... + mov %r11,11*8($tptr) + mov %r12,12*8($tptr) + mov %r13,13*8($tptr) + mov %r14,14*8($tptr) +___ +}{ +my $i="%rcx"; +$code.=<<___; + lea 48+8(%rsp),$tptr + mov ($aptr,$i),%rdx # a[0] + + mov 8($tptr),$A0[1] # t[1] + xor $A0[0],$A0[0] # t[0], of=0, cf=0 + mov 0+8(%rsp),$num # restore $num + adox $A0[1],$A0[1] + mov 16($tptr),$A1[0] # t[2] # prefetch + mov 24($tptr),$A1[1] # t[3] # prefetch + #jmp .Lsqrx4x_shift_n_add # happens to be aligned + +.align 32 +.Lsqrx4x_shift_n_add: + mulx %rdx,%rax,%rbx + adox $A1[0],$A1[0] + adcx $A0[0],%rax + .byte 0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00 # mov 8($aptr,$i),%rdx # a[i+1] # prefetch + .byte 0x4c,0x8b,0x97,0x20,0x00,0x00,0x00 # mov 32($tptr),$A0[0] # t[2*i+4] # prefetch + adox $A1[1],$A1[1] + adcx $A0[1],%rbx + mov 40($tptr),$A0[1] # t[2*i+4+1] # prefetch + mov %rax,0($tptr) + mov %rbx,8($tptr) + + mulx %rdx,%rax,%rbx + adox $A0[0],$A0[0] + adcx $A1[0],%rax + mov 16($aptr,$i),%rdx # a[i+2] # prefetch + mov 48($tptr),$A1[0] # t[2*i+6] # prefetch + adox $A0[1],$A0[1] + adcx $A1[1],%rbx + mov 56($tptr),$A1[1] # t[2*i+6+1] # prefetch + mov %rax,16($tptr) + mov %rbx,24($tptr) + + mulx %rdx,%rax,%rbx + adox $A1[0],$A1[0] + adcx $A0[0],%rax + mov 24($aptr,$i),%rdx # a[i+3] # prefetch + lea 32($i),$i + mov 64($tptr),$A0[0] # t[2*i+8] # prefetch + adox $A1[1],$A1[1] + adcx $A0[1],%rbx + mov 72($tptr),$A0[1] # t[2*i+8+1] # prefetch + mov %rax,32($tptr) + mov %rbx,40($tptr) + + mulx %rdx,%rax,%rbx + adox $A0[0],$A0[0] + adcx $A1[0],%rax + jrcxz .Lsqrx4x_shift_n_add_break + .byte 0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00 # mov 0($aptr,$i),%rdx # a[i+4] # prefetch + adox $A0[1],$A0[1] + adcx $A1[1],%rbx + mov 80($tptr),$A1[0] # t[2*i+10] # prefetch + mov 88($tptr),$A1[1] # t[2*i+10+1] # prefetch + mov %rax,48($tptr) + mov %rbx,56($tptr) + lea 64($tptr),$tptr + nop + jmp .Lsqrx4x_shift_n_add + +.align 32 +.Lsqrx4x_shift_n_add_break: + adcx $A1[1],%rbx + mov %rax,48($tptr) + mov %rbx,56($tptr) + lea 64($tptr),$tptr # end of t[] buffer +___ +} +###################################################################### +# Montgomery reduction part, "word-by-word" algorithm. +# +# This new path is inspired by multiple submissions from Intel, by +# Shay Gueron, Vlad Krasnov, Erdinc Ozturk, James Guilford, +# Vinodh Gopal... +{ +my ($nptr,$carry,$m0)=("%rbp","%rsi","%rdx"); + +$code.=<<___; + movq %xmm2,$nptr +sqrx8x_reduction: + xor %eax,%eax # initial top-most carry bit + mov 32+8(%rsp),%rbx # n0 + mov 48+8(%rsp),%rdx # "%r8", 8*0($tptr) + lea -128($nptr,$num,2),%rcx # end of n[] + #lea 48+8(%rsp,$num,2),$tptr # end of t[] buffer + mov %rcx, 0+8(%rsp) # save end of n[] + mov $tptr,8+8(%rsp) # save end of t[] + + lea 48+8(%rsp),$tptr # initial t[] window + jmp .Lsqrx8x_reduction_loop + +.align 32 +.Lsqrx8x_reduction_loop: + mov 8*1($tptr),%r9 + mov 8*2($tptr),%r10 + mov 8*3($tptr),%r11 + mov 8*4($tptr),%r12 + mov %rdx,%r8 + imulq %rbx,%rdx # n0*a[i] + mov 8*5($tptr),%r13 + mov 8*6($tptr),%r14 + mov 8*7($tptr),%r15 + mov %rax,24+8(%rsp) # store top-most carry bit + + lea 8*8($tptr),$tptr + xor $carry,$carry # cf=0,of=0 + mov \$-8,%rcx + jmp .Lsqrx8x_reduce + +.align 32 +.Lsqrx8x_reduce: + mov %r8, %rbx + mulx 16*0($nptr),%rax,%r8 # n[0] + adcx %rbx,%rax # discarded + adox %r9,%r8 + + mulx 16*1($nptr),%rbx,%r9 # n[1] + adcx %rbx,%r8 + adox %r10,%r9 + + mulx 16*2($nptr),%rbx,%r10 + adcx %rbx,%r9 + adox %r11,%r10 + + mulx 16*3($nptr),%rbx,%r11 + adcx %rbx,%r10 + adox %r12,%r11 + + .byte 0xc4,0x62,0xe3,0xf6,0xa5,0x40,0x00,0x00,0x00 # mulx 16*4($nptr),%rbx,%r12 + mov %rdx,%rax + mov %r8,%rdx + adcx %rbx,%r11 + adox %r13,%r12 + + mulx 32+8(%rsp),%rbx,%rdx # %rdx discarded + mov %rax,%rdx + mov %rax,64+48+8(%rsp,%rcx,8) # put aside n0*a[i] + + mulx 16*5($nptr),%rax,%r13 + adcx %rax,%r12 + adox %r14,%r13 + + mulx 16*6($nptr),%rax,%r14 + adcx %rax,%r13 + adox %r15,%r14 + + mulx 16*7($nptr),%rax,%r15 + mov %rbx,%rdx + adcx %rax,%r14 + adox $carry,%r15 # $carry is 0 + adcx $carry,%r15 # cf=0 + + .byte 0x67,0x67,0x67 + inc %rcx # of=0 + jnz .Lsqrx8x_reduce + + mov $carry,%rax # xor %rax,%rax + cmp 0+8(%rsp),$nptr # end of n[]? + jae .Lsqrx8x_no_tail + + mov 48+8(%rsp),%rdx # pull n0*a[0] + add 8*0($tptr),%r8 + lea 16*8($nptr),$nptr + mov \$-8,%rcx + adcx 8*1($tptr),%r9 + adcx 8*2($tptr),%r10 + adc 8*3($tptr),%r11 + adc 8*4($tptr),%r12 + adc 8*5($tptr),%r13 + adc 8*6($tptr),%r14 + adc 8*7($tptr),%r15 + lea 8*8($tptr),$tptr + sbb %rax,%rax # top carry + + xor $carry,$carry # of=0, cf=0 + mov %rax,16+8(%rsp) + jmp .Lsqrx8x_tail + +.align 32 +.Lsqrx8x_tail: + mov %r8,%rbx + mulx 16*0($nptr),%rax,%r8 + adcx %rax,%rbx + adox %r9,%r8 + + mulx 16*1($nptr),%rax,%r9 + adcx %rax,%r8 + adox %r10,%r9 + + mulx 16*2($nptr),%rax,%r10 + adcx %rax,%r9 + adox %r11,%r10 + + mulx 16*3($nptr),%rax,%r11 + adcx %rax,%r10 + adox %r12,%r11 + + .byte 0xc4,0x62,0xfb,0xf6,0xa5,0x40,0x00,0x00,0x00 # mulx 16*4($nptr),%rax,%r12 + adcx %rax,%r11 + adox %r13,%r12 + + mulx 16*5($nptr),%rax,%r13 + adcx %rax,%r12 + adox %r14,%r13 + + mulx 16*6($nptr),%rax,%r14 + adcx %rax,%r13 + adox %r15,%r14 + + mulx 16*7($nptr),%rax,%r15 + mov 72+48+8(%rsp,%rcx,8),%rdx # pull n0*a[i] + adcx %rax,%r14 + adox $carry,%r15 + mov %rbx,($tptr,%rcx,8) # save result + mov %r8,%rbx + adcx $carry,%r15 # cf=0 + + inc %rcx # of=0 + jnz .Lsqrx8x_tail + + cmp 0+8(%rsp),$nptr # end of n[]? + jae .Lsqrx8x_tail_done # break out of loop + + sub 16+8(%rsp),$carry # mov 16(%rsp),%cf + mov 48+8(%rsp),%rdx # pull n0*a[0] + lea 16*8($nptr),$nptr + adc 8*0($tptr),%r8 + adc 8*1($tptr),%r9 + adc 8*2($tptr),%r10 + adc 8*3($tptr),%r11 + adc 8*4($tptr),%r12 + adc 8*5($tptr),%r13 + adc 8*6($tptr),%r14 + adc 8*7($tptr),%r15 + lea 8*8($tptr),$tptr + sbb %rax,%rax + sub \$8,%rcx # mov \$-8,%rcx + + xor $carry,$carry # of=0, cf=0 + mov %rax,16+8(%rsp) + jmp .Lsqrx8x_tail + +.align 32 +.Lsqrx8x_tail_done: + add 24+8(%rsp),%r8 # can this overflow? + mov $carry,%rax # xor %rax,%rax + + sub 16+8(%rsp),$carry # mov 16(%rsp),%cf +.Lsqrx8x_no_tail: # %cf is 0 if jumped here + adc 8*0($tptr),%r8 + movq %xmm3,%rcx + adc 8*1($tptr),%r9 + mov 16*7($nptr),$carry + movq %xmm2,$nptr # restore $nptr + adc 8*2($tptr),%r10 + adc 8*3($tptr),%r11 + adc 8*4($tptr),%r12 + adc 8*5($tptr),%r13 + adc 8*6($tptr),%r14 + adc 8*7($tptr),%r15 + adc %rax,%rax # top-most carry + + mov 32+8(%rsp),%rbx # n0 + mov 8*8($tptr,%rcx),%rdx # modulo-scheduled "%r8" + + mov %r8,8*0($tptr) # store top 512 bits + lea 8*8($tptr),%r8 # borrow %r8 + mov %r9,8*1($tptr) + mov %r10,8*2($tptr) + mov %r11,8*3($tptr) + mov %r12,8*4($tptr) + mov %r13,8*5($tptr) + mov %r14,8*6($tptr) + mov %r15,8*7($tptr) + + lea 8*8($tptr,%rcx),$tptr # start of current t[] window + cmp 8+8(%rsp),%r8 # end of t[]? + jb .Lsqrx8x_reduction_loop +___ +} +############################################################## +# Post-condition, 4x unrolled +# +{ +my ($rptr,$nptr)=("%rdx","%rbp"); +my @ri=map("%r$_",(10..13)); +my @ni=map("%r$_",(14..15)); +$code.=<<___; + xor %rbx,%rbx + sub %r15,%rsi # compare top-most words + adc %rbx,%rbx + mov %rcx,%r10 # -$num + .byte 0x67 + or %rbx,%rax + .byte 0x67 + mov %rcx,%r9 # -$num + xor \$1,%rax + sar \$3+2,%rcx # cf=0 + #lea 48+8(%rsp,%r9),$tptr + lea ($nptr,%rax,8),$nptr + movq %xmm1,$rptr # restore $rptr + movq %xmm1,$aptr # prepare for back-to-back call + jmp .Lsqrx4x_sub + +.align 32 +.Lsqrx4x_sub: + .byte 0x66 + mov 8*0($tptr),%r12 + mov 8*1($tptr),%r13 + sbb 16*0($nptr),%r12 + mov 8*2($tptr),%r14 + sbb 16*1($nptr),%r13 + mov 8*3($tptr),%r15 + lea 8*4($tptr),$tptr + sbb 16*2($nptr),%r14 + mov %r12,8*0($rptr) + sbb 16*3($nptr),%r15 + lea 16*4($nptr),$nptr + mov %r13,8*1($rptr) + mov %r14,8*2($rptr) + mov %r15,8*3($rptr) + lea 8*4($rptr),$rptr + + inc %rcx + jnz .Lsqrx4x_sub +___ +} +$code.=<<___; + neg %r9 # restore $num + ret +.size bn_sqrx8x_internal,.-bn_sqrx8x_internal +___ +}}} { -my ($inp,$num,$tbl,$idx)=$win64?("%rcx","%rdx","%r8", "%r9") : # Win64 order - ("%rdi","%rsi","%rdx","%rcx"); # Unix order +my ($inp,$num,$tbl,$idx)=$win64?("%rcx","%edx","%r8", "%r9d") : # Win64 order + ("%rdi","%esi","%rdx","%ecx"); # Unix order my $out=$inp; my $STRIDE=2**5*8; my $N=$STRIDE/4; $code.=<<___; +.globl bn_get_bits5 +.type bn_get_bits5,\@abi-omnipotent +.align 16 +bn_get_bits5: + lea 0($inp),%r10 + lea 1($inp),%r11 + mov $num,%ecx + shr \$4,$num + and \$15,%ecx + lea -8(%ecx),%eax + cmp \$11,%ecx + cmova %r11,%r10 + cmova %eax,%ecx + movzw (%r10,$num,2),%eax + shrl %cl,%eax + and \$31,%eax + ret +.size bn_get_bits5,.-bn_get_bits5 + .globl bn_scatter5 .type bn_scatter5,\@abi-omnipotent .align 16 @@ -868,13 +3272,13 @@ $code.=<<___ if ($win64); .byte 0x0f,0x29,0x7c,0x24,0x10 #movdqa %xmm7,0x10(%rsp) ___ $code.=<<___; - mov $idx,%r11 + mov $idx,%r11d shr \$`log($N/8)/log(2)`,$idx and \$`$N/8-1`,%r11 not $idx lea .Lmagic_masks(%rip),%rax and \$`2**5/($N/8)-1`,$idx # 5 is "window size" - lea 96($tbl,%r11,8),$tbl # pointer within 1st cache line + lea 128($tbl,%r11,8),$tbl # pointer within 1st cache line movq 0(%rax,$idx,8),%xmm4 # set of masks denoting which movq 8(%rax,$idx,8),%xmm5 # cache line contains element movq 16(%rax,$idx,8),%xmm6 # denoted by 7th argument @@ -882,15 +3286,16 @@ $code.=<<___; jmp .Lgather .align 16 .Lgather: - movq `0*$STRIDE/4-96`($tbl),%xmm0 - movq `1*$STRIDE/4-96`($tbl),%xmm1 + movq `0*$STRIDE/4-128`($tbl),%xmm0 + movq `1*$STRIDE/4-128`($tbl),%xmm1 pand %xmm4,%xmm0 - movq `2*$STRIDE/4-96`($tbl),%xmm2 + movq `2*$STRIDE/4-128`($tbl),%xmm2 pand %xmm5,%xmm1 - movq `3*$STRIDE/4-96`($tbl),%xmm3 + movq `3*$STRIDE/4-128`($tbl),%xmm3 pand %xmm6,%xmm2 por %xmm1,%xmm0 pand %xmm7,%xmm3 + .byte 0x67,0x67 por %xmm2,%xmm0 lea $STRIDE($tbl),$tbl por %xmm3,%xmm0 @@ -954,26 +3359,27 @@ mul_handler: cmp %r10,%rbx # context->Rip<end of prologue label jb .Lcommon_seh_tail - lea `40+48`(%rax),%rax - - mov 4(%r11),%r10d # HandlerData[1] - lea (%rsi,%r10),%r10 # end of alloca label - cmp %r10,%rbx # context->Rip<end of alloca label - jb .Lcommon_seh_tail - mov 152($context),%rax # pull context->Rsp - mov 8(%r11),%r10d # HandlerData[2] + mov 4(%r11),%r10d # HandlerData[1] lea (%rsi,%r10),%r10 # epilogue label cmp %r10,%rbx # context->Rip>=epilogue label jae .Lcommon_seh_tail + lea .Lmul_epilogue(%rip),%r10 + cmp %r10,%rbx + jb .Lbody_40 + mov 192($context),%r10 # pull $num mov 8(%rax,%r10,8),%rax # pull saved stack pointer + jmp .Lbody_proceed + +.Lbody_40: + mov 40(%rax),%rax # pull saved stack pointer +.Lbody_proceed: - movaps (%rax),%xmm0 - movaps 16(%rax),%xmm1 - lea `40+48`(%rax),%rax + movaps -88(%rax),%xmm0 + movaps -72(%rax),%xmm1 mov -8(%rax),%rbx mov -16(%rax),%rbp @@ -1040,6 +3446,24 @@ mul_handler: .rva .LSEH_end_bn_mul4x_mont_gather5 .rva .LSEH_info_bn_mul4x_mont_gather5 + .rva .LSEH_begin_bn_power5 + .rva .LSEH_end_bn_power5 + .rva .LSEH_info_bn_power5 + + .rva .LSEH_begin_bn_from_mont8x + .rva .LSEH_end_bn_from_mont8x + .rva .LSEH_info_bn_from_mont8x +___ +$code.=<<___ if ($addx); + .rva .LSEH_begin_bn_mulx4x_mont_gather5 + .rva .LSEH_end_bn_mulx4x_mont_gather5 + .rva .LSEH_info_bn_mulx4x_mont_gather5 + + .rva .LSEH_begin_bn_powerx5 + .rva .LSEH_end_bn_powerx5 + .rva .LSEH_info_bn_powerx5 +___ +$code.=<<___; .rva .LSEH_begin_bn_gather5 .rva .LSEH_end_bn_gather5 .rva .LSEH_info_bn_gather5 @@ -1049,12 +3473,36 @@ mul_handler: .LSEH_info_bn_mul_mont_gather5: .byte 9,0,0,0 .rva mul_handler - .rva .Lmul_alloca,.Lmul_body,.Lmul_epilogue # HandlerData[] + .rva .Lmul_body,.Lmul_epilogue # HandlerData[] .align 8 .LSEH_info_bn_mul4x_mont_gather5: .byte 9,0,0,0 .rva mul_handler - .rva .Lmul4x_alloca,.Lmul4x_body,.Lmul4x_epilogue # HandlerData[] + .rva .Lmul4x_body,.Lmul4x_epilogue # HandlerData[] +.align 8 +.LSEH_info_bn_power5: + .byte 9,0,0,0 + .rva mul_handler + .rva .Lpower5_body,.Lpower5_epilogue # HandlerData[] +.align 8 +.LSEH_info_bn_from_mont8x: + .byte 9,0,0,0 + .rva mul_handler + .rva .Lfrom_body,.Lfrom_epilogue # HandlerData[] +___ +$code.=<<___ if ($addx); +.align 8 +.LSEH_info_bn_mulx4x_mont_gather5: + .byte 9,0,0,0 + .rva mul_handler + .rva .Lmulx4x_body,.Lmulx4x_epilogue # HandlerData[] +.align 8 +.LSEH_info_bn_powerx5: + .byte 9,0,0,0 + .rva mul_handler + .rva .Lpowerx5_body,.Lpowerx5_epilogue # HandlerData[] +___ +$code.=<<___; .align 8 .LSEH_info_bn_gather5: .byte 0x01,0x0d,0x05,0x00 diff --git a/crypto/bn/bn.h b/crypto/bn/bn.h index 47d8c71..5696965 100644 --- a/crypto/bn/bn.h +++ b/crypto/bn/bn.h @@ -256,24 +256,6 @@ extern "C" { # define BN_HEX_FMT2 "%08X" # endif -/* - * 2011-02-22 SMS. In various places, a size_t variable or a type cast to - * size_t was used to perform integer-only operations on pointers. This - * failed on VMS with 64-bit pointers (CC /POINTER_SIZE = 64) because size_t - * is still only 32 bits. What's needed in these cases is an integer type - * with the same size as a pointer, which size_t is not certain to be. The - * only fix here is VMS-specific. - */ -# if defined(OPENSSL_SYS_VMS) -# if __INITIAL_POINTER_SIZE == 64 -# define PTR_SIZE_INT long long -# else /* __INITIAL_POINTER_SIZE == 64 */ -# define PTR_SIZE_INT int -# endif /* __INITIAL_POINTER_SIZE == 64 [else] */ -# else /* defined(OPENSSL_SYS_VMS) */ -# define PTR_SIZE_INT size_t -# endif /* defined(OPENSSL_SYS_VMS) [else] */ - # define BN_DEFAULT_BITS 1280 # define BN_FLG_MALLOCED 0x01 diff --git a/crypto/bn/bn_asm.c b/crypto/bn/bn_asm.c index 114acf3..03a33cf 100644 --- a/crypto/bn/bn_asm.c +++ b/crypto/bn/bn_asm.c @@ -489,121 +489,144 @@ BN_ULONG bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, * c=(c2,c1,c0) */ +# ifdef BN_LLONG /* - * Keep in mind that carrying into high part of multiplication result - * can not overflow, because it cannot be all-ones. + * Keep in mind that additions to multiplication result can not + * overflow, because its high half cannot be all-ones. */ -# ifdef BN_LLONG -# define mul_add_c(a,b,c0,c1,c2) \ - t=(BN_ULLONG)a*b; \ - t1=(BN_ULONG)Lw(t); \ - t2=(BN_ULONG)Hw(t); \ - c0=(c0+t1)&BN_MASK2; if ((c0) < t1) t2++; \ - c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++; - -# define mul_add_c2(a,b,c0,c1,c2) \ - t=(BN_ULLONG)a*b; \ - tt=(t+t)&BN_MASK; \ - if (tt < t) c2++; \ - t1=(BN_ULONG)Lw(tt); \ - t2=(BN_ULONG)Hw(tt); \ - c0=(c0+t1)&BN_MASK2; \ - if ((c0 < t1) && (((++t2)&BN_MASK2) == 0)) c2++; \ - c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++; - -# define sqr_add_c(a,i,c0,c1,c2) \ - t=(BN_ULLONG)a[i]*a[i]; \ - t1=(BN_ULONG)Lw(t); \ - t2=(BN_ULONG)Hw(t); \ - c0=(c0+t1)&BN_MASK2; if ((c0) < t1) t2++; \ - c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++; +# define mul_add_c(a,b,c0,c1,c2) do { \ + BN_ULONG hi; \ + BN_ULLONG t = (BN_ULLONG)(a)*(b); \ + t += c0; /* no carry */ \ + c0 = (BN_ULONG)Lw(t); \ + hi = (BN_ULONG)Hw(t); \ + c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \ + } while(0) + +# define mul_add_c2(a,b,c0,c1,c2) do { \ + BN_ULONG hi; \ + BN_ULLONG t = (BN_ULLONG)(a)*(b); \ + BN_ULLONG tt = t+c0; /* no carry */ \ + c0 = (BN_ULONG)Lw(tt); \ + hi = (BN_ULONG)Hw(tt); \ + c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \ + t += c0; /* no carry */ \ + c0 = (BN_ULONG)Lw(t); \ + hi = (BN_ULONG)Hw(t); \ + c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \ + } while(0) + +# define sqr_add_c(a,i,c0,c1,c2) do { \ + BN_ULONG hi; \ + BN_ULLONG t = (BN_ULLONG)a[i]*a[i]; \ + t += c0; /* no carry */ \ + c0 = (BN_ULONG)Lw(t); \ + hi = (BN_ULONG)Hw(t); \ + c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \ + } while(0) # define sqr_add_c2(a,i,j,c0,c1,c2) \ mul_add_c2((a)[i],(a)[j],c0,c1,c2) # elif defined(BN_UMULT_LOHI) - -# define mul_add_c(a,b,c0,c1,c2) { \ - BN_ULONG ta=(a),tb=(b); \ - BN_UMULT_LOHI(t1,t2,ta,tb); \ - c0 += t1; t2 += (c0<t1)?1:0; \ - c1 += t2; c2 += (c1<t2)?1:0; \ - } - -# define mul_add_c2(a,b,c0,c1,c2) { \ - BN_ULONG ta=(a),tb=(b),t0; \ - BN_UMULT_LOHI(t0,t1,ta,tb); \ - c0 += t0; t2 = t1+((c0<t0)?1:0);\ - c1 += t2; c2 += (c1<t2)?1:0; \ - c0 += t0; t1 += (c0<t0)?1:0; \ - c1 += t1; c2 += (c1<t1)?1:0; \ - } - -# define sqr_add_c(a,i,c0,c1,c2) { \ - BN_ULONG ta=(a)[i]; \ - BN_UMULT_LOHI(t1,t2,ta,ta); \ - c0 += t1; t2 += (c0<t1)?1:0; \ - c1 += t2; c2 += (c1<t2)?1:0; \ - } +/* + * Keep in mind that additions to hi can not overflow, because + * the high word of a multiplication result cannot be all-ones. + */ +# define mul_add_c(a,b,c0,c1,c2) do { \ + BN_ULONG ta = (a), tb = (b); \ + BN_ULONG lo, hi; \ + BN_UMULT_LOHI(lo,hi,ta,tb); \ + c0 += lo; hi += (c0<lo)?1:0; \ + c1 += hi; c2 += (c1<hi)?1:0; \ + } while(0) + +# define mul_add_c2(a,b,c0,c1,c2) do { \ + BN_ULONG ta = (a), tb = (b); \ + BN_ULONG lo, hi, tt; \ + BN_UMULT_LOHI(lo,hi,ta,tb); \ + c0 += lo; tt = hi+((c0<lo)?1:0); \ + c1 += tt; c2 += (c1<tt)?1:0; \ + c0 += lo; hi += (c0<lo)?1:0; \ + c1 += hi; c2 += (c1<hi)?1:0; \ + } while(0) + +# define sqr_add_c(a,i,c0,c1,c2) do { \ + BN_ULONG ta = (a)[i]; \ + BN_ULONG lo, hi; \ + BN_UMULT_LOHI(lo,hi,ta,ta); \ + c0 += lo; hi += (c0<lo)?1:0; \ + c1 += hi; c2 += (c1<hi)?1:0; \ + } while(0) # define sqr_add_c2(a,i,j,c0,c1,c2) \ mul_add_c2((a)[i],(a)[j],c0,c1,c2) # elif defined(BN_UMULT_HIGH) - -# define mul_add_c(a,b,c0,c1,c2) { \ - BN_ULONG ta=(a),tb=(b); \ - t1 = ta * tb; \ - t2 = BN_UMULT_HIGH(ta,tb); \ - c0 += t1; t2 += (c0<t1)?1:0; \ - c1 += t2; c2 += (c1<t2)?1:0; \ - } - -# define mul_add_c2(a,b,c0,c1,c2) { \ - BN_ULONG ta=(a),tb=(b),t0; \ - t1 = BN_UMULT_HIGH(ta,tb); \ - t0 = ta * tb; \ - c0 += t0; t2 = t1+((c0<t0)?1:0);\ - c1 += t2; c2 += (c1<t2)?1:0; \ - c0 += t0; t1 += (c0<t0)?1:0; \ - c1 += t1; c2 += (c1<t1)?1:0; \ - } - -# define sqr_add_c(a,i,c0,c1,c2) { \ - BN_ULONG ta=(a)[i]; \ - t1 = ta * ta; \ - t2 = BN_UMULT_HIGH(ta,ta); \ - c0 += t1; t2 += (c0<t1)?1:0; \ - c1 += t2; c2 += (c1<t2)?1:0; \ - } +/* + * Keep in mind that additions to hi can not overflow, because + * the high word of a multiplication result cannot be all-ones. + */ +# define mul_add_c(a,b,c0,c1,c2) do { \ + BN_ULONG ta = (a), tb = (b); \ + BN_ULONG lo = ta * tb; \ + BN_ULONG hi = BN_UMULT_HIGH(ta,tb); \ + c0 += lo; hi += (c0<lo)?1:0; \ + c1 += hi; c2 += (c1<hi)?1:0; \ + } while(0) + +# define mul_add_c2(a,b,c0,c1,c2) do { \ + BN_ULONG ta = (a), tb = (b), tt; \ + BN_ULONG lo = ta * tb; \ + BN_ULONG hi = BN_UMULT_HIGH(ta,tb); \ + c0 += lo; tt = hi + ((c0<lo)?1:0); \ + c1 += tt; c2 += (c1<tt)?1:0; \ + c0 += lo; hi += (c0<lo)?1:0; \ + c1 += hi; c2 += (c1<hi)?1:0; \ + } while(0) + +# define sqr_add_c(a,i,c0,c1,c2) do { \ + BN_ULONG ta = (a)[i]; \ + BN_ULONG lo = ta * ta; \ + BN_ULONG hi = BN_UMULT_HIGH(ta,ta); \ + c0 += lo; hi += (c0<lo)?1:0; \ + c1 += hi; c2 += (c1<hi)?1:0; \ + } while(0) # define sqr_add_c2(a,i,j,c0,c1,c2) \ mul_add_c2((a)[i],(a)[j],c0,c1,c2) # else /* !BN_LLONG */ -# define mul_add_c(a,b,c0,c1,c2) \ - t1=LBITS(a); t2=HBITS(a); \ - bl=LBITS(b); bh=HBITS(b); \ - mul64(t1,t2,bl,bh); \ - c0=(c0+t1)&BN_MASK2; if ((c0) < t1) t2++; \ - c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++; - -# define mul_add_c2(a,b,c0,c1,c2) \ - t1=LBITS(a); t2=HBITS(a); \ - bl=LBITS(b); bh=HBITS(b); \ - mul64(t1,t2,bl,bh); \ - if (t2 & BN_TBIT) c2++; \ - t2=(t2+t2)&BN_MASK2; \ - if (t1 & BN_TBIT) t2++; \ - t1=(t1+t1)&BN_MASK2; \ - c0=(c0+t1)&BN_MASK2; \ - if ((c0 < t1) && (((++t2)&BN_MASK2) == 0)) c2++; \ - c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++; - -# define sqr_add_c(a,i,c0,c1,c2) \ - sqr64(t1,t2,(a)[i]); \ - c0=(c0+t1)&BN_MASK2; if ((c0) < t1) t2++; \ - c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++; +/* + * Keep in mind that additions to hi can not overflow, because + * the high word of a multiplication result cannot be all-ones. + */ +# define mul_add_c(a,b,c0,c1,c2) do { \ + BN_ULONG lo = LBITS(a), hi = HBITS(a); \ + BN_ULONG bl = LBITS(b), bh = HBITS(b); \ + mul64(lo,hi,bl,bh); \ + c0 = (c0+lo)&BN_MASK2; if (c0<lo) hi++; \ + c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \ + } while(0) + +# define mul_add_c2(a,b,c0,c1,c2) do { \ + BN_ULONG tt; \ + BN_ULONG lo = LBITS(a), hi = HBITS(a); \ + BN_ULONG bl = LBITS(b), bh = HBITS(b); \ + mul64(lo,hi,bl,bh); \ + tt = hi; \ + c0 = (c0+lo)&BN_MASK2; if (c0<lo) tt++; \ + c1 = (c1+tt)&BN_MASK2; if (c1<tt) c2++; \ + c0 = (c0+lo)&BN_MASK2; if (c0<lo) hi++; \ + c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \ + } while(0) + +# define sqr_add_c(a,i,c0,c1,c2) do { \ + BN_ULONG lo, hi; \ + sqr64(lo,hi,(a)[i]); \ + c0 = (c0+lo)&BN_MASK2; if (c0<lo) hi++; \ + c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \ + } while(0) # define sqr_add_c2(a,i,j,c0,c1,c2) \ mul_add_c2((a)[i],(a)[j],c0,c1,c2) @@ -611,12 +634,6 @@ BN_ULONG bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) { -# ifdef BN_LLONG - BN_ULLONG t; -# else - BN_ULONG bl, bh; -# endif - BN_ULONG t1, t2; BN_ULONG c1, c2, c3; c1 = 0; @@ -720,12 +737,6 @@ void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) { -# ifdef BN_LLONG - BN_ULLONG t; -# else - BN_ULONG bl, bh; -# endif - BN_ULONG t1, t2; BN_ULONG c1, c2, c3; c1 = 0; @@ -765,12 +776,6 @@ void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a) { -# ifdef BN_LLONG - BN_ULLONG t, tt; -# else - BN_ULONG bl, bh; -# endif - BN_ULONG t1, t2; BN_ULONG c1, c2, c3; c1 = 0; @@ -846,12 +851,6 @@ void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a) void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a) { -# ifdef BN_LLONG - BN_ULLONG t, tt; -# else - BN_ULONG bl, bh; -# endif - BN_ULONG t1, t2; BN_ULONG c1, c2, c3; c1 = 0; diff --git a/crypto/bn/bn_exp.c b/crypto/bn/bn_exp.c index 27146c8..24afdd6 100644 --- a/crypto/bn/bn_exp.c +++ b/crypto/bn/bn_exp.c @@ -122,6 +122,17 @@ # ifndef alloca # define alloca(s) __builtin_alloca((s)) # endif +#elif defined(__sun) +# include <alloca.h> +#endif + +#include "rsaz_exp.h" + +#undef SPARC_T4_MONT +#if defined(OPENSSL_BN_ASM_MONT) && (defined(__sparc__) || defined(__sparc)) +# include "sparc_arch.h" +extern unsigned int OPENSSL_sparcv9cap_P[]; +# define SPARC_T4_MONT #endif /* maximum precomputation table size for *variable* sliding windows */ @@ -464,6 +475,23 @@ int BN_mod_exp_mont(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p, wstart = bits - 1; /* The top bit of the window */ wend = 0; /* The bottom bit of the window */ +#if 1 /* by Shay Gueron's suggestion */ + j = m->top; /* borrow j */ + if (m->d[j - 1] & (((BN_ULONG)1) << (BN_BITS2 - 1))) { + if (bn_wexpand(r, j) == NULL) + goto err; + /* 2^(top*BN_BITS2) - m */ + r->d[0] = (0 - m->d[0]) & BN_MASK2; + for (i = 1; i < j; i++) + r->d[i] = (~m->d[i]) & BN_MASK2; + r->top = j; + /* + * Upper words will be zero if the corresponding words of 'm' were + * 0xfff[...], so decrement r->top accordingly. + */ + bn_correct_top(r); + } else +#endif if (!BN_to_montgomery(r, BN_value_one(), mont, ctx)) goto err; for (;;) { @@ -515,6 +543,17 @@ int BN_mod_exp_mont(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p, if (wstart < 0) break; } +#if defined(SPARC_T4_MONT) + if (OPENSSL_sparcv9cap_P[0] & (SPARCV9_VIS3 | SPARCV9_PREFER_FPU)) { + j = mont->N.top; /* borrow j */ + val[0]->d[0] = 1; /* borrow val[0] */ + for (i = 1; i < j; i++) + val[0]->d[i] = 0; + val[0]->top = j; + if (!BN_mod_mul_montgomery(rr, r, val[0], mont, ctx)) + goto err; + } else +#endif if (!BN_from_montgomery(rr, r, mont, ctx)) goto err; ret = 1; @@ -526,6 +565,27 @@ int BN_mod_exp_mont(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p, return (ret); } +#if defined(SPARC_T4_MONT) +static BN_ULONG bn_get_bits(const BIGNUM *a, int bitpos) +{ + BN_ULONG ret = 0; + int wordpos; + + wordpos = bitpos / BN_BITS2; + bitpos %= BN_BITS2; + if (wordpos >= 0 && wordpos < a->top) { + ret = a->d[wordpos] & BN_MASK2; + if (bitpos) { + ret >>= bitpos; + if (++wordpos < a->top) + ret |= a->d[wordpos] << (BN_BITS2 - bitpos); + } + } + + return ret & BN_MASK2; +} +#endif + /* * BN_mod_exp_mont_consttime() stores the precomputed powers in a specific * layout so that accessing any of these table values shows the same access @@ -594,6 +654,9 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p, int powerbufLen = 0; unsigned char *powerbuf = NULL; BIGNUM tmp, am; +#if defined(SPARC_T4_MONT) + unsigned int t4 = 0; +#endif bn_check_top(a); bn_check_top(p); @@ -626,21 +689,62 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p, goto err; } +#ifdef RSAZ_ENABLED + /* + * If the size of the operands allow it, perform the optimized + * RSAZ exponentiation. For further information see + * crypto/bn/rsaz_exp.c and accompanying assembly modules. + */ + if ((16 == a->top) && (16 == p->top) && (BN_num_bits(m) == 1024) + && rsaz_avx2_eligible()) { + if (NULL == bn_wexpand(rr, 16)) + goto err; + RSAZ_1024_mod_exp_avx2(rr->d, a->d, p->d, m->d, mont->RR.d, + mont->n0[0]); + rr->top = 16; + rr->neg = 0; + bn_correct_top(rr); + ret = 1; + goto err; + } else if ((8 == a->top) && (8 == p->top) && (BN_num_bits(m) == 512)) { + if (NULL == bn_wexpand(rr, 8)) + goto err; + RSAZ_512_mod_exp(rr->d, a->d, p->d, m->d, mont->n0[0], mont->RR.d); + rr->top = 8; + rr->neg = 0; + bn_correct_top(rr); + ret = 1; + goto err; + } +#endif + /* Get the window size to use with size of p. */ window = BN_window_bits_for_ctime_exponent_size(bits); +#if defined(SPARC_T4_MONT) + if (window >= 5 && (top & 15) == 0 && top <= 64 && + (OPENSSL_sparcv9cap_P[1] & (CFR_MONTMUL | CFR_MONTSQR)) == + (CFR_MONTMUL | CFR_MONTSQR) && (t4 = OPENSSL_sparcv9cap_P[0])) + window = 5; + else +#endif #if defined(OPENSSL_BN_ASM_MONT5) - if (window == 6 && bits <= 1024) - window = 5; /* ~5% improvement of 2048-bit RSA sign */ + if (window >= 5) { + window = 5; /* ~5% improvement for RSA2048 sign, and even + * for RSA4096 */ + if ((top & 7) == 0) + powerbufLen += 2 * top * sizeof(m->d[0]); + } #endif + (void)0; /* * Allocate a buffer large enough to hold all of the pre-computed powers * of am, am itself and tmp. */ numPowers = 1 << window; - powerbufLen = sizeof(m->d[0]) * (top * numPowers + - ((2 * top) > - numPowers ? (2 * top) : numPowers)); + powerbufLen += sizeof(m->d[0]) * (top * numPowers + + ((2 * top) > + numPowers ? (2 * top) : numPowers)); #ifdef alloca if (powerbufLen < 3072) powerbufFree = @@ -670,15 +774,17 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p, tmp.flags = am.flags = BN_FLG_STATIC_DATA; /* prepare a^0 in Montgomery domain */ -#if 1 +#if 1 /* by Shay Gueron's suggestion */ + if (m->d[top - 1] & (((BN_ULONG)1) << (BN_BITS2 - 1))) { + /* 2^(top*BN_BITS2) - m */ + tmp.d[0] = (0 - m->d[0]) & BN_MASK2; + for (i = 1; i < top; i++) + tmp.d[i] = (~m->d[i]) & BN_MASK2; + tmp.top = top; + } else +#endif if (!BN_to_montgomery(&tmp, BN_value_one(), mont, ctx)) goto err; -#else - tmp.d[0] = (0 - m->d[0]) & BN_MASK2; /* 2^(top*BN_BITS2) - m */ - for (i = 1; i < top; i++) - tmp.d[i] = (~m->d[i]) & BN_MASK2; - tmp.top = top; -#endif /* prepare a^1 in Montgomery domain */ if (a->neg || BN_ucmp(a, m) >= 0) { @@ -689,6 +795,138 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p, } else if (!BN_to_montgomery(&am, a, mont, ctx)) goto err; +#if defined(SPARC_T4_MONT) + if (t4) { + typedef int (*bn_pwr5_mont_f) (BN_ULONG *tp, const BN_ULONG *np, + const BN_ULONG *n0, const void *table, + int power, int bits); + int bn_pwr5_mont_t4_8(BN_ULONG *tp, const BN_ULONG *np, + const BN_ULONG *n0, const void *table, + int power, int bits); + int bn_pwr5_mont_t4_16(BN_ULONG *tp, const BN_ULONG *np, + const BN_ULONG *n0, const void *table, + int power, int bits); + int bn_pwr5_mont_t4_24(BN_ULONG *tp, const BN_ULONG *np, + const BN_ULONG *n0, const void *table, + int power, int bits); + int bn_pwr5_mont_t4_32(BN_ULONG *tp, const BN_ULONG *np, + const BN_ULONG *n0, const void *table, + int power, int bits); + static const bn_pwr5_mont_f pwr5_funcs[4] = { + bn_pwr5_mont_t4_8, bn_pwr5_mont_t4_16, + bn_pwr5_mont_t4_24, bn_pwr5_mont_t4_32 + }; + bn_pwr5_mont_f pwr5_worker = pwr5_funcs[top / 16 - 1]; + + typedef int (*bn_mul_mont_f) (BN_ULONG *rp, const BN_ULONG *ap, + const void *bp, const BN_ULONG *np, + const BN_ULONG *n0); + int bn_mul_mont_t4_8(BN_ULONG *rp, const BN_ULONG *ap, const void *bp, + const BN_ULONG *np, const BN_ULONG *n0); + int bn_mul_mont_t4_16(BN_ULONG *rp, const BN_ULONG *ap, + const void *bp, const BN_ULONG *np, + const BN_ULONG *n0); + int bn_mul_mont_t4_24(BN_ULONG *rp, const BN_ULONG *ap, + const void *bp, const BN_ULONG *np, + const BN_ULONG *n0); + int bn_mul_mont_t4_32(BN_ULONG *rp, const BN_ULONG *ap, + const void *bp, const BN_ULONG *np, + const BN_ULONG *n0); + static const bn_mul_mont_f mul_funcs[4] = { + bn_mul_mont_t4_8, bn_mul_mont_t4_16, + bn_mul_mont_t4_24, bn_mul_mont_t4_32 + }; + bn_mul_mont_f mul_worker = mul_funcs[top / 16 - 1]; + + void bn_mul_mont_vis3(BN_ULONG *rp, const BN_ULONG *ap, + const void *bp, const BN_ULONG *np, + const BN_ULONG *n0, int num); + void bn_mul_mont_t4(BN_ULONG *rp, const BN_ULONG *ap, + const void *bp, const BN_ULONG *np, + const BN_ULONG *n0, int num); + void bn_mul_mont_gather5_t4(BN_ULONG *rp, const BN_ULONG *ap, + const void *table, const BN_ULONG *np, + const BN_ULONG *n0, int num, int power); + void bn_flip_n_scatter5_t4(const BN_ULONG *inp, size_t num, + void *table, size_t power); + void bn_gather5_t4(BN_ULONG *out, size_t num, + void *table, size_t power); + void bn_flip_t4(BN_ULONG *dst, BN_ULONG *src, size_t num); + + BN_ULONG *np = mont->N.d, *n0 = mont->n0; + int stride = 5 * (6 - (top / 16 - 1)); /* multiple of 5, but less + * than 32 */ + + /* + * BN_to_montgomery can contaminate words above .top [in + * BN_DEBUG[_DEBUG] build]... + */ + for (i = am.top; i < top; i++) + am.d[i] = 0; + for (i = tmp.top; i < top; i++) + tmp.d[i] = 0; + + bn_flip_n_scatter5_t4(tmp.d, top, powerbuf, 0); + bn_flip_n_scatter5_t4(am.d, top, powerbuf, 1); + if (!(*mul_worker) (tmp.d, am.d, am.d, np, n0) && + !(*mul_worker) (tmp.d, am.d, am.d, np, n0)) + bn_mul_mont_vis3(tmp.d, am.d, am.d, np, n0, top); + bn_flip_n_scatter5_t4(tmp.d, top, powerbuf, 2); + + for (i = 3; i < 32; i++) { + /* Calculate a^i = a^(i-1) * a */ + if (!(*mul_worker) (tmp.d, tmp.d, am.d, np, n0) && + !(*mul_worker) (tmp.d, tmp.d, am.d, np, n0)) + bn_mul_mont_vis3(tmp.d, tmp.d, am.d, np, n0, top); + bn_flip_n_scatter5_t4(tmp.d, top, powerbuf, i); + } + + /* switch to 64-bit domain */ + np = alloca(top * sizeof(BN_ULONG)); + top /= 2; + bn_flip_t4(np, mont->N.d, top); + + bits--; + for (wvalue = 0, i = bits % 5; i >= 0; i--, bits--) + wvalue = (wvalue << 1) + BN_is_bit_set(p, bits); + bn_gather5_t4(tmp.d, top, powerbuf, wvalue); + + /* + * Scan the exponent one window at a time starting from the most + * significant bits. + */ + while (bits >= 0) { + if (bits < stride) + stride = bits + 1; + bits -= stride; + wvalue = bn_get_bits(p, bits + 1); + + if ((*pwr5_worker) (tmp.d, np, n0, powerbuf, wvalue, stride)) + continue; + /* retry once and fall back */ + if ((*pwr5_worker) (tmp.d, np, n0, powerbuf, wvalue, stride)) + continue; + + bits += stride - 5; + wvalue >>= stride - 5; + wvalue &= 31; + bn_mul_mont_t4(tmp.d, tmp.d, tmp.d, np, n0, top); + bn_mul_mont_t4(tmp.d, tmp.d, tmp.d, np, n0, top); + bn_mul_mont_t4(tmp.d, tmp.d, tmp.d, np, n0, top); + bn_mul_mont_t4(tmp.d, tmp.d, tmp.d, np, n0, top); + bn_mul_mont_t4(tmp.d, tmp.d, tmp.d, np, n0, top); + bn_mul_mont_gather5_t4(tmp.d, tmp.d, powerbuf, np, n0, top, + wvalue); + } + + bn_flip_t4(tmp.d, tmp.d, top); + top *= 2; + /* back to 32-bit domain */ + tmp.top = top; + bn_correct_top(&tmp); + OPENSSL_cleanse(np, top * sizeof(BN_ULONG)); + } else +#endif #if defined(OPENSSL_BN_ASM_MONT5) if (window == 5 && top > 1) { /* @@ -707,8 +945,15 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p, void bn_scatter5(const BN_ULONG *inp, size_t num, void *table, size_t power); void bn_gather5(BN_ULONG *out, size_t num, void *table, size_t power); + void bn_power5(BN_ULONG *rp, const BN_ULONG *ap, + const void *table, const BN_ULONG *np, + const BN_ULONG *n0, int num, int power); + int bn_get_bits5(const BN_ULONG *ap, int off); + int bn_from_montgomery(BN_ULONG *rp, const BN_ULONG *ap, + const BN_ULONG *not_used, const BN_ULONG *np, + const BN_ULONG *n0, int num); - BN_ULONG *np = mont->N.d, *n0 = mont->n0; + BN_ULONG *np = mont->N.d, *n0 = mont->n0, *np2; /* * BN_to_montgomery can contaminate words above .top [in @@ -719,6 +964,12 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p, for (i = tmp.top; i < top; i++) tmp.d[i] = 0; + if (top & 7) + np2 = np; + else + for (np2 = am.d + top, i = 0; i < top; i++) + np2[2 * i] = np[i]; + bn_scatter5(tmp.d, top, powerbuf, 0); bn_scatter5(am.d, am.top, powerbuf, 1); bn_mul_mont(tmp.d, am.d, am.d, np, n0, top); @@ -727,7 +978,7 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p, # if 0 for (i = 3; i < 32; i++) { /* Calculate a^i = a^(i-1) * a */ - bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np, n0, top, i - 1); + bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np2, n0, top, i - 1); bn_scatter5(tmp.d, top, powerbuf, i); } # else @@ -738,7 +989,7 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p, } for (i = 3; i < 8; i += 2) { int j; - bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np, n0, top, i - 1); + bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np2, n0, top, i - 1); bn_scatter5(tmp.d, top, powerbuf, i); for (j = 2 * i; j < 32; j *= 2) { bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top); @@ -746,13 +997,13 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p, } } for (; i < 16; i += 2) { - bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np, n0, top, i - 1); + bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np2, n0, top, i - 1); bn_scatter5(tmp.d, top, powerbuf, i); bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top); bn_scatter5(tmp.d, top, powerbuf, 2 * i); } for (; i < 32; i += 2) { - bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np, n0, top, i - 1); + bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np2, n0, top, i - 1); bn_scatter5(tmp.d, top, powerbuf, i); } # endif @@ -765,20 +1016,34 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p, * Scan the exponent one window at a time starting from the most * significant bits. */ - while (bits >= 0) { - for (wvalue = 0, i = 0; i < 5; i++, bits--) - wvalue = (wvalue << 1) + BN_is_bit_set(p, bits); + if (top & 7) + while (bits >= 0) { + for (wvalue = 0, i = 0; i < 5; i++, bits--) + wvalue = (wvalue << 1) + BN_is_bit_set(p, bits); - bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top); - bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top); - bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top); - bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top); - bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top); - bn_mul_mont_gather5(tmp.d, tmp.d, powerbuf, np, n0, top, wvalue); + bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top); + bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top); + bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top); + bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top); + bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top); + bn_mul_mont_gather5(tmp.d, tmp.d, powerbuf, np, n0, top, + wvalue); + } else { + while (bits >= 0) { + wvalue = bn_get_bits5(p->d, bits - 4); + bits -= 5; + bn_power5(tmp.d, tmp.d, powerbuf, np2, n0, top, wvalue); + } } + ret = bn_from_montgomery(tmp.d, tmp.d, NULL, np2, n0, top); tmp.top = top; bn_correct_top(&tmp); + if (ret) { + if (!BN_copy(rr, &tmp)) + ret = 0; + goto err; /* non-zero ret means it's not error */ + } } else #endif { @@ -844,6 +1109,15 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p, } /* Convert the final result from montgomery to standard format */ +#if defined(SPARC_T4_MONT) + if (OPENSSL_sparcv9cap_P[0] & (SPARCV9_VIS3 | SPARCV9_PREFER_FPU)) { + am.d[0] = 1; /* borrow am */ + for (i = 1; i < top; i++) + am.d[i] = 0; + if (!BN_mod_mul_montgomery(rr, &tmp, &am, mont, ctx)) + goto err; + } else +#endif if (!BN_from_montgomery(rr, &tmp, mont, ctx)) goto err; ret = 1; diff --git a/crypto/bn/bn_gf2m.c b/crypto/bn/bn_gf2m.c index a0ba8de..cfa1c7c 100644 --- a/crypto/bn/bn_gf2m.c +++ b/crypto/bn/bn_gf2m.c @@ -450,8 +450,7 @@ int BN_GF2m_mod_arr(BIGNUM *r, const BIGNUM *a, const int p[]) d0 = p[k] % BN_BITS2; d1 = BN_BITS2 - d0; z[n] ^= (zz << d0); - tmp_ulong = zz >> d1; - if (d0 && tmp_ulong) + if (d0 && (tmp_ulong = zz >> d1)) z[n + 1] ^= tmp_ulong; } diff --git a/crypto/bn/bn_lcl.h b/crypto/bn/bn_lcl.h index 904a723..00f4f09 100644 --- a/crypto/bn/bn_lcl.h +++ b/crypto/bn/bn_lcl.h @@ -204,6 +204,24 @@ extern "C" { # define BN_MUL_LOW_RECURSIVE_SIZE_NORMAL (32)/* 32 */ # define BN_MONT_CTX_SET_SIZE_WORD (64)/* 32 */ +/* + * 2011-02-22 SMS. In various places, a size_t variable or a type cast to + * size_t was used to perform integer-only operations on pointers. This + * failed on VMS with 64-bit pointers (CC /POINTER_SIZE = 64) because size_t + * is still only 32 bits. What's needed in these cases is an integer type + * with the same size as a pointer, which size_t is not certain to be. The + * only fix here is VMS-specific. + */ +# if defined(OPENSSL_SYS_VMS) +# if __INITIAL_POINTER_SIZE == 64 +# define PTR_SIZE_INT long long +# else /* __INITIAL_POINTER_SIZE == 64 */ +# define PTR_SIZE_INT int +# endif /* __INITIAL_POINTER_SIZE == 64 [else] */ +# elif !defined(PTR_SIZE_INT) /* defined(OPENSSL_SYS_VMS) */ +# define PTR_SIZE_INT size_t +# endif /* defined(OPENSSL_SYS_VMS) [else] */ + # if !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_NO_INLINE_ASM) && !defined(PEDANTIC) /* * BN_UMULT_HIGH section. @@ -295,6 +313,15 @@ unsigned __int64 _umul128(unsigned __int64 a, unsigned __int64 b, : "r"(a), "r"(b)); # endif # endif +# elif defined(__aarch64__) && defined(SIXTY_FOUR_BIT_LONG) +# if defined(__GNUC__) && __GNUC__>=2 +# define BN_UMULT_HIGH(a,b) ({ \ + register BN_ULONG ret; \ + asm ("umulh %0,%1,%2" \ + : "=r"(ret) \ + : "r"(a), "r"(b)); \ + ret; }) +# endif # endif /* cpu */ # endif /* OPENSSL_NO_ASM */ diff --git a/crypto/bn/bntest.c b/crypto/bn/bntest.c index 06662c5..470d5da 100644 --- a/crypto/bn/bntest.c +++ b/crypto/bn/bntest.c @@ -1042,7 +1042,6 @@ int test_mod_exp_mont_consttime(BIO *bp, BN_CTX *ctx) int test_mod_exp_mont5(BIO *bp, BN_CTX *ctx) { BIGNUM *a, *p, *m, *d, *e; - BN_MONT_CTX *mont; a = BN_new(); @@ -1050,7 +1049,6 @@ int test_mod_exp_mont5(BIO *bp, BN_CTX *ctx) m = BN_new(); d = BN_new(); e = BN_new(); - mont = BN_MONT_CTX_new(); BN_bntest_rand(m, 1024, 0, 1); /* must be odd for montgomery */ @@ -1099,6 +1097,7 @@ int test_mod_exp_mont5(BIO *bp, BN_CTX *ctx) fprintf(stderr, "Modular exponentiation test failed!\n"); return 0; } + BN_MONT_CTX_free(mont); BN_free(a); BN_free(p); BN_free(m); diff --git a/crypto/bn/rsaz_exp.c b/crypto/bn/rsaz_exp.c new file mode 100644 index 0000000..c54c6fe --- /dev/null +++ b/crypto/bn/rsaz_exp.c @@ -0,0 +1,346 @@ +/***************************************************************************** +* * +* Copyright (c) 2012, Intel Corporation * +* * +* All rights reserved. * +* * +* Redistribution and use in source and binary forms, with or without * +* modification, are permitted provided that the following conditions are * +* met: * +* * +* * Redistributions of source code must retain the above copyright * +* notice, this list of conditions and the following disclaimer. * +* * +* * Redistributions in binary form must reproduce the above copyright * +* notice, this list of conditions and the following disclaimer in the * +* documentation and/or other materials provided with the * +* distribution. * +* * +* * Neither the name of the Intel Corporation nor the names of its * +* contributors may be used to endorse or promote products derived from * +* this software without specific prior written permission. * +* * +* * +* THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY * +* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * +* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * +* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR * +* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * +* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * +* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * +* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * +* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * +* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * +* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * +* * +****************************************************************************** +* Developers and authors: * +* Shay Gueron (1, 2), and Vlad Krasnov (1) * +* (1) Intel Corporation, Israel Development Center, Haifa, Israel * +* (2) University of Haifa, Israel * +*****************************************************************************/ + +#include "rsaz_exp.h" + +#ifdef RSAZ_ENABLED + +/* + * See crypto/bn/asm/rsaz-avx2.pl for further details. + */ +void rsaz_1024_norm2red_avx2(void *red, const void *norm); +void rsaz_1024_mul_avx2(void *ret, const void *a, const void *b, + const void *n, BN_ULONG k); +void rsaz_1024_sqr_avx2(void *ret, const void *a, const void *n, BN_ULONG k, + int cnt); +void rsaz_1024_scatter5_avx2(void *tbl, const void *val, int i); +void rsaz_1024_gather5_avx2(void *val, const void *tbl, int i); +void rsaz_1024_red2norm_avx2(void *norm, const void *red); + +#if defined(__GNUC__) +# define ALIGN64 __attribute__((aligned(64))) +#elif defined(_MSC_VER) +# define ALIGN64 __declspec(align(64)) +#elif defined(__SUNPRO_C) +# define ALIGN64 +# pragma align 64(one,two80) +#else +/* not fatal, might hurt performance a little */ +# define ALIGN64 +#endif + +ALIGN64 static const BN_ULONG one[40] = { + 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; + +ALIGN64 static const BN_ULONG two80[40] = { + 0, 0, 1 << 22, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; + +void RSAZ_1024_mod_exp_avx2(BN_ULONG result_norm[16], + const BN_ULONG base_norm[16], + const BN_ULONG exponent[16], + const BN_ULONG m_norm[16], const BN_ULONG RR[16], + BN_ULONG k0) +{ + unsigned char storage[320 * 3 + 32 * 9 * 16 + 64]; /* 5.5KB */ + unsigned char *p_str = storage + (64 - ((size_t)storage % 64)); + unsigned char *a_inv, *m, *result; + unsigned char *table_s = p_str + 320 * 3; + unsigned char *R2 = table_s; /* borrow */ + int index; + int wvalue; + + if ((((size_t)p_str & 4095) + 320) >> 12) { + result = p_str; + a_inv = p_str + 320; + m = p_str + 320 * 2; /* should not cross page */ + } else { + m = p_str; /* should not cross page */ + result = p_str + 320; + a_inv = p_str + 320 * 2; + } + + rsaz_1024_norm2red_avx2(m, m_norm); + rsaz_1024_norm2red_avx2(a_inv, base_norm); + rsaz_1024_norm2red_avx2(R2, RR); + + rsaz_1024_mul_avx2(R2, R2, R2, m, k0); + rsaz_1024_mul_avx2(R2, R2, two80, m, k0); + + /* table[0] = 1 */ + rsaz_1024_mul_avx2(result, R2, one, m, k0); + /* table[1] = a_inv^1 */ + rsaz_1024_mul_avx2(a_inv, a_inv, R2, m, k0); + + rsaz_1024_scatter5_avx2(table_s, result, 0); + rsaz_1024_scatter5_avx2(table_s, a_inv, 1); + + /* table[2] = a_inv^2 */ + rsaz_1024_sqr_avx2(result, a_inv, m, k0, 1); + rsaz_1024_scatter5_avx2(table_s, result, 2); +#if 0 + /* this is almost 2x smaller and less than 1% slower */ + for (index = 3; index < 32; index++) { + rsaz_1024_mul_avx2(result, result, a_inv, m, k0); + rsaz_1024_scatter5_avx2(table_s, result, index); + } +#else + /* table[4] = a_inv^4 */ + rsaz_1024_sqr_avx2(result, result, m, k0, 1); + rsaz_1024_scatter5_avx2(table_s, result, 4); + /* table[8] = a_inv^8 */ + rsaz_1024_sqr_avx2(result, result, m, k0, 1); + rsaz_1024_scatter5_avx2(table_s, result, 8); + /* table[16] = a_inv^16 */ + rsaz_1024_sqr_avx2(result, result, m, k0, 1); + rsaz_1024_scatter5_avx2(table_s, result, 16); + /* table[17] = a_inv^17 */ + rsaz_1024_mul_avx2(result, result, a_inv, m, k0); + rsaz_1024_scatter5_avx2(table_s, result, 17); + + /* table[3] */ + rsaz_1024_gather5_avx2(result, table_s, 2); + rsaz_1024_mul_avx2(result, result, a_inv, m, k0); + rsaz_1024_scatter5_avx2(table_s, result, 3); + /* table[6] */ + rsaz_1024_sqr_avx2(result, result, m, k0, 1); + rsaz_1024_scatter5_avx2(table_s, result, 6); + /* table[12] */ + rsaz_1024_sqr_avx2(result, result, m, k0, 1); + rsaz_1024_scatter5_avx2(table_s, result, 12); + /* table[24] */ + rsaz_1024_sqr_avx2(result, result, m, k0, 1); + rsaz_1024_scatter5_avx2(table_s, result, 24); + /* table[25] */ + rsaz_1024_mul_avx2(result, result, a_inv, m, k0); + rsaz_1024_scatter5_avx2(table_s, result, 25); + + /* table[5] */ + rsaz_1024_gather5_avx2(result, table_s, 4); + rsaz_1024_mul_avx2(result, result, a_inv, m, k0); + rsaz_1024_scatter5_avx2(table_s, result, 5); + /* table[10] */ + rsaz_1024_sqr_avx2(result, result, m, k0, 1); + rsaz_1024_scatter5_avx2(table_s, result, 10); + /* table[20] */ + rsaz_1024_sqr_avx2(result, result, m, k0, 1); + rsaz_1024_scatter5_avx2(table_s, result, 20); + /* table[21] */ + rsaz_1024_mul_avx2(result, result, a_inv, m, k0); + rsaz_1024_scatter5_avx2(table_s, result, 21); + + /* table[7] */ + rsaz_1024_gather5_avx2(result, table_s, 6); + rsaz_1024_mul_avx2(result, result, a_inv, m, k0); + rsaz_1024_scatter5_avx2(table_s, result, 7); + /* table[14] */ + rsaz_1024_sqr_avx2(result, result, m, k0, 1); + rsaz_1024_scatter5_avx2(table_s, result, 14); + /* table[28] */ + rsaz_1024_sqr_avx2(result, result, m, k0, 1); + rsaz_1024_scatter5_avx2(table_s, result, 28); + /* table[29] */ + rsaz_1024_mul_avx2(result, result, a_inv, m, k0); + rsaz_1024_scatter5_avx2(table_s, result, 29); + + /* table[9] */ + rsaz_1024_gather5_avx2(result, table_s, 8); + rsaz_1024_mul_avx2(result, result, a_inv, m, k0); + rsaz_1024_scatter5_avx2(table_s, result, 9); + /* table[18] */ + rsaz_1024_sqr_avx2(result, result, m, k0, 1); + rsaz_1024_scatter5_avx2(table_s, result, 18); + /* table[19] */ + rsaz_1024_mul_avx2(result, result, a_inv, m, k0); + rsaz_1024_scatter5_avx2(table_s, result, 19); + + /* table[11] */ + rsaz_1024_gather5_avx2(result, table_s, 10); + rsaz_1024_mul_avx2(result, result, a_inv, m, k0); + rsaz_1024_scatter5_avx2(table_s, result, 11); + /* table[22] */ + rsaz_1024_sqr_avx2(result, result, m, k0, 1); + rsaz_1024_scatter5_avx2(table_s, result, 22); + /* table[23] */ + rsaz_1024_mul_avx2(result, result, a_inv, m, k0); + rsaz_1024_scatter5_avx2(table_s, result, 23); + + /* table[13] */ + rsaz_1024_gather5_avx2(result, table_s, 12); + rsaz_1024_mul_avx2(result, result, a_inv, m, k0); + rsaz_1024_scatter5_avx2(table_s, result, 13); + /* table[26] */ + rsaz_1024_sqr_avx2(result, result, m, k0, 1); + rsaz_1024_scatter5_avx2(table_s, result, 26); + /* table[27] */ + rsaz_1024_mul_avx2(result, result, a_inv, m, k0); + rsaz_1024_scatter5_avx2(table_s, result, 27); + + /* table[15] */ + rsaz_1024_gather5_avx2(result, table_s, 14); + rsaz_1024_mul_avx2(result, result, a_inv, m, k0); + rsaz_1024_scatter5_avx2(table_s, result, 15); + /* table[30] */ + rsaz_1024_sqr_avx2(result, result, m, k0, 1); + rsaz_1024_scatter5_avx2(table_s, result, 30); + /* table[31] */ + rsaz_1024_mul_avx2(result, result, a_inv, m, k0); + rsaz_1024_scatter5_avx2(table_s, result, 31); +#endif + + /* load first window */ + p_str = (unsigned char *)exponent; + wvalue = p_str[127] >> 3; + rsaz_1024_gather5_avx2(result, table_s, wvalue); + + index = 1014; + + while (index > -1) { /* loop for the remaining 127 windows */ + + rsaz_1024_sqr_avx2(result, result, m, k0, 5); + + wvalue = *((unsigned short *)&p_str[index / 8]); + wvalue = (wvalue >> (index % 8)) & 31; + index -= 5; + + rsaz_1024_gather5_avx2(a_inv, table_s, wvalue); /* borrow a_inv */ + rsaz_1024_mul_avx2(result, result, a_inv, m, k0); + } + + /* square four times */ + rsaz_1024_sqr_avx2(result, result, m, k0, 4); + + wvalue = p_str[0] & 15; + + rsaz_1024_gather5_avx2(a_inv, table_s, wvalue); /* borrow a_inv */ + rsaz_1024_mul_avx2(result, result, a_inv, m, k0); + + /* from Montgomery */ + rsaz_1024_mul_avx2(result, result, one, m, k0); + + rsaz_1024_red2norm_avx2(result_norm, result); + + OPENSSL_cleanse(storage, sizeof(storage)); +} + +/* + * See crypto/bn/rsaz-x86_64.pl for further details. + */ +void rsaz_512_mul(void *ret, const void *a, const void *b, const void *n, + BN_ULONG k); +void rsaz_512_mul_scatter4(void *ret, const void *a, const void *n, + BN_ULONG k, const void *tbl, unsigned int power); +void rsaz_512_mul_gather4(void *ret, const void *a, const void *tbl, + const void *n, BN_ULONG k, unsigned int power); +void rsaz_512_mul_by_one(void *ret, const void *a, const void *n, BN_ULONG k); +void rsaz_512_sqr(void *ret, const void *a, const void *n, BN_ULONG k, + int cnt); +void rsaz_512_scatter4(void *tbl, const BN_ULONG *val, int power); +void rsaz_512_gather4(BN_ULONG *val, const void *tbl, int power); + +void RSAZ_512_mod_exp(BN_ULONG result[8], + const BN_ULONG base[8], const BN_ULONG exponent[8], + const BN_ULONG m[8], BN_ULONG k0, const BN_ULONG RR[8]) +{ + unsigned char storage[16 * 8 * 8 + 64 * 2 + 64]; /* 1.2KB */ + unsigned char *table = storage + (64 - ((size_t)storage % 64)); + BN_ULONG *a_inv = (BN_ULONG *)(table + 16 * 8 * 8); + BN_ULONG *temp = (BN_ULONG *)(table + 16 * 8 * 8 + 8 * 8); + unsigned char *p_str = (unsigned char *)exponent; + int index; + unsigned int wvalue; + + /* table[0] = 1_inv */ + temp[0] = 0 - m[0]; + temp[1] = ~m[1]; + temp[2] = ~m[2]; + temp[3] = ~m[3]; + temp[4] = ~m[4]; + temp[5] = ~m[5]; + temp[6] = ~m[6]; + temp[7] = ~m[7]; + rsaz_512_scatter4(table, temp, 0); + + /* table [1] = a_inv^1 */ + rsaz_512_mul(a_inv, base, RR, m, k0); + rsaz_512_scatter4(table, a_inv, 1); + + /* table [2] = a_inv^2 */ + rsaz_512_sqr(temp, a_inv, m, k0, 1); + rsaz_512_scatter4(table, temp, 2); + + for (index = 3; index < 16; index++) + rsaz_512_mul_scatter4(temp, a_inv, m, k0, table, index); + + /* load first window */ + wvalue = p_str[63]; + + rsaz_512_gather4(temp, table, wvalue >> 4); + rsaz_512_sqr(temp, temp, m, k0, 4); + rsaz_512_mul_gather4(temp, temp, table, m, k0, wvalue & 0xf); + + for (index = 62; index >= 0; index--) { + wvalue = p_str[index]; + + rsaz_512_sqr(temp, temp, m, k0, 4); + rsaz_512_mul_gather4(temp, temp, table, m, k0, wvalue >> 4); + + rsaz_512_sqr(temp, temp, m, k0, 4); + rsaz_512_mul_gather4(temp, temp, table, m, k0, wvalue & 0x0f); + } + + /* from Montgomery */ + rsaz_512_mul_by_one(result, temp, m, k0); + + OPENSSL_cleanse(storage, sizeof(storage)); +} + +#else + +# if defined(PEDANTIC) || defined(__DECC) || defined(__clang__) +static void *dummy = &dummy; +# endif + +#endif diff --git a/crypto/bn/rsaz_exp.h b/crypto/bn/rsaz_exp.h new file mode 100644 index 0000000..33361de --- /dev/null +++ b/crypto/bn/rsaz_exp.h @@ -0,0 +1,56 @@ +/****************************************************************************** +* Copyright(c) 2012, Intel Corp. +* Developers and authors: +* Shay Gueron (1, 2), and Vlad Krasnov (1) +* (1) Intel Corporation, Israel Development Center, Haifa, Israel +* (2) University of Haifa, Israel +****************************************************************************** +* LICENSE: +* This submission to OpenSSL is to be made available under the OpenSSL +* license, and only to the OpenSSL project, in order to allow integration +* into the publicly distributed code. +* The use of this code, or portions of this code, or concepts embedded in +* this code, or modification of this code and/or algorithm(s) in it, or the +* use of this code for any other purpose than stated above, requires special +* licensing. +****************************************************************************** +* DISCLAIMER: +* THIS SOFTWARE IS PROVIDED BY THE CONTRIBUTORS AND THE COPYRIGHT OWNERS +* ``AS IS''. ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS OR THE COPYRIGHT +* OWNERS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +* OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +* POSSIBILITY OF SUCH DAMAGE. +******************************************************************************/ + +#ifndef RSAZ_EXP_H +# define RSAZ_EXP_H + +# undef RSAZ_ENABLED +# if defined(OPENSSL_BN_ASM_MONT) && \ + (defined(__x86_64) || defined(__x86_64__) || \ + defined(_M_AMD64) || defined(_M_X64)) +# define RSAZ_ENABLED + +# include <openssl/bn.h> + +void RSAZ_1024_mod_exp_avx2(BN_ULONG result[16], + const BN_ULONG base_norm[16], + const BN_ULONG exponent[16], + const BN_ULONG m_norm[16], const BN_ULONG RR[16], + BN_ULONG k0); +int rsaz_avx2_eligible(); + +void RSAZ_512_mod_exp(BN_ULONG result[8], + const BN_ULONG base_norm[8], const BN_ULONG exponent[8], + const BN_ULONG m_norm[8], BN_ULONG k0, + const BN_ULONG RR[8]); + +# endif + +#endif |