| author | simon <simon@FreeBSD.org> | 2009-06-07 19:56:18 +0000 |
|---|---|---|
| committer | simon <simon@FreeBSD.org> | 2009-06-07 19:56:18 +0000 |
| commit | fc5b6d55de4ea73602809deec0b965745f83a804 (patch) | |
| tree | 873f9f63931a5d069bbcb053ea49eec4a92802dc | /crypto/bn/asm |
| parent | 212fba63d3a2fed2e6ca619035a42526db27a3b8 (diff) | |
Import OpenSSL 0.9.8k.
Diffstat (limited to 'crypto/bn/asm')
| mode | file | insertions |
|---|---|---|
| -rwxr-xr-x | crypto/bn/asm/alpha-mont.pl | 317 |
| -rwxr-xr-x | crypto/bn/asm/armv4-mont.pl | 200 |
| -rwxr-xr-x | crypto/bn/asm/mips3-mont.pl | 327 |
| -rwxr-xr-x | crypto/bn/asm/ppc-mont.pl | 323 |
| -rwxr-xr-x | crypto/bn/asm/ppc64-mont.pl | 918 |
| -rwxr-xr-x | crypto/bn/asm/s390x-mont.pl | 225 |
| -rwxr-xr-x | crypto/bn/asm/s390x.S | 678 |
| -rwxr-xr-x | crypto/bn/asm/sparcv9-mont.pl | 606 |
| -rwxr-xr-x | crypto/bn/asm/sparcv9a-mont.pl | 882 |
| -rwxr-xr-x | crypto/bn/asm/via-mont.pl | 242 |
| -rwxr-xr-x | crypto/bn/asm/x86-mont.pl | 591 |
11 files changed, 5309 insertions, 0 deletions
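All of the files below add assembly (or assembly-generating Perl) implementations of OpenSSL's word-level Montgomery multiplication, `bn_mul_mont(rp, ap, bp, np, n0, num)`, which computes rp = ap·bp·R⁻¹ mod np with R = 2^(BN_BITS2·num). For orientation, here is a minimal C reference sketch of that algorithm — not code from this import. The function name `mont_mul`, the caller-supplied scratch buffer `tp`, and the use of `unsigned __int128` (a GCC/Clang extension) as a stand-in for the umulh/mulhdu/mlgr-style high-half multiply instructions are all illustrative assumptions.

```c
/* Reference sketch of word-level Montgomery multiplication:
 *   rp[] = ap[] * bp[] * R^-1 mod np[],  R = 2^(64*num),
 * where n0 = -np[0]^-1 mod 2^64 is precomputed by the caller and
 * tp[] is a caller-supplied scratch buffer of num+1 words. */
#include <stdint.h>
#include <string.h>

typedef uint64_t bn_word;

static void mont_mul(bn_word *rp, const bn_word *ap, const bn_word *bp,
                     const bn_word *np, bn_word n0, int num, bn_word *tp)
{
    memset(tp, 0, (size_t)(num + 1) * sizeof(bn_word));

    for (int i = 0; i < num; i++) {
        unsigned __int128 t;
        bn_word carry = 0, ovf;

        /* tp += ap * bp[i] */
        for (int j = 0; j < num; j++) {
            t = (unsigned __int128)ap[j] * bp[i] + tp[j] + carry;
            tp[j] = (bn_word)t;
            carry = (bn_word)(t >> 64);
        }
        t = (unsigned __int128)tp[num] + carry;
        tp[num] = (bn_word)t;
        ovf = (bn_word)(t >> 64);           /* "upmost overflow bit" */

        /* m1 = tp[0]*n0 makes tp + np*m1 divisible by 2^64 */
        bn_word m1 = tp[0] * n0;
        carry = 0;
        for (int j = 0; j < num; j++) {
            t = (unsigned __int128)np[j] * m1 + tp[j] + carry;
            tp[j] = (bn_word)t;             /* tp[0] becomes 0 here */
            carry = (bn_word)(t >> 64);
        }
        t = (unsigned __int128)tp[num] + carry;
        tp[num] = (bn_word)t;
        ovf += (bn_word)(t >> 64);

        /* divide by 2^64: drop the now-zero low word */
        memmove(tp, tp + 1, (size_t)num * sizeof(bn_word));
        tp[num] = ovf;
    }

    /* conditional subtraction -- the .Lsub/.Lcopy tails in the assembly */
    bn_word borrow = 0;
    for (int j = 0; j < num; j++) {
        unsigned __int128 d = (unsigned __int128)tp[j] - np[j] - borrow;
        rp[j] = (bn_word)d;
        borrow = (bn_word)(d >> 64) & 1;
    }
    bn_word mask = (bn_word)0 - (bn_word)(borrow > tp[num]); /* tp < np? */
    for (int j = 0; j < num; j++)           /* rp = borrow ? tp : rp */
        rp[j] = (tp[j] & mask) | (rp[j] & ~mask);
}
```

The one-word shift per outer iteration is what the assembly achieves implicitly by walking `tp` with post-incremented stores, and the final masked select corresponds to the borrow-driven `ap=borrow?tp:rp` idiom that appears in each module's epilogue.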
diff --git a/crypto/bn/asm/alpha-mont.pl b/crypto/bn/asm/alpha-mont.pl new file mode 100755 index 0000000..7a2cc31 --- /dev/null +++ b/crypto/bn/asm/alpha-mont.pl @@ -0,0 +1,317 @@ +#!/usr/bin/env perl +# +# ==================================================================== +# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# On 21264 RSA sign performance improves by 70/35/20/15 percent for +# 512/1024/2048/4096 bit key lengths. This is against vendor compiler +# instructed to '-tune host' code with in-line assembler. Other +# benchmarks improve by 15-20%. To anchor it to something else, the +# code provides approximately the same performance per GHz as AMD64. +# I.e. if you compare 1GHz 21264 and 2GHz Opteron, you'll observe ~2x +# difference. + +# int bn_mul_mont( +$rp="a0"; # BN_ULONG *rp, +$ap="a1"; # const BN_ULONG *ap, +$bp="a2"; # const BN_ULONG *bp, +$np="a3"; # const BN_ULONG *np, +$n0="a4"; # const BN_ULONG *n0, +$num="a5"; # int num); + +$lo0="t0"; +$hi0="t1"; +$lo1="t2"; +$hi1="t3"; +$aj="t4"; +$bi="t5"; +$nj="t6"; +$tp="t7"; +$alo="t8"; +$ahi="t9"; +$nlo="t10"; +$nhi="t11"; +$tj="t12"; +$i="s3"; +$j="s4"; +$m1="s5"; + +$code=<<___; +#include <asm.h> +#include <regdef.h> + +.text + +.set noat +.set noreorder + +.globl bn_mul_mont +.align 5 +.ent bn_mul_mont +bn_mul_mont: + lda sp,-40(sp) + stq ra,0(sp) + stq s3,8(sp) + stq s4,16(sp) + stq s5,24(sp) + stq fp,32(sp) + mov sp,fp + .mask 0x0400f000,-40 + .frame fp,40,ra + .prologue 0 + + .align 4 + .set reorder + sextl $num,$num + mov 0,v0 + cmplt $num,4,AT + bne AT,.Lexit + + ldq $hi0,0($ap) # ap[0] + s8addq $num,16,AT + ldq $aj,8($ap) + subq sp,AT,sp + ldq $bi,0($bp) # bp[0] + mov -4096,AT + ldq $n0,0($n0) + and sp,AT,sp + + mulq $hi0,$bi,$lo0 + ldq $hi1,0($np) # np[0] + umulh $hi0,$bi,$hi0 + ldq $nj,8($np) + + mulq $lo0,$n0,$m1 + + mulq $hi1,$m1,$lo1 + umulh $hi1,$m1,$hi1 + + addq $lo1,$lo0,$lo1 + cmpult $lo1,$lo0,AT + addq $hi1,AT,$hi1 + + mulq $aj,$bi,$alo + mov 2,$j + umulh $aj,$bi,$ahi + mov sp,$tp + + mulq $nj,$m1,$nlo + s8addq $j,$ap,$aj + umulh $nj,$m1,$nhi + s8addq $j,$np,$nj +.align 4 +.L1st: + .set noreorder + ldq $aj,($aj) + addl $j,1,$j + ldq $nj,($nj) + lda $tp,8($tp) + + addq $alo,$hi0,$lo0 + mulq $aj,$bi,$alo + cmpult $lo0,$hi0,AT + addq $nlo,$hi1,$lo1 + + mulq $nj,$m1,$nlo + addq $ahi,AT,$hi0 + cmpult $lo1,$hi1,v0 + cmplt $j,$num,$tj + + umulh $aj,$bi,$ahi + addq $nhi,v0,$hi1 + addq $lo1,$lo0,$lo1 + s8addq $j,$ap,$aj + + umulh $nj,$m1,$nhi + cmpult $lo1,$lo0,v0 + addq $hi1,v0,$hi1 + s8addq $j,$np,$nj + + stq $lo1,-8($tp) + nop + unop + bne $tj,.L1st + .set reorder + + addq $alo,$hi0,$lo0 + addq $nlo,$hi1,$lo1 + cmpult $lo0,$hi0,AT + cmpult $lo1,$hi1,v0 + addq $ahi,AT,$hi0 + addq $nhi,v0,$hi1 + + addq $lo1,$lo0,$lo1 + cmpult $lo1,$lo0,v0 + addq $hi1,v0,$hi1 + + stq $lo1,0($tp) + + addq $hi1,$hi0,$hi1 + cmpult $hi1,$hi0,AT + stq $hi1,8($tp) + stq AT,16($tp) + + mov 1,$i +.align 4 +.Louter: + s8addq $i,$bp,$bi + ldq $hi0,($ap) + ldq $aj,8($ap) + ldq $bi,($bi) + ldq $hi1,($np) + ldq $nj,8($np) + ldq $tj,(sp) + + mulq $hi0,$bi,$lo0 + umulh $hi0,$bi,$hi0 + + addq $lo0,$tj,$lo0 + cmpult $lo0,$tj,AT + addq $hi0,AT,$hi0 + + mulq $lo0,$n0,$m1 + + mulq $hi1,$m1,$lo1 + umulh $hi1,$m1,$hi1 + + addq $lo1,$lo0,$lo1 + cmpult $lo1,$lo0,AT + mov 
2,$j + addq $hi1,AT,$hi1 + + mulq $aj,$bi,$alo + mov sp,$tp + umulh $aj,$bi,$ahi + + mulq $nj,$m1,$nlo + s8addq $j,$ap,$aj + umulh $nj,$m1,$nhi +.align 4 +.Linner: + .set noreorder + ldq $tj,8($tp) #L0 + nop #U1 + ldq $aj,($aj) #L1 + s8addq $j,$np,$nj #U0 + + ldq $nj,($nj) #L0 + nop #U1 + addq $alo,$hi0,$lo0 #L1 + lda $tp,8($tp) + + mulq $aj,$bi,$alo #U1 + cmpult $lo0,$hi0,AT #L0 + addq $nlo,$hi1,$lo1 #L1 + addl $j,1,$j + + mulq $nj,$m1,$nlo #U1 + addq $ahi,AT,$hi0 #L0 + addq $lo0,$tj,$lo0 #L1 + cmpult $lo1,$hi1,v0 #U0 + + umulh $aj,$bi,$ahi #U1 + cmpult $lo0,$tj,AT #L0 + addq $lo1,$lo0,$lo1 #L1 + addq $nhi,v0,$hi1 #U0 + + umulh $nj,$m1,$nhi #U1 + s8addq $j,$ap,$aj #L0 + cmpult $lo1,$lo0,v0 #L1 + cmplt $j,$num,$tj #U0 # borrow $tj + + addq $hi0,AT,$hi0 #L0 + addq $hi1,v0,$hi1 #U1 + stq $lo1,-8($tp) #L1 + bne $tj,.Linner #U0 + .set reorder + + ldq $tj,8($tp) + addq $alo,$hi0,$lo0 + addq $nlo,$hi1,$lo1 + cmpult $lo0,$hi0,AT + cmpult $lo1,$hi1,v0 + addq $ahi,AT,$hi0 + addq $nhi,v0,$hi1 + + addq $lo0,$tj,$lo0 + cmpult $lo0,$tj,AT + addq $hi0,AT,$hi0 + + ldq $tj,16($tp) + addq $lo1,$lo0,$j + cmpult $j,$lo0,v0 + addq $hi1,v0,$hi1 + + addq $hi1,$hi0,$lo1 + stq $j,($tp) + cmpult $lo1,$hi0,$hi1 + addq $lo1,$tj,$lo1 + cmpult $lo1,$tj,AT + addl $i,1,$i + addq $hi1,AT,$hi1 + stq $lo1,8($tp) + cmplt $i,$num,$tj # borrow $tj + stq $hi1,16($tp) + bne $tj,.Louter + + s8addq $num,sp,$tj # &tp[num] + mov $rp,$bp # put rp aside + mov sp,$tp + mov sp,$ap + mov 0,$hi0 # clear borrow bit + +.align 4 +.Lsub: ldq $lo0,($tp) + ldq $lo1,($np) + lda $tp,8($tp) + lda $np,8($np) + subq $lo0,$lo1,$lo1 # tp[i]-np[i] + cmpult $lo0,$lo1,AT + subq $lo1,$hi0,$lo0 + cmpult $lo1,$lo0,$hi0 + or $hi0,AT,$hi0 + stq $lo0,($rp) + cmpult $tp,$tj,v0 + lda $rp,8($rp) + bne v0,.Lsub + + subq $hi1,$hi0,$hi0 # handle upmost overflow bit + mov sp,$tp + mov $bp,$rp # restore rp + + and sp,$hi0,$ap + bic $bp,$hi0,$bp + bis $bp,$ap,$ap # ap=borrow?tp:rp + +.align 4 +.Lcopy: ldq $aj,($ap) # copy or in-place refresh + lda $tp,8($tp) + lda $rp,8($rp) + lda $ap,8($ap) + stq zero,-8($tp) # zap tp + cmpult $tp,$tj,AT + stq $aj,-8($rp) + bne AT,.Lcopy + mov 1,v0 + +.Lexit: + .set noreorder + mov fp,sp + /*ldq ra,0(sp)*/ + ldq s3,8(sp) + ldq s4,16(sp) + ldq s5,24(sp) + ldq fp,32(sp) + lda sp,40(sp) + ret (ra) +.end bn_mul_mont +.rdata +.asciiz "Montgomery Multiplication for Alpha, CRYPTOGAMS by <appro\@openssl.org>" +___ + +print $code; +close STDOUT; diff --git a/crypto/bn/asm/armv4-mont.pl b/crypto/bn/asm/armv4-mont.pl new file mode 100755 index 0000000..05d5dc1 --- /dev/null +++ b/crypto/bn/asm/armv4-mont.pl @@ -0,0 +1,200 @@ +#!/usr/bin/env perl + +# ==================================================================== +# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== + +# January 2007. + +# Montgomery multiplication for ARMv4. +# +# Performance improvement naturally varies among CPU implementations +# and compilers. The code was observed to provide +65-35% improvement +# [depending on key length, less for longer keys] on ARM920T, and +# +115-80% on Intel IXP425. This is compared to pre-bn_mul_mont code +# base and compiler generated code with in-lined umull and even umlal +# instructions. 
The latter means that this code didn't really have an +# "advantage" of utilizing some "secret" instruction. +# +# The code is interoperable with Thumb ISA and is rather compact, less +# than 1/2KB. Windows CE port would be trivial, as it's exclusively +# about decorations, ABI and instruction syntax are identical. + +$num="r0"; # starts as num argument, but holds &tp[num-1] +$ap="r1"; +$bp="r2"; $bi="r2"; $rp="r2"; +$np="r3"; +$tp="r4"; +$aj="r5"; +$nj="r6"; +$tj="r7"; +$n0="r8"; +########### # r9 is reserved by ELF as platform specific, e.g. TLS pointer +$alo="r10"; # sl, gcc uses it to keep @GOT +$ahi="r11"; # fp +$nlo="r12"; # ip +########### # r13 is stack pointer +$nhi="r14"; # lr +########### # r15 is program counter + +#### argument block layout relative to &tp[num-1], a.k.a. $num +$_rp="$num,#12*4"; +# ap permanently resides in r1 +$_bp="$num,#13*4"; +# np permanently resides in r3 +$_n0="$num,#14*4"; +$_num="$num,#15*4"; $_bpend=$_num; + +$code=<<___; +.text + +.global bn_mul_mont +.type bn_mul_mont,%function + +.align 2 +bn_mul_mont: + stmdb sp!,{r0,r2} @ sp points at argument block + ldr $num,[sp,#3*4] @ load num + cmp $num,#2 + movlt r0,#0 + addlt sp,sp,#2*4 + blt .Labrt + + stmdb sp!,{r4-r12,lr} @ save 10 registers + + mov $num,$num,lsl#2 @ rescale $num for byte count + sub sp,sp,$num @ alloca(4*num) + sub sp,sp,#4 @ +extra dword + sub $num,$num,#4 @ "num=num-1" + add $tp,$bp,$num @ &bp[num-1] + + add $num,sp,$num @ $num to point at &tp[num-1] + ldr $n0,[$_n0] @ &n0 + ldr $bi,[$bp] @ bp[0] + ldr $aj,[$ap],#4 @ ap[0],ap++ + ldr $nj,[$np],#4 @ np[0],np++ + ldr $n0,[$n0] @ *n0 + str $tp,[$_bpend] @ save &bp[num] + + umull $alo,$ahi,$aj,$bi @ ap[0]*bp[0] + str $n0,[$_n0] @ save n0 value + mul $n0,$alo,$n0 @ "tp[0]"*n0 + mov $nlo,#0 + umlal $alo,$nlo,$nj,$n0 @ np[0]*n0+"t[0]" + mov $tp,sp + +.L1st: + ldr $aj,[$ap],#4 @ ap[j],ap++ + mov $alo,$ahi + mov $ahi,#0 + umlal $alo,$ahi,$aj,$bi @ ap[j]*bp[0] + ldr $nj,[$np],#4 @ np[j],np++ + mov $nhi,#0 + umlal $nlo,$nhi,$nj,$n0 @ np[j]*n0 + adds $nlo,$nlo,$alo + str $nlo,[$tp],#4 @ tp[j-1]=,tp++ + adc $nlo,$nhi,#0 + cmp $tp,$num + bne .L1st + + adds $nlo,$nlo,$ahi + mov $nhi,#0 + adc $nhi,$nhi,#0 + ldr $tp,[$_bp] @ restore bp + str $nlo,[$num] @ tp[num-1]= + ldr $n0,[$_n0] @ restore n0 + str $nhi,[$num,#4] @ tp[num]= + +.Louter: + sub $tj,$num,sp @ "original" $num-1 value + sub $ap,$ap,$tj @ "rewind" ap to &ap[1] + sub $np,$np,$tj @ "rewind" np to &np[1] + ldr $bi,[$tp,#4]! 
@ *(++bp) + ldr $aj,[$ap,#-4] @ ap[0] + ldr $nj,[$np,#-4] @ np[0] + ldr $alo,[sp] @ tp[0] + ldr $tj,[sp,#4] @ tp[1] + + mov $ahi,#0 + umlal $alo,$ahi,$aj,$bi @ ap[0]*bp[i]+tp[0] + str $tp,[$_bp] @ save bp + mul $n0,$alo,$n0 + mov $nlo,#0 + umlal $alo,$nlo,$nj,$n0 @ np[0]*n0+"tp[0]" + mov $tp,sp + +.Linner: + ldr $aj,[$ap],#4 @ ap[j],ap++ + adds $alo,$ahi,$tj @ +=tp[j] + mov $ahi,#0 + umlal $alo,$ahi,$aj,$bi @ ap[j]*bp[i] + ldr $nj,[$np],#4 @ np[j],np++ + mov $nhi,#0 + umlal $nlo,$nhi,$nj,$n0 @ np[j]*n0 + ldr $tj,[$tp,#8] @ tp[j+1] + adc $ahi,$ahi,#0 + adds $nlo,$nlo,$alo + str $nlo,[$tp],#4 @ tp[j-1]=,tp++ + adc $nlo,$nhi,#0 + cmp $tp,$num + bne .Linner + + adds $nlo,$nlo,$ahi + mov $nhi,#0 + adc $nhi,$nhi,#0 + adds $nlo,$nlo,$tj + adc $nhi,$nhi,#0 + ldr $tp,[$_bp] @ restore bp + ldr $tj,[$_bpend] @ restore &bp[num] + str $nlo,[$num] @ tp[num-1]= + ldr $n0,[$_n0] @ restore n0 + str $nhi,[$num,#4] @ tp[num]= + + cmp $tp,$tj + bne .Louter + + ldr $rp,[$_rp] @ pull rp + add $num,$num,#4 @ $num to point at &tp[num] + sub $aj,$num,sp @ "original" num value + mov $tp,sp @ "rewind" $tp + mov $ap,$tp @ "borrow" $ap + sub $np,$np,$aj @ "rewind" $np to &np[0] + + subs $tj,$tj,$tj @ "clear" carry flag +.Lsub: ldr $tj,[$tp],#4 + ldr $nj,[$np],#4 + sbcs $tj,$tj,$nj @ tp[j]-np[j] + str $tj,[$rp],#4 @ rp[j]= + teq $tp,$num @ preserve carry + bne .Lsub + sbcs $nhi,$nhi,#0 @ upmost carry + mov $tp,sp @ "rewind" $tp + sub $rp,$rp,$aj @ "rewind" $rp + + and $ap,$tp,$nhi + bic $np,$rp,$nhi + orr $ap,$ap,$np @ ap=borrow?tp:rp + +.Lcopy: ldr $tj,[$ap],#4 @ copy or in-place refresh + str sp,[$tp],#4 @ zap tp + str $tj,[$rp],#4 + cmp $tp,$num + bne .Lcopy + + add sp,$num,#4 @ skip over tp[num+1] + ldmia sp!,{r4-r12,lr} @ restore registers + add sp,sp,#2*4 @ skip over {r0,r2} + mov r0,#1 +.Labrt: tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet + bx lr @ interoperable with Thumb ISA:-) +.size bn_mul_mont,.-bn_mul_mont +.asciz "Montgomery multiplication for ARMv4, CRYPTOGAMS by <appro\@openssl.org>" +___ + +$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4 +print $code; +close STDOUT; diff --git a/crypto/bn/asm/mips3-mont.pl b/crypto/bn/asm/mips3-mont.pl new file mode 100755 index 0000000..8f9156e --- /dev/null +++ b/crypto/bn/asm/mips3-mont.pl @@ -0,0 +1,327 @@ +#!/usr/bin/env perl +# +# ==================================================================== +# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== + +# This module doesn't present direct interest for OpenSSL, because it +# doesn't provide better performance for longer keys. While 512-bit +# RSA private key operations are 40% faster, 1024-bit ones are hardly +# faster at all, while longer key operations are slower by up to 20%. +# It might be of interest to embedded system developers though, as +# it's smaller than 1KB, yet offers ~3x improvement over compiler +# generated code. +# +# The module targets N32 and N64 MIPS ABIs and currently is a bit +# IRIX-centric, i.e. is likely to require adaptation for other OSes. 
+ +# int bn_mul_mont( +$rp="a0"; # BN_ULONG *rp, +$ap="a1"; # const BN_ULONG *ap, +$bp="a2"; # const BN_ULONG *bp, +$np="a3"; # const BN_ULONG *np, +$n0="a4"; # const BN_ULONG *n0, +$num="a5"; # int num); + +$lo0="a6"; +$hi0="a7"; +$lo1="v0"; +$hi1="v1"; +$aj="t0"; +$bi="t1"; +$nj="t2"; +$tp="t3"; +$alo="s0"; +$ahi="s1"; +$nlo="s2"; +$nhi="s3"; +$tj="s4"; +$i="s5"; +$j="s6"; +$fp="t8"; +$m1="t9"; + +$FRAME=8*(2+8); + +$code=<<___; +#include <asm.h> +#include <regdef.h> + +.text + +.set noat +.set reorder + +.align 5 +.globl bn_mul_mont +.ent bn_mul_mont +bn_mul_mont: + .set noreorder + PTR_SUB sp,64 + move $fp,sp + .frame $fp,64,ra + slt AT,$num,4 + li v0,0 + beqzl AT,.Lproceed + nop + jr ra + PTR_ADD sp,$fp,64 + .set reorder +.align 5 +.Lproceed: + ld $n0,0($n0) + ld $bi,0($bp) # bp[0] + ld $aj,0($ap) # ap[0] + ld $nj,0($np) # np[0] + PTR_SUB sp,16 # place for two extra words + sll $num,3 + li AT,-4096 + PTR_SUB sp,$num + and sp,AT + + sd s0,0($fp) + sd s1,8($fp) + sd s2,16($fp) + sd s3,24($fp) + sd s4,32($fp) + sd s5,40($fp) + sd s6,48($fp) + sd s7,56($fp) + + dmultu $aj,$bi + ld $alo,8($ap) + ld $nlo,8($np) + mflo $lo0 + mfhi $hi0 + dmultu $lo0,$n0 + mflo $m1 + + dmultu $alo,$bi + mflo $alo + mfhi $ahi + + dmultu $nj,$m1 + mflo $lo1 + mfhi $hi1 + dmultu $nlo,$m1 + daddu $lo1,$lo0 + sltu AT,$lo1,$lo0 + daddu $hi1,AT + mflo $nlo + mfhi $nhi + + move $tp,sp + li $j,16 +.align 4 +.L1st: + .set noreorder + PTR_ADD $aj,$ap,$j + ld $aj,($aj) + PTR_ADD $nj,$np,$j + ld $nj,($nj) + + dmultu $aj,$bi + daddu $lo0,$alo,$hi0 + daddu $lo1,$nlo,$hi1 + sltu AT,$lo0,$hi0 + sltu s7,$lo1,$hi1 + daddu $hi0,$ahi,AT + daddu $hi1,$nhi,s7 + mflo $alo + mfhi $ahi + + daddu $lo1,$lo0 + sltu AT,$lo1,$lo0 + dmultu $nj,$m1 + daddu $hi1,AT + addu $j,8 + sd $lo1,($tp) + sltu s7,$j,$num + mflo $nlo + mfhi $nhi + + bnez s7,.L1st + PTR_ADD $tp,8 + .set reorder + + daddu $lo0,$alo,$hi0 + sltu AT,$lo0,$hi0 + daddu $hi0,$ahi,AT + + daddu $lo1,$nlo,$hi1 + sltu s7,$lo1,$hi1 + daddu $hi1,$nhi,s7 + daddu $lo1,$lo0 + sltu AT,$lo1,$lo0 + daddu $hi1,AT + + sd $lo1,($tp) + + daddu $hi1,$hi0 + sltu AT,$hi1,$hi0 + sd $hi1,8($tp) + sd AT,16($tp) + + li $i,8 +.align 4 +.Louter: + PTR_ADD $bi,$bp,$i + ld $bi,($bi) + ld $aj,($ap) + ld $alo,8($ap) + ld $tj,(sp) + + dmultu $aj,$bi + ld $nj,($np) + ld $nlo,8($np) + mflo $lo0 + mfhi $hi0 + daddu $lo0,$tj + dmultu $lo0,$n0 + sltu AT,$lo0,$tj + daddu $hi0,AT + mflo $m1 + + dmultu $alo,$bi + mflo $alo + mfhi $ahi + + dmultu $nj,$m1 + mflo $lo1 + mfhi $hi1 + + dmultu $nlo,$m1 + daddu $lo1,$lo0 + sltu AT,$lo1,$lo0 + daddu $hi1,AT + mflo $nlo + mfhi $nhi + + move $tp,sp + li $j,16 + ld $tj,8($tp) +.align 4 +.Linner: + .set noreorder + PTR_ADD $aj,$ap,$j + ld $aj,($aj) + PTR_ADD $nj,$np,$j + ld $nj,($nj) + + dmultu $aj,$bi + daddu $lo0,$alo,$hi0 + daddu $lo1,$nlo,$hi1 + sltu AT,$lo0,$hi0 + sltu s7,$lo1,$hi1 + daddu $hi0,$ahi,AT + daddu $hi1,$nhi,s7 + mflo $alo + mfhi $ahi + + daddu $lo0,$tj + addu $j,8 + dmultu $nj,$m1 + sltu AT,$lo0,$tj + daddu $lo1,$lo0 + daddu $hi0,AT + sltu s7,$lo1,$lo0 + ld $tj,16($tp) + daddu $hi1,s7 + sltu AT,$j,$num + mflo $nlo + mfhi $nhi + sd $lo1,($tp) + bnez AT,.Linner + PTR_ADD $tp,8 + .set reorder + + daddu $lo0,$alo,$hi0 + sltu AT,$lo0,$hi0 + daddu $hi0,$ahi,AT + daddu $lo0,$tj + sltu s7,$lo0,$tj + daddu $hi0,s7 + + ld $tj,16($tp) + daddu $lo1,$nlo,$hi1 + sltu AT,$lo1,$hi1 + daddu $hi1,$nhi,AT + daddu $lo1,$lo0 + sltu s7,$lo1,$lo0 + daddu $hi1,s7 + sd $lo1,($tp) + + daddu $lo1,$hi1,$hi0 + sltu $hi1,$lo1,$hi0 + daddu $lo1,$tj + sltu AT,$lo1,$tj + daddu $hi1,AT + sd 
$lo1,8($tp) + sd $hi1,16($tp) + + addu $i,8 + sltu s7,$i,$num + bnez s7,.Louter + + .set noreorder + PTR_ADD $tj,sp,$num # &tp[num] + move $tp,sp + move $ap,sp + li $hi0,0 # clear borrow bit + +.align 4 +.Lsub: ld $lo0,($tp) + ld $lo1,($np) + PTR_ADD $tp,8 + PTR_ADD $np,8 + dsubu $lo1,$lo0,$lo1 # tp[i]-np[i] + sgtu AT,$lo1,$lo0 + dsubu $lo0,$lo1,$hi0 + sgtu $hi0,$lo0,$lo1 + sd $lo0,($rp) + or $hi0,AT + sltu AT,$tp,$tj + bnez AT,.Lsub + PTR_ADD $rp,8 + + dsubu $hi0,$hi1,$hi0 # handle upmost overflow bit + move $tp,sp + PTR_SUB $rp,$num # restore rp + not $hi1,$hi0 + + and $ap,$hi0,sp + and $bp,$hi1,$rp + or $ap,$ap,$bp # ap=borrow?tp:rp + +.align 4 +.Lcopy: ld $aj,($ap) + PTR_ADD $ap,8 + PTR_ADD $tp,8 + sd zero,-8($tp) + sltu AT,$tp,$tj + sd $aj,($rp) + bnez AT,.Lcopy + PTR_ADD $rp,8 + + ld s0,0($fp) + ld s1,8($fp) + ld s2,16($fp) + ld s3,24($fp) + ld s4,32($fp) + ld s5,40($fp) + ld s6,48($fp) + ld s7,56($fp) + li v0,1 + jr ra + PTR_ADD sp,$fp,64 + .set reorder +END(bn_mul_mont) +.rdata +.asciiz "Montgomery Multiplication for MIPS III/IV, CRYPTOGAMS by <appro\@openssl.org>" +___ + +print $code; +close STDOUT; diff --git a/crypto/bn/asm/ppc-mont.pl b/crypto/bn/asm/ppc-mont.pl new file mode 100755 index 0000000..7849eae --- /dev/null +++ b/crypto/bn/asm/ppc-mont.pl @@ -0,0 +1,323 @@ +#!/usr/bin/env perl + +# ==================================================================== +# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== + +# April 2006 + +# "Teaser" Montgomery multiplication module for PowerPC. It's possible +# to gain a bit more by modulo-scheduling outer loop, then dedicated +# squaring procedure should give further 20% and code can be adapted +# for 32-bit application running on 64-bit CPU. As for the latter. +# It won't be able to achieve "native" 64-bit performance, because in +# 32-bit application context every addc instruction will have to be +# expanded as addc, twice right shift by 32 and finally adde, etc. +# So far RSA *sign* performance improvement over pre-bn_mul_mont asm +# for 64-bit application running on PPC970/G5 is: +# +# 512-bit +65% +# 1024-bit +35% +# 2048-bit +18% +# 4096-bit +4% + +$flavour = shift; + +if ($flavour =~ /32/) { + $BITS= 32; + $BNSZ= $BITS/8; + $SIZE_T=4; + $RZONE= 224; + $FRAME= $SIZE_T*16; + + $LD= "lwz"; # load + $LDU= "lwzu"; # load and update + $LDX= "lwzx"; # load indexed + $ST= "stw"; # store + $STU= "stwu"; # store and update + $STX= "stwx"; # store indexed + $STUX= "stwux"; # store indexed and update + $UMULL= "mullw"; # unsigned multiply low + $UMULH= "mulhwu"; # unsigned multiply high + $UCMP= "cmplw"; # unsigned compare + $SHRI= "srwi"; # unsigned shift right by immediate + $PUSH= $ST; + $POP= $LD; +} elsif ($flavour =~ /64/) { + $BITS= 64; + $BNSZ= $BITS/8; + $SIZE_T=8; + $RZONE= 288; + $FRAME= $SIZE_T*16; + + # same as above, but 64-bit mnemonics... 
+ $LD= "ld"; # load + $LDU= "ldu"; # load and update + $LDX= "ldx"; # load indexed + $ST= "std"; # store + $STU= "stdu"; # store and update + $STX= "stdx"; # store indexed + $STUX= "stdux"; # store indexed and update + $UMULL= "mulld"; # unsigned multiply low + $UMULH= "mulhdu"; # unsigned multiply high + $UCMP= "cmpld"; # unsigned compare + $SHRI= "srdi"; # unsigned shift right by immediate + $PUSH= $ST; + $POP= $LD; +} else { die "nonsense $flavour"; } + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or +die "can't locate ppc-xlate.pl"; + +open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!"; + +$sp="r1"; +$toc="r2"; +$rp="r3"; $ovf="r3"; +$ap="r4"; +$bp="r5"; +$np="r6"; +$n0="r7"; +$num="r8"; +$rp="r9"; # $rp is reassigned +$aj="r10"; +$nj="r11"; +$tj="r12"; +# non-volatile registers +$i="r14"; +$j="r15"; +$tp="r16"; +$m0="r17"; +$m1="r18"; +$lo0="r19"; +$hi0="r20"; +$lo1="r21"; +$hi1="r22"; +$alo="r23"; +$ahi="r24"; +$nlo="r25"; +# +$nhi="r0"; + +$code=<<___; +.machine "any" +.text + +.globl .bn_mul_mont +.align 4 +.bn_mul_mont: + cmpwi $num,4 + mr $rp,r3 ; $rp is reassigned + li r3,0 + bltlr + + slwi $num,$num,`log($BNSZ)/log(2)` + li $tj,-4096 + addi $ovf,$num,`$FRAME+$RZONE` + subf $ovf,$ovf,$sp ; $sp-$ovf + and $ovf,$ovf,$tj ; minimize TLB usage + subf $ovf,$sp,$ovf ; $ovf-$sp + srwi $num,$num,`log($BNSZ)/log(2)` + $STUX $sp,$sp,$ovf + + $PUSH r14,`4*$SIZE_T`($sp) + $PUSH r15,`5*$SIZE_T`($sp) + $PUSH r16,`6*$SIZE_T`($sp) + $PUSH r17,`7*$SIZE_T`($sp) + $PUSH r18,`8*$SIZE_T`($sp) + $PUSH r19,`9*$SIZE_T`($sp) + $PUSH r20,`10*$SIZE_T`($sp) + $PUSH r21,`11*$SIZE_T`($sp) + $PUSH r22,`12*$SIZE_T`($sp) + $PUSH r23,`13*$SIZE_T`($sp) + $PUSH r24,`14*$SIZE_T`($sp) + $PUSH r25,`15*$SIZE_T`($sp) + + $LD $n0,0($n0) ; pull n0[0] value + addi $num,$num,-2 ; adjust $num for counter register + + $LD $m0,0($bp) ; m0=bp[0] + $LD $aj,0($ap) ; ap[0] + addi $tp,$sp,$FRAME + $UMULL $lo0,$aj,$m0 ; ap[0]*bp[0] + $UMULH $hi0,$aj,$m0 + + $LD $aj,$BNSZ($ap) ; ap[1] + $LD $nj,0($np) ; np[0] + + $UMULL $m1,$lo0,$n0 ; "tp[0]"*n0 + + $UMULL $alo,$aj,$m0 ; ap[1]*bp[0] + $UMULH $ahi,$aj,$m0 + + $UMULL $lo1,$nj,$m1 ; np[0]*m1 + $UMULH $hi1,$nj,$m1 + $LD $nj,$BNSZ($np) ; np[1] + addc $lo1,$lo1,$lo0 + addze $hi1,$hi1 + + $UMULL $nlo,$nj,$m1 ; np[1]*m1 + $UMULH $nhi,$nj,$m1 + + mtctr $num + li $j,`2*$BNSZ` +.align 4 +L1st: + $LDX $aj,$ap,$j ; ap[j] + addc $lo0,$alo,$hi0 + $LDX $nj,$np,$j ; np[j] + addze $hi0,$ahi + $UMULL $alo,$aj,$m0 ; ap[j]*bp[0] + addc $lo1,$nlo,$hi1 + $UMULH $ahi,$aj,$m0 + addze $hi1,$nhi + $UMULL $nlo,$nj,$m1 ; np[j]*m1 + addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[0] + $UMULH $nhi,$nj,$m1 + addze $hi1,$hi1 + $ST $lo1,0($tp) ; tp[j-1] + + addi $j,$j,$BNSZ ; j++ + addi $tp,$tp,$BNSZ ; tp++ + bdnz- L1st +;L1st + addc $lo0,$alo,$hi0 + addze $hi0,$ahi + + addc $lo1,$nlo,$hi1 + addze $hi1,$nhi + addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[0] + addze $hi1,$hi1 + $ST $lo1,0($tp) ; tp[j-1] + + li $ovf,0 + addc $hi1,$hi1,$hi0 + addze $ovf,$ovf ; upmost overflow bit + $ST $hi1,$BNSZ($tp) + + li $i,$BNSZ +.align 4 +Louter: + $LDX $m0,$bp,$i ; m0=bp[i] + $LD $aj,0($ap) ; ap[0] + addi $tp,$sp,$FRAME + $LD $tj,$FRAME($sp) ; tp[0] + $UMULL $lo0,$aj,$m0 ; ap[0]*bp[i] + $UMULH $hi0,$aj,$m0 + $LD $aj,$BNSZ($ap) ; ap[1] + $LD $nj,0($np) ; np[0] + addc $lo0,$lo0,$tj ; ap[0]*bp[i]+tp[0] + $UMULL $alo,$aj,$m0 ; ap[j]*bp[i] + addze $hi0,$hi0 + $UMULL $m1,$lo0,$n0 ; tp[0]*n0 + $UMULH $ahi,$aj,$m0 + $UMULL 
$lo1,$nj,$m1 ; np[0]*m1 + $UMULH $hi1,$nj,$m1 + $LD $nj,$BNSZ($np) ; np[1] + addc $lo1,$lo1,$lo0 + $UMULL $nlo,$nj,$m1 ; np[1]*m1 + addze $hi1,$hi1 + $UMULH $nhi,$nj,$m1 + + mtctr $num + li $j,`2*$BNSZ` +.align 4 +Linner: + $LDX $aj,$ap,$j ; ap[j] + addc $lo0,$alo,$hi0 + $LD $tj,$BNSZ($tp) ; tp[j] + addze $hi0,$ahi + $LDX $nj,$np,$j ; np[j] + addc $lo1,$nlo,$hi1 + $UMULL $alo,$aj,$m0 ; ap[j]*bp[i] + addze $hi1,$nhi + $UMULH $ahi,$aj,$m0 + addc $lo0,$lo0,$tj ; ap[j]*bp[i]+tp[j] + $UMULL $nlo,$nj,$m1 ; np[j]*m1 + addze $hi0,$hi0 + $UMULH $nhi,$nj,$m1 + addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[i]+tp[j] + addi $j,$j,$BNSZ ; j++ + addze $hi1,$hi1 + $ST $lo1,0($tp) ; tp[j-1] + addi $tp,$tp,$BNSZ ; tp++ + bdnz- Linner +;Linner + $LD $tj,$BNSZ($tp) ; tp[j] + addc $lo0,$alo,$hi0 + addze $hi0,$ahi + addc $lo0,$lo0,$tj ; ap[j]*bp[i]+tp[j] + addze $hi0,$hi0 + + addc $lo1,$nlo,$hi1 + addze $hi1,$nhi + addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[i]+tp[j] + addze $hi1,$hi1 + $ST $lo1,0($tp) ; tp[j-1] + + addic $ovf,$ovf,-1 ; move upmost overflow to XER[CA] + li $ovf,0 + adde $hi1,$hi1,$hi0 + addze $ovf,$ovf + $ST $hi1,$BNSZ($tp) +; + slwi $tj,$num,`log($BNSZ)/log(2)` + $UCMP $i,$tj + addi $i,$i,$BNSZ + ble- Louter + + addi $num,$num,2 ; restore $num + subfc $j,$j,$j ; j=0 and "clear" XER[CA] + addi $tp,$sp,$FRAME + mtctr $num + +.align 4 +Lsub: $LDX $tj,$tp,$j + $LDX $nj,$np,$j + subfe $aj,$nj,$tj ; tp[j]-np[j] + $STX $aj,$rp,$j + addi $j,$j,$BNSZ + bdnz- Lsub + + li $j,0 + mtctr $num + subfe $ovf,$j,$ovf ; handle upmost overflow bit + and $ap,$tp,$ovf + andc $np,$rp,$ovf + or $ap,$ap,$np ; ap=borrow?tp:rp + +.align 4 +Lcopy: ; copy or in-place refresh + $LDX $tj,$ap,$j + $STX $tj,$rp,$j + $STX $j,$tp,$j ; zap at once + addi $j,$j,$BNSZ + bdnz- Lcopy + + $POP r14,`4*$SIZE_T`($sp) + $POP r15,`5*$SIZE_T`($sp) + $POP r16,`6*$SIZE_T`($sp) + $POP r17,`7*$SIZE_T`($sp) + $POP r18,`8*$SIZE_T`($sp) + $POP r19,`9*$SIZE_T`($sp) + $POP r20,`10*$SIZE_T`($sp) + $POP r21,`11*$SIZE_T`($sp) + $POP r22,`12*$SIZE_T`($sp) + $POP r23,`13*$SIZE_T`($sp) + $POP r24,`14*$SIZE_T`($sp) + $POP r25,`15*$SIZE_T`($sp) + $POP $sp,0($sp) + li r3,1 + blr + .long 0 +.asciz "Montgomery Multiplication for PPC, CRYPTOGAMS by <appro\@fy.chalmers.se>" +___ + +$code =~ s/\`([^\`]*)\`/eval $1/gem; +print $code; +close STDOUT; diff --git a/crypto/bn/asm/ppc64-mont.pl b/crypto/bn/asm/ppc64-mont.pl new file mode 100755 index 0000000..3449b35 --- /dev/null +++ b/crypto/bn/asm/ppc64-mont.pl @@ -0,0 +1,918 @@ +#!/usr/bin/env perl + +# ==================================================================== +# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== + +# December 2007 + +# The reason for undertaken effort is basically following. Even though +# Power 6 CPU operates at incredible 4.7GHz clock frequency, its PKI +# performance was observed to be less than impressive, essentially as +# fast as 1.8GHz PPC970, or 2.6 times(!) slower than one would hope. +# Well, it's not surprising that IBM had to make some sacrifices to +# boost the clock frequency that much, but no overall improvement? +# Having observed how much difference did switching to FPU make on +# UltraSPARC, playing same stunt on Power 6 appeared appropriate... 
+# Unfortunately the resulting performance improvement is not as +# impressive, ~30%, and in absolute terms is still very far from what +# one would expect from 4.7GHz CPU. There is a chance that I'm doing +# something wrong, but in the lack of assembler level micro-profiling +# data or at least decent platform guide I can't tell... Or better +# results might be achieved with VMX... Anyway, this module provides +# *worse* performance on other PowerPC implementations, ~40-15% slower +# on PPC970 depending on key length and ~40% slower on Power 5 for all +# key lengths. As it's obviously inappropriate as "best all-round" +# alternative, it has to be complemented with run-time CPU family +# detection. Oh! It should also be noted that unlike other PowerPC +# implementation IALU ppc-mont.pl module performs *suboptimaly* on +# >=1024-bit key lengths on Power 6. It should also be noted that +# *everything* said so far applies to 64-bit builds! As far as 32-bit +# application executed on 64-bit CPU goes, this module is likely to +# become preferred choice, because it's easy to adapt it for such +# case and *is* faster than 32-bit ppc-mont.pl on *all* processors. + +# February 2008 + +# Micro-profiling assisted optimization results in ~15% improvement +# over original ppc64-mont.pl version, or overall ~50% improvement +# over ppc.pl module on Power 6. If compared to ppc-mont.pl on same +# Power 6 CPU, this module is 5-150% faster depending on key length, +# [hereafter] more for longer keys. But if compared to ppc-mont.pl +# on 1.8GHz PPC970, it's only 5-55% faster. Still far from impressive +# in absolute terms, but it's apparently the way Power 6 is... + +$flavour = shift; + +if ($flavour =~ /32/) { + $SIZE_T=4; + $RZONE= 224; + $FRAME= $SIZE_T*12+8*12; + $fname= "bn_mul_mont_ppc64"; + + $STUX= "stwux"; # store indexed and update + $PUSH= "stw"; + $POP= "lwz"; + die "not implemented yet"; +} elsif ($flavour =~ /64/) { + $SIZE_T=8; + $RZONE= 288; + $FRAME= $SIZE_T*12+8*12; + $fname= "bn_mul_mont"; + + # same as above, but 64-bit mnemonics... 
+ $STUX= "stdux"; # store indexed and update + $PUSH= "std"; + $POP= "ld"; +} else { die "nonsense $flavour"; } + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or +die "can't locate ppc-xlate.pl"; + +open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!"; + +$FRAME=($FRAME+63)&~63; +$TRANSFER=16*8; + +$carry="r0"; +$sp="r1"; +$toc="r2"; +$rp="r3"; $ovf="r3"; +$ap="r4"; +$bp="r5"; +$np="r6"; +$n0="r7"; +$num="r8"; +$rp="r9"; # $rp is reassigned +$tp="r10"; +$j="r11"; +$i="r12"; +# non-volatile registers +$nap_d="r14"; # interleaved ap and np in double format +$a0="r15"; # ap[0] +$t0="r16"; # temporary registers +$t1="r17"; +$t2="r18"; +$t3="r19"; +$t4="r20"; +$t5="r21"; +$t6="r22"; +$t7="r23"; + +# PPC offers enough register bank capacity to unroll inner loops twice +# +# ..A3A2A1A0 +# dcba +# ----------- +# A0a +# A0b +# A0c +# A0d +# A1a +# A1b +# A1c +# A1d +# A2a +# A2b +# A2c +# A2d +# A3a +# A3b +# A3c +# A3d +# ..a +# ..b +# +$ba="f0"; $bb="f1"; $bc="f2"; $bd="f3"; +$na="f4"; $nb="f5"; $nc="f6"; $nd="f7"; +$dota="f8"; $dotb="f9"; +$A0="f10"; $A1="f11"; $A2="f12"; $A3="f13"; +$N0="f14"; $N1="f15"; $N2="f16"; $N3="f17"; +$T0a="f18"; $T0b="f19"; +$T1a="f20"; $T1b="f21"; +$T2a="f22"; $T2b="f23"; +$T3a="f24"; $T3b="f25"; + +# sp----------->+-------------------------------+ +# | saved sp | +# +-------------------------------+ +# | | +# +-------------------------------+ +# | 10 saved gpr, r14-r23 | +# . . +# . . +# +12*size_t +-------------------------------+ +# | 12 saved fpr, f14-f25 | +# . . +# . . +# +12*8 +-------------------------------+ +# | padding to 64 byte boundary | +# . . +# +X +-------------------------------+ +# | 16 gpr<->fpr transfer zone | +# . . +# . . +# +16*8 +-------------------------------+ +# | __int64 tmp[-1] | +# +-------------------------------+ +# | __int64 tmp[num] | +# . . +# . . +# . . +# +(num+1)*8 +-------------------------------+ +# | padding to 64 byte boundary | +# . . +# +X +-------------------------------+ +# | double nap_d[4*num] | +# . . +# . . +# . . +# +-------------------------------+ + +$code=<<___; +.machine "any" +.text + +.globl .$fname +.align 5 +.$fname: + cmpwi $num,4 + mr $rp,r3 ; $rp is reassigned + li r3,0 ; possible "not handled" return code + bltlr- + andi. r0,$num,1 ; $num has to be even + bnelr- + + slwi $num,$num,3 ; num*=8 + li $i,-4096 + slwi $tp,$num,2 ; place for {an}p_{lh}[num], i.e. 
4*num + add $tp,$tp,$num ; place for tp[num+1] + addi $tp,$tp,`$FRAME+$TRANSFER+8+64+$RZONE` + subf $tp,$tp,$sp ; $sp-$tp + and $tp,$tp,$i ; minimize TLB usage + subf $tp,$sp,$tp ; $tp-$sp + $STUX $sp,$sp,$tp ; alloca + + $PUSH r14,`2*$SIZE_T`($sp) + $PUSH r15,`3*$SIZE_T`($sp) + $PUSH r16,`4*$SIZE_T`($sp) + $PUSH r17,`5*$SIZE_T`($sp) + $PUSH r18,`6*$SIZE_T`($sp) + $PUSH r19,`7*$SIZE_T`($sp) + $PUSH r20,`8*$SIZE_T`($sp) + $PUSH r21,`9*$SIZE_T`($sp) + $PUSH r22,`10*$SIZE_T`($sp) + $PUSH r23,`11*$SIZE_T`($sp) + stfd f14,`12*$SIZE_T+0`($sp) + stfd f15,`12*$SIZE_T+8`($sp) + stfd f16,`12*$SIZE_T+16`($sp) + stfd f17,`12*$SIZE_T+24`($sp) + stfd f18,`12*$SIZE_T+32`($sp) + stfd f19,`12*$SIZE_T+40`($sp) + stfd f20,`12*$SIZE_T+48`($sp) + stfd f21,`12*$SIZE_T+56`($sp) + stfd f22,`12*$SIZE_T+64`($sp) + stfd f23,`12*$SIZE_T+72`($sp) + stfd f24,`12*$SIZE_T+80`($sp) + stfd f25,`12*$SIZE_T+88`($sp) + + ld $a0,0($ap) ; pull ap[0] value + ld $n0,0($n0) ; pull n0[0] value + ld $t3,0($bp) ; bp[0] + + addi $tp,$sp,`$FRAME+$TRANSFER+8+64` + li $i,-64 + add $nap_d,$tp,$num + and $nap_d,$nap_d,$i ; align to 64 bytes + + mulld $t7,$a0,$t3 ; ap[0]*bp[0] + ; nap_d is off by 1, because it's used with stfdu/lfdu + addi $nap_d,$nap_d,-8 + srwi $j,$num,`3+1` ; counter register, num/2 + mulld $t7,$t7,$n0 ; tp[0]*n0 + addi $j,$j,-1 + addi $tp,$sp,`$FRAME+$TRANSFER-8` + li $carry,0 + mtctr $j + + ; transfer bp[0] to FPU as 4x16-bit values + extrdi $t0,$t3,16,48 + extrdi $t1,$t3,16,32 + extrdi $t2,$t3,16,16 + extrdi $t3,$t3,16,0 + std $t0,`$FRAME+0`($sp) + std $t1,`$FRAME+8`($sp) + std $t2,`$FRAME+16`($sp) + std $t3,`$FRAME+24`($sp) + ; transfer (ap[0]*bp[0])*n0 to FPU as 4x16-bit values + extrdi $t4,$t7,16,48 + extrdi $t5,$t7,16,32 + extrdi $t6,$t7,16,16 + extrdi $t7,$t7,16,0 + std $t4,`$FRAME+32`($sp) + std $t5,`$FRAME+40`($sp) + std $t6,`$FRAME+48`($sp) + std $t7,`$FRAME+56`($sp) + lwz $t0,4($ap) ; load a[j] as 32-bit word pair + lwz $t1,0($ap) + lwz $t2,12($ap) ; load a[j+1] as 32-bit word pair + lwz $t3,8($ap) + lwz $t4,4($np) ; load n[j] as 32-bit word pair + lwz $t5,0($np) + lwz $t6,12($np) ; load n[j+1] as 32-bit word pair + lwz $t7,8($np) + lfd $ba,`$FRAME+0`($sp) + lfd $bb,`$FRAME+8`($sp) + lfd $bc,`$FRAME+16`($sp) + lfd $bd,`$FRAME+24`($sp) + lfd $na,`$FRAME+32`($sp) + lfd $nb,`$FRAME+40`($sp) + lfd $nc,`$FRAME+48`($sp) + lfd $nd,`$FRAME+56`($sp) + std $t0,`$FRAME+64`($sp) + std $t1,`$FRAME+72`($sp) + std $t2,`$FRAME+80`($sp) + std $t3,`$FRAME+88`($sp) + std $t4,`$FRAME+96`($sp) + std $t5,`$FRAME+104`($sp) + std $t6,`$FRAME+112`($sp) + std $t7,`$FRAME+120`($sp) + fcfid $ba,$ba + fcfid $bb,$bb + fcfid $bc,$bc + fcfid $bd,$bd + fcfid $na,$na + fcfid $nb,$nb + fcfid $nc,$nc + fcfid $nd,$nd + + lfd $A0,`$FRAME+64`($sp) + lfd $A1,`$FRAME+72`($sp) + lfd $A2,`$FRAME+80`($sp) + lfd $A3,`$FRAME+88`($sp) + lfd $N0,`$FRAME+96`($sp) + lfd $N1,`$FRAME+104`($sp) + lfd $N2,`$FRAME+112`($sp) + lfd $N3,`$FRAME+120`($sp) + fcfid $A0,$A0 + fcfid $A1,$A1 + fcfid $A2,$A2 + fcfid $A3,$A3 + fcfid $N0,$N0 + fcfid $N1,$N1 + fcfid $N2,$N2 + fcfid $N3,$N3 + addi $ap,$ap,16 + addi $np,$np,16 + + fmul $T1a,$A1,$ba + fmul $T1b,$A1,$bb + stfd $A0,8($nap_d) ; save a[j] in double format + stfd $A1,16($nap_d) + fmul $T2a,$A2,$ba + fmul $T2b,$A2,$bb + stfd $A2,24($nap_d) ; save a[j+1] in double format + stfd $A3,32($nap_d) + fmul $T3a,$A3,$ba + fmul $T3b,$A3,$bb + stfd $N0,40($nap_d) ; save n[j] in double format + stfd $N1,48($nap_d) + fmul $T0a,$A0,$ba + fmul $T0b,$A0,$bb + stfd $N2,56($nap_d) ; save n[j+1] in double format + stfdu $N3,64($nap_d) + 
+ fmadd $T1a,$A0,$bc,$T1a + fmadd $T1b,$A0,$bd,$T1b + fmadd $T2a,$A1,$bc,$T2a + fmadd $T2b,$A1,$bd,$T2b + fmadd $T3a,$A2,$bc,$T3a + fmadd $T3b,$A2,$bd,$T3b + fmul $dota,$A3,$bc + fmul $dotb,$A3,$bd + + fmadd $T1a,$N1,$na,$T1a + fmadd $T1b,$N1,$nb,$T1b + fmadd $T2a,$N2,$na,$T2a + fmadd $T2b,$N2,$nb,$T2b + fmadd $T3a,$N3,$na,$T3a + fmadd $T3b,$N3,$nb,$T3b + fmadd $T0a,$N0,$na,$T0a + fmadd $T0b,$N0,$nb,$T0b + + fmadd $T1a,$N0,$nc,$T1a + fmadd $T1b,$N0,$nd,$T1b + fmadd $T2a,$N1,$nc,$T2a + fmadd $T2b,$N1,$nd,$T2b + fmadd $T3a,$N2,$nc,$T3a + fmadd $T3b,$N2,$nd,$T3b + fmadd $dota,$N3,$nc,$dota + fmadd $dotb,$N3,$nd,$dotb + + fctid $T0a,$T0a + fctid $T0b,$T0b + fctid $T1a,$T1a + fctid $T1b,$T1b + fctid $T2a,$T2a + fctid $T2b,$T2b + fctid $T3a,$T3a + fctid $T3b,$T3b + + stfd $T0a,`$FRAME+0`($sp) + stfd $T0b,`$FRAME+8`($sp) + stfd $T1a,`$FRAME+16`($sp) + stfd $T1b,`$FRAME+24`($sp) + stfd $T2a,`$FRAME+32`($sp) + stfd $T2b,`$FRAME+40`($sp) + stfd $T3a,`$FRAME+48`($sp) + stfd $T3b,`$FRAME+56`($sp) + +.align 5 +L1st: + lwz $t0,4($ap) ; load a[j] as 32-bit word pair + lwz $t1,0($ap) + lwz $t2,12($ap) ; load a[j+1] as 32-bit word pair + lwz $t3,8($ap) + lwz $t4,4($np) ; load n[j] as 32-bit word pair + lwz $t5,0($np) + lwz $t6,12($np) ; load n[j+1] as 32-bit word pair + lwz $t7,8($np) + std $t0,`$FRAME+64`($sp) + std $t1,`$FRAME+72`($sp) + std $t2,`$FRAME+80`($sp) + std $t3,`$FRAME+88`($sp) + std $t4,`$FRAME+96`($sp) + std $t5,`$FRAME+104`($sp) + std $t6,`$FRAME+112`($sp) + std $t7,`$FRAME+120`($sp) + ld $t0,`$FRAME+0`($sp) + ld $t1,`$FRAME+8`($sp) + ld $t2,`$FRAME+16`($sp) + ld $t3,`$FRAME+24`($sp) + ld $t4,`$FRAME+32`($sp) + ld $t5,`$FRAME+40`($sp) + ld $t6,`$FRAME+48`($sp) + ld $t7,`$FRAME+56`($sp) + lfd $A0,`$FRAME+64`($sp) + lfd $A1,`$FRAME+72`($sp) + lfd $A2,`$FRAME+80`($sp) + lfd $A3,`$FRAME+88`($sp) + lfd $N0,`$FRAME+96`($sp) + lfd $N1,`$FRAME+104`($sp) + lfd $N2,`$FRAME+112`($sp) + lfd $N3,`$FRAME+120`($sp) + fcfid $A0,$A0 + fcfid $A1,$A1 + fcfid $A2,$A2 + fcfid $A3,$A3 + fcfid $N0,$N0 + fcfid $N1,$N1 + fcfid $N2,$N2 + fcfid $N3,$N3 + addi $ap,$ap,16 + addi $np,$np,16 + + fmul $T1a,$A1,$ba + fmul $T1b,$A1,$bb + fmul $T2a,$A2,$ba + fmul $T2b,$A2,$bb + stfd $A0,8($nap_d) ; save a[j] in double format + stfd $A1,16($nap_d) + fmul $T3a,$A3,$ba + fmul $T3b,$A3,$bb + fmadd $T0a,$A0,$ba,$dota + fmadd $T0b,$A0,$bb,$dotb + stfd $A2,24($nap_d) ; save a[j+1] in double format + stfd $A3,32($nap_d) + + fmadd $T1a,$A0,$bc,$T1a + fmadd $T1b,$A0,$bd,$T1b + fmadd $T2a,$A1,$bc,$T2a + fmadd $T2b,$A1,$bd,$T2b + stfd $N0,40($nap_d) ; save n[j] in double format + stfd $N1,48($nap_d) + fmadd $T3a,$A2,$bc,$T3a + fmadd $T3b,$A2,$bd,$T3b + add $t0,$t0,$carry ; can not overflow + fmul $dota,$A3,$bc + fmul $dotb,$A3,$bd + stfd $N2,56($nap_d) ; save n[j+1] in double format + stfdu $N3,64($nap_d) + srdi $carry,$t0,16 + add $t1,$t1,$carry + srdi $carry,$t1,16 + + fmadd $T1a,$N1,$na,$T1a + fmadd $T1b,$N1,$nb,$T1b + insrdi $t0,$t1,16,32 + fmadd $T2a,$N2,$na,$T2a + fmadd $T2b,$N2,$nb,$T2b + add $t2,$t2,$carry + fmadd $T3a,$N3,$na,$T3a + fmadd $T3b,$N3,$nb,$T3b + srdi $carry,$t2,16 + fmadd $T0a,$N0,$na,$T0a + fmadd $T0b,$N0,$nb,$T0b + insrdi $t0,$t2,16,16 + add $t3,$t3,$carry + srdi $carry,$t3,16 + + fmadd $T1a,$N0,$nc,$T1a + fmadd $T1b,$N0,$nd,$T1b + insrdi $t0,$t3,16,0 ; 0..63 bits + fmadd $T2a,$N1,$nc,$T2a + fmadd $T2b,$N1,$nd,$T2b + add $t4,$t4,$carry + fmadd $T3a,$N2,$nc,$T3a + fmadd $T3b,$N2,$nd,$T3b + srdi $carry,$t4,16 + fmadd $dota,$N3,$nc,$dota + fmadd $dotb,$N3,$nd,$dotb + add $t5,$t5,$carry + srdi $carry,$t5,16 + insrdi 
$t4,$t5,16,32 + + fctid $T0a,$T0a + fctid $T0b,$T0b + add $t6,$t6,$carry + fctid $T1a,$T1a + fctid $T1b,$T1b + srdi $carry,$t6,16 + fctid $T2a,$T2a + fctid $T2b,$T2b + insrdi $t4,$t6,16,16 + fctid $T3a,$T3a + fctid $T3b,$T3b + add $t7,$t7,$carry + insrdi $t4,$t7,16,0 ; 64..127 bits + srdi $carry,$t7,16 ; upper 33 bits + + stfd $T0a,`$FRAME+0`($sp) + stfd $T0b,`$FRAME+8`($sp) + stfd $T1a,`$FRAME+16`($sp) + stfd $T1b,`$FRAME+24`($sp) + stfd $T2a,`$FRAME+32`($sp) + stfd $T2b,`$FRAME+40`($sp) + stfd $T3a,`$FRAME+48`($sp) + stfd $T3b,`$FRAME+56`($sp) + std $t0,8($tp) ; tp[j-1] + stdu $t4,16($tp) ; tp[j] + bdnz- L1st + + fctid $dota,$dota + fctid $dotb,$dotb + + ld $t0,`$FRAME+0`($sp) + ld $t1,`$FRAME+8`($sp) + ld $t2,`$FRAME+16`($sp) + ld $t3,`$FRAME+24`($sp) + ld $t4,`$FRAME+32`($sp) + ld $t5,`$FRAME+40`($sp) + ld $t6,`$FRAME+48`($sp) + ld $t7,`$FRAME+56`($sp) + stfd $dota,`$FRAME+64`($sp) + stfd $dotb,`$FRAME+72`($sp) + + add $t0,$t0,$carry ; can not overflow + srdi $carry,$t0,16 + add $t1,$t1,$carry + srdi $carry,$t1,16 + insrdi $t0,$t1,16,32 + add $t2,$t2,$carry + srdi $carry,$t2,16 + insrdi $t0,$t2,16,16 + add $t3,$t3,$carry + srdi $carry,$t3,16 + insrdi $t0,$t3,16,0 ; 0..63 bits + add $t4,$t4,$carry + srdi $carry,$t4,16 + add $t5,$t5,$carry + srdi $carry,$t5,16 + insrdi $t4,$t5,16,32 + add $t6,$t6,$carry + srdi $carry,$t6,16 + insrdi $t4,$t6,16,16 + add $t7,$t7,$carry + insrdi $t4,$t7,16,0 ; 64..127 bits + srdi $carry,$t7,16 ; upper 33 bits + ld $t6,`$FRAME+64`($sp) + ld $t7,`$FRAME+72`($sp) + + std $t0,8($tp) ; tp[j-1] + stdu $t4,16($tp) ; tp[j] + + add $t6,$t6,$carry ; can not overflow + srdi $carry,$t6,16 + add $t7,$t7,$carry + insrdi $t6,$t7,48,0 + srdi $ovf,$t7,48 + std $t6,8($tp) ; tp[num-1] + + slwi $t7,$num,2 + subf $nap_d,$t7,$nap_d ; rewind pointer + + li $i,8 ; i=1 +.align 5 +Louter: + ldx $t3,$bp,$i ; bp[i] + ld $t6,`$FRAME+$TRANSFER+8`($sp) ; tp[0] + mulld $t7,$a0,$t3 ; ap[0]*bp[i] + + addi $tp,$sp,`$FRAME+$TRANSFER` + add $t7,$t7,$t6 ; ap[0]*bp[i]+tp[0] + li $carry,0 + mulld $t7,$t7,$n0 ; tp[0]*n0 + mtctr $j + + ; transfer bp[i] to FPU as 4x16-bit values + extrdi $t0,$t3,16,48 + extrdi $t1,$t3,16,32 + extrdi $t2,$t3,16,16 + extrdi $t3,$t3,16,0 + std $t0,`$FRAME+0`($sp) + std $t1,`$FRAME+8`($sp) + std $t2,`$FRAME+16`($sp) + std $t3,`$FRAME+24`($sp) + ; transfer (ap[0]*bp[i]+tp[0])*n0 to FPU as 4x16-bit values + extrdi $t4,$t7,16,48 + extrdi $t5,$t7,16,32 + extrdi $t6,$t7,16,16 + extrdi $t7,$t7,16,0 + std $t4,`$FRAME+32`($sp) + std $t5,`$FRAME+40`($sp) + std $t6,`$FRAME+48`($sp) + std $t7,`$FRAME+56`($sp) + + lfd $A0,8($nap_d) ; load a[j] in double format + lfd $A1,16($nap_d) + lfd $A2,24($nap_d) ; load a[j+1] in double format + lfd $A3,32($nap_d) + lfd $N0,40($nap_d) ; load n[j] in double format + lfd $N1,48($nap_d) + lfd $N2,56($nap_d) ; load n[j+1] in double format + lfdu $N3,64($nap_d) + + lfd $ba,`$FRAME+0`($sp) + lfd $bb,`$FRAME+8`($sp) + lfd $bc,`$FRAME+16`($sp) + lfd $bd,`$FRAME+24`($sp) + lfd $na,`$FRAME+32`($sp) + lfd $nb,`$FRAME+40`($sp) + lfd $nc,`$FRAME+48`($sp) + lfd $nd,`$FRAME+56`($sp) + + fcfid $ba,$ba + fcfid $bb,$bb + fcfid $bc,$bc + fcfid $bd,$bd + fcfid $na,$na + fcfid $nb,$nb + fcfid $nc,$nc + fcfid $nd,$nd + + fmul $T1a,$A1,$ba + fmul $T1b,$A1,$bb + fmul $T2a,$A2,$ba + fmul $T2b,$A2,$bb + fmul $T3a,$A3,$ba + fmul $T3b,$A3,$bb + fmul $T0a,$A0,$ba + fmul $T0b,$A0,$bb + + fmadd $T1a,$A0,$bc,$T1a + fmadd $T1b,$A0,$bd,$T1b + fmadd $T2a,$A1,$bc,$T2a + fmadd $T2b,$A1,$bd,$T2b + fmadd $T3a,$A2,$bc,$T3a + fmadd $T3b,$A2,$bd,$T3b + fmul $dota,$A3,$bc + fmul 
$dotb,$A3,$bd + + fmadd $T1a,$N1,$na,$T1a + fmadd $T1b,$N1,$nb,$T1b + lfd $A0,8($nap_d) ; load a[j] in double format + lfd $A1,16($nap_d) + fmadd $T2a,$N2,$na,$T2a + fmadd $T2b,$N2,$nb,$T2b + lfd $A2,24($nap_d) ; load a[j+1] in double format + lfd $A3,32($nap_d) + fmadd $T3a,$N3,$na,$T3a + fmadd $T3b,$N3,$nb,$T3b + fmadd $T0a,$N0,$na,$T0a + fmadd $T0b,$N0,$nb,$T0b + + fmadd $T1a,$N0,$nc,$T1a + fmadd $T1b,$N0,$nd,$T1b + fmadd $T2a,$N1,$nc,$T2a + fmadd $T2b,$N1,$nd,$T2b + fmadd $T3a,$N2,$nc,$T3a + fmadd $T3b,$N2,$nd,$T3b + fmadd $dota,$N3,$nc,$dota + fmadd $dotb,$N3,$nd,$dotb + + fctid $T0a,$T0a + fctid $T0b,$T0b + fctid $T1a,$T1a + fctid $T1b,$T1b + fctid $T2a,$T2a + fctid $T2b,$T2b + fctid $T3a,$T3a + fctid $T3b,$T3b + + stfd $T0a,`$FRAME+0`($sp) + stfd $T0b,`$FRAME+8`($sp) + stfd $T1a,`$FRAME+16`($sp) + stfd $T1b,`$FRAME+24`($sp) + stfd $T2a,`$FRAME+32`($sp) + stfd $T2b,`$FRAME+40`($sp) + stfd $T3a,`$FRAME+48`($sp) + stfd $T3b,`$FRAME+56`($sp) + +.align 5 +Linner: + fmul $T1a,$A1,$ba + fmul $T1b,$A1,$bb + fmul $T2a,$A2,$ba + fmul $T2b,$A2,$bb + lfd $N0,40($nap_d) ; load n[j] in double format + lfd $N1,48($nap_d) + fmul $T3a,$A3,$ba + fmul $T3b,$A3,$bb + fmadd $T0a,$A0,$ba,$dota + fmadd $T0b,$A0,$bb,$dotb + lfd $N2,56($nap_d) ; load n[j+1] in double format + lfdu $N3,64($nap_d) + + fmadd $T1a,$A0,$bc,$T1a + fmadd $T1b,$A0,$bd,$T1b + fmadd $T2a,$A1,$bc,$T2a + fmadd $T2b,$A1,$bd,$T2b + lfd $A0,8($nap_d) ; load a[j] in double format + lfd $A1,16($nap_d) + fmadd $T3a,$A2,$bc,$T3a + fmadd $T3b,$A2,$bd,$T3b + fmul $dota,$A3,$bc + fmul $dotb,$A3,$bd + lfd $A2,24($nap_d) ; load a[j+1] in double format + lfd $A3,32($nap_d) + + fmadd $T1a,$N1,$na,$T1a + fmadd $T1b,$N1,$nb,$T1b + ld $t0,`$FRAME+0`($sp) + ld $t1,`$FRAME+8`($sp) + fmadd $T2a,$N2,$na,$T2a + fmadd $T2b,$N2,$nb,$T2b + ld $t2,`$FRAME+16`($sp) + ld $t3,`$FRAME+24`($sp) + fmadd $T3a,$N3,$na,$T3a + fmadd $T3b,$N3,$nb,$T3b + add $t0,$t0,$carry ; can not overflow + ld $t4,`$FRAME+32`($sp) + ld $t5,`$FRAME+40`($sp) + fmadd $T0a,$N0,$na,$T0a + fmadd $T0b,$N0,$nb,$T0b + srdi $carry,$t0,16 + add $t1,$t1,$carry + srdi $carry,$t1,16 + ld $t6,`$FRAME+48`($sp) + ld $t7,`$FRAME+56`($sp) + + fmadd $T1a,$N0,$nc,$T1a + fmadd $T1b,$N0,$nd,$T1b + insrdi $t0,$t1,16,32 + ld $t1,8($tp) ; tp[j] + fmadd $T2a,$N1,$nc,$T2a + fmadd $T2b,$N1,$nd,$T2b + add $t2,$t2,$carry + fmadd $T3a,$N2,$nc,$T3a + fmadd $T3b,$N2,$nd,$T3b + srdi $carry,$t2,16 + insrdi $t0,$t2,16,16 + fmadd $dota,$N3,$nc,$dota + fmadd $dotb,$N3,$nd,$dotb + add $t3,$t3,$carry + ldu $t2,16($tp) ; tp[j+1] + srdi $carry,$t3,16 + insrdi $t0,$t3,16,0 ; 0..63 bits + add $t4,$t4,$carry + + fctid $T0a,$T0a + fctid $T0b,$T0b + srdi $carry,$t4,16 + fctid $T1a,$T1a + fctid $T1b,$T1b + add $t5,$t5,$carry + fctid $T2a,$T2a + fctid $T2b,$T2b + srdi $carry,$t5,16 + insrdi $t4,$t5,16,32 + fctid $T3a,$T3a + fctid $T3b,$T3b + add $t6,$t6,$carry + srdi $carry,$t6,16 + insrdi $t4,$t6,16,16 + + stfd $T0a,`$FRAME+0`($sp) + stfd $T0b,`$FRAME+8`($sp) + add $t7,$t7,$carry + addc $t3,$t0,$t1 + stfd $T1a,`$FRAME+16`($sp) + stfd $T1b,`$FRAME+24`($sp) + insrdi $t4,$t7,16,0 ; 64..127 bits + srdi $carry,$t7,16 ; upper 33 bits + stfd $T2a,`$FRAME+32`($sp) + stfd $T2b,`$FRAME+40`($sp) + adde $t5,$t4,$t2 + stfd $T3a,`$FRAME+48`($sp) + stfd $T3b,`$FRAME+56`($sp) + addze $carry,$carry + std $t3,-16($tp) ; tp[j-1] + std $t5,-8($tp) ; tp[j] + bdnz- Linner + + fctid $dota,$dota + fctid $dotb,$dotb + ld $t0,`$FRAME+0`($sp) + ld $t1,`$FRAME+8`($sp) + ld $t2,`$FRAME+16`($sp) + ld $t3,`$FRAME+24`($sp) + ld $t4,`$FRAME+32`($sp) + ld 
$t5,`$FRAME+40`($sp) + ld $t6,`$FRAME+48`($sp) + ld $t7,`$FRAME+56`($sp) + stfd $dota,`$FRAME+64`($sp) + stfd $dotb,`$FRAME+72`($sp) + + add $t0,$t0,$carry ; can not overflow + srdi $carry,$t0,16 + add $t1,$t1,$carry + srdi $carry,$t1,16 + insrdi $t0,$t1,16,32 + add $t2,$t2,$carry + ld $t1,8($tp) ; tp[j] + srdi $carry,$t2,16 + insrdi $t0,$t2,16,16 + add $t3,$t3,$carry + ldu $t2,16($tp) ; tp[j+1] + srdi $carry,$t3,16 + insrdi $t0,$t3,16,0 ; 0..63 bits + add $t4,$t4,$carry + srdi $carry,$t4,16 + add $t5,$t5,$carry + srdi $carry,$t5,16 + insrdi $t4,$t5,16,32 + add $t6,$t6,$carry + srdi $carry,$t6,16 + insrdi $t4,$t6,16,16 + add $t7,$t7,$carry + insrdi $t4,$t7,16,0 ; 64..127 bits + srdi $carry,$t7,16 ; upper 33 bits + ld $t6,`$FRAME+64`($sp) + ld $t7,`$FRAME+72`($sp) + + addc $t3,$t0,$t1 + adde $t5,$t4,$t2 + addze $carry,$carry + + std $t3,-16($tp) ; tp[j-1] + std $t5,-8($tp) ; tp[j] + + add $carry,$carry,$ovf ; comsume upmost overflow + add $t6,$t6,$carry ; can not overflow + srdi $carry,$t6,16 + add $t7,$t7,$carry + insrdi $t6,$t7,48,0 + srdi $ovf,$t7,48 + std $t6,0($tp) ; tp[num-1] + + slwi $t7,$num,2 + addi $i,$i,8 + subf $nap_d,$t7,$nap_d ; rewind pointer + cmpw $i,$num + blt- Louter + + subf $np,$num,$np ; rewind np + addi $j,$j,1 ; restore counter + subfc $i,$i,$i ; j=0 and "clear" XER[CA] + addi $tp,$sp,`$FRAME+$TRANSFER+8` + addi $t4,$sp,`$FRAME+$TRANSFER+16` + addi $t5,$np,8 + addi $t6,$rp,8 + mtctr $j + +.align 4 +Lsub: ldx $t0,$tp,$i + ldx $t1,$np,$i + ldx $t2,$t4,$i + ldx $t3,$t5,$i + subfe $t0,$t1,$t0 ; tp[j]-np[j] + subfe $t2,$t3,$t2 ; tp[j+1]-np[j+1] + stdx $t0,$rp,$i + stdx $t2,$t6,$i + addi $i,$i,16 + bdnz- Lsub + + li $i,0 + subfe $ovf,$i,$ovf ; handle upmost overflow bit + and $ap,$tp,$ovf + andc $np,$rp,$ovf + or $ap,$ap,$np ; ap=borrow?tp:rp + addi $t7,$ap,8 + mtctr $j + +.align 4 +Lcopy: ; copy or in-place refresh + ldx $t0,$ap,$i + ldx $t1,$t7,$i + std $i,8($nap_d) ; zap nap_d + std $i,16($nap_d) + std $i,24($nap_d) + std $i,32($nap_d) + std $i,40($nap_d) + std $i,48($nap_d) + std $i,56($nap_d) + stdu $i,64($nap_d) + stdx $t0,$rp,$i + stdx $t1,$t6,$i + stdx $i,$tp,$i ; zap tp at once + stdx $i,$t4,$i + addi $i,$i,16 + bdnz- Lcopy + + $POP r14,`2*$SIZE_T`($sp) + $POP r15,`3*$SIZE_T`($sp) + $POP r16,`4*$SIZE_T`($sp) + $POP r17,`5*$SIZE_T`($sp) + $POP r18,`6*$SIZE_T`($sp) + $POP r19,`7*$SIZE_T`($sp) + $POP r20,`8*$SIZE_T`($sp) + $POP r21,`9*$SIZE_T`($sp) + $POP r22,`10*$SIZE_T`($sp) + $POP r23,`11*$SIZE_T`($sp) + lfd f14,`12*$SIZE_T+0`($sp) + lfd f15,`12*$SIZE_T+8`($sp) + lfd f16,`12*$SIZE_T+16`($sp) + lfd f17,`12*$SIZE_T+24`($sp) + lfd f18,`12*$SIZE_T+32`($sp) + lfd f19,`12*$SIZE_T+40`($sp) + lfd f20,`12*$SIZE_T+48`($sp) + lfd f21,`12*$SIZE_T+56`($sp) + lfd f22,`12*$SIZE_T+64`($sp) + lfd f23,`12*$SIZE_T+72`($sp) + lfd f24,`12*$SIZE_T+80`($sp) + lfd f25,`12*$SIZE_T+88`($sp) + $POP $sp,0($sp) + li r3,1 ; signal "handled" + blr + .long 0 +.asciz "Montgomery Multiplication for PPC64, CRYPTOGAMS by <appro\@fy.chalmers.se>" +___ + +$code =~ s/\`([^\`]*)\`/eval $1/gem; +print $code; +close STDOUT; diff --git a/crypto/bn/asm/s390x-mont.pl b/crypto/bn/asm/s390x-mont.pl new file mode 100755 index 0000000..d232510 --- /dev/null +++ b/crypto/bn/asm/s390x-mont.pl @@ -0,0 +1,225 @@ +#!/usr/bin/env perl + +# ==================================================================== +# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. 
For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== + +# April 2007. +# +# Performance improvement over vanilla C code varies from 85% to 45% +# depending on key length and benchmark. Unfortunately in this context +# these are not very impressive results [for code that utilizes "wide" +# 64x64=128-bit multiplication, which is not commonly available to C +# programmers], at least hand-coded bn_asm.c replacement is known to +# provide 30-40% better results for longest keys. Well, on a second +# thought it's not very surprising, because z-CPUs are single-issue +# and _strictly_ in-order execution, while bn_mul_mont is more or less +# dependent on CPU ability to pipe-line instructions and have several +# of them "in-flight" at the same time. I mean while other methods, +# for example Karatsuba, aim to minimize amount of multiplications at +# the cost of other operations increase, bn_mul_mont aim to neatly +# "overlap" multiplications and the other operations [and on most +# platforms even minimize the amount of the other operations, in +# particular references to memory]. But it's possible to improve this +# module performance by implementing dedicated squaring code-path and +# possibly by unrolling loops... + +# January 2009. +# +# Reschedule to minimize/avoid Address Generation Interlock hazard, +# make inner loops counter-based. + +$mn0="%r0"; +$num="%r1"; + +# int bn_mul_mont( +$rp="%r2"; # BN_ULONG *rp, +$ap="%r3"; # const BN_ULONG *ap, +$bp="%r4"; # const BN_ULONG *bp, +$np="%r5"; # const BN_ULONG *np, +$n0="%r6"; # const BN_ULONG *n0, +#$num="160(%r15)" # int num); + +$bi="%r2"; # zaps rp +$j="%r7"; + +$ahi="%r8"; +$alo="%r9"; +$nhi="%r10"; +$nlo="%r11"; +$AHI="%r12"; +$NHI="%r13"; +$count="%r14"; +$sp="%r15"; + +$code.=<<___; +.text +.globl bn_mul_mont +.type bn_mul_mont,\@function +bn_mul_mont: + lgf $num,164($sp) # pull $num + sla $num,3 # $num to enumerate bytes + la $bp,0($num,$bp) + + stg %r2,16($sp) + + cghi $num,16 # + lghi %r2,0 # + blr %r14 # if($num<16) return 0; + cghi $num,128 # + bhr %r14 # if($num>128) return 0; + + stmg %r3,%r15,24($sp) + + lghi $rp,-160-8 # leave room for carry bit + lcgr $j,$num # -$num + lgr %r0,$sp + la $rp,0($rp,$sp) + la $sp,0($j,$rp) # alloca + stg %r0,0($sp) # back chain + + sra $num,3 # restore $num + la $bp,0($j,$bp) # restore $bp + ahi $num,-1 # adjust $num for inner loop + lg $n0,0($n0) # pull n0 + + lg $bi,0($bp) + lg $alo,0($ap) + mlgr $ahi,$bi # ap[0]*bp[0] + lgr $AHI,$ahi + + lgr $mn0,$alo # "tp[0]"*n0 + msgr $mn0,$n0 + + lg $nlo,0($np) # + mlgr $nhi,$mn0 # np[0]*m1 + algr $nlo,$alo # +="tp[0]" + lghi $NHI,0 + alcgr $NHI,$nhi + + la $j,8(%r0) # j=1 + lr $count,$num + +.align 16 +.L1st: + lg $alo,0($j,$ap) + mlgr $ahi,$bi # ap[j]*bp[0] + algr $alo,$AHI + lghi $AHI,0 + alcgr $AHI,$ahi + + lg $nlo,0($j,$np) + mlgr $nhi,$mn0 # np[j]*m1 + algr $nlo,$NHI + lghi $NHI,0 + alcgr $nhi,$NHI # +="tp[j]" + algr $nlo,$alo + alcgr $NHI,$nhi + + stg $nlo,160-8($j,$sp) # tp[j-1]= + la $j,8($j) # j++ + brct $count,.L1st + + algr $NHI,$AHI + lghi $AHI,0 + alcgr $AHI,$AHI # upmost overflow bit + stg $NHI,160-8($j,$sp) + stg $AHI,160($j,$sp) + la $bp,8($bp) # bp++ + +.Louter: + lg $bi,0($bp) # bp[i] + lg $alo,0($ap) + mlgr $ahi,$bi # ap[0]*bp[i] + alg $alo,160($sp) # +=tp[0] + lghi $AHI,0 + alcgr $AHI,$ahi + + lgr $mn0,$alo + msgr $mn0,$n0 # tp[0]*n0 + + lg $nlo,0($np) # np[0] + mlgr $nhi,$mn0 # np[0]*m1 + algr $nlo,$alo # +="tp[0]" + lghi $NHI,0 + alcgr $NHI,$nhi + + la 
$j,8(%r0) # j=1 + lr $count,$num + +.align 16 +.Linner: + lg $alo,0($j,$ap) + mlgr $ahi,$bi # ap[j]*bp[i] + algr $alo,$AHI + lghi $AHI,0 + alcgr $ahi,$AHI + alg $alo,160($j,$sp)# +=tp[j] + alcgr $AHI,$ahi + + lg $nlo,0($j,$np) + mlgr $nhi,$mn0 # np[j]*m1 + algr $nlo,$NHI + lghi $NHI,0 + alcgr $nhi,$NHI + algr $nlo,$alo # +="tp[j]" + alcgr $NHI,$nhi + + stg $nlo,160-8($j,$sp) # tp[j-1]= + la $j,8($j) # j++ + brct $count,.Linner + + algr $NHI,$AHI + lghi $AHI,0 + alcgr $AHI,$AHI + alg $NHI,160($j,$sp)# accumulate previous upmost overflow bit + lghi $ahi,0 + alcgr $AHI,$ahi # new upmost overflow bit + stg $NHI,160-8($j,$sp) + stg $AHI,160($j,$sp) + + la $bp,8($bp) # bp++ + clg $bp,160+8+32($j,$sp) # compare to &bp[num] + jne .Louter + + lg $rp,160+8+16($j,$sp) # reincarnate rp + la $ap,160($sp) + ahi $num,1 # restore $num, incidentally clears "borrow" + + la $j,0(%r0) + lr $count,$num +.Lsub: lg $alo,0($j,$ap) + slbg $alo,0($j,$np) + stg $alo,0($j,$rp) + la $j,8($j) + brct $count,.Lsub + lghi $ahi,0 + slbgr $AHI,$ahi # handle upmost carry + + ngr $ap,$AHI + lghi $np,-1 + xgr $np,$AHI + ngr $np,$rp + ogr $ap,$np # ap=borrow?tp:rp + + la $j,0(%r0) + lgr $count,$num +.Lcopy: lg $alo,0($j,$ap) # copy or in-place refresh + stg $j,160($j,$sp) # zap tp + stg $alo,0($j,$rp) + la $j,8($j) + brct $count,.Lcopy + + la %r1,160+8+48($j,$sp) + lmg %r6,%r15,0(%r1) + lghi %r2,1 # signal "processed" + br %r14 +.size bn_mul_mont,.-bn_mul_mont +.string "Montgomery Multiplication for s390x, CRYPTOGAMS by <appro\@openssl.org>" +___ + +print $code; +close STDOUT; diff --git a/crypto/bn/asm/s390x.S b/crypto/bn/asm/s390x.S new file mode 100755 index 0000000..8f45f5d --- /dev/null +++ b/crypto/bn/asm/s390x.S @@ -0,0 +1,678 @@ +.ident "s390x.S, version 1.0" +// ==================================================================== +// Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL +// project. +// +// Rights for redistribution and usage in source and binary forms are +// granted according to the OpenSSL license. Warranty of any kind is +// disclaimed. 
+// ==================================================================== + +.text + +#define zero %r0 + +// BN_ULONG bn_mul_add_words(BN_ULONG *r2,BN_ULONG *r3,int r4,BN_ULONG r5); +.globl bn_mul_add_words +.type bn_mul_add_words,@function +.align 4 +bn_mul_add_words: + lghi zero,0 // zero = 0 + la %r1,0(%r2) // put rp aside + lghi %r2,0 // i=0; + ltgfr %r4,%r4 + bler %r14 // if (len<=0) return 0; + + stmg %r6,%r10,48(%r15) + lghi %r8,0 // carry = 0 + srag %r10,%r4,2 // cnt=len/4 + jz .Loop1_madd + +.Loop4_madd: + lg %r7,0(%r2,%r3) // ap[i] + mlgr %r6,%r5 // *=w + algr %r7,%r8 // +=carry + alcgr %r6,zero + alg %r7,0(%r2,%r1) // +=rp[i] + alcgr %r6,zero + stg %r7,0(%r2,%r1) // rp[i]= + + lg %r9,8(%r2,%r3) + mlgr %r8,%r5 + algr %r9,%r6 + alcgr %r8,zero + alg %r9,8(%r2,%r1) + alcgr %r8,zero + stg %r9,8(%r2,%r1) + + lg %r7,16(%r2,%r3) + mlgr %r6,%r5 + algr %r7,%r8 + alcgr %r6,zero + alg %r7,16(%r2,%r1) + alcgr %r6,zero + stg %r7,16(%r2,%r1) + + lg %r9,24(%r2,%r3) + mlgr %r8,%r5 + algr %r9,%r6 + alcgr %r8,zero + alg %r9,24(%r2,%r1) + alcgr %r8,zero + stg %r9,24(%r2,%r1) + + la %r2,32(%r2) // i+=4 + brct %r10,.Loop4_madd + + lghi %r10,3 + nr %r4,%r10 // cnt=len%4 + jz .Lend_madd + +.Loop1_madd: + lg %r7,0(%r2,%r3) // ap[i] + mlgr %r6,%r5 // *=w + algr %r7,%r8 // +=carry + alcgr %r6,zero + alg %r7,0(%r2,%r1) // +=rp[i] + alcgr %r6,zero + stg %r7,0(%r2,%r1) // rp[i]= + + lgr %r8,%r6 + la %r2,8(%r2) // i++ + brct %r4,.Loop1_madd + +.Lend_madd: + lgr %r2,%r8 + lmg %r6,%r10,48(%r15) + br %r14 +.size bn_mul_add_words,.-bn_mul_add_words + +// BN_ULONG bn_mul_words(BN_ULONG *r2,BN_ULONG *r3,int r4,BN_ULONG r5); +.globl bn_mul_words +.type bn_mul_words,@function +.align 4 +bn_mul_words: + lghi zero,0 // zero = 0 + la %r1,0(%r2) // put rp aside + lghi %r2,0 // i=0; + ltgfr %r4,%r4 + bler %r14 // if (len<=0) return 0; + + stmg %r6,%r10,48(%r15) + lghi %r8,0 // carry = 0 + srag %r10,%r4,2 // cnt=len/4 + jz .Loop1_mul + +.Loop4_mul: + lg %r7,0(%r2,%r3) // ap[i] + mlgr %r6,%r5 // *=w + algr %r7,%r8 // +=carry + alcgr %r6,zero + stg %r7,0(%r2,%r1) // rp[i]= + + lg %r9,8(%r2,%r3) + mlgr %r8,%r5 + algr %r9,%r6 + alcgr %r8,zero + stg %r9,8(%r2,%r1) + + lg %r7,16(%r2,%r3) + mlgr %r6,%r5 + algr %r7,%r8 + alcgr %r6,zero + stg %r7,16(%r2,%r1) + + lg %r9,24(%r2,%r3) + mlgr %r8,%r5 + algr %r9,%r6 + alcgr %r8,zero + stg %r9,24(%r2,%r1) + + la %r2,32(%r2) // i+=4 + brct %r10,.Loop4_mul + + lghi %r10,3 + nr %r4,%r10 // cnt=len%4 + jz .Lend_mul + +.Loop1_mul: + lg %r7,0(%r2,%r3) // ap[i] + mlgr %r6,%r5 // *=w + algr %r7,%r8 // +=carry + alcgr %r6,zero + stg %r7,0(%r2,%r1) // rp[i]= + + lgr %r8,%r6 + la %r2,8(%r2) // i++ + brct %r4,.Loop1_mul + +.Lend_mul: + lgr %r2,%r8 + lmg %r6,%r10,48(%r15) + br %r14 +.size bn_mul_words,.-bn_mul_words + +// void bn_sqr_words(BN_ULONG *r2,BN_ULONG *r2,int r4) +.globl bn_sqr_words +.type bn_sqr_words,@function +.align 4 +bn_sqr_words: + ltgfr %r4,%r4 + bler %r14 + + stmg %r6,%r7,48(%r15) + srag %r1,%r4,2 // cnt=len/4 + jz .Loop1_sqr + +.Loop4_sqr: + lg %r7,0(%r3) + mlgr %r6,%r7 + stg %r7,0(%r2) + stg %r6,8(%r2) + + lg %r7,8(%r3) + mlgr %r6,%r7 + stg %r7,16(%r2) + stg %r6,24(%r2) + + lg %r7,16(%r3) + mlgr %r6,%r7 + stg %r7,32(%r2) + stg %r6,40(%r2) + + lg %r7,24(%r3) + mlgr %r6,%r7 + stg %r7,48(%r2) + stg %r6,56(%r2) + + la %r3,32(%r3) + la %r2,64(%r2) + brct %r1,.Loop4_sqr + + lghi %r1,3 + nr %r4,%r1 // cnt=len%4 + jz .Lend_sqr + +.Loop1_sqr: + lg %r7,0(%r3) + mlgr %r6,%r7 + stg %r7,0(%r2) + stg %r6,8(%r2) + + la %r3,8(%r3) + la %r2,16(%r2) + brct %r4,.Loop1_sqr + +.Lend_sqr: + lmg 
%r6,%r7,48(%r15) + br %r14 +.size bn_sqr_words,.-bn_sqr_words + +// BN_ULONG bn_div_words(BN_ULONG h,BN_ULONG l,BN_ULONG d); +.globl bn_div_words +.type bn_div_words,@function +.align 4 +bn_div_words: + dlgr %r2,%r4 + lgr %r2,%r3 + br %r14 +.size bn_div_words,.-bn_div_words + +// BN_ULONG bn_add_words(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4,int r5); +.globl bn_add_words +.type bn_add_words,@function +.align 4 +bn_add_words: + la %r1,0(%r2) // put rp aside + lghi %r2,0 // i=0 + ltgfr %r5,%r5 + bler %r14 // if (len<=0) return 0; + + stg %r6,48(%r15) + lghi %r6,3 + nr %r6,%r5 // len%4 + sra %r5,2 // len/4, use sra because it sets condition code + jz .Loop1_add // carry is incidentally cleared if branch taken + algr %r2,%r2 // clear carry + +.Loop4_add: + lg %r0,0(%r2,%r3) + alcg %r0,0(%r2,%r4) + stg %r0,0(%r2,%r1) + lg %r0,8(%r2,%r3) + alcg %r0,8(%r2,%r4) + stg %r0,8(%r2,%r1) + lg %r0,16(%r2,%r3) + alcg %r0,16(%r2,%r4) + stg %r0,16(%r2,%r1) + lg %r0,24(%r2,%r3) + alcg %r0,24(%r2,%r4) + stg %r0,24(%r2,%r1) + + la %r2,32(%r2) // i+=4 + brct %r5,.Loop4_add + + la %r6,1(%r6) // see if len%4 is zero ... + brct %r6,.Loop1_add // without touching condition code:-) + +.Lexit_add: + lghi %r2,0 + alcgr %r2,%r2 + lg %r6,48(%r15) + br %r14 + +.Loop1_add: + lg %r0,0(%r2,%r3) + alcg %r0,0(%r2,%r4) + stg %r0,0(%r2,%r1) + + la %r2,8(%r2) // i++ + brct %r6,.Loop1_add + + j .Lexit_add +.size bn_add_words,.-bn_add_words + +// BN_ULONG bn_sub_words(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4,int r5); +.globl bn_sub_words +.type bn_sub_words,@function +.align 4 +bn_sub_words: + la %r1,0(%r2) // put rp aside + lghi %r2,0 // i=0 + ltgfr %r5,%r5 + bler %r14 // if (len<=0) return 0; + + stg %r6,48(%r15) + lghi %r6,3 + nr %r6,%r5 // len%4 + sra %r5,2 // len/4, use sra because it sets condition code + jnz .Loop4_sub // borrow is incidentally cleared if branch taken + slgr %r2,%r2 // clear borrow + +.Loop1_sub: + lg %r0,0(%r2,%r3) + slbg %r0,0(%r2,%r4) + stg %r0,0(%r2,%r1) + + la %r2,8(%r2) // i++ + brct %r6,.Loop1_sub + j .Lexit_sub + +.Loop4_sub: + lg %r0,0(%r2,%r3) + slbg %r0,0(%r2,%r4) + stg %r0,0(%r2,%r1) + lg %r0,8(%r2,%r3) + slbg %r0,8(%r2,%r4) + stg %r0,8(%r2,%r1) + lg %r0,16(%r2,%r3) + slbg %r0,16(%r2,%r4) + stg %r0,16(%r2,%r1) + lg %r0,24(%r2,%r3) + slbg %r0,24(%r2,%r4) + stg %r0,24(%r2,%r1) + + la %r2,32(%r2) // i+=4 + brct %r5,.Loop4_sub + + la %r6,1(%r6) // see if len%4 is zero ... 
+ brct %r6,.Loop1_sub // without touching condition code:-)
+
+.Lexit_sub:
+ lghi %r2,0
+ slbgr %r2,%r2
+ lcgr %r2,%r2
+ lg %r6,48(%r15)
+ br %r14
+.size bn_sub_words,.-bn_sub_words
+
+#define c1 %r1
+#define c2 %r5
+#define c3 %r8
+
+#define mul_add_c(ai,bi,c1,c2,c3) \
+ lg %r7,ai*8(%r3); \
+ mlg %r6,bi*8(%r4); \
+ algr c1,%r7; \
+ alcgr c2,%r6; \
+ alcgr c3,zero
+
+// void bn_mul_comba8(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4);
+.globl bn_mul_comba8
+.type bn_mul_comba8,@function
+.align 4
+bn_mul_comba8:
+ stmg %r6,%r8,48(%r15)
+
+ lghi c1,0
+ lghi c2,0
+ lghi c3,0
+ lghi zero,0
+
+ mul_add_c(0,0,c1,c2,c3);
+ stg c1,0*8(%r2)
+ lghi c1,0
+
+ mul_add_c(0,1,c2,c3,c1);
+ mul_add_c(1,0,c2,c3,c1);
+ stg c2,1*8(%r2)
+ lghi c2,0
+
+ mul_add_c(2,0,c3,c1,c2);
+ mul_add_c(1,1,c3,c1,c2);
+ mul_add_c(0,2,c3,c1,c2);
+ stg c3,2*8(%r2)
+ lghi c3,0
+
+ mul_add_c(0,3,c1,c2,c3);
+ mul_add_c(1,2,c1,c2,c3);
+ mul_add_c(2,1,c1,c2,c3);
+ mul_add_c(3,0,c1,c2,c3);
+ stg c1,3*8(%r2)
+ lghi c1,0
+
+ mul_add_c(4,0,c2,c3,c1);
+ mul_add_c(3,1,c2,c3,c1);
+ mul_add_c(2,2,c2,c3,c1);
+ mul_add_c(1,3,c2,c3,c1);
+ mul_add_c(0,4,c2,c3,c1);
+ stg c2,4*8(%r2)
+ lghi c2,0
+
+ mul_add_c(0,5,c3,c1,c2);
+ mul_add_c(1,4,c3,c1,c2);
+ mul_add_c(2,3,c3,c1,c2);
+ mul_add_c(3,2,c3,c1,c2);
+ mul_add_c(4,1,c3,c1,c2);
+ mul_add_c(5,0,c3,c1,c2);
+ stg c3,5*8(%r2)
+ lghi c3,0
+
+ mul_add_c(6,0,c1,c2,c3);
+ mul_add_c(5,1,c1,c2,c3);
+ mul_add_c(4,2,c1,c2,c3);
+ mul_add_c(3,3,c1,c2,c3);
+ mul_add_c(2,4,c1,c2,c3);
+ mul_add_c(1,5,c1,c2,c3);
+ mul_add_c(0,6,c1,c2,c3);
+ stg c1,6*8(%r2)
+ lghi c1,0
+
+ mul_add_c(0,7,c2,c3,c1);
+ mul_add_c(1,6,c2,c3,c1);
+ mul_add_c(2,5,c2,c3,c1);
+ mul_add_c(3,4,c2,c3,c1);
+ mul_add_c(4,3,c2,c3,c1);
+ mul_add_c(5,2,c2,c3,c1);
+ mul_add_c(6,1,c2,c3,c1);
+ mul_add_c(7,0,c2,c3,c1);
+ stg c2,7*8(%r2)
+ lghi c2,0
+
+ mul_add_c(7,1,c3,c1,c2);
+ mul_add_c(6,2,c3,c1,c2);
+ mul_add_c(5,3,c3,c1,c2);
+ mul_add_c(4,4,c3,c1,c2);
+ mul_add_c(3,5,c3,c1,c2);
+ mul_add_c(2,6,c3,c1,c2);
+ mul_add_c(1,7,c3,c1,c2);
+ stg c3,8*8(%r2)
+ lghi c3,0
+
+ mul_add_c(2,7,c1,c2,c3);
+ mul_add_c(3,6,c1,c2,c3);
+ mul_add_c(4,5,c1,c2,c3);
+ mul_add_c(5,4,c1,c2,c3);
+ mul_add_c(6,3,c1,c2,c3);
+ mul_add_c(7,2,c1,c2,c3);
+ stg c1,9*8(%r2)
+ lghi c1,0
+
+ mul_add_c(7,3,c2,c3,c1);
+ mul_add_c(6,4,c2,c3,c1);
+ mul_add_c(5,5,c2,c3,c1);
+ mul_add_c(4,6,c2,c3,c1);
+ mul_add_c(3,7,c2,c3,c1);
+ stg c2,10*8(%r2)
+ lghi c2,0
+
+ mul_add_c(4,7,c3,c1,c2);
+ mul_add_c(5,6,c3,c1,c2);
+ mul_add_c(6,5,c3,c1,c2);
+ mul_add_c(7,4,c3,c1,c2);
+ stg c3,11*8(%r2)
+ lghi c3,0
+
+ mul_add_c(7,5,c1,c2,c3);
+ mul_add_c(6,6,c1,c2,c3);
+ mul_add_c(5,7,c1,c2,c3);
+ stg c1,12*8(%r2)
+ lghi c1,0
+
+ mul_add_c(6,7,c2,c3,c1);
+ mul_add_c(7,6,c2,c3,c1);
+ stg c2,13*8(%r2)
+ lghi c2,0
+
+ mul_add_c(7,7,c3,c1,c2);
+ stg c3,14*8(%r2)
+ stg c1,15*8(%r2)
+
+ lmg %r6,%r8,48(%r15)
+ br %r14
+.size bn_mul_comba8,.-bn_mul_comba8
+
+// void bn_mul_comba4(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4);
+.globl bn_mul_comba4
+.type bn_mul_comba4,@function
+.align 4
+bn_mul_comba4:
+ stmg %r6,%r8,48(%r15)
+
+ lghi c1,0
+ lghi c2,0
+ lghi c3,0
+ lghi zero,0
+
+ mul_add_c(0,0,c1,c2,c3);
+ stg c1,0*8(%r2)
+ lghi c1,0
+
+ mul_add_c(0,1,c2,c3,c1);
+ mul_add_c(1,0,c2,c3,c1);
+ stg c2,1*8(%r2)
+ lghi c2,0
+
+ mul_add_c(2,0,c3,c1,c2);
+ mul_add_c(1,1,c3,c1,c2);
+ mul_add_c(0,2,c3,c1,c2);
+ stg c3,2*8(%r2)
+ lghi c3,0
+
+ mul_add_c(0,3,c1,c2,c3);
+ mul_add_c(1,2,c1,c2,c3);
+ mul_add_c(2,1,c1,c2,c3);
+ mul_add_c(3,0,c1,c2,c3);
+ stg c1,3*8(%r2)
+ lghi c1,0
+
+ mul_add_c(3,1,c2,c3,c1);
+ mul_add_c(2,2,c2,c3,c1);
+ mul_add_c(1,3,c2,c3,c1);
+ stg c2,4*8(%r2)
+ lghi c2,0
+
+ mul_add_c(2,3,c3,c1,c2);
+ mul_add_c(3,2,c3,c1,c2);
+ stg c3,5*8(%r2)
+ lghi c3,0
+
+ mul_add_c(3,3,c1,c2,c3);
+ stg c1,6*8(%r2)
+ stg c2,7*8(%r2)
+
+ lmg %r6,%r8,48(%r15)
+ br %r14
+.size bn_mul_comba4,.-bn_mul_comba4
+
+#define sqr_add_c(ai,c1,c2,c3) \
+ lg %r7,ai*8(%r3); \
+ mlgr %r6,%r7; \
+ algr c1,%r7; \
+ alcgr c2,%r6; \
+ alcgr c3,zero
+
+#define sqr_add_c2(ai,aj,c1,c2,c3) \
+ lg %r7,ai*8(%r3); \
+ mlg %r6,aj*8(%r3); \
+ algr c1,%r7; \
+ alcgr c2,%r6; \
+ alcgr c3,zero; \
+ algr c1,%r7; \
+ alcgr c2,%r6; \
+ alcgr c3,zero
+
+// void bn_sqr_comba8(BN_ULONG *r2,BN_ULONG *r3);
+.globl bn_sqr_comba8
+.type bn_sqr_comba8,@function
+.align 4
+bn_sqr_comba8:
+ stmg %r6,%r8,48(%r15)
+
+ lghi c1,0
+ lghi c2,0
+ lghi c3,0
+ lghi zero,0
+
+ sqr_add_c(0,c1,c2,c3);
+ stg c1,0*8(%r2)
+ lghi c1,0
+
+ sqr_add_c2(1,0,c2,c3,c1);
+ stg c2,1*8(%r2)
+ lghi c2,0
+
+ sqr_add_c(1,c3,c1,c2);
+ sqr_add_c2(2,0,c3,c1,c2);
+ stg c3,2*8(%r2)
+ lghi c3,0
+
+ sqr_add_c2(3,0,c1,c2,c3);
+ sqr_add_c2(2,1,c1,c2,c3);
+ stg c1,3*8(%r2)
+ lghi c1,0
+
+ sqr_add_c(2,c2,c3,c1);
+ sqr_add_c2(3,1,c2,c3,c1);
+ sqr_add_c2(4,0,c2,c3,c1);
+ stg c2,4*8(%r2)
+ lghi c2,0
+
+ sqr_add_c2(5,0,c3,c1,c2);
+ sqr_add_c2(4,1,c3,c1,c2);
+ sqr_add_c2(3,2,c3,c1,c2);
+ stg c3,5*8(%r2)
+ lghi c3,0
+
+ sqr_add_c(3,c1,c2,c3);
+ sqr_add_c2(4,2,c1,c2,c3);
+ sqr_add_c2(5,1,c1,c2,c3);
+ sqr_add_c2(6,0,c1,c2,c3);
+ stg c1,6*8(%r2)
+ lghi c1,0
+
+ sqr_add_c2(7,0,c2,c3,c1);
+ sqr_add_c2(6,1,c2,c3,c1);
+ sqr_add_c2(5,2,c2,c3,c1);
+ sqr_add_c2(4,3,c2,c3,c1);
+ stg c2,7*8(%r2)
+ lghi c2,0
+
+ sqr_add_c(4,c3,c1,c2);
+ sqr_add_c2(5,3,c3,c1,c2);
+ sqr_add_c2(6,2,c3,c1,c2);
+ sqr_add_c2(7,1,c3,c1,c2);
+ stg c3,8*8(%r2)
+ lghi c3,0
+
+ sqr_add_c2(7,2,c1,c2,c3);
+ sqr_add_c2(6,3,c1,c2,c3);
+ sqr_add_c2(5,4,c1,c2,c3);
+ stg c1,9*8(%r2)
+ lghi c1,0
+
+ sqr_add_c(5,c2,c3,c1);
+ sqr_add_c2(6,4,c2,c3,c1);
+ sqr_add_c2(7,3,c2,c3,c1);
+ stg c2,10*8(%r2)
+ lghi c2,0
+
+ sqr_add_c2(7,4,c3,c1,c2);
+ sqr_add_c2(6,5,c3,c1,c2);
+ stg c3,11*8(%r2)
+ lghi c3,0
+
+ sqr_add_c(6,c1,c2,c3);
+ sqr_add_c2(7,5,c1,c2,c3);
+ stg c1,12*8(%r2)
+ lghi c1,0
+
+ sqr_add_c2(7,6,c2,c3,c1);
+ stg c2,13*8(%r2)
+ lghi c2,0
+
+ sqr_add_c(7,c3,c1,c2);
+ stg c3,14*8(%r2)
+ stg c1,15*8(%r2)
+
+ lmg %r6,%r8,48(%r15)
+ br %r14
+.size bn_sqr_comba8,.-bn_sqr_comba8
+
+// void bn_sqr_comba4(BN_ULONG *r2,BN_ULONG *r3);
+.globl bn_sqr_comba4
+.type bn_sqr_comba4,@function
+.align 4
+bn_sqr_comba4:
+ stmg %r6,%r8,48(%r15)
+
+ lghi c1,0
+ lghi c2,0
+ lghi c3,0
+ lghi zero,0
+
+ sqr_add_c(0,c1,c2,c3);
+ stg c1,0*8(%r2)
+ lghi c1,0
+
+ sqr_add_c2(1,0,c2,c3,c1);
+ stg c2,1*8(%r2)
+ lghi c2,0
+
+ sqr_add_c(1,c3,c1,c2);
+ sqr_add_c2(2,0,c3,c1,c2);
+ stg c3,2*8(%r2)
+ lghi c3,0
+
+ sqr_add_c2(3,0,c1,c2,c3);
+ sqr_add_c2(2,1,c1,c2,c3);
+ stg c1,3*8(%r2)
+ lghi c1,0
+
+ sqr_add_c(2,c2,c3,c1);
+ sqr_add_c2(3,1,c2,c3,c1);
+ stg c2,4*8(%r2)
+ lghi c2,0
+
+ sqr_add_c2(3,2,c3,c1,c2);
+ stg c3,5*8(%r2)
+ lghi c3,0
+
+ sqr_add_c(3,c1,c2,c3);
+ stg c1,6*8(%r2)
+ stg c2,7*8(%r2)
+
+ lmg %r6,%r8,48(%r15)
+ br %r14
+.size bn_sqr_comba4,.-bn_sqr_comba4
diff --git a/crypto/bn/asm/sparcv9-mont.pl b/crypto/bn/asm/sparcv9-mont.pl
new file mode 100755
index 0000000..b8fb1e8
--- /dev/null
+++ b/crypto/bn/asm/sparcv9-mont.pl
@@ -0,0 +1,606 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project.
The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# December 2005
+#
+# Pure SPARCv9/8+ and IALU-only bn_mul_mont implementation. The reasons
+# for undertaking this effort are multiple. First of all, UltraSPARC is
+# not the whole SPARCv9 universe and other VIS-free implementations
+# deserve optimized code as much. Secondly, the newly introduced
+# UltraSPARC T1, a.k.a. Niagara, has a shared FPU, and concurrent
+# FPU-intensive paths, such as sparcv9a-mont, will simply sink it. Yes,
+# T1 is equipped with several integrated RSA/DSA accelerator circuits
+# accessible through a kernel driver [only(*)], but having a decent
+# user-land software implementation is important too. Finally, there
+# was the desire to experiment with a dedicated squaring procedure.
+# Yes, this module implements one, because it was easiest to draft it
+# in SPARCv9 instructions...
+
+# (*) An engine accessing the driver in question is on my TODO list.
+# For reference, the accelerator is estimated to give a 6 to 10
+# times improvement on single-threaded RSA sign. It should be noted
+# that the 6-10x improvement coefficient does not actually mean
+# something extraordinary in terms of absolute [single-threaded]
+# performance, as the SPARCv9 instruction set is by all means the
+# least suitable for high-performance crypto among 64-bit platforms.
+# The 6-10x factor simply places T1 in the same performance domain
+# as, say, AMD64 and IA-64. The improvement of RSA verify doesn't
+# appear impressive at all, but it's the sign operation which is
+# far more critical/interesting.
+
+# You might notice that the inner loops are modulo-scheduled:-) This
+# has essentially negligible impact on UltraSPARC performance; it's
+# Fujitsu SPARC64 V users who should notice and hopefully appreciate
+# the advantage... Currently this module surpasses sparcv9a-mont.pl
+# by ~20% on UltraSPARC-III and later cores, but recall that the
+# sparcv9a module still has hidden potential [see the TODO list
+# there], which is estimated to be larger than 20%...
+
+# int bn_mul_mont(
+$rp="%i0"; # BN_ULONG *rp,
+$ap="%i1"; # const BN_ULONG *ap,
+$bp="%i2"; # const BN_ULONG *bp,
+$np="%i3"; # const BN_ULONG *np,
+$n0="%i4"; # const BN_ULONG *n0,
+$num="%i5"; # int num);
+
+$bits=32;
+for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
+if ($bits==64) { $bias=2047; $frame=192; }
+else { $bias=0; $frame=128; }
+
+$car0="%o0";
+$car1="%o1";
+$car2="%o2"; # 1 bit
+$acc0="%o3";
+$acc1="%o4";
+$mask="%g1"; # 32 bits, what a waste...
+$tmp0="%g4";
+$tmp1="%g5";
+
+$i="%l0";
+$j="%l1";
+$mul0="%l2";
+$mul1="%l3";
+$tp="%l4";
+$apj="%l5";
+$npj="%l6";
+$tpj="%l7";
+
+$fname="bn_mul_mont_int";
+
+$code=<<___;
+.section ".text",#alloc,#execinstr
+
+.global $fname
+.align 32
+$fname:
+ cmp %o5,4 ! 128 bits minimum
+ bge,pt %icc,.Lenter
+ sethi %hi(0xffffffff),$mask
+ retl
+ clr %o0
+.align 32
+.Lenter:
+ save %sp,-$frame,%sp
+ sll $num,2,$num ! num*=4
+ or $mask,%lo(0xffffffff),$mask
+ ld [$n0],$n0
+ cmp $ap,$bp
+ and $num,$mask,$num
+ ld [$bp],$mul0 ! bp[0]
+ nop
+
+ add %sp,$bias,%o7 ! real top of stack
+ ld [$ap],$car0 ! ap[0] ! redundant in squaring context
+ sub %o7,$num,%o7
+ ld [$ap+4],$apj ! ap[1]
+ and %o7,-1024,%o7
+ ld [$np],$car1 ! np[0]
+ sub %o7,$bias,%sp ! alloca
+ ld [$np+4],$npj ! np[1]
+ be,pt `$bits==32?"%icc":"%xcc"`,.Lbn_sqr_mont
+ mov 12,$j
+
+ mulx $car0,$mul0,$car0 !
ap[0]*bp[0] + mulx $apj,$mul0,$tmp0 !prologue! ap[1]*bp[0] + and $car0,$mask,$acc0 + add %sp,$bias+$frame,$tp + ld [$ap+8],$apj !prologue! + + mulx $n0,$acc0,$mul1 ! "t[0]"*n0 + and $mul1,$mask,$mul1 + + mulx $car1,$mul1,$car1 ! np[0]*"t[0]"*n0 + mulx $npj,$mul1,$acc1 !prologue! np[1]*"t[0]"*n0 + srlx $car0,32,$car0 + add $acc0,$car1,$car1 + ld [$np+8],$npj !prologue! + srlx $car1,32,$car1 + mov $tmp0,$acc0 !prologue! + +.L1st: + mulx $apj,$mul0,$tmp0 + mulx $npj,$mul1,$tmp1 + add $acc0,$car0,$car0 + ld [$ap+$j],$apj ! ap[j] + and $car0,$mask,$acc0 + add $acc1,$car1,$car1 + ld [$np+$j],$npj ! np[j] + srlx $car0,32,$car0 + add $acc0,$car1,$car1 + add $j,4,$j ! j++ + mov $tmp0,$acc0 + st $car1,[$tp] + cmp $j,$num + mov $tmp1,$acc1 + srlx $car1,32,$car1 + bl %icc,.L1st + add $tp,4,$tp ! tp++ +!.L1st + + mulx $apj,$mul0,$tmp0 !epilogue! + mulx $npj,$mul1,$tmp1 + add $acc0,$car0,$car0 + and $car0,$mask,$acc0 + add $acc1,$car1,$car1 + srlx $car0,32,$car0 + add $acc0,$car1,$car1 + st $car1,[$tp] + srlx $car1,32,$car1 + + add $tmp0,$car0,$car0 + and $car0,$mask,$acc0 + add $tmp1,$car1,$car1 + srlx $car0,32,$car0 + add $acc0,$car1,$car1 + st $car1,[$tp+4] + srlx $car1,32,$car1 + + add $car0,$car1,$car1 + st $car1,[$tp+8] + srlx $car1,32,$car2 + + mov 4,$i ! i++ + ld [$bp+4],$mul0 ! bp[1] +.Louter: + add %sp,$bias+$frame,$tp + ld [$ap],$car0 ! ap[0] + ld [$ap+4],$apj ! ap[1] + ld [$np],$car1 ! np[0] + ld [$np+4],$npj ! np[1] + ld [$tp],$tmp1 ! tp[0] + ld [$tp+4],$tpj ! tp[1] + mov 12,$j + + mulx $car0,$mul0,$car0 + mulx $apj,$mul0,$tmp0 !prologue! + add $tmp1,$car0,$car0 + ld [$ap+8],$apj !prologue! + and $car0,$mask,$acc0 + + mulx $n0,$acc0,$mul1 + and $mul1,$mask,$mul1 + + mulx $car1,$mul1,$car1 + mulx $npj,$mul1,$acc1 !prologue! + srlx $car0,32,$car0 + add $acc0,$car1,$car1 + ld [$np+8],$npj !prologue! + srlx $car1,32,$car1 + mov $tmp0,$acc0 !prologue! + +.Linner: + mulx $apj,$mul0,$tmp0 + mulx $npj,$mul1,$tmp1 + add $tpj,$car0,$car0 + ld [$ap+$j],$apj ! ap[j] + add $acc0,$car0,$car0 + add $acc1,$car1,$car1 + ld [$np+$j],$npj ! np[j] + and $car0,$mask,$acc0 + ld [$tp+8],$tpj ! tp[j] + srlx $car0,32,$car0 + add $acc0,$car1,$car1 + add $j,4,$j ! j++ + mov $tmp0,$acc0 + st $car1,[$tp] ! tp[j-1] + srlx $car1,32,$car1 + mov $tmp1,$acc1 + cmp $j,$num + bl %icc,.Linner + add $tp,4,$tp ! tp++ +!.Linner + + mulx $apj,$mul0,$tmp0 !epilogue! + mulx $npj,$mul1,$tmp1 + add $tpj,$car0,$car0 + add $acc0,$car0,$car0 + ld [$tp+8],$tpj ! tp[j] + and $car0,$mask,$acc0 + add $acc1,$car1,$car1 + srlx $car0,32,$car0 + add $acc0,$car1,$car1 + st $car1,[$tp] ! tp[j-1] + srlx $car1,32,$car1 + + add $tpj,$car0,$car0 + add $tmp0,$car0,$car0 + and $car0,$mask,$acc0 + add $tmp1,$car1,$car1 + add $acc0,$car1,$car1 + st $car1,[$tp+4] ! tp[j-1] + srlx $car0,32,$car0 + add $i,4,$i ! i++ + srlx $car1,32,$car1 + + add $car0,$car1,$car1 + cmp $i,$num + add $car2,$car1,$car1 + st $car1,[$tp+8] + + srlx $car1,32,$car2 + bl,a %icc,.Louter + ld [$bp+$i],$mul0 ! bp[i] +!.Louter + + add $tp,12,$tp + +.Ltail: + add $np,$num,$np + add $rp,$num,$rp + mov $tp,$ap + sub %g0,$num,%o7 ! k=-num + ba .Lsub + subcc %g0,%g0,%g0 ! clear %icc.c +.align 16 +.Lsub: + ld [$tp+%o7],%o0 + ld [$np+%o7],%o1 + subccc %o0,%o1,%o1 ! tp[j]-np[j] + add $rp,%o7,$i + add %o7,4,%o7 + brnz %o7,.Lsub + st %o1,[$i] + subc $car2,0,$car2 ! handle upmost overflow bit + and $tp,$car2,$ap + andn $rp,$car2,$np + or $ap,$np,$ap + sub %g0,$num,%o7 + +.Lcopy: + ld [$ap+%o7],%o0 ! copy or in-place refresh + st %g0,[$tp+%o7] ! 
zap tp + st %o0,[$rp+%o7] + add %o7,4,%o7 + brnz %o7,.Lcopy + nop + mov 1,%i0 + ret + restore +___ + +######## +######## .Lbn_sqr_mont gives up to 20% *overall* improvement over +######## code without following dedicated squaring procedure. +######## +$sbit="%i2"; # re-use $bp! + +$code.=<<___; +.align 32 +.Lbn_sqr_mont: + mulx $mul0,$mul0,$car0 ! ap[0]*ap[0] + mulx $apj,$mul0,$tmp0 !prologue! + and $car0,$mask,$acc0 + add %sp,$bias+$frame,$tp + ld [$ap+8],$apj !prologue! + + mulx $n0,$acc0,$mul1 ! "t[0]"*n0 + srlx $car0,32,$car0 + and $mul1,$mask,$mul1 + + mulx $car1,$mul1,$car1 ! np[0]*"t[0]"*n0 + mulx $npj,$mul1,$acc1 !prologue! + and $car0,1,$sbit + ld [$np+8],$npj !prologue! + srlx $car0,1,$car0 + add $acc0,$car1,$car1 + srlx $car1,32,$car1 + mov $tmp0,$acc0 !prologue! + +.Lsqr_1st: + mulx $apj,$mul0,$tmp0 + mulx $npj,$mul1,$tmp1 + add $acc0,$car0,$car0 ! ap[j]*a0+c0 + add $acc1,$car1,$car1 + ld [$ap+$j],$apj ! ap[j] + and $car0,$mask,$acc0 + ld [$np+$j],$npj ! np[j] + srlx $car0,32,$car0 + add $acc0,$acc0,$acc0 + or $sbit,$acc0,$acc0 + mov $tmp1,$acc1 + srlx $acc0,32,$sbit + add $j,4,$j ! j++ + and $acc0,$mask,$acc0 + cmp $j,$num + add $acc0,$car1,$car1 + st $car1,[$tp] + mov $tmp0,$acc0 + srlx $car1,32,$car1 + bl %icc,.Lsqr_1st + add $tp,4,$tp ! tp++ +!.Lsqr_1st + + mulx $apj,$mul0,$tmp0 ! epilogue + mulx $npj,$mul1,$tmp1 + add $acc0,$car0,$car0 ! ap[j]*a0+c0 + add $acc1,$car1,$car1 + and $car0,$mask,$acc0 + srlx $car0,32,$car0 + add $acc0,$acc0,$acc0 + or $sbit,$acc0,$acc0 + srlx $acc0,32,$sbit + and $acc0,$mask,$acc0 + add $acc0,$car1,$car1 + st $car1,[$tp] + srlx $car1,32,$car1 + + add $tmp0,$car0,$car0 ! ap[j]*a0+c0 + add $tmp1,$car1,$car1 + and $car0,$mask,$acc0 + srlx $car0,32,$car0 + add $acc0,$acc0,$acc0 + or $sbit,$acc0,$acc0 + srlx $acc0,32,$sbit + and $acc0,$mask,$acc0 + add $acc0,$car1,$car1 + st $car1,[$tp+4] + srlx $car1,32,$car1 + + add $car0,$car0,$car0 + or $sbit,$car0,$car0 + add $car0,$car1,$car1 + st $car1,[$tp+8] + srlx $car1,32,$car2 + + ld [%sp+$bias+$frame],$tmp0 ! tp[0] + ld [%sp+$bias+$frame+4],$tmp1 ! tp[1] + ld [%sp+$bias+$frame+8],$tpj ! tp[2] + ld [$ap+4],$mul0 ! ap[1] + ld [$ap+8],$apj ! ap[2] + ld [$np],$car1 ! np[0] + ld [$np+4],$npj ! np[1] + mulx $n0,$tmp0,$mul1 + + mulx $mul0,$mul0,$car0 + and $mul1,$mask,$mul1 + + mulx $car1,$mul1,$car1 + mulx $npj,$mul1,$acc1 + add $tmp0,$car1,$car1 + and $car0,$mask,$acc0 + ld [$np+8],$npj ! np[2] + srlx $car1,32,$car1 + add $tmp1,$car1,$car1 + srlx $car0,32,$car0 + add $acc0,$car1,$car1 + and $car0,1,$sbit + add $acc1,$car1,$car1 + srlx $car0,1,$car0 + mov 12,$j + st $car1,[%sp+$bias+$frame] ! tp[0]= + srlx $car1,32,$car1 + add %sp,$bias+$frame+4,$tp + +.Lsqr_2nd: + mulx $apj,$mul0,$acc0 + mulx $npj,$mul1,$acc1 + add $acc0,$car0,$car0 + add $tpj,$car1,$car1 + ld [$ap+$j],$apj ! ap[j] + and $car0,$mask,$acc0 + ld [$np+$j],$npj ! np[j] + srlx $car0,32,$car0 + add $acc1,$car1,$car1 + ld [$tp+8],$tpj ! tp[j] + add $acc0,$acc0,$acc0 + add $j,4,$j ! j++ + or $sbit,$acc0,$acc0 + srlx $acc0,32,$sbit + and $acc0,$mask,$acc0 + cmp $j,$num + add $acc0,$car1,$car1 + st $car1,[$tp] ! tp[j-1] + srlx $car1,32,$car1 + bl %icc,.Lsqr_2nd + add $tp,4,$tp ! tp++ +!.Lsqr_2nd + + mulx $apj,$mul0,$acc0 + mulx $npj,$mul1,$acc1 + add $acc0,$car0,$car0 + add $tpj,$car1,$car1 + and $car0,$mask,$acc0 + srlx $car0,32,$car0 + add $acc1,$car1,$car1 + add $acc0,$acc0,$acc0 + or $sbit,$acc0,$acc0 + srlx $acc0,32,$sbit + and $acc0,$mask,$acc0 + add $acc0,$car1,$car1 + st $car1,[$tp] ! 
tp[j-1] + srlx $car1,32,$car1 + + add $car0,$car0,$car0 + or $sbit,$car0,$car0 + add $car0,$car1,$car1 + add $car2,$car1,$car1 + st $car1,[$tp+4] + srlx $car1,32,$car2 + + ld [%sp+$bias+$frame],$tmp1 ! tp[0] + ld [%sp+$bias+$frame+4],$tpj ! tp[1] + ld [$ap+8],$mul0 ! ap[2] + ld [$np],$car1 ! np[0] + ld [$np+4],$npj ! np[1] + mulx $n0,$tmp1,$mul1 + and $mul1,$mask,$mul1 + mov 8,$i + + mulx $mul0,$mul0,$car0 + mulx $car1,$mul1,$car1 + and $car0,$mask,$acc0 + add $tmp1,$car1,$car1 + srlx $car0,32,$car0 + add %sp,$bias+$frame,$tp + srlx $car1,32,$car1 + and $car0,1,$sbit + srlx $car0,1,$car0 + mov 4,$j + +.Lsqr_outer: +.Lsqr_inner1: + mulx $npj,$mul1,$acc1 + add $tpj,$car1,$car1 + add $j,4,$j + ld [$tp+8],$tpj + cmp $j,$i + add $acc1,$car1,$car1 + ld [$np+$j],$npj + st $car1,[$tp] + srlx $car1,32,$car1 + bl %icc,.Lsqr_inner1 + add $tp,4,$tp +!.Lsqr_inner1 + + add $j,4,$j + ld [$ap+$j],$apj ! ap[j] + mulx $npj,$mul1,$acc1 + add $tpj,$car1,$car1 + ld [$np+$j],$npj ! np[j] + add $acc0,$car1,$car1 + ld [$tp+8],$tpj ! tp[j] + add $acc1,$car1,$car1 + st $car1,[$tp] + srlx $car1,32,$car1 + + add $j,4,$j + cmp $j,$num + be,pn %icc,.Lsqr_no_inner2 + add $tp,4,$tp + +.Lsqr_inner2: + mulx $apj,$mul0,$acc0 + mulx $npj,$mul1,$acc1 + add $tpj,$car1,$car1 + add $acc0,$car0,$car0 + ld [$ap+$j],$apj ! ap[j] + and $car0,$mask,$acc0 + ld [$np+$j],$npj ! np[j] + srlx $car0,32,$car0 + add $acc0,$acc0,$acc0 + ld [$tp+8],$tpj ! tp[j] + or $sbit,$acc0,$acc0 + add $j,4,$j ! j++ + srlx $acc0,32,$sbit + and $acc0,$mask,$acc0 + cmp $j,$num + add $acc0,$car1,$car1 + add $acc1,$car1,$car1 + st $car1,[$tp] ! tp[j-1] + srlx $car1,32,$car1 + bl %icc,.Lsqr_inner2 + add $tp,4,$tp ! tp++ + +.Lsqr_no_inner2: + mulx $apj,$mul0,$acc0 + mulx $npj,$mul1,$acc1 + add $tpj,$car1,$car1 + add $acc0,$car0,$car0 + and $car0,$mask,$acc0 + srlx $car0,32,$car0 + add $acc0,$acc0,$acc0 + or $sbit,$acc0,$acc0 + srlx $acc0,32,$sbit + and $acc0,$mask,$acc0 + add $acc0,$car1,$car1 + add $acc1,$car1,$car1 + st $car1,[$tp] ! tp[j-1] + srlx $car1,32,$car1 + + add $car0,$car0,$car0 + or $sbit,$car0,$car0 + add $car0,$car1,$car1 + add $car2,$car1,$car1 + st $car1,[$tp+4] + srlx $car1,32,$car2 + + add $i,4,$i ! i++ + ld [%sp+$bias+$frame],$tmp1 ! tp[0] + ld [%sp+$bias+$frame+4],$tpj ! tp[1] + ld [$ap+$i],$mul0 ! ap[j] + ld [$np],$car1 ! np[0] + ld [$np+4],$npj ! np[1] + mulx $n0,$tmp1,$mul1 + and $mul1,$mask,$mul1 + add $i,4,$tmp0 + + mulx $mul0,$mul0,$car0 + mulx $car1,$mul1,$car1 + and $car0,$mask,$acc0 + add $tmp1,$car1,$car1 + srlx $car0,32,$car0 + add %sp,$bias+$frame,$tp + srlx $car1,32,$car1 + and $car0,1,$sbit + srlx $car0,1,$car0 + + cmp $tmp0,$num ! i<num-1 + bl %icc,.Lsqr_outer + mov 4,$j + +.Lsqr_last: + mulx $npj,$mul1,$acc1 + add $tpj,$car1,$car1 + add $j,4,$j + ld [$tp+8],$tpj + cmp $j,$i + add $acc1,$car1,$car1 + ld [$np+$j],$npj + st $car1,[$tp] + srlx $car1,32,$car1 + bl %icc,.Lsqr_last + add $tp,4,$tp +!.Lsqr_last + + mulx $npj,$mul1,$acc1 + add $tpj,$car1,$car1 + add $acc0,$car1,$car1 + add $acc1,$car1,$car1 + st $car1,[$tp] + srlx $car1,32,$car1 + + add $car0,$car0,$car0 ! 
recover $car0
+ or $sbit,$car0,$car0
+ add $car0,$car1,$car1
+ add $car2,$car1,$car1
+ st $car1,[$tp+4]
+ srlx $car1,32,$car2
+
+ ba .Ltail
+ add $tp,8,$tp
+.type $fname,#function
+.size $fname,(.-$fname)
+.asciz "Montgomery Multiplication for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
+.align 32
+___
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+print $code;
+close STDOUT;
diff --git a/crypto/bn/asm/sparcv9a-mont.pl b/crypto/bn/asm/sparcv9a-mont.pl
new file mode 100755
index 0000000..a14205f
--- /dev/null
+++ b/crypto/bn/asm/sparcv9a-mont.pl
@@ -0,0 +1,882 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# October 2005
+#
+# "Teaser" Montgomery multiplication module for UltraSPARC. Why FPU?
+# Because unlike the integer multiplier, which simply stalls the whole
+# CPU, the FPU is fully pipelined and can effectively emit a 48-bit
+# partial product every cycle. Why not blended SPARC v9? One can argue
+# that making this module dependent on the UltraSPARC VIS extension
+# limits its binary compatibility. Well yes, it does exclude SPARC64
+# prior-V(!) implementations from the compatibility matrix. But the
+# rest, the whole Sun UltraSPARC family and Fujitsu's brand new
+# SPARC64 V, all support the VIS extension instructions used in this
+# module. This is considered good enough to not care about HAL SPARC64
+# users [if any] who have the integer-only pure SPARCv9 module to
+# "fall down" to.
+
+# USI&II cores currently exhibit a uniform 2x improvement [over the
+# pre-bn_mul_mont codebase] for all key lengths and benchmarks. On
+# USIII performance improves a few percent for shorter keys and
+# worsens a few percent for longer keys. This is because the USIII
+# integer multiplier is >3x faster than the USI&II one, which is
+# harder to match [but see the TODO list below]. It should also be
+# noted that SPARC64 V features out-of-order execution, which *might*
+# mean that its integer multiplier is pipelined, which in turn *might*
+# be impossible to match... On an additional note, SPARC64 V
+# implements an FP Multiply-Add instruction, which is perfectly usable
+# in this context... In other words, as far as Fujitsu SPARC64 V goes,
+# talk to the author:-)
+
+# The implementation implies the following "non-natural" limitations
+# on input arguments:
+# - num may not be less than 4;
+# - num has to be even;
+# Failure to meet either condition has no fatal effects; it simply
+# doesn't give any performance gain.
+
+# TODO:
+# - modulo-schedule the inner loop for better performance (on an
+# in-order execution core such as UltraSPARC this shall result in a
+# further noticeable(!) improvement);
+# - dedicated squaring procedure[?];
+
+######################################################################
+# November 2006
+#
+# Modulo-scheduled inner loops allow interleaving floating-point and
+# integer instructions and minimize Read-After-Write penalties. This
+# results in a *further* 20-50% performance improvement [depending on
+# key length, more for longer keys] on USI&II cores and 30-80% on
+# USIII&IV.
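+# To see why the FPU path below works with 4x16-bit pieces: a 16x16-bit
+# partial product is below 2^32, and a sum of a few such partials stays
+# far below 2^53, the width of a double's mantissa, so every fdtox
+# conversion back to an integer is exact. A minimal C sketch of the
+# principle, with a made-up name and 32-bit operands for brevity [the
+# module itself streams 64-bit words through four 16-bit limbs]:
+#
+#     #include <stdint.h>
+#
+#     /* exact 32x32->64 multiplication carried out in doubles */
+#     static uint64_t mul32_via_fp(uint32_t a, uint32_t b)
+#     {
+#         double alo = a & 0xffff, ahi = a >> 16;
+#         double blo = b & 0xffff, bhi = b >> 16;
+#         double p0 = alo*blo;            /* weight 2^0          */
+#         double p1 = alo*bhi + ahi*blo;  /* weight 2^16, < 2^34 */
+#         double p2 = ahi*bhi;            /* weight 2^32         */
+#         return (uint64_t)p0 + ((uint64_t)p1 << 16)
+#                + ((uint64_t)p2 << 32);
+#     }
+#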
+ +$fname="bn_mul_mont_fpu"; +$bits=32; +for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); } + +if ($bits==64) { + $bias=2047; + $frame=192; +} else { + $bias=0; + $frame=128; # 96 rounded up to largest known cache-line +} +$locals=64; + +# In order to provide for 32-/64-bit ABI duality, I keep integers wider +# than 32 bit in %g1-%g4 and %o0-%o5. %l0-%l7 and %i0-%i5 are used +# exclusively for pointers, indexes and other small values... +# int bn_mul_mont( +$rp="%i0"; # BN_ULONG *rp, +$ap="%i1"; # const BN_ULONG *ap, +$bp="%i2"; # const BN_ULONG *bp, +$np="%i3"; # const BN_ULONG *np, +$n0="%i4"; # const BN_ULONG *n0, +$num="%i5"; # int num); + +$tp="%l0"; # t[num] +$ap_l="%l1"; # a[num],n[num] are smashed to 32-bit words and saved +$ap_h="%l2"; # to these four vectors as double-precision FP values. +$np_l="%l3"; # This way a bunch of fxtods are eliminated in second +$np_h="%l4"; # loop and L1-cache aliasing is minimized... +$i="%l5"; +$j="%l6"; +$mask="%l7"; # 16-bit mask, 0xffff + +$n0="%g4"; # reassigned(!) to "64-bit" register +$carry="%i4"; # %i4 reused(!) for a carry bit + +# FP register naming chart +# +# ..HILO +# dcba +# -------- +# LOa +# LOb +# LOc +# LOd +# HIa +# HIb +# HIc +# HId +# ..a +# ..b +$ba="%f0"; $bb="%f2"; $bc="%f4"; $bd="%f6"; +$na="%f8"; $nb="%f10"; $nc="%f12"; $nd="%f14"; +$alo="%f16"; $alo_="%f17"; $ahi="%f18"; $ahi_="%f19"; +$nlo="%f20"; $nlo_="%f21"; $nhi="%f22"; $nhi_="%f23"; + +$dota="%f24"; $dotb="%f26"; + +$aloa="%f32"; $alob="%f34"; $aloc="%f36"; $alod="%f38"; +$ahia="%f40"; $ahib="%f42"; $ahic="%f44"; $ahid="%f46"; +$nloa="%f48"; $nlob="%f50"; $nloc="%f52"; $nlod="%f54"; +$nhia="%f56"; $nhib="%f58"; $nhic="%f60"; $nhid="%f62"; + +$ASI_FL16_P=0xD2; # magic ASI value to engage 16-bit FP load + +$code=<<___; +.section ".text",#alloc,#execinstr + +.global $fname +.align 32 +$fname: + save %sp,-$frame-$locals,%sp + + cmp $num,4 + bl,a,pn %icc,.Lret + clr %i0 + andcc $num,1,%g0 ! $num has to be even... + bnz,a,pn %icc,.Lret + clr %i0 ! signal "unsupported input value" + + srl $num,1,$num + sethi %hi(0xffff),$mask + ld [%i4+0],$n0 ! $n0 reassigned, remember? + or $mask,%lo(0xffff),$mask + ld [%i4+4],%o0 + sllx %o0,32,%o0 + or %o0,$n0,$n0 ! $n0=n0[1].n0[0] + + sll $num,3,$num ! num*=8 + + add %sp,$bias,%o0 ! real top of stack + sll $num,2,%o1 + add %o1,$num,%o1 ! %o1=num*5 + sub %o0,%o1,%o0 + and %o0,-2048,%o0 ! optimize TLB utilization + sub %o0,$bias,%sp ! alloca(5*num*8) + + rd %asi,%o7 ! save %asi + add %sp,$bias+$frame+$locals,$tp + add $tp,$num,$ap_l + add $ap_l,$num,$ap_l ! [an]p_[lh] point at the vectors' ends ! + add $ap_l,$num,$ap_h + add $ap_h,$num,$np_l + add $np_l,$num,$np_h + + wr %g0,$ASI_FL16_P,%asi ! setup %asi for 16-bit FP loads + + add $rp,$num,$rp ! readjust input pointers to point + add $ap,$num,$ap ! at the ends too... + add $bp,$num,$bp + add $np,$num,$np + + stx %o7,[%sp+$bias+$frame+48] ! save %asi + + sub %g0,$num,$i ! i=-num + sub %g0,$num,$j ! j=-num + + add $ap,$j,%o3 + add $bp,$i,%o4 + + ld [%o3+4],%g1 ! bp[0] + ld [%o3+0],%o0 + ld [%o4+4],%g5 ! ap[0] + sllx %g1,32,%g1 + ld [%o4+0],%o1 + sllx %g5,32,%g5 + or %g1,%o0,%o0 + or %g5,%o1,%o1 + + add $np,$j,%o5 + + mulx %o1,%o0,%o0 ! ap[0]*bp[0] + mulx $n0,%o0,%o0 ! ap[0]*bp[0]*n0 + stx %o0,[%sp+$bias+$frame+0] + + ld [%o3+0],$alo_ ! load a[j] as pair of 32-bit words + fzeros $alo + ld [%o3+4],$ahi_ + fzeros $ahi + ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words + fzeros $nlo + ld [%o5+4],$nhi_ + fzeros $nhi + + ! 
transfer b[i] to FPU as 4x16-bit values + ldda [%o4+2]%asi,$ba + fxtod $alo,$alo + ldda [%o4+0]%asi,$bb + fxtod $ahi,$ahi + ldda [%o4+6]%asi,$bc + fxtod $nlo,$nlo + ldda [%o4+4]%asi,$bd + fxtod $nhi,$nhi + + ! transfer ap[0]*b[0]*n0 to FPU as 4x16-bit values + ldda [%sp+$bias+$frame+6]%asi,$na + fxtod $ba,$ba + ldda [%sp+$bias+$frame+4]%asi,$nb + fxtod $bb,$bb + ldda [%sp+$bias+$frame+2]%asi,$nc + fxtod $bc,$bc + ldda [%sp+$bias+$frame+0]%asi,$nd + fxtod $bd,$bd + + std $alo,[$ap_l+$j] ! save smashed ap[j] in double format + fxtod $na,$na + std $ahi,[$ap_h+$j] + fxtod $nb,$nb + std $nlo,[$np_l+$j] ! save smashed np[j] in double format + fxtod $nc,$nc + std $nhi,[$np_h+$j] + fxtod $nd,$nd + + fmuld $alo,$ba,$aloa + fmuld $nlo,$na,$nloa + fmuld $alo,$bb,$alob + fmuld $nlo,$nb,$nlob + fmuld $alo,$bc,$aloc + faddd $aloa,$nloa,$nloa + fmuld $nlo,$nc,$nloc + fmuld $alo,$bd,$alod + faddd $alob,$nlob,$nlob + fmuld $nlo,$nd,$nlod + fmuld $ahi,$ba,$ahia + faddd $aloc,$nloc,$nloc + fmuld $nhi,$na,$nhia + fmuld $ahi,$bb,$ahib + faddd $alod,$nlod,$nlod + fmuld $nhi,$nb,$nhib + fmuld $ahi,$bc,$ahic + faddd $ahia,$nhia,$nhia + fmuld $nhi,$nc,$nhic + fmuld $ahi,$bd,$ahid + faddd $ahib,$nhib,$nhib + fmuld $nhi,$nd,$nhid + + faddd $ahic,$nhic,$dota ! $nhic + faddd $ahid,$nhid,$dotb ! $nhid + + faddd $nloc,$nhia,$nloc + faddd $nlod,$nhib,$nlod + + fdtox $nloa,$nloa + fdtox $nlob,$nlob + fdtox $nloc,$nloc + fdtox $nlod,$nlod + + std $nloa,[%sp+$bias+$frame+0] + add $j,8,$j + std $nlob,[%sp+$bias+$frame+8] + add $ap,$j,%o4 + std $nloc,[%sp+$bias+$frame+16] + add $np,$j,%o5 + std $nlod,[%sp+$bias+$frame+24] + + ld [%o4+0],$alo_ ! load a[j] as pair of 32-bit words + fzeros $alo + ld [%o4+4],$ahi_ + fzeros $ahi + ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words + fzeros $nlo + ld [%o5+4],$nhi_ + fzeros $nhi + + fxtod $alo,$alo + fxtod $ahi,$ahi + fxtod $nlo,$nlo + fxtod $nhi,$nhi + + ldx [%sp+$bias+$frame+0],%o0 + fmuld $alo,$ba,$aloa + ldx [%sp+$bias+$frame+8],%o1 + fmuld $nlo,$na,$nloa + ldx [%sp+$bias+$frame+16],%o2 + fmuld $alo,$bb,$alob + ldx [%sp+$bias+$frame+24],%o3 + fmuld $nlo,$nb,$nlob + + srlx %o0,16,%o7 + std $alo,[$ap_l+$j] ! save smashed ap[j] in double format + fmuld $alo,$bc,$aloc + add %o7,%o1,%o1 + std $ahi,[$ap_h+$j] + faddd $aloa,$nloa,$nloa + fmuld $nlo,$nc,$nloc + srlx %o1,16,%o7 + std $nlo,[$np_l+$j] ! save smashed np[j] in double format + fmuld $alo,$bd,$alod + add %o7,%o2,%o2 + std $nhi,[$np_h+$j] + faddd $alob,$nlob,$nlob + fmuld $nlo,$nd,$nlod + srlx %o2,16,%o7 + fmuld $ahi,$ba,$ahia + add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15] + faddd $aloc,$nloc,$nloc + fmuld $nhi,$na,$nhia + !and %o0,$mask,%o0 + !and %o1,$mask,%o1 + !and %o2,$mask,%o2 + !sllx %o1,16,%o1 + !sllx %o2,32,%o2 + !sllx %o3,48,%o7 + !or %o1,%o0,%o0 + !or %o2,%o0,%o0 + !or %o7,%o0,%o0 ! 64-bit result + srlx %o3,16,%g1 ! 34-bit carry + fmuld $ahi,$bb,$ahib + + faddd $alod,$nlod,$nlod + fmuld $nhi,$nb,$nhib + fmuld $ahi,$bc,$ahic + faddd $ahia,$nhia,$nhia + fmuld $nhi,$nc,$nhic + fmuld $ahi,$bd,$ahid + faddd $ahib,$nhib,$nhib + fmuld $nhi,$nd,$nhid + + faddd $dota,$nloa,$nloa + faddd $dotb,$nlob,$nlob + faddd $ahic,$nhic,$dota ! $nhic + faddd $ahid,$nhid,$dotb ! $nhid + + faddd $nloc,$nhia,$nloc + faddd $nlod,$nhib,$nlod + + fdtox $nloa,$nloa + fdtox $nlob,$nlob + fdtox $nloc,$nloc + fdtox $nlod,$nlod + + std $nloa,[%sp+$bias+$frame+0] + std $nlob,[%sp+$bias+$frame+8] + addcc $j,8,$j + std $nloc,[%sp+$bias+$frame+16] + bz,pn %icc,.L1stskip + std $nlod,[%sp+$bias+$frame+24] + +.align 32 ! 
incidentally already aligned ! +.L1st: + add $ap,$j,%o4 + add $np,$j,%o5 + ld [%o4+0],$alo_ ! load a[j] as pair of 32-bit words + fzeros $alo + ld [%o4+4],$ahi_ + fzeros $ahi + ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words + fzeros $nlo + ld [%o5+4],$nhi_ + fzeros $nhi + + fxtod $alo,$alo + fxtod $ahi,$ahi + fxtod $nlo,$nlo + fxtod $nhi,$nhi + + ldx [%sp+$bias+$frame+0],%o0 + fmuld $alo,$ba,$aloa + ldx [%sp+$bias+$frame+8],%o1 + fmuld $nlo,$na,$nloa + ldx [%sp+$bias+$frame+16],%o2 + fmuld $alo,$bb,$alob + ldx [%sp+$bias+$frame+24],%o3 + fmuld $nlo,$nb,$nlob + + srlx %o0,16,%o7 + std $alo,[$ap_l+$j] ! save smashed ap[j] in double format + fmuld $alo,$bc,$aloc + add %o7,%o1,%o1 + std $ahi,[$ap_h+$j] + faddd $aloa,$nloa,$nloa + fmuld $nlo,$nc,$nloc + srlx %o1,16,%o7 + std $nlo,[$np_l+$j] ! save smashed np[j] in double format + fmuld $alo,$bd,$alod + add %o7,%o2,%o2 + std $nhi,[$np_h+$j] + faddd $alob,$nlob,$nlob + fmuld $nlo,$nd,$nlod + srlx %o2,16,%o7 + fmuld $ahi,$ba,$ahia + add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15] + and %o0,$mask,%o0 + faddd $aloc,$nloc,$nloc + fmuld $nhi,$na,$nhia + and %o1,$mask,%o1 + and %o2,$mask,%o2 + fmuld $ahi,$bb,$ahib + sllx %o1,16,%o1 + faddd $alod,$nlod,$nlod + fmuld $nhi,$nb,$nhib + sllx %o2,32,%o2 + fmuld $ahi,$bc,$ahic + sllx %o3,48,%o7 + or %o1,%o0,%o0 + faddd $ahia,$nhia,$nhia + fmuld $nhi,$nc,$nhic + or %o2,%o0,%o0 + fmuld $ahi,$bd,$ahid + or %o7,%o0,%o0 ! 64-bit result + faddd $ahib,$nhib,$nhib + fmuld $nhi,$nd,$nhid + addcc %g1,%o0,%o0 + faddd $dota,$nloa,$nloa + srlx %o3,16,%g1 ! 34-bit carry + faddd $dotb,$nlob,$nlob + bcs,a %xcc,.+8 + add %g1,1,%g1 + + stx %o0,[$tp] ! tp[j-1]= + + faddd $ahic,$nhic,$dota ! $nhic + faddd $ahid,$nhid,$dotb ! $nhid + + faddd $nloc,$nhia,$nloc + faddd $nlod,$nhib,$nlod + + fdtox $nloa,$nloa + fdtox $nlob,$nlob + fdtox $nloc,$nloc + fdtox $nlod,$nlod + + std $nloa,[%sp+$bias+$frame+0] + std $nlob,[%sp+$bias+$frame+8] + std $nloc,[%sp+$bias+$frame+16] + std $nlod,[%sp+$bias+$frame+24] + + addcc $j,8,$j + bnz,pt %icc,.L1st + add $tp,8,$tp + +.L1stskip: + fdtox $dota,$dota + fdtox $dotb,$dotb + + ldx [%sp+$bias+$frame+0],%o0 + ldx [%sp+$bias+$frame+8],%o1 + ldx [%sp+$bias+$frame+16],%o2 + ldx [%sp+$bias+$frame+24],%o3 + + srlx %o0,16,%o7 + std $dota,[%sp+$bias+$frame+32] + add %o7,%o1,%o1 + std $dotb,[%sp+$bias+$frame+40] + srlx %o1,16,%o7 + add %o7,%o2,%o2 + srlx %o2,16,%o7 + add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15] + and %o0,$mask,%o0 + and %o1,$mask,%o1 + and %o2,$mask,%o2 + sllx %o1,16,%o1 + sllx %o2,32,%o2 + sllx %o3,48,%o7 + or %o1,%o0,%o0 + or %o2,%o0,%o0 + or %o7,%o0,%o0 ! 64-bit result + ldx [%sp+$bias+$frame+32],%o4 + addcc %g1,%o0,%o0 + ldx [%sp+$bias+$frame+40],%o5 + srlx %o3,16,%g1 ! 34-bit carry + bcs,a %xcc,.+8 + add %g1,1,%g1 + + stx %o0,[$tp] ! tp[j-1]= + add $tp,8,$tp + + srlx %o4,16,%o7 + add %o7,%o5,%o5 + and %o4,$mask,%o4 + sllx %o5,16,%o7 + or %o7,%o4,%o4 + addcc %g1,%o4,%o4 + srlx %o5,48,%g1 + bcs,a %xcc,.+8 + add %g1,1,%g1 + + mov %g1,$carry + stx %o4,[$tp] ! tp[num-1]= + + ba .Louter + add $i,8,$i +.align 32 +.Louter: + sub %g0,$num,$j ! j=-num + add %sp,$bias+$frame+$locals,$tp + + add $ap,$j,%o3 + add $bp,$i,%o4 + + ld [%o3+4],%g1 ! bp[i] + ld [%o3+0],%o0 + ld [%o4+4],%g5 ! ap[0] + sllx %g1,32,%g1 + ld [%o4+0],%o1 + sllx %g5,32,%g5 + or %g1,%o0,%o0 + or %g5,%o1,%o1 + + ldx [$tp],%o2 ! tp[0] + mulx %o1,%o0,%o0 + addcc %o2,%o0,%o0 + mulx $n0,%o0,%o0 ! (ap[0]*bp[i]+t[0])*n0 + stx %o0,[%sp+$bias+$frame+0] + + ! 
transfer b[i] to FPU as 4x16-bit values + ldda [%o4+2]%asi,$ba + ldda [%o4+0]%asi,$bb + ldda [%o4+6]%asi,$bc + ldda [%o4+4]%asi,$bd + + ! transfer (ap[0]*b[i]+t[0])*n0 to FPU as 4x16-bit values + ldda [%sp+$bias+$frame+6]%asi,$na + fxtod $ba,$ba + ldda [%sp+$bias+$frame+4]%asi,$nb + fxtod $bb,$bb + ldda [%sp+$bias+$frame+2]%asi,$nc + fxtod $bc,$bc + ldda [%sp+$bias+$frame+0]%asi,$nd + fxtod $bd,$bd + ldd [$ap_l+$j],$alo ! load a[j] in double format + fxtod $na,$na + ldd [$ap_h+$j],$ahi + fxtod $nb,$nb + ldd [$np_l+$j],$nlo ! load n[j] in double format + fxtod $nc,$nc + ldd [$np_h+$j],$nhi + fxtod $nd,$nd + + fmuld $alo,$ba,$aloa + fmuld $nlo,$na,$nloa + fmuld $alo,$bb,$alob + fmuld $nlo,$nb,$nlob + fmuld $alo,$bc,$aloc + faddd $aloa,$nloa,$nloa + fmuld $nlo,$nc,$nloc + fmuld $alo,$bd,$alod + faddd $alob,$nlob,$nlob + fmuld $nlo,$nd,$nlod + fmuld $ahi,$ba,$ahia + faddd $aloc,$nloc,$nloc + fmuld $nhi,$na,$nhia + fmuld $ahi,$bb,$ahib + faddd $alod,$nlod,$nlod + fmuld $nhi,$nb,$nhib + fmuld $ahi,$bc,$ahic + faddd $ahia,$nhia,$nhia + fmuld $nhi,$nc,$nhic + fmuld $ahi,$bd,$ahid + faddd $ahib,$nhib,$nhib + fmuld $nhi,$nd,$nhid + + faddd $ahic,$nhic,$dota ! $nhic + faddd $ahid,$nhid,$dotb ! $nhid + + faddd $nloc,$nhia,$nloc + faddd $nlod,$nhib,$nlod + + fdtox $nloa,$nloa + fdtox $nlob,$nlob + fdtox $nloc,$nloc + fdtox $nlod,$nlod + + std $nloa,[%sp+$bias+$frame+0] + std $nlob,[%sp+$bias+$frame+8] + std $nloc,[%sp+$bias+$frame+16] + add $j,8,$j + std $nlod,[%sp+$bias+$frame+24] + + ldd [$ap_l+$j],$alo ! load a[j] in double format + ldd [$ap_h+$j],$ahi + ldd [$np_l+$j],$nlo ! load n[j] in double format + ldd [$np_h+$j],$nhi + + fmuld $alo,$ba,$aloa + fmuld $nlo,$na,$nloa + fmuld $alo,$bb,$alob + fmuld $nlo,$nb,$nlob + fmuld $alo,$bc,$aloc + ldx [%sp+$bias+$frame+0],%o0 + faddd $aloa,$nloa,$nloa + fmuld $nlo,$nc,$nloc + ldx [%sp+$bias+$frame+8],%o1 + fmuld $alo,$bd,$alod + ldx [%sp+$bias+$frame+16],%o2 + faddd $alob,$nlob,$nlob + fmuld $nlo,$nd,$nlod + ldx [%sp+$bias+$frame+24],%o3 + fmuld $ahi,$ba,$ahia + + srlx %o0,16,%o7 + faddd $aloc,$nloc,$nloc + fmuld $nhi,$na,$nhia + add %o7,%o1,%o1 + fmuld $ahi,$bb,$ahib + srlx %o1,16,%o7 + faddd $alod,$nlod,$nlod + fmuld $nhi,$nb,$nhib + add %o7,%o2,%o2 + fmuld $ahi,$bc,$ahic + srlx %o2,16,%o7 + faddd $ahia,$nhia,$nhia + fmuld $nhi,$nc,$nhic + add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15] + ! why? + and %o0,$mask,%o0 + fmuld $ahi,$bd,$ahid + and %o1,$mask,%o1 + and %o2,$mask,%o2 + faddd $ahib,$nhib,$nhib + fmuld $nhi,$nd,$nhid + sllx %o1,16,%o1 + faddd $dota,$nloa,$nloa + sllx %o2,32,%o2 + faddd $dotb,$nlob,$nlob + sllx %o3,48,%o7 + or %o1,%o0,%o0 + faddd $ahic,$nhic,$dota ! $nhic + or %o2,%o0,%o0 + faddd $ahid,$nhid,$dotb ! $nhid + or %o7,%o0,%o0 ! 64-bit result + ldx [$tp],%o7 + faddd $nloc,$nhia,$nloc + addcc %o7,%o0,%o0 + ! end-of-why? + faddd $nlod,$nhib,$nlod + srlx %o3,16,%g1 ! 34-bit carry + fdtox $nloa,$nloa + bcs,a %xcc,.+8 + add %g1,1,%g1 + + fdtox $nlob,$nlob + fdtox $nloc,$nloc + fdtox $nlod,$nlod + + std $nloa,[%sp+$bias+$frame+0] + std $nlob,[%sp+$bias+$frame+8] + addcc $j,8,$j + std $nloc,[%sp+$bias+$frame+16] + bz,pn %icc,.Linnerskip + std $nlod,[%sp+$bias+$frame+24] + + ba .Linner + nop +.align 32 +.Linner: + ldd [$ap_l+$j],$alo ! load a[j] in double format + ldd [$ap_h+$j],$ahi + ldd [$np_l+$j],$nlo ! 
load n[j] in double format + ldd [$np_h+$j],$nhi + + fmuld $alo,$ba,$aloa + fmuld $nlo,$na,$nloa + fmuld $alo,$bb,$alob + fmuld $nlo,$nb,$nlob + fmuld $alo,$bc,$aloc + ldx [%sp+$bias+$frame+0],%o0 + faddd $aloa,$nloa,$nloa + fmuld $nlo,$nc,$nloc + ldx [%sp+$bias+$frame+8],%o1 + fmuld $alo,$bd,$alod + ldx [%sp+$bias+$frame+16],%o2 + faddd $alob,$nlob,$nlob + fmuld $nlo,$nd,$nlod + ldx [%sp+$bias+$frame+24],%o3 + fmuld $ahi,$ba,$ahia + + srlx %o0,16,%o7 + faddd $aloc,$nloc,$nloc + fmuld $nhi,$na,$nhia + add %o7,%o1,%o1 + fmuld $ahi,$bb,$ahib + srlx %o1,16,%o7 + faddd $alod,$nlod,$nlod + fmuld $nhi,$nb,$nhib + add %o7,%o2,%o2 + fmuld $ahi,$bc,$ahic + srlx %o2,16,%o7 + faddd $ahia,$nhia,$nhia + fmuld $nhi,$nc,$nhic + add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15] + and %o0,$mask,%o0 + fmuld $ahi,$bd,$ahid + and %o1,$mask,%o1 + and %o2,$mask,%o2 + faddd $ahib,$nhib,$nhib + fmuld $nhi,$nd,$nhid + sllx %o1,16,%o1 + faddd $dota,$nloa,$nloa + sllx %o2,32,%o2 + faddd $dotb,$nlob,$nlob + sllx %o3,48,%o7 + or %o1,%o0,%o0 + faddd $ahic,$nhic,$dota ! $nhic + or %o2,%o0,%o0 + faddd $ahid,$nhid,$dotb ! $nhid + or %o7,%o0,%o0 ! 64-bit result + faddd $nloc,$nhia,$nloc + addcc %g1,%o0,%o0 + ldx [$tp+8],%o7 ! tp[j] + faddd $nlod,$nhib,$nlod + srlx %o3,16,%g1 ! 34-bit carry + fdtox $nloa,$nloa + bcs,a %xcc,.+8 + add %g1,1,%g1 + fdtox $nlob,$nlob + addcc %o7,%o0,%o0 + fdtox $nloc,$nloc + bcs,a %xcc,.+8 + add %g1,1,%g1 + + stx %o0,[$tp] ! tp[j-1] + fdtox $nlod,$nlod + + std $nloa,[%sp+$bias+$frame+0] + std $nlob,[%sp+$bias+$frame+8] + std $nloc,[%sp+$bias+$frame+16] + addcc $j,8,$j + std $nlod,[%sp+$bias+$frame+24] + bnz,pt %icc,.Linner + add $tp,8,$tp + +.Linnerskip: + fdtox $dota,$dota + fdtox $dotb,$dotb + + ldx [%sp+$bias+$frame+0],%o0 + ldx [%sp+$bias+$frame+8],%o1 + ldx [%sp+$bias+$frame+16],%o2 + ldx [%sp+$bias+$frame+24],%o3 + + srlx %o0,16,%o7 + std $dota,[%sp+$bias+$frame+32] + add %o7,%o1,%o1 + std $dotb,[%sp+$bias+$frame+40] + srlx %o1,16,%o7 + add %o7,%o2,%o2 + srlx %o2,16,%o7 + add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15] + and %o0,$mask,%o0 + and %o1,$mask,%o1 + and %o2,$mask,%o2 + sllx %o1,16,%o1 + sllx %o2,32,%o2 + sllx %o3,48,%o7 + or %o1,%o0,%o0 + or %o2,%o0,%o0 + ldx [%sp+$bias+$frame+32],%o4 + or %o7,%o0,%o0 ! 64-bit result + ldx [%sp+$bias+$frame+40],%o5 + addcc %g1,%o0,%o0 + ldx [$tp+8],%o7 ! tp[j] + srlx %o3,16,%g1 ! 34-bit carry + bcs,a %xcc,.+8 + add %g1,1,%g1 + + addcc %o7,%o0,%o0 + bcs,a %xcc,.+8 + add %g1,1,%g1 + + stx %o0,[$tp] ! tp[j-1] + add $tp,8,$tp + + srlx %o4,16,%o7 + add %o7,%o5,%o5 + and %o4,$mask,%o4 + sllx %o5,16,%o7 + or %o7,%o4,%o4 + addcc %g1,%o4,%o4 + srlx %o5,48,%g1 + bcs,a %xcc,.+8 + add %g1,1,%g1 + + addcc $carry,%o4,%o4 + stx %o4,[$tp] ! tp[num-1] + mov %g1,$carry + bcs,a %xcc,.+8 + add $carry,1,$carry + + addcc $i,8,$i + bnz %icc,.Louter + nop + + add $tp,8,$tp ! adjust tp to point at the end + orn %g0,%g0,%g4 + sub %g0,$num,%o7 ! n=-num + ba .Lsub + subcc %g0,%g0,%g0 ! clear %icc.c + +.align 32 +.Lsub: + ldx [$tp+%o7],%o0 + add $np,%o7,%g1 + ld [%g1+0],%o2 + ld [%g1+4],%o3 + srlx %o0,32,%o1 + subccc %o0,%o2,%o2 + add $rp,%o7,%g1 + subccc %o1,%o3,%o3 + st %o2,[%g1+0] + add %o7,8,%o7 + brnz,pt %o7,.Lsub + st %o3,[%g1+4] + subc $carry,0,%g4 + sub %g0,$num,%o7 ! 
n=-num
+ ba .Lcopy
+ nop
+
+.align 32
+.Lcopy:
+ ldx [$tp+%o7],%o0
+ add $rp,%o7,%g1
+ ld [%g1+0],%o2
+ ld [%g1+4],%o3
+ stx %g0,[$tp+%o7]
+ and %o0,%g4,%o0
+ srlx %o0,32,%o1
+ andn %o2,%g4,%o2
+ andn %o3,%g4,%o3
+ or %o2,%o0,%o0
+ or %o3,%o1,%o1
+ st %o0,[%g1+0]
+ add %o7,8,%o7
+ brnz,pt %o7,.Lcopy
+ st %o1,[%g1+4]
+ sub %g0,$num,%o7 ! n=-num
+
+.Lzap:
+ stx %g0,[$ap_l+%o7]
+ stx %g0,[$ap_h+%o7]
+ stx %g0,[$np_l+%o7]
+ stx %g0,[$np_h+%o7]
+ add %o7,8,%o7
+ brnz,pt %o7,.Lzap
+ nop
+
+ ldx [%sp+$bias+$frame+48],%o7
+ wr %g0,%o7,%asi ! restore %asi
+
+ mov 1,%i0
+.Lret:
+ ret
+ restore
+.type $fname,#function
+.size $fname,(.-$fname)
+.asciz "Montgomery Multiplication for UltraSPARC, CRYPTOGAMS by <appro\@openssl.org>"
+.align 32
+___
+
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+
+# The substitution below makes it possible to compile without demanding
+# VIS extensions on the command line, e.g. -xarch=v9 vs. -xarch=v9a. I
+# dare to do this because VIS capability is detected at run-time now
+# and this routine is not called on a CPU not capable of executing it.
+# Do note that fzeros is not the only VIS dependency! Another
+# dependency is implicit and is just _a_ numerical value loaded into
+# the %asi register, which the assembler can't recognize as
+# VIS-specific...
+$code =~ s/fzeros\s+%f([0-9]+)/
+ sprintf(".word\t0x%x\t! fzeros %%f%d",0x81b00c20|($1<<25),$1)
+ /gem;
+
+print $code;
+# flush
+close STDOUT;
diff --git a/crypto/bn/asm/via-mont.pl b/crypto/bn/asm/via-mont.pl
new file mode 100755
index 0000000..c046a51
--- /dev/null
+++ b/crypto/bn/asm/via-mont.pl
@@ -0,0 +1,242 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# Wrapper around 'rep montmul', a VIA-specific instruction accessing
+# the PadLock Montgomery Multiplier. The wrapper is designed as a
+# drop-in replacement for OpenSSL bn_mul_mont [first implemented in
+# 0.9.9].
+#
+# Below are interleaved outputs from 'openssl speed rsa dsa' for 4
+# different software configurations on a 1.5GHz VIA Esther processor.
+# Lines marked with "software integer" denote the performance of the
+# hand-coded integer-only assembler found in OpenSSL 0.9.7. "Software
+# SSE2" refers to the hand-coded SSE2 Montgomery multiplication
+# procedure found in OpenSSL 0.9.9. "Hardware VIA SDK" refers to the
+# padlock_pmm routine from Padlock SDK 2.0.1 available for download
+# from VIA, which naturally utilizes the magic 'repz montmul'
+# instruction. And finally "hardware
And finally "hardware +# this" refers to *this* implementation which also uses 'repz montmul' +# +# sign verify sign/s verify/s +# rsa 512 bits 0.001720s 0.000140s 581.4 7149.7 software integer +# rsa 512 bits 0.000690s 0.000086s 1450.3 11606.0 software SSE2 +# rsa 512 bits 0.006136s 0.000201s 163.0 4974.5 hardware VIA SDK +# rsa 512 bits 0.000712s 0.000050s 1404.9 19858.5 hardware this +# +# rsa 1024 bits 0.008518s 0.000413s 117.4 2420.8 software integer +# rsa 1024 bits 0.004275s 0.000277s 233.9 3609.7 software SSE2 +# rsa 1024 bits 0.012136s 0.000260s 82.4 3844.5 hardware VIA SDK +# rsa 1024 bits 0.002522s 0.000116s 396.5 8650.9 hardware this +# +# rsa 2048 bits 0.050101s 0.001371s 20.0 729.6 software integer +# rsa 2048 bits 0.030273s 0.001008s 33.0 991.9 software SSE2 +# rsa 2048 bits 0.030833s 0.000976s 32.4 1025.1 hardware VIA SDK +# rsa 2048 bits 0.011879s 0.000342s 84.2 2921.7 hardware this +# +# rsa 4096 bits 0.327097s 0.004859s 3.1 205.8 software integer +# rsa 4096 bits 0.229318s 0.003859s 4.4 259.2 software SSE2 +# rsa 4096 bits 0.233953s 0.003274s 4.3 305.4 hardware VIA SDK +# rsa 4096 bits 0.070493s 0.001166s 14.2 857.6 hardware this +# +# dsa 512 bits 0.001342s 0.001651s 745.2 605.7 software integer +# dsa 512 bits 0.000844s 0.000987s 1185.3 1013.1 software SSE2 +# dsa 512 bits 0.001902s 0.002247s 525.6 444.9 hardware VIA SDK +# dsa 512 bits 0.000458s 0.000524s 2182.2 1909.1 hardware this +# +# dsa 1024 bits 0.003964s 0.004926s 252.3 203.0 software integer +# dsa 1024 bits 0.002686s 0.003166s 372.3 315.8 software SSE2 +# dsa 1024 bits 0.002397s 0.002823s 417.1 354.3 hardware VIA SDK +# dsa 1024 bits 0.000978s 0.001170s 1022.2 855.0 hardware this +# +# dsa 2048 bits 0.013280s 0.016518s 75.3 60.5 software integer +# dsa 2048 bits 0.009911s 0.011522s 100.9 86.8 software SSE2 +# dsa 2048 bits 0.009542s 0.011763s 104.8 85.0 hardware VIA SDK +# dsa 2048 bits 0.002884s 0.003352s 346.8 298.3 hardware this +# +# To give you some other reference point here is output for 2.4GHz P4 +# running hand-coded SSE2 bn_mul_mont found in 0.9.9, i.e. "software +# SSE2" in above terms. +# +# rsa 512 bits 0.000407s 0.000047s 2454.2 21137.0 +# rsa 1024 bits 0.002426s 0.000141s 412.1 7100.0 +# rsa 2048 bits 0.015046s 0.000491s 66.5 2034.9 +# rsa 4096 bits 0.109770s 0.002379s 9.1 420.3 +# dsa 512 bits 0.000438s 0.000525s 2281.1 1904.1 +# dsa 1024 bits 0.001346s 0.001595s 742.7 627.0 +# dsa 2048 bits 0.004745s 0.005582s 210.7 179.1 +# +# Conclusions: +# - VIA SDK leaves a *lot* of room for improvement (which this +# implementation successfully fills:-); +# - 'rep montmul' gives up to >3x performance improvement depending on +# key length; +# - in terms of absolute performance it delivers approximately as much +# as modern out-of-order 32-bit cores [again, for longer keys]. 
+ +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +push(@INC,"${dir}","${dir}../../perlasm"); +require "x86asm.pl"; + +&asm_init($ARGV[0],"via-mont.pl"); + +# int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num); +$func="bn_mul_mont_padlock"; + +$pad=16*1; # amount of reserved bytes on top of every vector + +# stack layout +$mZeroPrime=&DWP(0,"esp"); # these are specified by VIA +$A=&DWP(4,"esp"); +$B=&DWP(8,"esp"); +$T=&DWP(12,"esp"); +$M=&DWP(16,"esp"); +$scratch=&DWP(20,"esp"); +$rp=&DWP(24,"esp"); # these are mine +$sp=&DWP(28,"esp"); +# &DWP(32,"esp") # 32 byte scratch area +# &DWP(64+(4*$num+$pad)*0,"esp") # padded tp[num] +# &DWP(64+(4*$num+$pad)*1,"esp") # padded copy of ap[num] +# &DWP(64+(4*$num+$pad)*2,"esp") # padded copy of bp[num] +# &DWP(64+(4*$num+$pad)*3,"esp") # padded copy of np[num] +# Note that SDK suggests to unconditionally allocate 2K per vector. This +# has quite an impact on performance. It naturally depends on key length, +# but to give an example 1024 bit private RSA key operations suffer >30% +# penalty. I allocate only as much as actually required... + +&function_begin($func); + &xor ("eax","eax"); + &mov ("ecx",&wparam(5)); # num + # meet VIA's limitations for num [note that the specification + # expresses them in bits, while we work with amount of 32-bit words] + &test ("ecx",3); + &jnz (&label("leave")); # num % 4 != 0 + &cmp ("ecx",8); + &jb (&label("leave")); # num < 8 + &cmp ("ecx",1024); + &ja (&label("leave")); # num > 1024 + + &pushf (); + &cld (); + + &mov ("edi",&wparam(0)); # rp + &mov ("eax",&wparam(1)); # ap + &mov ("ebx",&wparam(2)); # bp + &mov ("edx",&wparam(3)); # np + &mov ("esi",&wparam(4)); # n0 + &mov ("esi",&DWP(0,"esi")); # *n0 + + &lea ("ecx",&DWP($pad,"","ecx",4)); # ecx becomes vector size in bytes + &lea ("ebp",&DWP(64,"","ecx",4)); # allocate 4 vectors + 64 bytes + &neg ("ebp"); + &add ("ebp","esp"); + &and ("ebp",-64); # align to cache-line + &xchg ("ebp","esp"); # alloca + + &mov ($rp,"edi"); # save rp + &mov ($sp,"ebp"); # save esp + + &mov ($mZeroPrime,"esi"); + &lea ("esi",&DWP(64,"esp")); # tp + &mov ($T,"esi"); + &lea ("edi",&DWP(32,"esp")); # scratch area + &mov ($scratch,"edi"); + &mov ("esi","eax"); + + &lea ("ebp",&DWP(-$pad,"ecx")); + &shr ("ebp",2); # restore original num value in ebp + + &xor ("eax","eax"); + + &mov ("ecx","ebp"); + &lea ("ecx",&DWP((32+$pad)/4,"ecx"));# padded tp + scratch + &data_byte(0xf3,0xab); # rep stosl, bzero + + &mov ("ecx","ebp"); + &lea ("edi",&DWP(64+$pad,"esp","ecx",4));# pointer to ap copy + &mov ($A,"edi"); + &data_byte(0xf3,0xa5); # rep movsl, memcpy + &mov ("ecx",$pad/4); + &data_byte(0xf3,0xab); # rep stosl, bzero pad + # edi points at the end of padded ap copy... + + &mov ("ecx","ebp"); + &mov ("esi","ebx"); + &mov ($B,"edi"); + &data_byte(0xf3,0xa5); # rep movsl, memcpy + &mov ("ecx",$pad/4); + &data_byte(0xf3,0xab); # rep stosl, bzero pad + # edi points at the end of padded bp copy... + + &mov ("ecx","ebp"); + &mov ("esi","edx"); + &mov ($M,"edi"); + &data_byte(0xf3,0xa5); # rep movsl, memcpy + &mov ("ecx",$pad/4); + &data_byte(0xf3,0xab); # rep stosl, bzero pad + # edi points at the end of padded np copy... + + # let magic happen... + &mov ("ecx","ebp"); + &mov ("esi","esp"); + &shl ("ecx",5); # convert word counter to bit counter + &align (4); + &data_byte(0xf3,0x0f,0xa6,0xc0);# rep montmul + + &mov ("ecx","ebp"); + &lea ("esi",&DWP(64,"esp")); # tp + # edi still points at the end of padded np copy... 
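+ # What follows is the usual Montgomery epilogue: compute tp-np once,
+ # storing the difference into rp, then use the final borrow to choose,
+ # without branching, between tp [borrow set, i.e. tp<np] and the
+ # freshly written rp [tp>=np], and copy the winner back to rp while
+ # zapping the temporary vector.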
+ &neg ("ebp"); + &lea ("ebp",&DWP(-$pad,"edi","ebp",4)); # so just "rewind" + &mov ("edi",$rp); # restore rp + &xor ("edx","edx"); # i=0 and clear CF + +&set_label("sub",8); + &mov ("eax",&DWP(0,"esi","edx",4)); + &sbb ("eax",&DWP(0,"ebp","edx",4)); + &mov (&DWP(0,"edi","edx",4),"eax"); # rp[i]=tp[i]-np[i] + &lea ("edx",&DWP(1,"edx")); # i++ + &loop (&label("sub")); # doesn't affect CF! + + &mov ("eax",&DWP(0,"esi","edx",4)); # upmost overflow bit + &sbb ("eax",0); + &and ("esi","eax"); + ¬ ("eax"); + &mov ("ebp","edi"); + &and ("ebp","eax"); + &or ("esi","ebp"); # tp=carry?tp:rp + + &mov ("ecx","edx"); # num + &xor ("edx","edx"); # i=0 + +&set_label("copy",8); + &mov ("eax",&DWP(0,"esi","edx",4)); + &mov (&DWP(64,"esp","edx",4),"ecx"); # zap tp + &mov (&DWP(0,"edi","edx",4),"eax"); + &lea ("edx",&DWP(1,"edx")); # i++ + &loop (&label("copy")); + + &mov ("ebp",$sp); + &xor ("eax","eax"); + + &mov ("ecx",64/4); + &mov ("edi","esp"); # zap frame including scratch area + &data_byte(0xf3,0xab); # rep stosl, bzero + + # zap copies of ap, bp and np + &lea ("edi",&DWP(64+$pad,"esp","edx",4));# pointer to ap + &lea ("ecx",&DWP(3*$pad/4,"edx","edx",2)); + &data_byte(0xf3,0xab); # rep stosl, bzero + + &mov ("esp","ebp"); + &inc ("eax"); # signal "done" + &popf (); +&set_label("leave"); +&function_end($func); + +&asciz("Padlock Montgomery Multiplication, CRYPTOGAMS by <appro\@openssl.org>"); + +&asm_finish(); diff --git a/crypto/bn/asm/x86-mont.pl b/crypto/bn/asm/x86-mont.pl new file mode 100755 index 0000000..5cd3cd2 --- /dev/null +++ b/crypto/bn/asm/x86-mont.pl @@ -0,0 +1,591 @@ +#!/usr/bin/env perl + +# ==================================================================== +# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== + +# October 2005 +# +# This is a "teaser" code, as it can be improved in several ways... +# First of all non-SSE2 path should be implemented (yes, for now it +# performs Montgomery multiplication/convolution only on SSE2-capable +# CPUs such as P4, others fall down to original code). Then inner loop +# can be unrolled and modulo-scheduled to improve ILP and possibly +# moved to 128-bit XMM register bank (though it would require input +# rearrangement and/or increase bus bandwidth utilization). Dedicated +# squaring procedure should give further performance improvement... +# Yet, for being draft, the code improves rsa512 *sign* benchmark by +# 110%(!), rsa1024 one - by 70% and rsa4096 - by 20%:-) + +# December 2006 +# +# Modulo-scheduling SSE2 loops results in further 15-20% improvement. +# Integer-only code [being equipped with dedicated squaring procedure] +# gives ~40% on rsa512 sign benchmark... + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +push(@INC,"${dir}","${dir}../../perlasm"); +require "x86asm.pl"; + +&asm_init($ARGV[0],$0); + +$sse2=0; +for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } + +&external_label("OPENSSL_ia32cap_P") if ($sse2); + +&function_begin("bn_mul_mont"); + +$i="edx"; +$j="ecx"; +$ap="esi"; $tp="esi"; # overlapping variables!!! +$rp="edi"; $bp="edi"; # overlapping variables!!! 
+$np="ebp";
+$num="ebx";
+
+$_num=&DWP(4*0,"esp"); # stack top layout
+$_rp=&DWP(4*1,"esp");
+$_ap=&DWP(4*2,"esp");
+$_bp=&DWP(4*3,"esp");
+$_np=&DWP(4*4,"esp");
+$_n0=&DWP(4*5,"esp"); $_n0q=&QWP(4*5,"esp");
+$_sp=&DWP(4*6,"esp");
+$_bpend=&DWP(4*7,"esp");
+$frame=32; # size of above frame rounded up to 16n
+
+ &xor ("eax","eax");
+ &mov ("edi",&wparam(5)); # int num
+ &cmp ("edi",4);
+ &jl (&label("just_leave"));
+
+ &lea ("esi",&wparam(0)); # put aside pointer to argument block
+ &lea ("edx",&wparam(1)); # load ap
+ &mov ("ebp","esp"); # saved stack pointer!
+ &add ("edi",2); # extra two words on top of tp
+ &neg ("edi");
+ &lea ("esp",&DWP(-$frame,"esp","edi",4)); # alloca($frame+4*(num+2))
+ &neg ("edi");
+
+ # minimize cache contention by arranging a 2K window between stack
+ # pointer and ap argument [np is also a position-sensitive vector,
+ # but it's assumed to be near ap, as it's allocated at ~same
+ # time].
+ &mov ("eax","esp");
+ &sub ("eax","edx");
+ &and ("eax",2047);
+ &sub ("esp","eax"); # this aligns sp and ap modulo 2048
+
+ &xor ("edx","esp");
+ &and ("edx",2048);
+ &xor ("edx",2048);
+ &sub ("esp","edx"); # this splits them apart modulo 4096
+
+ &and ("esp",-64); # align to cache line
+
+ ################################# load argument block...
+ &mov ("eax",&DWP(0*4,"esi"));# BN_ULONG *rp
+ &mov ("ebx",&DWP(1*4,"esi"));# const BN_ULONG *ap
+ &mov ("ecx",&DWP(2*4,"esi"));# const BN_ULONG *bp
+ &mov ("edx",&DWP(3*4,"esi"));# const BN_ULONG *np
+ &mov ("esi",&DWP(4*4,"esi"));# const BN_ULONG *n0
+ #&mov ("edi",&DWP(5*4,"esi"));# int num
+
+ &mov ("esi",&DWP(0,"esi")); # pull n0[0]
+ &mov ($_rp,"eax"); # ... save a copy of argument block
+ &mov ($_ap,"ebx");
+ &mov ($_bp,"ecx");
+ &mov ($_np,"edx");
+ &mov ($_n0,"esi");
+ &lea ($num,&DWP(-3,"edi")); # num=num-1 to assist modulo-scheduling
+ #&mov ($_num,$num); # redundant as $num is not reused
+ &mov ($_sp,"ebp"); # saved stack pointer!
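+
+# The SSE2 path below keeps everything in 64-bit MMX registers: pmuludq
+# forms 32x32->64 products, low halves are extracted with pand [$mask
+# is 0x00000000ffffffff] and carries propagate as psrlq-by-32 shifts,
+# so the flags register is never involved. In C terms one step of the
+# first inner loop is roughly [illustrative sketch only]:
+#
+#     c0 += (uint64_t)ap[j] * bp0;   /* pmuludq + paddq */
+#     c1 += (uint64_t)np[j] * m1 + (uint32_t)c0;
+#     tp[j-1] = (uint32_t)c1;        /* movd            */
+#     c0 >>= 32; c1 >>= 32;          /* psrlq 32        */
+#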
+
+if($sse2) {
+$acc0="mm0";				# mmx register bank layout
+$acc1="mm1";
+$car0="mm2";
+$car1="mm3";
+$mul0="mm4";
+$mul1="mm5";
+$temp="mm6";
+$mask="mm7";
+
+	&picmeup("eax","OPENSSL_ia32cap_P");
+	&bt	(&DWP(0,"eax"),26);
+	&jnc	(&label("non_sse2"));
+
+	&mov	("eax",-1);
+	&movd	($mask,"eax");		# mask 32 lower bits
+
+	&mov	($ap,$_ap);		# load input pointers
+	&mov	($bp,$_bp);
+	&mov	($np,$_np);
+
+	&xor	($i,$i);		# i=0
+	&xor	($j,$j);		# j=0
+
+	&movd	($mul0,&DWP(0,$bp));	# bp[0]
+	&movd	($mul1,&DWP(0,$ap));	# ap[0]
+	&movd	($car1,&DWP(0,$np));	# np[0]
+
+	&pmuludq($mul1,$mul0);		# ap[0]*bp[0]
+	&movq	($car0,$mul1);
+	&movq	($acc0,$mul1);		# I wish movd worked for
+	&pand	($acc0,$mask);		# inter-register transfers
+
+	&pmuludq($mul1,$_n0q);		# *=n0
+
+	&pmuludq($car1,$mul1);		# "t[0]"*np[0]*n0
+	&paddq	($car1,$acc0);
+
+	&movd	($acc1,&DWP(4,$np));	# np[1]
+	&movd	($acc0,&DWP(4,$ap));	# ap[1]
+
+	&psrlq	($car0,32);
+	&psrlq	($car1,32);
+
+	&inc	($j);			# j++
+&set_label("1st",16);
+	&pmuludq($acc0,$mul0);		# ap[j]*bp[0]
+	&pmuludq($acc1,$mul1);		# np[j]*m1
+	&paddq	($car0,$acc0);		# +=c0
+	&paddq	($car1,$acc1);		# +=c1
+
+	&movq	($acc0,$car0);
+	&pand	($acc0,$mask);
+	&movd	($acc1,&DWP(4,$np,$j,4));	# np[j+1]
+	&paddq	($car1,$acc0);		# +=ap[j]*bp[0];
+	&movd	($acc0,&DWP(4,$ap,$j,4));	# ap[j+1]
+	&psrlq	($car0,32);
+	&movd	(&DWP($frame-4,"esp",$j,4),$car1);	# tp[j-1]=
+	&psrlq	($car1,32);
+
+	&lea	($j,&DWP(1,$j));
+	&cmp	($j,$num);
+	&jl	(&label("1st"));
+
+	&pmuludq($acc0,$mul0);		# ap[num-1]*bp[0]
+	&pmuludq($acc1,$mul1);		# np[num-1]*m1
+	&paddq	($car0,$acc0);		# +=c0
+	&paddq	($car1,$acc1);		# +=c1
+
+	&movq	($acc0,$car0);
+	&pand	($acc0,$mask);
+	&paddq	($car1,$acc0);		# +=ap[num-1]*bp[0];
+	&movd	(&DWP($frame-4,"esp",$j,4),$car1);	# tp[num-2]=
+
+	&psrlq	($car0,32);
+	&psrlq	($car1,32);
+
+	&paddq	($car1,$car0);
+	&movq	(&QWP($frame,"esp",$num,4),$car1);	# tp[num].tp[num-1]
+
+	&inc	($i);			# i++
+&set_label("outer");
+	&xor	($j,$j);		# j=0
+
+	&movd	($mul0,&DWP(0,$bp,$i,4));	# bp[i]
+	&movd	($mul1,&DWP(0,$ap));	# ap[0]
+	&movd	($temp,&DWP($frame,"esp"));	# tp[0]
+	&movd	($car1,&DWP(0,$np));	# np[0]
+	&pmuludq($mul1,$mul0);		# ap[0]*bp[i]
+
+	&paddq	($mul1,$temp);		# +=tp[0]
+	&movq	($acc0,$mul1);
+	&movq	($car0,$mul1);
+	&pand	($acc0,$mask);
+
+	&pmuludq($mul1,$_n0q);		# *=n0
+
+	&pmuludq($car1,$mul1);
+	&paddq	($car1,$acc0);
+
+	&movd	($temp,&DWP($frame+4,"esp"));	# tp[1]
+	&movd	($acc1,&DWP(4,$np));	# np[1]
+	&movd	($acc0,&DWP(4,$ap));	# ap[1]
+
+	&psrlq	($car0,32);
+	&psrlq	($car1,32);
+	&paddq	($car0,$temp);		# +=tp[1]
+
+	&inc	($j);			# j++
+	&dec	($num);
+&set_label("inner");
+	&pmuludq($acc0,$mul0);		# ap[j]*bp[i]
+	&pmuludq($acc1,$mul1);		# np[j]*m1
+	&paddq	($car0,$acc0);		# +=c0
+	&paddq	($car1,$acc1);		# +=c1
+
+	&movq	($acc0,$car0);
+	&movd	($temp,&DWP($frame+4,"esp",$j,4));	# tp[j+1]
+	&pand	($acc0,$mask);
+	&movd	($acc1,&DWP(4,$np,$j,4));	# np[j+1]
+	&paddq	($car1,$acc0);		# +=ap[j]*bp[i]+tp[j]
+	&movd	($acc0,&DWP(4,$ap,$j,4));	# ap[j+1]
+	&psrlq	($car0,32);
+	&movd	(&DWP($frame-4,"esp",$j,4),$car1);	# tp[j-1]=
+	&psrlq	($car1,32);
+	&paddq	($car0,$temp);		# +=tp[j+1]
+
+	&dec	($num);
+	&lea	($j,&DWP(1,$j));	# j++
+	&jnz	(&label("inner"));
+
+	&mov	($num,$j);
+	&pmuludq($acc0,$mul0);		# ap[num-1]*bp[i]
+	&pmuludq($acc1,$mul1);		# np[num-1]*m1
+	&paddq	($car0,$acc0);		# +=c0
+	&paddq	($car1,$acc1);		# +=c1
+
+	&movq	($acc0,$car0);
+	&pand	($acc0,$mask);
+	&paddq	($car1,$acc0);		# +=ap[num-1]*bp[i]+tp[num-1]
+	&movd	(&DWP($frame-4,"esp",$j,4),$car1);	# tp[num-2]=
+	&psrlq	($car0,32);
+	&psrlq	($car1,32);
+
+	&movd	($temp,&DWP($frame+4,"esp",$num,4));	# += tp[num]
+	&paddq	($car1,$car0);
+	&paddq	($car1,$temp);
+	&movq	(&QWP($frame,"esp",$num,4),$car1);	# tp[num].tp[num-1]
+
+	&lea	($i,&DWP(1,$i));	# i++
+	&cmp	($i,$num);
+	&jle	(&label("outer"));
+
+	&emms	();			# done with mmx bank
+	&jmp	(&label("common_tail"));
+
+&set_label("non_sse2",16);
+}
+
+if (0) {
+	&mov	("esp",$_sp);
+	&xor	("eax","eax");		# signal "not fast enough [yet]"
+	&jmp	(&label("just_leave"));
+	# While the below code provides competitive performance for
+	# all key lengths on modern Intel cores, it's still more
+	# than 10% slower for 4096-bit keys elsewhere:-( "Competitive"
+	# means compared to the original integer-only assembler.
+	# 512-bit RSA sign is better by ~40%, but that's about all
+	# one can say about all CPUs...
+} else {
+$inp="esi";				# integer path uses these registers differently
+$word="edi";
+$carry="ebp";
+
+	&mov	($inp,$_ap);
+	&lea	($carry,&DWP(1,$num));
+	&mov	($word,$_bp);
+	&xor	($j,$j);		# j=0
+	&mov	("edx",$inp);
+	&and	($carry,1);		# see if num is even
+	&sub	("edx",$word);		# see if ap==bp
+	&lea	("eax",&DWP(4,$word,$num,4));	# &bp[num]
+	&or	($carry,"edx");
+	&mov	($word,&DWP(0,$word));	# bp[0]
+	&jz	(&label("bn_sqr_mont"));
+	&mov	($_bpend,"eax");
+	&mov	("eax",&DWP(0,$inp));
+	&xor	("edx","edx");
+
+&set_label("mull",16);
+	&mov	($carry,"edx");
+	&mul	($word);		# ap[j]*bp[0]
+	&add	($carry,"eax");
+	&lea	($j,&DWP(1,$j));
+	&adc	("edx",0);
+	&mov	("eax",&DWP(0,$inp,$j,4));	# ap[j+1]
+	&cmp	($j,$num);
+	&mov	(&DWP($frame-4,"esp",$j,4),$carry);	# tp[j]=
+	&jl	(&label("mull"));
+
+	&mov	($carry,"edx");
+	&mul	($word);		# ap[num-1]*bp[0]
+	&mov	($word,$_n0);
+	&add	("eax",$carry);
+	&mov	($inp,$_np);
+	&adc	("edx",0);
+	&imul	($word,&DWP($frame,"esp"));	# n0*tp[0]
+
+	&mov	(&DWP($frame,"esp",$num,4),"eax");	# tp[num-1]=
+	&xor	($j,$j);
+	&mov	(&DWP($frame+4,"esp",$num,4),"edx");	# tp[num]=
+	&mov	(&DWP($frame+8,"esp",$num,4),$j);	# tp[num+1]=
+
+	&mov	("eax",&DWP(0,$inp));	# np[0]
+	&mul	($word);		# np[0]*m
+	&add	("eax",&DWP($frame,"esp"));	# +=tp[0]
+	&mov	("eax",&DWP(4,$inp));	# np[1]
+	&adc	("edx",0);
+	&inc	($j);
+
+	&jmp	(&label("2ndmadd"));
+
+&set_label("1stmadd",16);
+	&mov	($carry,"edx");
+	&mul	($word);		# ap[j]*bp[i]
+	&add	($carry,&DWP($frame,"esp",$j,4));	# +=tp[j]
+	&lea	($j,&DWP(1,$j));
+	&adc	("edx",0);
+	&add	($carry,"eax");
+	&mov	("eax",&DWP(0,$inp,$j,4));	# ap[j+1]
+	&adc	("edx",0);
+	&cmp	($j,$num);
+	&mov	(&DWP($frame-4,"esp",$j,4),$carry);	# tp[j]=
+	&jl	(&label("1stmadd"));
+
+	&mov	($carry,"edx");
+	&mul	($word);		# ap[num-1]*bp[i]
+	&add	("eax",&DWP($frame,"esp",$num,4));	# +=tp[num-1]
+	&mov	($word,$_n0);
+	&adc	("edx",0);
+	&mov	($inp,$_np);
+	&add	($carry,"eax");
+	&adc	("edx",0);
+	&imul	($word,&DWP($frame,"esp"));	# n0*tp[0]
+
+	&xor	($j,$j);
+	&add	("edx",&DWP($frame+4,"esp",$num,4));	# carry+=tp[num]
+	&mov	(&DWP($frame,"esp",$num,4),$carry);	# tp[num-1]=
+	&adc	($j,0);
+	&mov	("eax",&DWP(0,$inp));	# np[0]
+	&mov	(&DWP($frame+4,"esp",$num,4),"edx");	# tp[num]=
+	&mov	(&DWP($frame+8,"esp",$num,4),$j);	# tp[num+1]=
+
+	&mul	($word);		# np[0]*m
+	&add	("eax",&DWP($frame,"esp"));	# +=tp[0]
+	&mov	("eax",&DWP(4,$inp));	# np[1]
+	&adc	("edx",0);
+	&mov	($j,1);
+
+&set_label("2ndmadd",16);
+	&mov	($carry,"edx");
+	&mul	($word);		# np[j]*m
+	&add	($carry,&DWP($frame,"esp",$j,4));	# +=tp[j]
+	&lea	($j,&DWP(1,$j));
+	&adc	("edx",0);
+	&add	($carry,"eax");
+	&mov	("eax",&DWP(0,$inp,$j,4));	# np[j+1]
+	&adc	("edx",0);
+	&cmp	($j,$num);
+	&mov	(&DWP($frame-8,"esp",$j,4),$carry);	# tp[j-1]=
+	&jl	(&label("2ndmadd"));
+
+	&mov	($carry,"edx");
+	&mul	($word);		# np[j]*m
+	&add	($carry,&DWP($frame,"esp",$num,4));	# +=tp[num-1]
+	&adc	("edx",0);
+	&add	($carry,"eax");
+	&adc	("edx",0);
+	&mov	(&DWP($frame-4,"esp",$num,4),$carry);	# tp[num-2]=
+
+	&xor	("eax","eax");
+	&mov	($j,$_bp);		# &bp[i]
+	&add	("edx",&DWP($frame+4,"esp",$num,4));	# carry+=tp[num]
+	&adc	("eax",&DWP($frame+8,"esp",$num,4));	# +=tp[num+1]
+	&lea	($j,&DWP(4,$j));
+	&mov	(&DWP($frame,"esp",$num,4),"edx");	# tp[num-1]=
+	&cmp	($j,$_bpend);
+	&mov	(&DWP($frame+4,"esp",$num,4),"eax");	# tp[num]=
+	&je	(&label("common_tail"));
+
+	&mov	($word,&DWP(0,$j));	# bp[i+1]
+	&mov	($inp,$_ap);
+	&mov	($_bp,$j);		# &bp[++i]
+	&xor	($j,$j);
+	&xor	("edx","edx");
+	&mov	("eax",&DWP(0,$inp));
+	&jmp	(&label("1stmadd"));
+
+&set_label("bn_sqr_mont",16);
+$sbit=$num;
+	&mov	($_num,$num);
+	&mov	($_bp,$j);		# i=0
+
+	&mov	("eax",$word);		# ap[0]
+	&mul	($word);		# ap[0]*ap[0]
+	&mov	(&DWP($frame,"esp"),"eax");	# tp[0]=
+	&mov	($sbit,"edx");
+	&shr	("edx",1);
+	&and	($sbit,1);
+	&inc	($j);
+&set_label("sqr",16);
+	&mov	("eax",&DWP(0,$inp,$j,4));	# ap[j]
+	&mov	($carry,"edx");
+	&mul	($word);		# ap[j]*ap[0]
+	&add	("eax",$carry);
+	&lea	($j,&DWP(1,$j));
+	&adc	("edx",0);
+	&lea	($carry,&DWP(0,$sbit,"eax",2));
+	&shr	("eax",31);
+	&cmp	($j,$_num);
+	&mov	($sbit,"eax");
+	&mov	(&DWP($frame-4,"esp",$j,4),$carry);	# tp[j]=
+	&jl	(&label("sqr"));
+
+	&mov	("eax",&DWP(0,$inp,$j,4));	# ap[num-1]
+	&mov	($carry,"edx");
+	&mul	($word);		# ap[num-1]*ap[0]
+	&add	("eax",$carry);
+	&mov	($word,$_n0);
+	&adc	("edx",0);
+	&mov	($inp,$_np);
+	&lea	($carry,&DWP(0,$sbit,"eax",2));
+	&imul	($word,&DWP($frame,"esp"));	# n0*tp[0]
+	&shr	("eax",31);
+	&mov	(&DWP($frame,"esp",$j,4),$carry);	# tp[num-1]=
+
+	&lea	($carry,&DWP(0,"eax","edx",2));
+	&mov	("eax",&DWP(0,$inp));	# np[0]
+	&shr	("edx",31);
+	&mov	(&DWP($frame+4,"esp",$j,4),$carry);	# tp[num]=
+	&mov	(&DWP($frame+8,"esp",$j,4),"edx");	# tp[num+1]=
+
+	&mul	($word);		# np[0]*m
+	&add	("eax",&DWP($frame,"esp"));	# +=tp[0]
+	&mov	($num,$j);
+	&adc	("edx",0);
+	&mov	("eax",&DWP(4,$inp));	# np[1]
+	&mov	($j,1);
+
+&set_label("3rdmadd",16);
+	&mov	($carry,"edx");
+	&mul	($word);		# np[j]*m
+	&add	($carry,&DWP($frame,"esp",$j,4));	# +=tp[j]
+	&adc	("edx",0);
+	&add	($carry,"eax");
+	&mov	("eax",&DWP(4,$inp,$j,4));	# np[j+1]
+	&adc	("edx",0);
+	&mov	(&DWP($frame-4,"esp",$j,4),$carry);	# tp[j-1]=
+
+	&mov	($carry,"edx");
+	&mul	($word);		# np[j+1]*m
+	&add	($carry,&DWP($frame+4,"esp",$j,4));	# +=tp[j+1]
+	&lea	($j,&DWP(2,$j));
+	&adc	("edx",0);
+	&add	($carry,"eax");
+	&mov	("eax",&DWP(0,$inp,$j,4));	# np[j+2]
+	&adc	("edx",0);
+	&cmp	($j,$num);
+	&mov	(&DWP($frame-8,"esp",$j,4),$carry);	# tp[j]=
+	&jl	(&label("3rdmadd"));
+
+	&mov	($carry,"edx");
+	&mul	($word);		# np[j]*m
+	&add	($carry,&DWP($frame,"esp",$num,4));	# +=tp[num-1]
+	&adc	("edx",0);
+	&add	($carry,"eax");
+	&adc	("edx",0);
+	&mov	(&DWP($frame-4,"esp",$num,4),$carry);	# tp[num-2]=
+
+	&mov	($j,$_bp);		# i
+	&xor	("eax","eax");
+	&mov	($inp,$_ap);
+	&add	("edx",&DWP($frame+4,"esp",$num,4));	# carry+=tp[num]
+	&adc	("eax",&DWP($frame+8,"esp",$num,4));	# +=tp[num+1]
+	&mov	(&DWP($frame,"esp",$num,4),"edx");	# tp[num-1]=
+	&cmp	($j,$num);
+	&mov	(&DWP($frame+4,"esp",$num,4),"eax");	# tp[num]=
+	&je	(&label("common_tail"));
+
+	&mov	($word,&DWP(4,$inp,$j,4));	# ap[i]
+	&lea	($j,&DWP(1,$j));
+	&mov	("eax",$word);
+	&mov	($_bp,$j);		# ++i
+	&mul	($word);		# ap[i]*ap[i]
+	&add	("eax",&DWP($frame,"esp",$j,4));	# +=tp[i]
+	&adc	("edx",0);
+	&mov	(&DWP($frame,"esp",$j,4),"eax");	# tp[i]=
+	&xor	($carry,$carry);
+	&cmp	($j,$num);
+	&lea	($j,&DWP(1,$j));
+	&je	(&label("sqrlast"));
+
+	&mov	($sbit,"edx");		# zaps $num
+	&shr	("edx",1);
+	&and	($sbit,1);
+&set_label("sqradd",16);
+	&mov	("eax",&DWP(0,$inp,$j,4));	# ap[j]
+	&mov	($carry,"edx");
+	&mul	($word);		# ap[j]*ap[i]
+	&add	("eax",$carry);
+	&lea	($carry,&DWP(0,"eax","eax"));
+	&adc	("edx",0);
+	&shr	("eax",31);
+	&add	($carry,&DWP($frame,"esp",$j,4));	# +=tp[j]
+	&lea	($j,&DWP(1,$j));
+	&adc	("eax",0);
+	&add	($carry,$sbit);
+	&adc	("eax",0);
+	&cmp	($j,$_num);
+	&mov	(&DWP($frame-4,"esp",$j,4),$carry);	# tp[j]=
+	&mov	($sbit,"eax");
+	&jle	(&label("sqradd"));
+
+	&mov	($carry,"edx");
+	&lea	("edx",&DWP(0,$sbit,"edx",2));
+	&shr	($carry,31);
+&set_label("sqrlast");
+	&mov	($word,$_n0);
+	&mov	($inp,$_np);
+	&imul	($word,&DWP($frame,"esp"));	# n0*tp[0]
+
+	&add	("edx",&DWP($frame,"esp",$j,4));	# +=tp[num]
+	&mov	("eax",&DWP(0,$inp));	# np[0]
+	&adc	($carry,0);
+	&mov	(&DWP($frame,"esp",$j,4),"edx");	# tp[num]=
+	&mov	(&DWP($frame+4,"esp",$j,4),$carry);	# tp[num+1]=
+
+	&mul	($word);		# np[0]*m
+	&add	("eax",&DWP($frame,"esp"));	# +=tp[0]
+	&lea	($num,&DWP(-1,$j));
+	&adc	("edx",0);
+	&mov	($j,1);
+	&mov	("eax",&DWP(4,$inp));	# np[1]
+
+	&jmp	(&label("3rdmadd"));
+}
+
+&set_label("common_tail",16);
+	&mov	($np,$_np);		# load modulus pointer
+	&mov	($rp,$_rp);		# load result pointer
+	&lea	($tp,&DWP($frame,"esp"));	# [$ap and $bp are zapped]
+
+	&mov	("eax",&DWP(0,$tp));	# tp[0]
+	&mov	($j,$num);		# j=num-1
+	&xor	($i,$i);		# i=0 and clear CF!
+
+&set_label("sub",16);
+	&sbb	("eax",&DWP(0,$np,$i,4));
+	&mov	(&DWP(0,$rp,$i,4),"eax");	# rp[i]=tp[i]-np[i]
+	&dec	($j);			# doesn't affect CF!
+	&mov	("eax",&DWP(4,$tp,$i,4));	# tp[i+1]
+	&lea	($i,&DWP(1,$i));	# i++
+	&jge	(&label("sub"));
+
+	&sbb	("eax",0);		# handle upmost overflow bit
+	&and	($tp,"eax");
+	&not	("eax");
+	&mov	($np,$rp);
+	&and	($np,"eax");
+	&or	($tp,$np);		# tp=carry?tp:rp
+
+&set_label("copy",16);			# copy or in-place refresh
+	&mov	("eax",&DWP(0,$tp,$num,4));
+	&mov	(&DWP(0,$rp,$num,4),"eax");	# rp[i]=tp[i]
+	&mov	(&DWP($frame,"esp",$num,4),$j);	# zap temporary vector
+	&dec	($num);
+	&jge	(&label("copy"));
+
+	&mov	("esp",$_sp);		# pull saved stack pointer
+	&mov	("eax",1);
+&set_label("just_leave");
+&function_end("bn_mul_mont");
+
+&asciz("Montgomery Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>");
+
+&asm_finish();
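[Editor's note] For readers new to the algorithm, the following stand-alone Perl sketch spells out what bn_mul_mont computes and what common_tail does: a word-serial Montgomery product followed by a borrow-controlled choice between tp and tp-np. It is an editor's addition, not part of the imported sources; the helper name, word size and test values are made up. It uses 16-bit words so every intermediate fits exactly in a native Perl integer, whereas the module works on 32-bit words, and it accepts any num >= 1 while the assembler insists on num >= 4.

    #!/usr/bin/env perl
    # Editor's sketch of bn_mul_mont's contract: rp = ap*bp*2^(-W*num) mod np.
    use strict; use warnings;

    my $W = 16; my $MASK = (1 << $W) - 1;   # toy word size; the module uses W=32

    sub mont_mul {                          # hypothetical reference helper
        my ($ap, $bp, $np, $n0, $num) = @_; # little-endian word arrays
        my @tp = (0) x ($num + 1);
        for my $i (0 .. $num - 1) {             # "outer": fold in one word of bp
            my $m = (($tp[0] + $ap->[0] * $bp->[$i]) * $n0) & $MASK;
            my $c = 0;
            for my $j (0 .. $num - 1) {         # "1st"/"inner" loops, fused
                my $t = $tp[$j] + $ap->[$j] * $bp->[$i] + $np->[$j] * $m + $c;
                $c = $t >> $W;                  # low word at j==0 is 0 by the
                $tp[$j - 1] = $t & $MASK if $j; # choice of m, so it is dropped
            }
            my $t = $tp[$num] + $c;
            $tp[$num - 1] = $t & $MASK;
            $tp[$num] = $t >> $W;               # the "upmost overflow bit"
        }
        my ($b, @rp) = (0);                     # common_tail: rp = tp - np
        for my $j (0 .. $num - 1) {
            my $t = $tp[$j] - $np->[$j] - $b;
            $b = $t < 0 ? 1 : 0;
            push @rp, $t & $MASK;
        }
        # keep tp when the subtraction borrows past the top word, else rp
        return $tp[$num] >= $b ? \@rp : [@tp[0 .. $num - 1]];
    }

    # n0 = -np[0]^(-1) mod 2^W by Newton iteration (np[0] must be odd):
    my @np = (0xfb15, 0x9a03);                  # made-up odd 2-word modulus
    my $inv = 1;
    $inv = ($inv * (2 - $np[0] * $inv)) & $MASK for 1 .. 5;
    my $n0 = (-$inv) & $MASK;

    my $r = mont_mul([3, 0], [5, 0], \@np, $n0, 2); # 15*2^(-32) mod np
    printf "0x%04x%04x\n", $r->[1], $r->[0];

The same reduction underlies all three paths above: the SSE2 path keeps the two running carries in mm registers, the integer path splits the fused inner loop into the "1stmadd"/"2ndmadd" pair, and "bn_sqr_mont" additionally doubles the cross products when ap == bp.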