diff options
author | simon <simon@FreeBSD.org> | 2010-03-13 19:22:41 +0000 |
---|---|---|
committer | simon <simon@FreeBSD.org> | 2010-03-13 19:22:41 +0000 |
commit | 0d816bbd980d8201a2ad23ccd05f7bde16565282 (patch) | |
tree | 909a7c21b7df72ae8c08b80b468a4dd75b3820be /crypto/openssl/crypto/bn/asm | |
parent | 7fd3bd147ec574621124307eca10ead5353e34ba (diff) | |
parent | cdb6eef1f013e22a10ab5f5829dcdc3b5e32d385 (diff) | |
download | FreeBSD-src-0d816bbd980d8201a2ad23ccd05f7bde16565282.zip FreeBSD-src-0d816bbd980d8201a2ad23ccd05f7bde16565282.tar.gz |
Merge OpenSSL 0.9.8m into head.
This also "reverts" some FreeBSD local changes so we should now
be back to using entirely stock OpenSSL. The local changes were
simple $FreeBSD$ lines additions, which were required in the CVS
days, and the patch for FreeBSD-SA-09:15.ssl which has been
superseded with OpenSSL 0.9.8m's RFC5746 'TLS renegotiation
extension' support.
MFC after: 3 weeks
Diffstat (limited to 'crypto/openssl/crypto/bn/asm')
-rwxr-xr-x | crypto/openssl/crypto/bn/asm/alpha-mont.pl | 317 | ||||
-rwxr-xr-x | crypto/openssl/crypto/bn/asm/armv4-mont.pl | 200 | ||||
-rwxr-xr-x | crypto/openssl/crypto/bn/asm/mips3-mont.pl | 327 | ||||
-rwxr-xr-x | crypto/openssl/crypto/bn/asm/ppc-mont.pl | 323 | ||||
-rwxr-xr-x | crypto/openssl/crypto/bn/asm/ppc64-mont.pl | 918 | ||||
-rwxr-xr-x | crypto/openssl/crypto/bn/asm/s390x-mont.pl | 225 | ||||
-rwxr-xr-x | crypto/openssl/crypto/bn/asm/s390x.S | 678 | ||||
-rwxr-xr-x | crypto/openssl/crypto/bn/asm/sparcv9-mont.pl | 606 | ||||
-rwxr-xr-x | crypto/openssl/crypto/bn/asm/sparcv9a-mont.pl | 882 | ||||
-rwxr-xr-x | crypto/openssl/crypto/bn/asm/via-mont.pl | 242 | ||||
-rwxr-xr-x | crypto/openssl/crypto/bn/asm/x86-mont.pl | 591 | ||||
-rw-r--r-- | crypto/openssl/crypto/bn/asm/x86_64-gcc.c | 18 |
12 files changed, 11 insertions, 5316 deletions
diff --git a/crypto/openssl/crypto/bn/asm/alpha-mont.pl b/crypto/openssl/crypto/bn/asm/alpha-mont.pl deleted file mode 100755 index 7a2cc31..0000000 --- a/crypto/openssl/crypto/bn/asm/alpha-mont.pl +++ /dev/null @@ -1,317 +0,0 @@ -#!/usr/bin/env perl -# -# ==================================================================== -# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL -# project. The module is, however, dual licensed under OpenSSL and -# CRYPTOGAMS licenses depending on where you obtain it. For further -# details see http://www.openssl.org/~appro/cryptogams/. -# ==================================================================== -# -# On 21264 RSA sign performance improves by 70/35/20/15 percent for -# 512/1024/2048/4096 bit key lengths. This is against vendor compiler -# instructed to '-tune host' code with in-line assembler. Other -# benchmarks improve by 15-20%. To anchor it to something else, the -# code provides approximately the same performance per GHz as AMD64. -# I.e. if you compare 1GHz 21264 and 2GHz Opteron, you'll observe ~2x -# difference. - -# int bn_mul_mont( -$rp="a0"; # BN_ULONG *rp, -$ap="a1"; # const BN_ULONG *ap, -$bp="a2"; # const BN_ULONG *bp, -$np="a3"; # const BN_ULONG *np, -$n0="a4"; # const BN_ULONG *n0, -$num="a5"; # int num); - -$lo0="t0"; -$hi0="t1"; -$lo1="t2"; -$hi1="t3"; -$aj="t4"; -$bi="t5"; -$nj="t6"; -$tp="t7"; -$alo="t8"; -$ahi="t9"; -$nlo="t10"; -$nhi="t11"; -$tj="t12"; -$i="s3"; -$j="s4"; -$m1="s5"; - -$code=<<___; -#include <asm.h> -#include <regdef.h> - -.text - -.set noat -.set noreorder - -.globl bn_mul_mont -.align 5 -.ent bn_mul_mont -bn_mul_mont: - lda sp,-40(sp) - stq ra,0(sp) - stq s3,8(sp) - stq s4,16(sp) - stq s5,24(sp) - stq fp,32(sp) - mov sp,fp - .mask 0x0400f000,-40 - .frame fp,40,ra - .prologue 0 - - .align 4 - .set reorder - sextl $num,$num - mov 0,v0 - cmplt $num,4,AT - bne AT,.Lexit - - ldq $hi0,0($ap) # ap[0] - s8addq $num,16,AT - ldq $aj,8($ap) - subq sp,AT,sp - ldq $bi,0($bp) # bp[0] - mov -4096,AT - ldq $n0,0($n0) - and sp,AT,sp - - mulq $hi0,$bi,$lo0 - ldq $hi1,0($np) # np[0] - umulh $hi0,$bi,$hi0 - ldq $nj,8($np) - - mulq $lo0,$n0,$m1 - - mulq $hi1,$m1,$lo1 - umulh $hi1,$m1,$hi1 - - addq $lo1,$lo0,$lo1 - cmpult $lo1,$lo0,AT - addq $hi1,AT,$hi1 - - mulq $aj,$bi,$alo - mov 2,$j - umulh $aj,$bi,$ahi - mov sp,$tp - - mulq $nj,$m1,$nlo - s8addq $j,$ap,$aj - umulh $nj,$m1,$nhi - s8addq $j,$np,$nj -.align 4 -.L1st: - .set noreorder - ldq $aj,($aj) - addl $j,1,$j - ldq $nj,($nj) - lda $tp,8($tp) - - addq $alo,$hi0,$lo0 - mulq $aj,$bi,$alo - cmpult $lo0,$hi0,AT - addq $nlo,$hi1,$lo1 - - mulq $nj,$m1,$nlo - addq $ahi,AT,$hi0 - cmpult $lo1,$hi1,v0 - cmplt $j,$num,$tj - - umulh $aj,$bi,$ahi - addq $nhi,v0,$hi1 - addq $lo1,$lo0,$lo1 - s8addq $j,$ap,$aj - - umulh $nj,$m1,$nhi - cmpult $lo1,$lo0,v0 - addq $hi1,v0,$hi1 - s8addq $j,$np,$nj - - stq $lo1,-8($tp) - nop - unop - bne $tj,.L1st - .set reorder - - addq $alo,$hi0,$lo0 - addq $nlo,$hi1,$lo1 - cmpult $lo0,$hi0,AT - cmpult $lo1,$hi1,v0 - addq $ahi,AT,$hi0 - addq $nhi,v0,$hi1 - - addq $lo1,$lo0,$lo1 - cmpult $lo1,$lo0,v0 - addq $hi1,v0,$hi1 - - stq $lo1,0($tp) - - addq $hi1,$hi0,$hi1 - cmpult $hi1,$hi0,AT - stq $hi1,8($tp) - stq AT,16($tp) - - mov 1,$i -.align 4 -.Louter: - s8addq $i,$bp,$bi - ldq $hi0,($ap) - ldq $aj,8($ap) - ldq $bi,($bi) - ldq $hi1,($np) - ldq $nj,8($np) - ldq $tj,(sp) - - mulq $hi0,$bi,$lo0 - umulh $hi0,$bi,$hi0 - - addq $lo0,$tj,$lo0 - cmpult $lo0,$tj,AT - addq $hi0,AT,$hi0 - - mulq $lo0,$n0,$m1 - - mulq $hi1,$m1,$lo1 - umulh $hi1,$m1,$hi1 - - addq $lo1,$lo0,$lo1 - cmpult $lo1,$lo0,AT - mov 2,$j - addq $hi1,AT,$hi1 - - mulq $aj,$bi,$alo - mov sp,$tp - umulh $aj,$bi,$ahi - - mulq $nj,$m1,$nlo - s8addq $j,$ap,$aj - umulh $nj,$m1,$nhi -.align 4 -.Linner: - .set noreorder - ldq $tj,8($tp) #L0 - nop #U1 - ldq $aj,($aj) #L1 - s8addq $j,$np,$nj #U0 - - ldq $nj,($nj) #L0 - nop #U1 - addq $alo,$hi0,$lo0 #L1 - lda $tp,8($tp) - - mulq $aj,$bi,$alo #U1 - cmpult $lo0,$hi0,AT #L0 - addq $nlo,$hi1,$lo1 #L1 - addl $j,1,$j - - mulq $nj,$m1,$nlo #U1 - addq $ahi,AT,$hi0 #L0 - addq $lo0,$tj,$lo0 #L1 - cmpult $lo1,$hi1,v0 #U0 - - umulh $aj,$bi,$ahi #U1 - cmpult $lo0,$tj,AT #L0 - addq $lo1,$lo0,$lo1 #L1 - addq $nhi,v0,$hi1 #U0 - - umulh $nj,$m1,$nhi #U1 - s8addq $j,$ap,$aj #L0 - cmpult $lo1,$lo0,v0 #L1 - cmplt $j,$num,$tj #U0 # borrow $tj - - addq $hi0,AT,$hi0 #L0 - addq $hi1,v0,$hi1 #U1 - stq $lo1,-8($tp) #L1 - bne $tj,.Linner #U0 - .set reorder - - ldq $tj,8($tp) - addq $alo,$hi0,$lo0 - addq $nlo,$hi1,$lo1 - cmpult $lo0,$hi0,AT - cmpult $lo1,$hi1,v0 - addq $ahi,AT,$hi0 - addq $nhi,v0,$hi1 - - addq $lo0,$tj,$lo0 - cmpult $lo0,$tj,AT - addq $hi0,AT,$hi0 - - ldq $tj,16($tp) - addq $lo1,$lo0,$j - cmpult $j,$lo0,v0 - addq $hi1,v0,$hi1 - - addq $hi1,$hi0,$lo1 - stq $j,($tp) - cmpult $lo1,$hi0,$hi1 - addq $lo1,$tj,$lo1 - cmpult $lo1,$tj,AT - addl $i,1,$i - addq $hi1,AT,$hi1 - stq $lo1,8($tp) - cmplt $i,$num,$tj # borrow $tj - stq $hi1,16($tp) - bne $tj,.Louter - - s8addq $num,sp,$tj # &tp[num] - mov $rp,$bp # put rp aside - mov sp,$tp - mov sp,$ap - mov 0,$hi0 # clear borrow bit - -.align 4 -.Lsub: ldq $lo0,($tp) - ldq $lo1,($np) - lda $tp,8($tp) - lda $np,8($np) - subq $lo0,$lo1,$lo1 # tp[i]-np[i] - cmpult $lo0,$lo1,AT - subq $lo1,$hi0,$lo0 - cmpult $lo1,$lo0,$hi0 - or $hi0,AT,$hi0 - stq $lo0,($rp) - cmpult $tp,$tj,v0 - lda $rp,8($rp) - bne v0,.Lsub - - subq $hi1,$hi0,$hi0 # handle upmost overflow bit - mov sp,$tp - mov $bp,$rp # restore rp - - and sp,$hi0,$ap - bic $bp,$hi0,$bp - bis $bp,$ap,$ap # ap=borrow?tp:rp - -.align 4 -.Lcopy: ldq $aj,($ap) # copy or in-place refresh - lda $tp,8($tp) - lda $rp,8($rp) - lda $ap,8($ap) - stq zero,-8($tp) # zap tp - cmpult $tp,$tj,AT - stq $aj,-8($rp) - bne AT,.Lcopy - mov 1,v0 - -.Lexit: - .set noreorder - mov fp,sp - /*ldq ra,0(sp)*/ - ldq s3,8(sp) - ldq s4,16(sp) - ldq s5,24(sp) - ldq fp,32(sp) - lda sp,40(sp) - ret (ra) -.end bn_mul_mont -.rdata -.asciiz "Montgomery Multiplication for Alpha, CRYPTOGAMS by <appro\@openssl.org>" -___ - -print $code; -close STDOUT; diff --git a/crypto/openssl/crypto/bn/asm/armv4-mont.pl b/crypto/openssl/crypto/bn/asm/armv4-mont.pl deleted file mode 100755 index 05d5dc1..0000000 --- a/crypto/openssl/crypto/bn/asm/armv4-mont.pl +++ /dev/null @@ -1,200 +0,0 @@ -#!/usr/bin/env perl - -# ==================================================================== -# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL -# project. The module is, however, dual licensed under OpenSSL and -# CRYPTOGAMS licenses depending on where you obtain it. For further -# details see http://www.openssl.org/~appro/cryptogams/. -# ==================================================================== - -# January 2007. - -# Montgomery multiplication for ARMv4. -# -# Performance improvement naturally varies among CPU implementations -# and compilers. The code was observed to provide +65-35% improvement -# [depending on key length, less for longer keys] on ARM920T, and -# +115-80% on Intel IXP425. This is compared to pre-bn_mul_mont code -# base and compiler generated code with in-lined umull and even umlal -# instructions. The latter means that this code didn't really have an -# "advantage" of utilizing some "secret" instruction. -# -# The code is interoperable with Thumb ISA and is rather compact, less -# than 1/2KB. Windows CE port would be trivial, as it's exclusively -# about decorations, ABI and instruction syntax are identical. - -$num="r0"; # starts as num argument, but holds &tp[num-1] -$ap="r1"; -$bp="r2"; $bi="r2"; $rp="r2"; -$np="r3"; -$tp="r4"; -$aj="r5"; -$nj="r6"; -$tj="r7"; -$n0="r8"; -########### # r9 is reserved by ELF as platform specific, e.g. TLS pointer -$alo="r10"; # sl, gcc uses it to keep @GOT -$ahi="r11"; # fp -$nlo="r12"; # ip -########### # r13 is stack pointer -$nhi="r14"; # lr -########### # r15 is program counter - -#### argument block layout relative to &tp[num-1], a.k.a. $num -$_rp="$num,#12*4"; -# ap permanently resides in r1 -$_bp="$num,#13*4"; -# np permanently resides in r3 -$_n0="$num,#14*4"; -$_num="$num,#15*4"; $_bpend=$_num; - -$code=<<___; -.text - -.global bn_mul_mont -.type bn_mul_mont,%function - -.align 2 -bn_mul_mont: - stmdb sp!,{r0,r2} @ sp points at argument block - ldr $num,[sp,#3*4] @ load num - cmp $num,#2 - movlt r0,#0 - addlt sp,sp,#2*4 - blt .Labrt - - stmdb sp!,{r4-r12,lr} @ save 10 registers - - mov $num,$num,lsl#2 @ rescale $num for byte count - sub sp,sp,$num @ alloca(4*num) - sub sp,sp,#4 @ +extra dword - sub $num,$num,#4 @ "num=num-1" - add $tp,$bp,$num @ &bp[num-1] - - add $num,sp,$num @ $num to point at &tp[num-1] - ldr $n0,[$_n0] @ &n0 - ldr $bi,[$bp] @ bp[0] - ldr $aj,[$ap],#4 @ ap[0],ap++ - ldr $nj,[$np],#4 @ np[0],np++ - ldr $n0,[$n0] @ *n0 - str $tp,[$_bpend] @ save &bp[num] - - umull $alo,$ahi,$aj,$bi @ ap[0]*bp[0] - str $n0,[$_n0] @ save n0 value - mul $n0,$alo,$n0 @ "tp[0]"*n0 - mov $nlo,#0 - umlal $alo,$nlo,$nj,$n0 @ np[0]*n0+"t[0]" - mov $tp,sp - -.L1st: - ldr $aj,[$ap],#4 @ ap[j],ap++ - mov $alo,$ahi - mov $ahi,#0 - umlal $alo,$ahi,$aj,$bi @ ap[j]*bp[0] - ldr $nj,[$np],#4 @ np[j],np++ - mov $nhi,#0 - umlal $nlo,$nhi,$nj,$n0 @ np[j]*n0 - adds $nlo,$nlo,$alo - str $nlo,[$tp],#4 @ tp[j-1]=,tp++ - adc $nlo,$nhi,#0 - cmp $tp,$num - bne .L1st - - adds $nlo,$nlo,$ahi - mov $nhi,#0 - adc $nhi,$nhi,#0 - ldr $tp,[$_bp] @ restore bp - str $nlo,[$num] @ tp[num-1]= - ldr $n0,[$_n0] @ restore n0 - str $nhi,[$num,#4] @ tp[num]= - -.Louter: - sub $tj,$num,sp @ "original" $num-1 value - sub $ap,$ap,$tj @ "rewind" ap to &ap[1] - sub $np,$np,$tj @ "rewind" np to &np[1] - ldr $bi,[$tp,#4]! @ *(++bp) - ldr $aj,[$ap,#-4] @ ap[0] - ldr $nj,[$np,#-4] @ np[0] - ldr $alo,[sp] @ tp[0] - ldr $tj,[sp,#4] @ tp[1] - - mov $ahi,#0 - umlal $alo,$ahi,$aj,$bi @ ap[0]*bp[i]+tp[0] - str $tp,[$_bp] @ save bp - mul $n0,$alo,$n0 - mov $nlo,#0 - umlal $alo,$nlo,$nj,$n0 @ np[0]*n0+"tp[0]" - mov $tp,sp - -.Linner: - ldr $aj,[$ap],#4 @ ap[j],ap++ - adds $alo,$ahi,$tj @ +=tp[j] - mov $ahi,#0 - umlal $alo,$ahi,$aj,$bi @ ap[j]*bp[i] - ldr $nj,[$np],#4 @ np[j],np++ - mov $nhi,#0 - umlal $nlo,$nhi,$nj,$n0 @ np[j]*n0 - ldr $tj,[$tp,#8] @ tp[j+1] - adc $ahi,$ahi,#0 - adds $nlo,$nlo,$alo - str $nlo,[$tp],#4 @ tp[j-1]=,tp++ - adc $nlo,$nhi,#0 - cmp $tp,$num - bne .Linner - - adds $nlo,$nlo,$ahi - mov $nhi,#0 - adc $nhi,$nhi,#0 - adds $nlo,$nlo,$tj - adc $nhi,$nhi,#0 - ldr $tp,[$_bp] @ restore bp - ldr $tj,[$_bpend] @ restore &bp[num] - str $nlo,[$num] @ tp[num-1]= - ldr $n0,[$_n0] @ restore n0 - str $nhi,[$num,#4] @ tp[num]= - - cmp $tp,$tj - bne .Louter - - ldr $rp,[$_rp] @ pull rp - add $num,$num,#4 @ $num to point at &tp[num] - sub $aj,$num,sp @ "original" num value - mov $tp,sp @ "rewind" $tp - mov $ap,$tp @ "borrow" $ap - sub $np,$np,$aj @ "rewind" $np to &np[0] - - subs $tj,$tj,$tj @ "clear" carry flag -.Lsub: ldr $tj,[$tp],#4 - ldr $nj,[$np],#4 - sbcs $tj,$tj,$nj @ tp[j]-np[j] - str $tj,[$rp],#4 @ rp[j]= - teq $tp,$num @ preserve carry - bne .Lsub - sbcs $nhi,$nhi,#0 @ upmost carry - mov $tp,sp @ "rewind" $tp - sub $rp,$rp,$aj @ "rewind" $rp - - and $ap,$tp,$nhi - bic $np,$rp,$nhi - orr $ap,$ap,$np @ ap=borrow?tp:rp - -.Lcopy: ldr $tj,[$ap],#4 @ copy or in-place refresh - str sp,[$tp],#4 @ zap tp - str $tj,[$rp],#4 - cmp $tp,$num - bne .Lcopy - - add sp,$num,#4 @ skip over tp[num+1] - ldmia sp!,{r4-r12,lr} @ restore registers - add sp,sp,#2*4 @ skip over {r0,r2} - mov r0,#1 -.Labrt: tst lr,#1 - moveq pc,lr @ be binary compatible with V4, yet - bx lr @ interoperable with Thumb ISA:-) -.size bn_mul_mont,.-bn_mul_mont -.asciz "Montgomery multiplication for ARMv4, CRYPTOGAMS by <appro\@openssl.org>" -___ - -$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4 -print $code; -close STDOUT; diff --git a/crypto/openssl/crypto/bn/asm/mips3-mont.pl b/crypto/openssl/crypto/bn/asm/mips3-mont.pl deleted file mode 100755 index 8f9156e..0000000 --- a/crypto/openssl/crypto/bn/asm/mips3-mont.pl +++ /dev/null @@ -1,327 +0,0 @@ -#!/usr/bin/env perl -# -# ==================================================================== -# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL -# project. The module is, however, dual licensed under OpenSSL and -# CRYPTOGAMS licenses depending on where you obtain it. For further -# details see http://www.openssl.org/~appro/cryptogams/. -# ==================================================================== - -# This module doesn't present direct interest for OpenSSL, because it -# doesn't provide better performance for longer keys. While 512-bit -# RSA private key operations are 40% faster, 1024-bit ones are hardly -# faster at all, while longer key operations are slower by up to 20%. -# It might be of interest to embedded system developers though, as -# it's smaller than 1KB, yet offers ~3x improvement over compiler -# generated code. -# -# The module targets N32 and N64 MIPS ABIs and currently is a bit -# IRIX-centric, i.e. is likely to require adaptation for other OSes. - -# int bn_mul_mont( -$rp="a0"; # BN_ULONG *rp, -$ap="a1"; # const BN_ULONG *ap, -$bp="a2"; # const BN_ULONG *bp, -$np="a3"; # const BN_ULONG *np, -$n0="a4"; # const BN_ULONG *n0, -$num="a5"; # int num); - -$lo0="a6"; -$hi0="a7"; -$lo1="v0"; -$hi1="v1"; -$aj="t0"; -$bi="t1"; -$nj="t2"; -$tp="t3"; -$alo="s0"; -$ahi="s1"; -$nlo="s2"; -$nhi="s3"; -$tj="s4"; -$i="s5"; -$j="s6"; -$fp="t8"; -$m1="t9"; - -$FRAME=8*(2+8); - -$code=<<___; -#include <asm.h> -#include <regdef.h> - -.text - -.set noat -.set reorder - -.align 5 -.globl bn_mul_mont -.ent bn_mul_mont -bn_mul_mont: - .set noreorder - PTR_SUB sp,64 - move $fp,sp - .frame $fp,64,ra - slt AT,$num,4 - li v0,0 - beqzl AT,.Lproceed - nop - jr ra - PTR_ADD sp,$fp,64 - .set reorder -.align 5 -.Lproceed: - ld $n0,0($n0) - ld $bi,0($bp) # bp[0] - ld $aj,0($ap) # ap[0] - ld $nj,0($np) # np[0] - PTR_SUB sp,16 # place for two extra words - sll $num,3 - li AT,-4096 - PTR_SUB sp,$num - and sp,AT - - sd s0,0($fp) - sd s1,8($fp) - sd s2,16($fp) - sd s3,24($fp) - sd s4,32($fp) - sd s5,40($fp) - sd s6,48($fp) - sd s7,56($fp) - - dmultu $aj,$bi - ld $alo,8($ap) - ld $nlo,8($np) - mflo $lo0 - mfhi $hi0 - dmultu $lo0,$n0 - mflo $m1 - - dmultu $alo,$bi - mflo $alo - mfhi $ahi - - dmultu $nj,$m1 - mflo $lo1 - mfhi $hi1 - dmultu $nlo,$m1 - daddu $lo1,$lo0 - sltu AT,$lo1,$lo0 - daddu $hi1,AT - mflo $nlo - mfhi $nhi - - move $tp,sp - li $j,16 -.align 4 -.L1st: - .set noreorder - PTR_ADD $aj,$ap,$j - ld $aj,($aj) - PTR_ADD $nj,$np,$j - ld $nj,($nj) - - dmultu $aj,$bi - daddu $lo0,$alo,$hi0 - daddu $lo1,$nlo,$hi1 - sltu AT,$lo0,$hi0 - sltu s7,$lo1,$hi1 - daddu $hi0,$ahi,AT - daddu $hi1,$nhi,s7 - mflo $alo - mfhi $ahi - - daddu $lo1,$lo0 - sltu AT,$lo1,$lo0 - dmultu $nj,$m1 - daddu $hi1,AT - addu $j,8 - sd $lo1,($tp) - sltu s7,$j,$num - mflo $nlo - mfhi $nhi - - bnez s7,.L1st - PTR_ADD $tp,8 - .set reorder - - daddu $lo0,$alo,$hi0 - sltu AT,$lo0,$hi0 - daddu $hi0,$ahi,AT - - daddu $lo1,$nlo,$hi1 - sltu s7,$lo1,$hi1 - daddu $hi1,$nhi,s7 - daddu $lo1,$lo0 - sltu AT,$lo1,$lo0 - daddu $hi1,AT - - sd $lo1,($tp) - - daddu $hi1,$hi0 - sltu AT,$hi1,$hi0 - sd $hi1,8($tp) - sd AT,16($tp) - - li $i,8 -.align 4 -.Louter: - PTR_ADD $bi,$bp,$i - ld $bi,($bi) - ld $aj,($ap) - ld $alo,8($ap) - ld $tj,(sp) - - dmultu $aj,$bi - ld $nj,($np) - ld $nlo,8($np) - mflo $lo0 - mfhi $hi0 - daddu $lo0,$tj - dmultu $lo0,$n0 - sltu AT,$lo0,$tj - daddu $hi0,AT - mflo $m1 - - dmultu $alo,$bi - mflo $alo - mfhi $ahi - - dmultu $nj,$m1 - mflo $lo1 - mfhi $hi1 - - dmultu $nlo,$m1 - daddu $lo1,$lo0 - sltu AT,$lo1,$lo0 - daddu $hi1,AT - mflo $nlo - mfhi $nhi - - move $tp,sp - li $j,16 - ld $tj,8($tp) -.align 4 -.Linner: - .set noreorder - PTR_ADD $aj,$ap,$j - ld $aj,($aj) - PTR_ADD $nj,$np,$j - ld $nj,($nj) - - dmultu $aj,$bi - daddu $lo0,$alo,$hi0 - daddu $lo1,$nlo,$hi1 - sltu AT,$lo0,$hi0 - sltu s7,$lo1,$hi1 - daddu $hi0,$ahi,AT - daddu $hi1,$nhi,s7 - mflo $alo - mfhi $ahi - - daddu $lo0,$tj - addu $j,8 - dmultu $nj,$m1 - sltu AT,$lo0,$tj - daddu $lo1,$lo0 - daddu $hi0,AT - sltu s7,$lo1,$lo0 - ld $tj,16($tp) - daddu $hi1,s7 - sltu AT,$j,$num - mflo $nlo - mfhi $nhi - sd $lo1,($tp) - bnez AT,.Linner - PTR_ADD $tp,8 - .set reorder - - daddu $lo0,$alo,$hi0 - sltu AT,$lo0,$hi0 - daddu $hi0,$ahi,AT - daddu $lo0,$tj - sltu s7,$lo0,$tj - daddu $hi0,s7 - - ld $tj,16($tp) - daddu $lo1,$nlo,$hi1 - sltu AT,$lo1,$hi1 - daddu $hi1,$nhi,AT - daddu $lo1,$lo0 - sltu s7,$lo1,$lo0 - daddu $hi1,s7 - sd $lo1,($tp) - - daddu $lo1,$hi1,$hi0 - sltu $hi1,$lo1,$hi0 - daddu $lo1,$tj - sltu AT,$lo1,$tj - daddu $hi1,AT - sd $lo1,8($tp) - sd $hi1,16($tp) - - addu $i,8 - sltu s7,$i,$num - bnez s7,.Louter - - .set noreorder - PTR_ADD $tj,sp,$num # &tp[num] - move $tp,sp - move $ap,sp - li $hi0,0 # clear borrow bit - -.align 4 -.Lsub: ld $lo0,($tp) - ld $lo1,($np) - PTR_ADD $tp,8 - PTR_ADD $np,8 - dsubu $lo1,$lo0,$lo1 # tp[i]-np[i] - sgtu AT,$lo1,$lo0 - dsubu $lo0,$lo1,$hi0 - sgtu $hi0,$lo0,$lo1 - sd $lo0,($rp) - or $hi0,AT - sltu AT,$tp,$tj - bnez AT,.Lsub - PTR_ADD $rp,8 - - dsubu $hi0,$hi1,$hi0 # handle upmost overflow bit - move $tp,sp - PTR_SUB $rp,$num # restore rp - not $hi1,$hi0 - - and $ap,$hi0,sp - and $bp,$hi1,$rp - or $ap,$ap,$bp # ap=borrow?tp:rp - -.align 4 -.Lcopy: ld $aj,($ap) - PTR_ADD $ap,8 - PTR_ADD $tp,8 - sd zero,-8($tp) - sltu AT,$tp,$tj - sd $aj,($rp) - bnez AT,.Lcopy - PTR_ADD $rp,8 - - ld s0,0($fp) - ld s1,8($fp) - ld s2,16($fp) - ld s3,24($fp) - ld s4,32($fp) - ld s5,40($fp) - ld s6,48($fp) - ld s7,56($fp) - li v0,1 - jr ra - PTR_ADD sp,$fp,64 - .set reorder -END(bn_mul_mont) -.rdata -.asciiz "Montgomery Multiplication for MIPS III/IV, CRYPTOGAMS by <appro\@openssl.org>" -___ - -print $code; -close STDOUT; diff --git a/crypto/openssl/crypto/bn/asm/ppc-mont.pl b/crypto/openssl/crypto/bn/asm/ppc-mont.pl deleted file mode 100755 index 7849eae..0000000 --- a/crypto/openssl/crypto/bn/asm/ppc-mont.pl +++ /dev/null @@ -1,323 +0,0 @@ -#!/usr/bin/env perl - -# ==================================================================== -# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL -# project. The module is, however, dual licensed under OpenSSL and -# CRYPTOGAMS licenses depending on where you obtain it. For further -# details see http://www.openssl.org/~appro/cryptogams/. -# ==================================================================== - -# April 2006 - -# "Teaser" Montgomery multiplication module for PowerPC. It's possible -# to gain a bit more by modulo-scheduling outer loop, then dedicated -# squaring procedure should give further 20% and code can be adapted -# for 32-bit application running on 64-bit CPU. As for the latter. -# It won't be able to achieve "native" 64-bit performance, because in -# 32-bit application context every addc instruction will have to be -# expanded as addc, twice right shift by 32 and finally adde, etc. -# So far RSA *sign* performance improvement over pre-bn_mul_mont asm -# for 64-bit application running on PPC970/G5 is: -# -# 512-bit +65% -# 1024-bit +35% -# 2048-bit +18% -# 4096-bit +4% - -$flavour = shift; - -if ($flavour =~ /32/) { - $BITS= 32; - $BNSZ= $BITS/8; - $SIZE_T=4; - $RZONE= 224; - $FRAME= $SIZE_T*16; - - $LD= "lwz"; # load - $LDU= "lwzu"; # load and update - $LDX= "lwzx"; # load indexed - $ST= "stw"; # store - $STU= "stwu"; # store and update - $STX= "stwx"; # store indexed - $STUX= "stwux"; # store indexed and update - $UMULL= "mullw"; # unsigned multiply low - $UMULH= "mulhwu"; # unsigned multiply high - $UCMP= "cmplw"; # unsigned compare - $SHRI= "srwi"; # unsigned shift right by immediate - $PUSH= $ST; - $POP= $LD; -} elsif ($flavour =~ /64/) { - $BITS= 64; - $BNSZ= $BITS/8; - $SIZE_T=8; - $RZONE= 288; - $FRAME= $SIZE_T*16; - - # same as above, but 64-bit mnemonics... - $LD= "ld"; # load - $LDU= "ldu"; # load and update - $LDX= "ldx"; # load indexed - $ST= "std"; # store - $STU= "stdu"; # store and update - $STX= "stdx"; # store indexed - $STUX= "stdux"; # store indexed and update - $UMULL= "mulld"; # unsigned multiply low - $UMULH= "mulhdu"; # unsigned multiply high - $UCMP= "cmpld"; # unsigned compare - $SHRI= "srdi"; # unsigned shift right by immediate - $PUSH= $ST; - $POP= $LD; -} else { die "nonsense $flavour"; } - -$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; -( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or -( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or -die "can't locate ppc-xlate.pl"; - -open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!"; - -$sp="r1"; -$toc="r2"; -$rp="r3"; $ovf="r3"; -$ap="r4"; -$bp="r5"; -$np="r6"; -$n0="r7"; -$num="r8"; -$rp="r9"; # $rp is reassigned -$aj="r10"; -$nj="r11"; -$tj="r12"; -# non-volatile registers -$i="r14"; -$j="r15"; -$tp="r16"; -$m0="r17"; -$m1="r18"; -$lo0="r19"; -$hi0="r20"; -$lo1="r21"; -$hi1="r22"; -$alo="r23"; -$ahi="r24"; -$nlo="r25"; -# -$nhi="r0"; - -$code=<<___; -.machine "any" -.text - -.globl .bn_mul_mont -.align 4 -.bn_mul_mont: - cmpwi $num,4 - mr $rp,r3 ; $rp is reassigned - li r3,0 - bltlr - - slwi $num,$num,`log($BNSZ)/log(2)` - li $tj,-4096 - addi $ovf,$num,`$FRAME+$RZONE` - subf $ovf,$ovf,$sp ; $sp-$ovf - and $ovf,$ovf,$tj ; minimize TLB usage - subf $ovf,$sp,$ovf ; $ovf-$sp - srwi $num,$num,`log($BNSZ)/log(2)` - $STUX $sp,$sp,$ovf - - $PUSH r14,`4*$SIZE_T`($sp) - $PUSH r15,`5*$SIZE_T`($sp) - $PUSH r16,`6*$SIZE_T`($sp) - $PUSH r17,`7*$SIZE_T`($sp) - $PUSH r18,`8*$SIZE_T`($sp) - $PUSH r19,`9*$SIZE_T`($sp) - $PUSH r20,`10*$SIZE_T`($sp) - $PUSH r21,`11*$SIZE_T`($sp) - $PUSH r22,`12*$SIZE_T`($sp) - $PUSH r23,`13*$SIZE_T`($sp) - $PUSH r24,`14*$SIZE_T`($sp) - $PUSH r25,`15*$SIZE_T`($sp) - - $LD $n0,0($n0) ; pull n0[0] value - addi $num,$num,-2 ; adjust $num for counter register - - $LD $m0,0($bp) ; m0=bp[0] - $LD $aj,0($ap) ; ap[0] - addi $tp,$sp,$FRAME - $UMULL $lo0,$aj,$m0 ; ap[0]*bp[0] - $UMULH $hi0,$aj,$m0 - - $LD $aj,$BNSZ($ap) ; ap[1] - $LD $nj,0($np) ; np[0] - - $UMULL $m1,$lo0,$n0 ; "tp[0]"*n0 - - $UMULL $alo,$aj,$m0 ; ap[1]*bp[0] - $UMULH $ahi,$aj,$m0 - - $UMULL $lo1,$nj,$m1 ; np[0]*m1 - $UMULH $hi1,$nj,$m1 - $LD $nj,$BNSZ($np) ; np[1] - addc $lo1,$lo1,$lo0 - addze $hi1,$hi1 - - $UMULL $nlo,$nj,$m1 ; np[1]*m1 - $UMULH $nhi,$nj,$m1 - - mtctr $num - li $j,`2*$BNSZ` -.align 4 -L1st: - $LDX $aj,$ap,$j ; ap[j] - addc $lo0,$alo,$hi0 - $LDX $nj,$np,$j ; np[j] - addze $hi0,$ahi - $UMULL $alo,$aj,$m0 ; ap[j]*bp[0] - addc $lo1,$nlo,$hi1 - $UMULH $ahi,$aj,$m0 - addze $hi1,$nhi - $UMULL $nlo,$nj,$m1 ; np[j]*m1 - addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[0] - $UMULH $nhi,$nj,$m1 - addze $hi1,$hi1 - $ST $lo1,0($tp) ; tp[j-1] - - addi $j,$j,$BNSZ ; j++ - addi $tp,$tp,$BNSZ ; tp++ - bdnz- L1st -;L1st - addc $lo0,$alo,$hi0 - addze $hi0,$ahi - - addc $lo1,$nlo,$hi1 - addze $hi1,$nhi - addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[0] - addze $hi1,$hi1 - $ST $lo1,0($tp) ; tp[j-1] - - li $ovf,0 - addc $hi1,$hi1,$hi0 - addze $ovf,$ovf ; upmost overflow bit - $ST $hi1,$BNSZ($tp) - - li $i,$BNSZ -.align 4 -Louter: - $LDX $m0,$bp,$i ; m0=bp[i] - $LD $aj,0($ap) ; ap[0] - addi $tp,$sp,$FRAME - $LD $tj,$FRAME($sp) ; tp[0] - $UMULL $lo0,$aj,$m0 ; ap[0]*bp[i] - $UMULH $hi0,$aj,$m0 - $LD $aj,$BNSZ($ap) ; ap[1] - $LD $nj,0($np) ; np[0] - addc $lo0,$lo0,$tj ; ap[0]*bp[i]+tp[0] - $UMULL $alo,$aj,$m0 ; ap[j]*bp[i] - addze $hi0,$hi0 - $UMULL $m1,$lo0,$n0 ; tp[0]*n0 - $UMULH $ahi,$aj,$m0 - $UMULL $lo1,$nj,$m1 ; np[0]*m1 - $UMULH $hi1,$nj,$m1 - $LD $nj,$BNSZ($np) ; np[1] - addc $lo1,$lo1,$lo0 - $UMULL $nlo,$nj,$m1 ; np[1]*m1 - addze $hi1,$hi1 - $UMULH $nhi,$nj,$m1 - - mtctr $num - li $j,`2*$BNSZ` -.align 4 -Linner: - $LDX $aj,$ap,$j ; ap[j] - addc $lo0,$alo,$hi0 - $LD $tj,$BNSZ($tp) ; tp[j] - addze $hi0,$ahi - $LDX $nj,$np,$j ; np[j] - addc $lo1,$nlo,$hi1 - $UMULL $alo,$aj,$m0 ; ap[j]*bp[i] - addze $hi1,$nhi - $UMULH $ahi,$aj,$m0 - addc $lo0,$lo0,$tj ; ap[j]*bp[i]+tp[j] - $UMULL $nlo,$nj,$m1 ; np[j]*m1 - addze $hi0,$hi0 - $UMULH $nhi,$nj,$m1 - addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[i]+tp[j] - addi $j,$j,$BNSZ ; j++ - addze $hi1,$hi1 - $ST $lo1,0($tp) ; tp[j-1] - addi $tp,$tp,$BNSZ ; tp++ - bdnz- Linner -;Linner - $LD $tj,$BNSZ($tp) ; tp[j] - addc $lo0,$alo,$hi0 - addze $hi0,$ahi - addc $lo0,$lo0,$tj ; ap[j]*bp[i]+tp[j] - addze $hi0,$hi0 - - addc $lo1,$nlo,$hi1 - addze $hi1,$nhi - addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[i]+tp[j] - addze $hi1,$hi1 - $ST $lo1,0($tp) ; tp[j-1] - - addic $ovf,$ovf,-1 ; move upmost overflow to XER[CA] - li $ovf,0 - adde $hi1,$hi1,$hi0 - addze $ovf,$ovf - $ST $hi1,$BNSZ($tp) -; - slwi $tj,$num,`log($BNSZ)/log(2)` - $UCMP $i,$tj - addi $i,$i,$BNSZ - ble- Louter - - addi $num,$num,2 ; restore $num - subfc $j,$j,$j ; j=0 and "clear" XER[CA] - addi $tp,$sp,$FRAME - mtctr $num - -.align 4 -Lsub: $LDX $tj,$tp,$j - $LDX $nj,$np,$j - subfe $aj,$nj,$tj ; tp[j]-np[j] - $STX $aj,$rp,$j - addi $j,$j,$BNSZ - bdnz- Lsub - - li $j,0 - mtctr $num - subfe $ovf,$j,$ovf ; handle upmost overflow bit - and $ap,$tp,$ovf - andc $np,$rp,$ovf - or $ap,$ap,$np ; ap=borrow?tp:rp - -.align 4 -Lcopy: ; copy or in-place refresh - $LDX $tj,$ap,$j - $STX $tj,$rp,$j - $STX $j,$tp,$j ; zap at once - addi $j,$j,$BNSZ - bdnz- Lcopy - - $POP r14,`4*$SIZE_T`($sp) - $POP r15,`5*$SIZE_T`($sp) - $POP r16,`6*$SIZE_T`($sp) - $POP r17,`7*$SIZE_T`($sp) - $POP r18,`8*$SIZE_T`($sp) - $POP r19,`9*$SIZE_T`($sp) - $POP r20,`10*$SIZE_T`($sp) - $POP r21,`11*$SIZE_T`($sp) - $POP r22,`12*$SIZE_T`($sp) - $POP r23,`13*$SIZE_T`($sp) - $POP r24,`14*$SIZE_T`($sp) - $POP r25,`15*$SIZE_T`($sp) - $POP $sp,0($sp) - li r3,1 - blr - .long 0 -.asciz "Montgomery Multiplication for PPC, CRYPTOGAMS by <appro\@fy.chalmers.se>" -___ - -$code =~ s/\`([^\`]*)\`/eval $1/gem; -print $code; -close STDOUT; diff --git a/crypto/openssl/crypto/bn/asm/ppc64-mont.pl b/crypto/openssl/crypto/bn/asm/ppc64-mont.pl deleted file mode 100755 index 3449b35..0000000 --- a/crypto/openssl/crypto/bn/asm/ppc64-mont.pl +++ /dev/null @@ -1,918 +0,0 @@ -#!/usr/bin/env perl - -# ==================================================================== -# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL -# project. The module is, however, dual licensed under OpenSSL and -# CRYPTOGAMS licenses depending on where you obtain it. For further -# details see http://www.openssl.org/~appro/cryptogams/. -# ==================================================================== - -# December 2007 - -# The reason for undertaken effort is basically following. Even though -# Power 6 CPU operates at incredible 4.7GHz clock frequency, its PKI -# performance was observed to be less than impressive, essentially as -# fast as 1.8GHz PPC970, or 2.6 times(!) slower than one would hope. -# Well, it's not surprising that IBM had to make some sacrifices to -# boost the clock frequency that much, but no overall improvement? -# Having observed how much difference did switching to FPU make on -# UltraSPARC, playing same stunt on Power 6 appeared appropriate... -# Unfortunately the resulting performance improvement is not as -# impressive, ~30%, and in absolute terms is still very far from what -# one would expect from 4.7GHz CPU. There is a chance that I'm doing -# something wrong, but in the lack of assembler level micro-profiling -# data or at least decent platform guide I can't tell... Or better -# results might be achieved with VMX... Anyway, this module provides -# *worse* performance on other PowerPC implementations, ~40-15% slower -# on PPC970 depending on key length and ~40% slower on Power 5 for all -# key lengths. As it's obviously inappropriate as "best all-round" -# alternative, it has to be complemented with run-time CPU family -# detection. Oh! It should also be noted that unlike other PowerPC -# implementation IALU ppc-mont.pl module performs *suboptimaly* on -# >=1024-bit key lengths on Power 6. It should also be noted that -# *everything* said so far applies to 64-bit builds! As far as 32-bit -# application executed on 64-bit CPU goes, this module is likely to -# become preferred choice, because it's easy to adapt it for such -# case and *is* faster than 32-bit ppc-mont.pl on *all* processors. - -# February 2008 - -# Micro-profiling assisted optimization results in ~15% improvement -# over original ppc64-mont.pl version, or overall ~50% improvement -# over ppc.pl module on Power 6. If compared to ppc-mont.pl on same -# Power 6 CPU, this module is 5-150% faster depending on key length, -# [hereafter] more for longer keys. But if compared to ppc-mont.pl -# on 1.8GHz PPC970, it's only 5-55% faster. Still far from impressive -# in absolute terms, but it's apparently the way Power 6 is... - -$flavour = shift; - -if ($flavour =~ /32/) { - $SIZE_T=4; - $RZONE= 224; - $FRAME= $SIZE_T*12+8*12; - $fname= "bn_mul_mont_ppc64"; - - $STUX= "stwux"; # store indexed and update - $PUSH= "stw"; - $POP= "lwz"; - die "not implemented yet"; -} elsif ($flavour =~ /64/) { - $SIZE_T=8; - $RZONE= 288; - $FRAME= $SIZE_T*12+8*12; - $fname= "bn_mul_mont"; - - # same as above, but 64-bit mnemonics... - $STUX= "stdux"; # store indexed and update - $PUSH= "std"; - $POP= "ld"; -} else { die "nonsense $flavour"; } - -$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; -( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or -( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or -die "can't locate ppc-xlate.pl"; - -open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!"; - -$FRAME=($FRAME+63)&~63; -$TRANSFER=16*8; - -$carry="r0"; -$sp="r1"; -$toc="r2"; -$rp="r3"; $ovf="r3"; -$ap="r4"; -$bp="r5"; -$np="r6"; -$n0="r7"; -$num="r8"; -$rp="r9"; # $rp is reassigned -$tp="r10"; -$j="r11"; -$i="r12"; -# non-volatile registers -$nap_d="r14"; # interleaved ap and np in double format -$a0="r15"; # ap[0] -$t0="r16"; # temporary registers -$t1="r17"; -$t2="r18"; -$t3="r19"; -$t4="r20"; -$t5="r21"; -$t6="r22"; -$t7="r23"; - -# PPC offers enough register bank capacity to unroll inner loops twice -# -# ..A3A2A1A0 -# dcba -# ----------- -# A0a -# A0b -# A0c -# A0d -# A1a -# A1b -# A1c -# A1d -# A2a -# A2b -# A2c -# A2d -# A3a -# A3b -# A3c -# A3d -# ..a -# ..b -# -$ba="f0"; $bb="f1"; $bc="f2"; $bd="f3"; -$na="f4"; $nb="f5"; $nc="f6"; $nd="f7"; -$dota="f8"; $dotb="f9"; -$A0="f10"; $A1="f11"; $A2="f12"; $A3="f13"; -$N0="f14"; $N1="f15"; $N2="f16"; $N3="f17"; -$T0a="f18"; $T0b="f19"; -$T1a="f20"; $T1b="f21"; -$T2a="f22"; $T2b="f23"; -$T3a="f24"; $T3b="f25"; - -# sp----------->+-------------------------------+ -# | saved sp | -# +-------------------------------+ -# | | -# +-------------------------------+ -# | 10 saved gpr, r14-r23 | -# . . -# . . -# +12*size_t +-------------------------------+ -# | 12 saved fpr, f14-f25 | -# . . -# . . -# +12*8 +-------------------------------+ -# | padding to 64 byte boundary | -# . . -# +X +-------------------------------+ -# | 16 gpr<->fpr transfer zone | -# . . -# . . -# +16*8 +-------------------------------+ -# | __int64 tmp[-1] | -# +-------------------------------+ -# | __int64 tmp[num] | -# . . -# . . -# . . -# +(num+1)*8 +-------------------------------+ -# | padding to 64 byte boundary | -# . . -# +X +-------------------------------+ -# | double nap_d[4*num] | -# . . -# . . -# . . -# +-------------------------------+ - -$code=<<___; -.machine "any" -.text - -.globl .$fname -.align 5 -.$fname: - cmpwi $num,4 - mr $rp,r3 ; $rp is reassigned - li r3,0 ; possible "not handled" return code - bltlr- - andi. r0,$num,1 ; $num has to be even - bnelr- - - slwi $num,$num,3 ; num*=8 - li $i,-4096 - slwi $tp,$num,2 ; place for {an}p_{lh}[num], i.e. 4*num - add $tp,$tp,$num ; place for tp[num+1] - addi $tp,$tp,`$FRAME+$TRANSFER+8+64+$RZONE` - subf $tp,$tp,$sp ; $sp-$tp - and $tp,$tp,$i ; minimize TLB usage - subf $tp,$sp,$tp ; $tp-$sp - $STUX $sp,$sp,$tp ; alloca - - $PUSH r14,`2*$SIZE_T`($sp) - $PUSH r15,`3*$SIZE_T`($sp) - $PUSH r16,`4*$SIZE_T`($sp) - $PUSH r17,`5*$SIZE_T`($sp) - $PUSH r18,`6*$SIZE_T`($sp) - $PUSH r19,`7*$SIZE_T`($sp) - $PUSH r20,`8*$SIZE_T`($sp) - $PUSH r21,`9*$SIZE_T`($sp) - $PUSH r22,`10*$SIZE_T`($sp) - $PUSH r23,`11*$SIZE_T`($sp) - stfd f14,`12*$SIZE_T+0`($sp) - stfd f15,`12*$SIZE_T+8`($sp) - stfd f16,`12*$SIZE_T+16`($sp) - stfd f17,`12*$SIZE_T+24`($sp) - stfd f18,`12*$SIZE_T+32`($sp) - stfd f19,`12*$SIZE_T+40`($sp) - stfd f20,`12*$SIZE_T+48`($sp) - stfd f21,`12*$SIZE_T+56`($sp) - stfd f22,`12*$SIZE_T+64`($sp) - stfd f23,`12*$SIZE_T+72`($sp) - stfd f24,`12*$SIZE_T+80`($sp) - stfd f25,`12*$SIZE_T+88`($sp) - - ld $a0,0($ap) ; pull ap[0] value - ld $n0,0($n0) ; pull n0[0] value - ld $t3,0($bp) ; bp[0] - - addi $tp,$sp,`$FRAME+$TRANSFER+8+64` - li $i,-64 - add $nap_d,$tp,$num - and $nap_d,$nap_d,$i ; align to 64 bytes - - mulld $t7,$a0,$t3 ; ap[0]*bp[0] - ; nap_d is off by 1, because it's used with stfdu/lfdu - addi $nap_d,$nap_d,-8 - srwi $j,$num,`3+1` ; counter register, num/2 - mulld $t7,$t7,$n0 ; tp[0]*n0 - addi $j,$j,-1 - addi $tp,$sp,`$FRAME+$TRANSFER-8` - li $carry,0 - mtctr $j - - ; transfer bp[0] to FPU as 4x16-bit values - extrdi $t0,$t3,16,48 - extrdi $t1,$t3,16,32 - extrdi $t2,$t3,16,16 - extrdi $t3,$t3,16,0 - std $t0,`$FRAME+0`($sp) - std $t1,`$FRAME+8`($sp) - std $t2,`$FRAME+16`($sp) - std $t3,`$FRAME+24`($sp) - ; transfer (ap[0]*bp[0])*n0 to FPU as 4x16-bit values - extrdi $t4,$t7,16,48 - extrdi $t5,$t7,16,32 - extrdi $t6,$t7,16,16 - extrdi $t7,$t7,16,0 - std $t4,`$FRAME+32`($sp) - std $t5,`$FRAME+40`($sp) - std $t6,`$FRAME+48`($sp) - std $t7,`$FRAME+56`($sp) - lwz $t0,4($ap) ; load a[j] as 32-bit word pair - lwz $t1,0($ap) - lwz $t2,12($ap) ; load a[j+1] as 32-bit word pair - lwz $t3,8($ap) - lwz $t4,4($np) ; load n[j] as 32-bit word pair - lwz $t5,0($np) - lwz $t6,12($np) ; load n[j+1] as 32-bit word pair - lwz $t7,8($np) - lfd $ba,`$FRAME+0`($sp) - lfd $bb,`$FRAME+8`($sp) - lfd $bc,`$FRAME+16`($sp) - lfd $bd,`$FRAME+24`($sp) - lfd $na,`$FRAME+32`($sp) - lfd $nb,`$FRAME+40`($sp) - lfd $nc,`$FRAME+48`($sp) - lfd $nd,`$FRAME+56`($sp) - std $t0,`$FRAME+64`($sp) - std $t1,`$FRAME+72`($sp) - std $t2,`$FRAME+80`($sp) - std $t3,`$FRAME+88`($sp) - std $t4,`$FRAME+96`($sp) - std $t5,`$FRAME+104`($sp) - std $t6,`$FRAME+112`($sp) - std $t7,`$FRAME+120`($sp) - fcfid $ba,$ba - fcfid $bb,$bb - fcfid $bc,$bc - fcfid $bd,$bd - fcfid $na,$na - fcfid $nb,$nb - fcfid $nc,$nc - fcfid $nd,$nd - - lfd $A0,`$FRAME+64`($sp) - lfd $A1,`$FRAME+72`($sp) - lfd $A2,`$FRAME+80`($sp) - lfd $A3,`$FRAME+88`($sp) - lfd $N0,`$FRAME+96`($sp) - lfd $N1,`$FRAME+104`($sp) - lfd $N2,`$FRAME+112`($sp) - lfd $N3,`$FRAME+120`($sp) - fcfid $A0,$A0 - fcfid $A1,$A1 - fcfid $A2,$A2 - fcfid $A3,$A3 - fcfid $N0,$N0 - fcfid $N1,$N1 - fcfid $N2,$N2 - fcfid $N3,$N3 - addi $ap,$ap,16 - addi $np,$np,16 - - fmul $T1a,$A1,$ba - fmul $T1b,$A1,$bb - stfd $A0,8($nap_d) ; save a[j] in double format - stfd $A1,16($nap_d) - fmul $T2a,$A2,$ba - fmul $T2b,$A2,$bb - stfd $A2,24($nap_d) ; save a[j+1] in double format - stfd $A3,32($nap_d) - fmul $T3a,$A3,$ba - fmul $T3b,$A3,$bb - stfd $N0,40($nap_d) ; save n[j] in double format - stfd $N1,48($nap_d) - fmul $T0a,$A0,$ba - fmul $T0b,$A0,$bb - stfd $N2,56($nap_d) ; save n[j+1] in double format - stfdu $N3,64($nap_d) - - fmadd $T1a,$A0,$bc,$T1a - fmadd $T1b,$A0,$bd,$T1b - fmadd $T2a,$A1,$bc,$T2a - fmadd $T2b,$A1,$bd,$T2b - fmadd $T3a,$A2,$bc,$T3a - fmadd $T3b,$A2,$bd,$T3b - fmul $dota,$A3,$bc - fmul $dotb,$A3,$bd - - fmadd $T1a,$N1,$na,$T1a - fmadd $T1b,$N1,$nb,$T1b - fmadd $T2a,$N2,$na,$T2a - fmadd $T2b,$N2,$nb,$T2b - fmadd $T3a,$N3,$na,$T3a - fmadd $T3b,$N3,$nb,$T3b - fmadd $T0a,$N0,$na,$T0a - fmadd $T0b,$N0,$nb,$T0b - - fmadd $T1a,$N0,$nc,$T1a - fmadd $T1b,$N0,$nd,$T1b - fmadd $T2a,$N1,$nc,$T2a - fmadd $T2b,$N1,$nd,$T2b - fmadd $T3a,$N2,$nc,$T3a - fmadd $T3b,$N2,$nd,$T3b - fmadd $dota,$N3,$nc,$dota - fmadd $dotb,$N3,$nd,$dotb - - fctid $T0a,$T0a - fctid $T0b,$T0b - fctid $T1a,$T1a - fctid $T1b,$T1b - fctid $T2a,$T2a - fctid $T2b,$T2b - fctid $T3a,$T3a - fctid $T3b,$T3b - - stfd $T0a,`$FRAME+0`($sp) - stfd $T0b,`$FRAME+8`($sp) - stfd $T1a,`$FRAME+16`($sp) - stfd $T1b,`$FRAME+24`($sp) - stfd $T2a,`$FRAME+32`($sp) - stfd $T2b,`$FRAME+40`($sp) - stfd $T3a,`$FRAME+48`($sp) - stfd $T3b,`$FRAME+56`($sp) - -.align 5 -L1st: - lwz $t0,4($ap) ; load a[j] as 32-bit word pair - lwz $t1,0($ap) - lwz $t2,12($ap) ; load a[j+1] as 32-bit word pair - lwz $t3,8($ap) - lwz $t4,4($np) ; load n[j] as 32-bit word pair - lwz $t5,0($np) - lwz $t6,12($np) ; load n[j+1] as 32-bit word pair - lwz $t7,8($np) - std $t0,`$FRAME+64`($sp) - std $t1,`$FRAME+72`($sp) - std $t2,`$FRAME+80`($sp) - std $t3,`$FRAME+88`($sp) - std $t4,`$FRAME+96`($sp) - std $t5,`$FRAME+104`($sp) - std $t6,`$FRAME+112`($sp) - std $t7,`$FRAME+120`($sp) - ld $t0,`$FRAME+0`($sp) - ld $t1,`$FRAME+8`($sp) - ld $t2,`$FRAME+16`($sp) - ld $t3,`$FRAME+24`($sp) - ld $t4,`$FRAME+32`($sp) - ld $t5,`$FRAME+40`($sp) - ld $t6,`$FRAME+48`($sp) - ld $t7,`$FRAME+56`($sp) - lfd $A0,`$FRAME+64`($sp) - lfd $A1,`$FRAME+72`($sp) - lfd $A2,`$FRAME+80`($sp) - lfd $A3,`$FRAME+88`($sp) - lfd $N0,`$FRAME+96`($sp) - lfd $N1,`$FRAME+104`($sp) - lfd $N2,`$FRAME+112`($sp) - lfd $N3,`$FRAME+120`($sp) - fcfid $A0,$A0 - fcfid $A1,$A1 - fcfid $A2,$A2 - fcfid $A3,$A3 - fcfid $N0,$N0 - fcfid $N1,$N1 - fcfid $N2,$N2 - fcfid $N3,$N3 - addi $ap,$ap,16 - addi $np,$np,16 - - fmul $T1a,$A1,$ba - fmul $T1b,$A1,$bb - fmul $T2a,$A2,$ba - fmul $T2b,$A2,$bb - stfd $A0,8($nap_d) ; save a[j] in double format - stfd $A1,16($nap_d) - fmul $T3a,$A3,$ba - fmul $T3b,$A3,$bb - fmadd $T0a,$A0,$ba,$dota - fmadd $T0b,$A0,$bb,$dotb - stfd $A2,24($nap_d) ; save a[j+1] in double format - stfd $A3,32($nap_d) - - fmadd $T1a,$A0,$bc,$T1a - fmadd $T1b,$A0,$bd,$T1b - fmadd $T2a,$A1,$bc,$T2a - fmadd $T2b,$A1,$bd,$T2b - stfd $N0,40($nap_d) ; save n[j] in double format - stfd $N1,48($nap_d) - fmadd $T3a,$A2,$bc,$T3a - fmadd $T3b,$A2,$bd,$T3b - add $t0,$t0,$carry ; can not overflow - fmul $dota,$A3,$bc - fmul $dotb,$A3,$bd - stfd $N2,56($nap_d) ; save n[j+1] in double format - stfdu $N3,64($nap_d) - srdi $carry,$t0,16 - add $t1,$t1,$carry - srdi $carry,$t1,16 - - fmadd $T1a,$N1,$na,$T1a - fmadd $T1b,$N1,$nb,$T1b - insrdi $t0,$t1,16,32 - fmadd $T2a,$N2,$na,$T2a - fmadd $T2b,$N2,$nb,$T2b - add $t2,$t2,$carry - fmadd $T3a,$N3,$na,$T3a - fmadd $T3b,$N3,$nb,$T3b - srdi $carry,$t2,16 - fmadd $T0a,$N0,$na,$T0a - fmadd $T0b,$N0,$nb,$T0b - insrdi $t0,$t2,16,16 - add $t3,$t3,$carry - srdi $carry,$t3,16 - - fmadd $T1a,$N0,$nc,$T1a - fmadd $T1b,$N0,$nd,$T1b - insrdi $t0,$t3,16,0 ; 0..63 bits - fmadd $T2a,$N1,$nc,$T2a - fmadd $T2b,$N1,$nd,$T2b - add $t4,$t4,$carry - fmadd $T3a,$N2,$nc,$T3a - fmadd $T3b,$N2,$nd,$T3b - srdi $carry,$t4,16 - fmadd $dota,$N3,$nc,$dota - fmadd $dotb,$N3,$nd,$dotb - add $t5,$t5,$carry - srdi $carry,$t5,16 - insrdi $t4,$t5,16,32 - - fctid $T0a,$T0a - fctid $T0b,$T0b - add $t6,$t6,$carry - fctid $T1a,$T1a - fctid $T1b,$T1b - srdi $carry,$t6,16 - fctid $T2a,$T2a - fctid $T2b,$T2b - insrdi $t4,$t6,16,16 - fctid $T3a,$T3a - fctid $T3b,$T3b - add $t7,$t7,$carry - insrdi $t4,$t7,16,0 ; 64..127 bits - srdi $carry,$t7,16 ; upper 33 bits - - stfd $T0a,`$FRAME+0`($sp) - stfd $T0b,`$FRAME+8`($sp) - stfd $T1a,`$FRAME+16`($sp) - stfd $T1b,`$FRAME+24`($sp) - stfd $T2a,`$FRAME+32`($sp) - stfd $T2b,`$FRAME+40`($sp) - stfd $T3a,`$FRAME+48`($sp) - stfd $T3b,`$FRAME+56`($sp) - std $t0,8($tp) ; tp[j-1] - stdu $t4,16($tp) ; tp[j] - bdnz- L1st - - fctid $dota,$dota - fctid $dotb,$dotb - - ld $t0,`$FRAME+0`($sp) - ld $t1,`$FRAME+8`($sp) - ld $t2,`$FRAME+16`($sp) - ld $t3,`$FRAME+24`($sp) - ld $t4,`$FRAME+32`($sp) - ld $t5,`$FRAME+40`($sp) - ld $t6,`$FRAME+48`($sp) - ld $t7,`$FRAME+56`($sp) - stfd $dota,`$FRAME+64`($sp) - stfd $dotb,`$FRAME+72`($sp) - - add $t0,$t0,$carry ; can not overflow - srdi $carry,$t0,16 - add $t1,$t1,$carry - srdi $carry,$t1,16 - insrdi $t0,$t1,16,32 - add $t2,$t2,$carry - srdi $carry,$t2,16 - insrdi $t0,$t2,16,16 - add $t3,$t3,$carry - srdi $carry,$t3,16 - insrdi $t0,$t3,16,0 ; 0..63 bits - add $t4,$t4,$carry - srdi $carry,$t4,16 - add $t5,$t5,$carry - srdi $carry,$t5,16 - insrdi $t4,$t5,16,32 - add $t6,$t6,$carry - srdi $carry,$t6,16 - insrdi $t4,$t6,16,16 - add $t7,$t7,$carry - insrdi $t4,$t7,16,0 ; 64..127 bits - srdi $carry,$t7,16 ; upper 33 bits - ld $t6,`$FRAME+64`($sp) - ld $t7,`$FRAME+72`($sp) - - std $t0,8($tp) ; tp[j-1] - stdu $t4,16($tp) ; tp[j] - - add $t6,$t6,$carry ; can not overflow - srdi $carry,$t6,16 - add $t7,$t7,$carry - insrdi $t6,$t7,48,0 - srdi $ovf,$t7,48 - std $t6,8($tp) ; tp[num-1] - - slwi $t7,$num,2 - subf $nap_d,$t7,$nap_d ; rewind pointer - - li $i,8 ; i=1 -.align 5 -Louter: - ldx $t3,$bp,$i ; bp[i] - ld $t6,`$FRAME+$TRANSFER+8`($sp) ; tp[0] - mulld $t7,$a0,$t3 ; ap[0]*bp[i] - - addi $tp,$sp,`$FRAME+$TRANSFER` - add $t7,$t7,$t6 ; ap[0]*bp[i]+tp[0] - li $carry,0 - mulld $t7,$t7,$n0 ; tp[0]*n0 - mtctr $j - - ; transfer bp[i] to FPU as 4x16-bit values - extrdi $t0,$t3,16,48 - extrdi $t1,$t3,16,32 - extrdi $t2,$t3,16,16 - extrdi $t3,$t3,16,0 - std $t0,`$FRAME+0`($sp) - std $t1,`$FRAME+8`($sp) - std $t2,`$FRAME+16`($sp) - std $t3,`$FRAME+24`($sp) - ; transfer (ap[0]*bp[i]+tp[0])*n0 to FPU as 4x16-bit values - extrdi $t4,$t7,16,48 - extrdi $t5,$t7,16,32 - extrdi $t6,$t7,16,16 - extrdi $t7,$t7,16,0 - std $t4,`$FRAME+32`($sp) - std $t5,`$FRAME+40`($sp) - std $t6,`$FRAME+48`($sp) - std $t7,`$FRAME+56`($sp) - - lfd $A0,8($nap_d) ; load a[j] in double format - lfd $A1,16($nap_d) - lfd $A2,24($nap_d) ; load a[j+1] in double format - lfd $A3,32($nap_d) - lfd $N0,40($nap_d) ; load n[j] in double format - lfd $N1,48($nap_d) - lfd $N2,56($nap_d) ; load n[j+1] in double format - lfdu $N3,64($nap_d) - - lfd $ba,`$FRAME+0`($sp) - lfd $bb,`$FRAME+8`($sp) - lfd $bc,`$FRAME+16`($sp) - lfd $bd,`$FRAME+24`($sp) - lfd $na,`$FRAME+32`($sp) - lfd $nb,`$FRAME+40`($sp) - lfd $nc,`$FRAME+48`($sp) - lfd $nd,`$FRAME+56`($sp) - - fcfid $ba,$ba - fcfid $bb,$bb - fcfid $bc,$bc - fcfid $bd,$bd - fcfid $na,$na - fcfid $nb,$nb - fcfid $nc,$nc - fcfid $nd,$nd - - fmul $T1a,$A1,$ba - fmul $T1b,$A1,$bb - fmul $T2a,$A2,$ba - fmul $T2b,$A2,$bb - fmul $T3a,$A3,$ba - fmul $T3b,$A3,$bb - fmul $T0a,$A0,$ba - fmul $T0b,$A0,$bb - - fmadd $T1a,$A0,$bc,$T1a - fmadd $T1b,$A0,$bd,$T1b - fmadd $T2a,$A1,$bc,$T2a - fmadd $T2b,$A1,$bd,$T2b - fmadd $T3a,$A2,$bc,$T3a - fmadd $T3b,$A2,$bd,$T3b - fmul $dota,$A3,$bc - fmul $dotb,$A3,$bd - - fmadd $T1a,$N1,$na,$T1a - fmadd $T1b,$N1,$nb,$T1b - lfd $A0,8($nap_d) ; load a[j] in double format - lfd $A1,16($nap_d) - fmadd $T2a,$N2,$na,$T2a - fmadd $T2b,$N2,$nb,$T2b - lfd $A2,24($nap_d) ; load a[j+1] in double format - lfd $A3,32($nap_d) - fmadd $T3a,$N3,$na,$T3a - fmadd $T3b,$N3,$nb,$T3b - fmadd $T0a,$N0,$na,$T0a - fmadd $T0b,$N0,$nb,$T0b - - fmadd $T1a,$N0,$nc,$T1a - fmadd $T1b,$N0,$nd,$T1b - fmadd $T2a,$N1,$nc,$T2a - fmadd $T2b,$N1,$nd,$T2b - fmadd $T3a,$N2,$nc,$T3a - fmadd $T3b,$N2,$nd,$T3b - fmadd $dota,$N3,$nc,$dota - fmadd $dotb,$N3,$nd,$dotb - - fctid $T0a,$T0a - fctid $T0b,$T0b - fctid $T1a,$T1a - fctid $T1b,$T1b - fctid $T2a,$T2a - fctid $T2b,$T2b - fctid $T3a,$T3a - fctid $T3b,$T3b - - stfd $T0a,`$FRAME+0`($sp) - stfd $T0b,`$FRAME+8`($sp) - stfd $T1a,`$FRAME+16`($sp) - stfd $T1b,`$FRAME+24`($sp) - stfd $T2a,`$FRAME+32`($sp) - stfd $T2b,`$FRAME+40`($sp) - stfd $T3a,`$FRAME+48`($sp) - stfd $T3b,`$FRAME+56`($sp) - -.align 5 -Linner: - fmul $T1a,$A1,$ba - fmul $T1b,$A1,$bb - fmul $T2a,$A2,$ba - fmul $T2b,$A2,$bb - lfd $N0,40($nap_d) ; load n[j] in double format - lfd $N1,48($nap_d) - fmul $T3a,$A3,$ba - fmul $T3b,$A3,$bb - fmadd $T0a,$A0,$ba,$dota - fmadd $T0b,$A0,$bb,$dotb - lfd $N2,56($nap_d) ; load n[j+1] in double format - lfdu $N3,64($nap_d) - - fmadd $T1a,$A0,$bc,$T1a - fmadd $T1b,$A0,$bd,$T1b - fmadd $T2a,$A1,$bc,$T2a - fmadd $T2b,$A1,$bd,$T2b - lfd $A0,8($nap_d) ; load a[j] in double format - lfd $A1,16($nap_d) - fmadd $T3a,$A2,$bc,$T3a - fmadd $T3b,$A2,$bd,$T3b - fmul $dota,$A3,$bc - fmul $dotb,$A3,$bd - lfd $A2,24($nap_d) ; load a[j+1] in double format - lfd $A3,32($nap_d) - - fmadd $T1a,$N1,$na,$T1a - fmadd $T1b,$N1,$nb,$T1b - ld $t0,`$FRAME+0`($sp) - ld $t1,`$FRAME+8`($sp) - fmadd $T2a,$N2,$na,$T2a - fmadd $T2b,$N2,$nb,$T2b - ld $t2,`$FRAME+16`($sp) - ld $t3,`$FRAME+24`($sp) - fmadd $T3a,$N3,$na,$T3a - fmadd $T3b,$N3,$nb,$T3b - add $t0,$t0,$carry ; can not overflow - ld $t4,`$FRAME+32`($sp) - ld $t5,`$FRAME+40`($sp) - fmadd $T0a,$N0,$na,$T0a - fmadd $T0b,$N0,$nb,$T0b - srdi $carry,$t0,16 - add $t1,$t1,$carry - srdi $carry,$t1,16 - ld $t6,`$FRAME+48`($sp) - ld $t7,`$FRAME+56`($sp) - - fmadd $T1a,$N0,$nc,$T1a - fmadd $T1b,$N0,$nd,$T1b - insrdi $t0,$t1,16,32 - ld $t1,8($tp) ; tp[j] - fmadd $T2a,$N1,$nc,$T2a - fmadd $T2b,$N1,$nd,$T2b - add $t2,$t2,$carry - fmadd $T3a,$N2,$nc,$T3a - fmadd $T3b,$N2,$nd,$T3b - srdi $carry,$t2,16 - insrdi $t0,$t2,16,16 - fmadd $dota,$N3,$nc,$dota - fmadd $dotb,$N3,$nd,$dotb - add $t3,$t3,$carry - ldu $t2,16($tp) ; tp[j+1] - srdi $carry,$t3,16 - insrdi $t0,$t3,16,0 ; 0..63 bits - add $t4,$t4,$carry - - fctid $T0a,$T0a - fctid $T0b,$T0b - srdi $carry,$t4,16 - fctid $T1a,$T1a - fctid $T1b,$T1b - add $t5,$t5,$carry - fctid $T2a,$T2a - fctid $T2b,$T2b - srdi $carry,$t5,16 - insrdi $t4,$t5,16,32 - fctid $T3a,$T3a - fctid $T3b,$T3b - add $t6,$t6,$carry - srdi $carry,$t6,16 - insrdi $t4,$t6,16,16 - - stfd $T0a,`$FRAME+0`($sp) - stfd $T0b,`$FRAME+8`($sp) - add $t7,$t7,$carry - addc $t3,$t0,$t1 - stfd $T1a,`$FRAME+16`($sp) - stfd $T1b,`$FRAME+24`($sp) - insrdi $t4,$t7,16,0 ; 64..127 bits - srdi $carry,$t7,16 ; upper 33 bits - stfd $T2a,`$FRAME+32`($sp) - stfd $T2b,`$FRAME+40`($sp) - adde $t5,$t4,$t2 - stfd $T3a,`$FRAME+48`($sp) - stfd $T3b,`$FRAME+56`($sp) - addze $carry,$carry - std $t3,-16($tp) ; tp[j-1] - std $t5,-8($tp) ; tp[j] - bdnz- Linner - - fctid $dota,$dota - fctid $dotb,$dotb - ld $t0,`$FRAME+0`($sp) - ld $t1,`$FRAME+8`($sp) - ld $t2,`$FRAME+16`($sp) - ld $t3,`$FRAME+24`($sp) - ld $t4,`$FRAME+32`($sp) - ld $t5,`$FRAME+40`($sp) - ld $t6,`$FRAME+48`($sp) - ld $t7,`$FRAME+56`($sp) - stfd $dota,`$FRAME+64`($sp) - stfd $dotb,`$FRAME+72`($sp) - - add $t0,$t0,$carry ; can not overflow - srdi $carry,$t0,16 - add $t1,$t1,$carry - srdi $carry,$t1,16 - insrdi $t0,$t1,16,32 - add $t2,$t2,$carry - ld $t1,8($tp) ; tp[j] - srdi $carry,$t2,16 - insrdi $t0,$t2,16,16 - add $t3,$t3,$carry - ldu $t2,16($tp) ; tp[j+1] - srdi $carry,$t3,16 - insrdi $t0,$t3,16,0 ; 0..63 bits - add $t4,$t4,$carry - srdi $carry,$t4,16 - add $t5,$t5,$carry - srdi $carry,$t5,16 - insrdi $t4,$t5,16,32 - add $t6,$t6,$carry - srdi $carry,$t6,16 - insrdi $t4,$t6,16,16 - add $t7,$t7,$carry - insrdi $t4,$t7,16,0 ; 64..127 bits - srdi $carry,$t7,16 ; upper 33 bits - ld $t6,`$FRAME+64`($sp) - ld $t7,`$FRAME+72`($sp) - - addc $t3,$t0,$t1 - adde $t5,$t4,$t2 - addze $carry,$carry - - std $t3,-16($tp) ; tp[j-1] - std $t5,-8($tp) ; tp[j] - - add $carry,$carry,$ovf ; comsume upmost overflow - add $t6,$t6,$carry ; can not overflow - srdi $carry,$t6,16 - add $t7,$t7,$carry - insrdi $t6,$t7,48,0 - srdi $ovf,$t7,48 - std $t6,0($tp) ; tp[num-1] - - slwi $t7,$num,2 - addi $i,$i,8 - subf $nap_d,$t7,$nap_d ; rewind pointer - cmpw $i,$num - blt- Louter - - subf $np,$num,$np ; rewind np - addi $j,$j,1 ; restore counter - subfc $i,$i,$i ; j=0 and "clear" XER[CA] - addi $tp,$sp,`$FRAME+$TRANSFER+8` - addi $t4,$sp,`$FRAME+$TRANSFER+16` - addi $t5,$np,8 - addi $t6,$rp,8 - mtctr $j - -.align 4 -Lsub: ldx $t0,$tp,$i - ldx $t1,$np,$i - ldx $t2,$t4,$i - ldx $t3,$t5,$i - subfe $t0,$t1,$t0 ; tp[j]-np[j] - subfe $t2,$t3,$t2 ; tp[j+1]-np[j+1] - stdx $t0,$rp,$i - stdx $t2,$t6,$i - addi $i,$i,16 - bdnz- Lsub - - li $i,0 - subfe $ovf,$i,$ovf ; handle upmost overflow bit - and $ap,$tp,$ovf - andc $np,$rp,$ovf - or $ap,$ap,$np ; ap=borrow?tp:rp - addi $t7,$ap,8 - mtctr $j - -.align 4 -Lcopy: ; copy or in-place refresh - ldx $t0,$ap,$i - ldx $t1,$t7,$i - std $i,8($nap_d) ; zap nap_d - std $i,16($nap_d) - std $i,24($nap_d) - std $i,32($nap_d) - std $i,40($nap_d) - std $i,48($nap_d) - std $i,56($nap_d) - stdu $i,64($nap_d) - stdx $t0,$rp,$i - stdx $t1,$t6,$i - stdx $i,$tp,$i ; zap tp at once - stdx $i,$t4,$i - addi $i,$i,16 - bdnz- Lcopy - - $POP r14,`2*$SIZE_T`($sp) - $POP r15,`3*$SIZE_T`($sp) - $POP r16,`4*$SIZE_T`($sp) - $POP r17,`5*$SIZE_T`($sp) - $POP r18,`6*$SIZE_T`($sp) - $POP r19,`7*$SIZE_T`($sp) - $POP r20,`8*$SIZE_T`($sp) - $POP r21,`9*$SIZE_T`($sp) - $POP r22,`10*$SIZE_T`($sp) - $POP r23,`11*$SIZE_T`($sp) - lfd f14,`12*$SIZE_T+0`($sp) - lfd f15,`12*$SIZE_T+8`($sp) - lfd f16,`12*$SIZE_T+16`($sp) - lfd f17,`12*$SIZE_T+24`($sp) - lfd f18,`12*$SIZE_T+32`($sp) - lfd f19,`12*$SIZE_T+40`($sp) - lfd f20,`12*$SIZE_T+48`($sp) - lfd f21,`12*$SIZE_T+56`($sp) - lfd f22,`12*$SIZE_T+64`($sp) - lfd f23,`12*$SIZE_T+72`($sp) - lfd f24,`12*$SIZE_T+80`($sp) - lfd f25,`12*$SIZE_T+88`($sp) - $POP $sp,0($sp) - li r3,1 ; signal "handled" - blr - .long 0 -.asciz "Montgomery Multiplication for PPC64, CRYPTOGAMS by <appro\@fy.chalmers.se>" -___ - -$code =~ s/\`([^\`]*)\`/eval $1/gem; -print $code; -close STDOUT; diff --git a/crypto/openssl/crypto/bn/asm/s390x-mont.pl b/crypto/openssl/crypto/bn/asm/s390x-mont.pl deleted file mode 100755 index d232510..0000000 --- a/crypto/openssl/crypto/bn/asm/s390x-mont.pl +++ /dev/null @@ -1,225 +0,0 @@ -#!/usr/bin/env perl - -# ==================================================================== -# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL -# project. The module is, however, dual licensed under OpenSSL and -# CRYPTOGAMS licenses depending on where you obtain it. For further -# details see http://www.openssl.org/~appro/cryptogams/. -# ==================================================================== - -# April 2007. -# -# Performance improvement over vanilla C code varies from 85% to 45% -# depending on key length and benchmark. Unfortunately in this context -# these are not very impressive results [for code that utilizes "wide" -# 64x64=128-bit multiplication, which is not commonly available to C -# programmers], at least hand-coded bn_asm.c replacement is known to -# provide 30-40% better results for longest keys. Well, on a second -# thought it's not very surprising, because z-CPUs are single-issue -# and _strictly_ in-order execution, while bn_mul_mont is more or less -# dependent on CPU ability to pipe-line instructions and have several -# of them "in-flight" at the same time. I mean while other methods, -# for example Karatsuba, aim to minimize amount of multiplications at -# the cost of other operations increase, bn_mul_mont aim to neatly -# "overlap" multiplications and the other operations [and on most -# platforms even minimize the amount of the other operations, in -# particular references to memory]. But it's possible to improve this -# module performance by implementing dedicated squaring code-path and -# possibly by unrolling loops... - -# January 2009. -# -# Reschedule to minimize/avoid Address Generation Interlock hazard, -# make inner loops counter-based. - -$mn0="%r0"; -$num="%r1"; - -# int bn_mul_mont( -$rp="%r2"; # BN_ULONG *rp, -$ap="%r3"; # const BN_ULONG *ap, -$bp="%r4"; # const BN_ULONG *bp, -$np="%r5"; # const BN_ULONG *np, -$n0="%r6"; # const BN_ULONG *n0, -#$num="160(%r15)" # int num); - -$bi="%r2"; # zaps rp -$j="%r7"; - -$ahi="%r8"; -$alo="%r9"; -$nhi="%r10"; -$nlo="%r11"; -$AHI="%r12"; -$NHI="%r13"; -$count="%r14"; -$sp="%r15"; - -$code.=<<___; -.text -.globl bn_mul_mont -.type bn_mul_mont,\@function -bn_mul_mont: - lgf $num,164($sp) # pull $num - sla $num,3 # $num to enumerate bytes - la $bp,0($num,$bp) - - stg %r2,16($sp) - - cghi $num,16 # - lghi %r2,0 # - blr %r14 # if($num<16) return 0; - cghi $num,128 # - bhr %r14 # if($num>128) return 0; - - stmg %r3,%r15,24($sp) - - lghi $rp,-160-8 # leave room for carry bit - lcgr $j,$num # -$num - lgr %r0,$sp - la $rp,0($rp,$sp) - la $sp,0($j,$rp) # alloca - stg %r0,0($sp) # back chain - - sra $num,3 # restore $num - la $bp,0($j,$bp) # restore $bp - ahi $num,-1 # adjust $num for inner loop - lg $n0,0($n0) # pull n0 - - lg $bi,0($bp) - lg $alo,0($ap) - mlgr $ahi,$bi # ap[0]*bp[0] - lgr $AHI,$ahi - - lgr $mn0,$alo # "tp[0]"*n0 - msgr $mn0,$n0 - - lg $nlo,0($np) # - mlgr $nhi,$mn0 # np[0]*m1 - algr $nlo,$alo # +="tp[0]" - lghi $NHI,0 - alcgr $NHI,$nhi - - la $j,8(%r0) # j=1 - lr $count,$num - -.align 16 -.L1st: - lg $alo,0($j,$ap) - mlgr $ahi,$bi # ap[j]*bp[0] - algr $alo,$AHI - lghi $AHI,0 - alcgr $AHI,$ahi - - lg $nlo,0($j,$np) - mlgr $nhi,$mn0 # np[j]*m1 - algr $nlo,$NHI - lghi $NHI,0 - alcgr $nhi,$NHI # +="tp[j]" - algr $nlo,$alo - alcgr $NHI,$nhi - - stg $nlo,160-8($j,$sp) # tp[j-1]= - la $j,8($j) # j++ - brct $count,.L1st - - algr $NHI,$AHI - lghi $AHI,0 - alcgr $AHI,$AHI # upmost overflow bit - stg $NHI,160-8($j,$sp) - stg $AHI,160($j,$sp) - la $bp,8($bp) # bp++ - -.Louter: - lg $bi,0($bp) # bp[i] - lg $alo,0($ap) - mlgr $ahi,$bi # ap[0]*bp[i] - alg $alo,160($sp) # +=tp[0] - lghi $AHI,0 - alcgr $AHI,$ahi - - lgr $mn0,$alo - msgr $mn0,$n0 # tp[0]*n0 - - lg $nlo,0($np) # np[0] - mlgr $nhi,$mn0 # np[0]*m1 - algr $nlo,$alo # +="tp[0]" - lghi $NHI,0 - alcgr $NHI,$nhi - - la $j,8(%r0) # j=1 - lr $count,$num - -.align 16 -.Linner: - lg $alo,0($j,$ap) - mlgr $ahi,$bi # ap[j]*bp[i] - algr $alo,$AHI - lghi $AHI,0 - alcgr $ahi,$AHI - alg $alo,160($j,$sp)# +=tp[j] - alcgr $AHI,$ahi - - lg $nlo,0($j,$np) - mlgr $nhi,$mn0 # np[j]*m1 - algr $nlo,$NHI - lghi $NHI,0 - alcgr $nhi,$NHI - algr $nlo,$alo # +="tp[j]" - alcgr $NHI,$nhi - - stg $nlo,160-8($j,$sp) # tp[j-1]= - la $j,8($j) # j++ - brct $count,.Linner - - algr $NHI,$AHI - lghi $AHI,0 - alcgr $AHI,$AHI - alg $NHI,160($j,$sp)# accumulate previous upmost overflow bit - lghi $ahi,0 - alcgr $AHI,$ahi # new upmost overflow bit - stg $NHI,160-8($j,$sp) - stg $AHI,160($j,$sp) - - la $bp,8($bp) # bp++ - clg $bp,160+8+32($j,$sp) # compare to &bp[num] - jne .Louter - - lg $rp,160+8+16($j,$sp) # reincarnate rp - la $ap,160($sp) - ahi $num,1 # restore $num, incidentally clears "borrow" - - la $j,0(%r0) - lr $count,$num -.Lsub: lg $alo,0($j,$ap) - slbg $alo,0($j,$np) - stg $alo,0($j,$rp) - la $j,8($j) - brct $count,.Lsub - lghi $ahi,0 - slbgr $AHI,$ahi # handle upmost carry - - ngr $ap,$AHI - lghi $np,-1 - xgr $np,$AHI - ngr $np,$rp - ogr $ap,$np # ap=borrow?tp:rp - - la $j,0(%r0) - lgr $count,$num -.Lcopy: lg $alo,0($j,$ap) # copy or in-place refresh - stg $j,160($j,$sp) # zap tp - stg $alo,0($j,$rp) - la $j,8($j) - brct $count,.Lcopy - - la %r1,160+8+48($j,$sp) - lmg %r6,%r15,0(%r1) - lghi %r2,1 # signal "processed" - br %r14 -.size bn_mul_mont,.-bn_mul_mont -.string "Montgomery Multiplication for s390x, CRYPTOGAMS by <appro\@openssl.org>" -___ - -print $code; -close STDOUT; diff --git a/crypto/openssl/crypto/bn/asm/s390x.S b/crypto/openssl/crypto/bn/asm/s390x.S deleted file mode 100755 index 8f45f5d..0000000 --- a/crypto/openssl/crypto/bn/asm/s390x.S +++ /dev/null @@ -1,678 +0,0 @@ -.ident "s390x.S, version 1.0" -// ==================================================================== -// Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL -// project. -// -// Rights for redistribution and usage in source and binary forms are -// granted according to the OpenSSL license. Warranty of any kind is -// disclaimed. -// ==================================================================== - -.text - -#define zero %r0 - -// BN_ULONG bn_mul_add_words(BN_ULONG *r2,BN_ULONG *r3,int r4,BN_ULONG r5); -.globl bn_mul_add_words -.type bn_mul_add_words,@function -.align 4 -bn_mul_add_words: - lghi zero,0 // zero = 0 - la %r1,0(%r2) // put rp aside - lghi %r2,0 // i=0; - ltgfr %r4,%r4 - bler %r14 // if (len<=0) return 0; - - stmg %r6,%r10,48(%r15) - lghi %r8,0 // carry = 0 - srag %r10,%r4,2 // cnt=len/4 - jz .Loop1_madd - -.Loop4_madd: - lg %r7,0(%r2,%r3) // ap[i] - mlgr %r6,%r5 // *=w - algr %r7,%r8 // +=carry - alcgr %r6,zero - alg %r7,0(%r2,%r1) // +=rp[i] - alcgr %r6,zero - stg %r7,0(%r2,%r1) // rp[i]= - - lg %r9,8(%r2,%r3) - mlgr %r8,%r5 - algr %r9,%r6 - alcgr %r8,zero - alg %r9,8(%r2,%r1) - alcgr %r8,zero - stg %r9,8(%r2,%r1) - - lg %r7,16(%r2,%r3) - mlgr %r6,%r5 - algr %r7,%r8 - alcgr %r6,zero - alg %r7,16(%r2,%r1) - alcgr %r6,zero - stg %r7,16(%r2,%r1) - - lg %r9,24(%r2,%r3) - mlgr %r8,%r5 - algr %r9,%r6 - alcgr %r8,zero - alg %r9,24(%r2,%r1) - alcgr %r8,zero - stg %r9,24(%r2,%r1) - - la %r2,32(%r2) // i+=4 - brct %r10,.Loop4_madd - - lghi %r10,3 - nr %r4,%r10 // cnt=len%4 - jz .Lend_madd - -.Loop1_madd: - lg %r7,0(%r2,%r3) // ap[i] - mlgr %r6,%r5 // *=w - algr %r7,%r8 // +=carry - alcgr %r6,zero - alg %r7,0(%r2,%r1) // +=rp[i] - alcgr %r6,zero - stg %r7,0(%r2,%r1) // rp[i]= - - lgr %r8,%r6 - la %r2,8(%r2) // i++ - brct %r4,.Loop1_madd - -.Lend_madd: - lgr %r2,%r8 - lmg %r6,%r10,48(%r15) - br %r14 -.size bn_mul_add_words,.-bn_mul_add_words - -// BN_ULONG bn_mul_words(BN_ULONG *r2,BN_ULONG *r3,int r4,BN_ULONG r5); -.globl bn_mul_words -.type bn_mul_words,@function -.align 4 -bn_mul_words: - lghi zero,0 // zero = 0 - la %r1,0(%r2) // put rp aside - lghi %r2,0 // i=0; - ltgfr %r4,%r4 - bler %r14 // if (len<=0) return 0; - - stmg %r6,%r10,48(%r15) - lghi %r8,0 // carry = 0 - srag %r10,%r4,2 // cnt=len/4 - jz .Loop1_mul - -.Loop4_mul: - lg %r7,0(%r2,%r3) // ap[i] - mlgr %r6,%r5 // *=w - algr %r7,%r8 // +=carry - alcgr %r6,zero - stg %r7,0(%r2,%r1) // rp[i]= - - lg %r9,8(%r2,%r3) - mlgr %r8,%r5 - algr %r9,%r6 - alcgr %r8,zero - stg %r9,8(%r2,%r1) - - lg %r7,16(%r2,%r3) - mlgr %r6,%r5 - algr %r7,%r8 - alcgr %r6,zero - stg %r7,16(%r2,%r1) - - lg %r9,24(%r2,%r3) - mlgr %r8,%r5 - algr %r9,%r6 - alcgr %r8,zero - stg %r9,24(%r2,%r1) - - la %r2,32(%r2) // i+=4 - brct %r10,.Loop4_mul - - lghi %r10,3 - nr %r4,%r10 // cnt=len%4 - jz .Lend_mul - -.Loop1_mul: - lg %r7,0(%r2,%r3) // ap[i] - mlgr %r6,%r5 // *=w - algr %r7,%r8 // +=carry - alcgr %r6,zero - stg %r7,0(%r2,%r1) // rp[i]= - - lgr %r8,%r6 - la %r2,8(%r2) // i++ - brct %r4,.Loop1_mul - -.Lend_mul: - lgr %r2,%r8 - lmg %r6,%r10,48(%r15) - br %r14 -.size bn_mul_words,.-bn_mul_words - -// void bn_sqr_words(BN_ULONG *r2,BN_ULONG *r2,int r4) -.globl bn_sqr_words -.type bn_sqr_words,@function -.align 4 -bn_sqr_words: - ltgfr %r4,%r4 - bler %r14 - - stmg %r6,%r7,48(%r15) - srag %r1,%r4,2 // cnt=len/4 - jz .Loop1_sqr - -.Loop4_sqr: - lg %r7,0(%r3) - mlgr %r6,%r7 - stg %r7,0(%r2) - stg %r6,8(%r2) - - lg %r7,8(%r3) - mlgr %r6,%r7 - stg %r7,16(%r2) - stg %r6,24(%r2) - - lg %r7,16(%r3) - mlgr %r6,%r7 - stg %r7,32(%r2) - stg %r6,40(%r2) - - lg %r7,24(%r3) - mlgr %r6,%r7 - stg %r7,48(%r2) - stg %r6,56(%r2) - - la %r3,32(%r3) - la %r2,64(%r2) - brct %r1,.Loop4_sqr - - lghi %r1,3 - nr %r4,%r1 // cnt=len%4 - jz .Lend_sqr - -.Loop1_sqr: - lg %r7,0(%r3) - mlgr %r6,%r7 - stg %r7,0(%r2) - stg %r6,8(%r2) - - la %r3,8(%r3) - la %r2,16(%r2) - brct %r4,.Loop1_sqr - -.Lend_sqr: - lmg %r6,%r7,48(%r15) - br %r14 -.size bn_sqr_words,.-bn_sqr_words - -// BN_ULONG bn_div_words(BN_ULONG h,BN_ULONG l,BN_ULONG d); -.globl bn_div_words -.type bn_div_words,@function -.align 4 -bn_div_words: - dlgr %r2,%r4 - lgr %r2,%r3 - br %r14 -.size bn_div_words,.-bn_div_words - -// BN_ULONG bn_add_words(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4,int r5); -.globl bn_add_words -.type bn_add_words,@function -.align 4 -bn_add_words: - la %r1,0(%r2) // put rp aside - lghi %r2,0 // i=0 - ltgfr %r5,%r5 - bler %r14 // if (len<=0) return 0; - - stg %r6,48(%r15) - lghi %r6,3 - nr %r6,%r5 // len%4 - sra %r5,2 // len/4, use sra because it sets condition code - jz .Loop1_add // carry is incidentally cleared if branch taken - algr %r2,%r2 // clear carry - -.Loop4_add: - lg %r0,0(%r2,%r3) - alcg %r0,0(%r2,%r4) - stg %r0,0(%r2,%r1) - lg %r0,8(%r2,%r3) - alcg %r0,8(%r2,%r4) - stg %r0,8(%r2,%r1) - lg %r0,16(%r2,%r3) - alcg %r0,16(%r2,%r4) - stg %r0,16(%r2,%r1) - lg %r0,24(%r2,%r3) - alcg %r0,24(%r2,%r4) - stg %r0,24(%r2,%r1) - - la %r2,32(%r2) // i+=4 - brct %r5,.Loop4_add - - la %r6,1(%r6) // see if len%4 is zero ... - brct %r6,.Loop1_add // without touching condition code:-) - -.Lexit_add: - lghi %r2,0 - alcgr %r2,%r2 - lg %r6,48(%r15) - br %r14 - -.Loop1_add: - lg %r0,0(%r2,%r3) - alcg %r0,0(%r2,%r4) - stg %r0,0(%r2,%r1) - - la %r2,8(%r2) // i++ - brct %r6,.Loop1_add - - j .Lexit_add -.size bn_add_words,.-bn_add_words - -// BN_ULONG bn_sub_words(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4,int r5); -.globl bn_sub_words -.type bn_sub_words,@function -.align 4 -bn_sub_words: - la %r1,0(%r2) // put rp aside - lghi %r2,0 // i=0 - ltgfr %r5,%r5 - bler %r14 // if (len<=0) return 0; - - stg %r6,48(%r15) - lghi %r6,3 - nr %r6,%r5 // len%4 - sra %r5,2 // len/4, use sra because it sets condition code - jnz .Loop4_sub // borrow is incidentally cleared if branch taken - slgr %r2,%r2 // clear borrow - -.Loop1_sub: - lg %r0,0(%r2,%r3) - slbg %r0,0(%r2,%r4) - stg %r0,0(%r2,%r1) - - la %r2,8(%r2) // i++ - brct %r6,.Loop1_sub - j .Lexit_sub - -.Loop4_sub: - lg %r0,0(%r2,%r3) - slbg %r0,0(%r2,%r4) - stg %r0,0(%r2,%r1) - lg %r0,8(%r2,%r3) - slbg %r0,8(%r2,%r4) - stg %r0,8(%r2,%r1) - lg %r0,16(%r2,%r3) - slbg %r0,16(%r2,%r4) - stg %r0,16(%r2,%r1) - lg %r0,24(%r2,%r3) - slbg %r0,24(%r2,%r4) - stg %r0,24(%r2,%r1) - - la %r2,32(%r2) // i+=4 - brct %r5,.Loop4_sub - - la %r6,1(%r6) // see if len%4 is zero ... - brct %r6,.Loop1_sub // without touching condition code:-) - -.Lexit_sub: - lghi %r2,0 - slbgr %r2,%r2 - lcgr %r2,%r2 - lg %r6,48(%r15) - br %r14 -.size bn_sub_words,.-bn_sub_words - -#define c1 %r1 -#define c2 %r5 -#define c3 %r8 - -#define mul_add_c(ai,bi,c1,c2,c3) \ - lg %r7,ai*8(%r3); \ - mlg %r6,bi*8(%r4); \ - algr c1,%r7; \ - alcgr c2,%r6; \ - alcgr c3,zero - -// void bn_mul_comba8(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4); -.globl bn_mul_comba8 -.type bn_mul_comba8,@function -.align 4 -bn_mul_comba8: - stmg %r6,%r8,48(%r15) - - lghi c1,0 - lghi c2,0 - lghi c3,0 - lghi zero,0 - - mul_add_c(0,0,c1,c2,c3); - stg c1,0*8(%r2) - lghi c1,0 - - mul_add_c(0,1,c2,c3,c1); - mul_add_c(1,0,c2,c3,c1); - stg c2,1*8(%r2) - lghi c2,0 - - mul_add_c(2,0,c3,c1,c2); - mul_add_c(1,1,c3,c1,c2); - mul_add_c(0,2,c3,c1,c2); - stg c3,2*8(%r2) - lghi c3,0 - - mul_add_c(0,3,c1,c2,c3); - mul_add_c(1,2,c1,c2,c3); - mul_add_c(2,1,c1,c2,c3); - mul_add_c(3,0,c1,c2,c3); - stg c1,3*8(%r2) - lghi c1,0 - - mul_add_c(4,0,c2,c3,c1); - mul_add_c(3,1,c2,c3,c1); - mul_add_c(2,2,c2,c3,c1); - mul_add_c(1,3,c2,c3,c1); - mul_add_c(0,4,c2,c3,c1); - stg c2,4*8(%r2) - lghi c2,0 - - mul_add_c(0,5,c3,c1,c2); - mul_add_c(1,4,c3,c1,c2); - mul_add_c(2,3,c3,c1,c2); - mul_add_c(3,2,c3,c1,c2); - mul_add_c(4,1,c3,c1,c2); - mul_add_c(5,0,c3,c1,c2); - stg c3,5*8(%r2) - lghi c3,0 - - mul_add_c(6,0,c1,c2,c3); - mul_add_c(5,1,c1,c2,c3); - mul_add_c(4,2,c1,c2,c3); - mul_add_c(3,3,c1,c2,c3); - mul_add_c(2,4,c1,c2,c3); - mul_add_c(1,5,c1,c2,c3); - mul_add_c(0,6,c1,c2,c3); - stg c1,6*8(%r2) - lghi c1,0 - - mul_add_c(0,7,c2,c3,c1); - mul_add_c(1,6,c2,c3,c1); - mul_add_c(2,5,c2,c3,c1); - mul_add_c(3,4,c2,c3,c1); - mul_add_c(4,3,c2,c3,c1); - mul_add_c(5,2,c2,c3,c1); - mul_add_c(6,1,c2,c3,c1); - mul_add_c(7,0,c2,c3,c1); - stg c2,7*8(%r2) - lghi c2,0 - - mul_add_c(7,1,c3,c1,c2); - mul_add_c(6,2,c3,c1,c2); - mul_add_c(5,3,c3,c1,c2); - mul_add_c(4,4,c3,c1,c2); - mul_add_c(3,5,c3,c1,c2); - mul_add_c(2,6,c3,c1,c2); - mul_add_c(1,7,c3,c1,c2); - stg c3,8*8(%r2) - lghi c3,0 - - mul_add_c(2,7,c1,c2,c3); - mul_add_c(3,6,c1,c2,c3); - mul_add_c(4,5,c1,c2,c3); - mul_add_c(5,4,c1,c2,c3); - mul_add_c(6,3,c1,c2,c3); - mul_add_c(7,2,c1,c2,c3); - stg c1,9*8(%r2) - lghi c1,0 - - mul_add_c(7,3,c2,c3,c1); - mul_add_c(6,4,c2,c3,c1); - mul_add_c(5,5,c2,c3,c1); - mul_add_c(4,6,c2,c3,c1); - mul_add_c(3,7,c2,c3,c1); - stg c2,10*8(%r2) - lghi c2,0 - - mul_add_c(4,7,c3,c1,c2); - mul_add_c(5,6,c3,c1,c2); - mul_add_c(6,5,c3,c1,c2); - mul_add_c(7,4,c3,c1,c2); - stg c3,11*8(%r2) - lghi c3,0 - - mul_add_c(7,5,c1,c2,c3); - mul_add_c(6,6,c1,c2,c3); - mul_add_c(5,7,c1,c2,c3); - stg c1,12*8(%r2) - lghi c1,0 - - - mul_add_c(6,7,c2,c3,c1); - mul_add_c(7,6,c2,c3,c1); - stg c2,13*8(%r2) - lghi c2,0 - - mul_add_c(7,7,c3,c1,c2); - stg c3,14*8(%r2) - stg c1,15*8(%r2) - - lmg %r6,%r8,48(%r15) - br %r14 -.size bn_mul_comba8,.-bn_mul_comba8 - -// void bn_mul_comba4(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4); -.globl bn_mul_comba4 -.type bn_mul_comba4,@function -.align 4 -bn_mul_comba4: - stmg %r6,%r8,48(%r15) - - lghi c1,0 - lghi c2,0 - lghi c3,0 - lghi zero,0 - - mul_add_c(0,0,c1,c2,c3); - stg c1,0*8(%r3) - lghi c1,0 - - mul_add_c(0,1,c2,c3,c1); - mul_add_c(1,0,c2,c3,c1); - stg c2,1*8(%r2) - lghi c2,0 - - mul_add_c(2,0,c3,c1,c2); - mul_add_c(1,1,c3,c1,c2); - mul_add_c(0,2,c3,c1,c2); - stg c3,2*8(%r2) - lghi c3,0 - - mul_add_c(0,3,c1,c2,c3); - mul_add_c(1,2,c1,c2,c3); - mul_add_c(2,1,c1,c2,c3); - mul_add_c(3,0,c1,c2,c3); - stg c1,3*8(%r2) - lghi c1,0 - - mul_add_c(3,1,c2,c3,c1); - mul_add_c(2,2,c2,c3,c1); - mul_add_c(1,3,c2,c3,c1); - stg c2,4*8(%r2) - lghi c2,0 - - mul_add_c(2,3,c3,c1,c2); - mul_add_c(3,2,c3,c1,c2); - stg c3,5*8(%r2) - lghi c3,0 - - mul_add_c(3,3,c1,c2,c3); - stg c1,6*8(%r2) - stg c2,7*8(%r2) - - stmg %r6,%r8,48(%r15) - br %r14 -.size bn_mul_comba4,.-bn_mul_comba4 - -#define sqr_add_c(ai,c1,c2,c3) \ - lg %r7,ai*8(%r3); \ - mlgr %r6,%r7; \ - algr c1,%r7; \ - alcgr c2,%r6; \ - alcgr c3,zero - -#define sqr_add_c2(ai,aj,c1,c2,c3) \ - lg %r7,ai*8(%r3); \ - mlg %r6,aj*8(%r3); \ - algr c1,%r7; \ - alcgr c2,%r6; \ - alcgr c3,zero; \ - algr c1,%r7; \ - alcgr c2,%r6; \ - alcgr c3,zero - -// void bn_sqr_comba8(BN_ULONG *r2,BN_ULONG *r3); -.globl bn_sqr_comba8 -.type bn_sqr_comba8,@function -.align 4 -bn_sqr_comba8: - stmg %r6,%r8,48(%r15) - - lghi c1,0 - lghi c2,0 - lghi c3,0 - lghi zero,0 - - sqr_add_c(0,c1,c2,c3); - stg c1,0*8(%r2) - lghi c1,0 - - sqr_add_c2(1,0,c2,c3,c1); - stg c2,1*8(%r2) - lghi c2,0 - - sqr_add_c(1,c3,c1,c2); - sqr_add_c2(2,0,c3,c1,c2); - stg c3,2*8(%r2) - lghi c3,0 - - sqr_add_c2(3,0,c1,c2,c3); - sqr_add_c2(2,1,c1,c2,c3); - stg c1,3*8(%r2) - lghi c1,0 - - sqr_add_c(2,c2,c3,c1); - sqr_add_c2(3,1,c2,c3,c1); - sqr_add_c2(4,0,c2,c3,c1); - stg c2,4*8(%r2) - lghi c2,0 - - sqr_add_c2(5,0,c3,c1,c2); - sqr_add_c2(4,1,c3,c1,c2); - sqr_add_c2(3,2,c3,c1,c2); - stg c3,5*8(%r2) - lghi c3,0 - - sqr_add_c(3,c1,c2,c3); - sqr_add_c2(4,2,c1,c2,c3); - sqr_add_c2(5,1,c1,c2,c3); - sqr_add_c2(6,0,c1,c2,c3); - stg c1,6*8(%r2) - lghi c1,0 - - sqr_add_c2(7,0,c2,c3,c1); - sqr_add_c2(6,1,c2,c3,c1); - sqr_add_c2(5,2,c2,c3,c1); - sqr_add_c2(4,3,c2,c3,c1); - stg c2,7*8(%r2) - lghi c2,0 - - sqr_add_c(4,c3,c1,c2); - sqr_add_c2(5,3,c3,c1,c2); - sqr_add_c2(6,2,c3,c1,c2); - sqr_add_c2(7,1,c3,c1,c2); - stg c3,8*8(%r2) - lghi c3,0 - - sqr_add_c2(7,2,c1,c2,c3); - sqr_add_c2(6,3,c1,c2,c3); - sqr_add_c2(5,4,c1,c2,c3); - stg c1,9*8(%r2) - lghi c1,0 - - sqr_add_c(5,c2,c3,c1); - sqr_add_c2(6,4,c2,c3,c1); - sqr_add_c2(7,3,c2,c3,c1); - stg c2,10*8(%r2) - lghi c2,0 - - sqr_add_c2(7,4,c3,c1,c2); - sqr_add_c2(6,5,c3,c1,c2); - stg c3,11*8(%r2) - lghi c3,0 - - sqr_add_c(6,c1,c2,c3); - sqr_add_c2(7,5,c1,c2,c3); - stg c1,12*8(%r2) - lghi c1,0 - - sqr_add_c2(7,6,c2,c3,c1); - stg c2,13*8(%r2) - lghi c2,0 - - sqr_add_c(7,c3,c1,c2); - stg c3,14*8(%r2) - stg c1,15*8(%r2) - - lmg %r6,%r8,48(%r15) - br %r14 -.size bn_sqr_comba8,.-bn_sqr_comba8 - -// void bn_sqr_comba4(BN_ULONG *r2,BN_ULONG *r3); -.globl bn_sqr_comba4 -.type bn_sqr_comba4,@function -.align 4 -bn_sqr_comba4: - stmg %r6,%r8,48(%r15) - - lghi c1,0 - lghi c2,0 - lghi c3,0 - lghi zero,0 - - sqr_add_c(0,c1,c2,c3); - stg c1,0*8(%r2) - lghi c1,0 - - sqr_add_c2(1,0,c2,c3,c1); - stg c2,1*8(%r2) - lghi c2,0 - - sqr_add_c(1,c3,c1,c2); - sqr_add_c2(2,0,c3,c1,c2); - stg c3,2*8(%r2) - lghi c3,0 - - sqr_add_c2(3,0,c1,c2,c3); - sqr_add_c2(2,1,c1,c2,c3); - stg c1,3*8(%r2) - lghi c1,0 - - sqr_add_c(2,c2,c3,c1); - sqr_add_c2(3,1,c2,c3,c1); - stg c2,4*8(%r2) - lghi c2,0 - - sqr_add_c2(3,2,c3,c1,c2); - stg c3,5*8(%r2) - lghi c3,0 - - sqr_add_c(3,c1,c2,c3); - stg c1,6*8(%r2) - stg c2,7*8(%r2) - - lmg %r6,%r8,48(%r15) - br %r14 -.size bn_sqr_comba4,.-bn_sqr_comba4 diff --git a/crypto/openssl/crypto/bn/asm/sparcv9-mont.pl b/crypto/openssl/crypto/bn/asm/sparcv9-mont.pl deleted file mode 100755 index b8fb1e8..0000000 --- a/crypto/openssl/crypto/bn/asm/sparcv9-mont.pl +++ /dev/null @@ -1,606 +0,0 @@ -#!/usr/bin/env perl - -# ==================================================================== -# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL -# project. The module is, however, dual licensed under OpenSSL and -# CRYPTOGAMS licenses depending on where you obtain it. For further -# details see http://www.openssl.org/~appro/cryptogams/. -# ==================================================================== - -# December 2005 -# -# Pure SPARCv9/8+ and IALU-only bn_mul_mont implementation. The reasons -# for undertaken effort are multiple. First of all, UltraSPARC is not -# the whole SPARCv9 universe and other VIS-free implementations deserve -# optimized code as much. Secondly, newly introduced UltraSPARC T1, -# a.k.a. Niagara, has shared FPU and concurrent FPU-intensive pathes, -# such as sparcv9a-mont, will simply sink it. Yes, T1 is equipped with -# several integrated RSA/DSA accelerator circuits accessible through -# kernel driver [only(*)], but having decent user-land software -# implementation is important too. Finally, reasons like desire to -# experiment with dedicated squaring procedure. Yes, this module -# implements one, because it was easiest to draft it in SPARCv9 -# instructions... - -# (*) Engine accessing the driver in question is on my TODO list. -# For reference, acceleator is estimated to give 6 to 10 times -# improvement on single-threaded RSA sign. It should be noted -# that 6-10x improvement coefficient does not actually mean -# something extraordinary in terms of absolute [single-threaded] -# performance, as SPARCv9 instruction set is by all means least -# suitable for high performance crypto among other 64 bit -# platforms. 6-10x factor simply places T1 in same performance -# domain as say AMD64 and IA-64. Improvement of RSA verify don't -# appear impressive at all, but it's the sign operation which is -# far more critical/interesting. - -# You might notice that inner loops are modulo-scheduled:-) This has -# essentially negligible impact on UltraSPARC performance, it's -# Fujitsu SPARC64 V users who should notice and hopefully appreciate -# the advantage... Currently this module surpasses sparcv9a-mont.pl -# by ~20% on UltraSPARC-III and later cores, but recall that sparcv9a -# module still have hidden potential [see TODO list there], which is -# estimated to be larger than 20%... - -# int bn_mul_mont( -$rp="%i0"; # BN_ULONG *rp, -$ap="%i1"; # const BN_ULONG *ap, -$bp="%i2"; # const BN_ULONG *bp, -$np="%i3"; # const BN_ULONG *np, -$n0="%i4"; # const BN_ULONG *n0, -$num="%i5"; # int num); - -$bits=32; -for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); } -if ($bits==64) { $bias=2047; $frame=192; } -else { $bias=0; $frame=128; } - -$car0="%o0"; -$car1="%o1"; -$car2="%o2"; # 1 bit -$acc0="%o3"; -$acc1="%o4"; -$mask="%g1"; # 32 bits, what a waste... -$tmp0="%g4"; -$tmp1="%g5"; - -$i="%l0"; -$j="%l1"; -$mul0="%l2"; -$mul1="%l3"; -$tp="%l4"; -$apj="%l5"; -$npj="%l6"; -$tpj="%l7"; - -$fname="bn_mul_mont_int"; - -$code=<<___; -.section ".text",#alloc,#execinstr - -.global $fname -.align 32 -$fname: - cmp %o5,4 ! 128 bits minimum - bge,pt %icc,.Lenter - sethi %hi(0xffffffff),$mask - retl - clr %o0 -.align 32 -.Lenter: - save %sp,-$frame,%sp - sll $num,2,$num ! num*=4 - or $mask,%lo(0xffffffff),$mask - ld [$n0],$n0 - cmp $ap,$bp - and $num,$mask,$num - ld [$bp],$mul0 ! bp[0] - nop - - add %sp,$bias,%o7 ! real top of stack - ld [$ap],$car0 ! ap[0] ! redundant in squaring context - sub %o7,$num,%o7 - ld [$ap+4],$apj ! ap[1] - and %o7,-1024,%o7 - ld [$np],$car1 ! np[0] - sub %o7,$bias,%sp ! alloca - ld [$np+4],$npj ! np[1] - be,pt `$bits==32?"%icc":"%xcc"`,.Lbn_sqr_mont - mov 12,$j - - mulx $car0,$mul0,$car0 ! ap[0]*bp[0] - mulx $apj,$mul0,$tmp0 !prologue! ap[1]*bp[0] - and $car0,$mask,$acc0 - add %sp,$bias+$frame,$tp - ld [$ap+8],$apj !prologue! - - mulx $n0,$acc0,$mul1 ! "t[0]"*n0 - and $mul1,$mask,$mul1 - - mulx $car1,$mul1,$car1 ! np[0]*"t[0]"*n0 - mulx $npj,$mul1,$acc1 !prologue! np[1]*"t[0]"*n0 - srlx $car0,32,$car0 - add $acc0,$car1,$car1 - ld [$np+8],$npj !prologue! - srlx $car1,32,$car1 - mov $tmp0,$acc0 !prologue! - -.L1st: - mulx $apj,$mul0,$tmp0 - mulx $npj,$mul1,$tmp1 - add $acc0,$car0,$car0 - ld [$ap+$j],$apj ! ap[j] - and $car0,$mask,$acc0 - add $acc1,$car1,$car1 - ld [$np+$j],$npj ! np[j] - srlx $car0,32,$car0 - add $acc0,$car1,$car1 - add $j,4,$j ! j++ - mov $tmp0,$acc0 - st $car1,[$tp] - cmp $j,$num - mov $tmp1,$acc1 - srlx $car1,32,$car1 - bl %icc,.L1st - add $tp,4,$tp ! tp++ -!.L1st - - mulx $apj,$mul0,$tmp0 !epilogue! - mulx $npj,$mul1,$tmp1 - add $acc0,$car0,$car0 - and $car0,$mask,$acc0 - add $acc1,$car1,$car1 - srlx $car0,32,$car0 - add $acc0,$car1,$car1 - st $car1,[$tp] - srlx $car1,32,$car1 - - add $tmp0,$car0,$car0 - and $car0,$mask,$acc0 - add $tmp1,$car1,$car1 - srlx $car0,32,$car0 - add $acc0,$car1,$car1 - st $car1,[$tp+4] - srlx $car1,32,$car1 - - add $car0,$car1,$car1 - st $car1,[$tp+8] - srlx $car1,32,$car2 - - mov 4,$i ! i++ - ld [$bp+4],$mul0 ! bp[1] -.Louter: - add %sp,$bias+$frame,$tp - ld [$ap],$car0 ! ap[0] - ld [$ap+4],$apj ! ap[1] - ld [$np],$car1 ! np[0] - ld [$np+4],$npj ! np[1] - ld [$tp],$tmp1 ! tp[0] - ld [$tp+4],$tpj ! tp[1] - mov 12,$j - - mulx $car0,$mul0,$car0 - mulx $apj,$mul0,$tmp0 !prologue! - add $tmp1,$car0,$car0 - ld [$ap+8],$apj !prologue! - and $car0,$mask,$acc0 - - mulx $n0,$acc0,$mul1 - and $mul1,$mask,$mul1 - - mulx $car1,$mul1,$car1 - mulx $npj,$mul1,$acc1 !prologue! - srlx $car0,32,$car0 - add $acc0,$car1,$car1 - ld [$np+8],$npj !prologue! - srlx $car1,32,$car1 - mov $tmp0,$acc0 !prologue! - -.Linner: - mulx $apj,$mul0,$tmp0 - mulx $npj,$mul1,$tmp1 - add $tpj,$car0,$car0 - ld [$ap+$j],$apj ! ap[j] - add $acc0,$car0,$car0 - add $acc1,$car1,$car1 - ld [$np+$j],$npj ! np[j] - and $car0,$mask,$acc0 - ld [$tp+8],$tpj ! tp[j] - srlx $car0,32,$car0 - add $acc0,$car1,$car1 - add $j,4,$j ! j++ - mov $tmp0,$acc0 - st $car1,[$tp] ! tp[j-1] - srlx $car1,32,$car1 - mov $tmp1,$acc1 - cmp $j,$num - bl %icc,.Linner - add $tp,4,$tp ! tp++ -!.Linner - - mulx $apj,$mul0,$tmp0 !epilogue! - mulx $npj,$mul1,$tmp1 - add $tpj,$car0,$car0 - add $acc0,$car0,$car0 - ld [$tp+8],$tpj ! tp[j] - and $car0,$mask,$acc0 - add $acc1,$car1,$car1 - srlx $car0,32,$car0 - add $acc0,$car1,$car1 - st $car1,[$tp] ! tp[j-1] - srlx $car1,32,$car1 - - add $tpj,$car0,$car0 - add $tmp0,$car0,$car0 - and $car0,$mask,$acc0 - add $tmp1,$car1,$car1 - add $acc0,$car1,$car1 - st $car1,[$tp+4] ! tp[j-1] - srlx $car0,32,$car0 - add $i,4,$i ! i++ - srlx $car1,32,$car1 - - add $car0,$car1,$car1 - cmp $i,$num - add $car2,$car1,$car1 - st $car1,[$tp+8] - - srlx $car1,32,$car2 - bl,a %icc,.Louter - ld [$bp+$i],$mul0 ! bp[i] -!.Louter - - add $tp,12,$tp - -.Ltail: - add $np,$num,$np - add $rp,$num,$rp - mov $tp,$ap - sub %g0,$num,%o7 ! k=-num - ba .Lsub - subcc %g0,%g0,%g0 ! clear %icc.c -.align 16 -.Lsub: - ld [$tp+%o7],%o0 - ld [$np+%o7],%o1 - subccc %o0,%o1,%o1 ! tp[j]-np[j] - add $rp,%o7,$i - add %o7,4,%o7 - brnz %o7,.Lsub - st %o1,[$i] - subc $car2,0,$car2 ! handle upmost overflow bit - and $tp,$car2,$ap - andn $rp,$car2,$np - or $ap,$np,$ap - sub %g0,$num,%o7 - -.Lcopy: - ld [$ap+%o7],%o0 ! copy or in-place refresh - st %g0,[$tp+%o7] ! zap tp - st %o0,[$rp+%o7] - add %o7,4,%o7 - brnz %o7,.Lcopy - nop - mov 1,%i0 - ret - restore -___ - -######## -######## .Lbn_sqr_mont gives up to 20% *overall* improvement over -######## code without following dedicated squaring procedure. -######## -$sbit="%i2"; # re-use $bp! - -$code.=<<___; -.align 32 -.Lbn_sqr_mont: - mulx $mul0,$mul0,$car0 ! ap[0]*ap[0] - mulx $apj,$mul0,$tmp0 !prologue! - and $car0,$mask,$acc0 - add %sp,$bias+$frame,$tp - ld [$ap+8],$apj !prologue! - - mulx $n0,$acc0,$mul1 ! "t[0]"*n0 - srlx $car0,32,$car0 - and $mul1,$mask,$mul1 - - mulx $car1,$mul1,$car1 ! np[0]*"t[0]"*n0 - mulx $npj,$mul1,$acc1 !prologue! - and $car0,1,$sbit - ld [$np+8],$npj !prologue! - srlx $car0,1,$car0 - add $acc0,$car1,$car1 - srlx $car1,32,$car1 - mov $tmp0,$acc0 !prologue! - -.Lsqr_1st: - mulx $apj,$mul0,$tmp0 - mulx $npj,$mul1,$tmp1 - add $acc0,$car0,$car0 ! ap[j]*a0+c0 - add $acc1,$car1,$car1 - ld [$ap+$j],$apj ! ap[j] - and $car0,$mask,$acc0 - ld [$np+$j],$npj ! np[j] - srlx $car0,32,$car0 - add $acc0,$acc0,$acc0 - or $sbit,$acc0,$acc0 - mov $tmp1,$acc1 - srlx $acc0,32,$sbit - add $j,4,$j ! j++ - and $acc0,$mask,$acc0 - cmp $j,$num - add $acc0,$car1,$car1 - st $car1,[$tp] - mov $tmp0,$acc0 - srlx $car1,32,$car1 - bl %icc,.Lsqr_1st - add $tp,4,$tp ! tp++ -!.Lsqr_1st - - mulx $apj,$mul0,$tmp0 ! epilogue - mulx $npj,$mul1,$tmp1 - add $acc0,$car0,$car0 ! ap[j]*a0+c0 - add $acc1,$car1,$car1 - and $car0,$mask,$acc0 - srlx $car0,32,$car0 - add $acc0,$acc0,$acc0 - or $sbit,$acc0,$acc0 - srlx $acc0,32,$sbit - and $acc0,$mask,$acc0 - add $acc0,$car1,$car1 - st $car1,[$tp] - srlx $car1,32,$car1 - - add $tmp0,$car0,$car0 ! ap[j]*a0+c0 - add $tmp1,$car1,$car1 - and $car0,$mask,$acc0 - srlx $car0,32,$car0 - add $acc0,$acc0,$acc0 - or $sbit,$acc0,$acc0 - srlx $acc0,32,$sbit - and $acc0,$mask,$acc0 - add $acc0,$car1,$car1 - st $car1,[$tp+4] - srlx $car1,32,$car1 - - add $car0,$car0,$car0 - or $sbit,$car0,$car0 - add $car0,$car1,$car1 - st $car1,[$tp+8] - srlx $car1,32,$car2 - - ld [%sp+$bias+$frame],$tmp0 ! tp[0] - ld [%sp+$bias+$frame+4],$tmp1 ! tp[1] - ld [%sp+$bias+$frame+8],$tpj ! tp[2] - ld [$ap+4],$mul0 ! ap[1] - ld [$ap+8],$apj ! ap[2] - ld [$np],$car1 ! np[0] - ld [$np+4],$npj ! np[1] - mulx $n0,$tmp0,$mul1 - - mulx $mul0,$mul0,$car0 - and $mul1,$mask,$mul1 - - mulx $car1,$mul1,$car1 - mulx $npj,$mul1,$acc1 - add $tmp0,$car1,$car1 - and $car0,$mask,$acc0 - ld [$np+8],$npj ! np[2] - srlx $car1,32,$car1 - add $tmp1,$car1,$car1 - srlx $car0,32,$car0 - add $acc0,$car1,$car1 - and $car0,1,$sbit - add $acc1,$car1,$car1 - srlx $car0,1,$car0 - mov 12,$j - st $car1,[%sp+$bias+$frame] ! tp[0]= - srlx $car1,32,$car1 - add %sp,$bias+$frame+4,$tp - -.Lsqr_2nd: - mulx $apj,$mul0,$acc0 - mulx $npj,$mul1,$acc1 - add $acc0,$car0,$car0 - add $tpj,$car1,$car1 - ld [$ap+$j],$apj ! ap[j] - and $car0,$mask,$acc0 - ld [$np+$j],$npj ! np[j] - srlx $car0,32,$car0 - add $acc1,$car1,$car1 - ld [$tp+8],$tpj ! tp[j] - add $acc0,$acc0,$acc0 - add $j,4,$j ! j++ - or $sbit,$acc0,$acc0 - srlx $acc0,32,$sbit - and $acc0,$mask,$acc0 - cmp $j,$num - add $acc0,$car1,$car1 - st $car1,[$tp] ! tp[j-1] - srlx $car1,32,$car1 - bl %icc,.Lsqr_2nd - add $tp,4,$tp ! tp++ -!.Lsqr_2nd - - mulx $apj,$mul0,$acc0 - mulx $npj,$mul1,$acc1 - add $acc0,$car0,$car0 - add $tpj,$car1,$car1 - and $car0,$mask,$acc0 - srlx $car0,32,$car0 - add $acc1,$car1,$car1 - add $acc0,$acc0,$acc0 - or $sbit,$acc0,$acc0 - srlx $acc0,32,$sbit - and $acc0,$mask,$acc0 - add $acc0,$car1,$car1 - st $car1,[$tp] ! tp[j-1] - srlx $car1,32,$car1 - - add $car0,$car0,$car0 - or $sbit,$car0,$car0 - add $car0,$car1,$car1 - add $car2,$car1,$car1 - st $car1,[$tp+4] - srlx $car1,32,$car2 - - ld [%sp+$bias+$frame],$tmp1 ! tp[0] - ld [%sp+$bias+$frame+4],$tpj ! tp[1] - ld [$ap+8],$mul0 ! ap[2] - ld [$np],$car1 ! np[0] - ld [$np+4],$npj ! np[1] - mulx $n0,$tmp1,$mul1 - and $mul1,$mask,$mul1 - mov 8,$i - - mulx $mul0,$mul0,$car0 - mulx $car1,$mul1,$car1 - and $car0,$mask,$acc0 - add $tmp1,$car1,$car1 - srlx $car0,32,$car0 - add %sp,$bias+$frame,$tp - srlx $car1,32,$car1 - and $car0,1,$sbit - srlx $car0,1,$car0 - mov 4,$j - -.Lsqr_outer: -.Lsqr_inner1: - mulx $npj,$mul1,$acc1 - add $tpj,$car1,$car1 - add $j,4,$j - ld [$tp+8],$tpj - cmp $j,$i - add $acc1,$car1,$car1 - ld [$np+$j],$npj - st $car1,[$tp] - srlx $car1,32,$car1 - bl %icc,.Lsqr_inner1 - add $tp,4,$tp -!.Lsqr_inner1 - - add $j,4,$j - ld [$ap+$j],$apj ! ap[j] - mulx $npj,$mul1,$acc1 - add $tpj,$car1,$car1 - ld [$np+$j],$npj ! np[j] - add $acc0,$car1,$car1 - ld [$tp+8],$tpj ! tp[j] - add $acc1,$car1,$car1 - st $car1,[$tp] - srlx $car1,32,$car1 - - add $j,4,$j - cmp $j,$num - be,pn %icc,.Lsqr_no_inner2 - add $tp,4,$tp - -.Lsqr_inner2: - mulx $apj,$mul0,$acc0 - mulx $npj,$mul1,$acc1 - add $tpj,$car1,$car1 - add $acc0,$car0,$car0 - ld [$ap+$j],$apj ! ap[j] - and $car0,$mask,$acc0 - ld [$np+$j],$npj ! np[j] - srlx $car0,32,$car0 - add $acc0,$acc0,$acc0 - ld [$tp+8],$tpj ! tp[j] - or $sbit,$acc0,$acc0 - add $j,4,$j ! j++ - srlx $acc0,32,$sbit - and $acc0,$mask,$acc0 - cmp $j,$num - add $acc0,$car1,$car1 - add $acc1,$car1,$car1 - st $car1,[$tp] ! tp[j-1] - srlx $car1,32,$car1 - bl %icc,.Lsqr_inner2 - add $tp,4,$tp ! tp++ - -.Lsqr_no_inner2: - mulx $apj,$mul0,$acc0 - mulx $npj,$mul1,$acc1 - add $tpj,$car1,$car1 - add $acc0,$car0,$car0 - and $car0,$mask,$acc0 - srlx $car0,32,$car0 - add $acc0,$acc0,$acc0 - or $sbit,$acc0,$acc0 - srlx $acc0,32,$sbit - and $acc0,$mask,$acc0 - add $acc0,$car1,$car1 - add $acc1,$car1,$car1 - st $car1,[$tp] ! tp[j-1] - srlx $car1,32,$car1 - - add $car0,$car0,$car0 - or $sbit,$car0,$car0 - add $car0,$car1,$car1 - add $car2,$car1,$car1 - st $car1,[$tp+4] - srlx $car1,32,$car2 - - add $i,4,$i ! i++ - ld [%sp+$bias+$frame],$tmp1 ! tp[0] - ld [%sp+$bias+$frame+4],$tpj ! tp[1] - ld [$ap+$i],$mul0 ! ap[j] - ld [$np],$car1 ! np[0] - ld [$np+4],$npj ! np[1] - mulx $n0,$tmp1,$mul1 - and $mul1,$mask,$mul1 - add $i,4,$tmp0 - - mulx $mul0,$mul0,$car0 - mulx $car1,$mul1,$car1 - and $car0,$mask,$acc0 - add $tmp1,$car1,$car1 - srlx $car0,32,$car0 - add %sp,$bias+$frame,$tp - srlx $car1,32,$car1 - and $car0,1,$sbit - srlx $car0,1,$car0 - - cmp $tmp0,$num ! i<num-1 - bl %icc,.Lsqr_outer - mov 4,$j - -.Lsqr_last: - mulx $npj,$mul1,$acc1 - add $tpj,$car1,$car1 - add $j,4,$j - ld [$tp+8],$tpj - cmp $j,$i - add $acc1,$car1,$car1 - ld [$np+$j],$npj - st $car1,[$tp] - srlx $car1,32,$car1 - bl %icc,.Lsqr_last - add $tp,4,$tp -!.Lsqr_last - - mulx $npj,$mul1,$acc1 - add $tpj,$car1,$car1 - add $acc0,$car1,$car1 - add $acc1,$car1,$car1 - st $car1,[$tp] - srlx $car1,32,$car1 - - add $car0,$car0,$car0 ! recover $car0 - or $sbit,$car0,$car0 - add $car0,$car1,$car1 - add $car2,$car1,$car1 - st $car1,[$tp+4] - srlx $car1,32,$car2 - - ba .Ltail - add $tp,8,$tp -.type $fname,#function -.size $fname,(.-$fname) -.asciz "Montgomery Multipltication for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>" -.align 32 -___ -$code =~ s/\`([^\`]*)\`/eval($1)/gem; -print $code; -close STDOUT; diff --git a/crypto/openssl/crypto/bn/asm/sparcv9a-mont.pl b/crypto/openssl/crypto/bn/asm/sparcv9a-mont.pl deleted file mode 100755 index a14205f..0000000 --- a/crypto/openssl/crypto/bn/asm/sparcv9a-mont.pl +++ /dev/null @@ -1,882 +0,0 @@ -#!/usr/bin/env perl - -# ==================================================================== -# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL -# project. The module is, however, dual licensed under OpenSSL and -# CRYPTOGAMS licenses depending on where you obtain it. For further -# details see http://www.openssl.org/~appro/cryptogams/. -# ==================================================================== - -# October 2005 -# -# "Teaser" Montgomery multiplication module for UltraSPARC. Why FPU? -# Because unlike integer multiplier, which simply stalls whole CPU, -# FPU is fully pipelined and can effectively emit 48 bit partial -# product every cycle. Why not blended SPARC v9? One can argue that -# making this module dependent on UltraSPARC VIS extension limits its -# binary compatibility. Well yes, it does exclude SPARC64 prior-V(!) -# implementations from compatibility matrix. But the rest, whole Sun -# UltraSPARC family and brand new Fujitsu's SPARC64 V, all support -# VIS extension instructions used in this module. This is considered -# good enough to not care about HAL SPARC64 users [if any] who have -# integer-only pure SPARCv9 module to "fall down" to. - -# USI&II cores currently exhibit uniform 2x improvement [over pre- -# bn_mul_mont codebase] for all key lengths and benchmarks. On USIII -# performance improves few percents for shorter keys and worsens few -# percents for longer keys. This is because USIII integer multiplier -# is >3x faster than USI&II one, which is harder to match [but see -# TODO list below]. It should also be noted that SPARC64 V features -# out-of-order execution, which *might* mean that integer multiplier -# is pipelined, which in turn *might* be impossible to match... On -# additional note, SPARC64 V implements FP Multiply-Add instruction, -# which is perfectly usable in this context... In other words, as far -# as Fujitsu SPARC64 V goes, talk to the author:-) - -# The implementation implies following "non-natural" limitations on -# input arguments: -# - num may not be less than 4; -# - num has to be even; -# Failure to meet either condition has no fatal effects, simply -# doesn't give any performance gain. - -# TODO: -# - modulo-schedule inner loop for better performance (on in-order -# execution core such as UltraSPARC this shall result in further -# noticeable(!) improvement); -# - dedicated squaring procedure[?]; - -###################################################################### -# November 2006 -# -# Modulo-scheduled inner loops allow to interleave floating point and -# integer instructions and minimize Read-After-Write penalties. This -# results in *further* 20-50% perfromance improvement [depending on -# key length, more for longer keys] on USI&II cores and 30-80% - on -# USIII&IV. - -$fname="bn_mul_mont_fpu"; -$bits=32; -for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); } - -if ($bits==64) { - $bias=2047; - $frame=192; -} else { - $bias=0; - $frame=128; # 96 rounded up to largest known cache-line -} -$locals=64; - -# In order to provide for 32-/64-bit ABI duality, I keep integers wider -# than 32 bit in %g1-%g4 and %o0-%o5. %l0-%l7 and %i0-%i5 are used -# exclusively for pointers, indexes and other small values... -# int bn_mul_mont( -$rp="%i0"; # BN_ULONG *rp, -$ap="%i1"; # const BN_ULONG *ap, -$bp="%i2"; # const BN_ULONG *bp, -$np="%i3"; # const BN_ULONG *np, -$n0="%i4"; # const BN_ULONG *n0, -$num="%i5"; # int num); - -$tp="%l0"; # t[num] -$ap_l="%l1"; # a[num],n[num] are smashed to 32-bit words and saved -$ap_h="%l2"; # to these four vectors as double-precision FP values. -$np_l="%l3"; # This way a bunch of fxtods are eliminated in second -$np_h="%l4"; # loop and L1-cache aliasing is minimized... -$i="%l5"; -$j="%l6"; -$mask="%l7"; # 16-bit mask, 0xffff - -$n0="%g4"; # reassigned(!) to "64-bit" register -$carry="%i4"; # %i4 reused(!) for a carry bit - -# FP register naming chart -# -# ..HILO -# dcba -# -------- -# LOa -# LOb -# LOc -# LOd -# HIa -# HIb -# HIc -# HId -# ..a -# ..b -$ba="%f0"; $bb="%f2"; $bc="%f4"; $bd="%f6"; -$na="%f8"; $nb="%f10"; $nc="%f12"; $nd="%f14"; -$alo="%f16"; $alo_="%f17"; $ahi="%f18"; $ahi_="%f19"; -$nlo="%f20"; $nlo_="%f21"; $nhi="%f22"; $nhi_="%f23"; - -$dota="%f24"; $dotb="%f26"; - -$aloa="%f32"; $alob="%f34"; $aloc="%f36"; $alod="%f38"; -$ahia="%f40"; $ahib="%f42"; $ahic="%f44"; $ahid="%f46"; -$nloa="%f48"; $nlob="%f50"; $nloc="%f52"; $nlod="%f54"; -$nhia="%f56"; $nhib="%f58"; $nhic="%f60"; $nhid="%f62"; - -$ASI_FL16_P=0xD2; # magic ASI value to engage 16-bit FP load - -$code=<<___; -.section ".text",#alloc,#execinstr - -.global $fname -.align 32 -$fname: - save %sp,-$frame-$locals,%sp - - cmp $num,4 - bl,a,pn %icc,.Lret - clr %i0 - andcc $num,1,%g0 ! $num has to be even... - bnz,a,pn %icc,.Lret - clr %i0 ! signal "unsupported input value" - - srl $num,1,$num - sethi %hi(0xffff),$mask - ld [%i4+0],$n0 ! $n0 reassigned, remember? - or $mask,%lo(0xffff),$mask - ld [%i4+4],%o0 - sllx %o0,32,%o0 - or %o0,$n0,$n0 ! $n0=n0[1].n0[0] - - sll $num,3,$num ! num*=8 - - add %sp,$bias,%o0 ! real top of stack - sll $num,2,%o1 - add %o1,$num,%o1 ! %o1=num*5 - sub %o0,%o1,%o0 - and %o0,-2048,%o0 ! optimize TLB utilization - sub %o0,$bias,%sp ! alloca(5*num*8) - - rd %asi,%o7 ! save %asi - add %sp,$bias+$frame+$locals,$tp - add $tp,$num,$ap_l - add $ap_l,$num,$ap_l ! [an]p_[lh] point at the vectors' ends ! - add $ap_l,$num,$ap_h - add $ap_h,$num,$np_l - add $np_l,$num,$np_h - - wr %g0,$ASI_FL16_P,%asi ! setup %asi for 16-bit FP loads - - add $rp,$num,$rp ! readjust input pointers to point - add $ap,$num,$ap ! at the ends too... - add $bp,$num,$bp - add $np,$num,$np - - stx %o7,[%sp+$bias+$frame+48] ! save %asi - - sub %g0,$num,$i ! i=-num - sub %g0,$num,$j ! j=-num - - add $ap,$j,%o3 - add $bp,$i,%o4 - - ld [%o3+4],%g1 ! bp[0] - ld [%o3+0],%o0 - ld [%o4+4],%g5 ! ap[0] - sllx %g1,32,%g1 - ld [%o4+0],%o1 - sllx %g5,32,%g5 - or %g1,%o0,%o0 - or %g5,%o1,%o1 - - add $np,$j,%o5 - - mulx %o1,%o0,%o0 ! ap[0]*bp[0] - mulx $n0,%o0,%o0 ! ap[0]*bp[0]*n0 - stx %o0,[%sp+$bias+$frame+0] - - ld [%o3+0],$alo_ ! load a[j] as pair of 32-bit words - fzeros $alo - ld [%o3+4],$ahi_ - fzeros $ahi - ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words - fzeros $nlo - ld [%o5+4],$nhi_ - fzeros $nhi - - ! transfer b[i] to FPU as 4x16-bit values - ldda [%o4+2]%asi,$ba - fxtod $alo,$alo - ldda [%o4+0]%asi,$bb - fxtod $ahi,$ahi - ldda [%o4+6]%asi,$bc - fxtod $nlo,$nlo - ldda [%o4+4]%asi,$bd - fxtod $nhi,$nhi - - ! transfer ap[0]*b[0]*n0 to FPU as 4x16-bit values - ldda [%sp+$bias+$frame+6]%asi,$na - fxtod $ba,$ba - ldda [%sp+$bias+$frame+4]%asi,$nb - fxtod $bb,$bb - ldda [%sp+$bias+$frame+2]%asi,$nc - fxtod $bc,$bc - ldda [%sp+$bias+$frame+0]%asi,$nd - fxtod $bd,$bd - - std $alo,[$ap_l+$j] ! save smashed ap[j] in double format - fxtod $na,$na - std $ahi,[$ap_h+$j] - fxtod $nb,$nb - std $nlo,[$np_l+$j] ! save smashed np[j] in double format - fxtod $nc,$nc - std $nhi,[$np_h+$j] - fxtod $nd,$nd - - fmuld $alo,$ba,$aloa - fmuld $nlo,$na,$nloa - fmuld $alo,$bb,$alob - fmuld $nlo,$nb,$nlob - fmuld $alo,$bc,$aloc - faddd $aloa,$nloa,$nloa - fmuld $nlo,$nc,$nloc - fmuld $alo,$bd,$alod - faddd $alob,$nlob,$nlob - fmuld $nlo,$nd,$nlod - fmuld $ahi,$ba,$ahia - faddd $aloc,$nloc,$nloc - fmuld $nhi,$na,$nhia - fmuld $ahi,$bb,$ahib - faddd $alod,$nlod,$nlod - fmuld $nhi,$nb,$nhib - fmuld $ahi,$bc,$ahic - faddd $ahia,$nhia,$nhia - fmuld $nhi,$nc,$nhic - fmuld $ahi,$bd,$ahid - faddd $ahib,$nhib,$nhib - fmuld $nhi,$nd,$nhid - - faddd $ahic,$nhic,$dota ! $nhic - faddd $ahid,$nhid,$dotb ! $nhid - - faddd $nloc,$nhia,$nloc - faddd $nlod,$nhib,$nlod - - fdtox $nloa,$nloa - fdtox $nlob,$nlob - fdtox $nloc,$nloc - fdtox $nlod,$nlod - - std $nloa,[%sp+$bias+$frame+0] - add $j,8,$j - std $nlob,[%sp+$bias+$frame+8] - add $ap,$j,%o4 - std $nloc,[%sp+$bias+$frame+16] - add $np,$j,%o5 - std $nlod,[%sp+$bias+$frame+24] - - ld [%o4+0],$alo_ ! load a[j] as pair of 32-bit words - fzeros $alo - ld [%o4+4],$ahi_ - fzeros $ahi - ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words - fzeros $nlo - ld [%o5+4],$nhi_ - fzeros $nhi - - fxtod $alo,$alo - fxtod $ahi,$ahi - fxtod $nlo,$nlo - fxtod $nhi,$nhi - - ldx [%sp+$bias+$frame+0],%o0 - fmuld $alo,$ba,$aloa - ldx [%sp+$bias+$frame+8],%o1 - fmuld $nlo,$na,$nloa - ldx [%sp+$bias+$frame+16],%o2 - fmuld $alo,$bb,$alob - ldx [%sp+$bias+$frame+24],%o3 - fmuld $nlo,$nb,$nlob - - srlx %o0,16,%o7 - std $alo,[$ap_l+$j] ! save smashed ap[j] in double format - fmuld $alo,$bc,$aloc - add %o7,%o1,%o1 - std $ahi,[$ap_h+$j] - faddd $aloa,$nloa,$nloa - fmuld $nlo,$nc,$nloc - srlx %o1,16,%o7 - std $nlo,[$np_l+$j] ! save smashed np[j] in double format - fmuld $alo,$bd,$alod - add %o7,%o2,%o2 - std $nhi,[$np_h+$j] - faddd $alob,$nlob,$nlob - fmuld $nlo,$nd,$nlod - srlx %o2,16,%o7 - fmuld $ahi,$ba,$ahia - add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15] - faddd $aloc,$nloc,$nloc - fmuld $nhi,$na,$nhia - !and %o0,$mask,%o0 - !and %o1,$mask,%o1 - !and %o2,$mask,%o2 - !sllx %o1,16,%o1 - !sllx %o2,32,%o2 - !sllx %o3,48,%o7 - !or %o1,%o0,%o0 - !or %o2,%o0,%o0 - !or %o7,%o0,%o0 ! 64-bit result - srlx %o3,16,%g1 ! 34-bit carry - fmuld $ahi,$bb,$ahib - - faddd $alod,$nlod,$nlod - fmuld $nhi,$nb,$nhib - fmuld $ahi,$bc,$ahic - faddd $ahia,$nhia,$nhia - fmuld $nhi,$nc,$nhic - fmuld $ahi,$bd,$ahid - faddd $ahib,$nhib,$nhib - fmuld $nhi,$nd,$nhid - - faddd $dota,$nloa,$nloa - faddd $dotb,$nlob,$nlob - faddd $ahic,$nhic,$dota ! $nhic - faddd $ahid,$nhid,$dotb ! $nhid - - faddd $nloc,$nhia,$nloc - faddd $nlod,$nhib,$nlod - - fdtox $nloa,$nloa - fdtox $nlob,$nlob - fdtox $nloc,$nloc - fdtox $nlod,$nlod - - std $nloa,[%sp+$bias+$frame+0] - std $nlob,[%sp+$bias+$frame+8] - addcc $j,8,$j - std $nloc,[%sp+$bias+$frame+16] - bz,pn %icc,.L1stskip - std $nlod,[%sp+$bias+$frame+24] - -.align 32 ! incidentally already aligned ! -.L1st: - add $ap,$j,%o4 - add $np,$j,%o5 - ld [%o4+0],$alo_ ! load a[j] as pair of 32-bit words - fzeros $alo - ld [%o4+4],$ahi_ - fzeros $ahi - ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words - fzeros $nlo - ld [%o5+4],$nhi_ - fzeros $nhi - - fxtod $alo,$alo - fxtod $ahi,$ahi - fxtod $nlo,$nlo - fxtod $nhi,$nhi - - ldx [%sp+$bias+$frame+0],%o0 - fmuld $alo,$ba,$aloa - ldx [%sp+$bias+$frame+8],%o1 - fmuld $nlo,$na,$nloa - ldx [%sp+$bias+$frame+16],%o2 - fmuld $alo,$bb,$alob - ldx [%sp+$bias+$frame+24],%o3 - fmuld $nlo,$nb,$nlob - - srlx %o0,16,%o7 - std $alo,[$ap_l+$j] ! save smashed ap[j] in double format - fmuld $alo,$bc,$aloc - add %o7,%o1,%o1 - std $ahi,[$ap_h+$j] - faddd $aloa,$nloa,$nloa - fmuld $nlo,$nc,$nloc - srlx %o1,16,%o7 - std $nlo,[$np_l+$j] ! save smashed np[j] in double format - fmuld $alo,$bd,$alod - add %o7,%o2,%o2 - std $nhi,[$np_h+$j] - faddd $alob,$nlob,$nlob - fmuld $nlo,$nd,$nlod - srlx %o2,16,%o7 - fmuld $ahi,$ba,$ahia - add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15] - and %o0,$mask,%o0 - faddd $aloc,$nloc,$nloc - fmuld $nhi,$na,$nhia - and %o1,$mask,%o1 - and %o2,$mask,%o2 - fmuld $ahi,$bb,$ahib - sllx %o1,16,%o1 - faddd $alod,$nlod,$nlod - fmuld $nhi,$nb,$nhib - sllx %o2,32,%o2 - fmuld $ahi,$bc,$ahic - sllx %o3,48,%o7 - or %o1,%o0,%o0 - faddd $ahia,$nhia,$nhia - fmuld $nhi,$nc,$nhic - or %o2,%o0,%o0 - fmuld $ahi,$bd,$ahid - or %o7,%o0,%o0 ! 64-bit result - faddd $ahib,$nhib,$nhib - fmuld $nhi,$nd,$nhid - addcc %g1,%o0,%o0 - faddd $dota,$nloa,$nloa - srlx %o3,16,%g1 ! 34-bit carry - faddd $dotb,$nlob,$nlob - bcs,a %xcc,.+8 - add %g1,1,%g1 - - stx %o0,[$tp] ! tp[j-1]= - - faddd $ahic,$nhic,$dota ! $nhic - faddd $ahid,$nhid,$dotb ! $nhid - - faddd $nloc,$nhia,$nloc - faddd $nlod,$nhib,$nlod - - fdtox $nloa,$nloa - fdtox $nlob,$nlob - fdtox $nloc,$nloc - fdtox $nlod,$nlod - - std $nloa,[%sp+$bias+$frame+0] - std $nlob,[%sp+$bias+$frame+8] - std $nloc,[%sp+$bias+$frame+16] - std $nlod,[%sp+$bias+$frame+24] - - addcc $j,8,$j - bnz,pt %icc,.L1st - add $tp,8,$tp - -.L1stskip: - fdtox $dota,$dota - fdtox $dotb,$dotb - - ldx [%sp+$bias+$frame+0],%o0 - ldx [%sp+$bias+$frame+8],%o1 - ldx [%sp+$bias+$frame+16],%o2 - ldx [%sp+$bias+$frame+24],%o3 - - srlx %o0,16,%o7 - std $dota,[%sp+$bias+$frame+32] - add %o7,%o1,%o1 - std $dotb,[%sp+$bias+$frame+40] - srlx %o1,16,%o7 - add %o7,%o2,%o2 - srlx %o2,16,%o7 - add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15] - and %o0,$mask,%o0 - and %o1,$mask,%o1 - and %o2,$mask,%o2 - sllx %o1,16,%o1 - sllx %o2,32,%o2 - sllx %o3,48,%o7 - or %o1,%o0,%o0 - or %o2,%o0,%o0 - or %o7,%o0,%o0 ! 64-bit result - ldx [%sp+$bias+$frame+32],%o4 - addcc %g1,%o0,%o0 - ldx [%sp+$bias+$frame+40],%o5 - srlx %o3,16,%g1 ! 34-bit carry - bcs,a %xcc,.+8 - add %g1,1,%g1 - - stx %o0,[$tp] ! tp[j-1]= - add $tp,8,$tp - - srlx %o4,16,%o7 - add %o7,%o5,%o5 - and %o4,$mask,%o4 - sllx %o5,16,%o7 - or %o7,%o4,%o4 - addcc %g1,%o4,%o4 - srlx %o5,48,%g1 - bcs,a %xcc,.+8 - add %g1,1,%g1 - - mov %g1,$carry - stx %o4,[$tp] ! tp[num-1]= - - ba .Louter - add $i,8,$i -.align 32 -.Louter: - sub %g0,$num,$j ! j=-num - add %sp,$bias+$frame+$locals,$tp - - add $ap,$j,%o3 - add $bp,$i,%o4 - - ld [%o3+4],%g1 ! bp[i] - ld [%o3+0],%o0 - ld [%o4+4],%g5 ! ap[0] - sllx %g1,32,%g1 - ld [%o4+0],%o1 - sllx %g5,32,%g5 - or %g1,%o0,%o0 - or %g5,%o1,%o1 - - ldx [$tp],%o2 ! tp[0] - mulx %o1,%o0,%o0 - addcc %o2,%o0,%o0 - mulx $n0,%o0,%o0 ! (ap[0]*bp[i]+t[0])*n0 - stx %o0,[%sp+$bias+$frame+0] - - ! transfer b[i] to FPU as 4x16-bit values - ldda [%o4+2]%asi,$ba - ldda [%o4+0]%asi,$bb - ldda [%o4+6]%asi,$bc - ldda [%o4+4]%asi,$bd - - ! transfer (ap[0]*b[i]+t[0])*n0 to FPU as 4x16-bit values - ldda [%sp+$bias+$frame+6]%asi,$na - fxtod $ba,$ba - ldda [%sp+$bias+$frame+4]%asi,$nb - fxtod $bb,$bb - ldda [%sp+$bias+$frame+2]%asi,$nc - fxtod $bc,$bc - ldda [%sp+$bias+$frame+0]%asi,$nd - fxtod $bd,$bd - ldd [$ap_l+$j],$alo ! load a[j] in double format - fxtod $na,$na - ldd [$ap_h+$j],$ahi - fxtod $nb,$nb - ldd [$np_l+$j],$nlo ! load n[j] in double format - fxtod $nc,$nc - ldd [$np_h+$j],$nhi - fxtod $nd,$nd - - fmuld $alo,$ba,$aloa - fmuld $nlo,$na,$nloa - fmuld $alo,$bb,$alob - fmuld $nlo,$nb,$nlob - fmuld $alo,$bc,$aloc - faddd $aloa,$nloa,$nloa - fmuld $nlo,$nc,$nloc - fmuld $alo,$bd,$alod - faddd $alob,$nlob,$nlob - fmuld $nlo,$nd,$nlod - fmuld $ahi,$ba,$ahia - faddd $aloc,$nloc,$nloc - fmuld $nhi,$na,$nhia - fmuld $ahi,$bb,$ahib - faddd $alod,$nlod,$nlod - fmuld $nhi,$nb,$nhib - fmuld $ahi,$bc,$ahic - faddd $ahia,$nhia,$nhia - fmuld $nhi,$nc,$nhic - fmuld $ahi,$bd,$ahid - faddd $ahib,$nhib,$nhib - fmuld $nhi,$nd,$nhid - - faddd $ahic,$nhic,$dota ! $nhic - faddd $ahid,$nhid,$dotb ! $nhid - - faddd $nloc,$nhia,$nloc - faddd $nlod,$nhib,$nlod - - fdtox $nloa,$nloa - fdtox $nlob,$nlob - fdtox $nloc,$nloc - fdtox $nlod,$nlod - - std $nloa,[%sp+$bias+$frame+0] - std $nlob,[%sp+$bias+$frame+8] - std $nloc,[%sp+$bias+$frame+16] - add $j,8,$j - std $nlod,[%sp+$bias+$frame+24] - - ldd [$ap_l+$j],$alo ! load a[j] in double format - ldd [$ap_h+$j],$ahi - ldd [$np_l+$j],$nlo ! load n[j] in double format - ldd [$np_h+$j],$nhi - - fmuld $alo,$ba,$aloa - fmuld $nlo,$na,$nloa - fmuld $alo,$bb,$alob - fmuld $nlo,$nb,$nlob - fmuld $alo,$bc,$aloc - ldx [%sp+$bias+$frame+0],%o0 - faddd $aloa,$nloa,$nloa - fmuld $nlo,$nc,$nloc - ldx [%sp+$bias+$frame+8],%o1 - fmuld $alo,$bd,$alod - ldx [%sp+$bias+$frame+16],%o2 - faddd $alob,$nlob,$nlob - fmuld $nlo,$nd,$nlod - ldx [%sp+$bias+$frame+24],%o3 - fmuld $ahi,$ba,$ahia - - srlx %o0,16,%o7 - faddd $aloc,$nloc,$nloc - fmuld $nhi,$na,$nhia - add %o7,%o1,%o1 - fmuld $ahi,$bb,$ahib - srlx %o1,16,%o7 - faddd $alod,$nlod,$nlod - fmuld $nhi,$nb,$nhib - add %o7,%o2,%o2 - fmuld $ahi,$bc,$ahic - srlx %o2,16,%o7 - faddd $ahia,$nhia,$nhia - fmuld $nhi,$nc,$nhic - add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15] - ! why? - and %o0,$mask,%o0 - fmuld $ahi,$bd,$ahid - and %o1,$mask,%o1 - and %o2,$mask,%o2 - faddd $ahib,$nhib,$nhib - fmuld $nhi,$nd,$nhid - sllx %o1,16,%o1 - faddd $dota,$nloa,$nloa - sllx %o2,32,%o2 - faddd $dotb,$nlob,$nlob - sllx %o3,48,%o7 - or %o1,%o0,%o0 - faddd $ahic,$nhic,$dota ! $nhic - or %o2,%o0,%o0 - faddd $ahid,$nhid,$dotb ! $nhid - or %o7,%o0,%o0 ! 64-bit result - ldx [$tp],%o7 - faddd $nloc,$nhia,$nloc - addcc %o7,%o0,%o0 - ! end-of-why? - faddd $nlod,$nhib,$nlod - srlx %o3,16,%g1 ! 34-bit carry - fdtox $nloa,$nloa - bcs,a %xcc,.+8 - add %g1,1,%g1 - - fdtox $nlob,$nlob - fdtox $nloc,$nloc - fdtox $nlod,$nlod - - std $nloa,[%sp+$bias+$frame+0] - std $nlob,[%sp+$bias+$frame+8] - addcc $j,8,$j - std $nloc,[%sp+$bias+$frame+16] - bz,pn %icc,.Linnerskip - std $nlod,[%sp+$bias+$frame+24] - - ba .Linner - nop -.align 32 -.Linner: - ldd [$ap_l+$j],$alo ! load a[j] in double format - ldd [$ap_h+$j],$ahi - ldd [$np_l+$j],$nlo ! load n[j] in double format - ldd [$np_h+$j],$nhi - - fmuld $alo,$ba,$aloa - fmuld $nlo,$na,$nloa - fmuld $alo,$bb,$alob - fmuld $nlo,$nb,$nlob - fmuld $alo,$bc,$aloc - ldx [%sp+$bias+$frame+0],%o0 - faddd $aloa,$nloa,$nloa - fmuld $nlo,$nc,$nloc - ldx [%sp+$bias+$frame+8],%o1 - fmuld $alo,$bd,$alod - ldx [%sp+$bias+$frame+16],%o2 - faddd $alob,$nlob,$nlob - fmuld $nlo,$nd,$nlod - ldx [%sp+$bias+$frame+24],%o3 - fmuld $ahi,$ba,$ahia - - srlx %o0,16,%o7 - faddd $aloc,$nloc,$nloc - fmuld $nhi,$na,$nhia - add %o7,%o1,%o1 - fmuld $ahi,$bb,$ahib - srlx %o1,16,%o7 - faddd $alod,$nlod,$nlod - fmuld $nhi,$nb,$nhib - add %o7,%o2,%o2 - fmuld $ahi,$bc,$ahic - srlx %o2,16,%o7 - faddd $ahia,$nhia,$nhia - fmuld $nhi,$nc,$nhic - add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15] - and %o0,$mask,%o0 - fmuld $ahi,$bd,$ahid - and %o1,$mask,%o1 - and %o2,$mask,%o2 - faddd $ahib,$nhib,$nhib - fmuld $nhi,$nd,$nhid - sllx %o1,16,%o1 - faddd $dota,$nloa,$nloa - sllx %o2,32,%o2 - faddd $dotb,$nlob,$nlob - sllx %o3,48,%o7 - or %o1,%o0,%o0 - faddd $ahic,$nhic,$dota ! $nhic - or %o2,%o0,%o0 - faddd $ahid,$nhid,$dotb ! $nhid - or %o7,%o0,%o0 ! 64-bit result - faddd $nloc,$nhia,$nloc - addcc %g1,%o0,%o0 - ldx [$tp+8],%o7 ! tp[j] - faddd $nlod,$nhib,$nlod - srlx %o3,16,%g1 ! 34-bit carry - fdtox $nloa,$nloa - bcs,a %xcc,.+8 - add %g1,1,%g1 - fdtox $nlob,$nlob - addcc %o7,%o0,%o0 - fdtox $nloc,$nloc - bcs,a %xcc,.+8 - add %g1,1,%g1 - - stx %o0,[$tp] ! tp[j-1] - fdtox $nlod,$nlod - - std $nloa,[%sp+$bias+$frame+0] - std $nlob,[%sp+$bias+$frame+8] - std $nloc,[%sp+$bias+$frame+16] - addcc $j,8,$j - std $nlod,[%sp+$bias+$frame+24] - bnz,pt %icc,.Linner - add $tp,8,$tp - -.Linnerskip: - fdtox $dota,$dota - fdtox $dotb,$dotb - - ldx [%sp+$bias+$frame+0],%o0 - ldx [%sp+$bias+$frame+8],%o1 - ldx [%sp+$bias+$frame+16],%o2 - ldx [%sp+$bias+$frame+24],%o3 - - srlx %o0,16,%o7 - std $dota,[%sp+$bias+$frame+32] - add %o7,%o1,%o1 - std $dotb,[%sp+$bias+$frame+40] - srlx %o1,16,%o7 - add %o7,%o2,%o2 - srlx %o2,16,%o7 - add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15] - and %o0,$mask,%o0 - and %o1,$mask,%o1 - and %o2,$mask,%o2 - sllx %o1,16,%o1 - sllx %o2,32,%o2 - sllx %o3,48,%o7 - or %o1,%o0,%o0 - or %o2,%o0,%o0 - ldx [%sp+$bias+$frame+32],%o4 - or %o7,%o0,%o0 ! 64-bit result - ldx [%sp+$bias+$frame+40],%o5 - addcc %g1,%o0,%o0 - ldx [$tp+8],%o7 ! tp[j] - srlx %o3,16,%g1 ! 34-bit carry - bcs,a %xcc,.+8 - add %g1,1,%g1 - - addcc %o7,%o0,%o0 - bcs,a %xcc,.+8 - add %g1,1,%g1 - - stx %o0,[$tp] ! tp[j-1] - add $tp,8,$tp - - srlx %o4,16,%o7 - add %o7,%o5,%o5 - and %o4,$mask,%o4 - sllx %o5,16,%o7 - or %o7,%o4,%o4 - addcc %g1,%o4,%o4 - srlx %o5,48,%g1 - bcs,a %xcc,.+8 - add %g1,1,%g1 - - addcc $carry,%o4,%o4 - stx %o4,[$tp] ! tp[num-1] - mov %g1,$carry - bcs,a %xcc,.+8 - add $carry,1,$carry - - addcc $i,8,$i - bnz %icc,.Louter - nop - - add $tp,8,$tp ! adjust tp to point at the end - orn %g0,%g0,%g4 - sub %g0,$num,%o7 ! n=-num - ba .Lsub - subcc %g0,%g0,%g0 ! clear %icc.c - -.align 32 -.Lsub: - ldx [$tp+%o7],%o0 - add $np,%o7,%g1 - ld [%g1+0],%o2 - ld [%g1+4],%o3 - srlx %o0,32,%o1 - subccc %o0,%o2,%o2 - add $rp,%o7,%g1 - subccc %o1,%o3,%o3 - st %o2,[%g1+0] - add %o7,8,%o7 - brnz,pt %o7,.Lsub - st %o3,[%g1+4] - subc $carry,0,%g4 - sub %g0,$num,%o7 ! n=-num - ba .Lcopy - nop - -.align 32 -.Lcopy: - ldx [$tp+%o7],%o0 - add $rp,%o7,%g1 - ld [%g1+0],%o2 - ld [%g1+4],%o3 - stx %g0,[$tp+%o7] - and %o0,%g4,%o0 - srlx %o0,32,%o1 - andn %o2,%g4,%o2 - andn %o3,%g4,%o3 - or %o2,%o0,%o0 - or %o3,%o1,%o1 - st %o0,[%g1+0] - add %o7,8,%o7 - brnz,pt %o7,.Lcopy - st %o1,[%g1+4] - sub %g0,$num,%o7 ! n=-num - -.Lzap: - stx %g0,[$ap_l+%o7] - stx %g0,[$ap_h+%o7] - stx %g0,[$np_l+%o7] - stx %g0,[$np_h+%o7] - add %o7,8,%o7 - brnz,pt %o7,.Lzap - nop - - ldx [%sp+$bias+$frame+48],%o7 - wr %g0,%o7,%asi ! restore %asi - - mov 1,%i0 -.Lret: - ret - restore -.type $fname,#function -.size $fname,(.-$fname) -.asciz "Montgomery Multipltication for UltraSPARC, CRYPTOGAMS by <appro\@openssl.org>" -.align 32 -___ - -$code =~ s/\`([^\`]*)\`/eval($1)/gem; - -# Below substitution makes it possible to compile without demanding -# VIS extentions on command line, e.g. -xarch=v9 vs. -xarch=v9a. I -# dare to do this, because VIS capability is detected at run-time now -# and this routine is not called on CPU not capable to execute it. Do -# note that fzeros is not the only VIS dependency! Another dependency -# is implicit and is just _a_ numerical value loaded to %asi register, -# which assembler can't recognize as VIS specific... -$code =~ s/fzeros\s+%f([0-9]+)/ - sprintf(".word\t0x%x\t! fzeros %%f%d",0x81b00c20|($1<<25),$1) - /gem; - -print $code; -# flush -close STDOUT; diff --git a/crypto/openssl/crypto/bn/asm/via-mont.pl b/crypto/openssl/crypto/bn/asm/via-mont.pl deleted file mode 100755 index c046a51..0000000 --- a/crypto/openssl/crypto/bn/asm/via-mont.pl +++ /dev/null @@ -1,242 +0,0 @@ -#!/usr/bin/env perl -# -# ==================================================================== -# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL -# project. The module is, however, dual licensed under OpenSSL and -# CRYPTOGAMS licenses depending on where you obtain it. For further -# details see http://www.openssl.org/~appro/cryptogams/. -# ==================================================================== -# -# Wrapper around 'rep montmul', VIA-specific instruction accessing -# PadLock Montgomery Multiplier. The wrapper is designed as drop-in -# replacement for OpenSSL bn_mul_mont [first implemented in 0.9.9]. -# -# Below are interleaved outputs from 'openssl speed rsa dsa' for 4 -# different software configurations on 1.5GHz VIA Esther processor. -# Lines marked with "software integer" denote performance of hand- -# coded integer-only assembler found in OpenSSL 0.9.7. "Software SSE2" -# refers to hand-coded SSE2 Montgomery multiplication procedure found -# OpenSSL 0.9.9. "Hardware VIA SDK" refers to padlock_pmm routine from -# Padlock SDK 2.0.1 available for download from VIA, which naturally -# utilizes the magic 'repz montmul' instruction. And finally "hardware -# this" refers to *this* implementation which also uses 'repz montmul' -# -# sign verify sign/s verify/s -# rsa 512 bits 0.001720s 0.000140s 581.4 7149.7 software integer -# rsa 512 bits 0.000690s 0.000086s 1450.3 11606.0 software SSE2 -# rsa 512 bits 0.006136s 0.000201s 163.0 4974.5 hardware VIA SDK -# rsa 512 bits 0.000712s 0.000050s 1404.9 19858.5 hardware this -# -# rsa 1024 bits 0.008518s 0.000413s 117.4 2420.8 software integer -# rsa 1024 bits 0.004275s 0.000277s 233.9 3609.7 software SSE2 -# rsa 1024 bits 0.012136s 0.000260s 82.4 3844.5 hardware VIA SDK -# rsa 1024 bits 0.002522s 0.000116s 396.5 8650.9 hardware this -# -# rsa 2048 bits 0.050101s 0.001371s 20.0 729.6 software integer -# rsa 2048 bits 0.030273s 0.001008s 33.0 991.9 software SSE2 -# rsa 2048 bits 0.030833s 0.000976s 32.4 1025.1 hardware VIA SDK -# rsa 2048 bits 0.011879s 0.000342s 84.2 2921.7 hardware this -# -# rsa 4096 bits 0.327097s 0.004859s 3.1 205.8 software integer -# rsa 4096 bits 0.229318s 0.003859s 4.4 259.2 software SSE2 -# rsa 4096 bits 0.233953s 0.003274s 4.3 305.4 hardware VIA SDK -# rsa 4096 bits 0.070493s 0.001166s 14.2 857.6 hardware this -# -# dsa 512 bits 0.001342s 0.001651s 745.2 605.7 software integer -# dsa 512 bits 0.000844s 0.000987s 1185.3 1013.1 software SSE2 -# dsa 512 bits 0.001902s 0.002247s 525.6 444.9 hardware VIA SDK -# dsa 512 bits 0.000458s 0.000524s 2182.2 1909.1 hardware this -# -# dsa 1024 bits 0.003964s 0.004926s 252.3 203.0 software integer -# dsa 1024 bits 0.002686s 0.003166s 372.3 315.8 software SSE2 -# dsa 1024 bits 0.002397s 0.002823s 417.1 354.3 hardware VIA SDK -# dsa 1024 bits 0.000978s 0.001170s 1022.2 855.0 hardware this -# -# dsa 2048 bits 0.013280s 0.016518s 75.3 60.5 software integer -# dsa 2048 bits 0.009911s 0.011522s 100.9 86.8 software SSE2 -# dsa 2048 bits 0.009542s 0.011763s 104.8 85.0 hardware VIA SDK -# dsa 2048 bits 0.002884s 0.003352s 346.8 298.3 hardware this -# -# To give you some other reference point here is output for 2.4GHz P4 -# running hand-coded SSE2 bn_mul_mont found in 0.9.9, i.e. "software -# SSE2" in above terms. -# -# rsa 512 bits 0.000407s 0.000047s 2454.2 21137.0 -# rsa 1024 bits 0.002426s 0.000141s 412.1 7100.0 -# rsa 2048 bits 0.015046s 0.000491s 66.5 2034.9 -# rsa 4096 bits 0.109770s 0.002379s 9.1 420.3 -# dsa 512 bits 0.000438s 0.000525s 2281.1 1904.1 -# dsa 1024 bits 0.001346s 0.001595s 742.7 627.0 -# dsa 2048 bits 0.004745s 0.005582s 210.7 179.1 -# -# Conclusions: -# - VIA SDK leaves a *lot* of room for improvement (which this -# implementation successfully fills:-); -# - 'rep montmul' gives up to >3x performance improvement depending on -# key length; -# - in terms of absolute performance it delivers approximately as much -# as modern out-of-order 32-bit cores [again, for longer keys]. - -$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; -push(@INC,"${dir}","${dir}../../perlasm"); -require "x86asm.pl"; - -&asm_init($ARGV[0],"via-mont.pl"); - -# int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num); -$func="bn_mul_mont_padlock"; - -$pad=16*1; # amount of reserved bytes on top of every vector - -# stack layout -$mZeroPrime=&DWP(0,"esp"); # these are specified by VIA -$A=&DWP(4,"esp"); -$B=&DWP(8,"esp"); -$T=&DWP(12,"esp"); -$M=&DWP(16,"esp"); -$scratch=&DWP(20,"esp"); -$rp=&DWP(24,"esp"); # these are mine -$sp=&DWP(28,"esp"); -# &DWP(32,"esp") # 32 byte scratch area -# &DWP(64+(4*$num+$pad)*0,"esp") # padded tp[num] -# &DWP(64+(4*$num+$pad)*1,"esp") # padded copy of ap[num] -# &DWP(64+(4*$num+$pad)*2,"esp") # padded copy of bp[num] -# &DWP(64+(4*$num+$pad)*3,"esp") # padded copy of np[num] -# Note that SDK suggests to unconditionally allocate 2K per vector. This -# has quite an impact on performance. It naturally depends on key length, -# but to give an example 1024 bit private RSA key operations suffer >30% -# penalty. I allocate only as much as actually required... - -&function_begin($func); - &xor ("eax","eax"); - &mov ("ecx",&wparam(5)); # num - # meet VIA's limitations for num [note that the specification - # expresses them in bits, while we work with amount of 32-bit words] - &test ("ecx",3); - &jnz (&label("leave")); # num % 4 != 0 - &cmp ("ecx",8); - &jb (&label("leave")); # num < 8 - &cmp ("ecx",1024); - &ja (&label("leave")); # num > 1024 - - &pushf (); - &cld (); - - &mov ("edi",&wparam(0)); # rp - &mov ("eax",&wparam(1)); # ap - &mov ("ebx",&wparam(2)); # bp - &mov ("edx",&wparam(3)); # np - &mov ("esi",&wparam(4)); # n0 - &mov ("esi",&DWP(0,"esi")); # *n0 - - &lea ("ecx",&DWP($pad,"","ecx",4)); # ecx becomes vector size in bytes - &lea ("ebp",&DWP(64,"","ecx",4)); # allocate 4 vectors + 64 bytes - &neg ("ebp"); - &add ("ebp","esp"); - &and ("ebp",-64); # align to cache-line - &xchg ("ebp","esp"); # alloca - - &mov ($rp,"edi"); # save rp - &mov ($sp,"ebp"); # save esp - - &mov ($mZeroPrime,"esi"); - &lea ("esi",&DWP(64,"esp")); # tp - &mov ($T,"esi"); - &lea ("edi",&DWP(32,"esp")); # scratch area - &mov ($scratch,"edi"); - &mov ("esi","eax"); - - &lea ("ebp",&DWP(-$pad,"ecx")); - &shr ("ebp",2); # restore original num value in ebp - - &xor ("eax","eax"); - - &mov ("ecx","ebp"); - &lea ("ecx",&DWP((32+$pad)/4,"ecx"));# padded tp + scratch - &data_byte(0xf3,0xab); # rep stosl, bzero - - &mov ("ecx","ebp"); - &lea ("edi",&DWP(64+$pad,"esp","ecx",4));# pointer to ap copy - &mov ($A,"edi"); - &data_byte(0xf3,0xa5); # rep movsl, memcpy - &mov ("ecx",$pad/4); - &data_byte(0xf3,0xab); # rep stosl, bzero pad - # edi points at the end of padded ap copy... - - &mov ("ecx","ebp"); - &mov ("esi","ebx"); - &mov ($B,"edi"); - &data_byte(0xf3,0xa5); # rep movsl, memcpy - &mov ("ecx",$pad/4); - &data_byte(0xf3,0xab); # rep stosl, bzero pad - # edi points at the end of padded bp copy... - - &mov ("ecx","ebp"); - &mov ("esi","edx"); - &mov ($M,"edi"); - &data_byte(0xf3,0xa5); # rep movsl, memcpy - &mov ("ecx",$pad/4); - &data_byte(0xf3,0xab); # rep stosl, bzero pad - # edi points at the end of padded np copy... - - # let magic happen... - &mov ("ecx","ebp"); - &mov ("esi","esp"); - &shl ("ecx",5); # convert word counter to bit counter - &align (4); - &data_byte(0xf3,0x0f,0xa6,0xc0);# rep montmul - - &mov ("ecx","ebp"); - &lea ("esi",&DWP(64,"esp")); # tp - # edi still points at the end of padded np copy... - &neg ("ebp"); - &lea ("ebp",&DWP(-$pad,"edi","ebp",4)); # so just "rewind" - &mov ("edi",$rp); # restore rp - &xor ("edx","edx"); # i=0 and clear CF - -&set_label("sub",8); - &mov ("eax",&DWP(0,"esi","edx",4)); - &sbb ("eax",&DWP(0,"ebp","edx",4)); - &mov (&DWP(0,"edi","edx",4),"eax"); # rp[i]=tp[i]-np[i] - &lea ("edx",&DWP(1,"edx")); # i++ - &loop (&label("sub")); # doesn't affect CF! - - &mov ("eax",&DWP(0,"esi","edx",4)); # upmost overflow bit - &sbb ("eax",0); - &and ("esi","eax"); - ¬ ("eax"); - &mov ("ebp","edi"); - &and ("ebp","eax"); - &or ("esi","ebp"); # tp=carry?tp:rp - - &mov ("ecx","edx"); # num - &xor ("edx","edx"); # i=0 - -&set_label("copy",8); - &mov ("eax",&DWP(0,"esi","edx",4)); - &mov (&DWP(64,"esp","edx",4),"ecx"); # zap tp - &mov (&DWP(0,"edi","edx",4),"eax"); - &lea ("edx",&DWP(1,"edx")); # i++ - &loop (&label("copy")); - - &mov ("ebp",$sp); - &xor ("eax","eax"); - - &mov ("ecx",64/4); - &mov ("edi","esp"); # zap frame including scratch area - &data_byte(0xf3,0xab); # rep stosl, bzero - - # zap copies of ap, bp and np - &lea ("edi",&DWP(64+$pad,"esp","edx",4));# pointer to ap - &lea ("ecx",&DWP(3*$pad/4,"edx","edx",2)); - &data_byte(0xf3,0xab); # rep stosl, bzero - - &mov ("esp","ebp"); - &inc ("eax"); # signal "done" - &popf (); -&set_label("leave"); -&function_end($func); - -&asciz("Padlock Montgomery Multiplication, CRYPTOGAMS by <appro\@openssl.org>"); - -&asm_finish(); diff --git a/crypto/openssl/crypto/bn/asm/x86-mont.pl b/crypto/openssl/crypto/bn/asm/x86-mont.pl deleted file mode 100755 index 5cd3cd2..0000000 --- a/crypto/openssl/crypto/bn/asm/x86-mont.pl +++ /dev/null @@ -1,591 +0,0 @@ -#!/usr/bin/env perl - -# ==================================================================== -# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL -# project. The module is, however, dual licensed under OpenSSL and -# CRYPTOGAMS licenses depending on where you obtain it. For further -# details see http://www.openssl.org/~appro/cryptogams/. -# ==================================================================== - -# October 2005 -# -# This is a "teaser" code, as it can be improved in several ways... -# First of all non-SSE2 path should be implemented (yes, for now it -# performs Montgomery multiplication/convolution only on SSE2-capable -# CPUs such as P4, others fall down to original code). Then inner loop -# can be unrolled and modulo-scheduled to improve ILP and possibly -# moved to 128-bit XMM register bank (though it would require input -# rearrangement and/or increase bus bandwidth utilization). Dedicated -# squaring procedure should give further performance improvement... -# Yet, for being draft, the code improves rsa512 *sign* benchmark by -# 110%(!), rsa1024 one - by 70% and rsa4096 - by 20%:-) - -# December 2006 -# -# Modulo-scheduling SSE2 loops results in further 15-20% improvement. -# Integer-only code [being equipped with dedicated squaring procedure] -# gives ~40% on rsa512 sign benchmark... - -$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; -push(@INC,"${dir}","${dir}../../perlasm"); -require "x86asm.pl"; - -&asm_init($ARGV[0],$0); - -$sse2=0; -for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } - -&external_label("OPENSSL_ia32cap_P") if ($sse2); - -&function_begin("bn_mul_mont"); - -$i="edx"; -$j="ecx"; -$ap="esi"; $tp="esi"; # overlapping variables!!! -$rp="edi"; $bp="edi"; # overlapping variables!!! -$np="ebp"; -$num="ebx"; - -$_num=&DWP(4*0,"esp"); # stack top layout -$_rp=&DWP(4*1,"esp"); -$_ap=&DWP(4*2,"esp"); -$_bp=&DWP(4*3,"esp"); -$_np=&DWP(4*4,"esp"); -$_n0=&DWP(4*5,"esp"); $_n0q=&QWP(4*5,"esp"); -$_sp=&DWP(4*6,"esp"); -$_bpend=&DWP(4*7,"esp"); -$frame=32; # size of above frame rounded up to 16n - - &xor ("eax","eax"); - &mov ("edi",&wparam(5)); # int num - &cmp ("edi",4); - &jl (&label("just_leave")); - - &lea ("esi",&wparam(0)); # put aside pointer to argument block - &lea ("edx",&wparam(1)); # load ap - &mov ("ebp","esp"); # saved stack pointer! - &add ("edi",2); # extra two words on top of tp - &neg ("edi"); - &lea ("esp",&DWP(-$frame,"esp","edi",4)); # alloca($frame+4*(num+2)) - &neg ("edi"); - - # minimize cache contention by arraning 2K window between stack - # pointer and ap argument [np is also position sensitive vector, - # but it's assumed to be near ap, as it's allocated at ~same - # time]. - &mov ("eax","esp"); - &sub ("eax","edx"); - &and ("eax",2047); - &sub ("esp","eax"); # this aligns sp and ap modulo 2048 - - &xor ("edx","esp"); - &and ("edx",2048); - &xor ("edx",2048); - &sub ("esp","edx"); # this splits them apart modulo 4096 - - &and ("esp",-64); # align to cache line - - ################################# load argument block... - &mov ("eax",&DWP(0*4,"esi"));# BN_ULONG *rp - &mov ("ebx",&DWP(1*4,"esi"));# const BN_ULONG *ap - &mov ("ecx",&DWP(2*4,"esi"));# const BN_ULONG *bp - &mov ("edx",&DWP(3*4,"esi"));# const BN_ULONG *np - &mov ("esi",&DWP(4*4,"esi"));# const BN_ULONG *n0 - #&mov ("edi",&DWP(5*4,"esi"));# int num - - &mov ("esi",&DWP(0,"esi")); # pull n0[0] - &mov ($_rp,"eax"); # ... save a copy of argument block - &mov ($_ap,"ebx"); - &mov ($_bp,"ecx"); - &mov ($_np,"edx"); - &mov ($_n0,"esi"); - &lea ($num,&DWP(-3,"edi")); # num=num-1 to assist modulo-scheduling - #&mov ($_num,$num); # redundant as $num is not reused - &mov ($_sp,"ebp"); # saved stack pointer! - -if($sse2) { -$acc0="mm0"; # mmx register bank layout -$acc1="mm1"; -$car0="mm2"; -$car1="mm3"; -$mul0="mm4"; -$mul1="mm5"; -$temp="mm6"; -$mask="mm7"; - - &picmeup("eax","OPENSSL_ia32cap_P"); - &bt (&DWP(0,"eax"),26); - &jnc (&label("non_sse2")); - - &mov ("eax",-1); - &movd ($mask,"eax"); # mask 32 lower bits - - &mov ($ap,$_ap); # load input pointers - &mov ($bp,$_bp); - &mov ($np,$_np); - - &xor ($i,$i); # i=0 - &xor ($j,$j); # j=0 - - &movd ($mul0,&DWP(0,$bp)); # bp[0] - &movd ($mul1,&DWP(0,$ap)); # ap[0] - &movd ($car1,&DWP(0,$np)); # np[0] - - &pmuludq($mul1,$mul0); # ap[0]*bp[0] - &movq ($car0,$mul1); - &movq ($acc0,$mul1); # I wish movd worked for - &pand ($acc0,$mask); # inter-register transfers - - &pmuludq($mul1,$_n0q); # *=n0 - - &pmuludq($car1,$mul1); # "t[0]"*np[0]*n0 - &paddq ($car1,$acc0); - - &movd ($acc1,&DWP(4,$np)); # np[1] - &movd ($acc0,&DWP(4,$ap)); # ap[1] - - &psrlq ($car0,32); - &psrlq ($car1,32); - - &inc ($j); # j++ -&set_label("1st",16); - &pmuludq($acc0,$mul0); # ap[j]*bp[0] - &pmuludq($acc1,$mul1); # np[j]*m1 - &paddq ($car0,$acc0); # +=c0 - &paddq ($car1,$acc1); # +=c1 - - &movq ($acc0,$car0); - &pand ($acc0,$mask); - &movd ($acc1,&DWP(4,$np,$j,4)); # np[j+1] - &paddq ($car1,$acc0); # +=ap[j]*bp[0]; - &movd ($acc0,&DWP(4,$ap,$j,4)); # ap[j+1] - &psrlq ($car0,32); - &movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[j-1]= - &psrlq ($car1,32); - - &lea ($j,&DWP(1,$j)); - &cmp ($j,$num); - &jl (&label("1st")); - - &pmuludq($acc0,$mul0); # ap[num-1]*bp[0] - &pmuludq($acc1,$mul1); # np[num-1]*m1 - &paddq ($car0,$acc0); # +=c0 - &paddq ($car1,$acc1); # +=c1 - - &movq ($acc0,$car0); - &pand ($acc0,$mask); - &paddq ($car1,$acc0); # +=ap[num-1]*bp[0]; - &movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[num-2]= - - &psrlq ($car0,32); - &psrlq ($car1,32); - - &paddq ($car1,$car0); - &movq (&QWP($frame,"esp",$num,4),$car1); # tp[num].tp[num-1] - - &inc ($i); # i++ -&set_label("outer"); - &xor ($j,$j); # j=0 - - &movd ($mul0,&DWP(0,$bp,$i,4)); # bp[i] - &movd ($mul1,&DWP(0,$ap)); # ap[0] - &movd ($temp,&DWP($frame,"esp")); # tp[0] - &movd ($car1,&DWP(0,$np)); # np[0] - &pmuludq($mul1,$mul0); # ap[0]*bp[i] - - &paddq ($mul1,$temp); # +=tp[0] - &movq ($acc0,$mul1); - &movq ($car0,$mul1); - &pand ($acc0,$mask); - - &pmuludq($mul1,$_n0q); # *=n0 - - &pmuludq($car1,$mul1); - &paddq ($car1,$acc0); - - &movd ($temp,&DWP($frame+4,"esp")); # tp[1] - &movd ($acc1,&DWP(4,$np)); # np[1] - &movd ($acc0,&DWP(4,$ap)); # ap[1] - - &psrlq ($car0,32); - &psrlq ($car1,32); - &paddq ($car0,$temp); # +=tp[1] - - &inc ($j); # j++ - &dec ($num); -&set_label("inner"); - &pmuludq($acc0,$mul0); # ap[j]*bp[i] - &pmuludq($acc1,$mul1); # np[j]*m1 - &paddq ($car0,$acc0); # +=c0 - &paddq ($car1,$acc1); # +=c1 - - &movq ($acc0,$car0); - &movd ($temp,&DWP($frame+4,"esp",$j,4));# tp[j+1] - &pand ($acc0,$mask); - &movd ($acc1,&DWP(4,$np,$j,4)); # np[j+1] - &paddq ($car1,$acc0); # +=ap[j]*bp[i]+tp[j] - &movd ($acc0,&DWP(4,$ap,$j,4)); # ap[j+1] - &psrlq ($car0,32); - &movd (&DWP($frame-4,"esp",$j,4),$car1);# tp[j-1]= - &psrlq ($car1,32); - &paddq ($car0,$temp); # +=tp[j+1] - - &dec ($num); - &lea ($j,&DWP(1,$j)); # j++ - &jnz (&label("inner")); - - &mov ($num,$j); - &pmuludq($acc0,$mul0); # ap[num-1]*bp[i] - &pmuludq($acc1,$mul1); # np[num-1]*m1 - &paddq ($car0,$acc0); # +=c0 - &paddq ($car1,$acc1); # +=c1 - - &movq ($acc0,$car0); - &pand ($acc0,$mask); - &paddq ($car1,$acc0); # +=ap[num-1]*bp[i]+tp[num-1] - &movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[num-2]= - &psrlq ($car0,32); - &psrlq ($car1,32); - - &movd ($temp,&DWP($frame+4,"esp",$num,4)); # += tp[num] - &paddq ($car1,$car0); - &paddq ($car1,$temp); - &movq (&QWP($frame,"esp",$num,4),$car1); # tp[num].tp[num-1] - - &lea ($i,&DWP(1,$i)); # i++ - &cmp ($i,$num); - &jle (&label("outer")); - - &emms (); # done with mmx bank - &jmp (&label("common_tail")); - -&set_label("non_sse2",16); -} - -if (0) { - &mov ("esp",$_sp); - &xor ("eax","eax"); # signal "not fast enough [yet]" - &jmp (&label("just_leave")); - # While the below code provides competitive performance for - # all key lengthes on modern Intel cores, it's still more - # than 10% slower for 4096-bit key elsewhere:-( "Competitive" - # means compared to the original integer-only assembler. - # 512-bit RSA sign is better by ~40%, but that's about all - # one can say about all CPUs... -} else { -$inp="esi"; # integer path uses these registers differently -$word="edi"; -$carry="ebp"; - - &mov ($inp,$_ap); - &lea ($carry,&DWP(1,$num)); - &mov ($word,$_bp); - &xor ($j,$j); # j=0 - &mov ("edx",$inp); - &and ($carry,1); # see if num is even - &sub ("edx",$word); # see if ap==bp - &lea ("eax",&DWP(4,$word,$num,4)); # &bp[num] - &or ($carry,"edx"); - &mov ($word,&DWP(0,$word)); # bp[0] - &jz (&label("bn_sqr_mont")); - &mov ($_bpend,"eax"); - &mov ("eax",&DWP(0,$inp)); - &xor ("edx","edx"); - -&set_label("mull",16); - &mov ($carry,"edx"); - &mul ($word); # ap[j]*bp[0] - &add ($carry,"eax"); - &lea ($j,&DWP(1,$j)); - &adc ("edx",0); - &mov ("eax",&DWP(0,$inp,$j,4)); # ap[j+1] - &cmp ($j,$num); - &mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]= - &jl (&label("mull")); - - &mov ($carry,"edx"); - &mul ($word); # ap[num-1]*bp[0] - &mov ($word,$_n0); - &add ("eax",$carry); - &mov ($inp,$_np); - &adc ("edx",0); - &imul ($word,&DWP($frame,"esp")); # n0*tp[0] - - &mov (&DWP($frame,"esp",$num,4),"eax"); # tp[num-1]= - &xor ($j,$j); - &mov (&DWP($frame+4,"esp",$num,4),"edx"); # tp[num]= - &mov (&DWP($frame+8,"esp",$num,4),$j); # tp[num+1]= - - &mov ("eax",&DWP(0,$inp)); # np[0] - &mul ($word); # np[0]*m - &add ("eax",&DWP($frame,"esp")); # +=tp[0] - &mov ("eax",&DWP(4,$inp)); # np[1] - &adc ("edx",0); - &inc ($j); - - &jmp (&label("2ndmadd")); - -&set_label("1stmadd",16); - &mov ($carry,"edx"); - &mul ($word); # ap[j]*bp[i] - &add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j] - &lea ($j,&DWP(1,$j)); - &adc ("edx",0); - &add ($carry,"eax"); - &mov ("eax",&DWP(0,$inp,$j,4)); # ap[j+1] - &adc ("edx",0); - &cmp ($j,$num); - &mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]= - &jl (&label("1stmadd")); - - &mov ($carry,"edx"); - &mul ($word); # ap[num-1]*bp[i] - &add ("eax",&DWP($frame,"esp",$num,4)); # +=tp[num-1] - &mov ($word,$_n0); - &adc ("edx",0); - &mov ($inp,$_np); - &add ($carry,"eax"); - &adc ("edx",0); - &imul ($word,&DWP($frame,"esp")); # n0*tp[0] - - &xor ($j,$j); - &add ("edx",&DWP($frame+4,"esp",$num,4)); # carry+=tp[num] - &mov (&DWP($frame,"esp",$num,4),$carry); # tp[num-1]= - &adc ($j,0); - &mov ("eax",&DWP(0,$inp)); # np[0] - &mov (&DWP($frame+4,"esp",$num,4),"edx"); # tp[num]= - &mov (&DWP($frame+8,"esp",$num,4),$j); # tp[num+1]= - - &mul ($word); # np[0]*m - &add ("eax",&DWP($frame,"esp")); # +=tp[0] - &mov ("eax",&DWP(4,$inp)); # np[1] - &adc ("edx",0); - &mov ($j,1); - -&set_label("2ndmadd",16); - &mov ($carry,"edx"); - &mul ($word); # np[j]*m - &add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j] - &lea ($j,&DWP(1,$j)); - &adc ("edx",0); - &add ($carry,"eax"); - &mov ("eax",&DWP(0,$inp,$j,4)); # np[j+1] - &adc ("edx",0); - &cmp ($j,$num); - &mov (&DWP($frame-8,"esp",$j,4),$carry); # tp[j-1]= - &jl (&label("2ndmadd")); - - &mov ($carry,"edx"); - &mul ($word); # np[j]*m - &add ($carry,&DWP($frame,"esp",$num,4)); # +=tp[num-1] - &adc ("edx",0); - &add ($carry,"eax"); - &adc ("edx",0); - &mov (&DWP($frame-4,"esp",$num,4),$carry); # tp[num-2]= - - &xor ("eax","eax"); - &mov ($j,$_bp); # &bp[i] - &add ("edx",&DWP($frame+4,"esp",$num,4)); # carry+=tp[num] - &adc ("eax",&DWP($frame+8,"esp",$num,4)); # +=tp[num+1] - &lea ($j,&DWP(4,$j)); - &mov (&DWP($frame,"esp",$num,4),"edx"); # tp[num-1]= - &cmp ($j,$_bpend); - &mov (&DWP($frame+4,"esp",$num,4),"eax"); # tp[num]= - &je (&label("common_tail")); - - &mov ($word,&DWP(0,$j)); # bp[i+1] - &mov ($inp,$_ap); - &mov ($_bp,$j); # &bp[++i] - &xor ($j,$j); - &xor ("edx","edx"); - &mov ("eax",&DWP(0,$inp)); - &jmp (&label("1stmadd")); - -&set_label("bn_sqr_mont",16); -$sbit=$num; - &mov ($_num,$num); - &mov ($_bp,$j); # i=0 - - &mov ("eax",$word); # ap[0] - &mul ($word); # ap[0]*ap[0] - &mov (&DWP($frame,"esp"),"eax"); # tp[0]= - &mov ($sbit,"edx"); - &shr ("edx",1); - &and ($sbit,1); - &inc ($j); -&set_label("sqr",16); - &mov ("eax",&DWP(0,$inp,$j,4)); # ap[j] - &mov ($carry,"edx"); - &mul ($word); # ap[j]*ap[0] - &add ("eax",$carry); - &lea ($j,&DWP(1,$j)); - &adc ("edx",0); - &lea ($carry,&DWP(0,$sbit,"eax",2)); - &shr ("eax",31); - &cmp ($j,$_num); - &mov ($sbit,"eax"); - &mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]= - &jl (&label("sqr")); - - &mov ("eax",&DWP(0,$inp,$j,4)); # ap[num-1] - &mov ($carry,"edx"); - &mul ($word); # ap[num-1]*ap[0] - &add ("eax",$carry); - &mov ($word,$_n0); - &adc ("edx",0); - &mov ($inp,$_np); - &lea ($carry,&DWP(0,$sbit,"eax",2)); - &imul ($word,&DWP($frame,"esp")); # n0*tp[0] - &shr ("eax",31); - &mov (&DWP($frame,"esp",$j,4),$carry); # tp[num-1]= - - &lea ($carry,&DWP(0,"eax","edx",2)); - &mov ("eax",&DWP(0,$inp)); # np[0] - &shr ("edx",31); - &mov (&DWP($frame+4,"esp",$j,4),$carry); # tp[num]= - &mov (&DWP($frame+8,"esp",$j,4),"edx"); # tp[num+1]= - - &mul ($word); # np[0]*m - &add ("eax",&DWP($frame,"esp")); # +=tp[0] - &mov ($num,$j); - &adc ("edx",0); - &mov ("eax",&DWP(4,$inp)); # np[1] - &mov ($j,1); - -&set_label("3rdmadd",16); - &mov ($carry,"edx"); - &mul ($word); # np[j]*m - &add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j] - &adc ("edx",0); - &add ($carry,"eax"); - &mov ("eax",&DWP(4,$inp,$j,4)); # np[j+1] - &adc ("edx",0); - &mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j-1]= - - &mov ($carry,"edx"); - &mul ($word); # np[j+1]*m - &add ($carry,&DWP($frame+4,"esp",$j,4)); # +=tp[j+1] - &lea ($j,&DWP(2,$j)); - &adc ("edx",0); - &add ($carry,"eax"); - &mov ("eax",&DWP(0,$inp,$j,4)); # np[j+2] - &adc ("edx",0); - &cmp ($j,$num); - &mov (&DWP($frame-8,"esp",$j,4),$carry); # tp[j]= - &jl (&label("3rdmadd")); - - &mov ($carry,"edx"); - &mul ($word); # np[j]*m - &add ($carry,&DWP($frame,"esp",$num,4)); # +=tp[num-1] - &adc ("edx",0); - &add ($carry,"eax"); - &adc ("edx",0); - &mov (&DWP($frame-4,"esp",$num,4),$carry); # tp[num-2]= - - &mov ($j,$_bp); # i - &xor ("eax","eax"); - &mov ($inp,$_ap); - &add ("edx",&DWP($frame+4,"esp",$num,4)); # carry+=tp[num] - &adc ("eax",&DWP($frame+8,"esp",$num,4)); # +=tp[num+1] - &mov (&DWP($frame,"esp",$num,4),"edx"); # tp[num-1]= - &cmp ($j,$num); - &mov (&DWP($frame+4,"esp",$num,4),"eax"); # tp[num]= - &je (&label("common_tail")); - - &mov ($word,&DWP(4,$inp,$j,4)); # ap[i] - &lea ($j,&DWP(1,$j)); - &mov ("eax",$word); - &mov ($_bp,$j); # ++i - &mul ($word); # ap[i]*ap[i] - &add ("eax",&DWP($frame,"esp",$j,4)); # +=tp[i] - &adc ("edx",0); - &mov (&DWP($frame,"esp",$j,4),"eax"); # tp[i]= - &xor ($carry,$carry); - &cmp ($j,$num); - &lea ($j,&DWP(1,$j)); - &je (&label("sqrlast")); - - &mov ($sbit,"edx"); # zaps $num - &shr ("edx",1); - &and ($sbit,1); -&set_label("sqradd",16); - &mov ("eax",&DWP(0,$inp,$j,4)); # ap[j] - &mov ($carry,"edx"); - &mul ($word); # ap[j]*ap[i] - &add ("eax",$carry); - &lea ($carry,&DWP(0,"eax","eax")); - &adc ("edx",0); - &shr ("eax",31); - &add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j] - &lea ($j,&DWP(1,$j)); - &adc ("eax",0); - &add ($carry,$sbit); - &adc ("eax",0); - &cmp ($j,$_num); - &mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]= - &mov ($sbit,"eax"); - &jle (&label("sqradd")); - - &mov ($carry,"edx"); - &lea ("edx",&DWP(0,$sbit,"edx",2)); - &shr ($carry,31); -&set_label("sqrlast"); - &mov ($word,$_n0); - &mov ($inp,$_np); - &imul ($word,&DWP($frame,"esp")); # n0*tp[0] - - &add ("edx",&DWP($frame,"esp",$j,4)); # +=tp[num] - &mov ("eax",&DWP(0,$inp)); # np[0] - &adc ($carry,0); - &mov (&DWP($frame,"esp",$j,4),"edx"); # tp[num]= - &mov (&DWP($frame+4,"esp",$j,4),$carry); # tp[num+1]= - - &mul ($word); # np[0]*m - &add ("eax",&DWP($frame,"esp")); # +=tp[0] - &lea ($num,&DWP(-1,$j)); - &adc ("edx",0); - &mov ($j,1); - &mov ("eax",&DWP(4,$inp)); # np[1] - - &jmp (&label("3rdmadd")); -} - -&set_label("common_tail",16); - &mov ($np,$_np); # load modulus pointer - &mov ($rp,$_rp); # load result pointer - &lea ($tp,&DWP($frame,"esp")); # [$ap and $bp are zapped] - - &mov ("eax",&DWP(0,$tp)); # tp[0] - &mov ($j,$num); # j=num-1 - &xor ($i,$i); # i=0 and clear CF! - -&set_label("sub",16); - &sbb ("eax",&DWP(0,$np,$i,4)); - &mov (&DWP(0,$rp,$i,4),"eax"); # rp[i]=tp[i]-np[i] - &dec ($j); # doesn't affect CF! - &mov ("eax",&DWP(4,$tp,$i,4)); # tp[i+1] - &lea ($i,&DWP(1,$i)); # i++ - &jge (&label("sub")); - - &sbb ("eax",0); # handle upmost overflow bit - &and ($tp,"eax"); - ¬ ("eax"); - &mov ($np,$rp); - &and ($np,"eax"); - &or ($tp,$np); # tp=carry?tp:rp - -&set_label("copy",16); # copy or in-place refresh - &mov ("eax",&DWP(0,$tp,$num,4)); - &mov (&DWP(0,$rp,$num,4),"eax"); # rp[i]=tp[i] - &mov (&DWP($frame,"esp",$num,4),$j); # zap temporary vector - &dec ($num); - &jge (&label("copy")); - - &mov ("esp",$_sp); # pull saved stack pointer - &mov ("eax",1); -&set_label("just_leave"); -&function_end("bn_mul_mont"); - -&asciz("Montgomery Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>"); - -&asm_finish(); diff --git a/crypto/openssl/crypto/bn/asm/x86_64-gcc.c b/crypto/openssl/crypto/bn/asm/x86_64-gcc.c index f13f52d..2b2bc1e 100644 --- a/crypto/openssl/crypto/bn/asm/x86_64-gcc.c +++ b/crypto/openssl/crypto/bn/asm/x86_64-gcc.c @@ -1,3 +1,4 @@ +#include "../bn_lcl.h" #ifdef __SUNPRO_C # include "../bn_asm.c" /* kind of dirty hack for Sun Studio */ #else @@ -56,6 +57,9 @@ #define BN_ULONG unsigned long +#undef mul +#undef mul_add + /* * "m"(a), "+m"(r) is the way to favor DirectPath µ-code; * "g"(0) let the compiler to decide where does it @@ -97,7 +101,7 @@ : "a"(a) \ : "cc"); -BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w) +BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) { BN_ULONG c1=0; @@ -121,7 +125,7 @@ BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w) return(c1); } -BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w) +BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) { BN_ULONG c1=0; @@ -144,7 +148,7 @@ BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w) return(c1); } -void bn_sqr_words(BN_ULONG *r, BN_ULONG *a, int n) +void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n) { if (n <= 0) return; @@ -175,7 +179,7 @@ BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) return ret; } -BN_ULONG bn_add_words (BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int n) +BN_ULONG bn_add_words (BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,int n) { BN_ULONG ret=0,i=0; if (n <= 0) return 0; @@ -198,7 +202,7 @@ BN_ULONG bn_add_words (BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int n) } #ifndef SIMICS -BN_ULONG bn_sub_words (BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int n) +BN_ULONG bn_sub_words (BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,int n) { BN_ULONG ret=0,i=0; if (n <= 0) return 0; @@ -485,7 +489,7 @@ void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) r[7]=c2; } -void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a) +void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a) { BN_ULONG t1,t2; BN_ULONG c1,c2,c3; @@ -561,7 +565,7 @@ void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a) r[15]=c1; } -void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a) +void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a) { BN_ULONG t1,t2; BN_ULONG c1,c2,c3; |