author     jkim <jkim@FreeBSD.org>  2015-12-03 21:13:35 +0000
committer  jkim <jkim@FreeBSD.org>  2015-12-03 21:13:35 +0000
commit     8d77ecefb78a0e7ec702cf614a78dd85de9395ee (patch)
tree       ade84397c16fe1b20cb2a441f603826e49c36cf2 /crypto/openssl/crypto/bn/asm
parent     5374819b03f4e6dcb332bf2729f9270e5d10b83a (diff)
parent     afd52a5fc90e70242dbb0e7d29987c976eb993e0 (diff)
Merge OpenSSL 1.0.2e.
Diffstat (limited to 'crypto/openssl/crypto/bn/asm')
-rwxr-xr-x  crypto/openssl/crypto/bn/asm/armv4-gf2m.pl    |  10
-rw-r--r--  crypto/openssl/crypto/bn/asm/ia64.S            |   4
-rwxr-xr-x  crypto/openssl/crypto/bn/asm/ppc64-mont.pl     | 174
-rwxr-xr-x  crypto/openssl/crypto/bn/asm/rsaz-x86_64.pl    |   2
-rwxr-xr-x  crypto/openssl/crypto/bn/asm/s390x-gf2m.pl     |   6
-rw-r--r--  crypto/openssl/crypto/bn/asm/s390x.S           | 109
-rwxr-xr-x  crypto/openssl/crypto/bn/asm/x86-gf2m.pl       |  16
-rw-r--r--  crypto/openssl/crypto/bn/asm/x86_64-gcc.c      |   2
-rwxr-xr-x  crypto/openssl/crypto/bn/asm/x86_64-gf2m.pl    |  16
-rwxr-xr-x  crypto/openssl/crypto/bn/asm/x86_64-mont.pl    |   5
-rwxr-xr-x  crypto/openssl/crypto/bn/asm/x86_64-mont5.pl   |  27
11 files changed, 217 insertions, 154 deletions
diff --git a/crypto/openssl/crypto/bn/asm/armv4-gf2m.pl b/crypto/openssl/crypto/bn/asm/armv4-gf2m.pl
index 8f529c9..72381a7 100755
--- a/crypto/openssl/crypto/bn/asm/armv4-gf2m.pl
+++ b/crypto/openssl/crypto/bn/asm/armv4-gf2m.pl
@@ -27,7 +27,7 @@
 # referred below, which improves ECDH and ECDSA verify benchmarks
 # by 18-40%.
 #
-# Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software
+# Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software
 # Polynomial Multiplication on ARM Processors using the NEON Engine.
 #
 # http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf
@@ -136,7 +136,7 @@ ___
 ################
 # void	bn_GF2m_mul_2x2(BN_ULONG *r,
 #	BN_ULONG a1,BN_ULONG a0,
-#	BN_ULONG b1,BN_ULONG b0);	# r[3..0]=a1a0·b1b0
+#	BN_ULONG b1,BN_ULONG b0);	# r[3..0]=a1a0·b1b0
 {
 $code.=<<___;
 .global	bn_GF2m_mul_2x2
@@ -159,7 +159,7 @@ $code.=<<___;
 	mov	$mask,#7<<2
 	sub	sp,sp,#32		@ allocate tab[8]
-	bl	mul_1x1_ialu		@ a1·b1
+	bl	mul_1x1_ialu		@ a1·b1
 	str	$lo,[$ret,#8]
 	str	$hi,[$ret,#12]
@@ -169,13 +169,13 @@ $code.=<<___;
 	eor	r2,r2,$a
 	eor	$b,$b,r3
 	eor	$a,$a,r2
-	bl	mul_1x1_ialu		@ a0·b0
+	bl	mul_1x1_ialu		@ a0·b0
 	str	$lo,[$ret]
 	str	$hi,[$ret,#4]
 	eor	$a,$a,r2
 	eor	$b,$b,r3
-	bl	mul_1x1_ialu		@ (a1+a0)·(b1+b0)
+	bl	mul_1x1_ialu		@ (a1+a0)·(b1+b0)
 ___
 @r=map("r$_",(6..9));
 $code.=<<___;
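The three mul_1x1_ialu calls annotated above (a1·b1, a0·b0, (a1+a0)·(b1+b0)) are one Karatsuba step over GF(2)[x]: addition in that ring is XOR, so a 2x2-limb carry-less product needs only three 1x1 products. The same three-call pattern appears below in s390x-gf2m.pl, x86-gf2m.pl and x86_64-gf2m.pl. A minimal C sketch of the identity (illustrative only, not OpenSSL code; mul_1x1 here is a schoolbook bit loop, whereas the assembly uses lookup tables or PCLMULQDQ):

#include <stdint.h>

typedef struct { uint64_t lo, hi; } u128;

/* Schoolbook carry-less 64x64 -> 128 multiply; reference only, not constant-time. */
static u128 mul_1x1(uint64_t a, uint64_t b)
{
    u128 r = {0, 0};
    for (int i = 0; i < 64; i++)
        if ((b >> i) & 1) {
            r.lo ^= a << i;
            if (i) r.hi ^= a >> (64 - i);
        }
    return r;
}

/* r[3..0] = a1a0 * b1b0 over GF(2)[x], using three 1x1 products (Karatsuba). */
static void gf2m_mul_2x2(uint64_t r[4], uint64_t a1, uint64_t a0,
                         uint64_t b1, uint64_t b0)
{
    u128 hh = mul_1x1(a1, b1);            /* a1*b1           */
    u128 ll = mul_1x1(a0, b0);            /* a0*b0           */
    u128 mm = mul_1x1(a1 ^ a0, b1 ^ b0);  /* (a1+a0)*(b1+b0) */

    mm.lo ^= hh.lo ^ ll.lo;               /* fold out hh and ll: middle term */
    mm.hi ^= hh.hi ^ ll.hi;

    r[0] = ll.lo;
    r[1] = ll.hi ^ mm.lo;                 /* middle term straddles r[1]/r[2] */
    r[2] = hh.lo ^ mm.hi;
    r[3] = hh.hi;
}

The str/eor sequence in the assembly stores hh and ll into r[] first and then folds the third product in, which is the same recombination with the output buffer used as scratch.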
diff --git a/crypto/openssl/crypto/bn/asm/ia64.S b/crypto/openssl/crypto/bn/asm/ia64.S
index 951abc5..a9a42ab 100644
--- a/crypto/openssl/crypto/bn/asm/ia64.S
+++ b/crypto/openssl/crypto/bn/asm/ia64.S
@@ -422,7 +422,7 @@ bn_mul_add_words:
 // This loop spins in 3*(n+10) ticks on Itanium and in 2*(n+10) on
 // Itanium 2. Yes, unlike previous versions it scales:-) Previous
-// version was peforming *all* additions in IALU and was starving
+// version was performing *all* additions in IALU and was starving
 // for those even on Itanium 2. In this version one addition is
 // moved to FPU and is folded with multiplication. This is at cost
 // of propogating the result from previous call to this subroutine
@@ -568,7 +568,7 @@ bn_sqr_comba8:
 // I've estimated this routine to run in ~120 ticks, but in reality
 // (i.e. according to ar.itc) it takes ~160 ticks. Are those extra
 // cycles consumed for instructions fetch? Or did I misinterpret some
-// clause in Itanium µ-architecture manual? Comments are welcomed and
+// clause in Itanium µ-architecture manual? Comments are welcomed and
 // highly appreciated.
 //
 // On Itanium 2 it takes ~190 ticks. This is because of stalls on
diff --git a/crypto/openssl/crypto/bn/asm/ppc64-mont.pl b/crypto/openssl/crypto/bn/asm/ppc64-mont.pl
index 68e3733..9e3c12d 100755
--- a/crypto/openssl/crypto/bn/asm/ppc64-mont.pl
+++ b/crypto/openssl/crypto/bn/asm/ppc64-mont.pl
@@ -94,6 +94,8 @@ if ($flavour =~ /32/) {
 	$POP=	"ld";
 } else { die "nonsense $flavour"; }
+$LITTLE_ENDIAN = ($flavour=~/le$/) ? 4 : 0;
+
 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
 ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
 ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
@@ -294,12 +296,12 @@ $code.=<<___ if ($SIZE_T==8);
 	extrdi	$t0,$a0,32,32		; lwz	$t0,4($ap)
 	extrdi	$t1,$a0,32,0		; lwz	$t1,0($ap)
-	lwz	$t2,12($ap)		; load a[1] as 32-bit word pair
-	lwz	$t3,8($ap)
-	lwz	$t4,4($np)		; load n[0] as 32-bit word pair
-	lwz	$t5,0($np)
-	lwz	$t6,12($np)		; load n[1] as 32-bit word pair
-	lwz	$t7,8($np)
+	lwz	$t2,`12^$LITTLE_ENDIAN`($ap)	; load a[1] as 32-bit word pair
+	lwz	$t3,`8^$LITTLE_ENDIAN`($ap)
+	lwz	$t4,`4^$LITTLE_ENDIAN`($np)	; load n[0] as 32-bit word pair
+	lwz	$t5,`0^$LITTLE_ENDIAN`($np)
+	lwz	$t6,`12^$LITTLE_ENDIAN`($np)	; load n[1] as 32-bit word pair
+	lwz	$t7,`8^$LITTLE_ENDIAN`($np)
 ___
 $code.=<<___ if ($SIZE_T==4);
 	lwz	$a0,0($ap)		; pull ap[0,1] value
@@ -463,14 +465,14 @@ $code.=<<___;
 L1st:
 ___
 $code.=<<___ if ($SIZE_T==8);
-	lwz	$t0,4($ap)		; load a[j] as 32-bit word pair
-	lwz	$t1,0($ap)
-	lwz	$t2,12($ap)		; load a[j+1] as 32-bit word pair
-	lwz	$t3,8($ap)
-	lwz	$t4,4($np)		; load n[j] as 32-bit word pair
-	lwz	$t5,0($np)
-	lwz	$t6,12($np)		; load n[j+1] as 32-bit word pair
-	lwz	$t7,8($np)
+	lwz	$t0,`4^$LITTLE_ENDIAN`($ap)	; load a[j] as 32-bit word pair
+	lwz	$t1,`0^$LITTLE_ENDIAN`($ap)
+	lwz	$t2,`12^$LITTLE_ENDIAN`($ap)	; load a[j+1] as 32-bit word pair
+	lwz	$t3,`8^$LITTLE_ENDIAN`($ap)
+	lwz	$t4,`4^$LITTLE_ENDIAN`($np)	; load n[j] as 32-bit word pair
+	lwz	$t5,`0^$LITTLE_ENDIAN`($np)
+	lwz	$t6,`12^$LITTLE_ENDIAN`($np)	; load n[j+1] as 32-bit word pair
+	lwz	$t7,`8^$LITTLE_ENDIAN`($np)
 ___
 $code.=<<___ if ($SIZE_T==4);
 	lwz	$t0,0($ap)		; load a[j..j+3] as 32-bit word pairs
@@ -505,14 +507,14 @@ $code.=<<___;
 ___
 } else {
 $code.=<<___;
-	lwz	$t1,`$FRAME+0`($sp)
-	lwz	$t0,`$FRAME+4`($sp)
-	lwz	$t3,`$FRAME+8`($sp)
-	lwz	$t2,`$FRAME+12`($sp)
-	lwz	$t5,`$FRAME+16`($sp)
-	lwz	$t4,`$FRAME+20`($sp)
-	lwz	$t7,`$FRAME+24`($sp)
-	lwz	$t6,`$FRAME+28`($sp)
+	lwz	$t1,`$FRAME+0^$LITTLE_ENDIAN`($sp)
+	lwz	$t0,`$FRAME+4^$LITTLE_ENDIAN`($sp)
+	lwz	$t3,`$FRAME+8^$LITTLE_ENDIAN`($sp)
+	lwz	$t2,`$FRAME+12^$LITTLE_ENDIAN`($sp)
+	lwz	$t5,`$FRAME+16^$LITTLE_ENDIAN`($sp)
+	lwz	$t4,`$FRAME+20^$LITTLE_ENDIAN`($sp)
+	lwz	$t7,`$FRAME+24^$LITTLE_ENDIAN`($sp)
+	lwz	$t6,`$FRAME+28^$LITTLE_ENDIAN`($sp)
 ___
 }
 $code.=<<___;
@@ -651,8 +653,8 @@ $code.=<<___;
 	fmadd	$T1a,$N1,$na,$T1a
 	fmadd	$T1b,$N1,$nb,$T1b
-	lwz	$t3,`$FRAME+32`($sp)	; permuted $t1
-	lwz	$t2,`$FRAME+36`($sp)	; permuted $t0
+	lwz	$t3,`$FRAME+32^$LITTLE_ENDIAN`($sp)	; permuted $t1
+	lwz	$t2,`$FRAME+36^$LITTLE_ENDIAN`($sp)	; permuted $t0
 	addc	$t4,$t4,$carry
 	adde	$t5,$t5,$c1
 	srwi	$carry,$t4,16
@@ -673,8 +675,8 @@ $code.=<<___;
 	fmadd	$T1a,$N0,$nc,$T1a
 	fmadd	$T1b,$N0,$nd,$T1b
-	lwz	$t7,`$FRAME+40`($sp)	; permuted $t3
-	lwz	$t6,`$FRAME+44`($sp)	; permuted $t2
+	lwz	$t7,`$FRAME+40^$LITTLE_ENDIAN`($sp)	; permuted $t3
+	lwz	$t6,`$FRAME+44^$LITTLE_ENDIAN`($sp)	; permuted $t2
 	addc	$t2,$t2,$carry
 	adde	$t3,$t3,$c1
 	srwi	$carry,$t2,16
@@ -686,8 +688,8 @@ $code.=<<___;
 	insrwi	$carry,$t3,16,0
 	fmadd	$T3a,$N2,$nc,$T3a
 	fmadd	$T3b,$N2,$nd,$T3b
-	lwz	$t1,`$FRAME+48`($sp)	; permuted $t5
-	lwz	$t0,`$FRAME+52`($sp)	; permuted $t4
+	lwz	$t1,`$FRAME+48^$LITTLE_ENDIAN`($sp)	; permuted $t5
+	lwz	$t0,`$FRAME+52^$LITTLE_ENDIAN`($sp)	; permuted $t4
 	addc	$t6,$t6,$carry
 	adde	$t7,$t7,$c1
 	srwi	$carry,$t6,16
@@ -699,8 +701,8 @@ $code.=<<___;
 	fctid	$T0a,$T0a
 	fctid	$T0b,$T0b
-	lwz	$t5,`$FRAME+56`($sp)	; permuted $t7
-	lwz	$t4,`$FRAME+60`($sp)	; permuted $t6
+	lwz	$t5,`$FRAME+56^$LITTLE_ENDIAN`($sp)	; permuted $t7
+	lwz	$t4,`$FRAME+60^$LITTLE_ENDIAN`($sp)	; permuted $t6
 	addc	$t0,$t0,$carry
 	adde	$t1,$t1,$c1
 	srwi	$carry,$t0,16
@@ -787,14 +789,14 @@ $code.=<<___;
 ___
 } else {
 $code.=<<___;
-	lwz	$t1,`$FRAME+0`($sp)
-	lwz	$t0,`$FRAME+4`($sp)
-	lwz	$t3,`$FRAME+8`($sp)
-	lwz	$t2,`$FRAME+12`($sp)
-	lwz	$t5,`$FRAME+16`($sp)
-	lwz	$t4,`$FRAME+20`($sp)
-	lwz	$t7,`$FRAME+24`($sp)
-	lwz	$t6,`$FRAME+28`($sp)
+	lwz	$t1,`$FRAME+0^$LITTLE_ENDIAN`($sp)
+	lwz	$t0,`$FRAME+4^$LITTLE_ENDIAN`($sp)
+	lwz	$t3,`$FRAME+8^$LITTLE_ENDIAN`($sp)
+	lwz	$t2,`$FRAME+12^$LITTLE_ENDIAN`($sp)
+	lwz	$t5,`$FRAME+16^$LITTLE_ENDIAN`($sp)
+	lwz	$t4,`$FRAME+20^$LITTLE_ENDIAN`($sp)
+	lwz	$t7,`$FRAME+24^$LITTLE_ENDIAN`($sp)
+	lwz	$t6,`$FRAME+28^$LITTLE_ENDIAN`($sp)
 	stfd	$dota,`$FRAME+64`($sp)
 	stfd	$dotb,`$FRAME+72`($sp)
@@ -823,14 +825,14 @@ $code.=<<___;
 	stw	$t0,12($tp)		; tp[j-1]
 	stw	$t4,8($tp)
-	lwz	$t3,`$FRAME+32`($sp)	; permuted $t1
-	lwz	$t2,`$FRAME+36`($sp)	; permuted $t0
-	lwz	$t7,`$FRAME+40`($sp)	; permuted $t3
-	lwz	$t6,`$FRAME+44`($sp)	; permuted $t2
-	lwz	$t1,`$FRAME+48`($sp)	; permuted $t5
-	lwz	$t0,`$FRAME+52`($sp)	; permuted $t4
-	lwz	$t5,`$FRAME+56`($sp)	; permuted $t7
-	lwz	$t4,`$FRAME+60`($sp)	; permuted $t6
+	lwz	$t3,`$FRAME+32^$LITTLE_ENDIAN`($sp)	; permuted $t1
+	lwz	$t2,`$FRAME+36^$LITTLE_ENDIAN`($sp)	; permuted $t0
+	lwz	$t7,`$FRAME+40^$LITTLE_ENDIAN`($sp)	; permuted $t3
+	lwz	$t6,`$FRAME+44^$LITTLE_ENDIAN`($sp)	; permuted $t2
+	lwz	$t1,`$FRAME+48^$LITTLE_ENDIAN`($sp)	; permuted $t5
+	lwz	$t0,`$FRAME+52^$LITTLE_ENDIAN`($sp)	; permuted $t4
+	lwz	$t5,`$FRAME+56^$LITTLE_ENDIAN`($sp)	; permuted $t7
+	lwz	$t4,`$FRAME+60^$LITTLE_ENDIAN`($sp)	; permuted $t6
 	addc	$t2,$t2,$carry
 	adde	$t3,$t3,$c1
@@ -857,10 +859,10 @@ $code.=<<___;
 	stw	$t2,20($tp)		; tp[j]
 	stwu	$t0,16($tp)
-	lwz	$t7,`$FRAME+64`($sp)
-	lwz	$t6,`$FRAME+68`($sp)
-	lwz	$t5,`$FRAME+72`($sp)
-	lwz	$t4,`$FRAME+76`($sp)
+	lwz	$t7,`$FRAME+64^$LITTLE_ENDIAN`($sp)
+	lwz	$t6,`$FRAME+68^$LITTLE_ENDIAN`($sp)
+	lwz	$t5,`$FRAME+72^$LITTLE_ENDIAN`($sp)
+	lwz	$t4,`$FRAME+76^$LITTLE_ENDIAN`($sp)
 	addc	$t6,$t6,$carry
 	adde	$t7,$t7,$c1
@@ -1165,23 +1167,23 @@ ___
 $code.=<<___;
 	fmadd	$T1a,$N1,$na,$T1a
 	fmadd	$T1b,$N1,$nb,$T1b
-	lwz	$t1,`$FRAME+0`($sp)
-	lwz	$t0,`$FRAME+4`($sp)
+	lwz	$t1,`$FRAME+0^$LITTLE_ENDIAN`($sp)
+	lwz	$t0,`$FRAME+4^$LITTLE_ENDIAN`($sp)
 	fmadd	$T2a,$N2,$na,$T2a
 	fmadd	$T2b,$N2,$nb,$T2b
-	lwz	$t3,`$FRAME+8`($sp)
-	lwz	$t2,`$FRAME+12`($sp)
+	lwz	$t3,`$FRAME+8^$LITTLE_ENDIAN`($sp)
+	lwz	$t2,`$FRAME+12^$LITTLE_ENDIAN`($sp)
 	fmadd	$T3a,$N3,$na,$T3a
 	fmadd	$T3b,$N3,$nb,$T3b
-	lwz	$t5,`$FRAME+16`($sp)
-	lwz	$t4,`$FRAME+20`($sp)
+	lwz	$t5,`$FRAME+16^$LITTLE_ENDIAN`($sp)
+	lwz	$t4,`$FRAME+20^$LITTLE_ENDIAN`($sp)
 	addc	$t0,$t0,$carry
 	adde	$t1,$t1,$c1
 	srwi	$carry,$t0,16
 	fmadd	$T0a,$N0,$na,$T0a
 	fmadd	$T0b,$N0,$nb,$T0b
-	lwz	$t7,`$FRAME+24`($sp)
-	lwz	$t6,`$FRAME+28`($sp)
+	lwz	$t7,`$FRAME+24^$LITTLE_ENDIAN`($sp)
+	lwz	$t6,`$FRAME+28^$LITTLE_ENDIAN`($sp)
 	srwi	$c1,$t1,16
 	insrwi	$carry,$t1,16,0
@@ -1218,8 +1220,8 @@ $code.=<<___;
 	fctid	$T1a,$T1a
 	addc	$t0,$t0,$t2
 	adde	$t4,$t4,$t3
-	lwz	$t3,`$FRAME+32`($sp)	; permuted $t1
-	lwz	$t2,`$FRAME+36`($sp)	; permuted $t0
+	lwz	$t3,`$FRAME+32^$LITTLE_ENDIAN`($sp)	; permuted $t1
+	lwz	$t2,`$FRAME+36^$LITTLE_ENDIAN`($sp)	; permuted $t0
 	fctid	$T1b,$T1b
 	addze	$carry,$carry
 	addze	$c1,$c1
@@ -1229,19 +1231,19 @@ $code.=<<___;
 	addc	$t2,$t2,$carry
 	adde	$t3,$t3,$c1
 	srwi	$carry,$t2,16
-	lwz	$t7,`$FRAME+40`($sp)	; permuted $t3
-	lwz	$t6,`$FRAME+44`($sp)	; permuted $t2
+	lwz	$t7,`$FRAME+40^$LITTLE_ENDIAN`($sp)	; permuted $t3
+	lwz	$t6,`$FRAME+44^$LITTLE_ENDIAN`($sp)	; permuted $t2
 	fctid	$T2b,$T2b
 	srwi	$c1,$t3,16
 	insrwi	$carry,$t3,16,0
-	lwz	$t1,`$FRAME+48`($sp)	; permuted $t5
-	lwz	$t0,`$FRAME+52`($sp)	; permuted $t4
+	lwz	$t1,`$FRAME+48^$LITTLE_ENDIAN`($sp)	; permuted $t5
+	lwz	$t0,`$FRAME+52^$LITTLE_ENDIAN`($sp)	; permuted $t4
 	fctid	$T3a,$T3a
 	addc	$t6,$t6,$carry
 	adde	$t7,$t7,$c1
 	srwi	$carry,$t6,16
-	lwz	$t5,`$FRAME+56`($sp)	; permuted $t7
-	lwz	$t4,`$FRAME+60`($sp)	; permuted $t6
+	lwz	$t5,`$FRAME+56^$LITTLE_ENDIAN`($sp)	; permuted $t7
+	lwz	$t4,`$FRAME+60^$LITTLE_ENDIAN`($sp)	; permuted $t6
 	fctid	$T3b,$T3b
 	insrwi	$t2,$t6,16,0		; 64..95 bits
@@ -1354,14 +1356,14 @@ $code.=<<___;
 ___
 } else {
 $code.=<<___;
-	lwz	$t1,`$FRAME+0`($sp)
-	lwz	$t0,`$FRAME+4`($sp)
-	lwz	$t3,`$FRAME+8`($sp)
-	lwz	$t2,`$FRAME+12`($sp)
-	lwz	$t5,`$FRAME+16`($sp)
-	lwz	$t4,`$FRAME+20`($sp)
-	lwz	$t7,`$FRAME+24`($sp)
-	lwz	$t6,`$FRAME+28`($sp)
+	lwz	$t1,`$FRAME+0^$LITTLE_ENDIAN`($sp)
+	lwz	$t0,`$FRAME+4^$LITTLE_ENDIAN`($sp)
+	lwz	$t3,`$FRAME+8^$LITTLE_ENDIAN`($sp)
+	lwz	$t2,`$FRAME+12^$LITTLE_ENDIAN`($sp)
+	lwz	$t5,`$FRAME+16^$LITTLE_ENDIAN`($sp)
+	lwz	$t4,`$FRAME+20^$LITTLE_ENDIAN`($sp)
+	lwz	$t7,`$FRAME+24^$LITTLE_ENDIAN`($sp)
+	lwz	$t6,`$FRAME+28^$LITTLE_ENDIAN`($sp)
 	stfd	$dota,`$FRAME+64`($sp)
 	stfd	$dotb,`$FRAME+72`($sp)
@@ -1397,14 +1399,14 @@ $code.=<<___;
 	stw	$t0,4($tp)		; tp[j-1]
 	stw	$t4,0($tp)
-	lwz	$t3,`$FRAME+32`($sp)	; permuted $t1
-	lwz	$t2,`$FRAME+36`($sp)	; permuted $t0
-	lwz	$t7,`$FRAME+40`($sp)	; permuted $t3
-	lwz	$t6,`$FRAME+44`($sp)	; permuted $t2
-	lwz	$t1,`$FRAME+48`($sp)	; permuted $t5
-	lwz	$t0,`$FRAME+52`($sp)	; permuted $t4
-	lwz	$t5,`$FRAME+56`($sp)	; permuted $t7
-	lwz	$t4,`$FRAME+60`($sp)	; permuted $t6
+	lwz	$t3,`$FRAME+32^$LITTLE_ENDIAN`($sp)	; permuted $t1
+	lwz	$t2,`$FRAME+36^$LITTLE_ENDIAN`($sp)	; permuted $t0
+	lwz	$t7,`$FRAME+40^$LITTLE_ENDIAN`($sp)	; permuted $t3
+	lwz	$t6,`$FRAME+44^$LITTLE_ENDIAN`($sp)	; permuted $t2
+	lwz	$t1,`$FRAME+48^$LITTLE_ENDIAN`($sp)	; permuted $t5
+	lwz	$t0,`$FRAME+52^$LITTLE_ENDIAN`($sp)	; permuted $t4
+	lwz	$t5,`$FRAME+56^$LITTLE_ENDIAN`($sp)	; permuted $t7
+	lwz	$t4,`$FRAME+60^$LITTLE_ENDIAN`($sp)	; permuted $t6
 	addc	$t2,$t2,$carry
 	adde	$t3,$t3,$c1
@@ -1433,12 +1435,12 @@ $code.=<<___;
 	addc	$t2,$t2,$t6
 	adde	$t0,$t0,$t7
-	lwz	$t7,`$FRAME+64`($sp)
-	lwz	$t6,`$FRAME+68`($sp)
+	lwz	$t7,`$FRAME+64^$LITTLE_ENDIAN`($sp)
+	lwz	$t6,`$FRAME+68^$LITTLE_ENDIAN`($sp)
 	addze	$carry,$carry
 	addze	$c1,$c1
-	lwz	$t5,`$FRAME+72`($sp)
-	lwz	$t4,`$FRAME+76`($sp)
+	lwz	$t5,`$FRAME+72^$LITTLE_ENDIAN`($sp)
+	lwz	$t4,`$FRAME+76^$LITTLE_ENDIAN`($sp)
 	addc	$t6,$t6,$carry
 	adde	$t7,$t7,$c1
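The ^$LITTLE_ENDIAN hunks above all adjust lwz offsets that pick one 32-bit half out of a 64-bit value spilled to the stack: on big-endian PPC64 the high word sits at byte offset 0 and the low word at offset 4, while on little-endian the halves are swapped, so XOR-ing the byte offset with 4 (or with 0 on big-endian builds, which leaves it unchanged) selects the intended half on either flavour without touching the rest of the addressing. A small C illustration of that offset arithmetic (the macro and function names are made up for the example; the Perl variable is $LITTLE_ENDIAN):

#include <stdint.h>
#include <string.h>
#include <stdio.h>

/* 4 on little-endian builds, 0 on big-endian ones, mirroring $LITTLE_ENDIAN. */
#if defined(__LITTLE_ENDIAN__) || \
    (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
# define LE_XOR 4
#else
# define LE_XOR 0
#endif

/* Load the 32-bit word a big-endian-oriented offset expects:
 * offset 0 = high half, offset 4 = low half of the 64-bit value. */
static uint32_t load_half(const uint64_t *v, unsigned be_offset)
{
    uint32_t w;
    memcpy(&w, (const unsigned char *)v + (be_offset ^ LE_XOR), 4);
    return w;
}

int main(void)
{
    uint64_t v = 0x1122334455667788ULL;
    printf("%08x\n", load_half(&v, 0));   /* 11223344 on either endianness */
    printf("%08x\n", load_half(&v, 4));   /* 55667788 on either endianness */
    return 0;
}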
diff --git a/crypto/openssl/crypto/bn/asm/rsaz-x86_64.pl b/crypto/openssl/crypto/bn/asm/rsaz-x86_64.pl
index 3bd45db..12b571c 100755
--- a/crypto/openssl/crypto/bn/asm/rsaz-x86_64.pl
+++ b/crypto/openssl/crypto/bn/asm/rsaz-x86_64.pl
@@ -113,7 +113,7 @@ if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
 	$addx = ($1>=12);
 }
-if (!$addx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9])\.([0-9]+)/) {
+if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|based on LLVM) ([3-9])\.([0-9]+)/) {
 	my $ver = $2 + $3/100.0;	# 3.1->3.01, 3.10->3.10
 	$addx = ($ver>=3.03);
 }
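The widened regex above (repeated below in x86_64-mont.pl and x86_64-mont5.pl) lets the perlasm scripts recognize more clang/LLVM `cc -v` banners, such as "Apple LLVM version ...", when deciding whether the toolchain's assembler is new enough for the ADCX/ADOX ($addx) code paths; the captured major.minor pair is folded into one comparable number, so 3.1 becomes 3.01 while 3.10 stays 3.10, and anything >= 3.03 enables $addx. A rough C sketch of that folding (the banner strings are made-up examples, not an exhaustive list):

#include <stdio.h>
#include <string.h>

/* Fold "major.minor" into one comparable number the way the perlasm scripts
 * do: 3.1 -> 3.01, 3.10 -> 3.10, then require >= 3.03. */
static double folded_version(const char *banner)
{
    const char *p = strstr(banner, "version ");
    int major, minor;
    if (p == NULL || sscanf(p + 8, "%d.%d", &major, &minor) != 2)
        return 0.0;
    return major + minor / 100.0;
}

int main(void)
{
    const char *banners[] = {
        "clang version 3.8.0 (tags/RELEASE_380/final)",
        "Apple LLVM version 7.0.2 (clang-700.1.81)",
        "FreeBSD clang version 3.4.1 (tags/RELEASE_34/dot1-final)",
    };
    for (int i = 0; i < 3; i++) {
        double ver = folded_version(banners[i]);
        printf("%-50s -> %.2f, ADX paths: %s\n",
               banners[i], ver, ver >= 3.03 ? "on" : "off");
    }
    return 0;
}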
diff --git a/crypto/openssl/crypto/bn/asm/s390x-gf2m.pl b/crypto/openssl/crypto/bn/asm/s390x-gf2m.pl
index cd9f13e..9d18d40 100755
--- a/crypto/openssl/crypto/bn/asm/s390x-gf2m.pl
+++ b/crypto/openssl/crypto/bn/asm/s390x-gf2m.pl
@@ -172,19 +172,19 @@ ___
 if ($SIZE_T==8) {
 my @r=map("%r$_",(6..9));
 $code.=<<___;
-	bras	$ra,_mul_1x1		# a1·b1
+	bras	$ra,_mul_1x1		# a1·b1
 	stmg	$lo,$hi,16($rp)
 	lg	$a,`$stdframe+128+4*$SIZE_T`($sp)
 	lg	$b,`$stdframe+128+6*$SIZE_T`($sp)
-	bras	$ra,_mul_1x1		# a0·b0
+	bras	$ra,_mul_1x1		# a0·b0
 	stmg	$lo,$hi,0($rp)
 	lg	$a,`$stdframe+128+3*$SIZE_T`($sp)
 	lg	$b,`$stdframe+128+5*$SIZE_T`($sp)
 	xg	$a,`$stdframe+128+4*$SIZE_T`($sp)
 	xg	$b,`$stdframe+128+6*$SIZE_T`($sp)
-	bras	$ra,_mul_1x1		# (a0+a1)·(b0+b1)
+	bras	$ra,_mul_1x1		# (a0+a1)·(b0+b1)
 	lmg	@r[0],@r[3],0($rp)
 	xgr	$lo,$hi
diff --git a/crypto/openssl/crypto/bn/asm/s390x.S b/crypto/openssl/crypto/bn/asm/s390x.S
index 43fcb79..f5eebe4 100644
--- a/crypto/openssl/crypto/bn/asm/s390x.S
+++ b/crypto/openssl/crypto/bn/asm/s390x.S
@@ -18,71 +18,106 @@
 .align	4
 bn_mul_add_words:
 	lghi	zero,0			// zero = 0
-	la	%r1,0(%r2)		// put rp aside [to give way to]
-	lghi	%r2,0			// return value
 	ltgfr	%r4,%r4
 	bler	%r14			// if (len<=0) return 0;
-	stmg	%r6,%r10,48(%r15)
-	lghi	%r10,3
-	lghi	%r8,0			// carry = 0
-	nr	%r10,%r4		// len%4
+	stmg	%r6,%r13,48(%r15)
+	lghi	%r2,3
+	lghi	%r12,0			// carry = 0
+	slgr	%r1,%r3			// rp-=ap
+	nr	%r2,%r4			// len%4
 	sra	%r4,2			// cnt=len/4
 	jz	.Loop1_madd		// carry is incidentally cleared if branch taken
 	algr	zero,zero		// clear carry
-.Loop4_madd:
-	lg	%r7,0(%r2,%r3)		// ap[i]
+	lg	%r7,0(%r3)		// ap[0]
+	lg	%r9,8(%r3)		// ap[1]
 	mlgr	%r6,%r5			// *=w
-	alcgr	%r7,%r8			// +=carry
-	alcgr	%r6,zero
-	alg	%r7,0(%r2,%r1)		// +=rp[i]
-	stg	%r7,0(%r2,%r1)		// rp[i]=
+	brct	%r4,.Loop4_madd
+	j	.Loop4_madd_tail
-	lg	%r9,8(%r2,%r3)
+.Loop4_madd:
 	mlgr	%r8,%r5
+	lg	%r11,16(%r3)		// ap[i+2]
+	alcgr	%r7,%r12		// +=carry
+	alcgr	%r6,zero
+	alg	%r7,0(%r3,%r1)		// +=rp[i]
+	stg	%r7,0(%r3,%r1)		// rp[i]=
+
+	mlgr	%r10,%r5
+	lg	%r13,24(%r3)
 	alcgr	%r9,%r6
 	alcgr	%r8,zero
-	alg	%r9,8(%r2,%r1)
-	stg	%r9,8(%r2,%r1)
+	alg	%r9,8(%r3,%r1)
+	stg	%r9,8(%r3,%r1)
+
+	mlgr	%r12,%r5
+	lg	%r7,32(%r3)
+	alcgr	%r11,%r8
+	alcgr	%r10,zero
+	alg	%r11,16(%r3,%r1)
+	stg	%r11,16(%r3,%r1)
-	lg	%r7,16(%r2,%r3)
 	mlgr	%r6,%r5
-	alcgr	%r7,%r8
-	alcgr	%r6,zero
-	alg	%r7,16(%r2,%r1)
-	stg	%r7,16(%r2,%r1)
+	lg	%r9,40(%r3)
+	alcgr	%r13,%r10
+	alcgr	%r12,zero
+	alg	%r13,24(%r3,%r1)
+	stg	%r13,24(%r3,%r1)
-	lg	%r9,24(%r2,%r3)
+	la	%r3,32(%r3)		// i+=4
+	brct	%r4,.Loop4_madd
+
+.Loop4_madd_tail:
 	mlgr	%r8,%r5
+	lg	%r11,16(%r3)
+	alcgr	%r7,%r12		// +=carry
+	alcgr	%r6,zero
+	alg	%r7,0(%r3,%r1)		// +=rp[i]
+	stg	%r7,0(%r3,%r1)		// rp[i]=
+
+	mlgr	%r10,%r5
+	lg	%r13,24(%r3)
 	alcgr	%r9,%r6
 	alcgr	%r8,zero
-	alg	%r9,24(%r2,%r1)
-	stg	%r9,24(%r2,%r1)
+	alg	%r9,8(%r3,%r1)
+	stg	%r9,8(%r3,%r1)
-	la	%r2,32(%r2)		// i+=4
-	brct	%r4,.Loop4_madd
+	mlgr	%r12,%r5
+	alcgr	%r11,%r8
+	alcgr	%r10,zero
+	alg	%r11,16(%r3,%r1)
+	stg	%r11,16(%r3,%r1)
-	la	%r10,1(%r10)		// see if len%4 is zero ...
-	brct	%r10,.Loop1_madd	// without touching condition code:-)
+	alcgr	%r13,%r10
+	alcgr	%r12,zero
+	alg	%r13,24(%r3,%r1)
+	stg	%r13,24(%r3,%r1)
+
+	la	%r3,32(%r3)		// i+=4
+
+	la	%r2,1(%r2)		// see if len%4 is zero ...
+	brct	%r2,.Loop1_madd		// without touching condition code:-)
 .Lend_madd:
-	alcgr	%r8,zero		// collect carry bit
-	lgr	%r2,%r8
-	lmg	%r6,%r10,48(%r15)
+	lgr	%r2,zero		// return value
+	alcgr	%r2,%r12		// collect even carry bit
+	lmg	%r6,%r13,48(%r15)
 	br	%r14
.Loop1_madd:
-	lg	%r7,0(%r2,%r3)		// ap[i]
+	lg	%r7,0(%r3)		// ap[i]
 	mlgr	%r6,%r5			// *=w
-	alcgr	%r7,%r8			// +=carry
+	alcgr	%r7,%r12		// +=carry
 	alcgr	%r6,zero
-	alg	%r7,0(%r2,%r1)		// +=rp[i]
-	stg	%r7,0(%r2,%r1)		// rp[i]=
+	alg	%r7,0(%r3,%r1)		// +=rp[i]
+	stg	%r7,0(%r3,%r1)		// rp[i]=
-	lgr	%r8,%r6
-	la	%r2,8(%r2)		// i++
-	brct	%r10,.Loop1_madd
+	lgr	%r12,%r6
+	la	%r3,8(%r3)		// i++
+	brct	%r2,.Loop1_madd
 	j	.Lend_madd
.size	bn_mul_add_words,.-bn_mul_add_words
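The bn_mul_add_words rewrite above unrolls the main loop four ways and software-pipelines the mlgr multiplies (with a separate .Loop4_madd_tail block, rp addressed relative to ap via the slgr, and %r2 doubling as remainder counter and return value), but the contract of the routine is unchanged. A plain C reference of that contract (a sketch assuming 64-bit limbs and a compiler with unsigned __int128; not the OpenSSL source):

#include <stdint.h>

typedef uint64_t BN_ULONG;

/* rp[i] += ap[i] * w for 0 <= i < num; returns the final carry limb.
 * This is what the unrolled s390x loop computes four limbs at a time. */
BN_ULONG bn_mul_add_words_ref(BN_ULONG *rp, const BN_ULONG *ap,
                              int num, BN_ULONG w)
{
    BN_ULONG carry = 0;
    for (int i = 0; i < num; i++) {
        unsigned __int128 t = (unsigned __int128)ap[i] * w + rp[i] + carry;
        rp[i] = (BN_ULONG)t;
        carry = (BN_ULONG)(t >> 64);
    }
    return carry;
}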
diff --git a/crypto/openssl/crypto/bn/asm/x86-gf2m.pl b/crypto/openssl/crypto/bn/asm/x86-gf2m.pl
index 808a1e5..b579530 100755
--- a/crypto/openssl/crypto/bn/asm/x86-gf2m.pl
+++ b/crypto/openssl/crypto/bn/asm/x86-gf2m.pl
@@ -14,7 +14,7 @@
 # the time being... Except that it has three code paths: pure integer
 # code suitable for any x86 CPU, MMX code suitable for PIII and later
 # and PCLMULQDQ suitable for Westmere and later. Improvement varies
-# from one benchmark and µ-arch to another. Below are interval values
+# from one benchmark and µ-arch to another. Below are interval values
 # for 163- and 571-bit ECDH benchmarks relative to compiler-generated
 # code:
 #
@@ -226,22 +226,22 @@ if ($sse2) {
 	&push	("edi");
 	&mov	($a,&wparam(1));
 	&mov	($b,&wparam(3));
-	&call	("_mul_1x1_mmx");	# a1·b1
+	&call	("_mul_1x1_mmx");	# a1·b1
 	&movq	("mm7",$R);
 	&mov	($a,&wparam(2));
 	&mov	($b,&wparam(4));
-	&call	("_mul_1x1_mmx");	# a0·b0
+	&call	("_mul_1x1_mmx");	# a0·b0
 	&movq	("mm6",$R);
 	&mov	($a,&wparam(1));
 	&mov	($b,&wparam(3));
 	&xor	($a,&wparam(2));
 	&xor	($b,&wparam(4));
-	&call	("_mul_1x1_mmx");	# (a0+a1)·(b0+b1)
+	&call	("_mul_1x1_mmx");	# (a0+a1)·(b0+b1)
 	&pxor	($R,"mm7");
 	&mov	($a,&wparam(0));
-	&pxor	($R,"mm6");		# (a0+a1)·(b0+b1)-a1·b1-a0·b0
+	&pxor	($R,"mm6");		# (a0+a1)·(b0+b1)-a1·b1-a0·b0
 	&movq	($A,$R);
 	&psllq	($R,32);
@@ -266,13 +266,13 @@ if ($sse2) {
 	&mov	($a,&wparam(1));
 	&mov	($b,&wparam(3));
-	&call	("_mul_1x1_ialu");	# a1·b1
+	&call	("_mul_1x1_ialu");	# a1·b1
 	&mov	(&DWP(8,"esp"),$lo);
 	&mov	(&DWP(12,"esp"),$hi);
 	&mov	($a,&wparam(2));
 	&mov	($b,&wparam(4));
-	&call	("_mul_1x1_ialu");	# a0·b0
+	&call	("_mul_1x1_ialu");	# a0·b0
 	&mov	(&DWP(0,"esp"),$lo);
 	&mov	(&DWP(4,"esp"),$hi);
@@ -280,7 +280,7 @@ if ($sse2) {
 	&mov	($b,&wparam(3));
 	&xor	($a,&wparam(2));
 	&xor	($b,&wparam(4));
-	&call	("_mul_1x1_ialu");	# (a0+a1)·(b0+b1)
+	&call	("_mul_1x1_ialu");	# (a0+a1)·(b0+b1)
 	&mov	("ebp",&wparam(0));
 	@r=("ebx","ecx","edi","esi");
diff --git a/crypto/openssl/crypto/bn/asm/x86_64-gcc.c b/crypto/openssl/crypto/bn/asm/x86_64-gcc.c
index d548886..d77dc43 100644
--- a/crypto/openssl/crypto/bn/asm/x86_64-gcc.c
+++ b/crypto/openssl/crypto/bn/asm/x86_64-gcc.c
@@ -65,7 +65,7 @@
 # undef mul_add
 /*-
- * "m"(a), "+m"(r) is the way to favor DirectPath µ-code;
+ * "m"(a), "+m"(r) is the way to favor DirectPath µ-code;
  * "g"(0) let the compiler to decide where does it
  * want to keep the value of zero;
  */
diff --git a/crypto/openssl/crypto/bn/asm/x86_64-gf2m.pl b/crypto/openssl/crypto/bn/asm/x86_64-gf2m.pl
index 226c66c..42bbec2 100755
--- a/crypto/openssl/crypto/bn/asm/x86_64-gf2m.pl
+++ b/crypto/openssl/crypto/bn/asm/x86_64-gf2m.pl
@@ -13,7 +13,7 @@
 # in bn_gf2m.c. It's kind of low-hanging mechanical port from C for
 # the time being... Except that it has two code paths: code suitable
 # for any x86_64 CPU and PCLMULQDQ one suitable for Westmere and
-# later. Improvement varies from one benchmark and µ-arch to another.
+# later. Improvement varies from one benchmark and µ-arch to another.
 # Vanilla code path is at most 20% faster than compiler-generated code
 # [not very impressive], while PCLMULQDQ - whole 85%-160% better on
 # 163- and 571-bit ECDH benchmarks on Intel CPUs. Keep in mind that
@@ -184,13 +184,13 @@ ___
 $code.=<<___;
 	movdqa	%xmm0,%xmm4
 	movdqa	%xmm1,%xmm5
-	pclmulqdq	\$0,%xmm1,%xmm0	# a1·b1
+	pclmulqdq	\$0,%xmm1,%xmm0	# a1·b1
 	pxor	%xmm2,%xmm4
 	pxor	%xmm3,%xmm5
-	pclmulqdq	\$0,%xmm3,%xmm2	# a0·b0
-	pclmulqdq	\$0,%xmm5,%xmm4	# (a0+a1)·(b0+b1)
+	pclmulqdq	\$0,%xmm3,%xmm2	# a0·b0
+	pclmulqdq	\$0,%xmm5,%xmm4	# (a0+a1)·(b0+b1)
 	xorps	%xmm0,%xmm4
-	xorps	%xmm2,%xmm4		# (a0+a1)·(b0+b1)-a0·b0-a1·b1
+	xorps	%xmm2,%xmm4		# (a0+a1)·(b0+b1)-a0·b0-a1·b1
 	movdqa	%xmm4,%xmm5
 	pslldq	\$8,%xmm4
 	psrldq	\$8,%xmm5
@@ -225,13 +225,13 @@ $code.=<<___;
 	mov	\$0xf,$mask
 	mov	$a1,$a
 	mov	$b1,$b
-	call	_mul_1x1		# a1·b1
+	call	_mul_1x1		# a1·b1
 	mov	$lo,16(%rsp)
 	mov	$hi,24(%rsp)
 	mov	48(%rsp),$a
 	mov	64(%rsp),$b
-	call	_mul_1x1		# a0·b0
+	call	_mul_1x1		# a0·b0
 	mov	$lo,0(%rsp)
 	mov	$hi,8(%rsp)
@@ -239,7 +239,7 @@ $code.=<<___;
 	mov	56(%rsp),$b
 	xor	48(%rsp),$a
 	xor	64(%rsp),$b
-	call	_mul_1x1		# (a0+a1)·(b0+b1)
+	call	_mul_1x1		# (a0+a1)·(b0+b1)
 ___
 @r=("%rbx","%rcx","%rdi","%rsi");
 $code.=<<___;
diff --git a/crypto/openssl/crypto/bn/asm/x86_64-mont.pl b/crypto/openssl/crypto/bn/asm/x86_64-mont.pl
index 2989b58..725833d 100755
--- a/crypto/openssl/crypto/bn/asm/x86_64-mont.pl
+++ b/crypto/openssl/crypto/bn/asm/x86_64-mont.pl
@@ -68,6 +68,11 @@ if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
 	$addx = ($1>=12);
 }
+if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|based on LLVM) ([3-9])\.([0-9]+)/) {
+	my $ver = $2 + $3/100.0;	# 3.1->3.01, 3.10->3.10
+	$addx = ($ver>=3.03);
+}
+
 # int bn_mul_mont(
 $rp="%rdi";	# BN_ULONG *rp,
 $ap="%rsi";	# const BN_ULONG *ap,
diff --git a/crypto/openssl/crypto/bn/asm/x86_64-mont5.pl b/crypto/openssl/crypto/bn/asm/x86_64-mont5.pl
index 820de3d..64e668f 100755
--- a/crypto/openssl/crypto/bn/asm/x86_64-mont5.pl
+++ b/crypto/openssl/crypto/bn/asm/x86_64-mont5.pl
@@ -53,6 +53,11 @@ if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
 	$addx = ($1>=12);
 }
+if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|based on LLVM) ([3-9])\.([0-9]+)/) {
+	my $ver = $2 + $3/100.0;	# 3.1->3.01, 3.10->3.10
+	$addx = ($ver>=3.03);
+}
+
 # int bn_mul_mont_gather5(
 $rp="%rdi";	# BN_ULONG *rp,
 $ap="%rsi";	# const BN_ULONG *ap,
@@ -1779,6 +1784,15 @@ sqr8x_reduction:
 .align	32
 .L8x_tail_done:
 	add	(%rdx),%r8		# can this overflow?
+	adc	\$0,%r9
+	adc	\$0,%r10
+	adc	\$0,%r11
+	adc	\$0,%r12
+	adc	\$0,%r13
+	adc	\$0,%r14
+	adc	\$0,%r15		# can't overflow, because we
+					# started with "overhung" part
+					# of multiplication
 	xor	%rax,%rax
 	neg	$carry
@@ -3125,6 +3139,15 @@ sqrx8x_reduction:
 .align	32
 .Lsqrx8x_tail_done:
 	add	24+8(%rsp),%r8		# can this overflow?
+	adc	\$0,%r9
+	adc	\$0,%r10
+	adc	\$0,%r11
+	adc	\$0,%r12
+	adc	\$0,%r13
+	adc	\$0,%r14
+	adc	\$0,%r15		# can't overflow, because we
+					# started with "overhung" part
+					# of multiplication
 	mov	$carry,%rax		# xor %rax,%rax
 	sub	16+8(%rsp),$carry	# mov 16(%rsp),%cf
@@ -3168,13 +3191,11 @@ my ($rptr,$nptr)=("%rdx","%rbp");
 my @ri=map("%r$_",(10..13));
 my @ni=map("%r$_",(14..15));
 $code.=<<___;
-	xor	%rbx,%rbx
+	xor	%ebx,%ebx
 	sub	%r15,%rsi		# compare top-most words
 	adc	%rbx,%rbx
 	mov	%rcx,%r10		# -$num
-	.byte	0x67
 	or	%rbx,%rax
-	.byte	0x67
 	mov	%rcx,%r9		# -$num
 	xor	\$1,%rax
 	sar	\$3+2,%rcx		# cf=0
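The adc \$0 chains added in .L8x_tail_done and .Lsqrx8x_tail_done answer the "can this overflow?" comment: a carry out of the final add into %r8 must now ripple through the remaining limbs %r9..%r15 instead of being dropped, and the new comment notes that the chain itself cannot overflow because those limbs come from the "overhung" upper half of the multiplication. A C sketch of the same ripple (illustrative only, not the actual reduction code):

#include <stdint.h>

/* Add `addend` into limb[0] and ripple the carry through the remaining
 * limbs -- the C analogue of the added "adc $0,%r9 ... adc $0,%r15" chain. */
static void add_and_propagate(uint64_t limb[8], uint64_t addend)
{
    limb[0] += addend;
    uint64_t carry = (limb[0] < addend);   /* carry out of the first add   */
    for (int i = 1; i < 8; i++) {          /* adc $0,limb[i]               */
        limb[i] += carry;
        carry = (limb[i] < carry);         /* stays 1 only across ~0 limbs */
    }
    /* For this tail the final carry is provably 0, which is what the
     * "can't overflow" comment in the assembly asserts. */
}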