author     jkim <jkim@FreeBSD.org>  2015-12-03 17:22:58 +0000
committer  jkim <jkim@FreeBSD.org>  2015-12-03 17:22:58 +0000
commit     afd52a5fc90e70242dbb0e7d29987c976eb993e0 (patch)
tree       b0284af4e4144e27eb9f39e88c53868060774b16 /crypto/aes/asm
parent     64cb0c902e312216cdc4c826fc0be9ba9e1bf4da (diff)
Import OpenSSL 1.0.2e.
Diffstat (limited to 'crypto/aes/asm')
-rwxr-xr-x  crypto/aes/asm/aes-586.pl             |   6
-rwxr-xr-x  crypto/aes/asm/aesni-mb-x86_64.pl     |   2
-rwxr-xr-x  crypto/aes/asm/aesni-sha1-x86_64.pl   |   2
-rwxr-xr-x  crypto/aes/asm/aesni-sha256-x86_64.pl |   9
-rwxr-xr-x  crypto/aes/asm/aesni-x86.pl           |   2
-rwxr-xr-x  crypto/aes/asm/vpaes-ppc.pl           | 198
6 files changed, 145 insertions, 74 deletions
diff --git a/crypto/aes/asm/aes-586.pl b/crypto/aes/asm/aes-586.pl
index 451d0e0..60286ecb 100755
--- a/crypto/aes/asm/aes-586.pl
+++ b/crypto/aes/asm/aes-586.pl
@@ -45,7 +45,7 @@
# the undertaken effort was that it appeared that in tight IA-32
# register window little-endian flavor could achieve slightly higher
# Instruction Level Parallelism, and it indeed resulted in up to 15%
-# better performance on most recent µ-archs...
+# better performance on most recent µ-archs...
#
# Third version adds AES_cbc_encrypt implementation, which resulted in
# up to 40% performance improvement of CBC benchmark results. 40% was
@@ -224,7 +224,7 @@ sub _data_word() { my $i; while(defined($i=shift)) { &data_word($i,$i); } }
$speed_limit=512; # chunks smaller than $speed_limit are
# processed with compact routine in CBC mode
$small_footprint=1; # $small_footprint=1 code is ~5% slower [on
- # recent µ-archs], but ~5 times smaller!
+ # recent µ-archs], but ~5 times smaller!
# I favor compact code to minimize cache
# contention and in hope to "collect" 5% back
# in real-life applications...
@@ -565,7 +565,7 @@ sub enctransform()
# Performance is not actually extraordinary in comparison to pure
# x86 code. In particular encrypt performance is virtually the same.
# Decrypt performance on the other hand is 15-20% better on newer
-# µ-archs [but we're thankful for *any* improvement here], and ~50%
+# µ-archs [but we're thankful for *any* improvement here], and ~50%
# better on PIII:-) And additionally on the pros side this code
# eliminates redundant references to stack and thus relieves/
# minimizes the pressure on the memory bus.
diff --git a/crypto/aes/asm/aesni-mb-x86_64.pl b/crypto/aes/asm/aesni-mb-x86_64.pl
index 33b1aed..5a100fa 100755
--- a/crypto/aes/asm/aesni-mb-x86_64.pl
+++ b/crypto/aes/asm/aesni-mb-x86_64.pl
@@ -63,7 +63,7 @@ if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
$avx = ($1>=10) + ($1>=11);
}
-if (!$avx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9]\.[0-9]+)/) {
+if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|based on LLVM) ([3-9]\.[0-9]+)/) {
$avx = ($2>=3.0) + ($2>3.0);
}
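The widened regex above (the same change is applied to aesni-sha1-x86_64.pl and aesni-sha256-x86_64.pl below) also recognizes toolchains whose banner contains "LLVM version" without starting with "clang", such as Apple's clang. A minimal Perl sketch of the predicate; the sample banner strings are assumptions for illustration, not taken from the patch:

use strict;
use warnings;

my $re = qr/((?:^clang|LLVM) version|based on LLVM) ([3-9]\.[0-9]+)/;

# Made-up `cc -v` banners for illustration.
my @banners = (
    "clang version 3.7.0 (tags/RELEASE_370/final)",   # matched before and after the change
    "Apple LLVM version 7.0.0 (clang-700.1.76)",      # matched only by the new regex
    "gcc version 4.8.5 (GCC)",                        # never matched
);

for my $banner (@banners) {
    if ($banner =~ $re) {
        print "clang/LLVM detected, version $2 (matched '$1')\n";
    } else {
        print "no clang/LLVM detected: $banner\n";
    }
}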
diff --git a/crypto/aes/asm/aesni-sha1-x86_64.pl b/crypto/aes/asm/aesni-sha1-x86_64.pl
index 97992ad..c803cde 100755
--- a/crypto/aes/asm/aesni-sha1-x86_64.pl
+++ b/crypto/aes/asm/aesni-sha1-x86_64.pl
@@ -94,7 +94,7 @@ $avx=1 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
$avx=1 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
`ml64 2>&1` =~ /Version ([0-9]+)\./ &&
$1>=10);
-$avx=1 if (!$avx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9]\.[0-9]+)/ && $2>=3.0);
+$avx=1 if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|based on LLVM) ([3-9]\.[0-9]+)/ && $2>=3.0);
$shaext=1; ### set to zero if compiling for 1.0.1
diff --git a/crypto/aes/asm/aesni-sha256-x86_64.pl b/crypto/aes/asm/aesni-sha256-x86_64.pl
index 19b0433..bfe2926 100755
--- a/crypto/aes/asm/aesni-sha256-x86_64.pl
+++ b/crypto/aes/asm/aesni-sha256-x86_64.pl
@@ -59,7 +59,7 @@ if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
$avx = ($1>=10) + ($1>=12);
}
-if (!$avx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9]\.[0-9]+)/) {
+if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|based on LLVM) ([3-9]\.[0-9]+)/) {
$avx = ($2>=3.0) + ($2>3.0);
}
@@ -139,11 +139,8 @@ $code.=<<___ if ($avx>1);
je ${func}_avx2
___
$code.=<<___;
- and \$`1<<30`,%eax # mask "Intel CPU" bit
- and \$`1<<28|1<<9`,%r10d # mask AVX+SSSE3 bits
- or %eax,%r10d
- cmp \$`1<<28|1<<9|1<<30`,%r10d
- je ${func}_avx
+ and \$`1<<28`,%r10d # check for AVX
+ jnz ${func}_avx
ud2
___
}
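The hunk above drops the "Intel CPU" and SSSE3 requirements and gates the AVX code path on the AVX capability bit alone. A small Perl sketch of the old and new predicates; $eax and $r10d stand in for the values the surrounding code holds in %eax and %r10d, and the bit positions come from the removed comments:

use strict;
use warnings;

sub old_avx_path_taken {            # the and/or/cmp/je sequence that was removed
    my ($eax, $r10d) = @_;
    my $bits = ($eax & (1 << 30)) | ($r10d & (1 << 28 | 1 << 9));
    return $bits == (1 << 28 | 1 << 9 | 1 << 30);   # Intel CPU + AVX + SSSE3
}

sub new_avx_path_taken {            # the single and/jnz that replaces it
    my (undef, $r10d) = @_;
    return ($r10d & (1 << 28)) != 0;                # AVX bit alone
}

# An AVX-capable CPU without the "Intel CPU" bit now takes the AVX path:
printf "old: %d, new: %d\n",
    old_avx_path_taken(0, 1 << 28 | 1 << 9) ? 1 : 0,
    new_avx_path_taken(0, 1 << 28 | 1 << 9) ? 1 : 0;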
diff --git a/crypto/aes/asm/aesni-x86.pl b/crypto/aes/asm/aesni-x86.pl
index f67df8c..9b2e37a 100755
--- a/crypto/aes/asm/aesni-x86.pl
+++ b/crypto/aes/asm/aesni-x86.pl
@@ -88,7 +88,7 @@ $inout3="xmm5"; $in1="xmm5";
$inout4="xmm6"; $in0="xmm6";
$inout5="xmm7"; $ivec="xmm7";
-# AESNI extenstion
+# AESNI extension
sub aeskeygenassist
{ my($dst,$src,$imm)=@_;
if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
diff --git a/crypto/aes/asm/vpaes-ppc.pl b/crypto/aes/asm/vpaes-ppc.pl
index 7fda60e..1759ae9 100755
--- a/crypto/aes/asm/vpaes-ppc.pl
+++ b/crypto/aes/asm/vpaes-ppc.pl
@@ -337,24 +337,27 @@ Lenc_entry:
addi $inp, $inp, 15 # 15 is not a typo
?lvsr $outperm, 0, $out
?lvsl $keyperm, 0, $key # prepare for unaligned access
- vnor $outmask, v7, v7 # 0xff..ff
lvx $inptail, 0, $inp # redundant in aligned case
- ?vperm $outmask, v7, $outmask, $outperm
- lvx $outhead, 0, $out
?vperm v0, v0, $inptail, $inpperm
bl _vpaes_encrypt_core
+ andi. r8, $out, 15
+ li r9, 16
+ beq Lenc_out_aligned
+
vperm v0, v0, v0, $outperm # rotate right/left
- vsel v1, $outhead, v0, $outmask
- vmr $outhead, v0
- stvx v1, 0, $out
- addi $out, $out, 15 # 15 is not a typo
- ########
+ mtctr r9
+Lenc_out_unaligned:
+ stvebx v0, 0, $out
+ addi $out, $out, 1
+ bdnz Lenc_out_unaligned
+ b Lenc_done
- lvx v1, 0, $out # redundant in aligned case
- vsel v1, $outhead, v1, $outmask
- stvx v1, 0, $out
+.align 4
+Lenc_out_aligned:
+ stvx v0, 0, $out
+Lenc_done:
li r10,`15+6*$SIZE_T`
li r11,`31+6*$SIZE_T`
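The rewritten output path above (mirrored in the decrypt and CBC paths further down) drops the lvx/vsel/stvx read-modify-write of the destination and, when $out is unaligned, stores the 16 result bytes one at a time with stvebx, so memory outside the caller's buffer is never read or written. A plain-Perl model of that effect; buffer, offset and data are made up, and the lane rotation done by vperm/$outperm is abstracted away:

use strict;
use warnings;

my $memory = '.' x 48;                          # stand-in address space
my @block  = ('A' .. 'P');                      # the 16 output bytes held in v0
my $out    = 21;                                # deliberately not 16-byte aligned

if ($out % 16 == 0) {                           # Lenc_out_aligned: one 16-byte stvx
    substr($memory, $out, 16) = join '', @block;
} else {                                        # Lenc_out_unaligned: 16 x stvebx, CTR = 16
    for my $i (0 .. 15) {
        substr($memory, $out + $i, 1) = $block[$i];
    }
}
print "$memory\n";   # only bytes 21..36 change; neighbouring bytes are untouched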
@@ -566,24 +569,27 @@ Ldec_entry:
addi $inp, $inp, 15 # 15 is not a typo
?lvsr $outperm, 0, $out
?lvsl $keyperm, 0, $key
- vnor $outmask, v7, v7 # 0xff..ff
lvx $inptail, 0, $inp # redundant in aligned case
- ?vperm $outmask, v7, $outmask, $outperm
- lvx $outhead, 0, $out
?vperm v0, v0, $inptail, $inpperm
bl _vpaes_decrypt_core
+ andi. r8, $out, 15
+ li r9, 16
+ beq Ldec_out_aligned
+
vperm v0, v0, v0, $outperm # rotate right/left
- vsel v1, $outhead, v0, $outmask
- vmr $outhead, v0
- stvx v1, 0, $out
- addi $out, $out, 15 # 15 is not a typo
- ########
+ mtctr r9
+Ldec_out_unaligned:
+ stvebx v0, 0, $out
+ addi $out, $out, 1
+ bdnz Ldec_out_unaligned
+ b Ldec_done
- lvx v1, 0, $out # redundant in aligned case
- vsel v1, $outhead, v1, $outmask
- stvx v1, 0, $out
+.align 4
+Ldec_out_aligned:
+ stvx v0, 0, $out
+Ldec_done:
li r10,`15+6*$SIZE_T`
li r11,`31+6*$SIZE_T`
@@ -658,11 +664,11 @@ Ldec_entry:
$PUSH r0, `$FRAME+$SIZE_T*2+$LRSAVE`($sp)
and r30, r5, r9 # copy length&-16
+ andi. r9, $out, 15 # is $out aligned?
mr r5, r6 # copy pointer to key
mr r31, r7 # copy pointer to iv
- blt Lcbc_abort
- cmpwi r8, 0 # test direction
li r6, -1
+ mcrf cr1, cr0 # put aside $out alignment flag
mr r7, r12 # copy vrsave
mtspr 256, r6 # preserve all AltiVec registers
@@ -672,6 +678,7 @@ Ldec_entry:
lvx v25, r9, r31
?vperm v24, v24, v25, $inpperm
+ cmpwi r8, 0 # test direction
neg r8, $inp # prepare for unaligned access
vxor v7, v7, v7
?lvsl $keyperm, 0, $key
@@ -681,13 +688,37 @@ Ldec_entry:
lvx $inptail, 0, $inp
?vperm $outmask, v7, $outmask, $outperm
addi $inp, $inp, 15 # 15 is not a typo
- lvx $outhead, 0, $out
beq Lcbc_decrypt
bl _vpaes_encrypt_preheat
li r0, 16
+ beq cr1, Lcbc_enc_loop # $out is aligned
+
+ vmr v0, $inptail
+ lvx $inptail, 0, $inp
+ addi $inp, $inp, 16
+ ?vperm v0, v0, $inptail, $inpperm
+ vxor v0, v0, v24 # ^= iv
+
+ bl _vpaes_encrypt_core
+
+ andi. r8, $out, 15
+ vmr v24, v0 # put aside iv
+ sub r9, $out, r8
+ vperm $outhead, v0, v0, $outperm # rotate right/left
+
+Lcbc_enc_head:
+ stvebx $outhead, r8, r9
+ cmpwi r8, 15
+ addi r8, r8, 1
+ bne Lcbc_enc_head
+
+ sub. r30, r30, r0 # len -= 16
+ addi $out, $out, 16
+ beq Lcbc_unaligned_done
+
Lcbc_enc_loop:
vmr v0, $inptail
lvx $inptail, 0, $inp
@@ -713,6 +744,32 @@ Lcbc_decrypt:
bl _vpaes_decrypt_preheat
li r0, 16
+ beq cr1, Lcbc_dec_loop # $out is aligned
+
+ vmr v0, $inptail
+ lvx $inptail, 0, $inp
+ addi $inp, $inp, 16
+ ?vperm v0, v0, $inptail, $inpperm
+ vmr v25, v0 # put aside input
+
+ bl _vpaes_decrypt_core
+
+ andi. r8, $out, 15
+ vxor v0, v0, v24 # ^= iv
+ vmr v24, v25
+ sub r9, $out, r8
+ vperm $outhead, v0, v0, $outperm # rotate right/left
+
+Lcbc_dec_head:
+ stvebx $outhead, r8, r9
+ cmpwi r8, 15
+ addi r8, r8, 1
+ bne Lcbc_dec_head
+
+ sub. r30, r30, r0 # len -= 16
+ addi $out, $out, 16
+ beq Lcbc_unaligned_done
+
Lcbc_dec_loop:
vmr v0, $inptail
lvx $inptail, 0, $inp
@@ -733,23 +790,29 @@ Lcbc_dec_loop:
bne Lcbc_dec_loop
Lcbc_done:
- addi $out, $out, -1
- lvx v1, 0, $out # redundant in aligned case
- vsel v1, $outhead, v1, $outmask
- stvx v1, 0, $out
-
+ beq cr1, Lcbc_write_iv # $out is aligned
+
+Lcbc_unaligned_done:
+ andi. r8, $out, 15
+ sub $out, $out, r8
+ li r9, 0
+Lcbc_tail:
+ stvebx $outhead, r9, $out
+ addi r9, r9, 1
+ cmpw r9, r8
+ bne Lcbc_tail
+
+Lcbc_write_iv:
neg r8, r31 # write [potentially unaligned] iv
+ li r10, 4
?lvsl $outperm, 0, r8
- li r6, 15
- vnor $outmask, v7, v7 # 0xff..ff
- ?vperm $outmask, v7, $outmask, $outperm
- lvx $outhead, 0, r31
+ li r11, 8
+ li r12, 12
vperm v24, v24, v24, $outperm # rotate right/left
- vsel v0, $outhead, v24, $outmask
- lvx v1, r6, r31
- stvx v0, 0, r31
- vsel v1, v24, v1, $outmask
- stvx v1, r6, r31
+ stvewx v24, 0, r31 # ivp is at least 32-bit aligned
+ stvewx v24, r10, r31
+ stvewx v24, r11, r31
+ stvewx v24, r12, r31
mtspr 256, r7 # restore vrsave
li r10,`15+6*$SIZE_T`
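The IV write-back above relies on the guarantee noted in the diff that ivp is at least 32-bit aligned: four stvewx word stores at offsets 0, 4, 8 and 12 cover the whole 16-byte IV, replacing the old masked lvx/vsel/stvx pair that had to read and rewrite adjacent memory. A short Perl model of the four word stores; buffer and offset are made up, and the vperm lane rotation is again abstracted away:

use strict;
use warnings;

my $memory = "\0" x 32;
my $iv     = pack 'C16', 1 .. 16;    # the 16-byte IV held in v24
my $ivp    = 4;                      # 4-byte aligned, not necessarily 16-byte aligned

for my $word (0 .. 3) {              # stvewx at offsets 0, r10(=4), r11(=8), r12(=12)
    substr($memory, $ivp + 4 * $word, 4) = substr($iv, 4 * $word, 4);
}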
@@ -872,18 +935,21 @@ _vpaes_schedule_core:
# encrypting, output zeroth round key after transform
li r8, 0x30 # mov \$0x30,%r8d
- addi r10, r12, 0x80 # lea .Lk_sr(%rip),%r10
+ li r9, 4
+ li r10, 8
+ li r11, 12
?lvsr $outperm, 0, $out # prepare for unaligned access
vnor $outmask, v9, v9 # 0xff..ff
- lvx $outhead, 0, $out
?vperm $outmask, v9, $outmask, $outperm
#stvx v0, 0, $out # vmovdqu %xmm0, (%rdx)
- vperm v1, v0, v0, $outperm # rotate right/left
- vsel v2, $outhead, v1, $outmask
- vmr $outhead, v1
- stvx v2, 0, $out
+ vperm $outhead, v0, v0, $outperm # rotate right/left
+ stvewx $outhead, 0, $out # some are superfluous
+ stvewx $outhead, r9, $out
+ stvewx $outhead, r10, $out
+ addi r10, r12, 0x80 # lea .Lk_sr(%rip),%r10
+ stvewx $outhead, r11, $out
b Lschedule_go
Lschedule_am_decrypting:
@@ -893,20 +959,24 @@ Lschedule_am_decrypting:
addi r10, r12, 0x80 # lea .Lk_sr(%rip),%r10
# decrypting, output zeroth round key after shiftrows
lvx v1, r8, r10 # vmovdqa (%r8,%r10), %xmm1
+ li r9, 4
+ li r10, 8
+ li r11, 12
vperm v4, v3, v3, v1 # vpshufb %xmm1, %xmm3, %xmm3
neg r0, $out # prepare for unaligned access
?lvsl $outperm, 0, r0
- addi $out, $out, 15 # 15 is not typo
vnor $outmask, v9, v9 # 0xff..ff
- lvx $outhead, 0, $out
?vperm $outmask, $outmask, v9, $outperm
#stvx v4, 0, $out # vmovdqu %xmm3, (%rdx)
- vperm v4, v4, v4, $outperm # rotate right/left
- vsel v2, $outhead, v4, $outmask
- vmr $outhead, v4
- stvx v2, 0, $out
+ vperm $outhead, v4, v4, $outperm # rotate right/left
+ stvewx $outhead, 0, $out # some are superfluous
+ stvewx $outhead, r9, $out
+ stvewx $outhead, r10, $out
+ addi r10, r12, 0x80 # lea .Lk_sr(%rip),%r10
+ stvewx $outhead, r11, $out
+ addi $out, $out, 15 # 15 is not typo
xori r8, r8, 0x30 # xor \$0x30, %r8
Lschedule_go:
@@ -1038,14 +1108,15 @@ Lschedule_mangle_last:
#stvx v0, r0, $out # vmovdqu %xmm0, (%rdx) # save last key
vperm v0, v0, v0, $outperm # rotate right/left
+ li r10, 4
vsel v2, $outhead, v0, $outmask
- vmr $outhead, v0
+ li r11, 8
stvx v2, 0, $out
-
- addi $out, $out, 15 # 15 is not typo
- lvx v1, 0, $out # redundant in aligned case
- vsel v1, $outhead, v1, $outmask
- stvx v1, 0, $out
+ li r12, 12
+ stvewx v0, 0, $out # some (or all) are redundant
+ stvewx v0, r10, $out
+ stvewx v0, r11, $out
+ stvewx v0, r12, $out
b Lschedule_mangle_done
.align 4
@@ -1057,15 +1128,18 @@ Lschedule_mangle_last_dec:
bl _vpaes_schedule_transform # output transform
#stvx v0, r0, $out # vmovdqu %xmm0, (%rdx) # save last key
+ addi r9, $out, -15 # -15 is not typo
vperm v0, v0, v0, $outperm # rotate right/left
+ li r10, 4
vsel v2, $outhead, v0, $outmask
- vmr $outhead, v0
+ li r11, 8
stvx v2, 0, $out
+ li r12, 12
+ stvewx v0, 0, r9 # some (or all) are redundant
+ stvewx v0, r10, r9
+ stvewx v0, r11, r9
+ stvewx v0, r12, r9
- addi $out, $out, -15 # -15 is not typo
- lvx v1, 0, $out # redundant in aligned case
- vsel v1, $outhead, v1, $outmask
- stvx v1, 0, $out
Lschedule_mangle_done:
mtlr r7