author     jkim <jkim@FreeBSD.org>	2015-12-03 17:22:58 +0000
committer  jkim <jkim@FreeBSD.org>	2015-12-03 17:22:58 +0000
commit     afd52a5fc90e70242dbb0e7d29987c976eb993e0 (patch)
tree       b0284af4e4144e27eb9f39e88c53868060774b16 /crypto/aes/asm
parent     64cb0c902e312216cdc4c826fc0be9ba9e1bf4da (diff)
download   FreeBSD-src-afd52a5fc90e70242dbb0e7d29987c976eb993e0.zip
           FreeBSD-src-afd52a5fc90e70242dbb0e7d29987c976eb993e0.tar.gz
Import OpenSSL 1.0.2e.
Diffstat (limited to 'crypto/aes/asm')
-rwxr-xr-x  crypto/aes/asm/aes-586.pl             |   6
-rwxr-xr-x  crypto/aes/asm/aesni-mb-x86_64.pl     |   2
-rwxr-xr-x  crypto/aes/asm/aesni-sha1-x86_64.pl   |   2
-rwxr-xr-x  crypto/aes/asm/aesni-sha256-x86_64.pl |   9
-rwxr-xr-x  crypto/aes/asm/aesni-x86.pl           |   2
-rwxr-xr-x  crypto/aes/asm/vpaes-ppc.pl           | 198
6 files changed, 145 insertions, 74 deletions
diff --git a/crypto/aes/asm/aes-586.pl b/crypto/aes/asm/aes-586.pl
index 451d0e0..60286ecb 100755
--- a/crypto/aes/asm/aes-586.pl
+++ b/crypto/aes/asm/aes-586.pl
@@ -45,7 +45,7 @@
 # the undertaken effort was that it appeared that in tight IA-32
 # register window little-endian flavor could achieve slightly higher
 # Instruction Level Parallelism, and it indeed resulted in up to 15%
-# better performance on most recent µ-archs...
+# better performance on most recent µ-archs...
 #
 # Third version adds AES_cbc_encrypt implementation, which resulted in
 # up to 40% performance imrovement of CBC benchmark results. 40% was
@@ -224,7 +224,7 @@ sub _data_word() { my $i; while(defined($i=shift)) { &data_word($i,$i); } }
 $speed_limit=512;	# chunks smaller than $speed_limit are
 			# processed with compact routine in CBC mode
 $small_footprint=1;	# $small_footprint=1 code is ~5% slower [on
-			# recent µ-archs], but ~5 times smaller!
+			# recent µ-archs], but ~5 times smaller!
 			# I favor compact code to minimize cache
 			# contention and in hope to "collect" 5% back
 			# in real-life applications...
@@ -565,7 +565,7 @@ sub enctransform()
 # Performance is not actually extraordinary in comparison to pure
 # x86 code. In particular encrypt performance is virtually the same.
 # Decrypt performance on the other hand is 15-20% better on newer
-# µ-archs [but we're thankful for *any* improvement here], and ~50%
+# µ-archs [but we're thankful for *any* improvement here], and ~50%
 # better on PIII:-) And additionally on the pros side this code
 # eliminates redundant references to stack and thus relieves/
 # minimizes the pressure on the memory bus.
diff --git a/crypto/aes/asm/aesni-mb-x86_64.pl b/crypto/aes/asm/aesni-mb-x86_64.pl
index 33b1aed..5a100fa 100755
--- a/crypto/aes/asm/aesni-mb-x86_64.pl
+++ b/crypto/aes/asm/aesni-mb-x86_64.pl
@@ -63,7 +63,7 @@ if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
 	$avx = ($1>=10) + ($1>=11);
 }
 
-if (!$avx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9]\.[0-9]+)/) {
+if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|based on LLVM) ([3-9]\.[0-9]+)/) {
 	$avx = ($2>=3.0) + ($2>3.0);
 }
 
diff --git a/crypto/aes/asm/aesni-sha1-x86_64.pl b/crypto/aes/asm/aesni-sha1-x86_64.pl
index 97992ad..c803cde 100755
--- a/crypto/aes/asm/aesni-sha1-x86_64.pl
+++ b/crypto/aes/asm/aesni-sha1-x86_64.pl
@@ -94,7 +94,7 @@ $avx=1 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
 $avx=1 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
 	   `ml64 2>&1` =~ /Version ([0-9]+)\./ &&
 	   $1>=10);
-$avx=1 if (!$avx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9]\.[0-9]+)/ && $2>=3.0);
+$avx=1 if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|based on LLVM) ([3-9]\.[0-9]+)/ && $2>=3.0);
 
 $shaext=1;	### set to zero if compiling for 1.0.1
diff --git a/crypto/aes/asm/aesni-sha256-x86_64.pl b/crypto/aes/asm/aesni-sha256-x86_64.pl
index 19b0433..bfe2926 100755
--- a/crypto/aes/asm/aesni-sha256-x86_64.pl
+++ b/crypto/aes/asm/aesni-sha256-x86_64.pl
@@ -59,7 +59,7 @@ if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
 	$avx = ($1>=10) + ($1>=12);
 }
 
-if (!$avx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9]\.[0-9]+)/) {
+if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|based on LLVM) ([3-9]\.[0-9]+)/) {
 	$avx = ($2>=3.0) + ($2>3.0);
 }
 
@@ -139,11 +139,8 @@ $code.=<<___ if ($avx>1);
 	je	${func}_avx2
 ___
 $code.=<<___;
-	and	\$`1<<30`,%eax		# mask "Intel CPU" bit
-	and	\$`1<<28|1<<9`,%r10d	# mask AVX+SSSE3 bits
-	or	%eax,%r10d
-	cmp	\$`1<<28|1<<9|1<<30`,%r10d
-	je	${func}_avx
+	and	\$`1<<28`,%r10d		# check for AVX
+	jnz	${func}_avx
 	ud2
 ___
 }
diff --git a/crypto/aes/asm/aesni-x86.pl b/crypto/aes/asm/aesni-x86.pl
index f67df8c..9b2e37a 100755
--- a/crypto/aes/asm/aesni-x86.pl
+++ b/crypto/aes/asm/aesni-x86.pl
@@ -88,7 +88,7 @@
 $inout3="xmm5";	$in1="xmm5";
 $inout4="xmm6";	$in0="xmm6";
 $inout5="xmm7";	$ivec="xmm7";
-# AESNI extenstion
+# AESNI extension
 sub aeskeygenassist
 { my($dst,$src,$imm)=@_;
     if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
diff --git a/crypto/aes/asm/vpaes-ppc.pl b/crypto/aes/asm/vpaes-ppc.pl
index 7fda60e..1759ae9 100755
--- a/crypto/aes/asm/vpaes-ppc.pl
+++ b/crypto/aes/asm/vpaes-ppc.pl
@@ -337,24 +337,27 @@ Lenc_entry:
 	addi	$inp, $inp, 15		# 15 is not a typo
 	?lvsr	$outperm, 0, $out
 	?lvsl	$keyperm, 0, $key	# prepare for unaligned access
-	vnor	$outmask, v7, v7	# 0xff..ff
 	lvx	$inptail, 0, $inp	# redundant in aligned case
-	?vperm	$outmask, v7, $outmask, $outperm
-	lvx	$outhead, 0, $out
 	?vperm	v0, v0, $inptail, $inpperm
 
 	bl	_vpaes_encrypt_core
 
+	andi.	r8, $out, 15
+	li	r9, 16
+	beq	Lenc_out_aligned
+
 	vperm	v0, v0, v0, $outperm	# rotate right/left
-	vsel	v1, $outhead, v0, $outmask
-	vmr	$outhead, v0
-	stvx	v1, 0, $out
-	addi	$out, $out, 15		# 15 is not a typo
-	########
+	mtctr	r9
+Lenc_out_unaligned:
+	stvebx	v0, 0, $out
+	addi	$out, $out, 1
+	bdnz	Lenc_out_unaligned
+	b	Lenc_done
-	lvx	v1, 0, $out		# redundant in aligned case
-	vsel	v1, $outhead, v1, $outmask
-	stvx	v1, 0, $out
+.align	4
+Lenc_out_aligned:
+	stvx	v0, 0, $out
+Lenc_done:
 
 	li	r10,`15+6*$SIZE_T`
 	li	r11,`31+6*$SIZE_T`
@@ -566,24 +569,27 @@ Ldec_entry:
 	addi	$inp, $inp, 15		# 15 is not a typo
 	?lvsr	$outperm, 0, $out
 	?lvsl	$keyperm, 0, $key
-	vnor	$outmask, v7, v7	# 0xff..ff
 	lvx	$inptail, 0, $inp	# redundant in aligned case
-	?vperm	$outmask, v7, $outmask, $outperm
-	lvx	$outhead, 0, $out
 	?vperm	v0, v0, $inptail, $inpperm
 
 	bl	_vpaes_decrypt_core
 
+	andi.	r8, $out, 15
+	li	r9, 16
+	beq	Ldec_out_aligned
+
 	vperm	v0, v0, v0, $outperm	# rotate right/left
-	vsel	v1, $outhead, v0, $outmask
-	vmr	$outhead, v0
-	stvx	v1, 0, $out
-	addi	$out, $out, 15		# 15 is not a typo
-	########
+	mtctr	r9
+Ldec_out_unaligned:
+	stvebx	v0, 0, $out
+	addi	$out, $out, 1
+	bdnz	Ldec_out_unaligned
+	b	Ldec_done
-	lvx	v1, 0, $out		# redundant in aligned case
-	vsel	v1, $outhead, v1, $outmask
-	stvx	v1, 0, $out
+.align	4
+Ldec_out_aligned:
+	stvx	v0, 0, $out
+Ldec_done:
 
 	li	r10,`15+6*$SIZE_T`
 	li	r11,`31+6*$SIZE_T`
@@ -658,11 +664,11 @@ Ldec_entry:
 	$PUSH	r0, `$FRAME+$SIZE_T*2+$LRSAVE`($sp)
 
 	and	r30, r5, r9		# copy length&-16
+	andi.	r9, $out, 15		# is $out aligned?
 	mr	r5, r6			# copy pointer to key
 	mr	r31, r7			# copy pointer to iv
-	blt	Lcbc_abort
-	cmpwi	r8, 0			# test direction
 	li	r6, -1
+	mcrf	cr1, cr0		# put aside $out alignment flag
 	mr	r7, r12			# copy vrsave
 	mtspr	256, r6			# preserve all AltiVec registers
@@ -672,6 +678,7 @@ Ldec_entry:
 	lvx	v25, r9, r31
 	?vperm	v24, v24, v25, $inpperm
 
+	cmpwi	r8, 0			# test direction
 	neg	r8, $inp		# prepare for unaligned access
 	vxor	v7, v7, v7
 	?lvsl	$keyperm, 0, $key
@@ -681,13 +688,37 @@ Ldec_entry:
 	lvx	$inptail, 0, $inp
 	?vperm	$outmask, v7, $outmask, $outperm
 	addi	$inp, $inp, 15		# 15 is not a typo
-	lvx	$outhead, 0, $out
 
 	beq	Lcbc_decrypt
 
 	bl	_vpaes_encrypt_preheat
 	li	r0, 16
 
+	beq	cr1, Lcbc_enc_loop	# $out is aligned
+
+	vmr	v0, $inptail
+	lvx	$inptail, 0, $inp
+	addi	$inp, $inp, 16
+	?vperm	v0, v0, $inptail, $inpperm
+	vxor	v0, v0, v24		# ^= iv
+
+	bl	_vpaes_encrypt_core
+
+	andi.	r8, $out, 15
+	vmr	v24, v0			# put aside iv
+	sub	r9, $out, r8
+	vperm	$outhead, v0, v0, $outperm	# rotate right/left
+
+Lcbc_enc_head:
+	stvebx	$outhead, r8, r9
+	cmpwi	r8, 15
+	addi	r8, r8, 1
+	bne	Lcbc_enc_head
+
+	sub.	r30, r30, r0		# len -= 16
+	addi	$out, $out, 16
+	beq	Lcbc_unaligned_done
+
 Lcbc_enc_loop:
 	vmr	v0, $inptail
 	lvx	$inptail, 0, $inp
@@ -713,6 +744,32 @@ Lcbc_decrypt:
 	bl	_vpaes_decrypt_preheat
 	li	r0, 16
 
+	beq	cr1, Lcbc_dec_loop	# $out is aligned
+
+	vmr	v0, $inptail
+	lvx	$inptail, 0, $inp
+	addi	$inp, $inp, 16
+	?vperm	v0, v0, $inptail, $inpperm
+	vmr	v25, v0			# put aside input
+
+	bl	_vpaes_decrypt_core
+
+	andi.	r8, $out, 15
+	vxor	v0, v0, v24		# ^= iv
+	vmr	v24, v25
+	sub	r9, $out, r8
+	vperm	$outhead, v0, v0, $outperm	# rotate right/left
+
+Lcbc_dec_head:
+	stvebx	$outhead, r8, r9
+	cmpwi	r8, 15
+	addi	r8, r8, 1
+	bne	Lcbc_dec_head
+
+	sub.	r30, r30, r0		# len -= 16
+	addi	$out, $out, 16
+	beq	Lcbc_unaligned_done
+
 Lcbc_dec_loop:
 	vmr	v0, $inptail
 	lvx	$inptail, 0, $inp
@@ -733,23 +790,29 @@ Lcbc_dec_loop:
 	bne	Lcbc_dec_loop
 
 Lcbc_done:
-	addi	$out, $out, -1
-	lvx	v1, 0, $out		# redundant in aligned case
-	vsel	v1, $outhead, v1, $outmask
-	stvx	v1, 0, $out
-
+	beq	cr1, Lcbc_write_iv	# $out is aligned
+
+Lcbc_unaligned_done:
+	andi.	r8, $out, 15
+	sub	$out, $out, r8
+	li	r9, 0
+Lcbc_tail:
+	stvebx	$outhead, r9, $out
+	addi	r9, r9, 1
+	cmpw	r9, r8
+	bne	Lcbc_tail
+
+Lcbc_write_iv:
 	neg	r8, r31			# write [potentially unaligned] iv
+	li	r10, 4
 	?lvsl	$outperm, 0, r8
-	li	r6, 15
-	vnor	$outmask, v7, v7	# 0xff..ff
-	?vperm	$outmask, v7, $outmask, $outperm
-	lvx	$outhead, 0, r31
+	li	r11, 8
+	li	r12, 12
 	vperm	v24, v24, v24, $outperm	# rotate right/left
-	vsel	v0, $outhead, v24, $outmask
-	lvx	v1, r6, r31
-	stvx	v0, 0, r31
-	vsel	v1, v24, v1, $outmask
-	stvx	v1, r6, r31
+	stvewx	v24, 0, r31		# ivp is at least 32-bit aligned
+	stvewx	v24, r10, r31
+	stvewx	v24, r11, r31
+	stvewx	v24, r12, r31
 
 	mtspr	256, r7			# restore vrsave
 	li	r10,`15+6*$SIZE_T`
@@ -872,18 +935,21 @@ _vpaes_schedule_core:
 	# encrypting, output zeroth round key after transform
 	li	r8, 0x30		# mov	\$0x30,	%r8d
-	addi	r10, r12, 0x80		# lea	.Lk_sr(%rip),%r10
+	li	r9, 4
+	li	r10, 8
+	li	r11, 12
 
 	?lvsr	$outperm, 0, $out	# prepare for unaligned access
 	vnor	$outmask, v9, v9	# 0xff..ff
-	lvx	$outhead, 0, $out
 	?vperm	$outmask, v9, $outmask, $outperm
 
 	#stvx	v0, 0, $out		# vmovdqu	%xmm0,	(%rdx)
-	vperm	v1, v0, v0, $outperm	# rotate right/left
-	vsel	v2, $outhead, v1, $outmask
-	vmr	$outhead, v1
-	stvx	v2, 0, $out
+	vperm	$outhead, v0, v0, $outperm	# rotate right/left
+	stvewx	$outhead, 0, $out	# some are superfluous
+	stvewx	$outhead, r9, $out
+	stvewx	$outhead, r10, $out
+	addi	r10, r12, 0x80		# lea	.Lk_sr(%rip),%r10
+	stvewx	$outhead, r11, $out
 	b	Lschedule_go
 
@@ -893,20 +959,24 @@ Lschedule_am_decrypting:
 	addi	r10, r12, 0x80		# lea	.Lk_sr(%rip),%r10
 	# decrypting, output zeroth round key after shiftrows
 	lvx	v1, r8, r10		# vmovdqa	(%r8,%r10),	%xmm1
+	li	r9, 4
+	li	r10, 8
+	li	r11, 12
 	vperm	v4, v3, v3, v1		# vpshufb	%xmm1,	%xmm3,	%xmm3
 
 	neg	r0, $out		# prepare for unaligned access
 	?lvsl	$outperm, 0, r0
-	addi	$out, $out, 15		# 15 is not typo
 	vnor	$outmask, v9, v9	# 0xff..ff
-	lvx	$outhead, 0, $out
 	?vperm	$outmask, $outmask, v9, $outperm
 
 	#stvx	v4, 0, $out		# vmovdqu	%xmm3,	(%rdx)
-	vperm	v4, v4, v4, $outperm	# rotate right/left
-	vsel	v2, $outhead, v4, $outmask
-	vmr	$outhead, v4
-	stvx	v2, 0, $out
+	vperm	$outhead, v4, v4, $outperm	# rotate right/left
+	stvewx	$outhead, 0, $out	# some are superfluous
+	stvewx	$outhead, r9, $out
+	stvewx	$outhead, r10, $out
+	addi	r10, r12, 0x80		# lea	.Lk_sr(%rip),%r10
+	stvewx	$outhead, r11, $out
+	addi	$out, $out, 15		# 15 is not typo
 	xori	r8, r8, 0x30		# xor	\$0x30, %r8
 
 Lschedule_go:
@@ -1038,14 +1108,15 @@ Lschedule_mangle_last:
 	#stvx	v0, r0, $out		# vmovdqu	%xmm0,	(%rdx)		# save last key
 	vperm	v0, v0, v0, $outperm	# rotate right/left
+	li	r10, 4
 	vsel	v2, $outhead, v0, $outmask
-	vmr	$outhead, v0
+	li	r11, 8
 	stvx	v2, 0, $out
-
-	addi	$out, $out, 15		# 15 is not typo
-	lvx	v1, 0, $out		# redundant in aligned case
-	vsel	v1, $outhead, v1, $outmask
-	stvx	v1, 0, $out
+	li	r12, 12
+	stvewx	v0, 0, $out		# some (or all) are redundant
+	stvewx	v0, r10, $out
+	stvewx	v0, r11, $out
+	stvewx	v0, r12, $out
 	b	Lschedule_mangle_done
 
.align	4
@@ -1057,15 +1128,18 @@ Lschedule_mangle_last_dec:
 	bl	_vpaes_schedule_transform	# output transform
 
 	#stvx	v0, r0, $out		# vmovdqu	%xmm0,	(%rdx)		# save last key
+	addi	r9, $out, -15		# -15 is not typo
 	vperm	v0, v0, v0, $outperm	# rotate right/left
+	li	r10, 4
 	vsel	v2, $outhead, v0, $outmask
-	vmr	$outhead, v0
+	li	r11, 8
 	stvx	v2, 0, $out
+	li	r12, 12
+	stvewx	v0, 0, r9		# some (or all) are redundant
+	stvewx	v0, r10, r9
+	stvewx	v0, r11, r9
+	stvewx	v0, r12, r9
-	addi	$out, $out, -15		# -15 is not typo
-	lvx	v1, 0, $out		# redundant in aligned case
-	vsel	v1, $outhead, v1, $outmask
-	stvx	v1, 0, $out
 
 Lschedule_mangle_done:
 	mtlr	r7
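Context for the recurring one-line change in the three aesni-*-x86_64.pl files above: the old probe only recognized `cc -v` banners that begin with "clang version" or contain "based on LLVM", which misses compilers such as Apple's clang once it began reporting itself as "Apple LLVM version N.N.N" without a "based on LLVM" suffix, leaving the AVX code paths disabled. A minimal standalone Perl sketch comparing the two patterns; the banner strings below are illustrative examples, not captured compiler output:

#!/usr/bin/env perl
# Sketch only, not part of the import: compare the old and new
# compiler-version probes used by the aesni-*-x86_64.pl scripts.
use strict;
use warnings;

my $old = qr/(^clang version|based on LLVM) ([3-9]\.[0-9]+)/;
my $new = qr/((?:^clang|LLVM) version|based on LLVM) ([3-9]\.[0-9]+)/;

# Hypothetical version banners (illustrative, not real captured output).
my @banners = (
    'clang version 3.4.1 (tags/RELEASE_341/final)',
    'Apple LLVM version 7.0.0 (clang-700.0.72)',
    'Apple LLVM version 6.0 (clang-600.0.57) (based on LLVM 3.5svn)',
);

for my $v (@banners) {
    my $o = $v =~ $old ? $2 : 'no match';   # version seen by old probe
    my $n = $v =~ $new ? $2 : 'no match';   # version seen by new probe
    print "old=$o  new=$n  << $v\n";
}
# The second banner is the interesting case: the old pattern fails
# (no leading "clang version", no "based on LLVM"), while the new one
# matches "LLVM version" and extracts 7.0, so $avx would get set.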