author     jkim <jkim@FreeBSD.org>	2015-12-03 17:22:58 +0000
committer  jkim <jkim@FreeBSD.org>	2015-12-03 17:22:58 +0000
commit     afd52a5fc90e70242dbb0e7d29987c976eb993e0 (patch)
tree       b0284af4e4144e27eb9f39e88c53868060774b16 /crypto/aes/asm
parent     64cb0c902e312216cdc4c826fc0be9ba9e1bf4da (diff)
download   FreeBSD-src-afd52a5fc90e70242dbb0e7d29987c976eb993e0.zip
           FreeBSD-src-afd52a5fc90e70242dbb0e7d29987c976eb993e0.tar.gz
Import OpenSSL 1.0.2e.
Diffstat (limited to 'crypto/aes/asm')
-rwxr-xr-x  crypto/aes/asm/aes-586.pl             |   6
-rwxr-xr-x  crypto/aes/asm/aesni-mb-x86_64.pl     |   2
-rwxr-xr-x  crypto/aes/asm/aesni-sha1-x86_64.pl   |   2
-rwxr-xr-x  crypto/aes/asm/aesni-sha256-x86_64.pl |   9
-rwxr-xr-x  crypto/aes/asm/aesni-x86.pl           |   2
-rwxr-xr-x  crypto/aes/asm/vpaes-ppc.pl           | 198
6 files changed, 145 insertions, 74 deletions
diff --git a/crypto/aes/asm/aes-586.pl b/crypto/aes/asm/aes-586.pl
index 451d0e0..60286ecb 100755
--- a/crypto/aes/asm/aes-586.pl
+++ b/crypto/aes/asm/aes-586.pl
@@ -45,7 +45,7 @@
 # the undertaken effort was that it appeared that in tight IA-32
 # register window little-endian flavor could achieve slightly higher
 # Instruction Level Parallelism, and it indeed resulted in up to 15%
-# better performance on most recent µ-archs...
+# better performance on most recent µ-archs...
 #
 # Third version adds AES_cbc_encrypt implementation, which resulted in
 # up to 40% performance imrovement of CBC benchmark results. 40% was
@@ -224,7 +224,7 @@ sub _data_word() { my $i; while(defined($i=shift)) { &data_word($i,$i); } }
 $speed_limit=512;	# chunks smaller than $speed_limit are
 			# processed with compact routine in CBC mode
 $small_footprint=1;	# $small_footprint=1 code is ~5% slower [on
-			# recent µ-archs], but ~5 times smaller!
+			# recent µ-archs], but ~5 times smaller!
 			# I favor compact code to minimize cache
 			# contention and in hope to "collect" 5% back
 			# in real-life applications...
@@ -565,7 +565,7 @@ sub enctransform()
 # Performance is not actually extraordinary in comparison to pure
 # x86 code. In particular encrypt performance is virtually the same.
 # Decrypt performance on the other hand is 15-20% better on newer
-# µ-archs [but we're thankful for *any* improvement here], and ~50%
+# µ-archs [but we're thankful for *any* improvement here], and ~50%
 # better on PIII:-) And additionally on the pros side this code
 # eliminates redundant references to stack and thus relieves/
 # minimizes the pressure on the memory bus.
diff --git a/crypto/aes/asm/aesni-mb-x86_64.pl b/crypto/aes/asm/aesni-mb-x86_64.pl
index 33b1aed..5a100fa 100755
--- a/crypto/aes/asm/aesni-mb-x86_64.pl
+++ b/crypto/aes/asm/aesni-mb-x86_64.pl
@@ -63,7 +63,7 @@ if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
 	$avx = ($1>=10) + ($1>=11);
 }
 
-if (!$avx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9]\.[0-9]+)/) {
+if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|based on LLVM) ([3-9]\.[0-9]+)/) {
 	$avx = ($2>=3.0) + ($2>3.0);
 }
 
diff --git a/crypto/aes/asm/aesni-sha1-x86_64.pl b/crypto/aes/asm/aesni-sha1-x86_64.pl
index 97992ad..c803cde 100755
--- a/crypto/aes/asm/aesni-sha1-x86_64.pl
+++ b/crypto/aes/asm/aesni-sha1-x86_64.pl
@@ -94,7 +94,7 @@ $avx=1 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
 $avx=1 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
 	   `ml64 2>&1` =~ /Version ([0-9]+)\./ &&
 	   $1>=10);
-$avx=1 if (!$avx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9]\.[0-9]+)/ && $2>=3.0);
+$avx=1 if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|based on LLVM) ([3-9]\.[0-9]+)/ && $2>=3.0);
 
 $shaext=1;	### set to zero if compiling for 1.0.1
diff --git a/crypto/aes/asm/aesni-sha256-x86_64.pl b/crypto/aes/asm/aesni-sha256-x86_64.pl
index 19b0433..bfe2926 100755
--- a/crypto/aes/asm/aesni-sha256-x86_64.pl
+++ b/crypto/aes/asm/aesni-sha256-x86_64.pl
@@ -59,7 +59,7 @@ if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
 	$avx = ($1>=10) + ($1>=12);
 }
 
-if (!$avx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9]\.[0-9]+)/) {
+if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|based on LLVM) ([3-9]\.[0-9]+)/) {
 	$avx = ($2>=3.0) + ($2>3.0);
 }
 
@@ -139,11 +139,8 @@ $code.=<<___ if ($avx>1);
 	je	${func}_avx2
 ___
 $code.=<<___;
-	and	\$`1<<30`,%eax		# mask "Intel CPU" bit
-	and	\$`1<<28|1<<9`,%r10d	# mask AVX+SSSE3 bits
-	or	%eax,%r10d
-	cmp	\$`1<<28|1<<9|1<<30`,%r10d
-	je	${func}_avx
+	and	\$`1<<28`,%r10d		# check for AVX
+	jnz	${func}_avx
 	ud2
 ___
 }
diff --git a/crypto/aes/asm/aesni-x86.pl b/crypto/aes/asm/aesni-x86.pl
index f67df8c..9b2e37a 100755
--- a/crypto/aes/asm/aesni-x86.pl
+++ b/crypto/aes/asm/aesni-x86.pl
@@ -88,7 +88,7 @@
 $inout3="xmm5";	$in1="xmm5";
 $inout4="xmm6";	$in0="xmm6";
 $inout5="xmm7";	$ivec="xmm7";
-# AESNI extenstion
+# AESNI extension
 sub aeskeygenassist
 { my($dst,$src,$imm)=@_;
     if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
diff --git a/crypto/aes/asm/vpaes-ppc.pl b/crypto/aes/asm/vpaes-ppc.pl
index 7fda60e..1759ae9 100755
--- a/crypto/aes/asm/vpaes-ppc.pl
+++ b/crypto/aes/asm/vpaes-ppc.pl
@@ -337,24 +337,27 @@ Lenc_entry:
 	addi	$inp, $inp, 15		# 15 is not a typo
 	?lvsr	$outperm, 0, $out
 	?lvsl	$keyperm, 0, $key	# prepare for unaligned access
-	vnor	$outmask, v7, v7	# 0xff..ff
 	lvx	$inptail, 0, $inp	# redundant in aligned case
-	?vperm	$outmask, v7, $outmask, $outperm
-	lvx	$outhead, 0, $out
 	?vperm	v0, v0, $inptail, $inpperm
 
 	bl	_vpaes_encrypt_core
 
+	andi.	r8, $out, 15
+	li	r9, 16
+	beq	Lenc_out_aligned
+
 	vperm	v0, v0, v0, $outperm	# rotate right/left
-	vsel	v1, $outhead, v0, $outmask
-	vmr	$outhead, v0
-	stvx	v1, 0, $out
-	addi	$out, $out, 15		# 15 is not a typo
-	########
+	mtctr	r9
+Lenc_out_unaligned:
+	stvebx	v0, 0, $out
+	addi	$out, $out, 1
+	bdnz	Lenc_out_unaligned
+	b	Lenc_done
-	lvx	v1, 0, $out		# redundant in aligned case
-	vsel	v1, $outhead, v1, $outmask
-	stvx	v1, 0, $out
+.align	4
+Lenc_out_aligned:
+	stvx	v0, 0, $out
+Lenc_done:
 
 	li	r10,`15+6*$SIZE_T`
 	li	r11,`31+6*$SIZE_T`
@@ -566,24 +569,27 @@ Ldec_entry:
 	addi	$inp, $inp, 15		# 15 is not a typo
 	?lvsr	$outperm, 0, $out
 	?lvsl	$keyperm, 0, $key
-	vnor	$outmask, v7, v7	# 0xff..ff
 	lvx	$inptail, 0, $inp	# redundant in aligned case
-	?vperm	$outmask, v7, $outmask, $outperm
-	lvx	$outhead, 0, $out
 	?vperm	v0, v0, $inptail, $inpperm
 
 	bl	_vpaes_decrypt_core
 
+	andi.	r8, $out, 15
+	li	r9, 16
+	beq	Ldec_out_aligned
+
 	vperm	v0, v0, v0, $outperm	# rotate right/left
-	vsel	v1, $outhead, v0, $outmask
-	vmr	$outhead, v0
-	stvx	v1, 0, $out
-	addi	$out, $out, 15		# 15 is not a typo
-	########
+	mtctr	r9
+Ldec_out_unaligned:
+	stvebx	v0, 0, $out
+	addi	$out, $out, 1
+	bdnz	Ldec_out_unaligned
+	b	Ldec_done
-	lvx	v1, 0, $out		# redundant in aligned case
-	vsel	v1, $outhead, v1, $outmask
-	stvx	v1, 0, $out
+.align	4
+Ldec_out_aligned:
+	stvx	v0, 0, $out
+Ldec_done:
 
 	li	r10,`15+6*$SIZE_T`
 	li	r11,`31+6*$SIZE_T`
@@ -658,11 +664,11 @@ Ldec_entry:
 	$PUSH	r0, `$FRAME+$SIZE_T*2+$LRSAVE`($sp)
 
 	and	r30, r5, r9		# copy length&-16
+	andi.	r9, $out, 15		# is $out aligned?
 	mr	r5, r6			# copy pointer to key
 	mr	r31, r7			# copy pointer to iv
-	blt	Lcbc_abort
-	cmpwi	r8, 0			# test direction
 	li	r6, -1
+	mcrf	cr1, cr0		# put aside $out alignment flag
 	mr	r7, r12			# copy vrsave
 	mtspr	256, r6			# preserve all AltiVec registers
@@ -672,6 +678,7 @@ Ldec_entry:
 	lvx	v25, r9, r31
 	?vperm	v24, v24, v25, $inpperm
 
+	cmpwi	r8, 0			# test direction
 	neg	r8, $inp		# prepare for unaligned access
 	vxor	v7, v7, v7
 	?lvsl	$keyperm, 0, $key
@@ -681,13 +688,37 @@ Ldec_entry:
 	lvx	$inptail, 0, $inp
 	?vperm	$outmask, v7, $outmask, $outperm
 	addi	$inp, $inp, 15		# 15 is not a typo
-	lvx	$outhead, 0, $out
 
 	beq	Lcbc_decrypt
 
 	bl	_vpaes_encrypt_preheat
 	li	r0, 16
 
+	beq	cr1, Lcbc_enc_loop	# $out is aligned
+
+	vmr	v0, $inptail
+	lvx	$inptail, 0, $inp
+	addi	$inp, $inp, 16
+	?vperm	v0, v0, $inptail, $inpperm
+	vxor	v0, v0, v24		# ^= iv
+
+	bl	_vpaes_encrypt_core
+
+	andi.	r8, $out, 15
+	vmr	v24, v0			# put aside iv
+	sub	r9, $out, r8
+	vperm	$outhead, v0, v0, $outperm	# rotate right/left
+
+Lcbc_enc_head:
+	stvebx	$outhead, r8, r9
+	cmpwi	r8, 15
+	addi	r8, r8, 1
+	bne	Lcbc_enc_head
+
+	sub.	r30, r30, r0		# len -= 16
+	addi	$out, $out, 16
+	beq	Lcbc_unaligned_done
+
 Lcbc_enc_loop:
 	vmr	v0, $inptail
 	lvx	$inptail, 0, $inp
@@ -713,6 +744,32 @@ Lcbc_decrypt:
 	bl	_vpaes_decrypt_preheat
 	li	r0, 16
 
+	beq	cr1, Lcbc_dec_loop	# $out is aligned
+
+	vmr	v0, $inptail
+	lvx	$inptail, 0, $inp
+	addi	$inp, $inp, 16
+	?vperm	v0, v0, $inptail, $inpperm
+	vmr	v25, v0			# put aside input
+
+	bl	_vpaes_decrypt_core
+
+	andi.	r8, $out, 15
+	vxor	v0, v0, v24		# ^= iv
+	vmr	v24, v25
+	sub	r9, $out, r8
+	vperm	$outhead, v0, v0, $outperm	# rotate right/left
+
+Lcbc_dec_head:
+	stvebx	$outhead, r8, r9
+	cmpwi	r8, 15
+	addi	r8, r8, 1
+	bne	Lcbc_dec_head
+
+	sub.	r30, r30, r0		# len -= 16
+	addi	$out, $out, 16
+	beq	Lcbc_unaligned_done
+
 Lcbc_dec_loop:
 	vmr	v0, $inptail
 	lvx	$inptail, 0, $inp
@@ -733,23 +790,29 @@ Lcbc_dec_loop:
 	bne	Lcbc_dec_loop
 
 Lcbc_done:
-	addi	$out, $out, -1
-	lvx	v1, 0, $out		# redundant in aligned case
-	vsel	v1, $outhead, v1, $outmask
-	stvx	v1, 0, $out
-
+	beq	cr1, Lcbc_write_iv	# $out is aligned
+
+Lcbc_unaligned_done:
+	andi.	r8, $out, 15
+	sub	$out, $out, r8
+	li	r9, 0
+Lcbc_tail:
+	stvebx	$outhead, r9, $out
+	addi	r9, r9, 1
+	cmpw	r9, r8
+	bne	Lcbc_tail
+
+Lcbc_write_iv:
 	neg	r8, r31			# write [potentially unaligned] iv
+	li	r10, 4
 	?lvsl	$outperm, 0, r8
-	li	r6, 15
-	vnor	$outmask, v7, v7	# 0xff..ff
-	?vperm	$outmask, v7, $outmask, $outperm
-	lvx	$outhead, 0, r31
+	li	r11, 8
+	li	r12, 12
 	vperm	v24, v24, v24, $outperm	# rotate right/left
-	vsel	v0, $outhead, v24, $outmask
-	lvx	v1, r6, r31
-	stvx	v0, 0, r31
-	vsel	v1, v24, v1, $outmask
-	stvx	v1, r6, r31
+	stvewx	v24, 0, r31		# ivp is at least 32-bit aligned
+	stvewx	v24, r10, r31
+	stvewx	v24, r11, r31
+	stvewx	v24, r12, r31
 
 	mtspr	256, r7			# restore vrsave
 	li	r10,`15+6*$SIZE_T`
@@ -872,18 +935,21 @@ _vpaes_schedule_core:
 	# encrypting, output zeroth round key after transform
 	li	r8, 0x30		# mov	\$0x30,	%r8d
-	addi	r10, r12, 0x80		# lea	.Lk_sr(%rip),%r10
+	li	r9, 4
+	li	r10, 8
+	li	r11, 12
 
 	?lvsr	$outperm, 0, $out	# prepare for unaligned access
 	vnor	$outmask, v9, v9	# 0xff..ff
-	lvx	$outhead, 0, $out
 	?vperm	$outmask, v9, $outmask, $outperm
 
 	#stvx	v0, 0, $out		# vmovdqu	%xmm0,	(%rdx)
-	vperm	v1, v0, v0, $outperm	# rotate right/left
-	vsel	v2, $outhead, v1, $outmask
-	vmr	$outhead, v1
-	stvx	v2, 0, $out
+	vperm	$outhead, v0, v0, $outperm	# rotate right/left
+	stvewx	$outhead, 0, $out	# some are superfluous
+	stvewx	$outhead, r9, $out
+	stvewx	$outhead, r10, $out
+	addi	r10, r12, 0x80		# lea	.Lk_sr(%rip),%r10
+	stvewx	$outhead, r11, $out
 	b	Lschedule_go
 
@@ -893,20 +959,24 @@ Lschedule_am_decrypting:
 	addi	r10, r12, 0x80		# lea	.Lk_sr(%rip),%r10
 	# decrypting, output zeroth round key after shiftrows
 	lvx	v1, r8, r10		# vmovdqa	(%r8,%r10),	%xmm1
+	li	r9, 4
+	li	r10, 8
+	li	r11, 12
 	vperm	v4, v3, v3, v1		# vpshufb	%xmm1,	%xmm3,	%xmm3
 
 	neg	r0, $out		# prepare for unaligned access
 	?lvsl	$outperm, 0, r0
-	addi	$out, $out, 15		# 15 is not typo
 	vnor	$outmask, v9, v9	# 0xff..ff
-	lvx	$outhead, 0, $out
 	?vperm	$outmask, $outmask, v9, $outperm
 
 	#stvx	v4, 0, $out		# vmovdqu	%xmm3,	(%rdx)
-	vperm	v4, v4, v4, $outperm	# rotate right/left
-	vsel	v2, $outhead, v4, $outmask
-	vmr	$outhead, v4
-	stvx	v2, 0, $out
+	vperm	$outhead, v4, v4, $outperm	# rotate right/left
+	stvewx	$outhead, 0, $out	# some are superfluous
+	stvewx	$outhead, r9, $out
+	stvewx	$outhead, r10, $out
+	addi	r10, r12, 0x80		# lea	.Lk_sr(%rip),%r10
+	stvewx	$outhead, r11, $out
+	addi	$out, $out, 15		# 15 is not typo
 	xori	r8, r8, 0x30		# xor	\$0x30, %r8
 
 Lschedule_go:
@@ -1038,14 +1108,15 @@ Lschedule_mangle_last:
 	#stvx	v0, r0, $out		# vmovdqu	%xmm0,	(%rdx)		# save last key
 	vperm	v0, v0, v0, $outperm	# rotate right/left
+	li	r10, 4
 	vsel	v2, $outhead, v0, $outmask
-	vmr	$outhead, v0
+	li	r11, 8
 	stvx	v2, 0, $out
-
-	addi	$out, $out, 15		# 15 is not typo
-	lvx	v1, 0, $out		# redundant in aligned case
-	vsel	v1, $outhead, v1, $outmask
-	stvx	v1, 0, $out
+	li	r12, 12
+	stvewx	v0, 0, $out		# some (or all) are redundant
+	stvewx	v0, r10, $out
+	stvewx	v0, r11, $out
+	stvewx	v0, r12, $out
 	b	Lschedule_mangle_done
 
.align	4
@@ -1057,15 +1128,18 @@ Lschedule_mangle_last_dec:
 	bl	_vpaes_schedule_transform	# output transform
 
 	#stvx	v0, r0, $out		# vmovdqu	%xmm0,	(%rdx)		# save last key
+	addi	r9, $out, -15		# -15 is not typo
 	vperm	v0, v0, v0, $outperm	# rotate right/left
+	li	r10, 4
 	vsel	v2, $outhead, v0, $outmask
-	vmr	$outhead, v0
+	li	r11, 8
 	stvx	v2, 0, $out
+	li	r12, 12
+	stvewx	v0, 0, r9		# some (or all) are redundant
+	stvewx	v0, r10, r9
+	stvewx	v0, r11, r9
+	stvewx	v0, r12, r9
-	addi	$out, $out, -15		# -15 is not typo
-	lvx	v1, 0, $out		# redundant in aligned case
-	vsel	v1, $outhead, v1, $outmask
-	stvx	v1, 0, $out
 
 Lschedule_mangle_done:
 	mtlr	r7
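Context for the recurring one-line change in the three aesni-*-x86_64.pl files above: the old probe only recognized `cc -v` banners that begin with "clang version" or contain "based on LLVM", which misses compilers such as Apple's clang once it began reporting itself as "Apple LLVM version N.N.N" without a "based on LLVM" suffix, leaving the AVX code paths disabled. A minimal standalone Perl sketch comparing the two patterns; the banner strings below are illustrative examples, not captured compiler output:

#!/usr/bin/env perl
# Sketch only, not part of the import: compare the old and new
# compiler-version probes used by the aesni-*-x86_64.pl scripts.
use strict;
use warnings;

my $old = qr/(^clang version|based on LLVM) ([3-9]\.[0-9]+)/;
my $new = qr/((?:^clang|LLVM) version|based on LLVM) ([3-9]\.[0-9]+)/;

# Hypothetical version banners (illustrative, not real captured output).
my @banners = (
    'clang version 3.4.1 (tags/RELEASE_341/final)',
    'Apple LLVM version 7.0.0 (clang-700.0.72)',
    'Apple LLVM version 6.0 (clang-600.0.57) (based on LLVM 3.5svn)',
);

for my $v (@banners) {
    my $o = $v =~ $old ? $2 : 'no match';   # version seen by old probe
    my $n = $v =~ $new ? $2 : 'no match';   # version seen by new probe
    print "old=$o  new=$n  << $v\n";
}
# The second banner is the interesting case: the old pattern fails
# (no leading "clang version", no "based on LLVM"), while the new one
# matches "LLVM version" and extracts 7.0, so $avx would get set.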