author    jkim <jkim@FreeBSD.org>  2016-03-01 22:08:28 +0000
committer jkim <jkim@FreeBSD.org>  2016-03-01 22:08:28 +0000
commit    de2249f81ccf8ad3eac972b7558a16a3bab99325 (patch)
tree      dd0f91775301f47811f2b56ba60043ebdf64aea9 /secure/lib/libcrypto/amd64
parent    0e774f6016f1dfb6a8f55462cc815c3b4da580b2 (diff)
parent    72d32bf80dfdcfe0e69da200b66f195e919653f7 (diff)
Merge OpenSSL 1.0.2g.
Relnotes: yes
Diffstat (limited to 'secure/lib/libcrypto/amd64')
-rw-r--r--  secure/lib/libcrypto/amd64/aes-x86_64.S            70
-rw-r--r--  secure/lib/libcrypto/amd64/aesni-sha1-x86_64.S      8
-rw-r--r--  secure/lib/libcrypto/amd64/aesni-x86_64.S          112
-rw-r--r--  secure/lib/libcrypto/amd64/bsaes-x86_64.S          158
-rw-r--r--  secure/lib/libcrypto/amd64/cmll-x86_64.S            2
-rw-r--r--  secure/lib/libcrypto/amd64/ecp_nistz256-x86_64.S   27
-rw-r--r--  secure/lib/libcrypto/amd64/ghash-x86_64.S          82
-rw-r--r--  secure/lib/libcrypto/amd64/md5-x86_64.S            34
-rw-r--r--  secure/lib/libcrypto/amd64/rsaz-x86_64.S           218
-rw-r--r--  secure/lib/libcrypto/amd64/sha1-mb-x86_64.S        16
-rw-r--r--  secure/lib/libcrypto/amd64/sha1-x86_64.S            8
-rw-r--r--  secure/lib/libcrypto/amd64/sha256-mb-x86_64.S      84
-rw-r--r--  secure/lib/libcrypto/amd64/sha256-x86_64.S         44
-rw-r--r--  secure/lib/libcrypto/amd64/vpaes-x86_64.S          20
-rw-r--r--  secure/lib/libcrypto/amd64/x86_64-gf2m.S            2
-rw-r--r--  secure/lib/libcrypto/amd64/x86_64-mont.S           108
-rw-r--r--  secure/lib/libcrypto/amd64/x86_64-mont5.S          901
-rw-r--r--  secure/lib/libcrypto/amd64/x86_64cpuid.S           48
18 files changed, 1242 insertions(+), 700 deletions(-)
diff --git a/secure/lib/libcrypto/amd64/aes-x86_64.S b/secure/lib/libcrypto/amd64/aes-x86_64.S
index 3243d6d..4c39bba 100644
--- a/secure/lib/libcrypto/amd64/aes-x86_64.S
+++ b/secure/lib/libcrypto/amd64/aes-x86_64.S
@@ -82,8 +82,8 @@ _x86_64_AES_encrypt:
movl 0(%r14,%rdi,8),%edi
movl 0(%r14,%rbp,8),%ebp
- andl $65280,%edi
- andl $65280,%ebp
+ andl $0x0000ff00,%edi
+ andl $0x0000ff00,%ebp
xorl %edi,%r10d
xorl %ebp,%r11d
@@ -95,8 +95,8 @@ _x86_64_AES_encrypt:
movl 0(%r14,%rsi,8),%esi
movl 0(%r14,%rdi,8),%edi
- andl $65280,%esi
- andl $65280,%edi
+ andl $0x0000ff00,%esi
+ andl $0x0000ff00,%edi
shrl $16,%ebx
xorl %esi,%r12d
xorl %edi,%r8d
@@ -109,9 +109,9 @@ _x86_64_AES_encrypt:
movl 0(%r14,%rdi,8),%edi
movl 0(%r14,%rbp,8),%ebp
- andl $16711680,%esi
- andl $16711680,%edi
- andl $16711680,%ebp
+ andl $0x00ff0000,%esi
+ andl $0x00ff0000,%edi
+ andl $0x00ff0000,%ebp
xorl %esi,%r10d
xorl %edi,%r11d
@@ -124,9 +124,9 @@ _x86_64_AES_encrypt:
movl 2(%r14,%rdi,8),%edi
movl 2(%r14,%rbp,8),%ebp
- andl $16711680,%esi
- andl $4278190080,%edi
- andl $4278190080,%ebp
+ andl $0x00ff0000,%esi
+ andl $0xff000000,%edi
+ andl $0xff000000,%ebp
xorl %esi,%r8d
xorl %edi,%r10d
@@ -139,8 +139,8 @@ _x86_64_AES_encrypt:
movl 2(%r14,%rdi,8),%edi
movl 16+0(%r15),%eax
- andl $4278190080,%esi
- andl $4278190080,%edi
+ andl $0xff000000,%esi
+ andl $0xff000000,%edi
xorl %esi,%r12d
xorl %edi,%r8d
@@ -242,8 +242,8 @@ _x86_64_AES_encrypt_compact:
xorl %r8d,%edx
cmpq 16(%rsp),%r15
je .Lenc_compact_done
- movl $2155905152,%r10d
- movl $2155905152,%r11d
+ movl $0x80808080,%r10d
+ movl $0x80808080,%r11d
andl %eax,%r10d
andl %ebx,%r11d
movl %r10d,%esi
@@ -254,10 +254,10 @@ _x86_64_AES_encrypt_compact:
leal (%rbx,%rbx,1),%r9d
subl %r10d,%esi
subl %r11d,%edi
- andl $4278124286,%r8d
- andl $4278124286,%r9d
- andl $454761243,%esi
- andl $454761243,%edi
+ andl $0xfefefefe,%r8d
+ andl $0xfefefefe,%r9d
+ andl $0x1b1b1b1b,%esi
+ andl $0x1b1b1b1b,%edi
movl %eax,%r10d
movl %ebx,%r11d
xorl %esi,%r8d
@@ -265,9 +265,9 @@ _x86_64_AES_encrypt_compact:
xorl %r8d,%eax
xorl %r9d,%ebx
- movl $2155905152,%r12d
+ movl $0x80808080,%r12d
roll $24,%eax
- movl $2155905152,%ebp
+ movl $0x80808080,%ebp
roll $24,%ebx
andl %ecx,%r12d
andl %edx,%ebp
@@ -290,10 +290,10 @@ _x86_64_AES_encrypt_compact:
xorl %r10d,%eax
xorl %r11d,%ebx
- andl $4278124286,%r8d
- andl $4278124286,%r9d
- andl $454761243,%esi
- andl $454761243,%edi
+ andl $0xfefefefe,%r8d
+ andl $0xfefefefe,%r9d
+ andl $0x1b1b1b1b,%esi
+ andl $0x1b1b1b1b,%edi
movl %ecx,%r12d
movl %edx,%ebp
xorl %esi,%r8d
@@ -346,7 +346,7 @@ AES_encrypt:
andq $-64,%rsp
subq %rsp,%rcx
negq %rcx
- andq $960,%rcx
+ andq $0x3c0,%rcx
subq %rcx,%rsp
subq $32,%rsp
@@ -371,7 +371,7 @@ AES_encrypt:
leaq .LAES_Te+2048(%rip),%r14
leaq 768(%rsp),%rbp
subq %r14,%rbp
- andq $768,%rbp
+ andq $0x300,%rbp
leaq (%r14,%rbp,1),%r14
call _x86_64_AES_encrypt_compact
@@ -793,7 +793,7 @@ AES_decrypt:
andq $-64,%rsp
subq %rsp,%rcx
negq %rcx
- andq $960,%rcx
+ andq $0x3c0,%rcx
subq %rcx,%rsp
subq $32,%rsp
@@ -818,7 +818,7 @@ AES_decrypt:
leaq .LAES_Td+2048(%rip),%r14
leaq 768(%rsp),%rbp
subq %r14,%rbp
- andq $768,%rbp
+ andq $0x300,%rbp
leaq (%r14,%rbp,1),%r14
shrq $3,%rbp
addq %rbp,%r14
@@ -1334,9 +1334,9 @@ AES_cbc_encrypt:
movq %r14,%r10
leaq 2304(%r14),%r11
movq %r15,%r12
- andq $4095,%r10
- andq $4095,%r11
- andq $4095,%r12
+ andq $0xFFF,%r10
+ andq $0xFFF,%r11
+ andq $0xFFF,%r12
cmpq %r11,%r12
jb .Lcbc_te_break_out
@@ -1345,7 +1345,7 @@ AES_cbc_encrypt:
jmp .Lcbc_te_ok
.Lcbc_te_break_out:
subq %r10,%r12
- andq $4095,%r12
+ andq $0xFFF,%r12
addq $320,%r12
subq %r12,%r15
.align 4
@@ -1371,7 +1371,7 @@ AES_cbc_encrypt:
movq %r15,%r10
subq %r14,%r10
- andq $4095,%r10
+ andq $0xfff,%r10
cmpq $2304,%r10
jb .Lcbc_do_ecopy
cmpq $4096-248,%r10
@@ -1558,7 +1558,7 @@ AES_cbc_encrypt:
leaq -88-63(%rcx),%r10
subq %rbp,%r10
negq %r10
- andq $960,%r10
+ andq $0x3c0,%r10
subq %r10,%rbp
xchgq %rsp,%rbp
@@ -1587,7 +1587,7 @@ AES_cbc_encrypt:
leaq 2048(%r14),%r14
leaq 768-8(%rsp),%rax
subq %r14,%rax
- andq $768,%rax
+ andq $0x300,%rax
leaq (%r14,%rax,1),%r14
cmpq $0,%rbx
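
Note: the hunks above, like most hunks in this commit, change only the base in
which the assembler immediates are written; every value is identical before and
after. A small standalone C check, not part of the patch, confirming a few of
the decimal/hex pairs:

    #include <assert.h>

    int main(void)
    {
        assert(65280u      == 0x0000ff00u);   /* AES T-table byte-1 mask     */
        assert(16711680u   == 0x00ff0000u);   /* AES T-table byte-2 mask     */
        assert(4278190080u == 0xff000000u);   /* AES T-table byte-3 mask     */
        assert(2155905152u == 0x80808080u);   /* MSB of each byte (xtime)    */
        assert(4278124286u == 0xfefefefeu);   /* shifted-byte mask (xtime)   */
        assert(454761243u  == 0x1b1b1b1bu);   /* AES reduction poly 0x1b     */
        assert(960 == 0x3c0 && 768 == 0x300); /* stack/table offset masks    */
        assert(4095 == 0xfff);                /* page-offset mask            */
        return 0;
    }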
diff --git a/secure/lib/libcrypto/amd64/aesni-sha1-x86_64.S b/secure/lib/libcrypto/amd64/aesni-sha1-x86_64.S
index fa16434..45a5e3b 100644
--- a/secure/lib/libcrypto/amd64/aesni-sha1-x86_64.S
+++ b/secure/lib/libcrypto/amd64/aesni-sha1-x86_64.S
@@ -1393,8 +1393,8 @@ aesni_cbc_sha1_enc_shaext:
movups 16(%rcx),%xmm0
leaq 112(%rcx),%rcx
- pshufd $27,%xmm8,%xmm8
- pshufd $27,%xmm9,%xmm9
+ pshufd $0b00011011,%xmm8,%xmm8
+ pshufd $0b00011011,%xmm9,%xmm9
jmp .Loop_shaext
.align 16
@@ -1673,8 +1673,8 @@ aesni_cbc_sha1_enc_shaext:
leaq 64(%rdi),%rdi
jnz .Loop_shaext
- pshufd $27,%xmm8,%xmm8
- pshufd $27,%xmm9,%xmm9
+ pshufd $0b00011011,%xmm8,%xmm8
+ pshufd $0b00011011,%xmm9,%xmm9
movups %xmm2,(%r8)
movdqu %xmm8,(%r9)
movd %xmm9,16(%r9)
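
The pshufd/shufps control byte packs four 2-bit source-lane selectors, so the
$27 removed above and the $0b00011011 (= 0x1b) added in its place are the same
immediate. A hedged intrinsics sketch, not part of the patch, showing that this
immediate reverses the four dwords of a register (which is why the SHA code
uses it to reorder the hash state):

    #include <emmintrin.h>
    #include <stdio.h>

    int main(void)
    {
        __m128i v = _mm_setr_epi32(0, 1, 2, 3);
        /* 27 == 0b00011011: dest lane j = src lane ((27 >> 2*j) & 3),
         * i.e. dest = (src3, src2, src1, src0) */
        __m128i r = _mm_shuffle_epi32(v, 27);
        int out[4];
        _mm_storeu_si128((__m128i *)out, r);
        printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]); /* 3 2 1 0 */
        return 0;
    }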
diff --git a/secure/lib/libcrypto/amd64/aesni-x86_64.S b/secure/lib/libcrypto/amd64/aesni-x86_64.S
index 082a306..5be8616 100644
--- a/secure/lib/libcrypto/amd64/aesni-x86_64.S
+++ b/secure/lib/libcrypto/amd64/aesni-x86_64.S
@@ -504,7 +504,7 @@ aesni_ecb_encrypt:
testl %r8d,%r8d
jz .Lecb_decrypt
- cmpq $128,%rdx
+ cmpq $0x80,%rdx
jb .Lecb_enc_tail
movdqu (%rdi),%xmm2
@@ -516,7 +516,7 @@ aesni_ecb_encrypt:
movdqu 96(%rdi),%xmm8
movdqu 112(%rdi),%xmm9
leaq 128(%rdi),%rdi
- subq $128,%rdx
+ subq $0x80,%rdx
jmp .Lecb_enc_loop8_enter
.align 16
.Lecb_enc_loop8:
@@ -544,7 +544,7 @@ aesni_ecb_encrypt:
call _aesni_encrypt8
- subq $128,%rdx
+ subq $0x80,%rdx
jnc .Lecb_enc_loop8
movups %xmm2,(%rsi)
@@ -558,22 +558,22 @@ aesni_ecb_encrypt:
movups %xmm8,96(%rsi)
movups %xmm9,112(%rsi)
leaq 128(%rsi),%rsi
- addq $128,%rdx
+ addq $0x80,%rdx
jz .Lecb_ret
.Lecb_enc_tail:
movups (%rdi),%xmm2
- cmpq $32,%rdx
+ cmpq $0x20,%rdx
jb .Lecb_enc_one
movups 16(%rdi),%xmm3
je .Lecb_enc_two
movups 32(%rdi),%xmm4
- cmpq $64,%rdx
+ cmpq $0x40,%rdx
jb .Lecb_enc_three
movups 48(%rdi),%xmm5
je .Lecb_enc_four
movups 64(%rdi),%xmm6
- cmpq $96,%rdx
+ cmpq $0x60,%rdx
jb .Lecb_enc_five
movups 80(%rdi),%xmm7
je .Lecb_enc_six
@@ -647,7 +647,7 @@ aesni_ecb_encrypt:
.align 16
.Lecb_decrypt:
- cmpq $128,%rdx
+ cmpq $0x80,%rdx
jb .Lecb_dec_tail
movdqu (%rdi),%xmm2
@@ -659,7 +659,7 @@ aesni_ecb_encrypt:
movdqu 96(%rdi),%xmm8
movdqu 112(%rdi),%xmm9
leaq 128(%rdi),%rdi
- subq $128,%rdx
+ subq $0x80,%rdx
jmp .Lecb_dec_loop8_enter
.align 16
.Lecb_dec_loop8:
@@ -688,7 +688,7 @@ aesni_ecb_encrypt:
call _aesni_decrypt8
movups (%r11),%xmm0
- subq $128,%rdx
+ subq $0x80,%rdx
jnc .Lecb_dec_loop8
movups %xmm2,(%rsi)
@@ -710,22 +710,22 @@ aesni_ecb_encrypt:
movups %xmm9,112(%rsi)
pxor %xmm9,%xmm9
leaq 128(%rsi),%rsi
- addq $128,%rdx
+ addq $0x80,%rdx
jz .Lecb_ret
.Lecb_dec_tail:
movups (%rdi),%xmm2
- cmpq $32,%rdx
+ cmpq $0x20,%rdx
jb .Lecb_dec_one
movups 16(%rdi),%xmm3
je .Lecb_dec_two
movups 32(%rdi),%xmm4
- cmpq $64,%rdx
+ cmpq $0x40,%rdx
jb .Lecb_dec_three
movups 48(%rdi),%xmm5
je .Lecb_dec_four
movups 64(%rdi),%xmm6
- cmpq $96,%rdx
+ cmpq $0x60,%rdx
jb .Lecb_dec_five
movups 80(%rdi),%xmm7
je .Lecb_dec_six
@@ -1599,7 +1599,7 @@ aesni_xts_encrypt:
movdqa .Lxts_magic(%rip),%xmm8
movdqa %xmm2,%xmm15
- pshufd $95,%xmm2,%xmm9
+ pshufd $0x5f,%xmm2,%xmm9
pxor %xmm0,%xmm1
movdqa %xmm9,%xmm14
paddd %xmm9,%xmm9
@@ -1698,7 +1698,7 @@ aesni_xts_encrypt:
.byte 102,15,56,220,248
movups 64(%r11),%xmm0
movdqa %xmm8,80(%rsp)
- pshufd $95,%xmm15,%xmm9
+ pshufd $0x5f,%xmm15,%xmm9
jmp .Lxts_enc_loop6
.align 32
.Lxts_enc_loop6:
@@ -1837,13 +1837,13 @@ aesni_xts_encrypt:
jz .Lxts_enc_done
pxor %xmm0,%xmm11
- cmpq $32,%rdx
+ cmpq $0x20,%rdx
jb .Lxts_enc_one
pxor %xmm0,%xmm12
je .Lxts_enc_two
pxor %xmm0,%xmm13
- cmpq $64,%rdx
+ cmpq $0x40,%rdx
jb .Lxts_enc_three
pxor %xmm0,%xmm14
je .Lxts_enc_four
@@ -2070,7 +2070,7 @@ aesni_xts_decrypt:
movdqa .Lxts_magic(%rip),%xmm8
movdqa %xmm2,%xmm15
- pshufd $95,%xmm2,%xmm9
+ pshufd $0x5f,%xmm2,%xmm9
pxor %xmm0,%xmm1
movdqa %xmm9,%xmm14
paddd %xmm9,%xmm9
@@ -2169,7 +2169,7 @@ aesni_xts_decrypt:
.byte 102,15,56,222,248
movups 64(%r11),%xmm0
movdqa %xmm8,80(%rsp)
- pshufd $95,%xmm15,%xmm9
+ pshufd $0x5f,%xmm15,%xmm9
jmp .Lxts_dec_loop6
.align 32
.Lxts_dec_loop6:
@@ -2309,13 +2309,13 @@ aesni_xts_decrypt:
jz .Lxts_dec_done
pxor %xmm0,%xmm12
- cmpq $32,%rdx
+ cmpq $0x20,%rdx
jb .Lxts_dec_one
pxor %xmm0,%xmm13
je .Lxts_dec_two
pxor %xmm0,%xmm14
- cmpq $64,%rdx
+ cmpq $0x40,%rdx
jb .Lxts_dec_three
je .Lxts_dec_four
@@ -2346,7 +2346,7 @@ aesni_xts_decrypt:
pcmpgtd %xmm15,%xmm14
movdqu %xmm6,64(%rsi)
leaq 80(%rsi),%rsi
- pshufd $19,%xmm14,%xmm11
+ pshufd $0x13,%xmm14,%xmm11
andq $15,%r9
jz .Lxts_dec_ret
@@ -2635,7 +2635,7 @@ aesni_cbc_encrypt:
leaq -8(%rax),%rbp
movups (%r8),%xmm10
movl %r10d,%eax
- cmpq $80,%rdx
+ cmpq $0x50,%rdx
jbe .Lcbc_dec_tail
movups (%rcx),%xmm0
@@ -2651,14 +2651,14 @@ aesni_cbc_encrypt:
movdqu 80(%rdi),%xmm7
movdqa %xmm6,%xmm15
movl OPENSSL_ia32cap_P+4(%rip),%r9d
- cmpq $112,%rdx
+ cmpq $0x70,%rdx
jbe .Lcbc_dec_six_or_seven
andl $71303168,%r9d
- subq $80,%rdx
+ subq $0x50,%rdx
cmpl $4194304,%r9d
je .Lcbc_dec_loop6_enter
- subq $32,%rdx
+ subq $0x20,%rdx
leaq 112(%rcx),%rcx
jmp .Lcbc_dec_loop8_enter
.align 16
@@ -2673,7 +2673,7 @@ aesni_cbc_encrypt:
movups 16-112(%rcx),%xmm1
pxor %xmm0,%xmm4
xorq %r11,%r11
- cmpq $112,%rdx
+ cmpq $0x70,%rdx
pxor %xmm0,%xmm5
pxor %xmm0,%xmm6
pxor %xmm0,%xmm7
@@ -2858,21 +2858,21 @@ aesni_cbc_encrypt:
movups %xmm8,96(%rsi)
leaq 112(%rsi),%rsi
- subq $128,%rdx
+ subq $0x80,%rdx
ja .Lcbc_dec_loop8
movaps %xmm9,%xmm2
leaq -112(%rcx),%rcx
- addq $112,%rdx
+ addq $0x70,%rdx
jle .Lcbc_dec_clear_tail_collected
movups %xmm9,(%rsi)
leaq 16(%rsi),%rsi
- cmpq $80,%rdx
+ cmpq $0x50,%rdx
jbe .Lcbc_dec_tail
movaps %xmm11,%xmm2
.Lcbc_dec_six_or_seven:
- cmpq $96,%rdx
+ cmpq $0x60,%rdx
ja .Lcbc_dec_seven
movaps %xmm7,%xmm8
@@ -2965,33 +2965,33 @@ aesni_cbc_encrypt:
movl %r10d,%eax
movdqu %xmm6,64(%rsi)
leaq 80(%rsi),%rsi
- subq $96,%rdx
+ subq $0x60,%rdx
ja .Lcbc_dec_loop6
movdqa %xmm7,%xmm2
- addq $80,%rdx
+ addq $0x50,%rdx
jle .Lcbc_dec_clear_tail_collected
movups %xmm7,(%rsi)
leaq 16(%rsi),%rsi
.Lcbc_dec_tail:
movups (%rdi),%xmm2
- subq $16,%rdx
+ subq $0x10,%rdx
jbe .Lcbc_dec_one
movups 16(%rdi),%xmm3
movaps %xmm2,%xmm11
- subq $16,%rdx
+ subq $0x10,%rdx
jbe .Lcbc_dec_two
movups 32(%rdi),%xmm4
movaps %xmm3,%xmm12
- subq $16,%rdx
+ subq $0x10,%rdx
jbe .Lcbc_dec_three
movups 48(%rdi),%xmm5
movaps %xmm4,%xmm13
- subq $16,%rdx
+ subq $0x10,%rdx
jbe .Lcbc_dec_four
movups 64(%rdi),%xmm6
@@ -3016,7 +3016,7 @@ aesni_cbc_encrypt:
movdqa %xmm6,%xmm2
pxor %xmm6,%xmm6
pxor %xmm7,%xmm7
- subq $16,%rdx
+ subq $0x10,%rdx
jmp .Lcbc_dec_tail_collected
.align 16
@@ -3333,7 +3333,7 @@ __aesni_set_encrypt_key:
pslldq $4,%xmm0
pxor %xmm3,%xmm0
- pshufd $255,%xmm0,%xmm3
+ pshufd $0xff,%xmm0,%xmm3
pxor %xmm1,%xmm3
pslldq $4,%xmm1
pxor %xmm1,%xmm3
@@ -3420,7 +3420,7 @@ __aesni_set_encrypt_key:
decl %r10d
jz .Ldone_key256
- pshufd $255,%xmm0,%xmm2
+ pshufd $0xff,%xmm0,%xmm2
pxor %xmm3,%xmm3
.byte 102,15,56,221,211
@@ -3463,11 +3463,11 @@ __aesni_set_encrypt_key:
movups %xmm0,(%rax)
leaq 16(%rax),%rax
.Lkey_expansion_128_cold:
- shufps $16,%xmm0,%xmm4
+ shufps $0b00010000,%xmm0,%xmm4
xorps %xmm4,%xmm0
- shufps $140,%xmm0,%xmm4
+ shufps $0b10001100,%xmm0,%xmm4
xorps %xmm4,%xmm0
- shufps $255,%xmm1,%xmm1
+ shufps $0b11111111,%xmm1,%xmm1
xorps %xmm1,%xmm0
.byte 0xf3,0xc3
@@ -3478,25 +3478,25 @@ __aesni_set_encrypt_key:
.Lkey_expansion_192a_cold:
movaps %xmm2,%xmm5
.Lkey_expansion_192b_warm:
- shufps $16,%xmm0,%xmm4
+ shufps $0b00010000,%xmm0,%xmm4
movdqa %xmm2,%xmm3
xorps %xmm4,%xmm0
- shufps $140,%xmm0,%xmm4
+ shufps $0b10001100,%xmm0,%xmm4
pslldq $4,%xmm3
xorps %xmm4,%xmm0
- pshufd $85,%xmm1,%xmm1
+ pshufd $0b01010101,%xmm1,%xmm1
pxor %xmm3,%xmm2
pxor %xmm1,%xmm0
- pshufd $255,%xmm0,%xmm3
+ pshufd $0b11111111,%xmm0,%xmm3
pxor %xmm3,%xmm2
.byte 0xf3,0xc3
.align 16
.Lkey_expansion_192b:
movaps %xmm0,%xmm3
- shufps $68,%xmm0,%xmm5
+ shufps $0b01000100,%xmm0,%xmm5
movups %xmm5,(%rax)
- shufps $78,%xmm2,%xmm3
+ shufps $0b01001110,%xmm2,%xmm3
movups %xmm3,16(%rax)
leaq 32(%rax),%rax
jmp .Lkey_expansion_192b_warm
@@ -3506,11 +3506,11 @@ __aesni_set_encrypt_key:
movups %xmm2,(%rax)
leaq 16(%rax),%rax
.Lkey_expansion_256a_cold:
- shufps $16,%xmm0,%xmm4
+ shufps $0b00010000,%xmm0,%xmm4
xorps %xmm4,%xmm0
- shufps $140,%xmm0,%xmm4
+ shufps $0b10001100,%xmm0,%xmm4
xorps %xmm4,%xmm0
- shufps $255,%xmm1,%xmm1
+ shufps $0b11111111,%xmm1,%xmm1
xorps %xmm1,%xmm0
.byte 0xf3,0xc3
@@ -3519,11 +3519,11 @@ __aesni_set_encrypt_key:
movups %xmm0,(%rax)
leaq 16(%rax),%rax
- shufps $16,%xmm2,%xmm4
+ shufps $0b00010000,%xmm2,%xmm4
xorps %xmm4,%xmm2
- shufps $140,%xmm2,%xmm4
+ shufps $0b10001100,%xmm2,%xmm4
xorps %xmm4,%xmm2
- shufps $170,%xmm1,%xmm1
+ shufps $0b10101010,%xmm1,%xmm1
xorps %xmm1,%xmm2
.byte 0xf3,0xc3
.size aesni_set_encrypt_key,.-aesni_set_encrypt_key
diff --git a/secure/lib/libcrypto/amd64/bsaes-x86_64.S b/secure/lib/libcrypto/amd64/bsaes-x86_64.S
index be410de..d39a81c 100644
--- a/secure/lib/libcrypto/amd64/bsaes-x86_64.S
+++ b/secure/lib/libcrypto/amd64/bsaes-x86_64.S
@@ -325,45 +325,45 @@ _bsaes_encrypt8_bitslice:
pxor %xmm2,%xmm5
decl %r10d
jl .Lenc_done
- pshufd $147,%xmm15,%xmm7
- pshufd $147,%xmm0,%xmm8
+ pshufd $0x93,%xmm15,%xmm7
+ pshufd $0x93,%xmm0,%xmm8
pxor %xmm7,%xmm15
- pshufd $147,%xmm3,%xmm9
+ pshufd $0x93,%xmm3,%xmm9
pxor %xmm8,%xmm0
- pshufd $147,%xmm5,%xmm10
+ pshufd $0x93,%xmm5,%xmm10
pxor %xmm9,%xmm3
- pshufd $147,%xmm2,%xmm11
+ pshufd $0x93,%xmm2,%xmm11
pxor %xmm10,%xmm5
- pshufd $147,%xmm6,%xmm12
+ pshufd $0x93,%xmm6,%xmm12
pxor %xmm11,%xmm2
- pshufd $147,%xmm1,%xmm13
+ pshufd $0x93,%xmm1,%xmm13
pxor %xmm12,%xmm6
- pshufd $147,%xmm4,%xmm14
+ pshufd $0x93,%xmm4,%xmm14
pxor %xmm13,%xmm1
pxor %xmm14,%xmm4
pxor %xmm15,%xmm8
pxor %xmm4,%xmm7
pxor %xmm4,%xmm8
- pshufd $78,%xmm15,%xmm15
+ pshufd $0x4E,%xmm15,%xmm15
pxor %xmm0,%xmm9
- pshufd $78,%xmm0,%xmm0
+ pshufd $0x4E,%xmm0,%xmm0
pxor %xmm2,%xmm12
pxor %xmm7,%xmm15
pxor %xmm6,%xmm13
pxor %xmm8,%xmm0
pxor %xmm5,%xmm11
- pshufd $78,%xmm2,%xmm7
+ pshufd $0x4E,%xmm2,%xmm7
pxor %xmm1,%xmm14
- pshufd $78,%xmm6,%xmm8
+ pshufd $0x4E,%xmm6,%xmm8
pxor %xmm3,%xmm10
- pshufd $78,%xmm5,%xmm2
+ pshufd $0x4E,%xmm5,%xmm2
pxor %xmm4,%xmm10
- pshufd $78,%xmm4,%xmm6
+ pshufd $0x4E,%xmm4,%xmm6
pxor %xmm4,%xmm11
- pshufd $78,%xmm1,%xmm5
+ pshufd $0x4E,%xmm1,%xmm5
pxor %xmm11,%xmm7
- pshufd $78,%xmm3,%xmm1
+ pshufd $0x4E,%xmm3,%xmm1
pxor %xmm12,%xmm8
pxor %xmm10,%xmm2
pxor %xmm14,%xmm6
@@ -797,24 +797,24 @@ _bsaes_decrypt8:
decl %r10d
jl .Ldec_done
- pshufd $78,%xmm15,%xmm7
- pshufd $78,%xmm2,%xmm13
+ pshufd $0x4E,%xmm15,%xmm7
+ pshufd $0x4E,%xmm2,%xmm13
pxor %xmm15,%xmm7
- pshufd $78,%xmm4,%xmm14
+ pshufd $0x4E,%xmm4,%xmm14
pxor %xmm2,%xmm13
- pshufd $78,%xmm0,%xmm8
+ pshufd $0x4E,%xmm0,%xmm8
pxor %xmm4,%xmm14
- pshufd $78,%xmm5,%xmm9
+ pshufd $0x4E,%xmm5,%xmm9
pxor %xmm0,%xmm8
- pshufd $78,%xmm3,%xmm10
+ pshufd $0x4E,%xmm3,%xmm10
pxor %xmm5,%xmm9
pxor %xmm13,%xmm15
pxor %xmm13,%xmm0
- pshufd $78,%xmm1,%xmm11
+ pshufd $0x4E,%xmm1,%xmm11
pxor %xmm3,%xmm10
pxor %xmm7,%xmm5
pxor %xmm8,%xmm3
- pshufd $78,%xmm6,%xmm12
+ pshufd $0x4E,%xmm6,%xmm12
pxor %xmm1,%xmm11
pxor %xmm14,%xmm0
pxor %xmm9,%xmm1
@@ -828,45 +828,45 @@ _bsaes_decrypt8:
pxor %xmm14,%xmm1
pxor %xmm14,%xmm6
pxor %xmm12,%xmm4
- pshufd $147,%xmm15,%xmm7
- pshufd $147,%xmm0,%xmm8
+ pshufd $0x93,%xmm15,%xmm7
+ pshufd $0x93,%xmm0,%xmm8
pxor %xmm7,%xmm15
- pshufd $147,%xmm5,%xmm9
+ pshufd $0x93,%xmm5,%xmm9
pxor %xmm8,%xmm0
- pshufd $147,%xmm3,%xmm10
+ pshufd $0x93,%xmm3,%xmm10
pxor %xmm9,%xmm5
- pshufd $147,%xmm1,%xmm11
+ pshufd $0x93,%xmm1,%xmm11
pxor %xmm10,%xmm3
- pshufd $147,%xmm6,%xmm12
+ pshufd $0x93,%xmm6,%xmm12
pxor %xmm11,%xmm1
- pshufd $147,%xmm2,%xmm13
+ pshufd $0x93,%xmm2,%xmm13
pxor %xmm12,%xmm6
- pshufd $147,%xmm4,%xmm14
+ pshufd $0x93,%xmm4,%xmm14
pxor %xmm13,%xmm2
pxor %xmm14,%xmm4
pxor %xmm15,%xmm8
pxor %xmm4,%xmm7
pxor %xmm4,%xmm8
- pshufd $78,%xmm15,%xmm15
+ pshufd $0x4E,%xmm15,%xmm15
pxor %xmm0,%xmm9
- pshufd $78,%xmm0,%xmm0
+ pshufd $0x4E,%xmm0,%xmm0
pxor %xmm1,%xmm12
pxor %xmm7,%xmm15
pxor %xmm6,%xmm13
pxor %xmm8,%xmm0
pxor %xmm3,%xmm11
- pshufd $78,%xmm1,%xmm7
+ pshufd $0x4E,%xmm1,%xmm7
pxor %xmm2,%xmm14
- pshufd $78,%xmm6,%xmm8
+ pshufd $0x4E,%xmm6,%xmm8
pxor %xmm5,%xmm10
- pshufd $78,%xmm3,%xmm1
+ pshufd $0x4E,%xmm3,%xmm1
pxor %xmm4,%xmm10
- pshufd $78,%xmm4,%xmm6
+ pshufd $0x4E,%xmm4,%xmm6
pxor %xmm4,%xmm11
- pshufd $78,%xmm2,%xmm3
+ pshufd $0x4E,%xmm2,%xmm3
pxor %xmm11,%xmm7
- pshufd $78,%xmm5,%xmm2
+ pshufd $0x4E,%xmm5,%xmm2
pxor %xmm12,%xmm8
pxor %xmm1,%xmm10
pxor %xmm14,%xmm6
@@ -1553,20 +1553,20 @@ bsaes_xts_encrypt:
movdqa %xmm7,(%rax)
andq $-16,%r14
- subq $128,%rsp
+ subq $0x80,%rsp
movdqa 32(%rbp),%xmm6
pxor %xmm14,%xmm14
movdqa .Lxts_magic(%rip),%xmm12
pcmpgtd %xmm6,%xmm14
- subq $128,%r14
+ subq $0x80,%r14
jc .Lxts_enc_short
jmp .Lxts_enc_loop
.align 16
.Lxts_enc_loop:
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm15
movdqa %xmm6,0(%rsp)
@@ -1574,7 +1574,7 @@ bsaes_xts_encrypt:
pand %xmm12,%xmm13
pcmpgtd %xmm6,%xmm14
pxor %xmm13,%xmm6
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm0
movdqa %xmm6,16(%rsp)
@@ -1583,7 +1583,7 @@ bsaes_xts_encrypt:
pcmpgtd %xmm6,%xmm14
pxor %xmm13,%xmm6
movdqu 0(%r12),%xmm7
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm1
movdqa %xmm6,32(%rsp)
@@ -1593,7 +1593,7 @@ bsaes_xts_encrypt:
pxor %xmm13,%xmm6
movdqu 16(%r12),%xmm8
pxor %xmm7,%xmm15
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm2
movdqa %xmm6,48(%rsp)
@@ -1603,7 +1603,7 @@ bsaes_xts_encrypt:
pxor %xmm13,%xmm6
movdqu 32(%r12),%xmm9
pxor %xmm8,%xmm0
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm3
movdqa %xmm6,64(%rsp)
@@ -1613,7 +1613,7 @@ bsaes_xts_encrypt:
pxor %xmm13,%xmm6
movdqu 48(%r12),%xmm10
pxor %xmm9,%xmm1
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm4
movdqa %xmm6,80(%rsp)
@@ -1623,7 +1623,7 @@ bsaes_xts_encrypt:
pxor %xmm13,%xmm6
movdqu 64(%r12),%xmm11
pxor %xmm10,%xmm2
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm5
movdqa %xmm6,96(%rsp)
@@ -1667,20 +1667,20 @@ bsaes_xts_encrypt:
pxor %xmm14,%xmm14
movdqa .Lxts_magic(%rip),%xmm12
pcmpgtd %xmm6,%xmm14
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
paddq %xmm6,%xmm6
pand %xmm12,%xmm13
pcmpgtd %xmm6,%xmm14
pxor %xmm13,%xmm6
- subq $128,%r14
+ subq $0x80,%r14
jnc .Lxts_enc_loop
.Lxts_enc_short:
- addq $128,%r14
+ addq $0x80,%r14
jz .Lxts_enc_done
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm15
movdqa %xmm6,0(%rsp)
@@ -1688,7 +1688,7 @@ bsaes_xts_encrypt:
pand %xmm12,%xmm13
pcmpgtd %xmm6,%xmm14
pxor %xmm13,%xmm6
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm0
movdqa %xmm6,16(%rsp)
@@ -1699,7 +1699,7 @@ bsaes_xts_encrypt:
movdqu 0(%r12),%xmm7
cmpq $16,%r14
je .Lxts_enc_1
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm1
movdqa %xmm6,32(%rsp)
@@ -1711,7 +1711,7 @@ bsaes_xts_encrypt:
cmpq $32,%r14
je .Lxts_enc_2
pxor %xmm7,%xmm15
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm2
movdqa %xmm6,48(%rsp)
@@ -1723,7 +1723,7 @@ bsaes_xts_encrypt:
cmpq $48,%r14
je .Lxts_enc_3
pxor %xmm8,%xmm0
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm3
movdqa %xmm6,64(%rsp)
@@ -1735,7 +1735,7 @@ bsaes_xts_encrypt:
cmpq $64,%r14
je .Lxts_enc_4
pxor %xmm9,%xmm1
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm4
movdqa %xmm6,80(%rsp)
@@ -1747,7 +1747,7 @@ bsaes_xts_encrypt:
cmpq $80,%r14
je .Lxts_enc_5
pxor %xmm10,%xmm2
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm5
movdqa %xmm6,96(%rsp)
@@ -2012,20 +2012,20 @@ bsaes_xts_decrypt:
shlq $4,%rax
subq %rax,%r14
- subq $128,%rsp
+ subq $0x80,%rsp
movdqa 32(%rbp),%xmm6
pxor %xmm14,%xmm14
movdqa .Lxts_magic(%rip),%xmm12
pcmpgtd %xmm6,%xmm14
- subq $128,%r14
+ subq $0x80,%r14
jc .Lxts_dec_short
jmp .Lxts_dec_loop
.align 16
.Lxts_dec_loop:
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm15
movdqa %xmm6,0(%rsp)
@@ -2033,7 +2033,7 @@ bsaes_xts_decrypt:
pand %xmm12,%xmm13
pcmpgtd %xmm6,%xmm14
pxor %xmm13,%xmm6
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm0
movdqa %xmm6,16(%rsp)
@@ -2042,7 +2042,7 @@ bsaes_xts_decrypt:
pcmpgtd %xmm6,%xmm14
pxor %xmm13,%xmm6
movdqu 0(%r12),%xmm7
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm1
movdqa %xmm6,32(%rsp)
@@ -2052,7 +2052,7 @@ bsaes_xts_decrypt:
pxor %xmm13,%xmm6
movdqu 16(%r12),%xmm8
pxor %xmm7,%xmm15
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm2
movdqa %xmm6,48(%rsp)
@@ -2062,7 +2062,7 @@ bsaes_xts_decrypt:
pxor %xmm13,%xmm6
movdqu 32(%r12),%xmm9
pxor %xmm8,%xmm0
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm3
movdqa %xmm6,64(%rsp)
@@ -2072,7 +2072,7 @@ bsaes_xts_decrypt:
pxor %xmm13,%xmm6
movdqu 48(%r12),%xmm10
pxor %xmm9,%xmm1
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm4
movdqa %xmm6,80(%rsp)
@@ -2082,7 +2082,7 @@ bsaes_xts_decrypt:
pxor %xmm13,%xmm6
movdqu 64(%r12),%xmm11
pxor %xmm10,%xmm2
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm5
movdqa %xmm6,96(%rsp)
@@ -2126,20 +2126,20 @@ bsaes_xts_decrypt:
pxor %xmm14,%xmm14
movdqa .Lxts_magic(%rip),%xmm12
pcmpgtd %xmm6,%xmm14
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
paddq %xmm6,%xmm6
pand %xmm12,%xmm13
pcmpgtd %xmm6,%xmm14
pxor %xmm13,%xmm6
- subq $128,%r14
+ subq $0x80,%r14
jnc .Lxts_dec_loop
.Lxts_dec_short:
- addq $128,%r14
+ addq $0x80,%r14
jz .Lxts_dec_done
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm15
movdqa %xmm6,0(%rsp)
@@ -2147,7 +2147,7 @@ bsaes_xts_decrypt:
pand %xmm12,%xmm13
pcmpgtd %xmm6,%xmm14
pxor %xmm13,%xmm6
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm0
movdqa %xmm6,16(%rsp)
@@ -2158,7 +2158,7 @@ bsaes_xts_decrypt:
movdqu 0(%r12),%xmm7
cmpq $16,%r14
je .Lxts_dec_1
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm1
movdqa %xmm6,32(%rsp)
@@ -2170,7 +2170,7 @@ bsaes_xts_decrypt:
cmpq $32,%r14
je .Lxts_dec_2
pxor %xmm7,%xmm15
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm2
movdqa %xmm6,48(%rsp)
@@ -2182,7 +2182,7 @@ bsaes_xts_decrypt:
cmpq $48,%r14
je .Lxts_dec_3
pxor %xmm8,%xmm0
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm3
movdqa %xmm6,64(%rsp)
@@ -2194,7 +2194,7 @@ bsaes_xts_decrypt:
cmpq $64,%r14
je .Lxts_dec_4
pxor %xmm9,%xmm1
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm4
movdqa %xmm6,80(%rsp)
@@ -2206,7 +2206,7 @@ bsaes_xts_decrypt:
cmpq $80,%r14
je .Lxts_dec_5
pxor %xmm10,%xmm2
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm5
movdqa %xmm6,96(%rsp)
@@ -2383,7 +2383,7 @@ bsaes_xts_decrypt:
pxor %xmm14,%xmm14
movdqa .Lxts_magic(%rip),%xmm12
pcmpgtd %xmm6,%xmm14
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
movdqa %xmm6,%xmm5
paddq %xmm6,%xmm6
pand %xmm12,%xmm13
diff --git a/secure/lib/libcrypto/amd64/cmll-x86_64.S b/secure/lib/libcrypto/amd64/cmll-x86_64.S
index ecd33f1..81d2445 100644
--- a/secure/lib/libcrypto/amd64/cmll-x86_64.S
+++ b/secure/lib/libcrypto/amd64/cmll-x86_64.S
@@ -1625,7 +1625,7 @@ Camellia_cbc_encrypt:
leaq -64-63(%rcx),%r10
subq %rsp,%r10
negq %r10
- andq $960,%r10
+ andq $0x3C0,%r10
subq %r10,%rsp
diff --git a/secure/lib/libcrypto/amd64/ecp_nistz256-x86_64.S b/secure/lib/libcrypto/amd64/ecp_nistz256-x86_64.S
index c5875d7..5d7d5e2 100644
--- a/secure/lib/libcrypto/amd64/ecp_nistz256-x86_64.S
+++ b/secure/lib/libcrypto/amd64/ecp_nistz256-x86_64.S
@@ -1122,6 +1122,7 @@ ecp_nistz256_point_double:
pushq %r15
subq $160+8,%rsp
+.Lpoint_double_shortcutq:
movdqu 0(%rsi),%xmm0
movq %rsi,%rbx
movdqu 16(%rsi),%xmm1
@@ -1342,7 +1343,7 @@ ecp_nistz256_point_add:
por %xmm1,%xmm3
movdqu 0(%rsi),%xmm0
- pshufd $177,%xmm3,%xmm5
+ pshufd $0xb1,%xmm3,%xmm5
movdqu 16(%rsi),%xmm1
movdqu 32(%rsi),%xmm2
por %xmm3,%xmm5
@@ -1352,7 +1353,7 @@ ecp_nistz256_point_add:
movq 64+16(%rsi),%r15
movq 64+24(%rsi),%r8
movdqa %xmm0,480(%rsp)
- pshufd $30,%xmm5,%xmm4
+ pshufd $0x1e,%xmm5,%xmm4
movdqa %xmm1,480+16(%rsp)
por %xmm0,%xmm1
.byte 102,72,15,110,199
@@ -1372,10 +1373,10 @@ ecp_nistz256_point_add:
call __ecp_nistz256_sqr_montq
pcmpeqd %xmm4,%xmm5
- pshufd $177,%xmm3,%xmm4
+ pshufd $0xb1,%xmm3,%xmm4
por %xmm3,%xmm4
pshufd $0,%xmm5,%xmm5
- pshufd $30,%xmm4,%xmm3
+ pshufd $0x1e,%xmm4,%xmm3
por %xmm3,%xmm4
pxor %xmm3,%xmm3
pcmpeqd %xmm3,%xmm4
@@ -1384,6 +1385,7 @@ ecp_nistz256_point_add:
movq 64+8(%rbx),%r14
movq 64+16(%rbx),%r15
movq 64+24(%rbx),%r8
+.byte 102,72,15,110,203
leaq 64-0(%rbx),%rsi
leaq 32(%rsp),%rdi
@@ -1475,7 +1477,7 @@ ecp_nistz256_point_add:
testq %r8,%r8
jnz .Ladd_proceedq
testq %r9,%r9
- jz .Ladd_proceedq
+ jz .Ladd_doubleq
.byte 102,72,15,126,199
pxor %xmm0,%xmm0
@@ -1488,6 +1490,13 @@ ecp_nistz256_point_add:
jmp .Ladd_doneq
.align 32
+.Ladd_doubleq:
+.byte 102,72,15,126,206
+.byte 102,72,15,126,199
+ addq $416,%rsp
+ jmp .Lpoint_double_shortcutq
+
+.align 32
.Ladd_proceedq:
movq 0+64(%rsp),%rax
movq 8+64(%rsp),%r14
@@ -1734,13 +1743,13 @@ ecp_nistz256_point_add_affine:
por %xmm1,%xmm3
movdqu 0(%rbx),%xmm0
- pshufd $177,%xmm3,%xmm5
+ pshufd $0xb1,%xmm3,%xmm5
movdqu 16(%rbx),%xmm1
movdqu 32(%rbx),%xmm2
por %xmm3,%xmm5
movdqu 48(%rbx),%xmm3
movdqa %xmm0,416(%rsp)
- pshufd $30,%xmm5,%xmm4
+ pshufd $0x1e,%xmm5,%xmm4
movdqa %xmm1,416+16(%rsp)
por %xmm0,%xmm1
.byte 102,72,15,110,199
@@ -1756,13 +1765,13 @@ ecp_nistz256_point_add_affine:
call __ecp_nistz256_sqr_montq
pcmpeqd %xmm4,%xmm5
- pshufd $177,%xmm3,%xmm4
+ pshufd $0xb1,%xmm3,%xmm4
movq 0(%rbx),%rax
movq %r12,%r9
por %xmm3,%xmm4
pshufd $0,%xmm5,%xmm5
- pshufd $30,%xmm4,%xmm3
+ pshufd $0x1e,%xmm4,%xmm3
movq %r13,%r10
por %xmm3,%xmm4
pxor %xmm3,%xmm3
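
The ecp_nistz256 hunks above do more than re-base immediates: point_add used
to fall through to the generic path when both inputs were the same point, a
case where the addition formulas degenerate. The patch adds .Ladd_doubleq,
which re-dispatches that case to the doubling code through the new
.Lpoint_double_shortcutq label (the asm detects P == Q from intermediate
values rather than a direct comparison). A hedged C sketch of the new control
flow; P256_POINT and the helper names are illustrative stand-ins, not the real
ecp_nistz256 symbols:

    #include <stdint.h>

    typedef struct { uint64_t X[4], Y[4], Z[4]; } P256_POINT;

    /* illustrative stand-ins, not the real ecp_nistz256 helpers */
    int  is_same_point(const P256_POINT *a, const P256_POINT *b);
    void point_double(P256_POINT *r, const P256_POINT *a);
    void point_add_generic(P256_POINT *r, const P256_POINT *a,
                           const P256_POINT *b);

    void point_add(P256_POINT *r, const P256_POINT *a, const P256_POINT *b)
    {
        if (is_same_point(a, b)) {
            point_double(r, a);         /* new .Ladd_doubleq ->           */
            return;                     /* .Lpoint_double_shortcutq path  */
        }
        point_add_generic(r, a, b);     /* existing .Ladd_proceedq path   */
    }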
diff --git a/secure/lib/libcrypto/amd64/ghash-x86_64.S b/secure/lib/libcrypto/amd64/ghash-x86_64.S
index aa93c80..ef024bf 100644
--- a/secure/lib/libcrypto/amd64/ghash-x86_64.S
+++ b/secure/lib/libcrypto/amd64/ghash-x86_64.S
@@ -21,14 +21,14 @@ gcm_gmult_4bit:
movq $14,%rcx
movq 8(%rsi,%rax,1),%r8
movq (%rsi,%rax,1),%r9
- andb $240,%bl
+ andb $0xf0,%bl
movq %r8,%rdx
jmp .Loop1
.align 16
.Loop1:
shrq $4,%r8
- andq $15,%rdx
+ andq $0xf,%rdx
movq %r9,%r10
movb (%rdi,%rcx,1),%al
shrq $4,%r9
@@ -44,13 +44,13 @@ gcm_gmult_4bit:
js .Lbreak1
shrq $4,%r8
- andq $15,%rdx
+ andq $0xf,%rdx
movq %r9,%r10
shrq $4,%r9
xorq 8(%rsi,%rax,1),%r8
shlq $60,%r10
xorq (%rsi,%rax,1),%r9
- andb $240,%bl
+ andb $0xf0,%bl
xorq (%r11,%rdx,8),%r9
movq %r8,%rdx
xorq %r10,%r8
@@ -59,19 +59,19 @@ gcm_gmult_4bit:
.align 16
.Lbreak1:
shrq $4,%r8
- andq $15,%rdx
+ andq $0xf,%rdx
movq %r9,%r10
shrq $4,%r9
xorq 8(%rsi,%rax,1),%r8
shlq $60,%r10
xorq (%rsi,%rax,1),%r9
- andb $240,%bl
+ andb $0xf0,%bl
xorq (%r11,%rdx,8),%r9
movq %r8,%rdx
xorq %r10,%r8
shrq $4,%r8
- andq $15,%rdx
+ andq $0xf,%rdx
movq %r9,%r10
shrq $4,%r9
xorq 8(%rsi,%rbx,1),%r8
@@ -662,10 +662,10 @@ gcm_ghash_4bit:
gcm_init_clmul:
.L_init_clmul:
movdqu (%rsi),%xmm2
- pshufd $78,%xmm2,%xmm2
+ pshufd $0b01001110,%xmm2,%xmm2
- pshufd $255,%xmm2,%xmm4
+ pshufd $0b11111111,%xmm2,%xmm4
movdqa %xmm2,%xmm3
psllq $1,%xmm2
pxor %xmm5,%xmm5
@@ -679,11 +679,11 @@ gcm_init_clmul:
pxor %xmm5,%xmm2
- pshufd $78,%xmm2,%xmm6
+ pshufd $0b01001110,%xmm2,%xmm6
movdqa %xmm2,%xmm0
pxor %xmm2,%xmm6
movdqa %xmm0,%xmm1
- pshufd $78,%xmm0,%xmm3
+ pshufd $0b01001110,%xmm0,%xmm3
pxor %xmm0,%xmm3
.byte 102,15,58,68,194,0
.byte 102,15,58,68,202,17
@@ -719,8 +719,8 @@ gcm_init_clmul:
pxor %xmm4,%xmm0
psrlq $1,%xmm0
pxor %xmm1,%xmm0
- pshufd $78,%xmm2,%xmm3
- pshufd $78,%xmm0,%xmm4
+ pshufd $0b01001110,%xmm2,%xmm3
+ pshufd $0b01001110,%xmm0,%xmm4
pxor %xmm2,%xmm3
movdqu %xmm2,0(%rdi)
pxor %xmm0,%xmm4
@@ -728,7 +728,7 @@ gcm_init_clmul:
.byte 102,15,58,15,227,8
movdqu %xmm4,32(%rdi)
movdqa %xmm0,%xmm1
- pshufd $78,%xmm0,%xmm3
+ pshufd $0b01001110,%xmm0,%xmm3
pxor %xmm0,%xmm3
.byte 102,15,58,68,194,0
.byte 102,15,58,68,202,17
@@ -766,7 +766,7 @@ gcm_init_clmul:
pxor %xmm1,%xmm0
movdqa %xmm0,%xmm5
movdqa %xmm0,%xmm1
- pshufd $78,%xmm0,%xmm3
+ pshufd $0b01001110,%xmm0,%xmm3
pxor %xmm0,%xmm3
.byte 102,15,58,68,194,0
.byte 102,15,58,68,202,17
@@ -802,8 +802,8 @@ gcm_init_clmul:
pxor %xmm4,%xmm0
psrlq $1,%xmm0
pxor %xmm1,%xmm0
- pshufd $78,%xmm5,%xmm3
- pshufd $78,%xmm0,%xmm4
+ pshufd $0b01001110,%xmm5,%xmm3
+ pshufd $0b01001110,%xmm0,%xmm4
pxor %xmm5,%xmm3
movdqu %xmm5,48(%rdi)
pxor %xmm0,%xmm4
@@ -823,7 +823,7 @@ gcm_gmult_clmul:
movdqu 32(%rsi),%xmm4
.byte 102,15,56,0,197
movdqa %xmm0,%xmm1
- pshufd $78,%xmm0,%xmm3
+ pshufd $0b01001110,%xmm0,%xmm3
pxor %xmm0,%xmm3
.byte 102,15,58,68,194,0
.byte 102,15,58,68,202,17
@@ -875,20 +875,20 @@ gcm_ghash_clmul:
movdqu 32(%rsi),%xmm7
.byte 102,65,15,56,0,194
- subq $16,%rcx
+ subq $0x10,%rcx
jz .Lodd_tail
movdqu 16(%rsi),%xmm6
movl OPENSSL_ia32cap_P+4(%rip),%eax
- cmpq $48,%rcx
+ cmpq $0x30,%rcx
jb .Lskip4x
andl $71303168,%eax
cmpl $4194304,%eax
je .Lskip4x
- subq $48,%rcx
- movq $11547335547999543296,%rax
+ subq $0x30,%rcx
+ movq $0xA040608020C0E000,%rax
movdqu 48(%rsi),%xmm14
movdqu 64(%rsi),%xmm15
@@ -900,14 +900,14 @@ gcm_ghash_clmul:
.byte 102,65,15,56,0,218
.byte 102,69,15,56,0,218
movdqa %xmm3,%xmm5
- pshufd $78,%xmm3,%xmm4
+ pshufd $0b01001110,%xmm3,%xmm4
pxor %xmm3,%xmm4
.byte 102,15,58,68,218,0
.byte 102,15,58,68,234,17
.byte 102,15,58,68,231,0
movdqa %xmm11,%xmm13
- pshufd $78,%xmm11,%xmm12
+ pshufd $0b01001110,%xmm11,%xmm12
pxor %xmm11,%xmm12
.byte 102,68,15,58,68,222,0
.byte 102,68,15,58,68,238,17
@@ -922,12 +922,12 @@ gcm_ghash_clmul:
.byte 102,69,15,56,0,218
.byte 102,69,15,56,0,194
movdqa %xmm11,%xmm13
- pshufd $78,%xmm11,%xmm12
+ pshufd $0b01001110,%xmm11,%xmm12
pxor %xmm8,%xmm0
pxor %xmm11,%xmm12
.byte 102,69,15,58,68,222,0
movdqa %xmm0,%xmm1
- pshufd $78,%xmm0,%xmm8
+ pshufd $0b01001110,%xmm0,%xmm8
pxor %xmm0,%xmm8
.byte 102,69,15,58,68,238,17
.byte 102,68,15,58,68,231,0
@@ -935,7 +935,7 @@ gcm_ghash_clmul:
xorps %xmm13,%xmm5
leaq 64(%rdx),%rdx
- subq $64,%rcx
+ subq $0x40,%rcx
jc .Ltail4x
jmp .Lmod4_loop
@@ -950,14 +950,14 @@ gcm_ghash_clmul:
movdqu 32(%rdx),%xmm3
movdqa %xmm11,%xmm13
.byte 102,68,15,58,68,199,16
- pshufd $78,%xmm11,%xmm12
+ pshufd $0b01001110,%xmm11,%xmm12
xorps %xmm5,%xmm1
pxor %xmm11,%xmm12
.byte 102,65,15,56,0,218
movups 32(%rsi),%xmm7
xorps %xmm4,%xmm8
.byte 102,68,15,58,68,218,0
- pshufd $78,%xmm3,%xmm4
+ pshufd $0b01001110,%xmm3,%xmm4
pxor %xmm0,%xmm8
movdqa %xmm3,%xmm5
@@ -1001,7 +1001,7 @@ gcm_ghash_clmul:
movdqa %xmm11,%xmm13
pxor %xmm12,%xmm4
- pshufd $78,%xmm11,%xmm12
+ pshufd $0b01001110,%xmm11,%xmm12
pxor %xmm9,%xmm0
pxor %xmm8,%xmm1
pxor %xmm11,%xmm12
@@ -1011,14 +1011,14 @@ gcm_ghash_clmul:
movdqa %xmm0,%xmm1
.byte 102,69,15,58,68,238,17
xorps %xmm11,%xmm3
- pshufd $78,%xmm0,%xmm8
+ pshufd $0b01001110,%xmm0,%xmm8
pxor %xmm0,%xmm8
.byte 102,68,15,58,68,231,0
xorps %xmm13,%xmm5
leaq 64(%rdx),%rdx
- subq $64,%rcx
+ subq $0x40,%rcx
jnc .Lmod4_loop
.Ltail4x:
@@ -1062,10 +1062,10 @@ gcm_ghash_clmul:
pxor %xmm4,%xmm0
psrlq $1,%xmm0
pxor %xmm1,%xmm0
- addq $64,%rcx
+ addq $0x40,%rcx
jz .Ldone
movdqu 32(%rsi),%xmm7
- subq $16,%rcx
+ subq $0x10,%rcx
jz .Lodd_tail
.Lskip4x:
@@ -1080,7 +1080,7 @@ gcm_ghash_clmul:
pxor %xmm8,%xmm0
movdqa %xmm3,%xmm5
- pshufd $78,%xmm3,%xmm4
+ pshufd $0b01001110,%xmm3,%xmm4
pxor %xmm3,%xmm4
.byte 102,15,58,68,218,0
.byte 102,15,58,68,234,17
@@ -1088,7 +1088,7 @@ gcm_ghash_clmul:
leaq 32(%rdx),%rdx
nop
- subq $32,%rcx
+ subq $0x20,%rcx
jbe .Leven_tail
nop
jmp .Lmod_loop
@@ -1097,7 +1097,7 @@ gcm_ghash_clmul:
.Lmod_loop:
movdqa %xmm0,%xmm1
movdqa %xmm4,%xmm8
- pshufd $78,%xmm0,%xmm4
+ pshufd $0b01001110,%xmm0,%xmm4
pxor %xmm0,%xmm4
.byte 102,15,58,68,198,0
@@ -1135,7 +1135,7 @@ gcm_ghash_clmul:
pslldq $8,%xmm0
psrldq $8,%xmm8
pxor %xmm9,%xmm0
- pshufd $78,%xmm5,%xmm4
+ pshufd $0b01001110,%xmm5,%xmm4
pxor %xmm8,%xmm1
pxor %xmm5,%xmm4
@@ -1151,13 +1151,13 @@ gcm_ghash_clmul:
.byte 102,15,58,68,231,0
pxor %xmm1,%xmm0
- subq $32,%rcx
+ subq $0x20,%rcx
ja .Lmod_loop
.Leven_tail:
movdqa %xmm0,%xmm1
movdqa %xmm4,%xmm8
- pshufd $78,%xmm0,%xmm4
+ pshufd $0b01001110,%xmm0,%xmm4
pxor %xmm0,%xmm4
.byte 102,15,58,68,198,0
@@ -1205,7 +1205,7 @@ gcm_ghash_clmul:
.byte 102,69,15,56,0,194
pxor %xmm8,%xmm0
movdqa %xmm0,%xmm1
- pshufd $78,%xmm0,%xmm3
+ pshufd $0b01001110,%xmm0,%xmm3
pxor %xmm0,%xmm3
.byte 102,15,58,68,194,0
.byte 102,15,58,68,202,17
diff --git a/secure/lib/libcrypto/amd64/md5-x86_64.S b/secure/lib/libcrypto/amd64/md5-x86_64.S
index 94fb761..8f6e689 100644
--- a/secure/lib/libcrypto/amd64/md5-x86_64.S
+++ b/secure/lib/libcrypto/amd64/md5-x86_64.S
@@ -494,14 +494,14 @@ md5_block_asm_data_order:
movl %ecx,%r11d
addl %ecx,%ebx
movl 0(%rsi),%r10d
- movl $4294967295,%r11d
+ movl $0xffffffff,%r11d
xorl %edx,%r11d
leal -198630844(%rax,%r10,1),%eax
orl %ebx,%r11d
xorl %ecx,%r11d
addl %r11d,%eax
movl 28(%rsi),%r10d
- movl $4294967295,%r11d
+ movl $0xffffffff,%r11d
roll $6,%eax
xorl %ecx,%r11d
addl %ebx,%eax
@@ -510,7 +510,7 @@ md5_block_asm_data_order:
xorl %ebx,%r11d
addl %r11d,%edx
movl 56(%rsi),%r10d
- movl $4294967295,%r11d
+ movl $0xffffffff,%r11d
roll $10,%edx
xorl %ebx,%r11d
addl %eax,%edx
@@ -519,7 +519,7 @@ md5_block_asm_data_order:
xorl %eax,%r11d
addl %r11d,%ecx
movl 20(%rsi),%r10d
- movl $4294967295,%r11d
+ movl $0xffffffff,%r11d
roll $15,%ecx
xorl %eax,%r11d
addl %edx,%ecx
@@ -528,7 +528,7 @@ md5_block_asm_data_order:
xorl %edx,%r11d
addl %r11d,%ebx
movl 48(%rsi),%r10d
- movl $4294967295,%r11d
+ movl $0xffffffff,%r11d
roll $21,%ebx
xorl %edx,%r11d
addl %ecx,%ebx
@@ -537,7 +537,7 @@ md5_block_asm_data_order:
xorl %ecx,%r11d
addl %r11d,%eax
movl 12(%rsi),%r10d
- movl $4294967295,%r11d
+ movl $0xffffffff,%r11d
roll $6,%eax
xorl %ecx,%r11d
addl %ebx,%eax
@@ -546,7 +546,7 @@ md5_block_asm_data_order:
xorl %ebx,%r11d
addl %r11d,%edx
movl 40(%rsi),%r10d
- movl $4294967295,%r11d
+ movl $0xffffffff,%r11d
roll $10,%edx
xorl %ebx,%r11d
addl %eax,%edx
@@ -555,7 +555,7 @@ md5_block_asm_data_order:
xorl %eax,%r11d
addl %r11d,%ecx
movl 4(%rsi),%r10d
- movl $4294967295,%r11d
+ movl $0xffffffff,%r11d
roll $15,%ecx
xorl %eax,%r11d
addl %edx,%ecx
@@ -564,7 +564,7 @@ md5_block_asm_data_order:
xorl %edx,%r11d
addl %r11d,%ebx
movl 32(%rsi),%r10d
- movl $4294967295,%r11d
+ movl $0xffffffff,%r11d
roll $21,%ebx
xorl %edx,%r11d
addl %ecx,%ebx
@@ -573,7 +573,7 @@ md5_block_asm_data_order:
xorl %ecx,%r11d
addl %r11d,%eax
movl 60(%rsi),%r10d
- movl $4294967295,%r11d
+ movl $0xffffffff,%r11d
roll $6,%eax
xorl %ecx,%r11d
addl %ebx,%eax
@@ -582,7 +582,7 @@ md5_block_asm_data_order:
xorl %ebx,%r11d
addl %r11d,%edx
movl 24(%rsi),%r10d
- movl $4294967295,%r11d
+ movl $0xffffffff,%r11d
roll $10,%edx
xorl %ebx,%r11d
addl %eax,%edx
@@ -591,7 +591,7 @@ md5_block_asm_data_order:
xorl %eax,%r11d
addl %r11d,%ecx
movl 52(%rsi),%r10d
- movl $4294967295,%r11d
+ movl $0xffffffff,%r11d
roll $15,%ecx
xorl %eax,%r11d
addl %edx,%ecx
@@ -600,7 +600,7 @@ md5_block_asm_data_order:
xorl %edx,%r11d
addl %r11d,%ebx
movl 16(%rsi),%r10d
- movl $4294967295,%r11d
+ movl $0xffffffff,%r11d
roll $21,%ebx
xorl %edx,%r11d
addl %ecx,%ebx
@@ -609,7 +609,7 @@ md5_block_asm_data_order:
xorl %ecx,%r11d
addl %r11d,%eax
movl 44(%rsi),%r10d
- movl $4294967295,%r11d
+ movl $0xffffffff,%r11d
roll $6,%eax
xorl %ecx,%r11d
addl %ebx,%eax
@@ -618,7 +618,7 @@ md5_block_asm_data_order:
xorl %ebx,%r11d
addl %r11d,%edx
movl 8(%rsi),%r10d
- movl $4294967295,%r11d
+ movl $0xffffffff,%r11d
roll $10,%edx
xorl %ebx,%r11d
addl %eax,%edx
@@ -627,7 +627,7 @@ md5_block_asm_data_order:
xorl %eax,%r11d
addl %r11d,%ecx
movl 36(%rsi),%r10d
- movl $4294967295,%r11d
+ movl $0xffffffff,%r11d
roll $15,%ecx
xorl %eax,%r11d
addl %edx,%ecx
@@ -636,7 +636,7 @@ md5_block_asm_data_order:
xorl %edx,%r11d
addl %r11d,%ebx
movl 0(%rsi),%r10d
- movl $4294967295,%r11d
+ movl $0xffffffff,%r11d
roll $21,%ebx
xorl %edx,%r11d
addl %ecx,%ebx
diff --git a/secure/lib/libcrypto/amd64/rsaz-x86_64.S b/secure/lib/libcrypto/amd64/rsaz-x86_64.S
index efd229a..e2b0313 100644
--- a/secure/lib/libcrypto/amd64/rsaz-x86_64.S
+++ b/secure/lib/libcrypto/amd64/rsaz-x86_64.S
@@ -462,48 +462,94 @@ rsaz_512_mul_gather4:
pushq %r14
pushq %r15
- movl %r9d,%r9d
- subq $128+24,%rsp
+ subq $152,%rsp
.Lmul_gather4_body:
- movl 64(%rdx,%r9,4),%eax
-.byte 102,72,15,110,199
- movl (%rdx,%r9,4),%ebx
-.byte 102,72,15,110,201
+ movd %r9d,%xmm8
+ movdqa .Linc+16(%rip),%xmm1
+ movdqa .Linc(%rip),%xmm0
+
+ pshufd $0,%xmm8,%xmm8
+ movdqa %xmm1,%xmm7
+ movdqa %xmm1,%xmm2
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm8,%xmm0
+ movdqa %xmm7,%xmm3
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm8,%xmm1
+ movdqa %xmm7,%xmm4
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm8,%xmm2
+ movdqa %xmm7,%xmm5
+ paddd %xmm3,%xmm4
+ pcmpeqd %xmm8,%xmm3
+ movdqa %xmm7,%xmm6
+ paddd %xmm4,%xmm5
+ pcmpeqd %xmm8,%xmm4
+ paddd %xmm5,%xmm6
+ pcmpeqd %xmm8,%xmm5
+ paddd %xmm6,%xmm7
+ pcmpeqd %xmm8,%xmm6
+ pcmpeqd %xmm8,%xmm7
+
+ movdqa 0(%rdx),%xmm8
+ movdqa 16(%rdx),%xmm9
+ movdqa 32(%rdx),%xmm10
+ movdqa 48(%rdx),%xmm11
+ pand %xmm0,%xmm8
+ movdqa 64(%rdx),%xmm12
+ pand %xmm1,%xmm9
+ movdqa 80(%rdx),%xmm13
+ pand %xmm2,%xmm10
+ movdqa 96(%rdx),%xmm14
+ pand %xmm3,%xmm11
+ movdqa 112(%rdx),%xmm15
+ leaq 128(%rdx),%rbp
+ pand %xmm4,%xmm12
+ pand %xmm5,%xmm13
+ pand %xmm6,%xmm14
+ pand %xmm7,%xmm15
+ por %xmm10,%xmm8
+ por %xmm11,%xmm9
+ por %xmm12,%xmm8
+ por %xmm13,%xmm9
+ por %xmm14,%xmm8
+ por %xmm15,%xmm9
+
+ por %xmm9,%xmm8
+ pshufd $0x4e,%xmm8,%xmm9
+ por %xmm9,%xmm8
+.byte 102,76,15,126,195
+
movq %r8,128(%rsp)
+ movq %rdi,128+8(%rsp)
+ movq %rcx,128+16(%rsp)
- shlq $32,%rax
- orq %rax,%rbx
movq (%rsi),%rax
movq 8(%rsi),%rcx
- leaq 128(%rdx,%r9,4),%rbp
mulq %rbx
movq %rax,(%rsp)
movq %rcx,%rax
movq %rdx,%r8
mulq %rbx
- movd (%rbp),%xmm4
addq %rax,%r8
movq 16(%rsi),%rax
movq %rdx,%r9
adcq $0,%r9
mulq %rbx
- movd 64(%rbp),%xmm5
addq %rax,%r9
movq 24(%rsi),%rax
movq %rdx,%r10
adcq $0,%r10
mulq %rbx
- pslldq $4,%xmm5
addq %rax,%r10
movq 32(%rsi),%rax
movq %rdx,%r11
adcq $0,%r11
mulq %rbx
- por %xmm5,%xmm4
addq %rax,%r11
movq 40(%rsi),%rax
movq %rdx,%r12
@@ -516,14 +562,12 @@ rsaz_512_mul_gather4:
adcq $0,%r13
mulq %rbx
- leaq 128(%rbp),%rbp
addq %rax,%r13
movq 56(%rsi),%rax
movq %rdx,%r14
adcq $0,%r14
mulq %rbx
-.byte 102,72,15,126,227
addq %rax,%r14
movq (%rsi),%rax
movq %rdx,%r15
@@ -535,6 +579,35 @@ rsaz_512_mul_gather4:
.align 32
.Loop_mul_gather:
+ movdqa 0(%rbp),%xmm8
+ movdqa 16(%rbp),%xmm9
+ movdqa 32(%rbp),%xmm10
+ movdqa 48(%rbp),%xmm11
+ pand %xmm0,%xmm8
+ movdqa 64(%rbp),%xmm12
+ pand %xmm1,%xmm9
+ movdqa 80(%rbp),%xmm13
+ pand %xmm2,%xmm10
+ movdqa 96(%rbp),%xmm14
+ pand %xmm3,%xmm11
+ movdqa 112(%rbp),%xmm15
+ leaq 128(%rbp),%rbp
+ pand %xmm4,%xmm12
+ pand %xmm5,%xmm13
+ pand %xmm6,%xmm14
+ pand %xmm7,%xmm15
+ por %xmm10,%xmm8
+ por %xmm11,%xmm9
+ por %xmm12,%xmm8
+ por %xmm13,%xmm9
+ por %xmm14,%xmm8
+ por %xmm15,%xmm9
+
+ por %xmm9,%xmm8
+ pshufd $0x4e,%xmm8,%xmm9
+ por %xmm9,%xmm8
+.byte 102,76,15,126,195
+
mulq %rbx
addq %rax,%r8
movq 8(%rsi),%rax
@@ -543,7 +616,6 @@ rsaz_512_mul_gather4:
adcq $0,%r8
mulq %rbx
- movd (%rbp),%xmm4
addq %rax,%r9
movq 16(%rsi),%rax
adcq $0,%rdx
@@ -552,7 +624,6 @@ rsaz_512_mul_gather4:
adcq $0,%r9
mulq %rbx
- movd 64(%rbp),%xmm5
addq %rax,%r10
movq 24(%rsi),%rax
adcq $0,%rdx
@@ -561,7 +632,6 @@ rsaz_512_mul_gather4:
adcq $0,%r10
mulq %rbx
- pslldq $4,%xmm5
addq %rax,%r11
movq 32(%rsi),%rax
adcq $0,%rdx
@@ -570,7 +640,6 @@ rsaz_512_mul_gather4:
adcq $0,%r11
mulq %rbx
- por %xmm5,%xmm4
addq %rax,%r12
movq 40(%rsi),%rax
adcq $0,%rdx
@@ -595,7 +664,6 @@ rsaz_512_mul_gather4:
adcq $0,%r14
mulq %rbx
-.byte 102,72,15,126,227
addq %rax,%r15
movq (%rsi),%rax
adcq $0,%rdx
@@ -603,7 +671,6 @@ rsaz_512_mul_gather4:
movq %rdx,%r15
adcq $0,%r15
- leaq 128(%rbp),%rbp
leaq 8(%rdi),%rdi
decl %ecx
@@ -618,8 +685,8 @@ rsaz_512_mul_gather4:
movq %r14,48(%rdi)
movq %r15,56(%rdi)
-.byte 102,72,15,126,199
-.byte 102,72,15,126,205
+ movq 128+8(%rsp),%rdi
+ movq 128+16(%rsp),%rbp
movq (%rsp),%r8
movq 8(%rsp),%r9
@@ -668,7 +735,7 @@ rsaz_512_mul_scatter4:
movl %r9d,%r9d
subq $128+24,%rsp
.Lmul_scatter4_body:
- leaq (%r8,%r9,4),%r8
+ leaq (%r8,%r9,8),%r8
.byte 102,72,15,110,199
.byte 102,72,15,110,202
.byte 102,73,15,110,208
@@ -704,30 +771,14 @@ rsaz_512_mul_scatter4:
call __rsaz_512_subtract
- movl %r8d,0(%rsi)
- shrq $32,%r8
- movl %r9d,128(%rsi)
- shrq $32,%r9
- movl %r10d,256(%rsi)
- shrq $32,%r10
- movl %r11d,384(%rsi)
- shrq $32,%r11
- movl %r12d,512(%rsi)
- shrq $32,%r12
- movl %r13d,640(%rsi)
- shrq $32,%r13
- movl %r14d,768(%rsi)
- shrq $32,%r14
- movl %r15d,896(%rsi)
- shrq $32,%r15
- movl %r8d,64(%rsi)
- movl %r9d,192(%rsi)
- movl %r10d,320(%rsi)
- movl %r11d,448(%rsi)
- movl %r12d,576(%rsi)
- movl %r13d,704(%rsi)
- movl %r14d,832(%rsi)
- movl %r15d,960(%rsi)
+ movq %r8,0(%rsi)
+ movq %r9,128(%rsi)
+ movq %r10,256(%rsi)
+ movq %r11,384(%rsi)
+ movq %r12,512(%rsi)
+ movq %r13,640(%rsi)
+ movq %r14,768(%rsi)
+ movq %r15,896(%rsi)
leaq 128+24+48(%rsp),%rax
movq -48(%rax),%r15
@@ -1080,16 +1131,14 @@ __rsaz_512_mul:
.type rsaz_512_scatter4,@function
.align 16
rsaz_512_scatter4:
- leaq (%rdi,%rdx,4),%rdi
+ leaq (%rdi,%rdx,8),%rdi
movl $8,%r9d
jmp .Loop_scatter
.align 16
.Loop_scatter:
movq (%rsi),%rax
leaq 8(%rsi),%rsi
- movl %eax,(%rdi)
- shrq $32,%rax
- movl %eax,64(%rdi)
+ movq %rax,(%rdi)
leaq 128(%rdi),%rdi
decl %r9d
jnz .Loop_scatter
@@ -1100,19 +1149,72 @@ rsaz_512_scatter4:
.type rsaz_512_gather4,@function
.align 16
rsaz_512_gather4:
- leaq (%rsi,%rdx,4),%rsi
+ movd %edx,%xmm8
+ movdqa .Linc+16(%rip),%xmm1
+ movdqa .Linc(%rip),%xmm0
+
+ pshufd $0,%xmm8,%xmm8
+ movdqa %xmm1,%xmm7
+ movdqa %xmm1,%xmm2
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm8,%xmm0
+ movdqa %xmm7,%xmm3
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm8,%xmm1
+ movdqa %xmm7,%xmm4
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm8,%xmm2
+ movdqa %xmm7,%xmm5
+ paddd %xmm3,%xmm4
+ pcmpeqd %xmm8,%xmm3
+ movdqa %xmm7,%xmm6
+ paddd %xmm4,%xmm5
+ pcmpeqd %xmm8,%xmm4
+ paddd %xmm5,%xmm6
+ pcmpeqd %xmm8,%xmm5
+ paddd %xmm6,%xmm7
+ pcmpeqd %xmm8,%xmm6
+ pcmpeqd %xmm8,%xmm7
movl $8,%r9d
jmp .Loop_gather
.align 16
.Loop_gather:
- movl (%rsi),%eax
- movl 64(%rsi),%r8d
+ movdqa 0(%rsi),%xmm8
+ movdqa 16(%rsi),%xmm9
+ movdqa 32(%rsi),%xmm10
+ movdqa 48(%rsi),%xmm11
+ pand %xmm0,%xmm8
+ movdqa 64(%rsi),%xmm12
+ pand %xmm1,%xmm9
+ movdqa 80(%rsi),%xmm13
+ pand %xmm2,%xmm10
+ movdqa 96(%rsi),%xmm14
+ pand %xmm3,%xmm11
+ movdqa 112(%rsi),%xmm15
leaq 128(%rsi),%rsi
- shlq $32,%r8
- orq %r8,%rax
- movq %rax,(%rdi)
+ pand %xmm4,%xmm12
+ pand %xmm5,%xmm13
+ pand %xmm6,%xmm14
+ pand %xmm7,%xmm15
+ por %xmm10,%xmm8
+ por %xmm11,%xmm9
+ por %xmm12,%xmm8
+ por %xmm13,%xmm9
+ por %xmm14,%xmm8
+ por %xmm15,%xmm9
+
+ por %xmm9,%xmm8
+ pshufd $0x4e,%xmm8,%xmm9
+ por %xmm9,%xmm8
+ movq %xmm8,(%rdi)
leaq 8(%rdi),%rdi
decl %r9d
jnz .Loop_gather
.byte 0xf3,0xc3
+.LSEH_end_rsaz_512_gather4:
.size rsaz_512_gather4,.-rsaz_512_gather4
+
+.align 64
+.Linc:
+.long 0,0, 1,1
+.long 2,2, 2,2
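
The rsaz changes above replace index-dependent table addressing (the removed
movl 64(%rdx,%r9,4),%eax and friends) with a fixed-pattern gather: all 16
table lines are read on every pass, and pcmpeqd-generated masks combined with
pand/por keep only the line whose index matches, so the cache-access pattern
no longer depends on the secret exponent window. The scatter side likewise
moves from interleaved 32-bit halves at stride 4 to whole 64-bit words at
stride 8. This is 1.0.2g's hardening against the CacheBleed side-channel
attack (CVE-2016-0702). A hedged C sketch of the same constant-time gather
idea, with illustrative names and sizes:

    #include <stdint.h>

    /* Gather entry `idx` from a 16-entry table of 8 words each without
     * an index-dependent memory access: read every entry and mask.    */
    void ct_gather(uint64_t out[8], const uint64_t table[16][8], uint64_t idx)
    {
        for (int j = 0; j < 8; j++)
            out[j] = 0;
        for (uint64_t i = 0; i < 16; i++) {
            uint64_t d = i ^ idx;
            /* mask == ~0 iff i == idx, computed without a branch
             * (the pcmpeqd of the asm) */
            uint64_t mask = ((d | (0 - d)) >> 63) - 1;
            for (int j = 0; j < 8; j++)
                out[j] |= table[i][j] & mask;   /* pand + por */
        }
    }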
diff --git a/secure/lib/libcrypto/amd64/sha1-mb-x86_64.S b/secure/lib/libcrypto/amd64/sha1-mb-x86_64.S
index 6c7cd2f..6a79761 100644
--- a/secure/lib/libcrypto/amd64/sha1-mb-x86_64.S
+++ b/secure/lib/libcrypto/amd64/sha1-mb-x86_64.S
@@ -2600,10 +2600,10 @@ _shaext_shortcut:
punpcklqdq %xmm5,%xmm0
punpckhqdq %xmm5,%xmm8
- pshufd $63,%xmm7,%xmm1
- pshufd $127,%xmm7,%xmm9
- pshufd $27,%xmm0,%xmm0
- pshufd $27,%xmm8,%xmm8
+ pshufd $0b00111111,%xmm7,%xmm1
+ pshufd $0b01111111,%xmm7,%xmm9
+ pshufd $0b00011011,%xmm0,%xmm0
+ pshufd $0b00011011,%xmm8,%xmm8
jmp .Loop_shaext
.align 32
@@ -2858,8 +2858,8 @@ _shaext_shortcut:
.byte 69,15,58,204,193,3
.byte 69,15,56,200,214
- pshufd $0,%xmm6,%xmm11
- pshufd $85,%xmm6,%xmm12
+ pshufd $0x00,%xmm6,%xmm11
+ pshufd $0x55,%xmm6,%xmm12
movdqa %xmm6,%xmm7
pcmpgtd %xmm4,%xmm11
pcmpgtd %xmm4,%xmm12
@@ -2889,8 +2889,8 @@ _shaext_shortcut:
movl 280(%rsp),%edx
- pshufd $27,%xmm0,%xmm0
- pshufd $27,%xmm8,%xmm8
+ pshufd $0b00011011,%xmm0,%xmm0
+ pshufd $0b00011011,%xmm8,%xmm8
movdqa %xmm0,%xmm6
punpckldq %xmm8,%xmm0
diff --git a/secure/lib/libcrypto/amd64/sha1-x86_64.S b/secure/lib/libcrypto/amd64/sha1-x86_64.S
index 25c27e5..74c9432 100644
--- a/secure/lib/libcrypto/amd64/sha1-x86_64.S
+++ b/secure/lib/libcrypto/amd64/sha1-x86_64.S
@@ -1241,9 +1241,9 @@ _shaext_shortcut:
movdqa K_XX_XX+160(%rip),%xmm3
movdqu (%rsi),%xmm4
- pshufd $27,%xmm0,%xmm0
+ pshufd $0b00011011,%xmm0,%xmm0
movdqu 16(%rsi),%xmm5
- pshufd $27,%xmm1,%xmm1
+ pshufd $0b00011011,%xmm1,%xmm1
movdqu 32(%rsi),%xmm6
.byte 102,15,56,0,227
movdqu 48(%rsi),%xmm7
@@ -1393,8 +1393,8 @@ _shaext_shortcut:
jnz .Loop_shaext
- pshufd $27,%xmm0,%xmm0
- pshufd $27,%xmm1,%xmm1
+ pshufd $0b00011011,%xmm0,%xmm0
+ pshufd $0b00011011,%xmm1,%xmm1
movdqu %xmm0,(%rdi)
movd %xmm1,16(%rdi)
.byte 0xf3,0xc3
diff --git a/secure/lib/libcrypto/amd64/sha256-mb-x86_64.S b/secure/lib/libcrypto/amd64/sha256-mb-x86_64.S
index 893d42a..b14c796 100644
--- a/secure/lib/libcrypto/amd64/sha256-mb-x86_64.S
+++ b/secure/lib/libcrypto/amd64/sha256-mb-x86_64.S
@@ -2678,10 +2678,10 @@ _shaext_shortcut:
punpckhqdq %xmm8,%xmm14
punpckhqdq %xmm10,%xmm15
- pshufd $27,%xmm12,%xmm12
- pshufd $27,%xmm13,%xmm13
- pshufd $27,%xmm14,%xmm14
- pshufd $27,%xmm15,%xmm15
+ pshufd $0b00011011,%xmm12,%xmm12
+ pshufd $0b00011011,%xmm13,%xmm13
+ pshufd $0b00011011,%xmm14,%xmm14
+ pshufd $0b00011011,%xmm15,%xmm15
jmp .Loop_shaext
.align 32
@@ -2713,11 +2713,11 @@ _shaext_shortcut:
movdqa %xmm2,%xmm0
movdqa %xmm15,112(%rsp)
.byte 69,15,56,203,254
- pshufd $14,%xmm1,%xmm0
+ pshufd $0x0e,%xmm1,%xmm0
pxor %xmm12,%xmm4
movdqa %xmm12,64(%rsp)
.byte 69,15,56,203,229
- pshufd $14,%xmm2,%xmm0
+ pshufd $0x0e,%xmm2,%xmm0
pxor %xmm14,%xmm8
movdqa %xmm14,96(%rsp)
movdqa 16-128(%rbp),%xmm1
@@ -2735,11 +2735,11 @@ _shaext_shortcut:
.byte 102,68,15,56,0,211
prefetcht0 127(%r9)
.byte 69,15,56,203,254
- pshufd $14,%xmm1,%xmm0
+ pshufd $0x0e,%xmm1,%xmm0
.byte 102,68,15,56,0,219
.byte 15,56,204,229
.byte 69,15,56,203,229
- pshufd $14,%xmm2,%xmm0
+ pshufd $0x0e,%xmm2,%xmm0
movdqa 32-128(%rbp),%xmm1
paddd %xmm6,%xmm1
.byte 69,15,56,203,247
@@ -2752,14 +2752,14 @@ _shaext_shortcut:
movdqa %xmm2,%xmm0
movdqa %xmm7,%xmm3
.byte 69,15,56,203,254
- pshufd $14,%xmm1,%xmm0
+ pshufd $0x0e,%xmm1,%xmm0
.byte 102,15,58,15,222,4
paddd %xmm3,%xmm4
movdqa %xmm11,%xmm3
.byte 102,65,15,58,15,218,4
.byte 15,56,204,238
.byte 69,15,56,203,229
- pshufd $14,%xmm2,%xmm0
+ pshufd $0x0e,%xmm2,%xmm0
movdqa 48-128(%rbp),%xmm1
paddd %xmm7,%xmm1
.byte 69,15,56,203,247
@@ -2776,13 +2776,13 @@ _shaext_shortcut:
.byte 102,15,58,15,223,4
.byte 69,15,56,203,254
.byte 69,15,56,205,195
- pshufd $14,%xmm1,%xmm0
+ pshufd $0x0e,%xmm1,%xmm0
paddd %xmm3,%xmm5
movdqa %xmm8,%xmm3
.byte 102,65,15,58,15,219,4
.byte 15,56,204,247
.byte 69,15,56,203,229
- pshufd $14,%xmm2,%xmm0
+ pshufd $0x0e,%xmm2,%xmm0
movdqa 64-128(%rbp),%xmm1
paddd %xmm4,%xmm1
.byte 69,15,56,203,247
@@ -2798,13 +2798,13 @@ _shaext_shortcut:
.byte 102,15,58,15,220,4
.byte 69,15,56,203,254
.byte 69,15,56,205,200
- pshufd $14,%xmm1,%xmm0
+ pshufd $0x0e,%xmm1,%xmm0
paddd %xmm3,%xmm6
movdqa %xmm9,%xmm3
.byte 102,65,15,58,15,216,4
.byte 15,56,204,252
.byte 69,15,56,203,229
- pshufd $14,%xmm2,%xmm0
+ pshufd $0x0e,%xmm2,%xmm0
movdqa 80-128(%rbp),%xmm1
paddd %xmm5,%xmm1
.byte 69,15,56,203,247
@@ -2820,13 +2820,13 @@ _shaext_shortcut:
.byte 102,15,58,15,221,4
.byte 69,15,56,203,254
.byte 69,15,56,205,209
- pshufd $14,%xmm1,%xmm0
+ pshufd $0x0e,%xmm1,%xmm0
paddd %xmm3,%xmm7
movdqa %xmm10,%xmm3
.byte 102,65,15,58,15,217,4
.byte 15,56,204,229
.byte 69,15,56,203,229
- pshufd $14,%xmm2,%xmm0
+ pshufd $0x0e,%xmm2,%xmm0
movdqa 96-128(%rbp),%xmm1
paddd %xmm6,%xmm1
.byte 69,15,56,203,247
@@ -2842,13 +2842,13 @@ _shaext_shortcut:
.byte 102,15,58,15,222,4
.byte 69,15,56,203,254
.byte 69,15,56,205,218
- pshufd $14,%xmm1,%xmm0
+ pshufd $0x0e,%xmm1,%xmm0
paddd %xmm3,%xmm4
movdqa %xmm11,%xmm3
.byte 102,65,15,58,15,218,4
.byte 15,56,204,238
.byte 69,15,56,203,229
- pshufd $14,%xmm2,%xmm0
+ pshufd $0x0e,%xmm2,%xmm0
movdqa 112-128(%rbp),%xmm1
paddd %xmm7,%xmm1
.byte 69,15,56,203,247
@@ -2864,13 +2864,13 @@ _shaext_shortcut:
.byte 102,15,58,15,223,4
.byte 69,15,56,203,254
.byte 69,15,56,205,195
- pshufd $14,%xmm1,%xmm0
+ pshufd $0x0e,%xmm1,%xmm0
paddd %xmm3,%xmm5
movdqa %xmm8,%xmm3
.byte 102,65,15,58,15,219,4
.byte 15,56,204,247
.byte 69,15,56,203,229
- pshufd $14,%xmm2,%xmm0
+ pshufd $0x0e,%xmm2,%xmm0
movdqa 128-128(%rbp),%xmm1
paddd %xmm4,%xmm1
.byte 69,15,56,203,247
@@ -2886,13 +2886,13 @@ _shaext_shortcut:
.byte 102,15,58,15,220,4
.byte 69,15,56,203,254
.byte 69,15,56,205,200
- pshufd $14,%xmm1,%xmm0
+ pshufd $0x0e,%xmm1,%xmm0
paddd %xmm3,%xmm6
movdqa %xmm9,%xmm3
.byte 102,65,15,58,15,216,4
.byte 15,56,204,252
.byte 69,15,56,203,229
- pshufd $14,%xmm2,%xmm0
+ pshufd $0x0e,%xmm2,%xmm0
movdqa 144-128(%rbp),%xmm1
paddd %xmm5,%xmm1
.byte 69,15,56,203,247
@@ -2908,13 +2908,13 @@ _shaext_shortcut:
.byte 102,15,58,15,221,4
.byte 69,15,56,203,254
.byte 69,15,56,205,209
- pshufd $14,%xmm1,%xmm0
+ pshufd $0x0e,%xmm1,%xmm0
paddd %xmm3,%xmm7
movdqa %xmm10,%xmm3
.byte 102,65,15,58,15,217,4
.byte 15,56,204,229
.byte 69,15,56,203,229
- pshufd $14,%xmm2,%xmm0
+ pshufd $0x0e,%xmm2,%xmm0
movdqa 160-128(%rbp),%xmm1
paddd %xmm6,%xmm1
.byte 69,15,56,203,247
@@ -2930,13 +2930,13 @@ _shaext_shortcut:
.byte 102,15,58,15,222,4
.byte 69,15,56,203,254
.byte 69,15,56,205,218
- pshufd $14,%xmm1,%xmm0
+ pshufd $0x0e,%xmm1,%xmm0
paddd %xmm3,%xmm4
movdqa %xmm11,%xmm3
.byte 102,65,15,58,15,218,4
.byte 15,56,204,238
.byte 69,15,56,203,229
- pshufd $14,%xmm2,%xmm0
+ pshufd $0x0e,%xmm2,%xmm0
movdqa 176-128(%rbp),%xmm1
paddd %xmm7,%xmm1
.byte 69,15,56,203,247
@@ -2952,13 +2952,13 @@ _shaext_shortcut:
.byte 102,15,58,15,223,4
.byte 69,15,56,203,254
.byte 69,15,56,205,195
- pshufd $14,%xmm1,%xmm0
+ pshufd $0x0e,%xmm1,%xmm0
paddd %xmm3,%xmm5
movdqa %xmm8,%xmm3
.byte 102,65,15,58,15,219,4
.byte 15,56,204,247
.byte 69,15,56,203,229
- pshufd $14,%xmm2,%xmm0
+ pshufd $0x0e,%xmm2,%xmm0
movdqa 192-128(%rbp),%xmm1
paddd %xmm4,%xmm1
.byte 69,15,56,203,247
@@ -2974,13 +2974,13 @@ _shaext_shortcut:
.byte 102,15,58,15,220,4
.byte 69,15,56,203,254
.byte 69,15,56,205,200
- pshufd $14,%xmm1,%xmm0
+ pshufd $0x0e,%xmm1,%xmm0
paddd %xmm3,%xmm6
movdqa %xmm9,%xmm3
.byte 102,65,15,58,15,216,4
.byte 15,56,204,252
.byte 69,15,56,203,229
- pshufd $14,%xmm2,%xmm0
+ pshufd $0x0e,%xmm2,%xmm0
movdqa 208-128(%rbp),%xmm1
paddd %xmm5,%xmm1
.byte 69,15,56,203,247
@@ -2996,13 +2996,13 @@ _shaext_shortcut:
.byte 102,15,58,15,221,4
.byte 69,15,56,203,254
.byte 69,15,56,205,209
- pshufd $14,%xmm1,%xmm0
+ pshufd $0x0e,%xmm1,%xmm0
paddd %xmm3,%xmm7
movdqa %xmm10,%xmm3
.byte 102,65,15,58,15,217,4
nop
.byte 69,15,56,203,229
- pshufd $14,%xmm2,%xmm0
+ pshufd $0x0e,%xmm2,%xmm0
movdqa 224-128(%rbp),%xmm1
paddd %xmm6,%xmm1
.byte 69,15,56,203,247
@@ -3019,13 +3019,13 @@ _shaext_shortcut:
pxor %xmm6,%xmm6
.byte 69,15,56,203,254
.byte 69,15,56,205,218
- pshufd $14,%xmm1,%xmm0
+ pshufd $0x0e,%xmm1,%xmm0
movdqa 240-128(%rbp),%xmm1
paddd %xmm7,%xmm1
movq (%rbx),%xmm7
nop
.byte 69,15,56,203,229
- pshufd $14,%xmm2,%xmm0
+ pshufd $0x0e,%xmm2,%xmm0
movdqa 240-128(%rbp),%xmm2
paddd %xmm11,%xmm2
.byte 69,15,56,203,247
@@ -3035,17 +3035,17 @@ _shaext_shortcut:
cmovgeq %rsp,%r8
cmpl 4(%rbx),%ecx
cmovgeq %rsp,%r9
- pshufd $0,%xmm7,%xmm9
+ pshufd $0x00,%xmm7,%xmm9
.byte 69,15,56,203,236
movdqa %xmm2,%xmm0
- pshufd $85,%xmm7,%xmm10
+ pshufd $0x55,%xmm7,%xmm10
movdqa %xmm7,%xmm11
.byte 69,15,56,203,254
- pshufd $14,%xmm1,%xmm0
+ pshufd $0x0e,%xmm1,%xmm0
pcmpgtd %xmm6,%xmm9
pcmpgtd %xmm6,%xmm10
.byte 69,15,56,203,229
- pshufd $14,%xmm2,%xmm0
+ pshufd $0x0e,%xmm2,%xmm0
pcmpgtd %xmm6,%xmm11
movdqa K256_shaext-16(%rip),%xmm3
.byte 69,15,56,203,247
@@ -3067,10 +3067,10 @@ _shaext_shortcut:
movl 280(%rsp),%edx
- pshufd $27,%xmm12,%xmm12
- pshufd $27,%xmm13,%xmm13
- pshufd $27,%xmm14,%xmm14
- pshufd $27,%xmm15,%xmm15
+ pshufd $0b00011011,%xmm12,%xmm12
+ pshufd $0b00011011,%xmm13,%xmm13
+ pshufd $0b00011011,%xmm14,%xmm14
+ pshufd $0b00011011,%xmm15,%xmm15
movdqa %xmm12,%xmm5
movdqa %xmm13,%xmm6
diff --git a/secure/lib/libcrypto/amd64/sha256-x86_64.S b/secure/lib/libcrypto/amd64/sha256-x86_64.S
index a43a668..0cbc566 100644
--- a/secure/lib/libcrypto/amd64/sha256-x86_64.S
+++ b/secure/lib/libcrypto/amd64/sha256-x86_64.S
@@ -1755,9 +1755,9 @@ _shaext_shortcut:
movdqu 16(%rdi),%xmm2
movdqa 512-128(%rcx),%xmm7
- pshufd $27,%xmm1,%xmm0
- pshufd $177,%xmm1,%xmm1
- pshufd $27,%xmm2,%xmm2
+ pshufd $0x1b,%xmm1,%xmm0
+ pshufd $0xb1,%xmm1,%xmm1
+ pshufd $0x1b,%xmm2,%xmm2
movdqa %xmm7,%xmm8
.byte 102,15,58,15,202,8
punpcklqdq %xmm0,%xmm2
@@ -1776,7 +1776,7 @@ _shaext_shortcut:
.byte 102,15,56,0,231
movdqa %xmm2,%xmm10
.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
+ pshufd $0x0e,%xmm0,%xmm0
nop
movdqa %xmm1,%xmm9
.byte 15,56,203,202
@@ -1785,7 +1785,7 @@ _shaext_shortcut:
paddd %xmm4,%xmm0
.byte 102,15,56,0,239
.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
+ pshufd $0x0e,%xmm0,%xmm0
leaq 64(%rsi),%rsi
.byte 15,56,204,220
.byte 15,56,203,202
@@ -1794,7 +1794,7 @@ _shaext_shortcut:
paddd %xmm5,%xmm0
.byte 102,15,56,0,247
.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
+ pshufd $0x0e,%xmm0,%xmm0
movdqa %xmm6,%xmm7
.byte 102,15,58,15,253,4
nop
@@ -1806,7 +1806,7 @@ _shaext_shortcut:
paddd %xmm6,%xmm0
.byte 15,56,205,222
.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
+ pshufd $0x0e,%xmm0,%xmm0
movdqa %xmm3,%xmm7
.byte 102,15,58,15,254,4
nop
@@ -1817,7 +1817,7 @@ _shaext_shortcut:
paddd %xmm3,%xmm0
.byte 15,56,205,227
.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
+ pshufd $0x0e,%xmm0,%xmm0
movdqa %xmm4,%xmm7
.byte 102,15,58,15,251,4
nop
@@ -1828,7 +1828,7 @@ _shaext_shortcut:
paddd %xmm4,%xmm0
.byte 15,56,205,236
.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
+ pshufd $0x0e,%xmm0,%xmm0
movdqa %xmm5,%xmm7
.byte 102,15,58,15,252,4
nop
@@ -1839,7 +1839,7 @@ _shaext_shortcut:
paddd %xmm5,%xmm0
.byte 15,56,205,245
.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
+ pshufd $0x0e,%xmm0,%xmm0
movdqa %xmm6,%xmm7
.byte 102,15,58,15,253,4
nop
@@ -1850,7 +1850,7 @@ _shaext_shortcut:
paddd %xmm6,%xmm0
.byte 15,56,205,222
.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
+ pshufd $0x0e,%xmm0,%xmm0
movdqa %xmm3,%xmm7
.byte 102,15,58,15,254,4
nop
@@ -1861,7 +1861,7 @@ _shaext_shortcut:
paddd %xmm3,%xmm0
.byte 15,56,205,227
.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
+ pshufd $0x0e,%xmm0,%xmm0
movdqa %xmm4,%xmm7
.byte 102,15,58,15,251,4
nop
@@ -1872,7 +1872,7 @@ _shaext_shortcut:
paddd %xmm4,%xmm0
.byte 15,56,205,236
.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
+ pshufd $0x0e,%xmm0,%xmm0
movdqa %xmm5,%xmm7
.byte 102,15,58,15,252,4
nop
@@ -1883,7 +1883,7 @@ _shaext_shortcut:
paddd %xmm5,%xmm0
.byte 15,56,205,245
.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
+ pshufd $0x0e,%xmm0,%xmm0
movdqa %xmm6,%xmm7
.byte 102,15,58,15,253,4
nop
@@ -1894,7 +1894,7 @@ _shaext_shortcut:
paddd %xmm6,%xmm0
.byte 15,56,205,222
.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
+ pshufd $0x0e,%xmm0,%xmm0
movdqa %xmm3,%xmm7
.byte 102,15,58,15,254,4
nop
@@ -1905,7 +1905,7 @@ _shaext_shortcut:
paddd %xmm3,%xmm0
.byte 15,56,205,227
.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
+ pshufd $0x0e,%xmm0,%xmm0
movdqa %xmm4,%xmm7
.byte 102,15,58,15,251,4
nop
@@ -1916,7 +1916,7 @@ _shaext_shortcut:
paddd %xmm4,%xmm0
.byte 15,56,205,236
.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
+ pshufd $0x0e,%xmm0,%xmm0
movdqa %xmm5,%xmm7
.byte 102,15,58,15,252,4
.byte 15,56,203,202
@@ -1925,7 +1925,7 @@ _shaext_shortcut:
movdqa 448-128(%rcx),%xmm0
paddd %xmm5,%xmm0
.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
+ pshufd $0x0e,%xmm0,%xmm0
.byte 15,56,205,245
movdqa %xmm8,%xmm7
.byte 15,56,203,202
@@ -1934,7 +1934,7 @@ _shaext_shortcut:
paddd %xmm6,%xmm0
nop
.byte 15,56,203,209
- pshufd $14,%xmm0,%xmm0
+ pshufd $0x0e,%xmm0,%xmm0
decq %rdx
nop
.byte 15,56,203,202
@@ -1943,9 +1943,9 @@ _shaext_shortcut:
paddd %xmm9,%xmm1
jnz .Loop_shaext
- pshufd $177,%xmm2,%xmm2
- pshufd $27,%xmm1,%xmm7
- pshufd $177,%xmm1,%xmm1
+ pshufd $0xb1,%xmm2,%xmm2
+ pshufd $0x1b,%xmm1,%xmm7
+ pshufd $0xb1,%xmm1,%xmm1
punpckhqdq %xmm2,%xmm1
.byte 102,15,58,15,215,8
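Likewise for sha256-x86_64.S: every change in this file respells a decimal pshufd immediate in hex, with no functional difference. A throwaway self-check, assuming the decimal originals shown on the '-' lines:

    #include <assert.h>

    int main(void)
    {
        assert(27  == 0x1b);  /* reverse all four dwords           */
        assert(177 == 0xb1);  /* swap dwords within each qword     */
        assert(14  == 0x0e);  /* move the high qword into the low  */
        return 0;
    }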
diff --git a/secure/lib/libcrypto/amd64/vpaes-x86_64.S b/secure/lib/libcrypto/amd64/vpaes-x86_64.S
index 8ec5c40..c990e69 100644
--- a/secure/lib/libcrypto/amd64/vpaes-x86_64.S
+++ b/secure/lib/libcrypto/amd64/vpaes-x86_64.S
@@ -61,7 +61,7 @@ _vpaes_encrypt_core:
addq $16,%r11
pxor %xmm0,%xmm3
.byte 102,15,56,0,193
- andq $48,%r11
+ andq $0x30,%r11
subq $1,%rax
pxor %xmm3,%xmm0
@@ -121,10 +121,10 @@ _vpaes_decrypt_core:
pand %xmm9,%xmm0
.byte 102,15,56,0,208
movdqa .Lk_dipt+16(%rip),%xmm0
- xorq $48,%r11
+ xorq $0x30,%r11
leaq .Lk_dsbd(%rip),%r10
.byte 102,15,56,0,193
- andq $48,%r11
+ andq $0x30,%r11
pxor %xmm5,%xmm2
movdqa .Lk_mc_forward+48(%rip),%xmm5
pxor %xmm2,%xmm0
@@ -243,7 +243,7 @@ _vpaes_schedule_core:
movdqa (%r8,%r10,1),%xmm1
.byte 102,15,56,0,217
movdqu %xmm3,(%rdx)
- xorq $48,%r8
+ xorq $0x30,%r8
.Lschedule_go:
cmpl $192,%esi
@@ -333,7 +333,7 @@ _vpaes_schedule_core:
call _vpaes_schedule_mangle
- pshufd $255,%xmm0,%xmm0
+ pshufd $0xFF,%xmm0,%xmm0
movdqa %xmm7,%xmm5
movdqa %xmm6,%xmm7
call _vpaes_schedule_low_round
@@ -400,8 +400,8 @@ _vpaes_schedule_core:
.type _vpaes_schedule_192_smear,@function
.align 16
_vpaes_schedule_192_smear:
- pshufd $128,%xmm6,%xmm1
- pshufd $254,%xmm7,%xmm0
+ pshufd $0x80,%xmm6,%xmm1
+ pshufd $0xFE,%xmm7,%xmm0
pxor %xmm1,%xmm6
pxor %xmm1,%xmm1
pxor %xmm0,%xmm6
@@ -438,7 +438,7 @@ _vpaes_schedule_round:
pxor %xmm1,%xmm7
- pshufd $255,%xmm0,%xmm0
+ pshufd $0xFF,%xmm0,%xmm0
.byte 102,15,58,15,192,1
@@ -597,7 +597,7 @@ _vpaes_schedule_mangle:
movdqa (%r8,%r10,1),%xmm1
.byte 102,15,56,0,217
addq $-16,%r8
- andq $48,%r8
+ andq $0x30,%r8
movdqu %xmm3,(%rdx)
.byte 0xf3,0xc3
.size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle
@@ -615,7 +615,7 @@ vpaes_set_encrypt_key:
movl %eax,240(%rdx)
movl $0,%ecx
- movl $48,%r8d
+ movl $0x30,%r8d
call _vpaes_schedule_core
xorl %eax,%eax
.byte 0xf3,0xc3
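The vpaes changes are the same decimal-to-hex cleanup ($48 is $0x30, $255 is $0xFF, and so on). The 0x30 mask is the interesting one: in _vpaes_schedule_mangle above, %r8 is stepped by -16 and masked with $0x30, so it cycles through four 16-byte table offsets. A minimal sketch of that index arithmetic, assuming (as in the full source) that the offsets select rows of a 64-byte shift-rows table such as .Lk_sr:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint64_t r8 = 0x30;            /* initial value set in vpaes_set_encrypt_key */
        for (int i = 0; i < 6; i++) {
            printf("offset 0x%02llx\n", (unsigned long long)r8);
            r8 = (r8 - 16) & 0x30;     /* addq $-16,%r8 ; andq $0x30,%r8 */
        }
        return 0;                      /* prints 0x30 0x20 0x10 0x00 0x30 ... */
    }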
diff --git a/secure/lib/libcrypto/amd64/x86_64-gf2m.S b/secure/lib/libcrypto/amd64/x86_64-gf2m.S
index f86c253..a53d511 100644
--- a/secure/lib/libcrypto/amd64/x86_64-gf2m.S
+++ b/secure/lib/libcrypto/amd64/x86_64-gf2m.S
@@ -243,7 +243,7 @@ bn_GF2m_mul_2x2:
movq %rcx,56(%rsp)
movq %r8,64(%rsp)
- movq $15,%r8
+ movq $0xf,%r8
movq %rsi,%rax
movq %rcx,%rbp
call _mul_1x1
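In x86_64-gf2m.S only the window mask changes spelling ($15 to $0xf). For context, bn_GF2m_mul_2x2 computes a carry-less (GF(2)[x]) product two words at a time; below is a plain reference for the underlying one-word operation, offered as a sketch rather than the table-driven original.

    #include <stdint.h>

    /* Carry-less multiply: XOR-accumulate shifted copies of a for each
     * set bit of b.  Requires a compiler with __uint128_t (GCC/Clang). */
    static __uint128_t clmul64(uint64_t a, uint64_t b)
    {
        __uint128_t r = 0;
        for (int i = 0; i < 64; i++)
            if ((b >> i) & 1)
                r ^= (__uint128_t)a << i;
        return r;
    }

    int main(void)
    {
        /* (x + 1)(x^2 + 1) = x^3 + x^2 + x + 1, i.e. 0x3 clmul 0x5 = 0xf */
        return clmul64(0x3, 0x5) == 0xf ? 0 : 1;
    }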
diff --git a/secure/lib/libcrypto/amd64/x86_64-mont.S b/secure/lib/libcrypto/amd64/x86_64-mont.S
index bff0fb9..3e67383 100644
--- a/secure/lib/libcrypto/amd64/x86_64-mont.S
+++ b/secure/lib/libcrypto/amd64/x86_64-mont.S
@@ -634,20 +634,20 @@ bn_sqr8x_mont:
- leaq -64(%rsp,%r9,4),%r11
+ leaq -64(%rsp,%r9,2),%r11
movq (%r8),%r8
subq %rsi,%r11
andq $4095,%r11
cmpq %r11,%r10
jb .Lsqr8x_sp_alt
subq %r11,%rsp
- leaq -64(%rsp,%r9,4),%rsp
+ leaq -64(%rsp,%r9,2),%rsp
jmp .Lsqr8x_sp_done
.align 32
.Lsqr8x_sp_alt:
- leaq 4096-64(,%r9,4),%r10
- leaq -64(%rsp,%r9,4),%rsp
+ leaq 4096-64(,%r9,2),%r10
+ leaq -64(%rsp,%r9,2),%rsp
subq %r10,%r11
movq $0,%r10
cmovcq %r10,%r11
@@ -657,58 +657,80 @@ bn_sqr8x_mont:
movq %r9,%r10
negq %r9
- leaq 64(%rsp,%r9,2),%r11
movq %r8,32(%rsp)
movq %rax,40(%rsp)
.Lsqr8x_body:
- movq %r9,%rbp
-.byte 102,73,15,110,211
- shrq $3+2,%rbp
- movl OPENSSL_ia32cap_P+8(%rip),%eax
- jmp .Lsqr8x_copy_n
-
-.align 32
-.Lsqr8x_copy_n:
- movq 0(%rcx),%xmm0
- movq 8(%rcx),%xmm1
- movq 16(%rcx),%xmm3
- movq 24(%rcx),%xmm4
- leaq 32(%rcx),%rcx
- movdqa %xmm0,0(%r11)
- movdqa %xmm1,16(%r11)
- movdqa %xmm3,32(%r11)
- movdqa %xmm4,48(%r11)
- leaq 64(%r11),%r11
- decq %rbp
- jnz .Lsqr8x_copy_n
-
+.byte 102,72,15,110,209
pxor %xmm0,%xmm0
.byte 102,72,15,110,207
.byte 102,73,15,110,218
call bn_sqr8x_internal
+
+
+
+ leaq (%rdi,%r9,1),%rbx
+ movq %r9,%rcx
+ movq %r9,%rdx
+.byte 102,72,15,126,207
+ sarq $3+2,%rcx
+ jmp .Lsqr8x_sub
+
+.align 32
+.Lsqr8x_sub:
+ movq 0(%rbx),%r12
+ movq 8(%rbx),%r13
+ movq 16(%rbx),%r14
+ movq 24(%rbx),%r15
+ leaq 32(%rbx),%rbx
+ sbbq 0(%rbp),%r12
+ sbbq 8(%rbp),%r13
+ sbbq 16(%rbp),%r14
+ sbbq 24(%rbp),%r15
+ leaq 32(%rbp),%rbp
+ movq %r12,0(%rdi)
+ movq %r13,8(%rdi)
+ movq %r14,16(%rdi)
+ movq %r15,24(%rdi)
+ leaq 32(%rdi),%rdi
+ incq %rcx
+ jnz .Lsqr8x_sub
+
+ sbbq $0,%rax
+ leaq (%rbx,%r9,1),%rbx
+ leaq (%rdi,%r9,1),%rdi
+
+.byte 102,72,15,110,200
pxor %xmm0,%xmm0
- leaq 48(%rsp),%rax
- leaq 64(%rsp,%r9,2),%rdx
- shrq $3+2,%r9
+ pshufd $0,%xmm1,%xmm1
movq 40(%rsp),%rsi
- jmp .Lsqr8x_zero
+ jmp .Lsqr8x_cond_copy
.align 32
-.Lsqr8x_zero:
- movdqa %xmm0,0(%rax)
- movdqa %xmm0,16(%rax)
- movdqa %xmm0,32(%rax)
- movdqa %xmm0,48(%rax)
- leaq 64(%rax),%rax
- movdqa %xmm0,0(%rdx)
- movdqa %xmm0,16(%rdx)
- movdqa %xmm0,32(%rdx)
- movdqa %xmm0,48(%rdx)
- leaq 64(%rdx),%rdx
- decq %r9
- jnz .Lsqr8x_zero
+.Lsqr8x_cond_copy:
+ movdqa 0(%rbx),%xmm2
+ movdqa 16(%rbx),%xmm3
+ leaq 32(%rbx),%rbx
+ movdqu 0(%rdi),%xmm4
+ movdqu 16(%rdi),%xmm5
+ leaq 32(%rdi),%rdi
+ movdqa %xmm0,-32(%rbx)
+ movdqa %xmm0,-16(%rbx)
+ movdqa %xmm0,-32(%rbx,%rdx,1)
+ movdqa %xmm0,-16(%rbx,%rdx,1)
+ pcmpeqd %xmm1,%xmm0
+ pand %xmm1,%xmm2
+ pand %xmm1,%xmm3
+ pand %xmm0,%xmm4
+ pand %xmm0,%xmm5
+ pxor %xmm0,%xmm0
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqu %xmm4,-32(%rdi)
+ movdqu %xmm5,-16(%rdi)
+ addq $32,%r9
+ jnz .Lsqr8x_cond_copy
movq $1,%rax
movq -48(%rsi),%r15
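The x86_64-mont.S rewrite is substantive: the old .Lsqr8x_copy_n/.Lsqr8x_zero pair gives way to an explicit subtraction loop (.Lsqr8x_sub) plus a masked conditional copy (.Lsqr8x_cond_copy) that selects between the reduced and unreduced result with pand/por rather than data-dependent addressing. The selection pattern, as a minimal C sketch (the assembly additionally wipes the scratch area as it streams through it):

    #include <stdint.h>
    #include <stddef.h>

    /* Constant-time select: out[i] = mask ? a[i] : b[i], where mask is
     * all-ones or all-zero and no branch or address depends on it. */
    static void cond_copy(uint64_t *out, const uint64_t *a,
                          const uint64_t *b, uint64_t mask, size_t n)
    {
        for (size_t i = 0; i < n; i++)
            out[i] = (a[i] & mask) | (b[i] & ~mask);
    }

In the assembly the mask lives in %xmm1 (the final borrow broadcast via pshufd $0), and pcmpeqd against zero derives its complement in %xmm0.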
diff --git a/secure/lib/libcrypto/amd64/x86_64-mont5.S b/secure/lib/libcrypto/amd64/x86_64-mont5.S
index 19162e8..cff6c72 100644
--- a/secure/lib/libcrypto/amd64/x86_64-mont5.S
+++ b/secure/lib/libcrypto/amd64/x86_64-mont5.S
@@ -15,46 +15,151 @@ bn_mul_mont_gather5:
.Lmul_enter:
movl %r9d,%r9d
movq %rsp,%rax
- movl 8(%rsp),%r10d
+ movd 8(%rsp),%xmm5
+ leaq .Linc(%rip),%r10
pushq %rbx
pushq %rbp
pushq %r12
pushq %r13
pushq %r14
pushq %r15
+
leaq 2(%r9),%r11
negq %r11
- leaq (%rsp,%r11,8),%rsp
+ leaq -264(%rsp,%r11,8),%rsp
andq $-1024,%rsp
movq %rax,8(%rsp,%r9,8)
.Lmul_body:
- movq %rdx,%r12
- movq %r10,%r11
- shrq $3,%r10
- andq $7,%r11
- notq %r10
- leaq .Lmagic_masks(%rip),%rax
- andq $3,%r10
- leaq 96(%r12,%r11,8),%r12
- movq 0(%rax,%r10,8),%xmm4
- movq 8(%rax,%r10,8),%xmm5
- movq 16(%rax,%r10,8),%xmm6
- movq 24(%rax,%r10,8),%xmm7
-
- movq -96(%r12),%xmm0
- movq -32(%r12),%xmm1
- pand %xmm4,%xmm0
- movq 32(%r12),%xmm2
- pand %xmm5,%xmm1
- movq 96(%r12),%xmm3
- pand %xmm6,%xmm2
- por %xmm1,%xmm0
- pand %xmm7,%xmm3
+ leaq 128(%rdx),%r12
+ movdqa 0(%r10),%xmm0
+ movdqa 16(%r10),%xmm1
+ leaq 24-112(%rsp,%r9,8),%r10
+ andq $-16,%r10
+
+ pshufd $0,%xmm5,%xmm5
+ movdqa %xmm1,%xmm4
+ movdqa %xmm1,%xmm2
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+.byte 0x67
+ movdqa %xmm4,%xmm3
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,112(%r10)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,128(%r10)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,144(%r10)
+ movdqa %xmm4,%xmm2
+
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,160(%r10)
+ movdqa %xmm4,%xmm3
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,176(%r10)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,192(%r10)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,208(%r10)
+ movdqa %xmm4,%xmm2
+
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,224(%r10)
+ movdqa %xmm4,%xmm3
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,240(%r10)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,256(%r10)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,272(%r10)
+ movdqa %xmm4,%xmm2
+
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,288(%r10)
+ movdqa %xmm4,%xmm3
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,304(%r10)
+
+ paddd %xmm2,%xmm3
+.byte 0x67
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,320(%r10)
+
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,336(%r10)
+ pand 64(%r12),%xmm0
+
+ pand 80(%r12),%xmm1
+ pand 96(%r12),%xmm2
+ movdqa %xmm3,352(%r10)
+ pand 112(%r12),%xmm3
+ por %xmm2,%xmm0
+ por %xmm3,%xmm1
+ movdqa -128(%r12),%xmm4
+ movdqa -112(%r12),%xmm5
+ movdqa -96(%r12),%xmm2
+ pand 112(%r10),%xmm4
+ movdqa -80(%r12),%xmm3
+ pand 128(%r10),%xmm5
+ por %xmm4,%xmm0
+ pand 144(%r10),%xmm2
+ por %xmm5,%xmm1
+ pand 160(%r10),%xmm3
por %xmm2,%xmm0
+ por %xmm3,%xmm1
+ movdqa -64(%r12),%xmm4
+ movdqa -48(%r12),%xmm5
+ movdqa -32(%r12),%xmm2
+ pand 176(%r10),%xmm4
+ movdqa -16(%r12),%xmm3
+ pand 192(%r10),%xmm5
+ por %xmm4,%xmm0
+ pand 208(%r10),%xmm2
+ por %xmm5,%xmm1
+ pand 224(%r10),%xmm3
+ por %xmm2,%xmm0
+ por %xmm3,%xmm1
+ movdqa 0(%r12),%xmm4
+ movdqa 16(%r12),%xmm5
+ movdqa 32(%r12),%xmm2
+ pand 240(%r10),%xmm4
+ movdqa 48(%r12),%xmm3
+ pand 256(%r10),%xmm5
+ por %xmm4,%xmm0
+ pand 272(%r10),%xmm2
+ por %xmm5,%xmm1
+ pand 288(%r10),%xmm3
+ por %xmm2,%xmm0
+ por %xmm3,%xmm1
+ por %xmm1,%xmm0
+ pshufd $0x4e,%xmm0,%xmm1
+ por %xmm1,%xmm0
leaq 256(%r12),%r12
- por %xmm3,%xmm0
-
.byte 102,72,15,126,195
movq (%r8),%r8
@@ -63,29 +168,14 @@ bn_mul_mont_gather5:
xorq %r14,%r14
xorq %r15,%r15
- movq -96(%r12),%xmm0
- movq -32(%r12),%xmm1
- pand %xmm4,%xmm0
- movq 32(%r12),%xmm2
- pand %xmm5,%xmm1
-
movq %r8,%rbp
mulq %rbx
movq %rax,%r10
movq (%rcx),%rax
- movq 96(%r12),%xmm3
- pand %xmm6,%xmm2
- por %xmm1,%xmm0
- pand %xmm7,%xmm3
-
imulq %r10,%rbp
movq %rdx,%r11
- por %xmm2,%xmm0
- leaq 256(%r12),%r12
- por %xmm3,%xmm0
-
mulq %rbp
addq %rax,%r10
movq 8(%rsi),%rax
@@ -118,14 +208,12 @@ bn_mul_mont_gather5:
cmpq %r9,%r15
jne .L1st
-.byte 102,72,15,126,195
addq %rax,%r13
- movq (%rsi),%rax
adcq $0,%rdx
addq %r11,%r13
adcq $0,%rdx
- movq %r13,-16(%rsp,%r15,8)
+ movq %r13,-16(%rsp,%r9,8)
movq %rdx,%r13
movq %r10,%r11
@@ -139,33 +227,78 @@ bn_mul_mont_gather5:
jmp .Louter
.align 16
.Louter:
+ leaq 24+128(%rsp,%r9,8),%rdx
+ andq $-16,%rdx
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ movdqa -128(%r12),%xmm0
+ movdqa -112(%r12),%xmm1
+ movdqa -96(%r12),%xmm2
+ movdqa -80(%r12),%xmm3
+ pand -128(%rdx),%xmm0
+ pand -112(%rdx),%xmm1
+ por %xmm0,%xmm4
+ pand -96(%rdx),%xmm2
+ por %xmm1,%xmm5
+ pand -80(%rdx),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqa -64(%r12),%xmm0
+ movdqa -48(%r12),%xmm1
+ movdqa -32(%r12),%xmm2
+ movdqa -16(%r12),%xmm3
+ pand -64(%rdx),%xmm0
+ pand -48(%rdx),%xmm1
+ por %xmm0,%xmm4
+ pand -32(%rdx),%xmm2
+ por %xmm1,%xmm5
+ pand -16(%rdx),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqa 0(%r12),%xmm0
+ movdqa 16(%r12),%xmm1
+ movdqa 32(%r12),%xmm2
+ movdqa 48(%r12),%xmm3
+ pand 0(%rdx),%xmm0
+ pand 16(%rdx),%xmm1
+ por %xmm0,%xmm4
+ pand 32(%rdx),%xmm2
+ por %xmm1,%xmm5
+ pand 48(%rdx),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqa 64(%r12),%xmm0
+ movdqa 80(%r12),%xmm1
+ movdqa 96(%r12),%xmm2
+ movdqa 112(%r12),%xmm3
+ pand 64(%rdx),%xmm0
+ pand 80(%rdx),%xmm1
+ por %xmm0,%xmm4
+ pand 96(%rdx),%xmm2
+ por %xmm1,%xmm5
+ pand 112(%rdx),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ por %xmm5,%xmm4
+ pshufd $0x4e,%xmm4,%xmm0
+ por %xmm4,%xmm0
+ leaq 256(%r12),%r12
+
+ movq (%rsi),%rax
+.byte 102,72,15,126,195
+
xorq %r15,%r15
movq %r8,%rbp
movq (%rsp),%r10
- movq -96(%r12),%xmm0
- movq -32(%r12),%xmm1
- pand %xmm4,%xmm0
- movq 32(%r12),%xmm2
- pand %xmm5,%xmm1
-
mulq %rbx
addq %rax,%r10
movq (%rcx),%rax
adcq $0,%rdx
- movq 96(%r12),%xmm3
- pand %xmm6,%xmm2
- por %xmm1,%xmm0
- pand %xmm7,%xmm3
-
imulq %r10,%rbp
movq %rdx,%r11
- por %xmm2,%xmm0
- leaq 256(%r12),%r12
- por %xmm3,%xmm0
-
mulq %rbp
addq %rax,%r10
movq 8(%rsi),%rax
@@ -201,15 +334,12 @@ bn_mul_mont_gather5:
cmpq %r9,%r15
jne .Linner
-.byte 102,72,15,126,195
-
addq %rax,%r13
- movq (%rsi),%rax
adcq $0,%rdx
addq %r10,%r13
- movq (%rsp,%r15,8),%r10
+ movq (%rsp,%r9,8),%r10
adcq $0,%rdx
- movq %r13,-16(%rsp,%r15,8)
+ movq %r13,-16(%rsp,%r9,8)
movq %rdx,%r13
xorq %rdx,%rdx
@@ -256,6 +386,7 @@ bn_mul_mont_gather5:
movq 8(%rsp,%r9,8),%rsi
movq $1,%rax
+
movq -48(%rsi),%r15
movq -40(%rsi),%r14
movq -32(%rsi),%r13
@@ -278,10 +409,10 @@ bn_mul4x_mont_gather5:
pushq %r13
pushq %r14
pushq %r15
+
.byte 0x67
- movl %r9d,%r10d
shll $3,%r9d
- shll $3+2,%r10d
+ leaq (%r9,%r9,2),%r10
negq %r9
@@ -291,19 +422,21 @@ bn_mul4x_mont_gather5:
- leaq -64(%rsp,%r9,2),%r11
- subq %rsi,%r11
+
+
+ leaq -320(%rsp,%r9,2),%r11
+ subq %rdi,%r11
andq $4095,%r11
cmpq %r11,%r10
jb .Lmul4xsp_alt
subq %r11,%rsp
- leaq -64(%rsp,%r9,2),%rsp
+ leaq -320(%rsp,%r9,2),%rsp
jmp .Lmul4xsp_done
.align 32
.Lmul4xsp_alt:
- leaq 4096-64(,%r9,2),%r10
- leaq -64(%rsp,%r9,2),%rsp
+ leaq 4096-320(,%r9,2),%r10
+ leaq -320(%rsp,%r9,2),%rsp
subq %r10,%r11
movq $0,%r10
cmovcq %r10,%r11
@@ -319,6 +452,7 @@ bn_mul4x_mont_gather5:
movq 40(%rsp),%rsi
movq $1,%rax
+
movq -48(%rsi),%r15
movq -40(%rsi),%r14
movq -32(%rsi),%r13
@@ -334,47 +468,141 @@ bn_mul4x_mont_gather5:
.align 32
mul4x_internal:
shlq $5,%r9
- movl 8(%rax),%r10d
- leaq 256(%rdx,%r9,1),%r13
+ movd 8(%rax),%xmm5
+ leaq .Linc(%rip),%rax
+ leaq 128(%rdx,%r9,1),%r13
shrq $5,%r9
- movq %r10,%r11
- shrq $3,%r10
- andq $7,%r11
- notq %r10
- leaq .Lmagic_masks(%rip),%rax
- andq $3,%r10
- leaq 96(%rdx,%r11,8),%r12
- movq 0(%rax,%r10,8),%xmm4
- movq 8(%rax,%r10,8),%xmm5
- addq $7,%r11
- movq 16(%rax,%r10,8),%xmm6
- movq 24(%rax,%r10,8),%xmm7
- andq $7,%r11
-
- movq -96(%r12),%xmm0
- leaq 256(%r12),%r14
- movq -32(%r12),%xmm1
- pand %xmm4,%xmm0
- movq 32(%r12),%xmm2
- pand %xmm5,%xmm1
- movq 96(%r12),%xmm3
- pand %xmm6,%xmm2
-.byte 0x67
- por %xmm1,%xmm0
- movq -96(%r14),%xmm1
-.byte 0x67
- pand %xmm7,%xmm3
-.byte 0x67
- por %xmm2,%xmm0
- movq -32(%r14),%xmm2
+ movdqa 0(%rax),%xmm0
+ movdqa 16(%rax),%xmm1
+ leaq 88-112(%rsp,%r9,1),%r10
+ leaq 128(%rdx),%r12
+
+ pshufd $0,%xmm5,%xmm5
+ movdqa %xmm1,%xmm4
+.byte 0x67,0x67
+ movdqa %xmm1,%xmm2
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
.byte 0x67
- pand %xmm4,%xmm1
+ movdqa %xmm4,%xmm3
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,112(%r10)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,128(%r10)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,144(%r10)
+ movdqa %xmm4,%xmm2
+
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,160(%r10)
+ movdqa %xmm4,%xmm3
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,176(%r10)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,192(%r10)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,208(%r10)
+ movdqa %xmm4,%xmm2
+
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,224(%r10)
+ movdqa %xmm4,%xmm3
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,240(%r10)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,256(%r10)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,272(%r10)
+ movdqa %xmm4,%xmm2
+
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,288(%r10)
+ movdqa %xmm4,%xmm3
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,304(%r10)
+
+ paddd %xmm2,%xmm3
.byte 0x67
- por %xmm3,%xmm0
- movq 32(%r14),%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,320(%r10)
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,336(%r10)
+ pand 64(%r12),%xmm0
+
+ pand 80(%r12),%xmm1
+ pand 96(%r12),%xmm2
+ movdqa %xmm3,352(%r10)
+ pand 112(%r12),%xmm3
+ por %xmm2,%xmm0
+ por %xmm3,%xmm1
+ movdqa -128(%r12),%xmm4
+ movdqa -112(%r12),%xmm5
+ movdqa -96(%r12),%xmm2
+ pand 112(%r10),%xmm4
+ movdqa -80(%r12),%xmm3
+ pand 128(%r10),%xmm5
+ por %xmm4,%xmm0
+ pand 144(%r10),%xmm2
+ por %xmm5,%xmm1
+ pand 160(%r10),%xmm3
+ por %xmm2,%xmm0
+ por %xmm3,%xmm1
+ movdqa -64(%r12),%xmm4
+ movdqa -48(%r12),%xmm5
+ movdqa -32(%r12),%xmm2
+ pand 176(%r10),%xmm4
+ movdqa -16(%r12),%xmm3
+ pand 192(%r10),%xmm5
+ por %xmm4,%xmm0
+ pand 208(%r10),%xmm2
+ por %xmm5,%xmm1
+ pand 224(%r10),%xmm3
+ por %xmm2,%xmm0
+ por %xmm3,%xmm1
+ movdqa 0(%r12),%xmm4
+ movdqa 16(%r12),%xmm5
+ movdqa 32(%r12),%xmm2
+ pand 240(%r10),%xmm4
+ movdqa 48(%r12),%xmm3
+ pand 256(%r10),%xmm5
+ por %xmm4,%xmm0
+ pand 272(%r10),%xmm2
+ por %xmm5,%xmm1
+ pand 288(%r10),%xmm3
+ por %xmm2,%xmm0
+ por %xmm3,%xmm1
+ por %xmm1,%xmm0
+ pshufd $0x4e,%xmm0,%xmm1
+ por %xmm1,%xmm0
+ leaq 256(%r12),%r12
.byte 102,72,15,126,195
- movq 96(%r14),%xmm0
+
movq %r13,16+8(%rsp)
movq %rdi,56+8(%rsp)
@@ -388,26 +616,10 @@ mul4x_internal:
movq %rax,%r10
movq (%rcx),%rax
- pand %xmm5,%xmm2
- pand %xmm6,%xmm3
- por %xmm2,%xmm1
-
imulq %r10,%rbp
-
-
-
-
-
-
-
- leaq 64+8(%rsp,%r11,8),%r14
+ leaq 64+8(%rsp),%r14
movq %rdx,%r11
- pand %xmm7,%xmm0
- por %xmm3,%xmm1
- leaq 512(%r12),%r12
- por %xmm1,%xmm0
-
mulq %rbp
addq %rax,%r10
movq 8(%rsi,%r9,1),%rax
@@ -416,7 +628,7 @@ mul4x_internal:
mulq %rbx
addq %rax,%r11
- movq 16(%rcx),%rax
+ movq 8(%rcx),%rax
adcq $0,%rdx
movq %rdx,%r10
@@ -426,7 +638,7 @@ mul4x_internal:
adcq $0,%rdx
addq %r11,%rdi
leaq 32(%r9),%r15
- leaq 64(%rcx),%rcx
+ leaq 32(%rcx),%rcx
adcq $0,%rdx
movq %rdi,(%r14)
movq %rdx,%r13
@@ -436,7 +648,7 @@ mul4x_internal:
.L1st4x:
mulq %rbx
addq %rax,%r10
- movq -32(%rcx),%rax
+ movq -16(%rcx),%rax
leaq 32(%r14),%r14
adcq $0,%rdx
movq %rdx,%r11
@@ -452,7 +664,7 @@ mul4x_internal:
mulq %rbx
addq %rax,%r11
- movq -16(%rcx),%rax
+ movq -8(%rcx),%rax
adcq $0,%rdx
movq %rdx,%r10
@@ -482,7 +694,7 @@ mul4x_internal:
mulq %rbx
addq %rax,%r11
- movq 16(%rcx),%rax
+ movq 8(%rcx),%rax
adcq $0,%rdx
movq %rdx,%r10
@@ -491,7 +703,7 @@ mul4x_internal:
movq 16(%rsi,%r15,1),%rax
adcq $0,%rdx
addq %r11,%rdi
- leaq 64(%rcx),%rcx
+ leaq 32(%rcx),%rcx
adcq $0,%rdx
movq %rdi,(%r14)
movq %rdx,%r13
@@ -501,7 +713,7 @@ mul4x_internal:
mulq %rbx
addq %rax,%r10
- movq -32(%rcx),%rax
+ movq -16(%rcx),%rax
leaq 32(%r14),%r14
adcq $0,%rdx
movq %rdx,%r11
@@ -517,7 +729,7 @@ mul4x_internal:
mulq %rbx
addq %rax,%r11
- movq -16(%rcx),%rax
+ movq -8(%rcx),%rax
adcq $0,%rdx
movq %rdx,%r10
@@ -530,8 +742,7 @@ mul4x_internal:
movq %rdi,-16(%r14)
movq %rdx,%r13
-.byte 102,72,15,126,195
- leaq (%rcx,%r9,2),%rcx
+ leaq (%rcx,%r9,1),%rcx
xorq %rdi,%rdi
addq %r10,%r13
@@ -542,6 +753,63 @@ mul4x_internal:
.align 32
.Louter4x:
+ leaq 16+128(%r14),%rdx
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ movdqa -128(%r12),%xmm0
+ movdqa -112(%r12),%xmm1
+ movdqa -96(%r12),%xmm2
+ movdqa -80(%r12),%xmm3
+ pand -128(%rdx),%xmm0
+ pand -112(%rdx),%xmm1
+ por %xmm0,%xmm4
+ pand -96(%rdx),%xmm2
+ por %xmm1,%xmm5
+ pand -80(%rdx),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqa -64(%r12),%xmm0
+ movdqa -48(%r12),%xmm1
+ movdqa -32(%r12),%xmm2
+ movdqa -16(%r12),%xmm3
+ pand -64(%rdx),%xmm0
+ pand -48(%rdx),%xmm1
+ por %xmm0,%xmm4
+ pand -32(%rdx),%xmm2
+ por %xmm1,%xmm5
+ pand -16(%rdx),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqa 0(%r12),%xmm0
+ movdqa 16(%r12),%xmm1
+ movdqa 32(%r12),%xmm2
+ movdqa 48(%r12),%xmm3
+ pand 0(%rdx),%xmm0
+ pand 16(%rdx),%xmm1
+ por %xmm0,%xmm4
+ pand 32(%rdx),%xmm2
+ por %xmm1,%xmm5
+ pand 48(%rdx),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqa 64(%r12),%xmm0
+ movdqa 80(%r12),%xmm1
+ movdqa 96(%r12),%xmm2
+ movdqa 112(%r12),%xmm3
+ pand 64(%rdx),%xmm0
+ pand 80(%rdx),%xmm1
+ por %xmm0,%xmm4
+ pand 96(%rdx),%xmm2
+ por %xmm1,%xmm5
+ pand 112(%rdx),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ por %xmm5,%xmm4
+ pshufd $0x4e,%xmm4,%xmm0
+ por %xmm4,%xmm0
+ leaq 256(%r12),%r12
+.byte 102,72,15,126,195
+
movq (%r14,%r9,1),%r10
movq %r8,%rbp
mulq %rbx
@@ -549,25 +817,11 @@ mul4x_internal:
movq (%rcx),%rax
adcq $0,%rdx
- movq -96(%r12),%xmm0
- movq -32(%r12),%xmm1
- pand %xmm4,%xmm0
- movq 32(%r12),%xmm2
- pand %xmm5,%xmm1
- movq 96(%r12),%xmm3
-
imulq %r10,%rbp
-.byte 0x67
movq %rdx,%r11
movq %rdi,(%r14)
- pand %xmm6,%xmm2
- por %xmm1,%xmm0
- pand %xmm7,%xmm3
- por %xmm2,%xmm0
leaq (%r14,%r9,1),%r14
- leaq 256(%r12),%r12
- por %xmm3,%xmm0
mulq %rbp
addq %rax,%r10
@@ -577,7 +831,7 @@ mul4x_internal:
mulq %rbx
addq %rax,%r11
- movq 16(%rcx),%rax
+ movq 8(%rcx),%rax
adcq $0,%rdx
addq 8(%r14),%r11
adcq $0,%rdx
@@ -589,7 +843,7 @@ mul4x_internal:
adcq $0,%rdx
addq %r11,%rdi
leaq 32(%r9),%r15
- leaq 64(%rcx),%rcx
+ leaq 32(%rcx),%rcx
adcq $0,%rdx
movq %rdx,%r13
jmp .Linner4x
@@ -598,7 +852,7 @@ mul4x_internal:
.Linner4x:
mulq %rbx
addq %rax,%r10
- movq -32(%rcx),%rax
+ movq -16(%rcx),%rax
adcq $0,%rdx
addq 16(%r14),%r10
leaq 32(%r14),%r14
@@ -616,7 +870,7 @@ mul4x_internal:
mulq %rbx
addq %rax,%r11
- movq -16(%rcx),%rax
+ movq -8(%rcx),%rax
adcq $0,%rdx
addq -8(%r14),%r11
adcq $0,%rdx
@@ -650,7 +904,7 @@ mul4x_internal:
mulq %rbx
addq %rax,%r11
- movq 16(%rcx),%rax
+ movq 8(%rcx),%rax
adcq $0,%rdx
addq 8(%r14),%r11
adcq $0,%rdx
@@ -661,7 +915,7 @@ mul4x_internal:
movq 16(%rsi,%r15,1),%rax
adcq $0,%rdx
addq %r11,%rdi
- leaq 64(%rcx),%rcx
+ leaq 32(%rcx),%rcx
adcq $0,%rdx
movq %r13,-8(%r14)
movq %rdx,%r13
@@ -671,7 +925,7 @@ mul4x_internal:
mulq %rbx
addq %rax,%r10
- movq -32(%rcx),%rax
+ movq -16(%rcx),%rax
adcq $0,%rdx
addq 16(%r14),%r10
leaq 32(%r14),%r14
@@ -690,7 +944,7 @@ mul4x_internal:
mulq %rbx
addq %rax,%r11
movq %rbp,%rax
- movq -16(%rcx),%rbp
+ movq -8(%rcx),%rbp
adcq $0,%rdx
addq -8(%r14),%r11
adcq $0,%rdx
@@ -705,9 +959,8 @@ mul4x_internal:
movq %r13,-24(%r14)
movq %rdx,%r13
-.byte 102,72,15,126,195
movq %rdi,-16(%r14)
- leaq (%rcx,%r9,2),%rcx
+ leaq (%rcx,%r9,1),%rcx
xorq %rdi,%rdi
addq %r10,%r13
@@ -718,16 +971,23 @@ mul4x_internal:
cmpq 16+8(%rsp),%r12
jb .Louter4x
+ xorq %rax,%rax
subq %r13,%rbp
adcq %r15,%r15
orq %r15,%rdi
- xorq $1,%rdi
+ subq %rdi,%rax
leaq (%r14,%r9,1),%rbx
- leaq (%rcx,%rdi,8),%rbp
+ movq (%rcx),%r12
+ leaq (%rcx),%rbp
movq %r9,%rcx
sarq $3+2,%rcx
movq 56+8(%rsp),%rdi
- jmp .Lsqr4x_sub
+ decq %r12
+ xorq %r10,%r10
+ movq 8(%rbp),%r13
+ movq 16(%rbp),%r14
+ movq 24(%rbp),%r15
+ jmp .Lsqr4x_sub_entry
.size mul4x_internal,.-mul4x_internal
.globl bn_power5
.type bn_power5,@function
@@ -740,9 +1000,9 @@ bn_power5:
pushq %r13
pushq %r14
pushq %r15
- movl %r9d,%r10d
+
shll $3,%r9d
- shll $3+2,%r10d
+ leal (%r9,%r9,2),%r10d
negq %r9
movq (%r8),%r8
@@ -752,19 +1012,20 @@ bn_power5:
- leaq -64(%rsp,%r9,2),%r11
- subq %rsi,%r11
+
+ leaq -320(%rsp,%r9,2),%r11
+ subq %rdi,%r11
andq $4095,%r11
cmpq %r11,%r10
jb .Lpwr_sp_alt
subq %r11,%rsp
- leaq -64(%rsp,%r9,2),%rsp
+ leaq -320(%rsp,%r9,2),%rsp
jmp .Lpwr_sp_done
.align 32
.Lpwr_sp_alt:
- leaq 4096-64(,%r9,2),%r10
- leaq -64(%rsp,%r9,2),%rsp
+ leaq 4096-320(,%r9,2),%r10
+ leaq -320(%rsp,%r9,2),%rsp
subq %r10,%r11
movq $0,%r10
cmovcq %r10,%r11
@@ -792,10 +1053,15 @@ bn_power5:
.byte 102,72,15,110,226
call __bn_sqr8x_internal
+ call __bn_post4x_internal
call __bn_sqr8x_internal
+ call __bn_post4x_internal
call __bn_sqr8x_internal
+ call __bn_post4x_internal
call __bn_sqr8x_internal
+ call __bn_post4x_internal
call __bn_sqr8x_internal
+ call __bn_post4x_internal
.byte 102,72,15,126,209
.byte 102,72,15,126,226
@@ -1339,9 +1605,9 @@ __bn_sqr8x_internal:
movq %rbx,-16(%rdi)
movq %r8,-8(%rdi)
.byte 102,72,15,126,213
-sqr8x_reduction:
+__bn_sqr8x_reduction:
xorq %rax,%rax
- leaq (%rbp,%r9,2),%rcx
+ leaq (%r9,%rbp,1),%rcx
leaq 48+8(%rsp,%r9,2),%rdx
movq %rcx,0+8(%rsp)
leaq 48+8(%rsp,%r9,1),%rdi
@@ -1374,14 +1640,14 @@ sqr8x_reduction:
.align 32
.L8x_reduce:
mulq %rbx
- movq 16(%rbp),%rax
+ movq 8(%rbp),%rax
negq %r8
movq %rdx,%r8
adcq $0,%r8
mulq %rbx
addq %rax,%r9
- movq 32(%rbp),%rax
+ movq 16(%rbp),%rax
adcq $0,%rdx
addq %r9,%r8
movq %rbx,48-8+8(%rsp,%rcx,8)
@@ -1390,7 +1656,7 @@ sqr8x_reduction:
mulq %rbx
addq %rax,%r10
- movq 48(%rbp),%rax
+ movq 24(%rbp),%rax
adcq $0,%rdx
addq %r10,%r9
movq 32+8(%rsp),%rsi
@@ -1399,7 +1665,7 @@ sqr8x_reduction:
mulq %rbx
addq %rax,%r11
- movq 64(%rbp),%rax
+ movq 32(%rbp),%rax
adcq $0,%rdx
imulq %r8,%rsi
addq %r11,%r10
@@ -1408,7 +1674,7 @@ sqr8x_reduction:
mulq %rbx
addq %rax,%r12
- movq 80(%rbp),%rax
+ movq 40(%rbp),%rax
adcq $0,%rdx
addq %r12,%r11
movq %rdx,%r12
@@ -1416,7 +1682,7 @@ sqr8x_reduction:
mulq %rbx
addq %rax,%r13
- movq 96(%rbp),%rax
+ movq 48(%rbp),%rax
adcq $0,%rdx
addq %r13,%r12
movq %rdx,%r13
@@ -1424,7 +1690,7 @@ sqr8x_reduction:
mulq %rbx
addq %rax,%r14
- movq 112(%rbp),%rax
+ movq 56(%rbp),%rax
adcq $0,%rdx
addq %r14,%r13
movq %rdx,%r14
@@ -1442,7 +1708,7 @@ sqr8x_reduction:
decl %ecx
jnz .L8x_reduce
- leaq 128(%rbp),%rbp
+ leaq 64(%rbp),%rbp
xorq %rax,%rax
movq 8+8(%rsp),%rdx
cmpq 0+8(%rsp),%rbp
@@ -1468,14 +1734,14 @@ sqr8x_reduction:
.L8x_tail:
mulq %rbx
addq %rax,%r8
- movq 16(%rbp),%rax
+ movq 8(%rbp),%rax
movq %r8,(%rdi)
movq %rdx,%r8
adcq $0,%r8
mulq %rbx
addq %rax,%r9
- movq 32(%rbp),%rax
+ movq 16(%rbp),%rax
adcq $0,%rdx
addq %r9,%r8
leaq 8(%rdi),%rdi
@@ -1484,7 +1750,7 @@ sqr8x_reduction:
mulq %rbx
addq %rax,%r10
- movq 48(%rbp),%rax
+ movq 24(%rbp),%rax
adcq $0,%rdx
addq %r10,%r9
movq %rdx,%r10
@@ -1492,7 +1758,7 @@ sqr8x_reduction:
mulq %rbx
addq %rax,%r11
- movq 64(%rbp),%rax
+ movq 32(%rbp),%rax
adcq $0,%rdx
addq %r11,%r10
movq %rdx,%r11
@@ -1500,7 +1766,7 @@ sqr8x_reduction:
mulq %rbx
addq %rax,%r12
- movq 80(%rbp),%rax
+ movq 40(%rbp),%rax
adcq $0,%rdx
addq %r12,%r11
movq %rdx,%r12
@@ -1508,7 +1774,7 @@ sqr8x_reduction:
mulq %rbx
addq %rax,%r13
- movq 96(%rbp),%rax
+ movq 48(%rbp),%rax
adcq $0,%rdx
addq %r13,%r12
movq %rdx,%r13
@@ -1516,7 +1782,7 @@ sqr8x_reduction:
mulq %rbx
addq %rax,%r14
- movq 112(%rbp),%rax
+ movq 56(%rbp),%rax
adcq $0,%rdx
addq %r14,%r13
movq %rdx,%r14
@@ -1534,7 +1800,7 @@ sqr8x_reduction:
decl %ecx
jnz .L8x_tail
- leaq 128(%rbp),%rbp
+ leaq 64(%rbp),%rbp
movq 8+8(%rsp),%rdx
cmpq 0+8(%rsp),%rbp
jae .L8x_tail_done
@@ -1580,7 +1846,7 @@ sqr8x_reduction:
adcq 48(%rdi),%r14
adcq 56(%rdi),%r15
adcq $0,%rax
- movq -16(%rbp),%rcx
+ movq -8(%rbp),%rcx
xorq %rsi,%rsi
.byte 102,72,15,126,213
@@ -1598,44 +1864,62 @@ sqr8x_reduction:
cmpq %rdx,%rdi
jb .L8x_reduction_loop
-
- subq %r15,%rcx
+ .byte 0xf3,0xc3
+.size bn_sqr8x_internal,.-bn_sqr8x_internal
+.type __bn_post4x_internal,@function
+.align 32
+__bn_post4x_internal:
+ movq 0(%rbp),%r12
leaq (%rdi,%r9,1),%rbx
- adcq %rsi,%rsi
movq %r9,%rcx
- orq %rsi,%rax
.byte 102,72,15,126,207
- xorq $1,%rax
+ negq %rax
.byte 102,72,15,126,206
- leaq (%rbp,%rax,8),%rbp
sarq $3+2,%rcx
- jmp .Lsqr4x_sub
+ decq %r12
+ xorq %r10,%r10
+ movq 8(%rbp),%r13
+ movq 16(%rbp),%r14
+ movq 24(%rbp),%r15
+ jmp .Lsqr4x_sub_entry
-.align 32
+.align 16
.Lsqr4x_sub:
-.byte 0x66
- movq 0(%rbx),%r12
- movq 8(%rbx),%r13
- sbbq 0(%rbp),%r12
- movq 16(%rbx),%r14
- sbbq 16(%rbp),%r13
- movq 24(%rbx),%r15
- leaq 32(%rbx),%rbx
- sbbq 32(%rbp),%r14
+ movq 0(%rbp),%r12
+ movq 8(%rbp),%r13
+ movq 16(%rbp),%r14
+ movq 24(%rbp),%r15
+.Lsqr4x_sub_entry:
+ leaq 32(%rbp),%rbp
+ notq %r12
+ notq %r13
+ notq %r14
+ notq %r15
+ andq %rax,%r12
+ andq %rax,%r13
+ andq %rax,%r14
+ andq %rax,%r15
+
+ negq %r10
+ adcq 0(%rbx),%r12
+ adcq 8(%rbx),%r13
+ adcq 16(%rbx),%r14
+ adcq 24(%rbx),%r15
movq %r12,0(%rdi)
- sbbq 48(%rbp),%r15
- leaq 64(%rbp),%rbp
+ leaq 32(%rbx),%rbx
movq %r13,8(%rdi)
+ sbbq %r10,%r10
movq %r14,16(%rdi)
movq %r15,24(%rdi)
leaq 32(%rdi),%rdi
incq %rcx
jnz .Lsqr4x_sub
+
movq %r9,%r10
negq %r9
.byte 0xf3,0xc3
-.size bn_sqr8x_internal,.-bn_sqr8x_internal
+.size __bn_post4x_internal,.-__bn_post4x_internal
.globl bn_from_montgomery
.type bn_from_montgomery,@function
.align 32
@@ -1657,10 +1941,9 @@ bn_from_mont8x:
pushq %r13
pushq %r14
pushq %r15
-.byte 0x67
- movl %r9d,%r10d
+
shll $3,%r9d
- shll $3+2,%r10d
+ leaq (%r9,%r9,2),%r10
negq %r9
movq (%r8),%r8
@@ -1670,19 +1953,20 @@ bn_from_mont8x:
- leaq -64(%rsp,%r9,2),%r11
- subq %rsi,%r11
+
+ leaq -320(%rsp,%r9,2),%r11
+ subq %rdi,%r11
andq $4095,%r11
cmpq %r11,%r10
jb .Lfrom_sp_alt
subq %r11,%rsp
- leaq -64(%rsp,%r9,2),%rsp
+ leaq -320(%rsp,%r9,2),%rsp
jmp .Lfrom_sp_done
.align 32
.Lfrom_sp_alt:
- leaq 4096-64(,%r9,2),%r10
- leaq -64(%rsp,%r9,2),%rsp
+ leaq 4096-320(,%r9,2),%r10
+ leaq -320(%rsp,%r9,2),%rsp
subq %r10,%r11
movq $0,%r10
cmovcq %r10,%r11
@@ -1733,7 +2017,8 @@ bn_from_mont8x:
.byte 0x67
movq %rcx,%rbp
.byte 102,73,15,110,218
- call sqr8x_reduction
+ call __bn_sqr8x_reduction
+ call __bn_post4x_internal
pxor %xmm0,%xmm0
leaq 48(%rsp),%rax
@@ -1800,45 +2085,169 @@ bn_scatter5:
.globl bn_gather5
.type bn_gather5,@function
-.align 16
+.align 32
bn_gather5:
- movl %ecx,%r11d
- shrl $3,%ecx
- andq $7,%r11
- notl %ecx
- leaq .Lmagic_masks(%rip),%rax
- andl $3,%ecx
- leaq 128(%rdx,%r11,8),%rdx
- movq 0(%rax,%rcx,8),%xmm4
- movq 8(%rax,%rcx,8),%xmm5
- movq 16(%rax,%rcx,8),%xmm6
- movq 24(%rax,%rcx,8),%xmm7
+.LSEH_begin_bn_gather5:
+
+.byte 0x4c,0x8d,0x14,0x24
+.byte 0x48,0x81,0xec,0x08,0x01,0x00,0x00
+ leaq .Linc(%rip),%rax
+ andq $-16,%rsp
+
+ movd %ecx,%xmm5
+ movdqa 0(%rax),%xmm0
+ movdqa 16(%rax),%xmm1
+ leaq 128(%rdx),%r11
+ leaq 128(%rsp),%rax
+
+ pshufd $0,%xmm5,%xmm5
+ movdqa %xmm1,%xmm4
+ movdqa %xmm1,%xmm2
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm4,%xmm3
+
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,-128(%rax)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,-112(%rax)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,-96(%rax)
+ movdqa %xmm4,%xmm2
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,-80(%rax)
+ movdqa %xmm4,%xmm3
+
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,-64(%rax)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,-48(%rax)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,-32(%rax)
+ movdqa %xmm4,%xmm2
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,-16(%rax)
+ movdqa %xmm4,%xmm3
+
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,0(%rax)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,16(%rax)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,32(%rax)
+ movdqa %xmm4,%xmm2
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,48(%rax)
+ movdqa %xmm4,%xmm3
+
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,64(%rax)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,80(%rax)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,96(%rax)
+ movdqa %xmm4,%xmm2
+ movdqa %xmm3,112(%rax)
jmp .Lgather
-.align 16
-.Lgather:
- movq -128(%rdx),%xmm0
- movq -64(%rdx),%xmm1
- pand %xmm4,%xmm0
- movq 0(%rdx),%xmm2
- pand %xmm5,%xmm1
- movq 64(%rdx),%xmm3
- pand %xmm6,%xmm2
- por %xmm1,%xmm0
- pand %xmm7,%xmm3
-.byte 0x67,0x67
- por %xmm2,%xmm0
- leaq 256(%rdx),%rdx
- por %xmm3,%xmm0
+.align 32
+.Lgather:
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ movdqa -128(%r11),%xmm0
+ movdqa -112(%r11),%xmm1
+ movdqa -96(%r11),%xmm2
+ pand -128(%rax),%xmm0
+ movdqa -80(%r11),%xmm3
+ pand -112(%rax),%xmm1
+ por %xmm0,%xmm4
+ pand -96(%rax),%xmm2
+ por %xmm1,%xmm5
+ pand -80(%rax),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqa -64(%r11),%xmm0
+ movdqa -48(%r11),%xmm1
+ movdqa -32(%r11),%xmm2
+ pand -64(%rax),%xmm0
+ movdqa -16(%r11),%xmm3
+ pand -48(%rax),%xmm1
+ por %xmm0,%xmm4
+ pand -32(%rax),%xmm2
+ por %xmm1,%xmm5
+ pand -16(%rax),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqa 0(%r11),%xmm0
+ movdqa 16(%r11),%xmm1
+ movdqa 32(%r11),%xmm2
+ pand 0(%rax),%xmm0
+ movdqa 48(%r11),%xmm3
+ pand 16(%rax),%xmm1
+ por %xmm0,%xmm4
+ pand 32(%rax),%xmm2
+ por %xmm1,%xmm5
+ pand 48(%rax),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqa 64(%r11),%xmm0
+ movdqa 80(%r11),%xmm1
+ movdqa 96(%r11),%xmm2
+ pand 64(%rax),%xmm0
+ movdqa 112(%r11),%xmm3
+ pand 80(%rax),%xmm1
+ por %xmm0,%xmm4
+ pand 96(%rax),%xmm2
+ por %xmm1,%xmm5
+ pand 112(%rax),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ por %xmm5,%xmm4
+ leaq 256(%r11),%r11
+ pshufd $0x4e,%xmm4,%xmm0
+ por %xmm4,%xmm0
movq %xmm0,(%rdi)
leaq 8(%rdi),%rdi
subl $1,%esi
jnz .Lgather
+
+ leaq (%r10),%rsp
.byte 0xf3,0xc3
.LSEH_end_bn_gather5:
.size bn_gather5,.-bn_gather5
.align 64
-.Lmagic_masks:
-.long 0,0, 0,0, 0,0, -1,-1
-.long 0,0, 0,0, 0,0, 0,0
+.Linc:
+.long 0,0, 1,1
+.long 2,2, 2,2
.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
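x86_64-mont5.S carries the security payload of this merge: bn_mul_mont_gather5, mul4x_internal and bn_gather5 no longer derive a pointer from the secret window value via .Lmagic_masks. Instead the code builds sixteen pcmpeqd selector masks from the .Linc counter pair, touches all sixteen table entries on every gather, and ORs the masked entries together, so the cache access pattern is independent of the exponent window (the CacheBleed/CVE-2016-0702 hardening shipped in 1.0.2g). A minimal sketch of that gather, with entries simplified to one 64-bit word:

    #include <stdint.h>

    /* Constant-time 16-entry gather: every entry is read, and only the
     * one whose index equals secret_idx survives its mask.  The mask is
     * computed branch-free; a production version would keep whole
     * multi-word entries, as the assembly does. */
    static uint64_t gather16(const uint64_t table[16], uint32_t secret_idx)
    {
        uint64_t acc = 0;
        for (uint32_t i = 0; i < 16; i++) {
            uint64_t diff = (uint64_t)(i ^ secret_idx);
            uint64_t mask = ((diff | (0 - diff)) >> 63) - 1;  /* all-ones iff equal */
            acc |= table[i] & mask;
        }
        return acc;
    }

The mask staging area also appears to account for the larger stack reservations in these routines (-320 rather than -64 relative to %rsp).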
diff --git a/secure/lib/libcrypto/amd64/x86_64cpuid.S b/secure/lib/libcrypto/amd64/x86_64cpuid.S
index 93de516..e5b8011 100644
--- a/secure/lib/libcrypto/amd64/x86_64cpuid.S
+++ b/secure/lib/libcrypto/amd64/x86_64cpuid.S
@@ -45,43 +45,43 @@ OPENSSL_ia32_cpuid:
movl %eax,%r11d
xorl %eax,%eax
- cmpl $1970169159,%ebx
+ cmpl $0x756e6547,%ebx
setne %al
movl %eax,%r9d
- cmpl $1231384169,%edx
+ cmpl $0x49656e69,%edx
setne %al
orl %eax,%r9d
- cmpl $1818588270,%ecx
+ cmpl $0x6c65746e,%ecx
setne %al
orl %eax,%r9d
jz .Lintel
- cmpl $1752462657,%ebx
+ cmpl $0x68747541,%ebx
setne %al
movl %eax,%r10d
- cmpl $1769238117,%edx
+ cmpl $0x69746E65,%edx
setne %al
orl %eax,%r10d
- cmpl $1145913699,%ecx
+ cmpl $0x444D4163,%ecx
setne %al
orl %eax,%r10d
jnz .Lintel
- movl $2147483648,%eax
+ movl $0x80000000,%eax
cpuid
- cmpl $2147483649,%eax
+ cmpl $0x80000001,%eax
jb .Lintel
movl %eax,%r10d
- movl $2147483649,%eax
+ movl $0x80000001,%eax
cpuid
orl %ecx,%r9d
- andl $2049,%r9d
+ andl $0x00000801,%r9d
- cmpl $2147483656,%r10d
+ cmpl $0x80000008,%r10d
jb .Lintel
- movl $2147483656,%eax
+ movl $0x80000008,%eax
cpuid
movzbq %cl,%r10
incq %r10
@@ -93,7 +93,7 @@ OPENSSL_ia32_cpuid:
shrl $16,%ebx
cmpb %r10b,%bl
ja .Lgeneric
- andl $4026531839,%edx
+ andl $0xefffffff,%edx
jmp .Lgeneric
.Lintel:
@@ -106,7 +106,7 @@ OPENSSL_ia32_cpuid:
cpuid
movl %eax,%r10d
shrl $14,%r10d
- andl $4095,%r10d
+ andl $0xfff,%r10d
cmpl $7,%r11d
jb .Lnocacheinfo
@@ -119,29 +119,29 @@ OPENSSL_ia32_cpuid:
.Lnocacheinfo:
movl $1,%eax
cpuid
- andl $3220176895,%edx
+ andl $0xbfefffff,%edx
cmpl $0,%r9d
jne .Lnotintel
- orl $1073741824,%edx
+ orl $0x40000000,%edx
andb $15,%ah
cmpb $15,%ah
jne .Lnotintel
- orl $1048576,%edx
+ orl $0x00100000,%edx
.Lnotintel:
btl $28,%edx
jnc .Lgeneric
- andl $4026531839,%edx
+ andl $0xefffffff,%edx
cmpl $0,%r10d
je .Lgeneric
- orl $268435456,%edx
+ orl $0x10000000,%edx
shrl $16,%ebx
cmpb $1,%bl
ja .Lgeneric
- andl $4026531839,%edx
+ andl $0xefffffff,%edx
.Lgeneric:
- andl $2048,%r9d
- andl $4294965247,%ecx
+ andl $0x00000800,%r9d
+ andl $0xfffff7ff,%ecx
orl %ecx,%r9d
movl %edx,%r10d
@@ -153,9 +153,9 @@ OPENSSL_ia32_cpuid:
cmpl $6,%eax
je .Ldone
.Lclear_avx:
- movl $4026525695,%eax
+ movl $0xefffe7ff,%eax
andl %eax,%r9d
- andl $4294967263,8(%rdi)
+ andl $0xffffffdf,8(%rdi)
.Ldone:
shlq $32,%r9
movl %r10d,%eax
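x86_64cpuid.S gets the same decimal-to-hex treatment, and in hex the constants become self-documenting: they are the little-endian dwords of the CPUID vendor strings ("GenuineIntel" in EBX/EDX/ECX order, "AuthenticAMD" likewise) plus well-known feature-flag masks. A quick demonstration of the vendor constants, assuming a little-endian host:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
        const char *intel = "GenuineIntel", *amd = "AuthenticAMD";
        uint32_t ebx, edx, ecx;

        memcpy(&ebx, intel + 0, 4);  /* "Genu" */
        memcpy(&edx, intel + 4, 4);  /* "ineI" */
        memcpy(&ecx, intel + 8, 4);  /* "ntel" */
        printf("%#010x %#010x %#010x\n", ebx, edx, ecx);
        /* 0x756e6547 0x49656e69 0x6c65746e -- the cmpl values above */

        memcpy(&ebx, amd + 0, 4);    /* "Auth" */
        memcpy(&edx, amd + 4, 4);    /* "enti" */
        memcpy(&ecx, amd + 8, 4);    /* "cAMD" */
        printf("%#010x %#010x %#010x\n", ebx, edx, ecx);
        /* 0x68747541 0x69746e65 0x444d4163 */
        return 0;
    }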