diff options
Diffstat (limited to 'secure/lib/libcrypto/amd64')
26 files changed, 19350 insertions, 7830 deletions
diff --git a/secure/lib/libcrypto/amd64/aes-x86_64.S b/secure/lib/libcrypto/amd64/aes-x86_64.S index c800d5e..3243d6d 100644 --- a/secure/lib/libcrypto/amd64/aes-x86_64.S +++ b/secure/lib/libcrypto/amd64/aes-x86_64.S @@ -151,7 +151,7 @@ _x86_64_AES_encrypt: xorl %r11d,%ebx xorl %r12d,%ecx xorl %r8d,%edx -.byte 0xf3,0xc3 +.byte 0xf3,0xc3 .size _x86_64_AES_encrypt,.-_x86_64_AES_encrypt .type _x86_64_AES_encrypt_compact,@function .align 16 @@ -176,80 +176,78 @@ _x86_64_AES_encrypt_compact: movzbl %al,%r10d movzbl %bl,%r11d movzbl %cl,%r12d - movzbl (%r14,%r10,1),%r10d - movzbl (%r14,%r11,1),%r11d - movzbl (%r14,%r12,1),%r12d - movzbl %dl,%r8d movzbl %bh,%esi movzbl %ch,%edi + shrl $16,%ecx + movzbl %dh,%ebp + movzbl (%r14,%r10,1),%r10d + movzbl (%r14,%r11,1),%r11d + movzbl (%r14,%r12,1),%r12d movzbl (%r14,%r8,1),%r8d - movzbl (%r14,%rsi,1),%r9d - movzbl (%r14,%rdi,1),%r13d - movzbl %dh,%ebp + movzbl (%r14,%rsi,1),%r9d movzbl %ah,%esi - shrl $16,%ecx + movzbl (%r14,%rdi,1),%r13d + movzbl %cl,%edi movzbl (%r14,%rbp,1),%ebp movzbl (%r14,%rsi,1),%esi - shrl $16,%edx - movzbl %cl,%edi shll $8,%r9d + shrl $16,%edx shll $8,%r13d - movzbl (%r14,%rdi,1),%edi xorl %r9d,%r10d - xorl %r13d,%r11d - - movzbl %dl,%r9d shrl $16,%eax + movzbl %dl,%r9d shrl $16,%ebx - movzbl %al,%r13d + xorl %r13d,%r11d shll $8,%ebp - shll $8,%esi - movzbl (%r14,%r9,1),%r9d - movzbl (%r14,%r13,1),%r13d + movzbl %al,%r13d + movzbl (%r14,%rdi,1),%edi xorl %ebp,%r12d - xorl %esi,%r8d + shll $8,%esi movzbl %bl,%ebp - movzbl %dh,%esi shll $16,%edi - movzbl (%r14,%rbp,1),%ebp - movzbl (%r14,%rsi,1),%esi + xorl %esi,%r8d + movzbl (%r14,%r9,1),%r9d + movzbl %dh,%esi + movzbl (%r14,%r13,1),%r13d xorl %edi,%r10d - movzbl %ah,%edi shrl $8,%ecx + movzbl %ah,%edi + shll $16,%r9d shrl $8,%ebx + shll $16,%r13d + xorl %r9d,%r11d + movzbl (%r14,%rbp,1),%ebp + movzbl (%r14,%rsi,1),%esi movzbl (%r14,%rdi,1),%edi movzbl (%r14,%rcx,1),%edx movzbl (%r14,%rbx,1),%ecx - shll $16,%r9d - shll $16,%r13d + shll $16,%ebp - xorl %r9d,%r11d xorl %r13d,%r12d - xorl %ebp,%r8d - shll $24,%esi + xorl %ebp,%r8d shll $24,%edi - shll $24,%edx xorl %esi,%r10d - shll $24,%ecx + shll $24,%edx xorl %edi,%r11d + shll $24,%ecx movl %r10d,%eax movl %r11d,%ebx xorl %r12d,%ecx xorl %r8d,%edx cmpq 16(%rsp),%r15 je .Lenc_compact_done - movl %eax,%esi - movl %ebx,%edi - andl $2155905152,%esi - andl $2155905152,%edi - movl %esi,%r10d - movl %edi,%r11d + movl $2155905152,%r10d + movl $2155905152,%r11d + andl %eax,%r10d + andl %ebx,%r11d + movl %r10d,%esi + movl %r11d,%edi shrl $7,%r10d leal (%rax,%rax,1),%r8d shrl $7,%r11d @@ -267,25 +265,25 @@ _x86_64_AES_encrypt_compact: xorl %r8d,%eax xorl %r9d,%ebx - movl %ecx,%esi - movl %edx,%edi + movl $2155905152,%r12d roll $24,%eax + movl $2155905152,%ebp roll $24,%ebx - andl $2155905152,%esi - andl $2155905152,%edi + andl %ecx,%r12d + andl %edx,%ebp xorl %r8d,%eax xorl %r9d,%ebx - movl %esi,%r12d - movl %edi,%ebp + movl %r12d,%esi rorl $16,%r10d + movl %ebp,%edi rorl $16,%r11d - shrl $7,%r12d leal (%rcx,%rcx,1),%r8d + shrl $7,%r12d xorl %r10d,%eax - xorl %r11d,%ebx shrl $7,%ebp - leal (%rdx,%rdx,1),%r9d + xorl %r11d,%ebx rorl $8,%r10d + leal (%rdx,%rdx,1),%r9d rorl $8,%r11d subl %r12d,%esi subl %ebp,%edi @@ -301,23 +299,23 @@ _x86_64_AES_encrypt_compact: xorl %esi,%r8d xorl %edi,%r9d + rorl $16,%r12d xorl %r8d,%ecx + rorl $16,%ebp xorl %r9d,%edx roll $24,%ecx + movl 0(%r14),%esi roll $24,%edx xorl %r8d,%ecx - xorl %r9d,%edx - movl 0(%r14),%esi - rorl $16,%r12d - rorl $16,%ebp movl 64(%r14),%edi - xorl %r12d,%ecx - xorl %ebp,%edx + xorl %r9d,%edx movl 128(%r14),%r8d + xorl %r12d,%ecx rorl $8,%r12d + xorl %ebp,%edx rorl $8,%ebp - movl 192(%r14),%r9d xorl %r12d,%ecx + movl 192(%r14),%r9d xorl %ebp,%edx jmp .Lenc_loop_compact .align 16 @@ -326,7 +324,7 @@ _x86_64_AES_encrypt_compact: xorl 4(%r15),%ebx xorl 8(%r15),%ecx xorl 12(%r15),%edx -.byte 0xf3,0xc3 +.byte 0xf3,0xc3 .size _x86_64_AES_encrypt_compact,.-_x86_64_AES_encrypt_compact .globl AES_encrypt .type AES_encrypt,@function @@ -548,7 +546,7 @@ _x86_64_AES_decrypt: xorl %r11d,%ebx xorl %r12d,%ecx xorl %r8d,%edx -.byte 0xf3,0xc3 +.byte 0xf3,0xc3 .size _x86_64_AES_decrypt,.-_x86_64_AES_decrypt .type _x86_64_AES_decrypt_compact,@function .align 16 @@ -574,70 +572,69 @@ _x86_64_AES_decrypt_compact: movzbl %al,%r10d movzbl %bl,%r11d movzbl %cl,%r12d - movzbl (%r14,%r10,1),%r10d - movzbl (%r14,%r11,1),%r11d - movzbl (%r14,%r12,1),%r12d - movzbl %dl,%r8d movzbl %dh,%esi movzbl %ah,%edi + shrl $16,%edx + movzbl %bh,%ebp + movzbl (%r14,%r10,1),%r10d + movzbl (%r14,%r11,1),%r11d + movzbl (%r14,%r12,1),%r12d movzbl (%r14,%r8,1),%r8d - movzbl (%r14,%rsi,1),%r9d - movzbl (%r14,%rdi,1),%r13d - movzbl %bh,%ebp + movzbl (%r14,%rsi,1),%r9d movzbl %ch,%esi - shrl $16,%ecx + movzbl (%r14,%rdi,1),%r13d movzbl (%r14,%rbp,1),%ebp movzbl (%r14,%rsi,1),%esi - shrl $16,%edx - movzbl %cl,%edi - shll $8,%r9d + shrl $16,%ecx shll $8,%r13d - movzbl (%r14,%rdi,1),%edi - xorl %r9d,%r10d - xorl %r13d,%r11d - - movzbl %dl,%r9d + shll $8,%r9d + movzbl %cl,%edi shrl $16,%eax + xorl %r9d,%r10d shrl $16,%ebx - movzbl %al,%r13d + movzbl %dl,%r9d + shll $8,%ebp + xorl %r13d,%r11d shll $8,%esi - movzbl (%r14,%r9,1),%r9d - movzbl (%r14,%r13,1),%r13d + movzbl %al,%r13d + movzbl (%r14,%rdi,1),%edi xorl %ebp,%r12d - xorl %esi,%r8d - movzbl %bl,%ebp - movzbl %bh,%esi + shll $16,%edi + xorl %esi,%r8d + movzbl (%r14,%r9,1),%r9d + movzbl %bh,%esi movzbl (%r14,%rbp,1),%ebp - movzbl (%r14,%rsi,1),%esi xorl %edi,%r10d - + movzbl (%r14,%r13,1),%r13d movzbl %ch,%edi + + shll $16,%ebp shll $16,%r9d shll $16,%r13d - movzbl (%r14,%rdi,1),%ebx + xorl %ebp,%r8d + movzbl %dh,%ebp xorl %r9d,%r11d + shrl $8,%eax xorl %r13d,%r12d - movzbl %dh,%edi - shrl $8,%eax - shll $16,%ebp - movzbl (%r14,%rdi,1),%ecx + movzbl (%r14,%rsi,1),%esi + movzbl (%r14,%rdi,1),%ebx + movzbl (%r14,%rbp,1),%ecx movzbl (%r14,%rax,1),%edx - xorl %ebp,%r8d + movl %r10d,%eax shll $24,%esi shll $24,%ebx shll $24,%ecx - xorl %esi,%r10d + xorl %esi,%eax shll $24,%edx xorl %r11d,%ebx - movl %r10d,%eax xorl %r12d,%ecx xorl %r8d,%edx cmpq 16(%rsp),%r15 @@ -650,12 +647,12 @@ _x86_64_AES_decrypt_compact: orq %rbx,%rax orq %rdx,%rcx movq 256+16(%r14),%rbp - movq %rax,%rbx - movq %rcx,%rdx - andq %rsi,%rbx - andq %rsi,%rdx - movq %rbx,%r9 - movq %rdx,%r12 + movq %rsi,%r9 + movq %rsi,%r12 + andq %rax,%r9 + andq %rcx,%r12 + movq %r9,%rbx + movq %r12,%rdx shrq $7,%r9 leaq (%rax,%rax,1),%r8 shrq $7,%r12 @@ -666,15 +663,15 @@ _x86_64_AES_decrypt_compact: andq %rdi,%r11 andq %rbp,%rbx andq %rbp,%rdx - xorq %r8,%rbx - xorq %r11,%rdx - movq %rbx,%r8 - movq %rdx,%r11 - - andq %rsi,%rbx - andq %rsi,%rdx - movq %rbx,%r10 - movq %rdx,%r13 + xorq %rbx,%r8 + xorq %rdx,%r11 + movq %rsi,%r10 + movq %rsi,%r13 + + andq %r8,%r10 + andq %r11,%r13 + movq %r10,%rbx + movq %r13,%rdx shrq $7,%r10 leaq (%r8,%r8,1),%r9 shrq $7,%r13 @@ -685,15 +682,15 @@ _x86_64_AES_decrypt_compact: andq %rdi,%r12 andq %rbp,%rbx andq %rbp,%rdx - xorq %r9,%rbx - xorq %r12,%rdx - movq %rbx,%r9 - movq %rdx,%r12 - - andq %rsi,%rbx - andq %rsi,%rdx - movq %rbx,%r10 - movq %rdx,%r13 + xorq %rbx,%r9 + xorq %rdx,%r12 + movq %rsi,%r10 + movq %rsi,%r13 + + andq %r9,%r10 + andq %r12,%r13 + movq %r10,%rbx + movq %r13,%rdx shrq $7,%r10 xorq %rax,%r8 shrq $7,%r13 @@ -718,51 +715,51 @@ _x86_64_AES_decrypt_compact: movq %rax,%rbx movq %rcx,%rdx xorq %r10,%r9 - xorq %r13,%r12 shrq $32,%rbx + xorq %r13,%r12 shrq $32,%rdx xorq %r8,%r10 - xorq %r11,%r13 roll $8,%eax + xorq %r11,%r13 roll $8,%ecx xorq %r9,%r10 + roll $8,%ebx xorq %r12,%r13 - roll $8,%ebx roll $8,%edx xorl %r10d,%eax - xorl %r13d,%ecx shrq $32,%r10 + xorl %r13d,%ecx shrq $32,%r13 xorl %r10d,%ebx xorl %r13d,%edx movq %r8,%r10 - movq %r11,%r13 - shrq $32,%r10 - shrq $32,%r13 roll $24,%r8d + movq %r11,%r13 roll $24,%r11d - roll $24,%r10d - roll $24,%r13d + shrq $32,%r10 xorl %r8d,%eax + shrq $32,%r13 xorl %r11d,%ecx + roll $24,%r10d movq %r9,%r8 + roll $24,%r13d movq %r12,%r11 + shrq $32,%r8 xorl %r10d,%ebx + shrq $32,%r11 xorl %r13d,%edx movq 0(%r14),%rsi - shrq $32,%r8 - shrq $32,%r11 - movq 64(%r14),%rdi roll $16,%r9d + movq 64(%r14),%rdi roll $16,%r12d movq 128(%r14),%rbp roll $16,%r8d - roll $16,%r11d movq 192(%r14),%r10 xorl %r9d,%eax + roll $16,%r11d xorl %r12d,%ecx movq 256(%r14),%r13 xorl %r8d,%ebx @@ -774,7 +771,7 @@ _x86_64_AES_decrypt_compact: xorl 4(%r15),%ebx xorl 8(%r15),%ecx xorl 12(%r15),%edx -.byte 0xf3,0xc3 +.byte 0xf3,0xc3 .size _x86_64_AES_decrypt_compact,.-_x86_64_AES_decrypt_compact .globl AES_decrypt .type AES_decrypt,@function @@ -860,10 +857,6 @@ private_AES_set_encrypt_key: call _x86_64_AES_set_encrypt_key - movq 8(%rsp),%r15 - movq 16(%rsp),%r14 - movq 24(%rsp),%r13 - movq 32(%rsp),%r12 movq 40(%rsp),%rbp movq 48(%rsp),%rbx addq $56,%rsp @@ -1108,7 +1101,7 @@ _x86_64_AES_set_encrypt_key: .Lbadpointer: movq $-1,%rax .Lexit: -.byte 0xf3,0xc3 +.byte 0xf3,0xc3 .size _x86_64_AES_set_encrypt_key,.-_x86_64_AES_set_encrypt_key .globl private_AES_set_decrypt_key .type private_AES_set_decrypt_key,@function @@ -1161,12 +1154,12 @@ private_AES_set_decrypt_key: leaq 16(%r15),%r15 movq 0(%r15),%rax movq 8(%r15),%rcx - movq %rax,%rbx - movq %rcx,%rdx - andq %rsi,%rbx - andq %rsi,%rdx - movq %rbx,%r9 - movq %rdx,%r12 + movq %rsi,%r9 + movq %rsi,%r12 + andq %rax,%r9 + andq %rcx,%r12 + movq %r9,%rbx + movq %r12,%rdx shrq $7,%r9 leaq (%rax,%rax,1),%r8 shrq $7,%r12 @@ -1177,15 +1170,15 @@ private_AES_set_decrypt_key: andq %rdi,%r11 andq %rbp,%rbx andq %rbp,%rdx - xorq %r8,%rbx - xorq %r11,%rdx - movq %rbx,%r8 - movq %rdx,%r11 - - andq %rsi,%rbx - andq %rsi,%rdx - movq %rbx,%r10 - movq %rdx,%r13 + xorq %rbx,%r8 + xorq %rdx,%r11 + movq %rsi,%r10 + movq %rsi,%r13 + + andq %r8,%r10 + andq %r11,%r13 + movq %r10,%rbx + movq %r13,%rdx shrq $7,%r10 leaq (%r8,%r8,1),%r9 shrq $7,%r13 @@ -1196,15 +1189,15 @@ private_AES_set_decrypt_key: andq %rdi,%r12 andq %rbp,%rbx andq %rbp,%rdx - xorq %r9,%rbx - xorq %r12,%rdx - movq %rbx,%r9 - movq %rdx,%r12 - - andq %rsi,%rbx - andq %rsi,%rdx - movq %rbx,%r10 - movq %rdx,%r13 + xorq %rbx,%r9 + xorq %rdx,%r12 + movq %rsi,%r10 + movq %rsi,%r13 + + andq %r9,%r10 + andq %r12,%r13 + movq %r10,%rbx + movq %r13,%rdx shrq $7,%r10 xorq %rax,%r8 shrq $7,%r13 @@ -1229,51 +1222,51 @@ private_AES_set_decrypt_key: movq %rax,%rbx movq %rcx,%rdx xorq %r10,%r9 - xorq %r13,%r12 shrq $32,%rbx + xorq %r13,%r12 shrq $32,%rdx xorq %r8,%r10 - xorq %r11,%r13 roll $8,%eax + xorq %r11,%r13 roll $8,%ecx xorq %r9,%r10 + roll $8,%ebx xorq %r12,%r13 - roll $8,%ebx roll $8,%edx xorl %r10d,%eax - xorl %r13d,%ecx shrq $32,%r10 + xorl %r13d,%ecx shrq $32,%r13 xorl %r10d,%ebx xorl %r13d,%edx movq %r8,%r10 - movq %r11,%r13 - shrq $32,%r10 - shrq $32,%r13 roll $24,%r8d + movq %r11,%r13 roll $24,%r11d - roll $24,%r10d - roll $24,%r13d + shrq $32,%r10 xorl %r8d,%eax + shrq $32,%r13 xorl %r11d,%ecx + roll $24,%r10d movq %r9,%r8 + roll $24,%r13d movq %r12,%r11 + shrq $32,%r8 xorl %r10d,%ebx + shrq $32,%r11 xorl %r13d,%edx - shrq $32,%r8 - shrq $32,%r11 - roll $16,%r9d + roll $16,%r12d roll $16,%r8d - roll $16,%r11d xorl %r9d,%eax + roll $16,%r11d xorl %r12d,%ecx xorl %r8d,%ebx @@ -1389,7 +1382,7 @@ AES_cbc_encrypt: leaq 80(%rsp),%rdi leaq 80(%rsp),%r15 movl $30,%ecx -.long 0x90A548F3 +.long 0x90A548F3 movl %eax,(%rdi) .Lcbc_skip_ecopy: movq %r15,0(%rsp) @@ -1551,7 +1544,7 @@ AES_cbc_encrypt: je .Lcbc_exit movl $30,%ecx xorq %rax,%rax -.long 0x90AB48F3 +.long 0x90AB48F3 jmp .Lcbc_exit @@ -1606,7 +1599,7 @@ AES_cbc_encrypt: movl 4(%rbp),%ebx movl 8(%rbp),%ecx movl 12(%rbp),%edx - jz .Lcbc_slow_enc_tail + jz .Lcbc_slow_enc_tail .align 4 .Lcbc_slow_enc_loop: @@ -1651,16 +1644,16 @@ AES_cbc_encrypt: movq %r10,%rcx movq %r8,%rsi movq %r9,%rdi -.long 0x9066A4F3 +.long 0x9066A4F3 movq $16,%rcx subq %r10,%rcx xorq %rax,%rax -.long 0x9066AAF3 +.long 0x9066AAF3 movq %r9,%r8 movq $16,%r10 movq %r11,%rax movq %r12,%rcx - jmp .Lcbc_slow_enc_loop + jmp .Lcbc_slow_enc_loop .align 16 .LSLOW_DECRYPT: @@ -1736,7 +1729,7 @@ AES_cbc_encrypt: movq %r9,%rdi leaq 64(%rsp),%rsi leaq 16(%r10),%rcx -.long 0x9066A4F3 +.long 0x9066A4F3 jmp .Lcbc_exit .align 16 diff --git a/secure/lib/libcrypto/amd64/aesni-gcm-x86_64.S b/secure/lib/libcrypto/amd64/aesni-gcm-x86_64.S new file mode 100644 index 0000000..9e99e71 --- /dev/null +++ b/secure/lib/libcrypto/amd64/aesni-gcm-x86_64.S @@ -0,0 +1,16 @@ + # $FreeBSD$ +.text + +.globl aesni_gcm_encrypt +.type aesni_gcm_encrypt,@function +aesni_gcm_encrypt: + xorl %eax,%eax + .byte 0xf3,0xc3 +.size aesni_gcm_encrypt,.-aesni_gcm_encrypt + +.globl aesni_gcm_decrypt +.type aesni_gcm_decrypt,@function +aesni_gcm_decrypt: + xorl %eax,%eax + .byte 0xf3,0xc3 +.size aesni_gcm_decrypt,.-aesni_gcm_decrypt diff --git a/secure/lib/libcrypto/amd64/aesni-mb-x86_64.S b/secure/lib/libcrypto/amd64/aesni-mb-x86_64.S new file mode 100644 index 0000000..7043ec3 --- /dev/null +++ b/secure/lib/libcrypto/amd64/aesni-mb-x86_64.S @@ -0,0 +1,507 @@ + # $FreeBSD$ +.text + + + +.globl aesni_multi_cbc_encrypt +.type aesni_multi_cbc_encrypt,@function +.align 32 +aesni_multi_cbc_encrypt: + movq %rsp,%rax + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + + + + + + subq $48,%rsp + andq $-64,%rsp + movq %rax,16(%rsp) + +.Lenc4x_body: + movdqu (%rsi),%xmm12 + leaq 120(%rsi),%rsi + leaq 80(%rdi),%rdi + +.Lenc4x_loop_grande: + movl %edx,24(%rsp) + xorl %edx,%edx + movl -64(%rdi),%ecx + movq -80(%rdi),%r8 + cmpl %edx,%ecx + movq -72(%rdi),%r12 + cmovgl %ecx,%edx + testl %ecx,%ecx + movdqu -56(%rdi),%xmm2 + movl %ecx,32(%rsp) + cmovleq %rsp,%r8 + movl -24(%rdi),%ecx + movq -40(%rdi),%r9 + cmpl %edx,%ecx + movq -32(%rdi),%r13 + cmovgl %ecx,%edx + testl %ecx,%ecx + movdqu -16(%rdi),%xmm3 + movl %ecx,36(%rsp) + cmovleq %rsp,%r9 + movl 16(%rdi),%ecx + movq 0(%rdi),%r10 + cmpl %edx,%ecx + movq 8(%rdi),%r14 + cmovgl %ecx,%edx + testl %ecx,%ecx + movdqu 24(%rdi),%xmm4 + movl %ecx,40(%rsp) + cmovleq %rsp,%r10 + movl 56(%rdi),%ecx + movq 40(%rdi),%r11 + cmpl %edx,%ecx + movq 48(%rdi),%r15 + cmovgl %ecx,%edx + testl %ecx,%ecx + movdqu 64(%rdi),%xmm5 + movl %ecx,44(%rsp) + cmovleq %rsp,%r11 + testl %edx,%edx + jz .Lenc4x_done + + movups 16-120(%rsi),%xmm1 + pxor %xmm12,%xmm2 + movups 32-120(%rsi),%xmm0 + pxor %xmm12,%xmm3 + movl 240-120(%rsi),%eax + pxor %xmm12,%xmm4 + movdqu (%r8),%xmm6 + pxor %xmm12,%xmm5 + movdqu (%r9),%xmm7 + pxor %xmm6,%xmm2 + movdqu (%r10),%xmm8 + pxor %xmm7,%xmm3 + movdqu (%r11),%xmm9 + pxor %xmm8,%xmm4 + pxor %xmm9,%xmm5 + movdqa 32(%rsp),%xmm10 + xorq %rbx,%rbx + jmp .Loop_enc4x + +.align 32 +.Loop_enc4x: + addq $16,%rbx + leaq 16(%rsp),%rbp + movl $1,%ecx + subq %rbx,%rbp + +.byte 102,15,56,220,209 + prefetcht0 31(%r8,%rbx,1) + prefetcht0 31(%r9,%rbx,1) +.byte 102,15,56,220,217 + prefetcht0 31(%r10,%rbx,1) + prefetcht0 31(%r10,%rbx,1) +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 + movups 48-120(%rsi),%xmm1 + cmpl 32(%rsp),%ecx +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 +.byte 102,15,56,220,224 + cmovgeq %rbp,%r8 + cmovgq %rbp,%r12 +.byte 102,15,56,220,232 + movups -56(%rsi),%xmm0 + cmpl 36(%rsp),%ecx +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 + cmovgeq %rbp,%r9 + cmovgq %rbp,%r13 +.byte 102,15,56,220,233 + movups -40(%rsi),%xmm1 + cmpl 40(%rsp),%ecx +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 +.byte 102,15,56,220,224 + cmovgeq %rbp,%r10 + cmovgq %rbp,%r14 +.byte 102,15,56,220,232 + movups -24(%rsi),%xmm0 + cmpl 44(%rsp),%ecx +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 + cmovgeq %rbp,%r11 + cmovgq %rbp,%r15 +.byte 102,15,56,220,233 + movups -8(%rsi),%xmm1 + movdqa %xmm10,%xmm11 +.byte 102,15,56,220,208 + prefetcht0 15(%r12,%rbx,1) + prefetcht0 15(%r13,%rbx,1) +.byte 102,15,56,220,216 + prefetcht0 15(%r14,%rbx,1) + prefetcht0 15(%r15,%rbx,1) +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 + movups 128-120(%rsi),%xmm0 + pxor %xmm12,%xmm12 + +.byte 102,15,56,220,209 + pcmpgtd %xmm12,%xmm11 + movdqu -120(%rsi),%xmm12 +.byte 102,15,56,220,217 + paddd %xmm11,%xmm10 + movdqa %xmm10,32(%rsp) +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 + movups 144-120(%rsi),%xmm1 + + cmpl $11,%eax + +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 + movups 160-120(%rsi),%xmm0 + + jb .Lenc4x_tail + +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 + movups 176-120(%rsi),%xmm1 + +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 + movups 192-120(%rsi),%xmm0 + + je .Lenc4x_tail + +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 + movups 208-120(%rsi),%xmm1 + +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 + movups 224-120(%rsi),%xmm0 + jmp .Lenc4x_tail + +.align 32 +.Lenc4x_tail: +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 + movdqu (%r8,%rbx,1),%xmm6 + movdqu 16-120(%rsi),%xmm1 + +.byte 102,15,56,221,208 + movdqu (%r9,%rbx,1),%xmm7 + pxor %xmm12,%xmm6 +.byte 102,15,56,221,216 + movdqu (%r10,%rbx,1),%xmm8 + pxor %xmm12,%xmm7 +.byte 102,15,56,221,224 + movdqu (%r11,%rbx,1),%xmm9 + pxor %xmm12,%xmm8 +.byte 102,15,56,221,232 + movdqu 32-120(%rsi),%xmm0 + pxor %xmm12,%xmm9 + + movups %xmm2,-16(%r12,%rbx,1) + pxor %xmm6,%xmm2 + movups %xmm3,-16(%r13,%rbx,1) + pxor %xmm7,%xmm3 + movups %xmm4,-16(%r14,%rbx,1) + pxor %xmm8,%xmm4 + movups %xmm5,-16(%r15,%rbx,1) + pxor %xmm9,%xmm5 + + decl %edx + jnz .Loop_enc4x + + movq 16(%rsp),%rax + movl 24(%rsp),%edx + + + + + + + + + + + leaq 160(%rdi),%rdi + decl %edx + jnz .Lenc4x_loop_grande + +.Lenc4x_done: + movq -48(%rax),%r15 + movq -40(%rax),%r14 + movq -32(%rax),%r13 + movq -24(%rax),%r12 + movq -16(%rax),%rbp + movq -8(%rax),%rbx + leaq (%rax),%rsp +.Lenc4x_epilogue: + .byte 0xf3,0xc3 +.size aesni_multi_cbc_encrypt,.-aesni_multi_cbc_encrypt + +.globl aesni_multi_cbc_decrypt +.type aesni_multi_cbc_decrypt,@function +.align 32 +aesni_multi_cbc_decrypt: + movq %rsp,%rax + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + + + + + + subq $48,%rsp + andq $-64,%rsp + movq %rax,16(%rsp) + +.Ldec4x_body: + movdqu (%rsi),%xmm12 + leaq 120(%rsi),%rsi + leaq 80(%rdi),%rdi + +.Ldec4x_loop_grande: + movl %edx,24(%rsp) + xorl %edx,%edx + movl -64(%rdi),%ecx + movq -80(%rdi),%r8 + cmpl %edx,%ecx + movq -72(%rdi),%r12 + cmovgl %ecx,%edx + testl %ecx,%ecx + movdqu -56(%rdi),%xmm6 + movl %ecx,32(%rsp) + cmovleq %rsp,%r8 + movl -24(%rdi),%ecx + movq -40(%rdi),%r9 + cmpl %edx,%ecx + movq -32(%rdi),%r13 + cmovgl %ecx,%edx + testl %ecx,%ecx + movdqu -16(%rdi),%xmm7 + movl %ecx,36(%rsp) + cmovleq %rsp,%r9 + movl 16(%rdi),%ecx + movq 0(%rdi),%r10 + cmpl %edx,%ecx + movq 8(%rdi),%r14 + cmovgl %ecx,%edx + testl %ecx,%ecx + movdqu 24(%rdi),%xmm8 + movl %ecx,40(%rsp) + cmovleq %rsp,%r10 + movl 56(%rdi),%ecx + movq 40(%rdi),%r11 + cmpl %edx,%ecx + movq 48(%rdi),%r15 + cmovgl %ecx,%edx + testl %ecx,%ecx + movdqu 64(%rdi),%xmm9 + movl %ecx,44(%rsp) + cmovleq %rsp,%r11 + testl %edx,%edx + jz .Ldec4x_done + + movups 16-120(%rsi),%xmm1 + movups 32-120(%rsi),%xmm0 + movl 240-120(%rsi),%eax + movdqu (%r8),%xmm2 + movdqu (%r9),%xmm3 + pxor %xmm12,%xmm2 + movdqu (%r10),%xmm4 + pxor %xmm12,%xmm3 + movdqu (%r11),%xmm5 + pxor %xmm12,%xmm4 + pxor %xmm12,%xmm5 + movdqa 32(%rsp),%xmm10 + xorq %rbx,%rbx + jmp .Loop_dec4x + +.align 32 +.Loop_dec4x: + addq $16,%rbx + leaq 16(%rsp),%rbp + movl $1,%ecx + subq %rbx,%rbp + +.byte 102,15,56,222,209 + prefetcht0 31(%r8,%rbx,1) + prefetcht0 31(%r9,%rbx,1) +.byte 102,15,56,222,217 + prefetcht0 31(%r10,%rbx,1) + prefetcht0 31(%r11,%rbx,1) +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 + movups 48-120(%rsi),%xmm1 + cmpl 32(%rsp),%ecx +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 +.byte 102,15,56,222,224 + cmovgeq %rbp,%r8 + cmovgq %rbp,%r12 +.byte 102,15,56,222,232 + movups -56(%rsi),%xmm0 + cmpl 36(%rsp),%ecx +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 + cmovgeq %rbp,%r9 + cmovgq %rbp,%r13 +.byte 102,15,56,222,233 + movups -40(%rsi),%xmm1 + cmpl 40(%rsp),%ecx +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 +.byte 102,15,56,222,224 + cmovgeq %rbp,%r10 + cmovgq %rbp,%r14 +.byte 102,15,56,222,232 + movups -24(%rsi),%xmm0 + cmpl 44(%rsp),%ecx +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 + cmovgeq %rbp,%r11 + cmovgq %rbp,%r15 +.byte 102,15,56,222,233 + movups -8(%rsi),%xmm1 + movdqa %xmm10,%xmm11 +.byte 102,15,56,222,208 + prefetcht0 15(%r12,%rbx,1) + prefetcht0 15(%r13,%rbx,1) +.byte 102,15,56,222,216 + prefetcht0 15(%r14,%rbx,1) + prefetcht0 15(%r15,%rbx,1) +.byte 102,15,56,222,224 +.byte 102,15,56,222,232 + movups 128-120(%rsi),%xmm0 + pxor %xmm12,%xmm12 + +.byte 102,15,56,222,209 + pcmpgtd %xmm12,%xmm11 + movdqu -120(%rsi),%xmm12 +.byte 102,15,56,222,217 + paddd %xmm11,%xmm10 + movdqa %xmm10,32(%rsp) +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 + movups 144-120(%rsi),%xmm1 + + cmpl $11,%eax + +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 +.byte 102,15,56,222,224 +.byte 102,15,56,222,232 + movups 160-120(%rsi),%xmm0 + + jb .Ldec4x_tail + +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 + movups 176-120(%rsi),%xmm1 + +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 +.byte 102,15,56,222,224 +.byte 102,15,56,222,232 + movups 192-120(%rsi),%xmm0 + + je .Ldec4x_tail + +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 + movups 208-120(%rsi),%xmm1 + +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 +.byte 102,15,56,222,224 +.byte 102,15,56,222,232 + movups 224-120(%rsi),%xmm0 + jmp .Ldec4x_tail + +.align 32 +.Ldec4x_tail: +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 + pxor %xmm0,%xmm6 + pxor %xmm0,%xmm7 +.byte 102,15,56,222,233 + movdqu 16-120(%rsi),%xmm1 + pxor %xmm0,%xmm8 + pxor %xmm0,%xmm9 + movdqu 32-120(%rsi),%xmm0 + +.byte 102,15,56,223,214 +.byte 102,15,56,223,223 + movdqu -16(%r8,%rbx,1),%xmm6 + movdqu -16(%r9,%rbx,1),%xmm7 +.byte 102,65,15,56,223,224 +.byte 102,65,15,56,223,233 + movdqu -16(%r10,%rbx,1),%xmm8 + movdqu -16(%r11,%rbx,1),%xmm9 + + movups %xmm2,-16(%r12,%rbx,1) + movdqu (%r8,%rbx,1),%xmm2 + movups %xmm3,-16(%r13,%rbx,1) + movdqu (%r9,%rbx,1),%xmm3 + pxor %xmm12,%xmm2 + movups %xmm4,-16(%r14,%rbx,1) + movdqu (%r10,%rbx,1),%xmm4 + pxor %xmm12,%xmm3 + movups %xmm5,-16(%r15,%rbx,1) + movdqu (%r11,%rbx,1),%xmm5 + pxor %xmm12,%xmm4 + pxor %xmm12,%xmm5 + + decl %edx + jnz .Loop_dec4x + + movq 16(%rsp),%rax + movl 24(%rsp),%edx + + leaq 160(%rdi),%rdi + decl %edx + jnz .Ldec4x_loop_grande + +.Ldec4x_done: + movq -48(%rax),%r15 + movq -40(%rax),%r14 + movq -32(%rax),%r13 + movq -24(%rax),%r12 + movq -16(%rax),%rbp + movq -8(%rax),%rbx + leaq (%rax),%rsp +.Ldec4x_epilogue: + .byte 0xf3,0xc3 +.size aesni_multi_cbc_decrypt,.-aesni_multi_cbc_decrypt diff --git a/secure/lib/libcrypto/amd64/aesni-sha1-x86_64.S b/secure/lib/libcrypto/amd64/aesni-sha1-x86_64.S index e9a2053..fa16434 100644 --- a/secure/lib/libcrypto/amd64/aesni-sha1-x86_64.S +++ b/secure/lib/libcrypto/amd64/aesni-sha1-x86_64.S @@ -4,16 +4,18 @@ .globl aesni_cbc_sha1_enc .type aesni_cbc_sha1_enc,@function -.align 16 +.align 32 aesni_cbc_sha1_enc: movl OPENSSL_ia32cap_P+0(%rip),%r10d - movl OPENSSL_ia32cap_P+4(%rip),%r11d + movq OPENSSL_ia32cap_P+4(%rip),%r11 + btq $61,%r11 + jc aesni_cbc_sha1_enc_shaext jmp aesni_cbc_sha1_enc_ssse3 .byte 0xf3,0xc3 .size aesni_cbc_sha1_enc,.-aesni_cbc_sha1_enc .type aesni_cbc_sha1_enc_ssse3,@function -.align 16 +.align 32 aesni_cbc_sha1_enc_ssse3: movq 8(%rsp),%r10 @@ -30,12 +32,12 @@ aesni_cbc_sha1_enc_ssse3: movq %rdi,%r12 movq %rsi,%r13 movq %rdx,%r14 - movq %rcx,%r15 - movdqu (%r8),%xmm11 + leaq 112(%rcx),%r15 + movdqu (%r8),%xmm2 movq %r8,88(%rsp) shlq $6,%r14 subq %r12,%r13 - movl 240(%r15),%r8d + movl 240-112(%r15),%r8d addq %r10,%r14 leaq K_XX_XX(%rip),%r11 @@ -45,1188 +47,1168 @@ aesni_cbc_sha1_enc_ssse3: movl 12(%r9),%edx movl %ebx,%esi movl 16(%r9),%ebp + movl %ecx,%edi + xorl %edx,%edi + andl %edi,%esi - movdqa 64(%r11),%xmm6 - movdqa 0(%r11),%xmm9 - movdqu 0(%r10),%xmm0 - movdqu 16(%r10),%xmm1 - movdqu 32(%r10),%xmm2 - movdqu 48(%r10),%xmm3 -.byte 102,15,56,0,198 + movdqa 64(%r11),%xmm3 + movdqa 0(%r11),%xmm13 + movdqu 0(%r10),%xmm4 + movdqu 16(%r10),%xmm5 + movdqu 32(%r10),%xmm6 + movdqu 48(%r10),%xmm7 +.byte 102,15,56,0,227 +.byte 102,15,56,0,235 +.byte 102,15,56,0,243 addq $64,%r10 -.byte 102,15,56,0,206 -.byte 102,15,56,0,214 -.byte 102,15,56,0,222 - paddd %xmm9,%xmm0 - paddd %xmm9,%xmm1 - paddd %xmm9,%xmm2 - movdqa %xmm0,0(%rsp) - psubd %xmm9,%xmm0 - movdqa %xmm1,16(%rsp) - psubd %xmm9,%xmm1 - movdqa %xmm2,32(%rsp) - psubd %xmm9,%xmm2 - movups (%r15),%xmm13 - movups 16(%r15),%xmm14 + paddd %xmm13,%xmm4 +.byte 102,15,56,0,251 + paddd %xmm13,%xmm5 + paddd %xmm13,%xmm6 + movdqa %xmm4,0(%rsp) + psubd %xmm13,%xmm4 + movdqa %xmm5,16(%rsp) + psubd %xmm13,%xmm5 + movdqa %xmm6,32(%rsp) + psubd %xmm13,%xmm6 + movups -112(%r15),%xmm15 + movups 16-112(%r15),%xmm0 jmp .Loop_ssse3 -.align 16 +.align 32 .Loop_ssse3: - movdqa %xmm1,%xmm4 - addl 0(%rsp),%ebp - movups 0(%r12),%xmm12 - xorps %xmm13,%xmm12 - xorps %xmm12,%xmm11 -.byte 102,69,15,56,220,222 - movups 32(%r15),%xmm15 - xorl %edx,%ecx - movdqa %xmm3,%xmm8 -.byte 102,15,58,15,224,8 + rorl $2,%ebx + movups 0(%r12),%xmm14 + xorps %xmm15,%xmm14 + xorps %xmm14,%xmm2 + movups -80(%r15),%xmm1 +.byte 102,15,56,220,208 + pshufd $238,%xmm4,%xmm8 + xorl %edx,%esi + movdqa %xmm7,%xmm12 + paddd %xmm7,%xmm13 movl %eax,%edi + addl 0(%rsp),%ebp + punpcklqdq %xmm5,%xmm8 + xorl %ecx,%ebx roll $5,%eax - paddd %xmm3,%xmm9 - andl %ecx,%esi - xorl %edx,%ecx - psrldq $4,%xmm8 - xorl %edx,%esi - addl %eax,%ebp - pxor %xmm0,%xmm4 - rorl $2,%ebx addl %esi,%ebp - pxor %xmm2,%xmm8 - addl 4(%rsp),%edx - xorl %ecx,%ebx - movl %ebp,%esi - roll $5,%ebp - pxor %xmm8,%xmm4 + psrldq $4,%xmm12 andl %ebx,%edi xorl %ecx,%ebx - movdqa %xmm9,48(%rsp) - xorl %ecx,%edi -.byte 102,69,15,56,220,223 - movups 48(%r15),%xmm14 - addl %ebp,%edx - movdqa %xmm4,%xmm10 - movdqa %xmm4,%xmm8 + pxor %xmm4,%xmm8 + addl %eax,%ebp rorl $7,%eax - addl %edi,%edx - addl 8(%rsp),%ecx + pxor %xmm6,%xmm12 + xorl %ecx,%edi + movl %ebp,%esi + addl 4(%rsp),%edx + pxor %xmm12,%xmm8 xorl %ebx,%eax - pslldq $12,%xmm10 - paddd %xmm4,%xmm4 - movl %edx,%edi - roll $5,%edx + roll $5,%ebp + movdqa %xmm13,48(%rsp) + addl %edi,%edx + movups -64(%r15),%xmm0 +.byte 102,15,56,220,209 andl %eax,%esi + movdqa %xmm8,%xmm3 xorl %ebx,%eax - psrld $31,%xmm8 - xorl %ebx,%esi - addl %edx,%ecx - movdqa %xmm10,%xmm9 + addl %ebp,%edx rorl $7,%ebp - addl %esi,%ecx - psrld $30,%xmm10 - por %xmm8,%xmm4 - addl 12(%rsp),%ebx + movdqa %xmm8,%xmm12 + xorl %ebx,%esi + pslldq $12,%xmm3 + paddd %xmm8,%xmm8 + movl %edx,%edi + addl 8(%rsp),%ecx + psrld $31,%xmm12 xorl %eax,%ebp - movl %ecx,%esi - roll $5,%ecx -.byte 102,69,15,56,220,222 - movups 64(%r15),%xmm15 - pslld $2,%xmm9 - pxor %xmm10,%xmm4 + roll $5,%edx + addl %esi,%ecx + movdqa %xmm3,%xmm13 andl %ebp,%edi xorl %eax,%ebp - movdqa 0(%r11),%xmm10 - xorl %eax,%edi - addl %ecx,%ebx - pxor %xmm9,%xmm4 + psrld $30,%xmm3 + addl %edx,%ecx rorl $7,%edx - addl %edi,%ebx - movdqa %xmm2,%xmm5 - addl 16(%rsp),%eax + por %xmm12,%xmm8 + xorl %eax,%edi + movl %ecx,%esi + addl 12(%rsp),%ebx + movups -48(%r15),%xmm1 +.byte 102,15,56,220,208 + pslld $2,%xmm13 + pxor %xmm3,%xmm8 xorl %ebp,%edx - movdqa %xmm4,%xmm9 -.byte 102,15,58,15,233,8 - movl %ebx,%edi - roll $5,%ebx - paddd %xmm4,%xmm10 + movdqa 0(%r11),%xmm3 + roll $5,%ecx + addl %edi,%ebx andl %edx,%esi + pxor %xmm13,%xmm8 xorl %ebp,%edx - psrldq $4,%xmm9 - xorl %ebp,%esi - addl %ebx,%eax - pxor %xmm1,%xmm5 + addl %ecx,%ebx rorl $7,%ecx - addl %esi,%eax - pxor %xmm3,%xmm9 - addl 20(%rsp),%ebp -.byte 102,69,15,56,220,223 - movups 80(%r15),%xmm14 + pshufd $238,%xmm5,%xmm9 + xorl %ebp,%esi + movdqa %xmm8,%xmm13 + paddd %xmm8,%xmm3 + movl %ebx,%edi + addl 16(%rsp),%eax + punpcklqdq %xmm6,%xmm9 xorl %edx,%ecx - movl %eax,%esi - roll $5,%eax - pxor %xmm9,%xmm5 + roll $5,%ebx + addl %esi,%eax + psrldq $4,%xmm13 andl %ecx,%edi xorl %edx,%ecx - movdqa %xmm10,0(%rsp) - xorl %edx,%edi - addl %eax,%ebp - movdqa %xmm5,%xmm8 - movdqa %xmm5,%xmm9 + pxor %xmm5,%xmm9 + addl %ebx,%eax rorl $7,%ebx - addl %edi,%ebp - addl 24(%rsp),%edx + movups -32(%r15),%xmm0 +.byte 102,15,56,220,209 + pxor %xmm7,%xmm13 + xorl %edx,%edi + movl %eax,%esi + addl 20(%rsp),%ebp + pxor %xmm13,%xmm9 xorl %ecx,%ebx - pslldq $12,%xmm8 - paddd %xmm5,%xmm5 - movl %ebp,%edi - roll $5,%ebp + roll $5,%eax + movdqa %xmm3,0(%rsp) + addl %edi,%ebp andl %ebx,%esi + movdqa %xmm9,%xmm12 xorl %ecx,%ebx - psrld $31,%xmm9 - xorl %ecx,%esi -.byte 102,69,15,56,220,222 - movups 96(%r15),%xmm15 - addl %ebp,%edx - movdqa %xmm8,%xmm10 + addl %eax,%ebp rorl $7,%eax - addl %esi,%edx - psrld $30,%xmm8 - por %xmm9,%xmm5 - addl 28(%rsp),%ecx + movdqa %xmm9,%xmm13 + xorl %ecx,%esi + pslldq $12,%xmm12 + paddd %xmm9,%xmm9 + movl %ebp,%edi + addl 24(%rsp),%edx + psrld $31,%xmm13 xorl %ebx,%eax - movl %edx,%esi - roll $5,%edx - pslld $2,%xmm10 - pxor %xmm8,%xmm5 + roll $5,%ebp + addl %esi,%edx + movups -16(%r15),%xmm1 +.byte 102,15,56,220,208 + movdqa %xmm12,%xmm3 andl %eax,%edi xorl %ebx,%eax - movdqa 16(%r11),%xmm8 - xorl %ebx,%edi - addl %edx,%ecx - pxor %xmm10,%xmm5 + psrld $30,%xmm12 + addl %ebp,%edx rorl $7,%ebp - addl %edi,%ecx - movdqa %xmm3,%xmm6 - addl 32(%rsp),%ebx + por %xmm13,%xmm9 + xorl %ebx,%edi + movl %edx,%esi + addl 28(%rsp),%ecx + pslld $2,%xmm3 + pxor %xmm12,%xmm9 xorl %eax,%ebp - movdqa %xmm5,%xmm10 -.byte 102,15,58,15,242,8 - movl %ecx,%edi - roll $5,%ecx -.byte 102,69,15,56,220,223 - movups 112(%r15),%xmm14 - paddd %xmm5,%xmm8 + movdqa 16(%r11),%xmm12 + roll $5,%edx + addl %edi,%ecx andl %ebp,%esi + pxor %xmm3,%xmm9 xorl %eax,%ebp - psrldq $4,%xmm10 - xorl %eax,%esi - addl %ecx,%ebx - pxor %xmm2,%xmm6 + addl %edx,%ecx rorl $7,%edx - addl %esi,%ebx - pxor %xmm4,%xmm10 - addl 36(%rsp),%eax + pshufd $238,%xmm6,%xmm10 + xorl %eax,%esi + movdqa %xmm9,%xmm3 + paddd %xmm9,%xmm12 + movl %ecx,%edi + addl 32(%rsp),%ebx + movups 0(%r15),%xmm0 +.byte 102,15,56,220,209 + punpcklqdq %xmm7,%xmm10 xorl %ebp,%edx - movl %ebx,%esi - roll $5,%ebx - pxor %xmm10,%xmm6 + roll $5,%ecx + addl %esi,%ebx + psrldq $4,%xmm3 andl %edx,%edi xorl %ebp,%edx - movdqa %xmm8,16(%rsp) - xorl %ebp,%edi - addl %ebx,%eax - movdqa %xmm6,%xmm9 - movdqa %xmm6,%xmm10 + pxor %xmm6,%xmm10 + addl %ecx,%ebx rorl $7,%ecx - addl %edi,%eax - addl 40(%rsp),%ebp -.byte 102,69,15,56,220,222 - movups 128(%r15),%xmm15 + pxor %xmm8,%xmm3 + xorl %ebp,%edi + movl %ebx,%esi + addl 36(%rsp),%eax + pxor %xmm3,%xmm10 xorl %edx,%ecx - pslldq $12,%xmm9 - paddd %xmm6,%xmm6 - movl %eax,%edi - roll $5,%eax + roll $5,%ebx + movdqa %xmm12,16(%rsp) + addl %edi,%eax andl %ecx,%esi + movdqa %xmm10,%xmm13 xorl %edx,%ecx - psrld $31,%xmm10 - xorl %edx,%esi - addl %eax,%ebp - movdqa %xmm9,%xmm8 + addl %ebx,%eax rorl $7,%ebx - addl %esi,%ebp - psrld $30,%xmm9 - por %xmm10,%xmm6 - addl 44(%rsp),%edx + movups 16(%r15),%xmm1 +.byte 102,15,56,220,208 + movdqa %xmm10,%xmm3 + xorl %edx,%esi + pslldq $12,%xmm13 + paddd %xmm10,%xmm10 + movl %eax,%edi + addl 40(%rsp),%ebp + psrld $31,%xmm3 xorl %ecx,%ebx - movl %ebp,%esi - roll $5,%ebp - pslld $2,%xmm8 - pxor %xmm9,%xmm6 + roll $5,%eax + addl %esi,%ebp + movdqa %xmm13,%xmm12 andl %ebx,%edi xorl %ecx,%ebx - movdqa 16(%r11),%xmm9 - xorl %ecx,%edi -.byte 102,69,15,56,220,223 - movups 144(%r15),%xmm14 - addl %ebp,%edx - pxor %xmm8,%xmm6 + psrld $30,%xmm13 + addl %eax,%ebp rorl $7,%eax - addl %edi,%edx - movdqa %xmm4,%xmm7 - addl 48(%rsp),%ecx + por %xmm3,%xmm10 + xorl %ecx,%edi + movl %ebp,%esi + addl 44(%rsp),%edx + pslld $2,%xmm12 + pxor %xmm13,%xmm10 xorl %ebx,%eax - movdqa %xmm6,%xmm8 -.byte 102,15,58,15,251,8 - movl %edx,%edi - roll $5,%edx - paddd %xmm6,%xmm9 + movdqa 16(%r11),%xmm13 + roll $5,%ebp + addl %edi,%edx + movups 32(%r15),%xmm0 +.byte 102,15,56,220,209 andl %eax,%esi + pxor %xmm12,%xmm10 xorl %ebx,%eax - psrldq $4,%xmm8 - xorl %ebx,%esi - addl %edx,%ecx - pxor %xmm3,%xmm7 + addl %ebp,%edx rorl $7,%ebp - addl %esi,%ecx - pxor %xmm5,%xmm8 - addl 52(%rsp),%ebx + pshufd $238,%xmm7,%xmm11 + xorl %ebx,%esi + movdqa %xmm10,%xmm12 + paddd %xmm10,%xmm13 + movl %edx,%edi + addl 48(%rsp),%ecx + punpcklqdq %xmm8,%xmm11 xorl %eax,%ebp - movl %ecx,%esi - roll $5,%ecx -.byte 102,69,15,56,220,222 - movups 160(%r15),%xmm15 - pxor %xmm8,%xmm7 + roll $5,%edx + addl %esi,%ecx + psrldq $4,%xmm12 andl %ebp,%edi xorl %eax,%ebp - movdqa %xmm9,32(%rsp) - xorl %eax,%edi - addl %ecx,%ebx - movdqa %xmm7,%xmm10 - movdqa %xmm7,%xmm8 + pxor %xmm7,%xmm11 + addl %edx,%ecx rorl $7,%edx - addl %edi,%ebx - addl 56(%rsp),%eax + pxor %xmm9,%xmm12 + xorl %eax,%edi + movl %ecx,%esi + addl 52(%rsp),%ebx + movups 48(%r15),%xmm1 +.byte 102,15,56,220,208 + pxor %xmm12,%xmm11 xorl %ebp,%edx - pslldq $12,%xmm10 - paddd %xmm7,%xmm7 - movl %ebx,%edi - roll $5,%ebx + roll $5,%ecx + movdqa %xmm13,32(%rsp) + addl %edi,%ebx andl %edx,%esi + movdqa %xmm11,%xmm3 xorl %ebp,%edx - psrld $31,%xmm8 - xorl %ebp,%esi - addl %ebx,%eax - movdqa %xmm10,%xmm9 + addl %ecx,%ebx rorl $7,%ecx + movdqa %xmm11,%xmm12 + xorl %ebp,%esi + pslldq $12,%xmm3 + paddd %xmm11,%xmm11 + movl %ebx,%edi + addl 56(%rsp),%eax + psrld $31,%xmm12 + xorl %edx,%ecx + roll $5,%ebx addl %esi,%eax - psrld $30,%xmm10 - por %xmm8,%xmm7 - addl 60(%rsp),%ebp + movdqa %xmm3,%xmm13 + andl %ecx,%edi + xorl %edx,%ecx + psrld $30,%xmm3 + addl %ebx,%eax + rorl $7,%ebx cmpl $11,%r8d jb .Laesenclast1 - movups 176(%r15),%xmm14 -.byte 102,69,15,56,220,223 - movups 192(%r15),%xmm15 -.byte 102,69,15,56,220,222 + movups 64(%r15),%xmm0 +.byte 102,15,56,220,209 + movups 80(%r15),%xmm1 +.byte 102,15,56,220,208 je .Laesenclast1 - movups 208(%r15),%xmm14 -.byte 102,69,15,56,220,223 - movups 224(%r15),%xmm15 -.byte 102,69,15,56,220,222 + movups 96(%r15),%xmm0 +.byte 102,15,56,220,209 + movups 112(%r15),%xmm1 +.byte 102,15,56,220,208 .Laesenclast1: -.byte 102,69,15,56,221,223 - movups 16(%r15),%xmm14 - xorl %edx,%ecx +.byte 102,15,56,221,209 + movups 16-112(%r15),%xmm0 + por %xmm12,%xmm11 + xorl %edx,%edi movl %eax,%esi + addl 60(%rsp),%ebp + pslld $2,%xmm13 + pxor %xmm3,%xmm11 + xorl %ecx,%ebx + movdqa 16(%r11),%xmm3 roll $5,%eax - pslld $2,%xmm9 - pxor %xmm10,%xmm7 - andl %ecx,%edi - xorl %edx,%ecx - movdqa 16(%r11),%xmm10 - xorl %edx,%edi - addl %eax,%ebp - pxor %xmm9,%xmm7 - rorl $7,%ebx addl %edi,%ebp - movdqa %xmm7,%xmm9 - addl 0(%rsp),%edx - pxor %xmm4,%xmm0 -.byte 102,68,15,58,15,206,8 - xorl %ecx,%ebx - movl %ebp,%edi - roll $5,%ebp - pxor %xmm1,%xmm0 andl %ebx,%esi + pxor %xmm13,%xmm11 + pshufd $238,%xmm10,%xmm13 xorl %ecx,%ebx - movdqa %xmm10,%xmm8 - paddd %xmm7,%xmm10 - xorl %ecx,%esi - movups 16(%r12),%xmm12 - xorps %xmm13,%xmm12 - movups %xmm11,0(%r13,%r12,1) - xorps %xmm12,%xmm11 -.byte 102,69,15,56,220,222 - movups 32(%r15),%xmm15 - addl %ebp,%edx - pxor %xmm9,%xmm0 + addl %eax,%ebp rorl $7,%eax - addl %esi,%edx - addl 4(%rsp),%ecx + pxor %xmm8,%xmm4 + xorl %ecx,%esi + movl %ebp,%edi + addl 0(%rsp),%edx + punpcklqdq %xmm11,%xmm13 xorl %ebx,%eax - movdqa %xmm0,%xmm9 - movdqa %xmm10,48(%rsp) - movl %edx,%esi - roll $5,%edx + roll $5,%ebp + pxor %xmm5,%xmm4 + addl %esi,%edx + movups 16(%r12),%xmm14 + xorps %xmm15,%xmm14 + movups %xmm2,0(%r12,%r13,1) + xorps %xmm14,%xmm2 + movups -80(%r15),%xmm1 +.byte 102,15,56,220,208 andl %eax,%edi + movdqa %xmm3,%xmm12 xorl %ebx,%eax - pslld $2,%xmm0 - xorl %ebx,%edi - addl %edx,%ecx - psrld $30,%xmm9 + paddd %xmm11,%xmm3 + addl %ebp,%edx + pxor %xmm13,%xmm4 rorl $7,%ebp - addl %edi,%ecx - addl 8(%rsp),%ebx + xorl %ebx,%edi + movl %edx,%esi + addl 4(%rsp),%ecx + movdqa %xmm4,%xmm13 xorl %eax,%ebp - movl %ecx,%edi - roll $5,%ecx -.byte 102,69,15,56,220,223 - movups 48(%r15),%xmm14 - por %xmm9,%xmm0 + roll $5,%edx + movdqa %xmm3,48(%rsp) + addl %edi,%ecx andl %ebp,%esi xorl %eax,%ebp - movdqa %xmm0,%xmm10 - xorl %eax,%esi - addl %ecx,%ebx + pslld $2,%xmm4 + addl %edx,%ecx rorl $7,%edx - addl %esi,%ebx - addl 12(%rsp),%eax + psrld $30,%xmm13 + xorl %eax,%esi + movl %ecx,%edi + addl 8(%rsp),%ebx + movups -64(%r15),%xmm0 +.byte 102,15,56,220,209 + por %xmm13,%xmm4 xorl %ebp,%edx - movl %ebx,%esi - roll $5,%ebx + roll $5,%ecx + pshufd $238,%xmm11,%xmm3 + addl %esi,%ebx andl %edx,%edi xorl %ebp,%edx + addl %ecx,%ebx + addl 12(%rsp),%eax xorl %ebp,%edi - addl %ebx,%eax - rorl $7,%ecx + movl %ebx,%esi + roll $5,%ebx addl %edi,%eax - addl 16(%rsp),%ebp -.byte 102,69,15,56,220,222 - movups 64(%r15),%xmm15 - pxor %xmm5,%xmm1 -.byte 102,68,15,58,15,215,8 xorl %edx,%esi + rorl $7,%ecx + addl %ebx,%eax + pxor %xmm9,%xmm5 + addl 16(%rsp),%ebp + movups -48(%r15),%xmm1 +.byte 102,15,56,220,208 + xorl %ecx,%esi + punpcklqdq %xmm4,%xmm3 movl %eax,%edi roll $5,%eax - pxor %xmm2,%xmm1 - xorl %ecx,%esi - addl %eax,%ebp - movdqa %xmm8,%xmm9 - paddd %xmm0,%xmm8 - rorl $7,%ebx + pxor %xmm6,%xmm5 addl %esi,%ebp - pxor %xmm10,%xmm1 - addl 20(%rsp),%edx xorl %ecx,%edi + movdqa %xmm12,%xmm13 + rorl $7,%ebx + paddd %xmm4,%xmm12 + addl %eax,%ebp + pxor %xmm3,%xmm5 + addl 20(%rsp),%edx + xorl %ebx,%edi movl %ebp,%esi roll $5,%ebp - movdqa %xmm1,%xmm10 - movdqa %xmm8,0(%rsp) - xorl %ebx,%edi - addl %ebp,%edx - rorl $7,%eax + movdqa %xmm5,%xmm3 addl %edi,%edx - pslld $2,%xmm1 - addl 24(%rsp),%ecx xorl %ebx,%esi - psrld $30,%xmm10 + movdqa %xmm12,0(%rsp) + rorl $7,%eax + addl %ebp,%edx + addl 24(%rsp),%ecx + pslld $2,%xmm5 + xorl %eax,%esi movl %edx,%edi + psrld $30,%xmm3 roll $5,%edx - xorl %eax,%esi -.byte 102,69,15,56,220,223 - movups 80(%r15),%xmm14 - addl %edx,%ecx - rorl $7,%ebp addl %esi,%ecx - por %xmm10,%xmm1 - addl 28(%rsp),%ebx + movups -32(%r15),%xmm0 +.byte 102,15,56,220,209 xorl %eax,%edi - movdqa %xmm1,%xmm8 + rorl $7,%ebp + por %xmm3,%xmm5 + addl %edx,%ecx + addl 28(%rsp),%ebx + pshufd $238,%xmm4,%xmm12 + xorl %ebp,%edi movl %ecx,%esi roll $5,%ecx - xorl %ebp,%edi - addl %ecx,%ebx - rorl $7,%edx addl %edi,%ebx - addl 32(%rsp),%eax - pxor %xmm6,%xmm2 -.byte 102,68,15,58,15,192,8 xorl %ebp,%esi + rorl $7,%edx + addl %ecx,%ebx + pxor %xmm10,%xmm6 + addl 32(%rsp),%eax + xorl %edx,%esi + punpcklqdq %xmm5,%xmm12 movl %ebx,%edi roll $5,%ebx - pxor %xmm3,%xmm2 - xorl %edx,%esi - addl %ebx,%eax - movdqa 32(%r11),%xmm10 - paddd %xmm1,%xmm9 - rorl $7,%ecx + pxor %xmm7,%xmm6 addl %esi,%eax - pxor %xmm8,%xmm2 - addl 36(%rsp),%ebp -.byte 102,69,15,56,220,222 - movups 96(%r15),%xmm15 xorl %edx,%edi + movdqa 32(%r11),%xmm3 + rorl $7,%ecx + paddd %xmm5,%xmm13 + addl %ebx,%eax + pxor %xmm12,%xmm6 + addl 36(%rsp),%ebp + movups -16(%r15),%xmm1 +.byte 102,15,56,220,208 + xorl %ecx,%edi movl %eax,%esi roll $5,%eax - movdqa %xmm2,%xmm8 - movdqa %xmm9,16(%rsp) - xorl %ecx,%edi - addl %eax,%ebp - rorl $7,%ebx + movdqa %xmm6,%xmm12 addl %edi,%ebp - pslld $2,%xmm2 - addl 40(%rsp),%edx xorl %ecx,%esi - psrld $30,%xmm8 + movdqa %xmm13,16(%rsp) + rorl $7,%ebx + addl %eax,%ebp + addl 40(%rsp),%edx + pslld $2,%xmm6 + xorl %ebx,%esi movl %ebp,%edi + psrld $30,%xmm12 roll $5,%ebp - xorl %ebx,%esi - addl %ebp,%edx - rorl $7,%eax addl %esi,%edx - por %xmm8,%xmm2 - addl 44(%rsp),%ecx xorl %ebx,%edi - movdqa %xmm2,%xmm9 + rorl $7,%eax + por %xmm12,%xmm6 + addl %ebp,%edx + addl 44(%rsp),%ecx + pshufd $238,%xmm5,%xmm13 + xorl %eax,%edi movl %edx,%esi roll $5,%edx - xorl %eax,%edi -.byte 102,69,15,56,220,223 - movups 112(%r15),%xmm14 - addl %edx,%ecx - rorl $7,%ebp addl %edi,%ecx - addl 48(%rsp),%ebx - pxor %xmm7,%xmm3 -.byte 102,68,15,58,15,201,8 + movups 0(%r15),%xmm0 +.byte 102,15,56,220,209 xorl %eax,%esi + rorl $7,%ebp + addl %edx,%ecx + pxor %xmm11,%xmm7 + addl 48(%rsp),%ebx + xorl %ebp,%esi + punpcklqdq %xmm6,%xmm13 movl %ecx,%edi roll $5,%ecx - pxor %xmm4,%xmm3 - xorl %ebp,%esi - addl %ecx,%ebx - movdqa %xmm10,%xmm8 - paddd %xmm2,%xmm10 - rorl $7,%edx + pxor %xmm8,%xmm7 addl %esi,%ebx - pxor %xmm9,%xmm3 - addl 52(%rsp),%eax xorl %ebp,%edi + movdqa %xmm3,%xmm12 + rorl $7,%edx + paddd %xmm6,%xmm3 + addl %ecx,%ebx + pxor %xmm13,%xmm7 + addl 52(%rsp),%eax + xorl %edx,%edi movl %ebx,%esi roll $5,%ebx - movdqa %xmm3,%xmm9 - movdqa %xmm10,32(%rsp) - xorl %edx,%edi - addl %ebx,%eax - rorl $7,%ecx + movdqa %xmm7,%xmm13 addl %edi,%eax - pslld $2,%xmm3 - addl 56(%rsp),%ebp -.byte 102,69,15,56,220,222 - movups 128(%r15),%xmm15 xorl %edx,%esi - psrld $30,%xmm9 + movdqa %xmm3,32(%rsp) + rorl $7,%ecx + addl %ebx,%eax + addl 56(%rsp),%ebp + movups 16(%r15),%xmm1 +.byte 102,15,56,220,208 + pslld $2,%xmm7 + xorl %ecx,%esi movl %eax,%edi + psrld $30,%xmm13 roll $5,%eax - xorl %ecx,%esi - addl %eax,%ebp - rorl $7,%ebx addl %esi,%ebp - por %xmm9,%xmm3 - addl 60(%rsp),%edx xorl %ecx,%edi - movdqa %xmm3,%xmm10 + rorl $7,%ebx + por %xmm13,%xmm7 + addl %eax,%ebp + addl 60(%rsp),%edx + pshufd $238,%xmm6,%xmm3 + xorl %ebx,%edi movl %ebp,%esi roll $5,%ebp - xorl %ebx,%edi - addl %ebp,%edx - rorl $7,%eax addl %edi,%edx - addl 0(%rsp),%ecx - pxor %xmm0,%xmm4 -.byte 102,68,15,58,15,210,8 xorl %ebx,%esi + rorl $7,%eax + addl %ebp,%edx + pxor %xmm4,%xmm8 + addl 0(%rsp),%ecx + xorl %eax,%esi + punpcklqdq %xmm7,%xmm3 movl %edx,%edi roll $5,%edx - pxor %xmm5,%xmm4 - xorl %eax,%esi -.byte 102,69,15,56,220,223 - movups 144(%r15),%xmm14 - addl %edx,%ecx - movdqa %xmm8,%xmm9 - paddd %xmm3,%xmm8 - rorl $7,%ebp + pxor %xmm9,%xmm8 addl %esi,%ecx - pxor %xmm10,%xmm4 - addl 4(%rsp),%ebx + movups 32(%r15),%xmm0 +.byte 102,15,56,220,209 xorl %eax,%edi + movdqa %xmm12,%xmm13 + rorl $7,%ebp + paddd %xmm7,%xmm12 + addl %edx,%ecx + pxor %xmm3,%xmm8 + addl 4(%rsp),%ebx + xorl %ebp,%edi movl %ecx,%esi roll $5,%ecx - movdqa %xmm4,%xmm10 - movdqa %xmm8,48(%rsp) - xorl %ebp,%edi - addl %ecx,%ebx - rorl $7,%edx + movdqa %xmm8,%xmm3 addl %edi,%ebx - pslld $2,%xmm4 - addl 8(%rsp),%eax xorl %ebp,%esi - psrld $30,%xmm10 + movdqa %xmm12,48(%rsp) + rorl $7,%edx + addl %ecx,%ebx + addl 8(%rsp),%eax + pslld $2,%xmm8 + xorl %edx,%esi movl %ebx,%edi + psrld $30,%xmm3 roll $5,%ebx - xorl %edx,%esi - addl %ebx,%eax - rorl $7,%ecx addl %esi,%eax - por %xmm10,%xmm4 - addl 12(%rsp),%ebp -.byte 102,69,15,56,220,222 - movups 160(%r15),%xmm15 xorl %edx,%edi - movdqa %xmm4,%xmm8 + rorl $7,%ecx + por %xmm3,%xmm8 + addl %ebx,%eax + addl 12(%rsp),%ebp + movups 48(%r15),%xmm1 +.byte 102,15,56,220,208 + pshufd $238,%xmm7,%xmm12 + xorl %ecx,%edi movl %eax,%esi roll $5,%eax - xorl %ecx,%edi - addl %eax,%ebp - rorl $7,%ebx addl %edi,%ebp - addl 16(%rsp),%edx - pxor %xmm1,%xmm5 -.byte 102,68,15,58,15,195,8 xorl %ecx,%esi + rorl $7,%ebx + addl %eax,%ebp + pxor %xmm5,%xmm9 + addl 16(%rsp),%edx + xorl %ebx,%esi + punpcklqdq %xmm8,%xmm12 movl %ebp,%edi roll $5,%ebp - pxor %xmm6,%xmm5 - xorl %ebx,%esi - addl %ebp,%edx - movdqa %xmm9,%xmm10 - paddd %xmm4,%xmm9 - rorl $7,%eax + pxor %xmm10,%xmm9 addl %esi,%edx - pxor %xmm8,%xmm5 - addl 20(%rsp),%ecx xorl %ebx,%edi + movdqa %xmm13,%xmm3 + rorl $7,%eax + paddd %xmm8,%xmm13 + addl %ebp,%edx + pxor %xmm12,%xmm9 + addl 20(%rsp),%ecx + xorl %eax,%edi movl %edx,%esi roll $5,%edx - movdqa %xmm5,%xmm8 - movdqa %xmm9,0(%rsp) - xorl %eax,%edi + movdqa %xmm9,%xmm12 + addl %edi,%ecx cmpl $11,%r8d jb .Laesenclast2 - movups 176(%r15),%xmm14 -.byte 102,69,15,56,220,223 - movups 192(%r15),%xmm15 -.byte 102,69,15,56,220,222 + movups 64(%r15),%xmm0 +.byte 102,15,56,220,209 + movups 80(%r15),%xmm1 +.byte 102,15,56,220,208 je .Laesenclast2 - movups 208(%r15),%xmm14 -.byte 102,69,15,56,220,223 - movups 224(%r15),%xmm15 -.byte 102,69,15,56,220,222 + movups 96(%r15),%xmm0 +.byte 102,15,56,220,209 + movups 112(%r15),%xmm1 +.byte 102,15,56,220,208 .Laesenclast2: -.byte 102,69,15,56,221,223 - movups 16(%r15),%xmm14 - addl %edx,%ecx +.byte 102,15,56,221,209 + movups 16-112(%r15),%xmm0 + xorl %eax,%esi + movdqa %xmm13,0(%rsp) rorl $7,%ebp - addl %edi,%ecx - pslld $2,%xmm5 + addl %edx,%ecx addl 24(%rsp),%ebx - xorl %eax,%esi - psrld $30,%xmm8 + pslld $2,%xmm9 + xorl %ebp,%esi movl %ecx,%edi + psrld $30,%xmm12 roll $5,%ecx - xorl %ebp,%esi - addl %ecx,%ebx - rorl $7,%edx addl %esi,%ebx - por %xmm8,%xmm5 - addl 28(%rsp),%eax xorl %ebp,%edi - movdqa %xmm5,%xmm9 + rorl $7,%edx + por %xmm12,%xmm9 + addl %ecx,%ebx + addl 28(%rsp),%eax + pshufd $238,%xmm8,%xmm13 + rorl $7,%ecx movl %ebx,%esi - roll $5,%ebx xorl %edx,%edi - addl %ebx,%eax - rorl $7,%ecx + roll $5,%ebx addl %edi,%eax - movl %ecx,%edi - movups 32(%r12),%xmm12 - xorps %xmm13,%xmm12 - movups %xmm11,16(%r13,%r12,1) - xorps %xmm12,%xmm11 -.byte 102,69,15,56,220,222 - movups 32(%r15),%xmm15 - pxor %xmm2,%xmm6 -.byte 102,68,15,58,15,204,8 + xorl %ecx,%esi xorl %edx,%ecx + addl %ebx,%eax + pxor %xmm6,%xmm10 addl 32(%rsp),%ebp - andl %edx,%edi - pxor %xmm7,%xmm6 + movups 32(%r12),%xmm14 + xorps %xmm15,%xmm14 + movups %xmm2,16(%r13,%r12,1) + xorps %xmm14,%xmm2 + movups -80(%r15),%xmm1 +.byte 102,15,56,220,208 andl %ecx,%esi + xorl %edx,%ecx rorl $7,%ebx - movdqa %xmm10,%xmm8 - paddd %xmm5,%xmm10 - addl %edi,%ebp + punpcklqdq %xmm9,%xmm13 movl %eax,%edi - pxor %xmm9,%xmm6 + xorl %ecx,%esi + pxor %xmm11,%xmm10 roll $5,%eax addl %esi,%ebp - xorl %edx,%ecx - addl %eax,%ebp - movdqa %xmm6,%xmm9 - movdqa %xmm10,16(%rsp) - movl %ebx,%esi + movdqa %xmm3,%xmm12 + xorl %ebx,%edi + paddd %xmm9,%xmm3 xorl %ecx,%ebx + pxor %xmm13,%xmm10 + addl %eax,%ebp addl 36(%rsp),%edx - andl %ecx,%esi - pslld $2,%xmm6 andl %ebx,%edi + xorl %ecx,%ebx rorl $7,%eax - psrld $30,%xmm9 - addl %esi,%edx + movdqa %xmm10,%xmm13 movl %ebp,%esi + xorl %ebx,%edi + movdqa %xmm3,16(%rsp) roll $5,%ebp -.byte 102,69,15,56,220,223 - movups 48(%r15),%xmm14 addl %edi,%edx - xorl %ecx,%ebx - addl %ebp,%edx - por %xmm9,%xmm6 - movl %eax,%edi + movups -64(%r15),%xmm0 +.byte 102,15,56,220,209 + xorl %eax,%esi + pslld $2,%xmm10 xorl %ebx,%eax - movdqa %xmm6,%xmm10 + addl %ebp,%edx + psrld $30,%xmm13 addl 40(%rsp),%ecx - andl %ebx,%edi andl %eax,%esi + xorl %ebx,%eax + por %xmm13,%xmm10 rorl $7,%ebp - addl %edi,%ecx movl %edx,%edi + xorl %eax,%esi roll $5,%edx + pshufd $238,%xmm9,%xmm3 addl %esi,%ecx - xorl %ebx,%eax - addl %edx,%ecx - movl %ebp,%esi + xorl %ebp,%edi xorl %eax,%ebp + addl %edx,%ecx addl 44(%rsp),%ebx - andl %eax,%esi andl %ebp,%edi -.byte 102,69,15,56,220,222 - movups 64(%r15),%xmm15 + xorl %eax,%ebp rorl $7,%edx - addl %esi,%ebx + movups -48(%r15),%xmm1 +.byte 102,15,56,220,208 movl %ecx,%esi + xorl %ebp,%edi roll $5,%ecx addl %edi,%ebx - xorl %eax,%ebp - addl %ecx,%ebx - movl %edx,%edi - pxor %xmm3,%xmm7 -.byte 102,68,15,58,15,213,8 + xorl %edx,%esi xorl %ebp,%edx + addl %ecx,%ebx + pxor %xmm7,%xmm11 addl 48(%rsp),%eax - andl %ebp,%edi - pxor %xmm0,%xmm7 andl %edx,%esi + xorl %ebp,%edx rorl $7,%ecx - movdqa 48(%r11),%xmm9 - paddd %xmm6,%xmm8 - addl %edi,%eax + punpcklqdq %xmm10,%xmm3 movl %ebx,%edi - pxor %xmm10,%xmm7 + xorl %edx,%esi + pxor %xmm4,%xmm11 roll $5,%ebx addl %esi,%eax - xorl %ebp,%edx - addl %ebx,%eax - movdqa %xmm7,%xmm10 - movdqa %xmm8,32(%rsp) - movl %ecx,%esi -.byte 102,69,15,56,220,223 - movups 80(%r15),%xmm14 + movdqa 48(%r11),%xmm13 + xorl %ecx,%edi + paddd %xmm10,%xmm12 xorl %edx,%ecx + pxor %xmm3,%xmm11 + addl %ebx,%eax addl 52(%rsp),%ebp - andl %edx,%esi - pslld $2,%xmm7 + movups -32(%r15),%xmm0 +.byte 102,15,56,220,209 andl %ecx,%edi + xorl %edx,%ecx rorl $7,%ebx - psrld $30,%xmm10 - addl %esi,%ebp + movdqa %xmm11,%xmm3 movl %eax,%esi + xorl %ecx,%edi + movdqa %xmm12,32(%rsp) roll $5,%eax addl %edi,%ebp - xorl %edx,%ecx - addl %eax,%ebp - por %xmm10,%xmm7 - movl %ebx,%edi + xorl %ebx,%esi + pslld $2,%xmm11 xorl %ecx,%ebx - movdqa %xmm7,%xmm8 + addl %eax,%ebp + psrld $30,%xmm3 addl 56(%rsp),%edx - andl %ecx,%edi andl %ebx,%esi + xorl %ecx,%ebx + por %xmm3,%xmm11 rorl $7,%eax - addl %edi,%edx movl %ebp,%edi + xorl %ebx,%esi roll $5,%ebp -.byte 102,69,15,56,220,222 - movups 96(%r15),%xmm15 + pshufd $238,%xmm10,%xmm12 addl %esi,%edx - xorl %ecx,%ebx - addl %ebp,%edx - movl %eax,%esi + movups -16(%r15),%xmm1 +.byte 102,15,56,220,208 + xorl %eax,%edi xorl %ebx,%eax + addl %ebp,%edx addl 60(%rsp),%ecx - andl %ebx,%esi andl %eax,%edi + xorl %ebx,%eax rorl $7,%ebp - addl %esi,%ecx movl %edx,%esi + xorl %eax,%edi roll $5,%edx addl %edi,%ecx - xorl %ebx,%eax - addl %edx,%ecx - movl %ebp,%edi - pxor %xmm4,%xmm0 -.byte 102,68,15,58,15,198,8 + xorl %ebp,%esi xorl %eax,%ebp + addl %edx,%ecx + pxor %xmm8,%xmm4 addl 0(%rsp),%ebx - andl %eax,%edi - pxor %xmm1,%xmm0 andl %ebp,%esi -.byte 102,69,15,56,220,223 - movups 112(%r15),%xmm14 + xorl %eax,%ebp rorl $7,%edx - movdqa %xmm9,%xmm10 - paddd %xmm7,%xmm9 - addl %edi,%ebx + movups 0(%r15),%xmm0 +.byte 102,15,56,220,209 + punpcklqdq %xmm11,%xmm12 movl %ecx,%edi - pxor %xmm8,%xmm0 + xorl %ebp,%esi + pxor %xmm5,%xmm4 roll $5,%ecx addl %esi,%ebx - xorl %eax,%ebp - addl %ecx,%ebx - movdqa %xmm0,%xmm8 - movdqa %xmm9,48(%rsp) - movl %edx,%esi + movdqa %xmm13,%xmm3 + xorl %edx,%edi + paddd %xmm11,%xmm13 xorl %ebp,%edx + pxor %xmm12,%xmm4 + addl %ecx,%ebx addl 4(%rsp),%eax - andl %ebp,%esi - pslld $2,%xmm0 andl %edx,%edi + xorl %ebp,%edx rorl $7,%ecx - psrld $30,%xmm8 - addl %esi,%eax + movdqa %xmm4,%xmm12 movl %ebx,%esi + xorl %edx,%edi + movdqa %xmm13,48(%rsp) roll $5,%ebx addl %edi,%eax - xorl %ebp,%edx - addl %ebx,%eax - por %xmm8,%xmm0 - movl %ecx,%edi -.byte 102,69,15,56,220,222 - movups 128(%r15),%xmm15 + xorl %ecx,%esi + pslld $2,%xmm4 xorl %edx,%ecx - movdqa %xmm0,%xmm9 + addl %ebx,%eax + psrld $30,%xmm12 addl 8(%rsp),%ebp - andl %edx,%edi + movups 16(%r15),%xmm1 +.byte 102,15,56,220,208 andl %ecx,%esi + xorl %edx,%ecx + por %xmm12,%xmm4 rorl $7,%ebx - addl %edi,%ebp movl %eax,%edi + xorl %ecx,%esi roll $5,%eax + pshufd $238,%xmm11,%xmm13 addl %esi,%ebp - xorl %edx,%ecx - addl %eax,%ebp - movl %ebx,%esi + xorl %ebx,%edi xorl %ecx,%ebx + addl %eax,%ebp addl 12(%rsp),%edx - andl %ecx,%esi andl %ebx,%edi + xorl %ecx,%ebx rorl $7,%eax - addl %esi,%edx movl %ebp,%esi + xorl %ebx,%edi roll $5,%ebp -.byte 102,69,15,56,220,223 - movups 144(%r15),%xmm14 addl %edi,%edx - xorl %ecx,%ebx - addl %ebp,%edx - movl %eax,%edi - pxor %xmm5,%xmm1 -.byte 102,68,15,58,15,207,8 + movups 32(%r15),%xmm0 +.byte 102,15,56,220,209 + xorl %eax,%esi xorl %ebx,%eax + addl %ebp,%edx + pxor %xmm9,%xmm5 addl 16(%rsp),%ecx - andl %ebx,%edi - pxor %xmm2,%xmm1 andl %eax,%esi + xorl %ebx,%eax rorl $7,%ebp - movdqa %xmm10,%xmm8 - paddd %xmm0,%xmm10 - addl %edi,%ecx + punpcklqdq %xmm4,%xmm13 movl %edx,%edi - pxor %xmm9,%xmm1 + xorl %eax,%esi + pxor %xmm6,%xmm5 roll $5,%edx addl %esi,%ecx - xorl %ebx,%eax - addl %edx,%ecx - movdqa %xmm1,%xmm9 - movdqa %xmm10,0(%rsp) - movl %ebp,%esi + movdqa %xmm3,%xmm12 + xorl %ebp,%edi + paddd %xmm4,%xmm3 xorl %eax,%ebp + pxor %xmm13,%xmm5 + addl %edx,%ecx addl 20(%rsp),%ebx - andl %eax,%esi - pslld $2,%xmm1 andl %ebp,%edi -.byte 102,69,15,56,220,222 - movups 160(%r15),%xmm15 + xorl %eax,%ebp rorl $7,%edx - psrld $30,%xmm9 - addl %esi,%ebx + movups 48(%r15),%xmm1 +.byte 102,15,56,220,208 + movdqa %xmm5,%xmm13 movl %ecx,%esi + xorl %ebp,%edi + movdqa %xmm3,0(%rsp) roll $5,%ecx addl %edi,%ebx - xorl %eax,%ebp - addl %ecx,%ebx - por %xmm9,%xmm1 - movl %edx,%edi + xorl %edx,%esi + pslld $2,%xmm5 xorl %ebp,%edx - movdqa %xmm1,%xmm10 + addl %ecx,%ebx + psrld $30,%xmm13 addl 24(%rsp),%eax - andl %ebp,%edi andl %edx,%esi + xorl %ebp,%edx + por %xmm13,%xmm5 rorl $7,%ecx - addl %edi,%eax movl %ebx,%edi + xorl %edx,%esi roll $5,%ebx + pshufd $238,%xmm4,%xmm3 addl %esi,%eax - xorl %ebp,%edx + xorl %ecx,%edi + xorl %edx,%ecx addl %ebx,%eax - movl %ecx,%esi + addl 28(%rsp),%ebp cmpl $11,%r8d jb .Laesenclast3 - movups 176(%r15),%xmm14 -.byte 102,69,15,56,220,223 - movups 192(%r15),%xmm15 -.byte 102,69,15,56,220,222 + movups 64(%r15),%xmm0 +.byte 102,15,56,220,209 + movups 80(%r15),%xmm1 +.byte 102,15,56,220,208 je .Laesenclast3 - movups 208(%r15),%xmm14 -.byte 102,69,15,56,220,223 - movups 224(%r15),%xmm15 -.byte 102,69,15,56,220,222 + movups 96(%r15),%xmm0 +.byte 102,15,56,220,209 + movups 112(%r15),%xmm1 +.byte 102,15,56,220,208 .Laesenclast3: -.byte 102,69,15,56,221,223 - movups 16(%r15),%xmm14 - xorl %edx,%ecx - addl 28(%rsp),%ebp - andl %edx,%esi +.byte 102,15,56,221,209 + movups 16-112(%r15),%xmm0 andl %ecx,%edi + xorl %edx,%ecx rorl $7,%ebx - addl %esi,%ebp movl %eax,%esi + xorl %ecx,%edi roll $5,%eax addl %edi,%ebp - xorl %edx,%ecx - addl %eax,%ebp - movl %ebx,%edi - pxor %xmm6,%xmm2 -.byte 102,68,15,58,15,208,8 + xorl %ebx,%esi xorl %ecx,%ebx + addl %eax,%ebp + pxor %xmm10,%xmm6 addl 32(%rsp),%edx - andl %ecx,%edi - pxor %xmm3,%xmm2 andl %ebx,%esi + xorl %ecx,%ebx rorl $7,%eax - movdqa %xmm8,%xmm9 - paddd %xmm1,%xmm8 - addl %edi,%edx + punpcklqdq %xmm5,%xmm3 movl %ebp,%edi - pxor %xmm10,%xmm2 + xorl %ebx,%esi + pxor %xmm7,%xmm6 roll $5,%ebp - movups 48(%r12),%xmm12 - xorps %xmm13,%xmm12 - movups %xmm11,32(%r13,%r12,1) - xorps %xmm12,%xmm11 -.byte 102,69,15,56,220,222 - movups 32(%r15),%xmm15 addl %esi,%edx - xorl %ecx,%ebx - addl %ebp,%edx - movdqa %xmm2,%xmm10 - movdqa %xmm8,16(%rsp) - movl %eax,%esi + movups 48(%r12),%xmm14 + xorps %xmm15,%xmm14 + movups %xmm2,32(%r13,%r12,1) + xorps %xmm14,%xmm2 + movups -80(%r15),%xmm1 +.byte 102,15,56,220,208 + movdqa %xmm12,%xmm13 + xorl %eax,%edi + paddd %xmm5,%xmm12 xorl %ebx,%eax + pxor %xmm3,%xmm6 + addl %ebp,%edx addl 36(%rsp),%ecx - andl %ebx,%esi - pslld $2,%xmm2 andl %eax,%edi + xorl %ebx,%eax rorl $7,%ebp - psrld $30,%xmm10 - addl %esi,%ecx + movdqa %xmm6,%xmm3 movl %edx,%esi + xorl %eax,%edi + movdqa %xmm12,16(%rsp) roll $5,%edx addl %edi,%ecx - xorl %ebx,%eax - addl %edx,%ecx - por %xmm10,%xmm2 - movl %ebp,%edi + xorl %ebp,%esi + pslld $2,%xmm6 xorl %eax,%ebp - movdqa %xmm2,%xmm8 + addl %edx,%ecx + psrld $30,%xmm3 addl 40(%rsp),%ebx - andl %eax,%edi andl %ebp,%esi -.byte 102,69,15,56,220,223 - movups 48(%r15),%xmm14 + xorl %eax,%ebp + por %xmm3,%xmm6 rorl $7,%edx - addl %edi,%ebx + movups -64(%r15),%xmm0 +.byte 102,15,56,220,209 movl %ecx,%edi + xorl %ebp,%esi roll $5,%ecx + pshufd $238,%xmm5,%xmm12 addl %esi,%ebx - xorl %eax,%ebp - addl %ecx,%ebx - movl %edx,%esi + xorl %edx,%edi xorl %ebp,%edx + addl %ecx,%ebx addl 44(%rsp),%eax - andl %ebp,%esi andl %edx,%edi + xorl %ebp,%edx rorl $7,%ecx - addl %esi,%eax movl %ebx,%esi + xorl %edx,%edi roll $5,%ebx addl %edi,%eax - xorl %ebp,%edx + xorl %edx,%esi addl %ebx,%eax + pxor %xmm11,%xmm7 addl 48(%rsp),%ebp -.byte 102,69,15,56,220,222 - movups 64(%r15),%xmm15 - pxor %xmm7,%xmm3 -.byte 102,68,15,58,15,193,8 - xorl %edx,%esi + movups -48(%r15),%xmm1 +.byte 102,15,56,220,208 + xorl %ecx,%esi + punpcklqdq %xmm6,%xmm12 movl %eax,%edi roll $5,%eax - pxor %xmm4,%xmm3 - xorl %ecx,%esi - addl %eax,%ebp - movdqa %xmm9,%xmm10 - paddd %xmm2,%xmm9 - rorl $7,%ebx + pxor %xmm8,%xmm7 addl %esi,%ebp - pxor %xmm8,%xmm3 - addl 52(%rsp),%edx xorl %ecx,%edi + movdqa %xmm13,%xmm3 + rorl $7,%ebx + paddd %xmm6,%xmm13 + addl %eax,%ebp + pxor %xmm12,%xmm7 + addl 52(%rsp),%edx + xorl %ebx,%edi movl %ebp,%esi roll $5,%ebp - movdqa %xmm3,%xmm8 - movdqa %xmm9,32(%rsp) - xorl %ebx,%edi - addl %ebp,%edx - rorl $7,%eax + movdqa %xmm7,%xmm12 addl %edi,%edx - pslld $2,%xmm3 - addl 56(%rsp),%ecx xorl %ebx,%esi - psrld $30,%xmm8 + movdqa %xmm13,32(%rsp) + rorl $7,%eax + addl %ebp,%edx + addl 56(%rsp),%ecx + pslld $2,%xmm7 + xorl %eax,%esi movl %edx,%edi + psrld $30,%xmm12 roll $5,%edx - xorl %eax,%esi -.byte 102,69,15,56,220,223 - movups 80(%r15),%xmm14 - addl %edx,%ecx - rorl $7,%ebp addl %esi,%ecx - por %xmm8,%xmm3 - addl 60(%rsp),%ebx + movups -32(%r15),%xmm0 +.byte 102,15,56,220,209 xorl %eax,%edi + rorl $7,%ebp + por %xmm12,%xmm7 + addl %edx,%ecx + addl 60(%rsp),%ebx + xorl %ebp,%edi movl %ecx,%esi roll $5,%ecx - xorl %ebp,%edi - addl %ecx,%ebx - rorl $7,%edx addl %edi,%ebx - addl 0(%rsp),%eax - paddd %xmm3,%xmm10 xorl %ebp,%esi + rorl $7,%edx + addl %ecx,%ebx + addl 0(%rsp),%eax + xorl %edx,%esi movl %ebx,%edi roll $5,%ebx - xorl %edx,%esi - movdqa %xmm10,48(%rsp) - addl %ebx,%eax - rorl $7,%ecx + paddd %xmm7,%xmm3 addl %esi,%eax - addl 4(%rsp),%ebp -.byte 102,69,15,56,220,222 - movups 96(%r15),%xmm15 xorl %edx,%edi + movdqa %xmm3,48(%rsp) + rorl $7,%ecx + addl %ebx,%eax + addl 4(%rsp),%ebp + movups -16(%r15),%xmm1 +.byte 102,15,56,220,208 + xorl %ecx,%edi movl %eax,%esi roll $5,%eax - xorl %ecx,%edi - addl %eax,%ebp - rorl $7,%ebx addl %edi,%ebp - addl 8(%rsp),%edx xorl %ecx,%esi + rorl $7,%ebx + addl %eax,%ebp + addl 8(%rsp),%edx + xorl %ebx,%esi movl %ebp,%edi roll $5,%ebp - xorl %ebx,%esi - addl %ebp,%edx - rorl $7,%eax addl %esi,%edx - addl 12(%rsp),%ecx xorl %ebx,%edi + rorl $7,%eax + addl %ebp,%edx + addl 12(%rsp),%ecx + xorl %eax,%edi movl %edx,%esi roll $5,%edx - xorl %eax,%edi -.byte 102,69,15,56,220,223 - movups 112(%r15),%xmm14 - addl %edx,%ecx - rorl $7,%ebp addl %edi,%ecx + movups 0(%r15),%xmm0 +.byte 102,15,56,220,209 + xorl %eax,%esi + rorl $7,%ebp + addl %edx,%ecx cmpq %r14,%r10 je .Ldone_ssse3 - movdqa 64(%r11),%xmm6 - movdqa 0(%r11),%xmm9 - movdqu 0(%r10),%xmm0 - movdqu 16(%r10),%xmm1 - movdqu 32(%r10),%xmm2 - movdqu 48(%r10),%xmm3 -.byte 102,15,56,0,198 + movdqa 64(%r11),%xmm3 + movdqa 0(%r11),%xmm13 + movdqu 0(%r10),%xmm4 + movdqu 16(%r10),%xmm5 + movdqu 32(%r10),%xmm6 + movdqu 48(%r10),%xmm7 +.byte 102,15,56,0,227 addq $64,%r10 addl 16(%rsp),%ebx - xorl %eax,%esi -.byte 102,15,56,0,206 + xorl %ebp,%esi movl %ecx,%edi +.byte 102,15,56,0,235 roll $5,%ecx - paddd %xmm9,%xmm0 - xorl %ebp,%esi - addl %ecx,%ebx - rorl $7,%edx addl %esi,%ebx - movdqa %xmm0,0(%rsp) - addl 20(%rsp),%eax xorl %ebp,%edi - psubd %xmm9,%xmm0 + rorl $7,%edx + paddd %xmm13,%xmm4 + addl %ecx,%ebx + addl 20(%rsp),%eax + xorl %edx,%edi movl %ebx,%esi + movdqa %xmm4,0(%rsp) roll $5,%ebx - xorl %edx,%edi - addl %ebx,%eax - rorl $7,%ecx addl %edi,%eax - addl 24(%rsp),%ebp -.byte 102,69,15,56,220,222 - movups 128(%r15),%xmm15 xorl %edx,%esi + rorl $7,%ecx + psubd %xmm13,%xmm4 + addl %ebx,%eax + addl 24(%rsp),%ebp + movups 16(%r15),%xmm1 +.byte 102,15,56,220,208 + xorl %ecx,%esi movl %eax,%edi roll $5,%eax - xorl %ecx,%esi - addl %eax,%ebp - rorl $7,%ebx addl %esi,%ebp - addl 28(%rsp),%edx xorl %ecx,%edi + rorl $7,%ebx + addl %eax,%ebp + addl 28(%rsp),%edx + xorl %ebx,%edi movl %ebp,%esi roll $5,%ebp - xorl %ebx,%edi - addl %ebp,%edx - rorl $7,%eax addl %edi,%edx - addl 32(%rsp),%ecx xorl %ebx,%esi -.byte 102,15,56,0,214 + rorl $7,%eax + addl %ebp,%edx + addl 32(%rsp),%ecx + xorl %eax,%esi movl %edx,%edi +.byte 102,15,56,0,243 roll $5,%edx - paddd %xmm9,%xmm1 - xorl %eax,%esi -.byte 102,69,15,56,220,223 - movups 144(%r15),%xmm14 - addl %edx,%ecx - rorl $7,%ebp addl %esi,%ecx - movdqa %xmm1,16(%rsp) - addl 36(%rsp),%ebx + movups 32(%r15),%xmm0 +.byte 102,15,56,220,209 xorl %eax,%edi - psubd %xmm9,%xmm1 + rorl $7,%ebp + paddd %xmm13,%xmm5 + addl %edx,%ecx + addl 36(%rsp),%ebx + xorl %ebp,%edi movl %ecx,%esi + movdqa %xmm5,16(%rsp) roll $5,%ecx - xorl %ebp,%edi - addl %ecx,%ebx - rorl $7,%edx addl %edi,%ebx - addl 40(%rsp),%eax xorl %ebp,%esi + rorl $7,%edx + psubd %xmm13,%xmm5 + addl %ecx,%ebx + addl 40(%rsp),%eax + xorl %edx,%esi movl %ebx,%edi roll $5,%ebx - xorl %edx,%esi - addl %ebx,%eax - rorl $7,%ecx addl %esi,%eax - addl 44(%rsp),%ebp -.byte 102,69,15,56,220,222 - movups 160(%r15),%xmm15 xorl %edx,%edi + rorl $7,%ecx + addl %ebx,%eax + addl 44(%rsp),%ebp + movups 48(%r15),%xmm1 +.byte 102,15,56,220,208 + xorl %ecx,%edi movl %eax,%esi roll $5,%eax - xorl %ecx,%edi - addl %eax,%ebp - rorl $7,%ebx addl %edi,%ebp - addl 48(%rsp),%edx xorl %ecx,%esi -.byte 102,15,56,0,222 + rorl $7,%ebx + addl %eax,%ebp + addl 48(%rsp),%edx + xorl %ebx,%esi movl %ebp,%edi +.byte 102,15,56,0,251 roll $5,%ebp - paddd %xmm9,%xmm2 - xorl %ebx,%esi - addl %ebp,%edx - rorl $7,%eax addl %esi,%edx - movdqa %xmm2,32(%rsp) - addl 52(%rsp),%ecx xorl %ebx,%edi - psubd %xmm9,%xmm2 + rorl $7,%eax + paddd %xmm13,%xmm6 + addl %ebp,%edx + addl 52(%rsp),%ecx + xorl %eax,%edi movl %edx,%esi + movdqa %xmm6,32(%rsp) roll $5,%edx - xorl %eax,%edi + addl %edi,%ecx cmpl $11,%r8d jb .Laesenclast4 - movups 176(%r15),%xmm14 -.byte 102,69,15,56,220,223 - movups 192(%r15),%xmm15 -.byte 102,69,15,56,220,222 + movups 64(%r15),%xmm0 +.byte 102,15,56,220,209 + movups 80(%r15),%xmm1 +.byte 102,15,56,220,208 je .Laesenclast4 - movups 208(%r15),%xmm14 -.byte 102,69,15,56,220,223 - movups 224(%r15),%xmm15 -.byte 102,69,15,56,220,222 + movups 96(%r15),%xmm0 +.byte 102,15,56,220,209 + movups 112(%r15),%xmm1 +.byte 102,15,56,220,208 .Laesenclast4: -.byte 102,69,15,56,221,223 - movups 16(%r15),%xmm14 - addl %edx,%ecx +.byte 102,15,56,221,209 + movups 16-112(%r15),%xmm0 + xorl %eax,%esi rorl $7,%ebp - addl %edi,%ecx + psubd %xmm13,%xmm6 + addl %edx,%ecx addl 56(%rsp),%ebx - xorl %eax,%esi + xorl %ebp,%esi movl %ecx,%edi roll $5,%ecx - xorl %ebp,%esi - addl %ecx,%ebx - rorl $7,%edx addl %esi,%ebx - addl 60(%rsp),%eax xorl %ebp,%edi + rorl $7,%edx + addl %ecx,%ebx + addl 60(%rsp),%eax + xorl %edx,%edi movl %ebx,%esi roll $5,%ebx - xorl %edx,%edi - addl %ebx,%eax - rorl $7,%ecx addl %edi,%eax - movups %xmm11,48(%r13,%r12,1) + rorl $7,%ecx + addl %ebx,%eax + movups %xmm2,48(%r13,%r12,1) leaq 64(%r12),%r12 addl 0(%r9),%eax @@ -1238,129 +1220,130 @@ aesni_cbc_sha1_enc_ssse3: movl %esi,4(%r9) movl %esi,%ebx movl %ecx,8(%r9) + movl %ecx,%edi movl %edx,12(%r9) + xorl %edx,%edi movl %ebp,16(%r9) + andl %edi,%esi jmp .Loop_ssse3 -.align 16 .Ldone_ssse3: addl 16(%rsp),%ebx - xorl %eax,%esi + xorl %ebp,%esi movl %ecx,%edi roll $5,%ecx - xorl %ebp,%esi - addl %ecx,%ebx - rorl $7,%edx addl %esi,%ebx - addl 20(%rsp),%eax xorl %ebp,%edi + rorl $7,%edx + addl %ecx,%ebx + addl 20(%rsp),%eax + xorl %edx,%edi movl %ebx,%esi roll $5,%ebx - xorl %edx,%edi - addl %ebx,%eax - rorl $7,%ecx addl %edi,%eax - addl 24(%rsp),%ebp -.byte 102,69,15,56,220,222 - movups 128(%r15),%xmm15 xorl %edx,%esi + rorl $7,%ecx + addl %ebx,%eax + addl 24(%rsp),%ebp + movups 16(%r15),%xmm1 +.byte 102,15,56,220,208 + xorl %ecx,%esi movl %eax,%edi roll $5,%eax - xorl %ecx,%esi - addl %eax,%ebp - rorl $7,%ebx addl %esi,%ebp - addl 28(%rsp),%edx xorl %ecx,%edi + rorl $7,%ebx + addl %eax,%ebp + addl 28(%rsp),%edx + xorl %ebx,%edi movl %ebp,%esi roll $5,%ebp - xorl %ebx,%edi - addl %ebp,%edx - rorl $7,%eax addl %edi,%edx - addl 32(%rsp),%ecx xorl %ebx,%esi + rorl $7,%eax + addl %ebp,%edx + addl 32(%rsp),%ecx + xorl %eax,%esi movl %edx,%edi roll $5,%edx - xorl %eax,%esi -.byte 102,69,15,56,220,223 - movups 144(%r15),%xmm14 - addl %edx,%ecx - rorl $7,%ebp addl %esi,%ecx - addl 36(%rsp),%ebx + movups 32(%r15),%xmm0 +.byte 102,15,56,220,209 xorl %eax,%edi + rorl $7,%ebp + addl %edx,%ecx + addl 36(%rsp),%ebx + xorl %ebp,%edi movl %ecx,%esi roll $5,%ecx - xorl %ebp,%edi - addl %ecx,%ebx - rorl $7,%edx addl %edi,%ebx - addl 40(%rsp),%eax xorl %ebp,%esi + rorl $7,%edx + addl %ecx,%ebx + addl 40(%rsp),%eax + xorl %edx,%esi movl %ebx,%edi roll $5,%ebx - xorl %edx,%esi - addl %ebx,%eax - rorl $7,%ecx addl %esi,%eax - addl 44(%rsp),%ebp -.byte 102,69,15,56,220,222 - movups 160(%r15),%xmm15 xorl %edx,%edi + rorl $7,%ecx + addl %ebx,%eax + addl 44(%rsp),%ebp + movups 48(%r15),%xmm1 +.byte 102,15,56,220,208 + xorl %ecx,%edi movl %eax,%esi roll $5,%eax - xorl %ecx,%edi - addl %eax,%ebp - rorl $7,%ebx addl %edi,%ebp - addl 48(%rsp),%edx xorl %ecx,%esi + rorl $7,%ebx + addl %eax,%ebp + addl 48(%rsp),%edx + xorl %ebx,%esi movl %ebp,%edi roll $5,%ebp - xorl %ebx,%esi - addl %ebp,%edx - rorl $7,%eax addl %esi,%edx - addl 52(%rsp),%ecx xorl %ebx,%edi + rorl $7,%eax + addl %ebp,%edx + addl 52(%rsp),%ecx + xorl %eax,%edi movl %edx,%esi roll $5,%edx - xorl %eax,%edi + addl %edi,%ecx cmpl $11,%r8d jb .Laesenclast5 - movups 176(%r15),%xmm14 -.byte 102,69,15,56,220,223 - movups 192(%r15),%xmm15 -.byte 102,69,15,56,220,222 + movups 64(%r15),%xmm0 +.byte 102,15,56,220,209 + movups 80(%r15),%xmm1 +.byte 102,15,56,220,208 je .Laesenclast5 - movups 208(%r15),%xmm14 -.byte 102,69,15,56,220,223 - movups 224(%r15),%xmm15 -.byte 102,69,15,56,220,222 + movups 96(%r15),%xmm0 +.byte 102,15,56,220,209 + movups 112(%r15),%xmm1 +.byte 102,15,56,220,208 .Laesenclast5: -.byte 102,69,15,56,221,223 - movups 16(%r15),%xmm14 - addl %edx,%ecx +.byte 102,15,56,221,209 + movups 16-112(%r15),%xmm0 + xorl %eax,%esi rorl $7,%ebp - addl %edi,%ecx + addl %edx,%ecx addl 56(%rsp),%ebx - xorl %eax,%esi + xorl %ebp,%esi movl %ecx,%edi roll $5,%ecx - xorl %ebp,%esi - addl %ecx,%ebx - rorl $7,%edx addl %esi,%ebx - addl 60(%rsp),%eax xorl %ebp,%edi + rorl $7,%edx + addl %ecx,%ebx + addl 60(%rsp),%eax + xorl %edx,%edi movl %ebx,%esi roll $5,%ebx - xorl %edx,%edi - addl %ebx,%eax - rorl $7,%ecx addl %edi,%eax - movups %xmm11,48(%r13,%r12,1) + rorl $7,%ecx + addl %ebx,%eax + movups %xmm2,48(%r13,%r12,1) movq 88(%rsp),%r8 addl 0(%r9),%eax @@ -1373,7 +1356,7 @@ aesni_cbc_sha1_enc_ssse3: movl %ecx,8(%r9) movl %edx,12(%r9) movl %ebp,16(%r9) - movups %xmm11,(%r8) + movups %xmm2,(%r8) leaq 104(%rsp),%rsi movq 0(%rsi),%r15 movq 8(%rsi),%r14 @@ -1387,11 +1370,313 @@ aesni_cbc_sha1_enc_ssse3: .size aesni_cbc_sha1_enc_ssse3,.-aesni_cbc_sha1_enc_ssse3 .align 64 K_XX_XX: -.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 -.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 -.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc -.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 -.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f +.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 +.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 +.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc +.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 +.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f +.byte 0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0 .byte 65,69,83,78,73,45,67,66,67,43,83,72,65,49,32,115,116,105,116,99,104,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 64 +.type aesni_cbc_sha1_enc_shaext,@function +.align 32 +aesni_cbc_sha1_enc_shaext: + movq 8(%rsp),%r10 + movdqu (%r9),%xmm8 + movd 16(%r9),%xmm9 + movdqa K_XX_XX+80(%rip),%xmm7 + + movl 240(%rcx),%r11d + subq %rdi,%rsi + movups (%rcx),%xmm15 + movups 16(%rcx),%xmm0 + leaq 112(%rcx),%rcx + + pshufd $27,%xmm8,%xmm8 + pshufd $27,%xmm9,%xmm9 + jmp .Loop_shaext + +.align 16 +.Loop_shaext: + movups 0(%rdi),%xmm14 + xorps %xmm15,%xmm14 + xorps %xmm14,%xmm2 + movups -80(%rcx),%xmm1 +.byte 102,15,56,220,208 + movdqu (%r10),%xmm3 + movdqa %xmm9,%xmm12 +.byte 102,15,56,0,223 + movdqu 16(%r10),%xmm4 + movdqa %xmm8,%xmm11 + movups -64(%rcx),%xmm0 +.byte 102,15,56,220,209 +.byte 102,15,56,0,231 + + paddd %xmm3,%xmm9 + movdqu 32(%r10),%xmm5 + leaq 64(%r10),%r10 + pxor %xmm12,%xmm3 + movups -48(%rcx),%xmm1 +.byte 102,15,56,220,208 + pxor %xmm12,%xmm3 + movdqa %xmm8,%xmm10 +.byte 102,15,56,0,239 +.byte 69,15,58,204,193,0 +.byte 68,15,56,200,212 + movups -32(%rcx),%xmm0 +.byte 102,15,56,220,209 +.byte 15,56,201,220 + movdqu -16(%r10),%xmm6 + movdqa %xmm8,%xmm9 +.byte 102,15,56,0,247 + movups -16(%rcx),%xmm1 +.byte 102,15,56,220,208 +.byte 69,15,58,204,194,0 +.byte 68,15,56,200,205 + pxor %xmm5,%xmm3 +.byte 15,56,201,229 + movups 0(%rcx),%xmm0 +.byte 102,15,56,220,209 + movdqa %xmm8,%xmm10 +.byte 69,15,58,204,193,0 +.byte 68,15,56,200,214 + movups 16(%rcx),%xmm1 +.byte 102,15,56,220,208 +.byte 15,56,202,222 + pxor %xmm6,%xmm4 +.byte 15,56,201,238 + movups 32(%rcx),%xmm0 +.byte 102,15,56,220,209 + movdqa %xmm8,%xmm9 +.byte 69,15,58,204,194,0 +.byte 68,15,56,200,203 + movups 48(%rcx),%xmm1 +.byte 102,15,56,220,208 +.byte 15,56,202,227 + pxor %xmm3,%xmm5 +.byte 15,56,201,243 + cmpl $11,%r11d + jb .Laesenclast6 + movups 64(%rcx),%xmm0 +.byte 102,15,56,220,209 + movups 80(%rcx),%xmm1 +.byte 102,15,56,220,208 + je .Laesenclast6 + movups 96(%rcx),%xmm0 +.byte 102,15,56,220,209 + movups 112(%rcx),%xmm1 +.byte 102,15,56,220,208 +.Laesenclast6: +.byte 102,15,56,221,209 + movups 16-112(%rcx),%xmm0 + movdqa %xmm8,%xmm10 +.byte 69,15,58,204,193,0 +.byte 68,15,56,200,212 + movups 16(%rdi),%xmm14 + xorps %xmm15,%xmm14 + movups %xmm2,0(%rsi,%rdi,1) + xorps %xmm14,%xmm2 + movups -80(%rcx),%xmm1 +.byte 102,15,56,220,208 +.byte 15,56,202,236 + pxor %xmm4,%xmm6 +.byte 15,56,201,220 + movups -64(%rcx),%xmm0 +.byte 102,15,56,220,209 + movdqa %xmm8,%xmm9 +.byte 69,15,58,204,194,1 +.byte 68,15,56,200,205 + movups -48(%rcx),%xmm1 +.byte 102,15,56,220,208 +.byte 15,56,202,245 + pxor %xmm5,%xmm3 +.byte 15,56,201,229 + movups -32(%rcx),%xmm0 +.byte 102,15,56,220,209 + movdqa %xmm8,%xmm10 +.byte 69,15,58,204,193,1 +.byte 68,15,56,200,214 + movups -16(%rcx),%xmm1 +.byte 102,15,56,220,208 +.byte 15,56,202,222 + pxor %xmm6,%xmm4 +.byte 15,56,201,238 + movups 0(%rcx),%xmm0 +.byte 102,15,56,220,209 + movdqa %xmm8,%xmm9 +.byte 69,15,58,204,194,1 +.byte 68,15,56,200,203 + movups 16(%rcx),%xmm1 +.byte 102,15,56,220,208 +.byte 15,56,202,227 + pxor %xmm3,%xmm5 +.byte 15,56,201,243 + movups 32(%rcx),%xmm0 +.byte 102,15,56,220,209 + movdqa %xmm8,%xmm10 +.byte 69,15,58,204,193,1 +.byte 68,15,56,200,212 + movups 48(%rcx),%xmm1 +.byte 102,15,56,220,208 +.byte 15,56,202,236 + pxor %xmm4,%xmm6 +.byte 15,56,201,220 + cmpl $11,%r11d + jb .Laesenclast7 + movups 64(%rcx),%xmm0 +.byte 102,15,56,220,209 + movups 80(%rcx),%xmm1 +.byte 102,15,56,220,208 + je .Laesenclast7 + movups 96(%rcx),%xmm0 +.byte 102,15,56,220,209 + movups 112(%rcx),%xmm1 +.byte 102,15,56,220,208 +.Laesenclast7: +.byte 102,15,56,221,209 + movups 16-112(%rcx),%xmm0 + movdqa %xmm8,%xmm9 +.byte 69,15,58,204,194,1 +.byte 68,15,56,200,205 + movups 32(%rdi),%xmm14 + xorps %xmm15,%xmm14 + movups %xmm2,16(%rsi,%rdi,1) + xorps %xmm14,%xmm2 + movups -80(%rcx),%xmm1 +.byte 102,15,56,220,208 +.byte 15,56,202,245 + pxor %xmm5,%xmm3 +.byte 15,56,201,229 + movups -64(%rcx),%xmm0 +.byte 102,15,56,220,209 + movdqa %xmm8,%xmm10 +.byte 69,15,58,204,193,2 +.byte 68,15,56,200,214 + movups -48(%rcx),%xmm1 +.byte 102,15,56,220,208 +.byte 15,56,202,222 + pxor %xmm6,%xmm4 +.byte 15,56,201,238 + movups -32(%rcx),%xmm0 +.byte 102,15,56,220,209 + movdqa %xmm8,%xmm9 +.byte 69,15,58,204,194,2 +.byte 68,15,56,200,203 + movups -16(%rcx),%xmm1 +.byte 102,15,56,220,208 +.byte 15,56,202,227 + pxor %xmm3,%xmm5 +.byte 15,56,201,243 + movups 0(%rcx),%xmm0 +.byte 102,15,56,220,209 + movdqa %xmm8,%xmm10 +.byte 69,15,58,204,193,2 +.byte 68,15,56,200,212 + movups 16(%rcx),%xmm1 +.byte 102,15,56,220,208 +.byte 15,56,202,236 + pxor %xmm4,%xmm6 +.byte 15,56,201,220 + movups 32(%rcx),%xmm0 +.byte 102,15,56,220,209 + movdqa %xmm8,%xmm9 +.byte 69,15,58,204,194,2 +.byte 68,15,56,200,205 + movups 48(%rcx),%xmm1 +.byte 102,15,56,220,208 +.byte 15,56,202,245 + pxor %xmm5,%xmm3 +.byte 15,56,201,229 + cmpl $11,%r11d + jb .Laesenclast8 + movups 64(%rcx),%xmm0 +.byte 102,15,56,220,209 + movups 80(%rcx),%xmm1 +.byte 102,15,56,220,208 + je .Laesenclast8 + movups 96(%rcx),%xmm0 +.byte 102,15,56,220,209 + movups 112(%rcx),%xmm1 +.byte 102,15,56,220,208 +.Laesenclast8: +.byte 102,15,56,221,209 + movups 16-112(%rcx),%xmm0 + movdqa %xmm8,%xmm10 +.byte 69,15,58,204,193,2 +.byte 68,15,56,200,214 + movups 48(%rdi),%xmm14 + xorps %xmm15,%xmm14 + movups %xmm2,32(%rsi,%rdi,1) + xorps %xmm14,%xmm2 + movups -80(%rcx),%xmm1 +.byte 102,15,56,220,208 +.byte 15,56,202,222 + pxor %xmm6,%xmm4 +.byte 15,56,201,238 + movups -64(%rcx),%xmm0 +.byte 102,15,56,220,209 + movdqa %xmm8,%xmm9 +.byte 69,15,58,204,194,3 +.byte 68,15,56,200,203 + movups -48(%rcx),%xmm1 +.byte 102,15,56,220,208 +.byte 15,56,202,227 + pxor %xmm3,%xmm5 +.byte 15,56,201,243 + movups -32(%rcx),%xmm0 +.byte 102,15,56,220,209 + movdqa %xmm8,%xmm10 +.byte 69,15,58,204,193,3 +.byte 68,15,56,200,212 +.byte 15,56,202,236 + pxor %xmm4,%xmm6 + movups -16(%rcx),%xmm1 +.byte 102,15,56,220,208 + movdqa %xmm8,%xmm9 +.byte 69,15,58,204,194,3 +.byte 68,15,56,200,205 +.byte 15,56,202,245 + movups 0(%rcx),%xmm0 +.byte 102,15,56,220,209 + movdqa %xmm12,%xmm5 + movdqa %xmm8,%xmm10 +.byte 69,15,58,204,193,3 +.byte 68,15,56,200,214 + movups 16(%rcx),%xmm1 +.byte 102,15,56,220,208 + movdqa %xmm8,%xmm9 +.byte 69,15,58,204,194,3 +.byte 68,15,56,200,205 + movups 32(%rcx),%xmm0 +.byte 102,15,56,220,209 + movups 48(%rcx),%xmm1 +.byte 102,15,56,220,208 + cmpl $11,%r11d + jb .Laesenclast9 + movups 64(%rcx),%xmm0 +.byte 102,15,56,220,209 + movups 80(%rcx),%xmm1 +.byte 102,15,56,220,208 + je .Laesenclast9 + movups 96(%rcx),%xmm0 +.byte 102,15,56,220,209 + movups 112(%rcx),%xmm1 +.byte 102,15,56,220,208 +.Laesenclast9: +.byte 102,15,56,221,209 + movups 16-112(%rcx),%xmm0 + decq %rdx + + paddd %xmm11,%xmm8 + movups %xmm2,48(%rsi,%rdi,1) + leaq 64(%rdi),%rdi + jnz .Loop_shaext + + pshufd $27,%xmm8,%xmm8 + pshufd $27,%xmm9,%xmm9 + movups %xmm2,(%r8) + movdqu %xmm8,(%r9) + movd %xmm9,16(%r9) + .byte 0xf3,0xc3 +.size aesni_cbc_sha1_enc_shaext,.-aesni_cbc_sha1_enc_shaext diff --git a/secure/lib/libcrypto/amd64/aesni-sha256-x86_64.S b/secure/lib/libcrypto/amd64/aesni-sha256-x86_64.S new file mode 100644 index 0000000..a940892 --- /dev/null +++ b/secure/lib/libcrypto/amd64/aesni-sha256-x86_64.S @@ -0,0 +1,58 @@ + # $FreeBSD$ +.text + + +.globl aesni_cbc_sha256_enc +.type aesni_cbc_sha256_enc,@function +.align 16 +aesni_cbc_sha256_enc: + xorl %eax,%eax + cmpq $0,%rdi + je .Lprobe + ud2 +.Lprobe: + .byte 0xf3,0xc3 +.size aesni_cbc_sha256_enc,.-aesni_cbc_sha256_enc + +.align 64 +.type K256,@object +K256: +.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 +.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + +.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f +.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f +.long 0,0,0,0, 0,0,0,0, -1,-1,-1,-1 +.long 0,0,0,0, 0,0,0,0 +.byte 65,69,83,78,73,45,67,66,67,43,83,72,65,50,53,54,32,115,116,105,116,99,104,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 64 diff --git a/secure/lib/libcrypto/amd64/aesni-x86_64.S b/secure/lib/libcrypto/amd64/aesni-x86_64.S index df677a7..082a306 100644 --- a/secure/lib/libcrypto/amd64/aesni-x86_64.S +++ b/secure/lib/libcrypto/amd64/aesni-x86_64.S @@ -1,5 +1,6 @@ # $FreeBSD$ .text + .globl aesni_encrypt .type aesni_encrypt,@function .align 16 @@ -15,9 +16,12 @@ aesni_encrypt: decl %eax movups (%rdx),%xmm1 leaq 16(%rdx),%rdx - jnz .Loop_enc1_1 + jnz .Loop_enc1_1 .byte 102,15,56,221,209 + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 movups %xmm2,(%rsi) + pxor %xmm2,%xmm2 .byte 0xf3,0xc3 .size aesni_encrypt,.-aesni_encrypt @@ -36,34 +40,96 @@ aesni_decrypt: decl %eax movups (%rdx),%xmm1 leaq 16(%rdx),%rdx - jnz .Loop_dec1_2 + jnz .Loop_dec1_2 .byte 102,15,56,223,209 + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 movups %xmm2,(%rsi) + pxor %xmm2,%xmm2 .byte 0xf3,0xc3 .size aesni_decrypt, .-aesni_decrypt +.type _aesni_encrypt2,@function +.align 16 +_aesni_encrypt2: + movups (%rcx),%xmm0 + shll $4,%eax + movups 16(%rcx),%xmm1 + xorps %xmm0,%xmm2 + xorps %xmm0,%xmm3 + movups 32(%rcx),%xmm0 + leaq 32(%rcx,%rax,1),%rcx + negq %rax + addq $16,%rax + +.Lenc_loop2: +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 + movups -16(%rcx,%rax,1),%xmm0 + jnz .Lenc_loop2 + +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,221,208 +.byte 102,15,56,221,216 + .byte 0xf3,0xc3 +.size _aesni_encrypt2,.-_aesni_encrypt2 +.type _aesni_decrypt2,@function +.align 16 +_aesni_decrypt2: + movups (%rcx),%xmm0 + shll $4,%eax + movups 16(%rcx),%xmm1 + xorps %xmm0,%xmm2 + xorps %xmm0,%xmm3 + movups 32(%rcx),%xmm0 + leaq 32(%rcx,%rax,1),%rcx + negq %rax + addq $16,%rax + +.Ldec_loop2: +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 + movups -16(%rcx,%rax,1),%xmm0 + jnz .Ldec_loop2 + +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,223,208 +.byte 102,15,56,223,216 + .byte 0xf3,0xc3 +.size _aesni_decrypt2,.-_aesni_decrypt2 .type _aesni_encrypt3,@function .align 16 _aesni_encrypt3: movups (%rcx),%xmm0 - shrl $1,%eax + shll $4,%eax movups 16(%rcx),%xmm1 - leaq 32(%rcx),%rcx xorps %xmm0,%xmm2 xorps %xmm0,%xmm3 xorps %xmm0,%xmm4 - movups (%rcx),%xmm0 + movups 32(%rcx),%xmm0 + leaq 32(%rcx,%rax,1),%rcx + negq %rax + addq $16,%rax .Lenc_loop3: .byte 102,15,56,220,209 .byte 102,15,56,220,217 - decl %eax .byte 102,15,56,220,225 - movups 16(%rcx),%xmm1 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax .byte 102,15,56,220,208 .byte 102,15,56,220,216 - leaq 32(%rcx),%rcx .byte 102,15,56,220,224 - movups (%rcx),%xmm0 + movups -16(%rcx,%rax,1),%xmm0 jnz .Lenc_loop3 .byte 102,15,56,220,209 @@ -78,25 +144,26 @@ _aesni_encrypt3: .align 16 _aesni_decrypt3: movups (%rcx),%xmm0 - shrl $1,%eax + shll $4,%eax movups 16(%rcx),%xmm1 - leaq 32(%rcx),%rcx xorps %xmm0,%xmm2 xorps %xmm0,%xmm3 xorps %xmm0,%xmm4 - movups (%rcx),%xmm0 + movups 32(%rcx),%xmm0 + leaq 32(%rcx,%rax,1),%rcx + negq %rax + addq $16,%rax .Ldec_loop3: .byte 102,15,56,222,209 .byte 102,15,56,222,217 - decl %eax .byte 102,15,56,222,225 - movups 16(%rcx),%xmm1 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax .byte 102,15,56,222,208 .byte 102,15,56,222,216 - leaq 32(%rcx),%rcx .byte 102,15,56,222,224 - movups (%rcx),%xmm0 + movups -16(%rcx,%rax,1),%xmm0 jnz .Ldec_loop3 .byte 102,15,56,222,209 @@ -111,28 +178,30 @@ _aesni_decrypt3: .align 16 _aesni_encrypt4: movups (%rcx),%xmm0 - shrl $1,%eax + shll $4,%eax movups 16(%rcx),%xmm1 - leaq 32(%rcx),%rcx xorps %xmm0,%xmm2 xorps %xmm0,%xmm3 xorps %xmm0,%xmm4 xorps %xmm0,%xmm5 - movups (%rcx),%xmm0 + movups 32(%rcx),%xmm0 + leaq 32(%rcx,%rax,1),%rcx + negq %rax +.byte 0x0f,0x1f,0x00 + addq $16,%rax .Lenc_loop4: .byte 102,15,56,220,209 .byte 102,15,56,220,217 - decl %eax .byte 102,15,56,220,225 .byte 102,15,56,220,233 - movups 16(%rcx),%xmm1 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax .byte 102,15,56,220,208 .byte 102,15,56,220,216 - leaq 32(%rcx),%rcx .byte 102,15,56,220,224 .byte 102,15,56,220,232 - movups (%rcx),%xmm0 + movups -16(%rcx,%rax,1),%xmm0 jnz .Lenc_loop4 .byte 102,15,56,220,209 @@ -149,28 +218,30 @@ _aesni_encrypt4: .align 16 _aesni_decrypt4: movups (%rcx),%xmm0 - shrl $1,%eax + shll $4,%eax movups 16(%rcx),%xmm1 - leaq 32(%rcx),%rcx xorps %xmm0,%xmm2 xorps %xmm0,%xmm3 xorps %xmm0,%xmm4 xorps %xmm0,%xmm5 - movups (%rcx),%xmm0 + movups 32(%rcx),%xmm0 + leaq 32(%rcx,%rax,1),%rcx + negq %rax +.byte 0x0f,0x1f,0x00 + addq $16,%rax .Ldec_loop4: .byte 102,15,56,222,209 .byte 102,15,56,222,217 - decl %eax .byte 102,15,56,222,225 .byte 102,15,56,222,233 - movups 16(%rcx),%xmm1 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax .byte 102,15,56,222,208 .byte 102,15,56,222,216 - leaq 32(%rcx),%rcx .byte 102,15,56,222,224 .byte 102,15,56,222,232 - movups (%rcx),%xmm0 + movups -16(%rcx,%rax,1),%xmm0 jnz .Ldec_loop4 .byte 102,15,56,222,209 @@ -187,43 +258,40 @@ _aesni_decrypt4: .align 16 _aesni_encrypt6: movups (%rcx),%xmm0 - shrl $1,%eax + shll $4,%eax movups 16(%rcx),%xmm1 - leaq 32(%rcx),%rcx xorps %xmm0,%xmm2 pxor %xmm0,%xmm3 -.byte 102,15,56,220,209 pxor %xmm0,%xmm4 +.byte 102,15,56,220,209 + leaq 32(%rcx,%rax,1),%rcx + negq %rax .byte 102,15,56,220,217 pxor %xmm0,%xmm5 -.byte 102,15,56,220,225 pxor %xmm0,%xmm6 -.byte 102,15,56,220,233 +.byte 102,15,56,220,225 pxor %xmm0,%xmm7 - decl %eax -.byte 102,15,56,220,241 - movups (%rcx),%xmm0 -.byte 102,15,56,220,249 + movups (%rcx,%rax,1),%xmm0 + addq $16,%rax jmp .Lenc_loop6_enter .align 16 .Lenc_loop6: .byte 102,15,56,220,209 .byte 102,15,56,220,217 - decl %eax .byte 102,15,56,220,225 +.Lenc_loop6_enter: .byte 102,15,56,220,233 .byte 102,15,56,220,241 .byte 102,15,56,220,249 -.Lenc_loop6_enter: - movups 16(%rcx),%xmm1 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax .byte 102,15,56,220,208 .byte 102,15,56,220,216 - leaq 32(%rcx),%rcx .byte 102,15,56,220,224 .byte 102,15,56,220,232 .byte 102,15,56,220,240 .byte 102,15,56,220,248 - movups (%rcx),%xmm0 + movups -16(%rcx,%rax,1),%xmm0 jnz .Lenc_loop6 .byte 102,15,56,220,209 @@ -244,43 +312,40 @@ _aesni_encrypt6: .align 16 _aesni_decrypt6: movups (%rcx),%xmm0 - shrl $1,%eax + shll $4,%eax movups 16(%rcx),%xmm1 - leaq 32(%rcx),%rcx xorps %xmm0,%xmm2 pxor %xmm0,%xmm3 -.byte 102,15,56,222,209 pxor %xmm0,%xmm4 +.byte 102,15,56,222,209 + leaq 32(%rcx,%rax,1),%rcx + negq %rax .byte 102,15,56,222,217 pxor %xmm0,%xmm5 -.byte 102,15,56,222,225 pxor %xmm0,%xmm6 -.byte 102,15,56,222,233 +.byte 102,15,56,222,225 pxor %xmm0,%xmm7 - decl %eax -.byte 102,15,56,222,241 - movups (%rcx),%xmm0 -.byte 102,15,56,222,249 + movups (%rcx,%rax,1),%xmm0 + addq $16,%rax jmp .Ldec_loop6_enter .align 16 .Ldec_loop6: .byte 102,15,56,222,209 .byte 102,15,56,222,217 - decl %eax .byte 102,15,56,222,225 +.Ldec_loop6_enter: .byte 102,15,56,222,233 .byte 102,15,56,222,241 .byte 102,15,56,222,249 -.Ldec_loop6_enter: - movups 16(%rcx),%xmm1 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax .byte 102,15,56,222,208 .byte 102,15,56,222,216 - leaq 32(%rcx),%rcx .byte 102,15,56,222,224 .byte 102,15,56,222,232 .byte 102,15,56,222,240 .byte 102,15,56,222,248 - movups (%rcx),%xmm0 + movups -16(%rcx,%rax,1),%xmm0 jnz .Ldec_loop6 .byte 102,15,56,222,209 @@ -301,52 +366,46 @@ _aesni_decrypt6: .align 16 _aesni_encrypt8: movups (%rcx),%xmm0 - shrl $1,%eax + shll $4,%eax movups 16(%rcx),%xmm1 - leaq 32(%rcx),%rcx xorps %xmm0,%xmm2 xorps %xmm0,%xmm3 -.byte 102,15,56,220,209 pxor %xmm0,%xmm4 -.byte 102,15,56,220,217 pxor %xmm0,%xmm5 -.byte 102,15,56,220,225 pxor %xmm0,%xmm6 -.byte 102,15,56,220,233 + leaq 32(%rcx,%rax,1),%rcx + negq %rax +.byte 102,15,56,220,209 pxor %xmm0,%xmm7 - decl %eax -.byte 102,15,56,220,241 pxor %xmm0,%xmm8 -.byte 102,15,56,220,249 +.byte 102,15,56,220,217 pxor %xmm0,%xmm9 - movups (%rcx),%xmm0 -.byte 102,68,15,56,220,193 -.byte 102,68,15,56,220,201 - movups 16(%rcx),%xmm1 - jmp .Lenc_loop8_enter + movups (%rcx,%rax,1),%xmm0 + addq $16,%rax + jmp .Lenc_loop8_inner .align 16 .Lenc_loop8: .byte 102,15,56,220,209 .byte 102,15,56,220,217 - decl %eax +.Lenc_loop8_inner: .byte 102,15,56,220,225 .byte 102,15,56,220,233 .byte 102,15,56,220,241 .byte 102,15,56,220,249 .byte 102,68,15,56,220,193 .byte 102,68,15,56,220,201 - movups 16(%rcx),%xmm1 .Lenc_loop8_enter: + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax .byte 102,15,56,220,208 .byte 102,15,56,220,216 - leaq 32(%rcx),%rcx .byte 102,15,56,220,224 .byte 102,15,56,220,232 .byte 102,15,56,220,240 .byte 102,15,56,220,248 .byte 102,68,15,56,220,192 .byte 102,68,15,56,220,200 - movups (%rcx),%xmm0 + movups -16(%rcx,%rax,1),%xmm0 jnz .Lenc_loop8 .byte 102,15,56,220,209 @@ -371,52 +430,46 @@ _aesni_encrypt8: .align 16 _aesni_decrypt8: movups (%rcx),%xmm0 - shrl $1,%eax + shll $4,%eax movups 16(%rcx),%xmm1 - leaq 32(%rcx),%rcx xorps %xmm0,%xmm2 xorps %xmm0,%xmm3 -.byte 102,15,56,222,209 pxor %xmm0,%xmm4 -.byte 102,15,56,222,217 pxor %xmm0,%xmm5 -.byte 102,15,56,222,225 pxor %xmm0,%xmm6 -.byte 102,15,56,222,233 + leaq 32(%rcx,%rax,1),%rcx + negq %rax +.byte 102,15,56,222,209 pxor %xmm0,%xmm7 - decl %eax -.byte 102,15,56,222,241 pxor %xmm0,%xmm8 -.byte 102,15,56,222,249 +.byte 102,15,56,222,217 pxor %xmm0,%xmm9 - movups (%rcx),%xmm0 -.byte 102,68,15,56,222,193 -.byte 102,68,15,56,222,201 - movups 16(%rcx),%xmm1 - jmp .Ldec_loop8_enter + movups (%rcx,%rax,1),%xmm0 + addq $16,%rax + jmp .Ldec_loop8_inner .align 16 .Ldec_loop8: .byte 102,15,56,222,209 .byte 102,15,56,222,217 - decl %eax +.Ldec_loop8_inner: .byte 102,15,56,222,225 .byte 102,15,56,222,233 .byte 102,15,56,222,241 .byte 102,15,56,222,249 .byte 102,68,15,56,222,193 .byte 102,68,15,56,222,201 - movups 16(%rcx),%xmm1 .Ldec_loop8_enter: + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax .byte 102,15,56,222,208 .byte 102,15,56,222,216 - leaq 32(%rcx),%rcx .byte 102,15,56,222,224 .byte 102,15,56,222,232 .byte 102,15,56,222,240 .byte 102,15,56,222,248 .byte 102,68,15,56,222,192 .byte 102,68,15,56,222,200 - movups (%rcx),%xmm0 + movups -16(%rcx,%rax,1),%xmm0 jnz .Ldec_loop8 .byte 102,15,56,222,209 @@ -525,6 +578,7 @@ aesni_ecb_encrypt: movups 80(%rdi),%xmm7 je .Lecb_enc_six movdqu 96(%rdi),%xmm8 + xorps %xmm9,%xmm9 call _aesni_encrypt8 movups %xmm2,(%rsi) movups %xmm3,16(%rsi) @@ -545,14 +599,13 @@ aesni_ecb_encrypt: decl %eax movups (%rcx),%xmm1 leaq 16(%rcx),%rcx - jnz .Loop_enc1_3 + jnz .Loop_enc1_3 .byte 102,15,56,221,209 movups %xmm2,(%rsi) jmp .Lecb_ret .align 16 .Lecb_enc_two: - xorps %xmm4,%xmm4 - call _aesni_encrypt3 + call _aesni_encrypt2 movups %xmm2,(%rsi) movups %xmm3,16(%rsi) jmp .Lecb_ret @@ -639,15 +692,23 @@ aesni_ecb_encrypt: jnc .Lecb_dec_loop8 movups %xmm2,(%rsi) + pxor %xmm2,%xmm2 movq %r11,%rcx movups %xmm3,16(%rsi) + pxor %xmm3,%xmm3 movl %r10d,%eax movups %xmm4,32(%rsi) + pxor %xmm4,%xmm4 movups %xmm5,48(%rsi) + pxor %xmm5,%xmm5 movups %xmm6,64(%rsi) + pxor %xmm6,%xmm6 movups %xmm7,80(%rsi) + pxor %xmm7,%xmm7 movups %xmm8,96(%rsi) + pxor %xmm8,%xmm8 movups %xmm9,112(%rsi) + pxor %xmm9,%xmm9 leaq 128(%rsi),%rsi addq $128,%rdx jz .Lecb_ret @@ -670,14 +731,23 @@ aesni_ecb_encrypt: je .Lecb_dec_six movups 96(%rdi),%xmm8 movups (%rcx),%xmm0 + xorps %xmm9,%xmm9 call _aesni_decrypt8 movups %xmm2,(%rsi) + pxor %xmm2,%xmm2 movups %xmm3,16(%rsi) + pxor %xmm3,%xmm3 movups %xmm4,32(%rsi) + pxor %xmm4,%xmm4 movups %xmm5,48(%rsi) + pxor %xmm5,%xmm5 movups %xmm6,64(%rsi) + pxor %xmm6,%xmm6 movups %xmm7,80(%rsi) + pxor %xmm7,%xmm7 movups %xmm8,96(%rsi) + pxor %xmm8,%xmm8 + pxor %xmm9,%xmm9 jmp .Lecb_ret .align 16 .Lecb_dec_one: @@ -690,53 +760,76 @@ aesni_ecb_encrypt: decl %eax movups (%rcx),%xmm1 leaq 16(%rcx),%rcx - jnz .Loop_dec1_4 + jnz .Loop_dec1_4 .byte 102,15,56,223,209 movups %xmm2,(%rsi) + pxor %xmm2,%xmm2 jmp .Lecb_ret .align 16 .Lecb_dec_two: - xorps %xmm4,%xmm4 - call _aesni_decrypt3 + call _aesni_decrypt2 movups %xmm2,(%rsi) + pxor %xmm2,%xmm2 movups %xmm3,16(%rsi) + pxor %xmm3,%xmm3 jmp .Lecb_ret .align 16 .Lecb_dec_three: call _aesni_decrypt3 movups %xmm2,(%rsi) + pxor %xmm2,%xmm2 movups %xmm3,16(%rsi) + pxor %xmm3,%xmm3 movups %xmm4,32(%rsi) + pxor %xmm4,%xmm4 jmp .Lecb_ret .align 16 .Lecb_dec_four: call _aesni_decrypt4 movups %xmm2,(%rsi) + pxor %xmm2,%xmm2 movups %xmm3,16(%rsi) + pxor %xmm3,%xmm3 movups %xmm4,32(%rsi) + pxor %xmm4,%xmm4 movups %xmm5,48(%rsi) + pxor %xmm5,%xmm5 jmp .Lecb_ret .align 16 .Lecb_dec_five: xorps %xmm7,%xmm7 call _aesni_decrypt6 movups %xmm2,(%rsi) + pxor %xmm2,%xmm2 movups %xmm3,16(%rsi) + pxor %xmm3,%xmm3 movups %xmm4,32(%rsi) + pxor %xmm4,%xmm4 movups %xmm5,48(%rsi) + pxor %xmm5,%xmm5 movups %xmm6,64(%rsi) + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 jmp .Lecb_ret .align 16 .Lecb_dec_six: call _aesni_decrypt6 movups %xmm2,(%rsi) + pxor %xmm2,%xmm2 movups %xmm3,16(%rsi) + pxor %xmm3,%xmm3 movups %xmm4,32(%rsi) + pxor %xmm4,%xmm4 movups %xmm5,48(%rsi) + pxor %xmm5,%xmm5 movups %xmm6,64(%rsi) + pxor %xmm6,%xmm6 movups %xmm7,80(%rsi) + pxor %xmm7,%xmm7 .Lecb_ret: + xorps %xmm0,%xmm0 + pxor %xmm1,%xmm1 .byte 0xf3,0xc3 .size aesni_ecb_encrypt,.-aesni_ecb_encrypt .globl aesni_ccm64_encrypt_blocks @@ -744,56 +837,62 @@ aesni_ecb_encrypt: .align 16 aesni_ccm64_encrypt_blocks: movl 240(%rcx),%eax - movdqu (%r8),%xmm9 - movdqa .Lincrement64(%rip),%xmm6 + movdqu (%r8),%xmm6 + movdqa .Lincrement64(%rip),%xmm9 movdqa .Lbswap_mask(%rip),%xmm7 - shrl $1,%eax + shll $4,%eax + movl $16,%r10d leaq 0(%rcx),%r11 movdqu (%r9),%xmm3 - movdqa %xmm9,%xmm2 - movl %eax,%r10d -.byte 102,68,15,56,0,207 + movdqa %xmm6,%xmm2 + leaq 32(%rcx,%rax,1),%rcx +.byte 102,15,56,0,247 + subq %rax,%r10 jmp .Lccm64_enc_outer .align 16 .Lccm64_enc_outer: movups (%r11),%xmm0 - movl %r10d,%eax + movq %r10,%rax movups (%rdi),%xmm8 xorps %xmm0,%xmm2 movups 16(%r11),%xmm1 xorps %xmm8,%xmm0 - leaq 32(%r11),%rcx xorps %xmm0,%xmm3 - movups (%rcx),%xmm0 + movups 32(%r11),%xmm0 .Lccm64_enc2_loop: .byte 102,15,56,220,209 - decl %eax .byte 102,15,56,220,217 - movups 16(%rcx),%xmm1 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax .byte 102,15,56,220,208 - leaq 32(%rcx),%rcx .byte 102,15,56,220,216 - movups 0(%rcx),%xmm0 + movups -16(%rcx,%rax,1),%xmm0 jnz .Lccm64_enc2_loop .byte 102,15,56,220,209 .byte 102,15,56,220,217 - paddq %xmm6,%xmm9 + paddq %xmm9,%xmm6 + decq %rdx .byte 102,15,56,221,208 .byte 102,15,56,221,216 - decq %rdx leaq 16(%rdi),%rdi xorps %xmm2,%xmm8 - movdqa %xmm9,%xmm2 + movdqa %xmm6,%xmm2 movups %xmm8,(%rsi) - leaq 16(%rsi),%rsi .byte 102,15,56,0,215 + leaq 16(%rsi),%rsi jnz .Lccm64_enc_outer + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 movups %xmm3,(%r9) + pxor %xmm3,%xmm3 + pxor %xmm8,%xmm8 + pxor %xmm6,%xmm6 .byte 0xf3,0xc3 .size aesni_ccm64_encrypt_blocks,.-aesni_ccm64_encrypt_blocks .globl aesni_ccm64_decrypt_blocks @@ -801,15 +900,15 @@ aesni_ccm64_encrypt_blocks: .align 16 aesni_ccm64_decrypt_blocks: movl 240(%rcx),%eax - movups (%r8),%xmm9 + movups (%r8),%xmm6 movdqu (%r9),%xmm3 - movdqa .Lincrement64(%rip),%xmm6 + movdqa .Lincrement64(%rip),%xmm9 movdqa .Lbswap_mask(%rip),%xmm7 - movaps %xmm9,%xmm2 + movaps %xmm6,%xmm2 movl %eax,%r10d movq %rcx,%r11 -.byte 102,68,15,56,0,207 +.byte 102,15,56,0,247 movups (%rcx),%xmm0 movups 16(%rcx),%xmm1 leaq 32(%rcx),%rcx @@ -819,17 +918,21 @@ aesni_ccm64_decrypt_blocks: decl %eax movups (%rcx),%xmm1 leaq 16(%rcx),%rcx - jnz .Loop_enc1_5 + jnz .Loop_enc1_5 .byte 102,15,56,221,209 + shll $4,%r10d + movl $16,%eax movups (%rdi),%xmm8 - paddq %xmm6,%xmm9 + paddq %xmm9,%xmm6 leaq 16(%rdi),%rdi + subq %r10,%rax + leaq 32(%r11,%r10,1),%rcx + movq %rax,%r10 jmp .Lccm64_dec_outer .align 16 .Lccm64_dec_outer: xorps %xmm2,%xmm8 - movdqa %xmm9,%xmm2 - movl %r10d,%eax + movdqa %xmm6,%xmm2 movups %xmm8,(%rsi) leaq 16(%rsi),%rsi .byte 102,15,56,0,215 @@ -838,36 +941,36 @@ aesni_ccm64_decrypt_blocks: jz .Lccm64_dec_break movups (%r11),%xmm0 - shrl $1,%eax + movq %r10,%rax movups 16(%r11),%xmm1 xorps %xmm0,%xmm8 - leaq 32(%r11),%rcx xorps %xmm0,%xmm2 xorps %xmm8,%xmm3 - movups (%rcx),%xmm0 - + movups 32(%r11),%xmm0 + jmp .Lccm64_dec2_loop +.align 16 .Lccm64_dec2_loop: .byte 102,15,56,220,209 - decl %eax .byte 102,15,56,220,217 - movups 16(%rcx),%xmm1 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax .byte 102,15,56,220,208 - leaq 32(%rcx),%rcx .byte 102,15,56,220,216 - movups 0(%rcx),%xmm0 + movups -16(%rcx,%rax,1),%xmm0 jnz .Lccm64_dec2_loop movups (%rdi),%xmm8 - paddq %xmm6,%xmm9 + paddq %xmm9,%xmm6 .byte 102,15,56,220,209 .byte 102,15,56,220,217 - leaq 16(%rdi),%rdi .byte 102,15,56,221,208 .byte 102,15,56,221,216 + leaq 16(%rdi),%rdi jmp .Lccm64_dec_outer .align 16 .Lccm64_dec_break: + movl 240(%r11),%eax movups (%r11),%xmm0 movups 16(%r11),%xmm1 xorps %xmm0,%xmm8 @@ -878,9 +981,15 @@ aesni_ccm64_decrypt_blocks: decl %eax movups (%r11),%xmm1 leaq 16(%r11),%r11 - jnz .Loop_enc1_6 + jnz .Loop_enc1_6 .byte 102,15,56,221,217 + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 movups %xmm3,(%r9) + pxor %xmm3,%xmm3 + pxor %xmm8,%xmm8 + pxor %xmm6,%xmm6 .byte 0xf3,0xc3 .size aesni_ccm64_decrypt_blocks,.-aesni_ccm64_decrypt_blocks .globl aesni_ctr32_encrypt_blocks @@ -888,490 +997,859 @@ aesni_ccm64_decrypt_blocks: .align 16 aesni_ctr32_encrypt_blocks: cmpq $1,%rdx - je .Lctr32_one_shortcut + jne .Lctr32_bulk - movdqu (%r8),%xmm14 - movdqa .Lbswap_mask(%rip),%xmm15 - xorl %eax,%eax -.byte 102,69,15,58,22,242,3 -.byte 102,68,15,58,34,240,3 + + movups (%r8),%xmm2 + movups (%rdi),%xmm3 + movl 240(%rcx),%edx + movups (%rcx),%xmm0 + movups 16(%rcx),%xmm1 + leaq 32(%rcx),%rcx + xorps %xmm0,%xmm2 +.Loop_enc1_7: +.byte 102,15,56,220,209 + decl %edx + movups (%rcx),%xmm1 + leaq 16(%rcx),%rcx + jnz .Loop_enc1_7 +.byte 102,15,56,221,209 + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + xorps %xmm3,%xmm2 + pxor %xmm3,%xmm3 + movups %xmm2,(%rsi) + xorps %xmm2,%xmm2 + jmp .Lctr32_epilogue + +.align 16 +.Lctr32_bulk: + leaq (%rsp),%rax + pushq %rbp + subq $128,%rsp + andq $-16,%rsp + leaq -8(%rax),%rbp + + + + + movdqu (%r8),%xmm2 + movdqu (%rcx),%xmm0 + movl 12(%r8),%r8d + pxor %xmm0,%xmm2 + movl 12(%rcx),%r11d + movdqa %xmm2,0(%rsp) + bswapl %r8d + movdqa %xmm2,%xmm3 + movdqa %xmm2,%xmm4 + movdqa %xmm2,%xmm5 + movdqa %xmm2,64(%rsp) + movdqa %xmm2,80(%rsp) + movdqa %xmm2,96(%rsp) + movq %rdx,%r10 + movdqa %xmm2,112(%rsp) + + leaq 1(%r8),%rax + leaq 2(%r8),%rdx + bswapl %eax + bswapl %edx + xorl %r11d,%eax + xorl %r11d,%edx +.byte 102,15,58,34,216,3 + leaq 3(%r8),%rax + movdqa %xmm3,16(%rsp) +.byte 102,15,58,34,226,3 + bswapl %eax + movq %r10,%rdx + leaq 4(%r8),%r10 + movdqa %xmm4,32(%rsp) + xorl %r11d,%eax + bswapl %r10d +.byte 102,15,58,34,232,3 + xorl %r11d,%r10d + movdqa %xmm5,48(%rsp) + leaq 5(%r8),%r9 + movl %r10d,64+12(%rsp) + bswapl %r9d + leaq 6(%r8),%r10 movl 240(%rcx),%eax + xorl %r11d,%r9d bswapl %r10d - pxor %xmm12,%xmm12 - pxor %xmm13,%xmm13 -.byte 102,69,15,58,34,226,0 - leaq 3(%r10),%r11 -.byte 102,69,15,58,34,235,0 - incl %r10d -.byte 102,69,15,58,34,226,1 - incq %r11 -.byte 102,69,15,58,34,235,1 - incl %r10d -.byte 102,69,15,58,34,226,2 - incq %r11 -.byte 102,69,15,58,34,235,2 - movdqa %xmm12,-40(%rsp) -.byte 102,69,15,56,0,231 - movdqa %xmm13,-24(%rsp) -.byte 102,69,15,56,0,239 - - pshufd $192,%xmm12,%xmm2 - pshufd $128,%xmm12,%xmm3 - pshufd $64,%xmm12,%xmm4 - cmpq $6,%rdx + movl %r9d,80+12(%rsp) + xorl %r11d,%r10d + leaq 7(%r8),%r9 + movl %r10d,96+12(%rsp) + bswapl %r9d + movl OPENSSL_ia32cap_P+4(%rip),%r10d + xorl %r11d,%r9d + andl $71303168,%r10d + movl %r9d,112+12(%rsp) + + movups 16(%rcx),%xmm1 + + movdqa 64(%rsp),%xmm6 + movdqa 80(%rsp),%xmm7 + + cmpq $8,%rdx jb .Lctr32_tail - shrl $1,%eax - movq %rcx,%r11 - movl %eax,%r10d + subq $6,%rdx + cmpl $4194304,%r10d + je .Lctr32_6x + + leaq 128(%rcx),%rcx + subq $2,%rdx + jmp .Lctr32_loop8 + +.align 16 +.Lctr32_6x: + shll $4,%eax + movl $48,%r10d + bswapl %r11d + leaq 32(%rcx,%rax,1),%rcx + subq %rax,%r10 jmp .Lctr32_loop6 .align 16 .Lctr32_loop6: - pshufd $192,%xmm13,%xmm5 - por %xmm14,%xmm2 - movups (%r11),%xmm0 - pshufd $128,%xmm13,%xmm6 - por %xmm14,%xmm3 - movups 16(%r11),%xmm1 - pshufd $64,%xmm13,%xmm7 - por %xmm14,%xmm4 - por %xmm14,%xmm5 - xorps %xmm0,%xmm2 - por %xmm14,%xmm6 - por %xmm14,%xmm7 + addl $6,%r8d + movups -48(%rcx,%r10,1),%xmm0 +.byte 102,15,56,220,209 + movl %r8d,%eax + xorl %r11d,%eax +.byte 102,15,56,220,217 +.byte 0x0f,0x38,0xf1,0x44,0x24,12 + leal 1(%r8),%eax +.byte 102,15,56,220,225 + xorl %r11d,%eax +.byte 0x0f,0x38,0xf1,0x44,0x24,28 +.byte 102,15,56,220,233 + leal 2(%r8),%eax + xorl %r11d,%eax +.byte 102,15,56,220,241 +.byte 0x0f,0x38,0xf1,0x44,0x24,44 + leal 3(%r8),%eax +.byte 102,15,56,220,249 + movups -32(%rcx,%r10,1),%xmm1 + xorl %r11d,%eax +.byte 102,15,56,220,208 +.byte 0x0f,0x38,0xf1,0x44,0x24,60 + leal 4(%r8),%eax +.byte 102,15,56,220,216 + xorl %r11d,%eax +.byte 0x0f,0x38,0xf1,0x44,0x24,76 +.byte 102,15,56,220,224 + leal 5(%r8),%eax + xorl %r11d,%eax +.byte 102,15,56,220,232 +.byte 0x0f,0x38,0xf1,0x44,0x24,92 + movq %r10,%rax +.byte 102,15,56,220,240 +.byte 102,15,56,220,248 + movups -16(%rcx,%r10,1),%xmm0 + call .Lenc_loop6 + movdqu (%rdi),%xmm8 + movdqu 16(%rdi),%xmm9 + movdqu 32(%rdi),%xmm10 + movdqu 48(%rdi),%xmm11 + movdqu 64(%rdi),%xmm12 + movdqu 80(%rdi),%xmm13 + leaq 96(%rdi),%rdi + movups -64(%rcx,%r10,1),%xmm1 + pxor %xmm2,%xmm8 + movaps 0(%rsp),%xmm2 + pxor %xmm3,%xmm9 + movaps 16(%rsp),%xmm3 + pxor %xmm4,%xmm10 + movaps 32(%rsp),%xmm4 + pxor %xmm5,%xmm11 + movaps 48(%rsp),%xmm5 + pxor %xmm6,%xmm12 + movaps 64(%rsp),%xmm6 + pxor %xmm7,%xmm13 + movaps 80(%rsp),%xmm7 + movdqu %xmm8,(%rsi) + movdqu %xmm9,16(%rsi) + movdqu %xmm10,32(%rsi) + movdqu %xmm11,48(%rsi) + movdqu %xmm12,64(%rsi) + movdqu %xmm13,80(%rsi) + leaq 96(%rsi),%rsi - pxor %xmm0,%xmm3 + subq $6,%rdx + jnc .Lctr32_loop6 + + addq $6,%rdx + jz .Lctr32_done + + leal -48(%r10),%eax + leaq -80(%rcx,%r10,1),%rcx + negl %eax + shrl $4,%eax + jmp .Lctr32_tail + +.align 32 +.Lctr32_loop8: + addl $8,%r8d + movdqa 96(%rsp),%xmm8 .byte 102,15,56,220,209 - leaq 32(%r11),%rcx - pxor %xmm0,%xmm4 + movl %r8d,%r9d + movdqa 112(%rsp),%xmm9 .byte 102,15,56,220,217 - movdqa .Lincrement32(%rip),%xmm13 - pxor %xmm0,%xmm5 + bswapl %r9d + movups 32-128(%rcx),%xmm0 .byte 102,15,56,220,225 - movdqa -40(%rsp),%xmm12 - pxor %xmm0,%xmm6 + xorl %r11d,%r9d + nop .byte 102,15,56,220,233 - pxor %xmm0,%xmm7 - movups (%rcx),%xmm0 - decl %eax + movl %r9d,0+12(%rsp) + leaq 1(%r8),%r9 .byte 102,15,56,220,241 .byte 102,15,56,220,249 - jmp .Lctr32_enc_loop6_enter -.align 16 -.Lctr32_enc_loop6: +.byte 102,68,15,56,220,193 +.byte 102,68,15,56,220,201 + movups 48-128(%rcx),%xmm1 + bswapl %r9d +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 + xorl %r11d,%r9d +.byte 0x66,0x90 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 + movl %r9d,16+12(%rsp) + leaq 2(%r8),%r9 +.byte 102,15,56,220,240 +.byte 102,15,56,220,248 +.byte 102,68,15,56,220,192 +.byte 102,68,15,56,220,200 + movups 64-128(%rcx),%xmm0 + bswapl %r9d .byte 102,15,56,220,209 .byte 102,15,56,220,217 - decl %eax + xorl %r11d,%r9d +.byte 0x66,0x90 .byte 102,15,56,220,225 .byte 102,15,56,220,233 + movl %r9d,32+12(%rsp) + leaq 3(%r8),%r9 .byte 102,15,56,220,241 .byte 102,15,56,220,249 -.Lctr32_enc_loop6_enter: - movups 16(%rcx),%xmm1 +.byte 102,68,15,56,220,193 +.byte 102,68,15,56,220,201 + movups 80-128(%rcx),%xmm1 + bswapl %r9d .byte 102,15,56,220,208 .byte 102,15,56,220,216 - leaq 32(%rcx),%rcx + xorl %r11d,%r9d +.byte 0x66,0x90 .byte 102,15,56,220,224 .byte 102,15,56,220,232 + movl %r9d,48+12(%rsp) + leaq 4(%r8),%r9 .byte 102,15,56,220,240 .byte 102,15,56,220,248 - movups (%rcx),%xmm0 - jnz .Lctr32_enc_loop6 +.byte 102,68,15,56,220,192 +.byte 102,68,15,56,220,200 + movups 96-128(%rcx),%xmm0 + bswapl %r9d +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 + xorl %r11d,%r9d +.byte 0x66,0x90 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 + movl %r9d,64+12(%rsp) + leaq 5(%r8),%r9 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 +.byte 102,68,15,56,220,193 +.byte 102,68,15,56,220,201 + movups 112-128(%rcx),%xmm1 + bswapl %r9d +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 + xorl %r11d,%r9d +.byte 0x66,0x90 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 + movl %r9d,80+12(%rsp) + leaq 6(%r8),%r9 +.byte 102,15,56,220,240 +.byte 102,15,56,220,248 +.byte 102,68,15,56,220,192 +.byte 102,68,15,56,220,200 + movups 128-128(%rcx),%xmm0 + bswapl %r9d +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 + xorl %r11d,%r9d +.byte 0x66,0x90 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 + movl %r9d,96+12(%rsp) + leaq 7(%r8),%r9 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 +.byte 102,68,15,56,220,193 +.byte 102,68,15,56,220,201 + movups 144-128(%rcx),%xmm1 + bswapl %r9d +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 +.byte 102,15,56,220,224 + xorl %r11d,%r9d + movdqu 0(%rdi),%xmm10 +.byte 102,15,56,220,232 + movl %r9d,112+12(%rsp) + cmpl $11,%eax +.byte 102,15,56,220,240 +.byte 102,15,56,220,248 +.byte 102,68,15,56,220,192 +.byte 102,68,15,56,220,200 + movups 160-128(%rcx),%xmm0 + + jb .Lctr32_enc_done .byte 102,15,56,220,209 - paddd %xmm13,%xmm12 .byte 102,15,56,220,217 - paddd -24(%rsp),%xmm13 .byte 102,15,56,220,225 - movdqa %xmm12,-40(%rsp) .byte 102,15,56,220,233 - movdqa %xmm13,-24(%rsp) .byte 102,15,56,220,241 -.byte 102,69,15,56,0,231 .byte 102,15,56,220,249 -.byte 102,69,15,56,0,239 +.byte 102,68,15,56,220,193 +.byte 102,68,15,56,220,201 + movups 176-128(%rcx),%xmm1 -.byte 102,15,56,221,208 - movups (%rdi),%xmm8 -.byte 102,15,56,221,216 - movups 16(%rdi),%xmm9 -.byte 102,15,56,221,224 - movups 32(%rdi),%xmm10 -.byte 102,15,56,221,232 - movups 48(%rdi),%xmm11 -.byte 102,15,56,221,240 - movups 64(%rdi),%xmm1 -.byte 102,15,56,221,248 - movups 80(%rdi),%xmm0 - leaq 96(%rdi),%rdi +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 +.byte 102,15,56,220,240 +.byte 102,15,56,220,248 +.byte 102,68,15,56,220,192 +.byte 102,68,15,56,220,200 + movups 192-128(%rcx),%xmm0 + je .Lctr32_enc_done - xorps %xmm2,%xmm8 - pshufd $192,%xmm12,%xmm2 - xorps %xmm3,%xmm9 - pshufd $128,%xmm12,%xmm3 - movups %xmm8,(%rsi) - xorps %xmm4,%xmm10 - pshufd $64,%xmm12,%xmm4 - movups %xmm9,16(%rsi) - xorps %xmm5,%xmm11 - movups %xmm10,32(%rsi) - xorps %xmm6,%xmm1 - movups %xmm11,48(%rsi) - xorps %xmm7,%xmm0 - movups %xmm1,64(%rsi) - movups %xmm0,80(%rsi) - leaq 96(%rsi),%rsi - movl %r10d,%eax - subq $6,%rdx - jnc .Lctr32_loop6 +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 +.byte 102,68,15,56,220,193 +.byte 102,68,15,56,220,201 + movups 208-128(%rcx),%xmm1 - addq $6,%rdx +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 +.byte 102,15,56,220,240 +.byte 102,15,56,220,248 +.byte 102,68,15,56,220,192 +.byte 102,68,15,56,220,200 + movups 224-128(%rcx),%xmm0 + jmp .Lctr32_enc_done + +.align 16 +.Lctr32_enc_done: + movdqu 16(%rdi),%xmm11 + pxor %xmm0,%xmm10 + movdqu 32(%rdi),%xmm12 + pxor %xmm0,%xmm11 + movdqu 48(%rdi),%xmm13 + pxor %xmm0,%xmm12 + movdqu 64(%rdi),%xmm14 + pxor %xmm0,%xmm13 + movdqu 80(%rdi),%xmm15 + pxor %xmm0,%xmm14 + pxor %xmm0,%xmm15 +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 +.byte 102,68,15,56,220,193 +.byte 102,68,15,56,220,201 + movdqu 96(%rdi),%xmm1 + leaq 128(%rdi),%rdi + +.byte 102,65,15,56,221,210 + pxor %xmm0,%xmm1 + movdqu 112-128(%rdi),%xmm10 +.byte 102,65,15,56,221,219 + pxor %xmm0,%xmm10 + movdqa 0(%rsp),%xmm11 +.byte 102,65,15,56,221,228 +.byte 102,65,15,56,221,237 + movdqa 16(%rsp),%xmm12 + movdqa 32(%rsp),%xmm13 +.byte 102,65,15,56,221,246 +.byte 102,65,15,56,221,255 + movdqa 48(%rsp),%xmm14 + movdqa 64(%rsp),%xmm15 +.byte 102,68,15,56,221,193 + movdqa 80(%rsp),%xmm0 + movups 16-128(%rcx),%xmm1 +.byte 102,69,15,56,221,202 + + movups %xmm2,(%rsi) + movdqa %xmm11,%xmm2 + movups %xmm3,16(%rsi) + movdqa %xmm12,%xmm3 + movups %xmm4,32(%rsi) + movdqa %xmm13,%xmm4 + movups %xmm5,48(%rsi) + movdqa %xmm14,%xmm5 + movups %xmm6,64(%rsi) + movdqa %xmm15,%xmm6 + movups %xmm7,80(%rsi) + movdqa %xmm0,%xmm7 + movups %xmm8,96(%rsi) + movups %xmm9,112(%rsi) + leaq 128(%rsi),%rsi + + subq $8,%rdx + jnc .Lctr32_loop8 + + addq $8,%rdx jz .Lctr32_done - movq %r11,%rcx - leal 1(%rax,%rax,1),%eax + leaq -128(%rcx),%rcx .Lctr32_tail: - por %xmm14,%xmm2 - movups (%rdi),%xmm8 - cmpq $2,%rdx - jb .Lctr32_one - por %xmm14,%xmm3 - movups 16(%rdi),%xmm9 - je .Lctr32_two - pshufd $192,%xmm13,%xmm5 - por %xmm14,%xmm4 - movups 32(%rdi),%xmm10 + leaq 16(%rcx),%rcx cmpq $4,%rdx - jb .Lctr32_three + jb .Lctr32_loop3 + je .Lctr32_loop4 - pshufd $128,%xmm13,%xmm6 - por %xmm14,%xmm5 - movups 48(%rdi),%xmm11 - je .Lctr32_four - por %xmm14,%xmm6 - xorps %xmm7,%xmm7 + shll $4,%eax + movdqa 96(%rsp),%xmm8 + pxor %xmm9,%xmm9 - call _aesni_encrypt6 + movups 16(%rcx),%xmm0 +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 + leaq 32-16(%rcx,%rax,1),%rcx + negq %rax +.byte 102,15,56,220,225 + addq $16,%rax + movups (%rdi),%xmm10 +.byte 102,15,56,220,233 +.byte 102,15,56,220,241 + movups 16(%rdi),%xmm11 + movups 32(%rdi),%xmm12 +.byte 102,15,56,220,249 +.byte 102,68,15,56,220,193 - movups 64(%rdi),%xmm1 - xorps %xmm2,%xmm8 - xorps %xmm3,%xmm9 - movups %xmm8,(%rsi) - xorps %xmm4,%xmm10 - movups %xmm9,16(%rsi) - xorps %xmm5,%xmm11 - movups %xmm10,32(%rsi) - xorps %xmm6,%xmm1 - movups %xmm11,48(%rsi) - movups %xmm1,64(%rsi) + call .Lenc_loop8_enter + + movdqu 48(%rdi),%xmm13 + pxor %xmm10,%xmm2 + movdqu 64(%rdi),%xmm10 + pxor %xmm11,%xmm3 + movdqu %xmm2,(%rsi) + pxor %xmm12,%xmm4 + movdqu %xmm3,16(%rsi) + pxor %xmm13,%xmm5 + movdqu %xmm4,32(%rsi) + pxor %xmm10,%xmm6 + movdqu %xmm5,48(%rsi) + movdqu %xmm6,64(%rsi) + cmpq $6,%rdx + jb .Lctr32_done + + movups 80(%rdi),%xmm11 + xorps %xmm11,%xmm7 + movups %xmm7,80(%rsi) + je .Lctr32_done + + movups 96(%rdi),%xmm12 + xorps %xmm12,%xmm8 + movups %xmm8,96(%rsi) jmp .Lctr32_done -.align 16 -.Lctr32_one_shortcut: - movups (%r8),%xmm2 - movups (%rdi),%xmm8 - movl 240(%rcx),%eax -.Lctr32_one: - movups (%rcx),%xmm0 - movups 16(%rcx),%xmm1 - leaq 32(%rcx),%rcx - xorps %xmm0,%xmm2 -.Loop_enc1_7: +.align 32 +.Lctr32_loop4: .byte 102,15,56,220,209 + leaq 16(%rcx),%rcx decl %eax +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 movups (%rcx),%xmm1 - leaq 16(%rcx),%rcx - jnz .Loop_enc1_7 + jnz .Lctr32_loop4 .byte 102,15,56,221,209 - xorps %xmm2,%xmm8 - movups %xmm8,(%rsi) - jmp .Lctr32_done +.byte 102,15,56,221,217 + movups (%rdi),%xmm10 + movups 16(%rdi),%xmm11 +.byte 102,15,56,221,225 +.byte 102,15,56,221,233 + movups 32(%rdi),%xmm12 + movups 48(%rdi),%xmm13 -.align 16 -.Lctr32_two: - xorps %xmm4,%xmm4 - call _aesni_encrypt3 - xorps %xmm2,%xmm8 - xorps %xmm3,%xmm9 - movups %xmm8,(%rsi) - movups %xmm9,16(%rsi) + xorps %xmm10,%xmm2 + movups %xmm2,(%rsi) + xorps %xmm11,%xmm3 + movups %xmm3,16(%rsi) + pxor %xmm12,%xmm4 + movdqu %xmm4,32(%rsi) + pxor %xmm13,%xmm5 + movdqu %xmm5,48(%rsi) jmp .Lctr32_done -.align 16 -.Lctr32_three: - call _aesni_encrypt3 - xorps %xmm2,%xmm8 - xorps %xmm3,%xmm9 - movups %xmm8,(%rsi) - xorps %xmm4,%xmm10 - movups %xmm9,16(%rsi) - movups %xmm10,32(%rsi) - jmp .Lctr32_done +.align 32 +.Lctr32_loop3: +.byte 102,15,56,220,209 + leaq 16(%rcx),%rcx + decl %eax +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 + movups (%rcx),%xmm1 + jnz .Lctr32_loop3 +.byte 102,15,56,221,209 +.byte 102,15,56,221,217 +.byte 102,15,56,221,225 -.align 16 -.Lctr32_four: - call _aesni_encrypt4 - xorps %xmm2,%xmm8 - xorps %xmm3,%xmm9 - movups %xmm8,(%rsi) - xorps %xmm4,%xmm10 - movups %xmm9,16(%rsi) - xorps %xmm5,%xmm11 - movups %xmm10,32(%rsi) - movups %xmm11,48(%rsi) + movups (%rdi),%xmm10 + xorps %xmm10,%xmm2 + movups %xmm2,(%rsi) + cmpq $2,%rdx + jb .Lctr32_done + + movups 16(%rdi),%xmm11 + xorps %xmm11,%xmm3 + movups %xmm3,16(%rsi) + je .Lctr32_done + + movups 32(%rdi),%xmm12 + xorps %xmm12,%xmm4 + movups %xmm4,32(%rsi) .Lctr32_done: + xorps %xmm0,%xmm0 + xorl %r11d,%r11d + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + movaps %xmm0,0(%rsp) + pxor %xmm8,%xmm8 + movaps %xmm0,16(%rsp) + pxor %xmm9,%xmm9 + movaps %xmm0,32(%rsp) + pxor %xmm10,%xmm10 + movaps %xmm0,48(%rsp) + pxor %xmm11,%xmm11 + movaps %xmm0,64(%rsp) + pxor %xmm12,%xmm12 + movaps %xmm0,80(%rsp) + pxor %xmm13,%xmm13 + movaps %xmm0,96(%rsp) + pxor %xmm14,%xmm14 + movaps %xmm0,112(%rsp) + pxor %xmm15,%xmm15 + leaq (%rbp),%rsp + popq %rbp +.Lctr32_epilogue: .byte 0xf3,0xc3 .size aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks .globl aesni_xts_encrypt .type aesni_xts_encrypt,@function .align 16 aesni_xts_encrypt: - leaq -104(%rsp),%rsp - movups (%r9),%xmm15 + leaq (%rsp),%rax + pushq %rbp + subq $112,%rsp + andq $-16,%rsp + leaq -8(%rax),%rbp + movups (%r9),%xmm2 movl 240(%r8),%eax movl 240(%rcx),%r10d movups (%r8),%xmm0 movups 16(%r8),%xmm1 leaq 32(%r8),%r8 - xorps %xmm0,%xmm15 + xorps %xmm0,%xmm2 .Loop_enc1_8: -.byte 102,68,15,56,220,249 +.byte 102,15,56,220,209 decl %eax movups (%r8),%xmm1 leaq 16(%r8),%r8 - jnz .Loop_enc1_8 -.byte 102,68,15,56,221,249 + jnz .Loop_enc1_8 +.byte 102,15,56,221,209 + movups (%rcx),%xmm0 movq %rcx,%r11 movl %r10d,%eax + shll $4,%r10d movq %rdx,%r9 andq $-16,%rdx + movups 16(%rcx,%r10,1),%xmm1 + movdqa .Lxts_magic(%rip),%xmm8 - pxor %xmm14,%xmm14 - pcmpgtd %xmm15,%xmm14 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 + movdqa %xmm2,%xmm15 + pshufd $95,%xmm2,%xmm9 + pxor %xmm0,%xmm1 + movdqa %xmm9,%xmm14 + paddd %xmm9,%xmm9 movdqa %xmm15,%xmm10 + psrad $31,%xmm14 paddq %xmm15,%xmm15 - pand %xmm8,%xmm9 - pcmpgtd %xmm15,%xmm14 - pxor %xmm9,%xmm15 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 + pand %xmm8,%xmm14 + pxor %xmm0,%xmm10 + pxor %xmm14,%xmm15 + movdqa %xmm9,%xmm14 + paddd %xmm9,%xmm9 movdqa %xmm15,%xmm11 + psrad $31,%xmm14 paddq %xmm15,%xmm15 - pand %xmm8,%xmm9 - pcmpgtd %xmm15,%xmm14 - pxor %xmm9,%xmm15 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 + pand %xmm8,%xmm14 + pxor %xmm0,%xmm11 + pxor %xmm14,%xmm15 + movdqa %xmm9,%xmm14 + paddd %xmm9,%xmm9 movdqa %xmm15,%xmm12 + psrad $31,%xmm14 paddq %xmm15,%xmm15 - pand %xmm8,%xmm9 - pcmpgtd %xmm15,%xmm14 - pxor %xmm9,%xmm15 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 + pand %xmm8,%xmm14 + pxor %xmm0,%xmm12 + pxor %xmm14,%xmm15 + movdqa %xmm9,%xmm14 + paddd %xmm9,%xmm9 movdqa %xmm15,%xmm13 + psrad $31,%xmm14 + paddq %xmm15,%xmm15 + pand %xmm8,%xmm14 + pxor %xmm0,%xmm13 + pxor %xmm14,%xmm15 + movdqa %xmm15,%xmm14 + psrad $31,%xmm9 paddq %xmm15,%xmm15 pand %xmm8,%xmm9 - pcmpgtd %xmm15,%xmm14 + pxor %xmm0,%xmm14 pxor %xmm9,%xmm15 + movaps %xmm1,96(%rsp) + subq $96,%rdx jc .Lxts_enc_short - shrl $1,%eax - subl $1,%eax - movl %eax,%r10d + movl $16+96,%eax + leaq 32(%r11,%r10,1),%rcx + subq %r10,%rax + movups 16(%r11),%xmm1 + movq %rax,%r10 + leaq .Lxts_magic(%rip),%r8 jmp .Lxts_enc_grandloop -.align 16 +.align 32 .Lxts_enc_grandloop: - pshufd $19,%xmm14,%xmm9 - movdqa %xmm15,%xmm14 - paddq %xmm15,%xmm15 movdqu 0(%rdi),%xmm2 - pand %xmm8,%xmm9 + movdqa %xmm0,%xmm8 movdqu 16(%rdi),%xmm3 - pxor %xmm9,%xmm15 - - movdqu 32(%rdi),%xmm4 pxor %xmm10,%xmm2 - movdqu 48(%rdi),%xmm5 + movdqu 32(%rdi),%xmm4 pxor %xmm11,%xmm3 - movdqu 64(%rdi),%xmm6 +.byte 102,15,56,220,209 + movdqu 48(%rdi),%xmm5 pxor %xmm12,%xmm4 - movdqu 80(%rdi),%xmm7 - leaq 96(%rdi),%rdi +.byte 102,15,56,220,217 + movdqu 64(%rdi),%xmm6 pxor %xmm13,%xmm5 - movups (%r11),%xmm0 +.byte 102,15,56,220,225 + movdqu 80(%rdi),%xmm7 + pxor %xmm15,%xmm8 + movdqa 96(%rsp),%xmm9 pxor %xmm14,%xmm6 - pxor %xmm15,%xmm7 - - +.byte 102,15,56,220,233 + movups 32(%r11),%xmm0 + leaq 96(%rdi),%rdi + pxor %xmm8,%xmm7 - movups 16(%r11),%xmm1 - pxor %xmm0,%xmm2 - pxor %xmm0,%xmm3 + pxor %xmm9,%xmm10 +.byte 102,15,56,220,241 + pxor %xmm9,%xmm11 movdqa %xmm10,0(%rsp) -.byte 102,15,56,220,209 - leaq 32(%r11),%rcx - pxor %xmm0,%xmm4 +.byte 102,15,56,220,249 + movups 48(%r11),%xmm1 + pxor %xmm9,%xmm12 + +.byte 102,15,56,220,208 + pxor %xmm9,%xmm13 movdqa %xmm11,16(%rsp) -.byte 102,15,56,220,217 - pxor %xmm0,%xmm5 +.byte 102,15,56,220,216 + pxor %xmm9,%xmm14 movdqa %xmm12,32(%rsp) -.byte 102,15,56,220,225 - pxor %xmm0,%xmm6 - movdqa %xmm13,48(%rsp) -.byte 102,15,56,220,233 - pxor %xmm0,%xmm7 - movups (%rcx),%xmm0 - decl %eax +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 + pxor %xmm9,%xmm8 movdqa %xmm14,64(%rsp) -.byte 102,15,56,220,241 - movdqa %xmm15,80(%rsp) -.byte 102,15,56,220,249 - pxor %xmm14,%xmm14 - pcmpgtd %xmm15,%xmm14 - jmp .Lxts_enc_loop6_enter - -.align 16 +.byte 102,15,56,220,240 +.byte 102,15,56,220,248 + movups 64(%r11),%xmm0 + movdqa %xmm8,80(%rsp) + pshufd $95,%xmm15,%xmm9 + jmp .Lxts_enc_loop6 +.align 32 .Lxts_enc_loop6: .byte 102,15,56,220,209 .byte 102,15,56,220,217 - decl %eax .byte 102,15,56,220,225 .byte 102,15,56,220,233 .byte 102,15,56,220,241 .byte 102,15,56,220,249 -.Lxts_enc_loop6_enter: - movups 16(%rcx),%xmm1 + movups -64(%rcx,%rax,1),%xmm1 + addq $32,%rax + .byte 102,15,56,220,208 .byte 102,15,56,220,216 - leaq 32(%rcx),%rcx .byte 102,15,56,220,224 .byte 102,15,56,220,232 .byte 102,15,56,220,240 .byte 102,15,56,220,248 - movups (%rcx),%xmm0 + movups -80(%rcx,%rax,1),%xmm0 jnz .Lxts_enc_loop6 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 - paddq %xmm15,%xmm15 + movdqa (%r8),%xmm8 + movdqa %xmm9,%xmm14 + paddd %xmm9,%xmm9 .byte 102,15,56,220,209 - pand %xmm8,%xmm9 + paddq %xmm15,%xmm15 + psrad $31,%xmm14 .byte 102,15,56,220,217 - pcmpgtd %xmm15,%xmm14 + pand %xmm8,%xmm14 + movups (%r11),%xmm10 .byte 102,15,56,220,225 - pxor %xmm9,%xmm15 .byte 102,15,56,220,233 .byte 102,15,56,220,241 + pxor %xmm14,%xmm15 + movaps %xmm10,%xmm11 .byte 102,15,56,220,249 - movups 16(%rcx),%xmm1 + movups -64(%rcx),%xmm1 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 - movdqa %xmm15,%xmm10 - paddq %xmm15,%xmm15 + movdqa %xmm9,%xmm14 .byte 102,15,56,220,208 - pand %xmm8,%xmm9 + paddd %xmm9,%xmm9 + pxor %xmm15,%xmm10 .byte 102,15,56,220,216 - pcmpgtd %xmm15,%xmm14 + psrad $31,%xmm14 + paddq %xmm15,%xmm15 .byte 102,15,56,220,224 - pxor %xmm9,%xmm15 .byte 102,15,56,220,232 + pand %xmm8,%xmm14 + movaps %xmm11,%xmm12 .byte 102,15,56,220,240 + pxor %xmm14,%xmm15 + movdqa %xmm9,%xmm14 .byte 102,15,56,220,248 - movups 32(%rcx),%xmm0 + movups -48(%rcx),%xmm0 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 - movdqa %xmm15,%xmm11 - paddq %xmm15,%xmm15 + paddd %xmm9,%xmm9 .byte 102,15,56,220,209 - pand %xmm8,%xmm9 + pxor %xmm15,%xmm11 + psrad $31,%xmm14 .byte 102,15,56,220,217 - pcmpgtd %xmm15,%xmm14 + paddq %xmm15,%xmm15 + pand %xmm8,%xmm14 .byte 102,15,56,220,225 - pxor %xmm9,%xmm15 .byte 102,15,56,220,233 + movdqa %xmm13,48(%rsp) + pxor %xmm14,%xmm15 .byte 102,15,56,220,241 + movaps %xmm12,%xmm13 + movdqa %xmm9,%xmm14 .byte 102,15,56,220,249 + movups -32(%rcx),%xmm1 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 - movdqa %xmm15,%xmm12 + paddd %xmm9,%xmm9 +.byte 102,15,56,220,208 + pxor %xmm15,%xmm12 + psrad $31,%xmm14 +.byte 102,15,56,220,216 paddq %xmm15,%xmm15 -.byte 102,15,56,221,208 - pand %xmm8,%xmm9 -.byte 102,15,56,221,216 - pcmpgtd %xmm15,%xmm14 -.byte 102,15,56,221,224 - pxor %xmm9,%xmm15 -.byte 102,15,56,221,232 -.byte 102,15,56,221,240 -.byte 102,15,56,221,248 + pand %xmm8,%xmm14 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 +.byte 102,15,56,220,240 + pxor %xmm14,%xmm15 + movaps %xmm13,%xmm14 +.byte 102,15,56,220,248 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 - movdqa %xmm15,%xmm13 + movdqa %xmm9,%xmm0 + paddd %xmm9,%xmm9 +.byte 102,15,56,220,209 + pxor %xmm15,%xmm13 + psrad $31,%xmm0 +.byte 102,15,56,220,217 + paddq %xmm15,%xmm15 + pand %xmm8,%xmm0 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 + pxor %xmm0,%xmm15 + movups (%r11),%xmm0 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 + movups 16(%r11),%xmm1 + + pxor %xmm15,%xmm14 +.byte 102,15,56,221,84,36,0 + psrad $31,%xmm9 paddq %xmm15,%xmm15 - xorps 0(%rsp),%xmm2 +.byte 102,15,56,221,92,36,16 +.byte 102,15,56,221,100,36,32 pand %xmm8,%xmm9 - xorps 16(%rsp),%xmm3 - pcmpgtd %xmm15,%xmm14 + movq %r10,%rax +.byte 102,15,56,221,108,36,48 +.byte 102,15,56,221,116,36,64 +.byte 102,15,56,221,124,36,80 pxor %xmm9,%xmm15 - xorps 32(%rsp),%xmm4 - movups %xmm2,0(%rsi) - xorps 48(%rsp),%xmm5 - movups %xmm3,16(%rsi) - xorps 64(%rsp),%xmm6 - movups %xmm4,32(%rsi) - xorps 80(%rsp),%xmm7 - movups %xmm5,48(%rsi) - movl %r10d,%eax - movups %xmm6,64(%rsi) - movups %xmm7,80(%rsi) leaq 96(%rsi),%rsi + movups %xmm2,-96(%rsi) + movups %xmm3,-80(%rsi) + movups %xmm4,-64(%rsi) + movups %xmm5,-48(%rsi) + movups %xmm6,-32(%rsi) + movups %xmm7,-16(%rsi) subq $96,%rdx jnc .Lxts_enc_grandloop - leal 3(%rax,%rax,1),%eax + movl $16+96,%eax + subl %r10d,%eax movq %r11,%rcx - movl %eax,%r10d + shrl $4,%eax .Lxts_enc_short: + + movl %eax,%r10d + pxor %xmm0,%xmm10 addq $96,%rdx jz .Lxts_enc_done + pxor %xmm0,%xmm11 cmpq $32,%rdx jb .Lxts_enc_one + pxor %xmm0,%xmm12 je .Lxts_enc_two + pxor %xmm0,%xmm13 cmpq $64,%rdx jb .Lxts_enc_three + pxor %xmm0,%xmm14 je .Lxts_enc_four - pshufd $19,%xmm14,%xmm9 - movdqa %xmm15,%xmm14 - paddq %xmm15,%xmm15 movdqu (%rdi),%xmm2 - pand %xmm8,%xmm9 movdqu 16(%rdi),%xmm3 - pxor %xmm9,%xmm15 - movdqu 32(%rdi),%xmm4 pxor %xmm10,%xmm2 movdqu 48(%rdi),%xmm5 @@ -1381,6 +1859,7 @@ aesni_xts_encrypt: pxor %xmm12,%xmm4 pxor %xmm13,%xmm5 pxor %xmm14,%xmm6 + pxor %xmm7,%xmm7 call _aesni_encrypt6 @@ -1412,7 +1891,7 @@ aesni_xts_encrypt: decl %eax movups (%rcx),%xmm1 leaq 16(%rcx),%rcx - jnz .Loop_enc1_9 + jnz .Loop_enc1_9 .byte 102,15,56,221,209 xorps %xmm10,%xmm2 movdqa %xmm11,%xmm10 @@ -1428,7 +1907,7 @@ aesni_xts_encrypt: xorps %xmm10,%xmm2 xorps %xmm11,%xmm3 - call _aesni_encrypt3 + call _aesni_encrypt2 xorps %xmm10,%xmm2 movdqa %xmm12,%xmm10 @@ -1474,15 +1953,15 @@ aesni_xts_encrypt: call _aesni_encrypt4 - xorps %xmm10,%xmm2 - movdqa %xmm15,%xmm10 - xorps %xmm11,%xmm3 - xorps %xmm12,%xmm4 - movups %xmm2,(%rsi) - xorps %xmm13,%xmm5 - movups %xmm3,16(%rsi) - movups %xmm4,32(%rsi) - movups %xmm5,48(%rsi) + pxor %xmm10,%xmm2 + movdqa %xmm14,%xmm10 + pxor %xmm11,%xmm3 + pxor %xmm12,%xmm4 + movdqu %xmm2,(%rsi) + pxor %xmm13,%xmm5 + movdqu %xmm3,16(%rsi) + movdqu %xmm4,32(%rsi) + movdqu %xmm5,48(%rsi) leaq 64(%rsi),%rsi jmp .Lxts_enc_done @@ -1517,13 +1996,37 @@ aesni_xts_encrypt: decl %eax movups (%rcx),%xmm1 leaq 16(%rcx),%rcx - jnz .Loop_enc1_10 + jnz .Loop_enc1_10 .byte 102,15,56,221,209 xorps %xmm10,%xmm2 movups %xmm2,-16(%rsi) .Lxts_enc_ret: - leaq 104(%rsp),%rsp + xorps %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + movaps %xmm0,0(%rsp) + pxor %xmm8,%xmm8 + movaps %xmm0,16(%rsp) + pxor %xmm9,%xmm9 + movaps %xmm0,32(%rsp) + pxor %xmm10,%xmm10 + movaps %xmm0,48(%rsp) + pxor %xmm11,%xmm11 + movaps %xmm0,64(%rsp) + pxor %xmm12,%xmm12 + movaps %xmm0,80(%rsp) + pxor %xmm13,%xmm13 + movaps %xmm0,96(%rsp) + pxor %xmm14,%xmm14 + pxor %xmm15,%xmm15 + leaq (%rbp),%rsp + popq %rbp .Lxts_enc_epilogue: .byte 0xf3,0xc3 .size aesni_xts_encrypt,.-aesni_xts_encrypt @@ -1531,249 +2034,293 @@ aesni_xts_encrypt: .type aesni_xts_decrypt,@function .align 16 aesni_xts_decrypt: - leaq -104(%rsp),%rsp - movups (%r9),%xmm15 + leaq (%rsp),%rax + pushq %rbp + subq $112,%rsp + andq $-16,%rsp + leaq -8(%rax),%rbp + movups (%r9),%xmm2 movl 240(%r8),%eax movl 240(%rcx),%r10d movups (%r8),%xmm0 movups 16(%r8),%xmm1 leaq 32(%r8),%r8 - xorps %xmm0,%xmm15 + xorps %xmm0,%xmm2 .Loop_enc1_11: -.byte 102,68,15,56,220,249 +.byte 102,15,56,220,209 decl %eax movups (%r8),%xmm1 leaq 16(%r8),%r8 - jnz .Loop_enc1_11 -.byte 102,68,15,56,221,249 + jnz .Loop_enc1_11 +.byte 102,15,56,221,209 xorl %eax,%eax testq $15,%rdx setnz %al shlq $4,%rax subq %rax,%rdx + movups (%rcx),%xmm0 movq %rcx,%r11 movl %r10d,%eax + shll $4,%r10d movq %rdx,%r9 andq $-16,%rdx + movups 16(%rcx,%r10,1),%xmm1 + movdqa .Lxts_magic(%rip),%xmm8 - pxor %xmm14,%xmm14 - pcmpgtd %xmm15,%xmm14 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 + movdqa %xmm2,%xmm15 + pshufd $95,%xmm2,%xmm9 + pxor %xmm0,%xmm1 + movdqa %xmm9,%xmm14 + paddd %xmm9,%xmm9 movdqa %xmm15,%xmm10 + psrad $31,%xmm14 paddq %xmm15,%xmm15 - pand %xmm8,%xmm9 - pcmpgtd %xmm15,%xmm14 - pxor %xmm9,%xmm15 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 + pand %xmm8,%xmm14 + pxor %xmm0,%xmm10 + pxor %xmm14,%xmm15 + movdqa %xmm9,%xmm14 + paddd %xmm9,%xmm9 movdqa %xmm15,%xmm11 + psrad $31,%xmm14 paddq %xmm15,%xmm15 - pand %xmm8,%xmm9 - pcmpgtd %xmm15,%xmm14 - pxor %xmm9,%xmm15 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 + pand %xmm8,%xmm14 + pxor %xmm0,%xmm11 + pxor %xmm14,%xmm15 + movdqa %xmm9,%xmm14 + paddd %xmm9,%xmm9 movdqa %xmm15,%xmm12 + psrad $31,%xmm14 paddq %xmm15,%xmm15 - pand %xmm8,%xmm9 - pcmpgtd %xmm15,%xmm14 - pxor %xmm9,%xmm15 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 + pand %xmm8,%xmm14 + pxor %xmm0,%xmm12 + pxor %xmm14,%xmm15 + movdqa %xmm9,%xmm14 + paddd %xmm9,%xmm9 movdqa %xmm15,%xmm13 + psrad $31,%xmm14 + paddq %xmm15,%xmm15 + pand %xmm8,%xmm14 + pxor %xmm0,%xmm13 + pxor %xmm14,%xmm15 + movdqa %xmm15,%xmm14 + psrad $31,%xmm9 paddq %xmm15,%xmm15 pand %xmm8,%xmm9 - pcmpgtd %xmm15,%xmm14 + pxor %xmm0,%xmm14 pxor %xmm9,%xmm15 + movaps %xmm1,96(%rsp) + subq $96,%rdx jc .Lxts_dec_short - shrl $1,%eax - subl $1,%eax - movl %eax,%r10d + movl $16+96,%eax + leaq 32(%r11,%r10,1),%rcx + subq %r10,%rax + movups 16(%r11),%xmm1 + movq %rax,%r10 + leaq .Lxts_magic(%rip),%r8 jmp .Lxts_dec_grandloop -.align 16 +.align 32 .Lxts_dec_grandloop: - pshufd $19,%xmm14,%xmm9 - movdqa %xmm15,%xmm14 - paddq %xmm15,%xmm15 movdqu 0(%rdi),%xmm2 - pand %xmm8,%xmm9 + movdqa %xmm0,%xmm8 movdqu 16(%rdi),%xmm3 - pxor %xmm9,%xmm15 - - movdqu 32(%rdi),%xmm4 pxor %xmm10,%xmm2 - movdqu 48(%rdi),%xmm5 + movdqu 32(%rdi),%xmm4 pxor %xmm11,%xmm3 - movdqu 64(%rdi),%xmm6 +.byte 102,15,56,222,209 + movdqu 48(%rdi),%xmm5 pxor %xmm12,%xmm4 - movdqu 80(%rdi),%xmm7 - leaq 96(%rdi),%rdi +.byte 102,15,56,222,217 + movdqu 64(%rdi),%xmm6 pxor %xmm13,%xmm5 - movups (%r11),%xmm0 +.byte 102,15,56,222,225 + movdqu 80(%rdi),%xmm7 + pxor %xmm15,%xmm8 + movdqa 96(%rsp),%xmm9 pxor %xmm14,%xmm6 - pxor %xmm15,%xmm7 - - +.byte 102,15,56,222,233 + movups 32(%r11),%xmm0 + leaq 96(%rdi),%rdi + pxor %xmm8,%xmm7 - movups 16(%r11),%xmm1 - pxor %xmm0,%xmm2 - pxor %xmm0,%xmm3 + pxor %xmm9,%xmm10 +.byte 102,15,56,222,241 + pxor %xmm9,%xmm11 movdqa %xmm10,0(%rsp) -.byte 102,15,56,222,209 - leaq 32(%r11),%rcx - pxor %xmm0,%xmm4 +.byte 102,15,56,222,249 + movups 48(%r11),%xmm1 + pxor %xmm9,%xmm12 + +.byte 102,15,56,222,208 + pxor %xmm9,%xmm13 movdqa %xmm11,16(%rsp) -.byte 102,15,56,222,217 - pxor %xmm0,%xmm5 +.byte 102,15,56,222,216 + pxor %xmm9,%xmm14 movdqa %xmm12,32(%rsp) -.byte 102,15,56,222,225 - pxor %xmm0,%xmm6 - movdqa %xmm13,48(%rsp) -.byte 102,15,56,222,233 - pxor %xmm0,%xmm7 - movups (%rcx),%xmm0 - decl %eax +.byte 102,15,56,222,224 +.byte 102,15,56,222,232 + pxor %xmm9,%xmm8 movdqa %xmm14,64(%rsp) -.byte 102,15,56,222,241 - movdqa %xmm15,80(%rsp) -.byte 102,15,56,222,249 - pxor %xmm14,%xmm14 - pcmpgtd %xmm15,%xmm14 - jmp .Lxts_dec_loop6_enter - -.align 16 +.byte 102,15,56,222,240 +.byte 102,15,56,222,248 + movups 64(%r11),%xmm0 + movdqa %xmm8,80(%rsp) + pshufd $95,%xmm15,%xmm9 + jmp .Lxts_dec_loop6 +.align 32 .Lxts_dec_loop6: .byte 102,15,56,222,209 .byte 102,15,56,222,217 - decl %eax .byte 102,15,56,222,225 .byte 102,15,56,222,233 .byte 102,15,56,222,241 .byte 102,15,56,222,249 -.Lxts_dec_loop6_enter: - movups 16(%rcx),%xmm1 + movups -64(%rcx,%rax,1),%xmm1 + addq $32,%rax + .byte 102,15,56,222,208 .byte 102,15,56,222,216 - leaq 32(%rcx),%rcx .byte 102,15,56,222,224 .byte 102,15,56,222,232 .byte 102,15,56,222,240 .byte 102,15,56,222,248 - movups (%rcx),%xmm0 + movups -80(%rcx,%rax,1),%xmm0 jnz .Lxts_dec_loop6 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 - paddq %xmm15,%xmm15 + movdqa (%r8),%xmm8 + movdqa %xmm9,%xmm14 + paddd %xmm9,%xmm9 .byte 102,15,56,222,209 - pand %xmm8,%xmm9 + paddq %xmm15,%xmm15 + psrad $31,%xmm14 .byte 102,15,56,222,217 - pcmpgtd %xmm15,%xmm14 + pand %xmm8,%xmm14 + movups (%r11),%xmm10 .byte 102,15,56,222,225 - pxor %xmm9,%xmm15 .byte 102,15,56,222,233 .byte 102,15,56,222,241 + pxor %xmm14,%xmm15 + movaps %xmm10,%xmm11 .byte 102,15,56,222,249 - movups 16(%rcx),%xmm1 + movups -64(%rcx),%xmm1 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 - movdqa %xmm15,%xmm10 - paddq %xmm15,%xmm15 + movdqa %xmm9,%xmm14 .byte 102,15,56,222,208 - pand %xmm8,%xmm9 + paddd %xmm9,%xmm9 + pxor %xmm15,%xmm10 .byte 102,15,56,222,216 - pcmpgtd %xmm15,%xmm14 + psrad $31,%xmm14 + paddq %xmm15,%xmm15 .byte 102,15,56,222,224 - pxor %xmm9,%xmm15 .byte 102,15,56,222,232 + pand %xmm8,%xmm14 + movaps %xmm11,%xmm12 .byte 102,15,56,222,240 + pxor %xmm14,%xmm15 + movdqa %xmm9,%xmm14 .byte 102,15,56,222,248 - movups 32(%rcx),%xmm0 + movups -48(%rcx),%xmm0 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 - movdqa %xmm15,%xmm11 - paddq %xmm15,%xmm15 + paddd %xmm9,%xmm9 .byte 102,15,56,222,209 - pand %xmm8,%xmm9 + pxor %xmm15,%xmm11 + psrad $31,%xmm14 .byte 102,15,56,222,217 - pcmpgtd %xmm15,%xmm14 + paddq %xmm15,%xmm15 + pand %xmm8,%xmm14 .byte 102,15,56,222,225 - pxor %xmm9,%xmm15 .byte 102,15,56,222,233 + movdqa %xmm13,48(%rsp) + pxor %xmm14,%xmm15 .byte 102,15,56,222,241 + movaps %xmm12,%xmm13 + movdqa %xmm9,%xmm14 .byte 102,15,56,222,249 + movups -32(%rcx),%xmm1 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 - movdqa %xmm15,%xmm12 + paddd %xmm9,%xmm9 +.byte 102,15,56,222,208 + pxor %xmm15,%xmm12 + psrad $31,%xmm14 +.byte 102,15,56,222,216 paddq %xmm15,%xmm15 -.byte 102,15,56,223,208 - pand %xmm8,%xmm9 -.byte 102,15,56,223,216 - pcmpgtd %xmm15,%xmm14 -.byte 102,15,56,223,224 - pxor %xmm9,%xmm15 -.byte 102,15,56,223,232 -.byte 102,15,56,223,240 -.byte 102,15,56,223,248 + pand %xmm8,%xmm14 +.byte 102,15,56,222,224 +.byte 102,15,56,222,232 +.byte 102,15,56,222,240 + pxor %xmm14,%xmm15 + movaps %xmm13,%xmm14 +.byte 102,15,56,222,248 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 - movdqa %xmm15,%xmm13 + movdqa %xmm9,%xmm0 + paddd %xmm9,%xmm9 +.byte 102,15,56,222,209 + pxor %xmm15,%xmm13 + psrad $31,%xmm0 +.byte 102,15,56,222,217 + paddq %xmm15,%xmm15 + pand %xmm8,%xmm0 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 + pxor %xmm0,%xmm15 + movups (%r11),%xmm0 +.byte 102,15,56,222,241 +.byte 102,15,56,222,249 + movups 16(%r11),%xmm1 + + pxor %xmm15,%xmm14 +.byte 102,15,56,223,84,36,0 + psrad $31,%xmm9 paddq %xmm15,%xmm15 - xorps 0(%rsp),%xmm2 +.byte 102,15,56,223,92,36,16 +.byte 102,15,56,223,100,36,32 pand %xmm8,%xmm9 - xorps 16(%rsp),%xmm3 - pcmpgtd %xmm15,%xmm14 + movq %r10,%rax +.byte 102,15,56,223,108,36,48 +.byte 102,15,56,223,116,36,64 +.byte 102,15,56,223,124,36,80 pxor %xmm9,%xmm15 - xorps 32(%rsp),%xmm4 - movups %xmm2,0(%rsi) - xorps 48(%rsp),%xmm5 - movups %xmm3,16(%rsi) - xorps 64(%rsp),%xmm6 - movups %xmm4,32(%rsi) - xorps 80(%rsp),%xmm7 - movups %xmm5,48(%rsi) - movl %r10d,%eax - movups %xmm6,64(%rsi) - movups %xmm7,80(%rsi) leaq 96(%rsi),%rsi + movups %xmm2,-96(%rsi) + movups %xmm3,-80(%rsi) + movups %xmm4,-64(%rsi) + movups %xmm5,-48(%rsi) + movups %xmm6,-32(%rsi) + movups %xmm7,-16(%rsi) subq $96,%rdx jnc .Lxts_dec_grandloop - leal 3(%rax,%rax,1),%eax + movl $16+96,%eax + subl %r10d,%eax movq %r11,%rcx - movl %eax,%r10d + shrl $4,%eax .Lxts_dec_short: + + movl %eax,%r10d + pxor %xmm0,%xmm10 + pxor %xmm0,%xmm11 addq $96,%rdx jz .Lxts_dec_done + pxor %xmm0,%xmm12 cmpq $32,%rdx jb .Lxts_dec_one + pxor %xmm0,%xmm13 je .Lxts_dec_two + pxor %xmm0,%xmm14 cmpq $64,%rdx jb .Lxts_dec_three je .Lxts_dec_four - pshufd $19,%xmm14,%xmm9 - movdqa %xmm15,%xmm14 - paddq %xmm15,%xmm15 movdqu (%rdi),%xmm2 - pand %xmm8,%xmm9 movdqu 16(%rdi),%xmm3 - pxor %xmm9,%xmm15 - movdqu 32(%rdi),%xmm4 pxor %xmm10,%xmm2 movdqu 48(%rdi),%xmm5 @@ -1823,7 +2370,7 @@ aesni_xts_decrypt: decl %eax movups (%rcx),%xmm1 leaq 16(%rcx),%rcx - jnz .Loop_dec1_12 + jnz .Loop_dec1_12 .byte 102,15,56,223,209 xorps %xmm10,%xmm2 movdqa %xmm11,%xmm10 @@ -1840,7 +2387,7 @@ aesni_xts_decrypt: xorps %xmm10,%xmm2 xorps %xmm11,%xmm3 - call _aesni_decrypt3 + call _aesni_decrypt2 xorps %xmm10,%xmm2 movdqa %xmm12,%xmm10 @@ -1866,7 +2413,7 @@ aesni_xts_decrypt: xorps %xmm10,%xmm2 movdqa %xmm13,%xmm10 xorps %xmm11,%xmm3 - movdqa %xmm15,%xmm11 + movdqa %xmm14,%xmm11 xorps %xmm12,%xmm4 movups %xmm2,(%rsi) movups %xmm3,16(%rsi) @@ -1876,14 +2423,8 @@ aesni_xts_decrypt: .align 16 .Lxts_dec_four: - pshufd $19,%xmm14,%xmm9 - movdqa %xmm15,%xmm14 - paddq %xmm15,%xmm15 movups (%rdi),%xmm2 - pand %xmm8,%xmm9 movups 16(%rdi),%xmm3 - pxor %xmm9,%xmm15 - movups 32(%rdi),%xmm4 xorps %xmm10,%xmm2 movups 48(%rdi),%xmm5 @@ -1894,16 +2435,16 @@ aesni_xts_decrypt: call _aesni_decrypt4 - xorps %xmm10,%xmm2 + pxor %xmm10,%xmm2 movdqa %xmm14,%xmm10 - xorps %xmm11,%xmm3 + pxor %xmm11,%xmm3 movdqa %xmm15,%xmm11 - xorps %xmm12,%xmm4 - movups %xmm2,(%rsi) - xorps %xmm13,%xmm5 - movups %xmm3,16(%rsi) - movups %xmm4,32(%rsi) - movups %xmm5,48(%rsi) + pxor %xmm12,%xmm4 + movdqu %xmm2,(%rsi) + pxor %xmm13,%xmm5 + movdqu %xmm3,16(%rsi) + movdqu %xmm4,32(%rsi) + movdqu %xmm5,48(%rsi) leaq 64(%rsi),%rsi jmp .Lxts_dec_done @@ -1927,7 +2468,7 @@ aesni_xts_decrypt: decl %eax movups (%rcx),%xmm1 leaq 16(%rcx),%rcx - jnz .Loop_dec1_13 + jnz .Loop_dec1_13 .byte 102,15,56,223,209 xorps %xmm11,%xmm2 movups %xmm2,(%rsi) @@ -1957,13 +2498,37 @@ aesni_xts_decrypt: decl %eax movups (%rcx),%xmm1 leaq 16(%rcx),%rcx - jnz .Loop_dec1_14 + jnz .Loop_dec1_14 .byte 102,15,56,223,209 xorps %xmm10,%xmm2 movups %xmm2,(%rsi) .Lxts_dec_ret: - leaq 104(%rsp),%rsp + xorps %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + movaps %xmm0,0(%rsp) + pxor %xmm8,%xmm8 + movaps %xmm0,16(%rsp) + pxor %xmm9,%xmm9 + movaps %xmm0,32(%rsp) + pxor %xmm10,%xmm10 + movaps %xmm0,48(%rsp) + pxor %xmm11,%xmm11 + movaps %xmm0,64(%rsp) + pxor %xmm12,%xmm12 + movaps %xmm0,80(%rsp) + pxor %xmm13,%xmm13 + movaps %xmm0,96(%rsp) + pxor %xmm14,%xmm14 + pxor %xmm15,%xmm15 + leaq (%rbp),%rsp + popq %rbp .Lxts_dec_epilogue: .byte 0xf3,0xc3 .size aesni_xts_decrypt,.-aesni_xts_decrypt @@ -2000,7 +2565,7 @@ aesni_cbc_encrypt: decl %eax movups (%rcx),%xmm1 leaq 16(%rcx),%rcx - jnz .Loop_enc1_15 + jnz .Loop_enc1_15 .byte 102,15,56,221,209 movl %r10d,%eax movq %r11,%rcx @@ -2010,285 +2575,545 @@ aesni_cbc_encrypt: jnc .Lcbc_enc_loop addq $16,%rdx jnz .Lcbc_enc_tail + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 movups %xmm2,(%r8) + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 jmp .Lcbc_ret .Lcbc_enc_tail: movq %rdx,%rcx xchgq %rdi,%rsi -.long 0x9066A4F3 +.long 0x9066A4F3 movl $16,%ecx subq %rdx,%rcx xorl %eax,%eax -.long 0x9066AAF3 +.long 0x9066AAF3 leaq -16(%rdi),%rdi movl %r10d,%eax movq %rdi,%rsi movq %r11,%rcx xorq %rdx,%rdx - jmp .Lcbc_enc_loop + jmp .Lcbc_enc_loop .align 16 .Lcbc_decrypt: - movups (%r8),%xmm9 + cmpq $16,%rdx + jne .Lcbc_decrypt_bulk + + + + movdqu (%rdi),%xmm2 + movdqu (%r8),%xmm3 + movdqa %xmm2,%xmm4 + movups (%rcx),%xmm0 + movups 16(%rcx),%xmm1 + leaq 32(%rcx),%rcx + xorps %xmm0,%xmm2 +.Loop_dec1_16: +.byte 102,15,56,222,209 + decl %r10d + movups (%rcx),%xmm1 + leaq 16(%rcx),%rcx + jnz .Loop_dec1_16 +.byte 102,15,56,223,209 + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + movdqu %xmm4,(%r8) + xorps %xmm3,%xmm2 + pxor %xmm3,%xmm3 + movups %xmm2,(%rsi) + pxor %xmm2,%xmm2 + jmp .Lcbc_ret +.align 16 +.Lcbc_decrypt_bulk: + leaq (%rsp),%rax + pushq %rbp + subq $16,%rsp + andq $-16,%rsp + leaq -8(%rax),%rbp + movups (%r8),%xmm10 movl %r10d,%eax - cmpq $112,%rdx + cmpq $80,%rdx jbe .Lcbc_dec_tail - shrl $1,%r10d - subq $112,%rdx - movl %r10d,%eax - movaps %xmm9,-24(%rsp) + + movups (%rcx),%xmm0 + movdqu 0(%rdi),%xmm2 + movdqu 16(%rdi),%xmm3 + movdqa %xmm2,%xmm11 + movdqu 32(%rdi),%xmm4 + movdqa %xmm3,%xmm12 + movdqu 48(%rdi),%xmm5 + movdqa %xmm4,%xmm13 + movdqu 64(%rdi),%xmm6 + movdqa %xmm5,%xmm14 + movdqu 80(%rdi),%xmm7 + movdqa %xmm6,%xmm15 + movl OPENSSL_ia32cap_P+4(%rip),%r9d + cmpq $112,%rdx + jbe .Lcbc_dec_six_or_seven + + andl $71303168,%r9d + subq $80,%rdx + cmpl $4194304,%r9d + je .Lcbc_dec_loop6_enter + subq $32,%rdx + leaq 112(%rcx),%rcx jmp .Lcbc_dec_loop8_enter .align 16 .Lcbc_dec_loop8: - movaps %xmm0,-24(%rsp) movups %xmm9,(%rsi) leaq 16(%rsi),%rsi .Lcbc_dec_loop8_enter: - movups (%rcx),%xmm0 - movups (%rdi),%xmm2 - movups 16(%rdi),%xmm3 - movups 16(%rcx),%xmm1 + movdqu 96(%rdi),%xmm8 + pxor %xmm0,%xmm2 + movdqu 112(%rdi),%xmm9 + pxor %xmm0,%xmm3 + movups 16-112(%rcx),%xmm1 + pxor %xmm0,%xmm4 + xorq %r11,%r11 + cmpq $112,%rdx + pxor %xmm0,%xmm5 + pxor %xmm0,%xmm6 + pxor %xmm0,%xmm7 + pxor %xmm0,%xmm8 - leaq 32(%rcx),%rcx - movdqu 32(%rdi),%xmm4 - xorps %xmm0,%xmm2 - movdqu 48(%rdi),%xmm5 - xorps %xmm0,%xmm3 - movdqu 64(%rdi),%xmm6 .byte 102,15,56,222,209 - pxor %xmm0,%xmm4 - movdqu 80(%rdi),%xmm7 + pxor %xmm0,%xmm9 + movups 32-112(%rcx),%xmm0 .byte 102,15,56,222,217 - pxor %xmm0,%xmm5 - movdqu 96(%rdi),%xmm8 .byte 102,15,56,222,225 - pxor %xmm0,%xmm6 - movdqu 112(%rdi),%xmm9 .byte 102,15,56,222,233 - pxor %xmm0,%xmm7 - decl %eax .byte 102,15,56,222,241 - pxor %xmm0,%xmm8 .byte 102,15,56,222,249 - pxor %xmm0,%xmm9 - movups (%rcx),%xmm0 .byte 102,68,15,56,222,193 + setnc %r11b + shlq $7,%r11 .byte 102,68,15,56,222,201 - movups 16(%rcx),%xmm1 - - call .Ldec_loop8_enter + addq %rdi,%r11 + movups 48-112(%rcx),%xmm1 +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 +.byte 102,15,56,222,224 +.byte 102,15,56,222,232 +.byte 102,15,56,222,240 +.byte 102,15,56,222,248 +.byte 102,68,15,56,222,192 +.byte 102,68,15,56,222,200 + movups 64-112(%rcx),%xmm0 + nop +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 +.byte 102,15,56,222,241 +.byte 102,15,56,222,249 +.byte 102,68,15,56,222,193 +.byte 102,68,15,56,222,201 + movups 80-112(%rcx),%xmm1 + nop +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 +.byte 102,15,56,222,224 +.byte 102,15,56,222,232 +.byte 102,15,56,222,240 +.byte 102,15,56,222,248 +.byte 102,68,15,56,222,192 +.byte 102,68,15,56,222,200 + movups 96-112(%rcx),%xmm0 + nop +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 +.byte 102,15,56,222,241 +.byte 102,15,56,222,249 +.byte 102,68,15,56,222,193 +.byte 102,68,15,56,222,201 + movups 112-112(%rcx),%xmm1 + nop +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 +.byte 102,15,56,222,224 +.byte 102,15,56,222,232 +.byte 102,15,56,222,240 +.byte 102,15,56,222,248 +.byte 102,68,15,56,222,192 +.byte 102,68,15,56,222,200 + movups 128-112(%rcx),%xmm0 + nop +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 +.byte 102,15,56,222,241 +.byte 102,15,56,222,249 +.byte 102,68,15,56,222,193 +.byte 102,68,15,56,222,201 + movups 144-112(%rcx),%xmm1 + cmpl $11,%eax +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 +.byte 102,15,56,222,224 +.byte 102,15,56,222,232 +.byte 102,15,56,222,240 +.byte 102,15,56,222,248 +.byte 102,68,15,56,222,192 +.byte 102,68,15,56,222,200 + movups 160-112(%rcx),%xmm0 + jb .Lcbc_dec_done +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 +.byte 102,15,56,222,241 +.byte 102,15,56,222,249 +.byte 102,68,15,56,222,193 +.byte 102,68,15,56,222,201 + movups 176-112(%rcx),%xmm1 + nop +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 +.byte 102,15,56,222,224 +.byte 102,15,56,222,232 +.byte 102,15,56,222,240 +.byte 102,15,56,222,248 +.byte 102,68,15,56,222,192 +.byte 102,68,15,56,222,200 + movups 192-112(%rcx),%xmm0 + je .Lcbc_dec_done +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 +.byte 102,15,56,222,241 +.byte 102,15,56,222,249 +.byte 102,68,15,56,222,193 +.byte 102,68,15,56,222,201 + movups 208-112(%rcx),%xmm1 + nop +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 +.byte 102,15,56,222,224 +.byte 102,15,56,222,232 +.byte 102,15,56,222,240 +.byte 102,15,56,222,248 +.byte 102,68,15,56,222,192 +.byte 102,68,15,56,222,200 + movups 224-112(%rcx),%xmm0 + jmp .Lcbc_dec_done +.align 16 +.Lcbc_dec_done: +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 + pxor %xmm0,%xmm10 + pxor %xmm0,%xmm11 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 + pxor %xmm0,%xmm12 + pxor %xmm0,%xmm13 +.byte 102,15,56,222,241 +.byte 102,15,56,222,249 + pxor %xmm0,%xmm14 + pxor %xmm0,%xmm15 +.byte 102,68,15,56,222,193 +.byte 102,68,15,56,222,201 + movdqu 80(%rdi),%xmm1 + +.byte 102,65,15,56,223,210 + movdqu 96(%rdi),%xmm10 + pxor %xmm0,%xmm1 +.byte 102,65,15,56,223,219 + pxor %xmm0,%xmm10 + movdqu 112(%rdi),%xmm0 +.byte 102,65,15,56,223,228 + leaq 128(%rdi),%rdi + movdqu 0(%r11),%xmm11 +.byte 102,65,15,56,223,237 +.byte 102,65,15,56,223,246 + movdqu 16(%r11),%xmm12 + movdqu 32(%r11),%xmm13 +.byte 102,65,15,56,223,255 +.byte 102,68,15,56,223,193 + movdqu 48(%r11),%xmm14 + movdqu 64(%r11),%xmm15 +.byte 102,69,15,56,223,202 + movdqa %xmm0,%xmm10 + movdqu 80(%r11),%xmm1 + movups -112(%rcx),%xmm0 - movups (%rdi),%xmm1 - movups 16(%rdi),%xmm0 - xorps -24(%rsp),%xmm2 - xorps %xmm1,%xmm3 - movups 32(%rdi),%xmm1 - xorps %xmm0,%xmm4 - movups 48(%rdi),%xmm0 - xorps %xmm1,%xmm5 - movups 64(%rdi),%xmm1 - xorps %xmm0,%xmm6 - movups 80(%rdi),%xmm0 - xorps %xmm1,%xmm7 - movups 96(%rdi),%xmm1 - xorps %xmm0,%xmm8 - movups 112(%rdi),%xmm0 - xorps %xmm1,%xmm9 movups %xmm2,(%rsi) + movdqa %xmm11,%xmm2 movups %xmm3,16(%rsi) + movdqa %xmm12,%xmm3 movups %xmm4,32(%rsi) + movdqa %xmm13,%xmm4 movups %xmm5,48(%rsi) - movl %r10d,%eax + movdqa %xmm14,%xmm5 movups %xmm6,64(%rsi) - movq %r11,%rcx + movdqa %xmm15,%xmm6 movups %xmm7,80(%rsi) - leaq 128(%rdi),%rdi + movdqa %xmm1,%xmm7 movups %xmm8,96(%rsi) leaq 112(%rsi),%rsi + subq $128,%rdx ja .Lcbc_dec_loop8 movaps %xmm9,%xmm2 - movaps %xmm0,%xmm9 + leaq -112(%rcx),%rcx addq $112,%rdx - jle .Lcbc_dec_tail_collected - movups %xmm2,(%rsi) - leal 1(%r10,%r10,1),%eax + jle .Lcbc_dec_clear_tail_collected + movups %xmm9,(%rsi) leaq 16(%rsi),%rsi + cmpq $80,%rdx + jbe .Lcbc_dec_tail + + movaps %xmm11,%xmm2 +.Lcbc_dec_six_or_seven: + cmpq $96,%rdx + ja .Lcbc_dec_seven + + movaps %xmm7,%xmm8 + call _aesni_decrypt6 + pxor %xmm10,%xmm2 + movaps %xmm8,%xmm10 + pxor %xmm11,%xmm3 + movdqu %xmm2,(%rsi) + pxor %xmm12,%xmm4 + movdqu %xmm3,16(%rsi) + pxor %xmm3,%xmm3 + pxor %xmm13,%xmm5 + movdqu %xmm4,32(%rsi) + pxor %xmm4,%xmm4 + pxor %xmm14,%xmm6 + movdqu %xmm5,48(%rsi) + pxor %xmm5,%xmm5 + pxor %xmm15,%xmm7 + movdqu %xmm6,64(%rsi) + pxor %xmm6,%xmm6 + leaq 80(%rsi),%rsi + movdqa %xmm7,%xmm2 + pxor %xmm7,%xmm7 + jmp .Lcbc_dec_tail_collected + +.align 16 +.Lcbc_dec_seven: + movups 96(%rdi),%xmm8 + xorps %xmm9,%xmm9 + call _aesni_decrypt8 + movups 80(%rdi),%xmm9 + pxor %xmm10,%xmm2 + movups 96(%rdi),%xmm10 + pxor %xmm11,%xmm3 + movdqu %xmm2,(%rsi) + pxor %xmm12,%xmm4 + movdqu %xmm3,16(%rsi) + pxor %xmm3,%xmm3 + pxor %xmm13,%xmm5 + movdqu %xmm4,32(%rsi) + pxor %xmm4,%xmm4 + pxor %xmm14,%xmm6 + movdqu %xmm5,48(%rsi) + pxor %xmm5,%xmm5 + pxor %xmm15,%xmm7 + movdqu %xmm6,64(%rsi) + pxor %xmm6,%xmm6 + pxor %xmm9,%xmm8 + movdqu %xmm7,80(%rsi) + pxor %xmm7,%xmm7 + leaq 96(%rsi),%rsi + movdqa %xmm8,%xmm2 + pxor %xmm8,%xmm8 + pxor %xmm9,%xmm9 + jmp .Lcbc_dec_tail_collected + +.align 16 +.Lcbc_dec_loop6: + movups %xmm7,(%rsi) + leaq 16(%rsi),%rsi + movdqu 0(%rdi),%xmm2 + movdqu 16(%rdi),%xmm3 + movdqa %xmm2,%xmm11 + movdqu 32(%rdi),%xmm4 + movdqa %xmm3,%xmm12 + movdqu 48(%rdi),%xmm5 + movdqa %xmm4,%xmm13 + movdqu 64(%rdi),%xmm6 + movdqa %xmm5,%xmm14 + movdqu 80(%rdi),%xmm7 + movdqa %xmm6,%xmm15 +.Lcbc_dec_loop6_enter: + leaq 96(%rdi),%rdi + movdqa %xmm7,%xmm8 + + call _aesni_decrypt6 + + pxor %xmm10,%xmm2 + movdqa %xmm8,%xmm10 + pxor %xmm11,%xmm3 + movdqu %xmm2,(%rsi) + pxor %xmm12,%xmm4 + movdqu %xmm3,16(%rsi) + pxor %xmm13,%xmm5 + movdqu %xmm4,32(%rsi) + pxor %xmm14,%xmm6 + movq %r11,%rcx + movdqu %xmm5,48(%rsi) + pxor %xmm15,%xmm7 + movl %r10d,%eax + movdqu %xmm6,64(%rsi) + leaq 80(%rsi),%rsi + subq $96,%rdx + ja .Lcbc_dec_loop6 + + movdqa %xmm7,%xmm2 + addq $80,%rdx + jle .Lcbc_dec_clear_tail_collected + movups %xmm7,(%rsi) + leaq 16(%rsi),%rsi + .Lcbc_dec_tail: movups (%rdi),%xmm2 - movaps %xmm2,%xmm8 - cmpq $16,%rdx + subq $16,%rdx jbe .Lcbc_dec_one movups 16(%rdi),%xmm3 - movaps %xmm3,%xmm7 - cmpq $32,%rdx + movaps %xmm2,%xmm11 + subq $16,%rdx jbe .Lcbc_dec_two movups 32(%rdi),%xmm4 - movaps %xmm4,%xmm6 - cmpq $48,%rdx + movaps %xmm3,%xmm12 + subq $16,%rdx jbe .Lcbc_dec_three movups 48(%rdi),%xmm5 - cmpq $64,%rdx + movaps %xmm4,%xmm13 + subq $16,%rdx jbe .Lcbc_dec_four movups 64(%rdi),%xmm6 - cmpq $80,%rdx - jbe .Lcbc_dec_five - - movups 80(%rdi),%xmm7 - cmpq $96,%rdx - jbe .Lcbc_dec_six - - movups 96(%rdi),%xmm8 - movaps %xmm9,-24(%rsp) - call _aesni_decrypt8 - movups (%rdi),%xmm1 - movups 16(%rdi),%xmm0 - xorps -24(%rsp),%xmm2 - xorps %xmm1,%xmm3 - movups 32(%rdi),%xmm1 - xorps %xmm0,%xmm4 - movups 48(%rdi),%xmm0 - xorps %xmm1,%xmm5 - movups 64(%rdi),%xmm1 - xorps %xmm0,%xmm6 - movups 80(%rdi),%xmm0 - xorps %xmm1,%xmm7 - movups 96(%rdi),%xmm9 - xorps %xmm0,%xmm8 - movups %xmm2,(%rsi) - movups %xmm3,16(%rsi) - movups %xmm4,32(%rsi) - movups %xmm5,48(%rsi) - movups %xmm6,64(%rsi) - movups %xmm7,80(%rsi) - leaq 96(%rsi),%rsi - movaps %xmm8,%xmm2 - subq $112,%rdx + movaps %xmm5,%xmm14 + movaps %xmm6,%xmm15 + xorps %xmm7,%xmm7 + call _aesni_decrypt6 + pxor %xmm10,%xmm2 + movaps %xmm15,%xmm10 + pxor %xmm11,%xmm3 + movdqu %xmm2,(%rsi) + pxor %xmm12,%xmm4 + movdqu %xmm3,16(%rsi) + pxor %xmm3,%xmm3 + pxor %xmm13,%xmm5 + movdqu %xmm4,32(%rsi) + pxor %xmm4,%xmm4 + pxor %xmm14,%xmm6 + movdqu %xmm5,48(%rsi) + pxor %xmm5,%xmm5 + leaq 64(%rsi),%rsi + movdqa %xmm6,%xmm2 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + subq $16,%rdx jmp .Lcbc_dec_tail_collected + .align 16 .Lcbc_dec_one: + movaps %xmm2,%xmm11 movups (%rcx),%xmm0 movups 16(%rcx),%xmm1 leaq 32(%rcx),%rcx xorps %xmm0,%xmm2 -.Loop_dec1_16: +.Loop_dec1_17: .byte 102,15,56,222,209 decl %eax movups (%rcx),%xmm1 leaq 16(%rcx),%rcx - jnz .Loop_dec1_16 + jnz .Loop_dec1_17 .byte 102,15,56,223,209 - xorps %xmm9,%xmm2 - movaps %xmm8,%xmm9 - subq $16,%rdx + xorps %xmm10,%xmm2 + movaps %xmm11,%xmm10 jmp .Lcbc_dec_tail_collected .align 16 .Lcbc_dec_two: - xorps %xmm4,%xmm4 - call _aesni_decrypt3 - xorps %xmm9,%xmm2 - xorps %xmm8,%xmm3 - movups %xmm2,(%rsi) - movaps %xmm7,%xmm9 - movaps %xmm3,%xmm2 + movaps %xmm3,%xmm12 + call _aesni_decrypt2 + pxor %xmm10,%xmm2 + movaps %xmm12,%xmm10 + pxor %xmm11,%xmm3 + movdqu %xmm2,(%rsi) + movdqa %xmm3,%xmm2 + pxor %xmm3,%xmm3 leaq 16(%rsi),%rsi - subq $32,%rdx jmp .Lcbc_dec_tail_collected .align 16 .Lcbc_dec_three: + movaps %xmm4,%xmm13 call _aesni_decrypt3 - xorps %xmm9,%xmm2 - xorps %xmm8,%xmm3 - movups %xmm2,(%rsi) - xorps %xmm7,%xmm4 - movups %xmm3,16(%rsi) - movaps %xmm6,%xmm9 - movaps %xmm4,%xmm2 + pxor %xmm10,%xmm2 + movaps %xmm13,%xmm10 + pxor %xmm11,%xmm3 + movdqu %xmm2,(%rsi) + pxor %xmm12,%xmm4 + movdqu %xmm3,16(%rsi) + pxor %xmm3,%xmm3 + movdqa %xmm4,%xmm2 + pxor %xmm4,%xmm4 leaq 32(%rsi),%rsi - subq $48,%rdx jmp .Lcbc_dec_tail_collected .align 16 .Lcbc_dec_four: + movaps %xmm5,%xmm14 call _aesni_decrypt4 - xorps %xmm9,%xmm2 - movups 48(%rdi),%xmm9 - xorps %xmm8,%xmm3 - movups %xmm2,(%rsi) - xorps %xmm7,%xmm4 - movups %xmm3,16(%rsi) - xorps %xmm6,%xmm5 - movups %xmm4,32(%rsi) - movaps %xmm5,%xmm2 + pxor %xmm10,%xmm2 + movaps %xmm14,%xmm10 + pxor %xmm11,%xmm3 + movdqu %xmm2,(%rsi) + pxor %xmm12,%xmm4 + movdqu %xmm3,16(%rsi) + pxor %xmm3,%xmm3 + pxor %xmm13,%xmm5 + movdqu %xmm4,32(%rsi) + pxor %xmm4,%xmm4 + movdqa %xmm5,%xmm2 + pxor %xmm5,%xmm5 leaq 48(%rsi),%rsi - subq $64,%rdx - jmp .Lcbc_dec_tail_collected -.align 16 -.Lcbc_dec_five: - xorps %xmm7,%xmm7 - call _aesni_decrypt6 - movups 16(%rdi),%xmm1 - movups 32(%rdi),%xmm0 - xorps %xmm9,%xmm2 - xorps %xmm8,%xmm3 - xorps %xmm1,%xmm4 - movups 48(%rdi),%xmm1 - xorps %xmm0,%xmm5 - movups 64(%rdi),%xmm9 - xorps %xmm1,%xmm6 - movups %xmm2,(%rsi) - movups %xmm3,16(%rsi) - movups %xmm4,32(%rsi) - movups %xmm5,48(%rsi) - leaq 64(%rsi),%rsi - movaps %xmm6,%xmm2 - subq $80,%rdx - jmp .Lcbc_dec_tail_collected -.align 16 -.Lcbc_dec_six: - call _aesni_decrypt6 - movups 16(%rdi),%xmm1 - movups 32(%rdi),%xmm0 - xorps %xmm9,%xmm2 - xorps %xmm8,%xmm3 - xorps %xmm1,%xmm4 - movups 48(%rdi),%xmm1 - xorps %xmm0,%xmm5 - movups 64(%rdi),%xmm0 - xorps %xmm1,%xmm6 - movups 80(%rdi),%xmm9 - xorps %xmm0,%xmm7 - movups %xmm2,(%rsi) - movups %xmm3,16(%rsi) - movups %xmm4,32(%rsi) - movups %xmm5,48(%rsi) - movups %xmm6,64(%rsi) - leaq 80(%rsi),%rsi - movaps %xmm7,%xmm2 - subq $96,%rdx jmp .Lcbc_dec_tail_collected + .align 16 +.Lcbc_dec_clear_tail_collected: + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + pxor %xmm8,%xmm8 + pxor %xmm9,%xmm9 .Lcbc_dec_tail_collected: + movups %xmm10,(%r8) andq $15,%rdx - movups %xmm9,(%r8) jnz .Lcbc_dec_tail_partial movups %xmm2,(%rsi) + pxor %xmm2,%xmm2 jmp .Lcbc_dec_ret .align 16 .Lcbc_dec_tail_partial: - movaps %xmm2,-24(%rsp) + movaps %xmm2,(%rsp) + pxor %xmm2,%xmm2 movq $16,%rcx movq %rsi,%rdi subq %rdx,%rcx - leaq -24(%rsp),%rsi -.long 0x9066A4F3 + leaq (%rsp),%rsi +.long 0x9066A4F3 + movdqa %xmm2,(%rsp) .Lcbc_dec_ret: + xorps %xmm0,%xmm0 + pxor %xmm1,%xmm1 + leaq (%rbp),%rsp + popq %rbp .Lcbc_ret: .byte 0xf3,0xc3 .size aesni_cbc_encrypt,.-aesni_cbc_encrypt @@ -2296,7 +3121,7 @@ aesni_cbc_encrypt: .type aesni_set_decrypt_key,@function .align 16 aesni_set_decrypt_key: -.byte 0x48,0x83,0xEC,0x08 +.byte 0x48,0x83,0xEC,0x08 call __aesni_set_encrypt_key shll $4,%esi testl %eax,%eax @@ -2324,7 +3149,9 @@ aesni_set_decrypt_key: movups (%rdx),%xmm0 .byte 102,15,56,219,192 + pxor %xmm1,%xmm1 movups %xmm0,(%rdi) + pxor %xmm0,%xmm0 .Ldec_key_ret: addq $8,%rsp .byte 0xf3,0xc3 @@ -2335,15 +3162,17 @@ aesni_set_decrypt_key: .align 16 aesni_set_encrypt_key: __aesni_set_encrypt_key: -.byte 0x48,0x83,0xEC,0x08 +.byte 0x48,0x83,0xEC,0x08 movq $-1,%rax testq %rdi,%rdi jz .Lenc_key_ret testq %rdx,%rdx jz .Lenc_key_ret + movl $268437504,%r10d movups (%rdi),%xmm0 xorps %xmm4,%xmm4 + andl OPENSSL_ia32cap_P+4(%rip),%r10d leaq 16(%rdx),%rax cmpl $256,%esi je .L14rounds @@ -2354,6 +3183,9 @@ __aesni_set_encrypt_key: .L10rounds: movl $9,%esi + cmpl $268435456,%r10d + je .L10rounds_alt + movups %xmm0,(%rdx) .byte 102,15,58,223,200,1 call .Lkey_expansion_128_cold @@ -2381,9 +3213,79 @@ __aesni_set_encrypt_key: jmp .Lenc_key_ret .align 16 +.L10rounds_alt: + movdqa .Lkey_rotate(%rip),%xmm5 + movl $8,%r10d + movdqa .Lkey_rcon1(%rip),%xmm4 + movdqa %xmm0,%xmm2 + movdqu %xmm0,(%rdx) + jmp .Loop_key128 + +.align 16 +.Loop_key128: +.byte 102,15,56,0,197 +.byte 102,15,56,221,196 + pslld $1,%xmm4 + leaq 16(%rax),%rax + + movdqa %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm3,%xmm2 + + pxor %xmm2,%xmm0 + movdqu %xmm0,-16(%rax) + movdqa %xmm0,%xmm2 + + decl %r10d + jnz .Loop_key128 + + movdqa .Lkey_rcon1b(%rip),%xmm4 + +.byte 102,15,56,0,197 +.byte 102,15,56,221,196 + pslld $1,%xmm4 + + movdqa %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm3,%xmm2 + + pxor %xmm2,%xmm0 + movdqu %xmm0,(%rax) + + movdqa %xmm0,%xmm2 +.byte 102,15,56,0,197 +.byte 102,15,56,221,196 + + movdqa %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm3,%xmm2 + + pxor %xmm2,%xmm0 + movdqu %xmm0,16(%rax) + + movl %esi,96(%rax) + xorl %eax,%eax + jmp .Lenc_key_ret + +.align 16 .L12rounds: movq 16(%rdi),%xmm2 movl $11,%esi + cmpl $268435456,%r10d + je .L12rounds_alt + movups %xmm0,(%rdx) .byte 102,15,58,223,202,1 call .Lkey_expansion_192a_cold @@ -2407,10 +3309,54 @@ __aesni_set_encrypt_key: jmp .Lenc_key_ret .align 16 +.L12rounds_alt: + movdqa .Lkey_rotate192(%rip),%xmm5 + movdqa .Lkey_rcon1(%rip),%xmm4 + movl $8,%r10d + movdqu %xmm0,(%rdx) + jmp .Loop_key192 + +.align 16 +.Loop_key192: + movq %xmm2,0(%rax) + movdqa %xmm2,%xmm1 +.byte 102,15,56,0,213 +.byte 102,15,56,221,212 + pslld $1,%xmm4 + leaq 24(%rax),%rax + + movdqa %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm3,%xmm0 + + pshufd $255,%xmm0,%xmm3 + pxor %xmm1,%xmm3 + pslldq $4,%xmm1 + pxor %xmm1,%xmm3 + + pxor %xmm2,%xmm0 + pxor %xmm3,%xmm2 + movdqu %xmm0,-16(%rax) + + decl %r10d + jnz .Loop_key192 + + movl %esi,32(%rax) + xorl %eax,%eax + jmp .Lenc_key_ret + +.align 16 .L14rounds: movups 16(%rdi),%xmm2 movl $13,%esi leaq 16(%rax),%rax + cmpl $268435456,%r10d + je .L14rounds_alt + movups %xmm0,(%rdx) movups %xmm2,16(%rdx) .byte 102,15,58,223,202,1 @@ -2445,9 +3391,69 @@ __aesni_set_encrypt_key: jmp .Lenc_key_ret .align 16 +.L14rounds_alt: + movdqa .Lkey_rotate(%rip),%xmm5 + movdqa .Lkey_rcon1(%rip),%xmm4 + movl $7,%r10d + movdqu %xmm0,0(%rdx) + movdqa %xmm2,%xmm1 + movdqu %xmm2,16(%rdx) + jmp .Loop_key256 + +.align 16 +.Loop_key256: +.byte 102,15,56,0,213 +.byte 102,15,56,221,212 + + movdqa %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm3,%xmm0 + pslld $1,%xmm4 + + pxor %xmm2,%xmm0 + movdqu %xmm0,(%rax) + + decl %r10d + jz .Ldone_key256 + + pshufd $255,%xmm0,%xmm2 + pxor %xmm3,%xmm3 +.byte 102,15,56,221,211 + + movdqa %xmm1,%xmm3 + pslldq $4,%xmm1 + pxor %xmm1,%xmm3 + pslldq $4,%xmm1 + pxor %xmm1,%xmm3 + pslldq $4,%xmm1 + pxor %xmm3,%xmm1 + + pxor %xmm1,%xmm2 + movdqu %xmm2,16(%rax) + leaq 32(%rax),%rax + movdqa %xmm2,%xmm1 + + jmp .Loop_key256 + +.Ldone_key256: + movl %esi,16(%rax) + xorl %eax,%eax + jmp .Lenc_key_ret + +.align 16 .Lbad_keybits: movq $-2,%rax .Lenc_key_ret: + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 addq $8,%rsp .byte 0xf3,0xc3 .LSEH_end_set_encrypt_key: @@ -2531,6 +3537,16 @@ __aesni_set_encrypt_key: .long 1,0,0,0 .Lxts_magic: .long 0x87,0,1,0 +.Lincrement1: +.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 +.Lkey_rotate: +.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d +.Lkey_rotate192: +.long 0x04070605,0x04070605,0x04070605,0x04070605 +.Lkey_rcon1: +.long 1,1,1,1 +.Lkey_rcon1b: +.long 0x1b,0x1b,0x1b,0x1b .byte 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69,83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 64 diff --git a/secure/lib/libcrypto/amd64/bsaes-x86_64.S b/secure/lib/libcrypto/amd64/bsaes-x86_64.S index 5588ef5..be410de 100644 --- a/secure/lib/libcrypto/amd64/bsaes-x86_64.S +++ b/secure/lib/libcrypto/amd64/bsaes-x86_64.S @@ -14,18 +14,18 @@ _bsaes_encrypt8: movdqa 80(%r11),%xmm7 pxor %xmm8,%xmm15 pxor %xmm8,%xmm0 -.byte 102,68,15,56,0,255 pxor %xmm8,%xmm1 -.byte 102,15,56,0,199 pxor %xmm8,%xmm2 -.byte 102,15,56,0,207 +.byte 102,68,15,56,0,255 +.byte 102,15,56,0,199 pxor %xmm8,%xmm3 -.byte 102,15,56,0,215 pxor %xmm8,%xmm4 -.byte 102,15,56,0,223 +.byte 102,15,56,0,207 +.byte 102,15,56,0,215 pxor %xmm8,%xmm5 -.byte 102,15,56,0,231 pxor %xmm8,%xmm6 +.byte 102,15,56,0,223 +.byte 102,15,56,0,231 .byte 102,15,56,0,239 .byte 102,15,56,0,247 _bsaes_encrypt8_bitslice: @@ -122,21 +122,21 @@ _bsaes_encrypt8_bitslice: .Lenc_loop: pxor 0(%rax),%xmm15 pxor 16(%rax),%xmm0 -.byte 102,68,15,56,0,255 pxor 32(%rax),%xmm1 -.byte 102,15,56,0,199 pxor 48(%rax),%xmm2 -.byte 102,15,56,0,207 +.byte 102,68,15,56,0,255 +.byte 102,15,56,0,199 pxor 64(%rax),%xmm3 -.byte 102,15,56,0,215 pxor 80(%rax),%xmm4 -.byte 102,15,56,0,223 +.byte 102,15,56,0,207 +.byte 102,15,56,0,215 pxor 96(%rax),%xmm5 -.byte 102,15,56,0,231 pxor 112(%rax),%xmm6 +.byte 102,15,56,0,223 +.byte 102,15,56,0,231 .byte 102,15,56,0,239 - leaq 128(%rax),%rax .byte 102,15,56,0,247 + leaq 128(%rax),%rax .Lenc_sbox: pxor %xmm5,%xmm4 pxor %xmm0,%xmm1 @@ -486,18 +486,18 @@ _bsaes_decrypt8: movdqa -48(%r11),%xmm7 pxor %xmm8,%xmm15 pxor %xmm8,%xmm0 -.byte 102,68,15,56,0,255 pxor %xmm8,%xmm1 -.byte 102,15,56,0,199 pxor %xmm8,%xmm2 -.byte 102,15,56,0,207 +.byte 102,68,15,56,0,255 +.byte 102,15,56,0,199 pxor %xmm8,%xmm3 -.byte 102,15,56,0,215 pxor %xmm8,%xmm4 -.byte 102,15,56,0,223 +.byte 102,15,56,0,207 +.byte 102,15,56,0,215 pxor %xmm8,%xmm5 -.byte 102,15,56,0,231 pxor %xmm8,%xmm6 +.byte 102,15,56,0,223 +.byte 102,15,56,0,231 .byte 102,15,56,0,239 .byte 102,15,56,0,247 movdqa 0(%r11),%xmm7 @@ -593,21 +593,21 @@ _bsaes_decrypt8: .Ldec_loop: pxor 0(%rax),%xmm15 pxor 16(%rax),%xmm0 -.byte 102,68,15,56,0,255 pxor 32(%rax),%xmm1 -.byte 102,15,56,0,199 pxor 48(%rax),%xmm2 -.byte 102,15,56,0,207 +.byte 102,68,15,56,0,255 +.byte 102,15,56,0,199 pxor 64(%rax),%xmm3 -.byte 102,15,56,0,215 pxor 80(%rax),%xmm4 -.byte 102,15,56,0,223 +.byte 102,15,56,0,207 +.byte 102,15,56,0,215 pxor 96(%rax),%xmm5 -.byte 102,15,56,0,231 pxor 112(%rax),%xmm6 +.byte 102,15,56,0,223 +.byte 102,15,56,0,231 .byte 102,15,56,0,239 - leaq 128(%rax),%rax .byte 102,15,56,0,247 + leaq 128(%rax),%rax .Ldec_sbox: pxor %xmm3,%xmm2 @@ -1285,7 +1285,7 @@ bsaes_cbc_encrypt: leaq (%r12),%rdi leaq 32(%rbp),%rsi leaq (%r15),%rdx - call asm_AES_decrypt + call asm_AES_decrypt pxor 32(%rbp),%xmm14 movdqu %xmm14,(%r13) movdqa %xmm15,%xmm14 @@ -1383,21 +1383,21 @@ bsaes_ctr32_encrypt_blocks: movdqa -16(%r11),%xmm7 pxor %xmm8,%xmm15 pxor %xmm8,%xmm0 -.byte 102,68,15,56,0,255 pxor %xmm8,%xmm1 -.byte 102,15,56,0,199 pxor %xmm8,%xmm2 -.byte 102,15,56,0,207 +.byte 102,68,15,56,0,255 +.byte 102,15,56,0,199 pxor %xmm8,%xmm3 -.byte 102,15,56,0,215 pxor %xmm8,%xmm4 -.byte 102,15,56,0,223 +.byte 102,15,56,0,207 +.byte 102,15,56,0,215 pxor %xmm8,%xmm5 -.byte 102,15,56,0,231 pxor %xmm8,%xmm6 +.byte 102,15,56,0,223 +.byte 102,15,56,0,231 .byte 102,15,56,0,239 - leaq .LBS0(%rip),%r11 .byte 102,15,56,0,247 + leaq .LBS0(%rip),%r11 movl %ebx,%r10d call _bsaes_encrypt8_bitslice @@ -1535,7 +1535,7 @@ bsaes_xts_encrypt: leaq (%r9),%rdi leaq 32(%rbp),%rsi leaq (%r8),%rdx - call asm_AES_encrypt + call asm_AES_encrypt movl 240(%r15),%eax movq %r14,%rbx @@ -1905,7 +1905,7 @@ bsaes_xts_encrypt: leaq 32(%rbp),%rdi leaq 32(%rbp),%rsi leaq (%r15),%rdx - call asm_AES_encrypt + call asm_AES_encrypt pxor 32(%rbp),%xmm15 @@ -1938,7 +1938,7 @@ bsaes_xts_encrypt: leaq 32(%rbp),%rsi movdqa %xmm15,32(%rbp) leaq (%r15),%rdx - call asm_AES_encrypt + call asm_AES_encrypt pxor 32(%rbp),%xmm6 movdqu %xmm6,-16(%r13) @@ -1987,7 +1987,7 @@ bsaes_xts_decrypt: leaq (%r9),%rdi leaq 32(%rbp),%rsi leaq (%r8),%rdx - call asm_AES_encrypt + call asm_AES_encrypt movl 240(%r15),%eax movq %r14,%rbx @@ -2364,7 +2364,7 @@ bsaes_xts_decrypt: leaq 32(%rbp),%rdi leaq 32(%rbp),%rsi leaq (%r15),%rdx - call asm_AES_decrypt + call asm_AES_decrypt pxor 32(%rbp),%xmm15 @@ -2395,7 +2395,7 @@ bsaes_xts_decrypt: leaq 32(%rbp),%rsi movdqa %xmm15,32(%rbp) leaq (%r15),%rdx - call asm_AES_decrypt + call asm_AES_decrypt pxor 32(%rbp),%xmm6 movq %r13,%rdx movdqu %xmm6,(%r13) @@ -2416,7 +2416,7 @@ bsaes_xts_decrypt: leaq 32(%rbp),%rsi movdqa %xmm15,32(%rbp) leaq (%r15),%rdx - call asm_AES_decrypt + call asm_AES_decrypt pxor 32(%rbp),%xmm5 movdqu %xmm5,(%r13) diff --git a/secure/lib/libcrypto/amd64/cmll-x86_64.S b/secure/lib/libcrypto/amd64/cmll-x86_64.S index f42203c..ecd33f1 100644 --- a/secure/lib/libcrypto/amd64/cmll-x86_64.S +++ b/secure/lib/libcrypto/amd64/cmll-x86_64.S @@ -267,7 +267,7 @@ _x86_64_Camellia_encrypt: movl %ecx,%r10d movl %edx,%r11d -.byte 0xf3,0xc3 +.byte 0xf3,0xc3 .size _x86_64_Camellia_encrypt,.-_x86_64_Camellia_encrypt @@ -537,7 +537,7 @@ _x86_64_Camellia_decrypt: movl %eax,%r10d movl %ebx,%r11d -.byte 0xf3,0xc3 +.byte 0xf3,0xc3 .size _x86_64_Camellia_decrypt,.-_x86_64_Camellia_decrypt .globl Camellia_Ekeygen .type Camellia_Ekeygen,@function @@ -550,7 +550,7 @@ Camellia_Ekeygen: pushq %r15 .Lkey_prologue: - movq %rdi,%r15 + movl %edi,%r15d movq %rdx,%r13 movl 0(%rsi),%r8d @@ -1724,14 +1724,14 @@ Camellia_cbc_encrypt: cld movq %r12,%rsi leaq 8+24(%rsp),%rdi -.long 0x9066A4F3 +.long 0x9066A4F3 popfq .Lcbc_enc_popf: leaq 24(%rsp),%r12 leaq 16+24(%rsp),%rax movq %rax,8(%rsp) - jmp .Lcbc_eloop + jmp .Lcbc_eloop .align 16 .LCBC_DECRYPT: @@ -1814,7 +1814,7 @@ Camellia_cbc_encrypt: cld leaq 8+24(%rsp),%rsi leaq (%r13),%rdi -.long 0x9066A4F3 +.long 0x9066A4F3 popfq .Lcbc_dec_popf: diff --git a/secure/lib/libcrypto/amd64/ecp_nistz256-x86_64.S b/secure/lib/libcrypto/amd64/ecp_nistz256-x86_64.S new file mode 100644 index 0000000..c5875d7 --- /dev/null +++ b/secure/lib/libcrypto/amd64/ecp_nistz256-x86_64.S @@ -0,0 +1,2005 @@ + # $FreeBSD$ +.text + + + +.align 64 +.Lpoly: +.quad 0xffffffffffffffff, 0x00000000ffffffff, 0x0000000000000000, 0xffffffff00000001 + + +.LRR: +.quad 0x0000000000000003, 0xfffffffbffffffff, 0xfffffffffffffffe, 0x00000004fffffffd + +.LOne: +.long 1,1,1,1,1,1,1,1 +.LTwo: +.long 2,2,2,2,2,2,2,2 +.LThree: +.long 3,3,3,3,3,3,3,3 +.LONE_mont: +.quad 0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe + +.globl ecp_nistz256_mul_by_2 +.type ecp_nistz256_mul_by_2,@function +.align 64 +ecp_nistz256_mul_by_2: + pushq %r12 + pushq %r13 + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + addq %r8,%r8 + movq 16(%rsi),%r10 + adcq %r9,%r9 + movq 24(%rsi),%r11 + leaq .Lpoly(%rip),%rsi + movq %r8,%rax + adcq %r10,%r10 + adcq %r11,%r11 + movq %r9,%rdx + sbbq %r13,%r13 + + subq 0(%rsi),%r8 + movq %r10,%rcx + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + movq %r11,%r12 + sbbq 24(%rsi),%r11 + testq %r13,%r13 + + cmovzq %rax,%r8 + cmovzq %rdx,%r9 + movq %r8,0(%rdi) + cmovzq %rcx,%r10 + movq %r9,8(%rdi) + cmovzq %r12,%r11 + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + popq %r13 + popq %r12 + .byte 0xf3,0xc3 +.size ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2 + + + +.globl ecp_nistz256_div_by_2 +.type ecp_nistz256_div_by_2,@function +.align 32 +ecp_nistz256_div_by_2: + pushq %r12 + pushq %r13 + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq %r8,%rax + movq 24(%rsi),%r11 + leaq .Lpoly(%rip),%rsi + + movq %r9,%rdx + xorq %r13,%r13 + addq 0(%rsi),%r8 + movq %r10,%rcx + adcq 8(%rsi),%r9 + adcq 16(%rsi),%r10 + movq %r11,%r12 + adcq 24(%rsi),%r11 + adcq $0,%r13 + xorq %rsi,%rsi + testq $1,%rax + + cmovzq %rax,%r8 + cmovzq %rdx,%r9 + cmovzq %rcx,%r10 + cmovzq %r12,%r11 + cmovzq %rsi,%r13 + + movq %r9,%rax + shrq $1,%r8 + shlq $63,%rax + movq %r10,%rdx + shrq $1,%r9 + orq %rax,%r8 + shlq $63,%rdx + movq %r11,%rcx + shrq $1,%r10 + orq %rdx,%r9 + shlq $63,%rcx + shrq $1,%r11 + shlq $63,%r13 + orq %rcx,%r10 + orq %r13,%r11 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + popq %r13 + popq %r12 + .byte 0xf3,0xc3 +.size ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2 + + + +.globl ecp_nistz256_mul_by_3 +.type ecp_nistz256_mul_by_3,@function +.align 32 +ecp_nistz256_mul_by_3: + pushq %r12 + pushq %r13 + + movq 0(%rsi),%r8 + xorq %r13,%r13 + movq 8(%rsi),%r9 + addq %r8,%r8 + movq 16(%rsi),%r10 + adcq %r9,%r9 + movq 24(%rsi),%r11 + movq %r8,%rax + adcq %r10,%r10 + adcq %r11,%r11 + movq %r9,%rdx + adcq $0,%r13 + + subq $-1,%r8 + movq %r10,%rcx + sbbq .Lpoly+8(%rip),%r9 + sbbq $0,%r10 + movq %r11,%r12 + sbbq .Lpoly+24(%rip),%r11 + testq %r13,%r13 + + cmovzq %rax,%r8 + cmovzq %rdx,%r9 + cmovzq %rcx,%r10 + cmovzq %r12,%r11 + + xorq %r13,%r13 + addq 0(%rsi),%r8 + adcq 8(%rsi),%r9 + movq %r8,%rax + adcq 16(%rsi),%r10 + adcq 24(%rsi),%r11 + movq %r9,%rdx + adcq $0,%r13 + + subq $-1,%r8 + movq %r10,%rcx + sbbq .Lpoly+8(%rip),%r9 + sbbq $0,%r10 + movq %r11,%r12 + sbbq .Lpoly+24(%rip),%r11 + testq %r13,%r13 + + cmovzq %rax,%r8 + cmovzq %rdx,%r9 + movq %r8,0(%rdi) + cmovzq %rcx,%r10 + movq %r9,8(%rdi) + cmovzq %r12,%r11 + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + popq %r13 + popq %r12 + .byte 0xf3,0xc3 +.size ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3 + + + +.globl ecp_nistz256_add +.type ecp_nistz256_add,@function +.align 32 +ecp_nistz256_add: + pushq %r12 + pushq %r13 + + movq 0(%rsi),%r8 + xorq %r13,%r13 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + leaq .Lpoly(%rip),%rsi + + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + movq %r8,%rax + adcq 16(%rdx),%r10 + adcq 24(%rdx),%r11 + movq %r9,%rdx + adcq $0,%r13 + + subq 0(%rsi),%r8 + movq %r10,%rcx + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + movq %r11,%r12 + sbbq 24(%rsi),%r11 + testq %r13,%r13 + + cmovzq %rax,%r8 + cmovzq %rdx,%r9 + movq %r8,0(%rdi) + cmovzq %rcx,%r10 + movq %r9,8(%rdi) + cmovzq %r12,%r11 + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + popq %r13 + popq %r12 + .byte 0xf3,0xc3 +.size ecp_nistz256_add,.-ecp_nistz256_add + + + +.globl ecp_nistz256_sub +.type ecp_nistz256_sub,@function +.align 32 +ecp_nistz256_sub: + pushq %r12 + pushq %r13 + + movq 0(%rsi),%r8 + xorq %r13,%r13 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + leaq .Lpoly(%rip),%rsi + + subq 0(%rdx),%r8 + sbbq 8(%rdx),%r9 + movq %r8,%rax + sbbq 16(%rdx),%r10 + sbbq 24(%rdx),%r11 + movq %r9,%rdx + sbbq $0,%r13 + + addq 0(%rsi),%r8 + movq %r10,%rcx + adcq 8(%rsi),%r9 + adcq 16(%rsi),%r10 + movq %r11,%r12 + adcq 24(%rsi),%r11 + testq %r13,%r13 + + cmovzq %rax,%r8 + cmovzq %rdx,%r9 + movq %r8,0(%rdi) + cmovzq %rcx,%r10 + movq %r9,8(%rdi) + cmovzq %r12,%r11 + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + popq %r13 + popq %r12 + .byte 0xf3,0xc3 +.size ecp_nistz256_sub,.-ecp_nistz256_sub + + + +.globl ecp_nistz256_neg +.type ecp_nistz256_neg,@function +.align 32 +ecp_nistz256_neg: + pushq %r12 + pushq %r13 + + xorq %r8,%r8 + xorq %r9,%r9 + xorq %r10,%r10 + xorq %r11,%r11 + xorq %r13,%r13 + + subq 0(%rsi),%r8 + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + movq %r8,%rax + sbbq 24(%rsi),%r11 + leaq .Lpoly(%rip),%rsi + movq %r9,%rdx + sbbq $0,%r13 + + addq 0(%rsi),%r8 + movq %r10,%rcx + adcq 8(%rsi),%r9 + adcq 16(%rsi),%r10 + movq %r11,%r12 + adcq 24(%rsi),%r11 + testq %r13,%r13 + + cmovzq %rax,%r8 + cmovzq %rdx,%r9 + movq %r8,0(%rdi) + cmovzq %rcx,%r10 + movq %r9,8(%rdi) + cmovzq %r12,%r11 + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + popq %r13 + popq %r12 + .byte 0xf3,0xc3 +.size ecp_nistz256_neg,.-ecp_nistz256_neg + + + + +.globl ecp_nistz256_to_mont +.type ecp_nistz256_to_mont,@function +.align 32 +ecp_nistz256_to_mont: + leaq .LRR(%rip),%rdx + jmp .Lmul_mont +.size ecp_nistz256_to_mont,.-ecp_nistz256_to_mont + + + + + + + +.globl ecp_nistz256_mul_mont +.type ecp_nistz256_mul_mont,@function +.align 32 +ecp_nistz256_mul_mont: +.Lmul_mont: + pushq %rbp + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + movq %rdx,%rbx + movq 0(%rdx),%rax + movq 0(%rsi),%r9 + movq 8(%rsi),%r10 + movq 16(%rsi),%r11 + movq 24(%rsi),%r12 + + call __ecp_nistz256_mul_montq +.Lmul_mont_done: + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + popq %rbp + .byte 0xf3,0xc3 +.size ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont + +.type __ecp_nistz256_mul_montq,@function +.align 32 +__ecp_nistz256_mul_montq: + + + movq %rax,%rbp + mulq %r9 + movq .Lpoly+8(%rip),%r14 + movq %rax,%r8 + movq %rbp,%rax + movq %rdx,%r9 + + mulq %r10 + movq .Lpoly+24(%rip),%r15 + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq %r11 + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq %r12 + addq %rax,%r11 + movq %r8,%rax + adcq $0,%rdx + xorq %r13,%r13 + movq %rdx,%r12 + + + + + + + + + + + movq %r8,%rbp + shlq $32,%r8 + mulq %r15 + shrq $32,%rbp + addq %r8,%r9 + adcq %rbp,%r10 + adcq %rax,%r11 + movq 8(%rbx),%rax + adcq %rdx,%r12 + adcq $0,%r13 + xorq %r8,%r8 + + + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rcx,%r10 + adcq $0,%rdx + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq 16(%rsi) + addq %rcx,%r11 + adcq $0,%rdx + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq 24(%rsi) + addq %rcx,%r12 + adcq $0,%rdx + addq %rax,%r12 + movq %r9,%rax + adcq %rdx,%r13 + adcq $0,%r8 + + + + movq %r9,%rbp + shlq $32,%r9 + mulq %r15 + shrq $32,%rbp + addq %r9,%r10 + adcq %rbp,%r11 + adcq %rax,%r12 + movq 16(%rbx),%rax + adcq %rdx,%r13 + adcq $0,%r8 + xorq %r9,%r9 + + + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rcx,%r11 + adcq $0,%rdx + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq 16(%rsi) + addq %rcx,%r12 + adcq $0,%rdx + addq %rax,%r12 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq 24(%rsi) + addq %rcx,%r13 + adcq $0,%rdx + addq %rax,%r13 + movq %r10,%rax + adcq %rdx,%r8 + adcq $0,%r9 + + + + movq %r10,%rbp + shlq $32,%r10 + mulq %r15 + shrq $32,%rbp + addq %r10,%r11 + adcq %rbp,%r12 + adcq %rax,%r13 + movq 24(%rbx),%rax + adcq %rdx,%r8 + adcq $0,%r9 + xorq %r10,%r10 + + + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rcx,%r12 + adcq $0,%rdx + addq %rax,%r12 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq 16(%rsi) + addq %rcx,%r13 + adcq $0,%rdx + addq %rax,%r13 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq 24(%rsi) + addq %rcx,%r8 + adcq $0,%rdx + addq %rax,%r8 + movq %r11,%rax + adcq %rdx,%r9 + adcq $0,%r10 + + + + movq %r11,%rbp + shlq $32,%r11 + mulq %r15 + shrq $32,%rbp + addq %r11,%r12 + adcq %rbp,%r13 + movq %r12,%rcx + adcq %rax,%r8 + adcq %rdx,%r9 + movq %r13,%rbp + adcq $0,%r10 + + + + subq $-1,%r12 + movq %r8,%rbx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%rdx + sbbq %r15,%r9 + sbbq $0,%r10 + + cmovcq %rcx,%r12 + cmovcq %rbp,%r13 + movq %r12,0(%rdi) + cmovcq %rbx,%r8 + movq %r13,8(%rdi) + cmovcq %rdx,%r9 + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + .byte 0xf3,0xc3 +.size __ecp_nistz256_mul_montq,.-__ecp_nistz256_mul_montq + + + + + + + + +.globl ecp_nistz256_sqr_mont +.type ecp_nistz256_sqr_mont,@function +.align 32 +ecp_nistz256_sqr_mont: + pushq %rbp + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + movq 0(%rsi),%rax + movq 8(%rsi),%r14 + movq 16(%rsi),%r15 + movq 24(%rsi),%r8 + + call __ecp_nistz256_sqr_montq +.Lsqr_mont_done: + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + popq %rbp + .byte 0xf3,0xc3 +.size ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont + +.type __ecp_nistz256_sqr_montq,@function +.align 32 +__ecp_nistz256_sqr_montq: + movq %rax,%r13 + mulq %r14 + movq %rax,%r9 + movq %r15,%rax + movq %rdx,%r10 + + mulq %r13 + addq %rax,%r10 + movq %r8,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq %r13 + addq %rax,%r11 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r12 + + + mulq %r14 + addq %rax,%r11 + movq %r8,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq %r14 + addq %rax,%r12 + movq %r8,%rax + adcq $0,%rdx + addq %rbp,%r12 + movq %rdx,%r13 + adcq $0,%r13 + + + mulq %r15 + xorq %r15,%r15 + addq %rax,%r13 + movq 0(%rsi),%rax + movq %rdx,%r14 + adcq $0,%r14 + + addq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq %r12,%r12 + adcq %r13,%r13 + adcq %r14,%r14 + adcq $0,%r15 + + mulq %rax + movq %rax,%r8 + movq 8(%rsi),%rax + movq %rdx,%rcx + + mulq %rax + addq %rcx,%r9 + adcq %rax,%r10 + movq 16(%rsi),%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq %rax + addq %rcx,%r11 + adcq %rax,%r12 + movq 24(%rsi),%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq %rax + addq %rcx,%r13 + adcq %rax,%r14 + movq %r8,%rax + adcq %rdx,%r15 + + movq .Lpoly+8(%rip),%rsi + movq .Lpoly+24(%rip),%rbp + + + + + movq %r8,%rcx + shlq $32,%r8 + mulq %rbp + shrq $32,%rcx + addq %r8,%r9 + adcq %rcx,%r10 + adcq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + + + + movq %r9,%rcx + shlq $32,%r9 + movq %rdx,%r8 + mulq %rbp + shrq $32,%rcx + addq %r9,%r10 + adcq %rcx,%r11 + adcq %rax,%r8 + movq %r10,%rax + adcq $0,%rdx + + + + movq %r10,%rcx + shlq $32,%r10 + movq %rdx,%r9 + mulq %rbp + shrq $32,%rcx + addq %r10,%r11 + adcq %rcx,%r8 + adcq %rax,%r9 + movq %r11,%rax + adcq $0,%rdx + + + + movq %r11,%rcx + shlq $32,%r11 + movq %rdx,%r10 + mulq %rbp + shrq $32,%rcx + addq %r11,%r8 + adcq %rcx,%r9 + adcq %rax,%r10 + adcq $0,%rdx + xorq %r11,%r11 + + + + addq %r8,%r12 + adcq %r9,%r13 + movq %r12,%r8 + adcq %r10,%r14 + adcq %rdx,%r15 + movq %r13,%r9 + adcq $0,%r11 + + subq $-1,%r12 + movq %r14,%r10 + sbbq %rsi,%r13 + sbbq $0,%r14 + movq %r15,%rcx + sbbq %rbp,%r15 + sbbq $0,%r11 + + cmovcq %r8,%r12 + cmovcq %r9,%r13 + movq %r12,0(%rdi) + cmovcq %r10,%r14 + movq %r13,8(%rdi) + cmovcq %rcx,%r15 + movq %r14,16(%rdi) + movq %r15,24(%rdi) + + .byte 0xf3,0xc3 +.size __ecp_nistz256_sqr_montq,.-__ecp_nistz256_sqr_montq + + + + + + +.globl ecp_nistz256_from_mont +.type ecp_nistz256_from_mont,@function +.align 32 +ecp_nistz256_from_mont: + pushq %r12 + pushq %r13 + + movq 0(%rsi),%rax + movq .Lpoly+24(%rip),%r13 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq %rax,%r8 + movq .Lpoly+8(%rip),%r12 + + + + movq %rax,%rcx + shlq $32,%r8 + mulq %r13 + shrq $32,%rcx + addq %r8,%r9 + adcq %rcx,%r10 + adcq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + + + + movq %r9,%rcx + shlq $32,%r9 + movq %rdx,%r8 + mulq %r13 + shrq $32,%rcx + addq %r9,%r10 + adcq %rcx,%r11 + adcq %rax,%r8 + movq %r10,%rax + adcq $0,%rdx + + + + movq %r10,%rcx + shlq $32,%r10 + movq %rdx,%r9 + mulq %r13 + shrq $32,%rcx + addq %r10,%r11 + adcq %rcx,%r8 + adcq %rax,%r9 + movq %r11,%rax + adcq $0,%rdx + + + + movq %r11,%rcx + shlq $32,%r11 + movq %rdx,%r10 + mulq %r13 + shrq $32,%rcx + addq %r11,%r8 + adcq %rcx,%r9 + movq %r8,%rcx + adcq %rax,%r10 + movq %r9,%rsi + adcq $0,%rdx + + + + subq $-1,%r8 + movq %r10,%rax + sbbq %r12,%r9 + sbbq $0,%r10 + movq %rdx,%r11 + sbbq %r13,%rdx + sbbq %r13,%r13 + + cmovnzq %rcx,%r8 + cmovnzq %rsi,%r9 + movq %r8,0(%rdi) + cmovnzq %rax,%r10 + movq %r9,8(%rdi) + cmovzq %rdx,%r11 + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + popq %r13 + popq %r12 + .byte 0xf3,0xc3 +.size ecp_nistz256_from_mont,.-ecp_nistz256_from_mont + + +.globl ecp_nistz256_select_w5 +.type ecp_nistz256_select_w5,@function +.align 32 +ecp_nistz256_select_w5: + movdqa .LOne(%rip),%xmm0 + movd %edx,%xmm1 + + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + + movdqa %xmm0,%xmm8 + pshufd $0,%xmm1,%xmm1 + + movq $16,%rax +.Lselect_loop_sse_w5: + + movdqa %xmm8,%xmm15 + paddd %xmm0,%xmm8 + pcmpeqd %xmm1,%xmm15 + + movdqa 0(%rsi),%xmm9 + movdqa 16(%rsi),%xmm10 + movdqa 32(%rsi),%xmm11 + movdqa 48(%rsi),%xmm12 + movdqa 64(%rsi),%xmm13 + movdqa 80(%rsi),%xmm14 + leaq 96(%rsi),%rsi + + pand %xmm15,%xmm9 + pand %xmm15,%xmm10 + por %xmm9,%xmm2 + pand %xmm15,%xmm11 + por %xmm10,%xmm3 + pand %xmm15,%xmm12 + por %xmm11,%xmm4 + pand %xmm15,%xmm13 + por %xmm12,%xmm5 + pand %xmm15,%xmm14 + por %xmm13,%xmm6 + por %xmm14,%xmm7 + + decq %rax + jnz .Lselect_loop_sse_w5 + + movdqu %xmm2,0(%rdi) + movdqu %xmm3,16(%rdi) + movdqu %xmm4,32(%rdi) + movdqu %xmm5,48(%rdi) + movdqu %xmm6,64(%rdi) + movdqu %xmm7,80(%rdi) + .byte 0xf3,0xc3 +.size ecp_nistz256_select_w5,.-ecp_nistz256_select_w5 + + + +.globl ecp_nistz256_select_w7 +.type ecp_nistz256_select_w7,@function +.align 32 +ecp_nistz256_select_w7: + movdqa .LOne(%rip),%xmm8 + movd %edx,%xmm1 + + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + + movdqa %xmm8,%xmm0 + pshufd $0,%xmm1,%xmm1 + movq $64,%rax + +.Lselect_loop_sse_w7: + movdqa %xmm8,%xmm15 + paddd %xmm0,%xmm8 + movdqa 0(%rsi),%xmm9 + movdqa 16(%rsi),%xmm10 + pcmpeqd %xmm1,%xmm15 + movdqa 32(%rsi),%xmm11 + movdqa 48(%rsi),%xmm12 + leaq 64(%rsi),%rsi + + pand %xmm15,%xmm9 + pand %xmm15,%xmm10 + por %xmm9,%xmm2 + pand %xmm15,%xmm11 + por %xmm10,%xmm3 + pand %xmm15,%xmm12 + por %xmm11,%xmm4 + prefetcht0 255(%rsi) + por %xmm12,%xmm5 + + decq %rax + jnz .Lselect_loop_sse_w7 + + movdqu %xmm2,0(%rdi) + movdqu %xmm3,16(%rdi) + movdqu %xmm4,32(%rdi) + movdqu %xmm5,48(%rdi) + .byte 0xf3,0xc3 +.size ecp_nistz256_select_w7,.-ecp_nistz256_select_w7 +.globl ecp_nistz256_avx2_select_w7 +.type ecp_nistz256_avx2_select_w7,@function +.align 32 +ecp_nistz256_avx2_select_w7: +.byte 0x0f,0x0b + .byte 0xf3,0xc3 +.size ecp_nistz256_avx2_select_w7,.-ecp_nistz256_avx2_select_w7 +.type __ecp_nistz256_add_toq,@function +.align 32 +__ecp_nistz256_add_toq: + addq 0(%rbx),%r12 + adcq 8(%rbx),%r13 + movq %r12,%rax + adcq 16(%rbx),%r8 + adcq 24(%rbx),%r9 + movq %r13,%rbp + sbbq %r11,%r11 + + subq $-1,%r12 + movq %r8,%rcx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%r10 + sbbq %r15,%r9 + testq %r11,%r11 + + cmovzq %rax,%r12 + cmovzq %rbp,%r13 + movq %r12,0(%rdi) + cmovzq %rcx,%r8 + movq %r13,8(%rdi) + cmovzq %r10,%r9 + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + .byte 0xf3,0xc3 +.size __ecp_nistz256_add_toq,.-__ecp_nistz256_add_toq + +.type __ecp_nistz256_sub_fromq,@function +.align 32 +__ecp_nistz256_sub_fromq: + subq 0(%rbx),%r12 + sbbq 8(%rbx),%r13 + movq %r12,%rax + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + movq %r13,%rbp + sbbq %r11,%r11 + + addq $-1,%r12 + movq %r8,%rcx + adcq %r14,%r13 + adcq $0,%r8 + movq %r9,%r10 + adcq %r15,%r9 + testq %r11,%r11 + + cmovzq %rax,%r12 + cmovzq %rbp,%r13 + movq %r12,0(%rdi) + cmovzq %rcx,%r8 + movq %r13,8(%rdi) + cmovzq %r10,%r9 + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + .byte 0xf3,0xc3 +.size __ecp_nistz256_sub_fromq,.-__ecp_nistz256_sub_fromq + +.type __ecp_nistz256_subq,@function +.align 32 +__ecp_nistz256_subq: + subq %r12,%rax + sbbq %r13,%rbp + movq %rax,%r12 + sbbq %r8,%rcx + sbbq %r9,%r10 + movq %rbp,%r13 + sbbq %r11,%r11 + + addq $-1,%rax + movq %rcx,%r8 + adcq %r14,%rbp + adcq $0,%rcx + movq %r10,%r9 + adcq %r15,%r10 + testq %r11,%r11 + + cmovnzq %rax,%r12 + cmovnzq %rbp,%r13 + cmovnzq %rcx,%r8 + cmovnzq %r10,%r9 + + .byte 0xf3,0xc3 +.size __ecp_nistz256_subq,.-__ecp_nistz256_subq + +.type __ecp_nistz256_mul_by_2q,@function +.align 32 +__ecp_nistz256_mul_by_2q: + addq %r12,%r12 + adcq %r13,%r13 + movq %r12,%rax + adcq %r8,%r8 + adcq %r9,%r9 + movq %r13,%rbp + sbbq %r11,%r11 + + subq $-1,%r12 + movq %r8,%rcx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%r10 + sbbq %r15,%r9 + testq %r11,%r11 + + cmovzq %rax,%r12 + cmovzq %rbp,%r13 + movq %r12,0(%rdi) + cmovzq %rcx,%r8 + movq %r13,8(%rdi) + cmovzq %r10,%r9 + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + .byte 0xf3,0xc3 +.size __ecp_nistz256_mul_by_2q,.-__ecp_nistz256_mul_by_2q +.globl ecp_nistz256_point_double +.type ecp_nistz256_point_double,@function +.align 32 +ecp_nistz256_point_double: + pushq %rbp + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $160+8,%rsp + + movdqu 0(%rsi),%xmm0 + movq %rsi,%rbx + movdqu 16(%rsi),%xmm1 + movq 32+0(%rsi),%r12 + movq 32+8(%rsi),%r13 + movq 32+16(%rsi),%r8 + movq 32+24(%rsi),%r9 + movq .Lpoly+8(%rip),%r14 + movq .Lpoly+24(%rip),%r15 + movdqa %xmm0,96(%rsp) + movdqa %xmm1,96+16(%rsp) + leaq 32(%rdi),%r10 + leaq 64(%rdi),%r11 +.byte 102,72,15,110,199 +.byte 102,73,15,110,202 +.byte 102,73,15,110,211 + + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_by_2q + + movq 64+0(%rsi),%rax + movq 64+8(%rsi),%r14 + movq 64+16(%rsi),%r15 + movq 64+24(%rsi),%r8 + leaq 64-0(%rsi),%rsi + leaq 64(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + movq 0+0(%rsp),%rax + movq 8+0(%rsp),%r14 + leaq 0+0(%rsp),%rsi + movq 16+0(%rsp),%r15 + movq 24+0(%rsp),%r8 + leaq 0(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + movq 32(%rbx),%rax + movq 64+0(%rbx),%r9 + movq 64+8(%rbx),%r10 + movq 64+16(%rbx),%r11 + movq 64+24(%rbx),%r12 + leaq 64-0(%rbx),%rsi + leaq 32(%rbx),%rbx +.byte 102,72,15,126,215 + call __ecp_nistz256_mul_montq + call __ecp_nistz256_mul_by_2q + + movq 96+0(%rsp),%r12 + movq 96+8(%rsp),%r13 + leaq 64(%rsp),%rbx + movq 96+16(%rsp),%r8 + movq 96+24(%rsp),%r9 + leaq 32(%rsp),%rdi + call __ecp_nistz256_add_toq + + movq 96+0(%rsp),%r12 + movq 96+8(%rsp),%r13 + leaq 64(%rsp),%rbx + movq 96+16(%rsp),%r8 + movq 96+24(%rsp),%r9 + leaq 64(%rsp),%rdi + call __ecp_nistz256_sub_fromq + + movq 0+0(%rsp),%rax + movq 8+0(%rsp),%r14 + leaq 0+0(%rsp),%rsi + movq 16+0(%rsp),%r15 + movq 24+0(%rsp),%r8 +.byte 102,72,15,126,207 + call __ecp_nistz256_sqr_montq + xorq %r9,%r9 + movq %r12,%rax + addq $-1,%r12 + movq %r13,%r10 + adcq %rsi,%r13 + movq %r14,%rcx + adcq $0,%r14 + movq %r15,%r8 + adcq %rbp,%r15 + adcq $0,%r9 + xorq %rsi,%rsi + testq $1,%rax + + cmovzq %rax,%r12 + cmovzq %r10,%r13 + cmovzq %rcx,%r14 + cmovzq %r8,%r15 + cmovzq %rsi,%r9 + + movq %r13,%rax + shrq $1,%r12 + shlq $63,%rax + movq %r14,%r10 + shrq $1,%r13 + orq %rax,%r12 + shlq $63,%r10 + movq %r15,%rcx + shrq $1,%r14 + orq %r10,%r13 + shlq $63,%rcx + movq %r12,0(%rdi) + shrq $1,%r15 + movq %r13,8(%rdi) + shlq $63,%r9 + orq %rcx,%r14 + orq %r9,%r15 + movq %r14,16(%rdi) + movq %r15,24(%rdi) + movq 64(%rsp),%rax + leaq 64(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq 0+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 32(%rsp),%rdi + call __ecp_nistz256_mul_montq + + leaq 128(%rsp),%rdi + call __ecp_nistz256_mul_by_2q + + leaq 32(%rsp),%rbx + leaq 32(%rsp),%rdi + call __ecp_nistz256_add_toq + + movq 96(%rsp),%rax + leaq 96(%rsp),%rbx + movq 0+0(%rsp),%r9 + movq 8+0(%rsp),%r10 + leaq 0+0(%rsp),%rsi + movq 16+0(%rsp),%r11 + movq 24+0(%rsp),%r12 + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_montq + + leaq 128(%rsp),%rdi + call __ecp_nistz256_mul_by_2q + + movq 0+32(%rsp),%rax + movq 8+32(%rsp),%r14 + leaq 0+32(%rsp),%rsi + movq 16+32(%rsp),%r15 + movq 24+32(%rsp),%r8 +.byte 102,72,15,126,199 + call __ecp_nistz256_sqr_montq + + leaq 128(%rsp),%rbx + movq %r14,%r8 + movq %r15,%r9 + movq %rsi,%r14 + movq %rbp,%r15 + call __ecp_nistz256_sub_fromq + + movq 0+0(%rsp),%rax + movq 0+8(%rsp),%rbp + movq 0+16(%rsp),%rcx + movq 0+24(%rsp),%r10 + leaq 0(%rsp),%rdi + call __ecp_nistz256_subq + + movq 32(%rsp),%rax + leaq 32(%rsp),%rbx + movq %r12,%r14 + xorl %ecx,%ecx + movq %r12,0+0(%rsp) + movq %r13,%r10 + movq %r13,0+8(%rsp) + cmovzq %r8,%r11 + movq %r8,0+16(%rsp) + leaq 0-0(%rsp),%rsi + cmovzq %r9,%r12 + movq %r9,0+24(%rsp) + movq %r14,%r9 + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_montq + +.byte 102,72,15,126,203 +.byte 102,72,15,126,207 + call __ecp_nistz256_sub_fromq + + addq $160+8,%rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + popq %rbp + .byte 0xf3,0xc3 +.size ecp_nistz256_point_double,.-ecp_nistz256_point_double +.globl ecp_nistz256_point_add +.type ecp_nistz256_point_add,@function +.align 32 +ecp_nistz256_point_add: + pushq %rbp + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $576+8,%rsp + + movdqu 0(%rsi),%xmm0 + movdqu 16(%rsi),%xmm1 + movdqu 32(%rsi),%xmm2 + movdqu 48(%rsi),%xmm3 + movdqu 64(%rsi),%xmm4 + movdqu 80(%rsi),%xmm5 + movq %rsi,%rbx + movq %rdx,%rsi + movdqa %xmm0,384(%rsp) + movdqa %xmm1,384+16(%rsp) + por %xmm0,%xmm1 + movdqa %xmm2,416(%rsp) + movdqa %xmm3,416+16(%rsp) + por %xmm2,%xmm3 + movdqa %xmm4,448(%rsp) + movdqa %xmm5,448+16(%rsp) + por %xmm1,%xmm3 + + movdqu 0(%rsi),%xmm0 + pshufd $177,%xmm3,%xmm5 + movdqu 16(%rsi),%xmm1 + movdqu 32(%rsi),%xmm2 + por %xmm3,%xmm5 + movdqu 48(%rsi),%xmm3 + movq 64+0(%rsi),%rax + movq 64+8(%rsi),%r14 + movq 64+16(%rsi),%r15 + movq 64+24(%rsi),%r8 + movdqa %xmm0,480(%rsp) + pshufd $30,%xmm5,%xmm4 + movdqa %xmm1,480+16(%rsp) + por %xmm0,%xmm1 +.byte 102,72,15,110,199 + movdqa %xmm2,512(%rsp) + movdqa %xmm3,512+16(%rsp) + por %xmm2,%xmm3 + por %xmm4,%xmm5 + pxor %xmm4,%xmm4 + por %xmm1,%xmm3 + + leaq 64-0(%rsi),%rsi + movq %rax,544+0(%rsp) + movq %r14,544+8(%rsp) + movq %r15,544+16(%rsp) + movq %r8,544+24(%rsp) + leaq 96(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + pcmpeqd %xmm4,%xmm5 + pshufd $177,%xmm3,%xmm4 + por %xmm3,%xmm4 + pshufd $0,%xmm5,%xmm5 + pshufd $30,%xmm4,%xmm3 + por %xmm3,%xmm4 + pxor %xmm3,%xmm3 + pcmpeqd %xmm3,%xmm4 + pshufd $0,%xmm4,%xmm4 + movq 64+0(%rbx),%rax + movq 64+8(%rbx),%r14 + movq 64+16(%rbx),%r15 + movq 64+24(%rbx),%r8 + + leaq 64-0(%rbx),%rsi + leaq 32(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + movq 544(%rsp),%rax + leaq 544(%rsp),%rbx + movq 0+96(%rsp),%r9 + movq 8+96(%rsp),%r10 + leaq 0+96(%rsp),%rsi + movq 16+96(%rsp),%r11 + movq 24+96(%rsp),%r12 + leaq 224(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 448(%rsp),%rax + leaq 448(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq 0+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 256(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 416(%rsp),%rax + leaq 416(%rsp),%rbx + movq 0+224(%rsp),%r9 + movq 8+224(%rsp),%r10 + leaq 0+224(%rsp),%rsi + movq 16+224(%rsp),%r11 + movq 24+224(%rsp),%r12 + leaq 224(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 512(%rsp),%rax + leaq 512(%rsp),%rbx + movq 0+256(%rsp),%r9 + movq 8+256(%rsp),%r10 + leaq 0+256(%rsp),%rsi + movq 16+256(%rsp),%r11 + movq 24+256(%rsp),%r12 + leaq 256(%rsp),%rdi + call __ecp_nistz256_mul_montq + + leaq 224(%rsp),%rbx + leaq 64(%rsp),%rdi + call __ecp_nistz256_sub_fromq + + orq %r13,%r12 + movdqa %xmm4,%xmm2 + orq %r8,%r12 + orq %r9,%r12 + por %xmm5,%xmm2 +.byte 102,73,15,110,220 + + movq 384(%rsp),%rax + leaq 384(%rsp),%rbx + movq 0+96(%rsp),%r9 + movq 8+96(%rsp),%r10 + leaq 0+96(%rsp),%rsi + movq 16+96(%rsp),%r11 + movq 24+96(%rsp),%r12 + leaq 160(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 480(%rsp),%rax + leaq 480(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq 0+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 192(%rsp),%rdi + call __ecp_nistz256_mul_montq + + leaq 160(%rsp),%rbx + leaq 0(%rsp),%rdi + call __ecp_nistz256_sub_fromq + + orq %r13,%r12 + orq %r8,%r12 + orq %r9,%r12 + +.byte 0x3e + jnz .Ladd_proceedq +.byte 102,73,15,126,208 +.byte 102,73,15,126,217 + testq %r8,%r8 + jnz .Ladd_proceedq + testq %r9,%r9 + jz .Ladd_proceedq + +.byte 102,72,15,126,199 + pxor %xmm0,%xmm0 + movdqu %xmm0,0(%rdi) + movdqu %xmm0,16(%rdi) + movdqu %xmm0,32(%rdi) + movdqu %xmm0,48(%rdi) + movdqu %xmm0,64(%rdi) + movdqu %xmm0,80(%rdi) + jmp .Ladd_doneq + +.align 32 +.Ladd_proceedq: + movq 0+64(%rsp),%rax + movq 8+64(%rsp),%r14 + leaq 0+64(%rsp),%rsi + movq 16+64(%rsp),%r15 + movq 24+64(%rsp),%r8 + leaq 96(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + movq 448(%rsp),%rax + leaq 448(%rsp),%rbx + movq 0+0(%rsp),%r9 + movq 8+0(%rsp),%r10 + leaq 0+0(%rsp),%rsi + movq 16+0(%rsp),%r11 + movq 24+0(%rsp),%r12 + leaq 352(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 0+0(%rsp),%rax + movq 8+0(%rsp),%r14 + leaq 0+0(%rsp),%rsi + movq 16+0(%rsp),%r15 + movq 24+0(%rsp),%r8 + leaq 32(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + movq 544(%rsp),%rax + leaq 544(%rsp),%rbx + movq 0+352(%rsp),%r9 + movq 8+352(%rsp),%r10 + leaq 0+352(%rsp),%rsi + movq 16+352(%rsp),%r11 + movq 24+352(%rsp),%r12 + leaq 352(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 0(%rsp),%rax + leaq 0(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq 0+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 128(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 160(%rsp),%rax + leaq 160(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq 0+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 192(%rsp),%rdi + call __ecp_nistz256_mul_montq + + + + + addq %r12,%r12 + leaq 96(%rsp),%rsi + adcq %r13,%r13 + movq %r12,%rax + adcq %r8,%r8 + adcq %r9,%r9 + movq %r13,%rbp + sbbq %r11,%r11 + + subq $-1,%r12 + movq %r8,%rcx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%r10 + sbbq %r15,%r9 + testq %r11,%r11 + + cmovzq %rax,%r12 + movq 0(%rsi),%rax + cmovzq %rbp,%r13 + movq 8(%rsi),%rbp + cmovzq %rcx,%r8 + movq 16(%rsi),%rcx + cmovzq %r10,%r9 + movq 24(%rsi),%r10 + + call __ecp_nistz256_subq + + leaq 128(%rsp),%rbx + leaq 288(%rsp),%rdi + call __ecp_nistz256_sub_fromq + + movq 192+0(%rsp),%rax + movq 192+8(%rsp),%rbp + movq 192+16(%rsp),%rcx + movq 192+24(%rsp),%r10 + leaq 320(%rsp),%rdi + + call __ecp_nistz256_subq + + movq %r12,0(%rdi) + movq %r13,8(%rdi) + movq %r8,16(%rdi) + movq %r9,24(%rdi) + movq 128(%rsp),%rax + leaq 128(%rsp),%rbx + movq 0+224(%rsp),%r9 + movq 8+224(%rsp),%r10 + leaq 0+224(%rsp),%rsi + movq 16+224(%rsp),%r11 + movq 24+224(%rsp),%r12 + leaq 256(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 320(%rsp),%rax + leaq 320(%rsp),%rbx + movq 0+64(%rsp),%r9 + movq 8+64(%rsp),%r10 + leaq 0+64(%rsp),%rsi + movq 16+64(%rsp),%r11 + movq 24+64(%rsp),%r12 + leaq 320(%rsp),%rdi + call __ecp_nistz256_mul_montq + + leaq 256(%rsp),%rbx + leaq 320(%rsp),%rdi + call __ecp_nistz256_sub_fromq + +.byte 102,72,15,126,199 + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 352(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 352+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 544(%rsp),%xmm2 + pand 544+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 448(%rsp),%xmm2 + pand 448+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,64(%rdi) + movdqu %xmm3,80(%rdi) + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 288(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 288+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 480(%rsp),%xmm2 + pand 480+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 384(%rsp),%xmm2 + pand 384+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,0(%rdi) + movdqu %xmm3,16(%rdi) + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 320(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 320+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 512(%rsp),%xmm2 + pand 512+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 416(%rsp),%xmm2 + pand 416+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,32(%rdi) + movdqu %xmm3,48(%rdi) + +.Ladd_doneq: + addq $576+8,%rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + popq %rbp + .byte 0xf3,0xc3 +.size ecp_nistz256_point_add,.-ecp_nistz256_point_add +.globl ecp_nistz256_point_add_affine +.type ecp_nistz256_point_add_affine,@function +.align 32 +ecp_nistz256_point_add_affine: + pushq %rbp + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $480+8,%rsp + + movdqu 0(%rsi),%xmm0 + movq %rdx,%rbx + movdqu 16(%rsi),%xmm1 + movdqu 32(%rsi),%xmm2 + movdqu 48(%rsi),%xmm3 + movdqu 64(%rsi),%xmm4 + movdqu 80(%rsi),%xmm5 + movq 64+0(%rsi),%rax + movq 64+8(%rsi),%r14 + movq 64+16(%rsi),%r15 + movq 64+24(%rsi),%r8 + movdqa %xmm0,320(%rsp) + movdqa %xmm1,320+16(%rsp) + por %xmm0,%xmm1 + movdqa %xmm2,352(%rsp) + movdqa %xmm3,352+16(%rsp) + por %xmm2,%xmm3 + movdqa %xmm4,384(%rsp) + movdqa %xmm5,384+16(%rsp) + por %xmm1,%xmm3 + + movdqu 0(%rbx),%xmm0 + pshufd $177,%xmm3,%xmm5 + movdqu 16(%rbx),%xmm1 + movdqu 32(%rbx),%xmm2 + por %xmm3,%xmm5 + movdqu 48(%rbx),%xmm3 + movdqa %xmm0,416(%rsp) + pshufd $30,%xmm5,%xmm4 + movdqa %xmm1,416+16(%rsp) + por %xmm0,%xmm1 +.byte 102,72,15,110,199 + movdqa %xmm2,448(%rsp) + movdqa %xmm3,448+16(%rsp) + por %xmm2,%xmm3 + por %xmm4,%xmm5 + pxor %xmm4,%xmm4 + por %xmm1,%xmm3 + + leaq 64-0(%rsi),%rsi + leaq 32(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + pcmpeqd %xmm4,%xmm5 + pshufd $177,%xmm3,%xmm4 + movq 0(%rbx),%rax + + movq %r12,%r9 + por %xmm3,%xmm4 + pshufd $0,%xmm5,%xmm5 + pshufd $30,%xmm4,%xmm3 + movq %r13,%r10 + por %xmm3,%xmm4 + pxor %xmm3,%xmm3 + movq %r14,%r11 + pcmpeqd %xmm3,%xmm4 + pshufd $0,%xmm4,%xmm4 + + leaq 32-0(%rsp),%rsi + movq %r15,%r12 + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_montq + + leaq 320(%rsp),%rbx + leaq 64(%rsp),%rdi + call __ecp_nistz256_sub_fromq + + movq 384(%rsp),%rax + leaq 384(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq 0+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 32(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 384(%rsp),%rax + leaq 384(%rsp),%rbx + movq 0+64(%rsp),%r9 + movq 8+64(%rsp),%r10 + leaq 0+64(%rsp),%rsi + movq 16+64(%rsp),%r11 + movq 24+64(%rsp),%r12 + leaq 288(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 448(%rsp),%rax + leaq 448(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq 0+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 32(%rsp),%rdi + call __ecp_nistz256_mul_montq + + leaq 352(%rsp),%rbx + leaq 96(%rsp),%rdi + call __ecp_nistz256_sub_fromq + + movq 0+64(%rsp),%rax + movq 8+64(%rsp),%r14 + leaq 0+64(%rsp),%rsi + movq 16+64(%rsp),%r15 + movq 24+64(%rsp),%r8 + leaq 128(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + movq 0+96(%rsp),%rax + movq 8+96(%rsp),%r14 + leaq 0+96(%rsp),%rsi + movq 16+96(%rsp),%r15 + movq 24+96(%rsp),%r8 + leaq 192(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + movq 128(%rsp),%rax + leaq 128(%rsp),%rbx + movq 0+64(%rsp),%r9 + movq 8+64(%rsp),%r10 + leaq 0+64(%rsp),%rsi + movq 16+64(%rsp),%r11 + movq 24+64(%rsp),%r12 + leaq 160(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 320(%rsp),%rax + leaq 320(%rsp),%rbx + movq 0+128(%rsp),%r9 + movq 8+128(%rsp),%r10 + leaq 0+128(%rsp),%rsi + movq 16+128(%rsp),%r11 + movq 24+128(%rsp),%r12 + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_montq + + + + + addq %r12,%r12 + leaq 192(%rsp),%rsi + adcq %r13,%r13 + movq %r12,%rax + adcq %r8,%r8 + adcq %r9,%r9 + movq %r13,%rbp + sbbq %r11,%r11 + + subq $-1,%r12 + movq %r8,%rcx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%r10 + sbbq %r15,%r9 + testq %r11,%r11 + + cmovzq %rax,%r12 + movq 0(%rsi),%rax + cmovzq %rbp,%r13 + movq 8(%rsi),%rbp + cmovzq %rcx,%r8 + movq 16(%rsi),%rcx + cmovzq %r10,%r9 + movq 24(%rsi),%r10 + + call __ecp_nistz256_subq + + leaq 160(%rsp),%rbx + leaq 224(%rsp),%rdi + call __ecp_nistz256_sub_fromq + + movq 0+0(%rsp),%rax + movq 0+8(%rsp),%rbp + movq 0+16(%rsp),%rcx + movq 0+24(%rsp),%r10 + leaq 64(%rsp),%rdi + + call __ecp_nistz256_subq + + movq %r12,0(%rdi) + movq %r13,8(%rdi) + movq %r8,16(%rdi) + movq %r9,24(%rdi) + movq 352(%rsp),%rax + leaq 352(%rsp),%rbx + movq 0+160(%rsp),%r9 + movq 8+160(%rsp),%r10 + leaq 0+160(%rsp),%rsi + movq 16+160(%rsp),%r11 + movq 24+160(%rsp),%r12 + leaq 32(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 96(%rsp),%rax + leaq 96(%rsp),%rbx + movq 0+64(%rsp),%r9 + movq 8+64(%rsp),%r10 + leaq 0+64(%rsp),%rsi + movq 16+64(%rsp),%r11 + movq 24+64(%rsp),%r12 + leaq 64(%rsp),%rdi + call __ecp_nistz256_mul_montq + + leaq 32(%rsp),%rbx + leaq 256(%rsp),%rdi + call __ecp_nistz256_sub_fromq + +.byte 102,72,15,126,199 + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 288(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 288+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand .LONE_mont(%rip),%xmm2 + pand .LONE_mont+16(%rip),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 384(%rsp),%xmm2 + pand 384+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,64(%rdi) + movdqu %xmm3,80(%rdi) + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 224(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 224+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 416(%rsp),%xmm2 + pand 416+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 320(%rsp),%xmm2 + pand 320+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,0(%rdi) + movdqu %xmm3,16(%rdi) + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 256(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 256+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 448(%rsp),%xmm2 + pand 448+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 352(%rsp),%xmm2 + pand 352+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,32(%rdi) + movdqu %xmm3,48(%rdi) + + addq $480+8,%rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + popq %rbp + .byte 0xf3,0xc3 +.size ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine diff --git a/secure/lib/libcrypto/amd64/ghash-x86_64.S b/secure/lib/libcrypto/amd64/ghash-x86_64.S index d7ea764..aa93c80 100644 --- a/secure/lib/libcrypto/amd64/ghash-x86_64.S +++ b/secure/lib/libcrypto/amd64/ghash-x86_64.S @@ -1,6 +1,7 @@ # $FreeBSD$ .text + .globl gcm_gmult_4bit .type gcm_gmult_4bit,@function .align 16 @@ -659,6 +660,7 @@ gcm_ghash_4bit: .type gcm_init_clmul,@function .align 16 gcm_init_clmul: +.L_init_clmul: movdqu (%rsi),%xmm2 pshufd $78,%xmm2,%xmm2 @@ -677,15 +679,15 @@ gcm_init_clmul: pxor %xmm5,%xmm2 + pshufd $78,%xmm2,%xmm6 movdqa %xmm2,%xmm0 + pxor %xmm2,%xmm6 movdqa %xmm0,%xmm1 pshufd $78,%xmm0,%xmm3 - pshufd $78,%xmm2,%xmm4 pxor %xmm0,%xmm3 - pxor %xmm2,%xmm4 .byte 102,15,58,68,194,0 .byte 102,15,58,68,202,17 -.byte 102,15,58,68,220,0 +.byte 102,15,58,68,222,0 pxor %xmm0,%xmm3 pxor %xmm1,%xmm3 @@ -695,44 +697,134 @@ gcm_init_clmul: pxor %xmm3,%xmm1 pxor %xmm4,%xmm0 + movdqa %xmm0,%xmm4 movdqa %xmm0,%xmm3 + psllq $5,%xmm0 + pxor %xmm0,%xmm3 psllq $1,%xmm0 pxor %xmm3,%xmm0 + psllq $57,%xmm0 + movdqa %xmm0,%xmm3 + pslldq $8,%xmm0 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 + + + movdqa %xmm0,%xmm4 + psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 + pxor %xmm4,%xmm0 + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 + pshufd $78,%xmm2,%xmm3 + pshufd $78,%xmm0,%xmm4 + pxor %xmm2,%xmm3 + movdqu %xmm2,0(%rdi) + pxor %xmm0,%xmm4 + movdqu %xmm0,16(%rdi) +.byte 102,15,58,15,227,8 + movdqu %xmm4,32(%rdi) + movdqa %xmm0,%xmm1 + pshufd $78,%xmm0,%xmm3 + pxor %xmm0,%xmm3 +.byte 102,15,58,68,194,0 +.byte 102,15,58,68,202,17 +.byte 102,15,58,68,222,0 + pxor %xmm0,%xmm3 + pxor %xmm1,%xmm3 + + movdqa %xmm3,%xmm4 + psrldq $8,%xmm3 + pslldq $8,%xmm4 + pxor %xmm3,%xmm1 + pxor %xmm4,%xmm0 + + movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 pxor %xmm3,%xmm0 psllq $57,%xmm0 - movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 pslldq $8,%xmm0 - psrldq $8,%xmm4 - pxor %xmm3,%xmm0 - pxor %xmm4,%xmm1 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 movdqa %xmm0,%xmm4 + psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 psrlq $5,%xmm0 pxor %xmm4,%xmm0 psrlq $1,%xmm0 + pxor %xmm1,%xmm0 + movdqa %xmm0,%xmm5 + movdqa %xmm0,%xmm1 + pshufd $78,%xmm0,%xmm3 + pxor %xmm0,%xmm3 +.byte 102,15,58,68,194,0 +.byte 102,15,58,68,202,17 +.byte 102,15,58,68,222,0 + pxor %xmm0,%xmm3 + pxor %xmm1,%xmm3 + + movdqa %xmm3,%xmm4 + psrldq $8,%xmm3 + pslldq $8,%xmm4 + pxor %xmm3,%xmm1 pxor %xmm4,%xmm0 - pxor %xmm1,%xmm4 + + movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 + psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 + pxor %xmm3,%xmm0 + psllq $57,%xmm0 + movdqa %xmm0,%xmm3 + pslldq $8,%xmm0 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 + + + movdqa %xmm0,%xmm4 psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 pxor %xmm4,%xmm0 - movdqu %xmm2,(%rdi) - movdqu %xmm0,16(%rdi) + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 + pshufd $78,%xmm5,%xmm3 + pshufd $78,%xmm0,%xmm4 + pxor %xmm5,%xmm3 + movdqu %xmm5,48(%rdi) + pxor %xmm0,%xmm4 + movdqu %xmm0,64(%rdi) +.byte 102,15,58,15,227,8 + movdqu %xmm4,80(%rdi) .byte 0xf3,0xc3 .size gcm_init_clmul,.-gcm_init_clmul .globl gcm_gmult_clmul .type gcm_gmult_clmul,@function .align 16 gcm_gmult_clmul: +.L_gmult_clmul: movdqu (%rdi),%xmm0 movdqa .Lbswap_mask(%rip),%xmm5 movdqu (%rsi),%xmm2 + movdqu 32(%rsi),%xmm4 .byte 102,15,56,0,197 movdqa %xmm0,%xmm1 pshufd $78,%xmm0,%xmm3 - pshufd $78,%xmm2,%xmm4 pxor %xmm0,%xmm3 - pxor %xmm2,%xmm4 .byte 102,15,58,68,194,0 .byte 102,15,58,68,202,17 .byte 102,15,58,68,220,0 @@ -745,201 +837,379 @@ gcm_gmult_clmul: pxor %xmm3,%xmm1 pxor %xmm4,%xmm0 + movdqa %xmm0,%xmm4 movdqa %xmm0,%xmm3 - psllq $1,%xmm0 - pxor %xmm3,%xmm0 psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 pxor %xmm3,%xmm0 psllq $57,%xmm0 - movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 pslldq $8,%xmm0 - psrldq $8,%xmm4 - pxor %xmm3,%xmm0 - pxor %xmm4,%xmm1 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 movdqa %xmm0,%xmm4 - psrlq $5,%xmm0 - pxor %xmm4,%xmm0 psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 pxor %xmm4,%xmm0 - pxor %xmm1,%xmm4 psrlq $1,%xmm0 - pxor %xmm4,%xmm0 + pxor %xmm1,%xmm0 .byte 102,15,56,0,197 movdqu %xmm0,(%rdi) .byte 0xf3,0xc3 .size gcm_gmult_clmul,.-gcm_gmult_clmul .globl gcm_ghash_clmul .type gcm_ghash_clmul,@function -.align 16 +.align 32 gcm_ghash_clmul: - movdqa .Lbswap_mask(%rip),%xmm5 +.L_ghash_clmul: + movdqa .Lbswap_mask(%rip),%xmm10 movdqu (%rdi),%xmm0 movdqu (%rsi),%xmm2 -.byte 102,15,56,0,197 + movdqu 32(%rsi),%xmm7 +.byte 102,65,15,56,0,194 subq $16,%rcx jz .Lodd_tail - movdqu 16(%rsi),%xmm8 + movdqu 16(%rsi),%xmm6 + movl OPENSSL_ia32cap_P+4(%rip),%eax + cmpq $48,%rcx + jb .Lskip4x + andl $71303168,%eax + cmpl $4194304,%eax + je .Lskip4x + subq $48,%rcx + movq $11547335547999543296,%rax + movdqu 48(%rsi),%xmm14 + movdqu 64(%rsi),%xmm15 - movdqu (%rdx),%xmm3 - movdqu 16(%rdx),%xmm6 -.byte 102,15,56,0,221 -.byte 102,15,56,0,245 - pxor %xmm3,%xmm0 - movdqa %xmm6,%xmm7 - pshufd $78,%xmm6,%xmm3 - pshufd $78,%xmm2,%xmm4 - pxor %xmm6,%xmm3 - pxor %xmm2,%xmm4 -.byte 102,15,58,68,242,0 -.byte 102,15,58,68,250,17 -.byte 102,15,58,68,220,0 - pxor %xmm6,%xmm3 - pxor %xmm7,%xmm3 - movdqa %xmm3,%xmm4 - psrldq $8,%xmm3 - pslldq $8,%xmm4 - pxor %xmm3,%xmm7 - pxor %xmm4,%xmm6 + movdqu 48(%rdx),%xmm3 + movdqu 32(%rdx),%xmm11 +.byte 102,65,15,56,0,218 +.byte 102,69,15,56,0,218 + movdqa %xmm3,%xmm5 + pshufd $78,%xmm3,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,68,218,0 +.byte 102,15,58,68,234,17 +.byte 102,15,58,68,231,0 + + movdqa %xmm11,%xmm13 + pshufd $78,%xmm11,%xmm12 + pxor %xmm11,%xmm12 +.byte 102,68,15,58,68,222,0 +.byte 102,68,15,58,68,238,17 +.byte 102,68,15,58,68,231,16 + xorps %xmm11,%xmm3 + xorps %xmm13,%xmm5 + movups 80(%rsi),%xmm7 + xorps %xmm12,%xmm4 + + movdqu 16(%rdx),%xmm11 + movdqu 0(%rdx),%xmm8 +.byte 102,69,15,56,0,218 +.byte 102,69,15,56,0,194 + movdqa %xmm11,%xmm13 + pshufd $78,%xmm11,%xmm12 + pxor %xmm8,%xmm0 + pxor %xmm11,%xmm12 +.byte 102,69,15,58,68,222,0 movdqa %xmm0,%xmm1 - pshufd $78,%xmm0,%xmm3 - pshufd $78,%xmm8,%xmm4 - pxor %xmm0,%xmm3 - pxor %xmm8,%xmm4 + pshufd $78,%xmm0,%xmm8 + pxor %xmm0,%xmm8 +.byte 102,69,15,58,68,238,17 +.byte 102,68,15,58,68,231,0 + xorps %xmm11,%xmm3 + xorps %xmm13,%xmm5 - leaq 32(%rdx),%rdx - subq $32,%rcx - jbe .Leven_tail + leaq 64(%rdx),%rdx + subq $64,%rcx + jc .Ltail4x -.Lmod_loop: -.byte 102,65,15,58,68,192,0 -.byte 102,65,15,58,68,200,17 -.byte 102,15,58,68,220,0 - pxor %xmm0,%xmm3 - pxor %xmm1,%xmm3 + jmp .Lmod4_loop +.align 32 +.Lmod4_loop: +.byte 102,65,15,58,68,199,0 + xorps %xmm12,%xmm4 + movdqu 48(%rdx),%xmm11 +.byte 102,69,15,56,0,218 +.byte 102,65,15,58,68,207,17 + xorps %xmm3,%xmm0 + movdqu 32(%rdx),%xmm3 + movdqa %xmm11,%xmm13 +.byte 102,68,15,58,68,199,16 + pshufd $78,%xmm11,%xmm12 + xorps %xmm5,%xmm1 + pxor %xmm11,%xmm12 +.byte 102,65,15,56,0,218 + movups 32(%rsi),%xmm7 + xorps %xmm4,%xmm8 +.byte 102,68,15,58,68,218,0 + pshufd $78,%xmm3,%xmm4 - movdqa %xmm3,%xmm4 - psrldq $8,%xmm3 - pslldq $8,%xmm4 - pxor %xmm3,%xmm1 - pxor %xmm4,%xmm0 - movdqu (%rdx),%xmm3 - pxor %xmm6,%xmm0 - pxor %xmm7,%xmm1 - - movdqu 16(%rdx),%xmm6 -.byte 102,15,56,0,221 -.byte 102,15,56,0,245 - - movdqa %xmm6,%xmm7 - pshufd $78,%xmm6,%xmm9 - pshufd $78,%xmm2,%xmm10 - pxor %xmm6,%xmm9 - pxor %xmm2,%xmm10 - pxor %xmm3,%xmm1 + pxor %xmm0,%xmm8 + movdqa %xmm3,%xmm5 + pxor %xmm1,%xmm8 + pxor %xmm3,%xmm4 + movdqa %xmm8,%xmm9 +.byte 102,68,15,58,68,234,17 + pslldq $8,%xmm8 + psrldq $8,%xmm9 + pxor %xmm8,%xmm0 + movdqa .L7_mask(%rip),%xmm8 + pxor %xmm9,%xmm1 +.byte 102,76,15,110,200 + + pand %xmm0,%xmm8 +.byte 102,69,15,56,0,200 + pxor %xmm0,%xmm9 +.byte 102,68,15,58,68,231,0 + psllq $57,%xmm9 + movdqa %xmm9,%xmm8 + pslldq $8,%xmm9 +.byte 102,15,58,68,222,0 + psrldq $8,%xmm8 + pxor %xmm9,%xmm0 + pxor %xmm8,%xmm1 + movdqu 0(%rdx),%xmm8 + + movdqa %xmm0,%xmm9 + psrlq $1,%xmm0 +.byte 102,15,58,68,238,17 + xorps %xmm11,%xmm3 + movdqu 16(%rdx),%xmm11 +.byte 102,69,15,56,0,218 +.byte 102,15,58,68,231,16 + xorps %xmm13,%xmm5 + movups 80(%rsi),%xmm7 +.byte 102,69,15,56,0,194 + pxor %xmm9,%xmm1 + pxor %xmm0,%xmm9 + psrlq $5,%xmm0 + + movdqa %xmm11,%xmm13 + pxor %xmm12,%xmm4 + pshufd $78,%xmm11,%xmm12 + pxor %xmm9,%xmm0 + pxor %xmm8,%xmm1 + pxor %xmm11,%xmm12 +.byte 102,69,15,58,68,222,0 + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 + movdqa %xmm0,%xmm1 +.byte 102,69,15,58,68,238,17 + xorps %xmm11,%xmm3 + pshufd $78,%xmm0,%xmm8 + pxor %xmm0,%xmm8 + +.byte 102,68,15,58,68,231,0 + xorps %xmm13,%xmm5 + leaq 64(%rdx),%rdx + subq $64,%rcx + jnc .Lmod4_loop + +.Ltail4x: +.byte 102,65,15,58,68,199,0 +.byte 102,65,15,58,68,207,17 +.byte 102,68,15,58,68,199,16 + xorps %xmm12,%xmm4 + xorps %xmm3,%xmm0 + xorps %xmm5,%xmm1 + pxor %xmm0,%xmm1 + pxor %xmm4,%xmm8 + + pxor %xmm1,%xmm8 + pxor %xmm0,%xmm1 + + movdqa %xmm8,%xmm9 + psrldq $8,%xmm8 + pslldq $8,%xmm9 + pxor %xmm8,%xmm1 + pxor %xmm9,%xmm0 + + movdqa %xmm0,%xmm4 movdqa %xmm0,%xmm3 - psllq $1,%xmm0 - pxor %xmm3,%xmm0 psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 pxor %xmm3,%xmm0 -.byte 102,15,58,68,242,0 psllq $57,%xmm0 - movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 pslldq $8,%xmm0 - psrldq $8,%xmm4 - pxor %xmm3,%xmm0 - pxor %xmm4,%xmm1 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 + -.byte 102,15,58,68,250,17 movdqa %xmm0,%xmm4 - psrlq $5,%xmm0 - pxor %xmm4,%xmm0 psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 pxor %xmm4,%xmm0 - pxor %xmm1,%xmm4 psrlq $1,%xmm0 - pxor %xmm4,%xmm0 + pxor %xmm1,%xmm0 + addq $64,%rcx + jz .Ldone + movdqu 32(%rsi),%xmm7 + subq $16,%rcx + jz .Lodd_tail +.Lskip4x: + + + + + + movdqu (%rdx),%xmm8 + movdqu 16(%rdx),%xmm3 +.byte 102,69,15,56,0,194 +.byte 102,65,15,56,0,218 + pxor %xmm8,%xmm0 -.byte 102,69,15,58,68,202,0 + movdqa %xmm3,%xmm5 + pshufd $78,%xmm3,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,68,218,0 +.byte 102,15,58,68,234,17 +.byte 102,15,58,68,231,0 + + leaq 32(%rdx),%rdx + nop + subq $32,%rcx + jbe .Leven_tail + nop + jmp .Lmod_loop + +.align 32 +.Lmod_loop: movdqa %xmm0,%xmm1 - pshufd $78,%xmm0,%xmm3 - pshufd $78,%xmm8,%xmm4 - pxor %xmm0,%xmm3 + movdqa %xmm4,%xmm8 + pshufd $78,%xmm0,%xmm4 + pxor %xmm0,%xmm4 + +.byte 102,15,58,68,198,0 +.byte 102,15,58,68,206,17 +.byte 102,15,58,68,231,16 + + pxor %xmm3,%xmm0 + pxor %xmm5,%xmm1 + movdqu (%rdx),%xmm9 + pxor %xmm0,%xmm8 +.byte 102,69,15,56,0,202 + movdqu 16(%rdx),%xmm3 + + pxor %xmm1,%xmm8 + pxor %xmm9,%xmm1 pxor %xmm8,%xmm4 +.byte 102,65,15,56,0,218 + movdqa %xmm4,%xmm8 + psrldq $8,%xmm8 + pslldq $8,%xmm4 + pxor %xmm8,%xmm1 + pxor %xmm4,%xmm0 - pxor %xmm6,%xmm9 - pxor %xmm7,%xmm9 - movdqa %xmm9,%xmm10 - psrldq $8,%xmm9 - pslldq $8,%xmm10 - pxor %xmm9,%xmm7 - pxor %xmm10,%xmm6 + movdqa %xmm3,%xmm5 + + movdqa %xmm0,%xmm9 + movdqa %xmm0,%xmm8 + psllq $5,%xmm0 + pxor %xmm0,%xmm8 +.byte 102,15,58,68,218,0 + psllq $1,%xmm0 + pxor %xmm8,%xmm0 + psllq $57,%xmm0 + movdqa %xmm0,%xmm8 + pslldq $8,%xmm0 + psrldq $8,%xmm8 + pxor %xmm9,%xmm0 + pshufd $78,%xmm5,%xmm4 + pxor %xmm8,%xmm1 + pxor %xmm5,%xmm4 + movdqa %xmm0,%xmm9 + psrlq $1,%xmm0 +.byte 102,15,58,68,234,17 + pxor %xmm9,%xmm1 + pxor %xmm0,%xmm9 + psrlq $5,%xmm0 + pxor %xmm9,%xmm0 leaq 32(%rdx),%rdx + psrlq $1,%xmm0 +.byte 102,15,58,68,231,0 + pxor %xmm1,%xmm0 + subq $32,%rcx ja .Lmod_loop .Leven_tail: -.byte 102,65,15,58,68,192,0 -.byte 102,65,15,58,68,200,17 -.byte 102,15,58,68,220,0 - pxor %xmm0,%xmm3 - pxor %xmm1,%xmm3 + movdqa %xmm0,%xmm1 + movdqa %xmm4,%xmm8 + pshufd $78,%xmm0,%xmm4 + pxor %xmm0,%xmm4 - movdqa %xmm3,%xmm4 - psrldq $8,%xmm3 +.byte 102,15,58,68,198,0 +.byte 102,15,58,68,206,17 +.byte 102,15,58,68,231,16 + + pxor %xmm3,%xmm0 + pxor %xmm5,%xmm1 + pxor %xmm0,%xmm8 + pxor %xmm1,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm8 + psrldq $8,%xmm8 pslldq $8,%xmm4 - pxor %xmm3,%xmm1 + pxor %xmm8,%xmm1 pxor %xmm4,%xmm0 - pxor %xmm6,%xmm0 - pxor %xmm7,%xmm1 + movdqa %xmm0,%xmm4 movdqa %xmm0,%xmm3 - psllq $1,%xmm0 - pxor %xmm3,%xmm0 psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 pxor %xmm3,%xmm0 psllq $57,%xmm0 - movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 pslldq $8,%xmm0 - psrldq $8,%xmm4 - pxor %xmm3,%xmm0 - pxor %xmm4,%xmm1 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 movdqa %xmm0,%xmm4 - psrlq $5,%xmm0 - pxor %xmm4,%xmm0 psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 pxor %xmm4,%xmm0 - pxor %xmm1,%xmm4 psrlq $1,%xmm0 - pxor %xmm4,%xmm0 + pxor %xmm1,%xmm0 testq %rcx,%rcx jnz .Ldone .Lodd_tail: - movdqu (%rdx),%xmm3 -.byte 102,15,56,0,221 - pxor %xmm3,%xmm0 + movdqu (%rdx),%xmm8 +.byte 102,69,15,56,0,194 + pxor %xmm8,%xmm0 movdqa %xmm0,%xmm1 pshufd $78,%xmm0,%xmm3 - pshufd $78,%xmm2,%xmm4 pxor %xmm0,%xmm3 - pxor %xmm2,%xmm4 .byte 102,15,58,68,194,0 .byte 102,15,58,68,202,17 -.byte 102,15,58,68,220,0 +.byte 102,15,58,68,223,0 pxor %xmm0,%xmm3 pxor %xmm1,%xmm3 @@ -949,38 +1219,60 @@ gcm_ghash_clmul: pxor %xmm3,%xmm1 pxor %xmm4,%xmm0 + movdqa %xmm0,%xmm4 movdqa %xmm0,%xmm3 - psllq $1,%xmm0 - pxor %xmm3,%xmm0 psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 pxor %xmm3,%xmm0 psllq $57,%xmm0 - movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 pslldq $8,%xmm0 - psrldq $8,%xmm4 - pxor %xmm3,%xmm0 - pxor %xmm4,%xmm1 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 movdqa %xmm0,%xmm4 - psrlq $5,%xmm0 - pxor %xmm4,%xmm0 psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 pxor %xmm4,%xmm0 - pxor %xmm1,%xmm4 psrlq $1,%xmm0 - pxor %xmm4,%xmm0 + pxor %xmm1,%xmm0 .Ldone: -.byte 102,15,56,0,197 +.byte 102,65,15,56,0,194 movdqu %xmm0,(%rdi) .byte 0xf3,0xc3 -.LSEH_end_gcm_ghash_clmul: .size gcm_ghash_clmul,.-gcm_ghash_clmul +.globl gcm_init_avx +.type gcm_init_avx,@function +.align 32 +gcm_init_avx: + jmp .L_init_clmul +.size gcm_init_avx,.-gcm_init_avx +.globl gcm_gmult_avx +.type gcm_gmult_avx,@function +.align 32 +gcm_gmult_avx: + jmp .L_gmult_clmul +.size gcm_gmult_avx,.-gcm_gmult_avx +.globl gcm_ghash_avx +.type gcm_ghash_avx,@function +.align 32 +gcm_ghash_avx: + jmp .L_ghash_clmul +.size gcm_ghash_avx,.-gcm_ghash_avx .align 64 .Lbswap_mask: .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 .L0x1c2_polynomial: .byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2 +.L7_mask: +.long 7,0,7,0 +.L7_mask_poly: +.long 7,0,450,0 .align 64 .type .Lrem_4bit,@object .Lrem_4bit: diff --git a/secure/lib/libcrypto/amd64/md5-x86_64.S b/secure/lib/libcrypto/amd64/md5-x86_64.S index c592dcc..94fb761 100644 --- a/secure/lib/libcrypto/amd64/md5-x86_64.S +++ b/secure/lib/libcrypto/amd64/md5-x86_64.S @@ -30,7 +30,7 @@ md5_block_asm_data_order: cmpq %rdi,%rsi - je .Lend + je .Lend .Lloop: @@ -649,7 +649,7 @@ md5_block_asm_data_order: addq $64,%rsi cmpq %rdi,%rsi - jb .Lloop + jb .Lloop .Lend: diff --git a/secure/lib/libcrypto/amd64/modexp512-x86_64.S b/secure/lib/libcrypto/amd64/modexp512-x86_64.S deleted file mode 100644 index 71072ad..0000000 --- a/secure/lib/libcrypto/amd64/modexp512-x86_64.S +++ /dev/null @@ -1,1774 +0,0 @@ - # $FreeBSD$ -.text - -.type MULADD_128x512,@function -.align 16 -MULADD_128x512: - movq 0(%rsi),%rax - mulq %rbp - addq %rax,%r8 - adcq $0,%rdx - movq %r8,0(%rcx) - movq %rdx,%rbx - - movq 8(%rsi),%rax - mulq %rbp - addq %rax,%r9 - adcq $0,%rdx - addq %rbx,%r9 - adcq $0,%rdx - movq %rdx,%rbx - - movq 16(%rsi),%rax - mulq %rbp - addq %rax,%r10 - adcq $0,%rdx - addq %rbx,%r10 - adcq $0,%rdx - movq %rdx,%rbx - - movq 24(%rsi),%rax - mulq %rbp - addq %rax,%r11 - adcq $0,%rdx - addq %rbx,%r11 - adcq $0,%rdx - movq %rdx,%rbx - - movq 32(%rsi),%rax - mulq %rbp - addq %rax,%r12 - adcq $0,%rdx - addq %rbx,%r12 - adcq $0,%rdx - movq %rdx,%rbx - - movq 40(%rsi),%rax - mulq %rbp - addq %rax,%r13 - adcq $0,%rdx - addq %rbx,%r13 - adcq $0,%rdx - movq %rdx,%rbx - - movq 48(%rsi),%rax - mulq %rbp - addq %rax,%r14 - adcq $0,%rdx - addq %rbx,%r14 - adcq $0,%rdx - movq %rdx,%rbx - - movq 56(%rsi),%rax - mulq %rbp - addq %rax,%r15 - adcq $0,%rdx - addq %rbx,%r15 - adcq $0,%rdx - movq %rdx,%r8 - movq 8(%rdi),%rbp - movq 0(%rsi),%rax - mulq %rbp - addq %rax,%r9 - adcq $0,%rdx - movq %r9,8(%rcx) - movq %rdx,%rbx - - movq 8(%rsi),%rax - mulq %rbp - addq %rax,%r10 - adcq $0,%rdx - addq %rbx,%r10 - adcq $0,%rdx - movq %rdx,%rbx - - movq 16(%rsi),%rax - mulq %rbp - addq %rax,%r11 - adcq $0,%rdx - addq %rbx,%r11 - adcq $0,%rdx - movq %rdx,%rbx - - movq 24(%rsi),%rax - mulq %rbp - addq %rax,%r12 - adcq $0,%rdx - addq %rbx,%r12 - adcq $0,%rdx - movq %rdx,%rbx - - movq 32(%rsi),%rax - mulq %rbp - addq %rax,%r13 - adcq $0,%rdx - addq %rbx,%r13 - adcq $0,%rdx - movq %rdx,%rbx - - movq 40(%rsi),%rax - mulq %rbp - addq %rax,%r14 - adcq $0,%rdx - addq %rbx,%r14 - adcq $0,%rdx - movq %rdx,%rbx - - movq 48(%rsi),%rax - mulq %rbp - addq %rax,%r15 - adcq $0,%rdx - addq %rbx,%r15 - adcq $0,%rdx - movq %rdx,%rbx - - movq 56(%rsi),%rax - mulq %rbp - addq %rax,%r8 - adcq $0,%rdx - addq %rbx,%r8 - adcq $0,%rdx - movq %rdx,%r9 - .byte 0xf3,0xc3 -.size MULADD_128x512,.-MULADD_128x512 -.type mont_reduce,@function -.align 16 -mont_reduce: - leaq 192(%rsp),%rdi - movq 32(%rsp),%rsi - addq $576,%rsi - leaq 520(%rsp),%rcx - - movq 96(%rcx),%rbp - movq 0(%rsi),%rax - mulq %rbp - movq (%rcx),%r8 - addq %rax,%r8 - adcq $0,%rdx - movq %r8,0(%rdi) - movq %rdx,%rbx - - movq 8(%rsi),%rax - mulq %rbp - movq 8(%rcx),%r9 - addq %rax,%r9 - adcq $0,%rdx - addq %rbx,%r9 - adcq $0,%rdx - movq %rdx,%rbx - - movq 16(%rsi),%rax - mulq %rbp - movq 16(%rcx),%r10 - addq %rax,%r10 - adcq $0,%rdx - addq %rbx,%r10 - adcq $0,%rdx - movq %rdx,%rbx - - movq 24(%rsi),%rax - mulq %rbp - movq 24(%rcx),%r11 - addq %rax,%r11 - adcq $0,%rdx - addq %rbx,%r11 - adcq $0,%rdx - movq %rdx,%rbx - - movq 32(%rsi),%rax - mulq %rbp - movq 32(%rcx),%r12 - addq %rax,%r12 - adcq $0,%rdx - addq %rbx,%r12 - adcq $0,%rdx - movq %rdx,%rbx - - movq 40(%rsi),%rax - mulq %rbp - movq 40(%rcx),%r13 - addq %rax,%r13 - adcq $0,%rdx - addq %rbx,%r13 - adcq $0,%rdx - movq %rdx,%rbx - - movq 48(%rsi),%rax - mulq %rbp - movq 48(%rcx),%r14 - addq %rax,%r14 - adcq $0,%rdx - addq %rbx,%r14 - adcq $0,%rdx - movq %rdx,%rbx - - movq 56(%rsi),%rax - mulq %rbp - movq 56(%rcx),%r15 - addq %rax,%r15 - adcq $0,%rdx - addq %rbx,%r15 - adcq $0,%rdx - movq %rdx,%r8 - movq 104(%rcx),%rbp - movq 0(%rsi),%rax - mulq %rbp - addq %rax,%r9 - adcq $0,%rdx - movq %r9,8(%rdi) - movq %rdx,%rbx - - movq 8(%rsi),%rax - mulq %rbp - addq %rax,%r10 - adcq $0,%rdx - addq %rbx,%r10 - adcq $0,%rdx - movq %rdx,%rbx - - movq 16(%rsi),%rax - mulq %rbp - addq %rax,%r11 - adcq $0,%rdx - addq %rbx,%r11 - adcq $0,%rdx - movq %rdx,%rbx - - movq 24(%rsi),%rax - mulq %rbp - addq %rax,%r12 - adcq $0,%rdx - addq %rbx,%r12 - adcq $0,%rdx - movq %rdx,%rbx - - movq 32(%rsi),%rax - mulq %rbp - addq %rax,%r13 - adcq $0,%rdx - addq %rbx,%r13 - adcq $0,%rdx - movq %rdx,%rbx - - movq 40(%rsi),%rax - mulq %rbp - addq %rax,%r14 - adcq $0,%rdx - addq %rbx,%r14 - adcq $0,%rdx - movq %rdx,%rbx - - movq 48(%rsi),%rax - mulq %rbp - addq %rax,%r15 - adcq $0,%rdx - addq %rbx,%r15 - adcq $0,%rdx - movq %rdx,%rbx - - movq 56(%rsi),%rax - mulq %rbp - addq %rax,%r8 - adcq $0,%rdx - addq %rbx,%r8 - adcq $0,%rdx - movq %rdx,%r9 - movq 112(%rcx),%rbp - movq 0(%rsi),%rax - mulq %rbp - addq %rax,%r10 - adcq $0,%rdx - movq %r10,16(%rdi) - movq %rdx,%rbx - - movq 8(%rsi),%rax - mulq %rbp - addq %rax,%r11 - adcq $0,%rdx - addq %rbx,%r11 - adcq $0,%rdx - movq %rdx,%rbx - - movq 16(%rsi),%rax - mulq %rbp - addq %rax,%r12 - adcq $0,%rdx - addq %rbx,%r12 - adcq $0,%rdx - movq %rdx,%rbx - - movq 24(%rsi),%rax - mulq %rbp - addq %rax,%r13 - adcq $0,%rdx - addq %rbx,%r13 - adcq $0,%rdx - movq %rdx,%rbx - - movq 32(%rsi),%rax - mulq %rbp - addq %rax,%r14 - adcq $0,%rdx - addq %rbx,%r14 - adcq $0,%rdx - movq %rdx,%rbx - - movq 40(%rsi),%rax - mulq %rbp - addq %rax,%r15 - adcq $0,%rdx - addq %rbx,%r15 - adcq $0,%rdx - movq %rdx,%rbx - - movq 48(%rsi),%rax - mulq %rbp - addq %rax,%r8 - adcq $0,%rdx - addq %rbx,%r8 - adcq $0,%rdx - movq %rdx,%rbx - - movq 56(%rsi),%rax - mulq %rbp - addq %rax,%r9 - adcq $0,%rdx - addq %rbx,%r9 - adcq $0,%rdx - movq %rdx,%r10 - movq 120(%rcx),%rbp - movq 0(%rsi),%rax - mulq %rbp - addq %rax,%r11 - adcq $0,%rdx - movq %r11,24(%rdi) - movq %rdx,%rbx - - movq 8(%rsi),%rax - mulq %rbp - addq %rax,%r12 - adcq $0,%rdx - addq %rbx,%r12 - adcq $0,%rdx - movq %rdx,%rbx - - movq 16(%rsi),%rax - mulq %rbp - addq %rax,%r13 - adcq $0,%rdx - addq %rbx,%r13 - adcq $0,%rdx - movq %rdx,%rbx - - movq 24(%rsi),%rax - mulq %rbp - addq %rax,%r14 - adcq $0,%rdx - addq %rbx,%r14 - adcq $0,%rdx - movq %rdx,%rbx - - movq 32(%rsi),%rax - mulq %rbp - addq %rax,%r15 - adcq $0,%rdx - addq %rbx,%r15 - adcq $0,%rdx - movq %rdx,%rbx - - movq 40(%rsi),%rax - mulq %rbp - addq %rax,%r8 - adcq $0,%rdx - addq %rbx,%r8 - adcq $0,%rdx - movq %rdx,%rbx - - movq 48(%rsi),%rax - mulq %rbp - addq %rax,%r9 - adcq $0,%rdx - addq %rbx,%r9 - adcq $0,%rdx - movq %rdx,%rbx - - movq 56(%rsi),%rax - mulq %rbp - addq %rax,%r10 - adcq $0,%rdx - addq %rbx,%r10 - adcq $0,%rdx - movq %rdx,%r11 - xorq %rax,%rax - - addq 64(%rcx),%r8 - adcq 72(%rcx),%r9 - adcq 80(%rcx),%r10 - adcq 88(%rcx),%r11 - adcq $0,%rax - - - - - movq %r8,64(%rdi) - movq %r9,72(%rdi) - movq %r10,%rbp - movq %r11,88(%rdi) - - movq %rax,384(%rsp) - - movq 0(%rdi),%r8 - movq 8(%rdi),%r9 - movq 16(%rdi),%r10 - movq 24(%rdi),%r11 - - - - - - - - - addq $80,%rdi - - addq $64,%rsi - leaq 296(%rsp),%rcx - - call MULADD_128x512 - - movq 384(%rsp),%rax - - - addq -16(%rdi),%r8 - adcq -8(%rdi),%r9 - movq %r8,64(%rcx) - movq %r9,72(%rcx) - - adcq %rax,%rax - movq %rax,384(%rsp) - - leaq 192(%rsp),%rdi - addq $64,%rsi - - - - - - movq (%rsi),%r8 - movq 8(%rsi),%rbx - - movq (%rcx),%rax - mulq %r8 - movq %rax,%rbp - movq %rdx,%r9 - - movq 8(%rcx),%rax - mulq %r8 - addq %rax,%r9 - - movq (%rcx),%rax - mulq %rbx - addq %rax,%r9 - - movq %r9,8(%rdi) - - - subq $192,%rsi - - movq (%rcx),%r8 - movq 8(%rcx),%r9 - - call MULADD_128x512 - - - - - movq 0(%rsi),%rax - movq 8(%rsi),%rbx - movq 16(%rsi),%rdi - movq 24(%rsi),%rdx - - - movq 384(%rsp),%rbp - - addq 64(%rcx),%r8 - adcq 72(%rcx),%r9 - - - adcq %rbp,%rbp - - - - shlq $3,%rbp - movq 32(%rsp),%rcx - addq %rcx,%rbp - - - xorq %rsi,%rsi - - addq 0(%rbp),%r10 - adcq 64(%rbp),%r11 - adcq 128(%rbp),%r12 - adcq 192(%rbp),%r13 - adcq 256(%rbp),%r14 - adcq 320(%rbp),%r15 - adcq 384(%rbp),%r8 - adcq 448(%rbp),%r9 - - - - sbbq $0,%rsi - - - andq %rsi,%rax - andq %rsi,%rbx - andq %rsi,%rdi - andq %rsi,%rdx - - movq $1,%rbp - subq %rax,%r10 - sbbq %rbx,%r11 - sbbq %rdi,%r12 - sbbq %rdx,%r13 - - - - - sbbq $0,%rbp - - - - addq $512,%rcx - movq 32(%rcx),%rax - movq 40(%rcx),%rbx - movq 48(%rcx),%rdi - movq 56(%rcx),%rdx - - - - andq %rsi,%rax - andq %rsi,%rbx - andq %rsi,%rdi - andq %rsi,%rdx - - - - subq $1,%rbp - - sbbq %rax,%r14 - sbbq %rbx,%r15 - sbbq %rdi,%r8 - sbbq %rdx,%r9 - - - - movq 144(%rsp),%rsi - movq %r10,0(%rsi) - movq %r11,8(%rsi) - movq %r12,16(%rsi) - movq %r13,24(%rsi) - movq %r14,32(%rsi) - movq %r15,40(%rsi) - movq %r8,48(%rsi) - movq %r9,56(%rsi) - - .byte 0xf3,0xc3 -.size mont_reduce,.-mont_reduce -.type mont_mul_a3b,@function -.align 16 -mont_mul_a3b: - - - - - movq 0(%rdi),%rbp - - movq %r10,%rax - mulq %rbp - movq %rax,520(%rsp) - movq %rdx,%r10 - movq %r11,%rax - mulq %rbp - addq %rax,%r10 - adcq $0,%rdx - movq %rdx,%r11 - movq %r12,%rax - mulq %rbp - addq %rax,%r11 - adcq $0,%rdx - movq %rdx,%r12 - movq %r13,%rax - mulq %rbp - addq %rax,%r12 - adcq $0,%rdx - movq %rdx,%r13 - movq %r14,%rax - mulq %rbp - addq %rax,%r13 - adcq $0,%rdx - movq %rdx,%r14 - movq %r15,%rax - mulq %rbp - addq %rax,%r14 - adcq $0,%rdx - movq %rdx,%r15 - movq %r8,%rax - mulq %rbp - addq %rax,%r15 - adcq $0,%rdx - movq %rdx,%r8 - movq %r9,%rax - mulq %rbp - addq %rax,%r8 - adcq $0,%rdx - movq %rdx,%r9 - movq 8(%rdi),%rbp - movq 0(%rsi),%rax - mulq %rbp - addq %rax,%r10 - adcq $0,%rdx - movq %r10,528(%rsp) - movq %rdx,%rbx - - movq 8(%rsi),%rax - mulq %rbp - addq %rax,%r11 - adcq $0,%rdx - addq %rbx,%r11 - adcq $0,%rdx - movq %rdx,%rbx - - movq 16(%rsi),%rax - mulq %rbp - addq %rax,%r12 - adcq $0,%rdx - addq %rbx,%r12 - adcq $0,%rdx - movq %rdx,%rbx - - movq 24(%rsi),%rax - mulq %rbp - addq %rax,%r13 - adcq $0,%rdx - addq %rbx,%r13 - adcq $0,%rdx - movq %rdx,%rbx - - movq 32(%rsi),%rax - mulq %rbp - addq %rax,%r14 - adcq $0,%rdx - addq %rbx,%r14 - adcq $0,%rdx - movq %rdx,%rbx - - movq 40(%rsi),%rax - mulq %rbp - addq %rax,%r15 - adcq $0,%rdx - addq %rbx,%r15 - adcq $0,%rdx - movq %rdx,%rbx - - movq 48(%rsi),%rax - mulq %rbp - addq %rax,%r8 - adcq $0,%rdx - addq %rbx,%r8 - adcq $0,%rdx - movq %rdx,%rbx - - movq 56(%rsi),%rax - mulq %rbp - addq %rax,%r9 - adcq $0,%rdx - addq %rbx,%r9 - adcq $0,%rdx - movq %rdx,%r10 - movq 16(%rdi),%rbp - movq 0(%rsi),%rax - mulq %rbp - addq %rax,%r11 - adcq $0,%rdx - movq %r11,536(%rsp) - movq %rdx,%rbx - - movq 8(%rsi),%rax - mulq %rbp - addq %rax,%r12 - adcq $0,%rdx - addq %rbx,%r12 - adcq $0,%rdx - movq %rdx,%rbx - - movq 16(%rsi),%rax - mulq %rbp - addq %rax,%r13 - adcq $0,%rdx - addq %rbx,%r13 - adcq $0,%rdx - movq %rdx,%rbx - - movq 24(%rsi),%rax - mulq %rbp - addq %rax,%r14 - adcq $0,%rdx - addq %rbx,%r14 - adcq $0,%rdx - movq %rdx,%rbx - - movq 32(%rsi),%rax - mulq %rbp - addq %rax,%r15 - adcq $0,%rdx - addq %rbx,%r15 - adcq $0,%rdx - movq %rdx,%rbx - - movq 40(%rsi),%rax - mulq %rbp - addq %rax,%r8 - adcq $0,%rdx - addq %rbx,%r8 - adcq $0,%rdx - movq %rdx,%rbx - - movq 48(%rsi),%rax - mulq %rbp - addq %rax,%r9 - adcq $0,%rdx - addq %rbx,%r9 - adcq $0,%rdx - movq %rdx,%rbx - - movq 56(%rsi),%rax - mulq %rbp - addq %rax,%r10 - adcq $0,%rdx - addq %rbx,%r10 - adcq $0,%rdx - movq %rdx,%r11 - movq 24(%rdi),%rbp - movq 0(%rsi),%rax - mulq %rbp - addq %rax,%r12 - adcq $0,%rdx - movq %r12,544(%rsp) - movq %rdx,%rbx - - movq 8(%rsi),%rax - mulq %rbp - addq %rax,%r13 - adcq $0,%rdx - addq %rbx,%r13 - adcq $0,%rdx - movq %rdx,%rbx - - movq 16(%rsi),%rax - mulq %rbp - addq %rax,%r14 - adcq $0,%rdx - addq %rbx,%r14 - adcq $0,%rdx - movq %rdx,%rbx - - movq 24(%rsi),%rax - mulq %rbp - addq %rax,%r15 - adcq $0,%rdx - addq %rbx,%r15 - adcq $0,%rdx - movq %rdx,%rbx - - movq 32(%rsi),%rax - mulq %rbp - addq %rax,%r8 - adcq $0,%rdx - addq %rbx,%r8 - adcq $0,%rdx - movq %rdx,%rbx - - movq 40(%rsi),%rax - mulq %rbp - addq %rax,%r9 - adcq $0,%rdx - addq %rbx,%r9 - adcq $0,%rdx - movq %rdx,%rbx - - movq 48(%rsi),%rax - mulq %rbp - addq %rax,%r10 - adcq $0,%rdx - addq %rbx,%r10 - adcq $0,%rdx - movq %rdx,%rbx - - movq 56(%rsi),%rax - mulq %rbp - addq %rax,%r11 - adcq $0,%rdx - addq %rbx,%r11 - adcq $0,%rdx - movq %rdx,%r12 - movq 32(%rdi),%rbp - movq 0(%rsi),%rax - mulq %rbp - addq %rax,%r13 - adcq $0,%rdx - movq %r13,552(%rsp) - movq %rdx,%rbx - - movq 8(%rsi),%rax - mulq %rbp - addq %rax,%r14 - adcq $0,%rdx - addq %rbx,%r14 - adcq $0,%rdx - movq %rdx,%rbx - - movq 16(%rsi),%rax - mulq %rbp - addq %rax,%r15 - adcq $0,%rdx - addq %rbx,%r15 - adcq $0,%rdx - movq %rdx,%rbx - - movq 24(%rsi),%rax - mulq %rbp - addq %rax,%r8 - adcq $0,%rdx - addq %rbx,%r8 - adcq $0,%rdx - movq %rdx,%rbx - - movq 32(%rsi),%rax - mulq %rbp - addq %rax,%r9 - adcq $0,%rdx - addq %rbx,%r9 - adcq $0,%rdx - movq %rdx,%rbx - - movq 40(%rsi),%rax - mulq %rbp - addq %rax,%r10 - adcq $0,%rdx - addq %rbx,%r10 - adcq $0,%rdx - movq %rdx,%rbx - - movq 48(%rsi),%rax - mulq %rbp - addq %rax,%r11 - adcq $0,%rdx - addq %rbx,%r11 - adcq $0,%rdx - movq %rdx,%rbx - - movq 56(%rsi),%rax - mulq %rbp - addq %rax,%r12 - adcq $0,%rdx - addq %rbx,%r12 - adcq $0,%rdx - movq %rdx,%r13 - movq 40(%rdi),%rbp - movq 0(%rsi),%rax - mulq %rbp - addq %rax,%r14 - adcq $0,%rdx - movq %r14,560(%rsp) - movq %rdx,%rbx - - movq 8(%rsi),%rax - mulq %rbp - addq %rax,%r15 - adcq $0,%rdx - addq %rbx,%r15 - adcq $0,%rdx - movq %rdx,%rbx - - movq 16(%rsi),%rax - mulq %rbp - addq %rax,%r8 - adcq $0,%rdx - addq %rbx,%r8 - adcq $0,%rdx - movq %rdx,%rbx - - movq 24(%rsi),%rax - mulq %rbp - addq %rax,%r9 - adcq $0,%rdx - addq %rbx,%r9 - adcq $0,%rdx - movq %rdx,%rbx - - movq 32(%rsi),%rax - mulq %rbp - addq %rax,%r10 - adcq $0,%rdx - addq %rbx,%r10 - adcq $0,%rdx - movq %rdx,%rbx - - movq 40(%rsi),%rax - mulq %rbp - addq %rax,%r11 - adcq $0,%rdx - addq %rbx,%r11 - adcq $0,%rdx - movq %rdx,%rbx - - movq 48(%rsi),%rax - mulq %rbp - addq %rax,%r12 - adcq $0,%rdx - addq %rbx,%r12 - adcq $0,%rdx - movq %rdx,%rbx - - movq 56(%rsi),%rax - mulq %rbp - addq %rax,%r13 - adcq $0,%rdx - addq %rbx,%r13 - adcq $0,%rdx - movq %rdx,%r14 - movq 48(%rdi),%rbp - movq 0(%rsi),%rax - mulq %rbp - addq %rax,%r15 - adcq $0,%rdx - movq %r15,568(%rsp) - movq %rdx,%rbx - - movq 8(%rsi),%rax - mulq %rbp - addq %rax,%r8 - adcq $0,%rdx - addq %rbx,%r8 - adcq $0,%rdx - movq %rdx,%rbx - - movq 16(%rsi),%rax - mulq %rbp - addq %rax,%r9 - adcq $0,%rdx - addq %rbx,%r9 - adcq $0,%rdx - movq %rdx,%rbx - - movq 24(%rsi),%rax - mulq %rbp - addq %rax,%r10 - adcq $0,%rdx - addq %rbx,%r10 - adcq $0,%rdx - movq %rdx,%rbx - - movq 32(%rsi),%rax - mulq %rbp - addq %rax,%r11 - adcq $0,%rdx - addq %rbx,%r11 - adcq $0,%rdx - movq %rdx,%rbx - - movq 40(%rsi),%rax - mulq %rbp - addq %rax,%r12 - adcq $0,%rdx - addq %rbx,%r12 - adcq $0,%rdx - movq %rdx,%rbx - - movq 48(%rsi),%rax - mulq %rbp - addq %rax,%r13 - adcq $0,%rdx - addq %rbx,%r13 - adcq $0,%rdx - movq %rdx,%rbx - - movq 56(%rsi),%rax - mulq %rbp - addq %rax,%r14 - adcq $0,%rdx - addq %rbx,%r14 - adcq $0,%rdx - movq %rdx,%r15 - movq 56(%rdi),%rbp - movq 0(%rsi),%rax - mulq %rbp - addq %rax,%r8 - adcq $0,%rdx - movq %r8,576(%rsp) - movq %rdx,%rbx - - movq 8(%rsi),%rax - mulq %rbp - addq %rax,%r9 - adcq $0,%rdx - addq %rbx,%r9 - adcq $0,%rdx - movq %rdx,%rbx - - movq 16(%rsi),%rax - mulq %rbp - addq %rax,%r10 - adcq $0,%rdx - addq %rbx,%r10 - adcq $0,%rdx - movq %rdx,%rbx - - movq 24(%rsi),%rax - mulq %rbp - addq %rax,%r11 - adcq $0,%rdx - addq %rbx,%r11 - adcq $0,%rdx - movq %rdx,%rbx - - movq 32(%rsi),%rax - mulq %rbp - addq %rax,%r12 - adcq $0,%rdx - addq %rbx,%r12 - adcq $0,%rdx - movq %rdx,%rbx - - movq 40(%rsi),%rax - mulq %rbp - addq %rax,%r13 - adcq $0,%rdx - addq %rbx,%r13 - adcq $0,%rdx - movq %rdx,%rbx - - movq 48(%rsi),%rax - mulq %rbp - addq %rax,%r14 - adcq $0,%rdx - addq %rbx,%r14 - adcq $0,%rdx - movq %rdx,%rbx - - movq 56(%rsi),%rax - mulq %rbp - addq %rax,%r15 - adcq $0,%rdx - addq %rbx,%r15 - adcq $0,%rdx - movq %rdx,%r8 - movq %r9,584(%rsp) - movq %r10,592(%rsp) - movq %r11,600(%rsp) - movq %r12,608(%rsp) - movq %r13,616(%rsp) - movq %r14,624(%rsp) - movq %r15,632(%rsp) - movq %r8,640(%rsp) - - - - - - jmp mont_reduce - - -.size mont_mul_a3b,.-mont_mul_a3b -.type sqr_reduce,@function -.align 16 -sqr_reduce: - movq 16(%rsp),%rcx - - - - movq %r10,%rbx - - movq %r11,%rax - mulq %rbx - movq %rax,528(%rsp) - movq %rdx,%r10 - movq %r12,%rax - mulq %rbx - addq %rax,%r10 - adcq $0,%rdx - movq %rdx,%r11 - movq %r13,%rax - mulq %rbx - addq %rax,%r11 - adcq $0,%rdx - movq %rdx,%r12 - movq %r14,%rax - mulq %rbx - addq %rax,%r12 - adcq $0,%rdx - movq %rdx,%r13 - movq %r15,%rax - mulq %rbx - addq %rax,%r13 - adcq $0,%rdx - movq %rdx,%r14 - movq %r8,%rax - mulq %rbx - addq %rax,%r14 - adcq $0,%rdx - movq %rdx,%r15 - movq %r9,%rax - mulq %rbx - addq %rax,%r15 - adcq $0,%rdx - movq %rdx,%rsi - - movq %r10,536(%rsp) - - - - - - movq 8(%rcx),%rbx - - movq 16(%rcx),%rax - mulq %rbx - addq %rax,%r11 - adcq $0,%rdx - movq %r11,544(%rsp) - - movq %rdx,%r10 - movq 24(%rcx),%rax - mulq %rbx - addq %rax,%r12 - adcq $0,%rdx - addq %r10,%r12 - adcq $0,%rdx - movq %r12,552(%rsp) - - movq %rdx,%r10 - movq 32(%rcx),%rax - mulq %rbx - addq %rax,%r13 - adcq $0,%rdx - addq %r10,%r13 - adcq $0,%rdx - - movq %rdx,%r10 - movq 40(%rcx),%rax - mulq %rbx - addq %rax,%r14 - adcq $0,%rdx - addq %r10,%r14 - adcq $0,%rdx - - movq %rdx,%r10 - movq %r8,%rax - mulq %rbx - addq %rax,%r15 - adcq $0,%rdx - addq %r10,%r15 - adcq $0,%rdx - - movq %rdx,%r10 - movq %r9,%rax - mulq %rbx - addq %rax,%rsi - adcq $0,%rdx - addq %r10,%rsi - adcq $0,%rdx - - movq %rdx,%r11 - - - - - movq 16(%rcx),%rbx - - movq 24(%rcx),%rax - mulq %rbx - addq %rax,%r13 - adcq $0,%rdx - movq %r13,560(%rsp) - - movq %rdx,%r10 - movq 32(%rcx),%rax - mulq %rbx - addq %rax,%r14 - adcq $0,%rdx - addq %r10,%r14 - adcq $0,%rdx - movq %r14,568(%rsp) - - movq %rdx,%r10 - movq 40(%rcx),%rax - mulq %rbx - addq %rax,%r15 - adcq $0,%rdx - addq %r10,%r15 - adcq $0,%rdx - - movq %rdx,%r10 - movq %r8,%rax - mulq %rbx - addq %rax,%rsi - adcq $0,%rdx - addq %r10,%rsi - adcq $0,%rdx - - movq %rdx,%r10 - movq %r9,%rax - mulq %rbx - addq %rax,%r11 - adcq $0,%rdx - addq %r10,%r11 - adcq $0,%rdx - - movq %rdx,%r12 - - - - - - movq 24(%rcx),%rbx - - movq 32(%rcx),%rax - mulq %rbx - addq %rax,%r15 - adcq $0,%rdx - movq %r15,576(%rsp) - - movq %rdx,%r10 - movq 40(%rcx),%rax - mulq %rbx - addq %rax,%rsi - adcq $0,%rdx - addq %r10,%rsi - adcq $0,%rdx - movq %rsi,584(%rsp) - - movq %rdx,%r10 - movq %r8,%rax - mulq %rbx - addq %rax,%r11 - adcq $0,%rdx - addq %r10,%r11 - adcq $0,%rdx - - movq %rdx,%r10 - movq %r9,%rax - mulq %rbx - addq %rax,%r12 - adcq $0,%rdx - addq %r10,%r12 - adcq $0,%rdx - - movq %rdx,%r15 - - - - - movq 32(%rcx),%rbx - - movq 40(%rcx),%rax - mulq %rbx - addq %rax,%r11 - adcq $0,%rdx - movq %r11,592(%rsp) - - movq %rdx,%r10 - movq %r8,%rax - mulq %rbx - addq %rax,%r12 - adcq $0,%rdx - addq %r10,%r12 - adcq $0,%rdx - movq %r12,600(%rsp) - - movq %rdx,%r10 - movq %r9,%rax - mulq %rbx - addq %rax,%r15 - adcq $0,%rdx - addq %r10,%r15 - adcq $0,%rdx - - movq %rdx,%r11 - - - - - movq 40(%rcx),%rbx - - movq %r8,%rax - mulq %rbx - addq %rax,%r15 - adcq $0,%rdx - movq %r15,608(%rsp) - - movq %rdx,%r10 - movq %r9,%rax - mulq %rbx - addq %rax,%r11 - adcq $0,%rdx - addq %r10,%r11 - adcq $0,%rdx - movq %r11,616(%rsp) - - movq %rdx,%r12 - - - - - movq %r8,%rbx - - movq %r9,%rax - mulq %rbx - addq %rax,%r12 - adcq $0,%rdx - movq %r12,624(%rsp) - - movq %rdx,632(%rsp) - - - movq 528(%rsp),%r10 - movq 536(%rsp),%r11 - movq 544(%rsp),%r12 - movq 552(%rsp),%r13 - movq 560(%rsp),%r14 - movq 568(%rsp),%r15 - - movq 24(%rcx),%rax - mulq %rax - movq %rax,%rdi - movq %rdx,%r8 - - addq %r10,%r10 - adcq %r11,%r11 - adcq %r12,%r12 - adcq %r13,%r13 - adcq %r14,%r14 - adcq %r15,%r15 - adcq $0,%r8 - - movq 0(%rcx),%rax - mulq %rax - movq %rax,520(%rsp) - movq %rdx,%rbx - - movq 8(%rcx),%rax - mulq %rax - - addq %rbx,%r10 - adcq %rax,%r11 - adcq $0,%rdx - - movq %rdx,%rbx - movq %r10,528(%rsp) - movq %r11,536(%rsp) - - movq 16(%rcx),%rax - mulq %rax - - addq %rbx,%r12 - adcq %rax,%r13 - adcq $0,%rdx - - movq %rdx,%rbx - - movq %r12,544(%rsp) - movq %r13,552(%rsp) - - xorq %rbp,%rbp - addq %rbx,%r14 - adcq %rdi,%r15 - adcq $0,%rbp - - movq %r14,560(%rsp) - movq %r15,568(%rsp) - - - - - movq 576(%rsp),%r10 - movq 584(%rsp),%r11 - movq 592(%rsp),%r12 - movq 600(%rsp),%r13 - movq 608(%rsp),%r14 - movq 616(%rsp),%r15 - movq 624(%rsp),%rdi - movq 632(%rsp),%rsi - - movq %r9,%rax - mulq %rax - movq %rax,%r9 - movq %rdx,%rbx - - addq %r10,%r10 - adcq %r11,%r11 - adcq %r12,%r12 - adcq %r13,%r13 - adcq %r14,%r14 - adcq %r15,%r15 - adcq %rdi,%rdi - adcq %rsi,%rsi - adcq $0,%rbx - - addq %rbp,%r10 - - movq 32(%rcx),%rax - mulq %rax - - addq %r8,%r10 - adcq %rax,%r11 - adcq $0,%rdx - - movq %rdx,%rbp - - movq %r10,576(%rsp) - movq %r11,584(%rsp) - - movq 40(%rcx),%rax - mulq %rax - - addq %rbp,%r12 - adcq %rax,%r13 - adcq $0,%rdx - - movq %rdx,%rbp - - movq %r12,592(%rsp) - movq %r13,600(%rsp) - - movq 48(%rcx),%rax - mulq %rax - - addq %rbp,%r14 - adcq %rax,%r15 - adcq $0,%rdx - - movq %r14,608(%rsp) - movq %r15,616(%rsp) - - addq %rdx,%rdi - adcq %r9,%rsi - adcq $0,%rbx - - movq %rdi,624(%rsp) - movq %rsi,632(%rsp) - movq %rbx,640(%rsp) - - jmp mont_reduce - - -.size sqr_reduce,.-sqr_reduce -.globl mod_exp_512 -.type mod_exp_512,@function -mod_exp_512: - pushq %rbp - pushq %rbx - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - - - movq %rsp,%r8 - subq $2688,%rsp - andq $-64,%rsp - - - movq %r8,0(%rsp) - movq %rdi,8(%rsp) - movq %rsi,16(%rsp) - movq %rcx,24(%rsp) -.Lbody: - - - - pxor %xmm4,%xmm4 - movdqu 0(%rsi),%xmm0 - movdqu 16(%rsi),%xmm1 - movdqu 32(%rsi),%xmm2 - movdqu 48(%rsi),%xmm3 - movdqa %xmm4,512(%rsp) - movdqa %xmm4,528(%rsp) - movdqa %xmm4,608(%rsp) - movdqa %xmm4,624(%rsp) - movdqa %xmm0,544(%rsp) - movdqa %xmm1,560(%rsp) - movdqa %xmm2,576(%rsp) - movdqa %xmm3,592(%rsp) - - - movdqu 0(%rdx),%xmm0 - movdqu 16(%rdx),%xmm1 - movdqu 32(%rdx),%xmm2 - movdqu 48(%rdx),%xmm3 - - leaq 384(%rsp),%rbx - movq %rbx,136(%rsp) - call mont_reduce - - - leaq 448(%rsp),%rcx - xorq %rax,%rax - movq %rax,0(%rcx) - movq %rax,8(%rcx) - movq %rax,24(%rcx) - movq %rax,32(%rcx) - movq %rax,40(%rcx) - movq %rax,48(%rcx) - movq %rax,56(%rcx) - movq %rax,128(%rsp) - movq $1,16(%rcx) - - leaq 640(%rsp),%rbp - movq %rcx,%rsi - movq %rbp,%rdi - movq $8,%rax -loop_0: - movq (%rcx),%rbx - movw %bx,(%rdi) - shrq $16,%rbx - movw %bx,64(%rdi) - shrq $16,%rbx - movw %bx,128(%rdi) - shrq $16,%rbx - movw %bx,192(%rdi) - leaq 8(%rcx),%rcx - leaq 256(%rdi),%rdi - decq %rax - jnz loop_0 - movq $31,%rax - movq %rax,32(%rsp) - movq %rbp,40(%rsp) - - movq %rsi,136(%rsp) - movq 0(%rsi),%r10 - movq 8(%rsi),%r11 - movq 16(%rsi),%r12 - movq 24(%rsi),%r13 - movq 32(%rsi),%r14 - movq 40(%rsi),%r15 - movq 48(%rsi),%r8 - movq 56(%rsi),%r9 -init_loop: - leaq 384(%rsp),%rdi - call mont_mul_a3b - leaq 448(%rsp),%rsi - movq 40(%rsp),%rbp - addq $2,%rbp - movq %rbp,40(%rsp) - movq %rsi,%rcx - movq $8,%rax -loop_1: - movq (%rcx),%rbx - movw %bx,(%rbp) - shrq $16,%rbx - movw %bx,64(%rbp) - shrq $16,%rbx - movw %bx,128(%rbp) - shrq $16,%rbx - movw %bx,192(%rbp) - leaq 8(%rcx),%rcx - leaq 256(%rbp),%rbp - decq %rax - jnz loop_1 - movq 32(%rsp),%rax - subq $1,%rax - movq %rax,32(%rsp) - jne init_loop - - - - movdqa %xmm0,64(%rsp) - movdqa %xmm1,80(%rsp) - movdqa %xmm2,96(%rsp) - movdqa %xmm3,112(%rsp) - - - - - - movl 126(%rsp),%eax - movq %rax,%rdx - shrq $11,%rax - andl $2047,%edx - movl %edx,126(%rsp) - leaq 640(%rsp,%rax,2),%rsi - movq 8(%rsp),%rdx - movq $4,%rbp -loop_2: - movzwq 192(%rsi),%rbx - movzwq 448(%rsi),%rax - shlq $16,%rbx - shlq $16,%rax - movw 128(%rsi),%bx - movw 384(%rsi),%ax - shlq $16,%rbx - shlq $16,%rax - movw 64(%rsi),%bx - movw 320(%rsi),%ax - shlq $16,%rbx - shlq $16,%rax - movw 0(%rsi),%bx - movw 256(%rsi),%ax - movq %rbx,0(%rdx) - movq %rax,8(%rdx) - leaq 512(%rsi),%rsi - leaq 16(%rdx),%rdx - subq $1,%rbp - jnz loop_2 - movq $505,48(%rsp) - - movq 8(%rsp),%rcx - movq %rcx,136(%rsp) - movq 0(%rcx),%r10 - movq 8(%rcx),%r11 - movq 16(%rcx),%r12 - movq 24(%rcx),%r13 - movq 32(%rcx),%r14 - movq 40(%rcx),%r15 - movq 48(%rcx),%r8 - movq 56(%rcx),%r9 - jmp sqr_2 - -main_loop_a3b: - call sqr_reduce - call sqr_reduce - call sqr_reduce -sqr_2: - call sqr_reduce - call sqr_reduce - - - - movq 48(%rsp),%rcx - movq %rcx,%rax - shrq $4,%rax - movl 64(%rsp,%rax,2),%edx - andq $15,%rcx - shrq %cl,%rdx - andq $31,%rdx - - leaq 640(%rsp,%rdx,2),%rsi - leaq 448(%rsp),%rdx - movq %rdx,%rdi - movq $4,%rbp -loop_3: - movzwq 192(%rsi),%rbx - movzwq 448(%rsi),%rax - shlq $16,%rbx - shlq $16,%rax - movw 128(%rsi),%bx - movw 384(%rsi),%ax - shlq $16,%rbx - shlq $16,%rax - movw 64(%rsi),%bx - movw 320(%rsi),%ax - shlq $16,%rbx - shlq $16,%rax - movw 0(%rsi),%bx - movw 256(%rsi),%ax - movq %rbx,0(%rdx) - movq %rax,8(%rdx) - leaq 512(%rsi),%rsi - leaq 16(%rdx),%rdx - subq $1,%rbp - jnz loop_3 - movq 8(%rsp),%rsi - call mont_mul_a3b - - - - movq 48(%rsp),%rcx - subq $5,%rcx - movq %rcx,48(%rsp) - jge main_loop_a3b - - - -end_main_loop_a3b: - - - movq 8(%rsp),%rdx - pxor %xmm4,%xmm4 - movdqu 0(%rdx),%xmm0 - movdqu 16(%rdx),%xmm1 - movdqu 32(%rdx),%xmm2 - movdqu 48(%rdx),%xmm3 - movdqa %xmm4,576(%rsp) - movdqa %xmm4,592(%rsp) - movdqa %xmm4,608(%rsp) - movdqa %xmm4,624(%rsp) - movdqa %xmm0,512(%rsp) - movdqa %xmm1,528(%rsp) - movdqa %xmm2,544(%rsp) - movdqa %xmm3,560(%rsp) - call mont_reduce - - - - movq 8(%rsp),%rax - movq 0(%rax),%r8 - movq 8(%rax),%r9 - movq 16(%rax),%r10 - movq 24(%rax),%r11 - movq 32(%rax),%r12 - movq 40(%rax),%r13 - movq 48(%rax),%r14 - movq 56(%rax),%r15 - - - movq 24(%rsp),%rbx - addq $512,%rbx - - subq 0(%rbx),%r8 - sbbq 8(%rbx),%r9 - sbbq 16(%rbx),%r10 - sbbq 24(%rbx),%r11 - sbbq 32(%rbx),%r12 - sbbq 40(%rbx),%r13 - sbbq 48(%rbx),%r14 - sbbq 56(%rbx),%r15 - - - movq 0(%rax),%rsi - movq 8(%rax),%rdi - movq 16(%rax),%rcx - movq 24(%rax),%rdx - cmovncq %r8,%rsi - cmovncq %r9,%rdi - cmovncq %r10,%rcx - cmovncq %r11,%rdx - movq %rsi,0(%rax) - movq %rdi,8(%rax) - movq %rcx,16(%rax) - movq %rdx,24(%rax) - - movq 32(%rax),%rsi - movq 40(%rax),%rdi - movq 48(%rax),%rcx - movq 56(%rax),%rdx - cmovncq %r12,%rsi - cmovncq %r13,%rdi - cmovncq %r14,%rcx - cmovncq %r15,%rdx - movq %rsi,32(%rax) - movq %rdi,40(%rax) - movq %rcx,48(%rax) - movq %rdx,56(%rax) - - movq 0(%rsp),%rsi - movq 0(%rsi),%r15 - movq 8(%rsi),%r14 - movq 16(%rsi),%r13 - movq 24(%rsi),%r12 - movq 32(%rsi),%rbx - movq 40(%rsi),%rbp - leaq 48(%rsi),%rsp -.Lepilogue: - .byte 0xf3,0xc3 -.size mod_exp_512, . - mod_exp_512 diff --git a/secure/lib/libcrypto/amd64/rc4-x86_64.S b/secure/lib/libcrypto/amd64/rc4-x86_64.S index c561af7..c51ca89 100644 --- a/secure/lib/libcrypto/amd64/rc4-x86_64.S +++ b/secure/lib/libcrypto/amd64/rc4-x86_64.S @@ -48,7 +48,7 @@ RC4: orq %rsi,%rsi movl (%rdi,%rax,4),%edx movl (%rdi,%r10,4),%eax xorb (%r12),%dl - movb %dl,(%r13,%r12,1) + movb %dl,(%r12,%r13,1) leaq 1(%r12),%r12 decq %rbx jnz .Loop8_warmup @@ -127,7 +127,7 @@ RC4: orq %rsi,%rsi subq $8,%r11 xorq (%r12),%r8 - movq %r8,(%r13,%r12,1) + movq %r8,(%r12,%r13,1) leaq 8(%r12),%r12 testq $-8,%r11 @@ -153,7 +153,7 @@ RC4: orq %rsi,%rsi movl (%rdi,%rax,4),%edx movl (%rdi,%r10,4),%eax xorb (%r12),%dl - movb %dl,(%r13,%r12,1) + movb %dl,(%r12,%r13,1) leaq 1(%r12),%r12 decq %rbx jnz .Loop16_warmup @@ -190,7 +190,7 @@ RC4: orq %rsi,%rsi pxor %xmm1,%xmm2 addb %bl,%cl pinsrw $0,(%rdi,%rax,4),%xmm0 - movdqu %xmm2,(%r13,%r12,1) + movdqu %xmm2,(%r12,%r13,1) leaq 16(%r12),%r12 .Loop16_enter: movl (%rdi,%rcx,4),%edx @@ -326,7 +326,7 @@ RC4: orq %rsi,%rsi psllq $8,%xmm1 pxor %xmm0,%xmm2 pxor %xmm1,%xmm2 - movdqu %xmm2,(%r13,%r12,1) + movdqu %xmm2,(%r12,%r13,1) leaq 16(%r12),%r12 cmpq $0,%r11 @@ -344,7 +344,7 @@ RC4: orq %rsi,%rsi movl (%rdi,%rax,4),%edx movl (%rdi,%r10,4),%eax xorb (%r12),%dl - movb %dl,(%r13,%r12,1) + movb %dl,(%r12,%r13,1) leaq 1(%r12),%r12 decq %r11 jnz .Lloop1 @@ -369,7 +369,7 @@ RC4: orq %rsi,%rsi movb %al,(%rdi,%rcx,1) cmpq %rsi,%rcx movb %dl,(%rdi,%r10,1) - jne .Lcmov0 + jne .Lcmov0 movq %rax,%rbx .Lcmov0: addb %al,%dl @@ -383,7 +383,7 @@ RC4: orq %rsi,%rsi movb %bl,(%rdi,%rcx,1) cmpq %r10,%rcx movb %dl,(%rdi,%rsi,1) - jne .Lcmov1 + jne .Lcmov1 movq %rbx,%rax .Lcmov1: addb %bl,%dl @@ -397,7 +397,7 @@ RC4: orq %rsi,%rsi movb %al,(%rdi,%rcx,1) cmpq %rsi,%rcx movb %dl,(%rdi,%r10,1) - jne .Lcmov2 + jne .Lcmov2 movq %rax,%rbx .Lcmov2: addb %al,%dl @@ -411,7 +411,7 @@ RC4: orq %rsi,%rsi movb %bl,(%rdi,%rcx,1) cmpq %r10,%rcx movb %dl,(%rdi,%rsi,1) - jne .Lcmov3 + jne .Lcmov3 movq %rbx,%rax .Lcmov3: addb %bl,%dl @@ -425,7 +425,7 @@ RC4: orq %rsi,%rsi movb %al,(%rdi,%rcx,1) cmpq %rsi,%rcx movb %dl,(%rdi,%r10,1) - jne .Lcmov4 + jne .Lcmov4 movq %rax,%rbx .Lcmov4: addb %al,%dl @@ -439,7 +439,7 @@ RC4: orq %rsi,%rsi movb %bl,(%rdi,%rcx,1) cmpq %r10,%rcx movb %dl,(%rdi,%rsi,1) - jne .Lcmov5 + jne .Lcmov5 movq %rbx,%rax .Lcmov5: addb %bl,%dl @@ -453,7 +453,7 @@ RC4: orq %rsi,%rsi movb %al,(%rdi,%rcx,1) cmpq %rsi,%rcx movb %dl,(%rdi,%r10,1) - jne .Lcmov6 + jne .Lcmov6 movq %rax,%rbx .Lcmov6: addb %al,%dl @@ -467,7 +467,7 @@ RC4: orq %rsi,%rsi movb %bl,(%rdi,%rcx,1) cmpq %r10,%rcx movb %dl,(%rdi,%rsi,1) - jne .Lcmov7 + jne .Lcmov7 movq %rbx,%rax .Lcmov7: addb %bl,%dl diff --git a/secure/lib/libcrypto/amd64/rsaz-avx2.S b/secure/lib/libcrypto/amd64/rsaz-avx2.S new file mode 100644 index 0000000..ba13765 --- /dev/null +++ b/secure/lib/libcrypto/amd64/rsaz-avx2.S @@ -0,0 +1,26 @@ + # $FreeBSD$ +.text + +.globl rsaz_avx2_eligible +.type rsaz_avx2_eligible,@function +rsaz_avx2_eligible: + xorl %eax,%eax + .byte 0xf3,0xc3 +.size rsaz_avx2_eligible,.-rsaz_avx2_eligible + +.globl rsaz_1024_sqr_avx2 +.globl rsaz_1024_mul_avx2 +.globl rsaz_1024_norm2red_avx2 +.globl rsaz_1024_red2norm_avx2 +.globl rsaz_1024_scatter5_avx2 +.globl rsaz_1024_gather5_avx2 +.type rsaz_1024_sqr_avx2,@function +rsaz_1024_sqr_avx2: +rsaz_1024_mul_avx2: +rsaz_1024_norm2red_avx2: +rsaz_1024_red2norm_avx2: +rsaz_1024_scatter5_avx2: +rsaz_1024_gather5_avx2: +.byte 0x0f,0x0b + .byte 0xf3,0xc3 +.size rsaz_1024_sqr_avx2,.-rsaz_1024_sqr_avx2 diff --git a/secure/lib/libcrypto/amd64/rsaz-x86_64.S b/secure/lib/libcrypto/amd64/rsaz-x86_64.S new file mode 100644 index 0000000..efd229a --- /dev/null +++ b/secure/lib/libcrypto/amd64/rsaz-x86_64.S @@ -0,0 +1,1118 @@ + # $FreeBSD$ +.text + + + +.globl rsaz_512_sqr +.type rsaz_512_sqr,@function +.align 32 +rsaz_512_sqr: + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + subq $128+24,%rsp +.Lsqr_body: + movq %rdx,%rbp + movq (%rsi),%rdx + movq 8(%rsi),%rax + movq %rcx,128(%rsp) + jmp .Loop_sqr + +.align 32 +.Loop_sqr: + movl %r8d,128+8(%rsp) + + movq %rdx,%rbx + mulq %rdx + movq %rax,%r8 + movq 16(%rsi),%rax + movq %rdx,%r9 + + mulq %rbx + addq %rax,%r9 + movq 24(%rsi),%rax + movq %rdx,%r10 + adcq $0,%r10 + + mulq %rbx + addq %rax,%r10 + movq 32(%rsi),%rax + movq %rdx,%r11 + adcq $0,%r11 + + mulq %rbx + addq %rax,%r11 + movq 40(%rsi),%rax + movq %rdx,%r12 + adcq $0,%r12 + + mulq %rbx + addq %rax,%r12 + movq 48(%rsi),%rax + movq %rdx,%r13 + adcq $0,%r13 + + mulq %rbx + addq %rax,%r13 + movq 56(%rsi),%rax + movq %rdx,%r14 + adcq $0,%r14 + + mulq %rbx + addq %rax,%r14 + movq %rbx,%rax + movq %rdx,%r15 + adcq $0,%r15 + + addq %r8,%r8 + movq %r9,%rcx + adcq %r9,%r9 + + mulq %rax + movq %rax,(%rsp) + addq %rdx,%r8 + adcq $0,%r9 + + movq %r8,8(%rsp) + shrq $63,%rcx + + + movq 8(%rsi),%r8 + movq 16(%rsi),%rax + mulq %r8 + addq %rax,%r10 + movq 24(%rsi),%rax + movq %rdx,%rbx + adcq $0,%rbx + + mulq %r8 + addq %rax,%r11 + movq 32(%rsi),%rax + adcq $0,%rdx + addq %rbx,%r11 + movq %rdx,%rbx + adcq $0,%rbx + + mulq %r8 + addq %rax,%r12 + movq 40(%rsi),%rax + adcq $0,%rdx + addq %rbx,%r12 + movq %rdx,%rbx + adcq $0,%rbx + + mulq %r8 + addq %rax,%r13 + movq 48(%rsi),%rax + adcq $0,%rdx + addq %rbx,%r13 + movq %rdx,%rbx + adcq $0,%rbx + + mulq %r8 + addq %rax,%r14 + movq 56(%rsi),%rax + adcq $0,%rdx + addq %rbx,%r14 + movq %rdx,%rbx + adcq $0,%rbx + + mulq %r8 + addq %rax,%r15 + movq %r8,%rax + adcq $0,%rdx + addq %rbx,%r15 + movq %rdx,%r8 + movq %r10,%rdx + adcq $0,%r8 + + addq %rdx,%rdx + leaq (%rcx,%r10,2),%r10 + movq %r11,%rbx + adcq %r11,%r11 + + mulq %rax + addq %rax,%r9 + adcq %rdx,%r10 + adcq $0,%r11 + + movq %r9,16(%rsp) + movq %r10,24(%rsp) + shrq $63,%rbx + + + movq 16(%rsi),%r9 + movq 24(%rsi),%rax + mulq %r9 + addq %rax,%r12 + movq 32(%rsi),%rax + movq %rdx,%rcx + adcq $0,%rcx + + mulq %r9 + addq %rax,%r13 + movq 40(%rsi),%rax + adcq $0,%rdx + addq %rcx,%r13 + movq %rdx,%rcx + adcq $0,%rcx + + mulq %r9 + addq %rax,%r14 + movq 48(%rsi),%rax + adcq $0,%rdx + addq %rcx,%r14 + movq %rdx,%rcx + adcq $0,%rcx + + mulq %r9 + movq %r12,%r10 + leaq (%rbx,%r12,2),%r12 + addq %rax,%r15 + movq 56(%rsi),%rax + adcq $0,%rdx + addq %rcx,%r15 + movq %rdx,%rcx + adcq $0,%rcx + + mulq %r9 + shrq $63,%r10 + addq %rax,%r8 + movq %r9,%rax + adcq $0,%rdx + addq %rcx,%r8 + movq %rdx,%r9 + adcq $0,%r9 + + movq %r13,%rcx + leaq (%r10,%r13,2),%r13 + + mulq %rax + addq %rax,%r11 + adcq %rdx,%r12 + adcq $0,%r13 + + movq %r11,32(%rsp) + movq %r12,40(%rsp) + shrq $63,%rcx + + + movq 24(%rsi),%r10 + movq 32(%rsi),%rax + mulq %r10 + addq %rax,%r14 + movq 40(%rsi),%rax + movq %rdx,%rbx + adcq $0,%rbx + + mulq %r10 + addq %rax,%r15 + movq 48(%rsi),%rax + adcq $0,%rdx + addq %rbx,%r15 + movq %rdx,%rbx + adcq $0,%rbx + + mulq %r10 + movq %r14,%r12 + leaq (%rcx,%r14,2),%r14 + addq %rax,%r8 + movq 56(%rsi),%rax + adcq $0,%rdx + addq %rbx,%r8 + movq %rdx,%rbx + adcq $0,%rbx + + mulq %r10 + shrq $63,%r12 + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + addq %rbx,%r9 + movq %rdx,%r10 + adcq $0,%r10 + + movq %r15,%rbx + leaq (%r12,%r15,2),%r15 + + mulq %rax + addq %rax,%r13 + adcq %rdx,%r14 + adcq $0,%r15 + + movq %r13,48(%rsp) + movq %r14,56(%rsp) + shrq $63,%rbx + + + movq 32(%rsi),%r11 + movq 40(%rsi),%rax + mulq %r11 + addq %rax,%r8 + movq 48(%rsi),%rax + movq %rdx,%rcx + adcq $0,%rcx + + mulq %r11 + addq %rax,%r9 + movq 56(%rsi),%rax + adcq $0,%rdx + movq %r8,%r12 + leaq (%rbx,%r8,2),%r8 + addq %rcx,%r9 + movq %rdx,%rcx + adcq $0,%rcx + + mulq %r11 + shrq $63,%r12 + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + addq %rcx,%r10 + movq %rdx,%r11 + adcq $0,%r11 + + movq %r9,%rcx + leaq (%r12,%r9,2),%r9 + + mulq %rax + addq %rax,%r15 + adcq %rdx,%r8 + adcq $0,%r9 + + movq %r15,64(%rsp) + movq %r8,72(%rsp) + shrq $63,%rcx + + + movq 40(%rsi),%r12 + movq 48(%rsi),%rax + mulq %r12 + addq %rax,%r10 + movq 56(%rsi),%rax + movq %rdx,%rbx + adcq $0,%rbx + + mulq %r12 + addq %rax,%r11 + movq %r12,%rax + movq %r10,%r15 + leaq (%rcx,%r10,2),%r10 + adcq $0,%rdx + shrq $63,%r15 + addq %rbx,%r11 + movq %rdx,%r12 + adcq $0,%r12 + + movq %r11,%rbx + leaq (%r15,%r11,2),%r11 + + mulq %rax + addq %rax,%r9 + adcq %rdx,%r10 + adcq $0,%r11 + + movq %r9,80(%rsp) + movq %r10,88(%rsp) + + + movq 48(%rsi),%r13 + movq 56(%rsi),%rax + mulq %r13 + addq %rax,%r12 + movq %r13,%rax + movq %rdx,%r13 + adcq $0,%r13 + + xorq %r14,%r14 + shlq $1,%rbx + adcq %r12,%r12 + adcq %r13,%r13 + adcq %r14,%r14 + + mulq %rax + addq %rax,%r11 + adcq %rdx,%r12 + adcq $0,%r13 + + movq %r11,96(%rsp) + movq %r12,104(%rsp) + + + movq 56(%rsi),%rax + mulq %rax + addq %rax,%r13 + adcq $0,%rdx + + addq %rdx,%r14 + + movq %r13,112(%rsp) + movq %r14,120(%rsp) + + movq (%rsp),%r8 + movq 8(%rsp),%r9 + movq 16(%rsp),%r10 + movq 24(%rsp),%r11 + movq 32(%rsp),%r12 + movq 40(%rsp),%r13 + movq 48(%rsp),%r14 + movq 56(%rsp),%r15 + + call __rsaz_512_reduce + + addq 64(%rsp),%r8 + adcq 72(%rsp),%r9 + adcq 80(%rsp),%r10 + adcq 88(%rsp),%r11 + adcq 96(%rsp),%r12 + adcq 104(%rsp),%r13 + adcq 112(%rsp),%r14 + adcq 120(%rsp),%r15 + sbbq %rcx,%rcx + + call __rsaz_512_subtract + + movq %r8,%rdx + movq %r9,%rax + movl 128+8(%rsp),%r8d + movq %rdi,%rsi + + decl %r8d + jnz .Loop_sqr + + leaq 128+24+48(%rsp),%rax + movq -48(%rax),%r15 + movq -40(%rax),%r14 + movq -32(%rax),%r13 + movq -24(%rax),%r12 + movq -16(%rax),%rbp + movq -8(%rax),%rbx + leaq (%rax),%rsp +.Lsqr_epilogue: + .byte 0xf3,0xc3 +.size rsaz_512_sqr,.-rsaz_512_sqr +.globl rsaz_512_mul +.type rsaz_512_mul,@function +.align 32 +rsaz_512_mul: + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + subq $128+24,%rsp +.Lmul_body: +.byte 102,72,15,110,199 +.byte 102,72,15,110,201 + movq %r8,128(%rsp) + movq (%rdx),%rbx + movq %rdx,%rbp + call __rsaz_512_mul + +.byte 102,72,15,126,199 +.byte 102,72,15,126,205 + + movq (%rsp),%r8 + movq 8(%rsp),%r9 + movq 16(%rsp),%r10 + movq 24(%rsp),%r11 + movq 32(%rsp),%r12 + movq 40(%rsp),%r13 + movq 48(%rsp),%r14 + movq 56(%rsp),%r15 + + call __rsaz_512_reduce + addq 64(%rsp),%r8 + adcq 72(%rsp),%r9 + adcq 80(%rsp),%r10 + adcq 88(%rsp),%r11 + adcq 96(%rsp),%r12 + adcq 104(%rsp),%r13 + adcq 112(%rsp),%r14 + adcq 120(%rsp),%r15 + sbbq %rcx,%rcx + + call __rsaz_512_subtract + + leaq 128+24+48(%rsp),%rax + movq -48(%rax),%r15 + movq -40(%rax),%r14 + movq -32(%rax),%r13 + movq -24(%rax),%r12 + movq -16(%rax),%rbp + movq -8(%rax),%rbx + leaq (%rax),%rsp +.Lmul_epilogue: + .byte 0xf3,0xc3 +.size rsaz_512_mul,.-rsaz_512_mul +.globl rsaz_512_mul_gather4 +.type rsaz_512_mul_gather4,@function +.align 32 +rsaz_512_mul_gather4: + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + movl %r9d,%r9d + subq $128+24,%rsp +.Lmul_gather4_body: + movl 64(%rdx,%r9,4),%eax +.byte 102,72,15,110,199 + movl (%rdx,%r9,4),%ebx +.byte 102,72,15,110,201 + movq %r8,128(%rsp) + + shlq $32,%rax + orq %rax,%rbx + movq (%rsi),%rax + movq 8(%rsi),%rcx + leaq 128(%rdx,%r9,4),%rbp + mulq %rbx + movq %rax,(%rsp) + movq %rcx,%rax + movq %rdx,%r8 + + mulq %rbx + movd (%rbp),%xmm4 + addq %rax,%r8 + movq 16(%rsi),%rax + movq %rdx,%r9 + adcq $0,%r9 + + mulq %rbx + movd 64(%rbp),%xmm5 + addq %rax,%r9 + movq 24(%rsi),%rax + movq %rdx,%r10 + adcq $0,%r10 + + mulq %rbx + pslldq $4,%xmm5 + addq %rax,%r10 + movq 32(%rsi),%rax + movq %rdx,%r11 + adcq $0,%r11 + + mulq %rbx + por %xmm5,%xmm4 + addq %rax,%r11 + movq 40(%rsi),%rax + movq %rdx,%r12 + adcq $0,%r12 + + mulq %rbx + addq %rax,%r12 + movq 48(%rsi),%rax + movq %rdx,%r13 + adcq $0,%r13 + + mulq %rbx + leaq 128(%rbp),%rbp + addq %rax,%r13 + movq 56(%rsi),%rax + movq %rdx,%r14 + adcq $0,%r14 + + mulq %rbx +.byte 102,72,15,126,227 + addq %rax,%r14 + movq (%rsi),%rax + movq %rdx,%r15 + adcq $0,%r15 + + leaq 8(%rsp),%rdi + movl $7,%ecx + jmp .Loop_mul_gather + +.align 32 +.Loop_mul_gather: + mulq %rbx + addq %rax,%r8 + movq 8(%rsi),%rax + movq %r8,(%rdi) + movq %rdx,%r8 + adcq $0,%r8 + + mulq %rbx + movd (%rbp),%xmm4 + addq %rax,%r9 + movq 16(%rsi),%rax + adcq $0,%rdx + addq %r9,%r8 + movq %rdx,%r9 + adcq $0,%r9 + + mulq %rbx + movd 64(%rbp),%xmm5 + addq %rax,%r10 + movq 24(%rsi),%rax + adcq $0,%rdx + addq %r10,%r9 + movq %rdx,%r10 + adcq $0,%r10 + + mulq %rbx + pslldq $4,%xmm5 + addq %rax,%r11 + movq 32(%rsi),%rax + adcq $0,%rdx + addq %r11,%r10 + movq %rdx,%r11 + adcq $0,%r11 + + mulq %rbx + por %xmm5,%xmm4 + addq %rax,%r12 + movq 40(%rsi),%rax + adcq $0,%rdx + addq %r12,%r11 + movq %rdx,%r12 + adcq $0,%r12 + + mulq %rbx + addq %rax,%r13 + movq 48(%rsi),%rax + adcq $0,%rdx + addq %r13,%r12 + movq %rdx,%r13 + adcq $0,%r13 + + mulq %rbx + addq %rax,%r14 + movq 56(%rsi),%rax + adcq $0,%rdx + addq %r14,%r13 + movq %rdx,%r14 + adcq $0,%r14 + + mulq %rbx +.byte 102,72,15,126,227 + addq %rax,%r15 + movq (%rsi),%rax + adcq $0,%rdx + addq %r15,%r14 + movq %rdx,%r15 + adcq $0,%r15 + + leaq 128(%rbp),%rbp + leaq 8(%rdi),%rdi + + decl %ecx + jnz .Loop_mul_gather + + movq %r8,(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq %r14,48(%rdi) + movq %r15,56(%rdi) + +.byte 102,72,15,126,199 +.byte 102,72,15,126,205 + + movq (%rsp),%r8 + movq 8(%rsp),%r9 + movq 16(%rsp),%r10 + movq 24(%rsp),%r11 + movq 32(%rsp),%r12 + movq 40(%rsp),%r13 + movq 48(%rsp),%r14 + movq 56(%rsp),%r15 + + call __rsaz_512_reduce + addq 64(%rsp),%r8 + adcq 72(%rsp),%r9 + adcq 80(%rsp),%r10 + adcq 88(%rsp),%r11 + adcq 96(%rsp),%r12 + adcq 104(%rsp),%r13 + adcq 112(%rsp),%r14 + adcq 120(%rsp),%r15 + sbbq %rcx,%rcx + + call __rsaz_512_subtract + + leaq 128+24+48(%rsp),%rax + movq -48(%rax),%r15 + movq -40(%rax),%r14 + movq -32(%rax),%r13 + movq -24(%rax),%r12 + movq -16(%rax),%rbp + movq -8(%rax),%rbx + leaq (%rax),%rsp +.Lmul_gather4_epilogue: + .byte 0xf3,0xc3 +.size rsaz_512_mul_gather4,.-rsaz_512_mul_gather4 +.globl rsaz_512_mul_scatter4 +.type rsaz_512_mul_scatter4,@function +.align 32 +rsaz_512_mul_scatter4: + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + movl %r9d,%r9d + subq $128+24,%rsp +.Lmul_scatter4_body: + leaq (%r8,%r9,4),%r8 +.byte 102,72,15,110,199 +.byte 102,72,15,110,202 +.byte 102,73,15,110,208 + movq %rcx,128(%rsp) + + movq %rdi,%rbp + movq (%rdi),%rbx + call __rsaz_512_mul + +.byte 102,72,15,126,199 +.byte 102,72,15,126,205 + + movq (%rsp),%r8 + movq 8(%rsp),%r9 + movq 16(%rsp),%r10 + movq 24(%rsp),%r11 + movq 32(%rsp),%r12 + movq 40(%rsp),%r13 + movq 48(%rsp),%r14 + movq 56(%rsp),%r15 + + call __rsaz_512_reduce + addq 64(%rsp),%r8 + adcq 72(%rsp),%r9 + adcq 80(%rsp),%r10 + adcq 88(%rsp),%r11 + adcq 96(%rsp),%r12 + adcq 104(%rsp),%r13 + adcq 112(%rsp),%r14 + adcq 120(%rsp),%r15 +.byte 102,72,15,126,214 + sbbq %rcx,%rcx + + call __rsaz_512_subtract + + movl %r8d,0(%rsi) + shrq $32,%r8 + movl %r9d,128(%rsi) + shrq $32,%r9 + movl %r10d,256(%rsi) + shrq $32,%r10 + movl %r11d,384(%rsi) + shrq $32,%r11 + movl %r12d,512(%rsi) + shrq $32,%r12 + movl %r13d,640(%rsi) + shrq $32,%r13 + movl %r14d,768(%rsi) + shrq $32,%r14 + movl %r15d,896(%rsi) + shrq $32,%r15 + movl %r8d,64(%rsi) + movl %r9d,192(%rsi) + movl %r10d,320(%rsi) + movl %r11d,448(%rsi) + movl %r12d,576(%rsi) + movl %r13d,704(%rsi) + movl %r14d,832(%rsi) + movl %r15d,960(%rsi) + + leaq 128+24+48(%rsp),%rax + movq -48(%rax),%r15 + movq -40(%rax),%r14 + movq -32(%rax),%r13 + movq -24(%rax),%r12 + movq -16(%rax),%rbp + movq -8(%rax),%rbx + leaq (%rax),%rsp +.Lmul_scatter4_epilogue: + .byte 0xf3,0xc3 +.size rsaz_512_mul_scatter4,.-rsaz_512_mul_scatter4 +.globl rsaz_512_mul_by_one +.type rsaz_512_mul_by_one,@function +.align 32 +rsaz_512_mul_by_one: + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + subq $128+24,%rsp +.Lmul_by_one_body: + movq %rdx,%rbp + movq %rcx,128(%rsp) + + movq (%rsi),%r8 + pxor %xmm0,%xmm0 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq 48(%rsi),%r14 + movq 56(%rsi),%r15 + + movdqa %xmm0,(%rsp) + movdqa %xmm0,16(%rsp) + movdqa %xmm0,32(%rsp) + movdqa %xmm0,48(%rsp) + movdqa %xmm0,64(%rsp) + movdqa %xmm0,80(%rsp) + movdqa %xmm0,96(%rsp) + call __rsaz_512_reduce + movq %r8,(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq %r14,48(%rdi) + movq %r15,56(%rdi) + + leaq 128+24+48(%rsp),%rax + movq -48(%rax),%r15 + movq -40(%rax),%r14 + movq -32(%rax),%r13 + movq -24(%rax),%r12 + movq -16(%rax),%rbp + movq -8(%rax),%rbx + leaq (%rax),%rsp +.Lmul_by_one_epilogue: + .byte 0xf3,0xc3 +.size rsaz_512_mul_by_one,.-rsaz_512_mul_by_one +.type __rsaz_512_reduce,@function +.align 32 +__rsaz_512_reduce: + movq %r8,%rbx + imulq 128+8(%rsp),%rbx + movq 0(%rbp),%rax + movl $8,%ecx + jmp .Lreduction_loop + +.align 32 +.Lreduction_loop: + mulq %rbx + movq 8(%rbp),%rax + negq %r8 + movq %rdx,%r8 + adcq $0,%r8 + + mulq %rbx + addq %rax,%r9 + movq 16(%rbp),%rax + adcq $0,%rdx + addq %r9,%r8 + movq %rdx,%r9 + adcq $0,%r9 + + mulq %rbx + addq %rax,%r10 + movq 24(%rbp),%rax + adcq $0,%rdx + addq %r10,%r9 + movq %rdx,%r10 + adcq $0,%r10 + + mulq %rbx + addq %rax,%r11 + movq 32(%rbp),%rax + adcq $0,%rdx + addq %r11,%r10 + movq 128+8(%rsp),%rsi + + + adcq $0,%rdx + movq %rdx,%r11 + + mulq %rbx + addq %rax,%r12 + movq 40(%rbp),%rax + adcq $0,%rdx + imulq %r8,%rsi + addq %r12,%r11 + movq %rdx,%r12 + adcq $0,%r12 + + mulq %rbx + addq %rax,%r13 + movq 48(%rbp),%rax + adcq $0,%rdx + addq %r13,%r12 + movq %rdx,%r13 + adcq $0,%r13 + + mulq %rbx + addq %rax,%r14 + movq 56(%rbp),%rax + adcq $0,%rdx + addq %r14,%r13 + movq %rdx,%r14 + adcq $0,%r14 + + mulq %rbx + movq %rsi,%rbx + addq %rax,%r15 + movq 0(%rbp),%rax + adcq $0,%rdx + addq %r15,%r14 + movq %rdx,%r15 + adcq $0,%r15 + + decl %ecx + jne .Lreduction_loop + + .byte 0xf3,0xc3 +.size __rsaz_512_reduce,.-__rsaz_512_reduce +.type __rsaz_512_subtract,@function +.align 32 +__rsaz_512_subtract: + movq %r8,(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq %r14,48(%rdi) + movq %r15,56(%rdi) + + movq 0(%rbp),%r8 + movq 8(%rbp),%r9 + negq %r8 + notq %r9 + andq %rcx,%r8 + movq 16(%rbp),%r10 + andq %rcx,%r9 + notq %r10 + movq 24(%rbp),%r11 + andq %rcx,%r10 + notq %r11 + movq 32(%rbp),%r12 + andq %rcx,%r11 + notq %r12 + movq 40(%rbp),%r13 + andq %rcx,%r12 + notq %r13 + movq 48(%rbp),%r14 + andq %rcx,%r13 + notq %r14 + movq 56(%rbp),%r15 + andq %rcx,%r14 + notq %r15 + andq %rcx,%r15 + + addq (%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + adcq 48(%rdi),%r14 + adcq 56(%rdi),%r15 + + movq %r8,(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq %r14,48(%rdi) + movq %r15,56(%rdi) + + .byte 0xf3,0xc3 +.size __rsaz_512_subtract,.-__rsaz_512_subtract +.type __rsaz_512_mul,@function +.align 32 +__rsaz_512_mul: + leaq 8(%rsp),%rdi + + movq (%rsi),%rax + mulq %rbx + movq %rax,(%rdi) + movq 8(%rsi),%rax + movq %rdx,%r8 + + mulq %rbx + addq %rax,%r8 + movq 16(%rsi),%rax + movq %rdx,%r9 + adcq $0,%r9 + + mulq %rbx + addq %rax,%r9 + movq 24(%rsi),%rax + movq %rdx,%r10 + adcq $0,%r10 + + mulq %rbx + addq %rax,%r10 + movq 32(%rsi),%rax + movq %rdx,%r11 + adcq $0,%r11 + + mulq %rbx + addq %rax,%r11 + movq 40(%rsi),%rax + movq %rdx,%r12 + adcq $0,%r12 + + mulq %rbx + addq %rax,%r12 + movq 48(%rsi),%rax + movq %rdx,%r13 + adcq $0,%r13 + + mulq %rbx + addq %rax,%r13 + movq 56(%rsi),%rax + movq %rdx,%r14 + adcq $0,%r14 + + mulq %rbx + addq %rax,%r14 + movq (%rsi),%rax + movq %rdx,%r15 + adcq $0,%r15 + + leaq 8(%rbp),%rbp + leaq 8(%rdi),%rdi + + movl $7,%ecx + jmp .Loop_mul + +.align 32 +.Loop_mul: + movq (%rbp),%rbx + mulq %rbx + addq %rax,%r8 + movq 8(%rsi),%rax + movq %r8,(%rdi) + movq %rdx,%r8 + adcq $0,%r8 + + mulq %rbx + addq %rax,%r9 + movq 16(%rsi),%rax + adcq $0,%rdx + addq %r9,%r8 + movq %rdx,%r9 + adcq $0,%r9 + + mulq %rbx + addq %rax,%r10 + movq 24(%rsi),%rax + adcq $0,%rdx + addq %r10,%r9 + movq %rdx,%r10 + adcq $0,%r10 + + mulq %rbx + addq %rax,%r11 + movq 32(%rsi),%rax + adcq $0,%rdx + addq %r11,%r10 + movq %rdx,%r11 + adcq $0,%r11 + + mulq %rbx + addq %rax,%r12 + movq 40(%rsi),%rax + adcq $0,%rdx + addq %r12,%r11 + movq %rdx,%r12 + adcq $0,%r12 + + mulq %rbx + addq %rax,%r13 + movq 48(%rsi),%rax + adcq $0,%rdx + addq %r13,%r12 + movq %rdx,%r13 + adcq $0,%r13 + + mulq %rbx + addq %rax,%r14 + movq 56(%rsi),%rax + adcq $0,%rdx + addq %r14,%r13 + movq %rdx,%r14 + leaq 8(%rbp),%rbp + adcq $0,%r14 + + mulq %rbx + addq %rax,%r15 + movq (%rsi),%rax + adcq $0,%rdx + addq %r15,%r14 + movq %rdx,%r15 + adcq $0,%r15 + + leaq 8(%rdi),%rdi + + decl %ecx + jnz .Loop_mul + + movq %r8,(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq %r14,48(%rdi) + movq %r15,56(%rdi) + + .byte 0xf3,0xc3 +.size __rsaz_512_mul,.-__rsaz_512_mul +.globl rsaz_512_scatter4 +.type rsaz_512_scatter4,@function +.align 16 +rsaz_512_scatter4: + leaq (%rdi,%rdx,4),%rdi + movl $8,%r9d + jmp .Loop_scatter +.align 16 +.Loop_scatter: + movq (%rsi),%rax + leaq 8(%rsi),%rsi + movl %eax,(%rdi) + shrq $32,%rax + movl %eax,64(%rdi) + leaq 128(%rdi),%rdi + decl %r9d + jnz .Loop_scatter + .byte 0xf3,0xc3 +.size rsaz_512_scatter4,.-rsaz_512_scatter4 + +.globl rsaz_512_gather4 +.type rsaz_512_gather4,@function +.align 16 +rsaz_512_gather4: + leaq (%rsi,%rdx,4),%rsi + movl $8,%r9d + jmp .Loop_gather +.align 16 +.Loop_gather: + movl (%rsi),%eax + movl 64(%rsi),%r8d + leaq 128(%rsi),%rsi + shlq $32,%r8 + orq %r8,%rax + movq %rax,(%rdi) + leaq 8(%rdi),%rdi + decl %r9d + jnz .Loop_gather + .byte 0xf3,0xc3 +.size rsaz_512_gather4,.-rsaz_512_gather4 diff --git a/secure/lib/libcrypto/amd64/sha1-mb-x86_64.S b/secure/lib/libcrypto/amd64/sha1-mb-x86_64.S new file mode 100644 index 0000000..6c7cd2f --- /dev/null +++ b/secure/lib/libcrypto/amd64/sha1-mb-x86_64.S @@ -0,0 +1,2935 @@ + # $FreeBSD$ +.text + + + +.globl sha1_multi_block +.type sha1_multi_block,@function +.align 32 +sha1_multi_block: + movq OPENSSL_ia32cap_P+4(%rip),%rcx + btq $61,%rcx + jc _shaext_shortcut + movq %rsp,%rax + pushq %rbx + pushq %rbp + subq $288,%rsp + andq $-256,%rsp + movq %rax,272(%rsp) +.Lbody: + leaq K_XX_XX(%rip),%rbp + leaq 256(%rsp),%rbx + +.Loop_grande: + movl %edx,280(%rsp) + xorl %edx,%edx + movq 0(%rsi),%r8 + movl 8(%rsi),%ecx + cmpl %edx,%ecx + cmovgl %ecx,%edx + testl %ecx,%ecx + movl %ecx,0(%rbx) + cmovleq %rbp,%r8 + movq 16(%rsi),%r9 + movl 24(%rsi),%ecx + cmpl %edx,%ecx + cmovgl %ecx,%edx + testl %ecx,%ecx + movl %ecx,4(%rbx) + cmovleq %rbp,%r9 + movq 32(%rsi),%r10 + movl 40(%rsi),%ecx + cmpl %edx,%ecx + cmovgl %ecx,%edx + testl %ecx,%ecx + movl %ecx,8(%rbx) + cmovleq %rbp,%r10 + movq 48(%rsi),%r11 + movl 56(%rsi),%ecx + cmpl %edx,%ecx + cmovgl %ecx,%edx + testl %ecx,%ecx + movl %ecx,12(%rbx) + cmovleq %rbp,%r11 + testl %edx,%edx + jz .Ldone + + movdqu 0(%rdi),%xmm10 + leaq 128(%rsp),%rax + movdqu 32(%rdi),%xmm11 + movdqu 64(%rdi),%xmm12 + movdqu 96(%rdi),%xmm13 + movdqu 128(%rdi),%xmm14 + movdqa 96(%rbp),%xmm5 + movdqa -32(%rbp),%xmm15 + jmp .Loop + +.align 32 +.Loop: + movd (%r8),%xmm0 + leaq 64(%r8),%r8 + movd (%r9),%xmm2 + leaq 64(%r9),%r9 + movd (%r10),%xmm3 + leaq 64(%r10),%r10 + movd (%r11),%xmm4 + leaq 64(%r11),%r11 + punpckldq %xmm3,%xmm0 + movd -60(%r8),%xmm1 + punpckldq %xmm4,%xmm2 + movd -60(%r9),%xmm9 + punpckldq %xmm2,%xmm0 + movd -60(%r10),%xmm8 +.byte 102,15,56,0,197 + movd -60(%r11),%xmm7 + punpckldq %xmm8,%xmm1 + movdqa %xmm10,%xmm8 + paddd %xmm15,%xmm14 + punpckldq %xmm7,%xmm9 + movdqa %xmm11,%xmm7 + movdqa %xmm11,%xmm6 + pslld $5,%xmm8 + pandn %xmm13,%xmm7 + pand %xmm12,%xmm6 + punpckldq %xmm9,%xmm1 + movdqa %xmm10,%xmm9 + + movdqa %xmm0,0-128(%rax) + paddd %xmm0,%xmm14 + movd -56(%r8),%xmm2 + psrld $27,%xmm9 + pxor %xmm7,%xmm6 + movdqa %xmm11,%xmm7 + + por %xmm9,%xmm8 + movd -56(%r9),%xmm9 + pslld $30,%xmm7 + paddd %xmm6,%xmm14 + + psrld $2,%xmm11 + paddd %xmm8,%xmm14 +.byte 102,15,56,0,205 + movd -56(%r10),%xmm8 + por %xmm7,%xmm11 + movd -56(%r11),%xmm7 + punpckldq %xmm8,%xmm2 + movdqa %xmm14,%xmm8 + paddd %xmm15,%xmm13 + punpckldq %xmm7,%xmm9 + movdqa %xmm10,%xmm7 + movdqa %xmm10,%xmm6 + pslld $5,%xmm8 + pandn %xmm12,%xmm7 + pand %xmm11,%xmm6 + punpckldq %xmm9,%xmm2 + movdqa %xmm14,%xmm9 + + movdqa %xmm1,16-128(%rax) + paddd %xmm1,%xmm13 + movd -52(%r8),%xmm3 + psrld $27,%xmm9 + pxor %xmm7,%xmm6 + movdqa %xmm10,%xmm7 + + por %xmm9,%xmm8 + movd -52(%r9),%xmm9 + pslld $30,%xmm7 + paddd %xmm6,%xmm13 + + psrld $2,%xmm10 + paddd %xmm8,%xmm13 +.byte 102,15,56,0,213 + movd -52(%r10),%xmm8 + por %xmm7,%xmm10 + movd -52(%r11),%xmm7 + punpckldq %xmm8,%xmm3 + movdqa %xmm13,%xmm8 + paddd %xmm15,%xmm12 + punpckldq %xmm7,%xmm9 + movdqa %xmm14,%xmm7 + movdqa %xmm14,%xmm6 + pslld $5,%xmm8 + pandn %xmm11,%xmm7 + pand %xmm10,%xmm6 + punpckldq %xmm9,%xmm3 + movdqa %xmm13,%xmm9 + + movdqa %xmm2,32-128(%rax) + paddd %xmm2,%xmm12 + movd -48(%r8),%xmm4 + psrld $27,%xmm9 + pxor %xmm7,%xmm6 + movdqa %xmm14,%xmm7 + + por %xmm9,%xmm8 + movd -48(%r9),%xmm9 + pslld $30,%xmm7 + paddd %xmm6,%xmm12 + + psrld $2,%xmm14 + paddd %xmm8,%xmm12 +.byte 102,15,56,0,221 + movd -48(%r10),%xmm8 + por %xmm7,%xmm14 + movd -48(%r11),%xmm7 + punpckldq %xmm8,%xmm4 + movdqa %xmm12,%xmm8 + paddd %xmm15,%xmm11 + punpckldq %xmm7,%xmm9 + movdqa %xmm13,%xmm7 + movdqa %xmm13,%xmm6 + pslld $5,%xmm8 + pandn %xmm10,%xmm7 + pand %xmm14,%xmm6 + punpckldq %xmm9,%xmm4 + movdqa %xmm12,%xmm9 + + movdqa %xmm3,48-128(%rax) + paddd %xmm3,%xmm11 + movd -44(%r8),%xmm0 + psrld $27,%xmm9 + pxor %xmm7,%xmm6 + movdqa %xmm13,%xmm7 + + por %xmm9,%xmm8 + movd -44(%r9),%xmm9 + pslld $30,%xmm7 + paddd %xmm6,%xmm11 + + psrld $2,%xmm13 + paddd %xmm8,%xmm11 +.byte 102,15,56,0,229 + movd -44(%r10),%xmm8 + por %xmm7,%xmm13 + movd -44(%r11),%xmm7 + punpckldq %xmm8,%xmm0 + movdqa %xmm11,%xmm8 + paddd %xmm15,%xmm10 + punpckldq %xmm7,%xmm9 + movdqa %xmm12,%xmm7 + movdqa %xmm12,%xmm6 + pslld $5,%xmm8 + pandn %xmm14,%xmm7 + pand %xmm13,%xmm6 + punpckldq %xmm9,%xmm0 + movdqa %xmm11,%xmm9 + + movdqa %xmm4,64-128(%rax) + paddd %xmm4,%xmm10 + movd -40(%r8),%xmm1 + psrld $27,%xmm9 + pxor %xmm7,%xmm6 + movdqa %xmm12,%xmm7 + + por %xmm9,%xmm8 + movd -40(%r9),%xmm9 + pslld $30,%xmm7 + paddd %xmm6,%xmm10 + + psrld $2,%xmm12 + paddd %xmm8,%xmm10 +.byte 102,15,56,0,197 + movd -40(%r10),%xmm8 + por %xmm7,%xmm12 + movd -40(%r11),%xmm7 + punpckldq %xmm8,%xmm1 + movdqa %xmm10,%xmm8 + paddd %xmm15,%xmm14 + punpckldq %xmm7,%xmm9 + movdqa %xmm11,%xmm7 + movdqa %xmm11,%xmm6 + pslld $5,%xmm8 + pandn %xmm13,%xmm7 + pand %xmm12,%xmm6 + punpckldq %xmm9,%xmm1 + movdqa %xmm10,%xmm9 + + movdqa %xmm0,80-128(%rax) + paddd %xmm0,%xmm14 + movd -36(%r8),%xmm2 + psrld $27,%xmm9 + pxor %xmm7,%xmm6 + movdqa %xmm11,%xmm7 + + por %xmm9,%xmm8 + movd -36(%r9),%xmm9 + pslld $30,%xmm7 + paddd %xmm6,%xmm14 + + psrld $2,%xmm11 + paddd %xmm8,%xmm14 +.byte 102,15,56,0,205 + movd -36(%r10),%xmm8 + por %xmm7,%xmm11 + movd -36(%r11),%xmm7 + punpckldq %xmm8,%xmm2 + movdqa %xmm14,%xmm8 + paddd %xmm15,%xmm13 + punpckldq %xmm7,%xmm9 + movdqa %xmm10,%xmm7 + movdqa %xmm10,%xmm6 + pslld $5,%xmm8 + pandn %xmm12,%xmm7 + pand %xmm11,%xmm6 + punpckldq %xmm9,%xmm2 + movdqa %xmm14,%xmm9 + + movdqa %xmm1,96-128(%rax) + paddd %xmm1,%xmm13 + movd -32(%r8),%xmm3 + psrld $27,%xmm9 + pxor %xmm7,%xmm6 + movdqa %xmm10,%xmm7 + + por %xmm9,%xmm8 + movd -32(%r9),%xmm9 + pslld $30,%xmm7 + paddd %xmm6,%xmm13 + + psrld $2,%xmm10 + paddd %xmm8,%xmm13 +.byte 102,15,56,0,213 + movd -32(%r10),%xmm8 + por %xmm7,%xmm10 + movd -32(%r11),%xmm7 + punpckldq %xmm8,%xmm3 + movdqa %xmm13,%xmm8 + paddd %xmm15,%xmm12 + punpckldq %xmm7,%xmm9 + movdqa %xmm14,%xmm7 + movdqa %xmm14,%xmm6 + pslld $5,%xmm8 + pandn %xmm11,%xmm7 + pand %xmm10,%xmm6 + punpckldq %xmm9,%xmm3 + movdqa %xmm13,%xmm9 + + movdqa %xmm2,112-128(%rax) + paddd %xmm2,%xmm12 + movd -28(%r8),%xmm4 + psrld $27,%xmm9 + pxor %xmm7,%xmm6 + movdqa %xmm14,%xmm7 + + por %xmm9,%xmm8 + movd -28(%r9),%xmm9 + pslld $30,%xmm7 + paddd %xmm6,%xmm12 + + psrld $2,%xmm14 + paddd %xmm8,%xmm12 +.byte 102,15,56,0,221 + movd -28(%r10),%xmm8 + por %xmm7,%xmm14 + movd -28(%r11),%xmm7 + punpckldq %xmm8,%xmm4 + movdqa %xmm12,%xmm8 + paddd %xmm15,%xmm11 + punpckldq %xmm7,%xmm9 + movdqa %xmm13,%xmm7 + movdqa %xmm13,%xmm6 + pslld $5,%xmm8 + pandn %xmm10,%xmm7 + pand %xmm14,%xmm6 + punpckldq %xmm9,%xmm4 + movdqa %xmm12,%xmm9 + + movdqa %xmm3,128-128(%rax) + paddd %xmm3,%xmm11 + movd -24(%r8),%xmm0 + psrld $27,%xmm9 + pxor %xmm7,%xmm6 + movdqa %xmm13,%xmm7 + + por %xmm9,%xmm8 + movd -24(%r9),%xmm9 + pslld $30,%xmm7 + paddd %xmm6,%xmm11 + + psrld $2,%xmm13 + paddd %xmm8,%xmm11 +.byte 102,15,56,0,229 + movd -24(%r10),%xmm8 + por %xmm7,%xmm13 + movd -24(%r11),%xmm7 + punpckldq %xmm8,%xmm0 + movdqa %xmm11,%xmm8 + paddd %xmm15,%xmm10 + punpckldq %xmm7,%xmm9 + movdqa %xmm12,%xmm7 + movdqa %xmm12,%xmm6 + pslld $5,%xmm8 + pandn %xmm14,%xmm7 + pand %xmm13,%xmm6 + punpckldq %xmm9,%xmm0 + movdqa %xmm11,%xmm9 + + movdqa %xmm4,144-128(%rax) + paddd %xmm4,%xmm10 + movd -20(%r8),%xmm1 + psrld $27,%xmm9 + pxor %xmm7,%xmm6 + movdqa %xmm12,%xmm7 + + por %xmm9,%xmm8 + movd -20(%r9),%xmm9 + pslld $30,%xmm7 + paddd %xmm6,%xmm10 + + psrld $2,%xmm12 + paddd %xmm8,%xmm10 +.byte 102,15,56,0,197 + movd -20(%r10),%xmm8 + por %xmm7,%xmm12 + movd -20(%r11),%xmm7 + punpckldq %xmm8,%xmm1 + movdqa %xmm10,%xmm8 + paddd %xmm15,%xmm14 + punpckldq %xmm7,%xmm9 + movdqa %xmm11,%xmm7 + movdqa %xmm11,%xmm6 + pslld $5,%xmm8 + pandn %xmm13,%xmm7 + pand %xmm12,%xmm6 + punpckldq %xmm9,%xmm1 + movdqa %xmm10,%xmm9 + + movdqa %xmm0,160-128(%rax) + paddd %xmm0,%xmm14 + movd -16(%r8),%xmm2 + psrld $27,%xmm9 + pxor %xmm7,%xmm6 + movdqa %xmm11,%xmm7 + + por %xmm9,%xmm8 + movd -16(%r9),%xmm9 + pslld $30,%xmm7 + paddd %xmm6,%xmm14 + + psrld $2,%xmm11 + paddd %xmm8,%xmm14 +.byte 102,15,56,0,205 + movd -16(%r10),%xmm8 + por %xmm7,%xmm11 + movd -16(%r11),%xmm7 + punpckldq %xmm8,%xmm2 + movdqa %xmm14,%xmm8 + paddd %xmm15,%xmm13 + punpckldq %xmm7,%xmm9 + movdqa %xmm10,%xmm7 + movdqa %xmm10,%xmm6 + pslld $5,%xmm8 + pandn %xmm12,%xmm7 + pand %xmm11,%xmm6 + punpckldq %xmm9,%xmm2 + movdqa %xmm14,%xmm9 + + movdqa %xmm1,176-128(%rax) + paddd %xmm1,%xmm13 + movd -12(%r8),%xmm3 + psrld $27,%xmm9 + pxor %xmm7,%xmm6 + movdqa %xmm10,%xmm7 + + por %xmm9,%xmm8 + movd -12(%r9),%xmm9 + pslld $30,%xmm7 + paddd %xmm6,%xmm13 + + psrld $2,%xmm10 + paddd %xmm8,%xmm13 +.byte 102,15,56,0,213 + movd -12(%r10),%xmm8 + por %xmm7,%xmm10 + movd -12(%r11),%xmm7 + punpckldq %xmm8,%xmm3 + movdqa %xmm13,%xmm8 + paddd %xmm15,%xmm12 + punpckldq %xmm7,%xmm9 + movdqa %xmm14,%xmm7 + movdqa %xmm14,%xmm6 + pslld $5,%xmm8 + pandn %xmm11,%xmm7 + pand %xmm10,%xmm6 + punpckldq %xmm9,%xmm3 + movdqa %xmm13,%xmm9 + + movdqa %xmm2,192-128(%rax) + paddd %xmm2,%xmm12 + movd -8(%r8),%xmm4 + psrld $27,%xmm9 + pxor %xmm7,%xmm6 + movdqa %xmm14,%xmm7 + + por %xmm9,%xmm8 + movd -8(%r9),%xmm9 + pslld $30,%xmm7 + paddd %xmm6,%xmm12 + + psrld $2,%xmm14 + paddd %xmm8,%xmm12 +.byte 102,15,56,0,221 + movd -8(%r10),%xmm8 + por %xmm7,%xmm14 + movd -8(%r11),%xmm7 + punpckldq %xmm8,%xmm4 + movdqa %xmm12,%xmm8 + paddd %xmm15,%xmm11 + punpckldq %xmm7,%xmm9 + movdqa %xmm13,%xmm7 + movdqa %xmm13,%xmm6 + pslld $5,%xmm8 + pandn %xmm10,%xmm7 + pand %xmm14,%xmm6 + punpckldq %xmm9,%xmm4 + movdqa %xmm12,%xmm9 + + movdqa %xmm3,208-128(%rax) + paddd %xmm3,%xmm11 + movd -4(%r8),%xmm0 + psrld $27,%xmm9 + pxor %xmm7,%xmm6 + movdqa %xmm13,%xmm7 + + por %xmm9,%xmm8 + movd -4(%r9),%xmm9 + pslld $30,%xmm7 + paddd %xmm6,%xmm11 + + psrld $2,%xmm13 + paddd %xmm8,%xmm11 +.byte 102,15,56,0,229 + movd -4(%r10),%xmm8 + por %xmm7,%xmm13 + movdqa 0-128(%rax),%xmm1 + movd -4(%r11),%xmm7 + punpckldq %xmm8,%xmm0 + movdqa %xmm11,%xmm8 + paddd %xmm15,%xmm10 + punpckldq %xmm7,%xmm9 + movdqa %xmm12,%xmm7 + movdqa %xmm12,%xmm6 + pslld $5,%xmm8 + prefetcht0 63(%r8) + pandn %xmm14,%xmm7 + pand %xmm13,%xmm6 + punpckldq %xmm9,%xmm0 + movdqa %xmm11,%xmm9 + + movdqa %xmm4,224-128(%rax) + paddd %xmm4,%xmm10 + psrld $27,%xmm9 + pxor %xmm7,%xmm6 + movdqa %xmm12,%xmm7 + prefetcht0 63(%r9) + + por %xmm9,%xmm8 + pslld $30,%xmm7 + paddd %xmm6,%xmm10 + prefetcht0 63(%r10) + + psrld $2,%xmm12 + paddd %xmm8,%xmm10 +.byte 102,15,56,0,197 + prefetcht0 63(%r11) + por %xmm7,%xmm12 + movdqa 16-128(%rax),%xmm2 + pxor %xmm3,%xmm1 + movdqa 32-128(%rax),%xmm3 + + movdqa %xmm10,%xmm8 + pxor 128-128(%rax),%xmm1 + paddd %xmm15,%xmm14 + movdqa %xmm11,%xmm7 + pslld $5,%xmm8 + pxor %xmm3,%xmm1 + movdqa %xmm11,%xmm6 + pandn %xmm13,%xmm7 + movdqa %xmm1,%xmm5 + pand %xmm12,%xmm6 + movdqa %xmm10,%xmm9 + psrld $31,%xmm5 + paddd %xmm1,%xmm1 + + movdqa %xmm0,240-128(%rax) + paddd %xmm0,%xmm14 + psrld $27,%xmm9 + pxor %xmm7,%xmm6 + + movdqa %xmm11,%xmm7 + por %xmm9,%xmm8 + pslld $30,%xmm7 + paddd %xmm6,%xmm14 + + psrld $2,%xmm11 + paddd %xmm8,%xmm14 + por %xmm5,%xmm1 + por %xmm7,%xmm11 + pxor %xmm4,%xmm2 + movdqa 48-128(%rax),%xmm4 + + movdqa %xmm14,%xmm8 + pxor 144-128(%rax),%xmm2 + paddd %xmm15,%xmm13 + movdqa %xmm10,%xmm7 + pslld $5,%xmm8 + pxor %xmm4,%xmm2 + movdqa %xmm10,%xmm6 + pandn %xmm12,%xmm7 + movdqa %xmm2,%xmm5 + pand %xmm11,%xmm6 + movdqa %xmm14,%xmm9 + psrld $31,%xmm5 + paddd %xmm2,%xmm2 + + movdqa %xmm1,0-128(%rax) + paddd %xmm1,%xmm13 + psrld $27,%xmm9 + pxor %xmm7,%xmm6 + + movdqa %xmm10,%xmm7 + por %xmm9,%xmm8 + pslld $30,%xmm7 + paddd %xmm6,%xmm13 + + psrld $2,%xmm10 + paddd %xmm8,%xmm13 + por %xmm5,%xmm2 + por %xmm7,%xmm10 + pxor %xmm0,%xmm3 + movdqa 64-128(%rax),%xmm0 + + movdqa %xmm13,%xmm8 + pxor 160-128(%rax),%xmm3 + paddd %xmm15,%xmm12 + movdqa %xmm14,%xmm7 + pslld $5,%xmm8 + pxor %xmm0,%xmm3 + movdqa %xmm14,%xmm6 + pandn %xmm11,%xmm7 + movdqa %xmm3,%xmm5 + pand %xmm10,%xmm6 + movdqa %xmm13,%xmm9 + psrld $31,%xmm5 + paddd %xmm3,%xmm3 + + movdqa %xmm2,16-128(%rax) + paddd %xmm2,%xmm12 + psrld $27,%xmm9 + pxor %xmm7,%xmm6 + + movdqa %xmm14,%xmm7 + por %xmm9,%xmm8 + pslld $30,%xmm7 + paddd %xmm6,%xmm12 + + psrld $2,%xmm14 + paddd %xmm8,%xmm12 + por %xmm5,%xmm3 + por %xmm7,%xmm14 + pxor %xmm1,%xmm4 + movdqa 80-128(%rax),%xmm1 + + movdqa %xmm12,%xmm8 + pxor 176-128(%rax),%xmm4 + paddd %xmm15,%xmm11 + movdqa %xmm13,%xmm7 + pslld $5,%xmm8 + pxor %xmm1,%xmm4 + movdqa %xmm13,%xmm6 + pandn %xmm10,%xmm7 + movdqa %xmm4,%xmm5 + pand %xmm14,%xmm6 + movdqa %xmm12,%xmm9 + psrld $31,%xmm5 + paddd %xmm4,%xmm4 + + movdqa %xmm3,32-128(%rax) + paddd %xmm3,%xmm11 + psrld $27,%xmm9 + pxor %xmm7,%xmm6 + + movdqa %xmm13,%xmm7 + por %xmm9,%xmm8 + pslld $30,%xmm7 + paddd %xmm6,%xmm11 + + psrld $2,%xmm13 + paddd %xmm8,%xmm11 + por %xmm5,%xmm4 + por %xmm7,%xmm13 + pxor %xmm2,%xmm0 + movdqa 96-128(%rax),%xmm2 + + movdqa %xmm11,%xmm8 + pxor 192-128(%rax),%xmm0 + paddd %xmm15,%xmm10 + movdqa %xmm12,%xmm7 + pslld $5,%xmm8 + pxor %xmm2,%xmm0 + movdqa %xmm12,%xmm6 + pandn %xmm14,%xmm7 + movdqa %xmm0,%xmm5 + pand %xmm13,%xmm6 + movdqa %xmm11,%xmm9 + psrld $31,%xmm5 + paddd %xmm0,%xmm0 + + movdqa %xmm4,48-128(%rax) + paddd %xmm4,%xmm10 + psrld $27,%xmm9 + pxor %xmm7,%xmm6 + + movdqa %xmm12,%xmm7 + por %xmm9,%xmm8 + pslld $30,%xmm7 + paddd %xmm6,%xmm10 + + psrld $2,%xmm12 + paddd %xmm8,%xmm10 + por %xmm5,%xmm0 + por %xmm7,%xmm12 + movdqa 0(%rbp),%xmm15 + pxor %xmm3,%xmm1 + movdqa 112-128(%rax),%xmm3 + + movdqa %xmm10,%xmm8 + movdqa %xmm13,%xmm6 + pxor 208-128(%rax),%xmm1 + paddd %xmm15,%xmm14 + pslld $5,%xmm8 + pxor %xmm11,%xmm6 + + movdqa %xmm10,%xmm9 + movdqa %xmm0,64-128(%rax) + paddd %xmm0,%xmm14 + pxor %xmm3,%xmm1 + psrld $27,%xmm9 + pxor %xmm12,%xmm6 + movdqa %xmm11,%xmm7 + + pslld $30,%xmm7 + movdqa %xmm1,%xmm5 + por %xmm9,%xmm8 + psrld $31,%xmm5 + paddd %xmm6,%xmm14 + paddd %xmm1,%xmm1 + + psrld $2,%xmm11 + paddd %xmm8,%xmm14 + por %xmm5,%xmm1 + por %xmm7,%xmm11 + pxor %xmm4,%xmm2 + movdqa 128-128(%rax),%xmm4 + + movdqa %xmm14,%xmm8 + movdqa %xmm12,%xmm6 + pxor 224-128(%rax),%xmm2 + paddd %xmm15,%xmm13 + pslld $5,%xmm8 + pxor %xmm10,%xmm6 + + movdqa %xmm14,%xmm9 + movdqa %xmm1,80-128(%rax) + paddd %xmm1,%xmm13 + pxor %xmm4,%xmm2 + psrld $27,%xmm9 + pxor %xmm11,%xmm6 + movdqa %xmm10,%xmm7 + + pslld $30,%xmm7 + movdqa %xmm2,%xmm5 + por %xmm9,%xmm8 + psrld $31,%xmm5 + paddd %xmm6,%xmm13 + paddd %xmm2,%xmm2 + + psrld $2,%xmm10 + paddd %xmm8,%xmm13 + por %xmm5,%xmm2 + por %xmm7,%xmm10 + pxor %xmm0,%xmm3 + movdqa 144-128(%rax),%xmm0 + + movdqa %xmm13,%xmm8 + movdqa %xmm11,%xmm6 + pxor 240-128(%rax),%xmm3 + paddd %xmm15,%xmm12 + pslld $5,%xmm8 + pxor %xmm14,%xmm6 + + movdqa %xmm13,%xmm9 + movdqa %xmm2,96-128(%rax) + paddd %xmm2,%xmm12 + pxor %xmm0,%xmm3 + psrld $27,%xmm9 + pxor %xmm10,%xmm6 + movdqa %xmm14,%xmm7 + + pslld $30,%xmm7 + movdqa %xmm3,%xmm5 + por %xmm9,%xmm8 + psrld $31,%xmm5 + paddd %xmm6,%xmm12 + paddd %xmm3,%xmm3 + + psrld $2,%xmm14 + paddd %xmm8,%xmm12 + por %xmm5,%xmm3 + por %xmm7,%xmm14 + pxor %xmm1,%xmm4 + movdqa 160-128(%rax),%xmm1 + + movdqa %xmm12,%xmm8 + movdqa %xmm10,%xmm6 + pxor 0-128(%rax),%xmm4 + paddd %xmm15,%xmm11 + pslld $5,%xmm8 + pxor %xmm13,%xmm6 + + movdqa %xmm12,%xmm9 + movdqa %xmm3,112-128(%rax) + paddd %xmm3,%xmm11 + pxor %xmm1,%xmm4 + psrld $27,%xmm9 + pxor %xmm14,%xmm6 + movdqa %xmm13,%xmm7 + + pslld $30,%xmm7 + movdqa %xmm4,%xmm5 + por %xmm9,%xmm8 + psrld $31,%xmm5 + paddd %xmm6,%xmm11 + paddd %xmm4,%xmm4 + + psrld $2,%xmm13 + paddd %xmm8,%xmm11 + por %xmm5,%xmm4 + por %xmm7,%xmm13 + pxor %xmm2,%xmm0 + movdqa 176-128(%rax),%xmm2 + + movdqa %xmm11,%xmm8 + movdqa %xmm14,%xmm6 + pxor 16-128(%rax),%xmm0 + paddd %xmm15,%xmm10 + pslld $5,%xmm8 + pxor %xmm12,%xmm6 + + movdqa %xmm11,%xmm9 + movdqa %xmm4,128-128(%rax) + paddd %xmm4,%xmm10 + pxor %xmm2,%xmm0 + psrld $27,%xmm9 + pxor %xmm13,%xmm6 + movdqa %xmm12,%xmm7 + + pslld $30,%xmm7 + movdqa %xmm0,%xmm5 + por %xmm9,%xmm8 + psrld $31,%xmm5 + paddd %xmm6,%xmm10 + paddd %xmm0,%xmm0 + + psrld $2,%xmm12 + paddd %xmm8,%xmm10 + por %xmm5,%xmm0 + por %xmm7,%xmm12 + pxor %xmm3,%xmm1 + movdqa 192-128(%rax),%xmm3 + + movdqa %xmm10,%xmm8 + movdqa %xmm13,%xmm6 + pxor 32-128(%rax),%xmm1 + paddd %xmm15,%xmm14 + pslld $5,%xmm8 + pxor %xmm11,%xmm6 + + movdqa %xmm10,%xmm9 + movdqa %xmm0,144-128(%rax) + paddd %xmm0,%xmm14 + pxor %xmm3,%xmm1 + psrld $27,%xmm9 + pxor %xmm12,%xmm6 + movdqa %xmm11,%xmm7 + + pslld $30,%xmm7 + movdqa %xmm1,%xmm5 + por %xmm9,%xmm8 + psrld $31,%xmm5 + paddd %xmm6,%xmm14 + paddd %xmm1,%xmm1 + + psrld $2,%xmm11 + paddd %xmm8,%xmm14 + por %xmm5,%xmm1 + por %xmm7,%xmm11 + pxor %xmm4,%xmm2 + movdqa 208-128(%rax),%xmm4 + + movdqa %xmm14,%xmm8 + movdqa %xmm12,%xmm6 + pxor 48-128(%rax),%xmm2 + paddd %xmm15,%xmm13 + pslld $5,%xmm8 + pxor %xmm10,%xmm6 + + movdqa %xmm14,%xmm9 + movdqa %xmm1,160-128(%rax) + paddd %xmm1,%xmm13 + pxor %xmm4,%xmm2 + psrld $27,%xmm9 + pxor %xmm11,%xmm6 + movdqa %xmm10,%xmm7 + + pslld $30,%xmm7 + movdqa %xmm2,%xmm5 + por %xmm9,%xmm8 + psrld $31,%xmm5 + paddd %xmm6,%xmm13 + paddd %xmm2,%xmm2 + + psrld $2,%xmm10 + paddd %xmm8,%xmm13 + por %xmm5,%xmm2 + por %xmm7,%xmm10 + pxor %xmm0,%xmm3 + movdqa 224-128(%rax),%xmm0 + + movdqa %xmm13,%xmm8 + movdqa %xmm11,%xmm6 + pxor 64-128(%rax),%xmm3 + paddd %xmm15,%xmm12 + pslld $5,%xmm8 + pxor %xmm14,%xmm6 + + movdqa %xmm13,%xmm9 + movdqa %xmm2,176-128(%rax) + paddd %xmm2,%xmm12 + pxor %xmm0,%xmm3 + psrld $27,%xmm9 + pxor %xmm10,%xmm6 + movdqa %xmm14,%xmm7 + + pslld $30,%xmm7 + movdqa %xmm3,%xmm5 + por %xmm9,%xmm8 + psrld $31,%xmm5 + paddd %xmm6,%xmm12 + paddd %xmm3,%xmm3 + + psrld $2,%xmm14 + paddd %xmm8,%xmm12 + por %xmm5,%xmm3 + por %xmm7,%xmm14 + pxor %xmm1,%xmm4 + movdqa 240-128(%rax),%xmm1 + + movdqa %xmm12,%xmm8 + movdqa %xmm10,%xmm6 + pxor 80-128(%rax),%xmm4 + paddd %xmm15,%xmm11 + pslld $5,%xmm8 + pxor %xmm13,%xmm6 + + movdqa %xmm12,%xmm9 + movdqa %xmm3,192-128(%rax) + paddd %xmm3,%xmm11 + pxor %xmm1,%xmm4 + psrld $27,%xmm9 + pxor %xmm14,%xmm6 + movdqa %xmm13,%xmm7 + + pslld $30,%xmm7 + movdqa %xmm4,%xmm5 + por %xmm9,%xmm8 + psrld $31,%xmm5 + paddd %xmm6,%xmm11 + paddd %xmm4,%xmm4 + + psrld $2,%xmm13 + paddd %xmm8,%xmm11 + por %xmm5,%xmm4 + por %xmm7,%xmm13 + pxor %xmm2,%xmm0 + movdqa 0-128(%rax),%xmm2 + + movdqa %xmm11,%xmm8 + movdqa %xmm14,%xmm6 + pxor 96-128(%rax),%xmm0 + paddd %xmm15,%xmm10 + pslld $5,%xmm8 + pxor %xmm12,%xmm6 + + movdqa %xmm11,%xmm9 + movdqa %xmm4,208-128(%rax) + paddd %xmm4,%xmm10 + pxor %xmm2,%xmm0 + psrld $27,%xmm9 + pxor %xmm13,%xmm6 + movdqa %xmm12,%xmm7 + + pslld $30,%xmm7 + movdqa %xmm0,%xmm5 + por %xmm9,%xmm8 + psrld $31,%xmm5 + paddd %xmm6,%xmm10 + paddd %xmm0,%xmm0 + + psrld $2,%xmm12 + paddd %xmm8,%xmm10 + por %xmm5,%xmm0 + por %xmm7,%xmm12 + pxor %xmm3,%xmm1 + movdqa 16-128(%rax),%xmm3 + + movdqa %xmm10,%xmm8 + movdqa %xmm13,%xmm6 + pxor 112-128(%rax),%xmm1 + paddd %xmm15,%xmm14 + pslld $5,%xmm8 + pxor %xmm11,%xmm6 + + movdqa %xmm10,%xmm9 + movdqa %xmm0,224-128(%rax) + paddd %xmm0,%xmm14 + pxor %xmm3,%xmm1 + psrld $27,%xmm9 + pxor %xmm12,%xmm6 + movdqa %xmm11,%xmm7 + + pslld $30,%xmm7 + movdqa %xmm1,%xmm5 + por %xmm9,%xmm8 + psrld $31,%xmm5 + paddd %xmm6,%xmm14 + paddd %xmm1,%xmm1 + + psrld $2,%xmm11 + paddd %xmm8,%xmm14 + por %xmm5,%xmm1 + por %xmm7,%xmm11 + pxor %xmm4,%xmm2 + movdqa 32-128(%rax),%xmm4 + + movdqa %xmm14,%xmm8 + movdqa %xmm12,%xmm6 + pxor 128-128(%rax),%xmm2 + paddd %xmm15,%xmm13 + pslld $5,%xmm8 + pxor %xmm10,%xmm6 + + movdqa %xmm14,%xmm9 + movdqa %xmm1,240-128(%rax) + paddd %xmm1,%xmm13 + pxor %xmm4,%xmm2 + psrld $27,%xmm9 + pxor %xmm11,%xmm6 + movdqa %xmm10,%xmm7 + + pslld $30,%xmm7 + movdqa %xmm2,%xmm5 + por %xmm9,%xmm8 + psrld $31,%xmm5 + paddd %xmm6,%xmm13 + paddd %xmm2,%xmm2 + + psrld $2,%xmm10 + paddd %xmm8,%xmm13 + por %xmm5,%xmm2 + por %xmm7,%xmm10 + pxor %xmm0,%xmm3 + movdqa 48-128(%rax),%xmm0 + + movdqa %xmm13,%xmm8 + movdqa %xmm11,%xmm6 + pxor 144-128(%rax),%xmm3 + paddd %xmm15,%xmm12 + pslld $5,%xmm8 + pxor %xmm14,%xmm6 + + movdqa %xmm13,%xmm9 + movdqa %xmm2,0-128(%rax) + paddd %xmm2,%xmm12 + pxor %xmm0,%xmm3 + psrld $27,%xmm9 + pxor %xmm10,%xmm6 + movdqa %xmm14,%xmm7 + + pslld $30,%xmm7 + movdqa %xmm3,%xmm5 + por %xmm9,%xmm8 + psrld $31,%xmm5 + paddd %xmm6,%xmm12 + paddd %xmm3,%xmm3 + + psrld $2,%xmm14 + paddd %xmm8,%xmm12 + por %xmm5,%xmm3 + por %xmm7,%xmm14 + pxor %xmm1,%xmm4 + movdqa 64-128(%rax),%xmm1 + + movdqa %xmm12,%xmm8 + movdqa %xmm10,%xmm6 + pxor 160-128(%rax),%xmm4 + paddd %xmm15,%xmm11 + pslld $5,%xmm8 + pxor %xmm13,%xmm6 + + movdqa %xmm12,%xmm9 + movdqa %xmm3,16-128(%rax) + paddd %xmm3,%xmm11 + pxor %xmm1,%xmm4 + psrld $27,%xmm9 + pxor %xmm14,%xmm6 + movdqa %xmm13,%xmm7 + + pslld $30,%xmm7 + movdqa %xmm4,%xmm5 + por %xmm9,%xmm8 + psrld $31,%xmm5 + paddd %xmm6,%xmm11 + paddd %xmm4,%xmm4 + + psrld $2,%xmm13 + paddd %xmm8,%xmm11 + por %xmm5,%xmm4 + por %xmm7,%xmm13 + pxor %xmm2,%xmm0 + movdqa 80-128(%rax),%xmm2 + + movdqa %xmm11,%xmm8 + movdqa %xmm14,%xmm6 + pxor 176-128(%rax),%xmm0 + paddd %xmm15,%xmm10 + pslld $5,%xmm8 + pxor %xmm12,%xmm6 + + movdqa %xmm11,%xmm9 + movdqa %xmm4,32-128(%rax) + paddd %xmm4,%xmm10 + pxor %xmm2,%xmm0 + psrld $27,%xmm9 + pxor %xmm13,%xmm6 + movdqa %xmm12,%xmm7 + + pslld $30,%xmm7 + movdqa %xmm0,%xmm5 + por %xmm9,%xmm8 + psrld $31,%xmm5 + paddd %xmm6,%xmm10 + paddd %xmm0,%xmm0 + + psrld $2,%xmm12 + paddd %xmm8,%xmm10 + por %xmm5,%xmm0 + por %xmm7,%xmm12 + pxor %xmm3,%xmm1 + movdqa 96-128(%rax),%xmm3 + + movdqa %xmm10,%xmm8 + movdqa %xmm13,%xmm6 + pxor 192-128(%rax),%xmm1 + paddd %xmm15,%xmm14 + pslld $5,%xmm8 + pxor %xmm11,%xmm6 + + movdqa %xmm10,%xmm9 + movdqa %xmm0,48-128(%rax) + paddd %xmm0,%xmm14 + pxor %xmm3,%xmm1 + psrld $27,%xmm9 + pxor %xmm12,%xmm6 + movdqa %xmm11,%xmm7 + + pslld $30,%xmm7 + movdqa %xmm1,%xmm5 + por %xmm9,%xmm8 + psrld $31,%xmm5 + paddd %xmm6,%xmm14 + paddd %xmm1,%xmm1 + + psrld $2,%xmm11 + paddd %xmm8,%xmm14 + por %xmm5,%xmm1 + por %xmm7,%xmm11 + pxor %xmm4,%xmm2 + movdqa 112-128(%rax),%xmm4 + + movdqa %xmm14,%xmm8 + movdqa %xmm12,%xmm6 + pxor 208-128(%rax),%xmm2 + paddd %xmm15,%xmm13 + pslld $5,%xmm8 + pxor %xmm10,%xmm6 + + movdqa %xmm14,%xmm9 + movdqa %xmm1,64-128(%rax) + paddd %xmm1,%xmm13 + pxor %xmm4,%xmm2 + psrld $27,%xmm9 + pxor %xmm11,%xmm6 + movdqa %xmm10,%xmm7 + + pslld $30,%xmm7 + movdqa %xmm2,%xmm5 + por %xmm9,%xmm8 + psrld $31,%xmm5 + paddd %xmm6,%xmm13 + paddd %xmm2,%xmm2 + + psrld $2,%xmm10 + paddd %xmm8,%xmm13 + por %xmm5,%xmm2 + por %xmm7,%xmm10 + pxor %xmm0,%xmm3 + movdqa 128-128(%rax),%xmm0 + + movdqa %xmm13,%xmm8 + movdqa %xmm11,%xmm6 + pxor 224-128(%rax),%xmm3 + paddd %xmm15,%xmm12 + pslld $5,%xmm8 + pxor %xmm14,%xmm6 + + movdqa %xmm13,%xmm9 + movdqa %xmm2,80-128(%rax) + paddd %xmm2,%xmm12 + pxor %xmm0,%xmm3 + psrld $27,%xmm9 + pxor %xmm10,%xmm6 + movdqa %xmm14,%xmm7 + + pslld $30,%xmm7 + movdqa %xmm3,%xmm5 + por %xmm9,%xmm8 + psrld $31,%xmm5 + paddd %xmm6,%xmm12 + paddd %xmm3,%xmm3 + + psrld $2,%xmm14 + paddd %xmm8,%xmm12 + por %xmm5,%xmm3 + por %xmm7,%xmm14 + pxor %xmm1,%xmm4 + movdqa 144-128(%rax),%xmm1 + + movdqa %xmm12,%xmm8 + movdqa %xmm10,%xmm6 + pxor 240-128(%rax),%xmm4 + paddd %xmm15,%xmm11 + pslld $5,%xmm8 + pxor %xmm13,%xmm6 + + movdqa %xmm12,%xmm9 + movdqa %xmm3,96-128(%rax) + paddd %xmm3,%xmm11 + pxor %xmm1,%xmm4 + psrld $27,%xmm9 + pxor %xmm14,%xmm6 + movdqa %xmm13,%xmm7 + + pslld $30,%xmm7 + movdqa %xmm4,%xmm5 + por %xmm9,%xmm8 + psrld $31,%xmm5 + paddd %xmm6,%xmm11 + paddd %xmm4,%xmm4 + + psrld $2,%xmm13 + paddd %xmm8,%xmm11 + por %xmm5,%xmm4 + por %xmm7,%xmm13 + pxor %xmm2,%xmm0 + movdqa 160-128(%rax),%xmm2 + + movdqa %xmm11,%xmm8 + movdqa %xmm14,%xmm6 + pxor 0-128(%rax),%xmm0 + paddd %xmm15,%xmm10 + pslld $5,%xmm8 + pxor %xmm12,%xmm6 + + movdqa %xmm11,%xmm9 + movdqa %xmm4,112-128(%rax) + paddd %xmm4,%xmm10 + pxor %xmm2,%xmm0 + psrld $27,%xmm9 + pxor %xmm13,%xmm6 + movdqa %xmm12,%xmm7 + + pslld $30,%xmm7 + movdqa %xmm0,%xmm5 + por %xmm9,%xmm8 + psrld $31,%xmm5 + paddd %xmm6,%xmm10 + paddd %xmm0,%xmm0 + + psrld $2,%xmm12 + paddd %xmm8,%xmm10 + por %xmm5,%xmm0 + por %xmm7,%xmm12 + movdqa 32(%rbp),%xmm15 + pxor %xmm3,%xmm1 + movdqa 176-128(%rax),%xmm3 + + movdqa %xmm10,%xmm8 + movdqa %xmm13,%xmm7 + pxor 16-128(%rax),%xmm1 + pxor %xmm3,%xmm1 + paddd %xmm15,%xmm14 + pslld $5,%xmm8 + movdqa %xmm10,%xmm9 + pand %xmm12,%xmm7 + + movdqa %xmm13,%xmm6 + movdqa %xmm1,%xmm5 + psrld $27,%xmm9 + paddd %xmm7,%xmm14 + pxor %xmm12,%xmm6 + + movdqa %xmm0,128-128(%rax) + paddd %xmm0,%xmm14 + por %xmm9,%xmm8 + psrld $31,%xmm5 + pand %xmm11,%xmm6 + movdqa %xmm11,%xmm7 + + pslld $30,%xmm7 + paddd %xmm1,%xmm1 + paddd %xmm6,%xmm14 + + psrld $2,%xmm11 + paddd %xmm8,%xmm14 + por %xmm5,%xmm1 + por %xmm7,%xmm11 + pxor %xmm4,%xmm2 + movdqa 192-128(%rax),%xmm4 + + movdqa %xmm14,%xmm8 + movdqa %xmm12,%xmm7 + pxor 32-128(%rax),%xmm2 + pxor %xmm4,%xmm2 + paddd %xmm15,%xmm13 + pslld $5,%xmm8 + movdqa %xmm14,%xmm9 + pand %xmm11,%xmm7 + + movdqa %xmm12,%xmm6 + movdqa %xmm2,%xmm5 + psrld $27,%xmm9 + paddd %xmm7,%xmm13 + pxor %xmm11,%xmm6 + + movdqa %xmm1,144-128(%rax) + paddd %xmm1,%xmm13 + por %xmm9,%xmm8 + psrld $31,%xmm5 + pand %xmm10,%xmm6 + movdqa %xmm10,%xmm7 + + pslld $30,%xmm7 + paddd %xmm2,%xmm2 + paddd %xmm6,%xmm13 + + psrld $2,%xmm10 + paddd %xmm8,%xmm13 + por %xmm5,%xmm2 + por %xmm7,%xmm10 + pxor %xmm0,%xmm3 + movdqa 208-128(%rax),%xmm0 + + movdqa %xmm13,%xmm8 + movdqa %xmm11,%xmm7 + pxor 48-128(%rax),%xmm3 + pxor %xmm0,%xmm3 + paddd %xmm15,%xmm12 + pslld $5,%xmm8 + movdqa %xmm13,%xmm9 + pand %xmm10,%xmm7 + + movdqa %xmm11,%xmm6 + movdqa %xmm3,%xmm5 + psrld $27,%xmm9 + paddd %xmm7,%xmm12 + pxor %xmm10,%xmm6 + + movdqa %xmm2,160-128(%rax) + paddd %xmm2,%xmm12 + por %xmm9,%xmm8 + psrld $31,%xmm5 + pand %xmm14,%xmm6 + movdqa %xmm14,%xmm7 + + pslld $30,%xmm7 + paddd %xmm3,%xmm3 + paddd %xmm6,%xmm12 + + psrld $2,%xmm14 + paddd %xmm8,%xmm12 + por %xmm5,%xmm3 + por %xmm7,%xmm14 + pxor %xmm1,%xmm4 + movdqa 224-128(%rax),%xmm1 + + movdqa %xmm12,%xmm8 + movdqa %xmm10,%xmm7 + pxor 64-128(%rax),%xmm4 + pxor %xmm1,%xmm4 + paddd %xmm15,%xmm11 + pslld $5,%xmm8 + movdqa %xmm12,%xmm9 + pand %xmm14,%xmm7 + + movdqa %xmm10,%xmm6 + movdqa %xmm4,%xmm5 + psrld $27,%xmm9 + paddd %xmm7,%xmm11 + pxor %xmm14,%xmm6 + + movdqa %xmm3,176-128(%rax) + paddd %xmm3,%xmm11 + por %xmm9,%xmm8 + psrld $31,%xmm5 + pand %xmm13,%xmm6 + movdqa %xmm13,%xmm7 + + pslld $30,%xmm7 + paddd %xmm4,%xmm4 + paddd %xmm6,%xmm11 + + psrld $2,%xmm13 + paddd %xmm8,%xmm11 + por %xmm5,%xmm4 + por %xmm7,%xmm13 + pxor %xmm2,%xmm0 + movdqa 240-128(%rax),%xmm2 + + movdqa %xmm11,%xmm8 + movdqa %xmm14,%xmm7 + pxor 80-128(%rax),%xmm0 + pxor %xmm2,%xmm0 + paddd %xmm15,%xmm10 + pslld $5,%xmm8 + movdqa %xmm11,%xmm9 + pand %xmm13,%xmm7 + + movdqa %xmm14,%xmm6 + movdqa %xmm0,%xmm5 + psrld $27,%xmm9 + paddd %xmm7,%xmm10 + pxor %xmm13,%xmm6 + + movdqa %xmm4,192-128(%rax) + paddd %xmm4,%xmm10 + por %xmm9,%xmm8 + psrld $31,%xmm5 + pand %xmm12,%xmm6 + movdqa %xmm12,%xmm7 + + pslld $30,%xmm7 + paddd %xmm0,%xmm0 + paddd %xmm6,%xmm10 + + psrld $2,%xmm12 + paddd %xmm8,%xmm10 + por %xmm5,%xmm0 + por %xmm7,%xmm12 + pxor %xmm3,%xmm1 + movdqa 0-128(%rax),%xmm3 + + movdqa %xmm10,%xmm8 + movdqa %xmm13,%xmm7 + pxor 96-128(%rax),%xmm1 + pxor %xmm3,%xmm1 + paddd %xmm15,%xmm14 + pslld $5,%xmm8 + movdqa %xmm10,%xmm9 + pand %xmm12,%xmm7 + + movdqa %xmm13,%xmm6 + movdqa %xmm1,%xmm5 + psrld $27,%xmm9 + paddd %xmm7,%xmm14 + pxor %xmm12,%xmm6 + + movdqa %xmm0,208-128(%rax) + paddd %xmm0,%xmm14 + por %xmm9,%xmm8 + psrld $31,%xmm5 + pand %xmm11,%xmm6 + movdqa %xmm11,%xmm7 + + pslld $30,%xmm7 + paddd %xmm1,%xmm1 + paddd %xmm6,%xmm14 + + psrld $2,%xmm11 + paddd %xmm8,%xmm14 + por %xmm5,%xmm1 + por %xmm7,%xmm11 + pxor %xmm4,%xmm2 + movdqa 16-128(%rax),%xmm4 + + movdqa %xmm14,%xmm8 + movdqa %xmm12,%xmm7 + pxor 112-128(%rax),%xmm2 + pxor %xmm4,%xmm2 + paddd %xmm15,%xmm13 + pslld $5,%xmm8 + movdqa %xmm14,%xmm9 + pand %xmm11,%xmm7 + + movdqa %xmm12,%xmm6 + movdqa %xmm2,%xmm5 + psrld $27,%xmm9 + paddd %xmm7,%xmm13 + pxor %xmm11,%xmm6 + + movdqa %xmm1,224-128(%rax) + paddd %xmm1,%xmm13 + por %xmm9,%xmm8 + psrld $31,%xmm5 + pand %xmm10,%xmm6 + movdqa %xmm10,%xmm7 + + pslld $30,%xmm7 + paddd %xmm2,%xmm2 + paddd %xmm6,%xmm13 + + psrld $2,%xmm10 + paddd %xmm8,%xmm13 + por %xmm5,%xmm2 + por %xmm7,%xmm10 + pxor %xmm0,%xmm3 + movdqa 32-128(%rax),%xmm0 + + movdqa %xmm13,%xmm8 + movdqa %xmm11,%xmm7 + pxor 128-128(%rax),%xmm3 + pxor %xmm0,%xmm3 + paddd %xmm15,%xmm12 + pslld $5,%xmm8 + movdqa %xmm13,%xmm9 + pand %xmm10,%xmm7 + + movdqa %xmm11,%xmm6 + movdqa %xmm3,%xmm5 + psrld $27,%xmm9 + paddd %xmm7,%xmm12 + pxor %xmm10,%xmm6 + + movdqa %xmm2,240-128(%rax) + paddd %xmm2,%xmm12 + por %xmm9,%xmm8 + psrld $31,%xmm5 + pand %xmm14,%xmm6 + movdqa %xmm14,%xmm7 + + pslld $30,%xmm7 + paddd %xmm3,%xmm3 + paddd %xmm6,%xmm12 + + psrld $2,%xmm14 + paddd %xmm8,%xmm12 + por %xmm5,%xmm3 + por %xmm7,%xmm14 + pxor %xmm1,%xmm4 + movdqa 48-128(%rax),%xmm1 + + movdqa %xmm12,%xmm8 + movdqa %xmm10,%xmm7 + pxor 144-128(%rax),%xmm4 + pxor %xmm1,%xmm4 + paddd %xmm15,%xmm11 + pslld $5,%xmm8 + movdqa %xmm12,%xmm9 + pand %xmm14,%xmm7 + + movdqa %xmm10,%xmm6 + movdqa %xmm4,%xmm5 + psrld $27,%xmm9 + paddd %xmm7,%xmm11 + pxor %xmm14,%xmm6 + + movdqa %xmm3,0-128(%rax) + paddd %xmm3,%xmm11 + por %xmm9,%xmm8 + psrld $31,%xmm5 + pand %xmm13,%xmm6 + movdqa %xmm13,%xmm7 + + pslld $30,%xmm7 + paddd %xmm4,%xmm4 + paddd %xmm6,%xmm11 + + psrld $2,%xmm13 + paddd %xmm8,%xmm11 + por %xmm5,%xmm4 + por %xmm7,%xmm13 + pxor %xmm2,%xmm0 + movdqa 64-128(%rax),%xmm2 + + movdqa %xmm11,%xmm8 + movdqa %xmm14,%xmm7 + pxor 160-128(%rax),%xmm0 + pxor %xmm2,%xmm0 + paddd %xmm15,%xmm10 + pslld $5,%xmm8 + movdqa %xmm11,%xmm9 + pand %xmm13,%xmm7 + + movdqa %xmm14,%xmm6 + movdqa %xmm0,%xmm5 + psrld $27,%xmm9 + paddd %xmm7,%xmm10 + pxor %xmm13,%xmm6 + + movdqa %xmm4,16-128(%rax) + paddd %xmm4,%xmm10 + por %xmm9,%xmm8 + psrld $31,%xmm5 + pand %xmm12,%xmm6 + movdqa %xmm12,%xmm7 + + pslld $30,%xmm7 + paddd %xmm0,%xmm0 + paddd %xmm6,%xmm10 + + psrld $2,%xmm12 + paddd %xmm8,%xmm10 + por %xmm5,%xmm0 + por %xmm7,%xmm12 + pxor %xmm3,%xmm1 + movdqa 80-128(%rax),%xmm3 + + movdqa %xmm10,%xmm8 + movdqa %xmm13,%xmm7 + pxor 176-128(%rax),%xmm1 + pxor %xmm3,%xmm1 + paddd %xmm15,%xmm14 + pslld $5,%xmm8 + movdqa %xmm10,%xmm9 + pand %xmm12,%xmm7 + + movdqa %xmm13,%xmm6 + movdqa %xmm1,%xmm5 + psrld $27,%xmm9 + paddd %xmm7,%xmm14 + pxor %xmm12,%xmm6 + + movdqa %xmm0,32-128(%rax) + paddd %xmm0,%xmm14 + por %xmm9,%xmm8 + psrld $31,%xmm5 + pand %xmm11,%xmm6 + movdqa %xmm11,%xmm7 + + pslld $30,%xmm7 + paddd %xmm1,%xmm1 + paddd %xmm6,%xmm14 + + psrld $2,%xmm11 + paddd %xmm8,%xmm14 + por %xmm5,%xmm1 + por %xmm7,%xmm11 + pxor %xmm4,%xmm2 + movdqa 96-128(%rax),%xmm4 + + movdqa %xmm14,%xmm8 + movdqa %xmm12,%xmm7 + pxor 192-128(%rax),%xmm2 + pxor %xmm4,%xmm2 + paddd %xmm15,%xmm13 + pslld $5,%xmm8 + movdqa %xmm14,%xmm9 + pand %xmm11,%xmm7 + + movdqa %xmm12,%xmm6 + movdqa %xmm2,%xmm5 + psrld $27,%xmm9 + paddd %xmm7,%xmm13 + pxor %xmm11,%xmm6 + + movdqa %xmm1,48-128(%rax) + paddd %xmm1,%xmm13 + por %xmm9,%xmm8 + psrld $31,%xmm5 + pand %xmm10,%xmm6 + movdqa %xmm10,%xmm7 + + pslld $30,%xmm7 + paddd %xmm2,%xmm2 + paddd %xmm6,%xmm13 + + psrld $2,%xmm10 + paddd %xmm8,%xmm13 + por %xmm5,%xmm2 + por %xmm7,%xmm10 + pxor %xmm0,%xmm3 + movdqa 112-128(%rax),%xmm0 + + movdqa %xmm13,%xmm8 + movdqa %xmm11,%xmm7 + pxor 208-128(%rax),%xmm3 + pxor %xmm0,%xmm3 + paddd %xmm15,%xmm12 + pslld $5,%xmm8 + movdqa %xmm13,%xmm9 + pand %xmm10,%xmm7 + + movdqa %xmm11,%xmm6 + movdqa %xmm3,%xmm5 + psrld $27,%xmm9 + paddd %xmm7,%xmm12 + pxor %xmm10,%xmm6 + + movdqa %xmm2,64-128(%rax) + paddd %xmm2,%xmm12 + por %xmm9,%xmm8 + psrld $31,%xmm5 + pand %xmm14,%xmm6 + movdqa %xmm14,%xmm7 + + pslld $30,%xmm7 + paddd %xmm3,%xmm3 + paddd %xmm6,%xmm12 + + psrld $2,%xmm14 + paddd %xmm8,%xmm12 + por %xmm5,%xmm3 + por %xmm7,%xmm14 + pxor %xmm1,%xmm4 + movdqa 128-128(%rax),%xmm1 + + movdqa %xmm12,%xmm8 + movdqa %xmm10,%xmm7 + pxor 224-128(%rax),%xmm4 + pxor %xmm1,%xmm4 + paddd %xmm15,%xmm11 + pslld $5,%xmm8 + movdqa %xmm12,%xmm9 + pand %xmm14,%xmm7 + + movdqa %xmm10,%xmm6 + movdqa %xmm4,%xmm5 + psrld $27,%xmm9 + paddd %xmm7,%xmm11 + pxor %xmm14,%xmm6 + + movdqa %xmm3,80-128(%rax) + paddd %xmm3,%xmm11 + por %xmm9,%xmm8 + psrld $31,%xmm5 + pand %xmm13,%xmm6 + movdqa %xmm13,%xmm7 + + pslld $30,%xmm7 + paddd %xmm4,%xmm4 + paddd %xmm6,%xmm11 + + psrld $2,%xmm13 + paddd %xmm8,%xmm11 + por %xmm5,%xmm4 + por %xmm7,%xmm13 + pxor %xmm2,%xmm0 + movdqa 144-128(%rax),%xmm2 + + movdqa %xmm11,%xmm8 + movdqa %xmm14,%xmm7 + pxor 240-128(%rax),%xmm0 + pxor %xmm2,%xmm0 + paddd %xmm15,%xmm10 + pslld $5,%xmm8 + movdqa %xmm11,%xmm9 + pand %xmm13,%xmm7 + + movdqa %xmm14,%xmm6 + movdqa %xmm0,%xmm5 + psrld $27,%xmm9 + paddd %xmm7,%xmm10 + pxor %xmm13,%xmm6 + + movdqa %xmm4,96-128(%rax) + paddd %xmm4,%xmm10 + por %xmm9,%xmm8 + psrld $31,%xmm5 + pand %xmm12,%xmm6 + movdqa %xmm12,%xmm7 + + pslld $30,%xmm7 + paddd %xmm0,%xmm0 + paddd %xmm6,%xmm10 + + psrld $2,%xmm12 + paddd %xmm8,%xmm10 + por %xmm5,%xmm0 + por %xmm7,%xmm12 + pxor %xmm3,%xmm1 + movdqa 160-128(%rax),%xmm3 + + movdqa %xmm10,%xmm8 + movdqa %xmm13,%xmm7 + pxor 0-128(%rax),%xmm1 + pxor %xmm3,%xmm1 + paddd %xmm15,%xmm14 + pslld $5,%xmm8 + movdqa %xmm10,%xmm9 + pand %xmm12,%xmm7 + + movdqa %xmm13,%xmm6 + movdqa %xmm1,%xmm5 + psrld $27,%xmm9 + paddd %xmm7,%xmm14 + pxor %xmm12,%xmm6 + + movdqa %xmm0,112-128(%rax) + paddd %xmm0,%xmm14 + por %xmm9,%xmm8 + psrld $31,%xmm5 + pand %xmm11,%xmm6 + movdqa %xmm11,%xmm7 + + pslld $30,%xmm7 + paddd %xmm1,%xmm1 + paddd %xmm6,%xmm14 + + psrld $2,%xmm11 + paddd %xmm8,%xmm14 + por %xmm5,%xmm1 + por %xmm7,%xmm11 + pxor %xmm4,%xmm2 + movdqa 176-128(%rax),%xmm4 + + movdqa %xmm14,%xmm8 + movdqa %xmm12,%xmm7 + pxor 16-128(%rax),%xmm2 + pxor %xmm4,%xmm2 + paddd %xmm15,%xmm13 + pslld $5,%xmm8 + movdqa %xmm14,%xmm9 + pand %xmm11,%xmm7 + + movdqa %xmm12,%xmm6 + movdqa %xmm2,%xmm5 + psrld $27,%xmm9 + paddd %xmm7,%xmm13 + pxor %xmm11,%xmm6 + + movdqa %xmm1,128-128(%rax) + paddd %xmm1,%xmm13 + por %xmm9,%xmm8 + psrld $31,%xmm5 + pand %xmm10,%xmm6 + movdqa %xmm10,%xmm7 + + pslld $30,%xmm7 + paddd %xmm2,%xmm2 + paddd %xmm6,%xmm13 + + psrld $2,%xmm10 + paddd %xmm8,%xmm13 + por %xmm5,%xmm2 + por %xmm7,%xmm10 + pxor %xmm0,%xmm3 + movdqa 192-128(%rax),%xmm0 + + movdqa %xmm13,%xmm8 + movdqa %xmm11,%xmm7 + pxor 32-128(%rax),%xmm3 + pxor %xmm0,%xmm3 + paddd %xmm15,%xmm12 + pslld $5,%xmm8 + movdqa %xmm13,%xmm9 + pand %xmm10,%xmm7 + + movdqa %xmm11,%xmm6 + movdqa %xmm3,%xmm5 + psrld $27,%xmm9 + paddd %xmm7,%xmm12 + pxor %xmm10,%xmm6 + + movdqa %xmm2,144-128(%rax) + paddd %xmm2,%xmm12 + por %xmm9,%xmm8 + psrld $31,%xmm5 + pand %xmm14,%xmm6 + movdqa %xmm14,%xmm7 + + pslld $30,%xmm7 + paddd %xmm3,%xmm3 + paddd %xmm6,%xmm12 + + psrld $2,%xmm14 + paddd %xmm8,%xmm12 + por %xmm5,%xmm3 + por %xmm7,%xmm14 + pxor %xmm1,%xmm4 + movdqa 208-128(%rax),%xmm1 + + movdqa %xmm12,%xmm8 + movdqa %xmm10,%xmm7 + pxor 48-128(%rax),%xmm4 + pxor %xmm1,%xmm4 + paddd %xmm15,%xmm11 + pslld $5,%xmm8 + movdqa %xmm12,%xmm9 + pand %xmm14,%xmm7 + + movdqa %xmm10,%xmm6 + movdqa %xmm4,%xmm5 + psrld $27,%xmm9 + paddd %xmm7,%xmm11 + pxor %xmm14,%xmm6 + + movdqa %xmm3,160-128(%rax) + paddd %xmm3,%xmm11 + por %xmm9,%xmm8 + psrld $31,%xmm5 + pand %xmm13,%xmm6 + movdqa %xmm13,%xmm7 + + pslld $30,%xmm7 + paddd %xmm4,%xmm4 + paddd %xmm6,%xmm11 + + psrld $2,%xmm13 + paddd %xmm8,%xmm11 + por %xmm5,%xmm4 + por %xmm7,%xmm13 + pxor %xmm2,%xmm0 + movdqa 224-128(%rax),%xmm2 + + movdqa %xmm11,%xmm8 + movdqa %xmm14,%xmm7 + pxor 64-128(%rax),%xmm0 + pxor %xmm2,%xmm0 + paddd %xmm15,%xmm10 + pslld $5,%xmm8 + movdqa %xmm11,%xmm9 + pand %xmm13,%xmm7 + + movdqa %xmm14,%xmm6 + movdqa %xmm0,%xmm5 + psrld $27,%xmm9 + paddd %xmm7,%xmm10 + pxor %xmm13,%xmm6 + + movdqa %xmm4,176-128(%rax) + paddd %xmm4,%xmm10 + por %xmm9,%xmm8 + psrld $31,%xmm5 + pand %xmm12,%xmm6 + movdqa %xmm12,%xmm7 + + pslld $30,%xmm7 + paddd %xmm0,%xmm0 + paddd %xmm6,%xmm10 + + psrld $2,%xmm12 + paddd %xmm8,%xmm10 + por %xmm5,%xmm0 + por %xmm7,%xmm12 + movdqa 64(%rbp),%xmm15 + pxor %xmm3,%xmm1 + movdqa 240-128(%rax),%xmm3 + + movdqa %xmm10,%xmm8 + movdqa %xmm13,%xmm6 + pxor 80-128(%rax),%xmm1 + paddd %xmm15,%xmm14 + pslld $5,%xmm8 + pxor %xmm11,%xmm6 + + movdqa %xmm10,%xmm9 + movdqa %xmm0,192-128(%rax) + paddd %xmm0,%xmm14 + pxor %xmm3,%xmm1 + psrld $27,%xmm9 + pxor %xmm12,%xmm6 + movdqa %xmm11,%xmm7 + + pslld $30,%xmm7 + movdqa %xmm1,%xmm5 + por %xmm9,%xmm8 + psrld $31,%xmm5 + paddd %xmm6,%xmm14 + paddd %xmm1,%xmm1 + + psrld $2,%xmm11 + paddd %xmm8,%xmm14 + por %xmm5,%xmm1 + por %xmm7,%xmm11 + pxor %xmm4,%xmm2 + movdqa 0-128(%rax),%xmm4 + + movdqa %xmm14,%xmm8 + movdqa %xmm12,%xmm6 + pxor 96-128(%rax),%xmm2 + paddd %xmm15,%xmm13 + pslld $5,%xmm8 + pxor %xmm10,%xmm6 + + movdqa %xmm14,%xmm9 + movdqa %xmm1,208-128(%rax) + paddd %xmm1,%xmm13 + pxor %xmm4,%xmm2 + psrld $27,%xmm9 + pxor %xmm11,%xmm6 + movdqa %xmm10,%xmm7 + + pslld $30,%xmm7 + movdqa %xmm2,%xmm5 + por %xmm9,%xmm8 + psrld $31,%xmm5 + paddd %xmm6,%xmm13 + paddd %xmm2,%xmm2 + + psrld $2,%xmm10 + paddd %xmm8,%xmm13 + por %xmm5,%xmm2 + por %xmm7,%xmm10 + pxor %xmm0,%xmm3 + movdqa 16-128(%rax),%xmm0 + + movdqa %xmm13,%xmm8 + movdqa %xmm11,%xmm6 + pxor 112-128(%rax),%xmm3 + paddd %xmm15,%xmm12 + pslld $5,%xmm8 + pxor %xmm14,%xmm6 + + movdqa %xmm13,%xmm9 + movdqa %xmm2,224-128(%rax) + paddd %xmm2,%xmm12 + pxor %xmm0,%xmm3 + psrld $27,%xmm9 + pxor %xmm10,%xmm6 + movdqa %xmm14,%xmm7 + + pslld $30,%xmm7 + movdqa %xmm3,%xmm5 + por %xmm9,%xmm8 + psrld $31,%xmm5 + paddd %xmm6,%xmm12 + paddd %xmm3,%xmm3 + + psrld $2,%xmm14 + paddd %xmm8,%xmm12 + por %xmm5,%xmm3 + por %xmm7,%xmm14 + pxor %xmm1,%xmm4 + movdqa 32-128(%rax),%xmm1 + + movdqa %xmm12,%xmm8 + movdqa %xmm10,%xmm6 + pxor 128-128(%rax),%xmm4 + paddd %xmm15,%xmm11 + pslld $5,%xmm8 + pxor %xmm13,%xmm6 + + movdqa %xmm12,%xmm9 + movdqa %xmm3,240-128(%rax) + paddd %xmm3,%xmm11 + pxor %xmm1,%xmm4 + psrld $27,%xmm9 + pxor %xmm14,%xmm6 + movdqa %xmm13,%xmm7 + + pslld $30,%xmm7 + movdqa %xmm4,%xmm5 + por %xmm9,%xmm8 + psrld $31,%xmm5 + paddd %xmm6,%xmm11 + paddd %xmm4,%xmm4 + + psrld $2,%xmm13 + paddd %xmm8,%xmm11 + por %xmm5,%xmm4 + por %xmm7,%xmm13 + pxor %xmm2,%xmm0 + movdqa 48-128(%rax),%xmm2 + + movdqa %xmm11,%xmm8 + movdqa %xmm14,%xmm6 + pxor 144-128(%rax),%xmm0 + paddd %xmm15,%xmm10 + pslld $5,%xmm8 + pxor %xmm12,%xmm6 + + movdqa %xmm11,%xmm9 + movdqa %xmm4,0-128(%rax) + paddd %xmm4,%xmm10 + pxor %xmm2,%xmm0 + psrld $27,%xmm9 + pxor %xmm13,%xmm6 + movdqa %xmm12,%xmm7 + + pslld $30,%xmm7 + movdqa %xmm0,%xmm5 + por %xmm9,%xmm8 + psrld $31,%xmm5 + paddd %xmm6,%xmm10 + paddd %xmm0,%xmm0 + + psrld $2,%xmm12 + paddd %xmm8,%xmm10 + por %xmm5,%xmm0 + por %xmm7,%xmm12 + pxor %xmm3,%xmm1 + movdqa 64-128(%rax),%xmm3 + + movdqa %xmm10,%xmm8 + movdqa %xmm13,%xmm6 + pxor 160-128(%rax),%xmm1 + paddd %xmm15,%xmm14 + pslld $5,%xmm8 + pxor %xmm11,%xmm6 + + movdqa %xmm10,%xmm9 + movdqa %xmm0,16-128(%rax) + paddd %xmm0,%xmm14 + pxor %xmm3,%xmm1 + psrld $27,%xmm9 + pxor %xmm12,%xmm6 + movdqa %xmm11,%xmm7 + + pslld $30,%xmm7 + movdqa %xmm1,%xmm5 + por %xmm9,%xmm8 + psrld $31,%xmm5 + paddd %xmm6,%xmm14 + paddd %xmm1,%xmm1 + + psrld $2,%xmm11 + paddd %xmm8,%xmm14 + por %xmm5,%xmm1 + por %xmm7,%xmm11 + pxor %xmm4,%xmm2 + movdqa 80-128(%rax),%xmm4 + + movdqa %xmm14,%xmm8 + movdqa %xmm12,%xmm6 + pxor 176-128(%rax),%xmm2 + paddd %xmm15,%xmm13 + pslld $5,%xmm8 + pxor %xmm10,%xmm6 + + movdqa %xmm14,%xmm9 + movdqa %xmm1,32-128(%rax) + paddd %xmm1,%xmm13 + pxor %xmm4,%xmm2 + psrld $27,%xmm9 + pxor %xmm11,%xmm6 + movdqa %xmm10,%xmm7 + + pslld $30,%xmm7 + movdqa %xmm2,%xmm5 + por %xmm9,%xmm8 + psrld $31,%xmm5 + paddd %xmm6,%xmm13 + paddd %xmm2,%xmm2 + + psrld $2,%xmm10 + paddd %xmm8,%xmm13 + por %xmm5,%xmm2 + por %xmm7,%xmm10 + pxor %xmm0,%xmm3 + movdqa 96-128(%rax),%xmm0 + + movdqa %xmm13,%xmm8 + movdqa %xmm11,%xmm6 + pxor 192-128(%rax),%xmm3 + paddd %xmm15,%xmm12 + pslld $5,%xmm8 + pxor %xmm14,%xmm6 + + movdqa %xmm13,%xmm9 + movdqa %xmm2,48-128(%rax) + paddd %xmm2,%xmm12 + pxor %xmm0,%xmm3 + psrld $27,%xmm9 + pxor %xmm10,%xmm6 + movdqa %xmm14,%xmm7 + + pslld $30,%xmm7 + movdqa %xmm3,%xmm5 + por %xmm9,%xmm8 + psrld $31,%xmm5 + paddd %xmm6,%xmm12 + paddd %xmm3,%xmm3 + + psrld $2,%xmm14 + paddd %xmm8,%xmm12 + por %xmm5,%xmm3 + por %xmm7,%xmm14 + pxor %xmm1,%xmm4 + movdqa 112-128(%rax),%xmm1 + + movdqa %xmm12,%xmm8 + movdqa %xmm10,%xmm6 + pxor 208-128(%rax),%xmm4 + paddd %xmm15,%xmm11 + pslld $5,%xmm8 + pxor %xmm13,%xmm6 + + movdqa %xmm12,%xmm9 + movdqa %xmm3,64-128(%rax) + paddd %xmm3,%xmm11 + pxor %xmm1,%xmm4 + psrld $27,%xmm9 + pxor %xmm14,%xmm6 + movdqa %xmm13,%xmm7 + + pslld $30,%xmm7 + movdqa %xmm4,%xmm5 + por %xmm9,%xmm8 + psrld $31,%xmm5 + paddd %xmm6,%xmm11 + paddd %xmm4,%xmm4 + + psrld $2,%xmm13 + paddd %xmm8,%xmm11 + por %xmm5,%xmm4 + por %xmm7,%xmm13 + pxor %xmm2,%xmm0 + movdqa 128-128(%rax),%xmm2 + + movdqa %xmm11,%xmm8 + movdqa %xmm14,%xmm6 + pxor 224-128(%rax),%xmm0 + paddd %xmm15,%xmm10 + pslld $5,%xmm8 + pxor %xmm12,%xmm6 + + movdqa %xmm11,%xmm9 + movdqa %xmm4,80-128(%rax) + paddd %xmm4,%xmm10 + pxor %xmm2,%xmm0 + psrld $27,%xmm9 + pxor %xmm13,%xmm6 + movdqa %xmm12,%xmm7 + + pslld $30,%xmm7 + movdqa %xmm0,%xmm5 + por %xmm9,%xmm8 + psrld $31,%xmm5 + paddd %xmm6,%xmm10 + paddd %xmm0,%xmm0 + + psrld $2,%xmm12 + paddd %xmm8,%xmm10 + por %xmm5,%xmm0 + por %xmm7,%xmm12 + pxor %xmm3,%xmm1 + movdqa 144-128(%rax),%xmm3 + + movdqa %xmm10,%xmm8 + movdqa %xmm13,%xmm6 + pxor 240-128(%rax),%xmm1 + paddd %xmm15,%xmm14 + pslld $5,%xmm8 + pxor %xmm11,%xmm6 + + movdqa %xmm10,%xmm9 + movdqa %xmm0,96-128(%rax) + paddd %xmm0,%xmm14 + pxor %xmm3,%xmm1 + psrld $27,%xmm9 + pxor %xmm12,%xmm6 + movdqa %xmm11,%xmm7 + + pslld $30,%xmm7 + movdqa %xmm1,%xmm5 + por %xmm9,%xmm8 + psrld $31,%xmm5 + paddd %xmm6,%xmm14 + paddd %xmm1,%xmm1 + + psrld $2,%xmm11 + paddd %xmm8,%xmm14 + por %xmm5,%xmm1 + por %xmm7,%xmm11 + pxor %xmm4,%xmm2 + movdqa 160-128(%rax),%xmm4 + + movdqa %xmm14,%xmm8 + movdqa %xmm12,%xmm6 + pxor 0-128(%rax),%xmm2 + paddd %xmm15,%xmm13 + pslld $5,%xmm8 + pxor %xmm10,%xmm6 + + movdqa %xmm14,%xmm9 + movdqa %xmm1,112-128(%rax) + paddd %xmm1,%xmm13 + pxor %xmm4,%xmm2 + psrld $27,%xmm9 + pxor %xmm11,%xmm6 + movdqa %xmm10,%xmm7 + + pslld $30,%xmm7 + movdqa %xmm2,%xmm5 + por %xmm9,%xmm8 + psrld $31,%xmm5 + paddd %xmm6,%xmm13 + paddd %xmm2,%xmm2 + + psrld $2,%xmm10 + paddd %xmm8,%xmm13 + por %xmm5,%xmm2 + por %xmm7,%xmm10 + pxor %xmm0,%xmm3 + movdqa 176-128(%rax),%xmm0 + + movdqa %xmm13,%xmm8 + movdqa %xmm11,%xmm6 + pxor 16-128(%rax),%xmm3 + paddd %xmm15,%xmm12 + pslld $5,%xmm8 + pxor %xmm14,%xmm6 + + movdqa %xmm13,%xmm9 + paddd %xmm2,%xmm12 + pxor %xmm0,%xmm3 + psrld $27,%xmm9 + pxor %xmm10,%xmm6 + movdqa %xmm14,%xmm7 + + pslld $30,%xmm7 + movdqa %xmm3,%xmm5 + por %xmm9,%xmm8 + psrld $31,%xmm5 + paddd %xmm6,%xmm12 + paddd %xmm3,%xmm3 + + psrld $2,%xmm14 + paddd %xmm8,%xmm12 + por %xmm5,%xmm3 + por %xmm7,%xmm14 + pxor %xmm1,%xmm4 + movdqa 192-128(%rax),%xmm1 + + movdqa %xmm12,%xmm8 + movdqa %xmm10,%xmm6 + pxor 32-128(%rax),%xmm4 + paddd %xmm15,%xmm11 + pslld $5,%xmm8 + pxor %xmm13,%xmm6 + + movdqa %xmm12,%xmm9 + paddd %xmm3,%xmm11 + pxor %xmm1,%xmm4 + psrld $27,%xmm9 + pxor %xmm14,%xmm6 + movdqa %xmm13,%xmm7 + + pslld $30,%xmm7 + movdqa %xmm4,%xmm5 + por %xmm9,%xmm8 + psrld $31,%xmm5 + paddd %xmm6,%xmm11 + paddd %xmm4,%xmm4 + + psrld $2,%xmm13 + paddd %xmm8,%xmm11 + por %xmm5,%xmm4 + por %xmm7,%xmm13 + pxor %xmm2,%xmm0 + movdqa 208-128(%rax),%xmm2 + + movdqa %xmm11,%xmm8 + movdqa %xmm14,%xmm6 + pxor 48-128(%rax),%xmm0 + paddd %xmm15,%xmm10 + pslld $5,%xmm8 + pxor %xmm12,%xmm6 + + movdqa %xmm11,%xmm9 + paddd %xmm4,%xmm10 + pxor %xmm2,%xmm0 + psrld $27,%xmm9 + pxor %xmm13,%xmm6 + movdqa %xmm12,%xmm7 + + pslld $30,%xmm7 + movdqa %xmm0,%xmm5 + por %xmm9,%xmm8 + psrld $31,%xmm5 + paddd %xmm6,%xmm10 + paddd %xmm0,%xmm0 + + psrld $2,%xmm12 + paddd %xmm8,%xmm10 + por %xmm5,%xmm0 + por %xmm7,%xmm12 + pxor %xmm3,%xmm1 + movdqa 224-128(%rax),%xmm3 + + movdqa %xmm10,%xmm8 + movdqa %xmm13,%xmm6 + pxor 64-128(%rax),%xmm1 + paddd %xmm15,%xmm14 + pslld $5,%xmm8 + pxor %xmm11,%xmm6 + + movdqa %xmm10,%xmm9 + paddd %xmm0,%xmm14 + pxor %xmm3,%xmm1 + psrld $27,%xmm9 + pxor %xmm12,%xmm6 + movdqa %xmm11,%xmm7 + + pslld $30,%xmm7 + movdqa %xmm1,%xmm5 + por %xmm9,%xmm8 + psrld $31,%xmm5 + paddd %xmm6,%xmm14 + paddd %xmm1,%xmm1 + + psrld $2,%xmm11 + paddd %xmm8,%xmm14 + por %xmm5,%xmm1 + por %xmm7,%xmm11 + pxor %xmm4,%xmm2 + movdqa 240-128(%rax),%xmm4 + + movdqa %xmm14,%xmm8 + movdqa %xmm12,%xmm6 + pxor 80-128(%rax),%xmm2 + paddd %xmm15,%xmm13 + pslld $5,%xmm8 + pxor %xmm10,%xmm6 + + movdqa %xmm14,%xmm9 + paddd %xmm1,%xmm13 + pxor %xmm4,%xmm2 + psrld $27,%xmm9 + pxor %xmm11,%xmm6 + movdqa %xmm10,%xmm7 + + pslld $30,%xmm7 + movdqa %xmm2,%xmm5 + por %xmm9,%xmm8 + psrld $31,%xmm5 + paddd %xmm6,%xmm13 + paddd %xmm2,%xmm2 + + psrld $2,%xmm10 + paddd %xmm8,%xmm13 + por %xmm5,%xmm2 + por %xmm7,%xmm10 + pxor %xmm0,%xmm3 + movdqa 0-128(%rax),%xmm0 + + movdqa %xmm13,%xmm8 + movdqa %xmm11,%xmm6 + pxor 96-128(%rax),%xmm3 + paddd %xmm15,%xmm12 + pslld $5,%xmm8 + pxor %xmm14,%xmm6 + + movdqa %xmm13,%xmm9 + paddd %xmm2,%xmm12 + pxor %xmm0,%xmm3 + psrld $27,%xmm9 + pxor %xmm10,%xmm6 + movdqa %xmm14,%xmm7 + + pslld $30,%xmm7 + movdqa %xmm3,%xmm5 + por %xmm9,%xmm8 + psrld $31,%xmm5 + paddd %xmm6,%xmm12 + paddd %xmm3,%xmm3 + + psrld $2,%xmm14 + paddd %xmm8,%xmm12 + por %xmm5,%xmm3 + por %xmm7,%xmm14 + pxor %xmm1,%xmm4 + movdqa 16-128(%rax),%xmm1 + + movdqa %xmm12,%xmm8 + movdqa %xmm10,%xmm6 + pxor 112-128(%rax),%xmm4 + paddd %xmm15,%xmm11 + pslld $5,%xmm8 + pxor %xmm13,%xmm6 + + movdqa %xmm12,%xmm9 + paddd %xmm3,%xmm11 + pxor %xmm1,%xmm4 + psrld $27,%xmm9 + pxor %xmm14,%xmm6 + movdqa %xmm13,%xmm7 + + pslld $30,%xmm7 + movdqa %xmm4,%xmm5 + por %xmm9,%xmm8 + psrld $31,%xmm5 + paddd %xmm6,%xmm11 + paddd %xmm4,%xmm4 + + psrld $2,%xmm13 + paddd %xmm8,%xmm11 + por %xmm5,%xmm4 + por %xmm7,%xmm13 + movdqa %xmm11,%xmm8 + paddd %xmm15,%xmm10 + movdqa %xmm14,%xmm6 + pslld $5,%xmm8 + pxor %xmm12,%xmm6 + + movdqa %xmm11,%xmm9 + paddd %xmm4,%xmm10 + psrld $27,%xmm9 + movdqa %xmm12,%xmm7 + pxor %xmm13,%xmm6 + + pslld $30,%xmm7 + por %xmm9,%xmm8 + paddd %xmm6,%xmm10 + + psrld $2,%xmm12 + paddd %xmm8,%xmm10 + por %xmm7,%xmm12 + movdqa (%rbx),%xmm0 + movl $1,%ecx + cmpl 0(%rbx),%ecx + pxor %xmm8,%xmm8 + cmovgeq %rbp,%r8 + cmpl 4(%rbx),%ecx + movdqa %xmm0,%xmm1 + cmovgeq %rbp,%r9 + cmpl 8(%rbx),%ecx + pcmpgtd %xmm8,%xmm1 + cmovgeq %rbp,%r10 + cmpl 12(%rbx),%ecx + paddd %xmm1,%xmm0 + cmovgeq %rbp,%r11 + + movdqu 0(%rdi),%xmm6 + pand %xmm1,%xmm10 + movdqu 32(%rdi),%xmm7 + pand %xmm1,%xmm11 + paddd %xmm6,%xmm10 + movdqu 64(%rdi),%xmm8 + pand %xmm1,%xmm12 + paddd %xmm7,%xmm11 + movdqu 96(%rdi),%xmm9 + pand %xmm1,%xmm13 + paddd %xmm8,%xmm12 + movdqu 128(%rdi),%xmm5 + pand %xmm1,%xmm14 + movdqu %xmm10,0(%rdi) + paddd %xmm9,%xmm13 + movdqu %xmm11,32(%rdi) + paddd %xmm5,%xmm14 + movdqu %xmm12,64(%rdi) + movdqu %xmm13,96(%rdi) + movdqu %xmm14,128(%rdi) + + movdqa %xmm0,(%rbx) + movdqa 96(%rbp),%xmm5 + movdqa -32(%rbp),%xmm15 + decl %edx + jnz .Loop + + movl 280(%rsp),%edx + leaq 16(%rdi),%rdi + leaq 64(%rsi),%rsi + decl %edx + jnz .Loop_grande + +.Ldone: + movq 272(%rsp),%rax + movq -16(%rax),%rbp + movq -8(%rax),%rbx + leaq (%rax),%rsp +.Lepilogue: + .byte 0xf3,0xc3 +.size sha1_multi_block,.-sha1_multi_block +.type sha1_multi_block_shaext,@function +.align 32 +sha1_multi_block_shaext: +_shaext_shortcut: + movq %rsp,%rax + pushq %rbx + pushq %rbp + subq $288,%rsp + shll $1,%edx + andq $-256,%rsp + leaq 64(%rdi),%rdi + movq %rax,272(%rsp) +.Lbody_shaext: + leaq 256(%rsp),%rbx + movdqa K_XX_XX+128(%rip),%xmm3 + +.Loop_grande_shaext: + movl %edx,280(%rsp) + xorl %edx,%edx + movq 0(%rsi),%r8 + movl 8(%rsi),%ecx + cmpl %edx,%ecx + cmovgl %ecx,%edx + testl %ecx,%ecx + movl %ecx,0(%rbx) + cmovleq %rsp,%r8 + movq 16(%rsi),%r9 + movl 24(%rsi),%ecx + cmpl %edx,%ecx + cmovgl %ecx,%edx + testl %ecx,%ecx + movl %ecx,4(%rbx) + cmovleq %rsp,%r9 + testl %edx,%edx + jz .Ldone_shaext + + movq 0-64(%rdi),%xmm0 + movq 32-64(%rdi),%xmm4 + movq 64-64(%rdi),%xmm5 + movq 96-64(%rdi),%xmm6 + movq 128-64(%rdi),%xmm7 + + punpckldq %xmm4,%xmm0 + punpckldq %xmm6,%xmm5 + + movdqa %xmm0,%xmm8 + punpcklqdq %xmm5,%xmm0 + punpckhqdq %xmm5,%xmm8 + + pshufd $63,%xmm7,%xmm1 + pshufd $127,%xmm7,%xmm9 + pshufd $27,%xmm0,%xmm0 + pshufd $27,%xmm8,%xmm8 + jmp .Loop_shaext + +.align 32 +.Loop_shaext: + movdqu 0(%r8),%xmm4 + movdqu 0(%r9),%xmm11 + movdqu 16(%r8),%xmm5 + movdqu 16(%r9),%xmm12 + movdqu 32(%r8),%xmm6 +.byte 102,15,56,0,227 + movdqu 32(%r9),%xmm13 +.byte 102,68,15,56,0,219 + movdqu 48(%r8),%xmm7 + leaq 64(%r8),%r8 +.byte 102,15,56,0,235 + movdqu 48(%r9),%xmm14 + leaq 64(%r9),%r9 +.byte 102,68,15,56,0,227 + + movdqa %xmm1,80(%rsp) + paddd %xmm4,%xmm1 + movdqa %xmm9,112(%rsp) + paddd %xmm11,%xmm9 + movdqa %xmm0,64(%rsp) + movdqa %xmm0,%xmm2 + movdqa %xmm8,96(%rsp) + movdqa %xmm8,%xmm10 +.byte 15,58,204,193,0 +.byte 15,56,200,213 +.byte 69,15,58,204,193,0 +.byte 69,15,56,200,212 +.byte 102,15,56,0,243 + prefetcht0 127(%r8) +.byte 15,56,201,229 +.byte 102,68,15,56,0,235 + prefetcht0 127(%r9) +.byte 69,15,56,201,220 + +.byte 102,15,56,0,251 + movdqa %xmm0,%xmm1 +.byte 102,68,15,56,0,243 + movdqa %xmm8,%xmm9 +.byte 15,58,204,194,0 +.byte 15,56,200,206 +.byte 69,15,58,204,194,0 +.byte 69,15,56,200,205 + pxor %xmm6,%xmm4 +.byte 15,56,201,238 + pxor %xmm13,%xmm11 +.byte 69,15,56,201,229 + movdqa %xmm0,%xmm2 + movdqa %xmm8,%xmm10 +.byte 15,58,204,193,0 +.byte 15,56,200,215 +.byte 69,15,58,204,193,0 +.byte 69,15,56,200,214 +.byte 15,56,202,231 +.byte 69,15,56,202,222 + pxor %xmm7,%xmm5 +.byte 15,56,201,247 + pxor %xmm14,%xmm12 +.byte 69,15,56,201,238 + movdqa %xmm0,%xmm1 + movdqa %xmm8,%xmm9 +.byte 15,58,204,194,0 +.byte 15,56,200,204 +.byte 69,15,58,204,194,0 +.byte 69,15,56,200,203 +.byte 15,56,202,236 +.byte 69,15,56,202,227 + pxor %xmm4,%xmm6 +.byte 15,56,201,252 + pxor %xmm11,%xmm13 +.byte 69,15,56,201,243 + movdqa %xmm0,%xmm2 + movdqa %xmm8,%xmm10 +.byte 15,58,204,193,0 +.byte 15,56,200,213 +.byte 69,15,58,204,193,0 +.byte 69,15,56,200,212 +.byte 15,56,202,245 +.byte 69,15,56,202,236 + pxor %xmm5,%xmm7 +.byte 15,56,201,229 + pxor %xmm12,%xmm14 +.byte 69,15,56,201,220 + movdqa %xmm0,%xmm1 + movdqa %xmm8,%xmm9 +.byte 15,58,204,194,1 +.byte 15,56,200,206 +.byte 69,15,58,204,194,1 +.byte 69,15,56,200,205 +.byte 15,56,202,254 +.byte 69,15,56,202,245 + pxor %xmm6,%xmm4 +.byte 15,56,201,238 + pxor %xmm13,%xmm11 +.byte 69,15,56,201,229 + movdqa %xmm0,%xmm2 + movdqa %xmm8,%xmm10 +.byte 15,58,204,193,1 +.byte 15,56,200,215 +.byte 69,15,58,204,193,1 +.byte 69,15,56,200,214 +.byte 15,56,202,231 +.byte 69,15,56,202,222 + pxor %xmm7,%xmm5 +.byte 15,56,201,247 + pxor %xmm14,%xmm12 +.byte 69,15,56,201,238 + movdqa %xmm0,%xmm1 + movdqa %xmm8,%xmm9 +.byte 15,58,204,194,1 +.byte 15,56,200,204 +.byte 69,15,58,204,194,1 +.byte 69,15,56,200,203 +.byte 15,56,202,236 +.byte 69,15,56,202,227 + pxor %xmm4,%xmm6 +.byte 15,56,201,252 + pxor %xmm11,%xmm13 +.byte 69,15,56,201,243 + movdqa %xmm0,%xmm2 + movdqa %xmm8,%xmm10 +.byte 15,58,204,193,1 +.byte 15,56,200,213 +.byte 69,15,58,204,193,1 +.byte 69,15,56,200,212 +.byte 15,56,202,245 +.byte 69,15,56,202,236 + pxor %xmm5,%xmm7 +.byte 15,56,201,229 + pxor %xmm12,%xmm14 +.byte 69,15,56,201,220 + movdqa %xmm0,%xmm1 + movdqa %xmm8,%xmm9 +.byte 15,58,204,194,1 +.byte 15,56,200,206 +.byte 69,15,58,204,194,1 +.byte 69,15,56,200,205 +.byte 15,56,202,254 +.byte 69,15,56,202,245 + pxor %xmm6,%xmm4 +.byte 15,56,201,238 + pxor %xmm13,%xmm11 +.byte 69,15,56,201,229 + movdqa %xmm0,%xmm2 + movdqa %xmm8,%xmm10 +.byte 15,58,204,193,2 +.byte 15,56,200,215 +.byte 69,15,58,204,193,2 +.byte 69,15,56,200,214 +.byte 15,56,202,231 +.byte 69,15,56,202,222 + pxor %xmm7,%xmm5 +.byte 15,56,201,247 + pxor %xmm14,%xmm12 +.byte 69,15,56,201,238 + movdqa %xmm0,%xmm1 + movdqa %xmm8,%xmm9 +.byte 15,58,204,194,2 +.byte 15,56,200,204 +.byte 69,15,58,204,194,2 +.byte 69,15,56,200,203 +.byte 15,56,202,236 +.byte 69,15,56,202,227 + pxor %xmm4,%xmm6 +.byte 15,56,201,252 + pxor %xmm11,%xmm13 +.byte 69,15,56,201,243 + movdqa %xmm0,%xmm2 + movdqa %xmm8,%xmm10 +.byte 15,58,204,193,2 +.byte 15,56,200,213 +.byte 69,15,58,204,193,2 +.byte 69,15,56,200,212 +.byte 15,56,202,245 +.byte 69,15,56,202,236 + pxor %xmm5,%xmm7 +.byte 15,56,201,229 + pxor %xmm12,%xmm14 +.byte 69,15,56,201,220 + movdqa %xmm0,%xmm1 + movdqa %xmm8,%xmm9 +.byte 15,58,204,194,2 +.byte 15,56,200,206 +.byte 69,15,58,204,194,2 +.byte 69,15,56,200,205 +.byte 15,56,202,254 +.byte 69,15,56,202,245 + pxor %xmm6,%xmm4 +.byte 15,56,201,238 + pxor %xmm13,%xmm11 +.byte 69,15,56,201,229 + movdqa %xmm0,%xmm2 + movdqa %xmm8,%xmm10 +.byte 15,58,204,193,2 +.byte 15,56,200,215 +.byte 69,15,58,204,193,2 +.byte 69,15,56,200,214 +.byte 15,56,202,231 +.byte 69,15,56,202,222 + pxor %xmm7,%xmm5 +.byte 15,56,201,247 + pxor %xmm14,%xmm12 +.byte 69,15,56,201,238 + movdqa %xmm0,%xmm1 + movdqa %xmm8,%xmm9 +.byte 15,58,204,194,3 +.byte 15,56,200,204 +.byte 69,15,58,204,194,3 +.byte 69,15,56,200,203 +.byte 15,56,202,236 +.byte 69,15,56,202,227 + pxor %xmm4,%xmm6 +.byte 15,56,201,252 + pxor %xmm11,%xmm13 +.byte 69,15,56,201,243 + movdqa %xmm0,%xmm2 + movdqa %xmm8,%xmm10 +.byte 15,58,204,193,3 +.byte 15,56,200,213 +.byte 69,15,58,204,193,3 +.byte 69,15,56,200,212 +.byte 15,56,202,245 +.byte 69,15,56,202,236 + pxor %xmm5,%xmm7 + pxor %xmm12,%xmm14 + + movl $1,%ecx + pxor %xmm4,%xmm4 + cmpl 0(%rbx),%ecx + cmovgeq %rsp,%r8 + + movdqa %xmm0,%xmm1 + movdqa %xmm8,%xmm9 +.byte 15,58,204,194,3 +.byte 15,56,200,206 +.byte 69,15,58,204,194,3 +.byte 69,15,56,200,205 +.byte 15,56,202,254 +.byte 69,15,56,202,245 + + cmpl 4(%rbx),%ecx + cmovgeq %rsp,%r9 + movq (%rbx),%xmm6 + + movdqa %xmm0,%xmm2 + movdqa %xmm8,%xmm10 +.byte 15,58,204,193,3 +.byte 15,56,200,215 +.byte 69,15,58,204,193,3 +.byte 69,15,56,200,214 + + pshufd $0,%xmm6,%xmm11 + pshufd $85,%xmm6,%xmm12 + movdqa %xmm6,%xmm7 + pcmpgtd %xmm4,%xmm11 + pcmpgtd %xmm4,%xmm12 + + movdqa %xmm0,%xmm1 + movdqa %xmm8,%xmm9 +.byte 15,58,204,194,3 +.byte 15,56,200,204 +.byte 69,15,58,204,194,3 +.byte 68,15,56,200,204 + + pcmpgtd %xmm4,%xmm7 + pand %xmm11,%xmm0 + pand %xmm11,%xmm1 + pand %xmm12,%xmm8 + pand %xmm12,%xmm9 + paddd %xmm7,%xmm6 + + paddd 64(%rsp),%xmm0 + paddd 80(%rsp),%xmm1 + paddd 96(%rsp),%xmm8 + paddd 112(%rsp),%xmm9 + + movq %xmm6,(%rbx) + decl %edx + jnz .Loop_shaext + + movl 280(%rsp),%edx + + pshufd $27,%xmm0,%xmm0 + pshufd $27,%xmm8,%xmm8 + + movdqa %xmm0,%xmm6 + punpckldq %xmm8,%xmm0 + punpckhdq %xmm8,%xmm6 + punpckhdq %xmm9,%xmm1 + movq %xmm0,0-64(%rdi) + psrldq $8,%xmm0 + movq %xmm6,64-64(%rdi) + psrldq $8,%xmm6 + movq %xmm0,32-64(%rdi) + psrldq $8,%xmm1 + movq %xmm6,96-64(%rdi) + movq %xmm1,128-64(%rdi) + + leaq 8(%rdi),%rdi + leaq 32(%rsi),%rsi + decl %edx + jnz .Loop_grande_shaext + +.Ldone_shaext: + + movq -16(%rax),%rbp + movq -8(%rax),%rbx + leaq (%rax),%rsp +.Lepilogue_shaext: + .byte 0xf3,0xc3 +.size sha1_multi_block_shaext,.-sha1_multi_block_shaext + +.align 256 +.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 +.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 +K_XX_XX: +.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 +.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 +.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc +.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc +.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 +.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 +.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f +.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f +.byte 0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0 +.byte 83,72,65,49,32,109,117,108,116,105,45,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 diff --git a/secure/lib/libcrypto/amd64/sha1-x86_64.S b/secure/lib/libcrypto/amd64/sha1-x86_64.S index 421423a..25c27e5 100644 --- a/secure/lib/libcrypto/amd64/sha1-x86_64.S +++ b/secure/lib/libcrypto/amd64/sha1-x86_64.S @@ -8,23 +8,27 @@ sha1_block_data_order: movl OPENSSL_ia32cap_P+0(%rip),%r9d movl OPENSSL_ia32cap_P+4(%rip),%r8d + movl OPENSSL_ia32cap_P+8(%rip),%r10d testl $512,%r8d jz .Lialu + testl $536870912,%r10d + jnz _shaext_shortcut jmp _ssse3_shortcut .align 16 .Lialu: + movq %rsp,%rax pushq %rbx pushq %rbp pushq %r12 pushq %r13 - movq %rsp,%r11 + pushq %r14 movq %rdi,%r8 subq $72,%rsp movq %rsi,%r9 andq $-64,%rsp movq %rdx,%r10 - movq %r11,64(%rsp) + movq %rax,64(%rsp) .Lprologue: movl 0(%r8),%esi @@ -38,1230 +42,1168 @@ sha1_block_data_order: .Lloop: movl 0(%r9),%edx bswapl %edx - movl %edx,0(%rsp) - movl %r11d,%eax movl 4(%r9),%ebp + movl %r12d,%eax + movl %edx,0(%rsp) movl %esi,%ecx - xorl %r12d,%eax bswapl %ebp + xorl %r11d,%eax roll $5,%ecx - leal 1518500249(%rdx,%r13,1),%r13d andl %edi,%eax - movl %ebp,4(%rsp) + leal 1518500249(%rdx,%r13,1),%r13d addl %ecx,%r13d xorl %r12d,%eax roll $30,%edi addl %eax,%r13d - movl %edi,%eax - movl 8(%r9),%edx + movl 8(%r9),%r14d + movl %r11d,%eax + movl %ebp,4(%rsp) movl %r13d,%ecx - xorl %r11d,%eax - bswapl %edx + bswapl %r14d + xorl %edi,%eax roll $5,%ecx - leal 1518500249(%rbp,%r12,1),%r12d andl %esi,%eax - movl %edx,8(%rsp) + leal 1518500249(%rbp,%r12,1),%r12d addl %ecx,%r12d xorl %r11d,%eax roll $30,%esi addl %eax,%r12d - movl %esi,%eax - movl 12(%r9),%ebp + movl 12(%r9),%edx + movl %edi,%eax + movl %r14d,8(%rsp) movl %r12d,%ecx - xorl %edi,%eax - bswapl %ebp + bswapl %edx + xorl %esi,%eax roll $5,%ecx - leal 1518500249(%rdx,%r11,1),%r11d andl %r13d,%eax - movl %ebp,12(%rsp) + leal 1518500249(%r14,%r11,1),%r11d addl %ecx,%r11d xorl %edi,%eax roll $30,%r13d addl %eax,%r11d - movl %r13d,%eax - movl 16(%r9),%edx + movl 16(%r9),%ebp + movl %esi,%eax + movl %edx,12(%rsp) movl %r11d,%ecx - xorl %esi,%eax - bswapl %edx + bswapl %ebp + xorl %r13d,%eax roll $5,%ecx - leal 1518500249(%rbp,%rdi,1),%edi andl %r12d,%eax - movl %edx,16(%rsp) + leal 1518500249(%rdx,%rdi,1),%edi addl %ecx,%edi xorl %esi,%eax roll $30,%r12d addl %eax,%edi - movl %r12d,%eax - movl 20(%r9),%ebp + movl 20(%r9),%r14d + movl %r13d,%eax + movl %ebp,16(%rsp) movl %edi,%ecx - xorl %r13d,%eax - bswapl %ebp + bswapl %r14d + xorl %r12d,%eax roll $5,%ecx - leal 1518500249(%rdx,%rsi,1),%esi andl %r11d,%eax - movl %ebp,20(%rsp) + leal 1518500249(%rbp,%rsi,1),%esi addl %ecx,%esi xorl %r13d,%eax roll $30,%r11d addl %eax,%esi - movl %r11d,%eax movl 24(%r9),%edx + movl %r12d,%eax + movl %r14d,20(%rsp) movl %esi,%ecx - xorl %r12d,%eax bswapl %edx + xorl %r11d,%eax roll $5,%ecx - leal 1518500249(%rbp,%r13,1),%r13d andl %edi,%eax - movl %edx,24(%rsp) + leal 1518500249(%r14,%r13,1),%r13d addl %ecx,%r13d xorl %r12d,%eax roll $30,%edi addl %eax,%r13d - movl %edi,%eax movl 28(%r9),%ebp + movl %r11d,%eax + movl %edx,24(%rsp) movl %r13d,%ecx - xorl %r11d,%eax bswapl %ebp + xorl %edi,%eax roll $5,%ecx - leal 1518500249(%rdx,%r12,1),%r12d andl %esi,%eax - movl %ebp,28(%rsp) + leal 1518500249(%rdx,%r12,1),%r12d addl %ecx,%r12d xorl %r11d,%eax roll $30,%esi addl %eax,%r12d - movl %esi,%eax - movl 32(%r9),%edx + movl 32(%r9),%r14d + movl %edi,%eax + movl %ebp,28(%rsp) movl %r12d,%ecx - xorl %edi,%eax - bswapl %edx + bswapl %r14d + xorl %esi,%eax roll $5,%ecx - leal 1518500249(%rbp,%r11,1),%r11d andl %r13d,%eax - movl %edx,32(%rsp) + leal 1518500249(%rbp,%r11,1),%r11d addl %ecx,%r11d xorl %edi,%eax roll $30,%r13d addl %eax,%r11d - movl %r13d,%eax - movl 36(%r9),%ebp + movl 36(%r9),%edx + movl %esi,%eax + movl %r14d,32(%rsp) movl %r11d,%ecx - xorl %esi,%eax - bswapl %ebp + bswapl %edx + xorl %r13d,%eax roll $5,%ecx - leal 1518500249(%rdx,%rdi,1),%edi andl %r12d,%eax - movl %ebp,36(%rsp) + leal 1518500249(%r14,%rdi,1),%edi addl %ecx,%edi xorl %esi,%eax roll $30,%r12d addl %eax,%edi - movl %r12d,%eax - movl 40(%r9),%edx + movl 40(%r9),%ebp + movl %r13d,%eax + movl %edx,36(%rsp) movl %edi,%ecx - xorl %r13d,%eax - bswapl %edx + bswapl %ebp + xorl %r12d,%eax roll $5,%ecx - leal 1518500249(%rbp,%rsi,1),%esi andl %r11d,%eax - movl %edx,40(%rsp) + leal 1518500249(%rdx,%rsi,1),%esi addl %ecx,%esi xorl %r13d,%eax roll $30,%r11d addl %eax,%esi - movl %r11d,%eax - movl 44(%r9),%ebp + movl 44(%r9),%r14d + movl %r12d,%eax + movl %ebp,40(%rsp) movl %esi,%ecx - xorl %r12d,%eax - bswapl %ebp + bswapl %r14d + xorl %r11d,%eax roll $5,%ecx - leal 1518500249(%rdx,%r13,1),%r13d andl %edi,%eax - movl %ebp,44(%rsp) + leal 1518500249(%rbp,%r13,1),%r13d addl %ecx,%r13d xorl %r12d,%eax roll $30,%edi addl %eax,%r13d - movl %edi,%eax movl 48(%r9),%edx + movl %r11d,%eax + movl %r14d,44(%rsp) movl %r13d,%ecx - xorl %r11d,%eax bswapl %edx + xorl %edi,%eax roll $5,%ecx - leal 1518500249(%rbp,%r12,1),%r12d andl %esi,%eax - movl %edx,48(%rsp) + leal 1518500249(%r14,%r12,1),%r12d addl %ecx,%r12d xorl %r11d,%eax roll $30,%esi addl %eax,%r12d - movl %esi,%eax movl 52(%r9),%ebp + movl %edi,%eax + movl %edx,48(%rsp) movl %r12d,%ecx - xorl %edi,%eax bswapl %ebp + xorl %esi,%eax roll $5,%ecx - leal 1518500249(%rdx,%r11,1),%r11d andl %r13d,%eax - movl %ebp,52(%rsp) + leal 1518500249(%rdx,%r11,1),%r11d addl %ecx,%r11d xorl %edi,%eax roll $30,%r13d addl %eax,%r11d - movl %r13d,%eax - movl 56(%r9),%edx + movl 56(%r9),%r14d + movl %esi,%eax + movl %ebp,52(%rsp) movl %r11d,%ecx - xorl %esi,%eax - bswapl %edx + bswapl %r14d + xorl %r13d,%eax roll $5,%ecx - leal 1518500249(%rbp,%rdi,1),%edi andl %r12d,%eax - movl %edx,56(%rsp) + leal 1518500249(%rbp,%rdi,1),%edi addl %ecx,%edi xorl %esi,%eax roll $30,%r12d addl %eax,%edi - movl %r12d,%eax - movl 60(%r9),%ebp + movl 60(%r9),%edx + movl %r13d,%eax + movl %r14d,56(%rsp) movl %edi,%ecx - xorl %r13d,%eax - bswapl %ebp + bswapl %edx + xorl %r12d,%eax roll $5,%ecx - leal 1518500249(%rdx,%rsi,1),%esi andl %r11d,%eax - movl %ebp,60(%rsp) + leal 1518500249(%r14,%rsi,1),%esi addl %ecx,%esi xorl %r13d,%eax roll $30,%r11d addl %eax,%esi - movl 0(%rsp),%edx - movl %r11d,%eax + xorl 0(%rsp),%ebp + movl %r12d,%eax + movl %edx,60(%rsp) movl %esi,%ecx - xorl 8(%rsp),%edx - xorl %r12d,%eax + xorl 8(%rsp),%ebp + xorl %r11d,%eax roll $5,%ecx - xorl 32(%rsp),%edx + xorl 32(%rsp),%ebp andl %edi,%eax - leal 1518500249(%rbp,%r13,1),%r13d - xorl 52(%rsp),%edx + leal 1518500249(%rdx,%r13,1),%r13d + roll $30,%edi xorl %r12d,%eax - roll $1,%edx addl %ecx,%r13d - roll $30,%edi - movl %edx,0(%rsp) + roll $1,%ebp addl %eax,%r13d - movl 4(%rsp),%ebp - movl %edi,%eax + xorl 4(%rsp),%r14d + movl %r11d,%eax + movl %ebp,0(%rsp) movl %r13d,%ecx - xorl 12(%rsp),%ebp - xorl %r11d,%eax + xorl 12(%rsp),%r14d + xorl %edi,%eax roll $5,%ecx - xorl 36(%rsp),%ebp + xorl 36(%rsp),%r14d andl %esi,%eax - leal 1518500249(%rdx,%r12,1),%r12d - xorl 56(%rsp),%ebp + leal 1518500249(%rbp,%r12,1),%r12d + roll $30,%esi xorl %r11d,%eax - roll $1,%ebp addl %ecx,%r12d - roll $30,%esi - movl %ebp,4(%rsp) + roll $1,%r14d addl %eax,%r12d - movl 8(%rsp),%edx - movl %esi,%eax + xorl 8(%rsp),%edx + movl %edi,%eax + movl %r14d,4(%rsp) movl %r12d,%ecx xorl 16(%rsp),%edx - xorl %edi,%eax + xorl %esi,%eax roll $5,%ecx xorl 40(%rsp),%edx andl %r13d,%eax - leal 1518500249(%rbp,%r11,1),%r11d - xorl 60(%rsp),%edx + leal 1518500249(%r14,%r11,1),%r11d + roll $30,%r13d xorl %edi,%eax - roll $1,%edx addl %ecx,%r11d - roll $30,%r13d - movl %edx,8(%rsp) + roll $1,%edx addl %eax,%r11d - movl 12(%rsp),%ebp - movl %r13d,%eax + xorl 12(%rsp),%ebp + movl %esi,%eax + movl %edx,8(%rsp) movl %r11d,%ecx xorl 20(%rsp),%ebp - xorl %esi,%eax + xorl %r13d,%eax roll $5,%ecx xorl 44(%rsp),%ebp andl %r12d,%eax leal 1518500249(%rdx,%rdi,1),%edi - xorl 0(%rsp),%ebp + roll $30,%r12d xorl %esi,%eax - roll $1,%ebp addl %ecx,%edi - roll $30,%r12d - movl %ebp,12(%rsp) + roll $1,%ebp addl %eax,%edi - movl 16(%rsp),%edx - movl %r12d,%eax + xorl 16(%rsp),%r14d + movl %r13d,%eax + movl %ebp,12(%rsp) movl %edi,%ecx - xorl 24(%rsp),%edx - xorl %r13d,%eax + xorl 24(%rsp),%r14d + xorl %r12d,%eax roll $5,%ecx - xorl 48(%rsp),%edx + xorl 48(%rsp),%r14d andl %r11d,%eax leal 1518500249(%rbp,%rsi,1),%esi - xorl 4(%rsp),%edx + roll $30,%r11d xorl %r13d,%eax - roll $1,%edx addl %ecx,%esi - roll $30,%r11d - movl %edx,16(%rsp) + roll $1,%r14d addl %eax,%esi - movl 20(%rsp),%ebp - movl %r11d,%eax + xorl 20(%rsp),%edx + movl %edi,%eax + movl %r14d,16(%rsp) movl %esi,%ecx - xorl 28(%rsp),%ebp - xorl %edi,%eax - roll $5,%ecx - leal 1859775393(%rdx,%r13,1),%r13d - xorl 52(%rsp),%ebp + xorl 28(%rsp),%edx xorl %r12d,%eax + roll $5,%ecx + xorl 52(%rsp),%edx + leal 1859775393(%r14,%r13,1),%r13d + xorl %r11d,%eax addl %ecx,%r13d - xorl 8(%rsp),%ebp roll $30,%edi addl %eax,%r13d - roll $1,%ebp - movl %ebp,20(%rsp) - movl 24(%rsp),%edx - movl %edi,%eax + roll $1,%edx + xorl 24(%rsp),%ebp + movl %esi,%eax + movl %edx,20(%rsp) movl %r13d,%ecx - xorl 32(%rsp),%edx - xorl %esi,%eax - roll $5,%ecx - leal 1859775393(%rbp,%r12,1),%r12d - xorl 56(%rsp),%edx + xorl 32(%rsp),%ebp xorl %r11d,%eax + roll $5,%ecx + xorl 56(%rsp),%ebp + leal 1859775393(%rdx,%r12,1),%r12d + xorl %edi,%eax addl %ecx,%r12d - xorl 12(%rsp),%edx roll $30,%esi addl %eax,%r12d - roll $1,%edx - movl %edx,24(%rsp) - movl 28(%rsp),%ebp - movl %esi,%eax + roll $1,%ebp + xorl 28(%rsp),%r14d + movl %r13d,%eax + movl %ebp,24(%rsp) movl %r12d,%ecx - xorl 36(%rsp),%ebp - xorl %r13d,%eax - roll $5,%ecx - leal 1859775393(%rdx,%r11,1),%r11d - xorl 60(%rsp),%ebp + xorl 36(%rsp),%r14d xorl %edi,%eax + roll $5,%ecx + xorl 60(%rsp),%r14d + leal 1859775393(%rbp,%r11,1),%r11d + xorl %esi,%eax addl %ecx,%r11d - xorl 16(%rsp),%ebp roll $30,%r13d addl %eax,%r11d - roll $1,%ebp - movl %ebp,28(%rsp) - movl 32(%rsp),%edx - movl %r13d,%eax + roll $1,%r14d + xorl 32(%rsp),%edx + movl %r12d,%eax + movl %r14d,28(%rsp) movl %r11d,%ecx xorl 40(%rsp),%edx - xorl %r12d,%eax + xorl %esi,%eax roll $5,%ecx - leal 1859775393(%rbp,%rdi,1),%edi xorl 0(%rsp),%edx - xorl %esi,%eax + leal 1859775393(%r14,%rdi,1),%edi + xorl %r13d,%eax addl %ecx,%edi - xorl 20(%rsp),%edx roll $30,%r12d addl %eax,%edi roll $1,%edx + xorl 36(%rsp),%ebp + movl %r11d,%eax movl %edx,32(%rsp) - movl 36(%rsp),%ebp - movl %r12d,%eax movl %edi,%ecx xorl 44(%rsp),%ebp - xorl %r11d,%eax + xorl %r13d,%eax roll $5,%ecx - leal 1859775393(%rdx,%rsi,1),%esi xorl 4(%rsp),%ebp - xorl %r13d,%eax + leal 1859775393(%rdx,%rsi,1),%esi + xorl %r12d,%eax addl %ecx,%esi - xorl 24(%rsp),%ebp roll $30,%r11d addl %eax,%esi roll $1,%ebp + xorl 40(%rsp),%r14d + movl %edi,%eax movl %ebp,36(%rsp) - movl 40(%rsp),%edx - movl %r11d,%eax movl %esi,%ecx - xorl 48(%rsp),%edx - xorl %edi,%eax + xorl 48(%rsp),%r14d + xorl %r12d,%eax roll $5,%ecx + xorl 8(%rsp),%r14d leal 1859775393(%rbp,%r13,1),%r13d - xorl 8(%rsp),%edx - xorl %r12d,%eax + xorl %r11d,%eax addl %ecx,%r13d - xorl 28(%rsp),%edx roll $30,%edi addl %eax,%r13d - roll $1,%edx - movl %edx,40(%rsp) - movl 44(%rsp),%ebp - movl %edi,%eax + roll $1,%r14d + xorl 44(%rsp),%edx + movl %esi,%eax + movl %r14d,40(%rsp) movl %r13d,%ecx - xorl 52(%rsp),%ebp - xorl %esi,%eax - roll $5,%ecx - leal 1859775393(%rdx,%r12,1),%r12d - xorl 12(%rsp),%ebp + xorl 52(%rsp),%edx xorl %r11d,%eax + roll $5,%ecx + xorl 12(%rsp),%edx + leal 1859775393(%r14,%r12,1),%r12d + xorl %edi,%eax addl %ecx,%r12d - xorl 32(%rsp),%ebp roll $30,%esi addl %eax,%r12d - roll $1,%ebp - movl %ebp,44(%rsp) - movl 48(%rsp),%edx - movl %esi,%eax + roll $1,%edx + xorl 48(%rsp),%ebp + movl %r13d,%eax + movl %edx,44(%rsp) movl %r12d,%ecx - xorl 56(%rsp),%edx - xorl %r13d,%eax - roll $5,%ecx - leal 1859775393(%rbp,%r11,1),%r11d - xorl 16(%rsp),%edx + xorl 56(%rsp),%ebp xorl %edi,%eax + roll $5,%ecx + xorl 16(%rsp),%ebp + leal 1859775393(%rdx,%r11,1),%r11d + xorl %esi,%eax addl %ecx,%r11d - xorl 36(%rsp),%edx roll $30,%r13d addl %eax,%r11d - roll $1,%edx - movl %edx,48(%rsp) - movl 52(%rsp),%ebp - movl %r13d,%eax + roll $1,%ebp + xorl 52(%rsp),%r14d + movl %r12d,%eax + movl %ebp,48(%rsp) movl %r11d,%ecx - xorl 60(%rsp),%ebp - xorl %r12d,%eax - roll $5,%ecx - leal 1859775393(%rdx,%rdi,1),%edi - xorl 20(%rsp),%ebp + xorl 60(%rsp),%r14d xorl %esi,%eax + roll $5,%ecx + xorl 20(%rsp),%r14d + leal 1859775393(%rbp,%rdi,1),%edi + xorl %r13d,%eax addl %ecx,%edi - xorl 40(%rsp),%ebp roll $30,%r12d addl %eax,%edi - roll $1,%ebp - movl %ebp,52(%rsp) - movl 56(%rsp),%edx - movl %r12d,%eax + roll $1,%r14d + xorl 56(%rsp),%edx + movl %r11d,%eax + movl %r14d,52(%rsp) movl %edi,%ecx xorl 0(%rsp),%edx - xorl %r11d,%eax + xorl %r13d,%eax roll $5,%ecx - leal 1859775393(%rbp,%rsi,1),%esi xorl 24(%rsp),%edx - xorl %r13d,%eax + leal 1859775393(%r14,%rsi,1),%esi + xorl %r12d,%eax addl %ecx,%esi - xorl 44(%rsp),%edx roll $30,%r11d addl %eax,%esi roll $1,%edx + xorl 60(%rsp),%ebp + movl %edi,%eax movl %edx,56(%rsp) - movl 60(%rsp),%ebp - movl %r11d,%eax movl %esi,%ecx xorl 4(%rsp),%ebp - xorl %edi,%eax + xorl %r12d,%eax roll $5,%ecx - leal 1859775393(%rdx,%r13,1),%r13d xorl 28(%rsp),%ebp - xorl %r12d,%eax + leal 1859775393(%rdx,%r13,1),%r13d + xorl %r11d,%eax addl %ecx,%r13d - xorl 48(%rsp),%ebp roll $30,%edi addl %eax,%r13d roll $1,%ebp + xorl 0(%rsp),%r14d + movl %esi,%eax movl %ebp,60(%rsp) - movl 0(%rsp),%edx - movl %edi,%eax movl %r13d,%ecx - xorl 8(%rsp),%edx - xorl %esi,%eax + xorl 8(%rsp),%r14d + xorl %r11d,%eax roll $5,%ecx + xorl 32(%rsp),%r14d leal 1859775393(%rbp,%r12,1),%r12d - xorl 32(%rsp),%edx - xorl %r11d,%eax + xorl %edi,%eax addl %ecx,%r12d - xorl 52(%rsp),%edx roll $30,%esi addl %eax,%r12d - roll $1,%edx - movl %edx,0(%rsp) - movl 4(%rsp),%ebp - movl %esi,%eax + roll $1,%r14d + xorl 4(%rsp),%edx + movl %r13d,%eax + movl %r14d,0(%rsp) movl %r12d,%ecx - xorl 12(%rsp),%ebp - xorl %r13d,%eax - roll $5,%ecx - leal 1859775393(%rdx,%r11,1),%r11d - xorl 36(%rsp),%ebp + xorl 12(%rsp),%edx xorl %edi,%eax + roll $5,%ecx + xorl 36(%rsp),%edx + leal 1859775393(%r14,%r11,1),%r11d + xorl %esi,%eax addl %ecx,%r11d - xorl 56(%rsp),%ebp roll $30,%r13d addl %eax,%r11d - roll $1,%ebp - movl %ebp,4(%rsp) - movl 8(%rsp),%edx - movl %r13d,%eax + roll $1,%edx + xorl 8(%rsp),%ebp + movl %r12d,%eax + movl %edx,4(%rsp) movl %r11d,%ecx - xorl 16(%rsp),%edx - xorl %r12d,%eax - roll $5,%ecx - leal 1859775393(%rbp,%rdi,1),%edi - xorl 40(%rsp),%edx + xorl 16(%rsp),%ebp xorl %esi,%eax + roll $5,%ecx + xorl 40(%rsp),%ebp + leal 1859775393(%rdx,%rdi,1),%edi + xorl %r13d,%eax addl %ecx,%edi - xorl 60(%rsp),%edx roll $30,%r12d addl %eax,%edi - roll $1,%edx - movl %edx,8(%rsp) - movl 12(%rsp),%ebp - movl %r12d,%eax + roll $1,%ebp + xorl 12(%rsp),%r14d + movl %r11d,%eax + movl %ebp,8(%rsp) movl %edi,%ecx - xorl 20(%rsp),%ebp - xorl %r11d,%eax - roll $5,%ecx - leal 1859775393(%rdx,%rsi,1),%esi - xorl 44(%rsp),%ebp + xorl 20(%rsp),%r14d xorl %r13d,%eax + roll $5,%ecx + xorl 44(%rsp),%r14d + leal 1859775393(%rbp,%rsi,1),%esi + xorl %r12d,%eax addl %ecx,%esi - xorl 0(%rsp),%ebp roll $30,%r11d addl %eax,%esi - roll $1,%ebp - movl %ebp,12(%rsp) - movl 16(%rsp),%edx - movl %r11d,%eax + roll $1,%r14d + xorl 16(%rsp),%edx + movl %edi,%eax + movl %r14d,12(%rsp) movl %esi,%ecx xorl 24(%rsp),%edx - xorl %edi,%eax + xorl %r12d,%eax roll $5,%ecx - leal 1859775393(%rbp,%r13,1),%r13d xorl 48(%rsp),%edx - xorl %r12d,%eax + leal 1859775393(%r14,%r13,1),%r13d + xorl %r11d,%eax addl %ecx,%r13d - xorl 4(%rsp),%edx roll $30,%edi addl %eax,%r13d roll $1,%edx + xorl 20(%rsp),%ebp + movl %esi,%eax movl %edx,16(%rsp) - movl 20(%rsp),%ebp - movl %edi,%eax movl %r13d,%ecx xorl 28(%rsp),%ebp - xorl %esi,%eax + xorl %r11d,%eax roll $5,%ecx - leal 1859775393(%rdx,%r12,1),%r12d xorl 52(%rsp),%ebp - xorl %r11d,%eax + leal 1859775393(%rdx,%r12,1),%r12d + xorl %edi,%eax addl %ecx,%r12d - xorl 8(%rsp),%ebp roll $30,%esi addl %eax,%r12d roll $1,%ebp + xorl 24(%rsp),%r14d + movl %r13d,%eax movl %ebp,20(%rsp) - movl 24(%rsp),%edx - movl %esi,%eax movl %r12d,%ecx - xorl 32(%rsp),%edx - xorl %r13d,%eax + xorl 32(%rsp),%r14d + xorl %edi,%eax roll $5,%ecx + xorl 56(%rsp),%r14d leal 1859775393(%rbp,%r11,1),%r11d - xorl 56(%rsp),%edx - xorl %edi,%eax + xorl %esi,%eax addl %ecx,%r11d - xorl 12(%rsp),%edx roll $30,%r13d addl %eax,%r11d - roll $1,%edx - movl %edx,24(%rsp) - movl 28(%rsp),%ebp - movl %r13d,%eax + roll $1,%r14d + xorl 28(%rsp),%edx + movl %r12d,%eax + movl %r14d,24(%rsp) movl %r11d,%ecx - xorl 36(%rsp),%ebp - xorl %r12d,%eax - roll $5,%ecx - leal 1859775393(%rdx,%rdi,1),%edi - xorl 60(%rsp),%ebp + xorl 36(%rsp),%edx xorl %esi,%eax + roll $5,%ecx + xorl 60(%rsp),%edx + leal 1859775393(%r14,%rdi,1),%edi + xorl %r13d,%eax addl %ecx,%edi - xorl 16(%rsp),%ebp roll $30,%r12d addl %eax,%edi - roll $1,%ebp - movl %ebp,28(%rsp) - movl 32(%rsp),%edx - movl %r12d,%eax + roll $1,%edx + xorl 32(%rsp),%ebp + movl %r11d,%eax + movl %edx,28(%rsp) movl %edi,%ecx - xorl 40(%rsp),%edx - xorl %r11d,%eax - roll $5,%ecx - leal 1859775393(%rbp,%rsi,1),%esi - xorl 0(%rsp),%edx + xorl 40(%rsp),%ebp xorl %r13d,%eax + roll $5,%ecx + xorl 0(%rsp),%ebp + leal 1859775393(%rdx,%rsi,1),%esi + xorl %r12d,%eax addl %ecx,%esi - xorl 20(%rsp),%edx roll $30,%r11d addl %eax,%esi - roll $1,%edx - movl %edx,32(%rsp) - movl 36(%rsp),%ebp - movl %r11d,%eax - movl %r11d,%ebx - xorl 44(%rsp),%ebp - andl %r12d,%eax + roll $1,%ebp + xorl 36(%rsp),%r14d + movl %r12d,%eax + movl %ebp,32(%rsp) + movl %r12d,%ebx + xorl 44(%rsp),%r14d + andl %r11d,%eax movl %esi,%ecx - xorl 4(%rsp),%ebp - xorl %r12d,%ebx - leal -1894007588(%rdx,%r13,1),%r13d + xorl 4(%rsp),%r14d + leal -1894007588(%rbp,%r13,1),%r13d + xorl %r11d,%ebx roll $5,%ecx - xorl 24(%rsp),%ebp addl %eax,%r13d + roll $1,%r14d andl %edi,%ebx - roll $1,%ebp - addl %ebx,%r13d - roll $30,%edi - movl %ebp,36(%rsp) addl %ecx,%r13d - movl 40(%rsp),%edx - movl %edi,%eax - movl %edi,%ebx + roll $30,%edi + addl %ebx,%r13d + xorl 40(%rsp),%edx + movl %r11d,%eax + movl %r14d,36(%rsp) + movl %r11d,%ebx xorl 48(%rsp),%edx - andl %r11d,%eax + andl %edi,%eax movl %r13d,%ecx xorl 8(%rsp),%edx - xorl %r11d,%ebx - leal -1894007588(%rbp,%r12,1),%r12d + leal -1894007588(%r14,%r12,1),%r12d + xorl %edi,%ebx roll $5,%ecx - xorl 28(%rsp),%edx addl %eax,%r12d - andl %esi,%ebx roll $1,%edx - addl %ebx,%r12d + andl %esi,%ebx + addl %ecx,%r12d roll $30,%esi + addl %ebx,%r12d + xorl 44(%rsp),%ebp + movl %edi,%eax movl %edx,40(%rsp) - addl %ecx,%r12d - movl 44(%rsp),%ebp - movl %esi,%eax - movl %esi,%ebx + movl %edi,%ebx xorl 52(%rsp),%ebp - andl %edi,%eax + andl %esi,%eax movl %r12d,%ecx xorl 12(%rsp),%ebp - xorl %edi,%ebx leal -1894007588(%rdx,%r11,1),%r11d + xorl %esi,%ebx roll $5,%ecx - xorl 32(%rsp),%ebp addl %eax,%r11d - andl %r13d,%ebx roll $1,%ebp - addl %ebx,%r11d + andl %r13d,%ebx + addl %ecx,%r11d roll $30,%r13d + addl %ebx,%r11d + xorl 48(%rsp),%r14d + movl %esi,%eax movl %ebp,44(%rsp) - addl %ecx,%r11d - movl 48(%rsp),%edx - movl %r13d,%eax - movl %r13d,%ebx - xorl 56(%rsp),%edx - andl %esi,%eax + movl %esi,%ebx + xorl 56(%rsp),%r14d + andl %r13d,%eax movl %r11d,%ecx - xorl 16(%rsp),%edx - xorl %esi,%ebx + xorl 16(%rsp),%r14d leal -1894007588(%rbp,%rdi,1),%edi + xorl %r13d,%ebx roll $5,%ecx - xorl 36(%rsp),%edx addl %eax,%edi + roll $1,%r14d andl %r12d,%ebx - roll $1,%edx - addl %ebx,%edi - roll $30,%r12d - movl %edx,48(%rsp) addl %ecx,%edi - movl 52(%rsp),%ebp - movl %r12d,%eax - movl %r12d,%ebx - xorl 60(%rsp),%ebp - andl %r13d,%eax + roll $30,%r12d + addl %ebx,%edi + xorl 52(%rsp),%edx + movl %r13d,%eax + movl %r14d,48(%rsp) + movl %r13d,%ebx + xorl 60(%rsp),%edx + andl %r12d,%eax movl %edi,%ecx - xorl 20(%rsp),%ebp - xorl %r13d,%ebx - leal -1894007588(%rdx,%rsi,1),%esi + xorl 20(%rsp),%edx + leal -1894007588(%r14,%rsi,1),%esi + xorl %r12d,%ebx roll $5,%ecx - xorl 40(%rsp),%ebp addl %eax,%esi + roll $1,%edx andl %r11d,%ebx - roll $1,%ebp - addl %ebx,%esi - roll $30,%r11d - movl %ebp,52(%rsp) addl %ecx,%esi - movl 56(%rsp),%edx - movl %r11d,%eax - movl %r11d,%ebx - xorl 0(%rsp),%edx - andl %r12d,%eax + roll $30,%r11d + addl %ebx,%esi + xorl 56(%rsp),%ebp + movl %r12d,%eax + movl %edx,52(%rsp) + movl %r12d,%ebx + xorl 0(%rsp),%ebp + andl %r11d,%eax movl %esi,%ecx - xorl 24(%rsp),%edx - xorl %r12d,%ebx - leal -1894007588(%rbp,%r13,1),%r13d + xorl 24(%rsp),%ebp + leal -1894007588(%rdx,%r13,1),%r13d + xorl %r11d,%ebx roll $5,%ecx - xorl 44(%rsp),%edx addl %eax,%r13d + roll $1,%ebp andl %edi,%ebx - roll $1,%edx - addl %ebx,%r13d - roll $30,%edi - movl %edx,56(%rsp) addl %ecx,%r13d - movl 60(%rsp),%ebp - movl %edi,%eax - movl %edi,%ebx - xorl 4(%rsp),%ebp - andl %r11d,%eax + roll $30,%edi + addl %ebx,%r13d + xorl 60(%rsp),%r14d + movl %r11d,%eax + movl %ebp,56(%rsp) + movl %r11d,%ebx + xorl 4(%rsp),%r14d + andl %edi,%eax movl %r13d,%ecx - xorl 28(%rsp),%ebp - xorl %r11d,%ebx - leal -1894007588(%rdx,%r12,1),%r12d + xorl 28(%rsp),%r14d + leal -1894007588(%rbp,%r12,1),%r12d + xorl %edi,%ebx roll $5,%ecx - xorl 48(%rsp),%ebp addl %eax,%r12d + roll $1,%r14d andl %esi,%ebx - roll $1,%ebp - addl %ebx,%r12d - roll $30,%esi - movl %ebp,60(%rsp) addl %ecx,%r12d - movl 0(%rsp),%edx - movl %esi,%eax - movl %esi,%ebx + roll $30,%esi + addl %ebx,%r12d + xorl 0(%rsp),%edx + movl %edi,%eax + movl %r14d,60(%rsp) + movl %edi,%ebx xorl 8(%rsp),%edx - andl %edi,%eax + andl %esi,%eax movl %r12d,%ecx xorl 32(%rsp),%edx - xorl %edi,%ebx - leal -1894007588(%rbp,%r11,1),%r11d + leal -1894007588(%r14,%r11,1),%r11d + xorl %esi,%ebx roll $5,%ecx - xorl 52(%rsp),%edx addl %eax,%r11d - andl %r13d,%ebx roll $1,%edx - addl %ebx,%r11d + andl %r13d,%ebx + addl %ecx,%r11d roll $30,%r13d + addl %ebx,%r11d + xorl 4(%rsp),%ebp + movl %esi,%eax movl %edx,0(%rsp) - addl %ecx,%r11d - movl 4(%rsp),%ebp - movl %r13d,%eax - movl %r13d,%ebx + movl %esi,%ebx xorl 12(%rsp),%ebp - andl %esi,%eax + andl %r13d,%eax movl %r11d,%ecx xorl 36(%rsp),%ebp - xorl %esi,%ebx leal -1894007588(%rdx,%rdi,1),%edi + xorl %r13d,%ebx roll $5,%ecx - xorl 56(%rsp),%ebp addl %eax,%edi - andl %r12d,%ebx roll $1,%ebp - addl %ebx,%edi + andl %r12d,%ebx + addl %ecx,%edi roll $30,%r12d + addl %ebx,%edi + xorl 8(%rsp),%r14d + movl %r13d,%eax movl %ebp,4(%rsp) - addl %ecx,%edi - movl 8(%rsp),%edx - movl %r12d,%eax - movl %r12d,%ebx - xorl 16(%rsp),%edx - andl %r13d,%eax + movl %r13d,%ebx + xorl 16(%rsp),%r14d + andl %r12d,%eax movl %edi,%ecx - xorl 40(%rsp),%edx - xorl %r13d,%ebx + xorl 40(%rsp),%r14d leal -1894007588(%rbp,%rsi,1),%esi + xorl %r12d,%ebx roll $5,%ecx - xorl 60(%rsp),%edx addl %eax,%esi + roll $1,%r14d andl %r11d,%ebx - roll $1,%edx - addl %ebx,%esi - roll $30,%r11d - movl %edx,8(%rsp) addl %ecx,%esi - movl 12(%rsp),%ebp - movl %r11d,%eax - movl %r11d,%ebx - xorl 20(%rsp),%ebp - andl %r12d,%eax + roll $30,%r11d + addl %ebx,%esi + xorl 12(%rsp),%edx + movl %r12d,%eax + movl %r14d,8(%rsp) + movl %r12d,%ebx + xorl 20(%rsp),%edx + andl %r11d,%eax movl %esi,%ecx - xorl 44(%rsp),%ebp - xorl %r12d,%ebx - leal -1894007588(%rdx,%r13,1),%r13d + xorl 44(%rsp),%edx + leal -1894007588(%r14,%r13,1),%r13d + xorl %r11d,%ebx roll $5,%ecx - xorl 0(%rsp),%ebp addl %eax,%r13d + roll $1,%edx andl %edi,%ebx - roll $1,%ebp - addl %ebx,%r13d - roll $30,%edi - movl %ebp,12(%rsp) addl %ecx,%r13d - movl 16(%rsp),%edx - movl %edi,%eax - movl %edi,%ebx - xorl 24(%rsp),%edx - andl %r11d,%eax + roll $30,%edi + addl %ebx,%r13d + xorl 16(%rsp),%ebp + movl %r11d,%eax + movl %edx,12(%rsp) + movl %r11d,%ebx + xorl 24(%rsp),%ebp + andl %edi,%eax movl %r13d,%ecx - xorl 48(%rsp),%edx - xorl %r11d,%ebx - leal -1894007588(%rbp,%r12,1),%r12d + xorl 48(%rsp),%ebp + leal -1894007588(%rdx,%r12,1),%r12d + xorl %edi,%ebx roll $5,%ecx - xorl 4(%rsp),%edx addl %eax,%r12d + roll $1,%ebp andl %esi,%ebx - roll $1,%edx - addl %ebx,%r12d - roll $30,%esi - movl %edx,16(%rsp) addl %ecx,%r12d - movl 20(%rsp),%ebp - movl %esi,%eax - movl %esi,%ebx - xorl 28(%rsp),%ebp - andl %edi,%eax + roll $30,%esi + addl %ebx,%r12d + xorl 20(%rsp),%r14d + movl %edi,%eax + movl %ebp,16(%rsp) + movl %edi,%ebx + xorl 28(%rsp),%r14d + andl %esi,%eax movl %r12d,%ecx - xorl 52(%rsp),%ebp - xorl %edi,%ebx - leal -1894007588(%rdx,%r11,1),%r11d + xorl 52(%rsp),%r14d + leal -1894007588(%rbp,%r11,1),%r11d + xorl %esi,%ebx roll $5,%ecx - xorl 8(%rsp),%ebp addl %eax,%r11d + roll $1,%r14d andl %r13d,%ebx - roll $1,%ebp - addl %ebx,%r11d - roll $30,%r13d - movl %ebp,20(%rsp) addl %ecx,%r11d - movl 24(%rsp),%edx - movl %r13d,%eax - movl %r13d,%ebx + roll $30,%r13d + addl %ebx,%r11d + xorl 24(%rsp),%edx + movl %esi,%eax + movl %r14d,20(%rsp) + movl %esi,%ebx xorl 32(%rsp),%edx - andl %esi,%eax + andl %r13d,%eax movl %r11d,%ecx xorl 56(%rsp),%edx - xorl %esi,%ebx - leal -1894007588(%rbp,%rdi,1),%edi + leal -1894007588(%r14,%rdi,1),%edi + xorl %r13d,%ebx roll $5,%ecx - xorl 12(%rsp),%edx addl %eax,%edi - andl %r12d,%ebx roll $1,%edx - addl %ebx,%edi + andl %r12d,%ebx + addl %ecx,%edi roll $30,%r12d + addl %ebx,%edi + xorl 28(%rsp),%ebp + movl %r13d,%eax movl %edx,24(%rsp) - addl %ecx,%edi - movl 28(%rsp),%ebp - movl %r12d,%eax - movl %r12d,%ebx + movl %r13d,%ebx xorl 36(%rsp),%ebp - andl %r13d,%eax + andl %r12d,%eax movl %edi,%ecx xorl 60(%rsp),%ebp - xorl %r13d,%ebx leal -1894007588(%rdx,%rsi,1),%esi + xorl %r12d,%ebx roll $5,%ecx - xorl 16(%rsp),%ebp addl %eax,%esi - andl %r11d,%ebx roll $1,%ebp - addl %ebx,%esi + andl %r11d,%ebx + addl %ecx,%esi roll $30,%r11d + addl %ebx,%esi + xorl 32(%rsp),%r14d + movl %r12d,%eax movl %ebp,28(%rsp) - addl %ecx,%esi - movl 32(%rsp),%edx - movl %r11d,%eax - movl %r11d,%ebx - xorl 40(%rsp),%edx - andl %r12d,%eax + movl %r12d,%ebx + xorl 40(%rsp),%r14d + andl %r11d,%eax movl %esi,%ecx - xorl 0(%rsp),%edx - xorl %r12d,%ebx + xorl 0(%rsp),%r14d leal -1894007588(%rbp,%r13,1),%r13d + xorl %r11d,%ebx roll $5,%ecx - xorl 20(%rsp),%edx addl %eax,%r13d + roll $1,%r14d andl %edi,%ebx - roll $1,%edx - addl %ebx,%r13d - roll $30,%edi - movl %edx,32(%rsp) addl %ecx,%r13d - movl 36(%rsp),%ebp - movl %edi,%eax - movl %edi,%ebx - xorl 44(%rsp),%ebp - andl %r11d,%eax + roll $30,%edi + addl %ebx,%r13d + xorl 36(%rsp),%edx + movl %r11d,%eax + movl %r14d,32(%rsp) + movl %r11d,%ebx + xorl 44(%rsp),%edx + andl %edi,%eax movl %r13d,%ecx - xorl 4(%rsp),%ebp - xorl %r11d,%ebx - leal -1894007588(%rdx,%r12,1),%r12d + xorl 4(%rsp),%edx + leal -1894007588(%r14,%r12,1),%r12d + xorl %edi,%ebx roll $5,%ecx - xorl 24(%rsp),%ebp addl %eax,%r12d + roll $1,%edx andl %esi,%ebx - roll $1,%ebp - addl %ebx,%r12d - roll $30,%esi - movl %ebp,36(%rsp) addl %ecx,%r12d - movl 40(%rsp),%edx - movl %esi,%eax - movl %esi,%ebx - xorl 48(%rsp),%edx - andl %edi,%eax + roll $30,%esi + addl %ebx,%r12d + xorl 40(%rsp),%ebp + movl %edi,%eax + movl %edx,36(%rsp) + movl %edi,%ebx + xorl 48(%rsp),%ebp + andl %esi,%eax movl %r12d,%ecx - xorl 8(%rsp),%edx - xorl %edi,%ebx - leal -1894007588(%rbp,%r11,1),%r11d + xorl 8(%rsp),%ebp + leal -1894007588(%rdx,%r11,1),%r11d + xorl %esi,%ebx roll $5,%ecx - xorl 28(%rsp),%edx addl %eax,%r11d + roll $1,%ebp andl %r13d,%ebx - roll $1,%edx - addl %ebx,%r11d - roll $30,%r13d - movl %edx,40(%rsp) addl %ecx,%r11d - movl 44(%rsp),%ebp - movl %r13d,%eax - movl %r13d,%ebx - xorl 52(%rsp),%ebp - andl %esi,%eax + roll $30,%r13d + addl %ebx,%r11d + xorl 44(%rsp),%r14d + movl %esi,%eax + movl %ebp,40(%rsp) + movl %esi,%ebx + xorl 52(%rsp),%r14d + andl %r13d,%eax movl %r11d,%ecx - xorl 12(%rsp),%ebp - xorl %esi,%ebx - leal -1894007588(%rdx,%rdi,1),%edi + xorl 12(%rsp),%r14d + leal -1894007588(%rbp,%rdi,1),%edi + xorl %r13d,%ebx roll $5,%ecx - xorl 32(%rsp),%ebp addl %eax,%edi + roll $1,%r14d andl %r12d,%ebx - roll $1,%ebp - addl %ebx,%edi - roll $30,%r12d - movl %ebp,44(%rsp) addl %ecx,%edi - movl 48(%rsp),%edx - movl %r12d,%eax - movl %r12d,%ebx + roll $30,%r12d + addl %ebx,%edi + xorl 48(%rsp),%edx + movl %r13d,%eax + movl %r14d,44(%rsp) + movl %r13d,%ebx xorl 56(%rsp),%edx - andl %r13d,%eax + andl %r12d,%eax movl %edi,%ecx xorl 16(%rsp),%edx - xorl %r13d,%ebx - leal -1894007588(%rbp,%rsi,1),%esi + leal -1894007588(%r14,%rsi,1),%esi + xorl %r12d,%ebx roll $5,%ecx - xorl 36(%rsp),%edx addl %eax,%esi - andl %r11d,%ebx roll $1,%edx - addl %ebx,%esi + andl %r11d,%ebx + addl %ecx,%esi roll $30,%r11d + addl %ebx,%esi + xorl 52(%rsp),%ebp + movl %edi,%eax movl %edx,48(%rsp) - addl %ecx,%esi - movl 52(%rsp),%ebp - movl %r11d,%eax movl %esi,%ecx xorl 60(%rsp),%ebp - xorl %edi,%eax + xorl %r12d,%eax roll $5,%ecx - leal -899497514(%rdx,%r13,1),%r13d xorl 20(%rsp),%ebp - xorl %r12d,%eax + leal -899497514(%rdx,%r13,1),%r13d + xorl %r11d,%eax addl %ecx,%r13d - xorl 40(%rsp),%ebp roll $30,%edi addl %eax,%r13d roll $1,%ebp + xorl 56(%rsp),%r14d + movl %esi,%eax movl %ebp,52(%rsp) - movl 56(%rsp),%edx - movl %edi,%eax movl %r13d,%ecx - xorl 0(%rsp),%edx - xorl %esi,%eax + xorl 0(%rsp),%r14d + xorl %r11d,%eax roll $5,%ecx + xorl 24(%rsp),%r14d leal -899497514(%rbp,%r12,1),%r12d - xorl 24(%rsp),%edx - xorl %r11d,%eax + xorl %edi,%eax addl %ecx,%r12d - xorl 44(%rsp),%edx roll $30,%esi addl %eax,%r12d - roll $1,%edx - movl %edx,56(%rsp) - movl 60(%rsp),%ebp - movl %esi,%eax + roll $1,%r14d + xorl 60(%rsp),%edx + movl %r13d,%eax + movl %r14d,56(%rsp) movl %r12d,%ecx - xorl 4(%rsp),%ebp - xorl %r13d,%eax - roll $5,%ecx - leal -899497514(%rdx,%r11,1),%r11d - xorl 28(%rsp),%ebp + xorl 4(%rsp),%edx xorl %edi,%eax + roll $5,%ecx + xorl 28(%rsp),%edx + leal -899497514(%r14,%r11,1),%r11d + xorl %esi,%eax addl %ecx,%r11d - xorl 48(%rsp),%ebp roll $30,%r13d addl %eax,%r11d - roll $1,%ebp - movl %ebp,60(%rsp) - movl 0(%rsp),%edx - movl %r13d,%eax + roll $1,%edx + xorl 0(%rsp),%ebp + movl %r12d,%eax + movl %edx,60(%rsp) movl %r11d,%ecx - xorl 8(%rsp),%edx - xorl %r12d,%eax - roll $5,%ecx - leal -899497514(%rbp,%rdi,1),%edi - xorl 32(%rsp),%edx + xorl 8(%rsp),%ebp xorl %esi,%eax + roll $5,%ecx + xorl 32(%rsp),%ebp + leal -899497514(%rdx,%rdi,1),%edi + xorl %r13d,%eax addl %ecx,%edi - xorl 52(%rsp),%edx roll $30,%r12d addl %eax,%edi - roll $1,%edx - movl %edx,0(%rsp) - movl 4(%rsp),%ebp - movl %r12d,%eax + roll $1,%ebp + xorl 4(%rsp),%r14d + movl %r11d,%eax + movl %ebp,0(%rsp) movl %edi,%ecx - xorl 12(%rsp),%ebp - xorl %r11d,%eax - roll $5,%ecx - leal -899497514(%rdx,%rsi,1),%esi - xorl 36(%rsp),%ebp + xorl 12(%rsp),%r14d xorl %r13d,%eax + roll $5,%ecx + xorl 36(%rsp),%r14d + leal -899497514(%rbp,%rsi,1),%esi + xorl %r12d,%eax addl %ecx,%esi - xorl 56(%rsp),%ebp roll $30,%r11d addl %eax,%esi - roll $1,%ebp - movl %ebp,4(%rsp) - movl 8(%rsp),%edx - movl %r11d,%eax + roll $1,%r14d + xorl 8(%rsp),%edx + movl %edi,%eax + movl %r14d,4(%rsp) movl %esi,%ecx xorl 16(%rsp),%edx - xorl %edi,%eax + xorl %r12d,%eax roll $5,%ecx - leal -899497514(%rbp,%r13,1),%r13d xorl 40(%rsp),%edx - xorl %r12d,%eax + leal -899497514(%r14,%r13,1),%r13d + xorl %r11d,%eax addl %ecx,%r13d - xorl 60(%rsp),%edx roll $30,%edi addl %eax,%r13d roll $1,%edx + xorl 12(%rsp),%ebp + movl %esi,%eax movl %edx,8(%rsp) - movl 12(%rsp),%ebp - movl %edi,%eax movl %r13d,%ecx xorl 20(%rsp),%ebp - xorl %esi,%eax + xorl %r11d,%eax roll $5,%ecx - leal -899497514(%rdx,%r12,1),%r12d xorl 44(%rsp),%ebp - xorl %r11d,%eax + leal -899497514(%rdx,%r12,1),%r12d + xorl %edi,%eax addl %ecx,%r12d - xorl 0(%rsp),%ebp roll $30,%esi addl %eax,%r12d roll $1,%ebp + xorl 16(%rsp),%r14d + movl %r13d,%eax movl %ebp,12(%rsp) - movl 16(%rsp),%edx - movl %esi,%eax movl %r12d,%ecx - xorl 24(%rsp),%edx - xorl %r13d,%eax + xorl 24(%rsp),%r14d + xorl %edi,%eax roll $5,%ecx + xorl 48(%rsp),%r14d leal -899497514(%rbp,%r11,1),%r11d - xorl 48(%rsp),%edx - xorl %edi,%eax + xorl %esi,%eax addl %ecx,%r11d - xorl 4(%rsp),%edx roll $30,%r13d addl %eax,%r11d - roll $1,%edx - movl %edx,16(%rsp) - movl 20(%rsp),%ebp - movl %r13d,%eax + roll $1,%r14d + xorl 20(%rsp),%edx + movl %r12d,%eax + movl %r14d,16(%rsp) movl %r11d,%ecx - xorl 28(%rsp),%ebp - xorl %r12d,%eax - roll $5,%ecx - leal -899497514(%rdx,%rdi,1),%edi - xorl 52(%rsp),%ebp + xorl 28(%rsp),%edx xorl %esi,%eax + roll $5,%ecx + xorl 52(%rsp),%edx + leal -899497514(%r14,%rdi,1),%edi + xorl %r13d,%eax addl %ecx,%edi - xorl 8(%rsp),%ebp roll $30,%r12d addl %eax,%edi - roll $1,%ebp - movl %ebp,20(%rsp) - movl 24(%rsp),%edx - movl %r12d,%eax + roll $1,%edx + xorl 24(%rsp),%ebp + movl %r11d,%eax + movl %edx,20(%rsp) movl %edi,%ecx - xorl 32(%rsp),%edx - xorl %r11d,%eax - roll $5,%ecx - leal -899497514(%rbp,%rsi,1),%esi - xorl 56(%rsp),%edx + xorl 32(%rsp),%ebp xorl %r13d,%eax + roll $5,%ecx + xorl 56(%rsp),%ebp + leal -899497514(%rdx,%rsi,1),%esi + xorl %r12d,%eax addl %ecx,%esi - xorl 12(%rsp),%edx roll $30,%r11d addl %eax,%esi - roll $1,%edx - movl %edx,24(%rsp) - movl 28(%rsp),%ebp - movl %r11d,%eax + roll $1,%ebp + xorl 28(%rsp),%r14d + movl %edi,%eax + movl %ebp,24(%rsp) movl %esi,%ecx - xorl 36(%rsp),%ebp - xorl %edi,%eax - roll $5,%ecx - leal -899497514(%rdx,%r13,1),%r13d - xorl 60(%rsp),%ebp + xorl 36(%rsp),%r14d xorl %r12d,%eax + roll $5,%ecx + xorl 60(%rsp),%r14d + leal -899497514(%rbp,%r13,1),%r13d + xorl %r11d,%eax addl %ecx,%r13d - xorl 16(%rsp),%ebp roll $30,%edi addl %eax,%r13d - roll $1,%ebp - movl %ebp,28(%rsp) - movl 32(%rsp),%edx - movl %edi,%eax + roll $1,%r14d + xorl 32(%rsp),%edx + movl %esi,%eax + movl %r14d,28(%rsp) movl %r13d,%ecx xorl 40(%rsp),%edx - xorl %esi,%eax + xorl %r11d,%eax roll $5,%ecx - leal -899497514(%rbp,%r12,1),%r12d xorl 0(%rsp),%edx - xorl %r11d,%eax + leal -899497514(%r14,%r12,1),%r12d + xorl %edi,%eax addl %ecx,%r12d - xorl 20(%rsp),%edx roll $30,%esi addl %eax,%r12d roll $1,%edx - movl %edx,32(%rsp) - movl 36(%rsp),%ebp - movl %esi,%eax + xorl 36(%rsp),%ebp + movl %r13d,%eax + movl %r12d,%ecx xorl 44(%rsp),%ebp - xorl %r13d,%eax + xorl %edi,%eax roll $5,%ecx - leal -899497514(%rdx,%r11,1),%r11d xorl 4(%rsp),%ebp - xorl %edi,%eax + leal -899497514(%rdx,%r11,1),%r11d + xorl %esi,%eax addl %ecx,%r11d - xorl 24(%rsp),%ebp roll $30,%r13d addl %eax,%r11d roll $1,%ebp - movl %ebp,36(%rsp) - movl 40(%rsp),%edx - movl %r13d,%eax + xorl 40(%rsp),%r14d + movl %r12d,%eax + movl %r11d,%ecx - xorl 48(%rsp),%edx - xorl %r12d,%eax + xorl 48(%rsp),%r14d + xorl %esi,%eax roll $5,%ecx + xorl 8(%rsp),%r14d leal -899497514(%rbp,%rdi,1),%edi - xorl 8(%rsp),%edx - xorl %esi,%eax + xorl %r13d,%eax addl %ecx,%edi - xorl 28(%rsp),%edx roll $30,%r12d addl %eax,%edi - roll $1,%edx - movl %edx,40(%rsp) - movl 44(%rsp),%ebp - movl %r12d,%eax + roll $1,%r14d + xorl 44(%rsp),%edx + movl %r11d,%eax + movl %edi,%ecx - xorl 52(%rsp),%ebp - xorl %r11d,%eax - roll $5,%ecx - leal -899497514(%rdx,%rsi,1),%esi - xorl 12(%rsp),%ebp + xorl 52(%rsp),%edx xorl %r13d,%eax + roll $5,%ecx + xorl 12(%rsp),%edx + leal -899497514(%r14,%rsi,1),%esi + xorl %r12d,%eax addl %ecx,%esi - xorl 32(%rsp),%ebp roll $30,%r11d addl %eax,%esi - roll $1,%ebp - movl %ebp,44(%rsp) - movl 48(%rsp),%edx - movl %r11d,%eax + roll $1,%edx + xorl 48(%rsp),%ebp + movl %edi,%eax + movl %esi,%ecx - xorl 56(%rsp),%edx - xorl %edi,%eax - roll $5,%ecx - leal -899497514(%rbp,%r13,1),%r13d - xorl 16(%rsp),%edx + xorl 56(%rsp),%ebp xorl %r12d,%eax + roll $5,%ecx + xorl 16(%rsp),%ebp + leal -899497514(%rdx,%r13,1),%r13d + xorl %r11d,%eax addl %ecx,%r13d - xorl 36(%rsp),%edx roll $30,%edi addl %eax,%r13d - roll $1,%edx - movl %edx,48(%rsp) - movl 52(%rsp),%ebp - movl %edi,%eax + roll $1,%ebp + xorl 52(%rsp),%r14d + movl %esi,%eax + movl %r13d,%ecx - xorl 60(%rsp),%ebp - xorl %esi,%eax - roll $5,%ecx - leal -899497514(%rdx,%r12,1),%r12d - xorl 20(%rsp),%ebp + xorl 60(%rsp),%r14d xorl %r11d,%eax + roll $5,%ecx + xorl 20(%rsp),%r14d + leal -899497514(%rbp,%r12,1),%r12d + xorl %edi,%eax addl %ecx,%r12d - xorl 40(%rsp),%ebp roll $30,%esi addl %eax,%r12d - roll $1,%ebp - movl 56(%rsp),%edx - movl %esi,%eax + roll $1,%r14d + xorl 56(%rsp),%edx + movl %r13d,%eax + movl %r12d,%ecx xorl 0(%rsp),%edx - xorl %r13d,%eax + xorl %edi,%eax roll $5,%ecx - leal -899497514(%rbp,%r11,1),%r11d xorl 24(%rsp),%edx - xorl %edi,%eax + leal -899497514(%r14,%r11,1),%r11d + xorl %esi,%eax addl %ecx,%r11d - xorl 44(%rsp),%edx roll $30,%r13d addl %eax,%r11d roll $1,%edx - movl 60(%rsp),%ebp - movl %r13d,%eax + xorl 60(%rsp),%ebp + movl %r12d,%eax + movl %r11d,%ecx xorl 4(%rsp),%ebp - xorl %r12d,%eax + xorl %esi,%eax roll $5,%ecx - leal -899497514(%rdx,%rdi,1),%edi xorl 28(%rsp),%ebp - xorl %esi,%eax + leal -899497514(%rdx,%rdi,1),%edi + xorl %r13d,%eax addl %ecx,%edi - xorl 48(%rsp),%ebp roll $30,%r12d addl %eax,%edi roll $1,%ebp - movl %r12d,%eax + movl %r11d,%eax movl %edi,%ecx - xorl %r11d,%eax + xorl %r13d,%eax leal -899497514(%rbp,%rsi,1),%esi roll $5,%ecx - xorl %r13d,%eax + xorl %r12d,%eax addl %ecx,%esi roll $30,%r11d addl %eax,%esi @@ -1281,29 +1223,202 @@ sha1_block_data_order: jnz .Lloop movq 64(%rsp),%rsi - movq (%rsi),%r13 - movq 8(%rsi),%r12 - movq 16(%rsi),%rbp - movq 24(%rsi),%rbx - leaq 32(%rsi),%rsp + movq -40(%rsi),%r14 + movq -32(%rsi),%r13 + movq -24(%rsi),%r12 + movq -16(%rsi),%rbp + movq -8(%rsi),%rbx + leaq (%rsi),%rsp .Lepilogue: .byte 0xf3,0xc3 .size sha1_block_data_order,.-sha1_block_data_order +.type sha1_block_data_order_shaext,@function +.align 32 +sha1_block_data_order_shaext: +_shaext_shortcut: + movdqu (%rdi),%xmm0 + movd 16(%rdi),%xmm1 + movdqa K_XX_XX+160(%rip),%xmm3 + + movdqu (%rsi),%xmm4 + pshufd $27,%xmm0,%xmm0 + movdqu 16(%rsi),%xmm5 + pshufd $27,%xmm1,%xmm1 + movdqu 32(%rsi),%xmm6 +.byte 102,15,56,0,227 + movdqu 48(%rsi),%xmm7 +.byte 102,15,56,0,235 +.byte 102,15,56,0,243 + movdqa %xmm1,%xmm9 +.byte 102,15,56,0,251 + jmp .Loop_shaext + +.align 16 +.Loop_shaext: + decq %rdx + leaq 64(%rsi),%rax + paddd %xmm4,%xmm1 + cmovneq %rax,%rsi + movdqa %xmm0,%xmm8 +.byte 15,56,201,229 + movdqa %xmm0,%xmm2 +.byte 15,58,204,193,0 +.byte 15,56,200,213 + pxor %xmm6,%xmm4 +.byte 15,56,201,238 +.byte 15,56,202,231 + + movdqa %xmm0,%xmm1 +.byte 15,58,204,194,0 +.byte 15,56,200,206 + pxor %xmm7,%xmm5 +.byte 15,56,202,236 +.byte 15,56,201,247 + movdqa %xmm0,%xmm2 +.byte 15,58,204,193,0 +.byte 15,56,200,215 + pxor %xmm4,%xmm6 +.byte 15,56,201,252 +.byte 15,56,202,245 + + movdqa %xmm0,%xmm1 +.byte 15,58,204,194,0 +.byte 15,56,200,204 + pxor %xmm5,%xmm7 +.byte 15,56,202,254 +.byte 15,56,201,229 + movdqa %xmm0,%xmm2 +.byte 15,58,204,193,0 +.byte 15,56,200,213 + pxor %xmm6,%xmm4 +.byte 15,56,201,238 +.byte 15,56,202,231 + + movdqa %xmm0,%xmm1 +.byte 15,58,204,194,1 +.byte 15,56,200,206 + pxor %xmm7,%xmm5 +.byte 15,56,202,236 +.byte 15,56,201,247 + movdqa %xmm0,%xmm2 +.byte 15,58,204,193,1 +.byte 15,56,200,215 + pxor %xmm4,%xmm6 +.byte 15,56,201,252 +.byte 15,56,202,245 + + movdqa %xmm0,%xmm1 +.byte 15,58,204,194,1 +.byte 15,56,200,204 + pxor %xmm5,%xmm7 +.byte 15,56,202,254 +.byte 15,56,201,229 + movdqa %xmm0,%xmm2 +.byte 15,58,204,193,1 +.byte 15,56,200,213 + pxor %xmm6,%xmm4 +.byte 15,56,201,238 +.byte 15,56,202,231 + + movdqa %xmm0,%xmm1 +.byte 15,58,204,194,1 +.byte 15,56,200,206 + pxor %xmm7,%xmm5 +.byte 15,56,202,236 +.byte 15,56,201,247 + movdqa %xmm0,%xmm2 +.byte 15,58,204,193,2 +.byte 15,56,200,215 + pxor %xmm4,%xmm6 +.byte 15,56,201,252 +.byte 15,56,202,245 + + movdqa %xmm0,%xmm1 +.byte 15,58,204,194,2 +.byte 15,56,200,204 + pxor %xmm5,%xmm7 +.byte 15,56,202,254 +.byte 15,56,201,229 + movdqa %xmm0,%xmm2 +.byte 15,58,204,193,2 +.byte 15,56,200,213 + pxor %xmm6,%xmm4 +.byte 15,56,201,238 +.byte 15,56,202,231 + + movdqa %xmm0,%xmm1 +.byte 15,58,204,194,2 +.byte 15,56,200,206 + pxor %xmm7,%xmm5 +.byte 15,56,202,236 +.byte 15,56,201,247 + movdqa %xmm0,%xmm2 +.byte 15,58,204,193,2 +.byte 15,56,200,215 + pxor %xmm4,%xmm6 +.byte 15,56,201,252 +.byte 15,56,202,245 + + movdqa %xmm0,%xmm1 +.byte 15,58,204,194,3 +.byte 15,56,200,204 + pxor %xmm5,%xmm7 +.byte 15,56,202,254 + movdqu (%rsi),%xmm4 + movdqa %xmm0,%xmm2 +.byte 15,58,204,193,3 +.byte 15,56,200,213 + movdqu 16(%rsi),%xmm5 +.byte 102,15,56,0,227 + + movdqa %xmm0,%xmm1 +.byte 15,58,204,194,3 +.byte 15,56,200,206 + movdqu 32(%rsi),%xmm6 +.byte 102,15,56,0,235 + + movdqa %xmm0,%xmm2 +.byte 15,58,204,193,3 +.byte 15,56,200,215 + movdqu 48(%rsi),%xmm7 +.byte 102,15,56,0,243 + + movdqa %xmm0,%xmm1 +.byte 15,58,204,194,3 +.byte 65,15,56,200,201 +.byte 102,15,56,0,251 + + paddd %xmm8,%xmm0 + movdqa %xmm1,%xmm9 + + jnz .Loop_shaext + + pshufd $27,%xmm0,%xmm0 + pshufd $27,%xmm1,%xmm1 + movdqu %xmm0,(%rdi) + movd %xmm1,16(%rdi) + .byte 0xf3,0xc3 +.size sha1_block_data_order_shaext,.-sha1_block_data_order_shaext .type sha1_block_data_order_ssse3,@function .align 16 sha1_block_data_order_ssse3: _ssse3_shortcut: + movq %rsp,%rax pushq %rbx pushq %rbp pushq %r12 + pushq %r13 + pushq %r14 leaq -64(%rsp),%rsp + movq %rax,%r14 + andq $-64,%rsp movq %rdi,%r8 movq %rsi,%r9 movq %rdx,%r10 shlq $6,%r10 addq %r9,%r10 - leaq K_XX_XX(%rip),%r11 + leaq K_XX_XX+64(%rip),%r11 movl 0(%r8),%eax movl 4(%r8),%ebx @@ -1311,19 +1426,22 @@ _ssse3_shortcut: movl 12(%r8),%edx movl %ebx,%esi movl 16(%r8),%ebp + movl %ecx,%edi + xorl %edx,%edi + andl %edi,%esi movdqa 64(%r11),%xmm6 - movdqa 0(%r11),%xmm9 + movdqa -64(%r11),%xmm9 movdqu 0(%r9),%xmm0 movdqu 16(%r9),%xmm1 movdqu 32(%r9),%xmm2 movdqu 48(%r9),%xmm3 .byte 102,15,56,0,198 - addq $64,%r9 .byte 102,15,56,0,206 .byte 102,15,56,0,214 -.byte 102,15,56,0,222 + addq $64,%r9 paddd %xmm9,%xmm0 +.byte 102,15,56,0,222 paddd %xmm9,%xmm1 paddd %xmm9,%xmm2 movdqa %xmm0,0(%rsp) @@ -1335,904 +1453,882 @@ _ssse3_shortcut: jmp .Loop_ssse3 .align 16 .Loop_ssse3: - movdqa %xmm1,%xmm4 - addl 0(%rsp),%ebp - xorl %edx,%ecx + rorl $2,%ebx + pshufd $238,%xmm0,%xmm4 + xorl %edx,%esi movdqa %xmm3,%xmm8 -.byte 102,15,58,15,224,8 + paddd %xmm3,%xmm9 movl %eax,%edi + addl 0(%rsp),%ebp + punpcklqdq %xmm1,%xmm4 + xorl %ecx,%ebx roll $5,%eax - paddd %xmm3,%xmm9 - andl %ecx,%esi - xorl %edx,%ecx + addl %esi,%ebp psrldq $4,%xmm8 - xorl %edx,%esi - addl %eax,%ebp + andl %ebx,%edi + xorl %ecx,%ebx pxor %xmm0,%xmm4 - rorl $2,%ebx - addl %esi,%ebp + addl %eax,%ebp + rorl $7,%eax pxor %xmm2,%xmm8 - addl 4(%rsp),%edx - xorl %ecx,%ebx + xorl %ecx,%edi movl %ebp,%esi - roll $5,%ebp + addl 4(%rsp),%edx pxor %xmm8,%xmm4 - andl %ebx,%edi - xorl %ecx,%ebx + xorl %ebx,%eax + roll $5,%ebp movdqa %xmm9,48(%rsp) - xorl %ecx,%edi - addl %ebp,%edx - movdqa %xmm4,%xmm10 - movdqa %xmm4,%xmm8 - rorl $7,%eax addl %edi,%edx - addl 8(%rsp),%ecx + andl %eax,%esi + movdqa %xmm4,%xmm10 xorl %ebx,%eax + addl %ebp,%edx + rorl $7,%ebp + movdqa %xmm4,%xmm8 + xorl %ebx,%esi pslldq $12,%xmm10 paddd %xmm4,%xmm4 movl %edx,%edi - roll $5,%edx - andl %eax,%esi - xorl %ebx,%eax + addl 8(%rsp),%ecx psrld $31,%xmm8 - xorl %ebx,%esi - addl %edx,%ecx - movdqa %xmm10,%xmm9 - rorl $7,%ebp + xorl %eax,%ebp + roll $5,%edx addl %esi,%ecx + movdqa %xmm10,%xmm9 + andl %ebp,%edi + xorl %eax,%ebp psrld $30,%xmm10 + addl %edx,%ecx + rorl $7,%edx por %xmm8,%xmm4 - addl 12(%rsp),%ebx - xorl %eax,%ebp + xorl %eax,%edi movl %ecx,%esi - roll $5,%ecx + addl 12(%rsp),%ebx pslld $2,%xmm9 pxor %xmm10,%xmm4 - andl %ebp,%edi - xorl %eax,%ebp - movdqa 0(%r11),%xmm10 - xorl %eax,%edi - addl %ecx,%ebx - pxor %xmm9,%xmm4 - rorl $7,%edx + xorl %ebp,%edx + movdqa -64(%r11),%xmm10 + roll $5,%ecx addl %edi,%ebx - movdqa %xmm2,%xmm5 - addl 16(%rsp),%eax + andl %edx,%esi + pxor %xmm9,%xmm4 xorl %ebp,%edx + addl %ecx,%ebx + rorl $7,%ecx + pshufd $238,%xmm1,%xmm5 + xorl %ebp,%esi movdqa %xmm4,%xmm9 -.byte 102,15,58,15,233,8 + paddd %xmm4,%xmm10 movl %ebx,%edi + addl 16(%rsp),%eax + punpcklqdq %xmm2,%xmm5 + xorl %edx,%ecx roll $5,%ebx - paddd %xmm4,%xmm10 - andl %edx,%esi - xorl %ebp,%edx + addl %esi,%eax psrldq $4,%xmm9 - xorl %ebp,%esi - addl %ebx,%eax + andl %ecx,%edi + xorl %edx,%ecx pxor %xmm1,%xmm5 - rorl $7,%ecx - addl %esi,%eax + addl %ebx,%eax + rorl $7,%ebx pxor %xmm3,%xmm9 - addl 20(%rsp),%ebp - xorl %edx,%ecx + xorl %edx,%edi movl %eax,%esi - roll $5,%eax + addl 20(%rsp),%ebp pxor %xmm9,%xmm5 - andl %ecx,%edi - xorl %edx,%ecx + xorl %ecx,%ebx + roll $5,%eax movdqa %xmm10,0(%rsp) - xorl %edx,%edi - addl %eax,%ebp - movdqa %xmm5,%xmm8 - movdqa %xmm5,%xmm9 - rorl $7,%ebx addl %edi,%ebp - addl 24(%rsp),%edx + andl %ebx,%esi + movdqa %xmm5,%xmm8 xorl %ecx,%ebx + addl %eax,%ebp + rorl $7,%eax + movdqa %xmm5,%xmm9 + xorl %ecx,%esi pslldq $12,%xmm8 paddd %xmm5,%xmm5 movl %ebp,%edi - roll $5,%ebp - andl %ebx,%esi - xorl %ecx,%ebx + addl 24(%rsp),%edx psrld $31,%xmm9 - xorl %ecx,%esi - addl %ebp,%edx - movdqa %xmm8,%xmm10 - rorl $7,%eax + xorl %ebx,%eax + roll $5,%ebp addl %esi,%edx + movdqa %xmm8,%xmm10 + andl %eax,%edi + xorl %ebx,%eax psrld $30,%xmm8 + addl %ebp,%edx + rorl $7,%ebp por %xmm9,%xmm5 - addl 28(%rsp),%ecx - xorl %ebx,%eax + xorl %ebx,%edi movl %edx,%esi - roll $5,%edx + addl 28(%rsp),%ecx pslld $2,%xmm10 pxor %xmm8,%xmm5 - andl %eax,%edi - xorl %ebx,%eax - movdqa 16(%r11),%xmm8 - xorl %ebx,%edi - addl %edx,%ecx - pxor %xmm10,%xmm5 - rorl $7,%ebp + xorl %eax,%ebp + movdqa -32(%r11),%xmm8 + roll $5,%edx addl %edi,%ecx - movdqa %xmm3,%xmm6 - addl 32(%rsp),%ebx + andl %ebp,%esi + pxor %xmm10,%xmm5 xorl %eax,%ebp + addl %edx,%ecx + rorl $7,%edx + pshufd $238,%xmm2,%xmm6 + xorl %eax,%esi movdqa %xmm5,%xmm10 -.byte 102,15,58,15,242,8 + paddd %xmm5,%xmm8 movl %ecx,%edi + addl 32(%rsp),%ebx + punpcklqdq %xmm3,%xmm6 + xorl %ebp,%edx roll $5,%ecx - paddd %xmm5,%xmm8 - andl %ebp,%esi - xorl %eax,%ebp + addl %esi,%ebx psrldq $4,%xmm10 - xorl %eax,%esi - addl %ecx,%ebx + andl %edx,%edi + xorl %ebp,%edx pxor %xmm2,%xmm6 - rorl $7,%edx - addl %esi,%ebx + addl %ecx,%ebx + rorl $7,%ecx pxor %xmm4,%xmm10 - addl 36(%rsp),%eax - xorl %ebp,%edx + xorl %ebp,%edi movl %ebx,%esi - roll $5,%ebx + addl 36(%rsp),%eax pxor %xmm10,%xmm6 - andl %edx,%edi - xorl %ebp,%edx + xorl %edx,%ecx + roll $5,%ebx movdqa %xmm8,16(%rsp) - xorl %ebp,%edi - addl %ebx,%eax - movdqa %xmm6,%xmm9 - movdqa %xmm6,%xmm10 - rorl $7,%ecx addl %edi,%eax - addl 40(%rsp),%ebp + andl %ecx,%esi + movdqa %xmm6,%xmm9 xorl %edx,%ecx + addl %ebx,%eax + rorl $7,%ebx + movdqa %xmm6,%xmm10 + xorl %edx,%esi pslldq $12,%xmm9 paddd %xmm6,%xmm6 movl %eax,%edi - roll $5,%eax - andl %ecx,%esi - xorl %edx,%ecx + addl 40(%rsp),%ebp psrld $31,%xmm10 - xorl %edx,%esi - addl %eax,%ebp - movdqa %xmm9,%xmm8 - rorl $7,%ebx + xorl %ecx,%ebx + roll $5,%eax addl %esi,%ebp + movdqa %xmm9,%xmm8 + andl %ebx,%edi + xorl %ecx,%ebx psrld $30,%xmm9 + addl %eax,%ebp + rorl $7,%eax por %xmm10,%xmm6 - addl 44(%rsp),%edx - xorl %ecx,%ebx + xorl %ecx,%edi movl %ebp,%esi - roll $5,%ebp + addl 44(%rsp),%edx pslld $2,%xmm8 pxor %xmm9,%xmm6 - andl %ebx,%edi - xorl %ecx,%ebx - movdqa 16(%r11),%xmm9 - xorl %ecx,%edi - addl %ebp,%edx - pxor %xmm8,%xmm6 - rorl $7,%eax + xorl %ebx,%eax + movdqa -32(%r11),%xmm9 + roll $5,%ebp addl %edi,%edx - movdqa %xmm4,%xmm7 - addl 48(%rsp),%ecx + andl %eax,%esi + pxor %xmm8,%xmm6 xorl %ebx,%eax + addl %ebp,%edx + rorl $7,%ebp + pshufd $238,%xmm3,%xmm7 + xorl %ebx,%esi movdqa %xmm6,%xmm8 -.byte 102,15,58,15,251,8 + paddd %xmm6,%xmm9 movl %edx,%edi + addl 48(%rsp),%ecx + punpcklqdq %xmm4,%xmm7 + xorl %eax,%ebp roll $5,%edx - paddd %xmm6,%xmm9 - andl %eax,%esi - xorl %ebx,%eax + addl %esi,%ecx psrldq $4,%xmm8 - xorl %ebx,%esi - addl %edx,%ecx + andl %ebp,%edi + xorl %eax,%ebp pxor %xmm3,%xmm7 - rorl $7,%ebp - addl %esi,%ecx + addl %edx,%ecx + rorl $7,%edx pxor %xmm5,%xmm8 - addl 52(%rsp),%ebx - xorl %eax,%ebp + xorl %eax,%edi movl %ecx,%esi - roll $5,%ecx + addl 52(%rsp),%ebx pxor %xmm8,%xmm7 - andl %ebp,%edi - xorl %eax,%ebp + xorl %ebp,%edx + roll $5,%ecx movdqa %xmm9,32(%rsp) - xorl %eax,%edi - addl %ecx,%ebx - movdqa %xmm7,%xmm10 - movdqa %xmm7,%xmm8 - rorl $7,%edx addl %edi,%ebx - addl 56(%rsp),%eax + andl %edx,%esi + movdqa %xmm7,%xmm10 xorl %ebp,%edx + addl %ecx,%ebx + rorl $7,%ecx + movdqa %xmm7,%xmm8 + xorl %ebp,%esi pslldq $12,%xmm10 paddd %xmm7,%xmm7 movl %ebx,%edi - roll $5,%ebx - andl %edx,%esi - xorl %ebp,%edx + addl 56(%rsp),%eax psrld $31,%xmm8 - xorl %ebp,%esi - addl %ebx,%eax - movdqa %xmm10,%xmm9 - rorl $7,%ecx + xorl %edx,%ecx + roll $5,%ebx addl %esi,%eax + movdqa %xmm10,%xmm9 + andl %ecx,%edi + xorl %edx,%ecx psrld $30,%xmm10 + addl %ebx,%eax + rorl $7,%ebx por %xmm8,%xmm7 - addl 60(%rsp),%ebp - xorl %edx,%ecx + xorl %edx,%edi movl %eax,%esi - roll $5,%eax + addl 60(%rsp),%ebp pslld $2,%xmm9 pxor %xmm10,%xmm7 - andl %ecx,%edi - xorl %edx,%ecx - movdqa 16(%r11),%xmm10 - xorl %edx,%edi - addl %eax,%ebp - pxor %xmm9,%xmm7 - rorl $7,%ebx + xorl %ecx,%ebx + movdqa -32(%r11),%xmm10 + roll $5,%eax addl %edi,%ebp - movdqa %xmm7,%xmm9 - addl 0(%rsp),%edx - pxor %xmm4,%xmm0 -.byte 102,68,15,58,15,206,8 + andl %ebx,%esi + pxor %xmm9,%xmm7 + pshufd $238,%xmm6,%xmm9 xorl %ecx,%ebx + addl %eax,%ebp + rorl $7,%eax + pxor %xmm4,%xmm0 + xorl %ecx,%esi movl %ebp,%edi + addl 0(%rsp),%edx + punpcklqdq %xmm7,%xmm9 + xorl %ebx,%eax roll $5,%ebp pxor %xmm1,%xmm0 - andl %ebx,%esi - xorl %ecx,%ebx + addl %esi,%edx + andl %eax,%edi movdqa %xmm10,%xmm8 + xorl %ebx,%eax paddd %xmm7,%xmm10 - xorl %ecx,%esi addl %ebp,%edx pxor %xmm9,%xmm0 - rorl $7,%eax - addl %esi,%edx + rorl $7,%ebp + xorl %ebx,%edi + movl %edx,%esi addl 4(%rsp),%ecx - xorl %ebx,%eax movdqa %xmm0,%xmm9 - movdqa %xmm10,48(%rsp) - movl %edx,%esi + xorl %eax,%ebp roll $5,%edx - andl %eax,%edi - xorl %ebx,%eax + movdqa %xmm10,48(%rsp) + addl %edi,%ecx + andl %ebp,%esi + xorl %eax,%ebp pslld $2,%xmm0 - xorl %ebx,%edi addl %edx,%ecx + rorl $7,%edx psrld $30,%xmm9 - rorl $7,%ebp - addl %edi,%ecx - addl 8(%rsp),%ebx - xorl %eax,%ebp + xorl %eax,%esi movl %ecx,%edi - roll $5,%ecx + addl 8(%rsp),%ebx por %xmm9,%xmm0 - andl %ebp,%esi - xorl %eax,%ebp - movdqa %xmm0,%xmm10 - xorl %eax,%esi - addl %ecx,%ebx - rorl $7,%edx - addl %esi,%ebx - addl 12(%rsp),%eax xorl %ebp,%edx - movl %ebx,%esi - roll $5,%ebx + roll $5,%ecx + pshufd $238,%xmm7,%xmm10 + addl %esi,%ebx andl %edx,%edi xorl %ebp,%edx + addl %ecx,%ebx + addl 12(%rsp),%eax xorl %ebp,%edi - addl %ebx,%eax - rorl $7,%ecx + movl %ebx,%esi + roll $5,%ebx addl %edi,%eax - addl 16(%rsp),%ebp - pxor %xmm5,%xmm1 -.byte 102,68,15,58,15,215,8 xorl %edx,%esi + rorl $7,%ecx + addl %ebx,%eax + pxor %xmm5,%xmm1 + addl 16(%rsp),%ebp + xorl %ecx,%esi + punpcklqdq %xmm0,%xmm10 movl %eax,%edi roll $5,%eax pxor %xmm2,%xmm1 - xorl %ecx,%esi - addl %eax,%ebp + addl %esi,%ebp + xorl %ecx,%edi movdqa %xmm8,%xmm9 - paddd %xmm0,%xmm8 rorl $7,%ebx - addl %esi,%ebp + paddd %xmm0,%xmm8 + addl %eax,%ebp pxor %xmm10,%xmm1 addl 20(%rsp),%edx - xorl %ecx,%edi + xorl %ebx,%edi movl %ebp,%esi roll $5,%ebp movdqa %xmm1,%xmm10 + addl %edi,%edx + xorl %ebx,%esi movdqa %xmm8,0(%rsp) - xorl %ebx,%edi - addl %ebp,%edx rorl $7,%eax - addl %edi,%edx - pslld $2,%xmm1 + addl %ebp,%edx addl 24(%rsp),%ecx - xorl %ebx,%esi - psrld $30,%xmm10 + pslld $2,%xmm1 + xorl %eax,%esi movl %edx,%edi + psrld $30,%xmm10 roll $5,%edx - xorl %eax,%esi - addl %edx,%ecx - rorl $7,%ebp addl %esi,%ecx + xorl %eax,%edi + rorl $7,%ebp por %xmm10,%xmm1 + addl %edx,%ecx addl 28(%rsp),%ebx - xorl %eax,%edi - movdqa %xmm1,%xmm8 + pshufd $238,%xmm0,%xmm8 + xorl %ebp,%edi movl %ecx,%esi roll $5,%ecx - xorl %ebp,%edi - addl %ecx,%ebx - rorl $7,%edx addl %edi,%ebx - addl 32(%rsp),%eax - pxor %xmm6,%xmm2 -.byte 102,68,15,58,15,192,8 xorl %ebp,%esi + rorl $7,%edx + addl %ecx,%ebx + pxor %xmm6,%xmm2 + addl 32(%rsp),%eax + xorl %edx,%esi + punpcklqdq %xmm1,%xmm8 movl %ebx,%edi roll $5,%ebx pxor %xmm3,%xmm2 - xorl %edx,%esi - addl %ebx,%eax - movdqa 32(%r11),%xmm10 - paddd %xmm1,%xmm9 - rorl $7,%ecx addl %esi,%eax + xorl %edx,%edi + movdqa 0(%r11),%xmm10 + rorl $7,%ecx + paddd %xmm1,%xmm9 + addl %ebx,%eax pxor %xmm8,%xmm2 addl 36(%rsp),%ebp - xorl %edx,%edi + xorl %ecx,%edi movl %eax,%esi roll $5,%eax movdqa %xmm2,%xmm8 + addl %edi,%ebp + xorl %ecx,%esi movdqa %xmm9,16(%rsp) - xorl %ecx,%edi - addl %eax,%ebp rorl $7,%ebx - addl %edi,%ebp - pslld $2,%xmm2 + addl %eax,%ebp addl 40(%rsp),%edx - xorl %ecx,%esi - psrld $30,%xmm8 + pslld $2,%xmm2 + xorl %ebx,%esi movl %ebp,%edi + psrld $30,%xmm8 roll $5,%ebp - xorl %ebx,%esi - addl %ebp,%edx - rorl $7,%eax addl %esi,%edx + xorl %ebx,%edi + rorl $7,%eax por %xmm8,%xmm2 + addl %ebp,%edx addl 44(%rsp),%ecx - xorl %ebx,%edi - movdqa %xmm2,%xmm9 + pshufd $238,%xmm1,%xmm9 + xorl %eax,%edi movl %edx,%esi roll $5,%edx - xorl %eax,%edi - addl %edx,%ecx - rorl $7,%ebp addl %edi,%ecx - addl 48(%rsp),%ebx - pxor %xmm7,%xmm3 -.byte 102,68,15,58,15,201,8 xorl %eax,%esi + rorl $7,%ebp + addl %edx,%ecx + pxor %xmm7,%xmm3 + addl 48(%rsp),%ebx + xorl %ebp,%esi + punpcklqdq %xmm2,%xmm9 movl %ecx,%edi roll $5,%ecx pxor %xmm4,%xmm3 - xorl %ebp,%esi - addl %ecx,%ebx + addl %esi,%ebx + xorl %ebp,%edi movdqa %xmm10,%xmm8 - paddd %xmm2,%xmm10 rorl $7,%edx - addl %esi,%ebx + paddd %xmm2,%xmm10 + addl %ecx,%ebx pxor %xmm9,%xmm3 addl 52(%rsp),%eax - xorl %ebp,%edi + xorl %edx,%edi movl %ebx,%esi roll $5,%ebx movdqa %xmm3,%xmm9 + addl %edi,%eax + xorl %edx,%esi movdqa %xmm10,32(%rsp) - xorl %edx,%edi - addl %ebx,%eax rorl $7,%ecx - addl %edi,%eax - pslld $2,%xmm3 + addl %ebx,%eax addl 56(%rsp),%ebp - xorl %edx,%esi - psrld $30,%xmm9 + pslld $2,%xmm3 + xorl %ecx,%esi movl %eax,%edi + psrld $30,%xmm9 roll $5,%eax - xorl %ecx,%esi - addl %eax,%ebp - rorl $7,%ebx addl %esi,%ebp + xorl %ecx,%edi + rorl $7,%ebx por %xmm9,%xmm3 + addl %eax,%ebp addl 60(%rsp),%edx - xorl %ecx,%edi - movdqa %xmm3,%xmm10 + pshufd $238,%xmm2,%xmm10 + xorl %ebx,%edi movl %ebp,%esi roll $5,%ebp - xorl %ebx,%edi - addl %ebp,%edx - rorl $7,%eax addl %edi,%edx - addl 0(%rsp),%ecx - pxor %xmm0,%xmm4 -.byte 102,68,15,58,15,210,8 xorl %ebx,%esi + rorl $7,%eax + addl %ebp,%edx + pxor %xmm0,%xmm4 + addl 0(%rsp),%ecx + xorl %eax,%esi + punpcklqdq %xmm3,%xmm10 movl %edx,%edi roll $5,%edx pxor %xmm5,%xmm4 - xorl %eax,%esi - addl %edx,%ecx + addl %esi,%ecx + xorl %eax,%edi movdqa %xmm8,%xmm9 - paddd %xmm3,%xmm8 rorl $7,%ebp - addl %esi,%ecx + paddd %xmm3,%xmm8 + addl %edx,%ecx pxor %xmm10,%xmm4 addl 4(%rsp),%ebx - xorl %eax,%edi + xorl %ebp,%edi movl %ecx,%esi roll $5,%ecx movdqa %xmm4,%xmm10 + addl %edi,%ebx + xorl %ebp,%esi movdqa %xmm8,48(%rsp) - xorl %ebp,%edi - addl %ecx,%ebx rorl $7,%edx - addl %edi,%ebx - pslld $2,%xmm4 + addl %ecx,%ebx addl 8(%rsp),%eax - xorl %ebp,%esi - psrld $30,%xmm10 + pslld $2,%xmm4 + xorl %edx,%esi movl %ebx,%edi + psrld $30,%xmm10 roll $5,%ebx - xorl %edx,%esi - addl %ebx,%eax - rorl $7,%ecx addl %esi,%eax + xorl %edx,%edi + rorl $7,%ecx por %xmm10,%xmm4 + addl %ebx,%eax addl 12(%rsp),%ebp - xorl %edx,%edi - movdqa %xmm4,%xmm8 + pshufd $238,%xmm3,%xmm8 + xorl %ecx,%edi movl %eax,%esi roll $5,%eax - xorl %ecx,%edi - addl %eax,%ebp - rorl $7,%ebx addl %edi,%ebp - addl 16(%rsp),%edx - pxor %xmm1,%xmm5 -.byte 102,68,15,58,15,195,8 xorl %ecx,%esi + rorl $7,%ebx + addl %eax,%ebp + pxor %xmm1,%xmm5 + addl 16(%rsp),%edx + xorl %ebx,%esi + punpcklqdq %xmm4,%xmm8 movl %ebp,%edi roll $5,%ebp pxor %xmm6,%xmm5 - xorl %ebx,%esi - addl %ebp,%edx + addl %esi,%edx + xorl %ebx,%edi movdqa %xmm9,%xmm10 - paddd %xmm4,%xmm9 rorl $7,%eax - addl %esi,%edx + paddd %xmm4,%xmm9 + addl %ebp,%edx pxor %xmm8,%xmm5 addl 20(%rsp),%ecx - xorl %ebx,%edi + xorl %eax,%edi movl %edx,%esi roll $5,%edx movdqa %xmm5,%xmm8 + addl %edi,%ecx + xorl %eax,%esi movdqa %xmm9,0(%rsp) - xorl %eax,%edi - addl %edx,%ecx rorl $7,%ebp - addl %edi,%ecx - pslld $2,%xmm5 + addl %edx,%ecx addl 24(%rsp),%ebx - xorl %eax,%esi - psrld $30,%xmm8 + pslld $2,%xmm5 + xorl %ebp,%esi movl %ecx,%edi + psrld $30,%xmm8 roll $5,%ecx - xorl %ebp,%esi - addl %ecx,%ebx - rorl $7,%edx addl %esi,%ebx + xorl %ebp,%edi + rorl $7,%edx por %xmm8,%xmm5 + addl %ecx,%ebx addl 28(%rsp),%eax - xorl %ebp,%edi - movdqa %xmm5,%xmm9 + pshufd $238,%xmm4,%xmm9 + rorl $7,%ecx movl %ebx,%esi - roll $5,%ebx xorl %edx,%edi - addl %ebx,%eax - rorl $7,%ecx + roll $5,%ebx addl %edi,%eax - movl %ecx,%edi - pxor %xmm2,%xmm6 -.byte 102,68,15,58,15,204,8 + xorl %ecx,%esi xorl %edx,%ecx + addl %ebx,%eax + pxor %xmm2,%xmm6 addl 32(%rsp),%ebp - andl %edx,%edi - pxor %xmm7,%xmm6 andl %ecx,%esi + xorl %edx,%ecx rorl $7,%ebx - movdqa %xmm10,%xmm8 - paddd %xmm5,%xmm10 - addl %edi,%ebp + punpcklqdq %xmm5,%xmm9 movl %eax,%edi - pxor %xmm9,%xmm6 + xorl %ecx,%esi + pxor %xmm7,%xmm6 roll $5,%eax addl %esi,%ebp - xorl %edx,%ecx - addl %eax,%ebp - movdqa %xmm6,%xmm9 - movdqa %xmm10,16(%rsp) - movl %ebx,%esi + movdqa %xmm10,%xmm8 + xorl %ebx,%edi + paddd %xmm5,%xmm10 xorl %ecx,%ebx + pxor %xmm9,%xmm6 + addl %eax,%ebp addl 36(%rsp),%edx - andl %ecx,%esi - pslld $2,%xmm6 andl %ebx,%edi + xorl %ecx,%ebx rorl $7,%eax - psrld $30,%xmm9 - addl %esi,%edx + movdqa %xmm6,%xmm9 movl %ebp,%esi + xorl %ebx,%edi + movdqa %xmm10,16(%rsp) roll $5,%ebp addl %edi,%edx - xorl %ecx,%ebx - addl %ebp,%edx - por %xmm9,%xmm6 - movl %eax,%edi + xorl %eax,%esi + pslld $2,%xmm6 xorl %ebx,%eax - movdqa %xmm6,%xmm10 + addl %ebp,%edx + psrld $30,%xmm9 addl 40(%rsp),%ecx - andl %ebx,%edi andl %eax,%esi + xorl %ebx,%eax + por %xmm9,%xmm6 rorl $7,%ebp - addl %edi,%ecx movl %edx,%edi + xorl %eax,%esi roll $5,%edx + pshufd $238,%xmm5,%xmm10 addl %esi,%ecx - xorl %ebx,%eax - addl %edx,%ecx - movl %ebp,%esi + xorl %ebp,%edi xorl %eax,%ebp + addl %edx,%ecx addl 44(%rsp),%ebx - andl %eax,%esi andl %ebp,%edi + xorl %eax,%ebp rorl $7,%edx - addl %esi,%ebx movl %ecx,%esi + xorl %ebp,%edi roll $5,%ecx addl %edi,%ebx - xorl %eax,%ebp + xorl %edx,%esi + xorl %ebp,%edx addl %ecx,%ebx - movl %edx,%edi pxor %xmm3,%xmm7 -.byte 102,68,15,58,15,213,8 - xorl %ebp,%edx addl 48(%rsp),%eax - andl %ebp,%edi - pxor %xmm0,%xmm7 andl %edx,%esi + xorl %ebp,%edx rorl $7,%ecx - movdqa 48(%r11),%xmm9 - paddd %xmm6,%xmm8 - addl %edi,%eax + punpcklqdq %xmm6,%xmm10 movl %ebx,%edi - pxor %xmm10,%xmm7 + xorl %edx,%esi + pxor %xmm0,%xmm7 roll $5,%ebx addl %esi,%eax - xorl %ebp,%edx - addl %ebx,%eax - movdqa %xmm7,%xmm10 - movdqa %xmm8,32(%rsp) - movl %ecx,%esi + movdqa 32(%r11),%xmm9 + xorl %ecx,%edi + paddd %xmm6,%xmm8 xorl %edx,%ecx + pxor %xmm10,%xmm7 + addl %ebx,%eax addl 52(%rsp),%ebp - andl %edx,%esi - pslld $2,%xmm7 andl %ecx,%edi + xorl %edx,%ecx rorl $7,%ebx - psrld $30,%xmm10 - addl %esi,%ebp + movdqa %xmm7,%xmm10 movl %eax,%esi + xorl %ecx,%edi + movdqa %xmm8,32(%rsp) roll $5,%eax addl %edi,%ebp - xorl %edx,%ecx - addl %eax,%ebp - por %xmm10,%xmm7 - movl %ebx,%edi + xorl %ebx,%esi + pslld $2,%xmm7 xorl %ecx,%ebx - movdqa %xmm7,%xmm8 + addl %eax,%ebp + psrld $30,%xmm10 addl 56(%rsp),%edx - andl %ecx,%edi andl %ebx,%esi + xorl %ecx,%ebx + por %xmm10,%xmm7 rorl $7,%eax - addl %edi,%edx movl %ebp,%edi + xorl %ebx,%esi roll $5,%ebp + pshufd $238,%xmm6,%xmm8 addl %esi,%edx - xorl %ecx,%ebx - addl %ebp,%edx - movl %eax,%esi + xorl %eax,%edi xorl %ebx,%eax + addl %ebp,%edx addl 60(%rsp),%ecx - andl %ebx,%esi andl %eax,%edi + xorl %ebx,%eax rorl $7,%ebp - addl %esi,%ecx movl %edx,%esi + xorl %eax,%edi roll $5,%edx addl %edi,%ecx - xorl %ebx,%eax + xorl %ebp,%esi + xorl %eax,%ebp addl %edx,%ecx - movl %ebp,%edi pxor %xmm4,%xmm0 -.byte 102,68,15,58,15,198,8 - xorl %eax,%ebp addl 0(%rsp),%ebx - andl %eax,%edi - pxor %xmm1,%xmm0 andl %ebp,%esi + xorl %eax,%ebp rorl $7,%edx - movdqa %xmm9,%xmm10 - paddd %xmm7,%xmm9 - addl %edi,%ebx + punpcklqdq %xmm7,%xmm8 movl %ecx,%edi - pxor %xmm8,%xmm0 + xorl %ebp,%esi + pxor %xmm1,%xmm0 roll $5,%ecx addl %esi,%ebx - xorl %eax,%ebp - addl %ecx,%ebx - movdqa %xmm0,%xmm8 - movdqa %xmm9,48(%rsp) - movl %edx,%esi + movdqa %xmm9,%xmm10 + xorl %edx,%edi + paddd %xmm7,%xmm9 xorl %ebp,%edx + pxor %xmm8,%xmm0 + addl %ecx,%ebx addl 4(%rsp),%eax - andl %ebp,%esi - pslld $2,%xmm0 andl %edx,%edi + xorl %ebp,%edx rorl $7,%ecx - psrld $30,%xmm8 - addl %esi,%eax + movdqa %xmm0,%xmm8 movl %ebx,%esi + xorl %edx,%edi + movdqa %xmm9,48(%rsp) roll $5,%ebx addl %edi,%eax - xorl %ebp,%edx - addl %ebx,%eax - por %xmm8,%xmm0 - movl %ecx,%edi + xorl %ecx,%esi + pslld $2,%xmm0 xorl %edx,%ecx - movdqa %xmm0,%xmm9 + addl %ebx,%eax + psrld $30,%xmm8 addl 8(%rsp),%ebp - andl %edx,%edi andl %ecx,%esi + xorl %edx,%ecx + por %xmm8,%xmm0 rorl $7,%ebx - addl %edi,%ebp movl %eax,%edi + xorl %ecx,%esi roll $5,%eax + pshufd $238,%xmm7,%xmm9 addl %esi,%ebp - xorl %edx,%ecx - addl %eax,%ebp - movl %ebx,%esi + xorl %ebx,%edi xorl %ecx,%ebx + addl %eax,%ebp addl 12(%rsp),%edx - andl %ecx,%esi andl %ebx,%edi + xorl %ecx,%ebx rorl $7,%eax - addl %esi,%edx movl %ebp,%esi + xorl %ebx,%edi roll $5,%ebp addl %edi,%edx - xorl %ecx,%ebx + xorl %eax,%esi + xorl %ebx,%eax addl %ebp,%edx - movl %eax,%edi pxor %xmm5,%xmm1 -.byte 102,68,15,58,15,207,8 - xorl %ebx,%eax addl 16(%rsp),%ecx - andl %ebx,%edi - pxor %xmm2,%xmm1 andl %eax,%esi + xorl %ebx,%eax rorl $7,%ebp - movdqa %xmm10,%xmm8 - paddd %xmm0,%xmm10 - addl %edi,%ecx + punpcklqdq %xmm0,%xmm9 movl %edx,%edi - pxor %xmm9,%xmm1 + xorl %eax,%esi + pxor %xmm2,%xmm1 roll $5,%edx addl %esi,%ecx - xorl %ebx,%eax - addl %edx,%ecx - movdqa %xmm1,%xmm9 - movdqa %xmm10,0(%rsp) - movl %ebp,%esi + movdqa %xmm10,%xmm8 + xorl %ebp,%edi + paddd %xmm0,%xmm10 xorl %eax,%ebp + pxor %xmm9,%xmm1 + addl %edx,%ecx addl 20(%rsp),%ebx - andl %eax,%esi - pslld $2,%xmm1 andl %ebp,%edi + xorl %eax,%ebp rorl $7,%edx - psrld $30,%xmm9 - addl %esi,%ebx + movdqa %xmm1,%xmm9 movl %ecx,%esi + xorl %ebp,%edi + movdqa %xmm10,0(%rsp) roll $5,%ecx addl %edi,%ebx - xorl %eax,%ebp - addl %ecx,%ebx - por %xmm9,%xmm1 - movl %edx,%edi + xorl %edx,%esi + pslld $2,%xmm1 xorl %ebp,%edx - movdqa %xmm1,%xmm10 + addl %ecx,%ebx + psrld $30,%xmm9 addl 24(%rsp),%eax - andl %ebp,%edi andl %edx,%esi + xorl %ebp,%edx + por %xmm9,%xmm1 rorl $7,%ecx - addl %edi,%eax movl %ebx,%edi + xorl %edx,%esi roll $5,%ebx + pshufd $238,%xmm0,%xmm10 addl %esi,%eax - xorl %ebp,%edx - addl %ebx,%eax - movl %ecx,%esi + xorl %ecx,%edi xorl %edx,%ecx + addl %ebx,%eax addl 28(%rsp),%ebp - andl %edx,%esi andl %ecx,%edi + xorl %edx,%ecx rorl $7,%ebx - addl %esi,%ebp movl %eax,%esi + xorl %ecx,%edi roll $5,%eax addl %edi,%ebp - xorl %edx,%ecx + xorl %ebx,%esi + xorl %ecx,%ebx addl %eax,%ebp - movl %ebx,%edi pxor %xmm6,%xmm2 -.byte 102,68,15,58,15,208,8 - xorl %ecx,%ebx addl 32(%rsp),%edx - andl %ecx,%edi - pxor %xmm3,%xmm2 andl %ebx,%esi + xorl %ecx,%ebx rorl $7,%eax - movdqa %xmm8,%xmm9 - paddd %xmm1,%xmm8 - addl %edi,%edx + punpcklqdq %xmm1,%xmm10 movl %ebp,%edi - pxor %xmm10,%xmm2 + xorl %ebx,%esi + pxor %xmm3,%xmm2 roll $5,%ebp addl %esi,%edx - xorl %ecx,%ebx - addl %ebp,%edx - movdqa %xmm2,%xmm10 - movdqa %xmm8,16(%rsp) - movl %eax,%esi + movdqa %xmm8,%xmm9 + xorl %eax,%edi + paddd %xmm1,%xmm8 xorl %ebx,%eax + pxor %xmm10,%xmm2 + addl %ebp,%edx addl 36(%rsp),%ecx - andl %ebx,%esi - pslld $2,%xmm2 andl %eax,%edi + xorl %ebx,%eax rorl $7,%ebp - psrld $30,%xmm10 - addl %esi,%ecx + movdqa %xmm2,%xmm10 movl %edx,%esi + xorl %eax,%edi + movdqa %xmm8,16(%rsp) roll $5,%edx addl %edi,%ecx - xorl %ebx,%eax - addl %edx,%ecx - por %xmm10,%xmm2 - movl %ebp,%edi + xorl %ebp,%esi + pslld $2,%xmm2 xorl %eax,%ebp - movdqa %xmm2,%xmm8 + addl %edx,%ecx + psrld $30,%xmm10 addl 40(%rsp),%ebx - andl %eax,%edi andl %ebp,%esi + xorl %eax,%ebp + por %xmm10,%xmm2 rorl $7,%edx - addl %edi,%ebx movl %ecx,%edi + xorl %ebp,%esi roll $5,%ecx + pshufd $238,%xmm1,%xmm8 addl %esi,%ebx - xorl %eax,%ebp - addl %ecx,%ebx - movl %edx,%esi + xorl %edx,%edi xorl %ebp,%edx + addl %ecx,%ebx addl 44(%rsp),%eax - andl %ebp,%esi andl %edx,%edi + xorl %ebp,%edx rorl $7,%ecx - addl %esi,%eax movl %ebx,%esi + xorl %edx,%edi roll $5,%ebx addl %edi,%eax - xorl %ebp,%edx + xorl %edx,%esi addl %ebx,%eax - addl 48(%rsp),%ebp pxor %xmm7,%xmm3 -.byte 102,68,15,58,15,193,8 - xorl %edx,%esi + addl 48(%rsp),%ebp + xorl %ecx,%esi + punpcklqdq %xmm2,%xmm8 movl %eax,%edi roll $5,%eax pxor %xmm4,%xmm3 - xorl %ecx,%esi - addl %eax,%ebp + addl %esi,%ebp + xorl %ecx,%edi movdqa %xmm9,%xmm10 - paddd %xmm2,%xmm9 rorl $7,%ebx - addl %esi,%ebp + paddd %xmm2,%xmm9 + addl %eax,%ebp pxor %xmm8,%xmm3 addl 52(%rsp),%edx - xorl %ecx,%edi + xorl %ebx,%edi movl %ebp,%esi roll $5,%ebp movdqa %xmm3,%xmm8 + addl %edi,%edx + xorl %ebx,%esi movdqa %xmm9,32(%rsp) - xorl %ebx,%edi - addl %ebp,%edx rorl $7,%eax - addl %edi,%edx - pslld $2,%xmm3 + addl %ebp,%edx addl 56(%rsp),%ecx - xorl %ebx,%esi - psrld $30,%xmm8 + pslld $2,%xmm3 + xorl %eax,%esi movl %edx,%edi + psrld $30,%xmm8 roll $5,%edx - xorl %eax,%esi - addl %edx,%ecx - rorl $7,%ebp addl %esi,%ecx + xorl %eax,%edi + rorl $7,%ebp por %xmm8,%xmm3 + addl %edx,%ecx addl 60(%rsp),%ebx - xorl %eax,%edi + xorl %ebp,%edi movl %ecx,%esi roll $5,%ecx - xorl %ebp,%edi - addl %ecx,%ebx - rorl $7,%edx addl %edi,%ebx - addl 0(%rsp),%eax - paddd %xmm3,%xmm10 xorl %ebp,%esi + rorl $7,%edx + addl %ecx,%ebx + addl 0(%rsp),%eax + xorl %edx,%esi movl %ebx,%edi roll $5,%ebx - xorl %edx,%esi + paddd %xmm3,%xmm10 + addl %esi,%eax + xorl %edx,%edi movdqa %xmm10,48(%rsp) - addl %ebx,%eax rorl $7,%ecx - addl %esi,%eax + addl %ebx,%eax addl 4(%rsp),%ebp - xorl %edx,%edi + xorl %ecx,%edi movl %eax,%esi roll $5,%eax - xorl %ecx,%edi - addl %eax,%ebp - rorl $7,%ebx addl %edi,%ebp - addl 8(%rsp),%edx xorl %ecx,%esi + rorl $7,%ebx + addl %eax,%ebp + addl 8(%rsp),%edx + xorl %ebx,%esi movl %ebp,%edi roll $5,%ebp - xorl %ebx,%esi - addl %ebp,%edx - rorl $7,%eax addl %esi,%edx - addl 12(%rsp),%ecx xorl %ebx,%edi + rorl $7,%eax + addl %ebp,%edx + addl 12(%rsp),%ecx + xorl %eax,%edi movl %edx,%esi roll $5,%edx - xorl %eax,%edi - addl %edx,%ecx - rorl $7,%ebp addl %edi,%ecx + xorl %eax,%esi + rorl $7,%ebp + addl %edx,%ecx cmpq %r10,%r9 je .Ldone_ssse3 movdqa 64(%r11),%xmm6 - movdqa 0(%r11),%xmm9 + movdqa -64(%r11),%xmm9 movdqu 0(%r9),%xmm0 movdqu 16(%r9),%xmm1 movdqu 32(%r9),%xmm2 @@ -2240,113 +2336,112 @@ _ssse3_shortcut: .byte 102,15,56,0,198 addq $64,%r9 addl 16(%rsp),%ebx - xorl %eax,%esi -.byte 102,15,56,0,206 + xorl %ebp,%esi movl %ecx,%edi +.byte 102,15,56,0,206 roll $5,%ecx + addl %esi,%ebx + xorl %ebp,%edi + rorl $7,%edx paddd %xmm9,%xmm0 - xorl %ebp,%esi addl %ecx,%ebx - rorl $7,%edx - addl %esi,%ebx - movdqa %xmm0,0(%rsp) addl 20(%rsp),%eax - xorl %ebp,%edi - psubd %xmm9,%xmm0 + xorl %edx,%edi movl %ebx,%esi + movdqa %xmm0,0(%rsp) roll $5,%ebx - xorl %edx,%edi - addl %ebx,%eax - rorl $7,%ecx addl %edi,%eax - addl 24(%rsp),%ebp xorl %edx,%esi + rorl $7,%ecx + psubd %xmm9,%xmm0 + addl %ebx,%eax + addl 24(%rsp),%ebp + xorl %ecx,%esi movl %eax,%edi roll $5,%eax - xorl %ecx,%esi - addl %eax,%ebp - rorl $7,%ebx addl %esi,%ebp - addl 28(%rsp),%edx xorl %ecx,%edi + rorl $7,%ebx + addl %eax,%ebp + addl 28(%rsp),%edx + xorl %ebx,%edi movl %ebp,%esi roll $5,%ebp - xorl %ebx,%edi - addl %ebp,%edx - rorl $7,%eax addl %edi,%edx - addl 32(%rsp),%ecx xorl %ebx,%esi -.byte 102,15,56,0,214 + rorl $7,%eax + addl %ebp,%edx + addl 32(%rsp),%ecx + xorl %eax,%esi movl %edx,%edi +.byte 102,15,56,0,214 roll $5,%edx + addl %esi,%ecx + xorl %eax,%edi + rorl $7,%ebp paddd %xmm9,%xmm1 - xorl %eax,%esi addl %edx,%ecx - rorl $7,%ebp - addl %esi,%ecx - movdqa %xmm1,16(%rsp) addl 36(%rsp),%ebx - xorl %eax,%edi - psubd %xmm9,%xmm1 + xorl %ebp,%edi movl %ecx,%esi + movdqa %xmm1,16(%rsp) roll $5,%ecx - xorl %ebp,%edi - addl %ecx,%ebx - rorl $7,%edx addl %edi,%ebx - addl 40(%rsp),%eax xorl %ebp,%esi + rorl $7,%edx + psubd %xmm9,%xmm1 + addl %ecx,%ebx + addl 40(%rsp),%eax + xorl %edx,%esi movl %ebx,%edi roll $5,%ebx - xorl %edx,%esi - addl %ebx,%eax - rorl $7,%ecx addl %esi,%eax - addl 44(%rsp),%ebp xorl %edx,%edi + rorl $7,%ecx + addl %ebx,%eax + addl 44(%rsp),%ebp + xorl %ecx,%edi movl %eax,%esi roll $5,%eax - xorl %ecx,%edi - addl %eax,%ebp - rorl $7,%ebx addl %edi,%ebp - addl 48(%rsp),%edx xorl %ecx,%esi -.byte 102,15,56,0,222 + rorl $7,%ebx + addl %eax,%ebp + addl 48(%rsp),%edx + xorl %ebx,%esi movl %ebp,%edi +.byte 102,15,56,0,222 roll $5,%ebp + addl %esi,%edx + xorl %ebx,%edi + rorl $7,%eax paddd %xmm9,%xmm2 - xorl %ebx,%esi addl %ebp,%edx - rorl $7,%eax - addl %esi,%edx - movdqa %xmm2,32(%rsp) addl 52(%rsp),%ecx - xorl %ebx,%edi - psubd %xmm9,%xmm2 + xorl %eax,%edi movl %edx,%esi + movdqa %xmm2,32(%rsp) roll $5,%edx - xorl %eax,%edi - addl %edx,%ecx - rorl $7,%ebp addl %edi,%ecx - addl 56(%rsp),%ebx xorl %eax,%esi + rorl $7,%ebp + psubd %xmm9,%xmm2 + addl %edx,%ecx + addl 56(%rsp),%ebx + xorl %ebp,%esi movl %ecx,%edi roll $5,%ecx - xorl %ebp,%esi - addl %ecx,%ebx - rorl $7,%edx addl %esi,%ebx - addl 60(%rsp),%eax xorl %ebp,%edi + rorl $7,%edx + addl %ecx,%ebx + addl 60(%rsp),%eax + xorl %edx,%edi movl %ebx,%esi roll $5,%ebx - xorl %edx,%edi - addl %ebx,%eax - rorl $7,%ecx addl %edi,%eax + rorl $7,%ecx + addl %ebx,%eax addl 0(%r8),%eax addl 4(%r8),%esi addl 8(%r8),%ecx @@ -2356,108 +2451,110 @@ _ssse3_shortcut: movl %esi,4(%r8) movl %esi,%ebx movl %ecx,8(%r8) + movl %ecx,%edi movl %edx,12(%r8) + xorl %edx,%edi movl %ebp,16(%r8) + andl %edi,%esi jmp .Loop_ssse3 .align 16 .Ldone_ssse3: addl 16(%rsp),%ebx - xorl %eax,%esi + xorl %ebp,%esi movl %ecx,%edi roll $5,%ecx - xorl %ebp,%esi - addl %ecx,%ebx - rorl $7,%edx addl %esi,%ebx - addl 20(%rsp),%eax xorl %ebp,%edi + rorl $7,%edx + addl %ecx,%ebx + addl 20(%rsp),%eax + xorl %edx,%edi movl %ebx,%esi roll $5,%ebx - xorl %edx,%edi - addl %ebx,%eax - rorl $7,%ecx addl %edi,%eax - addl 24(%rsp),%ebp xorl %edx,%esi + rorl $7,%ecx + addl %ebx,%eax + addl 24(%rsp),%ebp + xorl %ecx,%esi movl %eax,%edi roll $5,%eax - xorl %ecx,%esi - addl %eax,%ebp - rorl $7,%ebx addl %esi,%ebp - addl 28(%rsp),%edx xorl %ecx,%edi + rorl $7,%ebx + addl %eax,%ebp + addl 28(%rsp),%edx + xorl %ebx,%edi movl %ebp,%esi roll $5,%ebp - xorl %ebx,%edi - addl %ebp,%edx - rorl $7,%eax addl %edi,%edx - addl 32(%rsp),%ecx xorl %ebx,%esi + rorl $7,%eax + addl %ebp,%edx + addl 32(%rsp),%ecx + xorl %eax,%esi movl %edx,%edi roll $5,%edx - xorl %eax,%esi - addl %edx,%ecx - rorl $7,%ebp addl %esi,%ecx - addl 36(%rsp),%ebx xorl %eax,%edi + rorl $7,%ebp + addl %edx,%ecx + addl 36(%rsp),%ebx + xorl %ebp,%edi movl %ecx,%esi roll $5,%ecx - xorl %ebp,%edi - addl %ecx,%ebx - rorl $7,%edx addl %edi,%ebx - addl 40(%rsp),%eax xorl %ebp,%esi + rorl $7,%edx + addl %ecx,%ebx + addl 40(%rsp),%eax + xorl %edx,%esi movl %ebx,%edi roll $5,%ebx - xorl %edx,%esi - addl %ebx,%eax - rorl $7,%ecx addl %esi,%eax - addl 44(%rsp),%ebp xorl %edx,%edi + rorl $7,%ecx + addl %ebx,%eax + addl 44(%rsp),%ebp + xorl %ecx,%edi movl %eax,%esi roll $5,%eax - xorl %ecx,%edi - addl %eax,%ebp - rorl $7,%ebx addl %edi,%ebp - addl 48(%rsp),%edx xorl %ecx,%esi + rorl $7,%ebx + addl %eax,%ebp + addl 48(%rsp),%edx + xorl %ebx,%esi movl %ebp,%edi roll $5,%ebp - xorl %ebx,%esi - addl %ebp,%edx - rorl $7,%eax addl %esi,%edx - addl 52(%rsp),%ecx xorl %ebx,%edi + rorl $7,%eax + addl %ebp,%edx + addl 52(%rsp),%ecx + xorl %eax,%edi movl %edx,%esi roll $5,%edx - xorl %eax,%edi - addl %edx,%ecx - rorl $7,%ebp addl %edi,%ecx - addl 56(%rsp),%ebx xorl %eax,%esi + rorl $7,%ebp + addl %edx,%ecx + addl 56(%rsp),%ebx + xorl %ebp,%esi movl %ecx,%edi roll $5,%ecx - xorl %ebp,%esi - addl %ecx,%ebx - rorl $7,%edx addl %esi,%ebx - addl 60(%rsp),%eax xorl %ebp,%edi + rorl $7,%edx + addl %ecx,%ebx + addl 60(%rsp),%eax + xorl %edx,%edi movl %ebx,%esi roll $5,%ebx - xorl %edx,%edi - addl %ebx,%eax - rorl $7,%ecx addl %edi,%eax + rorl $7,%ecx + addl %ebx,%eax addl 0(%r8),%eax addl 4(%r8),%esi addl 8(%r8),%ecx @@ -2468,20 +2565,28 @@ _ssse3_shortcut: movl %ecx,8(%r8) movl %edx,12(%r8) movl %ebp,16(%r8) - leaq 64(%rsp),%rsi - movq 0(%rsi),%r12 - movq 8(%rsi),%rbp - movq 16(%rsi),%rbx - leaq 24(%rsi),%rsp + leaq (%r14),%rsi + movq -40(%rsi),%r14 + movq -32(%rsi),%r13 + movq -24(%rsi),%r12 + movq -16(%rsi),%rbp + movq -8(%rsi),%rbx + leaq (%rsi),%rsp .Lepilogue_ssse3: .byte 0xf3,0xc3 .size sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3 .align 64 K_XX_XX: -.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 -.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 -.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc -.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 -.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f +.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 +.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 +.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 +.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 +.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc +.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc +.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 +.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 +.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f +.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f +.byte 0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0 .byte 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 64 diff --git a/secure/lib/libcrypto/amd64/sha256-mb-x86_64.S b/secure/lib/libcrypto/amd64/sha256-mb-x86_64.S new file mode 100644 index 0000000..893d42a --- /dev/null +++ b/secure/lib/libcrypto/amd64/sha256-mb-x86_64.S @@ -0,0 +1,3259 @@ + # $FreeBSD$ +.text + + + +.globl sha256_multi_block +.type sha256_multi_block,@function +.align 32 +sha256_multi_block: + movq OPENSSL_ia32cap_P+4(%rip),%rcx + btq $61,%rcx + jc _shaext_shortcut + movq %rsp,%rax + pushq %rbx + pushq %rbp + subq $288,%rsp + andq $-256,%rsp + movq %rax,272(%rsp) +.Lbody: + leaq K256+128(%rip),%rbp + leaq 256(%rsp),%rbx + leaq 128(%rdi),%rdi + +.Loop_grande: + movl %edx,280(%rsp) + xorl %edx,%edx + movq 0(%rsi),%r8 + movl 8(%rsi),%ecx + cmpl %edx,%ecx + cmovgl %ecx,%edx + testl %ecx,%ecx + movl %ecx,0(%rbx) + cmovleq %rbp,%r8 + movq 16(%rsi),%r9 + movl 24(%rsi),%ecx + cmpl %edx,%ecx + cmovgl %ecx,%edx + testl %ecx,%ecx + movl %ecx,4(%rbx) + cmovleq %rbp,%r9 + movq 32(%rsi),%r10 + movl 40(%rsi),%ecx + cmpl %edx,%ecx + cmovgl %ecx,%edx + testl %ecx,%ecx + movl %ecx,8(%rbx) + cmovleq %rbp,%r10 + movq 48(%rsi),%r11 + movl 56(%rsi),%ecx + cmpl %edx,%ecx + cmovgl %ecx,%edx + testl %ecx,%ecx + movl %ecx,12(%rbx) + cmovleq %rbp,%r11 + testl %edx,%edx + jz .Ldone + + movdqu 0-128(%rdi),%xmm8 + leaq 128(%rsp),%rax + movdqu 32-128(%rdi),%xmm9 + movdqu 64-128(%rdi),%xmm10 + movdqu 96-128(%rdi),%xmm11 + movdqu 128-128(%rdi),%xmm12 + movdqu 160-128(%rdi),%xmm13 + movdqu 192-128(%rdi),%xmm14 + movdqu 224-128(%rdi),%xmm15 + movdqu .Lpbswap(%rip),%xmm6 + jmp .Loop + +.align 32 +.Loop: + movdqa %xmm10,%xmm4 + pxor %xmm9,%xmm4 + movd 0(%r8),%xmm5 + movd 0(%r9),%xmm0 + movd 0(%r10),%xmm1 + movd 0(%r11),%xmm2 + punpckldq %xmm1,%xmm5 + punpckldq %xmm2,%xmm0 + punpckldq %xmm0,%xmm5 + movdqa %xmm12,%xmm7 +.byte 102,15,56,0,238 + movdqa %xmm12,%xmm2 + + psrld $6,%xmm7 + movdqa %xmm12,%xmm1 + pslld $7,%xmm2 + movdqa %xmm5,0-128(%rax) + paddd %xmm15,%xmm5 + + psrld $11,%xmm1 + pxor %xmm2,%xmm7 + pslld $21-7,%xmm2 + paddd -128(%rbp),%xmm5 + pxor %xmm1,%xmm7 + + psrld $25-11,%xmm1 + movdqa %xmm12,%xmm0 + + pxor %xmm2,%xmm7 + movdqa %xmm12,%xmm3 + pslld $26-21,%xmm2 + pandn %xmm14,%xmm0 + pand %xmm13,%xmm3 + pxor %xmm1,%xmm7 + + + movdqa %xmm8,%xmm1 + pxor %xmm2,%xmm7 + movdqa %xmm8,%xmm2 + psrld $2,%xmm1 + paddd %xmm7,%xmm5 + pxor %xmm3,%xmm0 + movdqa %xmm9,%xmm3 + movdqa %xmm8,%xmm7 + pslld $10,%xmm2 + pxor %xmm8,%xmm3 + + + psrld $13,%xmm7 + pxor %xmm2,%xmm1 + paddd %xmm0,%xmm5 + pslld $19-10,%xmm2 + pand %xmm3,%xmm4 + pxor %xmm7,%xmm1 + + + psrld $22-13,%xmm7 + pxor %xmm2,%xmm1 + movdqa %xmm9,%xmm15 + pslld $30-19,%xmm2 + pxor %xmm1,%xmm7 + pxor %xmm4,%xmm15 + paddd %xmm5,%xmm11 + pxor %xmm2,%xmm7 + + paddd %xmm5,%xmm15 + paddd %xmm7,%xmm15 + movd 4(%r8),%xmm5 + movd 4(%r9),%xmm0 + movd 4(%r10),%xmm1 + movd 4(%r11),%xmm2 + punpckldq %xmm1,%xmm5 + punpckldq %xmm2,%xmm0 + punpckldq %xmm0,%xmm5 + movdqa %xmm11,%xmm7 + + movdqa %xmm11,%xmm2 +.byte 102,15,56,0,238 + psrld $6,%xmm7 + movdqa %xmm11,%xmm1 + pslld $7,%xmm2 + movdqa %xmm5,16-128(%rax) + paddd %xmm14,%xmm5 + + psrld $11,%xmm1 + pxor %xmm2,%xmm7 + pslld $21-7,%xmm2 + paddd -96(%rbp),%xmm5 + pxor %xmm1,%xmm7 + + psrld $25-11,%xmm1 + movdqa %xmm11,%xmm0 + + pxor %xmm2,%xmm7 + movdqa %xmm11,%xmm4 + pslld $26-21,%xmm2 + pandn %xmm13,%xmm0 + pand %xmm12,%xmm4 + pxor %xmm1,%xmm7 + + + movdqa %xmm15,%xmm1 + pxor %xmm2,%xmm7 + movdqa %xmm15,%xmm2 + psrld $2,%xmm1 + paddd %xmm7,%xmm5 + pxor %xmm4,%xmm0 + movdqa %xmm8,%xmm4 + movdqa %xmm15,%xmm7 + pslld $10,%xmm2 + pxor %xmm15,%xmm4 + + + psrld $13,%xmm7 + pxor %xmm2,%xmm1 + paddd %xmm0,%xmm5 + pslld $19-10,%xmm2 + pand %xmm4,%xmm3 + pxor %xmm7,%xmm1 + + + psrld $22-13,%xmm7 + pxor %xmm2,%xmm1 + movdqa %xmm8,%xmm14 + pslld $30-19,%xmm2 + pxor %xmm1,%xmm7 + pxor %xmm3,%xmm14 + paddd %xmm5,%xmm10 + pxor %xmm2,%xmm7 + + paddd %xmm5,%xmm14 + paddd %xmm7,%xmm14 + movd 8(%r8),%xmm5 + movd 8(%r9),%xmm0 + movd 8(%r10),%xmm1 + movd 8(%r11),%xmm2 + punpckldq %xmm1,%xmm5 + punpckldq %xmm2,%xmm0 + punpckldq %xmm0,%xmm5 + movdqa %xmm10,%xmm7 +.byte 102,15,56,0,238 + movdqa %xmm10,%xmm2 + + psrld $6,%xmm7 + movdqa %xmm10,%xmm1 + pslld $7,%xmm2 + movdqa %xmm5,32-128(%rax) + paddd %xmm13,%xmm5 + + psrld $11,%xmm1 + pxor %xmm2,%xmm7 + pslld $21-7,%xmm2 + paddd -64(%rbp),%xmm5 + pxor %xmm1,%xmm7 + + psrld $25-11,%xmm1 + movdqa %xmm10,%xmm0 + + pxor %xmm2,%xmm7 + movdqa %xmm10,%xmm3 + pslld $26-21,%xmm2 + pandn %xmm12,%xmm0 + pand %xmm11,%xmm3 + pxor %xmm1,%xmm7 + + + movdqa %xmm14,%xmm1 + pxor %xmm2,%xmm7 + movdqa %xmm14,%xmm2 + psrld $2,%xmm1 + paddd %xmm7,%xmm5 + pxor %xmm3,%xmm0 + movdqa %xmm15,%xmm3 + movdqa %xmm14,%xmm7 + pslld $10,%xmm2 + pxor %xmm14,%xmm3 + + + psrld $13,%xmm7 + pxor %xmm2,%xmm1 + paddd %xmm0,%xmm5 + pslld $19-10,%xmm2 + pand %xmm3,%xmm4 + pxor %xmm7,%xmm1 + + + psrld $22-13,%xmm7 + pxor %xmm2,%xmm1 + movdqa %xmm15,%xmm13 + pslld $30-19,%xmm2 + pxor %xmm1,%xmm7 + pxor %xmm4,%xmm13 + paddd %xmm5,%xmm9 + pxor %xmm2,%xmm7 + + paddd %xmm5,%xmm13 + paddd %xmm7,%xmm13 + movd 12(%r8),%xmm5 + movd 12(%r9),%xmm0 + movd 12(%r10),%xmm1 + movd 12(%r11),%xmm2 + punpckldq %xmm1,%xmm5 + punpckldq %xmm2,%xmm0 + punpckldq %xmm0,%xmm5 + movdqa %xmm9,%xmm7 + + movdqa %xmm9,%xmm2 +.byte 102,15,56,0,238 + psrld $6,%xmm7 + movdqa %xmm9,%xmm1 + pslld $7,%xmm2 + movdqa %xmm5,48-128(%rax) + paddd %xmm12,%xmm5 + + psrld $11,%xmm1 + pxor %xmm2,%xmm7 + pslld $21-7,%xmm2 + paddd -32(%rbp),%xmm5 + pxor %xmm1,%xmm7 + + psrld $25-11,%xmm1 + movdqa %xmm9,%xmm0 + + pxor %xmm2,%xmm7 + movdqa %xmm9,%xmm4 + pslld $26-21,%xmm2 + pandn %xmm11,%xmm0 + pand %xmm10,%xmm4 + pxor %xmm1,%xmm7 + + + movdqa %xmm13,%xmm1 + pxor %xmm2,%xmm7 + movdqa %xmm13,%xmm2 + psrld $2,%xmm1 + paddd %xmm7,%xmm5 + pxor %xmm4,%xmm0 + movdqa %xmm14,%xmm4 + movdqa %xmm13,%xmm7 + pslld $10,%xmm2 + pxor %xmm13,%xmm4 + + + psrld $13,%xmm7 + pxor %xmm2,%xmm1 + paddd %xmm0,%xmm5 + pslld $19-10,%xmm2 + pand %xmm4,%xmm3 + pxor %xmm7,%xmm1 + + + psrld $22-13,%xmm7 + pxor %xmm2,%xmm1 + movdqa %xmm14,%xmm12 + pslld $30-19,%xmm2 + pxor %xmm1,%xmm7 + pxor %xmm3,%xmm12 + paddd %xmm5,%xmm8 + pxor %xmm2,%xmm7 + + paddd %xmm5,%xmm12 + paddd %xmm7,%xmm12 + movd 16(%r8),%xmm5 + movd 16(%r9),%xmm0 + movd 16(%r10),%xmm1 + movd 16(%r11),%xmm2 + punpckldq %xmm1,%xmm5 + punpckldq %xmm2,%xmm0 + punpckldq %xmm0,%xmm5 + movdqa %xmm8,%xmm7 +.byte 102,15,56,0,238 + movdqa %xmm8,%xmm2 + + psrld $6,%xmm7 + movdqa %xmm8,%xmm1 + pslld $7,%xmm2 + movdqa %xmm5,64-128(%rax) + paddd %xmm11,%xmm5 + + psrld $11,%xmm1 + pxor %xmm2,%xmm7 + pslld $21-7,%xmm2 + paddd 0(%rbp),%xmm5 + pxor %xmm1,%xmm7 + + psrld $25-11,%xmm1 + movdqa %xmm8,%xmm0 + + pxor %xmm2,%xmm7 + movdqa %xmm8,%xmm3 + pslld $26-21,%xmm2 + pandn %xmm10,%xmm0 + pand %xmm9,%xmm3 + pxor %xmm1,%xmm7 + + + movdqa %xmm12,%xmm1 + pxor %xmm2,%xmm7 + movdqa %xmm12,%xmm2 + psrld $2,%xmm1 + paddd %xmm7,%xmm5 + pxor %xmm3,%xmm0 + movdqa %xmm13,%xmm3 + movdqa %xmm12,%xmm7 + pslld $10,%xmm2 + pxor %xmm12,%xmm3 + + + psrld $13,%xmm7 + pxor %xmm2,%xmm1 + paddd %xmm0,%xmm5 + pslld $19-10,%xmm2 + pand %xmm3,%xmm4 + pxor %xmm7,%xmm1 + + + psrld $22-13,%xmm7 + pxor %xmm2,%xmm1 + movdqa %xmm13,%xmm11 + pslld $30-19,%xmm2 + pxor %xmm1,%xmm7 + pxor %xmm4,%xmm11 + paddd %xmm5,%xmm15 + pxor %xmm2,%xmm7 + + paddd %xmm5,%xmm11 + paddd %xmm7,%xmm11 + movd 20(%r8),%xmm5 + movd 20(%r9),%xmm0 + movd 20(%r10),%xmm1 + movd 20(%r11),%xmm2 + punpckldq %xmm1,%xmm5 + punpckldq %xmm2,%xmm0 + punpckldq %xmm0,%xmm5 + movdqa %xmm15,%xmm7 + + movdqa %xmm15,%xmm2 +.byte 102,15,56,0,238 + psrld $6,%xmm7 + movdqa %xmm15,%xmm1 + pslld $7,%xmm2 + movdqa %xmm5,80-128(%rax) + paddd %xmm10,%xmm5 + + psrld $11,%xmm1 + pxor %xmm2,%xmm7 + pslld $21-7,%xmm2 + paddd 32(%rbp),%xmm5 + pxor %xmm1,%xmm7 + + psrld $25-11,%xmm1 + movdqa %xmm15,%xmm0 + + pxor %xmm2,%xmm7 + movdqa %xmm15,%xmm4 + pslld $26-21,%xmm2 + pandn %xmm9,%xmm0 + pand %xmm8,%xmm4 + pxor %xmm1,%xmm7 + + + movdqa %xmm11,%xmm1 + pxor %xmm2,%xmm7 + movdqa %xmm11,%xmm2 + psrld $2,%xmm1 + paddd %xmm7,%xmm5 + pxor %xmm4,%xmm0 + movdqa %xmm12,%xmm4 + movdqa %xmm11,%xmm7 + pslld $10,%xmm2 + pxor %xmm11,%xmm4 + + + psrld $13,%xmm7 + pxor %xmm2,%xmm1 + paddd %xmm0,%xmm5 + pslld $19-10,%xmm2 + pand %xmm4,%xmm3 + pxor %xmm7,%xmm1 + + + psrld $22-13,%xmm7 + pxor %xmm2,%xmm1 + movdqa %xmm12,%xmm10 + pslld $30-19,%xmm2 + pxor %xmm1,%xmm7 + pxor %xmm3,%xmm10 + paddd %xmm5,%xmm14 + pxor %xmm2,%xmm7 + + paddd %xmm5,%xmm10 + paddd %xmm7,%xmm10 + movd 24(%r8),%xmm5 + movd 24(%r9),%xmm0 + movd 24(%r10),%xmm1 + movd 24(%r11),%xmm2 + punpckldq %xmm1,%xmm5 + punpckldq %xmm2,%xmm0 + punpckldq %xmm0,%xmm5 + movdqa %xmm14,%xmm7 +.byte 102,15,56,0,238 + movdqa %xmm14,%xmm2 + + psrld $6,%xmm7 + movdqa %xmm14,%xmm1 + pslld $7,%xmm2 + movdqa %xmm5,96-128(%rax) + paddd %xmm9,%xmm5 + + psrld $11,%xmm1 + pxor %xmm2,%xmm7 + pslld $21-7,%xmm2 + paddd 64(%rbp),%xmm5 + pxor %xmm1,%xmm7 + + psrld $25-11,%xmm1 + movdqa %xmm14,%xmm0 + + pxor %xmm2,%xmm7 + movdqa %xmm14,%xmm3 + pslld $26-21,%xmm2 + pandn %xmm8,%xmm0 + pand %xmm15,%xmm3 + pxor %xmm1,%xmm7 + + + movdqa %xmm10,%xmm1 + pxor %xmm2,%xmm7 + movdqa %xmm10,%xmm2 + psrld $2,%xmm1 + paddd %xmm7,%xmm5 + pxor %xmm3,%xmm0 + movdqa %xmm11,%xmm3 + movdqa %xmm10,%xmm7 + pslld $10,%xmm2 + pxor %xmm10,%xmm3 + + + psrld $13,%xmm7 + pxor %xmm2,%xmm1 + paddd %xmm0,%xmm5 + pslld $19-10,%xmm2 + pand %xmm3,%xmm4 + pxor %xmm7,%xmm1 + + + psrld $22-13,%xmm7 + pxor %xmm2,%xmm1 + movdqa %xmm11,%xmm9 + pslld $30-19,%xmm2 + pxor %xmm1,%xmm7 + pxor %xmm4,%xmm9 + paddd %xmm5,%xmm13 + pxor %xmm2,%xmm7 + + paddd %xmm5,%xmm9 + paddd %xmm7,%xmm9 + movd 28(%r8),%xmm5 + movd 28(%r9),%xmm0 + movd 28(%r10),%xmm1 + movd 28(%r11),%xmm2 + punpckldq %xmm1,%xmm5 + punpckldq %xmm2,%xmm0 + punpckldq %xmm0,%xmm5 + movdqa %xmm13,%xmm7 + + movdqa %xmm13,%xmm2 +.byte 102,15,56,0,238 + psrld $6,%xmm7 + movdqa %xmm13,%xmm1 + pslld $7,%xmm2 + movdqa %xmm5,112-128(%rax) + paddd %xmm8,%xmm5 + + psrld $11,%xmm1 + pxor %xmm2,%xmm7 + pslld $21-7,%xmm2 + paddd 96(%rbp),%xmm5 + pxor %xmm1,%xmm7 + + psrld $25-11,%xmm1 + movdqa %xmm13,%xmm0 + + pxor %xmm2,%xmm7 + movdqa %xmm13,%xmm4 + pslld $26-21,%xmm2 + pandn %xmm15,%xmm0 + pand %xmm14,%xmm4 + pxor %xmm1,%xmm7 + + + movdqa %xmm9,%xmm1 + pxor %xmm2,%xmm7 + movdqa %xmm9,%xmm2 + psrld $2,%xmm1 + paddd %xmm7,%xmm5 + pxor %xmm4,%xmm0 + movdqa %xmm10,%xmm4 + movdqa %xmm9,%xmm7 + pslld $10,%xmm2 + pxor %xmm9,%xmm4 + + + psrld $13,%xmm7 + pxor %xmm2,%xmm1 + paddd %xmm0,%xmm5 + pslld $19-10,%xmm2 + pand %xmm4,%xmm3 + pxor %xmm7,%xmm1 + + + psrld $22-13,%xmm7 + pxor %xmm2,%xmm1 + movdqa %xmm10,%xmm8 + pslld $30-19,%xmm2 + pxor %xmm1,%xmm7 + pxor %xmm3,%xmm8 + paddd %xmm5,%xmm12 + pxor %xmm2,%xmm7 + + paddd %xmm5,%xmm8 + paddd %xmm7,%xmm8 + leaq 256(%rbp),%rbp + movd 32(%r8),%xmm5 + movd 32(%r9),%xmm0 + movd 32(%r10),%xmm1 + movd 32(%r11),%xmm2 + punpckldq %xmm1,%xmm5 + punpckldq %xmm2,%xmm0 + punpckldq %xmm0,%xmm5 + movdqa %xmm12,%xmm7 +.byte 102,15,56,0,238 + movdqa %xmm12,%xmm2 + + psrld $6,%xmm7 + movdqa %xmm12,%xmm1 + pslld $7,%xmm2 + movdqa %xmm5,128-128(%rax) + paddd %xmm15,%xmm5 + + psrld $11,%xmm1 + pxor %xmm2,%xmm7 + pslld $21-7,%xmm2 + paddd -128(%rbp),%xmm5 + pxor %xmm1,%xmm7 + + psrld $25-11,%xmm1 + movdqa %xmm12,%xmm0 + + pxor %xmm2,%xmm7 + movdqa %xmm12,%xmm3 + pslld $26-21,%xmm2 + pandn %xmm14,%xmm0 + pand %xmm13,%xmm3 + pxor %xmm1,%xmm7 + + + movdqa %xmm8,%xmm1 + pxor %xmm2,%xmm7 + movdqa %xmm8,%xmm2 + psrld $2,%xmm1 + paddd %xmm7,%xmm5 + pxor %xmm3,%xmm0 + movdqa %xmm9,%xmm3 + movdqa %xmm8,%xmm7 + pslld $10,%xmm2 + pxor %xmm8,%xmm3 + + + psrld $13,%xmm7 + pxor %xmm2,%xmm1 + paddd %xmm0,%xmm5 + pslld $19-10,%xmm2 + pand %xmm3,%xmm4 + pxor %xmm7,%xmm1 + + + psrld $22-13,%xmm7 + pxor %xmm2,%xmm1 + movdqa %xmm9,%xmm15 + pslld $30-19,%xmm2 + pxor %xmm1,%xmm7 + pxor %xmm4,%xmm15 + paddd %xmm5,%xmm11 + pxor %xmm2,%xmm7 + + paddd %xmm5,%xmm15 + paddd %xmm7,%xmm15 + movd 36(%r8),%xmm5 + movd 36(%r9),%xmm0 + movd 36(%r10),%xmm1 + movd 36(%r11),%xmm2 + punpckldq %xmm1,%xmm5 + punpckldq %xmm2,%xmm0 + punpckldq %xmm0,%xmm5 + movdqa %xmm11,%xmm7 + + movdqa %xmm11,%xmm2 +.byte 102,15,56,0,238 + psrld $6,%xmm7 + movdqa %xmm11,%xmm1 + pslld $7,%xmm2 + movdqa %xmm5,144-128(%rax) + paddd %xmm14,%xmm5 + + psrld $11,%xmm1 + pxor %xmm2,%xmm7 + pslld $21-7,%xmm2 + paddd -96(%rbp),%xmm5 + pxor %xmm1,%xmm7 + + psrld $25-11,%xmm1 + movdqa %xmm11,%xmm0 + + pxor %xmm2,%xmm7 + movdqa %xmm11,%xmm4 + pslld $26-21,%xmm2 + pandn %xmm13,%xmm0 + pand %xmm12,%xmm4 + pxor %xmm1,%xmm7 + + + movdqa %xmm15,%xmm1 + pxor %xmm2,%xmm7 + movdqa %xmm15,%xmm2 + psrld $2,%xmm1 + paddd %xmm7,%xmm5 + pxor %xmm4,%xmm0 + movdqa %xmm8,%xmm4 + movdqa %xmm15,%xmm7 + pslld $10,%xmm2 + pxor %xmm15,%xmm4 + + + psrld $13,%xmm7 + pxor %xmm2,%xmm1 + paddd %xmm0,%xmm5 + pslld $19-10,%xmm2 + pand %xmm4,%xmm3 + pxor %xmm7,%xmm1 + + + psrld $22-13,%xmm7 + pxor %xmm2,%xmm1 + movdqa %xmm8,%xmm14 + pslld $30-19,%xmm2 + pxor %xmm1,%xmm7 + pxor %xmm3,%xmm14 + paddd %xmm5,%xmm10 + pxor %xmm2,%xmm7 + + paddd %xmm5,%xmm14 + paddd %xmm7,%xmm14 + movd 40(%r8),%xmm5 + movd 40(%r9),%xmm0 + movd 40(%r10),%xmm1 + movd 40(%r11),%xmm2 + punpckldq %xmm1,%xmm5 + punpckldq %xmm2,%xmm0 + punpckldq %xmm0,%xmm5 + movdqa %xmm10,%xmm7 +.byte 102,15,56,0,238 + movdqa %xmm10,%xmm2 + + psrld $6,%xmm7 + movdqa %xmm10,%xmm1 + pslld $7,%xmm2 + movdqa %xmm5,160-128(%rax) + paddd %xmm13,%xmm5 + + psrld $11,%xmm1 + pxor %xmm2,%xmm7 + pslld $21-7,%xmm2 + paddd -64(%rbp),%xmm5 + pxor %xmm1,%xmm7 + + psrld $25-11,%xmm1 + movdqa %xmm10,%xmm0 + + pxor %xmm2,%xmm7 + movdqa %xmm10,%xmm3 + pslld $26-21,%xmm2 + pandn %xmm12,%xmm0 + pand %xmm11,%xmm3 + pxor %xmm1,%xmm7 + + + movdqa %xmm14,%xmm1 + pxor %xmm2,%xmm7 + movdqa %xmm14,%xmm2 + psrld $2,%xmm1 + paddd %xmm7,%xmm5 + pxor %xmm3,%xmm0 + movdqa %xmm15,%xmm3 + movdqa %xmm14,%xmm7 + pslld $10,%xmm2 + pxor %xmm14,%xmm3 + + + psrld $13,%xmm7 + pxor %xmm2,%xmm1 + paddd %xmm0,%xmm5 + pslld $19-10,%xmm2 + pand %xmm3,%xmm4 + pxor %xmm7,%xmm1 + + + psrld $22-13,%xmm7 + pxor %xmm2,%xmm1 + movdqa %xmm15,%xmm13 + pslld $30-19,%xmm2 + pxor %xmm1,%xmm7 + pxor %xmm4,%xmm13 + paddd %xmm5,%xmm9 + pxor %xmm2,%xmm7 + + paddd %xmm5,%xmm13 + paddd %xmm7,%xmm13 + movd 44(%r8),%xmm5 + movd 44(%r9),%xmm0 + movd 44(%r10),%xmm1 + movd 44(%r11),%xmm2 + punpckldq %xmm1,%xmm5 + punpckldq %xmm2,%xmm0 + punpckldq %xmm0,%xmm5 + movdqa %xmm9,%xmm7 + + movdqa %xmm9,%xmm2 +.byte 102,15,56,0,238 + psrld $6,%xmm7 + movdqa %xmm9,%xmm1 + pslld $7,%xmm2 + movdqa %xmm5,176-128(%rax) + paddd %xmm12,%xmm5 + + psrld $11,%xmm1 + pxor %xmm2,%xmm7 + pslld $21-7,%xmm2 + paddd -32(%rbp),%xmm5 + pxor %xmm1,%xmm7 + + psrld $25-11,%xmm1 + movdqa %xmm9,%xmm0 + + pxor %xmm2,%xmm7 + movdqa %xmm9,%xmm4 + pslld $26-21,%xmm2 + pandn %xmm11,%xmm0 + pand %xmm10,%xmm4 + pxor %xmm1,%xmm7 + + + movdqa %xmm13,%xmm1 + pxor %xmm2,%xmm7 + movdqa %xmm13,%xmm2 + psrld $2,%xmm1 + paddd %xmm7,%xmm5 + pxor %xmm4,%xmm0 + movdqa %xmm14,%xmm4 + movdqa %xmm13,%xmm7 + pslld $10,%xmm2 + pxor %xmm13,%xmm4 + + + psrld $13,%xmm7 + pxor %xmm2,%xmm1 + paddd %xmm0,%xmm5 + pslld $19-10,%xmm2 + pand %xmm4,%xmm3 + pxor %xmm7,%xmm1 + + + psrld $22-13,%xmm7 + pxor %xmm2,%xmm1 + movdqa %xmm14,%xmm12 + pslld $30-19,%xmm2 + pxor %xmm1,%xmm7 + pxor %xmm3,%xmm12 + paddd %xmm5,%xmm8 + pxor %xmm2,%xmm7 + + paddd %xmm5,%xmm12 + paddd %xmm7,%xmm12 + movd 48(%r8),%xmm5 + movd 48(%r9),%xmm0 + movd 48(%r10),%xmm1 + movd 48(%r11),%xmm2 + punpckldq %xmm1,%xmm5 + punpckldq %xmm2,%xmm0 + punpckldq %xmm0,%xmm5 + movdqa %xmm8,%xmm7 +.byte 102,15,56,0,238 + movdqa %xmm8,%xmm2 + + psrld $6,%xmm7 + movdqa %xmm8,%xmm1 + pslld $7,%xmm2 + movdqa %xmm5,192-128(%rax) + paddd %xmm11,%xmm5 + + psrld $11,%xmm1 + pxor %xmm2,%xmm7 + pslld $21-7,%xmm2 + paddd 0(%rbp),%xmm5 + pxor %xmm1,%xmm7 + + psrld $25-11,%xmm1 + movdqa %xmm8,%xmm0 + + pxor %xmm2,%xmm7 + movdqa %xmm8,%xmm3 + pslld $26-21,%xmm2 + pandn %xmm10,%xmm0 + pand %xmm9,%xmm3 + pxor %xmm1,%xmm7 + + + movdqa %xmm12,%xmm1 + pxor %xmm2,%xmm7 + movdqa %xmm12,%xmm2 + psrld $2,%xmm1 + paddd %xmm7,%xmm5 + pxor %xmm3,%xmm0 + movdqa %xmm13,%xmm3 + movdqa %xmm12,%xmm7 + pslld $10,%xmm2 + pxor %xmm12,%xmm3 + + + psrld $13,%xmm7 + pxor %xmm2,%xmm1 + paddd %xmm0,%xmm5 + pslld $19-10,%xmm2 + pand %xmm3,%xmm4 + pxor %xmm7,%xmm1 + + + psrld $22-13,%xmm7 + pxor %xmm2,%xmm1 + movdqa %xmm13,%xmm11 + pslld $30-19,%xmm2 + pxor %xmm1,%xmm7 + pxor %xmm4,%xmm11 + paddd %xmm5,%xmm15 + pxor %xmm2,%xmm7 + + paddd %xmm5,%xmm11 + paddd %xmm7,%xmm11 + movd 52(%r8),%xmm5 + movd 52(%r9),%xmm0 + movd 52(%r10),%xmm1 + movd 52(%r11),%xmm2 + punpckldq %xmm1,%xmm5 + punpckldq %xmm2,%xmm0 + punpckldq %xmm0,%xmm5 + movdqa %xmm15,%xmm7 + + movdqa %xmm15,%xmm2 +.byte 102,15,56,0,238 + psrld $6,%xmm7 + movdqa %xmm15,%xmm1 + pslld $7,%xmm2 + movdqa %xmm5,208-128(%rax) + paddd %xmm10,%xmm5 + + psrld $11,%xmm1 + pxor %xmm2,%xmm7 + pslld $21-7,%xmm2 + paddd 32(%rbp),%xmm5 + pxor %xmm1,%xmm7 + + psrld $25-11,%xmm1 + movdqa %xmm15,%xmm0 + + pxor %xmm2,%xmm7 + movdqa %xmm15,%xmm4 + pslld $26-21,%xmm2 + pandn %xmm9,%xmm0 + pand %xmm8,%xmm4 + pxor %xmm1,%xmm7 + + + movdqa %xmm11,%xmm1 + pxor %xmm2,%xmm7 + movdqa %xmm11,%xmm2 + psrld $2,%xmm1 + paddd %xmm7,%xmm5 + pxor %xmm4,%xmm0 + movdqa %xmm12,%xmm4 + movdqa %xmm11,%xmm7 + pslld $10,%xmm2 + pxor %xmm11,%xmm4 + + + psrld $13,%xmm7 + pxor %xmm2,%xmm1 + paddd %xmm0,%xmm5 + pslld $19-10,%xmm2 + pand %xmm4,%xmm3 + pxor %xmm7,%xmm1 + + + psrld $22-13,%xmm7 + pxor %xmm2,%xmm1 + movdqa %xmm12,%xmm10 + pslld $30-19,%xmm2 + pxor %xmm1,%xmm7 + pxor %xmm3,%xmm10 + paddd %xmm5,%xmm14 + pxor %xmm2,%xmm7 + + paddd %xmm5,%xmm10 + paddd %xmm7,%xmm10 + movd 56(%r8),%xmm5 + movd 56(%r9),%xmm0 + movd 56(%r10),%xmm1 + movd 56(%r11),%xmm2 + punpckldq %xmm1,%xmm5 + punpckldq %xmm2,%xmm0 + punpckldq %xmm0,%xmm5 + movdqa %xmm14,%xmm7 +.byte 102,15,56,0,238 + movdqa %xmm14,%xmm2 + + psrld $6,%xmm7 + movdqa %xmm14,%xmm1 + pslld $7,%xmm2 + movdqa %xmm5,224-128(%rax) + paddd %xmm9,%xmm5 + + psrld $11,%xmm1 + pxor %xmm2,%xmm7 + pslld $21-7,%xmm2 + paddd 64(%rbp),%xmm5 + pxor %xmm1,%xmm7 + + psrld $25-11,%xmm1 + movdqa %xmm14,%xmm0 + + pxor %xmm2,%xmm7 + movdqa %xmm14,%xmm3 + pslld $26-21,%xmm2 + pandn %xmm8,%xmm0 + pand %xmm15,%xmm3 + pxor %xmm1,%xmm7 + + + movdqa %xmm10,%xmm1 + pxor %xmm2,%xmm7 + movdqa %xmm10,%xmm2 + psrld $2,%xmm1 + paddd %xmm7,%xmm5 + pxor %xmm3,%xmm0 + movdqa %xmm11,%xmm3 + movdqa %xmm10,%xmm7 + pslld $10,%xmm2 + pxor %xmm10,%xmm3 + + + psrld $13,%xmm7 + pxor %xmm2,%xmm1 + paddd %xmm0,%xmm5 + pslld $19-10,%xmm2 + pand %xmm3,%xmm4 + pxor %xmm7,%xmm1 + + + psrld $22-13,%xmm7 + pxor %xmm2,%xmm1 + movdqa %xmm11,%xmm9 + pslld $30-19,%xmm2 + pxor %xmm1,%xmm7 + pxor %xmm4,%xmm9 + paddd %xmm5,%xmm13 + pxor %xmm2,%xmm7 + + paddd %xmm5,%xmm9 + paddd %xmm7,%xmm9 + movd 60(%r8),%xmm5 + leaq 64(%r8),%r8 + movd 60(%r9),%xmm0 + leaq 64(%r9),%r9 + movd 60(%r10),%xmm1 + leaq 64(%r10),%r10 + movd 60(%r11),%xmm2 + leaq 64(%r11),%r11 + punpckldq %xmm1,%xmm5 + punpckldq %xmm2,%xmm0 + punpckldq %xmm0,%xmm5 + movdqa %xmm13,%xmm7 + + movdqa %xmm13,%xmm2 +.byte 102,15,56,0,238 + psrld $6,%xmm7 + movdqa %xmm13,%xmm1 + pslld $7,%xmm2 + movdqa %xmm5,240-128(%rax) + paddd %xmm8,%xmm5 + + psrld $11,%xmm1 + pxor %xmm2,%xmm7 + pslld $21-7,%xmm2 + paddd 96(%rbp),%xmm5 + pxor %xmm1,%xmm7 + + psrld $25-11,%xmm1 + movdqa %xmm13,%xmm0 + prefetcht0 63(%r8) + pxor %xmm2,%xmm7 + movdqa %xmm13,%xmm4 + pslld $26-21,%xmm2 + pandn %xmm15,%xmm0 + pand %xmm14,%xmm4 + pxor %xmm1,%xmm7 + + prefetcht0 63(%r9) + movdqa %xmm9,%xmm1 + pxor %xmm2,%xmm7 + movdqa %xmm9,%xmm2 + psrld $2,%xmm1 + paddd %xmm7,%xmm5 + pxor %xmm4,%xmm0 + movdqa %xmm10,%xmm4 + movdqa %xmm9,%xmm7 + pslld $10,%xmm2 + pxor %xmm9,%xmm4 + + prefetcht0 63(%r10) + psrld $13,%xmm7 + pxor %xmm2,%xmm1 + paddd %xmm0,%xmm5 + pslld $19-10,%xmm2 + pand %xmm4,%xmm3 + pxor %xmm7,%xmm1 + + prefetcht0 63(%r11) + psrld $22-13,%xmm7 + pxor %xmm2,%xmm1 + movdqa %xmm10,%xmm8 + pslld $30-19,%xmm2 + pxor %xmm1,%xmm7 + pxor %xmm3,%xmm8 + paddd %xmm5,%xmm12 + pxor %xmm2,%xmm7 + + paddd %xmm5,%xmm8 + paddd %xmm7,%xmm8 + leaq 256(%rbp),%rbp + movdqu 0-128(%rax),%xmm5 + movl $3,%ecx + jmp .Loop_16_xx +.align 32 +.Loop_16_xx: + movdqa 16-128(%rax),%xmm6 + paddd 144-128(%rax),%xmm5 + + movdqa %xmm6,%xmm7 + movdqa %xmm6,%xmm1 + psrld $3,%xmm7 + movdqa %xmm6,%xmm2 + + psrld $7,%xmm1 + movdqa 224-128(%rax),%xmm0 + pslld $14,%xmm2 + pxor %xmm1,%xmm7 + psrld $18-7,%xmm1 + movdqa %xmm0,%xmm3 + pxor %xmm2,%xmm7 + pslld $25-14,%xmm2 + pxor %xmm1,%xmm7 + psrld $10,%xmm0 + movdqa %xmm3,%xmm1 + + psrld $17,%xmm3 + pxor %xmm2,%xmm7 + pslld $13,%xmm1 + paddd %xmm7,%xmm5 + pxor %xmm3,%xmm0 + psrld $19-17,%xmm3 + pxor %xmm1,%xmm0 + pslld $15-13,%xmm1 + pxor %xmm3,%xmm0 + pxor %xmm1,%xmm0 + paddd %xmm0,%xmm5 + movdqa %xmm12,%xmm7 + + movdqa %xmm12,%xmm2 + + psrld $6,%xmm7 + movdqa %xmm12,%xmm1 + pslld $7,%xmm2 + movdqa %xmm5,0-128(%rax) + paddd %xmm15,%xmm5 + + psrld $11,%xmm1 + pxor %xmm2,%xmm7 + pslld $21-7,%xmm2 + paddd -128(%rbp),%xmm5 + pxor %xmm1,%xmm7 + + psrld $25-11,%xmm1 + movdqa %xmm12,%xmm0 + + pxor %xmm2,%xmm7 + movdqa %xmm12,%xmm3 + pslld $26-21,%xmm2 + pandn %xmm14,%xmm0 + pand %xmm13,%xmm3 + pxor %xmm1,%xmm7 + + + movdqa %xmm8,%xmm1 + pxor %xmm2,%xmm7 + movdqa %xmm8,%xmm2 + psrld $2,%xmm1 + paddd %xmm7,%xmm5 + pxor %xmm3,%xmm0 + movdqa %xmm9,%xmm3 + movdqa %xmm8,%xmm7 + pslld $10,%xmm2 + pxor %xmm8,%xmm3 + + + psrld $13,%xmm7 + pxor %xmm2,%xmm1 + paddd %xmm0,%xmm5 + pslld $19-10,%xmm2 + pand %xmm3,%xmm4 + pxor %xmm7,%xmm1 + + + psrld $22-13,%xmm7 + pxor %xmm2,%xmm1 + movdqa %xmm9,%xmm15 + pslld $30-19,%xmm2 + pxor %xmm1,%xmm7 + pxor %xmm4,%xmm15 + paddd %xmm5,%xmm11 + pxor %xmm2,%xmm7 + + paddd %xmm5,%xmm15 + paddd %xmm7,%xmm15 + movdqa 32-128(%rax),%xmm5 + paddd 160-128(%rax),%xmm6 + + movdqa %xmm5,%xmm7 + movdqa %xmm5,%xmm1 + psrld $3,%xmm7 + movdqa %xmm5,%xmm2 + + psrld $7,%xmm1 + movdqa 240-128(%rax),%xmm0 + pslld $14,%xmm2 + pxor %xmm1,%xmm7 + psrld $18-7,%xmm1 + movdqa %xmm0,%xmm4 + pxor %xmm2,%xmm7 + pslld $25-14,%xmm2 + pxor %xmm1,%xmm7 + psrld $10,%xmm0 + movdqa %xmm4,%xmm1 + + psrld $17,%xmm4 + pxor %xmm2,%xmm7 + pslld $13,%xmm1 + paddd %xmm7,%xmm6 + pxor %xmm4,%xmm0 + psrld $19-17,%xmm4 + pxor %xmm1,%xmm0 + pslld $15-13,%xmm1 + pxor %xmm4,%xmm0 + pxor %xmm1,%xmm0 + paddd %xmm0,%xmm6 + movdqa %xmm11,%xmm7 + + movdqa %xmm11,%xmm2 + + psrld $6,%xmm7 + movdqa %xmm11,%xmm1 + pslld $7,%xmm2 + movdqa %xmm6,16-128(%rax) + paddd %xmm14,%xmm6 + + psrld $11,%xmm1 + pxor %xmm2,%xmm7 + pslld $21-7,%xmm2 + paddd -96(%rbp),%xmm6 + pxor %xmm1,%xmm7 + + psrld $25-11,%xmm1 + movdqa %xmm11,%xmm0 + + pxor %xmm2,%xmm7 + movdqa %xmm11,%xmm4 + pslld $26-21,%xmm2 + pandn %xmm13,%xmm0 + pand %xmm12,%xmm4 + pxor %xmm1,%xmm7 + + + movdqa %xmm15,%xmm1 + pxor %xmm2,%xmm7 + movdqa %xmm15,%xmm2 + psrld $2,%xmm1 + paddd %xmm7,%xmm6 + pxor %xmm4,%xmm0 + movdqa %xmm8,%xmm4 + movdqa %xmm15,%xmm7 + pslld $10,%xmm2 + pxor %xmm15,%xmm4 + + + psrld $13,%xmm7 + pxor %xmm2,%xmm1 + paddd %xmm0,%xmm6 + pslld $19-10,%xmm2 + pand %xmm4,%xmm3 + pxor %xmm7,%xmm1 + + + psrld $22-13,%xmm7 + pxor %xmm2,%xmm1 + movdqa %xmm8,%xmm14 + pslld $30-19,%xmm2 + pxor %xmm1,%xmm7 + pxor %xmm3,%xmm14 + paddd %xmm6,%xmm10 + pxor %xmm2,%xmm7 + + paddd %xmm6,%xmm14 + paddd %xmm7,%xmm14 + movdqa 48-128(%rax),%xmm6 + paddd 176-128(%rax),%xmm5 + + movdqa %xmm6,%xmm7 + movdqa %xmm6,%xmm1 + psrld $3,%xmm7 + movdqa %xmm6,%xmm2 + + psrld $7,%xmm1 + movdqa 0-128(%rax),%xmm0 + pslld $14,%xmm2 + pxor %xmm1,%xmm7 + psrld $18-7,%xmm1 + movdqa %xmm0,%xmm3 + pxor %xmm2,%xmm7 + pslld $25-14,%xmm2 + pxor %xmm1,%xmm7 + psrld $10,%xmm0 + movdqa %xmm3,%xmm1 + + psrld $17,%xmm3 + pxor %xmm2,%xmm7 + pslld $13,%xmm1 + paddd %xmm7,%xmm5 + pxor %xmm3,%xmm0 + psrld $19-17,%xmm3 + pxor %xmm1,%xmm0 + pslld $15-13,%xmm1 + pxor %xmm3,%xmm0 + pxor %xmm1,%xmm0 + paddd %xmm0,%xmm5 + movdqa %xmm10,%xmm7 + + movdqa %xmm10,%xmm2 + + psrld $6,%xmm7 + movdqa %xmm10,%xmm1 + pslld $7,%xmm2 + movdqa %xmm5,32-128(%rax) + paddd %xmm13,%xmm5 + + psrld $11,%xmm1 + pxor %xmm2,%xmm7 + pslld $21-7,%xmm2 + paddd -64(%rbp),%xmm5 + pxor %xmm1,%xmm7 + + psrld $25-11,%xmm1 + movdqa %xmm10,%xmm0 + + pxor %xmm2,%xmm7 + movdqa %xmm10,%xmm3 + pslld $26-21,%xmm2 + pandn %xmm12,%xmm0 + pand %xmm11,%xmm3 + pxor %xmm1,%xmm7 + + + movdqa %xmm14,%xmm1 + pxor %xmm2,%xmm7 + movdqa %xmm14,%xmm2 + psrld $2,%xmm1 + paddd %xmm7,%xmm5 + pxor %xmm3,%xmm0 + movdqa %xmm15,%xmm3 + movdqa %xmm14,%xmm7 + pslld $10,%xmm2 + pxor %xmm14,%xmm3 + + + psrld $13,%xmm7 + pxor %xmm2,%xmm1 + paddd %xmm0,%xmm5 + pslld $19-10,%xmm2 + pand %xmm3,%xmm4 + pxor %xmm7,%xmm1 + + + psrld $22-13,%xmm7 + pxor %xmm2,%xmm1 + movdqa %xmm15,%xmm13 + pslld $30-19,%xmm2 + pxor %xmm1,%xmm7 + pxor %xmm4,%xmm13 + paddd %xmm5,%xmm9 + pxor %xmm2,%xmm7 + + paddd %xmm5,%xmm13 + paddd %xmm7,%xmm13 + movdqa 64-128(%rax),%xmm5 + paddd 192-128(%rax),%xmm6 + + movdqa %xmm5,%xmm7 + movdqa %xmm5,%xmm1 + psrld $3,%xmm7 + movdqa %xmm5,%xmm2 + + psrld $7,%xmm1 + movdqa 16-128(%rax),%xmm0 + pslld $14,%xmm2 + pxor %xmm1,%xmm7 + psrld $18-7,%xmm1 + movdqa %xmm0,%xmm4 + pxor %xmm2,%xmm7 + pslld $25-14,%xmm2 + pxor %xmm1,%xmm7 + psrld $10,%xmm0 + movdqa %xmm4,%xmm1 + + psrld $17,%xmm4 + pxor %xmm2,%xmm7 + pslld $13,%xmm1 + paddd %xmm7,%xmm6 + pxor %xmm4,%xmm0 + psrld $19-17,%xmm4 + pxor %xmm1,%xmm0 + pslld $15-13,%xmm1 + pxor %xmm4,%xmm0 + pxor %xmm1,%xmm0 + paddd %xmm0,%xmm6 + movdqa %xmm9,%xmm7 + + movdqa %xmm9,%xmm2 + + psrld $6,%xmm7 + movdqa %xmm9,%xmm1 + pslld $7,%xmm2 + movdqa %xmm6,48-128(%rax) + paddd %xmm12,%xmm6 + + psrld $11,%xmm1 + pxor %xmm2,%xmm7 + pslld $21-7,%xmm2 + paddd -32(%rbp),%xmm6 + pxor %xmm1,%xmm7 + + psrld $25-11,%xmm1 + movdqa %xmm9,%xmm0 + + pxor %xmm2,%xmm7 + movdqa %xmm9,%xmm4 + pslld $26-21,%xmm2 + pandn %xmm11,%xmm0 + pand %xmm10,%xmm4 + pxor %xmm1,%xmm7 + + + movdqa %xmm13,%xmm1 + pxor %xmm2,%xmm7 + movdqa %xmm13,%xmm2 + psrld $2,%xmm1 + paddd %xmm7,%xmm6 + pxor %xmm4,%xmm0 + movdqa %xmm14,%xmm4 + movdqa %xmm13,%xmm7 + pslld $10,%xmm2 + pxor %xmm13,%xmm4 + + + psrld $13,%xmm7 + pxor %xmm2,%xmm1 + paddd %xmm0,%xmm6 + pslld $19-10,%xmm2 + pand %xmm4,%xmm3 + pxor %xmm7,%xmm1 + + + psrld $22-13,%xmm7 + pxor %xmm2,%xmm1 + movdqa %xmm14,%xmm12 + pslld $30-19,%xmm2 + pxor %xmm1,%xmm7 + pxor %xmm3,%xmm12 + paddd %xmm6,%xmm8 + pxor %xmm2,%xmm7 + + paddd %xmm6,%xmm12 + paddd %xmm7,%xmm12 + movdqa 80-128(%rax),%xmm6 + paddd 208-128(%rax),%xmm5 + + movdqa %xmm6,%xmm7 + movdqa %xmm6,%xmm1 + psrld $3,%xmm7 + movdqa %xmm6,%xmm2 + + psrld $7,%xmm1 + movdqa 32-128(%rax),%xmm0 + pslld $14,%xmm2 + pxor %xmm1,%xmm7 + psrld $18-7,%xmm1 + movdqa %xmm0,%xmm3 + pxor %xmm2,%xmm7 + pslld $25-14,%xmm2 + pxor %xmm1,%xmm7 + psrld $10,%xmm0 + movdqa %xmm3,%xmm1 + + psrld $17,%xmm3 + pxor %xmm2,%xmm7 + pslld $13,%xmm1 + paddd %xmm7,%xmm5 + pxor %xmm3,%xmm0 + psrld $19-17,%xmm3 + pxor %xmm1,%xmm0 + pslld $15-13,%xmm1 + pxor %xmm3,%xmm0 + pxor %xmm1,%xmm0 + paddd %xmm0,%xmm5 + movdqa %xmm8,%xmm7 + + movdqa %xmm8,%xmm2 + + psrld $6,%xmm7 + movdqa %xmm8,%xmm1 + pslld $7,%xmm2 + movdqa %xmm5,64-128(%rax) + paddd %xmm11,%xmm5 + + psrld $11,%xmm1 + pxor %xmm2,%xmm7 + pslld $21-7,%xmm2 + paddd 0(%rbp),%xmm5 + pxor %xmm1,%xmm7 + + psrld $25-11,%xmm1 + movdqa %xmm8,%xmm0 + + pxor %xmm2,%xmm7 + movdqa %xmm8,%xmm3 + pslld $26-21,%xmm2 + pandn %xmm10,%xmm0 + pand %xmm9,%xmm3 + pxor %xmm1,%xmm7 + + + movdqa %xmm12,%xmm1 + pxor %xmm2,%xmm7 + movdqa %xmm12,%xmm2 + psrld $2,%xmm1 + paddd %xmm7,%xmm5 + pxor %xmm3,%xmm0 + movdqa %xmm13,%xmm3 + movdqa %xmm12,%xmm7 + pslld $10,%xmm2 + pxor %xmm12,%xmm3 + + + psrld $13,%xmm7 + pxor %xmm2,%xmm1 + paddd %xmm0,%xmm5 + pslld $19-10,%xmm2 + pand %xmm3,%xmm4 + pxor %xmm7,%xmm1 + + + psrld $22-13,%xmm7 + pxor %xmm2,%xmm1 + movdqa %xmm13,%xmm11 + pslld $30-19,%xmm2 + pxor %xmm1,%xmm7 + pxor %xmm4,%xmm11 + paddd %xmm5,%xmm15 + pxor %xmm2,%xmm7 + + paddd %xmm5,%xmm11 + paddd %xmm7,%xmm11 + movdqa 96-128(%rax),%xmm5 + paddd 224-128(%rax),%xmm6 + + movdqa %xmm5,%xmm7 + movdqa %xmm5,%xmm1 + psrld $3,%xmm7 + movdqa %xmm5,%xmm2 + + psrld $7,%xmm1 + movdqa 48-128(%rax),%xmm0 + pslld $14,%xmm2 + pxor %xmm1,%xmm7 + psrld $18-7,%xmm1 + movdqa %xmm0,%xmm4 + pxor %xmm2,%xmm7 + pslld $25-14,%xmm2 + pxor %xmm1,%xmm7 + psrld $10,%xmm0 + movdqa %xmm4,%xmm1 + + psrld $17,%xmm4 + pxor %xmm2,%xmm7 + pslld $13,%xmm1 + paddd %xmm7,%xmm6 + pxor %xmm4,%xmm0 + psrld $19-17,%xmm4 + pxor %xmm1,%xmm0 + pslld $15-13,%xmm1 + pxor %xmm4,%xmm0 + pxor %xmm1,%xmm0 + paddd %xmm0,%xmm6 + movdqa %xmm15,%xmm7 + + movdqa %xmm15,%xmm2 + + psrld $6,%xmm7 + movdqa %xmm15,%xmm1 + pslld $7,%xmm2 + movdqa %xmm6,80-128(%rax) + paddd %xmm10,%xmm6 + + psrld $11,%xmm1 + pxor %xmm2,%xmm7 + pslld $21-7,%xmm2 + paddd 32(%rbp),%xmm6 + pxor %xmm1,%xmm7 + + psrld $25-11,%xmm1 + movdqa %xmm15,%xmm0 + + pxor %xmm2,%xmm7 + movdqa %xmm15,%xmm4 + pslld $26-21,%xmm2 + pandn %xmm9,%xmm0 + pand %xmm8,%xmm4 + pxor %xmm1,%xmm7 + + + movdqa %xmm11,%xmm1 + pxor %xmm2,%xmm7 + movdqa %xmm11,%xmm2 + psrld $2,%xmm1 + paddd %xmm7,%xmm6 + pxor %xmm4,%xmm0 + movdqa %xmm12,%xmm4 + movdqa %xmm11,%xmm7 + pslld $10,%xmm2 + pxor %xmm11,%xmm4 + + + psrld $13,%xmm7 + pxor %xmm2,%xmm1 + paddd %xmm0,%xmm6 + pslld $19-10,%xmm2 + pand %xmm4,%xmm3 + pxor %xmm7,%xmm1 + + + psrld $22-13,%xmm7 + pxor %xmm2,%xmm1 + movdqa %xmm12,%xmm10 + pslld $30-19,%xmm2 + pxor %xmm1,%xmm7 + pxor %xmm3,%xmm10 + paddd %xmm6,%xmm14 + pxor %xmm2,%xmm7 + + paddd %xmm6,%xmm10 + paddd %xmm7,%xmm10 + movdqa 112-128(%rax),%xmm6 + paddd 240-128(%rax),%xmm5 + + movdqa %xmm6,%xmm7 + movdqa %xmm6,%xmm1 + psrld $3,%xmm7 + movdqa %xmm6,%xmm2 + + psrld $7,%xmm1 + movdqa 64-128(%rax),%xmm0 + pslld $14,%xmm2 + pxor %xmm1,%xmm7 + psrld $18-7,%xmm1 + movdqa %xmm0,%xmm3 + pxor %xmm2,%xmm7 + pslld $25-14,%xmm2 + pxor %xmm1,%xmm7 + psrld $10,%xmm0 + movdqa %xmm3,%xmm1 + + psrld $17,%xmm3 + pxor %xmm2,%xmm7 + pslld $13,%xmm1 + paddd %xmm7,%xmm5 + pxor %xmm3,%xmm0 + psrld $19-17,%xmm3 + pxor %xmm1,%xmm0 + pslld $15-13,%xmm1 + pxor %xmm3,%xmm0 + pxor %xmm1,%xmm0 + paddd %xmm0,%xmm5 + movdqa %xmm14,%xmm7 + + movdqa %xmm14,%xmm2 + + psrld $6,%xmm7 + movdqa %xmm14,%xmm1 + pslld $7,%xmm2 + movdqa %xmm5,96-128(%rax) + paddd %xmm9,%xmm5 + + psrld $11,%xmm1 + pxor %xmm2,%xmm7 + pslld $21-7,%xmm2 + paddd 64(%rbp),%xmm5 + pxor %xmm1,%xmm7 + + psrld $25-11,%xmm1 + movdqa %xmm14,%xmm0 + + pxor %xmm2,%xmm7 + movdqa %xmm14,%xmm3 + pslld $26-21,%xmm2 + pandn %xmm8,%xmm0 + pand %xmm15,%xmm3 + pxor %xmm1,%xmm7 + + + movdqa %xmm10,%xmm1 + pxor %xmm2,%xmm7 + movdqa %xmm10,%xmm2 + psrld $2,%xmm1 + paddd %xmm7,%xmm5 + pxor %xmm3,%xmm0 + movdqa %xmm11,%xmm3 + movdqa %xmm10,%xmm7 + pslld $10,%xmm2 + pxor %xmm10,%xmm3 + + + psrld $13,%xmm7 + pxor %xmm2,%xmm1 + paddd %xmm0,%xmm5 + pslld $19-10,%xmm2 + pand %xmm3,%xmm4 + pxor %xmm7,%xmm1 + + + psrld $22-13,%xmm7 + pxor %xmm2,%xmm1 + movdqa %xmm11,%xmm9 + pslld $30-19,%xmm2 + pxor %xmm1,%xmm7 + pxor %xmm4,%xmm9 + paddd %xmm5,%xmm13 + pxor %xmm2,%xmm7 + + paddd %xmm5,%xmm9 + paddd %xmm7,%xmm9 + movdqa 128-128(%rax),%xmm5 + paddd 0-128(%rax),%xmm6 + + movdqa %xmm5,%xmm7 + movdqa %xmm5,%xmm1 + psrld $3,%xmm7 + movdqa %xmm5,%xmm2 + + psrld $7,%xmm1 + movdqa 80-128(%rax),%xmm0 + pslld $14,%xmm2 + pxor %xmm1,%xmm7 + psrld $18-7,%xmm1 + movdqa %xmm0,%xmm4 + pxor %xmm2,%xmm7 + pslld $25-14,%xmm2 + pxor %xmm1,%xmm7 + psrld $10,%xmm0 + movdqa %xmm4,%xmm1 + + psrld $17,%xmm4 + pxor %xmm2,%xmm7 + pslld $13,%xmm1 + paddd %xmm7,%xmm6 + pxor %xmm4,%xmm0 + psrld $19-17,%xmm4 + pxor %xmm1,%xmm0 + pslld $15-13,%xmm1 + pxor %xmm4,%xmm0 + pxor %xmm1,%xmm0 + paddd %xmm0,%xmm6 + movdqa %xmm13,%xmm7 + + movdqa %xmm13,%xmm2 + + psrld $6,%xmm7 + movdqa %xmm13,%xmm1 + pslld $7,%xmm2 + movdqa %xmm6,112-128(%rax) + paddd %xmm8,%xmm6 + + psrld $11,%xmm1 + pxor %xmm2,%xmm7 + pslld $21-7,%xmm2 + paddd 96(%rbp),%xmm6 + pxor %xmm1,%xmm7 + + psrld $25-11,%xmm1 + movdqa %xmm13,%xmm0 + + pxor %xmm2,%xmm7 + movdqa %xmm13,%xmm4 + pslld $26-21,%xmm2 + pandn %xmm15,%xmm0 + pand %xmm14,%xmm4 + pxor %xmm1,%xmm7 + + + movdqa %xmm9,%xmm1 + pxor %xmm2,%xmm7 + movdqa %xmm9,%xmm2 + psrld $2,%xmm1 + paddd %xmm7,%xmm6 + pxor %xmm4,%xmm0 + movdqa %xmm10,%xmm4 + movdqa %xmm9,%xmm7 + pslld $10,%xmm2 + pxor %xmm9,%xmm4 + + + psrld $13,%xmm7 + pxor %xmm2,%xmm1 + paddd %xmm0,%xmm6 + pslld $19-10,%xmm2 + pand %xmm4,%xmm3 + pxor %xmm7,%xmm1 + + + psrld $22-13,%xmm7 + pxor %xmm2,%xmm1 + movdqa %xmm10,%xmm8 + pslld $30-19,%xmm2 + pxor %xmm1,%xmm7 + pxor %xmm3,%xmm8 + paddd %xmm6,%xmm12 + pxor %xmm2,%xmm7 + + paddd %xmm6,%xmm8 + paddd %xmm7,%xmm8 + leaq 256(%rbp),%rbp + movdqa 144-128(%rax),%xmm6 + paddd 16-128(%rax),%xmm5 + + movdqa %xmm6,%xmm7 + movdqa %xmm6,%xmm1 + psrld $3,%xmm7 + movdqa %xmm6,%xmm2 + + psrld $7,%xmm1 + movdqa 96-128(%rax),%xmm0 + pslld $14,%xmm2 + pxor %xmm1,%xmm7 + psrld $18-7,%xmm1 + movdqa %xmm0,%xmm3 + pxor %xmm2,%xmm7 + pslld $25-14,%xmm2 + pxor %xmm1,%xmm7 + psrld $10,%xmm0 + movdqa %xmm3,%xmm1 + + psrld $17,%xmm3 + pxor %xmm2,%xmm7 + pslld $13,%xmm1 + paddd %xmm7,%xmm5 + pxor %xmm3,%xmm0 + psrld $19-17,%xmm3 + pxor %xmm1,%xmm0 + pslld $15-13,%xmm1 + pxor %xmm3,%xmm0 + pxor %xmm1,%xmm0 + paddd %xmm0,%xmm5 + movdqa %xmm12,%xmm7 + + movdqa %xmm12,%xmm2 + + psrld $6,%xmm7 + movdqa %xmm12,%xmm1 + pslld $7,%xmm2 + movdqa %xmm5,128-128(%rax) + paddd %xmm15,%xmm5 + + psrld $11,%xmm1 + pxor %xmm2,%xmm7 + pslld $21-7,%xmm2 + paddd -128(%rbp),%xmm5 + pxor %xmm1,%xmm7 + + psrld $25-11,%xmm1 + movdqa %xmm12,%xmm0 + + pxor %xmm2,%xmm7 + movdqa %xmm12,%xmm3 + pslld $26-21,%xmm2 + pandn %xmm14,%xmm0 + pand %xmm13,%xmm3 + pxor %xmm1,%xmm7 + + + movdqa %xmm8,%xmm1 + pxor %xmm2,%xmm7 + movdqa %xmm8,%xmm2 + psrld $2,%xmm1 + paddd %xmm7,%xmm5 + pxor %xmm3,%xmm0 + movdqa %xmm9,%xmm3 + movdqa %xmm8,%xmm7 + pslld $10,%xmm2 + pxor %xmm8,%xmm3 + + + psrld $13,%xmm7 + pxor %xmm2,%xmm1 + paddd %xmm0,%xmm5 + pslld $19-10,%xmm2 + pand %xmm3,%xmm4 + pxor %xmm7,%xmm1 + + + psrld $22-13,%xmm7 + pxor %xmm2,%xmm1 + movdqa %xmm9,%xmm15 + pslld $30-19,%xmm2 + pxor %xmm1,%xmm7 + pxor %xmm4,%xmm15 + paddd %xmm5,%xmm11 + pxor %xmm2,%xmm7 + + paddd %xmm5,%xmm15 + paddd %xmm7,%xmm15 + movdqa 160-128(%rax),%xmm5 + paddd 32-128(%rax),%xmm6 + + movdqa %xmm5,%xmm7 + movdqa %xmm5,%xmm1 + psrld $3,%xmm7 + movdqa %xmm5,%xmm2 + + psrld $7,%xmm1 + movdqa 112-128(%rax),%xmm0 + pslld $14,%xmm2 + pxor %xmm1,%xmm7 + psrld $18-7,%xmm1 + movdqa %xmm0,%xmm4 + pxor %xmm2,%xmm7 + pslld $25-14,%xmm2 + pxor %xmm1,%xmm7 + psrld $10,%xmm0 + movdqa %xmm4,%xmm1 + + psrld $17,%xmm4 + pxor %xmm2,%xmm7 + pslld $13,%xmm1 + paddd %xmm7,%xmm6 + pxor %xmm4,%xmm0 + psrld $19-17,%xmm4 + pxor %xmm1,%xmm0 + pslld $15-13,%xmm1 + pxor %xmm4,%xmm0 + pxor %xmm1,%xmm0 + paddd %xmm0,%xmm6 + movdqa %xmm11,%xmm7 + + movdqa %xmm11,%xmm2 + + psrld $6,%xmm7 + movdqa %xmm11,%xmm1 + pslld $7,%xmm2 + movdqa %xmm6,144-128(%rax) + paddd %xmm14,%xmm6 + + psrld $11,%xmm1 + pxor %xmm2,%xmm7 + pslld $21-7,%xmm2 + paddd -96(%rbp),%xmm6 + pxor %xmm1,%xmm7 + + psrld $25-11,%xmm1 + movdqa %xmm11,%xmm0 + + pxor %xmm2,%xmm7 + movdqa %xmm11,%xmm4 + pslld $26-21,%xmm2 + pandn %xmm13,%xmm0 + pand %xmm12,%xmm4 + pxor %xmm1,%xmm7 + + + movdqa %xmm15,%xmm1 + pxor %xmm2,%xmm7 + movdqa %xmm15,%xmm2 + psrld $2,%xmm1 + paddd %xmm7,%xmm6 + pxor %xmm4,%xmm0 + movdqa %xmm8,%xmm4 + movdqa %xmm15,%xmm7 + pslld $10,%xmm2 + pxor %xmm15,%xmm4 + + + psrld $13,%xmm7 + pxor %xmm2,%xmm1 + paddd %xmm0,%xmm6 + pslld $19-10,%xmm2 + pand %xmm4,%xmm3 + pxor %xmm7,%xmm1 + + + psrld $22-13,%xmm7 + pxor %xmm2,%xmm1 + movdqa %xmm8,%xmm14 + pslld $30-19,%xmm2 + pxor %xmm1,%xmm7 + pxor %xmm3,%xmm14 + paddd %xmm6,%xmm10 + pxor %xmm2,%xmm7 + + paddd %xmm6,%xmm14 + paddd %xmm7,%xmm14 + movdqa 176-128(%rax),%xmm6 + paddd 48-128(%rax),%xmm5 + + movdqa %xmm6,%xmm7 + movdqa %xmm6,%xmm1 + psrld $3,%xmm7 + movdqa %xmm6,%xmm2 + + psrld $7,%xmm1 + movdqa 128-128(%rax),%xmm0 + pslld $14,%xmm2 + pxor %xmm1,%xmm7 + psrld $18-7,%xmm1 + movdqa %xmm0,%xmm3 + pxor %xmm2,%xmm7 + pslld $25-14,%xmm2 + pxor %xmm1,%xmm7 + psrld $10,%xmm0 + movdqa %xmm3,%xmm1 + + psrld $17,%xmm3 + pxor %xmm2,%xmm7 + pslld $13,%xmm1 + paddd %xmm7,%xmm5 + pxor %xmm3,%xmm0 + psrld $19-17,%xmm3 + pxor %xmm1,%xmm0 + pslld $15-13,%xmm1 + pxor %xmm3,%xmm0 + pxor %xmm1,%xmm0 + paddd %xmm0,%xmm5 + movdqa %xmm10,%xmm7 + + movdqa %xmm10,%xmm2 + + psrld $6,%xmm7 + movdqa %xmm10,%xmm1 + pslld $7,%xmm2 + movdqa %xmm5,160-128(%rax) + paddd %xmm13,%xmm5 + + psrld $11,%xmm1 + pxor %xmm2,%xmm7 + pslld $21-7,%xmm2 + paddd -64(%rbp),%xmm5 + pxor %xmm1,%xmm7 + + psrld $25-11,%xmm1 + movdqa %xmm10,%xmm0 + + pxor %xmm2,%xmm7 + movdqa %xmm10,%xmm3 + pslld $26-21,%xmm2 + pandn %xmm12,%xmm0 + pand %xmm11,%xmm3 + pxor %xmm1,%xmm7 + + + movdqa %xmm14,%xmm1 + pxor %xmm2,%xmm7 + movdqa %xmm14,%xmm2 + psrld $2,%xmm1 + paddd %xmm7,%xmm5 + pxor %xmm3,%xmm0 + movdqa %xmm15,%xmm3 + movdqa %xmm14,%xmm7 + pslld $10,%xmm2 + pxor %xmm14,%xmm3 + + + psrld $13,%xmm7 + pxor %xmm2,%xmm1 + paddd %xmm0,%xmm5 + pslld $19-10,%xmm2 + pand %xmm3,%xmm4 + pxor %xmm7,%xmm1 + + + psrld $22-13,%xmm7 + pxor %xmm2,%xmm1 + movdqa %xmm15,%xmm13 + pslld $30-19,%xmm2 + pxor %xmm1,%xmm7 + pxor %xmm4,%xmm13 + paddd %xmm5,%xmm9 + pxor %xmm2,%xmm7 + + paddd %xmm5,%xmm13 + paddd %xmm7,%xmm13 + movdqa 192-128(%rax),%xmm5 + paddd 64-128(%rax),%xmm6 + + movdqa %xmm5,%xmm7 + movdqa %xmm5,%xmm1 + psrld $3,%xmm7 + movdqa %xmm5,%xmm2 + + psrld $7,%xmm1 + movdqa 144-128(%rax),%xmm0 + pslld $14,%xmm2 + pxor %xmm1,%xmm7 + psrld $18-7,%xmm1 + movdqa %xmm0,%xmm4 + pxor %xmm2,%xmm7 + pslld $25-14,%xmm2 + pxor %xmm1,%xmm7 + psrld $10,%xmm0 + movdqa %xmm4,%xmm1 + + psrld $17,%xmm4 + pxor %xmm2,%xmm7 + pslld $13,%xmm1 + paddd %xmm7,%xmm6 + pxor %xmm4,%xmm0 + psrld $19-17,%xmm4 + pxor %xmm1,%xmm0 + pslld $15-13,%xmm1 + pxor %xmm4,%xmm0 + pxor %xmm1,%xmm0 + paddd %xmm0,%xmm6 + movdqa %xmm9,%xmm7 + + movdqa %xmm9,%xmm2 + + psrld $6,%xmm7 + movdqa %xmm9,%xmm1 + pslld $7,%xmm2 + movdqa %xmm6,176-128(%rax) + paddd %xmm12,%xmm6 + + psrld $11,%xmm1 + pxor %xmm2,%xmm7 + pslld $21-7,%xmm2 + paddd -32(%rbp),%xmm6 + pxor %xmm1,%xmm7 + + psrld $25-11,%xmm1 + movdqa %xmm9,%xmm0 + + pxor %xmm2,%xmm7 + movdqa %xmm9,%xmm4 + pslld $26-21,%xmm2 + pandn %xmm11,%xmm0 + pand %xmm10,%xmm4 + pxor %xmm1,%xmm7 + + + movdqa %xmm13,%xmm1 + pxor %xmm2,%xmm7 + movdqa %xmm13,%xmm2 + psrld $2,%xmm1 + paddd %xmm7,%xmm6 + pxor %xmm4,%xmm0 + movdqa %xmm14,%xmm4 + movdqa %xmm13,%xmm7 + pslld $10,%xmm2 + pxor %xmm13,%xmm4 + + + psrld $13,%xmm7 + pxor %xmm2,%xmm1 + paddd %xmm0,%xmm6 + pslld $19-10,%xmm2 + pand %xmm4,%xmm3 + pxor %xmm7,%xmm1 + + + psrld $22-13,%xmm7 + pxor %xmm2,%xmm1 + movdqa %xmm14,%xmm12 + pslld $30-19,%xmm2 + pxor %xmm1,%xmm7 + pxor %xmm3,%xmm12 + paddd %xmm6,%xmm8 + pxor %xmm2,%xmm7 + + paddd %xmm6,%xmm12 + paddd %xmm7,%xmm12 + movdqa 208-128(%rax),%xmm6 + paddd 80-128(%rax),%xmm5 + + movdqa %xmm6,%xmm7 + movdqa %xmm6,%xmm1 + psrld $3,%xmm7 + movdqa %xmm6,%xmm2 + + psrld $7,%xmm1 + movdqa 160-128(%rax),%xmm0 + pslld $14,%xmm2 + pxor %xmm1,%xmm7 + psrld $18-7,%xmm1 + movdqa %xmm0,%xmm3 + pxor %xmm2,%xmm7 + pslld $25-14,%xmm2 + pxor %xmm1,%xmm7 + psrld $10,%xmm0 + movdqa %xmm3,%xmm1 + + psrld $17,%xmm3 + pxor %xmm2,%xmm7 + pslld $13,%xmm1 + paddd %xmm7,%xmm5 + pxor %xmm3,%xmm0 + psrld $19-17,%xmm3 + pxor %xmm1,%xmm0 + pslld $15-13,%xmm1 + pxor %xmm3,%xmm0 + pxor %xmm1,%xmm0 + paddd %xmm0,%xmm5 + movdqa %xmm8,%xmm7 + + movdqa %xmm8,%xmm2 + + psrld $6,%xmm7 + movdqa %xmm8,%xmm1 + pslld $7,%xmm2 + movdqa %xmm5,192-128(%rax) + paddd %xmm11,%xmm5 + + psrld $11,%xmm1 + pxor %xmm2,%xmm7 + pslld $21-7,%xmm2 + paddd 0(%rbp),%xmm5 + pxor %xmm1,%xmm7 + + psrld $25-11,%xmm1 + movdqa %xmm8,%xmm0 + + pxor %xmm2,%xmm7 + movdqa %xmm8,%xmm3 + pslld $26-21,%xmm2 + pandn %xmm10,%xmm0 + pand %xmm9,%xmm3 + pxor %xmm1,%xmm7 + + + movdqa %xmm12,%xmm1 + pxor %xmm2,%xmm7 + movdqa %xmm12,%xmm2 + psrld $2,%xmm1 + paddd %xmm7,%xmm5 + pxor %xmm3,%xmm0 + movdqa %xmm13,%xmm3 + movdqa %xmm12,%xmm7 + pslld $10,%xmm2 + pxor %xmm12,%xmm3 + + + psrld $13,%xmm7 + pxor %xmm2,%xmm1 + paddd %xmm0,%xmm5 + pslld $19-10,%xmm2 + pand %xmm3,%xmm4 + pxor %xmm7,%xmm1 + + + psrld $22-13,%xmm7 + pxor %xmm2,%xmm1 + movdqa %xmm13,%xmm11 + pslld $30-19,%xmm2 + pxor %xmm1,%xmm7 + pxor %xmm4,%xmm11 + paddd %xmm5,%xmm15 + pxor %xmm2,%xmm7 + + paddd %xmm5,%xmm11 + paddd %xmm7,%xmm11 + movdqa 224-128(%rax),%xmm5 + paddd 96-128(%rax),%xmm6 + + movdqa %xmm5,%xmm7 + movdqa %xmm5,%xmm1 + psrld $3,%xmm7 + movdqa %xmm5,%xmm2 + + psrld $7,%xmm1 + movdqa 176-128(%rax),%xmm0 + pslld $14,%xmm2 + pxor %xmm1,%xmm7 + psrld $18-7,%xmm1 + movdqa %xmm0,%xmm4 + pxor %xmm2,%xmm7 + pslld $25-14,%xmm2 + pxor %xmm1,%xmm7 + psrld $10,%xmm0 + movdqa %xmm4,%xmm1 + + psrld $17,%xmm4 + pxor %xmm2,%xmm7 + pslld $13,%xmm1 + paddd %xmm7,%xmm6 + pxor %xmm4,%xmm0 + psrld $19-17,%xmm4 + pxor %xmm1,%xmm0 + pslld $15-13,%xmm1 + pxor %xmm4,%xmm0 + pxor %xmm1,%xmm0 + paddd %xmm0,%xmm6 + movdqa %xmm15,%xmm7 + + movdqa %xmm15,%xmm2 + + psrld $6,%xmm7 + movdqa %xmm15,%xmm1 + pslld $7,%xmm2 + movdqa %xmm6,208-128(%rax) + paddd %xmm10,%xmm6 + + psrld $11,%xmm1 + pxor %xmm2,%xmm7 + pslld $21-7,%xmm2 + paddd 32(%rbp),%xmm6 + pxor %xmm1,%xmm7 + + psrld $25-11,%xmm1 + movdqa %xmm15,%xmm0 + + pxor %xmm2,%xmm7 + movdqa %xmm15,%xmm4 + pslld $26-21,%xmm2 + pandn %xmm9,%xmm0 + pand %xmm8,%xmm4 + pxor %xmm1,%xmm7 + + + movdqa %xmm11,%xmm1 + pxor %xmm2,%xmm7 + movdqa %xmm11,%xmm2 + psrld $2,%xmm1 + paddd %xmm7,%xmm6 + pxor %xmm4,%xmm0 + movdqa %xmm12,%xmm4 + movdqa %xmm11,%xmm7 + pslld $10,%xmm2 + pxor %xmm11,%xmm4 + + + psrld $13,%xmm7 + pxor %xmm2,%xmm1 + paddd %xmm0,%xmm6 + pslld $19-10,%xmm2 + pand %xmm4,%xmm3 + pxor %xmm7,%xmm1 + + + psrld $22-13,%xmm7 + pxor %xmm2,%xmm1 + movdqa %xmm12,%xmm10 + pslld $30-19,%xmm2 + pxor %xmm1,%xmm7 + pxor %xmm3,%xmm10 + paddd %xmm6,%xmm14 + pxor %xmm2,%xmm7 + + paddd %xmm6,%xmm10 + paddd %xmm7,%xmm10 + movdqa 240-128(%rax),%xmm6 + paddd 112-128(%rax),%xmm5 + + movdqa %xmm6,%xmm7 + movdqa %xmm6,%xmm1 + psrld $3,%xmm7 + movdqa %xmm6,%xmm2 + + psrld $7,%xmm1 + movdqa 192-128(%rax),%xmm0 + pslld $14,%xmm2 + pxor %xmm1,%xmm7 + psrld $18-7,%xmm1 + movdqa %xmm0,%xmm3 + pxor %xmm2,%xmm7 + pslld $25-14,%xmm2 + pxor %xmm1,%xmm7 + psrld $10,%xmm0 + movdqa %xmm3,%xmm1 + + psrld $17,%xmm3 + pxor %xmm2,%xmm7 + pslld $13,%xmm1 + paddd %xmm7,%xmm5 + pxor %xmm3,%xmm0 + psrld $19-17,%xmm3 + pxor %xmm1,%xmm0 + pslld $15-13,%xmm1 + pxor %xmm3,%xmm0 + pxor %xmm1,%xmm0 + paddd %xmm0,%xmm5 + movdqa %xmm14,%xmm7 + + movdqa %xmm14,%xmm2 + + psrld $6,%xmm7 + movdqa %xmm14,%xmm1 + pslld $7,%xmm2 + movdqa %xmm5,224-128(%rax) + paddd %xmm9,%xmm5 + + psrld $11,%xmm1 + pxor %xmm2,%xmm7 + pslld $21-7,%xmm2 + paddd 64(%rbp),%xmm5 + pxor %xmm1,%xmm7 + + psrld $25-11,%xmm1 + movdqa %xmm14,%xmm0 + + pxor %xmm2,%xmm7 + movdqa %xmm14,%xmm3 + pslld $26-21,%xmm2 + pandn %xmm8,%xmm0 + pand %xmm15,%xmm3 + pxor %xmm1,%xmm7 + + + movdqa %xmm10,%xmm1 + pxor %xmm2,%xmm7 + movdqa %xmm10,%xmm2 + psrld $2,%xmm1 + paddd %xmm7,%xmm5 + pxor %xmm3,%xmm0 + movdqa %xmm11,%xmm3 + movdqa %xmm10,%xmm7 + pslld $10,%xmm2 + pxor %xmm10,%xmm3 + + + psrld $13,%xmm7 + pxor %xmm2,%xmm1 + paddd %xmm0,%xmm5 + pslld $19-10,%xmm2 + pand %xmm3,%xmm4 + pxor %xmm7,%xmm1 + + + psrld $22-13,%xmm7 + pxor %xmm2,%xmm1 + movdqa %xmm11,%xmm9 + pslld $30-19,%xmm2 + pxor %xmm1,%xmm7 + pxor %xmm4,%xmm9 + paddd %xmm5,%xmm13 + pxor %xmm2,%xmm7 + + paddd %xmm5,%xmm9 + paddd %xmm7,%xmm9 + movdqa 0-128(%rax),%xmm5 + paddd 128-128(%rax),%xmm6 + + movdqa %xmm5,%xmm7 + movdqa %xmm5,%xmm1 + psrld $3,%xmm7 + movdqa %xmm5,%xmm2 + + psrld $7,%xmm1 + movdqa 208-128(%rax),%xmm0 + pslld $14,%xmm2 + pxor %xmm1,%xmm7 + psrld $18-7,%xmm1 + movdqa %xmm0,%xmm4 + pxor %xmm2,%xmm7 + pslld $25-14,%xmm2 + pxor %xmm1,%xmm7 + psrld $10,%xmm0 + movdqa %xmm4,%xmm1 + + psrld $17,%xmm4 + pxor %xmm2,%xmm7 + pslld $13,%xmm1 + paddd %xmm7,%xmm6 + pxor %xmm4,%xmm0 + psrld $19-17,%xmm4 + pxor %xmm1,%xmm0 + pslld $15-13,%xmm1 + pxor %xmm4,%xmm0 + pxor %xmm1,%xmm0 + paddd %xmm0,%xmm6 + movdqa %xmm13,%xmm7 + + movdqa %xmm13,%xmm2 + + psrld $6,%xmm7 + movdqa %xmm13,%xmm1 + pslld $7,%xmm2 + movdqa %xmm6,240-128(%rax) + paddd %xmm8,%xmm6 + + psrld $11,%xmm1 + pxor %xmm2,%xmm7 + pslld $21-7,%xmm2 + paddd 96(%rbp),%xmm6 + pxor %xmm1,%xmm7 + + psrld $25-11,%xmm1 + movdqa %xmm13,%xmm0 + + pxor %xmm2,%xmm7 + movdqa %xmm13,%xmm4 + pslld $26-21,%xmm2 + pandn %xmm15,%xmm0 + pand %xmm14,%xmm4 + pxor %xmm1,%xmm7 + + + movdqa %xmm9,%xmm1 + pxor %xmm2,%xmm7 + movdqa %xmm9,%xmm2 + psrld $2,%xmm1 + paddd %xmm7,%xmm6 + pxor %xmm4,%xmm0 + movdqa %xmm10,%xmm4 + movdqa %xmm9,%xmm7 + pslld $10,%xmm2 + pxor %xmm9,%xmm4 + + + psrld $13,%xmm7 + pxor %xmm2,%xmm1 + paddd %xmm0,%xmm6 + pslld $19-10,%xmm2 + pand %xmm4,%xmm3 + pxor %xmm7,%xmm1 + + + psrld $22-13,%xmm7 + pxor %xmm2,%xmm1 + movdqa %xmm10,%xmm8 + pslld $30-19,%xmm2 + pxor %xmm1,%xmm7 + pxor %xmm3,%xmm8 + paddd %xmm6,%xmm12 + pxor %xmm2,%xmm7 + + paddd %xmm6,%xmm8 + paddd %xmm7,%xmm8 + leaq 256(%rbp),%rbp + decl %ecx + jnz .Loop_16_xx + + movl $1,%ecx + leaq K256+128(%rip),%rbp + + movdqa (%rbx),%xmm7 + cmpl 0(%rbx),%ecx + pxor %xmm0,%xmm0 + cmovgeq %rbp,%r8 + cmpl 4(%rbx),%ecx + movdqa %xmm7,%xmm6 + cmovgeq %rbp,%r9 + cmpl 8(%rbx),%ecx + pcmpgtd %xmm0,%xmm6 + cmovgeq %rbp,%r10 + cmpl 12(%rbx),%ecx + paddd %xmm6,%xmm7 + cmovgeq %rbp,%r11 + + movdqu 0-128(%rdi),%xmm0 + pand %xmm6,%xmm8 + movdqu 32-128(%rdi),%xmm1 + pand %xmm6,%xmm9 + movdqu 64-128(%rdi),%xmm2 + pand %xmm6,%xmm10 + movdqu 96-128(%rdi),%xmm5 + pand %xmm6,%xmm11 + paddd %xmm0,%xmm8 + movdqu 128-128(%rdi),%xmm0 + pand %xmm6,%xmm12 + paddd %xmm1,%xmm9 + movdqu 160-128(%rdi),%xmm1 + pand %xmm6,%xmm13 + paddd %xmm2,%xmm10 + movdqu 192-128(%rdi),%xmm2 + pand %xmm6,%xmm14 + paddd %xmm5,%xmm11 + movdqu 224-128(%rdi),%xmm5 + pand %xmm6,%xmm15 + paddd %xmm0,%xmm12 + paddd %xmm1,%xmm13 + movdqu %xmm8,0-128(%rdi) + paddd %xmm2,%xmm14 + movdqu %xmm9,32-128(%rdi) + paddd %xmm5,%xmm15 + movdqu %xmm10,64-128(%rdi) + movdqu %xmm11,96-128(%rdi) + movdqu %xmm12,128-128(%rdi) + movdqu %xmm13,160-128(%rdi) + movdqu %xmm14,192-128(%rdi) + movdqu %xmm15,224-128(%rdi) + + movdqa %xmm7,(%rbx) + movdqa .Lpbswap(%rip),%xmm6 + decl %edx + jnz .Loop + + movl 280(%rsp),%edx + leaq 16(%rdi),%rdi + leaq 64(%rsi),%rsi + decl %edx + jnz .Loop_grande + +.Ldone: + movq 272(%rsp),%rax + movq -16(%rax),%rbp + movq -8(%rax),%rbx + leaq (%rax),%rsp +.Lepilogue: + .byte 0xf3,0xc3 +.size sha256_multi_block,.-sha256_multi_block +.type sha256_multi_block_shaext,@function +.align 32 +sha256_multi_block_shaext: +_shaext_shortcut: + movq %rsp,%rax + pushq %rbx + pushq %rbp + subq $288,%rsp + shll $1,%edx + andq $-256,%rsp + leaq 128(%rdi),%rdi + movq %rax,272(%rsp) +.Lbody_shaext: + leaq 256(%rsp),%rbx + leaq K256_shaext+128(%rip),%rbp + +.Loop_grande_shaext: + movl %edx,280(%rsp) + xorl %edx,%edx + movq 0(%rsi),%r8 + movl 8(%rsi),%ecx + cmpl %edx,%ecx + cmovgl %ecx,%edx + testl %ecx,%ecx + movl %ecx,0(%rbx) + cmovleq %rsp,%r8 + movq 16(%rsi),%r9 + movl 24(%rsi),%ecx + cmpl %edx,%ecx + cmovgl %ecx,%edx + testl %ecx,%ecx + movl %ecx,4(%rbx) + cmovleq %rsp,%r9 + testl %edx,%edx + jz .Ldone_shaext + + movq 0-128(%rdi),%xmm12 + movq 32-128(%rdi),%xmm4 + movq 64-128(%rdi),%xmm13 + movq 96-128(%rdi),%xmm5 + movq 128-128(%rdi),%xmm8 + movq 160-128(%rdi),%xmm9 + movq 192-128(%rdi),%xmm10 + movq 224-128(%rdi),%xmm11 + + punpckldq %xmm4,%xmm12 + punpckldq %xmm5,%xmm13 + punpckldq %xmm9,%xmm8 + punpckldq %xmm11,%xmm10 + movdqa K256_shaext-16(%rip),%xmm3 + + movdqa %xmm12,%xmm14 + movdqa %xmm13,%xmm15 + punpcklqdq %xmm8,%xmm12 + punpcklqdq %xmm10,%xmm13 + punpckhqdq %xmm8,%xmm14 + punpckhqdq %xmm10,%xmm15 + + pshufd $27,%xmm12,%xmm12 + pshufd $27,%xmm13,%xmm13 + pshufd $27,%xmm14,%xmm14 + pshufd $27,%xmm15,%xmm15 + jmp .Loop_shaext + +.align 32 +.Loop_shaext: + movdqu 0(%r8),%xmm4 + movdqu 0(%r9),%xmm8 + movdqu 16(%r8),%xmm5 + movdqu 16(%r9),%xmm9 + movdqu 32(%r8),%xmm6 +.byte 102,15,56,0,227 + movdqu 32(%r9),%xmm10 +.byte 102,68,15,56,0,195 + movdqu 48(%r8),%xmm7 + leaq 64(%r8),%r8 + movdqu 48(%r9),%xmm11 + leaq 64(%r9),%r9 + + movdqa 0-128(%rbp),%xmm0 +.byte 102,15,56,0,235 + paddd %xmm4,%xmm0 + pxor %xmm12,%xmm4 + movdqa %xmm0,%xmm1 + movdqa 0-128(%rbp),%xmm2 +.byte 102,68,15,56,0,203 + paddd %xmm8,%xmm2 + movdqa %xmm13,80(%rsp) +.byte 69,15,56,203,236 + pxor %xmm14,%xmm8 + movdqa %xmm2,%xmm0 + movdqa %xmm15,112(%rsp) +.byte 69,15,56,203,254 + pshufd $14,%xmm1,%xmm0 + pxor %xmm12,%xmm4 + movdqa %xmm12,64(%rsp) +.byte 69,15,56,203,229 + pshufd $14,%xmm2,%xmm0 + pxor %xmm14,%xmm8 + movdqa %xmm14,96(%rsp) + movdqa 16-128(%rbp),%xmm1 + paddd %xmm5,%xmm1 +.byte 102,15,56,0,243 +.byte 69,15,56,203,247 + + movdqa %xmm1,%xmm0 + movdqa 16-128(%rbp),%xmm2 + paddd %xmm9,%xmm2 +.byte 69,15,56,203,236 + movdqa %xmm2,%xmm0 + prefetcht0 127(%r8) +.byte 102,15,56,0,251 +.byte 102,68,15,56,0,211 + prefetcht0 127(%r9) +.byte 69,15,56,203,254 + pshufd $14,%xmm1,%xmm0 +.byte 102,68,15,56,0,219 +.byte 15,56,204,229 +.byte 69,15,56,203,229 + pshufd $14,%xmm2,%xmm0 + movdqa 32-128(%rbp),%xmm1 + paddd %xmm6,%xmm1 +.byte 69,15,56,203,247 + + movdqa %xmm1,%xmm0 + movdqa 32-128(%rbp),%xmm2 + paddd %xmm10,%xmm2 +.byte 69,15,56,203,236 +.byte 69,15,56,204,193 + movdqa %xmm2,%xmm0 + movdqa %xmm7,%xmm3 +.byte 69,15,56,203,254 + pshufd $14,%xmm1,%xmm0 +.byte 102,15,58,15,222,4 + paddd %xmm3,%xmm4 + movdqa %xmm11,%xmm3 +.byte 102,65,15,58,15,218,4 +.byte 15,56,204,238 +.byte 69,15,56,203,229 + pshufd $14,%xmm2,%xmm0 + movdqa 48-128(%rbp),%xmm1 + paddd %xmm7,%xmm1 +.byte 69,15,56,203,247 +.byte 69,15,56,204,202 + + movdqa %xmm1,%xmm0 + movdqa 48-128(%rbp),%xmm2 + paddd %xmm3,%xmm8 + paddd %xmm11,%xmm2 +.byte 15,56,205,231 +.byte 69,15,56,203,236 + movdqa %xmm2,%xmm0 + movdqa %xmm4,%xmm3 +.byte 102,15,58,15,223,4 +.byte 69,15,56,203,254 +.byte 69,15,56,205,195 + pshufd $14,%xmm1,%xmm0 + paddd %xmm3,%xmm5 + movdqa %xmm8,%xmm3 +.byte 102,65,15,58,15,219,4 +.byte 15,56,204,247 +.byte 69,15,56,203,229 + pshufd $14,%xmm2,%xmm0 + movdqa 64-128(%rbp),%xmm1 + paddd %xmm4,%xmm1 +.byte 69,15,56,203,247 +.byte 69,15,56,204,211 + movdqa %xmm1,%xmm0 + movdqa 64-128(%rbp),%xmm2 + paddd %xmm3,%xmm9 + paddd %xmm8,%xmm2 +.byte 15,56,205,236 +.byte 69,15,56,203,236 + movdqa %xmm2,%xmm0 + movdqa %xmm5,%xmm3 +.byte 102,15,58,15,220,4 +.byte 69,15,56,203,254 +.byte 69,15,56,205,200 + pshufd $14,%xmm1,%xmm0 + paddd %xmm3,%xmm6 + movdqa %xmm9,%xmm3 +.byte 102,65,15,58,15,216,4 +.byte 15,56,204,252 +.byte 69,15,56,203,229 + pshufd $14,%xmm2,%xmm0 + movdqa 80-128(%rbp),%xmm1 + paddd %xmm5,%xmm1 +.byte 69,15,56,203,247 +.byte 69,15,56,204,216 + movdqa %xmm1,%xmm0 + movdqa 80-128(%rbp),%xmm2 + paddd %xmm3,%xmm10 + paddd %xmm9,%xmm2 +.byte 15,56,205,245 +.byte 69,15,56,203,236 + movdqa %xmm2,%xmm0 + movdqa %xmm6,%xmm3 +.byte 102,15,58,15,221,4 +.byte 69,15,56,203,254 +.byte 69,15,56,205,209 + pshufd $14,%xmm1,%xmm0 + paddd %xmm3,%xmm7 + movdqa %xmm10,%xmm3 +.byte 102,65,15,58,15,217,4 +.byte 15,56,204,229 +.byte 69,15,56,203,229 + pshufd $14,%xmm2,%xmm0 + movdqa 96-128(%rbp),%xmm1 + paddd %xmm6,%xmm1 +.byte 69,15,56,203,247 +.byte 69,15,56,204,193 + movdqa %xmm1,%xmm0 + movdqa 96-128(%rbp),%xmm2 + paddd %xmm3,%xmm11 + paddd %xmm10,%xmm2 +.byte 15,56,205,254 +.byte 69,15,56,203,236 + movdqa %xmm2,%xmm0 + movdqa %xmm7,%xmm3 +.byte 102,15,58,15,222,4 +.byte 69,15,56,203,254 +.byte 69,15,56,205,218 + pshufd $14,%xmm1,%xmm0 + paddd %xmm3,%xmm4 + movdqa %xmm11,%xmm3 +.byte 102,65,15,58,15,218,4 +.byte 15,56,204,238 +.byte 69,15,56,203,229 + pshufd $14,%xmm2,%xmm0 + movdqa 112-128(%rbp),%xmm1 + paddd %xmm7,%xmm1 +.byte 69,15,56,203,247 +.byte 69,15,56,204,202 + movdqa %xmm1,%xmm0 + movdqa 112-128(%rbp),%xmm2 + paddd %xmm3,%xmm8 + paddd %xmm11,%xmm2 +.byte 15,56,205,231 +.byte 69,15,56,203,236 + movdqa %xmm2,%xmm0 + movdqa %xmm4,%xmm3 +.byte 102,15,58,15,223,4 +.byte 69,15,56,203,254 +.byte 69,15,56,205,195 + pshufd $14,%xmm1,%xmm0 + paddd %xmm3,%xmm5 + movdqa %xmm8,%xmm3 +.byte 102,65,15,58,15,219,4 +.byte 15,56,204,247 +.byte 69,15,56,203,229 + pshufd $14,%xmm2,%xmm0 + movdqa 128-128(%rbp),%xmm1 + paddd %xmm4,%xmm1 +.byte 69,15,56,203,247 +.byte 69,15,56,204,211 + movdqa %xmm1,%xmm0 + movdqa 128-128(%rbp),%xmm2 + paddd %xmm3,%xmm9 + paddd %xmm8,%xmm2 +.byte 15,56,205,236 +.byte 69,15,56,203,236 + movdqa %xmm2,%xmm0 + movdqa %xmm5,%xmm3 +.byte 102,15,58,15,220,4 +.byte 69,15,56,203,254 +.byte 69,15,56,205,200 + pshufd $14,%xmm1,%xmm0 + paddd %xmm3,%xmm6 + movdqa %xmm9,%xmm3 +.byte 102,65,15,58,15,216,4 +.byte 15,56,204,252 +.byte 69,15,56,203,229 + pshufd $14,%xmm2,%xmm0 + movdqa 144-128(%rbp),%xmm1 + paddd %xmm5,%xmm1 +.byte 69,15,56,203,247 +.byte 69,15,56,204,216 + movdqa %xmm1,%xmm0 + movdqa 144-128(%rbp),%xmm2 + paddd %xmm3,%xmm10 + paddd %xmm9,%xmm2 +.byte 15,56,205,245 +.byte 69,15,56,203,236 + movdqa %xmm2,%xmm0 + movdqa %xmm6,%xmm3 +.byte 102,15,58,15,221,4 +.byte 69,15,56,203,254 +.byte 69,15,56,205,209 + pshufd $14,%xmm1,%xmm0 + paddd %xmm3,%xmm7 + movdqa %xmm10,%xmm3 +.byte 102,65,15,58,15,217,4 +.byte 15,56,204,229 +.byte 69,15,56,203,229 + pshufd $14,%xmm2,%xmm0 + movdqa 160-128(%rbp),%xmm1 + paddd %xmm6,%xmm1 +.byte 69,15,56,203,247 +.byte 69,15,56,204,193 + movdqa %xmm1,%xmm0 + movdqa 160-128(%rbp),%xmm2 + paddd %xmm3,%xmm11 + paddd %xmm10,%xmm2 +.byte 15,56,205,254 +.byte 69,15,56,203,236 + movdqa %xmm2,%xmm0 + movdqa %xmm7,%xmm3 +.byte 102,15,58,15,222,4 +.byte 69,15,56,203,254 +.byte 69,15,56,205,218 + pshufd $14,%xmm1,%xmm0 + paddd %xmm3,%xmm4 + movdqa %xmm11,%xmm3 +.byte 102,65,15,58,15,218,4 +.byte 15,56,204,238 +.byte 69,15,56,203,229 + pshufd $14,%xmm2,%xmm0 + movdqa 176-128(%rbp),%xmm1 + paddd %xmm7,%xmm1 +.byte 69,15,56,203,247 +.byte 69,15,56,204,202 + movdqa %xmm1,%xmm0 + movdqa 176-128(%rbp),%xmm2 + paddd %xmm3,%xmm8 + paddd %xmm11,%xmm2 +.byte 15,56,205,231 +.byte 69,15,56,203,236 + movdqa %xmm2,%xmm0 + movdqa %xmm4,%xmm3 +.byte 102,15,58,15,223,4 +.byte 69,15,56,203,254 +.byte 69,15,56,205,195 + pshufd $14,%xmm1,%xmm0 + paddd %xmm3,%xmm5 + movdqa %xmm8,%xmm3 +.byte 102,65,15,58,15,219,4 +.byte 15,56,204,247 +.byte 69,15,56,203,229 + pshufd $14,%xmm2,%xmm0 + movdqa 192-128(%rbp),%xmm1 + paddd %xmm4,%xmm1 +.byte 69,15,56,203,247 +.byte 69,15,56,204,211 + movdqa %xmm1,%xmm0 + movdqa 192-128(%rbp),%xmm2 + paddd %xmm3,%xmm9 + paddd %xmm8,%xmm2 +.byte 15,56,205,236 +.byte 69,15,56,203,236 + movdqa %xmm2,%xmm0 + movdqa %xmm5,%xmm3 +.byte 102,15,58,15,220,4 +.byte 69,15,56,203,254 +.byte 69,15,56,205,200 + pshufd $14,%xmm1,%xmm0 + paddd %xmm3,%xmm6 + movdqa %xmm9,%xmm3 +.byte 102,65,15,58,15,216,4 +.byte 15,56,204,252 +.byte 69,15,56,203,229 + pshufd $14,%xmm2,%xmm0 + movdqa 208-128(%rbp),%xmm1 + paddd %xmm5,%xmm1 +.byte 69,15,56,203,247 +.byte 69,15,56,204,216 + movdqa %xmm1,%xmm0 + movdqa 208-128(%rbp),%xmm2 + paddd %xmm3,%xmm10 + paddd %xmm9,%xmm2 +.byte 15,56,205,245 +.byte 69,15,56,203,236 + movdqa %xmm2,%xmm0 + movdqa %xmm6,%xmm3 +.byte 102,15,58,15,221,4 +.byte 69,15,56,203,254 +.byte 69,15,56,205,209 + pshufd $14,%xmm1,%xmm0 + paddd %xmm3,%xmm7 + movdqa %xmm10,%xmm3 +.byte 102,65,15,58,15,217,4 + nop +.byte 69,15,56,203,229 + pshufd $14,%xmm2,%xmm0 + movdqa 224-128(%rbp),%xmm1 + paddd %xmm6,%xmm1 +.byte 69,15,56,203,247 + + movdqa %xmm1,%xmm0 + movdqa 224-128(%rbp),%xmm2 + paddd %xmm3,%xmm11 + paddd %xmm10,%xmm2 +.byte 15,56,205,254 + nop +.byte 69,15,56,203,236 + movdqa %xmm2,%xmm0 + movl $1,%ecx + pxor %xmm6,%xmm6 +.byte 69,15,56,203,254 +.byte 69,15,56,205,218 + pshufd $14,%xmm1,%xmm0 + movdqa 240-128(%rbp),%xmm1 + paddd %xmm7,%xmm1 + movq (%rbx),%xmm7 + nop +.byte 69,15,56,203,229 + pshufd $14,%xmm2,%xmm0 + movdqa 240-128(%rbp),%xmm2 + paddd %xmm11,%xmm2 +.byte 69,15,56,203,247 + + movdqa %xmm1,%xmm0 + cmpl 0(%rbx),%ecx + cmovgeq %rsp,%r8 + cmpl 4(%rbx),%ecx + cmovgeq %rsp,%r9 + pshufd $0,%xmm7,%xmm9 +.byte 69,15,56,203,236 + movdqa %xmm2,%xmm0 + pshufd $85,%xmm7,%xmm10 + movdqa %xmm7,%xmm11 +.byte 69,15,56,203,254 + pshufd $14,%xmm1,%xmm0 + pcmpgtd %xmm6,%xmm9 + pcmpgtd %xmm6,%xmm10 +.byte 69,15,56,203,229 + pshufd $14,%xmm2,%xmm0 + pcmpgtd %xmm6,%xmm11 + movdqa K256_shaext-16(%rip),%xmm3 +.byte 69,15,56,203,247 + + pand %xmm9,%xmm13 + pand %xmm10,%xmm15 + pand %xmm9,%xmm12 + pand %xmm10,%xmm14 + paddd %xmm7,%xmm11 + + paddd 80(%rsp),%xmm13 + paddd 112(%rsp),%xmm15 + paddd 64(%rsp),%xmm12 + paddd 96(%rsp),%xmm14 + + movq %xmm11,(%rbx) + decl %edx + jnz .Loop_shaext + + movl 280(%rsp),%edx + + pshufd $27,%xmm12,%xmm12 + pshufd $27,%xmm13,%xmm13 + pshufd $27,%xmm14,%xmm14 + pshufd $27,%xmm15,%xmm15 + + movdqa %xmm12,%xmm5 + movdqa %xmm13,%xmm6 + punpckldq %xmm14,%xmm12 + punpckhdq %xmm14,%xmm5 + punpckldq %xmm15,%xmm13 + punpckhdq %xmm15,%xmm6 + + movq %xmm12,0-128(%rdi) + psrldq $8,%xmm12 + movq %xmm5,128-128(%rdi) + psrldq $8,%xmm5 + movq %xmm12,32-128(%rdi) + movq %xmm5,160-128(%rdi) + + movq %xmm13,64-128(%rdi) + psrldq $8,%xmm13 + movq %xmm6,192-128(%rdi) + psrldq $8,%xmm6 + movq %xmm13,96-128(%rdi) + movq %xmm6,224-128(%rdi) + + leaq 8(%rdi),%rdi + leaq 32(%rsi),%rsi + decl %edx + jnz .Loop_grande_shaext + +.Ldone_shaext: + + movq -16(%rax),%rbp + movq -8(%rax),%rbx + leaq (%rax),%rsp +.Lepilogue_shaext: + .byte 0xf3,0xc3 +.size sha256_multi_block_shaext,.-sha256_multi_block_shaext +.align 256 +K256: +.long 1116352408,1116352408,1116352408,1116352408 +.long 1116352408,1116352408,1116352408,1116352408 +.long 1899447441,1899447441,1899447441,1899447441 +.long 1899447441,1899447441,1899447441,1899447441 +.long 3049323471,3049323471,3049323471,3049323471 +.long 3049323471,3049323471,3049323471,3049323471 +.long 3921009573,3921009573,3921009573,3921009573 +.long 3921009573,3921009573,3921009573,3921009573 +.long 961987163,961987163,961987163,961987163 +.long 961987163,961987163,961987163,961987163 +.long 1508970993,1508970993,1508970993,1508970993 +.long 1508970993,1508970993,1508970993,1508970993 +.long 2453635748,2453635748,2453635748,2453635748 +.long 2453635748,2453635748,2453635748,2453635748 +.long 2870763221,2870763221,2870763221,2870763221 +.long 2870763221,2870763221,2870763221,2870763221 +.long 3624381080,3624381080,3624381080,3624381080 +.long 3624381080,3624381080,3624381080,3624381080 +.long 310598401,310598401,310598401,310598401 +.long 310598401,310598401,310598401,310598401 +.long 607225278,607225278,607225278,607225278 +.long 607225278,607225278,607225278,607225278 +.long 1426881987,1426881987,1426881987,1426881987 +.long 1426881987,1426881987,1426881987,1426881987 +.long 1925078388,1925078388,1925078388,1925078388 +.long 1925078388,1925078388,1925078388,1925078388 +.long 2162078206,2162078206,2162078206,2162078206 +.long 2162078206,2162078206,2162078206,2162078206 +.long 2614888103,2614888103,2614888103,2614888103 +.long 2614888103,2614888103,2614888103,2614888103 +.long 3248222580,3248222580,3248222580,3248222580 +.long 3248222580,3248222580,3248222580,3248222580 +.long 3835390401,3835390401,3835390401,3835390401 +.long 3835390401,3835390401,3835390401,3835390401 +.long 4022224774,4022224774,4022224774,4022224774 +.long 4022224774,4022224774,4022224774,4022224774 +.long 264347078,264347078,264347078,264347078 +.long 264347078,264347078,264347078,264347078 +.long 604807628,604807628,604807628,604807628 +.long 604807628,604807628,604807628,604807628 +.long 770255983,770255983,770255983,770255983 +.long 770255983,770255983,770255983,770255983 +.long 1249150122,1249150122,1249150122,1249150122 +.long 1249150122,1249150122,1249150122,1249150122 +.long 1555081692,1555081692,1555081692,1555081692 +.long 1555081692,1555081692,1555081692,1555081692 +.long 1996064986,1996064986,1996064986,1996064986 +.long 1996064986,1996064986,1996064986,1996064986 +.long 2554220882,2554220882,2554220882,2554220882 +.long 2554220882,2554220882,2554220882,2554220882 +.long 2821834349,2821834349,2821834349,2821834349 +.long 2821834349,2821834349,2821834349,2821834349 +.long 2952996808,2952996808,2952996808,2952996808 +.long 2952996808,2952996808,2952996808,2952996808 +.long 3210313671,3210313671,3210313671,3210313671 +.long 3210313671,3210313671,3210313671,3210313671 +.long 3336571891,3336571891,3336571891,3336571891 +.long 3336571891,3336571891,3336571891,3336571891 +.long 3584528711,3584528711,3584528711,3584528711 +.long 3584528711,3584528711,3584528711,3584528711 +.long 113926993,113926993,113926993,113926993 +.long 113926993,113926993,113926993,113926993 +.long 338241895,338241895,338241895,338241895 +.long 338241895,338241895,338241895,338241895 +.long 666307205,666307205,666307205,666307205 +.long 666307205,666307205,666307205,666307205 +.long 773529912,773529912,773529912,773529912 +.long 773529912,773529912,773529912,773529912 +.long 1294757372,1294757372,1294757372,1294757372 +.long 1294757372,1294757372,1294757372,1294757372 +.long 1396182291,1396182291,1396182291,1396182291 +.long 1396182291,1396182291,1396182291,1396182291 +.long 1695183700,1695183700,1695183700,1695183700 +.long 1695183700,1695183700,1695183700,1695183700 +.long 1986661051,1986661051,1986661051,1986661051 +.long 1986661051,1986661051,1986661051,1986661051 +.long 2177026350,2177026350,2177026350,2177026350 +.long 2177026350,2177026350,2177026350,2177026350 +.long 2456956037,2456956037,2456956037,2456956037 +.long 2456956037,2456956037,2456956037,2456956037 +.long 2730485921,2730485921,2730485921,2730485921 +.long 2730485921,2730485921,2730485921,2730485921 +.long 2820302411,2820302411,2820302411,2820302411 +.long 2820302411,2820302411,2820302411,2820302411 +.long 3259730800,3259730800,3259730800,3259730800 +.long 3259730800,3259730800,3259730800,3259730800 +.long 3345764771,3345764771,3345764771,3345764771 +.long 3345764771,3345764771,3345764771,3345764771 +.long 3516065817,3516065817,3516065817,3516065817 +.long 3516065817,3516065817,3516065817,3516065817 +.long 3600352804,3600352804,3600352804,3600352804 +.long 3600352804,3600352804,3600352804,3600352804 +.long 4094571909,4094571909,4094571909,4094571909 +.long 4094571909,4094571909,4094571909,4094571909 +.long 275423344,275423344,275423344,275423344 +.long 275423344,275423344,275423344,275423344 +.long 430227734,430227734,430227734,430227734 +.long 430227734,430227734,430227734,430227734 +.long 506948616,506948616,506948616,506948616 +.long 506948616,506948616,506948616,506948616 +.long 659060556,659060556,659060556,659060556 +.long 659060556,659060556,659060556,659060556 +.long 883997877,883997877,883997877,883997877 +.long 883997877,883997877,883997877,883997877 +.long 958139571,958139571,958139571,958139571 +.long 958139571,958139571,958139571,958139571 +.long 1322822218,1322822218,1322822218,1322822218 +.long 1322822218,1322822218,1322822218,1322822218 +.long 1537002063,1537002063,1537002063,1537002063 +.long 1537002063,1537002063,1537002063,1537002063 +.long 1747873779,1747873779,1747873779,1747873779 +.long 1747873779,1747873779,1747873779,1747873779 +.long 1955562222,1955562222,1955562222,1955562222 +.long 1955562222,1955562222,1955562222,1955562222 +.long 2024104815,2024104815,2024104815,2024104815 +.long 2024104815,2024104815,2024104815,2024104815 +.long 2227730452,2227730452,2227730452,2227730452 +.long 2227730452,2227730452,2227730452,2227730452 +.long 2361852424,2361852424,2361852424,2361852424 +.long 2361852424,2361852424,2361852424,2361852424 +.long 2428436474,2428436474,2428436474,2428436474 +.long 2428436474,2428436474,2428436474,2428436474 +.long 2756734187,2756734187,2756734187,2756734187 +.long 2756734187,2756734187,2756734187,2756734187 +.long 3204031479,3204031479,3204031479,3204031479 +.long 3204031479,3204031479,3204031479,3204031479 +.long 3329325298,3329325298,3329325298,3329325298 +.long 3329325298,3329325298,3329325298,3329325298 +.Lpbswap: +.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f +.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f +K256_shaext: +.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 +.byte 83,72,65,50,53,54,32,109,117,108,116,105,45,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 diff --git a/secure/lib/libcrypto/amd64/sha256-x86_64.S b/secure/lib/libcrypto/amd64/sha256-x86_64.S index 79e06b4..a43a668 100644 --- a/secure/lib/libcrypto/amd64/sha256-x86_64.S +++ b/secure/lib/libcrypto/amd64/sha256-x86_64.S @@ -1,10 +1,19 @@ # $FreeBSD$ .text + .globl sha256_block_data_order .type sha256_block_data_order,@function .align 16 sha256_block_data_order: + leaq OPENSSL_ia32cap_P(%rip),%r11 + movl 0(%r11),%r9d + movl 4(%r11),%r10d + movl 8(%r11),%r11d + testl $536870912,%r11d + jnz _shaext_shortcut + testl $512,%r10d + jnz .Lssse3_shortcut pushq %rbx pushq %rbp pushq %r12 @@ -22,8 +31,6 @@ sha256_block_data_order: movq %r11,64+24(%rsp) .Lprologue: - leaq K256(%rip),%rbp - movl 0(%rdi),%eax movl 4(%rdi),%ebx movl 8(%rdi),%ecx @@ -36,1694 +43,1632 @@ sha256_block_data_order: .align 16 .Lloop: - xorq %rdi,%rdi + movl %ebx,%edi + leaq K256(%rip),%rbp + xorl %ecx,%edi movl 0(%rsi),%r12d movl %r8d,%r13d movl %eax,%r14d bswapl %r12d rorl $14,%r13d movl %r9d,%r15d - movl %r12d,0(%rsp) - rorl $9,%r14d xorl %r8d,%r13d + rorl $9,%r14d xorl %r10d,%r15d - rorl $5,%r13d - addl %r11d,%r12d + movl %r12d,0(%rsp) xorl %eax,%r14d - - addl (%rbp,%rdi,4),%r12d andl %r8d,%r15d - movl %ebx,%r11d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d rorl $11,%r14d xorl %r8d,%r13d - xorl %r10d,%r15d + addl %r15d,%r12d - xorl %ecx,%r11d + movl %eax,%r15d + addl (%rbp),%r12d xorl %eax,%r14d - addl %r15d,%r12d - movl %ebx,%r15d + xorl %ebx,%r15d rorl $6,%r13d - andl %eax,%r11d - andl %ecx,%r15d + movl %ebx,%r11d + andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%r11d + xorl %edi,%r11d addl %r12d,%edx addl %r12d,%r11d - leaq 1(%rdi),%rdi - addl %r14d,%r11d + leaq 4(%rbp),%rbp + addl %r14d,%r11d movl 4(%rsi),%r12d movl %edx,%r13d movl %r11d,%r14d bswapl %r12d rorl $14,%r13d - movl %r8d,%r15d - movl %r12d,4(%rsp) + movl %r8d,%edi - rorl $9,%r14d xorl %edx,%r13d - xorl %r9d,%r15d + rorl $9,%r14d + xorl %r9d,%edi - rorl $5,%r13d - addl %r10d,%r12d + movl %r12d,4(%rsp) xorl %r11d,%r14d + andl %edx,%edi - addl (%rbp,%rdi,4),%r12d - andl %edx,%r15d - movl %eax,%r10d + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi rorl $11,%r14d xorl %edx,%r13d - xorl %r9d,%r15d + addl %edi,%r12d - xorl %ebx,%r10d + movl %r11d,%edi + addl (%rbp),%r12d xorl %r11d,%r14d - addl %r15d,%r12d - movl %eax,%r15d + xorl %eax,%edi rorl $6,%r13d - andl %r11d,%r10d - andl %ebx,%r15d + movl %eax,%r10d + andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%r10d + xorl %r15d,%r10d addl %r12d,%ecx addl %r12d,%r10d - leaq 1(%rdi),%rdi - addl %r14d,%r10d + leaq 4(%rbp),%rbp + addl %r14d,%r10d movl 8(%rsi),%r12d movl %ecx,%r13d movl %r10d,%r14d bswapl %r12d rorl $14,%r13d movl %edx,%r15d - movl %r12d,8(%rsp) - rorl $9,%r14d xorl %ecx,%r13d + rorl $9,%r14d xorl %r8d,%r15d - rorl $5,%r13d - addl %r9d,%r12d + movl %r12d,8(%rsp) xorl %r10d,%r14d - - addl (%rbp,%rdi,4),%r12d andl %ecx,%r15d - movl %r11d,%r9d + + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d rorl $11,%r14d xorl %ecx,%r13d - xorl %r8d,%r15d + addl %r15d,%r12d - xorl %eax,%r9d + movl %r10d,%r15d + addl (%rbp),%r12d xorl %r10d,%r14d - addl %r15d,%r12d - movl %r11d,%r15d + xorl %r11d,%r15d rorl $6,%r13d - andl %r10d,%r9d - andl %eax,%r15d + movl %r11d,%r9d + andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%r9d + xorl %edi,%r9d addl %r12d,%ebx addl %r12d,%r9d - leaq 1(%rdi),%rdi - addl %r14d,%r9d + leaq 4(%rbp),%rbp + addl %r14d,%r9d movl 12(%rsi),%r12d movl %ebx,%r13d movl %r9d,%r14d bswapl %r12d rorl $14,%r13d - movl %ecx,%r15d - movl %r12d,12(%rsp) + movl %ecx,%edi - rorl $9,%r14d xorl %ebx,%r13d - xorl %edx,%r15d + rorl $9,%r14d + xorl %edx,%edi - rorl $5,%r13d - addl %r8d,%r12d + movl %r12d,12(%rsp) xorl %r9d,%r14d + andl %ebx,%edi - addl (%rbp,%rdi,4),%r12d - andl %ebx,%r15d - movl %r10d,%r8d + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi rorl $11,%r14d xorl %ebx,%r13d - xorl %edx,%r15d + addl %edi,%r12d - xorl %r11d,%r8d + movl %r9d,%edi + addl (%rbp),%r12d xorl %r9d,%r14d - addl %r15d,%r12d - movl %r10d,%r15d + xorl %r10d,%edi rorl $6,%r13d - andl %r9d,%r8d - andl %r11d,%r15d + movl %r10d,%r8d + andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%r8d + xorl %r15d,%r8d addl %r12d,%eax addl %r12d,%r8d - leaq 1(%rdi),%rdi - addl %r14d,%r8d + leaq 20(%rbp),%rbp + addl %r14d,%r8d movl 16(%rsi),%r12d movl %eax,%r13d movl %r8d,%r14d bswapl %r12d rorl $14,%r13d movl %ebx,%r15d - movl %r12d,16(%rsp) - rorl $9,%r14d xorl %eax,%r13d + rorl $9,%r14d xorl %ecx,%r15d - rorl $5,%r13d - addl %edx,%r12d + movl %r12d,16(%rsp) xorl %r8d,%r14d - - addl (%rbp,%rdi,4),%r12d andl %eax,%r15d - movl %r9d,%edx + + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d rorl $11,%r14d xorl %eax,%r13d - xorl %ecx,%r15d + addl %r15d,%r12d - xorl %r10d,%edx + movl %r8d,%r15d + addl (%rbp),%r12d xorl %r8d,%r14d - addl %r15d,%r12d - movl %r9d,%r15d + xorl %r9d,%r15d rorl $6,%r13d - andl %r8d,%edx - andl %r10d,%r15d + movl %r9d,%edx + andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%edx + xorl %edi,%edx addl %r12d,%r11d addl %r12d,%edx - leaq 1(%rdi),%rdi - addl %r14d,%edx + leaq 4(%rbp),%rbp + addl %r14d,%edx movl 20(%rsi),%r12d movl %r11d,%r13d movl %edx,%r14d bswapl %r12d rorl $14,%r13d - movl %eax,%r15d - movl %r12d,20(%rsp) + movl %eax,%edi - rorl $9,%r14d xorl %r11d,%r13d - xorl %ebx,%r15d + rorl $9,%r14d + xorl %ebx,%edi - rorl $5,%r13d - addl %ecx,%r12d + movl %r12d,20(%rsp) xorl %edx,%r14d + andl %r11d,%edi - addl (%rbp,%rdi,4),%r12d - andl %r11d,%r15d - movl %r8d,%ecx + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi rorl $11,%r14d xorl %r11d,%r13d - xorl %ebx,%r15d + addl %edi,%r12d - xorl %r9d,%ecx + movl %edx,%edi + addl (%rbp),%r12d xorl %edx,%r14d - addl %r15d,%r12d - movl %r8d,%r15d + xorl %r8d,%edi rorl $6,%r13d - andl %edx,%ecx - andl %r9d,%r15d + movl %r8d,%ecx + andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%ecx + xorl %r15d,%ecx addl %r12d,%r10d addl %r12d,%ecx - leaq 1(%rdi),%rdi - addl %r14d,%ecx + leaq 4(%rbp),%rbp + addl %r14d,%ecx movl 24(%rsi),%r12d movl %r10d,%r13d movl %ecx,%r14d bswapl %r12d rorl $14,%r13d movl %r11d,%r15d - movl %r12d,24(%rsp) - rorl $9,%r14d xorl %r10d,%r13d + rorl $9,%r14d xorl %eax,%r15d - rorl $5,%r13d - addl %ebx,%r12d + movl %r12d,24(%rsp) xorl %ecx,%r14d - - addl (%rbp,%rdi,4),%r12d andl %r10d,%r15d - movl %edx,%ebx + + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d rorl $11,%r14d xorl %r10d,%r13d - xorl %eax,%r15d + addl %r15d,%r12d - xorl %r8d,%ebx + movl %ecx,%r15d + addl (%rbp),%r12d xorl %ecx,%r14d - addl %r15d,%r12d - movl %edx,%r15d + xorl %edx,%r15d rorl $6,%r13d - andl %ecx,%ebx - andl %r8d,%r15d + movl %edx,%ebx + andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%ebx + xorl %edi,%ebx addl %r12d,%r9d addl %r12d,%ebx - leaq 1(%rdi),%rdi - addl %r14d,%ebx + leaq 4(%rbp),%rbp + addl %r14d,%ebx movl 28(%rsi),%r12d movl %r9d,%r13d movl %ebx,%r14d bswapl %r12d rorl $14,%r13d - movl %r10d,%r15d - movl %r12d,28(%rsp) + movl %r10d,%edi - rorl $9,%r14d xorl %r9d,%r13d - xorl %r11d,%r15d + rorl $9,%r14d + xorl %r11d,%edi - rorl $5,%r13d - addl %eax,%r12d + movl %r12d,28(%rsp) xorl %ebx,%r14d + andl %r9d,%edi - addl (%rbp,%rdi,4),%r12d - andl %r9d,%r15d - movl %ecx,%eax + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi rorl $11,%r14d xorl %r9d,%r13d - xorl %r11d,%r15d + addl %edi,%r12d - xorl %edx,%eax + movl %ebx,%edi + addl (%rbp),%r12d xorl %ebx,%r14d - addl %r15d,%r12d - movl %ecx,%r15d + xorl %ecx,%edi rorl $6,%r13d - andl %ebx,%eax - andl %edx,%r15d + movl %ecx,%eax + andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%eax + xorl %r15d,%eax addl %r12d,%r8d addl %r12d,%eax - leaq 1(%rdi),%rdi - addl %r14d,%eax + leaq 20(%rbp),%rbp + addl %r14d,%eax movl 32(%rsi),%r12d movl %r8d,%r13d movl %eax,%r14d bswapl %r12d rorl $14,%r13d movl %r9d,%r15d - movl %r12d,32(%rsp) - rorl $9,%r14d xorl %r8d,%r13d + rorl $9,%r14d xorl %r10d,%r15d - rorl $5,%r13d - addl %r11d,%r12d + movl %r12d,32(%rsp) xorl %eax,%r14d - - addl (%rbp,%rdi,4),%r12d andl %r8d,%r15d - movl %ebx,%r11d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d rorl $11,%r14d xorl %r8d,%r13d - xorl %r10d,%r15d + addl %r15d,%r12d - xorl %ecx,%r11d + movl %eax,%r15d + addl (%rbp),%r12d xorl %eax,%r14d - addl %r15d,%r12d - movl %ebx,%r15d + xorl %ebx,%r15d rorl $6,%r13d - andl %eax,%r11d - andl %ecx,%r15d + movl %ebx,%r11d + andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%r11d + xorl %edi,%r11d addl %r12d,%edx addl %r12d,%r11d - leaq 1(%rdi),%rdi - addl %r14d,%r11d + leaq 4(%rbp),%rbp + addl %r14d,%r11d movl 36(%rsi),%r12d movl %edx,%r13d movl %r11d,%r14d bswapl %r12d rorl $14,%r13d - movl %r8d,%r15d - movl %r12d,36(%rsp) + movl %r8d,%edi - rorl $9,%r14d xorl %edx,%r13d - xorl %r9d,%r15d + rorl $9,%r14d + xorl %r9d,%edi - rorl $5,%r13d - addl %r10d,%r12d + movl %r12d,36(%rsp) xorl %r11d,%r14d + andl %edx,%edi - addl (%rbp,%rdi,4),%r12d - andl %edx,%r15d - movl %eax,%r10d + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi rorl $11,%r14d xorl %edx,%r13d - xorl %r9d,%r15d + addl %edi,%r12d - xorl %ebx,%r10d + movl %r11d,%edi + addl (%rbp),%r12d xorl %r11d,%r14d - addl %r15d,%r12d - movl %eax,%r15d + xorl %eax,%edi rorl $6,%r13d - andl %r11d,%r10d - andl %ebx,%r15d + movl %eax,%r10d + andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%r10d + xorl %r15d,%r10d addl %r12d,%ecx addl %r12d,%r10d - leaq 1(%rdi),%rdi - addl %r14d,%r10d + leaq 4(%rbp),%rbp + addl %r14d,%r10d movl 40(%rsi),%r12d movl %ecx,%r13d movl %r10d,%r14d bswapl %r12d rorl $14,%r13d movl %edx,%r15d - movl %r12d,40(%rsp) - rorl $9,%r14d xorl %ecx,%r13d + rorl $9,%r14d xorl %r8d,%r15d - rorl $5,%r13d - addl %r9d,%r12d + movl %r12d,40(%rsp) xorl %r10d,%r14d - - addl (%rbp,%rdi,4),%r12d andl %ecx,%r15d - movl %r11d,%r9d + + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d rorl $11,%r14d xorl %ecx,%r13d - xorl %r8d,%r15d + addl %r15d,%r12d - xorl %eax,%r9d + movl %r10d,%r15d + addl (%rbp),%r12d xorl %r10d,%r14d - addl %r15d,%r12d - movl %r11d,%r15d + xorl %r11d,%r15d rorl $6,%r13d - andl %r10d,%r9d - andl %eax,%r15d + movl %r11d,%r9d + andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%r9d + xorl %edi,%r9d addl %r12d,%ebx addl %r12d,%r9d - leaq 1(%rdi),%rdi - addl %r14d,%r9d + leaq 4(%rbp),%rbp + addl %r14d,%r9d movl 44(%rsi),%r12d movl %ebx,%r13d movl %r9d,%r14d bswapl %r12d rorl $14,%r13d - movl %ecx,%r15d - movl %r12d,44(%rsp) + movl %ecx,%edi - rorl $9,%r14d xorl %ebx,%r13d - xorl %edx,%r15d + rorl $9,%r14d + xorl %edx,%edi - rorl $5,%r13d - addl %r8d,%r12d + movl %r12d,44(%rsp) xorl %r9d,%r14d + andl %ebx,%edi - addl (%rbp,%rdi,4),%r12d - andl %ebx,%r15d - movl %r10d,%r8d + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi rorl $11,%r14d xorl %ebx,%r13d - xorl %edx,%r15d + addl %edi,%r12d - xorl %r11d,%r8d + movl %r9d,%edi + addl (%rbp),%r12d xorl %r9d,%r14d - addl %r15d,%r12d - movl %r10d,%r15d + xorl %r10d,%edi rorl $6,%r13d - andl %r9d,%r8d - andl %r11d,%r15d + movl %r10d,%r8d + andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%r8d + xorl %r15d,%r8d addl %r12d,%eax addl %r12d,%r8d - leaq 1(%rdi),%rdi - addl %r14d,%r8d + leaq 20(%rbp),%rbp + addl %r14d,%r8d movl 48(%rsi),%r12d movl %eax,%r13d movl %r8d,%r14d bswapl %r12d rorl $14,%r13d movl %ebx,%r15d - movl %r12d,48(%rsp) - rorl $9,%r14d xorl %eax,%r13d + rorl $9,%r14d xorl %ecx,%r15d - rorl $5,%r13d - addl %edx,%r12d + movl %r12d,48(%rsp) xorl %r8d,%r14d - - addl (%rbp,%rdi,4),%r12d andl %eax,%r15d - movl %r9d,%edx + + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d rorl $11,%r14d xorl %eax,%r13d - xorl %ecx,%r15d + addl %r15d,%r12d - xorl %r10d,%edx + movl %r8d,%r15d + addl (%rbp),%r12d xorl %r8d,%r14d - addl %r15d,%r12d - movl %r9d,%r15d + xorl %r9d,%r15d rorl $6,%r13d - andl %r8d,%edx - andl %r10d,%r15d + movl %r9d,%edx + andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%edx + xorl %edi,%edx addl %r12d,%r11d addl %r12d,%edx - leaq 1(%rdi),%rdi - addl %r14d,%edx + leaq 4(%rbp),%rbp + addl %r14d,%edx movl 52(%rsi),%r12d movl %r11d,%r13d movl %edx,%r14d bswapl %r12d rorl $14,%r13d - movl %eax,%r15d - movl %r12d,52(%rsp) + movl %eax,%edi - rorl $9,%r14d xorl %r11d,%r13d - xorl %ebx,%r15d + rorl $9,%r14d + xorl %ebx,%edi - rorl $5,%r13d - addl %ecx,%r12d + movl %r12d,52(%rsp) xorl %edx,%r14d + andl %r11d,%edi - addl (%rbp,%rdi,4),%r12d - andl %r11d,%r15d - movl %r8d,%ecx + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi rorl $11,%r14d xorl %r11d,%r13d - xorl %ebx,%r15d + addl %edi,%r12d - xorl %r9d,%ecx + movl %edx,%edi + addl (%rbp),%r12d xorl %edx,%r14d - addl %r15d,%r12d - movl %r8d,%r15d + xorl %r8d,%edi rorl $6,%r13d - andl %edx,%ecx - andl %r9d,%r15d + movl %r8d,%ecx + andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%ecx + xorl %r15d,%ecx addl %r12d,%r10d addl %r12d,%ecx - leaq 1(%rdi),%rdi - addl %r14d,%ecx + leaq 4(%rbp),%rbp + addl %r14d,%ecx movl 56(%rsi),%r12d movl %r10d,%r13d movl %ecx,%r14d bswapl %r12d rorl $14,%r13d movl %r11d,%r15d - movl %r12d,56(%rsp) - rorl $9,%r14d xorl %r10d,%r13d + rorl $9,%r14d xorl %eax,%r15d - rorl $5,%r13d - addl %ebx,%r12d + movl %r12d,56(%rsp) xorl %ecx,%r14d - - addl (%rbp,%rdi,4),%r12d andl %r10d,%r15d - movl %edx,%ebx + + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d rorl $11,%r14d xorl %r10d,%r13d - xorl %eax,%r15d + addl %r15d,%r12d - xorl %r8d,%ebx + movl %ecx,%r15d + addl (%rbp),%r12d xorl %ecx,%r14d - addl %r15d,%r12d - movl %edx,%r15d + xorl %edx,%r15d rorl $6,%r13d - andl %ecx,%ebx - andl %r8d,%r15d + movl %edx,%ebx + andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%ebx + xorl %edi,%ebx addl %r12d,%r9d addl %r12d,%ebx - leaq 1(%rdi),%rdi - addl %r14d,%ebx + leaq 4(%rbp),%rbp + addl %r14d,%ebx movl 60(%rsi),%r12d movl %r9d,%r13d movl %ebx,%r14d bswapl %r12d rorl $14,%r13d - movl %r10d,%r15d - movl %r12d,60(%rsp) + movl %r10d,%edi - rorl $9,%r14d xorl %r9d,%r13d - xorl %r11d,%r15d + rorl $9,%r14d + xorl %r11d,%edi - rorl $5,%r13d - addl %eax,%r12d + movl %r12d,60(%rsp) xorl %ebx,%r14d + andl %r9d,%edi - addl (%rbp,%rdi,4),%r12d - andl %r9d,%r15d - movl %ecx,%eax + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi rorl $11,%r14d xorl %r9d,%r13d - xorl %r11d,%r15d + addl %edi,%r12d - xorl %edx,%eax + movl %ebx,%edi + addl (%rbp),%r12d xorl %ebx,%r14d - addl %r15d,%r12d - movl %ecx,%r15d + xorl %ecx,%edi rorl $6,%r13d - andl %ebx,%eax - andl %edx,%r15d + movl %ecx,%eax + andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%eax + xorl %r15d,%eax addl %r12d,%r8d addl %r12d,%eax - leaq 1(%rdi),%rdi - addl %r14d,%eax + leaq 20(%rbp),%rbp jmp .Lrounds_16_xx .align 16 .Lrounds_16_xx: movl 4(%rsp),%r13d - movl 56(%rsp),%r14d - movl %r13d,%r12d - movl %r14d,%r15d + movl 56(%rsp),%r15d - rorl $11,%r12d - xorl %r13d,%r12d - shrl $3,%r13d + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%eax + movl %r15d,%r14d + rorl $2,%r15d - rorl $7,%r12d xorl %r12d,%r13d - movl 36(%rsp),%r12d - - rorl $2,%r15d + shrl $3,%r12d + rorl $7,%r13d xorl %r14d,%r15d shrl $10,%r14d rorl $17,%r15d - addl %r13d,%r12d - xorl %r15d,%r14d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 36(%rsp),%r12d addl 0(%rsp),%r12d movl %r8d,%r13d - addl %r14d,%r12d + addl %r15d,%r12d movl %eax,%r14d rorl $14,%r13d movl %r9d,%r15d - movl %r12d,0(%rsp) - rorl $9,%r14d xorl %r8d,%r13d + rorl $9,%r14d xorl %r10d,%r15d - rorl $5,%r13d - addl %r11d,%r12d + movl %r12d,0(%rsp) xorl %eax,%r14d - - addl (%rbp,%rdi,4),%r12d andl %r8d,%r15d - movl %ebx,%r11d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d rorl $11,%r14d xorl %r8d,%r13d - xorl %r10d,%r15d + addl %r15d,%r12d - xorl %ecx,%r11d + movl %eax,%r15d + addl (%rbp),%r12d xorl %eax,%r14d - addl %r15d,%r12d - movl %ebx,%r15d + xorl %ebx,%r15d rorl $6,%r13d - andl %eax,%r11d - andl %ecx,%r15d + movl %ebx,%r11d + andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%r11d + xorl %edi,%r11d addl %r12d,%edx addl %r12d,%r11d - leaq 1(%rdi),%rdi - addl %r14d,%r11d + leaq 4(%rbp),%rbp movl 8(%rsp),%r13d - movl 60(%rsp),%r14d - movl %r13d,%r12d - movl %r14d,%r15d + movl 60(%rsp),%edi - rorl $11,%r12d - xorl %r13d,%r12d - shrl $3,%r13d + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r11d + movl %edi,%r14d + rorl $2,%edi - rorl $7,%r12d xorl %r12d,%r13d - movl 40(%rsp),%r12d - - rorl $2,%r15d - xorl %r14d,%r15d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi shrl $10,%r14d - rorl $17,%r15d - addl %r13d,%r12d - xorl %r15d,%r14d + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 40(%rsp),%r12d addl 4(%rsp),%r12d movl %edx,%r13d - addl %r14d,%r12d + addl %edi,%r12d movl %r11d,%r14d rorl $14,%r13d - movl %r8d,%r15d - movl %r12d,4(%rsp) + movl %r8d,%edi - rorl $9,%r14d xorl %edx,%r13d - xorl %r9d,%r15d + rorl $9,%r14d + xorl %r9d,%edi - rorl $5,%r13d - addl %r10d,%r12d + movl %r12d,4(%rsp) xorl %r11d,%r14d + andl %edx,%edi - addl (%rbp,%rdi,4),%r12d - andl %edx,%r15d - movl %eax,%r10d + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi rorl $11,%r14d xorl %edx,%r13d - xorl %r9d,%r15d + addl %edi,%r12d - xorl %ebx,%r10d + movl %r11d,%edi + addl (%rbp),%r12d xorl %r11d,%r14d - addl %r15d,%r12d - movl %eax,%r15d + xorl %eax,%edi rorl $6,%r13d - andl %r11d,%r10d - andl %ebx,%r15d + movl %eax,%r10d + andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%r10d + xorl %r15d,%r10d addl %r12d,%ecx addl %r12d,%r10d - leaq 1(%rdi),%rdi - addl %r14d,%r10d + leaq 4(%rbp),%rbp movl 12(%rsp),%r13d - movl 0(%rsp),%r14d - movl %r13d,%r12d - movl %r14d,%r15d + movl 0(%rsp),%r15d - rorl $11,%r12d - xorl %r13d,%r12d - shrl $3,%r13d + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r10d + movl %r15d,%r14d + rorl $2,%r15d - rorl $7,%r12d xorl %r12d,%r13d - movl 44(%rsp),%r12d - - rorl $2,%r15d + shrl $3,%r12d + rorl $7,%r13d xorl %r14d,%r15d shrl $10,%r14d rorl $17,%r15d - addl %r13d,%r12d - xorl %r15d,%r14d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 44(%rsp),%r12d addl 8(%rsp),%r12d movl %ecx,%r13d - addl %r14d,%r12d + addl %r15d,%r12d movl %r10d,%r14d rorl $14,%r13d movl %edx,%r15d - movl %r12d,8(%rsp) - rorl $9,%r14d xorl %ecx,%r13d + rorl $9,%r14d xorl %r8d,%r15d - rorl $5,%r13d - addl %r9d,%r12d + movl %r12d,8(%rsp) xorl %r10d,%r14d - - addl (%rbp,%rdi,4),%r12d andl %ecx,%r15d - movl %r11d,%r9d + + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d rorl $11,%r14d xorl %ecx,%r13d - xorl %r8d,%r15d + addl %r15d,%r12d - xorl %eax,%r9d + movl %r10d,%r15d + addl (%rbp),%r12d xorl %r10d,%r14d - addl %r15d,%r12d - movl %r11d,%r15d + xorl %r11d,%r15d rorl $6,%r13d - andl %r10d,%r9d - andl %eax,%r15d + movl %r11d,%r9d + andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%r9d + xorl %edi,%r9d addl %r12d,%ebx addl %r12d,%r9d - leaq 1(%rdi),%rdi - addl %r14d,%r9d + leaq 4(%rbp),%rbp movl 16(%rsp),%r13d - movl 4(%rsp),%r14d - movl %r13d,%r12d - movl %r14d,%r15d + movl 4(%rsp),%edi - rorl $11,%r12d - xorl %r13d,%r12d - shrl $3,%r13d + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r9d + movl %edi,%r14d + rorl $2,%edi - rorl $7,%r12d xorl %r12d,%r13d - movl 48(%rsp),%r12d - - rorl $2,%r15d - xorl %r14d,%r15d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi shrl $10,%r14d - rorl $17,%r15d - addl %r13d,%r12d - xorl %r15d,%r14d + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 48(%rsp),%r12d addl 12(%rsp),%r12d movl %ebx,%r13d - addl %r14d,%r12d + addl %edi,%r12d movl %r9d,%r14d rorl $14,%r13d - movl %ecx,%r15d - movl %r12d,12(%rsp) + movl %ecx,%edi - rorl $9,%r14d xorl %ebx,%r13d - xorl %edx,%r15d + rorl $9,%r14d + xorl %edx,%edi - rorl $5,%r13d - addl %r8d,%r12d + movl %r12d,12(%rsp) xorl %r9d,%r14d + andl %ebx,%edi - addl (%rbp,%rdi,4),%r12d - andl %ebx,%r15d - movl %r10d,%r8d + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi rorl $11,%r14d xorl %ebx,%r13d - xorl %edx,%r15d + addl %edi,%r12d - xorl %r11d,%r8d + movl %r9d,%edi + addl (%rbp),%r12d xorl %r9d,%r14d - addl %r15d,%r12d - movl %r10d,%r15d + xorl %r10d,%edi rorl $6,%r13d - andl %r9d,%r8d - andl %r11d,%r15d + movl %r10d,%r8d + andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%r8d + xorl %r15d,%r8d addl %r12d,%eax addl %r12d,%r8d - leaq 1(%rdi),%rdi - addl %r14d,%r8d + leaq 20(%rbp),%rbp movl 20(%rsp),%r13d - movl 8(%rsp),%r14d - movl %r13d,%r12d - movl %r14d,%r15d + movl 8(%rsp),%r15d - rorl $11,%r12d - xorl %r13d,%r12d - shrl $3,%r13d + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r8d + movl %r15d,%r14d + rorl $2,%r15d - rorl $7,%r12d xorl %r12d,%r13d - movl 52(%rsp),%r12d - - rorl $2,%r15d + shrl $3,%r12d + rorl $7,%r13d xorl %r14d,%r15d shrl $10,%r14d rorl $17,%r15d - addl %r13d,%r12d - xorl %r15d,%r14d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 52(%rsp),%r12d addl 16(%rsp),%r12d movl %eax,%r13d - addl %r14d,%r12d + addl %r15d,%r12d movl %r8d,%r14d rorl $14,%r13d movl %ebx,%r15d - movl %r12d,16(%rsp) - rorl $9,%r14d xorl %eax,%r13d + rorl $9,%r14d xorl %ecx,%r15d - rorl $5,%r13d - addl %edx,%r12d + movl %r12d,16(%rsp) xorl %r8d,%r14d - - addl (%rbp,%rdi,4),%r12d andl %eax,%r15d - movl %r9d,%edx + + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d rorl $11,%r14d xorl %eax,%r13d - xorl %ecx,%r15d + addl %r15d,%r12d - xorl %r10d,%edx + movl %r8d,%r15d + addl (%rbp),%r12d xorl %r8d,%r14d - addl %r15d,%r12d - movl %r9d,%r15d + xorl %r9d,%r15d rorl $6,%r13d - andl %r8d,%edx - andl %r10d,%r15d + movl %r9d,%edx + andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%edx + xorl %edi,%edx addl %r12d,%r11d addl %r12d,%edx - leaq 1(%rdi),%rdi - addl %r14d,%edx + leaq 4(%rbp),%rbp movl 24(%rsp),%r13d - movl 12(%rsp),%r14d - movl %r13d,%r12d - movl %r14d,%r15d + movl 12(%rsp),%edi - rorl $11,%r12d - xorl %r13d,%r12d - shrl $3,%r13d + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%edx + movl %edi,%r14d + rorl $2,%edi - rorl $7,%r12d xorl %r12d,%r13d - movl 56(%rsp),%r12d - - rorl $2,%r15d - xorl %r14d,%r15d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi shrl $10,%r14d - rorl $17,%r15d - addl %r13d,%r12d - xorl %r15d,%r14d + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 56(%rsp),%r12d addl 20(%rsp),%r12d movl %r11d,%r13d - addl %r14d,%r12d + addl %edi,%r12d movl %edx,%r14d rorl $14,%r13d - movl %eax,%r15d - movl %r12d,20(%rsp) + movl %eax,%edi - rorl $9,%r14d xorl %r11d,%r13d - xorl %ebx,%r15d + rorl $9,%r14d + xorl %ebx,%edi - rorl $5,%r13d - addl %ecx,%r12d + movl %r12d,20(%rsp) xorl %edx,%r14d + andl %r11d,%edi - addl (%rbp,%rdi,4),%r12d - andl %r11d,%r15d - movl %r8d,%ecx + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi rorl $11,%r14d xorl %r11d,%r13d - xorl %ebx,%r15d + addl %edi,%r12d - xorl %r9d,%ecx + movl %edx,%edi + addl (%rbp),%r12d xorl %edx,%r14d - addl %r15d,%r12d - movl %r8d,%r15d + xorl %r8d,%edi rorl $6,%r13d - andl %edx,%ecx - andl %r9d,%r15d + movl %r8d,%ecx + andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%ecx + xorl %r15d,%ecx addl %r12d,%r10d addl %r12d,%ecx - leaq 1(%rdi),%rdi - addl %r14d,%ecx + leaq 4(%rbp),%rbp movl 28(%rsp),%r13d - movl 16(%rsp),%r14d - movl %r13d,%r12d - movl %r14d,%r15d + movl 16(%rsp),%r15d - rorl $11,%r12d - xorl %r13d,%r12d - shrl $3,%r13d + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ecx + movl %r15d,%r14d + rorl $2,%r15d - rorl $7,%r12d xorl %r12d,%r13d - movl 60(%rsp),%r12d - - rorl $2,%r15d + shrl $3,%r12d + rorl $7,%r13d xorl %r14d,%r15d shrl $10,%r14d rorl $17,%r15d - addl %r13d,%r12d - xorl %r15d,%r14d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 60(%rsp),%r12d addl 24(%rsp),%r12d movl %r10d,%r13d - addl %r14d,%r12d + addl %r15d,%r12d movl %ecx,%r14d rorl $14,%r13d movl %r11d,%r15d - movl %r12d,24(%rsp) - rorl $9,%r14d xorl %r10d,%r13d + rorl $9,%r14d xorl %eax,%r15d - rorl $5,%r13d - addl %ebx,%r12d + movl %r12d,24(%rsp) xorl %ecx,%r14d - - addl (%rbp,%rdi,4),%r12d andl %r10d,%r15d - movl %edx,%ebx + + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d rorl $11,%r14d xorl %r10d,%r13d - xorl %eax,%r15d + addl %r15d,%r12d - xorl %r8d,%ebx + movl %ecx,%r15d + addl (%rbp),%r12d xorl %ecx,%r14d - addl %r15d,%r12d - movl %edx,%r15d + xorl %edx,%r15d rorl $6,%r13d - andl %ecx,%ebx - andl %r8d,%r15d + movl %edx,%ebx + andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%ebx + xorl %edi,%ebx addl %r12d,%r9d addl %r12d,%ebx - leaq 1(%rdi),%rdi - addl %r14d,%ebx + leaq 4(%rbp),%rbp movl 32(%rsp),%r13d - movl 20(%rsp),%r14d - movl %r13d,%r12d - movl %r14d,%r15d + movl 20(%rsp),%edi - rorl $11,%r12d - xorl %r13d,%r12d - shrl $3,%r13d + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ebx + movl %edi,%r14d + rorl $2,%edi - rorl $7,%r12d xorl %r12d,%r13d - movl 0(%rsp),%r12d - - rorl $2,%r15d - xorl %r14d,%r15d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi shrl $10,%r14d - rorl $17,%r15d - addl %r13d,%r12d - xorl %r15d,%r14d + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 0(%rsp),%r12d addl 28(%rsp),%r12d movl %r9d,%r13d - addl %r14d,%r12d + addl %edi,%r12d movl %ebx,%r14d rorl $14,%r13d - movl %r10d,%r15d - movl %r12d,28(%rsp) + movl %r10d,%edi - rorl $9,%r14d xorl %r9d,%r13d - xorl %r11d,%r15d + rorl $9,%r14d + xorl %r11d,%edi - rorl $5,%r13d - addl %eax,%r12d + movl %r12d,28(%rsp) xorl %ebx,%r14d + andl %r9d,%edi - addl (%rbp,%rdi,4),%r12d - andl %r9d,%r15d - movl %ecx,%eax + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi rorl $11,%r14d xorl %r9d,%r13d - xorl %r11d,%r15d + addl %edi,%r12d - xorl %edx,%eax + movl %ebx,%edi + addl (%rbp),%r12d xorl %ebx,%r14d - addl %r15d,%r12d - movl %ecx,%r15d + xorl %ecx,%edi rorl $6,%r13d - andl %ebx,%eax - andl %edx,%r15d + movl %ecx,%eax + andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%eax + xorl %r15d,%eax addl %r12d,%r8d addl %r12d,%eax - leaq 1(%rdi),%rdi - addl %r14d,%eax + leaq 20(%rbp),%rbp movl 36(%rsp),%r13d - movl 24(%rsp),%r14d - movl %r13d,%r12d - movl %r14d,%r15d + movl 24(%rsp),%r15d - rorl $11,%r12d - xorl %r13d,%r12d - shrl $3,%r13d + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%eax + movl %r15d,%r14d + rorl $2,%r15d - rorl $7,%r12d xorl %r12d,%r13d - movl 4(%rsp),%r12d - - rorl $2,%r15d + shrl $3,%r12d + rorl $7,%r13d xorl %r14d,%r15d shrl $10,%r14d rorl $17,%r15d - addl %r13d,%r12d - xorl %r15d,%r14d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 4(%rsp),%r12d addl 32(%rsp),%r12d movl %r8d,%r13d - addl %r14d,%r12d + addl %r15d,%r12d movl %eax,%r14d rorl $14,%r13d movl %r9d,%r15d - movl %r12d,32(%rsp) - rorl $9,%r14d xorl %r8d,%r13d + rorl $9,%r14d xorl %r10d,%r15d - rorl $5,%r13d - addl %r11d,%r12d + movl %r12d,32(%rsp) xorl %eax,%r14d - - addl (%rbp,%rdi,4),%r12d andl %r8d,%r15d - movl %ebx,%r11d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d rorl $11,%r14d xorl %r8d,%r13d - xorl %r10d,%r15d + addl %r15d,%r12d - xorl %ecx,%r11d + movl %eax,%r15d + addl (%rbp),%r12d xorl %eax,%r14d - addl %r15d,%r12d - movl %ebx,%r15d + xorl %ebx,%r15d rorl $6,%r13d - andl %eax,%r11d - andl %ecx,%r15d + movl %ebx,%r11d + andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%r11d + xorl %edi,%r11d addl %r12d,%edx addl %r12d,%r11d - leaq 1(%rdi),%rdi - addl %r14d,%r11d + leaq 4(%rbp),%rbp movl 40(%rsp),%r13d - movl 28(%rsp),%r14d - movl %r13d,%r12d - movl %r14d,%r15d + movl 28(%rsp),%edi - rorl $11,%r12d - xorl %r13d,%r12d - shrl $3,%r13d + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r11d + movl %edi,%r14d + rorl $2,%edi - rorl $7,%r12d xorl %r12d,%r13d - movl 8(%rsp),%r12d - - rorl $2,%r15d - xorl %r14d,%r15d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi shrl $10,%r14d - rorl $17,%r15d - addl %r13d,%r12d - xorl %r15d,%r14d + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 8(%rsp),%r12d addl 36(%rsp),%r12d movl %edx,%r13d - addl %r14d,%r12d + addl %edi,%r12d movl %r11d,%r14d rorl $14,%r13d - movl %r8d,%r15d - movl %r12d,36(%rsp) + movl %r8d,%edi - rorl $9,%r14d xorl %edx,%r13d - xorl %r9d,%r15d + rorl $9,%r14d + xorl %r9d,%edi - rorl $5,%r13d - addl %r10d,%r12d + movl %r12d,36(%rsp) xorl %r11d,%r14d + andl %edx,%edi - addl (%rbp,%rdi,4),%r12d - andl %edx,%r15d - movl %eax,%r10d + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi rorl $11,%r14d xorl %edx,%r13d - xorl %r9d,%r15d + addl %edi,%r12d - xorl %ebx,%r10d + movl %r11d,%edi + addl (%rbp),%r12d xorl %r11d,%r14d - addl %r15d,%r12d - movl %eax,%r15d + xorl %eax,%edi rorl $6,%r13d - andl %r11d,%r10d - andl %ebx,%r15d + movl %eax,%r10d + andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%r10d + xorl %r15d,%r10d addl %r12d,%ecx addl %r12d,%r10d - leaq 1(%rdi),%rdi - addl %r14d,%r10d + leaq 4(%rbp),%rbp movl 44(%rsp),%r13d - movl 32(%rsp),%r14d - movl %r13d,%r12d - movl %r14d,%r15d + movl 32(%rsp),%r15d - rorl $11,%r12d - xorl %r13d,%r12d - shrl $3,%r13d + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r10d + movl %r15d,%r14d + rorl $2,%r15d - rorl $7,%r12d xorl %r12d,%r13d - movl 12(%rsp),%r12d - - rorl $2,%r15d + shrl $3,%r12d + rorl $7,%r13d xorl %r14d,%r15d shrl $10,%r14d rorl $17,%r15d - addl %r13d,%r12d - xorl %r15d,%r14d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 12(%rsp),%r12d addl 40(%rsp),%r12d movl %ecx,%r13d - addl %r14d,%r12d + addl %r15d,%r12d movl %r10d,%r14d rorl $14,%r13d movl %edx,%r15d - movl %r12d,40(%rsp) - rorl $9,%r14d xorl %ecx,%r13d + rorl $9,%r14d xorl %r8d,%r15d - rorl $5,%r13d - addl %r9d,%r12d + movl %r12d,40(%rsp) xorl %r10d,%r14d - - addl (%rbp,%rdi,4),%r12d andl %ecx,%r15d - movl %r11d,%r9d + + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d rorl $11,%r14d xorl %ecx,%r13d - xorl %r8d,%r15d + addl %r15d,%r12d - xorl %eax,%r9d + movl %r10d,%r15d + addl (%rbp),%r12d xorl %r10d,%r14d - addl %r15d,%r12d - movl %r11d,%r15d + xorl %r11d,%r15d rorl $6,%r13d - andl %r10d,%r9d - andl %eax,%r15d + movl %r11d,%r9d + andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%r9d + xorl %edi,%r9d addl %r12d,%ebx addl %r12d,%r9d - leaq 1(%rdi),%rdi - addl %r14d,%r9d + leaq 4(%rbp),%rbp movl 48(%rsp),%r13d - movl 36(%rsp),%r14d - movl %r13d,%r12d - movl %r14d,%r15d + movl 36(%rsp),%edi - rorl $11,%r12d - xorl %r13d,%r12d - shrl $3,%r13d + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r9d + movl %edi,%r14d + rorl $2,%edi - rorl $7,%r12d xorl %r12d,%r13d - movl 16(%rsp),%r12d - - rorl $2,%r15d - xorl %r14d,%r15d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi shrl $10,%r14d - rorl $17,%r15d - addl %r13d,%r12d - xorl %r15d,%r14d + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 16(%rsp),%r12d addl 44(%rsp),%r12d movl %ebx,%r13d - addl %r14d,%r12d + addl %edi,%r12d movl %r9d,%r14d rorl $14,%r13d - movl %ecx,%r15d - movl %r12d,44(%rsp) + movl %ecx,%edi - rorl $9,%r14d xorl %ebx,%r13d - xorl %edx,%r15d + rorl $9,%r14d + xorl %edx,%edi - rorl $5,%r13d - addl %r8d,%r12d + movl %r12d,44(%rsp) xorl %r9d,%r14d + andl %ebx,%edi - addl (%rbp,%rdi,4),%r12d - andl %ebx,%r15d - movl %r10d,%r8d + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi rorl $11,%r14d xorl %ebx,%r13d - xorl %edx,%r15d + addl %edi,%r12d - xorl %r11d,%r8d + movl %r9d,%edi + addl (%rbp),%r12d xorl %r9d,%r14d - addl %r15d,%r12d - movl %r10d,%r15d + xorl %r10d,%edi rorl $6,%r13d - andl %r9d,%r8d - andl %r11d,%r15d + movl %r10d,%r8d + andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%r8d + xorl %r15d,%r8d addl %r12d,%eax addl %r12d,%r8d - leaq 1(%rdi),%rdi - addl %r14d,%r8d + leaq 20(%rbp),%rbp movl 52(%rsp),%r13d - movl 40(%rsp),%r14d - movl %r13d,%r12d - movl %r14d,%r15d + movl 40(%rsp),%r15d - rorl $11,%r12d - xorl %r13d,%r12d - shrl $3,%r13d + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r8d + movl %r15d,%r14d + rorl $2,%r15d - rorl $7,%r12d xorl %r12d,%r13d - movl 20(%rsp),%r12d - - rorl $2,%r15d + shrl $3,%r12d + rorl $7,%r13d xorl %r14d,%r15d shrl $10,%r14d rorl $17,%r15d - addl %r13d,%r12d - xorl %r15d,%r14d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 20(%rsp),%r12d addl 48(%rsp),%r12d movl %eax,%r13d - addl %r14d,%r12d + addl %r15d,%r12d movl %r8d,%r14d rorl $14,%r13d movl %ebx,%r15d - movl %r12d,48(%rsp) - rorl $9,%r14d xorl %eax,%r13d + rorl $9,%r14d xorl %ecx,%r15d - rorl $5,%r13d - addl %edx,%r12d + movl %r12d,48(%rsp) xorl %r8d,%r14d - - addl (%rbp,%rdi,4),%r12d andl %eax,%r15d - movl %r9d,%edx + + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d rorl $11,%r14d xorl %eax,%r13d - xorl %ecx,%r15d + addl %r15d,%r12d - xorl %r10d,%edx + movl %r8d,%r15d + addl (%rbp),%r12d xorl %r8d,%r14d - addl %r15d,%r12d - movl %r9d,%r15d + xorl %r9d,%r15d rorl $6,%r13d - andl %r8d,%edx - andl %r10d,%r15d + movl %r9d,%edx + andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%edx + xorl %edi,%edx addl %r12d,%r11d addl %r12d,%edx - leaq 1(%rdi),%rdi - addl %r14d,%edx + leaq 4(%rbp),%rbp movl 56(%rsp),%r13d - movl 44(%rsp),%r14d - movl %r13d,%r12d - movl %r14d,%r15d + movl 44(%rsp),%edi - rorl $11,%r12d - xorl %r13d,%r12d - shrl $3,%r13d + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%edx + movl %edi,%r14d + rorl $2,%edi - rorl $7,%r12d xorl %r12d,%r13d - movl 24(%rsp),%r12d - - rorl $2,%r15d - xorl %r14d,%r15d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi shrl $10,%r14d - rorl $17,%r15d - addl %r13d,%r12d - xorl %r15d,%r14d + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 24(%rsp),%r12d addl 52(%rsp),%r12d movl %r11d,%r13d - addl %r14d,%r12d + addl %edi,%r12d movl %edx,%r14d rorl $14,%r13d - movl %eax,%r15d - movl %r12d,52(%rsp) + movl %eax,%edi - rorl $9,%r14d xorl %r11d,%r13d - xorl %ebx,%r15d + rorl $9,%r14d + xorl %ebx,%edi - rorl $5,%r13d - addl %ecx,%r12d + movl %r12d,52(%rsp) xorl %edx,%r14d + andl %r11d,%edi - addl (%rbp,%rdi,4),%r12d - andl %r11d,%r15d - movl %r8d,%ecx + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi rorl $11,%r14d xorl %r11d,%r13d - xorl %ebx,%r15d + addl %edi,%r12d - xorl %r9d,%ecx + movl %edx,%edi + addl (%rbp),%r12d xorl %edx,%r14d - addl %r15d,%r12d - movl %r8d,%r15d + xorl %r8d,%edi rorl $6,%r13d - andl %edx,%ecx - andl %r9d,%r15d + movl %r8d,%ecx + andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%ecx + xorl %r15d,%ecx addl %r12d,%r10d addl %r12d,%ecx - leaq 1(%rdi),%rdi - addl %r14d,%ecx + leaq 4(%rbp),%rbp movl 60(%rsp),%r13d - movl 48(%rsp),%r14d - movl %r13d,%r12d - movl %r14d,%r15d + movl 48(%rsp),%r15d - rorl $11,%r12d - xorl %r13d,%r12d - shrl $3,%r13d + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ecx + movl %r15d,%r14d + rorl $2,%r15d - rorl $7,%r12d xorl %r12d,%r13d - movl 28(%rsp),%r12d - - rorl $2,%r15d + shrl $3,%r12d + rorl $7,%r13d xorl %r14d,%r15d shrl $10,%r14d rorl $17,%r15d - addl %r13d,%r12d - xorl %r15d,%r14d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 28(%rsp),%r12d addl 56(%rsp),%r12d movl %r10d,%r13d - addl %r14d,%r12d + addl %r15d,%r12d movl %ecx,%r14d rorl $14,%r13d movl %r11d,%r15d - movl %r12d,56(%rsp) - rorl $9,%r14d xorl %r10d,%r13d + rorl $9,%r14d xorl %eax,%r15d - rorl $5,%r13d - addl %ebx,%r12d + movl %r12d,56(%rsp) xorl %ecx,%r14d - - addl (%rbp,%rdi,4),%r12d andl %r10d,%r15d - movl %edx,%ebx + + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d rorl $11,%r14d xorl %r10d,%r13d - xorl %eax,%r15d + addl %r15d,%r12d - xorl %r8d,%ebx + movl %ecx,%r15d + addl (%rbp),%r12d xorl %ecx,%r14d - addl %r15d,%r12d - movl %edx,%r15d + xorl %edx,%r15d rorl $6,%r13d - andl %ecx,%ebx - andl %r8d,%r15d + movl %edx,%ebx + andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%ebx + xorl %edi,%ebx addl %r12d,%r9d addl %r12d,%ebx - leaq 1(%rdi),%rdi - addl %r14d,%ebx + leaq 4(%rbp),%rbp movl 0(%rsp),%r13d - movl 52(%rsp),%r14d - movl %r13d,%r12d - movl %r14d,%r15d + movl 52(%rsp),%edi - rorl $11,%r12d - xorl %r13d,%r12d - shrl $3,%r13d + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ebx + movl %edi,%r14d + rorl $2,%edi - rorl $7,%r12d xorl %r12d,%r13d - movl 32(%rsp),%r12d - - rorl $2,%r15d - xorl %r14d,%r15d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi shrl $10,%r14d - rorl $17,%r15d - addl %r13d,%r12d - xorl %r15d,%r14d + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 32(%rsp),%r12d addl 60(%rsp),%r12d movl %r9d,%r13d - addl %r14d,%r12d + addl %edi,%r12d movl %ebx,%r14d rorl $14,%r13d - movl %r10d,%r15d - movl %r12d,60(%rsp) + movl %r10d,%edi - rorl $9,%r14d xorl %r9d,%r13d - xorl %r11d,%r15d + rorl $9,%r14d + xorl %r11d,%edi - rorl $5,%r13d - addl %eax,%r12d + movl %r12d,60(%rsp) xorl %ebx,%r14d + andl %r9d,%edi - addl (%rbp,%rdi,4),%r12d - andl %r9d,%r15d - movl %ecx,%eax + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi rorl $11,%r14d xorl %r9d,%r13d - xorl %r11d,%r15d + addl %edi,%r12d - xorl %edx,%eax + movl %ebx,%edi + addl (%rbp),%r12d xorl %ebx,%r14d - addl %r15d,%r12d - movl %ecx,%r15d + xorl %ecx,%edi rorl $6,%r13d - andl %ebx,%eax - andl %edx,%r15d + movl %ecx,%eax + andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%eax + xorl %r15d,%eax addl %r12d,%r8d addl %r12d,%eax - leaq 1(%rdi),%rdi - addl %r14d,%eax - cmpq $64,%rdi - jb .Lrounds_16_xx + leaq 20(%rbp),%rbp + cmpb $0,3(%rbp) + jnz .Lrounds_16_xx movq 64+0(%rsp),%rdi + addl %r14d,%eax leaq 64(%rsi),%rsi addl 0(%rdi),%eax @@ -1762,18 +1707,1344 @@ sha256_block_data_order: .type K256,@object K256: .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 +.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + +.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f +.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f +.long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff +.long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff +.long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 +.long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 +.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.type sha256_block_data_order_shaext,@function +.align 64 +sha256_block_data_order_shaext: +_shaext_shortcut: + leaq K256+128(%rip),%rcx + movdqu (%rdi),%xmm1 + movdqu 16(%rdi),%xmm2 + movdqa 512-128(%rcx),%xmm7 + + pshufd $27,%xmm1,%xmm0 + pshufd $177,%xmm1,%xmm1 + pshufd $27,%xmm2,%xmm2 + movdqa %xmm7,%xmm8 +.byte 102,15,58,15,202,8 + punpcklqdq %xmm0,%xmm2 + jmp .Loop_shaext + +.align 16 +.Loop_shaext: + movdqu (%rsi),%xmm3 + movdqu 16(%rsi),%xmm4 + movdqu 32(%rsi),%xmm5 +.byte 102,15,56,0,223 + movdqu 48(%rsi),%xmm6 + + movdqa 0-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 102,15,56,0,231 + movdqa %xmm2,%xmm10 +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 + nop + movdqa %xmm1,%xmm9 +.byte 15,56,203,202 + + movdqa 32-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 102,15,56,0,239 +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 + leaq 64(%rsi),%rsi +.byte 15,56,204,220 +.byte 15,56,203,202 + + movdqa 64-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 102,15,56,0,247 +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 + movdqa %xmm6,%xmm7 +.byte 102,15,58,15,253,4 + nop + paddd %xmm7,%xmm3 +.byte 15,56,204,229 +.byte 15,56,203,202 + + movdqa 96-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 +.byte 15,56,205,222 +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 + movdqa %xmm3,%xmm7 +.byte 102,15,58,15,254,4 + nop + paddd %xmm7,%xmm4 +.byte 15,56,204,238 +.byte 15,56,203,202 + movdqa 128-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 15,56,205,227 +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 + movdqa %xmm4,%xmm7 +.byte 102,15,58,15,251,4 + nop + paddd %xmm7,%xmm5 +.byte 15,56,204,243 +.byte 15,56,203,202 + movdqa 160-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 15,56,205,236 +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 + movdqa %xmm5,%xmm7 +.byte 102,15,58,15,252,4 + nop + paddd %xmm7,%xmm6 +.byte 15,56,204,220 +.byte 15,56,203,202 + movdqa 192-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 15,56,205,245 +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 + movdqa %xmm6,%xmm7 +.byte 102,15,58,15,253,4 + nop + paddd %xmm7,%xmm3 +.byte 15,56,204,229 +.byte 15,56,203,202 + movdqa 224-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 +.byte 15,56,205,222 +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 + movdqa %xmm3,%xmm7 +.byte 102,15,58,15,254,4 + nop + paddd %xmm7,%xmm4 +.byte 15,56,204,238 +.byte 15,56,203,202 + movdqa 256-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 15,56,205,227 +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 + movdqa %xmm4,%xmm7 +.byte 102,15,58,15,251,4 + nop + paddd %xmm7,%xmm5 +.byte 15,56,204,243 +.byte 15,56,203,202 + movdqa 288-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 15,56,205,236 +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 + movdqa %xmm5,%xmm7 +.byte 102,15,58,15,252,4 + nop + paddd %xmm7,%xmm6 +.byte 15,56,204,220 +.byte 15,56,203,202 + movdqa 320-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 15,56,205,245 +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 + movdqa %xmm6,%xmm7 +.byte 102,15,58,15,253,4 + nop + paddd %xmm7,%xmm3 +.byte 15,56,204,229 +.byte 15,56,203,202 + movdqa 352-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 +.byte 15,56,205,222 +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 + movdqa %xmm3,%xmm7 +.byte 102,15,58,15,254,4 + nop + paddd %xmm7,%xmm4 +.byte 15,56,204,238 +.byte 15,56,203,202 + movdqa 384-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 15,56,205,227 +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 + movdqa %xmm4,%xmm7 +.byte 102,15,58,15,251,4 + nop + paddd %xmm7,%xmm5 +.byte 15,56,204,243 +.byte 15,56,203,202 + movdqa 416-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 15,56,205,236 +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 + movdqa %xmm5,%xmm7 +.byte 102,15,58,15,252,4 +.byte 15,56,203,202 + paddd %xmm7,%xmm6 + + movdqa 448-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 +.byte 15,56,205,245 + movdqa %xmm8,%xmm7 +.byte 15,56,203,202 + + movdqa 480-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 + nop +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 + decq %rdx + nop +.byte 15,56,203,202 + + paddd %xmm10,%xmm2 + paddd %xmm9,%xmm1 + jnz .Loop_shaext + + pshufd $177,%xmm2,%xmm2 + pshufd $27,%xmm1,%xmm7 + pshufd $177,%xmm1,%xmm1 + punpckhqdq %xmm2,%xmm1 +.byte 102,15,58,15,215,8 + + movdqu %xmm1,(%rdi) + movdqu %xmm2,16(%rdi) + .byte 0xf3,0xc3 +.size sha256_block_data_order_shaext,.-sha256_block_data_order_shaext +.type sha256_block_data_order_ssse3,@function +.align 64 +sha256_block_data_order_ssse3: +.Lssse3_shortcut: + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + movq %rsp,%r11 + shlq $4,%rdx + subq $96,%rsp + leaq (%rsi,%rdx,4),%rdx + andq $-64,%rsp + movq %rdi,64+0(%rsp) + movq %rsi,64+8(%rsp) + movq %rdx,64+16(%rsp) + movq %r11,64+24(%rsp) +.Lprologue_ssse3: + + movl 0(%rdi),%eax + movl 4(%rdi),%ebx + movl 8(%rdi),%ecx + movl 12(%rdi),%edx + movl 16(%rdi),%r8d + movl 20(%rdi),%r9d + movl 24(%rdi),%r10d + movl 28(%rdi),%r11d + + + jmp .Lloop_ssse3 +.align 16 +.Lloop_ssse3: + movdqa K256+512(%rip),%xmm7 + movdqu 0(%rsi),%xmm0 + movdqu 16(%rsi),%xmm1 + movdqu 32(%rsi),%xmm2 +.byte 102,15,56,0,199 + movdqu 48(%rsi),%xmm3 + leaq K256(%rip),%rbp +.byte 102,15,56,0,207 + movdqa 0(%rbp),%xmm4 + movdqa 32(%rbp),%xmm5 +.byte 102,15,56,0,215 + paddd %xmm0,%xmm4 + movdqa 64(%rbp),%xmm6 +.byte 102,15,56,0,223 + movdqa 96(%rbp),%xmm7 + paddd %xmm1,%xmm5 + paddd %xmm2,%xmm6 + paddd %xmm3,%xmm7 + movdqa %xmm4,0(%rsp) + movl %eax,%r14d + movdqa %xmm5,16(%rsp) + movl %ebx,%edi + movdqa %xmm6,32(%rsp) + xorl %ecx,%edi + movdqa %xmm7,48(%rsp) + movl %r8d,%r13d + jmp .Lssse3_00_47 + +.align 16 +.Lssse3_00_47: + subq $-128,%rbp + rorl $14,%r13d + movdqa %xmm1,%xmm4 + movl %r14d,%eax + movl %r9d,%r12d + movdqa %xmm3,%xmm7 + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d +.byte 102,15,58,15,224,4 + andl %r8d,%r12d + xorl %r8d,%r13d +.byte 102,15,58,15,250,4 + addl 0(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %ebx,%r15d + addl %r12d,%r11d + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + paddd %xmm7,%xmm0 + rorl $2,%r14d + addl %r11d,%edx + psrld $7,%xmm6 + addl %edi,%r11d + movl %edx,%r13d + pshufd $250,%xmm3,%xmm7 + addl %r11d,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%r11d + movl %r8d,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %r11d,%r14d + pxor %xmm5,%xmm4 + andl %edx,%r12d + xorl %edx,%r13d + pslld $11,%xmm5 + addl 4(%rsp),%r10d + movl %r11d,%edi + pxor %xmm6,%xmm4 + xorl %r9d,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %eax,%edi + addl %r12d,%r10d + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + psrld $10,%xmm7 + addl %r13d,%r10d + xorl %eax,%r15d + paddd %xmm4,%xmm0 + rorl $2,%r14d + addl %r10d,%ecx + psrlq $17,%xmm6 + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %ecx,%r13d + xorl %r8d,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %ecx,%r13d + addl 8(%rsp),%r9d + movl %r10d,%r15d + psrldq $8,%xmm7 + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + paddd %xmm7,%xmm0 + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + pshufd $80,%xmm0,%xmm7 + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + movdqa %xmm7,%xmm6 + addl %edi,%r9d + movl %ebx,%r13d + psrld $10,%xmm7 + addl %r9d,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%r9d + movl %ecx,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + psrlq $2,%xmm6 + andl %ebx,%r12d + xorl %ebx,%r13d + addl 12(%rsp),%r8d + pxor %xmm6,%xmm7 + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %r10d,%edi + addl %r12d,%r8d + movdqa 0(%rbp),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + paddd %xmm7,%xmm0 + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + paddd %xmm0,%xmm6 + movl %eax,%r13d + addl %r8d,%r14d + movdqa %xmm6,0(%rsp) + rorl $14,%r13d + movdqa %xmm2,%xmm4 + movl %r14d,%r8d + movl %ebx,%r12d + movdqa %xmm0,%xmm7 + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d +.byte 102,15,58,15,225,4 + andl %eax,%r12d + xorl %eax,%r13d +.byte 102,15,58,15,251,4 + addl 16(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %r9d,%r15d + addl %r12d,%edx + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + paddd %xmm7,%xmm1 + rorl $2,%r14d + addl %edx,%r11d + psrld $7,%xmm6 + addl %edi,%edx + movl %r11d,%r13d + pshufd $250,%xmm0,%xmm7 + addl %edx,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%edx + movl %eax,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %edx,%r14d + pxor %xmm5,%xmm4 + andl %r11d,%r12d + xorl %r11d,%r13d + pslld $11,%xmm5 + addl 20(%rsp),%ecx + movl %edx,%edi + pxor %xmm6,%xmm4 + xorl %ebx,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %r8d,%edi + addl %r12d,%ecx + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + psrld $10,%xmm7 + addl %r13d,%ecx + xorl %r8d,%r15d + paddd %xmm4,%xmm1 + rorl $2,%r14d + addl %ecx,%r10d + psrlq $17,%xmm6 + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %r10d,%r13d + xorl %eax,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %r10d,%r13d + addl 24(%rsp),%ebx + movl %ecx,%r15d + psrldq $8,%xmm7 + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + paddd %xmm7,%xmm1 + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + pshufd $80,%xmm1,%xmm7 + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + movdqa %xmm7,%xmm6 + addl %edi,%ebx + movl %r9d,%r13d + psrld $10,%xmm7 + addl %ebx,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%ebx + movl %r10d,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + psrlq $2,%xmm6 + andl %r9d,%r12d + xorl %r9d,%r13d + addl 28(%rsp),%eax + pxor %xmm6,%xmm7 + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %ecx,%edi + addl %r12d,%eax + movdqa 32(%rbp),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + paddd %xmm7,%xmm1 + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + paddd %xmm1,%xmm6 + movl %r8d,%r13d + addl %eax,%r14d + movdqa %xmm6,16(%rsp) + rorl $14,%r13d + movdqa %xmm3,%xmm4 + movl %r14d,%eax + movl %r9d,%r12d + movdqa %xmm1,%xmm7 + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d +.byte 102,15,58,15,226,4 + andl %r8d,%r12d + xorl %r8d,%r13d +.byte 102,15,58,15,248,4 + addl 32(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %ebx,%r15d + addl %r12d,%r11d + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + paddd %xmm7,%xmm2 + rorl $2,%r14d + addl %r11d,%edx + psrld $7,%xmm6 + addl %edi,%r11d + movl %edx,%r13d + pshufd $250,%xmm1,%xmm7 + addl %r11d,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%r11d + movl %r8d,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %r11d,%r14d + pxor %xmm5,%xmm4 + andl %edx,%r12d + xorl %edx,%r13d + pslld $11,%xmm5 + addl 36(%rsp),%r10d + movl %r11d,%edi + pxor %xmm6,%xmm4 + xorl %r9d,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %eax,%edi + addl %r12d,%r10d + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + psrld $10,%xmm7 + addl %r13d,%r10d + xorl %eax,%r15d + paddd %xmm4,%xmm2 + rorl $2,%r14d + addl %r10d,%ecx + psrlq $17,%xmm6 + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %ecx,%r13d + xorl %r8d,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %ecx,%r13d + addl 40(%rsp),%r9d + movl %r10d,%r15d + psrldq $8,%xmm7 + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + paddd %xmm7,%xmm2 + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + pshufd $80,%xmm2,%xmm7 + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + movdqa %xmm7,%xmm6 + addl %edi,%r9d + movl %ebx,%r13d + psrld $10,%xmm7 + addl %r9d,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%r9d + movl %ecx,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + psrlq $2,%xmm6 + andl %ebx,%r12d + xorl %ebx,%r13d + addl 44(%rsp),%r8d + pxor %xmm6,%xmm7 + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %r10d,%edi + addl %r12d,%r8d + movdqa 64(%rbp),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + paddd %xmm7,%xmm2 + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + paddd %xmm2,%xmm6 + movl %eax,%r13d + addl %r8d,%r14d + movdqa %xmm6,32(%rsp) + rorl $14,%r13d + movdqa %xmm0,%xmm4 + movl %r14d,%r8d + movl %ebx,%r12d + movdqa %xmm2,%xmm7 + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d +.byte 102,15,58,15,227,4 + andl %eax,%r12d + xorl %eax,%r13d +.byte 102,15,58,15,249,4 + addl 48(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %r9d,%r15d + addl %r12d,%edx + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + paddd %xmm7,%xmm3 + rorl $2,%r14d + addl %edx,%r11d + psrld $7,%xmm6 + addl %edi,%edx + movl %r11d,%r13d + pshufd $250,%xmm2,%xmm7 + addl %edx,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%edx + movl %eax,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %edx,%r14d + pxor %xmm5,%xmm4 + andl %r11d,%r12d + xorl %r11d,%r13d + pslld $11,%xmm5 + addl 52(%rsp),%ecx + movl %edx,%edi + pxor %xmm6,%xmm4 + xorl %ebx,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %r8d,%edi + addl %r12d,%ecx + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + psrld $10,%xmm7 + addl %r13d,%ecx + xorl %r8d,%r15d + paddd %xmm4,%xmm3 + rorl $2,%r14d + addl %ecx,%r10d + psrlq $17,%xmm6 + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %r10d,%r13d + xorl %eax,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %r10d,%r13d + addl 56(%rsp),%ebx + movl %ecx,%r15d + psrldq $8,%xmm7 + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + paddd %xmm7,%xmm3 + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + pshufd $80,%xmm3,%xmm7 + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + movdqa %xmm7,%xmm6 + addl %edi,%ebx + movl %r9d,%r13d + psrld $10,%xmm7 + addl %ebx,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%ebx + movl %r10d,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + psrlq $2,%xmm6 + andl %r9d,%r12d + xorl %r9d,%r13d + addl 60(%rsp),%eax + pxor %xmm6,%xmm7 + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %ecx,%edi + addl %r12d,%eax + movdqa 96(%rbp),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + paddd %xmm7,%xmm3 + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + paddd %xmm3,%xmm6 + movl %r8d,%r13d + addl %eax,%r14d + movdqa %xmm6,48(%rsp) + cmpb $0,131(%rbp) + jne .Lssse3_00_47 + rorl $14,%r13d + movl %r14d,%eax + movl %r9d,%r12d + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + xorl %r8d,%r13d + addl 0(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + xorl %ebx,%r15d + addl %r12d,%r11d + rorl $6,%r13d + andl %r15d,%edi + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + rorl $2,%r14d + addl %r11d,%edx + addl %edi,%r11d + movl %edx,%r13d + addl %r11d,%r14d + rorl $14,%r13d + movl %r14d,%r11d + movl %r8d,%r12d + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + addl 4(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + rorl $11,%r14d + xorl %eax,%edi + addl %r12d,%r10d + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + addl %r13d,%r10d + xorl %eax,%r15d + rorl $2,%r14d + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + xorl %ecx,%r13d + xorl %r8d,%r12d + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + xorl %ecx,%r13d + addl 8(%rsp),%r9d + movl %r10d,%r15d + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + rorl $14,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + xorl %ebx,%r13d + addl 12(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + rorl $6,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + rorl $14,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + xorl %eax,%r13d + addl 16(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + xorl %r9d,%r15d + addl %r12d,%edx + rorl $6,%r13d + andl %r15d,%edi + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + rorl $2,%r14d + addl %edx,%r11d + addl %edi,%edx + movl %r11d,%r13d + addl %edx,%r14d + rorl $14,%r13d + movl %r14d,%edx + movl %eax,%r12d + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + addl 20(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + rorl $11,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + addl %r13d,%ecx + xorl %r8d,%r15d + rorl $2,%r14d + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + xorl %r10d,%r13d + xorl %eax,%r12d + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + xorl %r10d,%r13d + addl 24(%rsp),%ebx + movl %ecx,%r15d + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + rorl $14,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + xorl %r9d,%r13d + addl 28(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + xorl %ecx,%edi + addl %r12d,%eax + rorl $6,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + rorl $14,%r13d + movl %r14d,%eax + movl %r9d,%r12d + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + xorl %r8d,%r13d + addl 32(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + xorl %ebx,%r15d + addl %r12d,%r11d + rorl $6,%r13d + andl %r15d,%edi + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + rorl $2,%r14d + addl %r11d,%edx + addl %edi,%r11d + movl %edx,%r13d + addl %r11d,%r14d + rorl $14,%r13d + movl %r14d,%r11d + movl %r8d,%r12d + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + addl 36(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + rorl $11,%r14d + xorl %eax,%edi + addl %r12d,%r10d + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + addl %r13d,%r10d + xorl %eax,%r15d + rorl $2,%r14d + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + xorl %ecx,%r13d + xorl %r8d,%r12d + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + xorl %ecx,%r13d + addl 40(%rsp),%r9d + movl %r10d,%r15d + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + rorl $14,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + xorl %ebx,%r13d + addl 44(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + rorl $6,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + rorl $14,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + xorl %eax,%r13d + addl 48(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + xorl %r9d,%r15d + addl %r12d,%edx + rorl $6,%r13d + andl %r15d,%edi + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + rorl $2,%r14d + addl %edx,%r11d + addl %edi,%edx + movl %r11d,%r13d + addl %edx,%r14d + rorl $14,%r13d + movl %r14d,%edx + movl %eax,%r12d + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + addl 52(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + rorl $11,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + addl %r13d,%ecx + xorl %r8d,%r15d + rorl $2,%r14d + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + xorl %r10d,%r13d + xorl %eax,%r12d + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + xorl %r10d,%r13d + addl 56(%rsp),%ebx + movl %ecx,%r15d + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + rorl $14,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + xorl %r9d,%r13d + addl 60(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + xorl %ecx,%edi + addl %r12d,%eax + rorl $6,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + movq 64+0(%rsp),%rdi + movl %r14d,%eax + + addl 0(%rdi),%eax + leaq 64(%rsi),%rsi + addl 4(%rdi),%ebx + addl 8(%rdi),%ecx + addl 12(%rdi),%edx + addl 16(%rdi),%r8d + addl 20(%rdi),%r9d + addl 24(%rdi),%r10d + addl 28(%rdi),%r11d + + cmpq 64+16(%rsp),%rsi + + movl %eax,0(%rdi) + movl %ebx,4(%rdi) + movl %ecx,8(%rdi) + movl %edx,12(%rdi) + movl %r8d,16(%rdi) + movl %r9d,20(%rdi) + movl %r10d,24(%rdi) + movl %r11d,28(%rdi) + jb .Lloop_ssse3 + + movq 64+24(%rsp),%rsi + movq (%rsi),%r15 + movq 8(%rsi),%r14 + movq 16(%rsi),%r13 + movq 24(%rsi),%r12 + movq 32(%rsi),%rbp + movq 40(%rsi),%rbx + leaq 48(%rsi),%rsp +.Lepilogue_ssse3: + .byte 0xf3,0xc3 +.size sha256_block_data_order_ssse3,.-sha256_block_data_order_ssse3 diff --git a/secure/lib/libcrypto/amd64/sha512-x86_64.S b/secure/lib/libcrypto/amd64/sha512-x86_64.S index 74fc3d0..60518d4 100644 --- a/secure/lib/libcrypto/amd64/sha512-x86_64.S +++ b/secure/lib/libcrypto/amd64/sha512-x86_64.S @@ -1,6 +1,7 @@ # $FreeBSD$ .text + .globl sha512_block_data_order .type sha512_block_data_order,@function .align 16 @@ -22,8 +23,6 @@ sha512_block_data_order: movq %r11,128+24(%rsp) .Lprologue: - leaq K512(%rip),%rbp - movq 0(%rdi),%rax movq 8(%rdi),%rbx movq 16(%rdi),%rcx @@ -36,1694 +35,1632 @@ sha512_block_data_order: .align 16 .Lloop: - xorq %rdi,%rdi + movq %rbx,%rdi + leaq K512(%rip),%rbp + xorq %rcx,%rdi movq 0(%rsi),%r12 movq %r8,%r13 movq %rax,%r14 bswapq %r12 rorq $23,%r13 movq %r9,%r15 - movq %r12,0(%rsp) - rorq $5,%r14 xorq %r8,%r13 + rorq $5,%r14 xorq %r10,%r15 - rorq $4,%r13 - addq %r11,%r12 + movq %r12,0(%rsp) xorq %rax,%r14 - - addq (%rbp,%rdi,8),%r12 andq %r8,%r15 - movq %rbx,%r11 + + rorq $4,%r13 + addq %r11,%r12 + xorq %r10,%r15 rorq $6,%r14 xorq %r8,%r13 - xorq %r10,%r15 + addq %r15,%r12 - xorq %rcx,%r11 + movq %rax,%r15 + addq (%rbp),%r12 xorq %rax,%r14 - addq %r15,%r12 - movq %rbx,%r15 + xorq %rbx,%r15 rorq $14,%r13 - andq %rax,%r11 - andq %rcx,%r15 + movq %rbx,%r11 + andq %r15,%rdi rorq $28,%r14 addq %r13,%r12 - addq %r15,%r11 + xorq %rdi,%r11 addq %r12,%rdx addq %r12,%r11 - leaq 1(%rdi),%rdi - addq %r14,%r11 + leaq 8(%rbp),%rbp + addq %r14,%r11 movq 8(%rsi),%r12 movq %rdx,%r13 movq %r11,%r14 bswapq %r12 rorq $23,%r13 - movq %r8,%r15 - movq %r12,8(%rsp) + movq %r8,%rdi - rorq $5,%r14 xorq %rdx,%r13 - xorq %r9,%r15 + rorq $5,%r14 + xorq %r9,%rdi - rorq $4,%r13 - addq %r10,%r12 + movq %r12,8(%rsp) xorq %r11,%r14 + andq %rdx,%rdi - addq (%rbp,%rdi,8),%r12 - andq %rdx,%r15 - movq %rax,%r10 + rorq $4,%r13 + addq %r10,%r12 + xorq %r9,%rdi rorq $6,%r14 xorq %rdx,%r13 - xorq %r9,%r15 + addq %rdi,%r12 - xorq %rbx,%r10 + movq %r11,%rdi + addq (%rbp),%r12 xorq %r11,%r14 - addq %r15,%r12 - movq %rax,%r15 + xorq %rax,%rdi rorq $14,%r13 - andq %r11,%r10 - andq %rbx,%r15 + movq %rax,%r10 + andq %rdi,%r15 rorq $28,%r14 addq %r13,%r12 - addq %r15,%r10 + xorq %r15,%r10 addq %r12,%rcx addq %r12,%r10 - leaq 1(%rdi),%rdi - addq %r14,%r10 + leaq 24(%rbp),%rbp + addq %r14,%r10 movq 16(%rsi),%r12 movq %rcx,%r13 movq %r10,%r14 bswapq %r12 rorq $23,%r13 movq %rdx,%r15 - movq %r12,16(%rsp) - rorq $5,%r14 xorq %rcx,%r13 + rorq $5,%r14 xorq %r8,%r15 - rorq $4,%r13 - addq %r9,%r12 + movq %r12,16(%rsp) xorq %r10,%r14 - - addq (%rbp,%rdi,8),%r12 andq %rcx,%r15 - movq %r11,%r9 + + rorq $4,%r13 + addq %r9,%r12 + xorq %r8,%r15 rorq $6,%r14 xorq %rcx,%r13 - xorq %r8,%r15 + addq %r15,%r12 - xorq %rax,%r9 + movq %r10,%r15 + addq (%rbp),%r12 xorq %r10,%r14 - addq %r15,%r12 - movq %r11,%r15 + xorq %r11,%r15 rorq $14,%r13 - andq %r10,%r9 - andq %rax,%r15 + movq %r11,%r9 + andq %r15,%rdi rorq $28,%r14 addq %r13,%r12 - addq %r15,%r9 + xorq %rdi,%r9 addq %r12,%rbx addq %r12,%r9 - leaq 1(%rdi),%rdi - addq %r14,%r9 + leaq 8(%rbp),%rbp + addq %r14,%r9 movq 24(%rsi),%r12 movq %rbx,%r13 movq %r9,%r14 bswapq %r12 rorq $23,%r13 - movq %rcx,%r15 - movq %r12,24(%rsp) + movq %rcx,%rdi - rorq $5,%r14 xorq %rbx,%r13 - xorq %rdx,%r15 + rorq $5,%r14 + xorq %rdx,%rdi - rorq $4,%r13 - addq %r8,%r12 + movq %r12,24(%rsp) xorq %r9,%r14 + andq %rbx,%rdi - addq (%rbp,%rdi,8),%r12 - andq %rbx,%r15 - movq %r10,%r8 + rorq $4,%r13 + addq %r8,%r12 + xorq %rdx,%rdi rorq $6,%r14 xorq %rbx,%r13 - xorq %rdx,%r15 + addq %rdi,%r12 - xorq %r11,%r8 + movq %r9,%rdi + addq (%rbp),%r12 xorq %r9,%r14 - addq %r15,%r12 - movq %r10,%r15 + xorq %r10,%rdi rorq $14,%r13 - andq %r9,%r8 - andq %r11,%r15 + movq %r10,%r8 + andq %rdi,%r15 rorq $28,%r14 addq %r13,%r12 - addq %r15,%r8 + xorq %r15,%r8 addq %r12,%rax addq %r12,%r8 - leaq 1(%rdi),%rdi - addq %r14,%r8 + leaq 24(%rbp),%rbp + addq %r14,%r8 movq 32(%rsi),%r12 movq %rax,%r13 movq %r8,%r14 bswapq %r12 rorq $23,%r13 movq %rbx,%r15 - movq %r12,32(%rsp) - rorq $5,%r14 xorq %rax,%r13 + rorq $5,%r14 xorq %rcx,%r15 - rorq $4,%r13 - addq %rdx,%r12 + movq %r12,32(%rsp) xorq %r8,%r14 - - addq (%rbp,%rdi,8),%r12 andq %rax,%r15 - movq %r9,%rdx + + rorq $4,%r13 + addq %rdx,%r12 + xorq %rcx,%r15 rorq $6,%r14 xorq %rax,%r13 - xorq %rcx,%r15 + addq %r15,%r12 - xorq %r10,%rdx + movq %r8,%r15 + addq (%rbp),%r12 xorq %r8,%r14 - addq %r15,%r12 - movq %r9,%r15 + xorq %r9,%r15 rorq $14,%r13 - andq %r8,%rdx - andq %r10,%r15 + movq %r9,%rdx + andq %r15,%rdi rorq $28,%r14 addq %r13,%r12 - addq %r15,%rdx + xorq %rdi,%rdx addq %r12,%r11 addq %r12,%rdx - leaq 1(%rdi),%rdi - addq %r14,%rdx + leaq 8(%rbp),%rbp + addq %r14,%rdx movq 40(%rsi),%r12 movq %r11,%r13 movq %rdx,%r14 bswapq %r12 rorq $23,%r13 - movq %rax,%r15 - movq %r12,40(%rsp) + movq %rax,%rdi - rorq $5,%r14 xorq %r11,%r13 - xorq %rbx,%r15 + rorq $5,%r14 + xorq %rbx,%rdi - rorq $4,%r13 - addq %rcx,%r12 + movq %r12,40(%rsp) xorq %rdx,%r14 + andq %r11,%rdi - addq (%rbp,%rdi,8),%r12 - andq %r11,%r15 - movq %r8,%rcx + rorq $4,%r13 + addq %rcx,%r12 + xorq %rbx,%rdi rorq $6,%r14 xorq %r11,%r13 - xorq %rbx,%r15 + addq %rdi,%r12 - xorq %r9,%rcx + movq %rdx,%rdi + addq (%rbp),%r12 xorq %rdx,%r14 - addq %r15,%r12 - movq %r8,%r15 + xorq %r8,%rdi rorq $14,%r13 - andq %rdx,%rcx - andq %r9,%r15 + movq %r8,%rcx + andq %rdi,%r15 rorq $28,%r14 addq %r13,%r12 - addq %r15,%rcx + xorq %r15,%rcx addq %r12,%r10 addq %r12,%rcx - leaq 1(%rdi),%rdi - addq %r14,%rcx + leaq 24(%rbp),%rbp + addq %r14,%rcx movq 48(%rsi),%r12 movq %r10,%r13 movq %rcx,%r14 bswapq %r12 rorq $23,%r13 movq %r11,%r15 - movq %r12,48(%rsp) - rorq $5,%r14 xorq %r10,%r13 + rorq $5,%r14 xorq %rax,%r15 - rorq $4,%r13 - addq %rbx,%r12 + movq %r12,48(%rsp) xorq %rcx,%r14 - - addq (%rbp,%rdi,8),%r12 andq %r10,%r15 - movq %rdx,%rbx + + rorq $4,%r13 + addq %rbx,%r12 + xorq %rax,%r15 rorq $6,%r14 xorq %r10,%r13 - xorq %rax,%r15 + addq %r15,%r12 - xorq %r8,%rbx + movq %rcx,%r15 + addq (%rbp),%r12 xorq %rcx,%r14 - addq %r15,%r12 - movq %rdx,%r15 + xorq %rdx,%r15 rorq $14,%r13 - andq %rcx,%rbx - andq %r8,%r15 + movq %rdx,%rbx + andq %r15,%rdi rorq $28,%r14 addq %r13,%r12 - addq %r15,%rbx + xorq %rdi,%rbx addq %r12,%r9 addq %r12,%rbx - leaq 1(%rdi),%rdi - addq %r14,%rbx + leaq 8(%rbp),%rbp + addq %r14,%rbx movq 56(%rsi),%r12 movq %r9,%r13 movq %rbx,%r14 bswapq %r12 rorq $23,%r13 - movq %r10,%r15 - movq %r12,56(%rsp) + movq %r10,%rdi - rorq $5,%r14 xorq %r9,%r13 - xorq %r11,%r15 + rorq $5,%r14 + xorq %r11,%rdi - rorq $4,%r13 - addq %rax,%r12 + movq %r12,56(%rsp) xorq %rbx,%r14 + andq %r9,%rdi - addq (%rbp,%rdi,8),%r12 - andq %r9,%r15 - movq %rcx,%rax + rorq $4,%r13 + addq %rax,%r12 + xorq %r11,%rdi rorq $6,%r14 xorq %r9,%r13 - xorq %r11,%r15 + addq %rdi,%r12 - xorq %rdx,%rax + movq %rbx,%rdi + addq (%rbp),%r12 xorq %rbx,%r14 - addq %r15,%r12 - movq %rcx,%r15 + xorq %rcx,%rdi rorq $14,%r13 - andq %rbx,%rax - andq %rdx,%r15 + movq %rcx,%rax + andq %rdi,%r15 rorq $28,%r14 addq %r13,%r12 - addq %r15,%rax + xorq %r15,%rax addq %r12,%r8 addq %r12,%rax - leaq 1(%rdi),%rdi - addq %r14,%rax + leaq 24(%rbp),%rbp + addq %r14,%rax movq 64(%rsi),%r12 movq %r8,%r13 movq %rax,%r14 bswapq %r12 rorq $23,%r13 movq %r9,%r15 - movq %r12,64(%rsp) - rorq $5,%r14 xorq %r8,%r13 + rorq $5,%r14 xorq %r10,%r15 - rorq $4,%r13 - addq %r11,%r12 + movq %r12,64(%rsp) xorq %rax,%r14 - - addq (%rbp,%rdi,8),%r12 andq %r8,%r15 - movq %rbx,%r11 + + rorq $4,%r13 + addq %r11,%r12 + xorq %r10,%r15 rorq $6,%r14 xorq %r8,%r13 - xorq %r10,%r15 + addq %r15,%r12 - xorq %rcx,%r11 + movq %rax,%r15 + addq (%rbp),%r12 xorq %rax,%r14 - addq %r15,%r12 - movq %rbx,%r15 + xorq %rbx,%r15 rorq $14,%r13 - andq %rax,%r11 - andq %rcx,%r15 + movq %rbx,%r11 + andq %r15,%rdi rorq $28,%r14 addq %r13,%r12 - addq %r15,%r11 + xorq %rdi,%r11 addq %r12,%rdx addq %r12,%r11 - leaq 1(%rdi),%rdi - addq %r14,%r11 + leaq 8(%rbp),%rbp + addq %r14,%r11 movq 72(%rsi),%r12 movq %rdx,%r13 movq %r11,%r14 bswapq %r12 rorq $23,%r13 - movq %r8,%r15 - movq %r12,72(%rsp) + movq %r8,%rdi - rorq $5,%r14 xorq %rdx,%r13 - xorq %r9,%r15 + rorq $5,%r14 + xorq %r9,%rdi - rorq $4,%r13 - addq %r10,%r12 + movq %r12,72(%rsp) xorq %r11,%r14 + andq %rdx,%rdi - addq (%rbp,%rdi,8),%r12 - andq %rdx,%r15 - movq %rax,%r10 + rorq $4,%r13 + addq %r10,%r12 + xorq %r9,%rdi rorq $6,%r14 xorq %rdx,%r13 - xorq %r9,%r15 + addq %rdi,%r12 - xorq %rbx,%r10 + movq %r11,%rdi + addq (%rbp),%r12 xorq %r11,%r14 - addq %r15,%r12 - movq %rax,%r15 + xorq %rax,%rdi rorq $14,%r13 - andq %r11,%r10 - andq %rbx,%r15 + movq %rax,%r10 + andq %rdi,%r15 rorq $28,%r14 addq %r13,%r12 - addq %r15,%r10 + xorq %r15,%r10 addq %r12,%rcx addq %r12,%r10 - leaq 1(%rdi),%rdi - addq %r14,%r10 + leaq 24(%rbp),%rbp + addq %r14,%r10 movq 80(%rsi),%r12 movq %rcx,%r13 movq %r10,%r14 bswapq %r12 rorq $23,%r13 movq %rdx,%r15 - movq %r12,80(%rsp) - rorq $5,%r14 xorq %rcx,%r13 + rorq $5,%r14 xorq %r8,%r15 - rorq $4,%r13 - addq %r9,%r12 + movq %r12,80(%rsp) xorq %r10,%r14 - - addq (%rbp,%rdi,8),%r12 andq %rcx,%r15 - movq %r11,%r9 + + rorq $4,%r13 + addq %r9,%r12 + xorq %r8,%r15 rorq $6,%r14 xorq %rcx,%r13 - xorq %r8,%r15 + addq %r15,%r12 - xorq %rax,%r9 + movq %r10,%r15 + addq (%rbp),%r12 xorq %r10,%r14 - addq %r15,%r12 - movq %r11,%r15 + xorq %r11,%r15 rorq $14,%r13 - andq %r10,%r9 - andq %rax,%r15 + movq %r11,%r9 + andq %r15,%rdi rorq $28,%r14 addq %r13,%r12 - addq %r15,%r9 + xorq %rdi,%r9 addq %r12,%rbx addq %r12,%r9 - leaq 1(%rdi),%rdi - addq %r14,%r9 + leaq 8(%rbp),%rbp + addq %r14,%r9 movq 88(%rsi),%r12 movq %rbx,%r13 movq %r9,%r14 bswapq %r12 rorq $23,%r13 - movq %rcx,%r15 - movq %r12,88(%rsp) + movq %rcx,%rdi - rorq $5,%r14 xorq %rbx,%r13 - xorq %rdx,%r15 + rorq $5,%r14 + xorq %rdx,%rdi - rorq $4,%r13 - addq %r8,%r12 + movq %r12,88(%rsp) xorq %r9,%r14 + andq %rbx,%rdi - addq (%rbp,%rdi,8),%r12 - andq %rbx,%r15 - movq %r10,%r8 + rorq $4,%r13 + addq %r8,%r12 + xorq %rdx,%rdi rorq $6,%r14 xorq %rbx,%r13 - xorq %rdx,%r15 + addq %rdi,%r12 - xorq %r11,%r8 + movq %r9,%rdi + addq (%rbp),%r12 xorq %r9,%r14 - addq %r15,%r12 - movq %r10,%r15 + xorq %r10,%rdi rorq $14,%r13 - andq %r9,%r8 - andq %r11,%r15 + movq %r10,%r8 + andq %rdi,%r15 rorq $28,%r14 addq %r13,%r12 - addq %r15,%r8 + xorq %r15,%r8 addq %r12,%rax addq %r12,%r8 - leaq 1(%rdi),%rdi - addq %r14,%r8 + leaq 24(%rbp),%rbp + addq %r14,%r8 movq 96(%rsi),%r12 movq %rax,%r13 movq %r8,%r14 bswapq %r12 rorq $23,%r13 movq %rbx,%r15 - movq %r12,96(%rsp) - rorq $5,%r14 xorq %rax,%r13 + rorq $5,%r14 xorq %rcx,%r15 - rorq $4,%r13 - addq %rdx,%r12 + movq %r12,96(%rsp) xorq %r8,%r14 - - addq (%rbp,%rdi,8),%r12 andq %rax,%r15 - movq %r9,%rdx + + rorq $4,%r13 + addq %rdx,%r12 + xorq %rcx,%r15 rorq $6,%r14 xorq %rax,%r13 - xorq %rcx,%r15 + addq %r15,%r12 - xorq %r10,%rdx + movq %r8,%r15 + addq (%rbp),%r12 xorq %r8,%r14 - addq %r15,%r12 - movq %r9,%r15 + xorq %r9,%r15 rorq $14,%r13 - andq %r8,%rdx - andq %r10,%r15 + movq %r9,%rdx + andq %r15,%rdi rorq $28,%r14 addq %r13,%r12 - addq %r15,%rdx + xorq %rdi,%rdx addq %r12,%r11 addq %r12,%rdx - leaq 1(%rdi),%rdi - addq %r14,%rdx + leaq 8(%rbp),%rbp + addq %r14,%rdx movq 104(%rsi),%r12 movq %r11,%r13 movq %rdx,%r14 bswapq %r12 rorq $23,%r13 - movq %rax,%r15 - movq %r12,104(%rsp) + movq %rax,%rdi - rorq $5,%r14 xorq %r11,%r13 - xorq %rbx,%r15 + rorq $5,%r14 + xorq %rbx,%rdi - rorq $4,%r13 - addq %rcx,%r12 + movq %r12,104(%rsp) xorq %rdx,%r14 + andq %r11,%rdi - addq (%rbp,%rdi,8),%r12 - andq %r11,%r15 - movq %r8,%rcx + rorq $4,%r13 + addq %rcx,%r12 + xorq %rbx,%rdi rorq $6,%r14 xorq %r11,%r13 - xorq %rbx,%r15 + addq %rdi,%r12 - xorq %r9,%rcx + movq %rdx,%rdi + addq (%rbp),%r12 xorq %rdx,%r14 - addq %r15,%r12 - movq %r8,%r15 + xorq %r8,%rdi rorq $14,%r13 - andq %rdx,%rcx - andq %r9,%r15 + movq %r8,%rcx + andq %rdi,%r15 rorq $28,%r14 addq %r13,%r12 - addq %r15,%rcx + xorq %r15,%rcx addq %r12,%r10 addq %r12,%rcx - leaq 1(%rdi),%rdi - addq %r14,%rcx + leaq 24(%rbp),%rbp + addq %r14,%rcx movq 112(%rsi),%r12 movq %r10,%r13 movq %rcx,%r14 bswapq %r12 rorq $23,%r13 movq %r11,%r15 - movq %r12,112(%rsp) - rorq $5,%r14 xorq %r10,%r13 + rorq $5,%r14 xorq %rax,%r15 - rorq $4,%r13 - addq %rbx,%r12 + movq %r12,112(%rsp) xorq %rcx,%r14 - - addq (%rbp,%rdi,8),%r12 andq %r10,%r15 - movq %rdx,%rbx + + rorq $4,%r13 + addq %rbx,%r12 + xorq %rax,%r15 rorq $6,%r14 xorq %r10,%r13 - xorq %rax,%r15 + addq %r15,%r12 - xorq %r8,%rbx + movq %rcx,%r15 + addq (%rbp),%r12 xorq %rcx,%r14 - addq %r15,%r12 - movq %rdx,%r15 + xorq %rdx,%r15 rorq $14,%r13 - andq %rcx,%rbx - andq %r8,%r15 + movq %rdx,%rbx + andq %r15,%rdi rorq $28,%r14 addq %r13,%r12 - addq %r15,%rbx + xorq %rdi,%rbx addq %r12,%r9 addq %r12,%rbx - leaq 1(%rdi),%rdi - addq %r14,%rbx + leaq 8(%rbp),%rbp + addq %r14,%rbx movq 120(%rsi),%r12 movq %r9,%r13 movq %rbx,%r14 bswapq %r12 rorq $23,%r13 - movq %r10,%r15 - movq %r12,120(%rsp) + movq %r10,%rdi - rorq $5,%r14 xorq %r9,%r13 - xorq %r11,%r15 + rorq $5,%r14 + xorq %r11,%rdi - rorq $4,%r13 - addq %rax,%r12 + movq %r12,120(%rsp) xorq %rbx,%r14 + andq %r9,%rdi - addq (%rbp,%rdi,8),%r12 - andq %r9,%r15 - movq %rcx,%rax + rorq $4,%r13 + addq %rax,%r12 + xorq %r11,%rdi rorq $6,%r14 xorq %r9,%r13 - xorq %r11,%r15 + addq %rdi,%r12 - xorq %rdx,%rax + movq %rbx,%rdi + addq (%rbp),%r12 xorq %rbx,%r14 - addq %r15,%r12 - movq %rcx,%r15 + xorq %rcx,%rdi rorq $14,%r13 - andq %rbx,%rax - andq %rdx,%r15 + movq %rcx,%rax + andq %rdi,%r15 rorq $28,%r14 addq %r13,%r12 - addq %r15,%rax + xorq %r15,%rax addq %r12,%r8 addq %r12,%rax - leaq 1(%rdi),%rdi - addq %r14,%rax + leaq 24(%rbp),%rbp jmp .Lrounds_16_xx .align 16 .Lrounds_16_xx: movq 8(%rsp),%r13 - movq 112(%rsp),%r14 - movq %r13,%r12 - movq %r14,%r15 + movq 112(%rsp),%r15 - rorq $7,%r12 - xorq %r13,%r12 - shrq $7,%r13 + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%rax + movq %r15,%r14 + rorq $42,%r15 - rorq $1,%r12 xorq %r12,%r13 - movq 72(%rsp),%r12 - - rorq $42,%r15 + shrq $7,%r12 + rorq $1,%r13 xorq %r14,%r15 shrq $6,%r14 rorq $19,%r15 - addq %r13,%r12 - xorq %r15,%r14 + xorq %r13,%r12 + xorq %r14,%r15 + addq 72(%rsp),%r12 addq 0(%rsp),%r12 movq %r8,%r13 - addq %r14,%r12 + addq %r15,%r12 movq %rax,%r14 rorq $23,%r13 movq %r9,%r15 - movq %r12,0(%rsp) - rorq $5,%r14 xorq %r8,%r13 + rorq $5,%r14 xorq %r10,%r15 - rorq $4,%r13 - addq %r11,%r12 + movq %r12,0(%rsp) xorq %rax,%r14 - - addq (%rbp,%rdi,8),%r12 andq %r8,%r15 - movq %rbx,%r11 + + rorq $4,%r13 + addq %r11,%r12 + xorq %r10,%r15 rorq $6,%r14 xorq %r8,%r13 - xorq %r10,%r15 + addq %r15,%r12 - xorq %rcx,%r11 + movq %rax,%r15 + addq (%rbp),%r12 xorq %rax,%r14 - addq %r15,%r12 - movq %rbx,%r15 + xorq %rbx,%r15 rorq $14,%r13 - andq %rax,%r11 - andq %rcx,%r15 + movq %rbx,%r11 + andq %r15,%rdi rorq $28,%r14 addq %r13,%r12 - addq %r15,%r11 + xorq %rdi,%r11 addq %r12,%rdx addq %r12,%r11 - leaq 1(%rdi),%rdi - addq %r14,%r11 + leaq 8(%rbp),%rbp movq 16(%rsp),%r13 - movq 120(%rsp),%r14 - movq %r13,%r12 - movq %r14,%r15 + movq 120(%rsp),%rdi - rorq $7,%r12 - xorq %r13,%r12 - shrq $7,%r13 + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%r11 + movq %rdi,%r14 + rorq $42,%rdi - rorq $1,%r12 xorq %r12,%r13 - movq 80(%rsp),%r12 - - rorq $42,%r15 - xorq %r14,%r15 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%rdi shrq $6,%r14 - rorq $19,%r15 - addq %r13,%r12 - xorq %r15,%r14 + rorq $19,%rdi + xorq %r13,%r12 + xorq %r14,%rdi + addq 80(%rsp),%r12 addq 8(%rsp),%r12 movq %rdx,%r13 - addq %r14,%r12 + addq %rdi,%r12 movq %r11,%r14 rorq $23,%r13 - movq %r8,%r15 - movq %r12,8(%rsp) + movq %r8,%rdi - rorq $5,%r14 xorq %rdx,%r13 - xorq %r9,%r15 + rorq $5,%r14 + xorq %r9,%rdi - rorq $4,%r13 - addq %r10,%r12 + movq %r12,8(%rsp) xorq %r11,%r14 + andq %rdx,%rdi - addq (%rbp,%rdi,8),%r12 - andq %rdx,%r15 - movq %rax,%r10 + rorq $4,%r13 + addq %r10,%r12 + xorq %r9,%rdi rorq $6,%r14 xorq %rdx,%r13 - xorq %r9,%r15 + addq %rdi,%r12 - xorq %rbx,%r10 + movq %r11,%rdi + addq (%rbp),%r12 xorq %r11,%r14 - addq %r15,%r12 - movq %rax,%r15 + xorq %rax,%rdi rorq $14,%r13 - andq %r11,%r10 - andq %rbx,%r15 + movq %rax,%r10 + andq %rdi,%r15 rorq $28,%r14 addq %r13,%r12 - addq %r15,%r10 + xorq %r15,%r10 addq %r12,%rcx addq %r12,%r10 - leaq 1(%rdi),%rdi - addq %r14,%r10 + leaq 24(%rbp),%rbp movq 24(%rsp),%r13 - movq 0(%rsp),%r14 - movq %r13,%r12 - movq %r14,%r15 + movq 0(%rsp),%r15 - rorq $7,%r12 - xorq %r13,%r12 - shrq $7,%r13 + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%r10 + movq %r15,%r14 + rorq $42,%r15 - rorq $1,%r12 xorq %r12,%r13 - movq 88(%rsp),%r12 - - rorq $42,%r15 + shrq $7,%r12 + rorq $1,%r13 xorq %r14,%r15 shrq $6,%r14 rorq $19,%r15 - addq %r13,%r12 - xorq %r15,%r14 + xorq %r13,%r12 + xorq %r14,%r15 + addq 88(%rsp),%r12 addq 16(%rsp),%r12 movq %rcx,%r13 - addq %r14,%r12 + addq %r15,%r12 movq %r10,%r14 rorq $23,%r13 movq %rdx,%r15 - movq %r12,16(%rsp) - rorq $5,%r14 xorq %rcx,%r13 + rorq $5,%r14 xorq %r8,%r15 - rorq $4,%r13 - addq %r9,%r12 + movq %r12,16(%rsp) xorq %r10,%r14 - - addq (%rbp,%rdi,8),%r12 andq %rcx,%r15 - movq %r11,%r9 + + rorq $4,%r13 + addq %r9,%r12 + xorq %r8,%r15 rorq $6,%r14 xorq %rcx,%r13 - xorq %r8,%r15 + addq %r15,%r12 - xorq %rax,%r9 + movq %r10,%r15 + addq (%rbp),%r12 xorq %r10,%r14 - addq %r15,%r12 - movq %r11,%r15 + xorq %r11,%r15 rorq $14,%r13 - andq %r10,%r9 - andq %rax,%r15 + movq %r11,%r9 + andq %r15,%rdi rorq $28,%r14 addq %r13,%r12 - addq %r15,%r9 + xorq %rdi,%r9 addq %r12,%rbx addq %r12,%r9 - leaq 1(%rdi),%rdi - addq %r14,%r9 + leaq 8(%rbp),%rbp movq 32(%rsp),%r13 - movq 8(%rsp),%r14 - movq %r13,%r12 - movq %r14,%r15 + movq 8(%rsp),%rdi - rorq $7,%r12 - xorq %r13,%r12 - shrq $7,%r13 + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%r9 + movq %rdi,%r14 + rorq $42,%rdi - rorq $1,%r12 xorq %r12,%r13 - movq 96(%rsp),%r12 - - rorq $42,%r15 - xorq %r14,%r15 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%rdi shrq $6,%r14 - rorq $19,%r15 - addq %r13,%r12 - xorq %r15,%r14 + rorq $19,%rdi + xorq %r13,%r12 + xorq %r14,%rdi + addq 96(%rsp),%r12 addq 24(%rsp),%r12 movq %rbx,%r13 - addq %r14,%r12 + addq %rdi,%r12 movq %r9,%r14 rorq $23,%r13 - movq %rcx,%r15 - movq %r12,24(%rsp) + movq %rcx,%rdi - rorq $5,%r14 xorq %rbx,%r13 - xorq %rdx,%r15 + rorq $5,%r14 + xorq %rdx,%rdi - rorq $4,%r13 - addq %r8,%r12 + movq %r12,24(%rsp) xorq %r9,%r14 + andq %rbx,%rdi - addq (%rbp,%rdi,8),%r12 - andq %rbx,%r15 - movq %r10,%r8 + rorq $4,%r13 + addq %r8,%r12 + xorq %rdx,%rdi rorq $6,%r14 xorq %rbx,%r13 - xorq %rdx,%r15 + addq %rdi,%r12 - xorq %r11,%r8 + movq %r9,%rdi + addq (%rbp),%r12 xorq %r9,%r14 - addq %r15,%r12 - movq %r10,%r15 + xorq %r10,%rdi rorq $14,%r13 - andq %r9,%r8 - andq %r11,%r15 + movq %r10,%r8 + andq %rdi,%r15 rorq $28,%r14 addq %r13,%r12 - addq %r15,%r8 + xorq %r15,%r8 addq %r12,%rax addq %r12,%r8 - leaq 1(%rdi),%rdi - addq %r14,%r8 + leaq 24(%rbp),%rbp movq 40(%rsp),%r13 - movq 16(%rsp),%r14 - movq %r13,%r12 - movq %r14,%r15 + movq 16(%rsp),%r15 - rorq $7,%r12 - xorq %r13,%r12 - shrq $7,%r13 + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%r8 + movq %r15,%r14 + rorq $42,%r15 - rorq $1,%r12 xorq %r12,%r13 - movq 104(%rsp),%r12 - - rorq $42,%r15 + shrq $7,%r12 + rorq $1,%r13 xorq %r14,%r15 shrq $6,%r14 rorq $19,%r15 - addq %r13,%r12 - xorq %r15,%r14 + xorq %r13,%r12 + xorq %r14,%r15 + addq 104(%rsp),%r12 addq 32(%rsp),%r12 movq %rax,%r13 - addq %r14,%r12 + addq %r15,%r12 movq %r8,%r14 rorq $23,%r13 movq %rbx,%r15 - movq %r12,32(%rsp) - rorq $5,%r14 xorq %rax,%r13 + rorq $5,%r14 xorq %rcx,%r15 - rorq $4,%r13 - addq %rdx,%r12 + movq %r12,32(%rsp) xorq %r8,%r14 - - addq (%rbp,%rdi,8),%r12 andq %rax,%r15 - movq %r9,%rdx + + rorq $4,%r13 + addq %rdx,%r12 + xorq %rcx,%r15 rorq $6,%r14 xorq %rax,%r13 - xorq %rcx,%r15 + addq %r15,%r12 - xorq %r10,%rdx + movq %r8,%r15 + addq (%rbp),%r12 xorq %r8,%r14 - addq %r15,%r12 - movq %r9,%r15 + xorq %r9,%r15 rorq $14,%r13 - andq %r8,%rdx - andq %r10,%r15 + movq %r9,%rdx + andq %r15,%rdi rorq $28,%r14 addq %r13,%r12 - addq %r15,%rdx + xorq %rdi,%rdx addq %r12,%r11 addq %r12,%rdx - leaq 1(%rdi),%rdi - addq %r14,%rdx + leaq 8(%rbp),%rbp movq 48(%rsp),%r13 - movq 24(%rsp),%r14 - movq %r13,%r12 - movq %r14,%r15 + movq 24(%rsp),%rdi - rorq $7,%r12 - xorq %r13,%r12 - shrq $7,%r13 + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%rdx + movq %rdi,%r14 + rorq $42,%rdi - rorq $1,%r12 xorq %r12,%r13 - movq 112(%rsp),%r12 - - rorq $42,%r15 - xorq %r14,%r15 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%rdi shrq $6,%r14 - rorq $19,%r15 - addq %r13,%r12 - xorq %r15,%r14 + rorq $19,%rdi + xorq %r13,%r12 + xorq %r14,%rdi + addq 112(%rsp),%r12 addq 40(%rsp),%r12 movq %r11,%r13 - addq %r14,%r12 + addq %rdi,%r12 movq %rdx,%r14 rorq $23,%r13 - movq %rax,%r15 - movq %r12,40(%rsp) + movq %rax,%rdi - rorq $5,%r14 xorq %r11,%r13 - xorq %rbx,%r15 + rorq $5,%r14 + xorq %rbx,%rdi - rorq $4,%r13 - addq %rcx,%r12 + movq %r12,40(%rsp) xorq %rdx,%r14 + andq %r11,%rdi - addq (%rbp,%rdi,8),%r12 - andq %r11,%r15 - movq %r8,%rcx + rorq $4,%r13 + addq %rcx,%r12 + xorq %rbx,%rdi rorq $6,%r14 xorq %r11,%r13 - xorq %rbx,%r15 + addq %rdi,%r12 - xorq %r9,%rcx + movq %rdx,%rdi + addq (%rbp),%r12 xorq %rdx,%r14 - addq %r15,%r12 - movq %r8,%r15 + xorq %r8,%rdi rorq $14,%r13 - andq %rdx,%rcx - andq %r9,%r15 + movq %r8,%rcx + andq %rdi,%r15 rorq $28,%r14 addq %r13,%r12 - addq %r15,%rcx + xorq %r15,%rcx addq %r12,%r10 addq %r12,%rcx - leaq 1(%rdi),%rdi - addq %r14,%rcx + leaq 24(%rbp),%rbp movq 56(%rsp),%r13 - movq 32(%rsp),%r14 - movq %r13,%r12 - movq %r14,%r15 + movq 32(%rsp),%r15 - rorq $7,%r12 - xorq %r13,%r12 - shrq $7,%r13 + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%rcx + movq %r15,%r14 + rorq $42,%r15 - rorq $1,%r12 xorq %r12,%r13 - movq 120(%rsp),%r12 - - rorq $42,%r15 + shrq $7,%r12 + rorq $1,%r13 xorq %r14,%r15 shrq $6,%r14 rorq $19,%r15 - addq %r13,%r12 - xorq %r15,%r14 + xorq %r13,%r12 + xorq %r14,%r15 + addq 120(%rsp),%r12 addq 48(%rsp),%r12 movq %r10,%r13 - addq %r14,%r12 + addq %r15,%r12 movq %rcx,%r14 rorq $23,%r13 movq %r11,%r15 - movq %r12,48(%rsp) - rorq $5,%r14 xorq %r10,%r13 + rorq $5,%r14 xorq %rax,%r15 - rorq $4,%r13 - addq %rbx,%r12 + movq %r12,48(%rsp) xorq %rcx,%r14 - - addq (%rbp,%rdi,8),%r12 andq %r10,%r15 - movq %rdx,%rbx + + rorq $4,%r13 + addq %rbx,%r12 + xorq %rax,%r15 rorq $6,%r14 xorq %r10,%r13 - xorq %rax,%r15 + addq %r15,%r12 - xorq %r8,%rbx + movq %rcx,%r15 + addq (%rbp),%r12 xorq %rcx,%r14 - addq %r15,%r12 - movq %rdx,%r15 + xorq %rdx,%r15 rorq $14,%r13 - andq %rcx,%rbx - andq %r8,%r15 + movq %rdx,%rbx + andq %r15,%rdi rorq $28,%r14 addq %r13,%r12 - addq %r15,%rbx + xorq %rdi,%rbx addq %r12,%r9 addq %r12,%rbx - leaq 1(%rdi),%rdi - addq %r14,%rbx + leaq 8(%rbp),%rbp movq 64(%rsp),%r13 - movq 40(%rsp),%r14 - movq %r13,%r12 - movq %r14,%r15 + movq 40(%rsp),%rdi - rorq $7,%r12 - xorq %r13,%r12 - shrq $7,%r13 + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%rbx + movq %rdi,%r14 + rorq $42,%rdi - rorq $1,%r12 xorq %r12,%r13 - movq 0(%rsp),%r12 - - rorq $42,%r15 - xorq %r14,%r15 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%rdi shrq $6,%r14 - rorq $19,%r15 - addq %r13,%r12 - xorq %r15,%r14 + rorq $19,%rdi + xorq %r13,%r12 + xorq %r14,%rdi + addq 0(%rsp),%r12 addq 56(%rsp),%r12 movq %r9,%r13 - addq %r14,%r12 + addq %rdi,%r12 movq %rbx,%r14 rorq $23,%r13 - movq %r10,%r15 - movq %r12,56(%rsp) + movq %r10,%rdi - rorq $5,%r14 xorq %r9,%r13 - xorq %r11,%r15 + rorq $5,%r14 + xorq %r11,%rdi - rorq $4,%r13 - addq %rax,%r12 + movq %r12,56(%rsp) xorq %rbx,%r14 + andq %r9,%rdi - addq (%rbp,%rdi,8),%r12 - andq %r9,%r15 - movq %rcx,%rax + rorq $4,%r13 + addq %rax,%r12 + xorq %r11,%rdi rorq $6,%r14 xorq %r9,%r13 - xorq %r11,%r15 + addq %rdi,%r12 - xorq %rdx,%rax + movq %rbx,%rdi + addq (%rbp),%r12 xorq %rbx,%r14 - addq %r15,%r12 - movq %rcx,%r15 + xorq %rcx,%rdi rorq $14,%r13 - andq %rbx,%rax - andq %rdx,%r15 + movq %rcx,%rax + andq %rdi,%r15 rorq $28,%r14 addq %r13,%r12 - addq %r15,%rax + xorq %r15,%rax addq %r12,%r8 addq %r12,%rax - leaq 1(%rdi),%rdi - addq %r14,%rax + leaq 24(%rbp),%rbp movq 72(%rsp),%r13 - movq 48(%rsp),%r14 - movq %r13,%r12 - movq %r14,%r15 + movq 48(%rsp),%r15 - rorq $7,%r12 - xorq %r13,%r12 - shrq $7,%r13 + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%rax + movq %r15,%r14 + rorq $42,%r15 - rorq $1,%r12 xorq %r12,%r13 - movq 8(%rsp),%r12 - - rorq $42,%r15 + shrq $7,%r12 + rorq $1,%r13 xorq %r14,%r15 shrq $6,%r14 rorq $19,%r15 - addq %r13,%r12 - xorq %r15,%r14 + xorq %r13,%r12 + xorq %r14,%r15 + addq 8(%rsp),%r12 addq 64(%rsp),%r12 movq %r8,%r13 - addq %r14,%r12 + addq %r15,%r12 movq %rax,%r14 rorq $23,%r13 movq %r9,%r15 - movq %r12,64(%rsp) - rorq $5,%r14 xorq %r8,%r13 + rorq $5,%r14 xorq %r10,%r15 - rorq $4,%r13 - addq %r11,%r12 + movq %r12,64(%rsp) xorq %rax,%r14 - - addq (%rbp,%rdi,8),%r12 andq %r8,%r15 - movq %rbx,%r11 + + rorq $4,%r13 + addq %r11,%r12 + xorq %r10,%r15 rorq $6,%r14 xorq %r8,%r13 - xorq %r10,%r15 + addq %r15,%r12 - xorq %rcx,%r11 + movq %rax,%r15 + addq (%rbp),%r12 xorq %rax,%r14 - addq %r15,%r12 - movq %rbx,%r15 + xorq %rbx,%r15 rorq $14,%r13 - andq %rax,%r11 - andq %rcx,%r15 + movq %rbx,%r11 + andq %r15,%rdi rorq $28,%r14 addq %r13,%r12 - addq %r15,%r11 + xorq %rdi,%r11 addq %r12,%rdx addq %r12,%r11 - leaq 1(%rdi),%rdi - addq %r14,%r11 + leaq 8(%rbp),%rbp movq 80(%rsp),%r13 - movq 56(%rsp),%r14 - movq %r13,%r12 - movq %r14,%r15 + movq 56(%rsp),%rdi - rorq $7,%r12 - xorq %r13,%r12 - shrq $7,%r13 + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%r11 + movq %rdi,%r14 + rorq $42,%rdi - rorq $1,%r12 xorq %r12,%r13 - movq 16(%rsp),%r12 - - rorq $42,%r15 - xorq %r14,%r15 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%rdi shrq $6,%r14 - rorq $19,%r15 - addq %r13,%r12 - xorq %r15,%r14 + rorq $19,%rdi + xorq %r13,%r12 + xorq %r14,%rdi + addq 16(%rsp),%r12 addq 72(%rsp),%r12 movq %rdx,%r13 - addq %r14,%r12 + addq %rdi,%r12 movq %r11,%r14 rorq $23,%r13 - movq %r8,%r15 - movq %r12,72(%rsp) + movq %r8,%rdi - rorq $5,%r14 xorq %rdx,%r13 - xorq %r9,%r15 + rorq $5,%r14 + xorq %r9,%rdi - rorq $4,%r13 - addq %r10,%r12 + movq %r12,72(%rsp) xorq %r11,%r14 + andq %rdx,%rdi - addq (%rbp,%rdi,8),%r12 - andq %rdx,%r15 - movq %rax,%r10 + rorq $4,%r13 + addq %r10,%r12 + xorq %r9,%rdi rorq $6,%r14 xorq %rdx,%r13 - xorq %r9,%r15 + addq %rdi,%r12 - xorq %rbx,%r10 + movq %r11,%rdi + addq (%rbp),%r12 xorq %r11,%r14 - addq %r15,%r12 - movq %rax,%r15 + xorq %rax,%rdi rorq $14,%r13 - andq %r11,%r10 - andq %rbx,%r15 + movq %rax,%r10 + andq %rdi,%r15 rorq $28,%r14 addq %r13,%r12 - addq %r15,%r10 + xorq %r15,%r10 addq %r12,%rcx addq %r12,%r10 - leaq 1(%rdi),%rdi - addq %r14,%r10 + leaq 24(%rbp),%rbp movq 88(%rsp),%r13 - movq 64(%rsp),%r14 - movq %r13,%r12 - movq %r14,%r15 + movq 64(%rsp),%r15 - rorq $7,%r12 - xorq %r13,%r12 - shrq $7,%r13 + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%r10 + movq %r15,%r14 + rorq $42,%r15 - rorq $1,%r12 xorq %r12,%r13 - movq 24(%rsp),%r12 - - rorq $42,%r15 + shrq $7,%r12 + rorq $1,%r13 xorq %r14,%r15 shrq $6,%r14 rorq $19,%r15 - addq %r13,%r12 - xorq %r15,%r14 + xorq %r13,%r12 + xorq %r14,%r15 + addq 24(%rsp),%r12 addq 80(%rsp),%r12 movq %rcx,%r13 - addq %r14,%r12 + addq %r15,%r12 movq %r10,%r14 rorq $23,%r13 movq %rdx,%r15 - movq %r12,80(%rsp) - rorq $5,%r14 xorq %rcx,%r13 + rorq $5,%r14 xorq %r8,%r15 - rorq $4,%r13 - addq %r9,%r12 + movq %r12,80(%rsp) xorq %r10,%r14 - - addq (%rbp,%rdi,8),%r12 andq %rcx,%r15 - movq %r11,%r9 + + rorq $4,%r13 + addq %r9,%r12 + xorq %r8,%r15 rorq $6,%r14 xorq %rcx,%r13 - xorq %r8,%r15 + addq %r15,%r12 - xorq %rax,%r9 + movq %r10,%r15 + addq (%rbp),%r12 xorq %r10,%r14 - addq %r15,%r12 - movq %r11,%r15 + xorq %r11,%r15 rorq $14,%r13 - andq %r10,%r9 - andq %rax,%r15 + movq %r11,%r9 + andq %r15,%rdi rorq $28,%r14 addq %r13,%r12 - addq %r15,%r9 + xorq %rdi,%r9 addq %r12,%rbx addq %r12,%r9 - leaq 1(%rdi),%rdi - addq %r14,%r9 + leaq 8(%rbp),%rbp movq 96(%rsp),%r13 - movq 72(%rsp),%r14 - movq %r13,%r12 - movq %r14,%r15 + movq 72(%rsp),%rdi - rorq $7,%r12 - xorq %r13,%r12 - shrq $7,%r13 + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%r9 + movq %rdi,%r14 + rorq $42,%rdi - rorq $1,%r12 xorq %r12,%r13 - movq 32(%rsp),%r12 - - rorq $42,%r15 - xorq %r14,%r15 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%rdi shrq $6,%r14 - rorq $19,%r15 - addq %r13,%r12 - xorq %r15,%r14 + rorq $19,%rdi + xorq %r13,%r12 + xorq %r14,%rdi + addq 32(%rsp),%r12 addq 88(%rsp),%r12 movq %rbx,%r13 - addq %r14,%r12 + addq %rdi,%r12 movq %r9,%r14 rorq $23,%r13 - movq %rcx,%r15 - movq %r12,88(%rsp) + movq %rcx,%rdi - rorq $5,%r14 xorq %rbx,%r13 - xorq %rdx,%r15 + rorq $5,%r14 + xorq %rdx,%rdi - rorq $4,%r13 - addq %r8,%r12 + movq %r12,88(%rsp) xorq %r9,%r14 + andq %rbx,%rdi - addq (%rbp,%rdi,8),%r12 - andq %rbx,%r15 - movq %r10,%r8 + rorq $4,%r13 + addq %r8,%r12 + xorq %rdx,%rdi rorq $6,%r14 xorq %rbx,%r13 - xorq %rdx,%r15 + addq %rdi,%r12 - xorq %r11,%r8 + movq %r9,%rdi + addq (%rbp),%r12 xorq %r9,%r14 - addq %r15,%r12 - movq %r10,%r15 + xorq %r10,%rdi rorq $14,%r13 - andq %r9,%r8 - andq %r11,%r15 + movq %r10,%r8 + andq %rdi,%r15 rorq $28,%r14 addq %r13,%r12 - addq %r15,%r8 + xorq %r15,%r8 addq %r12,%rax addq %r12,%r8 - leaq 1(%rdi),%rdi - addq %r14,%r8 + leaq 24(%rbp),%rbp movq 104(%rsp),%r13 - movq 80(%rsp),%r14 - movq %r13,%r12 - movq %r14,%r15 + movq 80(%rsp),%r15 - rorq $7,%r12 - xorq %r13,%r12 - shrq $7,%r13 + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%r8 + movq %r15,%r14 + rorq $42,%r15 - rorq $1,%r12 xorq %r12,%r13 - movq 40(%rsp),%r12 - - rorq $42,%r15 + shrq $7,%r12 + rorq $1,%r13 xorq %r14,%r15 shrq $6,%r14 rorq $19,%r15 - addq %r13,%r12 - xorq %r15,%r14 + xorq %r13,%r12 + xorq %r14,%r15 + addq 40(%rsp),%r12 addq 96(%rsp),%r12 movq %rax,%r13 - addq %r14,%r12 + addq %r15,%r12 movq %r8,%r14 rorq $23,%r13 movq %rbx,%r15 - movq %r12,96(%rsp) - rorq $5,%r14 xorq %rax,%r13 + rorq $5,%r14 xorq %rcx,%r15 - rorq $4,%r13 - addq %rdx,%r12 + movq %r12,96(%rsp) xorq %r8,%r14 - - addq (%rbp,%rdi,8),%r12 andq %rax,%r15 - movq %r9,%rdx + + rorq $4,%r13 + addq %rdx,%r12 + xorq %rcx,%r15 rorq $6,%r14 xorq %rax,%r13 - xorq %rcx,%r15 + addq %r15,%r12 - xorq %r10,%rdx + movq %r8,%r15 + addq (%rbp),%r12 xorq %r8,%r14 - addq %r15,%r12 - movq %r9,%r15 + xorq %r9,%r15 rorq $14,%r13 - andq %r8,%rdx - andq %r10,%r15 + movq %r9,%rdx + andq %r15,%rdi rorq $28,%r14 addq %r13,%r12 - addq %r15,%rdx + xorq %rdi,%rdx addq %r12,%r11 addq %r12,%rdx - leaq 1(%rdi),%rdi - addq %r14,%rdx + leaq 8(%rbp),%rbp movq 112(%rsp),%r13 - movq 88(%rsp),%r14 - movq %r13,%r12 - movq %r14,%r15 + movq 88(%rsp),%rdi - rorq $7,%r12 - xorq %r13,%r12 - shrq $7,%r13 + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%rdx + movq %rdi,%r14 + rorq $42,%rdi - rorq $1,%r12 xorq %r12,%r13 - movq 48(%rsp),%r12 - - rorq $42,%r15 - xorq %r14,%r15 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%rdi shrq $6,%r14 - rorq $19,%r15 - addq %r13,%r12 - xorq %r15,%r14 + rorq $19,%rdi + xorq %r13,%r12 + xorq %r14,%rdi + addq 48(%rsp),%r12 addq 104(%rsp),%r12 movq %r11,%r13 - addq %r14,%r12 + addq %rdi,%r12 movq %rdx,%r14 rorq $23,%r13 - movq %rax,%r15 - movq %r12,104(%rsp) + movq %rax,%rdi - rorq $5,%r14 xorq %r11,%r13 - xorq %rbx,%r15 + rorq $5,%r14 + xorq %rbx,%rdi - rorq $4,%r13 - addq %rcx,%r12 + movq %r12,104(%rsp) xorq %rdx,%r14 + andq %r11,%rdi - addq (%rbp,%rdi,8),%r12 - andq %r11,%r15 - movq %r8,%rcx + rorq $4,%r13 + addq %rcx,%r12 + xorq %rbx,%rdi rorq $6,%r14 xorq %r11,%r13 - xorq %rbx,%r15 + addq %rdi,%r12 - xorq %r9,%rcx + movq %rdx,%rdi + addq (%rbp),%r12 xorq %rdx,%r14 - addq %r15,%r12 - movq %r8,%r15 + xorq %r8,%rdi rorq $14,%r13 - andq %rdx,%rcx - andq %r9,%r15 + movq %r8,%rcx + andq %rdi,%r15 rorq $28,%r14 addq %r13,%r12 - addq %r15,%rcx + xorq %r15,%rcx addq %r12,%r10 addq %r12,%rcx - leaq 1(%rdi),%rdi - addq %r14,%rcx + leaq 24(%rbp),%rbp movq 120(%rsp),%r13 - movq 96(%rsp),%r14 - movq %r13,%r12 - movq %r14,%r15 + movq 96(%rsp),%r15 - rorq $7,%r12 - xorq %r13,%r12 - shrq $7,%r13 + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%rcx + movq %r15,%r14 + rorq $42,%r15 - rorq $1,%r12 xorq %r12,%r13 - movq 56(%rsp),%r12 - - rorq $42,%r15 + shrq $7,%r12 + rorq $1,%r13 xorq %r14,%r15 shrq $6,%r14 rorq $19,%r15 - addq %r13,%r12 - xorq %r15,%r14 + xorq %r13,%r12 + xorq %r14,%r15 + addq 56(%rsp),%r12 addq 112(%rsp),%r12 movq %r10,%r13 - addq %r14,%r12 + addq %r15,%r12 movq %rcx,%r14 rorq $23,%r13 movq %r11,%r15 - movq %r12,112(%rsp) - rorq $5,%r14 xorq %r10,%r13 + rorq $5,%r14 xorq %rax,%r15 - rorq $4,%r13 - addq %rbx,%r12 + movq %r12,112(%rsp) xorq %rcx,%r14 - - addq (%rbp,%rdi,8),%r12 andq %r10,%r15 - movq %rdx,%rbx + + rorq $4,%r13 + addq %rbx,%r12 + xorq %rax,%r15 rorq $6,%r14 xorq %r10,%r13 - xorq %rax,%r15 + addq %r15,%r12 - xorq %r8,%rbx + movq %rcx,%r15 + addq (%rbp),%r12 xorq %rcx,%r14 - addq %r15,%r12 - movq %rdx,%r15 + xorq %rdx,%r15 rorq $14,%r13 - andq %rcx,%rbx - andq %r8,%r15 + movq %rdx,%rbx + andq %r15,%rdi rorq $28,%r14 addq %r13,%r12 - addq %r15,%rbx + xorq %rdi,%rbx addq %r12,%r9 addq %r12,%rbx - leaq 1(%rdi),%rdi - addq %r14,%rbx + leaq 8(%rbp),%rbp movq 0(%rsp),%r13 - movq 104(%rsp),%r14 - movq %r13,%r12 - movq %r14,%r15 + movq 104(%rsp),%rdi - rorq $7,%r12 - xorq %r13,%r12 - shrq $7,%r13 + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%rbx + movq %rdi,%r14 + rorq $42,%rdi - rorq $1,%r12 xorq %r12,%r13 - movq 64(%rsp),%r12 - - rorq $42,%r15 - xorq %r14,%r15 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%rdi shrq $6,%r14 - rorq $19,%r15 - addq %r13,%r12 - xorq %r15,%r14 + rorq $19,%rdi + xorq %r13,%r12 + xorq %r14,%rdi + addq 64(%rsp),%r12 addq 120(%rsp),%r12 movq %r9,%r13 - addq %r14,%r12 + addq %rdi,%r12 movq %rbx,%r14 rorq $23,%r13 - movq %r10,%r15 - movq %r12,120(%rsp) + movq %r10,%rdi - rorq $5,%r14 xorq %r9,%r13 - xorq %r11,%r15 + rorq $5,%r14 + xorq %r11,%rdi - rorq $4,%r13 - addq %rax,%r12 + movq %r12,120(%rsp) xorq %rbx,%r14 + andq %r9,%rdi - addq (%rbp,%rdi,8),%r12 - andq %r9,%r15 - movq %rcx,%rax + rorq $4,%r13 + addq %rax,%r12 + xorq %r11,%rdi rorq $6,%r14 xorq %r9,%r13 - xorq %r11,%r15 + addq %rdi,%r12 - xorq %rdx,%rax + movq %rbx,%rdi + addq (%rbp),%r12 xorq %rbx,%r14 - addq %r15,%r12 - movq %rcx,%r15 + xorq %rcx,%rdi rorq $14,%r13 - andq %rbx,%rax - andq %rdx,%r15 + movq %rcx,%rax + andq %rdi,%r15 rorq $28,%r14 addq %r13,%r12 - addq %r15,%rax + xorq %r15,%rax addq %r12,%r8 addq %r12,%rax - leaq 1(%rdi),%rdi - addq %r14,%rax - cmpq $80,%rdi - jb .Lrounds_16_xx + leaq 24(%rbp),%rbp + cmpb $0,7(%rbp) + jnz .Lrounds_16_xx movq 128+0(%rsp),%rdi + addq %r14,%rax leaq 128(%rsi),%rsi addq 0(%rdi),%rax @@ -1762,42 +1699,86 @@ sha512_block_data_order: .type K512,@object K512: .quad 0x428a2f98d728ae22,0x7137449123ef65cd +.quad 0x428a2f98d728ae22,0x7137449123ef65cd +.quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc .quad 0x3956c25bf348b538,0x59f111f1b605d019 +.quad 0x3956c25bf348b538,0x59f111f1b605d019 +.quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 .quad 0xd807aa98a3030242,0x12835b0145706fbe +.quad 0xd807aa98a3030242,0x12835b0145706fbe +.quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 +.quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 +.quad 0x9bdc06a725c71235,0xc19bf174cf692694 .quad 0x9bdc06a725c71235,0xc19bf174cf692694 .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 +.quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 +.quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 +.quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 +.quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 .quad 0x983e5152ee66dfab,0xa831c66d2db43210 +.quad 0x983e5152ee66dfab,0xa831c66d2db43210 +.quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 +.quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 .quad 0x06ca6351e003826f,0x142929670a0e6e70 +.quad 0x06ca6351e003826f,0x142929670a0e6e70 +.quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df +.quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df +.quad 0x650a73548baf63de,0x766a0abb3c77b2a8 .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 .quad 0x81c2c92e47edaee6,0x92722c851482353b +.quad 0x81c2c92e47edaee6,0x92722c851482353b +.quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 +.quad 0xc24b8b70d0f89791,0xc76c51a30654be30 .quad 0xd192e819d6ef5218,0xd69906245565a910 +.quad 0xd192e819d6ef5218,0xd69906245565a910 +.quad 0xf40e35855771202a,0x106aa07032bbd1b8 .quad 0xf40e35855771202a,0x106aa07032bbd1b8 .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 +.quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 +.quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 +.quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 +.quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 +.quad 0x748f82ee5defb2fc,0x78a5636f43172f60 +.quad 0x84c87814a1f0ab72,0x8cc702081a6439ec .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec .quad 0x90befffa23631e28,0xa4506cebde82bde9 +.quad 0x90befffa23631e28,0xa4506cebde82bde9 +.quad 0xbef9a3f7b2c67915,0xc67178f2e372532b .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b .quad 0xca273eceea26619c,0xd186b8c721c0c207 +.quad 0xca273eceea26619c,0xd186b8c721c0c207 +.quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 +.quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 .quad 0x113f9804bef90dae,0x1b710b35131c471b +.quad 0x113f9804bef90dae,0x1b710b35131c471b +.quad 0x28db77f523047d84,0x32caab7b40c72493 .quad 0x28db77f523047d84,0x32caab7b40c72493 .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c +.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a +.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a +.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 + +.quad 0x0001020304050607,0x08090a0b0c0d0e0f +.quad 0x0001020304050607,0x08090a0b0c0d0e0f +.byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 diff --git a/secure/lib/libcrypto/amd64/vpaes-x86_64.S b/secure/lib/libcrypto/amd64/vpaes-x86_64.S index 8cb9644..8ec5c40 100644 --- a/secure/lib/libcrypto/amd64/vpaes-x86_64.S +++ b/secure/lib/libcrypto/amd64/vpaes-x86_64.S @@ -32,8 +32,8 @@ _vpaes_encrypt_core: movdqa .Lk_ipt+16(%rip),%xmm0 .byte 102,15,56,0,193 pxor %xmm5,%xmm2 - pxor %xmm2,%xmm0 addq $16,%r9 + pxor %xmm2,%xmm0 leaq .Lk_mc_backward(%rip),%r10 jmp .Lenc_entry @@ -41,19 +41,19 @@ _vpaes_encrypt_core: .Lenc_loop: movdqa %xmm13,%xmm4 -.byte 102,15,56,0,226 - pxor %xmm5,%xmm4 movdqa %xmm12,%xmm0 +.byte 102,15,56,0,226 .byte 102,15,56,0,195 - pxor %xmm4,%xmm0 + pxor %xmm5,%xmm4 movdqa %xmm15,%xmm5 -.byte 102,15,56,0,234 + pxor %xmm4,%xmm0 movdqa -64(%r11,%r10,1),%xmm1 +.byte 102,15,56,0,234 + movdqa (%r11,%r10,1),%xmm4 movdqa %xmm14,%xmm2 .byte 102,15,56,0,211 - pxor %xmm5,%xmm2 - movdqa (%r11,%r10,1),%xmm4 movdqa %xmm0,%xmm3 + pxor %xmm5,%xmm2 .byte 102,15,56,0,193 addq $16,%r9 pxor %xmm2,%xmm0 @@ -62,30 +62,30 @@ _vpaes_encrypt_core: pxor %xmm0,%xmm3 .byte 102,15,56,0,193 andq $48,%r11 - pxor %xmm3,%xmm0 subq $1,%rax + pxor %xmm3,%xmm0 .Lenc_entry: movdqa %xmm9,%xmm1 + movdqa %xmm11,%xmm5 pandn %xmm0,%xmm1 psrld $4,%xmm1 pand %xmm9,%xmm0 - movdqa %xmm11,%xmm5 .byte 102,15,56,0,232 - pxor %xmm1,%xmm0 movdqa %xmm10,%xmm3 + pxor %xmm1,%xmm0 .byte 102,15,56,0,217 - pxor %xmm5,%xmm3 movdqa %xmm10,%xmm4 + pxor %xmm5,%xmm3 .byte 102,15,56,0,224 - pxor %xmm5,%xmm4 movdqa %xmm10,%xmm2 + pxor %xmm5,%xmm4 .byte 102,15,56,0,211 - pxor %xmm0,%xmm2 movdqa %xmm10,%xmm3 - movdqu (%r9),%xmm5 + pxor %xmm0,%xmm2 .byte 102,15,56,0,220 + movdqu (%r9),%xmm5 pxor %xmm1,%xmm3 jnz .Lenc_loop @@ -138,62 +138,61 @@ _vpaes_decrypt_core: movdqa -32(%r10),%xmm4 + movdqa -16(%r10),%xmm1 .byte 102,15,56,0,226 - pxor %xmm0,%xmm4 - movdqa -16(%r10),%xmm0 -.byte 102,15,56,0,195 +.byte 102,15,56,0,203 pxor %xmm4,%xmm0 - addq $16,%r9 - -.byte 102,15,56,0,197 movdqa 0(%r10),%xmm4 -.byte 102,15,56,0,226 - pxor %xmm0,%xmm4 - movdqa 16(%r10),%xmm0 -.byte 102,15,56,0,195 - pxor %xmm4,%xmm0 - subq $1,%rax + pxor %xmm1,%xmm0 + movdqa 16(%r10),%xmm1 -.byte 102,15,56,0,197 - movdqa 32(%r10),%xmm4 .byte 102,15,56,0,226 - pxor %xmm0,%xmm4 - movdqa 48(%r10),%xmm0 -.byte 102,15,56,0,195 +.byte 102,15,56,0,197 +.byte 102,15,56,0,203 pxor %xmm4,%xmm0 + movdqa 32(%r10),%xmm4 + pxor %xmm1,%xmm0 + movdqa 48(%r10),%xmm1 +.byte 102,15,56,0,226 .byte 102,15,56,0,197 +.byte 102,15,56,0,203 + pxor %xmm4,%xmm0 movdqa 64(%r10),%xmm4 + pxor %xmm1,%xmm0 + movdqa 80(%r10),%xmm1 + .byte 102,15,56,0,226 - pxor %xmm0,%xmm4 - movdqa 80(%r10),%xmm0 -.byte 102,15,56,0,195 +.byte 102,15,56,0,197 +.byte 102,15,56,0,203 pxor %xmm4,%xmm0 - + addq $16,%r9 .byte 102,15,58,15,237,12 + pxor %xmm1,%xmm0 + subq $1,%rax .Ldec_entry: movdqa %xmm9,%xmm1 pandn %xmm0,%xmm1 + movdqa %xmm11,%xmm2 psrld $4,%xmm1 pand %xmm9,%xmm0 - movdqa %xmm11,%xmm2 .byte 102,15,56,0,208 - pxor %xmm1,%xmm0 movdqa %xmm10,%xmm3 + pxor %xmm1,%xmm0 .byte 102,15,56,0,217 - pxor %xmm2,%xmm3 movdqa %xmm10,%xmm4 + pxor %xmm2,%xmm3 .byte 102,15,56,0,224 pxor %xmm2,%xmm4 movdqa %xmm10,%xmm2 .byte 102,15,56,0,211 - pxor %xmm0,%xmm2 movdqa %xmm10,%xmm3 + pxor %xmm0,%xmm2 .byte 102,15,56,0,220 - pxor %xmm1,%xmm3 movdqu (%r9),%xmm0 + pxor %xmm1,%xmm3 jnz .Ldec_loop @@ -221,7 +220,7 @@ _vpaes_schedule_core: - call _vpaes_preheat + call _vpaes_preheat movdqa .Lk_rcon(%rip),%xmm8 movdqu (%rdi),%xmm0 @@ -267,7 +266,7 @@ _vpaes_schedule_core: call _vpaes_schedule_round decq %rsi jz .Lschedule_mangle_last - call _vpaes_schedule_mangle + call _vpaes_schedule_mangle jmp .Loop_schedule_128 @@ -288,7 +287,7 @@ _vpaes_schedule_core: .align 16 .Lschedule_192: movdqu 8(%rdi),%xmm0 - call _vpaes_schedule_transform + call _vpaes_schedule_transform movdqa %xmm0,%xmm6 pxor %xmm4,%xmm4 movhlps %xmm4,%xmm6 @@ -297,13 +296,13 @@ _vpaes_schedule_core: .Loop_schedule_192: call _vpaes_schedule_round .byte 102,15,58,15,198,8 - call _vpaes_schedule_mangle + call _vpaes_schedule_mangle call _vpaes_schedule_192_smear - call _vpaes_schedule_mangle + call _vpaes_schedule_mangle call _vpaes_schedule_round decq %rsi jz .Lschedule_mangle_last - call _vpaes_schedule_mangle + call _vpaes_schedule_mangle call _vpaes_schedule_192_smear jmp .Loop_schedule_192 @@ -320,18 +319,18 @@ _vpaes_schedule_core: .align 16 .Lschedule_256: movdqu 16(%rdi),%xmm0 - call _vpaes_schedule_transform + call _vpaes_schedule_transform movl $7,%esi .Loop_schedule_256: - call _vpaes_schedule_mangle + call _vpaes_schedule_mangle movdqa %xmm0,%xmm6 call _vpaes_schedule_round decq %rsi jz .Lschedule_mangle_last - call _vpaes_schedule_mangle + call _vpaes_schedule_mangle pshufd $255,%xmm0,%xmm0 @@ -369,7 +368,7 @@ _vpaes_schedule_core: .Lschedule_mangle_last_dec: addq $-16,%rdx pxor .Lk_s63(%rip),%xmm0 - call _vpaes_schedule_transform + call _vpaes_schedule_transform movdqu %xmm0,(%rdx) @@ -401,12 +400,12 @@ _vpaes_schedule_core: .type _vpaes_schedule_192_smear,@function .align 16 _vpaes_schedule_192_smear: - pshufd $128,%xmm6,%xmm0 - pxor %xmm0,%xmm6 + pshufd $128,%xmm6,%xmm1 pshufd $254,%xmm7,%xmm0 + pxor %xmm1,%xmm6 + pxor %xmm1,%xmm1 pxor %xmm0,%xmm6 movdqa %xmm6,%xmm0 - pxor %xmm1,%xmm1 movhlps %xmm1,%xmm6 .byte 0xf3,0xc3 .size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear diff --git a/secure/lib/libcrypto/amd64/wp-x86_64.S b/secure/lib/libcrypto/amd64/wp-x86_64.S index f6cf81c..36f5bc0 100644 --- a/secure/lib/libcrypto/amd64/wp-x86_64.S +++ b/secure/lib/libcrypto/amd64/wp-x86_64.S @@ -63,233 +63,236 @@ whirlpool_block: movq %r15,64+56(%rsp) xorq %rsi,%rsi movq %rsi,24(%rbx) + jmp .Lround .align 16 .Lround: movq 4096(%rbp,%rsi,8),%r8 movl 0(%rsp),%eax movl 4(%rsp),%ebx - movb %al,%cl - movb %ah,%dl + movzbl %al,%ecx + movzbl %ah,%edx + shrl $16,%eax leaq (%rcx,%rcx,1),%rsi + movzbl %al,%ecx leaq (%rdx,%rdx,1),%rdi - shrl $16,%eax + movzbl %ah,%edx xorq 0(%rbp,%rsi,8),%r8 movq 7(%rbp,%rdi,8),%r9 - movb %al,%cl - movb %ah,%dl movl 0+8(%rsp),%eax leaq (%rcx,%rcx,1),%rsi + movzbl %bl,%ecx leaq (%rdx,%rdx,1),%rdi + movzbl %bh,%edx movq 6(%rbp,%rsi,8),%r10 movq 5(%rbp,%rdi,8),%r11 - movb %bl,%cl - movb %bh,%dl + shrl $16,%ebx leaq (%rcx,%rcx,1),%rsi + movzbl %bl,%ecx leaq (%rdx,%rdx,1),%rdi - shrl $16,%ebx + movzbl %bh,%edx movq 4(%rbp,%rsi,8),%r12 movq 3(%rbp,%rdi,8),%r13 - movb %bl,%cl - movb %bh,%dl movl 0+8+4(%rsp),%ebx leaq (%rcx,%rcx,1),%rsi + movzbl %al,%ecx leaq (%rdx,%rdx,1),%rdi + movzbl %ah,%edx movq 2(%rbp,%rsi,8),%r14 movq 1(%rbp,%rdi,8),%r15 - movb %al,%cl - movb %ah,%dl + shrl $16,%eax leaq (%rcx,%rcx,1),%rsi + movzbl %al,%ecx leaq (%rdx,%rdx,1),%rdi - shrl $16,%eax + movzbl %ah,%edx xorq 0(%rbp,%rsi,8),%r9 xorq 7(%rbp,%rdi,8),%r10 - movb %al,%cl - movb %ah,%dl movl 8+8(%rsp),%eax leaq (%rcx,%rcx,1),%rsi + movzbl %bl,%ecx leaq (%rdx,%rdx,1),%rdi + movzbl %bh,%edx xorq 6(%rbp,%rsi,8),%r11 xorq 5(%rbp,%rdi,8),%r12 - movb %bl,%cl - movb %bh,%dl + shrl $16,%ebx leaq (%rcx,%rcx,1),%rsi + movzbl %bl,%ecx leaq (%rdx,%rdx,1),%rdi - shrl $16,%ebx + movzbl %bh,%edx xorq 4(%rbp,%rsi,8),%r13 xorq 3(%rbp,%rdi,8),%r14 - movb %bl,%cl - movb %bh,%dl movl 8+8+4(%rsp),%ebx leaq (%rcx,%rcx,1),%rsi + movzbl %al,%ecx leaq (%rdx,%rdx,1),%rdi + movzbl %ah,%edx xorq 2(%rbp,%rsi,8),%r15 xorq 1(%rbp,%rdi,8),%r8 - movb %al,%cl - movb %ah,%dl + shrl $16,%eax leaq (%rcx,%rcx,1),%rsi + movzbl %al,%ecx leaq (%rdx,%rdx,1),%rdi - shrl $16,%eax + movzbl %ah,%edx xorq 0(%rbp,%rsi,8),%r10 xorq 7(%rbp,%rdi,8),%r11 - movb %al,%cl - movb %ah,%dl movl 16+8(%rsp),%eax leaq (%rcx,%rcx,1),%rsi + movzbl %bl,%ecx leaq (%rdx,%rdx,1),%rdi + movzbl %bh,%edx xorq 6(%rbp,%rsi,8),%r12 xorq 5(%rbp,%rdi,8),%r13 - movb %bl,%cl - movb %bh,%dl + shrl $16,%ebx leaq (%rcx,%rcx,1),%rsi + movzbl %bl,%ecx leaq (%rdx,%rdx,1),%rdi - shrl $16,%ebx + movzbl %bh,%edx xorq 4(%rbp,%rsi,8),%r14 xorq 3(%rbp,%rdi,8),%r15 - movb %bl,%cl - movb %bh,%dl movl 16+8+4(%rsp),%ebx leaq (%rcx,%rcx,1),%rsi + movzbl %al,%ecx leaq (%rdx,%rdx,1),%rdi + movzbl %ah,%edx xorq 2(%rbp,%rsi,8),%r8 xorq 1(%rbp,%rdi,8),%r9 - movb %al,%cl - movb %ah,%dl + shrl $16,%eax leaq (%rcx,%rcx,1),%rsi + movzbl %al,%ecx leaq (%rdx,%rdx,1),%rdi - shrl $16,%eax + movzbl %ah,%edx xorq 0(%rbp,%rsi,8),%r11 xorq 7(%rbp,%rdi,8),%r12 - movb %al,%cl - movb %ah,%dl movl 24+8(%rsp),%eax leaq (%rcx,%rcx,1),%rsi + movzbl %bl,%ecx leaq (%rdx,%rdx,1),%rdi + movzbl %bh,%edx xorq 6(%rbp,%rsi,8),%r13 xorq 5(%rbp,%rdi,8),%r14 - movb %bl,%cl - movb %bh,%dl + shrl $16,%ebx leaq (%rcx,%rcx,1),%rsi + movzbl %bl,%ecx leaq (%rdx,%rdx,1),%rdi - shrl $16,%ebx + movzbl %bh,%edx xorq 4(%rbp,%rsi,8),%r15 xorq 3(%rbp,%rdi,8),%r8 - movb %bl,%cl - movb %bh,%dl movl 24+8+4(%rsp),%ebx leaq (%rcx,%rcx,1),%rsi + movzbl %al,%ecx leaq (%rdx,%rdx,1),%rdi + movzbl %ah,%edx xorq 2(%rbp,%rsi,8),%r9 xorq 1(%rbp,%rdi,8),%r10 - movb %al,%cl - movb %ah,%dl + shrl $16,%eax leaq (%rcx,%rcx,1),%rsi + movzbl %al,%ecx leaq (%rdx,%rdx,1),%rdi - shrl $16,%eax + movzbl %ah,%edx xorq 0(%rbp,%rsi,8),%r12 xorq 7(%rbp,%rdi,8),%r13 - movb %al,%cl - movb %ah,%dl movl 32+8(%rsp),%eax leaq (%rcx,%rcx,1),%rsi + movzbl %bl,%ecx leaq (%rdx,%rdx,1),%rdi + movzbl %bh,%edx xorq 6(%rbp,%rsi,8),%r14 xorq 5(%rbp,%rdi,8),%r15 - movb %bl,%cl - movb %bh,%dl + shrl $16,%ebx leaq (%rcx,%rcx,1),%rsi + movzbl %bl,%ecx leaq (%rdx,%rdx,1),%rdi - shrl $16,%ebx + movzbl %bh,%edx xorq 4(%rbp,%rsi,8),%r8 xorq 3(%rbp,%rdi,8),%r9 - movb %bl,%cl - movb %bh,%dl movl 32+8+4(%rsp),%ebx leaq (%rcx,%rcx,1),%rsi + movzbl %al,%ecx leaq (%rdx,%rdx,1),%rdi + movzbl %ah,%edx xorq 2(%rbp,%rsi,8),%r10 xorq 1(%rbp,%rdi,8),%r11 - movb %al,%cl - movb %ah,%dl + shrl $16,%eax leaq (%rcx,%rcx,1),%rsi + movzbl %al,%ecx leaq (%rdx,%rdx,1),%rdi - shrl $16,%eax + movzbl %ah,%edx xorq 0(%rbp,%rsi,8),%r13 xorq 7(%rbp,%rdi,8),%r14 - movb %al,%cl - movb %ah,%dl movl 40+8(%rsp),%eax leaq (%rcx,%rcx,1),%rsi + movzbl %bl,%ecx leaq (%rdx,%rdx,1),%rdi + movzbl %bh,%edx xorq 6(%rbp,%rsi,8),%r15 xorq 5(%rbp,%rdi,8),%r8 - movb %bl,%cl - movb %bh,%dl + shrl $16,%ebx leaq (%rcx,%rcx,1),%rsi + movzbl %bl,%ecx leaq (%rdx,%rdx,1),%rdi - shrl $16,%ebx + movzbl %bh,%edx xorq 4(%rbp,%rsi,8),%r9 xorq 3(%rbp,%rdi,8),%r10 - movb %bl,%cl - movb %bh,%dl movl 40+8+4(%rsp),%ebx leaq (%rcx,%rcx,1),%rsi + movzbl %al,%ecx leaq (%rdx,%rdx,1),%rdi + movzbl %ah,%edx xorq 2(%rbp,%rsi,8),%r11 xorq 1(%rbp,%rdi,8),%r12 - movb %al,%cl - movb %ah,%dl + shrl $16,%eax leaq (%rcx,%rcx,1),%rsi + movzbl %al,%ecx leaq (%rdx,%rdx,1),%rdi - shrl $16,%eax + movzbl %ah,%edx xorq 0(%rbp,%rsi,8),%r14 xorq 7(%rbp,%rdi,8),%r15 - movb %al,%cl - movb %ah,%dl movl 48+8(%rsp),%eax leaq (%rcx,%rcx,1),%rsi + movzbl %bl,%ecx leaq (%rdx,%rdx,1),%rdi + movzbl %bh,%edx xorq 6(%rbp,%rsi,8),%r8 xorq 5(%rbp,%rdi,8),%r9 - movb %bl,%cl - movb %bh,%dl + shrl $16,%ebx leaq (%rcx,%rcx,1),%rsi + movzbl %bl,%ecx leaq (%rdx,%rdx,1),%rdi - shrl $16,%ebx + movzbl %bh,%edx xorq 4(%rbp,%rsi,8),%r10 xorq 3(%rbp,%rdi,8),%r11 - movb %bl,%cl - movb %bh,%dl movl 48+8+4(%rsp),%ebx leaq (%rcx,%rcx,1),%rsi + movzbl %al,%ecx leaq (%rdx,%rdx,1),%rdi + movzbl %ah,%edx xorq 2(%rbp,%rsi,8),%r12 xorq 1(%rbp,%rdi,8),%r13 - movb %al,%cl - movb %ah,%dl + shrl $16,%eax leaq (%rcx,%rcx,1),%rsi + movzbl %al,%ecx leaq (%rdx,%rdx,1),%rdi - shrl $16,%eax + movzbl %ah,%edx xorq 0(%rbp,%rsi,8),%r15 xorq 7(%rbp,%rdi,8),%r8 - movb %al,%cl - movb %ah,%dl movl 56+8(%rsp),%eax leaq (%rcx,%rcx,1),%rsi + movzbl %bl,%ecx leaq (%rdx,%rdx,1),%rdi + movzbl %bh,%edx xorq 6(%rbp,%rsi,8),%r9 xorq 5(%rbp,%rdi,8),%r10 - movb %bl,%cl - movb %bh,%dl + shrl $16,%ebx leaq (%rcx,%rcx,1),%rsi + movzbl %bl,%ecx leaq (%rdx,%rdx,1),%rdi - shrl $16,%ebx + movzbl %bh,%edx xorq 4(%rbp,%rsi,8),%r11 xorq 3(%rbp,%rdi,8),%r12 - movb %bl,%cl - movb %bh,%dl movl 56+8+4(%rsp),%ebx leaq (%rcx,%rcx,1),%rsi + movzbl %al,%ecx leaq (%rdx,%rdx,1),%rdi + movzbl %ah,%edx xorq 2(%rbp,%rsi,8),%r13 xorq 1(%rbp,%rdi,8),%r14 movq %r8,0(%rsp) @@ -300,228 +303,228 @@ whirlpool_block: movq %r13,40(%rsp) movq %r14,48(%rsp) movq %r15,56(%rsp) - movb %al,%cl - movb %ah,%dl + shrl $16,%eax leaq (%rcx,%rcx,1),%rsi + movzbl %al,%ecx leaq (%rdx,%rdx,1),%rdi - shrl $16,%eax + movzbl %ah,%edx xorq 0(%rbp,%rsi,8),%r8 xorq 7(%rbp,%rdi,8),%r9 - movb %al,%cl - movb %ah,%dl movl 64+0+8(%rsp),%eax leaq (%rcx,%rcx,1),%rsi + movzbl %bl,%ecx leaq (%rdx,%rdx,1),%rdi + movzbl %bh,%edx xorq 6(%rbp,%rsi,8),%r10 xorq 5(%rbp,%rdi,8),%r11 - movb %bl,%cl - movb %bh,%dl + shrl $16,%ebx leaq (%rcx,%rcx,1),%rsi + movzbl %bl,%ecx leaq (%rdx,%rdx,1),%rdi - shrl $16,%ebx + movzbl %bh,%edx xorq 4(%rbp,%rsi,8),%r12 xorq 3(%rbp,%rdi,8),%r13 - movb %bl,%cl - movb %bh,%dl movl 64+0+8+4(%rsp),%ebx leaq (%rcx,%rcx,1),%rsi + movzbl %al,%ecx leaq (%rdx,%rdx,1),%rdi + movzbl %ah,%edx xorq 2(%rbp,%rsi,8),%r14 xorq 1(%rbp,%rdi,8),%r15 - movb %al,%cl - movb %ah,%dl + shrl $16,%eax leaq (%rcx,%rcx,1),%rsi + movzbl %al,%ecx leaq (%rdx,%rdx,1),%rdi - shrl $16,%eax + movzbl %ah,%edx xorq 0(%rbp,%rsi,8),%r9 xorq 7(%rbp,%rdi,8),%r10 - movb %al,%cl - movb %ah,%dl movl 64+8+8(%rsp),%eax leaq (%rcx,%rcx,1),%rsi + movzbl %bl,%ecx leaq (%rdx,%rdx,1),%rdi + movzbl %bh,%edx xorq 6(%rbp,%rsi,8),%r11 xorq 5(%rbp,%rdi,8),%r12 - movb %bl,%cl - movb %bh,%dl + shrl $16,%ebx leaq (%rcx,%rcx,1),%rsi + movzbl %bl,%ecx leaq (%rdx,%rdx,1),%rdi - shrl $16,%ebx + movzbl %bh,%edx xorq 4(%rbp,%rsi,8),%r13 xorq 3(%rbp,%rdi,8),%r14 - movb %bl,%cl - movb %bh,%dl movl 64+8+8+4(%rsp),%ebx leaq (%rcx,%rcx,1),%rsi + movzbl %al,%ecx leaq (%rdx,%rdx,1),%rdi + movzbl %ah,%edx xorq 2(%rbp,%rsi,8),%r15 xorq 1(%rbp,%rdi,8),%r8 - movb %al,%cl - movb %ah,%dl + shrl $16,%eax leaq (%rcx,%rcx,1),%rsi + movzbl %al,%ecx leaq (%rdx,%rdx,1),%rdi - shrl $16,%eax + movzbl %ah,%edx xorq 0(%rbp,%rsi,8),%r10 xorq 7(%rbp,%rdi,8),%r11 - movb %al,%cl - movb %ah,%dl movl 64+16+8(%rsp),%eax leaq (%rcx,%rcx,1),%rsi + movzbl %bl,%ecx leaq (%rdx,%rdx,1),%rdi + movzbl %bh,%edx xorq 6(%rbp,%rsi,8),%r12 xorq 5(%rbp,%rdi,8),%r13 - movb %bl,%cl - movb %bh,%dl + shrl $16,%ebx leaq (%rcx,%rcx,1),%rsi + movzbl %bl,%ecx leaq (%rdx,%rdx,1),%rdi - shrl $16,%ebx + movzbl %bh,%edx xorq 4(%rbp,%rsi,8),%r14 xorq 3(%rbp,%rdi,8),%r15 - movb %bl,%cl - movb %bh,%dl movl 64+16+8+4(%rsp),%ebx leaq (%rcx,%rcx,1),%rsi + movzbl %al,%ecx leaq (%rdx,%rdx,1),%rdi + movzbl %ah,%edx xorq 2(%rbp,%rsi,8),%r8 xorq 1(%rbp,%rdi,8),%r9 - movb %al,%cl - movb %ah,%dl + shrl $16,%eax leaq (%rcx,%rcx,1),%rsi + movzbl %al,%ecx leaq (%rdx,%rdx,1),%rdi - shrl $16,%eax + movzbl %ah,%edx xorq 0(%rbp,%rsi,8),%r11 xorq 7(%rbp,%rdi,8),%r12 - movb %al,%cl - movb %ah,%dl movl 64+24+8(%rsp),%eax leaq (%rcx,%rcx,1),%rsi + movzbl %bl,%ecx leaq (%rdx,%rdx,1),%rdi + movzbl %bh,%edx xorq 6(%rbp,%rsi,8),%r13 xorq 5(%rbp,%rdi,8),%r14 - movb %bl,%cl - movb %bh,%dl + shrl $16,%ebx leaq (%rcx,%rcx,1),%rsi + movzbl %bl,%ecx leaq (%rdx,%rdx,1),%rdi - shrl $16,%ebx + movzbl %bh,%edx xorq 4(%rbp,%rsi,8),%r15 xorq 3(%rbp,%rdi,8),%r8 - movb %bl,%cl - movb %bh,%dl movl 64+24+8+4(%rsp),%ebx leaq (%rcx,%rcx,1),%rsi + movzbl %al,%ecx leaq (%rdx,%rdx,1),%rdi + movzbl %ah,%edx xorq 2(%rbp,%rsi,8),%r9 xorq 1(%rbp,%rdi,8),%r10 - movb %al,%cl - movb %ah,%dl + shrl $16,%eax leaq (%rcx,%rcx,1),%rsi + movzbl %al,%ecx leaq (%rdx,%rdx,1),%rdi - shrl $16,%eax + movzbl %ah,%edx xorq 0(%rbp,%rsi,8),%r12 xorq 7(%rbp,%rdi,8),%r13 - movb %al,%cl - movb %ah,%dl movl 64+32+8(%rsp),%eax leaq (%rcx,%rcx,1),%rsi + movzbl %bl,%ecx leaq (%rdx,%rdx,1),%rdi + movzbl %bh,%edx xorq 6(%rbp,%rsi,8),%r14 xorq 5(%rbp,%rdi,8),%r15 - movb %bl,%cl - movb %bh,%dl + shrl $16,%ebx leaq (%rcx,%rcx,1),%rsi + movzbl %bl,%ecx leaq (%rdx,%rdx,1),%rdi - shrl $16,%ebx + movzbl %bh,%edx xorq 4(%rbp,%rsi,8),%r8 xorq 3(%rbp,%rdi,8),%r9 - movb %bl,%cl - movb %bh,%dl movl 64+32+8+4(%rsp),%ebx leaq (%rcx,%rcx,1),%rsi + movzbl %al,%ecx leaq (%rdx,%rdx,1),%rdi + movzbl %ah,%edx xorq 2(%rbp,%rsi,8),%r10 xorq 1(%rbp,%rdi,8),%r11 - movb %al,%cl - movb %ah,%dl + shrl $16,%eax leaq (%rcx,%rcx,1),%rsi + movzbl %al,%ecx leaq (%rdx,%rdx,1),%rdi - shrl $16,%eax + movzbl %ah,%edx xorq 0(%rbp,%rsi,8),%r13 xorq 7(%rbp,%rdi,8),%r14 - movb %al,%cl - movb %ah,%dl movl 64+40+8(%rsp),%eax leaq (%rcx,%rcx,1),%rsi + movzbl %bl,%ecx leaq (%rdx,%rdx,1),%rdi + movzbl %bh,%edx xorq 6(%rbp,%rsi,8),%r15 xorq 5(%rbp,%rdi,8),%r8 - movb %bl,%cl - movb %bh,%dl + shrl $16,%ebx leaq (%rcx,%rcx,1),%rsi + movzbl %bl,%ecx leaq (%rdx,%rdx,1),%rdi - shrl $16,%ebx + movzbl %bh,%edx xorq 4(%rbp,%rsi,8),%r9 xorq 3(%rbp,%rdi,8),%r10 - movb %bl,%cl - movb %bh,%dl movl 64+40+8+4(%rsp),%ebx leaq (%rcx,%rcx,1),%rsi + movzbl %al,%ecx leaq (%rdx,%rdx,1),%rdi + movzbl %ah,%edx xorq 2(%rbp,%rsi,8),%r11 xorq 1(%rbp,%rdi,8),%r12 - movb %al,%cl - movb %ah,%dl + shrl $16,%eax leaq (%rcx,%rcx,1),%rsi + movzbl %al,%ecx leaq (%rdx,%rdx,1),%rdi - shrl $16,%eax + movzbl %ah,%edx xorq 0(%rbp,%rsi,8),%r14 xorq 7(%rbp,%rdi,8),%r15 - movb %al,%cl - movb %ah,%dl movl 64+48+8(%rsp),%eax leaq (%rcx,%rcx,1),%rsi + movzbl %bl,%ecx leaq (%rdx,%rdx,1),%rdi + movzbl %bh,%edx xorq 6(%rbp,%rsi,8),%r8 xorq 5(%rbp,%rdi,8),%r9 - movb %bl,%cl - movb %bh,%dl + shrl $16,%ebx leaq (%rcx,%rcx,1),%rsi + movzbl %bl,%ecx leaq (%rdx,%rdx,1),%rdi - shrl $16,%ebx + movzbl %bh,%edx xorq 4(%rbp,%rsi,8),%r10 xorq 3(%rbp,%rdi,8),%r11 - movb %bl,%cl - movb %bh,%dl movl 64+48+8+4(%rsp),%ebx leaq (%rcx,%rcx,1),%rsi + movzbl %al,%ecx leaq (%rdx,%rdx,1),%rdi + movzbl %ah,%edx xorq 2(%rbp,%rsi,8),%r12 xorq 1(%rbp,%rdi,8),%r13 - movb %al,%cl - movb %ah,%dl + shrl $16,%eax leaq (%rcx,%rcx,1),%rsi + movzbl %al,%ecx leaq (%rdx,%rdx,1),%rdi - shrl $16,%eax + movzbl %ah,%edx xorq 0(%rbp,%rsi,8),%r15 xorq 7(%rbp,%rdi,8),%r8 - movb %al,%cl - movb %ah,%dl leaq (%rcx,%rcx,1),%rsi + movzbl %bl,%ecx leaq (%rdx,%rdx,1),%rdi + movzbl %bh,%edx xorq 6(%rbp,%rsi,8),%r9 xorq 5(%rbp,%rdi,8),%r10 - movb %bl,%cl - movb %bh,%dl + shrl $16,%ebx leaq (%rcx,%rcx,1),%rsi + movzbl %bl,%ecx leaq (%rdx,%rdx,1),%rdi - shrl $16,%ebx + movzbl %bh,%edx xorq 4(%rbp,%rsi,8),%r11 xorq 3(%rbp,%rdi,8),%r12 - movb %bl,%cl - movb %bh,%dl leaq (%rcx,%rcx,1),%rsi + movzbl %al,%ecx leaq (%rdx,%rdx,1),%rdi + movzbl %ah,%edx xorq 2(%rbp,%rsi,8),%r13 xorq 1(%rbp,%rdi,8),%r14 leaq 128(%rsp),%rbx diff --git a/secure/lib/libcrypto/amd64/x86_64-gf2m.S b/secure/lib/libcrypto/amd64/x86_64-gf2m.S index 7279c75..f86c253 100644 --- a/secure/lib/libcrypto/amd64/x86_64-gf2m.S +++ b/secure/lib/libcrypto/amd64/x86_64-gf2m.S @@ -246,13 +246,13 @@ bn_GF2m_mul_2x2: movq $15,%r8 movq %rsi,%rax movq %rcx,%rbp - call _mul_1x1 + call _mul_1x1 movq %rax,16(%rsp) movq %rdx,24(%rsp) movq 48(%rsp),%rax movq 64(%rsp),%rbp - call _mul_1x1 + call _mul_1x1 movq %rax,0(%rsp) movq %rdx,8(%rsp) @@ -260,7 +260,7 @@ bn_GF2m_mul_2x2: movq 56(%rsp),%rbp xorq 48(%rsp),%rax xorq 64(%rsp),%rbp - call _mul_1x1 + call _mul_1x1 movq 0(%rsp),%rbx movq 8(%rsp),%rcx movq 16(%rsp),%rdi diff --git a/secure/lib/libcrypto/amd64/x86_64-mont.S b/secure/lib/libcrypto/amd64/x86_64-mont.S index 5997f3c..bff0fb9 100644 --- a/secure/lib/libcrypto/amd64/x86_64-mont.S +++ b/secure/lib/libcrypto/amd64/x86_64-mont.S @@ -1,6 +1,8 @@ # $FreeBSD$ .text + + .globl bn_mul_mont .type bn_mul_mont,@function .align 16 @@ -11,7 +13,9 @@ bn_mul_mont: jb .Lmul_enter cmpq %rsi,%rdx jne .Lmul4x_enter - jmp .Lsqr4x_enter + testl $7,%r9d + jz .Lsqr8x_enter + jmp .Lmul4x_enter .align 16 .Lmul_enter: @@ -164,7 +168,7 @@ bn_mul_mont: leaq 1(%r14),%r14 cmpq %r9,%r14 - jl .Louter + jb .Louter xorq %r14,%r14 movq (%rsp),%rax @@ -330,7 +334,7 @@ bn_mul4x_mont: movq %rdi,-32(%rsp,%r15,8) movq %rdx,%r13 cmpq %r9,%r15 - jl .L1st4x + jb .L1st4x mulq %rbx addq %rax,%r10 @@ -478,7 +482,7 @@ bn_mul4x_mont: movq %rdi,-32(%rsp,%r15,8) movq %rdx,%r13 cmpq %r9,%r15 - jl .Linner4x + jb .Linner4x mulq %rbx addq %rax,%r10 @@ -524,7 +528,7 @@ bn_mul4x_mont: movq %rdi,(%rsp,%r15,8) cmpq %r9,%r14 - jl .Louter4x + jb .Louter4x movq 16(%rsp,%r9,8),%rdi movq 0(%rsp),%rax pxor %xmm0,%xmm0 @@ -606,10 +610,13 @@ bn_mul4x_mont: .Lmul4x_epilogue: .byte 0xf3,0xc3 .size bn_mul4x_mont,.-bn_mul4x_mont -.type bn_sqr4x_mont,@function -.align 16 -bn_sqr4x_mont: -.Lsqr4x_enter: + + +.type bn_sqr8x_mont,@function +.align 32 +bn_sqr8x_mont: +.Lsqr8x_enter: + movq %rsp,%rax pushq %rbx pushq %rbp pushq %r12 @@ -617,759 +624,102 @@ bn_sqr4x_mont: pushq %r14 pushq %r15 + movl %r9d,%r10d shll $3,%r9d - xorq %r10,%r10 - movq %rsp,%r11 - subq %r9,%r10 - movq (%r8),%r8 - leaq -72(%rsp,%r10,2),%rsp - andq $-1024,%rsp - - - - - - - - - - - - movq %rdi,32(%rsp) - movq %rcx,40(%rsp) - movq %r8,48(%rsp) - movq %r11,56(%rsp) -.Lsqr4x_body: - - - - - - - - leaq 32(%r10),%rbp - leaq (%rsi,%r9,1),%rsi - - movq %r9,%rcx - - - movq -32(%rsi,%rbp,1),%r14 - leaq 64(%rsp,%r9,2),%rdi - movq -24(%rsi,%rbp,1),%rax - leaq -32(%rdi,%rbp,1),%rdi - movq -16(%rsi,%rbp,1),%rbx - movq %rax,%r15 - - mulq %r14 - movq %rax,%r10 - movq %rbx,%rax - movq %rdx,%r11 - movq %r10,-24(%rdi,%rbp,1) - - xorq %r10,%r10 - mulq %r14 - addq %rax,%r11 - movq %rbx,%rax - adcq %rdx,%r10 - movq %r11,-16(%rdi,%rbp,1) - - leaq -16(%rbp),%rcx - - - movq 8(%rsi,%rcx,1),%rbx - mulq %r15 - movq %rax,%r12 - movq %rbx,%rax - movq %rdx,%r13 - - xorq %r11,%r11 - addq %r12,%r10 - leaq 16(%rcx),%rcx - adcq $0,%r11 - mulq %r14 - addq %rax,%r10 - movq %rbx,%rax - adcq %rdx,%r11 - movq %r10,-8(%rdi,%rcx,1) - jmp .Lsqr4x_1st - -.align 16 -.Lsqr4x_1st: - movq (%rsi,%rcx,1),%rbx - xorq %r12,%r12 - mulq %r15 - addq %rax,%r13 - movq %rbx,%rax - adcq %rdx,%r12 - - xorq %r10,%r10 - addq %r13,%r11 - adcq $0,%r10 - mulq %r14 - addq %rax,%r11 - movq %rbx,%rax - adcq %rdx,%r10 - movq %r11,(%rdi,%rcx,1) - - - movq 8(%rsi,%rcx,1),%rbx - xorq %r13,%r13 - mulq %r15 - addq %rax,%r12 - movq %rbx,%rax - adcq %rdx,%r13 - - xorq %r11,%r11 - addq %r12,%r10 - adcq $0,%r11 - mulq %r14 - addq %rax,%r10 - movq %rbx,%rax - adcq %rdx,%r11 - movq %r10,8(%rdi,%rcx,1) - - movq 16(%rsi,%rcx,1),%rbx - xorq %r12,%r12 - mulq %r15 - addq %rax,%r13 - movq %rbx,%rax - adcq %rdx,%r12 - - xorq %r10,%r10 - addq %r13,%r11 - adcq $0,%r10 - mulq %r14 - addq %rax,%r11 - movq %rbx,%rax - adcq %rdx,%r10 - movq %r11,16(%rdi,%rcx,1) - - - movq 24(%rsi,%rcx,1),%rbx - xorq %r13,%r13 - mulq %r15 - addq %rax,%r12 - movq %rbx,%rax - adcq %rdx,%r13 - - xorq %r11,%r11 - addq %r12,%r10 - leaq 32(%rcx),%rcx - adcq $0,%r11 - mulq %r14 - addq %rax,%r10 - movq %rbx,%rax - adcq %rdx,%r11 - movq %r10,-8(%rdi,%rcx,1) - - cmpq $0,%rcx - jne .Lsqr4x_1st - - xorq %r12,%r12 - addq %r11,%r13 - adcq $0,%r12 - mulq %r15 - addq %rax,%r13 - adcq %rdx,%r12 - - movq %r13,(%rdi) - leaq 16(%rbp),%rbp - movq %r12,8(%rdi) - jmp .Lsqr4x_outer - -.align 16 -.Lsqr4x_outer: - movq -32(%rsi,%rbp,1),%r14 - leaq 64(%rsp,%r9,2),%rdi - movq -24(%rsi,%rbp,1),%rax - leaq -32(%rdi,%rbp,1),%rdi - movq -16(%rsi,%rbp,1),%rbx - movq %rax,%r15 - - movq -24(%rdi,%rbp,1),%r10 - xorq %r11,%r11 - mulq %r14 - addq %rax,%r10 - movq %rbx,%rax - adcq %rdx,%r11 - movq %r10,-24(%rdi,%rbp,1) - - xorq %r10,%r10 - addq -16(%rdi,%rbp,1),%r11 - adcq $0,%r10 - mulq %r14 - addq %rax,%r11 - movq %rbx,%rax - adcq %rdx,%r10 - movq %r11,-16(%rdi,%rbp,1) - - leaq -16(%rbp),%rcx - xorq %r12,%r12 - - - movq 8(%rsi,%rcx,1),%rbx - xorq %r13,%r13 - addq 8(%rdi,%rcx,1),%r12 - adcq $0,%r13 - mulq %r15 - addq %rax,%r12 - movq %rbx,%rax - adcq %rdx,%r13 - - xorq %r11,%r11 - addq %r12,%r10 - adcq $0,%r11 - mulq %r14 - addq %rax,%r10 - movq %rbx,%rax - adcq %rdx,%r11 - movq %r10,8(%rdi,%rcx,1) - - leaq 16(%rcx),%rcx - jmp .Lsqr4x_inner - -.align 16 -.Lsqr4x_inner: - movq (%rsi,%rcx,1),%rbx - xorq %r12,%r12 - addq (%rdi,%rcx,1),%r13 - adcq $0,%r12 - mulq %r15 - addq %rax,%r13 - movq %rbx,%rax - adcq %rdx,%r12 - - xorq %r10,%r10 - addq %r13,%r11 - adcq $0,%r10 - mulq %r14 - addq %rax,%r11 - movq %rbx,%rax - adcq %rdx,%r10 - movq %r11,(%rdi,%rcx,1) - - movq 8(%rsi,%rcx,1),%rbx - xorq %r13,%r13 - addq 8(%rdi,%rcx,1),%r12 - adcq $0,%r13 - mulq %r15 - addq %rax,%r12 - movq %rbx,%rax - adcq %rdx,%r13 - - xorq %r11,%r11 - addq %r12,%r10 - leaq 16(%rcx),%rcx - adcq $0,%r11 - mulq %r14 - addq %rax,%r10 - movq %rbx,%rax - adcq %rdx,%r11 - movq %r10,-8(%rdi,%rcx,1) - - cmpq $0,%rcx - jne .Lsqr4x_inner - - xorq %r12,%r12 - addq %r11,%r13 - adcq $0,%r12 - mulq %r15 - addq %rax,%r13 - adcq %rdx,%r12 + shlq $3+2,%r10 + negq %r9 - movq %r13,(%rdi) - movq %r12,8(%rdi) - addq $16,%rbp - jnz .Lsqr4x_outer - movq -32(%rsi),%r14 - leaq 64(%rsp,%r9,2),%rdi - movq -24(%rsi),%rax - leaq -32(%rdi,%rbp,1),%rdi - movq -16(%rsi),%rbx - movq %rax,%r15 - - xorq %r11,%r11 - mulq %r14 - addq %rax,%r10 - movq %rbx,%rax - adcq %rdx,%r11 - movq %r10,-24(%rdi) - - xorq %r10,%r10 - addq %r13,%r11 - adcq $0,%r10 - mulq %r14 - addq %rax,%r11 - movq %rbx,%rax - adcq %rdx,%r10 - movq %r11,-16(%rdi) - - movq -8(%rsi),%rbx - mulq %r15 - addq %rax,%r12 - movq %rbx,%rax - adcq $0,%rdx - - xorq %r11,%r11 - addq %r12,%r10 - movq %rdx,%r13 - adcq $0,%r11 - mulq %r14 - addq %rax,%r10 - movq %rbx,%rax - adcq %rdx,%r11 - movq %r10,-8(%rdi) - xorq %r12,%r12 - addq %r11,%r13 - adcq $0,%r12 - mulq %r15 - addq %rax,%r13 - movq -16(%rsi),%rax - adcq %rdx,%r12 - - movq %r13,(%rdi) - movq %r12,8(%rdi) - - mulq %rbx - addq $16,%rbp - xorq %r14,%r14 - subq %r9,%rbp - xorq %r15,%r15 - - addq %r12,%rax - adcq $0,%rdx - movq %rax,8(%rdi) - movq %rdx,16(%rdi) - movq %r15,24(%rdi) - - movq -16(%rsi,%rbp,1),%rax - leaq 64(%rsp,%r9,2),%rdi - xorq %r10,%r10 - movq -24(%rdi,%rbp,2),%r11 - - leaq (%r14,%r10,2),%r12 - shrq $63,%r10 - leaq (%rcx,%r11,2),%r13 - shrq $63,%r11 - orq %r10,%r13 - movq -16(%rdi,%rbp,2),%r10 - movq %r11,%r14 - mulq %rax - negq %r15 - movq -8(%rdi,%rbp,2),%r11 - adcq %rax,%r12 - movq -8(%rsi,%rbp,1),%rax - movq %r12,-32(%rdi,%rbp,2) - adcq %rdx,%r13 - - leaq (%r14,%r10,2),%rbx - movq %r13,-24(%rdi,%rbp,2) - sbbq %r15,%r15 - shrq $63,%r10 - leaq (%rcx,%r11,2),%r8 - shrq $63,%r11 - orq %r10,%r8 - movq 0(%rdi,%rbp,2),%r10 - movq %r11,%r14 - mulq %rax - negq %r15 - movq 8(%rdi,%rbp,2),%r11 - adcq %rax,%rbx - movq 0(%rsi,%rbp,1),%rax - movq %rbx,-16(%rdi,%rbp,2) - adcq %rdx,%r8 - leaq 16(%rbp),%rbp - movq %r8,-40(%rdi,%rbp,2) - sbbq %r15,%r15 - jmp .Lsqr4x_shift_n_add - -.align 16 -.Lsqr4x_shift_n_add: - leaq (%r14,%r10,2),%r12 - shrq $63,%r10 - leaq (%rcx,%r11,2),%r13 - shrq $63,%r11 - orq %r10,%r13 - movq -16(%rdi,%rbp,2),%r10 - movq %r11,%r14 - mulq %rax - negq %r15 - movq -8(%rdi,%rbp,2),%r11 - adcq %rax,%r12 - movq -8(%rsi,%rbp,1),%rax - movq %r12,-32(%rdi,%rbp,2) - adcq %rdx,%r13 - - leaq (%r14,%r10,2),%rbx - movq %r13,-24(%rdi,%rbp,2) - sbbq %r15,%r15 - shrq $63,%r10 - leaq (%rcx,%r11,2),%r8 - shrq $63,%r11 - orq %r10,%r8 - movq 0(%rdi,%rbp,2),%r10 - movq %r11,%r14 - mulq %rax - negq %r15 - movq 8(%rdi,%rbp,2),%r11 - adcq %rax,%rbx - movq 0(%rsi,%rbp,1),%rax - movq %rbx,-16(%rdi,%rbp,2) - adcq %rdx,%r8 - - leaq (%r14,%r10,2),%r12 - movq %r8,-8(%rdi,%rbp,2) - sbbq %r15,%r15 - shrq $63,%r10 - leaq (%rcx,%r11,2),%r13 - shrq $63,%r11 - orq %r10,%r13 - movq 16(%rdi,%rbp,2),%r10 - movq %r11,%r14 - mulq %rax - negq %r15 - movq 24(%rdi,%rbp,2),%r11 - adcq %rax,%r12 - movq 8(%rsi,%rbp,1),%rax - movq %r12,0(%rdi,%rbp,2) - adcq %rdx,%r13 - - leaq (%r14,%r10,2),%rbx - movq %r13,8(%rdi,%rbp,2) - sbbq %r15,%r15 - shrq $63,%r10 - leaq (%rcx,%r11,2),%r8 - shrq $63,%r11 - orq %r10,%r8 - movq 32(%rdi,%rbp,2),%r10 - movq %r11,%r14 - mulq %rax - negq %r15 - movq 40(%rdi,%rbp,2),%r11 - adcq %rax,%rbx - movq 16(%rsi,%rbp,1),%rax - movq %rbx,16(%rdi,%rbp,2) - adcq %rdx,%r8 - movq %r8,24(%rdi,%rbp,2) - sbbq %r15,%r15 - addq $32,%rbp - jnz .Lsqr4x_shift_n_add - - leaq (%r14,%r10,2),%r12 - shrq $63,%r10 - leaq (%rcx,%r11,2),%r13 - shrq $63,%r11 - orq %r10,%r13 - movq -16(%rdi),%r10 - movq %r11,%r14 - mulq %rax - negq %r15 - movq -8(%rdi),%r11 - adcq %rax,%r12 - movq -8(%rsi),%rax - movq %r12,-32(%rdi) - adcq %rdx,%r13 - - leaq (%r14,%r10,2),%rbx - movq %r13,-24(%rdi) - sbbq %r15,%r15 - shrq $63,%r10 - leaq (%rcx,%r11,2),%r8 - shrq $63,%r11 - orq %r10,%r8 - mulq %rax - negq %r15 - adcq %rax,%rbx - adcq %rdx,%r8 - movq %rbx,-16(%rdi) - movq %r8,-8(%rdi) - movq 40(%rsp),%rsi - movq 48(%rsp),%r8 - xorq %rcx,%rcx - movq %r9,0(%rsp) - subq %r9,%rcx - movq 64(%rsp),%r10 - movq %r8,%r14 - leaq 64(%rsp,%r9,2),%rax - leaq 64(%rsp,%r9,1),%rdi - movq %rax,8(%rsp) - leaq (%rsi,%r9,1),%rsi - xorq %rbp,%rbp - - movq 0(%rsi,%rcx,1),%rax - movq 8(%rsi,%rcx,1),%r9 - imulq %r10,%r14 - movq %rax,%rbx - jmp .Lsqr4x_mont_outer - -.align 16 -.Lsqr4x_mont_outer: - xorq %r11,%r11 - mulq %r14 - addq %rax,%r10 - movq %r9,%rax - adcq %rdx,%r11 - movq %r8,%r15 - - xorq %r10,%r10 - addq 8(%rdi,%rcx,1),%r11 - adcq $0,%r10 - mulq %r14 - addq %rax,%r11 - movq %rbx,%rax - adcq %rdx,%r10 - - imulq %r11,%r15 - - movq 16(%rsi,%rcx,1),%rbx - xorq %r13,%r13 - addq %r11,%r12 - adcq $0,%r13 - mulq %r15 - addq %rax,%r12 - movq %rbx,%rax - adcq %rdx,%r13 - movq %r12,8(%rdi,%rcx,1) - - xorq %r11,%r11 - addq 16(%rdi,%rcx,1),%r10 - adcq $0,%r11 - mulq %r14 - addq %rax,%r10 - movq %r9,%rax - adcq %rdx,%r11 - - movq 24(%rsi,%rcx,1),%r9 - xorq %r12,%r12 - addq %r10,%r13 - adcq $0,%r12 - mulq %r15 - addq %rax,%r13 - movq %r9,%rax - adcq %rdx,%r12 - movq %r13,16(%rdi,%rcx,1) - xorq %r10,%r10 - addq 24(%rdi,%rcx,1),%r11 + leaq -64(%rsp,%r9,4),%r11 + movq (%r8),%r8 + subq %rsi,%r11 + andq $4095,%r11 + cmpq %r11,%r10 + jb .Lsqr8x_sp_alt + subq %r11,%rsp + leaq -64(%rsp,%r9,4),%rsp + jmp .Lsqr8x_sp_done + +.align 32 +.Lsqr8x_sp_alt: + leaq 4096-64(,%r9,4),%r10 + leaq -64(%rsp,%r9,4),%rsp + subq %r10,%r11 + movq $0,%r10 + cmovcq %r10,%r11 + subq %r11,%rsp +.Lsqr8x_sp_done: + andq $-64,%rsp + movq %r9,%r10 + negq %r9 + + leaq 64(%rsp,%r9,2),%r11 + movq %r8,32(%rsp) + movq %rax,40(%rsp) +.Lsqr8x_body: + + movq %r9,%rbp +.byte 102,73,15,110,211 + shrq $3+2,%rbp + movl OPENSSL_ia32cap_P+8(%rip),%eax + jmp .Lsqr8x_copy_n + +.align 32 +.Lsqr8x_copy_n: + movq 0(%rcx),%xmm0 + movq 8(%rcx),%xmm1 + movq 16(%rcx),%xmm3 + movq 24(%rcx),%xmm4 leaq 32(%rcx),%rcx - adcq $0,%r10 - mulq %r14 - addq %rax,%r11 - movq %rbx,%rax - adcq %rdx,%r10 - jmp .Lsqr4x_mont_inner - -.align 16 -.Lsqr4x_mont_inner: - movq (%rsi,%rcx,1),%rbx - xorq %r13,%r13 - addq %r11,%r12 - adcq $0,%r13 - mulq %r15 - addq %rax,%r12 - movq %rbx,%rax - adcq %rdx,%r13 - movq %r12,-8(%rdi,%rcx,1) - - xorq %r11,%r11 - addq (%rdi,%rcx,1),%r10 - adcq $0,%r11 - mulq %r14 - addq %rax,%r10 - movq %r9,%rax - adcq %rdx,%r11 - - movq 8(%rsi,%rcx,1),%r9 - xorq %r12,%r12 - addq %r10,%r13 - adcq $0,%r12 - mulq %r15 - addq %rax,%r13 - movq %r9,%rax - adcq %rdx,%r12 - movq %r13,(%rdi,%rcx,1) - - xorq %r10,%r10 - addq 8(%rdi,%rcx,1),%r11 - adcq $0,%r10 - mulq %r14 - addq %rax,%r11 - movq %rbx,%rax - adcq %rdx,%r10 - - - movq 16(%rsi,%rcx,1),%rbx - xorq %r13,%r13 - addq %r11,%r12 - adcq $0,%r13 - mulq %r15 - addq %rax,%r12 - movq %rbx,%rax - adcq %rdx,%r13 - movq %r12,8(%rdi,%rcx,1) - - xorq %r11,%r11 - addq 16(%rdi,%rcx,1),%r10 - adcq $0,%r11 - mulq %r14 - addq %rax,%r10 - movq %r9,%rax - adcq %rdx,%r11 - - movq 24(%rsi,%rcx,1),%r9 - xorq %r12,%r12 - addq %r10,%r13 - adcq $0,%r12 - mulq %r15 - addq %rax,%r13 - movq %r9,%rax - adcq %rdx,%r12 - movq %r13,16(%rdi,%rcx,1) + movdqa %xmm0,0(%r11) + movdqa %xmm1,16(%r11) + movdqa %xmm3,32(%r11) + movdqa %xmm4,48(%r11) + leaq 64(%r11),%r11 + decq %rbp + jnz .Lsqr8x_copy_n - xorq %r10,%r10 - addq 24(%rdi,%rcx,1),%r11 - leaq 32(%rcx),%rcx - adcq $0,%r10 - mulq %r14 - addq %rax,%r11 - movq %rbx,%rax - adcq %rdx,%r10 - cmpq $0,%rcx - jne .Lsqr4x_mont_inner - - subq 0(%rsp),%rcx - movq %r8,%r14 - - xorq %r13,%r13 - addq %r11,%r12 - adcq $0,%r13 - mulq %r15 - addq %rax,%r12 - movq %r9,%rax - adcq %rdx,%r13 - movq %r12,-8(%rdi) - - xorq %r11,%r11 - addq (%rdi),%r10 - adcq $0,%r11 - movq 0(%rsi,%rcx,1),%rbx - addq %rbp,%r10 - adcq $0,%r11 + pxor %xmm0,%xmm0 +.byte 102,72,15,110,207 +.byte 102,73,15,110,218 + call bn_sqr8x_internal - imulq 16(%rdi,%rcx,1),%r14 - xorq %r12,%r12 - movq 8(%rsi,%rcx,1),%r9 - addq %r10,%r13 - movq 16(%rdi,%rcx,1),%r10 - adcq $0,%r12 - mulq %r15 - addq %rax,%r13 - movq %rbx,%rax - adcq %rdx,%r12 - movq %r13,(%rdi) - - xorq %rbp,%rbp - addq 8(%rdi),%r12 - adcq %rbp,%rbp - addq %r11,%r12 - leaq 16(%rdi),%rdi - adcq $0,%rbp - movq %r12,-8(%rdi) - cmpq 8(%rsp),%rdi - jb .Lsqr4x_mont_outer - - movq 0(%rsp),%r9 - movq %rbp,(%rdi) - movq 64(%rsp,%r9,1),%rax - leaq 64(%rsp,%r9,1),%rbx + pxor %xmm0,%xmm0 + leaq 48(%rsp),%rax + leaq 64(%rsp,%r9,2),%rdx + shrq $3+2,%r9 movq 40(%rsp),%rsi - shrq $5,%r9 - movq 8(%rbx),%rdx - xorq %rbp,%rbp - - movq 32(%rsp),%rdi - subq 0(%rsi),%rax - movq 16(%rbx),%r10 - movq 24(%rbx),%r11 - sbbq 8(%rsi),%rdx - leaq -1(%r9),%rcx - jmp .Lsqr4x_sub -.align 16 -.Lsqr4x_sub: - movq %rax,0(%rdi,%rbp,8) - movq %rdx,8(%rdi,%rbp,8) - sbbq 16(%rsi,%rbp,8),%r10 - movq 32(%rbx,%rbp,8),%rax - movq 40(%rbx,%rbp,8),%rdx - sbbq 24(%rsi,%rbp,8),%r11 - movq %r10,16(%rdi,%rbp,8) - movq %r11,24(%rdi,%rbp,8) - sbbq 32(%rsi,%rbp,8),%rax - movq 48(%rbx,%rbp,8),%r10 - movq 56(%rbx,%rbp,8),%r11 - sbbq 40(%rsi,%rbp,8),%rdx - leaq 4(%rbp),%rbp - decq %rcx - jnz .Lsqr4x_sub - - movq %rax,0(%rdi,%rbp,8) - movq 32(%rbx,%rbp,8),%rax - sbbq 16(%rsi,%rbp,8),%r10 - movq %rdx,8(%rdi,%rbp,8) - sbbq 24(%rsi,%rbp,8),%r11 - movq %r10,16(%rdi,%rbp,8) + jmp .Lsqr8x_zero + +.align 32 +.Lsqr8x_zero: + movdqa %xmm0,0(%rax) + movdqa %xmm0,16(%rax) + movdqa %xmm0,32(%rax) + movdqa %xmm0,48(%rax) + leaq 64(%rax),%rax + movdqa %xmm0,0(%rdx) + movdqa %xmm0,16(%rdx) + movdqa %xmm0,32(%rdx) + movdqa %xmm0,48(%rdx) + leaq 64(%rdx),%rdx + decq %r9 + jnz .Lsqr8x_zero - sbbq $0,%rax - movq %r11,24(%rdi,%rbp,8) - xorq %rbp,%rbp - andq %rax,%rbx - notq %rax - movq %rdi,%rsi - andq %rax,%rsi - leaq -1(%r9),%rcx - orq %rsi,%rbx - - pxor %xmm0,%xmm0 - leaq 64(%rsp,%r9,8),%rsi - movdqu (%rbx),%xmm1 - leaq (%rsi,%r9,8),%rsi - movdqa %xmm0,64(%rsp) - movdqa %xmm0,(%rsi) - movdqu %xmm1,(%rdi) - jmp .Lsqr4x_copy -.align 16 -.Lsqr4x_copy: - movdqu 16(%rbx,%rbp,1),%xmm2 - movdqu 32(%rbx,%rbp,1),%xmm1 - movdqa %xmm0,80(%rsp,%rbp,1) - movdqa %xmm0,96(%rsp,%rbp,1) - movdqa %xmm0,16(%rsi,%rbp,1) - movdqa %xmm0,32(%rsi,%rbp,1) - movdqu %xmm2,16(%rdi,%rbp,1) - movdqu %xmm1,32(%rdi,%rbp,1) - leaq 32(%rbp),%rbp - decq %rcx - jnz .Lsqr4x_copy - - movdqu 16(%rbx,%rbp,1),%xmm2 - movdqa %xmm0,80(%rsp,%rbp,1) - movdqa %xmm0,16(%rsi,%rbp,1) - movdqu %xmm2,16(%rdi,%rbp,1) - movq 56(%rsp),%rsi movq $1,%rax - movq 0(%rsi),%r15 - movq 8(%rsi),%r14 - movq 16(%rsi),%r13 - movq 24(%rsi),%r12 - movq 32(%rsi),%rbp - movq 40(%rsi),%rbx - leaq 48(%rsi),%rsp -.Lsqr4x_epilogue: + movq -48(%rsi),%r15 + movq -40(%rsi),%r14 + movq -32(%rsi),%r13 + movq -24(%rsi),%r12 + movq -16(%rsi),%rbp + movq -8(%rsi),%rbx + leaq (%rsi),%rsp +.Lsqr8x_epilogue: .byte 0xf3,0xc3 -.size bn_sqr4x_mont,.-bn_sqr4x_mont +.size bn_sqr8x_mont,.-bn_sqr8x_mont .byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 16 diff --git a/secure/lib/libcrypto/amd64/x86_64-mont5.S b/secure/lib/libcrypto/amd64/x86_64-mont5.S index b0b3442..6ab0922 100644 --- a/secure/lib/libcrypto/amd64/x86_64-mont5.S +++ b/secure/lib/libcrypto/amd64/x86_64-mont5.S @@ -1,19 +1,20 @@ # $FreeBSD$ .text + + .globl bn_mul_mont_gather5 .type bn_mul_mont_gather5,@function .align 64 bn_mul_mont_gather5: - testl $3,%r9d + testl $7,%r9d jnz .Lmul_enter - cmpl $8,%r9d - jb .Lmul_enter jmp .Lmul4x_enter .align 16 .Lmul_enter: movl %r9d,%r9d + movq %rsp,%rax movl 8(%rsp),%r10d pushq %rbx pushq %rbp @@ -21,7 +22,6 @@ bn_mul_mont_gather5: pushq %r13 pushq %r14 pushq %r15 - movq %rsp,%rax leaq 2(%r9),%r11 negq %r11 leaq (%rsp,%r11,8),%rsp @@ -222,7 +222,7 @@ bn_mul_mont_gather5: leaq 1(%r14),%r14 cmpq %r9,%r14 - jl .Louter + jb .Louter xorq %r14,%r14 movq (%rsp),%rax @@ -256,477 +256,1526 @@ bn_mul_mont_gather5: movq 8(%rsp,%r9,8),%rsi movq $1,%rax - movq (%rsi),%r15 - movq 8(%rsi),%r14 - movq 16(%rsi),%r13 - movq 24(%rsi),%r12 - movq 32(%rsi),%rbp - movq 40(%rsi),%rbx - leaq 48(%rsi),%rsp + movq -48(%rsi),%r15 + movq -40(%rsi),%r14 + movq -32(%rsi),%r13 + movq -24(%rsi),%r12 + movq -16(%rsi),%rbp + movq -8(%rsi),%rbx + leaq (%rsi),%rsp .Lmul_epilogue: .byte 0xf3,0xc3 .size bn_mul_mont_gather5,.-bn_mul_mont_gather5 .type bn_mul4x_mont_gather5,@function -.align 16 +.align 32 bn_mul4x_mont_gather5: .Lmul4x_enter: - movl %r9d,%r9d - movl 8(%rsp),%r10d +.byte 0x67 + movq %rsp,%rax pushq %rbx pushq %rbp pushq %r12 pushq %r13 pushq %r14 pushq %r15 - movq %rsp,%rax - leaq 4(%r9),%r11 - negq %r11 - leaq (%rsp,%r11,8),%rsp - andq $-1024,%rsp +.byte 0x67 + movl %r9d,%r10d + shll $3,%r9d + shll $3+2,%r10d + negq %r9 - movq %rax,8(%rsp,%r9,8) + + + + + + + + leaq -64(%rsp,%r9,2),%r11 + subq %rsi,%r11 + andq $4095,%r11 + cmpq %r11,%r10 + jb .Lmul4xsp_alt + subq %r11,%rsp + leaq -64(%rsp,%r9,2),%rsp + jmp .Lmul4xsp_done + +.align 32 +.Lmul4xsp_alt: + leaq 4096-64(,%r9,2),%r10 + leaq -64(%rsp,%r9,2),%rsp + subq %r10,%r11 + movq $0,%r10 + cmovcq %r10,%r11 + subq %r11,%rsp +.Lmul4xsp_done: + andq $-64,%rsp + negq %r9 + + movq %rax,40(%rsp) .Lmul4x_body: - movq %rdi,16(%rsp,%r9,8) - movq %rdx,%r12 + + call mul4x_internal + + movq 40(%rsp),%rsi + movq $1,%rax + movq -48(%rsi),%r15 + movq -40(%rsi),%r14 + movq -32(%rsi),%r13 + movq -24(%rsi),%r12 + movq -16(%rsi),%rbp + movq -8(%rsi),%rbx + leaq (%rsi),%rsp +.Lmul4x_epilogue: + .byte 0xf3,0xc3 +.size bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5 + +.type mul4x_internal,@function +.align 32 +mul4x_internal: + shlq $5,%r9 + movl 8(%rax),%r10d + leaq 256(%rdx,%r9,1),%r13 + shrq $5,%r9 movq %r10,%r11 shrq $3,%r10 andq $7,%r11 notq %r10 leaq .Lmagic_masks(%rip),%rax andq $3,%r10 - leaq 96(%r12,%r11,8),%r12 + leaq 96(%rdx,%r11,8),%r12 movq 0(%rax,%r10,8),%xmm4 movq 8(%rax,%r10,8),%xmm5 + addq $7,%r11 movq 16(%rax,%r10,8),%xmm6 movq 24(%rax,%r10,8),%xmm7 + andq $7,%r11 movq -96(%r12),%xmm0 + leaq 256(%r12),%r14 movq -32(%r12),%xmm1 pand %xmm4,%xmm0 movq 32(%r12),%xmm2 pand %xmm5,%xmm1 movq 96(%r12),%xmm3 pand %xmm6,%xmm2 +.byte 0x67 por %xmm1,%xmm0 + movq -96(%r14),%xmm1 +.byte 0x67 pand %xmm7,%xmm3 +.byte 0x67 por %xmm2,%xmm0 - leaq 256(%r12),%r12 + movq -32(%r14),%xmm2 +.byte 0x67 + pand %xmm4,%xmm1 +.byte 0x67 por %xmm3,%xmm0 + movq 32(%r14),%xmm3 .byte 102,72,15,126,195 + movq 96(%r14),%xmm0 + movq %r13,16+8(%rsp) + movq %rdi,56+8(%rsp) + movq (%r8),%r8 movq (%rsi),%rax - - xorq %r14,%r14 - xorq %r15,%r15 - - movq -96(%r12),%xmm0 - movq -32(%r12),%xmm1 - pand %xmm4,%xmm0 - movq 32(%r12),%xmm2 - pand %xmm5,%xmm1 + leaq (%rsi,%r9,1),%rsi + negq %r9 movq %r8,%rbp mulq %rbx movq %rax,%r10 movq (%rcx),%rax - movq 96(%r12),%xmm3 - pand %xmm6,%xmm2 - por %xmm1,%xmm0 - pand %xmm7,%xmm3 + pand %xmm5,%xmm2 + pand %xmm6,%xmm3 + por %xmm2,%xmm1 imulq %r10,%rbp + + + + + + + + leaq 64+8(%rsp,%r11,8),%r14 movq %rdx,%r11 - por %xmm2,%xmm0 - leaq 256(%r12),%r12 - por %xmm3,%xmm0 + pand %xmm7,%xmm0 + por %xmm3,%xmm1 + leaq 512(%r12),%r12 + por %xmm1,%xmm0 mulq %rbp addq %rax,%r10 - movq 8(%rsi),%rax + movq 8(%rsi,%r9,1),%rax adcq $0,%rdx movq %rdx,%rdi mulq %rbx addq %rax,%r11 - movq 8(%rcx),%rax + movq 16(%rcx),%rax adcq $0,%rdx movq %rdx,%r10 mulq %rbp addq %rax,%rdi - movq 16(%rsi),%rax + movq 16(%rsi,%r9,1),%rax adcq $0,%rdx addq %r11,%rdi - leaq 4(%r15),%r15 + leaq 32(%r9),%r15 + leaq 64(%rcx),%rcx adcq $0,%rdx - movq %rdi,(%rsp) + movq %rdi,(%r14) movq %rdx,%r13 jmp .L1st4x -.align 16 + +.align 32 .L1st4x: mulq %rbx addq %rax,%r10 - movq -16(%rcx,%r15,8),%rax + movq -32(%rcx),%rax + leaq 32(%r14),%r14 adcq $0,%rdx movq %rdx,%r11 mulq %rbp addq %rax,%r13 - movq -8(%rsi,%r15,8),%rax + movq -8(%rsi,%r15,1),%rax adcq $0,%rdx addq %r10,%r13 adcq $0,%rdx - movq %r13,-24(%rsp,%r15,8) + movq %r13,-24(%r14) movq %rdx,%rdi mulq %rbx addq %rax,%r11 - movq -8(%rcx,%r15,8),%rax + movq -16(%rcx),%rax adcq $0,%rdx movq %rdx,%r10 mulq %rbp addq %rax,%rdi - movq (%rsi,%r15,8),%rax + movq (%rsi,%r15,1),%rax adcq $0,%rdx addq %r11,%rdi adcq $0,%rdx - movq %rdi,-16(%rsp,%r15,8) + movq %rdi,-16(%r14) movq %rdx,%r13 mulq %rbx addq %rax,%r10 - movq (%rcx,%r15,8),%rax + movq 0(%rcx),%rax adcq $0,%rdx movq %rdx,%r11 mulq %rbp addq %rax,%r13 - movq 8(%rsi,%r15,8),%rax + movq 8(%rsi,%r15,1),%rax adcq $0,%rdx addq %r10,%r13 adcq $0,%rdx - movq %r13,-8(%rsp,%r15,8) + movq %r13,-8(%r14) movq %rdx,%rdi mulq %rbx addq %rax,%r11 - movq 8(%rcx,%r15,8),%rax + movq 16(%rcx),%rax adcq $0,%rdx - leaq 4(%r15),%r15 movq %rdx,%r10 mulq %rbp addq %rax,%rdi - movq -16(%rsi,%r15,8),%rax + movq 16(%rsi,%r15,1),%rax adcq $0,%rdx addq %r11,%rdi + leaq 64(%rcx),%rcx adcq $0,%rdx - movq %rdi,-32(%rsp,%r15,8) + movq %rdi,(%r14) movq %rdx,%r13 - cmpq %r9,%r15 - jl .L1st4x + + addq $32,%r15 + jnz .L1st4x mulq %rbx addq %rax,%r10 - movq -16(%rcx,%r15,8),%rax + movq -32(%rcx),%rax + leaq 32(%r14),%r14 adcq $0,%rdx movq %rdx,%r11 mulq %rbp addq %rax,%r13 - movq -8(%rsi,%r15,8),%rax + movq -8(%rsi),%rax adcq $0,%rdx addq %r10,%r13 adcq $0,%rdx - movq %r13,-24(%rsp,%r15,8) + movq %r13,-24(%r14) movq %rdx,%rdi mulq %rbx addq %rax,%r11 - movq -8(%rcx,%r15,8),%rax + movq -16(%rcx),%rax adcq $0,%rdx movq %rdx,%r10 mulq %rbp addq %rax,%rdi - movq (%rsi),%rax + movq (%rsi,%r9,1),%rax adcq $0,%rdx addq %r11,%rdi adcq $0,%rdx - movq %rdi,-16(%rsp,%r15,8) + movq %rdi,-16(%r14) movq %rdx,%r13 .byte 102,72,15,126,195 + leaq (%rcx,%r9,2),%rcx xorq %rdi,%rdi addq %r10,%r13 adcq $0,%rdi - movq %r13,-8(%rsp,%r15,8) - movq %rdi,(%rsp,%r15,8) + movq %r13,-8(%r14) - leaq 1(%r14),%r14 -.align 4 -.Louter4x: - xorq %r15,%r15 - movq -96(%r12),%xmm0 - movq -32(%r12),%xmm1 - pand %xmm4,%xmm0 - movq 32(%r12),%xmm2 - pand %xmm5,%xmm1 + jmp .Louter4x - movq (%rsp),%r10 +.align 32 +.Louter4x: + movq (%r14,%r9,1),%r10 movq %r8,%rbp mulq %rbx addq %rax,%r10 movq (%rcx),%rax adcq $0,%rdx + movq -96(%r12),%xmm0 + movq -32(%r12),%xmm1 + pand %xmm4,%xmm0 + movq 32(%r12),%xmm2 + pand %xmm5,%xmm1 movq 96(%r12),%xmm3 - pand %xmm6,%xmm2 - por %xmm1,%xmm0 - pand %xmm7,%xmm3 imulq %r10,%rbp +.byte 0x67 movq %rdx,%r11 + movq %rdi,(%r14) + pand %xmm6,%xmm2 + por %xmm1,%xmm0 + pand %xmm7,%xmm3 por %xmm2,%xmm0 + leaq (%r14,%r9,1),%r14 leaq 256(%r12),%r12 por %xmm3,%xmm0 mulq %rbp addq %rax,%r10 - movq 8(%rsi),%rax + movq 8(%rsi,%r9,1),%rax adcq $0,%rdx movq %rdx,%rdi mulq %rbx addq %rax,%r11 - movq 8(%rcx),%rax + movq 16(%rcx),%rax adcq $0,%rdx - addq 8(%rsp),%r11 + addq 8(%r14),%r11 adcq $0,%rdx movq %rdx,%r10 mulq %rbp addq %rax,%rdi - movq 16(%rsi),%rax + movq 16(%rsi,%r9,1),%rax adcq $0,%rdx addq %r11,%rdi - leaq 4(%r15),%r15 + leaq 32(%r9),%r15 + leaq 64(%rcx),%rcx adcq $0,%rdx movq %rdx,%r13 jmp .Linner4x -.align 16 + +.align 32 .Linner4x: mulq %rbx addq %rax,%r10 - movq -16(%rcx,%r15,8),%rax + movq -32(%rcx),%rax adcq $0,%rdx - addq -16(%rsp,%r15,8),%r10 + addq 16(%r14),%r10 + leaq 32(%r14),%r14 adcq $0,%rdx movq %rdx,%r11 mulq %rbp addq %rax,%r13 - movq -8(%rsi,%r15,8),%rax + movq -8(%rsi,%r15,1),%rax adcq $0,%rdx addq %r10,%r13 adcq $0,%rdx - movq %rdi,-32(%rsp,%r15,8) + movq %rdi,-32(%r14) movq %rdx,%rdi mulq %rbx addq %rax,%r11 - movq -8(%rcx,%r15,8),%rax + movq -16(%rcx),%rax adcq $0,%rdx - addq -8(%rsp,%r15,8),%r11 + addq -8(%r14),%r11 adcq $0,%rdx movq %rdx,%r10 mulq %rbp addq %rax,%rdi - movq (%rsi,%r15,8),%rax + movq (%rsi,%r15,1),%rax adcq $0,%rdx addq %r11,%rdi adcq $0,%rdx - movq %r13,-24(%rsp,%r15,8) + movq %r13,-24(%r14) movq %rdx,%r13 mulq %rbx addq %rax,%r10 - movq (%rcx,%r15,8),%rax + movq 0(%rcx),%rax adcq $0,%rdx - addq (%rsp,%r15,8),%r10 + addq (%r14),%r10 adcq $0,%rdx movq %rdx,%r11 mulq %rbp addq %rax,%r13 - movq 8(%rsi,%r15,8),%rax + movq 8(%rsi,%r15,1),%rax adcq $0,%rdx addq %r10,%r13 adcq $0,%rdx - movq %rdi,-16(%rsp,%r15,8) + movq %rdi,-16(%r14) movq %rdx,%rdi mulq %rbx addq %rax,%r11 - movq 8(%rcx,%r15,8),%rax + movq 16(%rcx),%rax adcq $0,%rdx - addq 8(%rsp,%r15,8),%r11 + addq 8(%r14),%r11 adcq $0,%rdx - leaq 4(%r15),%r15 movq %rdx,%r10 mulq %rbp addq %rax,%rdi - movq -16(%rsi,%r15,8),%rax + movq 16(%rsi,%r15,1),%rax adcq $0,%rdx addq %r11,%rdi + leaq 64(%rcx),%rcx adcq $0,%rdx - movq %r13,-40(%rsp,%r15,8) + movq %r13,-8(%r14) movq %rdx,%r13 - cmpq %r9,%r15 - jl .Linner4x + + addq $32,%r15 + jnz .Linner4x mulq %rbx addq %rax,%r10 - movq -16(%rcx,%r15,8),%rax + movq -32(%rcx),%rax adcq $0,%rdx - addq -16(%rsp,%r15,8),%r10 + addq 16(%r14),%r10 + leaq 32(%r14),%r14 adcq $0,%rdx movq %rdx,%r11 mulq %rbp addq %rax,%r13 - movq -8(%rsi,%r15,8),%rax + movq -8(%rsi),%rax adcq $0,%rdx addq %r10,%r13 adcq $0,%rdx - movq %rdi,-32(%rsp,%r15,8) + movq %rdi,-32(%r14) movq %rdx,%rdi mulq %rbx addq %rax,%r11 - movq -8(%rcx,%r15,8),%rax + movq %rbp,%rax + movq -16(%rcx),%rbp adcq $0,%rdx - addq -8(%rsp,%r15,8),%r11 + addq -8(%r14),%r11 adcq $0,%rdx - leaq 1(%r14),%r14 movq %rdx,%r10 mulq %rbp addq %rax,%rdi - movq (%rsi),%rax + movq (%rsi,%r9,1),%rax adcq $0,%rdx addq %r11,%rdi adcq $0,%rdx - movq %r13,-24(%rsp,%r15,8) + movq %r13,-24(%r14) movq %rdx,%r13 .byte 102,72,15,126,195 - movq %rdi,-16(%rsp,%r15,8) + movq %rdi,-16(%r14) + leaq (%rcx,%r9,2),%rcx xorq %rdi,%rdi addq %r10,%r13 adcq $0,%rdi - addq (%rsp,%r9,8),%r13 + addq (%r14),%r13 adcq $0,%rdi - movq %r13,-8(%rsp,%r15,8) - movq %rdi,(%rsp,%r15,8) + movq %r13,-8(%r14) + + cmpq 16+8(%rsp),%r12 + jb .Louter4x + subq %r13,%rbp + adcq %r15,%r15 + orq %r15,%rdi + xorq $1,%rdi + leaq (%r14,%r9,1),%rbx + leaq (%rcx,%rdi,8),%rbp + movq %r9,%rcx + sarq $3+2,%rcx + movq 56+8(%rsp),%rdi + jmp .Lsqr4x_sub +.size mul4x_internal,.-mul4x_internal +.globl bn_power5 +.type bn_power5,@function +.align 32 +bn_power5: + movq %rsp,%rax + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + movl %r9d,%r10d + shll $3,%r9d + shll $3+2,%r10d + negq %r9 + movq (%r8),%r8 - cmpq %r9,%r14 - jl .Louter4x - movq 16(%rsp,%r9,8),%rdi - movq 0(%rsp),%rax - pxor %xmm0,%xmm0 - movq 8(%rsp),%rdx - shrq $2,%r9 - leaq (%rsp),%rsi - xorq %r14,%r14 - subq 0(%rcx),%rax - movq 16(%rsi),%rbx - movq 24(%rsi),%rbp - sbbq 8(%rcx),%rdx - leaq -1(%r9),%r15 - jmp .Lsub4x -.align 16 -.Lsub4x: - movq %rax,0(%rdi,%r14,8) - movq %rdx,8(%rdi,%r14,8) - sbbq 16(%rcx,%r14,8),%rbx - movq 32(%rsi,%r14,8),%rax - movq 40(%rsi,%r14,8),%rdx - sbbq 24(%rcx,%r14,8),%rbp - movq %rbx,16(%rdi,%r14,8) - movq %rbp,24(%rdi,%r14,8) - sbbq 32(%rcx,%r14,8),%rax - movq 48(%rsi,%r14,8),%rbx - movq 56(%rsi,%r14,8),%rbp - sbbq 40(%rcx,%r14,8),%rdx - leaq 4(%r14),%r14 - decq %r15 - jnz .Lsub4x - movq %rax,0(%rdi,%r14,8) - movq 32(%rsi,%r14,8),%rax - sbbq 16(%rcx,%r14,8),%rbx - movq %rdx,8(%rdi,%r14,8) - sbbq 24(%rcx,%r14,8),%rbp - movq %rbx,16(%rdi,%r14,8) - sbbq $0,%rax - movq %rbp,24(%rdi,%r14,8) + + + + leaq -64(%rsp,%r9,2),%r11 + subq %rsi,%r11 + andq $4095,%r11 + cmpq %r11,%r10 + jb .Lpwr_sp_alt + subq %r11,%rsp + leaq -64(%rsp,%r9,2),%rsp + jmp .Lpwr_sp_done + +.align 32 +.Lpwr_sp_alt: + leaq 4096-64(,%r9,2),%r10 + leaq -64(%rsp,%r9,2),%rsp + subq %r10,%r11 + movq $0,%r10 + cmovcq %r10,%r11 + subq %r11,%rsp +.Lpwr_sp_done: + andq $-64,%rsp + movq %r9,%r10 + negq %r9 + + + + + + + + + + + movq %r8,32(%rsp) + movq %rax,40(%rsp) +.Lpower5_body: +.byte 102,72,15,110,207 +.byte 102,72,15,110,209 +.byte 102,73,15,110,218 +.byte 102,72,15,110,226 + + call __bn_sqr8x_internal + call __bn_sqr8x_internal + call __bn_sqr8x_internal + call __bn_sqr8x_internal + call __bn_sqr8x_internal + +.byte 102,72,15,126,209 +.byte 102,72,15,126,226 + movq %rsi,%rdi + movq 40(%rsp),%rax + leaq 32(%rsp),%r8 + + call mul4x_internal + + movq 40(%rsp),%rsi + movq $1,%rax + movq -48(%rsi),%r15 + movq -40(%rsi),%r14 + movq -32(%rsi),%r13 + movq -24(%rsi),%r12 + movq -16(%rsi),%rbp + movq -8(%rsi),%rbx + leaq (%rsi),%rsp +.Lpower5_epilogue: + .byte 0xf3,0xc3 +.size bn_power5,.-bn_power5 + +.globl bn_sqr8x_internal +.hidden bn_sqr8x_internal +.type bn_sqr8x_internal,@function +.align 32 +bn_sqr8x_internal: +__bn_sqr8x_internal: + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + leaq 32(%r10),%rbp + leaq (%rsi,%r9,1),%rsi + + movq %r9,%rcx + + + movq -32(%rsi,%rbp,1),%r14 + leaq 48+8(%rsp,%r9,2),%rdi + movq -24(%rsi,%rbp,1),%rax + leaq -32(%rdi,%rbp,1),%rdi + movq -16(%rsi,%rbp,1),%rbx + movq %rax,%r15 + + mulq %r14 + movq %rax,%r10 + movq %rbx,%rax + movq %rdx,%r11 + movq %r10,-24(%rdi,%rbp,1) + + mulq %r14 + addq %rax,%r11 + movq %rbx,%rax + adcq $0,%rdx + movq %r11,-16(%rdi,%rbp,1) + movq %rdx,%r10 + + + movq -8(%rsi,%rbp,1),%rbx + mulq %r15 + movq %rax,%r12 + movq %rbx,%rax + movq %rdx,%r13 + + leaq (%rbp),%rcx + mulq %r14 + addq %rax,%r10 + movq %rbx,%rax + movq %rdx,%r11 + adcq $0,%r11 + addq %r12,%r10 + adcq $0,%r11 + movq %r10,-8(%rdi,%rcx,1) + jmp .Lsqr4x_1st + +.align 32 +.Lsqr4x_1st: + movq (%rsi,%rcx,1),%rbx + mulq %r15 + addq %rax,%r13 + movq %rbx,%rax + movq %rdx,%r12 + adcq $0,%r12 + + mulq %r14 + addq %rax,%r11 + movq %rbx,%rax + movq 8(%rsi,%rcx,1),%rbx + movq %rdx,%r10 + adcq $0,%r10 + addq %r13,%r11 + adcq $0,%r10 + + + mulq %r15 + addq %rax,%r12 + movq %rbx,%rax + movq %r11,(%rdi,%rcx,1) + movq %rdx,%r13 + adcq $0,%r13 + + mulq %r14 + addq %rax,%r10 + movq %rbx,%rax + movq 16(%rsi,%rcx,1),%rbx + movq %rdx,%r11 + adcq $0,%r11 + addq %r12,%r10 + adcq $0,%r11 + + mulq %r15 + addq %rax,%r13 + movq %rbx,%rax + movq %r10,8(%rdi,%rcx,1) + movq %rdx,%r12 + adcq $0,%r12 + + mulq %r14 + addq %rax,%r11 + movq %rbx,%rax + movq 24(%rsi,%rcx,1),%rbx + movq %rdx,%r10 + adcq $0,%r10 + addq %r13,%r11 + adcq $0,%r10 + + + mulq %r15 + addq %rax,%r12 + movq %rbx,%rax + movq %r11,16(%rdi,%rcx,1) + movq %rdx,%r13 + adcq $0,%r13 + leaq 32(%rcx),%rcx + + mulq %r14 + addq %rax,%r10 + movq %rbx,%rax + movq %rdx,%r11 + adcq $0,%r11 + addq %r12,%r10 + adcq $0,%r11 + movq %r10,-8(%rdi,%rcx,1) + + cmpq $0,%rcx + jne .Lsqr4x_1st + + mulq %r15 + addq %rax,%r13 + leaq 16(%rbp),%rbp + adcq $0,%rdx + addq %r11,%r13 + adcq $0,%rdx + + movq %r13,(%rdi) + movq %rdx,%r12 + movq %rdx,8(%rdi) + jmp .Lsqr4x_outer + +.align 32 +.Lsqr4x_outer: + movq -32(%rsi,%rbp,1),%r14 + leaq 48+8(%rsp,%r9,2),%rdi + movq -24(%rsi,%rbp,1),%rax + leaq -32(%rdi,%rbp,1),%rdi + movq -16(%rsi,%rbp,1),%rbx + movq %rax,%r15 + + mulq %r14 + movq -24(%rdi,%rbp,1),%r10 + addq %rax,%r10 + movq %rbx,%rax + adcq $0,%rdx + movq %r10,-24(%rdi,%rbp,1) + movq %rdx,%r11 + + mulq %r14 + addq %rax,%r11 + movq %rbx,%rax + adcq $0,%rdx + addq -16(%rdi,%rbp,1),%r11 + movq %rdx,%r10 + adcq $0,%r10 + movq %r11,-16(%rdi,%rbp,1) + + xorq %r12,%r12 + + movq -8(%rsi,%rbp,1),%rbx + mulq %r15 + addq %rax,%r12 + movq %rbx,%rax + adcq $0,%rdx + addq -8(%rdi,%rbp,1),%r12 + movq %rdx,%r13 + adcq $0,%r13 + + mulq %r14 + addq %rax,%r10 + movq %rbx,%rax + adcq $0,%rdx + addq %r12,%r10 + movq %rdx,%r11 + adcq $0,%r11 + movq %r10,-8(%rdi,%rbp,1) + + leaq (%rbp),%rcx + jmp .Lsqr4x_inner + +.align 32 +.Lsqr4x_inner: + movq (%rsi,%rcx,1),%rbx + mulq %r15 + addq %rax,%r13 + movq %rbx,%rax + movq %rdx,%r12 + adcq $0,%r12 + addq (%rdi,%rcx,1),%r13 + adcq $0,%r12 + +.byte 0x67 + mulq %r14 + addq %rax,%r11 + movq %rbx,%rax + movq 8(%rsi,%rcx,1),%rbx + movq %rdx,%r10 + adcq $0,%r10 + addq %r13,%r11 + adcq $0,%r10 + + mulq %r15 + addq %rax,%r12 + movq %r11,(%rdi,%rcx,1) + movq %rbx,%rax + movq %rdx,%r13 + adcq $0,%r13 + addq 8(%rdi,%rcx,1),%r12 + leaq 16(%rcx),%rcx + adcq $0,%r13 + + mulq %r14 + addq %rax,%r10 + movq %rbx,%rax + adcq $0,%rdx + addq %r12,%r10 + movq %rdx,%r11 + adcq $0,%r11 + movq %r10,-8(%rdi,%rcx,1) + + cmpq $0,%rcx + jne .Lsqr4x_inner + +.byte 0x67 + mulq %r15 + addq %rax,%r13 + adcq $0,%rdx + addq %r11,%r13 + adcq $0,%rdx + + movq %r13,(%rdi) + movq %rdx,%r12 + movq %rdx,8(%rdi) + + addq $16,%rbp + jnz .Lsqr4x_outer + + + movq -32(%rsi),%r14 + leaq 48+8(%rsp,%r9,2),%rdi + movq -24(%rsi),%rax + leaq -32(%rdi,%rbp,1),%rdi + movq -16(%rsi),%rbx + movq %rax,%r15 + + mulq %r14 + addq %rax,%r10 + movq %rbx,%rax + movq %rdx,%r11 + adcq $0,%r11 + + mulq %r14 + addq %rax,%r11 + movq %rbx,%rax + movq %r10,-24(%rdi) + movq %rdx,%r10 + adcq $0,%r10 + addq %r13,%r11 + movq -8(%rsi),%rbx + adcq $0,%r10 + + mulq %r15 + addq %rax,%r12 + movq %rbx,%rax + movq %r11,-16(%rdi) + movq %rdx,%r13 + adcq $0,%r13 + + mulq %r14 + addq %rax,%r10 + movq %rbx,%rax + movq %rdx,%r11 + adcq $0,%r11 + addq %r12,%r10 + adcq $0,%r11 + movq %r10,-8(%rdi) + + mulq %r15 + addq %rax,%r13 + movq -16(%rsi),%rax + adcq $0,%rdx + addq %r11,%r13 + adcq $0,%rdx + + movq %r13,(%rdi) + movq %rdx,%r12 + movq %rdx,8(%rdi) + + mulq %rbx + addq $16,%rbp xorq %r14,%r14 - andq %rax,%rsi - notq %rax - movq %rdi,%rcx - andq %rax,%rcx - leaq -1(%r9),%r15 - orq %rcx,%rsi + subq %r9,%rbp + xorq %r15,%r15 + + addq %r12,%rax + adcq $0,%rdx + movq %rax,8(%rdi) + movq %rdx,16(%rdi) + movq %r15,24(%rdi) + + movq -16(%rsi,%rbp,1),%rax + leaq 48+8(%rsp),%rdi + xorq %r10,%r10 + movq 8(%rdi),%r11 + + leaq (%r14,%r10,2),%r12 + shrq $63,%r10 + leaq (%rcx,%r11,2),%r13 + shrq $63,%r11 + orq %r10,%r13 + movq 16(%rdi),%r10 + movq %r11,%r14 + mulq %rax + negq %r15 + movq 24(%rdi),%r11 + adcq %rax,%r12 + movq -8(%rsi,%rbp,1),%rax + movq %r12,(%rdi) + adcq %rdx,%r13 + + leaq (%r14,%r10,2),%rbx + movq %r13,8(%rdi) + sbbq %r15,%r15 + shrq $63,%r10 + leaq (%rcx,%r11,2),%r8 + shrq $63,%r11 + orq %r10,%r8 + movq 32(%rdi),%r10 + movq %r11,%r14 + mulq %rax + negq %r15 + movq 40(%rdi),%r11 + adcq %rax,%rbx + movq 0(%rsi,%rbp,1),%rax + movq %rbx,16(%rdi) + adcq %rdx,%r8 + leaq 16(%rbp),%rbp + movq %r8,24(%rdi) + sbbq %r15,%r15 + leaq 64(%rdi),%rdi + jmp .Lsqr4x_shift_n_add + +.align 32 +.Lsqr4x_shift_n_add: + leaq (%r14,%r10,2),%r12 + shrq $63,%r10 + leaq (%rcx,%r11,2),%r13 + shrq $63,%r11 + orq %r10,%r13 + movq -16(%rdi),%r10 + movq %r11,%r14 + mulq %rax + negq %r15 + movq -8(%rdi),%r11 + adcq %rax,%r12 + movq -8(%rsi,%rbp,1),%rax + movq %r12,-32(%rdi) + adcq %rdx,%r13 + + leaq (%r14,%r10,2),%rbx + movq %r13,-24(%rdi) + sbbq %r15,%r15 + shrq $63,%r10 + leaq (%rcx,%r11,2),%r8 + shrq $63,%r11 + orq %r10,%r8 + movq 0(%rdi),%r10 + movq %r11,%r14 + mulq %rax + negq %r15 + movq 8(%rdi),%r11 + adcq %rax,%rbx + movq 0(%rsi,%rbp,1),%rax + movq %rbx,-16(%rdi) + adcq %rdx,%r8 + + leaq (%r14,%r10,2),%r12 + movq %r8,-8(%rdi) + sbbq %r15,%r15 + shrq $63,%r10 + leaq (%rcx,%r11,2),%r13 + shrq $63,%r11 + orq %r10,%r13 + movq 16(%rdi),%r10 + movq %r11,%r14 + mulq %rax + negq %r15 + movq 24(%rdi),%r11 + adcq %rax,%r12 + movq 8(%rsi,%rbp,1),%rax + movq %r12,0(%rdi) + adcq %rdx,%r13 + + leaq (%r14,%r10,2),%rbx + movq %r13,8(%rdi) + sbbq %r15,%r15 + shrq $63,%r10 + leaq (%rcx,%r11,2),%r8 + shrq $63,%r11 + orq %r10,%r8 + movq 32(%rdi),%r10 + movq %r11,%r14 + mulq %rax + negq %r15 + movq 40(%rdi),%r11 + adcq %rax,%rbx + movq 16(%rsi,%rbp,1),%rax + movq %rbx,16(%rdi) + adcq %rdx,%r8 + movq %r8,24(%rdi) + sbbq %r15,%r15 + leaq 64(%rdi),%rdi + addq $32,%rbp + jnz .Lsqr4x_shift_n_add + + leaq (%r14,%r10,2),%r12 +.byte 0x67 + shrq $63,%r10 + leaq (%rcx,%r11,2),%r13 + shrq $63,%r11 + orq %r10,%r13 + movq -16(%rdi),%r10 + movq %r11,%r14 + mulq %rax + negq %r15 + movq -8(%rdi),%r11 + adcq %rax,%r12 + movq -8(%rsi),%rax + movq %r12,-32(%rdi) + adcq %rdx,%r13 + + leaq (%r14,%r10,2),%rbx + movq %r13,-24(%rdi) + sbbq %r15,%r15 + shrq $63,%r10 + leaq (%rcx,%r11,2),%r8 + shrq $63,%r11 + orq %r10,%r8 + mulq %rax + negq %r15 + adcq %rax,%rbx + adcq %rdx,%r8 + movq %rbx,-16(%rdi) + movq %r8,-8(%rdi) +.byte 102,72,15,126,213 +sqr8x_reduction: + xorq %rax,%rax + leaq (%rbp,%r9,2),%rcx + leaq 48+8(%rsp,%r9,2),%rdx + movq %rcx,0+8(%rsp) + leaq 48+8(%rsp,%r9,1),%rdi + movq %rdx,8+8(%rsp) + negq %r9 + jmp .L8x_reduction_loop + +.align 32 +.L8x_reduction_loop: + leaq (%rdi,%r9,1),%rdi +.byte 0x66 + movq 0(%rdi),%rbx + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + movq 32(%rdi),%r12 + movq 40(%rdi),%r13 + movq 48(%rdi),%r14 + movq 56(%rdi),%r15 + movq %rax,(%rdx) + leaq 64(%rdi),%rdi + +.byte 0x67 + movq %rbx,%r8 + imulq 32+8(%rsp),%rbx + movq 0(%rbp),%rax + movl $8,%ecx + jmp .L8x_reduce + +.align 32 +.L8x_reduce: + mulq %rbx + movq 16(%rbp),%rax + negq %r8 + movq %rdx,%r8 + adcq $0,%r8 + + mulq %rbx + addq %rax,%r9 + movq 32(%rbp),%rax + adcq $0,%rdx + addq %r9,%r8 + movq %rbx,48-8+8(%rsp,%rcx,8) + movq %rdx,%r9 + adcq $0,%r9 + + mulq %rbx + addq %rax,%r10 + movq 48(%rbp),%rax + adcq $0,%rdx + addq %r10,%r9 + movq 32+8(%rsp),%rsi + movq %rdx,%r10 + adcq $0,%r10 + + mulq %rbx + addq %rax,%r11 + movq 64(%rbp),%rax + adcq $0,%rdx + imulq %r8,%rsi + addq %r11,%r10 + movq %rdx,%r11 + adcq $0,%r11 + mulq %rbx + addq %rax,%r12 + movq 80(%rbp),%rax + adcq $0,%rdx + addq %r12,%r11 + movq %rdx,%r12 + adcq $0,%r12 + + mulq %rbx + addq %rax,%r13 + movq 96(%rbp),%rax + adcq $0,%rdx + addq %r13,%r12 + movq %rdx,%r13 + adcq $0,%r13 + + mulq %rbx + addq %rax,%r14 + movq 112(%rbp),%rax + adcq $0,%rdx + addq %r14,%r13 + movq %rdx,%r14 + adcq $0,%r14 + + mulq %rbx + movq %rsi,%rbx + addq %rax,%r15 + movq 0(%rbp),%rax + adcq $0,%rdx + addq %r15,%r14 + movq %rdx,%r15 + adcq $0,%r15 + + decl %ecx + jnz .L8x_reduce + + leaq 128(%rbp),%rbp + xorq %rax,%rax + movq 8+8(%rsp),%rdx + cmpq 0+8(%rsp),%rbp + jae .L8x_no_tail + +.byte 0x66 + addq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + adcq 48(%rdi),%r14 + adcq 56(%rdi),%r15 + sbbq %rsi,%rsi + + movq 48+56+8(%rsp),%rbx + movl $8,%ecx + movq 0(%rbp),%rax + jmp .L8x_tail + +.align 32 +.L8x_tail: + mulq %rbx + addq %rax,%r8 + movq 16(%rbp),%rax + movq %r8,(%rdi) + movq %rdx,%r8 + adcq $0,%r8 + + mulq %rbx + addq %rax,%r9 + movq 32(%rbp),%rax + adcq $0,%rdx + addq %r9,%r8 + leaq 8(%rdi),%rdi + movq %rdx,%r9 + adcq $0,%r9 + + mulq %rbx + addq %rax,%r10 + movq 48(%rbp),%rax + adcq $0,%rdx + addq %r10,%r9 + movq %rdx,%r10 + adcq $0,%r10 + + mulq %rbx + addq %rax,%r11 + movq 64(%rbp),%rax + adcq $0,%rdx + addq %r11,%r10 + movq %rdx,%r11 + adcq $0,%r11 + + mulq %rbx + addq %rax,%r12 + movq 80(%rbp),%rax + adcq $0,%rdx + addq %r12,%r11 + movq %rdx,%r12 + adcq $0,%r12 + + mulq %rbx + addq %rax,%r13 + movq 96(%rbp),%rax + adcq $0,%rdx + addq %r13,%r12 + movq %rdx,%r13 + adcq $0,%r13 + + mulq %rbx + addq %rax,%r14 + movq 112(%rbp),%rax + adcq $0,%rdx + addq %r14,%r13 + movq %rdx,%r14 + adcq $0,%r14 + + mulq %rbx + movq 48-16+8(%rsp,%rcx,8),%rbx + addq %rax,%r15 + adcq $0,%rdx + addq %r15,%r14 + movq 0(%rbp),%rax + movq %rdx,%r15 + adcq $0,%r15 + + decl %ecx + jnz .L8x_tail + + leaq 128(%rbp),%rbp + movq 8+8(%rsp),%rdx + cmpq 0+8(%rsp),%rbp + jae .L8x_tail_done + + movq 48+56+8(%rsp),%rbx + negq %rsi + movq 0(%rbp),%rax + adcq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + adcq 48(%rdi),%r14 + adcq 56(%rdi),%r15 + sbbq %rsi,%rsi + + movl $8,%ecx + jmp .L8x_tail + +.align 32 +.L8x_tail_done: + addq (%rdx),%r8 + xorq %rax,%rax + + negq %rsi +.L8x_no_tail: + adcq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + adcq 48(%rdi),%r14 + adcq 56(%rdi),%r15 + adcq $0,%rax + movq -16(%rbp),%rcx + xorq %rsi,%rsi + +.byte 102,72,15,126,213 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) +.byte 102,73,15,126,217 + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq %r14,48(%rdi) + movq %r15,56(%rdi) + leaq 64(%rdi),%rdi + + cmpq %rdx,%rdi + jb .L8x_reduction_loop + + subq %r15,%rcx + leaq (%rdi,%r9,1),%rbx + adcq %rsi,%rsi + movq %r9,%rcx + orq %rsi,%rax +.byte 102,72,15,126,207 + xorq $1,%rax +.byte 102,72,15,126,206 + leaq (%rbp,%rax,8),%rbp + sarq $3+2,%rcx + jmp .Lsqr4x_sub + +.align 32 +.Lsqr4x_sub: +.byte 0x66 + movq 0(%rbx),%r12 + movq 8(%rbx),%r13 + sbbq 0(%rbp),%r12 + movq 16(%rbx),%r14 + sbbq 16(%rbp),%r13 + movq 24(%rbx),%r15 + leaq 32(%rbx),%rbx + sbbq 32(%rbp),%r14 + movq %r12,0(%rdi) + sbbq 48(%rbp),%r15 + leaq 64(%rbp),%rbp + movq %r13,8(%rdi) + movq %r14,16(%rdi) + movq %r15,24(%rdi) + leaq 32(%rdi),%rdi + + incq %rcx + jnz .Lsqr4x_sub + movq %r9,%r10 + negq %r9 + .byte 0xf3,0xc3 +.size bn_sqr8x_internal,.-bn_sqr8x_internal +.globl bn_from_montgomery +.type bn_from_montgomery,@function +.align 32 +bn_from_montgomery: + testl $7,%r9d + jz bn_from_mont8x + xorl %eax,%eax + .byte 0xf3,0xc3 +.size bn_from_montgomery,.-bn_from_montgomery + +.type bn_from_mont8x,@function +.align 32 +bn_from_mont8x: +.byte 0x67 + movq %rsp,%rax + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 +.byte 0x67 + movl %r9d,%r10d + shll $3,%r9d + shll $3+2,%r10d + negq %r9 + movq (%r8),%r8 + + + + + + + + leaq -64(%rsp,%r9,2),%r11 + subq %rsi,%r11 + andq $4095,%r11 + cmpq %r11,%r10 + jb .Lfrom_sp_alt + subq %r11,%rsp + leaq -64(%rsp,%r9,2),%rsp + jmp .Lfrom_sp_done + +.align 32 +.Lfrom_sp_alt: + leaq 4096-64(,%r9,2),%r10 + leaq -64(%rsp,%r9,2),%rsp + subq %r10,%r11 + movq $0,%r10 + cmovcq %r10,%r11 + subq %r11,%rsp +.Lfrom_sp_done: + andq $-64,%rsp + movq %r9,%r10 + negq %r9 + + + + + + + + + + + movq %r8,32(%rsp) + movq %rax,40(%rsp) +.Lfrom_body: + movq %r9,%r11 + leaq 48(%rsp),%rax + pxor %xmm0,%xmm0 + jmp .Lmul_by_1 + +.align 32 +.Lmul_by_1: movdqu (%rsi),%xmm1 - movdqa %xmm0,(%rsp) - movdqu %xmm1,(%rdi) - jmp .Lcopy4x -.align 16 -.Lcopy4x: - movdqu 16(%rsi,%r14,1),%xmm2 - movdqu 32(%rsi,%r14,1),%xmm1 - movdqa %xmm0,16(%rsp,%r14,1) - movdqu %xmm2,16(%rdi,%r14,1) - movdqa %xmm0,32(%rsp,%r14,1) - movdqu %xmm1,32(%rdi,%r14,1) - leaq 32(%r14),%r14 - decq %r15 - jnz .Lcopy4x + movdqu 16(%rsi),%xmm2 + movdqu 32(%rsi),%xmm3 + movdqa %xmm0,(%rax,%r9,1) + movdqu 48(%rsi),%xmm4 + movdqa %xmm0,16(%rax,%r9,1) +.byte 0x48,0x8d,0xb6,0x40,0x00,0x00,0x00 + movdqa %xmm1,(%rax) + movdqa %xmm0,32(%rax,%r9,1) + movdqa %xmm2,16(%rax) + movdqa %xmm0,48(%rax,%r9,1) + movdqa %xmm3,32(%rax) + movdqa %xmm4,48(%rax) + leaq 64(%rax),%rax + subq $64,%r11 + jnz .Lmul_by_1 + +.byte 102,72,15,110,207 +.byte 102,72,15,110,209 +.byte 0x67 + movq %rcx,%rbp +.byte 102,73,15,110,218 + call sqr8x_reduction + + pxor %xmm0,%xmm0 + leaq 48(%rsp),%rax + movq 40(%rsp),%rsi + jmp .Lfrom_mont_zero + +.align 32 +.Lfrom_mont_zero: + movdqa %xmm0,0(%rax) + movdqa %xmm0,16(%rax) + movdqa %xmm0,32(%rax) + movdqa %xmm0,48(%rax) + leaq 64(%rax),%rax + subq $32,%r9 + jnz .Lfrom_mont_zero - shlq $2,%r9 - movdqu 16(%rsi,%r14,1),%xmm2 - movdqa %xmm0,16(%rsp,%r14,1) - movdqu %xmm2,16(%rdi,%r14,1) - movq 8(%rsp,%r9,8),%rsi movq $1,%rax - movq (%rsi),%r15 - movq 8(%rsi),%r14 - movq 16(%rsi),%r13 - movq 24(%rsi),%r12 - movq 32(%rsi),%rbp - movq 40(%rsi),%rbx - leaq 48(%rsi),%rsp -.Lmul4x_epilogue: + movq -48(%rsi),%r15 + movq -40(%rsi),%r14 + movq -32(%rsi),%r13 + movq -24(%rsi),%r12 + movq -16(%rsi),%rbp + movq -8(%rsi),%rbx + leaq (%rsi),%rsp +.Lfrom_epilogue: .byte 0xf3,0xc3 -.size bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5 +.size bn_from_mont8x,.-bn_from_mont8x +.globl bn_get_bits5 +.type bn_get_bits5,@function +.align 16 +bn_get_bits5: + leaq 0(%rdi),%r10 + leaq 1(%rdi),%r11 + movl %esi,%ecx + shrl $4,%esi + andl $15,%ecx + leal -8(%rcx),%eax + cmpl $11,%ecx + cmovaq %r11,%r10 + cmoval %eax,%ecx + movzwl (%r10,%rsi,2),%eax + shrl %cl,%eax + andl $31,%eax + .byte 0xf3,0xc3 +.size bn_get_bits5,.-bn_get_bits5 + .globl bn_scatter5 .type bn_scatter5,@function .align 16 bn_scatter5: - cmpq $0,%rsi + cmpl $0,%esi jz .Lscatter_epilogue leaq (%rdx,%rcx,8),%rdx .Lscatter: @@ -734,7 +1783,7 @@ bn_scatter5: leaq 8(%rdi),%rdi movq %rax,(%rdx) leaq 256(%rdx),%rdx - subq $1,%rsi + subl $1,%esi jnz .Lscatter .Lscatter_epilogue: .byte 0xf3,0xc3 @@ -744,13 +1793,13 @@ bn_scatter5: .type bn_gather5,@function .align 16 bn_gather5: - movq %rcx,%r11 - shrq $3,%rcx + movl %ecx,%r11d + shrl $3,%ecx andq $7,%r11 - notq %rcx + notl %ecx leaq .Lmagic_masks(%rip),%rax - andq $3,%rcx - leaq 96(%rdx,%r11,8),%rdx + andl $3,%ecx + leaq 128(%rdx,%r11,8),%rdx movq 0(%rax,%rcx,8),%xmm4 movq 8(%rax,%rcx,8),%xmm5 movq 16(%rax,%rcx,8),%xmm6 @@ -758,22 +1807,23 @@ bn_gather5: jmp .Lgather .align 16 .Lgather: - movq -96(%rdx),%xmm0 - movq -32(%rdx),%xmm1 + movq -128(%rdx),%xmm0 + movq -64(%rdx),%xmm1 pand %xmm4,%xmm0 - movq 32(%rdx),%xmm2 + movq 0(%rdx),%xmm2 pand %xmm5,%xmm1 - movq 96(%rdx),%xmm3 + movq 64(%rdx),%xmm3 pand %xmm6,%xmm2 por %xmm1,%xmm0 pand %xmm7,%xmm3 +.byte 0x67,0x67 por %xmm2,%xmm0 leaq 256(%rdx),%rdx por %xmm3,%xmm0 movq %xmm0,(%rdi) leaq 8(%rdi),%rdi - subq $1,%rsi + subl $1,%esi jnz .Lgather .byte 0xf3,0xc3 .LSEH_end_bn_gather5: diff --git a/secure/lib/libcrypto/amd64/x86_64cpuid.S b/secure/lib/libcrypto/amd64/x86_64cpuid.S index c5d8399..93de516 100644 --- a/secure/lib/libcrypto/amd64/x86_64cpuid.S +++ b/secure/lib/libcrypto/amd64/x86_64cpuid.S @@ -5,7 +5,7 @@ call OPENSSL_cpuid_setup .hidden OPENSSL_ia32cap_P -.comm OPENSSL_ia32cap_P,8,4 +.comm OPENSSL_ia32cap_P,16,4 .text @@ -15,11 +15,11 @@ OPENSSL_atomic_add: movl (%rdi),%eax .Lspin: leaq (%rsi,%rax,1),%r8 -.byte 0xf0 +.byte 0xf0 cmpxchgl %r8d,(%rdi) jne .Lspin movl %r8d,%eax -.byte 0x48,0x98 +.byte 0x48,0x98 .byte 0xf3,0xc3 .size OPENSSL_atomic_add,.-OPENSSL_atomic_add @@ -40,6 +40,7 @@ OPENSSL_ia32_cpuid: movq %rbx,%r8 xorl %eax,%eax + movl %eax,8(%rdi) cpuid movl %eax,%r11d @@ -107,6 +108,14 @@ OPENSSL_ia32_cpuid: shrl $14,%r10d andl $4095,%r10d + cmpl $7,%r11d + jb .Lnocacheinfo + + movl $7,%eax + xorl %ecx,%ecx + cpuid + movl %ebx,8(%rdi) + .Lnocacheinfo: movl $1,%eax cpuid @@ -139,13 +148,14 @@ OPENSSL_ia32_cpuid: btl $27,%r9d jnc .Lclear_avx xorl %ecx,%ecx -.byte 0x0f,0x01,0xd0 +.byte 0x0f,0x01,0xd0 andl $6,%eax cmpl $6,%eax je .Ldone .Lclear_avx: movl $4026525695,%eax andl %eax,%r9d + andl $4294967263,8(%rdi) .Ldone: shlq $32,%r9 movl %r10d,%eax @@ -233,3 +243,18 @@ OPENSSL_ia32_rdrand: cmoveq %rcx,%rax .byte 0xf3,0xc3 .size OPENSSL_ia32_rdrand,.-OPENSSL_ia32_rdrand + +.globl OPENSSL_ia32_rdseed +.type OPENSSL_ia32_rdseed,@function +.align 16 +OPENSSL_ia32_rdseed: + movl $8,%ecx +.Loop_rdseed: +.byte 72,15,199,248 + jc .Lbreak_rdseed + loop .Loop_rdseed +.Lbreak_rdseed: + cmpq $0,%rax + cmoveq %rcx,%rax + .byte 0xf3,0xc3 +.size OPENSSL_ia32_rdseed,.-OPENSSL_ia32_rdseed |