diff options
Diffstat (limited to 'secure/lib/libcrypto/amd64')
26 files changed, 38132 insertions, 0 deletions
diff --git a/secure/lib/libcrypto/amd64/aes-x86_64.S b/secure/lib/libcrypto/amd64/aes-x86_64.S new file mode 100644 index 0000000..3243d6d --- /dev/null +++ b/secure/lib/libcrypto/amd64/aes-x86_64.S @@ -0,0 +1,2535 @@ + # $FreeBSD$ +.text +.type _x86_64_AES_encrypt,@function +.align 16 +_x86_64_AES_encrypt: + xorl 0(%r15),%eax + xorl 4(%r15),%ebx + xorl 8(%r15),%ecx + xorl 12(%r15),%edx + + movl 240(%r15),%r13d + subl $1,%r13d + jmp .Lenc_loop +.align 16 +.Lenc_loop: + + movzbl %al,%esi + movzbl %bl,%edi + movzbl %cl,%ebp + movl 0(%r14,%rsi,8),%r10d + movl 0(%r14,%rdi,8),%r11d + movl 0(%r14,%rbp,8),%r12d + + movzbl %bh,%esi + movzbl %ch,%edi + movzbl %dl,%ebp + xorl 3(%r14,%rsi,8),%r10d + xorl 3(%r14,%rdi,8),%r11d + movl 0(%r14,%rbp,8),%r8d + + movzbl %dh,%esi + shrl $16,%ecx + movzbl %ah,%ebp + xorl 3(%r14,%rsi,8),%r12d + shrl $16,%edx + xorl 3(%r14,%rbp,8),%r8d + + shrl $16,%ebx + leaq 16(%r15),%r15 + shrl $16,%eax + + movzbl %cl,%esi + movzbl %dl,%edi + movzbl %al,%ebp + xorl 2(%r14,%rsi,8),%r10d + xorl 2(%r14,%rdi,8),%r11d + xorl 2(%r14,%rbp,8),%r12d + + movzbl %dh,%esi + movzbl %ah,%edi + movzbl %bl,%ebp + xorl 1(%r14,%rsi,8),%r10d + xorl 1(%r14,%rdi,8),%r11d + xorl 2(%r14,%rbp,8),%r8d + + movl 12(%r15),%edx + movzbl %bh,%edi + movzbl %ch,%ebp + movl 0(%r15),%eax + xorl 1(%r14,%rdi,8),%r12d + xorl 1(%r14,%rbp,8),%r8d + + movl 4(%r15),%ebx + movl 8(%r15),%ecx + xorl %r10d,%eax + xorl %r11d,%ebx + xorl %r12d,%ecx + xorl %r8d,%edx + subl $1,%r13d + jnz .Lenc_loop + movzbl %al,%esi + movzbl %bl,%edi + movzbl %cl,%ebp + movzbl 2(%r14,%rsi,8),%r10d + movzbl 2(%r14,%rdi,8),%r11d + movzbl 2(%r14,%rbp,8),%r12d + + movzbl %dl,%esi + movzbl %bh,%edi + movzbl %ch,%ebp + movzbl 2(%r14,%rsi,8),%r8d + movl 0(%r14,%rdi,8),%edi + movl 0(%r14,%rbp,8),%ebp + + andl $65280,%edi + andl $65280,%ebp + + xorl %edi,%r10d + xorl %ebp,%r11d + shrl $16,%ecx + + movzbl %dh,%esi + movzbl %ah,%edi + shrl $16,%edx + movl 0(%r14,%rsi,8),%esi + movl 0(%r14,%rdi,8),%edi + + andl $65280,%esi + andl $65280,%edi + shrl $16,%ebx + xorl %esi,%r12d + xorl %edi,%r8d + shrl $16,%eax + + movzbl %cl,%esi + movzbl %dl,%edi + movzbl %al,%ebp + movl 0(%r14,%rsi,8),%esi + movl 0(%r14,%rdi,8),%edi + movl 0(%r14,%rbp,8),%ebp + + andl $16711680,%esi + andl $16711680,%edi + andl $16711680,%ebp + + xorl %esi,%r10d + xorl %edi,%r11d + xorl %ebp,%r12d + + movzbl %bl,%esi + movzbl %dh,%edi + movzbl %ah,%ebp + movl 0(%r14,%rsi,8),%esi + movl 2(%r14,%rdi,8),%edi + movl 2(%r14,%rbp,8),%ebp + + andl $16711680,%esi + andl $4278190080,%edi + andl $4278190080,%ebp + + xorl %esi,%r8d + xorl %edi,%r10d + xorl %ebp,%r11d + + movzbl %bh,%esi + movzbl %ch,%edi + movl 16+12(%r15),%edx + movl 2(%r14,%rsi,8),%esi + movl 2(%r14,%rdi,8),%edi + movl 16+0(%r15),%eax + + andl $4278190080,%esi + andl $4278190080,%edi + + xorl %esi,%r12d + xorl %edi,%r8d + + movl 16+4(%r15),%ebx + movl 16+8(%r15),%ecx + xorl %r10d,%eax + xorl %r11d,%ebx + xorl %r12d,%ecx + xorl %r8d,%edx +.byte 0xf3,0xc3 +.size _x86_64_AES_encrypt,.-_x86_64_AES_encrypt +.type _x86_64_AES_encrypt_compact,@function +.align 16 +_x86_64_AES_encrypt_compact: + leaq 128(%r14),%r8 + movl 0-128(%r8),%edi + movl 32-128(%r8),%ebp + movl 64-128(%r8),%r10d + movl 96-128(%r8),%r11d + movl 128-128(%r8),%edi + movl 160-128(%r8),%ebp + movl 192-128(%r8),%r10d + movl 224-128(%r8),%r11d + jmp .Lenc_loop_compact +.align 16 +.Lenc_loop_compact: + xorl 0(%r15),%eax + xorl 4(%r15),%ebx + xorl 8(%r15),%ecx + xorl 12(%r15),%edx + leaq 16(%r15),%r15 + movzbl %al,%r10d + movzbl %bl,%r11d + movzbl %cl,%r12d + movzbl %dl,%r8d + movzbl %bh,%esi + movzbl %ch,%edi + shrl $16,%ecx + movzbl %dh,%ebp + movzbl (%r14,%r10,1),%r10d + movzbl (%r14,%r11,1),%r11d + movzbl (%r14,%r12,1),%r12d + movzbl (%r14,%r8,1),%r8d + + movzbl (%r14,%rsi,1),%r9d + movzbl %ah,%esi + movzbl (%r14,%rdi,1),%r13d + movzbl %cl,%edi + movzbl (%r14,%rbp,1),%ebp + movzbl (%r14,%rsi,1),%esi + + shll $8,%r9d + shrl $16,%edx + shll $8,%r13d + xorl %r9d,%r10d + shrl $16,%eax + movzbl %dl,%r9d + shrl $16,%ebx + xorl %r13d,%r11d + shll $8,%ebp + movzbl %al,%r13d + movzbl (%r14,%rdi,1),%edi + xorl %ebp,%r12d + + shll $8,%esi + movzbl %bl,%ebp + shll $16,%edi + xorl %esi,%r8d + movzbl (%r14,%r9,1),%r9d + movzbl %dh,%esi + movzbl (%r14,%r13,1),%r13d + xorl %edi,%r10d + + shrl $8,%ecx + movzbl %ah,%edi + shll $16,%r9d + shrl $8,%ebx + shll $16,%r13d + xorl %r9d,%r11d + movzbl (%r14,%rbp,1),%ebp + movzbl (%r14,%rsi,1),%esi + movzbl (%r14,%rdi,1),%edi + movzbl (%r14,%rcx,1),%edx + movzbl (%r14,%rbx,1),%ecx + + shll $16,%ebp + xorl %r13d,%r12d + shll $24,%esi + xorl %ebp,%r8d + shll $24,%edi + xorl %esi,%r10d + shll $24,%edx + xorl %edi,%r11d + shll $24,%ecx + movl %r10d,%eax + movl %r11d,%ebx + xorl %r12d,%ecx + xorl %r8d,%edx + cmpq 16(%rsp),%r15 + je .Lenc_compact_done + movl $2155905152,%r10d + movl $2155905152,%r11d + andl %eax,%r10d + andl %ebx,%r11d + movl %r10d,%esi + movl %r11d,%edi + shrl $7,%r10d + leal (%rax,%rax,1),%r8d + shrl $7,%r11d + leal (%rbx,%rbx,1),%r9d + subl %r10d,%esi + subl %r11d,%edi + andl $4278124286,%r8d + andl $4278124286,%r9d + andl $454761243,%esi + andl $454761243,%edi + movl %eax,%r10d + movl %ebx,%r11d + xorl %esi,%r8d + xorl %edi,%r9d + + xorl %r8d,%eax + xorl %r9d,%ebx + movl $2155905152,%r12d + roll $24,%eax + movl $2155905152,%ebp + roll $24,%ebx + andl %ecx,%r12d + andl %edx,%ebp + xorl %r8d,%eax + xorl %r9d,%ebx + movl %r12d,%esi + rorl $16,%r10d + movl %ebp,%edi + rorl $16,%r11d + leal (%rcx,%rcx,1),%r8d + shrl $7,%r12d + xorl %r10d,%eax + shrl $7,%ebp + xorl %r11d,%ebx + rorl $8,%r10d + leal (%rdx,%rdx,1),%r9d + rorl $8,%r11d + subl %r12d,%esi + subl %ebp,%edi + xorl %r10d,%eax + xorl %r11d,%ebx + + andl $4278124286,%r8d + andl $4278124286,%r9d + andl $454761243,%esi + andl $454761243,%edi + movl %ecx,%r12d + movl %edx,%ebp + xorl %esi,%r8d + xorl %edi,%r9d + + rorl $16,%r12d + xorl %r8d,%ecx + rorl $16,%ebp + xorl %r9d,%edx + roll $24,%ecx + movl 0(%r14),%esi + roll $24,%edx + xorl %r8d,%ecx + movl 64(%r14),%edi + xorl %r9d,%edx + movl 128(%r14),%r8d + xorl %r12d,%ecx + rorl $8,%r12d + xorl %ebp,%edx + rorl $8,%ebp + xorl %r12d,%ecx + movl 192(%r14),%r9d + xorl %ebp,%edx + jmp .Lenc_loop_compact +.align 16 +.Lenc_compact_done: + xorl 0(%r15),%eax + xorl 4(%r15),%ebx + xorl 8(%r15),%ecx + xorl 12(%r15),%edx +.byte 0xf3,0xc3 +.size _x86_64_AES_encrypt_compact,.-_x86_64_AES_encrypt_compact +.globl AES_encrypt +.type AES_encrypt,@function +.align 16 +.globl asm_AES_encrypt +.hidden asm_AES_encrypt +asm_AES_encrypt: +AES_encrypt: + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + + movq %rsp,%r10 + leaq -63(%rdx),%rcx + andq $-64,%rsp + subq %rsp,%rcx + negq %rcx + andq $960,%rcx + subq %rcx,%rsp + subq $32,%rsp + + movq %rsi,16(%rsp) + movq %r10,24(%rsp) +.Lenc_prologue: + + movq %rdx,%r15 + movl 240(%r15),%r13d + + movl 0(%rdi),%eax + movl 4(%rdi),%ebx + movl 8(%rdi),%ecx + movl 12(%rdi),%edx + + shll $4,%r13d + leaq (%r15,%r13,1),%rbp + movq %r15,(%rsp) + movq %rbp,8(%rsp) + + + leaq .LAES_Te+2048(%rip),%r14 + leaq 768(%rsp),%rbp + subq %r14,%rbp + andq $768,%rbp + leaq (%r14,%rbp,1),%r14 + + call _x86_64_AES_encrypt_compact + + movq 16(%rsp),%r9 + movq 24(%rsp),%rsi + movl %eax,0(%r9) + movl %ebx,4(%r9) + movl %ecx,8(%r9) + movl %edx,12(%r9) + + movq (%rsi),%r15 + movq 8(%rsi),%r14 + movq 16(%rsi),%r13 + movq 24(%rsi),%r12 + movq 32(%rsi),%rbp + movq 40(%rsi),%rbx + leaq 48(%rsi),%rsp +.Lenc_epilogue: + .byte 0xf3,0xc3 +.size AES_encrypt,.-AES_encrypt +.type _x86_64_AES_decrypt,@function +.align 16 +_x86_64_AES_decrypt: + xorl 0(%r15),%eax + xorl 4(%r15),%ebx + xorl 8(%r15),%ecx + xorl 12(%r15),%edx + + movl 240(%r15),%r13d + subl $1,%r13d + jmp .Ldec_loop +.align 16 +.Ldec_loop: + + movzbl %al,%esi + movzbl %bl,%edi + movzbl %cl,%ebp + movl 0(%r14,%rsi,8),%r10d + movl 0(%r14,%rdi,8),%r11d + movl 0(%r14,%rbp,8),%r12d + + movzbl %dh,%esi + movzbl %ah,%edi + movzbl %dl,%ebp + xorl 3(%r14,%rsi,8),%r10d + xorl 3(%r14,%rdi,8),%r11d + movl 0(%r14,%rbp,8),%r8d + + movzbl %bh,%esi + shrl $16,%eax + movzbl %ch,%ebp + xorl 3(%r14,%rsi,8),%r12d + shrl $16,%edx + xorl 3(%r14,%rbp,8),%r8d + + shrl $16,%ebx + leaq 16(%r15),%r15 + shrl $16,%ecx + + movzbl %cl,%esi + movzbl %dl,%edi + movzbl %al,%ebp + xorl 2(%r14,%rsi,8),%r10d + xorl 2(%r14,%rdi,8),%r11d + xorl 2(%r14,%rbp,8),%r12d + + movzbl %bh,%esi + movzbl %ch,%edi + movzbl %bl,%ebp + xorl 1(%r14,%rsi,8),%r10d + xorl 1(%r14,%rdi,8),%r11d + xorl 2(%r14,%rbp,8),%r8d + + movzbl %dh,%esi + movl 12(%r15),%edx + movzbl %ah,%ebp + xorl 1(%r14,%rsi,8),%r12d + movl 0(%r15),%eax + xorl 1(%r14,%rbp,8),%r8d + + xorl %r10d,%eax + movl 4(%r15),%ebx + movl 8(%r15),%ecx + xorl %r12d,%ecx + xorl %r11d,%ebx + xorl %r8d,%edx + subl $1,%r13d + jnz .Ldec_loop + leaq 2048(%r14),%r14 + movzbl %al,%esi + movzbl %bl,%edi + movzbl %cl,%ebp + movzbl (%r14,%rsi,1),%r10d + movzbl (%r14,%rdi,1),%r11d + movzbl (%r14,%rbp,1),%r12d + + movzbl %dl,%esi + movzbl %dh,%edi + movzbl %ah,%ebp + movzbl (%r14,%rsi,1),%r8d + movzbl (%r14,%rdi,1),%edi + movzbl (%r14,%rbp,1),%ebp + + shll $8,%edi + shll $8,%ebp + + xorl %edi,%r10d + xorl %ebp,%r11d + shrl $16,%edx + + movzbl %bh,%esi + movzbl %ch,%edi + shrl $16,%eax + movzbl (%r14,%rsi,1),%esi + movzbl (%r14,%rdi,1),%edi + + shll $8,%esi + shll $8,%edi + shrl $16,%ebx + xorl %esi,%r12d + xorl %edi,%r8d + shrl $16,%ecx + + movzbl %cl,%esi + movzbl %dl,%edi + movzbl %al,%ebp + movzbl (%r14,%rsi,1),%esi + movzbl (%r14,%rdi,1),%edi + movzbl (%r14,%rbp,1),%ebp + + shll $16,%esi + shll $16,%edi + shll $16,%ebp + + xorl %esi,%r10d + xorl %edi,%r11d + xorl %ebp,%r12d + + movzbl %bl,%esi + movzbl %bh,%edi + movzbl %ch,%ebp + movzbl (%r14,%rsi,1),%esi + movzbl (%r14,%rdi,1),%edi + movzbl (%r14,%rbp,1),%ebp + + shll $16,%esi + shll $24,%edi + shll $24,%ebp + + xorl %esi,%r8d + xorl %edi,%r10d + xorl %ebp,%r11d + + movzbl %dh,%esi + movzbl %ah,%edi + movl 16+12(%r15),%edx + movzbl (%r14,%rsi,1),%esi + movzbl (%r14,%rdi,1),%edi + movl 16+0(%r15),%eax + + shll $24,%esi + shll $24,%edi + + xorl %esi,%r12d + xorl %edi,%r8d + + movl 16+4(%r15),%ebx + movl 16+8(%r15),%ecx + leaq -2048(%r14),%r14 + xorl %r10d,%eax + xorl %r11d,%ebx + xorl %r12d,%ecx + xorl %r8d,%edx +.byte 0xf3,0xc3 +.size _x86_64_AES_decrypt,.-_x86_64_AES_decrypt +.type _x86_64_AES_decrypt_compact,@function +.align 16 +_x86_64_AES_decrypt_compact: + leaq 128(%r14),%r8 + movl 0-128(%r8),%edi + movl 32-128(%r8),%ebp + movl 64-128(%r8),%r10d + movl 96-128(%r8),%r11d + movl 128-128(%r8),%edi + movl 160-128(%r8),%ebp + movl 192-128(%r8),%r10d + movl 224-128(%r8),%r11d + jmp .Ldec_loop_compact + +.align 16 +.Ldec_loop_compact: + xorl 0(%r15),%eax + xorl 4(%r15),%ebx + xorl 8(%r15),%ecx + xorl 12(%r15),%edx + leaq 16(%r15),%r15 + movzbl %al,%r10d + movzbl %bl,%r11d + movzbl %cl,%r12d + movzbl %dl,%r8d + movzbl %dh,%esi + movzbl %ah,%edi + shrl $16,%edx + movzbl %bh,%ebp + movzbl (%r14,%r10,1),%r10d + movzbl (%r14,%r11,1),%r11d + movzbl (%r14,%r12,1),%r12d + movzbl (%r14,%r8,1),%r8d + + movzbl (%r14,%rsi,1),%r9d + movzbl %ch,%esi + movzbl (%r14,%rdi,1),%r13d + movzbl (%r14,%rbp,1),%ebp + movzbl (%r14,%rsi,1),%esi + + shrl $16,%ecx + shll $8,%r13d + shll $8,%r9d + movzbl %cl,%edi + shrl $16,%eax + xorl %r9d,%r10d + shrl $16,%ebx + movzbl %dl,%r9d + + shll $8,%ebp + xorl %r13d,%r11d + shll $8,%esi + movzbl %al,%r13d + movzbl (%r14,%rdi,1),%edi + xorl %ebp,%r12d + movzbl %bl,%ebp + + shll $16,%edi + xorl %esi,%r8d + movzbl (%r14,%r9,1),%r9d + movzbl %bh,%esi + movzbl (%r14,%rbp,1),%ebp + xorl %edi,%r10d + movzbl (%r14,%r13,1),%r13d + movzbl %ch,%edi + + shll $16,%ebp + shll $16,%r9d + shll $16,%r13d + xorl %ebp,%r8d + movzbl %dh,%ebp + xorl %r9d,%r11d + shrl $8,%eax + xorl %r13d,%r12d + + movzbl (%r14,%rsi,1),%esi + movzbl (%r14,%rdi,1),%ebx + movzbl (%r14,%rbp,1),%ecx + movzbl (%r14,%rax,1),%edx + + movl %r10d,%eax + shll $24,%esi + shll $24,%ebx + shll $24,%ecx + xorl %esi,%eax + shll $24,%edx + xorl %r11d,%ebx + xorl %r12d,%ecx + xorl %r8d,%edx + cmpq 16(%rsp),%r15 + je .Ldec_compact_done + + movq 256+0(%r14),%rsi + shlq $32,%rbx + shlq $32,%rdx + movq 256+8(%r14),%rdi + orq %rbx,%rax + orq %rdx,%rcx + movq 256+16(%r14),%rbp + movq %rsi,%r9 + movq %rsi,%r12 + andq %rax,%r9 + andq %rcx,%r12 + movq %r9,%rbx + movq %r12,%rdx + shrq $7,%r9 + leaq (%rax,%rax,1),%r8 + shrq $7,%r12 + leaq (%rcx,%rcx,1),%r11 + subq %r9,%rbx + subq %r12,%rdx + andq %rdi,%r8 + andq %rdi,%r11 + andq %rbp,%rbx + andq %rbp,%rdx + xorq %rbx,%r8 + xorq %rdx,%r11 + movq %rsi,%r10 + movq %rsi,%r13 + + andq %r8,%r10 + andq %r11,%r13 + movq %r10,%rbx + movq %r13,%rdx + shrq $7,%r10 + leaq (%r8,%r8,1),%r9 + shrq $7,%r13 + leaq (%r11,%r11,1),%r12 + subq %r10,%rbx + subq %r13,%rdx + andq %rdi,%r9 + andq %rdi,%r12 + andq %rbp,%rbx + andq %rbp,%rdx + xorq %rbx,%r9 + xorq %rdx,%r12 + movq %rsi,%r10 + movq %rsi,%r13 + + andq %r9,%r10 + andq %r12,%r13 + movq %r10,%rbx + movq %r13,%rdx + shrq $7,%r10 + xorq %rax,%r8 + shrq $7,%r13 + xorq %rcx,%r11 + subq %r10,%rbx + subq %r13,%rdx + leaq (%r9,%r9,1),%r10 + leaq (%r12,%r12,1),%r13 + xorq %rax,%r9 + xorq %rcx,%r12 + andq %rdi,%r10 + andq %rdi,%r13 + andq %rbp,%rbx + andq %rbp,%rdx + xorq %rbx,%r10 + xorq %rdx,%r13 + + xorq %r10,%rax + xorq %r13,%rcx + xorq %r10,%r8 + xorq %r13,%r11 + movq %rax,%rbx + movq %rcx,%rdx + xorq %r10,%r9 + shrq $32,%rbx + xorq %r13,%r12 + shrq $32,%rdx + xorq %r8,%r10 + roll $8,%eax + xorq %r11,%r13 + roll $8,%ecx + xorq %r9,%r10 + roll $8,%ebx + xorq %r12,%r13 + + roll $8,%edx + xorl %r10d,%eax + shrq $32,%r10 + xorl %r13d,%ecx + shrq $32,%r13 + xorl %r10d,%ebx + xorl %r13d,%edx + + movq %r8,%r10 + roll $24,%r8d + movq %r11,%r13 + roll $24,%r11d + shrq $32,%r10 + xorl %r8d,%eax + shrq $32,%r13 + xorl %r11d,%ecx + roll $24,%r10d + movq %r9,%r8 + roll $24,%r13d + movq %r12,%r11 + shrq $32,%r8 + xorl %r10d,%ebx + shrq $32,%r11 + xorl %r13d,%edx + + movq 0(%r14),%rsi + roll $16,%r9d + movq 64(%r14),%rdi + roll $16,%r12d + movq 128(%r14),%rbp + roll $16,%r8d + movq 192(%r14),%r10 + xorl %r9d,%eax + roll $16,%r11d + xorl %r12d,%ecx + movq 256(%r14),%r13 + xorl %r8d,%ebx + xorl %r11d,%edx + jmp .Ldec_loop_compact +.align 16 +.Ldec_compact_done: + xorl 0(%r15),%eax + xorl 4(%r15),%ebx + xorl 8(%r15),%ecx + xorl 12(%r15),%edx +.byte 0xf3,0xc3 +.size _x86_64_AES_decrypt_compact,.-_x86_64_AES_decrypt_compact +.globl AES_decrypt +.type AES_decrypt,@function +.align 16 +.globl asm_AES_decrypt +.hidden asm_AES_decrypt +asm_AES_decrypt: +AES_decrypt: + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + + movq %rsp,%r10 + leaq -63(%rdx),%rcx + andq $-64,%rsp + subq %rsp,%rcx + negq %rcx + andq $960,%rcx + subq %rcx,%rsp + subq $32,%rsp + + movq %rsi,16(%rsp) + movq %r10,24(%rsp) +.Ldec_prologue: + + movq %rdx,%r15 + movl 240(%r15),%r13d + + movl 0(%rdi),%eax + movl 4(%rdi),%ebx + movl 8(%rdi),%ecx + movl 12(%rdi),%edx + + shll $4,%r13d + leaq (%r15,%r13,1),%rbp + movq %r15,(%rsp) + movq %rbp,8(%rsp) + + + leaq .LAES_Td+2048(%rip),%r14 + leaq 768(%rsp),%rbp + subq %r14,%rbp + andq $768,%rbp + leaq (%r14,%rbp,1),%r14 + shrq $3,%rbp + addq %rbp,%r14 + + call _x86_64_AES_decrypt_compact + + movq 16(%rsp),%r9 + movq 24(%rsp),%rsi + movl %eax,0(%r9) + movl %ebx,4(%r9) + movl %ecx,8(%r9) + movl %edx,12(%r9) + + movq (%rsi),%r15 + movq 8(%rsi),%r14 + movq 16(%rsi),%r13 + movq 24(%rsi),%r12 + movq 32(%rsi),%rbp + movq 40(%rsi),%rbx + leaq 48(%rsi),%rsp +.Ldec_epilogue: + .byte 0xf3,0xc3 +.size AES_decrypt,.-AES_decrypt +.globl private_AES_set_encrypt_key +.type private_AES_set_encrypt_key,@function +.align 16 +private_AES_set_encrypt_key: + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $8,%rsp +.Lenc_key_prologue: + + call _x86_64_AES_set_encrypt_key + + movq 40(%rsp),%rbp + movq 48(%rsp),%rbx + addq $56,%rsp +.Lenc_key_epilogue: + .byte 0xf3,0xc3 +.size private_AES_set_encrypt_key,.-private_AES_set_encrypt_key + +.type _x86_64_AES_set_encrypt_key,@function +.align 16 +_x86_64_AES_set_encrypt_key: + movl %esi,%ecx + movq %rdi,%rsi + movq %rdx,%rdi + + testq $-1,%rsi + jz .Lbadpointer + testq $-1,%rdi + jz .Lbadpointer + + leaq .LAES_Te(%rip),%rbp + leaq 2048+128(%rbp),%rbp + + + movl 0-128(%rbp),%eax + movl 32-128(%rbp),%ebx + movl 64-128(%rbp),%r8d + movl 96-128(%rbp),%edx + movl 128-128(%rbp),%eax + movl 160-128(%rbp),%ebx + movl 192-128(%rbp),%r8d + movl 224-128(%rbp),%edx + + cmpl $128,%ecx + je .L10rounds + cmpl $192,%ecx + je .L12rounds + cmpl $256,%ecx + je .L14rounds + movq $-2,%rax + jmp .Lexit + +.L10rounds: + movq 0(%rsi),%rax + movq 8(%rsi),%rdx + movq %rax,0(%rdi) + movq %rdx,8(%rdi) + + shrq $32,%rdx + xorl %ecx,%ecx + jmp .L10shortcut +.align 4 +.L10loop: + movl 0(%rdi),%eax + movl 12(%rdi),%edx +.L10shortcut: + movzbl %dl,%esi + movzbl -128(%rbp,%rsi,1),%ebx + movzbl %dh,%esi + shll $24,%ebx + xorl %ebx,%eax + + movzbl -128(%rbp,%rsi,1),%ebx + shrl $16,%edx + movzbl %dl,%esi + xorl %ebx,%eax + + movzbl -128(%rbp,%rsi,1),%ebx + movzbl %dh,%esi + shll $8,%ebx + xorl %ebx,%eax + + movzbl -128(%rbp,%rsi,1),%ebx + shll $16,%ebx + xorl %ebx,%eax + + xorl 1024-128(%rbp,%rcx,4),%eax + movl %eax,16(%rdi) + xorl 4(%rdi),%eax + movl %eax,20(%rdi) + xorl 8(%rdi),%eax + movl %eax,24(%rdi) + xorl 12(%rdi),%eax + movl %eax,28(%rdi) + addl $1,%ecx + leaq 16(%rdi),%rdi + cmpl $10,%ecx + jl .L10loop + + movl $10,80(%rdi) + xorq %rax,%rax + jmp .Lexit + +.L12rounds: + movq 0(%rsi),%rax + movq 8(%rsi),%rbx + movq 16(%rsi),%rdx + movq %rax,0(%rdi) + movq %rbx,8(%rdi) + movq %rdx,16(%rdi) + + shrq $32,%rdx + xorl %ecx,%ecx + jmp .L12shortcut +.align 4 +.L12loop: + movl 0(%rdi),%eax + movl 20(%rdi),%edx +.L12shortcut: + movzbl %dl,%esi + movzbl -128(%rbp,%rsi,1),%ebx + movzbl %dh,%esi + shll $24,%ebx + xorl %ebx,%eax + + movzbl -128(%rbp,%rsi,1),%ebx + shrl $16,%edx + movzbl %dl,%esi + xorl %ebx,%eax + + movzbl -128(%rbp,%rsi,1),%ebx + movzbl %dh,%esi + shll $8,%ebx + xorl %ebx,%eax + + movzbl -128(%rbp,%rsi,1),%ebx + shll $16,%ebx + xorl %ebx,%eax + + xorl 1024-128(%rbp,%rcx,4),%eax + movl %eax,24(%rdi) + xorl 4(%rdi),%eax + movl %eax,28(%rdi) + xorl 8(%rdi),%eax + movl %eax,32(%rdi) + xorl 12(%rdi),%eax + movl %eax,36(%rdi) + + cmpl $7,%ecx + je .L12break + addl $1,%ecx + + xorl 16(%rdi),%eax + movl %eax,40(%rdi) + xorl 20(%rdi),%eax + movl %eax,44(%rdi) + + leaq 24(%rdi),%rdi + jmp .L12loop +.L12break: + movl $12,72(%rdi) + xorq %rax,%rax + jmp .Lexit + +.L14rounds: + movq 0(%rsi),%rax + movq 8(%rsi),%rbx + movq 16(%rsi),%rcx + movq 24(%rsi),%rdx + movq %rax,0(%rdi) + movq %rbx,8(%rdi) + movq %rcx,16(%rdi) + movq %rdx,24(%rdi) + + shrq $32,%rdx + xorl %ecx,%ecx + jmp .L14shortcut +.align 4 +.L14loop: + movl 0(%rdi),%eax + movl 28(%rdi),%edx +.L14shortcut: + movzbl %dl,%esi + movzbl -128(%rbp,%rsi,1),%ebx + movzbl %dh,%esi + shll $24,%ebx + xorl %ebx,%eax + + movzbl -128(%rbp,%rsi,1),%ebx + shrl $16,%edx + movzbl %dl,%esi + xorl %ebx,%eax + + movzbl -128(%rbp,%rsi,1),%ebx + movzbl %dh,%esi + shll $8,%ebx + xorl %ebx,%eax + + movzbl -128(%rbp,%rsi,1),%ebx + shll $16,%ebx + xorl %ebx,%eax + + xorl 1024-128(%rbp,%rcx,4),%eax + movl %eax,32(%rdi) + xorl 4(%rdi),%eax + movl %eax,36(%rdi) + xorl 8(%rdi),%eax + movl %eax,40(%rdi) + xorl 12(%rdi),%eax + movl %eax,44(%rdi) + + cmpl $6,%ecx + je .L14break + addl $1,%ecx + + movl %eax,%edx + movl 16(%rdi),%eax + movzbl %dl,%esi + movzbl -128(%rbp,%rsi,1),%ebx + movzbl %dh,%esi + xorl %ebx,%eax + + movzbl -128(%rbp,%rsi,1),%ebx + shrl $16,%edx + shll $8,%ebx + movzbl %dl,%esi + xorl %ebx,%eax + + movzbl -128(%rbp,%rsi,1),%ebx + movzbl %dh,%esi + shll $16,%ebx + xorl %ebx,%eax + + movzbl -128(%rbp,%rsi,1),%ebx + shll $24,%ebx + xorl %ebx,%eax + + movl %eax,48(%rdi) + xorl 20(%rdi),%eax + movl %eax,52(%rdi) + xorl 24(%rdi),%eax + movl %eax,56(%rdi) + xorl 28(%rdi),%eax + movl %eax,60(%rdi) + + leaq 32(%rdi),%rdi + jmp .L14loop +.L14break: + movl $14,48(%rdi) + xorq %rax,%rax + jmp .Lexit + +.Lbadpointer: + movq $-1,%rax +.Lexit: +.byte 0xf3,0xc3 +.size _x86_64_AES_set_encrypt_key,.-_x86_64_AES_set_encrypt_key +.globl private_AES_set_decrypt_key +.type private_AES_set_decrypt_key,@function +.align 16 +private_AES_set_decrypt_key: + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + pushq %rdx +.Ldec_key_prologue: + + call _x86_64_AES_set_encrypt_key + movq (%rsp),%r8 + cmpl $0,%eax + jne .Labort + + movl 240(%r8),%r14d + xorq %rdi,%rdi + leaq (%rdi,%r14,4),%rcx + movq %r8,%rsi + leaq (%r8,%rcx,4),%rdi +.align 4 +.Linvert: + movq 0(%rsi),%rax + movq 8(%rsi),%rbx + movq 0(%rdi),%rcx + movq 8(%rdi),%rdx + movq %rax,0(%rdi) + movq %rbx,8(%rdi) + movq %rcx,0(%rsi) + movq %rdx,8(%rsi) + leaq 16(%rsi),%rsi + leaq -16(%rdi),%rdi + cmpq %rsi,%rdi + jne .Linvert + + leaq .LAES_Te+2048+1024(%rip),%rax + + movq 40(%rax),%rsi + movq 48(%rax),%rdi + movq 56(%rax),%rbp + + movq %r8,%r15 + subl $1,%r14d +.align 4 +.Lpermute: + leaq 16(%r15),%r15 + movq 0(%r15),%rax + movq 8(%r15),%rcx + movq %rsi,%r9 + movq %rsi,%r12 + andq %rax,%r9 + andq %rcx,%r12 + movq %r9,%rbx + movq %r12,%rdx + shrq $7,%r9 + leaq (%rax,%rax,1),%r8 + shrq $7,%r12 + leaq (%rcx,%rcx,1),%r11 + subq %r9,%rbx + subq %r12,%rdx + andq %rdi,%r8 + andq %rdi,%r11 + andq %rbp,%rbx + andq %rbp,%rdx + xorq %rbx,%r8 + xorq %rdx,%r11 + movq %rsi,%r10 + movq %rsi,%r13 + + andq %r8,%r10 + andq %r11,%r13 + movq %r10,%rbx + movq %r13,%rdx + shrq $7,%r10 + leaq (%r8,%r8,1),%r9 + shrq $7,%r13 + leaq (%r11,%r11,1),%r12 + subq %r10,%rbx + subq %r13,%rdx + andq %rdi,%r9 + andq %rdi,%r12 + andq %rbp,%rbx + andq %rbp,%rdx + xorq %rbx,%r9 + xorq %rdx,%r12 + movq %rsi,%r10 + movq %rsi,%r13 + + andq %r9,%r10 + andq %r12,%r13 + movq %r10,%rbx + movq %r13,%rdx + shrq $7,%r10 + xorq %rax,%r8 + shrq $7,%r13 + xorq %rcx,%r11 + subq %r10,%rbx + subq %r13,%rdx + leaq (%r9,%r9,1),%r10 + leaq (%r12,%r12,1),%r13 + xorq %rax,%r9 + xorq %rcx,%r12 + andq %rdi,%r10 + andq %rdi,%r13 + andq %rbp,%rbx + andq %rbp,%rdx + xorq %rbx,%r10 + xorq %rdx,%r13 + + xorq %r10,%rax + xorq %r13,%rcx + xorq %r10,%r8 + xorq %r13,%r11 + movq %rax,%rbx + movq %rcx,%rdx + xorq %r10,%r9 + shrq $32,%rbx + xorq %r13,%r12 + shrq $32,%rdx + xorq %r8,%r10 + roll $8,%eax + xorq %r11,%r13 + roll $8,%ecx + xorq %r9,%r10 + roll $8,%ebx + xorq %r12,%r13 + + roll $8,%edx + xorl %r10d,%eax + shrq $32,%r10 + xorl %r13d,%ecx + shrq $32,%r13 + xorl %r10d,%ebx + xorl %r13d,%edx + + movq %r8,%r10 + roll $24,%r8d + movq %r11,%r13 + roll $24,%r11d + shrq $32,%r10 + xorl %r8d,%eax + shrq $32,%r13 + xorl %r11d,%ecx + roll $24,%r10d + movq %r9,%r8 + roll $24,%r13d + movq %r12,%r11 + shrq $32,%r8 + xorl %r10d,%ebx + shrq $32,%r11 + xorl %r13d,%edx + + + roll $16,%r9d + + roll $16,%r12d + + roll $16,%r8d + + xorl %r9d,%eax + roll $16,%r11d + xorl %r12d,%ecx + + xorl %r8d,%ebx + xorl %r11d,%edx + movl %eax,0(%r15) + movl %ebx,4(%r15) + movl %ecx,8(%r15) + movl %edx,12(%r15) + subl $1,%r14d + jnz .Lpermute + + xorq %rax,%rax +.Labort: + movq 8(%rsp),%r15 + movq 16(%rsp),%r14 + movq 24(%rsp),%r13 + movq 32(%rsp),%r12 + movq 40(%rsp),%rbp + movq 48(%rsp),%rbx + addq $56,%rsp +.Ldec_key_epilogue: + .byte 0xf3,0xc3 +.size private_AES_set_decrypt_key,.-private_AES_set_decrypt_key +.globl AES_cbc_encrypt +.type AES_cbc_encrypt,@function +.align 16 + +.globl asm_AES_cbc_encrypt +.hidden asm_AES_cbc_encrypt +asm_AES_cbc_encrypt: +AES_cbc_encrypt: + cmpq $0,%rdx + je .Lcbc_epilogue + pushfq + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 +.Lcbc_prologue: + + cld + movl %r9d,%r9d + + leaq .LAES_Te(%rip),%r14 + cmpq $0,%r9 + jne .Lcbc_picked_te + leaq .LAES_Td(%rip),%r14 +.Lcbc_picked_te: + + movl OPENSSL_ia32cap_P(%rip),%r10d + cmpq $512,%rdx + jb .Lcbc_slow_prologue + testq $15,%rdx + jnz .Lcbc_slow_prologue + btl $28,%r10d + jc .Lcbc_slow_prologue + + + leaq -88-248(%rsp),%r15 + andq $-64,%r15 + + + movq %r14,%r10 + leaq 2304(%r14),%r11 + movq %r15,%r12 + andq $4095,%r10 + andq $4095,%r11 + andq $4095,%r12 + + cmpq %r11,%r12 + jb .Lcbc_te_break_out + subq %r11,%r12 + subq %r12,%r15 + jmp .Lcbc_te_ok +.Lcbc_te_break_out: + subq %r10,%r12 + andq $4095,%r12 + addq $320,%r12 + subq %r12,%r15 +.align 4 +.Lcbc_te_ok: + + xchgq %rsp,%r15 + + movq %r15,16(%rsp) +.Lcbc_fast_body: + movq %rdi,24(%rsp) + movq %rsi,32(%rsp) + movq %rdx,40(%rsp) + movq %rcx,48(%rsp) + movq %r8,56(%rsp) + movl $0,80+240(%rsp) + movq %r8,%rbp + movq %r9,%rbx + movq %rsi,%r9 + movq %rdi,%r8 + movq %rcx,%r15 + + movl 240(%r15),%eax + + movq %r15,%r10 + subq %r14,%r10 + andq $4095,%r10 + cmpq $2304,%r10 + jb .Lcbc_do_ecopy + cmpq $4096-248,%r10 + jb .Lcbc_skip_ecopy +.align 4 +.Lcbc_do_ecopy: + movq %r15,%rsi + leaq 80(%rsp),%rdi + leaq 80(%rsp),%r15 + movl $30,%ecx +.long 0x90A548F3 + movl %eax,(%rdi) +.Lcbc_skip_ecopy: + movq %r15,0(%rsp) + + movl $18,%ecx +.align 4 +.Lcbc_prefetch_te: + movq 0(%r14),%r10 + movq 32(%r14),%r11 + movq 64(%r14),%r12 + movq 96(%r14),%r13 + leaq 128(%r14),%r14 + subl $1,%ecx + jnz .Lcbc_prefetch_te + leaq -2304(%r14),%r14 + + cmpq $0,%rbx + je .LFAST_DECRYPT + + + movl 0(%rbp),%eax + movl 4(%rbp),%ebx + movl 8(%rbp),%ecx + movl 12(%rbp),%edx + +.align 4 +.Lcbc_fast_enc_loop: + xorl 0(%r8),%eax + xorl 4(%r8),%ebx + xorl 8(%r8),%ecx + xorl 12(%r8),%edx + movq 0(%rsp),%r15 + movq %r8,24(%rsp) + + call _x86_64_AES_encrypt + + movq 24(%rsp),%r8 + movq 40(%rsp),%r10 + movl %eax,0(%r9) + movl %ebx,4(%r9) + movl %ecx,8(%r9) + movl %edx,12(%r9) + + leaq 16(%r8),%r8 + leaq 16(%r9),%r9 + subq $16,%r10 + testq $-16,%r10 + movq %r10,40(%rsp) + jnz .Lcbc_fast_enc_loop + movq 56(%rsp),%rbp + movl %eax,0(%rbp) + movl %ebx,4(%rbp) + movl %ecx,8(%rbp) + movl %edx,12(%rbp) + + jmp .Lcbc_fast_cleanup + + +.align 16 +.LFAST_DECRYPT: + cmpq %r8,%r9 + je .Lcbc_fast_dec_in_place + + movq %rbp,64(%rsp) +.align 4 +.Lcbc_fast_dec_loop: + movl 0(%r8),%eax + movl 4(%r8),%ebx + movl 8(%r8),%ecx + movl 12(%r8),%edx + movq 0(%rsp),%r15 + movq %r8,24(%rsp) + + call _x86_64_AES_decrypt + + movq 64(%rsp),%rbp + movq 24(%rsp),%r8 + movq 40(%rsp),%r10 + xorl 0(%rbp),%eax + xorl 4(%rbp),%ebx + xorl 8(%rbp),%ecx + xorl 12(%rbp),%edx + movq %r8,%rbp + + subq $16,%r10 + movq %r10,40(%rsp) + movq %rbp,64(%rsp) + + movl %eax,0(%r9) + movl %ebx,4(%r9) + movl %ecx,8(%r9) + movl %edx,12(%r9) + + leaq 16(%r8),%r8 + leaq 16(%r9),%r9 + jnz .Lcbc_fast_dec_loop + movq 56(%rsp),%r12 + movq 0(%rbp),%r10 + movq 8(%rbp),%r11 + movq %r10,0(%r12) + movq %r11,8(%r12) + jmp .Lcbc_fast_cleanup + +.align 16 +.Lcbc_fast_dec_in_place: + movq 0(%rbp),%r10 + movq 8(%rbp),%r11 + movq %r10,0+64(%rsp) + movq %r11,8+64(%rsp) +.align 4 +.Lcbc_fast_dec_in_place_loop: + movl 0(%r8),%eax + movl 4(%r8),%ebx + movl 8(%r8),%ecx + movl 12(%r8),%edx + movq 0(%rsp),%r15 + movq %r8,24(%rsp) + + call _x86_64_AES_decrypt + + movq 24(%rsp),%r8 + movq 40(%rsp),%r10 + xorl 0+64(%rsp),%eax + xorl 4+64(%rsp),%ebx + xorl 8+64(%rsp),%ecx + xorl 12+64(%rsp),%edx + + movq 0(%r8),%r11 + movq 8(%r8),%r12 + subq $16,%r10 + jz .Lcbc_fast_dec_in_place_done + + movq %r11,0+64(%rsp) + movq %r12,8+64(%rsp) + + movl %eax,0(%r9) + movl %ebx,4(%r9) + movl %ecx,8(%r9) + movl %edx,12(%r9) + + leaq 16(%r8),%r8 + leaq 16(%r9),%r9 + movq %r10,40(%rsp) + jmp .Lcbc_fast_dec_in_place_loop +.Lcbc_fast_dec_in_place_done: + movq 56(%rsp),%rdi + movq %r11,0(%rdi) + movq %r12,8(%rdi) + + movl %eax,0(%r9) + movl %ebx,4(%r9) + movl %ecx,8(%r9) + movl %edx,12(%r9) + +.align 4 +.Lcbc_fast_cleanup: + cmpl $0,80+240(%rsp) + leaq 80(%rsp),%rdi + je .Lcbc_exit + movl $30,%ecx + xorq %rax,%rax +.long 0x90AB48F3 + + jmp .Lcbc_exit + + +.align 16 +.Lcbc_slow_prologue: + + leaq -88(%rsp),%rbp + andq $-64,%rbp + + leaq -88-63(%rcx),%r10 + subq %rbp,%r10 + negq %r10 + andq $960,%r10 + subq %r10,%rbp + + xchgq %rsp,%rbp + + movq %rbp,16(%rsp) +.Lcbc_slow_body: + + + + + movq %r8,56(%rsp) + movq %r8,%rbp + movq %r9,%rbx + movq %rsi,%r9 + movq %rdi,%r8 + movq %rcx,%r15 + movq %rdx,%r10 + + movl 240(%r15),%eax + movq %r15,0(%rsp) + shll $4,%eax + leaq (%r15,%rax,1),%rax + movq %rax,8(%rsp) + + + leaq 2048(%r14),%r14 + leaq 768-8(%rsp),%rax + subq %r14,%rax + andq $768,%rax + leaq (%r14,%rax,1),%r14 + + cmpq $0,%rbx + je .LSLOW_DECRYPT + + + testq $-16,%r10 + movl 0(%rbp),%eax + movl 4(%rbp),%ebx + movl 8(%rbp),%ecx + movl 12(%rbp),%edx + jz .Lcbc_slow_enc_tail + +.align 4 +.Lcbc_slow_enc_loop: + xorl 0(%r8),%eax + xorl 4(%r8),%ebx + xorl 8(%r8),%ecx + xorl 12(%r8),%edx + movq 0(%rsp),%r15 + movq %r8,24(%rsp) + movq %r9,32(%rsp) + movq %r10,40(%rsp) + + call _x86_64_AES_encrypt_compact + + movq 24(%rsp),%r8 + movq 32(%rsp),%r9 + movq 40(%rsp),%r10 + movl %eax,0(%r9) + movl %ebx,4(%r9) + movl %ecx,8(%r9) + movl %edx,12(%r9) + + leaq 16(%r8),%r8 + leaq 16(%r9),%r9 + subq $16,%r10 + testq $-16,%r10 + jnz .Lcbc_slow_enc_loop + testq $15,%r10 + jnz .Lcbc_slow_enc_tail + movq 56(%rsp),%rbp + movl %eax,0(%rbp) + movl %ebx,4(%rbp) + movl %ecx,8(%rbp) + movl %edx,12(%rbp) + + jmp .Lcbc_exit + +.align 4 +.Lcbc_slow_enc_tail: + movq %rax,%r11 + movq %rcx,%r12 + movq %r10,%rcx + movq %r8,%rsi + movq %r9,%rdi +.long 0x9066A4F3 + movq $16,%rcx + subq %r10,%rcx + xorq %rax,%rax +.long 0x9066AAF3 + movq %r9,%r8 + movq $16,%r10 + movq %r11,%rax + movq %r12,%rcx + jmp .Lcbc_slow_enc_loop + +.align 16 +.LSLOW_DECRYPT: + shrq $3,%rax + addq %rax,%r14 + + movq 0(%rbp),%r11 + movq 8(%rbp),%r12 + movq %r11,0+64(%rsp) + movq %r12,8+64(%rsp) + +.align 4 +.Lcbc_slow_dec_loop: + movl 0(%r8),%eax + movl 4(%r8),%ebx + movl 8(%r8),%ecx + movl 12(%r8),%edx + movq 0(%rsp),%r15 + movq %r8,24(%rsp) + movq %r9,32(%rsp) + movq %r10,40(%rsp) + + call _x86_64_AES_decrypt_compact + + movq 24(%rsp),%r8 + movq 32(%rsp),%r9 + movq 40(%rsp),%r10 + xorl 0+64(%rsp),%eax + xorl 4+64(%rsp),%ebx + xorl 8+64(%rsp),%ecx + xorl 12+64(%rsp),%edx + + movq 0(%r8),%r11 + movq 8(%r8),%r12 + subq $16,%r10 + jc .Lcbc_slow_dec_partial + jz .Lcbc_slow_dec_done + + movq %r11,0+64(%rsp) + movq %r12,8+64(%rsp) + + movl %eax,0(%r9) + movl %ebx,4(%r9) + movl %ecx,8(%r9) + movl %edx,12(%r9) + + leaq 16(%r8),%r8 + leaq 16(%r9),%r9 + jmp .Lcbc_slow_dec_loop +.Lcbc_slow_dec_done: + movq 56(%rsp),%rdi + movq %r11,0(%rdi) + movq %r12,8(%rdi) + + movl %eax,0(%r9) + movl %ebx,4(%r9) + movl %ecx,8(%r9) + movl %edx,12(%r9) + + jmp .Lcbc_exit + +.align 4 +.Lcbc_slow_dec_partial: + movq 56(%rsp),%rdi + movq %r11,0(%rdi) + movq %r12,8(%rdi) + + movl %eax,0+64(%rsp) + movl %ebx,4+64(%rsp) + movl %ecx,8+64(%rsp) + movl %edx,12+64(%rsp) + + movq %r9,%rdi + leaq 64(%rsp),%rsi + leaq 16(%r10),%rcx +.long 0x9066A4F3 + jmp .Lcbc_exit + +.align 16 +.Lcbc_exit: + movq 16(%rsp),%rsi + movq (%rsi),%r15 + movq 8(%rsi),%r14 + movq 16(%rsi),%r13 + movq 24(%rsi),%r12 + movq 32(%rsi),%rbp + movq 40(%rsi),%rbx + leaq 48(%rsi),%rsp +.Lcbc_popfq: + popfq +.Lcbc_epilogue: + .byte 0xf3,0xc3 +.size AES_cbc_encrypt,.-AES_cbc_encrypt +.align 64 +.LAES_Te: +.long 0xa56363c6,0xa56363c6 +.long 0x847c7cf8,0x847c7cf8 +.long 0x997777ee,0x997777ee +.long 0x8d7b7bf6,0x8d7b7bf6 +.long 0x0df2f2ff,0x0df2f2ff +.long 0xbd6b6bd6,0xbd6b6bd6 +.long 0xb16f6fde,0xb16f6fde +.long 0x54c5c591,0x54c5c591 +.long 0x50303060,0x50303060 +.long 0x03010102,0x03010102 +.long 0xa96767ce,0xa96767ce +.long 0x7d2b2b56,0x7d2b2b56 +.long 0x19fefee7,0x19fefee7 +.long 0x62d7d7b5,0x62d7d7b5 +.long 0xe6abab4d,0xe6abab4d +.long 0x9a7676ec,0x9a7676ec +.long 0x45caca8f,0x45caca8f +.long 0x9d82821f,0x9d82821f +.long 0x40c9c989,0x40c9c989 +.long 0x877d7dfa,0x877d7dfa +.long 0x15fafaef,0x15fafaef +.long 0xeb5959b2,0xeb5959b2 +.long 0xc947478e,0xc947478e +.long 0x0bf0f0fb,0x0bf0f0fb +.long 0xecadad41,0xecadad41 +.long 0x67d4d4b3,0x67d4d4b3 +.long 0xfda2a25f,0xfda2a25f +.long 0xeaafaf45,0xeaafaf45 +.long 0xbf9c9c23,0xbf9c9c23 +.long 0xf7a4a453,0xf7a4a453 +.long 0x967272e4,0x967272e4 +.long 0x5bc0c09b,0x5bc0c09b +.long 0xc2b7b775,0xc2b7b775 +.long 0x1cfdfde1,0x1cfdfde1 +.long 0xae93933d,0xae93933d +.long 0x6a26264c,0x6a26264c +.long 0x5a36366c,0x5a36366c +.long 0x413f3f7e,0x413f3f7e +.long 0x02f7f7f5,0x02f7f7f5 +.long 0x4fcccc83,0x4fcccc83 +.long 0x5c343468,0x5c343468 +.long 0xf4a5a551,0xf4a5a551 +.long 0x34e5e5d1,0x34e5e5d1 +.long 0x08f1f1f9,0x08f1f1f9 +.long 0x937171e2,0x937171e2 +.long 0x73d8d8ab,0x73d8d8ab +.long 0x53313162,0x53313162 +.long 0x3f15152a,0x3f15152a +.long 0x0c040408,0x0c040408 +.long 0x52c7c795,0x52c7c795 +.long 0x65232346,0x65232346 +.long 0x5ec3c39d,0x5ec3c39d +.long 0x28181830,0x28181830 +.long 0xa1969637,0xa1969637 +.long 0x0f05050a,0x0f05050a +.long 0xb59a9a2f,0xb59a9a2f +.long 0x0907070e,0x0907070e +.long 0x36121224,0x36121224 +.long 0x9b80801b,0x9b80801b +.long 0x3de2e2df,0x3de2e2df +.long 0x26ebebcd,0x26ebebcd +.long 0x6927274e,0x6927274e +.long 0xcdb2b27f,0xcdb2b27f +.long 0x9f7575ea,0x9f7575ea +.long 0x1b090912,0x1b090912 +.long 0x9e83831d,0x9e83831d +.long 0x742c2c58,0x742c2c58 +.long 0x2e1a1a34,0x2e1a1a34 +.long 0x2d1b1b36,0x2d1b1b36 +.long 0xb26e6edc,0xb26e6edc +.long 0xee5a5ab4,0xee5a5ab4 +.long 0xfba0a05b,0xfba0a05b +.long 0xf65252a4,0xf65252a4 +.long 0x4d3b3b76,0x4d3b3b76 +.long 0x61d6d6b7,0x61d6d6b7 +.long 0xceb3b37d,0xceb3b37d +.long 0x7b292952,0x7b292952 +.long 0x3ee3e3dd,0x3ee3e3dd +.long 0x712f2f5e,0x712f2f5e +.long 0x97848413,0x97848413 +.long 0xf55353a6,0xf55353a6 +.long 0x68d1d1b9,0x68d1d1b9 +.long 0x00000000,0x00000000 +.long 0x2cededc1,0x2cededc1 +.long 0x60202040,0x60202040 +.long 0x1ffcfce3,0x1ffcfce3 +.long 0xc8b1b179,0xc8b1b179 +.long 0xed5b5bb6,0xed5b5bb6 +.long 0xbe6a6ad4,0xbe6a6ad4 +.long 0x46cbcb8d,0x46cbcb8d +.long 0xd9bebe67,0xd9bebe67 +.long 0x4b393972,0x4b393972 +.long 0xde4a4a94,0xde4a4a94 +.long 0xd44c4c98,0xd44c4c98 +.long 0xe85858b0,0xe85858b0 +.long 0x4acfcf85,0x4acfcf85 +.long 0x6bd0d0bb,0x6bd0d0bb +.long 0x2aefefc5,0x2aefefc5 +.long 0xe5aaaa4f,0xe5aaaa4f +.long 0x16fbfbed,0x16fbfbed +.long 0xc5434386,0xc5434386 +.long 0xd74d4d9a,0xd74d4d9a +.long 0x55333366,0x55333366 +.long 0x94858511,0x94858511 +.long 0xcf45458a,0xcf45458a +.long 0x10f9f9e9,0x10f9f9e9 +.long 0x06020204,0x06020204 +.long 0x817f7ffe,0x817f7ffe +.long 0xf05050a0,0xf05050a0 +.long 0x443c3c78,0x443c3c78 +.long 0xba9f9f25,0xba9f9f25 +.long 0xe3a8a84b,0xe3a8a84b +.long 0xf35151a2,0xf35151a2 +.long 0xfea3a35d,0xfea3a35d +.long 0xc0404080,0xc0404080 +.long 0x8a8f8f05,0x8a8f8f05 +.long 0xad92923f,0xad92923f +.long 0xbc9d9d21,0xbc9d9d21 +.long 0x48383870,0x48383870 +.long 0x04f5f5f1,0x04f5f5f1 +.long 0xdfbcbc63,0xdfbcbc63 +.long 0xc1b6b677,0xc1b6b677 +.long 0x75dadaaf,0x75dadaaf +.long 0x63212142,0x63212142 +.long 0x30101020,0x30101020 +.long 0x1affffe5,0x1affffe5 +.long 0x0ef3f3fd,0x0ef3f3fd +.long 0x6dd2d2bf,0x6dd2d2bf +.long 0x4ccdcd81,0x4ccdcd81 +.long 0x140c0c18,0x140c0c18 +.long 0x35131326,0x35131326 +.long 0x2fececc3,0x2fececc3 +.long 0xe15f5fbe,0xe15f5fbe +.long 0xa2979735,0xa2979735 +.long 0xcc444488,0xcc444488 +.long 0x3917172e,0x3917172e +.long 0x57c4c493,0x57c4c493 +.long 0xf2a7a755,0xf2a7a755 +.long 0x827e7efc,0x827e7efc +.long 0x473d3d7a,0x473d3d7a +.long 0xac6464c8,0xac6464c8 +.long 0xe75d5dba,0xe75d5dba +.long 0x2b191932,0x2b191932 +.long 0x957373e6,0x957373e6 +.long 0xa06060c0,0xa06060c0 +.long 0x98818119,0x98818119 +.long 0xd14f4f9e,0xd14f4f9e +.long 0x7fdcdca3,0x7fdcdca3 +.long 0x66222244,0x66222244 +.long 0x7e2a2a54,0x7e2a2a54 +.long 0xab90903b,0xab90903b +.long 0x8388880b,0x8388880b +.long 0xca46468c,0xca46468c +.long 0x29eeeec7,0x29eeeec7 +.long 0xd3b8b86b,0xd3b8b86b +.long 0x3c141428,0x3c141428 +.long 0x79dedea7,0x79dedea7 +.long 0xe25e5ebc,0xe25e5ebc +.long 0x1d0b0b16,0x1d0b0b16 +.long 0x76dbdbad,0x76dbdbad +.long 0x3be0e0db,0x3be0e0db +.long 0x56323264,0x56323264 +.long 0x4e3a3a74,0x4e3a3a74 +.long 0x1e0a0a14,0x1e0a0a14 +.long 0xdb494992,0xdb494992 +.long 0x0a06060c,0x0a06060c +.long 0x6c242448,0x6c242448 +.long 0xe45c5cb8,0xe45c5cb8 +.long 0x5dc2c29f,0x5dc2c29f +.long 0x6ed3d3bd,0x6ed3d3bd +.long 0xefacac43,0xefacac43 +.long 0xa66262c4,0xa66262c4 +.long 0xa8919139,0xa8919139 +.long 0xa4959531,0xa4959531 +.long 0x37e4e4d3,0x37e4e4d3 +.long 0x8b7979f2,0x8b7979f2 +.long 0x32e7e7d5,0x32e7e7d5 +.long 0x43c8c88b,0x43c8c88b +.long 0x5937376e,0x5937376e +.long 0xb76d6dda,0xb76d6dda +.long 0x8c8d8d01,0x8c8d8d01 +.long 0x64d5d5b1,0x64d5d5b1 +.long 0xd24e4e9c,0xd24e4e9c +.long 0xe0a9a949,0xe0a9a949 +.long 0xb46c6cd8,0xb46c6cd8 +.long 0xfa5656ac,0xfa5656ac +.long 0x07f4f4f3,0x07f4f4f3 +.long 0x25eaeacf,0x25eaeacf +.long 0xaf6565ca,0xaf6565ca +.long 0x8e7a7af4,0x8e7a7af4 +.long 0xe9aeae47,0xe9aeae47 +.long 0x18080810,0x18080810 +.long 0xd5baba6f,0xd5baba6f +.long 0x887878f0,0x887878f0 +.long 0x6f25254a,0x6f25254a +.long 0x722e2e5c,0x722e2e5c +.long 0x241c1c38,0x241c1c38 +.long 0xf1a6a657,0xf1a6a657 +.long 0xc7b4b473,0xc7b4b473 +.long 0x51c6c697,0x51c6c697 +.long 0x23e8e8cb,0x23e8e8cb +.long 0x7cdddda1,0x7cdddda1 +.long 0x9c7474e8,0x9c7474e8 +.long 0x211f1f3e,0x211f1f3e +.long 0xdd4b4b96,0xdd4b4b96 +.long 0xdcbdbd61,0xdcbdbd61 +.long 0x868b8b0d,0x868b8b0d +.long 0x858a8a0f,0x858a8a0f +.long 0x907070e0,0x907070e0 +.long 0x423e3e7c,0x423e3e7c +.long 0xc4b5b571,0xc4b5b571 +.long 0xaa6666cc,0xaa6666cc +.long 0xd8484890,0xd8484890 +.long 0x05030306,0x05030306 +.long 0x01f6f6f7,0x01f6f6f7 +.long 0x120e0e1c,0x120e0e1c +.long 0xa36161c2,0xa36161c2 +.long 0x5f35356a,0x5f35356a +.long 0xf95757ae,0xf95757ae +.long 0xd0b9b969,0xd0b9b969 +.long 0x91868617,0x91868617 +.long 0x58c1c199,0x58c1c199 +.long 0x271d1d3a,0x271d1d3a +.long 0xb99e9e27,0xb99e9e27 +.long 0x38e1e1d9,0x38e1e1d9 +.long 0x13f8f8eb,0x13f8f8eb +.long 0xb398982b,0xb398982b +.long 0x33111122,0x33111122 +.long 0xbb6969d2,0xbb6969d2 +.long 0x70d9d9a9,0x70d9d9a9 +.long 0x898e8e07,0x898e8e07 +.long 0xa7949433,0xa7949433 +.long 0xb69b9b2d,0xb69b9b2d +.long 0x221e1e3c,0x221e1e3c +.long 0x92878715,0x92878715 +.long 0x20e9e9c9,0x20e9e9c9 +.long 0x49cece87,0x49cece87 +.long 0xff5555aa,0xff5555aa +.long 0x78282850,0x78282850 +.long 0x7adfdfa5,0x7adfdfa5 +.long 0x8f8c8c03,0x8f8c8c03 +.long 0xf8a1a159,0xf8a1a159 +.long 0x80898909,0x80898909 +.long 0x170d0d1a,0x170d0d1a +.long 0xdabfbf65,0xdabfbf65 +.long 0x31e6e6d7,0x31e6e6d7 +.long 0xc6424284,0xc6424284 +.long 0xb86868d0,0xb86868d0 +.long 0xc3414182,0xc3414182 +.long 0xb0999929,0xb0999929 +.long 0x772d2d5a,0x772d2d5a +.long 0x110f0f1e,0x110f0f1e +.long 0xcbb0b07b,0xcbb0b07b +.long 0xfc5454a8,0xfc5454a8 +.long 0xd6bbbb6d,0xd6bbbb6d +.long 0x3a16162c,0x3a16162c +.byte 0x63,0x7c,0x77,0x7b,0xf2,0x6b,0x6f,0xc5 +.byte 0x30,0x01,0x67,0x2b,0xfe,0xd7,0xab,0x76 +.byte 0xca,0x82,0xc9,0x7d,0xfa,0x59,0x47,0xf0 +.byte 0xad,0xd4,0xa2,0xaf,0x9c,0xa4,0x72,0xc0 +.byte 0xb7,0xfd,0x93,0x26,0x36,0x3f,0xf7,0xcc +.byte 0x34,0xa5,0xe5,0xf1,0x71,0xd8,0x31,0x15 +.byte 0x04,0xc7,0x23,0xc3,0x18,0x96,0x05,0x9a +.byte 0x07,0x12,0x80,0xe2,0xeb,0x27,0xb2,0x75 +.byte 0x09,0x83,0x2c,0x1a,0x1b,0x6e,0x5a,0xa0 +.byte 0x52,0x3b,0xd6,0xb3,0x29,0xe3,0x2f,0x84 +.byte 0x53,0xd1,0x00,0xed,0x20,0xfc,0xb1,0x5b +.byte 0x6a,0xcb,0xbe,0x39,0x4a,0x4c,0x58,0xcf +.byte 0xd0,0xef,0xaa,0xfb,0x43,0x4d,0x33,0x85 +.byte 0x45,0xf9,0x02,0x7f,0x50,0x3c,0x9f,0xa8 +.byte 0x51,0xa3,0x40,0x8f,0x92,0x9d,0x38,0xf5 +.byte 0xbc,0xb6,0xda,0x21,0x10,0xff,0xf3,0xd2 +.byte 0xcd,0x0c,0x13,0xec,0x5f,0x97,0x44,0x17 +.byte 0xc4,0xa7,0x7e,0x3d,0x64,0x5d,0x19,0x73 +.byte 0x60,0x81,0x4f,0xdc,0x22,0x2a,0x90,0x88 +.byte 0x46,0xee,0xb8,0x14,0xde,0x5e,0x0b,0xdb +.byte 0xe0,0x32,0x3a,0x0a,0x49,0x06,0x24,0x5c +.byte 0xc2,0xd3,0xac,0x62,0x91,0x95,0xe4,0x79 +.byte 0xe7,0xc8,0x37,0x6d,0x8d,0xd5,0x4e,0xa9 +.byte 0x6c,0x56,0xf4,0xea,0x65,0x7a,0xae,0x08 +.byte 0xba,0x78,0x25,0x2e,0x1c,0xa6,0xb4,0xc6 +.byte 0xe8,0xdd,0x74,0x1f,0x4b,0xbd,0x8b,0x8a +.byte 0x70,0x3e,0xb5,0x66,0x48,0x03,0xf6,0x0e +.byte 0x61,0x35,0x57,0xb9,0x86,0xc1,0x1d,0x9e +.byte 0xe1,0xf8,0x98,0x11,0x69,0xd9,0x8e,0x94 +.byte 0x9b,0x1e,0x87,0xe9,0xce,0x55,0x28,0xdf +.byte 0x8c,0xa1,0x89,0x0d,0xbf,0xe6,0x42,0x68 +.byte 0x41,0x99,0x2d,0x0f,0xb0,0x54,0xbb,0x16 +.byte 0x63,0x7c,0x77,0x7b,0xf2,0x6b,0x6f,0xc5 +.byte 0x30,0x01,0x67,0x2b,0xfe,0xd7,0xab,0x76 +.byte 0xca,0x82,0xc9,0x7d,0xfa,0x59,0x47,0xf0 +.byte 0xad,0xd4,0xa2,0xaf,0x9c,0xa4,0x72,0xc0 +.byte 0xb7,0xfd,0x93,0x26,0x36,0x3f,0xf7,0xcc +.byte 0x34,0xa5,0xe5,0xf1,0x71,0xd8,0x31,0x15 +.byte 0x04,0xc7,0x23,0xc3,0x18,0x96,0x05,0x9a +.byte 0x07,0x12,0x80,0xe2,0xeb,0x27,0xb2,0x75 +.byte 0x09,0x83,0x2c,0x1a,0x1b,0x6e,0x5a,0xa0 +.byte 0x52,0x3b,0xd6,0xb3,0x29,0xe3,0x2f,0x84 +.byte 0x53,0xd1,0x00,0xed,0x20,0xfc,0xb1,0x5b +.byte 0x6a,0xcb,0xbe,0x39,0x4a,0x4c,0x58,0xcf +.byte 0xd0,0xef,0xaa,0xfb,0x43,0x4d,0x33,0x85 +.byte 0x45,0xf9,0x02,0x7f,0x50,0x3c,0x9f,0xa8 +.byte 0x51,0xa3,0x40,0x8f,0x92,0x9d,0x38,0xf5 +.byte 0xbc,0xb6,0xda,0x21,0x10,0xff,0xf3,0xd2 +.byte 0xcd,0x0c,0x13,0xec,0x5f,0x97,0x44,0x17 +.byte 0xc4,0xa7,0x7e,0x3d,0x64,0x5d,0x19,0x73 +.byte 0x60,0x81,0x4f,0xdc,0x22,0x2a,0x90,0x88 +.byte 0x46,0xee,0xb8,0x14,0xde,0x5e,0x0b,0xdb +.byte 0xe0,0x32,0x3a,0x0a,0x49,0x06,0x24,0x5c +.byte 0xc2,0xd3,0xac,0x62,0x91,0x95,0xe4,0x79 +.byte 0xe7,0xc8,0x37,0x6d,0x8d,0xd5,0x4e,0xa9 +.byte 0x6c,0x56,0xf4,0xea,0x65,0x7a,0xae,0x08 +.byte 0xba,0x78,0x25,0x2e,0x1c,0xa6,0xb4,0xc6 +.byte 0xe8,0xdd,0x74,0x1f,0x4b,0xbd,0x8b,0x8a +.byte 0x70,0x3e,0xb5,0x66,0x48,0x03,0xf6,0x0e +.byte 0x61,0x35,0x57,0xb9,0x86,0xc1,0x1d,0x9e +.byte 0xe1,0xf8,0x98,0x11,0x69,0xd9,0x8e,0x94 +.byte 0x9b,0x1e,0x87,0xe9,0xce,0x55,0x28,0xdf +.byte 0x8c,0xa1,0x89,0x0d,0xbf,0xe6,0x42,0x68 +.byte 0x41,0x99,0x2d,0x0f,0xb0,0x54,0xbb,0x16 +.byte 0x63,0x7c,0x77,0x7b,0xf2,0x6b,0x6f,0xc5 +.byte 0x30,0x01,0x67,0x2b,0xfe,0xd7,0xab,0x76 +.byte 0xca,0x82,0xc9,0x7d,0xfa,0x59,0x47,0xf0 +.byte 0xad,0xd4,0xa2,0xaf,0x9c,0xa4,0x72,0xc0 +.byte 0xb7,0xfd,0x93,0x26,0x36,0x3f,0xf7,0xcc +.byte 0x34,0xa5,0xe5,0xf1,0x71,0xd8,0x31,0x15 +.byte 0x04,0xc7,0x23,0xc3,0x18,0x96,0x05,0x9a +.byte 0x07,0x12,0x80,0xe2,0xeb,0x27,0xb2,0x75 +.byte 0x09,0x83,0x2c,0x1a,0x1b,0x6e,0x5a,0xa0 +.byte 0x52,0x3b,0xd6,0xb3,0x29,0xe3,0x2f,0x84 +.byte 0x53,0xd1,0x00,0xed,0x20,0xfc,0xb1,0x5b +.byte 0x6a,0xcb,0xbe,0x39,0x4a,0x4c,0x58,0xcf +.byte 0xd0,0xef,0xaa,0xfb,0x43,0x4d,0x33,0x85 +.byte 0x45,0xf9,0x02,0x7f,0x50,0x3c,0x9f,0xa8 +.byte 0x51,0xa3,0x40,0x8f,0x92,0x9d,0x38,0xf5 +.byte 0xbc,0xb6,0xda,0x21,0x10,0xff,0xf3,0xd2 +.byte 0xcd,0x0c,0x13,0xec,0x5f,0x97,0x44,0x17 +.byte 0xc4,0xa7,0x7e,0x3d,0x64,0x5d,0x19,0x73 +.byte 0x60,0x81,0x4f,0xdc,0x22,0x2a,0x90,0x88 +.byte 0x46,0xee,0xb8,0x14,0xde,0x5e,0x0b,0xdb +.byte 0xe0,0x32,0x3a,0x0a,0x49,0x06,0x24,0x5c +.byte 0xc2,0xd3,0xac,0x62,0x91,0x95,0xe4,0x79 +.byte 0xe7,0xc8,0x37,0x6d,0x8d,0xd5,0x4e,0xa9 +.byte 0x6c,0x56,0xf4,0xea,0x65,0x7a,0xae,0x08 +.byte 0xba,0x78,0x25,0x2e,0x1c,0xa6,0xb4,0xc6 +.byte 0xe8,0xdd,0x74,0x1f,0x4b,0xbd,0x8b,0x8a +.byte 0x70,0x3e,0xb5,0x66,0x48,0x03,0xf6,0x0e +.byte 0x61,0x35,0x57,0xb9,0x86,0xc1,0x1d,0x9e +.byte 0xe1,0xf8,0x98,0x11,0x69,0xd9,0x8e,0x94 +.byte 0x9b,0x1e,0x87,0xe9,0xce,0x55,0x28,0xdf +.byte 0x8c,0xa1,0x89,0x0d,0xbf,0xe6,0x42,0x68 +.byte 0x41,0x99,0x2d,0x0f,0xb0,0x54,0xbb,0x16 +.byte 0x63,0x7c,0x77,0x7b,0xf2,0x6b,0x6f,0xc5 +.byte 0x30,0x01,0x67,0x2b,0xfe,0xd7,0xab,0x76 +.byte 0xca,0x82,0xc9,0x7d,0xfa,0x59,0x47,0xf0 +.byte 0xad,0xd4,0xa2,0xaf,0x9c,0xa4,0x72,0xc0 +.byte 0xb7,0xfd,0x93,0x26,0x36,0x3f,0xf7,0xcc +.byte 0x34,0xa5,0xe5,0xf1,0x71,0xd8,0x31,0x15 +.byte 0x04,0xc7,0x23,0xc3,0x18,0x96,0x05,0x9a +.byte 0x07,0x12,0x80,0xe2,0xeb,0x27,0xb2,0x75 +.byte 0x09,0x83,0x2c,0x1a,0x1b,0x6e,0x5a,0xa0 +.byte 0x52,0x3b,0xd6,0xb3,0x29,0xe3,0x2f,0x84 +.byte 0x53,0xd1,0x00,0xed,0x20,0xfc,0xb1,0x5b +.byte 0x6a,0xcb,0xbe,0x39,0x4a,0x4c,0x58,0xcf +.byte 0xd0,0xef,0xaa,0xfb,0x43,0x4d,0x33,0x85 +.byte 0x45,0xf9,0x02,0x7f,0x50,0x3c,0x9f,0xa8 +.byte 0x51,0xa3,0x40,0x8f,0x92,0x9d,0x38,0xf5 +.byte 0xbc,0xb6,0xda,0x21,0x10,0xff,0xf3,0xd2 +.byte 0xcd,0x0c,0x13,0xec,0x5f,0x97,0x44,0x17 +.byte 0xc4,0xa7,0x7e,0x3d,0x64,0x5d,0x19,0x73 +.byte 0x60,0x81,0x4f,0xdc,0x22,0x2a,0x90,0x88 +.byte 0x46,0xee,0xb8,0x14,0xde,0x5e,0x0b,0xdb +.byte 0xe0,0x32,0x3a,0x0a,0x49,0x06,0x24,0x5c +.byte 0xc2,0xd3,0xac,0x62,0x91,0x95,0xe4,0x79 +.byte 0xe7,0xc8,0x37,0x6d,0x8d,0xd5,0x4e,0xa9 +.byte 0x6c,0x56,0xf4,0xea,0x65,0x7a,0xae,0x08 +.byte 0xba,0x78,0x25,0x2e,0x1c,0xa6,0xb4,0xc6 +.byte 0xe8,0xdd,0x74,0x1f,0x4b,0xbd,0x8b,0x8a +.byte 0x70,0x3e,0xb5,0x66,0x48,0x03,0xf6,0x0e +.byte 0x61,0x35,0x57,0xb9,0x86,0xc1,0x1d,0x9e +.byte 0xe1,0xf8,0x98,0x11,0x69,0xd9,0x8e,0x94 +.byte 0x9b,0x1e,0x87,0xe9,0xce,0x55,0x28,0xdf +.byte 0x8c,0xa1,0x89,0x0d,0xbf,0xe6,0x42,0x68 +.byte 0x41,0x99,0x2d,0x0f,0xb0,0x54,0xbb,0x16 +.long 0x00000001, 0x00000002, 0x00000004, 0x00000008 +.long 0x00000010, 0x00000020, 0x00000040, 0x00000080 +.long 0x0000001b, 0x00000036, 0x80808080, 0x80808080 +.long 0xfefefefe, 0xfefefefe, 0x1b1b1b1b, 0x1b1b1b1b +.align 64 +.LAES_Td: +.long 0x50a7f451,0x50a7f451 +.long 0x5365417e,0x5365417e +.long 0xc3a4171a,0xc3a4171a +.long 0x965e273a,0x965e273a +.long 0xcb6bab3b,0xcb6bab3b +.long 0xf1459d1f,0xf1459d1f +.long 0xab58faac,0xab58faac +.long 0x9303e34b,0x9303e34b +.long 0x55fa3020,0x55fa3020 +.long 0xf66d76ad,0xf66d76ad +.long 0x9176cc88,0x9176cc88 +.long 0x254c02f5,0x254c02f5 +.long 0xfcd7e54f,0xfcd7e54f +.long 0xd7cb2ac5,0xd7cb2ac5 +.long 0x80443526,0x80443526 +.long 0x8fa362b5,0x8fa362b5 +.long 0x495ab1de,0x495ab1de +.long 0x671bba25,0x671bba25 +.long 0x980eea45,0x980eea45 +.long 0xe1c0fe5d,0xe1c0fe5d +.long 0x02752fc3,0x02752fc3 +.long 0x12f04c81,0x12f04c81 +.long 0xa397468d,0xa397468d +.long 0xc6f9d36b,0xc6f9d36b +.long 0xe75f8f03,0xe75f8f03 +.long 0x959c9215,0x959c9215 +.long 0xeb7a6dbf,0xeb7a6dbf +.long 0xda595295,0xda595295 +.long 0x2d83bed4,0x2d83bed4 +.long 0xd3217458,0xd3217458 +.long 0x2969e049,0x2969e049 +.long 0x44c8c98e,0x44c8c98e +.long 0x6a89c275,0x6a89c275 +.long 0x78798ef4,0x78798ef4 +.long 0x6b3e5899,0x6b3e5899 +.long 0xdd71b927,0xdd71b927 +.long 0xb64fe1be,0xb64fe1be +.long 0x17ad88f0,0x17ad88f0 +.long 0x66ac20c9,0x66ac20c9 +.long 0xb43ace7d,0xb43ace7d +.long 0x184adf63,0x184adf63 +.long 0x82311ae5,0x82311ae5 +.long 0x60335197,0x60335197 +.long 0x457f5362,0x457f5362 +.long 0xe07764b1,0xe07764b1 +.long 0x84ae6bbb,0x84ae6bbb +.long 0x1ca081fe,0x1ca081fe +.long 0x942b08f9,0x942b08f9 +.long 0x58684870,0x58684870 +.long 0x19fd458f,0x19fd458f +.long 0x876cde94,0x876cde94 +.long 0xb7f87b52,0xb7f87b52 +.long 0x23d373ab,0x23d373ab +.long 0xe2024b72,0xe2024b72 +.long 0x578f1fe3,0x578f1fe3 +.long 0x2aab5566,0x2aab5566 +.long 0x0728ebb2,0x0728ebb2 +.long 0x03c2b52f,0x03c2b52f +.long 0x9a7bc586,0x9a7bc586 +.long 0xa50837d3,0xa50837d3 +.long 0xf2872830,0xf2872830 +.long 0xb2a5bf23,0xb2a5bf23 +.long 0xba6a0302,0xba6a0302 +.long 0x5c8216ed,0x5c8216ed +.long 0x2b1ccf8a,0x2b1ccf8a +.long 0x92b479a7,0x92b479a7 +.long 0xf0f207f3,0xf0f207f3 +.long 0xa1e2694e,0xa1e2694e +.long 0xcdf4da65,0xcdf4da65 +.long 0xd5be0506,0xd5be0506 +.long 0x1f6234d1,0x1f6234d1 +.long 0x8afea6c4,0x8afea6c4 +.long 0x9d532e34,0x9d532e34 +.long 0xa055f3a2,0xa055f3a2 +.long 0x32e18a05,0x32e18a05 +.long 0x75ebf6a4,0x75ebf6a4 +.long 0x39ec830b,0x39ec830b +.long 0xaaef6040,0xaaef6040 +.long 0x069f715e,0x069f715e +.long 0x51106ebd,0x51106ebd +.long 0xf98a213e,0xf98a213e +.long 0x3d06dd96,0x3d06dd96 +.long 0xae053edd,0xae053edd +.long 0x46bde64d,0x46bde64d +.long 0xb58d5491,0xb58d5491 +.long 0x055dc471,0x055dc471 +.long 0x6fd40604,0x6fd40604 +.long 0xff155060,0xff155060 +.long 0x24fb9819,0x24fb9819 +.long 0x97e9bdd6,0x97e9bdd6 +.long 0xcc434089,0xcc434089 +.long 0x779ed967,0x779ed967 +.long 0xbd42e8b0,0xbd42e8b0 +.long 0x888b8907,0x888b8907 +.long 0x385b19e7,0x385b19e7 +.long 0xdbeec879,0xdbeec879 +.long 0x470a7ca1,0x470a7ca1 +.long 0xe90f427c,0xe90f427c +.long 0xc91e84f8,0xc91e84f8 +.long 0x00000000,0x00000000 +.long 0x83868009,0x83868009 +.long 0x48ed2b32,0x48ed2b32 +.long 0xac70111e,0xac70111e +.long 0x4e725a6c,0x4e725a6c +.long 0xfbff0efd,0xfbff0efd +.long 0x5638850f,0x5638850f +.long 0x1ed5ae3d,0x1ed5ae3d +.long 0x27392d36,0x27392d36 +.long 0x64d90f0a,0x64d90f0a +.long 0x21a65c68,0x21a65c68 +.long 0xd1545b9b,0xd1545b9b +.long 0x3a2e3624,0x3a2e3624 +.long 0xb1670a0c,0xb1670a0c +.long 0x0fe75793,0x0fe75793 +.long 0xd296eeb4,0xd296eeb4 +.long 0x9e919b1b,0x9e919b1b +.long 0x4fc5c080,0x4fc5c080 +.long 0xa220dc61,0xa220dc61 +.long 0x694b775a,0x694b775a +.long 0x161a121c,0x161a121c +.long 0x0aba93e2,0x0aba93e2 +.long 0xe52aa0c0,0xe52aa0c0 +.long 0x43e0223c,0x43e0223c +.long 0x1d171b12,0x1d171b12 +.long 0x0b0d090e,0x0b0d090e +.long 0xadc78bf2,0xadc78bf2 +.long 0xb9a8b62d,0xb9a8b62d +.long 0xc8a91e14,0xc8a91e14 +.long 0x8519f157,0x8519f157 +.long 0x4c0775af,0x4c0775af +.long 0xbbdd99ee,0xbbdd99ee +.long 0xfd607fa3,0xfd607fa3 +.long 0x9f2601f7,0x9f2601f7 +.long 0xbcf5725c,0xbcf5725c +.long 0xc53b6644,0xc53b6644 +.long 0x347efb5b,0x347efb5b +.long 0x7629438b,0x7629438b +.long 0xdcc623cb,0xdcc623cb +.long 0x68fcedb6,0x68fcedb6 +.long 0x63f1e4b8,0x63f1e4b8 +.long 0xcadc31d7,0xcadc31d7 +.long 0x10856342,0x10856342 +.long 0x40229713,0x40229713 +.long 0x2011c684,0x2011c684 +.long 0x7d244a85,0x7d244a85 +.long 0xf83dbbd2,0xf83dbbd2 +.long 0x1132f9ae,0x1132f9ae +.long 0x6da129c7,0x6da129c7 +.long 0x4b2f9e1d,0x4b2f9e1d +.long 0xf330b2dc,0xf330b2dc +.long 0xec52860d,0xec52860d +.long 0xd0e3c177,0xd0e3c177 +.long 0x6c16b32b,0x6c16b32b +.long 0x99b970a9,0x99b970a9 +.long 0xfa489411,0xfa489411 +.long 0x2264e947,0x2264e947 +.long 0xc48cfca8,0xc48cfca8 +.long 0x1a3ff0a0,0x1a3ff0a0 +.long 0xd82c7d56,0xd82c7d56 +.long 0xef903322,0xef903322 +.long 0xc74e4987,0xc74e4987 +.long 0xc1d138d9,0xc1d138d9 +.long 0xfea2ca8c,0xfea2ca8c +.long 0x360bd498,0x360bd498 +.long 0xcf81f5a6,0xcf81f5a6 +.long 0x28de7aa5,0x28de7aa5 +.long 0x268eb7da,0x268eb7da +.long 0xa4bfad3f,0xa4bfad3f +.long 0xe49d3a2c,0xe49d3a2c +.long 0x0d927850,0x0d927850 +.long 0x9bcc5f6a,0x9bcc5f6a +.long 0x62467e54,0x62467e54 +.long 0xc2138df6,0xc2138df6 +.long 0xe8b8d890,0xe8b8d890 +.long 0x5ef7392e,0x5ef7392e +.long 0xf5afc382,0xf5afc382 +.long 0xbe805d9f,0xbe805d9f +.long 0x7c93d069,0x7c93d069 +.long 0xa92dd56f,0xa92dd56f +.long 0xb31225cf,0xb31225cf +.long 0x3b99acc8,0x3b99acc8 +.long 0xa77d1810,0xa77d1810 +.long 0x6e639ce8,0x6e639ce8 +.long 0x7bbb3bdb,0x7bbb3bdb +.long 0x097826cd,0x097826cd +.long 0xf418596e,0xf418596e +.long 0x01b79aec,0x01b79aec +.long 0xa89a4f83,0xa89a4f83 +.long 0x656e95e6,0x656e95e6 +.long 0x7ee6ffaa,0x7ee6ffaa +.long 0x08cfbc21,0x08cfbc21 +.long 0xe6e815ef,0xe6e815ef +.long 0xd99be7ba,0xd99be7ba +.long 0xce366f4a,0xce366f4a +.long 0xd4099fea,0xd4099fea +.long 0xd67cb029,0xd67cb029 +.long 0xafb2a431,0xafb2a431 +.long 0x31233f2a,0x31233f2a +.long 0x3094a5c6,0x3094a5c6 +.long 0xc066a235,0xc066a235 +.long 0x37bc4e74,0x37bc4e74 +.long 0xa6ca82fc,0xa6ca82fc +.long 0xb0d090e0,0xb0d090e0 +.long 0x15d8a733,0x15d8a733 +.long 0x4a9804f1,0x4a9804f1 +.long 0xf7daec41,0xf7daec41 +.long 0x0e50cd7f,0x0e50cd7f +.long 0x2ff69117,0x2ff69117 +.long 0x8dd64d76,0x8dd64d76 +.long 0x4db0ef43,0x4db0ef43 +.long 0x544daacc,0x544daacc +.long 0xdf0496e4,0xdf0496e4 +.long 0xe3b5d19e,0xe3b5d19e +.long 0x1b886a4c,0x1b886a4c +.long 0xb81f2cc1,0xb81f2cc1 +.long 0x7f516546,0x7f516546 +.long 0x04ea5e9d,0x04ea5e9d +.long 0x5d358c01,0x5d358c01 +.long 0x737487fa,0x737487fa +.long 0x2e410bfb,0x2e410bfb +.long 0x5a1d67b3,0x5a1d67b3 +.long 0x52d2db92,0x52d2db92 +.long 0x335610e9,0x335610e9 +.long 0x1347d66d,0x1347d66d +.long 0x8c61d79a,0x8c61d79a +.long 0x7a0ca137,0x7a0ca137 +.long 0x8e14f859,0x8e14f859 +.long 0x893c13eb,0x893c13eb +.long 0xee27a9ce,0xee27a9ce +.long 0x35c961b7,0x35c961b7 +.long 0xede51ce1,0xede51ce1 +.long 0x3cb1477a,0x3cb1477a +.long 0x59dfd29c,0x59dfd29c +.long 0x3f73f255,0x3f73f255 +.long 0x79ce1418,0x79ce1418 +.long 0xbf37c773,0xbf37c773 +.long 0xeacdf753,0xeacdf753 +.long 0x5baafd5f,0x5baafd5f +.long 0x146f3ddf,0x146f3ddf +.long 0x86db4478,0x86db4478 +.long 0x81f3afca,0x81f3afca +.long 0x3ec468b9,0x3ec468b9 +.long 0x2c342438,0x2c342438 +.long 0x5f40a3c2,0x5f40a3c2 +.long 0x72c31d16,0x72c31d16 +.long 0x0c25e2bc,0x0c25e2bc +.long 0x8b493c28,0x8b493c28 +.long 0x41950dff,0x41950dff +.long 0x7101a839,0x7101a839 +.long 0xdeb30c08,0xdeb30c08 +.long 0x9ce4b4d8,0x9ce4b4d8 +.long 0x90c15664,0x90c15664 +.long 0x6184cb7b,0x6184cb7b +.long 0x70b632d5,0x70b632d5 +.long 0x745c6c48,0x745c6c48 +.long 0x4257b8d0,0x4257b8d0 +.byte 0x52,0x09,0x6a,0xd5,0x30,0x36,0xa5,0x38 +.byte 0xbf,0x40,0xa3,0x9e,0x81,0xf3,0xd7,0xfb +.byte 0x7c,0xe3,0x39,0x82,0x9b,0x2f,0xff,0x87 +.byte 0x34,0x8e,0x43,0x44,0xc4,0xde,0xe9,0xcb +.byte 0x54,0x7b,0x94,0x32,0xa6,0xc2,0x23,0x3d +.byte 0xee,0x4c,0x95,0x0b,0x42,0xfa,0xc3,0x4e +.byte 0x08,0x2e,0xa1,0x66,0x28,0xd9,0x24,0xb2 +.byte 0x76,0x5b,0xa2,0x49,0x6d,0x8b,0xd1,0x25 +.byte 0x72,0xf8,0xf6,0x64,0x86,0x68,0x98,0x16 +.byte 0xd4,0xa4,0x5c,0xcc,0x5d,0x65,0xb6,0x92 +.byte 0x6c,0x70,0x48,0x50,0xfd,0xed,0xb9,0xda +.byte 0x5e,0x15,0x46,0x57,0xa7,0x8d,0x9d,0x84 +.byte 0x90,0xd8,0xab,0x00,0x8c,0xbc,0xd3,0x0a +.byte 0xf7,0xe4,0x58,0x05,0xb8,0xb3,0x45,0x06 +.byte 0xd0,0x2c,0x1e,0x8f,0xca,0x3f,0x0f,0x02 +.byte 0xc1,0xaf,0xbd,0x03,0x01,0x13,0x8a,0x6b +.byte 0x3a,0x91,0x11,0x41,0x4f,0x67,0xdc,0xea +.byte 0x97,0xf2,0xcf,0xce,0xf0,0xb4,0xe6,0x73 +.byte 0x96,0xac,0x74,0x22,0xe7,0xad,0x35,0x85 +.byte 0xe2,0xf9,0x37,0xe8,0x1c,0x75,0xdf,0x6e +.byte 0x47,0xf1,0x1a,0x71,0x1d,0x29,0xc5,0x89 +.byte 0x6f,0xb7,0x62,0x0e,0xaa,0x18,0xbe,0x1b +.byte 0xfc,0x56,0x3e,0x4b,0xc6,0xd2,0x79,0x20 +.byte 0x9a,0xdb,0xc0,0xfe,0x78,0xcd,0x5a,0xf4 +.byte 0x1f,0xdd,0xa8,0x33,0x88,0x07,0xc7,0x31 +.byte 0xb1,0x12,0x10,0x59,0x27,0x80,0xec,0x5f +.byte 0x60,0x51,0x7f,0xa9,0x19,0xb5,0x4a,0x0d +.byte 0x2d,0xe5,0x7a,0x9f,0x93,0xc9,0x9c,0xef +.byte 0xa0,0xe0,0x3b,0x4d,0xae,0x2a,0xf5,0xb0 +.byte 0xc8,0xeb,0xbb,0x3c,0x83,0x53,0x99,0x61 +.byte 0x17,0x2b,0x04,0x7e,0xba,0x77,0xd6,0x26 +.byte 0xe1,0x69,0x14,0x63,0x55,0x21,0x0c,0x7d +.long 0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe +.long 0x1b1b1b1b, 0x1b1b1b1b, 0, 0 +.byte 0x52,0x09,0x6a,0xd5,0x30,0x36,0xa5,0x38 +.byte 0xbf,0x40,0xa3,0x9e,0x81,0xf3,0xd7,0xfb +.byte 0x7c,0xe3,0x39,0x82,0x9b,0x2f,0xff,0x87 +.byte 0x34,0x8e,0x43,0x44,0xc4,0xde,0xe9,0xcb +.byte 0x54,0x7b,0x94,0x32,0xa6,0xc2,0x23,0x3d +.byte 0xee,0x4c,0x95,0x0b,0x42,0xfa,0xc3,0x4e +.byte 0x08,0x2e,0xa1,0x66,0x28,0xd9,0x24,0xb2 +.byte 0x76,0x5b,0xa2,0x49,0x6d,0x8b,0xd1,0x25 +.byte 0x72,0xf8,0xf6,0x64,0x86,0x68,0x98,0x16 +.byte 0xd4,0xa4,0x5c,0xcc,0x5d,0x65,0xb6,0x92 +.byte 0x6c,0x70,0x48,0x50,0xfd,0xed,0xb9,0xda +.byte 0x5e,0x15,0x46,0x57,0xa7,0x8d,0x9d,0x84 +.byte 0x90,0xd8,0xab,0x00,0x8c,0xbc,0xd3,0x0a +.byte 0xf7,0xe4,0x58,0x05,0xb8,0xb3,0x45,0x06 +.byte 0xd0,0x2c,0x1e,0x8f,0xca,0x3f,0x0f,0x02 +.byte 0xc1,0xaf,0xbd,0x03,0x01,0x13,0x8a,0x6b +.byte 0x3a,0x91,0x11,0x41,0x4f,0x67,0xdc,0xea +.byte 0x97,0xf2,0xcf,0xce,0xf0,0xb4,0xe6,0x73 +.byte 0x96,0xac,0x74,0x22,0xe7,0xad,0x35,0x85 +.byte 0xe2,0xf9,0x37,0xe8,0x1c,0x75,0xdf,0x6e +.byte 0x47,0xf1,0x1a,0x71,0x1d,0x29,0xc5,0x89 +.byte 0x6f,0xb7,0x62,0x0e,0xaa,0x18,0xbe,0x1b +.byte 0xfc,0x56,0x3e,0x4b,0xc6,0xd2,0x79,0x20 +.byte 0x9a,0xdb,0xc0,0xfe,0x78,0xcd,0x5a,0xf4 +.byte 0x1f,0xdd,0xa8,0x33,0x88,0x07,0xc7,0x31 +.byte 0xb1,0x12,0x10,0x59,0x27,0x80,0xec,0x5f +.byte 0x60,0x51,0x7f,0xa9,0x19,0xb5,0x4a,0x0d +.byte 0x2d,0xe5,0x7a,0x9f,0x93,0xc9,0x9c,0xef +.byte 0xa0,0xe0,0x3b,0x4d,0xae,0x2a,0xf5,0xb0 +.byte 0xc8,0xeb,0xbb,0x3c,0x83,0x53,0x99,0x61 +.byte 0x17,0x2b,0x04,0x7e,0xba,0x77,0xd6,0x26 +.byte 0xe1,0x69,0x14,0x63,0x55,0x21,0x0c,0x7d +.long 0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe +.long 0x1b1b1b1b, 0x1b1b1b1b, 0, 0 +.byte 0x52,0x09,0x6a,0xd5,0x30,0x36,0xa5,0x38 +.byte 0xbf,0x40,0xa3,0x9e,0x81,0xf3,0xd7,0xfb +.byte 0x7c,0xe3,0x39,0x82,0x9b,0x2f,0xff,0x87 +.byte 0x34,0x8e,0x43,0x44,0xc4,0xde,0xe9,0xcb +.byte 0x54,0x7b,0x94,0x32,0xa6,0xc2,0x23,0x3d +.byte 0xee,0x4c,0x95,0x0b,0x42,0xfa,0xc3,0x4e +.byte 0x08,0x2e,0xa1,0x66,0x28,0xd9,0x24,0xb2 +.byte 0x76,0x5b,0xa2,0x49,0x6d,0x8b,0xd1,0x25 +.byte 0x72,0xf8,0xf6,0x64,0x86,0x68,0x98,0x16 +.byte 0xd4,0xa4,0x5c,0xcc,0x5d,0x65,0xb6,0x92 +.byte 0x6c,0x70,0x48,0x50,0xfd,0xed,0xb9,0xda +.byte 0x5e,0x15,0x46,0x57,0xa7,0x8d,0x9d,0x84 +.byte 0x90,0xd8,0xab,0x00,0x8c,0xbc,0xd3,0x0a +.byte 0xf7,0xe4,0x58,0x05,0xb8,0xb3,0x45,0x06 +.byte 0xd0,0x2c,0x1e,0x8f,0xca,0x3f,0x0f,0x02 +.byte 0xc1,0xaf,0xbd,0x03,0x01,0x13,0x8a,0x6b +.byte 0x3a,0x91,0x11,0x41,0x4f,0x67,0xdc,0xea +.byte 0x97,0xf2,0xcf,0xce,0xf0,0xb4,0xe6,0x73 +.byte 0x96,0xac,0x74,0x22,0xe7,0xad,0x35,0x85 +.byte 0xe2,0xf9,0x37,0xe8,0x1c,0x75,0xdf,0x6e +.byte 0x47,0xf1,0x1a,0x71,0x1d,0x29,0xc5,0x89 +.byte 0x6f,0xb7,0x62,0x0e,0xaa,0x18,0xbe,0x1b +.byte 0xfc,0x56,0x3e,0x4b,0xc6,0xd2,0x79,0x20 +.byte 0x9a,0xdb,0xc0,0xfe,0x78,0xcd,0x5a,0xf4 +.byte 0x1f,0xdd,0xa8,0x33,0x88,0x07,0xc7,0x31 +.byte 0xb1,0x12,0x10,0x59,0x27,0x80,0xec,0x5f +.byte 0x60,0x51,0x7f,0xa9,0x19,0xb5,0x4a,0x0d +.byte 0x2d,0xe5,0x7a,0x9f,0x93,0xc9,0x9c,0xef +.byte 0xa0,0xe0,0x3b,0x4d,0xae,0x2a,0xf5,0xb0 +.byte 0xc8,0xeb,0xbb,0x3c,0x83,0x53,0x99,0x61 +.byte 0x17,0x2b,0x04,0x7e,0xba,0x77,0xd6,0x26 +.byte 0xe1,0x69,0x14,0x63,0x55,0x21,0x0c,0x7d +.long 0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe +.long 0x1b1b1b1b, 0x1b1b1b1b, 0, 0 +.byte 0x52,0x09,0x6a,0xd5,0x30,0x36,0xa5,0x38 +.byte 0xbf,0x40,0xa3,0x9e,0x81,0xf3,0xd7,0xfb +.byte 0x7c,0xe3,0x39,0x82,0x9b,0x2f,0xff,0x87 +.byte 0x34,0x8e,0x43,0x44,0xc4,0xde,0xe9,0xcb +.byte 0x54,0x7b,0x94,0x32,0xa6,0xc2,0x23,0x3d +.byte 0xee,0x4c,0x95,0x0b,0x42,0xfa,0xc3,0x4e +.byte 0x08,0x2e,0xa1,0x66,0x28,0xd9,0x24,0xb2 +.byte 0x76,0x5b,0xa2,0x49,0x6d,0x8b,0xd1,0x25 +.byte 0x72,0xf8,0xf6,0x64,0x86,0x68,0x98,0x16 +.byte 0xd4,0xa4,0x5c,0xcc,0x5d,0x65,0xb6,0x92 +.byte 0x6c,0x70,0x48,0x50,0xfd,0xed,0xb9,0xda +.byte 0x5e,0x15,0x46,0x57,0xa7,0x8d,0x9d,0x84 +.byte 0x90,0xd8,0xab,0x00,0x8c,0xbc,0xd3,0x0a +.byte 0xf7,0xe4,0x58,0x05,0xb8,0xb3,0x45,0x06 +.byte 0xd0,0x2c,0x1e,0x8f,0xca,0x3f,0x0f,0x02 +.byte 0xc1,0xaf,0xbd,0x03,0x01,0x13,0x8a,0x6b +.byte 0x3a,0x91,0x11,0x41,0x4f,0x67,0xdc,0xea +.byte 0x97,0xf2,0xcf,0xce,0xf0,0xb4,0xe6,0x73 +.byte 0x96,0xac,0x74,0x22,0xe7,0xad,0x35,0x85 +.byte 0xe2,0xf9,0x37,0xe8,0x1c,0x75,0xdf,0x6e +.byte 0x47,0xf1,0x1a,0x71,0x1d,0x29,0xc5,0x89 +.byte 0x6f,0xb7,0x62,0x0e,0xaa,0x18,0xbe,0x1b +.byte 0xfc,0x56,0x3e,0x4b,0xc6,0xd2,0x79,0x20 +.byte 0x9a,0xdb,0xc0,0xfe,0x78,0xcd,0x5a,0xf4 +.byte 0x1f,0xdd,0xa8,0x33,0x88,0x07,0xc7,0x31 +.byte 0xb1,0x12,0x10,0x59,0x27,0x80,0xec,0x5f +.byte 0x60,0x51,0x7f,0xa9,0x19,0xb5,0x4a,0x0d +.byte 0x2d,0xe5,0x7a,0x9f,0x93,0xc9,0x9c,0xef +.byte 0xa0,0xe0,0x3b,0x4d,0xae,0x2a,0xf5,0xb0 +.byte 0xc8,0xeb,0xbb,0x3c,0x83,0x53,0x99,0x61 +.byte 0x17,0x2b,0x04,0x7e,0xba,0x77,0xd6,0x26 +.byte 0xe1,0x69,0x14,0x63,0x55,0x21,0x0c,0x7d +.long 0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe +.long 0x1b1b1b1b, 0x1b1b1b1b, 0, 0 +.byte 65,69,83,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 64 diff --git a/secure/lib/libcrypto/amd64/aesni-gcm-x86_64.S b/secure/lib/libcrypto/amd64/aesni-gcm-x86_64.S new file mode 100644 index 0000000..9e99e71 --- /dev/null +++ b/secure/lib/libcrypto/amd64/aesni-gcm-x86_64.S @@ -0,0 +1,16 @@ + # $FreeBSD$ +.text + +.globl aesni_gcm_encrypt +.type aesni_gcm_encrypt,@function +aesni_gcm_encrypt: + xorl %eax,%eax + .byte 0xf3,0xc3 +.size aesni_gcm_encrypt,.-aesni_gcm_encrypt + +.globl aesni_gcm_decrypt +.type aesni_gcm_decrypt,@function +aesni_gcm_decrypt: + xorl %eax,%eax + .byte 0xf3,0xc3 +.size aesni_gcm_decrypt,.-aesni_gcm_decrypt diff --git a/secure/lib/libcrypto/amd64/aesni-mb-x86_64.S b/secure/lib/libcrypto/amd64/aesni-mb-x86_64.S new file mode 100644 index 0000000..7043ec3 --- /dev/null +++ b/secure/lib/libcrypto/amd64/aesni-mb-x86_64.S @@ -0,0 +1,507 @@ + # $FreeBSD$ +.text + + + +.globl aesni_multi_cbc_encrypt +.type aesni_multi_cbc_encrypt,@function +.align 32 +aesni_multi_cbc_encrypt: + movq %rsp,%rax + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + + + + + + subq $48,%rsp + andq $-64,%rsp + movq %rax,16(%rsp) + +.Lenc4x_body: + movdqu (%rsi),%xmm12 + leaq 120(%rsi),%rsi + leaq 80(%rdi),%rdi + +.Lenc4x_loop_grande: + movl %edx,24(%rsp) + xorl %edx,%edx + movl -64(%rdi),%ecx + movq -80(%rdi),%r8 + cmpl %edx,%ecx + movq -72(%rdi),%r12 + cmovgl %ecx,%edx + testl %ecx,%ecx + movdqu -56(%rdi),%xmm2 + movl %ecx,32(%rsp) + cmovleq %rsp,%r8 + movl -24(%rdi),%ecx + movq -40(%rdi),%r9 + cmpl %edx,%ecx + movq -32(%rdi),%r13 + cmovgl %ecx,%edx + testl %ecx,%ecx + movdqu -16(%rdi),%xmm3 + movl %ecx,36(%rsp) + cmovleq %rsp,%r9 + movl 16(%rdi),%ecx + movq 0(%rdi),%r10 + cmpl %edx,%ecx + movq 8(%rdi),%r14 + cmovgl %ecx,%edx + testl %ecx,%ecx + movdqu 24(%rdi),%xmm4 + movl %ecx,40(%rsp) + cmovleq %rsp,%r10 + movl 56(%rdi),%ecx + movq 40(%rdi),%r11 + cmpl %edx,%ecx + movq 48(%rdi),%r15 + cmovgl %ecx,%edx + testl %ecx,%ecx + movdqu 64(%rdi),%xmm5 + movl %ecx,44(%rsp) + cmovleq %rsp,%r11 + testl %edx,%edx + jz .Lenc4x_done + + movups 16-120(%rsi),%xmm1 + pxor %xmm12,%xmm2 + movups 32-120(%rsi),%xmm0 + pxor %xmm12,%xmm3 + movl 240-120(%rsi),%eax + pxor %xmm12,%xmm4 + movdqu (%r8),%xmm6 + pxor %xmm12,%xmm5 + movdqu (%r9),%xmm7 + pxor %xmm6,%xmm2 + movdqu (%r10),%xmm8 + pxor %xmm7,%xmm3 + movdqu (%r11),%xmm9 + pxor %xmm8,%xmm4 + pxor %xmm9,%xmm5 + movdqa 32(%rsp),%xmm10 + xorq %rbx,%rbx + jmp .Loop_enc4x + +.align 32 +.Loop_enc4x: + addq $16,%rbx + leaq 16(%rsp),%rbp + movl $1,%ecx + subq %rbx,%rbp + +.byte 102,15,56,220,209 + prefetcht0 31(%r8,%rbx,1) + prefetcht0 31(%r9,%rbx,1) +.byte 102,15,56,220,217 + prefetcht0 31(%r10,%rbx,1) + prefetcht0 31(%r10,%rbx,1) +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 + movups 48-120(%rsi),%xmm1 + cmpl 32(%rsp),%ecx +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 +.byte 102,15,56,220,224 + cmovgeq %rbp,%r8 + cmovgq %rbp,%r12 +.byte 102,15,56,220,232 + movups -56(%rsi),%xmm0 + cmpl 36(%rsp),%ecx +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 + cmovgeq %rbp,%r9 + cmovgq %rbp,%r13 +.byte 102,15,56,220,233 + movups -40(%rsi),%xmm1 + cmpl 40(%rsp),%ecx +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 +.byte 102,15,56,220,224 + cmovgeq %rbp,%r10 + cmovgq %rbp,%r14 +.byte 102,15,56,220,232 + movups -24(%rsi),%xmm0 + cmpl 44(%rsp),%ecx +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 + cmovgeq %rbp,%r11 + cmovgq %rbp,%r15 +.byte 102,15,56,220,233 + movups -8(%rsi),%xmm1 + movdqa %xmm10,%xmm11 +.byte 102,15,56,220,208 + prefetcht0 15(%r12,%rbx,1) + prefetcht0 15(%r13,%rbx,1) +.byte 102,15,56,220,216 + prefetcht0 15(%r14,%rbx,1) + prefetcht0 15(%r15,%rbx,1) +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 + movups 128-120(%rsi),%xmm0 + pxor %xmm12,%xmm12 + +.byte 102,15,56,220,209 + pcmpgtd %xmm12,%xmm11 + movdqu -120(%rsi),%xmm12 +.byte 102,15,56,220,217 + paddd %xmm11,%xmm10 + movdqa %xmm10,32(%rsp) +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 + movups 144-120(%rsi),%xmm1 + + cmpl $11,%eax + +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 + movups 160-120(%rsi),%xmm0 + + jb .Lenc4x_tail + +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 + movups 176-120(%rsi),%xmm1 + +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 + movups 192-120(%rsi),%xmm0 + + je .Lenc4x_tail + +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 + movups 208-120(%rsi),%xmm1 + +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 + movups 224-120(%rsi),%xmm0 + jmp .Lenc4x_tail + +.align 32 +.Lenc4x_tail: +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 + movdqu (%r8,%rbx,1),%xmm6 + movdqu 16-120(%rsi),%xmm1 + +.byte 102,15,56,221,208 + movdqu (%r9,%rbx,1),%xmm7 + pxor %xmm12,%xmm6 +.byte 102,15,56,221,216 + movdqu (%r10,%rbx,1),%xmm8 + pxor %xmm12,%xmm7 +.byte 102,15,56,221,224 + movdqu (%r11,%rbx,1),%xmm9 + pxor %xmm12,%xmm8 +.byte 102,15,56,221,232 + movdqu 32-120(%rsi),%xmm0 + pxor %xmm12,%xmm9 + + movups %xmm2,-16(%r12,%rbx,1) + pxor %xmm6,%xmm2 + movups %xmm3,-16(%r13,%rbx,1) + pxor %xmm7,%xmm3 + movups %xmm4,-16(%r14,%rbx,1) + pxor %xmm8,%xmm4 + movups %xmm5,-16(%r15,%rbx,1) + pxor %xmm9,%xmm5 + + decl %edx + jnz .Loop_enc4x + + movq 16(%rsp),%rax + movl 24(%rsp),%edx + + + + + + + + + + + leaq 160(%rdi),%rdi + decl %edx + jnz .Lenc4x_loop_grande + +.Lenc4x_done: + movq -48(%rax),%r15 + movq -40(%rax),%r14 + movq -32(%rax),%r13 + movq -24(%rax),%r12 + movq -16(%rax),%rbp + movq -8(%rax),%rbx + leaq (%rax),%rsp +.Lenc4x_epilogue: + .byte 0xf3,0xc3 +.size aesni_multi_cbc_encrypt,.-aesni_multi_cbc_encrypt + +.globl aesni_multi_cbc_decrypt +.type aesni_multi_cbc_decrypt,@function +.align 32 +aesni_multi_cbc_decrypt: + movq %rsp,%rax + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + + + + + + subq $48,%rsp + andq $-64,%rsp + movq %rax,16(%rsp) + +.Ldec4x_body: + movdqu (%rsi),%xmm12 + leaq 120(%rsi),%rsi + leaq 80(%rdi),%rdi + +.Ldec4x_loop_grande: + movl %edx,24(%rsp) + xorl %edx,%edx + movl -64(%rdi),%ecx + movq -80(%rdi),%r8 + cmpl %edx,%ecx + movq -72(%rdi),%r12 + cmovgl %ecx,%edx + testl %ecx,%ecx + movdqu -56(%rdi),%xmm6 + movl %ecx,32(%rsp) + cmovleq %rsp,%r8 + movl -24(%rdi),%ecx + movq -40(%rdi),%r9 + cmpl %edx,%ecx + movq -32(%rdi),%r13 + cmovgl %ecx,%edx + testl %ecx,%ecx + movdqu -16(%rdi),%xmm7 + movl %ecx,36(%rsp) + cmovleq %rsp,%r9 + movl 16(%rdi),%ecx + movq 0(%rdi),%r10 + cmpl %edx,%ecx + movq 8(%rdi),%r14 + cmovgl %ecx,%edx + testl %ecx,%ecx + movdqu 24(%rdi),%xmm8 + movl %ecx,40(%rsp) + cmovleq %rsp,%r10 + movl 56(%rdi),%ecx + movq 40(%rdi),%r11 + cmpl %edx,%ecx + movq 48(%rdi),%r15 + cmovgl %ecx,%edx + testl %ecx,%ecx + movdqu 64(%rdi),%xmm9 + movl %ecx,44(%rsp) + cmovleq %rsp,%r11 + testl %edx,%edx + jz .Ldec4x_done + + movups 16-120(%rsi),%xmm1 + movups 32-120(%rsi),%xmm0 + movl 240-120(%rsi),%eax + movdqu (%r8),%xmm2 + movdqu (%r9),%xmm3 + pxor %xmm12,%xmm2 + movdqu (%r10),%xmm4 + pxor %xmm12,%xmm3 + movdqu (%r11),%xmm5 + pxor %xmm12,%xmm4 + pxor %xmm12,%xmm5 + movdqa 32(%rsp),%xmm10 + xorq %rbx,%rbx + jmp .Loop_dec4x + +.align 32 +.Loop_dec4x: + addq $16,%rbx + leaq 16(%rsp),%rbp + movl $1,%ecx + subq %rbx,%rbp + +.byte 102,15,56,222,209 + prefetcht0 31(%r8,%rbx,1) + prefetcht0 31(%r9,%rbx,1) +.byte 102,15,56,222,217 + prefetcht0 31(%r10,%rbx,1) + prefetcht0 31(%r11,%rbx,1) +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 + movups 48-120(%rsi),%xmm1 + cmpl 32(%rsp),%ecx +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 +.byte 102,15,56,222,224 + cmovgeq %rbp,%r8 + cmovgq %rbp,%r12 +.byte 102,15,56,222,232 + movups -56(%rsi),%xmm0 + cmpl 36(%rsp),%ecx +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 + cmovgeq %rbp,%r9 + cmovgq %rbp,%r13 +.byte 102,15,56,222,233 + movups -40(%rsi),%xmm1 + cmpl 40(%rsp),%ecx +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 +.byte 102,15,56,222,224 + cmovgeq %rbp,%r10 + cmovgq %rbp,%r14 +.byte 102,15,56,222,232 + movups -24(%rsi),%xmm0 + cmpl 44(%rsp),%ecx +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 + cmovgeq %rbp,%r11 + cmovgq %rbp,%r15 +.byte 102,15,56,222,233 + movups -8(%rsi),%xmm1 + movdqa %xmm10,%xmm11 +.byte 102,15,56,222,208 + prefetcht0 15(%r12,%rbx,1) + prefetcht0 15(%r13,%rbx,1) +.byte 102,15,56,222,216 + prefetcht0 15(%r14,%rbx,1) + prefetcht0 15(%r15,%rbx,1) +.byte 102,15,56,222,224 +.byte 102,15,56,222,232 + movups 128-120(%rsi),%xmm0 + pxor %xmm12,%xmm12 + +.byte 102,15,56,222,209 + pcmpgtd %xmm12,%xmm11 + movdqu -120(%rsi),%xmm12 +.byte 102,15,56,222,217 + paddd %xmm11,%xmm10 + movdqa %xmm10,32(%rsp) +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 + movups 144-120(%rsi),%xmm1 + + cmpl $11,%eax + +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 +.byte 102,15,56,222,224 +.byte 102,15,56,222,232 + movups 160-120(%rsi),%xmm0 + + jb .Ldec4x_tail + +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 + movups 176-120(%rsi),%xmm1 + +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 +.byte 102,15,56,222,224 +.byte 102,15,56,222,232 + movups 192-120(%rsi),%xmm0 + + je .Ldec4x_tail + +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 + movups 208-120(%rsi),%xmm1 + +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 +.byte 102,15,56,222,224 +.byte 102,15,56,222,232 + movups 224-120(%rsi),%xmm0 + jmp .Ldec4x_tail + +.align 32 +.Ldec4x_tail: +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 + pxor %xmm0,%xmm6 + pxor %xmm0,%xmm7 +.byte 102,15,56,222,233 + movdqu 16-120(%rsi),%xmm1 + pxor %xmm0,%xmm8 + pxor %xmm0,%xmm9 + movdqu 32-120(%rsi),%xmm0 + +.byte 102,15,56,223,214 +.byte 102,15,56,223,223 + movdqu -16(%r8,%rbx,1),%xmm6 + movdqu -16(%r9,%rbx,1),%xmm7 +.byte 102,65,15,56,223,224 +.byte 102,65,15,56,223,233 + movdqu -16(%r10,%rbx,1),%xmm8 + movdqu -16(%r11,%rbx,1),%xmm9 + + movups %xmm2,-16(%r12,%rbx,1) + movdqu (%r8,%rbx,1),%xmm2 + movups %xmm3,-16(%r13,%rbx,1) + movdqu (%r9,%rbx,1),%xmm3 + pxor %xmm12,%xmm2 + movups %xmm4,-16(%r14,%rbx,1) + movdqu (%r10,%rbx,1),%xmm4 + pxor %xmm12,%xmm3 + movups %xmm5,-16(%r15,%rbx,1) + movdqu (%r11,%rbx,1),%xmm5 + pxor %xmm12,%xmm4 + pxor %xmm12,%xmm5 + + decl %edx + jnz .Loop_dec4x + + movq 16(%rsp),%rax + movl 24(%rsp),%edx + + leaq 160(%rdi),%rdi + decl %edx + jnz .Ldec4x_loop_grande + +.Ldec4x_done: + movq -48(%rax),%r15 + movq -40(%rax),%r14 + movq -32(%rax),%r13 + movq -24(%rax),%r12 + movq -16(%rax),%rbp + movq -8(%rax),%rbx + leaq (%rax),%rsp +.Ldec4x_epilogue: + .byte 0xf3,0xc3 +.size aesni_multi_cbc_decrypt,.-aesni_multi_cbc_decrypt diff --git a/secure/lib/libcrypto/amd64/aesni-sha1-x86_64.S b/secure/lib/libcrypto/amd64/aesni-sha1-x86_64.S new file mode 100644 index 0000000..fa16434 --- /dev/null +++ b/secure/lib/libcrypto/amd64/aesni-sha1-x86_64.S @@ -0,0 +1,1682 @@ + # $FreeBSD$ +.text + + +.globl aesni_cbc_sha1_enc +.type aesni_cbc_sha1_enc,@function +.align 32 +aesni_cbc_sha1_enc: + + movl OPENSSL_ia32cap_P+0(%rip),%r10d + movq OPENSSL_ia32cap_P+4(%rip),%r11 + btq $61,%r11 + jc aesni_cbc_sha1_enc_shaext + jmp aesni_cbc_sha1_enc_ssse3 + .byte 0xf3,0xc3 +.size aesni_cbc_sha1_enc,.-aesni_cbc_sha1_enc +.type aesni_cbc_sha1_enc_ssse3,@function +.align 32 +aesni_cbc_sha1_enc_ssse3: + movq 8(%rsp),%r10 + + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + leaq -104(%rsp),%rsp + + + movq %rdi,%r12 + movq %rsi,%r13 + movq %rdx,%r14 + leaq 112(%rcx),%r15 + movdqu (%r8),%xmm2 + movq %r8,88(%rsp) + shlq $6,%r14 + subq %r12,%r13 + movl 240-112(%r15),%r8d + addq %r10,%r14 + + leaq K_XX_XX(%rip),%r11 + movl 0(%r9),%eax + movl 4(%r9),%ebx + movl 8(%r9),%ecx + movl 12(%r9),%edx + movl %ebx,%esi + movl 16(%r9),%ebp + movl %ecx,%edi + xorl %edx,%edi + andl %edi,%esi + + movdqa 64(%r11),%xmm3 + movdqa 0(%r11),%xmm13 + movdqu 0(%r10),%xmm4 + movdqu 16(%r10),%xmm5 + movdqu 32(%r10),%xmm6 + movdqu 48(%r10),%xmm7 +.byte 102,15,56,0,227 +.byte 102,15,56,0,235 +.byte 102,15,56,0,243 + addq $64,%r10 + paddd %xmm13,%xmm4 +.byte 102,15,56,0,251 + paddd %xmm13,%xmm5 + paddd %xmm13,%xmm6 + movdqa %xmm4,0(%rsp) + psubd %xmm13,%xmm4 + movdqa %xmm5,16(%rsp) + psubd %xmm13,%xmm5 + movdqa %xmm6,32(%rsp) + psubd %xmm13,%xmm6 + movups -112(%r15),%xmm15 + movups 16-112(%r15),%xmm0 + jmp .Loop_ssse3 +.align 32 +.Loop_ssse3: + rorl $2,%ebx + movups 0(%r12),%xmm14 + xorps %xmm15,%xmm14 + xorps %xmm14,%xmm2 + movups -80(%r15),%xmm1 +.byte 102,15,56,220,208 + pshufd $238,%xmm4,%xmm8 + xorl %edx,%esi + movdqa %xmm7,%xmm12 + paddd %xmm7,%xmm13 + movl %eax,%edi + addl 0(%rsp),%ebp + punpcklqdq %xmm5,%xmm8 + xorl %ecx,%ebx + roll $5,%eax + addl %esi,%ebp + psrldq $4,%xmm12 + andl %ebx,%edi + xorl %ecx,%ebx + pxor %xmm4,%xmm8 + addl %eax,%ebp + rorl $7,%eax + pxor %xmm6,%xmm12 + xorl %ecx,%edi + movl %ebp,%esi + addl 4(%rsp),%edx + pxor %xmm12,%xmm8 + xorl %ebx,%eax + roll $5,%ebp + movdqa %xmm13,48(%rsp) + addl %edi,%edx + movups -64(%r15),%xmm0 +.byte 102,15,56,220,209 + andl %eax,%esi + movdqa %xmm8,%xmm3 + xorl %ebx,%eax + addl %ebp,%edx + rorl $7,%ebp + movdqa %xmm8,%xmm12 + xorl %ebx,%esi + pslldq $12,%xmm3 + paddd %xmm8,%xmm8 + movl %edx,%edi + addl 8(%rsp),%ecx + psrld $31,%xmm12 + xorl %eax,%ebp + roll $5,%edx + addl %esi,%ecx + movdqa %xmm3,%xmm13 + andl %ebp,%edi + xorl %eax,%ebp + psrld $30,%xmm3 + addl %edx,%ecx + rorl $7,%edx + por %xmm12,%xmm8 + xorl %eax,%edi + movl %ecx,%esi + addl 12(%rsp),%ebx + movups -48(%r15),%xmm1 +.byte 102,15,56,220,208 + pslld $2,%xmm13 + pxor %xmm3,%xmm8 + xorl %ebp,%edx + movdqa 0(%r11),%xmm3 + roll $5,%ecx + addl %edi,%ebx + andl %edx,%esi + pxor %xmm13,%xmm8 + xorl %ebp,%edx + addl %ecx,%ebx + rorl $7,%ecx + pshufd $238,%xmm5,%xmm9 + xorl %ebp,%esi + movdqa %xmm8,%xmm13 + paddd %xmm8,%xmm3 + movl %ebx,%edi + addl 16(%rsp),%eax + punpcklqdq %xmm6,%xmm9 + xorl %edx,%ecx + roll $5,%ebx + addl %esi,%eax + psrldq $4,%xmm13 + andl %ecx,%edi + xorl %edx,%ecx + pxor %xmm5,%xmm9 + addl %ebx,%eax + rorl $7,%ebx + movups -32(%r15),%xmm0 +.byte 102,15,56,220,209 + pxor %xmm7,%xmm13 + xorl %edx,%edi + movl %eax,%esi + addl 20(%rsp),%ebp + pxor %xmm13,%xmm9 + xorl %ecx,%ebx + roll $5,%eax + movdqa %xmm3,0(%rsp) + addl %edi,%ebp + andl %ebx,%esi + movdqa %xmm9,%xmm12 + xorl %ecx,%ebx + addl %eax,%ebp + rorl $7,%eax + movdqa %xmm9,%xmm13 + xorl %ecx,%esi + pslldq $12,%xmm12 + paddd %xmm9,%xmm9 + movl %ebp,%edi + addl 24(%rsp),%edx + psrld $31,%xmm13 + xorl %ebx,%eax + roll $5,%ebp + addl %esi,%edx + movups -16(%r15),%xmm1 +.byte 102,15,56,220,208 + movdqa %xmm12,%xmm3 + andl %eax,%edi + xorl %ebx,%eax + psrld $30,%xmm12 + addl %ebp,%edx + rorl $7,%ebp + por %xmm13,%xmm9 + xorl %ebx,%edi + movl %edx,%esi + addl 28(%rsp),%ecx + pslld $2,%xmm3 + pxor %xmm12,%xmm9 + xorl %eax,%ebp + movdqa 16(%r11),%xmm12 + roll $5,%edx + addl %edi,%ecx + andl %ebp,%esi + pxor %xmm3,%xmm9 + xorl %eax,%ebp + addl %edx,%ecx + rorl $7,%edx + pshufd $238,%xmm6,%xmm10 + xorl %eax,%esi + movdqa %xmm9,%xmm3 + paddd %xmm9,%xmm12 + movl %ecx,%edi + addl 32(%rsp),%ebx + movups 0(%r15),%xmm0 +.byte 102,15,56,220,209 + punpcklqdq %xmm7,%xmm10 + xorl %ebp,%edx + roll $5,%ecx + addl %esi,%ebx + psrldq $4,%xmm3 + andl %edx,%edi + xorl %ebp,%edx + pxor %xmm6,%xmm10 + addl %ecx,%ebx + rorl $7,%ecx + pxor %xmm8,%xmm3 + xorl %ebp,%edi + movl %ebx,%esi + addl 36(%rsp),%eax + pxor %xmm3,%xmm10 + xorl %edx,%ecx + roll $5,%ebx + movdqa %xmm12,16(%rsp) + addl %edi,%eax + andl %ecx,%esi + movdqa %xmm10,%xmm13 + xorl %edx,%ecx + addl %ebx,%eax + rorl $7,%ebx + movups 16(%r15),%xmm1 +.byte 102,15,56,220,208 + movdqa %xmm10,%xmm3 + xorl %edx,%esi + pslldq $12,%xmm13 + paddd %xmm10,%xmm10 + movl %eax,%edi + addl 40(%rsp),%ebp + psrld $31,%xmm3 + xorl %ecx,%ebx + roll $5,%eax + addl %esi,%ebp + movdqa %xmm13,%xmm12 + andl %ebx,%edi + xorl %ecx,%ebx + psrld $30,%xmm13 + addl %eax,%ebp + rorl $7,%eax + por %xmm3,%xmm10 + xorl %ecx,%edi + movl %ebp,%esi + addl 44(%rsp),%edx + pslld $2,%xmm12 + pxor %xmm13,%xmm10 + xorl %ebx,%eax + movdqa 16(%r11),%xmm13 + roll $5,%ebp + addl %edi,%edx + movups 32(%r15),%xmm0 +.byte 102,15,56,220,209 + andl %eax,%esi + pxor %xmm12,%xmm10 + xorl %ebx,%eax + addl %ebp,%edx + rorl $7,%ebp + pshufd $238,%xmm7,%xmm11 + xorl %ebx,%esi + movdqa %xmm10,%xmm12 + paddd %xmm10,%xmm13 + movl %edx,%edi + addl 48(%rsp),%ecx + punpcklqdq %xmm8,%xmm11 + xorl %eax,%ebp + roll $5,%edx + addl %esi,%ecx + psrldq $4,%xmm12 + andl %ebp,%edi + xorl %eax,%ebp + pxor %xmm7,%xmm11 + addl %edx,%ecx + rorl $7,%edx + pxor %xmm9,%xmm12 + xorl %eax,%edi + movl %ecx,%esi + addl 52(%rsp),%ebx + movups 48(%r15),%xmm1 +.byte 102,15,56,220,208 + pxor %xmm12,%xmm11 + xorl %ebp,%edx + roll $5,%ecx + movdqa %xmm13,32(%rsp) + addl %edi,%ebx + andl %edx,%esi + movdqa %xmm11,%xmm3 + xorl %ebp,%edx + addl %ecx,%ebx + rorl $7,%ecx + movdqa %xmm11,%xmm12 + xorl %ebp,%esi + pslldq $12,%xmm3 + paddd %xmm11,%xmm11 + movl %ebx,%edi + addl 56(%rsp),%eax + psrld $31,%xmm12 + xorl %edx,%ecx + roll $5,%ebx + addl %esi,%eax + movdqa %xmm3,%xmm13 + andl %ecx,%edi + xorl %edx,%ecx + psrld $30,%xmm3 + addl %ebx,%eax + rorl $7,%ebx + cmpl $11,%r8d + jb .Laesenclast1 + movups 64(%r15),%xmm0 +.byte 102,15,56,220,209 + movups 80(%r15),%xmm1 +.byte 102,15,56,220,208 + je .Laesenclast1 + movups 96(%r15),%xmm0 +.byte 102,15,56,220,209 + movups 112(%r15),%xmm1 +.byte 102,15,56,220,208 +.Laesenclast1: +.byte 102,15,56,221,209 + movups 16-112(%r15),%xmm0 + por %xmm12,%xmm11 + xorl %edx,%edi + movl %eax,%esi + addl 60(%rsp),%ebp + pslld $2,%xmm13 + pxor %xmm3,%xmm11 + xorl %ecx,%ebx + movdqa 16(%r11),%xmm3 + roll $5,%eax + addl %edi,%ebp + andl %ebx,%esi + pxor %xmm13,%xmm11 + pshufd $238,%xmm10,%xmm13 + xorl %ecx,%ebx + addl %eax,%ebp + rorl $7,%eax + pxor %xmm8,%xmm4 + xorl %ecx,%esi + movl %ebp,%edi + addl 0(%rsp),%edx + punpcklqdq %xmm11,%xmm13 + xorl %ebx,%eax + roll $5,%ebp + pxor %xmm5,%xmm4 + addl %esi,%edx + movups 16(%r12),%xmm14 + xorps %xmm15,%xmm14 + movups %xmm2,0(%r12,%r13,1) + xorps %xmm14,%xmm2 + movups -80(%r15),%xmm1 +.byte 102,15,56,220,208 + andl %eax,%edi + movdqa %xmm3,%xmm12 + xorl %ebx,%eax + paddd %xmm11,%xmm3 + addl %ebp,%edx + pxor %xmm13,%xmm4 + rorl $7,%ebp + xorl %ebx,%edi + movl %edx,%esi + addl 4(%rsp),%ecx + movdqa %xmm4,%xmm13 + xorl %eax,%ebp + roll $5,%edx + movdqa %xmm3,48(%rsp) + addl %edi,%ecx + andl %ebp,%esi + xorl %eax,%ebp + pslld $2,%xmm4 + addl %edx,%ecx + rorl $7,%edx + psrld $30,%xmm13 + xorl %eax,%esi + movl %ecx,%edi + addl 8(%rsp),%ebx + movups -64(%r15),%xmm0 +.byte 102,15,56,220,209 + por %xmm13,%xmm4 + xorl %ebp,%edx + roll $5,%ecx + pshufd $238,%xmm11,%xmm3 + addl %esi,%ebx + andl %edx,%edi + xorl %ebp,%edx + addl %ecx,%ebx + addl 12(%rsp),%eax + xorl %ebp,%edi + movl %ebx,%esi + roll $5,%ebx + addl %edi,%eax + xorl %edx,%esi + rorl $7,%ecx + addl %ebx,%eax + pxor %xmm9,%xmm5 + addl 16(%rsp),%ebp + movups -48(%r15),%xmm1 +.byte 102,15,56,220,208 + xorl %ecx,%esi + punpcklqdq %xmm4,%xmm3 + movl %eax,%edi + roll $5,%eax + pxor %xmm6,%xmm5 + addl %esi,%ebp + xorl %ecx,%edi + movdqa %xmm12,%xmm13 + rorl $7,%ebx + paddd %xmm4,%xmm12 + addl %eax,%ebp + pxor %xmm3,%xmm5 + addl 20(%rsp),%edx + xorl %ebx,%edi + movl %ebp,%esi + roll $5,%ebp + movdqa %xmm5,%xmm3 + addl %edi,%edx + xorl %ebx,%esi + movdqa %xmm12,0(%rsp) + rorl $7,%eax + addl %ebp,%edx + addl 24(%rsp),%ecx + pslld $2,%xmm5 + xorl %eax,%esi + movl %edx,%edi + psrld $30,%xmm3 + roll $5,%edx + addl %esi,%ecx + movups -32(%r15),%xmm0 +.byte 102,15,56,220,209 + xorl %eax,%edi + rorl $7,%ebp + por %xmm3,%xmm5 + addl %edx,%ecx + addl 28(%rsp),%ebx + pshufd $238,%xmm4,%xmm12 + xorl %ebp,%edi + movl %ecx,%esi + roll $5,%ecx + addl %edi,%ebx + xorl %ebp,%esi + rorl $7,%edx + addl %ecx,%ebx + pxor %xmm10,%xmm6 + addl 32(%rsp),%eax + xorl %edx,%esi + punpcklqdq %xmm5,%xmm12 + movl %ebx,%edi + roll $5,%ebx + pxor %xmm7,%xmm6 + addl %esi,%eax + xorl %edx,%edi + movdqa 32(%r11),%xmm3 + rorl $7,%ecx + paddd %xmm5,%xmm13 + addl %ebx,%eax + pxor %xmm12,%xmm6 + addl 36(%rsp),%ebp + movups -16(%r15),%xmm1 +.byte 102,15,56,220,208 + xorl %ecx,%edi + movl %eax,%esi + roll $5,%eax + movdqa %xmm6,%xmm12 + addl %edi,%ebp + xorl %ecx,%esi + movdqa %xmm13,16(%rsp) + rorl $7,%ebx + addl %eax,%ebp + addl 40(%rsp),%edx + pslld $2,%xmm6 + xorl %ebx,%esi + movl %ebp,%edi + psrld $30,%xmm12 + roll $5,%ebp + addl %esi,%edx + xorl %ebx,%edi + rorl $7,%eax + por %xmm12,%xmm6 + addl %ebp,%edx + addl 44(%rsp),%ecx + pshufd $238,%xmm5,%xmm13 + xorl %eax,%edi + movl %edx,%esi + roll $5,%edx + addl %edi,%ecx + movups 0(%r15),%xmm0 +.byte 102,15,56,220,209 + xorl %eax,%esi + rorl $7,%ebp + addl %edx,%ecx + pxor %xmm11,%xmm7 + addl 48(%rsp),%ebx + xorl %ebp,%esi + punpcklqdq %xmm6,%xmm13 + movl %ecx,%edi + roll $5,%ecx + pxor %xmm8,%xmm7 + addl %esi,%ebx + xorl %ebp,%edi + movdqa %xmm3,%xmm12 + rorl $7,%edx + paddd %xmm6,%xmm3 + addl %ecx,%ebx + pxor %xmm13,%xmm7 + addl 52(%rsp),%eax + xorl %edx,%edi + movl %ebx,%esi + roll $5,%ebx + movdqa %xmm7,%xmm13 + addl %edi,%eax + xorl %edx,%esi + movdqa %xmm3,32(%rsp) + rorl $7,%ecx + addl %ebx,%eax + addl 56(%rsp),%ebp + movups 16(%r15),%xmm1 +.byte 102,15,56,220,208 + pslld $2,%xmm7 + xorl %ecx,%esi + movl %eax,%edi + psrld $30,%xmm13 + roll $5,%eax + addl %esi,%ebp + xorl %ecx,%edi + rorl $7,%ebx + por %xmm13,%xmm7 + addl %eax,%ebp + addl 60(%rsp),%edx + pshufd $238,%xmm6,%xmm3 + xorl %ebx,%edi + movl %ebp,%esi + roll $5,%ebp + addl %edi,%edx + xorl %ebx,%esi + rorl $7,%eax + addl %ebp,%edx + pxor %xmm4,%xmm8 + addl 0(%rsp),%ecx + xorl %eax,%esi + punpcklqdq %xmm7,%xmm3 + movl %edx,%edi + roll $5,%edx + pxor %xmm9,%xmm8 + addl %esi,%ecx + movups 32(%r15),%xmm0 +.byte 102,15,56,220,209 + xorl %eax,%edi + movdqa %xmm12,%xmm13 + rorl $7,%ebp + paddd %xmm7,%xmm12 + addl %edx,%ecx + pxor %xmm3,%xmm8 + addl 4(%rsp),%ebx + xorl %ebp,%edi + movl %ecx,%esi + roll $5,%ecx + movdqa %xmm8,%xmm3 + addl %edi,%ebx + xorl %ebp,%esi + movdqa %xmm12,48(%rsp) + rorl $7,%edx + addl %ecx,%ebx + addl 8(%rsp),%eax + pslld $2,%xmm8 + xorl %edx,%esi + movl %ebx,%edi + psrld $30,%xmm3 + roll $5,%ebx + addl %esi,%eax + xorl %edx,%edi + rorl $7,%ecx + por %xmm3,%xmm8 + addl %ebx,%eax + addl 12(%rsp),%ebp + movups 48(%r15),%xmm1 +.byte 102,15,56,220,208 + pshufd $238,%xmm7,%xmm12 + xorl %ecx,%edi + movl %eax,%esi + roll $5,%eax + addl %edi,%ebp + xorl %ecx,%esi + rorl $7,%ebx + addl %eax,%ebp + pxor %xmm5,%xmm9 + addl 16(%rsp),%edx + xorl %ebx,%esi + punpcklqdq %xmm8,%xmm12 + movl %ebp,%edi + roll $5,%ebp + pxor %xmm10,%xmm9 + addl %esi,%edx + xorl %ebx,%edi + movdqa %xmm13,%xmm3 + rorl $7,%eax + paddd %xmm8,%xmm13 + addl %ebp,%edx + pxor %xmm12,%xmm9 + addl 20(%rsp),%ecx + xorl %eax,%edi + movl %edx,%esi + roll $5,%edx + movdqa %xmm9,%xmm12 + addl %edi,%ecx + cmpl $11,%r8d + jb .Laesenclast2 + movups 64(%r15),%xmm0 +.byte 102,15,56,220,209 + movups 80(%r15),%xmm1 +.byte 102,15,56,220,208 + je .Laesenclast2 + movups 96(%r15),%xmm0 +.byte 102,15,56,220,209 + movups 112(%r15),%xmm1 +.byte 102,15,56,220,208 +.Laesenclast2: +.byte 102,15,56,221,209 + movups 16-112(%r15),%xmm0 + xorl %eax,%esi + movdqa %xmm13,0(%rsp) + rorl $7,%ebp + addl %edx,%ecx + addl 24(%rsp),%ebx + pslld $2,%xmm9 + xorl %ebp,%esi + movl %ecx,%edi + psrld $30,%xmm12 + roll $5,%ecx + addl %esi,%ebx + xorl %ebp,%edi + rorl $7,%edx + por %xmm12,%xmm9 + addl %ecx,%ebx + addl 28(%rsp),%eax + pshufd $238,%xmm8,%xmm13 + rorl $7,%ecx + movl %ebx,%esi + xorl %edx,%edi + roll $5,%ebx + addl %edi,%eax + xorl %ecx,%esi + xorl %edx,%ecx + addl %ebx,%eax + pxor %xmm6,%xmm10 + addl 32(%rsp),%ebp + movups 32(%r12),%xmm14 + xorps %xmm15,%xmm14 + movups %xmm2,16(%r13,%r12,1) + xorps %xmm14,%xmm2 + movups -80(%r15),%xmm1 +.byte 102,15,56,220,208 + andl %ecx,%esi + xorl %edx,%ecx + rorl $7,%ebx + punpcklqdq %xmm9,%xmm13 + movl %eax,%edi + xorl %ecx,%esi + pxor %xmm11,%xmm10 + roll $5,%eax + addl %esi,%ebp + movdqa %xmm3,%xmm12 + xorl %ebx,%edi + paddd %xmm9,%xmm3 + xorl %ecx,%ebx + pxor %xmm13,%xmm10 + addl %eax,%ebp + addl 36(%rsp),%edx + andl %ebx,%edi + xorl %ecx,%ebx + rorl $7,%eax + movdqa %xmm10,%xmm13 + movl %ebp,%esi + xorl %ebx,%edi + movdqa %xmm3,16(%rsp) + roll $5,%ebp + addl %edi,%edx + movups -64(%r15),%xmm0 +.byte 102,15,56,220,209 + xorl %eax,%esi + pslld $2,%xmm10 + xorl %ebx,%eax + addl %ebp,%edx + psrld $30,%xmm13 + addl 40(%rsp),%ecx + andl %eax,%esi + xorl %ebx,%eax + por %xmm13,%xmm10 + rorl $7,%ebp + movl %edx,%edi + xorl %eax,%esi + roll $5,%edx + pshufd $238,%xmm9,%xmm3 + addl %esi,%ecx + xorl %ebp,%edi + xorl %eax,%ebp + addl %edx,%ecx + addl 44(%rsp),%ebx + andl %ebp,%edi + xorl %eax,%ebp + rorl $7,%edx + movups -48(%r15),%xmm1 +.byte 102,15,56,220,208 + movl %ecx,%esi + xorl %ebp,%edi + roll $5,%ecx + addl %edi,%ebx + xorl %edx,%esi + xorl %ebp,%edx + addl %ecx,%ebx + pxor %xmm7,%xmm11 + addl 48(%rsp),%eax + andl %edx,%esi + xorl %ebp,%edx + rorl $7,%ecx + punpcklqdq %xmm10,%xmm3 + movl %ebx,%edi + xorl %edx,%esi + pxor %xmm4,%xmm11 + roll $5,%ebx + addl %esi,%eax + movdqa 48(%r11),%xmm13 + xorl %ecx,%edi + paddd %xmm10,%xmm12 + xorl %edx,%ecx + pxor %xmm3,%xmm11 + addl %ebx,%eax + addl 52(%rsp),%ebp + movups -32(%r15),%xmm0 +.byte 102,15,56,220,209 + andl %ecx,%edi + xorl %edx,%ecx + rorl $7,%ebx + movdqa %xmm11,%xmm3 + movl %eax,%esi + xorl %ecx,%edi + movdqa %xmm12,32(%rsp) + roll $5,%eax + addl %edi,%ebp + xorl %ebx,%esi + pslld $2,%xmm11 + xorl %ecx,%ebx + addl %eax,%ebp + psrld $30,%xmm3 + addl 56(%rsp),%edx + andl %ebx,%esi + xorl %ecx,%ebx + por %xmm3,%xmm11 + rorl $7,%eax + movl %ebp,%edi + xorl %ebx,%esi + roll $5,%ebp + pshufd $238,%xmm10,%xmm12 + addl %esi,%edx + movups -16(%r15),%xmm1 +.byte 102,15,56,220,208 + xorl %eax,%edi + xorl %ebx,%eax + addl %ebp,%edx + addl 60(%rsp),%ecx + andl %eax,%edi + xorl %ebx,%eax + rorl $7,%ebp + movl %edx,%esi + xorl %eax,%edi + roll $5,%edx + addl %edi,%ecx + xorl %ebp,%esi + xorl %eax,%ebp + addl %edx,%ecx + pxor %xmm8,%xmm4 + addl 0(%rsp),%ebx + andl %ebp,%esi + xorl %eax,%ebp + rorl $7,%edx + movups 0(%r15),%xmm0 +.byte 102,15,56,220,209 + punpcklqdq %xmm11,%xmm12 + movl %ecx,%edi + xorl %ebp,%esi + pxor %xmm5,%xmm4 + roll $5,%ecx + addl %esi,%ebx + movdqa %xmm13,%xmm3 + xorl %edx,%edi + paddd %xmm11,%xmm13 + xorl %ebp,%edx + pxor %xmm12,%xmm4 + addl %ecx,%ebx + addl 4(%rsp),%eax + andl %edx,%edi + xorl %ebp,%edx + rorl $7,%ecx + movdqa %xmm4,%xmm12 + movl %ebx,%esi + xorl %edx,%edi + movdqa %xmm13,48(%rsp) + roll $5,%ebx + addl %edi,%eax + xorl %ecx,%esi + pslld $2,%xmm4 + xorl %edx,%ecx + addl %ebx,%eax + psrld $30,%xmm12 + addl 8(%rsp),%ebp + movups 16(%r15),%xmm1 +.byte 102,15,56,220,208 + andl %ecx,%esi + xorl %edx,%ecx + por %xmm12,%xmm4 + rorl $7,%ebx + movl %eax,%edi + xorl %ecx,%esi + roll $5,%eax + pshufd $238,%xmm11,%xmm13 + addl %esi,%ebp + xorl %ebx,%edi + xorl %ecx,%ebx + addl %eax,%ebp + addl 12(%rsp),%edx + andl %ebx,%edi + xorl %ecx,%ebx + rorl $7,%eax + movl %ebp,%esi + xorl %ebx,%edi + roll $5,%ebp + addl %edi,%edx + movups 32(%r15),%xmm0 +.byte 102,15,56,220,209 + xorl %eax,%esi + xorl %ebx,%eax + addl %ebp,%edx + pxor %xmm9,%xmm5 + addl 16(%rsp),%ecx + andl %eax,%esi + xorl %ebx,%eax + rorl $7,%ebp + punpcklqdq %xmm4,%xmm13 + movl %edx,%edi + xorl %eax,%esi + pxor %xmm6,%xmm5 + roll $5,%edx + addl %esi,%ecx + movdqa %xmm3,%xmm12 + xorl %ebp,%edi + paddd %xmm4,%xmm3 + xorl %eax,%ebp + pxor %xmm13,%xmm5 + addl %edx,%ecx + addl 20(%rsp),%ebx + andl %ebp,%edi + xorl %eax,%ebp + rorl $7,%edx + movups 48(%r15),%xmm1 +.byte 102,15,56,220,208 + movdqa %xmm5,%xmm13 + movl %ecx,%esi + xorl %ebp,%edi + movdqa %xmm3,0(%rsp) + roll $5,%ecx + addl %edi,%ebx + xorl %edx,%esi + pslld $2,%xmm5 + xorl %ebp,%edx + addl %ecx,%ebx + psrld $30,%xmm13 + addl 24(%rsp),%eax + andl %edx,%esi + xorl %ebp,%edx + por %xmm13,%xmm5 + rorl $7,%ecx + movl %ebx,%edi + xorl %edx,%esi + roll $5,%ebx + pshufd $238,%xmm4,%xmm3 + addl %esi,%eax + xorl %ecx,%edi + xorl %edx,%ecx + addl %ebx,%eax + addl 28(%rsp),%ebp + cmpl $11,%r8d + jb .Laesenclast3 + movups 64(%r15),%xmm0 +.byte 102,15,56,220,209 + movups 80(%r15),%xmm1 +.byte 102,15,56,220,208 + je .Laesenclast3 + movups 96(%r15),%xmm0 +.byte 102,15,56,220,209 + movups 112(%r15),%xmm1 +.byte 102,15,56,220,208 +.Laesenclast3: +.byte 102,15,56,221,209 + movups 16-112(%r15),%xmm0 + andl %ecx,%edi + xorl %edx,%ecx + rorl $7,%ebx + movl %eax,%esi + xorl %ecx,%edi + roll $5,%eax + addl %edi,%ebp + xorl %ebx,%esi + xorl %ecx,%ebx + addl %eax,%ebp + pxor %xmm10,%xmm6 + addl 32(%rsp),%edx + andl %ebx,%esi + xorl %ecx,%ebx + rorl $7,%eax + punpcklqdq %xmm5,%xmm3 + movl %ebp,%edi + xorl %ebx,%esi + pxor %xmm7,%xmm6 + roll $5,%ebp + addl %esi,%edx + movups 48(%r12),%xmm14 + xorps %xmm15,%xmm14 + movups %xmm2,32(%r13,%r12,1) + xorps %xmm14,%xmm2 + movups -80(%r15),%xmm1 +.byte 102,15,56,220,208 + movdqa %xmm12,%xmm13 + xorl %eax,%edi + paddd %xmm5,%xmm12 + xorl %ebx,%eax + pxor %xmm3,%xmm6 + addl %ebp,%edx + addl 36(%rsp),%ecx + andl %eax,%edi + xorl %ebx,%eax + rorl $7,%ebp + movdqa %xmm6,%xmm3 + movl %edx,%esi + xorl %eax,%edi + movdqa %xmm12,16(%rsp) + roll $5,%edx + addl %edi,%ecx + xorl %ebp,%esi + pslld $2,%xmm6 + xorl %eax,%ebp + addl %edx,%ecx + psrld $30,%xmm3 + addl 40(%rsp),%ebx + andl %ebp,%esi + xorl %eax,%ebp + por %xmm3,%xmm6 + rorl $7,%edx + movups -64(%r15),%xmm0 +.byte 102,15,56,220,209 + movl %ecx,%edi + xorl %ebp,%esi + roll $5,%ecx + pshufd $238,%xmm5,%xmm12 + addl %esi,%ebx + xorl %edx,%edi + xorl %ebp,%edx + addl %ecx,%ebx + addl 44(%rsp),%eax + andl %edx,%edi + xorl %ebp,%edx + rorl $7,%ecx + movl %ebx,%esi + xorl %edx,%edi + roll $5,%ebx + addl %edi,%eax + xorl %edx,%esi + addl %ebx,%eax + pxor %xmm11,%xmm7 + addl 48(%rsp),%ebp + movups -48(%r15),%xmm1 +.byte 102,15,56,220,208 + xorl %ecx,%esi + punpcklqdq %xmm6,%xmm12 + movl %eax,%edi + roll $5,%eax + pxor %xmm8,%xmm7 + addl %esi,%ebp + xorl %ecx,%edi + movdqa %xmm13,%xmm3 + rorl $7,%ebx + paddd %xmm6,%xmm13 + addl %eax,%ebp + pxor %xmm12,%xmm7 + addl 52(%rsp),%edx + xorl %ebx,%edi + movl %ebp,%esi + roll $5,%ebp + movdqa %xmm7,%xmm12 + addl %edi,%edx + xorl %ebx,%esi + movdqa %xmm13,32(%rsp) + rorl $7,%eax + addl %ebp,%edx + addl 56(%rsp),%ecx + pslld $2,%xmm7 + xorl %eax,%esi + movl %edx,%edi + psrld $30,%xmm12 + roll $5,%edx + addl %esi,%ecx + movups -32(%r15),%xmm0 +.byte 102,15,56,220,209 + xorl %eax,%edi + rorl $7,%ebp + por %xmm12,%xmm7 + addl %edx,%ecx + addl 60(%rsp),%ebx + xorl %ebp,%edi + movl %ecx,%esi + roll $5,%ecx + addl %edi,%ebx + xorl %ebp,%esi + rorl $7,%edx + addl %ecx,%ebx + addl 0(%rsp),%eax + xorl %edx,%esi + movl %ebx,%edi + roll $5,%ebx + paddd %xmm7,%xmm3 + addl %esi,%eax + xorl %edx,%edi + movdqa %xmm3,48(%rsp) + rorl $7,%ecx + addl %ebx,%eax + addl 4(%rsp),%ebp + movups -16(%r15),%xmm1 +.byte 102,15,56,220,208 + xorl %ecx,%edi + movl %eax,%esi + roll $5,%eax + addl %edi,%ebp + xorl %ecx,%esi + rorl $7,%ebx + addl %eax,%ebp + addl 8(%rsp),%edx + xorl %ebx,%esi + movl %ebp,%edi + roll $5,%ebp + addl %esi,%edx + xorl %ebx,%edi + rorl $7,%eax + addl %ebp,%edx + addl 12(%rsp),%ecx + xorl %eax,%edi + movl %edx,%esi + roll $5,%edx + addl %edi,%ecx + movups 0(%r15),%xmm0 +.byte 102,15,56,220,209 + xorl %eax,%esi + rorl $7,%ebp + addl %edx,%ecx + cmpq %r14,%r10 + je .Ldone_ssse3 + movdqa 64(%r11),%xmm3 + movdqa 0(%r11),%xmm13 + movdqu 0(%r10),%xmm4 + movdqu 16(%r10),%xmm5 + movdqu 32(%r10),%xmm6 + movdqu 48(%r10),%xmm7 +.byte 102,15,56,0,227 + addq $64,%r10 + addl 16(%rsp),%ebx + xorl %ebp,%esi + movl %ecx,%edi +.byte 102,15,56,0,235 + roll $5,%ecx + addl %esi,%ebx + xorl %ebp,%edi + rorl $7,%edx + paddd %xmm13,%xmm4 + addl %ecx,%ebx + addl 20(%rsp),%eax + xorl %edx,%edi + movl %ebx,%esi + movdqa %xmm4,0(%rsp) + roll $5,%ebx + addl %edi,%eax + xorl %edx,%esi + rorl $7,%ecx + psubd %xmm13,%xmm4 + addl %ebx,%eax + addl 24(%rsp),%ebp + movups 16(%r15),%xmm1 +.byte 102,15,56,220,208 + xorl %ecx,%esi + movl %eax,%edi + roll $5,%eax + addl %esi,%ebp + xorl %ecx,%edi + rorl $7,%ebx + addl %eax,%ebp + addl 28(%rsp),%edx + xorl %ebx,%edi + movl %ebp,%esi + roll $5,%ebp + addl %edi,%edx + xorl %ebx,%esi + rorl $7,%eax + addl %ebp,%edx + addl 32(%rsp),%ecx + xorl %eax,%esi + movl %edx,%edi +.byte 102,15,56,0,243 + roll $5,%edx + addl %esi,%ecx + movups 32(%r15),%xmm0 +.byte 102,15,56,220,209 + xorl %eax,%edi + rorl $7,%ebp + paddd %xmm13,%xmm5 + addl %edx,%ecx + addl 36(%rsp),%ebx + xorl %ebp,%edi + movl %ecx,%esi + movdqa %xmm5,16(%rsp) + roll $5,%ecx + addl %edi,%ebx + xorl %ebp,%esi + rorl $7,%edx + psubd %xmm13,%xmm5 + addl %ecx,%ebx + addl 40(%rsp),%eax + xorl %edx,%esi + movl %ebx,%edi + roll $5,%ebx + addl %esi,%eax + xorl %edx,%edi + rorl $7,%ecx + addl %ebx,%eax + addl 44(%rsp),%ebp + movups 48(%r15),%xmm1 +.byte 102,15,56,220,208 + xorl %ecx,%edi + movl %eax,%esi + roll $5,%eax + addl %edi,%ebp + xorl %ecx,%esi + rorl $7,%ebx + addl %eax,%ebp + addl 48(%rsp),%edx + xorl %ebx,%esi + movl %ebp,%edi +.byte 102,15,56,0,251 + roll $5,%ebp + addl %esi,%edx + xorl %ebx,%edi + rorl $7,%eax + paddd %xmm13,%xmm6 + addl %ebp,%edx + addl 52(%rsp),%ecx + xorl %eax,%edi + movl %edx,%esi + movdqa %xmm6,32(%rsp) + roll $5,%edx + addl %edi,%ecx + cmpl $11,%r8d + jb .Laesenclast4 + movups 64(%r15),%xmm0 +.byte 102,15,56,220,209 + movups 80(%r15),%xmm1 +.byte 102,15,56,220,208 + je .Laesenclast4 + movups 96(%r15),%xmm0 +.byte 102,15,56,220,209 + movups 112(%r15),%xmm1 +.byte 102,15,56,220,208 +.Laesenclast4: +.byte 102,15,56,221,209 + movups 16-112(%r15),%xmm0 + xorl %eax,%esi + rorl $7,%ebp + psubd %xmm13,%xmm6 + addl %edx,%ecx + addl 56(%rsp),%ebx + xorl %ebp,%esi + movl %ecx,%edi + roll $5,%ecx + addl %esi,%ebx + xorl %ebp,%edi + rorl $7,%edx + addl %ecx,%ebx + addl 60(%rsp),%eax + xorl %edx,%edi + movl %ebx,%esi + roll $5,%ebx + addl %edi,%eax + rorl $7,%ecx + addl %ebx,%eax + movups %xmm2,48(%r13,%r12,1) + leaq 64(%r12),%r12 + + addl 0(%r9),%eax + addl 4(%r9),%esi + addl 8(%r9),%ecx + addl 12(%r9),%edx + movl %eax,0(%r9) + addl 16(%r9),%ebp + movl %esi,4(%r9) + movl %esi,%ebx + movl %ecx,8(%r9) + movl %ecx,%edi + movl %edx,12(%r9) + xorl %edx,%edi + movl %ebp,16(%r9) + andl %edi,%esi + jmp .Loop_ssse3 + +.Ldone_ssse3: + addl 16(%rsp),%ebx + xorl %ebp,%esi + movl %ecx,%edi + roll $5,%ecx + addl %esi,%ebx + xorl %ebp,%edi + rorl $7,%edx + addl %ecx,%ebx + addl 20(%rsp),%eax + xorl %edx,%edi + movl %ebx,%esi + roll $5,%ebx + addl %edi,%eax + xorl %edx,%esi + rorl $7,%ecx + addl %ebx,%eax + addl 24(%rsp),%ebp + movups 16(%r15),%xmm1 +.byte 102,15,56,220,208 + xorl %ecx,%esi + movl %eax,%edi + roll $5,%eax + addl %esi,%ebp + xorl %ecx,%edi + rorl $7,%ebx + addl %eax,%ebp + addl 28(%rsp),%edx + xorl %ebx,%edi + movl %ebp,%esi + roll $5,%ebp + addl %edi,%edx + xorl %ebx,%esi + rorl $7,%eax + addl %ebp,%edx + addl 32(%rsp),%ecx + xorl %eax,%esi + movl %edx,%edi + roll $5,%edx + addl %esi,%ecx + movups 32(%r15),%xmm0 +.byte 102,15,56,220,209 + xorl %eax,%edi + rorl $7,%ebp + addl %edx,%ecx + addl 36(%rsp),%ebx + xorl %ebp,%edi + movl %ecx,%esi + roll $5,%ecx + addl %edi,%ebx + xorl %ebp,%esi + rorl $7,%edx + addl %ecx,%ebx + addl 40(%rsp),%eax + xorl %edx,%esi + movl %ebx,%edi + roll $5,%ebx + addl %esi,%eax + xorl %edx,%edi + rorl $7,%ecx + addl %ebx,%eax + addl 44(%rsp),%ebp + movups 48(%r15),%xmm1 +.byte 102,15,56,220,208 + xorl %ecx,%edi + movl %eax,%esi + roll $5,%eax + addl %edi,%ebp + xorl %ecx,%esi + rorl $7,%ebx + addl %eax,%ebp + addl 48(%rsp),%edx + xorl %ebx,%esi + movl %ebp,%edi + roll $5,%ebp + addl %esi,%edx + xorl %ebx,%edi + rorl $7,%eax + addl %ebp,%edx + addl 52(%rsp),%ecx + xorl %eax,%edi + movl %edx,%esi + roll $5,%edx + addl %edi,%ecx + cmpl $11,%r8d + jb .Laesenclast5 + movups 64(%r15),%xmm0 +.byte 102,15,56,220,209 + movups 80(%r15),%xmm1 +.byte 102,15,56,220,208 + je .Laesenclast5 + movups 96(%r15),%xmm0 +.byte 102,15,56,220,209 + movups 112(%r15),%xmm1 +.byte 102,15,56,220,208 +.Laesenclast5: +.byte 102,15,56,221,209 + movups 16-112(%r15),%xmm0 + xorl %eax,%esi + rorl $7,%ebp + addl %edx,%ecx + addl 56(%rsp),%ebx + xorl %ebp,%esi + movl %ecx,%edi + roll $5,%ecx + addl %esi,%ebx + xorl %ebp,%edi + rorl $7,%edx + addl %ecx,%ebx + addl 60(%rsp),%eax + xorl %edx,%edi + movl %ebx,%esi + roll $5,%ebx + addl %edi,%eax + rorl $7,%ecx + addl %ebx,%eax + movups %xmm2,48(%r13,%r12,1) + movq 88(%rsp),%r8 + + addl 0(%r9),%eax + addl 4(%r9),%esi + addl 8(%r9),%ecx + movl %eax,0(%r9) + addl 12(%r9),%edx + movl %esi,4(%r9) + addl 16(%r9),%ebp + movl %ecx,8(%r9) + movl %edx,12(%r9) + movl %ebp,16(%r9) + movups %xmm2,(%r8) + leaq 104(%rsp),%rsi + movq 0(%rsi),%r15 + movq 8(%rsi),%r14 + movq 16(%rsi),%r13 + movq 24(%rsi),%r12 + movq 32(%rsi),%rbp + movq 40(%rsi),%rbx + leaq 48(%rsi),%rsp +.Lepilogue_ssse3: + .byte 0xf3,0xc3 +.size aesni_cbc_sha1_enc_ssse3,.-aesni_cbc_sha1_enc_ssse3 +.align 64 +K_XX_XX: +.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 +.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 +.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc +.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 +.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f +.byte 0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0 + +.byte 65,69,83,78,73,45,67,66,67,43,83,72,65,49,32,115,116,105,116,99,104,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 64 +.type aesni_cbc_sha1_enc_shaext,@function +.align 32 +aesni_cbc_sha1_enc_shaext: + movq 8(%rsp),%r10 + movdqu (%r9),%xmm8 + movd 16(%r9),%xmm9 + movdqa K_XX_XX+80(%rip),%xmm7 + + movl 240(%rcx),%r11d + subq %rdi,%rsi + movups (%rcx),%xmm15 + movups 16(%rcx),%xmm0 + leaq 112(%rcx),%rcx + + pshufd $27,%xmm8,%xmm8 + pshufd $27,%xmm9,%xmm9 + jmp .Loop_shaext + +.align 16 +.Loop_shaext: + movups 0(%rdi),%xmm14 + xorps %xmm15,%xmm14 + xorps %xmm14,%xmm2 + movups -80(%rcx),%xmm1 +.byte 102,15,56,220,208 + movdqu (%r10),%xmm3 + movdqa %xmm9,%xmm12 +.byte 102,15,56,0,223 + movdqu 16(%r10),%xmm4 + movdqa %xmm8,%xmm11 + movups -64(%rcx),%xmm0 +.byte 102,15,56,220,209 +.byte 102,15,56,0,231 + + paddd %xmm3,%xmm9 + movdqu 32(%r10),%xmm5 + leaq 64(%r10),%r10 + pxor %xmm12,%xmm3 + movups -48(%rcx),%xmm1 +.byte 102,15,56,220,208 + pxor %xmm12,%xmm3 + movdqa %xmm8,%xmm10 +.byte 102,15,56,0,239 +.byte 69,15,58,204,193,0 +.byte 68,15,56,200,212 + movups -32(%rcx),%xmm0 +.byte 102,15,56,220,209 +.byte 15,56,201,220 + movdqu -16(%r10),%xmm6 + movdqa %xmm8,%xmm9 +.byte 102,15,56,0,247 + movups -16(%rcx),%xmm1 +.byte 102,15,56,220,208 +.byte 69,15,58,204,194,0 +.byte 68,15,56,200,205 + pxor %xmm5,%xmm3 +.byte 15,56,201,229 + movups 0(%rcx),%xmm0 +.byte 102,15,56,220,209 + movdqa %xmm8,%xmm10 +.byte 69,15,58,204,193,0 +.byte 68,15,56,200,214 + movups 16(%rcx),%xmm1 +.byte 102,15,56,220,208 +.byte 15,56,202,222 + pxor %xmm6,%xmm4 +.byte 15,56,201,238 + movups 32(%rcx),%xmm0 +.byte 102,15,56,220,209 + movdqa %xmm8,%xmm9 +.byte 69,15,58,204,194,0 +.byte 68,15,56,200,203 + movups 48(%rcx),%xmm1 +.byte 102,15,56,220,208 +.byte 15,56,202,227 + pxor %xmm3,%xmm5 +.byte 15,56,201,243 + cmpl $11,%r11d + jb .Laesenclast6 + movups 64(%rcx),%xmm0 +.byte 102,15,56,220,209 + movups 80(%rcx),%xmm1 +.byte 102,15,56,220,208 + je .Laesenclast6 + movups 96(%rcx),%xmm0 +.byte 102,15,56,220,209 + movups 112(%rcx),%xmm1 +.byte 102,15,56,220,208 +.Laesenclast6: +.byte 102,15,56,221,209 + movups 16-112(%rcx),%xmm0 + movdqa %xmm8,%xmm10 +.byte 69,15,58,204,193,0 +.byte 68,15,56,200,212 + movups 16(%rdi),%xmm14 + xorps %xmm15,%xmm14 + movups %xmm2,0(%rsi,%rdi,1) + xorps %xmm14,%xmm2 + movups -80(%rcx),%xmm1 +.byte 102,15,56,220,208 +.byte 15,56,202,236 + pxor %xmm4,%xmm6 +.byte 15,56,201,220 + movups -64(%rcx),%xmm0 +.byte 102,15,56,220,209 + movdqa %xmm8,%xmm9 +.byte 69,15,58,204,194,1 +.byte 68,15,56,200,205 + movups -48(%rcx),%xmm1 +.byte 102,15,56,220,208 +.byte 15,56,202,245 + pxor %xmm5,%xmm3 +.byte 15,56,201,229 + movups -32(%rcx),%xmm0 +.byte 102,15,56,220,209 + movdqa %xmm8,%xmm10 +.byte 69,15,58,204,193,1 +.byte 68,15,56,200,214 + movups -16(%rcx),%xmm1 +.byte 102,15,56,220,208 +.byte 15,56,202,222 + pxor %xmm6,%xmm4 +.byte 15,56,201,238 + movups 0(%rcx),%xmm0 +.byte 102,15,56,220,209 + movdqa %xmm8,%xmm9 +.byte 69,15,58,204,194,1 +.byte 68,15,56,200,203 + movups 16(%rcx),%xmm1 +.byte 102,15,56,220,208 +.byte 15,56,202,227 + pxor %xmm3,%xmm5 +.byte 15,56,201,243 + movups 32(%rcx),%xmm0 +.byte 102,15,56,220,209 + movdqa %xmm8,%xmm10 +.byte 69,15,58,204,193,1 +.byte 68,15,56,200,212 + movups 48(%rcx),%xmm1 +.byte 102,15,56,220,208 +.byte 15,56,202,236 + pxor %xmm4,%xmm6 +.byte 15,56,201,220 + cmpl $11,%r11d + jb .Laesenclast7 + movups 64(%rcx),%xmm0 +.byte 102,15,56,220,209 + movups 80(%rcx),%xmm1 +.byte 102,15,56,220,208 + je .Laesenclast7 + movups 96(%rcx),%xmm0 +.byte 102,15,56,220,209 + movups 112(%rcx),%xmm1 +.byte 102,15,56,220,208 +.Laesenclast7: +.byte 102,15,56,221,209 + movups 16-112(%rcx),%xmm0 + movdqa %xmm8,%xmm9 +.byte 69,15,58,204,194,1 +.byte 68,15,56,200,205 + movups 32(%rdi),%xmm14 + xorps %xmm15,%xmm14 + movups %xmm2,16(%rsi,%rdi,1) + xorps %xmm14,%xmm2 + movups -80(%rcx),%xmm1 +.byte 102,15,56,220,208 +.byte 15,56,202,245 + pxor %xmm5,%xmm3 +.byte 15,56,201,229 + movups -64(%rcx),%xmm0 +.byte 102,15,56,220,209 + movdqa %xmm8,%xmm10 +.byte 69,15,58,204,193,2 +.byte 68,15,56,200,214 + movups -48(%rcx),%xmm1 +.byte 102,15,56,220,208 +.byte 15,56,202,222 + pxor %xmm6,%xmm4 +.byte 15,56,201,238 + movups -32(%rcx),%xmm0 +.byte 102,15,56,220,209 + movdqa %xmm8,%xmm9 +.byte 69,15,58,204,194,2 +.byte 68,15,56,200,203 + movups -16(%rcx),%xmm1 +.byte 102,15,56,220,208 +.byte 15,56,202,227 + pxor %xmm3,%xmm5 +.byte 15,56,201,243 + movups 0(%rcx),%xmm0 +.byte 102,15,56,220,209 + movdqa %xmm8,%xmm10 +.byte 69,15,58,204,193,2 +.byte 68,15,56,200,212 + movups 16(%rcx),%xmm1 +.byte 102,15,56,220,208 +.byte 15,56,202,236 + pxor %xmm4,%xmm6 +.byte 15,56,201,220 + movups 32(%rcx),%xmm0 +.byte 102,15,56,220,209 + movdqa %xmm8,%xmm9 +.byte 69,15,58,204,194,2 +.byte 68,15,56,200,205 + movups 48(%rcx),%xmm1 +.byte 102,15,56,220,208 +.byte 15,56,202,245 + pxor %xmm5,%xmm3 +.byte 15,56,201,229 + cmpl $11,%r11d + jb .Laesenclast8 + movups 64(%rcx),%xmm0 +.byte 102,15,56,220,209 + movups 80(%rcx),%xmm1 +.byte 102,15,56,220,208 + je .Laesenclast8 + movups 96(%rcx),%xmm0 +.byte 102,15,56,220,209 + movups 112(%rcx),%xmm1 +.byte 102,15,56,220,208 +.Laesenclast8: +.byte 102,15,56,221,209 + movups 16-112(%rcx),%xmm0 + movdqa %xmm8,%xmm10 +.byte 69,15,58,204,193,2 +.byte 68,15,56,200,214 + movups 48(%rdi),%xmm14 + xorps %xmm15,%xmm14 + movups %xmm2,32(%rsi,%rdi,1) + xorps %xmm14,%xmm2 + movups -80(%rcx),%xmm1 +.byte 102,15,56,220,208 +.byte 15,56,202,222 + pxor %xmm6,%xmm4 +.byte 15,56,201,238 + movups -64(%rcx),%xmm0 +.byte 102,15,56,220,209 + movdqa %xmm8,%xmm9 +.byte 69,15,58,204,194,3 +.byte 68,15,56,200,203 + movups -48(%rcx),%xmm1 +.byte 102,15,56,220,208 +.byte 15,56,202,227 + pxor %xmm3,%xmm5 +.byte 15,56,201,243 + movups -32(%rcx),%xmm0 +.byte 102,15,56,220,209 + movdqa %xmm8,%xmm10 +.byte 69,15,58,204,193,3 +.byte 68,15,56,200,212 +.byte 15,56,202,236 + pxor %xmm4,%xmm6 + movups -16(%rcx),%xmm1 +.byte 102,15,56,220,208 + movdqa %xmm8,%xmm9 +.byte 69,15,58,204,194,3 +.byte 68,15,56,200,205 +.byte 15,56,202,245 + movups 0(%rcx),%xmm0 +.byte 102,15,56,220,209 + movdqa %xmm12,%xmm5 + movdqa %xmm8,%xmm10 +.byte 69,15,58,204,193,3 +.byte 68,15,56,200,214 + movups 16(%rcx),%xmm1 +.byte 102,15,56,220,208 + movdqa %xmm8,%xmm9 +.byte 69,15,58,204,194,3 +.byte 68,15,56,200,205 + movups 32(%rcx),%xmm0 +.byte 102,15,56,220,209 + movups 48(%rcx),%xmm1 +.byte 102,15,56,220,208 + cmpl $11,%r11d + jb .Laesenclast9 + movups 64(%rcx),%xmm0 +.byte 102,15,56,220,209 + movups 80(%rcx),%xmm1 +.byte 102,15,56,220,208 + je .Laesenclast9 + movups 96(%rcx),%xmm0 +.byte 102,15,56,220,209 + movups 112(%rcx),%xmm1 +.byte 102,15,56,220,208 +.Laesenclast9: +.byte 102,15,56,221,209 + movups 16-112(%rcx),%xmm0 + decq %rdx + + paddd %xmm11,%xmm8 + movups %xmm2,48(%rsi,%rdi,1) + leaq 64(%rdi),%rdi + jnz .Loop_shaext + + pshufd $27,%xmm8,%xmm8 + pshufd $27,%xmm9,%xmm9 + movups %xmm2,(%r8) + movdqu %xmm8,(%r9) + movd %xmm9,16(%r9) + .byte 0xf3,0xc3 +.size aesni_cbc_sha1_enc_shaext,.-aesni_cbc_sha1_enc_shaext diff --git a/secure/lib/libcrypto/amd64/aesni-sha256-x86_64.S b/secure/lib/libcrypto/amd64/aesni-sha256-x86_64.S new file mode 100644 index 0000000..a940892 --- /dev/null +++ b/secure/lib/libcrypto/amd64/aesni-sha256-x86_64.S @@ -0,0 +1,58 @@ + # $FreeBSD$ +.text + + +.globl aesni_cbc_sha256_enc +.type aesni_cbc_sha256_enc,@function +.align 16 +aesni_cbc_sha256_enc: + xorl %eax,%eax + cmpq $0,%rdi + je .Lprobe + ud2 +.Lprobe: + .byte 0xf3,0xc3 +.size aesni_cbc_sha256_enc,.-aesni_cbc_sha256_enc + +.align 64 +.type K256,@object +K256: +.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 +.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + +.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f +.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f +.long 0,0,0,0, 0,0,0,0, -1,-1,-1,-1 +.long 0,0,0,0, 0,0,0,0 +.byte 65,69,83,78,73,45,67,66,67,43,83,72,65,50,53,54,32,115,116,105,116,99,104,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 64 diff --git a/secure/lib/libcrypto/amd64/aesni-x86_64.S b/secure/lib/libcrypto/amd64/aesni-x86_64.S new file mode 100644 index 0000000..082a306 --- /dev/null +++ b/secure/lib/libcrypto/amd64/aesni-x86_64.S @@ -0,0 +1,3552 @@ + # $FreeBSD$ +.text + +.globl aesni_encrypt +.type aesni_encrypt,@function +.align 16 +aesni_encrypt: + movups (%rdi),%xmm2 + movl 240(%rdx),%eax + movups (%rdx),%xmm0 + movups 16(%rdx),%xmm1 + leaq 32(%rdx),%rdx + xorps %xmm0,%xmm2 +.Loop_enc1_1: +.byte 102,15,56,220,209 + decl %eax + movups (%rdx),%xmm1 + leaq 16(%rdx),%rdx + jnz .Loop_enc1_1 +.byte 102,15,56,221,209 + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + movups %xmm2,(%rsi) + pxor %xmm2,%xmm2 + .byte 0xf3,0xc3 +.size aesni_encrypt,.-aesni_encrypt + +.globl aesni_decrypt +.type aesni_decrypt,@function +.align 16 +aesni_decrypt: + movups (%rdi),%xmm2 + movl 240(%rdx),%eax + movups (%rdx),%xmm0 + movups 16(%rdx),%xmm1 + leaq 32(%rdx),%rdx + xorps %xmm0,%xmm2 +.Loop_dec1_2: +.byte 102,15,56,222,209 + decl %eax + movups (%rdx),%xmm1 + leaq 16(%rdx),%rdx + jnz .Loop_dec1_2 +.byte 102,15,56,223,209 + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + movups %xmm2,(%rsi) + pxor %xmm2,%xmm2 + .byte 0xf3,0xc3 +.size aesni_decrypt, .-aesni_decrypt +.type _aesni_encrypt2,@function +.align 16 +_aesni_encrypt2: + movups (%rcx),%xmm0 + shll $4,%eax + movups 16(%rcx),%xmm1 + xorps %xmm0,%xmm2 + xorps %xmm0,%xmm3 + movups 32(%rcx),%xmm0 + leaq 32(%rcx,%rax,1),%rcx + negq %rax + addq $16,%rax + +.Lenc_loop2: +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 + movups -16(%rcx,%rax,1),%xmm0 + jnz .Lenc_loop2 + +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,221,208 +.byte 102,15,56,221,216 + .byte 0xf3,0xc3 +.size _aesni_encrypt2,.-_aesni_encrypt2 +.type _aesni_decrypt2,@function +.align 16 +_aesni_decrypt2: + movups (%rcx),%xmm0 + shll $4,%eax + movups 16(%rcx),%xmm1 + xorps %xmm0,%xmm2 + xorps %xmm0,%xmm3 + movups 32(%rcx),%xmm0 + leaq 32(%rcx,%rax,1),%rcx + negq %rax + addq $16,%rax + +.Ldec_loop2: +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 + movups -16(%rcx,%rax,1),%xmm0 + jnz .Ldec_loop2 + +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,223,208 +.byte 102,15,56,223,216 + .byte 0xf3,0xc3 +.size _aesni_decrypt2,.-_aesni_decrypt2 +.type _aesni_encrypt3,@function +.align 16 +_aesni_encrypt3: + movups (%rcx),%xmm0 + shll $4,%eax + movups 16(%rcx),%xmm1 + xorps %xmm0,%xmm2 + xorps %xmm0,%xmm3 + xorps %xmm0,%xmm4 + movups 32(%rcx),%xmm0 + leaq 32(%rcx,%rax,1),%rcx + negq %rax + addq $16,%rax + +.Lenc_loop3: +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 +.byte 102,15,56,220,224 + movups -16(%rcx,%rax,1),%xmm0 + jnz .Lenc_loop3 + +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,221,208 +.byte 102,15,56,221,216 +.byte 102,15,56,221,224 + .byte 0xf3,0xc3 +.size _aesni_encrypt3,.-_aesni_encrypt3 +.type _aesni_decrypt3,@function +.align 16 +_aesni_decrypt3: + movups (%rcx),%xmm0 + shll $4,%eax + movups 16(%rcx),%xmm1 + xorps %xmm0,%xmm2 + xorps %xmm0,%xmm3 + xorps %xmm0,%xmm4 + movups 32(%rcx),%xmm0 + leaq 32(%rcx,%rax,1),%rcx + negq %rax + addq $16,%rax + +.Ldec_loop3: +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 +.byte 102,15,56,222,224 + movups -16(%rcx,%rax,1),%xmm0 + jnz .Ldec_loop3 + +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,223,208 +.byte 102,15,56,223,216 +.byte 102,15,56,223,224 + .byte 0xf3,0xc3 +.size _aesni_decrypt3,.-_aesni_decrypt3 +.type _aesni_encrypt4,@function +.align 16 +_aesni_encrypt4: + movups (%rcx),%xmm0 + shll $4,%eax + movups 16(%rcx),%xmm1 + xorps %xmm0,%xmm2 + xorps %xmm0,%xmm3 + xorps %xmm0,%xmm4 + xorps %xmm0,%xmm5 + movups 32(%rcx),%xmm0 + leaq 32(%rcx,%rax,1),%rcx + negq %rax +.byte 0x0f,0x1f,0x00 + addq $16,%rax + +.Lenc_loop4: +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 + movups -16(%rcx,%rax,1),%xmm0 + jnz .Lenc_loop4 + +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 +.byte 102,15,56,221,208 +.byte 102,15,56,221,216 +.byte 102,15,56,221,224 +.byte 102,15,56,221,232 + .byte 0xf3,0xc3 +.size _aesni_encrypt4,.-_aesni_encrypt4 +.type _aesni_decrypt4,@function +.align 16 +_aesni_decrypt4: + movups (%rcx),%xmm0 + shll $4,%eax + movups 16(%rcx),%xmm1 + xorps %xmm0,%xmm2 + xorps %xmm0,%xmm3 + xorps %xmm0,%xmm4 + xorps %xmm0,%xmm5 + movups 32(%rcx),%xmm0 + leaq 32(%rcx,%rax,1),%rcx + negq %rax +.byte 0x0f,0x1f,0x00 + addq $16,%rax + +.Ldec_loop4: +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 +.byte 102,15,56,222,224 +.byte 102,15,56,222,232 + movups -16(%rcx,%rax,1),%xmm0 + jnz .Ldec_loop4 + +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 +.byte 102,15,56,223,208 +.byte 102,15,56,223,216 +.byte 102,15,56,223,224 +.byte 102,15,56,223,232 + .byte 0xf3,0xc3 +.size _aesni_decrypt4,.-_aesni_decrypt4 +.type _aesni_encrypt6,@function +.align 16 +_aesni_encrypt6: + movups (%rcx),%xmm0 + shll $4,%eax + movups 16(%rcx),%xmm1 + xorps %xmm0,%xmm2 + pxor %xmm0,%xmm3 + pxor %xmm0,%xmm4 +.byte 102,15,56,220,209 + leaq 32(%rcx,%rax,1),%rcx + negq %rax +.byte 102,15,56,220,217 + pxor %xmm0,%xmm5 + pxor %xmm0,%xmm6 +.byte 102,15,56,220,225 + pxor %xmm0,%xmm7 + movups (%rcx,%rax,1),%xmm0 + addq $16,%rax + jmp .Lenc_loop6_enter +.align 16 +.Lenc_loop6: +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.Lenc_loop6_enter: +.byte 102,15,56,220,233 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 +.byte 102,15,56,220,240 +.byte 102,15,56,220,248 + movups -16(%rcx,%rax,1),%xmm0 + jnz .Lenc_loop6 + +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 +.byte 102,15,56,221,208 +.byte 102,15,56,221,216 +.byte 102,15,56,221,224 +.byte 102,15,56,221,232 +.byte 102,15,56,221,240 +.byte 102,15,56,221,248 + .byte 0xf3,0xc3 +.size _aesni_encrypt6,.-_aesni_encrypt6 +.type _aesni_decrypt6,@function +.align 16 +_aesni_decrypt6: + movups (%rcx),%xmm0 + shll $4,%eax + movups 16(%rcx),%xmm1 + xorps %xmm0,%xmm2 + pxor %xmm0,%xmm3 + pxor %xmm0,%xmm4 +.byte 102,15,56,222,209 + leaq 32(%rcx,%rax,1),%rcx + negq %rax +.byte 102,15,56,222,217 + pxor %xmm0,%xmm5 + pxor %xmm0,%xmm6 +.byte 102,15,56,222,225 + pxor %xmm0,%xmm7 + movups (%rcx,%rax,1),%xmm0 + addq $16,%rax + jmp .Ldec_loop6_enter +.align 16 +.Ldec_loop6: +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.Ldec_loop6_enter: +.byte 102,15,56,222,233 +.byte 102,15,56,222,241 +.byte 102,15,56,222,249 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 +.byte 102,15,56,222,224 +.byte 102,15,56,222,232 +.byte 102,15,56,222,240 +.byte 102,15,56,222,248 + movups -16(%rcx,%rax,1),%xmm0 + jnz .Ldec_loop6 + +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 +.byte 102,15,56,222,241 +.byte 102,15,56,222,249 +.byte 102,15,56,223,208 +.byte 102,15,56,223,216 +.byte 102,15,56,223,224 +.byte 102,15,56,223,232 +.byte 102,15,56,223,240 +.byte 102,15,56,223,248 + .byte 0xf3,0xc3 +.size _aesni_decrypt6,.-_aesni_decrypt6 +.type _aesni_encrypt8,@function +.align 16 +_aesni_encrypt8: + movups (%rcx),%xmm0 + shll $4,%eax + movups 16(%rcx),%xmm1 + xorps %xmm0,%xmm2 + xorps %xmm0,%xmm3 + pxor %xmm0,%xmm4 + pxor %xmm0,%xmm5 + pxor %xmm0,%xmm6 + leaq 32(%rcx,%rax,1),%rcx + negq %rax +.byte 102,15,56,220,209 + pxor %xmm0,%xmm7 + pxor %xmm0,%xmm8 +.byte 102,15,56,220,217 + pxor %xmm0,%xmm9 + movups (%rcx,%rax,1),%xmm0 + addq $16,%rax + jmp .Lenc_loop8_inner +.align 16 +.Lenc_loop8: +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.Lenc_loop8_inner: +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 +.byte 102,68,15,56,220,193 +.byte 102,68,15,56,220,201 +.Lenc_loop8_enter: + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 +.byte 102,15,56,220,240 +.byte 102,15,56,220,248 +.byte 102,68,15,56,220,192 +.byte 102,68,15,56,220,200 + movups -16(%rcx,%rax,1),%xmm0 + jnz .Lenc_loop8 + +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 +.byte 102,68,15,56,220,193 +.byte 102,68,15,56,220,201 +.byte 102,15,56,221,208 +.byte 102,15,56,221,216 +.byte 102,15,56,221,224 +.byte 102,15,56,221,232 +.byte 102,15,56,221,240 +.byte 102,15,56,221,248 +.byte 102,68,15,56,221,192 +.byte 102,68,15,56,221,200 + .byte 0xf3,0xc3 +.size _aesni_encrypt8,.-_aesni_encrypt8 +.type _aesni_decrypt8,@function +.align 16 +_aesni_decrypt8: + movups (%rcx),%xmm0 + shll $4,%eax + movups 16(%rcx),%xmm1 + xorps %xmm0,%xmm2 + xorps %xmm0,%xmm3 + pxor %xmm0,%xmm4 + pxor %xmm0,%xmm5 + pxor %xmm0,%xmm6 + leaq 32(%rcx,%rax,1),%rcx + negq %rax +.byte 102,15,56,222,209 + pxor %xmm0,%xmm7 + pxor %xmm0,%xmm8 +.byte 102,15,56,222,217 + pxor %xmm0,%xmm9 + movups (%rcx,%rax,1),%xmm0 + addq $16,%rax + jmp .Ldec_loop8_inner +.align 16 +.Ldec_loop8: +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.Ldec_loop8_inner: +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 +.byte 102,15,56,222,241 +.byte 102,15,56,222,249 +.byte 102,68,15,56,222,193 +.byte 102,68,15,56,222,201 +.Ldec_loop8_enter: + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 +.byte 102,15,56,222,224 +.byte 102,15,56,222,232 +.byte 102,15,56,222,240 +.byte 102,15,56,222,248 +.byte 102,68,15,56,222,192 +.byte 102,68,15,56,222,200 + movups -16(%rcx,%rax,1),%xmm0 + jnz .Ldec_loop8 + +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 +.byte 102,15,56,222,241 +.byte 102,15,56,222,249 +.byte 102,68,15,56,222,193 +.byte 102,68,15,56,222,201 +.byte 102,15,56,223,208 +.byte 102,15,56,223,216 +.byte 102,15,56,223,224 +.byte 102,15,56,223,232 +.byte 102,15,56,223,240 +.byte 102,15,56,223,248 +.byte 102,68,15,56,223,192 +.byte 102,68,15,56,223,200 + .byte 0xf3,0xc3 +.size _aesni_decrypt8,.-_aesni_decrypt8 +.globl aesni_ecb_encrypt +.type aesni_ecb_encrypt,@function +.align 16 +aesni_ecb_encrypt: + andq $-16,%rdx + jz .Lecb_ret + + movl 240(%rcx),%eax + movups (%rcx),%xmm0 + movq %rcx,%r11 + movl %eax,%r10d + testl %r8d,%r8d + jz .Lecb_decrypt + + cmpq $128,%rdx + jb .Lecb_enc_tail + + movdqu (%rdi),%xmm2 + movdqu 16(%rdi),%xmm3 + movdqu 32(%rdi),%xmm4 + movdqu 48(%rdi),%xmm5 + movdqu 64(%rdi),%xmm6 + movdqu 80(%rdi),%xmm7 + movdqu 96(%rdi),%xmm8 + movdqu 112(%rdi),%xmm9 + leaq 128(%rdi),%rdi + subq $128,%rdx + jmp .Lecb_enc_loop8_enter +.align 16 +.Lecb_enc_loop8: + movups %xmm2,(%rsi) + movq %r11,%rcx + movdqu (%rdi),%xmm2 + movl %r10d,%eax + movups %xmm3,16(%rsi) + movdqu 16(%rdi),%xmm3 + movups %xmm4,32(%rsi) + movdqu 32(%rdi),%xmm4 + movups %xmm5,48(%rsi) + movdqu 48(%rdi),%xmm5 + movups %xmm6,64(%rsi) + movdqu 64(%rdi),%xmm6 + movups %xmm7,80(%rsi) + movdqu 80(%rdi),%xmm7 + movups %xmm8,96(%rsi) + movdqu 96(%rdi),%xmm8 + movups %xmm9,112(%rsi) + leaq 128(%rsi),%rsi + movdqu 112(%rdi),%xmm9 + leaq 128(%rdi),%rdi +.Lecb_enc_loop8_enter: + + call _aesni_encrypt8 + + subq $128,%rdx + jnc .Lecb_enc_loop8 + + movups %xmm2,(%rsi) + movq %r11,%rcx + movups %xmm3,16(%rsi) + movl %r10d,%eax + movups %xmm4,32(%rsi) + movups %xmm5,48(%rsi) + movups %xmm6,64(%rsi) + movups %xmm7,80(%rsi) + movups %xmm8,96(%rsi) + movups %xmm9,112(%rsi) + leaq 128(%rsi),%rsi + addq $128,%rdx + jz .Lecb_ret + +.Lecb_enc_tail: + movups (%rdi),%xmm2 + cmpq $32,%rdx + jb .Lecb_enc_one + movups 16(%rdi),%xmm3 + je .Lecb_enc_two + movups 32(%rdi),%xmm4 + cmpq $64,%rdx + jb .Lecb_enc_three + movups 48(%rdi),%xmm5 + je .Lecb_enc_four + movups 64(%rdi),%xmm6 + cmpq $96,%rdx + jb .Lecb_enc_five + movups 80(%rdi),%xmm7 + je .Lecb_enc_six + movdqu 96(%rdi),%xmm8 + xorps %xmm9,%xmm9 + call _aesni_encrypt8 + movups %xmm2,(%rsi) + movups %xmm3,16(%rsi) + movups %xmm4,32(%rsi) + movups %xmm5,48(%rsi) + movups %xmm6,64(%rsi) + movups %xmm7,80(%rsi) + movups %xmm8,96(%rsi) + jmp .Lecb_ret +.align 16 +.Lecb_enc_one: + movups (%rcx),%xmm0 + movups 16(%rcx),%xmm1 + leaq 32(%rcx),%rcx + xorps %xmm0,%xmm2 +.Loop_enc1_3: +.byte 102,15,56,220,209 + decl %eax + movups (%rcx),%xmm1 + leaq 16(%rcx),%rcx + jnz .Loop_enc1_3 +.byte 102,15,56,221,209 + movups %xmm2,(%rsi) + jmp .Lecb_ret +.align 16 +.Lecb_enc_two: + call _aesni_encrypt2 + movups %xmm2,(%rsi) + movups %xmm3,16(%rsi) + jmp .Lecb_ret +.align 16 +.Lecb_enc_three: + call _aesni_encrypt3 + movups %xmm2,(%rsi) + movups %xmm3,16(%rsi) + movups %xmm4,32(%rsi) + jmp .Lecb_ret +.align 16 +.Lecb_enc_four: + call _aesni_encrypt4 + movups %xmm2,(%rsi) + movups %xmm3,16(%rsi) + movups %xmm4,32(%rsi) + movups %xmm5,48(%rsi) + jmp .Lecb_ret +.align 16 +.Lecb_enc_five: + xorps %xmm7,%xmm7 + call _aesni_encrypt6 + movups %xmm2,(%rsi) + movups %xmm3,16(%rsi) + movups %xmm4,32(%rsi) + movups %xmm5,48(%rsi) + movups %xmm6,64(%rsi) + jmp .Lecb_ret +.align 16 +.Lecb_enc_six: + call _aesni_encrypt6 + movups %xmm2,(%rsi) + movups %xmm3,16(%rsi) + movups %xmm4,32(%rsi) + movups %xmm5,48(%rsi) + movups %xmm6,64(%rsi) + movups %xmm7,80(%rsi) + jmp .Lecb_ret + +.align 16 +.Lecb_decrypt: + cmpq $128,%rdx + jb .Lecb_dec_tail + + movdqu (%rdi),%xmm2 + movdqu 16(%rdi),%xmm3 + movdqu 32(%rdi),%xmm4 + movdqu 48(%rdi),%xmm5 + movdqu 64(%rdi),%xmm6 + movdqu 80(%rdi),%xmm7 + movdqu 96(%rdi),%xmm8 + movdqu 112(%rdi),%xmm9 + leaq 128(%rdi),%rdi + subq $128,%rdx + jmp .Lecb_dec_loop8_enter +.align 16 +.Lecb_dec_loop8: + movups %xmm2,(%rsi) + movq %r11,%rcx + movdqu (%rdi),%xmm2 + movl %r10d,%eax + movups %xmm3,16(%rsi) + movdqu 16(%rdi),%xmm3 + movups %xmm4,32(%rsi) + movdqu 32(%rdi),%xmm4 + movups %xmm5,48(%rsi) + movdqu 48(%rdi),%xmm5 + movups %xmm6,64(%rsi) + movdqu 64(%rdi),%xmm6 + movups %xmm7,80(%rsi) + movdqu 80(%rdi),%xmm7 + movups %xmm8,96(%rsi) + movdqu 96(%rdi),%xmm8 + movups %xmm9,112(%rsi) + leaq 128(%rsi),%rsi + movdqu 112(%rdi),%xmm9 + leaq 128(%rdi),%rdi +.Lecb_dec_loop8_enter: + + call _aesni_decrypt8 + + movups (%r11),%xmm0 + subq $128,%rdx + jnc .Lecb_dec_loop8 + + movups %xmm2,(%rsi) + pxor %xmm2,%xmm2 + movq %r11,%rcx + movups %xmm3,16(%rsi) + pxor %xmm3,%xmm3 + movl %r10d,%eax + movups %xmm4,32(%rsi) + pxor %xmm4,%xmm4 + movups %xmm5,48(%rsi) + pxor %xmm5,%xmm5 + movups %xmm6,64(%rsi) + pxor %xmm6,%xmm6 + movups %xmm7,80(%rsi) + pxor %xmm7,%xmm7 + movups %xmm8,96(%rsi) + pxor %xmm8,%xmm8 + movups %xmm9,112(%rsi) + pxor %xmm9,%xmm9 + leaq 128(%rsi),%rsi + addq $128,%rdx + jz .Lecb_ret + +.Lecb_dec_tail: + movups (%rdi),%xmm2 + cmpq $32,%rdx + jb .Lecb_dec_one + movups 16(%rdi),%xmm3 + je .Lecb_dec_two + movups 32(%rdi),%xmm4 + cmpq $64,%rdx + jb .Lecb_dec_three + movups 48(%rdi),%xmm5 + je .Lecb_dec_four + movups 64(%rdi),%xmm6 + cmpq $96,%rdx + jb .Lecb_dec_five + movups 80(%rdi),%xmm7 + je .Lecb_dec_six + movups 96(%rdi),%xmm8 + movups (%rcx),%xmm0 + xorps %xmm9,%xmm9 + call _aesni_decrypt8 + movups %xmm2,(%rsi) + pxor %xmm2,%xmm2 + movups %xmm3,16(%rsi) + pxor %xmm3,%xmm3 + movups %xmm4,32(%rsi) + pxor %xmm4,%xmm4 + movups %xmm5,48(%rsi) + pxor %xmm5,%xmm5 + movups %xmm6,64(%rsi) + pxor %xmm6,%xmm6 + movups %xmm7,80(%rsi) + pxor %xmm7,%xmm7 + movups %xmm8,96(%rsi) + pxor %xmm8,%xmm8 + pxor %xmm9,%xmm9 + jmp .Lecb_ret +.align 16 +.Lecb_dec_one: + movups (%rcx),%xmm0 + movups 16(%rcx),%xmm1 + leaq 32(%rcx),%rcx + xorps %xmm0,%xmm2 +.Loop_dec1_4: +.byte 102,15,56,222,209 + decl %eax + movups (%rcx),%xmm1 + leaq 16(%rcx),%rcx + jnz .Loop_dec1_4 +.byte 102,15,56,223,209 + movups %xmm2,(%rsi) + pxor %xmm2,%xmm2 + jmp .Lecb_ret +.align 16 +.Lecb_dec_two: + call _aesni_decrypt2 + movups %xmm2,(%rsi) + pxor %xmm2,%xmm2 + movups %xmm3,16(%rsi) + pxor %xmm3,%xmm3 + jmp .Lecb_ret +.align 16 +.Lecb_dec_three: + call _aesni_decrypt3 + movups %xmm2,(%rsi) + pxor %xmm2,%xmm2 + movups %xmm3,16(%rsi) + pxor %xmm3,%xmm3 + movups %xmm4,32(%rsi) + pxor %xmm4,%xmm4 + jmp .Lecb_ret +.align 16 +.Lecb_dec_four: + call _aesni_decrypt4 + movups %xmm2,(%rsi) + pxor %xmm2,%xmm2 + movups %xmm3,16(%rsi) + pxor %xmm3,%xmm3 + movups %xmm4,32(%rsi) + pxor %xmm4,%xmm4 + movups %xmm5,48(%rsi) + pxor %xmm5,%xmm5 + jmp .Lecb_ret +.align 16 +.Lecb_dec_five: + xorps %xmm7,%xmm7 + call _aesni_decrypt6 + movups %xmm2,(%rsi) + pxor %xmm2,%xmm2 + movups %xmm3,16(%rsi) + pxor %xmm3,%xmm3 + movups %xmm4,32(%rsi) + pxor %xmm4,%xmm4 + movups %xmm5,48(%rsi) + pxor %xmm5,%xmm5 + movups %xmm6,64(%rsi) + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + jmp .Lecb_ret +.align 16 +.Lecb_dec_six: + call _aesni_decrypt6 + movups %xmm2,(%rsi) + pxor %xmm2,%xmm2 + movups %xmm3,16(%rsi) + pxor %xmm3,%xmm3 + movups %xmm4,32(%rsi) + pxor %xmm4,%xmm4 + movups %xmm5,48(%rsi) + pxor %xmm5,%xmm5 + movups %xmm6,64(%rsi) + pxor %xmm6,%xmm6 + movups %xmm7,80(%rsi) + pxor %xmm7,%xmm7 + +.Lecb_ret: + xorps %xmm0,%xmm0 + pxor %xmm1,%xmm1 + .byte 0xf3,0xc3 +.size aesni_ecb_encrypt,.-aesni_ecb_encrypt +.globl aesni_ccm64_encrypt_blocks +.type aesni_ccm64_encrypt_blocks,@function +.align 16 +aesni_ccm64_encrypt_blocks: + movl 240(%rcx),%eax + movdqu (%r8),%xmm6 + movdqa .Lincrement64(%rip),%xmm9 + movdqa .Lbswap_mask(%rip),%xmm7 + + shll $4,%eax + movl $16,%r10d + leaq 0(%rcx),%r11 + movdqu (%r9),%xmm3 + movdqa %xmm6,%xmm2 + leaq 32(%rcx,%rax,1),%rcx +.byte 102,15,56,0,247 + subq %rax,%r10 + jmp .Lccm64_enc_outer +.align 16 +.Lccm64_enc_outer: + movups (%r11),%xmm0 + movq %r10,%rax + movups (%rdi),%xmm8 + + xorps %xmm0,%xmm2 + movups 16(%r11),%xmm1 + xorps %xmm8,%xmm0 + xorps %xmm0,%xmm3 + movups 32(%r11),%xmm0 + +.Lccm64_enc2_loop: +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 + movups -16(%rcx,%rax,1),%xmm0 + jnz .Lccm64_enc2_loop +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 + paddq %xmm9,%xmm6 + decq %rdx +.byte 102,15,56,221,208 +.byte 102,15,56,221,216 + + leaq 16(%rdi),%rdi + xorps %xmm2,%xmm8 + movdqa %xmm6,%xmm2 + movups %xmm8,(%rsi) +.byte 102,15,56,0,215 + leaq 16(%rsi),%rsi + jnz .Lccm64_enc_outer + + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + movups %xmm3,(%r9) + pxor %xmm3,%xmm3 + pxor %xmm8,%xmm8 + pxor %xmm6,%xmm6 + .byte 0xf3,0xc3 +.size aesni_ccm64_encrypt_blocks,.-aesni_ccm64_encrypt_blocks +.globl aesni_ccm64_decrypt_blocks +.type aesni_ccm64_decrypt_blocks,@function +.align 16 +aesni_ccm64_decrypt_blocks: + movl 240(%rcx),%eax + movups (%r8),%xmm6 + movdqu (%r9),%xmm3 + movdqa .Lincrement64(%rip),%xmm9 + movdqa .Lbswap_mask(%rip),%xmm7 + + movaps %xmm6,%xmm2 + movl %eax,%r10d + movq %rcx,%r11 +.byte 102,15,56,0,247 + movups (%rcx),%xmm0 + movups 16(%rcx),%xmm1 + leaq 32(%rcx),%rcx + xorps %xmm0,%xmm2 +.Loop_enc1_5: +.byte 102,15,56,220,209 + decl %eax + movups (%rcx),%xmm1 + leaq 16(%rcx),%rcx + jnz .Loop_enc1_5 +.byte 102,15,56,221,209 + shll $4,%r10d + movl $16,%eax + movups (%rdi),%xmm8 + paddq %xmm9,%xmm6 + leaq 16(%rdi),%rdi + subq %r10,%rax + leaq 32(%r11,%r10,1),%rcx + movq %rax,%r10 + jmp .Lccm64_dec_outer +.align 16 +.Lccm64_dec_outer: + xorps %xmm2,%xmm8 + movdqa %xmm6,%xmm2 + movups %xmm8,(%rsi) + leaq 16(%rsi),%rsi +.byte 102,15,56,0,215 + + subq $1,%rdx + jz .Lccm64_dec_break + + movups (%r11),%xmm0 + movq %r10,%rax + movups 16(%r11),%xmm1 + xorps %xmm0,%xmm8 + xorps %xmm0,%xmm2 + xorps %xmm8,%xmm3 + movups 32(%r11),%xmm0 + jmp .Lccm64_dec2_loop +.align 16 +.Lccm64_dec2_loop: +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 + movups -16(%rcx,%rax,1),%xmm0 + jnz .Lccm64_dec2_loop + movups (%rdi),%xmm8 + paddq %xmm9,%xmm6 +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,221,208 +.byte 102,15,56,221,216 + leaq 16(%rdi),%rdi + jmp .Lccm64_dec_outer + +.align 16 +.Lccm64_dec_break: + + movl 240(%r11),%eax + movups (%r11),%xmm0 + movups 16(%r11),%xmm1 + xorps %xmm0,%xmm8 + leaq 32(%r11),%r11 + xorps %xmm8,%xmm3 +.Loop_enc1_6: +.byte 102,15,56,220,217 + decl %eax + movups (%r11),%xmm1 + leaq 16(%r11),%r11 + jnz .Loop_enc1_6 +.byte 102,15,56,221,217 + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + movups %xmm3,(%r9) + pxor %xmm3,%xmm3 + pxor %xmm8,%xmm8 + pxor %xmm6,%xmm6 + .byte 0xf3,0xc3 +.size aesni_ccm64_decrypt_blocks,.-aesni_ccm64_decrypt_blocks +.globl aesni_ctr32_encrypt_blocks +.type aesni_ctr32_encrypt_blocks,@function +.align 16 +aesni_ctr32_encrypt_blocks: + cmpq $1,%rdx + jne .Lctr32_bulk + + + + movups (%r8),%xmm2 + movups (%rdi),%xmm3 + movl 240(%rcx),%edx + movups (%rcx),%xmm0 + movups 16(%rcx),%xmm1 + leaq 32(%rcx),%rcx + xorps %xmm0,%xmm2 +.Loop_enc1_7: +.byte 102,15,56,220,209 + decl %edx + movups (%rcx),%xmm1 + leaq 16(%rcx),%rcx + jnz .Loop_enc1_7 +.byte 102,15,56,221,209 + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + xorps %xmm3,%xmm2 + pxor %xmm3,%xmm3 + movups %xmm2,(%rsi) + xorps %xmm2,%xmm2 + jmp .Lctr32_epilogue + +.align 16 +.Lctr32_bulk: + leaq (%rsp),%rax + pushq %rbp + subq $128,%rsp + andq $-16,%rsp + leaq -8(%rax),%rbp + + + + + movdqu (%r8),%xmm2 + movdqu (%rcx),%xmm0 + movl 12(%r8),%r8d + pxor %xmm0,%xmm2 + movl 12(%rcx),%r11d + movdqa %xmm2,0(%rsp) + bswapl %r8d + movdqa %xmm2,%xmm3 + movdqa %xmm2,%xmm4 + movdqa %xmm2,%xmm5 + movdqa %xmm2,64(%rsp) + movdqa %xmm2,80(%rsp) + movdqa %xmm2,96(%rsp) + movq %rdx,%r10 + movdqa %xmm2,112(%rsp) + + leaq 1(%r8),%rax + leaq 2(%r8),%rdx + bswapl %eax + bswapl %edx + xorl %r11d,%eax + xorl %r11d,%edx +.byte 102,15,58,34,216,3 + leaq 3(%r8),%rax + movdqa %xmm3,16(%rsp) +.byte 102,15,58,34,226,3 + bswapl %eax + movq %r10,%rdx + leaq 4(%r8),%r10 + movdqa %xmm4,32(%rsp) + xorl %r11d,%eax + bswapl %r10d +.byte 102,15,58,34,232,3 + xorl %r11d,%r10d + movdqa %xmm5,48(%rsp) + leaq 5(%r8),%r9 + movl %r10d,64+12(%rsp) + bswapl %r9d + leaq 6(%r8),%r10 + movl 240(%rcx),%eax + xorl %r11d,%r9d + bswapl %r10d + movl %r9d,80+12(%rsp) + xorl %r11d,%r10d + leaq 7(%r8),%r9 + movl %r10d,96+12(%rsp) + bswapl %r9d + movl OPENSSL_ia32cap_P+4(%rip),%r10d + xorl %r11d,%r9d + andl $71303168,%r10d + movl %r9d,112+12(%rsp) + + movups 16(%rcx),%xmm1 + + movdqa 64(%rsp),%xmm6 + movdqa 80(%rsp),%xmm7 + + cmpq $8,%rdx + jb .Lctr32_tail + + subq $6,%rdx + cmpl $4194304,%r10d + je .Lctr32_6x + + leaq 128(%rcx),%rcx + subq $2,%rdx + jmp .Lctr32_loop8 + +.align 16 +.Lctr32_6x: + shll $4,%eax + movl $48,%r10d + bswapl %r11d + leaq 32(%rcx,%rax,1),%rcx + subq %rax,%r10 + jmp .Lctr32_loop6 + +.align 16 +.Lctr32_loop6: + addl $6,%r8d + movups -48(%rcx,%r10,1),%xmm0 +.byte 102,15,56,220,209 + movl %r8d,%eax + xorl %r11d,%eax +.byte 102,15,56,220,217 +.byte 0x0f,0x38,0xf1,0x44,0x24,12 + leal 1(%r8),%eax +.byte 102,15,56,220,225 + xorl %r11d,%eax +.byte 0x0f,0x38,0xf1,0x44,0x24,28 +.byte 102,15,56,220,233 + leal 2(%r8),%eax + xorl %r11d,%eax +.byte 102,15,56,220,241 +.byte 0x0f,0x38,0xf1,0x44,0x24,44 + leal 3(%r8),%eax +.byte 102,15,56,220,249 + movups -32(%rcx,%r10,1),%xmm1 + xorl %r11d,%eax + +.byte 102,15,56,220,208 +.byte 0x0f,0x38,0xf1,0x44,0x24,60 + leal 4(%r8),%eax +.byte 102,15,56,220,216 + xorl %r11d,%eax +.byte 0x0f,0x38,0xf1,0x44,0x24,76 +.byte 102,15,56,220,224 + leal 5(%r8),%eax + xorl %r11d,%eax +.byte 102,15,56,220,232 +.byte 0x0f,0x38,0xf1,0x44,0x24,92 + movq %r10,%rax +.byte 102,15,56,220,240 +.byte 102,15,56,220,248 + movups -16(%rcx,%r10,1),%xmm0 + + call .Lenc_loop6 + + movdqu (%rdi),%xmm8 + movdqu 16(%rdi),%xmm9 + movdqu 32(%rdi),%xmm10 + movdqu 48(%rdi),%xmm11 + movdqu 64(%rdi),%xmm12 + movdqu 80(%rdi),%xmm13 + leaq 96(%rdi),%rdi + movups -64(%rcx,%r10,1),%xmm1 + pxor %xmm2,%xmm8 + movaps 0(%rsp),%xmm2 + pxor %xmm3,%xmm9 + movaps 16(%rsp),%xmm3 + pxor %xmm4,%xmm10 + movaps 32(%rsp),%xmm4 + pxor %xmm5,%xmm11 + movaps 48(%rsp),%xmm5 + pxor %xmm6,%xmm12 + movaps 64(%rsp),%xmm6 + pxor %xmm7,%xmm13 + movaps 80(%rsp),%xmm7 + movdqu %xmm8,(%rsi) + movdqu %xmm9,16(%rsi) + movdqu %xmm10,32(%rsi) + movdqu %xmm11,48(%rsi) + movdqu %xmm12,64(%rsi) + movdqu %xmm13,80(%rsi) + leaq 96(%rsi),%rsi + + subq $6,%rdx + jnc .Lctr32_loop6 + + addq $6,%rdx + jz .Lctr32_done + + leal -48(%r10),%eax + leaq -80(%rcx,%r10,1),%rcx + negl %eax + shrl $4,%eax + jmp .Lctr32_tail + +.align 32 +.Lctr32_loop8: + addl $8,%r8d + movdqa 96(%rsp),%xmm8 +.byte 102,15,56,220,209 + movl %r8d,%r9d + movdqa 112(%rsp),%xmm9 +.byte 102,15,56,220,217 + bswapl %r9d + movups 32-128(%rcx),%xmm0 +.byte 102,15,56,220,225 + xorl %r11d,%r9d + nop +.byte 102,15,56,220,233 + movl %r9d,0+12(%rsp) + leaq 1(%r8),%r9 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 +.byte 102,68,15,56,220,193 +.byte 102,68,15,56,220,201 + movups 48-128(%rcx),%xmm1 + bswapl %r9d +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 + xorl %r11d,%r9d +.byte 0x66,0x90 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 + movl %r9d,16+12(%rsp) + leaq 2(%r8),%r9 +.byte 102,15,56,220,240 +.byte 102,15,56,220,248 +.byte 102,68,15,56,220,192 +.byte 102,68,15,56,220,200 + movups 64-128(%rcx),%xmm0 + bswapl %r9d +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 + xorl %r11d,%r9d +.byte 0x66,0x90 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 + movl %r9d,32+12(%rsp) + leaq 3(%r8),%r9 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 +.byte 102,68,15,56,220,193 +.byte 102,68,15,56,220,201 + movups 80-128(%rcx),%xmm1 + bswapl %r9d +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 + xorl %r11d,%r9d +.byte 0x66,0x90 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 + movl %r9d,48+12(%rsp) + leaq 4(%r8),%r9 +.byte 102,15,56,220,240 +.byte 102,15,56,220,248 +.byte 102,68,15,56,220,192 +.byte 102,68,15,56,220,200 + movups 96-128(%rcx),%xmm0 + bswapl %r9d +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 + xorl %r11d,%r9d +.byte 0x66,0x90 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 + movl %r9d,64+12(%rsp) + leaq 5(%r8),%r9 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 +.byte 102,68,15,56,220,193 +.byte 102,68,15,56,220,201 + movups 112-128(%rcx),%xmm1 + bswapl %r9d +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 + xorl %r11d,%r9d +.byte 0x66,0x90 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 + movl %r9d,80+12(%rsp) + leaq 6(%r8),%r9 +.byte 102,15,56,220,240 +.byte 102,15,56,220,248 +.byte 102,68,15,56,220,192 +.byte 102,68,15,56,220,200 + movups 128-128(%rcx),%xmm0 + bswapl %r9d +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 + xorl %r11d,%r9d +.byte 0x66,0x90 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 + movl %r9d,96+12(%rsp) + leaq 7(%r8),%r9 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 +.byte 102,68,15,56,220,193 +.byte 102,68,15,56,220,201 + movups 144-128(%rcx),%xmm1 + bswapl %r9d +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 +.byte 102,15,56,220,224 + xorl %r11d,%r9d + movdqu 0(%rdi),%xmm10 +.byte 102,15,56,220,232 + movl %r9d,112+12(%rsp) + cmpl $11,%eax +.byte 102,15,56,220,240 +.byte 102,15,56,220,248 +.byte 102,68,15,56,220,192 +.byte 102,68,15,56,220,200 + movups 160-128(%rcx),%xmm0 + + jb .Lctr32_enc_done + +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 +.byte 102,68,15,56,220,193 +.byte 102,68,15,56,220,201 + movups 176-128(%rcx),%xmm1 + +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 +.byte 102,15,56,220,240 +.byte 102,15,56,220,248 +.byte 102,68,15,56,220,192 +.byte 102,68,15,56,220,200 + movups 192-128(%rcx),%xmm0 + je .Lctr32_enc_done + +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 +.byte 102,68,15,56,220,193 +.byte 102,68,15,56,220,201 + movups 208-128(%rcx),%xmm1 + +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 +.byte 102,15,56,220,240 +.byte 102,15,56,220,248 +.byte 102,68,15,56,220,192 +.byte 102,68,15,56,220,200 + movups 224-128(%rcx),%xmm0 + jmp .Lctr32_enc_done + +.align 16 +.Lctr32_enc_done: + movdqu 16(%rdi),%xmm11 + pxor %xmm0,%xmm10 + movdqu 32(%rdi),%xmm12 + pxor %xmm0,%xmm11 + movdqu 48(%rdi),%xmm13 + pxor %xmm0,%xmm12 + movdqu 64(%rdi),%xmm14 + pxor %xmm0,%xmm13 + movdqu 80(%rdi),%xmm15 + pxor %xmm0,%xmm14 + pxor %xmm0,%xmm15 +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 +.byte 102,68,15,56,220,193 +.byte 102,68,15,56,220,201 + movdqu 96(%rdi),%xmm1 + leaq 128(%rdi),%rdi + +.byte 102,65,15,56,221,210 + pxor %xmm0,%xmm1 + movdqu 112-128(%rdi),%xmm10 +.byte 102,65,15,56,221,219 + pxor %xmm0,%xmm10 + movdqa 0(%rsp),%xmm11 +.byte 102,65,15,56,221,228 +.byte 102,65,15,56,221,237 + movdqa 16(%rsp),%xmm12 + movdqa 32(%rsp),%xmm13 +.byte 102,65,15,56,221,246 +.byte 102,65,15,56,221,255 + movdqa 48(%rsp),%xmm14 + movdqa 64(%rsp),%xmm15 +.byte 102,68,15,56,221,193 + movdqa 80(%rsp),%xmm0 + movups 16-128(%rcx),%xmm1 +.byte 102,69,15,56,221,202 + + movups %xmm2,(%rsi) + movdqa %xmm11,%xmm2 + movups %xmm3,16(%rsi) + movdqa %xmm12,%xmm3 + movups %xmm4,32(%rsi) + movdqa %xmm13,%xmm4 + movups %xmm5,48(%rsi) + movdqa %xmm14,%xmm5 + movups %xmm6,64(%rsi) + movdqa %xmm15,%xmm6 + movups %xmm7,80(%rsi) + movdqa %xmm0,%xmm7 + movups %xmm8,96(%rsi) + movups %xmm9,112(%rsi) + leaq 128(%rsi),%rsi + + subq $8,%rdx + jnc .Lctr32_loop8 + + addq $8,%rdx + jz .Lctr32_done + leaq -128(%rcx),%rcx + +.Lctr32_tail: + + + leaq 16(%rcx),%rcx + cmpq $4,%rdx + jb .Lctr32_loop3 + je .Lctr32_loop4 + + + shll $4,%eax + movdqa 96(%rsp),%xmm8 + pxor %xmm9,%xmm9 + + movups 16(%rcx),%xmm0 +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 + leaq 32-16(%rcx,%rax,1),%rcx + negq %rax +.byte 102,15,56,220,225 + addq $16,%rax + movups (%rdi),%xmm10 +.byte 102,15,56,220,233 +.byte 102,15,56,220,241 + movups 16(%rdi),%xmm11 + movups 32(%rdi),%xmm12 +.byte 102,15,56,220,249 +.byte 102,68,15,56,220,193 + + call .Lenc_loop8_enter + + movdqu 48(%rdi),%xmm13 + pxor %xmm10,%xmm2 + movdqu 64(%rdi),%xmm10 + pxor %xmm11,%xmm3 + movdqu %xmm2,(%rsi) + pxor %xmm12,%xmm4 + movdqu %xmm3,16(%rsi) + pxor %xmm13,%xmm5 + movdqu %xmm4,32(%rsi) + pxor %xmm10,%xmm6 + movdqu %xmm5,48(%rsi) + movdqu %xmm6,64(%rsi) + cmpq $6,%rdx + jb .Lctr32_done + + movups 80(%rdi),%xmm11 + xorps %xmm11,%xmm7 + movups %xmm7,80(%rsi) + je .Lctr32_done + + movups 96(%rdi),%xmm12 + xorps %xmm12,%xmm8 + movups %xmm8,96(%rsi) + jmp .Lctr32_done + +.align 32 +.Lctr32_loop4: +.byte 102,15,56,220,209 + leaq 16(%rcx),%rcx + decl %eax +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 + movups (%rcx),%xmm1 + jnz .Lctr32_loop4 +.byte 102,15,56,221,209 +.byte 102,15,56,221,217 + movups (%rdi),%xmm10 + movups 16(%rdi),%xmm11 +.byte 102,15,56,221,225 +.byte 102,15,56,221,233 + movups 32(%rdi),%xmm12 + movups 48(%rdi),%xmm13 + + xorps %xmm10,%xmm2 + movups %xmm2,(%rsi) + xorps %xmm11,%xmm3 + movups %xmm3,16(%rsi) + pxor %xmm12,%xmm4 + movdqu %xmm4,32(%rsi) + pxor %xmm13,%xmm5 + movdqu %xmm5,48(%rsi) + jmp .Lctr32_done + +.align 32 +.Lctr32_loop3: +.byte 102,15,56,220,209 + leaq 16(%rcx),%rcx + decl %eax +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 + movups (%rcx),%xmm1 + jnz .Lctr32_loop3 +.byte 102,15,56,221,209 +.byte 102,15,56,221,217 +.byte 102,15,56,221,225 + + movups (%rdi),%xmm10 + xorps %xmm10,%xmm2 + movups %xmm2,(%rsi) + cmpq $2,%rdx + jb .Lctr32_done + + movups 16(%rdi),%xmm11 + xorps %xmm11,%xmm3 + movups %xmm3,16(%rsi) + je .Lctr32_done + + movups 32(%rdi),%xmm12 + xorps %xmm12,%xmm4 + movups %xmm4,32(%rsi) + +.Lctr32_done: + xorps %xmm0,%xmm0 + xorl %r11d,%r11d + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + movaps %xmm0,0(%rsp) + pxor %xmm8,%xmm8 + movaps %xmm0,16(%rsp) + pxor %xmm9,%xmm9 + movaps %xmm0,32(%rsp) + pxor %xmm10,%xmm10 + movaps %xmm0,48(%rsp) + pxor %xmm11,%xmm11 + movaps %xmm0,64(%rsp) + pxor %xmm12,%xmm12 + movaps %xmm0,80(%rsp) + pxor %xmm13,%xmm13 + movaps %xmm0,96(%rsp) + pxor %xmm14,%xmm14 + movaps %xmm0,112(%rsp) + pxor %xmm15,%xmm15 + leaq (%rbp),%rsp + popq %rbp +.Lctr32_epilogue: + .byte 0xf3,0xc3 +.size aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks +.globl aesni_xts_encrypt +.type aesni_xts_encrypt,@function +.align 16 +aesni_xts_encrypt: + leaq (%rsp),%rax + pushq %rbp + subq $112,%rsp + andq $-16,%rsp + leaq -8(%rax),%rbp + movups (%r9),%xmm2 + movl 240(%r8),%eax + movl 240(%rcx),%r10d + movups (%r8),%xmm0 + movups 16(%r8),%xmm1 + leaq 32(%r8),%r8 + xorps %xmm0,%xmm2 +.Loop_enc1_8: +.byte 102,15,56,220,209 + decl %eax + movups (%r8),%xmm1 + leaq 16(%r8),%r8 + jnz .Loop_enc1_8 +.byte 102,15,56,221,209 + movups (%rcx),%xmm0 + movq %rcx,%r11 + movl %r10d,%eax + shll $4,%r10d + movq %rdx,%r9 + andq $-16,%rdx + + movups 16(%rcx,%r10,1),%xmm1 + + movdqa .Lxts_magic(%rip),%xmm8 + movdqa %xmm2,%xmm15 + pshufd $95,%xmm2,%xmm9 + pxor %xmm0,%xmm1 + movdqa %xmm9,%xmm14 + paddd %xmm9,%xmm9 + movdqa %xmm15,%xmm10 + psrad $31,%xmm14 + paddq %xmm15,%xmm15 + pand %xmm8,%xmm14 + pxor %xmm0,%xmm10 + pxor %xmm14,%xmm15 + movdqa %xmm9,%xmm14 + paddd %xmm9,%xmm9 + movdqa %xmm15,%xmm11 + psrad $31,%xmm14 + paddq %xmm15,%xmm15 + pand %xmm8,%xmm14 + pxor %xmm0,%xmm11 + pxor %xmm14,%xmm15 + movdqa %xmm9,%xmm14 + paddd %xmm9,%xmm9 + movdqa %xmm15,%xmm12 + psrad $31,%xmm14 + paddq %xmm15,%xmm15 + pand %xmm8,%xmm14 + pxor %xmm0,%xmm12 + pxor %xmm14,%xmm15 + movdqa %xmm9,%xmm14 + paddd %xmm9,%xmm9 + movdqa %xmm15,%xmm13 + psrad $31,%xmm14 + paddq %xmm15,%xmm15 + pand %xmm8,%xmm14 + pxor %xmm0,%xmm13 + pxor %xmm14,%xmm15 + movdqa %xmm15,%xmm14 + psrad $31,%xmm9 + paddq %xmm15,%xmm15 + pand %xmm8,%xmm9 + pxor %xmm0,%xmm14 + pxor %xmm9,%xmm15 + movaps %xmm1,96(%rsp) + + subq $96,%rdx + jc .Lxts_enc_short + + movl $16+96,%eax + leaq 32(%r11,%r10,1),%rcx + subq %r10,%rax + movups 16(%r11),%xmm1 + movq %rax,%r10 + leaq .Lxts_magic(%rip),%r8 + jmp .Lxts_enc_grandloop + +.align 32 +.Lxts_enc_grandloop: + movdqu 0(%rdi),%xmm2 + movdqa %xmm0,%xmm8 + movdqu 16(%rdi),%xmm3 + pxor %xmm10,%xmm2 + movdqu 32(%rdi),%xmm4 + pxor %xmm11,%xmm3 +.byte 102,15,56,220,209 + movdqu 48(%rdi),%xmm5 + pxor %xmm12,%xmm4 +.byte 102,15,56,220,217 + movdqu 64(%rdi),%xmm6 + pxor %xmm13,%xmm5 +.byte 102,15,56,220,225 + movdqu 80(%rdi),%xmm7 + pxor %xmm15,%xmm8 + movdqa 96(%rsp),%xmm9 + pxor %xmm14,%xmm6 +.byte 102,15,56,220,233 + movups 32(%r11),%xmm0 + leaq 96(%rdi),%rdi + pxor %xmm8,%xmm7 + + pxor %xmm9,%xmm10 +.byte 102,15,56,220,241 + pxor %xmm9,%xmm11 + movdqa %xmm10,0(%rsp) +.byte 102,15,56,220,249 + movups 48(%r11),%xmm1 + pxor %xmm9,%xmm12 + +.byte 102,15,56,220,208 + pxor %xmm9,%xmm13 + movdqa %xmm11,16(%rsp) +.byte 102,15,56,220,216 + pxor %xmm9,%xmm14 + movdqa %xmm12,32(%rsp) +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 + pxor %xmm9,%xmm8 + movdqa %xmm14,64(%rsp) +.byte 102,15,56,220,240 +.byte 102,15,56,220,248 + movups 64(%r11),%xmm0 + movdqa %xmm8,80(%rsp) + pshufd $95,%xmm15,%xmm9 + jmp .Lxts_enc_loop6 +.align 32 +.Lxts_enc_loop6: +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 + movups -64(%rcx,%rax,1),%xmm1 + addq $32,%rax + +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 +.byte 102,15,56,220,240 +.byte 102,15,56,220,248 + movups -80(%rcx,%rax,1),%xmm0 + jnz .Lxts_enc_loop6 + + movdqa (%r8),%xmm8 + movdqa %xmm9,%xmm14 + paddd %xmm9,%xmm9 +.byte 102,15,56,220,209 + paddq %xmm15,%xmm15 + psrad $31,%xmm14 +.byte 102,15,56,220,217 + pand %xmm8,%xmm14 + movups (%r11),%xmm10 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 +.byte 102,15,56,220,241 + pxor %xmm14,%xmm15 + movaps %xmm10,%xmm11 +.byte 102,15,56,220,249 + movups -64(%rcx),%xmm1 + + movdqa %xmm9,%xmm14 +.byte 102,15,56,220,208 + paddd %xmm9,%xmm9 + pxor %xmm15,%xmm10 +.byte 102,15,56,220,216 + psrad $31,%xmm14 + paddq %xmm15,%xmm15 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 + pand %xmm8,%xmm14 + movaps %xmm11,%xmm12 +.byte 102,15,56,220,240 + pxor %xmm14,%xmm15 + movdqa %xmm9,%xmm14 +.byte 102,15,56,220,248 + movups -48(%rcx),%xmm0 + + paddd %xmm9,%xmm9 +.byte 102,15,56,220,209 + pxor %xmm15,%xmm11 + psrad $31,%xmm14 +.byte 102,15,56,220,217 + paddq %xmm15,%xmm15 + pand %xmm8,%xmm14 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 + movdqa %xmm13,48(%rsp) + pxor %xmm14,%xmm15 +.byte 102,15,56,220,241 + movaps %xmm12,%xmm13 + movdqa %xmm9,%xmm14 +.byte 102,15,56,220,249 + movups -32(%rcx),%xmm1 + + paddd %xmm9,%xmm9 +.byte 102,15,56,220,208 + pxor %xmm15,%xmm12 + psrad $31,%xmm14 +.byte 102,15,56,220,216 + paddq %xmm15,%xmm15 + pand %xmm8,%xmm14 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 +.byte 102,15,56,220,240 + pxor %xmm14,%xmm15 + movaps %xmm13,%xmm14 +.byte 102,15,56,220,248 + + movdqa %xmm9,%xmm0 + paddd %xmm9,%xmm9 +.byte 102,15,56,220,209 + pxor %xmm15,%xmm13 + psrad $31,%xmm0 +.byte 102,15,56,220,217 + paddq %xmm15,%xmm15 + pand %xmm8,%xmm0 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 + pxor %xmm0,%xmm15 + movups (%r11),%xmm0 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 + movups 16(%r11),%xmm1 + + pxor %xmm15,%xmm14 +.byte 102,15,56,221,84,36,0 + psrad $31,%xmm9 + paddq %xmm15,%xmm15 +.byte 102,15,56,221,92,36,16 +.byte 102,15,56,221,100,36,32 + pand %xmm8,%xmm9 + movq %r10,%rax +.byte 102,15,56,221,108,36,48 +.byte 102,15,56,221,116,36,64 +.byte 102,15,56,221,124,36,80 + pxor %xmm9,%xmm15 + + leaq 96(%rsi),%rsi + movups %xmm2,-96(%rsi) + movups %xmm3,-80(%rsi) + movups %xmm4,-64(%rsi) + movups %xmm5,-48(%rsi) + movups %xmm6,-32(%rsi) + movups %xmm7,-16(%rsi) + subq $96,%rdx + jnc .Lxts_enc_grandloop + + movl $16+96,%eax + subl %r10d,%eax + movq %r11,%rcx + shrl $4,%eax + +.Lxts_enc_short: + + movl %eax,%r10d + pxor %xmm0,%xmm10 + addq $96,%rdx + jz .Lxts_enc_done + + pxor %xmm0,%xmm11 + cmpq $32,%rdx + jb .Lxts_enc_one + pxor %xmm0,%xmm12 + je .Lxts_enc_two + + pxor %xmm0,%xmm13 + cmpq $64,%rdx + jb .Lxts_enc_three + pxor %xmm0,%xmm14 + je .Lxts_enc_four + + movdqu (%rdi),%xmm2 + movdqu 16(%rdi),%xmm3 + movdqu 32(%rdi),%xmm4 + pxor %xmm10,%xmm2 + movdqu 48(%rdi),%xmm5 + pxor %xmm11,%xmm3 + movdqu 64(%rdi),%xmm6 + leaq 80(%rdi),%rdi + pxor %xmm12,%xmm4 + pxor %xmm13,%xmm5 + pxor %xmm14,%xmm6 + pxor %xmm7,%xmm7 + + call _aesni_encrypt6 + + xorps %xmm10,%xmm2 + movdqa %xmm15,%xmm10 + xorps %xmm11,%xmm3 + xorps %xmm12,%xmm4 + movdqu %xmm2,(%rsi) + xorps %xmm13,%xmm5 + movdqu %xmm3,16(%rsi) + xorps %xmm14,%xmm6 + movdqu %xmm4,32(%rsi) + movdqu %xmm5,48(%rsi) + movdqu %xmm6,64(%rsi) + leaq 80(%rsi),%rsi + jmp .Lxts_enc_done + +.align 16 +.Lxts_enc_one: + movups (%rdi),%xmm2 + leaq 16(%rdi),%rdi + xorps %xmm10,%xmm2 + movups (%rcx),%xmm0 + movups 16(%rcx),%xmm1 + leaq 32(%rcx),%rcx + xorps %xmm0,%xmm2 +.Loop_enc1_9: +.byte 102,15,56,220,209 + decl %eax + movups (%rcx),%xmm1 + leaq 16(%rcx),%rcx + jnz .Loop_enc1_9 +.byte 102,15,56,221,209 + xorps %xmm10,%xmm2 + movdqa %xmm11,%xmm10 + movups %xmm2,(%rsi) + leaq 16(%rsi),%rsi + jmp .Lxts_enc_done + +.align 16 +.Lxts_enc_two: + movups (%rdi),%xmm2 + movups 16(%rdi),%xmm3 + leaq 32(%rdi),%rdi + xorps %xmm10,%xmm2 + xorps %xmm11,%xmm3 + + call _aesni_encrypt2 + + xorps %xmm10,%xmm2 + movdqa %xmm12,%xmm10 + xorps %xmm11,%xmm3 + movups %xmm2,(%rsi) + movups %xmm3,16(%rsi) + leaq 32(%rsi),%rsi + jmp .Lxts_enc_done + +.align 16 +.Lxts_enc_three: + movups (%rdi),%xmm2 + movups 16(%rdi),%xmm3 + movups 32(%rdi),%xmm4 + leaq 48(%rdi),%rdi + xorps %xmm10,%xmm2 + xorps %xmm11,%xmm3 + xorps %xmm12,%xmm4 + + call _aesni_encrypt3 + + xorps %xmm10,%xmm2 + movdqa %xmm13,%xmm10 + xorps %xmm11,%xmm3 + xorps %xmm12,%xmm4 + movups %xmm2,(%rsi) + movups %xmm3,16(%rsi) + movups %xmm4,32(%rsi) + leaq 48(%rsi),%rsi + jmp .Lxts_enc_done + +.align 16 +.Lxts_enc_four: + movups (%rdi),%xmm2 + movups 16(%rdi),%xmm3 + movups 32(%rdi),%xmm4 + xorps %xmm10,%xmm2 + movups 48(%rdi),%xmm5 + leaq 64(%rdi),%rdi + xorps %xmm11,%xmm3 + xorps %xmm12,%xmm4 + xorps %xmm13,%xmm5 + + call _aesni_encrypt4 + + pxor %xmm10,%xmm2 + movdqa %xmm14,%xmm10 + pxor %xmm11,%xmm3 + pxor %xmm12,%xmm4 + movdqu %xmm2,(%rsi) + pxor %xmm13,%xmm5 + movdqu %xmm3,16(%rsi) + movdqu %xmm4,32(%rsi) + movdqu %xmm5,48(%rsi) + leaq 64(%rsi),%rsi + jmp .Lxts_enc_done + +.align 16 +.Lxts_enc_done: + andq $15,%r9 + jz .Lxts_enc_ret + movq %r9,%rdx + +.Lxts_enc_steal: + movzbl (%rdi),%eax + movzbl -16(%rsi),%ecx + leaq 1(%rdi),%rdi + movb %al,-16(%rsi) + movb %cl,0(%rsi) + leaq 1(%rsi),%rsi + subq $1,%rdx + jnz .Lxts_enc_steal + + subq %r9,%rsi + movq %r11,%rcx + movl %r10d,%eax + + movups -16(%rsi),%xmm2 + xorps %xmm10,%xmm2 + movups (%rcx),%xmm0 + movups 16(%rcx),%xmm1 + leaq 32(%rcx),%rcx + xorps %xmm0,%xmm2 +.Loop_enc1_10: +.byte 102,15,56,220,209 + decl %eax + movups (%rcx),%xmm1 + leaq 16(%rcx),%rcx + jnz .Loop_enc1_10 +.byte 102,15,56,221,209 + xorps %xmm10,%xmm2 + movups %xmm2,-16(%rsi) + +.Lxts_enc_ret: + xorps %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + movaps %xmm0,0(%rsp) + pxor %xmm8,%xmm8 + movaps %xmm0,16(%rsp) + pxor %xmm9,%xmm9 + movaps %xmm0,32(%rsp) + pxor %xmm10,%xmm10 + movaps %xmm0,48(%rsp) + pxor %xmm11,%xmm11 + movaps %xmm0,64(%rsp) + pxor %xmm12,%xmm12 + movaps %xmm0,80(%rsp) + pxor %xmm13,%xmm13 + movaps %xmm0,96(%rsp) + pxor %xmm14,%xmm14 + pxor %xmm15,%xmm15 + leaq (%rbp),%rsp + popq %rbp +.Lxts_enc_epilogue: + .byte 0xf3,0xc3 +.size aesni_xts_encrypt,.-aesni_xts_encrypt +.globl aesni_xts_decrypt +.type aesni_xts_decrypt,@function +.align 16 +aesni_xts_decrypt: + leaq (%rsp),%rax + pushq %rbp + subq $112,%rsp + andq $-16,%rsp + leaq -8(%rax),%rbp + movups (%r9),%xmm2 + movl 240(%r8),%eax + movl 240(%rcx),%r10d + movups (%r8),%xmm0 + movups 16(%r8),%xmm1 + leaq 32(%r8),%r8 + xorps %xmm0,%xmm2 +.Loop_enc1_11: +.byte 102,15,56,220,209 + decl %eax + movups (%r8),%xmm1 + leaq 16(%r8),%r8 + jnz .Loop_enc1_11 +.byte 102,15,56,221,209 + xorl %eax,%eax + testq $15,%rdx + setnz %al + shlq $4,%rax + subq %rax,%rdx + + movups (%rcx),%xmm0 + movq %rcx,%r11 + movl %r10d,%eax + shll $4,%r10d + movq %rdx,%r9 + andq $-16,%rdx + + movups 16(%rcx,%r10,1),%xmm1 + + movdqa .Lxts_magic(%rip),%xmm8 + movdqa %xmm2,%xmm15 + pshufd $95,%xmm2,%xmm9 + pxor %xmm0,%xmm1 + movdqa %xmm9,%xmm14 + paddd %xmm9,%xmm9 + movdqa %xmm15,%xmm10 + psrad $31,%xmm14 + paddq %xmm15,%xmm15 + pand %xmm8,%xmm14 + pxor %xmm0,%xmm10 + pxor %xmm14,%xmm15 + movdqa %xmm9,%xmm14 + paddd %xmm9,%xmm9 + movdqa %xmm15,%xmm11 + psrad $31,%xmm14 + paddq %xmm15,%xmm15 + pand %xmm8,%xmm14 + pxor %xmm0,%xmm11 + pxor %xmm14,%xmm15 + movdqa %xmm9,%xmm14 + paddd %xmm9,%xmm9 + movdqa %xmm15,%xmm12 + psrad $31,%xmm14 + paddq %xmm15,%xmm15 + pand %xmm8,%xmm14 + pxor %xmm0,%xmm12 + pxor %xmm14,%xmm15 + movdqa %xmm9,%xmm14 + paddd %xmm9,%xmm9 + movdqa %xmm15,%xmm13 + psrad $31,%xmm14 + paddq %xmm15,%xmm15 + pand %xmm8,%xmm14 + pxor %xmm0,%xmm13 + pxor %xmm14,%xmm15 + movdqa %xmm15,%xmm14 + psrad $31,%xmm9 + paddq %xmm15,%xmm15 + pand %xmm8,%xmm9 + pxor %xmm0,%xmm14 + pxor %xmm9,%xmm15 + movaps %xmm1,96(%rsp) + + subq $96,%rdx + jc .Lxts_dec_short + + movl $16+96,%eax + leaq 32(%r11,%r10,1),%rcx + subq %r10,%rax + movups 16(%r11),%xmm1 + movq %rax,%r10 + leaq .Lxts_magic(%rip),%r8 + jmp .Lxts_dec_grandloop + +.align 32 +.Lxts_dec_grandloop: + movdqu 0(%rdi),%xmm2 + movdqa %xmm0,%xmm8 + movdqu 16(%rdi),%xmm3 + pxor %xmm10,%xmm2 + movdqu 32(%rdi),%xmm4 + pxor %xmm11,%xmm3 +.byte 102,15,56,222,209 + movdqu 48(%rdi),%xmm5 + pxor %xmm12,%xmm4 +.byte 102,15,56,222,217 + movdqu 64(%rdi),%xmm6 + pxor %xmm13,%xmm5 +.byte 102,15,56,222,225 + movdqu 80(%rdi),%xmm7 + pxor %xmm15,%xmm8 + movdqa 96(%rsp),%xmm9 + pxor %xmm14,%xmm6 +.byte 102,15,56,222,233 + movups 32(%r11),%xmm0 + leaq 96(%rdi),%rdi + pxor %xmm8,%xmm7 + + pxor %xmm9,%xmm10 +.byte 102,15,56,222,241 + pxor %xmm9,%xmm11 + movdqa %xmm10,0(%rsp) +.byte 102,15,56,222,249 + movups 48(%r11),%xmm1 + pxor %xmm9,%xmm12 + +.byte 102,15,56,222,208 + pxor %xmm9,%xmm13 + movdqa %xmm11,16(%rsp) +.byte 102,15,56,222,216 + pxor %xmm9,%xmm14 + movdqa %xmm12,32(%rsp) +.byte 102,15,56,222,224 +.byte 102,15,56,222,232 + pxor %xmm9,%xmm8 + movdqa %xmm14,64(%rsp) +.byte 102,15,56,222,240 +.byte 102,15,56,222,248 + movups 64(%r11),%xmm0 + movdqa %xmm8,80(%rsp) + pshufd $95,%xmm15,%xmm9 + jmp .Lxts_dec_loop6 +.align 32 +.Lxts_dec_loop6: +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 +.byte 102,15,56,222,241 +.byte 102,15,56,222,249 + movups -64(%rcx,%rax,1),%xmm1 + addq $32,%rax + +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 +.byte 102,15,56,222,224 +.byte 102,15,56,222,232 +.byte 102,15,56,222,240 +.byte 102,15,56,222,248 + movups -80(%rcx,%rax,1),%xmm0 + jnz .Lxts_dec_loop6 + + movdqa (%r8),%xmm8 + movdqa %xmm9,%xmm14 + paddd %xmm9,%xmm9 +.byte 102,15,56,222,209 + paddq %xmm15,%xmm15 + psrad $31,%xmm14 +.byte 102,15,56,222,217 + pand %xmm8,%xmm14 + movups (%r11),%xmm10 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 +.byte 102,15,56,222,241 + pxor %xmm14,%xmm15 + movaps %xmm10,%xmm11 +.byte 102,15,56,222,249 + movups -64(%rcx),%xmm1 + + movdqa %xmm9,%xmm14 +.byte 102,15,56,222,208 + paddd %xmm9,%xmm9 + pxor %xmm15,%xmm10 +.byte 102,15,56,222,216 + psrad $31,%xmm14 + paddq %xmm15,%xmm15 +.byte 102,15,56,222,224 +.byte 102,15,56,222,232 + pand %xmm8,%xmm14 + movaps %xmm11,%xmm12 +.byte 102,15,56,222,240 + pxor %xmm14,%xmm15 + movdqa %xmm9,%xmm14 +.byte 102,15,56,222,248 + movups -48(%rcx),%xmm0 + + paddd %xmm9,%xmm9 +.byte 102,15,56,222,209 + pxor %xmm15,%xmm11 + psrad $31,%xmm14 +.byte 102,15,56,222,217 + paddq %xmm15,%xmm15 + pand %xmm8,%xmm14 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 + movdqa %xmm13,48(%rsp) + pxor %xmm14,%xmm15 +.byte 102,15,56,222,241 + movaps %xmm12,%xmm13 + movdqa %xmm9,%xmm14 +.byte 102,15,56,222,249 + movups -32(%rcx),%xmm1 + + paddd %xmm9,%xmm9 +.byte 102,15,56,222,208 + pxor %xmm15,%xmm12 + psrad $31,%xmm14 +.byte 102,15,56,222,216 + paddq %xmm15,%xmm15 + pand %xmm8,%xmm14 +.byte 102,15,56,222,224 +.byte 102,15,56,222,232 +.byte 102,15,56,222,240 + pxor %xmm14,%xmm15 + movaps %xmm13,%xmm14 +.byte 102,15,56,222,248 + + movdqa %xmm9,%xmm0 + paddd %xmm9,%xmm9 +.byte 102,15,56,222,209 + pxor %xmm15,%xmm13 + psrad $31,%xmm0 +.byte 102,15,56,222,217 + paddq %xmm15,%xmm15 + pand %xmm8,%xmm0 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 + pxor %xmm0,%xmm15 + movups (%r11),%xmm0 +.byte 102,15,56,222,241 +.byte 102,15,56,222,249 + movups 16(%r11),%xmm1 + + pxor %xmm15,%xmm14 +.byte 102,15,56,223,84,36,0 + psrad $31,%xmm9 + paddq %xmm15,%xmm15 +.byte 102,15,56,223,92,36,16 +.byte 102,15,56,223,100,36,32 + pand %xmm8,%xmm9 + movq %r10,%rax +.byte 102,15,56,223,108,36,48 +.byte 102,15,56,223,116,36,64 +.byte 102,15,56,223,124,36,80 + pxor %xmm9,%xmm15 + + leaq 96(%rsi),%rsi + movups %xmm2,-96(%rsi) + movups %xmm3,-80(%rsi) + movups %xmm4,-64(%rsi) + movups %xmm5,-48(%rsi) + movups %xmm6,-32(%rsi) + movups %xmm7,-16(%rsi) + subq $96,%rdx + jnc .Lxts_dec_grandloop + + movl $16+96,%eax + subl %r10d,%eax + movq %r11,%rcx + shrl $4,%eax + +.Lxts_dec_short: + + movl %eax,%r10d + pxor %xmm0,%xmm10 + pxor %xmm0,%xmm11 + addq $96,%rdx + jz .Lxts_dec_done + + pxor %xmm0,%xmm12 + cmpq $32,%rdx + jb .Lxts_dec_one + pxor %xmm0,%xmm13 + je .Lxts_dec_two + + pxor %xmm0,%xmm14 + cmpq $64,%rdx + jb .Lxts_dec_three + je .Lxts_dec_four + + movdqu (%rdi),%xmm2 + movdqu 16(%rdi),%xmm3 + movdqu 32(%rdi),%xmm4 + pxor %xmm10,%xmm2 + movdqu 48(%rdi),%xmm5 + pxor %xmm11,%xmm3 + movdqu 64(%rdi),%xmm6 + leaq 80(%rdi),%rdi + pxor %xmm12,%xmm4 + pxor %xmm13,%xmm5 + pxor %xmm14,%xmm6 + + call _aesni_decrypt6 + + xorps %xmm10,%xmm2 + xorps %xmm11,%xmm3 + xorps %xmm12,%xmm4 + movdqu %xmm2,(%rsi) + xorps %xmm13,%xmm5 + movdqu %xmm3,16(%rsi) + xorps %xmm14,%xmm6 + movdqu %xmm4,32(%rsi) + pxor %xmm14,%xmm14 + movdqu %xmm5,48(%rsi) + pcmpgtd %xmm15,%xmm14 + movdqu %xmm6,64(%rsi) + leaq 80(%rsi),%rsi + pshufd $19,%xmm14,%xmm11 + andq $15,%r9 + jz .Lxts_dec_ret + + movdqa %xmm15,%xmm10 + paddq %xmm15,%xmm15 + pand %xmm8,%xmm11 + pxor %xmm15,%xmm11 + jmp .Lxts_dec_done2 + +.align 16 +.Lxts_dec_one: + movups (%rdi),%xmm2 + leaq 16(%rdi),%rdi + xorps %xmm10,%xmm2 + movups (%rcx),%xmm0 + movups 16(%rcx),%xmm1 + leaq 32(%rcx),%rcx + xorps %xmm0,%xmm2 +.Loop_dec1_12: +.byte 102,15,56,222,209 + decl %eax + movups (%rcx),%xmm1 + leaq 16(%rcx),%rcx + jnz .Loop_dec1_12 +.byte 102,15,56,223,209 + xorps %xmm10,%xmm2 + movdqa %xmm11,%xmm10 + movups %xmm2,(%rsi) + movdqa %xmm12,%xmm11 + leaq 16(%rsi),%rsi + jmp .Lxts_dec_done + +.align 16 +.Lxts_dec_two: + movups (%rdi),%xmm2 + movups 16(%rdi),%xmm3 + leaq 32(%rdi),%rdi + xorps %xmm10,%xmm2 + xorps %xmm11,%xmm3 + + call _aesni_decrypt2 + + xorps %xmm10,%xmm2 + movdqa %xmm12,%xmm10 + xorps %xmm11,%xmm3 + movdqa %xmm13,%xmm11 + movups %xmm2,(%rsi) + movups %xmm3,16(%rsi) + leaq 32(%rsi),%rsi + jmp .Lxts_dec_done + +.align 16 +.Lxts_dec_three: + movups (%rdi),%xmm2 + movups 16(%rdi),%xmm3 + movups 32(%rdi),%xmm4 + leaq 48(%rdi),%rdi + xorps %xmm10,%xmm2 + xorps %xmm11,%xmm3 + xorps %xmm12,%xmm4 + + call _aesni_decrypt3 + + xorps %xmm10,%xmm2 + movdqa %xmm13,%xmm10 + xorps %xmm11,%xmm3 + movdqa %xmm14,%xmm11 + xorps %xmm12,%xmm4 + movups %xmm2,(%rsi) + movups %xmm3,16(%rsi) + movups %xmm4,32(%rsi) + leaq 48(%rsi),%rsi + jmp .Lxts_dec_done + +.align 16 +.Lxts_dec_four: + movups (%rdi),%xmm2 + movups 16(%rdi),%xmm3 + movups 32(%rdi),%xmm4 + xorps %xmm10,%xmm2 + movups 48(%rdi),%xmm5 + leaq 64(%rdi),%rdi + xorps %xmm11,%xmm3 + xorps %xmm12,%xmm4 + xorps %xmm13,%xmm5 + + call _aesni_decrypt4 + + pxor %xmm10,%xmm2 + movdqa %xmm14,%xmm10 + pxor %xmm11,%xmm3 + movdqa %xmm15,%xmm11 + pxor %xmm12,%xmm4 + movdqu %xmm2,(%rsi) + pxor %xmm13,%xmm5 + movdqu %xmm3,16(%rsi) + movdqu %xmm4,32(%rsi) + movdqu %xmm5,48(%rsi) + leaq 64(%rsi),%rsi + jmp .Lxts_dec_done + +.align 16 +.Lxts_dec_done: + andq $15,%r9 + jz .Lxts_dec_ret +.Lxts_dec_done2: + movq %r9,%rdx + movq %r11,%rcx + movl %r10d,%eax + + movups (%rdi),%xmm2 + xorps %xmm11,%xmm2 + movups (%rcx),%xmm0 + movups 16(%rcx),%xmm1 + leaq 32(%rcx),%rcx + xorps %xmm0,%xmm2 +.Loop_dec1_13: +.byte 102,15,56,222,209 + decl %eax + movups (%rcx),%xmm1 + leaq 16(%rcx),%rcx + jnz .Loop_dec1_13 +.byte 102,15,56,223,209 + xorps %xmm11,%xmm2 + movups %xmm2,(%rsi) + +.Lxts_dec_steal: + movzbl 16(%rdi),%eax + movzbl (%rsi),%ecx + leaq 1(%rdi),%rdi + movb %al,(%rsi) + movb %cl,16(%rsi) + leaq 1(%rsi),%rsi + subq $1,%rdx + jnz .Lxts_dec_steal + + subq %r9,%rsi + movq %r11,%rcx + movl %r10d,%eax + + movups (%rsi),%xmm2 + xorps %xmm10,%xmm2 + movups (%rcx),%xmm0 + movups 16(%rcx),%xmm1 + leaq 32(%rcx),%rcx + xorps %xmm0,%xmm2 +.Loop_dec1_14: +.byte 102,15,56,222,209 + decl %eax + movups (%rcx),%xmm1 + leaq 16(%rcx),%rcx + jnz .Loop_dec1_14 +.byte 102,15,56,223,209 + xorps %xmm10,%xmm2 + movups %xmm2,(%rsi) + +.Lxts_dec_ret: + xorps %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + movaps %xmm0,0(%rsp) + pxor %xmm8,%xmm8 + movaps %xmm0,16(%rsp) + pxor %xmm9,%xmm9 + movaps %xmm0,32(%rsp) + pxor %xmm10,%xmm10 + movaps %xmm0,48(%rsp) + pxor %xmm11,%xmm11 + movaps %xmm0,64(%rsp) + pxor %xmm12,%xmm12 + movaps %xmm0,80(%rsp) + pxor %xmm13,%xmm13 + movaps %xmm0,96(%rsp) + pxor %xmm14,%xmm14 + pxor %xmm15,%xmm15 + leaq (%rbp),%rsp + popq %rbp +.Lxts_dec_epilogue: + .byte 0xf3,0xc3 +.size aesni_xts_decrypt,.-aesni_xts_decrypt +.globl aesni_cbc_encrypt +.type aesni_cbc_encrypt,@function +.align 16 +aesni_cbc_encrypt: + testq %rdx,%rdx + jz .Lcbc_ret + + movl 240(%rcx),%r10d + movq %rcx,%r11 + testl %r9d,%r9d + jz .Lcbc_decrypt + + movups (%r8),%xmm2 + movl %r10d,%eax + cmpq $16,%rdx + jb .Lcbc_enc_tail + subq $16,%rdx + jmp .Lcbc_enc_loop +.align 16 +.Lcbc_enc_loop: + movups (%rdi),%xmm3 + leaq 16(%rdi),%rdi + + movups (%rcx),%xmm0 + movups 16(%rcx),%xmm1 + xorps %xmm0,%xmm3 + leaq 32(%rcx),%rcx + xorps %xmm3,%xmm2 +.Loop_enc1_15: +.byte 102,15,56,220,209 + decl %eax + movups (%rcx),%xmm1 + leaq 16(%rcx),%rcx + jnz .Loop_enc1_15 +.byte 102,15,56,221,209 + movl %r10d,%eax + movq %r11,%rcx + movups %xmm2,0(%rsi) + leaq 16(%rsi),%rsi + subq $16,%rdx + jnc .Lcbc_enc_loop + addq $16,%rdx + jnz .Lcbc_enc_tail + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + movups %xmm2,(%r8) + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + jmp .Lcbc_ret + +.Lcbc_enc_tail: + movq %rdx,%rcx + xchgq %rdi,%rsi +.long 0x9066A4F3 + movl $16,%ecx + subq %rdx,%rcx + xorl %eax,%eax +.long 0x9066AAF3 + leaq -16(%rdi),%rdi + movl %r10d,%eax + movq %rdi,%rsi + movq %r11,%rcx + xorq %rdx,%rdx + jmp .Lcbc_enc_loop + +.align 16 +.Lcbc_decrypt: + cmpq $16,%rdx + jne .Lcbc_decrypt_bulk + + + + movdqu (%rdi),%xmm2 + movdqu (%r8),%xmm3 + movdqa %xmm2,%xmm4 + movups (%rcx),%xmm0 + movups 16(%rcx),%xmm1 + leaq 32(%rcx),%rcx + xorps %xmm0,%xmm2 +.Loop_dec1_16: +.byte 102,15,56,222,209 + decl %r10d + movups (%rcx),%xmm1 + leaq 16(%rcx),%rcx + jnz .Loop_dec1_16 +.byte 102,15,56,223,209 + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + movdqu %xmm4,(%r8) + xorps %xmm3,%xmm2 + pxor %xmm3,%xmm3 + movups %xmm2,(%rsi) + pxor %xmm2,%xmm2 + jmp .Lcbc_ret +.align 16 +.Lcbc_decrypt_bulk: + leaq (%rsp),%rax + pushq %rbp + subq $16,%rsp + andq $-16,%rsp + leaq -8(%rax),%rbp + movups (%r8),%xmm10 + movl %r10d,%eax + cmpq $80,%rdx + jbe .Lcbc_dec_tail + + movups (%rcx),%xmm0 + movdqu 0(%rdi),%xmm2 + movdqu 16(%rdi),%xmm3 + movdqa %xmm2,%xmm11 + movdqu 32(%rdi),%xmm4 + movdqa %xmm3,%xmm12 + movdqu 48(%rdi),%xmm5 + movdqa %xmm4,%xmm13 + movdqu 64(%rdi),%xmm6 + movdqa %xmm5,%xmm14 + movdqu 80(%rdi),%xmm7 + movdqa %xmm6,%xmm15 + movl OPENSSL_ia32cap_P+4(%rip),%r9d + cmpq $112,%rdx + jbe .Lcbc_dec_six_or_seven + + andl $71303168,%r9d + subq $80,%rdx + cmpl $4194304,%r9d + je .Lcbc_dec_loop6_enter + subq $32,%rdx + leaq 112(%rcx),%rcx + jmp .Lcbc_dec_loop8_enter +.align 16 +.Lcbc_dec_loop8: + movups %xmm9,(%rsi) + leaq 16(%rsi),%rsi +.Lcbc_dec_loop8_enter: + movdqu 96(%rdi),%xmm8 + pxor %xmm0,%xmm2 + movdqu 112(%rdi),%xmm9 + pxor %xmm0,%xmm3 + movups 16-112(%rcx),%xmm1 + pxor %xmm0,%xmm4 + xorq %r11,%r11 + cmpq $112,%rdx + pxor %xmm0,%xmm5 + pxor %xmm0,%xmm6 + pxor %xmm0,%xmm7 + pxor %xmm0,%xmm8 + +.byte 102,15,56,222,209 + pxor %xmm0,%xmm9 + movups 32-112(%rcx),%xmm0 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 +.byte 102,15,56,222,241 +.byte 102,15,56,222,249 +.byte 102,68,15,56,222,193 + setnc %r11b + shlq $7,%r11 +.byte 102,68,15,56,222,201 + addq %rdi,%r11 + movups 48-112(%rcx),%xmm1 +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 +.byte 102,15,56,222,224 +.byte 102,15,56,222,232 +.byte 102,15,56,222,240 +.byte 102,15,56,222,248 +.byte 102,68,15,56,222,192 +.byte 102,68,15,56,222,200 + movups 64-112(%rcx),%xmm0 + nop +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 +.byte 102,15,56,222,241 +.byte 102,15,56,222,249 +.byte 102,68,15,56,222,193 +.byte 102,68,15,56,222,201 + movups 80-112(%rcx),%xmm1 + nop +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 +.byte 102,15,56,222,224 +.byte 102,15,56,222,232 +.byte 102,15,56,222,240 +.byte 102,15,56,222,248 +.byte 102,68,15,56,222,192 +.byte 102,68,15,56,222,200 + movups 96-112(%rcx),%xmm0 + nop +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 +.byte 102,15,56,222,241 +.byte 102,15,56,222,249 +.byte 102,68,15,56,222,193 +.byte 102,68,15,56,222,201 + movups 112-112(%rcx),%xmm1 + nop +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 +.byte 102,15,56,222,224 +.byte 102,15,56,222,232 +.byte 102,15,56,222,240 +.byte 102,15,56,222,248 +.byte 102,68,15,56,222,192 +.byte 102,68,15,56,222,200 + movups 128-112(%rcx),%xmm0 + nop +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 +.byte 102,15,56,222,241 +.byte 102,15,56,222,249 +.byte 102,68,15,56,222,193 +.byte 102,68,15,56,222,201 + movups 144-112(%rcx),%xmm1 + cmpl $11,%eax +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 +.byte 102,15,56,222,224 +.byte 102,15,56,222,232 +.byte 102,15,56,222,240 +.byte 102,15,56,222,248 +.byte 102,68,15,56,222,192 +.byte 102,68,15,56,222,200 + movups 160-112(%rcx),%xmm0 + jb .Lcbc_dec_done +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 +.byte 102,15,56,222,241 +.byte 102,15,56,222,249 +.byte 102,68,15,56,222,193 +.byte 102,68,15,56,222,201 + movups 176-112(%rcx),%xmm1 + nop +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 +.byte 102,15,56,222,224 +.byte 102,15,56,222,232 +.byte 102,15,56,222,240 +.byte 102,15,56,222,248 +.byte 102,68,15,56,222,192 +.byte 102,68,15,56,222,200 + movups 192-112(%rcx),%xmm0 + je .Lcbc_dec_done +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 +.byte 102,15,56,222,241 +.byte 102,15,56,222,249 +.byte 102,68,15,56,222,193 +.byte 102,68,15,56,222,201 + movups 208-112(%rcx),%xmm1 + nop +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 +.byte 102,15,56,222,224 +.byte 102,15,56,222,232 +.byte 102,15,56,222,240 +.byte 102,15,56,222,248 +.byte 102,68,15,56,222,192 +.byte 102,68,15,56,222,200 + movups 224-112(%rcx),%xmm0 + jmp .Lcbc_dec_done +.align 16 +.Lcbc_dec_done: +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 + pxor %xmm0,%xmm10 + pxor %xmm0,%xmm11 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 + pxor %xmm0,%xmm12 + pxor %xmm0,%xmm13 +.byte 102,15,56,222,241 +.byte 102,15,56,222,249 + pxor %xmm0,%xmm14 + pxor %xmm0,%xmm15 +.byte 102,68,15,56,222,193 +.byte 102,68,15,56,222,201 + movdqu 80(%rdi),%xmm1 + +.byte 102,65,15,56,223,210 + movdqu 96(%rdi),%xmm10 + pxor %xmm0,%xmm1 +.byte 102,65,15,56,223,219 + pxor %xmm0,%xmm10 + movdqu 112(%rdi),%xmm0 +.byte 102,65,15,56,223,228 + leaq 128(%rdi),%rdi + movdqu 0(%r11),%xmm11 +.byte 102,65,15,56,223,237 +.byte 102,65,15,56,223,246 + movdqu 16(%r11),%xmm12 + movdqu 32(%r11),%xmm13 +.byte 102,65,15,56,223,255 +.byte 102,68,15,56,223,193 + movdqu 48(%r11),%xmm14 + movdqu 64(%r11),%xmm15 +.byte 102,69,15,56,223,202 + movdqa %xmm0,%xmm10 + movdqu 80(%r11),%xmm1 + movups -112(%rcx),%xmm0 + + movups %xmm2,(%rsi) + movdqa %xmm11,%xmm2 + movups %xmm3,16(%rsi) + movdqa %xmm12,%xmm3 + movups %xmm4,32(%rsi) + movdqa %xmm13,%xmm4 + movups %xmm5,48(%rsi) + movdqa %xmm14,%xmm5 + movups %xmm6,64(%rsi) + movdqa %xmm15,%xmm6 + movups %xmm7,80(%rsi) + movdqa %xmm1,%xmm7 + movups %xmm8,96(%rsi) + leaq 112(%rsi),%rsi + + subq $128,%rdx + ja .Lcbc_dec_loop8 + + movaps %xmm9,%xmm2 + leaq -112(%rcx),%rcx + addq $112,%rdx + jle .Lcbc_dec_clear_tail_collected + movups %xmm9,(%rsi) + leaq 16(%rsi),%rsi + cmpq $80,%rdx + jbe .Lcbc_dec_tail + + movaps %xmm11,%xmm2 +.Lcbc_dec_six_or_seven: + cmpq $96,%rdx + ja .Lcbc_dec_seven + + movaps %xmm7,%xmm8 + call _aesni_decrypt6 + pxor %xmm10,%xmm2 + movaps %xmm8,%xmm10 + pxor %xmm11,%xmm3 + movdqu %xmm2,(%rsi) + pxor %xmm12,%xmm4 + movdqu %xmm3,16(%rsi) + pxor %xmm3,%xmm3 + pxor %xmm13,%xmm5 + movdqu %xmm4,32(%rsi) + pxor %xmm4,%xmm4 + pxor %xmm14,%xmm6 + movdqu %xmm5,48(%rsi) + pxor %xmm5,%xmm5 + pxor %xmm15,%xmm7 + movdqu %xmm6,64(%rsi) + pxor %xmm6,%xmm6 + leaq 80(%rsi),%rsi + movdqa %xmm7,%xmm2 + pxor %xmm7,%xmm7 + jmp .Lcbc_dec_tail_collected + +.align 16 +.Lcbc_dec_seven: + movups 96(%rdi),%xmm8 + xorps %xmm9,%xmm9 + call _aesni_decrypt8 + movups 80(%rdi),%xmm9 + pxor %xmm10,%xmm2 + movups 96(%rdi),%xmm10 + pxor %xmm11,%xmm3 + movdqu %xmm2,(%rsi) + pxor %xmm12,%xmm4 + movdqu %xmm3,16(%rsi) + pxor %xmm3,%xmm3 + pxor %xmm13,%xmm5 + movdqu %xmm4,32(%rsi) + pxor %xmm4,%xmm4 + pxor %xmm14,%xmm6 + movdqu %xmm5,48(%rsi) + pxor %xmm5,%xmm5 + pxor %xmm15,%xmm7 + movdqu %xmm6,64(%rsi) + pxor %xmm6,%xmm6 + pxor %xmm9,%xmm8 + movdqu %xmm7,80(%rsi) + pxor %xmm7,%xmm7 + leaq 96(%rsi),%rsi + movdqa %xmm8,%xmm2 + pxor %xmm8,%xmm8 + pxor %xmm9,%xmm9 + jmp .Lcbc_dec_tail_collected + +.align 16 +.Lcbc_dec_loop6: + movups %xmm7,(%rsi) + leaq 16(%rsi),%rsi + movdqu 0(%rdi),%xmm2 + movdqu 16(%rdi),%xmm3 + movdqa %xmm2,%xmm11 + movdqu 32(%rdi),%xmm4 + movdqa %xmm3,%xmm12 + movdqu 48(%rdi),%xmm5 + movdqa %xmm4,%xmm13 + movdqu 64(%rdi),%xmm6 + movdqa %xmm5,%xmm14 + movdqu 80(%rdi),%xmm7 + movdqa %xmm6,%xmm15 +.Lcbc_dec_loop6_enter: + leaq 96(%rdi),%rdi + movdqa %xmm7,%xmm8 + + call _aesni_decrypt6 + + pxor %xmm10,%xmm2 + movdqa %xmm8,%xmm10 + pxor %xmm11,%xmm3 + movdqu %xmm2,(%rsi) + pxor %xmm12,%xmm4 + movdqu %xmm3,16(%rsi) + pxor %xmm13,%xmm5 + movdqu %xmm4,32(%rsi) + pxor %xmm14,%xmm6 + movq %r11,%rcx + movdqu %xmm5,48(%rsi) + pxor %xmm15,%xmm7 + movl %r10d,%eax + movdqu %xmm6,64(%rsi) + leaq 80(%rsi),%rsi + subq $96,%rdx + ja .Lcbc_dec_loop6 + + movdqa %xmm7,%xmm2 + addq $80,%rdx + jle .Lcbc_dec_clear_tail_collected + movups %xmm7,(%rsi) + leaq 16(%rsi),%rsi + +.Lcbc_dec_tail: + movups (%rdi),%xmm2 + subq $16,%rdx + jbe .Lcbc_dec_one + + movups 16(%rdi),%xmm3 + movaps %xmm2,%xmm11 + subq $16,%rdx + jbe .Lcbc_dec_two + + movups 32(%rdi),%xmm4 + movaps %xmm3,%xmm12 + subq $16,%rdx + jbe .Lcbc_dec_three + + movups 48(%rdi),%xmm5 + movaps %xmm4,%xmm13 + subq $16,%rdx + jbe .Lcbc_dec_four + + movups 64(%rdi),%xmm6 + movaps %xmm5,%xmm14 + movaps %xmm6,%xmm15 + xorps %xmm7,%xmm7 + call _aesni_decrypt6 + pxor %xmm10,%xmm2 + movaps %xmm15,%xmm10 + pxor %xmm11,%xmm3 + movdqu %xmm2,(%rsi) + pxor %xmm12,%xmm4 + movdqu %xmm3,16(%rsi) + pxor %xmm3,%xmm3 + pxor %xmm13,%xmm5 + movdqu %xmm4,32(%rsi) + pxor %xmm4,%xmm4 + pxor %xmm14,%xmm6 + movdqu %xmm5,48(%rsi) + pxor %xmm5,%xmm5 + leaq 64(%rsi),%rsi + movdqa %xmm6,%xmm2 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + subq $16,%rdx + jmp .Lcbc_dec_tail_collected + +.align 16 +.Lcbc_dec_one: + movaps %xmm2,%xmm11 + movups (%rcx),%xmm0 + movups 16(%rcx),%xmm1 + leaq 32(%rcx),%rcx + xorps %xmm0,%xmm2 +.Loop_dec1_17: +.byte 102,15,56,222,209 + decl %eax + movups (%rcx),%xmm1 + leaq 16(%rcx),%rcx + jnz .Loop_dec1_17 +.byte 102,15,56,223,209 + xorps %xmm10,%xmm2 + movaps %xmm11,%xmm10 + jmp .Lcbc_dec_tail_collected +.align 16 +.Lcbc_dec_two: + movaps %xmm3,%xmm12 + call _aesni_decrypt2 + pxor %xmm10,%xmm2 + movaps %xmm12,%xmm10 + pxor %xmm11,%xmm3 + movdqu %xmm2,(%rsi) + movdqa %xmm3,%xmm2 + pxor %xmm3,%xmm3 + leaq 16(%rsi),%rsi + jmp .Lcbc_dec_tail_collected +.align 16 +.Lcbc_dec_three: + movaps %xmm4,%xmm13 + call _aesni_decrypt3 + pxor %xmm10,%xmm2 + movaps %xmm13,%xmm10 + pxor %xmm11,%xmm3 + movdqu %xmm2,(%rsi) + pxor %xmm12,%xmm4 + movdqu %xmm3,16(%rsi) + pxor %xmm3,%xmm3 + movdqa %xmm4,%xmm2 + pxor %xmm4,%xmm4 + leaq 32(%rsi),%rsi + jmp .Lcbc_dec_tail_collected +.align 16 +.Lcbc_dec_four: + movaps %xmm5,%xmm14 + call _aesni_decrypt4 + pxor %xmm10,%xmm2 + movaps %xmm14,%xmm10 + pxor %xmm11,%xmm3 + movdqu %xmm2,(%rsi) + pxor %xmm12,%xmm4 + movdqu %xmm3,16(%rsi) + pxor %xmm3,%xmm3 + pxor %xmm13,%xmm5 + movdqu %xmm4,32(%rsi) + pxor %xmm4,%xmm4 + movdqa %xmm5,%xmm2 + pxor %xmm5,%xmm5 + leaq 48(%rsi),%rsi + jmp .Lcbc_dec_tail_collected + +.align 16 +.Lcbc_dec_clear_tail_collected: + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + pxor %xmm8,%xmm8 + pxor %xmm9,%xmm9 +.Lcbc_dec_tail_collected: + movups %xmm10,(%r8) + andq $15,%rdx + jnz .Lcbc_dec_tail_partial + movups %xmm2,(%rsi) + pxor %xmm2,%xmm2 + jmp .Lcbc_dec_ret +.align 16 +.Lcbc_dec_tail_partial: + movaps %xmm2,(%rsp) + pxor %xmm2,%xmm2 + movq $16,%rcx + movq %rsi,%rdi + subq %rdx,%rcx + leaq (%rsp),%rsi +.long 0x9066A4F3 + movdqa %xmm2,(%rsp) + +.Lcbc_dec_ret: + xorps %xmm0,%xmm0 + pxor %xmm1,%xmm1 + leaq (%rbp),%rsp + popq %rbp +.Lcbc_ret: + .byte 0xf3,0xc3 +.size aesni_cbc_encrypt,.-aesni_cbc_encrypt +.globl aesni_set_decrypt_key +.type aesni_set_decrypt_key,@function +.align 16 +aesni_set_decrypt_key: +.byte 0x48,0x83,0xEC,0x08 + call __aesni_set_encrypt_key + shll $4,%esi + testl %eax,%eax + jnz .Ldec_key_ret + leaq 16(%rdx,%rsi,1),%rdi + + movups (%rdx),%xmm0 + movups (%rdi),%xmm1 + movups %xmm0,(%rdi) + movups %xmm1,(%rdx) + leaq 16(%rdx),%rdx + leaq -16(%rdi),%rdi + +.Ldec_key_inverse: + movups (%rdx),%xmm0 + movups (%rdi),%xmm1 +.byte 102,15,56,219,192 +.byte 102,15,56,219,201 + leaq 16(%rdx),%rdx + leaq -16(%rdi),%rdi + movups %xmm0,16(%rdi) + movups %xmm1,-16(%rdx) + cmpq %rdx,%rdi + ja .Ldec_key_inverse + + movups (%rdx),%xmm0 +.byte 102,15,56,219,192 + pxor %xmm1,%xmm1 + movups %xmm0,(%rdi) + pxor %xmm0,%xmm0 +.Ldec_key_ret: + addq $8,%rsp + .byte 0xf3,0xc3 +.LSEH_end_set_decrypt_key: +.size aesni_set_decrypt_key,.-aesni_set_decrypt_key +.globl aesni_set_encrypt_key +.type aesni_set_encrypt_key,@function +.align 16 +aesni_set_encrypt_key: +__aesni_set_encrypt_key: +.byte 0x48,0x83,0xEC,0x08 + movq $-1,%rax + testq %rdi,%rdi + jz .Lenc_key_ret + testq %rdx,%rdx + jz .Lenc_key_ret + + movl $268437504,%r10d + movups (%rdi),%xmm0 + xorps %xmm4,%xmm4 + andl OPENSSL_ia32cap_P+4(%rip),%r10d + leaq 16(%rdx),%rax + cmpl $256,%esi + je .L14rounds + cmpl $192,%esi + je .L12rounds + cmpl $128,%esi + jne .Lbad_keybits + +.L10rounds: + movl $9,%esi + cmpl $268435456,%r10d + je .L10rounds_alt + + movups %xmm0,(%rdx) +.byte 102,15,58,223,200,1 + call .Lkey_expansion_128_cold +.byte 102,15,58,223,200,2 + call .Lkey_expansion_128 +.byte 102,15,58,223,200,4 + call .Lkey_expansion_128 +.byte 102,15,58,223,200,8 + call .Lkey_expansion_128 +.byte 102,15,58,223,200,16 + call .Lkey_expansion_128 +.byte 102,15,58,223,200,32 + call .Lkey_expansion_128 +.byte 102,15,58,223,200,64 + call .Lkey_expansion_128 +.byte 102,15,58,223,200,128 + call .Lkey_expansion_128 +.byte 102,15,58,223,200,27 + call .Lkey_expansion_128 +.byte 102,15,58,223,200,54 + call .Lkey_expansion_128 + movups %xmm0,(%rax) + movl %esi,80(%rax) + xorl %eax,%eax + jmp .Lenc_key_ret + +.align 16 +.L10rounds_alt: + movdqa .Lkey_rotate(%rip),%xmm5 + movl $8,%r10d + movdqa .Lkey_rcon1(%rip),%xmm4 + movdqa %xmm0,%xmm2 + movdqu %xmm0,(%rdx) + jmp .Loop_key128 + +.align 16 +.Loop_key128: +.byte 102,15,56,0,197 +.byte 102,15,56,221,196 + pslld $1,%xmm4 + leaq 16(%rax),%rax + + movdqa %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm3,%xmm2 + + pxor %xmm2,%xmm0 + movdqu %xmm0,-16(%rax) + movdqa %xmm0,%xmm2 + + decl %r10d + jnz .Loop_key128 + + movdqa .Lkey_rcon1b(%rip),%xmm4 + +.byte 102,15,56,0,197 +.byte 102,15,56,221,196 + pslld $1,%xmm4 + + movdqa %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm3,%xmm2 + + pxor %xmm2,%xmm0 + movdqu %xmm0,(%rax) + + movdqa %xmm0,%xmm2 +.byte 102,15,56,0,197 +.byte 102,15,56,221,196 + + movdqa %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm3,%xmm2 + + pxor %xmm2,%xmm0 + movdqu %xmm0,16(%rax) + + movl %esi,96(%rax) + xorl %eax,%eax + jmp .Lenc_key_ret + +.align 16 +.L12rounds: + movq 16(%rdi),%xmm2 + movl $11,%esi + cmpl $268435456,%r10d + je .L12rounds_alt + + movups %xmm0,(%rdx) +.byte 102,15,58,223,202,1 + call .Lkey_expansion_192a_cold +.byte 102,15,58,223,202,2 + call .Lkey_expansion_192b +.byte 102,15,58,223,202,4 + call .Lkey_expansion_192a +.byte 102,15,58,223,202,8 + call .Lkey_expansion_192b +.byte 102,15,58,223,202,16 + call .Lkey_expansion_192a +.byte 102,15,58,223,202,32 + call .Lkey_expansion_192b +.byte 102,15,58,223,202,64 + call .Lkey_expansion_192a +.byte 102,15,58,223,202,128 + call .Lkey_expansion_192b + movups %xmm0,(%rax) + movl %esi,48(%rax) + xorq %rax,%rax + jmp .Lenc_key_ret + +.align 16 +.L12rounds_alt: + movdqa .Lkey_rotate192(%rip),%xmm5 + movdqa .Lkey_rcon1(%rip),%xmm4 + movl $8,%r10d + movdqu %xmm0,(%rdx) + jmp .Loop_key192 + +.align 16 +.Loop_key192: + movq %xmm2,0(%rax) + movdqa %xmm2,%xmm1 +.byte 102,15,56,0,213 +.byte 102,15,56,221,212 + pslld $1,%xmm4 + leaq 24(%rax),%rax + + movdqa %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm3,%xmm0 + + pshufd $255,%xmm0,%xmm3 + pxor %xmm1,%xmm3 + pslldq $4,%xmm1 + pxor %xmm1,%xmm3 + + pxor %xmm2,%xmm0 + pxor %xmm3,%xmm2 + movdqu %xmm0,-16(%rax) + + decl %r10d + jnz .Loop_key192 + + movl %esi,32(%rax) + xorl %eax,%eax + jmp .Lenc_key_ret + +.align 16 +.L14rounds: + movups 16(%rdi),%xmm2 + movl $13,%esi + leaq 16(%rax),%rax + cmpl $268435456,%r10d + je .L14rounds_alt + + movups %xmm0,(%rdx) + movups %xmm2,16(%rdx) +.byte 102,15,58,223,202,1 + call .Lkey_expansion_256a_cold +.byte 102,15,58,223,200,1 + call .Lkey_expansion_256b +.byte 102,15,58,223,202,2 + call .Lkey_expansion_256a +.byte 102,15,58,223,200,2 + call .Lkey_expansion_256b +.byte 102,15,58,223,202,4 + call .Lkey_expansion_256a +.byte 102,15,58,223,200,4 + call .Lkey_expansion_256b +.byte 102,15,58,223,202,8 + call .Lkey_expansion_256a +.byte 102,15,58,223,200,8 + call .Lkey_expansion_256b +.byte 102,15,58,223,202,16 + call .Lkey_expansion_256a +.byte 102,15,58,223,200,16 + call .Lkey_expansion_256b +.byte 102,15,58,223,202,32 + call .Lkey_expansion_256a +.byte 102,15,58,223,200,32 + call .Lkey_expansion_256b +.byte 102,15,58,223,202,64 + call .Lkey_expansion_256a + movups %xmm0,(%rax) + movl %esi,16(%rax) + xorq %rax,%rax + jmp .Lenc_key_ret + +.align 16 +.L14rounds_alt: + movdqa .Lkey_rotate(%rip),%xmm5 + movdqa .Lkey_rcon1(%rip),%xmm4 + movl $7,%r10d + movdqu %xmm0,0(%rdx) + movdqa %xmm2,%xmm1 + movdqu %xmm2,16(%rdx) + jmp .Loop_key256 + +.align 16 +.Loop_key256: +.byte 102,15,56,0,213 +.byte 102,15,56,221,212 + + movdqa %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm3,%xmm0 + pslld $1,%xmm4 + + pxor %xmm2,%xmm0 + movdqu %xmm0,(%rax) + + decl %r10d + jz .Ldone_key256 + + pshufd $255,%xmm0,%xmm2 + pxor %xmm3,%xmm3 +.byte 102,15,56,221,211 + + movdqa %xmm1,%xmm3 + pslldq $4,%xmm1 + pxor %xmm1,%xmm3 + pslldq $4,%xmm1 + pxor %xmm1,%xmm3 + pslldq $4,%xmm1 + pxor %xmm3,%xmm1 + + pxor %xmm1,%xmm2 + movdqu %xmm2,16(%rax) + leaq 32(%rax),%rax + movdqa %xmm2,%xmm1 + + jmp .Loop_key256 + +.Ldone_key256: + movl %esi,16(%rax) + xorl %eax,%eax + jmp .Lenc_key_ret + +.align 16 +.Lbad_keybits: + movq $-2,%rax +.Lenc_key_ret: + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + addq $8,%rsp + .byte 0xf3,0xc3 +.LSEH_end_set_encrypt_key: + +.align 16 +.Lkey_expansion_128: + movups %xmm0,(%rax) + leaq 16(%rax),%rax +.Lkey_expansion_128_cold: + shufps $16,%xmm0,%xmm4 + xorps %xmm4,%xmm0 + shufps $140,%xmm0,%xmm4 + xorps %xmm4,%xmm0 + shufps $255,%xmm1,%xmm1 + xorps %xmm1,%xmm0 + .byte 0xf3,0xc3 + +.align 16 +.Lkey_expansion_192a: + movups %xmm0,(%rax) + leaq 16(%rax),%rax +.Lkey_expansion_192a_cold: + movaps %xmm2,%xmm5 +.Lkey_expansion_192b_warm: + shufps $16,%xmm0,%xmm4 + movdqa %xmm2,%xmm3 + xorps %xmm4,%xmm0 + shufps $140,%xmm0,%xmm4 + pslldq $4,%xmm3 + xorps %xmm4,%xmm0 + pshufd $85,%xmm1,%xmm1 + pxor %xmm3,%xmm2 + pxor %xmm1,%xmm0 + pshufd $255,%xmm0,%xmm3 + pxor %xmm3,%xmm2 + .byte 0xf3,0xc3 + +.align 16 +.Lkey_expansion_192b: + movaps %xmm0,%xmm3 + shufps $68,%xmm0,%xmm5 + movups %xmm5,(%rax) + shufps $78,%xmm2,%xmm3 + movups %xmm3,16(%rax) + leaq 32(%rax),%rax + jmp .Lkey_expansion_192b_warm + +.align 16 +.Lkey_expansion_256a: + movups %xmm2,(%rax) + leaq 16(%rax),%rax +.Lkey_expansion_256a_cold: + shufps $16,%xmm0,%xmm4 + xorps %xmm4,%xmm0 + shufps $140,%xmm0,%xmm4 + xorps %xmm4,%xmm0 + shufps $255,%xmm1,%xmm1 + xorps %xmm1,%xmm0 + .byte 0xf3,0xc3 + +.align 16 +.Lkey_expansion_256b: + movups %xmm0,(%rax) + leaq 16(%rax),%rax + + shufps $16,%xmm2,%xmm4 + xorps %xmm4,%xmm2 + shufps $140,%xmm2,%xmm4 + xorps %xmm4,%xmm2 + shufps $170,%xmm1,%xmm1 + xorps %xmm1,%xmm2 + .byte 0xf3,0xc3 +.size aesni_set_encrypt_key,.-aesni_set_encrypt_key +.size __aesni_set_encrypt_key,.-__aesni_set_encrypt_key +.align 64 +.Lbswap_mask: +.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 +.Lincrement32: +.long 6,6,6,0 +.Lincrement64: +.long 1,0,0,0 +.Lxts_magic: +.long 0x87,0,1,0 +.Lincrement1: +.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 +.Lkey_rotate: +.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d +.Lkey_rotate192: +.long 0x04070605,0x04070605,0x04070605,0x04070605 +.Lkey_rcon1: +.long 1,1,1,1 +.Lkey_rcon1b: +.long 0x1b,0x1b,0x1b,0x1b + +.byte 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69,83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 64 diff --git a/secure/lib/libcrypto/amd64/bsaes-x86_64.S b/secure/lib/libcrypto/amd64/bsaes-x86_64.S new file mode 100644 index 0000000..be410de --- /dev/null +++ b/secure/lib/libcrypto/amd64/bsaes-x86_64.S @@ -0,0 +1,2499 @@ + # $FreeBSD$ +.text + + + + +.type _bsaes_encrypt8,@function +.align 64 +_bsaes_encrypt8: + leaq .LBS0(%rip),%r11 + + movdqa (%rax),%xmm8 + leaq 16(%rax),%rax + movdqa 80(%r11),%xmm7 + pxor %xmm8,%xmm15 + pxor %xmm8,%xmm0 + pxor %xmm8,%xmm1 + pxor %xmm8,%xmm2 +.byte 102,68,15,56,0,255 +.byte 102,15,56,0,199 + pxor %xmm8,%xmm3 + pxor %xmm8,%xmm4 +.byte 102,15,56,0,207 +.byte 102,15,56,0,215 + pxor %xmm8,%xmm5 + pxor %xmm8,%xmm6 +.byte 102,15,56,0,223 +.byte 102,15,56,0,231 +.byte 102,15,56,0,239 +.byte 102,15,56,0,247 +_bsaes_encrypt8_bitslice: + movdqa 0(%r11),%xmm7 + movdqa 16(%r11),%xmm8 + movdqa %xmm5,%xmm9 + psrlq $1,%xmm5 + movdqa %xmm3,%xmm10 + psrlq $1,%xmm3 + pxor %xmm6,%xmm5 + pxor %xmm4,%xmm3 + pand %xmm7,%xmm5 + pand %xmm7,%xmm3 + pxor %xmm5,%xmm6 + psllq $1,%xmm5 + pxor %xmm3,%xmm4 + psllq $1,%xmm3 + pxor %xmm9,%xmm5 + pxor %xmm10,%xmm3 + movdqa %xmm1,%xmm9 + psrlq $1,%xmm1 + movdqa %xmm15,%xmm10 + psrlq $1,%xmm15 + pxor %xmm2,%xmm1 + pxor %xmm0,%xmm15 + pand %xmm7,%xmm1 + pand %xmm7,%xmm15 + pxor %xmm1,%xmm2 + psllq $1,%xmm1 + pxor %xmm15,%xmm0 + psllq $1,%xmm15 + pxor %xmm9,%xmm1 + pxor %xmm10,%xmm15 + movdqa 32(%r11),%xmm7 + movdqa %xmm4,%xmm9 + psrlq $2,%xmm4 + movdqa %xmm3,%xmm10 + psrlq $2,%xmm3 + pxor %xmm6,%xmm4 + pxor %xmm5,%xmm3 + pand %xmm8,%xmm4 + pand %xmm8,%xmm3 + pxor %xmm4,%xmm6 + psllq $2,%xmm4 + pxor %xmm3,%xmm5 + psllq $2,%xmm3 + pxor %xmm9,%xmm4 + pxor %xmm10,%xmm3 + movdqa %xmm0,%xmm9 + psrlq $2,%xmm0 + movdqa %xmm15,%xmm10 + psrlq $2,%xmm15 + pxor %xmm2,%xmm0 + pxor %xmm1,%xmm15 + pand %xmm8,%xmm0 + pand %xmm8,%xmm15 + pxor %xmm0,%xmm2 + psllq $2,%xmm0 + pxor %xmm15,%xmm1 + psllq $2,%xmm15 + pxor %xmm9,%xmm0 + pxor %xmm10,%xmm15 + movdqa %xmm2,%xmm9 + psrlq $4,%xmm2 + movdqa %xmm1,%xmm10 + psrlq $4,%xmm1 + pxor %xmm6,%xmm2 + pxor %xmm5,%xmm1 + pand %xmm7,%xmm2 + pand %xmm7,%xmm1 + pxor %xmm2,%xmm6 + psllq $4,%xmm2 + pxor %xmm1,%xmm5 + psllq $4,%xmm1 + pxor %xmm9,%xmm2 + pxor %xmm10,%xmm1 + movdqa %xmm0,%xmm9 + psrlq $4,%xmm0 + movdqa %xmm15,%xmm10 + psrlq $4,%xmm15 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pand %xmm7,%xmm0 + pand %xmm7,%xmm15 + pxor %xmm0,%xmm4 + psllq $4,%xmm0 + pxor %xmm15,%xmm3 + psllq $4,%xmm15 + pxor %xmm9,%xmm0 + pxor %xmm10,%xmm15 + decl %r10d + jmp .Lenc_sbox +.align 16 +.Lenc_loop: + pxor 0(%rax),%xmm15 + pxor 16(%rax),%xmm0 + pxor 32(%rax),%xmm1 + pxor 48(%rax),%xmm2 +.byte 102,68,15,56,0,255 +.byte 102,15,56,0,199 + pxor 64(%rax),%xmm3 + pxor 80(%rax),%xmm4 +.byte 102,15,56,0,207 +.byte 102,15,56,0,215 + pxor 96(%rax),%xmm5 + pxor 112(%rax),%xmm6 +.byte 102,15,56,0,223 +.byte 102,15,56,0,231 +.byte 102,15,56,0,239 +.byte 102,15,56,0,247 + leaq 128(%rax),%rax +.Lenc_sbox: + pxor %xmm5,%xmm4 + pxor %xmm0,%xmm1 + pxor %xmm15,%xmm2 + pxor %xmm1,%xmm5 + pxor %xmm15,%xmm4 + + pxor %xmm2,%xmm5 + pxor %xmm6,%xmm2 + pxor %xmm4,%xmm6 + pxor %xmm3,%xmm2 + pxor %xmm4,%xmm3 + pxor %xmm0,%xmm2 + + pxor %xmm6,%xmm1 + pxor %xmm4,%xmm0 + movdqa %xmm6,%xmm10 + movdqa %xmm0,%xmm9 + movdqa %xmm4,%xmm8 + movdqa %xmm1,%xmm12 + movdqa %xmm5,%xmm11 + + pxor %xmm3,%xmm10 + pxor %xmm1,%xmm9 + pxor %xmm2,%xmm8 + movdqa %xmm10,%xmm13 + pxor %xmm3,%xmm12 + movdqa %xmm9,%xmm7 + pxor %xmm15,%xmm11 + movdqa %xmm10,%xmm14 + + por %xmm8,%xmm9 + por %xmm11,%xmm10 + pxor %xmm7,%xmm14 + pand %xmm11,%xmm13 + pxor %xmm8,%xmm11 + pand %xmm8,%xmm7 + pand %xmm11,%xmm14 + movdqa %xmm2,%xmm11 + pxor %xmm15,%xmm11 + pand %xmm11,%xmm12 + pxor %xmm12,%xmm10 + pxor %xmm12,%xmm9 + movdqa %xmm6,%xmm12 + movdqa %xmm4,%xmm11 + pxor %xmm0,%xmm12 + pxor %xmm5,%xmm11 + movdqa %xmm12,%xmm8 + pand %xmm11,%xmm12 + por %xmm11,%xmm8 + pxor %xmm12,%xmm7 + pxor %xmm14,%xmm10 + pxor %xmm13,%xmm9 + pxor %xmm14,%xmm8 + movdqa %xmm1,%xmm11 + pxor %xmm13,%xmm7 + movdqa %xmm3,%xmm12 + pxor %xmm13,%xmm8 + movdqa %xmm0,%xmm13 + pand %xmm2,%xmm11 + movdqa %xmm6,%xmm14 + pand %xmm15,%xmm12 + pand %xmm4,%xmm13 + por %xmm5,%xmm14 + pxor %xmm11,%xmm10 + pxor %xmm12,%xmm9 + pxor %xmm13,%xmm8 + pxor %xmm14,%xmm7 + + + + + + movdqa %xmm10,%xmm11 + pand %xmm8,%xmm10 + pxor %xmm9,%xmm11 + + movdqa %xmm7,%xmm13 + movdqa %xmm11,%xmm14 + pxor %xmm10,%xmm13 + pand %xmm13,%xmm14 + + movdqa %xmm8,%xmm12 + pxor %xmm9,%xmm14 + pxor %xmm7,%xmm12 + + pxor %xmm9,%xmm10 + + pand %xmm10,%xmm12 + + movdqa %xmm13,%xmm9 + pxor %xmm7,%xmm12 + + pxor %xmm12,%xmm9 + pxor %xmm12,%xmm8 + + pand %xmm7,%xmm9 + + pxor %xmm9,%xmm13 + pxor %xmm9,%xmm8 + + pand %xmm14,%xmm13 + + pxor %xmm11,%xmm13 + movdqa %xmm5,%xmm11 + movdqa %xmm4,%xmm7 + movdqa %xmm14,%xmm9 + pxor %xmm13,%xmm9 + pand %xmm5,%xmm9 + pxor %xmm4,%xmm5 + pand %xmm14,%xmm4 + pand %xmm13,%xmm5 + pxor %xmm4,%xmm5 + pxor %xmm9,%xmm4 + pxor %xmm15,%xmm11 + pxor %xmm2,%xmm7 + pxor %xmm12,%xmm14 + pxor %xmm8,%xmm13 + movdqa %xmm14,%xmm10 + movdqa %xmm12,%xmm9 + pxor %xmm13,%xmm10 + pxor %xmm8,%xmm9 + pand %xmm11,%xmm10 + pand %xmm15,%xmm9 + pxor %xmm7,%xmm11 + pxor %xmm2,%xmm15 + pand %xmm14,%xmm7 + pand %xmm12,%xmm2 + pand %xmm13,%xmm11 + pand %xmm8,%xmm15 + pxor %xmm11,%xmm7 + pxor %xmm2,%xmm15 + pxor %xmm10,%xmm11 + pxor %xmm9,%xmm2 + pxor %xmm11,%xmm5 + pxor %xmm11,%xmm15 + pxor %xmm7,%xmm4 + pxor %xmm7,%xmm2 + + movdqa %xmm6,%xmm11 + movdqa %xmm0,%xmm7 + pxor %xmm3,%xmm11 + pxor %xmm1,%xmm7 + movdqa %xmm14,%xmm10 + movdqa %xmm12,%xmm9 + pxor %xmm13,%xmm10 + pxor %xmm8,%xmm9 + pand %xmm11,%xmm10 + pand %xmm3,%xmm9 + pxor %xmm7,%xmm11 + pxor %xmm1,%xmm3 + pand %xmm14,%xmm7 + pand %xmm12,%xmm1 + pand %xmm13,%xmm11 + pand %xmm8,%xmm3 + pxor %xmm11,%xmm7 + pxor %xmm1,%xmm3 + pxor %xmm10,%xmm11 + pxor %xmm9,%xmm1 + pxor %xmm12,%xmm14 + pxor %xmm8,%xmm13 + movdqa %xmm14,%xmm10 + pxor %xmm13,%xmm10 + pand %xmm6,%xmm10 + pxor %xmm0,%xmm6 + pand %xmm14,%xmm0 + pand %xmm13,%xmm6 + pxor %xmm0,%xmm6 + pxor %xmm10,%xmm0 + pxor %xmm11,%xmm6 + pxor %xmm11,%xmm3 + pxor %xmm7,%xmm0 + pxor %xmm7,%xmm1 + pxor %xmm15,%xmm6 + pxor %xmm5,%xmm0 + pxor %xmm6,%xmm3 + pxor %xmm15,%xmm5 + pxor %xmm0,%xmm15 + + pxor %xmm4,%xmm0 + pxor %xmm1,%xmm4 + pxor %xmm2,%xmm1 + pxor %xmm4,%xmm2 + pxor %xmm4,%xmm3 + + pxor %xmm2,%xmm5 + decl %r10d + jl .Lenc_done + pshufd $147,%xmm15,%xmm7 + pshufd $147,%xmm0,%xmm8 + pxor %xmm7,%xmm15 + pshufd $147,%xmm3,%xmm9 + pxor %xmm8,%xmm0 + pshufd $147,%xmm5,%xmm10 + pxor %xmm9,%xmm3 + pshufd $147,%xmm2,%xmm11 + pxor %xmm10,%xmm5 + pshufd $147,%xmm6,%xmm12 + pxor %xmm11,%xmm2 + pshufd $147,%xmm1,%xmm13 + pxor %xmm12,%xmm6 + pshufd $147,%xmm4,%xmm14 + pxor %xmm13,%xmm1 + pxor %xmm14,%xmm4 + + pxor %xmm15,%xmm8 + pxor %xmm4,%xmm7 + pxor %xmm4,%xmm8 + pshufd $78,%xmm15,%xmm15 + pxor %xmm0,%xmm9 + pshufd $78,%xmm0,%xmm0 + pxor %xmm2,%xmm12 + pxor %xmm7,%xmm15 + pxor %xmm6,%xmm13 + pxor %xmm8,%xmm0 + pxor %xmm5,%xmm11 + pshufd $78,%xmm2,%xmm7 + pxor %xmm1,%xmm14 + pshufd $78,%xmm6,%xmm8 + pxor %xmm3,%xmm10 + pshufd $78,%xmm5,%xmm2 + pxor %xmm4,%xmm10 + pshufd $78,%xmm4,%xmm6 + pxor %xmm4,%xmm11 + pshufd $78,%xmm1,%xmm5 + pxor %xmm11,%xmm7 + pshufd $78,%xmm3,%xmm1 + pxor %xmm12,%xmm8 + pxor %xmm10,%xmm2 + pxor %xmm14,%xmm6 + pxor %xmm13,%xmm5 + movdqa %xmm7,%xmm3 + pxor %xmm9,%xmm1 + movdqa %xmm8,%xmm4 + movdqa 48(%r11),%xmm7 + jnz .Lenc_loop + movdqa 64(%r11),%xmm7 + jmp .Lenc_loop +.align 16 +.Lenc_done: + movdqa 0(%r11),%xmm7 + movdqa 16(%r11),%xmm8 + movdqa %xmm1,%xmm9 + psrlq $1,%xmm1 + movdqa %xmm2,%xmm10 + psrlq $1,%xmm2 + pxor %xmm4,%xmm1 + pxor %xmm6,%xmm2 + pand %xmm7,%xmm1 + pand %xmm7,%xmm2 + pxor %xmm1,%xmm4 + psllq $1,%xmm1 + pxor %xmm2,%xmm6 + psllq $1,%xmm2 + pxor %xmm9,%xmm1 + pxor %xmm10,%xmm2 + movdqa %xmm3,%xmm9 + psrlq $1,%xmm3 + movdqa %xmm15,%xmm10 + psrlq $1,%xmm15 + pxor %xmm5,%xmm3 + pxor %xmm0,%xmm15 + pand %xmm7,%xmm3 + pand %xmm7,%xmm15 + pxor %xmm3,%xmm5 + psllq $1,%xmm3 + pxor %xmm15,%xmm0 + psllq $1,%xmm15 + pxor %xmm9,%xmm3 + pxor %xmm10,%xmm15 + movdqa 32(%r11),%xmm7 + movdqa %xmm6,%xmm9 + psrlq $2,%xmm6 + movdqa %xmm2,%xmm10 + psrlq $2,%xmm2 + pxor %xmm4,%xmm6 + pxor %xmm1,%xmm2 + pand %xmm8,%xmm6 + pand %xmm8,%xmm2 + pxor %xmm6,%xmm4 + psllq $2,%xmm6 + pxor %xmm2,%xmm1 + psllq $2,%xmm2 + pxor %xmm9,%xmm6 + pxor %xmm10,%xmm2 + movdqa %xmm0,%xmm9 + psrlq $2,%xmm0 + movdqa %xmm15,%xmm10 + psrlq $2,%xmm15 + pxor %xmm5,%xmm0 + pxor %xmm3,%xmm15 + pand %xmm8,%xmm0 + pand %xmm8,%xmm15 + pxor %xmm0,%xmm5 + psllq $2,%xmm0 + pxor %xmm15,%xmm3 + psllq $2,%xmm15 + pxor %xmm9,%xmm0 + pxor %xmm10,%xmm15 + movdqa %xmm5,%xmm9 + psrlq $4,%xmm5 + movdqa %xmm3,%xmm10 + psrlq $4,%xmm3 + pxor %xmm4,%xmm5 + pxor %xmm1,%xmm3 + pand %xmm7,%xmm5 + pand %xmm7,%xmm3 + pxor %xmm5,%xmm4 + psllq $4,%xmm5 + pxor %xmm3,%xmm1 + psllq $4,%xmm3 + pxor %xmm9,%xmm5 + pxor %xmm10,%xmm3 + movdqa %xmm0,%xmm9 + psrlq $4,%xmm0 + movdqa %xmm15,%xmm10 + psrlq $4,%xmm15 + pxor %xmm6,%xmm0 + pxor %xmm2,%xmm15 + pand %xmm7,%xmm0 + pand %xmm7,%xmm15 + pxor %xmm0,%xmm6 + psllq $4,%xmm0 + pxor %xmm15,%xmm2 + psllq $4,%xmm15 + pxor %xmm9,%xmm0 + pxor %xmm10,%xmm15 + movdqa (%rax),%xmm7 + pxor %xmm7,%xmm3 + pxor %xmm7,%xmm5 + pxor %xmm7,%xmm2 + pxor %xmm7,%xmm6 + pxor %xmm7,%xmm1 + pxor %xmm7,%xmm4 + pxor %xmm7,%xmm15 + pxor %xmm7,%xmm0 + .byte 0xf3,0xc3 +.size _bsaes_encrypt8,.-_bsaes_encrypt8 + +.type _bsaes_decrypt8,@function +.align 64 +_bsaes_decrypt8: + leaq .LBS0(%rip),%r11 + + movdqa (%rax),%xmm8 + leaq 16(%rax),%rax + movdqa -48(%r11),%xmm7 + pxor %xmm8,%xmm15 + pxor %xmm8,%xmm0 + pxor %xmm8,%xmm1 + pxor %xmm8,%xmm2 +.byte 102,68,15,56,0,255 +.byte 102,15,56,0,199 + pxor %xmm8,%xmm3 + pxor %xmm8,%xmm4 +.byte 102,15,56,0,207 +.byte 102,15,56,0,215 + pxor %xmm8,%xmm5 + pxor %xmm8,%xmm6 +.byte 102,15,56,0,223 +.byte 102,15,56,0,231 +.byte 102,15,56,0,239 +.byte 102,15,56,0,247 + movdqa 0(%r11),%xmm7 + movdqa 16(%r11),%xmm8 + movdqa %xmm5,%xmm9 + psrlq $1,%xmm5 + movdqa %xmm3,%xmm10 + psrlq $1,%xmm3 + pxor %xmm6,%xmm5 + pxor %xmm4,%xmm3 + pand %xmm7,%xmm5 + pand %xmm7,%xmm3 + pxor %xmm5,%xmm6 + psllq $1,%xmm5 + pxor %xmm3,%xmm4 + psllq $1,%xmm3 + pxor %xmm9,%xmm5 + pxor %xmm10,%xmm3 + movdqa %xmm1,%xmm9 + psrlq $1,%xmm1 + movdqa %xmm15,%xmm10 + psrlq $1,%xmm15 + pxor %xmm2,%xmm1 + pxor %xmm0,%xmm15 + pand %xmm7,%xmm1 + pand %xmm7,%xmm15 + pxor %xmm1,%xmm2 + psllq $1,%xmm1 + pxor %xmm15,%xmm0 + psllq $1,%xmm15 + pxor %xmm9,%xmm1 + pxor %xmm10,%xmm15 + movdqa 32(%r11),%xmm7 + movdqa %xmm4,%xmm9 + psrlq $2,%xmm4 + movdqa %xmm3,%xmm10 + psrlq $2,%xmm3 + pxor %xmm6,%xmm4 + pxor %xmm5,%xmm3 + pand %xmm8,%xmm4 + pand %xmm8,%xmm3 + pxor %xmm4,%xmm6 + psllq $2,%xmm4 + pxor %xmm3,%xmm5 + psllq $2,%xmm3 + pxor %xmm9,%xmm4 + pxor %xmm10,%xmm3 + movdqa %xmm0,%xmm9 + psrlq $2,%xmm0 + movdqa %xmm15,%xmm10 + psrlq $2,%xmm15 + pxor %xmm2,%xmm0 + pxor %xmm1,%xmm15 + pand %xmm8,%xmm0 + pand %xmm8,%xmm15 + pxor %xmm0,%xmm2 + psllq $2,%xmm0 + pxor %xmm15,%xmm1 + psllq $2,%xmm15 + pxor %xmm9,%xmm0 + pxor %xmm10,%xmm15 + movdqa %xmm2,%xmm9 + psrlq $4,%xmm2 + movdqa %xmm1,%xmm10 + psrlq $4,%xmm1 + pxor %xmm6,%xmm2 + pxor %xmm5,%xmm1 + pand %xmm7,%xmm2 + pand %xmm7,%xmm1 + pxor %xmm2,%xmm6 + psllq $4,%xmm2 + pxor %xmm1,%xmm5 + psllq $4,%xmm1 + pxor %xmm9,%xmm2 + pxor %xmm10,%xmm1 + movdqa %xmm0,%xmm9 + psrlq $4,%xmm0 + movdqa %xmm15,%xmm10 + psrlq $4,%xmm15 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pand %xmm7,%xmm0 + pand %xmm7,%xmm15 + pxor %xmm0,%xmm4 + psllq $4,%xmm0 + pxor %xmm15,%xmm3 + psllq $4,%xmm15 + pxor %xmm9,%xmm0 + pxor %xmm10,%xmm15 + decl %r10d + jmp .Ldec_sbox +.align 16 +.Ldec_loop: + pxor 0(%rax),%xmm15 + pxor 16(%rax),%xmm0 + pxor 32(%rax),%xmm1 + pxor 48(%rax),%xmm2 +.byte 102,68,15,56,0,255 +.byte 102,15,56,0,199 + pxor 64(%rax),%xmm3 + pxor 80(%rax),%xmm4 +.byte 102,15,56,0,207 +.byte 102,15,56,0,215 + pxor 96(%rax),%xmm5 + pxor 112(%rax),%xmm6 +.byte 102,15,56,0,223 +.byte 102,15,56,0,231 +.byte 102,15,56,0,239 +.byte 102,15,56,0,247 + leaq 128(%rax),%rax +.Ldec_sbox: + pxor %xmm3,%xmm2 + + pxor %xmm6,%xmm3 + pxor %xmm6,%xmm1 + pxor %xmm3,%xmm5 + pxor %xmm5,%xmm6 + pxor %xmm6,%xmm0 + + pxor %xmm0,%xmm15 + pxor %xmm4,%xmm1 + pxor %xmm15,%xmm2 + pxor %xmm15,%xmm4 + pxor %xmm2,%xmm0 + movdqa %xmm2,%xmm10 + movdqa %xmm6,%xmm9 + movdqa %xmm0,%xmm8 + movdqa %xmm3,%xmm12 + movdqa %xmm4,%xmm11 + + pxor %xmm15,%xmm10 + pxor %xmm3,%xmm9 + pxor %xmm5,%xmm8 + movdqa %xmm10,%xmm13 + pxor %xmm15,%xmm12 + movdqa %xmm9,%xmm7 + pxor %xmm1,%xmm11 + movdqa %xmm10,%xmm14 + + por %xmm8,%xmm9 + por %xmm11,%xmm10 + pxor %xmm7,%xmm14 + pand %xmm11,%xmm13 + pxor %xmm8,%xmm11 + pand %xmm8,%xmm7 + pand %xmm11,%xmm14 + movdqa %xmm5,%xmm11 + pxor %xmm1,%xmm11 + pand %xmm11,%xmm12 + pxor %xmm12,%xmm10 + pxor %xmm12,%xmm9 + movdqa %xmm2,%xmm12 + movdqa %xmm0,%xmm11 + pxor %xmm6,%xmm12 + pxor %xmm4,%xmm11 + movdqa %xmm12,%xmm8 + pand %xmm11,%xmm12 + por %xmm11,%xmm8 + pxor %xmm12,%xmm7 + pxor %xmm14,%xmm10 + pxor %xmm13,%xmm9 + pxor %xmm14,%xmm8 + movdqa %xmm3,%xmm11 + pxor %xmm13,%xmm7 + movdqa %xmm15,%xmm12 + pxor %xmm13,%xmm8 + movdqa %xmm6,%xmm13 + pand %xmm5,%xmm11 + movdqa %xmm2,%xmm14 + pand %xmm1,%xmm12 + pand %xmm0,%xmm13 + por %xmm4,%xmm14 + pxor %xmm11,%xmm10 + pxor %xmm12,%xmm9 + pxor %xmm13,%xmm8 + pxor %xmm14,%xmm7 + + + + + + movdqa %xmm10,%xmm11 + pand %xmm8,%xmm10 + pxor %xmm9,%xmm11 + + movdqa %xmm7,%xmm13 + movdqa %xmm11,%xmm14 + pxor %xmm10,%xmm13 + pand %xmm13,%xmm14 + + movdqa %xmm8,%xmm12 + pxor %xmm9,%xmm14 + pxor %xmm7,%xmm12 + + pxor %xmm9,%xmm10 + + pand %xmm10,%xmm12 + + movdqa %xmm13,%xmm9 + pxor %xmm7,%xmm12 + + pxor %xmm12,%xmm9 + pxor %xmm12,%xmm8 + + pand %xmm7,%xmm9 + + pxor %xmm9,%xmm13 + pxor %xmm9,%xmm8 + + pand %xmm14,%xmm13 + + pxor %xmm11,%xmm13 + movdqa %xmm4,%xmm11 + movdqa %xmm0,%xmm7 + movdqa %xmm14,%xmm9 + pxor %xmm13,%xmm9 + pand %xmm4,%xmm9 + pxor %xmm0,%xmm4 + pand %xmm14,%xmm0 + pand %xmm13,%xmm4 + pxor %xmm0,%xmm4 + pxor %xmm9,%xmm0 + pxor %xmm1,%xmm11 + pxor %xmm5,%xmm7 + pxor %xmm12,%xmm14 + pxor %xmm8,%xmm13 + movdqa %xmm14,%xmm10 + movdqa %xmm12,%xmm9 + pxor %xmm13,%xmm10 + pxor %xmm8,%xmm9 + pand %xmm11,%xmm10 + pand %xmm1,%xmm9 + pxor %xmm7,%xmm11 + pxor %xmm5,%xmm1 + pand %xmm14,%xmm7 + pand %xmm12,%xmm5 + pand %xmm13,%xmm11 + pand %xmm8,%xmm1 + pxor %xmm11,%xmm7 + pxor %xmm5,%xmm1 + pxor %xmm10,%xmm11 + pxor %xmm9,%xmm5 + pxor %xmm11,%xmm4 + pxor %xmm11,%xmm1 + pxor %xmm7,%xmm0 + pxor %xmm7,%xmm5 + + movdqa %xmm2,%xmm11 + movdqa %xmm6,%xmm7 + pxor %xmm15,%xmm11 + pxor %xmm3,%xmm7 + movdqa %xmm14,%xmm10 + movdqa %xmm12,%xmm9 + pxor %xmm13,%xmm10 + pxor %xmm8,%xmm9 + pand %xmm11,%xmm10 + pand %xmm15,%xmm9 + pxor %xmm7,%xmm11 + pxor %xmm3,%xmm15 + pand %xmm14,%xmm7 + pand %xmm12,%xmm3 + pand %xmm13,%xmm11 + pand %xmm8,%xmm15 + pxor %xmm11,%xmm7 + pxor %xmm3,%xmm15 + pxor %xmm10,%xmm11 + pxor %xmm9,%xmm3 + pxor %xmm12,%xmm14 + pxor %xmm8,%xmm13 + movdqa %xmm14,%xmm10 + pxor %xmm13,%xmm10 + pand %xmm2,%xmm10 + pxor %xmm6,%xmm2 + pand %xmm14,%xmm6 + pand %xmm13,%xmm2 + pxor %xmm6,%xmm2 + pxor %xmm10,%xmm6 + pxor %xmm11,%xmm2 + pxor %xmm11,%xmm15 + pxor %xmm7,%xmm6 + pxor %xmm7,%xmm3 + pxor %xmm6,%xmm0 + pxor %xmm4,%xmm5 + + pxor %xmm0,%xmm3 + pxor %xmm6,%xmm1 + pxor %xmm6,%xmm4 + pxor %xmm1,%xmm3 + pxor %xmm15,%xmm6 + pxor %xmm4,%xmm3 + pxor %xmm5,%xmm2 + pxor %xmm0,%xmm5 + pxor %xmm3,%xmm2 + + pxor %xmm15,%xmm3 + pxor %xmm2,%xmm6 + decl %r10d + jl .Ldec_done + + pshufd $78,%xmm15,%xmm7 + pshufd $78,%xmm2,%xmm13 + pxor %xmm15,%xmm7 + pshufd $78,%xmm4,%xmm14 + pxor %xmm2,%xmm13 + pshufd $78,%xmm0,%xmm8 + pxor %xmm4,%xmm14 + pshufd $78,%xmm5,%xmm9 + pxor %xmm0,%xmm8 + pshufd $78,%xmm3,%xmm10 + pxor %xmm5,%xmm9 + pxor %xmm13,%xmm15 + pxor %xmm13,%xmm0 + pshufd $78,%xmm1,%xmm11 + pxor %xmm3,%xmm10 + pxor %xmm7,%xmm5 + pxor %xmm8,%xmm3 + pshufd $78,%xmm6,%xmm12 + pxor %xmm1,%xmm11 + pxor %xmm14,%xmm0 + pxor %xmm9,%xmm1 + pxor %xmm6,%xmm12 + + pxor %xmm14,%xmm5 + pxor %xmm13,%xmm3 + pxor %xmm13,%xmm1 + pxor %xmm10,%xmm6 + pxor %xmm11,%xmm2 + pxor %xmm14,%xmm1 + pxor %xmm14,%xmm6 + pxor %xmm12,%xmm4 + pshufd $147,%xmm15,%xmm7 + pshufd $147,%xmm0,%xmm8 + pxor %xmm7,%xmm15 + pshufd $147,%xmm5,%xmm9 + pxor %xmm8,%xmm0 + pshufd $147,%xmm3,%xmm10 + pxor %xmm9,%xmm5 + pshufd $147,%xmm1,%xmm11 + pxor %xmm10,%xmm3 + pshufd $147,%xmm6,%xmm12 + pxor %xmm11,%xmm1 + pshufd $147,%xmm2,%xmm13 + pxor %xmm12,%xmm6 + pshufd $147,%xmm4,%xmm14 + pxor %xmm13,%xmm2 + pxor %xmm14,%xmm4 + + pxor %xmm15,%xmm8 + pxor %xmm4,%xmm7 + pxor %xmm4,%xmm8 + pshufd $78,%xmm15,%xmm15 + pxor %xmm0,%xmm9 + pshufd $78,%xmm0,%xmm0 + pxor %xmm1,%xmm12 + pxor %xmm7,%xmm15 + pxor %xmm6,%xmm13 + pxor %xmm8,%xmm0 + pxor %xmm3,%xmm11 + pshufd $78,%xmm1,%xmm7 + pxor %xmm2,%xmm14 + pshufd $78,%xmm6,%xmm8 + pxor %xmm5,%xmm10 + pshufd $78,%xmm3,%xmm1 + pxor %xmm4,%xmm10 + pshufd $78,%xmm4,%xmm6 + pxor %xmm4,%xmm11 + pshufd $78,%xmm2,%xmm3 + pxor %xmm11,%xmm7 + pshufd $78,%xmm5,%xmm2 + pxor %xmm12,%xmm8 + pxor %xmm1,%xmm10 + pxor %xmm14,%xmm6 + pxor %xmm3,%xmm13 + movdqa %xmm7,%xmm3 + pxor %xmm9,%xmm2 + movdqa %xmm13,%xmm5 + movdqa %xmm8,%xmm4 + movdqa %xmm2,%xmm1 + movdqa %xmm10,%xmm2 + movdqa -16(%r11),%xmm7 + jnz .Ldec_loop + movdqa -32(%r11),%xmm7 + jmp .Ldec_loop +.align 16 +.Ldec_done: + movdqa 0(%r11),%xmm7 + movdqa 16(%r11),%xmm8 + movdqa %xmm2,%xmm9 + psrlq $1,%xmm2 + movdqa %xmm1,%xmm10 + psrlq $1,%xmm1 + pxor %xmm4,%xmm2 + pxor %xmm6,%xmm1 + pand %xmm7,%xmm2 + pand %xmm7,%xmm1 + pxor %xmm2,%xmm4 + psllq $1,%xmm2 + pxor %xmm1,%xmm6 + psllq $1,%xmm1 + pxor %xmm9,%xmm2 + pxor %xmm10,%xmm1 + movdqa %xmm5,%xmm9 + psrlq $1,%xmm5 + movdqa %xmm15,%xmm10 + psrlq $1,%xmm15 + pxor %xmm3,%xmm5 + pxor %xmm0,%xmm15 + pand %xmm7,%xmm5 + pand %xmm7,%xmm15 + pxor %xmm5,%xmm3 + psllq $1,%xmm5 + pxor %xmm15,%xmm0 + psllq $1,%xmm15 + pxor %xmm9,%xmm5 + pxor %xmm10,%xmm15 + movdqa 32(%r11),%xmm7 + movdqa %xmm6,%xmm9 + psrlq $2,%xmm6 + movdqa %xmm1,%xmm10 + psrlq $2,%xmm1 + pxor %xmm4,%xmm6 + pxor %xmm2,%xmm1 + pand %xmm8,%xmm6 + pand %xmm8,%xmm1 + pxor %xmm6,%xmm4 + psllq $2,%xmm6 + pxor %xmm1,%xmm2 + psllq $2,%xmm1 + pxor %xmm9,%xmm6 + pxor %xmm10,%xmm1 + movdqa %xmm0,%xmm9 + psrlq $2,%xmm0 + movdqa %xmm15,%xmm10 + psrlq $2,%xmm15 + pxor %xmm3,%xmm0 + pxor %xmm5,%xmm15 + pand %xmm8,%xmm0 + pand %xmm8,%xmm15 + pxor %xmm0,%xmm3 + psllq $2,%xmm0 + pxor %xmm15,%xmm5 + psllq $2,%xmm15 + pxor %xmm9,%xmm0 + pxor %xmm10,%xmm15 + movdqa %xmm3,%xmm9 + psrlq $4,%xmm3 + movdqa %xmm5,%xmm10 + psrlq $4,%xmm5 + pxor %xmm4,%xmm3 + pxor %xmm2,%xmm5 + pand %xmm7,%xmm3 + pand %xmm7,%xmm5 + pxor %xmm3,%xmm4 + psllq $4,%xmm3 + pxor %xmm5,%xmm2 + psllq $4,%xmm5 + pxor %xmm9,%xmm3 + pxor %xmm10,%xmm5 + movdqa %xmm0,%xmm9 + psrlq $4,%xmm0 + movdqa %xmm15,%xmm10 + psrlq $4,%xmm15 + pxor %xmm6,%xmm0 + pxor %xmm1,%xmm15 + pand %xmm7,%xmm0 + pand %xmm7,%xmm15 + pxor %xmm0,%xmm6 + psllq $4,%xmm0 + pxor %xmm15,%xmm1 + psllq $4,%xmm15 + pxor %xmm9,%xmm0 + pxor %xmm10,%xmm15 + movdqa (%rax),%xmm7 + pxor %xmm7,%xmm5 + pxor %xmm7,%xmm3 + pxor %xmm7,%xmm1 + pxor %xmm7,%xmm6 + pxor %xmm7,%xmm2 + pxor %xmm7,%xmm4 + pxor %xmm7,%xmm15 + pxor %xmm7,%xmm0 + .byte 0xf3,0xc3 +.size _bsaes_decrypt8,.-_bsaes_decrypt8 +.type _bsaes_key_convert,@function +.align 16 +_bsaes_key_convert: + leaq .Lmasks(%rip),%r11 + movdqu (%rcx),%xmm7 + leaq 16(%rcx),%rcx + movdqa 0(%r11),%xmm0 + movdqa 16(%r11),%xmm1 + movdqa 32(%r11),%xmm2 + movdqa 48(%r11),%xmm3 + movdqa 64(%r11),%xmm4 + pcmpeqd %xmm5,%xmm5 + + movdqu (%rcx),%xmm6 + movdqa %xmm7,(%rax) + leaq 16(%rax),%rax + decl %r10d + jmp .Lkey_loop +.align 16 +.Lkey_loop: +.byte 102,15,56,0,244 + + movdqa %xmm0,%xmm8 + movdqa %xmm1,%xmm9 + + pand %xmm6,%xmm8 + pand %xmm6,%xmm9 + movdqa %xmm2,%xmm10 + pcmpeqb %xmm0,%xmm8 + psllq $4,%xmm0 + movdqa %xmm3,%xmm11 + pcmpeqb %xmm1,%xmm9 + psllq $4,%xmm1 + + pand %xmm6,%xmm10 + pand %xmm6,%xmm11 + movdqa %xmm0,%xmm12 + pcmpeqb %xmm2,%xmm10 + psllq $4,%xmm2 + movdqa %xmm1,%xmm13 + pcmpeqb %xmm3,%xmm11 + psllq $4,%xmm3 + + movdqa %xmm2,%xmm14 + movdqa %xmm3,%xmm15 + pxor %xmm5,%xmm8 + pxor %xmm5,%xmm9 + + pand %xmm6,%xmm12 + pand %xmm6,%xmm13 + movdqa %xmm8,0(%rax) + pcmpeqb %xmm0,%xmm12 + psrlq $4,%xmm0 + movdqa %xmm9,16(%rax) + pcmpeqb %xmm1,%xmm13 + psrlq $4,%xmm1 + leaq 16(%rcx),%rcx + + pand %xmm6,%xmm14 + pand %xmm6,%xmm15 + movdqa %xmm10,32(%rax) + pcmpeqb %xmm2,%xmm14 + psrlq $4,%xmm2 + movdqa %xmm11,48(%rax) + pcmpeqb %xmm3,%xmm15 + psrlq $4,%xmm3 + movdqu (%rcx),%xmm6 + + pxor %xmm5,%xmm13 + pxor %xmm5,%xmm14 + movdqa %xmm12,64(%rax) + movdqa %xmm13,80(%rax) + movdqa %xmm14,96(%rax) + movdqa %xmm15,112(%rax) + leaq 128(%rax),%rax + decl %r10d + jnz .Lkey_loop + + movdqa 80(%r11),%xmm7 + + .byte 0xf3,0xc3 +.size _bsaes_key_convert,.-_bsaes_key_convert + +.globl bsaes_cbc_encrypt +.type bsaes_cbc_encrypt,@function +.align 16 +bsaes_cbc_encrypt: + cmpl $0,%r9d + jne asm_AES_cbc_encrypt + cmpq $128,%rdx + jb asm_AES_cbc_encrypt + + movq %rsp,%rax +.Lcbc_dec_prologue: + pushq %rbp + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + leaq -72(%rsp),%rsp + movq %rsp,%rbp + movl 240(%rcx),%eax + movq %rdi,%r12 + movq %rsi,%r13 + movq %rdx,%r14 + movq %rcx,%r15 + movq %r8,%rbx + shrq $4,%r14 + + movl %eax,%edx + shlq $7,%rax + subq $96,%rax + subq %rax,%rsp + + movq %rsp,%rax + movq %r15,%rcx + movl %edx,%r10d + call _bsaes_key_convert + pxor (%rsp),%xmm7 + movdqa %xmm6,(%rax) + movdqa %xmm7,(%rsp) + + movdqu (%rbx),%xmm14 + subq $8,%r14 +.Lcbc_dec_loop: + movdqu 0(%r12),%xmm15 + movdqu 16(%r12),%xmm0 + movdqu 32(%r12),%xmm1 + movdqu 48(%r12),%xmm2 + movdqu 64(%r12),%xmm3 + movdqu 80(%r12),%xmm4 + movq %rsp,%rax + movdqu 96(%r12),%xmm5 + movl %edx,%r10d + movdqu 112(%r12),%xmm6 + movdqa %xmm14,32(%rbp) + + call _bsaes_decrypt8 + + pxor 32(%rbp),%xmm15 + movdqu 0(%r12),%xmm7 + movdqu 16(%r12),%xmm8 + pxor %xmm7,%xmm0 + movdqu 32(%r12),%xmm9 + pxor %xmm8,%xmm5 + movdqu 48(%r12),%xmm10 + pxor %xmm9,%xmm3 + movdqu 64(%r12),%xmm11 + pxor %xmm10,%xmm1 + movdqu 80(%r12),%xmm12 + pxor %xmm11,%xmm6 + movdqu 96(%r12),%xmm13 + pxor %xmm12,%xmm2 + movdqu 112(%r12),%xmm14 + pxor %xmm13,%xmm4 + movdqu %xmm15,0(%r13) + leaq 128(%r12),%r12 + movdqu %xmm0,16(%r13) + movdqu %xmm5,32(%r13) + movdqu %xmm3,48(%r13) + movdqu %xmm1,64(%r13) + movdqu %xmm6,80(%r13) + movdqu %xmm2,96(%r13) + movdqu %xmm4,112(%r13) + leaq 128(%r13),%r13 + subq $8,%r14 + jnc .Lcbc_dec_loop + + addq $8,%r14 + jz .Lcbc_dec_done + + movdqu 0(%r12),%xmm15 + movq %rsp,%rax + movl %edx,%r10d + cmpq $2,%r14 + jb .Lcbc_dec_one + movdqu 16(%r12),%xmm0 + je .Lcbc_dec_two + movdqu 32(%r12),%xmm1 + cmpq $4,%r14 + jb .Lcbc_dec_three + movdqu 48(%r12),%xmm2 + je .Lcbc_dec_four + movdqu 64(%r12),%xmm3 + cmpq $6,%r14 + jb .Lcbc_dec_five + movdqu 80(%r12),%xmm4 + je .Lcbc_dec_six + movdqu 96(%r12),%xmm5 + movdqa %xmm14,32(%rbp) + call _bsaes_decrypt8 + pxor 32(%rbp),%xmm15 + movdqu 0(%r12),%xmm7 + movdqu 16(%r12),%xmm8 + pxor %xmm7,%xmm0 + movdqu 32(%r12),%xmm9 + pxor %xmm8,%xmm5 + movdqu 48(%r12),%xmm10 + pxor %xmm9,%xmm3 + movdqu 64(%r12),%xmm11 + pxor %xmm10,%xmm1 + movdqu 80(%r12),%xmm12 + pxor %xmm11,%xmm6 + movdqu 96(%r12),%xmm14 + pxor %xmm12,%xmm2 + movdqu %xmm15,0(%r13) + movdqu %xmm0,16(%r13) + movdqu %xmm5,32(%r13) + movdqu %xmm3,48(%r13) + movdqu %xmm1,64(%r13) + movdqu %xmm6,80(%r13) + movdqu %xmm2,96(%r13) + jmp .Lcbc_dec_done +.align 16 +.Lcbc_dec_six: + movdqa %xmm14,32(%rbp) + call _bsaes_decrypt8 + pxor 32(%rbp),%xmm15 + movdqu 0(%r12),%xmm7 + movdqu 16(%r12),%xmm8 + pxor %xmm7,%xmm0 + movdqu 32(%r12),%xmm9 + pxor %xmm8,%xmm5 + movdqu 48(%r12),%xmm10 + pxor %xmm9,%xmm3 + movdqu 64(%r12),%xmm11 + pxor %xmm10,%xmm1 + movdqu 80(%r12),%xmm14 + pxor %xmm11,%xmm6 + movdqu %xmm15,0(%r13) + movdqu %xmm0,16(%r13) + movdqu %xmm5,32(%r13) + movdqu %xmm3,48(%r13) + movdqu %xmm1,64(%r13) + movdqu %xmm6,80(%r13) + jmp .Lcbc_dec_done +.align 16 +.Lcbc_dec_five: + movdqa %xmm14,32(%rbp) + call _bsaes_decrypt8 + pxor 32(%rbp),%xmm15 + movdqu 0(%r12),%xmm7 + movdqu 16(%r12),%xmm8 + pxor %xmm7,%xmm0 + movdqu 32(%r12),%xmm9 + pxor %xmm8,%xmm5 + movdqu 48(%r12),%xmm10 + pxor %xmm9,%xmm3 + movdqu 64(%r12),%xmm14 + pxor %xmm10,%xmm1 + movdqu %xmm15,0(%r13) + movdqu %xmm0,16(%r13) + movdqu %xmm5,32(%r13) + movdqu %xmm3,48(%r13) + movdqu %xmm1,64(%r13) + jmp .Lcbc_dec_done +.align 16 +.Lcbc_dec_four: + movdqa %xmm14,32(%rbp) + call _bsaes_decrypt8 + pxor 32(%rbp),%xmm15 + movdqu 0(%r12),%xmm7 + movdqu 16(%r12),%xmm8 + pxor %xmm7,%xmm0 + movdqu 32(%r12),%xmm9 + pxor %xmm8,%xmm5 + movdqu 48(%r12),%xmm14 + pxor %xmm9,%xmm3 + movdqu %xmm15,0(%r13) + movdqu %xmm0,16(%r13) + movdqu %xmm5,32(%r13) + movdqu %xmm3,48(%r13) + jmp .Lcbc_dec_done +.align 16 +.Lcbc_dec_three: + movdqa %xmm14,32(%rbp) + call _bsaes_decrypt8 + pxor 32(%rbp),%xmm15 + movdqu 0(%r12),%xmm7 + movdqu 16(%r12),%xmm8 + pxor %xmm7,%xmm0 + movdqu 32(%r12),%xmm14 + pxor %xmm8,%xmm5 + movdqu %xmm15,0(%r13) + movdqu %xmm0,16(%r13) + movdqu %xmm5,32(%r13) + jmp .Lcbc_dec_done +.align 16 +.Lcbc_dec_two: + movdqa %xmm14,32(%rbp) + call _bsaes_decrypt8 + pxor 32(%rbp),%xmm15 + movdqu 0(%r12),%xmm7 + movdqu 16(%r12),%xmm14 + pxor %xmm7,%xmm0 + movdqu %xmm15,0(%r13) + movdqu %xmm0,16(%r13) + jmp .Lcbc_dec_done +.align 16 +.Lcbc_dec_one: + leaq (%r12),%rdi + leaq 32(%rbp),%rsi + leaq (%r15),%rdx + call asm_AES_decrypt + pxor 32(%rbp),%xmm14 + movdqu %xmm14,(%r13) + movdqa %xmm15,%xmm14 + +.Lcbc_dec_done: + movdqu %xmm14,(%rbx) + leaq (%rsp),%rax + pxor %xmm0,%xmm0 +.Lcbc_dec_bzero: + movdqa %xmm0,0(%rax) + movdqa %xmm0,16(%rax) + leaq 32(%rax),%rax + cmpq %rax,%rbp + ja .Lcbc_dec_bzero + + leaq (%rbp),%rsp + movq 72(%rsp),%r15 + movq 80(%rsp),%r14 + movq 88(%rsp),%r13 + movq 96(%rsp),%r12 + movq 104(%rsp),%rbx + movq 112(%rsp),%rax + leaq 120(%rsp),%rsp + movq %rax,%rbp +.Lcbc_dec_epilogue: + .byte 0xf3,0xc3 +.size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt + +.globl bsaes_ctr32_encrypt_blocks +.type bsaes_ctr32_encrypt_blocks,@function +.align 16 +bsaes_ctr32_encrypt_blocks: + movq %rsp,%rax +.Lctr_enc_prologue: + pushq %rbp + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + leaq -72(%rsp),%rsp + movq %rsp,%rbp + movdqu (%r8),%xmm0 + movl 240(%rcx),%eax + movq %rdi,%r12 + movq %rsi,%r13 + movq %rdx,%r14 + movq %rcx,%r15 + movdqa %xmm0,32(%rbp) + cmpq $8,%rdx + jb .Lctr_enc_short + + movl %eax,%ebx + shlq $7,%rax + subq $96,%rax + subq %rax,%rsp + + movq %rsp,%rax + movq %r15,%rcx + movl %ebx,%r10d + call _bsaes_key_convert + pxor %xmm6,%xmm7 + movdqa %xmm7,(%rax) + + movdqa (%rsp),%xmm8 + leaq .LADD1(%rip),%r11 + movdqa 32(%rbp),%xmm15 + movdqa -32(%r11),%xmm7 +.byte 102,68,15,56,0,199 +.byte 102,68,15,56,0,255 + movdqa %xmm8,(%rsp) + jmp .Lctr_enc_loop +.align 16 +.Lctr_enc_loop: + movdqa %xmm15,32(%rbp) + movdqa %xmm15,%xmm0 + movdqa %xmm15,%xmm1 + paddd 0(%r11),%xmm0 + movdqa %xmm15,%xmm2 + paddd 16(%r11),%xmm1 + movdqa %xmm15,%xmm3 + paddd 32(%r11),%xmm2 + movdqa %xmm15,%xmm4 + paddd 48(%r11),%xmm3 + movdqa %xmm15,%xmm5 + paddd 64(%r11),%xmm4 + movdqa %xmm15,%xmm6 + paddd 80(%r11),%xmm5 + paddd 96(%r11),%xmm6 + + + + movdqa (%rsp),%xmm8 + leaq 16(%rsp),%rax + movdqa -16(%r11),%xmm7 + pxor %xmm8,%xmm15 + pxor %xmm8,%xmm0 + pxor %xmm8,%xmm1 + pxor %xmm8,%xmm2 +.byte 102,68,15,56,0,255 +.byte 102,15,56,0,199 + pxor %xmm8,%xmm3 + pxor %xmm8,%xmm4 +.byte 102,15,56,0,207 +.byte 102,15,56,0,215 + pxor %xmm8,%xmm5 + pxor %xmm8,%xmm6 +.byte 102,15,56,0,223 +.byte 102,15,56,0,231 +.byte 102,15,56,0,239 +.byte 102,15,56,0,247 + leaq .LBS0(%rip),%r11 + movl %ebx,%r10d + + call _bsaes_encrypt8_bitslice + + subq $8,%r14 + jc .Lctr_enc_loop_done + + movdqu 0(%r12),%xmm7 + movdqu 16(%r12),%xmm8 + movdqu 32(%r12),%xmm9 + movdqu 48(%r12),%xmm10 + movdqu 64(%r12),%xmm11 + movdqu 80(%r12),%xmm12 + movdqu 96(%r12),%xmm13 + movdqu 112(%r12),%xmm14 + leaq 128(%r12),%r12 + pxor %xmm15,%xmm7 + movdqa 32(%rbp),%xmm15 + pxor %xmm8,%xmm0 + movdqu %xmm7,0(%r13) + pxor %xmm9,%xmm3 + movdqu %xmm0,16(%r13) + pxor %xmm10,%xmm5 + movdqu %xmm3,32(%r13) + pxor %xmm11,%xmm2 + movdqu %xmm5,48(%r13) + pxor %xmm12,%xmm6 + movdqu %xmm2,64(%r13) + pxor %xmm13,%xmm1 + movdqu %xmm6,80(%r13) + pxor %xmm14,%xmm4 + movdqu %xmm1,96(%r13) + leaq .LADD1(%rip),%r11 + movdqu %xmm4,112(%r13) + leaq 128(%r13),%r13 + paddd 112(%r11),%xmm15 + jnz .Lctr_enc_loop + + jmp .Lctr_enc_done +.align 16 +.Lctr_enc_loop_done: + addq $8,%r14 + movdqu 0(%r12),%xmm7 + pxor %xmm7,%xmm15 + movdqu %xmm15,0(%r13) + cmpq $2,%r14 + jb .Lctr_enc_done + movdqu 16(%r12),%xmm8 + pxor %xmm8,%xmm0 + movdqu %xmm0,16(%r13) + je .Lctr_enc_done + movdqu 32(%r12),%xmm9 + pxor %xmm9,%xmm3 + movdqu %xmm3,32(%r13) + cmpq $4,%r14 + jb .Lctr_enc_done + movdqu 48(%r12),%xmm10 + pxor %xmm10,%xmm5 + movdqu %xmm5,48(%r13) + je .Lctr_enc_done + movdqu 64(%r12),%xmm11 + pxor %xmm11,%xmm2 + movdqu %xmm2,64(%r13) + cmpq $6,%r14 + jb .Lctr_enc_done + movdqu 80(%r12),%xmm12 + pxor %xmm12,%xmm6 + movdqu %xmm6,80(%r13) + je .Lctr_enc_done + movdqu 96(%r12),%xmm13 + pxor %xmm13,%xmm1 + movdqu %xmm1,96(%r13) + jmp .Lctr_enc_done + +.align 16 +.Lctr_enc_short: + leaq 32(%rbp),%rdi + leaq 48(%rbp),%rsi + leaq (%r15),%rdx + call asm_AES_encrypt + movdqu (%r12),%xmm0 + leaq 16(%r12),%r12 + movl 44(%rbp),%eax + bswapl %eax + pxor 48(%rbp),%xmm0 + incl %eax + movdqu %xmm0,(%r13) + bswapl %eax + leaq 16(%r13),%r13 + movl %eax,44(%rsp) + decq %r14 + jnz .Lctr_enc_short + +.Lctr_enc_done: + leaq (%rsp),%rax + pxor %xmm0,%xmm0 +.Lctr_enc_bzero: + movdqa %xmm0,0(%rax) + movdqa %xmm0,16(%rax) + leaq 32(%rax),%rax + cmpq %rax,%rbp + ja .Lctr_enc_bzero + + leaq (%rbp),%rsp + movq 72(%rsp),%r15 + movq 80(%rsp),%r14 + movq 88(%rsp),%r13 + movq 96(%rsp),%r12 + movq 104(%rsp),%rbx + movq 112(%rsp),%rax + leaq 120(%rsp),%rsp + movq %rax,%rbp +.Lctr_enc_epilogue: + .byte 0xf3,0xc3 +.size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks +.globl bsaes_xts_encrypt +.type bsaes_xts_encrypt,@function +.align 16 +bsaes_xts_encrypt: + movq %rsp,%rax +.Lxts_enc_prologue: + pushq %rbp + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + leaq -72(%rsp),%rsp + movq %rsp,%rbp + movq %rdi,%r12 + movq %rsi,%r13 + movq %rdx,%r14 + movq %rcx,%r15 + + leaq (%r9),%rdi + leaq 32(%rbp),%rsi + leaq (%r8),%rdx + call asm_AES_encrypt + + movl 240(%r15),%eax + movq %r14,%rbx + + movl %eax,%edx + shlq $7,%rax + subq $96,%rax + subq %rax,%rsp + + movq %rsp,%rax + movq %r15,%rcx + movl %edx,%r10d + call _bsaes_key_convert + pxor %xmm6,%xmm7 + movdqa %xmm7,(%rax) + + andq $-16,%r14 + subq $128,%rsp + movdqa 32(%rbp),%xmm6 + + pxor %xmm14,%xmm14 + movdqa .Lxts_magic(%rip),%xmm12 + pcmpgtd %xmm6,%xmm14 + + subq $128,%r14 + jc .Lxts_enc_short + jmp .Lxts_enc_loop + +.align 16 +.Lxts_enc_loop: + pshufd $19,%xmm14,%xmm13 + pxor %xmm14,%xmm14 + movdqa %xmm6,%xmm15 + movdqa %xmm6,0(%rsp) + paddq %xmm6,%xmm6 + pand %xmm12,%xmm13 + pcmpgtd %xmm6,%xmm14 + pxor %xmm13,%xmm6 + pshufd $19,%xmm14,%xmm13 + pxor %xmm14,%xmm14 + movdqa %xmm6,%xmm0 + movdqa %xmm6,16(%rsp) + paddq %xmm6,%xmm6 + pand %xmm12,%xmm13 + pcmpgtd %xmm6,%xmm14 + pxor %xmm13,%xmm6 + movdqu 0(%r12),%xmm7 + pshufd $19,%xmm14,%xmm13 + pxor %xmm14,%xmm14 + movdqa %xmm6,%xmm1 + movdqa %xmm6,32(%rsp) + paddq %xmm6,%xmm6 + pand %xmm12,%xmm13 + pcmpgtd %xmm6,%xmm14 + pxor %xmm13,%xmm6 + movdqu 16(%r12),%xmm8 + pxor %xmm7,%xmm15 + pshufd $19,%xmm14,%xmm13 + pxor %xmm14,%xmm14 + movdqa %xmm6,%xmm2 + movdqa %xmm6,48(%rsp) + paddq %xmm6,%xmm6 + pand %xmm12,%xmm13 + pcmpgtd %xmm6,%xmm14 + pxor %xmm13,%xmm6 + movdqu 32(%r12),%xmm9 + pxor %xmm8,%xmm0 + pshufd $19,%xmm14,%xmm13 + pxor %xmm14,%xmm14 + movdqa %xmm6,%xmm3 + movdqa %xmm6,64(%rsp) + paddq %xmm6,%xmm6 + pand %xmm12,%xmm13 + pcmpgtd %xmm6,%xmm14 + pxor %xmm13,%xmm6 + movdqu 48(%r12),%xmm10 + pxor %xmm9,%xmm1 + pshufd $19,%xmm14,%xmm13 + pxor %xmm14,%xmm14 + movdqa %xmm6,%xmm4 + movdqa %xmm6,80(%rsp) + paddq %xmm6,%xmm6 + pand %xmm12,%xmm13 + pcmpgtd %xmm6,%xmm14 + pxor %xmm13,%xmm6 + movdqu 64(%r12),%xmm11 + pxor %xmm10,%xmm2 + pshufd $19,%xmm14,%xmm13 + pxor %xmm14,%xmm14 + movdqa %xmm6,%xmm5 + movdqa %xmm6,96(%rsp) + paddq %xmm6,%xmm6 + pand %xmm12,%xmm13 + pcmpgtd %xmm6,%xmm14 + pxor %xmm13,%xmm6 + movdqu 80(%r12),%xmm12 + pxor %xmm11,%xmm3 + movdqu 96(%r12),%xmm13 + pxor %xmm12,%xmm4 + movdqu 112(%r12),%xmm14 + leaq 128(%r12),%r12 + movdqa %xmm6,112(%rsp) + pxor %xmm13,%xmm5 + leaq 128(%rsp),%rax + pxor %xmm14,%xmm6 + movl %edx,%r10d + + call _bsaes_encrypt8 + + pxor 0(%rsp),%xmm15 + pxor 16(%rsp),%xmm0 + movdqu %xmm15,0(%r13) + pxor 32(%rsp),%xmm3 + movdqu %xmm0,16(%r13) + pxor 48(%rsp),%xmm5 + movdqu %xmm3,32(%r13) + pxor 64(%rsp),%xmm2 + movdqu %xmm5,48(%r13) + pxor 80(%rsp),%xmm6 + movdqu %xmm2,64(%r13) + pxor 96(%rsp),%xmm1 + movdqu %xmm6,80(%r13) + pxor 112(%rsp),%xmm4 + movdqu %xmm1,96(%r13) + movdqu %xmm4,112(%r13) + leaq 128(%r13),%r13 + + movdqa 112(%rsp),%xmm6 + pxor %xmm14,%xmm14 + movdqa .Lxts_magic(%rip),%xmm12 + pcmpgtd %xmm6,%xmm14 + pshufd $19,%xmm14,%xmm13 + pxor %xmm14,%xmm14 + paddq %xmm6,%xmm6 + pand %xmm12,%xmm13 + pcmpgtd %xmm6,%xmm14 + pxor %xmm13,%xmm6 + + subq $128,%r14 + jnc .Lxts_enc_loop + +.Lxts_enc_short: + addq $128,%r14 + jz .Lxts_enc_done + pshufd $19,%xmm14,%xmm13 + pxor %xmm14,%xmm14 + movdqa %xmm6,%xmm15 + movdqa %xmm6,0(%rsp) + paddq %xmm6,%xmm6 + pand %xmm12,%xmm13 + pcmpgtd %xmm6,%xmm14 + pxor %xmm13,%xmm6 + pshufd $19,%xmm14,%xmm13 + pxor %xmm14,%xmm14 + movdqa %xmm6,%xmm0 + movdqa %xmm6,16(%rsp) + paddq %xmm6,%xmm6 + pand %xmm12,%xmm13 + pcmpgtd %xmm6,%xmm14 + pxor %xmm13,%xmm6 + movdqu 0(%r12),%xmm7 + cmpq $16,%r14 + je .Lxts_enc_1 + pshufd $19,%xmm14,%xmm13 + pxor %xmm14,%xmm14 + movdqa %xmm6,%xmm1 + movdqa %xmm6,32(%rsp) + paddq %xmm6,%xmm6 + pand %xmm12,%xmm13 + pcmpgtd %xmm6,%xmm14 + pxor %xmm13,%xmm6 + movdqu 16(%r12),%xmm8 + cmpq $32,%r14 + je .Lxts_enc_2 + pxor %xmm7,%xmm15 + pshufd $19,%xmm14,%xmm13 + pxor %xmm14,%xmm14 + movdqa %xmm6,%xmm2 + movdqa %xmm6,48(%rsp) + paddq %xmm6,%xmm6 + pand %xmm12,%xmm13 + pcmpgtd %xmm6,%xmm14 + pxor %xmm13,%xmm6 + movdqu 32(%r12),%xmm9 + cmpq $48,%r14 + je .Lxts_enc_3 + pxor %xmm8,%xmm0 + pshufd $19,%xmm14,%xmm13 + pxor %xmm14,%xmm14 + movdqa %xmm6,%xmm3 + movdqa %xmm6,64(%rsp) + paddq %xmm6,%xmm6 + pand %xmm12,%xmm13 + pcmpgtd %xmm6,%xmm14 + pxor %xmm13,%xmm6 + movdqu 48(%r12),%xmm10 + cmpq $64,%r14 + je .Lxts_enc_4 + pxor %xmm9,%xmm1 + pshufd $19,%xmm14,%xmm13 + pxor %xmm14,%xmm14 + movdqa %xmm6,%xmm4 + movdqa %xmm6,80(%rsp) + paddq %xmm6,%xmm6 + pand %xmm12,%xmm13 + pcmpgtd %xmm6,%xmm14 + pxor %xmm13,%xmm6 + movdqu 64(%r12),%xmm11 + cmpq $80,%r14 + je .Lxts_enc_5 + pxor %xmm10,%xmm2 + pshufd $19,%xmm14,%xmm13 + pxor %xmm14,%xmm14 + movdqa %xmm6,%xmm5 + movdqa %xmm6,96(%rsp) + paddq %xmm6,%xmm6 + pand %xmm12,%xmm13 + pcmpgtd %xmm6,%xmm14 + pxor %xmm13,%xmm6 + movdqu 80(%r12),%xmm12 + cmpq $96,%r14 + je .Lxts_enc_6 + pxor %xmm11,%xmm3 + movdqu 96(%r12),%xmm13 + pxor %xmm12,%xmm4 + movdqa %xmm6,112(%rsp) + leaq 112(%r12),%r12 + pxor %xmm13,%xmm5 + leaq 128(%rsp),%rax + movl %edx,%r10d + + call _bsaes_encrypt8 + + pxor 0(%rsp),%xmm15 + pxor 16(%rsp),%xmm0 + movdqu %xmm15,0(%r13) + pxor 32(%rsp),%xmm3 + movdqu %xmm0,16(%r13) + pxor 48(%rsp),%xmm5 + movdqu %xmm3,32(%r13) + pxor 64(%rsp),%xmm2 + movdqu %xmm5,48(%r13) + pxor 80(%rsp),%xmm6 + movdqu %xmm2,64(%r13) + pxor 96(%rsp),%xmm1 + movdqu %xmm6,80(%r13) + movdqu %xmm1,96(%r13) + leaq 112(%r13),%r13 + + movdqa 112(%rsp),%xmm6 + jmp .Lxts_enc_done +.align 16 +.Lxts_enc_6: + pxor %xmm11,%xmm3 + leaq 96(%r12),%r12 + pxor %xmm12,%xmm4 + leaq 128(%rsp),%rax + movl %edx,%r10d + + call _bsaes_encrypt8 + + pxor 0(%rsp),%xmm15 + pxor 16(%rsp),%xmm0 + movdqu %xmm15,0(%r13) + pxor 32(%rsp),%xmm3 + movdqu %xmm0,16(%r13) + pxor 48(%rsp),%xmm5 + movdqu %xmm3,32(%r13) + pxor 64(%rsp),%xmm2 + movdqu %xmm5,48(%r13) + pxor 80(%rsp),%xmm6 + movdqu %xmm2,64(%r13) + movdqu %xmm6,80(%r13) + leaq 96(%r13),%r13 + + movdqa 96(%rsp),%xmm6 + jmp .Lxts_enc_done +.align 16 +.Lxts_enc_5: + pxor %xmm10,%xmm2 + leaq 80(%r12),%r12 + pxor %xmm11,%xmm3 + leaq 128(%rsp),%rax + movl %edx,%r10d + + call _bsaes_encrypt8 + + pxor 0(%rsp),%xmm15 + pxor 16(%rsp),%xmm0 + movdqu %xmm15,0(%r13) + pxor 32(%rsp),%xmm3 + movdqu %xmm0,16(%r13) + pxor 48(%rsp),%xmm5 + movdqu %xmm3,32(%r13) + pxor 64(%rsp),%xmm2 + movdqu %xmm5,48(%r13) + movdqu %xmm2,64(%r13) + leaq 80(%r13),%r13 + + movdqa 80(%rsp),%xmm6 + jmp .Lxts_enc_done +.align 16 +.Lxts_enc_4: + pxor %xmm9,%xmm1 + leaq 64(%r12),%r12 + pxor %xmm10,%xmm2 + leaq 128(%rsp),%rax + movl %edx,%r10d + + call _bsaes_encrypt8 + + pxor 0(%rsp),%xmm15 + pxor 16(%rsp),%xmm0 + movdqu %xmm15,0(%r13) + pxor 32(%rsp),%xmm3 + movdqu %xmm0,16(%r13) + pxor 48(%rsp),%xmm5 + movdqu %xmm3,32(%r13) + movdqu %xmm5,48(%r13) + leaq 64(%r13),%r13 + + movdqa 64(%rsp),%xmm6 + jmp .Lxts_enc_done +.align 16 +.Lxts_enc_3: + pxor %xmm8,%xmm0 + leaq 48(%r12),%r12 + pxor %xmm9,%xmm1 + leaq 128(%rsp),%rax + movl %edx,%r10d + + call _bsaes_encrypt8 + + pxor 0(%rsp),%xmm15 + pxor 16(%rsp),%xmm0 + movdqu %xmm15,0(%r13) + pxor 32(%rsp),%xmm3 + movdqu %xmm0,16(%r13) + movdqu %xmm3,32(%r13) + leaq 48(%r13),%r13 + + movdqa 48(%rsp),%xmm6 + jmp .Lxts_enc_done +.align 16 +.Lxts_enc_2: + pxor %xmm7,%xmm15 + leaq 32(%r12),%r12 + pxor %xmm8,%xmm0 + leaq 128(%rsp),%rax + movl %edx,%r10d + + call _bsaes_encrypt8 + + pxor 0(%rsp),%xmm15 + pxor 16(%rsp),%xmm0 + movdqu %xmm15,0(%r13) + movdqu %xmm0,16(%r13) + leaq 32(%r13),%r13 + + movdqa 32(%rsp),%xmm6 + jmp .Lxts_enc_done +.align 16 +.Lxts_enc_1: + pxor %xmm15,%xmm7 + leaq 16(%r12),%r12 + movdqa %xmm7,32(%rbp) + leaq 32(%rbp),%rdi + leaq 32(%rbp),%rsi + leaq (%r15),%rdx + call asm_AES_encrypt + pxor 32(%rbp),%xmm15 + + + + + + movdqu %xmm15,0(%r13) + leaq 16(%r13),%r13 + + movdqa 16(%rsp),%xmm6 + +.Lxts_enc_done: + andl $15,%ebx + jz .Lxts_enc_ret + movq %r13,%rdx + +.Lxts_enc_steal: + movzbl (%r12),%eax + movzbl -16(%rdx),%ecx + leaq 1(%r12),%r12 + movb %al,-16(%rdx) + movb %cl,0(%rdx) + leaq 1(%rdx),%rdx + subl $1,%ebx + jnz .Lxts_enc_steal + + movdqu -16(%r13),%xmm15 + leaq 32(%rbp),%rdi + pxor %xmm6,%xmm15 + leaq 32(%rbp),%rsi + movdqa %xmm15,32(%rbp) + leaq (%r15),%rdx + call asm_AES_encrypt + pxor 32(%rbp),%xmm6 + movdqu %xmm6,-16(%r13) + +.Lxts_enc_ret: + leaq (%rsp),%rax + pxor %xmm0,%xmm0 +.Lxts_enc_bzero: + movdqa %xmm0,0(%rax) + movdqa %xmm0,16(%rax) + leaq 32(%rax),%rax + cmpq %rax,%rbp + ja .Lxts_enc_bzero + + leaq (%rbp),%rsp + movq 72(%rsp),%r15 + movq 80(%rsp),%r14 + movq 88(%rsp),%r13 + movq 96(%rsp),%r12 + movq 104(%rsp),%rbx + movq 112(%rsp),%rax + leaq 120(%rsp),%rsp + movq %rax,%rbp +.Lxts_enc_epilogue: + .byte 0xf3,0xc3 +.size bsaes_xts_encrypt,.-bsaes_xts_encrypt + +.globl bsaes_xts_decrypt +.type bsaes_xts_decrypt,@function +.align 16 +bsaes_xts_decrypt: + movq %rsp,%rax +.Lxts_dec_prologue: + pushq %rbp + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + leaq -72(%rsp),%rsp + movq %rsp,%rbp + movq %rdi,%r12 + movq %rsi,%r13 + movq %rdx,%r14 + movq %rcx,%r15 + + leaq (%r9),%rdi + leaq 32(%rbp),%rsi + leaq (%r8),%rdx + call asm_AES_encrypt + + movl 240(%r15),%eax + movq %r14,%rbx + + movl %eax,%edx + shlq $7,%rax + subq $96,%rax + subq %rax,%rsp + + movq %rsp,%rax + movq %r15,%rcx + movl %edx,%r10d + call _bsaes_key_convert + pxor (%rsp),%xmm7 + movdqa %xmm6,(%rax) + movdqa %xmm7,(%rsp) + + xorl %eax,%eax + andq $-16,%r14 + testl $15,%ebx + setnz %al + shlq $4,%rax + subq %rax,%r14 + + subq $128,%rsp + movdqa 32(%rbp),%xmm6 + + pxor %xmm14,%xmm14 + movdqa .Lxts_magic(%rip),%xmm12 + pcmpgtd %xmm6,%xmm14 + + subq $128,%r14 + jc .Lxts_dec_short + jmp .Lxts_dec_loop + +.align 16 +.Lxts_dec_loop: + pshufd $19,%xmm14,%xmm13 + pxor %xmm14,%xmm14 + movdqa %xmm6,%xmm15 + movdqa %xmm6,0(%rsp) + paddq %xmm6,%xmm6 + pand %xmm12,%xmm13 + pcmpgtd %xmm6,%xmm14 + pxor %xmm13,%xmm6 + pshufd $19,%xmm14,%xmm13 + pxor %xmm14,%xmm14 + movdqa %xmm6,%xmm0 + movdqa %xmm6,16(%rsp) + paddq %xmm6,%xmm6 + pand %xmm12,%xmm13 + pcmpgtd %xmm6,%xmm14 + pxor %xmm13,%xmm6 + movdqu 0(%r12),%xmm7 + pshufd $19,%xmm14,%xmm13 + pxor %xmm14,%xmm14 + movdqa %xmm6,%xmm1 + movdqa %xmm6,32(%rsp) + paddq %xmm6,%xmm6 + pand %xmm12,%xmm13 + pcmpgtd %xmm6,%xmm14 + pxor %xmm13,%xmm6 + movdqu 16(%r12),%xmm8 + pxor %xmm7,%xmm15 + pshufd $19,%xmm14,%xmm13 + pxor %xmm14,%xmm14 + movdqa %xmm6,%xmm2 + movdqa %xmm6,48(%rsp) + paddq %xmm6,%xmm6 + pand %xmm12,%xmm13 + pcmpgtd %xmm6,%xmm14 + pxor %xmm13,%xmm6 + movdqu 32(%r12),%xmm9 + pxor %xmm8,%xmm0 + pshufd $19,%xmm14,%xmm13 + pxor %xmm14,%xmm14 + movdqa %xmm6,%xmm3 + movdqa %xmm6,64(%rsp) + paddq %xmm6,%xmm6 + pand %xmm12,%xmm13 + pcmpgtd %xmm6,%xmm14 + pxor %xmm13,%xmm6 + movdqu 48(%r12),%xmm10 + pxor %xmm9,%xmm1 + pshufd $19,%xmm14,%xmm13 + pxor %xmm14,%xmm14 + movdqa %xmm6,%xmm4 + movdqa %xmm6,80(%rsp) + paddq %xmm6,%xmm6 + pand %xmm12,%xmm13 + pcmpgtd %xmm6,%xmm14 + pxor %xmm13,%xmm6 + movdqu 64(%r12),%xmm11 + pxor %xmm10,%xmm2 + pshufd $19,%xmm14,%xmm13 + pxor %xmm14,%xmm14 + movdqa %xmm6,%xmm5 + movdqa %xmm6,96(%rsp) + paddq %xmm6,%xmm6 + pand %xmm12,%xmm13 + pcmpgtd %xmm6,%xmm14 + pxor %xmm13,%xmm6 + movdqu 80(%r12),%xmm12 + pxor %xmm11,%xmm3 + movdqu 96(%r12),%xmm13 + pxor %xmm12,%xmm4 + movdqu 112(%r12),%xmm14 + leaq 128(%r12),%r12 + movdqa %xmm6,112(%rsp) + pxor %xmm13,%xmm5 + leaq 128(%rsp),%rax + pxor %xmm14,%xmm6 + movl %edx,%r10d + + call _bsaes_decrypt8 + + pxor 0(%rsp),%xmm15 + pxor 16(%rsp),%xmm0 + movdqu %xmm15,0(%r13) + pxor 32(%rsp),%xmm5 + movdqu %xmm0,16(%r13) + pxor 48(%rsp),%xmm3 + movdqu %xmm5,32(%r13) + pxor 64(%rsp),%xmm1 + movdqu %xmm3,48(%r13) + pxor 80(%rsp),%xmm6 + movdqu %xmm1,64(%r13) + pxor 96(%rsp),%xmm2 + movdqu %xmm6,80(%r13) + pxor 112(%rsp),%xmm4 + movdqu %xmm2,96(%r13) + movdqu %xmm4,112(%r13) + leaq 128(%r13),%r13 + + movdqa 112(%rsp),%xmm6 + pxor %xmm14,%xmm14 + movdqa .Lxts_magic(%rip),%xmm12 + pcmpgtd %xmm6,%xmm14 + pshufd $19,%xmm14,%xmm13 + pxor %xmm14,%xmm14 + paddq %xmm6,%xmm6 + pand %xmm12,%xmm13 + pcmpgtd %xmm6,%xmm14 + pxor %xmm13,%xmm6 + + subq $128,%r14 + jnc .Lxts_dec_loop + +.Lxts_dec_short: + addq $128,%r14 + jz .Lxts_dec_done + pshufd $19,%xmm14,%xmm13 + pxor %xmm14,%xmm14 + movdqa %xmm6,%xmm15 + movdqa %xmm6,0(%rsp) + paddq %xmm6,%xmm6 + pand %xmm12,%xmm13 + pcmpgtd %xmm6,%xmm14 + pxor %xmm13,%xmm6 + pshufd $19,%xmm14,%xmm13 + pxor %xmm14,%xmm14 + movdqa %xmm6,%xmm0 + movdqa %xmm6,16(%rsp) + paddq %xmm6,%xmm6 + pand %xmm12,%xmm13 + pcmpgtd %xmm6,%xmm14 + pxor %xmm13,%xmm6 + movdqu 0(%r12),%xmm7 + cmpq $16,%r14 + je .Lxts_dec_1 + pshufd $19,%xmm14,%xmm13 + pxor %xmm14,%xmm14 + movdqa %xmm6,%xmm1 + movdqa %xmm6,32(%rsp) + paddq %xmm6,%xmm6 + pand %xmm12,%xmm13 + pcmpgtd %xmm6,%xmm14 + pxor %xmm13,%xmm6 + movdqu 16(%r12),%xmm8 + cmpq $32,%r14 + je .Lxts_dec_2 + pxor %xmm7,%xmm15 + pshufd $19,%xmm14,%xmm13 + pxor %xmm14,%xmm14 + movdqa %xmm6,%xmm2 + movdqa %xmm6,48(%rsp) + paddq %xmm6,%xmm6 + pand %xmm12,%xmm13 + pcmpgtd %xmm6,%xmm14 + pxor %xmm13,%xmm6 + movdqu 32(%r12),%xmm9 + cmpq $48,%r14 + je .Lxts_dec_3 + pxor %xmm8,%xmm0 + pshufd $19,%xmm14,%xmm13 + pxor %xmm14,%xmm14 + movdqa %xmm6,%xmm3 + movdqa %xmm6,64(%rsp) + paddq %xmm6,%xmm6 + pand %xmm12,%xmm13 + pcmpgtd %xmm6,%xmm14 + pxor %xmm13,%xmm6 + movdqu 48(%r12),%xmm10 + cmpq $64,%r14 + je .Lxts_dec_4 + pxor %xmm9,%xmm1 + pshufd $19,%xmm14,%xmm13 + pxor %xmm14,%xmm14 + movdqa %xmm6,%xmm4 + movdqa %xmm6,80(%rsp) + paddq %xmm6,%xmm6 + pand %xmm12,%xmm13 + pcmpgtd %xmm6,%xmm14 + pxor %xmm13,%xmm6 + movdqu 64(%r12),%xmm11 + cmpq $80,%r14 + je .Lxts_dec_5 + pxor %xmm10,%xmm2 + pshufd $19,%xmm14,%xmm13 + pxor %xmm14,%xmm14 + movdqa %xmm6,%xmm5 + movdqa %xmm6,96(%rsp) + paddq %xmm6,%xmm6 + pand %xmm12,%xmm13 + pcmpgtd %xmm6,%xmm14 + pxor %xmm13,%xmm6 + movdqu 80(%r12),%xmm12 + cmpq $96,%r14 + je .Lxts_dec_6 + pxor %xmm11,%xmm3 + movdqu 96(%r12),%xmm13 + pxor %xmm12,%xmm4 + movdqa %xmm6,112(%rsp) + leaq 112(%r12),%r12 + pxor %xmm13,%xmm5 + leaq 128(%rsp),%rax + movl %edx,%r10d + + call _bsaes_decrypt8 + + pxor 0(%rsp),%xmm15 + pxor 16(%rsp),%xmm0 + movdqu %xmm15,0(%r13) + pxor 32(%rsp),%xmm5 + movdqu %xmm0,16(%r13) + pxor 48(%rsp),%xmm3 + movdqu %xmm5,32(%r13) + pxor 64(%rsp),%xmm1 + movdqu %xmm3,48(%r13) + pxor 80(%rsp),%xmm6 + movdqu %xmm1,64(%r13) + pxor 96(%rsp),%xmm2 + movdqu %xmm6,80(%r13) + movdqu %xmm2,96(%r13) + leaq 112(%r13),%r13 + + movdqa 112(%rsp),%xmm6 + jmp .Lxts_dec_done +.align 16 +.Lxts_dec_6: + pxor %xmm11,%xmm3 + leaq 96(%r12),%r12 + pxor %xmm12,%xmm4 + leaq 128(%rsp),%rax + movl %edx,%r10d + + call _bsaes_decrypt8 + + pxor 0(%rsp),%xmm15 + pxor 16(%rsp),%xmm0 + movdqu %xmm15,0(%r13) + pxor 32(%rsp),%xmm5 + movdqu %xmm0,16(%r13) + pxor 48(%rsp),%xmm3 + movdqu %xmm5,32(%r13) + pxor 64(%rsp),%xmm1 + movdqu %xmm3,48(%r13) + pxor 80(%rsp),%xmm6 + movdqu %xmm1,64(%r13) + movdqu %xmm6,80(%r13) + leaq 96(%r13),%r13 + + movdqa 96(%rsp),%xmm6 + jmp .Lxts_dec_done +.align 16 +.Lxts_dec_5: + pxor %xmm10,%xmm2 + leaq 80(%r12),%r12 + pxor %xmm11,%xmm3 + leaq 128(%rsp),%rax + movl %edx,%r10d + + call _bsaes_decrypt8 + + pxor 0(%rsp),%xmm15 + pxor 16(%rsp),%xmm0 + movdqu %xmm15,0(%r13) + pxor 32(%rsp),%xmm5 + movdqu %xmm0,16(%r13) + pxor 48(%rsp),%xmm3 + movdqu %xmm5,32(%r13) + pxor 64(%rsp),%xmm1 + movdqu %xmm3,48(%r13) + movdqu %xmm1,64(%r13) + leaq 80(%r13),%r13 + + movdqa 80(%rsp),%xmm6 + jmp .Lxts_dec_done +.align 16 +.Lxts_dec_4: + pxor %xmm9,%xmm1 + leaq 64(%r12),%r12 + pxor %xmm10,%xmm2 + leaq 128(%rsp),%rax + movl %edx,%r10d + + call _bsaes_decrypt8 + + pxor 0(%rsp),%xmm15 + pxor 16(%rsp),%xmm0 + movdqu %xmm15,0(%r13) + pxor 32(%rsp),%xmm5 + movdqu %xmm0,16(%r13) + pxor 48(%rsp),%xmm3 + movdqu %xmm5,32(%r13) + movdqu %xmm3,48(%r13) + leaq 64(%r13),%r13 + + movdqa 64(%rsp),%xmm6 + jmp .Lxts_dec_done +.align 16 +.Lxts_dec_3: + pxor %xmm8,%xmm0 + leaq 48(%r12),%r12 + pxor %xmm9,%xmm1 + leaq 128(%rsp),%rax + movl %edx,%r10d + + call _bsaes_decrypt8 + + pxor 0(%rsp),%xmm15 + pxor 16(%rsp),%xmm0 + movdqu %xmm15,0(%r13) + pxor 32(%rsp),%xmm5 + movdqu %xmm0,16(%r13) + movdqu %xmm5,32(%r13) + leaq 48(%r13),%r13 + + movdqa 48(%rsp),%xmm6 + jmp .Lxts_dec_done +.align 16 +.Lxts_dec_2: + pxor %xmm7,%xmm15 + leaq 32(%r12),%r12 + pxor %xmm8,%xmm0 + leaq 128(%rsp),%rax + movl %edx,%r10d + + call _bsaes_decrypt8 + + pxor 0(%rsp),%xmm15 + pxor 16(%rsp),%xmm0 + movdqu %xmm15,0(%r13) + movdqu %xmm0,16(%r13) + leaq 32(%r13),%r13 + + movdqa 32(%rsp),%xmm6 + jmp .Lxts_dec_done +.align 16 +.Lxts_dec_1: + pxor %xmm15,%xmm7 + leaq 16(%r12),%r12 + movdqa %xmm7,32(%rbp) + leaq 32(%rbp),%rdi + leaq 32(%rbp),%rsi + leaq (%r15),%rdx + call asm_AES_decrypt + pxor 32(%rbp),%xmm15 + + + + + + movdqu %xmm15,0(%r13) + leaq 16(%r13),%r13 + + movdqa 16(%rsp),%xmm6 + +.Lxts_dec_done: + andl $15,%ebx + jz .Lxts_dec_ret + + pxor %xmm14,%xmm14 + movdqa .Lxts_magic(%rip),%xmm12 + pcmpgtd %xmm6,%xmm14 + pshufd $19,%xmm14,%xmm13 + movdqa %xmm6,%xmm5 + paddq %xmm6,%xmm6 + pand %xmm12,%xmm13 + movdqu (%r12),%xmm15 + pxor %xmm13,%xmm6 + + leaq 32(%rbp),%rdi + pxor %xmm6,%xmm15 + leaq 32(%rbp),%rsi + movdqa %xmm15,32(%rbp) + leaq (%r15),%rdx + call asm_AES_decrypt + pxor 32(%rbp),%xmm6 + movq %r13,%rdx + movdqu %xmm6,(%r13) + +.Lxts_dec_steal: + movzbl 16(%r12),%eax + movzbl (%rdx),%ecx + leaq 1(%r12),%r12 + movb %al,(%rdx) + movb %cl,16(%rdx) + leaq 1(%rdx),%rdx + subl $1,%ebx + jnz .Lxts_dec_steal + + movdqu (%r13),%xmm15 + leaq 32(%rbp),%rdi + pxor %xmm5,%xmm15 + leaq 32(%rbp),%rsi + movdqa %xmm15,32(%rbp) + leaq (%r15),%rdx + call asm_AES_decrypt + pxor 32(%rbp),%xmm5 + movdqu %xmm5,(%r13) + +.Lxts_dec_ret: + leaq (%rsp),%rax + pxor %xmm0,%xmm0 +.Lxts_dec_bzero: + movdqa %xmm0,0(%rax) + movdqa %xmm0,16(%rax) + leaq 32(%rax),%rax + cmpq %rax,%rbp + ja .Lxts_dec_bzero + + leaq (%rbp),%rsp + movq 72(%rsp),%r15 + movq 80(%rsp),%r14 + movq 88(%rsp),%r13 + movq 96(%rsp),%r12 + movq 104(%rsp),%rbx + movq 112(%rsp),%rax + leaq 120(%rsp),%rsp + movq %rax,%rbp +.Lxts_dec_epilogue: + .byte 0xf3,0xc3 +.size bsaes_xts_decrypt,.-bsaes_xts_decrypt +.type _bsaes_const,@object +.align 64 +_bsaes_const: +.LM0ISR: +.quad 0x0a0e0206070b0f03, 0x0004080c0d010509 +.LISRM0: +.quad 0x01040b0e0205080f, 0x0306090c00070a0d +.LISR: +.quad 0x0504070602010003, 0x0f0e0d0c080b0a09 +.LBS0: +.quad 0x5555555555555555, 0x5555555555555555 +.LBS1: +.quad 0x3333333333333333, 0x3333333333333333 +.LBS2: +.quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f +.LSR: +.quad 0x0504070600030201, 0x0f0e0d0c0a09080b +.LSRM0: +.quad 0x0304090e00050a0f, 0x01060b0c0207080d +.LM0SR: +.quad 0x0a0e02060f03070b, 0x0004080c05090d01 +.LSWPUP: +.quad 0x0706050403020100, 0x0c0d0e0f0b0a0908 +.LSWPUPM0SR: +.quad 0x0a0d02060c03070b, 0x0004080f05090e01 +.LADD1: +.quad 0x0000000000000000, 0x0000000100000000 +.LADD2: +.quad 0x0000000000000000, 0x0000000200000000 +.LADD3: +.quad 0x0000000000000000, 0x0000000300000000 +.LADD4: +.quad 0x0000000000000000, 0x0000000400000000 +.LADD5: +.quad 0x0000000000000000, 0x0000000500000000 +.LADD6: +.quad 0x0000000000000000, 0x0000000600000000 +.LADD7: +.quad 0x0000000000000000, 0x0000000700000000 +.LADD8: +.quad 0x0000000000000000, 0x0000000800000000 +.Lxts_magic: +.long 0x87,0,1,0 +.Lmasks: +.quad 0x0101010101010101, 0x0101010101010101 +.quad 0x0202020202020202, 0x0202020202020202 +.quad 0x0404040404040404, 0x0404040404040404 +.quad 0x0808080808080808, 0x0808080808080808 +.LM0: +.quad 0x02060a0e03070b0f, 0x0004080c0105090d +.L63: +.quad 0x6363636363636363, 0x6363636363636363 +.byte 66,105,116,45,115,108,105,99,101,100,32,65,69,83,32,102,111,114,32,120,56,54,95,54,52,47,83,83,83,69,51,44,32,69,109,105,108,105,97,32,75,195,164,115,112,101,114,44,32,80,101,116,101,114,32,83,99,104,119,97,98,101,44,32,65,110,100,121,32,80,111,108,121,97,107,111,118,0 +.align 64 +.size _bsaes_const,.-_bsaes_const diff --git a/secure/lib/libcrypto/amd64/cmll-x86_64.S b/secure/lib/libcrypto/amd64/cmll-x86_64.S new file mode 100644 index 0000000..ecd33f1 --- /dev/null +++ b/secure/lib/libcrypto/amd64/cmll-x86_64.S @@ -0,0 +1,1839 @@ + # $FreeBSD$ +.text + + +.globl Camellia_EncryptBlock +.type Camellia_EncryptBlock,@function +.align 16 +Camellia_EncryptBlock: + movl $128,%eax + subl %edi,%eax + movl $3,%edi + adcl $0,%edi + jmp .Lenc_rounds +.size Camellia_EncryptBlock,.-Camellia_EncryptBlock + +.globl Camellia_EncryptBlock_Rounds +.type Camellia_EncryptBlock_Rounds,@function +.align 16 +.Lenc_rounds: +Camellia_EncryptBlock_Rounds: + pushq %rbx + pushq %rbp + pushq %r13 + pushq %r14 + pushq %r15 +.Lenc_prologue: + + + movq %rcx,%r13 + movq %rdx,%r14 + + shll $6,%edi + leaq .LCamellia_SBOX(%rip),%rbp + leaq (%r14,%rdi,1),%r15 + + movl 0(%rsi),%r8d + movl 4(%rsi),%r9d + movl 8(%rsi),%r10d + bswapl %r8d + movl 12(%rsi),%r11d + bswapl %r9d + bswapl %r10d + bswapl %r11d + + call _x86_64_Camellia_encrypt + + bswapl %r8d + bswapl %r9d + bswapl %r10d + movl %r8d,0(%r13) + bswapl %r11d + movl %r9d,4(%r13) + movl %r10d,8(%r13) + movl %r11d,12(%r13) + + movq 0(%rsp),%r15 + movq 8(%rsp),%r14 + movq 16(%rsp),%r13 + movq 24(%rsp),%rbp + movq 32(%rsp),%rbx + leaq 40(%rsp),%rsp +.Lenc_epilogue: + .byte 0xf3,0xc3 +.size Camellia_EncryptBlock_Rounds,.-Camellia_EncryptBlock_Rounds + +.type _x86_64_Camellia_encrypt,@function +.align 16 +_x86_64_Camellia_encrypt: + xorl 0(%r14),%r9d + xorl 4(%r14),%r8d + xorl 8(%r14),%r11d + xorl 12(%r14),%r10d +.align 16 +.Leloop: + movl 16(%r14),%ebx + movl 20(%r14),%eax + + xorl %r8d,%eax + xorl %r9d,%ebx + movzbl %ah,%esi + movzbl %bl,%edi + movl 2052(%rbp,%rsi,8),%edx + movl 0(%rbp,%rdi,8),%ecx + movzbl %al,%esi + shrl $16,%eax + movzbl %bh,%edi + xorl 4(%rbp,%rsi,8),%edx + shrl $16,%ebx + xorl 4(%rbp,%rdi,8),%ecx + movzbl %ah,%esi + movzbl %bl,%edi + xorl 0(%rbp,%rsi,8),%edx + xorl 2052(%rbp,%rdi,8),%ecx + movzbl %al,%esi + movzbl %bh,%edi + xorl 2048(%rbp,%rsi,8),%edx + xorl 2048(%rbp,%rdi,8),%ecx + movl 24(%r14),%ebx + movl 28(%r14),%eax + xorl %edx,%ecx + rorl $8,%edx + xorl %ecx,%r10d + xorl %ecx,%r11d + xorl %edx,%r11d + xorl %r10d,%eax + xorl %r11d,%ebx + movzbl %ah,%esi + movzbl %bl,%edi + movl 2052(%rbp,%rsi,8),%edx + movl 0(%rbp,%rdi,8),%ecx + movzbl %al,%esi + shrl $16,%eax + movzbl %bh,%edi + xorl 4(%rbp,%rsi,8),%edx + shrl $16,%ebx + xorl 4(%rbp,%rdi,8),%ecx + movzbl %ah,%esi + movzbl %bl,%edi + xorl 0(%rbp,%rsi,8),%edx + xorl 2052(%rbp,%rdi,8),%ecx + movzbl %al,%esi + movzbl %bh,%edi + xorl 2048(%rbp,%rsi,8),%edx + xorl 2048(%rbp,%rdi,8),%ecx + movl 32(%r14),%ebx + movl 36(%r14),%eax + xorl %edx,%ecx + rorl $8,%edx + xorl %ecx,%r8d + xorl %ecx,%r9d + xorl %edx,%r9d + xorl %r8d,%eax + xorl %r9d,%ebx + movzbl %ah,%esi + movzbl %bl,%edi + movl 2052(%rbp,%rsi,8),%edx + movl 0(%rbp,%rdi,8),%ecx + movzbl %al,%esi + shrl $16,%eax + movzbl %bh,%edi + xorl 4(%rbp,%rsi,8),%edx + shrl $16,%ebx + xorl 4(%rbp,%rdi,8),%ecx + movzbl %ah,%esi + movzbl %bl,%edi + xorl 0(%rbp,%rsi,8),%edx + xorl 2052(%rbp,%rdi,8),%ecx + movzbl %al,%esi + movzbl %bh,%edi + xorl 2048(%rbp,%rsi,8),%edx + xorl 2048(%rbp,%rdi,8),%ecx + movl 40(%r14),%ebx + movl 44(%r14),%eax + xorl %edx,%ecx + rorl $8,%edx + xorl %ecx,%r10d + xorl %ecx,%r11d + xorl %edx,%r11d + xorl %r10d,%eax + xorl %r11d,%ebx + movzbl %ah,%esi + movzbl %bl,%edi + movl 2052(%rbp,%rsi,8),%edx + movl 0(%rbp,%rdi,8),%ecx + movzbl %al,%esi + shrl $16,%eax + movzbl %bh,%edi + xorl 4(%rbp,%rsi,8),%edx + shrl $16,%ebx + xorl 4(%rbp,%rdi,8),%ecx + movzbl %ah,%esi + movzbl %bl,%edi + xorl 0(%rbp,%rsi,8),%edx + xorl 2052(%rbp,%rdi,8),%ecx + movzbl %al,%esi + movzbl %bh,%edi + xorl 2048(%rbp,%rsi,8),%edx + xorl 2048(%rbp,%rdi,8),%ecx + movl 48(%r14),%ebx + movl 52(%r14),%eax + xorl %edx,%ecx + rorl $8,%edx + xorl %ecx,%r8d + xorl %ecx,%r9d + xorl %edx,%r9d + xorl %r8d,%eax + xorl %r9d,%ebx + movzbl %ah,%esi + movzbl %bl,%edi + movl 2052(%rbp,%rsi,8),%edx + movl 0(%rbp,%rdi,8),%ecx + movzbl %al,%esi + shrl $16,%eax + movzbl %bh,%edi + xorl 4(%rbp,%rsi,8),%edx + shrl $16,%ebx + xorl 4(%rbp,%rdi,8),%ecx + movzbl %ah,%esi + movzbl %bl,%edi + xorl 0(%rbp,%rsi,8),%edx + xorl 2052(%rbp,%rdi,8),%ecx + movzbl %al,%esi + movzbl %bh,%edi + xorl 2048(%rbp,%rsi,8),%edx + xorl 2048(%rbp,%rdi,8),%ecx + movl 56(%r14),%ebx + movl 60(%r14),%eax + xorl %edx,%ecx + rorl $8,%edx + xorl %ecx,%r10d + xorl %ecx,%r11d + xorl %edx,%r11d + xorl %r10d,%eax + xorl %r11d,%ebx + movzbl %ah,%esi + movzbl %bl,%edi + movl 2052(%rbp,%rsi,8),%edx + movl 0(%rbp,%rdi,8),%ecx + movzbl %al,%esi + shrl $16,%eax + movzbl %bh,%edi + xorl 4(%rbp,%rsi,8),%edx + shrl $16,%ebx + xorl 4(%rbp,%rdi,8),%ecx + movzbl %ah,%esi + movzbl %bl,%edi + xorl 0(%rbp,%rsi,8),%edx + xorl 2052(%rbp,%rdi,8),%ecx + movzbl %al,%esi + movzbl %bh,%edi + xorl 2048(%rbp,%rsi,8),%edx + xorl 2048(%rbp,%rdi,8),%ecx + movl 64(%r14),%ebx + movl 68(%r14),%eax + xorl %edx,%ecx + rorl $8,%edx + xorl %ecx,%r8d + xorl %ecx,%r9d + xorl %edx,%r9d + leaq 64(%r14),%r14 + cmpq %r15,%r14 + movl 8(%r14),%edx + movl 12(%r14),%ecx + je .Ledone + + andl %r8d,%eax + orl %r11d,%edx + roll $1,%eax + xorl %edx,%r10d + xorl %eax,%r9d + andl %r10d,%ecx + orl %r9d,%ebx + roll $1,%ecx + xorl %ebx,%r8d + xorl %ecx,%r11d + jmp .Leloop + +.align 16 +.Ledone: + xorl %r10d,%eax + xorl %r11d,%ebx + xorl %r8d,%ecx + xorl %r9d,%edx + + movl %eax,%r8d + movl %ebx,%r9d + movl %ecx,%r10d + movl %edx,%r11d + +.byte 0xf3,0xc3 +.size _x86_64_Camellia_encrypt,.-_x86_64_Camellia_encrypt + + +.globl Camellia_DecryptBlock +.type Camellia_DecryptBlock,@function +.align 16 +Camellia_DecryptBlock: + movl $128,%eax + subl %edi,%eax + movl $3,%edi + adcl $0,%edi + jmp .Ldec_rounds +.size Camellia_DecryptBlock,.-Camellia_DecryptBlock + +.globl Camellia_DecryptBlock_Rounds +.type Camellia_DecryptBlock_Rounds,@function +.align 16 +.Ldec_rounds: +Camellia_DecryptBlock_Rounds: + pushq %rbx + pushq %rbp + pushq %r13 + pushq %r14 + pushq %r15 +.Ldec_prologue: + + + movq %rcx,%r13 + movq %rdx,%r15 + + shll $6,%edi + leaq .LCamellia_SBOX(%rip),%rbp + leaq (%r15,%rdi,1),%r14 + + movl 0(%rsi),%r8d + movl 4(%rsi),%r9d + movl 8(%rsi),%r10d + bswapl %r8d + movl 12(%rsi),%r11d + bswapl %r9d + bswapl %r10d + bswapl %r11d + + call _x86_64_Camellia_decrypt + + bswapl %r8d + bswapl %r9d + bswapl %r10d + movl %r8d,0(%r13) + bswapl %r11d + movl %r9d,4(%r13) + movl %r10d,8(%r13) + movl %r11d,12(%r13) + + movq 0(%rsp),%r15 + movq 8(%rsp),%r14 + movq 16(%rsp),%r13 + movq 24(%rsp),%rbp + movq 32(%rsp),%rbx + leaq 40(%rsp),%rsp +.Ldec_epilogue: + .byte 0xf3,0xc3 +.size Camellia_DecryptBlock_Rounds,.-Camellia_DecryptBlock_Rounds + +.type _x86_64_Camellia_decrypt,@function +.align 16 +_x86_64_Camellia_decrypt: + xorl 0(%r14),%r9d + xorl 4(%r14),%r8d + xorl 8(%r14),%r11d + xorl 12(%r14),%r10d +.align 16 +.Ldloop: + movl -8(%r14),%ebx + movl -4(%r14),%eax + + xorl %r8d,%eax + xorl %r9d,%ebx + movzbl %ah,%esi + movzbl %bl,%edi + movl 2052(%rbp,%rsi,8),%edx + movl 0(%rbp,%rdi,8),%ecx + movzbl %al,%esi + shrl $16,%eax + movzbl %bh,%edi + xorl 4(%rbp,%rsi,8),%edx + shrl $16,%ebx + xorl 4(%rbp,%rdi,8),%ecx + movzbl %ah,%esi + movzbl %bl,%edi + xorl 0(%rbp,%rsi,8),%edx + xorl 2052(%rbp,%rdi,8),%ecx + movzbl %al,%esi + movzbl %bh,%edi + xorl 2048(%rbp,%rsi,8),%edx + xorl 2048(%rbp,%rdi,8),%ecx + movl -16(%r14),%ebx + movl -12(%r14),%eax + xorl %edx,%ecx + rorl $8,%edx + xorl %ecx,%r10d + xorl %ecx,%r11d + xorl %edx,%r11d + xorl %r10d,%eax + xorl %r11d,%ebx + movzbl %ah,%esi + movzbl %bl,%edi + movl 2052(%rbp,%rsi,8),%edx + movl 0(%rbp,%rdi,8),%ecx + movzbl %al,%esi + shrl $16,%eax + movzbl %bh,%edi + xorl 4(%rbp,%rsi,8),%edx + shrl $16,%ebx + xorl 4(%rbp,%rdi,8),%ecx + movzbl %ah,%esi + movzbl %bl,%edi + xorl 0(%rbp,%rsi,8),%edx + xorl 2052(%rbp,%rdi,8),%ecx + movzbl %al,%esi + movzbl %bh,%edi + xorl 2048(%rbp,%rsi,8),%edx + xorl 2048(%rbp,%rdi,8),%ecx + movl -24(%r14),%ebx + movl -20(%r14),%eax + xorl %edx,%ecx + rorl $8,%edx + xorl %ecx,%r8d + xorl %ecx,%r9d + xorl %edx,%r9d + xorl %r8d,%eax + xorl %r9d,%ebx + movzbl %ah,%esi + movzbl %bl,%edi + movl 2052(%rbp,%rsi,8),%edx + movl 0(%rbp,%rdi,8),%ecx + movzbl %al,%esi + shrl $16,%eax + movzbl %bh,%edi + xorl 4(%rbp,%rsi,8),%edx + shrl $16,%ebx + xorl 4(%rbp,%rdi,8),%ecx + movzbl %ah,%esi + movzbl %bl,%edi + xorl 0(%rbp,%rsi,8),%edx + xorl 2052(%rbp,%rdi,8),%ecx + movzbl %al,%esi + movzbl %bh,%edi + xorl 2048(%rbp,%rsi,8),%edx + xorl 2048(%rbp,%rdi,8),%ecx + movl -32(%r14),%ebx + movl -28(%r14),%eax + xorl %edx,%ecx + rorl $8,%edx + xorl %ecx,%r10d + xorl %ecx,%r11d + xorl %edx,%r11d + xorl %r10d,%eax + xorl %r11d,%ebx + movzbl %ah,%esi + movzbl %bl,%edi + movl 2052(%rbp,%rsi,8),%edx + movl 0(%rbp,%rdi,8),%ecx + movzbl %al,%esi + shrl $16,%eax + movzbl %bh,%edi + xorl 4(%rbp,%rsi,8),%edx + shrl $16,%ebx + xorl 4(%rbp,%rdi,8),%ecx + movzbl %ah,%esi + movzbl %bl,%edi + xorl 0(%rbp,%rsi,8),%edx + xorl 2052(%rbp,%rdi,8),%ecx + movzbl %al,%esi + movzbl %bh,%edi + xorl 2048(%rbp,%rsi,8),%edx + xorl 2048(%rbp,%rdi,8),%ecx + movl -40(%r14),%ebx + movl -36(%r14),%eax + xorl %edx,%ecx + rorl $8,%edx + xorl %ecx,%r8d + xorl %ecx,%r9d + xorl %edx,%r9d + xorl %r8d,%eax + xorl %r9d,%ebx + movzbl %ah,%esi + movzbl %bl,%edi + movl 2052(%rbp,%rsi,8),%edx + movl 0(%rbp,%rdi,8),%ecx + movzbl %al,%esi + shrl $16,%eax + movzbl %bh,%edi + xorl 4(%rbp,%rsi,8),%edx + shrl $16,%ebx + xorl 4(%rbp,%rdi,8),%ecx + movzbl %ah,%esi + movzbl %bl,%edi + xorl 0(%rbp,%rsi,8),%edx + xorl 2052(%rbp,%rdi,8),%ecx + movzbl %al,%esi + movzbl %bh,%edi + xorl 2048(%rbp,%rsi,8),%edx + xorl 2048(%rbp,%rdi,8),%ecx + movl -48(%r14),%ebx + movl -44(%r14),%eax + xorl %edx,%ecx + rorl $8,%edx + xorl %ecx,%r10d + xorl %ecx,%r11d + xorl %edx,%r11d + xorl %r10d,%eax + xorl %r11d,%ebx + movzbl %ah,%esi + movzbl %bl,%edi + movl 2052(%rbp,%rsi,8),%edx + movl 0(%rbp,%rdi,8),%ecx + movzbl %al,%esi + shrl $16,%eax + movzbl %bh,%edi + xorl 4(%rbp,%rsi,8),%edx + shrl $16,%ebx + xorl 4(%rbp,%rdi,8),%ecx + movzbl %ah,%esi + movzbl %bl,%edi + xorl 0(%rbp,%rsi,8),%edx + xorl 2052(%rbp,%rdi,8),%ecx + movzbl %al,%esi + movzbl %bh,%edi + xorl 2048(%rbp,%rsi,8),%edx + xorl 2048(%rbp,%rdi,8),%ecx + movl -56(%r14),%ebx + movl -52(%r14),%eax + xorl %edx,%ecx + rorl $8,%edx + xorl %ecx,%r8d + xorl %ecx,%r9d + xorl %edx,%r9d + leaq -64(%r14),%r14 + cmpq %r15,%r14 + movl 0(%r14),%edx + movl 4(%r14),%ecx + je .Lddone + + andl %r8d,%eax + orl %r11d,%edx + roll $1,%eax + xorl %edx,%r10d + xorl %eax,%r9d + andl %r10d,%ecx + orl %r9d,%ebx + roll $1,%ecx + xorl %ebx,%r8d + xorl %ecx,%r11d + + jmp .Ldloop + +.align 16 +.Lddone: + xorl %r10d,%ecx + xorl %r11d,%edx + xorl %r8d,%eax + xorl %r9d,%ebx + + movl %ecx,%r8d + movl %edx,%r9d + movl %eax,%r10d + movl %ebx,%r11d + +.byte 0xf3,0xc3 +.size _x86_64_Camellia_decrypt,.-_x86_64_Camellia_decrypt +.globl Camellia_Ekeygen +.type Camellia_Ekeygen,@function +.align 16 +Camellia_Ekeygen: + pushq %rbx + pushq %rbp + pushq %r13 + pushq %r14 + pushq %r15 +.Lkey_prologue: + + movl %edi,%r15d + movq %rdx,%r13 + + movl 0(%rsi),%r8d + movl 4(%rsi),%r9d + movl 8(%rsi),%r10d + movl 12(%rsi),%r11d + + bswapl %r8d + bswapl %r9d + bswapl %r10d + bswapl %r11d + movl %r9d,0(%r13) + movl %r8d,4(%r13) + movl %r11d,8(%r13) + movl %r10d,12(%r13) + cmpq $128,%r15 + je .L1st128 + + movl 16(%rsi),%r8d + movl 20(%rsi),%r9d + cmpq $192,%r15 + je .L1st192 + movl 24(%rsi),%r10d + movl 28(%rsi),%r11d + jmp .L1st256 +.L1st192: + movl %r8d,%r10d + movl %r9d,%r11d + notl %r10d + notl %r11d +.L1st256: + bswapl %r8d + bswapl %r9d + bswapl %r10d + bswapl %r11d + movl %r9d,32(%r13) + movl %r8d,36(%r13) + movl %r11d,40(%r13) + movl %r10d,44(%r13) + xorl 0(%r13),%r9d + xorl 4(%r13),%r8d + xorl 8(%r13),%r11d + xorl 12(%r13),%r10d + +.L1st128: + leaq .LCamellia_SIGMA(%rip),%r14 + leaq .LCamellia_SBOX(%rip),%rbp + + movl 0(%r14),%ebx + movl 4(%r14),%eax + xorl %r8d,%eax + xorl %r9d,%ebx + movzbl %ah,%esi + movzbl %bl,%edi + movl 2052(%rbp,%rsi,8),%edx + movl 0(%rbp,%rdi,8),%ecx + movzbl %al,%esi + shrl $16,%eax + movzbl %bh,%edi + xorl 4(%rbp,%rsi,8),%edx + shrl $16,%ebx + xorl 4(%rbp,%rdi,8),%ecx + movzbl %ah,%esi + movzbl %bl,%edi + xorl 0(%rbp,%rsi,8),%edx + xorl 2052(%rbp,%rdi,8),%ecx + movzbl %al,%esi + movzbl %bh,%edi + xorl 2048(%rbp,%rsi,8),%edx + xorl 2048(%rbp,%rdi,8),%ecx + movl 8(%r14),%ebx + movl 12(%r14),%eax + xorl %edx,%ecx + rorl $8,%edx + xorl %ecx,%r10d + xorl %ecx,%r11d + xorl %edx,%r11d + xorl %r10d,%eax + xorl %r11d,%ebx + movzbl %ah,%esi + movzbl %bl,%edi + movl 2052(%rbp,%rsi,8),%edx + movl 0(%rbp,%rdi,8),%ecx + movzbl %al,%esi + shrl $16,%eax + movzbl %bh,%edi + xorl 4(%rbp,%rsi,8),%edx + shrl $16,%ebx + xorl 4(%rbp,%rdi,8),%ecx + movzbl %ah,%esi + movzbl %bl,%edi + xorl 0(%rbp,%rsi,8),%edx + xorl 2052(%rbp,%rdi,8),%ecx + movzbl %al,%esi + movzbl %bh,%edi + xorl 2048(%rbp,%rsi,8),%edx + xorl 2048(%rbp,%rdi,8),%ecx + movl 16(%r14),%ebx + movl 20(%r14),%eax + xorl %edx,%ecx + rorl $8,%edx + xorl %ecx,%r8d + xorl %ecx,%r9d + xorl %edx,%r9d + xorl 0(%r13),%r9d + xorl 4(%r13),%r8d + xorl 8(%r13),%r11d + xorl 12(%r13),%r10d + xorl %r8d,%eax + xorl %r9d,%ebx + movzbl %ah,%esi + movzbl %bl,%edi + movl 2052(%rbp,%rsi,8),%edx + movl 0(%rbp,%rdi,8),%ecx + movzbl %al,%esi + shrl $16,%eax + movzbl %bh,%edi + xorl 4(%rbp,%rsi,8),%edx + shrl $16,%ebx + xorl 4(%rbp,%rdi,8),%ecx + movzbl %ah,%esi + movzbl %bl,%edi + xorl 0(%rbp,%rsi,8),%edx + xorl 2052(%rbp,%rdi,8),%ecx + movzbl %al,%esi + movzbl %bh,%edi + xorl 2048(%rbp,%rsi,8),%edx + xorl 2048(%rbp,%rdi,8),%ecx + movl 24(%r14),%ebx + movl 28(%r14),%eax + xorl %edx,%ecx + rorl $8,%edx + xorl %ecx,%r10d + xorl %ecx,%r11d + xorl %edx,%r11d + xorl %r10d,%eax + xorl %r11d,%ebx + movzbl %ah,%esi + movzbl %bl,%edi + movl 2052(%rbp,%rsi,8),%edx + movl 0(%rbp,%rdi,8),%ecx + movzbl %al,%esi + shrl $16,%eax + movzbl %bh,%edi + xorl 4(%rbp,%rsi,8),%edx + shrl $16,%ebx + xorl 4(%rbp,%rdi,8),%ecx + movzbl %ah,%esi + movzbl %bl,%edi + xorl 0(%rbp,%rsi,8),%edx + xorl 2052(%rbp,%rdi,8),%ecx + movzbl %al,%esi + movzbl %bh,%edi + xorl 2048(%rbp,%rsi,8),%edx + xorl 2048(%rbp,%rdi,8),%ecx + movl 32(%r14),%ebx + movl 36(%r14),%eax + xorl %edx,%ecx + rorl $8,%edx + xorl %ecx,%r8d + xorl %ecx,%r9d + xorl %edx,%r9d + cmpq $128,%r15 + jne .L2nd256 + + leaq 128(%r13),%r13 + shlq $32,%r8 + shlq $32,%r10 + orq %r9,%r8 + orq %r11,%r10 + movq -128(%r13),%rax + movq -120(%r13),%rbx + movq %r8,-112(%r13) + movq %r10,-104(%r13) + movq %rax,%r11 + shlq $15,%rax + movq %rbx,%r9 + shrq $49,%r9 + shrq $49,%r11 + orq %r9,%rax + shlq $15,%rbx + orq %r11,%rbx + movq %rax,-96(%r13) + movq %rbx,-88(%r13) + movq %r8,%r11 + shlq $15,%r8 + movq %r10,%r9 + shrq $49,%r9 + shrq $49,%r11 + orq %r9,%r8 + shlq $15,%r10 + orq %r11,%r10 + movq %r8,-80(%r13) + movq %r10,-72(%r13) + movq %r8,%r11 + shlq $15,%r8 + movq %r10,%r9 + shrq $49,%r9 + shrq $49,%r11 + orq %r9,%r8 + shlq $15,%r10 + orq %r11,%r10 + movq %r8,-64(%r13) + movq %r10,-56(%r13) + movq %rax,%r11 + shlq $30,%rax + movq %rbx,%r9 + shrq $34,%r9 + shrq $34,%r11 + orq %r9,%rax + shlq $30,%rbx + orq %r11,%rbx + movq %rax,-48(%r13) + movq %rbx,-40(%r13) + movq %r8,%r11 + shlq $15,%r8 + movq %r10,%r9 + shrq $49,%r9 + shrq $49,%r11 + orq %r9,%r8 + shlq $15,%r10 + orq %r11,%r10 + movq %r8,-32(%r13) + movq %rax,%r11 + shlq $15,%rax + movq %rbx,%r9 + shrq $49,%r9 + shrq $49,%r11 + orq %r9,%rax + shlq $15,%rbx + orq %r11,%rbx + movq %rbx,-24(%r13) + movq %r8,%r11 + shlq $15,%r8 + movq %r10,%r9 + shrq $49,%r9 + shrq $49,%r11 + orq %r9,%r8 + shlq $15,%r10 + orq %r11,%r10 + movq %r8,-16(%r13) + movq %r10,-8(%r13) + movq %rax,%r11 + shlq $17,%rax + movq %rbx,%r9 + shrq $47,%r9 + shrq $47,%r11 + orq %r9,%rax + shlq $17,%rbx + orq %r11,%rbx + movq %rax,0(%r13) + movq %rbx,8(%r13) + movq %rax,%r11 + shlq $17,%rax + movq %rbx,%r9 + shrq $47,%r9 + shrq $47,%r11 + orq %r9,%rax + shlq $17,%rbx + orq %r11,%rbx + movq %rax,16(%r13) + movq %rbx,24(%r13) + movq %r8,%r11 + shlq $34,%r8 + movq %r10,%r9 + shrq $30,%r9 + shrq $30,%r11 + orq %r9,%r8 + shlq $34,%r10 + orq %r11,%r10 + movq %r8,32(%r13) + movq %r10,40(%r13) + movq %rax,%r11 + shlq $17,%rax + movq %rbx,%r9 + shrq $47,%r9 + shrq $47,%r11 + orq %r9,%rax + shlq $17,%rbx + orq %r11,%rbx + movq %rax,48(%r13) + movq %rbx,56(%r13) + movq %r8,%r11 + shlq $17,%r8 + movq %r10,%r9 + shrq $47,%r9 + shrq $47,%r11 + orq %r9,%r8 + shlq $17,%r10 + orq %r11,%r10 + movq %r8,64(%r13) + movq %r10,72(%r13) + movl $3,%eax + jmp .Ldone +.align 16 +.L2nd256: + movl %r9d,48(%r13) + movl %r8d,52(%r13) + movl %r11d,56(%r13) + movl %r10d,60(%r13) + xorl 32(%r13),%r9d + xorl 36(%r13),%r8d + xorl 40(%r13),%r11d + xorl 44(%r13),%r10d + xorl %r8d,%eax + xorl %r9d,%ebx + movzbl %ah,%esi + movzbl %bl,%edi + movl 2052(%rbp,%rsi,8),%edx + movl 0(%rbp,%rdi,8),%ecx + movzbl %al,%esi + shrl $16,%eax + movzbl %bh,%edi + xorl 4(%rbp,%rsi,8),%edx + shrl $16,%ebx + xorl 4(%rbp,%rdi,8),%ecx + movzbl %ah,%esi + movzbl %bl,%edi + xorl 0(%rbp,%rsi,8),%edx + xorl 2052(%rbp,%rdi,8),%ecx + movzbl %al,%esi + movzbl %bh,%edi + xorl 2048(%rbp,%rsi,8),%edx + xorl 2048(%rbp,%rdi,8),%ecx + movl 40(%r14),%ebx + movl 44(%r14),%eax + xorl %edx,%ecx + rorl $8,%edx + xorl %ecx,%r10d + xorl %ecx,%r11d + xorl %edx,%r11d + xorl %r10d,%eax + xorl %r11d,%ebx + movzbl %ah,%esi + movzbl %bl,%edi + movl 2052(%rbp,%rsi,8),%edx + movl 0(%rbp,%rdi,8),%ecx + movzbl %al,%esi + shrl $16,%eax + movzbl %bh,%edi + xorl 4(%rbp,%rsi,8),%edx + shrl $16,%ebx + xorl 4(%rbp,%rdi,8),%ecx + movzbl %ah,%esi + movzbl %bl,%edi + xorl 0(%rbp,%rsi,8),%edx + xorl 2052(%rbp,%rdi,8),%ecx + movzbl %al,%esi + movzbl %bh,%edi + xorl 2048(%rbp,%rsi,8),%edx + xorl 2048(%rbp,%rdi,8),%ecx + movl 48(%r14),%ebx + movl 52(%r14),%eax + xorl %edx,%ecx + rorl $8,%edx + xorl %ecx,%r8d + xorl %ecx,%r9d + xorl %edx,%r9d + movq 0(%r13),%rax + movq 8(%r13),%rbx + movq 32(%r13),%rcx + movq 40(%r13),%rdx + movq 48(%r13),%r14 + movq 56(%r13),%r15 + leaq 128(%r13),%r13 + shlq $32,%r8 + shlq $32,%r10 + orq %r9,%r8 + orq %r11,%r10 + movq %r8,-112(%r13) + movq %r10,-104(%r13) + movq %rcx,%r11 + shlq $15,%rcx + movq %rdx,%r9 + shrq $49,%r9 + shrq $49,%r11 + orq %r9,%rcx + shlq $15,%rdx + orq %r11,%rdx + movq %rcx,-96(%r13) + movq %rdx,-88(%r13) + movq %r14,%r11 + shlq $15,%r14 + movq %r15,%r9 + shrq $49,%r9 + shrq $49,%r11 + orq %r9,%r14 + shlq $15,%r15 + orq %r11,%r15 + movq %r14,-80(%r13) + movq %r15,-72(%r13) + movq %rcx,%r11 + shlq $15,%rcx + movq %rdx,%r9 + shrq $49,%r9 + shrq $49,%r11 + orq %r9,%rcx + shlq $15,%rdx + orq %r11,%rdx + movq %rcx,-64(%r13) + movq %rdx,-56(%r13) + movq %r8,%r11 + shlq $30,%r8 + movq %r10,%r9 + shrq $34,%r9 + shrq $34,%r11 + orq %r9,%r8 + shlq $30,%r10 + orq %r11,%r10 + movq %r8,-48(%r13) + movq %r10,-40(%r13) + movq %rax,%r11 + shlq $45,%rax + movq %rbx,%r9 + shrq $19,%r9 + shrq $19,%r11 + orq %r9,%rax + shlq $45,%rbx + orq %r11,%rbx + movq %rax,-32(%r13) + movq %rbx,-24(%r13) + movq %r14,%r11 + shlq $30,%r14 + movq %r15,%r9 + shrq $34,%r9 + shrq $34,%r11 + orq %r9,%r14 + shlq $30,%r15 + orq %r11,%r15 + movq %r14,-16(%r13) + movq %r15,-8(%r13) + movq %rax,%r11 + shlq $15,%rax + movq %rbx,%r9 + shrq $49,%r9 + shrq $49,%r11 + orq %r9,%rax + shlq $15,%rbx + orq %r11,%rbx + movq %rax,0(%r13) + movq %rbx,8(%r13) + movq %rcx,%r11 + shlq $30,%rcx + movq %rdx,%r9 + shrq $34,%r9 + shrq $34,%r11 + orq %r9,%rcx + shlq $30,%rdx + orq %r11,%rdx + movq %rcx,16(%r13) + movq %rdx,24(%r13) + movq %r8,%r11 + shlq $30,%r8 + movq %r10,%r9 + shrq $34,%r9 + shrq $34,%r11 + orq %r9,%r8 + shlq $30,%r10 + orq %r11,%r10 + movq %r8,32(%r13) + movq %r10,40(%r13) + movq %rax,%r11 + shlq $17,%rax + movq %rbx,%r9 + shrq $47,%r9 + shrq $47,%r11 + orq %r9,%rax + shlq $17,%rbx + orq %r11,%rbx + movq %rax,48(%r13) + movq %rbx,56(%r13) + movq %r14,%r11 + shlq $32,%r14 + movq %r15,%r9 + shrq $32,%r9 + shrq $32,%r11 + orq %r9,%r14 + shlq $32,%r15 + orq %r11,%r15 + movq %r14,64(%r13) + movq %r15,72(%r13) + movq %rcx,%r11 + shlq $34,%rcx + movq %rdx,%r9 + shrq $30,%r9 + shrq $30,%r11 + orq %r9,%rcx + shlq $34,%rdx + orq %r11,%rdx + movq %rcx,80(%r13) + movq %rdx,88(%r13) + movq %r14,%r11 + shlq $17,%r14 + movq %r15,%r9 + shrq $47,%r9 + shrq $47,%r11 + orq %r9,%r14 + shlq $17,%r15 + orq %r11,%r15 + movq %r14,96(%r13) + movq %r15,104(%r13) + movq %rax,%r11 + shlq $34,%rax + movq %rbx,%r9 + shrq $30,%r9 + shrq $30,%r11 + orq %r9,%rax + shlq $34,%rbx + orq %r11,%rbx + movq %rax,112(%r13) + movq %rbx,120(%r13) + movq %r8,%r11 + shlq $51,%r8 + movq %r10,%r9 + shrq $13,%r9 + shrq $13,%r11 + orq %r9,%r8 + shlq $51,%r10 + orq %r11,%r10 + movq %r8,128(%r13) + movq %r10,136(%r13) + movl $4,%eax +.Ldone: + movq 0(%rsp),%r15 + movq 8(%rsp),%r14 + movq 16(%rsp),%r13 + movq 24(%rsp),%rbp + movq 32(%rsp),%rbx + leaq 40(%rsp),%rsp +.Lkey_epilogue: + .byte 0xf3,0xc3 +.size Camellia_Ekeygen,.-Camellia_Ekeygen +.align 64 +.LCamellia_SIGMA: +.long 0x3bcc908b, 0xa09e667f, 0x4caa73b2, 0xb67ae858 +.long 0xe94f82be, 0xc6ef372f, 0xf1d36f1c, 0x54ff53a5 +.long 0xde682d1d, 0x10e527fa, 0xb3e6c1fd, 0xb05688c2 +.long 0, 0, 0, 0 +.LCamellia_SBOX: +.long 0x70707000,0x70700070 +.long 0x82828200,0x2c2c002c +.long 0x2c2c2c00,0xb3b300b3 +.long 0xececec00,0xc0c000c0 +.long 0xb3b3b300,0xe4e400e4 +.long 0x27272700,0x57570057 +.long 0xc0c0c000,0xeaea00ea +.long 0xe5e5e500,0xaeae00ae +.long 0xe4e4e400,0x23230023 +.long 0x85858500,0x6b6b006b +.long 0x57575700,0x45450045 +.long 0x35353500,0xa5a500a5 +.long 0xeaeaea00,0xeded00ed +.long 0x0c0c0c00,0x4f4f004f +.long 0xaeaeae00,0x1d1d001d +.long 0x41414100,0x92920092 +.long 0x23232300,0x86860086 +.long 0xefefef00,0xafaf00af +.long 0x6b6b6b00,0x7c7c007c +.long 0x93939300,0x1f1f001f +.long 0x45454500,0x3e3e003e +.long 0x19191900,0xdcdc00dc +.long 0xa5a5a500,0x5e5e005e +.long 0x21212100,0x0b0b000b +.long 0xededed00,0xa6a600a6 +.long 0x0e0e0e00,0x39390039 +.long 0x4f4f4f00,0xd5d500d5 +.long 0x4e4e4e00,0x5d5d005d +.long 0x1d1d1d00,0xd9d900d9 +.long 0x65656500,0x5a5a005a +.long 0x92929200,0x51510051 +.long 0xbdbdbd00,0x6c6c006c +.long 0x86868600,0x8b8b008b +.long 0xb8b8b800,0x9a9a009a +.long 0xafafaf00,0xfbfb00fb +.long 0x8f8f8f00,0xb0b000b0 +.long 0x7c7c7c00,0x74740074 +.long 0xebebeb00,0x2b2b002b +.long 0x1f1f1f00,0xf0f000f0 +.long 0xcecece00,0x84840084 +.long 0x3e3e3e00,0xdfdf00df +.long 0x30303000,0xcbcb00cb +.long 0xdcdcdc00,0x34340034 +.long 0x5f5f5f00,0x76760076 +.long 0x5e5e5e00,0x6d6d006d +.long 0xc5c5c500,0xa9a900a9 +.long 0x0b0b0b00,0xd1d100d1 +.long 0x1a1a1a00,0x04040004 +.long 0xa6a6a600,0x14140014 +.long 0xe1e1e100,0x3a3a003a +.long 0x39393900,0xdede00de +.long 0xcacaca00,0x11110011 +.long 0xd5d5d500,0x32320032 +.long 0x47474700,0x9c9c009c +.long 0x5d5d5d00,0x53530053 +.long 0x3d3d3d00,0xf2f200f2 +.long 0xd9d9d900,0xfefe00fe +.long 0x01010100,0xcfcf00cf +.long 0x5a5a5a00,0xc3c300c3 +.long 0xd6d6d600,0x7a7a007a +.long 0x51515100,0x24240024 +.long 0x56565600,0xe8e800e8 +.long 0x6c6c6c00,0x60600060 +.long 0x4d4d4d00,0x69690069 +.long 0x8b8b8b00,0xaaaa00aa +.long 0x0d0d0d00,0xa0a000a0 +.long 0x9a9a9a00,0xa1a100a1 +.long 0x66666600,0x62620062 +.long 0xfbfbfb00,0x54540054 +.long 0xcccccc00,0x1e1e001e +.long 0xb0b0b000,0xe0e000e0 +.long 0x2d2d2d00,0x64640064 +.long 0x74747400,0x10100010 +.long 0x12121200,0x00000000 +.long 0x2b2b2b00,0xa3a300a3 +.long 0x20202000,0x75750075 +.long 0xf0f0f000,0x8a8a008a +.long 0xb1b1b100,0xe6e600e6 +.long 0x84848400,0x09090009 +.long 0x99999900,0xdddd00dd +.long 0xdfdfdf00,0x87870087 +.long 0x4c4c4c00,0x83830083 +.long 0xcbcbcb00,0xcdcd00cd +.long 0xc2c2c200,0x90900090 +.long 0x34343400,0x73730073 +.long 0x7e7e7e00,0xf6f600f6 +.long 0x76767600,0x9d9d009d +.long 0x05050500,0xbfbf00bf +.long 0x6d6d6d00,0x52520052 +.long 0xb7b7b700,0xd8d800d8 +.long 0xa9a9a900,0xc8c800c8 +.long 0x31313100,0xc6c600c6 +.long 0xd1d1d100,0x81810081 +.long 0x17171700,0x6f6f006f +.long 0x04040400,0x13130013 +.long 0xd7d7d700,0x63630063 +.long 0x14141400,0xe9e900e9 +.long 0x58585800,0xa7a700a7 +.long 0x3a3a3a00,0x9f9f009f +.long 0x61616100,0xbcbc00bc +.long 0xdedede00,0x29290029 +.long 0x1b1b1b00,0xf9f900f9 +.long 0x11111100,0x2f2f002f +.long 0x1c1c1c00,0xb4b400b4 +.long 0x32323200,0x78780078 +.long 0x0f0f0f00,0x06060006 +.long 0x9c9c9c00,0xe7e700e7 +.long 0x16161600,0x71710071 +.long 0x53535300,0xd4d400d4 +.long 0x18181800,0xabab00ab +.long 0xf2f2f200,0x88880088 +.long 0x22222200,0x8d8d008d +.long 0xfefefe00,0x72720072 +.long 0x44444400,0xb9b900b9 +.long 0xcfcfcf00,0xf8f800f8 +.long 0xb2b2b200,0xacac00ac +.long 0xc3c3c300,0x36360036 +.long 0xb5b5b500,0x2a2a002a +.long 0x7a7a7a00,0x3c3c003c +.long 0x91919100,0xf1f100f1 +.long 0x24242400,0x40400040 +.long 0x08080800,0xd3d300d3 +.long 0xe8e8e800,0xbbbb00bb +.long 0xa8a8a800,0x43430043 +.long 0x60606000,0x15150015 +.long 0xfcfcfc00,0xadad00ad +.long 0x69696900,0x77770077 +.long 0x50505000,0x80800080 +.long 0xaaaaaa00,0x82820082 +.long 0xd0d0d000,0xecec00ec +.long 0xa0a0a000,0x27270027 +.long 0x7d7d7d00,0xe5e500e5 +.long 0xa1a1a100,0x85850085 +.long 0x89898900,0x35350035 +.long 0x62626200,0x0c0c000c +.long 0x97979700,0x41410041 +.long 0x54545400,0xefef00ef +.long 0x5b5b5b00,0x93930093 +.long 0x1e1e1e00,0x19190019 +.long 0x95959500,0x21210021 +.long 0xe0e0e000,0x0e0e000e +.long 0xffffff00,0x4e4e004e +.long 0x64646400,0x65650065 +.long 0xd2d2d200,0xbdbd00bd +.long 0x10101000,0xb8b800b8 +.long 0xc4c4c400,0x8f8f008f +.long 0x00000000,0xebeb00eb +.long 0x48484800,0xcece00ce +.long 0xa3a3a300,0x30300030 +.long 0xf7f7f700,0x5f5f005f +.long 0x75757500,0xc5c500c5 +.long 0xdbdbdb00,0x1a1a001a +.long 0x8a8a8a00,0xe1e100e1 +.long 0x03030300,0xcaca00ca +.long 0xe6e6e600,0x47470047 +.long 0xdadada00,0x3d3d003d +.long 0x09090900,0x01010001 +.long 0x3f3f3f00,0xd6d600d6 +.long 0xdddddd00,0x56560056 +.long 0x94949400,0x4d4d004d +.long 0x87878700,0x0d0d000d +.long 0x5c5c5c00,0x66660066 +.long 0x83838300,0xcccc00cc +.long 0x02020200,0x2d2d002d +.long 0xcdcdcd00,0x12120012 +.long 0x4a4a4a00,0x20200020 +.long 0x90909000,0xb1b100b1 +.long 0x33333300,0x99990099 +.long 0x73737300,0x4c4c004c +.long 0x67676700,0xc2c200c2 +.long 0xf6f6f600,0x7e7e007e +.long 0xf3f3f300,0x05050005 +.long 0x9d9d9d00,0xb7b700b7 +.long 0x7f7f7f00,0x31310031 +.long 0xbfbfbf00,0x17170017 +.long 0xe2e2e200,0xd7d700d7 +.long 0x52525200,0x58580058 +.long 0x9b9b9b00,0x61610061 +.long 0xd8d8d800,0x1b1b001b +.long 0x26262600,0x1c1c001c +.long 0xc8c8c800,0x0f0f000f +.long 0x37373700,0x16160016 +.long 0xc6c6c600,0x18180018 +.long 0x3b3b3b00,0x22220022 +.long 0x81818100,0x44440044 +.long 0x96969600,0xb2b200b2 +.long 0x6f6f6f00,0xb5b500b5 +.long 0x4b4b4b00,0x91910091 +.long 0x13131300,0x08080008 +.long 0xbebebe00,0xa8a800a8 +.long 0x63636300,0xfcfc00fc +.long 0x2e2e2e00,0x50500050 +.long 0xe9e9e900,0xd0d000d0 +.long 0x79797900,0x7d7d007d +.long 0xa7a7a700,0x89890089 +.long 0x8c8c8c00,0x97970097 +.long 0x9f9f9f00,0x5b5b005b +.long 0x6e6e6e00,0x95950095 +.long 0xbcbcbc00,0xffff00ff +.long 0x8e8e8e00,0xd2d200d2 +.long 0x29292900,0xc4c400c4 +.long 0xf5f5f500,0x48480048 +.long 0xf9f9f900,0xf7f700f7 +.long 0xb6b6b600,0xdbdb00db +.long 0x2f2f2f00,0x03030003 +.long 0xfdfdfd00,0xdada00da +.long 0xb4b4b400,0x3f3f003f +.long 0x59595900,0x94940094 +.long 0x78787800,0x5c5c005c +.long 0x98989800,0x02020002 +.long 0x06060600,0x4a4a004a +.long 0x6a6a6a00,0x33330033 +.long 0xe7e7e700,0x67670067 +.long 0x46464600,0xf3f300f3 +.long 0x71717100,0x7f7f007f +.long 0xbababa00,0xe2e200e2 +.long 0xd4d4d400,0x9b9b009b +.long 0x25252500,0x26260026 +.long 0xababab00,0x37370037 +.long 0x42424200,0x3b3b003b +.long 0x88888800,0x96960096 +.long 0xa2a2a200,0x4b4b004b +.long 0x8d8d8d00,0xbebe00be +.long 0xfafafa00,0x2e2e002e +.long 0x72727200,0x79790079 +.long 0x07070700,0x8c8c008c +.long 0xb9b9b900,0x6e6e006e +.long 0x55555500,0x8e8e008e +.long 0xf8f8f800,0xf5f500f5 +.long 0xeeeeee00,0xb6b600b6 +.long 0xacacac00,0xfdfd00fd +.long 0x0a0a0a00,0x59590059 +.long 0x36363600,0x98980098 +.long 0x49494900,0x6a6a006a +.long 0x2a2a2a00,0x46460046 +.long 0x68686800,0xbaba00ba +.long 0x3c3c3c00,0x25250025 +.long 0x38383800,0x42420042 +.long 0xf1f1f100,0xa2a200a2 +.long 0xa4a4a400,0xfafa00fa +.long 0x40404000,0x07070007 +.long 0x28282800,0x55550055 +.long 0xd3d3d300,0xeeee00ee +.long 0x7b7b7b00,0x0a0a000a +.long 0xbbbbbb00,0x49490049 +.long 0xc9c9c900,0x68680068 +.long 0x43434300,0x38380038 +.long 0xc1c1c100,0xa4a400a4 +.long 0x15151500,0x28280028 +.long 0xe3e3e300,0x7b7b007b +.long 0xadadad00,0xc9c900c9 +.long 0xf4f4f400,0xc1c100c1 +.long 0x77777700,0xe3e300e3 +.long 0xc7c7c700,0xf4f400f4 +.long 0x80808000,0xc7c700c7 +.long 0x9e9e9e00,0x9e9e009e +.long 0x00e0e0e0,0x38003838 +.long 0x00050505,0x41004141 +.long 0x00585858,0x16001616 +.long 0x00d9d9d9,0x76007676 +.long 0x00676767,0xd900d9d9 +.long 0x004e4e4e,0x93009393 +.long 0x00818181,0x60006060 +.long 0x00cbcbcb,0xf200f2f2 +.long 0x00c9c9c9,0x72007272 +.long 0x000b0b0b,0xc200c2c2 +.long 0x00aeaeae,0xab00abab +.long 0x006a6a6a,0x9a009a9a +.long 0x00d5d5d5,0x75007575 +.long 0x00181818,0x06000606 +.long 0x005d5d5d,0x57005757 +.long 0x00828282,0xa000a0a0 +.long 0x00464646,0x91009191 +.long 0x00dfdfdf,0xf700f7f7 +.long 0x00d6d6d6,0xb500b5b5 +.long 0x00272727,0xc900c9c9 +.long 0x008a8a8a,0xa200a2a2 +.long 0x00323232,0x8c008c8c +.long 0x004b4b4b,0xd200d2d2 +.long 0x00424242,0x90009090 +.long 0x00dbdbdb,0xf600f6f6 +.long 0x001c1c1c,0x07000707 +.long 0x009e9e9e,0xa700a7a7 +.long 0x009c9c9c,0x27002727 +.long 0x003a3a3a,0x8e008e8e +.long 0x00cacaca,0xb200b2b2 +.long 0x00252525,0x49004949 +.long 0x007b7b7b,0xde00dede +.long 0x000d0d0d,0x43004343 +.long 0x00717171,0x5c005c5c +.long 0x005f5f5f,0xd700d7d7 +.long 0x001f1f1f,0xc700c7c7 +.long 0x00f8f8f8,0x3e003e3e +.long 0x00d7d7d7,0xf500f5f5 +.long 0x003e3e3e,0x8f008f8f +.long 0x009d9d9d,0x67006767 +.long 0x007c7c7c,0x1f001f1f +.long 0x00606060,0x18001818 +.long 0x00b9b9b9,0x6e006e6e +.long 0x00bebebe,0xaf00afaf +.long 0x00bcbcbc,0x2f002f2f +.long 0x008b8b8b,0xe200e2e2 +.long 0x00161616,0x85008585 +.long 0x00343434,0x0d000d0d +.long 0x004d4d4d,0x53005353 +.long 0x00c3c3c3,0xf000f0f0 +.long 0x00727272,0x9c009c9c +.long 0x00959595,0x65006565 +.long 0x00ababab,0xea00eaea +.long 0x008e8e8e,0xa300a3a3 +.long 0x00bababa,0xae00aeae +.long 0x007a7a7a,0x9e009e9e +.long 0x00b3b3b3,0xec00ecec +.long 0x00020202,0x80008080 +.long 0x00b4b4b4,0x2d002d2d +.long 0x00adadad,0x6b006b6b +.long 0x00a2a2a2,0xa800a8a8 +.long 0x00acacac,0x2b002b2b +.long 0x00d8d8d8,0x36003636 +.long 0x009a9a9a,0xa600a6a6 +.long 0x00171717,0xc500c5c5 +.long 0x001a1a1a,0x86008686 +.long 0x00353535,0x4d004d4d +.long 0x00cccccc,0x33003333 +.long 0x00f7f7f7,0xfd00fdfd +.long 0x00999999,0x66006666 +.long 0x00616161,0x58005858 +.long 0x005a5a5a,0x96009696 +.long 0x00e8e8e8,0x3a003a3a +.long 0x00242424,0x09000909 +.long 0x00565656,0x95009595 +.long 0x00404040,0x10001010 +.long 0x00e1e1e1,0x78007878 +.long 0x00636363,0xd800d8d8 +.long 0x00090909,0x42004242 +.long 0x00333333,0xcc00cccc +.long 0x00bfbfbf,0xef00efef +.long 0x00989898,0x26002626 +.long 0x00979797,0xe500e5e5 +.long 0x00858585,0x61006161 +.long 0x00686868,0x1a001a1a +.long 0x00fcfcfc,0x3f003f3f +.long 0x00ececec,0x3b003b3b +.long 0x000a0a0a,0x82008282 +.long 0x00dadada,0xb600b6b6 +.long 0x006f6f6f,0xdb00dbdb +.long 0x00535353,0xd400d4d4 +.long 0x00626262,0x98009898 +.long 0x00a3a3a3,0xe800e8e8 +.long 0x002e2e2e,0x8b008b8b +.long 0x00080808,0x02000202 +.long 0x00afafaf,0xeb00ebeb +.long 0x00282828,0x0a000a0a +.long 0x00b0b0b0,0x2c002c2c +.long 0x00747474,0x1d001d1d +.long 0x00c2c2c2,0xb000b0b0 +.long 0x00bdbdbd,0x6f006f6f +.long 0x00363636,0x8d008d8d +.long 0x00222222,0x88008888 +.long 0x00383838,0x0e000e0e +.long 0x00646464,0x19001919 +.long 0x001e1e1e,0x87008787 +.long 0x00393939,0x4e004e4e +.long 0x002c2c2c,0x0b000b0b +.long 0x00a6a6a6,0xa900a9a9 +.long 0x00303030,0x0c000c0c +.long 0x00e5e5e5,0x79007979 +.long 0x00444444,0x11001111 +.long 0x00fdfdfd,0x7f007f7f +.long 0x00888888,0x22002222 +.long 0x009f9f9f,0xe700e7e7 +.long 0x00656565,0x59005959 +.long 0x00878787,0xe100e1e1 +.long 0x006b6b6b,0xda00dada +.long 0x00f4f4f4,0x3d003d3d +.long 0x00232323,0xc800c8c8 +.long 0x00484848,0x12001212 +.long 0x00101010,0x04000404 +.long 0x00d1d1d1,0x74007474 +.long 0x00515151,0x54005454 +.long 0x00c0c0c0,0x30003030 +.long 0x00f9f9f9,0x7e007e7e +.long 0x00d2d2d2,0xb400b4b4 +.long 0x00a0a0a0,0x28002828 +.long 0x00555555,0x55005555 +.long 0x00a1a1a1,0x68006868 +.long 0x00414141,0x50005050 +.long 0x00fafafa,0xbe00bebe +.long 0x00434343,0xd000d0d0 +.long 0x00131313,0xc400c4c4 +.long 0x00c4c4c4,0x31003131 +.long 0x002f2f2f,0xcb00cbcb +.long 0x00a8a8a8,0x2a002a2a +.long 0x00b6b6b6,0xad00adad +.long 0x003c3c3c,0x0f000f0f +.long 0x002b2b2b,0xca00caca +.long 0x00c1c1c1,0x70007070 +.long 0x00ffffff,0xff00ffff +.long 0x00c8c8c8,0x32003232 +.long 0x00a5a5a5,0x69006969 +.long 0x00202020,0x08000808 +.long 0x00898989,0x62006262 +.long 0x00000000,0x00000000 +.long 0x00909090,0x24002424 +.long 0x00474747,0xd100d1d1 +.long 0x00efefef,0xfb00fbfb +.long 0x00eaeaea,0xba00baba +.long 0x00b7b7b7,0xed00eded +.long 0x00151515,0x45004545 +.long 0x00060606,0x81008181 +.long 0x00cdcdcd,0x73007373 +.long 0x00b5b5b5,0x6d006d6d +.long 0x00121212,0x84008484 +.long 0x007e7e7e,0x9f009f9f +.long 0x00bbbbbb,0xee00eeee +.long 0x00292929,0x4a004a4a +.long 0x000f0f0f,0xc300c3c3 +.long 0x00b8b8b8,0x2e002e2e +.long 0x00070707,0xc100c1c1 +.long 0x00040404,0x01000101 +.long 0x009b9b9b,0xe600e6e6 +.long 0x00949494,0x25002525 +.long 0x00212121,0x48004848 +.long 0x00666666,0x99009999 +.long 0x00e6e6e6,0xb900b9b9 +.long 0x00cecece,0xb300b3b3 +.long 0x00ededed,0x7b007b7b +.long 0x00e7e7e7,0xf900f9f9 +.long 0x003b3b3b,0xce00cece +.long 0x00fefefe,0xbf00bfbf +.long 0x007f7f7f,0xdf00dfdf +.long 0x00c5c5c5,0x71007171 +.long 0x00a4a4a4,0x29002929 +.long 0x00373737,0xcd00cdcd +.long 0x00b1b1b1,0x6c006c6c +.long 0x004c4c4c,0x13001313 +.long 0x00919191,0x64006464 +.long 0x006e6e6e,0x9b009b9b +.long 0x008d8d8d,0x63006363 +.long 0x00767676,0x9d009d9d +.long 0x00030303,0xc000c0c0 +.long 0x002d2d2d,0x4b004b4b +.long 0x00dedede,0xb700b7b7 +.long 0x00969696,0xa500a5a5 +.long 0x00262626,0x89008989 +.long 0x007d7d7d,0x5f005f5f +.long 0x00c6c6c6,0xb100b1b1 +.long 0x005c5c5c,0x17001717 +.long 0x00d3d3d3,0xf400f4f4 +.long 0x00f2f2f2,0xbc00bcbc +.long 0x004f4f4f,0xd300d3d3 +.long 0x00191919,0x46004646 +.long 0x003f3f3f,0xcf00cfcf +.long 0x00dcdcdc,0x37003737 +.long 0x00797979,0x5e005e5e +.long 0x001d1d1d,0x47004747 +.long 0x00525252,0x94009494 +.long 0x00ebebeb,0xfa00fafa +.long 0x00f3f3f3,0xfc00fcfc +.long 0x006d6d6d,0x5b005b5b +.long 0x005e5e5e,0x97009797 +.long 0x00fbfbfb,0xfe00fefe +.long 0x00696969,0x5a005a5a +.long 0x00b2b2b2,0xac00acac +.long 0x00f0f0f0,0x3c003c3c +.long 0x00313131,0x4c004c4c +.long 0x000c0c0c,0x03000303 +.long 0x00d4d4d4,0x35003535 +.long 0x00cfcfcf,0xf300f3f3 +.long 0x008c8c8c,0x23002323 +.long 0x00e2e2e2,0xb800b8b8 +.long 0x00757575,0x5d005d5d +.long 0x00a9a9a9,0x6a006a6a +.long 0x004a4a4a,0x92009292 +.long 0x00575757,0xd500d5d5 +.long 0x00848484,0x21002121 +.long 0x00111111,0x44004444 +.long 0x00454545,0x51005151 +.long 0x001b1b1b,0xc600c6c6 +.long 0x00f5f5f5,0x7d007d7d +.long 0x00e4e4e4,0x39003939 +.long 0x000e0e0e,0x83008383 +.long 0x00737373,0xdc00dcdc +.long 0x00aaaaaa,0xaa00aaaa +.long 0x00f1f1f1,0x7c007c7c +.long 0x00dddddd,0x77007777 +.long 0x00595959,0x56005656 +.long 0x00141414,0x05000505 +.long 0x006c6c6c,0x1b001b1b +.long 0x00929292,0xa400a4a4 +.long 0x00545454,0x15001515 +.long 0x00d0d0d0,0x34003434 +.long 0x00787878,0x1e001e1e +.long 0x00707070,0x1c001c1c +.long 0x00e3e3e3,0xf800f8f8 +.long 0x00494949,0x52005252 +.long 0x00808080,0x20002020 +.long 0x00505050,0x14001414 +.long 0x00a7a7a7,0xe900e9e9 +.long 0x00f6f6f6,0xbd00bdbd +.long 0x00777777,0xdd00dddd +.long 0x00939393,0xe400e4e4 +.long 0x00868686,0xa100a1a1 +.long 0x00838383,0xe000e0e0 +.long 0x002a2a2a,0x8a008a8a +.long 0x00c7c7c7,0xf100f1f1 +.long 0x005b5b5b,0xd600d6d6 +.long 0x00e9e9e9,0x7a007a7a +.long 0x00eeeeee,0xbb00bbbb +.long 0x008f8f8f,0xe300e3e3 +.long 0x00010101,0x40004040 +.long 0x003d3d3d,0x4f004f4f +.globl Camellia_cbc_encrypt +.type Camellia_cbc_encrypt,@function +.align 16 +Camellia_cbc_encrypt: + cmpq $0,%rdx + je .Lcbc_abort + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 +.Lcbc_prologue: + + movq %rsp,%rbp + subq $64,%rsp + andq $-64,%rsp + + + + leaq -64-63(%rcx),%r10 + subq %rsp,%r10 + negq %r10 + andq $960,%r10 + subq %r10,%rsp + + + movq %rdi,%r12 + movq %rsi,%r13 + movq %r8,%rbx + movq %rcx,%r14 + movl 272(%rcx),%r15d + + movq %r8,40(%rsp) + movq %rbp,48(%rsp) + +.Lcbc_body: + leaq .LCamellia_SBOX(%rip),%rbp + + movl $32,%ecx +.align 4 +.Lcbc_prefetch_sbox: + movq 0(%rbp),%rax + movq 32(%rbp),%rsi + movq 64(%rbp),%rdi + movq 96(%rbp),%r11 + leaq 128(%rbp),%rbp + loop .Lcbc_prefetch_sbox + subq $4096,%rbp + shlq $6,%r15 + movq %rdx,%rcx + leaq (%r14,%r15,1),%r15 + + cmpl $0,%r9d + je .LCBC_DECRYPT + + andq $-16,%rdx + andq $15,%rcx + leaq (%r12,%rdx,1),%rdx + movq %r14,0(%rsp) + movq %rdx,8(%rsp) + movq %rcx,16(%rsp) + + cmpq %r12,%rdx + movl 0(%rbx),%r8d + movl 4(%rbx),%r9d + movl 8(%rbx),%r10d + movl 12(%rbx),%r11d + je .Lcbc_enc_tail + jmp .Lcbc_eloop + +.align 16 +.Lcbc_eloop: + xorl 0(%r12),%r8d + xorl 4(%r12),%r9d + xorl 8(%r12),%r10d + bswapl %r8d + xorl 12(%r12),%r11d + bswapl %r9d + bswapl %r10d + bswapl %r11d + + call _x86_64_Camellia_encrypt + + movq 0(%rsp),%r14 + bswapl %r8d + movq 8(%rsp),%rdx + bswapl %r9d + movq 16(%rsp),%rcx + bswapl %r10d + movl %r8d,0(%r13) + bswapl %r11d + movl %r9d,4(%r13) + movl %r10d,8(%r13) + leaq 16(%r12),%r12 + movl %r11d,12(%r13) + cmpq %rdx,%r12 + leaq 16(%r13),%r13 + jne .Lcbc_eloop + + cmpq $0,%rcx + jne .Lcbc_enc_tail + + movq 40(%rsp),%r13 + movl %r8d,0(%r13) + movl %r9d,4(%r13) + movl %r10d,8(%r13) + movl %r11d,12(%r13) + jmp .Lcbc_done + +.align 16 +.Lcbc_enc_tail: + xorq %rax,%rax + movq %rax,0+24(%rsp) + movq %rax,8+24(%rsp) + movq %rax,16(%rsp) + +.Lcbc_enc_pushf: + pushfq + cld + movq %r12,%rsi + leaq 8+24(%rsp),%rdi +.long 0x9066A4F3 + popfq +.Lcbc_enc_popf: + + leaq 24(%rsp),%r12 + leaq 16+24(%rsp),%rax + movq %rax,8(%rsp) + jmp .Lcbc_eloop + +.align 16 +.LCBC_DECRYPT: + xchgq %r14,%r15 + addq $15,%rdx + andq $15,%rcx + andq $-16,%rdx + movq %r14,0(%rsp) + leaq (%r12,%rdx,1),%rdx + movq %rdx,8(%rsp) + movq %rcx,16(%rsp) + + movq (%rbx),%rax + movq 8(%rbx),%rbx + jmp .Lcbc_dloop +.align 16 +.Lcbc_dloop: + movl 0(%r12),%r8d + movl 4(%r12),%r9d + movl 8(%r12),%r10d + bswapl %r8d + movl 12(%r12),%r11d + bswapl %r9d + movq %rax,0+24(%rsp) + bswapl %r10d + movq %rbx,8+24(%rsp) + bswapl %r11d + + call _x86_64_Camellia_decrypt + + movq 0(%rsp),%r14 + movq 8(%rsp),%rdx + movq 16(%rsp),%rcx + + bswapl %r8d + movq (%r12),%rax + bswapl %r9d + movq 8(%r12),%rbx + bswapl %r10d + xorl 0+24(%rsp),%r8d + bswapl %r11d + xorl 4+24(%rsp),%r9d + xorl 8+24(%rsp),%r10d + leaq 16(%r12),%r12 + xorl 12+24(%rsp),%r11d + cmpq %rdx,%r12 + je .Lcbc_ddone + + movl %r8d,0(%r13) + movl %r9d,4(%r13) + movl %r10d,8(%r13) + movl %r11d,12(%r13) + + leaq 16(%r13),%r13 + jmp .Lcbc_dloop + +.align 16 +.Lcbc_ddone: + movq 40(%rsp),%rdx + cmpq $0,%rcx + jne .Lcbc_dec_tail + + movl %r8d,0(%r13) + movl %r9d,4(%r13) + movl %r10d,8(%r13) + movl %r11d,12(%r13) + + movq %rax,(%rdx) + movq %rbx,8(%rdx) + jmp .Lcbc_done +.align 16 +.Lcbc_dec_tail: + movl %r8d,0+24(%rsp) + movl %r9d,4+24(%rsp) + movl %r10d,8+24(%rsp) + movl %r11d,12+24(%rsp) + +.Lcbc_dec_pushf: + pushfq + cld + leaq 8+24(%rsp),%rsi + leaq (%r13),%rdi +.long 0x9066A4F3 + popfq +.Lcbc_dec_popf: + + movq %rax,(%rdx) + movq %rbx,8(%rdx) + jmp .Lcbc_done + +.align 16 +.Lcbc_done: + movq 48(%rsp),%rcx + movq 0(%rcx),%r15 + movq 8(%rcx),%r14 + movq 16(%rcx),%r13 + movq 24(%rcx),%r12 + movq 32(%rcx),%rbp + movq 40(%rcx),%rbx + leaq 48(%rcx),%rsp +.Lcbc_abort: + .byte 0xf3,0xc3 +.size Camellia_cbc_encrypt,.-Camellia_cbc_encrypt + +.byte 67,97,109,101,108,108,105,97,32,102,111,114,32,120,56,54,95,54,52,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 diff --git a/secure/lib/libcrypto/amd64/ecp_nistz256-x86_64.S b/secure/lib/libcrypto/amd64/ecp_nistz256-x86_64.S new file mode 100644 index 0000000..c5875d7 --- /dev/null +++ b/secure/lib/libcrypto/amd64/ecp_nistz256-x86_64.S @@ -0,0 +1,2005 @@ + # $FreeBSD$ +.text + + + +.align 64 +.Lpoly: +.quad 0xffffffffffffffff, 0x00000000ffffffff, 0x0000000000000000, 0xffffffff00000001 + + +.LRR: +.quad 0x0000000000000003, 0xfffffffbffffffff, 0xfffffffffffffffe, 0x00000004fffffffd + +.LOne: +.long 1,1,1,1,1,1,1,1 +.LTwo: +.long 2,2,2,2,2,2,2,2 +.LThree: +.long 3,3,3,3,3,3,3,3 +.LONE_mont: +.quad 0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe + +.globl ecp_nistz256_mul_by_2 +.type ecp_nistz256_mul_by_2,@function +.align 64 +ecp_nistz256_mul_by_2: + pushq %r12 + pushq %r13 + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + addq %r8,%r8 + movq 16(%rsi),%r10 + adcq %r9,%r9 + movq 24(%rsi),%r11 + leaq .Lpoly(%rip),%rsi + movq %r8,%rax + adcq %r10,%r10 + adcq %r11,%r11 + movq %r9,%rdx + sbbq %r13,%r13 + + subq 0(%rsi),%r8 + movq %r10,%rcx + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + movq %r11,%r12 + sbbq 24(%rsi),%r11 + testq %r13,%r13 + + cmovzq %rax,%r8 + cmovzq %rdx,%r9 + movq %r8,0(%rdi) + cmovzq %rcx,%r10 + movq %r9,8(%rdi) + cmovzq %r12,%r11 + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + popq %r13 + popq %r12 + .byte 0xf3,0xc3 +.size ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2 + + + +.globl ecp_nistz256_div_by_2 +.type ecp_nistz256_div_by_2,@function +.align 32 +ecp_nistz256_div_by_2: + pushq %r12 + pushq %r13 + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq %r8,%rax + movq 24(%rsi),%r11 + leaq .Lpoly(%rip),%rsi + + movq %r9,%rdx + xorq %r13,%r13 + addq 0(%rsi),%r8 + movq %r10,%rcx + adcq 8(%rsi),%r9 + adcq 16(%rsi),%r10 + movq %r11,%r12 + adcq 24(%rsi),%r11 + adcq $0,%r13 + xorq %rsi,%rsi + testq $1,%rax + + cmovzq %rax,%r8 + cmovzq %rdx,%r9 + cmovzq %rcx,%r10 + cmovzq %r12,%r11 + cmovzq %rsi,%r13 + + movq %r9,%rax + shrq $1,%r8 + shlq $63,%rax + movq %r10,%rdx + shrq $1,%r9 + orq %rax,%r8 + shlq $63,%rdx + movq %r11,%rcx + shrq $1,%r10 + orq %rdx,%r9 + shlq $63,%rcx + shrq $1,%r11 + shlq $63,%r13 + orq %rcx,%r10 + orq %r13,%r11 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + popq %r13 + popq %r12 + .byte 0xf3,0xc3 +.size ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2 + + + +.globl ecp_nistz256_mul_by_3 +.type ecp_nistz256_mul_by_3,@function +.align 32 +ecp_nistz256_mul_by_3: + pushq %r12 + pushq %r13 + + movq 0(%rsi),%r8 + xorq %r13,%r13 + movq 8(%rsi),%r9 + addq %r8,%r8 + movq 16(%rsi),%r10 + adcq %r9,%r9 + movq 24(%rsi),%r11 + movq %r8,%rax + adcq %r10,%r10 + adcq %r11,%r11 + movq %r9,%rdx + adcq $0,%r13 + + subq $-1,%r8 + movq %r10,%rcx + sbbq .Lpoly+8(%rip),%r9 + sbbq $0,%r10 + movq %r11,%r12 + sbbq .Lpoly+24(%rip),%r11 + testq %r13,%r13 + + cmovzq %rax,%r8 + cmovzq %rdx,%r9 + cmovzq %rcx,%r10 + cmovzq %r12,%r11 + + xorq %r13,%r13 + addq 0(%rsi),%r8 + adcq 8(%rsi),%r9 + movq %r8,%rax + adcq 16(%rsi),%r10 + adcq 24(%rsi),%r11 + movq %r9,%rdx + adcq $0,%r13 + + subq $-1,%r8 + movq %r10,%rcx + sbbq .Lpoly+8(%rip),%r9 + sbbq $0,%r10 + movq %r11,%r12 + sbbq .Lpoly+24(%rip),%r11 + testq %r13,%r13 + + cmovzq %rax,%r8 + cmovzq %rdx,%r9 + movq %r8,0(%rdi) + cmovzq %rcx,%r10 + movq %r9,8(%rdi) + cmovzq %r12,%r11 + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + popq %r13 + popq %r12 + .byte 0xf3,0xc3 +.size ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3 + + + +.globl ecp_nistz256_add +.type ecp_nistz256_add,@function +.align 32 +ecp_nistz256_add: + pushq %r12 + pushq %r13 + + movq 0(%rsi),%r8 + xorq %r13,%r13 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + leaq .Lpoly(%rip),%rsi + + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + movq %r8,%rax + adcq 16(%rdx),%r10 + adcq 24(%rdx),%r11 + movq %r9,%rdx + adcq $0,%r13 + + subq 0(%rsi),%r8 + movq %r10,%rcx + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + movq %r11,%r12 + sbbq 24(%rsi),%r11 + testq %r13,%r13 + + cmovzq %rax,%r8 + cmovzq %rdx,%r9 + movq %r8,0(%rdi) + cmovzq %rcx,%r10 + movq %r9,8(%rdi) + cmovzq %r12,%r11 + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + popq %r13 + popq %r12 + .byte 0xf3,0xc3 +.size ecp_nistz256_add,.-ecp_nistz256_add + + + +.globl ecp_nistz256_sub +.type ecp_nistz256_sub,@function +.align 32 +ecp_nistz256_sub: + pushq %r12 + pushq %r13 + + movq 0(%rsi),%r8 + xorq %r13,%r13 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + leaq .Lpoly(%rip),%rsi + + subq 0(%rdx),%r8 + sbbq 8(%rdx),%r9 + movq %r8,%rax + sbbq 16(%rdx),%r10 + sbbq 24(%rdx),%r11 + movq %r9,%rdx + sbbq $0,%r13 + + addq 0(%rsi),%r8 + movq %r10,%rcx + adcq 8(%rsi),%r9 + adcq 16(%rsi),%r10 + movq %r11,%r12 + adcq 24(%rsi),%r11 + testq %r13,%r13 + + cmovzq %rax,%r8 + cmovzq %rdx,%r9 + movq %r8,0(%rdi) + cmovzq %rcx,%r10 + movq %r9,8(%rdi) + cmovzq %r12,%r11 + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + popq %r13 + popq %r12 + .byte 0xf3,0xc3 +.size ecp_nistz256_sub,.-ecp_nistz256_sub + + + +.globl ecp_nistz256_neg +.type ecp_nistz256_neg,@function +.align 32 +ecp_nistz256_neg: + pushq %r12 + pushq %r13 + + xorq %r8,%r8 + xorq %r9,%r9 + xorq %r10,%r10 + xorq %r11,%r11 + xorq %r13,%r13 + + subq 0(%rsi),%r8 + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + movq %r8,%rax + sbbq 24(%rsi),%r11 + leaq .Lpoly(%rip),%rsi + movq %r9,%rdx + sbbq $0,%r13 + + addq 0(%rsi),%r8 + movq %r10,%rcx + adcq 8(%rsi),%r9 + adcq 16(%rsi),%r10 + movq %r11,%r12 + adcq 24(%rsi),%r11 + testq %r13,%r13 + + cmovzq %rax,%r8 + cmovzq %rdx,%r9 + movq %r8,0(%rdi) + cmovzq %rcx,%r10 + movq %r9,8(%rdi) + cmovzq %r12,%r11 + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + popq %r13 + popq %r12 + .byte 0xf3,0xc3 +.size ecp_nistz256_neg,.-ecp_nistz256_neg + + + + +.globl ecp_nistz256_to_mont +.type ecp_nistz256_to_mont,@function +.align 32 +ecp_nistz256_to_mont: + leaq .LRR(%rip),%rdx + jmp .Lmul_mont +.size ecp_nistz256_to_mont,.-ecp_nistz256_to_mont + + + + + + + +.globl ecp_nistz256_mul_mont +.type ecp_nistz256_mul_mont,@function +.align 32 +ecp_nistz256_mul_mont: +.Lmul_mont: + pushq %rbp + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + movq %rdx,%rbx + movq 0(%rdx),%rax + movq 0(%rsi),%r9 + movq 8(%rsi),%r10 + movq 16(%rsi),%r11 + movq 24(%rsi),%r12 + + call __ecp_nistz256_mul_montq +.Lmul_mont_done: + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + popq %rbp + .byte 0xf3,0xc3 +.size ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont + +.type __ecp_nistz256_mul_montq,@function +.align 32 +__ecp_nistz256_mul_montq: + + + movq %rax,%rbp + mulq %r9 + movq .Lpoly+8(%rip),%r14 + movq %rax,%r8 + movq %rbp,%rax + movq %rdx,%r9 + + mulq %r10 + movq .Lpoly+24(%rip),%r15 + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq %r11 + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq %r12 + addq %rax,%r11 + movq %r8,%rax + adcq $0,%rdx + xorq %r13,%r13 + movq %rdx,%r12 + + + + + + + + + + + movq %r8,%rbp + shlq $32,%r8 + mulq %r15 + shrq $32,%rbp + addq %r8,%r9 + adcq %rbp,%r10 + adcq %rax,%r11 + movq 8(%rbx),%rax + adcq %rdx,%r12 + adcq $0,%r13 + xorq %r8,%r8 + + + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rcx,%r10 + adcq $0,%rdx + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq 16(%rsi) + addq %rcx,%r11 + adcq $0,%rdx + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq 24(%rsi) + addq %rcx,%r12 + adcq $0,%rdx + addq %rax,%r12 + movq %r9,%rax + adcq %rdx,%r13 + adcq $0,%r8 + + + + movq %r9,%rbp + shlq $32,%r9 + mulq %r15 + shrq $32,%rbp + addq %r9,%r10 + adcq %rbp,%r11 + adcq %rax,%r12 + movq 16(%rbx),%rax + adcq %rdx,%r13 + adcq $0,%r8 + xorq %r9,%r9 + + + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rcx,%r11 + adcq $0,%rdx + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq 16(%rsi) + addq %rcx,%r12 + adcq $0,%rdx + addq %rax,%r12 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq 24(%rsi) + addq %rcx,%r13 + adcq $0,%rdx + addq %rax,%r13 + movq %r10,%rax + adcq %rdx,%r8 + adcq $0,%r9 + + + + movq %r10,%rbp + shlq $32,%r10 + mulq %r15 + shrq $32,%rbp + addq %r10,%r11 + adcq %rbp,%r12 + adcq %rax,%r13 + movq 24(%rbx),%rax + adcq %rdx,%r8 + adcq $0,%r9 + xorq %r10,%r10 + + + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rcx,%r12 + adcq $0,%rdx + addq %rax,%r12 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq 16(%rsi) + addq %rcx,%r13 + adcq $0,%rdx + addq %rax,%r13 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq 24(%rsi) + addq %rcx,%r8 + adcq $0,%rdx + addq %rax,%r8 + movq %r11,%rax + adcq %rdx,%r9 + adcq $0,%r10 + + + + movq %r11,%rbp + shlq $32,%r11 + mulq %r15 + shrq $32,%rbp + addq %r11,%r12 + adcq %rbp,%r13 + movq %r12,%rcx + adcq %rax,%r8 + adcq %rdx,%r9 + movq %r13,%rbp + adcq $0,%r10 + + + + subq $-1,%r12 + movq %r8,%rbx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%rdx + sbbq %r15,%r9 + sbbq $0,%r10 + + cmovcq %rcx,%r12 + cmovcq %rbp,%r13 + movq %r12,0(%rdi) + cmovcq %rbx,%r8 + movq %r13,8(%rdi) + cmovcq %rdx,%r9 + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + .byte 0xf3,0xc3 +.size __ecp_nistz256_mul_montq,.-__ecp_nistz256_mul_montq + + + + + + + + +.globl ecp_nistz256_sqr_mont +.type ecp_nistz256_sqr_mont,@function +.align 32 +ecp_nistz256_sqr_mont: + pushq %rbp + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + movq 0(%rsi),%rax + movq 8(%rsi),%r14 + movq 16(%rsi),%r15 + movq 24(%rsi),%r8 + + call __ecp_nistz256_sqr_montq +.Lsqr_mont_done: + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + popq %rbp + .byte 0xf3,0xc3 +.size ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont + +.type __ecp_nistz256_sqr_montq,@function +.align 32 +__ecp_nistz256_sqr_montq: + movq %rax,%r13 + mulq %r14 + movq %rax,%r9 + movq %r15,%rax + movq %rdx,%r10 + + mulq %r13 + addq %rax,%r10 + movq %r8,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq %r13 + addq %rax,%r11 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r12 + + + mulq %r14 + addq %rax,%r11 + movq %r8,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq %r14 + addq %rax,%r12 + movq %r8,%rax + adcq $0,%rdx + addq %rbp,%r12 + movq %rdx,%r13 + adcq $0,%r13 + + + mulq %r15 + xorq %r15,%r15 + addq %rax,%r13 + movq 0(%rsi),%rax + movq %rdx,%r14 + adcq $0,%r14 + + addq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq %r12,%r12 + adcq %r13,%r13 + adcq %r14,%r14 + adcq $0,%r15 + + mulq %rax + movq %rax,%r8 + movq 8(%rsi),%rax + movq %rdx,%rcx + + mulq %rax + addq %rcx,%r9 + adcq %rax,%r10 + movq 16(%rsi),%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq %rax + addq %rcx,%r11 + adcq %rax,%r12 + movq 24(%rsi),%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq %rax + addq %rcx,%r13 + adcq %rax,%r14 + movq %r8,%rax + adcq %rdx,%r15 + + movq .Lpoly+8(%rip),%rsi + movq .Lpoly+24(%rip),%rbp + + + + + movq %r8,%rcx + shlq $32,%r8 + mulq %rbp + shrq $32,%rcx + addq %r8,%r9 + adcq %rcx,%r10 + adcq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + + + + movq %r9,%rcx + shlq $32,%r9 + movq %rdx,%r8 + mulq %rbp + shrq $32,%rcx + addq %r9,%r10 + adcq %rcx,%r11 + adcq %rax,%r8 + movq %r10,%rax + adcq $0,%rdx + + + + movq %r10,%rcx + shlq $32,%r10 + movq %rdx,%r9 + mulq %rbp + shrq $32,%rcx + addq %r10,%r11 + adcq %rcx,%r8 + adcq %rax,%r9 + movq %r11,%rax + adcq $0,%rdx + + + + movq %r11,%rcx + shlq $32,%r11 + movq %rdx,%r10 + mulq %rbp + shrq $32,%rcx + addq %r11,%r8 + adcq %rcx,%r9 + adcq %rax,%r10 + adcq $0,%rdx + xorq %r11,%r11 + + + + addq %r8,%r12 + adcq %r9,%r13 + movq %r12,%r8 + adcq %r10,%r14 + adcq %rdx,%r15 + movq %r13,%r9 + adcq $0,%r11 + + subq $-1,%r12 + movq %r14,%r10 + sbbq %rsi,%r13 + sbbq $0,%r14 + movq %r15,%rcx + sbbq %rbp,%r15 + sbbq $0,%r11 + + cmovcq %r8,%r12 + cmovcq %r9,%r13 + movq %r12,0(%rdi) + cmovcq %r10,%r14 + movq %r13,8(%rdi) + cmovcq %rcx,%r15 + movq %r14,16(%rdi) + movq %r15,24(%rdi) + + .byte 0xf3,0xc3 +.size __ecp_nistz256_sqr_montq,.-__ecp_nistz256_sqr_montq + + + + + + +.globl ecp_nistz256_from_mont +.type ecp_nistz256_from_mont,@function +.align 32 +ecp_nistz256_from_mont: + pushq %r12 + pushq %r13 + + movq 0(%rsi),%rax + movq .Lpoly+24(%rip),%r13 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq %rax,%r8 + movq .Lpoly+8(%rip),%r12 + + + + movq %rax,%rcx + shlq $32,%r8 + mulq %r13 + shrq $32,%rcx + addq %r8,%r9 + adcq %rcx,%r10 + adcq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + + + + movq %r9,%rcx + shlq $32,%r9 + movq %rdx,%r8 + mulq %r13 + shrq $32,%rcx + addq %r9,%r10 + adcq %rcx,%r11 + adcq %rax,%r8 + movq %r10,%rax + adcq $0,%rdx + + + + movq %r10,%rcx + shlq $32,%r10 + movq %rdx,%r9 + mulq %r13 + shrq $32,%rcx + addq %r10,%r11 + adcq %rcx,%r8 + adcq %rax,%r9 + movq %r11,%rax + adcq $0,%rdx + + + + movq %r11,%rcx + shlq $32,%r11 + movq %rdx,%r10 + mulq %r13 + shrq $32,%rcx + addq %r11,%r8 + adcq %rcx,%r9 + movq %r8,%rcx + adcq %rax,%r10 + movq %r9,%rsi + adcq $0,%rdx + + + + subq $-1,%r8 + movq %r10,%rax + sbbq %r12,%r9 + sbbq $0,%r10 + movq %rdx,%r11 + sbbq %r13,%rdx + sbbq %r13,%r13 + + cmovnzq %rcx,%r8 + cmovnzq %rsi,%r9 + movq %r8,0(%rdi) + cmovnzq %rax,%r10 + movq %r9,8(%rdi) + cmovzq %rdx,%r11 + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + popq %r13 + popq %r12 + .byte 0xf3,0xc3 +.size ecp_nistz256_from_mont,.-ecp_nistz256_from_mont + + +.globl ecp_nistz256_select_w5 +.type ecp_nistz256_select_w5,@function +.align 32 +ecp_nistz256_select_w5: + movdqa .LOne(%rip),%xmm0 + movd %edx,%xmm1 + + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + + movdqa %xmm0,%xmm8 + pshufd $0,%xmm1,%xmm1 + + movq $16,%rax +.Lselect_loop_sse_w5: + + movdqa %xmm8,%xmm15 + paddd %xmm0,%xmm8 + pcmpeqd %xmm1,%xmm15 + + movdqa 0(%rsi),%xmm9 + movdqa 16(%rsi),%xmm10 + movdqa 32(%rsi),%xmm11 + movdqa 48(%rsi),%xmm12 + movdqa 64(%rsi),%xmm13 + movdqa 80(%rsi),%xmm14 + leaq 96(%rsi),%rsi + + pand %xmm15,%xmm9 + pand %xmm15,%xmm10 + por %xmm9,%xmm2 + pand %xmm15,%xmm11 + por %xmm10,%xmm3 + pand %xmm15,%xmm12 + por %xmm11,%xmm4 + pand %xmm15,%xmm13 + por %xmm12,%xmm5 + pand %xmm15,%xmm14 + por %xmm13,%xmm6 + por %xmm14,%xmm7 + + decq %rax + jnz .Lselect_loop_sse_w5 + + movdqu %xmm2,0(%rdi) + movdqu %xmm3,16(%rdi) + movdqu %xmm4,32(%rdi) + movdqu %xmm5,48(%rdi) + movdqu %xmm6,64(%rdi) + movdqu %xmm7,80(%rdi) + .byte 0xf3,0xc3 +.size ecp_nistz256_select_w5,.-ecp_nistz256_select_w5 + + + +.globl ecp_nistz256_select_w7 +.type ecp_nistz256_select_w7,@function +.align 32 +ecp_nistz256_select_w7: + movdqa .LOne(%rip),%xmm8 + movd %edx,%xmm1 + + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + + movdqa %xmm8,%xmm0 + pshufd $0,%xmm1,%xmm1 + movq $64,%rax + +.Lselect_loop_sse_w7: + movdqa %xmm8,%xmm15 + paddd %xmm0,%xmm8 + movdqa 0(%rsi),%xmm9 + movdqa 16(%rsi),%xmm10 + pcmpeqd %xmm1,%xmm15 + movdqa 32(%rsi),%xmm11 + movdqa 48(%rsi),%xmm12 + leaq 64(%rsi),%rsi + + pand %xmm15,%xmm9 + pand %xmm15,%xmm10 + por %xmm9,%xmm2 + pand %xmm15,%xmm11 + por %xmm10,%xmm3 + pand %xmm15,%xmm12 + por %xmm11,%xmm4 + prefetcht0 255(%rsi) + por %xmm12,%xmm5 + + decq %rax + jnz .Lselect_loop_sse_w7 + + movdqu %xmm2,0(%rdi) + movdqu %xmm3,16(%rdi) + movdqu %xmm4,32(%rdi) + movdqu %xmm5,48(%rdi) + .byte 0xf3,0xc3 +.size ecp_nistz256_select_w7,.-ecp_nistz256_select_w7 +.globl ecp_nistz256_avx2_select_w7 +.type ecp_nistz256_avx2_select_w7,@function +.align 32 +ecp_nistz256_avx2_select_w7: +.byte 0x0f,0x0b + .byte 0xf3,0xc3 +.size ecp_nistz256_avx2_select_w7,.-ecp_nistz256_avx2_select_w7 +.type __ecp_nistz256_add_toq,@function +.align 32 +__ecp_nistz256_add_toq: + addq 0(%rbx),%r12 + adcq 8(%rbx),%r13 + movq %r12,%rax + adcq 16(%rbx),%r8 + adcq 24(%rbx),%r9 + movq %r13,%rbp + sbbq %r11,%r11 + + subq $-1,%r12 + movq %r8,%rcx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%r10 + sbbq %r15,%r9 + testq %r11,%r11 + + cmovzq %rax,%r12 + cmovzq %rbp,%r13 + movq %r12,0(%rdi) + cmovzq %rcx,%r8 + movq %r13,8(%rdi) + cmovzq %r10,%r9 + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + .byte 0xf3,0xc3 +.size __ecp_nistz256_add_toq,.-__ecp_nistz256_add_toq + +.type __ecp_nistz256_sub_fromq,@function +.align 32 +__ecp_nistz256_sub_fromq: + subq 0(%rbx),%r12 + sbbq 8(%rbx),%r13 + movq %r12,%rax + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + movq %r13,%rbp + sbbq %r11,%r11 + + addq $-1,%r12 + movq %r8,%rcx + adcq %r14,%r13 + adcq $0,%r8 + movq %r9,%r10 + adcq %r15,%r9 + testq %r11,%r11 + + cmovzq %rax,%r12 + cmovzq %rbp,%r13 + movq %r12,0(%rdi) + cmovzq %rcx,%r8 + movq %r13,8(%rdi) + cmovzq %r10,%r9 + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + .byte 0xf3,0xc3 +.size __ecp_nistz256_sub_fromq,.-__ecp_nistz256_sub_fromq + +.type __ecp_nistz256_subq,@function +.align 32 +__ecp_nistz256_subq: + subq %r12,%rax + sbbq %r13,%rbp + movq %rax,%r12 + sbbq %r8,%rcx + sbbq %r9,%r10 + movq %rbp,%r13 + sbbq %r11,%r11 + + addq $-1,%rax + movq %rcx,%r8 + adcq %r14,%rbp + adcq $0,%rcx + movq %r10,%r9 + adcq %r15,%r10 + testq %r11,%r11 + + cmovnzq %rax,%r12 + cmovnzq %rbp,%r13 + cmovnzq %rcx,%r8 + cmovnzq %r10,%r9 + + .byte 0xf3,0xc3 +.size __ecp_nistz256_subq,.-__ecp_nistz256_subq + +.type __ecp_nistz256_mul_by_2q,@function +.align 32 +__ecp_nistz256_mul_by_2q: + addq %r12,%r12 + adcq %r13,%r13 + movq %r12,%rax + adcq %r8,%r8 + adcq %r9,%r9 + movq %r13,%rbp + sbbq %r11,%r11 + + subq $-1,%r12 + movq %r8,%rcx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%r10 + sbbq %r15,%r9 + testq %r11,%r11 + + cmovzq %rax,%r12 + cmovzq %rbp,%r13 + movq %r12,0(%rdi) + cmovzq %rcx,%r8 + movq %r13,8(%rdi) + cmovzq %r10,%r9 + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + .byte 0xf3,0xc3 +.size __ecp_nistz256_mul_by_2q,.-__ecp_nistz256_mul_by_2q +.globl ecp_nistz256_point_double +.type ecp_nistz256_point_double,@function +.align 32 +ecp_nistz256_point_double: + pushq %rbp + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $160+8,%rsp + + movdqu 0(%rsi),%xmm0 + movq %rsi,%rbx + movdqu 16(%rsi),%xmm1 + movq 32+0(%rsi),%r12 + movq 32+8(%rsi),%r13 + movq 32+16(%rsi),%r8 + movq 32+24(%rsi),%r9 + movq .Lpoly+8(%rip),%r14 + movq .Lpoly+24(%rip),%r15 + movdqa %xmm0,96(%rsp) + movdqa %xmm1,96+16(%rsp) + leaq 32(%rdi),%r10 + leaq 64(%rdi),%r11 +.byte 102,72,15,110,199 +.byte 102,73,15,110,202 +.byte 102,73,15,110,211 + + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_by_2q + + movq 64+0(%rsi),%rax + movq 64+8(%rsi),%r14 + movq 64+16(%rsi),%r15 + movq 64+24(%rsi),%r8 + leaq 64-0(%rsi),%rsi + leaq 64(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + movq 0+0(%rsp),%rax + movq 8+0(%rsp),%r14 + leaq 0+0(%rsp),%rsi + movq 16+0(%rsp),%r15 + movq 24+0(%rsp),%r8 + leaq 0(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + movq 32(%rbx),%rax + movq 64+0(%rbx),%r9 + movq 64+8(%rbx),%r10 + movq 64+16(%rbx),%r11 + movq 64+24(%rbx),%r12 + leaq 64-0(%rbx),%rsi + leaq 32(%rbx),%rbx +.byte 102,72,15,126,215 + call __ecp_nistz256_mul_montq + call __ecp_nistz256_mul_by_2q + + movq 96+0(%rsp),%r12 + movq 96+8(%rsp),%r13 + leaq 64(%rsp),%rbx + movq 96+16(%rsp),%r8 + movq 96+24(%rsp),%r9 + leaq 32(%rsp),%rdi + call __ecp_nistz256_add_toq + + movq 96+0(%rsp),%r12 + movq 96+8(%rsp),%r13 + leaq 64(%rsp),%rbx + movq 96+16(%rsp),%r8 + movq 96+24(%rsp),%r9 + leaq 64(%rsp),%rdi + call __ecp_nistz256_sub_fromq + + movq 0+0(%rsp),%rax + movq 8+0(%rsp),%r14 + leaq 0+0(%rsp),%rsi + movq 16+0(%rsp),%r15 + movq 24+0(%rsp),%r8 +.byte 102,72,15,126,207 + call __ecp_nistz256_sqr_montq + xorq %r9,%r9 + movq %r12,%rax + addq $-1,%r12 + movq %r13,%r10 + adcq %rsi,%r13 + movq %r14,%rcx + adcq $0,%r14 + movq %r15,%r8 + adcq %rbp,%r15 + adcq $0,%r9 + xorq %rsi,%rsi + testq $1,%rax + + cmovzq %rax,%r12 + cmovzq %r10,%r13 + cmovzq %rcx,%r14 + cmovzq %r8,%r15 + cmovzq %rsi,%r9 + + movq %r13,%rax + shrq $1,%r12 + shlq $63,%rax + movq %r14,%r10 + shrq $1,%r13 + orq %rax,%r12 + shlq $63,%r10 + movq %r15,%rcx + shrq $1,%r14 + orq %r10,%r13 + shlq $63,%rcx + movq %r12,0(%rdi) + shrq $1,%r15 + movq %r13,8(%rdi) + shlq $63,%r9 + orq %rcx,%r14 + orq %r9,%r15 + movq %r14,16(%rdi) + movq %r15,24(%rdi) + movq 64(%rsp),%rax + leaq 64(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq 0+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 32(%rsp),%rdi + call __ecp_nistz256_mul_montq + + leaq 128(%rsp),%rdi + call __ecp_nistz256_mul_by_2q + + leaq 32(%rsp),%rbx + leaq 32(%rsp),%rdi + call __ecp_nistz256_add_toq + + movq 96(%rsp),%rax + leaq 96(%rsp),%rbx + movq 0+0(%rsp),%r9 + movq 8+0(%rsp),%r10 + leaq 0+0(%rsp),%rsi + movq 16+0(%rsp),%r11 + movq 24+0(%rsp),%r12 + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_montq + + leaq 128(%rsp),%rdi + call __ecp_nistz256_mul_by_2q + + movq 0+32(%rsp),%rax + movq 8+32(%rsp),%r14 + leaq 0+32(%rsp),%rsi + movq 16+32(%rsp),%r15 + movq 24+32(%rsp),%r8 +.byte 102,72,15,126,199 + call __ecp_nistz256_sqr_montq + + leaq 128(%rsp),%rbx + movq %r14,%r8 + movq %r15,%r9 + movq %rsi,%r14 + movq %rbp,%r15 + call __ecp_nistz256_sub_fromq + + movq 0+0(%rsp),%rax + movq 0+8(%rsp),%rbp + movq 0+16(%rsp),%rcx + movq 0+24(%rsp),%r10 + leaq 0(%rsp),%rdi + call __ecp_nistz256_subq + + movq 32(%rsp),%rax + leaq 32(%rsp),%rbx + movq %r12,%r14 + xorl %ecx,%ecx + movq %r12,0+0(%rsp) + movq %r13,%r10 + movq %r13,0+8(%rsp) + cmovzq %r8,%r11 + movq %r8,0+16(%rsp) + leaq 0-0(%rsp),%rsi + cmovzq %r9,%r12 + movq %r9,0+24(%rsp) + movq %r14,%r9 + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_montq + +.byte 102,72,15,126,203 +.byte 102,72,15,126,207 + call __ecp_nistz256_sub_fromq + + addq $160+8,%rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + popq %rbp + .byte 0xf3,0xc3 +.size ecp_nistz256_point_double,.-ecp_nistz256_point_double +.globl ecp_nistz256_point_add +.type ecp_nistz256_point_add,@function +.align 32 +ecp_nistz256_point_add: + pushq %rbp + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $576+8,%rsp + + movdqu 0(%rsi),%xmm0 + movdqu 16(%rsi),%xmm1 + movdqu 32(%rsi),%xmm2 + movdqu 48(%rsi),%xmm3 + movdqu 64(%rsi),%xmm4 + movdqu 80(%rsi),%xmm5 + movq %rsi,%rbx + movq %rdx,%rsi + movdqa %xmm0,384(%rsp) + movdqa %xmm1,384+16(%rsp) + por %xmm0,%xmm1 + movdqa %xmm2,416(%rsp) + movdqa %xmm3,416+16(%rsp) + por %xmm2,%xmm3 + movdqa %xmm4,448(%rsp) + movdqa %xmm5,448+16(%rsp) + por %xmm1,%xmm3 + + movdqu 0(%rsi),%xmm0 + pshufd $177,%xmm3,%xmm5 + movdqu 16(%rsi),%xmm1 + movdqu 32(%rsi),%xmm2 + por %xmm3,%xmm5 + movdqu 48(%rsi),%xmm3 + movq 64+0(%rsi),%rax + movq 64+8(%rsi),%r14 + movq 64+16(%rsi),%r15 + movq 64+24(%rsi),%r8 + movdqa %xmm0,480(%rsp) + pshufd $30,%xmm5,%xmm4 + movdqa %xmm1,480+16(%rsp) + por %xmm0,%xmm1 +.byte 102,72,15,110,199 + movdqa %xmm2,512(%rsp) + movdqa %xmm3,512+16(%rsp) + por %xmm2,%xmm3 + por %xmm4,%xmm5 + pxor %xmm4,%xmm4 + por %xmm1,%xmm3 + + leaq 64-0(%rsi),%rsi + movq %rax,544+0(%rsp) + movq %r14,544+8(%rsp) + movq %r15,544+16(%rsp) + movq %r8,544+24(%rsp) + leaq 96(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + pcmpeqd %xmm4,%xmm5 + pshufd $177,%xmm3,%xmm4 + por %xmm3,%xmm4 + pshufd $0,%xmm5,%xmm5 + pshufd $30,%xmm4,%xmm3 + por %xmm3,%xmm4 + pxor %xmm3,%xmm3 + pcmpeqd %xmm3,%xmm4 + pshufd $0,%xmm4,%xmm4 + movq 64+0(%rbx),%rax + movq 64+8(%rbx),%r14 + movq 64+16(%rbx),%r15 + movq 64+24(%rbx),%r8 + + leaq 64-0(%rbx),%rsi + leaq 32(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + movq 544(%rsp),%rax + leaq 544(%rsp),%rbx + movq 0+96(%rsp),%r9 + movq 8+96(%rsp),%r10 + leaq 0+96(%rsp),%rsi + movq 16+96(%rsp),%r11 + movq 24+96(%rsp),%r12 + leaq 224(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 448(%rsp),%rax + leaq 448(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq 0+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 256(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 416(%rsp),%rax + leaq 416(%rsp),%rbx + movq 0+224(%rsp),%r9 + movq 8+224(%rsp),%r10 + leaq 0+224(%rsp),%rsi + movq 16+224(%rsp),%r11 + movq 24+224(%rsp),%r12 + leaq 224(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 512(%rsp),%rax + leaq 512(%rsp),%rbx + movq 0+256(%rsp),%r9 + movq 8+256(%rsp),%r10 + leaq 0+256(%rsp),%rsi + movq 16+256(%rsp),%r11 + movq 24+256(%rsp),%r12 + leaq 256(%rsp),%rdi + call __ecp_nistz256_mul_montq + + leaq 224(%rsp),%rbx + leaq 64(%rsp),%rdi + call __ecp_nistz256_sub_fromq + + orq %r13,%r12 + movdqa %xmm4,%xmm2 + orq %r8,%r12 + orq %r9,%r12 + por %xmm5,%xmm2 +.byte 102,73,15,110,220 + + movq 384(%rsp),%rax + leaq 384(%rsp),%rbx + movq 0+96(%rsp),%r9 + movq 8+96(%rsp),%r10 + leaq 0+96(%rsp),%rsi + movq 16+96(%rsp),%r11 + movq 24+96(%rsp),%r12 + leaq 160(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 480(%rsp),%rax + leaq 480(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq 0+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 192(%rsp),%rdi + call __ecp_nistz256_mul_montq + + leaq 160(%rsp),%rbx + leaq 0(%rsp),%rdi + call __ecp_nistz256_sub_fromq + + orq %r13,%r12 + orq %r8,%r12 + orq %r9,%r12 + +.byte 0x3e + jnz .Ladd_proceedq +.byte 102,73,15,126,208 +.byte 102,73,15,126,217 + testq %r8,%r8 + jnz .Ladd_proceedq + testq %r9,%r9 + jz .Ladd_proceedq + +.byte 102,72,15,126,199 + pxor %xmm0,%xmm0 + movdqu %xmm0,0(%rdi) + movdqu %xmm0,16(%rdi) + movdqu %xmm0,32(%rdi) + movdqu %xmm0,48(%rdi) + movdqu %xmm0,64(%rdi) + movdqu %xmm0,80(%rdi) + jmp .Ladd_doneq + +.align 32 +.Ladd_proceedq: + movq 0+64(%rsp),%rax + movq 8+64(%rsp),%r14 + leaq 0+64(%rsp),%rsi + movq 16+64(%rsp),%r15 + movq 24+64(%rsp),%r8 + leaq 96(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + movq 448(%rsp),%rax + leaq 448(%rsp),%rbx + movq 0+0(%rsp),%r9 + movq 8+0(%rsp),%r10 + leaq 0+0(%rsp),%rsi + movq 16+0(%rsp),%r11 + movq 24+0(%rsp),%r12 + leaq 352(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 0+0(%rsp),%rax + movq 8+0(%rsp),%r14 + leaq 0+0(%rsp),%rsi + movq 16+0(%rsp),%r15 + movq 24+0(%rsp),%r8 + leaq 32(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + movq 544(%rsp),%rax + leaq 544(%rsp),%rbx + movq 0+352(%rsp),%r9 + movq 8+352(%rsp),%r10 + leaq 0+352(%rsp),%rsi + movq 16+352(%rsp),%r11 + movq 24+352(%rsp),%r12 + leaq 352(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 0(%rsp),%rax + leaq 0(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq 0+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 128(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 160(%rsp),%rax + leaq 160(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq 0+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 192(%rsp),%rdi + call __ecp_nistz256_mul_montq + + + + + addq %r12,%r12 + leaq 96(%rsp),%rsi + adcq %r13,%r13 + movq %r12,%rax + adcq %r8,%r8 + adcq %r9,%r9 + movq %r13,%rbp + sbbq %r11,%r11 + + subq $-1,%r12 + movq %r8,%rcx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%r10 + sbbq %r15,%r9 + testq %r11,%r11 + + cmovzq %rax,%r12 + movq 0(%rsi),%rax + cmovzq %rbp,%r13 + movq 8(%rsi),%rbp + cmovzq %rcx,%r8 + movq 16(%rsi),%rcx + cmovzq %r10,%r9 + movq 24(%rsi),%r10 + + call __ecp_nistz256_subq + + leaq 128(%rsp),%rbx + leaq 288(%rsp),%rdi + call __ecp_nistz256_sub_fromq + + movq 192+0(%rsp),%rax + movq 192+8(%rsp),%rbp + movq 192+16(%rsp),%rcx + movq 192+24(%rsp),%r10 + leaq 320(%rsp),%rdi + + call __ecp_nistz256_subq + + movq %r12,0(%rdi) + movq %r13,8(%rdi) + movq %r8,16(%rdi) + movq %r9,24(%rdi) + movq 128(%rsp),%rax + leaq 128(%rsp),%rbx + movq 0+224(%rsp),%r9 + movq 8+224(%rsp),%r10 + leaq 0+224(%rsp),%rsi + movq 16+224(%rsp),%r11 + movq 24+224(%rsp),%r12 + leaq 256(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 320(%rsp),%rax + leaq 320(%rsp),%rbx + movq 0+64(%rsp),%r9 + movq 8+64(%rsp),%r10 + leaq 0+64(%rsp),%rsi + movq 16+64(%rsp),%r11 + movq 24+64(%rsp),%r12 + leaq 320(%rsp),%rdi + call __ecp_nistz256_mul_montq + + leaq 256(%rsp),%rbx + leaq 320(%rsp),%rdi + call __ecp_nistz256_sub_fromq + +.byte 102,72,15,126,199 + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 352(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 352+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 544(%rsp),%xmm2 + pand 544+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 448(%rsp),%xmm2 + pand 448+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,64(%rdi) + movdqu %xmm3,80(%rdi) + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 288(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 288+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 480(%rsp),%xmm2 + pand 480+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 384(%rsp),%xmm2 + pand 384+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,0(%rdi) + movdqu %xmm3,16(%rdi) + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 320(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 320+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 512(%rsp),%xmm2 + pand 512+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 416(%rsp),%xmm2 + pand 416+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,32(%rdi) + movdqu %xmm3,48(%rdi) + +.Ladd_doneq: + addq $576+8,%rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + popq %rbp + .byte 0xf3,0xc3 +.size ecp_nistz256_point_add,.-ecp_nistz256_point_add +.globl ecp_nistz256_point_add_affine +.type ecp_nistz256_point_add_affine,@function +.align 32 +ecp_nistz256_point_add_affine: + pushq %rbp + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $480+8,%rsp + + movdqu 0(%rsi),%xmm0 + movq %rdx,%rbx + movdqu 16(%rsi),%xmm1 + movdqu 32(%rsi),%xmm2 + movdqu 48(%rsi),%xmm3 + movdqu 64(%rsi),%xmm4 + movdqu 80(%rsi),%xmm5 + movq 64+0(%rsi),%rax + movq 64+8(%rsi),%r14 + movq 64+16(%rsi),%r15 + movq 64+24(%rsi),%r8 + movdqa %xmm0,320(%rsp) + movdqa %xmm1,320+16(%rsp) + por %xmm0,%xmm1 + movdqa %xmm2,352(%rsp) + movdqa %xmm3,352+16(%rsp) + por %xmm2,%xmm3 + movdqa %xmm4,384(%rsp) + movdqa %xmm5,384+16(%rsp) + por %xmm1,%xmm3 + + movdqu 0(%rbx),%xmm0 + pshufd $177,%xmm3,%xmm5 + movdqu 16(%rbx),%xmm1 + movdqu 32(%rbx),%xmm2 + por %xmm3,%xmm5 + movdqu 48(%rbx),%xmm3 + movdqa %xmm0,416(%rsp) + pshufd $30,%xmm5,%xmm4 + movdqa %xmm1,416+16(%rsp) + por %xmm0,%xmm1 +.byte 102,72,15,110,199 + movdqa %xmm2,448(%rsp) + movdqa %xmm3,448+16(%rsp) + por %xmm2,%xmm3 + por %xmm4,%xmm5 + pxor %xmm4,%xmm4 + por %xmm1,%xmm3 + + leaq 64-0(%rsi),%rsi + leaq 32(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + pcmpeqd %xmm4,%xmm5 + pshufd $177,%xmm3,%xmm4 + movq 0(%rbx),%rax + + movq %r12,%r9 + por %xmm3,%xmm4 + pshufd $0,%xmm5,%xmm5 + pshufd $30,%xmm4,%xmm3 + movq %r13,%r10 + por %xmm3,%xmm4 + pxor %xmm3,%xmm3 + movq %r14,%r11 + pcmpeqd %xmm3,%xmm4 + pshufd $0,%xmm4,%xmm4 + + leaq 32-0(%rsp),%rsi + movq %r15,%r12 + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_montq + + leaq 320(%rsp),%rbx + leaq 64(%rsp),%rdi + call __ecp_nistz256_sub_fromq + + movq 384(%rsp),%rax + leaq 384(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq 0+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 32(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 384(%rsp),%rax + leaq 384(%rsp),%rbx + movq 0+64(%rsp),%r9 + movq 8+64(%rsp),%r10 + leaq 0+64(%rsp),%rsi + movq 16+64(%rsp),%r11 + movq 24+64(%rsp),%r12 + leaq 288(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 448(%rsp),%rax + leaq 448(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq 0+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 32(%rsp),%rdi + call __ecp_nistz256_mul_montq + + leaq 352(%rsp),%rbx + leaq 96(%rsp),%rdi + call __ecp_nistz256_sub_fromq + + movq 0+64(%rsp),%rax + movq 8+64(%rsp),%r14 + leaq 0+64(%rsp),%rsi + movq 16+64(%rsp),%r15 + movq 24+64(%rsp),%r8 + leaq 128(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + movq 0+96(%rsp),%rax + movq 8+96(%rsp),%r14 + leaq 0+96(%rsp),%rsi + movq 16+96(%rsp),%r15 + movq 24+96(%rsp),%r8 + leaq 192(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + movq 128(%rsp),%rax + leaq 128(%rsp),%rbx + movq 0+64(%rsp),%r9 + movq 8+64(%rsp),%r10 + leaq 0+64(%rsp),%rsi + movq 16+64(%rsp),%r11 + movq 24+64(%rsp),%r12 + leaq 160(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 320(%rsp),%rax + leaq 320(%rsp),%rbx + movq 0+128(%rsp),%r9 + movq 8+128(%rsp),%r10 + leaq 0+128(%rsp),%rsi + movq 16+128(%rsp),%r11 + movq 24+128(%rsp),%r12 + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_montq + + + + + addq %r12,%r12 + leaq 192(%rsp),%rsi + adcq %r13,%r13 + movq %r12,%rax + adcq %r8,%r8 + adcq %r9,%r9 + movq %r13,%rbp + sbbq %r11,%r11 + + subq $-1,%r12 + movq %r8,%rcx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%r10 + sbbq %r15,%r9 + testq %r11,%r11 + + cmovzq %rax,%r12 + movq 0(%rsi),%rax + cmovzq %rbp,%r13 + movq 8(%rsi),%rbp + cmovzq %rcx,%r8 + movq 16(%rsi),%rcx + cmovzq %r10,%r9 + movq 24(%rsi),%r10 + + call __ecp_nistz256_subq + + leaq 160(%rsp),%rbx + leaq 224(%rsp),%rdi + call __ecp_nistz256_sub_fromq + + movq 0+0(%rsp),%rax + movq 0+8(%rsp),%rbp + movq 0+16(%rsp),%rcx + movq 0+24(%rsp),%r10 + leaq 64(%rsp),%rdi + + call __ecp_nistz256_subq + + movq %r12,0(%rdi) + movq %r13,8(%rdi) + movq %r8,16(%rdi) + movq %r9,24(%rdi) + movq 352(%rsp),%rax + leaq 352(%rsp),%rbx + movq 0+160(%rsp),%r9 + movq 8+160(%rsp),%r10 + leaq 0+160(%rsp),%rsi + movq 16+160(%rsp),%r11 + movq 24+160(%rsp),%r12 + leaq 32(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 96(%rsp),%rax + leaq 96(%rsp),%rbx + movq 0+64(%rsp),%r9 + movq 8+64(%rsp),%r10 + leaq 0+64(%rsp),%rsi + movq 16+64(%rsp),%r11 + movq 24+64(%rsp),%r12 + leaq 64(%rsp),%rdi + call __ecp_nistz256_mul_montq + + leaq 32(%rsp),%rbx + leaq 256(%rsp),%rdi + call __ecp_nistz256_sub_fromq + +.byte 102,72,15,126,199 + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 288(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 288+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand .LONE_mont(%rip),%xmm2 + pand .LONE_mont+16(%rip),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 384(%rsp),%xmm2 + pand 384+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,64(%rdi) + movdqu %xmm3,80(%rdi) + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 224(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 224+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 416(%rsp),%xmm2 + pand 416+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 320(%rsp),%xmm2 + pand 320+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,0(%rdi) + movdqu %xmm3,16(%rdi) + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 256(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 256+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 448(%rsp),%xmm2 + pand 448+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 352(%rsp),%xmm2 + pand 352+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,32(%rdi) + movdqu %xmm3,48(%rdi) + + addq $480+8,%rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + popq %rbp + .byte 0xf3,0xc3 +.size ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine diff --git a/secure/lib/libcrypto/amd64/ghash-x86_64.S b/secure/lib/libcrypto/amd64/ghash-x86_64.S new file mode 100644 index 0000000..aa93c80 --- /dev/null +++ b/secure/lib/libcrypto/amd64/ghash-x86_64.S @@ -0,0 +1,1319 @@ + # $FreeBSD$ +.text + + +.globl gcm_gmult_4bit +.type gcm_gmult_4bit,@function +.align 16 +gcm_gmult_4bit: + pushq %rbx + pushq %rbp + pushq %r12 +.Lgmult_prologue: + + movzbq 15(%rdi),%r8 + leaq .Lrem_4bit(%rip),%r11 + xorq %rax,%rax + xorq %rbx,%rbx + movb %r8b,%al + movb %r8b,%bl + shlb $4,%al + movq $14,%rcx + movq 8(%rsi,%rax,1),%r8 + movq (%rsi,%rax,1),%r9 + andb $240,%bl + movq %r8,%rdx + jmp .Loop1 + +.align 16 +.Loop1: + shrq $4,%r8 + andq $15,%rdx + movq %r9,%r10 + movb (%rdi,%rcx,1),%al + shrq $4,%r9 + xorq 8(%rsi,%rbx,1),%r8 + shlq $60,%r10 + xorq (%rsi,%rbx,1),%r9 + movb %al,%bl + xorq (%r11,%rdx,8),%r9 + movq %r8,%rdx + shlb $4,%al + xorq %r10,%r8 + decq %rcx + js .Lbreak1 + + shrq $4,%r8 + andq $15,%rdx + movq %r9,%r10 + shrq $4,%r9 + xorq 8(%rsi,%rax,1),%r8 + shlq $60,%r10 + xorq (%rsi,%rax,1),%r9 + andb $240,%bl + xorq (%r11,%rdx,8),%r9 + movq %r8,%rdx + xorq %r10,%r8 + jmp .Loop1 + +.align 16 +.Lbreak1: + shrq $4,%r8 + andq $15,%rdx + movq %r9,%r10 + shrq $4,%r9 + xorq 8(%rsi,%rax,1),%r8 + shlq $60,%r10 + xorq (%rsi,%rax,1),%r9 + andb $240,%bl + xorq (%r11,%rdx,8),%r9 + movq %r8,%rdx + xorq %r10,%r8 + + shrq $4,%r8 + andq $15,%rdx + movq %r9,%r10 + shrq $4,%r9 + xorq 8(%rsi,%rbx,1),%r8 + shlq $60,%r10 + xorq (%rsi,%rbx,1),%r9 + xorq %r10,%r8 + xorq (%r11,%rdx,8),%r9 + + bswapq %r8 + bswapq %r9 + movq %r8,8(%rdi) + movq %r9,(%rdi) + + movq 16(%rsp),%rbx + leaq 24(%rsp),%rsp +.Lgmult_epilogue: + .byte 0xf3,0xc3 +.size gcm_gmult_4bit,.-gcm_gmult_4bit +.globl gcm_ghash_4bit +.type gcm_ghash_4bit,@function +.align 16 +gcm_ghash_4bit: + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $280,%rsp +.Lghash_prologue: + movq %rdx,%r14 + movq %rcx,%r15 + subq $-128,%rsi + leaq 16+128(%rsp),%rbp + xorl %edx,%edx + movq 0+0-128(%rsi),%r8 + movq 0+8-128(%rsi),%rax + movb %al,%dl + shrq $4,%rax + movq %r8,%r10 + shrq $4,%r8 + movq 16+0-128(%rsi),%r9 + shlb $4,%dl + movq 16+8-128(%rsi),%rbx + shlq $60,%r10 + movb %dl,0(%rsp) + orq %r10,%rax + movb %bl,%dl + shrq $4,%rbx + movq %r9,%r10 + shrq $4,%r9 + movq %r8,0(%rbp) + movq 32+0-128(%rsi),%r8 + shlb $4,%dl + movq %rax,0-128(%rbp) + movq 32+8-128(%rsi),%rax + shlq $60,%r10 + movb %dl,1(%rsp) + orq %r10,%rbx + movb %al,%dl + shrq $4,%rax + movq %r8,%r10 + shrq $4,%r8 + movq %r9,8(%rbp) + movq 48+0-128(%rsi),%r9 + shlb $4,%dl + movq %rbx,8-128(%rbp) + movq 48+8-128(%rsi),%rbx + shlq $60,%r10 + movb %dl,2(%rsp) + orq %r10,%rax + movb %bl,%dl + shrq $4,%rbx + movq %r9,%r10 + shrq $4,%r9 + movq %r8,16(%rbp) + movq 64+0-128(%rsi),%r8 + shlb $4,%dl + movq %rax,16-128(%rbp) + movq 64+8-128(%rsi),%rax + shlq $60,%r10 + movb %dl,3(%rsp) + orq %r10,%rbx + movb %al,%dl + shrq $4,%rax + movq %r8,%r10 + shrq $4,%r8 + movq %r9,24(%rbp) + movq 80+0-128(%rsi),%r9 + shlb $4,%dl + movq %rbx,24-128(%rbp) + movq 80+8-128(%rsi),%rbx + shlq $60,%r10 + movb %dl,4(%rsp) + orq %r10,%rax + movb %bl,%dl + shrq $4,%rbx + movq %r9,%r10 + shrq $4,%r9 + movq %r8,32(%rbp) + movq 96+0-128(%rsi),%r8 + shlb $4,%dl + movq %rax,32-128(%rbp) + movq 96+8-128(%rsi),%rax + shlq $60,%r10 + movb %dl,5(%rsp) + orq %r10,%rbx + movb %al,%dl + shrq $4,%rax + movq %r8,%r10 + shrq $4,%r8 + movq %r9,40(%rbp) + movq 112+0-128(%rsi),%r9 + shlb $4,%dl + movq %rbx,40-128(%rbp) + movq 112+8-128(%rsi),%rbx + shlq $60,%r10 + movb %dl,6(%rsp) + orq %r10,%rax + movb %bl,%dl + shrq $4,%rbx + movq %r9,%r10 + shrq $4,%r9 + movq %r8,48(%rbp) + movq 128+0-128(%rsi),%r8 + shlb $4,%dl + movq %rax,48-128(%rbp) + movq 128+8-128(%rsi),%rax + shlq $60,%r10 + movb %dl,7(%rsp) + orq %r10,%rbx + movb %al,%dl + shrq $4,%rax + movq %r8,%r10 + shrq $4,%r8 + movq %r9,56(%rbp) + movq 144+0-128(%rsi),%r9 + shlb $4,%dl + movq %rbx,56-128(%rbp) + movq 144+8-128(%rsi),%rbx + shlq $60,%r10 + movb %dl,8(%rsp) + orq %r10,%rax + movb %bl,%dl + shrq $4,%rbx + movq %r9,%r10 + shrq $4,%r9 + movq %r8,64(%rbp) + movq 160+0-128(%rsi),%r8 + shlb $4,%dl + movq %rax,64-128(%rbp) + movq 160+8-128(%rsi),%rax + shlq $60,%r10 + movb %dl,9(%rsp) + orq %r10,%rbx + movb %al,%dl + shrq $4,%rax + movq %r8,%r10 + shrq $4,%r8 + movq %r9,72(%rbp) + movq 176+0-128(%rsi),%r9 + shlb $4,%dl + movq %rbx,72-128(%rbp) + movq 176+8-128(%rsi),%rbx + shlq $60,%r10 + movb %dl,10(%rsp) + orq %r10,%rax + movb %bl,%dl + shrq $4,%rbx + movq %r9,%r10 + shrq $4,%r9 + movq %r8,80(%rbp) + movq 192+0-128(%rsi),%r8 + shlb $4,%dl + movq %rax,80-128(%rbp) + movq 192+8-128(%rsi),%rax + shlq $60,%r10 + movb %dl,11(%rsp) + orq %r10,%rbx + movb %al,%dl + shrq $4,%rax + movq %r8,%r10 + shrq $4,%r8 + movq %r9,88(%rbp) + movq 208+0-128(%rsi),%r9 + shlb $4,%dl + movq %rbx,88-128(%rbp) + movq 208+8-128(%rsi),%rbx + shlq $60,%r10 + movb %dl,12(%rsp) + orq %r10,%rax + movb %bl,%dl + shrq $4,%rbx + movq %r9,%r10 + shrq $4,%r9 + movq %r8,96(%rbp) + movq 224+0-128(%rsi),%r8 + shlb $4,%dl + movq %rax,96-128(%rbp) + movq 224+8-128(%rsi),%rax + shlq $60,%r10 + movb %dl,13(%rsp) + orq %r10,%rbx + movb %al,%dl + shrq $4,%rax + movq %r8,%r10 + shrq $4,%r8 + movq %r9,104(%rbp) + movq 240+0-128(%rsi),%r9 + shlb $4,%dl + movq %rbx,104-128(%rbp) + movq 240+8-128(%rsi),%rbx + shlq $60,%r10 + movb %dl,14(%rsp) + orq %r10,%rax + movb %bl,%dl + shrq $4,%rbx + movq %r9,%r10 + shrq $4,%r9 + movq %r8,112(%rbp) + shlb $4,%dl + movq %rax,112-128(%rbp) + shlq $60,%r10 + movb %dl,15(%rsp) + orq %r10,%rbx + movq %r9,120(%rbp) + movq %rbx,120-128(%rbp) + addq $-128,%rsi + movq 8(%rdi),%r8 + movq 0(%rdi),%r9 + addq %r14,%r15 + leaq .Lrem_8bit(%rip),%r11 + jmp .Louter_loop +.align 16 +.Louter_loop: + xorq (%r14),%r9 + movq 8(%r14),%rdx + leaq 16(%r14),%r14 + xorq %r8,%rdx + movq %r9,(%rdi) + movq %rdx,8(%rdi) + shrq $32,%rdx + xorq %rax,%rax + roll $8,%edx + movb %dl,%al + movzbl %dl,%ebx + shlb $4,%al + shrl $4,%ebx + roll $8,%edx + movq 8(%rsi,%rax,1),%r8 + movq (%rsi,%rax,1),%r9 + movb %dl,%al + movzbl %dl,%ecx + shlb $4,%al + movzbq (%rsp,%rbx,1),%r12 + shrl $4,%ecx + xorq %r8,%r12 + movq %r9,%r10 + shrq $8,%r8 + movzbq %r12b,%r12 + shrq $8,%r9 + xorq -128(%rbp,%rbx,8),%r8 + shlq $56,%r10 + xorq (%rbp,%rbx,8),%r9 + roll $8,%edx + xorq 8(%rsi,%rax,1),%r8 + xorq (%rsi,%rax,1),%r9 + movb %dl,%al + xorq %r10,%r8 + movzwq (%r11,%r12,2),%r12 + movzbl %dl,%ebx + shlb $4,%al + movzbq (%rsp,%rcx,1),%r13 + shrl $4,%ebx + shlq $48,%r12 + xorq %r8,%r13 + movq %r9,%r10 + xorq %r12,%r9 + shrq $8,%r8 + movzbq %r13b,%r13 + shrq $8,%r9 + xorq -128(%rbp,%rcx,8),%r8 + shlq $56,%r10 + xorq (%rbp,%rcx,8),%r9 + roll $8,%edx + xorq 8(%rsi,%rax,1),%r8 + xorq (%rsi,%rax,1),%r9 + movb %dl,%al + xorq %r10,%r8 + movzwq (%r11,%r13,2),%r13 + movzbl %dl,%ecx + shlb $4,%al + movzbq (%rsp,%rbx,1),%r12 + shrl $4,%ecx + shlq $48,%r13 + xorq %r8,%r12 + movq %r9,%r10 + xorq %r13,%r9 + shrq $8,%r8 + movzbq %r12b,%r12 + movl 8(%rdi),%edx + shrq $8,%r9 + xorq -128(%rbp,%rbx,8),%r8 + shlq $56,%r10 + xorq (%rbp,%rbx,8),%r9 + roll $8,%edx + xorq 8(%rsi,%rax,1),%r8 + xorq (%rsi,%rax,1),%r9 + movb %dl,%al + xorq %r10,%r8 + movzwq (%r11,%r12,2),%r12 + movzbl %dl,%ebx + shlb $4,%al + movzbq (%rsp,%rcx,1),%r13 + shrl $4,%ebx + shlq $48,%r12 + xorq %r8,%r13 + movq %r9,%r10 + xorq %r12,%r9 + shrq $8,%r8 + movzbq %r13b,%r13 + shrq $8,%r9 + xorq -128(%rbp,%rcx,8),%r8 + shlq $56,%r10 + xorq (%rbp,%rcx,8),%r9 + roll $8,%edx + xorq 8(%rsi,%rax,1),%r8 + xorq (%rsi,%rax,1),%r9 + movb %dl,%al + xorq %r10,%r8 + movzwq (%r11,%r13,2),%r13 + movzbl %dl,%ecx + shlb $4,%al + movzbq (%rsp,%rbx,1),%r12 + shrl $4,%ecx + shlq $48,%r13 + xorq %r8,%r12 + movq %r9,%r10 + xorq %r13,%r9 + shrq $8,%r8 + movzbq %r12b,%r12 + shrq $8,%r9 + xorq -128(%rbp,%rbx,8),%r8 + shlq $56,%r10 + xorq (%rbp,%rbx,8),%r9 + roll $8,%edx + xorq 8(%rsi,%rax,1),%r8 + xorq (%rsi,%rax,1),%r9 + movb %dl,%al + xorq %r10,%r8 + movzwq (%r11,%r12,2),%r12 + movzbl %dl,%ebx + shlb $4,%al + movzbq (%rsp,%rcx,1),%r13 + shrl $4,%ebx + shlq $48,%r12 + xorq %r8,%r13 + movq %r9,%r10 + xorq %r12,%r9 + shrq $8,%r8 + movzbq %r13b,%r13 + shrq $8,%r9 + xorq -128(%rbp,%rcx,8),%r8 + shlq $56,%r10 + xorq (%rbp,%rcx,8),%r9 + roll $8,%edx + xorq 8(%rsi,%rax,1),%r8 + xorq (%rsi,%rax,1),%r9 + movb %dl,%al + xorq %r10,%r8 + movzwq (%r11,%r13,2),%r13 + movzbl %dl,%ecx + shlb $4,%al + movzbq (%rsp,%rbx,1),%r12 + shrl $4,%ecx + shlq $48,%r13 + xorq %r8,%r12 + movq %r9,%r10 + xorq %r13,%r9 + shrq $8,%r8 + movzbq %r12b,%r12 + movl 4(%rdi),%edx + shrq $8,%r9 + xorq -128(%rbp,%rbx,8),%r8 + shlq $56,%r10 + xorq (%rbp,%rbx,8),%r9 + roll $8,%edx + xorq 8(%rsi,%rax,1),%r8 + xorq (%rsi,%rax,1),%r9 + movb %dl,%al + xorq %r10,%r8 + movzwq (%r11,%r12,2),%r12 + movzbl %dl,%ebx + shlb $4,%al + movzbq (%rsp,%rcx,1),%r13 + shrl $4,%ebx + shlq $48,%r12 + xorq %r8,%r13 + movq %r9,%r10 + xorq %r12,%r9 + shrq $8,%r8 + movzbq %r13b,%r13 + shrq $8,%r9 + xorq -128(%rbp,%rcx,8),%r8 + shlq $56,%r10 + xorq (%rbp,%rcx,8),%r9 + roll $8,%edx + xorq 8(%rsi,%rax,1),%r8 + xorq (%rsi,%rax,1),%r9 + movb %dl,%al + xorq %r10,%r8 + movzwq (%r11,%r13,2),%r13 + movzbl %dl,%ecx + shlb $4,%al + movzbq (%rsp,%rbx,1),%r12 + shrl $4,%ecx + shlq $48,%r13 + xorq %r8,%r12 + movq %r9,%r10 + xorq %r13,%r9 + shrq $8,%r8 + movzbq %r12b,%r12 + shrq $8,%r9 + xorq -128(%rbp,%rbx,8),%r8 + shlq $56,%r10 + xorq (%rbp,%rbx,8),%r9 + roll $8,%edx + xorq 8(%rsi,%rax,1),%r8 + xorq (%rsi,%rax,1),%r9 + movb %dl,%al + xorq %r10,%r8 + movzwq (%r11,%r12,2),%r12 + movzbl %dl,%ebx + shlb $4,%al + movzbq (%rsp,%rcx,1),%r13 + shrl $4,%ebx + shlq $48,%r12 + xorq %r8,%r13 + movq %r9,%r10 + xorq %r12,%r9 + shrq $8,%r8 + movzbq %r13b,%r13 + shrq $8,%r9 + xorq -128(%rbp,%rcx,8),%r8 + shlq $56,%r10 + xorq (%rbp,%rcx,8),%r9 + roll $8,%edx + xorq 8(%rsi,%rax,1),%r8 + xorq (%rsi,%rax,1),%r9 + movb %dl,%al + xorq %r10,%r8 + movzwq (%r11,%r13,2),%r13 + movzbl %dl,%ecx + shlb $4,%al + movzbq (%rsp,%rbx,1),%r12 + shrl $4,%ecx + shlq $48,%r13 + xorq %r8,%r12 + movq %r9,%r10 + xorq %r13,%r9 + shrq $8,%r8 + movzbq %r12b,%r12 + movl 0(%rdi),%edx + shrq $8,%r9 + xorq -128(%rbp,%rbx,8),%r8 + shlq $56,%r10 + xorq (%rbp,%rbx,8),%r9 + roll $8,%edx + xorq 8(%rsi,%rax,1),%r8 + xorq (%rsi,%rax,1),%r9 + movb %dl,%al + xorq %r10,%r8 + movzwq (%r11,%r12,2),%r12 + movzbl %dl,%ebx + shlb $4,%al + movzbq (%rsp,%rcx,1),%r13 + shrl $4,%ebx + shlq $48,%r12 + xorq %r8,%r13 + movq %r9,%r10 + xorq %r12,%r9 + shrq $8,%r8 + movzbq %r13b,%r13 + shrq $8,%r9 + xorq -128(%rbp,%rcx,8),%r8 + shlq $56,%r10 + xorq (%rbp,%rcx,8),%r9 + roll $8,%edx + xorq 8(%rsi,%rax,1),%r8 + xorq (%rsi,%rax,1),%r9 + movb %dl,%al + xorq %r10,%r8 + movzwq (%r11,%r13,2),%r13 + movzbl %dl,%ecx + shlb $4,%al + movzbq (%rsp,%rbx,1),%r12 + shrl $4,%ecx + shlq $48,%r13 + xorq %r8,%r12 + movq %r9,%r10 + xorq %r13,%r9 + shrq $8,%r8 + movzbq %r12b,%r12 + shrq $8,%r9 + xorq -128(%rbp,%rbx,8),%r8 + shlq $56,%r10 + xorq (%rbp,%rbx,8),%r9 + roll $8,%edx + xorq 8(%rsi,%rax,1),%r8 + xorq (%rsi,%rax,1),%r9 + movb %dl,%al + xorq %r10,%r8 + movzwq (%r11,%r12,2),%r12 + movzbl %dl,%ebx + shlb $4,%al + movzbq (%rsp,%rcx,1),%r13 + shrl $4,%ebx + shlq $48,%r12 + xorq %r8,%r13 + movq %r9,%r10 + xorq %r12,%r9 + shrq $8,%r8 + movzbq %r13b,%r13 + shrq $8,%r9 + xorq -128(%rbp,%rcx,8),%r8 + shlq $56,%r10 + xorq (%rbp,%rcx,8),%r9 + roll $8,%edx + xorq 8(%rsi,%rax,1),%r8 + xorq (%rsi,%rax,1),%r9 + movb %dl,%al + xorq %r10,%r8 + movzwq (%r11,%r13,2),%r13 + movzbl %dl,%ecx + shlb $4,%al + movzbq (%rsp,%rbx,1),%r12 + andl $240,%ecx + shlq $48,%r13 + xorq %r8,%r12 + movq %r9,%r10 + xorq %r13,%r9 + shrq $8,%r8 + movzbq %r12b,%r12 + movl -4(%rdi),%edx + shrq $8,%r9 + xorq -128(%rbp,%rbx,8),%r8 + shlq $56,%r10 + xorq (%rbp,%rbx,8),%r9 + movzwq (%r11,%r12,2),%r12 + xorq 8(%rsi,%rax,1),%r8 + xorq (%rsi,%rax,1),%r9 + shlq $48,%r12 + xorq %r10,%r8 + xorq %r12,%r9 + movzbq %r8b,%r13 + shrq $4,%r8 + movq %r9,%r10 + shlb $4,%r13b + shrq $4,%r9 + xorq 8(%rsi,%rcx,1),%r8 + movzwq (%r11,%r13,2),%r13 + shlq $60,%r10 + xorq (%rsi,%rcx,1),%r9 + xorq %r10,%r8 + shlq $48,%r13 + bswapq %r8 + xorq %r13,%r9 + bswapq %r9 + cmpq %r15,%r14 + jb .Louter_loop + movq %r8,8(%rdi) + movq %r9,(%rdi) + + leaq 280(%rsp),%rsi + movq 0(%rsi),%r15 + movq 8(%rsi),%r14 + movq 16(%rsi),%r13 + movq 24(%rsi),%r12 + movq 32(%rsi),%rbp + movq 40(%rsi),%rbx + leaq 48(%rsi),%rsp +.Lghash_epilogue: + .byte 0xf3,0xc3 +.size gcm_ghash_4bit,.-gcm_ghash_4bit +.globl gcm_init_clmul +.type gcm_init_clmul,@function +.align 16 +gcm_init_clmul: +.L_init_clmul: + movdqu (%rsi),%xmm2 + pshufd $78,%xmm2,%xmm2 + + + pshufd $255,%xmm2,%xmm4 + movdqa %xmm2,%xmm3 + psllq $1,%xmm2 + pxor %xmm5,%xmm5 + psrlq $63,%xmm3 + pcmpgtd %xmm4,%xmm5 + pslldq $8,%xmm3 + por %xmm3,%xmm2 + + + pand .L0x1c2_polynomial(%rip),%xmm5 + pxor %xmm5,%xmm2 + + + pshufd $78,%xmm2,%xmm6 + movdqa %xmm2,%xmm0 + pxor %xmm2,%xmm6 + movdqa %xmm0,%xmm1 + pshufd $78,%xmm0,%xmm3 + pxor %xmm0,%xmm3 +.byte 102,15,58,68,194,0 +.byte 102,15,58,68,202,17 +.byte 102,15,58,68,222,0 + pxor %xmm0,%xmm3 + pxor %xmm1,%xmm3 + + movdqa %xmm3,%xmm4 + psrldq $8,%xmm3 + pslldq $8,%xmm4 + pxor %xmm3,%xmm1 + pxor %xmm4,%xmm0 + + movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 + psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 + pxor %xmm3,%xmm0 + psllq $57,%xmm0 + movdqa %xmm0,%xmm3 + pslldq $8,%xmm0 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 + + + movdqa %xmm0,%xmm4 + psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 + pxor %xmm4,%xmm0 + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 + pshufd $78,%xmm2,%xmm3 + pshufd $78,%xmm0,%xmm4 + pxor %xmm2,%xmm3 + movdqu %xmm2,0(%rdi) + pxor %xmm0,%xmm4 + movdqu %xmm0,16(%rdi) +.byte 102,15,58,15,227,8 + movdqu %xmm4,32(%rdi) + movdqa %xmm0,%xmm1 + pshufd $78,%xmm0,%xmm3 + pxor %xmm0,%xmm3 +.byte 102,15,58,68,194,0 +.byte 102,15,58,68,202,17 +.byte 102,15,58,68,222,0 + pxor %xmm0,%xmm3 + pxor %xmm1,%xmm3 + + movdqa %xmm3,%xmm4 + psrldq $8,%xmm3 + pslldq $8,%xmm4 + pxor %xmm3,%xmm1 + pxor %xmm4,%xmm0 + + movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 + psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 + pxor %xmm3,%xmm0 + psllq $57,%xmm0 + movdqa %xmm0,%xmm3 + pslldq $8,%xmm0 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 + + + movdqa %xmm0,%xmm4 + psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 + pxor %xmm4,%xmm0 + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 + movdqa %xmm0,%xmm5 + movdqa %xmm0,%xmm1 + pshufd $78,%xmm0,%xmm3 + pxor %xmm0,%xmm3 +.byte 102,15,58,68,194,0 +.byte 102,15,58,68,202,17 +.byte 102,15,58,68,222,0 + pxor %xmm0,%xmm3 + pxor %xmm1,%xmm3 + + movdqa %xmm3,%xmm4 + psrldq $8,%xmm3 + pslldq $8,%xmm4 + pxor %xmm3,%xmm1 + pxor %xmm4,%xmm0 + + movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 + psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 + pxor %xmm3,%xmm0 + psllq $57,%xmm0 + movdqa %xmm0,%xmm3 + pslldq $8,%xmm0 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 + + + movdqa %xmm0,%xmm4 + psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 + pxor %xmm4,%xmm0 + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 + pshufd $78,%xmm5,%xmm3 + pshufd $78,%xmm0,%xmm4 + pxor %xmm5,%xmm3 + movdqu %xmm5,48(%rdi) + pxor %xmm0,%xmm4 + movdqu %xmm0,64(%rdi) +.byte 102,15,58,15,227,8 + movdqu %xmm4,80(%rdi) + .byte 0xf3,0xc3 +.size gcm_init_clmul,.-gcm_init_clmul +.globl gcm_gmult_clmul +.type gcm_gmult_clmul,@function +.align 16 +gcm_gmult_clmul: +.L_gmult_clmul: + movdqu (%rdi),%xmm0 + movdqa .Lbswap_mask(%rip),%xmm5 + movdqu (%rsi),%xmm2 + movdqu 32(%rsi),%xmm4 +.byte 102,15,56,0,197 + movdqa %xmm0,%xmm1 + pshufd $78,%xmm0,%xmm3 + pxor %xmm0,%xmm3 +.byte 102,15,58,68,194,0 +.byte 102,15,58,68,202,17 +.byte 102,15,58,68,220,0 + pxor %xmm0,%xmm3 + pxor %xmm1,%xmm3 + + movdqa %xmm3,%xmm4 + psrldq $8,%xmm3 + pslldq $8,%xmm4 + pxor %xmm3,%xmm1 + pxor %xmm4,%xmm0 + + movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 + psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 + pxor %xmm3,%xmm0 + psllq $57,%xmm0 + movdqa %xmm0,%xmm3 + pslldq $8,%xmm0 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 + + + movdqa %xmm0,%xmm4 + psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 + pxor %xmm4,%xmm0 + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 +.byte 102,15,56,0,197 + movdqu %xmm0,(%rdi) + .byte 0xf3,0xc3 +.size gcm_gmult_clmul,.-gcm_gmult_clmul +.globl gcm_ghash_clmul +.type gcm_ghash_clmul,@function +.align 32 +gcm_ghash_clmul: +.L_ghash_clmul: + movdqa .Lbswap_mask(%rip),%xmm10 + + movdqu (%rdi),%xmm0 + movdqu (%rsi),%xmm2 + movdqu 32(%rsi),%xmm7 +.byte 102,65,15,56,0,194 + + subq $16,%rcx + jz .Lodd_tail + + movdqu 16(%rsi),%xmm6 + movl OPENSSL_ia32cap_P+4(%rip),%eax + cmpq $48,%rcx + jb .Lskip4x + + andl $71303168,%eax + cmpl $4194304,%eax + je .Lskip4x + + subq $48,%rcx + movq $11547335547999543296,%rax + movdqu 48(%rsi),%xmm14 + movdqu 64(%rsi),%xmm15 + + + + + movdqu 48(%rdx),%xmm3 + movdqu 32(%rdx),%xmm11 +.byte 102,65,15,56,0,218 +.byte 102,69,15,56,0,218 + movdqa %xmm3,%xmm5 + pshufd $78,%xmm3,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,68,218,0 +.byte 102,15,58,68,234,17 +.byte 102,15,58,68,231,0 + + movdqa %xmm11,%xmm13 + pshufd $78,%xmm11,%xmm12 + pxor %xmm11,%xmm12 +.byte 102,68,15,58,68,222,0 +.byte 102,68,15,58,68,238,17 +.byte 102,68,15,58,68,231,16 + xorps %xmm11,%xmm3 + xorps %xmm13,%xmm5 + movups 80(%rsi),%xmm7 + xorps %xmm12,%xmm4 + + movdqu 16(%rdx),%xmm11 + movdqu 0(%rdx),%xmm8 +.byte 102,69,15,56,0,218 +.byte 102,69,15,56,0,194 + movdqa %xmm11,%xmm13 + pshufd $78,%xmm11,%xmm12 + pxor %xmm8,%xmm0 + pxor %xmm11,%xmm12 +.byte 102,69,15,58,68,222,0 + movdqa %xmm0,%xmm1 + pshufd $78,%xmm0,%xmm8 + pxor %xmm0,%xmm8 +.byte 102,69,15,58,68,238,17 +.byte 102,68,15,58,68,231,0 + xorps %xmm11,%xmm3 + xorps %xmm13,%xmm5 + + leaq 64(%rdx),%rdx + subq $64,%rcx + jc .Ltail4x + + jmp .Lmod4_loop +.align 32 +.Lmod4_loop: +.byte 102,65,15,58,68,199,0 + xorps %xmm12,%xmm4 + movdqu 48(%rdx),%xmm11 +.byte 102,69,15,56,0,218 +.byte 102,65,15,58,68,207,17 + xorps %xmm3,%xmm0 + movdqu 32(%rdx),%xmm3 + movdqa %xmm11,%xmm13 +.byte 102,68,15,58,68,199,16 + pshufd $78,%xmm11,%xmm12 + xorps %xmm5,%xmm1 + pxor %xmm11,%xmm12 +.byte 102,65,15,56,0,218 + movups 32(%rsi),%xmm7 + xorps %xmm4,%xmm8 +.byte 102,68,15,58,68,218,0 + pshufd $78,%xmm3,%xmm4 + + pxor %xmm0,%xmm8 + movdqa %xmm3,%xmm5 + pxor %xmm1,%xmm8 + pxor %xmm3,%xmm4 + movdqa %xmm8,%xmm9 +.byte 102,68,15,58,68,234,17 + pslldq $8,%xmm8 + psrldq $8,%xmm9 + pxor %xmm8,%xmm0 + movdqa .L7_mask(%rip),%xmm8 + pxor %xmm9,%xmm1 +.byte 102,76,15,110,200 + + pand %xmm0,%xmm8 +.byte 102,69,15,56,0,200 + pxor %xmm0,%xmm9 +.byte 102,68,15,58,68,231,0 + psllq $57,%xmm9 + movdqa %xmm9,%xmm8 + pslldq $8,%xmm9 +.byte 102,15,58,68,222,0 + psrldq $8,%xmm8 + pxor %xmm9,%xmm0 + pxor %xmm8,%xmm1 + movdqu 0(%rdx),%xmm8 + + movdqa %xmm0,%xmm9 + psrlq $1,%xmm0 +.byte 102,15,58,68,238,17 + xorps %xmm11,%xmm3 + movdqu 16(%rdx),%xmm11 +.byte 102,69,15,56,0,218 +.byte 102,15,58,68,231,16 + xorps %xmm13,%xmm5 + movups 80(%rsi),%xmm7 +.byte 102,69,15,56,0,194 + pxor %xmm9,%xmm1 + pxor %xmm0,%xmm9 + psrlq $5,%xmm0 + + movdqa %xmm11,%xmm13 + pxor %xmm12,%xmm4 + pshufd $78,%xmm11,%xmm12 + pxor %xmm9,%xmm0 + pxor %xmm8,%xmm1 + pxor %xmm11,%xmm12 +.byte 102,69,15,58,68,222,0 + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 + movdqa %xmm0,%xmm1 +.byte 102,69,15,58,68,238,17 + xorps %xmm11,%xmm3 + pshufd $78,%xmm0,%xmm8 + pxor %xmm0,%xmm8 + +.byte 102,68,15,58,68,231,0 + xorps %xmm13,%xmm5 + + leaq 64(%rdx),%rdx + subq $64,%rcx + jnc .Lmod4_loop + +.Ltail4x: +.byte 102,65,15,58,68,199,0 +.byte 102,65,15,58,68,207,17 +.byte 102,68,15,58,68,199,16 + xorps %xmm12,%xmm4 + xorps %xmm3,%xmm0 + xorps %xmm5,%xmm1 + pxor %xmm0,%xmm1 + pxor %xmm4,%xmm8 + + pxor %xmm1,%xmm8 + pxor %xmm0,%xmm1 + + movdqa %xmm8,%xmm9 + psrldq $8,%xmm8 + pslldq $8,%xmm9 + pxor %xmm8,%xmm1 + pxor %xmm9,%xmm0 + + movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 + psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 + pxor %xmm3,%xmm0 + psllq $57,%xmm0 + movdqa %xmm0,%xmm3 + pslldq $8,%xmm0 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 + + + movdqa %xmm0,%xmm4 + psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 + pxor %xmm4,%xmm0 + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 + addq $64,%rcx + jz .Ldone + movdqu 32(%rsi),%xmm7 + subq $16,%rcx + jz .Lodd_tail +.Lskip4x: + + + + + + movdqu (%rdx),%xmm8 + movdqu 16(%rdx),%xmm3 +.byte 102,69,15,56,0,194 +.byte 102,65,15,56,0,218 + pxor %xmm8,%xmm0 + + movdqa %xmm3,%xmm5 + pshufd $78,%xmm3,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,68,218,0 +.byte 102,15,58,68,234,17 +.byte 102,15,58,68,231,0 + + leaq 32(%rdx),%rdx + nop + subq $32,%rcx + jbe .Leven_tail + nop + jmp .Lmod_loop + +.align 32 +.Lmod_loop: + movdqa %xmm0,%xmm1 + movdqa %xmm4,%xmm8 + pshufd $78,%xmm0,%xmm4 + pxor %xmm0,%xmm4 + +.byte 102,15,58,68,198,0 +.byte 102,15,58,68,206,17 +.byte 102,15,58,68,231,16 + + pxor %xmm3,%xmm0 + pxor %xmm5,%xmm1 + movdqu (%rdx),%xmm9 + pxor %xmm0,%xmm8 +.byte 102,69,15,56,0,202 + movdqu 16(%rdx),%xmm3 + + pxor %xmm1,%xmm8 + pxor %xmm9,%xmm1 + pxor %xmm8,%xmm4 +.byte 102,65,15,56,0,218 + movdqa %xmm4,%xmm8 + psrldq $8,%xmm8 + pslldq $8,%xmm4 + pxor %xmm8,%xmm1 + pxor %xmm4,%xmm0 + + movdqa %xmm3,%xmm5 + + movdqa %xmm0,%xmm9 + movdqa %xmm0,%xmm8 + psllq $5,%xmm0 + pxor %xmm0,%xmm8 +.byte 102,15,58,68,218,0 + psllq $1,%xmm0 + pxor %xmm8,%xmm0 + psllq $57,%xmm0 + movdqa %xmm0,%xmm8 + pslldq $8,%xmm0 + psrldq $8,%xmm8 + pxor %xmm9,%xmm0 + pshufd $78,%xmm5,%xmm4 + pxor %xmm8,%xmm1 + pxor %xmm5,%xmm4 + + movdqa %xmm0,%xmm9 + psrlq $1,%xmm0 +.byte 102,15,58,68,234,17 + pxor %xmm9,%xmm1 + pxor %xmm0,%xmm9 + psrlq $5,%xmm0 + pxor %xmm9,%xmm0 + leaq 32(%rdx),%rdx + psrlq $1,%xmm0 +.byte 102,15,58,68,231,0 + pxor %xmm1,%xmm0 + + subq $32,%rcx + ja .Lmod_loop + +.Leven_tail: + movdqa %xmm0,%xmm1 + movdqa %xmm4,%xmm8 + pshufd $78,%xmm0,%xmm4 + pxor %xmm0,%xmm4 + +.byte 102,15,58,68,198,0 +.byte 102,15,58,68,206,17 +.byte 102,15,58,68,231,16 + + pxor %xmm3,%xmm0 + pxor %xmm5,%xmm1 + pxor %xmm0,%xmm8 + pxor %xmm1,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm8 + psrldq $8,%xmm8 + pslldq $8,%xmm4 + pxor %xmm8,%xmm1 + pxor %xmm4,%xmm0 + + movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 + psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 + pxor %xmm3,%xmm0 + psllq $57,%xmm0 + movdqa %xmm0,%xmm3 + pslldq $8,%xmm0 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 + + + movdqa %xmm0,%xmm4 + psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 + pxor %xmm4,%xmm0 + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 + testq %rcx,%rcx + jnz .Ldone + +.Lodd_tail: + movdqu (%rdx),%xmm8 +.byte 102,69,15,56,0,194 + pxor %xmm8,%xmm0 + movdqa %xmm0,%xmm1 + pshufd $78,%xmm0,%xmm3 + pxor %xmm0,%xmm3 +.byte 102,15,58,68,194,0 +.byte 102,15,58,68,202,17 +.byte 102,15,58,68,223,0 + pxor %xmm0,%xmm3 + pxor %xmm1,%xmm3 + + movdqa %xmm3,%xmm4 + psrldq $8,%xmm3 + pslldq $8,%xmm4 + pxor %xmm3,%xmm1 + pxor %xmm4,%xmm0 + + movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 + psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 + pxor %xmm3,%xmm0 + psllq $57,%xmm0 + movdqa %xmm0,%xmm3 + pslldq $8,%xmm0 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 + + + movdqa %xmm0,%xmm4 + psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 + pxor %xmm4,%xmm0 + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 +.Ldone: +.byte 102,65,15,56,0,194 + movdqu %xmm0,(%rdi) + .byte 0xf3,0xc3 +.size gcm_ghash_clmul,.-gcm_ghash_clmul +.globl gcm_init_avx +.type gcm_init_avx,@function +.align 32 +gcm_init_avx: + jmp .L_init_clmul +.size gcm_init_avx,.-gcm_init_avx +.globl gcm_gmult_avx +.type gcm_gmult_avx,@function +.align 32 +gcm_gmult_avx: + jmp .L_gmult_clmul +.size gcm_gmult_avx,.-gcm_gmult_avx +.globl gcm_ghash_avx +.type gcm_ghash_avx,@function +.align 32 +gcm_ghash_avx: + jmp .L_ghash_clmul +.size gcm_ghash_avx,.-gcm_ghash_avx +.align 64 +.Lbswap_mask: +.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 +.L0x1c2_polynomial: +.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2 +.L7_mask: +.long 7,0,7,0 +.L7_mask_poly: +.long 7,0,450,0 +.align 64 +.type .Lrem_4bit,@object +.Lrem_4bit: +.long 0,0,0,471859200,0,943718400,0,610271232 +.long 0,1887436800,0,1822425088,0,1220542464,0,1423966208 +.long 0,3774873600,0,4246732800,0,3644850176,0,3311403008 +.long 0,2441084928,0,2376073216,0,2847932416,0,3051356160 +.type .Lrem_8bit,@object +.Lrem_8bit: +.value 0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E +.value 0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E +.value 0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E +.value 0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E +.value 0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E +.value 0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E +.value 0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E +.value 0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E +.value 0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE +.value 0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE +.value 0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE +.value 0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE +.value 0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E +.value 0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E +.value 0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE +.value 0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE +.value 0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E +.value 0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E +.value 0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E +.value 0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E +.value 0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E +.value 0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E +.value 0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E +.value 0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E +.value 0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE +.value 0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE +.value 0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE +.value 0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE +.value 0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E +.value 0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E +.value 0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE +.value 0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE + +.byte 71,72,65,83,72,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 64 diff --git a/secure/lib/libcrypto/amd64/md5-x86_64.S b/secure/lib/libcrypto/amd64/md5-x86_64.S new file mode 100644 index 0000000..94fb761 --- /dev/null +++ b/secure/lib/libcrypto/amd64/md5-x86_64.S @@ -0,0 +1,669 @@ + # $FreeBSD$ +.text +.align 16 + +.globl md5_block_asm_data_order +.type md5_block_asm_data_order,@function +md5_block_asm_data_order: + pushq %rbp + pushq %rbx + pushq %r12 + pushq %r14 + pushq %r15 +.Lprologue: + + + + + movq %rdi,%rbp + shlq $6,%rdx + leaq (%rsi,%rdx,1),%rdi + movl 0(%rbp),%eax + movl 4(%rbp),%ebx + movl 8(%rbp),%ecx + movl 12(%rbp),%edx + + + + + + + + cmpq %rdi,%rsi + je .Lend + + +.Lloop: + movl %eax,%r8d + movl %ebx,%r9d + movl %ecx,%r14d + movl %edx,%r15d + movl 0(%rsi),%r10d + movl %edx,%r11d + xorl %ecx,%r11d + leal -680876936(%rax,%r10,1),%eax + andl %ebx,%r11d + xorl %edx,%r11d + movl 4(%rsi),%r10d + addl %r11d,%eax + roll $7,%eax + movl %ecx,%r11d + addl %ebx,%eax + xorl %ebx,%r11d + leal -389564586(%rdx,%r10,1),%edx + andl %eax,%r11d + xorl %ecx,%r11d + movl 8(%rsi),%r10d + addl %r11d,%edx + roll $12,%edx + movl %ebx,%r11d + addl %eax,%edx + xorl %eax,%r11d + leal 606105819(%rcx,%r10,1),%ecx + andl %edx,%r11d + xorl %ebx,%r11d + movl 12(%rsi),%r10d + addl %r11d,%ecx + roll $17,%ecx + movl %eax,%r11d + addl %edx,%ecx + xorl %edx,%r11d + leal -1044525330(%rbx,%r10,1),%ebx + andl %ecx,%r11d + xorl %eax,%r11d + movl 16(%rsi),%r10d + addl %r11d,%ebx + roll $22,%ebx + movl %edx,%r11d + addl %ecx,%ebx + xorl %ecx,%r11d + leal -176418897(%rax,%r10,1),%eax + andl %ebx,%r11d + xorl %edx,%r11d + movl 20(%rsi),%r10d + addl %r11d,%eax + roll $7,%eax + movl %ecx,%r11d + addl %ebx,%eax + xorl %ebx,%r11d + leal 1200080426(%rdx,%r10,1),%edx + andl %eax,%r11d + xorl %ecx,%r11d + movl 24(%rsi),%r10d + addl %r11d,%edx + roll $12,%edx + movl %ebx,%r11d + addl %eax,%edx + xorl %eax,%r11d + leal -1473231341(%rcx,%r10,1),%ecx + andl %edx,%r11d + xorl %ebx,%r11d + movl 28(%rsi),%r10d + addl %r11d,%ecx + roll $17,%ecx + movl %eax,%r11d + addl %edx,%ecx + xorl %edx,%r11d + leal -45705983(%rbx,%r10,1),%ebx + andl %ecx,%r11d + xorl %eax,%r11d + movl 32(%rsi),%r10d + addl %r11d,%ebx + roll $22,%ebx + movl %edx,%r11d + addl %ecx,%ebx + xorl %ecx,%r11d + leal 1770035416(%rax,%r10,1),%eax + andl %ebx,%r11d + xorl %edx,%r11d + movl 36(%rsi),%r10d + addl %r11d,%eax + roll $7,%eax + movl %ecx,%r11d + addl %ebx,%eax + xorl %ebx,%r11d + leal -1958414417(%rdx,%r10,1),%edx + andl %eax,%r11d + xorl %ecx,%r11d + movl 40(%rsi),%r10d + addl %r11d,%edx + roll $12,%edx + movl %ebx,%r11d + addl %eax,%edx + xorl %eax,%r11d + leal -42063(%rcx,%r10,1),%ecx + andl %edx,%r11d + xorl %ebx,%r11d + movl 44(%rsi),%r10d + addl %r11d,%ecx + roll $17,%ecx + movl %eax,%r11d + addl %edx,%ecx + xorl %edx,%r11d + leal -1990404162(%rbx,%r10,1),%ebx + andl %ecx,%r11d + xorl %eax,%r11d + movl 48(%rsi),%r10d + addl %r11d,%ebx + roll $22,%ebx + movl %edx,%r11d + addl %ecx,%ebx + xorl %ecx,%r11d + leal 1804603682(%rax,%r10,1),%eax + andl %ebx,%r11d + xorl %edx,%r11d + movl 52(%rsi),%r10d + addl %r11d,%eax + roll $7,%eax + movl %ecx,%r11d + addl %ebx,%eax + xorl %ebx,%r11d + leal -40341101(%rdx,%r10,1),%edx + andl %eax,%r11d + xorl %ecx,%r11d + movl 56(%rsi),%r10d + addl %r11d,%edx + roll $12,%edx + movl %ebx,%r11d + addl %eax,%edx + xorl %eax,%r11d + leal -1502002290(%rcx,%r10,1),%ecx + andl %edx,%r11d + xorl %ebx,%r11d + movl 60(%rsi),%r10d + addl %r11d,%ecx + roll $17,%ecx + movl %eax,%r11d + addl %edx,%ecx + xorl %edx,%r11d + leal 1236535329(%rbx,%r10,1),%ebx + andl %ecx,%r11d + xorl %eax,%r11d + movl 0(%rsi),%r10d + addl %r11d,%ebx + roll $22,%ebx + movl %edx,%r11d + addl %ecx,%ebx + movl 4(%rsi),%r10d + movl %edx,%r11d + movl %edx,%r12d + notl %r11d + leal -165796510(%rax,%r10,1),%eax + andl %ebx,%r12d + andl %ecx,%r11d + movl 24(%rsi),%r10d + orl %r11d,%r12d + movl %ecx,%r11d + addl %r12d,%eax + movl %ecx,%r12d + roll $5,%eax + addl %ebx,%eax + notl %r11d + leal -1069501632(%rdx,%r10,1),%edx + andl %eax,%r12d + andl %ebx,%r11d + movl 44(%rsi),%r10d + orl %r11d,%r12d + movl %ebx,%r11d + addl %r12d,%edx + movl %ebx,%r12d + roll $9,%edx + addl %eax,%edx + notl %r11d + leal 643717713(%rcx,%r10,1),%ecx + andl %edx,%r12d + andl %eax,%r11d + movl 0(%rsi),%r10d + orl %r11d,%r12d + movl %eax,%r11d + addl %r12d,%ecx + movl %eax,%r12d + roll $14,%ecx + addl %edx,%ecx + notl %r11d + leal -373897302(%rbx,%r10,1),%ebx + andl %ecx,%r12d + andl %edx,%r11d + movl 20(%rsi),%r10d + orl %r11d,%r12d + movl %edx,%r11d + addl %r12d,%ebx + movl %edx,%r12d + roll $20,%ebx + addl %ecx,%ebx + notl %r11d + leal -701558691(%rax,%r10,1),%eax + andl %ebx,%r12d + andl %ecx,%r11d + movl 40(%rsi),%r10d + orl %r11d,%r12d + movl %ecx,%r11d + addl %r12d,%eax + movl %ecx,%r12d + roll $5,%eax + addl %ebx,%eax + notl %r11d + leal 38016083(%rdx,%r10,1),%edx + andl %eax,%r12d + andl %ebx,%r11d + movl 60(%rsi),%r10d + orl %r11d,%r12d + movl %ebx,%r11d + addl %r12d,%edx + movl %ebx,%r12d + roll $9,%edx + addl %eax,%edx + notl %r11d + leal -660478335(%rcx,%r10,1),%ecx + andl %edx,%r12d + andl %eax,%r11d + movl 16(%rsi),%r10d + orl %r11d,%r12d + movl %eax,%r11d + addl %r12d,%ecx + movl %eax,%r12d + roll $14,%ecx + addl %edx,%ecx + notl %r11d + leal -405537848(%rbx,%r10,1),%ebx + andl %ecx,%r12d + andl %edx,%r11d + movl 36(%rsi),%r10d + orl %r11d,%r12d + movl %edx,%r11d + addl %r12d,%ebx + movl %edx,%r12d + roll $20,%ebx + addl %ecx,%ebx + notl %r11d + leal 568446438(%rax,%r10,1),%eax + andl %ebx,%r12d + andl %ecx,%r11d + movl 56(%rsi),%r10d + orl %r11d,%r12d + movl %ecx,%r11d + addl %r12d,%eax + movl %ecx,%r12d + roll $5,%eax + addl %ebx,%eax + notl %r11d + leal -1019803690(%rdx,%r10,1),%edx + andl %eax,%r12d + andl %ebx,%r11d + movl 12(%rsi),%r10d + orl %r11d,%r12d + movl %ebx,%r11d + addl %r12d,%edx + movl %ebx,%r12d + roll $9,%edx + addl %eax,%edx + notl %r11d + leal -187363961(%rcx,%r10,1),%ecx + andl %edx,%r12d + andl %eax,%r11d + movl 32(%rsi),%r10d + orl %r11d,%r12d + movl %eax,%r11d + addl %r12d,%ecx + movl %eax,%r12d + roll $14,%ecx + addl %edx,%ecx + notl %r11d + leal 1163531501(%rbx,%r10,1),%ebx + andl %ecx,%r12d + andl %edx,%r11d + movl 52(%rsi),%r10d + orl %r11d,%r12d + movl %edx,%r11d + addl %r12d,%ebx + movl %edx,%r12d + roll $20,%ebx + addl %ecx,%ebx + notl %r11d + leal -1444681467(%rax,%r10,1),%eax + andl %ebx,%r12d + andl %ecx,%r11d + movl 8(%rsi),%r10d + orl %r11d,%r12d + movl %ecx,%r11d + addl %r12d,%eax + movl %ecx,%r12d + roll $5,%eax + addl %ebx,%eax + notl %r11d + leal -51403784(%rdx,%r10,1),%edx + andl %eax,%r12d + andl %ebx,%r11d + movl 28(%rsi),%r10d + orl %r11d,%r12d + movl %ebx,%r11d + addl %r12d,%edx + movl %ebx,%r12d + roll $9,%edx + addl %eax,%edx + notl %r11d + leal 1735328473(%rcx,%r10,1),%ecx + andl %edx,%r12d + andl %eax,%r11d + movl 48(%rsi),%r10d + orl %r11d,%r12d + movl %eax,%r11d + addl %r12d,%ecx + movl %eax,%r12d + roll $14,%ecx + addl %edx,%ecx + notl %r11d + leal -1926607734(%rbx,%r10,1),%ebx + andl %ecx,%r12d + andl %edx,%r11d + movl 0(%rsi),%r10d + orl %r11d,%r12d + movl %edx,%r11d + addl %r12d,%ebx + movl %edx,%r12d + roll $20,%ebx + addl %ecx,%ebx + movl 20(%rsi),%r10d + movl %ecx,%r11d + leal -378558(%rax,%r10,1),%eax + movl 32(%rsi),%r10d + xorl %edx,%r11d + xorl %ebx,%r11d + addl %r11d,%eax + roll $4,%eax + movl %ebx,%r11d + addl %ebx,%eax + leal -2022574463(%rdx,%r10,1),%edx + movl 44(%rsi),%r10d + xorl %ecx,%r11d + xorl %eax,%r11d + addl %r11d,%edx + roll $11,%edx + movl %eax,%r11d + addl %eax,%edx + leal 1839030562(%rcx,%r10,1),%ecx + movl 56(%rsi),%r10d + xorl %ebx,%r11d + xorl %edx,%r11d + addl %r11d,%ecx + roll $16,%ecx + movl %edx,%r11d + addl %edx,%ecx + leal -35309556(%rbx,%r10,1),%ebx + movl 4(%rsi),%r10d + xorl %eax,%r11d + xorl %ecx,%r11d + addl %r11d,%ebx + roll $23,%ebx + movl %ecx,%r11d + addl %ecx,%ebx + leal -1530992060(%rax,%r10,1),%eax + movl 16(%rsi),%r10d + xorl %edx,%r11d + xorl %ebx,%r11d + addl %r11d,%eax + roll $4,%eax + movl %ebx,%r11d + addl %ebx,%eax + leal 1272893353(%rdx,%r10,1),%edx + movl 28(%rsi),%r10d + xorl %ecx,%r11d + xorl %eax,%r11d + addl %r11d,%edx + roll $11,%edx + movl %eax,%r11d + addl %eax,%edx + leal -155497632(%rcx,%r10,1),%ecx + movl 40(%rsi),%r10d + xorl %ebx,%r11d + xorl %edx,%r11d + addl %r11d,%ecx + roll $16,%ecx + movl %edx,%r11d + addl %edx,%ecx + leal -1094730640(%rbx,%r10,1),%ebx + movl 52(%rsi),%r10d + xorl %eax,%r11d + xorl %ecx,%r11d + addl %r11d,%ebx + roll $23,%ebx + movl %ecx,%r11d + addl %ecx,%ebx + leal 681279174(%rax,%r10,1),%eax + movl 0(%rsi),%r10d + xorl %edx,%r11d + xorl %ebx,%r11d + addl %r11d,%eax + roll $4,%eax + movl %ebx,%r11d + addl %ebx,%eax + leal -358537222(%rdx,%r10,1),%edx + movl 12(%rsi),%r10d + xorl %ecx,%r11d + xorl %eax,%r11d + addl %r11d,%edx + roll $11,%edx + movl %eax,%r11d + addl %eax,%edx + leal -722521979(%rcx,%r10,1),%ecx + movl 24(%rsi),%r10d + xorl %ebx,%r11d + xorl %edx,%r11d + addl %r11d,%ecx + roll $16,%ecx + movl %edx,%r11d + addl %edx,%ecx + leal 76029189(%rbx,%r10,1),%ebx + movl 36(%rsi),%r10d + xorl %eax,%r11d + xorl %ecx,%r11d + addl %r11d,%ebx + roll $23,%ebx + movl %ecx,%r11d + addl %ecx,%ebx + leal -640364487(%rax,%r10,1),%eax + movl 48(%rsi),%r10d + xorl %edx,%r11d + xorl %ebx,%r11d + addl %r11d,%eax + roll $4,%eax + movl %ebx,%r11d + addl %ebx,%eax + leal -421815835(%rdx,%r10,1),%edx + movl 60(%rsi),%r10d + xorl %ecx,%r11d + xorl %eax,%r11d + addl %r11d,%edx + roll $11,%edx + movl %eax,%r11d + addl %eax,%edx + leal 530742520(%rcx,%r10,1),%ecx + movl 8(%rsi),%r10d + xorl %ebx,%r11d + xorl %edx,%r11d + addl %r11d,%ecx + roll $16,%ecx + movl %edx,%r11d + addl %edx,%ecx + leal -995338651(%rbx,%r10,1),%ebx + movl 0(%rsi),%r10d + xorl %eax,%r11d + xorl %ecx,%r11d + addl %r11d,%ebx + roll $23,%ebx + movl %ecx,%r11d + addl %ecx,%ebx + movl 0(%rsi),%r10d + movl $4294967295,%r11d + xorl %edx,%r11d + leal -198630844(%rax,%r10,1),%eax + orl %ebx,%r11d + xorl %ecx,%r11d + addl %r11d,%eax + movl 28(%rsi),%r10d + movl $4294967295,%r11d + roll $6,%eax + xorl %ecx,%r11d + addl %ebx,%eax + leal 1126891415(%rdx,%r10,1),%edx + orl %eax,%r11d + xorl %ebx,%r11d + addl %r11d,%edx + movl 56(%rsi),%r10d + movl $4294967295,%r11d + roll $10,%edx + xorl %ebx,%r11d + addl %eax,%edx + leal -1416354905(%rcx,%r10,1),%ecx + orl %edx,%r11d + xorl %eax,%r11d + addl %r11d,%ecx + movl 20(%rsi),%r10d + movl $4294967295,%r11d + roll $15,%ecx + xorl %eax,%r11d + addl %edx,%ecx + leal -57434055(%rbx,%r10,1),%ebx + orl %ecx,%r11d + xorl %edx,%r11d + addl %r11d,%ebx + movl 48(%rsi),%r10d + movl $4294967295,%r11d + roll $21,%ebx + xorl %edx,%r11d + addl %ecx,%ebx + leal 1700485571(%rax,%r10,1),%eax + orl %ebx,%r11d + xorl %ecx,%r11d + addl %r11d,%eax + movl 12(%rsi),%r10d + movl $4294967295,%r11d + roll $6,%eax + xorl %ecx,%r11d + addl %ebx,%eax + leal -1894986606(%rdx,%r10,1),%edx + orl %eax,%r11d + xorl %ebx,%r11d + addl %r11d,%edx + movl 40(%rsi),%r10d + movl $4294967295,%r11d + roll $10,%edx + xorl %ebx,%r11d + addl %eax,%edx + leal -1051523(%rcx,%r10,1),%ecx + orl %edx,%r11d + xorl %eax,%r11d + addl %r11d,%ecx + movl 4(%rsi),%r10d + movl $4294967295,%r11d + roll $15,%ecx + xorl %eax,%r11d + addl %edx,%ecx + leal -2054922799(%rbx,%r10,1),%ebx + orl %ecx,%r11d + xorl %edx,%r11d + addl %r11d,%ebx + movl 32(%rsi),%r10d + movl $4294967295,%r11d + roll $21,%ebx + xorl %edx,%r11d + addl %ecx,%ebx + leal 1873313359(%rax,%r10,1),%eax + orl %ebx,%r11d + xorl %ecx,%r11d + addl %r11d,%eax + movl 60(%rsi),%r10d + movl $4294967295,%r11d + roll $6,%eax + xorl %ecx,%r11d + addl %ebx,%eax + leal -30611744(%rdx,%r10,1),%edx + orl %eax,%r11d + xorl %ebx,%r11d + addl %r11d,%edx + movl 24(%rsi),%r10d + movl $4294967295,%r11d + roll $10,%edx + xorl %ebx,%r11d + addl %eax,%edx + leal -1560198380(%rcx,%r10,1),%ecx + orl %edx,%r11d + xorl %eax,%r11d + addl %r11d,%ecx + movl 52(%rsi),%r10d + movl $4294967295,%r11d + roll $15,%ecx + xorl %eax,%r11d + addl %edx,%ecx + leal 1309151649(%rbx,%r10,1),%ebx + orl %ecx,%r11d + xorl %edx,%r11d + addl %r11d,%ebx + movl 16(%rsi),%r10d + movl $4294967295,%r11d + roll $21,%ebx + xorl %edx,%r11d + addl %ecx,%ebx + leal -145523070(%rax,%r10,1),%eax + orl %ebx,%r11d + xorl %ecx,%r11d + addl %r11d,%eax + movl 44(%rsi),%r10d + movl $4294967295,%r11d + roll $6,%eax + xorl %ecx,%r11d + addl %ebx,%eax + leal -1120210379(%rdx,%r10,1),%edx + orl %eax,%r11d + xorl %ebx,%r11d + addl %r11d,%edx + movl 8(%rsi),%r10d + movl $4294967295,%r11d + roll $10,%edx + xorl %ebx,%r11d + addl %eax,%edx + leal 718787259(%rcx,%r10,1),%ecx + orl %edx,%r11d + xorl %eax,%r11d + addl %r11d,%ecx + movl 36(%rsi),%r10d + movl $4294967295,%r11d + roll $15,%ecx + xorl %eax,%r11d + addl %edx,%ecx + leal -343485551(%rbx,%r10,1),%ebx + orl %ecx,%r11d + xorl %edx,%r11d + addl %r11d,%ebx + movl 0(%rsi),%r10d + movl $4294967295,%r11d + roll $21,%ebx + xorl %edx,%r11d + addl %ecx,%ebx + + addl %r8d,%eax + addl %r9d,%ebx + addl %r14d,%ecx + addl %r15d,%edx + + + addq $64,%rsi + cmpq %rdi,%rsi + jb .Lloop + + +.Lend: + movl %eax,0(%rbp) + movl %ebx,4(%rbp) + movl %ecx,8(%rbp) + movl %edx,12(%rbp) + + movq (%rsp),%r15 + movq 8(%rsp),%r14 + movq 16(%rsp),%r12 + movq 24(%rsp),%rbx + movq 32(%rsp),%rbp + addq $40,%rsp +.Lepilogue: + .byte 0xf3,0xc3 +.size md5_block_asm_data_order,.-md5_block_asm_data_order diff --git a/secure/lib/libcrypto/amd64/rc4-md5-x86_64.S b/secure/lib/libcrypto/amd64/rc4-md5-x86_64.S new file mode 100644 index 0000000..c94f649 --- /dev/null +++ b/secure/lib/libcrypto/amd64/rc4-md5-x86_64.S @@ -0,0 +1,1260 @@ + # $FreeBSD$ +.text +.align 16 + +.globl rc4_md5_enc +.type rc4_md5_enc,@function +rc4_md5_enc: + cmpq $0,%r9 + je .Labort + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $40,%rsp +.Lbody: + movq %rcx,%r11 + movq %r9,%r12 + movq %rsi,%r13 + movq %rdx,%r14 + movq %r8,%r15 + xorq %rbp,%rbp + xorq %rcx,%rcx + + leaq 8(%rdi),%rdi + movb -8(%rdi),%bpl + movb -4(%rdi),%cl + + incb %bpl + subq %r13,%r14 + movl (%rdi,%rbp,4),%eax + addb %al,%cl + leaq (%rdi,%rbp,4),%rsi + shlq $6,%r12 + addq %r15,%r12 + movq %r12,16(%rsp) + + movq %r11,24(%rsp) + movl 0(%r11),%r8d + movl 4(%r11),%r9d + movl 8(%r11),%r10d + movl 12(%r11),%r11d + jmp .Loop + +.align 16 +.Loop: + movl %r8d,0(%rsp) + movl %r9d,4(%rsp) + movl %r10d,8(%rsp) + movl %r11d,%r12d + movl %r11d,12(%rsp) + pxor %xmm0,%xmm0 + movl (%rdi,%rcx,4),%edx + xorl %r10d,%r12d + movl %eax,(%rdi,%rcx,4) + andl %r9d,%r12d + addl 0(%r15),%r8d + addb %dl,%al + movl 4(%rsi),%ebx + addl $3614090360,%r8d + xorl %r11d,%r12d + movzbl %al,%eax + movl %edx,0(%rsi) + addl %r12d,%r8d + addb %bl,%cl + roll $7,%r8d + movl %r10d,%r12d + movd (%rdi,%rax,4),%xmm0 + + addl %r9d,%r8d + pxor %xmm1,%xmm1 + movl (%rdi,%rcx,4),%edx + xorl %r9d,%r12d + movl %ebx,(%rdi,%rcx,4) + andl %r8d,%r12d + addl 4(%r15),%r11d + addb %dl,%bl + movl 8(%rsi),%eax + addl $3905402710,%r11d + xorl %r10d,%r12d + movzbl %bl,%ebx + movl %edx,4(%rsi) + addl %r12d,%r11d + addb %al,%cl + roll $12,%r11d + movl %r9d,%r12d + movd (%rdi,%rbx,4),%xmm1 + + addl %r8d,%r11d + movl (%rdi,%rcx,4),%edx + xorl %r8d,%r12d + movl %eax,(%rdi,%rcx,4) + andl %r11d,%r12d + addl 8(%r15),%r10d + addb %dl,%al + movl 12(%rsi),%ebx + addl $606105819,%r10d + xorl %r9d,%r12d + movzbl %al,%eax + movl %edx,8(%rsi) + addl %r12d,%r10d + addb %bl,%cl + roll $17,%r10d + movl %r8d,%r12d + pinsrw $1,(%rdi,%rax,4),%xmm0 + + addl %r11d,%r10d + movl (%rdi,%rcx,4),%edx + xorl %r11d,%r12d + movl %ebx,(%rdi,%rcx,4) + andl %r10d,%r12d + addl 12(%r15),%r9d + addb %dl,%bl + movl 16(%rsi),%eax + addl $3250441966,%r9d + xorl %r8d,%r12d + movzbl %bl,%ebx + movl %edx,12(%rsi) + addl %r12d,%r9d + addb %al,%cl + roll $22,%r9d + movl %r11d,%r12d + pinsrw $1,(%rdi,%rbx,4),%xmm1 + + addl %r10d,%r9d + movl (%rdi,%rcx,4),%edx + xorl %r10d,%r12d + movl %eax,(%rdi,%rcx,4) + andl %r9d,%r12d + addl 16(%r15),%r8d + addb %dl,%al + movl 20(%rsi),%ebx + addl $4118548399,%r8d + xorl %r11d,%r12d + movzbl %al,%eax + movl %edx,16(%rsi) + addl %r12d,%r8d + addb %bl,%cl + roll $7,%r8d + movl %r10d,%r12d + pinsrw $2,(%rdi,%rax,4),%xmm0 + + addl %r9d,%r8d + movl (%rdi,%rcx,4),%edx + xorl %r9d,%r12d + movl %ebx,(%rdi,%rcx,4) + andl %r8d,%r12d + addl 20(%r15),%r11d + addb %dl,%bl + movl 24(%rsi),%eax + addl $1200080426,%r11d + xorl %r10d,%r12d + movzbl %bl,%ebx + movl %edx,20(%rsi) + addl %r12d,%r11d + addb %al,%cl + roll $12,%r11d + movl %r9d,%r12d + pinsrw $2,(%rdi,%rbx,4),%xmm1 + + addl %r8d,%r11d + movl (%rdi,%rcx,4),%edx + xorl %r8d,%r12d + movl %eax,(%rdi,%rcx,4) + andl %r11d,%r12d + addl 24(%r15),%r10d + addb %dl,%al + movl 28(%rsi),%ebx + addl $2821735955,%r10d + xorl %r9d,%r12d + movzbl %al,%eax + movl %edx,24(%rsi) + addl %r12d,%r10d + addb %bl,%cl + roll $17,%r10d + movl %r8d,%r12d + pinsrw $3,(%rdi,%rax,4),%xmm0 + + addl %r11d,%r10d + movl (%rdi,%rcx,4),%edx + xorl %r11d,%r12d + movl %ebx,(%rdi,%rcx,4) + andl %r10d,%r12d + addl 28(%r15),%r9d + addb %dl,%bl + movl 32(%rsi),%eax + addl $4249261313,%r9d + xorl %r8d,%r12d + movzbl %bl,%ebx + movl %edx,28(%rsi) + addl %r12d,%r9d + addb %al,%cl + roll $22,%r9d + movl %r11d,%r12d + pinsrw $3,(%rdi,%rbx,4),%xmm1 + + addl %r10d,%r9d + movl (%rdi,%rcx,4),%edx + xorl %r10d,%r12d + movl %eax,(%rdi,%rcx,4) + andl %r9d,%r12d + addl 32(%r15),%r8d + addb %dl,%al + movl 36(%rsi),%ebx + addl $1770035416,%r8d + xorl %r11d,%r12d + movzbl %al,%eax + movl %edx,32(%rsi) + addl %r12d,%r8d + addb %bl,%cl + roll $7,%r8d + movl %r10d,%r12d + pinsrw $4,(%rdi,%rax,4),%xmm0 + + addl %r9d,%r8d + movl (%rdi,%rcx,4),%edx + xorl %r9d,%r12d + movl %ebx,(%rdi,%rcx,4) + andl %r8d,%r12d + addl 36(%r15),%r11d + addb %dl,%bl + movl 40(%rsi),%eax + addl $2336552879,%r11d + xorl %r10d,%r12d + movzbl %bl,%ebx + movl %edx,36(%rsi) + addl %r12d,%r11d + addb %al,%cl + roll $12,%r11d + movl %r9d,%r12d + pinsrw $4,(%rdi,%rbx,4),%xmm1 + + addl %r8d,%r11d + movl (%rdi,%rcx,4),%edx + xorl %r8d,%r12d + movl %eax,(%rdi,%rcx,4) + andl %r11d,%r12d + addl 40(%r15),%r10d + addb %dl,%al + movl 44(%rsi),%ebx + addl $4294925233,%r10d + xorl %r9d,%r12d + movzbl %al,%eax + movl %edx,40(%rsi) + addl %r12d,%r10d + addb %bl,%cl + roll $17,%r10d + movl %r8d,%r12d + pinsrw $5,(%rdi,%rax,4),%xmm0 + + addl %r11d,%r10d + movl (%rdi,%rcx,4),%edx + xorl %r11d,%r12d + movl %ebx,(%rdi,%rcx,4) + andl %r10d,%r12d + addl 44(%r15),%r9d + addb %dl,%bl + movl 48(%rsi),%eax + addl $2304563134,%r9d + xorl %r8d,%r12d + movzbl %bl,%ebx + movl %edx,44(%rsi) + addl %r12d,%r9d + addb %al,%cl + roll $22,%r9d + movl %r11d,%r12d + pinsrw $5,(%rdi,%rbx,4),%xmm1 + + addl %r10d,%r9d + movl (%rdi,%rcx,4),%edx + xorl %r10d,%r12d + movl %eax,(%rdi,%rcx,4) + andl %r9d,%r12d + addl 48(%r15),%r8d + addb %dl,%al + movl 52(%rsi),%ebx + addl $1804603682,%r8d + xorl %r11d,%r12d + movzbl %al,%eax + movl %edx,48(%rsi) + addl %r12d,%r8d + addb %bl,%cl + roll $7,%r8d + movl %r10d,%r12d + pinsrw $6,(%rdi,%rax,4),%xmm0 + + addl %r9d,%r8d + movl (%rdi,%rcx,4),%edx + xorl %r9d,%r12d + movl %ebx,(%rdi,%rcx,4) + andl %r8d,%r12d + addl 52(%r15),%r11d + addb %dl,%bl + movl 56(%rsi),%eax + addl $4254626195,%r11d + xorl %r10d,%r12d + movzbl %bl,%ebx + movl %edx,52(%rsi) + addl %r12d,%r11d + addb %al,%cl + roll $12,%r11d + movl %r9d,%r12d + pinsrw $6,(%rdi,%rbx,4),%xmm1 + + addl %r8d,%r11d + movl (%rdi,%rcx,4),%edx + xorl %r8d,%r12d + movl %eax,(%rdi,%rcx,4) + andl %r11d,%r12d + addl 56(%r15),%r10d + addb %dl,%al + movl 60(%rsi),%ebx + addl $2792965006,%r10d + xorl %r9d,%r12d + movzbl %al,%eax + movl %edx,56(%rsi) + addl %r12d,%r10d + addb %bl,%cl + roll $17,%r10d + movl %r8d,%r12d + pinsrw $7,(%rdi,%rax,4),%xmm0 + + addl %r11d,%r10d + movdqu (%r13),%xmm2 + movl (%rdi,%rcx,4),%edx + xorl %r11d,%r12d + movl %ebx,(%rdi,%rcx,4) + andl %r10d,%r12d + addl 60(%r15),%r9d + addb %dl,%bl + movl 64(%rsi),%eax + addl $1236535329,%r9d + xorl %r8d,%r12d + movzbl %bl,%ebx + movl %edx,60(%rsi) + addl %r12d,%r9d + addb %al,%cl + roll $22,%r9d + movl %r10d,%r12d + pinsrw $7,(%rdi,%rbx,4),%xmm1 + + addl %r10d,%r9d + psllq $8,%xmm1 + pxor %xmm0,%xmm2 + pxor %xmm1,%xmm2 + pxor %xmm0,%xmm0 + movl (%rdi,%rcx,4),%edx + xorl %r9d,%r12d + movl %eax,(%rdi,%rcx,4) + andl %r11d,%r12d + addl 4(%r15),%r8d + addb %dl,%al + movl 68(%rsi),%ebx + addl $4129170786,%r8d + xorl %r10d,%r12d + movzbl %al,%eax + movl %edx,64(%rsi) + addl %r12d,%r8d + addb %bl,%cl + roll $5,%r8d + movl %r9d,%r12d + movd (%rdi,%rax,4),%xmm0 + + addl %r9d,%r8d + pxor %xmm1,%xmm1 + movl (%rdi,%rcx,4),%edx + xorl %r8d,%r12d + movl %ebx,(%rdi,%rcx,4) + andl %r10d,%r12d + addl 24(%r15),%r11d + addb %dl,%bl + movl 72(%rsi),%eax + addl $3225465664,%r11d + xorl %r9d,%r12d + movzbl %bl,%ebx + movl %edx,68(%rsi) + addl %r12d,%r11d + addb %al,%cl + roll $9,%r11d + movl %r8d,%r12d + movd (%rdi,%rbx,4),%xmm1 + + addl %r8d,%r11d + movl (%rdi,%rcx,4),%edx + xorl %r11d,%r12d + movl %eax,(%rdi,%rcx,4) + andl %r9d,%r12d + addl 44(%r15),%r10d + addb %dl,%al + movl 76(%rsi),%ebx + addl $643717713,%r10d + xorl %r8d,%r12d + movzbl %al,%eax + movl %edx,72(%rsi) + addl %r12d,%r10d + addb %bl,%cl + roll $14,%r10d + movl %r11d,%r12d + pinsrw $1,(%rdi,%rax,4),%xmm0 + + addl %r11d,%r10d + movl (%rdi,%rcx,4),%edx + xorl %r10d,%r12d + movl %ebx,(%rdi,%rcx,4) + andl %r8d,%r12d + addl 0(%r15),%r9d + addb %dl,%bl + movl 80(%rsi),%eax + addl $3921069994,%r9d + xorl %r11d,%r12d + movzbl %bl,%ebx + movl %edx,76(%rsi) + addl %r12d,%r9d + addb %al,%cl + roll $20,%r9d + movl %r10d,%r12d + pinsrw $1,(%rdi,%rbx,4),%xmm1 + + addl %r10d,%r9d + movl (%rdi,%rcx,4),%edx + xorl %r9d,%r12d + movl %eax,(%rdi,%rcx,4) + andl %r11d,%r12d + addl 20(%r15),%r8d + addb %dl,%al + movl 84(%rsi),%ebx + addl $3593408605,%r8d + xorl %r10d,%r12d + movzbl %al,%eax + movl %edx,80(%rsi) + addl %r12d,%r8d + addb %bl,%cl + roll $5,%r8d + movl %r9d,%r12d + pinsrw $2,(%rdi,%rax,4),%xmm0 + + addl %r9d,%r8d + movl (%rdi,%rcx,4),%edx + xorl %r8d,%r12d + movl %ebx,(%rdi,%rcx,4) + andl %r10d,%r12d + addl 40(%r15),%r11d + addb %dl,%bl + movl 88(%rsi),%eax + addl $38016083,%r11d + xorl %r9d,%r12d + movzbl %bl,%ebx + movl %edx,84(%rsi) + addl %r12d,%r11d + addb %al,%cl + roll $9,%r11d + movl %r8d,%r12d + pinsrw $2,(%rdi,%rbx,4),%xmm1 + + addl %r8d,%r11d + movl (%rdi,%rcx,4),%edx + xorl %r11d,%r12d + movl %eax,(%rdi,%rcx,4) + andl %r9d,%r12d + addl 60(%r15),%r10d + addb %dl,%al + movl 92(%rsi),%ebx + addl $3634488961,%r10d + xorl %r8d,%r12d + movzbl %al,%eax + movl %edx,88(%rsi) + addl %r12d,%r10d + addb %bl,%cl + roll $14,%r10d + movl %r11d,%r12d + pinsrw $3,(%rdi,%rax,4),%xmm0 + + addl %r11d,%r10d + movl (%rdi,%rcx,4),%edx + xorl %r10d,%r12d + movl %ebx,(%rdi,%rcx,4) + andl %r8d,%r12d + addl 16(%r15),%r9d + addb %dl,%bl + movl 96(%rsi),%eax + addl $3889429448,%r9d + xorl %r11d,%r12d + movzbl %bl,%ebx + movl %edx,92(%rsi) + addl %r12d,%r9d + addb %al,%cl + roll $20,%r9d + movl %r10d,%r12d + pinsrw $3,(%rdi,%rbx,4),%xmm1 + + addl %r10d,%r9d + movl (%rdi,%rcx,4),%edx + xorl %r9d,%r12d + movl %eax,(%rdi,%rcx,4) + andl %r11d,%r12d + addl 36(%r15),%r8d + addb %dl,%al + movl 100(%rsi),%ebx + addl $568446438,%r8d + xorl %r10d,%r12d + movzbl %al,%eax + movl %edx,96(%rsi) + addl %r12d,%r8d + addb %bl,%cl + roll $5,%r8d + movl %r9d,%r12d + pinsrw $4,(%rdi,%rax,4),%xmm0 + + addl %r9d,%r8d + movl (%rdi,%rcx,4),%edx + xorl %r8d,%r12d + movl %ebx,(%rdi,%rcx,4) + andl %r10d,%r12d + addl 56(%r15),%r11d + addb %dl,%bl + movl 104(%rsi),%eax + addl $3275163606,%r11d + xorl %r9d,%r12d + movzbl %bl,%ebx + movl %edx,100(%rsi) + addl %r12d,%r11d + addb %al,%cl + roll $9,%r11d + movl %r8d,%r12d + pinsrw $4,(%rdi,%rbx,4),%xmm1 + + addl %r8d,%r11d + movl (%rdi,%rcx,4),%edx + xorl %r11d,%r12d + movl %eax,(%rdi,%rcx,4) + andl %r9d,%r12d + addl 12(%r15),%r10d + addb %dl,%al + movl 108(%rsi),%ebx + addl $4107603335,%r10d + xorl %r8d,%r12d + movzbl %al,%eax + movl %edx,104(%rsi) + addl %r12d,%r10d + addb %bl,%cl + roll $14,%r10d + movl %r11d,%r12d + pinsrw $5,(%rdi,%rax,4),%xmm0 + + addl %r11d,%r10d + movl (%rdi,%rcx,4),%edx + xorl %r10d,%r12d + movl %ebx,(%rdi,%rcx,4) + andl %r8d,%r12d + addl 32(%r15),%r9d + addb %dl,%bl + movl 112(%rsi),%eax + addl $1163531501,%r9d + xorl %r11d,%r12d + movzbl %bl,%ebx + movl %edx,108(%rsi) + addl %r12d,%r9d + addb %al,%cl + roll $20,%r9d + movl %r10d,%r12d + pinsrw $5,(%rdi,%rbx,4),%xmm1 + + addl %r10d,%r9d + movl (%rdi,%rcx,4),%edx + xorl %r9d,%r12d + movl %eax,(%rdi,%rcx,4) + andl %r11d,%r12d + addl 52(%r15),%r8d + addb %dl,%al + movl 116(%rsi),%ebx + addl $2850285829,%r8d + xorl %r10d,%r12d + movzbl %al,%eax + movl %edx,112(%rsi) + addl %r12d,%r8d + addb %bl,%cl + roll $5,%r8d + movl %r9d,%r12d + pinsrw $6,(%rdi,%rax,4),%xmm0 + + addl %r9d,%r8d + movl (%rdi,%rcx,4),%edx + xorl %r8d,%r12d + movl %ebx,(%rdi,%rcx,4) + andl %r10d,%r12d + addl 8(%r15),%r11d + addb %dl,%bl + movl 120(%rsi),%eax + addl $4243563512,%r11d + xorl %r9d,%r12d + movzbl %bl,%ebx + movl %edx,116(%rsi) + addl %r12d,%r11d + addb %al,%cl + roll $9,%r11d + movl %r8d,%r12d + pinsrw $6,(%rdi,%rbx,4),%xmm1 + + addl %r8d,%r11d + movl (%rdi,%rcx,4),%edx + xorl %r11d,%r12d + movl %eax,(%rdi,%rcx,4) + andl %r9d,%r12d + addl 28(%r15),%r10d + addb %dl,%al + movl 124(%rsi),%ebx + addl $1735328473,%r10d + xorl %r8d,%r12d + movzbl %al,%eax + movl %edx,120(%rsi) + addl %r12d,%r10d + addb %bl,%cl + roll $14,%r10d + movl %r11d,%r12d + pinsrw $7,(%rdi,%rax,4),%xmm0 + + addl %r11d,%r10d + movdqu 16(%r13),%xmm3 + addb $32,%bpl + movl (%rdi,%rcx,4),%edx + xorl %r10d,%r12d + movl %ebx,(%rdi,%rcx,4) + andl %r8d,%r12d + addl 48(%r15),%r9d + addb %dl,%bl + movl 0(%rdi,%rbp,4),%eax + addl $2368359562,%r9d + xorl %r11d,%r12d + movzbl %bl,%ebx + movl %edx,124(%rsi) + addl %r12d,%r9d + addb %al,%cl + roll $20,%r9d + movl %r11d,%r12d + pinsrw $7,(%rdi,%rbx,4),%xmm1 + + addl %r10d,%r9d + movq %rcx,%rsi + xorq %rcx,%rcx + movb %sil,%cl + leaq (%rdi,%rbp,4),%rsi + psllq $8,%xmm1 + pxor %xmm0,%xmm3 + pxor %xmm1,%xmm3 + pxor %xmm0,%xmm0 + movl (%rdi,%rcx,4),%edx + xorl %r10d,%r12d + movl %eax,(%rdi,%rcx,4) + xorl %r9d,%r12d + addl 20(%r15),%r8d + addb %dl,%al + movl 4(%rsi),%ebx + addl $4294588738,%r8d + movzbl %al,%eax + addl %r12d,%r8d + movl %edx,0(%rsi) + addb %bl,%cl + roll $4,%r8d + movl %r10d,%r12d + movd (%rdi,%rax,4),%xmm0 + + addl %r9d,%r8d + pxor %xmm1,%xmm1 + movl (%rdi,%rcx,4),%edx + xorl %r9d,%r12d + movl %ebx,(%rdi,%rcx,4) + xorl %r8d,%r12d + addl 32(%r15),%r11d + addb %dl,%bl + movl 8(%rsi),%eax + addl $2272392833,%r11d + movzbl %bl,%ebx + addl %r12d,%r11d + movl %edx,4(%rsi) + addb %al,%cl + roll $11,%r11d + movl %r9d,%r12d + movd (%rdi,%rbx,4),%xmm1 + + addl %r8d,%r11d + movl (%rdi,%rcx,4),%edx + xorl %r8d,%r12d + movl %eax,(%rdi,%rcx,4) + xorl %r11d,%r12d + addl 44(%r15),%r10d + addb %dl,%al + movl 12(%rsi),%ebx + addl $1839030562,%r10d + movzbl %al,%eax + addl %r12d,%r10d + movl %edx,8(%rsi) + addb %bl,%cl + roll $16,%r10d + movl %r8d,%r12d + pinsrw $1,(%rdi,%rax,4),%xmm0 + + addl %r11d,%r10d + movl (%rdi,%rcx,4),%edx + xorl %r11d,%r12d + movl %ebx,(%rdi,%rcx,4) + xorl %r10d,%r12d + addl 56(%r15),%r9d + addb %dl,%bl + movl 16(%rsi),%eax + addl $4259657740,%r9d + movzbl %bl,%ebx + addl %r12d,%r9d + movl %edx,12(%rsi) + addb %al,%cl + roll $23,%r9d + movl %r11d,%r12d + pinsrw $1,(%rdi,%rbx,4),%xmm1 + + addl %r10d,%r9d + movl (%rdi,%rcx,4),%edx + xorl %r10d,%r12d + movl %eax,(%rdi,%rcx,4) + xorl %r9d,%r12d + addl 4(%r15),%r8d + addb %dl,%al + movl 20(%rsi),%ebx + addl $2763975236,%r8d + movzbl %al,%eax + addl %r12d,%r8d + movl %edx,16(%rsi) + addb %bl,%cl + roll $4,%r8d + movl %r10d,%r12d + pinsrw $2,(%rdi,%rax,4),%xmm0 + + addl %r9d,%r8d + movl (%rdi,%rcx,4),%edx + xorl %r9d,%r12d + movl %ebx,(%rdi,%rcx,4) + xorl %r8d,%r12d + addl 16(%r15),%r11d + addb %dl,%bl + movl 24(%rsi),%eax + addl $1272893353,%r11d + movzbl %bl,%ebx + addl %r12d,%r11d + movl %edx,20(%rsi) + addb %al,%cl + roll $11,%r11d + movl %r9d,%r12d + pinsrw $2,(%rdi,%rbx,4),%xmm1 + + addl %r8d,%r11d + movl (%rdi,%rcx,4),%edx + xorl %r8d,%r12d + movl %eax,(%rdi,%rcx,4) + xorl %r11d,%r12d + addl 28(%r15),%r10d + addb %dl,%al + movl 28(%rsi),%ebx + addl $4139469664,%r10d + movzbl %al,%eax + addl %r12d,%r10d + movl %edx,24(%rsi) + addb %bl,%cl + roll $16,%r10d + movl %r8d,%r12d + pinsrw $3,(%rdi,%rax,4),%xmm0 + + addl %r11d,%r10d + movl (%rdi,%rcx,4),%edx + xorl %r11d,%r12d + movl %ebx,(%rdi,%rcx,4) + xorl %r10d,%r12d + addl 40(%r15),%r9d + addb %dl,%bl + movl 32(%rsi),%eax + addl $3200236656,%r9d + movzbl %bl,%ebx + addl %r12d,%r9d + movl %edx,28(%rsi) + addb %al,%cl + roll $23,%r9d + movl %r11d,%r12d + pinsrw $3,(%rdi,%rbx,4),%xmm1 + + addl %r10d,%r9d + movl (%rdi,%rcx,4),%edx + xorl %r10d,%r12d + movl %eax,(%rdi,%rcx,4) + xorl %r9d,%r12d + addl 52(%r15),%r8d + addb %dl,%al + movl 36(%rsi),%ebx + addl $681279174,%r8d + movzbl %al,%eax + addl %r12d,%r8d + movl %edx,32(%rsi) + addb %bl,%cl + roll $4,%r8d + movl %r10d,%r12d + pinsrw $4,(%rdi,%rax,4),%xmm0 + + addl %r9d,%r8d + movl (%rdi,%rcx,4),%edx + xorl %r9d,%r12d + movl %ebx,(%rdi,%rcx,4) + xorl %r8d,%r12d + addl 0(%r15),%r11d + addb %dl,%bl + movl 40(%rsi),%eax + addl $3936430074,%r11d + movzbl %bl,%ebx + addl %r12d,%r11d + movl %edx,36(%rsi) + addb %al,%cl + roll $11,%r11d + movl %r9d,%r12d + pinsrw $4,(%rdi,%rbx,4),%xmm1 + + addl %r8d,%r11d + movl (%rdi,%rcx,4),%edx + xorl %r8d,%r12d + movl %eax,(%rdi,%rcx,4) + xorl %r11d,%r12d + addl 12(%r15),%r10d + addb %dl,%al + movl 44(%rsi),%ebx + addl $3572445317,%r10d + movzbl %al,%eax + addl %r12d,%r10d + movl %edx,40(%rsi) + addb %bl,%cl + roll $16,%r10d + movl %r8d,%r12d + pinsrw $5,(%rdi,%rax,4),%xmm0 + + addl %r11d,%r10d + movl (%rdi,%rcx,4),%edx + xorl %r11d,%r12d + movl %ebx,(%rdi,%rcx,4) + xorl %r10d,%r12d + addl 24(%r15),%r9d + addb %dl,%bl + movl 48(%rsi),%eax + addl $76029189,%r9d + movzbl %bl,%ebx + addl %r12d,%r9d + movl %edx,44(%rsi) + addb %al,%cl + roll $23,%r9d + movl %r11d,%r12d + pinsrw $5,(%rdi,%rbx,4),%xmm1 + + addl %r10d,%r9d + movl (%rdi,%rcx,4),%edx + xorl %r10d,%r12d + movl %eax,(%rdi,%rcx,4) + xorl %r9d,%r12d + addl 36(%r15),%r8d + addb %dl,%al + movl 52(%rsi),%ebx + addl $3654602809,%r8d + movzbl %al,%eax + addl %r12d,%r8d + movl %edx,48(%rsi) + addb %bl,%cl + roll $4,%r8d + movl %r10d,%r12d + pinsrw $6,(%rdi,%rax,4),%xmm0 + + addl %r9d,%r8d + movl (%rdi,%rcx,4),%edx + xorl %r9d,%r12d + movl %ebx,(%rdi,%rcx,4) + xorl %r8d,%r12d + addl 48(%r15),%r11d + addb %dl,%bl + movl 56(%rsi),%eax + addl $3873151461,%r11d + movzbl %bl,%ebx + addl %r12d,%r11d + movl %edx,52(%rsi) + addb %al,%cl + roll $11,%r11d + movl %r9d,%r12d + pinsrw $6,(%rdi,%rbx,4),%xmm1 + + addl %r8d,%r11d + movl (%rdi,%rcx,4),%edx + xorl %r8d,%r12d + movl %eax,(%rdi,%rcx,4) + xorl %r11d,%r12d + addl 60(%r15),%r10d + addb %dl,%al + movl 60(%rsi),%ebx + addl $530742520,%r10d + movzbl %al,%eax + addl %r12d,%r10d + movl %edx,56(%rsi) + addb %bl,%cl + roll $16,%r10d + movl %r8d,%r12d + pinsrw $7,(%rdi,%rax,4),%xmm0 + + addl %r11d,%r10d + movdqu 32(%r13),%xmm4 + movl (%rdi,%rcx,4),%edx + xorl %r11d,%r12d + movl %ebx,(%rdi,%rcx,4) + xorl %r10d,%r12d + addl 8(%r15),%r9d + addb %dl,%bl + movl 64(%rsi),%eax + addl $3299628645,%r9d + movzbl %bl,%ebx + addl %r12d,%r9d + movl %edx,60(%rsi) + addb %al,%cl + roll $23,%r9d + movl $-1,%r12d + pinsrw $7,(%rdi,%rbx,4),%xmm1 + + addl %r10d,%r9d + psllq $8,%xmm1 + pxor %xmm0,%xmm4 + pxor %xmm1,%xmm4 + pxor %xmm0,%xmm0 + movl (%rdi,%rcx,4),%edx + xorl %r11d,%r12d + movl %eax,(%rdi,%rcx,4) + orl %r9d,%r12d + addl 0(%r15),%r8d + addb %dl,%al + movl 68(%rsi),%ebx + addl $4096336452,%r8d + movzbl %al,%eax + xorl %r10d,%r12d + movl %edx,64(%rsi) + addl %r12d,%r8d + addb %bl,%cl + roll $6,%r8d + movl $-1,%r12d + movd (%rdi,%rax,4),%xmm0 + + addl %r9d,%r8d + pxor %xmm1,%xmm1 + movl (%rdi,%rcx,4),%edx + xorl %r10d,%r12d + movl %ebx,(%rdi,%rcx,4) + orl %r8d,%r12d + addl 28(%r15),%r11d + addb %dl,%bl + movl 72(%rsi),%eax + addl $1126891415,%r11d + movzbl %bl,%ebx + xorl %r9d,%r12d + movl %edx,68(%rsi) + addl %r12d,%r11d + addb %al,%cl + roll $10,%r11d + movl $-1,%r12d + movd (%rdi,%rbx,4),%xmm1 + + addl %r8d,%r11d + movl (%rdi,%rcx,4),%edx + xorl %r9d,%r12d + movl %eax,(%rdi,%rcx,4) + orl %r11d,%r12d + addl 56(%r15),%r10d + addb %dl,%al + movl 76(%rsi),%ebx + addl $2878612391,%r10d + movzbl %al,%eax + xorl %r8d,%r12d + movl %edx,72(%rsi) + addl %r12d,%r10d + addb %bl,%cl + roll $15,%r10d + movl $-1,%r12d + pinsrw $1,(%rdi,%rax,4),%xmm0 + + addl %r11d,%r10d + movl (%rdi,%rcx,4),%edx + xorl %r8d,%r12d + movl %ebx,(%rdi,%rcx,4) + orl %r10d,%r12d + addl 20(%r15),%r9d + addb %dl,%bl + movl 80(%rsi),%eax + addl $4237533241,%r9d + movzbl %bl,%ebx + xorl %r11d,%r12d + movl %edx,76(%rsi) + addl %r12d,%r9d + addb %al,%cl + roll $21,%r9d + movl $-1,%r12d + pinsrw $1,(%rdi,%rbx,4),%xmm1 + + addl %r10d,%r9d + movl (%rdi,%rcx,4),%edx + xorl %r11d,%r12d + movl %eax,(%rdi,%rcx,4) + orl %r9d,%r12d + addl 48(%r15),%r8d + addb %dl,%al + movl 84(%rsi),%ebx + addl $1700485571,%r8d + movzbl %al,%eax + xorl %r10d,%r12d + movl %edx,80(%rsi) + addl %r12d,%r8d + addb %bl,%cl + roll $6,%r8d + movl $-1,%r12d + pinsrw $2,(%rdi,%rax,4),%xmm0 + + addl %r9d,%r8d + movl (%rdi,%rcx,4),%edx + xorl %r10d,%r12d + movl %ebx,(%rdi,%rcx,4) + orl %r8d,%r12d + addl 12(%r15),%r11d + addb %dl,%bl + movl 88(%rsi),%eax + addl $2399980690,%r11d + movzbl %bl,%ebx + xorl %r9d,%r12d + movl %edx,84(%rsi) + addl %r12d,%r11d + addb %al,%cl + roll $10,%r11d + movl $-1,%r12d + pinsrw $2,(%rdi,%rbx,4),%xmm1 + + addl %r8d,%r11d + movl (%rdi,%rcx,4),%edx + xorl %r9d,%r12d + movl %eax,(%rdi,%rcx,4) + orl %r11d,%r12d + addl 40(%r15),%r10d + addb %dl,%al + movl 92(%rsi),%ebx + addl $4293915773,%r10d + movzbl %al,%eax + xorl %r8d,%r12d + movl %edx,88(%rsi) + addl %r12d,%r10d + addb %bl,%cl + roll $15,%r10d + movl $-1,%r12d + pinsrw $3,(%rdi,%rax,4),%xmm0 + + addl %r11d,%r10d + movl (%rdi,%rcx,4),%edx + xorl %r8d,%r12d + movl %ebx,(%rdi,%rcx,4) + orl %r10d,%r12d + addl 4(%r15),%r9d + addb %dl,%bl + movl 96(%rsi),%eax + addl $2240044497,%r9d + movzbl %bl,%ebx + xorl %r11d,%r12d + movl %edx,92(%rsi) + addl %r12d,%r9d + addb %al,%cl + roll $21,%r9d + movl $-1,%r12d + pinsrw $3,(%rdi,%rbx,4),%xmm1 + + addl %r10d,%r9d + movl (%rdi,%rcx,4),%edx + xorl %r11d,%r12d + movl %eax,(%rdi,%rcx,4) + orl %r9d,%r12d + addl 32(%r15),%r8d + addb %dl,%al + movl 100(%rsi),%ebx + addl $1873313359,%r8d + movzbl %al,%eax + xorl %r10d,%r12d + movl %edx,96(%rsi) + addl %r12d,%r8d + addb %bl,%cl + roll $6,%r8d + movl $-1,%r12d + pinsrw $4,(%rdi,%rax,4),%xmm0 + + addl %r9d,%r8d + movl (%rdi,%rcx,4),%edx + xorl %r10d,%r12d + movl %ebx,(%rdi,%rcx,4) + orl %r8d,%r12d + addl 60(%r15),%r11d + addb %dl,%bl + movl 104(%rsi),%eax + addl $4264355552,%r11d + movzbl %bl,%ebx + xorl %r9d,%r12d + movl %edx,100(%rsi) + addl %r12d,%r11d + addb %al,%cl + roll $10,%r11d + movl $-1,%r12d + pinsrw $4,(%rdi,%rbx,4),%xmm1 + + addl %r8d,%r11d + movl (%rdi,%rcx,4),%edx + xorl %r9d,%r12d + movl %eax,(%rdi,%rcx,4) + orl %r11d,%r12d + addl 24(%r15),%r10d + addb %dl,%al + movl 108(%rsi),%ebx + addl $2734768916,%r10d + movzbl %al,%eax + xorl %r8d,%r12d + movl %edx,104(%rsi) + addl %r12d,%r10d + addb %bl,%cl + roll $15,%r10d + movl $-1,%r12d + pinsrw $5,(%rdi,%rax,4),%xmm0 + + addl %r11d,%r10d + movl (%rdi,%rcx,4),%edx + xorl %r8d,%r12d + movl %ebx,(%rdi,%rcx,4) + orl %r10d,%r12d + addl 52(%r15),%r9d + addb %dl,%bl + movl 112(%rsi),%eax + addl $1309151649,%r9d + movzbl %bl,%ebx + xorl %r11d,%r12d + movl %edx,108(%rsi) + addl %r12d,%r9d + addb %al,%cl + roll $21,%r9d + movl $-1,%r12d + pinsrw $5,(%rdi,%rbx,4),%xmm1 + + addl %r10d,%r9d + movl (%rdi,%rcx,4),%edx + xorl %r11d,%r12d + movl %eax,(%rdi,%rcx,4) + orl %r9d,%r12d + addl 16(%r15),%r8d + addb %dl,%al + movl 116(%rsi),%ebx + addl $4149444226,%r8d + movzbl %al,%eax + xorl %r10d,%r12d + movl %edx,112(%rsi) + addl %r12d,%r8d + addb %bl,%cl + roll $6,%r8d + movl $-1,%r12d + pinsrw $6,(%rdi,%rax,4),%xmm0 + + addl %r9d,%r8d + movl (%rdi,%rcx,4),%edx + xorl %r10d,%r12d + movl %ebx,(%rdi,%rcx,4) + orl %r8d,%r12d + addl 44(%r15),%r11d + addb %dl,%bl + movl 120(%rsi),%eax + addl $3174756917,%r11d + movzbl %bl,%ebx + xorl %r9d,%r12d + movl %edx,116(%rsi) + addl %r12d,%r11d + addb %al,%cl + roll $10,%r11d + movl $-1,%r12d + pinsrw $6,(%rdi,%rbx,4),%xmm1 + + addl %r8d,%r11d + movl (%rdi,%rcx,4),%edx + xorl %r9d,%r12d + movl %eax,(%rdi,%rcx,4) + orl %r11d,%r12d + addl 8(%r15),%r10d + addb %dl,%al + movl 124(%rsi),%ebx + addl $718787259,%r10d + movzbl %al,%eax + xorl %r8d,%r12d + movl %edx,120(%rsi) + addl %r12d,%r10d + addb %bl,%cl + roll $15,%r10d + movl $-1,%r12d + pinsrw $7,(%rdi,%rax,4),%xmm0 + + addl %r11d,%r10d + movdqu 48(%r13),%xmm5 + addb $32,%bpl + movl (%rdi,%rcx,4),%edx + xorl %r8d,%r12d + movl %ebx,(%rdi,%rcx,4) + orl %r10d,%r12d + addl 36(%r15),%r9d + addb %dl,%bl + movl 0(%rdi,%rbp,4),%eax + addl $3951481745,%r9d + movzbl %bl,%ebx + xorl %r11d,%r12d + movl %edx,124(%rsi) + addl %r12d,%r9d + addb %al,%cl + roll $21,%r9d + movl $-1,%r12d + pinsrw $7,(%rdi,%rbx,4),%xmm1 + + addl %r10d,%r9d + movq %rbp,%rsi + xorq %rbp,%rbp + movb %sil,%bpl + movq %rcx,%rsi + xorq %rcx,%rcx + movb %sil,%cl + leaq (%rdi,%rbp,4),%rsi + psllq $8,%xmm1 + pxor %xmm0,%xmm5 + pxor %xmm1,%xmm5 + addl 0(%rsp),%r8d + addl 4(%rsp),%r9d + addl 8(%rsp),%r10d + addl 12(%rsp),%r11d + + movdqu %xmm2,(%r14,%r13,1) + movdqu %xmm3,16(%r14,%r13,1) + movdqu %xmm4,32(%r14,%r13,1) + movdqu %xmm5,48(%r14,%r13,1) + leaq 64(%r15),%r15 + leaq 64(%r13),%r13 + cmpq 16(%rsp),%r15 + jb .Loop + + movq 24(%rsp),%r12 + subb %al,%cl + movl %r8d,0(%r12) + movl %r9d,4(%r12) + movl %r10d,8(%r12) + movl %r11d,12(%r12) + subb $1,%bpl + movl %ebp,-8(%rdi) + movl %ecx,-4(%rdi) + + movq 40(%rsp),%r15 + movq 48(%rsp),%r14 + movq 56(%rsp),%r13 + movq 64(%rsp),%r12 + movq 72(%rsp),%rbp + movq 80(%rsp),%rbx + leaq 88(%rsp),%rsp +.Lepilogue: +.Labort: + .byte 0xf3,0xc3 +.size rc4_md5_enc,.-rc4_md5_enc diff --git a/secure/lib/libcrypto/amd64/rc4-x86_64.S b/secure/lib/libcrypto/amd64/rc4-x86_64.S new file mode 100644 index 0000000..c51ca89 --- /dev/null +++ b/secure/lib/libcrypto/amd64/rc4-x86_64.S @@ -0,0 +1,616 @@ + # $FreeBSD$ +.text + + +.globl RC4 +.type RC4,@function +.align 16 +RC4: orq %rsi,%rsi + jne .Lentry + .byte 0xf3,0xc3 +.Lentry: + pushq %rbx + pushq %r12 + pushq %r13 +.Lprologue: + movq %rsi,%r11 + movq %rdx,%r12 + movq %rcx,%r13 + xorq %r10,%r10 + xorq %rcx,%rcx + + leaq 8(%rdi),%rdi + movb -8(%rdi),%r10b + movb -4(%rdi),%cl + cmpl $-1,256(%rdi) + je .LRC4_CHAR + movl OPENSSL_ia32cap_P(%rip),%r8d + xorq %rbx,%rbx + incb %r10b + subq %r10,%rbx + subq %r12,%r13 + movl (%rdi,%r10,4),%eax + testq $-16,%r11 + jz .Lloop1 + btl $30,%r8d + jc .Lintel + andq $7,%rbx + leaq 1(%r10),%rsi + jz .Loop8 + subq %rbx,%r11 +.Loop8_warmup: + addb %al,%cl + movl (%rdi,%rcx,4),%edx + movl %eax,(%rdi,%rcx,4) + movl %edx,(%rdi,%r10,4) + addb %dl,%al + incb %r10b + movl (%rdi,%rax,4),%edx + movl (%rdi,%r10,4),%eax + xorb (%r12),%dl + movb %dl,(%r12,%r13,1) + leaq 1(%r12),%r12 + decq %rbx + jnz .Loop8_warmup + + leaq 1(%r10),%rsi + jmp .Loop8 +.align 16 +.Loop8: + addb %al,%cl + movl (%rdi,%rcx,4),%edx + movl %eax,(%rdi,%rcx,4) + movl 0(%rdi,%rsi,4),%ebx + rorq $8,%r8 + movl %edx,0(%rdi,%r10,4) + addb %al,%dl + movb (%rdi,%rdx,4),%r8b + addb %bl,%cl + movl (%rdi,%rcx,4),%edx + movl %ebx,(%rdi,%rcx,4) + movl 4(%rdi,%rsi,4),%eax + rorq $8,%r8 + movl %edx,4(%rdi,%r10,4) + addb %bl,%dl + movb (%rdi,%rdx,4),%r8b + addb %al,%cl + movl (%rdi,%rcx,4),%edx + movl %eax,(%rdi,%rcx,4) + movl 8(%rdi,%rsi,4),%ebx + rorq $8,%r8 + movl %edx,8(%rdi,%r10,4) + addb %al,%dl + movb (%rdi,%rdx,4),%r8b + addb %bl,%cl + movl (%rdi,%rcx,4),%edx + movl %ebx,(%rdi,%rcx,4) + movl 12(%rdi,%rsi,4),%eax + rorq $8,%r8 + movl %edx,12(%rdi,%r10,4) + addb %bl,%dl + movb (%rdi,%rdx,4),%r8b + addb %al,%cl + movl (%rdi,%rcx,4),%edx + movl %eax,(%rdi,%rcx,4) + movl 16(%rdi,%rsi,4),%ebx + rorq $8,%r8 + movl %edx,16(%rdi,%r10,4) + addb %al,%dl + movb (%rdi,%rdx,4),%r8b + addb %bl,%cl + movl (%rdi,%rcx,4),%edx + movl %ebx,(%rdi,%rcx,4) + movl 20(%rdi,%rsi,4),%eax + rorq $8,%r8 + movl %edx,20(%rdi,%r10,4) + addb %bl,%dl + movb (%rdi,%rdx,4),%r8b + addb %al,%cl + movl (%rdi,%rcx,4),%edx + movl %eax,(%rdi,%rcx,4) + movl 24(%rdi,%rsi,4),%ebx + rorq $8,%r8 + movl %edx,24(%rdi,%r10,4) + addb %al,%dl + movb (%rdi,%rdx,4),%r8b + addb $8,%sil + addb %bl,%cl + movl (%rdi,%rcx,4),%edx + movl %ebx,(%rdi,%rcx,4) + movl -4(%rdi,%rsi,4),%eax + rorq $8,%r8 + movl %edx,28(%rdi,%r10,4) + addb %bl,%dl + movb (%rdi,%rdx,4),%r8b + addb $8,%r10b + rorq $8,%r8 + subq $8,%r11 + + xorq (%r12),%r8 + movq %r8,(%r12,%r13,1) + leaq 8(%r12),%r12 + + testq $-8,%r11 + jnz .Loop8 + cmpq $0,%r11 + jne .Lloop1 + jmp .Lexit + +.align 16 +.Lintel: + testq $-32,%r11 + jz .Lloop1 + andq $15,%rbx + jz .Loop16_is_hot + subq %rbx,%r11 +.Loop16_warmup: + addb %al,%cl + movl (%rdi,%rcx,4),%edx + movl %eax,(%rdi,%rcx,4) + movl %edx,(%rdi,%r10,4) + addb %dl,%al + incb %r10b + movl (%rdi,%rax,4),%edx + movl (%rdi,%r10,4),%eax + xorb (%r12),%dl + movb %dl,(%r12,%r13,1) + leaq 1(%r12),%r12 + decq %rbx + jnz .Loop16_warmup + + movq %rcx,%rbx + xorq %rcx,%rcx + movb %bl,%cl + +.Loop16_is_hot: + leaq (%rdi,%r10,4),%rsi + addb %al,%cl + movl (%rdi,%rcx,4),%edx + pxor %xmm0,%xmm0 + movl %eax,(%rdi,%rcx,4) + addb %dl,%al + movl 4(%rsi),%ebx + movzbl %al,%eax + movl %edx,0(%rsi) + addb %bl,%cl + pinsrw $0,(%rdi,%rax,4),%xmm0 + jmp .Loop16_enter +.align 16 +.Loop16: + addb %al,%cl + movl (%rdi,%rcx,4),%edx + pxor %xmm0,%xmm2 + psllq $8,%xmm1 + pxor %xmm0,%xmm0 + movl %eax,(%rdi,%rcx,4) + addb %dl,%al + movl 4(%rsi),%ebx + movzbl %al,%eax + movl %edx,0(%rsi) + pxor %xmm1,%xmm2 + addb %bl,%cl + pinsrw $0,(%rdi,%rax,4),%xmm0 + movdqu %xmm2,(%r12,%r13,1) + leaq 16(%r12),%r12 +.Loop16_enter: + movl (%rdi,%rcx,4),%edx + pxor %xmm1,%xmm1 + movl %ebx,(%rdi,%rcx,4) + addb %dl,%bl + movl 8(%rsi),%eax + movzbl %bl,%ebx + movl %edx,4(%rsi) + addb %al,%cl + pinsrw $0,(%rdi,%rbx,4),%xmm1 + movl (%rdi,%rcx,4),%edx + movl %eax,(%rdi,%rcx,4) + addb %dl,%al + movl 12(%rsi),%ebx + movzbl %al,%eax + movl %edx,8(%rsi) + addb %bl,%cl + pinsrw $1,(%rdi,%rax,4),%xmm0 + movl (%rdi,%rcx,4),%edx + movl %ebx,(%rdi,%rcx,4) + addb %dl,%bl + movl 16(%rsi),%eax + movzbl %bl,%ebx + movl %edx,12(%rsi) + addb %al,%cl + pinsrw $1,(%rdi,%rbx,4),%xmm1 + movl (%rdi,%rcx,4),%edx + movl %eax,(%rdi,%rcx,4) + addb %dl,%al + movl 20(%rsi),%ebx + movzbl %al,%eax + movl %edx,16(%rsi) + addb %bl,%cl + pinsrw $2,(%rdi,%rax,4),%xmm0 + movl (%rdi,%rcx,4),%edx + movl %ebx,(%rdi,%rcx,4) + addb %dl,%bl + movl 24(%rsi),%eax + movzbl %bl,%ebx + movl %edx,20(%rsi) + addb %al,%cl + pinsrw $2,(%rdi,%rbx,4),%xmm1 + movl (%rdi,%rcx,4),%edx + movl %eax,(%rdi,%rcx,4) + addb %dl,%al + movl 28(%rsi),%ebx + movzbl %al,%eax + movl %edx,24(%rsi) + addb %bl,%cl + pinsrw $3,(%rdi,%rax,4),%xmm0 + movl (%rdi,%rcx,4),%edx + movl %ebx,(%rdi,%rcx,4) + addb %dl,%bl + movl 32(%rsi),%eax + movzbl %bl,%ebx + movl %edx,28(%rsi) + addb %al,%cl + pinsrw $3,(%rdi,%rbx,4),%xmm1 + movl (%rdi,%rcx,4),%edx + movl %eax,(%rdi,%rcx,4) + addb %dl,%al + movl 36(%rsi),%ebx + movzbl %al,%eax + movl %edx,32(%rsi) + addb %bl,%cl + pinsrw $4,(%rdi,%rax,4),%xmm0 + movl (%rdi,%rcx,4),%edx + movl %ebx,(%rdi,%rcx,4) + addb %dl,%bl + movl 40(%rsi),%eax + movzbl %bl,%ebx + movl %edx,36(%rsi) + addb %al,%cl + pinsrw $4,(%rdi,%rbx,4),%xmm1 + movl (%rdi,%rcx,4),%edx + movl %eax,(%rdi,%rcx,4) + addb %dl,%al + movl 44(%rsi),%ebx + movzbl %al,%eax + movl %edx,40(%rsi) + addb %bl,%cl + pinsrw $5,(%rdi,%rax,4),%xmm0 + movl (%rdi,%rcx,4),%edx + movl %ebx,(%rdi,%rcx,4) + addb %dl,%bl + movl 48(%rsi),%eax + movzbl %bl,%ebx + movl %edx,44(%rsi) + addb %al,%cl + pinsrw $5,(%rdi,%rbx,4),%xmm1 + movl (%rdi,%rcx,4),%edx + movl %eax,(%rdi,%rcx,4) + addb %dl,%al + movl 52(%rsi),%ebx + movzbl %al,%eax + movl %edx,48(%rsi) + addb %bl,%cl + pinsrw $6,(%rdi,%rax,4),%xmm0 + movl (%rdi,%rcx,4),%edx + movl %ebx,(%rdi,%rcx,4) + addb %dl,%bl + movl 56(%rsi),%eax + movzbl %bl,%ebx + movl %edx,52(%rsi) + addb %al,%cl + pinsrw $6,(%rdi,%rbx,4),%xmm1 + movl (%rdi,%rcx,4),%edx + movl %eax,(%rdi,%rcx,4) + addb %dl,%al + movl 60(%rsi),%ebx + movzbl %al,%eax + movl %edx,56(%rsi) + addb %bl,%cl + pinsrw $7,(%rdi,%rax,4),%xmm0 + addb $16,%r10b + movdqu (%r12),%xmm2 + movl (%rdi,%rcx,4),%edx + movl %ebx,(%rdi,%rcx,4) + addb %dl,%bl + movzbl %bl,%ebx + movl %edx,60(%rsi) + leaq (%rdi,%r10,4),%rsi + pinsrw $7,(%rdi,%rbx,4),%xmm1 + movl (%rsi),%eax + movq %rcx,%rbx + xorq %rcx,%rcx + subq $16,%r11 + movb %bl,%cl + testq $-16,%r11 + jnz .Loop16 + + psllq $8,%xmm1 + pxor %xmm0,%xmm2 + pxor %xmm1,%xmm2 + movdqu %xmm2,(%r12,%r13,1) + leaq 16(%r12),%r12 + + cmpq $0,%r11 + jne .Lloop1 + jmp .Lexit + +.align 16 +.Lloop1: + addb %al,%cl + movl (%rdi,%rcx,4),%edx + movl %eax,(%rdi,%rcx,4) + movl %edx,(%rdi,%r10,4) + addb %dl,%al + incb %r10b + movl (%rdi,%rax,4),%edx + movl (%rdi,%r10,4),%eax + xorb (%r12),%dl + movb %dl,(%r12,%r13,1) + leaq 1(%r12),%r12 + decq %r11 + jnz .Lloop1 + jmp .Lexit + +.align 16 +.LRC4_CHAR: + addb $1,%r10b + movzbl (%rdi,%r10,1),%eax + testq $-8,%r11 + jz .Lcloop1 + jmp .Lcloop8 +.align 16 +.Lcloop8: + movl (%r12),%r8d + movl 4(%r12),%r9d + addb %al,%cl + leaq 1(%r10),%rsi + movzbl (%rdi,%rcx,1),%edx + movzbl %sil,%esi + movzbl (%rdi,%rsi,1),%ebx + movb %al,(%rdi,%rcx,1) + cmpq %rsi,%rcx + movb %dl,(%rdi,%r10,1) + jne .Lcmov0 + movq %rax,%rbx +.Lcmov0: + addb %al,%dl + xorb (%rdi,%rdx,1),%r8b + rorl $8,%r8d + addb %bl,%cl + leaq 1(%rsi),%r10 + movzbl (%rdi,%rcx,1),%edx + movzbl %r10b,%r10d + movzbl (%rdi,%r10,1),%eax + movb %bl,(%rdi,%rcx,1) + cmpq %r10,%rcx + movb %dl,(%rdi,%rsi,1) + jne .Lcmov1 + movq %rbx,%rax +.Lcmov1: + addb %bl,%dl + xorb (%rdi,%rdx,1),%r8b + rorl $8,%r8d + addb %al,%cl + leaq 1(%r10),%rsi + movzbl (%rdi,%rcx,1),%edx + movzbl %sil,%esi + movzbl (%rdi,%rsi,1),%ebx + movb %al,(%rdi,%rcx,1) + cmpq %rsi,%rcx + movb %dl,(%rdi,%r10,1) + jne .Lcmov2 + movq %rax,%rbx +.Lcmov2: + addb %al,%dl + xorb (%rdi,%rdx,1),%r8b + rorl $8,%r8d + addb %bl,%cl + leaq 1(%rsi),%r10 + movzbl (%rdi,%rcx,1),%edx + movzbl %r10b,%r10d + movzbl (%rdi,%r10,1),%eax + movb %bl,(%rdi,%rcx,1) + cmpq %r10,%rcx + movb %dl,(%rdi,%rsi,1) + jne .Lcmov3 + movq %rbx,%rax +.Lcmov3: + addb %bl,%dl + xorb (%rdi,%rdx,1),%r8b + rorl $8,%r8d + addb %al,%cl + leaq 1(%r10),%rsi + movzbl (%rdi,%rcx,1),%edx + movzbl %sil,%esi + movzbl (%rdi,%rsi,1),%ebx + movb %al,(%rdi,%rcx,1) + cmpq %rsi,%rcx + movb %dl,(%rdi,%r10,1) + jne .Lcmov4 + movq %rax,%rbx +.Lcmov4: + addb %al,%dl + xorb (%rdi,%rdx,1),%r9b + rorl $8,%r9d + addb %bl,%cl + leaq 1(%rsi),%r10 + movzbl (%rdi,%rcx,1),%edx + movzbl %r10b,%r10d + movzbl (%rdi,%r10,1),%eax + movb %bl,(%rdi,%rcx,1) + cmpq %r10,%rcx + movb %dl,(%rdi,%rsi,1) + jne .Lcmov5 + movq %rbx,%rax +.Lcmov5: + addb %bl,%dl + xorb (%rdi,%rdx,1),%r9b + rorl $8,%r9d + addb %al,%cl + leaq 1(%r10),%rsi + movzbl (%rdi,%rcx,1),%edx + movzbl %sil,%esi + movzbl (%rdi,%rsi,1),%ebx + movb %al,(%rdi,%rcx,1) + cmpq %rsi,%rcx + movb %dl,(%rdi,%r10,1) + jne .Lcmov6 + movq %rax,%rbx +.Lcmov6: + addb %al,%dl + xorb (%rdi,%rdx,1),%r9b + rorl $8,%r9d + addb %bl,%cl + leaq 1(%rsi),%r10 + movzbl (%rdi,%rcx,1),%edx + movzbl %r10b,%r10d + movzbl (%rdi,%r10,1),%eax + movb %bl,(%rdi,%rcx,1) + cmpq %r10,%rcx + movb %dl,(%rdi,%rsi,1) + jne .Lcmov7 + movq %rbx,%rax +.Lcmov7: + addb %bl,%dl + xorb (%rdi,%rdx,1),%r9b + rorl $8,%r9d + leaq -8(%r11),%r11 + movl %r8d,(%r13) + leaq 8(%r12),%r12 + movl %r9d,4(%r13) + leaq 8(%r13),%r13 + + testq $-8,%r11 + jnz .Lcloop8 + cmpq $0,%r11 + jne .Lcloop1 + jmp .Lexit +.align 16 +.Lcloop1: + addb %al,%cl + movzbl %cl,%ecx + movzbl (%rdi,%rcx,1),%edx + movb %al,(%rdi,%rcx,1) + movb %dl,(%rdi,%r10,1) + addb %al,%dl + addb $1,%r10b + movzbl %dl,%edx + movzbl %r10b,%r10d + movzbl (%rdi,%rdx,1),%edx + movzbl (%rdi,%r10,1),%eax + xorb (%r12),%dl + leaq 1(%r12),%r12 + movb %dl,(%r13) + leaq 1(%r13),%r13 + subq $1,%r11 + jnz .Lcloop1 + jmp .Lexit + +.align 16 +.Lexit: + subb $1,%r10b + movl %r10d,-8(%rdi) + movl %ecx,-4(%rdi) + + movq (%rsp),%r13 + movq 8(%rsp),%r12 + movq 16(%rsp),%rbx + addq $24,%rsp +.Lepilogue: + .byte 0xf3,0xc3 +.size RC4,.-RC4 +.globl private_RC4_set_key +.type private_RC4_set_key,@function +.align 16 +private_RC4_set_key: + leaq 8(%rdi),%rdi + leaq (%rdx,%rsi,1),%rdx + negq %rsi + movq %rsi,%rcx + xorl %eax,%eax + xorq %r9,%r9 + xorq %r10,%r10 + xorq %r11,%r11 + + movl OPENSSL_ia32cap_P(%rip),%r8d + btl $20,%r8d + jc .Lc1stloop + jmp .Lw1stloop + +.align 16 +.Lw1stloop: + movl %eax,(%rdi,%rax,4) + addb $1,%al + jnc .Lw1stloop + + xorq %r9,%r9 + xorq %r8,%r8 +.align 16 +.Lw2ndloop: + movl (%rdi,%r9,4),%r10d + addb (%rdx,%rsi,1),%r8b + addb %r10b,%r8b + addq $1,%rsi + movl (%rdi,%r8,4),%r11d + cmovzq %rcx,%rsi + movl %r10d,(%rdi,%r8,4) + movl %r11d,(%rdi,%r9,4) + addb $1,%r9b + jnc .Lw2ndloop + jmp .Lexit_key + +.align 16 +.Lc1stloop: + movb %al,(%rdi,%rax,1) + addb $1,%al + jnc .Lc1stloop + + xorq %r9,%r9 + xorq %r8,%r8 +.align 16 +.Lc2ndloop: + movb (%rdi,%r9,1),%r10b + addb (%rdx,%rsi,1),%r8b + addb %r10b,%r8b + addq $1,%rsi + movb (%rdi,%r8,1),%r11b + jnz .Lcnowrap + movq %rcx,%rsi +.Lcnowrap: + movb %r10b,(%rdi,%r8,1) + movb %r11b,(%rdi,%r9,1) + addb $1,%r9b + jnc .Lc2ndloop + movl $-1,256(%rdi) + +.align 16 +.Lexit_key: + xorl %eax,%eax + movl %eax,-8(%rdi) + movl %eax,-4(%rdi) + .byte 0xf3,0xc3 +.size private_RC4_set_key,.-private_RC4_set_key + +.globl RC4_options +.type RC4_options,@function +.align 16 +RC4_options: + leaq .Lopts(%rip),%rax + movl OPENSSL_ia32cap_P(%rip),%edx + btl $20,%edx + jc .L8xchar + btl $30,%edx + jnc .Ldone + addq $25,%rax + .byte 0xf3,0xc3 +.L8xchar: + addq $12,%rax +.Ldone: + .byte 0xf3,0xc3 +.align 64 +.Lopts: +.byte 114,99,52,40,56,120,44,105,110,116,41,0 +.byte 114,99,52,40,56,120,44,99,104,97,114,41,0 +.byte 114,99,52,40,49,54,120,44,105,110,116,41,0 +.byte 82,67,52,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 64 +.size RC4_options,.-RC4_options diff --git a/secure/lib/libcrypto/amd64/rsaz-avx2.S b/secure/lib/libcrypto/amd64/rsaz-avx2.S new file mode 100644 index 0000000..ba13765 --- /dev/null +++ b/secure/lib/libcrypto/amd64/rsaz-avx2.S @@ -0,0 +1,26 @@ + # $FreeBSD$ +.text + +.globl rsaz_avx2_eligible +.type rsaz_avx2_eligible,@function +rsaz_avx2_eligible: + xorl %eax,%eax + .byte 0xf3,0xc3 +.size rsaz_avx2_eligible,.-rsaz_avx2_eligible + +.globl rsaz_1024_sqr_avx2 +.globl rsaz_1024_mul_avx2 +.globl rsaz_1024_norm2red_avx2 +.globl rsaz_1024_red2norm_avx2 +.globl rsaz_1024_scatter5_avx2 +.globl rsaz_1024_gather5_avx2 +.type rsaz_1024_sqr_avx2,@function +rsaz_1024_sqr_avx2: +rsaz_1024_mul_avx2: +rsaz_1024_norm2red_avx2: +rsaz_1024_red2norm_avx2: +rsaz_1024_scatter5_avx2: +rsaz_1024_gather5_avx2: +.byte 0x0f,0x0b + .byte 0xf3,0xc3 +.size rsaz_1024_sqr_avx2,.-rsaz_1024_sqr_avx2 diff --git a/secure/lib/libcrypto/amd64/rsaz-x86_64.S b/secure/lib/libcrypto/amd64/rsaz-x86_64.S new file mode 100644 index 0000000..efd229a --- /dev/null +++ b/secure/lib/libcrypto/amd64/rsaz-x86_64.S @@ -0,0 +1,1118 @@ + # $FreeBSD$ +.text + + + +.globl rsaz_512_sqr +.type rsaz_512_sqr,@function +.align 32 +rsaz_512_sqr: + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + subq $128+24,%rsp +.Lsqr_body: + movq %rdx,%rbp + movq (%rsi),%rdx + movq 8(%rsi),%rax + movq %rcx,128(%rsp) + jmp .Loop_sqr + +.align 32 +.Loop_sqr: + movl %r8d,128+8(%rsp) + + movq %rdx,%rbx + mulq %rdx + movq %rax,%r8 + movq 16(%rsi),%rax + movq %rdx,%r9 + + mulq %rbx + addq %rax,%r9 + movq 24(%rsi),%rax + movq %rdx,%r10 + adcq $0,%r10 + + mulq %rbx + addq %rax,%r10 + movq 32(%rsi),%rax + movq %rdx,%r11 + adcq $0,%r11 + + mulq %rbx + addq %rax,%r11 + movq 40(%rsi),%rax + movq %rdx,%r12 + adcq $0,%r12 + + mulq %rbx + addq %rax,%r12 + movq 48(%rsi),%rax + movq %rdx,%r13 + adcq $0,%r13 + + mulq %rbx + addq %rax,%r13 + movq 56(%rsi),%rax + movq %rdx,%r14 + adcq $0,%r14 + + mulq %rbx + addq %rax,%r14 + movq %rbx,%rax + movq %rdx,%r15 + adcq $0,%r15 + + addq %r8,%r8 + movq %r9,%rcx + adcq %r9,%r9 + + mulq %rax + movq %rax,(%rsp) + addq %rdx,%r8 + adcq $0,%r9 + + movq %r8,8(%rsp) + shrq $63,%rcx + + + movq 8(%rsi),%r8 + movq 16(%rsi),%rax + mulq %r8 + addq %rax,%r10 + movq 24(%rsi),%rax + movq %rdx,%rbx + adcq $0,%rbx + + mulq %r8 + addq %rax,%r11 + movq 32(%rsi),%rax + adcq $0,%rdx + addq %rbx,%r11 + movq %rdx,%rbx + adcq $0,%rbx + + mulq %r8 + addq %rax,%r12 + movq 40(%rsi),%rax + adcq $0,%rdx + addq %rbx,%r12 + movq %rdx,%rbx + adcq $0,%rbx + + mulq %r8 + addq %rax,%r13 + movq 48(%rsi),%rax + adcq $0,%rdx + addq %rbx,%r13 + movq %rdx,%rbx + adcq $0,%rbx + + mulq %r8 + addq %rax,%r14 + movq 56(%rsi),%rax + adcq $0,%rdx + addq %rbx,%r14 + movq %rdx,%rbx + adcq $0,%rbx + + mulq %r8 + addq %rax,%r15 + movq %r8,%rax + adcq $0,%rdx + addq %rbx,%r15 + movq %rdx,%r8 + movq %r10,%rdx + adcq $0,%r8 + + addq %rdx,%rdx + leaq (%rcx,%r10,2),%r10 + movq %r11,%rbx + adcq %r11,%r11 + + mulq %rax + addq %rax,%r9 + adcq %rdx,%r10 + adcq $0,%r11 + + movq %r9,16(%rsp) + movq %r10,24(%rsp) + shrq $63,%rbx + + + movq 16(%rsi),%r9 + movq 24(%rsi),%rax + mulq %r9 + addq %rax,%r12 + movq 32(%rsi),%rax + movq %rdx,%rcx + adcq $0,%rcx + + mulq %r9 + addq %rax,%r13 + movq 40(%rsi),%rax + adcq $0,%rdx + addq %rcx,%r13 + movq %rdx,%rcx + adcq $0,%rcx + + mulq %r9 + addq %rax,%r14 + movq 48(%rsi),%rax + adcq $0,%rdx + addq %rcx,%r14 + movq %rdx,%rcx + adcq $0,%rcx + + mulq %r9 + movq %r12,%r10 + leaq (%rbx,%r12,2),%r12 + addq %rax,%r15 + movq 56(%rsi),%rax + adcq $0,%rdx + addq %rcx,%r15 + movq %rdx,%rcx + adcq $0,%rcx + + mulq %r9 + shrq $63,%r10 + addq %rax,%r8 + movq %r9,%rax + adcq $0,%rdx + addq %rcx,%r8 + movq %rdx,%r9 + adcq $0,%r9 + + movq %r13,%rcx + leaq (%r10,%r13,2),%r13 + + mulq %rax + addq %rax,%r11 + adcq %rdx,%r12 + adcq $0,%r13 + + movq %r11,32(%rsp) + movq %r12,40(%rsp) + shrq $63,%rcx + + + movq 24(%rsi),%r10 + movq 32(%rsi),%rax + mulq %r10 + addq %rax,%r14 + movq 40(%rsi),%rax + movq %rdx,%rbx + adcq $0,%rbx + + mulq %r10 + addq %rax,%r15 + movq 48(%rsi),%rax + adcq $0,%rdx + addq %rbx,%r15 + movq %rdx,%rbx + adcq $0,%rbx + + mulq %r10 + movq %r14,%r12 + leaq (%rcx,%r14,2),%r14 + addq %rax,%r8 + movq 56(%rsi),%rax + adcq $0,%rdx + addq %rbx,%r8 + movq %rdx,%rbx + adcq $0,%rbx + + mulq %r10 + shrq $63,%r12 + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + addq %rbx,%r9 + movq %rdx,%r10 + adcq $0,%r10 + + movq %r15,%rbx + leaq (%r12,%r15,2),%r15 + + mulq %rax + addq %rax,%r13 + adcq %rdx,%r14 + adcq $0,%r15 + + movq %r13,48(%rsp) + movq %r14,56(%rsp) + shrq $63,%rbx + + + movq 32(%rsi),%r11 + movq 40(%rsi),%rax + mulq %r11 + addq %rax,%r8 + movq 48(%rsi),%rax + movq %rdx,%rcx + adcq $0,%rcx + + mulq %r11 + addq %rax,%r9 + movq 56(%rsi),%rax + adcq $0,%rdx + movq %r8,%r12 + leaq (%rbx,%r8,2),%r8 + addq %rcx,%r9 + movq %rdx,%rcx + adcq $0,%rcx + + mulq %r11 + shrq $63,%r12 + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + addq %rcx,%r10 + movq %rdx,%r11 + adcq $0,%r11 + + movq %r9,%rcx + leaq (%r12,%r9,2),%r9 + + mulq %rax + addq %rax,%r15 + adcq %rdx,%r8 + adcq $0,%r9 + + movq %r15,64(%rsp) + movq %r8,72(%rsp) + shrq $63,%rcx + + + movq 40(%rsi),%r12 + movq 48(%rsi),%rax + mulq %r12 + addq %rax,%r10 + movq 56(%rsi),%rax + movq %rdx,%rbx + adcq $0,%rbx + + mulq %r12 + addq %rax,%r11 + movq %r12,%rax + movq %r10,%r15 + leaq (%rcx,%r10,2),%r10 + adcq $0,%rdx + shrq $63,%r15 + addq %rbx,%r11 + movq %rdx,%r12 + adcq $0,%r12 + + movq %r11,%rbx + leaq (%r15,%r11,2),%r11 + + mulq %rax + addq %rax,%r9 + adcq %rdx,%r10 + adcq $0,%r11 + + movq %r9,80(%rsp) + movq %r10,88(%rsp) + + + movq 48(%rsi),%r13 + movq 56(%rsi),%rax + mulq %r13 + addq %rax,%r12 + movq %r13,%rax + movq %rdx,%r13 + adcq $0,%r13 + + xorq %r14,%r14 + shlq $1,%rbx + adcq %r12,%r12 + adcq %r13,%r13 + adcq %r14,%r14 + + mulq %rax + addq %rax,%r11 + adcq %rdx,%r12 + adcq $0,%r13 + + movq %r11,96(%rsp) + movq %r12,104(%rsp) + + + movq 56(%rsi),%rax + mulq %rax + addq %rax,%r13 + adcq $0,%rdx + + addq %rdx,%r14 + + movq %r13,112(%rsp) + movq %r14,120(%rsp) + + movq (%rsp),%r8 + movq 8(%rsp),%r9 + movq 16(%rsp),%r10 + movq 24(%rsp),%r11 + movq 32(%rsp),%r12 + movq 40(%rsp),%r13 + movq 48(%rsp),%r14 + movq 56(%rsp),%r15 + + call __rsaz_512_reduce + + addq 64(%rsp),%r8 + adcq 72(%rsp),%r9 + adcq 80(%rsp),%r10 + adcq 88(%rsp),%r11 + adcq 96(%rsp),%r12 + adcq 104(%rsp),%r13 + adcq 112(%rsp),%r14 + adcq 120(%rsp),%r15 + sbbq %rcx,%rcx + + call __rsaz_512_subtract + + movq %r8,%rdx + movq %r9,%rax + movl 128+8(%rsp),%r8d + movq %rdi,%rsi + + decl %r8d + jnz .Loop_sqr + + leaq 128+24+48(%rsp),%rax + movq -48(%rax),%r15 + movq -40(%rax),%r14 + movq -32(%rax),%r13 + movq -24(%rax),%r12 + movq -16(%rax),%rbp + movq -8(%rax),%rbx + leaq (%rax),%rsp +.Lsqr_epilogue: + .byte 0xf3,0xc3 +.size rsaz_512_sqr,.-rsaz_512_sqr +.globl rsaz_512_mul +.type rsaz_512_mul,@function +.align 32 +rsaz_512_mul: + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + subq $128+24,%rsp +.Lmul_body: +.byte 102,72,15,110,199 +.byte 102,72,15,110,201 + movq %r8,128(%rsp) + movq (%rdx),%rbx + movq %rdx,%rbp + call __rsaz_512_mul + +.byte 102,72,15,126,199 +.byte 102,72,15,126,205 + + movq (%rsp),%r8 + movq 8(%rsp),%r9 + movq 16(%rsp),%r10 + movq 24(%rsp),%r11 + movq 32(%rsp),%r12 + movq 40(%rsp),%r13 + movq 48(%rsp),%r14 + movq 56(%rsp),%r15 + + call __rsaz_512_reduce + addq 64(%rsp),%r8 + adcq 72(%rsp),%r9 + adcq 80(%rsp),%r10 + adcq 88(%rsp),%r11 + adcq 96(%rsp),%r12 + adcq 104(%rsp),%r13 + adcq 112(%rsp),%r14 + adcq 120(%rsp),%r15 + sbbq %rcx,%rcx + + call __rsaz_512_subtract + + leaq 128+24+48(%rsp),%rax + movq -48(%rax),%r15 + movq -40(%rax),%r14 + movq -32(%rax),%r13 + movq -24(%rax),%r12 + movq -16(%rax),%rbp + movq -8(%rax),%rbx + leaq (%rax),%rsp +.Lmul_epilogue: + .byte 0xf3,0xc3 +.size rsaz_512_mul,.-rsaz_512_mul +.globl rsaz_512_mul_gather4 +.type rsaz_512_mul_gather4,@function +.align 32 +rsaz_512_mul_gather4: + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + movl %r9d,%r9d + subq $128+24,%rsp +.Lmul_gather4_body: + movl 64(%rdx,%r9,4),%eax +.byte 102,72,15,110,199 + movl (%rdx,%r9,4),%ebx +.byte 102,72,15,110,201 + movq %r8,128(%rsp) + + shlq $32,%rax + orq %rax,%rbx + movq (%rsi),%rax + movq 8(%rsi),%rcx + leaq 128(%rdx,%r9,4),%rbp + mulq %rbx + movq %rax,(%rsp) + movq %rcx,%rax + movq %rdx,%r8 + + mulq %rbx + movd (%rbp),%xmm4 + addq %rax,%r8 + movq 16(%rsi),%rax + movq %rdx,%r9 + adcq $0,%r9 + + mulq %rbx + movd 64(%rbp),%xmm5 + addq %rax,%r9 + movq 24(%rsi),%rax + movq %rdx,%r10 + adcq $0,%r10 + + mulq %rbx + pslldq $4,%xmm5 + addq %rax,%r10 + movq 32(%rsi),%rax + movq %rdx,%r11 + adcq $0,%r11 + + mulq %rbx + por %xmm5,%xmm4 + addq %rax,%r11 + movq 40(%rsi),%rax + movq %rdx,%r12 + adcq $0,%r12 + + mulq %rbx + addq %rax,%r12 + movq 48(%rsi),%rax + movq %rdx,%r13 + adcq $0,%r13 + + mulq %rbx + leaq 128(%rbp),%rbp + addq %rax,%r13 + movq 56(%rsi),%rax + movq %rdx,%r14 + adcq $0,%r14 + + mulq %rbx +.byte 102,72,15,126,227 + addq %rax,%r14 + movq (%rsi),%rax + movq %rdx,%r15 + adcq $0,%r15 + + leaq 8(%rsp),%rdi + movl $7,%ecx + jmp .Loop_mul_gather + +.align 32 +.Loop_mul_gather: + mulq %rbx + addq %rax,%r8 + movq 8(%rsi),%rax + movq %r8,(%rdi) + movq %rdx,%r8 + adcq $0,%r8 + + mulq %rbx + movd (%rbp),%xmm4 + addq %rax,%r9 + movq 16(%rsi),%rax + adcq $0,%rdx + addq %r9,%r8 + movq %rdx,%r9 + adcq $0,%r9 + + mulq %rbx + movd 64(%rbp),%xmm5 + addq %rax,%r10 + movq 24(%rsi),%rax + adcq $0,%rdx + addq %r10,%r9 + movq %rdx,%r10 + adcq $0,%r10 + + mulq %rbx + pslldq $4,%xmm5 + addq %rax,%r11 + movq 32(%rsi),%rax + adcq $0,%rdx + addq %r11,%r10 + movq %rdx,%r11 + adcq $0,%r11 + + mulq %rbx + por %xmm5,%xmm4 + addq %rax,%r12 + movq 40(%rsi),%rax + adcq $0,%rdx + addq %r12,%r11 + movq %rdx,%r12 + adcq $0,%r12 + + mulq %rbx + addq %rax,%r13 + movq 48(%rsi),%rax + adcq $0,%rdx + addq %r13,%r12 + movq %rdx,%r13 + adcq $0,%r13 + + mulq %rbx + addq %rax,%r14 + movq 56(%rsi),%rax + adcq $0,%rdx + addq %r14,%r13 + movq %rdx,%r14 + adcq $0,%r14 + + mulq %rbx +.byte 102,72,15,126,227 + addq %rax,%r15 + movq (%rsi),%rax + adcq $0,%rdx + addq %r15,%r14 + movq %rdx,%r15 + adcq $0,%r15 + + leaq 128(%rbp),%rbp + leaq 8(%rdi),%rdi + + decl %ecx + jnz .Loop_mul_gather + + movq %r8,(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq %r14,48(%rdi) + movq %r15,56(%rdi) + +.byte 102,72,15,126,199 +.byte 102,72,15,126,205 + + movq (%rsp),%r8 + movq 8(%rsp),%r9 + movq 16(%rsp),%r10 + movq 24(%rsp),%r11 + movq 32(%rsp),%r12 + movq 40(%rsp),%r13 + movq 48(%rsp),%r14 + movq 56(%rsp),%r15 + + call __rsaz_512_reduce + addq 64(%rsp),%r8 + adcq 72(%rsp),%r9 + adcq 80(%rsp),%r10 + adcq 88(%rsp),%r11 + adcq 96(%rsp),%r12 + adcq 104(%rsp),%r13 + adcq 112(%rsp),%r14 + adcq 120(%rsp),%r15 + sbbq %rcx,%rcx + + call __rsaz_512_subtract + + leaq 128+24+48(%rsp),%rax + movq -48(%rax),%r15 + movq -40(%rax),%r14 + movq -32(%rax),%r13 + movq -24(%rax),%r12 + movq -16(%rax),%rbp + movq -8(%rax),%rbx + leaq (%rax),%rsp +.Lmul_gather4_epilogue: + .byte 0xf3,0xc3 +.size rsaz_512_mul_gather4,.-rsaz_512_mul_gather4 +.globl rsaz_512_mul_scatter4 +.type rsaz_512_mul_scatter4,@function +.align 32 +rsaz_512_mul_scatter4: + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + movl %r9d,%r9d + subq $128+24,%rsp +.Lmul_scatter4_body: + leaq (%r8,%r9,4),%r8 +.byte 102,72,15,110,199 +.byte 102,72,15,110,202 +.byte 102,73,15,110,208 + movq %rcx,128(%rsp) + + movq %rdi,%rbp + movq (%rdi),%rbx + call __rsaz_512_mul + +.byte 102,72,15,126,199 +.byte 102,72,15,126,205 + + movq (%rsp),%r8 + movq 8(%rsp),%r9 + movq 16(%rsp),%r10 + movq 24(%rsp),%r11 + movq 32(%rsp),%r12 + movq 40(%rsp),%r13 + movq 48(%rsp),%r14 + movq 56(%rsp),%r15 + + call __rsaz_512_reduce + addq 64(%rsp),%r8 + adcq 72(%rsp),%r9 + adcq 80(%rsp),%r10 + adcq 88(%rsp),%r11 + adcq 96(%rsp),%r12 + adcq 104(%rsp),%r13 + adcq 112(%rsp),%r14 + adcq 120(%rsp),%r15 +.byte 102,72,15,126,214 + sbbq %rcx,%rcx + + call __rsaz_512_subtract + + movl %r8d,0(%rsi) + shrq $32,%r8 + movl %r9d,128(%rsi) + shrq $32,%r9 + movl %r10d,256(%rsi) + shrq $32,%r10 + movl %r11d,384(%rsi) + shrq $32,%r11 + movl %r12d,512(%rsi) + shrq $32,%r12 + movl %r13d,640(%rsi) + shrq $32,%r13 + movl %r14d,768(%rsi) + shrq $32,%r14 + movl %r15d,896(%rsi) + shrq $32,%r15 + movl %r8d,64(%rsi) + movl %r9d,192(%rsi) + movl %r10d,320(%rsi) + movl %r11d,448(%rsi) + movl %r12d,576(%rsi) + movl %r13d,704(%rsi) + movl %r14d,832(%rsi) + movl %r15d,960(%rsi) + + leaq 128+24+48(%rsp),%rax + movq -48(%rax),%r15 + movq -40(%rax),%r14 + movq -32(%rax),%r13 + movq -24(%rax),%r12 + movq -16(%rax),%rbp + movq -8(%rax),%rbx + leaq (%rax),%rsp +.Lmul_scatter4_epilogue: + .byte 0xf3,0xc3 +.size rsaz_512_mul_scatter4,.-rsaz_512_mul_scatter4 +.globl rsaz_512_mul_by_one +.type rsaz_512_mul_by_one,@function +.align 32 +rsaz_512_mul_by_one: + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + subq $128+24,%rsp +.Lmul_by_one_body: + movq %rdx,%rbp + movq %rcx,128(%rsp) + + movq (%rsi),%r8 + pxor %xmm0,%xmm0 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq 48(%rsi),%r14 + movq 56(%rsi),%r15 + + movdqa %xmm0,(%rsp) + movdqa %xmm0,16(%rsp) + movdqa %xmm0,32(%rsp) + movdqa %xmm0,48(%rsp) + movdqa %xmm0,64(%rsp) + movdqa %xmm0,80(%rsp) + movdqa %xmm0,96(%rsp) + call __rsaz_512_reduce + movq %r8,(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq %r14,48(%rdi) + movq %r15,56(%rdi) + + leaq 128+24+48(%rsp),%rax + movq -48(%rax),%r15 + movq -40(%rax),%r14 + movq -32(%rax),%r13 + movq -24(%rax),%r12 + movq -16(%rax),%rbp + movq -8(%rax),%rbx + leaq (%rax),%rsp +.Lmul_by_one_epilogue: + .byte 0xf3,0xc3 +.size rsaz_512_mul_by_one,.-rsaz_512_mul_by_one +.type __rsaz_512_reduce,@function +.align 32 +__rsaz_512_reduce: + movq %r8,%rbx + imulq 128+8(%rsp),%rbx + movq 0(%rbp),%rax + movl $8,%ecx + jmp .Lreduction_loop + +.align 32 +.Lreduction_loop: + mulq %rbx + movq 8(%rbp),%rax + negq %r8 + movq %rdx,%r8 + adcq $0,%r8 + + mulq %rbx + addq %rax,%r9 + movq 16(%rbp),%rax + adcq $0,%rdx + addq %r9,%r8 + movq %rdx,%r9 + adcq $0,%r9 + + mulq %rbx + addq %rax,%r10 + movq 24(%rbp),%rax + adcq $0,%rdx + addq %r10,%r9 + movq %rdx,%r10 + adcq $0,%r10 + + mulq %rbx + addq %rax,%r11 + movq 32(%rbp),%rax + adcq $0,%rdx + addq %r11,%r10 + movq 128+8(%rsp),%rsi + + + adcq $0,%rdx + movq %rdx,%r11 + + mulq %rbx + addq %rax,%r12 + movq 40(%rbp),%rax + adcq $0,%rdx + imulq %r8,%rsi + addq %r12,%r11 + movq %rdx,%r12 + adcq $0,%r12 + + mulq %rbx + addq %rax,%r13 + movq 48(%rbp),%rax + adcq $0,%rdx + addq %r13,%r12 + movq %rdx,%r13 + adcq $0,%r13 + + mulq %rbx + addq %rax,%r14 + movq 56(%rbp),%rax + adcq $0,%rdx + addq %r14,%r13 + movq %rdx,%r14 + adcq $0,%r14 + + mulq %rbx + movq %rsi,%rbx + addq %rax,%r15 + movq 0(%rbp),%rax + adcq $0,%rdx + addq %r15,%r14 + movq %rdx,%r15 + adcq $0,%r15 + + decl %ecx + jne .Lreduction_loop + + .byte 0xf3,0xc3 +.size __rsaz_512_reduce,.-__rsaz_512_reduce +.type __rsaz_512_subtract,@function +.align 32 +__rsaz_512_subtract: + movq %r8,(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq %r14,48(%rdi) + movq %r15,56(%rdi) + + movq 0(%rbp),%r8 + movq 8(%rbp),%r9 + negq %r8 + notq %r9 + andq %rcx,%r8 + movq 16(%rbp),%r10 + andq %rcx,%r9 + notq %r10 + movq 24(%rbp),%r11 + andq %rcx,%r10 + notq %r11 + movq 32(%rbp),%r12 + andq %rcx,%r11 + notq %r12 + movq 40(%rbp),%r13 + andq %rcx,%r12 + notq %r13 + movq 48(%rbp),%r14 + andq %rcx,%r13 + notq %r14 + movq 56(%rbp),%r15 + andq %rcx,%r14 + notq %r15 + andq %rcx,%r15 + + addq (%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + adcq 48(%rdi),%r14 + adcq 56(%rdi),%r15 + + movq %r8,(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq %r14,48(%rdi) + movq %r15,56(%rdi) + + .byte 0xf3,0xc3 +.size __rsaz_512_subtract,.-__rsaz_512_subtract +.type __rsaz_512_mul,@function +.align 32 +__rsaz_512_mul: + leaq 8(%rsp),%rdi + + movq (%rsi),%rax + mulq %rbx + movq %rax,(%rdi) + movq 8(%rsi),%rax + movq %rdx,%r8 + + mulq %rbx + addq %rax,%r8 + movq 16(%rsi),%rax + movq %rdx,%r9 + adcq $0,%r9 + + mulq %rbx + addq %rax,%r9 + movq 24(%rsi),%rax + movq %rdx,%r10 + adcq $0,%r10 + + mulq %rbx + addq %rax,%r10 + movq 32(%rsi),%rax + movq %rdx,%r11 + adcq $0,%r11 + + mulq %rbx + addq %rax,%r11 + movq 40(%rsi),%rax + movq %rdx,%r12 + adcq $0,%r12 + + mulq %rbx + addq %rax,%r12 + movq 48(%rsi),%rax + movq %rdx,%r13 + adcq $0,%r13 + + mulq %rbx + addq %rax,%r13 + movq 56(%rsi),%rax + movq %rdx,%r14 + adcq $0,%r14 + + mulq %rbx + addq %rax,%r14 + movq (%rsi),%rax + movq %rdx,%r15 + adcq $0,%r15 + + leaq 8(%rbp),%rbp + leaq 8(%rdi),%rdi + + movl $7,%ecx + jmp .Loop_mul + +.align 32 +.Loop_mul: + movq (%rbp),%rbx + mulq %rbx + addq %rax,%r8 + movq 8(%rsi),%rax + movq %r8,(%rdi) + movq %rdx,%r8 + adcq $0,%r8 + + mulq %rbx + addq %rax,%r9 + movq 16(%rsi),%rax + adcq $0,%rdx + addq %r9,%r8 + movq %rdx,%r9 + adcq $0,%r9 + + mulq %rbx + addq %rax,%r10 + movq 24(%rsi),%rax + adcq $0,%rdx + addq %r10,%r9 + movq %rdx,%r10 + adcq $0,%r10 + + mulq %rbx + addq %rax,%r11 + movq 32(%rsi),%rax + adcq $0,%rdx + addq %r11,%r10 + movq %rdx,%r11 + adcq $0,%r11 + + mulq %rbx + addq %rax,%r12 + movq 40(%rsi),%rax + adcq $0,%rdx + addq %r12,%r11 + movq %rdx,%r12 + adcq $0,%r12 + + mulq %rbx + addq %rax,%r13 + movq 48(%rsi),%rax + adcq $0,%rdx + addq %r13,%r12 + movq %rdx,%r13 + adcq $0,%r13 + + mulq %rbx + addq %rax,%r14 + movq 56(%rsi),%rax + adcq $0,%rdx + addq %r14,%r13 + movq %rdx,%r14 + leaq 8(%rbp),%rbp + adcq $0,%r14 + + mulq %rbx + addq %rax,%r15 + movq (%rsi),%rax + adcq $0,%rdx + addq %r15,%r14 + movq %rdx,%r15 + adcq $0,%r15 + + leaq 8(%rdi),%rdi + + decl %ecx + jnz .Loop_mul + + movq %r8,(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq %r14,48(%rdi) + movq %r15,56(%rdi) + + .byte 0xf3,0xc3 +.size __rsaz_512_mul,.-__rsaz_512_mul +.globl rsaz_512_scatter4 +.type rsaz_512_scatter4,@function +.align 16 +rsaz_512_scatter4: + leaq (%rdi,%rdx,4),%rdi + movl $8,%r9d + jmp .Loop_scatter +.align 16 +.Loop_scatter: + movq (%rsi),%rax + leaq 8(%rsi),%rsi + movl %eax,(%rdi) + shrq $32,%rax + movl %eax,64(%rdi) + leaq 128(%rdi),%rdi + decl %r9d + jnz .Loop_scatter + .byte 0xf3,0xc3 +.size rsaz_512_scatter4,.-rsaz_512_scatter4 + +.globl rsaz_512_gather4 +.type rsaz_512_gather4,@function +.align 16 +rsaz_512_gather4: + leaq (%rsi,%rdx,4),%rsi + movl $8,%r9d + jmp .Loop_gather +.align 16 +.Loop_gather: + movl (%rsi),%eax + movl 64(%rsi),%r8d + leaq 128(%rsi),%rsi + shlq $32,%r8 + orq %r8,%rax + movq %rax,(%rdi) + leaq 8(%rdi),%rdi + decl %r9d + jnz .Loop_gather + .byte 0xf3,0xc3 +.size rsaz_512_gather4,.-rsaz_512_gather4 diff --git a/secure/lib/libcrypto/amd64/sha1-mb-x86_64.S b/secure/lib/libcrypto/amd64/sha1-mb-x86_64.S new file mode 100644 index 0000000..6c7cd2f --- /dev/null +++ b/secure/lib/libcrypto/amd64/sha1-mb-x86_64.S @@ -0,0 +1,2935 @@ + # $FreeBSD$ +.text + + + +.globl sha1_multi_block +.type sha1_multi_block,@function +.align 32 +sha1_multi_block: + movq OPENSSL_ia32cap_P+4(%rip),%rcx + btq $61,%rcx + jc _shaext_shortcut + movq %rsp,%rax + pushq %rbx + pushq %rbp + subq $288,%rsp + andq $-256,%rsp + movq %rax,272(%rsp) +.Lbody: + leaq K_XX_XX(%rip),%rbp + leaq 256(%rsp),%rbx + +.Loop_grande: + movl %edx,280(%rsp) + xorl %edx,%edx + movq 0(%rsi),%r8 + movl 8(%rsi),%ecx + cmpl %edx,%ecx + cmovgl %ecx,%edx + testl %ecx,%ecx + movl %ecx,0(%rbx) + cmovleq %rbp,%r8 + movq 16(%rsi),%r9 + movl 24(%rsi),%ecx + cmpl %edx,%ecx + cmovgl %ecx,%edx + testl %ecx,%ecx + movl %ecx,4(%rbx) + cmovleq %rbp,%r9 + movq 32(%rsi),%r10 + movl 40(%rsi),%ecx + cmpl %edx,%ecx + cmovgl %ecx,%edx + testl %ecx,%ecx + movl %ecx,8(%rbx) + cmovleq %rbp,%r10 + movq 48(%rsi),%r11 + movl 56(%rsi),%ecx + cmpl %edx,%ecx + cmovgl %ecx,%edx + testl %ecx,%ecx + movl %ecx,12(%rbx) + cmovleq %rbp,%r11 + testl %edx,%edx + jz .Ldone + + movdqu 0(%rdi),%xmm10 + leaq 128(%rsp),%rax + movdqu 32(%rdi),%xmm11 + movdqu 64(%rdi),%xmm12 + movdqu 96(%rdi),%xmm13 + movdqu 128(%rdi),%xmm14 + movdqa 96(%rbp),%xmm5 + movdqa -32(%rbp),%xmm15 + jmp .Loop + +.align 32 +.Loop: + movd (%r8),%xmm0 + leaq 64(%r8),%r8 + movd (%r9),%xmm2 + leaq 64(%r9),%r9 + movd (%r10),%xmm3 + leaq 64(%r10),%r10 + movd (%r11),%xmm4 + leaq 64(%r11),%r11 + punpckldq %xmm3,%xmm0 + movd -60(%r8),%xmm1 + punpckldq %xmm4,%xmm2 + movd -60(%r9),%xmm9 + punpckldq %xmm2,%xmm0 + movd -60(%r10),%xmm8 +.byte 102,15,56,0,197 + movd -60(%r11),%xmm7 + punpckldq %xmm8,%xmm1 + movdqa %xmm10,%xmm8 + paddd %xmm15,%xmm14 + punpckldq %xmm7,%xmm9 + movdqa %xmm11,%xmm7 + movdqa %xmm11,%xmm6 + pslld $5,%xmm8 + pandn %xmm13,%xmm7 + pand %xmm12,%xmm6 + punpckldq %xmm9,%xmm1 + movdqa %xmm10,%xmm9 + + movdqa %xmm0,0-128(%rax) + paddd %xmm0,%xmm14 + movd -56(%r8),%xmm2 + psrld $27,%xmm9 + pxor %xmm7,%xmm6 + movdqa %xmm11,%xmm7 + + por %xmm9,%xmm8 + movd -56(%r9),%xmm9 + pslld $30,%xmm7 + paddd %xmm6,%xmm14 + + psrld $2,%xmm11 + paddd %xmm8,%xmm14 +.byte 102,15,56,0,205 + movd -56(%r10),%xmm8 + por %xmm7,%xmm11 + movd -56(%r11),%xmm7 + punpckldq %xmm8,%xmm2 + movdqa %xmm14,%xmm8 + paddd %xmm15,%xmm13 + punpckldq %xmm7,%xmm9 + movdqa %xmm10,%xmm7 + movdqa %xmm10,%xmm6 + pslld $5,%xmm8 + pandn %xmm12,%xmm7 + pand %xmm11,%xmm6 + punpckldq %xmm9,%xmm2 + movdqa %xmm14,%xmm9 + + movdqa %xmm1,16-128(%rax) + paddd %xmm1,%xmm13 + movd -52(%r8),%xmm3 + psrld $27,%xmm9 + pxor %xmm7,%xmm6 + movdqa %xmm10,%xmm7 + + por %xmm9,%xmm8 + movd -52(%r9),%xmm9 + pslld $30,%xmm7 + paddd %xmm6,%xmm13 + + psrld $2,%xmm10 + paddd %xmm8,%xmm13 +.byte 102,15,56,0,213 + movd -52(%r10),%xmm8 + por %xmm7,%xmm10 + movd -52(%r11),%xmm7 + punpckldq %xmm8,%xmm3 + movdqa %xmm13,%xmm8 + paddd %xmm15,%xmm12 + punpckldq %xmm7,%xmm9 + movdqa %xmm14,%xmm7 + movdqa %xmm14,%xmm6 + pslld $5,%xmm8 + pandn %xmm11,%xmm7 + pand %xmm10,%xmm6 + punpckldq %xmm9,%xmm3 + movdqa %xmm13,%xmm9 + + movdqa %xmm2,32-128(%rax) + paddd %xmm2,%xmm12 + movd -48(%r8),%xmm4 + psrld $27,%xmm9 + pxor %xmm7,%xmm6 + movdqa %xmm14,%xmm7 + + por %xmm9,%xmm8 + movd -48(%r9),%xmm9 + pslld $30,%xmm7 + paddd %xmm6,%xmm12 + + psrld $2,%xmm14 + paddd %xmm8,%xmm12 +.byte 102,15,56,0,221 + movd -48(%r10),%xmm8 + por %xmm7,%xmm14 + movd -48(%r11),%xmm7 + punpckldq %xmm8,%xmm4 + movdqa %xmm12,%xmm8 + paddd %xmm15,%xmm11 + punpckldq %xmm7,%xmm9 + movdqa %xmm13,%xmm7 + movdqa %xmm13,%xmm6 + pslld $5,%xmm8 + pandn %xmm10,%xmm7 + pand %xmm14,%xmm6 + punpckldq %xmm9,%xmm4 + movdqa %xmm12,%xmm9 + + movdqa %xmm3,48-128(%rax) + paddd %xmm3,%xmm11 + movd -44(%r8),%xmm0 + psrld $27,%xmm9 + pxor %xmm7,%xmm6 + movdqa %xmm13,%xmm7 + + por %xmm9,%xmm8 + movd -44(%r9),%xmm9 + pslld $30,%xmm7 + paddd %xmm6,%xmm11 + + psrld $2,%xmm13 + paddd %xmm8,%xmm11 +.byte 102,15,56,0,229 + movd -44(%r10),%xmm8 + por %xmm7,%xmm13 + movd -44(%r11),%xmm7 + punpckldq %xmm8,%xmm0 + movdqa %xmm11,%xmm8 + paddd %xmm15,%xmm10 + punpckldq %xmm7,%xmm9 + movdqa %xmm12,%xmm7 + movdqa %xmm12,%xmm6 + pslld $5,%xmm8 + pandn %xmm14,%xmm7 + pand %xmm13,%xmm6 + punpckldq %xmm9,%xmm0 + movdqa %xmm11,%xmm9 + + movdqa %xmm4,64-128(%rax) + paddd %xmm4,%xmm10 + movd -40(%r8),%xmm1 + psrld $27,%xmm9 + pxor %xmm7,%xmm6 + movdqa %xmm12,%xmm7 + + por %xmm9,%xmm8 + movd -40(%r9),%xmm9 + pslld $30,%xmm7 + paddd %xmm6,%xmm10 + + psrld $2,%xmm12 + paddd %xmm8,%xmm10 +.byte 102,15,56,0,197 + movd -40(%r10),%xmm8 + por %xmm7,%xmm12 + movd -40(%r11),%xmm7 + punpckldq %xmm8,%xmm1 + movdqa %xmm10,%xmm8 + paddd %xmm15,%xmm14 + punpckldq %xmm7,%xmm9 + movdqa %xmm11,%xmm7 + movdqa %xmm11,%xmm6 + pslld $5,%xmm8 + pandn %xmm13,%xmm7 + pand %xmm12,%xmm6 + punpckldq %xmm9,%xmm1 + movdqa %xmm10,%xmm9 + + movdqa %xmm0,80-128(%rax) + paddd %xmm0,%xmm14 + movd -36(%r8),%xmm2 + psrld $27,%xmm9 + pxor %xmm7,%xmm6 + movdqa %xmm11,%xmm7 + + por %xmm9,%xmm8 + movd -36(%r9),%xmm9 + pslld $30,%xmm7 + paddd %xmm6,%xmm14 + + psrld $2,%xmm11 + paddd %xmm8,%xmm14 +.byte 102,15,56,0,205 + movd -36(%r10),%xmm8 + por %xmm7,%xmm11 + movd -36(%r11),%xmm7 + punpckldq %xmm8,%xmm2 + movdqa %xmm14,%xmm8 + paddd %xmm15,%xmm13 + punpckldq %xmm7,%xmm9 + movdqa %xmm10,%xmm7 + movdqa %xmm10,%xmm6 + pslld $5,%xmm8 + pandn %xmm12,%xmm7 + pand %xmm11,%xmm6 + punpckldq %xmm9,%xmm2 + movdqa %xmm14,%xmm9 + + movdqa %xmm1,96-128(%rax) + paddd %xmm1,%xmm13 + movd -32(%r8),%xmm3 + psrld $27,%xmm9 + pxor %xmm7,%xmm6 + movdqa %xmm10,%xmm7 + + por %xmm9,%xmm8 + movd -32(%r9),%xmm9 + pslld $30,%xmm7 + paddd %xmm6,%xmm13 + + psrld $2,%xmm10 + paddd %xmm8,%xmm13 +.byte 102,15,56,0,213 + movd -32(%r10),%xmm8 + por %xmm7,%xmm10 + movd -32(%r11),%xmm7 + punpckldq %xmm8,%xmm3 + movdqa %xmm13,%xmm8 + paddd %xmm15,%xmm12 + punpckldq %xmm7,%xmm9 + movdqa %xmm14,%xmm7 + movdqa %xmm14,%xmm6 + pslld $5,%xmm8 + pandn %xmm11,%xmm7 + pand %xmm10,%xmm6 + punpckldq %xmm9,%xmm3 + movdqa %xmm13,%xmm9 + + movdqa %xmm2,112-128(%rax) + paddd %xmm2,%xmm12 + movd -28(%r8),%xmm4 + psrld $27,%xmm9 + pxor %xmm7,%xmm6 + movdqa %xmm14,%xmm7 + + por %xmm9,%xmm8 + movd -28(%r9),%xmm9 + pslld $30,%xmm7 + paddd %xmm6,%xmm12 + + psrld $2,%xmm14 + paddd %xmm8,%xmm12 +.byte 102,15,56,0,221 + movd -28(%r10),%xmm8 + por %xmm7,%xmm14 + movd -28(%r11),%xmm7 + punpckldq %xmm8,%xmm4 + movdqa %xmm12,%xmm8 + paddd %xmm15,%xmm11 + punpckldq %xmm7,%xmm9 + movdqa %xmm13,%xmm7 + movdqa %xmm13,%xmm6 + pslld $5,%xmm8 + pandn %xmm10,%xmm7 + pand %xmm14,%xmm6 + punpckldq %xmm9,%xmm4 + movdqa %xmm12,%xmm9 + + movdqa %xmm3,128-128(%rax) + paddd %xmm3,%xmm11 + movd -24(%r8),%xmm0 + psrld $27,%xmm9 + pxor %xmm7,%xmm6 + movdqa %xmm13,%xmm7 + + por %xmm9,%xmm8 + movd -24(%r9),%xmm9 + pslld $30,%xmm7 + paddd %xmm6,%xmm11 + + psrld $2,%xmm13 + paddd %xmm8,%xmm11 +.byte 102,15,56,0,229 + movd -24(%r10),%xmm8 + por %xmm7,%xmm13 + movd -24(%r11),%xmm7 + punpckldq %xmm8,%xmm0 + movdqa %xmm11,%xmm8 + paddd %xmm15,%xmm10 + punpckldq %xmm7,%xmm9 + movdqa %xmm12,%xmm7 + movdqa %xmm12,%xmm6 + pslld $5,%xmm8 + pandn %xmm14,%xmm7 + pand %xmm13,%xmm6 + punpckldq %xmm9,%xmm0 + movdqa %xmm11,%xmm9 + + movdqa %xmm4,144-128(%rax) + paddd %xmm4,%xmm10 + movd -20(%r8),%xmm1 + psrld $27,%xmm9 + pxor %xmm7,%xmm6 + movdqa %xmm12,%xmm7 + + por %xmm9,%xmm8 + movd -20(%r9),%xmm9 + pslld $30,%xmm7 + paddd %xmm6,%xmm10 + + psrld $2,%xmm12 + paddd %xmm8,%xmm10 +.byte 102,15,56,0,197 + movd -20(%r10),%xmm8 + por %xmm7,%xmm12 + movd -20(%r11),%xmm7 + punpckldq %xmm8,%xmm1 + movdqa %xmm10,%xmm8 + paddd %xmm15,%xmm14 + punpckldq %xmm7,%xmm9 + movdqa %xmm11,%xmm7 + movdqa %xmm11,%xmm6 + pslld $5,%xmm8 + pandn %xmm13,%xmm7 + pand %xmm12,%xmm6 + punpckldq %xmm9,%xmm1 + movdqa %xmm10,%xmm9 + + movdqa %xmm0,160-128(%rax) + paddd %xmm0,%xmm14 + movd -16(%r8),%xmm2 + psrld $27,%xmm9 + pxor %xmm7,%xmm6 + movdqa %xmm11,%xmm7 + + por %xmm9,%xmm8 + movd -16(%r9),%xmm9 + pslld $30,%xmm7 + paddd %xmm6,%xmm14 + + psrld $2,%xmm11 + paddd %xmm8,%xmm14 +.byte 102,15,56,0,205 + movd -16(%r10),%xmm8 + por %xmm7,%xmm11 + movd -16(%r11),%xmm7 + punpckldq %xmm8,%xmm2 + movdqa %xmm14,%xmm8 + paddd %xmm15,%xmm13 + punpckldq %xmm7,%xmm9 + movdqa %xmm10,%xmm7 + movdqa %xmm10,%xmm6 + pslld $5,%xmm8 + pandn %xmm12,%xmm7 + pand %xmm11,%xmm6 + punpckldq %xmm9,%xmm2 + movdqa %xmm14,%xmm9 + + movdqa %xmm1,176-128(%rax) + paddd %xmm1,%xmm13 + movd -12(%r8),%xmm3 + psrld $27,%xmm9 + pxor %xmm7,%xmm6 + movdqa %xmm10,%xmm7 + + por %xmm9,%xmm8 + movd -12(%r9),%xmm9 + pslld $30,%xmm7 + paddd %xmm6,%xmm13 + + psrld $2,%xmm10 + paddd %xmm8,%xmm13 +.byte 102,15,56,0,213 + movd -12(%r10),%xmm8 + por %xmm7,%xmm10 + movd -12(%r11),%xmm7 + punpckldq %xmm8,%xmm3 + movdqa %xmm13,%xmm8 + paddd %xmm15,%xmm12 + punpckldq %xmm7,%xmm9 + movdqa %xmm14,%xmm7 + movdqa %xmm14,%xmm6 + pslld $5,%xmm8 + pandn %xmm11,%xmm7 + pand %xmm10,%xmm6 + punpckldq %xmm9,%xmm3 + movdqa %xmm13,%xmm9 + + movdqa %xmm2,192-128(%rax) + paddd %xmm2,%xmm12 + movd -8(%r8),%xmm4 + psrld $27,%xmm9 + pxor %xmm7,%xmm6 + movdqa %xmm14,%xmm7 + + por %xmm9,%xmm8 + movd -8(%r9),%xmm9 + pslld $30,%xmm7 + paddd %xmm6,%xmm12 + + psrld $2,%xmm14 + paddd %xmm8,%xmm12 +.byte 102,15,56,0,221 + movd -8(%r10),%xmm8 + por %xmm7,%xmm14 + movd -8(%r11),%xmm7 + punpckldq %xmm8,%xmm4 + movdqa %xmm12,%xmm8 + paddd %xmm15,%xmm11 + punpckldq %xmm7,%xmm9 + movdqa %xmm13,%xmm7 + movdqa %xmm13,%xmm6 + pslld $5,%xmm8 + pandn %xmm10,%xmm7 + pand %xmm14,%xmm6 + punpckldq %xmm9,%xmm4 + movdqa %xmm12,%xmm9 + + movdqa %xmm3,208-128(%rax) + paddd %xmm3,%xmm11 + movd -4(%r8),%xmm0 + psrld $27,%xmm9 + pxor %xmm7,%xmm6 + movdqa %xmm13,%xmm7 + + por %xmm9,%xmm8 + movd -4(%r9),%xmm9 + pslld $30,%xmm7 + paddd %xmm6,%xmm11 + + psrld $2,%xmm13 + paddd %xmm8,%xmm11 +.byte 102,15,56,0,229 + movd -4(%r10),%xmm8 + por %xmm7,%xmm13 + movdqa 0-128(%rax),%xmm1 + movd -4(%r11),%xmm7 + punpckldq %xmm8,%xmm0 + movdqa %xmm11,%xmm8 + paddd %xmm15,%xmm10 + punpckldq %xmm7,%xmm9 + movdqa %xmm12,%xmm7 + movdqa %xmm12,%xmm6 + pslld $5,%xmm8 + prefetcht0 63(%r8) + pandn %xmm14,%xmm7 + pand %xmm13,%xmm6 + punpckldq %xmm9,%xmm0 + movdqa %xmm11,%xmm9 + + movdqa %xmm4,224-128(%rax) + paddd %xmm4,%xmm10 + psrld $27,%xmm9 + pxor %xmm7,%xmm6 + movdqa %xmm12,%xmm7 + prefetcht0 63(%r9) + + por %xmm9,%xmm8 + pslld $30,%xmm7 + paddd %xmm6,%xmm10 + prefetcht0 63(%r10) + + psrld $2,%xmm12 + paddd %xmm8,%xmm10 +.byte 102,15,56,0,197 + prefetcht0 63(%r11) + por %xmm7,%xmm12 + movdqa 16-128(%rax),%xmm2 + pxor %xmm3,%xmm1 + movdqa 32-128(%rax),%xmm3 + + movdqa %xmm10,%xmm8 + pxor 128-128(%rax),%xmm1 + paddd %xmm15,%xmm14 + movdqa %xmm11,%xmm7 + pslld $5,%xmm8 + pxor %xmm3,%xmm1 + movdqa %xmm11,%xmm6 + pandn %xmm13,%xmm7 + movdqa %xmm1,%xmm5 + pand %xmm12,%xmm6 + movdqa %xmm10,%xmm9 + psrld $31,%xmm5 + paddd %xmm1,%xmm1 + + movdqa %xmm0,240-128(%rax) + paddd %xmm0,%xmm14 + psrld $27,%xmm9 + pxor %xmm7,%xmm6 + + movdqa %xmm11,%xmm7 + por %xmm9,%xmm8 + pslld $30,%xmm7 + paddd %xmm6,%xmm14 + + psrld $2,%xmm11 + paddd %xmm8,%xmm14 + por %xmm5,%xmm1 + por %xmm7,%xmm11 + pxor %xmm4,%xmm2 + movdqa 48-128(%rax),%xmm4 + + movdqa %xmm14,%xmm8 + pxor 144-128(%rax),%xmm2 + paddd %xmm15,%xmm13 + movdqa %xmm10,%xmm7 + pslld $5,%xmm8 + pxor %xmm4,%xmm2 + movdqa %xmm10,%xmm6 + pandn %xmm12,%xmm7 + movdqa %xmm2,%xmm5 + pand %xmm11,%xmm6 + movdqa %xmm14,%xmm9 + psrld $31,%xmm5 + paddd %xmm2,%xmm2 + + movdqa %xmm1,0-128(%rax) + paddd %xmm1,%xmm13 + psrld $27,%xmm9 + pxor %xmm7,%xmm6 + + movdqa %xmm10,%xmm7 + por %xmm9,%xmm8 + pslld $30,%xmm7 + paddd %xmm6,%xmm13 + + psrld $2,%xmm10 + paddd %xmm8,%xmm13 + por %xmm5,%xmm2 + por %xmm7,%xmm10 + pxor %xmm0,%xmm3 + movdqa 64-128(%rax),%xmm0 + + movdqa %xmm13,%xmm8 + pxor 160-128(%rax),%xmm3 + paddd %xmm15,%xmm12 + movdqa %xmm14,%xmm7 + pslld $5,%xmm8 + pxor %xmm0,%xmm3 + movdqa %xmm14,%xmm6 + pandn %xmm11,%xmm7 + movdqa %xmm3,%xmm5 + pand %xmm10,%xmm6 + movdqa %xmm13,%xmm9 + psrld $31,%xmm5 + paddd %xmm3,%xmm3 + + movdqa %xmm2,16-128(%rax) + paddd %xmm2,%xmm12 + psrld $27,%xmm9 + pxor %xmm7,%xmm6 + + movdqa %xmm14,%xmm7 + por %xmm9,%xmm8 + pslld $30,%xmm7 + paddd %xmm6,%xmm12 + + psrld $2,%xmm14 + paddd %xmm8,%xmm12 + por %xmm5,%xmm3 + por %xmm7,%xmm14 + pxor %xmm1,%xmm4 + movdqa 80-128(%rax),%xmm1 + + movdqa %xmm12,%xmm8 + pxor 176-128(%rax),%xmm4 + paddd %xmm15,%xmm11 + movdqa %xmm13,%xmm7 + pslld $5,%xmm8 + pxor %xmm1,%xmm4 + movdqa %xmm13,%xmm6 + pandn %xmm10,%xmm7 + movdqa %xmm4,%xmm5 + pand %xmm14,%xmm6 + movdqa %xmm12,%xmm9 + psrld $31,%xmm5 + paddd %xmm4,%xmm4 + + movdqa %xmm3,32-128(%rax) + paddd %xmm3,%xmm11 + psrld $27,%xmm9 + pxor %xmm7,%xmm6 + + movdqa %xmm13,%xmm7 + por %xmm9,%xmm8 + pslld $30,%xmm7 + paddd %xmm6,%xmm11 + + psrld $2,%xmm13 + paddd %xmm8,%xmm11 + por %xmm5,%xmm4 + por %xmm7,%xmm13 + pxor %xmm2,%xmm0 + movdqa 96-128(%rax),%xmm2 + + movdqa %xmm11,%xmm8 + pxor 192-128(%rax),%xmm0 + paddd %xmm15,%xmm10 + movdqa %xmm12,%xmm7 + pslld $5,%xmm8 + pxor %xmm2,%xmm0 + movdqa %xmm12,%xmm6 + pandn %xmm14,%xmm7 + movdqa %xmm0,%xmm5 + pand %xmm13,%xmm6 + movdqa %xmm11,%xmm9 + psrld $31,%xmm5 + paddd %xmm0,%xmm0 + + movdqa %xmm4,48-128(%rax) + paddd %xmm4,%xmm10 + psrld $27,%xmm9 + pxor %xmm7,%xmm6 + + movdqa %xmm12,%xmm7 + por %xmm9,%xmm8 + pslld $30,%xmm7 + paddd %xmm6,%xmm10 + + psrld $2,%xmm12 + paddd %xmm8,%xmm10 + por %xmm5,%xmm0 + por %xmm7,%xmm12 + movdqa 0(%rbp),%xmm15 + pxor %xmm3,%xmm1 + movdqa 112-128(%rax),%xmm3 + + movdqa %xmm10,%xmm8 + movdqa %xmm13,%xmm6 + pxor 208-128(%rax),%xmm1 + paddd %xmm15,%xmm14 + pslld $5,%xmm8 + pxor %xmm11,%xmm6 + + movdqa %xmm10,%xmm9 + movdqa %xmm0,64-128(%rax) + paddd %xmm0,%xmm14 + pxor %xmm3,%xmm1 + psrld $27,%xmm9 + pxor %xmm12,%xmm6 + movdqa %xmm11,%xmm7 + + pslld $30,%xmm7 + movdqa %xmm1,%xmm5 + por %xmm9,%xmm8 + psrld $31,%xmm5 + paddd %xmm6,%xmm14 + paddd %xmm1,%xmm1 + + psrld $2,%xmm11 + paddd %xmm8,%xmm14 + por %xmm5,%xmm1 + por %xmm7,%xmm11 + pxor %xmm4,%xmm2 + movdqa 128-128(%rax),%xmm4 + + movdqa %xmm14,%xmm8 + movdqa %xmm12,%xmm6 + pxor 224-128(%rax),%xmm2 + paddd %xmm15,%xmm13 + pslld $5,%xmm8 + pxor %xmm10,%xmm6 + + movdqa %xmm14,%xmm9 + movdqa %xmm1,80-128(%rax) + paddd %xmm1,%xmm13 + pxor %xmm4,%xmm2 + psrld $27,%xmm9 + pxor %xmm11,%xmm6 + movdqa %xmm10,%xmm7 + + pslld $30,%xmm7 + movdqa %xmm2,%xmm5 + por %xmm9,%xmm8 + psrld $31,%xmm5 + paddd %xmm6,%xmm13 + paddd %xmm2,%xmm2 + + psrld $2,%xmm10 + paddd %xmm8,%xmm13 + por %xmm5,%xmm2 + por %xmm7,%xmm10 + pxor %xmm0,%xmm3 + movdqa 144-128(%rax),%xmm0 + + movdqa %xmm13,%xmm8 + movdqa %xmm11,%xmm6 + pxor 240-128(%rax),%xmm3 + paddd %xmm15,%xmm12 + pslld $5,%xmm8 + pxor %xmm14,%xmm6 + + movdqa %xmm13,%xmm9 + movdqa %xmm2,96-128(%rax) + paddd %xmm2,%xmm12 + pxor %xmm0,%xmm3 + psrld $27,%xmm9 + pxor %xmm10,%xmm6 + movdqa %xmm14,%xmm7 + + pslld $30,%xmm7 + movdqa %xmm3,%xmm5 + por %xmm9,%xmm8 + psrld $31,%xmm5 + paddd %xmm6,%xmm12 + paddd %xmm3,%xmm3 + + psrld $2,%xmm14 + paddd %xmm8,%xmm12 + por %xmm5,%xmm3 + por %xmm7,%xmm14 + pxor %xmm1,%xmm4 + movdqa 160-128(%rax),%xmm1 + + movdqa %xmm12,%xmm8 + movdqa %xmm10,%xmm6 + pxor 0-128(%rax),%xmm4 + paddd %xmm15,%xmm11 + pslld $5,%xmm8 + pxor %xmm13,%xmm6 + + movdqa %xmm12,%xmm9 + movdqa %xmm3,112-128(%rax) + paddd %xmm3,%xmm11 + pxor %xmm1,%xmm4 + psrld $27,%xmm9 + pxor %xmm14,%xmm6 + movdqa %xmm13,%xmm7 + + pslld $30,%xmm7 + movdqa %xmm4,%xmm5 + por %xmm9,%xmm8 + psrld $31,%xmm5 + paddd %xmm6,%xmm11 + paddd %xmm4,%xmm4 + + psrld $2,%xmm13 + paddd %xmm8,%xmm11 + por %xmm5,%xmm4 + por %xmm7,%xmm13 + pxor %xmm2,%xmm0 + movdqa 176-128(%rax),%xmm2 + + movdqa %xmm11,%xmm8 + movdqa %xmm14,%xmm6 + pxor 16-128(%rax),%xmm0 + paddd %xmm15,%xmm10 + pslld $5,%xmm8 + pxor %xmm12,%xmm6 + + movdqa %xmm11,%xmm9 + movdqa %xmm4,128-128(%rax) + paddd %xmm4,%xmm10 + pxor %xmm2,%xmm0 + psrld $27,%xmm9 + pxor %xmm13,%xmm6 + movdqa %xmm12,%xmm7 + + pslld $30,%xmm7 + movdqa %xmm0,%xmm5 + por %xmm9,%xmm8 + psrld $31,%xmm5 + paddd %xmm6,%xmm10 + paddd %xmm0,%xmm0 + + psrld $2,%xmm12 + paddd %xmm8,%xmm10 + por %xmm5,%xmm0 + por %xmm7,%xmm12 + pxor %xmm3,%xmm1 + movdqa 192-128(%rax),%xmm3 + + movdqa %xmm10,%xmm8 + movdqa %xmm13,%xmm6 + pxor 32-128(%rax),%xmm1 + paddd %xmm15,%xmm14 + pslld $5,%xmm8 + pxor %xmm11,%xmm6 + + movdqa %xmm10,%xmm9 + movdqa %xmm0,144-128(%rax) + paddd %xmm0,%xmm14 + pxor %xmm3,%xmm1 + psrld $27,%xmm9 + pxor %xmm12,%xmm6 + movdqa %xmm11,%xmm7 + + pslld $30,%xmm7 + movdqa %xmm1,%xmm5 + por %xmm9,%xmm8 + psrld $31,%xmm5 + paddd %xmm6,%xmm14 + paddd %xmm1,%xmm1 + + psrld $2,%xmm11 + paddd %xmm8,%xmm14 + por %xmm5,%xmm1 + por %xmm7,%xmm11 + pxor %xmm4,%xmm2 + movdqa 208-128(%rax),%xmm4 + + movdqa %xmm14,%xmm8 + movdqa %xmm12,%xmm6 + pxor 48-128(%rax),%xmm2 + paddd %xmm15,%xmm13 + pslld $5,%xmm8 + pxor %xmm10,%xmm6 + + movdqa %xmm14,%xmm9 + movdqa %xmm1,160-128(%rax) + paddd %xmm1,%xmm13 + pxor %xmm4,%xmm2 + psrld $27,%xmm9 + pxor %xmm11,%xmm6 + movdqa %xmm10,%xmm7 + + pslld $30,%xmm7 + movdqa %xmm2,%xmm5 + por %xmm9,%xmm8 + psrld $31,%xmm5 + paddd %xmm6,%xmm13 + paddd %xmm2,%xmm2 + + psrld $2,%xmm10 + paddd %xmm8,%xmm13 + por %xmm5,%xmm2 + por %xmm7,%xmm10 + pxor %xmm0,%xmm3 + movdqa 224-128(%rax),%xmm0 + + movdqa %xmm13,%xmm8 + movdqa %xmm11,%xmm6 + pxor 64-128(%rax),%xmm3 + paddd %xmm15,%xmm12 + pslld $5,%xmm8 + pxor %xmm14,%xmm6 + + movdqa %xmm13,%xmm9 + movdqa %xmm2,176-128(%rax) + paddd %xmm2,%xmm12 + pxor %xmm0,%xmm3 + psrld $27,%xmm9 + pxor %xmm10,%xmm6 + movdqa %xmm14,%xmm7 + + pslld $30,%xmm7 + movdqa %xmm3,%xmm5 + por %xmm9,%xmm8 + psrld $31,%xmm5 + paddd %xmm6,%xmm12 + paddd %xmm3,%xmm3 + + psrld $2,%xmm14 + paddd %xmm8,%xmm12 + por %xmm5,%xmm3 + por %xmm7,%xmm14 + pxor %xmm1,%xmm4 + movdqa 240-128(%rax),%xmm1 + + movdqa %xmm12,%xmm8 + movdqa %xmm10,%xmm6 + pxor 80-128(%rax),%xmm4 + paddd %xmm15,%xmm11 + pslld $5,%xmm8 + pxor %xmm13,%xmm6 + + movdqa %xmm12,%xmm9 + movdqa %xmm3,192-128(%rax) + paddd %xmm3,%xmm11 + pxor %xmm1,%xmm4 + psrld $27,%xmm9 + pxor %xmm14,%xmm6 + movdqa %xmm13,%xmm7 + + pslld $30,%xmm7 + movdqa %xmm4,%xmm5 + por %xmm9,%xmm8 + psrld $31,%xmm5 + paddd %xmm6,%xmm11 + paddd %xmm4,%xmm4 + + psrld $2,%xmm13 + paddd %xmm8,%xmm11 + por %xmm5,%xmm4 + por %xmm7,%xmm13 + pxor %xmm2,%xmm0 + movdqa 0-128(%rax),%xmm2 + + movdqa %xmm11,%xmm8 + movdqa %xmm14,%xmm6 + pxor 96-128(%rax),%xmm0 + paddd %xmm15,%xmm10 + pslld $5,%xmm8 + pxor %xmm12,%xmm6 + + movdqa %xmm11,%xmm9 + movdqa %xmm4,208-128(%rax) + paddd %xmm4,%xmm10 + pxor %xmm2,%xmm0 + psrld $27,%xmm9 + pxor %xmm13,%xmm6 + movdqa %xmm12,%xmm7 + + pslld $30,%xmm7 + movdqa %xmm0,%xmm5 + por %xmm9,%xmm8 + psrld $31,%xmm5 + paddd %xmm6,%xmm10 + paddd %xmm0,%xmm0 + + psrld $2,%xmm12 + paddd %xmm8,%xmm10 + por %xmm5,%xmm0 + por %xmm7,%xmm12 + pxor %xmm3,%xmm1 + movdqa 16-128(%rax),%xmm3 + + movdqa %xmm10,%xmm8 + movdqa %xmm13,%xmm6 + pxor 112-128(%rax),%xmm1 + paddd %xmm15,%xmm14 + pslld $5,%xmm8 + pxor %xmm11,%xmm6 + + movdqa %xmm10,%xmm9 + movdqa %xmm0,224-128(%rax) + paddd %xmm0,%xmm14 + pxor %xmm3,%xmm1 + psrld $27,%xmm9 + pxor %xmm12,%xmm6 + movdqa %xmm11,%xmm7 + + pslld $30,%xmm7 + movdqa %xmm1,%xmm5 + por %xmm9,%xmm8 + psrld $31,%xmm5 + paddd %xmm6,%xmm14 + paddd %xmm1,%xmm1 + + psrld $2,%xmm11 + paddd %xmm8,%xmm14 + por %xmm5,%xmm1 + por %xmm7,%xmm11 + pxor %xmm4,%xmm2 + movdqa 32-128(%rax),%xmm4 + + movdqa %xmm14,%xmm8 + movdqa %xmm12,%xmm6 + pxor 128-128(%rax),%xmm2 + paddd %xmm15,%xmm13 + pslld $5,%xmm8 + pxor %xmm10,%xmm6 + + movdqa %xmm14,%xmm9 + movdqa %xmm1,240-128(%rax) + paddd %xmm1,%xmm13 + pxor %xmm4,%xmm2 + psrld $27,%xmm9 + pxor %xmm11,%xmm6 + movdqa %xmm10,%xmm7 + + pslld $30,%xmm7 + movdqa %xmm2,%xmm5 + por %xmm9,%xmm8 + psrld $31,%xmm5 + paddd %xmm6,%xmm13 + paddd %xmm2,%xmm2 + + psrld $2,%xmm10 + paddd %xmm8,%xmm13 + por %xmm5,%xmm2 + por %xmm7,%xmm10 + pxor %xmm0,%xmm3 + movdqa 48-128(%rax),%xmm0 + + movdqa %xmm13,%xmm8 + movdqa %xmm11,%xmm6 + pxor 144-128(%rax),%xmm3 + paddd %xmm15,%xmm12 + pslld $5,%xmm8 + pxor %xmm14,%xmm6 + + movdqa %xmm13,%xmm9 + movdqa %xmm2,0-128(%rax) + paddd %xmm2,%xmm12 + pxor %xmm0,%xmm3 + psrld $27,%xmm9 + pxor %xmm10,%xmm6 + movdqa %xmm14,%xmm7 + + pslld $30,%xmm7 + movdqa %xmm3,%xmm5 + por %xmm9,%xmm8 + psrld $31,%xmm5 + paddd %xmm6,%xmm12 + paddd %xmm3,%xmm3 + + psrld $2,%xmm14 + paddd %xmm8,%xmm12 + por %xmm5,%xmm3 + por %xmm7,%xmm14 + pxor %xmm1,%xmm4 + movdqa 64-128(%rax),%xmm1 + + movdqa %xmm12,%xmm8 + movdqa %xmm10,%xmm6 + pxor 160-128(%rax),%xmm4 + paddd %xmm15,%xmm11 + pslld $5,%xmm8 + pxor %xmm13,%xmm6 + + movdqa %xmm12,%xmm9 + movdqa %xmm3,16-128(%rax) + paddd %xmm3,%xmm11 + pxor %xmm1,%xmm4 + psrld $27,%xmm9 + pxor %xmm14,%xmm6 + movdqa %xmm13,%xmm7 + + pslld $30,%xmm7 + movdqa %xmm4,%xmm5 + por %xmm9,%xmm8 + psrld $31,%xmm5 + paddd %xmm6,%xmm11 + paddd %xmm4,%xmm4 + + psrld $2,%xmm13 + paddd %xmm8,%xmm11 + por %xmm5,%xmm4 + por %xmm7,%xmm13 + pxor %xmm2,%xmm0 + movdqa 80-128(%rax),%xmm2 + + movdqa %xmm11,%xmm8 + movdqa %xmm14,%xmm6 + pxor 176-128(%rax),%xmm0 + paddd %xmm15,%xmm10 + pslld $5,%xmm8 + pxor %xmm12,%xmm6 + + movdqa %xmm11,%xmm9 + movdqa %xmm4,32-128(%rax) + paddd %xmm4,%xmm10 + pxor %xmm2,%xmm0 + psrld $27,%xmm9 + pxor %xmm13,%xmm6 + movdqa %xmm12,%xmm7 + + pslld $30,%xmm7 + movdqa %xmm0,%xmm5 + por %xmm9,%xmm8 + psrld $31,%xmm5 + paddd %xmm6,%xmm10 + paddd %xmm0,%xmm0 + + psrld $2,%xmm12 + paddd %xmm8,%xmm10 + por %xmm5,%xmm0 + por %xmm7,%xmm12 + pxor %xmm3,%xmm1 + movdqa 96-128(%rax),%xmm3 + + movdqa %xmm10,%xmm8 + movdqa %xmm13,%xmm6 + pxor 192-128(%rax),%xmm1 + paddd %xmm15,%xmm14 + pslld $5,%xmm8 + pxor %xmm11,%xmm6 + + movdqa %xmm10,%xmm9 + movdqa %xmm0,48-128(%rax) + paddd %xmm0,%xmm14 + pxor %xmm3,%xmm1 + psrld $27,%xmm9 + pxor %xmm12,%xmm6 + movdqa %xmm11,%xmm7 + + pslld $30,%xmm7 + movdqa %xmm1,%xmm5 + por %xmm9,%xmm8 + psrld $31,%xmm5 + paddd %xmm6,%xmm14 + paddd %xmm1,%xmm1 + + psrld $2,%xmm11 + paddd %xmm8,%xmm14 + por %xmm5,%xmm1 + por %xmm7,%xmm11 + pxor %xmm4,%xmm2 + movdqa 112-128(%rax),%xmm4 + + movdqa %xmm14,%xmm8 + movdqa %xmm12,%xmm6 + pxor 208-128(%rax),%xmm2 + paddd %xmm15,%xmm13 + pslld $5,%xmm8 + pxor %xmm10,%xmm6 + + movdqa %xmm14,%xmm9 + movdqa %xmm1,64-128(%rax) + paddd %xmm1,%xmm13 + pxor %xmm4,%xmm2 + psrld $27,%xmm9 + pxor %xmm11,%xmm6 + movdqa %xmm10,%xmm7 + + pslld $30,%xmm7 + movdqa %xmm2,%xmm5 + por %xmm9,%xmm8 + psrld $31,%xmm5 + paddd %xmm6,%xmm13 + paddd %xmm2,%xmm2 + + psrld $2,%xmm10 + paddd %xmm8,%xmm13 + por %xmm5,%xmm2 + por %xmm7,%xmm10 + pxor %xmm0,%xmm3 + movdqa 128-128(%rax),%xmm0 + + movdqa %xmm13,%xmm8 + movdqa %xmm11,%xmm6 + pxor 224-128(%rax),%xmm3 + paddd %xmm15,%xmm12 + pslld $5,%xmm8 + pxor %xmm14,%xmm6 + + movdqa %xmm13,%xmm9 + movdqa %xmm2,80-128(%rax) + paddd %xmm2,%xmm12 + pxor %xmm0,%xmm3 + psrld $27,%xmm9 + pxor %xmm10,%xmm6 + movdqa %xmm14,%xmm7 + + pslld $30,%xmm7 + movdqa %xmm3,%xmm5 + por %xmm9,%xmm8 + psrld $31,%xmm5 + paddd %xmm6,%xmm12 + paddd %xmm3,%xmm3 + + psrld $2,%xmm14 + paddd %xmm8,%xmm12 + por %xmm5,%xmm3 + por %xmm7,%xmm14 + pxor %xmm1,%xmm4 + movdqa 144-128(%rax),%xmm1 + + movdqa %xmm12,%xmm8 + movdqa %xmm10,%xmm6 + pxor 240-128(%rax),%xmm4 + paddd %xmm15,%xmm11 + pslld $5,%xmm8 + pxor %xmm13,%xmm6 + + movdqa %xmm12,%xmm9 + movdqa %xmm3,96-128(%rax) + paddd %xmm3,%xmm11 + pxor %xmm1,%xmm4 + psrld $27,%xmm9 + pxor %xmm14,%xmm6 + movdqa %xmm13,%xmm7 + + pslld $30,%xmm7 + movdqa %xmm4,%xmm5 + por %xmm9,%xmm8 + psrld $31,%xmm5 + paddd %xmm6,%xmm11 + paddd %xmm4,%xmm4 + + psrld $2,%xmm13 + paddd %xmm8,%xmm11 + por %xmm5,%xmm4 + por %xmm7,%xmm13 + pxor %xmm2,%xmm0 + movdqa 160-128(%rax),%xmm2 + + movdqa %xmm11,%xmm8 + movdqa %xmm14,%xmm6 + pxor 0-128(%rax),%xmm0 + paddd %xmm15,%xmm10 + pslld $5,%xmm8 + pxor %xmm12,%xmm6 + + movdqa %xmm11,%xmm9 + movdqa %xmm4,112-128(%rax) + paddd %xmm4,%xmm10 + pxor %xmm2,%xmm0 + psrld $27,%xmm9 + pxor %xmm13,%xmm6 + movdqa %xmm12,%xmm7 + + pslld $30,%xmm7 + movdqa %xmm0,%xmm5 + por %xmm9,%xmm8 + psrld $31,%xmm5 + paddd %xmm6,%xmm10 + paddd %xmm0,%xmm0 + + psrld $2,%xmm12 + paddd %xmm8,%xmm10 + por %xmm5,%xmm0 + por %xmm7,%xmm12 + movdqa 32(%rbp),%xmm15 + pxor %xmm3,%xmm1 + movdqa 176-128(%rax),%xmm3 + + movdqa %xmm10,%xmm8 + movdqa %xmm13,%xmm7 + pxor 16-128(%rax),%xmm1 + pxor %xmm3,%xmm1 + paddd %xmm15,%xmm14 + pslld $5,%xmm8 + movdqa %xmm10,%xmm9 + pand %xmm12,%xmm7 + + movdqa %xmm13,%xmm6 + movdqa %xmm1,%xmm5 + psrld $27,%xmm9 + paddd %xmm7,%xmm14 + pxor %xmm12,%xmm6 + + movdqa %xmm0,128-128(%rax) + paddd %xmm0,%xmm14 + por %xmm9,%xmm8 + psrld $31,%xmm5 + pand %xmm11,%xmm6 + movdqa %xmm11,%xmm7 + + pslld $30,%xmm7 + paddd %xmm1,%xmm1 + paddd %xmm6,%xmm14 + + psrld $2,%xmm11 + paddd %xmm8,%xmm14 + por %xmm5,%xmm1 + por %xmm7,%xmm11 + pxor %xmm4,%xmm2 + movdqa 192-128(%rax),%xmm4 + + movdqa %xmm14,%xmm8 + movdqa %xmm12,%xmm7 + pxor 32-128(%rax),%xmm2 + pxor %xmm4,%xmm2 + paddd %xmm15,%xmm13 + pslld $5,%xmm8 + movdqa %xmm14,%xmm9 + pand %xmm11,%xmm7 + + movdqa %xmm12,%xmm6 + movdqa %xmm2,%xmm5 + psrld $27,%xmm9 + paddd %xmm7,%xmm13 + pxor %xmm11,%xmm6 + + movdqa %xmm1,144-128(%rax) + paddd %xmm1,%xmm13 + por %xmm9,%xmm8 + psrld $31,%xmm5 + pand %xmm10,%xmm6 + movdqa %xmm10,%xmm7 + + pslld $30,%xmm7 + paddd %xmm2,%xmm2 + paddd %xmm6,%xmm13 + + psrld $2,%xmm10 + paddd %xmm8,%xmm13 + por %xmm5,%xmm2 + por %xmm7,%xmm10 + pxor %xmm0,%xmm3 + movdqa 208-128(%rax),%xmm0 + + movdqa %xmm13,%xmm8 + movdqa %xmm11,%xmm7 + pxor 48-128(%rax),%xmm3 + pxor %xmm0,%xmm3 + paddd %xmm15,%xmm12 + pslld $5,%xmm8 + movdqa %xmm13,%xmm9 + pand %xmm10,%xmm7 + + movdqa %xmm11,%xmm6 + movdqa %xmm3,%xmm5 + psrld $27,%xmm9 + paddd %xmm7,%xmm12 + pxor %xmm10,%xmm6 + + movdqa %xmm2,160-128(%rax) + paddd %xmm2,%xmm12 + por %xmm9,%xmm8 + psrld $31,%xmm5 + pand %xmm14,%xmm6 + movdqa %xmm14,%xmm7 + + pslld $30,%xmm7 + paddd %xmm3,%xmm3 + paddd %xmm6,%xmm12 + + psrld $2,%xmm14 + paddd %xmm8,%xmm12 + por %xmm5,%xmm3 + por %xmm7,%xmm14 + pxor %xmm1,%xmm4 + movdqa 224-128(%rax),%xmm1 + + movdqa %xmm12,%xmm8 + movdqa %xmm10,%xmm7 + pxor 64-128(%rax),%xmm4 + pxor %xmm1,%xmm4 + paddd %xmm15,%xmm11 + pslld $5,%xmm8 + movdqa %xmm12,%xmm9 + pand %xmm14,%xmm7 + + movdqa %xmm10,%xmm6 + movdqa %xmm4,%xmm5 + psrld $27,%xmm9 + paddd %xmm7,%xmm11 + pxor %xmm14,%xmm6 + + movdqa %xmm3,176-128(%rax) + paddd %xmm3,%xmm11 + por %xmm9,%xmm8 + psrld $31,%xmm5 + pand %xmm13,%xmm6 + movdqa %xmm13,%xmm7 + + pslld $30,%xmm7 + paddd %xmm4,%xmm4 + paddd %xmm6,%xmm11 + + psrld $2,%xmm13 + paddd %xmm8,%xmm11 + por %xmm5,%xmm4 + por %xmm7,%xmm13 + pxor %xmm2,%xmm0 + movdqa 240-128(%rax),%xmm2 + + movdqa %xmm11,%xmm8 + movdqa %xmm14,%xmm7 + pxor 80-128(%rax),%xmm0 + pxor %xmm2,%xmm0 + paddd %xmm15,%xmm10 + pslld $5,%xmm8 + movdqa %xmm11,%xmm9 + pand %xmm13,%xmm7 + + movdqa %xmm14,%xmm6 + movdqa %xmm0,%xmm5 + psrld $27,%xmm9 + paddd %xmm7,%xmm10 + pxor %xmm13,%xmm6 + + movdqa %xmm4,192-128(%rax) + paddd %xmm4,%xmm10 + por %xmm9,%xmm8 + psrld $31,%xmm5 + pand %xmm12,%xmm6 + movdqa %xmm12,%xmm7 + + pslld $30,%xmm7 + paddd %xmm0,%xmm0 + paddd %xmm6,%xmm10 + + psrld $2,%xmm12 + paddd %xmm8,%xmm10 + por %xmm5,%xmm0 + por %xmm7,%xmm12 + pxor %xmm3,%xmm1 + movdqa 0-128(%rax),%xmm3 + + movdqa %xmm10,%xmm8 + movdqa %xmm13,%xmm7 + pxor 96-128(%rax),%xmm1 + pxor %xmm3,%xmm1 + paddd %xmm15,%xmm14 + pslld $5,%xmm8 + movdqa %xmm10,%xmm9 + pand %xmm12,%xmm7 + + movdqa %xmm13,%xmm6 + movdqa %xmm1,%xmm5 + psrld $27,%xmm9 + paddd %xmm7,%xmm14 + pxor %xmm12,%xmm6 + + movdqa %xmm0,208-128(%rax) + paddd %xmm0,%xmm14 + por %xmm9,%xmm8 + psrld $31,%xmm5 + pand %xmm11,%xmm6 + movdqa %xmm11,%xmm7 + + pslld $30,%xmm7 + paddd %xmm1,%xmm1 + paddd %xmm6,%xmm14 + + psrld $2,%xmm11 + paddd %xmm8,%xmm14 + por %xmm5,%xmm1 + por %xmm7,%xmm11 + pxor %xmm4,%xmm2 + movdqa 16-128(%rax),%xmm4 + + movdqa %xmm14,%xmm8 + movdqa %xmm12,%xmm7 + pxor 112-128(%rax),%xmm2 + pxor %xmm4,%xmm2 + paddd %xmm15,%xmm13 + pslld $5,%xmm8 + movdqa %xmm14,%xmm9 + pand %xmm11,%xmm7 + + movdqa %xmm12,%xmm6 + movdqa %xmm2,%xmm5 + psrld $27,%xmm9 + paddd %xmm7,%xmm13 + pxor %xmm11,%xmm6 + + movdqa %xmm1,224-128(%rax) + paddd %xmm1,%xmm13 + por %xmm9,%xmm8 + psrld $31,%xmm5 + pand %xmm10,%xmm6 + movdqa %xmm10,%xmm7 + + pslld $30,%xmm7 + paddd %xmm2,%xmm2 + paddd %xmm6,%xmm13 + + psrld $2,%xmm10 + paddd %xmm8,%xmm13 + por %xmm5,%xmm2 + por %xmm7,%xmm10 + pxor %xmm0,%xmm3 + movdqa 32-128(%rax),%xmm0 + + movdqa %xmm13,%xmm8 + movdqa %xmm11,%xmm7 + pxor 128-128(%rax),%xmm3 + pxor %xmm0,%xmm3 + paddd %xmm15,%xmm12 + pslld $5,%xmm8 + movdqa %xmm13,%xmm9 + pand %xmm10,%xmm7 + + movdqa %xmm11,%xmm6 + movdqa %xmm3,%xmm5 + psrld $27,%xmm9 + paddd %xmm7,%xmm12 + pxor %xmm10,%xmm6 + + movdqa %xmm2,240-128(%rax) + paddd %xmm2,%xmm12 + por %xmm9,%xmm8 + psrld $31,%xmm5 + pand %xmm14,%xmm6 + movdqa %xmm14,%xmm7 + + pslld $30,%xmm7 + paddd %xmm3,%xmm3 + paddd %xmm6,%xmm12 + + psrld $2,%xmm14 + paddd %xmm8,%xmm12 + por %xmm5,%xmm3 + por %xmm7,%xmm14 + pxor %xmm1,%xmm4 + movdqa 48-128(%rax),%xmm1 + + movdqa %xmm12,%xmm8 + movdqa %xmm10,%xmm7 + pxor 144-128(%rax),%xmm4 + pxor %xmm1,%xmm4 + paddd %xmm15,%xmm11 + pslld $5,%xmm8 + movdqa %xmm12,%xmm9 + pand %xmm14,%xmm7 + + movdqa %xmm10,%xmm6 + movdqa %xmm4,%xmm5 + psrld $27,%xmm9 + paddd %xmm7,%xmm11 + pxor %xmm14,%xmm6 + + movdqa %xmm3,0-128(%rax) + paddd %xmm3,%xmm11 + por %xmm9,%xmm8 + psrld $31,%xmm5 + pand %xmm13,%xmm6 + movdqa %xmm13,%xmm7 + + pslld $30,%xmm7 + paddd %xmm4,%xmm4 + paddd %xmm6,%xmm11 + + psrld $2,%xmm13 + paddd %xmm8,%xmm11 + por %xmm5,%xmm4 + por %xmm7,%xmm13 + pxor %xmm2,%xmm0 + movdqa 64-128(%rax),%xmm2 + + movdqa %xmm11,%xmm8 + movdqa %xmm14,%xmm7 + pxor 160-128(%rax),%xmm0 + pxor %xmm2,%xmm0 + paddd %xmm15,%xmm10 + pslld $5,%xmm8 + movdqa %xmm11,%xmm9 + pand %xmm13,%xmm7 + + movdqa %xmm14,%xmm6 + movdqa %xmm0,%xmm5 + psrld $27,%xmm9 + paddd %xmm7,%xmm10 + pxor %xmm13,%xmm6 + + movdqa %xmm4,16-128(%rax) + paddd %xmm4,%xmm10 + por %xmm9,%xmm8 + psrld $31,%xmm5 + pand %xmm12,%xmm6 + movdqa %xmm12,%xmm7 + + pslld $30,%xmm7 + paddd %xmm0,%xmm0 + paddd %xmm6,%xmm10 + + psrld $2,%xmm12 + paddd %xmm8,%xmm10 + por %xmm5,%xmm0 + por %xmm7,%xmm12 + pxor %xmm3,%xmm1 + movdqa 80-128(%rax),%xmm3 + + movdqa %xmm10,%xmm8 + movdqa %xmm13,%xmm7 + pxor 176-128(%rax),%xmm1 + pxor %xmm3,%xmm1 + paddd %xmm15,%xmm14 + pslld $5,%xmm8 + movdqa %xmm10,%xmm9 + pand %xmm12,%xmm7 + + movdqa %xmm13,%xmm6 + movdqa %xmm1,%xmm5 + psrld $27,%xmm9 + paddd %xmm7,%xmm14 + pxor %xmm12,%xmm6 + + movdqa %xmm0,32-128(%rax) + paddd %xmm0,%xmm14 + por %xmm9,%xmm8 + psrld $31,%xmm5 + pand %xmm11,%xmm6 + movdqa %xmm11,%xmm7 + + pslld $30,%xmm7 + paddd %xmm1,%xmm1 + paddd %xmm6,%xmm14 + + psrld $2,%xmm11 + paddd %xmm8,%xmm14 + por %xmm5,%xmm1 + por %xmm7,%xmm11 + pxor %xmm4,%xmm2 + movdqa 96-128(%rax),%xmm4 + + movdqa %xmm14,%xmm8 + movdqa %xmm12,%xmm7 + pxor 192-128(%rax),%xmm2 + pxor %xmm4,%xmm2 + paddd %xmm15,%xmm13 + pslld $5,%xmm8 + movdqa %xmm14,%xmm9 + pand %xmm11,%xmm7 + + movdqa %xmm12,%xmm6 + movdqa %xmm2,%xmm5 + psrld $27,%xmm9 + paddd %xmm7,%xmm13 + pxor %xmm11,%xmm6 + + movdqa %xmm1,48-128(%rax) + paddd %xmm1,%xmm13 + por %xmm9,%xmm8 + psrld $31,%xmm5 + pand %xmm10,%xmm6 + movdqa %xmm10,%xmm7 + + pslld $30,%xmm7 + paddd %xmm2,%xmm2 + paddd %xmm6,%xmm13 + + psrld $2,%xmm10 + paddd %xmm8,%xmm13 + por %xmm5,%xmm2 + por %xmm7,%xmm10 + pxor %xmm0,%xmm3 + movdqa 112-128(%rax),%xmm0 + + movdqa %xmm13,%xmm8 + movdqa %xmm11,%xmm7 + pxor 208-128(%rax),%xmm3 + pxor %xmm0,%xmm3 + paddd %xmm15,%xmm12 + pslld $5,%xmm8 + movdqa %xmm13,%xmm9 + pand %xmm10,%xmm7 + + movdqa %xmm11,%xmm6 + movdqa %xmm3,%xmm5 + psrld $27,%xmm9 + paddd %xmm7,%xmm12 + pxor %xmm10,%xmm6 + + movdqa %xmm2,64-128(%rax) + paddd %xmm2,%xmm12 + por %xmm9,%xmm8 + psrld $31,%xmm5 + pand %xmm14,%xmm6 + movdqa %xmm14,%xmm7 + + pslld $30,%xmm7 + paddd %xmm3,%xmm3 + paddd %xmm6,%xmm12 + + psrld $2,%xmm14 + paddd %xmm8,%xmm12 + por %xmm5,%xmm3 + por %xmm7,%xmm14 + pxor %xmm1,%xmm4 + movdqa 128-128(%rax),%xmm1 + + movdqa %xmm12,%xmm8 + movdqa %xmm10,%xmm7 + pxor 224-128(%rax),%xmm4 + pxor %xmm1,%xmm4 + paddd %xmm15,%xmm11 + pslld $5,%xmm8 + movdqa %xmm12,%xmm9 + pand %xmm14,%xmm7 + + movdqa %xmm10,%xmm6 + movdqa %xmm4,%xmm5 + psrld $27,%xmm9 + paddd %xmm7,%xmm11 + pxor %xmm14,%xmm6 + + movdqa %xmm3,80-128(%rax) + paddd %xmm3,%xmm11 + por %xmm9,%xmm8 + psrld $31,%xmm5 + pand %xmm13,%xmm6 + movdqa %xmm13,%xmm7 + + pslld $30,%xmm7 + paddd %xmm4,%xmm4 + paddd %xmm6,%xmm11 + + psrld $2,%xmm13 + paddd %xmm8,%xmm11 + por %xmm5,%xmm4 + por %xmm7,%xmm13 + pxor %xmm2,%xmm0 + movdqa 144-128(%rax),%xmm2 + + movdqa %xmm11,%xmm8 + movdqa %xmm14,%xmm7 + pxor 240-128(%rax),%xmm0 + pxor %xmm2,%xmm0 + paddd %xmm15,%xmm10 + pslld $5,%xmm8 + movdqa %xmm11,%xmm9 + pand %xmm13,%xmm7 + + movdqa %xmm14,%xmm6 + movdqa %xmm0,%xmm5 + psrld $27,%xmm9 + paddd %xmm7,%xmm10 + pxor %xmm13,%xmm6 + + movdqa %xmm4,96-128(%rax) + paddd %xmm4,%xmm10 + por %xmm9,%xmm8 + psrld $31,%xmm5 + pand %xmm12,%xmm6 + movdqa %xmm12,%xmm7 + + pslld $30,%xmm7 + paddd %xmm0,%xmm0 + paddd %xmm6,%xmm10 + + psrld $2,%xmm12 + paddd %xmm8,%xmm10 + por %xmm5,%xmm0 + por %xmm7,%xmm12 + pxor %xmm3,%xmm1 + movdqa 160-128(%rax),%xmm3 + + movdqa %xmm10,%xmm8 + movdqa %xmm13,%xmm7 + pxor 0-128(%rax),%xmm1 + pxor %xmm3,%xmm1 + paddd %xmm15,%xmm14 + pslld $5,%xmm8 + movdqa %xmm10,%xmm9 + pand %xmm12,%xmm7 + + movdqa %xmm13,%xmm6 + movdqa %xmm1,%xmm5 + psrld $27,%xmm9 + paddd %xmm7,%xmm14 + pxor %xmm12,%xmm6 + + movdqa %xmm0,112-128(%rax) + paddd %xmm0,%xmm14 + por %xmm9,%xmm8 + psrld $31,%xmm5 + pand %xmm11,%xmm6 + movdqa %xmm11,%xmm7 + + pslld $30,%xmm7 + paddd %xmm1,%xmm1 + paddd %xmm6,%xmm14 + + psrld $2,%xmm11 + paddd %xmm8,%xmm14 + por %xmm5,%xmm1 + por %xmm7,%xmm11 + pxor %xmm4,%xmm2 + movdqa 176-128(%rax),%xmm4 + + movdqa %xmm14,%xmm8 + movdqa %xmm12,%xmm7 + pxor 16-128(%rax),%xmm2 + pxor %xmm4,%xmm2 + paddd %xmm15,%xmm13 + pslld $5,%xmm8 + movdqa %xmm14,%xmm9 + pand %xmm11,%xmm7 + + movdqa %xmm12,%xmm6 + movdqa %xmm2,%xmm5 + psrld $27,%xmm9 + paddd %xmm7,%xmm13 + pxor %xmm11,%xmm6 + + movdqa %xmm1,128-128(%rax) + paddd %xmm1,%xmm13 + por %xmm9,%xmm8 + psrld $31,%xmm5 + pand %xmm10,%xmm6 + movdqa %xmm10,%xmm7 + + pslld $30,%xmm7 + paddd %xmm2,%xmm2 + paddd %xmm6,%xmm13 + + psrld $2,%xmm10 + paddd %xmm8,%xmm13 + por %xmm5,%xmm2 + por %xmm7,%xmm10 + pxor %xmm0,%xmm3 + movdqa 192-128(%rax),%xmm0 + + movdqa %xmm13,%xmm8 + movdqa %xmm11,%xmm7 + pxor 32-128(%rax),%xmm3 + pxor %xmm0,%xmm3 + paddd %xmm15,%xmm12 + pslld $5,%xmm8 + movdqa %xmm13,%xmm9 + pand %xmm10,%xmm7 + + movdqa %xmm11,%xmm6 + movdqa %xmm3,%xmm5 + psrld $27,%xmm9 + paddd %xmm7,%xmm12 + pxor %xmm10,%xmm6 + + movdqa %xmm2,144-128(%rax) + paddd %xmm2,%xmm12 + por %xmm9,%xmm8 + psrld $31,%xmm5 + pand %xmm14,%xmm6 + movdqa %xmm14,%xmm7 + + pslld $30,%xmm7 + paddd %xmm3,%xmm3 + paddd %xmm6,%xmm12 + + psrld $2,%xmm14 + paddd %xmm8,%xmm12 + por %xmm5,%xmm3 + por %xmm7,%xmm14 + pxor %xmm1,%xmm4 + movdqa 208-128(%rax),%xmm1 + + movdqa %xmm12,%xmm8 + movdqa %xmm10,%xmm7 + pxor 48-128(%rax),%xmm4 + pxor %xmm1,%xmm4 + paddd %xmm15,%xmm11 + pslld $5,%xmm8 + movdqa %xmm12,%xmm9 + pand %xmm14,%xmm7 + + movdqa %xmm10,%xmm6 + movdqa %xmm4,%xmm5 + psrld $27,%xmm9 + paddd %xmm7,%xmm11 + pxor %xmm14,%xmm6 + + movdqa %xmm3,160-128(%rax) + paddd %xmm3,%xmm11 + por %xmm9,%xmm8 + psrld $31,%xmm5 + pand %xmm13,%xmm6 + movdqa %xmm13,%xmm7 + + pslld $30,%xmm7 + paddd %xmm4,%xmm4 + paddd %xmm6,%xmm11 + + psrld $2,%xmm13 + paddd %xmm8,%xmm11 + por %xmm5,%xmm4 + por %xmm7,%xmm13 + pxor %xmm2,%xmm0 + movdqa 224-128(%rax),%xmm2 + + movdqa %xmm11,%xmm8 + movdqa %xmm14,%xmm7 + pxor 64-128(%rax),%xmm0 + pxor %xmm2,%xmm0 + paddd %xmm15,%xmm10 + pslld $5,%xmm8 + movdqa %xmm11,%xmm9 + pand %xmm13,%xmm7 + + movdqa %xmm14,%xmm6 + movdqa %xmm0,%xmm5 + psrld $27,%xmm9 + paddd %xmm7,%xmm10 + pxor %xmm13,%xmm6 + + movdqa %xmm4,176-128(%rax) + paddd %xmm4,%xmm10 + por %xmm9,%xmm8 + psrld $31,%xmm5 + pand %xmm12,%xmm6 + movdqa %xmm12,%xmm7 + + pslld $30,%xmm7 + paddd %xmm0,%xmm0 + paddd %xmm6,%xmm10 + + psrld $2,%xmm12 + paddd %xmm8,%xmm10 + por %xmm5,%xmm0 + por %xmm7,%xmm12 + movdqa 64(%rbp),%xmm15 + pxor %xmm3,%xmm1 + movdqa 240-128(%rax),%xmm3 + + movdqa %xmm10,%xmm8 + movdqa %xmm13,%xmm6 + pxor 80-128(%rax),%xmm1 + paddd %xmm15,%xmm14 + pslld $5,%xmm8 + pxor %xmm11,%xmm6 + + movdqa %xmm10,%xmm9 + movdqa %xmm0,192-128(%rax) + paddd %xmm0,%xmm14 + pxor %xmm3,%xmm1 + psrld $27,%xmm9 + pxor %xmm12,%xmm6 + movdqa %xmm11,%xmm7 + + pslld $30,%xmm7 + movdqa %xmm1,%xmm5 + por %xmm9,%xmm8 + psrld $31,%xmm5 + paddd %xmm6,%xmm14 + paddd %xmm1,%xmm1 + + psrld $2,%xmm11 + paddd %xmm8,%xmm14 + por %xmm5,%xmm1 + por %xmm7,%xmm11 + pxor %xmm4,%xmm2 + movdqa 0-128(%rax),%xmm4 + + movdqa %xmm14,%xmm8 + movdqa %xmm12,%xmm6 + pxor 96-128(%rax),%xmm2 + paddd %xmm15,%xmm13 + pslld $5,%xmm8 + pxor %xmm10,%xmm6 + + movdqa %xmm14,%xmm9 + movdqa %xmm1,208-128(%rax) + paddd %xmm1,%xmm13 + pxor %xmm4,%xmm2 + psrld $27,%xmm9 + pxor %xmm11,%xmm6 + movdqa %xmm10,%xmm7 + + pslld $30,%xmm7 + movdqa %xmm2,%xmm5 + por %xmm9,%xmm8 + psrld $31,%xmm5 + paddd %xmm6,%xmm13 + paddd %xmm2,%xmm2 + + psrld $2,%xmm10 + paddd %xmm8,%xmm13 + por %xmm5,%xmm2 + por %xmm7,%xmm10 + pxor %xmm0,%xmm3 + movdqa 16-128(%rax),%xmm0 + + movdqa %xmm13,%xmm8 + movdqa %xmm11,%xmm6 + pxor 112-128(%rax),%xmm3 + paddd %xmm15,%xmm12 + pslld $5,%xmm8 + pxor %xmm14,%xmm6 + + movdqa %xmm13,%xmm9 + movdqa %xmm2,224-128(%rax) + paddd %xmm2,%xmm12 + pxor %xmm0,%xmm3 + psrld $27,%xmm9 + pxor %xmm10,%xmm6 + movdqa %xmm14,%xmm7 + + pslld $30,%xmm7 + movdqa %xmm3,%xmm5 + por %xmm9,%xmm8 + psrld $31,%xmm5 + paddd %xmm6,%xmm12 + paddd %xmm3,%xmm3 + + psrld $2,%xmm14 + paddd %xmm8,%xmm12 + por %xmm5,%xmm3 + por %xmm7,%xmm14 + pxor %xmm1,%xmm4 + movdqa 32-128(%rax),%xmm1 + + movdqa %xmm12,%xmm8 + movdqa %xmm10,%xmm6 + pxor 128-128(%rax),%xmm4 + paddd %xmm15,%xmm11 + pslld $5,%xmm8 + pxor %xmm13,%xmm6 + + movdqa %xmm12,%xmm9 + movdqa %xmm3,240-128(%rax) + paddd %xmm3,%xmm11 + pxor %xmm1,%xmm4 + psrld $27,%xmm9 + pxor %xmm14,%xmm6 + movdqa %xmm13,%xmm7 + + pslld $30,%xmm7 + movdqa %xmm4,%xmm5 + por %xmm9,%xmm8 + psrld $31,%xmm5 + paddd %xmm6,%xmm11 + paddd %xmm4,%xmm4 + + psrld $2,%xmm13 + paddd %xmm8,%xmm11 + por %xmm5,%xmm4 + por %xmm7,%xmm13 + pxor %xmm2,%xmm0 + movdqa 48-128(%rax),%xmm2 + + movdqa %xmm11,%xmm8 + movdqa %xmm14,%xmm6 + pxor 144-128(%rax),%xmm0 + paddd %xmm15,%xmm10 + pslld $5,%xmm8 + pxor %xmm12,%xmm6 + + movdqa %xmm11,%xmm9 + movdqa %xmm4,0-128(%rax) + paddd %xmm4,%xmm10 + pxor %xmm2,%xmm0 + psrld $27,%xmm9 + pxor %xmm13,%xmm6 + movdqa %xmm12,%xmm7 + + pslld $30,%xmm7 + movdqa %xmm0,%xmm5 + por %xmm9,%xmm8 + psrld $31,%xmm5 + paddd %xmm6,%xmm10 + paddd %xmm0,%xmm0 + + psrld $2,%xmm12 + paddd %xmm8,%xmm10 + por %xmm5,%xmm0 + por %xmm7,%xmm12 + pxor %xmm3,%xmm1 + movdqa 64-128(%rax),%xmm3 + + movdqa %xmm10,%xmm8 + movdqa %xmm13,%xmm6 + pxor 160-128(%rax),%xmm1 + paddd %xmm15,%xmm14 + pslld $5,%xmm8 + pxor %xmm11,%xmm6 + + movdqa %xmm10,%xmm9 + movdqa %xmm0,16-128(%rax) + paddd %xmm0,%xmm14 + pxor %xmm3,%xmm1 + psrld $27,%xmm9 + pxor %xmm12,%xmm6 + movdqa %xmm11,%xmm7 + + pslld $30,%xmm7 + movdqa %xmm1,%xmm5 + por %xmm9,%xmm8 + psrld $31,%xmm5 + paddd %xmm6,%xmm14 + paddd %xmm1,%xmm1 + + psrld $2,%xmm11 + paddd %xmm8,%xmm14 + por %xmm5,%xmm1 + por %xmm7,%xmm11 + pxor %xmm4,%xmm2 + movdqa 80-128(%rax),%xmm4 + + movdqa %xmm14,%xmm8 + movdqa %xmm12,%xmm6 + pxor 176-128(%rax),%xmm2 + paddd %xmm15,%xmm13 + pslld $5,%xmm8 + pxor %xmm10,%xmm6 + + movdqa %xmm14,%xmm9 + movdqa %xmm1,32-128(%rax) + paddd %xmm1,%xmm13 + pxor %xmm4,%xmm2 + psrld $27,%xmm9 + pxor %xmm11,%xmm6 + movdqa %xmm10,%xmm7 + + pslld $30,%xmm7 + movdqa %xmm2,%xmm5 + por %xmm9,%xmm8 + psrld $31,%xmm5 + paddd %xmm6,%xmm13 + paddd %xmm2,%xmm2 + + psrld $2,%xmm10 + paddd %xmm8,%xmm13 + por %xmm5,%xmm2 + por %xmm7,%xmm10 + pxor %xmm0,%xmm3 + movdqa 96-128(%rax),%xmm0 + + movdqa %xmm13,%xmm8 + movdqa %xmm11,%xmm6 + pxor 192-128(%rax),%xmm3 + paddd %xmm15,%xmm12 + pslld $5,%xmm8 + pxor %xmm14,%xmm6 + + movdqa %xmm13,%xmm9 + movdqa %xmm2,48-128(%rax) + paddd %xmm2,%xmm12 + pxor %xmm0,%xmm3 + psrld $27,%xmm9 + pxor %xmm10,%xmm6 + movdqa %xmm14,%xmm7 + + pslld $30,%xmm7 + movdqa %xmm3,%xmm5 + por %xmm9,%xmm8 + psrld $31,%xmm5 + paddd %xmm6,%xmm12 + paddd %xmm3,%xmm3 + + psrld $2,%xmm14 + paddd %xmm8,%xmm12 + por %xmm5,%xmm3 + por %xmm7,%xmm14 + pxor %xmm1,%xmm4 + movdqa 112-128(%rax),%xmm1 + + movdqa %xmm12,%xmm8 + movdqa %xmm10,%xmm6 + pxor 208-128(%rax),%xmm4 + paddd %xmm15,%xmm11 + pslld $5,%xmm8 + pxor %xmm13,%xmm6 + + movdqa %xmm12,%xmm9 + movdqa %xmm3,64-128(%rax) + paddd %xmm3,%xmm11 + pxor %xmm1,%xmm4 + psrld $27,%xmm9 + pxor %xmm14,%xmm6 + movdqa %xmm13,%xmm7 + + pslld $30,%xmm7 + movdqa %xmm4,%xmm5 + por %xmm9,%xmm8 + psrld $31,%xmm5 + paddd %xmm6,%xmm11 + paddd %xmm4,%xmm4 + + psrld $2,%xmm13 + paddd %xmm8,%xmm11 + por %xmm5,%xmm4 + por %xmm7,%xmm13 + pxor %xmm2,%xmm0 + movdqa 128-128(%rax),%xmm2 + + movdqa %xmm11,%xmm8 + movdqa %xmm14,%xmm6 + pxor 224-128(%rax),%xmm0 + paddd %xmm15,%xmm10 + pslld $5,%xmm8 + pxor %xmm12,%xmm6 + + movdqa %xmm11,%xmm9 + movdqa %xmm4,80-128(%rax) + paddd %xmm4,%xmm10 + pxor %xmm2,%xmm0 + psrld $27,%xmm9 + pxor %xmm13,%xmm6 + movdqa %xmm12,%xmm7 + + pslld $30,%xmm7 + movdqa %xmm0,%xmm5 + por %xmm9,%xmm8 + psrld $31,%xmm5 + paddd %xmm6,%xmm10 + paddd %xmm0,%xmm0 + + psrld $2,%xmm12 + paddd %xmm8,%xmm10 + por %xmm5,%xmm0 + por %xmm7,%xmm12 + pxor %xmm3,%xmm1 + movdqa 144-128(%rax),%xmm3 + + movdqa %xmm10,%xmm8 + movdqa %xmm13,%xmm6 + pxor 240-128(%rax),%xmm1 + paddd %xmm15,%xmm14 + pslld $5,%xmm8 + pxor %xmm11,%xmm6 + + movdqa %xmm10,%xmm9 + movdqa %xmm0,96-128(%rax) + paddd %xmm0,%xmm14 + pxor %xmm3,%xmm1 + psrld $27,%xmm9 + pxor %xmm12,%xmm6 + movdqa %xmm11,%xmm7 + + pslld $30,%xmm7 + movdqa %xmm1,%xmm5 + por %xmm9,%xmm8 + psrld $31,%xmm5 + paddd %xmm6,%xmm14 + paddd %xmm1,%xmm1 + + psrld $2,%xmm11 + paddd %xmm8,%xmm14 + por %xmm5,%xmm1 + por %xmm7,%xmm11 + pxor %xmm4,%xmm2 + movdqa 160-128(%rax),%xmm4 + + movdqa %xmm14,%xmm8 + movdqa %xmm12,%xmm6 + pxor 0-128(%rax),%xmm2 + paddd %xmm15,%xmm13 + pslld $5,%xmm8 + pxor %xmm10,%xmm6 + + movdqa %xmm14,%xmm9 + movdqa %xmm1,112-128(%rax) + paddd %xmm1,%xmm13 + pxor %xmm4,%xmm2 + psrld $27,%xmm9 + pxor %xmm11,%xmm6 + movdqa %xmm10,%xmm7 + + pslld $30,%xmm7 + movdqa %xmm2,%xmm5 + por %xmm9,%xmm8 + psrld $31,%xmm5 + paddd %xmm6,%xmm13 + paddd %xmm2,%xmm2 + + psrld $2,%xmm10 + paddd %xmm8,%xmm13 + por %xmm5,%xmm2 + por %xmm7,%xmm10 + pxor %xmm0,%xmm3 + movdqa 176-128(%rax),%xmm0 + + movdqa %xmm13,%xmm8 + movdqa %xmm11,%xmm6 + pxor 16-128(%rax),%xmm3 + paddd %xmm15,%xmm12 + pslld $5,%xmm8 + pxor %xmm14,%xmm6 + + movdqa %xmm13,%xmm9 + paddd %xmm2,%xmm12 + pxor %xmm0,%xmm3 + psrld $27,%xmm9 + pxor %xmm10,%xmm6 + movdqa %xmm14,%xmm7 + + pslld $30,%xmm7 + movdqa %xmm3,%xmm5 + por %xmm9,%xmm8 + psrld $31,%xmm5 + paddd %xmm6,%xmm12 + paddd %xmm3,%xmm3 + + psrld $2,%xmm14 + paddd %xmm8,%xmm12 + por %xmm5,%xmm3 + por %xmm7,%xmm14 + pxor %xmm1,%xmm4 + movdqa 192-128(%rax),%xmm1 + + movdqa %xmm12,%xmm8 + movdqa %xmm10,%xmm6 + pxor 32-128(%rax),%xmm4 + paddd %xmm15,%xmm11 + pslld $5,%xmm8 + pxor %xmm13,%xmm6 + + movdqa %xmm12,%xmm9 + paddd %xmm3,%xmm11 + pxor %xmm1,%xmm4 + psrld $27,%xmm9 + pxor %xmm14,%xmm6 + movdqa %xmm13,%xmm7 + + pslld $30,%xmm7 + movdqa %xmm4,%xmm5 + por %xmm9,%xmm8 + psrld $31,%xmm5 + paddd %xmm6,%xmm11 + paddd %xmm4,%xmm4 + + psrld $2,%xmm13 + paddd %xmm8,%xmm11 + por %xmm5,%xmm4 + por %xmm7,%xmm13 + pxor %xmm2,%xmm0 + movdqa 208-128(%rax),%xmm2 + + movdqa %xmm11,%xmm8 + movdqa %xmm14,%xmm6 + pxor 48-128(%rax),%xmm0 + paddd %xmm15,%xmm10 + pslld $5,%xmm8 + pxor %xmm12,%xmm6 + + movdqa %xmm11,%xmm9 + paddd %xmm4,%xmm10 + pxor %xmm2,%xmm0 + psrld $27,%xmm9 + pxor %xmm13,%xmm6 + movdqa %xmm12,%xmm7 + + pslld $30,%xmm7 + movdqa %xmm0,%xmm5 + por %xmm9,%xmm8 + psrld $31,%xmm5 + paddd %xmm6,%xmm10 + paddd %xmm0,%xmm0 + + psrld $2,%xmm12 + paddd %xmm8,%xmm10 + por %xmm5,%xmm0 + por %xmm7,%xmm12 + pxor %xmm3,%xmm1 + movdqa 224-128(%rax),%xmm3 + + movdqa %xmm10,%xmm8 + movdqa %xmm13,%xmm6 + pxor 64-128(%rax),%xmm1 + paddd %xmm15,%xmm14 + pslld $5,%xmm8 + pxor %xmm11,%xmm6 + + movdqa %xmm10,%xmm9 + paddd %xmm0,%xmm14 + pxor %xmm3,%xmm1 + psrld $27,%xmm9 + pxor %xmm12,%xmm6 + movdqa %xmm11,%xmm7 + + pslld $30,%xmm7 + movdqa %xmm1,%xmm5 + por %xmm9,%xmm8 + psrld $31,%xmm5 + paddd %xmm6,%xmm14 + paddd %xmm1,%xmm1 + + psrld $2,%xmm11 + paddd %xmm8,%xmm14 + por %xmm5,%xmm1 + por %xmm7,%xmm11 + pxor %xmm4,%xmm2 + movdqa 240-128(%rax),%xmm4 + + movdqa %xmm14,%xmm8 + movdqa %xmm12,%xmm6 + pxor 80-128(%rax),%xmm2 + paddd %xmm15,%xmm13 + pslld $5,%xmm8 + pxor %xmm10,%xmm6 + + movdqa %xmm14,%xmm9 + paddd %xmm1,%xmm13 + pxor %xmm4,%xmm2 + psrld $27,%xmm9 + pxor %xmm11,%xmm6 + movdqa %xmm10,%xmm7 + + pslld $30,%xmm7 + movdqa %xmm2,%xmm5 + por %xmm9,%xmm8 + psrld $31,%xmm5 + paddd %xmm6,%xmm13 + paddd %xmm2,%xmm2 + + psrld $2,%xmm10 + paddd %xmm8,%xmm13 + por %xmm5,%xmm2 + por %xmm7,%xmm10 + pxor %xmm0,%xmm3 + movdqa 0-128(%rax),%xmm0 + + movdqa %xmm13,%xmm8 + movdqa %xmm11,%xmm6 + pxor 96-128(%rax),%xmm3 + paddd %xmm15,%xmm12 + pslld $5,%xmm8 + pxor %xmm14,%xmm6 + + movdqa %xmm13,%xmm9 + paddd %xmm2,%xmm12 + pxor %xmm0,%xmm3 + psrld $27,%xmm9 + pxor %xmm10,%xmm6 + movdqa %xmm14,%xmm7 + + pslld $30,%xmm7 + movdqa %xmm3,%xmm5 + por %xmm9,%xmm8 + psrld $31,%xmm5 + paddd %xmm6,%xmm12 + paddd %xmm3,%xmm3 + + psrld $2,%xmm14 + paddd %xmm8,%xmm12 + por %xmm5,%xmm3 + por %xmm7,%xmm14 + pxor %xmm1,%xmm4 + movdqa 16-128(%rax),%xmm1 + + movdqa %xmm12,%xmm8 + movdqa %xmm10,%xmm6 + pxor 112-128(%rax),%xmm4 + paddd %xmm15,%xmm11 + pslld $5,%xmm8 + pxor %xmm13,%xmm6 + + movdqa %xmm12,%xmm9 + paddd %xmm3,%xmm11 + pxor %xmm1,%xmm4 + psrld $27,%xmm9 + pxor %xmm14,%xmm6 + movdqa %xmm13,%xmm7 + + pslld $30,%xmm7 + movdqa %xmm4,%xmm5 + por %xmm9,%xmm8 + psrld $31,%xmm5 + paddd %xmm6,%xmm11 + paddd %xmm4,%xmm4 + + psrld $2,%xmm13 + paddd %xmm8,%xmm11 + por %xmm5,%xmm4 + por %xmm7,%xmm13 + movdqa %xmm11,%xmm8 + paddd %xmm15,%xmm10 + movdqa %xmm14,%xmm6 + pslld $5,%xmm8 + pxor %xmm12,%xmm6 + + movdqa %xmm11,%xmm9 + paddd %xmm4,%xmm10 + psrld $27,%xmm9 + movdqa %xmm12,%xmm7 + pxor %xmm13,%xmm6 + + pslld $30,%xmm7 + por %xmm9,%xmm8 + paddd %xmm6,%xmm10 + + psrld $2,%xmm12 + paddd %xmm8,%xmm10 + por %xmm7,%xmm12 + movdqa (%rbx),%xmm0 + movl $1,%ecx + cmpl 0(%rbx),%ecx + pxor %xmm8,%xmm8 + cmovgeq %rbp,%r8 + cmpl 4(%rbx),%ecx + movdqa %xmm0,%xmm1 + cmovgeq %rbp,%r9 + cmpl 8(%rbx),%ecx + pcmpgtd %xmm8,%xmm1 + cmovgeq %rbp,%r10 + cmpl 12(%rbx),%ecx + paddd %xmm1,%xmm0 + cmovgeq %rbp,%r11 + + movdqu 0(%rdi),%xmm6 + pand %xmm1,%xmm10 + movdqu 32(%rdi),%xmm7 + pand %xmm1,%xmm11 + paddd %xmm6,%xmm10 + movdqu 64(%rdi),%xmm8 + pand %xmm1,%xmm12 + paddd %xmm7,%xmm11 + movdqu 96(%rdi),%xmm9 + pand %xmm1,%xmm13 + paddd %xmm8,%xmm12 + movdqu 128(%rdi),%xmm5 + pand %xmm1,%xmm14 + movdqu %xmm10,0(%rdi) + paddd %xmm9,%xmm13 + movdqu %xmm11,32(%rdi) + paddd %xmm5,%xmm14 + movdqu %xmm12,64(%rdi) + movdqu %xmm13,96(%rdi) + movdqu %xmm14,128(%rdi) + + movdqa %xmm0,(%rbx) + movdqa 96(%rbp),%xmm5 + movdqa -32(%rbp),%xmm15 + decl %edx + jnz .Loop + + movl 280(%rsp),%edx + leaq 16(%rdi),%rdi + leaq 64(%rsi),%rsi + decl %edx + jnz .Loop_grande + +.Ldone: + movq 272(%rsp),%rax + movq -16(%rax),%rbp + movq -8(%rax),%rbx + leaq (%rax),%rsp +.Lepilogue: + .byte 0xf3,0xc3 +.size sha1_multi_block,.-sha1_multi_block +.type sha1_multi_block_shaext,@function +.align 32 +sha1_multi_block_shaext: +_shaext_shortcut: + movq %rsp,%rax + pushq %rbx + pushq %rbp + subq $288,%rsp + shll $1,%edx + andq $-256,%rsp + leaq 64(%rdi),%rdi + movq %rax,272(%rsp) +.Lbody_shaext: + leaq 256(%rsp),%rbx + movdqa K_XX_XX+128(%rip),%xmm3 + +.Loop_grande_shaext: + movl %edx,280(%rsp) + xorl %edx,%edx + movq 0(%rsi),%r8 + movl 8(%rsi),%ecx + cmpl %edx,%ecx + cmovgl %ecx,%edx + testl %ecx,%ecx + movl %ecx,0(%rbx) + cmovleq %rsp,%r8 + movq 16(%rsi),%r9 + movl 24(%rsi),%ecx + cmpl %edx,%ecx + cmovgl %ecx,%edx + testl %ecx,%ecx + movl %ecx,4(%rbx) + cmovleq %rsp,%r9 + testl %edx,%edx + jz .Ldone_shaext + + movq 0-64(%rdi),%xmm0 + movq 32-64(%rdi),%xmm4 + movq 64-64(%rdi),%xmm5 + movq 96-64(%rdi),%xmm6 + movq 128-64(%rdi),%xmm7 + + punpckldq %xmm4,%xmm0 + punpckldq %xmm6,%xmm5 + + movdqa %xmm0,%xmm8 + punpcklqdq %xmm5,%xmm0 + punpckhqdq %xmm5,%xmm8 + + pshufd $63,%xmm7,%xmm1 + pshufd $127,%xmm7,%xmm9 + pshufd $27,%xmm0,%xmm0 + pshufd $27,%xmm8,%xmm8 + jmp .Loop_shaext + +.align 32 +.Loop_shaext: + movdqu 0(%r8),%xmm4 + movdqu 0(%r9),%xmm11 + movdqu 16(%r8),%xmm5 + movdqu 16(%r9),%xmm12 + movdqu 32(%r8),%xmm6 +.byte 102,15,56,0,227 + movdqu 32(%r9),%xmm13 +.byte 102,68,15,56,0,219 + movdqu 48(%r8),%xmm7 + leaq 64(%r8),%r8 +.byte 102,15,56,0,235 + movdqu 48(%r9),%xmm14 + leaq 64(%r9),%r9 +.byte 102,68,15,56,0,227 + + movdqa %xmm1,80(%rsp) + paddd %xmm4,%xmm1 + movdqa %xmm9,112(%rsp) + paddd %xmm11,%xmm9 + movdqa %xmm0,64(%rsp) + movdqa %xmm0,%xmm2 + movdqa %xmm8,96(%rsp) + movdqa %xmm8,%xmm10 +.byte 15,58,204,193,0 +.byte 15,56,200,213 +.byte 69,15,58,204,193,0 +.byte 69,15,56,200,212 +.byte 102,15,56,0,243 + prefetcht0 127(%r8) +.byte 15,56,201,229 +.byte 102,68,15,56,0,235 + prefetcht0 127(%r9) +.byte 69,15,56,201,220 + +.byte 102,15,56,0,251 + movdqa %xmm0,%xmm1 +.byte 102,68,15,56,0,243 + movdqa %xmm8,%xmm9 +.byte 15,58,204,194,0 +.byte 15,56,200,206 +.byte 69,15,58,204,194,0 +.byte 69,15,56,200,205 + pxor %xmm6,%xmm4 +.byte 15,56,201,238 + pxor %xmm13,%xmm11 +.byte 69,15,56,201,229 + movdqa %xmm0,%xmm2 + movdqa %xmm8,%xmm10 +.byte 15,58,204,193,0 +.byte 15,56,200,215 +.byte 69,15,58,204,193,0 +.byte 69,15,56,200,214 +.byte 15,56,202,231 +.byte 69,15,56,202,222 + pxor %xmm7,%xmm5 +.byte 15,56,201,247 + pxor %xmm14,%xmm12 +.byte 69,15,56,201,238 + movdqa %xmm0,%xmm1 + movdqa %xmm8,%xmm9 +.byte 15,58,204,194,0 +.byte 15,56,200,204 +.byte 69,15,58,204,194,0 +.byte 69,15,56,200,203 +.byte 15,56,202,236 +.byte 69,15,56,202,227 + pxor %xmm4,%xmm6 +.byte 15,56,201,252 + pxor %xmm11,%xmm13 +.byte 69,15,56,201,243 + movdqa %xmm0,%xmm2 + movdqa %xmm8,%xmm10 +.byte 15,58,204,193,0 +.byte 15,56,200,213 +.byte 69,15,58,204,193,0 +.byte 69,15,56,200,212 +.byte 15,56,202,245 +.byte 69,15,56,202,236 + pxor %xmm5,%xmm7 +.byte 15,56,201,229 + pxor %xmm12,%xmm14 +.byte 69,15,56,201,220 + movdqa %xmm0,%xmm1 + movdqa %xmm8,%xmm9 +.byte 15,58,204,194,1 +.byte 15,56,200,206 +.byte 69,15,58,204,194,1 +.byte 69,15,56,200,205 +.byte 15,56,202,254 +.byte 69,15,56,202,245 + pxor %xmm6,%xmm4 +.byte 15,56,201,238 + pxor %xmm13,%xmm11 +.byte 69,15,56,201,229 + movdqa %xmm0,%xmm2 + movdqa %xmm8,%xmm10 +.byte 15,58,204,193,1 +.byte 15,56,200,215 +.byte 69,15,58,204,193,1 +.byte 69,15,56,200,214 +.byte 15,56,202,231 +.byte 69,15,56,202,222 + pxor %xmm7,%xmm5 +.byte 15,56,201,247 + pxor %xmm14,%xmm12 +.byte 69,15,56,201,238 + movdqa %xmm0,%xmm1 + movdqa %xmm8,%xmm9 +.byte 15,58,204,194,1 +.byte 15,56,200,204 +.byte 69,15,58,204,194,1 +.byte 69,15,56,200,203 +.byte 15,56,202,236 +.byte 69,15,56,202,227 + pxor %xmm4,%xmm6 +.byte 15,56,201,252 + pxor %xmm11,%xmm13 +.byte 69,15,56,201,243 + movdqa %xmm0,%xmm2 + movdqa %xmm8,%xmm10 +.byte 15,58,204,193,1 +.byte 15,56,200,213 +.byte 69,15,58,204,193,1 +.byte 69,15,56,200,212 +.byte 15,56,202,245 +.byte 69,15,56,202,236 + pxor %xmm5,%xmm7 +.byte 15,56,201,229 + pxor %xmm12,%xmm14 +.byte 69,15,56,201,220 + movdqa %xmm0,%xmm1 + movdqa %xmm8,%xmm9 +.byte 15,58,204,194,1 +.byte 15,56,200,206 +.byte 69,15,58,204,194,1 +.byte 69,15,56,200,205 +.byte 15,56,202,254 +.byte 69,15,56,202,245 + pxor %xmm6,%xmm4 +.byte 15,56,201,238 + pxor %xmm13,%xmm11 +.byte 69,15,56,201,229 + movdqa %xmm0,%xmm2 + movdqa %xmm8,%xmm10 +.byte 15,58,204,193,2 +.byte 15,56,200,215 +.byte 69,15,58,204,193,2 +.byte 69,15,56,200,214 +.byte 15,56,202,231 +.byte 69,15,56,202,222 + pxor %xmm7,%xmm5 +.byte 15,56,201,247 + pxor %xmm14,%xmm12 +.byte 69,15,56,201,238 + movdqa %xmm0,%xmm1 + movdqa %xmm8,%xmm9 +.byte 15,58,204,194,2 +.byte 15,56,200,204 +.byte 69,15,58,204,194,2 +.byte 69,15,56,200,203 +.byte 15,56,202,236 +.byte 69,15,56,202,227 + pxor %xmm4,%xmm6 +.byte 15,56,201,252 + pxor %xmm11,%xmm13 +.byte 69,15,56,201,243 + movdqa %xmm0,%xmm2 + movdqa %xmm8,%xmm10 +.byte 15,58,204,193,2 +.byte 15,56,200,213 +.byte 69,15,58,204,193,2 +.byte 69,15,56,200,212 +.byte 15,56,202,245 +.byte 69,15,56,202,236 + pxor %xmm5,%xmm7 +.byte 15,56,201,229 + pxor %xmm12,%xmm14 +.byte 69,15,56,201,220 + movdqa %xmm0,%xmm1 + movdqa %xmm8,%xmm9 +.byte 15,58,204,194,2 +.byte 15,56,200,206 +.byte 69,15,58,204,194,2 +.byte 69,15,56,200,205 +.byte 15,56,202,254 +.byte 69,15,56,202,245 + pxor %xmm6,%xmm4 +.byte 15,56,201,238 + pxor %xmm13,%xmm11 +.byte 69,15,56,201,229 + movdqa %xmm0,%xmm2 + movdqa %xmm8,%xmm10 +.byte 15,58,204,193,2 +.byte 15,56,200,215 +.byte 69,15,58,204,193,2 +.byte 69,15,56,200,214 +.byte 15,56,202,231 +.byte 69,15,56,202,222 + pxor %xmm7,%xmm5 +.byte 15,56,201,247 + pxor %xmm14,%xmm12 +.byte 69,15,56,201,238 + movdqa %xmm0,%xmm1 + movdqa %xmm8,%xmm9 +.byte 15,58,204,194,3 +.byte 15,56,200,204 +.byte 69,15,58,204,194,3 +.byte 69,15,56,200,203 +.byte 15,56,202,236 +.byte 69,15,56,202,227 + pxor %xmm4,%xmm6 +.byte 15,56,201,252 + pxor %xmm11,%xmm13 +.byte 69,15,56,201,243 + movdqa %xmm0,%xmm2 + movdqa %xmm8,%xmm10 +.byte 15,58,204,193,3 +.byte 15,56,200,213 +.byte 69,15,58,204,193,3 +.byte 69,15,56,200,212 +.byte 15,56,202,245 +.byte 69,15,56,202,236 + pxor %xmm5,%xmm7 + pxor %xmm12,%xmm14 + + movl $1,%ecx + pxor %xmm4,%xmm4 + cmpl 0(%rbx),%ecx + cmovgeq %rsp,%r8 + + movdqa %xmm0,%xmm1 + movdqa %xmm8,%xmm9 +.byte 15,58,204,194,3 +.byte 15,56,200,206 +.byte 69,15,58,204,194,3 +.byte 69,15,56,200,205 +.byte 15,56,202,254 +.byte 69,15,56,202,245 + + cmpl 4(%rbx),%ecx + cmovgeq %rsp,%r9 + movq (%rbx),%xmm6 + + movdqa %xmm0,%xmm2 + movdqa %xmm8,%xmm10 +.byte 15,58,204,193,3 +.byte 15,56,200,215 +.byte 69,15,58,204,193,3 +.byte 69,15,56,200,214 + + pshufd $0,%xmm6,%xmm11 + pshufd $85,%xmm6,%xmm12 + movdqa %xmm6,%xmm7 + pcmpgtd %xmm4,%xmm11 + pcmpgtd %xmm4,%xmm12 + + movdqa %xmm0,%xmm1 + movdqa %xmm8,%xmm9 +.byte 15,58,204,194,3 +.byte 15,56,200,204 +.byte 69,15,58,204,194,3 +.byte 68,15,56,200,204 + + pcmpgtd %xmm4,%xmm7 + pand %xmm11,%xmm0 + pand %xmm11,%xmm1 + pand %xmm12,%xmm8 + pand %xmm12,%xmm9 + paddd %xmm7,%xmm6 + + paddd 64(%rsp),%xmm0 + paddd 80(%rsp),%xmm1 + paddd 96(%rsp),%xmm8 + paddd 112(%rsp),%xmm9 + + movq %xmm6,(%rbx) + decl %edx + jnz .Loop_shaext + + movl 280(%rsp),%edx + + pshufd $27,%xmm0,%xmm0 + pshufd $27,%xmm8,%xmm8 + + movdqa %xmm0,%xmm6 + punpckldq %xmm8,%xmm0 + punpckhdq %xmm8,%xmm6 + punpckhdq %xmm9,%xmm1 + movq %xmm0,0-64(%rdi) + psrldq $8,%xmm0 + movq %xmm6,64-64(%rdi) + psrldq $8,%xmm6 + movq %xmm0,32-64(%rdi) + psrldq $8,%xmm1 + movq %xmm6,96-64(%rdi) + movq %xmm1,128-64(%rdi) + + leaq 8(%rdi),%rdi + leaq 32(%rsi),%rsi + decl %edx + jnz .Loop_grande_shaext + +.Ldone_shaext: + + movq -16(%rax),%rbp + movq -8(%rax),%rbx + leaq (%rax),%rsp +.Lepilogue_shaext: + .byte 0xf3,0xc3 +.size sha1_multi_block_shaext,.-sha1_multi_block_shaext + +.align 256 +.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 +.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 +K_XX_XX: +.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 +.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 +.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc +.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc +.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 +.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 +.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f +.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f +.byte 0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0 +.byte 83,72,65,49,32,109,117,108,116,105,45,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 diff --git a/secure/lib/libcrypto/amd64/sha1-x86_64.S b/secure/lib/libcrypto/amd64/sha1-x86_64.S new file mode 100644 index 0000000..25c27e5 --- /dev/null +++ b/secure/lib/libcrypto/amd64/sha1-x86_64.S @@ -0,0 +1,2592 @@ + # $FreeBSD$ +.text + + +.globl sha1_block_data_order +.type sha1_block_data_order,@function +.align 16 +sha1_block_data_order: + movl OPENSSL_ia32cap_P+0(%rip),%r9d + movl OPENSSL_ia32cap_P+4(%rip),%r8d + movl OPENSSL_ia32cap_P+8(%rip),%r10d + testl $512,%r8d + jz .Lialu + testl $536870912,%r10d + jnz _shaext_shortcut + jmp _ssse3_shortcut + +.align 16 +.Lialu: + movq %rsp,%rax + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + movq %rdi,%r8 + subq $72,%rsp + movq %rsi,%r9 + andq $-64,%rsp + movq %rdx,%r10 + movq %rax,64(%rsp) +.Lprologue: + + movl 0(%r8),%esi + movl 4(%r8),%edi + movl 8(%r8),%r11d + movl 12(%r8),%r12d + movl 16(%r8),%r13d + jmp .Lloop + +.align 16 +.Lloop: + movl 0(%r9),%edx + bswapl %edx + movl 4(%r9),%ebp + movl %r12d,%eax + movl %edx,0(%rsp) + movl %esi,%ecx + bswapl %ebp + xorl %r11d,%eax + roll $5,%ecx + andl %edi,%eax + leal 1518500249(%rdx,%r13,1),%r13d + addl %ecx,%r13d + xorl %r12d,%eax + roll $30,%edi + addl %eax,%r13d + movl 8(%r9),%r14d + movl %r11d,%eax + movl %ebp,4(%rsp) + movl %r13d,%ecx + bswapl %r14d + xorl %edi,%eax + roll $5,%ecx + andl %esi,%eax + leal 1518500249(%rbp,%r12,1),%r12d + addl %ecx,%r12d + xorl %r11d,%eax + roll $30,%esi + addl %eax,%r12d + movl 12(%r9),%edx + movl %edi,%eax + movl %r14d,8(%rsp) + movl %r12d,%ecx + bswapl %edx + xorl %esi,%eax + roll $5,%ecx + andl %r13d,%eax + leal 1518500249(%r14,%r11,1),%r11d + addl %ecx,%r11d + xorl %edi,%eax + roll $30,%r13d + addl %eax,%r11d + movl 16(%r9),%ebp + movl %esi,%eax + movl %edx,12(%rsp) + movl %r11d,%ecx + bswapl %ebp + xorl %r13d,%eax + roll $5,%ecx + andl %r12d,%eax + leal 1518500249(%rdx,%rdi,1),%edi + addl %ecx,%edi + xorl %esi,%eax + roll $30,%r12d + addl %eax,%edi + movl 20(%r9),%r14d + movl %r13d,%eax + movl %ebp,16(%rsp) + movl %edi,%ecx + bswapl %r14d + xorl %r12d,%eax + roll $5,%ecx + andl %r11d,%eax + leal 1518500249(%rbp,%rsi,1),%esi + addl %ecx,%esi + xorl %r13d,%eax + roll $30,%r11d + addl %eax,%esi + movl 24(%r9),%edx + movl %r12d,%eax + movl %r14d,20(%rsp) + movl %esi,%ecx + bswapl %edx + xorl %r11d,%eax + roll $5,%ecx + andl %edi,%eax + leal 1518500249(%r14,%r13,1),%r13d + addl %ecx,%r13d + xorl %r12d,%eax + roll $30,%edi + addl %eax,%r13d + movl 28(%r9),%ebp + movl %r11d,%eax + movl %edx,24(%rsp) + movl %r13d,%ecx + bswapl %ebp + xorl %edi,%eax + roll $5,%ecx + andl %esi,%eax + leal 1518500249(%rdx,%r12,1),%r12d + addl %ecx,%r12d + xorl %r11d,%eax + roll $30,%esi + addl %eax,%r12d + movl 32(%r9),%r14d + movl %edi,%eax + movl %ebp,28(%rsp) + movl %r12d,%ecx + bswapl %r14d + xorl %esi,%eax + roll $5,%ecx + andl %r13d,%eax + leal 1518500249(%rbp,%r11,1),%r11d + addl %ecx,%r11d + xorl %edi,%eax + roll $30,%r13d + addl %eax,%r11d + movl 36(%r9),%edx + movl %esi,%eax + movl %r14d,32(%rsp) + movl %r11d,%ecx + bswapl %edx + xorl %r13d,%eax + roll $5,%ecx + andl %r12d,%eax + leal 1518500249(%r14,%rdi,1),%edi + addl %ecx,%edi + xorl %esi,%eax + roll $30,%r12d + addl %eax,%edi + movl 40(%r9),%ebp + movl %r13d,%eax + movl %edx,36(%rsp) + movl %edi,%ecx + bswapl %ebp + xorl %r12d,%eax + roll $5,%ecx + andl %r11d,%eax + leal 1518500249(%rdx,%rsi,1),%esi + addl %ecx,%esi + xorl %r13d,%eax + roll $30,%r11d + addl %eax,%esi + movl 44(%r9),%r14d + movl %r12d,%eax + movl %ebp,40(%rsp) + movl %esi,%ecx + bswapl %r14d + xorl %r11d,%eax + roll $5,%ecx + andl %edi,%eax + leal 1518500249(%rbp,%r13,1),%r13d + addl %ecx,%r13d + xorl %r12d,%eax + roll $30,%edi + addl %eax,%r13d + movl 48(%r9),%edx + movl %r11d,%eax + movl %r14d,44(%rsp) + movl %r13d,%ecx + bswapl %edx + xorl %edi,%eax + roll $5,%ecx + andl %esi,%eax + leal 1518500249(%r14,%r12,1),%r12d + addl %ecx,%r12d + xorl %r11d,%eax + roll $30,%esi + addl %eax,%r12d + movl 52(%r9),%ebp + movl %edi,%eax + movl %edx,48(%rsp) + movl %r12d,%ecx + bswapl %ebp + xorl %esi,%eax + roll $5,%ecx + andl %r13d,%eax + leal 1518500249(%rdx,%r11,1),%r11d + addl %ecx,%r11d + xorl %edi,%eax + roll $30,%r13d + addl %eax,%r11d + movl 56(%r9),%r14d + movl %esi,%eax + movl %ebp,52(%rsp) + movl %r11d,%ecx + bswapl %r14d + xorl %r13d,%eax + roll $5,%ecx + andl %r12d,%eax + leal 1518500249(%rbp,%rdi,1),%edi + addl %ecx,%edi + xorl %esi,%eax + roll $30,%r12d + addl %eax,%edi + movl 60(%r9),%edx + movl %r13d,%eax + movl %r14d,56(%rsp) + movl %edi,%ecx + bswapl %edx + xorl %r12d,%eax + roll $5,%ecx + andl %r11d,%eax + leal 1518500249(%r14,%rsi,1),%esi + addl %ecx,%esi + xorl %r13d,%eax + roll $30,%r11d + addl %eax,%esi + xorl 0(%rsp),%ebp + movl %r12d,%eax + movl %edx,60(%rsp) + movl %esi,%ecx + xorl 8(%rsp),%ebp + xorl %r11d,%eax + roll $5,%ecx + xorl 32(%rsp),%ebp + andl %edi,%eax + leal 1518500249(%rdx,%r13,1),%r13d + roll $30,%edi + xorl %r12d,%eax + addl %ecx,%r13d + roll $1,%ebp + addl %eax,%r13d + xorl 4(%rsp),%r14d + movl %r11d,%eax + movl %ebp,0(%rsp) + movl %r13d,%ecx + xorl 12(%rsp),%r14d + xorl %edi,%eax + roll $5,%ecx + xorl 36(%rsp),%r14d + andl %esi,%eax + leal 1518500249(%rbp,%r12,1),%r12d + roll $30,%esi + xorl %r11d,%eax + addl %ecx,%r12d + roll $1,%r14d + addl %eax,%r12d + xorl 8(%rsp),%edx + movl %edi,%eax + movl %r14d,4(%rsp) + movl %r12d,%ecx + xorl 16(%rsp),%edx + xorl %esi,%eax + roll $5,%ecx + xorl 40(%rsp),%edx + andl %r13d,%eax + leal 1518500249(%r14,%r11,1),%r11d + roll $30,%r13d + xorl %edi,%eax + addl %ecx,%r11d + roll $1,%edx + addl %eax,%r11d + xorl 12(%rsp),%ebp + movl %esi,%eax + movl %edx,8(%rsp) + movl %r11d,%ecx + xorl 20(%rsp),%ebp + xorl %r13d,%eax + roll $5,%ecx + xorl 44(%rsp),%ebp + andl %r12d,%eax + leal 1518500249(%rdx,%rdi,1),%edi + roll $30,%r12d + xorl %esi,%eax + addl %ecx,%edi + roll $1,%ebp + addl %eax,%edi + xorl 16(%rsp),%r14d + movl %r13d,%eax + movl %ebp,12(%rsp) + movl %edi,%ecx + xorl 24(%rsp),%r14d + xorl %r12d,%eax + roll $5,%ecx + xorl 48(%rsp),%r14d + andl %r11d,%eax + leal 1518500249(%rbp,%rsi,1),%esi + roll $30,%r11d + xorl %r13d,%eax + addl %ecx,%esi + roll $1,%r14d + addl %eax,%esi + xorl 20(%rsp),%edx + movl %edi,%eax + movl %r14d,16(%rsp) + movl %esi,%ecx + xorl 28(%rsp),%edx + xorl %r12d,%eax + roll $5,%ecx + xorl 52(%rsp),%edx + leal 1859775393(%r14,%r13,1),%r13d + xorl %r11d,%eax + addl %ecx,%r13d + roll $30,%edi + addl %eax,%r13d + roll $1,%edx + xorl 24(%rsp),%ebp + movl %esi,%eax + movl %edx,20(%rsp) + movl %r13d,%ecx + xorl 32(%rsp),%ebp + xorl %r11d,%eax + roll $5,%ecx + xorl 56(%rsp),%ebp + leal 1859775393(%rdx,%r12,1),%r12d + xorl %edi,%eax + addl %ecx,%r12d + roll $30,%esi + addl %eax,%r12d + roll $1,%ebp + xorl 28(%rsp),%r14d + movl %r13d,%eax + movl %ebp,24(%rsp) + movl %r12d,%ecx + xorl 36(%rsp),%r14d + xorl %edi,%eax + roll $5,%ecx + xorl 60(%rsp),%r14d + leal 1859775393(%rbp,%r11,1),%r11d + xorl %esi,%eax + addl %ecx,%r11d + roll $30,%r13d + addl %eax,%r11d + roll $1,%r14d + xorl 32(%rsp),%edx + movl %r12d,%eax + movl %r14d,28(%rsp) + movl %r11d,%ecx + xorl 40(%rsp),%edx + xorl %esi,%eax + roll $5,%ecx + xorl 0(%rsp),%edx + leal 1859775393(%r14,%rdi,1),%edi + xorl %r13d,%eax + addl %ecx,%edi + roll $30,%r12d + addl %eax,%edi + roll $1,%edx + xorl 36(%rsp),%ebp + movl %r11d,%eax + movl %edx,32(%rsp) + movl %edi,%ecx + xorl 44(%rsp),%ebp + xorl %r13d,%eax + roll $5,%ecx + xorl 4(%rsp),%ebp + leal 1859775393(%rdx,%rsi,1),%esi + xorl %r12d,%eax + addl %ecx,%esi + roll $30,%r11d + addl %eax,%esi + roll $1,%ebp + xorl 40(%rsp),%r14d + movl %edi,%eax + movl %ebp,36(%rsp) + movl %esi,%ecx + xorl 48(%rsp),%r14d + xorl %r12d,%eax + roll $5,%ecx + xorl 8(%rsp),%r14d + leal 1859775393(%rbp,%r13,1),%r13d + xorl %r11d,%eax + addl %ecx,%r13d + roll $30,%edi + addl %eax,%r13d + roll $1,%r14d + xorl 44(%rsp),%edx + movl %esi,%eax + movl %r14d,40(%rsp) + movl %r13d,%ecx + xorl 52(%rsp),%edx + xorl %r11d,%eax + roll $5,%ecx + xorl 12(%rsp),%edx + leal 1859775393(%r14,%r12,1),%r12d + xorl %edi,%eax + addl %ecx,%r12d + roll $30,%esi + addl %eax,%r12d + roll $1,%edx + xorl 48(%rsp),%ebp + movl %r13d,%eax + movl %edx,44(%rsp) + movl %r12d,%ecx + xorl 56(%rsp),%ebp + xorl %edi,%eax + roll $5,%ecx + xorl 16(%rsp),%ebp + leal 1859775393(%rdx,%r11,1),%r11d + xorl %esi,%eax + addl %ecx,%r11d + roll $30,%r13d + addl %eax,%r11d + roll $1,%ebp + xorl 52(%rsp),%r14d + movl %r12d,%eax + movl %ebp,48(%rsp) + movl %r11d,%ecx + xorl 60(%rsp),%r14d + xorl %esi,%eax + roll $5,%ecx + xorl 20(%rsp),%r14d + leal 1859775393(%rbp,%rdi,1),%edi + xorl %r13d,%eax + addl %ecx,%edi + roll $30,%r12d + addl %eax,%edi + roll $1,%r14d + xorl 56(%rsp),%edx + movl %r11d,%eax + movl %r14d,52(%rsp) + movl %edi,%ecx + xorl 0(%rsp),%edx + xorl %r13d,%eax + roll $5,%ecx + xorl 24(%rsp),%edx + leal 1859775393(%r14,%rsi,1),%esi + xorl %r12d,%eax + addl %ecx,%esi + roll $30,%r11d + addl %eax,%esi + roll $1,%edx + xorl 60(%rsp),%ebp + movl %edi,%eax + movl %edx,56(%rsp) + movl %esi,%ecx + xorl 4(%rsp),%ebp + xorl %r12d,%eax + roll $5,%ecx + xorl 28(%rsp),%ebp + leal 1859775393(%rdx,%r13,1),%r13d + xorl %r11d,%eax + addl %ecx,%r13d + roll $30,%edi + addl %eax,%r13d + roll $1,%ebp + xorl 0(%rsp),%r14d + movl %esi,%eax + movl %ebp,60(%rsp) + movl %r13d,%ecx + xorl 8(%rsp),%r14d + xorl %r11d,%eax + roll $5,%ecx + xorl 32(%rsp),%r14d + leal 1859775393(%rbp,%r12,1),%r12d + xorl %edi,%eax + addl %ecx,%r12d + roll $30,%esi + addl %eax,%r12d + roll $1,%r14d + xorl 4(%rsp),%edx + movl %r13d,%eax + movl %r14d,0(%rsp) + movl %r12d,%ecx + xorl 12(%rsp),%edx + xorl %edi,%eax + roll $5,%ecx + xorl 36(%rsp),%edx + leal 1859775393(%r14,%r11,1),%r11d + xorl %esi,%eax + addl %ecx,%r11d + roll $30,%r13d + addl %eax,%r11d + roll $1,%edx + xorl 8(%rsp),%ebp + movl %r12d,%eax + movl %edx,4(%rsp) + movl %r11d,%ecx + xorl 16(%rsp),%ebp + xorl %esi,%eax + roll $5,%ecx + xorl 40(%rsp),%ebp + leal 1859775393(%rdx,%rdi,1),%edi + xorl %r13d,%eax + addl %ecx,%edi + roll $30,%r12d + addl %eax,%edi + roll $1,%ebp + xorl 12(%rsp),%r14d + movl %r11d,%eax + movl %ebp,8(%rsp) + movl %edi,%ecx + xorl 20(%rsp),%r14d + xorl %r13d,%eax + roll $5,%ecx + xorl 44(%rsp),%r14d + leal 1859775393(%rbp,%rsi,1),%esi + xorl %r12d,%eax + addl %ecx,%esi + roll $30,%r11d + addl %eax,%esi + roll $1,%r14d + xorl 16(%rsp),%edx + movl %edi,%eax + movl %r14d,12(%rsp) + movl %esi,%ecx + xorl 24(%rsp),%edx + xorl %r12d,%eax + roll $5,%ecx + xorl 48(%rsp),%edx + leal 1859775393(%r14,%r13,1),%r13d + xorl %r11d,%eax + addl %ecx,%r13d + roll $30,%edi + addl %eax,%r13d + roll $1,%edx + xorl 20(%rsp),%ebp + movl %esi,%eax + movl %edx,16(%rsp) + movl %r13d,%ecx + xorl 28(%rsp),%ebp + xorl %r11d,%eax + roll $5,%ecx + xorl 52(%rsp),%ebp + leal 1859775393(%rdx,%r12,1),%r12d + xorl %edi,%eax + addl %ecx,%r12d + roll $30,%esi + addl %eax,%r12d + roll $1,%ebp + xorl 24(%rsp),%r14d + movl %r13d,%eax + movl %ebp,20(%rsp) + movl %r12d,%ecx + xorl 32(%rsp),%r14d + xorl %edi,%eax + roll $5,%ecx + xorl 56(%rsp),%r14d + leal 1859775393(%rbp,%r11,1),%r11d + xorl %esi,%eax + addl %ecx,%r11d + roll $30,%r13d + addl %eax,%r11d + roll $1,%r14d + xorl 28(%rsp),%edx + movl %r12d,%eax + movl %r14d,24(%rsp) + movl %r11d,%ecx + xorl 36(%rsp),%edx + xorl %esi,%eax + roll $5,%ecx + xorl 60(%rsp),%edx + leal 1859775393(%r14,%rdi,1),%edi + xorl %r13d,%eax + addl %ecx,%edi + roll $30,%r12d + addl %eax,%edi + roll $1,%edx + xorl 32(%rsp),%ebp + movl %r11d,%eax + movl %edx,28(%rsp) + movl %edi,%ecx + xorl 40(%rsp),%ebp + xorl %r13d,%eax + roll $5,%ecx + xorl 0(%rsp),%ebp + leal 1859775393(%rdx,%rsi,1),%esi + xorl %r12d,%eax + addl %ecx,%esi + roll $30,%r11d + addl %eax,%esi + roll $1,%ebp + xorl 36(%rsp),%r14d + movl %r12d,%eax + movl %ebp,32(%rsp) + movl %r12d,%ebx + xorl 44(%rsp),%r14d + andl %r11d,%eax + movl %esi,%ecx + xorl 4(%rsp),%r14d + leal -1894007588(%rbp,%r13,1),%r13d + xorl %r11d,%ebx + roll $5,%ecx + addl %eax,%r13d + roll $1,%r14d + andl %edi,%ebx + addl %ecx,%r13d + roll $30,%edi + addl %ebx,%r13d + xorl 40(%rsp),%edx + movl %r11d,%eax + movl %r14d,36(%rsp) + movl %r11d,%ebx + xorl 48(%rsp),%edx + andl %edi,%eax + movl %r13d,%ecx + xorl 8(%rsp),%edx + leal -1894007588(%r14,%r12,1),%r12d + xorl %edi,%ebx + roll $5,%ecx + addl %eax,%r12d + roll $1,%edx + andl %esi,%ebx + addl %ecx,%r12d + roll $30,%esi + addl %ebx,%r12d + xorl 44(%rsp),%ebp + movl %edi,%eax + movl %edx,40(%rsp) + movl %edi,%ebx + xorl 52(%rsp),%ebp + andl %esi,%eax + movl %r12d,%ecx + xorl 12(%rsp),%ebp + leal -1894007588(%rdx,%r11,1),%r11d + xorl %esi,%ebx + roll $5,%ecx + addl %eax,%r11d + roll $1,%ebp + andl %r13d,%ebx + addl %ecx,%r11d + roll $30,%r13d + addl %ebx,%r11d + xorl 48(%rsp),%r14d + movl %esi,%eax + movl %ebp,44(%rsp) + movl %esi,%ebx + xorl 56(%rsp),%r14d + andl %r13d,%eax + movl %r11d,%ecx + xorl 16(%rsp),%r14d + leal -1894007588(%rbp,%rdi,1),%edi + xorl %r13d,%ebx + roll $5,%ecx + addl %eax,%edi + roll $1,%r14d + andl %r12d,%ebx + addl %ecx,%edi + roll $30,%r12d + addl %ebx,%edi + xorl 52(%rsp),%edx + movl %r13d,%eax + movl %r14d,48(%rsp) + movl %r13d,%ebx + xorl 60(%rsp),%edx + andl %r12d,%eax + movl %edi,%ecx + xorl 20(%rsp),%edx + leal -1894007588(%r14,%rsi,1),%esi + xorl %r12d,%ebx + roll $5,%ecx + addl %eax,%esi + roll $1,%edx + andl %r11d,%ebx + addl %ecx,%esi + roll $30,%r11d + addl %ebx,%esi + xorl 56(%rsp),%ebp + movl %r12d,%eax + movl %edx,52(%rsp) + movl %r12d,%ebx + xorl 0(%rsp),%ebp + andl %r11d,%eax + movl %esi,%ecx + xorl 24(%rsp),%ebp + leal -1894007588(%rdx,%r13,1),%r13d + xorl %r11d,%ebx + roll $5,%ecx + addl %eax,%r13d + roll $1,%ebp + andl %edi,%ebx + addl %ecx,%r13d + roll $30,%edi + addl %ebx,%r13d + xorl 60(%rsp),%r14d + movl %r11d,%eax + movl %ebp,56(%rsp) + movl %r11d,%ebx + xorl 4(%rsp),%r14d + andl %edi,%eax + movl %r13d,%ecx + xorl 28(%rsp),%r14d + leal -1894007588(%rbp,%r12,1),%r12d + xorl %edi,%ebx + roll $5,%ecx + addl %eax,%r12d + roll $1,%r14d + andl %esi,%ebx + addl %ecx,%r12d + roll $30,%esi + addl %ebx,%r12d + xorl 0(%rsp),%edx + movl %edi,%eax + movl %r14d,60(%rsp) + movl %edi,%ebx + xorl 8(%rsp),%edx + andl %esi,%eax + movl %r12d,%ecx + xorl 32(%rsp),%edx + leal -1894007588(%r14,%r11,1),%r11d + xorl %esi,%ebx + roll $5,%ecx + addl %eax,%r11d + roll $1,%edx + andl %r13d,%ebx + addl %ecx,%r11d + roll $30,%r13d + addl %ebx,%r11d + xorl 4(%rsp),%ebp + movl %esi,%eax + movl %edx,0(%rsp) + movl %esi,%ebx + xorl 12(%rsp),%ebp + andl %r13d,%eax + movl %r11d,%ecx + xorl 36(%rsp),%ebp + leal -1894007588(%rdx,%rdi,1),%edi + xorl %r13d,%ebx + roll $5,%ecx + addl %eax,%edi + roll $1,%ebp + andl %r12d,%ebx + addl %ecx,%edi + roll $30,%r12d + addl %ebx,%edi + xorl 8(%rsp),%r14d + movl %r13d,%eax + movl %ebp,4(%rsp) + movl %r13d,%ebx + xorl 16(%rsp),%r14d + andl %r12d,%eax + movl %edi,%ecx + xorl 40(%rsp),%r14d + leal -1894007588(%rbp,%rsi,1),%esi + xorl %r12d,%ebx + roll $5,%ecx + addl %eax,%esi + roll $1,%r14d + andl %r11d,%ebx + addl %ecx,%esi + roll $30,%r11d + addl %ebx,%esi + xorl 12(%rsp),%edx + movl %r12d,%eax + movl %r14d,8(%rsp) + movl %r12d,%ebx + xorl 20(%rsp),%edx + andl %r11d,%eax + movl %esi,%ecx + xorl 44(%rsp),%edx + leal -1894007588(%r14,%r13,1),%r13d + xorl %r11d,%ebx + roll $5,%ecx + addl %eax,%r13d + roll $1,%edx + andl %edi,%ebx + addl %ecx,%r13d + roll $30,%edi + addl %ebx,%r13d + xorl 16(%rsp),%ebp + movl %r11d,%eax + movl %edx,12(%rsp) + movl %r11d,%ebx + xorl 24(%rsp),%ebp + andl %edi,%eax + movl %r13d,%ecx + xorl 48(%rsp),%ebp + leal -1894007588(%rdx,%r12,1),%r12d + xorl %edi,%ebx + roll $5,%ecx + addl %eax,%r12d + roll $1,%ebp + andl %esi,%ebx + addl %ecx,%r12d + roll $30,%esi + addl %ebx,%r12d + xorl 20(%rsp),%r14d + movl %edi,%eax + movl %ebp,16(%rsp) + movl %edi,%ebx + xorl 28(%rsp),%r14d + andl %esi,%eax + movl %r12d,%ecx + xorl 52(%rsp),%r14d + leal -1894007588(%rbp,%r11,1),%r11d + xorl %esi,%ebx + roll $5,%ecx + addl %eax,%r11d + roll $1,%r14d + andl %r13d,%ebx + addl %ecx,%r11d + roll $30,%r13d + addl %ebx,%r11d + xorl 24(%rsp),%edx + movl %esi,%eax + movl %r14d,20(%rsp) + movl %esi,%ebx + xorl 32(%rsp),%edx + andl %r13d,%eax + movl %r11d,%ecx + xorl 56(%rsp),%edx + leal -1894007588(%r14,%rdi,1),%edi + xorl %r13d,%ebx + roll $5,%ecx + addl %eax,%edi + roll $1,%edx + andl %r12d,%ebx + addl %ecx,%edi + roll $30,%r12d + addl %ebx,%edi + xorl 28(%rsp),%ebp + movl %r13d,%eax + movl %edx,24(%rsp) + movl %r13d,%ebx + xorl 36(%rsp),%ebp + andl %r12d,%eax + movl %edi,%ecx + xorl 60(%rsp),%ebp + leal -1894007588(%rdx,%rsi,1),%esi + xorl %r12d,%ebx + roll $5,%ecx + addl %eax,%esi + roll $1,%ebp + andl %r11d,%ebx + addl %ecx,%esi + roll $30,%r11d + addl %ebx,%esi + xorl 32(%rsp),%r14d + movl %r12d,%eax + movl %ebp,28(%rsp) + movl %r12d,%ebx + xorl 40(%rsp),%r14d + andl %r11d,%eax + movl %esi,%ecx + xorl 0(%rsp),%r14d + leal -1894007588(%rbp,%r13,1),%r13d + xorl %r11d,%ebx + roll $5,%ecx + addl %eax,%r13d + roll $1,%r14d + andl %edi,%ebx + addl %ecx,%r13d + roll $30,%edi + addl %ebx,%r13d + xorl 36(%rsp),%edx + movl %r11d,%eax + movl %r14d,32(%rsp) + movl %r11d,%ebx + xorl 44(%rsp),%edx + andl %edi,%eax + movl %r13d,%ecx + xorl 4(%rsp),%edx + leal -1894007588(%r14,%r12,1),%r12d + xorl %edi,%ebx + roll $5,%ecx + addl %eax,%r12d + roll $1,%edx + andl %esi,%ebx + addl %ecx,%r12d + roll $30,%esi + addl %ebx,%r12d + xorl 40(%rsp),%ebp + movl %edi,%eax + movl %edx,36(%rsp) + movl %edi,%ebx + xorl 48(%rsp),%ebp + andl %esi,%eax + movl %r12d,%ecx + xorl 8(%rsp),%ebp + leal -1894007588(%rdx,%r11,1),%r11d + xorl %esi,%ebx + roll $5,%ecx + addl %eax,%r11d + roll $1,%ebp + andl %r13d,%ebx + addl %ecx,%r11d + roll $30,%r13d + addl %ebx,%r11d + xorl 44(%rsp),%r14d + movl %esi,%eax + movl %ebp,40(%rsp) + movl %esi,%ebx + xorl 52(%rsp),%r14d + andl %r13d,%eax + movl %r11d,%ecx + xorl 12(%rsp),%r14d + leal -1894007588(%rbp,%rdi,1),%edi + xorl %r13d,%ebx + roll $5,%ecx + addl %eax,%edi + roll $1,%r14d + andl %r12d,%ebx + addl %ecx,%edi + roll $30,%r12d + addl %ebx,%edi + xorl 48(%rsp),%edx + movl %r13d,%eax + movl %r14d,44(%rsp) + movl %r13d,%ebx + xorl 56(%rsp),%edx + andl %r12d,%eax + movl %edi,%ecx + xorl 16(%rsp),%edx + leal -1894007588(%r14,%rsi,1),%esi + xorl %r12d,%ebx + roll $5,%ecx + addl %eax,%esi + roll $1,%edx + andl %r11d,%ebx + addl %ecx,%esi + roll $30,%r11d + addl %ebx,%esi + xorl 52(%rsp),%ebp + movl %edi,%eax + movl %edx,48(%rsp) + movl %esi,%ecx + xorl 60(%rsp),%ebp + xorl %r12d,%eax + roll $5,%ecx + xorl 20(%rsp),%ebp + leal -899497514(%rdx,%r13,1),%r13d + xorl %r11d,%eax + addl %ecx,%r13d + roll $30,%edi + addl %eax,%r13d + roll $1,%ebp + xorl 56(%rsp),%r14d + movl %esi,%eax + movl %ebp,52(%rsp) + movl %r13d,%ecx + xorl 0(%rsp),%r14d + xorl %r11d,%eax + roll $5,%ecx + xorl 24(%rsp),%r14d + leal -899497514(%rbp,%r12,1),%r12d + xorl %edi,%eax + addl %ecx,%r12d + roll $30,%esi + addl %eax,%r12d + roll $1,%r14d + xorl 60(%rsp),%edx + movl %r13d,%eax + movl %r14d,56(%rsp) + movl %r12d,%ecx + xorl 4(%rsp),%edx + xorl %edi,%eax + roll $5,%ecx + xorl 28(%rsp),%edx + leal -899497514(%r14,%r11,1),%r11d + xorl %esi,%eax + addl %ecx,%r11d + roll $30,%r13d + addl %eax,%r11d + roll $1,%edx + xorl 0(%rsp),%ebp + movl %r12d,%eax + movl %edx,60(%rsp) + movl %r11d,%ecx + xorl 8(%rsp),%ebp + xorl %esi,%eax + roll $5,%ecx + xorl 32(%rsp),%ebp + leal -899497514(%rdx,%rdi,1),%edi + xorl %r13d,%eax + addl %ecx,%edi + roll $30,%r12d + addl %eax,%edi + roll $1,%ebp + xorl 4(%rsp),%r14d + movl %r11d,%eax + movl %ebp,0(%rsp) + movl %edi,%ecx + xorl 12(%rsp),%r14d + xorl %r13d,%eax + roll $5,%ecx + xorl 36(%rsp),%r14d + leal -899497514(%rbp,%rsi,1),%esi + xorl %r12d,%eax + addl %ecx,%esi + roll $30,%r11d + addl %eax,%esi + roll $1,%r14d + xorl 8(%rsp),%edx + movl %edi,%eax + movl %r14d,4(%rsp) + movl %esi,%ecx + xorl 16(%rsp),%edx + xorl %r12d,%eax + roll $5,%ecx + xorl 40(%rsp),%edx + leal -899497514(%r14,%r13,1),%r13d + xorl %r11d,%eax + addl %ecx,%r13d + roll $30,%edi + addl %eax,%r13d + roll $1,%edx + xorl 12(%rsp),%ebp + movl %esi,%eax + movl %edx,8(%rsp) + movl %r13d,%ecx + xorl 20(%rsp),%ebp + xorl %r11d,%eax + roll $5,%ecx + xorl 44(%rsp),%ebp + leal -899497514(%rdx,%r12,1),%r12d + xorl %edi,%eax + addl %ecx,%r12d + roll $30,%esi + addl %eax,%r12d + roll $1,%ebp + xorl 16(%rsp),%r14d + movl %r13d,%eax + movl %ebp,12(%rsp) + movl %r12d,%ecx + xorl 24(%rsp),%r14d + xorl %edi,%eax + roll $5,%ecx + xorl 48(%rsp),%r14d + leal -899497514(%rbp,%r11,1),%r11d + xorl %esi,%eax + addl %ecx,%r11d + roll $30,%r13d + addl %eax,%r11d + roll $1,%r14d + xorl 20(%rsp),%edx + movl %r12d,%eax + movl %r14d,16(%rsp) + movl %r11d,%ecx + xorl 28(%rsp),%edx + xorl %esi,%eax + roll $5,%ecx + xorl 52(%rsp),%edx + leal -899497514(%r14,%rdi,1),%edi + xorl %r13d,%eax + addl %ecx,%edi + roll $30,%r12d + addl %eax,%edi + roll $1,%edx + xorl 24(%rsp),%ebp + movl %r11d,%eax + movl %edx,20(%rsp) + movl %edi,%ecx + xorl 32(%rsp),%ebp + xorl %r13d,%eax + roll $5,%ecx + xorl 56(%rsp),%ebp + leal -899497514(%rdx,%rsi,1),%esi + xorl %r12d,%eax + addl %ecx,%esi + roll $30,%r11d + addl %eax,%esi + roll $1,%ebp + xorl 28(%rsp),%r14d + movl %edi,%eax + movl %ebp,24(%rsp) + movl %esi,%ecx + xorl 36(%rsp),%r14d + xorl %r12d,%eax + roll $5,%ecx + xorl 60(%rsp),%r14d + leal -899497514(%rbp,%r13,1),%r13d + xorl %r11d,%eax + addl %ecx,%r13d + roll $30,%edi + addl %eax,%r13d + roll $1,%r14d + xorl 32(%rsp),%edx + movl %esi,%eax + movl %r14d,28(%rsp) + movl %r13d,%ecx + xorl 40(%rsp),%edx + xorl %r11d,%eax + roll $5,%ecx + xorl 0(%rsp),%edx + leal -899497514(%r14,%r12,1),%r12d + xorl %edi,%eax + addl %ecx,%r12d + roll $30,%esi + addl %eax,%r12d + roll $1,%edx + xorl 36(%rsp),%ebp + movl %r13d,%eax + + movl %r12d,%ecx + xorl 44(%rsp),%ebp + xorl %edi,%eax + roll $5,%ecx + xorl 4(%rsp),%ebp + leal -899497514(%rdx,%r11,1),%r11d + xorl %esi,%eax + addl %ecx,%r11d + roll $30,%r13d + addl %eax,%r11d + roll $1,%ebp + xorl 40(%rsp),%r14d + movl %r12d,%eax + + movl %r11d,%ecx + xorl 48(%rsp),%r14d + xorl %esi,%eax + roll $5,%ecx + xorl 8(%rsp),%r14d + leal -899497514(%rbp,%rdi,1),%edi + xorl %r13d,%eax + addl %ecx,%edi + roll $30,%r12d + addl %eax,%edi + roll $1,%r14d + xorl 44(%rsp),%edx + movl %r11d,%eax + + movl %edi,%ecx + xorl 52(%rsp),%edx + xorl %r13d,%eax + roll $5,%ecx + xorl 12(%rsp),%edx + leal -899497514(%r14,%rsi,1),%esi + xorl %r12d,%eax + addl %ecx,%esi + roll $30,%r11d + addl %eax,%esi + roll $1,%edx + xorl 48(%rsp),%ebp + movl %edi,%eax + + movl %esi,%ecx + xorl 56(%rsp),%ebp + xorl %r12d,%eax + roll $5,%ecx + xorl 16(%rsp),%ebp + leal -899497514(%rdx,%r13,1),%r13d + xorl %r11d,%eax + addl %ecx,%r13d + roll $30,%edi + addl %eax,%r13d + roll $1,%ebp + xorl 52(%rsp),%r14d + movl %esi,%eax + + movl %r13d,%ecx + xorl 60(%rsp),%r14d + xorl %r11d,%eax + roll $5,%ecx + xorl 20(%rsp),%r14d + leal -899497514(%rbp,%r12,1),%r12d + xorl %edi,%eax + addl %ecx,%r12d + roll $30,%esi + addl %eax,%r12d + roll $1,%r14d + xorl 56(%rsp),%edx + movl %r13d,%eax + + movl %r12d,%ecx + xorl 0(%rsp),%edx + xorl %edi,%eax + roll $5,%ecx + xorl 24(%rsp),%edx + leal -899497514(%r14,%r11,1),%r11d + xorl %esi,%eax + addl %ecx,%r11d + roll $30,%r13d + addl %eax,%r11d + roll $1,%edx + xorl 60(%rsp),%ebp + movl %r12d,%eax + + movl %r11d,%ecx + xorl 4(%rsp),%ebp + xorl %esi,%eax + roll $5,%ecx + xorl 28(%rsp),%ebp + leal -899497514(%rdx,%rdi,1),%edi + xorl %r13d,%eax + addl %ecx,%edi + roll $30,%r12d + addl %eax,%edi + roll $1,%ebp + movl %r11d,%eax + movl %edi,%ecx + xorl %r13d,%eax + leal -899497514(%rbp,%rsi,1),%esi + roll $5,%ecx + xorl %r12d,%eax + addl %ecx,%esi + roll $30,%r11d + addl %eax,%esi + addl 0(%r8),%esi + addl 4(%r8),%edi + addl 8(%r8),%r11d + addl 12(%r8),%r12d + addl 16(%r8),%r13d + movl %esi,0(%r8) + movl %edi,4(%r8) + movl %r11d,8(%r8) + movl %r12d,12(%r8) + movl %r13d,16(%r8) + + subq $1,%r10 + leaq 64(%r9),%r9 + jnz .Lloop + + movq 64(%rsp),%rsi + movq -40(%rsi),%r14 + movq -32(%rsi),%r13 + movq -24(%rsi),%r12 + movq -16(%rsi),%rbp + movq -8(%rsi),%rbx + leaq (%rsi),%rsp +.Lepilogue: + .byte 0xf3,0xc3 +.size sha1_block_data_order,.-sha1_block_data_order +.type sha1_block_data_order_shaext,@function +.align 32 +sha1_block_data_order_shaext: +_shaext_shortcut: + movdqu (%rdi),%xmm0 + movd 16(%rdi),%xmm1 + movdqa K_XX_XX+160(%rip),%xmm3 + + movdqu (%rsi),%xmm4 + pshufd $27,%xmm0,%xmm0 + movdqu 16(%rsi),%xmm5 + pshufd $27,%xmm1,%xmm1 + movdqu 32(%rsi),%xmm6 +.byte 102,15,56,0,227 + movdqu 48(%rsi),%xmm7 +.byte 102,15,56,0,235 +.byte 102,15,56,0,243 + movdqa %xmm1,%xmm9 +.byte 102,15,56,0,251 + jmp .Loop_shaext + +.align 16 +.Loop_shaext: + decq %rdx + leaq 64(%rsi),%rax + paddd %xmm4,%xmm1 + cmovneq %rax,%rsi + movdqa %xmm0,%xmm8 +.byte 15,56,201,229 + movdqa %xmm0,%xmm2 +.byte 15,58,204,193,0 +.byte 15,56,200,213 + pxor %xmm6,%xmm4 +.byte 15,56,201,238 +.byte 15,56,202,231 + + movdqa %xmm0,%xmm1 +.byte 15,58,204,194,0 +.byte 15,56,200,206 + pxor %xmm7,%xmm5 +.byte 15,56,202,236 +.byte 15,56,201,247 + movdqa %xmm0,%xmm2 +.byte 15,58,204,193,0 +.byte 15,56,200,215 + pxor %xmm4,%xmm6 +.byte 15,56,201,252 +.byte 15,56,202,245 + + movdqa %xmm0,%xmm1 +.byte 15,58,204,194,0 +.byte 15,56,200,204 + pxor %xmm5,%xmm7 +.byte 15,56,202,254 +.byte 15,56,201,229 + movdqa %xmm0,%xmm2 +.byte 15,58,204,193,0 +.byte 15,56,200,213 + pxor %xmm6,%xmm4 +.byte 15,56,201,238 +.byte 15,56,202,231 + + movdqa %xmm0,%xmm1 +.byte 15,58,204,194,1 +.byte 15,56,200,206 + pxor %xmm7,%xmm5 +.byte 15,56,202,236 +.byte 15,56,201,247 + movdqa %xmm0,%xmm2 +.byte 15,58,204,193,1 +.byte 15,56,200,215 + pxor %xmm4,%xmm6 +.byte 15,56,201,252 +.byte 15,56,202,245 + + movdqa %xmm0,%xmm1 +.byte 15,58,204,194,1 +.byte 15,56,200,204 + pxor %xmm5,%xmm7 +.byte 15,56,202,254 +.byte 15,56,201,229 + movdqa %xmm0,%xmm2 +.byte 15,58,204,193,1 +.byte 15,56,200,213 + pxor %xmm6,%xmm4 +.byte 15,56,201,238 +.byte 15,56,202,231 + + movdqa %xmm0,%xmm1 +.byte 15,58,204,194,1 +.byte 15,56,200,206 + pxor %xmm7,%xmm5 +.byte 15,56,202,236 +.byte 15,56,201,247 + movdqa %xmm0,%xmm2 +.byte 15,58,204,193,2 +.byte 15,56,200,215 + pxor %xmm4,%xmm6 +.byte 15,56,201,252 +.byte 15,56,202,245 + + movdqa %xmm0,%xmm1 +.byte 15,58,204,194,2 +.byte 15,56,200,204 + pxor %xmm5,%xmm7 +.byte 15,56,202,254 +.byte 15,56,201,229 + movdqa %xmm0,%xmm2 +.byte 15,58,204,193,2 +.byte 15,56,200,213 + pxor %xmm6,%xmm4 +.byte 15,56,201,238 +.byte 15,56,202,231 + + movdqa %xmm0,%xmm1 +.byte 15,58,204,194,2 +.byte 15,56,200,206 + pxor %xmm7,%xmm5 +.byte 15,56,202,236 +.byte 15,56,201,247 + movdqa %xmm0,%xmm2 +.byte 15,58,204,193,2 +.byte 15,56,200,215 + pxor %xmm4,%xmm6 +.byte 15,56,201,252 +.byte 15,56,202,245 + + movdqa %xmm0,%xmm1 +.byte 15,58,204,194,3 +.byte 15,56,200,204 + pxor %xmm5,%xmm7 +.byte 15,56,202,254 + movdqu (%rsi),%xmm4 + movdqa %xmm0,%xmm2 +.byte 15,58,204,193,3 +.byte 15,56,200,213 + movdqu 16(%rsi),%xmm5 +.byte 102,15,56,0,227 + + movdqa %xmm0,%xmm1 +.byte 15,58,204,194,3 +.byte 15,56,200,206 + movdqu 32(%rsi),%xmm6 +.byte 102,15,56,0,235 + + movdqa %xmm0,%xmm2 +.byte 15,58,204,193,3 +.byte 15,56,200,215 + movdqu 48(%rsi),%xmm7 +.byte 102,15,56,0,243 + + movdqa %xmm0,%xmm1 +.byte 15,58,204,194,3 +.byte 65,15,56,200,201 +.byte 102,15,56,0,251 + + paddd %xmm8,%xmm0 + movdqa %xmm1,%xmm9 + + jnz .Loop_shaext + + pshufd $27,%xmm0,%xmm0 + pshufd $27,%xmm1,%xmm1 + movdqu %xmm0,(%rdi) + movd %xmm1,16(%rdi) + .byte 0xf3,0xc3 +.size sha1_block_data_order_shaext,.-sha1_block_data_order_shaext +.type sha1_block_data_order_ssse3,@function +.align 16 +sha1_block_data_order_ssse3: +_ssse3_shortcut: + movq %rsp,%rax + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + leaq -64(%rsp),%rsp + movq %rax,%r14 + andq $-64,%rsp + movq %rdi,%r8 + movq %rsi,%r9 + movq %rdx,%r10 + + shlq $6,%r10 + addq %r9,%r10 + leaq K_XX_XX+64(%rip),%r11 + + movl 0(%r8),%eax + movl 4(%r8),%ebx + movl 8(%r8),%ecx + movl 12(%r8),%edx + movl %ebx,%esi + movl 16(%r8),%ebp + movl %ecx,%edi + xorl %edx,%edi + andl %edi,%esi + + movdqa 64(%r11),%xmm6 + movdqa -64(%r11),%xmm9 + movdqu 0(%r9),%xmm0 + movdqu 16(%r9),%xmm1 + movdqu 32(%r9),%xmm2 + movdqu 48(%r9),%xmm3 +.byte 102,15,56,0,198 +.byte 102,15,56,0,206 +.byte 102,15,56,0,214 + addq $64,%r9 + paddd %xmm9,%xmm0 +.byte 102,15,56,0,222 + paddd %xmm9,%xmm1 + paddd %xmm9,%xmm2 + movdqa %xmm0,0(%rsp) + psubd %xmm9,%xmm0 + movdqa %xmm1,16(%rsp) + psubd %xmm9,%xmm1 + movdqa %xmm2,32(%rsp) + psubd %xmm9,%xmm2 + jmp .Loop_ssse3 +.align 16 +.Loop_ssse3: + rorl $2,%ebx + pshufd $238,%xmm0,%xmm4 + xorl %edx,%esi + movdqa %xmm3,%xmm8 + paddd %xmm3,%xmm9 + movl %eax,%edi + addl 0(%rsp),%ebp + punpcklqdq %xmm1,%xmm4 + xorl %ecx,%ebx + roll $5,%eax + addl %esi,%ebp + psrldq $4,%xmm8 + andl %ebx,%edi + xorl %ecx,%ebx + pxor %xmm0,%xmm4 + addl %eax,%ebp + rorl $7,%eax + pxor %xmm2,%xmm8 + xorl %ecx,%edi + movl %ebp,%esi + addl 4(%rsp),%edx + pxor %xmm8,%xmm4 + xorl %ebx,%eax + roll $5,%ebp + movdqa %xmm9,48(%rsp) + addl %edi,%edx + andl %eax,%esi + movdqa %xmm4,%xmm10 + xorl %ebx,%eax + addl %ebp,%edx + rorl $7,%ebp + movdqa %xmm4,%xmm8 + xorl %ebx,%esi + pslldq $12,%xmm10 + paddd %xmm4,%xmm4 + movl %edx,%edi + addl 8(%rsp),%ecx + psrld $31,%xmm8 + xorl %eax,%ebp + roll $5,%edx + addl %esi,%ecx + movdqa %xmm10,%xmm9 + andl %ebp,%edi + xorl %eax,%ebp + psrld $30,%xmm10 + addl %edx,%ecx + rorl $7,%edx + por %xmm8,%xmm4 + xorl %eax,%edi + movl %ecx,%esi + addl 12(%rsp),%ebx + pslld $2,%xmm9 + pxor %xmm10,%xmm4 + xorl %ebp,%edx + movdqa -64(%r11),%xmm10 + roll $5,%ecx + addl %edi,%ebx + andl %edx,%esi + pxor %xmm9,%xmm4 + xorl %ebp,%edx + addl %ecx,%ebx + rorl $7,%ecx + pshufd $238,%xmm1,%xmm5 + xorl %ebp,%esi + movdqa %xmm4,%xmm9 + paddd %xmm4,%xmm10 + movl %ebx,%edi + addl 16(%rsp),%eax + punpcklqdq %xmm2,%xmm5 + xorl %edx,%ecx + roll $5,%ebx + addl %esi,%eax + psrldq $4,%xmm9 + andl %ecx,%edi + xorl %edx,%ecx + pxor %xmm1,%xmm5 + addl %ebx,%eax + rorl $7,%ebx + pxor %xmm3,%xmm9 + xorl %edx,%edi + movl %eax,%esi + addl 20(%rsp),%ebp + pxor %xmm9,%xmm5 + xorl %ecx,%ebx + roll $5,%eax + movdqa %xmm10,0(%rsp) + addl %edi,%ebp + andl %ebx,%esi + movdqa %xmm5,%xmm8 + xorl %ecx,%ebx + addl %eax,%ebp + rorl $7,%eax + movdqa %xmm5,%xmm9 + xorl %ecx,%esi + pslldq $12,%xmm8 + paddd %xmm5,%xmm5 + movl %ebp,%edi + addl 24(%rsp),%edx + psrld $31,%xmm9 + xorl %ebx,%eax + roll $5,%ebp + addl %esi,%edx + movdqa %xmm8,%xmm10 + andl %eax,%edi + xorl %ebx,%eax + psrld $30,%xmm8 + addl %ebp,%edx + rorl $7,%ebp + por %xmm9,%xmm5 + xorl %ebx,%edi + movl %edx,%esi + addl 28(%rsp),%ecx + pslld $2,%xmm10 + pxor %xmm8,%xmm5 + xorl %eax,%ebp + movdqa -32(%r11),%xmm8 + roll $5,%edx + addl %edi,%ecx + andl %ebp,%esi + pxor %xmm10,%xmm5 + xorl %eax,%ebp + addl %edx,%ecx + rorl $7,%edx + pshufd $238,%xmm2,%xmm6 + xorl %eax,%esi + movdqa %xmm5,%xmm10 + paddd %xmm5,%xmm8 + movl %ecx,%edi + addl 32(%rsp),%ebx + punpcklqdq %xmm3,%xmm6 + xorl %ebp,%edx + roll $5,%ecx + addl %esi,%ebx + psrldq $4,%xmm10 + andl %edx,%edi + xorl %ebp,%edx + pxor %xmm2,%xmm6 + addl %ecx,%ebx + rorl $7,%ecx + pxor %xmm4,%xmm10 + xorl %ebp,%edi + movl %ebx,%esi + addl 36(%rsp),%eax + pxor %xmm10,%xmm6 + xorl %edx,%ecx + roll $5,%ebx + movdqa %xmm8,16(%rsp) + addl %edi,%eax + andl %ecx,%esi + movdqa %xmm6,%xmm9 + xorl %edx,%ecx + addl %ebx,%eax + rorl $7,%ebx + movdqa %xmm6,%xmm10 + xorl %edx,%esi + pslldq $12,%xmm9 + paddd %xmm6,%xmm6 + movl %eax,%edi + addl 40(%rsp),%ebp + psrld $31,%xmm10 + xorl %ecx,%ebx + roll $5,%eax + addl %esi,%ebp + movdqa %xmm9,%xmm8 + andl %ebx,%edi + xorl %ecx,%ebx + psrld $30,%xmm9 + addl %eax,%ebp + rorl $7,%eax + por %xmm10,%xmm6 + xorl %ecx,%edi + movl %ebp,%esi + addl 44(%rsp),%edx + pslld $2,%xmm8 + pxor %xmm9,%xmm6 + xorl %ebx,%eax + movdqa -32(%r11),%xmm9 + roll $5,%ebp + addl %edi,%edx + andl %eax,%esi + pxor %xmm8,%xmm6 + xorl %ebx,%eax + addl %ebp,%edx + rorl $7,%ebp + pshufd $238,%xmm3,%xmm7 + xorl %ebx,%esi + movdqa %xmm6,%xmm8 + paddd %xmm6,%xmm9 + movl %edx,%edi + addl 48(%rsp),%ecx + punpcklqdq %xmm4,%xmm7 + xorl %eax,%ebp + roll $5,%edx + addl %esi,%ecx + psrldq $4,%xmm8 + andl %ebp,%edi + xorl %eax,%ebp + pxor %xmm3,%xmm7 + addl %edx,%ecx + rorl $7,%edx + pxor %xmm5,%xmm8 + xorl %eax,%edi + movl %ecx,%esi + addl 52(%rsp),%ebx + pxor %xmm8,%xmm7 + xorl %ebp,%edx + roll $5,%ecx + movdqa %xmm9,32(%rsp) + addl %edi,%ebx + andl %edx,%esi + movdqa %xmm7,%xmm10 + xorl %ebp,%edx + addl %ecx,%ebx + rorl $7,%ecx + movdqa %xmm7,%xmm8 + xorl %ebp,%esi + pslldq $12,%xmm10 + paddd %xmm7,%xmm7 + movl %ebx,%edi + addl 56(%rsp),%eax + psrld $31,%xmm8 + xorl %edx,%ecx + roll $5,%ebx + addl %esi,%eax + movdqa %xmm10,%xmm9 + andl %ecx,%edi + xorl %edx,%ecx + psrld $30,%xmm10 + addl %ebx,%eax + rorl $7,%ebx + por %xmm8,%xmm7 + xorl %edx,%edi + movl %eax,%esi + addl 60(%rsp),%ebp + pslld $2,%xmm9 + pxor %xmm10,%xmm7 + xorl %ecx,%ebx + movdqa -32(%r11),%xmm10 + roll $5,%eax + addl %edi,%ebp + andl %ebx,%esi + pxor %xmm9,%xmm7 + pshufd $238,%xmm6,%xmm9 + xorl %ecx,%ebx + addl %eax,%ebp + rorl $7,%eax + pxor %xmm4,%xmm0 + xorl %ecx,%esi + movl %ebp,%edi + addl 0(%rsp),%edx + punpcklqdq %xmm7,%xmm9 + xorl %ebx,%eax + roll $5,%ebp + pxor %xmm1,%xmm0 + addl %esi,%edx + andl %eax,%edi + movdqa %xmm10,%xmm8 + xorl %ebx,%eax + paddd %xmm7,%xmm10 + addl %ebp,%edx + pxor %xmm9,%xmm0 + rorl $7,%ebp + xorl %ebx,%edi + movl %edx,%esi + addl 4(%rsp),%ecx + movdqa %xmm0,%xmm9 + xorl %eax,%ebp + roll $5,%edx + movdqa %xmm10,48(%rsp) + addl %edi,%ecx + andl %ebp,%esi + xorl %eax,%ebp + pslld $2,%xmm0 + addl %edx,%ecx + rorl $7,%edx + psrld $30,%xmm9 + xorl %eax,%esi + movl %ecx,%edi + addl 8(%rsp),%ebx + por %xmm9,%xmm0 + xorl %ebp,%edx + roll $5,%ecx + pshufd $238,%xmm7,%xmm10 + addl %esi,%ebx + andl %edx,%edi + xorl %ebp,%edx + addl %ecx,%ebx + addl 12(%rsp),%eax + xorl %ebp,%edi + movl %ebx,%esi + roll $5,%ebx + addl %edi,%eax + xorl %edx,%esi + rorl $7,%ecx + addl %ebx,%eax + pxor %xmm5,%xmm1 + addl 16(%rsp),%ebp + xorl %ecx,%esi + punpcklqdq %xmm0,%xmm10 + movl %eax,%edi + roll $5,%eax + pxor %xmm2,%xmm1 + addl %esi,%ebp + xorl %ecx,%edi + movdqa %xmm8,%xmm9 + rorl $7,%ebx + paddd %xmm0,%xmm8 + addl %eax,%ebp + pxor %xmm10,%xmm1 + addl 20(%rsp),%edx + xorl %ebx,%edi + movl %ebp,%esi + roll $5,%ebp + movdqa %xmm1,%xmm10 + addl %edi,%edx + xorl %ebx,%esi + movdqa %xmm8,0(%rsp) + rorl $7,%eax + addl %ebp,%edx + addl 24(%rsp),%ecx + pslld $2,%xmm1 + xorl %eax,%esi + movl %edx,%edi + psrld $30,%xmm10 + roll $5,%edx + addl %esi,%ecx + xorl %eax,%edi + rorl $7,%ebp + por %xmm10,%xmm1 + addl %edx,%ecx + addl 28(%rsp),%ebx + pshufd $238,%xmm0,%xmm8 + xorl %ebp,%edi + movl %ecx,%esi + roll $5,%ecx + addl %edi,%ebx + xorl %ebp,%esi + rorl $7,%edx + addl %ecx,%ebx + pxor %xmm6,%xmm2 + addl 32(%rsp),%eax + xorl %edx,%esi + punpcklqdq %xmm1,%xmm8 + movl %ebx,%edi + roll $5,%ebx + pxor %xmm3,%xmm2 + addl %esi,%eax + xorl %edx,%edi + movdqa 0(%r11),%xmm10 + rorl $7,%ecx + paddd %xmm1,%xmm9 + addl %ebx,%eax + pxor %xmm8,%xmm2 + addl 36(%rsp),%ebp + xorl %ecx,%edi + movl %eax,%esi + roll $5,%eax + movdqa %xmm2,%xmm8 + addl %edi,%ebp + xorl %ecx,%esi + movdqa %xmm9,16(%rsp) + rorl $7,%ebx + addl %eax,%ebp + addl 40(%rsp),%edx + pslld $2,%xmm2 + xorl %ebx,%esi + movl %ebp,%edi + psrld $30,%xmm8 + roll $5,%ebp + addl %esi,%edx + xorl %ebx,%edi + rorl $7,%eax + por %xmm8,%xmm2 + addl %ebp,%edx + addl 44(%rsp),%ecx + pshufd $238,%xmm1,%xmm9 + xorl %eax,%edi + movl %edx,%esi + roll $5,%edx + addl %edi,%ecx + xorl %eax,%esi + rorl $7,%ebp + addl %edx,%ecx + pxor %xmm7,%xmm3 + addl 48(%rsp),%ebx + xorl %ebp,%esi + punpcklqdq %xmm2,%xmm9 + movl %ecx,%edi + roll $5,%ecx + pxor %xmm4,%xmm3 + addl %esi,%ebx + xorl %ebp,%edi + movdqa %xmm10,%xmm8 + rorl $7,%edx + paddd %xmm2,%xmm10 + addl %ecx,%ebx + pxor %xmm9,%xmm3 + addl 52(%rsp),%eax + xorl %edx,%edi + movl %ebx,%esi + roll $5,%ebx + movdqa %xmm3,%xmm9 + addl %edi,%eax + xorl %edx,%esi + movdqa %xmm10,32(%rsp) + rorl $7,%ecx + addl %ebx,%eax + addl 56(%rsp),%ebp + pslld $2,%xmm3 + xorl %ecx,%esi + movl %eax,%edi + psrld $30,%xmm9 + roll $5,%eax + addl %esi,%ebp + xorl %ecx,%edi + rorl $7,%ebx + por %xmm9,%xmm3 + addl %eax,%ebp + addl 60(%rsp),%edx + pshufd $238,%xmm2,%xmm10 + xorl %ebx,%edi + movl %ebp,%esi + roll $5,%ebp + addl %edi,%edx + xorl %ebx,%esi + rorl $7,%eax + addl %ebp,%edx + pxor %xmm0,%xmm4 + addl 0(%rsp),%ecx + xorl %eax,%esi + punpcklqdq %xmm3,%xmm10 + movl %edx,%edi + roll $5,%edx + pxor %xmm5,%xmm4 + addl %esi,%ecx + xorl %eax,%edi + movdqa %xmm8,%xmm9 + rorl $7,%ebp + paddd %xmm3,%xmm8 + addl %edx,%ecx + pxor %xmm10,%xmm4 + addl 4(%rsp),%ebx + xorl %ebp,%edi + movl %ecx,%esi + roll $5,%ecx + movdqa %xmm4,%xmm10 + addl %edi,%ebx + xorl %ebp,%esi + movdqa %xmm8,48(%rsp) + rorl $7,%edx + addl %ecx,%ebx + addl 8(%rsp),%eax + pslld $2,%xmm4 + xorl %edx,%esi + movl %ebx,%edi + psrld $30,%xmm10 + roll $5,%ebx + addl %esi,%eax + xorl %edx,%edi + rorl $7,%ecx + por %xmm10,%xmm4 + addl %ebx,%eax + addl 12(%rsp),%ebp + pshufd $238,%xmm3,%xmm8 + xorl %ecx,%edi + movl %eax,%esi + roll $5,%eax + addl %edi,%ebp + xorl %ecx,%esi + rorl $7,%ebx + addl %eax,%ebp + pxor %xmm1,%xmm5 + addl 16(%rsp),%edx + xorl %ebx,%esi + punpcklqdq %xmm4,%xmm8 + movl %ebp,%edi + roll $5,%ebp + pxor %xmm6,%xmm5 + addl %esi,%edx + xorl %ebx,%edi + movdqa %xmm9,%xmm10 + rorl $7,%eax + paddd %xmm4,%xmm9 + addl %ebp,%edx + pxor %xmm8,%xmm5 + addl 20(%rsp),%ecx + xorl %eax,%edi + movl %edx,%esi + roll $5,%edx + movdqa %xmm5,%xmm8 + addl %edi,%ecx + xorl %eax,%esi + movdqa %xmm9,0(%rsp) + rorl $7,%ebp + addl %edx,%ecx + addl 24(%rsp),%ebx + pslld $2,%xmm5 + xorl %ebp,%esi + movl %ecx,%edi + psrld $30,%xmm8 + roll $5,%ecx + addl %esi,%ebx + xorl %ebp,%edi + rorl $7,%edx + por %xmm8,%xmm5 + addl %ecx,%ebx + addl 28(%rsp),%eax + pshufd $238,%xmm4,%xmm9 + rorl $7,%ecx + movl %ebx,%esi + xorl %edx,%edi + roll $5,%ebx + addl %edi,%eax + xorl %ecx,%esi + xorl %edx,%ecx + addl %ebx,%eax + pxor %xmm2,%xmm6 + addl 32(%rsp),%ebp + andl %ecx,%esi + xorl %edx,%ecx + rorl $7,%ebx + punpcklqdq %xmm5,%xmm9 + movl %eax,%edi + xorl %ecx,%esi + pxor %xmm7,%xmm6 + roll $5,%eax + addl %esi,%ebp + movdqa %xmm10,%xmm8 + xorl %ebx,%edi + paddd %xmm5,%xmm10 + xorl %ecx,%ebx + pxor %xmm9,%xmm6 + addl %eax,%ebp + addl 36(%rsp),%edx + andl %ebx,%edi + xorl %ecx,%ebx + rorl $7,%eax + movdqa %xmm6,%xmm9 + movl %ebp,%esi + xorl %ebx,%edi + movdqa %xmm10,16(%rsp) + roll $5,%ebp + addl %edi,%edx + xorl %eax,%esi + pslld $2,%xmm6 + xorl %ebx,%eax + addl %ebp,%edx + psrld $30,%xmm9 + addl 40(%rsp),%ecx + andl %eax,%esi + xorl %ebx,%eax + por %xmm9,%xmm6 + rorl $7,%ebp + movl %edx,%edi + xorl %eax,%esi + roll $5,%edx + pshufd $238,%xmm5,%xmm10 + addl %esi,%ecx + xorl %ebp,%edi + xorl %eax,%ebp + addl %edx,%ecx + addl 44(%rsp),%ebx + andl %ebp,%edi + xorl %eax,%ebp + rorl $7,%edx + movl %ecx,%esi + xorl %ebp,%edi + roll $5,%ecx + addl %edi,%ebx + xorl %edx,%esi + xorl %ebp,%edx + addl %ecx,%ebx + pxor %xmm3,%xmm7 + addl 48(%rsp),%eax + andl %edx,%esi + xorl %ebp,%edx + rorl $7,%ecx + punpcklqdq %xmm6,%xmm10 + movl %ebx,%edi + xorl %edx,%esi + pxor %xmm0,%xmm7 + roll $5,%ebx + addl %esi,%eax + movdqa 32(%r11),%xmm9 + xorl %ecx,%edi + paddd %xmm6,%xmm8 + xorl %edx,%ecx + pxor %xmm10,%xmm7 + addl %ebx,%eax + addl 52(%rsp),%ebp + andl %ecx,%edi + xorl %edx,%ecx + rorl $7,%ebx + movdqa %xmm7,%xmm10 + movl %eax,%esi + xorl %ecx,%edi + movdqa %xmm8,32(%rsp) + roll $5,%eax + addl %edi,%ebp + xorl %ebx,%esi + pslld $2,%xmm7 + xorl %ecx,%ebx + addl %eax,%ebp + psrld $30,%xmm10 + addl 56(%rsp),%edx + andl %ebx,%esi + xorl %ecx,%ebx + por %xmm10,%xmm7 + rorl $7,%eax + movl %ebp,%edi + xorl %ebx,%esi + roll $5,%ebp + pshufd $238,%xmm6,%xmm8 + addl %esi,%edx + xorl %eax,%edi + xorl %ebx,%eax + addl %ebp,%edx + addl 60(%rsp),%ecx + andl %eax,%edi + xorl %ebx,%eax + rorl $7,%ebp + movl %edx,%esi + xorl %eax,%edi + roll $5,%edx + addl %edi,%ecx + xorl %ebp,%esi + xorl %eax,%ebp + addl %edx,%ecx + pxor %xmm4,%xmm0 + addl 0(%rsp),%ebx + andl %ebp,%esi + xorl %eax,%ebp + rorl $7,%edx + punpcklqdq %xmm7,%xmm8 + movl %ecx,%edi + xorl %ebp,%esi + pxor %xmm1,%xmm0 + roll $5,%ecx + addl %esi,%ebx + movdqa %xmm9,%xmm10 + xorl %edx,%edi + paddd %xmm7,%xmm9 + xorl %ebp,%edx + pxor %xmm8,%xmm0 + addl %ecx,%ebx + addl 4(%rsp),%eax + andl %edx,%edi + xorl %ebp,%edx + rorl $7,%ecx + movdqa %xmm0,%xmm8 + movl %ebx,%esi + xorl %edx,%edi + movdqa %xmm9,48(%rsp) + roll $5,%ebx + addl %edi,%eax + xorl %ecx,%esi + pslld $2,%xmm0 + xorl %edx,%ecx + addl %ebx,%eax + psrld $30,%xmm8 + addl 8(%rsp),%ebp + andl %ecx,%esi + xorl %edx,%ecx + por %xmm8,%xmm0 + rorl $7,%ebx + movl %eax,%edi + xorl %ecx,%esi + roll $5,%eax + pshufd $238,%xmm7,%xmm9 + addl %esi,%ebp + xorl %ebx,%edi + xorl %ecx,%ebx + addl %eax,%ebp + addl 12(%rsp),%edx + andl %ebx,%edi + xorl %ecx,%ebx + rorl $7,%eax + movl %ebp,%esi + xorl %ebx,%edi + roll $5,%ebp + addl %edi,%edx + xorl %eax,%esi + xorl %ebx,%eax + addl %ebp,%edx + pxor %xmm5,%xmm1 + addl 16(%rsp),%ecx + andl %eax,%esi + xorl %ebx,%eax + rorl $7,%ebp + punpcklqdq %xmm0,%xmm9 + movl %edx,%edi + xorl %eax,%esi + pxor %xmm2,%xmm1 + roll $5,%edx + addl %esi,%ecx + movdqa %xmm10,%xmm8 + xorl %ebp,%edi + paddd %xmm0,%xmm10 + xorl %eax,%ebp + pxor %xmm9,%xmm1 + addl %edx,%ecx + addl 20(%rsp),%ebx + andl %ebp,%edi + xorl %eax,%ebp + rorl $7,%edx + movdqa %xmm1,%xmm9 + movl %ecx,%esi + xorl %ebp,%edi + movdqa %xmm10,0(%rsp) + roll $5,%ecx + addl %edi,%ebx + xorl %edx,%esi + pslld $2,%xmm1 + xorl %ebp,%edx + addl %ecx,%ebx + psrld $30,%xmm9 + addl 24(%rsp),%eax + andl %edx,%esi + xorl %ebp,%edx + por %xmm9,%xmm1 + rorl $7,%ecx + movl %ebx,%edi + xorl %edx,%esi + roll $5,%ebx + pshufd $238,%xmm0,%xmm10 + addl %esi,%eax + xorl %ecx,%edi + xorl %edx,%ecx + addl %ebx,%eax + addl 28(%rsp),%ebp + andl %ecx,%edi + xorl %edx,%ecx + rorl $7,%ebx + movl %eax,%esi + xorl %ecx,%edi + roll $5,%eax + addl %edi,%ebp + xorl %ebx,%esi + xorl %ecx,%ebx + addl %eax,%ebp + pxor %xmm6,%xmm2 + addl 32(%rsp),%edx + andl %ebx,%esi + xorl %ecx,%ebx + rorl $7,%eax + punpcklqdq %xmm1,%xmm10 + movl %ebp,%edi + xorl %ebx,%esi + pxor %xmm3,%xmm2 + roll $5,%ebp + addl %esi,%edx + movdqa %xmm8,%xmm9 + xorl %eax,%edi + paddd %xmm1,%xmm8 + xorl %ebx,%eax + pxor %xmm10,%xmm2 + addl %ebp,%edx + addl 36(%rsp),%ecx + andl %eax,%edi + xorl %ebx,%eax + rorl $7,%ebp + movdqa %xmm2,%xmm10 + movl %edx,%esi + xorl %eax,%edi + movdqa %xmm8,16(%rsp) + roll $5,%edx + addl %edi,%ecx + xorl %ebp,%esi + pslld $2,%xmm2 + xorl %eax,%ebp + addl %edx,%ecx + psrld $30,%xmm10 + addl 40(%rsp),%ebx + andl %ebp,%esi + xorl %eax,%ebp + por %xmm10,%xmm2 + rorl $7,%edx + movl %ecx,%edi + xorl %ebp,%esi + roll $5,%ecx + pshufd $238,%xmm1,%xmm8 + addl %esi,%ebx + xorl %edx,%edi + xorl %ebp,%edx + addl %ecx,%ebx + addl 44(%rsp),%eax + andl %edx,%edi + xorl %ebp,%edx + rorl $7,%ecx + movl %ebx,%esi + xorl %edx,%edi + roll $5,%ebx + addl %edi,%eax + xorl %edx,%esi + addl %ebx,%eax + pxor %xmm7,%xmm3 + addl 48(%rsp),%ebp + xorl %ecx,%esi + punpcklqdq %xmm2,%xmm8 + movl %eax,%edi + roll $5,%eax + pxor %xmm4,%xmm3 + addl %esi,%ebp + xorl %ecx,%edi + movdqa %xmm9,%xmm10 + rorl $7,%ebx + paddd %xmm2,%xmm9 + addl %eax,%ebp + pxor %xmm8,%xmm3 + addl 52(%rsp),%edx + xorl %ebx,%edi + movl %ebp,%esi + roll $5,%ebp + movdqa %xmm3,%xmm8 + addl %edi,%edx + xorl %ebx,%esi + movdqa %xmm9,32(%rsp) + rorl $7,%eax + addl %ebp,%edx + addl 56(%rsp),%ecx + pslld $2,%xmm3 + xorl %eax,%esi + movl %edx,%edi + psrld $30,%xmm8 + roll $5,%edx + addl %esi,%ecx + xorl %eax,%edi + rorl $7,%ebp + por %xmm8,%xmm3 + addl %edx,%ecx + addl 60(%rsp),%ebx + xorl %ebp,%edi + movl %ecx,%esi + roll $5,%ecx + addl %edi,%ebx + xorl %ebp,%esi + rorl $7,%edx + addl %ecx,%ebx + addl 0(%rsp),%eax + xorl %edx,%esi + movl %ebx,%edi + roll $5,%ebx + paddd %xmm3,%xmm10 + addl %esi,%eax + xorl %edx,%edi + movdqa %xmm10,48(%rsp) + rorl $7,%ecx + addl %ebx,%eax + addl 4(%rsp),%ebp + xorl %ecx,%edi + movl %eax,%esi + roll $5,%eax + addl %edi,%ebp + xorl %ecx,%esi + rorl $7,%ebx + addl %eax,%ebp + addl 8(%rsp),%edx + xorl %ebx,%esi + movl %ebp,%edi + roll $5,%ebp + addl %esi,%edx + xorl %ebx,%edi + rorl $7,%eax + addl %ebp,%edx + addl 12(%rsp),%ecx + xorl %eax,%edi + movl %edx,%esi + roll $5,%edx + addl %edi,%ecx + xorl %eax,%esi + rorl $7,%ebp + addl %edx,%ecx + cmpq %r10,%r9 + je .Ldone_ssse3 + movdqa 64(%r11),%xmm6 + movdqa -64(%r11),%xmm9 + movdqu 0(%r9),%xmm0 + movdqu 16(%r9),%xmm1 + movdqu 32(%r9),%xmm2 + movdqu 48(%r9),%xmm3 +.byte 102,15,56,0,198 + addq $64,%r9 + addl 16(%rsp),%ebx + xorl %ebp,%esi + movl %ecx,%edi +.byte 102,15,56,0,206 + roll $5,%ecx + addl %esi,%ebx + xorl %ebp,%edi + rorl $7,%edx + paddd %xmm9,%xmm0 + addl %ecx,%ebx + addl 20(%rsp),%eax + xorl %edx,%edi + movl %ebx,%esi + movdqa %xmm0,0(%rsp) + roll $5,%ebx + addl %edi,%eax + xorl %edx,%esi + rorl $7,%ecx + psubd %xmm9,%xmm0 + addl %ebx,%eax + addl 24(%rsp),%ebp + xorl %ecx,%esi + movl %eax,%edi + roll $5,%eax + addl %esi,%ebp + xorl %ecx,%edi + rorl $7,%ebx + addl %eax,%ebp + addl 28(%rsp),%edx + xorl %ebx,%edi + movl %ebp,%esi + roll $5,%ebp + addl %edi,%edx + xorl %ebx,%esi + rorl $7,%eax + addl %ebp,%edx + addl 32(%rsp),%ecx + xorl %eax,%esi + movl %edx,%edi +.byte 102,15,56,0,214 + roll $5,%edx + addl %esi,%ecx + xorl %eax,%edi + rorl $7,%ebp + paddd %xmm9,%xmm1 + addl %edx,%ecx + addl 36(%rsp),%ebx + xorl %ebp,%edi + movl %ecx,%esi + movdqa %xmm1,16(%rsp) + roll $5,%ecx + addl %edi,%ebx + xorl %ebp,%esi + rorl $7,%edx + psubd %xmm9,%xmm1 + addl %ecx,%ebx + addl 40(%rsp),%eax + xorl %edx,%esi + movl %ebx,%edi + roll $5,%ebx + addl %esi,%eax + xorl %edx,%edi + rorl $7,%ecx + addl %ebx,%eax + addl 44(%rsp),%ebp + xorl %ecx,%edi + movl %eax,%esi + roll $5,%eax + addl %edi,%ebp + xorl %ecx,%esi + rorl $7,%ebx + addl %eax,%ebp + addl 48(%rsp),%edx + xorl %ebx,%esi + movl %ebp,%edi +.byte 102,15,56,0,222 + roll $5,%ebp + addl %esi,%edx + xorl %ebx,%edi + rorl $7,%eax + paddd %xmm9,%xmm2 + addl %ebp,%edx + addl 52(%rsp),%ecx + xorl %eax,%edi + movl %edx,%esi + movdqa %xmm2,32(%rsp) + roll $5,%edx + addl %edi,%ecx + xorl %eax,%esi + rorl $7,%ebp + psubd %xmm9,%xmm2 + addl %edx,%ecx + addl 56(%rsp),%ebx + xorl %ebp,%esi + movl %ecx,%edi + roll $5,%ecx + addl %esi,%ebx + xorl %ebp,%edi + rorl $7,%edx + addl %ecx,%ebx + addl 60(%rsp),%eax + xorl %edx,%edi + movl %ebx,%esi + roll $5,%ebx + addl %edi,%eax + rorl $7,%ecx + addl %ebx,%eax + addl 0(%r8),%eax + addl 4(%r8),%esi + addl 8(%r8),%ecx + addl 12(%r8),%edx + movl %eax,0(%r8) + addl 16(%r8),%ebp + movl %esi,4(%r8) + movl %esi,%ebx + movl %ecx,8(%r8) + movl %ecx,%edi + movl %edx,12(%r8) + xorl %edx,%edi + movl %ebp,16(%r8) + andl %edi,%esi + jmp .Loop_ssse3 + +.align 16 +.Ldone_ssse3: + addl 16(%rsp),%ebx + xorl %ebp,%esi + movl %ecx,%edi + roll $5,%ecx + addl %esi,%ebx + xorl %ebp,%edi + rorl $7,%edx + addl %ecx,%ebx + addl 20(%rsp),%eax + xorl %edx,%edi + movl %ebx,%esi + roll $5,%ebx + addl %edi,%eax + xorl %edx,%esi + rorl $7,%ecx + addl %ebx,%eax + addl 24(%rsp),%ebp + xorl %ecx,%esi + movl %eax,%edi + roll $5,%eax + addl %esi,%ebp + xorl %ecx,%edi + rorl $7,%ebx + addl %eax,%ebp + addl 28(%rsp),%edx + xorl %ebx,%edi + movl %ebp,%esi + roll $5,%ebp + addl %edi,%edx + xorl %ebx,%esi + rorl $7,%eax + addl %ebp,%edx + addl 32(%rsp),%ecx + xorl %eax,%esi + movl %edx,%edi + roll $5,%edx + addl %esi,%ecx + xorl %eax,%edi + rorl $7,%ebp + addl %edx,%ecx + addl 36(%rsp),%ebx + xorl %ebp,%edi + movl %ecx,%esi + roll $5,%ecx + addl %edi,%ebx + xorl %ebp,%esi + rorl $7,%edx + addl %ecx,%ebx + addl 40(%rsp),%eax + xorl %edx,%esi + movl %ebx,%edi + roll $5,%ebx + addl %esi,%eax + xorl %edx,%edi + rorl $7,%ecx + addl %ebx,%eax + addl 44(%rsp),%ebp + xorl %ecx,%edi + movl %eax,%esi + roll $5,%eax + addl %edi,%ebp + xorl %ecx,%esi + rorl $7,%ebx + addl %eax,%ebp + addl 48(%rsp),%edx + xorl %ebx,%esi + movl %ebp,%edi + roll $5,%ebp + addl %esi,%edx + xorl %ebx,%edi + rorl $7,%eax + addl %ebp,%edx + addl 52(%rsp),%ecx + xorl %eax,%edi + movl %edx,%esi + roll $5,%edx + addl %edi,%ecx + xorl %eax,%esi + rorl $7,%ebp + addl %edx,%ecx + addl 56(%rsp),%ebx + xorl %ebp,%esi + movl %ecx,%edi + roll $5,%ecx + addl %esi,%ebx + xorl %ebp,%edi + rorl $7,%edx + addl %ecx,%ebx + addl 60(%rsp),%eax + xorl %edx,%edi + movl %ebx,%esi + roll $5,%ebx + addl %edi,%eax + rorl $7,%ecx + addl %ebx,%eax + addl 0(%r8),%eax + addl 4(%r8),%esi + addl 8(%r8),%ecx + movl %eax,0(%r8) + addl 12(%r8),%edx + movl %esi,4(%r8) + addl 16(%r8),%ebp + movl %ecx,8(%r8) + movl %edx,12(%r8) + movl %ebp,16(%r8) + leaq (%r14),%rsi + movq -40(%rsi),%r14 + movq -32(%rsi),%r13 + movq -24(%rsi),%r12 + movq -16(%rsi),%rbp + movq -8(%rsi),%rbx + leaq (%rsi),%rsp +.Lepilogue_ssse3: + .byte 0xf3,0xc3 +.size sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3 +.align 64 +K_XX_XX: +.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 +.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 +.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 +.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 +.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc +.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc +.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 +.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 +.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f +.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f +.byte 0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0 +.byte 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 64 diff --git a/secure/lib/libcrypto/amd64/sha256-mb-x86_64.S b/secure/lib/libcrypto/amd64/sha256-mb-x86_64.S new file mode 100644 index 0000000..893d42a --- /dev/null +++ b/secure/lib/libcrypto/amd64/sha256-mb-x86_64.S @@ -0,0 +1,3259 @@ + # $FreeBSD$ +.text + + + +.globl sha256_multi_block +.type sha256_multi_block,@function +.align 32 +sha256_multi_block: + movq OPENSSL_ia32cap_P+4(%rip),%rcx + btq $61,%rcx + jc _shaext_shortcut + movq %rsp,%rax + pushq %rbx + pushq %rbp + subq $288,%rsp + andq $-256,%rsp + movq %rax,272(%rsp) +.Lbody: + leaq K256+128(%rip),%rbp + leaq 256(%rsp),%rbx + leaq 128(%rdi),%rdi + +.Loop_grande: + movl %edx,280(%rsp) + xorl %edx,%edx + movq 0(%rsi),%r8 + movl 8(%rsi),%ecx + cmpl %edx,%ecx + cmovgl %ecx,%edx + testl %ecx,%ecx + movl %ecx,0(%rbx) + cmovleq %rbp,%r8 + movq 16(%rsi),%r9 + movl 24(%rsi),%ecx + cmpl %edx,%ecx + cmovgl %ecx,%edx + testl %ecx,%ecx + movl %ecx,4(%rbx) + cmovleq %rbp,%r9 + movq 32(%rsi),%r10 + movl 40(%rsi),%ecx + cmpl %edx,%ecx + cmovgl %ecx,%edx + testl %ecx,%ecx + movl %ecx,8(%rbx) + cmovleq %rbp,%r10 + movq 48(%rsi),%r11 + movl 56(%rsi),%ecx + cmpl %edx,%ecx + cmovgl %ecx,%edx + testl %ecx,%ecx + movl %ecx,12(%rbx) + cmovleq %rbp,%r11 + testl %edx,%edx + jz .Ldone + + movdqu 0-128(%rdi),%xmm8 + leaq 128(%rsp),%rax + movdqu 32-128(%rdi),%xmm9 + movdqu 64-128(%rdi),%xmm10 + movdqu 96-128(%rdi),%xmm11 + movdqu 128-128(%rdi),%xmm12 + movdqu 160-128(%rdi),%xmm13 + movdqu 192-128(%rdi),%xmm14 + movdqu 224-128(%rdi),%xmm15 + movdqu .Lpbswap(%rip),%xmm6 + jmp .Loop + +.align 32 +.Loop: + movdqa %xmm10,%xmm4 + pxor %xmm9,%xmm4 + movd 0(%r8),%xmm5 + movd 0(%r9),%xmm0 + movd 0(%r10),%xmm1 + movd 0(%r11),%xmm2 + punpckldq %xmm1,%xmm5 + punpckldq %xmm2,%xmm0 + punpckldq %xmm0,%xmm5 + movdqa %xmm12,%xmm7 +.byte 102,15,56,0,238 + movdqa %xmm12,%xmm2 + + psrld $6,%xmm7 + movdqa %xmm12,%xmm1 + pslld $7,%xmm2 + movdqa %xmm5,0-128(%rax) + paddd %xmm15,%xmm5 + + psrld $11,%xmm1 + pxor %xmm2,%xmm7 + pslld $21-7,%xmm2 + paddd -128(%rbp),%xmm5 + pxor %xmm1,%xmm7 + + psrld $25-11,%xmm1 + movdqa %xmm12,%xmm0 + + pxor %xmm2,%xmm7 + movdqa %xmm12,%xmm3 + pslld $26-21,%xmm2 + pandn %xmm14,%xmm0 + pand %xmm13,%xmm3 + pxor %xmm1,%xmm7 + + + movdqa %xmm8,%xmm1 + pxor %xmm2,%xmm7 + movdqa %xmm8,%xmm2 + psrld $2,%xmm1 + paddd %xmm7,%xmm5 + pxor %xmm3,%xmm0 + movdqa %xmm9,%xmm3 + movdqa %xmm8,%xmm7 + pslld $10,%xmm2 + pxor %xmm8,%xmm3 + + + psrld $13,%xmm7 + pxor %xmm2,%xmm1 + paddd %xmm0,%xmm5 + pslld $19-10,%xmm2 + pand %xmm3,%xmm4 + pxor %xmm7,%xmm1 + + + psrld $22-13,%xmm7 + pxor %xmm2,%xmm1 + movdqa %xmm9,%xmm15 + pslld $30-19,%xmm2 + pxor %xmm1,%xmm7 + pxor %xmm4,%xmm15 + paddd %xmm5,%xmm11 + pxor %xmm2,%xmm7 + + paddd %xmm5,%xmm15 + paddd %xmm7,%xmm15 + movd 4(%r8),%xmm5 + movd 4(%r9),%xmm0 + movd 4(%r10),%xmm1 + movd 4(%r11),%xmm2 + punpckldq %xmm1,%xmm5 + punpckldq %xmm2,%xmm0 + punpckldq %xmm0,%xmm5 + movdqa %xmm11,%xmm7 + + movdqa %xmm11,%xmm2 +.byte 102,15,56,0,238 + psrld $6,%xmm7 + movdqa %xmm11,%xmm1 + pslld $7,%xmm2 + movdqa %xmm5,16-128(%rax) + paddd %xmm14,%xmm5 + + psrld $11,%xmm1 + pxor %xmm2,%xmm7 + pslld $21-7,%xmm2 + paddd -96(%rbp),%xmm5 + pxor %xmm1,%xmm7 + + psrld $25-11,%xmm1 + movdqa %xmm11,%xmm0 + + pxor %xmm2,%xmm7 + movdqa %xmm11,%xmm4 + pslld $26-21,%xmm2 + pandn %xmm13,%xmm0 + pand %xmm12,%xmm4 + pxor %xmm1,%xmm7 + + + movdqa %xmm15,%xmm1 + pxor %xmm2,%xmm7 + movdqa %xmm15,%xmm2 + psrld $2,%xmm1 + paddd %xmm7,%xmm5 + pxor %xmm4,%xmm0 + movdqa %xmm8,%xmm4 + movdqa %xmm15,%xmm7 + pslld $10,%xmm2 + pxor %xmm15,%xmm4 + + + psrld $13,%xmm7 + pxor %xmm2,%xmm1 + paddd %xmm0,%xmm5 + pslld $19-10,%xmm2 + pand %xmm4,%xmm3 + pxor %xmm7,%xmm1 + + + psrld $22-13,%xmm7 + pxor %xmm2,%xmm1 + movdqa %xmm8,%xmm14 + pslld $30-19,%xmm2 + pxor %xmm1,%xmm7 + pxor %xmm3,%xmm14 + paddd %xmm5,%xmm10 + pxor %xmm2,%xmm7 + + paddd %xmm5,%xmm14 + paddd %xmm7,%xmm14 + movd 8(%r8),%xmm5 + movd 8(%r9),%xmm0 + movd 8(%r10),%xmm1 + movd 8(%r11),%xmm2 + punpckldq %xmm1,%xmm5 + punpckldq %xmm2,%xmm0 + punpckldq %xmm0,%xmm5 + movdqa %xmm10,%xmm7 +.byte 102,15,56,0,238 + movdqa %xmm10,%xmm2 + + psrld $6,%xmm7 + movdqa %xmm10,%xmm1 + pslld $7,%xmm2 + movdqa %xmm5,32-128(%rax) + paddd %xmm13,%xmm5 + + psrld $11,%xmm1 + pxor %xmm2,%xmm7 + pslld $21-7,%xmm2 + paddd -64(%rbp),%xmm5 + pxor %xmm1,%xmm7 + + psrld $25-11,%xmm1 + movdqa %xmm10,%xmm0 + + pxor %xmm2,%xmm7 + movdqa %xmm10,%xmm3 + pslld $26-21,%xmm2 + pandn %xmm12,%xmm0 + pand %xmm11,%xmm3 + pxor %xmm1,%xmm7 + + + movdqa %xmm14,%xmm1 + pxor %xmm2,%xmm7 + movdqa %xmm14,%xmm2 + psrld $2,%xmm1 + paddd %xmm7,%xmm5 + pxor %xmm3,%xmm0 + movdqa %xmm15,%xmm3 + movdqa %xmm14,%xmm7 + pslld $10,%xmm2 + pxor %xmm14,%xmm3 + + + psrld $13,%xmm7 + pxor %xmm2,%xmm1 + paddd %xmm0,%xmm5 + pslld $19-10,%xmm2 + pand %xmm3,%xmm4 + pxor %xmm7,%xmm1 + + + psrld $22-13,%xmm7 + pxor %xmm2,%xmm1 + movdqa %xmm15,%xmm13 + pslld $30-19,%xmm2 + pxor %xmm1,%xmm7 + pxor %xmm4,%xmm13 + paddd %xmm5,%xmm9 + pxor %xmm2,%xmm7 + + paddd %xmm5,%xmm13 + paddd %xmm7,%xmm13 + movd 12(%r8),%xmm5 + movd 12(%r9),%xmm0 + movd 12(%r10),%xmm1 + movd 12(%r11),%xmm2 + punpckldq %xmm1,%xmm5 + punpckldq %xmm2,%xmm0 + punpckldq %xmm0,%xmm5 + movdqa %xmm9,%xmm7 + + movdqa %xmm9,%xmm2 +.byte 102,15,56,0,238 + psrld $6,%xmm7 + movdqa %xmm9,%xmm1 + pslld $7,%xmm2 + movdqa %xmm5,48-128(%rax) + paddd %xmm12,%xmm5 + + psrld $11,%xmm1 + pxor %xmm2,%xmm7 + pslld $21-7,%xmm2 + paddd -32(%rbp),%xmm5 + pxor %xmm1,%xmm7 + + psrld $25-11,%xmm1 + movdqa %xmm9,%xmm0 + + pxor %xmm2,%xmm7 + movdqa %xmm9,%xmm4 + pslld $26-21,%xmm2 + pandn %xmm11,%xmm0 + pand %xmm10,%xmm4 + pxor %xmm1,%xmm7 + + + movdqa %xmm13,%xmm1 + pxor %xmm2,%xmm7 + movdqa %xmm13,%xmm2 + psrld $2,%xmm1 + paddd %xmm7,%xmm5 + pxor %xmm4,%xmm0 + movdqa %xmm14,%xmm4 + movdqa %xmm13,%xmm7 + pslld $10,%xmm2 + pxor %xmm13,%xmm4 + + + psrld $13,%xmm7 + pxor %xmm2,%xmm1 + paddd %xmm0,%xmm5 + pslld $19-10,%xmm2 + pand %xmm4,%xmm3 + pxor %xmm7,%xmm1 + + + psrld $22-13,%xmm7 + pxor %xmm2,%xmm1 + movdqa %xmm14,%xmm12 + pslld $30-19,%xmm2 + pxor %xmm1,%xmm7 + pxor %xmm3,%xmm12 + paddd %xmm5,%xmm8 + pxor %xmm2,%xmm7 + + paddd %xmm5,%xmm12 + paddd %xmm7,%xmm12 + movd 16(%r8),%xmm5 + movd 16(%r9),%xmm0 + movd 16(%r10),%xmm1 + movd 16(%r11),%xmm2 + punpckldq %xmm1,%xmm5 + punpckldq %xmm2,%xmm0 + punpckldq %xmm0,%xmm5 + movdqa %xmm8,%xmm7 +.byte 102,15,56,0,238 + movdqa %xmm8,%xmm2 + + psrld $6,%xmm7 + movdqa %xmm8,%xmm1 + pslld $7,%xmm2 + movdqa %xmm5,64-128(%rax) + paddd %xmm11,%xmm5 + + psrld $11,%xmm1 + pxor %xmm2,%xmm7 + pslld $21-7,%xmm2 + paddd 0(%rbp),%xmm5 + pxor %xmm1,%xmm7 + + psrld $25-11,%xmm1 + movdqa %xmm8,%xmm0 + + pxor %xmm2,%xmm7 + movdqa %xmm8,%xmm3 + pslld $26-21,%xmm2 + pandn %xmm10,%xmm0 + pand %xmm9,%xmm3 + pxor %xmm1,%xmm7 + + + movdqa %xmm12,%xmm1 + pxor %xmm2,%xmm7 + movdqa %xmm12,%xmm2 + psrld $2,%xmm1 + paddd %xmm7,%xmm5 + pxor %xmm3,%xmm0 + movdqa %xmm13,%xmm3 + movdqa %xmm12,%xmm7 + pslld $10,%xmm2 + pxor %xmm12,%xmm3 + + + psrld $13,%xmm7 + pxor %xmm2,%xmm1 + paddd %xmm0,%xmm5 + pslld $19-10,%xmm2 + pand %xmm3,%xmm4 + pxor %xmm7,%xmm1 + + + psrld $22-13,%xmm7 + pxor %xmm2,%xmm1 + movdqa %xmm13,%xmm11 + pslld $30-19,%xmm2 + pxor %xmm1,%xmm7 + pxor %xmm4,%xmm11 + paddd %xmm5,%xmm15 + pxor %xmm2,%xmm7 + + paddd %xmm5,%xmm11 + paddd %xmm7,%xmm11 + movd 20(%r8),%xmm5 + movd 20(%r9),%xmm0 + movd 20(%r10),%xmm1 + movd 20(%r11),%xmm2 + punpckldq %xmm1,%xmm5 + punpckldq %xmm2,%xmm0 + punpckldq %xmm0,%xmm5 + movdqa %xmm15,%xmm7 + + movdqa %xmm15,%xmm2 +.byte 102,15,56,0,238 + psrld $6,%xmm7 + movdqa %xmm15,%xmm1 + pslld $7,%xmm2 + movdqa %xmm5,80-128(%rax) + paddd %xmm10,%xmm5 + + psrld $11,%xmm1 + pxor %xmm2,%xmm7 + pslld $21-7,%xmm2 + paddd 32(%rbp),%xmm5 + pxor %xmm1,%xmm7 + + psrld $25-11,%xmm1 + movdqa %xmm15,%xmm0 + + pxor %xmm2,%xmm7 + movdqa %xmm15,%xmm4 + pslld $26-21,%xmm2 + pandn %xmm9,%xmm0 + pand %xmm8,%xmm4 + pxor %xmm1,%xmm7 + + + movdqa %xmm11,%xmm1 + pxor %xmm2,%xmm7 + movdqa %xmm11,%xmm2 + psrld $2,%xmm1 + paddd %xmm7,%xmm5 + pxor %xmm4,%xmm0 + movdqa %xmm12,%xmm4 + movdqa %xmm11,%xmm7 + pslld $10,%xmm2 + pxor %xmm11,%xmm4 + + + psrld $13,%xmm7 + pxor %xmm2,%xmm1 + paddd %xmm0,%xmm5 + pslld $19-10,%xmm2 + pand %xmm4,%xmm3 + pxor %xmm7,%xmm1 + + + psrld $22-13,%xmm7 + pxor %xmm2,%xmm1 + movdqa %xmm12,%xmm10 + pslld $30-19,%xmm2 + pxor %xmm1,%xmm7 + pxor %xmm3,%xmm10 + paddd %xmm5,%xmm14 + pxor %xmm2,%xmm7 + + paddd %xmm5,%xmm10 + paddd %xmm7,%xmm10 + movd 24(%r8),%xmm5 + movd 24(%r9),%xmm0 + movd 24(%r10),%xmm1 + movd 24(%r11),%xmm2 + punpckldq %xmm1,%xmm5 + punpckldq %xmm2,%xmm0 + punpckldq %xmm0,%xmm5 + movdqa %xmm14,%xmm7 +.byte 102,15,56,0,238 + movdqa %xmm14,%xmm2 + + psrld $6,%xmm7 + movdqa %xmm14,%xmm1 + pslld $7,%xmm2 + movdqa %xmm5,96-128(%rax) + paddd %xmm9,%xmm5 + + psrld $11,%xmm1 + pxor %xmm2,%xmm7 + pslld $21-7,%xmm2 + paddd 64(%rbp),%xmm5 + pxor %xmm1,%xmm7 + + psrld $25-11,%xmm1 + movdqa %xmm14,%xmm0 + + pxor %xmm2,%xmm7 + movdqa %xmm14,%xmm3 + pslld $26-21,%xmm2 + pandn %xmm8,%xmm0 + pand %xmm15,%xmm3 + pxor %xmm1,%xmm7 + + + movdqa %xmm10,%xmm1 + pxor %xmm2,%xmm7 + movdqa %xmm10,%xmm2 + psrld $2,%xmm1 + paddd %xmm7,%xmm5 + pxor %xmm3,%xmm0 + movdqa %xmm11,%xmm3 + movdqa %xmm10,%xmm7 + pslld $10,%xmm2 + pxor %xmm10,%xmm3 + + + psrld $13,%xmm7 + pxor %xmm2,%xmm1 + paddd %xmm0,%xmm5 + pslld $19-10,%xmm2 + pand %xmm3,%xmm4 + pxor %xmm7,%xmm1 + + + psrld $22-13,%xmm7 + pxor %xmm2,%xmm1 + movdqa %xmm11,%xmm9 + pslld $30-19,%xmm2 + pxor %xmm1,%xmm7 + pxor %xmm4,%xmm9 + paddd %xmm5,%xmm13 + pxor %xmm2,%xmm7 + + paddd %xmm5,%xmm9 + paddd %xmm7,%xmm9 + movd 28(%r8),%xmm5 + movd 28(%r9),%xmm0 + movd 28(%r10),%xmm1 + movd 28(%r11),%xmm2 + punpckldq %xmm1,%xmm5 + punpckldq %xmm2,%xmm0 + punpckldq %xmm0,%xmm5 + movdqa %xmm13,%xmm7 + + movdqa %xmm13,%xmm2 +.byte 102,15,56,0,238 + psrld $6,%xmm7 + movdqa %xmm13,%xmm1 + pslld $7,%xmm2 + movdqa %xmm5,112-128(%rax) + paddd %xmm8,%xmm5 + + psrld $11,%xmm1 + pxor %xmm2,%xmm7 + pslld $21-7,%xmm2 + paddd 96(%rbp),%xmm5 + pxor %xmm1,%xmm7 + + psrld $25-11,%xmm1 + movdqa %xmm13,%xmm0 + + pxor %xmm2,%xmm7 + movdqa %xmm13,%xmm4 + pslld $26-21,%xmm2 + pandn %xmm15,%xmm0 + pand %xmm14,%xmm4 + pxor %xmm1,%xmm7 + + + movdqa %xmm9,%xmm1 + pxor %xmm2,%xmm7 + movdqa %xmm9,%xmm2 + psrld $2,%xmm1 + paddd %xmm7,%xmm5 + pxor %xmm4,%xmm0 + movdqa %xmm10,%xmm4 + movdqa %xmm9,%xmm7 + pslld $10,%xmm2 + pxor %xmm9,%xmm4 + + + psrld $13,%xmm7 + pxor %xmm2,%xmm1 + paddd %xmm0,%xmm5 + pslld $19-10,%xmm2 + pand %xmm4,%xmm3 + pxor %xmm7,%xmm1 + + + psrld $22-13,%xmm7 + pxor %xmm2,%xmm1 + movdqa %xmm10,%xmm8 + pslld $30-19,%xmm2 + pxor %xmm1,%xmm7 + pxor %xmm3,%xmm8 + paddd %xmm5,%xmm12 + pxor %xmm2,%xmm7 + + paddd %xmm5,%xmm8 + paddd %xmm7,%xmm8 + leaq 256(%rbp),%rbp + movd 32(%r8),%xmm5 + movd 32(%r9),%xmm0 + movd 32(%r10),%xmm1 + movd 32(%r11),%xmm2 + punpckldq %xmm1,%xmm5 + punpckldq %xmm2,%xmm0 + punpckldq %xmm0,%xmm5 + movdqa %xmm12,%xmm7 +.byte 102,15,56,0,238 + movdqa %xmm12,%xmm2 + + psrld $6,%xmm7 + movdqa %xmm12,%xmm1 + pslld $7,%xmm2 + movdqa %xmm5,128-128(%rax) + paddd %xmm15,%xmm5 + + psrld $11,%xmm1 + pxor %xmm2,%xmm7 + pslld $21-7,%xmm2 + paddd -128(%rbp),%xmm5 + pxor %xmm1,%xmm7 + + psrld $25-11,%xmm1 + movdqa %xmm12,%xmm0 + + pxor %xmm2,%xmm7 + movdqa %xmm12,%xmm3 + pslld $26-21,%xmm2 + pandn %xmm14,%xmm0 + pand %xmm13,%xmm3 + pxor %xmm1,%xmm7 + + + movdqa %xmm8,%xmm1 + pxor %xmm2,%xmm7 + movdqa %xmm8,%xmm2 + psrld $2,%xmm1 + paddd %xmm7,%xmm5 + pxor %xmm3,%xmm0 + movdqa %xmm9,%xmm3 + movdqa %xmm8,%xmm7 + pslld $10,%xmm2 + pxor %xmm8,%xmm3 + + + psrld $13,%xmm7 + pxor %xmm2,%xmm1 + paddd %xmm0,%xmm5 + pslld $19-10,%xmm2 + pand %xmm3,%xmm4 + pxor %xmm7,%xmm1 + + + psrld $22-13,%xmm7 + pxor %xmm2,%xmm1 + movdqa %xmm9,%xmm15 + pslld $30-19,%xmm2 + pxor %xmm1,%xmm7 + pxor %xmm4,%xmm15 + paddd %xmm5,%xmm11 + pxor %xmm2,%xmm7 + + paddd %xmm5,%xmm15 + paddd %xmm7,%xmm15 + movd 36(%r8),%xmm5 + movd 36(%r9),%xmm0 + movd 36(%r10),%xmm1 + movd 36(%r11),%xmm2 + punpckldq %xmm1,%xmm5 + punpckldq %xmm2,%xmm0 + punpckldq %xmm0,%xmm5 + movdqa %xmm11,%xmm7 + + movdqa %xmm11,%xmm2 +.byte 102,15,56,0,238 + psrld $6,%xmm7 + movdqa %xmm11,%xmm1 + pslld $7,%xmm2 + movdqa %xmm5,144-128(%rax) + paddd %xmm14,%xmm5 + + psrld $11,%xmm1 + pxor %xmm2,%xmm7 + pslld $21-7,%xmm2 + paddd -96(%rbp),%xmm5 + pxor %xmm1,%xmm7 + + psrld $25-11,%xmm1 + movdqa %xmm11,%xmm0 + + pxor %xmm2,%xmm7 + movdqa %xmm11,%xmm4 + pslld $26-21,%xmm2 + pandn %xmm13,%xmm0 + pand %xmm12,%xmm4 + pxor %xmm1,%xmm7 + + + movdqa %xmm15,%xmm1 + pxor %xmm2,%xmm7 + movdqa %xmm15,%xmm2 + psrld $2,%xmm1 + paddd %xmm7,%xmm5 + pxor %xmm4,%xmm0 + movdqa %xmm8,%xmm4 + movdqa %xmm15,%xmm7 + pslld $10,%xmm2 + pxor %xmm15,%xmm4 + + + psrld $13,%xmm7 + pxor %xmm2,%xmm1 + paddd %xmm0,%xmm5 + pslld $19-10,%xmm2 + pand %xmm4,%xmm3 + pxor %xmm7,%xmm1 + + + psrld $22-13,%xmm7 + pxor %xmm2,%xmm1 + movdqa %xmm8,%xmm14 + pslld $30-19,%xmm2 + pxor %xmm1,%xmm7 + pxor %xmm3,%xmm14 + paddd %xmm5,%xmm10 + pxor %xmm2,%xmm7 + + paddd %xmm5,%xmm14 + paddd %xmm7,%xmm14 + movd 40(%r8),%xmm5 + movd 40(%r9),%xmm0 + movd 40(%r10),%xmm1 + movd 40(%r11),%xmm2 + punpckldq %xmm1,%xmm5 + punpckldq %xmm2,%xmm0 + punpckldq %xmm0,%xmm5 + movdqa %xmm10,%xmm7 +.byte 102,15,56,0,238 + movdqa %xmm10,%xmm2 + + psrld $6,%xmm7 + movdqa %xmm10,%xmm1 + pslld $7,%xmm2 + movdqa %xmm5,160-128(%rax) + paddd %xmm13,%xmm5 + + psrld $11,%xmm1 + pxor %xmm2,%xmm7 + pslld $21-7,%xmm2 + paddd -64(%rbp),%xmm5 + pxor %xmm1,%xmm7 + + psrld $25-11,%xmm1 + movdqa %xmm10,%xmm0 + + pxor %xmm2,%xmm7 + movdqa %xmm10,%xmm3 + pslld $26-21,%xmm2 + pandn %xmm12,%xmm0 + pand %xmm11,%xmm3 + pxor %xmm1,%xmm7 + + + movdqa %xmm14,%xmm1 + pxor %xmm2,%xmm7 + movdqa %xmm14,%xmm2 + psrld $2,%xmm1 + paddd %xmm7,%xmm5 + pxor %xmm3,%xmm0 + movdqa %xmm15,%xmm3 + movdqa %xmm14,%xmm7 + pslld $10,%xmm2 + pxor %xmm14,%xmm3 + + + psrld $13,%xmm7 + pxor %xmm2,%xmm1 + paddd %xmm0,%xmm5 + pslld $19-10,%xmm2 + pand %xmm3,%xmm4 + pxor %xmm7,%xmm1 + + + psrld $22-13,%xmm7 + pxor %xmm2,%xmm1 + movdqa %xmm15,%xmm13 + pslld $30-19,%xmm2 + pxor %xmm1,%xmm7 + pxor %xmm4,%xmm13 + paddd %xmm5,%xmm9 + pxor %xmm2,%xmm7 + + paddd %xmm5,%xmm13 + paddd %xmm7,%xmm13 + movd 44(%r8),%xmm5 + movd 44(%r9),%xmm0 + movd 44(%r10),%xmm1 + movd 44(%r11),%xmm2 + punpckldq %xmm1,%xmm5 + punpckldq %xmm2,%xmm0 + punpckldq %xmm0,%xmm5 + movdqa %xmm9,%xmm7 + + movdqa %xmm9,%xmm2 +.byte 102,15,56,0,238 + psrld $6,%xmm7 + movdqa %xmm9,%xmm1 + pslld $7,%xmm2 + movdqa %xmm5,176-128(%rax) + paddd %xmm12,%xmm5 + + psrld $11,%xmm1 + pxor %xmm2,%xmm7 + pslld $21-7,%xmm2 + paddd -32(%rbp),%xmm5 + pxor %xmm1,%xmm7 + + psrld $25-11,%xmm1 + movdqa %xmm9,%xmm0 + + pxor %xmm2,%xmm7 + movdqa %xmm9,%xmm4 + pslld $26-21,%xmm2 + pandn %xmm11,%xmm0 + pand %xmm10,%xmm4 + pxor %xmm1,%xmm7 + + + movdqa %xmm13,%xmm1 + pxor %xmm2,%xmm7 + movdqa %xmm13,%xmm2 + psrld $2,%xmm1 + paddd %xmm7,%xmm5 + pxor %xmm4,%xmm0 + movdqa %xmm14,%xmm4 + movdqa %xmm13,%xmm7 + pslld $10,%xmm2 + pxor %xmm13,%xmm4 + + + psrld $13,%xmm7 + pxor %xmm2,%xmm1 + paddd %xmm0,%xmm5 + pslld $19-10,%xmm2 + pand %xmm4,%xmm3 + pxor %xmm7,%xmm1 + + + psrld $22-13,%xmm7 + pxor %xmm2,%xmm1 + movdqa %xmm14,%xmm12 + pslld $30-19,%xmm2 + pxor %xmm1,%xmm7 + pxor %xmm3,%xmm12 + paddd %xmm5,%xmm8 + pxor %xmm2,%xmm7 + + paddd %xmm5,%xmm12 + paddd %xmm7,%xmm12 + movd 48(%r8),%xmm5 + movd 48(%r9),%xmm0 + movd 48(%r10),%xmm1 + movd 48(%r11),%xmm2 + punpckldq %xmm1,%xmm5 + punpckldq %xmm2,%xmm0 + punpckldq %xmm0,%xmm5 + movdqa %xmm8,%xmm7 +.byte 102,15,56,0,238 + movdqa %xmm8,%xmm2 + + psrld $6,%xmm7 + movdqa %xmm8,%xmm1 + pslld $7,%xmm2 + movdqa %xmm5,192-128(%rax) + paddd %xmm11,%xmm5 + + psrld $11,%xmm1 + pxor %xmm2,%xmm7 + pslld $21-7,%xmm2 + paddd 0(%rbp),%xmm5 + pxor %xmm1,%xmm7 + + psrld $25-11,%xmm1 + movdqa %xmm8,%xmm0 + + pxor %xmm2,%xmm7 + movdqa %xmm8,%xmm3 + pslld $26-21,%xmm2 + pandn %xmm10,%xmm0 + pand %xmm9,%xmm3 + pxor %xmm1,%xmm7 + + + movdqa %xmm12,%xmm1 + pxor %xmm2,%xmm7 + movdqa %xmm12,%xmm2 + psrld $2,%xmm1 + paddd %xmm7,%xmm5 + pxor %xmm3,%xmm0 + movdqa %xmm13,%xmm3 + movdqa %xmm12,%xmm7 + pslld $10,%xmm2 + pxor %xmm12,%xmm3 + + + psrld $13,%xmm7 + pxor %xmm2,%xmm1 + paddd %xmm0,%xmm5 + pslld $19-10,%xmm2 + pand %xmm3,%xmm4 + pxor %xmm7,%xmm1 + + + psrld $22-13,%xmm7 + pxor %xmm2,%xmm1 + movdqa %xmm13,%xmm11 + pslld $30-19,%xmm2 + pxor %xmm1,%xmm7 + pxor %xmm4,%xmm11 + paddd %xmm5,%xmm15 + pxor %xmm2,%xmm7 + + paddd %xmm5,%xmm11 + paddd %xmm7,%xmm11 + movd 52(%r8),%xmm5 + movd 52(%r9),%xmm0 + movd 52(%r10),%xmm1 + movd 52(%r11),%xmm2 + punpckldq %xmm1,%xmm5 + punpckldq %xmm2,%xmm0 + punpckldq %xmm0,%xmm5 + movdqa %xmm15,%xmm7 + + movdqa %xmm15,%xmm2 +.byte 102,15,56,0,238 + psrld $6,%xmm7 + movdqa %xmm15,%xmm1 + pslld $7,%xmm2 + movdqa %xmm5,208-128(%rax) + paddd %xmm10,%xmm5 + + psrld $11,%xmm1 + pxor %xmm2,%xmm7 + pslld $21-7,%xmm2 + paddd 32(%rbp),%xmm5 + pxor %xmm1,%xmm7 + + psrld $25-11,%xmm1 + movdqa %xmm15,%xmm0 + + pxor %xmm2,%xmm7 + movdqa %xmm15,%xmm4 + pslld $26-21,%xmm2 + pandn %xmm9,%xmm0 + pand %xmm8,%xmm4 + pxor %xmm1,%xmm7 + + + movdqa %xmm11,%xmm1 + pxor %xmm2,%xmm7 + movdqa %xmm11,%xmm2 + psrld $2,%xmm1 + paddd %xmm7,%xmm5 + pxor %xmm4,%xmm0 + movdqa %xmm12,%xmm4 + movdqa %xmm11,%xmm7 + pslld $10,%xmm2 + pxor %xmm11,%xmm4 + + + psrld $13,%xmm7 + pxor %xmm2,%xmm1 + paddd %xmm0,%xmm5 + pslld $19-10,%xmm2 + pand %xmm4,%xmm3 + pxor %xmm7,%xmm1 + + + psrld $22-13,%xmm7 + pxor %xmm2,%xmm1 + movdqa %xmm12,%xmm10 + pslld $30-19,%xmm2 + pxor %xmm1,%xmm7 + pxor %xmm3,%xmm10 + paddd %xmm5,%xmm14 + pxor %xmm2,%xmm7 + + paddd %xmm5,%xmm10 + paddd %xmm7,%xmm10 + movd 56(%r8),%xmm5 + movd 56(%r9),%xmm0 + movd 56(%r10),%xmm1 + movd 56(%r11),%xmm2 + punpckldq %xmm1,%xmm5 + punpckldq %xmm2,%xmm0 + punpckldq %xmm0,%xmm5 + movdqa %xmm14,%xmm7 +.byte 102,15,56,0,238 + movdqa %xmm14,%xmm2 + + psrld $6,%xmm7 + movdqa %xmm14,%xmm1 + pslld $7,%xmm2 + movdqa %xmm5,224-128(%rax) + paddd %xmm9,%xmm5 + + psrld $11,%xmm1 + pxor %xmm2,%xmm7 + pslld $21-7,%xmm2 + paddd 64(%rbp),%xmm5 + pxor %xmm1,%xmm7 + + psrld $25-11,%xmm1 + movdqa %xmm14,%xmm0 + + pxor %xmm2,%xmm7 + movdqa %xmm14,%xmm3 + pslld $26-21,%xmm2 + pandn %xmm8,%xmm0 + pand %xmm15,%xmm3 + pxor %xmm1,%xmm7 + + + movdqa %xmm10,%xmm1 + pxor %xmm2,%xmm7 + movdqa %xmm10,%xmm2 + psrld $2,%xmm1 + paddd %xmm7,%xmm5 + pxor %xmm3,%xmm0 + movdqa %xmm11,%xmm3 + movdqa %xmm10,%xmm7 + pslld $10,%xmm2 + pxor %xmm10,%xmm3 + + + psrld $13,%xmm7 + pxor %xmm2,%xmm1 + paddd %xmm0,%xmm5 + pslld $19-10,%xmm2 + pand %xmm3,%xmm4 + pxor %xmm7,%xmm1 + + + psrld $22-13,%xmm7 + pxor %xmm2,%xmm1 + movdqa %xmm11,%xmm9 + pslld $30-19,%xmm2 + pxor %xmm1,%xmm7 + pxor %xmm4,%xmm9 + paddd %xmm5,%xmm13 + pxor %xmm2,%xmm7 + + paddd %xmm5,%xmm9 + paddd %xmm7,%xmm9 + movd 60(%r8),%xmm5 + leaq 64(%r8),%r8 + movd 60(%r9),%xmm0 + leaq 64(%r9),%r9 + movd 60(%r10),%xmm1 + leaq 64(%r10),%r10 + movd 60(%r11),%xmm2 + leaq 64(%r11),%r11 + punpckldq %xmm1,%xmm5 + punpckldq %xmm2,%xmm0 + punpckldq %xmm0,%xmm5 + movdqa %xmm13,%xmm7 + + movdqa %xmm13,%xmm2 +.byte 102,15,56,0,238 + psrld $6,%xmm7 + movdqa %xmm13,%xmm1 + pslld $7,%xmm2 + movdqa %xmm5,240-128(%rax) + paddd %xmm8,%xmm5 + + psrld $11,%xmm1 + pxor %xmm2,%xmm7 + pslld $21-7,%xmm2 + paddd 96(%rbp),%xmm5 + pxor %xmm1,%xmm7 + + psrld $25-11,%xmm1 + movdqa %xmm13,%xmm0 + prefetcht0 63(%r8) + pxor %xmm2,%xmm7 + movdqa %xmm13,%xmm4 + pslld $26-21,%xmm2 + pandn %xmm15,%xmm0 + pand %xmm14,%xmm4 + pxor %xmm1,%xmm7 + + prefetcht0 63(%r9) + movdqa %xmm9,%xmm1 + pxor %xmm2,%xmm7 + movdqa %xmm9,%xmm2 + psrld $2,%xmm1 + paddd %xmm7,%xmm5 + pxor %xmm4,%xmm0 + movdqa %xmm10,%xmm4 + movdqa %xmm9,%xmm7 + pslld $10,%xmm2 + pxor %xmm9,%xmm4 + + prefetcht0 63(%r10) + psrld $13,%xmm7 + pxor %xmm2,%xmm1 + paddd %xmm0,%xmm5 + pslld $19-10,%xmm2 + pand %xmm4,%xmm3 + pxor %xmm7,%xmm1 + + prefetcht0 63(%r11) + psrld $22-13,%xmm7 + pxor %xmm2,%xmm1 + movdqa %xmm10,%xmm8 + pslld $30-19,%xmm2 + pxor %xmm1,%xmm7 + pxor %xmm3,%xmm8 + paddd %xmm5,%xmm12 + pxor %xmm2,%xmm7 + + paddd %xmm5,%xmm8 + paddd %xmm7,%xmm8 + leaq 256(%rbp),%rbp + movdqu 0-128(%rax),%xmm5 + movl $3,%ecx + jmp .Loop_16_xx +.align 32 +.Loop_16_xx: + movdqa 16-128(%rax),%xmm6 + paddd 144-128(%rax),%xmm5 + + movdqa %xmm6,%xmm7 + movdqa %xmm6,%xmm1 + psrld $3,%xmm7 + movdqa %xmm6,%xmm2 + + psrld $7,%xmm1 + movdqa 224-128(%rax),%xmm0 + pslld $14,%xmm2 + pxor %xmm1,%xmm7 + psrld $18-7,%xmm1 + movdqa %xmm0,%xmm3 + pxor %xmm2,%xmm7 + pslld $25-14,%xmm2 + pxor %xmm1,%xmm7 + psrld $10,%xmm0 + movdqa %xmm3,%xmm1 + + psrld $17,%xmm3 + pxor %xmm2,%xmm7 + pslld $13,%xmm1 + paddd %xmm7,%xmm5 + pxor %xmm3,%xmm0 + psrld $19-17,%xmm3 + pxor %xmm1,%xmm0 + pslld $15-13,%xmm1 + pxor %xmm3,%xmm0 + pxor %xmm1,%xmm0 + paddd %xmm0,%xmm5 + movdqa %xmm12,%xmm7 + + movdqa %xmm12,%xmm2 + + psrld $6,%xmm7 + movdqa %xmm12,%xmm1 + pslld $7,%xmm2 + movdqa %xmm5,0-128(%rax) + paddd %xmm15,%xmm5 + + psrld $11,%xmm1 + pxor %xmm2,%xmm7 + pslld $21-7,%xmm2 + paddd -128(%rbp),%xmm5 + pxor %xmm1,%xmm7 + + psrld $25-11,%xmm1 + movdqa %xmm12,%xmm0 + + pxor %xmm2,%xmm7 + movdqa %xmm12,%xmm3 + pslld $26-21,%xmm2 + pandn %xmm14,%xmm0 + pand %xmm13,%xmm3 + pxor %xmm1,%xmm7 + + + movdqa %xmm8,%xmm1 + pxor %xmm2,%xmm7 + movdqa %xmm8,%xmm2 + psrld $2,%xmm1 + paddd %xmm7,%xmm5 + pxor %xmm3,%xmm0 + movdqa %xmm9,%xmm3 + movdqa %xmm8,%xmm7 + pslld $10,%xmm2 + pxor %xmm8,%xmm3 + + + psrld $13,%xmm7 + pxor %xmm2,%xmm1 + paddd %xmm0,%xmm5 + pslld $19-10,%xmm2 + pand %xmm3,%xmm4 + pxor %xmm7,%xmm1 + + + psrld $22-13,%xmm7 + pxor %xmm2,%xmm1 + movdqa %xmm9,%xmm15 + pslld $30-19,%xmm2 + pxor %xmm1,%xmm7 + pxor %xmm4,%xmm15 + paddd %xmm5,%xmm11 + pxor %xmm2,%xmm7 + + paddd %xmm5,%xmm15 + paddd %xmm7,%xmm15 + movdqa 32-128(%rax),%xmm5 + paddd 160-128(%rax),%xmm6 + + movdqa %xmm5,%xmm7 + movdqa %xmm5,%xmm1 + psrld $3,%xmm7 + movdqa %xmm5,%xmm2 + + psrld $7,%xmm1 + movdqa 240-128(%rax),%xmm0 + pslld $14,%xmm2 + pxor %xmm1,%xmm7 + psrld $18-7,%xmm1 + movdqa %xmm0,%xmm4 + pxor %xmm2,%xmm7 + pslld $25-14,%xmm2 + pxor %xmm1,%xmm7 + psrld $10,%xmm0 + movdqa %xmm4,%xmm1 + + psrld $17,%xmm4 + pxor %xmm2,%xmm7 + pslld $13,%xmm1 + paddd %xmm7,%xmm6 + pxor %xmm4,%xmm0 + psrld $19-17,%xmm4 + pxor %xmm1,%xmm0 + pslld $15-13,%xmm1 + pxor %xmm4,%xmm0 + pxor %xmm1,%xmm0 + paddd %xmm0,%xmm6 + movdqa %xmm11,%xmm7 + + movdqa %xmm11,%xmm2 + + psrld $6,%xmm7 + movdqa %xmm11,%xmm1 + pslld $7,%xmm2 + movdqa %xmm6,16-128(%rax) + paddd %xmm14,%xmm6 + + psrld $11,%xmm1 + pxor %xmm2,%xmm7 + pslld $21-7,%xmm2 + paddd -96(%rbp),%xmm6 + pxor %xmm1,%xmm7 + + psrld $25-11,%xmm1 + movdqa %xmm11,%xmm0 + + pxor %xmm2,%xmm7 + movdqa %xmm11,%xmm4 + pslld $26-21,%xmm2 + pandn %xmm13,%xmm0 + pand %xmm12,%xmm4 + pxor %xmm1,%xmm7 + + + movdqa %xmm15,%xmm1 + pxor %xmm2,%xmm7 + movdqa %xmm15,%xmm2 + psrld $2,%xmm1 + paddd %xmm7,%xmm6 + pxor %xmm4,%xmm0 + movdqa %xmm8,%xmm4 + movdqa %xmm15,%xmm7 + pslld $10,%xmm2 + pxor %xmm15,%xmm4 + + + psrld $13,%xmm7 + pxor %xmm2,%xmm1 + paddd %xmm0,%xmm6 + pslld $19-10,%xmm2 + pand %xmm4,%xmm3 + pxor %xmm7,%xmm1 + + + psrld $22-13,%xmm7 + pxor %xmm2,%xmm1 + movdqa %xmm8,%xmm14 + pslld $30-19,%xmm2 + pxor %xmm1,%xmm7 + pxor %xmm3,%xmm14 + paddd %xmm6,%xmm10 + pxor %xmm2,%xmm7 + + paddd %xmm6,%xmm14 + paddd %xmm7,%xmm14 + movdqa 48-128(%rax),%xmm6 + paddd 176-128(%rax),%xmm5 + + movdqa %xmm6,%xmm7 + movdqa %xmm6,%xmm1 + psrld $3,%xmm7 + movdqa %xmm6,%xmm2 + + psrld $7,%xmm1 + movdqa 0-128(%rax),%xmm0 + pslld $14,%xmm2 + pxor %xmm1,%xmm7 + psrld $18-7,%xmm1 + movdqa %xmm0,%xmm3 + pxor %xmm2,%xmm7 + pslld $25-14,%xmm2 + pxor %xmm1,%xmm7 + psrld $10,%xmm0 + movdqa %xmm3,%xmm1 + + psrld $17,%xmm3 + pxor %xmm2,%xmm7 + pslld $13,%xmm1 + paddd %xmm7,%xmm5 + pxor %xmm3,%xmm0 + psrld $19-17,%xmm3 + pxor %xmm1,%xmm0 + pslld $15-13,%xmm1 + pxor %xmm3,%xmm0 + pxor %xmm1,%xmm0 + paddd %xmm0,%xmm5 + movdqa %xmm10,%xmm7 + + movdqa %xmm10,%xmm2 + + psrld $6,%xmm7 + movdqa %xmm10,%xmm1 + pslld $7,%xmm2 + movdqa %xmm5,32-128(%rax) + paddd %xmm13,%xmm5 + + psrld $11,%xmm1 + pxor %xmm2,%xmm7 + pslld $21-7,%xmm2 + paddd -64(%rbp),%xmm5 + pxor %xmm1,%xmm7 + + psrld $25-11,%xmm1 + movdqa %xmm10,%xmm0 + + pxor %xmm2,%xmm7 + movdqa %xmm10,%xmm3 + pslld $26-21,%xmm2 + pandn %xmm12,%xmm0 + pand %xmm11,%xmm3 + pxor %xmm1,%xmm7 + + + movdqa %xmm14,%xmm1 + pxor %xmm2,%xmm7 + movdqa %xmm14,%xmm2 + psrld $2,%xmm1 + paddd %xmm7,%xmm5 + pxor %xmm3,%xmm0 + movdqa %xmm15,%xmm3 + movdqa %xmm14,%xmm7 + pslld $10,%xmm2 + pxor %xmm14,%xmm3 + + + psrld $13,%xmm7 + pxor %xmm2,%xmm1 + paddd %xmm0,%xmm5 + pslld $19-10,%xmm2 + pand %xmm3,%xmm4 + pxor %xmm7,%xmm1 + + + psrld $22-13,%xmm7 + pxor %xmm2,%xmm1 + movdqa %xmm15,%xmm13 + pslld $30-19,%xmm2 + pxor %xmm1,%xmm7 + pxor %xmm4,%xmm13 + paddd %xmm5,%xmm9 + pxor %xmm2,%xmm7 + + paddd %xmm5,%xmm13 + paddd %xmm7,%xmm13 + movdqa 64-128(%rax),%xmm5 + paddd 192-128(%rax),%xmm6 + + movdqa %xmm5,%xmm7 + movdqa %xmm5,%xmm1 + psrld $3,%xmm7 + movdqa %xmm5,%xmm2 + + psrld $7,%xmm1 + movdqa 16-128(%rax),%xmm0 + pslld $14,%xmm2 + pxor %xmm1,%xmm7 + psrld $18-7,%xmm1 + movdqa %xmm0,%xmm4 + pxor %xmm2,%xmm7 + pslld $25-14,%xmm2 + pxor %xmm1,%xmm7 + psrld $10,%xmm0 + movdqa %xmm4,%xmm1 + + psrld $17,%xmm4 + pxor %xmm2,%xmm7 + pslld $13,%xmm1 + paddd %xmm7,%xmm6 + pxor %xmm4,%xmm0 + psrld $19-17,%xmm4 + pxor %xmm1,%xmm0 + pslld $15-13,%xmm1 + pxor %xmm4,%xmm0 + pxor %xmm1,%xmm0 + paddd %xmm0,%xmm6 + movdqa %xmm9,%xmm7 + + movdqa %xmm9,%xmm2 + + psrld $6,%xmm7 + movdqa %xmm9,%xmm1 + pslld $7,%xmm2 + movdqa %xmm6,48-128(%rax) + paddd %xmm12,%xmm6 + + psrld $11,%xmm1 + pxor %xmm2,%xmm7 + pslld $21-7,%xmm2 + paddd -32(%rbp),%xmm6 + pxor %xmm1,%xmm7 + + psrld $25-11,%xmm1 + movdqa %xmm9,%xmm0 + + pxor %xmm2,%xmm7 + movdqa %xmm9,%xmm4 + pslld $26-21,%xmm2 + pandn %xmm11,%xmm0 + pand %xmm10,%xmm4 + pxor %xmm1,%xmm7 + + + movdqa %xmm13,%xmm1 + pxor %xmm2,%xmm7 + movdqa %xmm13,%xmm2 + psrld $2,%xmm1 + paddd %xmm7,%xmm6 + pxor %xmm4,%xmm0 + movdqa %xmm14,%xmm4 + movdqa %xmm13,%xmm7 + pslld $10,%xmm2 + pxor %xmm13,%xmm4 + + + psrld $13,%xmm7 + pxor %xmm2,%xmm1 + paddd %xmm0,%xmm6 + pslld $19-10,%xmm2 + pand %xmm4,%xmm3 + pxor %xmm7,%xmm1 + + + psrld $22-13,%xmm7 + pxor %xmm2,%xmm1 + movdqa %xmm14,%xmm12 + pslld $30-19,%xmm2 + pxor %xmm1,%xmm7 + pxor %xmm3,%xmm12 + paddd %xmm6,%xmm8 + pxor %xmm2,%xmm7 + + paddd %xmm6,%xmm12 + paddd %xmm7,%xmm12 + movdqa 80-128(%rax),%xmm6 + paddd 208-128(%rax),%xmm5 + + movdqa %xmm6,%xmm7 + movdqa %xmm6,%xmm1 + psrld $3,%xmm7 + movdqa %xmm6,%xmm2 + + psrld $7,%xmm1 + movdqa 32-128(%rax),%xmm0 + pslld $14,%xmm2 + pxor %xmm1,%xmm7 + psrld $18-7,%xmm1 + movdqa %xmm0,%xmm3 + pxor %xmm2,%xmm7 + pslld $25-14,%xmm2 + pxor %xmm1,%xmm7 + psrld $10,%xmm0 + movdqa %xmm3,%xmm1 + + psrld $17,%xmm3 + pxor %xmm2,%xmm7 + pslld $13,%xmm1 + paddd %xmm7,%xmm5 + pxor %xmm3,%xmm0 + psrld $19-17,%xmm3 + pxor %xmm1,%xmm0 + pslld $15-13,%xmm1 + pxor %xmm3,%xmm0 + pxor %xmm1,%xmm0 + paddd %xmm0,%xmm5 + movdqa %xmm8,%xmm7 + + movdqa %xmm8,%xmm2 + + psrld $6,%xmm7 + movdqa %xmm8,%xmm1 + pslld $7,%xmm2 + movdqa %xmm5,64-128(%rax) + paddd %xmm11,%xmm5 + + psrld $11,%xmm1 + pxor %xmm2,%xmm7 + pslld $21-7,%xmm2 + paddd 0(%rbp),%xmm5 + pxor %xmm1,%xmm7 + + psrld $25-11,%xmm1 + movdqa %xmm8,%xmm0 + + pxor %xmm2,%xmm7 + movdqa %xmm8,%xmm3 + pslld $26-21,%xmm2 + pandn %xmm10,%xmm0 + pand %xmm9,%xmm3 + pxor %xmm1,%xmm7 + + + movdqa %xmm12,%xmm1 + pxor %xmm2,%xmm7 + movdqa %xmm12,%xmm2 + psrld $2,%xmm1 + paddd %xmm7,%xmm5 + pxor %xmm3,%xmm0 + movdqa %xmm13,%xmm3 + movdqa %xmm12,%xmm7 + pslld $10,%xmm2 + pxor %xmm12,%xmm3 + + + psrld $13,%xmm7 + pxor %xmm2,%xmm1 + paddd %xmm0,%xmm5 + pslld $19-10,%xmm2 + pand %xmm3,%xmm4 + pxor %xmm7,%xmm1 + + + psrld $22-13,%xmm7 + pxor %xmm2,%xmm1 + movdqa %xmm13,%xmm11 + pslld $30-19,%xmm2 + pxor %xmm1,%xmm7 + pxor %xmm4,%xmm11 + paddd %xmm5,%xmm15 + pxor %xmm2,%xmm7 + + paddd %xmm5,%xmm11 + paddd %xmm7,%xmm11 + movdqa 96-128(%rax),%xmm5 + paddd 224-128(%rax),%xmm6 + + movdqa %xmm5,%xmm7 + movdqa %xmm5,%xmm1 + psrld $3,%xmm7 + movdqa %xmm5,%xmm2 + + psrld $7,%xmm1 + movdqa 48-128(%rax),%xmm0 + pslld $14,%xmm2 + pxor %xmm1,%xmm7 + psrld $18-7,%xmm1 + movdqa %xmm0,%xmm4 + pxor %xmm2,%xmm7 + pslld $25-14,%xmm2 + pxor %xmm1,%xmm7 + psrld $10,%xmm0 + movdqa %xmm4,%xmm1 + + psrld $17,%xmm4 + pxor %xmm2,%xmm7 + pslld $13,%xmm1 + paddd %xmm7,%xmm6 + pxor %xmm4,%xmm0 + psrld $19-17,%xmm4 + pxor %xmm1,%xmm0 + pslld $15-13,%xmm1 + pxor %xmm4,%xmm0 + pxor %xmm1,%xmm0 + paddd %xmm0,%xmm6 + movdqa %xmm15,%xmm7 + + movdqa %xmm15,%xmm2 + + psrld $6,%xmm7 + movdqa %xmm15,%xmm1 + pslld $7,%xmm2 + movdqa %xmm6,80-128(%rax) + paddd %xmm10,%xmm6 + + psrld $11,%xmm1 + pxor %xmm2,%xmm7 + pslld $21-7,%xmm2 + paddd 32(%rbp),%xmm6 + pxor %xmm1,%xmm7 + + psrld $25-11,%xmm1 + movdqa %xmm15,%xmm0 + + pxor %xmm2,%xmm7 + movdqa %xmm15,%xmm4 + pslld $26-21,%xmm2 + pandn %xmm9,%xmm0 + pand %xmm8,%xmm4 + pxor %xmm1,%xmm7 + + + movdqa %xmm11,%xmm1 + pxor %xmm2,%xmm7 + movdqa %xmm11,%xmm2 + psrld $2,%xmm1 + paddd %xmm7,%xmm6 + pxor %xmm4,%xmm0 + movdqa %xmm12,%xmm4 + movdqa %xmm11,%xmm7 + pslld $10,%xmm2 + pxor %xmm11,%xmm4 + + + psrld $13,%xmm7 + pxor %xmm2,%xmm1 + paddd %xmm0,%xmm6 + pslld $19-10,%xmm2 + pand %xmm4,%xmm3 + pxor %xmm7,%xmm1 + + + psrld $22-13,%xmm7 + pxor %xmm2,%xmm1 + movdqa %xmm12,%xmm10 + pslld $30-19,%xmm2 + pxor %xmm1,%xmm7 + pxor %xmm3,%xmm10 + paddd %xmm6,%xmm14 + pxor %xmm2,%xmm7 + + paddd %xmm6,%xmm10 + paddd %xmm7,%xmm10 + movdqa 112-128(%rax),%xmm6 + paddd 240-128(%rax),%xmm5 + + movdqa %xmm6,%xmm7 + movdqa %xmm6,%xmm1 + psrld $3,%xmm7 + movdqa %xmm6,%xmm2 + + psrld $7,%xmm1 + movdqa 64-128(%rax),%xmm0 + pslld $14,%xmm2 + pxor %xmm1,%xmm7 + psrld $18-7,%xmm1 + movdqa %xmm0,%xmm3 + pxor %xmm2,%xmm7 + pslld $25-14,%xmm2 + pxor %xmm1,%xmm7 + psrld $10,%xmm0 + movdqa %xmm3,%xmm1 + + psrld $17,%xmm3 + pxor %xmm2,%xmm7 + pslld $13,%xmm1 + paddd %xmm7,%xmm5 + pxor %xmm3,%xmm0 + psrld $19-17,%xmm3 + pxor %xmm1,%xmm0 + pslld $15-13,%xmm1 + pxor %xmm3,%xmm0 + pxor %xmm1,%xmm0 + paddd %xmm0,%xmm5 + movdqa %xmm14,%xmm7 + + movdqa %xmm14,%xmm2 + + psrld $6,%xmm7 + movdqa %xmm14,%xmm1 + pslld $7,%xmm2 + movdqa %xmm5,96-128(%rax) + paddd %xmm9,%xmm5 + + psrld $11,%xmm1 + pxor %xmm2,%xmm7 + pslld $21-7,%xmm2 + paddd 64(%rbp),%xmm5 + pxor %xmm1,%xmm7 + + psrld $25-11,%xmm1 + movdqa %xmm14,%xmm0 + + pxor %xmm2,%xmm7 + movdqa %xmm14,%xmm3 + pslld $26-21,%xmm2 + pandn %xmm8,%xmm0 + pand %xmm15,%xmm3 + pxor %xmm1,%xmm7 + + + movdqa %xmm10,%xmm1 + pxor %xmm2,%xmm7 + movdqa %xmm10,%xmm2 + psrld $2,%xmm1 + paddd %xmm7,%xmm5 + pxor %xmm3,%xmm0 + movdqa %xmm11,%xmm3 + movdqa %xmm10,%xmm7 + pslld $10,%xmm2 + pxor %xmm10,%xmm3 + + + psrld $13,%xmm7 + pxor %xmm2,%xmm1 + paddd %xmm0,%xmm5 + pslld $19-10,%xmm2 + pand %xmm3,%xmm4 + pxor %xmm7,%xmm1 + + + psrld $22-13,%xmm7 + pxor %xmm2,%xmm1 + movdqa %xmm11,%xmm9 + pslld $30-19,%xmm2 + pxor %xmm1,%xmm7 + pxor %xmm4,%xmm9 + paddd %xmm5,%xmm13 + pxor %xmm2,%xmm7 + + paddd %xmm5,%xmm9 + paddd %xmm7,%xmm9 + movdqa 128-128(%rax),%xmm5 + paddd 0-128(%rax),%xmm6 + + movdqa %xmm5,%xmm7 + movdqa %xmm5,%xmm1 + psrld $3,%xmm7 + movdqa %xmm5,%xmm2 + + psrld $7,%xmm1 + movdqa 80-128(%rax),%xmm0 + pslld $14,%xmm2 + pxor %xmm1,%xmm7 + psrld $18-7,%xmm1 + movdqa %xmm0,%xmm4 + pxor %xmm2,%xmm7 + pslld $25-14,%xmm2 + pxor %xmm1,%xmm7 + psrld $10,%xmm0 + movdqa %xmm4,%xmm1 + + psrld $17,%xmm4 + pxor %xmm2,%xmm7 + pslld $13,%xmm1 + paddd %xmm7,%xmm6 + pxor %xmm4,%xmm0 + psrld $19-17,%xmm4 + pxor %xmm1,%xmm0 + pslld $15-13,%xmm1 + pxor %xmm4,%xmm0 + pxor %xmm1,%xmm0 + paddd %xmm0,%xmm6 + movdqa %xmm13,%xmm7 + + movdqa %xmm13,%xmm2 + + psrld $6,%xmm7 + movdqa %xmm13,%xmm1 + pslld $7,%xmm2 + movdqa %xmm6,112-128(%rax) + paddd %xmm8,%xmm6 + + psrld $11,%xmm1 + pxor %xmm2,%xmm7 + pslld $21-7,%xmm2 + paddd 96(%rbp),%xmm6 + pxor %xmm1,%xmm7 + + psrld $25-11,%xmm1 + movdqa %xmm13,%xmm0 + + pxor %xmm2,%xmm7 + movdqa %xmm13,%xmm4 + pslld $26-21,%xmm2 + pandn %xmm15,%xmm0 + pand %xmm14,%xmm4 + pxor %xmm1,%xmm7 + + + movdqa %xmm9,%xmm1 + pxor %xmm2,%xmm7 + movdqa %xmm9,%xmm2 + psrld $2,%xmm1 + paddd %xmm7,%xmm6 + pxor %xmm4,%xmm0 + movdqa %xmm10,%xmm4 + movdqa %xmm9,%xmm7 + pslld $10,%xmm2 + pxor %xmm9,%xmm4 + + + psrld $13,%xmm7 + pxor %xmm2,%xmm1 + paddd %xmm0,%xmm6 + pslld $19-10,%xmm2 + pand %xmm4,%xmm3 + pxor %xmm7,%xmm1 + + + psrld $22-13,%xmm7 + pxor %xmm2,%xmm1 + movdqa %xmm10,%xmm8 + pslld $30-19,%xmm2 + pxor %xmm1,%xmm7 + pxor %xmm3,%xmm8 + paddd %xmm6,%xmm12 + pxor %xmm2,%xmm7 + + paddd %xmm6,%xmm8 + paddd %xmm7,%xmm8 + leaq 256(%rbp),%rbp + movdqa 144-128(%rax),%xmm6 + paddd 16-128(%rax),%xmm5 + + movdqa %xmm6,%xmm7 + movdqa %xmm6,%xmm1 + psrld $3,%xmm7 + movdqa %xmm6,%xmm2 + + psrld $7,%xmm1 + movdqa 96-128(%rax),%xmm0 + pslld $14,%xmm2 + pxor %xmm1,%xmm7 + psrld $18-7,%xmm1 + movdqa %xmm0,%xmm3 + pxor %xmm2,%xmm7 + pslld $25-14,%xmm2 + pxor %xmm1,%xmm7 + psrld $10,%xmm0 + movdqa %xmm3,%xmm1 + + psrld $17,%xmm3 + pxor %xmm2,%xmm7 + pslld $13,%xmm1 + paddd %xmm7,%xmm5 + pxor %xmm3,%xmm0 + psrld $19-17,%xmm3 + pxor %xmm1,%xmm0 + pslld $15-13,%xmm1 + pxor %xmm3,%xmm0 + pxor %xmm1,%xmm0 + paddd %xmm0,%xmm5 + movdqa %xmm12,%xmm7 + + movdqa %xmm12,%xmm2 + + psrld $6,%xmm7 + movdqa %xmm12,%xmm1 + pslld $7,%xmm2 + movdqa %xmm5,128-128(%rax) + paddd %xmm15,%xmm5 + + psrld $11,%xmm1 + pxor %xmm2,%xmm7 + pslld $21-7,%xmm2 + paddd -128(%rbp),%xmm5 + pxor %xmm1,%xmm7 + + psrld $25-11,%xmm1 + movdqa %xmm12,%xmm0 + + pxor %xmm2,%xmm7 + movdqa %xmm12,%xmm3 + pslld $26-21,%xmm2 + pandn %xmm14,%xmm0 + pand %xmm13,%xmm3 + pxor %xmm1,%xmm7 + + + movdqa %xmm8,%xmm1 + pxor %xmm2,%xmm7 + movdqa %xmm8,%xmm2 + psrld $2,%xmm1 + paddd %xmm7,%xmm5 + pxor %xmm3,%xmm0 + movdqa %xmm9,%xmm3 + movdqa %xmm8,%xmm7 + pslld $10,%xmm2 + pxor %xmm8,%xmm3 + + + psrld $13,%xmm7 + pxor %xmm2,%xmm1 + paddd %xmm0,%xmm5 + pslld $19-10,%xmm2 + pand %xmm3,%xmm4 + pxor %xmm7,%xmm1 + + + psrld $22-13,%xmm7 + pxor %xmm2,%xmm1 + movdqa %xmm9,%xmm15 + pslld $30-19,%xmm2 + pxor %xmm1,%xmm7 + pxor %xmm4,%xmm15 + paddd %xmm5,%xmm11 + pxor %xmm2,%xmm7 + + paddd %xmm5,%xmm15 + paddd %xmm7,%xmm15 + movdqa 160-128(%rax),%xmm5 + paddd 32-128(%rax),%xmm6 + + movdqa %xmm5,%xmm7 + movdqa %xmm5,%xmm1 + psrld $3,%xmm7 + movdqa %xmm5,%xmm2 + + psrld $7,%xmm1 + movdqa 112-128(%rax),%xmm0 + pslld $14,%xmm2 + pxor %xmm1,%xmm7 + psrld $18-7,%xmm1 + movdqa %xmm0,%xmm4 + pxor %xmm2,%xmm7 + pslld $25-14,%xmm2 + pxor %xmm1,%xmm7 + psrld $10,%xmm0 + movdqa %xmm4,%xmm1 + + psrld $17,%xmm4 + pxor %xmm2,%xmm7 + pslld $13,%xmm1 + paddd %xmm7,%xmm6 + pxor %xmm4,%xmm0 + psrld $19-17,%xmm4 + pxor %xmm1,%xmm0 + pslld $15-13,%xmm1 + pxor %xmm4,%xmm0 + pxor %xmm1,%xmm0 + paddd %xmm0,%xmm6 + movdqa %xmm11,%xmm7 + + movdqa %xmm11,%xmm2 + + psrld $6,%xmm7 + movdqa %xmm11,%xmm1 + pslld $7,%xmm2 + movdqa %xmm6,144-128(%rax) + paddd %xmm14,%xmm6 + + psrld $11,%xmm1 + pxor %xmm2,%xmm7 + pslld $21-7,%xmm2 + paddd -96(%rbp),%xmm6 + pxor %xmm1,%xmm7 + + psrld $25-11,%xmm1 + movdqa %xmm11,%xmm0 + + pxor %xmm2,%xmm7 + movdqa %xmm11,%xmm4 + pslld $26-21,%xmm2 + pandn %xmm13,%xmm0 + pand %xmm12,%xmm4 + pxor %xmm1,%xmm7 + + + movdqa %xmm15,%xmm1 + pxor %xmm2,%xmm7 + movdqa %xmm15,%xmm2 + psrld $2,%xmm1 + paddd %xmm7,%xmm6 + pxor %xmm4,%xmm0 + movdqa %xmm8,%xmm4 + movdqa %xmm15,%xmm7 + pslld $10,%xmm2 + pxor %xmm15,%xmm4 + + + psrld $13,%xmm7 + pxor %xmm2,%xmm1 + paddd %xmm0,%xmm6 + pslld $19-10,%xmm2 + pand %xmm4,%xmm3 + pxor %xmm7,%xmm1 + + + psrld $22-13,%xmm7 + pxor %xmm2,%xmm1 + movdqa %xmm8,%xmm14 + pslld $30-19,%xmm2 + pxor %xmm1,%xmm7 + pxor %xmm3,%xmm14 + paddd %xmm6,%xmm10 + pxor %xmm2,%xmm7 + + paddd %xmm6,%xmm14 + paddd %xmm7,%xmm14 + movdqa 176-128(%rax),%xmm6 + paddd 48-128(%rax),%xmm5 + + movdqa %xmm6,%xmm7 + movdqa %xmm6,%xmm1 + psrld $3,%xmm7 + movdqa %xmm6,%xmm2 + + psrld $7,%xmm1 + movdqa 128-128(%rax),%xmm0 + pslld $14,%xmm2 + pxor %xmm1,%xmm7 + psrld $18-7,%xmm1 + movdqa %xmm0,%xmm3 + pxor %xmm2,%xmm7 + pslld $25-14,%xmm2 + pxor %xmm1,%xmm7 + psrld $10,%xmm0 + movdqa %xmm3,%xmm1 + + psrld $17,%xmm3 + pxor %xmm2,%xmm7 + pslld $13,%xmm1 + paddd %xmm7,%xmm5 + pxor %xmm3,%xmm0 + psrld $19-17,%xmm3 + pxor %xmm1,%xmm0 + pslld $15-13,%xmm1 + pxor %xmm3,%xmm0 + pxor %xmm1,%xmm0 + paddd %xmm0,%xmm5 + movdqa %xmm10,%xmm7 + + movdqa %xmm10,%xmm2 + + psrld $6,%xmm7 + movdqa %xmm10,%xmm1 + pslld $7,%xmm2 + movdqa %xmm5,160-128(%rax) + paddd %xmm13,%xmm5 + + psrld $11,%xmm1 + pxor %xmm2,%xmm7 + pslld $21-7,%xmm2 + paddd -64(%rbp),%xmm5 + pxor %xmm1,%xmm7 + + psrld $25-11,%xmm1 + movdqa %xmm10,%xmm0 + + pxor %xmm2,%xmm7 + movdqa %xmm10,%xmm3 + pslld $26-21,%xmm2 + pandn %xmm12,%xmm0 + pand %xmm11,%xmm3 + pxor %xmm1,%xmm7 + + + movdqa %xmm14,%xmm1 + pxor %xmm2,%xmm7 + movdqa %xmm14,%xmm2 + psrld $2,%xmm1 + paddd %xmm7,%xmm5 + pxor %xmm3,%xmm0 + movdqa %xmm15,%xmm3 + movdqa %xmm14,%xmm7 + pslld $10,%xmm2 + pxor %xmm14,%xmm3 + + + psrld $13,%xmm7 + pxor %xmm2,%xmm1 + paddd %xmm0,%xmm5 + pslld $19-10,%xmm2 + pand %xmm3,%xmm4 + pxor %xmm7,%xmm1 + + + psrld $22-13,%xmm7 + pxor %xmm2,%xmm1 + movdqa %xmm15,%xmm13 + pslld $30-19,%xmm2 + pxor %xmm1,%xmm7 + pxor %xmm4,%xmm13 + paddd %xmm5,%xmm9 + pxor %xmm2,%xmm7 + + paddd %xmm5,%xmm13 + paddd %xmm7,%xmm13 + movdqa 192-128(%rax),%xmm5 + paddd 64-128(%rax),%xmm6 + + movdqa %xmm5,%xmm7 + movdqa %xmm5,%xmm1 + psrld $3,%xmm7 + movdqa %xmm5,%xmm2 + + psrld $7,%xmm1 + movdqa 144-128(%rax),%xmm0 + pslld $14,%xmm2 + pxor %xmm1,%xmm7 + psrld $18-7,%xmm1 + movdqa %xmm0,%xmm4 + pxor %xmm2,%xmm7 + pslld $25-14,%xmm2 + pxor %xmm1,%xmm7 + psrld $10,%xmm0 + movdqa %xmm4,%xmm1 + + psrld $17,%xmm4 + pxor %xmm2,%xmm7 + pslld $13,%xmm1 + paddd %xmm7,%xmm6 + pxor %xmm4,%xmm0 + psrld $19-17,%xmm4 + pxor %xmm1,%xmm0 + pslld $15-13,%xmm1 + pxor %xmm4,%xmm0 + pxor %xmm1,%xmm0 + paddd %xmm0,%xmm6 + movdqa %xmm9,%xmm7 + + movdqa %xmm9,%xmm2 + + psrld $6,%xmm7 + movdqa %xmm9,%xmm1 + pslld $7,%xmm2 + movdqa %xmm6,176-128(%rax) + paddd %xmm12,%xmm6 + + psrld $11,%xmm1 + pxor %xmm2,%xmm7 + pslld $21-7,%xmm2 + paddd -32(%rbp),%xmm6 + pxor %xmm1,%xmm7 + + psrld $25-11,%xmm1 + movdqa %xmm9,%xmm0 + + pxor %xmm2,%xmm7 + movdqa %xmm9,%xmm4 + pslld $26-21,%xmm2 + pandn %xmm11,%xmm0 + pand %xmm10,%xmm4 + pxor %xmm1,%xmm7 + + + movdqa %xmm13,%xmm1 + pxor %xmm2,%xmm7 + movdqa %xmm13,%xmm2 + psrld $2,%xmm1 + paddd %xmm7,%xmm6 + pxor %xmm4,%xmm0 + movdqa %xmm14,%xmm4 + movdqa %xmm13,%xmm7 + pslld $10,%xmm2 + pxor %xmm13,%xmm4 + + + psrld $13,%xmm7 + pxor %xmm2,%xmm1 + paddd %xmm0,%xmm6 + pslld $19-10,%xmm2 + pand %xmm4,%xmm3 + pxor %xmm7,%xmm1 + + + psrld $22-13,%xmm7 + pxor %xmm2,%xmm1 + movdqa %xmm14,%xmm12 + pslld $30-19,%xmm2 + pxor %xmm1,%xmm7 + pxor %xmm3,%xmm12 + paddd %xmm6,%xmm8 + pxor %xmm2,%xmm7 + + paddd %xmm6,%xmm12 + paddd %xmm7,%xmm12 + movdqa 208-128(%rax),%xmm6 + paddd 80-128(%rax),%xmm5 + + movdqa %xmm6,%xmm7 + movdqa %xmm6,%xmm1 + psrld $3,%xmm7 + movdqa %xmm6,%xmm2 + + psrld $7,%xmm1 + movdqa 160-128(%rax),%xmm0 + pslld $14,%xmm2 + pxor %xmm1,%xmm7 + psrld $18-7,%xmm1 + movdqa %xmm0,%xmm3 + pxor %xmm2,%xmm7 + pslld $25-14,%xmm2 + pxor %xmm1,%xmm7 + psrld $10,%xmm0 + movdqa %xmm3,%xmm1 + + psrld $17,%xmm3 + pxor %xmm2,%xmm7 + pslld $13,%xmm1 + paddd %xmm7,%xmm5 + pxor %xmm3,%xmm0 + psrld $19-17,%xmm3 + pxor %xmm1,%xmm0 + pslld $15-13,%xmm1 + pxor %xmm3,%xmm0 + pxor %xmm1,%xmm0 + paddd %xmm0,%xmm5 + movdqa %xmm8,%xmm7 + + movdqa %xmm8,%xmm2 + + psrld $6,%xmm7 + movdqa %xmm8,%xmm1 + pslld $7,%xmm2 + movdqa %xmm5,192-128(%rax) + paddd %xmm11,%xmm5 + + psrld $11,%xmm1 + pxor %xmm2,%xmm7 + pslld $21-7,%xmm2 + paddd 0(%rbp),%xmm5 + pxor %xmm1,%xmm7 + + psrld $25-11,%xmm1 + movdqa %xmm8,%xmm0 + + pxor %xmm2,%xmm7 + movdqa %xmm8,%xmm3 + pslld $26-21,%xmm2 + pandn %xmm10,%xmm0 + pand %xmm9,%xmm3 + pxor %xmm1,%xmm7 + + + movdqa %xmm12,%xmm1 + pxor %xmm2,%xmm7 + movdqa %xmm12,%xmm2 + psrld $2,%xmm1 + paddd %xmm7,%xmm5 + pxor %xmm3,%xmm0 + movdqa %xmm13,%xmm3 + movdqa %xmm12,%xmm7 + pslld $10,%xmm2 + pxor %xmm12,%xmm3 + + + psrld $13,%xmm7 + pxor %xmm2,%xmm1 + paddd %xmm0,%xmm5 + pslld $19-10,%xmm2 + pand %xmm3,%xmm4 + pxor %xmm7,%xmm1 + + + psrld $22-13,%xmm7 + pxor %xmm2,%xmm1 + movdqa %xmm13,%xmm11 + pslld $30-19,%xmm2 + pxor %xmm1,%xmm7 + pxor %xmm4,%xmm11 + paddd %xmm5,%xmm15 + pxor %xmm2,%xmm7 + + paddd %xmm5,%xmm11 + paddd %xmm7,%xmm11 + movdqa 224-128(%rax),%xmm5 + paddd 96-128(%rax),%xmm6 + + movdqa %xmm5,%xmm7 + movdqa %xmm5,%xmm1 + psrld $3,%xmm7 + movdqa %xmm5,%xmm2 + + psrld $7,%xmm1 + movdqa 176-128(%rax),%xmm0 + pslld $14,%xmm2 + pxor %xmm1,%xmm7 + psrld $18-7,%xmm1 + movdqa %xmm0,%xmm4 + pxor %xmm2,%xmm7 + pslld $25-14,%xmm2 + pxor %xmm1,%xmm7 + psrld $10,%xmm0 + movdqa %xmm4,%xmm1 + + psrld $17,%xmm4 + pxor %xmm2,%xmm7 + pslld $13,%xmm1 + paddd %xmm7,%xmm6 + pxor %xmm4,%xmm0 + psrld $19-17,%xmm4 + pxor %xmm1,%xmm0 + pslld $15-13,%xmm1 + pxor %xmm4,%xmm0 + pxor %xmm1,%xmm0 + paddd %xmm0,%xmm6 + movdqa %xmm15,%xmm7 + + movdqa %xmm15,%xmm2 + + psrld $6,%xmm7 + movdqa %xmm15,%xmm1 + pslld $7,%xmm2 + movdqa %xmm6,208-128(%rax) + paddd %xmm10,%xmm6 + + psrld $11,%xmm1 + pxor %xmm2,%xmm7 + pslld $21-7,%xmm2 + paddd 32(%rbp),%xmm6 + pxor %xmm1,%xmm7 + + psrld $25-11,%xmm1 + movdqa %xmm15,%xmm0 + + pxor %xmm2,%xmm7 + movdqa %xmm15,%xmm4 + pslld $26-21,%xmm2 + pandn %xmm9,%xmm0 + pand %xmm8,%xmm4 + pxor %xmm1,%xmm7 + + + movdqa %xmm11,%xmm1 + pxor %xmm2,%xmm7 + movdqa %xmm11,%xmm2 + psrld $2,%xmm1 + paddd %xmm7,%xmm6 + pxor %xmm4,%xmm0 + movdqa %xmm12,%xmm4 + movdqa %xmm11,%xmm7 + pslld $10,%xmm2 + pxor %xmm11,%xmm4 + + + psrld $13,%xmm7 + pxor %xmm2,%xmm1 + paddd %xmm0,%xmm6 + pslld $19-10,%xmm2 + pand %xmm4,%xmm3 + pxor %xmm7,%xmm1 + + + psrld $22-13,%xmm7 + pxor %xmm2,%xmm1 + movdqa %xmm12,%xmm10 + pslld $30-19,%xmm2 + pxor %xmm1,%xmm7 + pxor %xmm3,%xmm10 + paddd %xmm6,%xmm14 + pxor %xmm2,%xmm7 + + paddd %xmm6,%xmm10 + paddd %xmm7,%xmm10 + movdqa 240-128(%rax),%xmm6 + paddd 112-128(%rax),%xmm5 + + movdqa %xmm6,%xmm7 + movdqa %xmm6,%xmm1 + psrld $3,%xmm7 + movdqa %xmm6,%xmm2 + + psrld $7,%xmm1 + movdqa 192-128(%rax),%xmm0 + pslld $14,%xmm2 + pxor %xmm1,%xmm7 + psrld $18-7,%xmm1 + movdqa %xmm0,%xmm3 + pxor %xmm2,%xmm7 + pslld $25-14,%xmm2 + pxor %xmm1,%xmm7 + psrld $10,%xmm0 + movdqa %xmm3,%xmm1 + + psrld $17,%xmm3 + pxor %xmm2,%xmm7 + pslld $13,%xmm1 + paddd %xmm7,%xmm5 + pxor %xmm3,%xmm0 + psrld $19-17,%xmm3 + pxor %xmm1,%xmm0 + pslld $15-13,%xmm1 + pxor %xmm3,%xmm0 + pxor %xmm1,%xmm0 + paddd %xmm0,%xmm5 + movdqa %xmm14,%xmm7 + + movdqa %xmm14,%xmm2 + + psrld $6,%xmm7 + movdqa %xmm14,%xmm1 + pslld $7,%xmm2 + movdqa %xmm5,224-128(%rax) + paddd %xmm9,%xmm5 + + psrld $11,%xmm1 + pxor %xmm2,%xmm7 + pslld $21-7,%xmm2 + paddd 64(%rbp),%xmm5 + pxor %xmm1,%xmm7 + + psrld $25-11,%xmm1 + movdqa %xmm14,%xmm0 + + pxor %xmm2,%xmm7 + movdqa %xmm14,%xmm3 + pslld $26-21,%xmm2 + pandn %xmm8,%xmm0 + pand %xmm15,%xmm3 + pxor %xmm1,%xmm7 + + + movdqa %xmm10,%xmm1 + pxor %xmm2,%xmm7 + movdqa %xmm10,%xmm2 + psrld $2,%xmm1 + paddd %xmm7,%xmm5 + pxor %xmm3,%xmm0 + movdqa %xmm11,%xmm3 + movdqa %xmm10,%xmm7 + pslld $10,%xmm2 + pxor %xmm10,%xmm3 + + + psrld $13,%xmm7 + pxor %xmm2,%xmm1 + paddd %xmm0,%xmm5 + pslld $19-10,%xmm2 + pand %xmm3,%xmm4 + pxor %xmm7,%xmm1 + + + psrld $22-13,%xmm7 + pxor %xmm2,%xmm1 + movdqa %xmm11,%xmm9 + pslld $30-19,%xmm2 + pxor %xmm1,%xmm7 + pxor %xmm4,%xmm9 + paddd %xmm5,%xmm13 + pxor %xmm2,%xmm7 + + paddd %xmm5,%xmm9 + paddd %xmm7,%xmm9 + movdqa 0-128(%rax),%xmm5 + paddd 128-128(%rax),%xmm6 + + movdqa %xmm5,%xmm7 + movdqa %xmm5,%xmm1 + psrld $3,%xmm7 + movdqa %xmm5,%xmm2 + + psrld $7,%xmm1 + movdqa 208-128(%rax),%xmm0 + pslld $14,%xmm2 + pxor %xmm1,%xmm7 + psrld $18-7,%xmm1 + movdqa %xmm0,%xmm4 + pxor %xmm2,%xmm7 + pslld $25-14,%xmm2 + pxor %xmm1,%xmm7 + psrld $10,%xmm0 + movdqa %xmm4,%xmm1 + + psrld $17,%xmm4 + pxor %xmm2,%xmm7 + pslld $13,%xmm1 + paddd %xmm7,%xmm6 + pxor %xmm4,%xmm0 + psrld $19-17,%xmm4 + pxor %xmm1,%xmm0 + pslld $15-13,%xmm1 + pxor %xmm4,%xmm0 + pxor %xmm1,%xmm0 + paddd %xmm0,%xmm6 + movdqa %xmm13,%xmm7 + + movdqa %xmm13,%xmm2 + + psrld $6,%xmm7 + movdqa %xmm13,%xmm1 + pslld $7,%xmm2 + movdqa %xmm6,240-128(%rax) + paddd %xmm8,%xmm6 + + psrld $11,%xmm1 + pxor %xmm2,%xmm7 + pslld $21-7,%xmm2 + paddd 96(%rbp),%xmm6 + pxor %xmm1,%xmm7 + + psrld $25-11,%xmm1 + movdqa %xmm13,%xmm0 + + pxor %xmm2,%xmm7 + movdqa %xmm13,%xmm4 + pslld $26-21,%xmm2 + pandn %xmm15,%xmm0 + pand %xmm14,%xmm4 + pxor %xmm1,%xmm7 + + + movdqa %xmm9,%xmm1 + pxor %xmm2,%xmm7 + movdqa %xmm9,%xmm2 + psrld $2,%xmm1 + paddd %xmm7,%xmm6 + pxor %xmm4,%xmm0 + movdqa %xmm10,%xmm4 + movdqa %xmm9,%xmm7 + pslld $10,%xmm2 + pxor %xmm9,%xmm4 + + + psrld $13,%xmm7 + pxor %xmm2,%xmm1 + paddd %xmm0,%xmm6 + pslld $19-10,%xmm2 + pand %xmm4,%xmm3 + pxor %xmm7,%xmm1 + + + psrld $22-13,%xmm7 + pxor %xmm2,%xmm1 + movdqa %xmm10,%xmm8 + pslld $30-19,%xmm2 + pxor %xmm1,%xmm7 + pxor %xmm3,%xmm8 + paddd %xmm6,%xmm12 + pxor %xmm2,%xmm7 + + paddd %xmm6,%xmm8 + paddd %xmm7,%xmm8 + leaq 256(%rbp),%rbp + decl %ecx + jnz .Loop_16_xx + + movl $1,%ecx + leaq K256+128(%rip),%rbp + + movdqa (%rbx),%xmm7 + cmpl 0(%rbx),%ecx + pxor %xmm0,%xmm0 + cmovgeq %rbp,%r8 + cmpl 4(%rbx),%ecx + movdqa %xmm7,%xmm6 + cmovgeq %rbp,%r9 + cmpl 8(%rbx),%ecx + pcmpgtd %xmm0,%xmm6 + cmovgeq %rbp,%r10 + cmpl 12(%rbx),%ecx + paddd %xmm6,%xmm7 + cmovgeq %rbp,%r11 + + movdqu 0-128(%rdi),%xmm0 + pand %xmm6,%xmm8 + movdqu 32-128(%rdi),%xmm1 + pand %xmm6,%xmm9 + movdqu 64-128(%rdi),%xmm2 + pand %xmm6,%xmm10 + movdqu 96-128(%rdi),%xmm5 + pand %xmm6,%xmm11 + paddd %xmm0,%xmm8 + movdqu 128-128(%rdi),%xmm0 + pand %xmm6,%xmm12 + paddd %xmm1,%xmm9 + movdqu 160-128(%rdi),%xmm1 + pand %xmm6,%xmm13 + paddd %xmm2,%xmm10 + movdqu 192-128(%rdi),%xmm2 + pand %xmm6,%xmm14 + paddd %xmm5,%xmm11 + movdqu 224-128(%rdi),%xmm5 + pand %xmm6,%xmm15 + paddd %xmm0,%xmm12 + paddd %xmm1,%xmm13 + movdqu %xmm8,0-128(%rdi) + paddd %xmm2,%xmm14 + movdqu %xmm9,32-128(%rdi) + paddd %xmm5,%xmm15 + movdqu %xmm10,64-128(%rdi) + movdqu %xmm11,96-128(%rdi) + movdqu %xmm12,128-128(%rdi) + movdqu %xmm13,160-128(%rdi) + movdqu %xmm14,192-128(%rdi) + movdqu %xmm15,224-128(%rdi) + + movdqa %xmm7,(%rbx) + movdqa .Lpbswap(%rip),%xmm6 + decl %edx + jnz .Loop + + movl 280(%rsp),%edx + leaq 16(%rdi),%rdi + leaq 64(%rsi),%rsi + decl %edx + jnz .Loop_grande + +.Ldone: + movq 272(%rsp),%rax + movq -16(%rax),%rbp + movq -8(%rax),%rbx + leaq (%rax),%rsp +.Lepilogue: + .byte 0xf3,0xc3 +.size sha256_multi_block,.-sha256_multi_block +.type sha256_multi_block_shaext,@function +.align 32 +sha256_multi_block_shaext: +_shaext_shortcut: + movq %rsp,%rax + pushq %rbx + pushq %rbp + subq $288,%rsp + shll $1,%edx + andq $-256,%rsp + leaq 128(%rdi),%rdi + movq %rax,272(%rsp) +.Lbody_shaext: + leaq 256(%rsp),%rbx + leaq K256_shaext+128(%rip),%rbp + +.Loop_grande_shaext: + movl %edx,280(%rsp) + xorl %edx,%edx + movq 0(%rsi),%r8 + movl 8(%rsi),%ecx + cmpl %edx,%ecx + cmovgl %ecx,%edx + testl %ecx,%ecx + movl %ecx,0(%rbx) + cmovleq %rsp,%r8 + movq 16(%rsi),%r9 + movl 24(%rsi),%ecx + cmpl %edx,%ecx + cmovgl %ecx,%edx + testl %ecx,%ecx + movl %ecx,4(%rbx) + cmovleq %rsp,%r9 + testl %edx,%edx + jz .Ldone_shaext + + movq 0-128(%rdi),%xmm12 + movq 32-128(%rdi),%xmm4 + movq 64-128(%rdi),%xmm13 + movq 96-128(%rdi),%xmm5 + movq 128-128(%rdi),%xmm8 + movq 160-128(%rdi),%xmm9 + movq 192-128(%rdi),%xmm10 + movq 224-128(%rdi),%xmm11 + + punpckldq %xmm4,%xmm12 + punpckldq %xmm5,%xmm13 + punpckldq %xmm9,%xmm8 + punpckldq %xmm11,%xmm10 + movdqa K256_shaext-16(%rip),%xmm3 + + movdqa %xmm12,%xmm14 + movdqa %xmm13,%xmm15 + punpcklqdq %xmm8,%xmm12 + punpcklqdq %xmm10,%xmm13 + punpckhqdq %xmm8,%xmm14 + punpckhqdq %xmm10,%xmm15 + + pshufd $27,%xmm12,%xmm12 + pshufd $27,%xmm13,%xmm13 + pshufd $27,%xmm14,%xmm14 + pshufd $27,%xmm15,%xmm15 + jmp .Loop_shaext + +.align 32 +.Loop_shaext: + movdqu 0(%r8),%xmm4 + movdqu 0(%r9),%xmm8 + movdqu 16(%r8),%xmm5 + movdqu 16(%r9),%xmm9 + movdqu 32(%r8),%xmm6 +.byte 102,15,56,0,227 + movdqu 32(%r9),%xmm10 +.byte 102,68,15,56,0,195 + movdqu 48(%r8),%xmm7 + leaq 64(%r8),%r8 + movdqu 48(%r9),%xmm11 + leaq 64(%r9),%r9 + + movdqa 0-128(%rbp),%xmm0 +.byte 102,15,56,0,235 + paddd %xmm4,%xmm0 + pxor %xmm12,%xmm4 + movdqa %xmm0,%xmm1 + movdqa 0-128(%rbp),%xmm2 +.byte 102,68,15,56,0,203 + paddd %xmm8,%xmm2 + movdqa %xmm13,80(%rsp) +.byte 69,15,56,203,236 + pxor %xmm14,%xmm8 + movdqa %xmm2,%xmm0 + movdqa %xmm15,112(%rsp) +.byte 69,15,56,203,254 + pshufd $14,%xmm1,%xmm0 + pxor %xmm12,%xmm4 + movdqa %xmm12,64(%rsp) +.byte 69,15,56,203,229 + pshufd $14,%xmm2,%xmm0 + pxor %xmm14,%xmm8 + movdqa %xmm14,96(%rsp) + movdqa 16-128(%rbp),%xmm1 + paddd %xmm5,%xmm1 +.byte 102,15,56,0,243 +.byte 69,15,56,203,247 + + movdqa %xmm1,%xmm0 + movdqa 16-128(%rbp),%xmm2 + paddd %xmm9,%xmm2 +.byte 69,15,56,203,236 + movdqa %xmm2,%xmm0 + prefetcht0 127(%r8) +.byte 102,15,56,0,251 +.byte 102,68,15,56,0,211 + prefetcht0 127(%r9) +.byte 69,15,56,203,254 + pshufd $14,%xmm1,%xmm0 +.byte 102,68,15,56,0,219 +.byte 15,56,204,229 +.byte 69,15,56,203,229 + pshufd $14,%xmm2,%xmm0 + movdqa 32-128(%rbp),%xmm1 + paddd %xmm6,%xmm1 +.byte 69,15,56,203,247 + + movdqa %xmm1,%xmm0 + movdqa 32-128(%rbp),%xmm2 + paddd %xmm10,%xmm2 +.byte 69,15,56,203,236 +.byte 69,15,56,204,193 + movdqa %xmm2,%xmm0 + movdqa %xmm7,%xmm3 +.byte 69,15,56,203,254 + pshufd $14,%xmm1,%xmm0 +.byte 102,15,58,15,222,4 + paddd %xmm3,%xmm4 + movdqa %xmm11,%xmm3 +.byte 102,65,15,58,15,218,4 +.byte 15,56,204,238 +.byte 69,15,56,203,229 + pshufd $14,%xmm2,%xmm0 + movdqa 48-128(%rbp),%xmm1 + paddd %xmm7,%xmm1 +.byte 69,15,56,203,247 +.byte 69,15,56,204,202 + + movdqa %xmm1,%xmm0 + movdqa 48-128(%rbp),%xmm2 + paddd %xmm3,%xmm8 + paddd %xmm11,%xmm2 +.byte 15,56,205,231 +.byte 69,15,56,203,236 + movdqa %xmm2,%xmm0 + movdqa %xmm4,%xmm3 +.byte 102,15,58,15,223,4 +.byte 69,15,56,203,254 +.byte 69,15,56,205,195 + pshufd $14,%xmm1,%xmm0 + paddd %xmm3,%xmm5 + movdqa %xmm8,%xmm3 +.byte 102,65,15,58,15,219,4 +.byte 15,56,204,247 +.byte 69,15,56,203,229 + pshufd $14,%xmm2,%xmm0 + movdqa 64-128(%rbp),%xmm1 + paddd %xmm4,%xmm1 +.byte 69,15,56,203,247 +.byte 69,15,56,204,211 + movdqa %xmm1,%xmm0 + movdqa 64-128(%rbp),%xmm2 + paddd %xmm3,%xmm9 + paddd %xmm8,%xmm2 +.byte 15,56,205,236 +.byte 69,15,56,203,236 + movdqa %xmm2,%xmm0 + movdqa %xmm5,%xmm3 +.byte 102,15,58,15,220,4 +.byte 69,15,56,203,254 +.byte 69,15,56,205,200 + pshufd $14,%xmm1,%xmm0 + paddd %xmm3,%xmm6 + movdqa %xmm9,%xmm3 +.byte 102,65,15,58,15,216,4 +.byte 15,56,204,252 +.byte 69,15,56,203,229 + pshufd $14,%xmm2,%xmm0 + movdqa 80-128(%rbp),%xmm1 + paddd %xmm5,%xmm1 +.byte 69,15,56,203,247 +.byte 69,15,56,204,216 + movdqa %xmm1,%xmm0 + movdqa 80-128(%rbp),%xmm2 + paddd %xmm3,%xmm10 + paddd %xmm9,%xmm2 +.byte 15,56,205,245 +.byte 69,15,56,203,236 + movdqa %xmm2,%xmm0 + movdqa %xmm6,%xmm3 +.byte 102,15,58,15,221,4 +.byte 69,15,56,203,254 +.byte 69,15,56,205,209 + pshufd $14,%xmm1,%xmm0 + paddd %xmm3,%xmm7 + movdqa %xmm10,%xmm3 +.byte 102,65,15,58,15,217,4 +.byte 15,56,204,229 +.byte 69,15,56,203,229 + pshufd $14,%xmm2,%xmm0 + movdqa 96-128(%rbp),%xmm1 + paddd %xmm6,%xmm1 +.byte 69,15,56,203,247 +.byte 69,15,56,204,193 + movdqa %xmm1,%xmm0 + movdqa 96-128(%rbp),%xmm2 + paddd %xmm3,%xmm11 + paddd %xmm10,%xmm2 +.byte 15,56,205,254 +.byte 69,15,56,203,236 + movdqa %xmm2,%xmm0 + movdqa %xmm7,%xmm3 +.byte 102,15,58,15,222,4 +.byte 69,15,56,203,254 +.byte 69,15,56,205,218 + pshufd $14,%xmm1,%xmm0 + paddd %xmm3,%xmm4 + movdqa %xmm11,%xmm3 +.byte 102,65,15,58,15,218,4 +.byte 15,56,204,238 +.byte 69,15,56,203,229 + pshufd $14,%xmm2,%xmm0 + movdqa 112-128(%rbp),%xmm1 + paddd %xmm7,%xmm1 +.byte 69,15,56,203,247 +.byte 69,15,56,204,202 + movdqa %xmm1,%xmm0 + movdqa 112-128(%rbp),%xmm2 + paddd %xmm3,%xmm8 + paddd %xmm11,%xmm2 +.byte 15,56,205,231 +.byte 69,15,56,203,236 + movdqa %xmm2,%xmm0 + movdqa %xmm4,%xmm3 +.byte 102,15,58,15,223,4 +.byte 69,15,56,203,254 +.byte 69,15,56,205,195 + pshufd $14,%xmm1,%xmm0 + paddd %xmm3,%xmm5 + movdqa %xmm8,%xmm3 +.byte 102,65,15,58,15,219,4 +.byte 15,56,204,247 +.byte 69,15,56,203,229 + pshufd $14,%xmm2,%xmm0 + movdqa 128-128(%rbp),%xmm1 + paddd %xmm4,%xmm1 +.byte 69,15,56,203,247 +.byte 69,15,56,204,211 + movdqa %xmm1,%xmm0 + movdqa 128-128(%rbp),%xmm2 + paddd %xmm3,%xmm9 + paddd %xmm8,%xmm2 +.byte 15,56,205,236 +.byte 69,15,56,203,236 + movdqa %xmm2,%xmm0 + movdqa %xmm5,%xmm3 +.byte 102,15,58,15,220,4 +.byte 69,15,56,203,254 +.byte 69,15,56,205,200 + pshufd $14,%xmm1,%xmm0 + paddd %xmm3,%xmm6 + movdqa %xmm9,%xmm3 +.byte 102,65,15,58,15,216,4 +.byte 15,56,204,252 +.byte 69,15,56,203,229 + pshufd $14,%xmm2,%xmm0 + movdqa 144-128(%rbp),%xmm1 + paddd %xmm5,%xmm1 +.byte 69,15,56,203,247 +.byte 69,15,56,204,216 + movdqa %xmm1,%xmm0 + movdqa 144-128(%rbp),%xmm2 + paddd %xmm3,%xmm10 + paddd %xmm9,%xmm2 +.byte 15,56,205,245 +.byte 69,15,56,203,236 + movdqa %xmm2,%xmm0 + movdqa %xmm6,%xmm3 +.byte 102,15,58,15,221,4 +.byte 69,15,56,203,254 +.byte 69,15,56,205,209 + pshufd $14,%xmm1,%xmm0 + paddd %xmm3,%xmm7 + movdqa %xmm10,%xmm3 +.byte 102,65,15,58,15,217,4 +.byte 15,56,204,229 +.byte 69,15,56,203,229 + pshufd $14,%xmm2,%xmm0 + movdqa 160-128(%rbp),%xmm1 + paddd %xmm6,%xmm1 +.byte 69,15,56,203,247 +.byte 69,15,56,204,193 + movdqa %xmm1,%xmm0 + movdqa 160-128(%rbp),%xmm2 + paddd %xmm3,%xmm11 + paddd %xmm10,%xmm2 +.byte 15,56,205,254 +.byte 69,15,56,203,236 + movdqa %xmm2,%xmm0 + movdqa %xmm7,%xmm3 +.byte 102,15,58,15,222,4 +.byte 69,15,56,203,254 +.byte 69,15,56,205,218 + pshufd $14,%xmm1,%xmm0 + paddd %xmm3,%xmm4 + movdqa %xmm11,%xmm3 +.byte 102,65,15,58,15,218,4 +.byte 15,56,204,238 +.byte 69,15,56,203,229 + pshufd $14,%xmm2,%xmm0 + movdqa 176-128(%rbp),%xmm1 + paddd %xmm7,%xmm1 +.byte 69,15,56,203,247 +.byte 69,15,56,204,202 + movdqa %xmm1,%xmm0 + movdqa 176-128(%rbp),%xmm2 + paddd %xmm3,%xmm8 + paddd %xmm11,%xmm2 +.byte 15,56,205,231 +.byte 69,15,56,203,236 + movdqa %xmm2,%xmm0 + movdqa %xmm4,%xmm3 +.byte 102,15,58,15,223,4 +.byte 69,15,56,203,254 +.byte 69,15,56,205,195 + pshufd $14,%xmm1,%xmm0 + paddd %xmm3,%xmm5 + movdqa %xmm8,%xmm3 +.byte 102,65,15,58,15,219,4 +.byte 15,56,204,247 +.byte 69,15,56,203,229 + pshufd $14,%xmm2,%xmm0 + movdqa 192-128(%rbp),%xmm1 + paddd %xmm4,%xmm1 +.byte 69,15,56,203,247 +.byte 69,15,56,204,211 + movdqa %xmm1,%xmm0 + movdqa 192-128(%rbp),%xmm2 + paddd %xmm3,%xmm9 + paddd %xmm8,%xmm2 +.byte 15,56,205,236 +.byte 69,15,56,203,236 + movdqa %xmm2,%xmm0 + movdqa %xmm5,%xmm3 +.byte 102,15,58,15,220,4 +.byte 69,15,56,203,254 +.byte 69,15,56,205,200 + pshufd $14,%xmm1,%xmm0 + paddd %xmm3,%xmm6 + movdqa %xmm9,%xmm3 +.byte 102,65,15,58,15,216,4 +.byte 15,56,204,252 +.byte 69,15,56,203,229 + pshufd $14,%xmm2,%xmm0 + movdqa 208-128(%rbp),%xmm1 + paddd %xmm5,%xmm1 +.byte 69,15,56,203,247 +.byte 69,15,56,204,216 + movdqa %xmm1,%xmm0 + movdqa 208-128(%rbp),%xmm2 + paddd %xmm3,%xmm10 + paddd %xmm9,%xmm2 +.byte 15,56,205,245 +.byte 69,15,56,203,236 + movdqa %xmm2,%xmm0 + movdqa %xmm6,%xmm3 +.byte 102,15,58,15,221,4 +.byte 69,15,56,203,254 +.byte 69,15,56,205,209 + pshufd $14,%xmm1,%xmm0 + paddd %xmm3,%xmm7 + movdqa %xmm10,%xmm3 +.byte 102,65,15,58,15,217,4 + nop +.byte 69,15,56,203,229 + pshufd $14,%xmm2,%xmm0 + movdqa 224-128(%rbp),%xmm1 + paddd %xmm6,%xmm1 +.byte 69,15,56,203,247 + + movdqa %xmm1,%xmm0 + movdqa 224-128(%rbp),%xmm2 + paddd %xmm3,%xmm11 + paddd %xmm10,%xmm2 +.byte 15,56,205,254 + nop +.byte 69,15,56,203,236 + movdqa %xmm2,%xmm0 + movl $1,%ecx + pxor %xmm6,%xmm6 +.byte 69,15,56,203,254 +.byte 69,15,56,205,218 + pshufd $14,%xmm1,%xmm0 + movdqa 240-128(%rbp),%xmm1 + paddd %xmm7,%xmm1 + movq (%rbx),%xmm7 + nop +.byte 69,15,56,203,229 + pshufd $14,%xmm2,%xmm0 + movdqa 240-128(%rbp),%xmm2 + paddd %xmm11,%xmm2 +.byte 69,15,56,203,247 + + movdqa %xmm1,%xmm0 + cmpl 0(%rbx),%ecx + cmovgeq %rsp,%r8 + cmpl 4(%rbx),%ecx + cmovgeq %rsp,%r9 + pshufd $0,%xmm7,%xmm9 +.byte 69,15,56,203,236 + movdqa %xmm2,%xmm0 + pshufd $85,%xmm7,%xmm10 + movdqa %xmm7,%xmm11 +.byte 69,15,56,203,254 + pshufd $14,%xmm1,%xmm0 + pcmpgtd %xmm6,%xmm9 + pcmpgtd %xmm6,%xmm10 +.byte 69,15,56,203,229 + pshufd $14,%xmm2,%xmm0 + pcmpgtd %xmm6,%xmm11 + movdqa K256_shaext-16(%rip),%xmm3 +.byte 69,15,56,203,247 + + pand %xmm9,%xmm13 + pand %xmm10,%xmm15 + pand %xmm9,%xmm12 + pand %xmm10,%xmm14 + paddd %xmm7,%xmm11 + + paddd 80(%rsp),%xmm13 + paddd 112(%rsp),%xmm15 + paddd 64(%rsp),%xmm12 + paddd 96(%rsp),%xmm14 + + movq %xmm11,(%rbx) + decl %edx + jnz .Loop_shaext + + movl 280(%rsp),%edx + + pshufd $27,%xmm12,%xmm12 + pshufd $27,%xmm13,%xmm13 + pshufd $27,%xmm14,%xmm14 + pshufd $27,%xmm15,%xmm15 + + movdqa %xmm12,%xmm5 + movdqa %xmm13,%xmm6 + punpckldq %xmm14,%xmm12 + punpckhdq %xmm14,%xmm5 + punpckldq %xmm15,%xmm13 + punpckhdq %xmm15,%xmm6 + + movq %xmm12,0-128(%rdi) + psrldq $8,%xmm12 + movq %xmm5,128-128(%rdi) + psrldq $8,%xmm5 + movq %xmm12,32-128(%rdi) + movq %xmm5,160-128(%rdi) + + movq %xmm13,64-128(%rdi) + psrldq $8,%xmm13 + movq %xmm6,192-128(%rdi) + psrldq $8,%xmm6 + movq %xmm13,96-128(%rdi) + movq %xmm6,224-128(%rdi) + + leaq 8(%rdi),%rdi + leaq 32(%rsi),%rsi + decl %edx + jnz .Loop_grande_shaext + +.Ldone_shaext: + + movq -16(%rax),%rbp + movq -8(%rax),%rbx + leaq (%rax),%rsp +.Lepilogue_shaext: + .byte 0xf3,0xc3 +.size sha256_multi_block_shaext,.-sha256_multi_block_shaext +.align 256 +K256: +.long 1116352408,1116352408,1116352408,1116352408 +.long 1116352408,1116352408,1116352408,1116352408 +.long 1899447441,1899447441,1899447441,1899447441 +.long 1899447441,1899447441,1899447441,1899447441 +.long 3049323471,3049323471,3049323471,3049323471 +.long 3049323471,3049323471,3049323471,3049323471 +.long 3921009573,3921009573,3921009573,3921009573 +.long 3921009573,3921009573,3921009573,3921009573 +.long 961987163,961987163,961987163,961987163 +.long 961987163,961987163,961987163,961987163 +.long 1508970993,1508970993,1508970993,1508970993 +.long 1508970993,1508970993,1508970993,1508970993 +.long 2453635748,2453635748,2453635748,2453635748 +.long 2453635748,2453635748,2453635748,2453635748 +.long 2870763221,2870763221,2870763221,2870763221 +.long 2870763221,2870763221,2870763221,2870763221 +.long 3624381080,3624381080,3624381080,3624381080 +.long 3624381080,3624381080,3624381080,3624381080 +.long 310598401,310598401,310598401,310598401 +.long 310598401,310598401,310598401,310598401 +.long 607225278,607225278,607225278,607225278 +.long 607225278,607225278,607225278,607225278 +.long 1426881987,1426881987,1426881987,1426881987 +.long 1426881987,1426881987,1426881987,1426881987 +.long 1925078388,1925078388,1925078388,1925078388 +.long 1925078388,1925078388,1925078388,1925078388 +.long 2162078206,2162078206,2162078206,2162078206 +.long 2162078206,2162078206,2162078206,2162078206 +.long 2614888103,2614888103,2614888103,2614888103 +.long 2614888103,2614888103,2614888103,2614888103 +.long 3248222580,3248222580,3248222580,3248222580 +.long 3248222580,3248222580,3248222580,3248222580 +.long 3835390401,3835390401,3835390401,3835390401 +.long 3835390401,3835390401,3835390401,3835390401 +.long 4022224774,4022224774,4022224774,4022224774 +.long 4022224774,4022224774,4022224774,4022224774 +.long 264347078,264347078,264347078,264347078 +.long 264347078,264347078,264347078,264347078 +.long 604807628,604807628,604807628,604807628 +.long 604807628,604807628,604807628,604807628 +.long 770255983,770255983,770255983,770255983 +.long 770255983,770255983,770255983,770255983 +.long 1249150122,1249150122,1249150122,1249150122 +.long 1249150122,1249150122,1249150122,1249150122 +.long 1555081692,1555081692,1555081692,1555081692 +.long 1555081692,1555081692,1555081692,1555081692 +.long 1996064986,1996064986,1996064986,1996064986 +.long 1996064986,1996064986,1996064986,1996064986 +.long 2554220882,2554220882,2554220882,2554220882 +.long 2554220882,2554220882,2554220882,2554220882 +.long 2821834349,2821834349,2821834349,2821834349 +.long 2821834349,2821834349,2821834349,2821834349 +.long 2952996808,2952996808,2952996808,2952996808 +.long 2952996808,2952996808,2952996808,2952996808 +.long 3210313671,3210313671,3210313671,3210313671 +.long 3210313671,3210313671,3210313671,3210313671 +.long 3336571891,3336571891,3336571891,3336571891 +.long 3336571891,3336571891,3336571891,3336571891 +.long 3584528711,3584528711,3584528711,3584528711 +.long 3584528711,3584528711,3584528711,3584528711 +.long 113926993,113926993,113926993,113926993 +.long 113926993,113926993,113926993,113926993 +.long 338241895,338241895,338241895,338241895 +.long 338241895,338241895,338241895,338241895 +.long 666307205,666307205,666307205,666307205 +.long 666307205,666307205,666307205,666307205 +.long 773529912,773529912,773529912,773529912 +.long 773529912,773529912,773529912,773529912 +.long 1294757372,1294757372,1294757372,1294757372 +.long 1294757372,1294757372,1294757372,1294757372 +.long 1396182291,1396182291,1396182291,1396182291 +.long 1396182291,1396182291,1396182291,1396182291 +.long 1695183700,1695183700,1695183700,1695183700 +.long 1695183700,1695183700,1695183700,1695183700 +.long 1986661051,1986661051,1986661051,1986661051 +.long 1986661051,1986661051,1986661051,1986661051 +.long 2177026350,2177026350,2177026350,2177026350 +.long 2177026350,2177026350,2177026350,2177026350 +.long 2456956037,2456956037,2456956037,2456956037 +.long 2456956037,2456956037,2456956037,2456956037 +.long 2730485921,2730485921,2730485921,2730485921 +.long 2730485921,2730485921,2730485921,2730485921 +.long 2820302411,2820302411,2820302411,2820302411 +.long 2820302411,2820302411,2820302411,2820302411 +.long 3259730800,3259730800,3259730800,3259730800 +.long 3259730800,3259730800,3259730800,3259730800 +.long 3345764771,3345764771,3345764771,3345764771 +.long 3345764771,3345764771,3345764771,3345764771 +.long 3516065817,3516065817,3516065817,3516065817 +.long 3516065817,3516065817,3516065817,3516065817 +.long 3600352804,3600352804,3600352804,3600352804 +.long 3600352804,3600352804,3600352804,3600352804 +.long 4094571909,4094571909,4094571909,4094571909 +.long 4094571909,4094571909,4094571909,4094571909 +.long 275423344,275423344,275423344,275423344 +.long 275423344,275423344,275423344,275423344 +.long 430227734,430227734,430227734,430227734 +.long 430227734,430227734,430227734,430227734 +.long 506948616,506948616,506948616,506948616 +.long 506948616,506948616,506948616,506948616 +.long 659060556,659060556,659060556,659060556 +.long 659060556,659060556,659060556,659060556 +.long 883997877,883997877,883997877,883997877 +.long 883997877,883997877,883997877,883997877 +.long 958139571,958139571,958139571,958139571 +.long 958139571,958139571,958139571,958139571 +.long 1322822218,1322822218,1322822218,1322822218 +.long 1322822218,1322822218,1322822218,1322822218 +.long 1537002063,1537002063,1537002063,1537002063 +.long 1537002063,1537002063,1537002063,1537002063 +.long 1747873779,1747873779,1747873779,1747873779 +.long 1747873779,1747873779,1747873779,1747873779 +.long 1955562222,1955562222,1955562222,1955562222 +.long 1955562222,1955562222,1955562222,1955562222 +.long 2024104815,2024104815,2024104815,2024104815 +.long 2024104815,2024104815,2024104815,2024104815 +.long 2227730452,2227730452,2227730452,2227730452 +.long 2227730452,2227730452,2227730452,2227730452 +.long 2361852424,2361852424,2361852424,2361852424 +.long 2361852424,2361852424,2361852424,2361852424 +.long 2428436474,2428436474,2428436474,2428436474 +.long 2428436474,2428436474,2428436474,2428436474 +.long 2756734187,2756734187,2756734187,2756734187 +.long 2756734187,2756734187,2756734187,2756734187 +.long 3204031479,3204031479,3204031479,3204031479 +.long 3204031479,3204031479,3204031479,3204031479 +.long 3329325298,3329325298,3329325298,3329325298 +.long 3329325298,3329325298,3329325298,3329325298 +.Lpbswap: +.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f +.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f +K256_shaext: +.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 +.byte 83,72,65,50,53,54,32,109,117,108,116,105,45,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 diff --git a/secure/lib/libcrypto/amd64/sha256-x86_64.S b/secure/lib/libcrypto/amd64/sha256-x86_64.S new file mode 100644 index 0000000..a43a668 --- /dev/null +++ b/secure/lib/libcrypto/amd64/sha256-x86_64.S @@ -0,0 +1,3050 @@ + # $FreeBSD$ +.text + + +.globl sha256_block_data_order +.type sha256_block_data_order,@function +.align 16 +sha256_block_data_order: + leaq OPENSSL_ia32cap_P(%rip),%r11 + movl 0(%r11),%r9d + movl 4(%r11),%r10d + movl 8(%r11),%r11d + testl $536870912,%r11d + jnz _shaext_shortcut + testl $512,%r10d + jnz .Lssse3_shortcut + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + movq %rsp,%r11 + shlq $4,%rdx + subq $64+32,%rsp + leaq (%rsi,%rdx,4),%rdx + andq $-64,%rsp + movq %rdi,64+0(%rsp) + movq %rsi,64+8(%rsp) + movq %rdx,64+16(%rsp) + movq %r11,64+24(%rsp) +.Lprologue: + + movl 0(%rdi),%eax + movl 4(%rdi),%ebx + movl 8(%rdi),%ecx + movl 12(%rdi),%edx + movl 16(%rdi),%r8d + movl 20(%rdi),%r9d + movl 24(%rdi),%r10d + movl 28(%rdi),%r11d + jmp .Lloop + +.align 16 +.Lloop: + movl %ebx,%edi + leaq K256(%rip),%rbp + xorl %ecx,%edi + movl 0(%rsi),%r12d + movl %r8d,%r13d + movl %eax,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r9d,%r15d + + xorl %r8d,%r13d + rorl $9,%r14d + xorl %r10d,%r15d + + movl %r12d,0(%rsp) + xorl %eax,%r14d + andl %r8d,%r15d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d + + rorl $11,%r14d + xorl %r8d,%r13d + addl %r15d,%r12d + + movl %eax,%r15d + addl (%rbp),%r12d + xorl %eax,%r14d + + xorl %ebx,%r15d + rorl $6,%r13d + movl %ebx,%r11d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r11d + addl %r12d,%edx + addl %r12d,%r11d + + leaq 4(%rbp),%rbp + addl %r14d,%r11d + movl 4(%rsi),%r12d + movl %edx,%r13d + movl %r11d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r8d,%edi + + xorl %edx,%r13d + rorl $9,%r14d + xorl %r9d,%edi + + movl %r12d,4(%rsp) + xorl %r11d,%r14d + andl %edx,%edi + + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi + + rorl $11,%r14d + xorl %edx,%r13d + addl %edi,%r12d + + movl %r11d,%edi + addl (%rbp),%r12d + xorl %r11d,%r14d + + xorl %eax,%edi + rorl $6,%r13d + movl %eax,%r10d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r10d + addl %r12d,%ecx + addl %r12d,%r10d + + leaq 4(%rbp),%rbp + addl %r14d,%r10d + movl 8(%rsi),%r12d + movl %ecx,%r13d + movl %r10d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %edx,%r15d + + xorl %ecx,%r13d + rorl $9,%r14d + xorl %r8d,%r15d + + movl %r12d,8(%rsp) + xorl %r10d,%r14d + andl %ecx,%r15d + + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d + + rorl $11,%r14d + xorl %ecx,%r13d + addl %r15d,%r12d + + movl %r10d,%r15d + addl (%rbp),%r12d + xorl %r10d,%r14d + + xorl %r11d,%r15d + rorl $6,%r13d + movl %r11d,%r9d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r9d + addl %r12d,%ebx + addl %r12d,%r9d + + leaq 4(%rbp),%rbp + addl %r14d,%r9d + movl 12(%rsi),%r12d + movl %ebx,%r13d + movl %r9d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %ecx,%edi + + xorl %ebx,%r13d + rorl $9,%r14d + xorl %edx,%edi + + movl %r12d,12(%rsp) + xorl %r9d,%r14d + andl %ebx,%edi + + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi + + rorl $11,%r14d + xorl %ebx,%r13d + addl %edi,%r12d + + movl %r9d,%edi + addl (%rbp),%r12d + xorl %r9d,%r14d + + xorl %r10d,%edi + rorl $6,%r13d + movl %r10d,%r8d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r8d + addl %r12d,%eax + addl %r12d,%r8d + + leaq 20(%rbp),%rbp + addl %r14d,%r8d + movl 16(%rsi),%r12d + movl %eax,%r13d + movl %r8d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %ebx,%r15d + + xorl %eax,%r13d + rorl $9,%r14d + xorl %ecx,%r15d + + movl %r12d,16(%rsp) + xorl %r8d,%r14d + andl %eax,%r15d + + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d + + rorl $11,%r14d + xorl %eax,%r13d + addl %r15d,%r12d + + movl %r8d,%r15d + addl (%rbp),%r12d + xorl %r8d,%r14d + + xorl %r9d,%r15d + rorl $6,%r13d + movl %r9d,%edx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%edx + addl %r12d,%r11d + addl %r12d,%edx + + leaq 4(%rbp),%rbp + addl %r14d,%edx + movl 20(%rsi),%r12d + movl %r11d,%r13d + movl %edx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %eax,%edi + + xorl %r11d,%r13d + rorl $9,%r14d + xorl %ebx,%edi + + movl %r12d,20(%rsp) + xorl %edx,%r14d + andl %r11d,%edi + + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi + + rorl $11,%r14d + xorl %r11d,%r13d + addl %edi,%r12d + + movl %edx,%edi + addl (%rbp),%r12d + xorl %edx,%r14d + + xorl %r8d,%edi + rorl $6,%r13d + movl %r8d,%ecx + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%ecx + addl %r12d,%r10d + addl %r12d,%ecx + + leaq 4(%rbp),%rbp + addl %r14d,%ecx + movl 24(%rsi),%r12d + movl %r10d,%r13d + movl %ecx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r11d,%r15d + + xorl %r10d,%r13d + rorl $9,%r14d + xorl %eax,%r15d + + movl %r12d,24(%rsp) + xorl %ecx,%r14d + andl %r10d,%r15d + + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d + + rorl $11,%r14d + xorl %r10d,%r13d + addl %r15d,%r12d + + movl %ecx,%r15d + addl (%rbp),%r12d + xorl %ecx,%r14d + + xorl %edx,%r15d + rorl $6,%r13d + movl %edx,%ebx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%ebx + addl %r12d,%r9d + addl %r12d,%ebx + + leaq 4(%rbp),%rbp + addl %r14d,%ebx + movl 28(%rsi),%r12d + movl %r9d,%r13d + movl %ebx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r10d,%edi + + xorl %r9d,%r13d + rorl $9,%r14d + xorl %r11d,%edi + + movl %r12d,28(%rsp) + xorl %ebx,%r14d + andl %r9d,%edi + + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi + + rorl $11,%r14d + xorl %r9d,%r13d + addl %edi,%r12d + + movl %ebx,%edi + addl (%rbp),%r12d + xorl %ebx,%r14d + + xorl %ecx,%edi + rorl $6,%r13d + movl %ecx,%eax + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%eax + addl %r12d,%r8d + addl %r12d,%eax + + leaq 20(%rbp),%rbp + addl %r14d,%eax + movl 32(%rsi),%r12d + movl %r8d,%r13d + movl %eax,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r9d,%r15d + + xorl %r8d,%r13d + rorl $9,%r14d + xorl %r10d,%r15d + + movl %r12d,32(%rsp) + xorl %eax,%r14d + andl %r8d,%r15d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d + + rorl $11,%r14d + xorl %r8d,%r13d + addl %r15d,%r12d + + movl %eax,%r15d + addl (%rbp),%r12d + xorl %eax,%r14d + + xorl %ebx,%r15d + rorl $6,%r13d + movl %ebx,%r11d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r11d + addl %r12d,%edx + addl %r12d,%r11d + + leaq 4(%rbp),%rbp + addl %r14d,%r11d + movl 36(%rsi),%r12d + movl %edx,%r13d + movl %r11d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r8d,%edi + + xorl %edx,%r13d + rorl $9,%r14d + xorl %r9d,%edi + + movl %r12d,36(%rsp) + xorl %r11d,%r14d + andl %edx,%edi + + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi + + rorl $11,%r14d + xorl %edx,%r13d + addl %edi,%r12d + + movl %r11d,%edi + addl (%rbp),%r12d + xorl %r11d,%r14d + + xorl %eax,%edi + rorl $6,%r13d + movl %eax,%r10d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r10d + addl %r12d,%ecx + addl %r12d,%r10d + + leaq 4(%rbp),%rbp + addl %r14d,%r10d + movl 40(%rsi),%r12d + movl %ecx,%r13d + movl %r10d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %edx,%r15d + + xorl %ecx,%r13d + rorl $9,%r14d + xorl %r8d,%r15d + + movl %r12d,40(%rsp) + xorl %r10d,%r14d + andl %ecx,%r15d + + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d + + rorl $11,%r14d + xorl %ecx,%r13d + addl %r15d,%r12d + + movl %r10d,%r15d + addl (%rbp),%r12d + xorl %r10d,%r14d + + xorl %r11d,%r15d + rorl $6,%r13d + movl %r11d,%r9d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r9d + addl %r12d,%ebx + addl %r12d,%r9d + + leaq 4(%rbp),%rbp + addl %r14d,%r9d + movl 44(%rsi),%r12d + movl %ebx,%r13d + movl %r9d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %ecx,%edi + + xorl %ebx,%r13d + rorl $9,%r14d + xorl %edx,%edi + + movl %r12d,44(%rsp) + xorl %r9d,%r14d + andl %ebx,%edi + + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi + + rorl $11,%r14d + xorl %ebx,%r13d + addl %edi,%r12d + + movl %r9d,%edi + addl (%rbp),%r12d + xorl %r9d,%r14d + + xorl %r10d,%edi + rorl $6,%r13d + movl %r10d,%r8d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r8d + addl %r12d,%eax + addl %r12d,%r8d + + leaq 20(%rbp),%rbp + addl %r14d,%r8d + movl 48(%rsi),%r12d + movl %eax,%r13d + movl %r8d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %ebx,%r15d + + xorl %eax,%r13d + rorl $9,%r14d + xorl %ecx,%r15d + + movl %r12d,48(%rsp) + xorl %r8d,%r14d + andl %eax,%r15d + + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d + + rorl $11,%r14d + xorl %eax,%r13d + addl %r15d,%r12d + + movl %r8d,%r15d + addl (%rbp),%r12d + xorl %r8d,%r14d + + xorl %r9d,%r15d + rorl $6,%r13d + movl %r9d,%edx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%edx + addl %r12d,%r11d + addl %r12d,%edx + + leaq 4(%rbp),%rbp + addl %r14d,%edx + movl 52(%rsi),%r12d + movl %r11d,%r13d + movl %edx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %eax,%edi + + xorl %r11d,%r13d + rorl $9,%r14d + xorl %ebx,%edi + + movl %r12d,52(%rsp) + xorl %edx,%r14d + andl %r11d,%edi + + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi + + rorl $11,%r14d + xorl %r11d,%r13d + addl %edi,%r12d + + movl %edx,%edi + addl (%rbp),%r12d + xorl %edx,%r14d + + xorl %r8d,%edi + rorl $6,%r13d + movl %r8d,%ecx + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%ecx + addl %r12d,%r10d + addl %r12d,%ecx + + leaq 4(%rbp),%rbp + addl %r14d,%ecx + movl 56(%rsi),%r12d + movl %r10d,%r13d + movl %ecx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r11d,%r15d + + xorl %r10d,%r13d + rorl $9,%r14d + xorl %eax,%r15d + + movl %r12d,56(%rsp) + xorl %ecx,%r14d + andl %r10d,%r15d + + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d + + rorl $11,%r14d + xorl %r10d,%r13d + addl %r15d,%r12d + + movl %ecx,%r15d + addl (%rbp),%r12d + xorl %ecx,%r14d + + xorl %edx,%r15d + rorl $6,%r13d + movl %edx,%ebx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%ebx + addl %r12d,%r9d + addl %r12d,%ebx + + leaq 4(%rbp),%rbp + addl %r14d,%ebx + movl 60(%rsi),%r12d + movl %r9d,%r13d + movl %ebx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r10d,%edi + + xorl %r9d,%r13d + rorl $9,%r14d + xorl %r11d,%edi + + movl %r12d,60(%rsp) + xorl %ebx,%r14d + andl %r9d,%edi + + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi + + rorl $11,%r14d + xorl %r9d,%r13d + addl %edi,%r12d + + movl %ebx,%edi + addl (%rbp),%r12d + xorl %ebx,%r14d + + xorl %ecx,%edi + rorl $6,%r13d + movl %ecx,%eax + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%eax + addl %r12d,%r8d + addl %r12d,%eax + + leaq 20(%rbp),%rbp + jmp .Lrounds_16_xx +.align 16 +.Lrounds_16_xx: + movl 4(%rsp),%r13d + movl 56(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%eax + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 36(%rsp),%r12d + + addl 0(%rsp),%r12d + movl %r8d,%r13d + addl %r15d,%r12d + movl %eax,%r14d + rorl $14,%r13d + movl %r9d,%r15d + + xorl %r8d,%r13d + rorl $9,%r14d + xorl %r10d,%r15d + + movl %r12d,0(%rsp) + xorl %eax,%r14d + andl %r8d,%r15d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d + + rorl $11,%r14d + xorl %r8d,%r13d + addl %r15d,%r12d + + movl %eax,%r15d + addl (%rbp),%r12d + xorl %eax,%r14d + + xorl %ebx,%r15d + rorl $6,%r13d + movl %ebx,%r11d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r11d + addl %r12d,%edx + addl %r12d,%r11d + + leaq 4(%rbp),%rbp + movl 8(%rsp),%r13d + movl 60(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r11d + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 40(%rsp),%r12d + + addl 4(%rsp),%r12d + movl %edx,%r13d + addl %edi,%r12d + movl %r11d,%r14d + rorl $14,%r13d + movl %r8d,%edi + + xorl %edx,%r13d + rorl $9,%r14d + xorl %r9d,%edi + + movl %r12d,4(%rsp) + xorl %r11d,%r14d + andl %edx,%edi + + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi + + rorl $11,%r14d + xorl %edx,%r13d + addl %edi,%r12d + + movl %r11d,%edi + addl (%rbp),%r12d + xorl %r11d,%r14d + + xorl %eax,%edi + rorl $6,%r13d + movl %eax,%r10d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r10d + addl %r12d,%ecx + addl %r12d,%r10d + + leaq 4(%rbp),%rbp + movl 12(%rsp),%r13d + movl 0(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r10d + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 44(%rsp),%r12d + + addl 8(%rsp),%r12d + movl %ecx,%r13d + addl %r15d,%r12d + movl %r10d,%r14d + rorl $14,%r13d + movl %edx,%r15d + + xorl %ecx,%r13d + rorl $9,%r14d + xorl %r8d,%r15d + + movl %r12d,8(%rsp) + xorl %r10d,%r14d + andl %ecx,%r15d + + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d + + rorl $11,%r14d + xorl %ecx,%r13d + addl %r15d,%r12d + + movl %r10d,%r15d + addl (%rbp),%r12d + xorl %r10d,%r14d + + xorl %r11d,%r15d + rorl $6,%r13d + movl %r11d,%r9d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r9d + addl %r12d,%ebx + addl %r12d,%r9d + + leaq 4(%rbp),%rbp + movl 16(%rsp),%r13d + movl 4(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r9d + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 48(%rsp),%r12d + + addl 12(%rsp),%r12d + movl %ebx,%r13d + addl %edi,%r12d + movl %r9d,%r14d + rorl $14,%r13d + movl %ecx,%edi + + xorl %ebx,%r13d + rorl $9,%r14d + xorl %edx,%edi + + movl %r12d,12(%rsp) + xorl %r9d,%r14d + andl %ebx,%edi + + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi + + rorl $11,%r14d + xorl %ebx,%r13d + addl %edi,%r12d + + movl %r9d,%edi + addl (%rbp),%r12d + xorl %r9d,%r14d + + xorl %r10d,%edi + rorl $6,%r13d + movl %r10d,%r8d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r8d + addl %r12d,%eax + addl %r12d,%r8d + + leaq 20(%rbp),%rbp + movl 20(%rsp),%r13d + movl 8(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r8d + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 52(%rsp),%r12d + + addl 16(%rsp),%r12d + movl %eax,%r13d + addl %r15d,%r12d + movl %r8d,%r14d + rorl $14,%r13d + movl %ebx,%r15d + + xorl %eax,%r13d + rorl $9,%r14d + xorl %ecx,%r15d + + movl %r12d,16(%rsp) + xorl %r8d,%r14d + andl %eax,%r15d + + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d + + rorl $11,%r14d + xorl %eax,%r13d + addl %r15d,%r12d + + movl %r8d,%r15d + addl (%rbp),%r12d + xorl %r8d,%r14d + + xorl %r9d,%r15d + rorl $6,%r13d + movl %r9d,%edx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%edx + addl %r12d,%r11d + addl %r12d,%edx + + leaq 4(%rbp),%rbp + movl 24(%rsp),%r13d + movl 12(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%edx + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 56(%rsp),%r12d + + addl 20(%rsp),%r12d + movl %r11d,%r13d + addl %edi,%r12d + movl %edx,%r14d + rorl $14,%r13d + movl %eax,%edi + + xorl %r11d,%r13d + rorl $9,%r14d + xorl %ebx,%edi + + movl %r12d,20(%rsp) + xorl %edx,%r14d + andl %r11d,%edi + + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi + + rorl $11,%r14d + xorl %r11d,%r13d + addl %edi,%r12d + + movl %edx,%edi + addl (%rbp),%r12d + xorl %edx,%r14d + + xorl %r8d,%edi + rorl $6,%r13d + movl %r8d,%ecx + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%ecx + addl %r12d,%r10d + addl %r12d,%ecx + + leaq 4(%rbp),%rbp + movl 28(%rsp),%r13d + movl 16(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ecx + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 60(%rsp),%r12d + + addl 24(%rsp),%r12d + movl %r10d,%r13d + addl %r15d,%r12d + movl %ecx,%r14d + rorl $14,%r13d + movl %r11d,%r15d + + xorl %r10d,%r13d + rorl $9,%r14d + xorl %eax,%r15d + + movl %r12d,24(%rsp) + xorl %ecx,%r14d + andl %r10d,%r15d + + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d + + rorl $11,%r14d + xorl %r10d,%r13d + addl %r15d,%r12d + + movl %ecx,%r15d + addl (%rbp),%r12d + xorl %ecx,%r14d + + xorl %edx,%r15d + rorl $6,%r13d + movl %edx,%ebx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%ebx + addl %r12d,%r9d + addl %r12d,%ebx + + leaq 4(%rbp),%rbp + movl 32(%rsp),%r13d + movl 20(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ebx + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 0(%rsp),%r12d + + addl 28(%rsp),%r12d + movl %r9d,%r13d + addl %edi,%r12d + movl %ebx,%r14d + rorl $14,%r13d + movl %r10d,%edi + + xorl %r9d,%r13d + rorl $9,%r14d + xorl %r11d,%edi + + movl %r12d,28(%rsp) + xorl %ebx,%r14d + andl %r9d,%edi + + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi + + rorl $11,%r14d + xorl %r9d,%r13d + addl %edi,%r12d + + movl %ebx,%edi + addl (%rbp),%r12d + xorl %ebx,%r14d + + xorl %ecx,%edi + rorl $6,%r13d + movl %ecx,%eax + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%eax + addl %r12d,%r8d + addl %r12d,%eax + + leaq 20(%rbp),%rbp + movl 36(%rsp),%r13d + movl 24(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%eax + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 4(%rsp),%r12d + + addl 32(%rsp),%r12d + movl %r8d,%r13d + addl %r15d,%r12d + movl %eax,%r14d + rorl $14,%r13d + movl %r9d,%r15d + + xorl %r8d,%r13d + rorl $9,%r14d + xorl %r10d,%r15d + + movl %r12d,32(%rsp) + xorl %eax,%r14d + andl %r8d,%r15d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d + + rorl $11,%r14d + xorl %r8d,%r13d + addl %r15d,%r12d + + movl %eax,%r15d + addl (%rbp),%r12d + xorl %eax,%r14d + + xorl %ebx,%r15d + rorl $6,%r13d + movl %ebx,%r11d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r11d + addl %r12d,%edx + addl %r12d,%r11d + + leaq 4(%rbp),%rbp + movl 40(%rsp),%r13d + movl 28(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r11d + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 8(%rsp),%r12d + + addl 36(%rsp),%r12d + movl %edx,%r13d + addl %edi,%r12d + movl %r11d,%r14d + rorl $14,%r13d + movl %r8d,%edi + + xorl %edx,%r13d + rorl $9,%r14d + xorl %r9d,%edi + + movl %r12d,36(%rsp) + xorl %r11d,%r14d + andl %edx,%edi + + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi + + rorl $11,%r14d + xorl %edx,%r13d + addl %edi,%r12d + + movl %r11d,%edi + addl (%rbp),%r12d + xorl %r11d,%r14d + + xorl %eax,%edi + rorl $6,%r13d + movl %eax,%r10d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r10d + addl %r12d,%ecx + addl %r12d,%r10d + + leaq 4(%rbp),%rbp + movl 44(%rsp),%r13d + movl 32(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r10d + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 12(%rsp),%r12d + + addl 40(%rsp),%r12d + movl %ecx,%r13d + addl %r15d,%r12d + movl %r10d,%r14d + rorl $14,%r13d + movl %edx,%r15d + + xorl %ecx,%r13d + rorl $9,%r14d + xorl %r8d,%r15d + + movl %r12d,40(%rsp) + xorl %r10d,%r14d + andl %ecx,%r15d + + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d + + rorl $11,%r14d + xorl %ecx,%r13d + addl %r15d,%r12d + + movl %r10d,%r15d + addl (%rbp),%r12d + xorl %r10d,%r14d + + xorl %r11d,%r15d + rorl $6,%r13d + movl %r11d,%r9d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r9d + addl %r12d,%ebx + addl %r12d,%r9d + + leaq 4(%rbp),%rbp + movl 48(%rsp),%r13d + movl 36(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r9d + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 16(%rsp),%r12d + + addl 44(%rsp),%r12d + movl %ebx,%r13d + addl %edi,%r12d + movl %r9d,%r14d + rorl $14,%r13d + movl %ecx,%edi + + xorl %ebx,%r13d + rorl $9,%r14d + xorl %edx,%edi + + movl %r12d,44(%rsp) + xorl %r9d,%r14d + andl %ebx,%edi + + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi + + rorl $11,%r14d + xorl %ebx,%r13d + addl %edi,%r12d + + movl %r9d,%edi + addl (%rbp),%r12d + xorl %r9d,%r14d + + xorl %r10d,%edi + rorl $6,%r13d + movl %r10d,%r8d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r8d + addl %r12d,%eax + addl %r12d,%r8d + + leaq 20(%rbp),%rbp + movl 52(%rsp),%r13d + movl 40(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r8d + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 20(%rsp),%r12d + + addl 48(%rsp),%r12d + movl %eax,%r13d + addl %r15d,%r12d + movl %r8d,%r14d + rorl $14,%r13d + movl %ebx,%r15d + + xorl %eax,%r13d + rorl $9,%r14d + xorl %ecx,%r15d + + movl %r12d,48(%rsp) + xorl %r8d,%r14d + andl %eax,%r15d + + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d + + rorl $11,%r14d + xorl %eax,%r13d + addl %r15d,%r12d + + movl %r8d,%r15d + addl (%rbp),%r12d + xorl %r8d,%r14d + + xorl %r9d,%r15d + rorl $6,%r13d + movl %r9d,%edx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%edx + addl %r12d,%r11d + addl %r12d,%edx + + leaq 4(%rbp),%rbp + movl 56(%rsp),%r13d + movl 44(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%edx + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 24(%rsp),%r12d + + addl 52(%rsp),%r12d + movl %r11d,%r13d + addl %edi,%r12d + movl %edx,%r14d + rorl $14,%r13d + movl %eax,%edi + + xorl %r11d,%r13d + rorl $9,%r14d + xorl %ebx,%edi + + movl %r12d,52(%rsp) + xorl %edx,%r14d + andl %r11d,%edi + + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi + + rorl $11,%r14d + xorl %r11d,%r13d + addl %edi,%r12d + + movl %edx,%edi + addl (%rbp),%r12d + xorl %edx,%r14d + + xorl %r8d,%edi + rorl $6,%r13d + movl %r8d,%ecx + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%ecx + addl %r12d,%r10d + addl %r12d,%ecx + + leaq 4(%rbp),%rbp + movl 60(%rsp),%r13d + movl 48(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ecx + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 28(%rsp),%r12d + + addl 56(%rsp),%r12d + movl %r10d,%r13d + addl %r15d,%r12d + movl %ecx,%r14d + rorl $14,%r13d + movl %r11d,%r15d + + xorl %r10d,%r13d + rorl $9,%r14d + xorl %eax,%r15d + + movl %r12d,56(%rsp) + xorl %ecx,%r14d + andl %r10d,%r15d + + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d + + rorl $11,%r14d + xorl %r10d,%r13d + addl %r15d,%r12d + + movl %ecx,%r15d + addl (%rbp),%r12d + xorl %ecx,%r14d + + xorl %edx,%r15d + rorl $6,%r13d + movl %edx,%ebx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%ebx + addl %r12d,%r9d + addl %r12d,%ebx + + leaq 4(%rbp),%rbp + movl 0(%rsp),%r13d + movl 52(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ebx + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 32(%rsp),%r12d + + addl 60(%rsp),%r12d + movl %r9d,%r13d + addl %edi,%r12d + movl %ebx,%r14d + rorl $14,%r13d + movl %r10d,%edi + + xorl %r9d,%r13d + rorl $9,%r14d + xorl %r11d,%edi + + movl %r12d,60(%rsp) + xorl %ebx,%r14d + andl %r9d,%edi + + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi + + rorl $11,%r14d + xorl %r9d,%r13d + addl %edi,%r12d + + movl %ebx,%edi + addl (%rbp),%r12d + xorl %ebx,%r14d + + xorl %ecx,%edi + rorl $6,%r13d + movl %ecx,%eax + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%eax + addl %r12d,%r8d + addl %r12d,%eax + + leaq 20(%rbp),%rbp + cmpb $0,3(%rbp) + jnz .Lrounds_16_xx + + movq 64+0(%rsp),%rdi + addl %r14d,%eax + leaq 64(%rsi),%rsi + + addl 0(%rdi),%eax + addl 4(%rdi),%ebx + addl 8(%rdi),%ecx + addl 12(%rdi),%edx + addl 16(%rdi),%r8d + addl 20(%rdi),%r9d + addl 24(%rdi),%r10d + addl 28(%rdi),%r11d + + cmpq 64+16(%rsp),%rsi + + movl %eax,0(%rdi) + movl %ebx,4(%rdi) + movl %ecx,8(%rdi) + movl %edx,12(%rdi) + movl %r8d,16(%rdi) + movl %r9d,20(%rdi) + movl %r10d,24(%rdi) + movl %r11d,28(%rdi) + jb .Lloop + + movq 64+24(%rsp),%rsi + movq (%rsi),%r15 + movq 8(%rsi),%r14 + movq 16(%rsi),%r13 + movq 24(%rsi),%r12 + movq 32(%rsi),%rbp + movq 40(%rsi),%rbx + leaq 48(%rsi),%rsp +.Lepilogue: + .byte 0xf3,0xc3 +.size sha256_block_data_order,.-sha256_block_data_order +.align 64 +.type K256,@object +K256: +.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 +.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + +.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f +.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f +.long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff +.long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff +.long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 +.long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 +.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.type sha256_block_data_order_shaext,@function +.align 64 +sha256_block_data_order_shaext: +_shaext_shortcut: + leaq K256+128(%rip),%rcx + movdqu (%rdi),%xmm1 + movdqu 16(%rdi),%xmm2 + movdqa 512-128(%rcx),%xmm7 + + pshufd $27,%xmm1,%xmm0 + pshufd $177,%xmm1,%xmm1 + pshufd $27,%xmm2,%xmm2 + movdqa %xmm7,%xmm8 +.byte 102,15,58,15,202,8 + punpcklqdq %xmm0,%xmm2 + jmp .Loop_shaext + +.align 16 +.Loop_shaext: + movdqu (%rsi),%xmm3 + movdqu 16(%rsi),%xmm4 + movdqu 32(%rsi),%xmm5 +.byte 102,15,56,0,223 + movdqu 48(%rsi),%xmm6 + + movdqa 0-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 102,15,56,0,231 + movdqa %xmm2,%xmm10 +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 + nop + movdqa %xmm1,%xmm9 +.byte 15,56,203,202 + + movdqa 32-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 102,15,56,0,239 +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 + leaq 64(%rsi),%rsi +.byte 15,56,204,220 +.byte 15,56,203,202 + + movdqa 64-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 102,15,56,0,247 +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 + movdqa %xmm6,%xmm7 +.byte 102,15,58,15,253,4 + nop + paddd %xmm7,%xmm3 +.byte 15,56,204,229 +.byte 15,56,203,202 + + movdqa 96-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 +.byte 15,56,205,222 +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 + movdqa %xmm3,%xmm7 +.byte 102,15,58,15,254,4 + nop + paddd %xmm7,%xmm4 +.byte 15,56,204,238 +.byte 15,56,203,202 + movdqa 128-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 15,56,205,227 +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 + movdqa %xmm4,%xmm7 +.byte 102,15,58,15,251,4 + nop + paddd %xmm7,%xmm5 +.byte 15,56,204,243 +.byte 15,56,203,202 + movdqa 160-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 15,56,205,236 +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 + movdqa %xmm5,%xmm7 +.byte 102,15,58,15,252,4 + nop + paddd %xmm7,%xmm6 +.byte 15,56,204,220 +.byte 15,56,203,202 + movdqa 192-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 15,56,205,245 +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 + movdqa %xmm6,%xmm7 +.byte 102,15,58,15,253,4 + nop + paddd %xmm7,%xmm3 +.byte 15,56,204,229 +.byte 15,56,203,202 + movdqa 224-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 +.byte 15,56,205,222 +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 + movdqa %xmm3,%xmm7 +.byte 102,15,58,15,254,4 + nop + paddd %xmm7,%xmm4 +.byte 15,56,204,238 +.byte 15,56,203,202 + movdqa 256-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 15,56,205,227 +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 + movdqa %xmm4,%xmm7 +.byte 102,15,58,15,251,4 + nop + paddd %xmm7,%xmm5 +.byte 15,56,204,243 +.byte 15,56,203,202 + movdqa 288-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 15,56,205,236 +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 + movdqa %xmm5,%xmm7 +.byte 102,15,58,15,252,4 + nop + paddd %xmm7,%xmm6 +.byte 15,56,204,220 +.byte 15,56,203,202 + movdqa 320-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 15,56,205,245 +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 + movdqa %xmm6,%xmm7 +.byte 102,15,58,15,253,4 + nop + paddd %xmm7,%xmm3 +.byte 15,56,204,229 +.byte 15,56,203,202 + movdqa 352-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 +.byte 15,56,205,222 +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 + movdqa %xmm3,%xmm7 +.byte 102,15,58,15,254,4 + nop + paddd %xmm7,%xmm4 +.byte 15,56,204,238 +.byte 15,56,203,202 + movdqa 384-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 15,56,205,227 +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 + movdqa %xmm4,%xmm7 +.byte 102,15,58,15,251,4 + nop + paddd %xmm7,%xmm5 +.byte 15,56,204,243 +.byte 15,56,203,202 + movdqa 416-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 15,56,205,236 +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 + movdqa %xmm5,%xmm7 +.byte 102,15,58,15,252,4 +.byte 15,56,203,202 + paddd %xmm7,%xmm6 + + movdqa 448-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 +.byte 15,56,205,245 + movdqa %xmm8,%xmm7 +.byte 15,56,203,202 + + movdqa 480-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 + nop +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 + decq %rdx + nop +.byte 15,56,203,202 + + paddd %xmm10,%xmm2 + paddd %xmm9,%xmm1 + jnz .Loop_shaext + + pshufd $177,%xmm2,%xmm2 + pshufd $27,%xmm1,%xmm7 + pshufd $177,%xmm1,%xmm1 + punpckhqdq %xmm2,%xmm1 +.byte 102,15,58,15,215,8 + + movdqu %xmm1,(%rdi) + movdqu %xmm2,16(%rdi) + .byte 0xf3,0xc3 +.size sha256_block_data_order_shaext,.-sha256_block_data_order_shaext +.type sha256_block_data_order_ssse3,@function +.align 64 +sha256_block_data_order_ssse3: +.Lssse3_shortcut: + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + movq %rsp,%r11 + shlq $4,%rdx + subq $96,%rsp + leaq (%rsi,%rdx,4),%rdx + andq $-64,%rsp + movq %rdi,64+0(%rsp) + movq %rsi,64+8(%rsp) + movq %rdx,64+16(%rsp) + movq %r11,64+24(%rsp) +.Lprologue_ssse3: + + movl 0(%rdi),%eax + movl 4(%rdi),%ebx + movl 8(%rdi),%ecx + movl 12(%rdi),%edx + movl 16(%rdi),%r8d + movl 20(%rdi),%r9d + movl 24(%rdi),%r10d + movl 28(%rdi),%r11d + + + jmp .Lloop_ssse3 +.align 16 +.Lloop_ssse3: + movdqa K256+512(%rip),%xmm7 + movdqu 0(%rsi),%xmm0 + movdqu 16(%rsi),%xmm1 + movdqu 32(%rsi),%xmm2 +.byte 102,15,56,0,199 + movdqu 48(%rsi),%xmm3 + leaq K256(%rip),%rbp +.byte 102,15,56,0,207 + movdqa 0(%rbp),%xmm4 + movdqa 32(%rbp),%xmm5 +.byte 102,15,56,0,215 + paddd %xmm0,%xmm4 + movdqa 64(%rbp),%xmm6 +.byte 102,15,56,0,223 + movdqa 96(%rbp),%xmm7 + paddd %xmm1,%xmm5 + paddd %xmm2,%xmm6 + paddd %xmm3,%xmm7 + movdqa %xmm4,0(%rsp) + movl %eax,%r14d + movdqa %xmm5,16(%rsp) + movl %ebx,%edi + movdqa %xmm6,32(%rsp) + xorl %ecx,%edi + movdqa %xmm7,48(%rsp) + movl %r8d,%r13d + jmp .Lssse3_00_47 + +.align 16 +.Lssse3_00_47: + subq $-128,%rbp + rorl $14,%r13d + movdqa %xmm1,%xmm4 + movl %r14d,%eax + movl %r9d,%r12d + movdqa %xmm3,%xmm7 + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d +.byte 102,15,58,15,224,4 + andl %r8d,%r12d + xorl %r8d,%r13d +.byte 102,15,58,15,250,4 + addl 0(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %ebx,%r15d + addl %r12d,%r11d + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + paddd %xmm7,%xmm0 + rorl $2,%r14d + addl %r11d,%edx + psrld $7,%xmm6 + addl %edi,%r11d + movl %edx,%r13d + pshufd $250,%xmm3,%xmm7 + addl %r11d,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%r11d + movl %r8d,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %r11d,%r14d + pxor %xmm5,%xmm4 + andl %edx,%r12d + xorl %edx,%r13d + pslld $11,%xmm5 + addl 4(%rsp),%r10d + movl %r11d,%edi + pxor %xmm6,%xmm4 + xorl %r9d,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %eax,%edi + addl %r12d,%r10d + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + psrld $10,%xmm7 + addl %r13d,%r10d + xorl %eax,%r15d + paddd %xmm4,%xmm0 + rorl $2,%r14d + addl %r10d,%ecx + psrlq $17,%xmm6 + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %ecx,%r13d + xorl %r8d,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %ecx,%r13d + addl 8(%rsp),%r9d + movl %r10d,%r15d + psrldq $8,%xmm7 + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + paddd %xmm7,%xmm0 + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + pshufd $80,%xmm0,%xmm7 + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + movdqa %xmm7,%xmm6 + addl %edi,%r9d + movl %ebx,%r13d + psrld $10,%xmm7 + addl %r9d,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%r9d + movl %ecx,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + psrlq $2,%xmm6 + andl %ebx,%r12d + xorl %ebx,%r13d + addl 12(%rsp),%r8d + pxor %xmm6,%xmm7 + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %r10d,%edi + addl %r12d,%r8d + movdqa 0(%rbp),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + paddd %xmm7,%xmm0 + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + paddd %xmm0,%xmm6 + movl %eax,%r13d + addl %r8d,%r14d + movdqa %xmm6,0(%rsp) + rorl $14,%r13d + movdqa %xmm2,%xmm4 + movl %r14d,%r8d + movl %ebx,%r12d + movdqa %xmm0,%xmm7 + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d +.byte 102,15,58,15,225,4 + andl %eax,%r12d + xorl %eax,%r13d +.byte 102,15,58,15,251,4 + addl 16(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %r9d,%r15d + addl %r12d,%edx + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + paddd %xmm7,%xmm1 + rorl $2,%r14d + addl %edx,%r11d + psrld $7,%xmm6 + addl %edi,%edx + movl %r11d,%r13d + pshufd $250,%xmm0,%xmm7 + addl %edx,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%edx + movl %eax,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %edx,%r14d + pxor %xmm5,%xmm4 + andl %r11d,%r12d + xorl %r11d,%r13d + pslld $11,%xmm5 + addl 20(%rsp),%ecx + movl %edx,%edi + pxor %xmm6,%xmm4 + xorl %ebx,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %r8d,%edi + addl %r12d,%ecx + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + psrld $10,%xmm7 + addl %r13d,%ecx + xorl %r8d,%r15d + paddd %xmm4,%xmm1 + rorl $2,%r14d + addl %ecx,%r10d + psrlq $17,%xmm6 + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %r10d,%r13d + xorl %eax,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %r10d,%r13d + addl 24(%rsp),%ebx + movl %ecx,%r15d + psrldq $8,%xmm7 + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + paddd %xmm7,%xmm1 + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + pshufd $80,%xmm1,%xmm7 + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + movdqa %xmm7,%xmm6 + addl %edi,%ebx + movl %r9d,%r13d + psrld $10,%xmm7 + addl %ebx,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%ebx + movl %r10d,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + psrlq $2,%xmm6 + andl %r9d,%r12d + xorl %r9d,%r13d + addl 28(%rsp),%eax + pxor %xmm6,%xmm7 + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %ecx,%edi + addl %r12d,%eax + movdqa 32(%rbp),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + paddd %xmm7,%xmm1 + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + paddd %xmm1,%xmm6 + movl %r8d,%r13d + addl %eax,%r14d + movdqa %xmm6,16(%rsp) + rorl $14,%r13d + movdqa %xmm3,%xmm4 + movl %r14d,%eax + movl %r9d,%r12d + movdqa %xmm1,%xmm7 + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d +.byte 102,15,58,15,226,4 + andl %r8d,%r12d + xorl %r8d,%r13d +.byte 102,15,58,15,248,4 + addl 32(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %ebx,%r15d + addl %r12d,%r11d + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + paddd %xmm7,%xmm2 + rorl $2,%r14d + addl %r11d,%edx + psrld $7,%xmm6 + addl %edi,%r11d + movl %edx,%r13d + pshufd $250,%xmm1,%xmm7 + addl %r11d,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%r11d + movl %r8d,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %r11d,%r14d + pxor %xmm5,%xmm4 + andl %edx,%r12d + xorl %edx,%r13d + pslld $11,%xmm5 + addl 36(%rsp),%r10d + movl %r11d,%edi + pxor %xmm6,%xmm4 + xorl %r9d,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %eax,%edi + addl %r12d,%r10d + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + psrld $10,%xmm7 + addl %r13d,%r10d + xorl %eax,%r15d + paddd %xmm4,%xmm2 + rorl $2,%r14d + addl %r10d,%ecx + psrlq $17,%xmm6 + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %ecx,%r13d + xorl %r8d,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %ecx,%r13d + addl 40(%rsp),%r9d + movl %r10d,%r15d + psrldq $8,%xmm7 + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + paddd %xmm7,%xmm2 + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + pshufd $80,%xmm2,%xmm7 + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + movdqa %xmm7,%xmm6 + addl %edi,%r9d + movl %ebx,%r13d + psrld $10,%xmm7 + addl %r9d,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%r9d + movl %ecx,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + psrlq $2,%xmm6 + andl %ebx,%r12d + xorl %ebx,%r13d + addl 44(%rsp),%r8d + pxor %xmm6,%xmm7 + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %r10d,%edi + addl %r12d,%r8d + movdqa 64(%rbp),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + paddd %xmm7,%xmm2 + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + paddd %xmm2,%xmm6 + movl %eax,%r13d + addl %r8d,%r14d + movdqa %xmm6,32(%rsp) + rorl $14,%r13d + movdqa %xmm0,%xmm4 + movl %r14d,%r8d + movl %ebx,%r12d + movdqa %xmm2,%xmm7 + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d +.byte 102,15,58,15,227,4 + andl %eax,%r12d + xorl %eax,%r13d +.byte 102,15,58,15,249,4 + addl 48(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %r9d,%r15d + addl %r12d,%edx + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + paddd %xmm7,%xmm3 + rorl $2,%r14d + addl %edx,%r11d + psrld $7,%xmm6 + addl %edi,%edx + movl %r11d,%r13d + pshufd $250,%xmm2,%xmm7 + addl %edx,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%edx + movl %eax,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %edx,%r14d + pxor %xmm5,%xmm4 + andl %r11d,%r12d + xorl %r11d,%r13d + pslld $11,%xmm5 + addl 52(%rsp),%ecx + movl %edx,%edi + pxor %xmm6,%xmm4 + xorl %ebx,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %r8d,%edi + addl %r12d,%ecx + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + psrld $10,%xmm7 + addl %r13d,%ecx + xorl %r8d,%r15d + paddd %xmm4,%xmm3 + rorl $2,%r14d + addl %ecx,%r10d + psrlq $17,%xmm6 + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %r10d,%r13d + xorl %eax,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %r10d,%r13d + addl 56(%rsp),%ebx + movl %ecx,%r15d + psrldq $8,%xmm7 + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + paddd %xmm7,%xmm3 + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + pshufd $80,%xmm3,%xmm7 + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + movdqa %xmm7,%xmm6 + addl %edi,%ebx + movl %r9d,%r13d + psrld $10,%xmm7 + addl %ebx,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%ebx + movl %r10d,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + psrlq $2,%xmm6 + andl %r9d,%r12d + xorl %r9d,%r13d + addl 60(%rsp),%eax + pxor %xmm6,%xmm7 + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %ecx,%edi + addl %r12d,%eax + movdqa 96(%rbp),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + paddd %xmm7,%xmm3 + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + paddd %xmm3,%xmm6 + movl %r8d,%r13d + addl %eax,%r14d + movdqa %xmm6,48(%rsp) + cmpb $0,131(%rbp) + jne .Lssse3_00_47 + rorl $14,%r13d + movl %r14d,%eax + movl %r9d,%r12d + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + xorl %r8d,%r13d + addl 0(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + xorl %ebx,%r15d + addl %r12d,%r11d + rorl $6,%r13d + andl %r15d,%edi + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + rorl $2,%r14d + addl %r11d,%edx + addl %edi,%r11d + movl %edx,%r13d + addl %r11d,%r14d + rorl $14,%r13d + movl %r14d,%r11d + movl %r8d,%r12d + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + addl 4(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + rorl $11,%r14d + xorl %eax,%edi + addl %r12d,%r10d + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + addl %r13d,%r10d + xorl %eax,%r15d + rorl $2,%r14d + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + xorl %ecx,%r13d + xorl %r8d,%r12d + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + xorl %ecx,%r13d + addl 8(%rsp),%r9d + movl %r10d,%r15d + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + rorl $14,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + xorl %ebx,%r13d + addl 12(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + rorl $6,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + rorl $14,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + xorl %eax,%r13d + addl 16(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + xorl %r9d,%r15d + addl %r12d,%edx + rorl $6,%r13d + andl %r15d,%edi + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + rorl $2,%r14d + addl %edx,%r11d + addl %edi,%edx + movl %r11d,%r13d + addl %edx,%r14d + rorl $14,%r13d + movl %r14d,%edx + movl %eax,%r12d + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + addl 20(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + rorl $11,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + addl %r13d,%ecx + xorl %r8d,%r15d + rorl $2,%r14d + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + xorl %r10d,%r13d + xorl %eax,%r12d + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + xorl %r10d,%r13d + addl 24(%rsp),%ebx + movl %ecx,%r15d + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + rorl $14,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + xorl %r9d,%r13d + addl 28(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + xorl %ecx,%edi + addl %r12d,%eax + rorl $6,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + rorl $14,%r13d + movl %r14d,%eax + movl %r9d,%r12d + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + xorl %r8d,%r13d + addl 32(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + xorl %ebx,%r15d + addl %r12d,%r11d + rorl $6,%r13d + andl %r15d,%edi + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + rorl $2,%r14d + addl %r11d,%edx + addl %edi,%r11d + movl %edx,%r13d + addl %r11d,%r14d + rorl $14,%r13d + movl %r14d,%r11d + movl %r8d,%r12d + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + addl 36(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + rorl $11,%r14d + xorl %eax,%edi + addl %r12d,%r10d + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + addl %r13d,%r10d + xorl %eax,%r15d + rorl $2,%r14d + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + xorl %ecx,%r13d + xorl %r8d,%r12d + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + xorl %ecx,%r13d + addl 40(%rsp),%r9d + movl %r10d,%r15d + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + rorl $14,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + xorl %ebx,%r13d + addl 44(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + rorl $6,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + rorl $14,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + xorl %eax,%r13d + addl 48(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + xorl %r9d,%r15d + addl %r12d,%edx + rorl $6,%r13d + andl %r15d,%edi + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + rorl $2,%r14d + addl %edx,%r11d + addl %edi,%edx + movl %r11d,%r13d + addl %edx,%r14d + rorl $14,%r13d + movl %r14d,%edx + movl %eax,%r12d + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + addl 52(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + rorl $11,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + addl %r13d,%ecx + xorl %r8d,%r15d + rorl $2,%r14d + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + xorl %r10d,%r13d + xorl %eax,%r12d + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + xorl %r10d,%r13d + addl 56(%rsp),%ebx + movl %ecx,%r15d + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + rorl $14,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + xorl %r9d,%r13d + addl 60(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + xorl %ecx,%edi + addl %r12d,%eax + rorl $6,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + movq 64+0(%rsp),%rdi + movl %r14d,%eax + + addl 0(%rdi),%eax + leaq 64(%rsi),%rsi + addl 4(%rdi),%ebx + addl 8(%rdi),%ecx + addl 12(%rdi),%edx + addl 16(%rdi),%r8d + addl 20(%rdi),%r9d + addl 24(%rdi),%r10d + addl 28(%rdi),%r11d + + cmpq 64+16(%rsp),%rsi + + movl %eax,0(%rdi) + movl %ebx,4(%rdi) + movl %ecx,8(%rdi) + movl %edx,12(%rdi) + movl %r8d,16(%rdi) + movl %r9d,20(%rdi) + movl %r10d,24(%rdi) + movl %r11d,28(%rdi) + jb .Lloop_ssse3 + + movq 64+24(%rsp),%rsi + movq (%rsi),%r15 + movq 8(%rsi),%r14 + movq 16(%rsi),%r13 + movq 24(%rsi),%r12 + movq 32(%rsi),%rbp + movq 40(%rsi),%rbx + leaq 48(%rsi),%rsp +.Lepilogue_ssse3: + .byte 0xf3,0xc3 +.size sha256_block_data_order_ssse3,.-sha256_block_data_order_ssse3 diff --git a/secure/lib/libcrypto/amd64/sha512-x86_64.S b/secure/lib/libcrypto/amd64/sha512-x86_64.S new file mode 100644 index 0000000..60518d4 --- /dev/null +++ b/secure/lib/libcrypto/amd64/sha512-x86_64.S @@ -0,0 +1,1784 @@ + # $FreeBSD$ +.text + + +.globl sha512_block_data_order +.type sha512_block_data_order,@function +.align 16 +sha512_block_data_order: + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + movq %rsp,%r11 + shlq $4,%rdx + subq $128+32,%rsp + leaq (%rsi,%rdx,8),%rdx + andq $-64,%rsp + movq %rdi,128+0(%rsp) + movq %rsi,128+8(%rsp) + movq %rdx,128+16(%rsp) + movq %r11,128+24(%rsp) +.Lprologue: + + movq 0(%rdi),%rax + movq 8(%rdi),%rbx + movq 16(%rdi),%rcx + movq 24(%rdi),%rdx + movq 32(%rdi),%r8 + movq 40(%rdi),%r9 + movq 48(%rdi),%r10 + movq 56(%rdi),%r11 + jmp .Lloop + +.align 16 +.Lloop: + movq %rbx,%rdi + leaq K512(%rip),%rbp + xorq %rcx,%rdi + movq 0(%rsi),%r12 + movq %r8,%r13 + movq %rax,%r14 + bswapq %r12 + rorq $23,%r13 + movq %r9,%r15 + + xorq %r8,%r13 + rorq $5,%r14 + xorq %r10,%r15 + + movq %r12,0(%rsp) + xorq %rax,%r14 + andq %r8,%r15 + + rorq $4,%r13 + addq %r11,%r12 + xorq %r10,%r15 + + rorq $6,%r14 + xorq %r8,%r13 + addq %r15,%r12 + + movq %rax,%r15 + addq (%rbp),%r12 + xorq %rax,%r14 + + xorq %rbx,%r15 + rorq $14,%r13 + movq %rbx,%r11 + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%r11 + addq %r12,%rdx + addq %r12,%r11 + + leaq 8(%rbp),%rbp + addq %r14,%r11 + movq 8(%rsi),%r12 + movq %rdx,%r13 + movq %r11,%r14 + bswapq %r12 + rorq $23,%r13 + movq %r8,%rdi + + xorq %rdx,%r13 + rorq $5,%r14 + xorq %r9,%rdi + + movq %r12,8(%rsp) + xorq %r11,%r14 + andq %rdx,%rdi + + rorq $4,%r13 + addq %r10,%r12 + xorq %r9,%rdi + + rorq $6,%r14 + xorq %rdx,%r13 + addq %rdi,%r12 + + movq %r11,%rdi + addq (%rbp),%r12 + xorq %r11,%r14 + + xorq %rax,%rdi + rorq $14,%r13 + movq %rax,%r10 + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%r10 + addq %r12,%rcx + addq %r12,%r10 + + leaq 24(%rbp),%rbp + addq %r14,%r10 + movq 16(%rsi),%r12 + movq %rcx,%r13 + movq %r10,%r14 + bswapq %r12 + rorq $23,%r13 + movq %rdx,%r15 + + xorq %rcx,%r13 + rorq $5,%r14 + xorq %r8,%r15 + + movq %r12,16(%rsp) + xorq %r10,%r14 + andq %rcx,%r15 + + rorq $4,%r13 + addq %r9,%r12 + xorq %r8,%r15 + + rorq $6,%r14 + xorq %rcx,%r13 + addq %r15,%r12 + + movq %r10,%r15 + addq (%rbp),%r12 + xorq %r10,%r14 + + xorq %r11,%r15 + rorq $14,%r13 + movq %r11,%r9 + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%r9 + addq %r12,%rbx + addq %r12,%r9 + + leaq 8(%rbp),%rbp + addq %r14,%r9 + movq 24(%rsi),%r12 + movq %rbx,%r13 + movq %r9,%r14 + bswapq %r12 + rorq $23,%r13 + movq %rcx,%rdi + + xorq %rbx,%r13 + rorq $5,%r14 + xorq %rdx,%rdi + + movq %r12,24(%rsp) + xorq %r9,%r14 + andq %rbx,%rdi + + rorq $4,%r13 + addq %r8,%r12 + xorq %rdx,%rdi + + rorq $6,%r14 + xorq %rbx,%r13 + addq %rdi,%r12 + + movq %r9,%rdi + addq (%rbp),%r12 + xorq %r9,%r14 + + xorq %r10,%rdi + rorq $14,%r13 + movq %r10,%r8 + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%r8 + addq %r12,%rax + addq %r12,%r8 + + leaq 24(%rbp),%rbp + addq %r14,%r8 + movq 32(%rsi),%r12 + movq %rax,%r13 + movq %r8,%r14 + bswapq %r12 + rorq $23,%r13 + movq %rbx,%r15 + + xorq %rax,%r13 + rorq $5,%r14 + xorq %rcx,%r15 + + movq %r12,32(%rsp) + xorq %r8,%r14 + andq %rax,%r15 + + rorq $4,%r13 + addq %rdx,%r12 + xorq %rcx,%r15 + + rorq $6,%r14 + xorq %rax,%r13 + addq %r15,%r12 + + movq %r8,%r15 + addq (%rbp),%r12 + xorq %r8,%r14 + + xorq %r9,%r15 + rorq $14,%r13 + movq %r9,%rdx + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%rdx + addq %r12,%r11 + addq %r12,%rdx + + leaq 8(%rbp),%rbp + addq %r14,%rdx + movq 40(%rsi),%r12 + movq %r11,%r13 + movq %rdx,%r14 + bswapq %r12 + rorq $23,%r13 + movq %rax,%rdi + + xorq %r11,%r13 + rorq $5,%r14 + xorq %rbx,%rdi + + movq %r12,40(%rsp) + xorq %rdx,%r14 + andq %r11,%rdi + + rorq $4,%r13 + addq %rcx,%r12 + xorq %rbx,%rdi + + rorq $6,%r14 + xorq %r11,%r13 + addq %rdi,%r12 + + movq %rdx,%rdi + addq (%rbp),%r12 + xorq %rdx,%r14 + + xorq %r8,%rdi + rorq $14,%r13 + movq %r8,%rcx + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%rcx + addq %r12,%r10 + addq %r12,%rcx + + leaq 24(%rbp),%rbp + addq %r14,%rcx + movq 48(%rsi),%r12 + movq %r10,%r13 + movq %rcx,%r14 + bswapq %r12 + rorq $23,%r13 + movq %r11,%r15 + + xorq %r10,%r13 + rorq $5,%r14 + xorq %rax,%r15 + + movq %r12,48(%rsp) + xorq %rcx,%r14 + andq %r10,%r15 + + rorq $4,%r13 + addq %rbx,%r12 + xorq %rax,%r15 + + rorq $6,%r14 + xorq %r10,%r13 + addq %r15,%r12 + + movq %rcx,%r15 + addq (%rbp),%r12 + xorq %rcx,%r14 + + xorq %rdx,%r15 + rorq $14,%r13 + movq %rdx,%rbx + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%rbx + addq %r12,%r9 + addq %r12,%rbx + + leaq 8(%rbp),%rbp + addq %r14,%rbx + movq 56(%rsi),%r12 + movq %r9,%r13 + movq %rbx,%r14 + bswapq %r12 + rorq $23,%r13 + movq %r10,%rdi + + xorq %r9,%r13 + rorq $5,%r14 + xorq %r11,%rdi + + movq %r12,56(%rsp) + xorq %rbx,%r14 + andq %r9,%rdi + + rorq $4,%r13 + addq %rax,%r12 + xorq %r11,%rdi + + rorq $6,%r14 + xorq %r9,%r13 + addq %rdi,%r12 + + movq %rbx,%rdi + addq (%rbp),%r12 + xorq %rbx,%r14 + + xorq %rcx,%rdi + rorq $14,%r13 + movq %rcx,%rax + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%rax + addq %r12,%r8 + addq %r12,%rax + + leaq 24(%rbp),%rbp + addq %r14,%rax + movq 64(%rsi),%r12 + movq %r8,%r13 + movq %rax,%r14 + bswapq %r12 + rorq $23,%r13 + movq %r9,%r15 + + xorq %r8,%r13 + rorq $5,%r14 + xorq %r10,%r15 + + movq %r12,64(%rsp) + xorq %rax,%r14 + andq %r8,%r15 + + rorq $4,%r13 + addq %r11,%r12 + xorq %r10,%r15 + + rorq $6,%r14 + xorq %r8,%r13 + addq %r15,%r12 + + movq %rax,%r15 + addq (%rbp),%r12 + xorq %rax,%r14 + + xorq %rbx,%r15 + rorq $14,%r13 + movq %rbx,%r11 + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%r11 + addq %r12,%rdx + addq %r12,%r11 + + leaq 8(%rbp),%rbp + addq %r14,%r11 + movq 72(%rsi),%r12 + movq %rdx,%r13 + movq %r11,%r14 + bswapq %r12 + rorq $23,%r13 + movq %r8,%rdi + + xorq %rdx,%r13 + rorq $5,%r14 + xorq %r9,%rdi + + movq %r12,72(%rsp) + xorq %r11,%r14 + andq %rdx,%rdi + + rorq $4,%r13 + addq %r10,%r12 + xorq %r9,%rdi + + rorq $6,%r14 + xorq %rdx,%r13 + addq %rdi,%r12 + + movq %r11,%rdi + addq (%rbp),%r12 + xorq %r11,%r14 + + xorq %rax,%rdi + rorq $14,%r13 + movq %rax,%r10 + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%r10 + addq %r12,%rcx + addq %r12,%r10 + + leaq 24(%rbp),%rbp + addq %r14,%r10 + movq 80(%rsi),%r12 + movq %rcx,%r13 + movq %r10,%r14 + bswapq %r12 + rorq $23,%r13 + movq %rdx,%r15 + + xorq %rcx,%r13 + rorq $5,%r14 + xorq %r8,%r15 + + movq %r12,80(%rsp) + xorq %r10,%r14 + andq %rcx,%r15 + + rorq $4,%r13 + addq %r9,%r12 + xorq %r8,%r15 + + rorq $6,%r14 + xorq %rcx,%r13 + addq %r15,%r12 + + movq %r10,%r15 + addq (%rbp),%r12 + xorq %r10,%r14 + + xorq %r11,%r15 + rorq $14,%r13 + movq %r11,%r9 + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%r9 + addq %r12,%rbx + addq %r12,%r9 + + leaq 8(%rbp),%rbp + addq %r14,%r9 + movq 88(%rsi),%r12 + movq %rbx,%r13 + movq %r9,%r14 + bswapq %r12 + rorq $23,%r13 + movq %rcx,%rdi + + xorq %rbx,%r13 + rorq $5,%r14 + xorq %rdx,%rdi + + movq %r12,88(%rsp) + xorq %r9,%r14 + andq %rbx,%rdi + + rorq $4,%r13 + addq %r8,%r12 + xorq %rdx,%rdi + + rorq $6,%r14 + xorq %rbx,%r13 + addq %rdi,%r12 + + movq %r9,%rdi + addq (%rbp),%r12 + xorq %r9,%r14 + + xorq %r10,%rdi + rorq $14,%r13 + movq %r10,%r8 + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%r8 + addq %r12,%rax + addq %r12,%r8 + + leaq 24(%rbp),%rbp + addq %r14,%r8 + movq 96(%rsi),%r12 + movq %rax,%r13 + movq %r8,%r14 + bswapq %r12 + rorq $23,%r13 + movq %rbx,%r15 + + xorq %rax,%r13 + rorq $5,%r14 + xorq %rcx,%r15 + + movq %r12,96(%rsp) + xorq %r8,%r14 + andq %rax,%r15 + + rorq $4,%r13 + addq %rdx,%r12 + xorq %rcx,%r15 + + rorq $6,%r14 + xorq %rax,%r13 + addq %r15,%r12 + + movq %r8,%r15 + addq (%rbp),%r12 + xorq %r8,%r14 + + xorq %r9,%r15 + rorq $14,%r13 + movq %r9,%rdx + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%rdx + addq %r12,%r11 + addq %r12,%rdx + + leaq 8(%rbp),%rbp + addq %r14,%rdx + movq 104(%rsi),%r12 + movq %r11,%r13 + movq %rdx,%r14 + bswapq %r12 + rorq $23,%r13 + movq %rax,%rdi + + xorq %r11,%r13 + rorq $5,%r14 + xorq %rbx,%rdi + + movq %r12,104(%rsp) + xorq %rdx,%r14 + andq %r11,%rdi + + rorq $4,%r13 + addq %rcx,%r12 + xorq %rbx,%rdi + + rorq $6,%r14 + xorq %r11,%r13 + addq %rdi,%r12 + + movq %rdx,%rdi + addq (%rbp),%r12 + xorq %rdx,%r14 + + xorq %r8,%rdi + rorq $14,%r13 + movq %r8,%rcx + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%rcx + addq %r12,%r10 + addq %r12,%rcx + + leaq 24(%rbp),%rbp + addq %r14,%rcx + movq 112(%rsi),%r12 + movq %r10,%r13 + movq %rcx,%r14 + bswapq %r12 + rorq $23,%r13 + movq %r11,%r15 + + xorq %r10,%r13 + rorq $5,%r14 + xorq %rax,%r15 + + movq %r12,112(%rsp) + xorq %rcx,%r14 + andq %r10,%r15 + + rorq $4,%r13 + addq %rbx,%r12 + xorq %rax,%r15 + + rorq $6,%r14 + xorq %r10,%r13 + addq %r15,%r12 + + movq %rcx,%r15 + addq (%rbp),%r12 + xorq %rcx,%r14 + + xorq %rdx,%r15 + rorq $14,%r13 + movq %rdx,%rbx + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%rbx + addq %r12,%r9 + addq %r12,%rbx + + leaq 8(%rbp),%rbp + addq %r14,%rbx + movq 120(%rsi),%r12 + movq %r9,%r13 + movq %rbx,%r14 + bswapq %r12 + rorq $23,%r13 + movq %r10,%rdi + + xorq %r9,%r13 + rorq $5,%r14 + xorq %r11,%rdi + + movq %r12,120(%rsp) + xorq %rbx,%r14 + andq %r9,%rdi + + rorq $4,%r13 + addq %rax,%r12 + xorq %r11,%rdi + + rorq $6,%r14 + xorq %r9,%r13 + addq %rdi,%r12 + + movq %rbx,%rdi + addq (%rbp),%r12 + xorq %rbx,%r14 + + xorq %rcx,%rdi + rorq $14,%r13 + movq %rcx,%rax + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%rax + addq %r12,%r8 + addq %r12,%rax + + leaq 24(%rbp),%rbp + jmp .Lrounds_16_xx +.align 16 +.Lrounds_16_xx: + movq 8(%rsp),%r13 + movq 112(%rsp),%r15 + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%rax + movq %r15,%r14 + rorq $42,%r15 + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%r15 + shrq $6,%r14 + + rorq $19,%r15 + xorq %r13,%r12 + xorq %r14,%r15 + addq 72(%rsp),%r12 + + addq 0(%rsp),%r12 + movq %r8,%r13 + addq %r15,%r12 + movq %rax,%r14 + rorq $23,%r13 + movq %r9,%r15 + + xorq %r8,%r13 + rorq $5,%r14 + xorq %r10,%r15 + + movq %r12,0(%rsp) + xorq %rax,%r14 + andq %r8,%r15 + + rorq $4,%r13 + addq %r11,%r12 + xorq %r10,%r15 + + rorq $6,%r14 + xorq %r8,%r13 + addq %r15,%r12 + + movq %rax,%r15 + addq (%rbp),%r12 + xorq %rax,%r14 + + xorq %rbx,%r15 + rorq $14,%r13 + movq %rbx,%r11 + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%r11 + addq %r12,%rdx + addq %r12,%r11 + + leaq 8(%rbp),%rbp + movq 16(%rsp),%r13 + movq 120(%rsp),%rdi + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%r11 + movq %rdi,%r14 + rorq $42,%rdi + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%rdi + shrq $6,%r14 + + rorq $19,%rdi + xorq %r13,%r12 + xorq %r14,%rdi + addq 80(%rsp),%r12 + + addq 8(%rsp),%r12 + movq %rdx,%r13 + addq %rdi,%r12 + movq %r11,%r14 + rorq $23,%r13 + movq %r8,%rdi + + xorq %rdx,%r13 + rorq $5,%r14 + xorq %r9,%rdi + + movq %r12,8(%rsp) + xorq %r11,%r14 + andq %rdx,%rdi + + rorq $4,%r13 + addq %r10,%r12 + xorq %r9,%rdi + + rorq $6,%r14 + xorq %rdx,%r13 + addq %rdi,%r12 + + movq %r11,%rdi + addq (%rbp),%r12 + xorq %r11,%r14 + + xorq %rax,%rdi + rorq $14,%r13 + movq %rax,%r10 + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%r10 + addq %r12,%rcx + addq %r12,%r10 + + leaq 24(%rbp),%rbp + movq 24(%rsp),%r13 + movq 0(%rsp),%r15 + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%r10 + movq %r15,%r14 + rorq $42,%r15 + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%r15 + shrq $6,%r14 + + rorq $19,%r15 + xorq %r13,%r12 + xorq %r14,%r15 + addq 88(%rsp),%r12 + + addq 16(%rsp),%r12 + movq %rcx,%r13 + addq %r15,%r12 + movq %r10,%r14 + rorq $23,%r13 + movq %rdx,%r15 + + xorq %rcx,%r13 + rorq $5,%r14 + xorq %r8,%r15 + + movq %r12,16(%rsp) + xorq %r10,%r14 + andq %rcx,%r15 + + rorq $4,%r13 + addq %r9,%r12 + xorq %r8,%r15 + + rorq $6,%r14 + xorq %rcx,%r13 + addq %r15,%r12 + + movq %r10,%r15 + addq (%rbp),%r12 + xorq %r10,%r14 + + xorq %r11,%r15 + rorq $14,%r13 + movq %r11,%r9 + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%r9 + addq %r12,%rbx + addq %r12,%r9 + + leaq 8(%rbp),%rbp + movq 32(%rsp),%r13 + movq 8(%rsp),%rdi + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%r9 + movq %rdi,%r14 + rorq $42,%rdi + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%rdi + shrq $6,%r14 + + rorq $19,%rdi + xorq %r13,%r12 + xorq %r14,%rdi + addq 96(%rsp),%r12 + + addq 24(%rsp),%r12 + movq %rbx,%r13 + addq %rdi,%r12 + movq %r9,%r14 + rorq $23,%r13 + movq %rcx,%rdi + + xorq %rbx,%r13 + rorq $5,%r14 + xorq %rdx,%rdi + + movq %r12,24(%rsp) + xorq %r9,%r14 + andq %rbx,%rdi + + rorq $4,%r13 + addq %r8,%r12 + xorq %rdx,%rdi + + rorq $6,%r14 + xorq %rbx,%r13 + addq %rdi,%r12 + + movq %r9,%rdi + addq (%rbp),%r12 + xorq %r9,%r14 + + xorq %r10,%rdi + rorq $14,%r13 + movq %r10,%r8 + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%r8 + addq %r12,%rax + addq %r12,%r8 + + leaq 24(%rbp),%rbp + movq 40(%rsp),%r13 + movq 16(%rsp),%r15 + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%r8 + movq %r15,%r14 + rorq $42,%r15 + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%r15 + shrq $6,%r14 + + rorq $19,%r15 + xorq %r13,%r12 + xorq %r14,%r15 + addq 104(%rsp),%r12 + + addq 32(%rsp),%r12 + movq %rax,%r13 + addq %r15,%r12 + movq %r8,%r14 + rorq $23,%r13 + movq %rbx,%r15 + + xorq %rax,%r13 + rorq $5,%r14 + xorq %rcx,%r15 + + movq %r12,32(%rsp) + xorq %r8,%r14 + andq %rax,%r15 + + rorq $4,%r13 + addq %rdx,%r12 + xorq %rcx,%r15 + + rorq $6,%r14 + xorq %rax,%r13 + addq %r15,%r12 + + movq %r8,%r15 + addq (%rbp),%r12 + xorq %r8,%r14 + + xorq %r9,%r15 + rorq $14,%r13 + movq %r9,%rdx + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%rdx + addq %r12,%r11 + addq %r12,%rdx + + leaq 8(%rbp),%rbp + movq 48(%rsp),%r13 + movq 24(%rsp),%rdi + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%rdx + movq %rdi,%r14 + rorq $42,%rdi + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%rdi + shrq $6,%r14 + + rorq $19,%rdi + xorq %r13,%r12 + xorq %r14,%rdi + addq 112(%rsp),%r12 + + addq 40(%rsp),%r12 + movq %r11,%r13 + addq %rdi,%r12 + movq %rdx,%r14 + rorq $23,%r13 + movq %rax,%rdi + + xorq %r11,%r13 + rorq $5,%r14 + xorq %rbx,%rdi + + movq %r12,40(%rsp) + xorq %rdx,%r14 + andq %r11,%rdi + + rorq $4,%r13 + addq %rcx,%r12 + xorq %rbx,%rdi + + rorq $6,%r14 + xorq %r11,%r13 + addq %rdi,%r12 + + movq %rdx,%rdi + addq (%rbp),%r12 + xorq %rdx,%r14 + + xorq %r8,%rdi + rorq $14,%r13 + movq %r8,%rcx + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%rcx + addq %r12,%r10 + addq %r12,%rcx + + leaq 24(%rbp),%rbp + movq 56(%rsp),%r13 + movq 32(%rsp),%r15 + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%rcx + movq %r15,%r14 + rorq $42,%r15 + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%r15 + shrq $6,%r14 + + rorq $19,%r15 + xorq %r13,%r12 + xorq %r14,%r15 + addq 120(%rsp),%r12 + + addq 48(%rsp),%r12 + movq %r10,%r13 + addq %r15,%r12 + movq %rcx,%r14 + rorq $23,%r13 + movq %r11,%r15 + + xorq %r10,%r13 + rorq $5,%r14 + xorq %rax,%r15 + + movq %r12,48(%rsp) + xorq %rcx,%r14 + andq %r10,%r15 + + rorq $4,%r13 + addq %rbx,%r12 + xorq %rax,%r15 + + rorq $6,%r14 + xorq %r10,%r13 + addq %r15,%r12 + + movq %rcx,%r15 + addq (%rbp),%r12 + xorq %rcx,%r14 + + xorq %rdx,%r15 + rorq $14,%r13 + movq %rdx,%rbx + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%rbx + addq %r12,%r9 + addq %r12,%rbx + + leaq 8(%rbp),%rbp + movq 64(%rsp),%r13 + movq 40(%rsp),%rdi + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%rbx + movq %rdi,%r14 + rorq $42,%rdi + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%rdi + shrq $6,%r14 + + rorq $19,%rdi + xorq %r13,%r12 + xorq %r14,%rdi + addq 0(%rsp),%r12 + + addq 56(%rsp),%r12 + movq %r9,%r13 + addq %rdi,%r12 + movq %rbx,%r14 + rorq $23,%r13 + movq %r10,%rdi + + xorq %r9,%r13 + rorq $5,%r14 + xorq %r11,%rdi + + movq %r12,56(%rsp) + xorq %rbx,%r14 + andq %r9,%rdi + + rorq $4,%r13 + addq %rax,%r12 + xorq %r11,%rdi + + rorq $6,%r14 + xorq %r9,%r13 + addq %rdi,%r12 + + movq %rbx,%rdi + addq (%rbp),%r12 + xorq %rbx,%r14 + + xorq %rcx,%rdi + rorq $14,%r13 + movq %rcx,%rax + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%rax + addq %r12,%r8 + addq %r12,%rax + + leaq 24(%rbp),%rbp + movq 72(%rsp),%r13 + movq 48(%rsp),%r15 + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%rax + movq %r15,%r14 + rorq $42,%r15 + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%r15 + shrq $6,%r14 + + rorq $19,%r15 + xorq %r13,%r12 + xorq %r14,%r15 + addq 8(%rsp),%r12 + + addq 64(%rsp),%r12 + movq %r8,%r13 + addq %r15,%r12 + movq %rax,%r14 + rorq $23,%r13 + movq %r9,%r15 + + xorq %r8,%r13 + rorq $5,%r14 + xorq %r10,%r15 + + movq %r12,64(%rsp) + xorq %rax,%r14 + andq %r8,%r15 + + rorq $4,%r13 + addq %r11,%r12 + xorq %r10,%r15 + + rorq $6,%r14 + xorq %r8,%r13 + addq %r15,%r12 + + movq %rax,%r15 + addq (%rbp),%r12 + xorq %rax,%r14 + + xorq %rbx,%r15 + rorq $14,%r13 + movq %rbx,%r11 + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%r11 + addq %r12,%rdx + addq %r12,%r11 + + leaq 8(%rbp),%rbp + movq 80(%rsp),%r13 + movq 56(%rsp),%rdi + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%r11 + movq %rdi,%r14 + rorq $42,%rdi + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%rdi + shrq $6,%r14 + + rorq $19,%rdi + xorq %r13,%r12 + xorq %r14,%rdi + addq 16(%rsp),%r12 + + addq 72(%rsp),%r12 + movq %rdx,%r13 + addq %rdi,%r12 + movq %r11,%r14 + rorq $23,%r13 + movq %r8,%rdi + + xorq %rdx,%r13 + rorq $5,%r14 + xorq %r9,%rdi + + movq %r12,72(%rsp) + xorq %r11,%r14 + andq %rdx,%rdi + + rorq $4,%r13 + addq %r10,%r12 + xorq %r9,%rdi + + rorq $6,%r14 + xorq %rdx,%r13 + addq %rdi,%r12 + + movq %r11,%rdi + addq (%rbp),%r12 + xorq %r11,%r14 + + xorq %rax,%rdi + rorq $14,%r13 + movq %rax,%r10 + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%r10 + addq %r12,%rcx + addq %r12,%r10 + + leaq 24(%rbp),%rbp + movq 88(%rsp),%r13 + movq 64(%rsp),%r15 + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%r10 + movq %r15,%r14 + rorq $42,%r15 + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%r15 + shrq $6,%r14 + + rorq $19,%r15 + xorq %r13,%r12 + xorq %r14,%r15 + addq 24(%rsp),%r12 + + addq 80(%rsp),%r12 + movq %rcx,%r13 + addq %r15,%r12 + movq %r10,%r14 + rorq $23,%r13 + movq %rdx,%r15 + + xorq %rcx,%r13 + rorq $5,%r14 + xorq %r8,%r15 + + movq %r12,80(%rsp) + xorq %r10,%r14 + andq %rcx,%r15 + + rorq $4,%r13 + addq %r9,%r12 + xorq %r8,%r15 + + rorq $6,%r14 + xorq %rcx,%r13 + addq %r15,%r12 + + movq %r10,%r15 + addq (%rbp),%r12 + xorq %r10,%r14 + + xorq %r11,%r15 + rorq $14,%r13 + movq %r11,%r9 + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%r9 + addq %r12,%rbx + addq %r12,%r9 + + leaq 8(%rbp),%rbp + movq 96(%rsp),%r13 + movq 72(%rsp),%rdi + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%r9 + movq %rdi,%r14 + rorq $42,%rdi + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%rdi + shrq $6,%r14 + + rorq $19,%rdi + xorq %r13,%r12 + xorq %r14,%rdi + addq 32(%rsp),%r12 + + addq 88(%rsp),%r12 + movq %rbx,%r13 + addq %rdi,%r12 + movq %r9,%r14 + rorq $23,%r13 + movq %rcx,%rdi + + xorq %rbx,%r13 + rorq $5,%r14 + xorq %rdx,%rdi + + movq %r12,88(%rsp) + xorq %r9,%r14 + andq %rbx,%rdi + + rorq $4,%r13 + addq %r8,%r12 + xorq %rdx,%rdi + + rorq $6,%r14 + xorq %rbx,%r13 + addq %rdi,%r12 + + movq %r9,%rdi + addq (%rbp),%r12 + xorq %r9,%r14 + + xorq %r10,%rdi + rorq $14,%r13 + movq %r10,%r8 + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%r8 + addq %r12,%rax + addq %r12,%r8 + + leaq 24(%rbp),%rbp + movq 104(%rsp),%r13 + movq 80(%rsp),%r15 + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%r8 + movq %r15,%r14 + rorq $42,%r15 + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%r15 + shrq $6,%r14 + + rorq $19,%r15 + xorq %r13,%r12 + xorq %r14,%r15 + addq 40(%rsp),%r12 + + addq 96(%rsp),%r12 + movq %rax,%r13 + addq %r15,%r12 + movq %r8,%r14 + rorq $23,%r13 + movq %rbx,%r15 + + xorq %rax,%r13 + rorq $5,%r14 + xorq %rcx,%r15 + + movq %r12,96(%rsp) + xorq %r8,%r14 + andq %rax,%r15 + + rorq $4,%r13 + addq %rdx,%r12 + xorq %rcx,%r15 + + rorq $6,%r14 + xorq %rax,%r13 + addq %r15,%r12 + + movq %r8,%r15 + addq (%rbp),%r12 + xorq %r8,%r14 + + xorq %r9,%r15 + rorq $14,%r13 + movq %r9,%rdx + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%rdx + addq %r12,%r11 + addq %r12,%rdx + + leaq 8(%rbp),%rbp + movq 112(%rsp),%r13 + movq 88(%rsp),%rdi + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%rdx + movq %rdi,%r14 + rorq $42,%rdi + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%rdi + shrq $6,%r14 + + rorq $19,%rdi + xorq %r13,%r12 + xorq %r14,%rdi + addq 48(%rsp),%r12 + + addq 104(%rsp),%r12 + movq %r11,%r13 + addq %rdi,%r12 + movq %rdx,%r14 + rorq $23,%r13 + movq %rax,%rdi + + xorq %r11,%r13 + rorq $5,%r14 + xorq %rbx,%rdi + + movq %r12,104(%rsp) + xorq %rdx,%r14 + andq %r11,%rdi + + rorq $4,%r13 + addq %rcx,%r12 + xorq %rbx,%rdi + + rorq $6,%r14 + xorq %r11,%r13 + addq %rdi,%r12 + + movq %rdx,%rdi + addq (%rbp),%r12 + xorq %rdx,%r14 + + xorq %r8,%rdi + rorq $14,%r13 + movq %r8,%rcx + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%rcx + addq %r12,%r10 + addq %r12,%rcx + + leaq 24(%rbp),%rbp + movq 120(%rsp),%r13 + movq 96(%rsp),%r15 + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%rcx + movq %r15,%r14 + rorq $42,%r15 + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%r15 + shrq $6,%r14 + + rorq $19,%r15 + xorq %r13,%r12 + xorq %r14,%r15 + addq 56(%rsp),%r12 + + addq 112(%rsp),%r12 + movq %r10,%r13 + addq %r15,%r12 + movq %rcx,%r14 + rorq $23,%r13 + movq %r11,%r15 + + xorq %r10,%r13 + rorq $5,%r14 + xorq %rax,%r15 + + movq %r12,112(%rsp) + xorq %rcx,%r14 + andq %r10,%r15 + + rorq $4,%r13 + addq %rbx,%r12 + xorq %rax,%r15 + + rorq $6,%r14 + xorq %r10,%r13 + addq %r15,%r12 + + movq %rcx,%r15 + addq (%rbp),%r12 + xorq %rcx,%r14 + + xorq %rdx,%r15 + rorq $14,%r13 + movq %rdx,%rbx + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%rbx + addq %r12,%r9 + addq %r12,%rbx + + leaq 8(%rbp),%rbp + movq 0(%rsp),%r13 + movq 104(%rsp),%rdi + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%rbx + movq %rdi,%r14 + rorq $42,%rdi + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%rdi + shrq $6,%r14 + + rorq $19,%rdi + xorq %r13,%r12 + xorq %r14,%rdi + addq 64(%rsp),%r12 + + addq 120(%rsp),%r12 + movq %r9,%r13 + addq %rdi,%r12 + movq %rbx,%r14 + rorq $23,%r13 + movq %r10,%rdi + + xorq %r9,%r13 + rorq $5,%r14 + xorq %r11,%rdi + + movq %r12,120(%rsp) + xorq %rbx,%r14 + andq %r9,%rdi + + rorq $4,%r13 + addq %rax,%r12 + xorq %r11,%rdi + + rorq $6,%r14 + xorq %r9,%r13 + addq %rdi,%r12 + + movq %rbx,%rdi + addq (%rbp),%r12 + xorq %rbx,%r14 + + xorq %rcx,%rdi + rorq $14,%r13 + movq %rcx,%rax + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%rax + addq %r12,%r8 + addq %r12,%rax + + leaq 24(%rbp),%rbp + cmpb $0,7(%rbp) + jnz .Lrounds_16_xx + + movq 128+0(%rsp),%rdi + addq %r14,%rax + leaq 128(%rsi),%rsi + + addq 0(%rdi),%rax + addq 8(%rdi),%rbx + addq 16(%rdi),%rcx + addq 24(%rdi),%rdx + addq 32(%rdi),%r8 + addq 40(%rdi),%r9 + addq 48(%rdi),%r10 + addq 56(%rdi),%r11 + + cmpq 128+16(%rsp),%rsi + + movq %rax,0(%rdi) + movq %rbx,8(%rdi) + movq %rcx,16(%rdi) + movq %rdx,24(%rdi) + movq %r8,32(%rdi) + movq %r9,40(%rdi) + movq %r10,48(%rdi) + movq %r11,56(%rdi) + jb .Lloop + + movq 128+24(%rsp),%rsi + movq (%rsi),%r15 + movq 8(%rsi),%r14 + movq 16(%rsi),%r13 + movq 24(%rsi),%r12 + movq 32(%rsi),%rbp + movq 40(%rsi),%rbx + leaq 48(%rsi),%rsp +.Lepilogue: + .byte 0xf3,0xc3 +.size sha512_block_data_order,.-sha512_block_data_order +.align 64 +.type K512,@object +K512: +.quad 0x428a2f98d728ae22,0x7137449123ef65cd +.quad 0x428a2f98d728ae22,0x7137449123ef65cd +.quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc +.quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc +.quad 0x3956c25bf348b538,0x59f111f1b605d019 +.quad 0x3956c25bf348b538,0x59f111f1b605d019 +.quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 +.quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 +.quad 0xd807aa98a3030242,0x12835b0145706fbe +.quad 0xd807aa98a3030242,0x12835b0145706fbe +.quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 +.quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 +.quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 +.quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 +.quad 0x9bdc06a725c71235,0xc19bf174cf692694 +.quad 0x9bdc06a725c71235,0xc19bf174cf692694 +.quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 +.quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 +.quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 +.quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 +.quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 +.quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 +.quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 +.quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 +.quad 0x983e5152ee66dfab,0xa831c66d2db43210 +.quad 0x983e5152ee66dfab,0xa831c66d2db43210 +.quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 +.quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 +.quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 +.quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 +.quad 0x06ca6351e003826f,0x142929670a0e6e70 +.quad 0x06ca6351e003826f,0x142929670a0e6e70 +.quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 +.quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 +.quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df +.quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df +.quad 0x650a73548baf63de,0x766a0abb3c77b2a8 +.quad 0x650a73548baf63de,0x766a0abb3c77b2a8 +.quad 0x81c2c92e47edaee6,0x92722c851482353b +.quad 0x81c2c92e47edaee6,0x92722c851482353b +.quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 +.quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 +.quad 0xc24b8b70d0f89791,0xc76c51a30654be30 +.quad 0xc24b8b70d0f89791,0xc76c51a30654be30 +.quad 0xd192e819d6ef5218,0xd69906245565a910 +.quad 0xd192e819d6ef5218,0xd69906245565a910 +.quad 0xf40e35855771202a,0x106aa07032bbd1b8 +.quad 0xf40e35855771202a,0x106aa07032bbd1b8 +.quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 +.quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 +.quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 +.quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 +.quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb +.quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb +.quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 +.quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 +.quad 0x748f82ee5defb2fc,0x78a5636f43172f60 +.quad 0x748f82ee5defb2fc,0x78a5636f43172f60 +.quad 0x84c87814a1f0ab72,0x8cc702081a6439ec +.quad 0x84c87814a1f0ab72,0x8cc702081a6439ec +.quad 0x90befffa23631e28,0xa4506cebde82bde9 +.quad 0x90befffa23631e28,0xa4506cebde82bde9 +.quad 0xbef9a3f7b2c67915,0xc67178f2e372532b +.quad 0xbef9a3f7b2c67915,0xc67178f2e372532b +.quad 0xca273eceea26619c,0xd186b8c721c0c207 +.quad 0xca273eceea26619c,0xd186b8c721c0c207 +.quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 +.quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 +.quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 +.quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 +.quad 0x113f9804bef90dae,0x1b710b35131c471b +.quad 0x113f9804bef90dae,0x1b710b35131c471b +.quad 0x28db77f523047d84,0x32caab7b40c72493 +.quad 0x28db77f523047d84,0x32caab7b40c72493 +.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c +.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c +.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a +.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a +.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 +.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 + +.quad 0x0001020304050607,0x08090a0b0c0d0e0f +.quad 0x0001020304050607,0x08090a0b0c0d0e0f +.byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 diff --git a/secure/lib/libcrypto/amd64/vpaes-x86_64.S b/secure/lib/libcrypto/amd64/vpaes-x86_64.S new file mode 100644 index 0000000..8ec5c40 --- /dev/null +++ b/secure/lib/libcrypto/amd64/vpaes-x86_64.S @@ -0,0 +1,828 @@ + # $FreeBSD$ +.text + + + + + + + + + + + + + + + + +.type _vpaes_encrypt_core,@function +.align 16 +_vpaes_encrypt_core: + movq %rdx,%r9 + movq $16,%r11 + movl 240(%rdx),%eax + movdqa %xmm9,%xmm1 + movdqa .Lk_ipt(%rip),%xmm2 + pandn %xmm0,%xmm1 + movdqu (%r9),%xmm5 + psrld $4,%xmm1 + pand %xmm9,%xmm0 +.byte 102,15,56,0,208 + movdqa .Lk_ipt+16(%rip),%xmm0 +.byte 102,15,56,0,193 + pxor %xmm5,%xmm2 + addq $16,%r9 + pxor %xmm2,%xmm0 + leaq .Lk_mc_backward(%rip),%r10 + jmp .Lenc_entry + +.align 16 +.Lenc_loop: + + movdqa %xmm13,%xmm4 + movdqa %xmm12,%xmm0 +.byte 102,15,56,0,226 +.byte 102,15,56,0,195 + pxor %xmm5,%xmm4 + movdqa %xmm15,%xmm5 + pxor %xmm4,%xmm0 + movdqa -64(%r11,%r10,1),%xmm1 +.byte 102,15,56,0,234 + movdqa (%r11,%r10,1),%xmm4 + movdqa %xmm14,%xmm2 +.byte 102,15,56,0,211 + movdqa %xmm0,%xmm3 + pxor %xmm5,%xmm2 +.byte 102,15,56,0,193 + addq $16,%r9 + pxor %xmm2,%xmm0 +.byte 102,15,56,0,220 + addq $16,%r11 + pxor %xmm0,%xmm3 +.byte 102,15,56,0,193 + andq $48,%r11 + subq $1,%rax + pxor %xmm3,%xmm0 + +.Lenc_entry: + + movdqa %xmm9,%xmm1 + movdqa %xmm11,%xmm5 + pandn %xmm0,%xmm1 + psrld $4,%xmm1 + pand %xmm9,%xmm0 +.byte 102,15,56,0,232 + movdqa %xmm10,%xmm3 + pxor %xmm1,%xmm0 +.byte 102,15,56,0,217 + movdqa %xmm10,%xmm4 + pxor %xmm5,%xmm3 +.byte 102,15,56,0,224 + movdqa %xmm10,%xmm2 + pxor %xmm5,%xmm4 +.byte 102,15,56,0,211 + movdqa %xmm10,%xmm3 + pxor %xmm0,%xmm2 +.byte 102,15,56,0,220 + movdqu (%r9),%xmm5 + pxor %xmm1,%xmm3 + jnz .Lenc_loop + + + movdqa -96(%r10),%xmm4 + movdqa -80(%r10),%xmm0 +.byte 102,15,56,0,226 + pxor %xmm5,%xmm4 +.byte 102,15,56,0,195 + movdqa 64(%r11,%r10,1),%xmm1 + pxor %xmm4,%xmm0 +.byte 102,15,56,0,193 + .byte 0xf3,0xc3 +.size _vpaes_encrypt_core,.-_vpaes_encrypt_core + + + + + + +.type _vpaes_decrypt_core,@function +.align 16 +_vpaes_decrypt_core: + movq %rdx,%r9 + movl 240(%rdx),%eax + movdqa %xmm9,%xmm1 + movdqa .Lk_dipt(%rip),%xmm2 + pandn %xmm0,%xmm1 + movq %rax,%r11 + psrld $4,%xmm1 + movdqu (%r9),%xmm5 + shlq $4,%r11 + pand %xmm9,%xmm0 +.byte 102,15,56,0,208 + movdqa .Lk_dipt+16(%rip),%xmm0 + xorq $48,%r11 + leaq .Lk_dsbd(%rip),%r10 +.byte 102,15,56,0,193 + andq $48,%r11 + pxor %xmm5,%xmm2 + movdqa .Lk_mc_forward+48(%rip),%xmm5 + pxor %xmm2,%xmm0 + addq $16,%r9 + addq %r10,%r11 + jmp .Ldec_entry + +.align 16 +.Ldec_loop: + + + + movdqa -32(%r10),%xmm4 + movdqa -16(%r10),%xmm1 +.byte 102,15,56,0,226 +.byte 102,15,56,0,203 + pxor %xmm4,%xmm0 + movdqa 0(%r10),%xmm4 + pxor %xmm1,%xmm0 + movdqa 16(%r10),%xmm1 + +.byte 102,15,56,0,226 +.byte 102,15,56,0,197 +.byte 102,15,56,0,203 + pxor %xmm4,%xmm0 + movdqa 32(%r10),%xmm4 + pxor %xmm1,%xmm0 + movdqa 48(%r10),%xmm1 + +.byte 102,15,56,0,226 +.byte 102,15,56,0,197 +.byte 102,15,56,0,203 + pxor %xmm4,%xmm0 + movdqa 64(%r10),%xmm4 + pxor %xmm1,%xmm0 + movdqa 80(%r10),%xmm1 + +.byte 102,15,56,0,226 +.byte 102,15,56,0,197 +.byte 102,15,56,0,203 + pxor %xmm4,%xmm0 + addq $16,%r9 +.byte 102,15,58,15,237,12 + pxor %xmm1,%xmm0 + subq $1,%rax + +.Ldec_entry: + + movdqa %xmm9,%xmm1 + pandn %xmm0,%xmm1 + movdqa %xmm11,%xmm2 + psrld $4,%xmm1 + pand %xmm9,%xmm0 +.byte 102,15,56,0,208 + movdqa %xmm10,%xmm3 + pxor %xmm1,%xmm0 +.byte 102,15,56,0,217 + movdqa %xmm10,%xmm4 + pxor %xmm2,%xmm3 +.byte 102,15,56,0,224 + pxor %xmm2,%xmm4 + movdqa %xmm10,%xmm2 +.byte 102,15,56,0,211 + movdqa %xmm10,%xmm3 + pxor %xmm0,%xmm2 +.byte 102,15,56,0,220 + movdqu (%r9),%xmm0 + pxor %xmm1,%xmm3 + jnz .Ldec_loop + + + movdqa 96(%r10),%xmm4 +.byte 102,15,56,0,226 + pxor %xmm0,%xmm4 + movdqa 112(%r10),%xmm0 + movdqa -352(%r11),%xmm2 +.byte 102,15,56,0,195 + pxor %xmm4,%xmm0 +.byte 102,15,56,0,194 + .byte 0xf3,0xc3 +.size _vpaes_decrypt_core,.-_vpaes_decrypt_core + + + + + + +.type _vpaes_schedule_core,@function +.align 16 +_vpaes_schedule_core: + + + + + + call _vpaes_preheat + movdqa .Lk_rcon(%rip),%xmm8 + movdqu (%rdi),%xmm0 + + + movdqa %xmm0,%xmm3 + leaq .Lk_ipt(%rip),%r11 + call _vpaes_schedule_transform + movdqa %xmm0,%xmm7 + + leaq .Lk_sr(%rip),%r10 + testq %rcx,%rcx + jnz .Lschedule_am_decrypting + + + movdqu %xmm0,(%rdx) + jmp .Lschedule_go + +.Lschedule_am_decrypting: + + movdqa (%r8,%r10,1),%xmm1 +.byte 102,15,56,0,217 + movdqu %xmm3,(%rdx) + xorq $48,%r8 + +.Lschedule_go: + cmpl $192,%esi + ja .Lschedule_256 + je .Lschedule_192 + + + + + + + + + + +.Lschedule_128: + movl $10,%esi + +.Loop_schedule_128: + call _vpaes_schedule_round + decq %rsi + jz .Lschedule_mangle_last + call _vpaes_schedule_mangle + jmp .Loop_schedule_128 + + + + + + + + + + + + + + + + +.align 16 +.Lschedule_192: + movdqu 8(%rdi),%xmm0 + call _vpaes_schedule_transform + movdqa %xmm0,%xmm6 + pxor %xmm4,%xmm4 + movhlps %xmm4,%xmm6 + movl $4,%esi + +.Loop_schedule_192: + call _vpaes_schedule_round +.byte 102,15,58,15,198,8 + call _vpaes_schedule_mangle + call _vpaes_schedule_192_smear + call _vpaes_schedule_mangle + call _vpaes_schedule_round + decq %rsi + jz .Lschedule_mangle_last + call _vpaes_schedule_mangle + call _vpaes_schedule_192_smear + jmp .Loop_schedule_192 + + + + + + + + + + + +.align 16 +.Lschedule_256: + movdqu 16(%rdi),%xmm0 + call _vpaes_schedule_transform + movl $7,%esi + +.Loop_schedule_256: + call _vpaes_schedule_mangle + movdqa %xmm0,%xmm6 + + + call _vpaes_schedule_round + decq %rsi + jz .Lschedule_mangle_last + call _vpaes_schedule_mangle + + + pshufd $255,%xmm0,%xmm0 + movdqa %xmm7,%xmm5 + movdqa %xmm6,%xmm7 + call _vpaes_schedule_low_round + movdqa %xmm5,%xmm7 + + jmp .Loop_schedule_256 + + + + + + + + + + + + +.align 16 +.Lschedule_mangle_last: + + leaq .Lk_deskew(%rip),%r11 + testq %rcx,%rcx + jnz .Lschedule_mangle_last_dec + + + movdqa (%r8,%r10,1),%xmm1 +.byte 102,15,56,0,193 + leaq .Lk_opt(%rip),%r11 + addq $32,%rdx + +.Lschedule_mangle_last_dec: + addq $-16,%rdx + pxor .Lk_s63(%rip),%xmm0 + call _vpaes_schedule_transform + movdqu %xmm0,(%rdx) + + + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + .byte 0xf3,0xc3 +.size _vpaes_schedule_core,.-_vpaes_schedule_core + + + + + + + + + + + + + + + +.type _vpaes_schedule_192_smear,@function +.align 16 +_vpaes_schedule_192_smear: + pshufd $128,%xmm6,%xmm1 + pshufd $254,%xmm7,%xmm0 + pxor %xmm1,%xmm6 + pxor %xmm1,%xmm1 + pxor %xmm0,%xmm6 + movdqa %xmm6,%xmm0 + movhlps %xmm1,%xmm6 + .byte 0xf3,0xc3 +.size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear + + + + + + + + + + + + + + + + + + + +.type _vpaes_schedule_round,@function +.align 16 +_vpaes_schedule_round: + + pxor %xmm1,%xmm1 +.byte 102,65,15,58,15,200,15 +.byte 102,69,15,58,15,192,15 + pxor %xmm1,%xmm7 + + + pshufd $255,%xmm0,%xmm0 +.byte 102,15,58,15,192,1 + + + + +_vpaes_schedule_low_round: + + movdqa %xmm7,%xmm1 + pslldq $4,%xmm7 + pxor %xmm1,%xmm7 + movdqa %xmm7,%xmm1 + pslldq $8,%xmm7 + pxor %xmm1,%xmm7 + pxor .Lk_s63(%rip),%xmm7 + + + movdqa %xmm9,%xmm1 + pandn %xmm0,%xmm1 + psrld $4,%xmm1 + pand %xmm9,%xmm0 + movdqa %xmm11,%xmm2 +.byte 102,15,56,0,208 + pxor %xmm1,%xmm0 + movdqa %xmm10,%xmm3 +.byte 102,15,56,0,217 + pxor %xmm2,%xmm3 + movdqa %xmm10,%xmm4 +.byte 102,15,56,0,224 + pxor %xmm2,%xmm4 + movdqa %xmm10,%xmm2 +.byte 102,15,56,0,211 + pxor %xmm0,%xmm2 + movdqa %xmm10,%xmm3 +.byte 102,15,56,0,220 + pxor %xmm1,%xmm3 + movdqa %xmm13,%xmm4 +.byte 102,15,56,0,226 + movdqa %xmm12,%xmm0 +.byte 102,15,56,0,195 + pxor %xmm4,%xmm0 + + + pxor %xmm7,%xmm0 + movdqa %xmm0,%xmm7 + .byte 0xf3,0xc3 +.size _vpaes_schedule_round,.-_vpaes_schedule_round + + + + + + + + + + +.type _vpaes_schedule_transform,@function +.align 16 +_vpaes_schedule_transform: + movdqa %xmm9,%xmm1 + pandn %xmm0,%xmm1 + psrld $4,%xmm1 + pand %xmm9,%xmm0 + movdqa (%r11),%xmm2 +.byte 102,15,56,0,208 + movdqa 16(%r11),%xmm0 +.byte 102,15,56,0,193 + pxor %xmm2,%xmm0 + .byte 0xf3,0xc3 +.size _vpaes_schedule_transform,.-_vpaes_schedule_transform + + + + + + + + + + + + + + + + + + + + + + + + +.type _vpaes_schedule_mangle,@function +.align 16 +_vpaes_schedule_mangle: + movdqa %xmm0,%xmm4 + movdqa .Lk_mc_forward(%rip),%xmm5 + testq %rcx,%rcx + jnz .Lschedule_mangle_dec + + + addq $16,%rdx + pxor .Lk_s63(%rip),%xmm4 +.byte 102,15,56,0,229 + movdqa %xmm4,%xmm3 +.byte 102,15,56,0,229 + pxor %xmm4,%xmm3 +.byte 102,15,56,0,229 + pxor %xmm4,%xmm3 + + jmp .Lschedule_mangle_both +.align 16 +.Lschedule_mangle_dec: + + leaq .Lk_dksd(%rip),%r11 + movdqa %xmm9,%xmm1 + pandn %xmm4,%xmm1 + psrld $4,%xmm1 + pand %xmm9,%xmm4 + + movdqa 0(%r11),%xmm2 +.byte 102,15,56,0,212 + movdqa 16(%r11),%xmm3 +.byte 102,15,56,0,217 + pxor %xmm2,%xmm3 +.byte 102,15,56,0,221 + + movdqa 32(%r11),%xmm2 +.byte 102,15,56,0,212 + pxor %xmm3,%xmm2 + movdqa 48(%r11),%xmm3 +.byte 102,15,56,0,217 + pxor %xmm2,%xmm3 +.byte 102,15,56,0,221 + + movdqa 64(%r11),%xmm2 +.byte 102,15,56,0,212 + pxor %xmm3,%xmm2 + movdqa 80(%r11),%xmm3 +.byte 102,15,56,0,217 + pxor %xmm2,%xmm3 +.byte 102,15,56,0,221 + + movdqa 96(%r11),%xmm2 +.byte 102,15,56,0,212 + pxor %xmm3,%xmm2 + movdqa 112(%r11),%xmm3 +.byte 102,15,56,0,217 + pxor %xmm2,%xmm3 + + addq $-16,%rdx + +.Lschedule_mangle_both: + movdqa (%r8,%r10,1),%xmm1 +.byte 102,15,56,0,217 + addq $-16,%r8 + andq $48,%r8 + movdqu %xmm3,(%rdx) + .byte 0xf3,0xc3 +.size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle + + + + +.globl vpaes_set_encrypt_key +.type vpaes_set_encrypt_key,@function +.align 16 +vpaes_set_encrypt_key: + movl %esi,%eax + shrl $5,%eax + addl $5,%eax + movl %eax,240(%rdx) + + movl $0,%ecx + movl $48,%r8d + call _vpaes_schedule_core + xorl %eax,%eax + .byte 0xf3,0xc3 +.size vpaes_set_encrypt_key,.-vpaes_set_encrypt_key + +.globl vpaes_set_decrypt_key +.type vpaes_set_decrypt_key,@function +.align 16 +vpaes_set_decrypt_key: + movl %esi,%eax + shrl $5,%eax + addl $5,%eax + movl %eax,240(%rdx) + shll $4,%eax + leaq 16(%rdx,%rax,1),%rdx + + movl $1,%ecx + movl %esi,%r8d + shrl $1,%r8d + andl $32,%r8d + xorl $32,%r8d + call _vpaes_schedule_core + xorl %eax,%eax + .byte 0xf3,0xc3 +.size vpaes_set_decrypt_key,.-vpaes_set_decrypt_key + +.globl vpaes_encrypt +.type vpaes_encrypt,@function +.align 16 +vpaes_encrypt: + movdqu (%rdi),%xmm0 + call _vpaes_preheat + call _vpaes_encrypt_core + movdqu %xmm0,(%rsi) + .byte 0xf3,0xc3 +.size vpaes_encrypt,.-vpaes_encrypt + +.globl vpaes_decrypt +.type vpaes_decrypt,@function +.align 16 +vpaes_decrypt: + movdqu (%rdi),%xmm0 + call _vpaes_preheat + call _vpaes_decrypt_core + movdqu %xmm0,(%rsi) + .byte 0xf3,0xc3 +.size vpaes_decrypt,.-vpaes_decrypt +.globl vpaes_cbc_encrypt +.type vpaes_cbc_encrypt,@function +.align 16 +vpaes_cbc_encrypt: + xchgq %rcx,%rdx + subq $16,%rcx + jc .Lcbc_abort + movdqu (%r8),%xmm6 + subq %rdi,%rsi + call _vpaes_preheat + cmpl $0,%r9d + je .Lcbc_dec_loop + jmp .Lcbc_enc_loop +.align 16 +.Lcbc_enc_loop: + movdqu (%rdi),%xmm0 + pxor %xmm6,%xmm0 + call _vpaes_encrypt_core + movdqa %xmm0,%xmm6 + movdqu %xmm0,(%rsi,%rdi,1) + leaq 16(%rdi),%rdi + subq $16,%rcx + jnc .Lcbc_enc_loop + jmp .Lcbc_done +.align 16 +.Lcbc_dec_loop: + movdqu (%rdi),%xmm0 + movdqa %xmm0,%xmm7 + call _vpaes_decrypt_core + pxor %xmm6,%xmm0 + movdqa %xmm7,%xmm6 + movdqu %xmm0,(%rsi,%rdi,1) + leaq 16(%rdi),%rdi + subq $16,%rcx + jnc .Lcbc_dec_loop +.Lcbc_done: + movdqu %xmm6,(%r8) +.Lcbc_abort: + .byte 0xf3,0xc3 +.size vpaes_cbc_encrypt,.-vpaes_cbc_encrypt + + + + + + +.type _vpaes_preheat,@function +.align 16 +_vpaes_preheat: + leaq .Lk_s0F(%rip),%r10 + movdqa -32(%r10),%xmm10 + movdqa -16(%r10),%xmm11 + movdqa 0(%r10),%xmm9 + movdqa 48(%r10),%xmm13 + movdqa 64(%r10),%xmm12 + movdqa 80(%r10),%xmm15 + movdqa 96(%r10),%xmm14 + .byte 0xf3,0xc3 +.size _vpaes_preheat,.-_vpaes_preheat + + + + + +.type _vpaes_consts,@object +.align 64 +_vpaes_consts: +.Lk_inv: +.quad 0x0E05060F0D080180, 0x040703090A0B0C02 +.quad 0x01040A060F0B0780, 0x030D0E0C02050809 + +.Lk_s0F: +.quad 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F + +.Lk_ipt: +.quad 0xC2B2E8985A2A7000, 0xCABAE09052227808 +.quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81 + +.Lk_sb1: +.quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544 +.quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF +.Lk_sb2: +.quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD +.quad 0x69EB88400AE12900, 0xC2A163C8AB82234A +.Lk_sbo: +.quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878 +.quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA + +.Lk_mc_forward: +.quad 0x0407060500030201, 0x0C0F0E0D080B0A09 +.quad 0x080B0A0904070605, 0x000302010C0F0E0D +.quad 0x0C0F0E0D080B0A09, 0x0407060500030201 +.quad 0x000302010C0F0E0D, 0x080B0A0904070605 + +.Lk_mc_backward: +.quad 0x0605040702010003, 0x0E0D0C0F0A09080B +.quad 0x020100030E0D0C0F, 0x0A09080B06050407 +.quad 0x0E0D0C0F0A09080B, 0x0605040702010003 +.quad 0x0A09080B06050407, 0x020100030E0D0C0F + +.Lk_sr: +.quad 0x0706050403020100, 0x0F0E0D0C0B0A0908 +.quad 0x030E09040F0A0500, 0x0B06010C07020D08 +.quad 0x0F060D040B020900, 0x070E050C030A0108 +.quad 0x0B0E0104070A0D00, 0x0306090C0F020508 + +.Lk_rcon: +.quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81 + +.Lk_s63: +.quad 0x5B5B5B5B5B5B5B5B, 0x5B5B5B5B5B5B5B5B + +.Lk_opt: +.quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808 +.quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0 + +.Lk_deskew: +.quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A +.quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77 + + + + + +.Lk_dksd: +.quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9 +.quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E +.Lk_dksb: +.quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99 +.quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8 +.Lk_dkse: +.quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086 +.quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487 +.Lk_dks9: +.quad 0xB6116FC87ED9A700, 0x4AED933482255BFC +.quad 0x4576516227143300, 0x8BB89FACE9DAFDCE + + + + + +.Lk_dipt: +.quad 0x0F505B040B545F00, 0x154A411E114E451A +.quad 0x86E383E660056500, 0x12771772F491F194 + +.Lk_dsb9: +.quad 0x851C03539A86D600, 0xCAD51F504F994CC9 +.quad 0xC03B1789ECD74900, 0x725E2C9EB2FBA565 +.Lk_dsbd: +.quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439 +.quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3 +.Lk_dsbb: +.quad 0xD022649296B44200, 0x602646F6B0F2D404 +.quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B +.Lk_dsbe: +.quad 0x46F2929626D4D000, 0x2242600464B4F6B0 +.quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32 +.Lk_dsbo: +.quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D +.quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C +.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,120,56,54,95,54,52,47,83,83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0 +.align 64 +.size _vpaes_consts,.-_vpaes_consts diff --git a/secure/lib/libcrypto/amd64/wp-x86_64.S b/secure/lib/libcrypto/amd64/wp-x86_64.S new file mode 100644 index 0000000..36f5bc0 --- /dev/null +++ b/secure/lib/libcrypto/amd64/wp-x86_64.S @@ -0,0 +1,862 @@ + # $FreeBSD$ +.text + +.globl whirlpool_block +.type whirlpool_block,@function +.align 16 +whirlpool_block: + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + movq %rsp,%r11 + subq $128+40,%rsp + andq $-64,%rsp + + leaq 128(%rsp),%r10 + movq %rdi,0(%r10) + movq %rsi,8(%r10) + movq %rdx,16(%r10) + movq %r11,32(%r10) +.Lprologue: + + movq %r10,%rbx + leaq .Ltable(%rip),%rbp + + xorq %rcx,%rcx + xorq %rdx,%rdx + movq 0(%rdi),%r8 + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + movq 32(%rdi),%r12 + movq 40(%rdi),%r13 + movq 48(%rdi),%r14 + movq 56(%rdi),%r15 +.Louterloop: + movq %r8,0(%rsp) + movq %r9,8(%rsp) + movq %r10,16(%rsp) + movq %r11,24(%rsp) + movq %r12,32(%rsp) + movq %r13,40(%rsp) + movq %r14,48(%rsp) + movq %r15,56(%rsp) + xorq 0(%rsi),%r8 + xorq 8(%rsi),%r9 + xorq 16(%rsi),%r10 + xorq 24(%rsi),%r11 + xorq 32(%rsi),%r12 + xorq 40(%rsi),%r13 + xorq 48(%rsi),%r14 + xorq 56(%rsi),%r15 + movq %r8,64+0(%rsp) + movq %r9,64+8(%rsp) + movq %r10,64+16(%rsp) + movq %r11,64+24(%rsp) + movq %r12,64+32(%rsp) + movq %r13,64+40(%rsp) + movq %r14,64+48(%rsp) + movq %r15,64+56(%rsp) + xorq %rsi,%rsi + movq %rsi,24(%rbx) + jmp .Lround +.align 16 +.Lround: + movq 4096(%rbp,%rsi,8),%r8 + movl 0(%rsp),%eax + movl 4(%rsp),%ebx + movzbl %al,%ecx + movzbl %ah,%edx + shrl $16,%eax + leaq (%rcx,%rcx,1),%rsi + movzbl %al,%ecx + leaq (%rdx,%rdx,1),%rdi + movzbl %ah,%edx + xorq 0(%rbp,%rsi,8),%r8 + movq 7(%rbp,%rdi,8),%r9 + movl 0+8(%rsp),%eax + leaq (%rcx,%rcx,1),%rsi + movzbl %bl,%ecx + leaq (%rdx,%rdx,1),%rdi + movzbl %bh,%edx + movq 6(%rbp,%rsi,8),%r10 + movq 5(%rbp,%rdi,8),%r11 + shrl $16,%ebx + leaq (%rcx,%rcx,1),%rsi + movzbl %bl,%ecx + leaq (%rdx,%rdx,1),%rdi + movzbl %bh,%edx + movq 4(%rbp,%rsi,8),%r12 + movq 3(%rbp,%rdi,8),%r13 + movl 0+8+4(%rsp),%ebx + leaq (%rcx,%rcx,1),%rsi + movzbl %al,%ecx + leaq (%rdx,%rdx,1),%rdi + movzbl %ah,%edx + movq 2(%rbp,%rsi,8),%r14 + movq 1(%rbp,%rdi,8),%r15 + shrl $16,%eax + leaq (%rcx,%rcx,1),%rsi + movzbl %al,%ecx + leaq (%rdx,%rdx,1),%rdi + movzbl %ah,%edx + xorq 0(%rbp,%rsi,8),%r9 + xorq 7(%rbp,%rdi,8),%r10 + movl 8+8(%rsp),%eax + leaq (%rcx,%rcx,1),%rsi + movzbl %bl,%ecx + leaq (%rdx,%rdx,1),%rdi + movzbl %bh,%edx + xorq 6(%rbp,%rsi,8),%r11 + xorq 5(%rbp,%rdi,8),%r12 + shrl $16,%ebx + leaq (%rcx,%rcx,1),%rsi + movzbl %bl,%ecx + leaq (%rdx,%rdx,1),%rdi + movzbl %bh,%edx + xorq 4(%rbp,%rsi,8),%r13 + xorq 3(%rbp,%rdi,8),%r14 + movl 8+8+4(%rsp),%ebx + leaq (%rcx,%rcx,1),%rsi + movzbl %al,%ecx + leaq (%rdx,%rdx,1),%rdi + movzbl %ah,%edx + xorq 2(%rbp,%rsi,8),%r15 + xorq 1(%rbp,%rdi,8),%r8 + shrl $16,%eax + leaq (%rcx,%rcx,1),%rsi + movzbl %al,%ecx + leaq (%rdx,%rdx,1),%rdi + movzbl %ah,%edx + xorq 0(%rbp,%rsi,8),%r10 + xorq 7(%rbp,%rdi,8),%r11 + movl 16+8(%rsp),%eax + leaq (%rcx,%rcx,1),%rsi + movzbl %bl,%ecx + leaq (%rdx,%rdx,1),%rdi + movzbl %bh,%edx + xorq 6(%rbp,%rsi,8),%r12 + xorq 5(%rbp,%rdi,8),%r13 + shrl $16,%ebx + leaq (%rcx,%rcx,1),%rsi + movzbl %bl,%ecx + leaq (%rdx,%rdx,1),%rdi + movzbl %bh,%edx + xorq 4(%rbp,%rsi,8),%r14 + xorq 3(%rbp,%rdi,8),%r15 + movl 16+8+4(%rsp),%ebx + leaq (%rcx,%rcx,1),%rsi + movzbl %al,%ecx + leaq (%rdx,%rdx,1),%rdi + movzbl %ah,%edx + xorq 2(%rbp,%rsi,8),%r8 + xorq 1(%rbp,%rdi,8),%r9 + shrl $16,%eax + leaq (%rcx,%rcx,1),%rsi + movzbl %al,%ecx + leaq (%rdx,%rdx,1),%rdi + movzbl %ah,%edx + xorq 0(%rbp,%rsi,8),%r11 + xorq 7(%rbp,%rdi,8),%r12 + movl 24+8(%rsp),%eax + leaq (%rcx,%rcx,1),%rsi + movzbl %bl,%ecx + leaq (%rdx,%rdx,1),%rdi + movzbl %bh,%edx + xorq 6(%rbp,%rsi,8),%r13 + xorq 5(%rbp,%rdi,8),%r14 + shrl $16,%ebx + leaq (%rcx,%rcx,1),%rsi + movzbl %bl,%ecx + leaq (%rdx,%rdx,1),%rdi + movzbl %bh,%edx + xorq 4(%rbp,%rsi,8),%r15 + xorq 3(%rbp,%rdi,8),%r8 + movl 24+8+4(%rsp),%ebx + leaq (%rcx,%rcx,1),%rsi + movzbl %al,%ecx + leaq (%rdx,%rdx,1),%rdi + movzbl %ah,%edx + xorq 2(%rbp,%rsi,8),%r9 + xorq 1(%rbp,%rdi,8),%r10 + shrl $16,%eax + leaq (%rcx,%rcx,1),%rsi + movzbl %al,%ecx + leaq (%rdx,%rdx,1),%rdi + movzbl %ah,%edx + xorq 0(%rbp,%rsi,8),%r12 + xorq 7(%rbp,%rdi,8),%r13 + movl 32+8(%rsp),%eax + leaq (%rcx,%rcx,1),%rsi + movzbl %bl,%ecx + leaq (%rdx,%rdx,1),%rdi + movzbl %bh,%edx + xorq 6(%rbp,%rsi,8),%r14 + xorq 5(%rbp,%rdi,8),%r15 + shrl $16,%ebx + leaq (%rcx,%rcx,1),%rsi + movzbl %bl,%ecx + leaq (%rdx,%rdx,1),%rdi + movzbl %bh,%edx + xorq 4(%rbp,%rsi,8),%r8 + xorq 3(%rbp,%rdi,8),%r9 + movl 32+8+4(%rsp),%ebx + leaq (%rcx,%rcx,1),%rsi + movzbl %al,%ecx + leaq (%rdx,%rdx,1),%rdi + movzbl %ah,%edx + xorq 2(%rbp,%rsi,8),%r10 + xorq 1(%rbp,%rdi,8),%r11 + shrl $16,%eax + leaq (%rcx,%rcx,1),%rsi + movzbl %al,%ecx + leaq (%rdx,%rdx,1),%rdi + movzbl %ah,%edx + xorq 0(%rbp,%rsi,8),%r13 + xorq 7(%rbp,%rdi,8),%r14 + movl 40+8(%rsp),%eax + leaq (%rcx,%rcx,1),%rsi + movzbl %bl,%ecx + leaq (%rdx,%rdx,1),%rdi + movzbl %bh,%edx + xorq 6(%rbp,%rsi,8),%r15 + xorq 5(%rbp,%rdi,8),%r8 + shrl $16,%ebx + leaq (%rcx,%rcx,1),%rsi + movzbl %bl,%ecx + leaq (%rdx,%rdx,1),%rdi + movzbl %bh,%edx + xorq 4(%rbp,%rsi,8),%r9 + xorq 3(%rbp,%rdi,8),%r10 + movl 40+8+4(%rsp),%ebx + leaq (%rcx,%rcx,1),%rsi + movzbl %al,%ecx + leaq (%rdx,%rdx,1),%rdi + movzbl %ah,%edx + xorq 2(%rbp,%rsi,8),%r11 + xorq 1(%rbp,%rdi,8),%r12 + shrl $16,%eax + leaq (%rcx,%rcx,1),%rsi + movzbl %al,%ecx + leaq (%rdx,%rdx,1),%rdi + movzbl %ah,%edx + xorq 0(%rbp,%rsi,8),%r14 + xorq 7(%rbp,%rdi,8),%r15 + movl 48+8(%rsp),%eax + leaq (%rcx,%rcx,1),%rsi + movzbl %bl,%ecx + leaq (%rdx,%rdx,1),%rdi + movzbl %bh,%edx + xorq 6(%rbp,%rsi,8),%r8 + xorq 5(%rbp,%rdi,8),%r9 + shrl $16,%ebx + leaq (%rcx,%rcx,1),%rsi + movzbl %bl,%ecx + leaq (%rdx,%rdx,1),%rdi + movzbl %bh,%edx + xorq 4(%rbp,%rsi,8),%r10 + xorq 3(%rbp,%rdi,8),%r11 + movl 48+8+4(%rsp),%ebx + leaq (%rcx,%rcx,1),%rsi + movzbl %al,%ecx + leaq (%rdx,%rdx,1),%rdi + movzbl %ah,%edx + xorq 2(%rbp,%rsi,8),%r12 + xorq 1(%rbp,%rdi,8),%r13 + shrl $16,%eax + leaq (%rcx,%rcx,1),%rsi + movzbl %al,%ecx + leaq (%rdx,%rdx,1),%rdi + movzbl %ah,%edx + xorq 0(%rbp,%rsi,8),%r15 + xorq 7(%rbp,%rdi,8),%r8 + movl 56+8(%rsp),%eax + leaq (%rcx,%rcx,1),%rsi + movzbl %bl,%ecx + leaq (%rdx,%rdx,1),%rdi + movzbl %bh,%edx + xorq 6(%rbp,%rsi,8),%r9 + xorq 5(%rbp,%rdi,8),%r10 + shrl $16,%ebx + leaq (%rcx,%rcx,1),%rsi + movzbl %bl,%ecx + leaq (%rdx,%rdx,1),%rdi + movzbl %bh,%edx + xorq 4(%rbp,%rsi,8),%r11 + xorq 3(%rbp,%rdi,8),%r12 + movl 56+8+4(%rsp),%ebx + leaq (%rcx,%rcx,1),%rsi + movzbl %al,%ecx + leaq (%rdx,%rdx,1),%rdi + movzbl %ah,%edx + xorq 2(%rbp,%rsi,8),%r13 + xorq 1(%rbp,%rdi,8),%r14 + movq %r8,0(%rsp) + movq %r9,8(%rsp) + movq %r10,16(%rsp) + movq %r11,24(%rsp) + movq %r12,32(%rsp) + movq %r13,40(%rsp) + movq %r14,48(%rsp) + movq %r15,56(%rsp) + shrl $16,%eax + leaq (%rcx,%rcx,1),%rsi + movzbl %al,%ecx + leaq (%rdx,%rdx,1),%rdi + movzbl %ah,%edx + xorq 0(%rbp,%rsi,8),%r8 + xorq 7(%rbp,%rdi,8),%r9 + movl 64+0+8(%rsp),%eax + leaq (%rcx,%rcx,1),%rsi + movzbl %bl,%ecx + leaq (%rdx,%rdx,1),%rdi + movzbl %bh,%edx + xorq 6(%rbp,%rsi,8),%r10 + xorq 5(%rbp,%rdi,8),%r11 + shrl $16,%ebx + leaq (%rcx,%rcx,1),%rsi + movzbl %bl,%ecx + leaq (%rdx,%rdx,1),%rdi + movzbl %bh,%edx + xorq 4(%rbp,%rsi,8),%r12 + xorq 3(%rbp,%rdi,8),%r13 + movl 64+0+8+4(%rsp),%ebx + leaq (%rcx,%rcx,1),%rsi + movzbl %al,%ecx + leaq (%rdx,%rdx,1),%rdi + movzbl %ah,%edx + xorq 2(%rbp,%rsi,8),%r14 + xorq 1(%rbp,%rdi,8),%r15 + shrl $16,%eax + leaq (%rcx,%rcx,1),%rsi + movzbl %al,%ecx + leaq (%rdx,%rdx,1),%rdi + movzbl %ah,%edx + xorq 0(%rbp,%rsi,8),%r9 + xorq 7(%rbp,%rdi,8),%r10 + movl 64+8+8(%rsp),%eax + leaq (%rcx,%rcx,1),%rsi + movzbl %bl,%ecx + leaq (%rdx,%rdx,1),%rdi + movzbl %bh,%edx + xorq 6(%rbp,%rsi,8),%r11 + xorq 5(%rbp,%rdi,8),%r12 + shrl $16,%ebx + leaq (%rcx,%rcx,1),%rsi + movzbl %bl,%ecx + leaq (%rdx,%rdx,1),%rdi + movzbl %bh,%edx + xorq 4(%rbp,%rsi,8),%r13 + xorq 3(%rbp,%rdi,8),%r14 + movl 64+8+8+4(%rsp),%ebx + leaq (%rcx,%rcx,1),%rsi + movzbl %al,%ecx + leaq (%rdx,%rdx,1),%rdi + movzbl %ah,%edx + xorq 2(%rbp,%rsi,8),%r15 + xorq 1(%rbp,%rdi,8),%r8 + shrl $16,%eax + leaq (%rcx,%rcx,1),%rsi + movzbl %al,%ecx + leaq (%rdx,%rdx,1),%rdi + movzbl %ah,%edx + xorq 0(%rbp,%rsi,8),%r10 + xorq 7(%rbp,%rdi,8),%r11 + movl 64+16+8(%rsp),%eax + leaq (%rcx,%rcx,1),%rsi + movzbl %bl,%ecx + leaq (%rdx,%rdx,1),%rdi + movzbl %bh,%edx + xorq 6(%rbp,%rsi,8),%r12 + xorq 5(%rbp,%rdi,8),%r13 + shrl $16,%ebx + leaq (%rcx,%rcx,1),%rsi + movzbl %bl,%ecx + leaq (%rdx,%rdx,1),%rdi + movzbl %bh,%edx + xorq 4(%rbp,%rsi,8),%r14 + xorq 3(%rbp,%rdi,8),%r15 + movl 64+16+8+4(%rsp),%ebx + leaq (%rcx,%rcx,1),%rsi + movzbl %al,%ecx + leaq (%rdx,%rdx,1),%rdi + movzbl %ah,%edx + xorq 2(%rbp,%rsi,8),%r8 + xorq 1(%rbp,%rdi,8),%r9 + shrl $16,%eax + leaq (%rcx,%rcx,1),%rsi + movzbl %al,%ecx + leaq (%rdx,%rdx,1),%rdi + movzbl %ah,%edx + xorq 0(%rbp,%rsi,8),%r11 + xorq 7(%rbp,%rdi,8),%r12 + movl 64+24+8(%rsp),%eax + leaq (%rcx,%rcx,1),%rsi + movzbl %bl,%ecx + leaq (%rdx,%rdx,1),%rdi + movzbl %bh,%edx + xorq 6(%rbp,%rsi,8),%r13 + xorq 5(%rbp,%rdi,8),%r14 + shrl $16,%ebx + leaq (%rcx,%rcx,1),%rsi + movzbl %bl,%ecx + leaq (%rdx,%rdx,1),%rdi + movzbl %bh,%edx + xorq 4(%rbp,%rsi,8),%r15 + xorq 3(%rbp,%rdi,8),%r8 + movl 64+24+8+4(%rsp),%ebx + leaq (%rcx,%rcx,1),%rsi + movzbl %al,%ecx + leaq (%rdx,%rdx,1),%rdi + movzbl %ah,%edx + xorq 2(%rbp,%rsi,8),%r9 + xorq 1(%rbp,%rdi,8),%r10 + shrl $16,%eax + leaq (%rcx,%rcx,1),%rsi + movzbl %al,%ecx + leaq (%rdx,%rdx,1),%rdi + movzbl %ah,%edx + xorq 0(%rbp,%rsi,8),%r12 + xorq 7(%rbp,%rdi,8),%r13 + movl 64+32+8(%rsp),%eax + leaq (%rcx,%rcx,1),%rsi + movzbl %bl,%ecx + leaq (%rdx,%rdx,1),%rdi + movzbl %bh,%edx + xorq 6(%rbp,%rsi,8),%r14 + xorq 5(%rbp,%rdi,8),%r15 + shrl $16,%ebx + leaq (%rcx,%rcx,1),%rsi + movzbl %bl,%ecx + leaq (%rdx,%rdx,1),%rdi + movzbl %bh,%edx + xorq 4(%rbp,%rsi,8),%r8 + xorq 3(%rbp,%rdi,8),%r9 + movl 64+32+8+4(%rsp),%ebx + leaq (%rcx,%rcx,1),%rsi + movzbl %al,%ecx + leaq (%rdx,%rdx,1),%rdi + movzbl %ah,%edx + xorq 2(%rbp,%rsi,8),%r10 + xorq 1(%rbp,%rdi,8),%r11 + shrl $16,%eax + leaq (%rcx,%rcx,1),%rsi + movzbl %al,%ecx + leaq (%rdx,%rdx,1),%rdi + movzbl %ah,%edx + xorq 0(%rbp,%rsi,8),%r13 + xorq 7(%rbp,%rdi,8),%r14 + movl 64+40+8(%rsp),%eax + leaq (%rcx,%rcx,1),%rsi + movzbl %bl,%ecx + leaq (%rdx,%rdx,1),%rdi + movzbl %bh,%edx + xorq 6(%rbp,%rsi,8),%r15 + xorq 5(%rbp,%rdi,8),%r8 + shrl $16,%ebx + leaq (%rcx,%rcx,1),%rsi + movzbl %bl,%ecx + leaq (%rdx,%rdx,1),%rdi + movzbl %bh,%edx + xorq 4(%rbp,%rsi,8),%r9 + xorq 3(%rbp,%rdi,8),%r10 + movl 64+40+8+4(%rsp),%ebx + leaq (%rcx,%rcx,1),%rsi + movzbl %al,%ecx + leaq (%rdx,%rdx,1),%rdi + movzbl %ah,%edx + xorq 2(%rbp,%rsi,8),%r11 + xorq 1(%rbp,%rdi,8),%r12 + shrl $16,%eax + leaq (%rcx,%rcx,1),%rsi + movzbl %al,%ecx + leaq (%rdx,%rdx,1),%rdi + movzbl %ah,%edx + xorq 0(%rbp,%rsi,8),%r14 + xorq 7(%rbp,%rdi,8),%r15 + movl 64+48+8(%rsp),%eax + leaq (%rcx,%rcx,1),%rsi + movzbl %bl,%ecx + leaq (%rdx,%rdx,1),%rdi + movzbl %bh,%edx + xorq 6(%rbp,%rsi,8),%r8 + xorq 5(%rbp,%rdi,8),%r9 + shrl $16,%ebx + leaq (%rcx,%rcx,1),%rsi + movzbl %bl,%ecx + leaq (%rdx,%rdx,1),%rdi + movzbl %bh,%edx + xorq 4(%rbp,%rsi,8),%r10 + xorq 3(%rbp,%rdi,8),%r11 + movl 64+48+8+4(%rsp),%ebx + leaq (%rcx,%rcx,1),%rsi + movzbl %al,%ecx + leaq (%rdx,%rdx,1),%rdi + movzbl %ah,%edx + xorq 2(%rbp,%rsi,8),%r12 + xorq 1(%rbp,%rdi,8),%r13 + shrl $16,%eax + leaq (%rcx,%rcx,1),%rsi + movzbl %al,%ecx + leaq (%rdx,%rdx,1),%rdi + movzbl %ah,%edx + xorq 0(%rbp,%rsi,8),%r15 + xorq 7(%rbp,%rdi,8),%r8 + + leaq (%rcx,%rcx,1),%rsi + movzbl %bl,%ecx + leaq (%rdx,%rdx,1),%rdi + movzbl %bh,%edx + xorq 6(%rbp,%rsi,8),%r9 + xorq 5(%rbp,%rdi,8),%r10 + shrl $16,%ebx + leaq (%rcx,%rcx,1),%rsi + movzbl %bl,%ecx + leaq (%rdx,%rdx,1),%rdi + movzbl %bh,%edx + xorq 4(%rbp,%rsi,8),%r11 + xorq 3(%rbp,%rdi,8),%r12 + + leaq (%rcx,%rcx,1),%rsi + movzbl %al,%ecx + leaq (%rdx,%rdx,1),%rdi + movzbl %ah,%edx + xorq 2(%rbp,%rsi,8),%r13 + xorq 1(%rbp,%rdi,8),%r14 + leaq 128(%rsp),%rbx + movq 24(%rbx),%rsi + addq $1,%rsi + cmpq $10,%rsi + je .Lroundsdone + + movq %rsi,24(%rbx) + movq %r8,64+0(%rsp) + movq %r9,64+8(%rsp) + movq %r10,64+16(%rsp) + movq %r11,64+24(%rsp) + movq %r12,64+32(%rsp) + movq %r13,64+40(%rsp) + movq %r14,64+48(%rsp) + movq %r15,64+56(%rsp) + jmp .Lround +.align 16 +.Lroundsdone: + movq 0(%rbx),%rdi + movq 8(%rbx),%rsi + movq 16(%rbx),%rax + xorq 0(%rsi),%r8 + xorq 8(%rsi),%r9 + xorq 16(%rsi),%r10 + xorq 24(%rsi),%r11 + xorq 32(%rsi),%r12 + xorq 40(%rsi),%r13 + xorq 48(%rsi),%r14 + xorq 56(%rsi),%r15 + xorq 0(%rdi),%r8 + xorq 8(%rdi),%r9 + xorq 16(%rdi),%r10 + xorq 24(%rdi),%r11 + xorq 32(%rdi),%r12 + xorq 40(%rdi),%r13 + xorq 48(%rdi),%r14 + xorq 56(%rdi),%r15 + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq %r14,48(%rdi) + movq %r15,56(%rdi) + leaq 64(%rsi),%rsi + subq $1,%rax + jz .Lalldone + movq %rsi,8(%rbx) + movq %rax,16(%rbx) + jmp .Louterloop +.Lalldone: + movq 32(%rbx),%rsi + movq (%rsi),%r15 + movq 8(%rsi),%r14 + movq 16(%rsi),%r13 + movq 24(%rsi),%r12 + movq 32(%rsi),%rbp + movq 40(%rsi),%rbx + leaq 48(%rsi),%rsp +.Lepilogue: + .byte 0xf3,0xc3 +.size whirlpool_block,.-whirlpool_block + +.align 64 +.type .Ltable,@object +.Ltable: +.byte 24,24,96,24,192,120,48,216,24,24,96,24,192,120,48,216 +.byte 35,35,140,35,5,175,70,38,35,35,140,35,5,175,70,38 +.byte 198,198,63,198,126,249,145,184,198,198,63,198,126,249,145,184 +.byte 232,232,135,232,19,111,205,251,232,232,135,232,19,111,205,251 +.byte 135,135,38,135,76,161,19,203,135,135,38,135,76,161,19,203 +.byte 184,184,218,184,169,98,109,17,184,184,218,184,169,98,109,17 +.byte 1,1,4,1,8,5,2,9,1,1,4,1,8,5,2,9 +.byte 79,79,33,79,66,110,158,13,79,79,33,79,66,110,158,13 +.byte 54,54,216,54,173,238,108,155,54,54,216,54,173,238,108,155 +.byte 166,166,162,166,89,4,81,255,166,166,162,166,89,4,81,255 +.byte 210,210,111,210,222,189,185,12,210,210,111,210,222,189,185,12 +.byte 245,245,243,245,251,6,247,14,245,245,243,245,251,6,247,14 +.byte 121,121,249,121,239,128,242,150,121,121,249,121,239,128,242,150 +.byte 111,111,161,111,95,206,222,48,111,111,161,111,95,206,222,48 +.byte 145,145,126,145,252,239,63,109,145,145,126,145,252,239,63,109 +.byte 82,82,85,82,170,7,164,248,82,82,85,82,170,7,164,248 +.byte 96,96,157,96,39,253,192,71,96,96,157,96,39,253,192,71 +.byte 188,188,202,188,137,118,101,53,188,188,202,188,137,118,101,53 +.byte 155,155,86,155,172,205,43,55,155,155,86,155,172,205,43,55 +.byte 142,142,2,142,4,140,1,138,142,142,2,142,4,140,1,138 +.byte 163,163,182,163,113,21,91,210,163,163,182,163,113,21,91,210 +.byte 12,12,48,12,96,60,24,108,12,12,48,12,96,60,24,108 +.byte 123,123,241,123,255,138,246,132,123,123,241,123,255,138,246,132 +.byte 53,53,212,53,181,225,106,128,53,53,212,53,181,225,106,128 +.byte 29,29,116,29,232,105,58,245,29,29,116,29,232,105,58,245 +.byte 224,224,167,224,83,71,221,179,224,224,167,224,83,71,221,179 +.byte 215,215,123,215,246,172,179,33,215,215,123,215,246,172,179,33 +.byte 194,194,47,194,94,237,153,156,194,194,47,194,94,237,153,156 +.byte 46,46,184,46,109,150,92,67,46,46,184,46,109,150,92,67 +.byte 75,75,49,75,98,122,150,41,75,75,49,75,98,122,150,41 +.byte 254,254,223,254,163,33,225,93,254,254,223,254,163,33,225,93 +.byte 87,87,65,87,130,22,174,213,87,87,65,87,130,22,174,213 +.byte 21,21,84,21,168,65,42,189,21,21,84,21,168,65,42,189 +.byte 119,119,193,119,159,182,238,232,119,119,193,119,159,182,238,232 +.byte 55,55,220,55,165,235,110,146,55,55,220,55,165,235,110,146 +.byte 229,229,179,229,123,86,215,158,229,229,179,229,123,86,215,158 +.byte 159,159,70,159,140,217,35,19,159,159,70,159,140,217,35,19 +.byte 240,240,231,240,211,23,253,35,240,240,231,240,211,23,253,35 +.byte 74,74,53,74,106,127,148,32,74,74,53,74,106,127,148,32 +.byte 218,218,79,218,158,149,169,68,218,218,79,218,158,149,169,68 +.byte 88,88,125,88,250,37,176,162,88,88,125,88,250,37,176,162 +.byte 201,201,3,201,6,202,143,207,201,201,3,201,6,202,143,207 +.byte 41,41,164,41,85,141,82,124,41,41,164,41,85,141,82,124 +.byte 10,10,40,10,80,34,20,90,10,10,40,10,80,34,20,90 +.byte 177,177,254,177,225,79,127,80,177,177,254,177,225,79,127,80 +.byte 160,160,186,160,105,26,93,201,160,160,186,160,105,26,93,201 +.byte 107,107,177,107,127,218,214,20,107,107,177,107,127,218,214,20 +.byte 133,133,46,133,92,171,23,217,133,133,46,133,92,171,23,217 +.byte 189,189,206,189,129,115,103,60,189,189,206,189,129,115,103,60 +.byte 93,93,105,93,210,52,186,143,93,93,105,93,210,52,186,143 +.byte 16,16,64,16,128,80,32,144,16,16,64,16,128,80,32,144 +.byte 244,244,247,244,243,3,245,7,244,244,247,244,243,3,245,7 +.byte 203,203,11,203,22,192,139,221,203,203,11,203,22,192,139,221 +.byte 62,62,248,62,237,198,124,211,62,62,248,62,237,198,124,211 +.byte 5,5,20,5,40,17,10,45,5,5,20,5,40,17,10,45 +.byte 103,103,129,103,31,230,206,120,103,103,129,103,31,230,206,120 +.byte 228,228,183,228,115,83,213,151,228,228,183,228,115,83,213,151 +.byte 39,39,156,39,37,187,78,2,39,39,156,39,37,187,78,2 +.byte 65,65,25,65,50,88,130,115,65,65,25,65,50,88,130,115 +.byte 139,139,22,139,44,157,11,167,139,139,22,139,44,157,11,167 +.byte 167,167,166,167,81,1,83,246,167,167,166,167,81,1,83,246 +.byte 125,125,233,125,207,148,250,178,125,125,233,125,207,148,250,178 +.byte 149,149,110,149,220,251,55,73,149,149,110,149,220,251,55,73 +.byte 216,216,71,216,142,159,173,86,216,216,71,216,142,159,173,86 +.byte 251,251,203,251,139,48,235,112,251,251,203,251,139,48,235,112 +.byte 238,238,159,238,35,113,193,205,238,238,159,238,35,113,193,205 +.byte 124,124,237,124,199,145,248,187,124,124,237,124,199,145,248,187 +.byte 102,102,133,102,23,227,204,113,102,102,133,102,23,227,204,113 +.byte 221,221,83,221,166,142,167,123,221,221,83,221,166,142,167,123 +.byte 23,23,92,23,184,75,46,175,23,23,92,23,184,75,46,175 +.byte 71,71,1,71,2,70,142,69,71,71,1,71,2,70,142,69 +.byte 158,158,66,158,132,220,33,26,158,158,66,158,132,220,33,26 +.byte 202,202,15,202,30,197,137,212,202,202,15,202,30,197,137,212 +.byte 45,45,180,45,117,153,90,88,45,45,180,45,117,153,90,88 +.byte 191,191,198,191,145,121,99,46,191,191,198,191,145,121,99,46 +.byte 7,7,28,7,56,27,14,63,7,7,28,7,56,27,14,63 +.byte 173,173,142,173,1,35,71,172,173,173,142,173,1,35,71,172 +.byte 90,90,117,90,234,47,180,176,90,90,117,90,234,47,180,176 +.byte 131,131,54,131,108,181,27,239,131,131,54,131,108,181,27,239 +.byte 51,51,204,51,133,255,102,182,51,51,204,51,133,255,102,182 +.byte 99,99,145,99,63,242,198,92,99,99,145,99,63,242,198,92 +.byte 2,2,8,2,16,10,4,18,2,2,8,2,16,10,4,18 +.byte 170,170,146,170,57,56,73,147,170,170,146,170,57,56,73,147 +.byte 113,113,217,113,175,168,226,222,113,113,217,113,175,168,226,222 +.byte 200,200,7,200,14,207,141,198,200,200,7,200,14,207,141,198 +.byte 25,25,100,25,200,125,50,209,25,25,100,25,200,125,50,209 +.byte 73,73,57,73,114,112,146,59,73,73,57,73,114,112,146,59 +.byte 217,217,67,217,134,154,175,95,217,217,67,217,134,154,175,95 +.byte 242,242,239,242,195,29,249,49,242,242,239,242,195,29,249,49 +.byte 227,227,171,227,75,72,219,168,227,227,171,227,75,72,219,168 +.byte 91,91,113,91,226,42,182,185,91,91,113,91,226,42,182,185 +.byte 136,136,26,136,52,146,13,188,136,136,26,136,52,146,13,188 +.byte 154,154,82,154,164,200,41,62,154,154,82,154,164,200,41,62 +.byte 38,38,152,38,45,190,76,11,38,38,152,38,45,190,76,11 +.byte 50,50,200,50,141,250,100,191,50,50,200,50,141,250,100,191 +.byte 176,176,250,176,233,74,125,89,176,176,250,176,233,74,125,89 +.byte 233,233,131,233,27,106,207,242,233,233,131,233,27,106,207,242 +.byte 15,15,60,15,120,51,30,119,15,15,60,15,120,51,30,119 +.byte 213,213,115,213,230,166,183,51,213,213,115,213,230,166,183,51 +.byte 128,128,58,128,116,186,29,244,128,128,58,128,116,186,29,244 +.byte 190,190,194,190,153,124,97,39,190,190,194,190,153,124,97,39 +.byte 205,205,19,205,38,222,135,235,205,205,19,205,38,222,135,235 +.byte 52,52,208,52,189,228,104,137,52,52,208,52,189,228,104,137 +.byte 72,72,61,72,122,117,144,50,72,72,61,72,122,117,144,50 +.byte 255,255,219,255,171,36,227,84,255,255,219,255,171,36,227,84 +.byte 122,122,245,122,247,143,244,141,122,122,245,122,247,143,244,141 +.byte 144,144,122,144,244,234,61,100,144,144,122,144,244,234,61,100 +.byte 95,95,97,95,194,62,190,157,95,95,97,95,194,62,190,157 +.byte 32,32,128,32,29,160,64,61,32,32,128,32,29,160,64,61 +.byte 104,104,189,104,103,213,208,15,104,104,189,104,103,213,208,15 +.byte 26,26,104,26,208,114,52,202,26,26,104,26,208,114,52,202 +.byte 174,174,130,174,25,44,65,183,174,174,130,174,25,44,65,183 +.byte 180,180,234,180,201,94,117,125,180,180,234,180,201,94,117,125 +.byte 84,84,77,84,154,25,168,206,84,84,77,84,154,25,168,206 +.byte 147,147,118,147,236,229,59,127,147,147,118,147,236,229,59,127 +.byte 34,34,136,34,13,170,68,47,34,34,136,34,13,170,68,47 +.byte 100,100,141,100,7,233,200,99,100,100,141,100,7,233,200,99 +.byte 241,241,227,241,219,18,255,42,241,241,227,241,219,18,255,42 +.byte 115,115,209,115,191,162,230,204,115,115,209,115,191,162,230,204 +.byte 18,18,72,18,144,90,36,130,18,18,72,18,144,90,36,130 +.byte 64,64,29,64,58,93,128,122,64,64,29,64,58,93,128,122 +.byte 8,8,32,8,64,40,16,72,8,8,32,8,64,40,16,72 +.byte 195,195,43,195,86,232,155,149,195,195,43,195,86,232,155,149 +.byte 236,236,151,236,51,123,197,223,236,236,151,236,51,123,197,223 +.byte 219,219,75,219,150,144,171,77,219,219,75,219,150,144,171,77 +.byte 161,161,190,161,97,31,95,192,161,161,190,161,97,31,95,192 +.byte 141,141,14,141,28,131,7,145,141,141,14,141,28,131,7,145 +.byte 61,61,244,61,245,201,122,200,61,61,244,61,245,201,122,200 +.byte 151,151,102,151,204,241,51,91,151,151,102,151,204,241,51,91 +.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +.byte 207,207,27,207,54,212,131,249,207,207,27,207,54,212,131,249 +.byte 43,43,172,43,69,135,86,110,43,43,172,43,69,135,86,110 +.byte 118,118,197,118,151,179,236,225,118,118,197,118,151,179,236,225 +.byte 130,130,50,130,100,176,25,230,130,130,50,130,100,176,25,230 +.byte 214,214,127,214,254,169,177,40,214,214,127,214,254,169,177,40 +.byte 27,27,108,27,216,119,54,195,27,27,108,27,216,119,54,195 +.byte 181,181,238,181,193,91,119,116,181,181,238,181,193,91,119,116 +.byte 175,175,134,175,17,41,67,190,175,175,134,175,17,41,67,190 +.byte 106,106,181,106,119,223,212,29,106,106,181,106,119,223,212,29 +.byte 80,80,93,80,186,13,160,234,80,80,93,80,186,13,160,234 +.byte 69,69,9,69,18,76,138,87,69,69,9,69,18,76,138,87 +.byte 243,243,235,243,203,24,251,56,243,243,235,243,203,24,251,56 +.byte 48,48,192,48,157,240,96,173,48,48,192,48,157,240,96,173 +.byte 239,239,155,239,43,116,195,196,239,239,155,239,43,116,195,196 +.byte 63,63,252,63,229,195,126,218,63,63,252,63,229,195,126,218 +.byte 85,85,73,85,146,28,170,199,85,85,73,85,146,28,170,199 +.byte 162,162,178,162,121,16,89,219,162,162,178,162,121,16,89,219 +.byte 234,234,143,234,3,101,201,233,234,234,143,234,3,101,201,233 +.byte 101,101,137,101,15,236,202,106,101,101,137,101,15,236,202,106 +.byte 186,186,210,186,185,104,105,3,186,186,210,186,185,104,105,3 +.byte 47,47,188,47,101,147,94,74,47,47,188,47,101,147,94,74 +.byte 192,192,39,192,78,231,157,142,192,192,39,192,78,231,157,142 +.byte 222,222,95,222,190,129,161,96,222,222,95,222,190,129,161,96 +.byte 28,28,112,28,224,108,56,252,28,28,112,28,224,108,56,252 +.byte 253,253,211,253,187,46,231,70,253,253,211,253,187,46,231,70 +.byte 77,77,41,77,82,100,154,31,77,77,41,77,82,100,154,31 +.byte 146,146,114,146,228,224,57,118,146,146,114,146,228,224,57,118 +.byte 117,117,201,117,143,188,234,250,117,117,201,117,143,188,234,250 +.byte 6,6,24,6,48,30,12,54,6,6,24,6,48,30,12,54 +.byte 138,138,18,138,36,152,9,174,138,138,18,138,36,152,9,174 +.byte 178,178,242,178,249,64,121,75,178,178,242,178,249,64,121,75 +.byte 230,230,191,230,99,89,209,133,230,230,191,230,99,89,209,133 +.byte 14,14,56,14,112,54,28,126,14,14,56,14,112,54,28,126 +.byte 31,31,124,31,248,99,62,231,31,31,124,31,248,99,62,231 +.byte 98,98,149,98,55,247,196,85,98,98,149,98,55,247,196,85 +.byte 212,212,119,212,238,163,181,58,212,212,119,212,238,163,181,58 +.byte 168,168,154,168,41,50,77,129,168,168,154,168,41,50,77,129 +.byte 150,150,98,150,196,244,49,82,150,150,98,150,196,244,49,82 +.byte 249,249,195,249,155,58,239,98,249,249,195,249,155,58,239,98 +.byte 197,197,51,197,102,246,151,163,197,197,51,197,102,246,151,163 +.byte 37,37,148,37,53,177,74,16,37,37,148,37,53,177,74,16 +.byte 89,89,121,89,242,32,178,171,89,89,121,89,242,32,178,171 +.byte 132,132,42,132,84,174,21,208,132,132,42,132,84,174,21,208 +.byte 114,114,213,114,183,167,228,197,114,114,213,114,183,167,228,197 +.byte 57,57,228,57,213,221,114,236,57,57,228,57,213,221,114,236 +.byte 76,76,45,76,90,97,152,22,76,76,45,76,90,97,152,22 +.byte 94,94,101,94,202,59,188,148,94,94,101,94,202,59,188,148 +.byte 120,120,253,120,231,133,240,159,120,120,253,120,231,133,240,159 +.byte 56,56,224,56,221,216,112,229,56,56,224,56,221,216,112,229 +.byte 140,140,10,140,20,134,5,152,140,140,10,140,20,134,5,152 +.byte 209,209,99,209,198,178,191,23,209,209,99,209,198,178,191,23 +.byte 165,165,174,165,65,11,87,228,165,165,174,165,65,11,87,228 +.byte 226,226,175,226,67,77,217,161,226,226,175,226,67,77,217,161 +.byte 97,97,153,97,47,248,194,78,97,97,153,97,47,248,194,78 +.byte 179,179,246,179,241,69,123,66,179,179,246,179,241,69,123,66 +.byte 33,33,132,33,21,165,66,52,33,33,132,33,21,165,66,52 +.byte 156,156,74,156,148,214,37,8,156,156,74,156,148,214,37,8 +.byte 30,30,120,30,240,102,60,238,30,30,120,30,240,102,60,238 +.byte 67,67,17,67,34,82,134,97,67,67,17,67,34,82,134,97 +.byte 199,199,59,199,118,252,147,177,199,199,59,199,118,252,147,177 +.byte 252,252,215,252,179,43,229,79,252,252,215,252,179,43,229,79 +.byte 4,4,16,4,32,20,8,36,4,4,16,4,32,20,8,36 +.byte 81,81,89,81,178,8,162,227,81,81,89,81,178,8,162,227 +.byte 153,153,94,153,188,199,47,37,153,153,94,153,188,199,47,37 +.byte 109,109,169,109,79,196,218,34,109,109,169,109,79,196,218,34 +.byte 13,13,52,13,104,57,26,101,13,13,52,13,104,57,26,101 +.byte 250,250,207,250,131,53,233,121,250,250,207,250,131,53,233,121 +.byte 223,223,91,223,182,132,163,105,223,223,91,223,182,132,163,105 +.byte 126,126,229,126,215,155,252,169,126,126,229,126,215,155,252,169 +.byte 36,36,144,36,61,180,72,25,36,36,144,36,61,180,72,25 +.byte 59,59,236,59,197,215,118,254,59,59,236,59,197,215,118,254 +.byte 171,171,150,171,49,61,75,154,171,171,150,171,49,61,75,154 +.byte 206,206,31,206,62,209,129,240,206,206,31,206,62,209,129,240 +.byte 17,17,68,17,136,85,34,153,17,17,68,17,136,85,34,153 +.byte 143,143,6,143,12,137,3,131,143,143,6,143,12,137,3,131 +.byte 78,78,37,78,74,107,156,4,78,78,37,78,74,107,156,4 +.byte 183,183,230,183,209,81,115,102,183,183,230,183,209,81,115,102 +.byte 235,235,139,235,11,96,203,224,235,235,139,235,11,96,203,224 +.byte 60,60,240,60,253,204,120,193,60,60,240,60,253,204,120,193 +.byte 129,129,62,129,124,191,31,253,129,129,62,129,124,191,31,253 +.byte 148,148,106,148,212,254,53,64,148,148,106,148,212,254,53,64 +.byte 247,247,251,247,235,12,243,28,247,247,251,247,235,12,243,28 +.byte 185,185,222,185,161,103,111,24,185,185,222,185,161,103,111,24 +.byte 19,19,76,19,152,95,38,139,19,19,76,19,152,95,38,139 +.byte 44,44,176,44,125,156,88,81,44,44,176,44,125,156,88,81 +.byte 211,211,107,211,214,184,187,5,211,211,107,211,214,184,187,5 +.byte 231,231,187,231,107,92,211,140,231,231,187,231,107,92,211,140 +.byte 110,110,165,110,87,203,220,57,110,110,165,110,87,203,220,57 +.byte 196,196,55,196,110,243,149,170,196,196,55,196,110,243,149,170 +.byte 3,3,12,3,24,15,6,27,3,3,12,3,24,15,6,27 +.byte 86,86,69,86,138,19,172,220,86,86,69,86,138,19,172,220 +.byte 68,68,13,68,26,73,136,94,68,68,13,68,26,73,136,94 +.byte 127,127,225,127,223,158,254,160,127,127,225,127,223,158,254,160 +.byte 169,169,158,169,33,55,79,136,169,169,158,169,33,55,79,136 +.byte 42,42,168,42,77,130,84,103,42,42,168,42,77,130,84,103 +.byte 187,187,214,187,177,109,107,10,187,187,214,187,177,109,107,10 +.byte 193,193,35,193,70,226,159,135,193,193,35,193,70,226,159,135 +.byte 83,83,81,83,162,2,166,241,83,83,81,83,162,2,166,241 +.byte 220,220,87,220,174,139,165,114,220,220,87,220,174,139,165,114 +.byte 11,11,44,11,88,39,22,83,11,11,44,11,88,39,22,83 +.byte 157,157,78,157,156,211,39,1,157,157,78,157,156,211,39,1 +.byte 108,108,173,108,71,193,216,43,108,108,173,108,71,193,216,43 +.byte 49,49,196,49,149,245,98,164,49,49,196,49,149,245,98,164 +.byte 116,116,205,116,135,185,232,243,116,116,205,116,135,185,232,243 +.byte 246,246,255,246,227,9,241,21,246,246,255,246,227,9,241,21 +.byte 70,70,5,70,10,67,140,76,70,70,5,70,10,67,140,76 +.byte 172,172,138,172,9,38,69,165,172,172,138,172,9,38,69,165 +.byte 137,137,30,137,60,151,15,181,137,137,30,137,60,151,15,181 +.byte 20,20,80,20,160,68,40,180,20,20,80,20,160,68,40,180 +.byte 225,225,163,225,91,66,223,186,225,225,163,225,91,66,223,186 +.byte 22,22,88,22,176,78,44,166,22,22,88,22,176,78,44,166 +.byte 58,58,232,58,205,210,116,247,58,58,232,58,205,210,116,247 +.byte 105,105,185,105,111,208,210,6,105,105,185,105,111,208,210,6 +.byte 9,9,36,9,72,45,18,65,9,9,36,9,72,45,18,65 +.byte 112,112,221,112,167,173,224,215,112,112,221,112,167,173,224,215 +.byte 182,182,226,182,217,84,113,111,182,182,226,182,217,84,113,111 +.byte 208,208,103,208,206,183,189,30,208,208,103,208,206,183,189,30 +.byte 237,237,147,237,59,126,199,214,237,237,147,237,59,126,199,214 +.byte 204,204,23,204,46,219,133,226,204,204,23,204,46,219,133,226 +.byte 66,66,21,66,42,87,132,104,66,66,21,66,42,87,132,104 +.byte 152,152,90,152,180,194,45,44,152,152,90,152,180,194,45,44 +.byte 164,164,170,164,73,14,85,237,164,164,170,164,73,14,85,237 +.byte 40,40,160,40,93,136,80,117,40,40,160,40,93,136,80,117 +.byte 92,92,109,92,218,49,184,134,92,92,109,92,218,49,184,134 +.byte 248,248,199,248,147,63,237,107,248,248,199,248,147,63,237,107 +.byte 134,134,34,134,68,164,17,194,134,134,34,134,68,164,17,194 +.byte 24,35,198,232,135,184,1,79 +.byte 54,166,210,245,121,111,145,82 +.byte 96,188,155,142,163,12,123,53 +.byte 29,224,215,194,46,75,254,87 +.byte 21,119,55,229,159,240,74,218 +.byte 88,201,41,10,177,160,107,133 +.byte 189,93,16,244,203,62,5,103 +.byte 228,39,65,139,167,125,149,216 +.byte 251,238,124,102,221,23,71,158 +.byte 202,45,191,7,173,90,131,51 diff --git a/secure/lib/libcrypto/amd64/x86_64-gf2m.S b/secure/lib/libcrypto/amd64/x86_64-gf2m.S new file mode 100644 index 0000000..f86c253 --- /dev/null +++ b/secure/lib/libcrypto/amd64/x86_64-gf2m.S @@ -0,0 +1,292 @@ + # $FreeBSD$ +.text + +.type _mul_1x1,@function +.align 16 +_mul_1x1: + subq $128+8,%rsp + movq $-1,%r9 + leaq (%rax,%rax,1),%rsi + shrq $3,%r9 + leaq (,%rax,4),%rdi + andq %rax,%r9 + leaq (,%rax,8),%r12 + sarq $63,%rax + leaq (%r9,%r9,1),%r10 + sarq $63,%rsi + leaq (,%r9,4),%r11 + andq %rbp,%rax + sarq $63,%rdi + movq %rax,%rdx + shlq $63,%rax + andq %rbp,%rsi + shrq $1,%rdx + movq %rsi,%rcx + shlq $62,%rsi + andq %rbp,%rdi + shrq $2,%rcx + xorq %rsi,%rax + movq %rdi,%rbx + shlq $61,%rdi + xorq %rcx,%rdx + shrq $3,%rbx + xorq %rdi,%rax + xorq %rbx,%rdx + + movq %r9,%r13 + movq $0,0(%rsp) + xorq %r10,%r13 + movq %r9,8(%rsp) + movq %r11,%r14 + movq %r10,16(%rsp) + xorq %r12,%r14 + movq %r13,24(%rsp) + + xorq %r11,%r9 + movq %r11,32(%rsp) + xorq %r11,%r10 + movq %r9,40(%rsp) + xorq %r11,%r13 + movq %r10,48(%rsp) + xorq %r14,%r9 + movq %r13,56(%rsp) + xorq %r14,%r10 + + movq %r12,64(%rsp) + xorq %r14,%r13 + movq %r9,72(%rsp) + xorq %r11,%r9 + movq %r10,80(%rsp) + xorq %r11,%r10 + movq %r13,88(%rsp) + + xorq %r11,%r13 + movq %r14,96(%rsp) + movq %r8,%rsi + movq %r9,104(%rsp) + andq %rbp,%rsi + movq %r10,112(%rsp) + shrq $4,%rbp + movq %r13,120(%rsp) + movq %r8,%rdi + andq %rbp,%rdi + shrq $4,%rbp + + movq (%rsp,%rsi,8),%xmm0 + movq %r8,%rsi + andq %rbp,%rsi + shrq $4,%rbp + movq (%rsp,%rdi,8),%rcx + movq %r8,%rdi + movq %rcx,%rbx + shlq $4,%rcx + andq %rbp,%rdi + movq (%rsp,%rsi,8),%xmm1 + shrq $60,%rbx + xorq %rcx,%rax + pslldq $1,%xmm1 + movq %r8,%rsi + shrq $4,%rbp + xorq %rbx,%rdx + andq %rbp,%rsi + shrq $4,%rbp + pxor %xmm1,%xmm0 + movq (%rsp,%rdi,8),%rcx + movq %r8,%rdi + movq %rcx,%rbx + shlq $12,%rcx + andq %rbp,%rdi + movq (%rsp,%rsi,8),%xmm1 + shrq $52,%rbx + xorq %rcx,%rax + pslldq $2,%xmm1 + movq %r8,%rsi + shrq $4,%rbp + xorq %rbx,%rdx + andq %rbp,%rsi + shrq $4,%rbp + pxor %xmm1,%xmm0 + movq (%rsp,%rdi,8),%rcx + movq %r8,%rdi + movq %rcx,%rbx + shlq $20,%rcx + andq %rbp,%rdi + movq (%rsp,%rsi,8),%xmm1 + shrq $44,%rbx + xorq %rcx,%rax + pslldq $3,%xmm1 + movq %r8,%rsi + shrq $4,%rbp + xorq %rbx,%rdx + andq %rbp,%rsi + shrq $4,%rbp + pxor %xmm1,%xmm0 + movq (%rsp,%rdi,8),%rcx + movq %r8,%rdi + movq %rcx,%rbx + shlq $28,%rcx + andq %rbp,%rdi + movq (%rsp,%rsi,8),%xmm1 + shrq $36,%rbx + xorq %rcx,%rax + pslldq $4,%xmm1 + movq %r8,%rsi + shrq $4,%rbp + xorq %rbx,%rdx + andq %rbp,%rsi + shrq $4,%rbp + pxor %xmm1,%xmm0 + movq (%rsp,%rdi,8),%rcx + movq %r8,%rdi + movq %rcx,%rbx + shlq $36,%rcx + andq %rbp,%rdi + movq (%rsp,%rsi,8),%xmm1 + shrq $28,%rbx + xorq %rcx,%rax + pslldq $5,%xmm1 + movq %r8,%rsi + shrq $4,%rbp + xorq %rbx,%rdx + andq %rbp,%rsi + shrq $4,%rbp + pxor %xmm1,%xmm0 + movq (%rsp,%rdi,8),%rcx + movq %r8,%rdi + movq %rcx,%rbx + shlq $44,%rcx + andq %rbp,%rdi + movq (%rsp,%rsi,8),%xmm1 + shrq $20,%rbx + xorq %rcx,%rax + pslldq $6,%xmm1 + movq %r8,%rsi + shrq $4,%rbp + xorq %rbx,%rdx + andq %rbp,%rsi + shrq $4,%rbp + pxor %xmm1,%xmm0 + movq (%rsp,%rdi,8),%rcx + movq %r8,%rdi + movq %rcx,%rbx + shlq $52,%rcx + andq %rbp,%rdi + movq (%rsp,%rsi,8),%xmm1 + shrq $12,%rbx + xorq %rcx,%rax + pslldq $7,%xmm1 + movq %r8,%rsi + shrq $4,%rbp + xorq %rbx,%rdx + andq %rbp,%rsi + shrq $4,%rbp + pxor %xmm1,%xmm0 + movq (%rsp,%rdi,8),%rcx + movq %rcx,%rbx + shlq $60,%rcx +.byte 102,72,15,126,198 + shrq $4,%rbx + xorq %rcx,%rax + psrldq $8,%xmm0 + xorq %rbx,%rdx +.byte 102,72,15,126,199 + xorq %rsi,%rax + xorq %rdi,%rdx + + addq $128+8,%rsp + .byte 0xf3,0xc3 +.Lend_mul_1x1: +.size _mul_1x1,.-_mul_1x1 + +.globl bn_GF2m_mul_2x2 +.type bn_GF2m_mul_2x2,@function +.align 16 +bn_GF2m_mul_2x2: + movq OPENSSL_ia32cap_P(%rip),%rax + btq $33,%rax + jnc .Lvanilla_mul_2x2 + +.byte 102,72,15,110,198 +.byte 102,72,15,110,201 +.byte 102,72,15,110,210 +.byte 102,73,15,110,216 + movdqa %xmm0,%xmm4 + movdqa %xmm1,%xmm5 +.byte 102,15,58,68,193,0 + pxor %xmm2,%xmm4 + pxor %xmm3,%xmm5 +.byte 102,15,58,68,211,0 +.byte 102,15,58,68,229,0 + xorps %xmm0,%xmm4 + xorps %xmm2,%xmm4 + movdqa %xmm4,%xmm5 + pslldq $8,%xmm4 + psrldq $8,%xmm5 + pxor %xmm4,%xmm2 + pxor %xmm5,%xmm0 + movdqu %xmm2,0(%rdi) + movdqu %xmm0,16(%rdi) + .byte 0xf3,0xc3 + +.align 16 +.Lvanilla_mul_2x2: + leaq -136(%rsp),%rsp + movq %r14,80(%rsp) + movq %r13,88(%rsp) + movq %r12,96(%rsp) + movq %rbp,104(%rsp) + movq %rbx,112(%rsp) +.Lbody_mul_2x2: + movq %rdi,32(%rsp) + movq %rsi,40(%rsp) + movq %rdx,48(%rsp) + movq %rcx,56(%rsp) + movq %r8,64(%rsp) + + movq $15,%r8 + movq %rsi,%rax + movq %rcx,%rbp + call _mul_1x1 + movq %rax,16(%rsp) + movq %rdx,24(%rsp) + + movq 48(%rsp),%rax + movq 64(%rsp),%rbp + call _mul_1x1 + movq %rax,0(%rsp) + movq %rdx,8(%rsp) + + movq 40(%rsp),%rax + movq 56(%rsp),%rbp + xorq 48(%rsp),%rax + xorq 64(%rsp),%rbp + call _mul_1x1 + movq 0(%rsp),%rbx + movq 8(%rsp),%rcx + movq 16(%rsp),%rdi + movq 24(%rsp),%rsi + movq 32(%rsp),%rbp + + xorq %rdx,%rax + xorq %rcx,%rdx + xorq %rbx,%rax + movq %rbx,0(%rbp) + xorq %rdi,%rdx + movq %rsi,24(%rbp) + xorq %rsi,%rax + xorq %rsi,%rdx + xorq %rdx,%rax + movq %rdx,16(%rbp) + movq %rax,8(%rbp) + + movq 80(%rsp),%r14 + movq 88(%rsp),%r13 + movq 96(%rsp),%r12 + movq 104(%rsp),%rbp + movq 112(%rsp),%rbx + leaq 136(%rsp),%rsp + .byte 0xf3,0xc3 +.Lend_mul_2x2: +.size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2 +.byte 71,70,40,50,94,109,41,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 16 diff --git a/secure/lib/libcrypto/amd64/x86_64-mont.S b/secure/lib/libcrypto/amd64/x86_64-mont.S new file mode 100644 index 0000000..bff0fb9 --- /dev/null +++ b/secure/lib/libcrypto/amd64/x86_64-mont.S @@ -0,0 +1,725 @@ + # $FreeBSD$ +.text + + + +.globl bn_mul_mont +.type bn_mul_mont,@function +.align 16 +bn_mul_mont: + testl $3,%r9d + jnz .Lmul_enter + cmpl $8,%r9d + jb .Lmul_enter + cmpq %rsi,%rdx + jne .Lmul4x_enter + testl $7,%r9d + jz .Lsqr8x_enter + jmp .Lmul4x_enter + +.align 16 +.Lmul_enter: + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + movl %r9d,%r9d + leaq 2(%r9),%r10 + movq %rsp,%r11 + negq %r10 + leaq (%rsp,%r10,8),%rsp + andq $-1024,%rsp + + movq %r11,8(%rsp,%r9,8) +.Lmul_body: + movq %rdx,%r12 + movq (%r8),%r8 + movq (%r12),%rbx + movq (%rsi),%rax + + xorq %r14,%r14 + xorq %r15,%r15 + + movq %r8,%rbp + mulq %rbx + movq %rax,%r10 + movq (%rcx),%rax + + imulq %r10,%rbp + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r10 + movq 8(%rsi),%rax + adcq $0,%rdx + movq %rdx,%r13 + + leaq 1(%r15),%r15 + jmp .L1st_enter + +.align 16 +.L1st: + addq %rax,%r13 + movq (%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r11,%r13 + movq %r10,%r11 + adcq $0,%rdx + movq %r13,-16(%rsp,%r15,8) + movq %rdx,%r13 + +.L1st_enter: + mulq %rbx + addq %rax,%r11 + movq (%rcx,%r15,8),%rax + adcq $0,%rdx + leaq 1(%r15),%r15 + movq %rdx,%r10 + + mulq %rbp + cmpq %r9,%r15 + jne .L1st + + addq %rax,%r13 + movq (%rsi),%rax + adcq $0,%rdx + addq %r11,%r13 + adcq $0,%rdx + movq %r13,-16(%rsp,%r15,8) + movq %rdx,%r13 + movq %r10,%r11 + + xorq %rdx,%rdx + addq %r11,%r13 + adcq $0,%rdx + movq %r13,-8(%rsp,%r9,8) + movq %rdx,(%rsp,%r9,8) + + leaq 1(%r14),%r14 + jmp .Louter +.align 16 +.Louter: + movq (%r12,%r14,8),%rbx + xorq %r15,%r15 + movq %r8,%rbp + movq (%rsp),%r10 + mulq %rbx + addq %rax,%r10 + movq (%rcx),%rax + adcq $0,%rdx + + imulq %r10,%rbp + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r10 + movq 8(%rsi),%rax + adcq $0,%rdx + movq 8(%rsp),%r10 + movq %rdx,%r13 + + leaq 1(%r15),%r15 + jmp .Linner_enter + +.align 16 +.Linner: + addq %rax,%r13 + movq (%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r10,%r13 + movq (%rsp,%r15,8),%r10 + adcq $0,%rdx + movq %r13,-16(%rsp,%r15,8) + movq %rdx,%r13 + +.Linner_enter: + mulq %rbx + addq %rax,%r11 + movq (%rcx,%r15,8),%rax + adcq $0,%rdx + addq %r11,%r10 + movq %rdx,%r11 + adcq $0,%r11 + leaq 1(%r15),%r15 + + mulq %rbp + cmpq %r9,%r15 + jne .Linner + + addq %rax,%r13 + movq (%rsi),%rax + adcq $0,%rdx + addq %r10,%r13 + movq (%rsp,%r15,8),%r10 + adcq $0,%rdx + movq %r13,-16(%rsp,%r15,8) + movq %rdx,%r13 + + xorq %rdx,%rdx + addq %r11,%r13 + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %r13,-8(%rsp,%r9,8) + movq %rdx,(%rsp,%r9,8) + + leaq 1(%r14),%r14 + cmpq %r9,%r14 + jb .Louter + + xorq %r14,%r14 + movq (%rsp),%rax + leaq (%rsp),%rsi + movq %r9,%r15 + jmp .Lsub +.align 16 +.Lsub: sbbq (%rcx,%r14,8),%rax + movq %rax,(%rdi,%r14,8) + movq 8(%rsi,%r14,8),%rax + leaq 1(%r14),%r14 + decq %r15 + jnz .Lsub + + sbbq $0,%rax + xorq %r14,%r14 + andq %rax,%rsi + notq %rax + movq %rdi,%rcx + andq %rax,%rcx + movq %r9,%r15 + orq %rcx,%rsi +.align 16 +.Lcopy: + movq (%rsi,%r14,8),%rax + movq %r14,(%rsp,%r14,8) + movq %rax,(%rdi,%r14,8) + leaq 1(%r14),%r14 + subq $1,%r15 + jnz .Lcopy + + movq 8(%rsp,%r9,8),%rsi + movq $1,%rax + movq (%rsi),%r15 + movq 8(%rsi),%r14 + movq 16(%rsi),%r13 + movq 24(%rsi),%r12 + movq 32(%rsi),%rbp + movq 40(%rsi),%rbx + leaq 48(%rsi),%rsp +.Lmul_epilogue: + .byte 0xf3,0xc3 +.size bn_mul_mont,.-bn_mul_mont +.type bn_mul4x_mont,@function +.align 16 +bn_mul4x_mont: +.Lmul4x_enter: + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + movl %r9d,%r9d + leaq 4(%r9),%r10 + movq %rsp,%r11 + negq %r10 + leaq (%rsp,%r10,8),%rsp + andq $-1024,%rsp + + movq %r11,8(%rsp,%r9,8) +.Lmul4x_body: + movq %rdi,16(%rsp,%r9,8) + movq %rdx,%r12 + movq (%r8),%r8 + movq (%r12),%rbx + movq (%rsi),%rax + + xorq %r14,%r14 + xorq %r15,%r15 + + movq %r8,%rbp + mulq %rbx + movq %rax,%r10 + movq (%rcx),%rax + + imulq %r10,%rbp + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r10 + movq 8(%rsi),%rax + adcq $0,%rdx + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq 8(%rcx),%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq 16(%rsi),%rax + adcq $0,%rdx + addq %r11,%rdi + leaq 4(%r15),%r15 + adcq $0,%rdx + movq %rdi,(%rsp) + movq %rdx,%r13 + jmp .L1st4x +.align 16 +.L1st4x: + mulq %rbx + addq %rax,%r10 + movq -16(%rcx,%r15,8),%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r13 + movq -8(%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %r13,-24(%rsp,%r15,8) + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq -8(%rcx,%r15,8),%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq (%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r11,%rdi + adcq $0,%rdx + movq %rdi,-16(%rsp,%r15,8) + movq %rdx,%r13 + + mulq %rbx + addq %rax,%r10 + movq (%rcx,%r15,8),%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r13 + movq 8(%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %r13,-8(%rsp,%r15,8) + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq 8(%rcx,%r15,8),%rax + adcq $0,%rdx + leaq 4(%r15),%r15 + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq -16(%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r11,%rdi + adcq $0,%rdx + movq %rdi,-32(%rsp,%r15,8) + movq %rdx,%r13 + cmpq %r9,%r15 + jb .L1st4x + + mulq %rbx + addq %rax,%r10 + movq -16(%rcx,%r15,8),%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r13 + movq -8(%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %r13,-24(%rsp,%r15,8) + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq -8(%rcx,%r15,8),%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq (%rsi),%rax + adcq $0,%rdx + addq %r11,%rdi + adcq $0,%rdx + movq %rdi,-16(%rsp,%r15,8) + movq %rdx,%r13 + + xorq %rdi,%rdi + addq %r10,%r13 + adcq $0,%rdi + movq %r13,-8(%rsp,%r15,8) + movq %rdi,(%rsp,%r15,8) + + leaq 1(%r14),%r14 +.align 4 +.Louter4x: + movq (%r12,%r14,8),%rbx + xorq %r15,%r15 + movq (%rsp),%r10 + movq %r8,%rbp + mulq %rbx + addq %rax,%r10 + movq (%rcx),%rax + adcq $0,%rdx + + imulq %r10,%rbp + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r10 + movq 8(%rsi),%rax + adcq $0,%rdx + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq 8(%rcx),%rax + adcq $0,%rdx + addq 8(%rsp),%r11 + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq 16(%rsi),%rax + adcq $0,%rdx + addq %r11,%rdi + leaq 4(%r15),%r15 + adcq $0,%rdx + movq %rdi,(%rsp) + movq %rdx,%r13 + jmp .Linner4x +.align 16 +.Linner4x: + mulq %rbx + addq %rax,%r10 + movq -16(%rcx,%r15,8),%rax + adcq $0,%rdx + addq -16(%rsp,%r15,8),%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r13 + movq -8(%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %r13,-24(%rsp,%r15,8) + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq -8(%rcx,%r15,8),%rax + adcq $0,%rdx + addq -8(%rsp,%r15,8),%r11 + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq (%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r11,%rdi + adcq $0,%rdx + movq %rdi,-16(%rsp,%r15,8) + movq %rdx,%r13 + + mulq %rbx + addq %rax,%r10 + movq (%rcx,%r15,8),%rax + adcq $0,%rdx + addq (%rsp,%r15,8),%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r13 + movq 8(%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %r13,-8(%rsp,%r15,8) + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq 8(%rcx,%r15,8),%rax + adcq $0,%rdx + addq 8(%rsp,%r15,8),%r11 + adcq $0,%rdx + leaq 4(%r15),%r15 + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq -16(%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r11,%rdi + adcq $0,%rdx + movq %rdi,-32(%rsp,%r15,8) + movq %rdx,%r13 + cmpq %r9,%r15 + jb .Linner4x + + mulq %rbx + addq %rax,%r10 + movq -16(%rcx,%r15,8),%rax + adcq $0,%rdx + addq -16(%rsp,%r15,8),%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r13 + movq -8(%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %r13,-24(%rsp,%r15,8) + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq -8(%rcx,%r15,8),%rax + adcq $0,%rdx + addq -8(%rsp,%r15,8),%r11 + adcq $0,%rdx + leaq 1(%r14),%r14 + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq (%rsi),%rax + adcq $0,%rdx + addq %r11,%rdi + adcq $0,%rdx + movq %rdi,-16(%rsp,%r15,8) + movq %rdx,%r13 + + xorq %rdi,%rdi + addq %r10,%r13 + adcq $0,%rdi + addq (%rsp,%r9,8),%r13 + adcq $0,%rdi + movq %r13,-8(%rsp,%r15,8) + movq %rdi,(%rsp,%r15,8) + + cmpq %r9,%r14 + jb .Louter4x + movq 16(%rsp,%r9,8),%rdi + movq 0(%rsp),%rax + pxor %xmm0,%xmm0 + movq 8(%rsp),%rdx + shrq $2,%r9 + leaq (%rsp),%rsi + xorq %r14,%r14 + + subq 0(%rcx),%rax + movq 16(%rsi),%rbx + movq 24(%rsi),%rbp + sbbq 8(%rcx),%rdx + leaq -1(%r9),%r15 + jmp .Lsub4x +.align 16 +.Lsub4x: + movq %rax,0(%rdi,%r14,8) + movq %rdx,8(%rdi,%r14,8) + sbbq 16(%rcx,%r14,8),%rbx + movq 32(%rsi,%r14,8),%rax + movq 40(%rsi,%r14,8),%rdx + sbbq 24(%rcx,%r14,8),%rbp + movq %rbx,16(%rdi,%r14,8) + movq %rbp,24(%rdi,%r14,8) + sbbq 32(%rcx,%r14,8),%rax + movq 48(%rsi,%r14,8),%rbx + movq 56(%rsi,%r14,8),%rbp + sbbq 40(%rcx,%r14,8),%rdx + leaq 4(%r14),%r14 + decq %r15 + jnz .Lsub4x + + movq %rax,0(%rdi,%r14,8) + movq 32(%rsi,%r14,8),%rax + sbbq 16(%rcx,%r14,8),%rbx + movq %rdx,8(%rdi,%r14,8) + sbbq 24(%rcx,%r14,8),%rbp + movq %rbx,16(%rdi,%r14,8) + + sbbq $0,%rax + movq %rbp,24(%rdi,%r14,8) + xorq %r14,%r14 + andq %rax,%rsi + notq %rax + movq %rdi,%rcx + andq %rax,%rcx + leaq -1(%r9),%r15 + orq %rcx,%rsi + + movdqu (%rsi),%xmm1 + movdqa %xmm0,(%rsp) + movdqu %xmm1,(%rdi) + jmp .Lcopy4x +.align 16 +.Lcopy4x: + movdqu 16(%rsi,%r14,1),%xmm2 + movdqu 32(%rsi,%r14,1),%xmm1 + movdqa %xmm0,16(%rsp,%r14,1) + movdqu %xmm2,16(%rdi,%r14,1) + movdqa %xmm0,32(%rsp,%r14,1) + movdqu %xmm1,32(%rdi,%r14,1) + leaq 32(%r14),%r14 + decq %r15 + jnz .Lcopy4x + + shlq $2,%r9 + movdqu 16(%rsi,%r14,1),%xmm2 + movdqa %xmm0,16(%rsp,%r14,1) + movdqu %xmm2,16(%rdi,%r14,1) + movq 8(%rsp,%r9,8),%rsi + movq $1,%rax + movq (%rsi),%r15 + movq 8(%rsi),%r14 + movq 16(%rsi),%r13 + movq 24(%rsi),%r12 + movq 32(%rsi),%rbp + movq 40(%rsi),%rbx + leaq 48(%rsi),%rsp +.Lmul4x_epilogue: + .byte 0xf3,0xc3 +.size bn_mul4x_mont,.-bn_mul4x_mont + + +.type bn_sqr8x_mont,@function +.align 32 +bn_sqr8x_mont: +.Lsqr8x_enter: + movq %rsp,%rax + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + movl %r9d,%r10d + shll $3,%r9d + shlq $3+2,%r10 + negq %r9 + + + + + + + leaq -64(%rsp,%r9,4),%r11 + movq (%r8),%r8 + subq %rsi,%r11 + andq $4095,%r11 + cmpq %r11,%r10 + jb .Lsqr8x_sp_alt + subq %r11,%rsp + leaq -64(%rsp,%r9,4),%rsp + jmp .Lsqr8x_sp_done + +.align 32 +.Lsqr8x_sp_alt: + leaq 4096-64(,%r9,4),%r10 + leaq -64(%rsp,%r9,4),%rsp + subq %r10,%r11 + movq $0,%r10 + cmovcq %r10,%r11 + subq %r11,%rsp +.Lsqr8x_sp_done: + andq $-64,%rsp + movq %r9,%r10 + negq %r9 + + leaq 64(%rsp,%r9,2),%r11 + movq %r8,32(%rsp) + movq %rax,40(%rsp) +.Lsqr8x_body: + + movq %r9,%rbp +.byte 102,73,15,110,211 + shrq $3+2,%rbp + movl OPENSSL_ia32cap_P+8(%rip),%eax + jmp .Lsqr8x_copy_n + +.align 32 +.Lsqr8x_copy_n: + movq 0(%rcx),%xmm0 + movq 8(%rcx),%xmm1 + movq 16(%rcx),%xmm3 + movq 24(%rcx),%xmm4 + leaq 32(%rcx),%rcx + movdqa %xmm0,0(%r11) + movdqa %xmm1,16(%r11) + movdqa %xmm3,32(%r11) + movdqa %xmm4,48(%r11) + leaq 64(%r11),%r11 + decq %rbp + jnz .Lsqr8x_copy_n + + pxor %xmm0,%xmm0 +.byte 102,72,15,110,207 +.byte 102,73,15,110,218 + call bn_sqr8x_internal + + pxor %xmm0,%xmm0 + leaq 48(%rsp),%rax + leaq 64(%rsp,%r9,2),%rdx + shrq $3+2,%r9 + movq 40(%rsp),%rsi + jmp .Lsqr8x_zero + +.align 32 +.Lsqr8x_zero: + movdqa %xmm0,0(%rax) + movdqa %xmm0,16(%rax) + movdqa %xmm0,32(%rax) + movdqa %xmm0,48(%rax) + leaq 64(%rax),%rax + movdqa %xmm0,0(%rdx) + movdqa %xmm0,16(%rdx) + movdqa %xmm0,32(%rdx) + movdqa %xmm0,48(%rdx) + leaq 64(%rdx),%rdx + decq %r9 + jnz .Lsqr8x_zero + + movq $1,%rax + movq -48(%rsi),%r15 + movq -40(%rsi),%r14 + movq -32(%rsi),%r13 + movq -24(%rsi),%r12 + movq -16(%rsi),%rbp + movq -8(%rsi),%rbx + leaq (%rsi),%rsp +.Lsqr8x_epilogue: + .byte 0xf3,0xc3 +.size bn_sqr8x_mont,.-bn_sqr8x_mont +.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 16 diff --git a/secure/lib/libcrypto/amd64/x86_64-mont5.S b/secure/lib/libcrypto/amd64/x86_64-mont5.S new file mode 100644 index 0000000..19162e8 --- /dev/null +++ b/secure/lib/libcrypto/amd64/x86_64-mont5.S @@ -0,0 +1,1844 @@ + # $FreeBSD$ +.text + + + +.globl bn_mul_mont_gather5 +.type bn_mul_mont_gather5,@function +.align 64 +bn_mul_mont_gather5: + testl $7,%r9d + jnz .Lmul_enter + jmp .Lmul4x_enter + +.align 16 +.Lmul_enter: + movl %r9d,%r9d + movq %rsp,%rax + movl 8(%rsp),%r10d + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + leaq 2(%r9),%r11 + negq %r11 + leaq (%rsp,%r11,8),%rsp + andq $-1024,%rsp + + movq %rax,8(%rsp,%r9,8) +.Lmul_body: + movq %rdx,%r12 + movq %r10,%r11 + shrq $3,%r10 + andq $7,%r11 + notq %r10 + leaq .Lmagic_masks(%rip),%rax + andq $3,%r10 + leaq 96(%r12,%r11,8),%r12 + movq 0(%rax,%r10,8),%xmm4 + movq 8(%rax,%r10,8),%xmm5 + movq 16(%rax,%r10,8),%xmm6 + movq 24(%rax,%r10,8),%xmm7 + + movq -96(%r12),%xmm0 + movq -32(%r12),%xmm1 + pand %xmm4,%xmm0 + movq 32(%r12),%xmm2 + pand %xmm5,%xmm1 + movq 96(%r12),%xmm3 + pand %xmm6,%xmm2 + por %xmm1,%xmm0 + pand %xmm7,%xmm3 + por %xmm2,%xmm0 + leaq 256(%r12),%r12 + por %xmm3,%xmm0 + +.byte 102,72,15,126,195 + + movq (%r8),%r8 + movq (%rsi),%rax + + xorq %r14,%r14 + xorq %r15,%r15 + + movq -96(%r12),%xmm0 + movq -32(%r12),%xmm1 + pand %xmm4,%xmm0 + movq 32(%r12),%xmm2 + pand %xmm5,%xmm1 + + movq %r8,%rbp + mulq %rbx + movq %rax,%r10 + movq (%rcx),%rax + + movq 96(%r12),%xmm3 + pand %xmm6,%xmm2 + por %xmm1,%xmm0 + pand %xmm7,%xmm3 + + imulq %r10,%rbp + movq %rdx,%r11 + + por %xmm2,%xmm0 + leaq 256(%r12),%r12 + por %xmm3,%xmm0 + + mulq %rbp + addq %rax,%r10 + movq 8(%rsi),%rax + adcq $0,%rdx + movq %rdx,%r13 + + leaq 1(%r15),%r15 + jmp .L1st_enter + +.align 16 +.L1st: + addq %rax,%r13 + movq (%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r11,%r13 + movq %r10,%r11 + adcq $0,%rdx + movq %r13,-16(%rsp,%r15,8) + movq %rdx,%r13 + +.L1st_enter: + mulq %rbx + addq %rax,%r11 + movq (%rcx,%r15,8),%rax + adcq $0,%rdx + leaq 1(%r15),%r15 + movq %rdx,%r10 + + mulq %rbp + cmpq %r9,%r15 + jne .L1st + +.byte 102,72,15,126,195 + + addq %rax,%r13 + movq (%rsi),%rax + adcq $0,%rdx + addq %r11,%r13 + adcq $0,%rdx + movq %r13,-16(%rsp,%r15,8) + movq %rdx,%r13 + movq %r10,%r11 + + xorq %rdx,%rdx + addq %r11,%r13 + adcq $0,%rdx + movq %r13,-8(%rsp,%r9,8) + movq %rdx,(%rsp,%r9,8) + + leaq 1(%r14),%r14 + jmp .Louter +.align 16 +.Louter: + xorq %r15,%r15 + movq %r8,%rbp + movq (%rsp),%r10 + + movq -96(%r12),%xmm0 + movq -32(%r12),%xmm1 + pand %xmm4,%xmm0 + movq 32(%r12),%xmm2 + pand %xmm5,%xmm1 + + mulq %rbx + addq %rax,%r10 + movq (%rcx),%rax + adcq $0,%rdx + + movq 96(%r12),%xmm3 + pand %xmm6,%xmm2 + por %xmm1,%xmm0 + pand %xmm7,%xmm3 + + imulq %r10,%rbp + movq %rdx,%r11 + + por %xmm2,%xmm0 + leaq 256(%r12),%r12 + por %xmm3,%xmm0 + + mulq %rbp + addq %rax,%r10 + movq 8(%rsi),%rax + adcq $0,%rdx + movq 8(%rsp),%r10 + movq %rdx,%r13 + + leaq 1(%r15),%r15 + jmp .Linner_enter + +.align 16 +.Linner: + addq %rax,%r13 + movq (%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r10,%r13 + movq (%rsp,%r15,8),%r10 + adcq $0,%rdx + movq %r13,-16(%rsp,%r15,8) + movq %rdx,%r13 + +.Linner_enter: + mulq %rbx + addq %rax,%r11 + movq (%rcx,%r15,8),%rax + adcq $0,%rdx + addq %r11,%r10 + movq %rdx,%r11 + adcq $0,%r11 + leaq 1(%r15),%r15 + + mulq %rbp + cmpq %r9,%r15 + jne .Linner + +.byte 102,72,15,126,195 + + addq %rax,%r13 + movq (%rsi),%rax + adcq $0,%rdx + addq %r10,%r13 + movq (%rsp,%r15,8),%r10 + adcq $0,%rdx + movq %r13,-16(%rsp,%r15,8) + movq %rdx,%r13 + + xorq %rdx,%rdx + addq %r11,%r13 + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %r13,-8(%rsp,%r9,8) + movq %rdx,(%rsp,%r9,8) + + leaq 1(%r14),%r14 + cmpq %r9,%r14 + jb .Louter + + xorq %r14,%r14 + movq (%rsp),%rax + leaq (%rsp),%rsi + movq %r9,%r15 + jmp .Lsub +.align 16 +.Lsub: sbbq (%rcx,%r14,8),%rax + movq %rax,(%rdi,%r14,8) + movq 8(%rsi,%r14,8),%rax + leaq 1(%r14),%r14 + decq %r15 + jnz .Lsub + + sbbq $0,%rax + xorq %r14,%r14 + andq %rax,%rsi + notq %rax + movq %rdi,%rcx + andq %rax,%rcx + movq %r9,%r15 + orq %rcx,%rsi +.align 16 +.Lcopy: + movq (%rsi,%r14,8),%rax + movq %r14,(%rsp,%r14,8) + movq %rax,(%rdi,%r14,8) + leaq 1(%r14),%r14 + subq $1,%r15 + jnz .Lcopy + + movq 8(%rsp,%r9,8),%rsi + movq $1,%rax + movq -48(%rsi),%r15 + movq -40(%rsi),%r14 + movq -32(%rsi),%r13 + movq -24(%rsi),%r12 + movq -16(%rsi),%rbp + movq -8(%rsi),%rbx + leaq (%rsi),%rsp +.Lmul_epilogue: + .byte 0xf3,0xc3 +.size bn_mul_mont_gather5,.-bn_mul_mont_gather5 +.type bn_mul4x_mont_gather5,@function +.align 32 +bn_mul4x_mont_gather5: +.Lmul4x_enter: +.byte 0x67 + movq %rsp,%rax + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 +.byte 0x67 + movl %r9d,%r10d + shll $3,%r9d + shll $3+2,%r10d + negq %r9 + + + + + + + + + leaq -64(%rsp,%r9,2),%r11 + subq %rsi,%r11 + andq $4095,%r11 + cmpq %r11,%r10 + jb .Lmul4xsp_alt + subq %r11,%rsp + leaq -64(%rsp,%r9,2),%rsp + jmp .Lmul4xsp_done + +.align 32 +.Lmul4xsp_alt: + leaq 4096-64(,%r9,2),%r10 + leaq -64(%rsp,%r9,2),%rsp + subq %r10,%r11 + movq $0,%r10 + cmovcq %r10,%r11 + subq %r11,%rsp +.Lmul4xsp_done: + andq $-64,%rsp + negq %r9 + + movq %rax,40(%rsp) +.Lmul4x_body: + + call mul4x_internal + + movq 40(%rsp),%rsi + movq $1,%rax + movq -48(%rsi),%r15 + movq -40(%rsi),%r14 + movq -32(%rsi),%r13 + movq -24(%rsi),%r12 + movq -16(%rsi),%rbp + movq -8(%rsi),%rbx + leaq (%rsi),%rsp +.Lmul4x_epilogue: + .byte 0xf3,0xc3 +.size bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5 + +.type mul4x_internal,@function +.align 32 +mul4x_internal: + shlq $5,%r9 + movl 8(%rax),%r10d + leaq 256(%rdx,%r9,1),%r13 + shrq $5,%r9 + movq %r10,%r11 + shrq $3,%r10 + andq $7,%r11 + notq %r10 + leaq .Lmagic_masks(%rip),%rax + andq $3,%r10 + leaq 96(%rdx,%r11,8),%r12 + movq 0(%rax,%r10,8),%xmm4 + movq 8(%rax,%r10,8),%xmm5 + addq $7,%r11 + movq 16(%rax,%r10,8),%xmm6 + movq 24(%rax,%r10,8),%xmm7 + andq $7,%r11 + + movq -96(%r12),%xmm0 + leaq 256(%r12),%r14 + movq -32(%r12),%xmm1 + pand %xmm4,%xmm0 + movq 32(%r12),%xmm2 + pand %xmm5,%xmm1 + movq 96(%r12),%xmm3 + pand %xmm6,%xmm2 +.byte 0x67 + por %xmm1,%xmm0 + movq -96(%r14),%xmm1 +.byte 0x67 + pand %xmm7,%xmm3 +.byte 0x67 + por %xmm2,%xmm0 + movq -32(%r14),%xmm2 +.byte 0x67 + pand %xmm4,%xmm1 +.byte 0x67 + por %xmm3,%xmm0 + movq 32(%r14),%xmm3 + +.byte 102,72,15,126,195 + movq 96(%r14),%xmm0 + movq %r13,16+8(%rsp) + movq %rdi,56+8(%rsp) + + movq (%r8),%r8 + movq (%rsi),%rax + leaq (%rsi,%r9,1),%rsi + negq %r9 + + movq %r8,%rbp + mulq %rbx + movq %rax,%r10 + movq (%rcx),%rax + + pand %xmm5,%xmm2 + pand %xmm6,%xmm3 + por %xmm2,%xmm1 + + imulq %r10,%rbp + + + + + + + + leaq 64+8(%rsp,%r11,8),%r14 + movq %rdx,%r11 + + pand %xmm7,%xmm0 + por %xmm3,%xmm1 + leaq 512(%r12),%r12 + por %xmm1,%xmm0 + + mulq %rbp + addq %rax,%r10 + movq 8(%rsi,%r9,1),%rax + adcq $0,%rdx + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq 16(%rcx),%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq 16(%rsi,%r9,1),%rax + adcq $0,%rdx + addq %r11,%rdi + leaq 32(%r9),%r15 + leaq 64(%rcx),%rcx + adcq $0,%rdx + movq %rdi,(%r14) + movq %rdx,%r13 + jmp .L1st4x + +.align 32 +.L1st4x: + mulq %rbx + addq %rax,%r10 + movq -32(%rcx),%rax + leaq 32(%r14),%r14 + adcq $0,%rdx + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r13 + movq -8(%rsi,%r15,1),%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %r13,-24(%r14) + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq -16(%rcx),%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq (%rsi,%r15,1),%rax + adcq $0,%rdx + addq %r11,%rdi + adcq $0,%rdx + movq %rdi,-16(%r14) + movq %rdx,%r13 + + mulq %rbx + addq %rax,%r10 + movq 0(%rcx),%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r13 + movq 8(%rsi,%r15,1),%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %r13,-8(%r14) + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq 16(%rcx),%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq 16(%rsi,%r15,1),%rax + adcq $0,%rdx + addq %r11,%rdi + leaq 64(%rcx),%rcx + adcq $0,%rdx + movq %rdi,(%r14) + movq %rdx,%r13 + + addq $32,%r15 + jnz .L1st4x + + mulq %rbx + addq %rax,%r10 + movq -32(%rcx),%rax + leaq 32(%r14),%r14 + adcq $0,%rdx + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r13 + movq -8(%rsi),%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %r13,-24(%r14) + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq -16(%rcx),%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq (%rsi,%r9,1),%rax + adcq $0,%rdx + addq %r11,%rdi + adcq $0,%rdx + movq %rdi,-16(%r14) + movq %rdx,%r13 + +.byte 102,72,15,126,195 + leaq (%rcx,%r9,2),%rcx + + xorq %rdi,%rdi + addq %r10,%r13 + adcq $0,%rdi + movq %r13,-8(%r14) + + jmp .Louter4x + +.align 32 +.Louter4x: + movq (%r14,%r9,1),%r10 + movq %r8,%rbp + mulq %rbx + addq %rax,%r10 + movq (%rcx),%rax + adcq $0,%rdx + + movq -96(%r12),%xmm0 + movq -32(%r12),%xmm1 + pand %xmm4,%xmm0 + movq 32(%r12),%xmm2 + pand %xmm5,%xmm1 + movq 96(%r12),%xmm3 + + imulq %r10,%rbp +.byte 0x67 + movq %rdx,%r11 + movq %rdi,(%r14) + + pand %xmm6,%xmm2 + por %xmm1,%xmm0 + pand %xmm7,%xmm3 + por %xmm2,%xmm0 + leaq (%r14,%r9,1),%r14 + leaq 256(%r12),%r12 + por %xmm3,%xmm0 + + mulq %rbp + addq %rax,%r10 + movq 8(%rsi,%r9,1),%rax + adcq $0,%rdx + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq 16(%rcx),%rax + adcq $0,%rdx + addq 8(%r14),%r11 + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq 16(%rsi,%r9,1),%rax + adcq $0,%rdx + addq %r11,%rdi + leaq 32(%r9),%r15 + leaq 64(%rcx),%rcx + adcq $0,%rdx + movq %rdx,%r13 + jmp .Linner4x + +.align 32 +.Linner4x: + mulq %rbx + addq %rax,%r10 + movq -32(%rcx),%rax + adcq $0,%rdx + addq 16(%r14),%r10 + leaq 32(%r14),%r14 + adcq $0,%rdx + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r13 + movq -8(%rsi,%r15,1),%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdi,-32(%r14) + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq -16(%rcx),%rax + adcq $0,%rdx + addq -8(%r14),%r11 + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq (%rsi,%r15,1),%rax + adcq $0,%rdx + addq %r11,%rdi + adcq $0,%rdx + movq %r13,-24(%r14) + movq %rdx,%r13 + + mulq %rbx + addq %rax,%r10 + movq 0(%rcx),%rax + adcq $0,%rdx + addq (%r14),%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r13 + movq 8(%rsi,%r15,1),%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdi,-16(%r14) + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq 16(%rcx),%rax + adcq $0,%rdx + addq 8(%r14),%r11 + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq 16(%rsi,%r15,1),%rax + adcq $0,%rdx + addq %r11,%rdi + leaq 64(%rcx),%rcx + adcq $0,%rdx + movq %r13,-8(%r14) + movq %rdx,%r13 + + addq $32,%r15 + jnz .Linner4x + + mulq %rbx + addq %rax,%r10 + movq -32(%rcx),%rax + adcq $0,%rdx + addq 16(%r14),%r10 + leaq 32(%r14),%r14 + adcq $0,%rdx + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r13 + movq -8(%rsi),%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdi,-32(%r14) + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq %rbp,%rax + movq -16(%rcx),%rbp + adcq $0,%rdx + addq -8(%r14),%r11 + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq (%rsi,%r9,1),%rax + adcq $0,%rdx + addq %r11,%rdi + adcq $0,%rdx + movq %r13,-24(%r14) + movq %rdx,%r13 + +.byte 102,72,15,126,195 + movq %rdi,-16(%r14) + leaq (%rcx,%r9,2),%rcx + + xorq %rdi,%rdi + addq %r10,%r13 + adcq $0,%rdi + addq (%r14),%r13 + adcq $0,%rdi + movq %r13,-8(%r14) + + cmpq 16+8(%rsp),%r12 + jb .Louter4x + subq %r13,%rbp + adcq %r15,%r15 + orq %r15,%rdi + xorq $1,%rdi + leaq (%r14,%r9,1),%rbx + leaq (%rcx,%rdi,8),%rbp + movq %r9,%rcx + sarq $3+2,%rcx + movq 56+8(%rsp),%rdi + jmp .Lsqr4x_sub +.size mul4x_internal,.-mul4x_internal +.globl bn_power5 +.type bn_power5,@function +.align 32 +bn_power5: + movq %rsp,%rax + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + movl %r9d,%r10d + shll $3,%r9d + shll $3+2,%r10d + negq %r9 + movq (%r8),%r8 + + + + + + + + leaq -64(%rsp,%r9,2),%r11 + subq %rsi,%r11 + andq $4095,%r11 + cmpq %r11,%r10 + jb .Lpwr_sp_alt + subq %r11,%rsp + leaq -64(%rsp,%r9,2),%rsp + jmp .Lpwr_sp_done + +.align 32 +.Lpwr_sp_alt: + leaq 4096-64(,%r9,2),%r10 + leaq -64(%rsp,%r9,2),%rsp + subq %r10,%r11 + movq $0,%r10 + cmovcq %r10,%r11 + subq %r11,%rsp +.Lpwr_sp_done: + andq $-64,%rsp + movq %r9,%r10 + negq %r9 + + + + + + + + + + + movq %r8,32(%rsp) + movq %rax,40(%rsp) +.Lpower5_body: +.byte 102,72,15,110,207 +.byte 102,72,15,110,209 +.byte 102,73,15,110,218 +.byte 102,72,15,110,226 + + call __bn_sqr8x_internal + call __bn_sqr8x_internal + call __bn_sqr8x_internal + call __bn_sqr8x_internal + call __bn_sqr8x_internal + +.byte 102,72,15,126,209 +.byte 102,72,15,126,226 + movq %rsi,%rdi + movq 40(%rsp),%rax + leaq 32(%rsp),%r8 + + call mul4x_internal + + movq 40(%rsp),%rsi + movq $1,%rax + movq -48(%rsi),%r15 + movq -40(%rsi),%r14 + movq -32(%rsi),%r13 + movq -24(%rsi),%r12 + movq -16(%rsi),%rbp + movq -8(%rsi),%rbx + leaq (%rsi),%rsp +.Lpower5_epilogue: + .byte 0xf3,0xc3 +.size bn_power5,.-bn_power5 + +.globl bn_sqr8x_internal +.hidden bn_sqr8x_internal +.type bn_sqr8x_internal,@function +.align 32 +bn_sqr8x_internal: +__bn_sqr8x_internal: + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + leaq 32(%r10),%rbp + leaq (%rsi,%r9,1),%rsi + + movq %r9,%rcx + + + movq -32(%rsi,%rbp,1),%r14 + leaq 48+8(%rsp,%r9,2),%rdi + movq -24(%rsi,%rbp,1),%rax + leaq -32(%rdi,%rbp,1),%rdi + movq -16(%rsi,%rbp,1),%rbx + movq %rax,%r15 + + mulq %r14 + movq %rax,%r10 + movq %rbx,%rax + movq %rdx,%r11 + movq %r10,-24(%rdi,%rbp,1) + + mulq %r14 + addq %rax,%r11 + movq %rbx,%rax + adcq $0,%rdx + movq %r11,-16(%rdi,%rbp,1) + movq %rdx,%r10 + + + movq -8(%rsi,%rbp,1),%rbx + mulq %r15 + movq %rax,%r12 + movq %rbx,%rax + movq %rdx,%r13 + + leaq (%rbp),%rcx + mulq %r14 + addq %rax,%r10 + movq %rbx,%rax + movq %rdx,%r11 + adcq $0,%r11 + addq %r12,%r10 + adcq $0,%r11 + movq %r10,-8(%rdi,%rcx,1) + jmp .Lsqr4x_1st + +.align 32 +.Lsqr4x_1st: + movq (%rsi,%rcx,1),%rbx + mulq %r15 + addq %rax,%r13 + movq %rbx,%rax + movq %rdx,%r12 + adcq $0,%r12 + + mulq %r14 + addq %rax,%r11 + movq %rbx,%rax + movq 8(%rsi,%rcx,1),%rbx + movq %rdx,%r10 + adcq $0,%r10 + addq %r13,%r11 + adcq $0,%r10 + + + mulq %r15 + addq %rax,%r12 + movq %rbx,%rax + movq %r11,(%rdi,%rcx,1) + movq %rdx,%r13 + adcq $0,%r13 + + mulq %r14 + addq %rax,%r10 + movq %rbx,%rax + movq 16(%rsi,%rcx,1),%rbx + movq %rdx,%r11 + adcq $0,%r11 + addq %r12,%r10 + adcq $0,%r11 + + mulq %r15 + addq %rax,%r13 + movq %rbx,%rax + movq %r10,8(%rdi,%rcx,1) + movq %rdx,%r12 + adcq $0,%r12 + + mulq %r14 + addq %rax,%r11 + movq %rbx,%rax + movq 24(%rsi,%rcx,1),%rbx + movq %rdx,%r10 + adcq $0,%r10 + addq %r13,%r11 + adcq $0,%r10 + + + mulq %r15 + addq %rax,%r12 + movq %rbx,%rax + movq %r11,16(%rdi,%rcx,1) + movq %rdx,%r13 + adcq $0,%r13 + leaq 32(%rcx),%rcx + + mulq %r14 + addq %rax,%r10 + movq %rbx,%rax + movq %rdx,%r11 + adcq $0,%r11 + addq %r12,%r10 + adcq $0,%r11 + movq %r10,-8(%rdi,%rcx,1) + + cmpq $0,%rcx + jne .Lsqr4x_1st + + mulq %r15 + addq %rax,%r13 + leaq 16(%rbp),%rbp + adcq $0,%rdx + addq %r11,%r13 + adcq $0,%rdx + + movq %r13,(%rdi) + movq %rdx,%r12 + movq %rdx,8(%rdi) + jmp .Lsqr4x_outer + +.align 32 +.Lsqr4x_outer: + movq -32(%rsi,%rbp,1),%r14 + leaq 48+8(%rsp,%r9,2),%rdi + movq -24(%rsi,%rbp,1),%rax + leaq -32(%rdi,%rbp,1),%rdi + movq -16(%rsi,%rbp,1),%rbx + movq %rax,%r15 + + mulq %r14 + movq -24(%rdi,%rbp,1),%r10 + addq %rax,%r10 + movq %rbx,%rax + adcq $0,%rdx + movq %r10,-24(%rdi,%rbp,1) + movq %rdx,%r11 + + mulq %r14 + addq %rax,%r11 + movq %rbx,%rax + adcq $0,%rdx + addq -16(%rdi,%rbp,1),%r11 + movq %rdx,%r10 + adcq $0,%r10 + movq %r11,-16(%rdi,%rbp,1) + + xorq %r12,%r12 + + movq -8(%rsi,%rbp,1),%rbx + mulq %r15 + addq %rax,%r12 + movq %rbx,%rax + adcq $0,%rdx + addq -8(%rdi,%rbp,1),%r12 + movq %rdx,%r13 + adcq $0,%r13 + + mulq %r14 + addq %rax,%r10 + movq %rbx,%rax + adcq $0,%rdx + addq %r12,%r10 + movq %rdx,%r11 + adcq $0,%r11 + movq %r10,-8(%rdi,%rbp,1) + + leaq (%rbp),%rcx + jmp .Lsqr4x_inner + +.align 32 +.Lsqr4x_inner: + movq (%rsi,%rcx,1),%rbx + mulq %r15 + addq %rax,%r13 + movq %rbx,%rax + movq %rdx,%r12 + adcq $0,%r12 + addq (%rdi,%rcx,1),%r13 + adcq $0,%r12 + +.byte 0x67 + mulq %r14 + addq %rax,%r11 + movq %rbx,%rax + movq 8(%rsi,%rcx,1),%rbx + movq %rdx,%r10 + adcq $0,%r10 + addq %r13,%r11 + adcq $0,%r10 + + mulq %r15 + addq %rax,%r12 + movq %r11,(%rdi,%rcx,1) + movq %rbx,%rax + movq %rdx,%r13 + adcq $0,%r13 + addq 8(%rdi,%rcx,1),%r12 + leaq 16(%rcx),%rcx + adcq $0,%r13 + + mulq %r14 + addq %rax,%r10 + movq %rbx,%rax + adcq $0,%rdx + addq %r12,%r10 + movq %rdx,%r11 + adcq $0,%r11 + movq %r10,-8(%rdi,%rcx,1) + + cmpq $0,%rcx + jne .Lsqr4x_inner + +.byte 0x67 + mulq %r15 + addq %rax,%r13 + adcq $0,%rdx + addq %r11,%r13 + adcq $0,%rdx + + movq %r13,(%rdi) + movq %rdx,%r12 + movq %rdx,8(%rdi) + + addq $16,%rbp + jnz .Lsqr4x_outer + + + movq -32(%rsi),%r14 + leaq 48+8(%rsp,%r9,2),%rdi + movq -24(%rsi),%rax + leaq -32(%rdi,%rbp,1),%rdi + movq -16(%rsi),%rbx + movq %rax,%r15 + + mulq %r14 + addq %rax,%r10 + movq %rbx,%rax + movq %rdx,%r11 + adcq $0,%r11 + + mulq %r14 + addq %rax,%r11 + movq %rbx,%rax + movq %r10,-24(%rdi) + movq %rdx,%r10 + adcq $0,%r10 + addq %r13,%r11 + movq -8(%rsi),%rbx + adcq $0,%r10 + + mulq %r15 + addq %rax,%r12 + movq %rbx,%rax + movq %r11,-16(%rdi) + movq %rdx,%r13 + adcq $0,%r13 + + mulq %r14 + addq %rax,%r10 + movq %rbx,%rax + movq %rdx,%r11 + adcq $0,%r11 + addq %r12,%r10 + adcq $0,%r11 + movq %r10,-8(%rdi) + + mulq %r15 + addq %rax,%r13 + movq -16(%rsi),%rax + adcq $0,%rdx + addq %r11,%r13 + adcq $0,%rdx + + movq %r13,(%rdi) + movq %rdx,%r12 + movq %rdx,8(%rdi) + + mulq %rbx + addq $16,%rbp + xorq %r14,%r14 + subq %r9,%rbp + xorq %r15,%r15 + + addq %r12,%rax + adcq $0,%rdx + movq %rax,8(%rdi) + movq %rdx,16(%rdi) + movq %r15,24(%rdi) + + movq -16(%rsi,%rbp,1),%rax + leaq 48+8(%rsp),%rdi + xorq %r10,%r10 + movq 8(%rdi),%r11 + + leaq (%r14,%r10,2),%r12 + shrq $63,%r10 + leaq (%rcx,%r11,2),%r13 + shrq $63,%r11 + orq %r10,%r13 + movq 16(%rdi),%r10 + movq %r11,%r14 + mulq %rax + negq %r15 + movq 24(%rdi),%r11 + adcq %rax,%r12 + movq -8(%rsi,%rbp,1),%rax + movq %r12,(%rdi) + adcq %rdx,%r13 + + leaq (%r14,%r10,2),%rbx + movq %r13,8(%rdi) + sbbq %r15,%r15 + shrq $63,%r10 + leaq (%rcx,%r11,2),%r8 + shrq $63,%r11 + orq %r10,%r8 + movq 32(%rdi),%r10 + movq %r11,%r14 + mulq %rax + negq %r15 + movq 40(%rdi),%r11 + adcq %rax,%rbx + movq 0(%rsi,%rbp,1),%rax + movq %rbx,16(%rdi) + adcq %rdx,%r8 + leaq 16(%rbp),%rbp + movq %r8,24(%rdi) + sbbq %r15,%r15 + leaq 64(%rdi),%rdi + jmp .Lsqr4x_shift_n_add + +.align 32 +.Lsqr4x_shift_n_add: + leaq (%r14,%r10,2),%r12 + shrq $63,%r10 + leaq (%rcx,%r11,2),%r13 + shrq $63,%r11 + orq %r10,%r13 + movq -16(%rdi),%r10 + movq %r11,%r14 + mulq %rax + negq %r15 + movq -8(%rdi),%r11 + adcq %rax,%r12 + movq -8(%rsi,%rbp,1),%rax + movq %r12,-32(%rdi) + adcq %rdx,%r13 + + leaq (%r14,%r10,2),%rbx + movq %r13,-24(%rdi) + sbbq %r15,%r15 + shrq $63,%r10 + leaq (%rcx,%r11,2),%r8 + shrq $63,%r11 + orq %r10,%r8 + movq 0(%rdi),%r10 + movq %r11,%r14 + mulq %rax + negq %r15 + movq 8(%rdi),%r11 + adcq %rax,%rbx + movq 0(%rsi,%rbp,1),%rax + movq %rbx,-16(%rdi) + adcq %rdx,%r8 + + leaq (%r14,%r10,2),%r12 + movq %r8,-8(%rdi) + sbbq %r15,%r15 + shrq $63,%r10 + leaq (%rcx,%r11,2),%r13 + shrq $63,%r11 + orq %r10,%r13 + movq 16(%rdi),%r10 + movq %r11,%r14 + mulq %rax + negq %r15 + movq 24(%rdi),%r11 + adcq %rax,%r12 + movq 8(%rsi,%rbp,1),%rax + movq %r12,0(%rdi) + adcq %rdx,%r13 + + leaq (%r14,%r10,2),%rbx + movq %r13,8(%rdi) + sbbq %r15,%r15 + shrq $63,%r10 + leaq (%rcx,%r11,2),%r8 + shrq $63,%r11 + orq %r10,%r8 + movq 32(%rdi),%r10 + movq %r11,%r14 + mulq %rax + negq %r15 + movq 40(%rdi),%r11 + adcq %rax,%rbx + movq 16(%rsi,%rbp,1),%rax + movq %rbx,16(%rdi) + adcq %rdx,%r8 + movq %r8,24(%rdi) + sbbq %r15,%r15 + leaq 64(%rdi),%rdi + addq $32,%rbp + jnz .Lsqr4x_shift_n_add + + leaq (%r14,%r10,2),%r12 +.byte 0x67 + shrq $63,%r10 + leaq (%rcx,%r11,2),%r13 + shrq $63,%r11 + orq %r10,%r13 + movq -16(%rdi),%r10 + movq %r11,%r14 + mulq %rax + negq %r15 + movq -8(%rdi),%r11 + adcq %rax,%r12 + movq -8(%rsi),%rax + movq %r12,-32(%rdi) + adcq %rdx,%r13 + + leaq (%r14,%r10,2),%rbx + movq %r13,-24(%rdi) + sbbq %r15,%r15 + shrq $63,%r10 + leaq (%rcx,%r11,2),%r8 + shrq $63,%r11 + orq %r10,%r8 + mulq %rax + negq %r15 + adcq %rax,%rbx + adcq %rdx,%r8 + movq %rbx,-16(%rdi) + movq %r8,-8(%rdi) +.byte 102,72,15,126,213 +sqr8x_reduction: + xorq %rax,%rax + leaq (%rbp,%r9,2),%rcx + leaq 48+8(%rsp,%r9,2),%rdx + movq %rcx,0+8(%rsp) + leaq 48+8(%rsp,%r9,1),%rdi + movq %rdx,8+8(%rsp) + negq %r9 + jmp .L8x_reduction_loop + +.align 32 +.L8x_reduction_loop: + leaq (%rdi,%r9,1),%rdi +.byte 0x66 + movq 0(%rdi),%rbx + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + movq 32(%rdi),%r12 + movq 40(%rdi),%r13 + movq 48(%rdi),%r14 + movq 56(%rdi),%r15 + movq %rax,(%rdx) + leaq 64(%rdi),%rdi + +.byte 0x67 + movq %rbx,%r8 + imulq 32+8(%rsp),%rbx + movq 0(%rbp),%rax + movl $8,%ecx + jmp .L8x_reduce + +.align 32 +.L8x_reduce: + mulq %rbx + movq 16(%rbp),%rax + negq %r8 + movq %rdx,%r8 + adcq $0,%r8 + + mulq %rbx + addq %rax,%r9 + movq 32(%rbp),%rax + adcq $0,%rdx + addq %r9,%r8 + movq %rbx,48-8+8(%rsp,%rcx,8) + movq %rdx,%r9 + adcq $0,%r9 + + mulq %rbx + addq %rax,%r10 + movq 48(%rbp),%rax + adcq $0,%rdx + addq %r10,%r9 + movq 32+8(%rsp),%rsi + movq %rdx,%r10 + adcq $0,%r10 + + mulq %rbx + addq %rax,%r11 + movq 64(%rbp),%rax + adcq $0,%rdx + imulq %r8,%rsi + addq %r11,%r10 + movq %rdx,%r11 + adcq $0,%r11 + + mulq %rbx + addq %rax,%r12 + movq 80(%rbp),%rax + adcq $0,%rdx + addq %r12,%r11 + movq %rdx,%r12 + adcq $0,%r12 + + mulq %rbx + addq %rax,%r13 + movq 96(%rbp),%rax + adcq $0,%rdx + addq %r13,%r12 + movq %rdx,%r13 + adcq $0,%r13 + + mulq %rbx + addq %rax,%r14 + movq 112(%rbp),%rax + adcq $0,%rdx + addq %r14,%r13 + movq %rdx,%r14 + adcq $0,%r14 + + mulq %rbx + movq %rsi,%rbx + addq %rax,%r15 + movq 0(%rbp),%rax + adcq $0,%rdx + addq %r15,%r14 + movq %rdx,%r15 + adcq $0,%r15 + + decl %ecx + jnz .L8x_reduce + + leaq 128(%rbp),%rbp + xorq %rax,%rax + movq 8+8(%rsp),%rdx + cmpq 0+8(%rsp),%rbp + jae .L8x_no_tail + +.byte 0x66 + addq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + adcq 48(%rdi),%r14 + adcq 56(%rdi),%r15 + sbbq %rsi,%rsi + + movq 48+56+8(%rsp),%rbx + movl $8,%ecx + movq 0(%rbp),%rax + jmp .L8x_tail + +.align 32 +.L8x_tail: + mulq %rbx + addq %rax,%r8 + movq 16(%rbp),%rax + movq %r8,(%rdi) + movq %rdx,%r8 + adcq $0,%r8 + + mulq %rbx + addq %rax,%r9 + movq 32(%rbp),%rax + adcq $0,%rdx + addq %r9,%r8 + leaq 8(%rdi),%rdi + movq %rdx,%r9 + adcq $0,%r9 + + mulq %rbx + addq %rax,%r10 + movq 48(%rbp),%rax + adcq $0,%rdx + addq %r10,%r9 + movq %rdx,%r10 + adcq $0,%r10 + + mulq %rbx + addq %rax,%r11 + movq 64(%rbp),%rax + adcq $0,%rdx + addq %r11,%r10 + movq %rdx,%r11 + adcq $0,%r11 + + mulq %rbx + addq %rax,%r12 + movq 80(%rbp),%rax + adcq $0,%rdx + addq %r12,%r11 + movq %rdx,%r12 + adcq $0,%r12 + + mulq %rbx + addq %rax,%r13 + movq 96(%rbp),%rax + adcq $0,%rdx + addq %r13,%r12 + movq %rdx,%r13 + adcq $0,%r13 + + mulq %rbx + addq %rax,%r14 + movq 112(%rbp),%rax + adcq $0,%rdx + addq %r14,%r13 + movq %rdx,%r14 + adcq $0,%r14 + + mulq %rbx + movq 48-16+8(%rsp,%rcx,8),%rbx + addq %rax,%r15 + adcq $0,%rdx + addq %r15,%r14 + movq 0(%rbp),%rax + movq %rdx,%r15 + adcq $0,%r15 + + decl %ecx + jnz .L8x_tail + + leaq 128(%rbp),%rbp + movq 8+8(%rsp),%rdx + cmpq 0+8(%rsp),%rbp + jae .L8x_tail_done + + movq 48+56+8(%rsp),%rbx + negq %rsi + movq 0(%rbp),%rax + adcq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + adcq 48(%rdi),%r14 + adcq 56(%rdi),%r15 + sbbq %rsi,%rsi + + movl $8,%ecx + jmp .L8x_tail + +.align 32 +.L8x_tail_done: + addq (%rdx),%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + adcq $0,%r14 + adcq $0,%r15 + + + xorq %rax,%rax + + negq %rsi +.L8x_no_tail: + adcq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + adcq 48(%rdi),%r14 + adcq 56(%rdi),%r15 + adcq $0,%rax + movq -16(%rbp),%rcx + xorq %rsi,%rsi + +.byte 102,72,15,126,213 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) +.byte 102,73,15,126,217 + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq %r14,48(%rdi) + movq %r15,56(%rdi) + leaq 64(%rdi),%rdi + + cmpq %rdx,%rdi + jb .L8x_reduction_loop + + subq %r15,%rcx + leaq (%rdi,%r9,1),%rbx + adcq %rsi,%rsi + movq %r9,%rcx + orq %rsi,%rax +.byte 102,72,15,126,207 + xorq $1,%rax +.byte 102,72,15,126,206 + leaq (%rbp,%rax,8),%rbp + sarq $3+2,%rcx + jmp .Lsqr4x_sub + +.align 32 +.Lsqr4x_sub: +.byte 0x66 + movq 0(%rbx),%r12 + movq 8(%rbx),%r13 + sbbq 0(%rbp),%r12 + movq 16(%rbx),%r14 + sbbq 16(%rbp),%r13 + movq 24(%rbx),%r15 + leaq 32(%rbx),%rbx + sbbq 32(%rbp),%r14 + movq %r12,0(%rdi) + sbbq 48(%rbp),%r15 + leaq 64(%rbp),%rbp + movq %r13,8(%rdi) + movq %r14,16(%rdi) + movq %r15,24(%rdi) + leaq 32(%rdi),%rdi + + incq %rcx + jnz .Lsqr4x_sub + movq %r9,%r10 + negq %r9 + .byte 0xf3,0xc3 +.size bn_sqr8x_internal,.-bn_sqr8x_internal +.globl bn_from_montgomery +.type bn_from_montgomery,@function +.align 32 +bn_from_montgomery: + testl $7,%r9d + jz bn_from_mont8x + xorl %eax,%eax + .byte 0xf3,0xc3 +.size bn_from_montgomery,.-bn_from_montgomery + +.type bn_from_mont8x,@function +.align 32 +bn_from_mont8x: +.byte 0x67 + movq %rsp,%rax + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 +.byte 0x67 + movl %r9d,%r10d + shll $3,%r9d + shll $3+2,%r10d + negq %r9 + movq (%r8),%r8 + + + + + + + + leaq -64(%rsp,%r9,2),%r11 + subq %rsi,%r11 + andq $4095,%r11 + cmpq %r11,%r10 + jb .Lfrom_sp_alt + subq %r11,%rsp + leaq -64(%rsp,%r9,2),%rsp + jmp .Lfrom_sp_done + +.align 32 +.Lfrom_sp_alt: + leaq 4096-64(,%r9,2),%r10 + leaq -64(%rsp,%r9,2),%rsp + subq %r10,%r11 + movq $0,%r10 + cmovcq %r10,%r11 + subq %r11,%rsp +.Lfrom_sp_done: + andq $-64,%rsp + movq %r9,%r10 + negq %r9 + + + + + + + + + + + movq %r8,32(%rsp) + movq %rax,40(%rsp) +.Lfrom_body: + movq %r9,%r11 + leaq 48(%rsp),%rax + pxor %xmm0,%xmm0 + jmp .Lmul_by_1 + +.align 32 +.Lmul_by_1: + movdqu (%rsi),%xmm1 + movdqu 16(%rsi),%xmm2 + movdqu 32(%rsi),%xmm3 + movdqa %xmm0,(%rax,%r9,1) + movdqu 48(%rsi),%xmm4 + movdqa %xmm0,16(%rax,%r9,1) +.byte 0x48,0x8d,0xb6,0x40,0x00,0x00,0x00 + movdqa %xmm1,(%rax) + movdqa %xmm0,32(%rax,%r9,1) + movdqa %xmm2,16(%rax) + movdqa %xmm0,48(%rax,%r9,1) + movdqa %xmm3,32(%rax) + movdqa %xmm4,48(%rax) + leaq 64(%rax),%rax + subq $64,%r11 + jnz .Lmul_by_1 + +.byte 102,72,15,110,207 +.byte 102,72,15,110,209 +.byte 0x67 + movq %rcx,%rbp +.byte 102,73,15,110,218 + call sqr8x_reduction + + pxor %xmm0,%xmm0 + leaq 48(%rsp),%rax + movq 40(%rsp),%rsi + jmp .Lfrom_mont_zero + +.align 32 +.Lfrom_mont_zero: + movdqa %xmm0,0(%rax) + movdqa %xmm0,16(%rax) + movdqa %xmm0,32(%rax) + movdqa %xmm0,48(%rax) + leaq 64(%rax),%rax + subq $32,%r9 + jnz .Lfrom_mont_zero + + movq $1,%rax + movq -48(%rsi),%r15 + movq -40(%rsi),%r14 + movq -32(%rsi),%r13 + movq -24(%rsi),%r12 + movq -16(%rsi),%rbp + movq -8(%rsi),%rbx + leaq (%rsi),%rsp +.Lfrom_epilogue: + .byte 0xf3,0xc3 +.size bn_from_mont8x,.-bn_from_mont8x +.globl bn_get_bits5 +.type bn_get_bits5,@function +.align 16 +bn_get_bits5: + leaq 0(%rdi),%r10 + leaq 1(%rdi),%r11 + movl %esi,%ecx + shrl $4,%esi + andl $15,%ecx + leal -8(%rcx),%eax + cmpl $11,%ecx + cmovaq %r11,%r10 + cmoval %eax,%ecx + movzwl (%r10,%rsi,2),%eax + shrl %cl,%eax + andl $31,%eax + .byte 0xf3,0xc3 +.size bn_get_bits5,.-bn_get_bits5 + +.globl bn_scatter5 +.type bn_scatter5,@function +.align 16 +bn_scatter5: + cmpl $0,%esi + jz .Lscatter_epilogue + leaq (%rdx,%rcx,8),%rdx +.Lscatter: + movq (%rdi),%rax + leaq 8(%rdi),%rdi + movq %rax,(%rdx) + leaq 256(%rdx),%rdx + subl $1,%esi + jnz .Lscatter +.Lscatter_epilogue: + .byte 0xf3,0xc3 +.size bn_scatter5,.-bn_scatter5 + +.globl bn_gather5 +.type bn_gather5,@function +.align 16 +bn_gather5: + movl %ecx,%r11d + shrl $3,%ecx + andq $7,%r11 + notl %ecx + leaq .Lmagic_masks(%rip),%rax + andl $3,%ecx + leaq 128(%rdx,%r11,8),%rdx + movq 0(%rax,%rcx,8),%xmm4 + movq 8(%rax,%rcx,8),%xmm5 + movq 16(%rax,%rcx,8),%xmm6 + movq 24(%rax,%rcx,8),%xmm7 + jmp .Lgather +.align 16 +.Lgather: + movq -128(%rdx),%xmm0 + movq -64(%rdx),%xmm1 + pand %xmm4,%xmm0 + movq 0(%rdx),%xmm2 + pand %xmm5,%xmm1 + movq 64(%rdx),%xmm3 + pand %xmm6,%xmm2 + por %xmm1,%xmm0 + pand %xmm7,%xmm3 +.byte 0x67,0x67 + por %xmm2,%xmm0 + leaq 256(%rdx),%rdx + por %xmm3,%xmm0 + + movq %xmm0,(%rdi) + leaq 8(%rdi),%rdi + subl $1,%esi + jnz .Lgather + .byte 0xf3,0xc3 +.LSEH_end_bn_gather5: +.size bn_gather5,.-bn_gather5 +.align 64 +.Lmagic_masks: +.long 0,0, 0,0, 0,0, -1,-1 +.long 0,0, 0,0, 0,0, 0,0 +.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 diff --git a/secure/lib/libcrypto/amd64/x86_64cpuid.S b/secure/lib/libcrypto/amd64/x86_64cpuid.S new file mode 100644 index 0000000..93de516 --- /dev/null +++ b/secure/lib/libcrypto/amd64/x86_64cpuid.S @@ -0,0 +1,260 @@ + # $FreeBSD$ + +.hidden OPENSSL_cpuid_setup +.section .init + call OPENSSL_cpuid_setup + +.hidden OPENSSL_ia32cap_P +.comm OPENSSL_ia32cap_P,16,4 + +.text + +.globl OPENSSL_atomic_add +.type OPENSSL_atomic_add,@function +.align 16 +OPENSSL_atomic_add: + movl (%rdi),%eax +.Lspin: leaq (%rsi,%rax,1),%r8 +.byte 0xf0 + cmpxchgl %r8d,(%rdi) + jne .Lspin + movl %r8d,%eax +.byte 0x48,0x98 + .byte 0xf3,0xc3 +.size OPENSSL_atomic_add,.-OPENSSL_atomic_add + +.globl OPENSSL_rdtsc +.type OPENSSL_rdtsc,@function +.align 16 +OPENSSL_rdtsc: + rdtsc + shlq $32,%rdx + orq %rdx,%rax + .byte 0xf3,0xc3 +.size OPENSSL_rdtsc,.-OPENSSL_rdtsc + +.globl OPENSSL_ia32_cpuid +.type OPENSSL_ia32_cpuid,@function +.align 16 +OPENSSL_ia32_cpuid: + movq %rbx,%r8 + + xorl %eax,%eax + movl %eax,8(%rdi) + cpuid + movl %eax,%r11d + + xorl %eax,%eax + cmpl $1970169159,%ebx + setne %al + movl %eax,%r9d + cmpl $1231384169,%edx + setne %al + orl %eax,%r9d + cmpl $1818588270,%ecx + setne %al + orl %eax,%r9d + jz .Lintel + + cmpl $1752462657,%ebx + setne %al + movl %eax,%r10d + cmpl $1769238117,%edx + setne %al + orl %eax,%r10d + cmpl $1145913699,%ecx + setne %al + orl %eax,%r10d + jnz .Lintel + + + movl $2147483648,%eax + cpuid + cmpl $2147483649,%eax + jb .Lintel + movl %eax,%r10d + movl $2147483649,%eax + cpuid + orl %ecx,%r9d + andl $2049,%r9d + + cmpl $2147483656,%r10d + jb .Lintel + + movl $2147483656,%eax + cpuid + movzbq %cl,%r10 + incq %r10 + + movl $1,%eax + cpuid + btl $28,%edx + jnc .Lgeneric + shrl $16,%ebx + cmpb %r10b,%bl + ja .Lgeneric + andl $4026531839,%edx + jmp .Lgeneric + +.Lintel: + cmpl $4,%r11d + movl $-1,%r10d + jb .Lnocacheinfo + + movl $4,%eax + movl $0,%ecx + cpuid + movl %eax,%r10d + shrl $14,%r10d + andl $4095,%r10d + + cmpl $7,%r11d + jb .Lnocacheinfo + + movl $7,%eax + xorl %ecx,%ecx + cpuid + movl %ebx,8(%rdi) + +.Lnocacheinfo: + movl $1,%eax + cpuid + andl $3220176895,%edx + cmpl $0,%r9d + jne .Lnotintel + orl $1073741824,%edx + andb $15,%ah + cmpb $15,%ah + jne .Lnotintel + orl $1048576,%edx +.Lnotintel: + btl $28,%edx + jnc .Lgeneric + andl $4026531839,%edx + cmpl $0,%r10d + je .Lgeneric + + orl $268435456,%edx + shrl $16,%ebx + cmpb $1,%bl + ja .Lgeneric + andl $4026531839,%edx +.Lgeneric: + andl $2048,%r9d + andl $4294965247,%ecx + orl %ecx,%r9d + + movl %edx,%r10d + btl $27,%r9d + jnc .Lclear_avx + xorl %ecx,%ecx +.byte 0x0f,0x01,0xd0 + andl $6,%eax + cmpl $6,%eax + je .Ldone +.Lclear_avx: + movl $4026525695,%eax + andl %eax,%r9d + andl $4294967263,8(%rdi) +.Ldone: + shlq $32,%r9 + movl %r10d,%eax + movq %r8,%rbx + orq %r9,%rax + .byte 0xf3,0xc3 +.size OPENSSL_ia32_cpuid,.-OPENSSL_ia32_cpuid + +.globl OPENSSL_cleanse +.type OPENSSL_cleanse,@function +.align 16 +OPENSSL_cleanse: + xorq %rax,%rax + cmpq $15,%rsi + jae .Lot + cmpq $0,%rsi + je .Lret +.Little: + movb %al,(%rdi) + subq $1,%rsi + leaq 1(%rdi),%rdi + jnz .Little +.Lret: + .byte 0xf3,0xc3 +.align 16 +.Lot: + testq $7,%rdi + jz .Laligned + movb %al,(%rdi) + leaq -1(%rsi),%rsi + leaq 1(%rdi),%rdi + jmp .Lot +.Laligned: + movq %rax,(%rdi) + leaq -8(%rsi),%rsi + testq $-8,%rsi + leaq 8(%rdi),%rdi + jnz .Laligned + cmpq $0,%rsi + jne .Little + .byte 0xf3,0xc3 +.size OPENSSL_cleanse,.-OPENSSL_cleanse +.globl OPENSSL_wipe_cpu +.type OPENSSL_wipe_cpu,@function +.align 16 +OPENSSL_wipe_cpu: + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + pxor %xmm8,%xmm8 + pxor %xmm9,%xmm9 + pxor %xmm10,%xmm10 + pxor %xmm11,%xmm11 + pxor %xmm12,%xmm12 + pxor %xmm13,%xmm13 + pxor %xmm14,%xmm14 + pxor %xmm15,%xmm15 + xorq %rcx,%rcx + xorq %rdx,%rdx + xorq %rsi,%rsi + xorq %rdi,%rdi + xorq %r8,%r8 + xorq %r9,%r9 + xorq %r10,%r10 + xorq %r11,%r11 + leaq 8(%rsp),%rax + .byte 0xf3,0xc3 +.size OPENSSL_wipe_cpu,.-OPENSSL_wipe_cpu +.globl OPENSSL_ia32_rdrand +.type OPENSSL_ia32_rdrand,@function +.align 16 +OPENSSL_ia32_rdrand: + movl $8,%ecx +.Loop_rdrand: +.byte 72,15,199,240 + jc .Lbreak_rdrand + loop .Loop_rdrand +.Lbreak_rdrand: + cmpq $0,%rax + cmoveq %rcx,%rax + .byte 0xf3,0xc3 +.size OPENSSL_ia32_rdrand,.-OPENSSL_ia32_rdrand + +.globl OPENSSL_ia32_rdseed +.type OPENSSL_ia32_rdseed,@function +.align 16 +OPENSSL_ia32_rdseed: + movl $8,%ecx +.Loop_rdseed: +.byte 72,15,199,248 + jc .Lbreak_rdseed + loop .Loop_rdseed +.Lbreak_rdseed: + cmpq $0,%rax + cmoveq %rcx,%rax + .byte 0xf3,0xc3 +.size OPENSSL_ia32_rdseed,.-OPENSSL_ia32_rdseed |