diff options
Diffstat (limited to 'secure/lib/libcrypto/amd64/rsaz-x86_64.S')
-rw-r--r-- | secure/lib/libcrypto/amd64/rsaz-x86_64.S | 657 |
1 files changed, 656 insertions, 1 deletions
diff --git a/secure/lib/libcrypto/amd64/rsaz-x86_64.S b/secure/lib/libcrypto/amd64/rsaz-x86_64.S index e2b0313..f09f499 100644 --- a/secure/lib/libcrypto/amd64/rsaz-x86_64.S +++ b/secure/lib/libcrypto/amd64/rsaz-x86_64.S @@ -1,4 +1,5 @@ - # $FreeBSD$ +# $FreeBSD$ +# Do not modify. This file is auto-generated from rsaz-x86_64.pl. .text @@ -20,6 +21,10 @@ rsaz_512_sqr: movq (%rsi),%rdx movq 8(%rsi),%rax movq %rcx,128(%rsp) + movl $0x80100,%r11d + andl OPENSSL_ia32cap_P+8(%rip),%r11d + cmpl $0x80100,%r11d + je .Loop_sqrx jmp .Loop_sqr .align 32 @@ -383,6 +388,276 @@ rsaz_512_sqr: decl %r8d jnz .Loop_sqr + jmp .Lsqr_tail + +.align 32 +.Loop_sqrx: + movl %r8d,128+8(%rsp) +.byte 102,72,15,110,199 +.byte 102,72,15,110,205 + + mulxq %rax,%r8,%r9 + + mulxq 16(%rsi),%rcx,%r10 + xorq %rbp,%rbp + + mulxq 24(%rsi),%rax,%r11 + adcxq %rcx,%r9 + + mulxq 32(%rsi),%rcx,%r12 + adcxq %rax,%r10 + + mulxq 40(%rsi),%rax,%r13 + adcxq %rcx,%r11 + +.byte 0xc4,0x62,0xf3,0xf6,0xb6,0x30,0x00,0x00,0x00 + adcxq %rax,%r12 + adcxq %rcx,%r13 + +.byte 0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00 + adcxq %rax,%r14 + adcxq %rbp,%r15 + + movq %r9,%rcx + shldq $1,%r8,%r9 + shlq $1,%r8 + + xorl %ebp,%ebp + mulxq %rdx,%rax,%rdx + adcxq %rdx,%r8 + movq 8(%rsi),%rdx + adcxq %rbp,%r9 + + movq %rax,(%rsp) + movq %r8,8(%rsp) + + + mulxq 16(%rsi),%rax,%rbx + adoxq %rax,%r10 + adcxq %rbx,%r11 + +.byte 0xc4,0x62,0xc3,0xf6,0x86,0x18,0x00,0x00,0x00 + adoxq %rdi,%r11 + adcxq %r8,%r12 + + mulxq 32(%rsi),%rax,%rbx + adoxq %rax,%r12 + adcxq %rbx,%r13 + + mulxq 40(%rsi),%rdi,%r8 + adoxq %rdi,%r13 + adcxq %r8,%r14 + +.byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 + adoxq %rax,%r14 + adcxq %rbx,%r15 + +.byte 0xc4,0x62,0xc3,0xf6,0x86,0x38,0x00,0x00,0x00 + adoxq %rdi,%r15 + adcxq %rbp,%r8 + adoxq %rbp,%r8 + + movq %r11,%rbx + shldq $1,%r10,%r11 + shldq $1,%rcx,%r10 + + xorl %ebp,%ebp + mulxq %rdx,%rax,%rcx + movq 16(%rsi),%rdx + adcxq %rax,%r9 + adcxq %rcx,%r10 + adcxq %rbp,%r11 + + movq %r9,16(%rsp) +.byte 0x4c,0x89,0x94,0x24,0x18,0x00,0x00,0x00 + + +.byte 0xc4,0x62,0xc3,0xf6,0x8e,0x18,0x00,0x00,0x00 + adoxq %rdi,%r12 + adcxq %r9,%r13 + + mulxq 32(%rsi),%rax,%rcx + adoxq %rax,%r13 + adcxq %rcx,%r14 + + mulxq 40(%rsi),%rdi,%r9 + adoxq %rdi,%r14 + adcxq %r9,%r15 + +.byte 0xc4,0xe2,0xfb,0xf6,0x8e,0x30,0x00,0x00,0x00 + adoxq %rax,%r15 + adcxq %rcx,%r8 + +.byte 0xc4,0x62,0xc3,0xf6,0x8e,0x38,0x00,0x00,0x00 + adoxq %rdi,%r8 + adcxq %rbp,%r9 + adoxq %rbp,%r9 + + movq %r13,%rcx + shldq $1,%r12,%r13 + shldq $1,%rbx,%r12 + + xorl %ebp,%ebp + mulxq %rdx,%rax,%rdx + adcxq %rax,%r11 + adcxq %rdx,%r12 + movq 24(%rsi),%rdx + adcxq %rbp,%r13 + + movq %r11,32(%rsp) +.byte 0x4c,0x89,0xa4,0x24,0x28,0x00,0x00,0x00 + + +.byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x20,0x00,0x00,0x00 + adoxq %rax,%r14 + adcxq %rbx,%r15 + + mulxq 40(%rsi),%rdi,%r10 + adoxq %rdi,%r15 + adcxq %r10,%r8 + + mulxq 48(%rsi),%rax,%rbx + adoxq %rax,%r8 + adcxq %rbx,%r9 + + mulxq 56(%rsi),%rdi,%r10 + adoxq %rdi,%r9 + adcxq %rbp,%r10 + adoxq %rbp,%r10 + +.byte 0x66 + movq %r15,%rbx + shldq $1,%r14,%r15 + shldq $1,%rcx,%r14 + + xorl %ebp,%ebp + mulxq %rdx,%rax,%rdx + adcxq %rax,%r13 + adcxq %rdx,%r14 + movq 32(%rsi),%rdx + adcxq %rbp,%r15 + + movq %r13,48(%rsp) + movq %r14,56(%rsp) + + +.byte 0xc4,0x62,0xc3,0xf6,0x9e,0x28,0x00,0x00,0x00 + adoxq %rdi,%r8 + adcxq %r11,%r9 + + mulxq 48(%rsi),%rax,%rcx + adoxq %rax,%r9 + adcxq %rcx,%r10 + + mulxq 56(%rsi),%rdi,%r11 + adoxq %rdi,%r10 + adcxq %rbp,%r11 + adoxq %rbp,%r11 + + movq %r9,%rcx + shldq $1,%r8,%r9 + shldq $1,%rbx,%r8 + + xorl %ebp,%ebp + mulxq %rdx,%rax,%rdx + adcxq %rax,%r15 + adcxq %rdx,%r8 + movq 40(%rsi),%rdx + adcxq %rbp,%r9 + + movq %r15,64(%rsp) + movq %r8,72(%rsp) + + +.byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 + adoxq %rax,%r10 + adcxq %rbx,%r11 + +.byte 0xc4,0x62,0xc3,0xf6,0xa6,0x38,0x00,0x00,0x00 + adoxq %rdi,%r11 + adcxq %rbp,%r12 + adoxq %rbp,%r12 + + movq %r11,%rbx + shldq $1,%r10,%r11 + shldq $1,%rcx,%r10 + + xorl %ebp,%ebp + mulxq %rdx,%rax,%rdx + adcxq %rax,%r9 + adcxq %rdx,%r10 + movq 48(%rsi),%rdx + adcxq %rbp,%r11 + + movq %r9,80(%rsp) + movq %r10,88(%rsp) + + +.byte 0xc4,0x62,0xfb,0xf6,0xae,0x38,0x00,0x00,0x00 + adoxq %rax,%r12 + adoxq %rbp,%r13 + + xorq %r14,%r14 + shldq $1,%r13,%r14 + shldq $1,%r12,%r13 + shldq $1,%rbx,%r12 + + xorl %ebp,%ebp + mulxq %rdx,%rax,%rdx + adcxq %rax,%r11 + adcxq %rdx,%r12 + movq 56(%rsi),%rdx + adcxq %rbp,%r13 + +.byte 0x4c,0x89,0x9c,0x24,0x60,0x00,0x00,0x00 +.byte 0x4c,0x89,0xa4,0x24,0x68,0x00,0x00,0x00 + + + mulxq %rdx,%rax,%rdx + adoxq %rax,%r13 + adoxq %rbp,%rdx + +.byte 0x66 + addq %rdx,%r14 + + movq %r13,112(%rsp) + movq %r14,120(%rsp) +.byte 102,72,15,126,199 +.byte 102,72,15,126,205 + + movq 128(%rsp),%rdx + movq (%rsp),%r8 + movq 8(%rsp),%r9 + movq 16(%rsp),%r10 + movq 24(%rsp),%r11 + movq 32(%rsp),%r12 + movq 40(%rsp),%r13 + movq 48(%rsp),%r14 + movq 56(%rsp),%r15 + + call __rsaz_512_reducex + + addq 64(%rsp),%r8 + adcq 72(%rsp),%r9 + adcq 80(%rsp),%r10 + adcq 88(%rsp),%r11 + adcq 96(%rsp),%r12 + adcq 104(%rsp),%r13 + adcq 112(%rsp),%r14 + adcq 120(%rsp),%r15 + sbbq %rcx,%rcx + + call __rsaz_512_subtract + + movq %r8,%rdx + movq %r9,%rax + movl 128+8(%rsp),%r8d + movq %rdi,%rsi + + decl %r8d + jnz .Loop_sqrx + +.Lsqr_tail: leaq 128+24+48(%rsp),%rax movq -48(%rax),%r15 @@ -411,6 +686,10 @@ rsaz_512_mul: .byte 102,72,15,110,199 .byte 102,72,15,110,201 movq %r8,128(%rsp) + movl $0x80100,%r11d + andl OPENSSL_ia32cap_P+8(%rip),%r11d + cmpl $0x80100,%r11d + je .Lmulx movq (%rdx),%rbx movq %rdx,%rbp call __rsaz_512_mul @@ -428,6 +707,29 @@ rsaz_512_mul: movq 56(%rsp),%r15 call __rsaz_512_reduce + jmp .Lmul_tail + +.align 32 +.Lmulx: + movq %rdx,%rbp + movq (%rdx),%rdx + call __rsaz_512_mulx + +.byte 102,72,15,126,199 +.byte 102,72,15,126,205 + + movq 128(%rsp),%rdx + movq (%rsp),%r8 + movq 8(%rsp),%r9 + movq 16(%rsp),%r10 + movq 24(%rsp),%r11 + movq 32(%rsp),%r12 + movq 40(%rsp),%r13 + movq 48(%rsp),%r14 + movq 56(%rsp),%r15 + + call __rsaz_512_reducex +.Lmul_tail: addq 64(%rsp),%r8 adcq 72(%rsp),%r9 adcq 80(%rsp),%r10 @@ -518,6 +820,10 @@ rsaz_512_mul_gather4: por %xmm9,%xmm8 pshufd $0x4e,%xmm8,%xmm9 por %xmm9,%xmm8 + movl $0x80100,%r11d + andl OPENSSL_ia32cap_P+8(%rip),%r11d + cmpl $0x80100,%r11d + je .Lmulx_gather .byte 102,76,15,126,195 movq %r8,128(%rsp) @@ -698,6 +1004,142 @@ rsaz_512_mul_gather4: movq 56(%rsp),%r15 call __rsaz_512_reduce + jmp .Lmul_gather_tail + +.align 32 +.Lmulx_gather: +.byte 102,76,15,126,194 + + movq %r8,128(%rsp) + movq %rdi,128+8(%rsp) + movq %rcx,128+16(%rsp) + + mulxq (%rsi),%rbx,%r8 + movq %rbx,(%rsp) + xorl %edi,%edi + + mulxq 8(%rsi),%rax,%r9 + + mulxq 16(%rsi),%rbx,%r10 + adcxq %rax,%r8 + + mulxq 24(%rsi),%rax,%r11 + adcxq %rbx,%r9 + + mulxq 32(%rsi),%rbx,%r12 + adcxq %rax,%r10 + + mulxq 40(%rsi),%rax,%r13 + adcxq %rbx,%r11 + + mulxq 48(%rsi),%rbx,%r14 + adcxq %rax,%r12 + + mulxq 56(%rsi),%rax,%r15 + adcxq %rbx,%r13 + adcxq %rax,%r14 +.byte 0x67 + movq %r8,%rbx + adcxq %rdi,%r15 + + movq $-7,%rcx + jmp .Loop_mulx_gather + +.align 32 +.Loop_mulx_gather: + movdqa 0(%rbp),%xmm8 + movdqa 16(%rbp),%xmm9 + movdqa 32(%rbp),%xmm10 + movdqa 48(%rbp),%xmm11 + pand %xmm0,%xmm8 + movdqa 64(%rbp),%xmm12 + pand %xmm1,%xmm9 + movdqa 80(%rbp),%xmm13 + pand %xmm2,%xmm10 + movdqa 96(%rbp),%xmm14 + pand %xmm3,%xmm11 + movdqa 112(%rbp),%xmm15 + leaq 128(%rbp),%rbp + pand %xmm4,%xmm12 + pand %xmm5,%xmm13 + pand %xmm6,%xmm14 + pand %xmm7,%xmm15 + por %xmm10,%xmm8 + por %xmm11,%xmm9 + por %xmm12,%xmm8 + por %xmm13,%xmm9 + por %xmm14,%xmm8 + por %xmm15,%xmm9 + + por %xmm9,%xmm8 + pshufd $0x4e,%xmm8,%xmm9 + por %xmm9,%xmm8 +.byte 102,76,15,126,194 + +.byte 0xc4,0x62,0xfb,0xf6,0x86,0x00,0x00,0x00,0x00 + adcxq %rax,%rbx + adoxq %r9,%r8 + + mulxq 8(%rsi),%rax,%r9 + adcxq %rax,%r8 + adoxq %r10,%r9 + + mulxq 16(%rsi),%rax,%r10 + adcxq %rax,%r9 + adoxq %r11,%r10 + +.byte 0xc4,0x62,0xfb,0xf6,0x9e,0x18,0x00,0x00,0x00 + adcxq %rax,%r10 + adoxq %r12,%r11 + + mulxq 32(%rsi),%rax,%r12 + adcxq %rax,%r11 + adoxq %r13,%r12 + + mulxq 40(%rsi),%rax,%r13 + adcxq %rax,%r12 + adoxq %r14,%r13 + +.byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00 + adcxq %rax,%r13 +.byte 0x67 + adoxq %r15,%r14 + + mulxq 56(%rsi),%rax,%r15 + movq %rbx,64(%rsp,%rcx,8) + adcxq %rax,%r14 + adoxq %rdi,%r15 + movq %r8,%rbx + adcxq %rdi,%r15 + + incq %rcx + jnz .Loop_mulx_gather + + movq %r8,64(%rsp) + movq %r9,64+8(%rsp) + movq %r10,64+16(%rsp) + movq %r11,64+24(%rsp) + movq %r12,64+32(%rsp) + movq %r13,64+40(%rsp) + movq %r14,64+48(%rsp) + movq %r15,64+56(%rsp) + + movq 128(%rsp),%rdx + movq 128+8(%rsp),%rdi + movq 128+16(%rsp),%rbp + + movq (%rsp),%r8 + movq 8(%rsp),%r9 + movq 16(%rsp),%r10 + movq 24(%rsp),%r11 + movq 32(%rsp),%r12 + movq 40(%rsp),%r13 + movq 48(%rsp),%r14 + movq 56(%rsp),%r15 + + call __rsaz_512_reducex + +.Lmul_gather_tail: addq 64(%rsp),%r8 adcq 72(%rsp),%r9 adcq 80(%rsp),%r10 @@ -742,6 +1184,10 @@ rsaz_512_mul_scatter4: movq %rcx,128(%rsp) movq %rdi,%rbp + movl $0x80100,%r11d + andl OPENSSL_ia32cap_P+8(%rip),%r11d + cmpl $0x80100,%r11d + je .Lmulx_scatter movq (%rdi),%rbx call __rsaz_512_mul @@ -758,6 +1204,29 @@ rsaz_512_mul_scatter4: movq 56(%rsp),%r15 call __rsaz_512_reduce + jmp .Lmul_scatter_tail + +.align 32 +.Lmulx_scatter: + movq (%rdi),%rdx + call __rsaz_512_mulx + +.byte 102,72,15,126,199 +.byte 102,72,15,126,205 + + movq 128(%rsp),%rdx + movq (%rsp),%r8 + movq 8(%rsp),%r9 + movq 16(%rsp),%r10 + movq 24(%rsp),%r11 + movq 32(%rsp),%r12 + movq 40(%rsp),%r13 + movq 48(%rsp),%r14 + movq 56(%rsp),%r15 + + call __rsaz_512_reducex + +.Lmul_scatter_tail: addq 64(%rsp),%r8 adcq 72(%rsp),%r9 adcq 80(%rsp),%r10 @@ -804,6 +1273,7 @@ rsaz_512_mul_by_one: subq $128+24,%rsp .Lmul_by_one_body: + movl OPENSSL_ia32cap_P+8(%rip),%eax movq %rdx,%rbp movq %rcx,128(%rsp) @@ -824,7 +1294,16 @@ rsaz_512_mul_by_one: movdqa %xmm0,64(%rsp) movdqa %xmm0,80(%rsp) movdqa %xmm0,96(%rsp) + andl $0x80100,%eax + cmpl $0x80100,%eax + je .Lby_one_callx call __rsaz_512_reduce + jmp .Lby_one_tail +.align 32 +.Lby_one_callx: + movq 128(%rsp),%rdx + call __rsaz_512_reducex +.Lby_one_tail: movq %r8,(%rdi) movq %r9,8(%rdi) movq %r10,16(%rdi) @@ -928,6 +1407,62 @@ __rsaz_512_reduce: .byte 0xf3,0xc3 .size __rsaz_512_reduce,.-__rsaz_512_reduce +.type __rsaz_512_reducex,@function +.align 32 +__rsaz_512_reducex: + + imulq %r8,%rdx + xorq %rsi,%rsi + movl $8,%ecx + jmp .Lreduction_loopx + +.align 32 +.Lreduction_loopx: + movq %r8,%rbx + mulxq 0(%rbp),%rax,%r8 + adcxq %rbx,%rax + adoxq %r9,%r8 + + mulxq 8(%rbp),%rax,%r9 + adcxq %rax,%r8 + adoxq %r10,%r9 + + mulxq 16(%rbp),%rbx,%r10 + adcxq %rbx,%r9 + adoxq %r11,%r10 + + mulxq 24(%rbp),%rbx,%r11 + adcxq %rbx,%r10 + adoxq %r12,%r11 + +.byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 + movq %rdx,%rax + movq %r8,%rdx + adcxq %rbx,%r11 + adoxq %r13,%r12 + + mulxq 128+8(%rsp),%rbx,%rdx + movq %rax,%rdx + + mulxq 40(%rbp),%rax,%r13 + adcxq %rax,%r12 + adoxq %r14,%r13 + +.byte 0xc4,0x62,0xfb,0xf6,0xb5,0x30,0x00,0x00,0x00 + adcxq %rax,%r13 + adoxq %r15,%r14 + + mulxq 56(%rbp),%rax,%r15 + movq %rbx,%rdx + adcxq %rax,%r14 + adoxq %rsi,%r15 + adcxq %rsi,%r15 + + decl %ecx + jne .Lreduction_loopx + + .byte 0xf3,0xc3 +.size __rsaz_512_reducex,.-__rsaz_512_reducex .type __rsaz_512_subtract,@function .align 32 __rsaz_512_subtract: @@ -1127,6 +1662,126 @@ __rsaz_512_mul: .byte 0xf3,0xc3 .size __rsaz_512_mul,.-__rsaz_512_mul +.type __rsaz_512_mulx,@function +.align 32 +__rsaz_512_mulx: + mulxq (%rsi),%rbx,%r8 + movq $-6,%rcx + + mulxq 8(%rsi),%rax,%r9 + movq %rbx,8(%rsp) + + mulxq 16(%rsi),%rbx,%r10 + adcq %rax,%r8 + + mulxq 24(%rsi),%rax,%r11 + adcq %rbx,%r9 + + mulxq 32(%rsi),%rbx,%r12 + adcq %rax,%r10 + + mulxq 40(%rsi),%rax,%r13 + adcq %rbx,%r11 + + mulxq 48(%rsi),%rbx,%r14 + adcq %rax,%r12 + + mulxq 56(%rsi),%rax,%r15 + movq 8(%rbp),%rdx + adcq %rbx,%r13 + adcq %rax,%r14 + adcq $0,%r15 + + xorq %rdi,%rdi + jmp .Loop_mulx + +.align 32 +.Loop_mulx: + movq %r8,%rbx + mulxq (%rsi),%rax,%r8 + adcxq %rax,%rbx + adoxq %r9,%r8 + + mulxq 8(%rsi),%rax,%r9 + adcxq %rax,%r8 + adoxq %r10,%r9 + + mulxq 16(%rsi),%rax,%r10 + adcxq %rax,%r9 + adoxq %r11,%r10 + + mulxq 24(%rsi),%rax,%r11 + adcxq %rax,%r10 + adoxq %r12,%r11 + +.byte 0x3e,0xc4,0x62,0xfb,0xf6,0xa6,0x20,0x00,0x00,0x00 + adcxq %rax,%r11 + adoxq %r13,%r12 + + mulxq 40(%rsi),%rax,%r13 + adcxq %rax,%r12 + adoxq %r14,%r13 + + mulxq 48(%rsi),%rax,%r14 + adcxq %rax,%r13 + adoxq %r15,%r14 + + mulxq 56(%rsi),%rax,%r15 + movq 64(%rbp,%rcx,8),%rdx + movq %rbx,8+64-8(%rsp,%rcx,8) + adcxq %rax,%r14 + adoxq %rdi,%r15 + adcxq %rdi,%r15 + + incq %rcx + jnz .Loop_mulx + + movq %r8,%rbx + mulxq (%rsi),%rax,%r8 + adcxq %rax,%rbx + adoxq %r9,%r8 + +.byte 0xc4,0x62,0xfb,0xf6,0x8e,0x08,0x00,0x00,0x00 + adcxq %rax,%r8 + adoxq %r10,%r9 + +.byte 0xc4,0x62,0xfb,0xf6,0x96,0x10,0x00,0x00,0x00 + adcxq %rax,%r9 + adoxq %r11,%r10 + + mulxq 24(%rsi),%rax,%r11 + adcxq %rax,%r10 + adoxq %r12,%r11 + + mulxq 32(%rsi),%rax,%r12 + adcxq %rax,%r11 + adoxq %r13,%r12 + + mulxq 40(%rsi),%rax,%r13 + adcxq %rax,%r12 + adoxq %r14,%r13 + +.byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00 + adcxq %rax,%r13 + adoxq %r15,%r14 + +.byte 0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00 + adcxq %rax,%r14 + adoxq %rdi,%r15 + adcxq %rdi,%r15 + + movq %rbx,8+64-8(%rsp) + movq %r8,8+64(%rsp) + movq %r9,8+64+8(%rsp) + movq %r10,8+64+16(%rsp) + movq %r11,8+64+24(%rsp) + movq %r12,8+64+32(%rsp) + movq %r13,8+64+40(%rsp) + movq %r14,8+64+48(%rsp) + movq %r15,8+64+56(%rsp) + + .byte 0xf3,0xc3 +.size __rsaz_512_mulx,.-__rsaz_512_mulx .globl rsaz_512_scatter4 .type rsaz_512_scatter4,@function .align 16 |