Diffstat (limited to 'secure/lib/libcrypto/amd64/x86_64-mont.S')
-rw-r--r-- | secure/lib/libcrypto/amd64/x86_64-mont.S | 108
1 file changed, 65 insertions(+), 43 deletions(-)
diff --git a/secure/lib/libcrypto/amd64/x86_64-mont.S b/secure/lib/libcrypto/amd64/x86_64-mont.S
index bff0fb9..3e67383 100644
--- a/secure/lib/libcrypto/amd64/x86_64-mont.S
+++ b/secure/lib/libcrypto/amd64/x86_64-mont.S
@@ -634,20 +634,20 @@ bn_sqr8x_mont:
-	leaq	-64(%rsp,%r9,4),%r11
+	leaq	-64(%rsp,%r9,2),%r11
 	movq	(%r8),%r8
 	subq	%rsi,%r11
 	andq	$4095,%r11
 	cmpq	%r11,%r10
 	jb	.Lsqr8x_sp_alt
 	subq	%r11,%rsp
-	leaq	-64(%rsp,%r9,4),%rsp
+	leaq	-64(%rsp,%r9,2),%rsp
 	jmp	.Lsqr8x_sp_done
 
 .align	32
 .Lsqr8x_sp_alt:
-	leaq	4096-64(,%r9,4),%r10
-	leaq	-64(%rsp,%r9,4),%rsp
+	leaq	4096-64(,%r9,2),%r10
+	leaq	-64(%rsp,%r9,2),%rsp
 	subq	%r10,%r11
 	movq	$0,%r10
 	cmovcq	%r10,%r11
@@ -657,58 +657,80 @@ bn_sqr8x_mont:
 	movq	%r9,%r10
 	negq	%r9
 
-	leaq	64(%rsp,%r9,2),%r11
 	movq	%r8,32(%rsp)
 	movq	%rax,40(%rsp)
 .Lsqr8x_body:
-	movq	%r9,%rbp
-.byte	102,73,15,110,211
-	shrq	$3+2,%rbp
-	movl	OPENSSL_ia32cap_P+8(%rip),%eax
-	jmp	.Lsqr8x_copy_n
-
-.align	32
-.Lsqr8x_copy_n:
-	movq	0(%rcx),%xmm0
-	movq	8(%rcx),%xmm1
-	movq	16(%rcx),%xmm3
-	movq	24(%rcx),%xmm4
-	leaq	32(%rcx),%rcx
-	movdqa	%xmm0,0(%r11)
-	movdqa	%xmm1,16(%r11)
-	movdqa	%xmm3,32(%r11)
-	movdqa	%xmm4,48(%r11)
-	leaq	64(%r11),%r11
-	decq	%rbp
-	jnz	.Lsqr8x_copy_n
-
+.byte	102,72,15,110,209
 	pxor	%xmm0,%xmm0
.byte	102,72,15,110,207
.byte	102,73,15,110,218
 	call	bn_sqr8x_internal
+
+
+
+	leaq	(%rdi,%r9,1),%rbx
+	movq	%r9,%rcx
+	movq	%r9,%rdx
+.byte	102,72,15,126,207
+	sarq	$3+2,%rcx
+	jmp	.Lsqr8x_sub
+
+.align	32
+.Lsqr8x_sub:
+	movq	0(%rbx),%r12
+	movq	8(%rbx),%r13
+	movq	16(%rbx),%r14
+	movq	24(%rbx),%r15
+	leaq	32(%rbx),%rbx
+	sbbq	0(%rbp),%r12
+	sbbq	8(%rbp),%r13
+	sbbq	16(%rbp),%r14
+	sbbq	24(%rbp),%r15
+	leaq	32(%rbp),%rbp
+	movq	%r12,0(%rdi)
+	movq	%r13,8(%rdi)
+	movq	%r14,16(%rdi)
+	movq	%r15,24(%rdi)
+	leaq	32(%rdi),%rdi
+	incq	%rcx
+	jnz	.Lsqr8x_sub
+
+	sbbq	$0,%rax
+	leaq	(%rbx,%r9,1),%rbx
+	leaq	(%rdi,%r9,1),%rdi
+
+.byte	102,72,15,110,200
 	pxor	%xmm0,%xmm0
-	leaq	48(%rsp),%rax
-	leaq	64(%rsp,%r9,2),%rdx
-	shrq	$3+2,%r9
+	pshufd	$0,%xmm1,%xmm1
 	movq	40(%rsp),%rsi
-	jmp	.Lsqr8x_zero
+	jmp	.Lsqr8x_cond_copy
 
 .align	32
-.Lsqr8x_zero:
-	movdqa	%xmm0,0(%rax)
-	movdqa	%xmm0,16(%rax)
-	movdqa	%xmm0,32(%rax)
-	movdqa	%xmm0,48(%rax)
-	leaq	64(%rax),%rax
-	movdqa	%xmm0,0(%rdx)
-	movdqa	%xmm0,16(%rdx)
-	movdqa	%xmm0,32(%rdx)
-	movdqa	%xmm0,48(%rdx)
-	leaq	64(%rdx),%rdx
-	decq	%r9
-	jnz	.Lsqr8x_zero
+.Lsqr8x_cond_copy:
+	movdqa	0(%rbx),%xmm2
+	movdqa	16(%rbx),%xmm3
+	leaq	32(%rbx),%rbx
+	movdqu	0(%rdi),%xmm4
+	movdqu	16(%rdi),%xmm5
+	leaq	32(%rdi),%rdi
+	movdqa	%xmm0,-32(%rbx)
+	movdqa	%xmm0,-16(%rbx)
+	movdqa	%xmm0,-32(%rbx,%rdx,1)
+	movdqa	%xmm0,-16(%rbx,%rdx,1)
+	pcmpeqd	%xmm1,%xmm0
+	pand	%xmm1,%xmm2
+	pand	%xmm1,%xmm3
+	pand	%xmm0,%xmm4
+	pand	%xmm0,%xmm5
+	pxor	%xmm0,%xmm0
+	por	%xmm2,%xmm4
+	por	%xmm3,%xmm5
+	movdqu	%xmm4,-32(%rdi)
+	movdqu	%xmm5,-16(%rdi)
+	addq	$32,%r9
+	jnz	.Lsqr8x_cond_copy
 
 	movq	$1,%rax
 	movq	-48(%rsi),%r15
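The substance of the change is the new tail after the call to bn_sqr8x_internal: the old code copied the modulus into scratch space up front (.Lsqr8x_copy_n) and zeroed the scratch area afterwards (.Lsqr8x_zero); the new code instead subtracts the modulus from the result with a borrow-propagating sbbq chain (.Lsqr8x_sub) and then selects between the subtracted and unsubtracted values with SSE2 masks built by pcmpeqd/pand/por (.Lsqr8x_cond_copy), so neither the branch pattern nor the store addresses depend on whether the final reduction was needed. The leaq scaling change in the first hunk (%r9,4 to %r9,2) is consistent with this: with the on-stack copy of the modulus gone, only half the scratch area is allocated. Below is a minimal C sketch of the same idiom; the names sub_words and cond_copy are illustrative only, and this is a hand-written approximation of the technique, not the generated assembly.

#include <stdint.h>
#include <stddef.h>

/*
 * Subtract mod from res into tmp, one 64-bit word at a time, propagating
 * the borrow the way the sbbq chain in .Lsqr8x_sub keeps it in CF.
 * Returns the final borrow: 1 if res < mod, else 0.
 */
static uint64_t
sub_words(uint64_t *tmp, const uint64_t *res, const uint64_t *mod, size_t n)
{
	uint64_t borrow = 0;

	for (size_t i = 0; i < n; i++) {
		uint64_t d = res[i] - mod[i] - borrow;

		/* borrow out of this word, for b in {0,1} */
		borrow = (res[i] < mod[i]) | ((res[i] - mod[i]) < borrow);
		tmp[i] = d;
	}
	return (borrow);
}

/*
 * Branchless select, as in .Lsqr8x_cond_copy: keep is all-ones when the
 * subtraction underflowed (keep the unreduced result), take is all-ones
 * when it did not (take the subtracted result).
 */
static void
cond_copy(uint64_t *out, const uint64_t *res, const uint64_t *tmp,
    size_t n, uint64_t borrow)
{
	uint64_t keep = 0 - borrow;	/* mask role of pcmpeqd/pshufd */
	uint64_t take = ~keep;

	for (size_t i = 0; i < n; i++)
		out[i] = (res[i] & keep) | (tmp[i] & take);
}

Selecting through masks rather than a conditional jump means the instruction flow and the memory-access pattern are identical whether or not the result needed reducing, which is the point of the rewrite; the pand/por pairs in the assembly are the vectorized form of the keep/take masking shown here, and the movdqa stores of %xmm0 zero the scratch words as they are consumed instead of in a separate .Lsqr8x_zero pass.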