Diffstat (limited to 'secure/lib/libcrypto/amd64/x86_64-mont.S')
-rw-r--r-- | secure/lib/libcrypto/amd64/x86_64-mont.S | 155
1 file changed, 95 insertions(+), 60 deletions(-)
diff --git a/secure/lib/libcrypto/amd64/x86_64-mont.S b/secure/lib/libcrypto/amd64/x86_64-mont.S
index 9a83800..77cb521 100644
--- a/secure/lib/libcrypto/amd64/x86_64-mont.S
+++ b/secure/lib/libcrypto/amd64/x86_64-mont.S
@@ -8,6 +8,8 @@
 .type	bn_mul_mont,@function
 .align	16
 bn_mul_mont:
+	movl	%r9d,%r9d
+	movq	%rsp,%rax
 	testl	$3,%r9d
 	jnz	.Lmul_enter
 	cmpl	$8,%r9d
@@ -28,29 +30,36 @@ bn_mul_mont:
 	pushq	%r14
 	pushq	%r15
 
-	movl	%r9d,%r9d
-	leaq	2(%r9),%r10
+	negq	%r9
 	movq	%rsp,%r11
-	negq	%r10
-	leaq	(%rsp,%r10,8),%rsp
-	andq	$-1024,%rsp
+	leaq	-16(%rsp,%r9,8),%r10
+	negq	%r9
+	andq	$-1024,%r10
 
-	movq	%r11,8(%rsp,%r9,8)
-.Lmul_body:
-	subq	%rsp,%r11
+	subq	%r10,%r11
 	andq	$-4096,%r11
+	leaq	(%r10,%r11,1),%rsp
+	movq	(%rsp),%r11
+	cmpq	%r10,%rsp
+	ja	.Lmul_page_walk
+	jmp	.Lmul_page_walk_done
+
+.align	16
 .Lmul_page_walk:
-	movq	(%rsp,%r11,1),%r10
-	subq	$4096,%r11
-.byte	0x66,0x2e
-	jnc	.Lmul_page_walk
+	leaq	-4096(%rsp),%rsp
+	movq	(%rsp),%r11
+	cmpq	%r10,%rsp
+	ja	.Lmul_page_walk
+.Lmul_page_walk_done:
 
+	movq	%rax,8(%rsp,%r9,8)
+.Lmul_body:
 	movq	%rdx,%r12
 	movq	(%r8),%r8
 	movq	(%r12),%rbx
@@ -218,19 +227,21 @@ bn_mul_mont:
 	movq	8(%rsp,%r9,8),%rsi
 	movq	$1,%rax
-	movq	(%rsi),%r15
-	movq	8(%rsi),%r14
-	movq	16(%rsi),%r13
-	movq	24(%rsi),%r12
-	movq	32(%rsi),%rbp
-	movq	40(%rsi),%rbx
-	leaq	48(%rsi),%rsp
+	movq	-48(%rsi),%r15
+	movq	-40(%rsi),%r14
+	movq	-32(%rsi),%r13
+	movq	-24(%rsi),%r12
+	movq	-16(%rsi),%rbp
+	movq	-8(%rsi),%rbx
+	leaq	(%rsi),%rsp
 .Lmul_epilogue:
 .byte	0xf3,0xc3
 .size	bn_mul_mont,.-bn_mul_mont
 
 .type	bn_mul4x_mont,@function
 .align	16
 bn_mul4x_mont:
+	movl	%r9d,%r9d
+	movq	%rsp,%rax
 .Lmul4x_enter:
 	andl	$0x80100,%r11d
 	cmpl	$0x80100,%r11d
@@ -242,23 +253,29 @@ bn_mul4x_mont:
 	pushq	%r14
 	pushq	%r15
 
-	movl	%r9d,%r9d
-	leaq	4(%r9),%r10
+	negq	%r9
 	movq	%rsp,%r11
-	negq	%r10
-	leaq	(%rsp,%r10,8),%rsp
-	andq	$-1024,%rsp
+	leaq	-32(%rsp,%r9,8),%r10
+	negq	%r9
+	andq	$-1024,%r10
 
-	movq	%r11,8(%rsp,%r9,8)
-.Lmul4x_body:
-	subq	%rsp,%r11
+	subq	%r10,%r11
 	andq	$-4096,%r11
+	leaq	(%r10,%r11,1),%rsp
+	movq	(%rsp),%r11
+	cmpq	%r10,%rsp
+	ja	.Lmul4x_page_walk
+	jmp	.Lmul4x_page_walk_done
+
 .Lmul4x_page_walk:
-	movq	(%rsp,%r11,1),%r10
-	subq	$4096,%r11
-.byte	0x2e
-	jnc	.Lmul4x_page_walk
+	leaq	-4096(%rsp),%rsp
+	movq	(%rsp),%r11
+	cmpq	%r10,%rsp
+	ja	.Lmul4x_page_walk
+.Lmul4x_page_walk_done:
 
+	movq	%rax,8(%rsp,%r9,8)
+.Lmul4x_body:
 	movq	%rdi,16(%rsp,%r9,8)
 	movq	%rdx,%r12
 	movq	(%r8),%r8
@@ -627,13 +644,13 @@ bn_mul4x_mont:
 	movdqu	%xmm2,16(%rdi,%r14,1)
 	movq	8(%rsp,%r9,8),%rsi
 	movq	$1,%rax
-	movq	(%rsi),%r15
-	movq	8(%rsi),%r14
-	movq	16(%rsi),%r13
-	movq	24(%rsi),%r12
-	movq	32(%rsi),%rbp
-	movq	40(%rsi),%rbx
-	leaq	48(%rsi),%rsp
+	movq	-48(%rsi),%r15
+	movq	-40(%rsi),%r14
+	movq	-32(%rsi),%r13
+	movq	-24(%rsi),%r12
+	movq	-16(%rsi),%rbp
+	movq	-8(%rsi),%rbx
+	leaq	(%rsi),%rsp
 .Lmul4x_epilogue:
 .byte	0xf3,0xc3
 .size	bn_mul4x_mont,.-bn_mul4x_mont
@@ -643,14 +660,15 @@
 .type	bn_sqr8x_mont,@function
 .align	32
 bn_sqr8x_mont:
-.Lsqr8x_enter:
 	movq	%rsp,%rax
+.Lsqr8x_enter:
 	pushq	%rbx
 	pushq	%rbp
 	pushq	%r12
 	pushq	%r13
 	pushq	%r14
 	pushq	%r15
+.Lsqr8x_prologue:
 
 	movl	%r9d,%r10d
 	shll	$3,%r9d
@@ -663,33 +681,42 @@
 	leaq	-64(%rsp,%r9,2),%r11
+	movq	%rsp,%rbp
 	movq	(%r8),%r8
 	subq	%rsi,%r11
 	andq	$4095,%r11
 	cmpq	%r11,%r10
 	jb	.Lsqr8x_sp_alt
-	subq	%r11,%rsp
-	leaq	-64(%rsp,%r9,2),%rsp
+	subq	%r11,%rbp
+	leaq	-64(%rbp,%r9,2),%rbp
 	jmp	.Lsqr8x_sp_done
 
 .align	32
 .Lsqr8x_sp_alt:
 	leaq	4096-64(,%r9,2),%r10
-	leaq	-64(%rsp,%r9,2),%rsp
+	leaq	-64(%rbp,%r9,2),%rbp
 	subq	%r10,%r11
 	movq	$0,%r10
 	cmovcq	%r10,%r11
-	subq	%r11,%rsp
+	subq	%r11,%rbp
 .Lsqr8x_sp_done:
-	andq	$-64,%rsp
-	movq	%rax,%r11
-	subq	%rsp,%r11
+	andq	$-64,%rbp
+	movq	%rsp,%r11
+	subq	%rbp,%r11
 	andq	$-4096,%r11
+	leaq	(%r11,%rbp,1),%rsp
+	movq	(%rsp),%r10
+	cmpq	%rbp,%rsp
+	ja	.Lsqr8x_page_walk
+	jmp	.Lsqr8x_page_walk_done
+
+.align	16
 .Lsqr8x_page_walk:
-	movq	(%rsp,%r11,1),%r10
-	subq	$4096,%r11
-.byte	0x2e
-	jnc	.Lsqr8x_page_walk
+	leaq	-4096(%rsp),%rsp
+	movq	(%rsp),%r10
+	cmpq	%rbp,%rsp
+	ja	.Lsqr8x_page_walk
+.Lsqr8x_page_walk_done:
 
 	movq	%r9,%r10
 	negq	%r9
@@ -802,30 +829,38 @@
 .type	bn_mulx4x_mont,@function
 .align	32
 bn_mulx4x_mont:
-.Lmulx4x_enter:
 	movq	%rsp,%rax
+.Lmulx4x_enter:
 	pushq	%rbx
 	pushq	%rbp
 	pushq	%r12
 	pushq	%r13
 	pushq	%r14
 	pushq	%r15
+.Lmulx4x_prologue:
 
 	shll	$3,%r9d
-.byte	0x67
 	xorq	%r10,%r10
 	subq	%r9,%r10
 	movq	(%r8),%r8
-	leaq	-72(%rsp,%r10,1),%rsp
-	andq	$-128,%rsp
-	movq	%rax,%r11
-	subq	%rsp,%r11
+	leaq	-72(%rsp,%r10,1),%rbp
+	andq	$-128,%rbp
+	movq	%rsp,%r11
+	subq	%rbp,%r11
 	andq	$-4096,%r11
+	leaq	(%r11,%rbp,1),%rsp
+	movq	(%rsp),%r10
+	cmpq	%rbp,%rsp
+	ja	.Lmulx4x_page_walk
+	jmp	.Lmulx4x_page_walk_done
+
+.align	16
 .Lmulx4x_page_walk:
-	movq	(%rsp,%r11,1),%r10
-	subq	$4096,%r11
-.byte	0x66,0x2e
-	jnc	.Lmulx4x_page_walk
+	leaq	-4096(%rsp),%rsp
+	movq	(%rsp),%r10
+	cmpq	%rbp,%rsp
+	ja	.Lmulx4x_page_walk
+.Lmulx4x_page_walk_done:
 
 	leaq	(%rdx,%r9,1),%r10
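All four changed functions (bn_mul_mont, bn_mul4x_mont, bn_sqr8x_mont, bn_mulx4x_mont) get the same two treatments. First, each prologue now snapshots the incoming stack pointer in %rax and computes the new frame bottom in a scratch register (%r10 or %rbp) instead of moving %rsp directly; the movl %r9d,%r9d zero-extends the 32-bit length argument before it is used in 64-bit address arithmetic, and the epilogues change to match, restoring the callee-saved registers at negative offsets from the saved pointer and finishing with leaq (%rsi),%rsp. Second, the old page-walk loop, which dropped %rsp all the way to the frame bottom in one step and only then probed the pages it had skipped, is replaced by a loop that lowers %rsp at most one 4096-byte page per step and touches each page before descending further, so the stack guard page is faulted in order and anything that uses the stack asynchronously (a signal handler, say) should never land below an untouched guard.

A minimal sketch of the new probing pattern, in the same GAS/AT&T style (the .Lpage_walk labels and register roles here are illustrative, not lifted from the file):

	# On entry: %r10 = lowest address the new frame will occupy,
	#           %rsp = current stack pointer, somewhere above %r10.
	movq	%rsp,%r11
	subq	%r10,%r11		# distance down to the frame bottom
	andq	$-4096,%r11		# round down to whole 4096-byte pages
	leaq	(%r10,%r11,1),%rsp	# drop by the sub-page remainder only
	movq	(%rsp),%r11		# probe the page we just landed on
	cmpq	%r10,%rsp
	ja	.Lpage_walk		# whole pages still above the bottom
	jmp	.Lpage_walk_done

.align	16
.Lpage_walk:
	leaq	-4096(%rsp),%rsp	# descend exactly one page
	movq	(%rsp),%r11		# ...and touch it before going lower
	cmpq	%r10,%rsp
	ja	.Lpage_walk
.Lpage_walk_done:

The one-page step is the point of the exercise: kernels keep an unmapped guard page (or region) below the committed stack, and code that moves %rsp by more than a page without touching the memory in between can jump straight over that guard into whatever happens to be mapped below it.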