Diffstat (limited to 'secure/lib/libcrypto/amd64/x86_64-mont5.S')
-rw-r--r-- | secure/lib/libcrypto/amd64/x86_64-mont5.S | 204 |
1 file changed, 129 insertions, 75 deletions
diff --git a/secure/lib/libcrypto/amd64/x86_64-mont5.S b/secure/lib/libcrypto/amd64/x86_64-mont5.S
index 7e8d43f..bbfba02 100644
--- a/secure/lib/libcrypto/amd64/x86_64-mont5.S
+++ b/secure/lib/libcrypto/amd64/x86_64-mont5.S
@@ -8,6 +8,8 @@
 .type	bn_mul_mont_gather5,@function
 .align	64
 bn_mul_mont_gather5:
+	movl	%r9d,%r9d
+	movq	%rsp,%rax
 	testl	$7,%r9d
 	jnz	.Lmul_enter
 	movl	OPENSSL_ia32cap_P+8(%rip),%r11d
@@ -15,10 +17,7 @@ bn_mul_mont_gather5:
 
 .align	16
 .Lmul_enter:
-	movl	%r9d,%r9d
-	movq	%rsp,%rax
 	movd	8(%rsp),%xmm5
-	leaq	.Linc(%rip),%r10
 	pushq	%rbx
 	pushq	%rbp
 	pushq	%r12
@@ -26,26 +25,36 @@ bn_mul_mont_gather5:
 	pushq	%r14
 	pushq	%r15
 
-	leaq	2(%r9),%r11
-	negq	%r11
-	leaq	-264(%rsp,%r11,8),%rsp
-	andq	$-1024,%rsp
+	negq	%r9
+	movq	%rsp,%r11
+	leaq	-280(%rsp,%r9,8),%r10
+	negq	%r9
+	andq	$-1024,%r10
 
-	movq	%rax,8(%rsp,%r9,8)
-.Lmul_body:
-	subq	%rsp,%rax
-	andq	$-4096,%rax
+	subq	%r10,%r11
+	andq	$-4096,%r11
+	leaq	(%r10,%r11,1),%rsp
+	movq	(%rsp),%r11
+	cmpq	%r10,%rsp
+	ja	.Lmul_page_walk
+	jmp	.Lmul_page_walk_done
+
 .Lmul_page_walk:
-	movq	(%rsp,%rax,1),%r11
-	subq	$4096,%rax
-.byte	0x2e
-	jnc	.Lmul_page_walk
+	leaq	-4096(%rsp),%rsp
+	movq	(%rsp),%r11
+	cmpq	%r10,%rsp
+	ja	.Lmul_page_walk
+.Lmul_page_walk_done:
+
+	leaq	.Linc(%rip),%r10
+	movq	%rax,8(%rsp,%r9,8)
+.Lmul_body:
 
 	leaq	128(%rdx),%r12
 	movdqa	0(%r10),%xmm0
@@ -416,18 +425,19 @@
 .type	bn_mul4x_mont_gather5,@function
 .align	32
 bn_mul4x_mont_gather5:
+.byte	0x67
+	movq	%rsp,%rax
 .Lmul4x_enter:
 	andl	$0x80108,%r11d
 	cmpl	$0x80108,%r11d
 	je	.Lmulx4x_enter
-.byte	0x67
-	movq	%rsp,%rax
 	pushq	%rbx
 	pushq	%rbp
 	pushq	%r12
 	pushq	%r13
 	pushq	%r14
 	pushq	%r15
+.Lmul4x_prologue:
 
 .byte	0x67
 	shll	$3,%r9d
@@ -444,32 +454,40 @@ bn_mul4x_mont_gather5:
 	leaq	-320(%rsp,%r9,2),%r11
+	movq	%rsp,%rbp
 	subq	%rdi,%r11
 	andq	$4095,%r11
 	cmpq	%r11,%r10
 	jb	.Lmul4xsp_alt
-	subq	%r11,%rsp
-	leaq	-320(%rsp,%r9,2),%rsp
+	subq	%r11,%rbp
+	leaq	-320(%rbp,%r9,2),%rbp
 	jmp	.Lmul4xsp_done
 
 .align	32
 .Lmul4xsp_alt:
 	leaq	4096-320(,%r9,2),%r10
-	leaq	-320(%rsp,%r9,2),%rsp
+	leaq	-320(%rbp,%r9,2),%rbp
 	subq	%r10,%r11
 	movq	$0,%r10
 	cmovcq	%r10,%r11
-	subq	%r11,%rsp
+	subq	%r11,%rbp
 .Lmul4xsp_done:
-	andq	$-64,%rsp
-	movq	%rax,%r11
-	subq	%rsp,%r11
+	andq	$-64,%rbp
+	movq	%rsp,%r11
+	subq	%rbp,%r11
 	andq	$-4096,%r11
+	leaq	(%r11,%rbp,1),%rsp
+	movq	(%rsp),%r10
+	cmpq	%rbp,%rsp
+	ja	.Lmul4x_page_walk
+	jmp	.Lmul4x_page_walk_done
+
 .Lmul4x_page_walk:
-	movq	(%rsp,%r11,1),%r10
-	subq	$4096,%r11
-.byte	0x2e
-	jnc	.Lmul4x_page_walk
+	leaq	-4096(%rsp),%rsp
+	movq	(%rsp),%r10
+	cmpq	%rbp,%rsp
+	ja	.Lmul4x_page_walk
+.Lmul4x_page_walk_done:
 
 	negq	%r9
@@ -1021,17 +1039,18 @@ mul4x_internal:
 .type	bn_power5,@function
 .align	32
 bn_power5:
+	movq	%rsp,%rax
 	movl	OPENSSL_ia32cap_P+8(%rip),%r11d
 	andl	$0x80108,%r11d
 	cmpl	$0x80108,%r11d
 	je	.Lpowerx5_enter
-	movq	%rsp,%rax
 	pushq	%rbx
 	pushq	%rbp
 	pushq	%r12
 	pushq	%r13
 	pushq	%r14
 	pushq	%r15
+.Lpower5_prologue:
 
 	shll	$3,%r9d
 	leal	(%r9,%r9,2),%r10d
@@ -1046,32 +1065,40 @@ bn_power5:
 	leaq	-320(%rsp,%r9,2),%r11
+	movq	%rsp,%rbp
 	subq	%rdi,%r11
 	andq	$4095,%r11
 	cmpq	%r11,%r10
 	jb	.Lpwr_sp_alt
-	subq	%r11,%rsp
-	leaq	-320(%rsp,%r9,2),%rsp
+	subq	%r11,%rbp
+	leaq	-320(%rbp,%r9,2),%rbp
 	jmp	.Lpwr_sp_done
 
 .align	32
 .Lpwr_sp_alt:
 	leaq	4096-320(,%r9,2),%r10
-	leaq	-320(%rsp,%r9,2),%rsp
+	leaq	-320(%rbp,%r9,2),%rbp
 	subq	%r10,%r11
 	movq	$0,%r10
 	cmovcq	%r10,%r11
-	subq	%r11,%rsp
+	subq	%r11,%rbp
 .Lpwr_sp_done:
-	andq	$-64,%rsp
-	movq	%rax,%r11
-	subq	%rsp,%r11
+	andq	$-64,%rbp
+	movq	%rsp,%r11
+	subq	%rbp,%r11
 	andq	$-4096,%r11
+	leaq	(%r11,%rbp,1),%rsp
+	movq	(%rsp),%r10
+	cmpq	%rbp,%rsp
+	ja	.Lpwr_page_walk
+	jmp	.Lpwr_page_walk_done
+
 .Lpwr_page_walk:
-	movq	(%rsp,%r11,1),%r10
-	subq	$4096,%r11
-.byte	0x2e
-	jnc	.Lpwr_page_walk
+	leaq	-4096(%rsp),%rsp
+	movq	(%rsp),%r10
+	cmpq	%rbp,%rsp
+	ja	.Lpwr_page_walk
+.Lpwr_page_walk_done:
 
 	movq	%r9,%r10
 	negq	%r9
@@ -1982,6 +2009,7 @@ bn_from_mont8x:
 	pushq	%r13
 	pushq	%r14
 	pushq	%r15
+.Lfrom_prologue:
 
 	shll	$3,%r9d
 	leaq	(%r9,%r9,2),%r10
@@ -1996,32 +2024,40 @@ bn_from_mont8x:
 	leaq	-320(%rsp,%r9,2),%r11
+	movq	%rsp,%rbp
 	subq	%rdi,%r11
 	andq	$4095,%r11
 	cmpq	%r11,%r10
 	jb	.Lfrom_sp_alt
-	subq	%r11,%rsp
-	leaq	-320(%rsp,%r9,2),%rsp
+	subq	%r11,%rbp
+	leaq	-320(%rbp,%r9,2),%rbp
 	jmp	.Lfrom_sp_done
 
 .align	32
 .Lfrom_sp_alt:
 	leaq	4096-320(,%r9,2),%r10
-	leaq	-320(%rsp,%r9,2),%rsp
+	leaq	-320(%rbp,%r9,2),%rbp
 	subq	%r10,%r11
 	movq	$0,%r10
 	cmovcq	%r10,%r11
-	subq	%r11,%rsp
+	subq	%r11,%rbp
 .Lfrom_sp_done:
-	andq	$-64,%rsp
-	movq	%rax,%r11
-	subq	%rsp,%r11
+	andq	$-64,%rbp
+	movq	%rsp,%r11
+	subq	%rbp,%r11
 	andq	$-4096,%r11
+	leaq	(%r11,%rbp,1),%rsp
+	movq	(%rsp),%r10
+	cmpq	%rbp,%rsp
+	ja	.Lfrom_page_walk
+	jmp	.Lfrom_page_walk_done
+
 .Lfrom_page_walk:
-	movq	(%rsp,%r11,1),%r10
-	subq	$4096,%r11
-.byte	0x2e
-	jnc	.Lfrom_page_walk
+	leaq	-4096(%rsp),%rsp
+	movq	(%rsp),%r10
+	cmpq	%rbp,%rsp
+	ja	.Lfrom_page_walk
+.Lfrom_page_walk_done:
 
 	movq	%r9,%r10
 	negq	%r9
@@ -2115,14 +2151,15 @@ bn_from_mont8x:
 .type	bn_mulx4x_mont_gather5,@function
 .align	32
 bn_mulx4x_mont_gather5:
-.Lmulx4x_enter:
 	movq	%rsp,%rax
+.Lmulx4x_enter:
 	pushq	%rbx
 	pushq	%rbp
 	pushq	%r12
 	pushq	%r13
 	pushq	%r14
 	pushq	%r15
+.Lmulx4x_prologue:
 
 	shll	$3,%r9d
 	leaq	(%r9,%r9,2),%r10
@@ -2139,31 +2176,39 @@ bn_mulx4x_mont_gather5:
 	leaq	-320(%rsp,%r9,2),%r11
+	movq	%rsp,%rbp
 	subq	%rdi,%r11
 	andq	$4095,%r11
 	cmpq	%r11,%r10
 	jb	.Lmulx4xsp_alt
-	subq	%r11,%rsp
-	leaq	-320(%rsp,%r9,2),%rsp
+	subq	%r11,%rbp
+	leaq	-320(%rbp,%r9,2),%rbp
 	jmp	.Lmulx4xsp_done
 
 .Lmulx4xsp_alt:
 	leaq	4096-320(,%r9,2),%r10
-	leaq	-320(%rsp,%r9,2),%rsp
+	leaq	-320(%rbp,%r9,2),%rbp
 	subq	%r10,%r11
 	movq	$0,%r10
 	cmovcq	%r10,%r11
-	subq	%r11,%rsp
+	subq	%r11,%rbp
 .Lmulx4xsp_done:
-	andq	$-64,%rsp
-	movq	%rax,%r11
-	subq	%rsp,%r11
+	andq	$-64,%rbp
+	movq	%rsp,%r11
+	subq	%rbp,%r11
 	andq	$-4096,%r11
+	leaq	(%r11,%rbp,1),%rsp
+	movq	(%rsp),%r10
+	cmpq	%rbp,%rsp
+	ja	.Lmulx4x_page_walk
+	jmp	.Lmulx4x_page_walk_done
+
 .Lmulx4x_page_walk:
-	movq	(%rsp,%r11,1),%r10
-	subq	$4096,%r11
-.byte	0x2e
-	jnc	.Lmulx4x_page_walk
+	leaq	-4096(%rsp),%rsp
+	movq	(%rsp),%r10
+	cmpq	%rbp,%rsp
+	ja	.Lmulx4x_page_walk
+.Lmulx4x_page_walk_done:
@@ -2621,14 +2666,15 @@ mulx4x_internal:
 .type	bn_powerx5,@function
 .align	32
 bn_powerx5:
-.Lpowerx5_enter:
 	movq	%rsp,%rax
+.Lpowerx5_enter:
 	pushq	%rbx
 	pushq	%rbp
 	pushq	%r12
 	pushq	%r13
 	pushq	%r14
 	pushq	%r15
+.Lpowerx5_prologue:
 
 	shll	$3,%r9d
 	leaq	(%r9,%r9,2),%r10
@@ -2643,32 +2689,40 @@ bn_powerx5:
 	leaq	-320(%rsp,%r9,2),%r11
+	movq	%rsp,%rbp
 	subq	%rdi,%r11
 	andq	$4095,%r11
 	cmpq	%r11,%r10
 	jb	.Lpwrx_sp_alt
-	subq	%r11,%rsp
-	leaq	-320(%rsp,%r9,2),%rsp
+	subq	%r11,%rbp
+	leaq	-320(%rbp,%r9,2),%rbp
 	jmp	.Lpwrx_sp_done
 
 .align	32
 .Lpwrx_sp_alt:
 	leaq	4096-320(,%r9,2),%r10
-	leaq	-320(%rsp,%r9,2),%rsp
+	leaq	-320(%rbp,%r9,2),%rbp
 	subq	%r10,%r11
 	movq	$0,%r10
 	cmovcq	%r10,%r11
-	subq	%r11,%rsp
+	subq	%r11,%rbp
 .Lpwrx_sp_done:
-	andq	$-64,%rsp
-	movq	%rax,%r11
-	subq	%rsp,%r11
+	andq	$-64,%rbp
+	movq	%rsp,%r11
+	subq	%rbp,%r11
 	andq	$-4096,%r11
+	leaq	(%r11,%rbp,1),%rsp
+	movq	(%rsp),%r10
+	cmpq	%rbp,%rsp
+	ja	.Lpwrx_page_walk
+	jmp	.Lpwrx_page_walk_done
+
 .Lpwrx_page_walk:
-	movq	(%rsp,%r11,1),%r10
-	subq	$4096,%r11
-.byte	0x2e
-	jnc	.Lpwrx_page_walk
+	leaq	-4096(%rsp),%rsp
+	movq	(%rsp),%r10
+	cmpq	%rbp,%rsp
+	ja	.Lpwrx_page_walk
+.Lpwrx_page_walk_done:
 
 	movq	%r9,%r10
 	negq	%r9
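Every hunk in this diff makes the same pair of changes: the movq %rsp,%rax bookkeeping (plus the new .L*_prologue labels after the register pushes) is hoisted ahead of the .L*_enter entry points, and the old stack "page walk" that probed the freshly reserved frame with a backward-indexed load (movq (%rsp,%r11,1),%r10; subq $4096,%r11) is replaced by a loop that lowers %rsp one 4096-byte page at a time and loads from (%rsp) until it reaches the precomputed frame base kept in %r10 or %rbp. Touching the frame a page at a time is the usual way to keep a large stack allocation from jumping past the guard page the OS uses to grow the stack. The C sketch below only illustrates that probing pattern; it is not code from OpenSSL or FreeBSD, and the names probe_pages and PAGE_SIZE are invented for the example.

/*
 * Illustration only: mimic in C the descending page walk done by the new
 * .L*_page_walk loops (leaq -4096(%rsp),%rsp / movq (%rsp),...): touch one
 * byte in every 4096-byte page of a region, highest page first, the order
 * in which a downward-growing stack has to be probed.
 */
#include <stdio.h>
#include <stdlib.h>

#define PAGE_SIZE 4096

static void
probe_pages(volatile unsigned char *base, size_t len)
{
	volatile unsigned char tmp;
	size_t off;

	if (len == 0)
		return;
	/* Start at the highest page of [base, base+len) and step down. */
	off = (len - 1) & ~(size_t)(PAGE_SIZE - 1);
	for (;;) {
		tmp = base[off];	/* one load per page */
		(void)tmp;
		if (off < PAGE_SIZE)
			break;
		off -= PAGE_SIZE;
	}
}

int
main(void)
{
	size_t len = 10 * PAGE_SIZE;
	unsigned char *buf = malloc(len);	/* heap stand-in for the stack frame */

	if (buf == NULL)
		return (1);
	probe_pages(buf, len);
	printf("probed %zu bytes, one load per %d-byte page\n", len, PAGE_SIZE);
	free(buf);
	return (0);
}

The top-down order mirrors the assembly: the page nearest the already-committed stack is touched before any page further down, so each access lands next to mapped memory instead of skipping over the guard region in one jump.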