path: root/secure/lib/libcrypto/amd64/x86_64-mont.S
Diffstat (limited to 'secure/lib/libcrypto/amd64/x86_64-mont.S')
-rw-r--r--   secure/lib/libcrypto/amd64/x86_64-mont.S   155
1 file changed, 95 insertions, 60 deletions
diff --git a/secure/lib/libcrypto/amd64/x86_64-mont.S b/secure/lib/libcrypto/amd64/x86_64-mont.S
index 9a83800..77cb521 100644
--- a/secure/lib/libcrypto/amd64/x86_64-mont.S
+++ b/secure/lib/libcrypto/amd64/x86_64-mont.S
@@ -8,6 +8,8 @@
.type bn_mul_mont,@function
.align 16
bn_mul_mont:
+ movl %r9d,%r9d
+ movq %rsp,%rax
testl $3,%r9d
jnz .Lmul_enter
cmpl $8,%r9d
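
Note: the two instructions added at the top of bn_mul_mont zero-extend the 32-bit num argument (movl %r9d,%r9d clears the upper half of %r9) and capture the caller's stack pointer in %rax before anything is pushed; that saved value is what the reworked epilogue further down restores %rsp from.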
@@ -28,29 +30,36 @@ bn_mul_mont:
pushq %r14
pushq %r15
- movl %r9d,%r9d
- leaq 2(%r9),%r10
+ negq %r9
movq %rsp,%r11
- negq %r10
- leaq (%rsp,%r10,8),%rsp
- andq $-1024,%rsp
+ leaq -16(%rsp,%r9,8),%r10
+ negq %r9
+ andq $-1024,%r10
- movq %r11,8(%rsp,%r9,8)
-.Lmul_body:
- subq %rsp,%r11
+ subq %r10,%r11
andq $-4096,%r11
+ leaq (%r10,%r11,1),%rsp
+ movq (%rsp),%r11
+ cmpq %r10,%rsp
+ ja .Lmul_page_walk
+ jmp .Lmul_page_walk_done
+
+.align 16
.Lmul_page_walk:
- movq (%rsp,%r11,1),%r10
- subq $4096,%r11
-.byte 0x66,0x2e
- jnc .Lmul_page_walk
+ leaq -4096(%rsp),%rsp
+ movq (%rsp),%r11
+ cmpq %r10,%rsp
+ ja .Lmul_page_walk
+.Lmul_page_walk_done:
+ movq %rax,8(%rsp,%r9,8)
+.Lmul_body:
movq %rdx,%r12
movq (%r8),%r8
movq (%r12),%rbx
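
The frame allocation itself changes strategy. The old code moved %rsp down to the 1024-byte-aligned frame in a single step and only afterwards probed the pages it had skipped; the new code computes the frame address in %r10 first and then lowers %rsp at most one 4096-byte page per step, loading a word from the new top of stack each time, until %rsp meets %r10 (the ja/jmp pair skips the loop when the frame is less than a page away). Presumably this keeps %rsp pointing at already-touched memory whenever a probe can fault. Once the walk is done, the caller's %rsp, captured in %rax, is stored at 8(%rsp,%r9,8). Below is a rough C rendering of the walk; it is purely illustrative, and the names page_walk, sp, frame and probe are not part of the patch:

    #include <stdint.h>

    #define PAGE 4096u

    static volatile uint64_t probe;   /* keeps the per-page loads from being optimized out */

    /* Mimics .Lmul_page_walk with ordinary pointers standing in for %rsp and %r10. */
    static unsigned char *page_walk(unsigned char *sp, unsigned char *frame)
    {
        /* distance to the frame, rounded down to whole pages (andq $-4096,%r11) */
        uintptr_t dist = (uintptr_t)(sp - frame) & ~(uintptr_t)(PAGE - 1);

        sp = frame + dist;               /* leaq (%r10,%r11,1),%rsp             */
        probe = *(const uint64_t *)sp;   /* movq (%rsp),%r11: touch this page   */
        while (sp > frame) {             /* cmpq %r10,%rsp ; ja .Lmul_page_walk */
            sp -= PAGE;                  /* leaq -4096(%rsp),%rsp               */
            probe = *(const uint64_t *)sp;
        }
        return sp;                       /* now equal to frame (%r10)           */
    }

The important property is that every page between the old and new stack pointer is touched in descending order, so a guard page below the stack cannot be jumped over when num is large. The .byte 0x66,0x2e filler in front of the old jnc appears to have been nothing but padding and goes away together with the old probe loop.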
@@ -218,19 +227,21 @@ bn_mul_mont:
movq 8(%rsp,%r9,8),%rsi
movq $1,%rax
- movq (%rsi),%r15
- movq 8(%rsi),%r14
- movq 16(%rsi),%r13
- movq 24(%rsi),%r12
- movq 32(%rsi),%rbp
- movq 40(%rsi),%rbx
- leaq 48(%rsi),%rsp
+ movq -48(%rsi),%r15
+ movq -40(%rsi),%r14
+ movq -32(%rsi),%r13
+ movq -24(%rsi),%r12
+ movq -16(%rsi),%rbp
+ movq -8(%rsi),%rbx
+ leaq (%rsi),%rsp
.Lmul_epilogue:
.byte 0xf3,0xc3
.size bn_mul_mont,.-bn_mul_mont
.type bn_mul4x_mont,@function
.align 16
bn_mul4x_mont:
+ movl %r9d,%r9d
+ movq %rsp,%rax
.Lmul4x_enter:
andl $0x80100,%r11d
cmpl $0x80100,%r11d
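
The epilogue change follows from the new bookkeeping: the slot at 8(%rsp,%r9,8) now holds the pre-push stack pointer saved in %rax rather than the post-push one, so the callee-saved registers are reloaded at negative offsets from it and %rsp is put back with a plain leaq (%rsi),%rsp. Relative to the reloaded pointer the frame looks like:

    -48(%rsi)  %r15
    -40(%rsi)  %r14
    -32(%rsi)  %r13
    -24(%rsi)  %r12
    -16(%rsi)  %rbp
     -8(%rsi)  %rbx
      %rsi     the caller's %rsp, reinstated by leaq (%rsi),%rsp

The same two-instruction prologue added to bn_mul_mont is then added to bn_mul4x_mont.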
@@ -242,23 +253,29 @@ bn_mul4x_mont:
pushq %r14
pushq %r15
- movl %r9d,%r9d
- leaq 4(%r9),%r10
+ negq %r9
movq %rsp,%r11
- negq %r10
- leaq (%rsp,%r10,8),%rsp
- andq $-1024,%rsp
+ leaq -32(%rsp,%r9,8),%r10
+ negq %r9
+ andq $-1024,%r10
- movq %r11,8(%rsp,%r9,8)
-.Lmul4x_body:
- subq %rsp,%r11
+ subq %r10,%r11
andq $-4096,%r11
+ leaq (%r10,%r11,1),%rsp
+ movq (%rsp),%r11
+ cmpq %r10,%rsp
+ ja .Lmul4x_page_walk
+ jmp .Lmul4x_page_walk_done
+
.Lmul4x_page_walk:
- movq (%rsp,%r11,1),%r10
- subq $4096,%r11
-.byte 0x2e
- jnc .Lmul4x_page_walk
+ leaq -4096(%rsp),%rsp
+ movq (%rsp),%r11
+ cmpq %r10,%rsp
+ ja .Lmul4x_page_walk
+.Lmul4x_page_walk_done:
+ movq %rax,8(%rsp,%r9,8)
+.Lmul4x_body:
movq %rdi,16(%rsp,%r9,8)
movq %rdx,%r12
movq (%r8),%r8
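
bn_mul4x_mont's stack setup is converted in exactly the same way. The only differences are the -32 bias in the frame-sizing leaq (the removed code reserved 4 extra quadwords here versus 2 in bn_mul_mont, since this variant also parks %rdi at 16(%rsp,%r9,8)) and the single .byte 0x2e of padding dropped in front of the old jnc.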
@@ -627,13 +644,13 @@ bn_mul4x_mont:
movdqu %xmm2,16(%rdi,%r14,1)
movq 8(%rsp,%r9,8),%rsi
movq $1,%rax
- movq (%rsi),%r15
- movq 8(%rsi),%r14
- movq 16(%rsi),%r13
- movq 24(%rsi),%r12
- movq 32(%rsi),%rbp
- movq 40(%rsi),%rbx
- leaq 48(%rsi),%rsp
+ movq -48(%rsi),%r15
+ movq -40(%rsi),%r14
+ movq -32(%rsi),%r13
+ movq -24(%rsi),%r12
+ movq -16(%rsi),%rbp
+ movq -8(%rsi),%rbx
+ leaq (%rsi),%rsp
.Lmul4x_epilogue:
.byte 0xf3,0xc3
.size bn_mul4x_mont,.-bn_mul4x_mont
@@ -643,14 +660,15 @@ bn_mul4x_mont:
.type bn_sqr8x_mont,@function
.align 32
bn_sqr8x_mont:
-.Lsqr8x_enter:
movq %rsp,%rax
+.Lsqr8x_enter:
pushq %rbx
pushq %rbp
pushq %r12
pushq %r13
pushq %r14
pushq %r15
+.Lsqr8x_prologue:
movl %r9d,%r10d
shll $3,%r9d
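
For bn_sqr8x_mont the %rsp snapshot is hoisted above the .Lsqr8x_enter label, presumably so that a jump to .Lsqr8x_enter from code that has already loaded %rax (as bn_mul_mont now does) lands past it while a direct call still takes the snapshot; the new .Lsqr8x_prologue label simply marks the point after the register saves.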
@@ -663,33 +681,42 @@ bn_sqr8x_mont:
leaq -64(%rsp,%r9,2),%r11
+ movq %rsp,%rbp
movq (%r8),%r8
subq %rsi,%r11
andq $4095,%r11
cmpq %r11,%r10
jb .Lsqr8x_sp_alt
- subq %r11,%rsp
- leaq -64(%rsp,%r9,2),%rsp
+ subq %r11,%rbp
+ leaq -64(%rbp,%r9,2),%rbp
jmp .Lsqr8x_sp_done
.align 32
.Lsqr8x_sp_alt:
leaq 4096-64(,%r9,2),%r10
- leaq -64(%rsp,%r9,2),%rsp
+ leaq -64(%rbp,%r9,2),%rbp
subq %r10,%r11
movq $0,%r10
cmovcq %r10,%r11
- subq %r11,%rsp
+ subq %r11,%rbp
.Lsqr8x_sp_done:
- andq $-64,%rsp
- movq %rax,%r11
- subq %rsp,%r11
+ andq $-64,%rbp
+ movq %rsp,%r11
+ subq %rbp,%r11
andq $-4096,%r11
+ leaq (%r11,%rbp,1),%rsp
+ movq (%rsp),%r10
+ cmpq %rbp,%rsp
+ ja .Lsqr8x_page_walk
+ jmp .Lsqr8x_page_walk_done
+
+.align 16
.Lsqr8x_page_walk:
- movq (%rsp,%r11,1),%r10
- subq $4096,%r11
-.byte 0x2e
- jnc .Lsqr8x_page_walk
+ leaq -4096(%rsp),%rsp
+ movq (%rsp),%r10
+ cmpq %rbp,%rsp
+ ja .Lsqr8x_page_walk
+.Lsqr8x_page_walk_done:
movq %r9,%r10
negq %r9
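
The squaring path gets the same treatment as the multipliers above: the arithmetic between the .Lsqr8x_sp_alt and .Lsqr8x_sp_done labels now builds the frame address in %rbp instead of adjusting %rsp directly, and after the final andq $-64 alignment the real %rsp is walked down to %rbp one page at a time, just like the C sketch earlier.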
@@ -802,30 +829,38 @@ bn_sqr8x_mont:
.type bn_mulx4x_mont,@function
.align 32
bn_mulx4x_mont:
-.Lmulx4x_enter:
movq %rsp,%rax
+.Lmulx4x_enter:
pushq %rbx
pushq %rbp
pushq %r12
pushq %r13
pushq %r14
pushq %r15
+.Lmulx4x_prologue:
shll $3,%r9d
-.byte 0x67
xorq %r10,%r10
subq %r9,%r10
movq (%r8),%r8
- leaq -72(%rsp,%r10,1),%rsp
- andq $-128,%rsp
- movq %rax,%r11
- subq %rsp,%r11
+ leaq -72(%rsp,%r10,1),%rbp
+ andq $-128,%rbp
+ movq %rsp,%r11
+ subq %rbp,%r11
andq $-4096,%r11
+ leaq (%r11,%rbp,1),%rsp
+ movq (%rsp),%r10
+ cmpq %rbp,%rsp
+ ja .Lmulx4x_page_walk
+ jmp .Lmulx4x_page_walk_done
+
+.align 16
.Lmulx4x_page_walk:
- movq (%rsp,%r11,1),%r10
- subq $4096,%r11
-.byte 0x66,0x2e
- jnc .Lmulx4x_page_walk
+ leaq -4096(%rsp),%rsp
+ movq (%rsp),%r10
+ cmpq %rbp,%rsp
+ ja .Lmulx4x_page_walk
+.Lmulx4x_page_walk_done:
leaq (%rdx,%r9,1),%r10
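
bn_mulx4x_mont is reworked along the same lines: the 128-byte-aligned frame is staged in %rbp, %rsp is lowered to it with one load per page, and the caller's %rsp again travels in %rax from before the pushes. The removed .byte 0x67 in front of the old xorq and the .byte 0x66,0x2e in front of the old jnc look like plain padding and go away with the rewrite.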