Diffstat (limited to 'secure/lib/libcrypto/amd64/rsaz-x86_64.S')
-rw-r--r--  secure/lib/libcrypto/amd64/rsaz-x86_64.S | 218
1 file changed, 160 insertions(+), 58 deletions(-)
diff --git a/secure/lib/libcrypto/amd64/rsaz-x86_64.S b/secure/lib/libcrypto/amd64/rsaz-x86_64.S
index efd229a..e2b0313 100644
--- a/secure/lib/libcrypto/amd64/rsaz-x86_64.S
+++ b/secure/lib/libcrypto/amd64/rsaz-x86_64.S
@@ -462,48 +462,94 @@ rsaz_512_mul_gather4:
 	pushq	%r14
 	pushq	%r15
 
-	movl	%r9d,%r9d
-	subq	$128+24,%rsp
+	subq	$152,%rsp
 .Lmul_gather4_body:
-	movl	64(%rdx,%r9,4),%eax
-.byte	102,72,15,110,199
-	movl	(%rdx,%r9,4),%ebx
-.byte	102,72,15,110,201
+	movd	%r9d,%xmm8
+	movdqa	.Linc+16(%rip),%xmm1
+	movdqa	.Linc(%rip),%xmm0
+
+	pshufd	$0,%xmm8,%xmm8
+	movdqa	%xmm1,%xmm7
+	movdqa	%xmm1,%xmm2
+	paddd	%xmm0,%xmm1
+	pcmpeqd	%xmm8,%xmm0
+	movdqa	%xmm7,%xmm3
+	paddd	%xmm1,%xmm2
+	pcmpeqd	%xmm8,%xmm1
+	movdqa	%xmm7,%xmm4
+	paddd	%xmm2,%xmm3
+	pcmpeqd	%xmm8,%xmm2
+	movdqa	%xmm7,%xmm5
+	paddd	%xmm3,%xmm4
+	pcmpeqd	%xmm8,%xmm3
+	movdqa	%xmm7,%xmm6
+	paddd	%xmm4,%xmm5
+	pcmpeqd	%xmm8,%xmm4
+	paddd	%xmm5,%xmm6
+	pcmpeqd	%xmm8,%xmm5
+	paddd	%xmm6,%xmm7
+	pcmpeqd	%xmm8,%xmm6
+	pcmpeqd	%xmm8,%xmm7
+
+	movdqa	0(%rdx),%xmm8
+	movdqa	16(%rdx),%xmm9
+	movdqa	32(%rdx),%xmm10
+	movdqa	48(%rdx),%xmm11
+	pand	%xmm0,%xmm8
+	movdqa	64(%rdx),%xmm12
+	pand	%xmm1,%xmm9
+	movdqa	80(%rdx),%xmm13
+	pand	%xmm2,%xmm10
+	movdqa	96(%rdx),%xmm14
+	pand	%xmm3,%xmm11
+	movdqa	112(%rdx),%xmm15
+	leaq	128(%rdx),%rbp
+	pand	%xmm4,%xmm12
+	pand	%xmm5,%xmm13
+	pand	%xmm6,%xmm14
+	pand	%xmm7,%xmm15
+	por	%xmm10,%xmm8
+	por	%xmm11,%xmm9
+	por	%xmm12,%xmm8
+	por	%xmm13,%xmm9
+	por	%xmm14,%xmm8
+	por	%xmm15,%xmm9
+
+	por	%xmm9,%xmm8
+	pshufd	$0x4e,%xmm8,%xmm9
+	por	%xmm9,%xmm8
+.byte	102,76,15,126,195
+	movq	%r8,128(%rsp)
+	movq	%rdi,128+8(%rsp)
+	movq	%rcx,128+16(%rsp)
 
-	shlq	$32,%rax
-	orq	%rax,%rbx
 	movq	(%rsi),%rax
 	movq	8(%rsi),%rcx
-	leaq	128(%rdx,%r9,4),%rbp
 	mulq	%rbx
 	movq	%rax,(%rsp)
 	movq	%rcx,%rax
 	movq	%rdx,%r8
 
 	mulq	%rbx
-	movd	(%rbp),%xmm4
 	addq	%rax,%r8
 	movq	16(%rsi),%rax
 	movq	%rdx,%r9
 	adcq	$0,%r9
 
 	mulq	%rbx
-	movd	64(%rbp),%xmm5
 	addq	%rax,%r9
 	movq	24(%rsi),%rax
 	movq	%rdx,%r10
 	adcq	$0,%r10
 
 	mulq	%rbx
-	pslldq	$4,%xmm5
 	addq	%rax,%r10
 	movq	32(%rsi),%rax
 	movq	%rdx,%r11
 	adcq	$0,%r11
 
 	mulq	%rbx
-	por	%xmm5,%xmm4
 	addq	%rax,%r11
 	movq	40(%rsi),%rax
 	movq	%rdx,%r12
@@ -516,14 +562,12 @@ rsaz_512_mul_gather4:
 	adcq	$0,%r13
 
 	mulq	%rbx
-	leaq	128(%rbp),%rbp
 	addq	%rax,%r13
 	movq	56(%rsi),%rax
 	movq	%rdx,%r14
 	adcq	$0,%r14
 
 	mulq	%rbx
-.byte	102,72,15,126,227
 	addq	%rax,%r14
 	movq	(%rsi),%rax
 	movq	%rdx,%r15
@@ -535,6 +579,35 @@ rsaz_512_mul_gather4:
 
 .align	32
 .Loop_mul_gather:
+	movdqa	0(%rbp),%xmm8
+	movdqa	16(%rbp),%xmm9
+	movdqa	32(%rbp),%xmm10
+	movdqa	48(%rbp),%xmm11
+	pand	%xmm0,%xmm8
+	movdqa	64(%rbp),%xmm12
+	pand	%xmm1,%xmm9
+	movdqa	80(%rbp),%xmm13
+	pand	%xmm2,%xmm10
+	movdqa	96(%rbp),%xmm14
+	pand	%xmm3,%xmm11
+	movdqa	112(%rbp),%xmm15
+	leaq	128(%rbp),%rbp
+	pand	%xmm4,%xmm12
+	pand	%xmm5,%xmm13
+	pand	%xmm6,%xmm14
+	pand	%xmm7,%xmm15
+	por	%xmm10,%xmm8
+	por	%xmm11,%xmm9
+	por	%xmm12,%xmm8
+	por	%xmm13,%xmm9
+	por	%xmm14,%xmm8
+	por	%xmm15,%xmm9
+
+	por	%xmm9,%xmm8
+	pshufd	$0x4e,%xmm8,%xmm9
+	por	%xmm9,%xmm8
+.byte	102,76,15,126,195
+
 	mulq	%rbx
 	addq	%rax,%r8
 	movq	8(%rsi),%rax
@@ -543,7 +616,6 @@ rsaz_512_mul_gather4:
 	adcq	$0,%r8
 
 	mulq	%rbx
-	movd	(%rbp),%xmm4
 	addq	%rax,%r9
 	movq	16(%rsi),%rax
 	adcq	$0,%rdx
@@ -552,7 +624,6 @@ rsaz_512_mul_gather4:
 	adcq	$0,%r9
 
 	mulq	%rbx
-	movd	64(%rbp),%xmm5
 	addq	%rax,%r10
 	movq	24(%rsi),%rax
 	adcq	$0,%rdx
@@ -561,7 +632,6 @@ rsaz_512_mul_gather4:
 	adcq	$0,%r10
 
 	mulq	%rbx
-	pslldq	$4,%xmm5
 	addq	%rax,%r11
 	movq	32(%rsi),%rax
 	adcq	$0,%rdx
@@ -570,7 +640,6 @@ rsaz_512_mul_gather4:
 	adcq	$0,%r11
 
 	mulq	%rbx
-	por	%xmm5,%xmm4
 	addq	%rax,%r12
 	movq	40(%rsi),%rax
 	adcq	$0,%rdx
@@ -595,7 +664,6 @@ rsaz_512_mul_gather4:
 	adcq	$0,%r14
 
 	mulq	%rbx
-.byte	102,72,15,126,227
 	addq	%rax,%r15
 	movq	(%rsi),%rax
 	adcq	$0,%rdx
@@ -603,7 +671,6 @@ rsaz_512_mul_gather4:
 	movq	%rdx,%r15
 	adcq	$0,%r15
 
-	leaq	128(%rbp),%rbp
 	leaq	8(%rdi),%rdi
 
 	decl	%ecx
@@ -618,8 +685,8 @@ rsaz_512_mul_gather4:
 	movq	%r14,48(%rdi)
 	movq	%r15,56(%rdi)
 
-.byte	102,72,15,126,199
-.byte	102,72,15,126,205
+	movq	128+8(%rsp),%rdi
+	movq	128+16(%rsp),%rbp
 
 	movq	(%rsp),%r8
 	movq	8(%rsp),%r9
@@ -668,7 +735,7 @@ rsaz_512_mul_scatter4:
 	movl	%r9d,%r9d
 	subq	$128+24,%rsp
 .Lmul_scatter4_body:
-	leaq	(%r8,%r9,4),%r8
+	leaq	(%r8,%r9,8),%r8
 .byte	102,72,15,110,199
 .byte	102,72,15,110,202
 .byte	102,73,15,110,208
@@ -704,30 +771,14 @@ rsaz_512_mul_scatter4:
 
 	call	__rsaz_512_subtract
 
-	movl	%r8d,0(%rsi)
-	shrq	$32,%r8
-	movl	%r9d,128(%rsi)
-	shrq	$32,%r9
-	movl	%r10d,256(%rsi)
-	shrq	$32,%r10
-	movl	%r11d,384(%rsi)
-	shrq	$32,%r11
-	movl	%r12d,512(%rsi)
-	shrq	$32,%r12
-	movl	%r13d,640(%rsi)
-	shrq	$32,%r13
-	movl	%r14d,768(%rsi)
-	shrq	$32,%r14
-	movl	%r15d,896(%rsi)
-	shrq	$32,%r15
-	movl	%r8d,64(%rsi)
-	movl	%r9d,192(%rsi)
-	movl	%r10d,320(%rsi)
-	movl	%r11d,448(%rsi)
-	movl	%r12d,576(%rsi)
-	movl	%r13d,704(%rsi)
-	movl	%r14d,832(%rsi)
-	movl	%r15d,960(%rsi)
+	movq	%r8,0(%rsi)
+	movq	%r9,128(%rsi)
+	movq	%r10,256(%rsi)
+	movq	%r11,384(%rsi)
+	movq	%r12,512(%rsi)
+	movq	%r13,640(%rsi)
+	movq	%r14,768(%rsi)
+	movq	%r15,896(%rsi)
 
 	leaq	128+24+48(%rsp),%rax
 	movq	-48(%rax),%r15
@@ -1080,16 +1131,14 @@ __rsaz_512_mul:
 .type	rsaz_512_scatter4,@function
 .align	16
 rsaz_512_scatter4:
-	leaq	(%rdi,%rdx,4),%rdi
+	leaq	(%rdi,%rdx,8),%rdi
 	movl	$8,%r9d
 	jmp	.Loop_scatter
 .align	16
 .Loop_scatter:
 	movq	(%rsi),%rax
 	leaq	8(%rsi),%rsi
-	movl	%eax,(%rdi)
-	shrq	$32,%rax
-	movl	%eax,64(%rdi)
+	movq	%rax,(%rdi)
 	leaq	128(%rdi),%rdi
 	decl	%r9d
 	jnz	.Loop_scatter
@@ -1100,19 +1149,72 @@ rsaz_512_scatter4:
 .type	rsaz_512_gather4,@function
 .align	16
 rsaz_512_gather4:
-	leaq	(%rsi,%rdx,4),%rsi
+	movd	%edx,%xmm8
+	movdqa	.Linc+16(%rip),%xmm1
+	movdqa	.Linc(%rip),%xmm0
+
+	pshufd	$0,%xmm8,%xmm8
+	movdqa	%xmm1,%xmm7
+	movdqa	%xmm1,%xmm2
+	paddd	%xmm0,%xmm1
+	pcmpeqd	%xmm8,%xmm0
+	movdqa	%xmm7,%xmm3
+	paddd	%xmm1,%xmm2
+	pcmpeqd	%xmm8,%xmm1
+	movdqa	%xmm7,%xmm4
+	paddd	%xmm2,%xmm3
+	pcmpeqd	%xmm8,%xmm2
+	movdqa	%xmm7,%xmm5
+	paddd	%xmm3,%xmm4
+	pcmpeqd	%xmm8,%xmm3
+	movdqa	%xmm7,%xmm6
+	paddd	%xmm4,%xmm5
+	pcmpeqd	%xmm8,%xmm4
+	paddd	%xmm5,%xmm6
+	pcmpeqd	%xmm8,%xmm5
+	paddd	%xmm6,%xmm7
+	pcmpeqd	%xmm8,%xmm6
+	pcmpeqd	%xmm8,%xmm7
 	movl	$8,%r9d
 	jmp	.Loop_gather
 .align	16
 .Loop_gather:
-	movl	(%rsi),%eax
-	movl	64(%rsi),%r8d
+	movdqa	0(%rsi),%xmm8
+	movdqa	16(%rsi),%xmm9
+	movdqa	32(%rsi),%xmm10
+	movdqa	48(%rsi),%xmm11
+	pand	%xmm0,%xmm8
+	movdqa	64(%rsi),%xmm12
+	pand	%xmm1,%xmm9
+	movdqa	80(%rsi),%xmm13
+	pand	%xmm2,%xmm10
+	movdqa	96(%rsi),%xmm14
+	pand	%xmm3,%xmm11
+	movdqa	112(%rsi),%xmm15
 	leaq	128(%rsi),%rsi
-	shlq	$32,%r8
-	orq	%r8,%rax
-	movq	%rax,(%rdi)
+	pand	%xmm4,%xmm12
+	pand	%xmm5,%xmm13
+	pand	%xmm6,%xmm14
+	pand	%xmm7,%xmm15
+	por	%xmm10,%xmm8
+	por	%xmm11,%xmm9
+	por	%xmm12,%xmm8
+	por	%xmm13,%xmm9
+	por	%xmm14,%xmm8
+	por	%xmm15,%xmm9
+
+	por	%xmm9,%xmm8
+	pshufd	$0x4e,%xmm8,%xmm9
+	por	%xmm9,%xmm8
+	movq	%xmm8,(%rdi)
 	leaq	8(%rdi),%rdi
 	decl	%r9d
 	jnz	.Loop_gather
 .byte	0xf3,0xc3
+.LSEH_end_rsaz_512_gather4:
 .size	rsaz_512_gather4,.-rsaz_512_gather4
+
+.align	64
+.Linc:
+.long	0,0, 1,1
+.long	2,2, 2,2
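Note on the change: this diff reworks the RSAZ scatter/gather helpers to be cache-access constant-time. Instead of loading the table entry selected by the (secret) exponent-window index from a secret-dependent offset, the patched rsaz_512_gather4 and rsaz_512_mul_gather4 broadcast the index with pshufd, compare it against lane counters built from the new .Linc table with pcmpeqd, and then read all 128 bytes of every table line, pand-masking and por-folding so only the selected entry survives. The scatter side changes to match: each 64-bit limb is stored whole at stride 8 (leaq (%rdi,%rdx,8)) within a 128-byte line, rather than as two 32-bit halves 64 bytes apart.

Below is a minimal scalar C model of the masked gather, assuming the layout the patched scatter produces (limb i of table entry k at tbl[i*16 + k]). The name gather4_ct is illustrative, not from the source, and plain C == only approximates the branch-free pcmpeqd the assembly uses.

#include <stdint.h>

/*
 * Scalar sketch of the SSE2 masked gather in rsaz_512_gather4.
 * tbl holds 16 candidate 512-bit values; limb i of entry k sits at
 * tbl[i*16 + k], so each 128-byte line contains limb i of all 16
 * entries (the layout written by the patched rsaz_512_scatter4).
 */
static void gather4_ct(uint64_t out[8], const uint64_t tbl[8 * 16],
                       unsigned idx)          /* idx in 0..15, secret */
{
    for (unsigned i = 0; i < 8; i++) {
        uint64_t acc = 0;
        /* Touch every entry in the line; keep the wanted one with a
         * branch-free select mask, as pcmpeqd/pand/por do in the .S. */
        for (unsigned k = 0; k < 16; k++) {
            uint64_t mask = (uint64_t)0 - (uint64_t)(k == idx);
            acc |= tbl[i * 16 + k] & mask;
        }
        out[i] = acc;
    }
}

Every iteration reads the same addresses regardless of idx; the cost is reading all sixteen entries plus the mask-and-fold work per limb, which the patch accepts as the price of removing secret-dependent addressing.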