diff options
Diffstat (limited to 'crypto/bn/asm')
-rwxr-xr-x | crypto/bn/asm/ppc-mont.pl | 10 | ||||
-rw-r--r-- | crypto/bn/asm/ppc.pl | 10 | ||||
-rwxr-xr-x | crypto/bn/asm/ppc64-mont.pl | 12 | ||||
-rwxr-xr-x | crypto/bn/asm/x86-mont.pl | 15 | ||||
-rwxr-xr-x | crypto/bn/asm/x86_64-mont.pl | 42 | ||||
-rwxr-xr-x | crypto/bn/asm/x86_64-mont5.pl | 61 |
6 files changed, 132 insertions, 18 deletions
diff --git a/crypto/bn/asm/ppc-mont.pl b/crypto/bn/asm/ppc-mont.pl index da69c6a..6930a3a 100755 --- a/crypto/bn/asm/ppc-mont.pl +++ b/crypto/bn/asm/ppc-mont.pl @@ -191,7 +191,7 @@ L1st: addi $j,$j,$BNSZ ; j++ addi $tp,$tp,$BNSZ ; tp++ - bdnz- L1st + bdnz L1st ;L1st addc $lo0,$alo,$hi0 addze $hi0,$ahi @@ -253,7 +253,7 @@ Linner: addze $hi1,$hi1 $ST $lo1,0($tp) ; tp[j-1] addi $tp,$tp,$BNSZ ; tp++ - bdnz- Linner + bdnz Linner ;Linner $LD $tj,$BNSZ($tp) ; tp[j] addc $lo0,$alo,$hi0 @@ -276,7 +276,7 @@ Linner: slwi $tj,$num,`log($BNSZ)/log(2)` $UCMP $i,$tj addi $i,$i,$BNSZ - ble- Louter + ble Louter addi $num,$num,2 ; restore $num subfc $j,$j,$j ; j=0 and "clear" XER[CA] @@ -289,7 +289,7 @@ Lsub: $LDX $tj,$tp,$j subfe $aj,$nj,$tj ; tp[j]-np[j] $STX $aj,$rp,$j addi $j,$j,$BNSZ - bdnz- Lsub + bdnz Lsub li $j,0 mtctr $num @@ -304,7 +304,7 @@ Lcopy: ; copy or in-place refresh $STX $tj,$rp,$j $STX $j,$tp,$j ; zap at once addi $j,$j,$BNSZ - bdnz- Lcopy + bdnz Lcopy $POP $tj,0($sp) li r3,1 diff --git a/crypto/bn/asm/ppc.pl b/crypto/bn/asm/ppc.pl index 04df1fe..446d8ba 100644 --- a/crypto/bn/asm/ppc.pl +++ b/crypto/bn/asm/ppc.pl @@ -1556,7 +1556,7 @@ Lppcasm_sub_mainloop: # if carry = 1 this is r7-r8. Else it # is r7-r8 -1 as we need. $STU r6,$BNSZ(r3) - bdnz- Lppcasm_sub_mainloop + bdnz Lppcasm_sub_mainloop Lppcasm_sub_adios: subfze r3,r0 # if carry bit is set then r3 = 0 else -1 andi. r3,r3,1 # keep only last bit. @@ -1603,7 +1603,7 @@ Lppcasm_add_mainloop: $LDU r8,$BNSZ(r5) adde r8,r7,r8 $STU r8,$BNSZ(r3) - bdnz- Lppcasm_add_mainloop + bdnz Lppcasm_add_mainloop Lppcasm_add_adios: addze r3,r0 #return carry bit. blr @@ -1762,7 +1762,7 @@ Lppcasm_sqr_mainloop: $UMULH r8,r6,r6 $STU r7,$BNSZ(r3) $STU r8,$BNSZ(r3) - bdnz- Lppcasm_sqr_mainloop + bdnz Lppcasm_sqr_mainloop Lppcasm_sqr_adios: blr .long 0 @@ -1827,7 +1827,7 @@ Lppcasm_mw_LOOP: addi r3,r3,`4*$BNSZ` addi r4,r4,`4*$BNSZ` - bdnz- Lppcasm_mw_LOOP + bdnz Lppcasm_mw_LOOP Lppcasm_mw_REM: andi. r5,r5,0x3 @@ -1951,7 +1951,7 @@ Lppcasm_maw_mainloop: $ST r11,`3*$BNSZ`(r3) addi r3,r3,`4*$BNSZ` addi r4,r4,`4*$BNSZ` - bdnz- Lppcasm_maw_mainloop + bdnz Lppcasm_maw_mainloop Lppcasm_maw_leftover: andi. r5,r5,0x3 diff --git a/crypto/bn/asm/ppc64-mont.pl b/crypto/bn/asm/ppc64-mont.pl index 9e3c12d..595fc6d 100755 --- a/crypto/bn/asm/ppc64-mont.pl +++ b/crypto/bn/asm/ppc64-mont.pl @@ -734,7 +734,7 @@ $code.=<<___; ___ } $code.=<<___; - bdnz- L1st + bdnz L1st fctid $dota,$dota fctid $dotb,$dotb @@ -1280,7 +1280,7 @@ $code.=<<___; ___ } $code.=<<___; - bdnz- Linner + bdnz Linner fctid $dota,$dota fctid $dotb,$dotb @@ -1490,7 +1490,7 @@ Lsub: ldx $t0,$tp,$i stdx $t0,$rp,$i stdx $t2,$t6,$i addi $i,$i,16 - bdnz- Lsub + bdnz Lsub li $i,0 subfe $ovf,$i,$ovf ; handle upmost overflow bit @@ -1517,7 +1517,7 @@ Lcopy: ; copy or in-place refresh stdx $i,$tp,$i ; zap tp at once stdx $i,$t4,$i addi $i,$i,16 - bdnz- Lcopy + bdnz Lcopy ___ $code.=<<___ if ($SIZE_T==4); subf $np,$num,$np ; rewind np @@ -1550,7 +1550,7 @@ Lsub: lwz $t0,12($tp) ; load tp[j..j+3] in 64-bit word order stw $t5,8($rp) stw $t6,12($rp) stwu $t7,16($rp) - bdnz- Lsub + bdnz Lsub li $i,0 subfe $ovf,$i,$ovf ; handle upmost overflow bit @@ -1582,7 +1582,7 @@ Lcopy: ; copy or in-place refresh stwu $t3,16($rp) std $i,8($tp) ; zap tp at once stdu $i,16($tp) - bdnz- Lcopy + bdnz Lcopy ___ $code.=<<___; diff --git a/crypto/bn/asm/x86-mont.pl b/crypto/bn/asm/x86-mont.pl index e8f6b05..89f4de6 100755 --- a/crypto/bn/asm/x86-mont.pl +++ b/crypto/bn/asm/x86-mont.pl @@ -85,6 +85,21 @@ $frame=32; # size of above frame rounded up to 16n &and ("esp",-64); # align to cache line + # Some OSes, *cough*-dows, insist on stack being "wired" to + # physical memory in strictly sequential manner, i.e. if stack + # allocation spans two pages, then reference to farmost one can + # be punishable by SEGV. But page walking can do good even on + # other OSes, because it guarantees that villain thread hits + # the guard page before it can make damage to innocent one... + &mov ("eax","ebp"); + &sub ("eax","esp"); + &and ("eax",-4096); +&set_label("page_walk"); + &mov ("edx",&DWP(0,"esp","eax")); + &sub ("eax",4096); + &data_byte(0x2e); + &jnc (&label("page_walk")); + ################################# load argument block... &mov ("eax",&DWP(0*4,"esi"));# BN_ULONG *rp &mov ("ebx",&DWP(1*4,"esi"));# const BN_ULONG *ap diff --git a/crypto/bn/asm/x86_64-mont.pl b/crypto/bn/asm/x86_64-mont.pl index 29ba122..8fb6c99 100755 --- a/crypto/bn/asm/x86_64-mont.pl +++ b/crypto/bn/asm/x86_64-mont.pl @@ -130,6 +130,20 @@ $code.=<<___; mov %r11,8(%rsp,$num,8) # tp[num+1]=%rsp .Lmul_body: + # Some OSes, *cough*-dows, insist on stack being "wired" to + # physical memory in strictly sequential manner, i.e. if stack + # allocation spans two pages, then reference to farmost one can + # be punishable by SEGV. But page walking can do good even on + # other OSes, because it guarantees that villain thread hits + # the guard page before it can make damage to innocent one... + sub %rsp,%r11 + and \$-4096,%r11 +.Lmul_page_walk: + mov (%rsp,%r11),%r10 + sub \$4096,%r11 + .byte 0x66,0x2e # predict non-taken + jnc .Lmul_page_walk + mov $bp,%r12 # reassign $bp ___ $bp="%r12"; @@ -342,6 +356,14 @@ $code.=<<___; mov %r11,8(%rsp,$num,8) # tp[num+1]=%rsp .Lmul4x_body: + sub %rsp,%r11 + and \$-4096,%r11 +.Lmul4x_page_walk: + mov (%rsp,%r11),%r10 + sub \$4096,%r11 + .byte 0x2e # predict non-taken + jnc .Lmul4x_page_walk + mov $rp,16(%rsp,$num,8) # tp[num+2]=$rp mov %rdx,%r12 # reassign $bp ___ @@ -795,6 +817,15 @@ bn_sqr8x_mont: sub %r11,%rsp .Lsqr8x_sp_done: and \$-64,%rsp + mov %rax,%r11 + sub %rsp,%r11 + and \$-4096,%r11 +.Lsqr8x_page_walk: + mov (%rsp,%r11),%r10 + sub \$4096,%r11 + .byte 0x2e # predict non-taken + jnc .Lsqr8x_page_walk + mov $num,%r10 neg $num @@ -932,8 +963,17 @@ bn_mulx4x_mont: sub $num,%r10 # -$num mov ($n0),$n0 # *n0 lea -72(%rsp,%r10),%rsp # alloca(frame+$num+8) - lea ($bp,$num),%r10 and \$-128,%rsp + mov %rax,%r11 + sub %rsp,%r11 + and \$-4096,%r11 +.Lmulx4x_page_walk: + mov (%rsp,%r11),%r10 + sub \$4096,%r11 + .byte 0x66,0x2e # predict non-taken + jnc .Lmulx4x_page_walk + + lea ($bp,$num),%r10 ############################################################## # Stack layout # +0 num diff --git a/crypto/bn/asm/x86_64-mont5.pl b/crypto/bn/asm/x86_64-mont5.pl index 2e8c9db..938e170 100755 --- a/crypto/bn/asm/x86_64-mont5.pl +++ b/crypto/bn/asm/x86_64-mont5.pl @@ -115,6 +115,20 @@ $code.=<<___; mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp .Lmul_body: + # Some OSes, *cough*-dows, insist on stack being "wired" to + # physical memory in strictly sequential manner, i.e. if stack + # allocation spans two pages, then reference to farmost one can + # be punishable by SEGV. But page walking can do good even on + # other OSes, because it guarantees that villain thread hits + # the guard page before it can make damage to innocent one... + sub %rsp,%rax + and \$-4096,%rax +.Lmul_page_walk: + mov (%rsp,%rax),%r11 + sub \$4096,%rax + .byte 0x2e # predict non-taken + jnc .Lmul_page_walk + lea 128($bp),%r12 # reassign $bp (+size optimization) ___ $bp="%r12"; @@ -469,6 +483,15 @@ $code.=<<___; sub %r11,%rsp .Lmul4xsp_done: and \$-64,%rsp + mov %rax,%r11 + sub %rsp,%r11 + and \$-4096,%r11 +.Lmul4x_page_walk: + mov (%rsp,%r11),%r10 + sub \$4096,%r11 + .byte 0x2e # predict non-taken + jnc .Lmul4x_page_walk + neg $num mov %rax,40(%rsp) @@ -1058,6 +1081,15 @@ $code.=<<___; sub %r11,%rsp .Lpwr_sp_done: and \$-64,%rsp + mov %rax,%r11 + sub %rsp,%r11 + and \$-4096,%r11 +.Lpwr_page_walk: + mov (%rsp,%r11),%r10 + sub \$4096,%r11 + .byte 0x2e # predict non-taken + jnc .Lpwr_page_walk + mov $num,%r10 neg $num @@ -2028,7 +2060,16 @@ bn_from_mont8x: sub %r11,%rsp .Lfrom_sp_done: and \$-64,%rsp - mov $num,%r10 + mov %rax,%r11 + sub %rsp,%r11 + and \$-4096,%r11 +.Lfrom_page_walk: + mov (%rsp,%r11),%r10 + sub \$4096,%r11 + .byte 0x2e # predict non-taken + jnc .Lfrom_page_walk + + mov $num,%r10 neg $num ############################################################## @@ -2173,6 +2214,15 @@ bn_mulx4x_mont_gather5: sub %r11,%rsp .Lmulx4xsp_done: and \$-64,%rsp # ensure alignment + mov %rax,%r11 + sub %rsp,%r11 + and \$-4096,%r11 +.Lmulx4x_page_walk: + mov (%rsp,%r11),%r10 + sub \$4096,%r11 + .byte 0x2e # predict non-taken + jnc .Lmulx4x_page_walk + ############################################################## # Stack layout # +0 -num @@ -2619,6 +2669,15 @@ bn_powerx5: sub %r11,%rsp .Lpwrx_sp_done: and \$-64,%rsp + mov %rax,%r11 + sub %rsp,%r11 + and \$-4096,%r11 +.Lpwrx_page_walk: + mov (%rsp,%r11),%r10 + sub \$4096,%r11 + .byte 0x2e # predict non-taken + jnc .Lpwrx_page_walk + mov $num,%r10 neg $num |