diff options
Diffstat (limited to 'crypto/bn')
-rw-r--r-- | crypto/bn/Makefile | 9 | ||||
-rw-r--r-- | crypto/bn/asm/ia64.S | 35 | ||||
-rwxr-xr-x | crypto/bn/asm/mo-586.pl | 603 | ||||
-rwxr-xr-x | crypto/bn/asm/x86_64-mont.pl | 214 | ||||
-rw-r--r-- | crypto/bn/bn.h | 25 | ||||
-rw-r--r-- | crypto/bn/bn_blind.c | 12 | ||||
-rw-r--r-- | crypto/bn/bn_div.c | 249 | ||||
-rw-r--r-- | crypto/bn/bn_err.c | 4 | ||||
-rw-r--r-- | crypto/bn/bn_exp.c | 20 | ||||
-rw-r--r-- | crypto/bn/bn_gcd.c | 161 | ||||
-rw-r--r-- | crypto/bn/bn_gf2m.c | 6 | ||||
-rw-r--r-- | crypto/bn/bn_lcl.h | 1 | ||||
-rw-r--r-- | crypto/bn/bn_lib.c | 2 | ||||
-rw-r--r-- | crypto/bn/bn_mont.c | 321 | ||||
-rw-r--r-- | crypto/bn/bn_mul.c | 15 | ||||
-rw-r--r-- | crypto/bn/bn_nist.c | 653 | ||||
-rw-r--r-- | crypto/bn/bn_prime.c | 4 | ||||
-rw-r--r-- | crypto/bn/bn_prime.h | 4 | ||||
-rw-r--r-- | crypto/bn/bn_prime.pl | 4 | ||||
-rw-r--r-- | crypto/bn/bntest.c | 60 |
20 files changed, 1981 insertions, 421 deletions
diff --git a/crypto/bn/Makefile b/crypto/bn/Makefile index 5c3e08f..0491e3d 100644 --- a/crypto/bn/Makefile +++ b/crypto/bn/Makefile @@ -67,16 +67,22 @@ bn86-elf.s: asm/bn-586.pl ../perlasm/x86asm.pl (cd asm; $(PERL) bn-586.pl elf $(CFLAGS) > ../$@) co86-elf.s: asm/co-586.pl ../perlasm/x86asm.pl (cd asm; $(PERL) co-586.pl elf $(CFLAGS) > ../$@) +mo86-elf.s: asm/mo-586.pl ../perlasm/x86asm.pl + (cd asm; $(PERL) mo-586.pl elf $(CFLAGS) > ../$@) # COFF bn86-cof.s: asm/bn-586.pl ../perlasm/x86asm.pl (cd asm; $(PERL) bn-586.pl coff $(CFLAGS) > ../$@) co86-cof.s: asm/co-586.pl ../perlasm/x86asm.pl (cd asm; $(PERL) co-586.pl coff $(CFLAGS) > ../$@) +mo86-cof.s: asm/mo-586.pl ../perlasm/x86asm.pl + (cd asm; $(PERL) mo-586.pl coff $(CFLAGS) > ../$@) # a.out bn86-out.s: asm/bn-586.pl ../perlasm/x86asm.pl (cd asm; $(PERL) bn-586.pl a.out $(CFLAGS) > ../$@) co86-out.s: asm/co-586.pl ../perlasm/x86asm.pl (cd asm; $(PERL) co-586.pl a.out $(CFLAGS) > ../$@) +mo86-out.s: asm/mo-586.pl ../perlasm/x86asm.pl + (cd asm; $(PERL) mo-586.pl a.out $(CFLAGS) > ../$@) sparcv8.o: asm/sparcv8.S $(CC) $(CFLAGS) -c asm/sparcv8.S @@ -91,6 +97,8 @@ bn-mips3.o: asm/mips3.s x86_64-gcc.o: asm/x86_64-gcc.c $(CC) $(CFLAGS) -c -o $@ asm/x86_64-gcc.c +x86_64-mont.s: asm/x86_64-mont.pl + $(PERL) asm/x86_64-mont.pl $@ bn-ia64.s: asm/ia64.S $(CC) $(CFLAGS) -E asm/ia64.S > $@ @@ -108,6 +116,7 @@ linux_ppc64.s: asm/ppc.pl; $(PERL) $< $@ aix_ppc32.s: asm/ppc.pl; $(PERL) asm/ppc.pl $@ aix_ppc64.s: asm/ppc.pl; $(PERL) asm/ppc.pl $@ osx_ppc32.s: asm/ppc.pl; $(PERL) $< $@ +osx_ppc64.s: asm/ppc.pl; $(PERL) $< $@ files: $(PERL) $(TOP)/util/files.pl Makefile >> $(TOP)/MINFO diff --git a/crypto/bn/asm/ia64.S b/crypto/bn/asm/ia64.S index 7b82b82..951abc5 100644 --- a/crypto/bn/asm/ia64.S +++ b/crypto/bn/asm/ia64.S @@ -171,21 +171,21 @@ .skip 32 // makes the loop body aligned at 64-byte boundary bn_add_words: .prologue - .fframe 0 .save ar.pfs,r2 { .mii; alloc r2=ar.pfs,4,12,0,16 cmp4.le p6,p0=r35,r0 };; { .mfb; mov r8=r0 // return value (p6) br.ret.spnt.many b0 };; - .save ar.lc,r3 { .mib; sub r10=r35,r0,1 + .save ar.lc,r3 mov r3=ar.lc brp.loop.imp .L_bn_add_words_ctop,.L_bn_add_words_cend-16 } - .body { .mib; ADDP r14=0,r32 // rp + .save pr,r9 mov r9=pr };; + .body { .mii; ADDP r15=0,r33 // ap mov ar.lc=r10 mov ar.ec=6 } @@ -224,21 +224,21 @@ bn_add_words: .skip 32 // makes the loop body aligned at 64-byte boundary bn_sub_words: .prologue - .fframe 0 .save ar.pfs,r2 { .mii; alloc r2=ar.pfs,4,12,0,16 cmp4.le p6,p0=r35,r0 };; { .mfb; mov r8=r0 // return value (p6) br.ret.spnt.many b0 };; - .save ar.lc,r3 { .mib; sub r10=r35,r0,1 + .save ar.lc,r3 mov r3=ar.lc brp.loop.imp .L_bn_sub_words_ctop,.L_bn_sub_words_cend-16 } - .body { .mib; ADDP r14=0,r32 // rp + .save pr,r9 mov r9=pr };; + .body { .mii; ADDP r15=0,r33 // ap mov ar.lc=r10 mov ar.ec=6 } @@ -283,7 +283,6 @@ bn_sub_words: .skip 32 // makes the loop body aligned at 64-byte boundary bn_mul_words: .prologue - .fframe 0 .save ar.pfs,r2 #ifdef XMA_TEMPTATION { .mfi; alloc r2=ar.pfs,4,0,0,0 };; @@ -294,9 +293,10 @@ bn_mul_words: cmp4.le p6,p0=r34,r0 (p6) br.ret.spnt.many b0 };; - .save ar.lc,r3 { .mii; sub r10=r34,r0,1 + .save ar.lc,r3 mov r3=ar.lc + .save pr,r9 mov r9=pr };; .body @@ -397,22 +397,21 @@ bn_mul_words: .skip 48 // makes the loop body aligned at 64-byte boundary bn_mul_add_words: .prologue - .fframe 0 .save ar.pfs,r2 - .save ar.lc,r3 - .save pr,r9 { .mmi; alloc r2=ar.pfs,4,4,0,8 cmp4.le p6,p0=r34,r0 + .save ar.lc,r3 mov r3=ar.lc };; { .mib; mov r8=r0 // return value sub r10=r34,r0,1 (p6) br.ret.spnt.many b0 };; - .body { .mib; setf.sig f8=r35 // w + .save pr,r9 mov r9=pr brp.loop.imp .L_bn_mul_add_words_ctop,.L_bn_mul_add_words_cend-16 } + .body { .mmi; ADDP r14=0,r32 // rp ADDP r15=0,r33 // ap mov ar.lc=r10 } @@ -466,7 +465,6 @@ bn_mul_add_words: .skip 32 // makes the loop body aligned at 64-byte boundary bn_sqr_words: .prologue - .fframe 0 .save ar.pfs,r2 { .mii; alloc r2=ar.pfs,3,0,0,0 sxt4 r34=r34 };; @@ -476,9 +474,10 @@ bn_sqr_words: nop.f 0x0 (p6) br.ret.spnt.many b0 };; - .save ar.lc,r3 { .mii; sub r10=r34,r0,1 + .save ar.lc,r3 mov r3=ar.lc + .save pr,r9 mov r9=pr };; .body @@ -545,7 +544,6 @@ bn_sqr_words: .align 64 bn_sqr_comba8: .prologue - .fframe 0 .save ar.pfs,r2 #if defined(_HPUX_SOURCE) && !defined(_LP64) { .mii; alloc r2=ar.pfs,2,1,0,0 @@ -617,7 +615,6 @@ bn_sqr_comba8: .align 64 bn_mul_comba8: .prologue - .fframe 0 .save ar.pfs,r2 #if defined(_HPUX_SOURCE) && !defined(_LP64) { .mii; alloc r2=ar.pfs,3,0,0,0 @@ -1175,7 +1172,6 @@ bn_mul_comba8: .align 64 bn_sqr_comba4: .prologue - .fframe 0 .save ar.pfs,r2 #if defined(_HPUX_SOURCE) && !defined(_LP64) { .mii; alloc r2=ar.pfs,2,1,0,0 @@ -1208,7 +1204,6 @@ bn_sqr_comba4: .align 64 bn_mul_comba4: .prologue - .fframe 0 .save ar.pfs,r2 #if defined(_HPUX_SOURCE) && !defined(_LP64) { .mii; alloc r2=ar.pfs,3,0,0,0 @@ -1411,11 +1406,11 @@ equ=p24 .align 64 bn_div_words: .prologue - .fframe 0 .save ar.pfs,r2 - .save b0,r3 { .mii; alloc r2=ar.pfs,3,5,0,8 + .save b0,r3 mov r3=b0 + .save pr,r10 mov r10=pr };; { .mmb; cmp.eq p6,p0=r34,r0 mov r8=-1 diff --git a/crypto/bn/asm/mo-586.pl b/crypto/bn/asm/mo-586.pl new file mode 100755 index 0000000..0982293 --- /dev/null +++ b/crypto/bn/asm/mo-586.pl @@ -0,0 +1,603 @@ +#!/usr/bin/env perl + +# This is crypto/bn/asm/x86-mont.pl (with asciz from crypto/perlasm/x86asm.pl) +# from OpenSSL 0.9.9-dev + +sub ::asciz +{ my @str=unpack("C*",shift); + push @str,0; + while ($#str>15) { + &data_byte(@str[0..15]); + foreach (0..15) { shift @str; } + } + &data_byte(@str) if (@str); +} + +# ==================================================================== +# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== + +# October 2005 +# +# This is a "teaser" code, as it can be improved in several ways... +# First of all non-SSE2 path should be implemented (yes, for now it +# performs Montgomery multiplication/convolution only on SSE2-capable +# CPUs such as P4, others fall down to original code). Then inner loop +# can be unrolled and modulo-scheduled to improve ILP and possibly +# moved to 128-bit XMM register bank (though it would require input +# rearrangement and/or increase bus bandwidth utilization). Dedicated +# squaring procedure should give further performance improvement... +# Yet, for being draft, the code improves rsa512 *sign* benchmark by +# 110%(!), rsa1024 one - by 70% and rsa4096 - by 20%:-) + +# December 2006 +# +# Modulo-scheduling SSE2 loops results in further 15-20% improvement. +# Integer-only code [being equipped with dedicated squaring procedure] +# gives ~40% on rsa512 sign benchmark... + +push(@INC,"perlasm","../../perlasm"); +require "x86asm.pl"; + +&asm_init($ARGV[0],$0); + +$sse2=0; +for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } + +&external_label("OPENSSL_ia32cap_P") if ($sse2); + +&function_begin("bn_mul_mont"); + +$i="edx"; +$j="ecx"; +$ap="esi"; $tp="esi"; # overlapping variables!!! +$rp="edi"; $bp="edi"; # overlapping variables!!! +$np="ebp"; +$num="ebx"; + +$_num=&DWP(4*0,"esp"); # stack top layout +$_rp=&DWP(4*1,"esp"); +$_ap=&DWP(4*2,"esp"); +$_bp=&DWP(4*3,"esp"); +$_np=&DWP(4*4,"esp"); +$_n0=&DWP(4*5,"esp"); $_n0q=&QWP(4*5,"esp"); +$_sp=&DWP(4*6,"esp"); +$_bpend=&DWP(4*7,"esp"); +$frame=32; # size of above frame rounded up to 16n + + &xor ("eax","eax"); + &mov ("edi",&wparam(5)); # int num + &cmp ("edi",4); + &jl (&label("just_leave")); + + &lea ("esi",&wparam(0)); # put aside pointer to argument block + &lea ("edx",&wparam(1)); # load ap + &mov ("ebp","esp"); # saved stack pointer! + &add ("edi",2); # extra two words on top of tp + &neg ("edi"); + &lea ("esp",&DWP(-$frame,"esp","edi",4)); # alloca($frame+4*(num+2)) + &neg ("edi"); + + # minimize cache contention by arraning 2K window between stack + # pointer and ap argument [np is also position sensitive vector, + # but it's assumed to be near ap, as it's allocated at ~same + # time]. + &mov ("eax","esp"); + &sub ("eax","edx"); + &and ("eax",2047); + &sub ("esp","eax"); # this aligns sp and ap modulo 2048 + + &xor ("edx","esp"); + &and ("edx",2048); + &xor ("edx",2048); + &sub ("esp","edx"); # this splits them apart modulo 4096 + + &and ("esp",-64); # align to cache line + + ################################# load argument block... + &mov ("eax",&DWP(0*4,"esi"));# BN_ULONG *rp + &mov ("ebx",&DWP(1*4,"esi"));# const BN_ULONG *ap + &mov ("ecx",&DWP(2*4,"esi"));# const BN_ULONG *bp + &mov ("edx",&DWP(3*4,"esi"));# const BN_ULONG *np + &mov ("esi",&DWP(4*4,"esi"));# const BN_ULONG *n0 + #&mov ("edi",&DWP(5*4,"esi"));# int num + + &mov ("esi",&DWP(0,"esi")); # pull n0[0] + &mov ($_rp,"eax"); # ... save a copy of argument block + &mov ($_ap,"ebx"); + &mov ($_bp,"ecx"); + &mov ($_np,"edx"); + &mov ($_n0,"esi"); + &lea ($num,&DWP(-3,"edi")); # num=num-1 to assist modulo-scheduling + #&mov ($_num,$num); # redundant as $num is not reused + &mov ($_sp,"ebp"); # saved stack pointer! + +if($sse2) { +$acc0="mm0"; # mmx register bank layout +$acc1="mm1"; +$car0="mm2"; +$car1="mm3"; +$mul0="mm4"; +$mul1="mm5"; +$temp="mm6"; +$mask="mm7"; + + &picmeup("eax","OPENSSL_ia32cap_P"); + &bt (&DWP(0,"eax"),26); + &jnc (&label("non_sse2")); + + &mov ("eax",-1); + &movd ($mask,"eax"); # mask 32 lower bits + + &mov ($ap,$_ap); # load input pointers + &mov ($bp,$_bp); + &mov ($np,$_np); + + &xor ($i,$i); # i=0 + &xor ($j,$j); # j=0 + + &movd ($mul0,&DWP(0,$bp)); # bp[0] + &movd ($mul1,&DWP(0,$ap)); # ap[0] + &movd ($car1,&DWP(0,$np)); # np[0] + + &pmuludq($mul1,$mul0); # ap[0]*bp[0] + &movq ($car0,$mul1); + &movq ($acc0,$mul1); # I wish movd worked for + &pand ($acc0,$mask); # inter-register transfers + + &pmuludq($mul1,$_n0q); # *=n0 + + &pmuludq($car1,$mul1); # "t[0]"*np[0]*n0 + &paddq ($car1,$acc0); + + &movd ($acc1,&DWP(4,$np)); # np[1] + &movd ($acc0,&DWP(4,$ap)); # ap[1] + + &psrlq ($car0,32); + &psrlq ($car1,32); + + &inc ($j); # j++ +&set_label("1st",16); + &pmuludq($acc0,$mul0); # ap[j]*bp[0] + &pmuludq($acc1,$mul1); # np[j]*m1 + &paddq ($car0,$acc0); # +=c0 + &paddq ($car1,$acc1); # +=c1 + + &movq ($acc0,$car0); + &pand ($acc0,$mask); + &movd ($acc1,&DWP(4,$np,$j,4)); # np[j+1] + &paddq ($car1,$acc0); # +=ap[j]*bp[0]; + &movd ($acc0,&DWP(4,$ap,$j,4)); # ap[j+1] + &psrlq ($car0,32); + &movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[j-1]= + &psrlq ($car1,32); + + &lea ($j,&DWP(1,$j)); + &cmp ($j,$num); + &jl (&label("1st")); + + &pmuludq($acc0,$mul0); # ap[num-1]*bp[0] + &pmuludq($acc1,$mul1); # np[num-1]*m1 + &paddq ($car0,$acc0); # +=c0 + &paddq ($car1,$acc1); # +=c1 + + &movq ($acc0,$car0); + &pand ($acc0,$mask); + &paddq ($car1,$acc0); # +=ap[num-1]*bp[0]; + &movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[num-2]= + + &psrlq ($car0,32); + &psrlq ($car1,32); + + &paddq ($car1,$car0); + &movq (&QWP($frame,"esp",$num,4),$car1); # tp[num].tp[num-1] + + &inc ($i); # i++ +&set_label("outer"); + &xor ($j,$j); # j=0 + + &movd ($mul0,&DWP(0,$bp,$i,4)); # bp[i] + &movd ($mul1,&DWP(0,$ap)); # ap[0] + &movd ($temp,&DWP($frame,"esp")); # tp[0] + &movd ($car1,&DWP(0,$np)); # np[0] + &pmuludq($mul1,$mul0); # ap[0]*bp[i] + + &paddq ($mul1,$temp); # +=tp[0] + &movq ($acc0,$mul1); + &movq ($car0,$mul1); + &pand ($acc0,$mask); + + &pmuludq($mul1,$_n0q); # *=n0 + + &pmuludq($car1,$mul1); + &paddq ($car1,$acc0); + + &movd ($temp,&DWP($frame+4,"esp")); # tp[1] + &movd ($acc1,&DWP(4,$np)); # np[1] + &movd ($acc0,&DWP(4,$ap)); # ap[1] + + &psrlq ($car0,32); + &psrlq ($car1,32); + &paddq ($car0,$temp); # +=tp[1] + + &inc ($j); # j++ + &dec ($num); +&set_label("inner"); + &pmuludq($acc0,$mul0); # ap[j]*bp[i] + &pmuludq($acc1,$mul1); # np[j]*m1 + &paddq ($car0,$acc0); # +=c0 + &paddq ($car1,$acc1); # +=c1 + + &movq ($acc0,$car0); + &movd ($temp,&DWP($frame+4,"esp",$j,4));# tp[j+1] + &pand ($acc0,$mask); + &movd ($acc1,&DWP(4,$np,$j,4)); # np[j+1] + &paddq ($car1,$acc0); # +=ap[j]*bp[i]+tp[j] + &movd ($acc0,&DWP(4,$ap,$j,4)); # ap[j+1] + &psrlq ($car0,32); + &movd (&DWP($frame-4,"esp",$j,4),$car1);# tp[j-1]= + &psrlq ($car1,32); + &paddq ($car0,$temp); # +=tp[j+1] + + &dec ($num); + &lea ($j,&DWP(1,$j)); # j++ + &jnz (&label("inner")); + + &mov ($num,$j); + &pmuludq($acc0,$mul0); # ap[num-1]*bp[i] + &pmuludq($acc1,$mul1); # np[num-1]*m1 + &paddq ($car0,$acc0); # +=c0 + &paddq ($car1,$acc1); # +=c1 + + &movq ($acc0,$car0); + &pand ($acc0,$mask); + &paddq ($car1,$acc0); # +=ap[num-1]*bp[i]+tp[num-1] + &movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[num-2]= + &psrlq ($car0,32); + &psrlq ($car1,32); + + &movd ($temp,&DWP($frame+4,"esp",$num,4)); # += tp[num] + &paddq ($car1,$car0); + &paddq ($car1,$temp); + &movq (&QWP($frame,"esp",$num,4),$car1); # tp[num].tp[num-1] + + &lea ($i,&DWP(1,$i)); # i++ + &cmp ($i,$num); + &jle (&label("outer")); + + &emms (); # done with mmx bank + &jmp (&label("common_tail")); + +&set_label("non_sse2",16); +} + +if (0) { + &mov ("esp",$_sp); + &xor ("eax","eax"); # signal "not fast enough [yet]" + &jmp (&label("just_leave")); + # While the below code provides competitive performance for + # all key lengthes on modern Intel cores, it's still more + # than 10% slower for 4096-bit key elsewhere:-( "Competitive" + # means compared to the original integer-only assembler. + # 512-bit RSA sign is better by ~40%, but that's about all + # one can say about all CPUs... +} else { +$inp="esi"; # integer path uses these registers differently +$word="edi"; +$carry="ebp"; + + &mov ($inp,$_ap); + &lea ($carry,&DWP(1,$num)); + &mov ($word,$_bp); + &xor ($j,$j); # j=0 + &mov ("edx",$inp); + &and ($carry,1); # see if num is even + &sub ("edx",$word); # see if ap==bp + &lea ("eax",&DWP(4,$word,$num,4)); # &bp[num] + &or ($carry,"edx"); + &mov ($word,&DWP(0,$word)); # bp[0] + &jz (&label("bn_sqr_mont")); + &mov ($_bpend,"eax"); + &mov ("eax",&DWP(0,$inp)); + &xor ("edx","edx"); + +&set_label("mull",16); + &mov ($carry,"edx"); + &mul ($word); # ap[j]*bp[0] + &add ($carry,"eax"); + &lea ($j,&DWP(1,$j)); + &adc ("edx",0); + &mov ("eax",&DWP(0,$inp,$j,4)); # ap[j+1] + &cmp ($j,$num); + &mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]= + &jl (&label("mull")); + + &mov ($carry,"edx"); + &mul ($word); # ap[num-1]*bp[0] + &mov ($word,$_n0); + &add ("eax",$carry); + &mov ($inp,$_np); + &adc ("edx",0); + &imul ($word,&DWP($frame,"esp")); # n0*tp[0] + + &mov (&DWP($frame,"esp",$num,4),"eax"); # tp[num-1]= + &xor ($j,$j); + &mov (&DWP($frame+4,"esp",$num,4),"edx"); # tp[num]= + &mov (&DWP($frame+8,"esp",$num,4),$j); # tp[num+1]= + + &mov ("eax",&DWP(0,$inp)); # np[0] + &mul ($word); # np[0]*m + &add ("eax",&DWP($frame,"esp")); # +=tp[0] + &mov ("eax",&DWP(4,$inp)); # np[1] + &adc ("edx",0); + &inc ($j); + + &jmp (&label("2ndmadd")); + +&set_label("1stmadd",16); + &mov ($carry,"edx"); + &mul ($word); # ap[j]*bp[i] + &add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j] + &lea ($j,&DWP(1,$j)); + &adc ("edx",0); + &add ($carry,"eax"); + &mov ("eax",&DWP(0,$inp,$j,4)); # ap[j+1] + &adc ("edx",0); + &cmp ($j,$num); + &mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]= + &jl (&label("1stmadd")); + + &mov ($carry,"edx"); + &mul ($word); # ap[num-1]*bp[i] + &add ("eax",&DWP($frame,"esp",$num,4)); # +=tp[num-1] + &mov ($word,$_n0); + &adc ("edx",0); + &mov ($inp,$_np); + &add ($carry,"eax"); + &adc ("edx",0); + &imul ($word,&DWP($frame,"esp")); # n0*tp[0] + + &xor ($j,$j); + &add ("edx",&DWP($frame+4,"esp",$num,4)); # carry+=tp[num] + &mov (&DWP($frame,"esp",$num,4),$carry); # tp[num-1]= + &adc ($j,0); + &mov ("eax",&DWP(0,$inp)); # np[0] + &mov (&DWP($frame+4,"esp",$num,4),"edx"); # tp[num]= + &mov (&DWP($frame+8,"esp",$num,4),$j); # tp[num+1]= + + &mul ($word); # np[0]*m + &add ("eax",&DWP($frame,"esp")); # +=tp[0] + &mov ("eax",&DWP(4,$inp)); # np[1] + &adc ("edx",0); + &mov ($j,1); + +&set_label("2ndmadd",16); + &mov ($carry,"edx"); + &mul ($word); # np[j]*m + &add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j] + &lea ($j,&DWP(1,$j)); + &adc ("edx",0); + &add ($carry,"eax"); + &mov ("eax",&DWP(0,$inp,$j,4)); # np[j+1] + &adc ("edx",0); + &cmp ($j,$num); + &mov (&DWP($frame-8,"esp",$j,4),$carry); # tp[j-1]= + &jl (&label("2ndmadd")); + + &mov ($carry,"edx"); + &mul ($word); # np[j]*m + &add ($carry,&DWP($frame,"esp",$num,4)); # +=tp[num-1] + &adc ("edx",0); + &add ($carry,"eax"); + &adc ("edx",0); + &mov (&DWP($frame-4,"esp",$num,4),$carry); # tp[num-2]= + + &xor ("eax","eax"); + &mov ($j,$_bp); # &bp[i] + &add ("edx",&DWP($frame+4,"esp",$num,4)); # carry+=tp[num] + &adc ("eax",&DWP($frame+8,"esp",$num,4)); # +=tp[num+1] + &lea ($j,&DWP(4,$j)); + &mov (&DWP($frame,"esp",$num,4),"edx"); # tp[num-1]= + &cmp ($j,$_bpend); + &mov (&DWP($frame+4,"esp",$num,4),"eax"); # tp[num]= + &je (&label("common_tail")); + + &mov ($word,&DWP(0,$j)); # bp[i+1] + &mov ($inp,$_ap); + &mov ($_bp,$j); # &bp[++i] + &xor ($j,$j); + &xor ("edx","edx"); + &mov ("eax",&DWP(0,$inp)); + &jmp (&label("1stmadd")); + +&set_label("bn_sqr_mont",16); +$sbit=$num; + &mov ($_num,$num); + &mov ($_bp,$j); # i=0 + + &mov ("eax",$word); # ap[0] + &mul ($word); # ap[0]*ap[0] + &mov (&DWP($frame,"esp"),"eax"); # tp[0]= + &mov ($sbit,"edx"); + &shr ("edx",1); + &and ($sbit,1); + &inc ($j); +&set_label("sqr",16); + &mov ("eax",&DWP(0,$inp,$j,4)); # ap[j] + &mov ($carry,"edx"); + &mul ($word); # ap[j]*ap[0] + &add ("eax",$carry); + &lea ($j,&DWP(1,$j)); + &adc ("edx",0); + &lea ($carry,&DWP(0,$sbit,"eax",2)); + &shr ("eax",31); + &cmp ($j,$_num); + &mov ($sbit,"eax"); + &mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]= + &jl (&label("sqr")); + + &mov ("eax",&DWP(0,$inp,$j,4)); # ap[num-1] + &mov ($carry,"edx"); + &mul ($word); # ap[num-1]*ap[0] + &add ("eax",$carry); + &mov ($word,$_n0); + &adc ("edx",0); + &mov ($inp,$_np); + &lea ($carry,&DWP(0,$sbit,"eax",2)); + &imul ($word,&DWP($frame,"esp")); # n0*tp[0] + &shr ("eax",31); + &mov (&DWP($frame,"esp",$j,4),$carry); # tp[num-1]= + + &lea ($carry,&DWP(0,"eax","edx",2)); + &mov ("eax",&DWP(0,$inp)); # np[0] + &shr ("edx",31); + &mov (&DWP($frame+4,"esp",$j,4),$carry); # tp[num]= + &mov (&DWP($frame+8,"esp",$j,4),"edx"); # tp[num+1]= + + &mul ($word); # np[0]*m + &add ("eax",&DWP($frame,"esp")); # +=tp[0] + &mov ($num,$j); + &adc ("edx",0); + &mov ("eax",&DWP(4,$inp)); # np[1] + &mov ($j,1); + +&set_label("3rdmadd",16); + &mov ($carry,"edx"); + &mul ($word); # np[j]*m + &add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j] + &adc ("edx",0); + &add ($carry,"eax"); + &mov ("eax",&DWP(4,$inp,$j,4)); # np[j+1] + &adc ("edx",0); + &mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j-1]= + + &mov ($carry,"edx"); + &mul ($word); # np[j+1]*m + &add ($carry,&DWP($frame+4,"esp",$j,4)); # +=tp[j+1] + &lea ($j,&DWP(2,$j)); + &adc ("edx",0); + &add ($carry,"eax"); + &mov ("eax",&DWP(0,$inp,$j,4)); # np[j+2] + &adc ("edx",0); + &cmp ($j,$num); + &mov (&DWP($frame-8,"esp",$j,4),$carry); # tp[j]= + &jl (&label("3rdmadd")); + + &mov ($carry,"edx"); + &mul ($word); # np[j]*m + &add ($carry,&DWP($frame,"esp",$num,4)); # +=tp[num-1] + &adc ("edx",0); + &add ($carry,"eax"); + &adc ("edx",0); + &mov (&DWP($frame-4,"esp",$num,4),$carry); # tp[num-2]= + + &mov ($j,$_bp); # i + &xor ("eax","eax"); + &mov ($inp,$_ap); + &add ("edx",&DWP($frame+4,"esp",$num,4)); # carry+=tp[num] + &adc ("eax",&DWP($frame+8,"esp",$num,4)); # +=tp[num+1] + &mov (&DWP($frame,"esp",$num,4),"edx"); # tp[num-1]= + &cmp ($j,$num); + &mov (&DWP($frame+4,"esp",$num,4),"eax"); # tp[num]= + &je (&label("common_tail")); + + &mov ($word,&DWP(4,$inp,$j,4)); # ap[i] + &lea ($j,&DWP(1,$j)); + &mov ("eax",$word); + &mov ($_bp,$j); # ++i + &mul ($word); # ap[i]*ap[i] + &add ("eax",&DWP($frame,"esp",$j,4)); # +=tp[i] + &adc ("edx",0); + &mov (&DWP($frame,"esp",$j,4),"eax"); # tp[i]= + &xor ($carry,$carry); + &cmp ($j,$num); + &lea ($j,&DWP(1,$j)); + &je (&label("sqrlast")); + + &mov ($sbit,"edx"); # zaps $num + &shr ("edx",1); + &and ($sbit,1); +&set_label("sqradd",16); + &mov ("eax",&DWP(0,$inp,$j,4)); # ap[j] + &mov ($carry,"edx"); + &mul ($word); # ap[j]*ap[i] + &add ("eax",$carry); + &lea ($carry,&DWP(0,"eax","eax")); + &adc ("edx",0); + &shr ("eax",31); + &add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j] + &lea ($j,&DWP(1,$j)); + &adc ("eax",0); + &add ($carry,$sbit); + &adc ("eax",0); + &cmp ($j,$_num); + &mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]= + &mov ($sbit,"eax"); + &jle (&label("sqradd")); + + &mov ($carry,"edx"); + &lea ("edx",&DWP(0,$sbit,"edx",2)); + &shr ($carry,31); +&set_label("sqrlast"); + &mov ($word,$_n0); + &mov ($inp,$_np); + &imul ($word,&DWP($frame,"esp")); # n0*tp[0] + + &add ("edx",&DWP($frame,"esp",$j,4)); # +=tp[num] + &mov ("eax",&DWP(0,$inp)); # np[0] + &adc ($carry,0); + &mov (&DWP($frame,"esp",$j,4),"edx"); # tp[num]= + &mov (&DWP($frame+4,"esp",$j,4),$carry); # tp[num+1]= + + &mul ($word); # np[0]*m + &add ("eax",&DWP($frame,"esp")); # +=tp[0] + &lea ($num,&DWP(-1,$j)); + &adc ("edx",0); + &mov ($j,1); + &mov ("eax",&DWP(4,$inp)); # np[1] + + &jmp (&label("3rdmadd")); +} + +&set_label("common_tail",16); + &mov ($np,$_np); # load modulus pointer + &mov ($rp,$_rp); # load result pointer + &lea ($tp,&DWP($frame,"esp")); # [$ap and $bp are zapped] + + &mov ("eax",&DWP(0,$tp)); # tp[0] + &mov ($j,$num); # j=num-1 + &xor ($i,$i); # i=0 and clear CF! + +&set_label("sub",16); + &sbb ("eax",&DWP(0,$np,$i,4)); + &mov (&DWP(0,$rp,$i,4),"eax"); # rp[i]=tp[i]-np[i] + &dec ($j); # doesn't affect CF! + &mov ("eax",&DWP(4,$tp,$i,4)); # tp[i+1] + &lea ($i,&DWP(1,$i)); # i++ + &jge (&label("sub")); + + &sbb ("eax",0); # handle upmost overflow bit + &and ($tp,"eax"); + ¬ ("eax"); + &mov ($np,$rp); + &and ($np,"eax"); + &or ($tp,$np); # tp=carry?tp:rp + +&set_label("copy",16); # copy or in-place refresh + &mov ("eax",&DWP(0,$tp,$num,4)); + &mov (&DWP(0,$rp,$num,4),"eax"); # rp[i]=tp[i] + &mov (&DWP($frame,"esp",$num,4),$j); # zap temporary vector + &dec ($num); + &jge (&label("copy")); + + &mov ("esp",$_sp); # pull saved stack pointer + &mov ("eax",1); +&set_label("just_leave"); +&function_end("bn_mul_mont"); + +&asciz("Montgomery Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>"); + +&asm_finish(); diff --git a/crypto/bn/asm/x86_64-mont.pl b/crypto/bn/asm/x86_64-mont.pl new file mode 100755 index 0000000..c43b695 --- /dev/null +++ b/crypto/bn/asm/x86_64-mont.pl @@ -0,0 +1,214 @@ +#!/usr/bin/env perl + +# ==================================================================== +# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== + +# October 2005. +# +# Montgomery multiplication routine for x86_64. While it gives modest +# 9% improvement of rsa4096 sign on Opteron, rsa512 sign runs more +# than twice, >2x, as fast. Most common rsa1024 sign is improved by +# respectful 50%. It remains to be seen if loop unrolling and +# dedicated squaring routine can provide further improvement... + +$output=shift; + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open STDOUT,"| $^X $xlate $output"; + +# int bn_mul_mont( +$rp="%rdi"; # BN_ULONG *rp, +$ap="%rsi"; # const BN_ULONG *ap, +$bp="%rdx"; # const BN_ULONG *bp, +$np="%rcx"; # const BN_ULONG *np, +$n0="%r8"; # const BN_ULONG *n0, +$num="%r9"; # int num); +$lo0="%r10"; +$hi0="%r11"; +$bp="%r12"; # reassign $bp +$hi1="%r13"; +$i="%r14"; +$j="%r15"; +$m0="%rbx"; +$m1="%rbp"; + +$code=<<___; +.text + +.globl bn_mul_mont +.type bn_mul_mont,\@function,6 +.align 16 +bn_mul_mont: + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + + mov ${num}d,${num}d + lea 2($num),%rax + mov %rsp,%rbp + neg %rax + lea (%rsp,%rax,8),%rsp # tp=alloca(8*(num+2)) + and \$-1024,%rsp # minimize TLB usage + + mov %rbp,8(%rsp,$num,8) # tp[num+1]=%rsp + mov %rdx,$bp # $bp reassigned, remember? + + mov ($n0),$n0 # pull n0[0] value + + xor $i,$i # i=0 + xor $j,$j # j=0 + + mov ($bp),$m0 # m0=bp[0] + mov ($ap),%rax + mulq $m0 # ap[0]*bp[0] + mov %rax,$lo0 + mov %rdx,$hi0 + + imulq $n0,%rax # "tp[0]"*n0 + mov %rax,$m1 + + mulq ($np) # np[0]*m1 + add $lo0,%rax # discarded + adc \$0,%rdx + mov %rdx,$hi1 + + lea 1($j),$j # j++ +.L1st: + mov ($ap,$j,8),%rax + mulq $m0 # ap[j]*bp[0] + add $hi0,%rax + adc \$0,%rdx + mov %rax,$lo0 + mov ($np,$j,8),%rax + mov %rdx,$hi0 + + mulq $m1 # np[j]*m1 + add $hi1,%rax + lea 1($j),$j # j++ + adc \$0,%rdx + add $lo0,%rax # np[j]*m1+ap[j]*bp[0] + adc \$0,%rdx + mov %rax,-16(%rsp,$j,8) # tp[j-1] + cmp $num,$j + mov %rdx,$hi1 + jl .L1st + + xor %rdx,%rdx + add $hi0,$hi1 + adc \$0,%rdx + mov $hi1,-8(%rsp,$num,8) + mov %rdx,(%rsp,$num,8) # store upmost overflow bit + + lea 1($i),$i # i++ +.align 4 +.Louter: + xor $j,$j # j=0 + + mov ($bp,$i,8),$m0 # m0=bp[i] + mov ($ap),%rax # ap[0] + mulq $m0 # ap[0]*bp[i] + add (%rsp),%rax # ap[0]*bp[i]+tp[0] + adc \$0,%rdx + mov %rax,$lo0 + mov %rdx,$hi0 + + imulq $n0,%rax # tp[0]*n0 + mov %rax,$m1 + + mulq ($np,$j,8) # np[0]*m1 + add $lo0,%rax # discarded + mov 8(%rsp),$lo0 # tp[1] + adc \$0,%rdx + mov %rdx,$hi1 + + lea 1($j),$j # j++ +.align 4 +.Linner: + mov ($ap,$j,8),%rax + mulq $m0 # ap[j]*bp[i] + add $hi0,%rax + adc \$0,%rdx + add %rax,$lo0 # ap[j]*bp[i]+tp[j] + mov ($np,$j,8),%rax + adc \$0,%rdx + mov %rdx,$hi0 + + mulq $m1 # np[j]*m1 + add $hi1,%rax + lea 1($j),$j # j++ + adc \$0,%rdx + add $lo0,%rax # np[j]*m1+ap[j]*bp[i]+tp[j] + adc \$0,%rdx + mov (%rsp,$j,8),$lo0 + cmp $num,$j + mov %rax,-16(%rsp,$j,8) # tp[j-1] + mov %rdx,$hi1 + jl .Linner + + xor %rdx,%rdx + add $hi0,$hi1 + adc \$0,%rdx + add $lo0,$hi1 # pull upmost overflow bit + adc \$0,%rdx + mov $hi1,-8(%rsp,$num,8) + mov %rdx,(%rsp,$num,8) # store upmost overflow bit + + lea 1($i),$i # i++ + cmp $num,$i + jl .Louter + + lea (%rsp),$ap # borrow ap for tp + lea -1($num),$j # j=num-1 + + mov ($ap),%rax # tp[0] + xor $i,$i # i=0 and clear CF! + jmp .Lsub +.align 16 +.Lsub: sbb ($np,$i,8),%rax + mov %rax,($rp,$i,8) # rp[i]=tp[i]-np[i] + dec $j # doesn't affect CF! + mov 8($ap,$i,8),%rax # tp[i+1] + lea 1($i),$i # i++ + jge .Lsub + + sbb \$0,%rax # handle upmost overflow bit + and %rax,$ap + not %rax + mov $rp,$np + and %rax,$np + lea -1($num),$j + or $np,$ap # ap=borrow?tp:rp +.align 16 +.Lcopy: # copy or in-place refresh + mov ($ap,$j,8),%rax + mov %rax,($rp,$j,8) # rp[i]=tp[i] + mov $i,(%rsp,$j,8) # zap temporary vector + dec $j + jge .Lcopy + + mov 8(%rsp,$num,8),%rsp # restore %rsp + mov \$1,%rax + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + ret +.size bn_mul_mont,.-bn_mul_mont +.asciz "Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>" +___ + +print $code; +close STDOUT; diff --git a/crypto/bn/bn.h b/crypto/bn/bn.h index 95c5d64..6d754d5 100644 --- a/crypto/bn/bn.h +++ b/crypto/bn/bn.h @@ -245,8 +245,18 @@ extern "C" { #define BN_FLG_MALLOCED 0x01 #define BN_FLG_STATIC_DATA 0x02 -#define BN_FLG_EXP_CONSTTIME 0x04 /* avoid leaking exponent information through timings - * (BN_mod_exp_mont() will call BN_mod_exp_mont_consttime) */ +#define BN_FLG_CONSTTIME 0x04 /* avoid leaking exponent information through timing, + * BN_mod_exp_mont() will call BN_mod_exp_mont_consttime, + * BN_div() will call BN_div_no_branch, + * BN_mod_inverse() will call BN_mod_inverse_no_branch. + */ + +#ifndef OPENSSL_NO_DEPRECATED +#define BN_FLG_EXP_CONSTTIME BN_FLG_CONSTTIME /* deprecated name for the flag */ + /* avoid leaking exponent information through timings + * (BN_mod_exp_mont() will call BN_mod_exp_mont_consttime) */ +#endif + #ifndef OPENSSL_NO_DEPRECATED #define BN_FLG_FREE 0x8000 /* used for debuging */ #endif @@ -293,7 +303,12 @@ struct bn_mont_ctx_st BIGNUM N; /* The modulus */ BIGNUM Ni; /* R*(1/R mod N) - N*Ni = 1 * (Ni is only stored for bignum algorithm) */ +#if 0 + /* OpenSSL 0.9.9 preview: */ + BN_ULONG n0[2];/* least significant word(s) of Ni */ +#else BN_ULONG n0; /* least significant word of Ni */ +#endif int flags; }; @@ -534,7 +549,7 @@ BN_MONT_CTX *BN_MONT_CTX_set_locked(BN_MONT_CTX **pmont, int lock, #define BN_BLINDING_NO_UPDATE 0x00000001 #define BN_BLINDING_NO_RECREATE 0x00000002 -BN_BLINDING *BN_BLINDING_new(const BIGNUM *A, const BIGNUM *Ai, BIGNUM *mod); +BN_BLINDING *BN_BLINDING_new(const BIGNUM *A, const BIGNUM *Ai, /* const */ BIGNUM *mod); void BN_BLINDING_free(BN_BLINDING *b); int BN_BLINDING_update(BN_BLINDING *b,BN_CTX *ctx); int BN_BLINDING_convert(BIGNUM *n, BN_BLINDING *b, BN_CTX *ctx); @@ -546,7 +561,7 @@ void BN_BLINDING_set_thread_id(BN_BLINDING *, unsigned long); unsigned long BN_BLINDING_get_flags(const BN_BLINDING *); void BN_BLINDING_set_flags(BN_BLINDING *, unsigned long); BN_BLINDING *BN_BLINDING_create_param(BN_BLINDING *b, - const BIGNUM *e, BIGNUM *m, BN_CTX *ctx, + const BIGNUM *e, /* const */ BIGNUM *m, BN_CTX *ctx, int (*bn_mod_exp)(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, const BIGNUM *m, BN_CTX *ctx, BN_MONT_CTX *m_ctx), BN_MONT_CTX *m_ctx); @@ -775,6 +790,7 @@ void ERR_load_BN_strings(void); #define BN_F_BN_CTX_NEW 106 #define BN_F_BN_CTX_START 129 #define BN_F_BN_DIV 107 +#define BN_F_BN_DIV_NO_BRANCH 138 #define BN_F_BN_DIV_RECP 130 #define BN_F_BN_EXP 123 #define BN_F_BN_EXPAND2 108 @@ -793,6 +809,7 @@ void ERR_load_BN_strings(void); #define BN_F_BN_MOD_EXP_RECP 125 #define BN_F_BN_MOD_EXP_SIMPLE 126 #define BN_F_BN_MOD_INVERSE 110 +#define BN_F_BN_MOD_INVERSE_NO_BRANCH 139 #define BN_F_BN_MOD_LSHIFT_QUICK 119 #define BN_F_BN_MOD_MUL_RECIPROCAL 111 #define BN_F_BN_MOD_SQRT 121 diff --git a/crypto/bn/bn_blind.c b/crypto/bn/bn_blind.c index ca22d4f..c11fb4cc 100644 --- a/crypto/bn/bn_blind.c +++ b/crypto/bn/bn_blind.c @@ -131,7 +131,7 @@ struct bn_blinding_st BN_MONT_CTX *m_ctx); }; -BN_BLINDING *BN_BLINDING_new(const BIGNUM *A, const BIGNUM *Ai, BIGNUM *mod) +BN_BLINDING *BN_BLINDING_new(const BIGNUM *A, const BIGNUM *Ai, /* const */ BIGNUM *mod) { BN_BLINDING *ret=NULL; @@ -151,7 +151,12 @@ BN_BLINDING *BN_BLINDING_new(const BIGNUM *A, const BIGNUM *Ai, BIGNUM *mod) { if ((ret->Ai = BN_dup(Ai)) == NULL) goto err; } - ret->mod = mod; + + /* save a copy of mod in the BN_BLINDING structure */ + if ((ret->mod = BN_dup(mod)) == NULL) goto err; + if (BN_get_flags(mod, BN_FLG_CONSTTIME) != 0) + BN_set_flags(ret->mod, BN_FLG_CONSTTIME); + ret->counter = BN_BLINDING_COUNTER; return(ret); err: @@ -167,6 +172,7 @@ void BN_BLINDING_free(BN_BLINDING *r) if (r->A != NULL) BN_free(r->A ); if (r->Ai != NULL) BN_free(r->Ai); if (r->e != NULL) BN_free(r->e ); + if (r->mod != NULL) BN_free(r->mod); OPENSSL_free(r); } @@ -278,7 +284,7 @@ void BN_BLINDING_set_flags(BN_BLINDING *b, unsigned long flags) } BN_BLINDING *BN_BLINDING_create_param(BN_BLINDING *b, - const BIGNUM *e, BIGNUM *m, BN_CTX *ctx, + const BIGNUM *e, /* const */ BIGNUM *m, BN_CTX *ctx, int (*bn_mod_exp)(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, const BIGNUM *m, BN_CTX *ctx, BN_MONT_CTX *m_ctx), BN_MONT_CTX *m_ctx) diff --git a/crypto/bn/bn_div.c b/crypto/bn/bn_div.c index 2857f44..1e8e576 100644 --- a/crypto/bn/bn_div.c +++ b/crypto/bn/bn_div.c @@ -169,13 +169,15 @@ int BN_div(BIGNUM *dv, BIGNUM *rem, const BIGNUM *m, const BIGNUM *d, #endif /* OPENSSL_NO_ASM */ -/* BN_div computes dv := num / divisor, rounding towards zero, and sets up - * rm such that dv*divisor + rm = num holds. +/* BN_div[_no_branch] computes dv := num / divisor, rounding towards + * zero, and sets up rm such that dv*divisor + rm = num holds. * Thus: * dv->neg == num->neg ^ divisor->neg (unless the result is zero) * rm->neg == num->neg (unless the remainder is zero) * If 'dv' or 'rm' is NULL, the respective value is not returned. */ +static int BN_div_no_branch(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num, + const BIGNUM *divisor, BN_CTX *ctx); int BN_div(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num, const BIGNUM *divisor, BN_CTX *ctx) { @@ -185,9 +187,25 @@ int BN_div(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num, const BIGNUM *divisor, BN_ULONG d0,d1; int num_n,div_n; + /* Invalid zero-padding would have particularly bad consequences + * in the case of 'num', so don't just rely on bn_check_top() for this one + * (bn_check_top() works only for BN_DEBUG builds) */ + if (num->top > 0 && num->d[num->top - 1] == 0) + { + BNerr(BN_F_BN_DIV,BN_R_NOT_INITIALIZED); + return 0; + } + + bn_check_top(num); + + if ((BN_get_flags(num, BN_FLG_CONSTTIME) != 0) || (BN_get_flags(divisor, BN_FLG_CONSTTIME) != 0)) + { + return BN_div_no_branch(dv, rm, num, divisor, ctx); + } + bn_check_top(dv); bn_check_top(rm); - bn_check_top(num); + /* bn_check_top(num); */ /* 'num' has been checked already */ bn_check_top(divisor); if (BN_is_zero(divisor)) @@ -397,4 +415,229 @@ err: return(0); } + +/* BN_div_no_branch is a special version of BN_div. It does not contain + * branches that may leak sensitive information. + */ +static int BN_div_no_branch(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num, + const BIGNUM *divisor, BN_CTX *ctx) + { + int norm_shift,i,loop; + BIGNUM *tmp,wnum,*snum,*sdiv,*res; + BN_ULONG *resp,*wnump; + BN_ULONG d0,d1; + int num_n,div_n; + + bn_check_top(dv); + bn_check_top(rm); + /* bn_check_top(num); */ /* 'num' has been checked in BN_div() */ + bn_check_top(divisor); + + if (BN_is_zero(divisor)) + { + BNerr(BN_F_BN_DIV_NO_BRANCH,BN_R_DIV_BY_ZERO); + return(0); + } + + BN_CTX_start(ctx); + tmp=BN_CTX_get(ctx); + snum=BN_CTX_get(ctx); + sdiv=BN_CTX_get(ctx); + if (dv == NULL) + res=BN_CTX_get(ctx); + else res=dv; + if (sdiv == NULL || res == NULL) goto err; + + /* First we normalise the numbers */ + norm_shift=BN_BITS2-((BN_num_bits(divisor))%BN_BITS2); + if (!(BN_lshift(sdiv,divisor,norm_shift))) goto err; + sdiv->neg=0; + norm_shift+=BN_BITS2; + if (!(BN_lshift(snum,num,norm_shift))) goto err; + snum->neg=0; + + /* Since we don't know whether snum is larger than sdiv, + * we pad snum with enough zeroes without changing its + * value. + */ + if (snum->top <= sdiv->top+1) + { + if (bn_wexpand(snum, sdiv->top + 2) == NULL) goto err; + for (i = snum->top; i < sdiv->top + 2; i++) snum->d[i] = 0; + snum->top = sdiv->top + 2; + } + else + { + if (bn_wexpand(snum, snum->top + 1) == NULL) goto err; + snum->d[snum->top] = 0; + snum->top ++; + } + + div_n=sdiv->top; + num_n=snum->top; + loop=num_n-div_n; + /* Lets setup a 'window' into snum + * This is the part that corresponds to the current + * 'area' being divided */ + wnum.neg = 0; + wnum.d = &(snum->d[loop]); + wnum.top = div_n; + /* only needed when BN_ucmp messes up the values between top and max */ + wnum.dmax = snum->dmax - loop; /* so we don't step out of bounds */ + + /* Get the top 2 words of sdiv */ + /* div_n=sdiv->top; */ + d0=sdiv->d[div_n-1]; + d1=(div_n == 1)?0:sdiv->d[div_n-2]; + + /* pointer to the 'top' of snum */ + wnump= &(snum->d[num_n-1]); + + /* Setup to 'res' */ + res->neg= (num->neg^divisor->neg); + if (!bn_wexpand(res,(loop+1))) goto err; + res->top=loop-1; + resp= &(res->d[loop-1]); + + /* space for temp */ + if (!bn_wexpand(tmp,(div_n+1))) goto err; + + /* if res->top == 0 then clear the neg value otherwise decrease + * the resp pointer */ + if (res->top == 0) + res->neg = 0; + else + resp--; + + for (i=0; i<loop-1; i++, wnump--, resp--) + { + BN_ULONG q,l0; + /* the first part of the loop uses the top two words of + * snum and sdiv to calculate a BN_ULONG q such that + * | wnum - sdiv * q | < sdiv */ +#if defined(BN_DIV3W) && !defined(OPENSSL_NO_ASM) + BN_ULONG bn_div_3_words(BN_ULONG*,BN_ULONG,BN_ULONG); + q=bn_div_3_words(wnump,d1,d0); +#else + BN_ULONG n0,n1,rem=0; + + n0=wnump[0]; + n1=wnump[-1]; + if (n0 == d0) + q=BN_MASK2; + else /* n0 < d0 */ + { +#ifdef BN_LLONG + BN_ULLONG t2; + +#if defined(BN_LLONG) && defined(BN_DIV2W) && !defined(bn_div_words) + q=(BN_ULONG)(((((BN_ULLONG)n0)<<BN_BITS2)|n1)/d0); +#else + q=bn_div_words(n0,n1,d0); +#ifdef BN_DEBUG_LEVITTE + fprintf(stderr,"DEBUG: bn_div_words(0x%08X,0x%08X,0x%08\ +X) -> 0x%08X\n", + n0, n1, d0, q); +#endif +#endif + +#ifndef REMAINDER_IS_ALREADY_CALCULATED + /* + * rem doesn't have to be BN_ULLONG. The least we + * know it's less that d0, isn't it? + */ + rem=(n1-q*d0)&BN_MASK2; +#endif + t2=(BN_ULLONG)d1*q; + + for (;;) + { + if (t2 <= ((((BN_ULLONG)rem)<<BN_BITS2)|wnump[-2])) + break; + q--; + rem += d0; + if (rem < d0) break; /* don't let rem overflow */ + t2 -= d1; + } +#else /* !BN_LLONG */ + BN_ULONG t2l,t2h,ql,qh; + + q=bn_div_words(n0,n1,d0); +#ifdef BN_DEBUG_LEVITTE + fprintf(stderr,"DEBUG: bn_div_words(0x%08X,0x%08X,0x%08\ +X) -> 0x%08X\n", + n0, n1, d0, q); +#endif +#ifndef REMAINDER_IS_ALREADY_CALCULATED + rem=(n1-q*d0)&BN_MASK2; +#endif + +#if defined(BN_UMULT_LOHI) + BN_UMULT_LOHI(t2l,t2h,d1,q); +#elif defined(BN_UMULT_HIGH) + t2l = d1 * q; + t2h = BN_UMULT_HIGH(d1,q); +#else + t2l=LBITS(d1); t2h=HBITS(d1); + ql =LBITS(q); qh =HBITS(q); + mul64(t2l,t2h,ql,qh); /* t2=(BN_ULLONG)d1*q; */ +#endif + + for (;;) + { + if ((t2h < rem) || + ((t2h == rem) && (t2l <= wnump[-2]))) + break; + q--; + rem += d0; + if (rem < d0) break; /* don't let rem overflow */ + if (t2l < d1) t2h--; t2l -= d1; + } +#endif /* !BN_LLONG */ + } +#endif /* !BN_DIV3W */ + + l0=bn_mul_words(tmp->d,sdiv->d,div_n,q); + tmp->d[div_n]=l0; + wnum.d--; + /* ingore top values of the bignums just sub the two + * BN_ULONG arrays with bn_sub_words */ + if (bn_sub_words(wnum.d, wnum.d, tmp->d, div_n+1)) + { + /* Note: As we have considered only the leading + * two BN_ULONGs in the calculation of q, sdiv * q + * might be greater than wnum (but then (q-1) * sdiv + * is less or equal than wnum) + */ + q--; + if (bn_add_words(wnum.d, wnum.d, sdiv->d, div_n)) + /* we can't have an overflow here (assuming + * that q != 0, but if q == 0 then tmp is + * zero anyway) */ + (*wnump)++; + } + /* store part of the result */ + *resp = q; + } + bn_correct_top(snum); + if (rm != NULL) + { + /* Keep a copy of the neg flag in num because if rm==num + * BN_rshift() will overwrite it. + */ + int neg = num->neg; + BN_rshift(rm,snum,norm_shift); + if (!BN_is_zero(rm)) + rm->neg = neg; + bn_check_top(rm); + } + bn_correct_top(res); + BN_CTX_end(ctx); + return(1); +err: + bn_check_top(rm); + BN_CTX_end(ctx); + return(0); + } + #endif diff --git a/crypto/bn/bn_err.c b/crypto/bn/bn_err.c index 24fbbb7..cfe2eb9 100644 --- a/crypto/bn/bn_err.c +++ b/crypto/bn/bn_err.c @@ -1,6 +1,6 @@ /* crypto/bn/bn_err.c */ /* ==================================================================== - * Copyright (c) 1999-2005 The OpenSSL Project. All rights reserved. + * Copyright (c) 1999-2007 The OpenSSL Project. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -82,6 +82,7 @@ static ERR_STRING_DATA BN_str_functs[]= {ERR_FUNC(BN_F_BN_CTX_NEW), "BN_CTX_new"}, {ERR_FUNC(BN_F_BN_CTX_START), "BN_CTX_start"}, {ERR_FUNC(BN_F_BN_DIV), "BN_div"}, +{ERR_FUNC(BN_F_BN_DIV_NO_BRANCH), "BN_div_no_branch"}, {ERR_FUNC(BN_F_BN_DIV_RECP), "BN_div_recp"}, {ERR_FUNC(BN_F_BN_EXP), "BN_exp"}, {ERR_FUNC(BN_F_BN_EXPAND2), "bn_expand2"}, @@ -100,6 +101,7 @@ static ERR_STRING_DATA BN_str_functs[]= {ERR_FUNC(BN_F_BN_MOD_EXP_RECP), "BN_mod_exp_recp"}, {ERR_FUNC(BN_F_BN_MOD_EXP_SIMPLE), "BN_mod_exp_simple"}, {ERR_FUNC(BN_F_BN_MOD_INVERSE), "BN_mod_inverse"}, +{ERR_FUNC(BN_F_BN_MOD_INVERSE_NO_BRANCH), "BN_mod_inverse_no_branch"}, {ERR_FUNC(BN_F_BN_MOD_LSHIFT_QUICK), "BN_mod_lshift_quick"}, {ERR_FUNC(BN_F_BN_MOD_MUL_RECIPROCAL), "BN_mod_mul_reciprocal"}, {ERR_FUNC(BN_F_BN_MOD_SQRT), "BN_mod_sqrt"}, diff --git a/crypto/bn/bn_exp.c b/crypto/bn/bn_exp.c index 8f8c694..70a33f0 100644 --- a/crypto/bn/bn_exp.c +++ b/crypto/bn/bn_exp.c @@ -122,9 +122,9 @@ int BN_exp(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx) int i,bits,ret=0; BIGNUM *v,*rr; - if (BN_get_flags(p, BN_FLG_EXP_CONSTTIME) != 0) + if (BN_get_flags(p, BN_FLG_CONSTTIME) != 0) { - /* BN_FLG_EXP_CONSTTIME only supported by BN_mod_exp_mont() */ + /* BN_FLG_CONSTTIME only supported by BN_mod_exp_mont() */ BNerr(BN_F_BN_EXP,ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED); return -1; } @@ -213,7 +213,7 @@ int BN_mod_exp(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, const BIGNUM *m, if (BN_is_odd(m)) { # ifdef MONT_EXP_WORD - if (a->top == 1 && !a->neg && (BN_get_flags(p, BN_FLG_EXP_CONSTTIME) == 0)) + if (a->top == 1 && !a->neg && (BN_get_flags(p, BN_FLG_CONSTTIME) == 0)) { BN_ULONG A = a->d[0]; ret=BN_mod_exp_mont_word(r,A,p,m,ctx,NULL); @@ -245,9 +245,9 @@ int BN_mod_exp_recp(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BIGNUM *val[TABLE_SIZE]; BN_RECP_CTX recp; - if (BN_get_flags(p, BN_FLG_EXP_CONSTTIME) != 0) + if (BN_get_flags(p, BN_FLG_CONSTTIME) != 0) { - /* BN_FLG_EXP_CONSTTIME only supported by BN_mod_exp_mont() */ + /* BN_FLG_CONSTTIME only supported by BN_mod_exp_mont() */ BNerr(BN_F_BN_MOD_EXP_RECP,ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED); return -1; } @@ -379,7 +379,7 @@ int BN_mod_exp_mont(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p, BIGNUM *val[TABLE_SIZE]; BN_MONT_CTX *mont=NULL; - if (BN_get_flags(p, BN_FLG_EXP_CONSTTIME) != 0) + if (BN_get_flags(p, BN_FLG_CONSTTIME) != 0) { return BN_mod_exp_mont_consttime(rr, a, p, m, ctx, in_mont); } @@ -745,9 +745,9 @@ int BN_mod_exp_mont_word(BIGNUM *rr, BN_ULONG a, const BIGNUM *p, #define BN_TO_MONTGOMERY_WORD(r, w, mont) \ (BN_set_word(r, (w)) && BN_to_montgomery(r, r, (mont), ctx)) - if (BN_get_flags(p, BN_FLG_EXP_CONSTTIME) != 0) + if (BN_get_flags(p, BN_FLG_CONSTTIME) != 0) { - /* BN_FLG_EXP_CONSTTIME only supported by BN_mod_exp_mont() */ + /* BN_FLG_CONSTTIME only supported by BN_mod_exp_mont() */ BNerr(BN_F_BN_MOD_EXP_MONT_WORD,ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED); return -1; } @@ -881,9 +881,9 @@ int BN_mod_exp_simple(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, /* Table of variables obtained from 'ctx' */ BIGNUM *val[TABLE_SIZE]; - if (BN_get_flags(p, BN_FLG_EXP_CONSTTIME) != 0) + if (BN_get_flags(p, BN_FLG_CONSTTIME) != 0) { - /* BN_FLG_EXP_CONSTTIME only supported by BN_mod_exp_mont() */ + /* BN_FLG_CONSTTIME only supported by BN_mod_exp_mont() */ BNerr(BN_F_BN_MOD_EXP_SIMPLE,ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED); return -1; } diff --git a/crypto/bn/bn_gcd.c b/crypto/bn/bn_gcd.c index f02e6fc..4a35211 100644 --- a/crypto/bn/bn_gcd.c +++ b/crypto/bn/bn_gcd.c @@ -203,6 +203,8 @@ err: /* solves ax == 1 (mod n) */ +static BIGNUM *BN_mod_inverse_no_branch(BIGNUM *in, + const BIGNUM *a, const BIGNUM *n, BN_CTX *ctx); BIGNUM *BN_mod_inverse(BIGNUM *in, const BIGNUM *a, const BIGNUM *n, BN_CTX *ctx) { @@ -210,6 +212,11 @@ BIGNUM *BN_mod_inverse(BIGNUM *in, BIGNUM *ret=NULL; int sign; + if ((BN_get_flags(a, BN_FLG_CONSTTIME) != 0) || (BN_get_flags(n, BN_FLG_CONSTTIME) != 0)) + { + return BN_mod_inverse_no_branch(in, a, n, ctx); + } + bn_check_top(a); bn_check_top(n); @@ -491,3 +498,157 @@ err: bn_check_top(ret); return(ret); } + + +/* BN_mod_inverse_no_branch is a special version of BN_mod_inverse. + * It does not contain branches that may leak sensitive information. + */ +static BIGNUM *BN_mod_inverse_no_branch(BIGNUM *in, + const BIGNUM *a, const BIGNUM *n, BN_CTX *ctx) + { + BIGNUM *A,*B,*X,*Y,*M,*D,*T,*R=NULL; + BIGNUM local_A, local_B; + BIGNUM *pA, *pB; + BIGNUM *ret=NULL; + int sign; + + bn_check_top(a); + bn_check_top(n); + + BN_CTX_start(ctx); + A = BN_CTX_get(ctx); + B = BN_CTX_get(ctx); + X = BN_CTX_get(ctx); + D = BN_CTX_get(ctx); + M = BN_CTX_get(ctx); + Y = BN_CTX_get(ctx); + T = BN_CTX_get(ctx); + if (T == NULL) goto err; + + if (in == NULL) + R=BN_new(); + else + R=in; + if (R == NULL) goto err; + + BN_one(X); + BN_zero(Y); + if (BN_copy(B,a) == NULL) goto err; + if (BN_copy(A,n) == NULL) goto err; + A->neg = 0; + + if (B->neg || (BN_ucmp(B, A) >= 0)) + { + /* Turn BN_FLG_CONSTTIME flag on, so that when BN_div is invoked, + * BN_div_no_branch will be called eventually. + */ + pB = &local_B; + BN_with_flags(pB, B, BN_FLG_CONSTTIME); + if (!BN_nnmod(B, pB, A, ctx)) goto err; + } + sign = -1; + /* From B = a mod |n|, A = |n| it follows that + * + * 0 <= B < A, + * -sign*X*a == B (mod |n|), + * sign*Y*a == A (mod |n|). + */ + + while (!BN_is_zero(B)) + { + BIGNUM *tmp; + + /* + * 0 < B < A, + * (*) -sign*X*a == B (mod |n|), + * sign*Y*a == A (mod |n|) + */ + + /* Turn BN_FLG_CONSTTIME flag on, so that when BN_div is invoked, + * BN_div_no_branch will be called eventually. + */ + pA = &local_A; + BN_with_flags(pA, A, BN_FLG_CONSTTIME); + + /* (D, M) := (A/B, A%B) ... */ + if (!BN_div(D,M,pA,B,ctx)) goto err; + + /* Now + * A = D*B + M; + * thus we have + * (**) sign*Y*a == D*B + M (mod |n|). + */ + + tmp=A; /* keep the BIGNUM object, the value does not matter */ + + /* (A, B) := (B, A mod B) ... */ + A=B; + B=M; + /* ... so we have 0 <= B < A again */ + + /* Since the former M is now B and the former B is now A, + * (**) translates into + * sign*Y*a == D*A + B (mod |n|), + * i.e. + * sign*Y*a - D*A == B (mod |n|). + * Similarly, (*) translates into + * -sign*X*a == A (mod |n|). + * + * Thus, + * sign*Y*a + D*sign*X*a == B (mod |n|), + * i.e. + * sign*(Y + D*X)*a == B (mod |n|). + * + * So if we set (X, Y, sign) := (Y + D*X, X, -sign), we arrive back at + * -sign*X*a == B (mod |n|), + * sign*Y*a == A (mod |n|). + * Note that X and Y stay non-negative all the time. + */ + + if (!BN_mul(tmp,D,X,ctx)) goto err; + if (!BN_add(tmp,tmp,Y)) goto err; + + M=Y; /* keep the BIGNUM object, the value does not matter */ + Y=X; + X=tmp; + sign = -sign; + } + + /* + * The while loop (Euclid's algorithm) ends when + * A == gcd(a,n); + * we have + * sign*Y*a == A (mod |n|), + * where Y is non-negative. + */ + + if (sign < 0) + { + if (!BN_sub(Y,n,Y)) goto err; + } + /* Now Y*a == A (mod |n|). */ + + if (BN_is_one(A)) + { + /* Y*a == 1 (mod |n|) */ + if (!Y->neg && BN_ucmp(Y,n) < 0) + { + if (!BN_copy(R,Y)) goto err; + } + else + { + if (!BN_nnmod(R,Y,n,ctx)) goto err; + } + } + else + { + BNerr(BN_F_BN_MOD_INVERSE_NO_BRANCH,BN_R_NO_INVERSE); + goto err; + } + ret=R; +err: + if ((ret == NULL) && (in == NULL)) BN_free(R); + BN_CTX_end(ctx); + bn_check_top(ret); + return(ret); + } diff --git a/crypto/bn/bn_gf2m.c b/crypto/bn/bn_gf2m.c index 6a79385..306f029 100644 --- a/crypto/bn/bn_gf2m.c +++ b/crypto/bn/bn_gf2m.c @@ -384,7 +384,11 @@ int BN_GF2m_mod_arr(BIGNUM *r, const BIGNUM *a, const unsigned int p[]) if (zz == 0) break; d1 = BN_BITS2 - d0; - if (d0) z[dN] = (z[dN] << d1) >> d1; /* clear up the top d1 bits */ + /* clear up the top d1 bits */ + if (d0) + z[dN] = (z[dN] << d1) >> d1; + else + z[dN] = 0; z[0] ^= zz; /* reduction t^0 component */ for (k = 1; p[k] != 0; k++) diff --git a/crypto/bn/bn_lcl.h b/crypto/bn/bn_lcl.h index ad4ca7f..27ac439 100644 --- a/crypto/bn/bn_lcl.h +++ b/crypto/bn/bn_lcl.h @@ -481,6 +481,7 @@ BN_ULONG bn_add_part_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, int cl, int dl); BN_ULONG bn_sub_part_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, int cl, int dl); +int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num); #ifdef __cplusplus } diff --git a/crypto/bn/bn_lib.c b/crypto/bn/bn_lib.c index 210ccb4..2649b8c 100644 --- a/crypto/bn/bn_lib.c +++ b/crypto/bn/bn_lib.c @@ -763,7 +763,7 @@ int BN_is_bit_set(const BIGNUM *a, int n) i=n/BN_BITS2; j=n%BN_BITS2; if (a->top <= i) return 0; - return((a->d[i]&(((BN_ULONG)1)<<j))?1:0); + return(((a->d[i])>>j)&((BN_ULONG)1)); } int BN_mask_bits(BIGNUM *a, int n) diff --git a/crypto/bn/bn_mont.c b/crypto/bn/bn_mont.c index 961ca67..4799b15 100644 --- a/crypto/bn/bn_mont.c +++ b/crypto/bn/bn_mont.c @@ -122,11 +122,50 @@ #define MONT_WORD /* use the faster word-based algorithm */ +#if defined(MONT_WORD) && defined(OPENSSL_BN_ASM_MONT) && (BN_BITS2<=32) +/* This condition means we have a specific non-default build: + * In the 0.9.8 branch, OPENSSL_BN_ASM_MONT is normally not set for any + * BN_BITS2<=32 platform; an explicit "enable-montasm" is required. + * I.e., if we are here, the user intentionally deviates from the + * normal stable build to get better Montgomery performance from + * the 0.9.9-dev backport. + * + * In this case only, we also enable BN_from_montgomery_word() + * (another non-stable feature from 0.9.9-dev). + */ +#define MONT_FROM_WORD___NON_DEFAULT_0_9_8_BUILD +#endif + +#ifdef MONT_FROM_WORD___NON_DEFAULT_0_9_8_BUILD +static int BN_from_montgomery_word(BIGNUM *ret, BIGNUM *r, BN_MONT_CTX *mont); +#endif + + + int BN_mod_mul_montgomery(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, BN_MONT_CTX *mont, BN_CTX *ctx) { BIGNUM *tmp; int ret=0; +#if defined(OPENSSL_BN_ASM_MONT) && defined(MONT_WORD) + int num = mont->N.top; + + if (num>1 && a->top==num && b->top==num) + { + if (bn_wexpand(r,num) == NULL) return(0); +#if 0 /* for OpenSSL 0.9.9 mont->n0 */ + if (bn_mul_mont(r->d,a->d,b->d,mont->N.d,mont->n0,num)) +#else + if (bn_mul_mont(r->d,a->d,b->d,mont->N.d,&mont->n0,num)) +#endif + { + r->neg = a->neg^b->neg; + r->top = num; + bn_correct_top(r); + return(1); + } + } +#endif BN_CTX_start(ctx); tmp = BN_CTX_get(ctx); @@ -142,7 +181,11 @@ int BN_mod_mul_montgomery(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, if (!BN_mul(tmp,a,b,ctx)) goto err; } /* reduce from aRR to aR */ +#ifdef MONT_FROM_WORD___NON_DEFAULT_0_9_8_BUILD + if (!BN_from_montgomery_word(r,tmp,mont)) goto err; +#else if (!BN_from_montgomery(r,tmp,mont,ctx)) goto err; +#endif bn_check_top(r); ret=1; err: @@ -150,6 +193,150 @@ err: return(ret); } +#ifdef MONT_FROM_WORD___NON_DEFAULT_0_9_8_BUILD +static int BN_from_montgomery_word(BIGNUM *ret, BIGNUM *r, BN_MONT_CTX *mont) + { + BIGNUM *n; + BN_ULONG *ap,*np,*rp,n0,v,*nrp; + int al,nl,max,i,x,ri; + + n= &(mont->N); + /* mont->ri is the size of mont->N in bits (rounded up + to the word size) */ + al=ri=mont->ri/BN_BITS2; + + nl=n->top; + if ((al == 0) || (nl == 0)) { ret->top=0; return(1); } + + max=(nl+al+1); /* allow for overflow (no?) XXX */ + if (bn_wexpand(r,max) == NULL) return(0); + + r->neg^=n->neg; + np=n->d; + rp=r->d; + nrp= &(r->d[nl]); + + /* clear the top words of T */ + for (i=r->top; i<max; i++) /* memset? XXX */ + r->d[i]=0; + + r->top=max; +#if 0 /* for OpenSSL 0.9.9 mont->n0 */ + n0=mont->n0[0]; +#else + n0=mont->n0; +#endif + +#ifdef BN_COUNT + fprintf(stderr,"word BN_from_montgomery_word %d * %d\n",nl,nl); +#endif + for (i=0; i<nl; i++) + { +#ifdef __TANDEM + { + long long t1; + long long t2; + long long t3; + t1 = rp[0] * (n0 & 0177777); + t2 = 037777600000l; + t2 = n0 & t2; + t3 = rp[0] & 0177777; + t2 = (t3 * t2) & BN_MASK2; + t1 = t1 + t2; + v=bn_mul_add_words(rp,np,nl,(BN_ULONG) t1); + } +#else + v=bn_mul_add_words(rp,np,nl,(rp[0]*n0)&BN_MASK2); +#endif + nrp++; + rp++; + if (((nrp[-1]+=v)&BN_MASK2) >= v) + continue; + else + { + if (((++nrp[0])&BN_MASK2) != 0) continue; + if (((++nrp[1])&BN_MASK2) != 0) continue; + for (x=2; (((++nrp[x])&BN_MASK2) == 0); x++) ; + } + } + bn_correct_top(r); + + /* mont->ri will be a multiple of the word size and below code + * is kind of BN_rshift(ret,r,mont->ri) equivalent */ + if (r->top <= ri) + { + ret->top=0; + return(1); + } + al=r->top-ri; + + if (bn_wexpand(ret,ri) == NULL) return(0); + x=0-(((al-ri)>>(sizeof(al)*8-1))&1); + ret->top=x=(ri&~x)|(al&x); /* min(ri,al) */ + ret->neg=r->neg; + + rp=ret->d; + ap=&(r->d[ri]); + + { + size_t m1,m2; + + v=bn_sub_words(rp,ap,np,ri); + /* this ----------------^^ works even in al<ri case + * thanks to zealous zeroing of top of the vector in the + * beginning. */ + + /* if (al==ri && !v) || al>ri) nrp=rp; else nrp=ap; */ + /* in other words if subtraction result is real, then + * trick unconditional memcpy below to perform in-place + * "refresh" instead of actual copy. */ + m1=0-(size_t)(((al-ri)>>(sizeof(al)*8-1))&1); /* al<ri */ + m2=0-(size_t)(((ri-al)>>(sizeof(al)*8-1))&1); /* al>ri */ + m1|=m2; /* (al!=ri) */ + m1|=(0-(size_t)v); /* (al!=ri || v) */ + m1&=~m2; /* (al!=ri || v) && !al>ri */ + nrp=(BN_ULONG *)(((size_t)rp&~m1)|((size_t)ap&m1)); + } + + /* 'i<ri' is chosen to eliminate dependency on input data, even + * though it results in redundant copy in al<ri case. */ + for (i=0,ri-=4; i<ri; i+=4) + { + BN_ULONG t1,t2,t3,t4; + + t1=nrp[i+0]; + t2=nrp[i+1]; + t3=nrp[i+2]; ap[i+0]=0; + t4=nrp[i+3]; ap[i+1]=0; + rp[i+0]=t1; ap[i+2]=0; + rp[i+1]=t2; ap[i+3]=0; + rp[i+2]=t3; + rp[i+3]=t4; + } + for (ri+=4; i<ri; i++) + rp[i]=nrp[i], ap[i]=0; + bn_correct_top(r); + bn_correct_top(ret); + bn_check_top(ret); + + return(1); + } + +int BN_from_montgomery(BIGNUM *ret, const BIGNUM *a, BN_MONT_CTX *mont, + BN_CTX *ctx) + { + int retn=0; + BIGNUM *t; + + BN_CTX_start(ctx); + if ((t = BN_CTX_get(ctx)) && BN_copy(t,a)) + retn = BN_from_montgomery_word(ret,t,mont); + BN_CTX_end(ctx); + return retn; + } + +#else /* !MONT_FROM_WORD___NON_DEFAULT_0_9_8_BUILD */ + int BN_from_montgomery(BIGNUM *ret, const BIGNUM *a, BN_MONT_CTX *mont, BN_CTX *ctx) { @@ -176,7 +363,6 @@ int BN_from_montgomery(BIGNUM *ret, const BIGNUM *a, BN_MONT_CTX *mont, max=(nl+al+1); /* allow for overflow (no?) XXX */ if (bn_wexpand(r,max) == NULL) goto err; - if (bn_wexpand(ret,max) == NULL) goto err; r->neg=a->neg^n->neg; np=n->d; @@ -228,19 +414,72 @@ int BN_from_montgomery(BIGNUM *ret, const BIGNUM *a, BN_MONT_CTX *mont, } bn_correct_top(r); - /* mont->ri will be a multiple of the word size */ -#if 0 - BN_rshift(ret,r,mont->ri); -#else - ret->neg = r->neg; - x=ri; + /* mont->ri will be a multiple of the word size and below code + * is kind of BN_rshift(ret,r,mont->ri) equivalent */ + if (r->top <= ri) + { + ret->top=0; + retn=1; + goto err; + } + al=r->top-ri; + +# define BRANCH_FREE 1 +# if BRANCH_FREE + if (bn_wexpand(ret,ri) == NULL) goto err; + x=0-(((al-ri)>>(sizeof(al)*8-1))&1); + ret->top=x=(ri&~x)|(al&x); /* min(ri,al) */ + ret->neg=r->neg; + rp=ret->d; - ap= &(r->d[x]); - if (r->top < x) - al=0; - else - al=r->top-x; + ap=&(r->d[ri]); + + { + size_t m1,m2; + + v=bn_sub_words(rp,ap,np,ri); + /* this ----------------^^ works even in al<ri case + * thanks to zealous zeroing of top of the vector in the + * beginning. */ + + /* if (al==ri && !v) || al>ri) nrp=rp; else nrp=ap; */ + /* in other words if subtraction result is real, then + * trick unconditional memcpy below to perform in-place + * "refresh" instead of actual copy. */ + m1=0-(size_t)(((al-ri)>>(sizeof(al)*8-1))&1); /* al<ri */ + m2=0-(size_t)(((ri-al)>>(sizeof(al)*8-1))&1); /* al>ri */ + m1|=m2; /* (al!=ri) */ + m1|=(0-(size_t)v); /* (al!=ri || v) */ + m1&=~m2; /* (al!=ri || v) && !al>ri */ + nrp=(BN_ULONG *)(((size_t)rp&~m1)|((size_t)ap&m1)); + } + + /* 'i<ri' is chosen to eliminate dependency on input data, even + * though it results in redundant copy in al<ri case. */ + for (i=0,ri-=4; i<ri; i+=4) + { + BN_ULONG t1,t2,t3,t4; + + t1=nrp[i+0]; + t2=nrp[i+1]; + t3=nrp[i+2]; ap[i+0]=0; + t4=nrp[i+3]; ap[i+1]=0; + rp[i+0]=t1; ap[i+2]=0; + rp[i+1]=t2; ap[i+3]=0; + rp[i+2]=t3; + rp[i+3]=t4; + } + for (ri+=4; i<ri; i++) + rp[i]=nrp[i], ap[i]=0; + bn_correct_top(r); + bn_correct_top(ret); +# else + if (bn_wexpand(ret,al) == NULL) goto err; ret->top=al; + ret->neg=r->neg; + + rp=ret->d; + ap=&(r->d[ri]); al-=4; for (i=0; i<al; i+=4) { @@ -258,7 +497,7 @@ int BN_from_montgomery(BIGNUM *ret, const BIGNUM *a, BN_MONT_CTX *mont, al+=4; for (; i<al; i++) rp[i]=ap[i]; -#endif +# endif #else /* !MONT_WORD */ BIGNUM *t1,*t2; @@ -278,16 +517,19 @@ int BN_from_montgomery(BIGNUM *ret, const BIGNUM *a, BN_MONT_CTX *mont, if (!BN_rshift(ret,t2,mont->ri)) goto err; #endif /* MONT_WORD */ +#if !defined(BRANCH_FREE) || BRANCH_FREE==0 if (BN_ucmp(ret, &(mont->N)) >= 0) { if (!BN_usub(ret,ret,&(mont->N))) goto err; } +#endif retn=1; bn_check_top(ret); err: BN_CTX_end(ctx); return(retn); } +#endif /* MONT_FROM_WORD___NON_DEFAULT_0_9_8_BUILD */ BN_MONT_CTX *BN_MONT_CTX_new(void) { @@ -307,6 +549,11 @@ void BN_MONT_CTX_init(BN_MONT_CTX *ctx) BN_init(&(ctx->RR)); BN_init(&(ctx->N)); BN_init(&(ctx->Ni)); +#if 0 /* for OpenSSL 0.9.9 mont->n0 */ + ctx->n0[0] = ctx->n0[1] = 0; +#else + ctx->n0 = 0; +#endif ctx->flags=0; } @@ -340,14 +587,51 @@ int BN_MONT_CTX_set(BN_MONT_CTX *mont, const BIGNUM *mod, BN_CTX *ctx) mont->ri=(BN_num_bits(mod)+(BN_BITS2-1))/BN_BITS2*BN_BITS2; BN_zero(R); +#if 0 /* for OpenSSL 0.9.9 mont->n0, would be "#if defined(OPENSSL_BN_ASM_MONT) && (BN_BITS2<=32)", + only certain BN_BITS2<=32 platforms actually need this */ + if (!(BN_set_bit(R,2*BN_BITS2))) goto err; /* R */ +#else if (!(BN_set_bit(R,BN_BITS2))) goto err; /* R */ +#endif buf[0]=mod->d[0]; /* tmod = N mod word size */ buf[1]=0; + + BN_init(&tmod); tmod.d=buf; tmod.top = buf[0] != 0 ? 1 : 0; tmod.dmax=2; tmod.neg=0; + +#if 0 /* for OpenSSL 0.9.9 mont->n0, would be "#if defined(OPENSSL_BN_ASM_MONT) && (BN_BITS2<=32)"; + only certain BN_BITS2<=32 platforms actually need this */ + tmod.top=0; + if ((buf[0] = mod->d[0])) tmod.top=1; + if ((buf[1] = mod->top>1 ? mod->d[1] : 0)) tmod.top=2; + + if ((BN_mod_inverse(Ri,R,&tmod,ctx)) == NULL) + goto err; + if (!BN_lshift(Ri,Ri,2*BN_BITS2)) goto err; /* R*Ri */ + if (!BN_is_zero(Ri)) + { + if (!BN_sub_word(Ri,1)) goto err; + } + else /* if N mod word size == 1 */ + { + if (bn_expand(Ri,(int)sizeof(BN_ULONG)*2) == NULL) + goto err; + /* Ri-- (mod double word size) */ + Ri->neg=0; + Ri->d[0]=BN_MASK2; + Ri->d[1]=BN_MASK2; + Ri->top=2; + } + if (!BN_div(Ri,NULL,Ri,&tmod,ctx)) goto err; + /* Ni = (R*Ri-1)/N, + * keep only couple of least significant words: */ + mont->n0[0] = (Ri->top > 0) ? Ri->d[0] : 0; + mont->n0[1] = (Ri->top > 1) ? Ri->d[1] : 0; +#else /* Ri = R^-1 mod N*/ if ((BN_mod_inverse(Ri,R,&tmod,ctx)) == NULL) goto err; @@ -363,7 +647,13 @@ int BN_MONT_CTX_set(BN_MONT_CTX *mont, const BIGNUM *mod, BN_CTX *ctx) if (!BN_div(Ri,NULL,Ri,&tmod,ctx)) goto err; /* Ni = (R*Ri-1)/N, * keep only least significant word: */ +# if 0 /* for OpenSSL 0.9.9 mont->n0 */ + mont->n0[0] = (Ri->top > 0) ? Ri->d[0] : 0; + mont->n0[1] = 0; +# else mont->n0 = (Ri->top > 0) ? Ri->d[0] : 0; +# endif +#endif } #else /* !MONT_WORD */ { /* bignum version */ @@ -399,7 +689,12 @@ BN_MONT_CTX *BN_MONT_CTX_copy(BN_MONT_CTX *to, BN_MONT_CTX *from) if (!BN_copy(&(to->N),&(from->N))) return NULL; if (!BN_copy(&(to->Ni),&(from->Ni))) return NULL; to->ri=from->ri; +#if 0 /* for OpenSSL 0.9.9 mont->n0 */ + to->n0[0]=from->n0[0]; + to->n0[1]=from->n0[1]; +#else to->n0=from->n0; +#endif return(to); } diff --git a/crypto/bn/bn_mul.c b/crypto/bn/bn_mul.c index aec1eaf..b848c8c 100644 --- a/crypto/bn/bn_mul.c +++ b/crypto/bn/bn_mul.c @@ -389,6 +389,7 @@ BN_ULONG bn_add_part_words(BN_ULONG *r, * a[0]*b[0]+a[1]*b[1]+(a[0]-a[1])*(b[1]-b[0]) * a[1]*b[1] */ +/* dnX may not be positive, but n2/2+dnX has to be */ void bn_mul_recursive(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n2, int dna, int dnb, BN_ULONG *t) { @@ -398,7 +399,7 @@ void bn_mul_recursive(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n2, BN_ULONG ln,lo,*p; # ifdef BN_COUNT - fprintf(stderr," bn_mul_recursive %d * %d\n",n2,n2); + fprintf(stderr," bn_mul_recursive %d%+d * %d%+d\n",n2,dna,n2,dnb); # endif # ifdef BN_MUL_COMBA # if 0 @@ -545,6 +546,7 @@ void bn_mul_recursive(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n2, /* n+tn is the word length * t needs to be n*4 is size, as does r */ +/* tnX may not be negative but less than n */ void bn_mul_part_recursive(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n, int tna, int tnb, BN_ULONG *t) { @@ -553,8 +555,8 @@ void bn_mul_part_recursive(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n, BN_ULONG ln,lo,*p; # ifdef BN_COUNT - fprintf(stderr," bn_mul_part_recursive (%d+%d) * (%d+%d)\n", - tna, n, tnb, n); + fprintf(stderr," bn_mul_part_recursive (%d%+d) * (%d%+d)\n", + n, tna, n, tnb); # endif if (n < 8) { @@ -655,14 +657,17 @@ void bn_mul_part_recursive(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n, for (;;) { i/=2; - if (i < tna && i < tnb) + /* these simplified conditions work + * exclusively because difference + * between tna and tnb is 1 or 0 */ + if (i < tna || i < tnb) { bn_mul_part_recursive(&(r[n2]), &(a[n]),&(b[n]), i,tna-i,tnb-i,p); break; } - else if (i <= tna && i <= tnb) + else if (i == tna || i == tnb) { bn_mul_recursive(&(r[n2]), &(a[n]),&(b[n]), diff --git a/crypto/bn/bn_nist.c b/crypto/bn/bn_nist.c index f8e306b..1fc94f5 100644 --- a/crypto/bn/bn_nist.c +++ b/crypto/bn/bn_nist.c @@ -59,6 +59,7 @@ #include "bn_lcl.h" #include "cryptlib.h" + #define BN_NIST_192_TOP (192+BN_BITS2-1)/BN_BITS2 #define BN_NIST_224_TOP (224+BN_BITS2-1)/BN_BITS2 #define BN_NIST_256_TOP (256+BN_BITS2-1)/BN_BITS2 @@ -99,114 +100,106 @@ static const BN_ULONG _nist_p_521[] = {0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF, 0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF, 0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF, 0xFFFFFFFF,0x000001FF}; -#elif BN_BITS2 == 16 -static const BN_ULONG _nist_p_192[] = {0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFE, - 0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF}; -static const BN_ULONG _nist_p_224[] = {0x0001,0x0000,0x0000,0x0000,0x0000, - 0x0000,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF}; -static const BN_ULONG _nist_p_256[] = {0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF, - 0xFFFF,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0001,0x0000,0xFFFF, - 0xFFFF}; -static const BN_ULONG _nist_p_384[] = {0xFFFF,0xFFFF,0x0000,0x0000,0x0000, - 0x0000,0xFFFF,0xFFFF,0xFFFE,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF, - 0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF}; -static const BN_ULONG _nist_p_521[] = {0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF, - 0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF, - 0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF, - 0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0x01FF}; -#elif BN_BITS2 == 8 -static const BN_ULONG _nist_p_192[] = {0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, - 0xFE,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, - 0xFF,0xFF}; -static const BN_ULONG _nist_p_224[] = {0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, - 0x00,0x00,0x00,0x00,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, - 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF}; -static const BN_ULONG _nist_p_256[] = {0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, - 0xFF,0xFF,0xFF,0xFF,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, - 0x00,0x00,0x01,0x00,0x00,0x00,0xFF,0xFF,0xFF,0xFF}; -static const BN_ULONG _nist_p_384[] = {0xFF,0xFF,0xFF,0xFF,0x00,0x00,0x00,0x00, - 0x00,0x00,0x00,0x00,0xFF,0xFF,0xFF,0xFF,0xFE,0xFF,0xFF,0xFF,0xFF,0xFF, - 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, - 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF}; -static const BN_ULONG _nist_p_521[] = {0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, - 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, - 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, - 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, - 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, - 0xFF,0x01}; #endif + +static const BIGNUM _bignum_nist_p_192 = + { + (BN_ULONG *)_nist_p_192, + BN_NIST_192_TOP, + BN_NIST_192_TOP, + 0, + BN_FLG_STATIC_DATA + }; + +static const BIGNUM _bignum_nist_p_224 = + { + (BN_ULONG *)_nist_p_224, + BN_NIST_224_TOP, + BN_NIST_224_TOP, + 0, + BN_FLG_STATIC_DATA + }; + +static const BIGNUM _bignum_nist_p_256 = + { + (BN_ULONG *)_nist_p_256, + BN_NIST_256_TOP, + BN_NIST_256_TOP, + 0, + BN_FLG_STATIC_DATA + }; + +static const BIGNUM _bignum_nist_p_384 = + { + (BN_ULONG *)_nist_p_384, + BN_NIST_384_TOP, + BN_NIST_384_TOP, + 0, + BN_FLG_STATIC_DATA + }; + +static const BIGNUM _bignum_nist_p_521 = + { + (BN_ULONG *)_nist_p_521, + BN_NIST_521_TOP, + BN_NIST_521_TOP, + 0, + BN_FLG_STATIC_DATA + }; + + const BIGNUM *BN_get0_nist_prime_192(void) { - static BIGNUM const_nist_192 = { (BN_ULONG *)_nist_p_192, - BN_NIST_192_TOP, BN_NIST_192_TOP, 0, BN_FLG_STATIC_DATA }; - return &const_nist_192; + return &_bignum_nist_p_192; } const BIGNUM *BN_get0_nist_prime_224(void) { - static BIGNUM const_nist_224 = { (BN_ULONG *)_nist_p_224, - BN_NIST_224_TOP, BN_NIST_224_TOP, 0, BN_FLG_STATIC_DATA }; - return &const_nist_224; + return &_bignum_nist_p_224; } const BIGNUM *BN_get0_nist_prime_256(void) { - static BIGNUM const_nist_256 = { (BN_ULONG *)_nist_p_256, - BN_NIST_256_TOP, BN_NIST_256_TOP, 0, BN_FLG_STATIC_DATA }; - return &const_nist_256; + return &_bignum_nist_p_256; } const BIGNUM *BN_get0_nist_prime_384(void) { - static BIGNUM const_nist_384 = { (BN_ULONG *)_nist_p_384, - BN_NIST_384_TOP, BN_NIST_384_TOP, 0, BN_FLG_STATIC_DATA }; - return &const_nist_384; + return &_bignum_nist_p_384; } const BIGNUM *BN_get0_nist_prime_521(void) { - static BIGNUM const_nist_521 = { (BN_ULONG *)_nist_p_521, - BN_NIST_521_TOP, BN_NIST_521_TOP, 0, BN_FLG_STATIC_DATA }; - return &const_nist_521; + return &_bignum_nist_p_521; } -/* some misc internal functions */ -#if BN_BITS2 != 64 -static BN_ULONG _256_data[BN_NIST_256_TOP*6]; -static int _is_set_256_data = 0; -static void _init_256_data(void); - -static BN_ULONG _384_data[BN_NIST_384_TOP*8]; -static int _is_set_384_data = 0; -static void _init_384_data(void); -#endif - -#define BN_NIST_ADD_ONE(a) while (!(++(*(a)))) ++(a); static void nist_cp_bn_0(BN_ULONG *buf, BN_ULONG *a, int top, int max) - { + { int i; - BN_ULONG *_tmp1 = (buf), *_tmp2 = (a); - for (i = (top); i != 0; i--) - *_tmp1++ = *_tmp2++; - for (i = (max) - (top); i != 0; i--) - *_tmp1++ = (BN_ULONG) 0; - } + BN_ULONG *_tmp1 = (buf), *_tmp2 = (a); + + OPENSSL_assert(top <= max); + for (i = (top); i != 0; i--) + *_tmp1++ = *_tmp2++; + for (i = (max) - (top); i != 0; i--) + *_tmp1++ = (BN_ULONG) 0; + } static void nist_cp_bn(BN_ULONG *buf, BN_ULONG *a, int top) - { + { int i; - BN_ULONG *_tmp1 = (buf), *_tmp2 = (a); - for (i = (top); i != 0; i--) - *_tmp1++ = *_tmp2++; - } + BN_ULONG *_tmp1 = (buf), *_tmp2 = (a); + for (i = (top); i != 0; i--) + *_tmp1++ = *_tmp2++; + } #if BN_BITS2 == 64 -#define bn_cp_64(to, n, from, m) (to)[n] = (from)[m]; +#define bn_cp_64(to, n, from, m) (to)[n] = (m>=0)?((from)[m]):0; #define bn_64_set_0(to, n) (to)[n] = (BN_ULONG)0; /* TBD */ -#define bn_cp_32(to, n, from, m) (to)[n] = (from)[m]; +#define bn_cp_32(to, n, from, m) (to)[n] = (m>=0)?((from)[m]):0; #define bn_32_set_0(to, n) (to)[n] = (BN_ULONG)0; #else #define bn_cp_64(to, n, from, m) \ @@ -220,26 +213,8 @@ static void nist_cp_bn(BN_ULONG *buf, BN_ULONG *a, int top) bn_32_set_0(to, (n)*2+1); \ } #if BN_BITS2 == 32 -#define bn_cp_32(to, n, from, m) (to)[n] = (from)[m]; +#define bn_cp_32(to, n, from, m) (to)[n] = (m>=0)?((from)[m]):0; #define bn_32_set_0(to, n) (to)[n] = (BN_ULONG)0; -#elif BN_BITS2 == 16 -#define bn_cp_32(to, n, from, m) \ - { \ - (to)[(n)*2] = (from)[(m)*2]; \ - (to)[(n)*2+1] = (from)[(m)*2+1];\ - } -#define bn_32_set_0(to, n) { (to)[(n)*2] = 0; (to)[(n)*2+1] = 0; } -#elif BN_BITS2 == 8 -#define bn_cp_32(to, n, from, m) \ - { \ - (to)[(n)*4] = (from)[(m)*4]; \ - (to)[(n)*4+1] = (from)[(m)*4+1];\ - (to)[(n)*4+2] = (from)[(m)*4+2];\ - (to)[(n)*4+3] = (from)[(m)*4+3];\ - } -#define bn_32_set_0(to, n) \ - { (to)[(n)*4] = (BN_ULONG)0; (to)[(n)*4+1] = (BN_ULONG)0; \ - (to)[(n)*4+2] = (BN_ULONG)0; (to)[(n)*4+3] = (BN_ULONG)0; } #endif #endif /* BN_BITS2 != 64 */ @@ -255,10 +230,18 @@ int BN_nist_mod_192(BIGNUM *r, const BIGNUM *a, const BIGNUM *field, BN_CTX *ctx) { int top = a->top, i; - BN_ULONG carry = 0; + int carry; register BN_ULONG *r_d, *a_d = a->d; BN_ULONG t_d[BN_NIST_192_TOP], - buf[BN_NIST_192_TOP]; + buf[BN_NIST_192_TOP], + c_d[BN_NIST_192_TOP], + *res; + size_t mask; + + field = &_bignum_nist_p_192; /* just to make sure */ + + if (BN_is_negative(a) || a->top > 2*BN_NIST_192_TOP) + return BN_nnmod(r, field, a, ctx); i = BN_ucmp(field, a); if (i == 0) @@ -269,9 +252,6 @@ int BN_nist_mod_192(BIGNUM *r, const BIGNUM *a, const BIGNUM *field, else if (i > 0) return (r == a) ? 1 : (BN_copy(r ,a) != NULL); - if (top == BN_NIST_192_TOP) - return BN_usub(r, a, field); - if (r != a) { if (!bn_wexpand(r, BN_NIST_192_TOP)) @@ -284,41 +264,33 @@ int BN_nist_mod_192(BIGNUM *r, const BIGNUM *a, const BIGNUM *field, nist_cp_bn_0(buf, a_d + BN_NIST_192_TOP, top - BN_NIST_192_TOP, BN_NIST_192_TOP); -#if defined(OPENSSL_SYS_VMS) && defined(__DECC) -# pragma message save -# pragma message disable BADSUBSCRIPT -#endif - nist_set_192(t_d, buf, 0, 3, 3); - if (bn_add_words(r_d, r_d, t_d, BN_NIST_192_TOP)) - ++carry; - + carry = bn_add_words(r_d, r_d, t_d, BN_NIST_192_TOP); + mask = 0-(size_t)bn_sub_words(c_d,r_d,_nist_p_192,BN_NIST_192_TOP); + mask = ~mask | (0-(size_t)carry); + res = (BN_ULONG *)(((size_t)c_d&mask) | ((size_t)r_d&~mask)); + nist_set_192(t_d, buf, 4, 4, 0); - if (bn_add_words(r_d, r_d, t_d, BN_NIST_192_TOP)) - ++carry; - -#if defined(OPENSSL_SYS_VMS) && defined(__DECC) -# pragma message restore -#endif + carry = bn_add_words(r_d, res, t_d, BN_NIST_192_TOP); + mask = 0-(size_t)bn_sub_words(c_d,r_d,_nist_p_192,BN_NIST_192_TOP); + mask = ~mask | (0-(size_t)carry); + res = (BN_ULONG *)(((size_t)c_d&mask) | ((size_t)r_d&~mask)); nist_set_192(t_d, buf, 5, 5, 5) - if (bn_add_words(r_d, r_d, t_d, BN_NIST_192_TOP)) - ++carry; + carry = bn_add_words(r_d, res, t_d, BN_NIST_192_TOP); + mask = 0-(size_t)bn_sub_words(c_d,r_d,_nist_p_192,BN_NIST_192_TOP); + mask = ~mask | (0-(size_t)carry); + res = (BN_ULONG *)(((size_t)c_d&mask) | ((size_t)r_d&~mask)); - while (carry) - { - if (bn_sub_words(r_d, r_d, _nist_p_192, BN_NIST_192_TOP)) - --carry; - } + nist_cp_bn(r_d, res, BN_NIST_192_TOP); r->top = BN_NIST_192_TOP; bn_correct_top(r); - if (BN_ucmp(r, field) >= 0) + + if (BN_ucmp(field, r) <= 0) { - bn_sub_words(r_d, r_d, _nist_p_192, BN_NIST_192_TOP); - bn_correct_top(r); + if (!BN_usub(r, r, field)) return 0; } - bn_check_top(r); return 1; } @@ -336,12 +308,20 @@ int BN_nist_mod_192(BIGNUM *r, const BIGNUM *a, const BIGNUM *field, int BN_nist_mod_224(BIGNUM *r, const BIGNUM *a, const BIGNUM *field, BN_CTX *ctx) { -#if BN_BITS2 != 64 +#if BN_BITS2 == 32 int top = a->top, i; - int carry = 0; + int carry; BN_ULONG *r_d, *a_d = a->d; BN_ULONG t_d[BN_NIST_224_TOP], - buf[BN_NIST_224_TOP]; + buf[BN_NIST_224_TOP], + c_d[BN_NIST_224_TOP], + *res; + size_t mask; + + field = &_bignum_nist_p_224; /* just to make sure */ + + if (BN_is_negative(a) || a->top > 2*BN_NIST_224_TOP) + return BN_nnmod(r, field, a, ctx); i = BN_ucmp(field, a); if (i == 0) @@ -352,9 +332,6 @@ int BN_nist_mod_224(BIGNUM *r, const BIGNUM *a, const BIGNUM *field, else if (i > 0) return (r == a)? 1 : (BN_copy(r ,a) != NULL); - if (top == BN_NIST_224_TOP) - return BN_usub(r, a, field); - if (r != a) { if (!bn_wexpand(r, BN_NIST_224_TOP)) @@ -368,65 +345,53 @@ int BN_nist_mod_224(BIGNUM *r, const BIGNUM *a, const BIGNUM *field, nist_cp_bn_0(buf, a_d + BN_NIST_224_TOP, top - BN_NIST_224_TOP, BN_NIST_224_TOP); nist_set_224(t_d, buf, 10, 9, 8, 7, 0, 0, 0); - if (bn_add_words(r_d, r_d, t_d, BN_NIST_224_TOP)) - ++carry; + carry = bn_add_words(r_d, r_d, t_d, BN_NIST_224_TOP); + mask = 0-(size_t)bn_sub_words(c_d,r_d,_nist_p_224,BN_NIST_224_TOP); + mask = ~mask | (0-(size_t)carry); + res = (BN_ULONG *)(((size_t)c_d&mask) | ((size_t)r_d&~mask)); + nist_set_224(t_d, buf, 0, 13, 12, 11, 0, 0, 0); - if (bn_add_words(r_d, r_d, t_d, BN_NIST_224_TOP)) - ++carry; + carry = bn_add_words(r_d, res, t_d, BN_NIST_224_TOP); + mask = 0-(size_t)bn_sub_words(c_d,r_d,_nist_p_224,BN_NIST_224_TOP); + mask = ~mask | (0-(size_t)carry); + res = (BN_ULONG *)(((size_t)c_d&mask) | ((size_t)r_d&~mask)); + nist_set_224(t_d, buf, 13, 12, 11, 10, 9, 8, 7); - if (bn_sub_words(r_d, r_d, t_d, BN_NIST_224_TOP)) - --carry; +#if BRANCH_FREE + carry = bn_sub_words(r_d, res, t_d, BN_NIST_224_TOP); + bn_add_words(c_d,r_d,_nist_p_224,BN_NIST_224_TOP); + mask = 0-(size_t)carry; + res = (BN_ULONG *)(((size_t)c_d&mask) | ((size_t)r_d&~mask)); +#else + if (bn_sub_words(r_d, res, t_d, BN_NIST_224_TOP)) + bn_add_words(r_d,r_d,_nist_p_224,BN_NIST_224_TOP); +#endif nist_set_224(t_d, buf, 0, 0, 0, 0, 13, 12, 11); - if (bn_sub_words(r_d, r_d, t_d, BN_NIST_224_TOP)) - --carry; - - if (carry > 0) - while (carry) - { - if (bn_sub_words(r_d,r_d,_nist_p_224,BN_NIST_224_TOP)) - --carry; - } - else if (carry < 0) - while (carry) - { - if (bn_add_words(r_d,r_d,_nist_p_224,BN_NIST_224_TOP)) - ++carry; - } +#if BRANCH_FREE + carry = bn_sub_words(r_d, res, t_d, BN_NIST_224_TOP); + bn_add_words(c_d,r_d,_nist_p_224,BN_NIST_224_TOP); + mask = 0-(size_t)carry; + res = (BN_ULONG *)(((size_t)c_d&mask) | ((size_t)r_d&~mask)); + nist_cp_bn(r_d, res, BN_NIST_224_TOP); +#else + if (bn_sub_words(r_d, r_d, t_d, BN_NIST_224_TOP)) + bn_add_words(r_d,r_d,_nist_p_224,BN_NIST_224_TOP); +#endif r->top = BN_NIST_224_TOP; bn_correct_top(r); - if (BN_ucmp(r, field) >= 0) + + if (BN_ucmp(field, r) <= 0) { - bn_sub_words(r_d, r_d, _nist_p_224, BN_NIST_224_TOP); - bn_correct_top(r); + if (!BN_usub(r, r, field)) return 0; } - bn_check_top(r); + return 1; -#else +#else /* BN_BITS!=32 */ return 0; #endif } -#if BN_BITS2 != 64 -static void _init_256_data(void) - { - int i; - BN_ULONG *tmp1 = _256_data; - const BN_ULONG *tmp2 = tmp1; - - memcpy(tmp1, _nist_p_256, BN_NIST_256_TOP * sizeof(BN_ULONG)); - tmp1 += BN_NIST_256_TOP; - - for (i=0; i<5; i++) - { - bn_add_words(tmp1, _nist_p_256, tmp2, BN_NIST_256_TOP); - tmp2 = tmp1; - tmp1 += BN_NIST_256_TOP; - } - _is_set_256_data = 1; - } -#endif - #define nist_set_256(to, from, a1, a2, a3, a4, a5, a6, a7, a8) \ { \ if (a8 != 0) bn_cp_32(to, 0, from, (a8) - 8) else bn_32_set_0(to, 0)\ @@ -442,24 +407,21 @@ static void _init_256_data(void) int BN_nist_mod_256(BIGNUM *r, const BIGNUM *a, const BIGNUM *field, BN_CTX *ctx) { -#if BN_BITS2 != 64 +#if BN_BITS2 == 32 int i, top = a->top; int carry = 0; register BN_ULONG *a_d = a->d, *r_d; BN_ULONG t_d[BN_NIST_256_TOP], - t_d2[BN_NIST_256_TOP], - buf[BN_NIST_256_TOP]; + buf[BN_NIST_256_TOP], + c_d[BN_NIST_256_TOP], + *res; + size_t mask; + + field = &_bignum_nist_p_256; /* just to make sure */ + + if (BN_is_negative(a) || a->top > 2*BN_NIST_256_TOP) + return BN_nnmod(r, field, a, ctx); - if (!_is_set_256_data) - { - CRYPTO_w_lock(CRYPTO_LOCK_BN); - - if (!_is_set_256_data) - _init_256_data(); - - CRYPTO_w_unlock(CRYPTO_LOCK_BN); - } - i = BN_ucmp(field, a); if (i == 0) { @@ -469,9 +431,6 @@ int BN_nist_mod_256(BIGNUM *r, const BIGNUM *a, const BIGNUM *field, else if (i > 0) return (r == a)? 1 : (BN_copy(r ,a) != NULL); - if (top == BN_NIST_256_TOP) - return BN_usub(r, a, field); - if (r != a) { if (!bn_wexpand(r, BN_NIST_256_TOP)) @@ -487,98 +446,96 @@ int BN_nist_mod_256(BIGNUM *r, const BIGNUM *a, const BIGNUM *field, /*S1*/ nist_set_256(t_d, buf, 15, 14, 13, 12, 11, 0, 0, 0); /*S2*/ - nist_set_256(t_d2,buf, 0, 15, 14, 13, 12, 0, 0, 0); - if (bn_add_words(t_d, t_d, t_d2, BN_NIST_256_TOP)) - carry = 2; - /* left shift */ - { - register BN_ULONG *ap,t,c; - ap = t_d; - c=0; - for (i = BN_NIST_256_TOP; i != 0; --i) - { - t= *ap; - *(ap++)=((t<<1)|c)&BN_MASK2; - c=(t & BN_TBIT)?1:0; - } - if (c) - ++carry; - } + nist_set_256(c_d,buf, 0, 15, 14, 13, 12, 0, 0, 0); + carry = bn_add_words(t_d, t_d, c_d, BN_NIST_256_TOP); + mask = 0-(size_t)bn_sub_words(c_d,t_d,_nist_p_256,BN_NIST_256_TOP); + mask = ~mask | (0-(size_t)carry); + res = (BN_ULONG *)(((size_t)c_d&mask) | ((size_t)t_d&~mask)); + + carry = bn_add_words(t_d, res, res, BN_NIST_256_TOP); + mask = 0-(size_t)bn_sub_words(c_d,t_d,_nist_p_256,BN_NIST_256_TOP); + mask = ~mask | (0-(size_t)carry); + res = (BN_ULONG *)(((size_t)c_d&mask) | ((size_t)t_d&~mask)); + + carry = bn_add_words(r_d, r_d, res, BN_NIST_256_TOP); + mask = 0-(size_t)bn_sub_words(c_d,r_d,_nist_p_256,BN_NIST_256_TOP); + mask = ~mask | (0-(size_t)carry); + res = (BN_ULONG *)(((size_t)c_d&mask) | ((size_t)r_d&~mask)); - if (bn_add_words(r_d, r_d, t_d, BN_NIST_256_TOP)) - ++carry; /*S3*/ nist_set_256(t_d, buf, 15, 14, 0, 0, 0, 10, 9, 8); - if (bn_add_words(r_d, r_d, t_d, BN_NIST_256_TOP)) - ++carry; + carry = bn_add_words(r_d, res, t_d, BN_NIST_256_TOP); + mask = 0-(size_t)bn_sub_words(c_d,r_d,_nist_p_256,BN_NIST_256_TOP); + mask = ~mask | (0-(size_t)carry); + res = (BN_ULONG *)(((size_t)c_d&mask) | ((size_t)r_d&~mask)); + /*S4*/ nist_set_256(t_d, buf, 8, 13, 15, 14, 13, 11, 10, 9); - if (bn_add_words(r_d, r_d, t_d, BN_NIST_256_TOP)) - ++carry; + carry = bn_add_words(r_d, res, t_d, BN_NIST_256_TOP); + mask = 0-(size_t)bn_sub_words(c_d,r_d,_nist_p_256,BN_NIST_256_TOP); + mask = ~mask | (0-(size_t)carry); + res = (BN_ULONG *)(((size_t)c_d&mask) | ((size_t)r_d&~mask)); + /*D1*/ nist_set_256(t_d, buf, 10, 8, 0, 0, 0, 13, 12, 11); - if (bn_sub_words(r_d, r_d, t_d, BN_NIST_256_TOP)) - --carry; +#if BRANCH_FREE + carry = bn_sub_words(r_d, res, t_d, BN_NIST_256_TOP); + bn_add_words(c_d,r_d,_nist_p_256,BN_NIST_256_TOP); + mask = 0-(size_t)carry; + res = (BN_ULONG *)(((size_t)c_d&mask) | ((size_t)r_d&~mask)); +#else + if (bn_sub_words(r_d, res, t_d, BN_NIST_256_TOP)) + bn_add_words(r_d,r_d,_nist_p_256,BN_NIST_256_TOP); +#endif /*D2*/ nist_set_256(t_d, buf, 11, 9, 0, 0, 15, 14, 13, 12); +#if BRANCH_FREE + carry = bn_sub_words(r_d, res, t_d, BN_NIST_256_TOP); + bn_add_words(c_d,r_d,_nist_p_256,BN_NIST_256_TOP); + mask = 0-(size_t)carry; + res = (BN_ULONG *)(((size_t)c_d&mask) | ((size_t)r_d&~mask)); +#else if (bn_sub_words(r_d, r_d, t_d, BN_NIST_256_TOP)) - --carry; + bn_add_words(r_d,r_d,_nist_p_256,BN_NIST_256_TOP); +#endif /*D3*/ nist_set_256(t_d, buf, 12, 0, 10, 9, 8, 15, 14, 13); +#if BRANCH_FREE + carry = bn_sub_words(r_d, res, t_d, BN_NIST_256_TOP); + bn_add_words(c_d,r_d,_nist_p_256,BN_NIST_256_TOP); + mask = 0-(size_t)carry; + res = (BN_ULONG *)(((size_t)c_d&mask) | ((size_t)r_d&~mask)); +#else if (bn_sub_words(r_d, r_d, t_d, BN_NIST_256_TOP)) - --carry; + bn_add_words(r_d,r_d,_nist_p_256,BN_NIST_256_TOP); +#endif /*D4*/ nist_set_256(t_d, buf, 13, 0, 11, 10, 9, 0, 15, 14); - if (bn_sub_words(r_d, r_d, t_d, BN_NIST_256_TOP)) - --carry; - - if (carry) - { - if (carry > 0) - bn_sub_words(r_d, r_d, _256_data + BN_NIST_256_TOP * - --carry, BN_NIST_256_TOP); - else - { - carry = -carry; - bn_add_words(r_d, r_d, _256_data + BN_NIST_256_TOP * - --carry, BN_NIST_256_TOP); - } - } +#if BRANCH_FREE + carry = bn_sub_words(r_d, res, t_d, BN_NIST_256_TOP); + bn_add_words(c_d,r_d,_nist_p_256,BN_NIST_256_TOP); + mask = 0-(size_t)carry; + res = (BN_ULONG *)(((size_t)c_d&mask) | ((size_t)r_d&~mask)); + nist_cp_bn(r_d, res, BN_NIST_384_TOP); +#else + if (bn_sub_words(r_d, r_d, t_d, BN_NIST_256_TOP)) + bn_add_words(r_d,r_d,_nist_p_256,BN_NIST_256_TOP); +#endif r->top = BN_NIST_256_TOP; bn_correct_top(r); - if (BN_ucmp(r, field) >= 0) + + if (BN_ucmp(field, r) <= 0) { - bn_sub_words(r_d, r_d, _nist_p_256, BN_NIST_256_TOP); - bn_correct_top(r); + if (!BN_usub(r, r, field)) return 0; } - bn_check_top(r); + return 1; -#else +#else /* BN_BITS!=32 */ return 0; #endif } -#if BN_BITS2 != 64 -static void _init_384_data(void) - { - int i; - BN_ULONG *tmp1 = _384_data; - const BN_ULONG *tmp2 = tmp1; - - memcpy(tmp1, _nist_p_384, BN_NIST_384_TOP * sizeof(BN_ULONG)); - tmp1 += BN_NIST_384_TOP; - - for (i=0; i<7; i++) - { - bn_add_words(tmp1, _nist_p_384, tmp2, BN_NIST_384_TOP); - tmp2 = tmp1; - tmp1 += BN_NIST_384_TOP; - } - _is_set_384_data = 1; - } -#endif - #define nist_set_384(to,from,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12) \ { \ if (a12 != 0) bn_cp_32(to, 0, from, (a12) - 12) else bn_32_set_0(to, 0)\ @@ -598,22 +555,20 @@ static void _init_384_data(void) int BN_nist_mod_384(BIGNUM *r, const BIGNUM *a, const BIGNUM *field, BN_CTX *ctx) { -#if BN_BITS2 != 64 +#if BN_BITS2 == 32 int i, top = a->top; int carry = 0; register BN_ULONG *r_d, *a_d = a->d; BN_ULONG t_d[BN_NIST_384_TOP], - buf[BN_NIST_384_TOP]; + buf[BN_NIST_384_TOP], + c_d[BN_NIST_384_TOP], + *res; + size_t mask; - if (!_is_set_384_data) - { - CRYPTO_w_lock(CRYPTO_LOCK_BN); - - if (!_is_set_384_data) - _init_384_data(); + field = &_bignum_nist_p_384; /* just to make sure */ - CRYPTO_w_unlock(CRYPTO_LOCK_BN); - } + if (BN_is_negative(a) || a->top > 2*BN_NIST_384_TOP) + return BN_nnmod(r, field, a, ctx); i = BN_ucmp(field, a); if (i == 0) @@ -624,9 +579,6 @@ int BN_nist_mod_384(BIGNUM *r, const BIGNUM *a, const BIGNUM *field, else if (i > 0) return (r == a)? 1 : (BN_copy(r ,a) != NULL); - if (top == BN_NIST_384_TOP) - return BN_usub(r, a, field); - if (r != a) { if (!bn_wexpand(r, BN_NIST_384_TOP)) @@ -646,72 +598,108 @@ int BN_nist_mod_384(BIGNUM *r, const BIGNUM *a, const BIGNUM *field, register BN_ULONG *ap,t,c; ap = t_d; c=0; - for (i = BN_NIST_256_TOP; i != 0; --i) + for (i = 3; i != 0; --i) { t= *ap; *(ap++)=((t<<1)|c)&BN_MASK2; c=(t & BN_TBIT)?1:0; } + *ap=c; } - if (bn_add_words(r_d+(128/BN_BITS2), r_d+(128/BN_BITS2), - t_d, BN_NIST_256_TOP)) - ++carry; + carry = bn_add_words(r_d+(128/BN_BITS2), r_d+(128/BN_BITS2), + t_d, BN_NIST_256_TOP); + /* + * we need if (result>=modulus) subtract(result,modulus); + * in n-bit space this can be expressed as + * if (carry || result>=modulus) subtract(result,modulus); + * the catch is that comparison implies subtraction and + * therefore one can write tmp=subtract(result,modulus); + * and then if(carry || !borrow) result=tmp; this's what + * happens below, but without explicit if:-) a. + */ + mask = 0-(size_t)bn_sub_words(c_d,r_d,_nist_p_384,BN_NIST_384_TOP); + mask = ~mask | (0-(size_t)carry); + res = (BN_ULONG *)(((size_t)c_d&mask) | ((size_t)r_d&~mask)); + /*S2 */ - if (bn_add_words(r_d, r_d, buf, BN_NIST_384_TOP)) - ++carry; + carry = bn_add_words(r_d, res, buf, BN_NIST_384_TOP); + mask = 0-(size_t)bn_sub_words(c_d,r_d,_nist_p_384,BN_NIST_384_TOP); + mask = ~mask | (0-(size_t)carry); + res = (BN_ULONG *)(((size_t)c_d&mask) | ((size_t)r_d&~mask)); + /*S3*/ nist_set_384(t_d,buf,20,19,18,17,16,15,14,13,12,23,22,21); - if (bn_add_words(r_d, r_d, t_d, BN_NIST_384_TOP)) - ++carry; + carry = bn_add_words(r_d, res, t_d, BN_NIST_384_TOP); + mask = 0-(size_t)bn_sub_words(c_d,r_d,_nist_p_384,BN_NIST_384_TOP); + mask = ~mask | (0-(size_t)carry); + res = (BN_ULONG *)(((size_t)c_d&mask) | ((size_t)r_d&~mask)); + /*S4*/ nist_set_384(t_d,buf,19,18,17,16,15,14,13,12,20,0,23,0); - if (bn_add_words(r_d, r_d, t_d, BN_NIST_384_TOP)) - ++carry; + carry = bn_add_words(r_d, res, t_d, BN_NIST_384_TOP); + mask = 0-(size_t)bn_sub_words(c_d,r_d,_nist_p_384,BN_NIST_384_TOP); + mask = ~mask | (0-(size_t)carry); + res = (BN_ULONG *)(((size_t)c_d&mask) | ((size_t)r_d&~mask)); + /*S5*/ - nist_set_256(t_d, buf, 0, 0, 0, 0, 23-4, 22-4, 21-4, 20-4); - if (bn_add_words(r_d+(128/BN_BITS2), r_d+(128/BN_BITS2), - t_d, BN_NIST_256_TOP)) - ++carry; + nist_set_384(t_d, buf,0,0,0,0,23,22,21,20,0,0,0,0); + carry = bn_add_words(r_d, res, t_d, BN_NIST_384_TOP); + mask = 0-(size_t)bn_sub_words(c_d,r_d,_nist_p_384,BN_NIST_384_TOP); + mask = ~mask | (0-(size_t)carry); + res = (BN_ULONG *)(((size_t)c_d&mask) | ((size_t)r_d&~mask)); + /*S6*/ nist_set_384(t_d,buf,0,0,0,0,0,0,23,22,21,0,0,20); - if (bn_add_words(r_d, r_d, t_d, BN_NIST_384_TOP)) - ++carry; + carry = bn_add_words(r_d, res, t_d, BN_NIST_384_TOP); + mask = 0-(size_t)bn_sub_words(c_d,r_d,_nist_p_384,BN_NIST_384_TOP); + mask = ~mask | (0-(size_t)carry); + res = (BN_ULONG *)(((size_t)c_d&mask) | ((size_t)r_d&~mask)); + /*D1*/ nist_set_384(t_d,buf,22,21,20,19,18,17,16,15,14,13,12,23); - if (bn_sub_words(r_d, r_d, t_d, BN_NIST_384_TOP)) - --carry; +#if BRANCH_FREE + carry = bn_sub_words(r_d, res, t_d, BN_NIST_384_TOP); + bn_add_words(c_d,r_d,_nist_p_384,BN_NIST_384_TOP); + mask = 0-(size_t)carry; + res = (BN_ULONG *)(((size_t)c_d&mask) | ((size_t)r_d&~mask)); +#else + if (bn_sub_words(r_d, res, t_d, BN_NIST_384_TOP)) + bn_add_words(r_d,r_d,_nist_p_384,BN_NIST_384_TOP); +#endif /*D2*/ nist_set_384(t_d,buf,0,0,0,0,0,0,0,23,22,21,20,0); +#if BRANCH_FREE + carry = bn_sub_words(r_d, res, t_d, BN_NIST_384_TOP); + bn_add_words(c_d,r_d,_nist_p_384,BN_NIST_384_TOP); + mask = 0-(size_t)carry; + res = (BN_ULONG *)(((size_t)c_d&mask) | ((size_t)r_d&~mask)); +#else if (bn_sub_words(r_d, r_d, t_d, BN_NIST_384_TOP)) - --carry; + bn_add_words(r_d,r_d,_nist_p_384,BN_NIST_384_TOP); +#endif /*D3*/ nist_set_384(t_d,buf,0,0,0,0,0,0,0,23,23,0,0,0); - if (bn_sub_words(r_d, r_d, t_d, BN_NIST_384_TOP)) - --carry; - - if (carry) - { - if (carry > 0) - bn_sub_words(r_d, r_d, _384_data + BN_NIST_384_TOP * - --carry, BN_NIST_384_TOP); - else - { - carry = -carry; - bn_add_words(r_d, r_d, _384_data + BN_NIST_384_TOP * - --carry, BN_NIST_384_TOP); - } - } +#if BRANCH_FREE + carry = bn_sub_words(r_d, res, t_d, BN_NIST_384_TOP); + bn_add_words(c_d,r_d,_nist_p_384,BN_NIST_384_TOP); + mask = 0-(size_t)carry; + res = (BN_ULONG *)(((size_t)c_d&mask) | ((size_t)r_d&~mask)); + nist_cp_bn(r_d, res, BN_NIST_384_TOP); +#else + if (bn_sub_words(r_d, r_d, t_d, BN_NIST_384_TOP)) + bn_add_words(r_d,r_d,_nist_p_384,BN_NIST_384_TOP); +#endif r->top = BN_NIST_384_TOP; bn_correct_top(r); - if (BN_ucmp(r, field) >= 0) + + if (BN_ucmp(field, r) <= 0) { - bn_sub_words(r_d, r_d, _nist_p_384, BN_NIST_384_TOP); - bn_correct_top(r); + if (!BN_usub(r, r, field)) return 0; } - bn_check_top(r); + return 1; -#else +#else /* BN_BITS!=32 */ return 0; #endif } @@ -723,20 +711,37 @@ int BN_nist_mod_521(BIGNUM *r, const BIGNUM *a, const BIGNUM *field, #define BN_NIST_521_TOP_MASK (BN_ULONG)0x1FF #elif BN_BITS2 == 32 #define BN_NIST_521_TOP_MASK (BN_ULONG)0x1FF -#elif BN_BITS2 == 16 -#define BN_NIST_521_TOP_MASK (BN_ULONG)0x1FF -#elif BN_BITS2 == 8 -#define BN_NIST_521_TOP_MASK (BN_ULONG)0x1 #endif int top, ret = 0; - BN_ULONG *r_d; BIGNUM *tmp; + field = &_bignum_nist_p_521; /* just to make sure */ + + if (BN_is_negative(a)) + return BN_nnmod(r, field, a, ctx); + /* check whether a reduction is necessary */ top = a->top; if (top < BN_NIST_521_TOP || ( top == BN_NIST_521_TOP && - (!(a->d[BN_NIST_521_TOP-1] & ~(BN_NIST_521_TOP_MASK))))) - return (r == a)? 1 : (BN_copy(r ,a) != NULL); + (!(a->d[BN_NIST_521_TOP-1] & ~(BN_NIST_521_TOP_MASK))))) + { + int i = BN_ucmp(field, a); + if (i == 0) + { + BN_zero(r); + return 1; + } + else + { +#ifdef BN_DEBUG + OPENSSL_assert(i > 0); /* because 'field' is 1111...1111 */ +#endif + return (r == a)? 1 : (BN_copy(r ,a) != NULL); + } + } + + if (BN_num_bits(a) > 2*521) + return BN_nnmod(r, field, a, ctx); BN_CTX_start(ctx); tmp = BN_CTX_get(ctx); @@ -756,15 +761,11 @@ int BN_nist_mod_521(BIGNUM *r, const BIGNUM *a, const BIGNUM *field, if (!BN_uadd(r, tmp, r)) goto err; - top = r->top; - r_d = r->d; - if (top == BN_NIST_521_TOP && - (r_d[BN_NIST_521_TOP-1] & ~(BN_NIST_521_TOP_MASK))) + + if (BN_ucmp(field, r) <= 0) { - BN_NIST_ADD_ONE(r_d) - r_d[BN_NIST_521_TOP-1] &= BN_NIST_521_TOP_MASK; + if (!BN_usub(r, r, field)) goto err; } - bn_correct_top(r); ret = 1; err: diff --git a/crypto/bn/bn_prime.c b/crypto/bn/bn_prime.c index 5bab019..7b25979 100644 --- a/crypto/bn/bn_prime.c +++ b/crypto/bn/bn_prime.c @@ -377,14 +377,14 @@ static int witness(BIGNUM *w, const BIGNUM *a, const BIGNUM *a1, static int probable_prime(BIGNUM *rnd, int bits) { int i; - BN_ULONG mods[NUMPRIMES]; + prime_t mods[NUMPRIMES]; BN_ULONG delta,maxdelta; again: if (!BN_rand(rnd,bits,1,1)) return(0); /* we now have a random number 'rand' to test. */ for (i=1; i<NUMPRIMES; i++) - mods[i]=BN_mod_word(rnd,(BN_ULONG)primes[i]); + mods[i]=(prime_t)BN_mod_word(rnd,(BN_ULONG)primes[i]); maxdelta=BN_MASK2 - primes[NUMPRIMES-1]; delta=0; loop: for (i=1; i<NUMPRIMES; i++) diff --git a/crypto/bn/bn_prime.h b/crypto/bn/bn_prime.h index b7cf9a9..51d2194 100644 --- a/crypto/bn/bn_prime.h +++ b/crypto/bn/bn_prime.h @@ -58,10 +58,12 @@ #ifndef EIGHT_BIT #define NUMPRIMES 2048 +typedef unsigned short prime_t; #else #define NUMPRIMES 54 +typedef unsigned char prime_t; #endif -static const unsigned int primes[NUMPRIMES]= +static const prime_t primes[NUMPRIMES]= { 2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, diff --git a/crypto/bn/bn_prime.pl b/crypto/bn/bn_prime.pl index e583d1d..3fafb6f 100644 --- a/crypto/bn/bn_prime.pl +++ b/crypto/bn/bn_prime.pl @@ -101,10 +101,12 @@ for ($i=0; $i <= $#primes; $i++) printf "#ifndef EIGHT_BIT\n"; printf "#define NUMPRIMES %d\n",$num; +printf "typedef unsigned short prime_t;\n"; printf "#else\n"; printf "#define NUMPRIMES %d\n",$eight; +printf "typedef unsigned char prime_t;\n"; printf "#endif\n"; -print "static const unsigned int primes[NUMPRIMES]=\n\t{\n\t"; +print "static const prime_t primes[NUMPRIMES]=\n\t{\n\t"; $init=0; for ($i=0; $i <= $#primes; $i++) { diff --git a/crypto/bn/bntest.c b/crypto/bn/bntest.c index c885300..310763e 100644 --- a/crypto/bn/bntest.c +++ b/crypto/bn/bntest.c @@ -184,120 +184,120 @@ int main(int argc, char *argv[]) message(out,"BN_add"); if (!test_add(out)) goto err; - BIO_flush(out); + (void)BIO_flush(out); message(out,"BN_sub"); if (!test_sub(out)) goto err; - BIO_flush(out); + (void)BIO_flush(out); message(out,"BN_lshift1"); if (!test_lshift1(out)) goto err; - BIO_flush(out); + (void)BIO_flush(out); message(out,"BN_lshift (fixed)"); if (!test_lshift(out,ctx,BN_bin2bn(lst,sizeof(lst)-1,NULL))) goto err; - BIO_flush(out); + (void)BIO_flush(out); message(out,"BN_lshift"); if (!test_lshift(out,ctx,NULL)) goto err; - BIO_flush(out); + (void)BIO_flush(out); message(out,"BN_rshift1"); if (!test_rshift1(out)) goto err; - BIO_flush(out); + (void)BIO_flush(out); message(out,"BN_rshift"); if (!test_rshift(out,ctx)) goto err; - BIO_flush(out); + (void)BIO_flush(out); message(out,"BN_sqr"); if (!test_sqr(out,ctx)) goto err; - BIO_flush(out); + (void)BIO_flush(out); message(out,"BN_mul"); if (!test_mul(out)) goto err; - BIO_flush(out); + (void)BIO_flush(out); message(out,"BN_div"); if (!test_div(out,ctx)) goto err; - BIO_flush(out); + (void)BIO_flush(out); message(out,"BN_div_word"); if (!test_div_word(out)) goto err; - BIO_flush(out); + (void)BIO_flush(out); message(out,"BN_div_recp"); if (!test_div_recp(out,ctx)) goto err; - BIO_flush(out); + (void)BIO_flush(out); message(out,"BN_mod"); if (!test_mod(out,ctx)) goto err; - BIO_flush(out); + (void)BIO_flush(out); message(out,"BN_mod_mul"); if (!test_mod_mul(out,ctx)) goto err; - BIO_flush(out); + (void)BIO_flush(out); message(out,"BN_mont"); if (!test_mont(out,ctx)) goto err; - BIO_flush(out); + (void)BIO_flush(out); message(out,"BN_mod_exp"); if (!test_mod_exp(out,ctx)) goto err; - BIO_flush(out); + (void)BIO_flush(out); message(out,"BN_mod_exp_mont_consttime"); if (!test_mod_exp_mont_consttime(out,ctx)) goto err; - BIO_flush(out); + (void)BIO_flush(out); message(out,"BN_exp"); if (!test_exp(out,ctx)) goto err; - BIO_flush(out); + (void)BIO_flush(out); message(out,"BN_kronecker"); if (!test_kron(out,ctx)) goto err; - BIO_flush(out); + (void)BIO_flush(out); message(out,"BN_mod_sqrt"); if (!test_sqrt(out,ctx)) goto err; - BIO_flush(out); + (void)BIO_flush(out); message(out,"BN_GF2m_add"); if (!test_gf2m_add(out)) goto err; - BIO_flush(out); + (void)BIO_flush(out); message(out,"BN_GF2m_mod"); if (!test_gf2m_mod(out)) goto err; - BIO_flush(out); + (void)BIO_flush(out); message(out,"BN_GF2m_mod_mul"); if (!test_gf2m_mod_mul(out,ctx)) goto err; - BIO_flush(out); + (void)BIO_flush(out); message(out,"BN_GF2m_mod_sqr"); if (!test_gf2m_mod_sqr(out,ctx)) goto err; - BIO_flush(out); + (void)BIO_flush(out); message(out,"BN_GF2m_mod_inv"); if (!test_gf2m_mod_inv(out,ctx)) goto err; - BIO_flush(out); + (void)BIO_flush(out); message(out,"BN_GF2m_mod_div"); if (!test_gf2m_mod_div(out,ctx)) goto err; - BIO_flush(out); + (void)BIO_flush(out); message(out,"BN_GF2m_mod_exp"); if (!test_gf2m_mod_exp(out,ctx)) goto err; - BIO_flush(out); + (void)BIO_flush(out); message(out,"BN_GF2m_mod_sqrt"); if (!test_gf2m_mod_sqrt(out,ctx)) goto err; - BIO_flush(out); + (void)BIO_flush(out); message(out,"BN_GF2m_mod_solve_quad"); if (!test_gf2m_mod_solve_quad(out,ctx)) goto err; - BIO_flush(out); + (void)BIO_flush(out); BN_CTX_free(ctx); BIO_free(out); @@ -307,7 +307,7 @@ int main(int argc, char *argv[]) err: BIO_puts(out,"1\n"); /* make sure the Perl script fed by bc notices * the failure, see test_bn in test/Makefile.ssl*/ - BIO_flush(out); + (void)BIO_flush(out); ERR_load_crypto_strings(); ERR_print_errors_fp(stderr); EXIT(1); |