diff options
author | Miod Vallat <miod@cvs.openbsd.org> | 2014-04-17 18:16:46 +0000 |
---|---|---|
committer | Miod Vallat <miod@cvs.openbsd.org> | 2014-04-17 18:16:46 +0000 |
commit | dc1bbba22268c44b47967c17fa2e43f15fe59bf0 (patch) | |
tree | c7030843ca3e64bea8319af2fca2ee531db3f38a /lib/libcrypto | |
parent | cde0e7d71fe0bef955ef6a27178b658f0c12a660 (diff) |
Ok, there was a need for OPENSSL_cleanse() instead of bzero() to prevent
supposedly smart compilers from optimizing memory cleanups away. Understood.
Ok, in case of an hypothetically super smart compiler, OPENSSL_cleanse() had
to be convoluted enough for the compiler not to recognize that this was
actually bzero() in disguise. Understood.
But then why there had been optimized assembler versions of OPENSSL_cleanse()
is beyond me. Did someone not trust the C obfuscation?
Diffstat (limited to 'lib/libcrypto')
-rw-r--r-- | lib/libcrypto/alphacpuid.pl | 37 | ||||
-rw-r--r-- | lib/libcrypto/armv4cpuid.S | 32 | ||||
-rw-r--r-- | lib/libcrypto/ia64cpuid.S | 9 | ||||
-rw-r--r-- | lib/libcrypto/pariscid.pl | 61 | ||||
-rwxr-xr-x | lib/libcrypto/ppccpuid.pl | 68 | ||||
-rw-r--r-- | lib/libcrypto/s390xcpuid.S | 50 | ||||
-rw-r--r-- | lib/libcrypto/sparccpuid.S | 120 | ||||
-rw-r--r-- | lib/libcrypto/x86_64cpuid.pl | 35 | ||||
-rw-r--r-- | lib/libcrypto/x86cpuid.pl | 149 |
9 files changed, 290 insertions, 271 deletions
diff --git a/lib/libcrypto/alphacpuid.pl b/lib/libcrypto/alphacpuid.pl index 4b3cbb9827d..f6aea6a7663 100644 --- a/lib/libcrypto/alphacpuid.pl +++ b/lib/libcrypto/alphacpuid.pl @@ -86,41 +86,4 @@ OPENSSL_rdtsc: rpcc $0 ret ($26) .end OPENSSL_rdtsc - -.globl OPENSSL_cleanse -.ent OPENSSL_cleanse -OPENSSL_cleanse: - .frame $30,0,$26 - .prologue 0 - beq $17,.Ldone - and $16,7,$0 - bic $17,7,$at - beq $at,.Little - beq $0,.Laligned - -.Little: - subq $0,8,$0 - ldq_u $1,0($16) - mov $16,$2 -.Lalign: - mskbl $1,$16,$1 - lda $16,1($16) - subq $17,1,$17 - addq $0,1,$0 - beq $17,.Lout - bne $0,.Lalign -.Lout: stq_u $1,0($2) - beq $17,.Ldone - bic $17,7,$at - beq $at,.Little - -.Laligned: - stq $31,0($16) - subq $17,8,$17 - lda $16,8($16) - bic $17,7,$at - bne $at,.Laligned - bne $17,.Little -.Ldone: ret ($26) -.end OPENSSL_cleanse ___ diff --git a/lib/libcrypto/armv4cpuid.S b/lib/libcrypto/armv4cpuid.S index 2d618deaa43..bdfde19c1fc 100644 --- a/lib/libcrypto/armv4cpuid.S +++ b/lib/libcrypto/armv4cpuid.S @@ -54,38 +54,6 @@ OPENSSL_atomic_add: #endif .size OPENSSL_atomic_add,.-OPENSSL_atomic_add -.global OPENSSL_cleanse -.type OPENSSL_cleanse,%function -OPENSSL_cleanse: - eor ip,ip,ip - cmp r1,#7 - subhs r1,r1,#4 - bhs .Lot - cmp r1,#0 - beq .Lcleanse_done -.Little: - strb ip,[r0],#1 - subs r1,r1,#1 - bhi .Little - b .Lcleanse_done - -.Lot: tst r0,#3 - beq .Laligned - strb ip,[r0],#1 - sub r1,r1,#1 - b .Lot -.Laligned: - str ip,[r0],#4 - subs r1,r1,#4 - bhs .Laligned - adds r1,r1,#4 - bne .Little -.Lcleanse_done: - tst lr,#1 - moveq pc,lr - .word 0xe12fff1e @ bx lr -.size OPENSSL_cleanse,.-OPENSSL_cleanse - .global OPENSSL_wipe_cpu .type OPENSSL_wipe_cpu,%function OPENSSL_wipe_cpu: diff --git a/lib/libcrypto/ia64cpuid.S b/lib/libcrypto/ia64cpuid.S index 04fbb3439eb..517d9388881 100644 --- a/lib/libcrypto/ia64cpuid.S +++ b/lib/libcrypto/ia64cpuid.S @@ -1,6 +1,13 @@ // Works on all IA-64 platforms: Linux, HP-UX, Win64i... // On Win64i compile with ias.exe. .text + +.global OPENSSL_cpuid_setup# +.proc OPENSSL_cpuid_setup# +OPENSSL_cpuid_setup: +{ .mib; br.ret.sptk.many b0 };; +.endp OPENSSL_cpuid_setup# + .global OPENSSL_rdtsc# .proc OPENSSL_rdtsc# OPENSSL_rdtsc: @@ -19,7 +26,7 @@ OPENSSL_atomic_add: { .mii; mov ar.ccv=r2 add r8=r2,r33 mov r3=r2 };; -{ .mmi; mf +{ .mmi; mf;; cmpxchg4.acq r2=[r32],r8,ar.ccv nop.i 0 };; { .mib; cmp.ne p6,p0=r2,r3 diff --git a/lib/libcrypto/pariscid.pl b/lib/libcrypto/pariscid.pl index 477ec9b87dd..38985afbacb 100644 --- a/lib/libcrypto/pariscid.pl +++ b/lib/libcrypto/pariscid.pl @@ -87,50 +87,6 @@ OPENSSL_wipe_cpu .PROCEND ___ { -my $inp="%r26"; -my $len="%r25"; - -$code.=<<___; - .EXPORT OPENSSL_cleanse,ENTRY,ARGW0=GR,ARGW1=GR - .ALIGN 8 -OPENSSL_cleanse - .PROC - .CALLINFO NO_CALLS - .ENTRY - cmpib,*= 0,$len,Ldone - nop - cmpib,*>>= 15,$len,Little - ldi $SIZE_T-1,%r1 - -Lalign - and,*<> $inp,%r1,%r28 - b,n Laligned - stb %r0,0($inp) - ldo -1($len),$len - b Lalign - ldo 1($inp),$inp - -Laligned - andcm $len,%r1,%r28 -Lot - $ST %r0,0($inp) - addib,*<> -$SIZE_T,%r28,Lot - ldo $SIZE_T($inp),$inp - - and,*<> $len,%r1,$len - b,n Ldone -Little - stb %r0,0($inp) - addib,*<> -1,$len,Little - ldo 1($inp),$inp -Ldone - bv ($rp) - .EXIT - nop - .PROCEND -___ -} -{ my ($out,$cnt,$max)=("%r26","%r25","%r24"); my ($tick,$lasttick)=("%r23","%r22"); my ($diff,$lastdiff)=("%r21","%r20"); @@ -151,7 +107,7 @@ OPENSSL_instrument_bus ldw 0($out),$tick add $diff,$tick,$tick stw $tick,0($out) -Loop +L\$oop mfctl %cr16,$tick sub $tick,$lasttick,$diff copy $tick,$lasttick @@ -161,7 +117,7 @@ Loop add $diff,$tick,$tick stw $tick,0($out) - addib,<> -1,$cnt,Loop + addib,<> -1,$cnt,L\$oop addi 4,$out,$out bv ($rp) @@ -190,14 +146,14 @@ OPENSSL_instrument_bus2 mfctl %cr16,$tick sub $tick,$lasttick,$diff copy $tick,$lasttick -Loop2 +L\$oop2 copy $diff,$lastdiff fdc 0($out) ldw 0($out),$tick add $diff,$tick,$tick stw $tick,0($out) - addib,= -1,$max,Ldone2 + addib,= -1,$max,L\$done2 nop mfctl %cr16,$tick @@ -208,17 +164,18 @@ Loop2 ldi 1,%r1 xor %r1,$tick,$tick - addb,<> $tick,$cnt,Loop2 + addb,<> $tick,$cnt,L\$oop2 shladd,l $tick,2,$out,$out -Ldone2 +L\$done2 bv ($rp) .EXIT add $rv,$cnt,$rv .PROCEND ___ } -$code =~ s/cmpib,\*/comib,/gm if ($SIZE_T==4); -$code =~ s/,\*/,/gm if ($SIZE_T==4); +$code =~ s/cmpib,\*/comib,/gm if ($SIZE_T==4); +$code =~ s/,\*/,/gm if ($SIZE_T==4); +$code =~ s/\bbv\b/bve/gm if ($SIZE_T==8); print $code; close STDOUT; diff --git a/lib/libcrypto/ppccpuid.pl b/lib/libcrypto/ppccpuid.pl index fe44ff07bc6..cf48714e338 100755 --- a/lib/libcrypto/ppccpuid.pl +++ b/lib/libcrypto/ppccpuid.pl @@ -23,36 +23,67 @@ $code=<<___; .machine "any" .text -.globl .OPENSSL_cpuid_setup +.globl .OPENSSL_ppc64_probe .align 4 -.OPENSSL_cpuid_setup: +.OPENSSL_ppc64_probe: + fcfid f1,f1 + extrdi r0,r0,32,0 blr + .long 0 + .byte 0,12,0x14,0,0,0,0,0 + +.globl .OPENSSL_altivec_probe +.align 4 +.OPENSSL_altivec_probe: + .long 0x10000484 # vor v0,v0,v0 + blr + .long 0 + .byte 0,12,0x14,0,0,0,0,0 .globl .OPENSSL_wipe_cpu .align 4 .OPENSSL_wipe_cpu: xor r0,r0,r0 + fmr f0,f31 + fmr f1,f31 + fmr f2,f31 mr r3,r1 + fmr f3,f31 xor r4,r4,r4 + fmr f4,f31 xor r5,r5,r5 + fmr f5,f31 xor r6,r6,r6 + fmr f6,f31 xor r7,r7,r7 + fmr f7,f31 xor r8,r8,r8 + fmr f8,f31 xor r9,r9,r9 + fmr f9,f31 xor r10,r10,r10 + fmr f10,f31 xor r11,r11,r11 + fmr f11,f31 xor r12,r12,r12 + fmr f12,f31 + fmr f13,f31 blr + .long 0 + .byte 0,12,0x14,0,0,0,0,0 .globl .OPENSSL_atomic_add .align 4 .OPENSSL_atomic_add: -Loop: lwarx r5,0,r3 +Ladd: lwarx r5,0,r3 add r0,r4,r5 stwcx. r0,0,r3 - bne- Loop + bne- Ladd $SIGNX r3,r0 blr + .long 0 + .byte 0,12,0x14,0,0,0,2,0 + .long 0 .globl .OPENSSL_rdtsc .align 4 @@ -60,33 +91,8 @@ Loop: lwarx r5,0,r3 mftb r3 mftbu r4 blr - -.globl .OPENSSL_cleanse -.align 4 -.OPENSSL_cleanse: - $CMPLI r4,7 - li r0,0 - bge Lot -Little: mtctr r4 - stb r0,0(r3) - addi r3,r3,1 - bdnz- \$-8 - blr -Lot: andi. r5,r3,3 - beq Laligned - stb r0,0(r3) - subi r4,r4,1 - addi r3,r3,1 - b Lot -Laligned: - $SHRLI r5,r4,2 - mtctr r5 - stw r0,0(r3) - addi r3,r3,4 - bdnz- \$-8 - andi. r4,r4,3 - bne Little - blr + .long 0 + .byte 0,12,0x14,0,0,0,0,0 ___ $code =~ s/\`([^\`]*)\`/eval $1/gem; diff --git a/lib/libcrypto/s390xcpuid.S b/lib/libcrypto/s390xcpuid.S index 8500133ad0f..89bf6be82f5 100644 --- a/lib/libcrypto/s390xcpuid.S +++ b/lib/libcrypto/s390xcpuid.S @@ -1,19 +1,18 @@ .text -.globl OPENSSL_cpuid_setup -.type OPENSSL_cpuid_setup,@function -.align 16 -OPENSSL_cpuid_setup: - br %r14 # reserved for future -.size OPENSSL_cpuid_setup,.-OPENSSL_cpuid_setup - .globl OPENSSL_s390x_facilities .type OPENSSL_s390x_facilities,@function .align 16 OPENSSL_s390x_facilities: lghi %r0,0 - .long 0xb2b0f010 # stfle 16(%r15) - lg %r2,16(%r15) + larl %r2,OPENSSL_s390xcap_P + stg %r0,8(%r2) + .long 0xb2b02000 # stfle 0(%r2) + brc 8,.Ldone + lghi %r0,1 + .long 0xb2b02000 # stfle 0(%r2) +.Ldone: + lg %r2,0(%r2) br %r14 .size OPENSSL_s390x_facilities,.-OPENSSL_s390x_facilities @@ -59,32 +58,7 @@ OPENSSL_wipe_cpu: br %r14 .size OPENSSL_wipe_cpu,.-OPENSSL_wipe_cpu -.globl OPENSSL_cleanse -.type OPENSSL_cleanse,@function -.align 16 -OPENSSL_cleanse: - lghi %r4,15 - lghi %r0,0 - clgr %r3,%r4 - jh .Lot -.Little: - stc %r0,0(%r2) - la %r2,1(%r2) - brctg %r3,.Little - br %r14 -.align 4 -.Lot: tmll %r2,7 - jz .Laligned - stc %r0,0(%r2) - la %r2,1(%r2) - brctg %r3,.Lot -.Laligned: - srlg %r4,%r3,3 -.Loop: stg %r0,0(%r2) - la %r2,8(%r2) - brctg %r4,.Loop - lghi %r4,7 - ngr %r3,%r4 - jnz .Little - br %r14 -.size OPENSSL_cleanse,.-OPENSSL_cleanse +.section .init + brasl %r14,OPENSSL_cpuid_setup + +.comm OPENSSL_s390xcap_P,16,8 diff --git a/lib/libcrypto/sparccpuid.S b/lib/libcrypto/sparccpuid.S index c17350fc89e..d8b44af2f0a 100644 --- a/lib/libcrypto/sparccpuid.S +++ b/lib/libcrypto/sparccpuid.S @@ -34,7 +34,8 @@ OPENSSL_wipe_cpu: nop call .PIC.zero.up mov .zero-(.-4),%o0 - ldd [%o0],%f0 + ld [%o0],%f0 + ld [%o0],%f1 subcc %g0,1,%o0 ! Following is V9 "rd %ccr,%o0" instruction. However! V8 @@ -166,6 +167,7 @@ walk_reg_wins: .global OPENSSL_atomic_add .type OPENSSL_atomic_add,#function +.align 32 OPENSSL_atomic_add: #ifndef ABI64 subcc %g0,1,%o2 @@ -177,7 +179,7 @@ OPENSSL_atomic_add: ba .enter nop #ifdef __sun -! Note that you don't have to link with libthread to call thr_yield, +! Note that you do not have to link with libthread to call thr_yield, ! as libc provides a stub, which is overloaded the moment you link ! with *either* libpthread or libthread... #define YIELD_CPU thr_yield @@ -213,27 +215,105 @@ OPENSSL_atomic_add: sra %o0,%g0,%o0 ! we return signed int, remember? .size OPENSSL_atomic_add,.-OPENSSL_atomic_add -.global OPENSSL_rdtsc +.global _sparcv9_rdtick +.align 32 +_sparcv9_rdtick: subcc %g0,1,%o0 .word 0x91408000 !rd %ccr,%o0 cmp %o0,0x99 - bne .notsc + bne .notick xor %o0,%o0,%o0 - save %sp,FRAME-16,%sp - mov 513,%o0 !SI_PLATFORM - add %sp,BIAS+16,%o1 - call sysinfo - mov 256,%o2 - - add %sp,BIAS-16,%o1 - ld [%o1],%l0 - ld [%o1+4],%l1 - ld [%o1+8],%l2 - mov %lo('SUNW'),%l3 - ret - restore -.notsc: + .word 0x91410000 !rd %tick,%o0 + retl + .word 0x93323020 !srlx %o0,32,%o1 +.notick: + retl + xor %o1,%o1,%o1 +.type _sparcv9_rdtick,#function +.size _sparcv9_rdtick,.-_sparcv9_rdtick + +.global _sparcv9_vis1_probe +.align 8 +_sparcv9_vis1_probe: + add %sp,BIAS+2,%o1 + .word 0xc19a5a40 !ldda [%o1]ASI_FP16_P,%f0 + retl + .word 0x81b00d80 !fxor %f0,%f0,%f0 +.type _sparcv9_vis1_probe,#function +.size _sparcv9_vis1_probe,.-_sparcv9_vis1_probe + +! Probe and instrument VIS1 instruction. Output is number of cycles it +! takes to execute rdtick and pair of VIS1 instructions. US-Tx VIS unit +! is slow (documented to be 6 cycles on T2) and the core is in-order +! single-issue, it should be possible to distinguish Tx reliably... +! Observed return values are: +! +! UltraSPARC IIe 7 +! UltraSPARC III 7 +! UltraSPARC T1 24 +! +! Numbers for T2 and SPARC64 V-VII are more than welcomed. +! +! It would be possible to detect specifically US-T1 by instrumenting +! fmul8ulx16, which is emulated on T1 and as such accounts for quite +! a lot of %tick-s, couple of thousand on Linux... +.global _sparcv9_vis1_instrument +.align 8 +_sparcv9_vis1_instrument: + .word 0x91410000 !rd %tick,%o0 + .word 0x81b00d80 !fxor %f0,%f0,%f0 + .word 0x85b08d82 !fxor %f2,%f2,%f2 + .word 0x93410000 !rd %tick,%o1 + .word 0x81b00d80 !fxor %f0,%f0,%f0 + .word 0x85b08d82 !fxor %f2,%f2,%f2 + .word 0x95410000 !rd %tick,%o2 + .word 0x81b00d80 !fxor %f0,%f0,%f0 + .word 0x85b08d82 !fxor %f2,%f2,%f2 + .word 0x97410000 !rd %tick,%o3 + .word 0x81b00d80 !fxor %f0,%f0,%f0 + .word 0x85b08d82 !fxor %f2,%f2,%f2 + .word 0x99410000 !rd %tick,%o4 + + ! calculate intervals + sub %o1,%o0,%o0 + sub %o2,%o1,%o1 + sub %o3,%o2,%o2 + sub %o4,%o3,%o3 + + ! find minumum value + cmp %o0,%o1 + .word 0x38680002 !bgu,a %xcc,.+8 + mov %o1,%o0 + cmp %o0,%o2 + .word 0x38680002 !bgu,a %xcc,.+8 + mov %o2,%o0 + cmp %o0,%o3 + .word 0x38680002 !bgu,a %xcc,.+8 + mov %o3,%o0 + retl nop -.type OPENSSL_rdtsc,#function -.size OPENSSL_rdtsc,.-OPENSSL_atomic_add +.type _sparcv9_vis1_instrument,#function +.size _sparcv9_vis1_instrument,.-_sparcv9_vis1_instrument + +.global _sparcv9_vis2_probe +.align 8 +_sparcv9_vis2_probe: + retl + .word 0x81b00980 !bshuffle %f0,%f0,%f0 +.type _sparcv9_vis2_probe,#function +.size _sparcv9_vis2_probe,.-_sparcv9_vis2_probe + +.global _sparcv9_fmadd_probe +.align 8 +_sparcv9_fmadd_probe: + .word 0x81b00d80 !fxor %f0,%f0,%f0 + .word 0x85b08d82 !fxor %f2,%f2,%f2 + retl + .word 0x81b80440 !fmaddd %f0,%f0,%f2,%f0 +.type _sparcv9_fmadd_probe,#function +.size _sparcv9_fmadd_probe,.-_sparcv9_fmadd_probe + +.section ".init",#alloc,#execinstr + call OPENSSL_cpuid_setup + nop diff --git a/lib/libcrypto/x86_64cpuid.pl b/lib/libcrypto/x86_64cpuid.pl index 6ebfd017ea5..8422e913426 100644 --- a/lib/libcrypto/x86_64cpuid.pl +++ b/lib/libcrypto/x86_64cpuid.pl @@ -172,41 +172,6 @@ OPENSSL_ia32_cpuid: or %r9,%rax ret .size OPENSSL_ia32_cpuid,.-OPENSSL_ia32_cpuid - -.globl OPENSSL_cleanse -.type OPENSSL_cleanse,\@abi-omnipotent -.align 16 -OPENSSL_cleanse: - xor %rax,%rax - cmp \$15,$arg2 - jae .Lot - cmp \$0,$arg2 - je .Lret -.Little: - mov %al,($arg1) - sub \$1,$arg2 - lea 1($arg1),$arg1 - jnz .Little -.Lret: - ret -.align 16 -.Lot: - test \$7,$arg1 - jz .Laligned - mov %al,($arg1) - lea -1($arg2),$arg2 - lea 1($arg1),$arg1 - jmp .Lot -.Laligned: - mov %rax,($arg1) - lea -8($arg2),$arg2 - test \$-8,$arg2 - lea 8($arg1),$arg1 - jnz .Laligned - cmp \$0,$arg2 - jne .Little - ret -.size OPENSSL_cleanse,.-OPENSSL_cleanse ___ print<<___ if (!$win64); diff --git a/lib/libcrypto/x86cpuid.pl b/lib/libcrypto/x86cpuid.pl index 4408ef2936e..0da613f6971 100644 --- a/lib/libcrypto/x86cpuid.pl +++ b/lib/libcrypto/x86cpuid.pl @@ -1,6 +1,7 @@ #!/usr/bin/env perl -push(@INC,"perlasm"); +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +push(@INC, "${dir}perlasm", "perlasm"); require "x86asm.pl"; &asm_init($ARGV[0],"x86cpuid"); @@ -18,42 +19,127 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } &pushf (); &pop ("eax"); &xor ("ecx","eax"); - &bt ("ecx",21); - &jnc (&label("done")); &xor ("eax","eax"); + &bt ("ecx",21); + &jnc (&label("nocpuid")); &cpuid (); + &mov ("edi","eax"); # max value for standard query level + &xor ("eax","eax"); &cmp ("ebx",0x756e6547); # "Genu" - &data_byte(0x0f,0x95,0xc0); #&setne (&LB("eax")); + &setne (&LB("eax")); &mov ("ebp","eax"); &cmp ("edx",0x49656e69); # "ineI" - &data_byte(0x0f,0x95,0xc0); #&setne (&LB("eax")); + &setne (&LB("eax")); &or ("ebp","eax"); &cmp ("ecx",0x6c65746e); # "ntel" - &data_byte(0x0f,0x95,0xc0); #&setne (&LB("eax")); - &or ("ebp","eax"); + &setne (&LB("eax")); + &or ("ebp","eax"); # 0 indicates Intel CPU + &jz (&label("intel")); + + &cmp ("ebx",0x68747541); # "Auth" + &setne (&LB("eax")); + &mov ("esi","eax"); + &cmp ("edx",0x69746E65); # "enti" + &setne (&LB("eax")); + &or ("esi","eax"); + &cmp ("ecx",0x444D4163); # "cAMD" + &setne (&LB("eax")); + &or ("esi","eax"); # 0 indicates AMD CPU + &jnz (&label("intel")); + + # AMD specific + &mov ("eax",0x80000000); + &cpuid (); + &cmp ("eax",0x80000001); + &jb (&label("intel")); + &mov ("esi","eax"); + &mov ("eax",0x80000001); + &cpuid (); + &or ("ebp","ecx"); + &and ("ebp",1<<11|1); # isolate XOP bit + &cmp ("esi",0x80000008); + &jb (&label("intel")); + + &mov ("eax",0x80000008); + &cpuid (); + &movz ("esi",&LB("ecx")); # number of cores - 1 + &inc ("esi"); # number of cores + + &mov ("eax",1); + &xor ("ecx","ecx"); + &cpuid (); + &bt ("edx",28); + &jnc (&label("generic")); + &shr ("ebx",16); + &and ("ebx",0xff); + &cmp ("ebx","esi"); + &ja (&label("generic")); + &and ("edx",0xefffffff); # clear hyper-threading bit + &jmp (&label("generic")); + +&set_label("intel"); + &cmp ("edi",4); + &mov ("edi",-1); + &jb (&label("nocacheinfo")); + + &mov ("eax",4); + &mov ("ecx",0); # query L1D + &cpuid (); + &mov ("edi","eax"); + &shr ("edi",14); + &and ("edi",0xfff); # number of cores -1 per L1D + +&set_label("nocacheinfo"); &mov ("eax",1); + &xor ("ecx","ecx"); &cpuid (); + &and ("edx",0xbfefffff); # force reserved bits #20, #30 to 0 &cmp ("ebp",0); - &jne (&label("notP4")); - &and ("eax",15<<8); # familiy ID - &cmp ("eax",15<<8); # P4? - &jne (&label("notP4")); - &or ("edx",1<<20); # use reserved bit to engage RC4_CHAR -&set_label("notP4"); + &jne (&label("notintel")); + &or ("edx",1<<30); # set reserved bit#30 on Intel CPUs + &and (&HB("eax"),15); # familiy ID + &cmp (&HB("eax"),15); # P4? + &jne (&label("notintel")); + &or ("edx",1<<20); # set reserved bit#20 to engage RC4_CHAR +&set_label("notintel"); &bt ("edx",28); # test hyper-threading bit - &jnc (&label("done")); + &jnc (&label("generic")); + &and ("edx",0xefffffff); + &cmp ("edi",0); + &je (&label("generic")); + + &or ("edx",0x10000000); &shr ("ebx",16); - &and ("ebx",0xff); - &cmp ("ebx",1); # see if cache is shared(*) - &ja (&label("done")); + &cmp (&LB("ebx"),1); + &ja (&label("generic")); &and ("edx",0xefffffff); # clear hyper-threading bit if not + +&set_label("generic"); + &and ("ebp",1<<11); # isolate AMD XOP flag + &and ("ecx",0xfffff7ff); # force 11th bit to 0 + &mov ("esi","edx"); + &or ("ebp","ecx"); # merge AMD XOP flag + + &bt ("ecx",27); # check OSXSAVE bit + &jnc (&label("clear_avx")); + &xor ("ecx","ecx"); + &data_byte(0x0f,0x01,0xd0); # xgetbv + &and ("eax",6); + &cmp ("eax",6); + &je (&label("done")); + &cmp ("eax",2); + &je (&label("clear_avx")); +&set_label("clear_xmm"); + &and ("ebp",0xfdfffffd); # clear AESNI and PCLMULQDQ bits + &and ("esi",0xfeffffff); # clear FXSR +&set_label("clear_avx"); + &and ("ebp",0xefffe7ff); # clear AVX, FMA and AMD XOP bits &set_label("done"); - &mov ("eax","edx"); - &mov ("edx","ecx"); + &mov ("eax","esi"); + &mov ("edx","ebp"); +&set_label("nocpuid"); &function_end("OPENSSL_ia32_cpuid"); -# (*) on Core2 this value is set to 2 denoting the fact that L2 -# cache is shared between cores. &external_label("OPENSSL_ia32cap_P"); @@ -81,7 +167,7 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } &jnz (&label("nohalt")); # not enough privileges &pushf (); - &pop ("eax") + &pop ("eax"); &bt ("eax",9); &jnc (&label("nohalt")); # interrupts are disabled @@ -146,8 +232,9 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } &bt (&DWP(0,"ecx"),1); &jnc (&label("no_x87")); if ($sse2) { - &bt (&DWP(0,"ecx"),26); - &jnc (&label("no_sse2")); + &and ("ecx",1<<26|1<<24); # check SSE2 and FXSR bits + &cmp ("ecx",1<<26|1<<24); + &jne (&label("no_sse2")); &pxor ("xmm0","xmm0"); &pxor ("xmm1","xmm1"); &pxor ("xmm2","xmm2"); @@ -195,7 +282,7 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } # arguments is 1 or 2! &function_begin_B("OPENSSL_indirect_call"); { - my $i,$max=7; # $max has to be chosen as 4*n-1 + my ($max,$i)=(7,); # $max has to be chosen as 4*n-1 # in order to preserve eventual # stack alignment &push ("ebp"); @@ -220,6 +307,18 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } } &function_end_B("OPENSSL_indirect_call"); +&function_begin_B("OPENSSL_ia32_rdrand"); + &mov ("ecx",8); +&set_label("loop"); + &rdrand ("eax"); + &jc (&label("break")); + &loop (&label("loop")); +&set_label("break"); + &cmp ("eax",0); + &cmove ("eax","ecx"); + &ret (); +&function_end_B("OPENSSL_ia32_rdrand"); + &initseg("OPENSSL_cpuid_setup"); &asm_finish(); |