summaryrefslogtreecommitdiff
path: root/lib/libcrypto
diff options
context:
space:
mode:
authorMiod Vallat <miod@cvs.openbsd.org>2014-04-17 18:16:46 +0000
committerMiod Vallat <miod@cvs.openbsd.org>2014-04-17 18:16:46 +0000
commitdc1bbba22268c44b47967c17fa2e43f15fe59bf0 (patch)
treec7030843ca3e64bea8319af2fca2ee531db3f38a /lib/libcrypto
parentcde0e7d71fe0bef955ef6a27178b658f0c12a660 (diff)
Ok, there was a need for OPENSSL_cleanse() instead of bzero() to prevent
supposedly smart compilers from optimizing memory cleanups away. Understood. Ok, in case of an hypothetically super smart compiler, OPENSSL_cleanse() had to be convoluted enough for the compiler not to recognize that this was actually bzero() in disguise. Understood. But then why there had been optimized assembler versions of OPENSSL_cleanse() is beyond me. Did someone not trust the C obfuscation?
Diffstat (limited to 'lib/libcrypto')
-rw-r--r--lib/libcrypto/alphacpuid.pl37
-rw-r--r--lib/libcrypto/armv4cpuid.S32
-rw-r--r--lib/libcrypto/ia64cpuid.S9
-rw-r--r--lib/libcrypto/pariscid.pl61
-rwxr-xr-xlib/libcrypto/ppccpuid.pl68
-rw-r--r--lib/libcrypto/s390xcpuid.S50
-rw-r--r--lib/libcrypto/sparccpuid.S120
-rw-r--r--lib/libcrypto/x86_64cpuid.pl35
-rw-r--r--lib/libcrypto/x86cpuid.pl149
9 files changed, 290 insertions, 271 deletions
diff --git a/lib/libcrypto/alphacpuid.pl b/lib/libcrypto/alphacpuid.pl
index 4b3cbb9827d..f6aea6a7663 100644
--- a/lib/libcrypto/alphacpuid.pl
+++ b/lib/libcrypto/alphacpuid.pl
@@ -86,41 +86,4 @@ OPENSSL_rdtsc:
rpcc $0
ret ($26)
.end OPENSSL_rdtsc
-
-.globl OPENSSL_cleanse
-.ent OPENSSL_cleanse
-OPENSSL_cleanse:
- .frame $30,0,$26
- .prologue 0
- beq $17,.Ldone
- and $16,7,$0
- bic $17,7,$at
- beq $at,.Little
- beq $0,.Laligned
-
-.Little:
- subq $0,8,$0
- ldq_u $1,0($16)
- mov $16,$2
-.Lalign:
- mskbl $1,$16,$1
- lda $16,1($16)
- subq $17,1,$17
- addq $0,1,$0
- beq $17,.Lout
- bne $0,.Lalign
-.Lout: stq_u $1,0($2)
- beq $17,.Ldone
- bic $17,7,$at
- beq $at,.Little
-
-.Laligned:
- stq $31,0($16)
- subq $17,8,$17
- lda $16,8($16)
- bic $17,7,$at
- bne $at,.Laligned
- bne $17,.Little
-.Ldone: ret ($26)
-.end OPENSSL_cleanse
___
diff --git a/lib/libcrypto/armv4cpuid.S b/lib/libcrypto/armv4cpuid.S
index 2d618deaa43..bdfde19c1fc 100644
--- a/lib/libcrypto/armv4cpuid.S
+++ b/lib/libcrypto/armv4cpuid.S
@@ -54,38 +54,6 @@ OPENSSL_atomic_add:
#endif
.size OPENSSL_atomic_add,.-OPENSSL_atomic_add
-.global OPENSSL_cleanse
-.type OPENSSL_cleanse,%function
-OPENSSL_cleanse:
- eor ip,ip,ip
- cmp r1,#7
- subhs r1,r1,#4
- bhs .Lot
- cmp r1,#0
- beq .Lcleanse_done
-.Little:
- strb ip,[r0],#1
- subs r1,r1,#1
- bhi .Little
- b .Lcleanse_done
-
-.Lot: tst r0,#3
- beq .Laligned
- strb ip,[r0],#1
- sub r1,r1,#1
- b .Lot
-.Laligned:
- str ip,[r0],#4
- subs r1,r1,#4
- bhs .Laligned
- adds r1,r1,#4
- bne .Little
-.Lcleanse_done:
- tst lr,#1
- moveq pc,lr
- .word 0xe12fff1e @ bx lr
-.size OPENSSL_cleanse,.-OPENSSL_cleanse
-
.global OPENSSL_wipe_cpu
.type OPENSSL_wipe_cpu,%function
OPENSSL_wipe_cpu:
diff --git a/lib/libcrypto/ia64cpuid.S b/lib/libcrypto/ia64cpuid.S
index 04fbb3439eb..517d9388881 100644
--- a/lib/libcrypto/ia64cpuid.S
+++ b/lib/libcrypto/ia64cpuid.S
@@ -1,6 +1,13 @@
// Works on all IA-64 platforms: Linux, HP-UX, Win64i...
// On Win64i compile with ias.exe.
.text
+
+.global OPENSSL_cpuid_setup#
+.proc OPENSSL_cpuid_setup#
+OPENSSL_cpuid_setup:
+{ .mib; br.ret.sptk.many b0 };;
+.endp OPENSSL_cpuid_setup#
+
.global OPENSSL_rdtsc#
.proc OPENSSL_rdtsc#
OPENSSL_rdtsc:
@@ -19,7 +26,7 @@ OPENSSL_atomic_add:
{ .mii; mov ar.ccv=r2
add r8=r2,r33
mov r3=r2 };;
-{ .mmi; mf
+{ .mmi; mf;;
cmpxchg4.acq r2=[r32],r8,ar.ccv
nop.i 0 };;
{ .mib; cmp.ne p6,p0=r2,r3
diff --git a/lib/libcrypto/pariscid.pl b/lib/libcrypto/pariscid.pl
index 477ec9b87dd..38985afbacb 100644
--- a/lib/libcrypto/pariscid.pl
+++ b/lib/libcrypto/pariscid.pl
@@ -87,50 +87,6 @@ OPENSSL_wipe_cpu
.PROCEND
___
{
-my $inp="%r26";
-my $len="%r25";
-
-$code.=<<___;
- .EXPORT OPENSSL_cleanse,ENTRY,ARGW0=GR,ARGW1=GR
- .ALIGN 8
-OPENSSL_cleanse
- .PROC
- .CALLINFO NO_CALLS
- .ENTRY
- cmpib,*= 0,$len,Ldone
- nop
- cmpib,*>>= 15,$len,Little
- ldi $SIZE_T-1,%r1
-
-Lalign
- and,*<> $inp,%r1,%r28
- b,n Laligned
- stb %r0,0($inp)
- ldo -1($len),$len
- b Lalign
- ldo 1($inp),$inp
-
-Laligned
- andcm $len,%r1,%r28
-Lot
- $ST %r0,0($inp)
- addib,*<> -$SIZE_T,%r28,Lot
- ldo $SIZE_T($inp),$inp
-
- and,*<> $len,%r1,$len
- b,n Ldone
-Little
- stb %r0,0($inp)
- addib,*<> -1,$len,Little
- ldo 1($inp),$inp
-Ldone
- bv ($rp)
- .EXIT
- nop
- .PROCEND
-___
-}
-{
my ($out,$cnt,$max)=("%r26","%r25","%r24");
my ($tick,$lasttick)=("%r23","%r22");
my ($diff,$lastdiff)=("%r21","%r20");
@@ -151,7 +107,7 @@ OPENSSL_instrument_bus
ldw 0($out),$tick
add $diff,$tick,$tick
stw $tick,0($out)
-Loop
+L\$oop
mfctl %cr16,$tick
sub $tick,$lasttick,$diff
copy $tick,$lasttick
@@ -161,7 +117,7 @@ Loop
add $diff,$tick,$tick
stw $tick,0($out)
- addib,<> -1,$cnt,Loop
+ addib,<> -1,$cnt,L\$oop
addi 4,$out,$out
bv ($rp)
@@ -190,14 +146,14 @@ OPENSSL_instrument_bus2
mfctl %cr16,$tick
sub $tick,$lasttick,$diff
copy $tick,$lasttick
-Loop2
+L\$oop2
copy $diff,$lastdiff
fdc 0($out)
ldw 0($out),$tick
add $diff,$tick,$tick
stw $tick,0($out)
- addib,= -1,$max,Ldone2
+ addib,= -1,$max,L\$done2
nop
mfctl %cr16,$tick
@@ -208,17 +164,18 @@ Loop2
ldi 1,%r1
xor %r1,$tick,$tick
- addb,<> $tick,$cnt,Loop2
+ addb,<> $tick,$cnt,L\$oop2
shladd,l $tick,2,$out,$out
-Ldone2
+L\$done2
bv ($rp)
.EXIT
add $rv,$cnt,$rv
.PROCEND
___
}
-$code =~ s/cmpib,\*/comib,/gm if ($SIZE_T==4);
-$code =~ s/,\*/,/gm if ($SIZE_T==4);
+$code =~ s/cmpib,\*/comib,/gm if ($SIZE_T==4);
+$code =~ s/,\*/,/gm if ($SIZE_T==4);
+$code =~ s/\bbv\b/bve/gm if ($SIZE_T==8);
print $code;
close STDOUT;
diff --git a/lib/libcrypto/ppccpuid.pl b/lib/libcrypto/ppccpuid.pl
index fe44ff07bc6..cf48714e338 100755
--- a/lib/libcrypto/ppccpuid.pl
+++ b/lib/libcrypto/ppccpuid.pl
@@ -23,36 +23,67 @@ $code=<<___;
.machine "any"
.text
-.globl .OPENSSL_cpuid_setup
+.globl .OPENSSL_ppc64_probe
.align 4
-.OPENSSL_cpuid_setup:
+.OPENSSL_ppc64_probe:
+ fcfid f1,f1
+ extrdi r0,r0,32,0
blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,0,0
+
+.globl .OPENSSL_altivec_probe
+.align 4
+.OPENSSL_altivec_probe:
+ .long 0x10000484 # vor v0,v0,v0
+ blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,0,0
.globl .OPENSSL_wipe_cpu
.align 4
.OPENSSL_wipe_cpu:
xor r0,r0,r0
+ fmr f0,f31
+ fmr f1,f31
+ fmr f2,f31
mr r3,r1
+ fmr f3,f31
xor r4,r4,r4
+ fmr f4,f31
xor r5,r5,r5
+ fmr f5,f31
xor r6,r6,r6
+ fmr f6,f31
xor r7,r7,r7
+ fmr f7,f31
xor r8,r8,r8
+ fmr f8,f31
xor r9,r9,r9
+ fmr f9,f31
xor r10,r10,r10
+ fmr f10,f31
xor r11,r11,r11
+ fmr f11,f31
xor r12,r12,r12
+ fmr f12,f31
+ fmr f13,f31
blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,0,0
.globl .OPENSSL_atomic_add
.align 4
.OPENSSL_atomic_add:
-Loop: lwarx r5,0,r3
+Ladd: lwarx r5,0,r3
add r0,r4,r5
stwcx. r0,0,r3
- bne- Loop
+ bne- Ladd
$SIGNX r3,r0
blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,2,0
+ .long 0
.globl .OPENSSL_rdtsc
.align 4
@@ -60,33 +91,8 @@ Loop: lwarx r5,0,r3
mftb r3
mftbu r4
blr
-
-.globl .OPENSSL_cleanse
-.align 4
-.OPENSSL_cleanse:
- $CMPLI r4,7
- li r0,0
- bge Lot
-Little: mtctr r4
- stb r0,0(r3)
- addi r3,r3,1
- bdnz- \$-8
- blr
-Lot: andi. r5,r3,3
- beq Laligned
- stb r0,0(r3)
- subi r4,r4,1
- addi r3,r3,1
- b Lot
-Laligned:
- $SHRLI r5,r4,2
- mtctr r5
- stw r0,0(r3)
- addi r3,r3,4
- bdnz- \$-8
- andi. r4,r4,3
- bne Little
- blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,0,0
___
$code =~ s/\`([^\`]*)\`/eval $1/gem;
diff --git a/lib/libcrypto/s390xcpuid.S b/lib/libcrypto/s390xcpuid.S
index 8500133ad0f..89bf6be82f5 100644
--- a/lib/libcrypto/s390xcpuid.S
+++ b/lib/libcrypto/s390xcpuid.S
@@ -1,19 +1,18 @@
.text
-.globl OPENSSL_cpuid_setup
-.type OPENSSL_cpuid_setup,@function
-.align 16
-OPENSSL_cpuid_setup:
- br %r14 # reserved for future
-.size OPENSSL_cpuid_setup,.-OPENSSL_cpuid_setup
-
.globl OPENSSL_s390x_facilities
.type OPENSSL_s390x_facilities,@function
.align 16
OPENSSL_s390x_facilities:
lghi %r0,0
- .long 0xb2b0f010 # stfle 16(%r15)
- lg %r2,16(%r15)
+ larl %r2,OPENSSL_s390xcap_P
+ stg %r0,8(%r2)
+ .long 0xb2b02000 # stfle 0(%r2)
+ brc 8,.Ldone
+ lghi %r0,1
+ .long 0xb2b02000 # stfle 0(%r2)
+.Ldone:
+ lg %r2,0(%r2)
br %r14
.size OPENSSL_s390x_facilities,.-OPENSSL_s390x_facilities
@@ -59,32 +58,7 @@ OPENSSL_wipe_cpu:
br %r14
.size OPENSSL_wipe_cpu,.-OPENSSL_wipe_cpu
-.globl OPENSSL_cleanse
-.type OPENSSL_cleanse,@function
-.align 16
-OPENSSL_cleanse:
- lghi %r4,15
- lghi %r0,0
- clgr %r3,%r4
- jh .Lot
-.Little:
- stc %r0,0(%r2)
- la %r2,1(%r2)
- brctg %r3,.Little
- br %r14
-.align 4
-.Lot: tmll %r2,7
- jz .Laligned
- stc %r0,0(%r2)
- la %r2,1(%r2)
- brctg %r3,.Lot
-.Laligned:
- srlg %r4,%r3,3
-.Loop: stg %r0,0(%r2)
- la %r2,8(%r2)
- brctg %r4,.Loop
- lghi %r4,7
- ngr %r3,%r4
- jnz .Little
- br %r14
-.size OPENSSL_cleanse,.-OPENSSL_cleanse
+.section .init
+ brasl %r14,OPENSSL_cpuid_setup
+
+.comm OPENSSL_s390xcap_P,16,8
diff --git a/lib/libcrypto/sparccpuid.S b/lib/libcrypto/sparccpuid.S
index c17350fc89e..d8b44af2f0a 100644
--- a/lib/libcrypto/sparccpuid.S
+++ b/lib/libcrypto/sparccpuid.S
@@ -34,7 +34,8 @@ OPENSSL_wipe_cpu:
nop
call .PIC.zero.up
mov .zero-(.-4),%o0
- ldd [%o0],%f0
+ ld [%o0],%f0
+ ld [%o0],%f1
subcc %g0,1,%o0
! Following is V9 "rd %ccr,%o0" instruction. However! V8
@@ -166,6 +167,7 @@ walk_reg_wins:
.global OPENSSL_atomic_add
.type OPENSSL_atomic_add,#function
+.align 32
OPENSSL_atomic_add:
#ifndef ABI64
subcc %g0,1,%o2
@@ -177,7 +179,7 @@ OPENSSL_atomic_add:
ba .enter
nop
#ifdef __sun
-! Note that you don't have to link with libthread to call thr_yield,
+! Note that you do not have to link with libthread to call thr_yield,
! as libc provides a stub, which is overloaded the moment you link
! with *either* libpthread or libthread...
#define YIELD_CPU thr_yield
@@ -213,27 +215,105 @@ OPENSSL_atomic_add:
sra %o0,%g0,%o0 ! we return signed int, remember?
.size OPENSSL_atomic_add,.-OPENSSL_atomic_add
-.global OPENSSL_rdtsc
+.global _sparcv9_rdtick
+.align 32
+_sparcv9_rdtick:
subcc %g0,1,%o0
.word 0x91408000 !rd %ccr,%o0
cmp %o0,0x99
- bne .notsc
+ bne .notick
xor %o0,%o0,%o0
- save %sp,FRAME-16,%sp
- mov 513,%o0 !SI_PLATFORM
- add %sp,BIAS+16,%o1
- call sysinfo
- mov 256,%o2
-
- add %sp,BIAS-16,%o1
- ld [%o1],%l0
- ld [%o1+4],%l1
- ld [%o1+8],%l2
- mov %lo('SUNW'),%l3
- ret
- restore
-.notsc:
+ .word 0x91410000 !rd %tick,%o0
+ retl
+ .word 0x93323020 !srlx %o0,32,%o1
+.notick:
+ retl
+ xor %o1,%o1,%o1
+.type _sparcv9_rdtick,#function
+.size _sparcv9_rdtick,.-_sparcv9_rdtick
+
+.global _sparcv9_vis1_probe
+.align 8
+_sparcv9_vis1_probe:
+ add %sp,BIAS+2,%o1
+ .word 0xc19a5a40 !ldda [%o1]ASI_FP16_P,%f0
+ retl
+ .word 0x81b00d80 !fxor %f0,%f0,%f0
+.type _sparcv9_vis1_probe,#function
+.size _sparcv9_vis1_probe,.-_sparcv9_vis1_probe
+
+! Probe and instrument VIS1 instruction. Output is number of cycles it
+! takes to execute rdtick and pair of VIS1 instructions. US-Tx VIS unit
+! is slow (documented to be 6 cycles on T2) and the core is in-order
+! single-issue, it should be possible to distinguish Tx reliably...
+! Observed return values are:
+!
+! UltraSPARC IIe 7
+! UltraSPARC III 7
+! UltraSPARC T1 24
+!
+! Numbers for T2 and SPARC64 V-VII are more than welcomed.
+!
+! It would be possible to detect specifically US-T1 by instrumenting
+! fmul8ulx16, which is emulated on T1 and as such accounts for quite
+! a lot of %tick-s, couple of thousand on Linux...
+.global _sparcv9_vis1_instrument
+.align 8
+_sparcv9_vis1_instrument:
+ .word 0x91410000 !rd %tick,%o0
+ .word 0x81b00d80 !fxor %f0,%f0,%f0
+ .word 0x85b08d82 !fxor %f2,%f2,%f2
+ .word 0x93410000 !rd %tick,%o1
+ .word 0x81b00d80 !fxor %f0,%f0,%f0
+ .word 0x85b08d82 !fxor %f2,%f2,%f2
+ .word 0x95410000 !rd %tick,%o2
+ .word 0x81b00d80 !fxor %f0,%f0,%f0
+ .word 0x85b08d82 !fxor %f2,%f2,%f2
+ .word 0x97410000 !rd %tick,%o3
+ .word 0x81b00d80 !fxor %f0,%f0,%f0
+ .word 0x85b08d82 !fxor %f2,%f2,%f2
+ .word 0x99410000 !rd %tick,%o4
+
+ ! calculate intervals
+ sub %o1,%o0,%o0
+ sub %o2,%o1,%o1
+ sub %o3,%o2,%o2
+ sub %o4,%o3,%o3
+
+ ! find minumum value
+ cmp %o0,%o1
+ .word 0x38680002 !bgu,a %xcc,.+8
+ mov %o1,%o0
+ cmp %o0,%o2
+ .word 0x38680002 !bgu,a %xcc,.+8
+ mov %o2,%o0
+ cmp %o0,%o3
+ .word 0x38680002 !bgu,a %xcc,.+8
+ mov %o3,%o0
+
retl
nop
-.type OPENSSL_rdtsc,#function
-.size OPENSSL_rdtsc,.-OPENSSL_atomic_add
+.type _sparcv9_vis1_instrument,#function
+.size _sparcv9_vis1_instrument,.-_sparcv9_vis1_instrument
+
+.global _sparcv9_vis2_probe
+.align 8
+_sparcv9_vis2_probe:
+ retl
+ .word 0x81b00980 !bshuffle %f0,%f0,%f0
+.type _sparcv9_vis2_probe,#function
+.size _sparcv9_vis2_probe,.-_sparcv9_vis2_probe
+
+.global _sparcv9_fmadd_probe
+.align 8
+_sparcv9_fmadd_probe:
+ .word 0x81b00d80 !fxor %f0,%f0,%f0
+ .word 0x85b08d82 !fxor %f2,%f2,%f2
+ retl
+ .word 0x81b80440 !fmaddd %f0,%f0,%f2,%f0
+.type _sparcv9_fmadd_probe,#function
+.size _sparcv9_fmadd_probe,.-_sparcv9_fmadd_probe
+
+.section ".init",#alloc,#execinstr
+ call OPENSSL_cpuid_setup
+ nop
diff --git a/lib/libcrypto/x86_64cpuid.pl b/lib/libcrypto/x86_64cpuid.pl
index 6ebfd017ea5..8422e913426 100644
--- a/lib/libcrypto/x86_64cpuid.pl
+++ b/lib/libcrypto/x86_64cpuid.pl
@@ -172,41 +172,6 @@ OPENSSL_ia32_cpuid:
or %r9,%rax
ret
.size OPENSSL_ia32_cpuid,.-OPENSSL_ia32_cpuid
-
-.globl OPENSSL_cleanse
-.type OPENSSL_cleanse,\@abi-omnipotent
-.align 16
-OPENSSL_cleanse:
- xor %rax,%rax
- cmp \$15,$arg2
- jae .Lot
- cmp \$0,$arg2
- je .Lret
-.Little:
- mov %al,($arg1)
- sub \$1,$arg2
- lea 1($arg1),$arg1
- jnz .Little
-.Lret:
- ret
-.align 16
-.Lot:
- test \$7,$arg1
- jz .Laligned
- mov %al,($arg1)
- lea -1($arg2),$arg2
- lea 1($arg1),$arg1
- jmp .Lot
-.Laligned:
- mov %rax,($arg1)
- lea -8($arg2),$arg2
- test \$-8,$arg2
- lea 8($arg1),$arg1
- jnz .Laligned
- cmp \$0,$arg2
- jne .Little
- ret
-.size OPENSSL_cleanse,.-OPENSSL_cleanse
___
print<<___ if (!$win64);
diff --git a/lib/libcrypto/x86cpuid.pl b/lib/libcrypto/x86cpuid.pl
index 4408ef2936e..0da613f6971 100644
--- a/lib/libcrypto/x86cpuid.pl
+++ b/lib/libcrypto/x86cpuid.pl
@@ -1,6 +1,7 @@
#!/usr/bin/env perl
-push(@INC,"perlasm");
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+push(@INC, "${dir}perlasm", "perlasm");
require "x86asm.pl";
&asm_init($ARGV[0],"x86cpuid");
@@ -18,42 +19,127 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
&pushf ();
&pop ("eax");
&xor ("ecx","eax");
- &bt ("ecx",21);
- &jnc (&label("done"));
&xor ("eax","eax");
+ &bt ("ecx",21);
+ &jnc (&label("nocpuid"));
&cpuid ();
+ &mov ("edi","eax"); # max value for standard query level
+
&xor ("eax","eax");
&cmp ("ebx",0x756e6547); # "Genu"
- &data_byte(0x0f,0x95,0xc0); #&setne (&LB("eax"));
+ &setne (&LB("eax"));
&mov ("ebp","eax");
&cmp ("edx",0x49656e69); # "ineI"
- &data_byte(0x0f,0x95,0xc0); #&setne (&LB("eax"));
+ &setne (&LB("eax"));
&or ("ebp","eax");
&cmp ("ecx",0x6c65746e); # "ntel"
- &data_byte(0x0f,0x95,0xc0); #&setne (&LB("eax"));
- &or ("ebp","eax");
+ &setne (&LB("eax"));
+ &or ("ebp","eax"); # 0 indicates Intel CPU
+ &jz (&label("intel"));
+
+ &cmp ("ebx",0x68747541); # "Auth"
+ &setne (&LB("eax"));
+ &mov ("esi","eax");
+ &cmp ("edx",0x69746E65); # "enti"
+ &setne (&LB("eax"));
+ &or ("esi","eax");
+ &cmp ("ecx",0x444D4163); # "cAMD"
+ &setne (&LB("eax"));
+ &or ("esi","eax"); # 0 indicates AMD CPU
+ &jnz (&label("intel"));
+
+ # AMD specific
+ &mov ("eax",0x80000000);
+ &cpuid ();
+ &cmp ("eax",0x80000001);
+ &jb (&label("intel"));
+ &mov ("esi","eax");
+ &mov ("eax",0x80000001);
+ &cpuid ();
+ &or ("ebp","ecx");
+ &and ("ebp",1<<11|1); # isolate XOP bit
+ &cmp ("esi",0x80000008);
+ &jb (&label("intel"));
+
+ &mov ("eax",0x80000008);
+ &cpuid ();
+ &movz ("esi",&LB("ecx")); # number of cores - 1
+ &inc ("esi"); # number of cores
+
+ &mov ("eax",1);
+ &xor ("ecx","ecx");
+ &cpuid ();
+ &bt ("edx",28);
+ &jnc (&label("generic"));
+ &shr ("ebx",16);
+ &and ("ebx",0xff);
+ &cmp ("ebx","esi");
+ &ja (&label("generic"));
+ &and ("edx",0xefffffff); # clear hyper-threading bit
+ &jmp (&label("generic"));
+
+&set_label("intel");
+ &cmp ("edi",4);
+ &mov ("edi",-1);
+ &jb (&label("nocacheinfo"));
+
+ &mov ("eax",4);
+ &mov ("ecx",0); # query L1D
+ &cpuid ();
+ &mov ("edi","eax");
+ &shr ("edi",14);
+ &and ("edi",0xfff); # number of cores -1 per L1D
+
+&set_label("nocacheinfo");
&mov ("eax",1);
+ &xor ("ecx","ecx");
&cpuid ();
+ &and ("edx",0xbfefffff); # force reserved bits #20, #30 to 0
&cmp ("ebp",0);
- &jne (&label("notP4"));
- &and ("eax",15<<8); # familiy ID
- &cmp ("eax",15<<8); # P4?
- &jne (&label("notP4"));
- &or ("edx",1<<20); # use reserved bit to engage RC4_CHAR
-&set_label("notP4");
+ &jne (&label("notintel"));
+ &or ("edx",1<<30); # set reserved bit#30 on Intel CPUs
+ &and (&HB("eax"),15); # familiy ID
+ &cmp (&HB("eax"),15); # P4?
+ &jne (&label("notintel"));
+ &or ("edx",1<<20); # set reserved bit#20 to engage RC4_CHAR
+&set_label("notintel");
&bt ("edx",28); # test hyper-threading bit
- &jnc (&label("done"));
+ &jnc (&label("generic"));
+ &and ("edx",0xefffffff);
+ &cmp ("edi",0);
+ &je (&label("generic"));
+
+ &or ("edx",0x10000000);
&shr ("ebx",16);
- &and ("ebx",0xff);
- &cmp ("ebx",1); # see if cache is shared(*)
- &ja (&label("done"));
+ &cmp (&LB("ebx"),1);
+ &ja (&label("generic"));
&and ("edx",0xefffffff); # clear hyper-threading bit if not
+
+&set_label("generic");
+ &and ("ebp",1<<11); # isolate AMD XOP flag
+ &and ("ecx",0xfffff7ff); # force 11th bit to 0
+ &mov ("esi","edx");
+ &or ("ebp","ecx"); # merge AMD XOP flag
+
+ &bt ("ecx",27); # check OSXSAVE bit
+ &jnc (&label("clear_avx"));
+ &xor ("ecx","ecx");
+ &data_byte(0x0f,0x01,0xd0); # xgetbv
+ &and ("eax",6);
+ &cmp ("eax",6);
+ &je (&label("done"));
+ &cmp ("eax",2);
+ &je (&label("clear_avx"));
+&set_label("clear_xmm");
+ &and ("ebp",0xfdfffffd); # clear AESNI and PCLMULQDQ bits
+ &and ("esi",0xfeffffff); # clear FXSR
+&set_label("clear_avx");
+ &and ("ebp",0xefffe7ff); # clear AVX, FMA and AMD XOP bits
&set_label("done");
- &mov ("eax","edx");
- &mov ("edx","ecx");
+ &mov ("eax","esi");
+ &mov ("edx","ebp");
+&set_label("nocpuid");
&function_end("OPENSSL_ia32_cpuid");
-# (*) on Core2 this value is set to 2 denoting the fact that L2
-# cache is shared between cores.
&external_label("OPENSSL_ia32cap_P");
@@ -81,7 +167,7 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
&jnz (&label("nohalt")); # not enough privileges
&pushf ();
- &pop ("eax")
+ &pop ("eax");
&bt ("eax",9);
&jnc (&label("nohalt")); # interrupts are disabled
@@ -146,8 +232,9 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
&bt (&DWP(0,"ecx"),1);
&jnc (&label("no_x87"));
if ($sse2) {
- &bt (&DWP(0,"ecx"),26);
- &jnc (&label("no_sse2"));
+ &and ("ecx",1<<26|1<<24); # check SSE2 and FXSR bits
+ &cmp ("ecx",1<<26|1<<24);
+ &jne (&label("no_sse2"));
&pxor ("xmm0","xmm0");
&pxor ("xmm1","xmm1");
&pxor ("xmm2","xmm2");
@@ -195,7 +282,7 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
# arguments is 1 or 2!
&function_begin_B("OPENSSL_indirect_call");
{
- my $i,$max=7; # $max has to be chosen as 4*n-1
+ my ($max,$i)=(7,); # $max has to be chosen as 4*n-1
# in order to preserve eventual
# stack alignment
&push ("ebp");
@@ -220,6 +307,18 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
}
&function_end_B("OPENSSL_indirect_call");
+&function_begin_B("OPENSSL_ia32_rdrand");
+ &mov ("ecx",8);
+&set_label("loop");
+ &rdrand ("eax");
+ &jc (&label("break"));
+ &loop (&label("loop"));
+&set_label("break");
+ &cmp ("eax",0);
+ &cmove ("eax","ecx");
+ &ret ();
+&function_end_B("OPENSSL_ia32_rdrand");
+
&initseg("OPENSSL_cpuid_setup");
&asm_finish();