src - OpenBSD base system

diff options


context:
space:
mode:

author	Miod Vallat <miod@cvs.openbsd.org>	2014-04-17 18:16:46 +0000
committer	Miod Vallat <miod@cvs.openbsd.org>	2014-04-17 18:16:46 +0000
commit	dc1bbba22268c44b47967c17fa2e43f15fe59bf0 (patch)
tree	c7030843ca3e64bea8319af2fca2ee531db3f38a /lib/libcrypto
parent	cde0e7d71fe0bef955ef6a27178b658f0c12a660 (diff)

Ok, there was a need for OPENSSL_cleanse() instead of bzero() to prevent

supposedly smart compilers from optimizing memory cleanups away. Understood. Ok, in the case of a hypothetically super smart compiler, OPENSSL_cleanse() had to be convoluted enough for the compiler not to recognize that this was actually bzero() in disguise. Understood. But then why there were also optimized assembler versions of OPENSSL_cleanse() is beyond me. Did someone not trust the C obfuscation?

Diffstat (limited to 'lib/libcrypto')

-rw-r--r--

lib/libcrypto/alphacpuid.pl

-rw-r--r--

lib/libcrypto/armv4cpuid.S

-rw-r--r--

lib/libcrypto/ia64cpuid.S

-rw-r--r--

lib/libcrypto/pariscid.pl

-rwxr-xr-x

lib/libcrypto/ppccpuid.pl

-rw-r--r--

lib/libcrypto/s390xcpuid.S

-rw-r--r--

lib/libcrypto/sparccpuid.S

120

-rw-r--r--

lib/libcrypto/x86_64cpuid.pl

-rw-r--r--

lib/libcrypto/x86cpuid.pl

149

9 files changed, 290 insertions, 271 deletions

diff --git a/lib/libcrypto/alphacpuid.pl b/lib/libcrypto/alphacpuid.pl
index 4b3cbb9827d..f6aea6a7663 100644
--- a/lib/libcrypto/alphacpuid.pl
+++ b/lib/libcrypto/alphacpuid.pl

@@ -86,41 +86,4 @@ OPENSSL_rdtsc:

rpcc $0

ret ($26)

.end OPENSSL_rdtsc

-.globl OPENSSL_cleanse

-.ent OPENSSL_cleanse

-OPENSSL_cleanse:

- .frame $30,0,$26

- .prologue 0

- beq $17,.Ldone

- and $16,7,$0

- bic $17,7,$at

- beq $at,.Little

- beq $0,.Laligned

-.Little:

- subq $0,8,$0

- ldq_u $1,0($16)

- mov $16,$2

-.Lalign:

- mskbl $1,$16,$1

- lda $16,1($16)

- subq $17,1,$17

- addq $0,1,$0

- beq $17,.Lout

- bne $0,.Lalign

-.Lout: stq_u $1,0($2)

- beq $17,.Ldone

- bic $17,7,$at

- beq $at,.Little

-.Laligned:

- stq $31,0($16)

- subq $17,8,$17

- lda $16,8($16)

- bic $17,7,$at

- bne $at,.Laligned

- bne $17,.Little

-.Ldone: ret ($26)

-.end OPENSSL_cleanse

___

diff --git a/lib/libcrypto/armv4cpuid.S b/lib/libcrypto/armv4cpuid.S
index 2d618deaa43..bdfde19c1fc 100644
--- a/lib/libcrypto/armv4cpuid.S
+++ b/lib/libcrypto/armv4cpuid.S

@@ -54,38 +54,6 @@ OPENSSL_atomic_add:

#endif

.size OPENSSL_atomic_add,.-OPENSSL_atomic_add

-.global OPENSSL_cleanse

-.type OPENSSL_cleanse,%function

-OPENSSL_cleanse:

- eor ip,ip,ip

- cmp r1,#7

- subhs r1,r1,#4

- bhs .Lot

- cmp r1,#0

- beq .Lcleanse_done

-.Little:

- strb ip,[r0],#1

- subs r1,r1,#1

- bhi .Little

- b .Lcleanse_done

-.Lot: tst r0,#3

- beq .Laligned

- strb ip,[r0],#1

- sub r1,r1,#1

- b .Lot

-.Laligned:

- str ip,[r0],#4

- subs r1,r1,#4

- bhs .Laligned

- adds r1,r1,#4

- bne .Little

-.Lcleanse_done:

- tst lr,#1

- moveq pc,lr

- .word 0xe12fff1e @ bx lr

-.size OPENSSL_cleanse,.-OPENSSL_cleanse

.global OPENSSL_wipe_cpu

.type OPENSSL_wipe_cpu,%function

OPENSSL_wipe_cpu:

diff --git a/lib/libcrypto/ia64cpuid.S b/lib/libcrypto/ia64cpuid.S
index 04fbb3439eb..517d9388881 100644
--- a/lib/libcrypto/ia64cpuid.S
+++ b/lib/libcrypto/ia64cpuid.S

@@ -1,6 +1,13 @@

// Works on all IA-64 platforms: Linux, HP-UX, Win64i...

// On Win64i compile with ias.exe.

.text

+.global OPENSSL_cpuid_setup#

+.proc OPENSSL_cpuid_setup#

+OPENSSL_cpuid_setup:

+{ .mib; br.ret.sptk.many b0 };;

+.endp OPENSSL_cpuid_setup#

.global OPENSSL_rdtsc#

.proc OPENSSL_rdtsc#

OPENSSL_rdtsc:

@@ -19,7 +26,7 @@ OPENSSL_atomic_add:

{ .mii; mov ar.ccv=r2

add r8=r2,r33

mov r3=r2 };;

-{ .mmi; mf

+{ .mmi; mf;;

cmpxchg4.acq r2=[r32],r8,ar.ccv

nop.i 0 };;

{ .mib; cmp.ne p6,p0=r2,r3

diff --git a/lib/libcrypto/pariscid.pl b/lib/libcrypto/pariscid.pl
index 477ec9b87dd..38985afbacb 100644
--- a/lib/libcrypto/pariscid.pl
+++ b/lib/libcrypto/pariscid.pl

@@ -87,50 +87,6 @@ OPENSSL_wipe_cpu

.PROCEND

___

{

-my $inp="%r26";

-my $len="%r25";

-$code.=<<___;

- .EXPORT OPENSSL_cleanse,ENTRY,ARGW0=GR,ARGW1=GR

- .ALIGN 8

-OPENSSL_cleanse

- .PROC

- .CALLINFO NO_CALLS

- .ENTRY

- cmpib,*= 0,$len,Ldone

- nop

- cmpib,*>>= 15,$len,Little

- ldi $SIZE_T-1,%r1

-Lalign

- and,*<> $inp,%r1,%r28

- b,n Laligned

- stb %r0,0($inp)

- ldo -1($len),$len

- b Lalign

- ldo 1($inp),$inp

-Laligned

- andcm $len,%r1,%r28

-Lot

- $ST %r0,0($inp)

- addib,*<> -$SIZE_T,%r28,Lot

- ldo $SIZE_T($inp),$inp

- and,*<> $len,%r1,$len

- b,n Ldone

-Little

- stb %r0,0($inp)

- addib,*<> -1,$len,Little

- ldo 1($inp),$inp

-Ldone

- bv ($rp)

- .EXIT

- nop

- .PROCEND

-___

my ($out,$cnt,$max)=("%r26","%r25","%r24");

my ($tick,$lasttick)=("%r23","%r22");

my ($diff,$lastdiff)=("%r21","%r20");

@@ -151,7 +107,7 @@ OPENSSL_instrument_bus

ldw 0($out),$tick

add $diff,$tick,$tick

stw $tick,0($out)

-Loop

+L\$oop

mfctl %cr16,$tick

sub $tick,$lasttick,$diff

copy $tick,$lasttick

@@ -161,7 +117,7 @@ Loop

add $diff,$tick,$tick

stw $tick,0($out)

- addib,<> -1,$cnt,Loop

+ addib,<> -1,$cnt,L\$oop

addi 4,$out,$out

bv ($rp)

@@ -190,14 +146,14 @@ OPENSSL_instrument_bus2

mfctl %cr16,$tick

sub $tick,$lasttick,$diff

copy $tick,$lasttick

-Loop2

+L\$oop2

copy $diff,$lastdiff

fdc 0($out)

ldw 0($out),$tick

add $diff,$tick,$tick

stw $tick,0($out)

- addib,= -1,$max,Ldone2

+ addib,= -1,$max,L\$done2

nop

mfctl %cr16,$tick

@@ -208,17 +164,18 @@ Loop2

ldi 1,%r1

xor %r1,$tick,$tick

- addb,<> $tick,$cnt,Loop2

+ addb,<> $tick,$cnt,L\$oop2

shladd,l $tick,2,$out,$out

-Ldone2

+L\$done2

bv ($rp)

.EXIT

add $rv,$cnt,$rv

.PROCEND

___

}

-$code =~ s/cmpib,\*/comib,/gm if ($SIZE_T==4);

-$code =~ s/,\*/,/gm if ($SIZE_T==4);

+$code =~ s/cmpib,\*/comib,/gm if ($SIZE_T==4);

+$code =~ s/,\*/,/gm if ($SIZE_T==4);

+$code =~ s/\bbv\b/bve/gm if ($SIZE_T==8);

print $code;

close STDOUT;

diff --git a/lib/libcrypto/ppccpuid.pl b/lib/libcrypto/ppccpuid.pl
index fe44ff07bc6..cf48714e338 100755
--- a/lib/libcrypto/ppccpuid.pl
+++ b/lib/libcrypto/ppccpuid.pl

@@ -23,36 +23,67 @@ $code=<<___;

.machine "any"

.text

-.globl .OPENSSL_cpuid_setup

+.globl .OPENSSL_ppc64_probe

.align 4

-.OPENSSL_cpuid_setup:

+.OPENSSL_ppc64_probe:

+ fcfid f1,f1

+ extrdi r0,r0,32,0

blr

+ .long 0

+ .byte 0,12,0x14,0,0,0,0,0

+.globl .OPENSSL_altivec_probe

+.align 4

+.OPENSSL_altivec_probe:

+ .long 0x10000484 # vor v0,v0,v0

+ blr

+ .long 0

+ .byte 0,12,0x14,0,0,0,0,0

.globl .OPENSSL_wipe_cpu

.align 4

.OPENSSL_wipe_cpu:

xor r0,r0,r0

+ fmr f0,f31

+ fmr f1,f31

+ fmr f2,f31

mr r3,r1

+ fmr f3,f31

xor r4,r4,r4

+ fmr f4,f31

xor r5,r5,r5

+ fmr f5,f31

xor r6,r6,r6

+ fmr f6,f31

xor r7,r7,r7

+ fmr f7,f31

xor r8,r8,r8

+ fmr f8,f31

xor r9,r9,r9

+ fmr f9,f31

xor r10,r10,r10

+ fmr f10,f31

xor r11,r11,r11

+ fmr f11,f31

xor r12,r12,r12

+ fmr f12,f31

+ fmr f13,f31

blr

+ .long 0

+ .byte 0,12,0x14,0,0,0,0,0

.globl .OPENSSL_atomic_add

.align 4

.OPENSSL_atomic_add:

-Loop: lwarx r5,0,r3

+Ladd: lwarx r5,0,r3

add r0,r4,r5

stwcx. r0,0,r3

- bne- Loop

+ bne- Ladd

$SIGNX r3,r0

blr

+ .long 0

+ .byte 0,12,0x14,0,0,0,2,0

+ .long 0

.globl .OPENSSL_rdtsc

.align 4

@@ -60,33 +91,8 @@ Loop: lwarx r5,0,r3

mftb r3

mftbu r4

blr

-.globl .OPENSSL_cleanse

-.align 4

-.OPENSSL_cleanse:

- $CMPLI r4,7

- li r0,0

- bge Lot

-Little: mtctr r4

- stb r0,0(r3)

- addi r3,r3,1

- bdnz- \$-8

- blr

-Lot: andi. r5,r3,3

- beq Laligned

- stb r0,0(r3)

- subi r4,r4,1

- addi r3,r3,1

- b Lot

-Laligned:

- $SHRLI r5,r4,2

- mtctr r5

- stw r0,0(r3)

- addi r3,r3,4

- bdnz- \$-8

- andi. r4,r4,3

- bne Little

- blr

+ .long 0

+ .byte 0,12,0x14,0,0,0,0,0

___

$code =~ s/\`([^\`]*)\`/eval $1/gem;

diff --git a/lib/libcrypto/s390xcpuid.S b/lib/libcrypto/s390xcpuid.S
index 8500133ad0f..89bf6be82f5 100644
--- a/lib/libcrypto/s390xcpuid.S
+++ b/lib/libcrypto/s390xcpuid.S

@@ -1,19 +1,18 @@

.text

-.globl OPENSSL_cpuid_setup

-.type OPENSSL_cpuid_setup,@function

-.align 16

-OPENSSL_cpuid_setup:

- br %r14 # reserved for future

-.size OPENSSL_cpuid_setup,.-OPENSSL_cpuid_setup

.globl OPENSSL_s390x_facilities

.type OPENSSL_s390x_facilities,@function

.align 16

OPENSSL_s390x_facilities:

lghi %r0,0

- .long 0xb2b0f010 # stfle 16(%r15)

- lg %r2,16(%r15)

+ larl %r2,OPENSSL_s390xcap_P

+ stg %r0,8(%r2)

+ .long 0xb2b02000 # stfle 0(%r2)

+ brc 8,.Ldone

+ lghi %r0,1

+ .long 0xb2b02000 # stfle 0(%r2)

+.Ldone:

+ lg %r2,0(%r2)

br %r14

.size OPENSSL_s390x_facilities,.-OPENSSL_s390x_facilities

@@ -59,32 +58,7 @@ OPENSSL_wipe_cpu:

br %r14

.size OPENSSL_wipe_cpu,.-OPENSSL_wipe_cpu

-.globl OPENSSL_cleanse

-.type OPENSSL_cleanse,@function

-.align 16

-OPENSSL_cleanse:

- lghi %r4,15

- lghi %r0,0

- clgr %r3,%r4

- jh .Lot

-.Little:

- stc %r0,0(%r2)

- la %r2,1(%r2)

- brctg %r3,.Little

- br %r14

-.align 4

-.Lot: tmll %r2,7

- jz .Laligned

- stc %r0,0(%r2)

- la %r2,1(%r2)

- brctg %r3,.Lot

-.Laligned:

- srlg %r4,%r3,3

-.Loop: stg %r0,0(%r2)

- la %r2,8(%r2)

- brctg %r4,.Loop

- lghi %r4,7

- ngr %r3,%r4

- jnz .Little

- br %r14

-.size OPENSSL_cleanse,.-OPENSSL_cleanse

+.section .init

+ brasl %r14,OPENSSL_cpuid_setup

+.comm OPENSSL_s390xcap_P,16,8

diff --git a/lib/libcrypto/sparccpuid.S b/lib/libcrypto/sparccpuid.S
index c17350fc89e..d8b44af2f0a 100644
--- a/lib/libcrypto/sparccpuid.S
+++ b/lib/libcrypto/sparccpuid.S

@@ -34,7 +34,8 @@ OPENSSL_wipe_cpu:

nop

call .PIC.zero.up

mov .zero-(.-4),%o0

- ldd [%o0],%f0

+ ld [%o0],%f0

+ ld [%o0],%f1

subcc %g0,1,%o0

! Following is V9 "rd %ccr,%o0" instruction. However! V8

@@ -166,6 +167,7 @@ walk_reg_wins:

.global OPENSSL_atomic_add

.type OPENSSL_atomic_add,#function

+.align 32

OPENSSL_atomic_add:

#ifndef ABI64

subcc %g0,1,%o2

@@ -177,7 +179,7 @@ OPENSSL_atomic_add:

ba .enter

nop

#ifdef __sun

-! Note that you don't have to link with libthread to call thr_yield,

+! Note that you do not have to link with libthread to call thr_yield,

! as libc provides a stub, which is overloaded the moment you link

! with *either* libpthread or libthread...

#define YIELD_CPU thr_yield

@@ -213,27 +215,105 @@ OPENSSL_atomic_add:

sra %o0,%g0,%o0 ! we return signed int, remember?

.size OPENSSL_atomic_add,.-OPENSSL_atomic_add

-.global OPENSSL_rdtsc

+.global _sparcv9_rdtick

+.align 32

+_sparcv9_rdtick:

subcc %g0,1,%o0

.word 0x91408000 !rd %ccr,%o0

cmp %o0,0x99

- bne .notsc

+ bne .notick

xor %o0,%o0,%o0

- save %sp,FRAME-16,%sp

- mov 513,%o0 !SI_PLATFORM

- add %sp,BIAS+16,%o1

- call sysinfo

- mov 256,%o2

- add %sp,BIAS-16,%o1

- ld [%o1],%l0

- ld [%o1+4],%l1

- ld [%o1+8],%l2

- mov %lo('SUNW'),%l3

- ret

- restore

-.notsc:

+ .word 0x91410000 !rd %tick,%o0

+ retl

+ .word 0x93323020 !srlx %o0,32,%o1

+.notick:

+ retl

+ xor %o1,%o1,%o1

+.type _sparcv9_rdtick,#function

+.size _sparcv9_rdtick,.-_sparcv9_rdtick

+.global _sparcv9_vis1_probe

+.align 8

+_sparcv9_vis1_probe:

+ add %sp,BIAS+2,%o1

+ .word 0xc19a5a40 !ldda [%o1]ASI_FP16_P,%f0

+ retl

+ .word 0x81b00d80 !fxor %f0,%f0,%f0

+.type _sparcv9_vis1_probe,#function

+.size _sparcv9_vis1_probe,.-_sparcv9_vis1_probe

+! Probe and instrument VIS1 instruction. Output is number of cycles it

+! takes to execute rdtick and pair of VIS1 instructions. US-Tx VIS unit

+! is slow (documented to be 6 cycles on T2) and the core is in-order

+! single-issue, it should be possible to distinguish Tx reliably...

+! Observed return values are:

+! UltraSPARC IIe 7

+! UltraSPARC III 7

+! UltraSPARC T1 24

+! Numbers for T2 and SPARC64 V-VII are more than welcomed.

+! It would be possible to detect specifically US-T1 by instrumenting

+! fmul8ulx16, which is emulated on T1 and as such accounts for quite

+! a lot of %tick-s, couple of thousand on Linux...

+.global _sparcv9_vis1_instrument

+.align 8

+_sparcv9_vis1_instrument:

+ .word 0x91410000 !rd %tick,%o0

+ .word 0x81b00d80 !fxor %f0,%f0,%f0

+ .word 0x85b08d82 !fxor %f2,%f2,%f2

+ .word 0x93410000 !rd %tick,%o1

+ .word 0x81b00d80 !fxor %f0,%f0,%f0

+ .word 0x85b08d82 !fxor %f2,%f2,%f2

+ .word 0x95410000 !rd %tick,%o2

+ .word 0x81b00d80 !fxor %f0,%f0,%f0

+ .word 0x85b08d82 !fxor %f2,%f2,%f2

+ .word 0x97410000 !rd %tick,%o3

+ .word 0x81b00d80 !fxor %f0,%f0,%f0

+ .word 0x85b08d82 !fxor %f2,%f2,%f2

+ .word 0x99410000 !rd %tick,%o4

+ ! calculate intervals

+ sub %o1,%o0,%o0

+ sub %o2,%o1,%o1

+ sub %o3,%o2,%o2

+ sub %o4,%o3,%o3

+ ! find minumum value

+ cmp %o0,%o1

+ .word 0x38680002 !bgu,a %xcc,.+8

+ mov %o1,%o0

+ cmp %o0,%o2

+ .word 0x38680002 !bgu,a %xcc,.+8

+ mov %o2,%o0

+ cmp %o0,%o3

+ .word 0x38680002 !bgu,a %xcc,.+8

+ mov %o3,%o0

retl

nop

-.type OPENSSL_rdtsc,#function

-.size OPENSSL_rdtsc,.-OPENSSL_atomic_add

+.type _sparcv9_vis1_instrument,#function

+.size _sparcv9_vis1_instrument,.-_sparcv9_vis1_instrument

+.global _sparcv9_vis2_probe

+.align 8

+_sparcv9_vis2_probe:

+ retl

+ .word 0x81b00980 !bshuffle %f0,%f0,%f0

+.type _sparcv9_vis2_probe,#function

+.size _sparcv9_vis2_probe,.-_sparcv9_vis2_probe

+.global _sparcv9_fmadd_probe

+.align 8

+_sparcv9_fmadd_probe:

+ .word 0x81b00d80 !fxor %f0,%f0,%f0

+ .word 0x85b08d82 !fxor %f2,%f2,%f2

+ retl

+ .word 0x81b80440 !fmaddd %f0,%f0,%f2,%f0

+.type _sparcv9_fmadd_probe,#function

+.size _sparcv9_fmadd_probe,.-_sparcv9_fmadd_probe

+.section ".init",#alloc,#execinstr

+ call OPENSSL_cpuid_setup

+ nop

diff --git a/lib/libcrypto/x86_64cpuid.pl b/lib/libcrypto/x86_64cpuid.pl
index 6ebfd017ea5..8422e913426 100644
--- a/lib/libcrypto/x86_64cpuid.pl
+++ b/lib/libcrypto/x86_64cpuid.pl

@@ -172,41 +172,6 @@ OPENSSL_ia32_cpuid:

or %r9,%rax

ret

.size OPENSSL_ia32_cpuid,.-OPENSSL_ia32_cpuid

-.globl OPENSSL_cleanse

-.type OPENSSL_cleanse,\@abi-omnipotent

-.align 16

-OPENSSL_cleanse:

- xor %rax,%rax

- cmp \$15,$arg2

- jae .Lot

- cmp \$0,$arg2

- je .Lret

-.Little:

- mov %al,($arg1)

- sub \$1,$arg2

- lea 1($arg1),$arg1

- jnz .Little

-.Lret:

- ret

-.align 16

-.Lot:

- test \$7,$arg1

- jz .Laligned

- mov %al,($arg1)

- lea -1($arg2),$arg2

- lea 1($arg1),$arg1

- jmp .Lot

-.Laligned:

- mov %rax,($arg1)

- lea -8($arg2),$arg2

- test \$-8,$arg2

- lea 8($arg1),$arg1

- jnz .Laligned

- cmp \$0,$arg2

- jne .Little

- ret

-.size OPENSSL_cleanse,.-OPENSSL_cleanse

___

print<<___ if (!$win64);

diff --git a/lib/libcrypto/x86cpuid.pl b/lib/libcrypto/x86cpuid.pl
index 4408ef2936e..0da613f6971 100644
--- a/lib/libcrypto/x86cpuid.pl
+++ b/lib/libcrypto/x86cpuid.pl

@@ -1,6 +1,7 @@

#!/usr/bin/env perl

-push(@INC,"perlasm");

+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;

+push(@INC, "${dir}perlasm", "perlasm");

require "x86asm.pl";

&asm_init($ARGV[0],"x86cpuid");

@@ -18,42 +19,127 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }

&pushf ();

&pop ("eax");

&xor ("ecx","eax");

- &bt ("ecx",21);

- &jnc (&label("done"));

&xor ("eax","eax");

+ &bt ("ecx",21);

+ &jnc (&label("nocpuid"));

&cpuid ();

+ &mov ("edi","eax"); # max value for standard query level

&xor ("eax","eax");

&cmp ("ebx",0x756e6547); # "Genu"

- &data_byte(0x0f,0x95,0xc0); #&setne (&LB("eax"));

+ &setne (&LB("eax"));

&mov ("ebp","eax");

&cmp ("edx",0x49656e69); # "ineI"

- &data_byte(0x0f,0x95,0xc0); #&setne (&LB("eax"));

+ &setne (&LB("eax"));

&or ("ebp","eax");

&cmp ("ecx",0x6c65746e); # "ntel"

- &data_byte(0x0f,0x95,0xc0); #&setne (&LB("eax"));

- &or ("ebp","eax");

+ &setne (&LB("eax"));

+ &or ("ebp","eax"); # 0 indicates Intel CPU

+ &jz (&label("intel"));

+ &cmp ("ebx",0x68747541); # "Auth"

+ &setne (&LB("eax"));

+ &mov ("esi","eax");

+ &cmp ("edx",0x69746E65); # "enti"

+ &setne (&LB("eax"));

+ &or ("esi","eax");

+ &cmp ("ecx",0x444D4163); # "cAMD"

+ &setne (&LB("eax"));

+ &or ("esi","eax"); # 0 indicates AMD CPU

+ &jnz (&label("intel"));

+ # AMD specific

+ &mov ("eax",0x80000000);

+ &cpuid ();

+ &cmp ("eax",0x80000001);

+ &jb (&label("intel"));

+ &mov ("esi","eax");

+ &mov ("eax",0x80000001);

+ &cpuid ();

+ &or ("ebp","ecx");

+ &and ("ebp",1<<11|1); # isolate XOP bit

+ &cmp ("esi",0x80000008);

+ &jb (&label("intel"));

+ &mov ("eax",0x80000008);

+ &cpuid ();

+ &movz ("esi",&LB("ecx")); # number of cores - 1

+ &inc ("esi"); # number of cores

+ &mov ("eax",1);

+ &xor ("ecx","ecx");

+ &cpuid ();

+ &bt ("edx",28);

+ &jnc (&label("generic"));

+ &shr ("ebx",16);

+ &and ("ebx",0xff);

+ &cmp ("ebx","esi");

+ &ja (&label("generic"));

+ &and ("edx",0xefffffff); # clear hyper-threading bit

+ &jmp (&label("generic"));

+&set_label("intel");

+ &cmp ("edi",4);

+ &mov ("edi",-1);

+ &jb (&label("nocacheinfo"));

+ &mov ("eax",4);

+ &mov ("ecx",0); # query L1D

+ &cpuid ();

+ &mov ("edi","eax");

+ &shr ("edi",14);

+ &and ("edi",0xfff); # number of cores -1 per L1D

+&set_label("nocacheinfo");

&mov ("eax",1);

+ &xor ("ecx","ecx");

&cpuid ();

+ &and ("edx",0xbfefffff); # force reserved bits #20, #30 to 0

&cmp ("ebp",0);

- &jne (&label("notP4"));

- &and ("eax",15<<8); # familiy ID

- &cmp ("eax",15<<8); # P4?

- &jne (&label("notP4"));

- &or ("edx",1<<20); # use reserved bit to engage RC4_CHAR

-&set_label("notP4");

+ &jne (&label("notintel"));

+ &or ("edx",1<<30); # set reserved bit#30 on Intel CPUs

+ &and (&HB("eax"),15); # familiy ID

+ &cmp (&HB("eax"),15); # P4?

+ &jne (&label("notintel"));

+ &or ("edx",1<<20); # set reserved bit#20 to engage RC4_CHAR

+&set_label("notintel");

&bt ("edx",28); # test hyper-threading bit

- &jnc (&label("done"));

+ &jnc (&label("generic"));

+ &and ("edx",0xefffffff);

+ &cmp ("edi",0);

+ &je (&label("generic"));

+ &or ("edx",0x10000000);

&shr ("ebx",16);

- &and ("ebx",0xff);

- &cmp ("ebx",1); # see if cache is shared(*)

- &ja (&label("done"));

+ &cmp (&LB("ebx"),1);

+ &ja (&label("generic"));

&and ("edx",0xefffffff); # clear hyper-threading bit if not

+&set_label("generic");

+ &and ("ebp",1<<11); # isolate AMD XOP flag

+ &and ("ecx",0xfffff7ff); # force 11th bit to 0

+ &mov ("esi","edx");

+ &or ("ebp","ecx"); # merge AMD XOP flag

+ &bt ("ecx",27); # check OSXSAVE bit

+ &jnc (&label("clear_avx"));

+ &xor ("ecx","ecx");

+ &data_byte(0x0f,0x01,0xd0); # xgetbv

+ &and ("eax",6);

+ &cmp ("eax",6);

+ &je (&label("done"));

+ &cmp ("eax",2);

+ &je (&label("clear_avx"));

+&set_label("clear_xmm");

+ &and ("ebp",0xfdfffffd); # clear AESNI and PCLMULQDQ bits

+ &and ("esi",0xfeffffff); # clear FXSR

+&set_label("clear_avx");

+ &and ("ebp",0xefffe7ff); # clear AVX, FMA and AMD XOP bits

&set_label("done");

- &mov ("eax","edx");

- &mov ("edx","ecx");

+ &mov ("eax","esi");

+ &mov ("edx","ebp");

+&set_label("nocpuid");

&function_end("OPENSSL_ia32_cpuid");

-# (*) on Core2 this value is set to 2 denoting the fact that L2

-# cache is shared between cores.

&external_label("OPENSSL_ia32cap_P");

@@ -81,7 +167,7 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }

&jnz (&label("nohalt")); # not enough privileges

&pushf ();

- &pop ("eax")

+ &pop ("eax");

&bt ("eax",9);

&jnc (&label("nohalt")); # interrupts are disabled

@@ -146,8 +232,9 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }

&bt (&DWP(0,"ecx"),1);

&jnc (&label("no_x87"));

if ($sse2) {

- &bt (&DWP(0,"ecx"),26);

- &jnc (&label("no_sse2"));

+ &and ("ecx",1<<26|1<<24); # check SSE2 and FXSR bits

+ &cmp ("ecx",1<<26|1<<24);

+ &jne (&label("no_sse2"));

&pxor ("xmm0","xmm0");

&pxor ("xmm1","xmm1");

&pxor ("xmm2","xmm2");

@@ -195,7 +282,7 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }

# arguments is 1 or 2!

&function_begin_B("OPENSSL_indirect_call");

{

- my $i,$max=7; # $max has to be chosen as 4*n-1

+ my ($max,$i)=(7,); # $max has to be chosen as 4*n-1

# in order to preserve eventual

# stack alignment

&push ("ebp");

@@ -220,6 +307,18 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }

}

&function_end_B("OPENSSL_indirect_call");

+&function_begin_B("OPENSSL_ia32_rdrand");

+ &mov ("ecx",8);

+&set_label("loop");

+ &rdrand ("eax");

+ &jc (&label("break"));

+ &loop (&label("loop"));

+&set_label("break");

+ &cmp ("eax",0);

+ &cmove ("eax","ecx");

+ &ret ();

+&function_end_B("OPENSSL_ia32_rdrand");

&initseg("OPENSSL_cpuid_setup");

&asm_finish();