author	George Koehler <gkoehler@cvs.openbsd.org>	2022-10-22 00:58:57 +0000
committer	George Koehler <gkoehler@cvs.openbsd.org>	2022-10-22 00:58:57 +0000
commit	02cd8542c783cc10b6df65048a09c71fb4fce783 (patch)
tree	7b981a9c5fc22c068e782307d867aaef0fe3fec0 /sys/arch/powerpc
parent	ebdd629c20fba9b043a7e3387800de2d01a9ba53 (diff)
AltiVec assist for subnormal floats in vectors
The old CPU in a macppc traps AltiVec instructions when they encounter denormal or subnormal floats. Emulate most of them. They operate on vectors of 4 single-precision floats. The emulations either use scalar operations (so vmaddfp becomes 4 of fmadds) or a formula (like vrsqrtefp's 1 / sqrt(b) = 1 / sqrt(b * 2**126) * 2**63).

Some instructions (at least vrfin, vrfiz, vrfip, vrfim) are not yet emulated; any instruction without an emulation still causes SIGFPE.

Mac OS never emulated these instructions, but set AltiVec's "non-Java" NJ bit (which flushes all subnormal floats to zero). FreeBSD also sets NJ; NetBSD delivers SIGFPE; Linux emulates them. The POWER9 running OpenBSD/powerpc64 handles these floats in hardware (without trapping).

ok kettenis@ miod@
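The scaling identity behind the vrsqrtefp emulation can be checked with scalar C; a minimal sketch (not part of the commit), relying on 126 being even so that sqrt(2**126) is exactly 2**63:

	#include <math.h>
	#include <stdio.h>

	int
	main(void)
	{
		float b = 0x1p-130f;	/* a subnormal single-precision float */
		/* 1 / sqrt(b) == 1 / sqrt(b * 2**126) * 2**63 */
		float direct = 1.0f / sqrtf(b);
		float scaled = 1.0f / sqrtf(b * 0x1p126f) * 0x1p63f;
		printf("%a == %a\n", direct, scaled);	/* both print 0x1p+65 */
		return 0;
	}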
Diffstat (limited to 'sys/arch/powerpc')
-rw-r--r--	sys/arch/powerpc/conf/files.powerpc	3
-rw-r--r--	sys/arch/powerpc/include/trap.h	5
-rw-r--r--	sys/arch/powerpc/powerpc/trap.c	95
-rw-r--r--	sys/arch/powerpc/powerpc/vecast.S	476
4 files changed, 574 insertions, 5 deletions
diff --git a/sys/arch/powerpc/conf/files.powerpc b/sys/arch/powerpc/conf/files.powerpc
index a05878c88e2..43ded9eb9c1 100644
--- a/sys/arch/powerpc/conf/files.powerpc
+++ b/sys/arch/powerpc/conf/files.powerpc
@@ -1,4 +1,4 @@
-# $OpenBSD: files.powerpc,v 1.55 2018/01/25 15:06:29 mpi Exp $
+# $OpenBSD: files.powerpc,v 1.56 2022/10/22 00:58:56 gkoehler Exp $
#
file arch/powerpc/powerpc/setjmp.S ddb
@@ -16,6 +16,7 @@ file arch/powerpc/powerpc/vm_machdep.c
file arch/powerpc/powerpc/lock_machdep.c multiprocessor
file arch/powerpc/powerpc/intr.c
file arch/powerpc/powerpc/softintr.c
+file arch/powerpc/powerpc/vecast.S altivec
file arch/powerpc/ddb/db_memrw.c ddb
file arch/powerpc/ddb/db_disasm.c ddb
diff --git a/sys/arch/powerpc/include/trap.h b/sys/arch/powerpc/include/trap.h
index 420729a3ed5..ba86f95c6d5 100644
--- a/sys/arch/powerpc/include/trap.h
+++ b/sys/arch/powerpc/include/trap.h
@@ -1,4 +1,4 @@
-/* $OpenBSD: trap.h,v 1.8 2022/05/19 05:43:48 miod Exp $ */
+/* $OpenBSD: trap.h,v 1.9 2022/10/22 00:58:56 gkoehler Exp $ */
/* $NetBSD: trap.h,v 1.1 1996/09/30 16:34:35 ws Exp $ */
/*
@@ -51,7 +51,8 @@
#define EXC_VEC 0x0f20 /* AltiVec Unavailable */
#define EXC_BPT 0x1300 /* Instruction Breakpoint */
#define EXC_SMI 0x1400 /* System Management Interrupt */
-#define EXC_VECAST 0x1600 /* AltiVec Assist */
+#define EXC_VECAST_G4 0x1600 /* AltiVec Assist */
+#define EXC_VECAST_G5 0x1700 /* AltiVec Assist */
/* And these are only on the 603: */
#define EXC_IMISS 0x1000 /* Instruction translation miss */
diff --git a/sys/arch/powerpc/powerpc/trap.c b/sys/arch/powerpc/powerpc/trap.c
index 990615fbae4..da68970686b 100644
--- a/sys/arch/powerpc/powerpc/trap.c
+++ b/sys/arch/powerpc/powerpc/trap.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: trap.c,v 1.125 2022/01/21 14:07:06 tobhe Exp $ */
+/* $OpenBSD: trap.c,v 1.126 2022/10/22 00:58:56 gkoehler Exp $ */
/* $NetBSD: trap.c,v 1.3 1996/10/13 03:31:37 christos Exp $ */
/*
@@ -67,6 +67,8 @@ void trap(struct trapframe *frame);
#define MOREARGS(sp) ((caddr_t)((int)(sp) + 8)) /* more args go here */
#ifdef ALTIVEC
+static int altivec_assist(void *);
+
/*
* Save state of the vector processor. This is done lazily in the hope
* that few processes in the system will be using the vector unit
@@ -511,7 +513,14 @@ brain_damage:
break;
#endif
- case EXC_VECAST|EXC_USER:
+ case EXC_VECAST_G4|EXC_USER:
+ case EXC_VECAST_G5|EXC_USER:
+#ifdef ALTIVEC
+ if (altivec_assist((void *)frame->srr0) == 0) {
+ frame->srr0 += 4;
+ break;
+ }
+#endif
sv.sival_int = frame->srr0;
trapsignal(p, SIGFPE, 0, FPE_FLTRES, sv);
break;
@@ -638,3 +647,85 @@ fix_unaligned(struct proc *p, struct trapframe *frame)
}
return -1;
}
+
+#ifdef ALTIVEC
+static int
+altivec_assist(void *user_pc)
+{
+ /* These labels are in vecast.S */
+ void vecast_asm(uint32_t, void *);
+ void vecast_vaddfp(void);
+ void vecast_vsubfp(void);
+ void vecast_vmaddfp(void);
+ void vecast_vnmsubfp(void);
+ void vecast_vrefp(void);
+ void vecast_vrsqrtefp(void);
+ void vecast_vlogefp(void);
+ void vecast_vexptefp(void);
+ void vecast_vctuxs(void);
+ void vecast_vctsxs(void);
+
+ uint32_t insn, op, va, vc, lo;
+ void (*lab)(void);
+
+ if (copyin(user_pc, &insn, sizeof(insn)) != 0)
+ return -1;
+ op = (insn & 0xfc000000) >> 26; /* primary opcode */
+ va = (insn & 0x001f0000) >> 16; /* vector A */
+ vc = (insn & 0x000007c0) >> 6; /* vector C or extended opcode */
+ lo = insn & 0x0000003f; /* extended opcode */
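+	/*
+	 * VX-form instructions carry an 11-bit extended opcode in
+	 * bits 21 to 31; it lands here as vc (high 5 bits) and lo
+	 * (low 6 bits).  For example, vrsqrtefp's extended opcode
+	 * 330 (binary 00101 001010) decodes as vc = 5, lo = 10.
+	 */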
+
+ /* Stop if this isn't an altivec instruction. */
+ if (op != 4)
+ return -1;
+
+ /* Decide which instruction to emulate. */
+ lab = NULL;
+ switch (lo) {
+ case 10:
+ switch (vc) {
+ case 0:
+ lab = vecast_vaddfp;
+ break;
+ case 1:
+ lab = vecast_vsubfp;
+ break;
+ case 4:
+ if (va == 0)
+ lab = vecast_vrefp;
+ break;
+ case 5:
+ if (va == 0)
+ lab = vecast_vrsqrtefp;
+ break;
+ case 6:
+ if (va == 0)
+ lab = vecast_vexptefp;
+ break;
+ case 7:
+ if (va == 0)
+ lab = vecast_vlogefp;
+ break;
+ case 14:
+ lab = vecast_vctuxs;
+ break;
+ case 15:
+ lab = vecast_vctsxs;
+ break;
+ }
+ break;
+ case 46:
+ lab = vecast_vmaddfp;
+ break;
+ case 47:
+ lab = vecast_vnmsubfp;
+ break;
+ }
+
+ if (lab) {
+ vecast_asm(insn, lab); /* Emulate it. */
+ return 0;
+ } else
+ return -1;
+}
+#endif
diff --git a/sys/arch/powerpc/powerpc/vecast.S b/sys/arch/powerpc/powerpc/vecast.S
new file mode 100644
index 00000000000..a4b1032bdbc
--- /dev/null
+++ b/sys/arch/powerpc/powerpc/vecast.S
@@ -0,0 +1,476 @@
+/* $OpenBSD: vecast.S,v 1.1 2022/10/22 00:58:56 gkoehler Exp $ */
+
+/*
+ * Copyright (c) 2022 George Koehler <gkoehler@openbsd.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <machine/asm.h>
+#include <machine/psl.h>
+
+/*
+ * To load or store an arbitrary AltiVec register, we extract its
+ * number from the instruction and multiply it by 8. We do both using
+ * rlwinm to rotate it left into bits 24 to 28.
+ *
+ * 0 10 15 20 24 28
+ * | | | | | |
+ * 000100dddddaaaaabbbbbcccccxxxxxx
+ */
+#define VD_ROTATE 14, 24, 28
+#define VA_ROTATE 19, 24, 28
+#define VB_ROTATE 24, 24, 28
+#define VC_ROTATE 29, 24, 28
+
+/*
+ * vctuxs, vctsxs have an unsigned immediate UI in bits 11 to 15. We
+ * extract it into bits 4 to 8, then add FLOAT_1_IS to make 2**UI.
+ */
+#define UI_ROTATE 7, 4, 8
+#define FLOAT_1_IS 0x3f80 /* (float 1) >> 16 */
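+/*
+ * For illustration: UI = 3 rotates to 3 << 23 = 0x01800000, and
+ * adding FLOAT_1_IS << 16 = 0x3f800000 yields 0x41000000, the
+ * single-precision encoding of 2**3 = 8.0.
+ */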
+
+ .rodata
+ .balign 4
+.Lzero: .float 0
+.Lone: .float 1
+.Ln126: .float 126
+.Ltwo63: .float 0x1p63
+.Ltwo126: .float 0x1p126
+.Lmin: .float 0x1p-126 /* FLT_MIN */
+
+ .text
+
+/* This is the stack frame for vecast_asm. */
+#define s_size 128
+#define s_f31 120
+#define s_f30 112
+#define s_f29 104
+#define s_f28 96
+#define s_f27 88
+#define s_f26 80
+#define s_f25 72
+#define s_f24 64
+#define s_vc 48
+#define s_vb 32
+#define s_va 16
+
+/*
+ * vecast_asm(insn r3, label r4) emulates an AltiVec instruction when
+ * it traps a denormal or subnormal float (with an AltiVec assist
+ * exception).  Such a float f has 0 < |f| < FLT_MIN = 2**-126.
+ *
+ * MPC7450 RISC Microprocessor Family Reference Manual, 7.1.2.5 Java
+ * Mode, NaNs, Denormalized Numbers, and Zeros, has a list of trapping
+ * instructions: vaddfp, vsubfp, vmaddfp, vnmsubfp, vrefp, vrsqrtefp,
+ * vlogefp, vexptefp, vctsxs, vctuxs.
+ */
+ENTRY(vecast_asm)
+ mflr %r0 /* r0 = return address */
+ RETGUARD_SETUP_LATE(vecast_asm, %r9, %r0)
+ stwu %r1, -s_size(%r1)
+ mfmsr %r5 /* r5 = old msr */
+
+ /*
+ * Borrow the vector and floating-point units. We must
+ * preserve all float and most vector registers.
+ */
+ rlwinm %r6, %r5, 0, 17, 15 /* r6 = r5 & ~PSL_EE */
+ oris %r6, %r6, PSL_VEC >> 16
+ ori %r6, %r6, PSL_FP
+ mtmsr %r6
+ isync
+
+ stfd %f31, s_f31(%r1)
+ stfd %f30, s_f30(%r1)
+ stfd %f29, s_f29(%r1)
+ stfd %f28, s_f28(%r1)
+ stfd %f27, s_f27(%r1)
+ stfd %f26, s_f26(%r1)
+ stfd %f25, s_f25(%r1)
+ stfd %f24, s_f24(%r1)
+ mffs %f31 /* f31 = old fpscr */
+
+ lis %r6, .Lzero@ha
+ la %r6, .Lzero@l(%r6) /* r6 = address of .Lzero */
+
+ /* fpscr = zero (round to nearest, no traps) */
+ lfs %f30, 0(%r6) /* f30 = zero */
+ mtfsf 255, %f30
+
+ /* All instructions do s_vb = VB now; VD = s_va at finish. */
+ rlwinm %r7, %r3, VB_ROTATE
+ la %r8, s_vb(%r1)
+ bl vecast_store_vector
+
+ mtctr %r4
+ li %r4, 4 /* r4 = 4 loop iterations */
+ bctr /* Branch to our instruction's label. */
+
+/*
+ * vaddfp: d = a + b
+ */
+ .globl vecast_vaddfp
+vecast_vaddfp:
+ rlwinm %r7, %r3, VA_ROTATE
+ la %r8, s_va(%r1)
+ bl vecast_store_vector
+
+ /* s_va = s_va + s_vb */
+ mtctr %r4
+ la %r7, (s_va - 4)(%r1)
+1: lfsu %f30, 4(%r7) /* r7 += 4, then load (r7). */
+ lfs %f29, (s_vb - s_va)(%r7)
+ fadds %f30, %f30, %f29
+ stfs %f30, 0(%r7)
+ bdnz 1b /* Loop 4 times. */
+ b vecast_finish
+
+/*
+ * vsubfp: d = a - b
+ */
+ .globl vecast_vsubfp
+vecast_vsubfp:
+ rlwinm %r7, %r3, VA_ROTATE
+ la %r8, s_va(%r1)
+ bl vecast_store_vector
+
+ /* s_va = s_va - s_vb */
+ mtctr %r4
+ la %r7, (s_va - 4)(%r1)
+1: lfsu %f30, 4(%r7)
+ lfs %f29, (s_vb - s_va)(%r7)
+ fsubs %f30, %f30, %f29
+ stfs %f30, 0(%r7)
+ bdnz 1b
+ b vecast_finish
+
+/*
+ * vmaddfp: d = a * c + b
+ */
+ .globl vecast_vmaddfp
+vecast_vmaddfp:
+ rlwinm %r7, %r3, VA_ROTATE
+ la %r8, s_va(%r1)
+ bl vecast_store_vector
+ rlwinm %r7, %r3, VC_ROTATE
+ la %r8, s_vc(%r1)
+ bl vecast_store_vector
+
+ /* s_va = s_va * s_vc + s_vb */
+ mtctr %r4
+ la %r7, (s_va - 4)(%r1)
+1: lfsu %f30, 4(%r7)
+ lfs %f29, (s_vb - s_va)(%r7)
+ lfs %f28, (s_vc - s_va)(%r7)
+ fmadds %f30, %f30, %f28, %f29
+ stfs %f30, 0(%r7)
+ bdnz 1b
+ b vecast_finish
+
+/*
+ * vnmsubfp: d = b - a * c
+ */
+ .globl vecast_vnmsubfp
+vecast_vnmsubfp:
+ rlwinm %r7, %r3, VA_ROTATE
+ la %r8, s_va(%r1)
+ bl vecast_store_vector
+ rlwinm %r7, %r3, VC_ROTATE
+ la %r8, s_vc(%r1)
+ bl vecast_store_vector
+
+ /* s_va = -(s_va * s_vc - s_vb) */
+ mtctr %r4
+ la %r7, (s_va - 4)(%r1)
+1: lfsu %f30, 4(%r7)
+ lfs %f29, (s_vb - s_va)(%r7)
+ lfs %f28, (s_vc - s_va)(%r7)
+ fnmsubs %f30, %f30, %f28, %f29
+ stfs %f30, 0(%r7)
+ bdnz 1b
+ b vecast_finish
+
+/*
+ * vrefp: d = estimate 1 / b
+ */
+ .globl vecast_vrefp
+vecast_vrefp:
+ /* s_va = estimate 1 / s_vb */
+ mtctr %r4
+ la %r7, (s_vb - 4)(%r1)
+1: lfsu %f30, 4(%r7)
+ fres %f30, %f30
+ stfs %f30, (s_va - s_vb)(%r7)
+ bdnz 1b
+ b vecast_finish
+
+/*
+ * vrsqrtefp: d = estimate 1 / sqrt(b)
+ * 1 / sqrt(b) = 1 / sqrt(b * 2**126) * 2**63 when b < 2**-126
+ *
+ * MPC7455's frsqrte does 1 / sqrt(1) = 0.984375, relative error 1/64.
+ * AltiVec must not err over 1/4096, so avoid frsqrte.
+ */
+ .globl vecast_vrsqrtefp
+vecast_vrsqrtefp:
+ /* f30 = 1; f29 = 2**63, f28 = 2**126; f27 = 2**-126 */
+ lfs %f30, (.Lone - .Lzero)(%r6)
+ lfs %f29, (.Ltwo63 - .Lzero)(%r6)
+ lfs %f28, (.Ltwo126 - .Lzero)(%r6)
+ lfs %f27, (.Lmin - .Lzero)(%r6)
+
+ /*
+ * s_vb = s_vb * 2**126, s_va = 2**63 when b < 2**-126
+ * s_va = 1 when b >= 2**-126
+ */
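+	/* fsel frt, fra, frc, frb: frt = frc if fra >= 0, else frb. */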
+ mtctr %r4
+ la %r7, (s_vb - 4)(%r1)
+1: lfsu %f26, 4(%r7)
+ fmuls %f25, %f26, %f28
+ fsubs %f24, %f26, %f27 /* f24 selects b >= 2**-126 */
+ fsel %f26, %f24, %f26, %f25 /* f26 = b or b * 2**126 */
+ stfs %f26, 0(%r7)
+ fsel %f25, %f24, %f30, %f29 /* f25 = 1 or 2**63 */
+ stfs %f25, (s_va - s_vb)(%r7)
+ bdnz 1b
+
+ /* s_vb = estimate 1 / sqrt(s_vb) */
+ la %r7, s_vc(%r1)
+ la %r8, s_vb(%r1)
+ stvx %v31, 0, %r7 /* Save v31 in s_vc. */
+ lvx %v31, 0, %r8
+ vrsqrtefp %v31, %v31
+ stvx %v31, 0, %r8
+ lvx %v31, 0, %r7
+
+ /* s_va = s_vb * s_va */
+ mtctr %r4
+ la %r7, (s_va - 4)(%r1)
+1: lfsu %f30, 4(%r7)
+ lfs %f29, (s_vb - s_va)(%r7)
+ fmuls %f30, %f29, %f30
+ stfs %f30, 0(%r7)
+ bdnz 1b
+ b vecast_finish
+
+/*
+ * vlogefp: d = estimate log2(b)
+ * log2(b) = log2(b * 2**126) - 126 when b < 2**-126
+ */
+ .globl vecast_vlogefp
+vecast_vlogefp:
+ /* f30 = 0; f29 = 126; f28 = 2**126; f27 = 2**-126 */
+ lfs %f29, (.Ln126 - .Lzero)(%r6)
+ lfs %f28, (.Ltwo126 - .Lzero)(%r6)
+ lfs %f27, (.Lmin - .Lzero)(%r6)
+
+ /*
+ * s_vb = s_vb * 2**126, s_va = 126 when s_vb < 2**-126
+ * s_va = 0 when s_vb >= 2**-126
+ */
+ mtctr %r4
+ la %r7, (s_vb - 4)(%r1)
+1: lfsu %f26, 4(%r7)
+ fmuls %f25, %f26, %f28
+ fsubs %f24, %f26, %f27 /* f24 selects b >= 2**-126 */
+ fsel %f26, %f24, %f26, %f25 /* f26 = b or b * 2**126 */
+ stfs %f26, 0(%r7)
+ fsel %f25, %f24, %f30, %f29 /* f25 = 0 or 126 */
+ stfs %f25, (s_va - s_vb)(%r7)
+ bdnz 1b
+
+ /* s_vb = estimate log2(s_vb) */
+ la %r7, s_vc(%r1)
+ la %r8, s_vb(%r1)
+ stvx %v31, 0, %r7
+ lvx %v31, 0, %r8
+ vlogefp %v31, %v31
+ stvx %v31, 0, %r8
+ lvx %v31, 0, %r7
+
+ /* s_va = s_vb - s_va */
+ mtctr %r4
+ la %r7, (s_va - 4)(%r1)
+1: lfsu %f30, 4(%r7)
+ lfs %f29, (s_vb - s_va)(%r7)
+ fsubs %f30, %f29, %f30
+ stfs %f30, 0(%r7)
+ bdnz 1b
+ b vecast_finish
+
+/*
+ * vexptefp: d = estimate 2**b
+ * 2**b = 2**(b + 126) * 2**-126 when -252 <= b < -126
+ */
+ .globl vecast_vexptefp
+vecast_vexptefp:
+ /* f30 = 1; f29 = 126; f28 = 2**-126 */
+ lfs %f30, (.Lone - .Lzero)(%r6)
+ lfs %f29, (.Ln126 - .Lzero)(%r6)
+ lfs %f28, (.Lmin - .Lzero)(%r6)
+
+ /*
+ * s_vb = s_vb + 126 when -252 <= b < -126
+ * s_va = 2**-126 when b < -126
+ * s_va = 1 when b >= -126
+ *
+ * If b < -252, we avoid a possibly subnormal 2**(b + 126)
+ * by calculating 2**b * 2**-126 = 0 * 2**-126 = 0.
+ */
+ mtctr %r4
+ la %r7, (s_vb - 4)(%r1)
+1: lfsu %f27, 4(%r7)
+ fadds %f26, %f27, %f29 /* f26 selects b >= -126 */
+ fadds %f25, %f26, %f29 /* f25 selects b >= -252 */
+	fsel %f24, %f26, %f27, %f26 /* f24 = b if b >= -126, else b + 126 */
+ fsel %f24, %f25, %f24, %f27 /* f24 = b or b + 126 */
+ stfs %f24, 0(%r7)
+ fsel %f27, %f26, %f30, %f28 /* f27 = 1 or 2**-126 */
+ stfs %f27, (s_va - s_vb)(%r7)
+ bdnz 1b
+
+ /* s_vb = estimate 2**s_vb */
+ la %r7, s_vc(%r1)
+ la %r8, s_vb(%r1)
+ stvx %v31, 0, %r7
+ lvx %v31, 0, %r8
+ vexptefp %v31, %v31
+ stvx %v31, 0, %r8
+ lvx %v31, 0, %r7
+
+ /* s_va = s_vb * s_va */
+ mtctr %r4
+ la %r7, (s_va - 4)(%r1)
+1: lfsu %f30, 4(%r7)
+ lfs %f29, (s_vb - s_va)(%r7)
+ fmuls %f30, %f29, %f30
+ stfs %f30, 0(%r7)
+ bdnz 1b
+ b vecast_finish
+
+/*
+ * vctsxs: d = (int32_t)(b * 2**u) where 0 <= u < 32
+ * d = 0 when |b| < 2**-126
+ */
+ .globl vecast_vctsxs
+vecast_vctsxs:
+ /* f30 = 0; f29 = 2**-126; f28 = 2**u */
+ lfs %f29, (.Lmin - .Lzero)(%r6)
+ rlwinm %r7, %r3, UI_ROTATE
+ addis %r7, %r7, FLOAT_1_IS
+ stw %r7, s_va(%r1)
+ lfs %f28, s_va(%r1)
+
+ /* s_va = s_vb * 2**u, unless b is tiny. */
+ mtctr %r4
+ la %r7, (s_vb - 4)(%r1)
+1: lfsu %f27, 4(%r7)
+ fmuls %f26, %f27, %f28
+ fabs %f27, %f27
+ fsubs %f27, %f27, %f29 /* f27 selects |b| >= 2**-126 */
+ fsel %f26, %f27, %f26, %f30 /* f26 = b * 2**u or 0 */
+ stfs %f26, (s_va - s_vb)(%r7)
+ bdnz 1b
+
+ /* s_va = (int32_t)b */
+ la %r7, s_vc(%r1)
+ la %r8, s_va(%r1)
+ stvx %v31, 0, %r7
+ lvx %v31, 0, %r8
+ vctsxs %v31, %v31, 0 /* May set SAT in vscr. */
+ stvx %v31, 0, %r8
+ lvx %v31, 0, %r7
+ b vecast_finish
+
+/*
+ * vctuxs: d = (uint32_t)(b * 2**u) where 0 <= u < 32
+ * d = 0 when |b| < 2**-126
+ */
+ .globl vecast_vctuxs
+vecast_vctuxs:
+ /* f30 = 0; f29 = 2**-126; f28 = 2**u */
+ lfs %f29, (.Lmin - .Lzero)(%r6)
+ rlwinm %r7, %r3, UI_ROTATE
+ addis %r7, %r7, FLOAT_1_IS
+ stw %r7, s_va(%r1)
+ lfs %f28, s_va(%r1)
+
+ /* s_va = s_vb * 2**u, unless b is tiny. */
+ mtctr %r4
+ la %r7, (s_vb - 4)(%r1)
+1: lfsu %f27, 4(%r7)
+ fmuls %f26, %f27, %f28
+ fabs %f27, %f27
+ fsubs %f27, %f27, %f29 /* f27 selects |b| >= 2**-126 */
+ fsel %f26, %f27, %f26, %f30 /* f26 = b * 2**u or 0 */
+ stfs %f26, (s_va - s_vb)(%r7)
+ bdnz 1b
+
+ /* s_va = (uint32_t)b */
+ la %r7, s_vc(%r1)
+ la %r8, s_va(%r1)
+ stvx %v31, 0, %r7
+ lvx %v31, 0, %r8
+ vctuxs %v31, %v31, 0 /* May set SAT in vscr. */
+ stvx %v31, 0, %r8
+ lvx %v31, 0, %r7
+	/* fall through to vecast_finish */
+
+vecast_finish:
+ /* VD = s_va */
+ rlwinm %r7, %r3, VD_ROTATE
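+	/* r7 = 8 * VD; each jump-table entry below (lvx + b) is 8 bytes. */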
+ addis %r7, %r7, 1f@ha
+ addi %r7, %r7, 1f@l
+ mtctr %r7
+ la %r8, s_va(%r1)
+ bctr
+#define M(n) lvx %v##n, 0, %r8; b 2f
+1: M( 0); M( 1); M( 2); M( 3); M( 4); M( 5); M( 6); M( 7)
+ M( 8); M( 9); M(10); M(11); M(12); M(13); M(14); M(15)
+ M(16); M(17); M(18); M(19); M(20); M(21); M(22); M(23)
+ M(24); M(25); M(26); M(27); M(28); M(29); M(30); M(31)
+#undef M
+2: mtlr %r0
+ mtfsf 255, %f31 /* Restore old fpscr. */
+ lfd %f24, s_f24(%r1)
+ lfd %f25, s_f25(%r1)
+ lfd %f26, s_f26(%r1)
+ lfd %f27, s_f27(%r1)
+ lfd %f28, s_f28(%r1)
+ lfd %f29, s_f29(%r1)
+ lfd %f30, s_f30(%r1)
+ lfd %f31, s_f31(%r1)
+ mtmsr %r5 /* Restore old msr. */
+ isync
+ addi %r1, %r1, s_size
+ RETGUARD_CHECK(vecast_asm, %r9, %r0)
+ blr
+
+/*
+ * Stores vector v(r7 / 8) to address r8.
+ */
+vecast_store_vector:
+ RETGUARD_SETUP(vecast_store_vector, %r11, %r12)
+ addis %r7, %r7, 1f@ha
+ addi %r7, %r7, 1f@l
+ mtctr %r7
+ bctr
+#define M(n) stvx %v##n, 0, %r8; b 2f
+1: M( 0); M( 1); M( 2); M( 3); M( 4); M( 5); M( 6); M( 7)
+ M( 8); M( 9); M(10); M(11); M(12); M(13); M(14); M(15)
+ M(16); M(17); M(18); M(19); M(20); M(21); M(22); M(23)
+ M(24); M(25); M(26); M(27); M(28); M(29); M(30); M(31)
+#undef M
+2: RETGUARD_CHECK(vecast_store_vector, %r11, %r12)
+ blr
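A user-space sketch (not part of the commit) that should exercise the emulated vmaddfp path on a G4; it assumes the compiler's AltiVec intrinsics from <altivec.h> and compilation with -maltivec:

	#include <altivec.h>
	#include <stdio.h>

	int
	main(void)
	{
		/* 0x1p-130f is subnormal; on a G4, vmaddfp traps to the assist. */
		vector float a = { 0x1p-130f, 1.0f, 2.0f, 3.0f };
		vector float c = { 0x1p100f, 2.0f, 2.0f, 2.0f };
		vector float b = { 0.0f, 0.0f, 0.0f, 0.0f };
		vector float d = vec_madd(a, c, b);	/* d = a * c + b */
		float out[4] __attribute__((aligned(16)));

		vec_st(d, 0, out);
		printf("%a %a %a %a\n", out[0], out[1], out[2], out[3]);
		return 0;
	}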