summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorTheo de Raadt <deraadt@cvs.openbsd.org>1999-12-07 00:49:11 +0000
committerTheo de Raadt <deraadt@cvs.openbsd.org>1999-12-07 00:49:11 +0000
commit09517c7586448c4de5e68230335e44d1a2ae8ad9 (patch)
tree2d3da8f4022ae3266513662db1601fca33dca637
parent739b5aab7680cf40ec4d92e9a9648898ddec3b9b (diff)
move .mul and .umul into the kernel. if v8 sparc is detected, replace with
the equivelant 2-instruction sequence.
-rw-r--r--sys/arch/sparc/sparc/autoconf.c11
-rw-r--r--sys/arch/sparc/sparc/locore.s274
-rw-r--r--sys/lib/libkern/arch/sparc/Makefile.inc6
-rw-r--r--sys/lib/libkern/arch/sparc/mul.S160
-rw-r--r--sys/lib/libkern/arch/sparc/umul.S193
5 files changed, 286 insertions, 358 deletions
diff --git a/sys/arch/sparc/sparc/autoconf.c b/sys/arch/sparc/sparc/autoconf.c
index 68afc63f0d8..814e1bc83d7 100644
--- a/sys/arch/sparc/sparc/autoconf.c
+++ b/sys/arch/sparc/sparc/autoconf.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: autoconf.c,v 1.31 1999/09/03 18:01:57 art Exp $ */
+/* $OpenBSD: autoconf.c,v 1.32 1999/12/07 00:49:07 deraadt Exp $ */
/* $NetBSD: autoconf.c,v 1.73 1997/07/29 09:41:53 fair Exp $ */
/*
@@ -286,6 +286,15 @@ bootstrap()
extern void setpte4m __P((u_int, u_int));
extern struct timer_4m *timerreg_4m;
extern struct counter_4m *counterreg_4m;
+ extern void *_umulreplace, *_umul, *_umulreplace_end;
+ extern void *_mulreplace, *_mul, *_mulreplace_end;
+
+ /*
+ * Whack the slow sun4/sun4c umul/mul functions with
+ * fast V8 ones
+ */
+ bcopy(_umulreplace, _umul, _umulreplace_end-_umulreplace);
+ bcopy(_mulreplace, _mul, _mulreplace_end-_mulreplace);
if ((node = opennode("/obio/interrupt")) == 0)
if ((node=search_prom(findroot(),"interrupt"))==0)
diff --git a/sys/arch/sparc/sparc/locore.s b/sys/arch/sparc/sparc/locore.s
index c97c659d2ff..2b98b97deb6 100644
--- a/sys/arch/sparc/sparc/locore.s
+++ b/sys/arch/sparc/sparc/locore.s
@@ -1,4 +1,4 @@
-/* $OpenBSD: locore.s,v 1.27 1999/11/13 00:11:52 deraadt Exp $ */
+/* $OpenBSD: locore.s,v 1.28 1999/12/07 00:49:07 deraadt Exp $ */
/* $NetBSD: locore.s,v 1.73 1997/09/13 20:36:48 pk Exp $ */
/*
@@ -5895,6 +5895,278 @@ ENTRY(ffs)
add %o0, 24, %o0
/*
+ * V8 sparc mul/umul replacements.
+ */
+.globl __mulreplace, __mulreplace_end
+__mulreplace:
+ retl
+ smul %o0, %o1, %o0
+__mulreplace_end:
+
+.globl __umulreplace, __umulreplace_end
+__umulreplace:
+ retl
+ umul %o0, %o1, %o0
+__umulreplace_end:
+
+/*
+ * Signed multiply, from Appendix E of the Sparc Version 8
+ * Architecture Manual.
+ *
+ * Returns %o0 * %o1 in %o1%o0 (i.e., %o1 holds the upper 32 bits of
+ * the 64-bit product).
+ *
+ * This code optimizes short (less than 13-bit) multiplies.
+ */
+.globl .mul, __mul
+.mul:
+__mul:
+ mov %o0, %y ! multiplier -> Y
+ andncc %o0, 0xfff, %g0 ! test bits 12..31
+ be Lmul_shortway ! if zero, can do it the short way
+ andcc %g0, %g0, %o4 ! zero the partial product and clear N and V
+
+ /*
+ * Long multiply. 32 steps, followed by a final shift step.
+ */
+ mulscc %o4, %o1, %o4 ! 1
+ mulscc %o4, %o1, %o4 ! 2
+ mulscc %o4, %o1, %o4 ! 3
+ mulscc %o4, %o1, %o4 ! 4
+ mulscc %o4, %o1, %o4 ! 5
+ mulscc %o4, %o1, %o4 ! 6
+ mulscc %o4, %o1, %o4 ! 7
+ mulscc %o4, %o1, %o4 ! 8
+ mulscc %o4, %o1, %o4 ! 9
+ mulscc %o4, %o1, %o4 ! 10
+ mulscc %o4, %o1, %o4 ! 11
+ mulscc %o4, %o1, %o4 ! 12
+ mulscc %o4, %o1, %o4 ! 13
+ mulscc %o4, %o1, %o4 ! 14
+ mulscc %o4, %o1, %o4 ! 15
+ mulscc %o4, %o1, %o4 ! 16
+ mulscc %o4, %o1, %o4 ! 17
+ mulscc %o4, %o1, %o4 ! 18
+ mulscc %o4, %o1, %o4 ! 19
+ mulscc %o4, %o1, %o4 ! 20
+ mulscc %o4, %o1, %o4 ! 21
+ mulscc %o4, %o1, %o4 ! 22
+ mulscc %o4, %o1, %o4 ! 23
+ mulscc %o4, %o1, %o4 ! 24
+ mulscc %o4, %o1, %o4 ! 25
+ mulscc %o4, %o1, %o4 ! 26
+ mulscc %o4, %o1, %o4 ! 27
+ mulscc %o4, %o1, %o4 ! 28
+ mulscc %o4, %o1, %o4 ! 29
+ mulscc %o4, %o1, %o4 ! 30
+ mulscc %o4, %o1, %o4 ! 31
+ mulscc %o4, %o1, %o4 ! 32
+ mulscc %o4, %g0, %o4 ! final shift
+
+ ! If %o0 was negative, the result is
+ ! (%o0 * %o1) + (%o1 << 32))
+ ! We fix that here.
+
+ tst %o0
+ bge 1f
+ rd %y, %o0
+
+ ! %o0 was indeed negative; fix upper 32 bits of result by subtracting
+ ! %o1 (i.e., return %o4 - %o1 in %o1).
+ retl
+ sub %o4, %o1, %o1
+
+1:
+ retl
+ mov %o4, %o1
+
+Lmul_shortway:
+ /*
+ * Short multiply. 12 steps, followed by a final shift step.
+ * The resulting bits are off by 12 and (32-12) = 20 bit positions,
+ * but there is no problem with %o0 being negative (unlike above).
+ */
+ mulscc %o4, %o1, %o4 ! 1
+ mulscc %o4, %o1, %o4 ! 2
+ mulscc %o4, %o1, %o4 ! 3
+ mulscc %o4, %o1, %o4 ! 4
+ mulscc %o4, %o1, %o4 ! 5
+ mulscc %o4, %o1, %o4 ! 6
+ mulscc %o4, %o1, %o4 ! 7
+ mulscc %o4, %o1, %o4 ! 8
+ mulscc %o4, %o1, %o4 ! 9
+ mulscc %o4, %o1, %o4 ! 10
+ mulscc %o4, %o1, %o4 ! 11
+ mulscc %o4, %o1, %o4 ! 12
+ mulscc %o4, %g0, %o4 ! final shift
+
+ /*
+ * %o4 has 20 of the bits that should be in the low part of the
+ * result; %y has the bottom 12 (as %y's top 12). That is:
+ *
+ * %o4 %y
+ * +----------------+----------------+
+ * | -12- | -20- | -12- | -20- |
+ * +------(---------+------)---------+
+ * --hi-- ----low-part----
+ *
+ * The upper 12 bits of %o4 should be sign-extended to form the
+ * high part of the product (i.e., highpart = %o4 >> 20).
+ */
+
+ rd %y, %o5
+ sll %o4, 12, %o0 ! shift middle bits left 12
+ srl %o5, 20, %o5 ! shift low bits right 20, zero fill at left
+ or %o5, %o0, %o0 ! construct low part of result
+ retl
+ sra %o4, 20, %o1 ! ... and extract high part of result
+
+/*
+ * Unsigned multiply. Returns %o0 * %o1 in %o1%o0 (i.e., %o1 holds the
+ * upper 32 bits of the 64-bit product).
+ *
+ * This code optimizes short (less than 13-bit) multiplies. Short
+ * multiplies require 25 instruction cycles, and long ones require
+ * 45 instruction cycles.
+ *
+ * On return, overflow has occurred (%o1 is not zero) if and only if
+ * the Z condition code is clear, allowing, e.g., the following:
+ *
+ * call .umul
+ * nop
+ * bnz overflow (or tnz)
+ */
+.globl .umul, __umul
+.umul:
+__umul:
+ or %o0, %o1, %o4
+ mov %o0, %y ! multiplier -> Y
+ andncc %o4, 0xfff, %g0 ! test bits 12..31 of *both* args
+ be Lumul_shortway ! if zero, can do it the short way
+ andcc %g0, %g0, %o4 ! zero the partial product and clear N and V
+
+ /*
+ * Long multiply. 32 steps, followed by a final shift step.
+ */
+ mulscc %o4, %o1, %o4 ! 1
+ mulscc %o4, %o1, %o4 ! 2
+ mulscc %o4, %o1, %o4 ! 3
+ mulscc %o4, %o1, %o4 ! 4
+ mulscc %o4, %o1, %o4 ! 5
+ mulscc %o4, %o1, %o4 ! 6
+ mulscc %o4, %o1, %o4 ! 7
+ mulscc %o4, %o1, %o4 ! 8
+ mulscc %o4, %o1, %o4 ! 9
+ mulscc %o4, %o1, %o4 ! 10
+ mulscc %o4, %o1, %o4 ! 11
+ mulscc %o4, %o1, %o4 ! 12
+ mulscc %o4, %o1, %o4 ! 13
+ mulscc %o4, %o1, %o4 ! 14
+ mulscc %o4, %o1, %o4 ! 15
+ mulscc %o4, %o1, %o4 ! 16
+ mulscc %o4, %o1, %o4 ! 17
+ mulscc %o4, %o1, %o4 ! 18
+ mulscc %o4, %o1, %o4 ! 19
+ mulscc %o4, %o1, %o4 ! 20
+ mulscc %o4, %o1, %o4 ! 21
+ mulscc %o4, %o1, %o4 ! 22
+ mulscc %o4, %o1, %o4 ! 23
+ mulscc %o4, %o1, %o4 ! 24
+ mulscc %o4, %o1, %o4 ! 25
+ mulscc %o4, %o1, %o4 ! 26
+ mulscc %o4, %o1, %o4 ! 27
+ mulscc %o4, %o1, %o4 ! 28
+ mulscc %o4, %o1, %o4 ! 29
+ mulscc %o4, %o1, %o4 ! 30
+ mulscc %o4, %o1, %o4 ! 31
+ mulscc %o4, %o1, %o4 ! 32
+ mulscc %o4, %g0, %o4 ! final shift
+
+
+ /*
+ * Normally, with the shift-and-add approach, if both numbers are
+ * positive you get the correct result. WIth 32-bit two's-complement
+ * numbers, -x is represented as
+ *
+ * x 32
+ * ( 2 - ------ ) mod 2 * 2
+ * 32
+ * 2
+ *
+ * (the `mod 2' subtracts 1 from 1.bbbb). To avoid lots of 2^32s,
+ * we can treat this as if the radix point were just to the left
+ * of the sign bit (multiply by 2^32), and get
+ *
+ * -x = (2 - x) mod 2
+ *
+ * Then, ignoring the `mod 2's for convenience:
+ *
+ * x * y = xy
+ * -x * y = 2y - xy
+ * x * -y = 2x - xy
+ * -x * -y = 4 - 2x - 2y + xy
+ *
+ * For signed multiplies, we subtract (x << 32) from the partial
+ * product to fix this problem for negative multipliers (see mul.s).
+ * Because of the way the shift into the partial product is calculated
+ * (N xor V), this term is automatically removed for the multiplicand,
+ * so we don't have to adjust.
+ *
+ * But for unsigned multiplies, the high order bit wasn't a sign bit,
+ * and the correction is wrong. So for unsigned multiplies where the
+ * high order bit is one, we end up with xy - (y << 32). To fix it
+ * we add y << 32.
+ */
+ tst %o1
+ bl,a 1f ! if %o1 < 0 (high order bit = 1),
+ add %o4, %o0, %o4 ! %o4 += %o0 (add y to upper half)
+1: rd %y, %o0 ! get lower half of product
+ retl
+ addcc %o4, %g0, %o1 ! put upper half in place and set Z for %o1==0
+
+Lumul_shortway:
+ /*
+ * Short multiply. 12 steps, followed by a final shift step.
+ * The resulting bits are off by 12 and (32-12) = 20 bit positions,
+ * but there is no problem with %o0 being negative (unlike above),
+ * and overflow is impossible (the answer is at most 24 bits long).
+ */
+ mulscc %o4, %o1, %o4 ! 1
+ mulscc %o4, %o1, %o4 ! 2
+ mulscc %o4, %o1, %o4 ! 3
+ mulscc %o4, %o1, %o4 ! 4
+ mulscc %o4, %o1, %o4 ! 5
+ mulscc %o4, %o1, %o4 ! 6
+ mulscc %o4, %o1, %o4 ! 7
+ mulscc %o4, %o1, %o4 ! 8
+ mulscc %o4, %o1, %o4 ! 9
+ mulscc %o4, %o1, %o4 ! 10
+ mulscc %o4, %o1, %o4 ! 11
+ mulscc %o4, %o1, %o4 ! 12
+ mulscc %o4, %g0, %o4 ! final shift
+
+ /*
+ * %o4 has 20 of the bits that should be in the result; %y has
+ * the bottom 12 (as %y's top 12). That is:
+ *
+ * %o4 %y
+ * +----------------+----------------+
+ * | -12- | -20- | -12- | -20- |
+ * +------(---------+------)---------+
+ * -----result-----
+ *
+ * The 12 bits of %o4 left of the `result' area are all zero;
+ * in fact, all top 20 bits of %o4 are zero.
+ */
+
+ rd %y, %o5
+ sll %o4, 12, %o0 ! shift middle bits left 12
+ srl %o5, 20, %o5 ! shift low bits right 20
+ or %o5, %o0, %o0
+ retl
+ addcc %g0, %g0, %o1 ! %o1 = zero, and set Z
+
+/*
* Here is a very good random number generator. This implementation is
* based on ``Two Fast Implementations of the "Minimal Standard" Random
* Number Generator", David G. Carta, Communications of the ACM, Jan 1990,
diff --git a/sys/lib/libkern/arch/sparc/Makefile.inc b/sys/lib/libkern/arch/sparc/Makefile.inc
index 49da2f42b7c..3d14262e7a8 100644
--- a/sys/lib/libkern/arch/sparc/Makefile.inc
+++ b/sys/lib/libkern/arch/sparc/Makefile.inc
@@ -1,4 +1,4 @@
-# $OpenBSD: Makefile.inc,v 1.5 1998/06/02 20:38:26 deraadt Exp $
+# $OpenBSD: Makefile.inc,v 1.6 1999/12/07 00:49:10 deraadt Exp $
# $NetBSD: Makefile.inc,v 1.12 1996/04/23 23:05:22 christos Exp $
SRCS+= __main.c imax.c imin.c lmax.c lmin.c max.c min.c ulmax.c ulmin.c \
@@ -7,8 +7,8 @@ SRCS+= __main.c imax.c imin.c lmax.c lmin.c max.c min.c ulmax.c ulmin.c \
strncpy.c htonl.S htons.S ntohl.S ntohs.S scanc.c skpc.c \
strncasecmp.c
-SRCS+= umul.S mul.S rem.S sdiv.S udiv.S umul.S urem.S
-SRCS+= mul.S saveregs.S umul.S
+SRCS+= rem.S sdiv.S udiv.S urem.S
+SRCS+= saveregs.S
# `source' files built from m4 source
# the name `div.o' is taken for the ANSI C `div' function, hence sdiv here
diff --git a/sys/lib/libkern/arch/sparc/mul.S b/sys/lib/libkern/arch/sparc/mul.S
deleted file mode 100644
index 028237f2c47..00000000000
--- a/sys/lib/libkern/arch/sparc/mul.S
+++ /dev/null
@@ -1,160 +0,0 @@
-/* $OpenBSD: mul.S,v 1.2 1997/11/07 15:57:37 niklas Exp $ */
-/* $NetBSD: mul.S,v 1.2 1994/10/26 06:40:01 cgd Exp $ */
-
-/*
- * Copyright (c) 1992, 1993
- * The Regents of the University of California. All rights reserved.
- *
- * This software was developed by the Computer Systems Engineering group
- * at Lawrence Berkeley Laboratory under DARPA contract BG 91-66 and
- * contributed to Berkeley.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 3. All advertising materials mentioning features or use of this software
- * must display the following acknowledgement:
- * This product includes software developed by the University of
- * California, Berkeley and its contributors.
- * 4. Neither the name of the University nor the names of its contributors
- * may be used to endorse or promote products derived from this software
- * without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- * Header: mul.s,v 1.5 92/06/25 13:24:03 torek Exp
- */
-
-#if defined(LIBC_SCCS) && !defined(lint)
-#ifdef notdef
- .asciz "@(#)mul.s 8.1 (Berkeley) 6/4/93"
-#endif
- .asciz "$OpenBSD: mul.S,v 1.2 1997/11/07 15:57:37 niklas Exp $"
-#endif /* LIBC_SCCS and not lint */
-
-/*
- * Signed multiply, from Appendix E of the Sparc Version 8
- * Architecture Manual.
- *
- * Returns %o0 * %o1 in %o1%o0 (i.e., %o1 holds the upper 32 bits of
- * the 64-bit product).
- *
- * This code optimizes short (less than 13-bit) multiplies.
- */
-
-#include "DEFS.h"
-FUNC(.mul)
- mov %o0, %y ! multiplier -> Y
- andncc %o0, 0xfff, %g0 ! test bits 12..31
- be Lmul_shortway ! if zero, can do it the short way
- andcc %g0, %g0, %o4 ! zero the partial product and clear N and V
-
- /*
- * Long multiply. 32 steps, followed by a final shift step.
- */
- mulscc %o4, %o1, %o4 ! 1
- mulscc %o4, %o1, %o4 ! 2
- mulscc %o4, %o1, %o4 ! 3
- mulscc %o4, %o1, %o4 ! 4
- mulscc %o4, %o1, %o4 ! 5
- mulscc %o4, %o1, %o4 ! 6
- mulscc %o4, %o1, %o4 ! 7
- mulscc %o4, %o1, %o4 ! 8
- mulscc %o4, %o1, %o4 ! 9
- mulscc %o4, %o1, %o4 ! 10
- mulscc %o4, %o1, %o4 ! 11
- mulscc %o4, %o1, %o4 ! 12
- mulscc %o4, %o1, %o4 ! 13
- mulscc %o4, %o1, %o4 ! 14
- mulscc %o4, %o1, %o4 ! 15
- mulscc %o4, %o1, %o4 ! 16
- mulscc %o4, %o1, %o4 ! 17
- mulscc %o4, %o1, %o4 ! 18
- mulscc %o4, %o1, %o4 ! 19
- mulscc %o4, %o1, %o4 ! 20
- mulscc %o4, %o1, %o4 ! 21
- mulscc %o4, %o1, %o4 ! 22
- mulscc %o4, %o1, %o4 ! 23
- mulscc %o4, %o1, %o4 ! 24
- mulscc %o4, %o1, %o4 ! 25
- mulscc %o4, %o1, %o4 ! 26
- mulscc %o4, %o1, %o4 ! 27
- mulscc %o4, %o1, %o4 ! 28
- mulscc %o4, %o1, %o4 ! 29
- mulscc %o4, %o1, %o4 ! 30
- mulscc %o4, %o1, %o4 ! 31
- mulscc %o4, %o1, %o4 ! 32
- mulscc %o4, %g0, %o4 ! final shift
-
- ! If %o0 was negative, the result is
- ! (%o0 * %o1) + (%o1 << 32))
- ! We fix that here.
-
- tst %o0
- bge 1f
- rd %y, %o0
-
- ! %o0 was indeed negative; fix upper 32 bits of result by subtracting
- ! %o1 (i.e., return %o4 - %o1 in %o1).
- retl
- sub %o4, %o1, %o1
-
-1:
- retl
- mov %o4, %o1
-
-Lmul_shortway:
- /*
- * Short multiply. 12 steps, followed by a final shift step.
- * The resulting bits are off by 12 and (32-12) = 20 bit positions,
- * but there is no problem with %o0 being negative (unlike above).
- */
- mulscc %o4, %o1, %o4 ! 1
- mulscc %o4, %o1, %o4 ! 2
- mulscc %o4, %o1, %o4 ! 3
- mulscc %o4, %o1, %o4 ! 4
- mulscc %o4, %o1, %o4 ! 5
- mulscc %o4, %o1, %o4 ! 6
- mulscc %o4, %o1, %o4 ! 7
- mulscc %o4, %o1, %o4 ! 8
- mulscc %o4, %o1, %o4 ! 9
- mulscc %o4, %o1, %o4 ! 10
- mulscc %o4, %o1, %o4 ! 11
- mulscc %o4, %o1, %o4 ! 12
- mulscc %o4, %g0, %o4 ! final shift
-
- /*
- * %o4 has 20 of the bits that should be in the low part of the
- * result; %y has the bottom 12 (as %y's top 12). That is:
- *
- * %o4 %y
- * +----------------+----------------+
- * | -12- | -20- | -12- | -20- |
- * +------(---------+------)---------+
- * --hi-- ----low-part----
- *
- * The upper 12 bits of %o4 should be sign-extended to form the
- * high part of the product (i.e., highpart = %o4 >> 20).
- */
-
- rd %y, %o5
- sll %o4, 12, %o0 ! shift middle bits left 12
- srl %o5, 20, %o5 ! shift low bits right 20, zero fill at left
- or %o5, %o0, %o0 ! construct low part of result
- retl
- sra %o4, 20, %o1 ! ... and extract high part of result
diff --git a/sys/lib/libkern/arch/sparc/umul.S b/sys/lib/libkern/arch/sparc/umul.S
deleted file mode 100644
index 6a5b7005e22..00000000000
--- a/sys/lib/libkern/arch/sparc/umul.S
+++ /dev/null
@@ -1,193 +0,0 @@
-/* $OpenBSD: umul.S,v 1.2 1997/11/07 15:57:42 niklas Exp $ */
-/* $NetBSD: umul.S,v 1.2 1994/10/26 06:40:10 cgd Exp $ */
-
-/*
- * Copyright (c) 1992, 1993
- * The Regents of the University of California. All rights reserved.
- *
- * This software was developed by the Computer Systems Engineering group
- * at Lawrence Berkeley Laboratory under DARPA contract BG 91-66 and
- * contributed to Berkeley.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 3. All advertising materials mentioning features or use of this software
- * must display the following acknowledgement:
- * This product includes software developed by the University of
- * California, Berkeley and its contributors.
- * 4. Neither the name of the University nor the names of its contributors
- * may be used to endorse or promote products derived from this software
- * without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- * Header: umul.s,v 1.4 92/06/25 13:24:05 torek Exp
- */
-
-#if defined(LIBC_SCCS) && !defined(lint)
-#ifdef notdef
- .asciz "@(#)umul.s 8.1 (Berkeley) 6/4/93"
-#endif
- .asciz "$OpenBSD: umul.S,v 1.2 1997/11/07 15:57:42 niklas Exp $"
-#endif /* LIBC_SCCS and not lint */
-
-/*
- * Unsigned multiply. Returns %o0 * %o1 in %o1%o0 (i.e., %o1 holds the
- * upper 32 bits of the 64-bit product).
- *
- * This code optimizes short (less than 13-bit) multiplies. Short
- * multiplies require 25 instruction cycles, and long ones require
- * 45 instruction cycles.
- *
- * On return, overflow has occurred (%o1 is not zero) if and only if
- * the Z condition code is clear, allowing, e.g., the following:
- *
- * call .umul
- * nop
- * bnz overflow (or tnz)
- */
-
-#include "DEFS.h"
-FUNC(.umul)
- or %o0, %o1, %o4
- mov %o0, %y ! multiplier -> Y
- andncc %o4, 0xfff, %g0 ! test bits 12..31 of *both* args
- be Lmul_shortway ! if zero, can do it the short way
- andcc %g0, %g0, %o4 ! zero the partial product and clear N and V
-
- /*
- * Long multiply. 32 steps, followed by a final shift step.
- */
- mulscc %o4, %o1, %o4 ! 1
- mulscc %o4, %o1, %o4 ! 2
- mulscc %o4, %o1, %o4 ! 3
- mulscc %o4, %o1, %o4 ! 4
- mulscc %o4, %o1, %o4 ! 5
- mulscc %o4, %o1, %o4 ! 6
- mulscc %o4, %o1, %o4 ! 7
- mulscc %o4, %o1, %o4 ! 8
- mulscc %o4, %o1, %o4 ! 9
- mulscc %o4, %o1, %o4 ! 10
- mulscc %o4, %o1, %o4 ! 11
- mulscc %o4, %o1, %o4 ! 12
- mulscc %o4, %o1, %o4 ! 13
- mulscc %o4, %o1, %o4 ! 14
- mulscc %o4, %o1, %o4 ! 15
- mulscc %o4, %o1, %o4 ! 16
- mulscc %o4, %o1, %o4 ! 17
- mulscc %o4, %o1, %o4 ! 18
- mulscc %o4, %o1, %o4 ! 19
- mulscc %o4, %o1, %o4 ! 20
- mulscc %o4, %o1, %o4 ! 21
- mulscc %o4, %o1, %o4 ! 22
- mulscc %o4, %o1, %o4 ! 23
- mulscc %o4, %o1, %o4 ! 24
- mulscc %o4, %o1, %o4 ! 25
- mulscc %o4, %o1, %o4 ! 26
- mulscc %o4, %o1, %o4 ! 27
- mulscc %o4, %o1, %o4 ! 28
- mulscc %o4, %o1, %o4 ! 29
- mulscc %o4, %o1, %o4 ! 30
- mulscc %o4, %o1, %o4 ! 31
- mulscc %o4, %o1, %o4 ! 32
- mulscc %o4, %g0, %o4 ! final shift
-
-
- /*
- * Normally, with the shift-and-add approach, if both numbers are
- * positive you get the correct result. WIth 32-bit two's-complement
- * numbers, -x is represented as
- *
- * x 32
- * ( 2 - ------ ) mod 2 * 2
- * 32
- * 2
- *
- * (the `mod 2' subtracts 1 from 1.bbbb). To avoid lots of 2^32s,
- * we can treat this as if the radix point were just to the left
- * of the sign bit (multiply by 2^32), and get
- *
- * -x = (2 - x) mod 2
- *
- * Then, ignoring the `mod 2's for convenience:
- *
- * x * y = xy
- * -x * y = 2y - xy
- * x * -y = 2x - xy
- * -x * -y = 4 - 2x - 2y + xy
- *
- * For signed multiplies, we subtract (x << 32) from the partial
- * product to fix this problem for negative multipliers (see mul.s).
- * Because of the way the shift into the partial product is calculated
- * (N xor V), this term is automatically removed for the multiplicand,
- * so we don't have to adjust.
- *
- * But for unsigned multiplies, the high order bit wasn't a sign bit,
- * and the correction is wrong. So for unsigned multiplies where the
- * high order bit is one, we end up with xy - (y << 32). To fix it
- * we add y << 32.
- */
- tst %o1
- bl,a 1f ! if %o1 < 0 (high order bit = 1),
- add %o4, %o0, %o4 ! %o4 += %o0 (add y to upper half)
-1: rd %y, %o0 ! get lower half of product
- retl
- addcc %o4, %g0, %o1 ! put upper half in place and set Z for %o1==0
-
-Lmul_shortway:
- /*
- * Short multiply. 12 steps, followed by a final shift step.
- * The resulting bits are off by 12 and (32-12) = 20 bit positions,
- * but there is no problem with %o0 being negative (unlike above),
- * and overflow is impossible (the answer is at most 24 bits long).
- */
- mulscc %o4, %o1, %o4 ! 1
- mulscc %o4, %o1, %o4 ! 2
- mulscc %o4, %o1, %o4 ! 3
- mulscc %o4, %o1, %o4 ! 4
- mulscc %o4, %o1, %o4 ! 5
- mulscc %o4, %o1, %o4 ! 6
- mulscc %o4, %o1, %o4 ! 7
- mulscc %o4, %o1, %o4 ! 8
- mulscc %o4, %o1, %o4 ! 9
- mulscc %o4, %o1, %o4 ! 10
- mulscc %o4, %o1, %o4 ! 11
- mulscc %o4, %o1, %o4 ! 12
- mulscc %o4, %g0, %o4 ! final shift
-
- /*
- * %o4 has 20 of the bits that should be in the result; %y has
- * the bottom 12 (as %y's top 12). That is:
- *
- * %o4 %y
- * +----------------+----------------+
- * | -12- | -20- | -12- | -20- |
- * +------(---------+------)---------+
- * -----result-----
- *
- * The 12 bits of %o4 left of the `result' area are all zero;
- * in fact, all top 20 bits of %o4 are zero.
- */
-
- rd %y, %o5
- sll %o4, 12, %o0 ! shift middle bits left 12
- srl %o5, 20, %o5 ! shift low bits right 20
- or %o5, %o0, %o0
- retl
- addcc %g0, %g0, %o1 ! %o1 = zero, and set Z