src - OpenBSD base system

diff options


context:
space:
mode:

author	Theo de Raadt <deraadt@cvs.openbsd.org>	1999-12-07 00:49:11 +0000
committer	Theo de Raadt <deraadt@cvs.openbsd.org>	1999-12-07 00:49:11 +0000
commit	09517c7586448c4de5e68230335e44d1a2ae8ad9 (patch)
tree	2d3da8f4022ae3266513662db1601fca33dca637
parent	739b5aab7680cf40ec4d92e9a9648898ddec3b9b (diff)

move .mul and .umul into the kernel. if v8 sparc is detected, replace with

the equivalent 2-instruction sequence.

Diffstat

-rw-r--r--

sys/arch/sparc/sparc/autoconf.c

-rw-r--r--

sys/arch/sparc/sparc/locore.s

274

-rw-r--r--

sys/lib/libkern/arch/sparc/Makefile.inc

-rw-r--r--

sys/lib/libkern/arch/sparc/mul.S

160

-rw-r--r--

sys/lib/libkern/arch/sparc/umul.S

193

5 files changed, 286 insertions, 358 deletions

diff --git a/sys/arch/sparc/sparc/autoconf.c b/sys/arch/sparc/sparc/autoconf.c
index 68afc63f0d8..814e1bc83d7 100644
--- a/sys/arch/sparc/sparc/autoconf.c
+++ b/sys/arch/sparc/sparc/autoconf.c

@@ -1,4 +1,4 @@

-/* $OpenBSD: autoconf.c,v 1.31 1999/09/03 18:01:57 art Exp $ */

+/* $OpenBSD: autoconf.c,v 1.32 1999/12/07 00:49:07 deraadt Exp $ */

/* $NetBSD: autoconf.c,v 1.73 1997/07/29 09:41:53 fair Exp $ */

@@ -286,6 +286,15 @@ bootstrap()

extern void setpte4m __P((u_int, u_int));

extern struct timer_4m *timerreg_4m;

extern struct counter_4m *counterreg_4m;

+ extern void *_umulreplace, *_umul, *_umulreplace_end;

+ extern void *_mulreplace, *_mul, *_mulreplace_end;

+ /*

+ * Whack the slow sun4/sun4c umul/mul functions with

+ * fast V8 ones

+ */

+ bcopy(_umulreplace, _umul, _umulreplace_end-_umulreplace);

+ bcopy(_mulreplace, _mul, _mulreplace_end-_mulreplace);

if ((node = opennode("/obio/interrupt")) == 0)

if ((node=search_prom(findroot(),"interrupt"))==0)

diff --git a/sys/arch/sparc/sparc/locore.s b/sys/arch/sparc/sparc/locore.s
index c97c659d2ff..2b98b97deb6 100644
--- a/sys/arch/sparc/sparc/locore.s
+++ b/sys/arch/sparc/sparc/locore.s

@@ -1,4 +1,4 @@

-/* $OpenBSD: locore.s,v 1.27 1999/11/13 00:11:52 deraadt Exp $ */

+/* $OpenBSD: locore.s,v 1.28 1999/12/07 00:49:07 deraadt Exp $ */

/* $NetBSD: locore.s,v 1.73 1997/09/13 20:36:48 pk Exp $ */

@@ -5895,6 +5895,278 @@ ENTRY(ffs)

add %o0, 24, %o0

/*

+ * V8 sparc mul/umul replacements.

+ */

+.globl __mulreplace, __mulreplace_end

+__mulreplace:

+ retl

+ smul %o0, %o1, %o0

+__mulreplace_end:

+.globl __umulreplace, __umulreplace_end

+__umulreplace:

+ retl

+ umul %o0, %o1, %o0

+__umulreplace_end:

+/*

+ * Signed multiply, from Appendix E of the Sparc Version 8

+ * Architecture Manual.

+ *

+ * Returns %o0 * %o1 in %o1%o0 (i.e., %o1 holds the upper 32 bits of

+ * the 64-bit product).

+ *

+ * This code optimizes short (less than 13-bit) multiplies.

+ */

+.globl .mul, __mul

+.mul:

+__mul:

+ mov %o0, %y ! multiplier -> Y

+ andncc %o0, 0xfff, %g0 ! test bits 12..31

+ be Lmul_shortway ! if zero, can do it the short way

+ andcc %g0, %g0, %o4 ! zero the partial product and clear N and V

+ /*

+ * Long multiply. 32 steps, followed by a final shift step.

+ */

+ mulscc %o4, %o1, %o4 ! 1

+ mulscc %o4, %o1, %o4 ! 2

+ mulscc %o4, %o1, %o4 ! 3

+ mulscc %o4, %o1, %o4 ! 4

+ mulscc %o4, %o1, %o4 ! 5

+ mulscc %o4, %o1, %o4 ! 6

+ mulscc %o4, %o1, %o4 ! 7

+ mulscc %o4, %o1, %o4 ! 8

+ mulscc %o4, %o1, %o4 ! 9

+ mulscc %o4, %o1, %o4 ! 10

+ mulscc %o4, %o1, %o4 ! 11

+ mulscc %o4, %o1, %o4 ! 12

+ mulscc %o4, %o1, %o4 ! 13

+ mulscc %o4, %o1, %o4 ! 14

+ mulscc %o4, %o1, %o4 ! 15

+ mulscc %o4, %o1, %o4 ! 16

+ mulscc %o4, %o1, %o4 ! 17

+ mulscc %o4, %o1, %o4 ! 18

+ mulscc %o4, %o1, %o4 ! 19

+ mulscc %o4, %o1, %o4 ! 20

+ mulscc %o4, %o1, %o4 ! 21

+ mulscc %o4, %o1, %o4 ! 22

+ mulscc %o4, %o1, %o4 ! 23

+ mulscc %o4, %o1, %o4 ! 24

+ mulscc %o4, %o1, %o4 ! 25

+ mulscc %o4, %o1, %o4 ! 26

+ mulscc %o4, %o1, %o4 ! 27

+ mulscc %o4, %o1, %o4 ! 28

+ mulscc %o4, %o1, %o4 ! 29

+ mulscc %o4, %o1, %o4 ! 30

+ mulscc %o4, %o1, %o4 ! 31

+ mulscc %o4, %o1, %o4 ! 32

+ mulscc %o4, %g0, %o4 ! final shift

+ ! If %o0 was negative, the result is

+ ! (%o0 * %o1) + (%o1 << 32)

+ ! We fix that here.

+ tst %o0

+ bge 1f

+ rd %y, %o0

+ ! %o0 was indeed negative; fix upper 32 bits of result by subtracting

+ ! %o1 (i.e., return %o4 - %o1 in %o1).

+ retl

+ sub %o4, %o1, %o1

+1:

+ retl

+ mov %o4, %o1

+Lmul_shortway:

+ /*

+ * Short multiply. 12 steps, followed by a final shift step.

+ * The resulting bits are off by 12 and (32-12) = 20 bit positions,

+ * but there is no problem with %o0 being negative (unlike above).

+ */

+ mulscc %o4, %o1, %o4 ! 1

+ mulscc %o4, %o1, %o4 ! 2

+ mulscc %o4, %o1, %o4 ! 3

+ mulscc %o4, %o1, %o4 ! 4

+ mulscc %o4, %o1, %o4 ! 5

+ mulscc %o4, %o1, %o4 ! 6

+ mulscc %o4, %o1, %o4 ! 7

+ mulscc %o4, %o1, %o4 ! 8

+ mulscc %o4, %o1, %o4 ! 9

+ mulscc %o4, %o1, %o4 ! 10

+ mulscc %o4, %o1, %o4 ! 11

+ mulscc %o4, %o1, %o4 ! 12

+ mulscc %o4, %g0, %o4 ! final shift

+ /*

+ * %o4 has 20 of the bits that should be in the low part of the

+ * result; %y has the bottom 12 (as %y's top 12). That is:

+ *

+ * %o4 %y

+ * +----------------+----------------+

+ * | -12- | -20- | -12- | -20- |

+ * +------(---------+------)---------+

+ * --hi-- ----low-part----

+ *

+ * The upper 12 bits of %o4 should be sign-extended to form the

+ * high part of the product (i.e., highpart = %o4 >> 20).

+ */

+ rd %y, %o5

+ sll %o4, 12, %o0 ! shift middle bits left 12

+ srl %o5, 20, %o5 ! shift low bits right 20, zero fill at left

+ or %o5, %o0, %o0 ! construct low part of result

+ retl

+ sra %o4, 20, %o1 ! ... and extract high part of result

+/*

+ * Unsigned multiply. Returns %o0 * %o1 in %o1%o0 (i.e., %o1 holds the

+ * upper 32 bits of the 64-bit product).

+ *

+ * This code optimizes short (less than 13-bit) multiplies. Short

+ * multiplies require 25 instruction cycles, and long ones require

+ * 45 instruction cycles.

+ *

+ * On return, overflow has occurred (%o1 is not zero) if and only if

+ * the Z condition code is clear, allowing, e.g., the following:

+ *

+ * call .umul

+ * nop

+ * bnz overflow (or tnz)

+ */

+.globl .umul, __umul

+.umul:

+__umul:

+ or %o0, %o1, %o4

+ mov %o0, %y ! multiplier -> Y

+ andncc %o4, 0xfff, %g0 ! test bits 12..31 of *both* args

+ be Lumul_shortway ! if zero, can do it the short way

+ andcc %g0, %g0, %o4 ! zero the partial product and clear N and V

+ /*

+ * Long multiply. 32 steps, followed by a final shift step.

+ */

+ mulscc %o4, %o1, %o4 ! 1

+ mulscc %o4, %o1, %o4 ! 2

+ mulscc %o4, %o1, %o4 ! 3

+ mulscc %o4, %o1, %o4 ! 4

+ mulscc %o4, %o1, %o4 ! 5

+ mulscc %o4, %o1, %o4 ! 6

+ mulscc %o4, %o1, %o4 ! 7

+ mulscc %o4, %o1, %o4 ! 8

+ mulscc %o4, %o1, %o4 ! 9

+ mulscc %o4, %o1, %o4 ! 10

+ mulscc %o4, %o1, %o4 ! 11

+ mulscc %o4, %o1, %o4 ! 12

+ mulscc %o4, %o1, %o4 ! 13

+ mulscc %o4, %o1, %o4 ! 14

+ mulscc %o4, %o1, %o4 ! 15

+ mulscc %o4, %o1, %o4 ! 16

+ mulscc %o4, %o1, %o4 ! 17

+ mulscc %o4, %o1, %o4 ! 18

+ mulscc %o4, %o1, %o4 ! 19

+ mulscc %o4, %o1, %o4 ! 20

+ mulscc %o4, %o1, %o4 ! 21

+ mulscc %o4, %o1, %o4 ! 22

+ mulscc %o4, %o1, %o4 ! 23

+ mulscc %o4, %o1, %o4 ! 24

+ mulscc %o4, %o1, %o4 ! 25

+ mulscc %o4, %o1, %o4 ! 26

+ mulscc %o4, %o1, %o4 ! 27

+ mulscc %o4, %o1, %o4 ! 28

+ mulscc %o4, %o1, %o4 ! 29

+ mulscc %o4, %o1, %o4 ! 30

+ mulscc %o4, %o1, %o4 ! 31

+ mulscc %o4, %o1, %o4 ! 32

+ mulscc %o4, %g0, %o4 ! final shift

+ /*

+ * Normally, with the shift-and-add approach, if both numbers are

+ * positive you get the correct result. With 32-bit two's-complement

+ * numbers, -x is represented as

+ *

+ * ((2 - x / 2^32) mod 2) * 2^32

+ *

+ * (the `mod 2' subtracts 1 from 1.bbbb). To avoid lots of 2^32s,

+ * we can treat this as if the radix point were just to the left

+ * of the sign bit (multiply by 2^32), and get

+ *

+ * -x = (2 - x) mod 2

+ *

+ * Then, ignoring the `mod 2's for convenience:

+ *

+ * x * y = xy

+ * -x * y = 2y - xy

+ * x * -y = 2x - xy

+ * -x * -y = 4 - 2x - 2y + xy

+ *

+ * For signed multiplies, we subtract (x << 32) from the partial

+ * product to fix this problem for negative multipliers (see .mul above).

+ * Because of the way the shift into the partial product is calculated

+ * (N xor V), this term is automatically removed for the multiplicand,

+ * so we don't have to adjust.

+ *

+ * But for unsigned multiplies, the high order bit wasn't a sign bit,

+ * and the correction is wrong. So for unsigned multiplies where the

+ * high order bit is one, we end up with xy - (y << 32). To fix it

+ * we add y << 32.

+ */

+ tst %o1

+ bl,a 1f ! if %o1 < 0 (high order bit = 1),

+ add %o4, %o0, %o4 ! %o4 += %o0 (add y to upper half)

+1: rd %y, %o0 ! get lower half of product

+ retl

+ addcc %o4, %g0, %o1 ! put upper half in place and set Z for %o1==0

+Lumul_shortway:

+ /*

+ * Short multiply. 12 steps, followed by a final shift step.

+ * The resulting bits are off by 12 and (32-12) = 20 bit positions,

+ * but there is no problem with %o0 being negative (unlike above),

+ * and overflow is impossible (the answer is at most 24 bits long).

+ */

+ mulscc %o4, %o1, %o4 ! 1

+ mulscc %o4, %o1, %o4 ! 2

+ mulscc %o4, %o1, %o4 ! 3

+ mulscc %o4, %o1, %o4 ! 4

+ mulscc %o4, %o1, %o4 ! 5

+ mulscc %o4, %o1, %o4 ! 6

+ mulscc %o4, %o1, %o4 ! 7

+ mulscc %o4, %o1, %o4 ! 8

+ mulscc %o4, %o1, %o4 ! 9

+ mulscc %o4, %o1, %o4 ! 10

+ mulscc %o4, %o1, %o4 ! 11

+ mulscc %o4, %o1, %o4 ! 12

+ mulscc %o4, %g0, %o4 ! final shift

+ /*

+ * %o4 has 20 of the bits that should be in the result; %y has

+ * the bottom 12 (as %y's top 12). That is:

+ *

+ * %o4 %y

+ * +----------------+----------------+

+ * | -12- | -20- | -12- | -20- |

+ * +------(---------+------)---------+

+ * -----result-----

+ *

+ * The 12 bits of %o4 left of the `result' area are all zero;

+ * in fact, all top 20 bits of %o4 are zero.

+ */

+ rd %y, %o5

+ sll %o4, 12, %o0 ! shift middle bits left 12

+ srl %o5, 20, %o5 ! shift low bits right 20

+ or %o5, %o0, %o0

+ retl

+ addcc %g0, %g0, %o1 ! %o1 = zero, and set Z

+/*

* Here is a very good random number generator. This implementation is

* based on ``Two Fast Implementations of the "Minimal Standard" Random

* Number Generator", David G. Carta, Communications of the ACM, Jan 1990,

diff --git a/sys/lib/libkern/arch/sparc/Makefile.inc b/sys/lib/libkern/arch/sparc/Makefile.inc
index 49da2f42b7c..3d14262e7a8 100644
--- a/sys/lib/libkern/arch/sparc/Makefile.inc
+++ b/sys/lib/libkern/arch/sparc/Makefile.inc

@@ -1,4 +1,4 @@

-# $OpenBSD: Makefile.inc,v 1.5 1998/06/02 20:38:26 deraadt Exp $

+# $OpenBSD: Makefile.inc,v 1.6 1999/12/07 00:49:10 deraadt Exp $

# $NetBSD: Makefile.inc,v 1.12 1996/04/23 23:05:22 christos Exp $

SRCS+= __main.c imax.c imin.c lmax.c lmin.c max.c min.c ulmax.c ulmin.c \

@@ -7,8 +7,8 @@ SRCS+= __main.c imax.c imin.c lmax.c lmin.c max.c min.c ulmax.c ulmin.c \

strncpy.c htonl.S htons.S ntohl.S ntohs.S scanc.c skpc.c \

strncasecmp.c

-SRCS+= umul.S mul.S rem.S sdiv.S udiv.S umul.S urem.S

-SRCS+= mul.S saveregs.S umul.S

+SRCS+= rem.S sdiv.S udiv.S urem.S

+SRCS+= saveregs.S

# `source' files built from m4 source

# the name `div.o' is taken for the ANSI C `div' function, hence sdiv here

diff --git a/sys/lib/libkern/arch/sparc/mul.S b/sys/lib/libkern/arch/sparc/mul.S
deleted file mode 100644
index 028237f2c47..00000000000
--- a/sys/lib/libkern/arch/sparc/mul.S
+++ /dev/null

@@ -1,160 +0,0 @@

-/* $OpenBSD: mul.S,v 1.2 1997/11/07 15:57:37 niklas Exp $ */

-/* $NetBSD: mul.S,v 1.2 1994/10/26 06:40:01 cgd Exp $ */

-/*

- *

- * This software was developed by the Computer Systems Engineering group

- * at Lawrence Berkeley Laboratory under DARPA contract BG 91-66 and

- * contributed to Berkeley.

- *

- * Redistribution and use in source and binary forms, with or without

- * modification, are permitted provided that the following conditions

- * are met:

- * 1. Redistributions of source code must retain the above copyright

- * notice, this list of conditions and the following disclaimer.

- * 2. Redistributions in binary form must reproduce the above copyright

- * notice, this list of conditions and the following disclaimer in the

- * documentation and/or other materials provided with the distribution.

- * 3. All advertising materials mentioning features or use of this software

- * must display the following acknowledgement:

- * This product includes software developed by the University of

- * California, Berkeley and its contributors.

- * 4. Neither the name of the University nor the names of its contributors

- * may be used to endorse or promote products derived from this software

- * without specific prior written permission.

- *

- * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND

- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE

- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE

- * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE

- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL

- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS

- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)

- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT

- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY

- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF

- * SUCH DAMAGE.

- *

- * Header: mul.s,v 1.5 92/06/25 13:24:03 torek Exp

- */

-#if defined(LIBC_SCCS) && !defined(lint)

-#ifdef notdef

- .asciz "@(#)mul.s 8.1 (Berkeley) 6/4/93"

-#endif

- .asciz "$OpenBSD: mul.S,v 1.2 1997/11/07 15:57:37 niklas Exp $"

-#endif /* LIBC_SCCS and not lint */

-/*

- * Signed multiply, from Appendix E of the Sparc Version 8

- * Architecture Manual.

- *

- * Returns %o0 * %o1 in %o1%o0 (i.e., %o1 holds the upper 32 bits of

- * the 64-bit product).

- *

- * This code optimizes short (less than 13-bit) multiplies.

- */

-#include "DEFS.h"

-FUNC(.mul)

- mov %o0, %y ! multiplier -> Y

- andncc %o0, 0xfff, %g0 ! test bits 12..31

- be Lmul_shortway ! if zero, can do it the short way

- andcc %g0, %g0, %o4 ! zero the partial product and clear N and V

- /*

- * Long multiply. 32 steps, followed by a final shift step.

- */

- mulscc %o4, %o1, %o4 ! 1

- mulscc %o4, %o1, %o4 ! 2

- mulscc %o4, %o1, %o4 ! 3

- mulscc %o4, %o1, %o4 ! 4

- mulscc %o4, %o1, %o4 ! 5

- mulscc %o4, %o1, %o4 ! 6

- mulscc %o4, %o1, %o4 ! 7

- mulscc %o4, %o1, %o4 ! 8

- mulscc %o4, %o1, %o4 ! 9

- mulscc %o4, %o1, %o4 ! 10

- mulscc %o4, %o1, %o4 ! 11

- mulscc %o4, %o1, %o4 ! 12

- mulscc %o4, %o1, %o4 ! 13

- mulscc %o4, %o1, %o4 ! 14

- mulscc %o4, %o1, %o4 ! 15

- mulscc %o4, %o1, %o4 ! 16

- mulscc %o4, %o1, %o4 ! 17

- mulscc %o4, %o1, %o4 ! 18

- mulscc %o4, %o1, %o4 ! 19

- mulscc %o4, %o1, %o4 ! 20

- mulscc %o4, %o1, %o4 ! 21

- mulscc %o4, %o1, %o4 ! 22

- mulscc %o4, %o1, %o4 ! 23

- mulscc %o4, %o1, %o4 ! 24

- mulscc %o4, %o1, %o4 ! 25

- mulscc %o4, %o1, %o4 ! 26

- mulscc %o4, %o1, %o4 ! 27

- mulscc %o4, %o1, %o4 ! 28

- mulscc %o4, %o1, %o4 ! 29

- mulscc %o4, %o1, %o4 ! 30

- mulscc %o4, %o1, %o4 ! 31

- mulscc %o4, %o1, %o4 ! 32

- mulscc %o4, %g0, %o4 ! final shift

- ! If %o0 was negative, the result is

- ! (%o0 * %o1) + (%o1 << 32))

- ! We fix that here.

- tst %o0

- bge 1f

- rd %y, %o0

- ! %o0 was indeed negative; fix upper 32 bits of result by subtracting

- ! %o1 (i.e., return %o4 - %o1 in %o1).

- retl

- sub %o4, %o1, %o1

-1:

- retl

- mov %o4, %o1

-Lmul_shortway:

- /*

- * Short multiply. 12 steps, followed by a final shift step.

- * The resulting bits are off by 12 and (32-12) = 20 bit positions,

- * but there is no problem with %o0 being negative (unlike above).

- */

- mulscc %o4, %o1, %o4 ! 1

- mulscc %o4, %o1, %o4 ! 2

- mulscc %o4, %o1, %o4 ! 3

- mulscc %o4, %o1, %o4 ! 4

- mulscc %o4, %o1, %o4 ! 5

- mulscc %o4, %o1, %o4 ! 6

- mulscc %o4, %o1, %o4 ! 7

- mulscc %o4, %o1, %o4 ! 8

- mulscc %o4, %o1, %o4 ! 9

- mulscc %o4, %o1, %o4 ! 10

- mulscc %o4, %o1, %o4 ! 11

- mulscc %o4, %o1, %o4 ! 12

- mulscc %o4, %g0, %o4 ! final shift

- /*

- * %o4 has 20 of the bits that should be in the low part of the

- * result; %y has the bottom 12 (as %y's top 12). That is:

- *

- * %o4 %y

- * +----------------+----------------+

- * | -12- | -20- | -12- | -20- |

- * +------(---------+------)---------+

- * --hi-- ----low-part----

- *

- * The upper 12 bits of %o4 should be sign-extended to form the

- * high part of the product (i.e., highpart = %o4 >> 20).

- */

- rd %y, %o5

- sll %o4, 12, %o0 ! shift middle bits left 12

- srl %o5, 20, %o5 ! shift low bits right 20, zero fill at left

- or %o5, %o0, %o0 ! construct low part of result

- retl

- sra %o4, 20, %o1 ! ... and extract high part of result

diff --git a/sys/lib/libkern/arch/sparc/umul.S b/sys/lib/libkern/arch/sparc/umul.S
deleted file mode 100644
index 6a5b7005e22..00000000000
--- a/sys/lib/libkern/arch/sparc/umul.S
+++ /dev/null

@@ -1,193 +0,0 @@

-/* $OpenBSD: umul.S,v 1.2 1997/11/07 15:57:42 niklas Exp $ */

-/* $NetBSD: umul.S,v 1.2 1994/10/26 06:40:10 cgd Exp $ */