src - OpenBSD base system

diff options


context:
space:
mode:

author	Todd C. Miller <millert@cvs.openbsd.org>	2000-01-10 03:51:46 +0000
committer	Todd C. Miller <millert@cvs.openbsd.org>	2000-01-10 03:51:46 +0000
commit	847549b3092ffc3090607904d987c1dc953328f4 (patch)
tree	73f6b2ad3910df68a44f79f0bf7c883f2859cfbd /sys/arch/kbus
parent	587b676322ca9eccce3a8b9c2570507a5b73c02f (diff)

move mul/umul into the kernel to match sparc

Diffstat (limited to 'sys/arch/kbus')

-rw-r--r--

sys/arch/kbus/kbus/locore.s

257

1 files changed, 257 insertions, 0 deletions

diff --git a/sys/arch/kbus/kbus/locore.s b/sys/arch/kbus/kbus/locore.s
index 2f897872900..7c63f5c7938 100644
--- a/sys/arch/kbus/kbus/locore.s
+++ b/sys/arch/kbus/kbus/locore.s

@@ -4290,6 +4290,263 @@ ENTRY(ffs)

add %o0, 24, %o0

/*

+ * Signed multiply, from Appendix E of the SPARC Version 8

+ * Architecture Manual.

+ *

+ * Returns %o0 * %o1 in %o1%o0 (i.e., %o1 holds the upper 32 bits of

+ * the 64-bit product).

+ *

+ * This code optimizes short (less than 13-bit) multiplies.

+ */

+.globl .mul, __mul ! both names label the same entry point

+.mul:

+__mul:

+ mov %o0, %y ! multiplier -> Y

+ andncc %o0, 0xfff, %g0 ! test bits 12..31

+ be Lmul_shortway ! if zero, can do it the short way

+ andcc %g0, %g0, %o4 ! zero the partial product and clear N and V

+ /*

+ * Long multiply. 32 steps, followed by a final shift step.

+ * %o4 accumulates the partial product; %o1 is the multiplicand

+ * and %y was loaded with the multiplier above.

+ */

+ mulscc %o4, %o1, %o4 ! 1

+ mulscc %o4, %o1, %o4 ! 2

+ mulscc %o4, %o1, %o4 ! 3

+ mulscc %o4, %o1, %o4 ! 4

+ mulscc %o4, %o1, %o4 ! 5

+ mulscc %o4, %o1, %o4 ! 6

+ mulscc %o4, %o1, %o4 ! 7

+ mulscc %o4, %o1, %o4 ! 8

+ mulscc %o4, %o1, %o4 ! 9

+ mulscc %o4, %o1, %o4 ! 10

+ mulscc %o4, %o1, %o4 ! 11

+ mulscc %o4, %o1, %o4 ! 12

+ mulscc %o4, %o1, %o4 ! 13

+ mulscc %o4, %o1, %o4 ! 14

+ mulscc %o4, %o1, %o4 ! 15

+ mulscc %o4, %o1, %o4 ! 16

+ mulscc %o4, %o1, %o4 ! 17

+ mulscc %o4, %o1, %o4 ! 18

+ mulscc %o4, %o1, %o4 ! 19

+ mulscc %o4, %o1, %o4 ! 20

+ mulscc %o4, %o1, %o4 ! 21

+ mulscc %o4, %o1, %o4 ! 22

+ mulscc %o4, %o1, %o4 ! 23

+ mulscc %o4, %o1, %o4 ! 24

+ mulscc %o4, %o1, %o4 ! 25

+ mulscc %o4, %o1, %o4 ! 26

+ mulscc %o4, %o1, %o4 ! 27

+ mulscc %o4, %o1, %o4 ! 28

+ mulscc %o4, %o1, %o4 ! 29

+ mulscc %o4, %o1, %o4 ! 30

+ mulscc %o4, %o1, %o4 ! 31

+ mulscc %o4, %o1, %o4 ! 32

+ mulscc %o4, %g0, %o4 ! final shift

+ ! If %o0 was negative, the result is

+ ! (%o0 * %o1) + (%o1 << 32).

+ ! We fix that here.

+ tst %o0 ! %o0 still holds the original multiplier here

+ bge 1f ! non-negative: upper half in %o4 needs no fix

+ rd %y, %o0 ! (delay slot, runs on both paths) low 32 bits of product -> %o0

+ ! %o0 was indeed negative; fix upper 32 bits of result by subtracting

+ ! %o1 (i.e., return %o4 - %o1 in %o1).

+ retl

+ sub %o4, %o1, %o1 ! (delay slot) %o1 = %o4 - %o1, the corrected upper half

+1:

+ retl

+ mov %o4, %o1 ! (delay slot) %o1 = %o4, the upper half as computed

+Lmul_shortway:

+ /*

+ * Short multiply. 12 steps, followed by a final shift step.

+ * The resulting bits are off by 12 and (32-12) = 20 bit positions,

+ * but there is no problem with %o0 being negative (unlike above).

+ * (This path is only reached when bits 12..31 of %o0 are zero.)

+ */

+ mulscc %o4, %o1, %o4 ! 1

+ mulscc %o4, %o1, %o4 ! 2

+ mulscc %o4, %o1, %o4 ! 3

+ mulscc %o4, %o1, %o4 ! 4

+ mulscc %o4, %o1, %o4 ! 5

+ mulscc %o4, %o1, %o4 ! 6

+ mulscc %o4, %o1, %o4 ! 7

+ mulscc %o4, %o1, %o4 ! 8

+ mulscc %o4, %o1, %o4 ! 9

+ mulscc %o4, %o1, %o4 ! 10

+ mulscc %o4, %o1, %o4 ! 11

+ mulscc %o4, %o1, %o4 ! 12

+ mulscc %o4, %g0, %o4 ! final shift

+ /*

+ * %o4 has 20 of the bits that should be in the low part of the

+ * result; %y has the bottom 12 (as %y's top 12). That is:

+ *

+ * %o4 %y

+ * +----------------+----------------+

+ * | -12- | -20- | -12- | -20- |

+ * +------(---------+------)---------+

+ * --hi-- ----low-part----

+ *

+ * The upper 12 bits of %o4 should be sign-extended to form the

+ * high part of the product (i.e., highpart = %o4 >> 20).

+ */

+ rd %y, %o5 ! %o5 = %y; its top 12 bits are the bottom 12 of the product

+ sll %o4, 12, %o0 ! shift middle bits left 12

+ srl %o5, 20, %o5 ! shift low bits right 20, zero fill at left

+ or %o5, %o0, %o0 ! construct low part of result

+ retl

+ sra %o4, 20, %o1 ! ... and extract high part of result

+/*

+ * Unsigned multiply. Returns %o0 * %o1 in %o1%o0 (i.e., %o1 holds the

+ * upper 32 bits of the 64-bit product).

+ *

+ * This code optimizes short (less than 13-bit) multiplies. Short

+ * multiplies require 25 instruction cycles, and long ones require

+ * 45 instruction cycles.

+ *

+ * On return, overflow has occurred (%o1 is not zero) if and only if

+ * the Z condition code is clear, allowing, e.g., the following:

+ *

+ * call .umul

+ * nop

+ * bnz overflow (or tnz)

+ */

+.globl .umul, __umul ! both names label the same entry point

+.umul:

+__umul:

+ or %o0, %o1, %o4 ! %o4 = every bit set in either argument

+ mov %o0, %y ! multiplier -> Y

+ andncc %o4, 0xfff, %g0 ! test bits 12..31 of *both* args

+ be Lumul_shortway ! if zero, can do it the short way

+ andcc %g0, %g0, %o4 ! zero the partial product and clear N and V

+ /*

+ * Long multiply. 32 steps, followed by a final shift step.

+ * %o4 accumulates the partial product; %o1 is the multiplicand

+ * and %y was loaded with the multiplier above.

+ */

+ mulscc %o4, %o1, %o4 ! 1

+ mulscc %o4, %o1, %o4 ! 2

+ mulscc %o4, %o1, %o4 ! 3

+ mulscc %o4, %o1, %o4 ! 4

+ mulscc %o4, %o1, %o4 ! 5

+ mulscc %o4, %o1, %o4 ! 6

+ mulscc %o4, %o1, %o4 ! 7

+ mulscc %o4, %o1, %o4 ! 8

+ mulscc %o4, %o1, %o4 ! 9

+ mulscc %o4, %o1, %o4 ! 10

+ mulscc %o4, %o1, %o4 ! 11

+ mulscc %o4, %o1, %o4 ! 12

+ mulscc %o4, %o1, %o4 ! 13

+ mulscc %o4, %o1, %o4 ! 14

+ mulscc %o4, %o1, %o4 ! 15

+ mulscc %o4, %o1, %o4 ! 16

+ mulscc %o4, %o1, %o4 ! 17

+ mulscc %o4, %o1, %o4 ! 18

+ mulscc %o4, %o1, %o4 ! 19

+ mulscc %o4, %o1, %o4 ! 20

+ mulscc %o4, %o1, %o4 ! 21

+ mulscc %o4, %o1, %o4 ! 22

+ mulscc %o4, %o1, %o4 ! 23

+ mulscc %o4, %o1, %o4 ! 24

+ mulscc %o4, %o1, %o4 ! 25

+ mulscc %o4, %o1, %o4 ! 26

+ mulscc %o4, %o1, %o4 ! 27

+ mulscc %o4, %o1, %o4 ! 28

+ mulscc %o4, %o1, %o4 ! 29

+ mulscc %o4, %o1, %o4 ! 30

+ mulscc %o4, %o1, %o4 ! 31

+ mulscc %o4, %o1, %o4 ! 32

+ mulscc %o4, %g0, %o4 ! final shift

+ /*

+ * Normally, with the shift-and-add approach, if both numbers are

+ * positive you get the correct result. With 32-bit two's-complement

+ * numbers, -x is represented as

+ *

+ * ( 2 - x / 2^32 ) mod 2 * 2^32

+ *

+ * (the `mod 2' subtracts 1 from 1.bbbb). To avoid lots of 2^32s,

+ * we can treat this as if the radix point were just to the left

+ * of the sign bit (multiply by 2^32), and get

+ *

+ * -x = (2 - x) mod 2

+ *

+ * Then, ignoring the `mod 2's for convenience:

+ *

+ * x * y = xy

+ * -x * y = 2y - xy

+ * x * -y = 2x - xy

+ * -x * -y = 4 - 2x - 2y + xy

+ *

+ * For signed multiplies, we subtract (x << 32) from the partial

+ * product to fix this problem for negative multipliers (see .mul

+ * above).

+ * Because of the way the shift into the partial product is calculated

+ * (N xor V), this term is automatically removed for the multiplicand,

+ * so we don't have to adjust.

+ *

+ * But for unsigned multiplies, the high order bit wasn't a sign bit,

+ * and the correction is wrong. So for unsigned multiplies where the

+ * high order bit is one, we end up with xy - (y << 32). To fix it

+ * we add y << 32.

+ */

+ tst %o1 ! check the high order bit of %o1

+ bl,a 1f ! if %o1 < 0 (high order bit = 1),

+ add %o4, %o0, %o4 ! %o4 += %o0 (add y to upper half); annulled when the branch is not taken

+1: rd %y, %o0 ! get lower half of product

+ retl

+ addcc %o4, %g0, %o1 ! put upper half in place and set Z for %o1==0

+Lumul_shortway:

+ /*

+ * Short multiply. 12 steps, followed by a final shift step.

+ * The resulting bits are off by 12 and (32-12) = 20 bit positions,

+ * but there is no problem with %o0 being negative (unlike above),

+ * and overflow is impossible (the answer is at most 24 bits long).

+ */

+ mulscc %o4, %o1, %o4 ! 1

+ mulscc %o4, %o1, %o4 ! 2

+ mulscc %o4, %o1, %o4 ! 3

+ mulscc %o4, %o1, %o4 ! 4

+ mulscc %o4, %o1, %o4 ! 5

+ mulscc %o4, %o1, %o4 ! 6

+ mulscc %o4, %o1, %o4 ! 7

+ mulscc %o4, %o1, %o4 ! 8

+ mulscc %o4, %o1, %o4 ! 9

+ mulscc %o4, %o1, %o4 ! 10

+ mulscc %o4, %o1, %o4 ! 11

+ mulscc %o4, %o1, %o4 ! 12

+ mulscc %o4, %g0, %o4 ! final shift

+ /*

+ * %o4 has 20 of the bits that should be in the result; %y has

+ * the bottom 12 (as %y's top 12). That is:

+ *

+ * %o4 %y

+ * +----------------+----------------+

+ * | -12- | -20- | -12- | -20- |

+ * +------(---------+------)---------+

+ * -----result-----

+ *

+ * The 12 bits of %o4 left of the `result' area are all zero;

+ * in fact, all top 20 bits of %o4 are zero.

+ */

+ rd %y, %o5 ! %o5 = %y; its top 12 bits are the bottom 12 of the product

+ sll %o4, 12, %o0 ! shift middle bits left 12

+ srl %o5, 20, %o5 ! shift low bits right 20

+ or %o5, %o0, %o0 ! combine the two pieces into the 32-bit result

+ retl

+ addcc %g0, %g0, %o1 ! %o1 = zero, and set Z

+/*

* Here is a very good random number generator. This implementation is

* based on ``Two Fast Implementations of the "Minimal Standard" Random

* Number Generator", David G. Carta, Communications of the ACM, Jan 1990,