1 files changed, 257 insertions, 0 deletions
diff --git a/sys/arch/kbus/kbus/locore.s b/sys/arch/kbus/kbus/locore.s
index 2f897872900..7c63f5c7938 100644
--- a/sys/arch/kbus/kbus/locore.s
+++ b/sys/arch/kbus/kbus/locore.s
@@ -4290,6 +4290,263 @@ ENTRY(ffs)
 	add	%o0, 24, %o0
 
 /*
+ * Signed multiply, from Appendix E of the Sparc Version 8
+ * Architecture Manual.
+ *
+ * Returns %o0 * %o1 in %o1%o0 (i.e., %o1 holds the upper 32 bits of
+ * the 64-bit product).  Uses %o4, %o5, %y and the condition codes.
+ * Short multiplies (multiplier %o0 with bits 12..31 all zero) are
+ * optimized: they take 12 multiply steps instead of 32.
+ */
+.globl .mul, __mul
+.mul:
+__mul:
+	mov	%o0, %y		! multiplier -> Y
+	andncc	%o0, 0xfff, %g0	! test bits 12..31
+	be	Lmul_shortway	! if zero, can do it the short way
+	 andcc	%g0, %g0, %o4	! zero the partial product and clear N and V
+
+	/*
+	 * Long multiply.  32 steps, followed by a final shift step.
+	 */
+	mulscc	%o4, %o1, %o4	! 1
+	mulscc	%o4, %o1, %o4	! 2
+	mulscc	%o4, %o1, %o4	! 3
+	mulscc	%o4, %o1, %o4	! 4
+	mulscc	%o4, %o1, %o4	! 5
+	mulscc	%o4, %o1, %o4	! 6
+	mulscc	%o4, %o1, %o4	! 7
+	mulscc	%o4, %o1, %o4	! 8
+	mulscc	%o4, %o1, %o4	! 9
+	mulscc	%o4, %o1, %o4	! 10
+	mulscc	%o4, %o1, %o4	! 11
+	mulscc	%o4, %o1, %o4	! 12
+	mulscc	%o4, %o1, %o4	! 13
+	mulscc	%o4, %o1, %o4	! 14
+	mulscc	%o4, %o1, %o4	! 15
+	mulscc	%o4, %o1, %o4	! 16
+	mulscc	%o4, %o1, %o4	! 17
+	mulscc	%o4, %o1, %o4	! 18
+	mulscc	%o4, %o1, %o4	! 19
+	mulscc	%o4, %o1, %o4	! 20
+	mulscc	%o4, %o1, %o4	! 21
+	mulscc	%o4, %o1, %o4	! 22
+	mulscc	%o4, %o1, %o4	! 23
+	mulscc	%o4, %o1, %o4	! 24
+	mulscc	%o4, %o1, %o4	! 25
+	mulscc	%o4, %o1, %o4	! 26
+	mulscc	%o4, %o1, %o4	! 27
+	mulscc	%o4, %o1, %o4	! 28
+	mulscc	%o4, %o1, %o4	! 29
+	mulscc	%o4, %o1, %o4	! 30
+	mulscc	%o4, %o1, %o4	! 31
+	mulscc	%o4, %o1, %o4	! 32
+	mulscc	%o4, %g0, %o4	! final shift
+
+	! If %o0 was negative, the result is
+	!	(%o0 * %o1) + (%o1 << 32)
+	! We fix that here.
+
+	tst	%o0		! was the multiplier negative?
+	bge	1f		! no: upper half is already correct
+	 rd	%y, %o0		! (delay slot) low 32 bits of product
+
+	! %o0 was indeed negative; fix upper 32 bits of result by subtracting
+	! %o1 (i.e., return %o4 - %o1 in %o1).
+	retl
+	 sub	%o4, %o1, %o1
+
+1:
+	retl
+	 mov	%o4, %o1	! upper 32 bits of product
+
+Lmul_shortway:
+	/*
+	 * Short multiply.  12 steps, followed by a final shift step.
+	 * The resulting bits are off by 12 and (32-12) = 20 bit positions,
+	 * but there is no problem with %o0 being negative (unlike above).
+	 */
+	mulscc	%o4, %o1, %o4	! 1
+	mulscc	%o4, %o1, %o4	! 2
+	mulscc	%o4, %o1, %o4	! 3
+	mulscc	%o4, %o1, %o4	! 4
+	mulscc	%o4, %o1, %o4	! 5
+	mulscc	%o4, %o1, %o4	! 6
+	mulscc	%o4, %o1, %o4	! 7
+	mulscc	%o4, %o1, %o4	! 8
+	mulscc	%o4, %o1, %o4	! 9
+	mulscc	%o4, %o1, %o4	! 10
+	mulscc	%o4, %o1, %o4	! 11
+	mulscc	%o4, %o1, %o4	! 12
+	mulscc	%o4, %g0, %o4	! final shift
+
+	/*
+	 *  %o4 has 20 of the bits that should be in the low part of the
+	 * result; %y has the bottom 12 (as %y's top 12).  That is:
+	 *
+	 *	  %o4		    %y
+	 * +----------------+----------------+
+	 * | -12- |   -20-  | -12- |   -20-  |
+	 * +------(---------+------)---------+
+	 *  --hi-- ----low-part----
+	 *
+	 * The upper 12 bits of %o4 should be sign-extended to form the
+	 * high part of the product (i.e., highpart = %o4 >> 20).
+	 */
+
+	rd	%y, %o5		! fetch the low 12 product bits (top of %y)
+	sll	%o4, 12, %o0	! shift middle bits left 12
+	srl	%o5, 20, %o5	! shift low bits right 20, zero fill at left
+	or	%o5, %o0, %o0	! construct low part of result
+	retl
+	 sra	%o4, 20, %o1	! ... and extract high part of result
+
+/*
+ * Unsigned multiply.  Returns %o0 * %o1 in %o1%o0 (i.e., %o1 holds the
+ * upper 32 bits of the 64-bit product).
+ *
+ * This code optimizes short multiplies (both arguments fit in 12
+ * bits).  Short multiplies require 25 instruction cycles, and long
+ * ones require 45 instruction cycles.
+ *
+ * On return, overflow has occurred (%o1 is not zero) if and only if
+ * the Z condition code is clear, allowing, e.g., the following:
+ *
+ *	call	.umul
+ *	nop
+ *	bnz	overflow	(or tnz)
+ */
+.globl	.umul, __umul
+.umul:
+__umul:
+	or	%o0, %o1, %o4	! merge args so one test covers both
+	mov	%o0, %y		! multiplier -> Y
+	andncc	%o4, 0xfff, %g0	! test bits 12..31 of *both* args
+	be	Lumul_shortway	! if zero, can do it the short way
+	 andcc	%g0, %g0, %o4	! zero the partial product and clear N and V
+
+	/*
+	 * Long multiply.  32 steps, followed by a final shift step.
+	 */
+	mulscc	%o4, %o1, %o4	! 1
+	mulscc	%o4, %o1, %o4	! 2
+	mulscc	%o4, %o1, %o4	! 3
+	mulscc	%o4, %o1, %o4	! 4
+	mulscc	%o4, %o1, %o4	! 5
+	mulscc	%o4, %o1, %o4	! 6
+	mulscc	%o4, %o1, %o4	! 7
+	mulscc	%o4, %o1, %o4	! 8
+	mulscc	%o4, %o1, %o4	! 9
+	mulscc	%o4, %o1, %o4	! 10
+	mulscc	%o4, %o1, %o4	! 11
+	mulscc	%o4, %o1, %o4	! 12
+	mulscc	%o4, %o1, %o4	! 13
+	mulscc	%o4, %o1, %o4	! 14
+	mulscc	%o4, %o1, %o4	! 15
+	mulscc	%o4, %o1, %o4	! 16
+	mulscc	%o4, %o1, %o4	! 17
+	mulscc	%o4, %o1, %o4	! 18
+	mulscc	%o4, %o1, %o4	! 19
+	mulscc	%o4, %o1, %o4	! 20
+	mulscc	%o4, %o1, %o4	! 21
+	mulscc	%o4, %o1, %o4	! 22
+	mulscc	%o4, %o1, %o4	! 23
+	mulscc	%o4, %o1, %o4	! 24
+	mulscc	%o4, %o1, %o4	! 25
+	mulscc	%o4, %o1, %o4	! 26
+	mulscc	%o4, %o1, %o4	! 27
+	mulscc	%o4, %o1, %o4	! 28
+	mulscc	%o4, %o1, %o4	! 29
+	mulscc	%o4, %o1, %o4	! 30
+	mulscc	%o4, %o1, %o4	! 31
+	mulscc	%o4, %o1, %o4	! 32
+	mulscc	%o4, %g0, %o4	! final shift
+
+
+	/*
+	 * Normally, with the shift-and-add approach, if both numbers are
+	 * positive you get the correct result.  With 32-bit two's-complement
+	 * numbers, -x is represented as
+	 *
+	 *		  x		    32
+	 *	( 2  -  ------ ) mod 2  *  2
+	 *		   32
+	 *		  2
+	 *
+	 * (the `mod 2' subtracts 1 from 1.bbbb).  To avoid lots of 2^32s,
+	 * we can treat this as if the radix point were just to the left
+	 * of the sign bit (multiply by 2^32), and get
+	 *
+	 *	-x  =  (2 - x) mod 2
+	 *
+	 * Then, ignoring the `mod 2's for convenience:
+	 *
+	 *   x *  y	= xy
+	 *  -x *  y	= 2y - xy
+	 *   x * -y	= 2x - xy
+	 *  -x * -y	= 4 - 2x - 2y + xy
+	 *
+	 * For signed multiplies, we subtract (x << 32) from the partial
+	 * product to fix this problem for negative multipliers (see .mul).
+	 * Because of the way the shift into the partial product is calculated
+	 * (N xor V), this term is automatically removed for the multiplicand,
+	 * so we don't have to adjust.
+	 *
+	 * But for unsigned multiplies, the high order bit wasn't a sign bit,
+	 * and the correction is wrong.  So for unsigned multiplies where the
+	 * high order bit is one, we end up with xy - (y << 32).  To fix it
+	 * we add y << 32.
+	 */
+	tst	%o1
+	bl,a	1f		! if %o1 < 0 (high order bit = 1),
+	add	%o4, %o0, %o4	! (annulled if not taken) %o4 += %o0, i.e. add y to upper half
+1:	rd	%y, %o0		! get lower half of product
+	retl
+	 addcc	%o4, %g0, %o1	! put upper half in place and set Z for %o1==0
+
+Lumul_shortway:
+	/*
+	 * Short multiply.  12 steps, followed by a final shift step.
+	 * The resulting bits are off by 12 and (32-12) = 20 bit positions,
+	 * but there is no problem with %o0 being negative (unlike above),
+	 * and overflow is impossible (the answer is at most 24 bits long).
+	 */
+	mulscc	%o4, %o1, %o4	! 1
+	mulscc	%o4, %o1, %o4	! 2
+	mulscc	%o4, %o1, %o4	! 3
+	mulscc	%o4, %o1, %o4	! 4
+	mulscc	%o4, %o1, %o4	! 5
+	mulscc	%o4, %o1, %o4	! 6
+	mulscc	%o4, %o1, %o4	! 7
+	mulscc	%o4, %o1, %o4	! 8
+	mulscc	%o4, %o1, %o4	! 9
+	mulscc	%o4, %o1, %o4	! 10
+	mulscc	%o4, %o1, %o4	! 11
+	mulscc	%o4, %o1, %o4	! 12
+	mulscc	%o4, %g0, %o4	! final shift
+
+	/*
+	 * %o4 has 20 of the bits that should be in the result; %y has
+	 * the bottom 12 (as %y's top 12).  That is:
+	 *
+	 *	  %o4		    %y
+	 * +----------------+----------------+
+	 * | -12- |   -20-  | -12- |   -20-  |
+	 * +------(---------+------)---------+
+	 *	   -----result-----
+	 *
+	 * The 12 bits of %o4 left of the `result' area are all zero;
+	 * in fact, all top 20 bits of %o4 are zero.
+	 */
+
+	rd	%y, %o5		! fetch the low 12 product bits (top of %y)
+	sll	%o4, 12, %o0	! shift middle bits left 12
+	srl	%o5, 20, %o5	! shift low bits right 20
+	or	%o5, %o0, %o0	! construct low part of result
+	retl
+	addcc	%g0, %g0, %o1	! (delay slot) %o1 = zero, and set Z
+
+/*
  * Here is a very good random number generator.  This implementation is
  * based on ``Two Fast Implementations of the "Minimal Standard" Random
  * Number Generator", David G. Carta, Communications of the ACM, Jan 1990,