1 files changed, 187 insertions, 0 deletions
diff --git a/sys/arch/mvme68k/stand/sboot/oc_cksum.S b/sys/arch/mvme68k/stand/sboot/oc_cksum.S
new file mode 100644
index 00000000000..2c9d6a63a24
--- /dev/null
+++ b/sys/arch/mvme68k/stand/sboot/oc_cksum.S
@@ -0,0 +1,187 @@
+|	$NetBSD: oc_cksum.s,v 1.4 1994/10/26 07:51:13 cgd Exp $
+
+| Copyright (c) 1988 Regents of the University of California.
+| All rights reserved.
+|
+| Redistribution and use in source and binary forms, with or without
+| modification, are permitted provided that the following conditions
+| are met:
+| 1. Redistributions of source code must retain the above copyright
+|    notice, this list of conditions and the following disclaimer.
+| 2. Redistributions in binary form must reproduce the above copyright
+|    notice, this list of conditions and the following disclaimer in the
+|    documentation and/or other materials provided with the distribution.
+| 3. All advertising materials mentioning features or use of this software
+|    must display the following acknowledgement:
+|	This product includes software developed by the University of
+|	California, Berkeley and its contributors.
+| 4. Neither the name of the University nor the names of its contributors
+|    may be used to endorse or promote products derived from this software
+|    without specific prior written permission.
+|
+| THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+| ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+| ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+| FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+| OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+| HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+| LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+| OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+| SUCH DAMAGE.
+|
+|	@(#)oc_cksum.s	7.2 (Berkeley) 11/3/90
+|
+|
+| oc_cksum: ones complement 16 bit checksum for MC68020.
+|
+| oc_cksum (buffer, count, strtval)
+|
+| Do a 16 bit ones complement sum of 'count' bytes from 'buffer'.
+| 'strtval' is the starting value of the sum (usually zero).
+|
+| It simplifies life in in_cksum if strtval can be >= 2^16.
+| This routine will work as long as strtval is < 2^31.
+|
+| Performance
+| -----------
+| This routine is intended for MC 68020s but should also work
+| for 68030s.  It (deliberately) does not worry about the alignment
+| of the buffer so will only work on a 68010 if the buffer is
+| aligned on an even address.  (Also, a routine written to use
+| 68010 "loop mode" would almost certainly be faster than this
+| code on a 68010).
+|
+| We do not worry about alignment because this routine is frequently
+| called with small counts: 20 bytes for IP header checksums and 40
+| bytes for TCP ack checksums.  For these small counts, testing for
+| bad alignment adds ~10% to the per-call cost.  Since, by the nature
+| of the kernel allocator, the data we are called with is almost
+| always longword aligned, there is no benefit to this added cost
+| and we are better off letting the loop take a big performance hit
+| in the rare cases where we are handed an unaligned buffer.
+|
+| Loop unrolling constants of 2, 4, 8, 16, 32 and 64 times were
+| tested on random data on four different types of processors (see
+| list below -- 64 was the largest unrolling because anything more
+| overflows the 68020 Icache).  On all the processors, the
+| throughput asymptote was located between 8 and 16 (closer to 8).
+| However, 16 was substantially better than 8 for small counts.
+| (It is clear why this happens for a count of 40: unroll-8 pays a
+| loop branch cost and unroll-16 does not.  But the tests also showed
+| that 16 was better than 8 for a count of 20.  It is not obvious to
+| me why.)  So, since 16 was good for both large and small counts,
+| the loop below is unrolled 16 times.
+| 
+| The processors tested and their average time to checksum 1024 bytes
+| of random data were:
+| 	Sun 3/50 (15MHz)	190 us/KB
+| 	Sun 3/180 (16.6MHz)	175 us/KB
+| 	Sun 3/60 (20MHz)	134 us/KB
+| 	Sun 3/280 (25MHz)	 95 us/KB
+| 
+| The cost of calling this routine was typically 10% of the per-
+| kilobyte cost.  E.g., checksumming zero bytes on a 3/60 cost 9us
+| and each additional byte cost 125ns.  With the high fixed cost,
+| it would clearly be a gain to "inline" this routine -- the
+| subroutine call adds 400% overhead to an IP header checksum.
+| However, in absolute terms, inlining would only gain 10us per
+| packet -- a 1% effect for a 1ms ethernet packet.  This is not
+| enough gain to be worth the effort.
+
+#include <machine/asm.h>
+
+	.text
+
+	.text; .even; .globl _oc_cksum; _oc_cksum:
+	movl	sp@(4),a0	| get buffer ptr
+	movl	sp@(8),d1	| get byte count
+	movl	sp@(12),d0	| get starting value
+	movl	d2,sp@-		| free a reg
+
+	| test for possible 1, 2 or 3 bytes of excess at end
+	| of buffer.  The usual case is no excess (the usual
+	| case is header checksums) so we give that the faster
+	| 'not taken' leg of the compare.  (We do the excess
+	| first because we are about the trash the low order
+	| bits of the count in d1.)
+
+	btst	#0,d1
+	jne	L5		| if one or three bytes excess
+	btst	#1,d1
+	jne	L7		| if two bytes excess
+L1:
+	movl	d1,d2
+	lsrl	#6,d1		| make cnt into # of 64 byte chunks
+	andl	#0x3c,d2	| then find fractions of a chunk
+	negl	d2
+	andb	#0xf,cc		| clear X
+	jmp	pc@(L3-.-2:b,d2)
+L2:
+	movl	a0@+,d2
+	addxl	d2,d0
+	movl	a0@+,d2
+	addxl	d2,d0
+	movl	a0@+,d2
+	addxl	d2,d0
+	movl	a0@+,d2
+	addxl	d2,d0
+	movl	a0@+,d2
+	addxl	d2,d0
+	movl	a0@+,d2
+	addxl	d2,d0
+	movl	a0@+,d2
+	addxl	d2,d0
+	movl	a0@+,d2
+	addxl	d2,d0
+	movl	a0@+,d2
+	addxl	d2,d0
+	movl	a0@+,d2
+	addxl	d2,d0
+	movl	a0@+,d2
+	addxl	d2,d0
+	movl	a0@+,d2
+	addxl	d2,d0
+	movl	a0@+,d2
+	addxl	d2,d0
+	movl	a0@+,d2
+	addxl	d2,d0
+	movl	a0@+,d2
+	addxl	d2,d0
+	movl	a0@+,d2
+	addxl	d2,d0
+L3:
+	dbra	d1,L2		| (NB- dbra does not affect X)
+
+	movl	d0,d1		| fold 32 bit sum to 16 bits
+	swap	d1		| (NB- swap does not affect X)
+	addxw	d1,d0
+	jcc	L4
+	addw	#1,d0
+L4:
+	andl	#0xffff,d0
+	movl	sp@+,d2
+	rts
+
+L5:	| deal with 1 or 3 excess bytes at the end of the buffer.
+	btst	#1,d1
+	jeq	L6		| if 1 excess
+
+	| 3 bytes excess
+	clrl	d2
+	movw	a0@(-3,d1:l),d2	| add in last full word then drop
+	addl	d2,d0		|  through to pick up last byte
+
+L6:	| 1 byte excess
+	clrl	d2
+	movb	a0@(-1,d1:l),d2
+	lsll	#8,d2
+	addl	d2,d0
+	jra	L1
+
+L7:	| 2 bytes excess
+	clrl	d2
+	movw	a0@(-2,d1:l),d2
+	addl	d2,d0
+	jra	L1