| path: root/sys/arch/mvme68k/stand/sboot/oc_cksum.S
| blob: 328bf77bab96ede0136dfcf17befbe9232002f14 (plain)
|	$OpenBSD: oc_cksum.S,v 1.3 1996/04/28 10:49:42 deraadt Exp $

| Copyright (c) 1988 Regents of the University of California.
| All rights reserved.
|
| Redistribution and use in source and binary forms, with or without
| modification, are permitted provided that the following conditions
| are met:
| 1. Redistributions of source code must retain the above copyright
|    notice, this list of conditions and the following disclaimer.
| 2. Redistributions in binary form must reproduce the above copyright
|    notice, this list of conditions and the following disclaimer in the
|    documentation and/or other materials provided with the distribution.
| 3. All advertising materials mentioning features or use of this software
|    must display the following acknowledgement:
|	This product includes software developed by the University of
|	California, Berkeley and its contributors.
| 4. Neither the name of the University nor the names of its contributors
|    may be used to endorse or promote products derived from this software
|    without specific prior written permission.
|
| THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
| ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
| ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
| FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
| OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
| HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
| LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
| OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
| SUCH DAMAGE.
|
|	@(#)oc_cksum.s	7.2 (Berkeley) 11/3/90
|
|
| oc_cksum: ones complement 16 bit checksum for MC68020.
|
| oc_cksum (buffer, count, strtval)
|
| Do a 16 bit ones complement sum of 'count' bytes from 'buffer'.
| 'strtval' is the starting value of the sum (usually zero).
|
| It simplifies life in in_cksum if strtval can be >= 2^16.
| This routine will work as long as strtval is < 2^31.
|
| Performance
| -----------
| This routine is intended for MC 68020s but should also work
| for 68030s.  It (deliberately) does not worry about the alignment
| of the buffer so will only work on a 68010 if the buffer is
| aligned on an even address.  (Also, a routine written to use
| 68010 "loop mode" would almost certainly be faster than this
| code on a 68010).
|
| We do not worry about alignment because this routine is frequently
| called with small counts: 20 bytes for IP header checksums and 40
| bytes for TCP ack checksums.  For these small counts, testing for
| bad alignment adds ~10% to the per-call cost.  Since, by the nature
| of the kernel allocator, the data we are called with is almost
| always longword aligned, there is no benefit to this added cost
| and we are better off letting the loop take a big performance hit
| in the rare cases where we are handed an unaligned buffer.
|
| Loop unrolling constants of 2, 4, 8, 16, 32 and 64 times were
| tested on random data on four different types of processors (see
| list below -- 64 was the largest unrolling because anything more
| overflows the 68020 Icache).  On all the processors, the
| throughput asymptote was located between 8 and 16 (closer to 8).
| However, 16 was substantially better than 8 for small counts.
| (It is clear why this happens for a count of 40: unroll-8 pays a
| loop branch cost and unroll-16 does not.  But the tests also showed
| that 16 was better than 8 for a count of 20.  It is not obvious to
| me why.)  So, since 16 was good for both large and small counts,
| the loop below is unrolled 16 times.
| 
| The processors tested and their average time to checksum 1024 bytes
| of random data were:
| 	Sun 3/50 (15MHz)	190 us/KB
| 	Sun 3/180 (16.6MHz)	175 us/KB
| 	Sun 3/60 (20MHz)	134 us/KB
| 	Sun 3/280 (25MHz)	 95 us/KB
| 
| The cost of calling this routine was typically 10% of the per-
| kilobyte cost.  E.g., checksumming zero bytes on a 3/60 cost 9us
| and each additional byte cost 125ns.  With the high fixed cost,
| it would clearly be a gain to "inline" this routine -- the
| subroutine call adds 400% overhead to an IP header checksum.
| However, in absolute terms, inlining would only gain 10us per
| packet -- a 1% effect for a 1ms ethernet packet.  This is not
| enough gain to be worth the effort.

#include <machine/asm.h>

	.text

	| oc_cksum(buffer, count, strtval) -- entry point.
	|
	| Stack arguments on entry: sp@(4) = buffer pointer, sp@(8) = byte
	| count, sp@(12) = starting sum.  The result is returned in d0 as
	| a 16 bit value with the upper word cleared.
	|
	| Register use:
	|	a0	running pointer into the buffer
	|	d0	32 bit accumulator (starts as strtval)
	|	d1	byte count, then the 64 byte chunk loop counter,
	|		then scratch for the final fold
	|	d2	scratch; the caller's value is pushed on entry and
	|		popped again just before rts
	.text; .even; .globl _oc_cksum; _oc_cksum:
	movl	sp@(4),a0	| get buffer ptr
	movl	sp@(8),d1	| get byte count
	movl	sp@(12),d0	| get starting value
	movl	d2,sp@-		| free a reg (restored before rts)

	| test for possible 1, 2 or 3 bytes of excess at end
	| of buffer.  The usual case is no excess (the usual
	| case is header checksums) so we give that the faster
	| 'not taken' leg of the compare.  (We do the excess
	| first because we are about to trash the low order
	| bits of the count in d1.)

	btst	#0,d1
	jne	L5		| if one or three bytes excess
	btst	#1,d1
	jne	L7		| if two bytes excess
L1:
	| Any trailing 1-3 bytes have already been added to d0 by the
	| time we get here; the low two bits of the count are ignored
	| from now on.  Split the count into
	|	d1 = count / 64    (number of full 16 longword passes)
	|	d2 = count & 0x3c  (bytes of whole longwords left over)
	movl	d1,d2
	lsrl	#6,d1		| make cnt into # of 64 byte chunks
	andl	#0x3c,d2	| then find fractions of a chunk
	| Jump into the unrolled loop d2 bytes of code before L3, so
	| that only the last d2/4 movl/addxl pairs run on the first
	| pass.  This relies on each movl/addxl pair assembling to 4
	| bytes of code (one pair per longword of data), so the pairs
	| below must not change in size or number without the mask and
	| shift above being adjusted to match.
	| X is cleared after the negl (which alters the flags) so that
	| the first addxl does not add in a stale carry.
	negl	d2
	andb	#0xf,cc		| clear X
	jmp	pc@(L3-.-2:b,d2)
L2:
	| 16 longwords per pass.  addxl adds in the X (carry) flag left
	| by the previous add, so a carry out of bit 31 is fed back
	| into the sum by the next add instead of being lost.
	movl	a0@+,d2
	addxl	d2,d0
	movl	a0@+,d2
	addxl	d2,d0
	movl	a0@+,d2
	addxl	d2,d0
	movl	a0@+,d2
	addxl	d2,d0
	movl	a0@+,d2
	addxl	d2,d0
	movl	a0@+,d2
	addxl	d2,d0
	movl	a0@+,d2
	addxl	d2,d0
	movl	a0@+,d2
	addxl	d2,d0
	movl	a0@+,d2
	addxl	d2,d0
	movl	a0@+,d2
	addxl	d2,d0
	movl	a0@+,d2
	addxl	d2,d0
	movl	a0@+,d2
	addxl	d2,d0
	movl	a0@+,d2
	addxl	d2,d0
	movl	a0@+,d2
	addxl	d2,d0
	movl	a0@+,d2
	addxl	d2,d0
	movl	a0@+,d2
	addxl	d2,d0
L3:
	| Go round again once for each full 64 byte chunk counted in d1.
	| NOTE(review): dbra counts on the low 16 bits of d1 only, so a
	| byte count of 4MB or more would presumably not be summed in
	| full -- confirm that no caller passes counts that large.
	dbra	d1,L2		| (NB- dbra does not affect X)

	| d1 = d0 with its halves swapped, so the word add below adds
	| the high half of the sum to the low half, together with any
	| carry still pending in X from the last addxl.
	movl	d0,d1		| fold 32 bit sum to 16 bits
	swap	d1		| (NB- swap does not affect X)
	addxw	d1,d0
	jcc	L4
	addw	#1,d0		| end-around carry out of the 16 bit add
L4:
	andl	#0xffff,d0	| return only the folded low word
	movl	sp@+,d2		| restore caller's d2
	rts

	| Tail handlers.  Each adds the trailing bytes to d0 and then
	| rejoins the main path at L1.  They use plain addl with no
	| carry handling; with strtval < 2^31 (see the file header) the
	| at most 0xffff + 0xff00 added here cannot overflow 32 bits.

L5:	| deal with 1 or 3 excess bytes at the end of the buffer.
	btst	#1,d1
	jeq	L6		| if 1 excess

	| 3 bytes excess
	| a0@(-3,d1:l) addresses buffer + count - 3, i.e. the 16 bit
	| word made of the first two of the three excess bytes.
	clrl	d2
	movw	a0@(-3,d1:l),d2	| add in last full word then drop
	addl	d2,d0		|  through to pick up last byte

L6:	| 1 byte excess
	| The last byte (at buffer + count - 1) is shifted into the
	| upper byte of a 16 bit word, as if it were followed by a
	| zero pad byte.
	clrl	d2
	movb	a0@(-1,d1:l),d2
	lsll	#8,d2
	addl	d2,d0
	jra	L1

L7:	| 2 bytes excess
	| Add the final 16 bit word, at buffer + count - 2.
	clrl	d2
	movw	a0@(-2,d1:l),d2
	addl	d2,d0
	jra	L1