| author | chuck <chuck@cvs.openbsd.org> | 1995-11-07 20:22:46 +0000 |
| --- | --- | --- |
| committer | chuck <chuck@cvs.openbsd.org> | 1995-11-07 20:22:46 +0000 |
| commit | 13e018fa2064ca59e3e62a83013422a54354d132 (patch) | |
| tree | e69c70df9635f2e88220e2911caaabfdbca5502a /sys/arch/sparc | |
| parent | 506a9f45b3b4659876b3bd29f6bff4027e18cc16 (diff) | |
optimized in_cksum from Zubin Dittia <zubin@dworkin.wustl.edu>
Zubin says:
The checksum computation code here is significantly faster than its
vanilla C counterpart (by significantly, I mean 2-3 times faster if
the data is in cache, and 1.5-2 times faster if the data is not in
cache).
We optimize on three fronts:
1. By using the add-with-carry (addxcc) instruction, we can use
32-bit operations instead of 16-bit operations.
2. By unrolling the main loop to reduce branch overheads.
3. By doing a sequence of load,load,add,add,load,load,add,add,
we can avoid the extra stall cycle which is incurred if the
instruction immediately following a load tries to use the
target register of the load.
Another possible optimization is to replace a pair of 32-bit loads
with a single 64-bit load (ldd) instruction, but I found that although
this improves performance somewhat on Sun4c machines, it actually
reduces performance considerably on Sun4m machines (because of their
superscalar architecture). So I chose to leave it out.
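For readers who want the flavor of this without the SPARC assembly, here is a minimal, hypothetical C sketch (not part of the commit): accumulating 32-bit words into a wider accumulator stands in for the addxcc carry chain, the loop is unrolled four ways, and the carries are folded back into 16 bits at the end. The name cksum32_sketch and the alignment/length assumptions are illustrative only; the committed code also handles odd alignment, trailing bytes, and byte-swapped mbuf boundaries.

```c
#include <stddef.h>
#include <stdint.h>

/*
 * Hypothetical sketch of the 32-bit-at-a-time idea: a 64-bit accumulator
 * plays the role of the addxcc carry chain, and the loop is unrolled
 * four ways.  Assumes buf is 32-bit aligned and len is a multiple of 16;
 * the real in_cksum() handles the general case.
 */
static uint16_t
cksum32_sketch(const uint32_t *buf, size_t len)
{
    uint64_t sum = 0;
    size_t n;

    for (n = len / 16; n > 0; n--) {
        sum += buf[0];
        sum += buf[1];
        sum += buf[2];
        sum += buf[3];
        buf += 4;
    }

    /* Fold the 64-bit sum down to 16 bits, adding the carries back in. */
    while (sum >> 16)
        sum = (sum & 0xffff) + (sum >> 16);

    return (uint16_t)(0xffff ^ sum);    /* one's complement */
}
```

The load/load/add/add scheduling in point 3 has no direct C equivalent; it only comes into play once the inner loop is written as assembly, as in the diff below.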
Diffstat (limited to 'sys/arch/sparc')
-rw-r--r-- | sys/arch/sparc/sparc/in_cksum.c | 136 |
1 file changed, 91 insertions, 45 deletions
diff --git a/sys/arch/sparc/sparc/in_cksum.c b/sys/arch/sparc/sparc/in_cksum.c
index 9e03990bacb..180271b6ead 100644
--- a/sys/arch/sparc/sparc/in_cksum.c
+++ b/sys/arch/sparc/sparc/in_cksum.c
@@ -1,6 +1,7 @@
 /* $NetBSD: in_cksum.c,v 1.3 1995/04/26 13:30:03 pk Exp $ */

 /*
+ * Copyright (c) 1995 Zubin Dittia.
  * Copyright (c) 1995 Matthew Green.
  * Copyright (c) 1994 Charles Hannum.
  * Copyright (c) 1992, 1993
@@ -55,49 +56,81 @@
  */

 /*
- * This idea here is that we do as many 32 bit operations as possible
- * for maximum efficiency.  We also unroll all loops in to assembly.
- * This gains about 20% extra efficiency over the non-pipelined method.
+ * The checksum computation code here is significantly faster than its
+ * vanilla C counterpart (by significantly, I mean 2-3 times faster if
+ * the data is in cache, and 1.5-2 times faster if the data is not in
+ * cache).
+ * We optimize on three fronts:
+ *   1. By using the add-with-carry (addxcc) instruction, we can use
+ *      32-bit operations instead of 16-bit operations.
+ *   2. By unrolling the main loop to reduce branch overheads.
+ *   3. By doing a sequence of load,load,add,add,load,load,add,add,
+ *      we can avoid the extra stall cycle which is incurred if the
+ *      instruction immediately following a load tries to use the
+ *      target register of the load.
+ * Another possible optimization is to replace a pair of 32-bit loads
+ * with a single 64-bit load (ldd) instruction, but I found that although
+ * this improves performance somewhat on Sun4c machines, it actually
+ * reduces performance considerably on Sun4m machines (because of their
+ * superscaler architecture).  So I chose to leave it out.
  *
- * XXX - this code really needs further performance analysis.  At the
- * moment it has only been run on a SPARC ELC.
+ * Zubin Dittia (zubin@dworkin.wustl.edu)
  */

-#define Asm __asm __volatile
-#define ADD32 Asm(" ld [%2+28],%%i0; ld [%2+24],%%i1; \
-        ld [%2+20],%%i2; ld [%2+16],%%i3; \
-        addcc %0,%%i0,%0; addxcc %0,%%i1,%0; \
-        ld [%2+12],%%i4; ld [%2+8],%%i5; \
-        addxcc %0,%%i2,%0; addxcc %0,%%i3,%0; \
-        ld [%2+4],%%i0; ld [%2],%%i1; \
-        addxcc %0,%%i4,%0; addxcc %0,%%i5,%0; \
-        addxcc %0,%%i0,%0; addxcc %0,%%i1,%0; \
+#define Asm __asm __volatile
+#define ADD64 Asm(" ld [%2],%3; ld [%2+4],%4; \
+        addcc %0,%3,%0; addxcc %0,%4,%0; \
+        ld [%2+8],%3; ld [%2+12],%4; \
+        addxcc %0,%3,%0; addxcc %0,%4,%0; \
+        ld [%2+16],%3; ld [%2+20],%4; \
+        addxcc %0,%3,%0; addxcc %0,%4,%0; \
+        ld [%2+24],%3; ld [%2+28],%4; \
+        addxcc %0,%3,%0; addxcc %0,%4,%0; \
+        ld [%2+32],%3; ld [%2+36],%4; \
+        addxcc %0,%3,%0; addxcc %0,%4,%0; \
+        ld [%2+40],%3; ld [%2+44],%4; \
+        addxcc %0,%3,%0; addxcc %0,%4,%0; \
+        ld [%2+48],%3; ld [%2+52],%4; \
+        addxcc %0,%3,%0; addxcc %0,%4,%0; \
+        ld [%2+56],%3; ld [%2+60],%4; \
+        addxcc %0,%3,%0; addxcc %0,%4,%0; \
         addxcc %0,0,%0" \
-    : "=r" (sum) : "0" (sum), "r" (w) \
-    : "%i0", "%i1", "%i2", "%i3", "%i4", "%i5")
-#define ADD16 Asm(" ld [%2+12],%%i0; ld [%2+8],%%i1; \
-        addcc %0,%%i0,%0; addxcc %0,%%i1,%0; \
-        ld [%2+4],%%i2; ld [%2],%%i3; \
-        addxcc %0,%%i2,%0; addxcc %0,%%i3,%0; \
+    : "=r" (sum) \
+    : "0" (sum), "r" (w), "r" (tmp1), "r" (tmp2))
+#define ADD32 Asm(" ld [%2],%3; ld [%2+4],%4; \
+        addcc %0,%3,%0; addxcc %0,%4,%0; \
+        ld [%2+8],%3; ld [%2+12],%4; \
+        addxcc %0,%3,%0; addxcc %0,%4,%0; \
+        ld [%2+16],%3; ld [%2+20],%4; \
+        addxcc %0,%3,%0; addxcc %0,%4,%0; \
+        ld [%2+24],%3; ld [%2+28],%4; \
+        addxcc %0,%3,%0; addxcc %0,%4,%0; \
         addxcc %0,0,%0" \
-    : "=r" (sum) : "0" (sum), "r" (w) \
-    : "%i0", "%i1", "%i2", "%i3")
-#define ADD8 Asm(" ld [%2+4],%%i0; ld [%2],%%i1; \
-        addcc %0,%%i0,%0; addxcc %0,%%i1,%0; \
+    : "=r" (sum) \
+    : "0" (sum), "r" (w), "r" (tmp1), "r" (tmp2))
+#define ADD16 Asm(" ld [%2],%3; ld [%2+4],%4; \
+        addcc %0,%3,%0; addxcc %0,%4,%0; \
+        ld [%2+8],%3; ld [%2+12],%4; \
+        addxcc %0,%3,%0; addxcc %0,%4,%0; \
         addxcc %0,0,%0" \
-    : "=r" (sum) : "0" (sum), "r" (w) \
-    : "%i0", "%i1")
-#define ADD4 Asm(" ld [%2],%%i0; addcc %0,%%i0,%0; \
+    : "=r" (sum) \
+    : "0" (sum), "r" (w), "r" (tmp1), "r" (tmp2))
+#define ADD8 Asm(" ld [%2],%3; ld [%2+4],%4; \
+        addcc %0,%3,%0; addxcc %0,%4,%0; \
         addxcc %0,0,%0" \
-    : "=r" (sum) : "0" (sum), "r" (w) \
-    : "%i0")
+    : "=r" (sum) \
+    : "0" (sum), "r" (w), "r" (tmp1), "r" (tmp2))
+#define ADD4 Asm(" ld [%2],%3; addcc %0,%3,%0; \
+        addxcc %0,0,%0" \
+    : "=r" (sum) \
+    : "0" (sum), "r" (w), "r" (tmp1))

 #define REDUCE {sum = (sum & 0xffff) + (sum >> 16);}
 #define ADDCARRY {if (sum > 0xffff) sum -= 0xffff;}
 #define ROL {sum = sum << 8;} /* depends on recent REDUCE */
-#define ADDB {ROL; sum += *w; byte_swapped ^= 1;}
-#define ADDS {sum += *(u_short *)w;}
-#define SHIFT(n) {w += n; mlen -= n;}
+#define ADDBYTE {ROL; sum += *w; byte_swapped ^= 1;}
+#define ADDSHORT {sum += *(u_short *)w;}
+#define ADVANCE(n) {w += n; mlen -= n;}

 int
 in_cksum(m, len)
@@ -109,6 +142,13 @@ in_cksum(m, len)
         register int mlen = 0;
         int byte_swapped = 0;

+        /*
+         * Declare two temporary registers for use by the asm code.  We
+         * allow the compiler to pick which specific machine registers to
+         * use, instead of hard-coding this in the asm code above.
+         */
+        register u_int tmp1, tmp2;
+
         for (; m && len; m = m->m_next) {
                 if (m->m_len == 0)
                         continue;
@@ -125,45 +165,50 @@ in_cksum(m, len)
                 if ((3 & (long)w) != 0) {
                         REDUCE;
                         if ((1 & (long)w) != 0 && mlen >= 1) {
-                                ADDB;
-                                SHIFT(1);
+                                ADDBYTE;
+                                ADVANCE(1);
                         }
                         if ((2 & (long)w) != 0 && mlen >= 2) {
-                                ADDS;
-                                SHIFT(2);
+                                ADDSHORT;
+                                ADVANCE(2);
                         }
                 }
+
                 /*
                  * Do as many 32 bit operattions as possible using the
-                 * 32/16/8/4 macro's above, using as many as possible of
+                 * 64/32/16/8/4 macro's above, using as many as possible of
                  * these.
                  */
-                while (mlen >= 32) {
+                while (mlen >= 64) {
+                        ADD64;
+                        ADVANCE(64);
+                }
+                if (mlen >= 32) {
                         ADD32;
-                        SHIFT(32);
+                        ADVANCE(32);
                 }
                 if (mlen >= 16) {
                         ADD16;
-                        SHIFT(16);
+                        ADVANCE(16);
                 }
                 if (mlen >= 8) {
                         ADD8;
-                        SHIFT(8);
+                        ADVANCE(8);
                 }
                 if (mlen >= 4) {
                         ADD4;
-                        SHIFT(4)
+                        ADVANCE(4)
                 }
                 if (mlen == 0)
                         continue;

                 REDUCE;
                 if (mlen >= 2) {
-                        ADDS;
-                        SHIFT(2);
+                        ADDSHORT;
+                        ADVANCE(2);
                 }
                 if (mlen == 1) {
-                        ADDB;
+                        ADDBYTE;
                 }
         }
         if (byte_swapped) {
@@ -172,5 +217,6 @@ in_cksum(m, len)
         }
         REDUCE;
         ADDCARRY;
+
         return (0xffff ^ sum);
 }
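As a closing note on the folding macros visible above: REDUCE and ADDCARRY bring the 32-bit running sum back into 16 bits before the final complement is returned. A standalone, hypothetical rendering of just that tail step (the helper name finish_cksum does not appear in the source):

```c
#include <stdint.h>

/* Hypothetical helper mirroring REDUCE, ADDCARRY, and the final return. */
static uint16_t
finish_cksum(uint32_t sum)
{
    sum = (sum & 0xffff) + (sum >> 16); /* REDUCE */
    if (sum > 0xffff)                   /* ADDCARRY */
        sum -= 0xffff;
    return (uint16_t)(0xffff ^ sum);    /* return (0xffff ^ sum) */
}
```

A single REDUCE leaves a value of at most 0x1fffe, so one ADDCARRY is enough to finish the fold.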