| author | chuck <chuck@cvs.openbsd.org> | 1995-11-07 20:22:46 +0000 |
| --- | --- | --- |
| committer | chuck <chuck@cvs.openbsd.org> | 1995-11-07 20:22:46 +0000 |
| commit | 13e018fa2064ca59e3e62a83013422a54354d132 (patch) | |
| tree | e69c70df9635f2e88220e2911caaabfdbca5502a /sys/arch/sparc | |
| parent | 506a9f45b3b4659876b3bd29f6bff4027e18cc16 (diff) | |
optimized in_cksum from Zubin Dittia <zubin@dworkin.wustl.edu>
Zubin says:
The checksum computation code here is significantly faster than its
vanilla C counterpart (by significantly, I mean 2-3 times faster if
the data is in cache, and 1.5-2 times faster if the data is not in
cache).
We optimize on three fronts:
1. By using the add-with-carry (addxcc) instruction, we can use
32-bit operations instead of 16-bit operations.
2. By unrolling the main loop to reduce branch overheads.
3. By doing a sequence of load,load,add,add,load,load,add,add,
we can avoid the extra stall cycle which is incurred if the
instruction immediately following a load tries to use the
target register of the load.
Another possible optimization is to replace a pair of 32-bit loads
with a single 64-bit load (ldd) instruction, but I found that although
this improves performance somewhat on Sun4c machines, it actually
reduces performance considerably on Sun4m machines (because of their
superscalar architecture). So I chose to leave it out.
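For readers who want the flavor of this without the SPARC assembly, here is a minimal, hypothetical C sketch (not part of the commit): accumulating 32-bit words into a wider accumulator stands in for the addxcc carry chain, the loop is unrolled four ways, and the carries are folded back into 16 bits at the end. The name cksum32_sketch and the alignment/length assumptions are illustrative only; the committed code also handles odd alignment, trailing bytes, and byte-swapped mbuf boundaries.

```c
#include <stddef.h>
#include <stdint.h>

/*
 * Hypothetical sketch of the 32-bit-at-a-time idea: a 64-bit accumulator
 * plays the role of the addxcc carry chain, and the loop is unrolled
 * four ways.  Assumes buf is 32-bit aligned and len is a multiple of 16;
 * the real in_cksum() handles the general case.
 */
static uint16_t
cksum32_sketch(const uint32_t *buf, size_t len)
{
    uint64_t sum = 0;
    size_t n;

    for (n = len / 16; n > 0; n--) {
        sum += buf[0];
        sum += buf[1];
        sum += buf[2];
        sum += buf[3];
        buf += 4;
    }

    /* Fold the 64-bit sum down to 16 bits, adding the carries back in. */
    while (sum >> 16)
        sum = (sum & 0xffff) + (sum >> 16);

    return (uint16_t)(0xffff ^ sum);    /* one's complement */
}
```

The load/load/add/add scheduling in point 3 has no direct C equivalent; it only comes into play once the inner loop is written as assembly, as in the diff below.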
Diffstat (limited to 'sys/arch/sparc')
-rw-r--r-- | sys/arch/sparc/sparc/in_cksum.c | 136 |
1 file changed, 91 insertions, 45 deletions
diff --git a/sys/arch/sparc/sparc/in_cksum.c b/sys/arch/sparc/sparc/in_cksum.c
index 9e03990bacb..180271b6ead 100644
--- a/sys/arch/sparc/sparc/in_cksum.c
+++ b/sys/arch/sparc/sparc/in_cksum.c
@@ -1,6 +1,7 @@
 /* $NetBSD: in_cksum.c,v 1.3 1995/04/26 13:30:03 pk Exp $ */

 /*
+ * Copyright (c) 1995 Zubin Dittia.
  * Copyright (c) 1995 Matthew Green.
  * Copyright (c) 1994 Charles Hannum.
  * Copyright (c) 1992, 1993
@@ -55,49 +56,81 @@
  */

 /*
- * This idea here is that we do as many 32 bit operations as possible
- * for maximum efficiency.  We also unroll all loops in to assembly.
- * This gains about 20% extra efficiency over the non-pipelined method.
+ * The checksum computation code here is significantly faster than its
+ * vanilla C counterpart (by significantly, I mean 2-3 times faster if
+ * the data is in cache, and 1.5-2 times faster if the data is not in
+ * cache).
+ * We optimize on three fronts:
+ *   1. By using the add-with-carry (addxcc) instruction, we can use
+ *      32-bit operations instead of 16-bit operations.
+ *   2. By unrolling the main loop to reduce branch overheads.
+ *   3. By doing a sequence of load,load,add,add,load,load,add,add,
+ *      we can avoid the extra stall cycle which is incurred if the
+ *      instruction immediately following a load tries to use the
+ *      target register of the load.
+ * Another possible optimization is to replace a pair of 32-bit loads
+ * with a single 64-bit load (ldd) instruction, but I found that although
+ * this improves performance somewhat on Sun4c machines, it actually
+ * reduces performance considerably on Sun4m machines (because of their
+ * superscaler architecture).  So I chose to leave it out.
  *
- * XXX - this code really needs further performance analysis.  At the
- * moment it has only been run on a SPARC ELC.
+ * Zubin Dittia (zubin@dworkin.wustl.edu)
  */

-#define Asm __asm __volatile
-#define ADD32 Asm(" ld [%2+28],%%i0; ld [%2+24],%%i1; \
-        ld [%2+20],%%i2; ld [%2+16],%%i3; \
-        addcc %0,%%i0,%0; addxcc %0,%%i1,%0; \
-        ld [%2+12],%%i4; ld [%2+8],%%i5; \
-        addxcc %0,%%i2,%0; addxcc %0,%%i3,%0; \
-        ld [%2+4],%%i0; ld [%2],%%i1; \
-        addxcc %0,%%i4,%0; addxcc %0,%%i5,%0; \
-        addxcc %0,%%i0,%0; addxcc %0,%%i1,%0; \
+#define Asm __asm __volatile
+#define ADD64 Asm(" ld [%2],%3; ld [%2+4],%4; \
+        addcc %0,%3,%0; addxcc %0,%4,%0; \
+        ld [%2+8],%3; ld [%2+12],%4; \
+        addxcc %0,%3,%0; addxcc %0,%4,%0; \
+        ld [%2+16],%3; ld [%2+20],%4; \
+        addxcc %0,%3,%0; addxcc %0,%4,%0; \
+        ld [%2+24],%3; ld [%2+28],%4; \
+        addxcc %0,%3,%0; addxcc %0,%4,%0; \
+        ld [%2+32],%3; ld [%2+36],%4; \
+        addxcc %0,%3,%0; addxcc %0,%4,%0; \
+        ld [%2+40],%3; ld [%2+44],%4; \
+        addxcc %0,%3,%0; addxcc %0,%4,%0; \
+        ld [%2+48],%3; ld [%2+52],%4; \
+        addxcc %0,%3,%0; addxcc %0,%4,%0; \
+        ld [%2+56],%3; ld [%2+60],%4; \
+        addxcc %0,%3,%0; addxcc %0,%4,%0; \
         addxcc %0,0,%0" \
-    : "=r" (sum) : "0" (sum), "r" (w) \
-    : "%i0", "%i1", "%i2", "%i3", "%i4", "%i5")
-#define ADD16 Asm(" ld [%2+12],%%i0; ld [%2+8],%%i1; \
-        addcc %0,%%i0,%0; addxcc %0,%%i1,%0; \
-        ld [%2+4],%%i2; ld [%2],%%i3; \
-        addxcc %0,%%i2,%0; addxcc %0,%%i3,%0; \
+    : "=r" (sum) \
+    : "0" (sum), "r" (w), "r" (tmp1), "r" (tmp2))
+#define ADD32 Asm(" ld [%2],%3; ld [%2+4],%4; \
+        addcc %0,%3,%0; addxcc %0,%4,%0; \
+        ld [%2+8],%3; ld [%2+12],%4; \
+        addxcc %0,%3,%0; addxcc %0,%4,%0; \
+        ld [%2+16],%3; ld [%2+20],%4; \
+        addxcc %0,%3,%0; addxcc %0,%4,%0; \
+        ld [%2+24],%3; ld [%2+28],%4; \
+        addxcc %0,%3,%0; addxcc %0,%4,%0; \
         addxcc %0,0,%0" \
-    : "=r" (sum) : "0" (sum), "r" (w) \
-    : "%i0", "%i1", "%i2", "%i3")
-#define ADD8 Asm(" ld [%2+4],%%i0; ld [%2],%%i1; \
-        addcc %0,%%i0,%0; addxcc %0,%%i1,%0; \
+    : "=r" (sum) \
+    : "0" (sum), "r" (w), "r" (tmp1), "r" (tmp2))
+#define ADD16 Asm(" ld [%2],%3; ld [%2+4],%4; \
+        addcc %0,%3,%0; addxcc %0,%4,%0; \
+        ld [%2+8],%3; ld [%2+12],%4; \
+        addxcc %0,%3,%0; addxcc %0,%4,%0; \
         addxcc %0,0,%0" \
-    : "=r" (sum) : "0" (sum), "r" (w) \
-    : "%i0", "%i1")
-#define ADD4 Asm(" ld [%2],%%i0; addcc %0,%%i0,%0; \
+    : "=r" (sum) \
+    : "0" (sum), "r" (w), "r" (tmp1), "r" (tmp2))
+#define ADD8 Asm(" ld [%2],%3; ld [%2+4],%4; \
+        addcc %0,%3,%0; addxcc %0,%4,%0; \
         addxcc %0,0,%0" \
-    : "=r" (sum) : "0" (sum), "r" (w) \
-    : "%i0")
+    : "=r" (sum) \
+    : "0" (sum), "r" (w), "r" (tmp1), "r" (tmp2))
+#define ADD4 Asm(" ld [%2],%3; addcc %0,%3,%0; \
+        addxcc %0,0,%0" \
+    : "=r" (sum) \
+    : "0" (sum), "r" (w), "r" (tmp1))

 #define REDUCE {sum = (sum & 0xffff) + (sum >> 16);}
 #define ADDCARRY {if (sum > 0xffff) sum -= 0xffff;}
 #define ROL {sum = sum << 8;} /* depends on recent REDUCE */
-#define ADDB {ROL; sum += *w; byte_swapped ^= 1;}
-#define ADDS {sum += *(u_short *)w;}
-#define SHIFT(n) {w += n; mlen -= n;}
+#define ADDBYTE {ROL; sum += *w; byte_swapped ^= 1;}
+#define ADDSHORT {sum += *(u_short *)w;}
+#define ADVANCE(n) {w += n; mlen -= n;}

 int
 in_cksum(m, len)
@@ -109,6 +142,13 @@ in_cksum(m, len)
         register int mlen = 0;
         int byte_swapped = 0;

+        /*
+         * Declare two temporary registers for use by the asm code.  We
+         * allow the compiler to pick which specific machine registers to
+         * use, instead of hard-coding this in the asm code above.
+         */
+        register u_int tmp1, tmp2;
+
         for (; m && len; m = m->m_next) {
                 if (m->m_len == 0)
                         continue;
@@ -125,45 +165,50 @@ in_cksum(m, len)
                 if ((3 & (long)w) != 0) {
                         REDUCE;
                         if ((1 & (long)w) != 0 && mlen >= 1) {
-                                ADDB;
-                                SHIFT(1);
+                                ADDBYTE;
+                                ADVANCE(1);
                         }
                         if ((2 & (long)w) != 0 && mlen >= 2) {
-                                ADDS;
-                                SHIFT(2);
+                                ADDSHORT;
+                                ADVANCE(2);
                         }
                 }
+
                 /*
                  * Do as many 32 bit operattions as possible using the
-                 * 32/16/8/4 macro's above, using as many as possible of
+                 * 64/32/16/8/4 macro's above, using as many as possible of
                  * these.
                  */
-                while (mlen >= 32) {
+                while (mlen >= 64) {
+                        ADD64;
+                        ADVANCE(64);
+                }
+                if (mlen >= 32) {
                         ADD32;
-                        SHIFT(32);
+                        ADVANCE(32);
                 }
                 if (mlen >= 16) {
                         ADD16;
-                        SHIFT(16);
+                        ADVANCE(16);
                 }
                 if (mlen >= 8) {
                         ADD8;
-                        SHIFT(8);
+                        ADVANCE(8);
                 }
                 if (mlen >= 4) {
                         ADD4;
-                        SHIFT(4)
+                        ADVANCE(4)
                 }
                 if (mlen == 0)
                         continue;

                 REDUCE;
                 if (mlen >= 2) {
-                        ADDS;
-                        SHIFT(2);
+                        ADDSHORT;
+                        ADVANCE(2);
                 }
                 if (mlen == 1) {
-                        ADDB;
+                        ADDBYTE;
                 }
         }
         if (byte_swapped) {
@@ -172,5 +217,6 @@ in_cksum(m, len)
         }
         REDUCE;
         ADDCARRY;
+
         return (0xffff ^ sum);
 }
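As a closing note on the folding macros visible above: REDUCE and ADDCARRY bring the 32-bit running sum back into 16 bits before the final complement is returned. A standalone, hypothetical rendering of just that tail step (the helper name finish_cksum does not appear in the source):

```c
#include <stdint.h>

/* Hypothetical helper mirroring REDUCE, ADDCARRY, and the final return. */
static uint16_t
finish_cksum(uint32_t sum)
{
    sum = (sum & 0xffff) + (sum >> 16); /* REDUCE */
    if (sum > 0xffff)                   /* ADDCARRY */
        sum -= 0xffff;
    return (uint16_t)(0xffff ^ sum);    /* return (0xffff ^ sum) */
}
```

A single REDUCE leaves a value of at most 0x1fffe, so one ADDCARRY is enough to finish the fold.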