summaryrefslogtreecommitdiff
path: root/sys
diff options
context:
space:
mode:
authorReyk Floeter <reyk@cvs.openbsd.org>2014-12-09 15:13:58 +0000
committerReyk Floeter <reyk@cvs.openbsd.org>2014-12-09 15:13:58 +0000
commit6e5e07917c70bceaab2dcbc8c8a46c9d362fbff2 (patch)
tree2e3a30f02b21b1937e6667ee9f2c1a6bd9b35748 /sys
parenta80c6be43d3d1690c54c91702f92cabf89858029 (diff)
Like libc, also for the kernel: Import new amd64 assembly versions of
strchr/index, strrchr/rindex, and strlen that provide a significantly faster performance than our previous .c or .S implementations. Based on NetBSD's code. Tested with different amd64 CPUs. ok deraadt@ mikeb@
Diffstat (limited to 'sys')
-rw-r--r--sys/lib/libkern/arch/amd64/strchr.S134
-rw-r--r--sys/lib/libkern/arch/amd64/strcmp.S121
-rw-r--r--sys/lib/libkern/arch/amd64/strlen.S156
-rw-r--r--sys/lib/libkern/arch/amd64/strrchr.S124
4 files changed, 442 insertions, 93 deletions
diff --git a/sys/lib/libkern/arch/amd64/strchr.S b/sys/lib/libkern/arch/amd64/strchr.S
index 058c56a5358..f78e030486e 100644
--- a/sys/lib/libkern/arch/amd64/strchr.S
+++ b/sys/lib/libkern/arch/amd64/strchr.S
@@ -1,21 +1,125 @@
-/*
- * Written by J.T. Conklin <jtc@netbsd.org>.
- * Public domain.
- * Adapted for NetBSD/x86_64 by Frank van der Linden <fvdl@wasabisystems.com>
+/* $OpenBSD: strchr.S,v 1.3 2014/12/09 15:13:57 reyk Exp $ */
+/* $NetBSD: strchr.S,v 1.7 2014/03/22 19:16:34 jakllsch Exp $ */
+
+/*-
+ * Copyright (c) 2009 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by David Laight.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
*/
+/* See comments in strlen.S about checking words for byte values */
+
#include <machine/asm.h>
+STRONG_ALIAS(index, strchr)
+
+/*
+ * On entry %rdi is the buffer and the low byte of %rsi (%sil) the
+ * character to search for.
+ *
+ * Registers %rdx, %rcx, %r8-%r11 and %rax are also usable
+ */
+
ENTRY(strchr)
- movq %rdi,%rax
- movb %sil,%cl
-L1:
- movb (%rax),%dl
- cmpb %dl,%cl /* found char? */
- je L2
- incq %rax
- testb %dl,%dl /* null terminator? */
- jnz L1
- xorq %rax,%rax
-L2:
+ movabsq $0x0101010101010101,%r8
+
+ movzbq %sil,%rdx /* value to search for (c) */
+ /* These imul are 'directpath' on athlons, so are fast */
+ imul $0x80,%r8,%r9 /* 0x8080808080808080 */
+ imul %r8,%rdx /* (c) copied to all bytes */
+ test $7,%dil
+ jnz 20f /* jump if misaligned */
+
+ _ALIGN_TEXT /* one byte nop */
+1:
+ movq (%rdi),%rax /* bytes to check (x) */
+2:
+ addq $8,%rdi
+ mov %rax,%r10
+ mov %rax,%r11 /* for 'char' check */
+ not %r10 /* invert of data (~x) */
+
+ xorq %rdx,%r11 /* convert 'char' test to one for NUL */
+ subq %r8,%rax /* x - 0x10 */
+ movq %r10,%rsi /* ~x */
+ subq %r8,%r11 /* (x ^ c) - 0x10 */
+/*
+ * Here we could check ((x - 0x10) | ((x ^ c) - 0x10)) & 0x80
+ * and short-circuit the case where no top bits are set, and
+ * we continue the loop.
+ * However it needs 3 more clocks that are difficult to interleave
+ * in the existing dependency chain ...
+ */
+ andq %r9,%rax /* (x - 0x10) & 0x80 */
+ xorq %rdx,%rsi /* c ^ ~x == ~(c ^ x) */
+ andq %r9,%r11 /* ((x ^ c) - 0x10) & 0x80 */
+ andq %r10,%rax /* (x - 0x10) & 0x80 & ~x */
+ jne 10f /* jump if string ends */
+ andq %rsi,%r11 /* ((x ^ c) - 0x10) & 0x80 & ~(x ^ c) */
+ je 1b /* jump if no match */
+
+ /* Found char, since LE can use bit scan */
+ bsf %r11,%r11 /* 7, 15, 23 ... 63 */
+8: shr $3,%r11 /* 0, 1, 2 .. 7 */
+ lea -8(%r11,%rdi),%rax
ret
+
+/* End of string, check whether char is before NUL */
+ _ALIGN_TEXT /* adds three byte nop */
+10:
+ bsf %rax,%rax /* count to NUL */
+ andq %rsi,%r11 /* check for char in last 8 bytes */
+ je 11f
+ bsf %r11,%r11 /* NUL and char - see which was first */
+ cmp %r11,%rax
+ jae 8b /* return 'found' if same - searching for NUL */
+11: xor %eax,%eax /* char not found */
+ ret
+
+/* Source misaligned: read aligned word and make low bytes invalid */
+/* I (dsl) think a _ALIGN_TEXT here will slow things down! */
+20:
+ xor %rcx,%rcx
+ sub %dil,%cl /* Convert low address values 1..7 ... */
+ sbb %rsi,%rsi /* carry was set, so %rsi now ~0u! */
+ and $7,%cl /* ... to 7..1 */
+ and $~7,%dil /* move address to start of word */
+ shl $3,%cl /* now 56, 48 ... 16, 8 */
+ movq (%rdi),%rax /* aligned word containing first data */
+ xor %rdx,%rsi /* invert of search pattern (~c) */
+ je 22f /* searching for 0xff */
+21: shr %cl,%rsi /* ~c in low bytes */
+ or %rsi,%rax /* set some bits making low bytes invalid */
+ jmp 2b
+
+/* We are searching for 0xff, so can't use ~pattern for invalid value */
+22:
+ mov %r8,%r10 /* 0x01 pattern */
+ lea (%r8,%r8),%rsi /* 0x02 - bits gets set (above) */
+ not %r10 /* now 0xfe */
+ sar %cl,%r10 /* top bytes 0xff */
+ and %r10,%rax /* clear lsb from unwanted low bytes */
+ jmp 21b
diff --git a/sys/lib/libkern/arch/amd64/strcmp.S b/sys/lib/libkern/arch/amd64/strcmp.S
index 45c33fd60f1..7f1656ee80a 100644
--- a/sys/lib/libkern/arch/amd64/strcmp.S
+++ b/sys/lib/libkern/arch/amd64/strcmp.S
@@ -1,84 +1,71 @@
+/* $OpenBSD: strcmp.S,v 1.3 2014/12/09 15:13:57 reyk Exp $ */
+/* $NetBSD: strcmp.S,v 1.2 2014/03/22 19:16:34 jakllsch Exp $ */
+
/*
- * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Written by J.T. Conklin <jtc@acorntoolworks.com>
* Public domain.
- * Adapted for NetBSD/x86_64 by Frank van der Linden <fvdl@wasabisystems.com>
*/
#include <machine/asm.h>
-/*
- * NOTE: I've unrolled the loop eight times: large enough to make a
- * significant difference, and small enough not to totally trash the
- * cache.
- */
-
ENTRY(strcmp)
- jmp L2 /* Jump into the loop. */
-
-L1: incq %rdi
- incq %rsi
-L2: movb (%rdi),%cl
- testb %cl,%cl /* null terminator */
- jz L3
- cmpb %cl,(%rsi) /* chars match */
- jne L3
-
- incq %rdi
- incq %rsi
- movb (%rdi),%cl
- testb %cl,%cl
- jz L3
- cmpb %cl,(%rsi)
- jne L3
-
+ /*
+ * Align s1 to word boundary.
+ * Consider unrolling loop?
+ */
+.Ls1align:
+ testb $7,%dil
+ je .Ls1aligned
+ movb (%rdi),%al
incq %rdi
+ movb (%rsi),%dl
incq %rsi
- movb (%rdi),%cl
- testb %cl,%cl
- jz L3
- cmpb %cl,(%rsi)
- jne L3
+ testb %al,%al
+ je .Ldone
+ cmpb %al,%dl
+ je .Ls1align
+ jmp .Ldone
- incq %rdi
- incq %rsi
- movb (%rdi),%cl
- testb %cl,%cl
- jz L3
- cmpb %cl,(%rsi)
- jne L3
+ /*
+ * Check whether s2 is aligned to a word boundary. If it is, we
+ * can compare by words. Otherwise we have to compare by bytes.
+ */
+.Ls1aligned:
+ testb $7,%sil
+ jne .Lbyte_loop
- incq %rdi
- incq %rsi
- movb (%rdi),%cl
- testb %cl,%cl
- jz L3
- cmpb %cl,(%rsi)
- jne L3
+ movabsq $0x0101010101010101,%r8
+ subq $8,%rdi
+ movabsq $0x8080808080808080,%r9
+ subq $8,%rsi
- incq %rdi
- incq %rsi
- movb (%rdi),%cl
- testb %cl,%cl
- jz L3
- cmpb %cl,(%rsi)
- jne L3
+ _ALIGN_TEXT
+.Lword_loop:
+ movq 8(%rdi),%rax
+ addq $8,%rdi
+ movq 8(%rsi),%rdx
+ addq $8,%rsi
+ cmpq %rax,%rdx
+ jne .Lbyte_loop
+ subq %r8,%rdx
+ notq %rax
+ andq %rax,%rdx
+ testq %r9,%rdx
+ je .Lword_loop
+ _ALIGN_TEXT
+.Lbyte_loop:
+ movb (%rdi),%al
incq %rdi
+ movb (%rsi),%dl
incq %rsi
- movb (%rdi),%cl
- testb %cl,%cl
- jz L3
- cmpb %cl,(%rsi)
- jne L3
+ testb %al,%al
+ je .Ldone
+ cmpb %al,%dl
+ je .Lbyte_loop
- incq %rdi
- incq %rsi
- movb (%rdi),%cl
- testb %cl,%cl
- jz L3
- cmpb %cl,(%rsi)
- je L1
-L3: movzbl (%rdi),%eax /* unsigned comparison */
- movzbl (%rsi),%edx
- subl %edx,%eax
+.Ldone:
+ movzbq %al,%rax
+ movzbq %dl,%rdx
+ subq %rdx,%rax
ret
diff --git a/sys/lib/libkern/arch/amd64/strlen.S b/sys/lib/libkern/arch/amd64/strlen.S
new file mode 100644
index 00000000000..fd3fd5c95dc
--- /dev/null
+++ b/sys/lib/libkern/arch/amd64/strlen.S
@@ -0,0 +1,156 @@
+/* $OpenBSD: strlen.S,v 1.4 2014/12/09 15:13:57 reyk Exp $ */
+/* $NetBSD: strlen.S,v 1.6 2014/03/22 19:16:34 jakllsch Exp $ */
+
+/*-
+ * Copyright (c) 2009 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by David Laight.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Inspired by a version written by J.T. Conklin <jtc@acorntoolworks.com>
+ * (Only the long comment really remains his work!)
+ */
+
+#include <machine/asm.h>
+
+/*
+ * There are many well known branch-free sequences which are used
+ * for determining whether a zero-byte is contained within a word.
+ * These sequences are generally much more efficient than loading
+ * and comparing each byte individually.
+ *
+ * The expression [1,2]:
+ *
+ * (1) ~(((x & 0x7f....7f) + 0x7f....7f) | (x | 0x7f....7f))
+ *
+ * evaluates to a non-zero value if any of the bytes in the
+ * original word is zero.
+ *
+ * It also has the useful property that bytes in the result word
+ * that correspond to non-zero bytes in the original word have
+ * the value 0x00, while bytes corresponding to zero bytes have
+ * the value 0x80. This allows calculation of the first (and
+ * last) occurrence of a zero byte within the word (useful for C's
+ * str* primitives) by counting the number of leading (or
+ * trailing) zeros and dividing the result by 8. On machines
+ * without (or with slow) clz() / ctz() instructions, testing
+ * each byte in the result word for zero is necessary.
+ *
+ * This typically takes 4 instructions (5 on machines without
+ * "not-or") not including those needed to load the constant.
+ *
+ *
+ * The expression:
+ *
+ * (2) ((x - 0x01....01) & 0x80....80 & ~x)
+ *
+ * evaluates to a non-zero value if any of the bytes in the
+ * original word is zero.
+ *
+ * On little endian machines, the first byte in the result word
+ * that corresponds to a zero byte in the original byte is 0x80,
+ * so clz() can be used as above. On big endian machines, and
+ * little endian machines without (or with a slow) clz() insn,
+ * testing each byte in the original for zero is necessary.
+ *
+ * This typically takes 3 instructions (4 on machines without
+ * "and with complement") not including those needed to load
+ * constants.
+ *
+ *
+ * The expression:
+ *
+ * (3) ((x - 0x01....01) & 0x80....80)
+ *
+ * always evaluates to a non-zero value if any of the bytes in
+ * the original word is zero or has the top bit set.
+ * For strings that are likely to only contain 7-bit ascii these
+ * false positives will be rare.
+ *
+ * To account for possible false positives, each byte of the
+ * original word must be checked when the expression evaluates to
+ * a non-zero value. However, because it is simpler than those
+ * presented above, code that uses it will be faster as long as
+ * the rate of false positives is low.
+ *
+ * This is likely, because the false positive can only occur
+ * if the most significant bit of a byte within the word is set.
+ * The expression will never fail for typical 7-bit ASCII strings.
+ *
+ * This typically takes 2 instructions not including those needed
+ * to load constants.
+ *
+ *
+ * [1] Henry S. Warren Jr., "Hacker's Delight", Addison-Wesley 2003
+ *
+ * [2] International Business Machines, "The PowerPC Compiler Writer's
+ * Guide", Warthman Associates, 1996
+ */
+
+ENTRY(strlen)
+ movabsq $0x0101010101010101,%r8
+
+ test $7,%dil
+ movq %rdi,%rax /* Buffer, %rdi unchanged */
+ movabsq $0x8080808080808080,%r9
+ jnz 10f /* Jump if misaligned */
+
+ _ALIGN_TEXT
+1:
+ movq (%rax),%rdx /* get bytes to check */
+2:
+ addq $8,%rax
+ mov %rdx,%rcx /* save for later check */
+ subq %r8,%rdx /* alg (3) above first */
+ not %rcx /* Invert of data */
+ andq %r9,%rdx
+ je 1b /* jump if all 0x01-0x80 */
+
+ /* Do check from alg (2) above - loops for 0x81..0xff bytes */
+ andq %rcx,%rdx
+ je 1b
+
+ /* Since we are LE, use bit scan for first 0x80 byte */
+ sub %rdi,%rax /* length to next word */
+ bsf %rdx,%rdx /* 7, 15, 23 ... 63 */
+ shr $3,%rdx /* 0, 1, 2 ... 7 */
+ lea -8(%rax,%rdx),%rax
+ ret
+
+/* Misaligned, read aligned word and make low bytes non-zero */
+ _ALIGN_TEXT
+10:
+ mov %al,%cl
+ mov $1,%rsi
+ and $7,%cl /* offset into word 1..7 */
+ and $~7,%al /* start of word with buffer */
+ shl $3,%cl /* bit count 8, 16 .. 56 */
+ movq (%rax),%rdx /* first data in high bytes */
+ shl %cl,%rsi
+ dec %rsi
+ or %rsi,%rdx /* low bytes now non-zero */
+ jmp 2b
diff --git a/sys/lib/libkern/arch/amd64/strrchr.S b/sys/lib/libkern/arch/amd64/strrchr.S
index 097b4f1f699..50561cba786 100644
--- a/sys/lib/libkern/arch/amd64/strrchr.S
+++ b/sys/lib/libkern/arch/amd64/strrchr.S
@@ -1,21 +1,123 @@
+/* $OpenBSD: strrchr.S,v 1.3 2014/12/09 15:13:57 reyk Exp $ */
+/* $NetBSD: strrchr.S,v 1.3 2014/03/22 19:16:34 jakllsch Exp $ */
+
/*
- * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Written by J.T. Conklin <jtc@acorntoolworks.com>
* Public domain.
- * Adapted for NetBSD/x86_64 by Frank van der Linden <fvdl@wasabisystems.com>
*/
#include <machine/asm.h>
+STRONG_ALIAS(rindex, strrchr)
+
ENTRY(strrchr)
- movb %sil,%cl
- xorq %rax,%rax /* init pointer to null */
-L1:
+ movzbq %sil,%rcx
+
+ /* zero return value */
+ xorq %rax,%rax
+
+ /*
+ * Align to word boundary.
+ * Consider unrolling loop?
+ */
+.Lalign:
+ testb $7,%dil
+ je .Lword_aligned
movb (%rdi),%dl
- cmpb %dl,%cl
- jne L2
- movq %rdi,%rax
-L2:
+ cmpb %cl,%dl
+ cmoveq %rdi,%rax
incq %rdi
- testb %dl,%dl /* null terminator??? */
- jnz L1
+ testb %dl,%dl
+ jne .Lalign
+ jmp .Ldone
+
+.Lword_aligned:
+ /* copy char to all bytes in word */
+ movb %cl,%ch
+ movq %rcx,%rdx
+ salq $16,%rcx
+ orq %rdx,%rcx
+ movq %rcx,%rdx
+ salq $32,%rcx
+ orq %rdx,%rcx
+
+ movabsq $0x0101010101010101,%r8
+ movabsq $0x8080808080808080,%r9
+
+ /* Check whether any byte in the word is equal to ch or 0. */
+ _ALIGN_TEXT
+.Lloop:
+ movq (%rdi),%rdx
+ addq $8,%rdi
+ movq %rdx,%rsi
+ subq %r8,%rdx
+ xorq %rcx,%rsi
+ subq %r8,%rsi
+ orq %rsi,%rdx
+ testq %r9,%rdx
+ je .Lloop
+
+ /*
+ * In rare cases, the above loop may exit prematurely. We must
+ * return to the loop if none of the bytes in the word match
+ * ch or are equal to 0.
+ */
+
+ movb -8(%rdi),%dl
+ cmpb %cl,%dl /* 1st byte == ch? */
+ jne 1f
+ leaq -8(%rdi),%rax
+1: testb %dl,%dl /* 1st byte == 0? */
+ je .Ldone
+
+ movb -7(%rdi),%dl
+ cmpb %cl,%dl /* 2nd byte == ch? */
+ jne 1f
+ leaq -7(%rdi),%rax
+1: testb %dl,%dl /* 2nd byte == 0? */
+ je .Ldone
+
+ movb -6(%rdi),%dl
+ cmpb %cl,%dl /* 3rd byte == ch? */
+ jne 1f
+ leaq -6(%rdi),%rax
+1: testb %dl,%dl /* 3rd byte == 0? */
+ je .Ldone
+
+ movb -5(%rdi),%dl
+ cmpb %cl,%dl /* 4th byte == ch? */
+ jne 1f
+ leaq -5(%rdi),%rax
+1: testb %dl,%dl /* 4th byte == 0? */
+ je .Ldone
+
+ movb -4(%rdi),%dl
+ cmpb %cl,%dl /* 5th byte == ch? */
+ jne 1f
+ leaq -4(%rdi),%rax
+1: testb %dl,%dl /* 5th byte == 0? */
+ je .Ldone
+
+ movb -3(%rdi),%dl
+ cmpb %cl,%dl /* 6th byte == ch? */
+ jne 1f
+ leaq -3(%rdi),%rax
+1: testb %dl,%dl /* 6th byte == 0? */
+ je .Ldone
+
+ movb -2(%rdi),%dl
+ cmpb %cl,%dl /* 7th byte == ch? */
+ jne 1f
+ leaq -2(%rdi),%rax
+1: testb %dl,%dl /* 7th byte == 0? */
+ je .Ldone
+
+ movb -1(%rdi),%dl
+ cmpb %cl,%dl /* 8th byte == ch? */
+ jne 1f
+ leaq -1(%rdi),%rax
+1: testb %dl,%dl /* 8th byte == 0? */
+ jne .Lloop
+
+.Ldone:
ret