From 6e5e07917c70bceaab2dcbc8c8a46c9d362fbff2 Mon Sep 17 00:00:00 2001 From: Reyk Floeter Date: Tue, 9 Dec 2014 15:13:58 +0000 Subject: Like libc, also for the kernel: Import new amd64 assembly versions of strchr/index, strrchr/rindex, and strlen that provide a significantly faster performance than our previous .c or .S implementations. Based on NetBSD's code. Tested with different amd64 CPUs. ok deraadt@ mikeb@ --- sys/lib/libkern/arch/amd64/strchr.S | 134 ++++++++++++++++++++++++++---- sys/lib/libkern/arch/amd64/strcmp.S | 121 ++++++++++++--------------- sys/lib/libkern/arch/amd64/strlen.S | 156 +++++++++++++++++++++++++++++++++++ sys/lib/libkern/arch/amd64/strrchr.S | 124 +++++++++++++++++++++++++--- 4 files changed, 442 insertions(+), 93 deletions(-) create mode 100644 sys/lib/libkern/arch/amd64/strlen.S (limited to 'sys/lib/libkern/arch') diff --git a/sys/lib/libkern/arch/amd64/strchr.S b/sys/lib/libkern/arch/amd64/strchr.S index 058c56a5358..f78e030486e 100644 --- a/sys/lib/libkern/arch/amd64/strchr.S +++ b/sys/lib/libkern/arch/amd64/strchr.S @@ -1,21 +1,125 @@ -/* - * Written by J.T. Conklin . - * Public domain. - * Adapted for NetBSD/x86_64 by Frank van der Linden +/* $OpenBSD: strchr.S,v 1.3 2014/12/09 15:13:57 reyk Exp $ */ +/* $NetBSD: strchr.S,v 1.7 2014/03/22 19:16:34 jakllsch Exp $ */ + +/*- + * Copyright (c) 2009 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by David Laight. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. */ +/* See comments in strlen.S about checking words for byte values */ + #include +STRONG_ALIAS(index, strchr) + +/* + * On entry %rdi is the buffer and the low byte of %rsi (%sil) the + * character to search for. + * + * Registers %rdx, %rcx, %r8-%r11 and %rax are also usable + */ + ENTRY(strchr) - movq %rdi,%rax - movb %sil,%cl -L1: - movb (%rax),%dl - cmpb %dl,%cl /* found char? */ - je L2 - incq %rax - testb %dl,%dl /* null terminator? */ - jnz L1 - xorq %rax,%rax -L2: + movabsq $0x0101010101010101,%r8 + + movzbq %sil,%rdx /* value to search for (c) */ + /* These imul are 'directpath' on athlons, so are fast */ + imul $0x80,%r8,%r9 /* 0x8080808080808080 */ + imul %r8,%rdx /* (c) copied to all bytes */ + test $7,%dil + jnz 20f /* jump if misaligned */ + + _ALIGN_TEXT /* one byte nop */ +1: + movq (%rdi),%rax /* bytes to check (x) */ +2: + addq $8,%rdi + mov %rax,%r10 + mov %rax,%r11 /* for 'char' check */ + not %r10 /* invert of data (~x) */ + + xorq %rdx,%r11 /* convert 'char' test to one for NUL */ + subq %r8,%rax /* x - 0x10 */ + movq %r10,%rsi /* ~x */ + subq %r8,%r11 /* (x ^ c) - 0x10 */ +/* + * Here we could check ((x - 0x10) | ((x ^ c) - 0x10)) & 0x80 + * and short-circuit the case where no top bits are set, and + * we continue the loop. + * However it needs 3 more clocks that are difficult to interleave + * in the existing dependency chain ... + */ + andq %r9,%rax /* (x - 0x10) & 0x80 */ + xorq %rdx,%rsi /* c ^ ~x == ~(c ^ x) */ + andq %r9,%r11 /* ((x ^ c) - 0x10) & 0x80 */ + andq %r10,%rax /* (x - 0x10) & 0x80 & ~x */ + jne 10f /* jump if string ends */ + andq %rsi,%r11 /* ((x ^ c) - 0x10) & 0x80 & ~(x ^ c) */ + je 1b /* jump if no match */ + + /* Found char, since LE can use bit scan */ + bsf %r11,%r11 /* 7, 15, 23 ... 63 */ +8: shr $3,%r11 /* 0, 1, 2 .. 7 */ + lea -8(%r11,%rdi),%rax ret + +/* End of string, check whether char is before NUL */ + _ALIGN_TEXT /* adds three byte nop */ +10: + bsf %rax,%rax /* count to NUL */ + andq %rsi,%r11 /* check for char in last 8 bytes */ + je 11f + bsf %r11,%r11 /* NUL and char - see which was first */ + cmp %r11,%rax + jae 8b /* return 'found' if same - searching for NUL */ +11: xor %eax,%eax /* char not found */ + ret + +/* Source misaligned: read aligned word and make low bytes invalid */ +/* I (dsl) think a _ALIGN_TEXT here will slow things down! */ +20: + xor %rcx,%rcx + sub %dil,%cl /* Convert low address values 1..7 ... */ + sbb %rsi,%rsi /* carry was set, so %rsi now ~0u! */ + and $7,%cl /* ... to 7..1 */ + and $~7,%dil /* move address to start of word */ + shl $3,%cl /* now 56, 48 ... 16, 8 */ + movq (%rdi),%rax /* aligned word containing first data */ + xor %rdx,%rsi /* invert of search pattern (~c) */ + je 22f /* searching for 0xff */ +21: shr %cl,%rsi /* ~c in low bytes */ + or %rsi,%rax /* set some bits making low bytes invalid */ + jmp 2b + +/* We are searching for 0xff, so can't use ~pattern for invalid value */ +22: + mov %r8,%r10 /* 0x01 pattern */ + lea (%r8,%r8),%rsi /* 0x02 - bits gets set (above) */ + not %r10 /* now 0xfe */ + sar %cl,%r10 /* top bytes 0xff */ + and %r10,%rax /* clear lsb from unwanted low bytes */ + jmp 21b diff --git a/sys/lib/libkern/arch/amd64/strcmp.S b/sys/lib/libkern/arch/amd64/strcmp.S index 45c33fd60f1..7f1656ee80a 100644 --- a/sys/lib/libkern/arch/amd64/strcmp.S +++ b/sys/lib/libkern/arch/amd64/strcmp.S @@ -1,84 +1,71 @@ +/* $OpenBSD: strcmp.S,v 1.3 2014/12/09 15:13:57 reyk Exp $ */ +/* $NetBSD: strcmp.S,v 1.2 2014/03/22 19:16:34 jakllsch Exp $ */ + /* - * Written by J.T. Conklin . + * Written by J.T. Conklin * Public domain. - * Adapted for NetBSD/x86_64 by Frank van der Linden */ #include -/* - * NOTE: I've unrolled the loop eight times: large enough to make a - * significant difference, and small enough not to totally trash the - * cache. - */ - ENTRY(strcmp) - jmp L2 /* Jump into the loop. */ - -L1: incq %rdi - incq %rsi -L2: movb (%rdi),%cl - testb %cl,%cl /* null terminator */ - jz L3 - cmpb %cl,(%rsi) /* chars match */ - jne L3 - - incq %rdi - incq %rsi - movb (%rdi),%cl - testb %cl,%cl - jz L3 - cmpb %cl,(%rsi) - jne L3 - + /* + * Align s1 to word boundary. + * Consider unrolling loop? + */ +.Ls1align: + testb $7,%dil + je .Ls1aligned + movb (%rdi),%al incq %rdi + movb (%rsi),%dl incq %rsi - movb (%rdi),%cl - testb %cl,%cl - jz L3 - cmpb %cl,(%rsi) - jne L3 + testb %al,%al + je .Ldone + cmpb %al,%dl + je .Ls1align + jmp .Ldone - incq %rdi - incq %rsi - movb (%rdi),%cl - testb %cl,%cl - jz L3 - cmpb %cl,(%rsi) - jne L3 + /* + * Check whether s2 is aligned to a word boundary. If it is, we + * can compare by words. Otherwise we have to compare by bytes. + */ +.Ls1aligned: + testb $7,%sil + jne .Lbyte_loop - incq %rdi - incq %rsi - movb (%rdi),%cl - testb %cl,%cl - jz L3 - cmpb %cl,(%rsi) - jne L3 + movabsq $0x0101010101010101,%r8 + subq $8,%rdi + movabsq $0x8080808080808080,%r9 + subq $8,%rsi - incq %rdi - incq %rsi - movb (%rdi),%cl - testb %cl,%cl - jz L3 - cmpb %cl,(%rsi) - jne L3 + _ALIGN_TEXT +.Lword_loop: + movq 8(%rdi),%rax + addq $8,%rdi + movq 8(%rsi),%rdx + addq $8,%rsi + cmpq %rax,%rdx + jne .Lbyte_loop + subq %r8,%rdx + notq %rax + andq %rax,%rdx + testq %r9,%rdx + je .Lword_loop + _ALIGN_TEXT +.Lbyte_loop: + movb (%rdi),%al incq %rdi + movb (%rsi),%dl incq %rsi - movb (%rdi),%cl - testb %cl,%cl - jz L3 - cmpb %cl,(%rsi) - jne L3 + testb %al,%al + je .Ldone + cmpb %al,%dl + je .Lbyte_loop - incq %rdi - incq %rsi - movb (%rdi),%cl - testb %cl,%cl - jz L3 - cmpb %cl,(%rsi) - je L1 -L3: movzbl (%rdi),%eax /* unsigned comparison */ - movzbl (%rsi),%edx - subl %edx,%eax +.Ldone: + movzbq %al,%rax + movzbq %dl,%rdx + subq %rdx,%rax ret diff --git a/sys/lib/libkern/arch/amd64/strlen.S b/sys/lib/libkern/arch/amd64/strlen.S new file mode 100644 index 00000000000..fd3fd5c95dc --- /dev/null +++ b/sys/lib/libkern/arch/amd64/strlen.S @@ -0,0 +1,156 @@ +/* $OpenBSD: strlen.S,v 1.4 2014/12/09 15:13:57 reyk Exp $ */ +/* $NetBSD: strlen.S,v 1.6 2014/03/22 19:16:34 jakllsch Exp $ */ + +/*- + * Copyright (c) 2009 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by David Laight. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Inspired by a version written by J.T. Conklin + * (Only the long comment really remains his work!) + */ + +#include + +/* + * There are many well known branch-free sequences which are used + * for determining whether a zero-byte is contained within a word. + * These sequences are generally much more efficent than loading + * and comparing each byte individually. + * + * The expression [1,2]: + * + * (1) ~(((x & 0x7f....7f) + 0x7f....7f) | (x | 0x7f....7f)) + * + * evaluates to a non-zero value if any of the bytes in the + * original word is zero. + * + * It also has the useful property that bytes in the result word + * that correspond to non-zero bytes in the original word have + * the value 0x00, while bytes corresponding to zero bytes have + * the value 0x80. This allows calculation of the first (and + * last) occurrence of a zero byte within the word (useful for C's + * str* primitives) by counting the number of leading (or + * trailing) zeros and dividing the result by 8. On machines + * without (or with slow) clz() / ctz() instructions, testing + * each byte in the result word for zero is necessary. + * + * This typically takes 4 instructions (5 on machines without + * "not-or") not including those needed to load the constant. + * + * + * The expression: + * + * (2) ((x - 0x01....01) & 0x80....80 & ~x) + * + * evaluates to a non-zero value if any of the bytes in the + * original word is zero. + * + * On little endian machines, the first byte in the result word + * that corresponds to a zero byte in the original byte is 0x80, + * so clz() can be used as above. On big endian machines, and + * little endian machines without (or with a slow) clz() insn, + * testing each byte in the original for zero is necessary. + * + * This typically takes 3 instructions (4 on machines without + * "and with complement") not including those needed to load + * constants. + * + * + * The expression: + * + * (3) ((x - 0x01....01) & 0x80....80) + * + * always evaluates to a non-zero value if any of the bytes in + * the original word is zero or has the top bit set. + * For strings that are likely to only contain 7-bit ascii these + * false positives will be rare. + * + * To account for possible false positives, each byte of the + * original word must be checked when the expression evaluates to + * a non-zero value. However, because it is simpler than those + * presented above, code that uses it will be faster as long as + * the rate of false positives is low. + * + * This is likely, because the the false positive can only occur + * if the most siginificant bit of a byte within the word is set. + * The expression will never fail for typical 7-bit ASCII strings. + * + * This typically takes 2 instructions not including those needed + * to load constants. + * + * + * [1] Henry S. Warren Jr., "Hacker's Delight", Addison-Westley 2003 + * + * [2] International Business Machines, "The PowerPC Compiler Writer's + * Guide", Warthman Associates, 1996 + */ + +ENTRY(strlen) + movabsq $0x0101010101010101,%r8 + + test $7,%dil + movq %rdi,%rax /* Buffer, %rdi unchanged */ + movabsq $0x8080808080808080,%r9 + jnz 10f /* Jump if misaligned */ + + _ALIGN_TEXT +1: + movq (%rax),%rdx /* get bytes to check */ +2: + addq $8,%rax + mov %rdx,%rcx /* save for later check */ + subq %r8,%rdx /* alg (3) above first */ + not %rcx /* Invert of data */ + andq %r9,%rdx + je 1b /* jump if all 0x01-0x80 */ + + /* Do check from alg (2) above - loops for 0x81..0xff bytes */ + andq %rcx,%rdx + je 1b + + /* Since we are LE, use bit scan for first 0x80 byte */ + sub %rdi,%rax /* length to next word */ + bsf %rdx,%rdx /* 7, 15, 23 ... 63 */ + shr $3,%rdx /* 0, 1, 2 ... 7 */ + lea -8(%rax,%rdx),%rax + ret + +/* Misaligned, read aligned word and make low bytes non-zero */ + _ALIGN_TEXT +10: + mov %al,%cl + mov $1,%rsi + and $7,%cl /* offset into word 1..7 */ + and $~7,%al /* start of word with buffer */ + shl $3,%cl /* bit count 8, 16 .. 56 */ + movq (%rax),%rdx /* first data in high bytes */ + shl %cl,%rsi + dec %rsi + or %rsi,%rdx /* low bytes now non-zero */ + jmp 2b diff --git a/sys/lib/libkern/arch/amd64/strrchr.S b/sys/lib/libkern/arch/amd64/strrchr.S index 097b4f1f699..50561cba786 100644 --- a/sys/lib/libkern/arch/amd64/strrchr.S +++ b/sys/lib/libkern/arch/amd64/strrchr.S @@ -1,21 +1,123 @@ +/* $OpenBSD: strrchr.S,v 1.3 2014/12/09 15:13:57 reyk Exp $ */ +/* $NetBSD: strrchr.S,v 1.3 2014/03/22 19:16:34 jakllsch Exp $ */ + /* - * Written by J.T. Conklin . + * Written by J.T. Conklin * Public domain. - * Adapted for NetBSD/x86_64 by Frank van der Linden */ #include +STRONG_ALIAS(rindex, strrchr) + ENTRY(strrchr) - movb %sil,%cl - xorq %rax,%rax /* init pointer to null */ -L1: + movzbq %sil,%rcx + + /* zero return value */ + xorq %rax,%rax + + /* + * Align to word boundary. + * Consider unrolling loop? + */ +.Lalign: + testb $7,%dil + je .Lword_aligned movb (%rdi),%dl - cmpb %dl,%cl - jne L2 - movq %rdi,%rax -L2: + cmpb %cl,%dl + cmoveq %rdi,%rax incq %rdi - testb %dl,%dl /* null terminator??? */ - jnz L1 + testb %dl,%dl + jne .Lalign + jmp .Ldone + +.Lword_aligned: + /* copy char to all bytes in word */ + movb %cl,%ch + movq %rcx,%rdx + salq $16,%rcx + orq %rdx,%rcx + movq %rcx,%rdx + salq $32,%rcx + orq %rdx,%rcx + + movabsq $0x0101010101010101,%r8 + movabsq $0x8080808080808080,%r9 + + /* Check whether any byte in the word is equal to ch or 0. */ + _ALIGN_TEXT +.Lloop: + movq (%rdi),%rdx + addq $8,%rdi + movq %rdx,%rsi + subq %r8,%rdx + xorq %rcx,%rsi + subq %r8,%rsi + orq %rsi,%rdx + testq %r9,%rdx + je .Lloop + + /* + * In rare cases, the above loop may exit prematurely. We must + * return to the loop if none of the bytes in the word match + * ch or are equal to 0. + */ + + movb -8(%rdi),%dl + cmpb %cl,%dl /* 1st byte == ch? */ + jne 1f + leaq -8(%rdi),%rax +1: testb %dl,%dl /* 1st byte == 0? */ + je .Ldone + + movb -7(%rdi),%dl + cmpb %cl,%dl /* 2nd byte == ch? */ + jne 1f + leaq -7(%rdi),%rax +1: testb %dl,%dl /* 2nd byte == 0? */ + je .Ldone + + movb -6(%rdi),%dl + cmpb %cl,%dl /* 3rd byte == ch? */ + jne 1f + leaq -6(%rdi),%rax +1: testb %dl,%dl /* 3rd byte == 0? */ + je .Ldone + + movb -5(%rdi),%dl + cmpb %cl,%dl /* 4th byte == ch? */ + jne 1f + leaq -5(%rdi),%rax +1: testb %dl,%dl /* 4th byte == 0? */ + je .Ldone + + movb -4(%rdi),%dl + cmpb %cl,%dl /* 5th byte == ch? */ + jne 1f + leaq -4(%rdi),%rax +1: testb %dl,%dl /* 5th byte == 0? */ + je .Ldone + + movb -3(%rdi),%dl + cmpb %cl,%dl /* 6th byte == ch? */ + jne 1f + leaq -3(%rdi),%rax +1: testb %dl,%dl /* 6th byte == 0? */ + je .Ldone + + movb -2(%rdi),%dl + cmpb %cl,%dl /* 7th byte == ch? */ + jne 1f + leaq -2(%rdi),%rax +1: testb %dl,%dl /* 7th byte == 0? */ + je .Ldone + + movb -1(%rdi),%dl + cmpb %cl,%dl /* 8th byte == ch? */ + jne 1f + leaq -1(%rdi),%rax +1: testb %dl,%dl /* 8th byte == 0? */ + jne .Lloop + +.Ldone: ret -- cgit v1.2.3