Message ID | 20220712192910.351121-6-goldstein.w.n@gmail.com |
---|---|
State | New |
Headers | show |
Series | [v1] x86: Move strcat SSE2 implementation to multiarch/strcat-sse2.S | expand |
On Tue, Jul 12, 2022 at 12:29 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > This commit doesn't affect libc.so.6, it's just housekeeping to prepare > for adding explicit ISA level support. > > Tested build on x86_64 and x86_32 with/without multiarch. > --- > sysdeps/x86_64/multiarch/strcat-sse2.S | 242 ++++++++++++++++++++++++- > sysdeps/x86_64/strcat.S | 239 +----------------------- > 2 files changed, 238 insertions(+), 243 deletions(-) > > diff --git a/sysdeps/x86_64/multiarch/strcat-sse2.S b/sysdeps/x86_64/multiarch/strcat-sse2.S > index 449e102438..244c4a6d74 100644 > --- a/sysdeps/x86_64/multiarch/strcat-sse2.S > +++ b/sysdeps/x86_64/multiarch/strcat-sse2.S > @@ -17,12 +17,242 @@ > <https://www.gnu.org/licenses/>. */ > > #if IS_IN (libc) > +# ifndef STRCAT > +# define STRCAT __strcat_sse2 > +# endif > +#endif > > -# include <sysdep.h> > -# define strcat __strcat_sse2 > +#include <sysdep.h> > + > + .text > +ENTRY (STRCAT) > + movq %rdi, %rcx /* Dest. register. */ > + andl $7, %ecx /* mask alignment bits */ > + movq %rdi, %rax /* Duplicate destination pointer. */ > + movq $0xfefefefefefefeff,%r8 > + > + /* First step: Find end of destination. */ > + jz 4f /* aligned => start loop */ > + > + neg %ecx /* We need to align to 8 bytes. */ > + addl $8,%ecx > + /* Search the first bytes directly. */ > +0: cmpb $0x0,(%rax) /* is byte NUL? */ > + je 2f /* yes => start copy */ > + incq %rax /* increment pointer */ > + decl %ecx > + jnz 0b > + > + > + > + /* Now the source is aligned. Scan for NUL byte. */ > + .p2align 4 > +4: > + /* First unroll. */ > + movq (%rax), %rcx /* get double word (= 8 bytes) in question */ > + addq $8,%rax /* adjust pointer for next word */ > + movq %r8, %rdx /* magic value */ > + addq %rcx, %rdx /* add the magic value to the word. 
We get > + carry bits reported for each byte which > + is *not* 0 */ > + jnc 3f /* highest byte is NUL => return pointer */ > + xorq %rcx, %rdx /* (word+magic)^word */ > + orq %r8, %rdx /* set all non-carry bits */ > + incq %rdx /* add 1: if one carry bit was *not* set > + the addition will not result in 0. */ > + jnz 3f /* found NUL => return pointer */ > + > + /* Second unroll. */ > + movq (%rax), %rcx /* get double word (= 8 bytes) in question */ > + addq $8,%rax /* adjust pointer for next word */ > + movq %r8, %rdx /* magic value */ > + addq %rcx, %rdx /* add the magic value to the word. We get > + carry bits reported for each byte which > + is *not* 0 */ > + jnc 3f /* highest byte is NUL => return pointer */ > + xorq %rcx, %rdx /* (word+magic)^word */ > + orq %r8, %rdx /* set all non-carry bits */ > + incq %rdx /* add 1: if one carry bit was *not* set > + the addition will not result in 0. */ > + jnz 3f /* found NUL => return pointer */ > + > + /* Third unroll. */ > + movq (%rax), %rcx /* get double word (= 8 bytes) in question */ > + addq $8,%rax /* adjust pointer for next word */ > + movq %r8, %rdx /* magic value */ > + addq %rcx, %rdx /* add the magic value to the word. We get > + carry bits reported for each byte which > + is *not* 0 */ > + jnc 3f /* highest byte is NUL => return pointer */ > + xorq %rcx, %rdx /* (word+magic)^word */ > + orq %r8, %rdx /* set all non-carry bits */ > + incq %rdx /* add 1: if one carry bit was *not* set > + the addition will not result in 0. */ > + jnz 3f /* found NUL => return pointer */ > + > + /* Fourth unroll. */ > + movq (%rax), %rcx /* get double word (= 8 bytes) in question */ > + addq $8,%rax /* adjust pointer for next word */ > + movq %r8, %rdx /* magic value */ > + addq %rcx, %rdx /* add the magic value to the word. 
We get > + carry bits reported for each byte which > + is *not* 0 */ > + jnc 3f /* highest byte is NUL => return pointer */ > + xorq %rcx, %rdx /* (word+magic)^word */ > + orq %r8, %rdx /* set all non-carry bits */ > + incq %rdx /* add 1: if one carry bit was *not* set > + the addition will not result in 0. */ > + jz 4b /* no NUL found => continue loop */ > + > + .p2align 4 /* Align, it's a jump target. */ > +3: subq $8,%rax /* correct pointer increment. */ > + > + testb %cl, %cl /* is first byte NUL? */ > + jz 2f /* yes => return */ > + incq %rax /* increment pointer */ > + > + testb %ch, %ch /* is second byte NUL? */ > + jz 2f /* yes => return */ > + incq %rax /* increment pointer */ > + > + testl $0x00ff0000, %ecx /* is third byte NUL? */ > + jz 2f /* yes => return pointer */ > + incq %rax /* increment pointer */ > + > + testl $0xff000000, %ecx /* is fourth byte NUL? */ > + jz 2f /* yes => return pointer */ > + incq %rax /* increment pointer */ > + > + shrq $32, %rcx /* look at other half. */ > + > + testb %cl, %cl /* is first byte NUL? */ > + jz 2f /* yes => return */ > + incq %rax /* increment pointer */ > + > + testb %ch, %ch /* is second byte NUL? */ > + jz 2f /* yes => return */ > + incq %rax /* increment pointer */ > + > + testl $0xff0000, %ecx /* is third byte NUL? */ > + jz 2f /* yes => return pointer */ > + incq %rax /* increment pointer */ > + > +2: > + /* Second step: Copy source to destination. */ > + > + movq %rsi, %rcx /* duplicate */ > + andl $7,%ecx /* mask alignment bits */ > + movq %rax, %rdx /* move around */ > + jz 22f /* aligned => start loop */ > + > + neg %ecx /* align to 8 bytes. */ > + addl $8, %ecx > + /* Align the source pointer. */ > +21: > + movb (%rsi), %al /* Fetch a byte */ > + testb %al, %al /* Is it NUL? */ > + movb %al, (%rdx) /* Store it */ > + jz 24f /* If it was NUL, done! */ > + incq %rsi > + incq %rdx > + decl %ecx > + jnz 21b > + > + /* Now the sources is aligned. 
Unfortunatly we cannot force > + to have both source and destination aligned, so ignore the > + alignment of the destination. */ > + .p2align 4 > +22: > + /* 1st unroll. */ > + movq (%rsi), %rax /* Read double word (8 bytes). */ > + addq $8, %rsi /* Adjust pointer for next word. */ > + movq %rax, %r9 /* Save a copy for NUL finding. */ > + addq %r8, %r9 /* add the magic value to the word. We get > + carry bits reported for each byte which > + is *not* 0 */ > + jnc 23f /* highest byte is NUL => return pointer */ > + xorq %rax, %r9 /* (word+magic)^word */ > + orq %r8, %r9 /* set all non-carry bits */ > + incq %r9 /* add 1: if one carry bit was *not* set > + the addition will not result in 0. */ > + > + jnz 23f /* found NUL => return pointer */ > + > + movq %rax, (%rdx) /* Write value to destination. */ > + addq $8, %rdx /* Adjust pointer. */ > + > + /* 2nd unroll. */ > + movq (%rsi), %rax /* Read double word (8 bytes). */ > + addq $8, %rsi /* Adjust pointer for next word. */ > + movq %rax, %r9 /* Save a copy for NUL finding. */ > + addq %r8, %r9 /* add the magic value to the word. We get > + carry bits reported for each byte which > + is *not* 0 */ > + jnc 23f /* highest byte is NUL => return pointer */ > + xorq %rax, %r9 /* (word+magic)^word */ > + orq %r8, %r9 /* set all non-carry bits */ > + incq %r9 /* add 1: if one carry bit was *not* set > + the addition will not result in 0. */ > + > + jnz 23f /* found NUL => return pointer */ > + > + movq %rax, (%rdx) /* Write value to destination. */ > + addq $8, %rdx /* Adjust pointer. */ > + > + /* 3rd unroll. */ > + movq (%rsi), %rax /* Read double word (8 bytes). */ > + addq $8, %rsi /* Adjust pointer for next word. */ > + movq %rax, %r9 /* Save a copy for NUL finding. */ > + addq %r8, %r9 /* add the magic value to the word. 
We get > + carry bits reported for each byte which > + is *not* 0 */ > + jnc 23f /* highest byte is NUL => return pointer */ > + xorq %rax, %r9 /* (word+magic)^word */ > + orq %r8, %r9 /* set all non-carry bits */ > + incq %r9 /* add 1: if one carry bit was *not* set > + the addition will not result in 0. */ > + > + jnz 23f /* found NUL => return pointer */ > + > + movq %rax, (%rdx) /* Write value to destination. */ > + addq $8, %rdx /* Adjust pointer. */ > + > + /* 4th unroll. */ > + movq (%rsi), %rax /* Read double word (8 bytes). */ > + addq $8, %rsi /* Adjust pointer for next word. */ > + movq %rax, %r9 /* Save a copy for NUL finding. */ > + addq %r8, %r9 /* add the magic value to the word. We get > + carry bits reported for each byte which > + is *not* 0 */ > + jnc 23f /* highest byte is NUL => return pointer */ > + xorq %rax, %r9 /* (word+magic)^word */ > + orq %r8, %r9 /* set all non-carry bits */ > + incq %r9 /* add 1: if one carry bit was *not* set > + the addition will not result in 0. */ > + > + jnz 23f /* found NUL => return pointer */ > + > + movq %rax, (%rdx) /* Write value to destination. */ > + addq $8, %rdx /* Adjust pointer. */ > + jmp 22b /* Next iteration. */ > + > + /* Do the last few bytes. %rax contains the value to write. > + The loop is unrolled twice. */ > + .p2align 4 > +23: > + movb %al, (%rdx) /* 1st byte. */ > + testb %al, %al /* Is it NUL. */ > + jz 24f /* yes, finish. */ > + incq %rdx /* Increment destination. */ > + movb %ah, (%rdx) /* 2nd byte. */ > + testb %ah, %ah /* Is it NUL?. */ > + jz 24f /* yes, finish. */ > + incq %rdx /* Increment destination. */ > + shrq $16, %rax /* Shift... */ > + jmp 23b /* and look at next two bytes in %rax. */ > > -# undef libc_hidden_builtin_def > -# define libc_hidden_builtin_def(strcat) > -#endif > > -#include <sysdeps/x86_64/strcat.S> > +24: > + movq %rdi, %rax /* Source is return value. 
*/ > + retq > +END (STRCAT) > diff --git a/sysdeps/x86_64/strcat.S b/sysdeps/x86_64/strcat.S > index 565a9c785a..fc3e8a9bcf 100644 > --- a/sysdeps/x86_64/strcat.S > +++ b/sysdeps/x86_64/strcat.S > @@ -17,241 +17,6 @@ > License along with the GNU C Library; if not, see > <https://www.gnu.org/licenses/>. */ > > -#include <sysdep.h> > -#include "asm-syntax.h" > - > -/* Will be removed when new strcpy implementation gets merged. */ > - > - .text > -ENTRY (strcat) > - movq %rdi, %rcx /* Dest. register. */ > - andl $7, %ecx /* mask alignment bits */ > - movq %rdi, %rax /* Duplicate destination pointer. */ > - movq $0xfefefefefefefeff,%r8 > - > - /* First step: Find end of destination. */ > - jz 4f /* aligned => start loop */ > - > - neg %ecx /* We need to align to 8 bytes. */ > - addl $8,%ecx > - /* Search the first bytes directly. */ > -0: cmpb $0x0,(%rax) /* is byte NUL? */ > - je 2f /* yes => start copy */ > - incq %rax /* increment pointer */ > - decl %ecx > - jnz 0b > - > - > - > - /* Now the source is aligned. Scan for NUL byte. */ > - .p2align 4 > -4: > - /* First unroll. */ > - movq (%rax), %rcx /* get double word (= 8 bytes) in question */ > - addq $8,%rax /* adjust pointer for next word */ > - movq %r8, %rdx /* magic value */ > - addq %rcx, %rdx /* add the magic value to the word. We get > - carry bits reported for each byte which > - is *not* 0 */ > - jnc 3f /* highest byte is NUL => return pointer */ > - xorq %rcx, %rdx /* (word+magic)^word */ > - orq %r8, %rdx /* set all non-carry bits */ > - incq %rdx /* add 1: if one carry bit was *not* set > - the addition will not result in 0. */ > - jnz 3f /* found NUL => return pointer */ > - > - /* Second unroll. */ > - movq (%rax), %rcx /* get double word (= 8 bytes) in question */ > - addq $8,%rax /* adjust pointer for next word */ > - movq %r8, %rdx /* magic value */ > - addq %rcx, %rdx /* add the magic value to the word. 
We get > - carry bits reported for each byte which > - is *not* 0 */ > - jnc 3f /* highest byte is NUL => return pointer */ > - xorq %rcx, %rdx /* (word+magic)^word */ > - orq %r8, %rdx /* set all non-carry bits */ > - incq %rdx /* add 1: if one carry bit was *not* set > - the addition will not result in 0. */ > - jnz 3f /* found NUL => return pointer */ > - > - /* Third unroll. */ > - movq (%rax), %rcx /* get double word (= 8 bytes) in question */ > - addq $8,%rax /* adjust pointer for next word */ > - movq %r8, %rdx /* magic value */ > - addq %rcx, %rdx /* add the magic value to the word. We get > - carry bits reported for each byte which > - is *not* 0 */ > - jnc 3f /* highest byte is NUL => return pointer */ > - xorq %rcx, %rdx /* (word+magic)^word */ > - orq %r8, %rdx /* set all non-carry bits */ > - incq %rdx /* add 1: if one carry bit was *not* set > - the addition will not result in 0. */ > - jnz 3f /* found NUL => return pointer */ > - > - /* Fourth unroll. */ > - movq (%rax), %rcx /* get double word (= 8 bytes) in question */ > - addq $8,%rax /* adjust pointer for next word */ > - movq %r8, %rdx /* magic value */ > - addq %rcx, %rdx /* add the magic value to the word. We get > - carry bits reported for each byte which > - is *not* 0 */ > - jnc 3f /* highest byte is NUL => return pointer */ > - xorq %rcx, %rdx /* (word+magic)^word */ > - orq %r8, %rdx /* set all non-carry bits */ > - incq %rdx /* add 1: if one carry bit was *not* set > - the addition will not result in 0. */ > - jz 4b /* no NUL found => continue loop */ > - > - .p2align 4 /* Align, it's a jump target. */ > -3: subq $8,%rax /* correct pointer increment. */ > - > - testb %cl, %cl /* is first byte NUL? */ > - jz 2f /* yes => return */ > - incq %rax /* increment pointer */ > - > - testb %ch, %ch /* is second byte NUL? */ > - jz 2f /* yes => return */ > - incq %rax /* increment pointer */ > - > - testl $0x00ff0000, %ecx /* is third byte NUL? 
*/ > - jz 2f /* yes => return pointer */ > - incq %rax /* increment pointer */ > - > - testl $0xff000000, %ecx /* is fourth byte NUL? */ > - jz 2f /* yes => return pointer */ > - incq %rax /* increment pointer */ > - > - shrq $32, %rcx /* look at other half. */ > - > - testb %cl, %cl /* is first byte NUL? */ > - jz 2f /* yes => return */ > - incq %rax /* increment pointer */ > - > - testb %ch, %ch /* is second byte NUL? */ > - jz 2f /* yes => return */ > - incq %rax /* increment pointer */ > - > - testl $0xff0000, %ecx /* is third byte NUL? */ > - jz 2f /* yes => return pointer */ > - incq %rax /* increment pointer */ > - > -2: > - /* Second step: Copy source to destination. */ > - > - movq %rsi, %rcx /* duplicate */ > - andl $7,%ecx /* mask alignment bits */ > - movq %rax, %rdx /* move around */ > - jz 22f /* aligned => start loop */ > - > - neg %ecx /* align to 8 bytes. */ > - addl $8, %ecx > - /* Align the source pointer. */ > -21: > - movb (%rsi), %al /* Fetch a byte */ > - testb %al, %al /* Is it NUL? */ > - movb %al, (%rdx) /* Store it */ > - jz 24f /* If it was NUL, done! */ > - incq %rsi > - incq %rdx > - decl %ecx > - jnz 21b > - > - /* Now the sources is aligned. Unfortunatly we cannot force > - to have both source and destination aligned, so ignore the > - alignment of the destination. */ > - .p2align 4 > -22: > - /* 1st unroll. */ > - movq (%rsi), %rax /* Read double word (8 bytes). */ > - addq $8, %rsi /* Adjust pointer for next word. */ > - movq %rax, %r9 /* Save a copy for NUL finding. */ > - addq %r8, %r9 /* add the magic value to the word. We get > - carry bits reported for each byte which > - is *not* 0 */ > - jnc 23f /* highest byte is NUL => return pointer */ > - xorq %rax, %r9 /* (word+magic)^word */ > - orq %r8, %r9 /* set all non-carry bits */ > - incq %r9 /* add 1: if one carry bit was *not* set > - the addition will not result in 0. */ > - > - jnz 23f /* found NUL => return pointer */ > - > - movq %rax, (%rdx) /* Write value to destination. 
*/ > - addq $8, %rdx /* Adjust pointer. */ > - > - /* 2nd unroll. */ > - movq (%rsi), %rax /* Read double word (8 bytes). */ > - addq $8, %rsi /* Adjust pointer for next word. */ > - movq %rax, %r9 /* Save a copy for NUL finding. */ > - addq %r8, %r9 /* add the magic value to the word. We get > - carry bits reported for each byte which > - is *not* 0 */ > - jnc 23f /* highest byte is NUL => return pointer */ > - xorq %rax, %r9 /* (word+magic)^word */ > - orq %r8, %r9 /* set all non-carry bits */ > - incq %r9 /* add 1: if one carry bit was *not* set > - the addition will not result in 0. */ > - > - jnz 23f /* found NUL => return pointer */ > - > - movq %rax, (%rdx) /* Write value to destination. */ > - addq $8, %rdx /* Adjust pointer. */ > - > - /* 3rd unroll. */ > - movq (%rsi), %rax /* Read double word (8 bytes). */ > - addq $8, %rsi /* Adjust pointer for next word. */ > - movq %rax, %r9 /* Save a copy for NUL finding. */ > - addq %r8, %r9 /* add the magic value to the word. We get > - carry bits reported for each byte which > - is *not* 0 */ > - jnc 23f /* highest byte is NUL => return pointer */ > - xorq %rax, %r9 /* (word+magic)^word */ > - orq %r8, %r9 /* set all non-carry bits */ > - incq %r9 /* add 1: if one carry bit was *not* set > - the addition will not result in 0. */ > - > - jnz 23f /* found NUL => return pointer */ > - > - movq %rax, (%rdx) /* Write value to destination. */ > - addq $8, %rdx /* Adjust pointer. */ > - > - /* 4th unroll. */ > - movq (%rsi), %rax /* Read double word (8 bytes). */ > - addq $8, %rsi /* Adjust pointer for next word. */ > - movq %rax, %r9 /* Save a copy for NUL finding. */ > - addq %r8, %r9 /* add the magic value to the word. 
We get > - carry bits reported for each byte which > - is *not* 0 */ > - jnc 23f /* highest byte is NUL => return pointer */ > - xorq %rax, %r9 /* (word+magic)^word */ > - orq %r8, %r9 /* set all non-carry bits */ > - incq %r9 /* add 1: if one carry bit was *not* set > - the addition will not result in 0. */ > - > - jnz 23f /* found NUL => return pointer */ > - > - movq %rax, (%rdx) /* Write value to destination. */ > - addq $8, %rdx /* Adjust pointer. */ > - jmp 22b /* Next iteration. */ > - > - /* Do the last few bytes. %rax contains the value to write. > - The loop is unrolled twice. */ > - .p2align 4 > -23: > - movb %al, (%rdx) /* 1st byte. */ > - testb %al, %al /* Is it NUL. */ > - jz 24f /* yes, finish. */ > - incq %rdx /* Increment destination. */ > - movb %ah, (%rdx) /* 2nd byte. */ > - testb %ah, %ah /* Is it NUL?. */ > - jz 24f /* yes, finish. */ > - incq %rdx /* Increment destination. */ > - shrq $16, %rax /* Shift... */ > - jmp 23b /* and look at next two bytes in %rax. */ > - > - > -24: > - movq %rdi, %rax /* Source is return value. */ > - retq > -END (strcat) > +#define STRCAT strcat > +#include "multiarch/strcat-sse2.S" > libc_hidden_builtin_def (strcat) > -- > 2.34.1 > LGTM. Thanks.
diff --git a/sysdeps/x86_64/multiarch/strcat-sse2.S b/sysdeps/x86_64/multiarch/strcat-sse2.S index 449e102438..244c4a6d74 100644 --- a/sysdeps/x86_64/multiarch/strcat-sse2.S +++ b/sysdeps/x86_64/multiarch/strcat-sse2.S @@ -17,12 +17,242 @@ <https://www.gnu.org/licenses/>. */ #if IS_IN (libc) +# ifndef STRCAT +# define STRCAT __strcat_sse2 +# endif +#endif -# include <sysdep.h> -# define strcat __strcat_sse2 +#include <sysdep.h> + + .text +ENTRY (STRCAT) + movq %rdi, %rcx /* Dest. register. */ + andl $7, %ecx /* mask alignment bits */ + movq %rdi, %rax /* Duplicate destination pointer. */ + movq $0xfefefefefefefeff,%r8 + + /* First step: Find end of destination. */ + jz 4f /* aligned => start loop */ + + neg %ecx /* We need to align to 8 bytes. */ + addl $8,%ecx + /* Search the first bytes directly. */ +0: cmpb $0x0,(%rax) /* is byte NUL? */ + je 2f /* yes => start copy */ + incq %rax /* increment pointer */ + decl %ecx + jnz 0b + + + + /* Now the source is aligned. Scan for NUL byte. */ + .p2align 4 +4: + /* First unroll. */ + movq (%rax), %rcx /* get double word (= 8 bytes) in question */ + addq $8,%rax /* adjust pointer for next word */ + movq %r8, %rdx /* magic value */ + addq %rcx, %rdx /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc 3f /* highest byte is NUL => return pointer */ + xorq %rcx, %rdx /* (word+magic)^word */ + orq %r8, %rdx /* set all non-carry bits */ + incq %rdx /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz 3f /* found NUL => return pointer */ + + /* Second unroll. */ + movq (%rax), %rcx /* get double word (= 8 bytes) in question */ + addq $8,%rax /* adjust pointer for next word */ + movq %r8, %rdx /* magic value */ + addq %rcx, %rdx /* add the magic value to the word. 
We get + carry bits reported for each byte which + is *not* 0 */ + jnc 3f /* highest byte is NUL => return pointer */ + xorq %rcx, %rdx /* (word+magic)^word */ + orq %r8, %rdx /* set all non-carry bits */ + incq %rdx /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz 3f /* found NUL => return pointer */ + + /* Third unroll. */ + movq (%rax), %rcx /* get double word (= 8 bytes) in question */ + addq $8,%rax /* adjust pointer for next word */ + movq %r8, %rdx /* magic value */ + addq %rcx, %rdx /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc 3f /* highest byte is NUL => return pointer */ + xorq %rcx, %rdx /* (word+magic)^word */ + orq %r8, %rdx /* set all non-carry bits */ + incq %rdx /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz 3f /* found NUL => return pointer */ + + /* Fourth unroll. */ + movq (%rax), %rcx /* get double word (= 8 bytes) in question */ + addq $8,%rax /* adjust pointer for next word */ + movq %r8, %rdx /* magic value */ + addq %rcx, %rdx /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc 3f /* highest byte is NUL => return pointer */ + xorq %rcx, %rdx /* (word+magic)^word */ + orq %r8, %rdx /* set all non-carry bits */ + incq %rdx /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jz 4b /* no NUL found => continue loop */ + + .p2align 4 /* Align, it's a jump target. */ +3: subq $8,%rax /* correct pointer increment. */ + + testb %cl, %cl /* is first byte NUL? */ + jz 2f /* yes => return */ + incq %rax /* increment pointer */ + + testb %ch, %ch /* is second byte NUL? */ + jz 2f /* yes => return */ + incq %rax /* increment pointer */ + + testl $0x00ff0000, %ecx /* is third byte NUL? */ + jz 2f /* yes => return pointer */ + incq %rax /* increment pointer */ + + testl $0xff000000, %ecx /* is fourth byte NUL? 
*/ + jz 2f /* yes => return pointer */ + incq %rax /* increment pointer */ + + shrq $32, %rcx /* look at other half. */ + + testb %cl, %cl /* is first byte NUL? */ + jz 2f /* yes => return */ + incq %rax /* increment pointer */ + + testb %ch, %ch /* is second byte NUL? */ + jz 2f /* yes => return */ + incq %rax /* increment pointer */ + + testl $0xff0000, %ecx /* is third byte NUL? */ + jz 2f /* yes => return pointer */ + incq %rax /* increment pointer */ + +2: + /* Second step: Copy source to destination. */ + + movq %rsi, %rcx /* duplicate */ + andl $7,%ecx /* mask alignment bits */ + movq %rax, %rdx /* move around */ + jz 22f /* aligned => start loop */ + + neg %ecx /* align to 8 bytes. */ + addl $8, %ecx + /* Align the source pointer. */ +21: + movb (%rsi), %al /* Fetch a byte */ + testb %al, %al /* Is it NUL? */ + movb %al, (%rdx) /* Store it */ + jz 24f /* If it was NUL, done! */ + incq %rsi + incq %rdx + decl %ecx + jnz 21b + + /* Now the sources is aligned. Unfortunatly we cannot force + to have both source and destination aligned, so ignore the + alignment of the destination. */ + .p2align 4 +22: + /* 1st unroll. */ + movq (%rsi), %rax /* Read double word (8 bytes). */ + addq $8, %rsi /* Adjust pointer for next word. */ + movq %rax, %r9 /* Save a copy for NUL finding. */ + addq %r8, %r9 /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc 23f /* highest byte is NUL => return pointer */ + xorq %rax, %r9 /* (word+magic)^word */ + orq %r8, %r9 /* set all non-carry bits */ + incq %r9 /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + + jnz 23f /* found NUL => return pointer */ + + movq %rax, (%rdx) /* Write value to destination. */ + addq $8, %rdx /* Adjust pointer. */ + + /* 2nd unroll. */ + movq (%rsi), %rax /* Read double word (8 bytes). */ + addq $8, %rsi /* Adjust pointer for next word. */ + movq %rax, %r9 /* Save a copy for NUL finding. 
*/ + addq %r8, %r9 /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc 23f /* highest byte is NUL => return pointer */ + xorq %rax, %r9 /* (word+magic)^word */ + orq %r8, %r9 /* set all non-carry bits */ + incq %r9 /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + + jnz 23f /* found NUL => return pointer */ + + movq %rax, (%rdx) /* Write value to destination. */ + addq $8, %rdx /* Adjust pointer. */ + + /* 3rd unroll. */ + movq (%rsi), %rax /* Read double word (8 bytes). */ + addq $8, %rsi /* Adjust pointer for next word. */ + movq %rax, %r9 /* Save a copy for NUL finding. */ + addq %r8, %r9 /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc 23f /* highest byte is NUL => return pointer */ + xorq %rax, %r9 /* (word+magic)^word */ + orq %r8, %r9 /* set all non-carry bits */ + incq %r9 /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + + jnz 23f /* found NUL => return pointer */ + + movq %rax, (%rdx) /* Write value to destination. */ + addq $8, %rdx /* Adjust pointer. */ + + /* 4th unroll. */ + movq (%rsi), %rax /* Read double word (8 bytes). */ + addq $8, %rsi /* Adjust pointer for next word. */ + movq %rax, %r9 /* Save a copy for NUL finding. */ + addq %r8, %r9 /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc 23f /* highest byte is NUL => return pointer */ + xorq %rax, %r9 /* (word+magic)^word */ + orq %r8, %r9 /* set all non-carry bits */ + incq %r9 /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + + jnz 23f /* found NUL => return pointer */ + + movq %rax, (%rdx) /* Write value to destination. */ + addq $8, %rdx /* Adjust pointer. */ + jmp 22b /* Next iteration. */ + + /* Do the last few bytes. %rax contains the value to write. + The loop is unrolled twice. 
*/ + .p2align 4 +23: + movb %al, (%rdx) /* 1st byte. */ + testb %al, %al /* Is it NUL. */ + jz 24f /* yes, finish. */ + incq %rdx /* Increment destination. */ + movb %ah, (%rdx) /* 2nd byte. */ + testb %ah, %ah /* Is it NUL?. */ + jz 24f /* yes, finish. */ + incq %rdx /* Increment destination. */ + shrq $16, %rax /* Shift... */ + jmp 23b /* and look at next two bytes in %rax. */ -# undef libc_hidden_builtin_def -# define libc_hidden_builtin_def(strcat) -#endif -#include <sysdeps/x86_64/strcat.S> +24: + movq %rdi, %rax /* Source is return value. */ + retq +END (STRCAT) diff --git a/sysdeps/x86_64/strcat.S b/sysdeps/x86_64/strcat.S index 565a9c785a..fc3e8a9bcf 100644 --- a/sysdeps/x86_64/strcat.S +++ b/sysdeps/x86_64/strcat.S @@ -17,241 +17,6 @@ License along with the GNU C Library; if not, see <https://www.gnu.org/licenses/>. */ -#include <sysdep.h> -#include "asm-syntax.h" - -/* Will be removed when new strcpy implementation gets merged. */ - - .text -ENTRY (strcat) - movq %rdi, %rcx /* Dest. register. */ - andl $7, %ecx /* mask alignment bits */ - movq %rdi, %rax /* Duplicate destination pointer. */ - movq $0xfefefefefefefeff,%r8 - - /* First step: Find end of destination. */ - jz 4f /* aligned => start loop */ - - neg %ecx /* We need to align to 8 bytes. */ - addl $8,%ecx - /* Search the first bytes directly. */ -0: cmpb $0x0,(%rax) /* is byte NUL? */ - je 2f /* yes => start copy */ - incq %rax /* increment pointer */ - decl %ecx - jnz 0b - - - - /* Now the source is aligned. Scan for NUL byte. */ - .p2align 4 -4: - /* First unroll. */ - movq (%rax), %rcx /* get double word (= 8 bytes) in question */ - addq $8,%rax /* adjust pointer for next word */ - movq %r8, %rdx /* magic value */ - addq %rcx, %rdx /* add the magic value to the word. 
We get - carry bits reported for each byte which - is *not* 0 */ - jnc 3f /* highest byte is NUL => return pointer */ - xorq %rcx, %rdx /* (word+magic)^word */ - orq %r8, %rdx /* set all non-carry bits */ - incq %rdx /* add 1: if one carry bit was *not* set - the addition will not result in 0. */ - jnz 3f /* found NUL => return pointer */ - - /* Second unroll. */ - movq (%rax), %rcx /* get double word (= 8 bytes) in question */ - addq $8,%rax /* adjust pointer for next word */ - movq %r8, %rdx /* magic value */ - addq %rcx, %rdx /* add the magic value to the word. We get - carry bits reported for each byte which - is *not* 0 */ - jnc 3f /* highest byte is NUL => return pointer */ - xorq %rcx, %rdx /* (word+magic)^word */ - orq %r8, %rdx /* set all non-carry bits */ - incq %rdx /* add 1: if one carry bit was *not* set - the addition will not result in 0. */ - jnz 3f /* found NUL => return pointer */ - - /* Third unroll. */ - movq (%rax), %rcx /* get double word (= 8 bytes) in question */ - addq $8,%rax /* adjust pointer for next word */ - movq %r8, %rdx /* magic value */ - addq %rcx, %rdx /* add the magic value to the word. We get - carry bits reported for each byte which - is *not* 0 */ - jnc 3f /* highest byte is NUL => return pointer */ - xorq %rcx, %rdx /* (word+magic)^word */ - orq %r8, %rdx /* set all non-carry bits */ - incq %rdx /* add 1: if one carry bit was *not* set - the addition will not result in 0. */ - jnz 3f /* found NUL => return pointer */ - - /* Fourth unroll. */ - movq (%rax), %rcx /* get double word (= 8 bytes) in question */ - addq $8,%rax /* adjust pointer for next word */ - movq %r8, %rdx /* magic value */ - addq %rcx, %rdx /* add the magic value to the word. 
We get - carry bits reported for each byte which - is *not* 0 */ - jnc 3f /* highest byte is NUL => return pointer */ - xorq %rcx, %rdx /* (word+magic)^word */ - orq %r8, %rdx /* set all non-carry bits */ - incq %rdx /* add 1: if one carry bit was *not* set - the addition will not result in 0. */ - jz 4b /* no NUL found => continue loop */ - - .p2align 4 /* Align, it's a jump target. */ -3: subq $8,%rax /* correct pointer increment. */ - - testb %cl, %cl /* is first byte NUL? */ - jz 2f /* yes => return */ - incq %rax /* increment pointer */ - - testb %ch, %ch /* is second byte NUL? */ - jz 2f /* yes => return */ - incq %rax /* increment pointer */ - - testl $0x00ff0000, %ecx /* is third byte NUL? */ - jz 2f /* yes => return pointer */ - incq %rax /* increment pointer */ - - testl $0xff000000, %ecx /* is fourth byte NUL? */ - jz 2f /* yes => return pointer */ - incq %rax /* increment pointer */ - - shrq $32, %rcx /* look at other half. */ - - testb %cl, %cl /* is first byte NUL? */ - jz 2f /* yes => return */ - incq %rax /* increment pointer */ - - testb %ch, %ch /* is second byte NUL? */ - jz 2f /* yes => return */ - incq %rax /* increment pointer */ - - testl $0xff0000, %ecx /* is third byte NUL? */ - jz 2f /* yes => return pointer */ - incq %rax /* increment pointer */ - -2: - /* Second step: Copy source to destination. */ - - movq %rsi, %rcx /* duplicate */ - andl $7,%ecx /* mask alignment bits */ - movq %rax, %rdx /* move around */ - jz 22f /* aligned => start loop */ - - neg %ecx /* align to 8 bytes. */ - addl $8, %ecx - /* Align the source pointer. */ -21: - movb (%rsi), %al /* Fetch a byte */ - testb %al, %al /* Is it NUL? */ - movb %al, (%rdx) /* Store it */ - jz 24f /* If it was NUL, done! */ - incq %rsi - incq %rdx - decl %ecx - jnz 21b - - /* Now the sources is aligned. Unfortunatly we cannot force - to have both source and destination aligned, so ignore the - alignment of the destination. */ - .p2align 4 -22: - /* 1st unroll. 
*/ - movq (%rsi), %rax /* Read double word (8 bytes). */ - addq $8, %rsi /* Adjust pointer for next word. */ - movq %rax, %r9 /* Save a copy for NUL finding. */ - addq %r8, %r9 /* add the magic value to the word. We get - carry bits reported for each byte which - is *not* 0 */ - jnc 23f /* highest byte is NUL => return pointer */ - xorq %rax, %r9 /* (word+magic)^word */ - orq %r8, %r9 /* set all non-carry bits */ - incq %r9 /* add 1: if one carry bit was *not* set - the addition will not result in 0. */ - - jnz 23f /* found NUL => return pointer */ - - movq %rax, (%rdx) /* Write value to destination. */ - addq $8, %rdx /* Adjust pointer. */ - - /* 2nd unroll. */ - movq (%rsi), %rax /* Read double word (8 bytes). */ - addq $8, %rsi /* Adjust pointer for next word. */ - movq %rax, %r9 /* Save a copy for NUL finding. */ - addq %r8, %r9 /* add the magic value to the word. We get - carry bits reported for each byte which - is *not* 0 */ - jnc 23f /* highest byte is NUL => return pointer */ - xorq %rax, %r9 /* (word+magic)^word */ - orq %r8, %r9 /* set all non-carry bits */ - incq %r9 /* add 1: if one carry bit was *not* set - the addition will not result in 0. */ - - jnz 23f /* found NUL => return pointer */ - - movq %rax, (%rdx) /* Write value to destination. */ - addq $8, %rdx /* Adjust pointer. */ - - /* 3rd unroll. */ - movq (%rsi), %rax /* Read double word (8 bytes). */ - addq $8, %rsi /* Adjust pointer for next word. */ - movq %rax, %r9 /* Save a copy for NUL finding. */ - addq %r8, %r9 /* add the magic value to the word. We get - carry bits reported for each byte which - is *not* 0 */ - jnc 23f /* highest byte is NUL => return pointer */ - xorq %rax, %r9 /* (word+magic)^word */ - orq %r8, %r9 /* set all non-carry bits */ - incq %r9 /* add 1: if one carry bit was *not* set - the addition will not result in 0. */ - - jnz 23f /* found NUL => return pointer */ - - movq %rax, (%rdx) /* Write value to destination. */ - addq $8, %rdx /* Adjust pointer. 
*/ - - /* 4th unroll. */ - movq (%rsi), %rax /* Read double word (8 bytes). */ - addq $8, %rsi /* Adjust pointer for next word. */ - movq %rax, %r9 /* Save a copy for NUL finding. */ - addq %r8, %r9 /* add the magic value to the word. We get - carry bits reported for each byte which - is *not* 0 */ - jnc 23f /* highest byte is NUL => return pointer */ - xorq %rax, %r9 /* (word+magic)^word */ - orq %r8, %r9 /* set all non-carry bits */ - incq %r9 /* add 1: if one carry bit was *not* set - the addition will not result in 0. */ - - jnz 23f /* found NUL => return pointer */ - - movq %rax, (%rdx) /* Write value to destination. */ - addq $8, %rdx /* Adjust pointer. */ - jmp 22b /* Next iteration. */ - - /* Do the last few bytes. %rax contains the value to write. - The loop is unrolled twice. */ - .p2align 4 -23: - movb %al, (%rdx) /* 1st byte. */ - testb %al, %al /* Is it NUL. */ - jz 24f /* yes, finish. */ - incq %rdx /* Increment destination. */ - movb %ah, (%rdx) /* 2nd byte. */ - testb %ah, %ah /* Is it NUL?. */ - jz 24f /* yes, finish. */ - incq %rdx /* Increment destination. */ - shrq $16, %rax /* Shift... */ - jmp 23b /* and look at next two bytes in %rax. */ - - -24: - movq %rdi, %rax /* Source is return value. */ - retq -END (strcat) +#define STRCAT strcat +#include "multiarch/strcat-sse2.S" libc_hidden_builtin_def (strcat)