| Message ID | 20150827162836.GA28702@domone |
|---|---|
| State | New |
On Thu, Aug 27, 2015 at 9:28 AM, Ondřej Bílka <neleai@seznam.cz> wrote:
> Hi,
>
> I had an idea, which I mentioned on the aarch64 thread, to improve memset
> and memcpy performance by using conditional moves to avoid the branch
> misprediction penalty. Previously, setting one byte was very slow, as we
> needed a decision tree to determine the implementation for each size, and
> some range had to take the slow path, so we selected this one as unlikely.
>
> The trick that I use could be done in C to handle n less than 16:
>
> char stack[64];
> uint64_t *p1 = (uint64_t *) s;
> uint32_t *p2 = (uint32_t *) (s + (n & 8));
> uint16_t *p3 = (uint16_t *) (s + (n & 12));
> char *p4 = s + n - 1;
> if ((n & 8) == 0) p1 = (uint64_t *) stack;
> if ((n & 4) == 0) p2 = (uint32_t *) stack;
> if ((n & 2) == 0) p3 = (uint16_t *) stack;
> if ((n & 1) == 0) p4 = stack;
> *p1 = vc;
> *p2 = vc;
> *p3 = vc;
> *p4 = vc;
>
> For the 32-128 byte range it is the same idea as the following, which
> handles the 16-64 range with 8-byte moves:
>
> char stack[64];
> int add = n > 32 ? 16 : 0;
> int sub = n > 32 ? -16 : 0;
> *((uint64_t *)(s)) = vc;
> *((uint64_t *)(s + 8)) = vc;
> *((uint64_t *)(s + add)) = vc;
> *((uint64_t *)(s + 8 + add)) = vc;
> *((uint64_t *)(s + n - 16)) = vc;
> *((uint64_t *)(s + 8 + n - 16)) = vc;
> *((uint64_t *)(s + n - 16 + sub)) = vc;
> *((uint64_t *)(s + 8 + n - 16 + sub)) = vc;
>
> This could be used generically. I believe that on arm it could give a
> considerable speedup, but that would need access to a machine for testing.
>
> This gives on average 5% over the current implementation; see the profile
> results below. I will resend an alternate rep stosq implementation later:
>
> http://kam.mff.cuni.cz/~ondra/benchmark_string/memset_profile_cmov.html
> http://kam.mff.cuni.cz/~ondra/benchmark_string/memset_profile_cmov270815.tar.bz2
>
> That, as I recall, was where I left memset, as I didn't know for which
> sizes to select it: that needs to be determined empirically and differs
> for each architecture. The problem is that on most architectures it is
> faster when data are in L3 cache or main memory, and I don't know at what
> size that usually starts; it may be at sizes much smaller than the cache
> size limit.
>
> I also included an optimization of the counted loop, which helps the
> processor correctly predict the loop end up to 512 bytes.
>
> The second change is that I extended the header to handle sizes up to
> 256 bytes, which gives an additional one percent or so.
>
> There is an additional micro-optimization possible: I could save a few
> cycles with SSSE3 pshufb. Should I add another variant for that,
> surrounded by the standard #ifdef USE_SSSE3 pattern?
>
> I didn't do more performance testing yet, as I plan to use this pattern
> also for memcpy and strcpy.
>
> So, comments?
>
>         * sysdeps/x86_64/memset.S: Use conditional moves to improve
>         performance.
>
> diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
> index e496254..bea402d 100644
> --- a/sysdeps/x86_64/memset.S
> +++ b/sysdeps/x86_64/memset.S
> @@ -24,7 +24,7 @@
> ENTRY(__bzero)
> movq %rdi, %rax /* Set return value. */
> movq %rsi, %rdx /* Set n. */
> - pxor %xmm8, %xmm8
> + pxor %xmm0, %xmm0

The %xmm8->%xmm0 changes are in the master branch now.

> jmp L(entry_from_bzero)
> END(__bzero)
> weak_alias (__bzero, bzero)
> @@ -33,10 +33,10 @@ weak_alias (__bzero, bzero)
> ENTRY(__memset_tail)
> movq %rcx, %rax /* Set return value. */
>
> - movd %esi, %xmm8
> - punpcklbw %xmm8, %xmm8
> - punpcklwd %xmm8, %xmm8
> - pshufd $0, %xmm8, %xmm8
> + movd %esi, %xmm0
> + punpcklbw %xmm0, %xmm0
> + punpcklwd %xmm0, %xmm0
> + pshufd $0, %xmm0, %xmm0
>
> jmp L(entry_from_bzero)
> END(__memset_tail)
> @@ -50,78 +50,116 @@ END_CHK (__memset_chk)
> #endif
>
> ENTRY (memset)
> - movd %esi, %xmm8
> + movd %esi, %xmm0
> movq %rdi, %rax
> - punpcklbw %xmm8, %xmm8
> - punpcklwd %xmm8, %xmm8
> - pshufd $0, %xmm8, %xmm8
> + punpcklbw %xmm0, %xmm0
> + punpcklwd %xmm0, %xmm0
> + pshufd $0, %xmm0, %xmm0
> L(entry_from_bzero):
> - cmpq $64, %rdx
> + cmpq $256, %rdx
> ja L(loop_start)
> - cmpq $16, %rdx
> - jbe L(less_16_bytes)
> - cmpq $32, %rdx
> - movdqu %xmm8, (%rdi)
> - movdqu %xmm8, -16(%rdi,%rdx)
> - ja L(between_32_64_bytes)
> -L(return):
> - rep
> + cmp $128, %edx
> + jae L(between_128_256_bytes)
> +
> + cmp $32, %edx
> + jb L(less_32_bytes)
> +
> + lea -32(%rdi), %rcx
> + lea 32(%rdi), %rsi
> +
> + cmp $64, %edx
> + cmovb %rdi, %rcx
> + cmovb %rdi, %rsi
> +
> + movdqu %xmm0, (%rdi)
> + movdqu %xmm0, 16(%rdi)
> + movdqu %xmm0, (%rsi)
> + movdqu %xmm0, 16(%rsi)
> +
> + movdqu %xmm0, -32(%rcx, %rdx)
> + movdqu %xmm0, -16(%rcx, %rdx)
> + movdqu %xmm0, -32(%rdi, %rdx)
> + movdqu %xmm0, -16(%rdi, %rdx)
> ret
> +
> .p2align 4
> -L(between_32_64_bytes):
> - movdqu %xmm8, 16(%rdi)
> - movdqu %xmm8, -32(%rdi,%rdx)
> +L(between_128_256_bytes):
> + lea (%rdi, %rdx), %rsi
> + movdqu %xmm0, (%rdi)
> + movdqu %xmm0, 16(%rdi)
> + movdqu %xmm0, 32(%rdi)
> + movdqu %xmm0, 48(%rdi)
> + movdqu %xmm0, 64(%rdi)
> + movdqu %xmm0, 80(%rdi)
> + movdqu %xmm0, 96(%rdi)
> + movdqu %xmm0, 112(%rdi)
> + movdqu %xmm0, -128(%rsi)
> + movdqu %xmm0, -112(%rsi)
> + movdqu %xmm0, -96(%rsi)
> + movdqu %xmm0, -80(%rsi)
> + movdqu %xmm0, -64(%rsi)
> + movdqu %xmm0, -48(%rsi)
> + movdqu %xmm0, -32(%rsi)
> + movdqu %xmm0, -16(%rsi)
> ret
> +
> .p2align 4
> L(loop_start):
> - leaq 64(%rdi), %rcx
> - movdqu %xmm8, (%rdi)
> - andq $-64, %rcx
> - movdqu %xmm8, -16(%rdi,%rdx)
> - movdqu %xmm8, 16(%rdi)
> - movdqu %xmm8, -32(%rdi,%rdx)
> - movdqu %xmm8, 32(%rdi)
> - movdqu %xmm8, -48(%rdi,%rdx)
> - movdqu %xmm8, 48(%rdi)
> - movdqu %xmm8, -64(%rdi,%rdx)
> - addq %rdi, %rdx
> - andq $-64, %rdx
> - cmpq %rdx, %rcx
> - je L(return)
> - .p2align 4
> + lea (%rdi, %rdx), %rsi
> + leaq 16(%rdi), %rcx
> + andq $-16, %rcx
> + movdqu %xmm0, -16(%rsi)
> + movdqu %xmm0, -32(%rsi)
> + movdqu %xmm0, -48(%rsi)
> + movdqu %xmm0, -64(%rsi)
> + movdqu %xmm0, (%rdi)
> + subq %rcx, %rsi
> + shrq $6, %rsi
> L(loop):
> - movdqa %xmm8, (%rcx)
> - movdqa %xmm8, 16(%rcx)
> - movdqa %xmm8, 32(%rcx)
> - movdqa %xmm8, 48(%rcx)
> + movdqa %xmm0, (%rcx)
> + movdqa %xmm0, 16(%rcx)
> + movdqa %xmm0, 32(%rcx)
> + movdqa %xmm0, 48(%rcx)
> addq $64, %rcx
> - cmpq %rcx, %rdx
> + subq $1, %rsi
> jne L(loop)
> - rep
> - ret
> -L(less_16_bytes):
> - movq %xmm8, %rcx
> - testb $24, %dl
> - jne L(between8_16bytes)
> - testb $4, %dl
> - jne L(between4_7bytes)
> - testb $1, %dl
> - je L(odd_byte)
> - movb %cl, (%rdi)
> -L(odd_byte):
> - testb $2, %dl
> - je L(return)
> - movw %cx, -2(%rax,%rdx)
> - ret
> -L(between4_7bytes):
> - movl %ecx, (%rdi)
> - movl %ecx, -4(%rdi,%rdx)
> - ret
> -L(between8_16bytes):
> - movq %rcx, (%rdi)
> - movq %rcx, -8(%rdi,%rdx)
> ret
>
> +L(less_32_bytes):
> + movq %xmm0, %rcx
> + lea -64(%rsp), %r10
> + sub %rax, %r10
> + lea 64(%rdi), %r9
> + lea 63(%rdi, %rdx), %rsi
> + mov %rdx, %r8
> + mov %rdx, %r11
> + mov %rdx, %rdi
> + and $16, %r8
> + and $24, %rdi
> + and $28, %r11
> +
> + test $16, %edx
> + cmove %rsp, %r9
> +
> + test $8, %edx
> + cmove %r10, %r8
> +
> + test $4, %edx
> + cmove %r10, %rdi
> +
> + test $2, %edx
> + cmove %r10, %r11
> +
> + test $1, %edx
> + cmove %rsp, %rsi
> +
> + movdqu %xmm0, -64(%r9)
> + movq %rcx, (%rax, %r8)
> + movl %ecx, (%rax, %rdi)
> + movw %cx, (%rax, %r11)
> + movb %cl, -64(%rsi)
> +
> + ret
> END (memset)
> libc_hidden_builtin_def (memset)
>

They look OK to me.  Thanks.
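
For readers following the discussion in C rather than assembly, here is a minimal, self-contained sketch of the branchless tail-store idea used in the L(less_32_bytes) path. It is illustrative only: the name small_memset, the 16-byte dummy buffer, and the memcpy-based unaligned stores are assumptions of this sketch rather than part of the patch, and a compiler is free (but not required) to turn the ternary selects into conditional moves.

#include <stdint.h>
#include <string.h>

/* Branchless-style memset for n in 0..15: every store always executes,
   but its target is either the correct offset in S or a dummy buffer,
   so no size-dependent branch sits on the store path.  */
static void
small_memset (char *s, int c, size_t n)
{
  char dummy[16];                    /* scratch target for "disabled" stores */
  uint64_t vc = (unsigned char) c * 0x0101010101010101ULL;

  /* An 8-byte store at 0, a 4-byte store at (n & 8), a 2-byte store at
     (n & 12) and a 1-byte store at n - 1 together cover exactly n bytes.  */
  char *p8 = (n & 8) ? s : dummy;
  char *p4 = (n & 4) ? s + (n & 8) : dummy;
  char *p2 = (n & 2) ? s + (n & 12) : dummy;
  char *p1 = (n & 1) ? s + n - 1 : dummy;

  memcpy (p8, &vc, 8);       /* memcpy keeps the unaligned accesses well defined */
  memcpy (p4, &vc, 4);
  memcpy (p2, &vc, 2);
  memcpy (p1, &vc, 1);
}

Compared with the snippet quoted in the mail, the only changes are the parentheses around the bit tests and the casts/memcpy calls needed to make it compile cleanly.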
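
The 32-128 byte case, which the patch handles with movdqu stores and a cmovb-selected base pointer, can be sketched in C the same way. This is a sketch under stated assumptions, not the patch itself: the helper name memset_32_128 and the memcpy-based 16-byte stores are illustrative, and it assumes 32 <= n <= 128.

#include <stddef.h>
#include <string.h>

/* Assumes 32 <= n <= 128.  Four 16-byte stores from the front and four
   from the back always execute; ADD decides whether the second pair of
   each group advances by 32 bytes (n >= 64) or merely overlaps the
   first pair (n < 64), so no branch depends on n.  */
static void
memset_32_128 (char *s, int c, size_t n)
{
  unsigned char v[16];
  memset (v, c, sizeof v);           /* stands in for the xmm register of C bytes */

  size_t add = n >= 64 ? 32 : 0;

  memcpy (s,            v, 16);
  memcpy (s + 16,       v, 16);
  memcpy (s + add,      v, 16);
  memcpy (s + 16 + add, v, 16);

  memcpy (s + n - 32,       v, 16);
  memcpy (s + n - 16,       v, 16);
  memcpy (s + n - 32 - add, v, 16);
  memcpy (s + n - 16 - add, v, 16);
}

In the assembly, %rsi and %rcx play the role of add here: they are offset copies of the destination that cmovb resets to the destination itself when n is below 64.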