Message ID | 1401945190-13296-1-git-send-email-vidya@linux.vnet.ibm.com |
---|---|
State | New |
Headers | show |
Hi Vidya, Patch looks good, however you forgot to add a ChangeLog. Also one typo below. Please send the CL entry and I will push it upstream. On 05-06-2014 02:13, vidya@linux.vnet.ibm.com wrote: > From: Vidya Ranganathan <vidya@linux.vnet.ibm.com> > > Optimization is achieved on 8 byte aligned strings with double word > comparison using cmpb instruction. On unaligned strings loop unrolling > is applied for Power7 gain. > > Signed-off-by: Vidya Ranganathan <vidya@linux.vnet.ibm.com> > --- > sysdeps/powerpc/powerpc64/multiarch/Makefile | 2 +- > .../powerpc/powerpc64/multiarch/ifunc-impl-list.c | 7 + > .../powerpc/powerpc64/multiarch/strcmp-power7.S | 40 +++++ > sysdeps/powerpc/powerpc64/multiarch/strcmp-ppc64.S | 43 +++++ > sysdeps/powerpc/powerpc64/multiarch/strcmp.c | 31 ++++ > sysdeps/powerpc/powerpc64/power7/strcmp.S | 195 +++++++++++++++++++++ > 6 files changed, 317 insertions(+), 1 deletion(-) > create mode 100644 sysdeps/powerpc/powerpc64/multiarch/strcmp-power7.S > create mode 100644 sysdeps/powerpc/powerpc64/multiarch/strcmp-ppc64.S > create mode 100644 sysdeps/powerpc/powerpc64/multiarch/strcmp.c > create mode 100644 sysdeps/powerpc/powerpc64/power7/strcmp.S > > diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile > index 35020a7..05744e9 100644 > --- a/sysdeps/powerpc/powerpc64/multiarch/Makefile > +++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile > @@ -17,7 +17,7 @@ sysdep_routines += memcpy-power7 memcpy-a2 memcpy-power6 memcpy-cell \ > strrchr-power7 strrchr-ppc64 strncat-power7 strncat-ppc64 \ > strspn-power7 strspn-ppc64 strcspn-power7 strcspn-ppc64 \ > strpbrk-power7 strpbrk-ppc64 strncpy-power7 strncpy-ppc64 \ > - stpncpy-power7 stpncpy-ppc64 > + stpncpy-power7 stpncpy-ppc64 strcmp-power7 strcmp-ppc64 > > CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops > CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops > diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c 
b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c > index d8578fb..b3933a5 100644 > --- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c > +++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c > @@ -294,5 +294,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > IFUNC_IMPL_ADD (array, i, stpncpy, 1, > __stpncpy_ppc)) > > + /* Support sysdeps/powerpc/powerpc64/multiarch/strcmp.c. */ > + IFUNC_IMPL (i, name, strcmp, > + IFUNC_IMPL_ADD (array, i, strcmp, > + hwcap & PPC_FEATURE_HAS_VSX, > + __strcmp_power7) > + IFUNC_IMPL_ADD (array, i, strcmp, 1, > + __strcmp_ppc)) > return i; > } > diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcmp-power7.S b/sysdeps/powerpc/powerpc64/multiarch/strcmp-power7.S > new file mode 100644 > index 0000000..790ce8d > --- /dev/null > +++ b/sysdeps/powerpc/powerpc64/multiarch/strcmp-power7.S > @@ -0,0 +1,40 @@ > +/* Optimized strcmp implementation for POWER7. > + Copyright (C) 2014 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <http://www.gnu.org/licenses/>. 
*/ > + > +#include <sysdep.h> > + > +#undef EALIGN > +#define EALIGN(name, alignt, words) \ > + .section ".text"; \ > + ENTRY_2(__strcmp_power7) \ > + .align ALIGNARG(alignt); \ > + EALIGN_W_##words; \ > + BODY_LABEL(__strcmp_power7): \ > + cfi_startproc; \ > + LOCALENTRY(__strcmp_power7) > + > +#undef END > +#define END(name) \ > + cfi_endproc; \ > + TRACEBACK(__strcmp_power7) \ > + END_2(__strcmp_power7) > + > +#undef libc_hidden_builtin_def > +#define libc_hidden_builtin_def(name) > + > +#include <sysdeps/powerpc/powerpc64/power7/strcmp.S> > diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcmp-ppc64.S b/sysdeps/powerpc/powerpc64/multiarch/strcmp-ppc64.S > new file mode 100644 > index 0000000..93d1277 > --- /dev/null > +++ b/sysdeps/powerpc/powerpc64/multiarch/strcmp-ppc64.S > @@ -0,0 +1,43 @@ > +/* Default strcmp implementation for PowerPC64. > + Copyright (C) 2014 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <http://www.gnu.org/licenses/>. 
*/ > + > +#include <sysdep.h> > + > +#if defined SHARED && !defined NOT_IN_libc > +# undef EALIGN > +# define EALIGN(name, alignt, words) \ > + .section ".text"; \ > + ENTRY_2(__strcmp_ppc) \ > + .align ALIGNARG(alignt); \ > + EALIGN_W_##words; \ > + BODY_LABEL(__strcmp_ppc): \ > + cfi_startproc; \ > + LOCALENTRY(__strcmp_ppc) > + > +# undef END > +# define END(name) \ > + cfi_endproc; \ > + TRACEBACK(__strcmp_ppc) \ > + END_2(__strcmp_ppc) > + > +# undef libc_hidden_builtin_def > +# define libc_hidden_builtin_def(name) \ > + .globl __GI_strcmp; __GI_strcmp = __strcmp_ppc > +#endif > + > +#include <sysdeps/powerpc/powerpc64/strcmp.S> > diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcmp.c b/sysdeps/powerpc/powerpc64/multiarch/strcmp.c > new file mode 100644 > index 0000000..2013301 > --- /dev/null > +++ b/sysdeps/powerpc/powerpc64/multiarch/strcmp.c > @@ -0,0 +1,31 @@ > +/* Multiple versions of strcmp. PowerPC64 version. > + Copyright (C) 2014 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <http://www.gnu.org/licenses/>. 
*/ > + > +#if defined SHARED && !defined NOT_IN_libc > +# include <string.h> > +# include <shlib-compat.h> > +# include "init-arch.h" > + > +extern __typeof (strcmp) __strcmp_ppc attribute_hidden; > +extern __typeof (strcmp) __strcmp_power7 attribute_hidden; > + > +libc_ifunc (strcmp, > + (hwcap & PPC_FEATURE_HAS_VSX) > + ? __strcmp_power7 > + : __strcmp_ppc); > +#endif > diff --git a/sysdeps/powerpc/powerpc64/power7/strcmp.S b/sysdeps/powerpc/powerpc64/power7/strcmp.S > new file mode 100644 > index 0000000..f16a9d8 > --- /dev/null > +++ b/sysdeps/powerpc/powerpc64/power7/strcmp.S > @@ -0,0 +1,195 @@ > +/* Optimized strcmp implementation for Power7 using 'cmpb' instruction > + Copyright (C) 2014 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <http://www.gnu.org/licenses/>. */ > + > +/* The optimization is achieved here through cmpb instruction. > + 8byte aligned strings are processed with double word comparision s/comparision/comparison > + and unaligned strings are handled effectively with loop unrolling > + technique */ > + > +#include <sysdep.h> > + > +/* int [r3] strcmp (const char *s1 [r3], const char *s2 [r4]) */ > + > +EALIGN (strcmp, 4, 0) > + CALL_MCOUNT 2 > + > + or r9, r3, r4 > + rldicl. r10, r9, 0, 61 /* are s1 and s2 8 byte aligned..? 
*/ > + bne cr0, L(process_unaligned_bytes) > + > +/* process input parameters on double word aligned boundary */ > + ld r9, 0(r4) /* load s2 at offset=0 */ > + li r10, 0 /* load mask=0 */ > + cmpb r10, r9, r10 /* compare bytes at s2 with mask */ > + cmpdi cr7, r10, 0 /* is NULL found ..? is end of string HIT */ > + bne cr7, L(process_unaligned_bytes) /* process byte by byte */ > + > + ld r10, 0(r3) /* load s1 at offset=0 */ > + li r8, 0 /* load mask=0 */ > + cmpb r8, r10, r8 /* compare bytes at s1 with mask */ > + cmpdi cr7, r8, 0 /* is NULL found ..? is end of string HIT */ > + bne cr7, L(process_unaligned_bytes) /* process byte by byte */ > + > +/*s1 and s2 does not contain NULL now , so compare all 8 bytes in a GO */ > + cmpb r9, r10, r9 /* compare s1 and s2 */ > + cmpdi cr7, r9, -1 /* compare result with 0xFFFFFFFFFFFFFFFF */ > + bne cr7, L(process_unaligned_bytes) /* s1,s2 mismatch found */ > + > + addi r5, r3, 8 /* save next offset of s2 */ > + addi r11, r4, 8 /* save next offset of s1 */ > + ld r8, 8(r4) /* load s2 at offset=8 */ > + li r9, 0 /* load mask=0 */ > + cmpb r9, r8, r9 /* compare bytes at s2 with mask */ > + cmpdi cr7, r9, 0 /* NULL found ..? */ > + bne cr7, L(processBytes)/* update input and process bytes one by one */ > + > + mr r9, r4 /* save s2 */ > + li r10, 0 /* load mask=0 */ > + > + ld r7, 8(r3) /* load s1 at offset=8 */ > + cmpb r6, r7, r10 /* compare bytes at s1 with mask */ > + cmpdi cr7, r6, 0 /* is NULL found */ > + bne cr7, L(processBytes)/* mismatch, so process one by one */ > + > +L(unrollDword): > + cmpb r8, r7, r8 /* compare s1 and s2 */ > + cmpdi cr7, r8, -1 /* compare result with 0xFFFFFFFFFFFFFFFF */ > + bne cr7, L(processBytes)/* mismatch with s1 and s2 */ > + > + addi r5, r3, 16 /* save offset=16 of s1 */ > + addi r4, r9, 16 /* save offset=16 of s2 */ > + ld r8, 16(r9) /* load s2 at offset=16 */ > + cmpb r7, r8, r10 /* compare bytes at s2 with mask */ > + cmpdi cr7, r7, 0 /* NULL found ..? 
*/ > + bne cr7, L(update2processBytes) > + > + ld r7, 16(r3) /* load s1 at offset=16 */ > + cmpb r6, r7, r10 /* check s1 for end of string */ > + cmpdi cr7, r6, 0 /* end of s1 ?,then handle byte by byte */ > + bne 7,L(update2processBytes) > + > + cmpb r8, r7, r8 /* compare s1 and s2 double words */ > + cmpdi cr7, r8, -1 /* compare results with 0xFFFFFFFFFFFFFFFF */ > + bne cr7,L(update2processBytes) > + > + addi r5, r3, 24 /* update s1 to offset=24 */ > + addi r4, r9, 24 /* update s2 to offset=24 */ > + > + ld r8, 24(r9) /* load s2 */ > + cmpb r7, r8, r10 /* compare s2 for NULL */ > + cmpdi cr7, r7, 0 /* verify if s2 is ending now */ > + bne cr7,L(update2processBytes) > + > + ld r7, 24(r3) /* load s1 at offset=24 */ > + cmpb r6, r7, r10 /* verify for NULL */ > + cmpdi cr7, r6, 0 /* is NULL found */ > + bne cr7, L(update2processBytes) > + > + cmpb r8, r7, r8 /* compare s1 and s2 */ > + cmpdi cr7, r8, -1 /* are s1 and s2 same ..? */ > + bne cr7, L(update2processBytes) > + > + addi r7, r9, 32 /* update s2 to next double word */ > + addi r3, r3, 32 /* update s1 to next double word */ > + > + ld r8, 32(r9) /* load s2 */ > + mr r4, r7 /* save s2 */ > + cmpb r6, r8, r10 /* compare s2 with NULL */ > + cmpdi cr7, r6, 0 /* end of s2 ..? 
*/ > + bne cr7, L(process_unaligned_bytes) > + > + ld r6, 0(r3) /* load and compare s1 for NULL */ > + cmpb r5, r6, r10 > + cmpdi cr7, r5, 0 > + bne cr7, L(process_unaligned_bytes) > + > + cmpb r8, r6, r8 /* compare s1 and s2 */ > + cmpdi cr7, r8, -1 > + bne cr7, L(process_unaligned_bytes) > + > + addi r5, r3, 8 /* increment s1 and d2 here */ > + addi r11, r9, 40 > + > + ld r8, 40(r9) /* process s2 now */ > + cmpb r9, r8, r10 > + cmpdi cr7, r9, 0 > + bne cr7, L(processBytes) > + > + mr r9, r7 > + ld r7, 8(r3) /* process s1 now */ > + cmpb r6, r7, r10 > + cmpdi cr7, r6, 0 > + beq cr7, L(unrollDword) /* unroll to compare s1 and s2 */ > + > +L(processBytes): > + mr r4, r11 /* update input params */ > + mr r3, r5 > + > + .p2align 4 > +L(process_unaligned_bytes): > + lbz r9, 0(r3) /* load byte from s1 */ > + lbz r10, 0(r4) /* load byte from s2 */ > + cmpdi cr7, r9, 0 /* compare *s1 with NULL */ > + beq cr7, L(diffOfNULL) /* if *s1 is NULL , return *s1 - *s2 */ > + cmplw cr7, r9, r10 /* compare *s1 and *s2 */ > + bne cr7, L(ComputeDiff) /* branch to compute difference and return */ > + > + lbz r9, 1(r3) /* load next byte from s1 */ > + lbz r10, 1(r4) /* load next byte from s2 */ > + cmpdi cr7, r9, 0 /* compare *s1 with NULL */ > + beq cr7, L(diffOfNULL) /* if *s1 is NULL , return *s1 - *s2 */ > + cmplw cr7, r9, r10 /* compare *s1 and *s2 */ > + bne cr7, L(ComputeDiff) /* branch to compute difference and return */ > + > + lbz r9, 2(r3) /* unroll 3rd byte here */ > + lbz r10, 2(r4) > + cmpdi cr7, r9, 0 > + beq cr7, L(diffOfNULL) > + cmplw cr7, r9, r10 > + bne 7, L(ComputeDiff) > + > + lbz r9, 3(r3) /* unroll 4th byte now */ > + lbz r10, 3(r4) > + addi r3, r3, 4 /* increment s1 by unroll factor */ > + cmpdi cr7, r9, 0 > + cmplw cr6, 9, r10 > + beq cr7, L(diffOfNULL) > + addi r4, r4, 4 /* increment s2 by unroll factor */ > + beq cr6, L(process_unaligned_bytes) /* unroll byte processing */ > + > + .p2align 4 > +L(ComputeDiff): > + extsw r9, r9 > + subf r10, r10, r9 /* compute 
s1 - s2 */ > + extsw r3, r10 > + blr /* return */ > + > + .p2align 4 > +L(diffOfNULL): > + li r9, 0 > + subf r10, r10, r9 /* compute s1 - s2 */ > + extsw r3, r10 /* sign extend result */ > + blr /* return */ > + > + .p2align 4 > +L(update2processBytes): > + mr r3, r5 /* update and proceed */ > + b L(process_unaligned_bytes) > + > +END (strcmp) > +libc_hidden_builtin_def (strcmp)
Hi Adhemerval, thanks for the review. Yeah, I seem to have forgotten the ChangeLog. Here it is: ----------------------------------------------------------------------------------------------------------- 2014-06-05 Vidya Ranganathan <vidya@linux.vnet.ibm.com> * sysdeps/powerpc/powerpc64/power7/strcmp.S: New file: Optimization. * sysdeps/powerpc/powerpc64/multiarch/strcmp.c: New file: multiarch strcmp for PPC64. * sysdeps/powerpc/powerpc64/multiarch/strcmp-ppc64.S: New file. * sysdeps/powerpc/powerpc64/multiarch/strcmp-power7.S: New file. * sysdeps/powerpc/powerpc64/multiarch/Makefile: Add strcmp multiarch optimizations. * sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c: (__libc_ifunc_impl_list): Likewise. ----------------------------------------------------------------------------------------------------------- On Tuesday 10 June 2014 07:43 PM, Adhemerval Zanella wrote: > Hi Vidya, > > Path looks good, however you forgot to add a ChangeLog. Also one typo below. > Please send the CL entry that will push upstream. > > On 05-06-2014 02:13, vidya@linux.vnet.ibm.com wrote: > >> From: Vidya Ranganathan <vidya@linux.vnet.ibm.com> >> >> Optimization is achieved on 8 byte aligned strings with double word >> comparison using cmpb instruction. On unaligned strings loop unrolling >> is applied for Power7 gain. 
>> >> Signed-off-by: Vidya Ranganathan <vidya@linux.vnet.ibm.com> >> --- >> sysdeps/powerpc/powerpc64/multiarch/Makefile | 2 +- >> .../powerpc/powerpc64/multiarch/ifunc-impl-list.c | 7 + >> .../powerpc/powerpc64/multiarch/strcmp-power7.S | 40 +++++ >> sysdeps/powerpc/powerpc64/multiarch/strcmp-ppc64.S | 43 +++++ >> sysdeps/powerpc/powerpc64/multiarch/strcmp.c | 31 ++++ >> sysdeps/powerpc/powerpc64/power7/strcmp.S | 195 +++++++++++++++++++++ >> 6 files changed, 317 insertions(+), 1 deletion(-) >> create mode 100644 sysdeps/powerpc/powerpc64/multiarch/strcmp-power7.S >> create mode 100644 sysdeps/powerpc/powerpc64/multiarch/strcmp-ppc64.S >> create mode 100644 sysdeps/powerpc/powerpc64/multiarch/strcmp.c >> create mode 100644 sysdeps/powerpc/powerpc64/power7/strcmp.S >> >> diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile >> index 35020a7..05744e9 100644 >> --- a/sysdeps/powerpc/powerpc64/multiarch/Makefile >> +++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile >> @@ -17,7 +17,7 @@ sysdep_routines += memcpy-power7 memcpy-a2 memcpy-power6 memcpy-cell \ >> strrchr-power7 strrchr-ppc64 strncat-power7 strncat-ppc64 \ >> strspn-power7 strspn-ppc64 strcspn-power7 strcspn-ppc64 \ >> strpbrk-power7 strpbrk-ppc64 strncpy-power7 strncpy-ppc64 \ >> - stpncpy-power7 stpncpy-ppc64 >> + stpncpy-power7 stpncpy-ppc64 strcmp-power7 strcmp-ppc64 >> >> CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops >> CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops >> diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c >> index d8578fb..b3933a5 100644 >> --- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c >> +++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c >> @@ -294,5 +294,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, >> IFUNC_IMPL_ADD (array, i, stpncpy, 1, >> __stpncpy_ppc)) >> >> + /* Support 
sysdeps/powerpc/powerpc64/multiarch/strcmp.c. */ >> + IFUNC_IMPL (i, name, strcmp, >> + IFUNC_IMPL_ADD (array, i, strcmp, >> + hwcap & PPC_FEATURE_HAS_VSX, >> + __strcmp_power7) >> + IFUNC_IMPL_ADD (array, i, strcmp, 1, >> + __strcmp_ppc)) >> return i; >> } >> diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcmp-power7.S b/sysdeps/powerpc/powerpc64/multiarch/strcmp-power7.S >> new file mode 100644 >> index 0000000..790ce8d >> --- /dev/null >> +++ b/sysdeps/powerpc/powerpc64/multiarch/strcmp-power7.S >> @@ -0,0 +1,40 @@ >> +/* Optimized strcmp implementation for POWER7. >> + Copyright (C) 2014 Free Software Foundation, Inc. >> + This file is part of the GNU C Library. >> + >> + The GNU C Library is free software; you can redistribute it and/or >> + modify it under the terms of the GNU Lesser General Public >> + License as published by the Free Software Foundation; either >> + version 2.1 of the License, or (at your option) any later version. >> + >> + The GNU C Library is distributed in the hope that it will be useful, >> + but WITHOUT ANY WARRANTY; without even the implied warranty of >> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU >> + Lesser General Public License for more details. >> + >> + You should have received a copy of the GNU Lesser General Public >> + License along with the GNU C Library; if not, see >> + <http://www.gnu.org/licenses/>. 
*/ >> + >> +#include <sysdep.h> >> + >> +#undef EALIGN >> +#define EALIGN(name, alignt, words) \ >> + .section ".text"; \ >> + ENTRY_2(__strcmp_power7) \ >> + .align ALIGNARG(alignt); \ >> + EALIGN_W_##words; \ >> + BODY_LABEL(__strcmp_power7): \ >> + cfi_startproc; \ >> + LOCALENTRY(__strcmp_power7) >> + >> +#undef END >> +#define END(name) \ >> + cfi_endproc; \ >> + TRACEBACK(__strcmp_power7) \ >> + END_2(__strcmp_power7) >> + >> +#undef libc_hidden_builtin_def >> +#define libc_hidden_builtin_def(name) >> + >> +#include <sysdeps/powerpc/powerpc64/power7/strcmp.S> >> diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcmp-ppc64.S b/sysdeps/powerpc/powerpc64/multiarch/strcmp-ppc64.S >> new file mode 100644 >> index 0000000..93d1277 >> --- /dev/null >> +++ b/sysdeps/powerpc/powerpc64/multiarch/strcmp-ppc64.S >> @@ -0,0 +1,43 @@ >> +/* Default strcmp implementation for PowerPC64. >> + Copyright (C) 2014 Free Software Foundation, Inc. >> + This file is part of the GNU C Library. >> + >> + The GNU C Library is free software; you can redistribute it and/or >> + modify it under the terms of the GNU Lesser General Public >> + License as published by the Free Software Foundation; either >> + version 2.1 of the License, or (at your option) any later version. >> + >> + The GNU C Library is distributed in the hope that it will be useful, >> + but WITHOUT ANY WARRANTY; without even the implied warranty of >> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU >> + Lesser General Public License for more details. >> + >> + You should have received a copy of the GNU Lesser General Public >> + License along with the GNU C Library; if not, see >> + <http://www.gnu.org/licenses/>. 
*/ >> + >> +#include <sysdep.h> >> + >> +#if defined SHARED && !defined NOT_IN_libc >> +# undef EALIGN >> +# define EALIGN(name, alignt, words) \ >> + .section ".text"; \ >> + ENTRY_2(__strcmp_ppc) \ >> + .align ALIGNARG(alignt); \ >> + EALIGN_W_##words; \ >> + BODY_LABEL(__strcmp_ppc): \ >> + cfi_startproc; \ >> + LOCALENTRY(__strcmp_ppc) >> + >> +# undef END >> +# define END(name) \ >> + cfi_endproc; \ >> + TRACEBACK(__strcmp_ppc) \ >> + END_2(__strcmp_ppc) >> + >> +# undef libc_hidden_builtin_def >> +# define libc_hidden_builtin_def(name) \ >> + .globl __GI_strcmp; __GI_strcmp = __strcmp_ppc >> +#endif >> + >> +#include <sysdeps/powerpc/powerpc64/strcmp.S> >> diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcmp.c b/sysdeps/powerpc/powerpc64/multiarch/strcmp.c >> new file mode 100644 >> index 0000000..2013301 >> --- /dev/null >> +++ b/sysdeps/powerpc/powerpc64/multiarch/strcmp.c >> @@ -0,0 +1,31 @@ >> +/* Multiple versions of strcmp. PowerPC64 version. >> + Copyright (C) 2014 Free Software Foundation, Inc. >> + This file is part of the GNU C Library. >> + >> + The GNU C Library is free software; you can redistribute it and/or >> + modify it under the terms of the GNU Lesser General Public >> + License as published by the Free Software Foundation; either >> + version 2.1 of the License, or (at your option) any later version. >> + >> + The GNU C Library is distributed in the hope that it will be useful, >> + but WITHOUT ANY WARRANTY; without even the implied warranty of >> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU >> + Lesser General Public License for more details. >> + >> + You should have received a copy of the GNU Lesser General Public >> + License along with the GNU C Library; if not, see >> + <http://www.gnu.org/licenses/>. 
*/ >> + >> +#if defined SHARED && !defined NOT_IN_libc >> +# include <string.h> >> +# include <shlib-compat.h> >> +# include "init-arch.h" >> + >> +extern __typeof (strcmp) __strcmp_ppc attribute_hidden; >> +extern __typeof (strcmp) __strcmp_power7 attribute_hidden; >> + >> +libc_ifunc (strcmp, >> + (hwcap & PPC_FEATURE_HAS_VSX) >> + ? __strcmp_power7 >> + : __strcmp_ppc); >> +#endif >> diff --git a/sysdeps/powerpc/powerpc64/power7/strcmp.S b/sysdeps/powerpc/powerpc64/power7/strcmp.S >> new file mode 100644 >> index 0000000..f16a9d8 >> --- /dev/null >> +++ b/sysdeps/powerpc/powerpc64/power7/strcmp.S >> @@ -0,0 +1,195 @@ >> +/* Optimized strcmp implementation for Power7 using 'cmpb' instruction >> + Copyright (C) 2014 Free Software Foundation, Inc. >> + This file is part of the GNU C Library. >> + >> + The GNU C Library is free software; you can redistribute it and/or >> + modify it under the terms of the GNU Lesser General Public >> + License as published by the Free Software Foundation; either >> + version 2.1 of the License, or (at your option) any later version. >> + >> + The GNU C Library is distributed in the hope that it will be useful, >> + but WITHOUT ANY WARRANTY; without even the implied warranty of >> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU >> + Lesser General Public License for more details. >> + >> + You should have received a copy of the GNU Lesser General Public >> + License along with the GNU C Library; if not, see >> + <http://www.gnu.org/licenses/>. */ >> + >> +/* The optimization is achieved here through cmpb instruction. >> + 8byte aligned strings are processed with double word comparision > s/comparision/comparison >> + and unaligned strings are handled effectively with loop unrolling >> + technique */ >> + >> +#include <sysdep.h> >> + >> +/* int [r3] strcmp (const char *s1 [r3], const char *s2 [r4]) */ >> + >> +EALIGN (strcmp, 4, 0) >> + CALL_MCOUNT 2 >> + >> + or r9, r3, r4 >> + rldicl. 
r10, r9, 0, 61 /* are s1 and s2 8 byte aligned..? */ >> + bne cr0, L(process_unaligned_bytes) >> + >> +/* process input parameters on double word aligned boundary */ >> + ld r9, 0(r4) /* load s2 at offset=0 */ >> + li r10, 0 /* load mask=0 */ >> + cmpb r10, r9, r10 /* compare bytes at s2 with mask */ >> + cmpdi cr7, r10, 0 /* is NULL found ..? is end of string HIT */ >> + bne cr7, L(process_unaligned_bytes) /* process byte by byte */ >> + >> + ld r10, 0(r3) /* load s1 at offset=0 */ >> + li r8, 0 /* load mask=0 */ >> + cmpb r8, r10, r8 /* compare bytes at s1 with mask */ >> + cmpdi cr7, r8, 0 /* is NULL found ..? is end of string HIT */ >> + bne cr7, L(process_unaligned_bytes) /* process byte by byte */ >> + >> +/*s1 and s2 does not contain NULL now , so compare all 8 bytes in a GO */ >> + cmpb r9, r10, r9 /* compare s1 and s2 */ >> + cmpdi cr7, r9, -1 /* compare result with 0xFFFFFFFFFFFFFFFF */ >> + bne cr7, L(process_unaligned_bytes) /* s1,s2 mismatch found */ >> + >> + addi r5, r3, 8 /* save next offset of s2 */ >> + addi r11, r4, 8 /* save next offset of s1 */ >> + ld r8, 8(r4) /* load s2 at offset=8 */ >> + li r9, 0 /* load mask=0 */ >> + cmpb r9, r8, r9 /* compare bytes at s2 with mask */ >> + cmpdi cr7, r9, 0 /* NULL found ..? 
*/ >> + bne cr7, L(processBytes)/* update input and process bytes one by one */ >> + >> + mr r9, r4 /* save s2 */ >> + li r10, 0 /* load mask=0 */ >> + >> + ld r7, 8(r3) /* load s1 at offset=8 */ >> + cmpb r6, r7, r10 /* compare bytes at s1 with mask */ >> + cmpdi cr7, r6, 0 /* is NULL found */ >> + bne cr7, L(processBytes)/* mismatch, so process one by one */ >> + >> +L(unrollDword): >> + cmpb r8, r7, r8 /* compare s1 and s2 */ >> + cmpdi cr7, r8, -1 /* compare result with 0xFFFFFFFFFFFFFFFF */ >> + bne cr7, L(processBytes)/* mismatch with s1 and s2 */ >> + >> + addi r5, r3, 16 /* save offset=16 of s1 */ >> + addi r4, r9, 16 /* save offset=16 of s2 */ >> + ld r8, 16(r9) /* load s2 at offset=16 */ >> + cmpb r7, r8, r10 /* compare bytes at s2 with mask */ >> + cmpdi cr7, r7, 0 /* NULL found ..? */ >> + bne cr7, L(update2processBytes) >> + >> + ld r7, 16(r3) /* load s1 at offset=16 */ >> + cmpb r6, r7, r10 /* check s1 for end of string */ >> + cmpdi cr7, r6, 0 /* end of s1 ?,then handle byte by byte */ >> + bne 7,L(update2processBytes) >> + >> + cmpb r8, r7, r8 /* compare s1 and s2 double words */ >> + cmpdi cr7, r8, -1 /* compare results with 0xFFFFFFFFFFFFFFFF */ >> + bne cr7,L(update2processBytes) >> + >> + addi r5, r3, 24 /* update s1 to offset=24 */ >> + addi r4, r9, 24 /* update s2 to offset=24 */ >> + >> + ld r8, 24(r9) /* load s2 */ >> + cmpb r7, r8, r10 /* compare s2 for NULL */ >> + cmpdi cr7, r7, 0 /* verify if s2 is ending now */ >> + bne cr7,L(update2processBytes) >> + >> + ld r7, 24(r3) /* load s1 at offset=24 */ >> + cmpb r6, r7, r10 /* verify for NULL */ >> + cmpdi cr7, r6, 0 /* is NULL found */ >> + bne cr7, L(update2processBytes) >> + >> + cmpb r8, r7, r8 /* compare s1 and s2 */ >> + cmpdi cr7, r8, -1 /* are s1 and s2 same ..? 
*/ >> + bne cr7, L(update2processBytes) >> + >> + addi r7, r9, 32 /* update s2 to next double word */ >> + addi r3, r3, 32 /* update s1 to next double word */ >> + >> + ld r8, 32(r9) /* load s2 */ >> + mr r4, r7 /* save s2 */ >> + cmpb r6, r8, r10 /* compare s2 with NULL */ >> + cmpdi cr7, r6, 0 /* end of s2 ..? */ >> + bne cr7, L(process_unaligned_bytes) >> + >> + ld r6, 0(r3) /* load and compare s1 for NULL */ >> + cmpb r5, r6, r10 >> + cmpdi cr7, r5, 0 >> + bne cr7, L(process_unaligned_bytes) >> + >> + cmpb r8, r6, r8 /* compare s1 and s2 */ >> + cmpdi cr7, r8, -1 >> + bne cr7, L(process_unaligned_bytes) >> + >> + addi r5, r3, 8 /* increment s1 and d2 here */ >> + addi r11, r9, 40 >> + >> + ld r8, 40(r9) /* process s2 now */ >> + cmpb r9, r8, r10 >> + cmpdi cr7, r9, 0 >> + bne cr7, L(processBytes) >> + >> + mr r9, r7 >> + ld r7, 8(r3) /* process s1 now */ >> + cmpb r6, r7, r10 >> + cmpdi cr7, r6, 0 >> + beq cr7, L(unrollDword) /* unroll to compare s1 and s2 */ >> + >> +L(processBytes): >> + mr r4, r11 /* update input params */ >> + mr r3, r5 >> + >> + .p2align 4 >> +L(process_unaligned_bytes): >> + lbz r9, 0(r3) /* load byte from s1 */ >> + lbz r10, 0(r4) /* load byte from s2 */ >> + cmpdi cr7, r9, 0 /* compare *s1 with NULL */ >> + beq cr7, L(diffOfNULL) /* if *s1 is NULL , return *s1 - *s2 */ >> + cmplw cr7, r9, r10 /* compare *s1 and *s2 */ >> + bne cr7, L(ComputeDiff) /* branch to compute difference and return */ >> + >> + lbz r9, 1(r3) /* load next byte from s1 */ >> + lbz r10, 1(r4) /* load next byte from s2 */ >> + cmpdi cr7, r9, 0 /* compare *s1 with NULL */ >> + beq cr7, L(diffOfNULL) /* if *s1 is NULL , return *s1 - *s2 */ >> + cmplw cr7, r9, r10 /* compare *s1 and *s2 */ >> + bne cr7, L(ComputeDiff) /* branch to compute difference and return */ >> + >> + lbz r9, 2(r3) /* unroll 3rd byte here */ >> + lbz r10, 2(r4) >> + cmpdi cr7, r9, 0 >> + beq cr7, L(diffOfNULL) >> + cmplw cr7, r9, r10 >> + bne 7, L(ComputeDiff) >> + >> + lbz r9, 3(r3) /* unroll 4th 
byte now */ >> + lbz r10, 3(r4) >> + addi r3, r3, 4 /* increment s1 by unroll factor */ >> + cmpdi cr7, r9, 0 >> + cmplw cr6, 9, r10 >> + beq cr7, L(diffOfNULL) >> + addi r4, r4, 4 /* increment s2 by unroll factor */ >> + beq cr6, L(process_unaligned_bytes) /* unroll byte processing */ >> + >> + .p2align 4 >> +L(ComputeDiff): >> + extsw r9, r9 >> + subf r10, r10, r9 /* compute s1 - s2 */ >> + extsw r3, r10 >> + blr /* return */ >> + >> + .p2align 4 >> +L(diffOfNULL): >> + li r9, 0 >> + subf r10, r10, r9 /* compute s1 - s2 */ >> + extsw r3, r10 /* sign extend result */ >> + blr /* return */ >> + >> + .p2align 4 >> +L(update2processBytes): >> + mr r3, r5 /* update and proceed */ >> + b L(process_unaligned_bytes) >> + >> +END (strcmp) >> +libc_hidden_builtin_def (strcmp)
Thanks, pushed upstream as e23d3d2690bf63207b1a47e83a94693daebbbfe5 On 10-06-2014 13:06, R Vidya wrote: > Hi Adhemerval , thanks for the review. Yeah , i seem to have forgotten the ChangeLog. > Here it is: > ----------------------------------------------------------------------------------------------------------- > 2014-06-05 Vidya Ranganathan <vidya@linux.vnet.ibm.com> > > * sysdeps/powerpc/powerpc64/power7/strcmp.S: New file: Optimization. > * sysdeps/powerpc/powerpc64/multiarch/strcmp.c: New file: > multiarch strcmp for PPC64. > * sysdeps/powerpc/powerpc64/multiarch/strcmp-ppc64.S: New file > * sysdeps/powerpc/powerpc64/multiarch/strcmp-power7.S: New file > * sysdeps/powerpc/powerpc64/multiarch/Makefile: Add strcmp > multiarch optimizations. > * sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c: > (__libc_ifunc_impl_list): Likewise. > > ----------------------------------------------------------------------------------------------------------- > > On Tuesday 10 June 2014 07:43 PM, Adhemerval Zanella wrote: >> Hi Vidya, >> >> Path looks good, however you forgot to add a ChangeLog. Also one typo below. >> Please send the CL entry that will push upstream. >> >> On 05-06-2014 02:13, vidya@linux.vnet.ibm.com wrote: >> >>> From: Vidya Ranganathan <vidya@linux.vnet.ibm.com> >>> >>> Optimization is achieved on 8 byte aligned strings with double word >>> comparison using cmpb instruction. On unaligned strings loop unrolling >>> is applied for Power7 gain. 
>>> >>> Signed-off-by: Vidya Ranganathan <vidya@linux.vnet.ibm.com> >>> --- >>> sysdeps/powerpc/powerpc64/multiarch/Makefile | 2 +- >>> .../powerpc/powerpc64/multiarch/ifunc-impl-list.c | 7 + >>> .../powerpc/powerpc64/multiarch/strcmp-power7.S | 40 +++++ >>> sysdeps/powerpc/powerpc64/multiarch/strcmp-ppc64.S | 43 +++++ >>> sysdeps/powerpc/powerpc64/multiarch/strcmp.c | 31 ++++ >>> sysdeps/powerpc/powerpc64/power7/strcmp.S | 195 +++++++++++++++++++++ >>> 6 files changed, 317 insertions(+), 1 deletion(-) >>> create mode 100644 sysdeps/powerpc/powerpc64/multiarch/strcmp-power7.S >>> create mode 100644 sysdeps/powerpc/powerpc64/multiarch/strcmp-ppc64.S >>> create mode 100644 sysdeps/powerpc/powerpc64/multiarch/strcmp.c >>> create mode 100644 sysdeps/powerpc/powerpc64/power7/strcmp.S >>> >>> diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile >>> index 35020a7..05744e9 100644 >>> --- a/sysdeps/powerpc/powerpc64/multiarch/Makefile >>> +++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile >>> @@ -17,7 +17,7 @@ sysdep_routines += memcpy-power7 memcpy-a2 memcpy-power6 memcpy-cell \ >>> strrchr-power7 strrchr-ppc64 strncat-power7 strncat-ppc64 \ >>> strspn-power7 strspn-ppc64 strcspn-power7 strcspn-ppc64 \ >>> strpbrk-power7 strpbrk-ppc64 strncpy-power7 strncpy-ppc64 \ >>> - stpncpy-power7 stpncpy-ppc64 >>> + stpncpy-power7 stpncpy-ppc64 strcmp-power7 strcmp-ppc64 >>> >>> CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops >>> CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops >>> diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c >>> index d8578fb..b3933a5 100644 >>> --- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c >>> +++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c >>> @@ -294,5 +294,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, >>> IFUNC_IMPL_ADD (array, i, stpncpy, 1, >>> 
__stpncpy_ppc)) >>> >>> + /* Support sysdeps/powerpc/powerpc64/multiarch/strcmp.c. */ >>> + IFUNC_IMPL (i, name, strcmp, >>> + IFUNC_IMPL_ADD (array, i, strcmp, >>> + hwcap & PPC_FEATURE_HAS_VSX, >>> + __strcmp_power7) >>> + IFUNC_IMPL_ADD (array, i, strcmp, 1, >>> + __strcmp_ppc)) >>> return i; >>> } >>> diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcmp-power7.S b/sysdeps/powerpc/powerpc64/multiarch/strcmp-power7.S >>> new file mode 100644 >>> index 0000000..790ce8d >>> --- /dev/null >>> +++ b/sysdeps/powerpc/powerpc64/multiarch/strcmp-power7.S >>> @@ -0,0 +1,40 @@ >>> +/* Optimized strcmp implementation for POWER7. >>> + Copyright (C) 2014 Free Software Foundation, Inc. >>> + This file is part of the GNU C Library. >>> + >>> + The GNU C Library is free software; you can redistribute it and/or >>> + modify it under the terms of the GNU Lesser General Public >>> + License as published by the Free Software Foundation; either >>> + version 2.1 of the License, or (at your option) any later version. >>> + >>> + The GNU C Library is distributed in the hope that it will be useful, >>> + but WITHOUT ANY WARRANTY; without even the implied warranty of >>> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU >>> + Lesser General Public License for more details. >>> + >>> + You should have received a copy of the GNU Lesser General Public >>> + License along with the GNU C Library; if not, see >>> + <http://www.gnu.org/licenses/>. 
*/ >>> + >>> +#include <sysdep.h> >>> + >>> +#undef EALIGN >>> +#define EALIGN(name, alignt, words) \ >>> + .section ".text"; \ >>> + ENTRY_2(__strcmp_power7) \ >>> + .align ALIGNARG(alignt); \ >>> + EALIGN_W_##words; \ >>> + BODY_LABEL(__strcmp_power7): \ >>> + cfi_startproc; \ >>> + LOCALENTRY(__strcmp_power7) >>> + >>> +#undef END >>> +#define END(name) \ >>> + cfi_endproc; \ >>> + TRACEBACK(__strcmp_power7) \ >>> + END_2(__strcmp_power7) >>> + >>> +#undef libc_hidden_builtin_def >>> +#define libc_hidden_builtin_def(name) >>> + >>> +#include <sysdeps/powerpc/powerpc64/power7/strcmp.S> >>> diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcmp-ppc64.S b/sysdeps/powerpc/powerpc64/multiarch/strcmp-ppc64.S >>> new file mode 100644 >>> index 0000000..93d1277 >>> --- /dev/null >>> +++ b/sysdeps/powerpc/powerpc64/multiarch/strcmp-ppc64.S >>> @@ -0,0 +1,43 @@ >>> +/* Default strcmp implementation for PowerPC64. >>> + Copyright (C) 2014 Free Software Foundation, Inc. >>> + This file is part of the GNU C Library. >>> + >>> + The GNU C Library is free software; you can redistribute it and/or >>> + modify it under the terms of the GNU Lesser General Public >>> + License as published by the Free Software Foundation; either >>> + version 2.1 of the License, or (at your option) any later version. >>> + >>> + The GNU C Library is distributed in the hope that it will be useful, >>> + but WITHOUT ANY WARRANTY; without even the implied warranty of >>> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU >>> + Lesser General Public License for more details. >>> + >>> + You should have received a copy of the GNU Lesser General Public >>> + License along with the GNU C Library; if not, see >>> + <http://www.gnu.org/licenses/>. 
*/ >>> + >>> +#include <sysdep.h> >>> + >>> +#if defined SHARED && !defined NOT_IN_libc >>> +# undef EALIGN >>> +# define EALIGN(name, alignt, words) \ >>> + .section ".text"; \ >>> + ENTRY_2(__strcmp_ppc) \ >>> + .align ALIGNARG(alignt); \ >>> + EALIGN_W_##words; \ >>> + BODY_LABEL(__strcmp_ppc): \ >>> + cfi_startproc; \ >>> + LOCALENTRY(__strcmp_ppc) >>> + >>> +# undef END >>> +# define END(name) \ >>> + cfi_endproc; \ >>> + TRACEBACK(__strcmp_ppc) \ >>> + END_2(__strcmp_ppc) >>> + >>> +# undef libc_hidden_builtin_def >>> +# define libc_hidden_builtin_def(name) \ >>> + .globl __GI_strcmp; __GI_strcmp = __strcmp_ppc >>> +#endif >>> + >>> +#include <sysdeps/powerpc/powerpc64/strcmp.S> >>> diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcmp.c b/sysdeps/powerpc/powerpc64/multiarch/strcmp.c >>> new file mode 100644 >>> index 0000000..2013301 >>> --- /dev/null >>> +++ b/sysdeps/powerpc/powerpc64/multiarch/strcmp.c >>> @@ -0,0 +1,31 @@ >>> +/* Multiple versions of strcmp. PowerPC64 version. >>> + Copyright (C) 2014 Free Software Foundation, Inc. >>> + This file is part of the GNU C Library. >>> + >>> + The GNU C Library is free software; you can redistribute it and/or >>> + modify it under the terms of the GNU Lesser General Public >>> + License as published by the Free Software Foundation; either >>> + version 2.1 of the License, or (at your option) any later version. >>> + >>> + The GNU C Library is distributed in the hope that it will be useful, >>> + but WITHOUT ANY WARRANTY; without even the implied warranty of >>> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU >>> + Lesser General Public License for more details. >>> + >>> + You should have received a copy of the GNU Lesser General Public >>> + License along with the GNU C Library; if not, see >>> + <http://www.gnu.org/licenses/>. 
*/ >>> + >>> +#if defined SHARED && !defined NOT_IN_libc >>> +# include <string.h> >>> +# include <shlib-compat.h> >>> +# include "init-arch.h" >>> + >>> +extern __typeof (strcmp) __strcmp_ppc attribute_hidden; >>> +extern __typeof (strcmp) __strcmp_power7 attribute_hidden; >>> + >>> +libc_ifunc (strcmp, >>> + (hwcap & PPC_FEATURE_HAS_VSX) >>> + ? __strcmp_power7 >>> + : __strcmp_ppc); >>> +#endif >>> diff --git a/sysdeps/powerpc/powerpc64/power7/strcmp.S b/sysdeps/powerpc/powerpc64/power7/strcmp.S >>> new file mode 100644 >>> index 0000000..f16a9d8 >>> --- /dev/null >>> +++ b/sysdeps/powerpc/powerpc64/power7/strcmp.S >>> @@ -0,0 +1,195 @@ >>> +/* Optimized strcmp implementation for Power7 using 'cmpb' instruction >>> + Copyright (C) 2014 Free Software Foundation, Inc. >>> + This file is part of the GNU C Library. >>> + >>> + The GNU C Library is free software; you can redistribute it and/or >>> + modify it under the terms of the GNU Lesser General Public >>> + License as published by the Free Software Foundation; either >>> + version 2.1 of the License, or (at your option) any later version. >>> + >>> + The GNU C Library is distributed in the hope that it will be useful, >>> + but WITHOUT ANY WARRANTY; without even the implied warranty of >>> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU >>> + Lesser General Public License for more details. >>> + >>> + You should have received a copy of the GNU Lesser General Public >>> + License along with the GNU C Library; if not, see >>> + <http://www.gnu.org/licenses/>. */ >>> + >>> +/* The optimization is achieved here through cmpb instruction. 
>>> + 8byte aligned strings are processed with double word comparision >> s/comparision/comparison >>> + and unaligned strings are handled effectively with loop unrolling >>> + technique */ >>> + >>> +#include <sysdep.h> >>> + >>> +/* int [r3] strcmp (const char *s1 [r3], const char *s2 [r4]) */ >>> + >>> +EALIGN (strcmp, 4, 0) >>> + CALL_MCOUNT 2 >>> + >>> + or r9, r3, r4 >>> + rldicl. r10, r9, 0, 61 /* are s1 and s2 8 byte aligned..? */ >>> + bne cr0, L(process_unaligned_bytes) >>> + >>> +/* process input parameters on double word aligned boundary */ >>> + ld r9, 0(r4) /* load s2 at offset=0 */ >>> + li r10, 0 /* load mask=0 */ >>> + cmpb r10, r9, r10 /* compare bytes at s2 with mask */ >>> + cmpdi cr7, r10, 0 /* is NULL found ..? is end of string HIT */ >>> + bne cr7, L(process_unaligned_bytes) /* process byte by byte */ >>> + >>> + ld r10, 0(r3) /* load s1 at offset=0 */ >>> + li r8, 0 /* load mask=0 */ >>> + cmpb r8, r10, r8 /* compare bytes at s1 with mask */ >>> + cmpdi cr7, r8, 0 /* is NULL found ..? is end of string HIT */ >>> + bne cr7, L(process_unaligned_bytes) /* process byte by byte */ >>> + >>> +/*s1 and s2 does not contain NULL now , so compare all 8 bytes in a GO */ >>> + cmpb r9, r10, r9 /* compare s1 and s2 */ >>> + cmpdi cr7, r9, -1 /* compare result with 0xFFFFFFFFFFFFFFFF */ >>> + bne cr7, L(process_unaligned_bytes) /* s1,s2 mismatch found */ >>> + >>> + addi r5, r3, 8 /* save next offset of s2 */ >>> + addi r11, r4, 8 /* save next offset of s1 */ >>> + ld r8, 8(r4) /* load s2 at offset=8 */ >>> + li r9, 0 /* load mask=0 */ >>> + cmpb r9, r8, r9 /* compare bytes at s2 with mask */ >>> + cmpdi cr7, r9, 0 /* NULL found ..? 
*/ >>> + bne cr7, L(processBytes)/* update input and process bytes one by one */ >>> + >>> + mr r9, r4 /* save s2 */ >>> + li r10, 0 /* load mask=0 */ >>> + >>> + ld r7, 8(r3) /* load s1 at offset=8 */ >>> + cmpb r6, r7, r10 /* compare bytes at s1 with mask */ >>> + cmpdi cr7, r6, 0 /* is NULL found */ >>> + bne cr7, L(processBytes)/* mismatch, so process one by one */ >>> + >>> +L(unrollDword): >>> + cmpb r8, r7, r8 /* compare s1 and s2 */ >>> + cmpdi cr7, r8, -1 /* compare result with 0xFFFFFFFFFFFFFFFF */ >>> + bne cr7, L(processBytes)/* mismatch with s1 and s2 */ >>> + >>> + addi r5, r3, 16 /* save offset=16 of s1 */ >>> + addi r4, r9, 16 /* save offset=16 of s2 */ >>> + ld r8, 16(r9) /* load s2 at offset=16 */ >>> + cmpb r7, r8, r10 /* compare bytes at s2 with mask */ >>> + cmpdi cr7, r7, 0 /* NULL found ..? */ >>> + bne cr7, L(update2processBytes) >>> + >>> + ld r7, 16(r3) /* load s1 at offset=16 */ >>> + cmpb r6, r7, r10 /* check s1 for end of string */ >>> + cmpdi cr7, r6, 0 /* end of s1 ?,then handle byte by byte */ >>> + bne 7,L(update2processBytes) >>> + >>> + cmpb r8, r7, r8 /* compare s1 and s2 double words */ >>> + cmpdi cr7, r8, -1 /* compare results with 0xFFFFFFFFFFFFFFFF */ >>> + bne cr7,L(update2processBytes) >>> + >>> + addi r5, r3, 24 /* update s1 to offset=24 */ >>> + addi r4, r9, 24 /* update s2 to offset=24 */ >>> + >>> + ld r8, 24(r9) /* load s2 */ >>> + cmpb r7, r8, r10 /* compare s2 for NULL */ >>> + cmpdi cr7, r7, 0 /* verify if s2 is ending now */ >>> + bne cr7,L(update2processBytes) >>> + >>> + ld r7, 24(r3) /* load s1 at offset=24 */ >>> + cmpb r6, r7, r10 /* verify for NULL */ >>> + cmpdi cr7, r6, 0 /* is NULL found */ >>> + bne cr7, L(update2processBytes) >>> + >>> + cmpb r8, r7, r8 /* compare s1 and s2 */ >>> + cmpdi cr7, r8, -1 /* are s1 and s2 same ..? 
*/ >>> + bne cr7, L(update2processBytes) >>> + >>> + addi r7, r9, 32 /* update s2 to next double word */ >>> + addi r3, r3, 32 /* update s1 to next double word */ >>> + >>> + ld r8, 32(r9) /* load s2 */ >>> + mr r4, r7 /* save s2 */ >>> + cmpb r6, r8, r10 /* compare s2 with NULL */ >>> + cmpdi cr7, r6, 0 /* end of s2 ..? */ >>> + bne cr7, L(process_unaligned_bytes) >>> + >>> + ld r6, 0(r3) /* load and compare s1 for NULL */ >>> + cmpb r5, r6, r10 >>> + cmpdi cr7, r5, 0 >>> + bne cr7, L(process_unaligned_bytes) >>> + >>> + cmpb r8, r6, r8 /* compare s1 and s2 */ >>> + cmpdi cr7, r8, -1 >>> + bne cr7, L(process_unaligned_bytes) >>> + >>> + addi r5, r3, 8 /* increment s1 and d2 here */ >>> + addi r11, r9, 40 >>> + >>> + ld r8, 40(r9) /* process s2 now */ >>> + cmpb r9, r8, r10 >>> + cmpdi cr7, r9, 0 >>> + bne cr7, L(processBytes) >>> + >>> + mr r9, r7 >>> + ld r7, 8(r3) /* process s1 now */ >>> + cmpb r6, r7, r10 >>> + cmpdi cr7, r6, 0 >>> + beq cr7, L(unrollDword) /* unroll to compare s1 and s2 */ >>> + >>> +L(processBytes): >>> + mr r4, r11 /* update input params */ >>> + mr r3, r5 >>> + >>> + .p2align 4 >>> +L(process_unaligned_bytes): >>> + lbz r9, 0(r3) /* load byte from s1 */ >>> + lbz r10, 0(r4) /* load byte from s2 */ >>> + cmpdi cr7, r9, 0 /* compare *s1 with NULL */ >>> + beq cr7, L(diffOfNULL) /* if *s1 is NULL , return *s1 - *s2 */ >>> + cmplw cr7, r9, r10 /* compare *s1 and *s2 */ >>> + bne cr7, L(ComputeDiff) /* branch to compute difference and return */ >>> + >>> + lbz r9, 1(r3) /* load next byte from s1 */ >>> + lbz r10, 1(r4) /* load next byte from s2 */ >>> + cmpdi cr7, r9, 0 /* compare *s1 with NULL */ >>> + beq cr7, L(diffOfNULL) /* if *s1 is NULL , return *s1 - *s2 */ >>> + cmplw cr7, r9, r10 /* compare *s1 and *s2 */ >>> + bne cr7, L(ComputeDiff) /* branch to compute difference and return */ >>> + >>> + lbz r9, 2(r3) /* unroll 3rd byte here */ >>> + lbz r10, 2(r4) >>> + cmpdi cr7, r9, 0 >>> + beq cr7, L(diffOfNULL) >>> + cmplw cr7, r9, r10 >>> + 
bne 7, L(ComputeDiff) >>> + >>> + lbz r9, 3(r3) /* unroll 4th byte now */ >>> + lbz r10, 3(r4) >>> + addi r3, r3, 4 /* increment s1 by unroll factor */ >>> + cmpdi cr7, r9, 0 >>> + cmplw cr6, 9, r10 >>> + beq cr7, L(diffOfNULL) >>> + addi r4, r4, 4 /* increment s2 by unroll factor */ >>> + beq cr6, L(process_unaligned_bytes) /* unroll byte processing */ >>> + >>> + .p2align 4 >>> +L(ComputeDiff): >>> + extsw r9, r9 >>> + subf r10, r10, r9 /* compute s1 - s2 */ >>> + extsw r3, r10 >>> + blr /* return */ >>> + >>> + .p2align 4 >>> +L(diffOfNULL): >>> + li r9, 0 >>> + subf r10, r10, r9 /* compute s1 - s2 */ >>> + extsw r3, r10 /* sign extend result */ >>> + blr /* return */ >>> + >>> + .p2align 4 >>> +L(update2processBytes): >>> + mr r3, r5 /* update and proceed */ >>> + b L(process_unaligned_bytes) >>> + >>> +END (strcmp) >>> +libc_hidden_builtin_def (strcmp) >
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile index 35020a7..05744e9 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/Makefile +++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile @@ -17,7 +17,7 @@ sysdep_routines += memcpy-power7 memcpy-a2 memcpy-power6 memcpy-cell \ strrchr-power7 strrchr-ppc64 strncat-power7 strncat-ppc64 \ strspn-power7 strspn-ppc64 strcspn-power7 strcspn-ppc64 \ strpbrk-power7 strpbrk-ppc64 strncpy-power7 strncpy-ppc64 \ - stpncpy-power7 stpncpy-ppc64 + stpncpy-power7 stpncpy-ppc64 strcmp-power7 strcmp-ppc64 CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c index d8578fb..b3933a5 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c +++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c @@ -294,5 +294,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, stpncpy, 1, __stpncpy_ppc)) + /* Support sysdeps/powerpc/powerpc64/multiarch/strcmp.c. */ + IFUNC_IMPL (i, name, strcmp, + IFUNC_IMPL_ADD (array, i, strcmp, + hwcap & PPC_FEATURE_HAS_VSX, + __strcmp_power7) + IFUNC_IMPL_ADD (array, i, strcmp, 1, + __strcmp_ppc)) return i; } diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcmp-power7.S b/sysdeps/powerpc/powerpc64/multiarch/strcmp-power7.S new file mode 100644 index 0000000..790ce8d --- /dev/null +++ b/sysdeps/powerpc/powerpc64/multiarch/strcmp-power7.S @@ -0,0 +1,40 @@ +/* Optimized strcmp implementation for POWER7. + Copyright (C) 2014 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +#undef EALIGN +#define EALIGN(name, alignt, words) \ + .section ".text"; \ + ENTRY_2(__strcmp_power7) \ + .align ALIGNARG(alignt); \ + EALIGN_W_##words; \ + BODY_LABEL(__strcmp_power7): \ + cfi_startproc; \ + LOCALENTRY(__strcmp_power7) + +#undef END +#define END(name) \ + cfi_endproc; \ + TRACEBACK(__strcmp_power7) \ + END_2(__strcmp_power7) + +#undef libc_hidden_builtin_def +#define libc_hidden_builtin_def(name) + +#include <sysdeps/powerpc/powerpc64/power7/strcmp.S> diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcmp-ppc64.S b/sysdeps/powerpc/powerpc64/multiarch/strcmp-ppc64.S new file mode 100644 index 0000000..93d1277 --- /dev/null +++ b/sysdeps/powerpc/powerpc64/multiarch/strcmp-ppc64.S @@ -0,0 +1,43 @@ +/* Default strcmp implementation for PowerPC64. + Copyright (C) 2014 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +#if defined SHARED && !defined NOT_IN_libc +# undef EALIGN +# define EALIGN(name, alignt, words) \ + .section ".text"; \ + ENTRY_2(__strcmp_ppc) \ + .align ALIGNARG(alignt); \ + EALIGN_W_##words; \ + BODY_LABEL(__strcmp_ppc): \ + cfi_startproc; \ + LOCALENTRY(__strcmp_ppc) + +# undef END +# define END(name) \ + cfi_endproc; \ + TRACEBACK(__strcmp_ppc) \ + END_2(__strcmp_ppc) + +# undef libc_hidden_builtin_def +# define libc_hidden_builtin_def(name) \ + .globl __GI_strcmp; __GI_strcmp = __strcmp_ppc +#endif + +#include <sysdeps/powerpc/powerpc64/strcmp.S> diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcmp.c b/sysdeps/powerpc/powerpc64/multiarch/strcmp.c new file mode 100644 index 0000000..2013301 --- /dev/null +++ b/sysdeps/powerpc/powerpc64/multiarch/strcmp.c @@ -0,0 +1,31 @@ +/* Multiple versions of strcmp. PowerPC64 version. + Copyright (C) 2014 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if defined SHARED && !defined NOT_IN_libc +# include <string.h> +# include <shlib-compat.h> +# include "init-arch.h" + +extern __typeof (strcmp) __strcmp_ppc attribute_hidden; +extern __typeof (strcmp) __strcmp_power7 attribute_hidden; + +libc_ifunc (strcmp, + (hwcap & PPC_FEATURE_HAS_VSX) + ? __strcmp_power7 + : __strcmp_ppc); +#endif diff --git a/sysdeps/powerpc/powerpc64/power7/strcmp.S b/sysdeps/powerpc/powerpc64/power7/strcmp.S new file mode 100644 index 0000000..f16a9d8 --- /dev/null +++ b/sysdeps/powerpc/powerpc64/power7/strcmp.S @@ -0,0 +1,195 @@ +/* Optimized strcmp implementation for Power7 using 'cmpb' instruction + Copyright (C) 2014 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* The optimization is achieved here through cmpb instruction. + 8-byte aligned strings are processed with double word comparison + and unaligned strings are handled effectively with loop unrolling + technique. */ + +#include <sysdep.h> + +/* int [r3] strcmp (const char *s1 [r3], const char *s2 [r4]) */ + +EALIGN (strcmp, 4, 0) + CALL_MCOUNT 2 + + or r9, r3, r4 + rldicl. r10, r9, 0, 61 /* are s1 and s2 8 byte aligned..?
*/ + bne cr0, L(process_unaligned_bytes) + +/* process input parameters on double word aligned boundary */ + ld r9, 0(r4) /* load s2 at offset=0 */ + li r10, 0 /* load mask=0 */ + cmpb r10, r9, r10 /* compare bytes at s2 with mask */ + cmpdi cr7, r10, 0 /* is NULL found ..? is end of string HIT */ + bne cr7, L(process_unaligned_bytes) /* process byte by byte */ + + ld r10, 0(r3) /* load s1 at offset=0 */ + li r8, 0 /* load mask=0 */ + cmpb r8, r10, r8 /* compare bytes at s1 with mask */ + cmpdi cr7, r8, 0 /* is NULL found ..? is end of string HIT */ + bne cr7, L(process_unaligned_bytes) /* process byte by byte */ + +/* s1 and s2 do not contain NULL now, so compare all 8 bytes in one go */ + cmpb r9, r10, r9 /* compare s1 and s2 */ + cmpdi cr7, r9, -1 /* compare result with 0xFFFFFFFFFFFFFFFF */ + bne cr7, L(process_unaligned_bytes) /* s1,s2 mismatch found */ + + addi r5, r3, 8 /* save next offset of s1 */ + addi r11, r4, 8 /* save next offset of s2 */ + ld r8, 8(r4) /* load s2 at offset=8 */ + li r9, 0 /* load mask=0 */ + cmpb r9, r8, r9 /* compare bytes at s2 with mask */ + cmpdi cr7, r9, 0 /* NULL found ..? */ + bne cr7, L(processBytes)/* update input and process bytes one by one */ + + mr r9, r4 /* save s2 */ + li r10, 0 /* load mask=0 */ + + ld r7, 8(r3) /* load s1 at offset=8 */ + cmpb r6, r7, r10 /* compare bytes at s1 with mask */ + cmpdi cr7, r6, 0 /* is NULL found */ + bne cr7, L(processBytes)/* mismatch, so process one by one */ + +L(unrollDword): + cmpb r8, r7, r8 /* compare s1 and s2 */ + cmpdi cr7, r8, -1 /* compare result with 0xFFFFFFFFFFFFFFFF */ + bne cr7, L(processBytes)/* mismatch with s1 and s2 */ + + addi r5, r3, 16 /* save offset=16 of s1 */ + addi r4, r9, 16 /* save offset=16 of s2 */ + ld r8, 16(r9) /* load s2 at offset=16 */ + cmpb r7, r8, r10 /* compare bytes at s2 with mask */ + cmpdi cr7, r7, 0 /* NULL found ..?
*/ + bne cr7, L(update2processBytes) + + ld r7, 16(r3) /* load s1 at offset=16 */ + cmpb r6, r7, r10 /* check s1 for end of string */ + cmpdi cr7, r6, 0 /* end of s1 ?,then handle byte by byte */ + bne 7,L(update2processBytes) + + cmpb r8, r7, r8 /* compare s1 and s2 double words */ + cmpdi cr7, r8, -1 /* compare results with 0xFFFFFFFFFFFFFFFF */ + bne cr7,L(update2processBytes) + + addi r5, r3, 24 /* update s1 to offset=24 */ + addi r4, r9, 24 /* update s2 to offset=24 */ + + ld r8, 24(r9) /* load s2 */ + cmpb r7, r8, r10 /* compare s2 for NULL */ + cmpdi cr7, r7, 0 /* verify if s2 is ending now */ + bne cr7,L(update2processBytes) + + ld r7, 24(r3) /* load s1 at offset=24 */ + cmpb r6, r7, r10 /* verify for NULL */ + cmpdi cr7, r6, 0 /* is NULL found */ + bne cr7, L(update2processBytes) + + cmpb r8, r7, r8 /* compare s1 and s2 */ + cmpdi cr7, r8, -1 /* are s1 and s2 same ..? */ + bne cr7, L(update2processBytes) + + addi r7, r9, 32 /* update s2 to next double word */ + addi r3, r3, 32 /* update s1 to next double word */ + + ld r8, 32(r9) /* load s2 */ + mr r4, r7 /* save s2 */ + cmpb r6, r8, r10 /* compare s2 with NULL */ + cmpdi cr7, r6, 0 /* end of s2 ..? 
*/ + bne cr7, L(process_unaligned_bytes) + + ld r6, 0(r3) /* load and compare s1 for NULL */ + cmpb r5, r6, r10 + cmpdi cr7, r5, 0 + bne cr7, L(process_unaligned_bytes) + + cmpb r8, r6, r8 /* compare s1 and s2 */ + cmpdi cr7, r8, -1 + bne cr7, L(process_unaligned_bytes) + + addi r5, r3, 8 /* increment s1 and s2 here */ + addi r11, r9, 40 + + ld r8, 40(r9) /* process s2 now */ + cmpb r9, r8, r10 + cmpdi cr7, r9, 0 + bne cr7, L(processBytes) + + mr r9, r7 + ld r7, 8(r3) /* process s1 now */ + cmpb r6, r7, r10 + cmpdi cr7, r6, 0 + beq cr7, L(unrollDword) /* unroll to compare s1 and s2 */ + +L(processBytes): + mr r4, r11 /* update input params */ + mr r3, r5 + + .p2align 4 +L(process_unaligned_bytes): + lbz r9, 0(r3) /* load byte from s1 */ + lbz r10, 0(r4) /* load byte from s2 */ + cmpdi cr7, r9, 0 /* compare *s1 with NULL */ + beq cr7, L(diffOfNULL) /* if *s1 is NULL , return *s1 - *s2 */ + cmplw cr7, r9, r10 /* compare *s1 and *s2 */ + bne cr7, L(ComputeDiff) /* branch to compute difference and return */ + + lbz r9, 1(r3) /* load next byte from s1 */ + lbz r10, 1(r4) /* load next byte from s2 */ + cmpdi cr7, r9, 0 /* compare *s1 with NULL */ + beq cr7, L(diffOfNULL) /* if *s1 is NULL , return *s1 - *s2 */ + cmplw cr7, r9, r10 /* compare *s1 and *s2 */ + bne cr7, L(ComputeDiff) /* branch to compute difference and return */ + + lbz r9, 2(r3) /* unroll 3rd byte here */ + lbz r10, 2(r4) + cmpdi cr7, r9, 0 + beq cr7, L(diffOfNULL) + cmplw cr7, r9, r10 + bne 7, L(ComputeDiff) + + lbz r9, 3(r3) /* unroll 4th byte now */ + lbz r10, 3(r4) + addi r3, r3, 4 /* increment s1 by unroll factor */ + cmpdi cr7, r9, 0 + cmplw cr6, 9, r10 + beq cr7, L(diffOfNULL) + addi r4, r4, 4 /* increment s2 by unroll factor */ + beq cr6, L(process_unaligned_bytes) /* unroll byte processing */ + + .p2align 4 +L(ComputeDiff): + extsw r9, r9 + subf r10, r10, r9 /* compute s1 - s2 */ + extsw r3, r10 + blr /* return */ + + .p2align 4 +L(diffOfNULL): + li r9, 0 + subf r10, r10, r9 /* compute s1 - s2 */
+ extsw r3, r10 /* sign extend result */ + blr /* return */ + + .p2align 4 +L(update2processBytes): + mr r3, r5 /* update and proceed */ + b L(process_unaligned_bytes) + +END (strcmp) +libc_hidden_builtin_def (strcmp)