Message ID | 1506542999-97895-3-git-send-email-patrick.mcgehearty@oracle.com
---|---
State | New
Series | sparc M7 optimized memcpy/memset
Hi Patrick.

Nitpick below.

	Sam

> +
> +.Ls2alg:
> +	lduh	[%o1], %o3	/* know src is 2 byte aligned */
> +	inc	2, %o1
> +	srl	%o3, 8, %o4
> +	stb	%o4, [%o0]	/* have to do bytes, */
> +	stb	%o3, [%o0 + 1]	/* don't know dst alingment */
> +	inc	2, %o0
> +	dec	2, %o2
> +
> +.Laldst:
> +	andcc	%o0, 3, %o5	/* align the destination address */
.Lald:	bz,pn	%icc, .Lw4cp
Label on own line would make patches more readable.
But src looks OK.

> +	cmp	%o5, 2
> +	bz,pn	%icc, .Lw2cp
> +	cmp	%o5, 3
> +.Lw3cp:
> +	lduw	[%o1], %o4
> +	inc	4, %o1
> +	srl	%o4, 24, %o5
> +	stb	%o5, [%o0]
> +	bne,pt	%icc, .Lw1cp
> +	inc	%o0
> +	dec	1, %o2
> +	andn	%o2, 3, %o3	/* i3 is aligned word count */
> +	dec	4, %o3		/* avoid reading beyond tail of src */
> +	sub	%o1, %o0, %o1	/* i1 gets the difference */
> +
> +1:	sll	%o4, 8, %g1	/* save residual bytes */
> +	lduw	[%o1+%o0], %o4
> +	deccc	4, %o3
> +	srl	%o4, 24, %o5	/* merge with residual */
> +	or	%o5, %g1, %g1
> +	st	%g1, [%o0]
> +	bnz,pt	%XCC, 1b
> +	inc	4, %o0
> +	sub	%o1, 3, %o1	/* used one byte of last word read */
> +	and	%o2, 3, %o2
> +	b	7f
> +	inc	4, %o2
> +
> +.Lw1cp:
> +	srl	%o4, 8, %o5
> +	sth	%o5, [%o0]
> +	inc	2, %o0
> +	dec	3, %o2
> +	andn	%o2, 3, %o3
> +	dec	4, %o3		/* avoid reading beyond tail of src */
> +	sub	%o1, %o0, %o1	/* i1 gets the difference */
> +
> +2:	sll	%o4, 24, %g1	/* save residual bytes */
> +	lduw	[%o1+%o0], %o4
> +	deccc	4, %o3
> +	srl	%o4, 8, %o5	/* merge with residual */
> +	or	%o5, %g1, %g1
> +	st	%g1, [%o0]
> +	bnz,pt	%XCC, 2b
> +	inc	4, %o0
> +	sub	%o1, 1, %o1	/* used three bytes of last word read */
> +	and	%o2, 3, %o2
> +	b	7f
> +	inc	4, %o2
Delay slot - indent instruction with one space.

[...]
I'll clean up the nits and double check for any other missing delay slot
spaces.  I expect I'll have it ready for resubmission later today.

- patrick

On 9/27/2017 3:40 PM, Sam Ravnborg wrote:
> Hi Patrick.
>
> Nitpick below.
>
> 	Sam
>
> [...]
On 27/09/2017 13:09, Patrick McGehearty wrote:
> diff --git a/sysdeps/sparc/sparc32/sparcv9/rtld-memmove.c b/sysdeps/sparc/sparc32/sparcv9/rtld-memmove.c
> new file mode 100644
> index 0000000..a2fe190
> --- /dev/null
> +++ b/sysdeps/sparc/sparc32/sparcv9/rtld-memmove.c
> @@ -0,0 +1 @@
> +#include <sparc64/rtld-memmove.c>

I would try to avoid these cross-arch include references (they are a source
of problems for future cleanups and consolidations); just use the default
implementation directly.

Also, since you are adding a new default sparc64 implementation, why can't
you use it for the loader?
On 9/28/2017 11:17 AM, Adhemerval Zanella wrote:
>
> On 27/09/2017 13:09, Patrick McGehearty wrote:
>> diff --git a/sysdeps/sparc/sparc32/sparcv9/rtld-memmove.c b/sysdeps/sparc/sparc32/sparcv9/rtld-memmove.c
>> new file mode 100644
>> index 0000000..a2fe190
>> --- /dev/null
>> +++ b/sysdeps/sparc/sparc32/sparcv9/rtld-memmove.c
>> @@ -0,0 +1 @@
>> +#include <sparc64/rtld-memmove.c>
> I would try to avoid these cross-arch include references (they are a source
> of problems for future cleanups and consolidations); just use the default
> implementation directly.
>
> Also, since you are adding a new default sparc64 implementation, why can't
> you use it for the loader?
>

The pattern above is widely used in sparc32 code.

Examples in sysdeps/sparc/sparc32/sparcv9 include:
  rawmemchr.S, rtld-memcpy.c, rtld-memmove.c, rtld-memset.c,
  stpcpy.S, stpncpy.S, strcat.S, strchr.S, strcmp.S,
  strcpy.S, strcspn.S, strlen.S, strncmp.S, strncpy.S,
  strpbrk.S, strspn.S

and in sysdeps/sparc/sparc32/sparcv9/multiarch:
  memcpy-niagara1.S, memcpy-niagara2.S, memcpy-niagara4.S,
  memcpy.S, memcpy-ultra3.S, memmove.S, memset-niagara1.S,
  memset-niagara4.S, memset.S, rtld-memcpy.c,
  rtld-memmove.c, rtld-memset.c, sha256-block.c,
  sha256-crop.S, sha512-block.c, sha512-crop.S

It would add to implementation complexity to have two different methods in
use for similar purposes.  Revising the current method on such a range of
functions is beyond the scope of this patch set.

- patrick
On 28/09/2017 11:35, Patrick McGehearty wrote:
> The pattern above is widely used in sparc32 code.
>
> [...]
>
> It would add to implementation complexity to have two different methods in
> use for similar purposes.  Revising the current method on such a range of
> functions is beyond the scope of this patch set.
>
> - patrick

Fair enough, although I was only suggesting this for the current patch (not
a change to the other files).
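For concreteness, the alternative suggested here would presumably have the
sparcv9 loader file pull in the generic C routines directly, the way the new
sysdeps/sparc/sparc64/rtld-memmove.c in this patch does, rather than bouncing
through the sparc64 directory.  This is a sketch only; whether the loader
build needs wordcopy.c included this way is an assumption carried over from
that sparc64 file, not something stated in the thread.

/* Hypothetical sysdeps/sparc/sparc32/sparcv9/rtld-memmove.c avoiding the
   cross-arch include by using the generic implementation directly.  */
#include <string/wordcopy.c>
#include <string/memmove.c>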
diff --git a/ChangeLog b/ChangeLog
index 3f9db7a..ee70dde 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,12 @@
 2017-09-26  Jose E. Marchesi  <jose.marchesi@oracle.com>
 
+	* sysdeps/sparc/sparc32/sparcv9/memmove.S: New file.
+	* sysdeps/sparc/sparc32/sparcv9/rtld-memmove.c: Likewise.
+	* sysdeps/sparc/sparc64/memmove.S: Likewise.
+	* sysdeps/sparc/sparc64/rtld-memmove.c: Likewise.
+
+2017-09-26  Jose E. Marchesi  <jose.marchesi@oracle.com>
+
 	* sysdeps/sparc/bits/hwcap.h (HWCAP_SPARC_ADP): Defined.
 	* sysdeps/sparc/dl-procinfo.c: Added "adp" to the
 	_dl_sparc_cap_flags array.
diff --git a/sysdeps/sparc/sparc32/sparcv9/memmove.S b/sysdeps/sparc/sparc32/sparcv9/memmove.S
new file mode 100644
index 0000000..39adeb2
--- /dev/null
+++ b/sysdeps/sparc/sparc32/sparcv9/memmove.S
@@ -0,0 +1,2 @@
+#define XCC icc
+#include <sparc64/memmove.S>
diff --git a/sysdeps/sparc/sparc32/sparcv9/rtld-memmove.c b/sysdeps/sparc/sparc32/sparcv9/rtld-memmove.c
new file mode 100644
index 0000000..a2fe190
--- /dev/null
+++ b/sysdeps/sparc/sparc32/sparcv9/rtld-memmove.c
@@ -0,0 +1 @@
+#include <sparc64/rtld-memmove.c>
diff --git a/sysdeps/sparc/sparc64/memmove.S b/sysdeps/sparc/sparc64/memmove.S
new file mode 100644
index 0000000..eb71ef3
--- /dev/null
+++ b/sysdeps/sparc/sparc64/memmove.S
@@ -0,0 +1,186 @@
+/* Copy memory to memory until the specified number of bytes
+   has been copied.  Overlap is handled correctly.
+   For SPARC V9.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#ifndef XCC
+# define XCC xcc
+	.register	%g2, #scratch
+#endif
+
+ENTRY(memmove)
+	mov	%o0, %g2	/* Save pointer to destination */
+	cmp	%o1, %o0	/* if from address is >= to use forward copy */
+	bgeu,a	%XCC, 2f	/* else use backward if ... */
+	cmp	%o2, 17		/* delay slot, for small counts copy bytes */
+
+	sub	%o0, %o1, %o4	/* get difference of two addresses */
+	cmp	%o2, %o4	/* compare size and difference of addresses */
+	bgu	%XCC, .Lovbc	/* if size is bigger, have to do overlapped copy */
+	cmp	%o2, 17		/* delay slot, for small counts copy bytes */
+/*
+ * normal, copy forwards
+ */
+2:	ble	%XCC, .Ldbytecp
+	andcc	%o1, 3, %o5	/* is src word aligned */
+	bz,pn	%icc, .Laldst
+	cmp	%o5, 2		/* is src half-word aligned */
+	be,pn	%icc, .Ls2alg
+	cmp	%o5, 3		/* src is byte aligned */
+	ldub	[%o1], %o3	/* move 1 or 3 bytes to align it */
+	inc	1, %o1
+	stb	%o3, [%o0]	/* move a byte to align src */
+	inc	1, %o0
+	bne,pn	%icc, .Ls2alg
+	dec	%o2
+	b	.Lald		/* now go align dest */
+	andcc	%o0, 3, %o5
+
+.Ls2alg:
+	lduh	[%o1], %o3	/* know src is 2 byte aligned */
+	inc	2, %o1
+	srl	%o3, 8, %o4
+	stb	%o4, [%o0]	/* have to do bytes, */
+	stb	%o3, [%o0 + 1]	/* don't know dst alingment */
+	inc	2, %o0
+	dec	2, %o2
+
+.Laldst:
+	andcc	%o0, 3, %o5	/* align the destination address */
+.Lald:	bz,pn	%icc, .Lw4cp
+	cmp	%o5, 2
+	bz,pn	%icc, .Lw2cp
+	cmp	%o5, 3
+.Lw3cp:
+	lduw	[%o1], %o4
+	inc	4, %o1
+	srl	%o4, 24, %o5
+	stb	%o5, [%o0]
+	bne,pt	%icc, .Lw1cp
+	inc	%o0
+	dec	1, %o2
+	andn	%o2, 3, %o3	/* i3 is aligned word count */
+	dec	4, %o3		/* avoid reading beyond tail of src */
+	sub	%o1, %o0, %o1	/* i1 gets the difference */
+
+1:	sll	%o4, 8, %g1	/* save residual bytes */
+	lduw	[%o1+%o0], %o4
+	deccc	4, %o3
+	srl	%o4, 24, %o5	/* merge with residual */
+	or	%o5, %g1, %g1
+	st	%g1, [%o0]
+	bnz,pt	%XCC, 1b
+	inc	4, %o0
+	sub	%o1, 3, %o1	/* used one byte of last word read */
+	and	%o2, 3, %o2
+	b	7f
+	inc	4, %o2
+
+.Lw1cp:
+	srl	%o4, 8, %o5
+	sth	%o5, [%o0]
+	inc	2, %o0
+	dec	3, %o2
+	andn	%o2, 3, %o3
+	dec	4, %o3		/* avoid reading beyond tail of src */
+	sub	%o1, %o0, %o1	/* i1 gets the difference */
+
+2:	sll	%o4, 24, %g1	/* save residual bytes */
+	lduw	[%o1+%o0], %o4
+	deccc	4, %o3
+	srl	%o4, 8, %o5	/* merge with residual */
+	or	%o5, %g1, %g1
+	st	%g1, [%o0]
+	bnz,pt	%XCC, 2b
+	inc	4, %o0
+	sub	%o1, 1, %o1	/* used three bytes of last word read */
+	and	%o2, 3, %o2
+	b	7f
+	inc	4, %o2
+
+.Lw2cp:
+	lduw	[%o1], %o4
+	inc	4, %o1
+	srl	%o4, 16, %o5
+	sth	%o5, [%o0]
+	inc	2, %o0
+	dec	2, %o2
+	andn	%o2, 3, %o3	/* i3 is aligned word count */
+	dec	4, %o3		/* avoid reading beyond tail of src */
+	sub	%o1, %o0, %o1	/* i1 gets the difference */
+
+3:	sll	%o4, 16, %g1	/* save residual bytes */
+	lduw	[%o1+%o0], %o4
+	deccc	4, %o3
+	srl	%o4, 16, %o5	/* merge with residual */
+	or	%o5, %g1, %g1
+	st	%g1, [%o0]
+	bnz,pt	%XCC, 3b
+	inc	4, %o0
+	sub	%o1, 2, %o1	/* used two bytes of last word read */
+	and	%o2, 3, %o2
+	b	7f
+	inc	4, %o2
+
+.Lw4cp:
+	andn	%o2, 3, %o3	/* i3 is aligned word count */
+	sub	%o1, %o0, %o1	/* i1 gets the difference */
+
+1:	lduw	[%o1+%o0], %o4	/* read from address */
+	deccc	4, %o3		/* decrement count */
+	st	%o4, [%o0]	/* write at destination address */
+	bg,pt	%XCC, 1b
+	inc	4, %o0		/* increment to address */
+	b	7f
+	and	%o2, 3, %o2	/* number of leftover bytes, if any */
+
+/*
+ * differenced byte copy, works with any alignment
+ */
+.Ldbytecp:
+	b	7f
+	sub	%o1, %o0, %o1	/* i1 gets the difference */
+
+4:	stb	%o4, [%o0]	/* write to address */
+	inc	%o0		/* inc to address */
+7:	deccc	%o2		/* decrement count */
+	bge,a	%XCC, 4b	/* loop till done */
+	ldub	[%o1+%o0], %o4	/* read from address */
+	retl
+	mov	%g2, %o0	/* return pointer to destination */
+
+/*
+ * an overlapped copy that must be done "backwards"
+ */
+.Lovbc:
+	add	%o1, %o2, %o1	/* get to end of source space */
+	add	%o0, %o2, %o0	/* get to end of destination space */
+	sub	%o1, %o0, %o1	/* i1 gets the difference */
+
+5:	dec	%o0		/* decrement to address */
+	ldub	[%o1+%o0], %o3	/* read a byte */
+	deccc	%o2		/* decrement count */
+	bg,pt	%XCC, 5b	/* loop until done */
+	stb	%o3, [%o0]	/* write byte */
+	retl
+	mov	%g2, %o0	/* return pointer to destination */
+END(memmove)
+
+libc_hidden_builtin_def (memmove)
diff --git a/sysdeps/sparc/sparc64/rtld-memmove.c b/sysdeps/sparc/sparc64/rtld-memmove.c
new file mode 100644
index 0000000..1e73c6b
--- /dev/null
+++ b/sysdeps/sparc/sparc64/rtld-memmove.c
@@ -0,0 +1,2 @@
+#include <string/wordcopy.c>
+#include <string/memmove.c>
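A note on the two-line sparc32 wrapper above: the size and count compares in
the shared assembly branch on the %XCC macro, which the wrapper maps to the
32-bit condition codes, while the sparc64 build leaves XCC undefined and
memmove.S itself defaults it to xcc.  A sketch of the mechanism, with
editorial comments added:

/* sparc32/sparcv9/memmove.S defines XCC before pulling in the shared body.  */
#define XCC icc                  /* 32-bit build: these branches test %icc */
#include <sparc64/memmove.S>     /* shared body uses e.g. "bgu %XCC, .Lovbc" */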
From: Jose E. Marchesi <jose.marchesi@oracle.com>

Tested in sparcv9-*-* and sparc64-*-* targets in both non-multi-arch and
multi-arch configurations.
---
 ChangeLog                                    |   7 +
 sysdeps/sparc/sparc32/sparcv9/memmove.S      |   2 +
 sysdeps/sparc/sparc32/sparcv9/rtld-memmove.c |   1 +
 sysdeps/sparc/sparc64/memmove.S              | 186 ++++++++++++++++++++++++++
 sysdeps/sparc/sparc64/rtld-memmove.c         |   2 +
 5 files changed, 198 insertions(+), 0 deletions(-)
 create mode 100644 sysdeps/sparc/sparc32/sparcv9/memmove.S
 create mode 100644 sysdeps/sparc/sparc32/sparcv9/rtld-memmove.c
 create mode 100644 sysdeps/sparc/sparc64/memmove.S
 create mode 100644 sysdeps/sparc/sparc64/rtld-memmove.c
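In outline, the copy-direction logic that sysdeps/sparc/sparc64/memmove.S
implements corresponds roughly to the following C.  This is an illustrative
sketch, not code from the submission; the byte loops stand in for the aligned
32-bit word loops (.Lw4cp and the .Lw1cp/.Lw2cp/.Lw3cp merge paths), and the
function name is made up.

#include <stddef.h>
#include <stdint.h>

static void *
memmove_outline (void *dstp, const void *srcp, size_t n)
{
  char *dst = dstp;
  const char *src = srcp;

  if ((uintptr_t) src >= (uintptr_t) dst
      || (uintptr_t) dst - (uintptr_t) src >= n)
    {
      /* Forward copy: src sits at or above dst, or the regions do not
         overlap destructively.  The assembly first aligns src and dst,
         then copies 32-bit words, merging residual bytes as needed.  */
      while (n-- > 0)
        *dst++ = *src++;
    }
  else
    {
      /* dst lands inside [src, src + n): copy backwards byte by byte,
         as the .Lovbc path does.  */
      dst += n;
      src += n;
      while (n-- > 0)
        *--dst = *--src;
    }
  return dstp;  /* memmove returns the original destination (saved in %g2).  */
}

The assembly also short-circuits small copies: counts of 17 bytes or less go
straight to the byte loop at .Ldbytecp rather than through the word loops.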