Message ID | 20141218211348.GA16854@domone |
---|---|
State | New |
Headers | show |
On 12/19/2014 02:43 AM, Ondřej Bílka wrote: > On Wed, Dec 17, 2014 at 09:34:53PM +0530, Rajalakshmi Srinivasaraghavan wrote: >> >> This patch optimizes strcpy for ppc64 for unaligned source or >> destination address. The source or destination address is aligned >> to doubleword and data is shifted based on the alignment and >> added with the previous loaded data to be written as a doubleword. >> For each load, cmpb instruction is used for faster null check. >> >> More combination of unaligned inputs is also added in benchtest >> to measure the improvement. The new optimization shows 2 to 80% of >> performance improvement for longer string though it does not show >> big difference on string size less than 16 due to additional checks. >> >> This patch is tested on powerpc64 BE and LE and I have also attached >> the benchtest result. >> > As I wrote that benchtests are suspect first retest what happens if you > do not always call strcpy with same input and output buffer. What > difference that makes in benchmark? > I applied this patch with and without my optimization and I could not see any decrease in performance. Attached the results. 
> diff --git a/benchtests/bench-strcpy.c b/benchtests/bench-strcpy.c > index c3ab4cf..0329f60 100644 > --- a/benchtests/bench-strcpy.c > +++ b/benchtests/bench-strcpy.c > @@ -71,25 +71,25 @@ SIMPLE_STRCPY (CHAR *dst, const CHAR *src) > typedef CHAR *(*proto_t) (CHAR *, const CHAR *); > > static void > -do_one_test (impl_t *impl, CHAR *dst, const CHAR *src, > +do_one_test (impl_t *impl, CHAR **dst, CHAR **src, > size_t len __attribute__((unused))) > { > size_t i, iters = INNER_LOOP_ITERS; > timing_t start, stop, cur; > > - if (CALL (impl, dst, src) != STRCPY_RESULT (dst, len)) > + if (CALL (impl, dst[0], src[0]) != STRCPY_RESULT (dst[0], len[0])) Modified it as if (CALL (impl, dst[0], src[0]) != STRCPY_RESULT (dst[0], len)) > { > error (0, 0, "Wrong result in function %s %p %p", impl->name, > - CALL (impl, dst, src), STRCPY_RESULT (dst, len)); > + CALL (impl, dst[0], src[0]), STRCPY_RESULT (dst[0], len)); > ret = 1; > return; > } > > - if (STRCMP (dst, src) != 0) > + if (STRCMP (dst[0], src[0]) != 0) > { > error (0, 0, > "Wrong result in function %s dst \"%" sfmt "\" src \"%" sfmt "\"", > - impl->name, dst, src); > + impl->name, dst[0], src[0]); > ret = 1; > return; > } > @@ -97,7 +97,7 @@ do_one_test (impl_t *impl, CHAR *dst, const CHAR *src, > TIMING_NOW (start); > for (i = 0; i < iters; ++i) > { > - CALL (impl, dst, src); > + CALL (impl, dst[i % 16], src[i % 16]); > } > TIMING_NOW (stop); > > @@ -109,8 +109,8 @@ do_one_test (impl_t *impl, CHAR *dst, const CHAR *src, > static void > do_test (size_t align1, size_t align2, size_t len, int max_char) > { > - size_t i; > - CHAR *s1, *s2; > + size_t i, j; > + CHAR **s1, **s2; > /* For wcscpy: align1 and align2 here mean alignment not in bytes, > but in wchar_ts, in bytes it will equal to align * (sizeof (wchar_t)) > len for wcschr here isn't in bytes but it's number of wchar_t symbols. 
*/ > @@ -122,12 +122,17 @@ do_test (size_t align1, size_t align2, size_t len, int max_char) > if ((align2 + len) * sizeof(CHAR) >= page_size) > return; > > - s1 = (CHAR *) (buf1) + align1; > - s2 = (CHAR *) (buf2) + align2; > + s1 = calloc (sizeof (char *), 16); > + s2 = calloc (sizeof (char *), 16); > + for (j = 0; j < 16; j++) > + { > + s1[j] = ((CHAR *) calloc (align1 + len + 1, sizeof (CHAR))) + align1; > + s2[j] = ((CHAR *) calloc (align2 + len + 1, sizeof (CHAR))) + align2; > > - for (i = 0; i < len; i++) > - s1[i] = 32 + 23 * i % (max_char - 32); > - s1[len] = 0; > + for (i = 0; i < len; i++) > + s1[j][i] = 32 + 23 * i % (max_char - 32); > + s1[j][len] = 0; > + } > > printf ("Length %4zd, alignments in bytes %2zd/%2zd:", len, align1 * sizeof(CHAR), align2 * sizeof(CHAR)); > > >
diff --git a/benchtests/bench-strcpy.c b/benchtests/bench-strcpy.c index c3ab4cf..0329f60 100644 --- a/benchtests/bench-strcpy.c +++ b/benchtests/bench-strcpy.c @@ -71,25 +71,25 @@ SIMPLE_STRCPY (CHAR *dst, const CHAR *src) typedef CHAR *(*proto_t) (CHAR *, const CHAR *); static void -do_one_test (impl_t *impl, CHAR *dst, const CHAR *src, +do_one_test (impl_t *impl, CHAR **dst, CHAR **src, size_t len __attribute__((unused))) { size_t i, iters = INNER_LOOP_ITERS; timing_t start, stop, cur; - if (CALL (impl, dst, src) != STRCPY_RESULT (dst, len)) + if (CALL (impl, dst[0], src[0]) != STRCPY_RESULT (dst[0], len[0])) { error (0, 0, "Wrong result in function %s %p %p", impl->name, - CALL (impl, dst, src), STRCPY_RESULT (dst, len)); + CALL (impl, dst[0], src[0]), STRCPY_RESULT (dst[0], len)); ret = 1; return; } - if (STRCMP (dst, src) != 0) + if (STRCMP (dst[0], src[0]) != 0) { error (0, 0, "Wrong result in function %s dst \"%" sfmt "\" src \"%" sfmt "\"", - impl->name, dst, src); + impl->name, dst[0], src[0]); ret = 1; return; } @@ -97,7 +97,7 @@ do_one_test (impl_t *impl, CHAR *dst, const CHAR *src, TIMING_NOW (start); for (i = 0; i < iters; ++i) { - CALL (impl, dst, src); + CALL (impl, dst[i % 16], src[i % 16]); } TIMING_NOW (stop); @@ -109,8 +109,8 @@ do_one_test (impl_t *impl, CHAR *dst, const CHAR *src, static void do_test (size_t align1, size_t align2, size_t len, int max_char) { - size_t i; - CHAR *s1, *s2; + size_t i, j; + CHAR **s1, **s2; /* For wcscpy: align1 and align2 here mean alignment not in bytes, but in wchar_ts, in bytes it will equal to align * (sizeof (wchar_t)) len for wcschr here isn't in bytes but it's number of wchar_t symbols. 
*/ @@ -122,12 +122,17 @@ do_test (size_t align1, size_t align2, size_t len, int max_char) if ((align2 + len) * sizeof(CHAR) >= page_size) return; - s1 = (CHAR *) (buf1) + align1; - s2 = (CHAR *) (buf2) + align2; + s1 = calloc (sizeof (char *), 16); + s2 = calloc (sizeof (char *), 16); + for (j = 0; j < 16; j++) + { + s1[j] = ((CHAR *) calloc (align1 + len + 1, sizeof (CHAR))) + align1; + s2[j] = ((CHAR *) calloc (align2 + len + 1, sizeof (CHAR))) + align2; - for (i = 0; i < len; i++) - s1[i] = 32 + 23 * i % (max_char - 32); - s1[len] = 0; + for (i = 0; i < len; i++) + s1[j][i] = 32 + 23 * i % (max_char - 32); + s1[j][len] = 0; + } printf ("Length %4zd, alignments in bytes %2zd/%2zd:", len, align1 * sizeof(CHAR), align2 * sizeof(CHAR));