Message ID | 20240901015456.1782446-1-visitorckw@gmail.com |
---|---|
State | New |
Headers | show |
Series | [RFC] Optimize bsearch() implementation for performance | expand |
On Sun, Sep 01, 2024 at 09:54:56AM +0800, Kuan-Wei Chiu wrote: > Optimize the bsearch() function to improve binary search performance. > This modification reduces the execution time by approximately 8% on an > x86 machine with a 100,000-element array under 1e8 searches. The > optimization slightly increases the code size by 8 bytes. > > code size: > * old: > text data bss dec hex filename > 250 0 0 250 fa ./stdlib/bsearch.o > * new: > text data bss dec hex filename > 258 0 0 258 102 ./stdlib/bsearch.o > > benchmark: > Old bsearch elapsed time: 4536833 > New bsearch elapsed time: 4137556 > Improve efficiency by 8 % > > Signed-off-by: Kuan-Wei Chiu <visitorckw@gmail.com> > --- > Since I only have x86 machines available for testing, I am not entirely > certain whether this patch will bring performance improvements across > different architectures. Therefore, I am sending this as an RFC patch > first. > > bits/stdlib-bsearch.h | 20 +++++++++----------- > 1 file changed, 9 insertions(+), 11 deletions(-) > FWIW, here is the code I used for testing: /* tst-bsearch1.c */ #include <stdio.h> #include <stdlib.h> #include <time.h> #define TEST_OLD int arr[100000]; #define narr (sizeof (arr) / sizeof (arr[0])) static int comp (const void *p1, const void *p2) { int x1 = *(int *) p1; int x2 = *(int *) p2; if (x1 < x2) return -1; if (x1 > x2) return 1; return 0; } __attribute__((noinline)) void * newbsearch (const void *__key, const void *__base, size_t __nmemb, size_t __size, __compar_fn_t __compar) { const void *__p; int __comparison; while (__nmemb) { __p = (const void *) (((const char *) __base) + ((__nmemb >> 1) * __size)); __comparison = (*__compar) (__key, __p); if (__comparison == 0) { #if __GNUC_PREREQ(4, 6) # pragma GCC diagnostic push # pragma GCC diagnostic ignored "-Wcast-qual" #endif return (void *) __p; #if __GNUC_PREREQ(4, 6) # pragma GCC diagnostic pop #endif } if (__comparison > 0) { __base = ((const char *) __p) + __size; --__nmemb; } __nmemb >>= 1; } return NULL; } __attribute__((noinline)) void * oldbsearch (const void *__key, const void *__base, size_t __nmemb, size_t __size, __compar_fn_t __compar) { size_t __l, __u, __idx; const void *__p; int __comparison; __l = 0; __u = __nmemb; while (__l < __u) { __idx = (__l + __u) / 2; __p = (const void *) (((const char *) __base) + (__idx * __size)); __comparison = (*__compar) (__key, __p); if (__comparison < 0) __u = __idx; else if (__comparison > 0) __l = __idx + 1; else { #if __GNUC_PREREQ(4, 6) # pragma GCC diagnostic push # pragma GCC diagnostic ignored "-Wcast-qual" #endif return (void *) __p; #if __GNUC_PREREQ(4, 6) # pragma GCC diagnostic pop #endif } } return NULL; } static int do_test (void) { volatile size_t i; volatile int *res; volatile clock_t begin, end; int key; long long int old_time, new_time; for (i = 0; i < narr; ++i) arr[i] = i; asm volatile ("" : : : "memory"); begin = clock (); asm volatile ("" : : : "memory"); for (i = 0; i < 1e8; ++i) { key = i % narr; res = (int *) oldbsearch (&key, arr, narr, sizeof (arr[0]), comp); if (res != arr + key) { printf ("entry %zd got wrong answer\n", i); return 1; } } asm volatile ("" : : : "memory"); end = clock (); asm volatile ("" : : : "memory"); old_time = end - begin; printf ("Old bsearch elapsed time: %lld\n", old_time); asm volatile ("" : : : "memory"); begin = clock (); asm volatile ("" : : : "memory"); for (i = 0; i < 1e8; ++i) { key = i % narr; res = (int *) newbsearch (&key, arr, narr, sizeof (arr[0]), comp); if (res != arr + key) { printf ("entry %zd got wrong answer\n", i); return 1; } } asm volatile ("" : : : "memory"); end = clock (); asm volatile ("" : : : "memory"); new_time = end - begin; printf ("New bsearch elapsed time: %lld\n", new_time); printf ("Improve efficiency by %lld %%\n", (old_time - new_time) * 100 / old_time); return 0; } #define TEST_FUNCTION do_test () #include "../test-skeleton.c"
On Sat, Aug 31, 2024 at 6:57 PM Kuan-Wei Chiu <visitorckw@gmail.com> wrote: > > On Sun, Sep 01, 2024 at 09:54:56AM +0800, Kuan-Wei Chiu wrote: > > Optimize the bsearch() function to improve binary search performance. > > This modification reduces the execution time by approximately 8% on an > > x86 machine with a 100,000-element array under 1e8 searches. The > > optimization slightly increases the code size by 8 bytes. > > > > code size: > > * old: > > text data bss dec hex filename > > 250 0 0 250 fa ./stdlib/bsearch.o > > * new: > > text data bss dec hex filename > > 258 0 0 258 102 ./stdlib/bsearch.o > > > > benchmark: > > Old bsearch elapsed time: 4536833 > > New bsearch elapsed time: 4137556 > > Improve efficiency by 8 % > > > > Signed-off-by: Kuan-Wei Chiu <visitorckw@gmail.com> > > --- > > Since I only have x86 machines available for testing, I am not entirely > > certain whether this patch will bring performance improvements across > > different architectures. Therefore, I am sending this as an RFC patch > > first. > > > > bits/stdlib-bsearch.h | 20 +++++++++----------- > > 1 file changed, 9 insertions(+), 11 deletions(-) > > > FWIW, here is the code I used for testing: > > /* tst-bsearch1.c */ You could add this as a benchtest in this series :) If you are interested in doing see other benchmark files `benchmarks/bench-*.c` for example on how we doing timing/output (TIMING_* macros and json respectively). > > #include <stdio.h> > #include <stdlib.h> > #include <time.h> > > #define TEST_OLD > > int arr[100000]; > #define narr (sizeof (arr) / sizeof (arr[0])) > > static int > comp (const void *p1, const void *p2) > { > int x1 = *(int *) p1; > int x2 = *(int *) p2; > > if (x1 < x2) > return -1; > if (x1 > x2) > return 1; > return 0; > } > > __attribute__((noinline)) void * > newbsearch (const void *__key, const void *__base, size_t __nmemb, size_t __size, > __compar_fn_t __compar) > { > const void *__p; > int __comparison; > > while (__nmemb) > { > __p = (const void *) (((const char *) __base) + ((__nmemb >> 1) * __size)); > __comparison = (*__compar) (__key, __p); > if (__comparison == 0) > { > #if __GNUC_PREREQ(4, 6) > # pragma GCC diagnostic push > # pragma GCC diagnostic ignored "-Wcast-qual" > #endif > return (void *) __p; > #if __GNUC_PREREQ(4, 6) > # pragma GCC diagnostic pop > #endif > } > if (__comparison > 0) > { > __base = ((const char *) __p) + __size; > --__nmemb; > } > __nmemb >>= 1; > } > > return NULL; > } > > __attribute__((noinline)) void * > oldbsearch (const void *__key, const void *__base, size_t __nmemb, size_t __size, > __compar_fn_t __compar) > { > size_t __l, __u, __idx; > const void *__p; > int __comparison; > > __l = 0; > __u = __nmemb; > while (__l < __u) > { > __idx = (__l + __u) / 2; > __p = (const void *) (((const char *) __base) + (__idx * __size)); > __comparison = (*__compar) (__key, __p); > if (__comparison < 0) > __u = __idx; > else if (__comparison > 0) > __l = __idx + 1; > else > { > #if __GNUC_PREREQ(4, 6) > # pragma GCC diagnostic push > # pragma GCC diagnostic ignored "-Wcast-qual" > #endif > return (void *) __p; > #if __GNUC_PREREQ(4, 6) > # pragma GCC diagnostic pop > #endif > } > } > > return NULL; > } > > static int > do_test (void) > { > volatile size_t i; > volatile int *res; > volatile clock_t begin, end; > int key; > long long int old_time, new_time; > > for (i = 0; i < narr; ++i) > arr[i] = i; > > asm volatile ("" : : : "memory"); > begin = clock (); > asm volatile ("" : : : "memory"); > for (i = 0; i < 1e8; ++i) > { > key = i % narr; > res = (int *) oldbsearch (&key, arr, narr, sizeof (arr[0]), comp); > if (res != arr + key) > { > printf ("entry %zd got wrong answer\n", i); > return 1; > } > } > asm volatile ("" : : : "memory"); > end = clock (); > asm volatile ("" : : : "memory"); > old_time = end - begin; > printf ("Old bsearch elapsed time: %lld\n", old_time); > > asm volatile ("" : : : "memory"); > begin = clock (); > asm volatile ("" : : : "memory"); > for (i = 0; i < 1e8; ++i) > { > key = i % narr; > res = (int *) newbsearch (&key, arr, narr, sizeof (arr[0]), comp); > if (res != arr + key) > { > printf ("entry %zd got wrong answer\n", i); > return 1; > } > } > asm volatile ("" : : : "memory"); > end = clock (); > asm volatile ("" : : : "memory"); > new_time = end - begin; > printf ("New bsearch elapsed time: %lld\n", new_time); > > printf ("Improve efficiency by %lld %%\n", (old_time - new_time) * 100 / old_time); > > return 0; > } > > #define TEST_FUNCTION do_test () > #include "../test-skeleton.c" >
On Sun, Sep 01, 2024 at 04:42:17PM -0700, Noah Goldstein wrote: > On Sat, Aug 31, 2024 at 6:57 PM Kuan-Wei Chiu <visitorckw@gmail.com> wrote: > > > > On Sun, Sep 01, 2024 at 09:54:56AM +0800, Kuan-Wei Chiu wrote: > > > Optimize the bsearch() function to improve binary search performance. > > > This modification reduces the execution time by approximately 8% on an > > > x86 machine with a 100,000-element array under 1e8 searches. The > > > optimization slightly increases the code size by 8 bytes. > > > > > > code size: > > > * old: > > > text data bss dec hex filename > > > 250 0 0 250 fa ./stdlib/bsearch.o > > > * new: > > > text data bss dec hex filename > > > 258 0 0 258 102 ./stdlib/bsearch.o > > > > > > benchmark: > > > Old bsearch elapsed time: 4536833 > > > New bsearch elapsed time: 4137556 > > > Improve efficiency by 8 % > > > > > > Signed-off-by: Kuan-Wei Chiu <visitorckw@gmail.com> > > > --- > > > Since I only have x86 machines available for testing, I am not entirely > > > certain whether this patch will bring performance improvements across > > > different architectures. Therefore, I am sending this as an RFC patch > > > first. > > > > > > bits/stdlib-bsearch.h | 20 +++++++++----------- > > > 1 file changed, 9 insertions(+), 11 deletions(-) > > > > > FWIW, here is the code I used for testing: > > > > /* tst-bsearch1.c */ > You could add this as a benchtest in this series :) > > If you are interested in doing see other benchmark files > `benchmarks/bench-*.c` for example on how we doing > timing/output (TIMING_* macros and json respectively). > Thanks for your guidance! :) I'll give it a try. Regards, Kuan-Wei > > > > #include <stdio.h> > > #include <stdlib.h> > > #include <time.h> > > > > #define TEST_OLD > > > > int arr[100000]; > > #define narr (sizeof (arr) / sizeof (arr[0])) > > > > static int > > comp (const void *p1, const void *p2) > > { > > int x1 = *(int *) p1; > > int x2 = *(int *) p2; > > > > if (x1 < x2) > > return -1; > > if (x1 > x2) > > return 1; > > return 0; > > } > > > > __attribute__((noinline)) void * > > newbsearch (const void *__key, const void *__base, size_t __nmemb, size_t __size, > > __compar_fn_t __compar) > > { > > const void *__p; > > int __comparison; > > > > while (__nmemb) > > { > > __p = (const void *) (((const char *) __base) + ((__nmemb >> 1) * __size)); > > __comparison = (*__compar) (__key, __p); > > if (__comparison == 0) > > { > > #if __GNUC_PREREQ(4, 6) > > # pragma GCC diagnostic push > > # pragma GCC diagnostic ignored "-Wcast-qual" > > #endif > > return (void *) __p; > > #if __GNUC_PREREQ(4, 6) > > # pragma GCC diagnostic pop > > #endif > > } > > if (__comparison > 0) > > { > > __base = ((const char *) __p) + __size; > > --__nmemb; > > } > > __nmemb >>= 1; > > } > > > > return NULL; > > } > > > > __attribute__((noinline)) void * > > oldbsearch (const void *__key, const void *__base, size_t __nmemb, size_t __size, > > __compar_fn_t __compar) > > { > > size_t __l, __u, __idx; > > const void *__p; > > int __comparison; > > > > __l = 0; > > __u = __nmemb; > > while (__l < __u) > > { > > __idx = (__l + __u) / 2; > > __p = (const void *) (((const char *) __base) + (__idx * __size)); > > __comparison = (*__compar) (__key, __p); > > if (__comparison < 0) > > __u = __idx; > > else if (__comparison > 0) > > __l = __idx + 1; > > else > > { > > #if __GNUC_PREREQ(4, 6) > > # pragma GCC diagnostic push > > # pragma GCC diagnostic ignored "-Wcast-qual" > > #endif > > return (void *) __p; > > #if __GNUC_PREREQ(4, 6) > > # pragma GCC diagnostic pop > > #endif > > } > > } > > > > return NULL; > > } > > > > static int > > do_test (void) > > { > > volatile size_t i; > > volatile int *res; > > volatile clock_t begin, end; > > int key; > > long long int old_time, new_time; > > > > for (i = 0; i < narr; ++i) > > arr[i] = i; > > > > asm volatile ("" : : : "memory"); > > begin = clock (); > > asm volatile ("" : : : "memory"); > > for (i = 0; i < 1e8; ++i) > > { > > key = i % narr; > > res = (int *) oldbsearch (&key, arr, narr, sizeof (arr[0]), comp); > > if (res != arr + key) > > { > > printf ("entry %zd got wrong answer\n", i); > > return 1; > > } > > } > > asm volatile ("" : : : "memory"); > > end = clock (); > > asm volatile ("" : : : "memory"); > > old_time = end - begin; > > printf ("Old bsearch elapsed time: %lld\n", old_time); > > > > asm volatile ("" : : : "memory"); > > begin = clock (); > > asm volatile ("" : : : "memory"); > > for (i = 0; i < 1e8; ++i) > > { > > key = i % narr; > > res = (int *) newbsearch (&key, arr, narr, sizeof (arr[0]), comp); > > if (res != arr + key) > > { > > printf ("entry %zd got wrong answer\n", i); > > return 1; > > } > > } > > asm volatile ("" : : : "memory"); > > end = clock (); > > asm volatile ("" : : : "memory"); > > new_time = end - begin; > > printf ("New bsearch elapsed time: %lld\n", new_time); > > > > printf ("Improve efficiency by %lld %%\n", (old_time - new_time) * 100 / old_time); > > > > return 0; > > } > > > > #define TEST_FUNCTION do_test () > > #include "../test-skeleton.c" > >
diff --git a/bits/stdlib-bsearch.h b/bits/stdlib-bsearch.h index 540d718952..57f060b504 100644 --- a/bits/stdlib-bsearch.h +++ b/bits/stdlib-bsearch.h @@ -20,22 +20,14 @@ __extern_inline void * bsearch (const void *__key, const void *__base, size_t __nmemb, size_t __size, __compar_fn_t __compar) { - size_t __l, __u, __idx; const void *__p; int __comparison; - __l = 0; - __u = __nmemb; - while (__l < __u) + while (__nmemb) { - __idx = (__l + __u) / 2; - __p = (const void *) (((const char *) __base) + (__idx * __size)); + __p = (const void *) (((const char *) __base) + ((__nmemb >> 1) * __size)); __comparison = (*__compar) (__key, __p); - if (__comparison < 0) - __u = __idx; - else if (__comparison > 0) - __l = __idx + 1; - else + if (__comparison == 0) { #if __GNUC_PREREQ(4, 6) # pragma GCC diagnostic push @@ -46,6 +38,12 @@ bsearch (const void *__key, const void *__base, size_t __nmemb, size_t __size, # pragma GCC diagnostic pop #endif } + if (__comparison > 0) + { + __base = ((const char *) __p) + __size; + --__nmemb; + } + __nmemb >>= 1; } return NULL;
Optimize the bsearch() function to improve binary search performance. This modification reduces the execution time by approximately 8% on an x86 machine with a 100,000-element array under 1e8 searches. The optimization slightly increases the code size by 8 bytes. code size: * old: text data bss dec hex filename 250 0 0 250 fa ./stdlib/bsearch.o * new: text data bss dec hex filename 258 0 0 258 102 ./stdlib/bsearch.o benchmark: Old bsearch elapsed time: 4536833 New bsearch elapsed time: 4137556 Improve efficiency by 8 % Signed-off-by: Kuan-Wei Chiu <visitorckw@gmail.com> --- Since I only have x86 machines available for testing, I am not entirely certain whether this patch will bring performance improvements across different architectures. Therefore, I am sending this as an RFC patch first. bits/stdlib-bsearch.h | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-)