Message ID | 20230718141543.3925254-2-adhemerval.zanella@linaro.org |
---|---|
State | New |
Series | Use introsort for qsort |
Ping.

On 18/07/23 11:15, Adhemerval Zanella wrote:
> The optimization takes into consideration that the most common element
> sizes are either 32 or 64 bits and that inputs are aligned to the word
> boundary.  This is similar to what msort does.
>
> For large buffers the swap operation uses memcpy/mempcpy with a
> small fixed-size buffer (so the compiler may inline the operations).
>
> Checked on x86_64-linux-gnu.
> ---
>  stdlib/qsort.c | 116 +++++++++++++++++++++++++++++++++++++++++--------
>  1 file changed, 98 insertions(+), 18 deletions(-)
> [...]
On Tue, Jul 18, 2023 at 9:16 AM Adhemerval Zanella via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> The optimization takes into consideration that the most common element
> sizes are either 32 or 64 bits and that inputs are aligned to the word
> boundary.  This is similar to what msort does.
>
> For large buffers the swap operation uses memcpy/mempcpy with a
> small fixed-size buffer (so the compiler may inline the operations).
>
> Checked on x86_64-linux-gnu.
> ---
>  stdlib/qsort.c | 116 +++++++++++++++++++++++++++++++++++++++++--------
>  1 file changed, 98 insertions(+), 18 deletions(-)
> [...]
> +static inline void
> +swap_bytes (void * restrict a, void * restrict b, size_t n)
> +{
> +  /* Use multiple small memcpys with constant size to enable inlining
> +     on most targets.  */
> +  enum { SWAP_GENERIC_SIZE = 32 };

Just intuitively think 16 makes more sense here, but no particularly good
reason.

> +  unsigned char tmp[SWAP_GENERIC_SIZE];
> +  while (n > SWAP_GENERIC_SIZE)
> +    {
> +      memcpy (tmp, a, SWAP_GENERIC_SIZE);
> +      a = __mempcpy (a, b, SWAP_GENERIC_SIZE);
> +      b = __mempcpy (b, tmp, SWAP_GENERIC_SIZE);
> +      n -= SWAP_GENERIC_SIZE;
> +    }
> +  while (n > 0)
> +    {
> +      unsigned char t = ((unsigned char *)a)[--n];
> +      ((unsigned char *)a)[n] = ((unsigned char *)b)[n];
> +      ((unsigned char *)b)[n] = t;
> +    }
> +}
> +
> +/* Replace the indirect call with a series of if statements.  It should
> +   help the branch predictor.  */
> +static void
> +do_swap (void * restrict a, void * restrict b, size_t size,
> +         enum swap_type_t swap_func)
> +{

nit: swap_func -> swap_type/swap_method (think it was previously
a function pointer but the name still implies as much).

> +  if (swap_func == SWAP_WORDS_64)
> +    swap_words_64 (a, b, size);
> +  else if (swap_func == SWAP_WORDS_32)
> +    swap_words_32 (a, b, size);
> +  else
> +    swap_bytes (a, b, size);
> +}
> [...]
> --
> 2.34.1
On 29/08/23 04:33, Noah Goldstein wrote:
> On Tue, Jul 18, 2023 at 9:16 AM Adhemerval Zanella via Libc-alpha
> <libc-alpha@sourceware.org> wrote:
>>
>> The optimization takes into consideration that the most common element
>> sizes are either 32 or 64 bits and that inputs are aligned to the word
>> boundary.  This is similar to what msort does.
>>
>> For large buffers the swap operation uses memcpy/mempcpy with a
>> small fixed-size buffer (so the compiler may inline the operations).
>>
>> Checked on x86_64-linux-gnu.
>> [...]
>> +static inline void
>> +swap_bytes (void * restrict a, void * restrict b, size_t n)
>> +{
>> +  /* Use multiple small memcpys with constant size to enable inlining
>> +     on most targets.  */
>> +  enum { SWAP_GENERIC_SIZE = 32 };
> Just intuitively think 16 makes more sense here, but no particularly good
> reason.

I don't have a strong opinion here, and most likely the best value will
depend on the architecture.  I picked a small value so memcpy/__mempcpy
can be inlined (and it should be easy to add an arch-specific hook to
change this if it is really required).

>> [...]
>> +/* Replace the indirect call with a series of if statements.  It should
>> +   help the branch predictor.  */
>> +static void
>> +do_swap (void * restrict a, void * restrict b, size_t size,
>> +         enum swap_type_t swap_func)
>> +{
> nit: swap_func -> swap_type/swap_method (think it was previously
> a function pointer but the name still implies as much).

Ack.

>> [...]
>> --
>> 2.34.1
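As a rough illustration of the arch-specific hook mentioned above, the chunk
size could be made overridable with a simple macro guard.  The macro name and
the idea of a per-architecture override header below are hypothetical, not
part of the posted patch:

/* Hypothetical hook: an architecture could pre-define QSORT_SWAP_CHUNK_SIZE
   (for example from a sysdeps header included before this point); the
   generic value stays at 32 bytes.  swap_bytes would then size its temporary
   buffer and its memcpy/__mempcpy calls with this macro instead of the local
   SWAP_GENERIC_SIZE enum.  */
#ifndef QSORT_SWAP_CHUNK_SIZE
# define QSORT_SWAP_CHUNK_SIZE 32
#endif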
* Adhemerval Zanella via Libc-alpha:

> The optimization takes into consideration that the most common element
> sizes are either 32 or 64 bits and that inputs are aligned to the word
> boundary.  This is similar to what msort does.
>
> For large buffers the swap operation uses memcpy/mempcpy with a
> small fixed-size buffer (so the compiler may inline the operations).

Should we add a public memswap function that works on non-overlapping
buffers and swaps them?  It seems this might be useful in general.  Then
architecture maintainers can optimize it as needed.

Thanks,
Florian
On Tue, Aug 29, 2023 at 11:53 AM Florian Weimer via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> * Adhemerval Zanella via Libc-alpha:
>
> > The optimization takes into consideration that the most common element
> > sizes are either 32 or 64 bits and that inputs are aligned to the word
> > boundary.  This is similar to what msort does.
> >
> > For large buffers the swap operation uses memcpy/mempcpy with a
> > small fixed-size buffer (so the compiler may inline the operations).
>
> Should we add a public memswap function that works on non-overlapping
> buffers and swaps them?  It seems this might be useful in general.  Then
> architecture maintainers can optimize it as needed.
>
+1
> Thanks,
> Florian
>
On 29/08/23 19:52, Noah Goldstein via Libc-alpha wrote:
> On Tue, Aug 29, 2023 at 11:53 AM Florian Weimer via Libc-alpha
> <libc-alpha@sourceware.org> wrote:
>>
>> * Adhemerval Zanella via Libc-alpha:
>>
>>> The optimization takes into consideration that the most common element
>>> sizes are either 32 or 64 bits and that inputs are aligned to the word
>>> boundary.  This is similar to what msort does.
>>>
>>> For large buffers the swap operation uses memcpy/mempcpy with a
>>> small fixed-size buffer (so the compiler may inline the operations).
>>
>> Should we add a public memswap function that works on non-overlapping
>> buffers and swaps them?  It seems this might be useful in general.  Then
>> architecture maintainers can optimize it as needed.
>>
> +1
>> Thanks,
>> Florian
>>

Fair enough, I think we can start with an internal symbol for qsort.
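For reference, a minimal sketch of what such an internal helper could look
like.  The name __memswap, its placement, and the 32-byte chunking are
assumptions for illustration only, not an API the thread settles on, and
plain memcpy is used instead of glibc's internal __mempcpy to keep the
sketch self-contained:

#include <stddef.h>
#include <string.h>

/* Swap N bytes between the non-overlapping buffers P1 and P2.  A real
   glibc-internal version could live in its own file so architectures can
   provide an optimized override.  */
static void
__memswap (void *restrict p1, void *restrict p2, size_t n)
{
  enum { CHUNK = 32 };
  unsigned char tmp[CHUNK];

  /* Swap fixed-size chunks first so the memcpy calls can be inlined.  */
  while (n >= CHUNK)
    {
      memcpy (tmp, p1, CHUNK);
      memcpy (p1, p2, CHUNK);
      memcpy (p2, tmp, CHUNK);
      p1 = (char *) p1 + CHUNK;
      p2 = (char *) p2 + CHUNK;
      n -= CHUNK;
    }

  /* Swap the remaining tail byte by byte.  */
  unsigned char *a = p1, *b = p2;
  while (n > 0)
    {
      n--;
      unsigned char t = a[n];
      a[n] = b[n];
      b[n] = t;
    }
}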
diff --git a/stdlib/qsort.c b/stdlib/qsort.c
index 728a0ed370..5bcc287c79 100644
--- a/stdlib/qsort.c
+++ b/stdlib/qsort.c
@@ -23,20 +23,92 @@
 #include <limits.h>
 #include <stdlib.h>
 #include <string.h>
+#include <stdbool.h>
 
-/* Byte-wise swap two items of size SIZE. */
-#define SWAP(a, b, size)                                                \
-  do                                                                    \
-    {                                                                   \
-      size_t __size = (size);                                           \
-      char *__a = (a), *__b = (b);                                      \
-      do                                                                \
-        {                                                               \
-          char __tmp = *__a;                                            \
-          *__a++ = *__b;                                                \
-          *__b++ = __tmp;                                               \
-        } while (--__size > 0);                                         \
-    } while (0)
+/* Swap SIZE bytes between addresses A and B.  These helpers are provided
+   alongside the generic one as an optimization.  */
+
+enum swap_type_t
+  {
+    SWAP_WORDS_64,
+    SWAP_WORDS_32,
+    SWAP_BYTES
+  };
+
+/* If this function returns true, elements can be safely copied using word
+   loads and stores.  Otherwise, it might not be safe.  BASE (as an integer)
+   must be a multiple of the word alignment.  SIZE must be a multiple of
+   WORDSIZE.  Since WORDSIZE must be a multiple of the word alignment, and
+   WORDSIZE is a power of two on all supported platforms, this function for
+   speed merely checks that BASE and SIZE are both multiples of the word
+   size.  */
+static inline bool
+is_aligned (const void *base, size_t size, size_t wordsize)
+{
+  return (((uintptr_t) base | size) & (wordsize - 1)) == 0;
+}
+
+static inline void
+swap_words_64 (void * restrict a, void * restrict b, size_t n)
+{
+  typedef uint64_t __attribute__ ((__may_alias__)) u64_alias_t;
+  do
+    {
+      n -= 8;
+      u64_alias_t t = *(u64_alias_t *)(a + n);
+      *(u64_alias_t *)(a + n) = *(u64_alias_t *)(b + n);
+      *(u64_alias_t *)(b + n) = t;
+    } while (n);
+}
+
+static inline void
+swap_words_32 (void * restrict a, void * restrict b, size_t n)
+{
+  typedef uint32_t __attribute__ ((__may_alias__)) u32_alias_t;
+  do
+    {
+      n -= 4;
+      u32_alias_t t = *(u32_alias_t *)(a + n);
+      *(u32_alias_t *)(a + n) = *(u32_alias_t *)(b + n);
+      *(u32_alias_t *)(b + n) = t;
+    } while (n);
+}
+
+static inline void
+swap_bytes (void * restrict a, void * restrict b, size_t n)
+{
+  /* Use multiple small memcpys with constant size to enable inlining
+     on most targets.  */
+  enum { SWAP_GENERIC_SIZE = 32 };
+  unsigned char tmp[SWAP_GENERIC_SIZE];
+  while (n > SWAP_GENERIC_SIZE)
+    {
+      memcpy (tmp, a, SWAP_GENERIC_SIZE);
+      a = __mempcpy (a, b, SWAP_GENERIC_SIZE);
+      b = __mempcpy (b, tmp, SWAP_GENERIC_SIZE);
+      n -= SWAP_GENERIC_SIZE;
+    }
+  while (n > 0)
+    {
+      unsigned char t = ((unsigned char *)a)[--n];
+      ((unsigned char *)a)[n] = ((unsigned char *)b)[n];
+      ((unsigned char *)b)[n] = t;
+    }
+}
+
+/* Replace the indirect call with a series of if statements.  It should
+   help the branch predictor.  */
+static void
+do_swap (void * restrict a, void * restrict b, size_t size,
+         enum swap_type_t swap_func)
+{
+  if (swap_func == SWAP_WORDS_64)
+    swap_words_64 (a, b, size);
+  else if (swap_func == SWAP_WORDS_32)
+    swap_words_32 (a, b, size);
+  else
+    swap_bytes (a, b, size);
+}
 
 /* Discontinue quicksort algorithm when partition gets below this size.
    This particular magic number was chosen to work best on a Sun 4/260.  */
@@ -96,6 +168,14 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
     /* Avoid lossage with unsigned arithmetic below.  */
     return;
 
+  enum swap_type_t swap_type;
+  if (is_aligned (pbase, size, 8))
+    swap_type = SWAP_WORDS_64;
+  else if (is_aligned (pbase, size, 4))
+    swap_type = SWAP_WORDS_32;
+  else
+    swap_type = SWAP_BYTES;
+
   if (total_elems > MAX_THRESH)
     {
       char *lo = base_ptr;
@@ -119,13 +199,13 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
           char *mid = lo + size * ((hi - lo) / size >> 1);
 
           if ((*cmp) ((void *) mid, (void *) lo, arg) < 0)
-            SWAP (mid, lo, size);
+            do_swap (mid, lo, size, swap_type);
           if ((*cmp) ((void *) hi, (void *) mid, arg) < 0)
-            SWAP (mid, hi, size);
+            do_swap (mid, hi, size, swap_type);
           else
             goto jump_over;
           if ((*cmp) ((void *) mid, (void *) lo, arg) < 0)
-            SWAP (mid, lo, size);
+            do_swap (mid, lo, size, swap_type);
         jump_over:;
 
           left_ptr  = lo + size;
@@ -144,7 +224,7 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
 
               if (left_ptr < right_ptr)
                 {
-                  SWAP (left_ptr, right_ptr, size);
+                  do_swap (left_ptr, right_ptr, size, swap_type);
                   if (mid == left_ptr)
                     mid = right_ptr;
                   else if (mid == right_ptr)
@@ -216,7 +296,7 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
         tmp_ptr = run_ptr;
 
     if (tmp_ptr != base_ptr)
-      SWAP (tmp_ptr, base_ptr, size);
+      do_swap (tmp_ptr, base_ptr, size, swap_type);
 
     /* Insertion sort, running from left-hand-side up to right-hand-side.  */
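To make the swap_type selection above concrete, here is a small throwaway
program; it is not part of the patch, the arrays and helper names are made
up for illustration, and only the is_aligned logic mirrors the code in the
diff.  It shows which swap path _quicksort would pick for a few typical
element types on a 64-bit target such as x86_64:

#include <stdint.h>
#include <stdio.h>

/* Same check as the patch: the base address and the element size must
   both be multiples of the word size.  */
static int
is_aligned (const void *base, size_t size, size_t wordsize)
{
  return (((uintptr_t) base | size) & (wordsize - 1)) == 0;
}

/* Mirrors the swap_type selection at the top of _quicksort.  */
static const char *
pick_swap (const void *base, size_t size)
{
  if (is_aligned (base, size, 8))
    return "SWAP_WORDS_64";
  else if (is_aligned (base, size, 4))
    return "SWAP_WORDS_32";
  else
    return "SWAP_BYTES";
}

int
main (void)
{
  uint64_t u64s[4];                           /* 8-byte, 8-byte aligned.  */
  uint32_t u32s[4];                           /* 4-byte elements.  */
  struct rgb { unsigned char c[3]; } pix[4];  /* 3-byte elements.  */

  printf ("uint64_t:      %s\n", pick_swap (u64s, sizeof u64s[0]));
  printf ("uint32_t:      %s\n", pick_swap (u32s, sizeof u32s[0]));
  printf ("3-byte struct: %s\n", pick_swap (pix, sizeof pix[0]));
  /* Expected on x86_64: SWAP_WORDS_64, SWAP_WORDS_32, SWAP_BYTES.  */
  return 0;
}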