| Message ID | 20230425214554.3624392-1-goldstein.w.n@gmail.com |
|---|---|
| State | New |
| Series | [v4] x86: Increase `non_temporal_threshold` to roughly `sizeof_L3 / 2` |
On Tue, Apr 25, 2023 at 2:45 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> The current `non_temporal_threshold` is set to roughly '3/4 * sizeof_L3 /
> ncores_per_socket'. This patch updates that value to roughly
> 'sizeof_L3 / 2'.
>
> The original value (specifically dividing by `ncores_per_socket`) was
> done to limit the amount of other threads' data a `memcpy`/`memset`
> could evict.
>
> Dividing by 'ncores_per_socket', however, leads to exceedingly low
> non-temporal thresholds and to using non-temporal stores in cases
> where REP MOVSB is multiple times faster.
>
> Furthermore, non-temporal stores are written directly to main memory,
> so using them at a size much smaller than L3 can place soon-to-be
> accessed data much further away than it otherwise could be. As well,
> modern machines are able to detect streaming patterns (especially if
> REP MOVSB is used) and provide LRU hints to the memory subsystem. This
> in effect caps the total amount of eviction at 1/cache_associativity,
> far below meaningfully thrashing the entire cache.
>
> As best I can tell, the benchmarks that led to this small threshold
> were done comparing non-temporal stores versus standard cacheable
> stores. A better comparison (linked below) is to REP MOVSB which,
> on the measured systems, is nearly 2x faster than non-temporal stores
> at the low end of the previous threshold, and within 10% for over
> 100MB copies (well past even the current threshold). In cases with a
> low number of threads competing for bandwidth, REP MOVSB is ~2x faster
> up to `sizeof_L3`.
>
> Benchmarks comparing non-temporal stores, REP MOVSB, and cacheable
> stores were done using:
> https://github.com/goldsteinn/memcpy-nt-benchmarks
>
> Sheets results (also available in pdf on the github):
> https://docs.google.com/spreadsheets/d/e/2PACX-1vS183r0rW_jRX6tG_E90m9qVuFiMbRIJvi5VAE8yYOvEOIEEc3aSNuEsrFbuXw5c3nGboxMmrupZD7K/pubhtml
> ---
>  sysdeps/x86/dl-cacheinfo.h | 70 +++++++++++++++++++++++---------------
>  1 file changed, 43 insertions(+), 27 deletions(-)
>
> [full diff trimmed here; it is reproduced at the bottom of this page]

LGTM.

Thanks.
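For readers unfamiliar with the two strategies being compared above, here is a minimal, illustrative sketch of a non-temporal copy versus a REP MOVSB copy. This is not glibc's implementation; it assumes x86-64 with AVX2, a 32-byte-aligned destination for the streaming stores, and a size that is a multiple of 32.

```c
/* Illustrative only -- not glibc's memmove.  Assumes x86-64, AVX2,
   a 32-byte-aligned dst, and n a multiple of 32.  */
#include <immintrin.h>
#include <stddef.h>

/* Non-temporal copy: the stores bypass the cache hierarchy and go
   straight to main memory, which is why the threshold choice matters.  */
static void
copy_nt (void *dst, const void *src, size_t n)
{
  __m256i *d = (__m256i *) dst;
  const __m256i *s = (const __m256i *) src;
  for (size_t i = 0; i < n / 32; i++)
    _mm256_stream_si256 (d + i, _mm256_loadu_si256 (s + i));
  _mm_sfence ();  /* order the NT stores before subsequent stores */
}

/* REP MOVSB copy: on ERMS hardware the microcode streams through the
   cache and can supply the LRU hints discussed in the commit message.  */
static void
copy_rep_movsb (void *dst, const void *src, size_t n)
{
  __asm__ volatile ("rep movsb"
                    : "+D" (dst), "+S" (src), "+c" (n)
                    :: "memory");
}
```

Compile with `-mavx2`; the threshold under discussion is the copy size at which an implementation should switch from the second strategy to the first.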
On Wed, Apr 26, 2023 at 10:59 AM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Tue, Apr 25, 2023 at 2:45 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > [full patch quoted; trimmed]
>
> LGTM.
>
> Thanks.

Thanks.

I'm currently running some benchmarks on Broadwell and Carlos is
reproducing independently (on ICX I think), so will wait to push until
all that has come to fruition.

> --
> H.J.
On Wed, Apr 26, 2023 at 12:15 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Wed, Apr 26, 2023 at 10:59 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Tue, Apr 25, 2023 at 2:45 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > >
> > > [full patch quoted; trimmed]
> >
> > LGTM.
> >
> > Thanks.
>
> Thanks.
>
> I'm currently running some benchmarks on Broadwell and Carlos is
> reproducing independently (on ICX I think), so will wait to push until
> all that has come to fruition.
>
> > --
> > H.J.

Carlos, I benchmarked on BWD:
https://docs.google.com/spreadsheets/d/1kfXonk4LAZXBySuPnfDenrTizZ52IN0vvLIt3k1ex9c/edit?usp=sharing
or
https://github.com/goldsteinn/memcpy-nt-benchmarks/blob/master/results-bwd-pdf/bwd-memcpy-0--standard.pdf

On BWD, unlike SKX/ICX, non-temporal stores perform better than REP MOVSB
and standard stores.
Somewhat counter-intuitively, the results are most pronounced in the
single-threaded case.

At roughly the 4MB mark, non-temporal stores become by far the best,
basically regardless of the number of threads. The machine I tested on
had 35MB of cache and 28 threads per socket, so our current threshold is
~1MB, which is still too low. But the proposal in this patch of L3 / 2 is
too high (~16MB in this case).

At the current threshold, in the multithreaded case, between ~[1MB, 4MB)
non-temporal stores are 60-110% SLOWER than ERMS. OTOH, between
[4MB, 16MB] non-temporal stores are about 10-30% faster.

I think we still have a net benefit from this patch, but maybe we want
to tune the exact percentage by CPU arch? HJ, what do you think about
that? SKX and newer -> L3 / 2. BWD and older -> L3 / 8. Then we can add
additional cases for different machines as benchmarks indicate.
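Plugging the numbers from this message into the competing formulas makes the disagreement concrete. A small sketch: the 35MB / 28-thread figures are the BWD machine described above, and the divisors are the ones under discussion.

```c
/* Worked numbers for the BWD machine described above: 35MB shared L3,
   28 threads per socket.  */
#include <stdio.h>

int
main (void)
{
  unsigned long shared = 35UL * 1024 * 1024;  /* sizeof_L3 */
  unsigned long per_thread = shared / 28;     /* ~1.25MB per thread */

  unsigned long old_threshold = per_thread * 3 / 4;  /* ~0.94MB: too low on BWD */
  unsigned long v4_threshold  = shared / 2;          /* ~17.5MB: too high on BWD */
  unsigned long bwd_proposal  = shared / 8;          /* ~4.4MB: near the observed crossover */

  printf ("old=%lu v4=%lu bwd=%lu\n",
          old_threshold, v4_threshold, bwd_proposal);
  return 0;
}
```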
On Wed, May 3, 2023 at 8:28 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> [earlier discussion and full patch quoted; trimmed]
>
> I think we still have a net benefit from this patch, but maybe we want
> to tune the exact percentage by CPU arch? HJ what do you think about
> that? SKX and newer -> L3 / 2. BWD and older -> L3 / 8.

This sounds good to me.
On Fri, May 5, 2023 at 1:07 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> [earlier discussion and full patch quoted; trimmed]
>
> > I think we still have a net benefit from this patch, but maybe we want
> > to tune the exact percentage by CPU arch? HJ what do you think about
> > that? SKX and newer -> L3 / 2. BWD and older -> L3 / 8.
>
> This sounds good to me.
>
> > Then we can add additional cases for different machines as benchmarks
> > indicate.
>
> Thanks.

Posted v5 to do this (in new series as title changes).

> --
> H.J.
diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
index ec88945b39..4f1fd419f8 100644
--- a/sysdeps/x86/dl-cacheinfo.h
+++ b/sysdeps/x86/dl-cacheinfo.h
@@ -407,7 +407,7 @@ handle_zhaoxin (int name)
 }
 
 static void
-get_common_cache_info (long int *shared_ptr, unsigned int *threads_ptr,
+get_common_cache_info (long int *shared_ptr, long int * shared_per_thread_ptr, unsigned int *threads_ptr,
                        long int core)
 {
   unsigned int eax;
@@ -426,6 +426,7 @@ get_common_cache_info (long int *shared_ptr, unsigned int *threads_ptr,
   unsigned int family = cpu_features->basic.family;
   unsigned int model = cpu_features->basic.model;
   long int shared = *shared_ptr;
+  long int shared_per_thread = *shared_per_thread_ptr;
   unsigned int threads = *threads_ptr;
   bool inclusive_cache = true;
   bool support_count_mask = true;
@@ -441,6 +442,7 @@ get_common_cache_info (long int *shared_ptr, unsigned int *threads_ptr,
       /* Try L2 otherwise.  */
       level = 2;
       shared = core;
+      shared_per_thread = core;
       threads_l2 = 0;
       threads_l3 = -1;
     }
@@ -597,29 +599,28 @@ get_common_cache_info (long int *shared_ptr, unsigned int *threads_ptr,
         }
       else
         {
-intel_bug_no_cache_info:
-          /* Assume that all logical threads share the highest cache
-             level.  */
-          threads
-            = ((cpu_features->features[CPUID_INDEX_1].cpuid.ebx >> 16)
-               & 0xff);
-        }
-
-      /* Cap usage of highest cache level to the number of supported
-         threads.  */
-      if (shared > 0 && threads > 0)
-        shared /= threads;
+       intel_bug_no_cache_info:
+          /* Assume that all logical threads share the highest cache
+             level.  */
+          threads = ((cpu_features->features[CPUID_INDEX_1].cpuid.ebx >> 16)
+                     & 0xff);
+
+          /* Get per-thread size of highest level cache.  */
+          if (shared_per_thread > 0 && threads > 0)
+            shared_per_thread /= threads;
+        }
     }
 
   /* Account for non-inclusive L2 and L3 caches.  */
   if (!inclusive_cache)
     {
       if (threads_l2 > 0)
-        core /= threads_l2;
+        shared_per_thread += core / threads_l2;
       shared += core;
     }
 
   *shared_ptr = shared;
+  *shared_per_thread_ptr = shared_per_thread;
   *threads_ptr = threads;
 }
 
@@ -629,6 +630,7 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
   /* Find out what brand of processor.  */
   long int data = -1;
   long int shared = -1;
+  long int shared_per_thread = -1;
   long int core = -1;
   unsigned int threads = 0;
   unsigned long int level1_icache_size = -1;
@@ -649,6 +651,7 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
       data = handle_intel (_SC_LEVEL1_DCACHE_SIZE, cpu_features);
       core = handle_intel (_SC_LEVEL2_CACHE_SIZE, cpu_features);
       shared = handle_intel (_SC_LEVEL3_CACHE_SIZE, cpu_features);
+      shared_per_thread = shared;
 
       level1_icache_size
         = handle_intel (_SC_LEVEL1_ICACHE_SIZE, cpu_features);
@@ -672,13 +675,14 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
       level4_cache_size
         = handle_intel (_SC_LEVEL4_CACHE_SIZE, cpu_features);
 
-      get_common_cache_info (&shared, &threads, core);
+      get_common_cache_info (&shared, &shared_per_thread, &threads, core);
     }
   else if (cpu_features->basic.kind == arch_kind_zhaoxin)
     {
       data = handle_zhaoxin (_SC_LEVEL1_DCACHE_SIZE);
       core = handle_zhaoxin (_SC_LEVEL2_CACHE_SIZE);
       shared = handle_zhaoxin (_SC_LEVEL3_CACHE_SIZE);
+      shared_per_thread = shared;
 
       level1_icache_size = handle_zhaoxin (_SC_LEVEL1_ICACHE_SIZE);
       level1_icache_linesize = handle_zhaoxin (_SC_LEVEL1_ICACHE_LINESIZE);
@@ -692,13 +696,14 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
       level3_cache_assoc = handle_zhaoxin (_SC_LEVEL3_CACHE_ASSOC);
       level3_cache_linesize = handle_zhaoxin (_SC_LEVEL3_CACHE_LINESIZE);
 
-      get_common_cache_info (&shared, &threads, core);
+      get_common_cache_info (&shared, &shared_per_thread, &threads, core);
     }
   else if (cpu_features->basic.kind == arch_kind_amd)
     {
       data = handle_amd (_SC_LEVEL1_DCACHE_SIZE);
       core = handle_amd (_SC_LEVEL2_CACHE_SIZE);
       shared = handle_amd (_SC_LEVEL3_CACHE_SIZE);
+      shared_per_thread = shared;
 
       level1_icache_size = handle_amd (_SC_LEVEL1_ICACHE_SIZE);
       level1_icache_linesize = handle_amd (_SC_LEVEL1_ICACHE_LINESIZE);
@@ -715,6 +720,9 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
       if (shared <= 0)
         /* No shared L3 cache.  All we have is the L2 cache.  */
         shared = core;
+
+      if (shared_per_thread <= 0)
+        shared_per_thread = shared;
     }
 
   cpu_features->level1_icache_size = level1_icache_size;
@@ -730,17 +738,25 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
   cpu_features->level3_cache_linesize = level3_cache_linesize;
   cpu_features->level4_cache_size = level4_cache_size;
 
-  /* The default setting for the non_temporal threshold is 3/4 of one
-     thread's share of the chip's cache. For most Intel and AMD processors
-     with an initial release date between 2017 and 2020, a thread's typical
-     share of the cache is from 500 KBytes to 2 MBytes. Using the 3/4
-     threshold leaves 125 KBytes to 500 KBytes of the thread's data
-     in cache after a maximum temporal copy, which will maintain
-     in cache a reasonable portion of the thread's stack and other
-     active data. If the threshold is set higher than one thread's
-     share of the cache, it has a substantial risk of negatively
-     impacting the performance of other threads running on the chip. */
-  unsigned long int non_temporal_threshold = shared * 3 / 4;
+  /* The default setting for the non_temporal threshold is 1/2 of the size
+     of the chip's cache. For most Intel and AMD processors with an
+     initial release date between 2017 and 2023, a thread's typical
+     share of the cache is from 18-64MB. Using 1/2 of the L3 is meant to
+     estimate the point where non-temporal stores begin outcompeting
+     REP MOVSB, as well as the point where the forcing of non-temporal
+     stores back to main memory would have already occurred for the
+     majority of the lines in the copy. Note, concerns about the
+     entire L3 cache being evicted by the copy are mostly alleviated
+     by the fact that modern HW detects streaming patterns and
+     provides proper LRU hints so that the maximum thrashing is
+     capped at 1/associativity. */
+  unsigned long int non_temporal_threshold = shared / 2;
+  /* If no ERMS, we use the per-thread L3 chunking. Normal cacheable stores run
+     a higher risk of actually thrashing the cache as they don't have a HW LRU
+     hint. As well, their performance in highly parallel situations is
+     noticeably worse. */
+  if (!CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+    non_temporal_threshold = shared_per_thread * 3 / 4;
   /* SIZE_MAX >> 4 because memmove-vec-unaligned-erms right-shifts the value of
      'x86_non_temporal_threshold' by `LOG_4X_MEMCPY_THRESH` (4) and it is best
      if that operation cannot overflow. Minimum of 0x4040 (16448) because the
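The diff is truncated just as the comment turns to clamping the computed value. Going only by that comment, here is a hedged sketch of the clamp it describes; the function and variable names are illustrative, not glibc's, and only the two bounds (0x4040 and SIZE_MAX >> 4) come from the quoted text.

```c
/* Illustrative clamp per the truncated comment above: keep the
   threshold at least 0x4040 (16448) bytes and no larger than
   SIZE_MAX >> 4, so the 4-bit shift in memmove-vec-unaligned-erms
   stays safe.  Names are made up; see dl-cacheinfo.h for the real
   code.  */
#include <stdint.h>

static unsigned long int
clamp_nt_threshold (unsigned long int t)
{
  const unsigned long int min_t = 0x4040;        /* 16448 bytes */
  const unsigned long int max_t = SIZE_MAX >> 4; /* shift-safe upper bound */
  if (t < min_t)
    return min_t;
  if (t > max_t)
    return max_t;
  return t;
}
```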