@@ -637,6 +637,7 @@ init_cpu_features (struct cpu_features *cpu_features)
unsigned int stepping = 0;
enum cpu_features_kind kind;
+ cpu_features->cachesize_non_temporal_divisor = 4;
#if !HAS_CPUID
if (__get_cpuid_max (0, 0) == 0)
{
@@ -720,6 +721,8 @@ init_cpu_features (struct cpu_features *cpu_features)
break;
case INTEL_BIGCORE_NEHALEM:
case INTEL_BIGCORE_WESTMERE:
+ /* Older CPUs prefer non-temporal stores at lower threshold. */
+ cpu_features->cachesize_non_temporal_divisor = 8;
/* Rep string instructions, unaligned load, unaligned copy,
and pminub are fast on Intel Core i3, i5 and i7. */
cpu_features->preferred[index_arch_Fast_Rep_String]
@@ -728,11 +731,12 @@ init_cpu_features (struct cpu_features *cpu_features)
| bit_arch_Prefer_PMINUB_for_stringop);
break;
- /* Untuned Bigcore microarch. */
case INTEL_BIGCORE_SANDYBRIDGE:
case INTEL_BIGCORE_IVYBRIDGE:
case INTEL_BIGCORE_HASWELL:
case INTEL_BIGCORE_BROADWELL:
+ cpu_features->cachesize_non_temporal_divisor = 8;
+ break;
case INTEL_BIGCORE_SKYLAKE:
case INTEL_BIGCORE_AMBERLAKE:
case INTEL_BIGCORE_COFFEELAKE:
@@ -753,11 +757,10 @@ init_cpu_features (struct cpu_features *cpu_features)
case INTEL_BIGCORE_SAPPHIRERAPIDS:
case INTEL_BIGCORE_EMERALDRAPIDS:
case INTEL_BIGCORE_GRANITERAPIDS:
- break;
-
- /* Untuned Mixed (bigcore + atom SOC). */
+ /* Mixed (bigcore + atom SOC); the newer bigcore cases above fall through. */
case INTEL_MIXED_LAKEFIELD:
case INTEL_MIXED_ALDERLAKE:
+ cpu_features->cachesize_non_temporal_divisor = 2;
break;
}
@@ -738,19 +738,25 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
cpu_features->level3_cache_linesize = level3_cache_linesize;
cpu_features->level4_cache_size = level4_cache_size;
- /* The default setting for the non_temporal threshold is 1/4 of size
- of the chip's cache. For most Intel and AMD processors with an
- initial release date between 2017 and 2023, a thread's typical
- share of the cache is from 18-64MB. Using the 1/4 L3 is meant to
- estimate the point where non-temporal stores begin outcompeting
- REP MOVSB. As well the point where the fact that non-temporal
- stores are forced back to main memory would already occurred to the
- majority of the lines in the copy. Note, concerns about the
- entire L3 cache being evicted by the copy are mostly alleviated
- by the fact that modern HW detects streaming patterns and
- provides proper LRU hints so that the maximum thrashing
- capped at 1/associativity. */
- unsigned long int non_temporal_threshold = shared / 4;
+ unsigned long int cachesize_non_temporal_divisor
+ = cpu_features->cachesize_non_temporal_divisor;
+ if (cachesize_non_temporal_divisor <= 0)
+ cachesize_non_temporal_divisor = 4;
+
+ /* The default setting for the non_temporal threshold is [1/2, 1/8] of size
+ of the chip's cache (depending on `cachesize_non_temporal_divisor` which
+ is microarch specific. The default is 1/4). For most Intel and AMD
+ processors with an initial release date between 2017 and 2023, a thread's
+ typical share of the cache is from 18-64MB. Using a reasonable size
+ fraction of L3 is meant to estimate the point where non-temporal stores
+ begin outcompeting REP MOVSB. It is also the point where the fact that
+ non-temporal stores are forced back to main memory would already have occurred
+ to the majority of the lines in the copy. Note, concerns about the entire
+ L3 cache being evicted by the copy are mostly alleviated by the fact that
+ modern HW detects streaming patterns and provides proper LRU hints so that
+ the maximum thrashing is capped at 1/associativity. */
+ unsigned long int non_temporal_threshold
+ = shared / cachesize_non_temporal_divisor;
/* If no ERMS, we use the per-thread L3 chunking. Normal cacheable stores run
a higher risk of actually thrashing the cache as they don't have a HW LRU
hint. As well, there performance in highly parallel situations is
@@ -915,6 +915,9 @@ struct cpu_features
unsigned long int shared_cache_size;
/* Threshold to use non temporal store. */
unsigned long int non_temporal_threshold;
+ /* When no user non_temporal_threshold is specified, we default to
+ cachesize / cachesize_non_temporal_divisor. */
+ unsigned long int cachesize_non_temporal_divisor;
/* Threshold to use "rep movsb". */
unsigned long int rep_movsb_threshold;
/* Threshold to stop using "rep movsb". */