@@ -8660,6 +8660,8 @@ ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
if (TARGET_AVX256_SPLIT_REGS && GET_MODE_BITSIZE (move_mode) > 128)
move_mode = TImode;
+ if (TARGET_AVX512_SPLIT_REGS && GET_MODE_BITSIZE (move_mode) > 256)
+ move_mode = OImode;
/* Find the corresponding vector mode with the same size as MOVE_MODE.
MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
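/* Sketch (annotation, not part of the patch): how the two split-regs checks
   above clamp the widest move mode.  Mode names stand in for GCC's machine
   modes by bit size; a znver4-like target sets only the AVX512 split flag,
   so a 512-bit move mode is narrowed to 256 bits (OImode), not further.  */
#include <stdio.h>

enum mode_bits { SImode = 32, DImode = 64, TImode = 128,
                 OImode = 256, XImode = 512 };

static enum mode_bits
clamp_move_mode (enum mode_bits move_mode,
                 int avx256_split_regs, int avx512_split_regs)
{
  if (avx256_split_regs && move_mode > 128)
    move_mode = TImode;   /* 256-bit ops execute as two 128-bit halves.  */
  if (avx512_split_regs && move_mode > 256)
    move_mode = OImode;   /* 512-bit ops execute as two 256-bit halves.  */
  return move_mode;
}

int
main (void)
{
  printf ("%d\n", clamp_move_mode (XImode, 0, 1));   /* prints 256 */
  return 0;
}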
@@ -2980,6 +2980,8 @@ ix86_option_override_internal (bool main_args_p,
}
- if (ix86_tune_features [X86_TUNE_AVOID_512FMA_CHAINS])
+ if (ix86_tune_features [X86_TUNE_AVOID_512FMA_CHAINS])
+ SET_OPTION_IF_UNSET (opts, opts_set, param_avoid_fma_max_bits, 512);
+ else if (ix86_tune_features [X86_TUNE_AVOID_256FMA_CHAINS])
SET_OPTION_IF_UNSET (opts, opts_set, param_avoid_fma_max_bits, 256);
else if (ix86_tune_features [X86_TUNE_AVOID_128FMA_CHAINS])
SET_OPTION_IF_UNSET (opts, opts_set, param_avoid_fma_max_bits, 128);
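/* Sketch (annotation, not part of the patch): the cascade above picks the
   widest applicable "avoid FMA chains" tune flag when defaulting
   param_avoid_fma_max_bits, checking 512 before 256 before 128.  */
#include <stdio.h>

static int
avoid_fma_max_bits (int avoid512, int avoid256, int avoid128)
{
  if (avoid512)
    return 512;
  else if (avoid256)
    return 256;
  else if (avoid128)
    return 128;
  return 0;   /* parameter left unset */
}

int
main (void)
{
  /* A znver4-like target sets both the 256- and 512-bit flags; 512 wins.  */
  printf ("%d\n", avoid_fma_max_bits (1, 1, 0));   /* prints 512 */
  return 0;
}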
@@ -20363,10 +20363,13 @@ ix86_vec_cost (machine_mode mode, int cost)
if (GET_MODE_BITSIZE (mode) == 128
&& TARGET_SSE_SPLIT_REGS)
- return cost * 2;
- if (GET_MODE_BITSIZE (mode) > 128
+ return cost * GET_MODE_BITSIZE (mode) / 64;
+ else if (GET_MODE_BITSIZE (mode) > 128
&& TARGET_AVX256_SPLIT_REGS)
return cost * GET_MODE_BITSIZE (mode) / 128;
+ else if (GET_MODE_BITSIZE (mode) > 256
+ && TARGET_AVX512_SPLIT_REGS)
+ return cost * GET_MODE_BITSIZE (mode) / 256;
return cost;
}
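/* Worked example (annotation, not part of the patch): on a target with
   AVX512_SPLIT_REGS set and AVX256_SPLIT_REGS clear, a 512-bit op issues
   as two 256-bit halves, so its cost doubles: cost * 512 / 256.  */
#include <stdio.h>

static int
vec_cost (int bitsize, int cost,
          int sse_split, int avx256_split, int avx512_split)
{
  if (bitsize == 128 && sse_split)
    return cost * bitsize / 64;
  else if (bitsize > 128 && avx256_split)
    return cost * bitsize / 128;
  else if (bitsize > 256 && avx512_split)
    return cost * bitsize / 256;
  return cost;
}

int
main (void)
{
  printf ("%d\n", vec_cost (512, 3, 0, 0, 1));   /* prints 6 */
  return 0;
}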
@@ -23090,7 +23093,9 @@ ix86_reassociation_width (unsigned int op, machine_mode mode)
return 1;
/* Account for targets that split wide vectors into multiple parts. */
- if (TARGET_AVX256_SPLIT_REGS && GET_MODE_BITSIZE (mode) > 128)
+ if (TARGET_AVX512_SPLIT_REGS && GET_MODE_BITSIZE (mode) > 256)
+ div = GET_MODE_BITSIZE (mode) / 256;
+ else if (TARGET_AVX256_SPLIT_REGS && GET_MODE_BITSIZE (mode) > 128)
div = GET_MODE_BITSIZE (mode) / 128;
else if (TARGET_SSE_SPLIT_REGS && GET_MODE_BITSIZE (mode) > 64)
div = GET_MODE_BITSIZE (mode) / 64;
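/* Worked example (annotation, not part of the patch): a 512-bit mode on a
   target that splits at 256 bits occupies two execution slots, so the
   useful reassociation width shrinks accordingly.  The width/div step is
   a simplified stand-in for what the surrounding function computes.  */
#include <stdio.h>

static int
reassoc_width (int width, int bitsize,
               int avx512_split, int avx256_split, int sse_split)
{
  int div = 1;
  if (avx512_split && bitsize > 256)
    div = bitsize / 256;
  else if (avx256_split && bitsize > 128)
    div = bitsize / 128;
  else if (sse_split && bitsize > 64)
    div = bitsize / 64;
  return width > div ? width / div : 1;
}

int
main (void)
{
  /* 4 requested chains, 512-bit vectors, split at 256 bits: width 2.  */
  printf ("%d\n", reassoc_width (4, 512, 1, 0, 0));   /* prints 2 */
  return 0;
}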
@@ -419,6 +419,8 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST];
ix86_tune_features[X86_TUNE_SOFTWARE_PREFETCHING_BENEFICIAL]
#define TARGET_AVX256_SPLIT_REGS \
ix86_tune_features[X86_TUNE_AVX256_SPLIT_REGS]
+#define TARGET_AVX512_SPLIT_REGS \
+ ix86_tune_features[X86_TUNE_AVX512_SPLIT_REGS]
#define TARGET_GENERAL_REGS_SSE_SPILL \
ix86_tune_features[X86_TUNE_GENERAL_REGS_SSE_SPILL]
#define TARGET_AVOID_MEM_OPND_FOR_CMOVE \
@@ -481,12 +481,12 @@ DEF_TUNE (X86_TUNE_AVOID_4BYTE_PREFIXES, "avoid_4byte_prefixes",
/* X86_TUNE_USE_GATHER_2PARTS: Use gather instructions for vectors with 2
elements. */
DEF_TUNE (X86_TUNE_USE_GATHER_2PARTS, "use_gather_2parts",
- ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER3 | m_ALDERLAKE | m_CORE_ATOM | m_GENERIC))
+ ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER3 | m_ZNVER4 | m_ALDERLAKE | m_CORE_ATOM | m_GENERIC))
/* X86_TUNE_USE_GATHER_4PARTS: Use gather instructions for vectors with 4
elements. */
DEF_TUNE (X86_TUNE_USE_GATHER_4PARTS, "use_gather_4parts",
- ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER3 | m_ALDERLAKE | m_CORE_ATOM | m_GENERIC))
+ ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER3 | m_ZNVER4 | m_ALDERLAKE | m_CORE_ATOM | m_GENERIC))
/* X86_TUNE_USE_GATHER: Use gather instructions for vectors with 8 or more
elements. */
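/* Illustration (annotation, not part of the patch): an indexed load like
   the one below is a gather candidate.  Clearing use_gather_2parts /
   use_gather_4parts for a CPU makes the vectorizer emit discrete scalar
   loads plus shuffles for small element counts instead of vgather*.  */
double
sum_indexed (const double *a, const int *idx, int n)
{
  double s = 0.0;
  for (int i = 0; i < n; i++)
    s += a[idx[i]];   /* a[idx[i]] is the gather */
  return s;
}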
@@ -499,7 +499,11 @@ DEF_TUNE (X86_TUNE_AVOID_128FMA_CHAINS, "avoid_fma_chains", m_ZNVER)
/* X86_TUNE_AVOID_256FMA_CHAINS: Avoid creating loops with tight 256bit or
smaller FMA chain. */
-DEF_TUNE (X86_TUNE_AVOID_256FMA_CHAINS, "avoid_fma256_chains", m_ZNVER2 | m_ZNVER3)
+DEF_TUNE (X86_TUNE_AVOID_256FMA_CHAINS, "avoid_fma256_chains", m_ZNVER2 | m_ZNVER3 | m_ZNVER4)
+
+/* X86_TUNE_AVOID_512FMA_CHAINS: Avoid creating loops with tight 512bit or
+   smaller FMA chain. */
+DEF_TUNE (X86_TUNE_AVOID_512FMA_CHAINS, "avoid_fma512_chains", m_ZNVER4)
/* X86_TUNE_V2DF_REDUCTION_PREFER_PHADDPD: Prefer haddpd
for v2df vector reduction. */
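/* Illustration (annotation, not part of the patch): what the
   X86_TUNE_AVOID_512FMA_CHAINS flag added above asks for.  When
   param_avoid_fma_max_bits covers the vector width, the reassociation
   pass rewrites one long serial FMA dependency chain into parallel
   partial sums.  Hand-written equivalent of the transformed loop: */
#include <stddef.h>

double
dot (const double *a, const double *b, size_t n)
{
  double acc0 = 0.0, acc1 = 0.0;   /* two independent chains */
  size_t i;
  for (i = 0; i + 1 < n; i += 2)
    {
      acc0 += a[i] * b[i];
      acc1 += a[i + 1] * b[i + 1];
    }
  for (; i < n; i++)
    acc0 += a[i] * b[i];
  return acc0 + acc1;
}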
@@ -531,27 +535,30 @@ DEF_TUNE (X86_TUNE_AVX128_OPTIMAL, "avx128_optimal", m_BDVER | m_BTVER2
/* X86_TUNE_AVX256_OPTIMAL: Use 256-bit AVX instructions instead of 512-bit AVX
instructions in the auto-vectorizer. */
-DEF_TUNE (X86_TUNE_AVX256_OPTIMAL, "avx256_optimal", m_CORE_AVX512)
+DEF_TUNE (X86_TUNE_AVX256_OPTIMAL, "avx256_optimal", m_CORE_AVX512 | m_ZNVER4)
+
+/* X86_TUNE_AVX512_SPLIT_REGS: if true, AVX512 ops are split into two AVX256 ops. */
+DEF_TUNE (X86_TUNE_AVX512_SPLIT_REGS, "avx512_split_regs", m_ZNVER4)
/* X86_TUNE_AVX256_MOVE_BY_PIECES: Optimize move_by_pieces with 256-bit
AVX instructions. */
DEF_TUNE (X86_TUNE_AVX256_MOVE_BY_PIECES, "avx256_move_by_pieces",
- m_ALDERLAKE | m_CORE_AVX2)
+ m_ALDERLAKE | m_CORE_AVX2 | m_ZNVER1 | m_ZNVER2 | m_ZNVER3)
/* X86_TUNE_AVX256_STORE_BY_PIECES: Optimize store_by_pieces with 256-bit
AVX instructions. */
DEF_TUNE (X86_TUNE_AVX256_STORE_BY_PIECES, "avx256_store_by_pieces",
- m_ALDERLAKE | m_CORE_AVX2)
+ m_ALDERLAKE | m_CORE_AVX2 | m_ZNVER1 | m_ZNVER2 | m_ZNVER3)
/* X86_TUNE_AVX512_MOVE_BY_PIECES: Optimize move_by_pieces with 512-bit
AVX instructions. */
DEF_TUNE (X86_TUNE_AVX512_MOVE_BY_PIECES, "avx512_move_by_pieces",
- m_SAPPHIRERAPIDS)
+ m_SAPPHIRERAPIDS | m_ZNVER4)
/* X86_TUNE_AVX512_STORE_BY_PIECES: Optimize store_by_pieces with 512-bit
AVX instructions. */
DEF_TUNE (X86_TUNE_AVX512_STORE_BY_PIECES, "avx512_store_by_pieces",
- m_SAPPHIRERAPIDS)
+ m_SAPPHIRERAPIDS | m_ZNVER4)
/*****************************************************************************/
/*****************************************************************************/
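/* Illustration (annotation, not part of the patch): with 512-bit
   move_by_pieces enabled, a small fixed-size copy like this can expand
   inline to one 64-byte vector load and store instead of a libcall or a
   chain of narrower moves.  */
#include <string.h>

void
copy64 (char *dst, const char *src)
{
  memcpy (dst, src, 64);   /* candidate for a single zmm load + store */
}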