@@ -13648,7 +13648,7 @@ static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
/* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
with all elements equal to VAR. Return true if successful. */
-static bool
+bool
ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode,
rtx target, rtx val)
{
@@ -50,6 +50,9 @@ extern void ix86_reset_previous_fndecl (void);
extern bool ix86_using_red_zone (void);
+extern unsigned int ix86_minimum_incoming_stack_boundary (bool,
+ bool = false);
+
extern unsigned int ix86_regmode_natural_size (machine_mode);
#ifdef RTX_CODE
extern int standard_80387_constant_p (rtx);
@@ -257,6 +260,8 @@ extern void ix86_expand_mul_widen_hilo (rtx, rtx, rtx, bool, bool);
extern void ix86_expand_sse2_mulv4si3 (rtx, rtx, rtx);
extern void ix86_expand_sse2_mulvxdi3 (rtx, rtx, rtx);
extern void ix86_expand_sse2_abs (rtx, rtx);
+extern bool ix86_expand_vector_init_duplicate (bool, machine_mode, rtx,
+ rtx);
/* In i386-c.c */
extern void ix86_target_macros (void);
@@ -415,7 +415,6 @@ static unsigned int split_stack_prologue_scratch_regno (void);
static bool i386_asm_output_addr_const_extra (FILE *, rtx);
static bool ix86_can_inline_p (tree, tree);
-static unsigned int ix86_minimum_incoming_stack_boundary (bool);
/* Whether -mtune= or -march= were specified */
@@ -7232,8 +7231,9 @@ find_drap_reg (void)
/* Return minimum incoming stack alignment. */
-static unsigned int
-ix86_minimum_incoming_stack_boundary (bool sibcall)
+unsigned int
+ix86_minimum_incoming_stack_boundary (bool sibcall,
+ bool ignore_estimated)
{
unsigned int incoming_stack_boundary;
@@ -7248,7 +7248,8 @@ ix86_minimum_incoming_stack_boundary (bool sibcall)
estimated stack alignment is 128bit. */
else if (!sibcall
&& ix86_force_align_arg_pointer
- && crtl->stack_alignment_estimated == 128)
+ && (ignore_estimated
+ || crtl->stack_alignment_estimated == 128))
incoming_stack_boundary = MIN_STACK_BOUNDARY;
else
incoming_stack_boundary = ix86_default_incoming_stack_boundary;
@@ -23052,6 +23053,259 @@ ix86_optab_supported_p (int op, machine_mode mode1, machine_mode,
}
}
+/* Return the RTL for memset in MODE derived from the previous value
+ described by PREVP: its rtx is PREVP->data and its mode PREVP->mode.
+ Return that rtx unchanged when the modes already match, and nullptr
+ when the previous value is 1 byte or smaller and so cannot be
+ reused. Otherwise the previous value is narrowed to MODE with
+ simplify_gen_subreg, either directly or through intermediate TImode
+ and SImode pseudos; the latter paths emit move insns.
+ NOTE(review): presumably MODE is never wider than PREVP->mode; the
+ inner switches treat any other size as unreachable - confirm against
+ the caller.
+ NOTE(review): the 32-byte case has no "break" after its inner
+ switch, unlike the 8- and 16-byte cases; every arm of that inner
+ switch returns or is unreachable, so control never falls into the
+ 64-byte case. */
+
+static rtx
+ix86_gen_memset_value_from_prev (by_pieces_prev *prevp,
+ scalar_int_mode mode)
+{
+ rtx prev = prevp->data;
+
+ /* Use the previous data in the same mode. */
+ if (prevp->mode == mode)
+ return prev;
+
+ machine_mode prev_mode = prevp->mode;
+ size_t size = GET_MODE_SIZE (prev_mode);
+
+ /* NB: Skip if the previous value is 1 byte or smaller, since there
+ is nothing to narrow. CONST_WIDE_INT is in VOIDmode whose size
+ is 0. */
+ if (size <= 1)
+ return nullptr;
+
+ rtx reg, reg_ti;
+ switch (size)
+ {
+ default:
+ gcc_unreachable ();
+
+ case 2:
+ case 4:
+ return simplify_gen_subreg (mode, prev, prev_mode, 0);
+
+ case 8:
+ /* In 64-bit mode, use SUBREG since word size is 8 bytes. */
+ if (TARGET_64BIT)
+ return simplify_gen_subreg (mode, prev, prev_mode, 0);
+
+ switch (GET_MODE_SIZE (mode))
+ {
+ default:
+ gcc_unreachable ();
+ case 2:
+ case 4:
+do_hi_si_mode:
+ /* In 32-bit mode, extract the value from an 8-byte
+ register into an integer register first. This label is
+ also the target of the jump from the 16-byte case. */
+ reg = gen_reg_rtx (SImode);
+ emit_move_insn (reg,
+ simplify_gen_subreg (SImode, prev,
+ prev_mode, 0));
+ return simplify_gen_subreg (mode, reg, SImode, 0);
+ }
+ break;
+
+ case 16:
+ switch (GET_MODE_SIZE (mode))
+ {
+ default:
+ gcc_unreachable ();
+ case 2:
+ case 4:
+ /* Extract the value from a 16-byte vector register into
+ an integer register first. */
+ goto do_hi_si_mode;
+ case 8:
+ return simplify_gen_subreg (mode, prev, prev_mode, 0);
+ case 16:
+ return prev;
+ }
+ break;
+
+ case 32:
+ switch (GET_MODE_SIZE (mode))
+ {
+ default:
+ gcc_unreachable ();
+ case 2:
+do_himode:
+ /* Extract the value from a 32-byte vector register into
+ a 16-byte vector register first. This label is also the
+ target of the jump from the 64-byte case. */
+ reg_ti = gen_reg_rtx (TImode);
+ emit_move_insn (reg_ti,
+ simplify_gen_subreg (TImode, prev,
+ prev_mode, 0));
+ /* Then extract the value from a 16-byte vector register
+ into an integer register. */
+ reg = gen_reg_rtx (SImode);
+ emit_move_insn (reg,
+ simplify_gen_subreg (SImode, reg_ti,
+ TImode, 0));
+ return simplify_gen_subreg (mode, reg, SImode, 0);
+
+ case 4:
+ case 8:
+do_si_di_mode:
+ /* Extract the value from a 32-byte vector register into
+ a 16-byte vector register first. This label is also the
+ target of the jump from the 64-byte case. */
+ reg_ti = gen_reg_rtx (TImode);
+ emit_move_insn (reg_ti,
+ simplify_gen_subreg (TImode, prev,
+ prev_mode, 0));
+ /* Generate 4/8-byte SSE -> INT move instruction. */
+ reg = gen_reg_rtx (mode);
+ emit_move_insn (reg,
+ simplify_gen_subreg (mode, reg_ti,
+ TImode, 0));
+ return reg;
+ case 16:
+ return simplify_gen_subreg (mode, prev, prev_mode, 0);
+ case 32:
+ return prev;
+ }
+
+ case 64:
+ switch (GET_MODE_SIZE (mode))
+ {
+ default:
+ gcc_unreachable ();
+ case 2:
+ /* Extract the value from a 64-byte vector register into
+ a 16-byte vector register first. */
+ goto do_himode;
+ case 4:
+ case 8:
+ /* Extract the value from a 64-byte vector register into
+ a 16-byte vector register first. */
+ goto do_si_di_mode;
+ case 16:
+ case 32:
+ return simplify_gen_subreg (mode, prev, prev_mode, 0);
+ case 64:
+ return prev;
+ }
+ }
+
+ return nullptr;
+}
+
+/* Implement the TARGET_GEN_MEMSET_VALUE hook. Return the value to
+ store for a piece of MODE given the byte value DATA. PREVP, when
+ non-null, is a by_pieces_prev whose data may be reused through
+ ix86_gen_memset_value_from_prev. A MODE no wider than DImode is
+ handed to default_gen_memset_value. A wider MODE gets DATA
+ duplicated into a QImode-element vector of the same size by
+ ix86_expand_vector_init_duplicate; that vector is built in the hard
+ register SCRATCH_SSE_REG rather than a new pseudo when the vector
+ mode's alignment exceeds the value returned by
+ ix86_minimum_incoming_stack_boundary (false, true). */
+
+static rtx
+ix86_gen_memset_value (rtx data, void *prevp, scalar_int_mode mode)
+{
+ /* Don't use the previous value if size is 1. */
+ if (GET_MODE_SIZE (mode) == 1)
+ return data;
+
+ by_pieces_prev *prev = (by_pieces_prev *) prevp;
+ if (prev != nullptr && prev->data != nullptr)
+ {
+ rtx value = ix86_gen_memset_value_from_prev (prev, mode);
+ if (value)
+ return value;
+ }
+
+ /* Use default_gen_memset_value when MODE is no wider than DImode,
+ since a vector store won't be used for it. */
+ if (GET_MODE_SIZE (mode) <= GET_MODE_SIZE (DImode))
+ return default_gen_memset_value (data, prevp, mode);
+
+ rtx one, target;
+ scalar_mode one_mode;
+
+ unsigned int incoming_stack_boundary
+ = ix86_minimum_incoming_stack_boundary (false, true);
+
+ switch (GET_MODE_SIZE (mode))
+ {
+ default:
+ gcc_unreachable ();
+
+ case 64:
+ if (!TARGET_AVX512BW)
+ {
+ rtx tmp;
+ /* Without AVX512BW, duplicate DATA into a V32QImode value
+ and concatenate it with itself to form the V64QImode
+ value.
+ NB: Use a scratch SSE register instead of a new pseudo so
+ that the stack alignment requirement isn't increased. */
+ if (GET_MODE_ALIGNMENT (V32QImode) > incoming_stack_boundary)
+ tmp = gen_rtx_REG (V32QImode, SCRATCH_SSE_REG);
+ else
+ tmp = gen_reg_rtx (V32QImode);
+ if (!ix86_expand_vector_init_duplicate (false, V32QImode,
+ tmp, data))
+ gcc_unreachable ();
+ target = gen_rtx_VEC_CONCAT (V64QImode, tmp, tmp);
+ if (REGNO (tmp) == SCRATCH_SSE_REG)
+ {
+ tmp = gen_rtx_REG (V64QImode, SCRATCH_SSE_REG);
+ emit_move_insn (tmp, target);
+ return gen_rtx_REG (mode, SCRATCH_SSE_REG);
+ }
+ else
+ return convert_to_mode (mode, target, 1);
+ }
+ /* FALLTHRU */
+ case 16:
+ case 32:
+ one_mode = QImode;
+ one = data;
+ break;
+ }
+
+ unsigned int nunits = GET_MODE_SIZE (mode) / GET_MODE_SIZE (one_mode);
+ machine_mode vector_mode;
+ if (!mode_for_vector (one_mode, nunits).exists (&vector_mode))
+ gcc_unreachable ();
+
+ /* NB: Use a scratch SSE register instead of a new pseudo so that
+ the stack alignment requirement isn't increased. */
+ if (GET_MODE_ALIGNMENT (vector_mode) > incoming_stack_boundary)
+ target = gen_rtx_REG (vector_mode, SCRATCH_SSE_REG);
+ else
+ target = gen_reg_rtx (vector_mode);
+ if (!ix86_expand_vector_init_duplicate (false, vector_mode, target,
+ one))
+ gcc_unreachable ();
+
+ if (REGNO (target) == SCRATCH_SSE_REG)
+ return gen_rtx_REG (mode, SCRATCH_SSE_REG);
+ else
+ return convert_to_mode (mode, target, 1);
+}
+
+/* Implement the TARGET_READ_MEMSET_VALUE hook.  STR is forwarded to
+   default_read_memset_value; PREVP, when non-null, is a by_pieces_prev
+   whose data may be reused for a piece of MODE.  */
+
+static rtx
+ix86_read_memset_value (const char *str, void *prevp,
+                        scalar_int_mode mode)
+{
+  by_pieces_prev *last = (by_pieces_prev *) prevp;
+
+  if (last != nullptr && last->data != nullptr)
+    {
+      /* A 1-byte piece never reuses the previous value; a wider piece
+         reuses it whenever it can be narrowed to MODE.  */
+      if (GET_MODE_SIZE (mode) != 1)
+        {
+          rtx reused = ix86_gen_memset_value_from_prev (last, mode);
+          if (reused)
+            return reused;
+        }
+
+      return default_read_memset_value (str, nullptr, mode);
+    }
+
+  /* Use default_read_memset_value if vector store can't be used.
+     NB: Need AVX2 for fast vector duplication and gen_reg_rtx.  */
+  bool wider_than_di = GET_MODE_SIZE (mode) > GET_MODE_SIZE (DImode);
+  if (!(wider_than_di && TARGET_AVX2 && reg_rtx_no))
+    return default_read_memset_value (str, nullptr, mode);
+
+  /* Read the value in QImode and let ix86_gen_memset_value build the
+     MODE-sized value from it.  */
+  rtx byte = default_read_memset_value (str, nullptr, QImode);
+  return ix86_gen_memset_value (byte, nullptr, mode);
+}
+
/* Address space support.
This is not "far pointers" in the 16-bit sense, but an easy way
@@ -23953,6 +24207,12 @@ static bool ix86_libc_has_fast_function (int fcode ATTRIBUTE_UNUSED)
#undef TARGET_LIBC_HAS_FAST_FUNCTION
#define TARGET_LIBC_HAS_FAST_FUNCTION ix86_libc_has_fast_function
+#undef TARGET_GEN_MEMSET_VALUE
+#define TARGET_GEN_MEMSET_VALUE ix86_gen_memset_value
+
+#undef TARGET_READ_MEMSET_VALUE
+#define TARGET_READ_MEMSET_VALUE ix86_read_memset_value
+
#if CHECKING_P
#undef TARGET_RUN_TARGET_SELFTESTS
#define TARGET_RUN_TARGET_SELFTESTS selftest::ix86_run_selftests
@@ -1131,6 +1131,10 @@ extern const char *host_detect_local_cpu (int argc, const char **argv);
#define FIRST_MASK_REG MASK0_REG
#define LAST_MASK_REG MASK7_REG
+/* A scratch vector reg: LAST_REX_SSE_REG when TARGET_64BIT, otherwise
+ LAST_SSE_REG. */
+#define SCRATCH_SSE_REG \
+ (TARGET_64BIT ? LAST_REX_SSE_REG : LAST_SSE_REG)
+
/* Override this in other tm.h files to cope with various OS lossage
requiring a frame pointer. */
#ifndef SUBTARGET_FRAME_POINTER_REQUIRED
new file mode 100644
@@ -0,0 +1,14 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -march=skylake-avx512" } */
+
+extern char *dst;
+
+/* Set 17 bytes at DST to the run-time value C. The directives below expect one vpbroadcastb from %edi, one vmovdqu %xmm store and one movb of %dil at offset 16. */
+void
+foo (int c)
+{
+ __builtin_memset (dst, c, 17);
+}
+
+/* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+%edi, %xmm\[0-9\]+" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]+%xmm\[0-9\]+, \\(%\[\^,\]+\\)" 1 } } */
+/* { dg-final { scan-assembler-times "movb\[\\t \]+%dil, 16\\(%\[\^,\]+\\)" 1 } } */
new file mode 100644
@@ -0,0 +1,14 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -march=skylake-avx512" } */
+
+extern char *dst;
+
+/* Set 17 bytes at DST to the constant -1. The directives below expect one vpcmpeqd, one vmovdqu %xmm store and one movb of $-1 at offset 16. */
+void
+foo (void)
+{
+ __builtin_memset (dst, -1, 17);
+}
+
+/* { dg-final { scan-assembler-times "vpcmpeqd" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]+%xmm\[0-9\]+, \\(%\[\^,\]+\\)" 1 } } */
+/* { dg-final { scan-assembler-times "movb\[\\t \]+\\\$-1, 16\\(%\[\^,\]+\\)" 1 } } */
new file mode 100644
@@ -0,0 +1,14 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -march=skylake-avx512" } */
+
+extern char *dst;
+
+/* Set 19 bytes at DST to the constant 12. The directives below expect one vpbroadcastb, one vmovdqu %xmm store and one vmovd store from an %xmm register at offset 15. */
+void
+foo (void)
+{
+ __builtin_memset (dst, 12, 19);
+}
+
+/* { dg-final { scan-assembler-times "vpbroadcastb" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]+%xmm\[0-9\]+, \\(%\[\^,\]+\\)" 1 } } */
+/* { dg-final { scan-assembler-times "vmovd\[\\t \]+%xmm\[0-9\]+, 15\\(%\[\^,\]+\\)" 1 } } */
new file mode 100644
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=skylake-avx512" } */
+
+extern char *dst;
+
+/* Set 9 bytes at DST to the constant 12. The directives below expect integer stores only: the immediates 868082074056920076 and 202116108 are 12 repeated in every byte (8 and 4 bytes), followed by a movb of $12 at offset 8. */
+void
+foo (void)
+{
+ __builtin_memset (dst, 12, 9);
+}
+
+/* { dg-final { scan-assembler-times "movabsq\[\\t \]+\\\$868082074056920076, %r" 1 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "movl\[\\t \]+\\\$202116108, \\(%\[\^,\]+\\)" 1 { target ia32 } } } */
+/* { dg-final { scan-assembler-times "movl\[\\t \]+\\\$202116108, 4\\(%\[\^,\]+\\)" 1 { target ia32 } } } */
+/* { dg-final { scan-assembler-times "movb\[\\t \]+\\\$12, 8\\(%\[\^,\]+\\)" 1 } } */
new file mode 100644
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=skylake" } */
+
+extern char *dst;
+
+/* Set 9 bytes at DST to the constant 12, compiled with -march=skylake. The directives below expect immediates with 12 repeated in every byte: one movabsq on non-ia32 targets, two movl stores at offsets 0 and 4 on ia32. */
+void
+foo (void)
+{
+ __builtin_memset (dst, 12, 9);
+}
+
+/* { dg-final { scan-assembler-times "movabsq\[\\t \]+\\\$868082074056920076, %r" 1 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "movl\[\\t \]+\\\$202116108, \\(%\[\^,\]+\\)" 1 { target ia32 } } } */
+/* { dg-final { scan-assembler-times "movl\[\\t \]+\\\$202116108, 4\\(%\[\^,\]+\\)" 1 { target ia32 } } } */