Message ID: 20240224031848.3866630-1-quic_apinski@quicinc.com
State: New
Series: [1/2] aarch64: Use fmov s/d/hN, FP_CST for some vector CST [PR113856]
Andrew Pinski <quic_apinski@quicinc.com> writes:
> Aarch64 has a way to form some floating-point CSTs via the fmov
> instructions; these instructions also zero out the upper parts of the
> registers, so they can be used for vector CSTs that have one non-zero
> constant that could be formed via fmov in the first element.
>
> This implements this "small" optimization so these vector CSTs don't
> need to be loaded from memory.
>
> Built and tested on aarch64-linux-gnu with no regressions.
>
> 	PR target/113856
>
> gcc/ChangeLog:
>
> 	* config/aarch64/aarch64.cc (struct simd_immediate_info):
> 	Add FMOV_SDH to insn_type.  For scalar_float_mode constructor
> 	add insn_in.
> 	(aarch64_simd_valid_immediate): Catch `{fp, 0...}` vector_cst
> 	and return a simd_immediate_info which uses FMOV_SDH.
> 	(aarch64_output_simd_mov_immediate): Support outputting
> 	fmov for FMOV_SDH.
>
> gcc/testsuite/ChangeLog:
>
> 	* gcc.target/aarch64/fmov-zero-cst-1.c: New test.
> 	* gcc.target/aarch64/fmov-zero-cst-2.c: New test.
>
> Signed-off-by: Andrew Pinski <quic_apinski@quicinc.com>
> ---
>  gcc/config/aarch64/aarch64.cc                 | 48 ++++++++++++++---
>  .../gcc.target/aarch64/fmov-zero-cst-1.c      | 52 +++++++++++++++++++
>  .../gcc.target/aarch64/fmov-zero-cst-2.c      | 19 +++++++
>  3 files changed, 111 insertions(+), 8 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/fmov-zero-cst-1.c
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/fmov-zero-cst-2.c
>
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index 5dd0814f198..c4386591a9b 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -126,11 +126,11 @@ constexpr auto AARCH64_STATE_OUT = 1U << 2;
>  /* Information about a legitimate vector immediate operand.  */
>  struct simd_immediate_info
>  {
> -  enum insn_type { MOV, MVN, INDEX, PTRUE };
> +  enum insn_type { MOV, FMOV_SDH, MVN, INDEX, PTRUE };
>    enum modifier_type { LSL, MSL };
>
>    simd_immediate_info () {}
> -  simd_immediate_info (scalar_float_mode, rtx);
> +  simd_immediate_info (scalar_float_mode, rtx, insn_type = MOV);
>    simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
>  		       insn_type = MOV, modifier_type = LSL,
>  		       unsigned int = 0);
> @@ -145,7 +145,7 @@ struct simd_immediate_info
>
>    union
>    {
> -    /* For MOV and MVN.  */
> +    /* For MOV, FMOV_SDH and MVN.  */
>      struct
>      {
>        /* The value of each element.  */
> @@ -173,9 +173,10 @@ struct simd_immediate_info
>  /* Construct a floating-point immediate in which each element has mode
>     ELT_MODE_IN and value VALUE_IN.  */
>  inline simd_immediate_info
> -::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
> -  : elt_mode (elt_mode_in), insn (MOV)
> +::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in, insn_type insn_in)

Nit: long line.

> +  : elt_mode (elt_mode_in), insn (insn_in)
>  {
> +  gcc_assert (insn_in == MOV || insn_in == FMOV_SDH);
>    u.mov.value = value_in;
>    u.mov.modifier = LSL;
>    u.mov.shift = 0;
> @@ -22932,6 +22933,35 @@ aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
>  	  return true;
>  	}
>      }
> +  /* See if we can use fmov d0/s0/h0 ... for the constant.  */
> +  if (n_elts >= 1

This condition seems unnecessary.  n_elts can't be zero.

> +      && (vec_flags & VEC_ADVSIMD)
> +      && is_a <scalar_float_mode> (elt_mode, &elt_float_mode)
> +      && !CONST_VECTOR_DUPLICATE_P (op))

I think we should also drop this.  I guess it's to undo:

  if (CONST_VECTOR_P (op)
      && CONST_VECTOR_DUPLICATE_P (op))
    n_elts = CONST_VECTOR_NPATTERNS (op);

but we can use GET_MODE_NUNITS (mode) directly instead.

> +    {
> +      rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
> +      if (aarch64_float_const_zero_rtx_p (elt)
> +	  || aarch64_float_const_representable_p (elt))

What's the effect of including aarch64_float_const_zero_rtx_p for the
first element?  Does it change the code we generate for any cases
involving +0.0?  Or is it more for -0.0?

> +	{
> +	  bool valid = true;
> +	  for (unsigned int i = 1; i < n_elts; i++)
> +	    {
> +	      rtx elt1 = CONST_VECTOR_ENCODED_ELT (op, i);
> +	      if (!aarch64_float_const_zero_rtx_p (elt1))
> +		{
> +		  valid = false;
> +		  break;
> +		}
> +	    }
> +	  if (valid)
> +	    {
> +	      if (info)
> +		*info = simd_immediate_info (elt_float_mode, elt,
> +					     simd_immediate_info::FMOV_SDH);
> +	      return true;
> +	    }
> +	}
> +    }
>
>    /* If all elements in an SVE vector have the same value, we have a free
>       choice between using the element mode and using the container mode.
> @@ -25121,7 +25151,8 @@ aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
>
>    if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
>      {
> -      gcc_assert (info.insn == simd_immediate_info::MOV
> +      gcc_assert ((info.insn == simd_immediate_info::MOV
> +		   || info.insn == simd_immediate_info::FMOV_SDH)
>  		  && info.u.mov.shift == 0);
>        /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
>  	 move immediate path.  */
> @@ -25134,8 +25165,9 @@ aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
>  	  real_to_decimal_for_mode (float_buf,
>  				    CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
>  				    buf_size, buf_size, 1, info.elt_mode);
> -
> -	  if (lane_count == 1)
> +	  if (info.insn == simd_immediate_info::FMOV_SDH)
> +	    snprintf (templ, sizeof (templ), "fmov\t%%%c0, %s", element_char, float_buf);

Long line.  I think this is more general than the lane_count == 1
handling (which would need to change if we ever added 32-bit vectors),
so it's probably better to add "|| lane_count == 1" to this if rather
than have an else if.

> +	  else if (lane_count == 1)
>  	    snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
>  	  else
>  	    snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
> diff --git a/gcc/testsuite/gcc.target/aarch64/fmov-zero-cst-1.c b/gcc/testsuite/gcc.target/aarch64/fmov-zero-cst-1.c
> new file mode 100644
> index 00000000000..9b13ef7b1ef
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/fmov-zero-cst-1.c
> @@ -0,0 +1,52 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mcmodel=tiny" } */
> +/* { dg-final { check-function-bodies "**" "" "" } } */
> +/* PR target/113856 */

The tests look specific to little-endian, so I think they need to be
guarded with aarch64_little_endian.

Thanks,
Richard

> +
> +#define vect64 __attribute__((vector_size(8) ))
> +#define vect128 __attribute__((vector_size(16) ))
> +
> +/*
> +** f1:
> +**	fmov	s0, 1.0e\+0
> +**	ret
> +*/
> +vect64 float f1()
> +{
> +  return (vect64 float){1.0f, 0};
> +}
> +
> +/*
> +** f2:
> +**	fmov	s0, 1.0e\+0
> +**	ret
> +*/
> +vect128 float f2()
> +{
> +  return (vect128 float){1.0f, 0, 0, 0};
> +}
> +
> +/* f3 should only be done for -ffast-math.  */
> +/*
> +** f3:
> +**	ldr	q0, .LC[0-9]+
> +**	ret
> +*/
> +vect128 float f3()
> +{
> +  return (vect128 float){1.0f, 0, -0.0f, 0.0f};
> +}
> +
> +/* f4 cannot use fmov here.
> +   Note this is checked here as {1.0, 0.0, 1.0, 0.0}
> +   has CONST_VECTOR_DUPLICATE_P set
> +   and is represented internally as: {1.0, 0.0}.  */
> +/*
> +** f4:
> +**	ldr	q0, .LC[0-9]+
> +**	ret
> +*/
> +vect128 float f4()
> +{
> +  return (vect128 float){1.0f, 0, 1.0f, 0.0f};
> +}
> diff --git a/gcc/testsuite/gcc.target/aarch64/fmov-zero-cst-2.c b/gcc/testsuite/gcc.target/aarch64/fmov-zero-cst-2.c
> new file mode 100644
> index 00000000000..c97d85b68a9
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/fmov-zero-cst-2.c
> @@ -0,0 +1,19 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mcmodel=tiny -ffast-math" } */
> +/* { dg-final { check-function-bodies "**" "" "" } } */
> +/* PR target/113856 */
> +
> +#define vect64 __attribute__((vector_size(8) ))
> +#define vect128 __attribute__((vector_size(16) ))
> +
> +/* f3 can be done with -ffast-math.  */
> +/*
> +** f3:
> +**	fmov	s0, 1.0e\+0
> +**	ret
> +*/
> +vect128 float f3()
> +{
> +  return (vect128 float){1.0f, 0, -0.0f, 0.0f};
> +}
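To make the last suggestion concrete, the merged form of the two fmov
templates might look roughly like this (an untested sketch, not committed
code; it relies on element_char being 'd' whenever lane_count == 1, which
appears to hold for the single-lane 64-bit float case, so the merged
branch prints the same "fmov %d0, <cst>" text):

	  /* Sketch: fold the lane_count == 1 case into the FMOV_SDH
	     template, as suggested in the review above.  */
	  if (info.insn == simd_immediate_info::FMOV_SDH
	      || lane_count == 1)
	    snprintf (templ, sizeof (templ), "fmov\t%%%c0, %s",
		      element_char, float_buf);
	  else
	    snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
		      lane_count, element_char, float_buf);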
Richard Sandiford <richard.sandiford@arm.com> writes:
> Andrew Pinski <quic_apinski@quicinc.com> writes:
> [...]
>> +  /* See if we can use fmov d0/s0/h0 ... for the constant.  */
>> +  if (n_elts >= 1
>> +      && (vec_flags & VEC_ADVSIMD)
>> +      && is_a <scalar_float_mode> (elt_mode, &elt_float_mode)
>> +      && !CONST_VECTOR_DUPLICATE_P (op))
>> +    {
>> +      rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
>> +      if (aarch64_float_const_zero_rtx_p (elt)
>> +	  || aarch64_float_const_representable_p (elt))
>> +	{
>> +	  bool valid = true;
>> +	  for (unsigned int i = 1; i < n_elts; i++)
>> +	    {
>> +	      rtx elt1 = CONST_VECTOR_ENCODED_ELT (op, i);
>> +	      if (!aarch64_float_const_zero_rtx_p (elt1))
>> +		{
>> +		  valid = false;
>> +		  break;
>> +		}
>> +	    }
>> +	  if (valid)
>> +	    {
>> +	      if (info)
>> +		*info = simd_immediate_info (elt_float_mode, elt,
>> +					     simd_immediate_info::FMOV_SDH);
>> +	      return true;
>> +	    }
>> +	}
>> +    }
> [...]

Sorry to reply to myself almost immediately, but did you consider
extending:

  scalar_float_mode elt_float_mode;
  if (n_elts == 1
      && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
    {
      rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
      if (aarch64_float_const_zero_rtx_p (elt)
	  || aarch64_float_const_representable_p (elt))
	{
	  if (info)
	    *info = simd_immediate_info (elt_float_mode, elt);
	  return true;
	}
    }

rather than adding a new move type?  I think the same trick works for
SVE as well as Advanced SIMD.

Richard
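Concretely, the extension described in that reply might look something
like the following untested sketch, which generalizes the existing
n_elts == 1 block to also accept {fp, 0, ..., 0}.  It is restricted to
the fixed-width Advanced SIMD case, where GET_MODE_NUNITS is a
compile-time constant; the SVE generalization Richard mentions would
need the usual variable-length handling:

  scalar_float_mode elt_float_mode;
  if ((vec_flags & VEC_ADVSIMD)
      && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
    {
      rtx elt = CONST_VECTOR_ELT (op, 0);
      if (aarch64_float_const_zero_rtx_p (elt)
	  || aarch64_float_const_representable_p (elt))
	{
	  /* fmov zeroes the upper lanes, so every element after the
	     first must be +0.0.  Iterate over the mode's full element
	     count rather than the encoded pattern count, so that
	     duplicated encodings like {1.0, 0.0, 1.0, 0.0} are checked
	     in full and correctly rejected.  */
	  unsigned int nunits = GET_MODE_NUNITS (mode).to_constant ();
	  bool rest_zero = true;
	  for (unsigned int i = 1; i < nunits; i++)
	    if (!aarch64_float_const_zero_rtx_p (CONST_VECTOR_ELT (op, i)))
	      {
		rest_zero = false;
		break;
	      }
	  if (rest_zero)
	    {
	      if (info)
		*info = simd_immediate_info (elt_float_mode, elt);
	      return true;
	    }
	}
    }

The output side would still need to print a scalar fmov for multi-lane
modes, which is what the template discussion in the first reply is about.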
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 5dd0814f198..c4386591a9b 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -126,11 +126,11 @@ constexpr auto AARCH64_STATE_OUT = 1U << 2;
 /* Information about a legitimate vector immediate operand.  */
 struct simd_immediate_info
 {
-  enum insn_type { MOV, MVN, INDEX, PTRUE };
+  enum insn_type { MOV, FMOV_SDH, MVN, INDEX, PTRUE };
   enum modifier_type { LSL, MSL };
 
   simd_immediate_info () {}
-  simd_immediate_info (scalar_float_mode, rtx);
+  simd_immediate_info (scalar_float_mode, rtx, insn_type = MOV);
   simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
 		       insn_type = MOV, modifier_type = LSL,
 		       unsigned int = 0);
@@ -145,7 +145,7 @@ struct simd_immediate_info
 
   union
   {
-    /* For MOV and MVN.  */
+    /* For MOV, FMOV_SDH and MVN.  */
     struct
     {
       /* The value of each element.  */
@@ -173,9 +173,10 @@ struct simd_immediate_info
 /* Construct a floating-point immediate in which each element has mode
    ELT_MODE_IN and value VALUE_IN.  */
 inline simd_immediate_info
-::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
-  : elt_mode (elt_mode_in), insn (MOV)
+::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in, insn_type insn_in)
+  : elt_mode (elt_mode_in), insn (insn_in)
 {
+  gcc_assert (insn_in == MOV || insn_in == FMOV_SDH);
   u.mov.value = value_in;
   u.mov.modifier = LSL;
   u.mov.shift = 0;
@@ -22932,6 +22933,35 @@ aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
 	  return true;
 	}
     }
+  /* See if we can use fmov d0/s0/h0 ... for the constant.  */
+  if (n_elts >= 1
+      && (vec_flags & VEC_ADVSIMD)
+      && is_a <scalar_float_mode> (elt_mode, &elt_float_mode)
+      && !CONST_VECTOR_DUPLICATE_P (op))
+    {
+      rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
+      if (aarch64_float_const_zero_rtx_p (elt)
+	  || aarch64_float_const_representable_p (elt))
+	{
+	  bool valid = true;
+	  for (unsigned int i = 1; i < n_elts; i++)
+	    {
+	      rtx elt1 = CONST_VECTOR_ENCODED_ELT (op, i);
+	      if (!aarch64_float_const_zero_rtx_p (elt1))
+		{
+		  valid = false;
+		  break;
+		}
+	    }
+	  if (valid)
+	    {
+	      if (info)
+		*info = simd_immediate_info (elt_float_mode, elt,
+					     simd_immediate_info::FMOV_SDH);
+	      return true;
+	    }
+	}
+    }
 
   /* If all elements in an SVE vector have the same value, we have a free
      choice between using the element mode and using the container mode.
@@ -25121,7 +25151,8 @@ aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
 
   if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
     {
-      gcc_assert (info.insn == simd_immediate_info::MOV
+      gcc_assert ((info.insn == simd_immediate_info::MOV
+		   || info.insn == simd_immediate_info::FMOV_SDH)
 		  && info.u.mov.shift == 0);
       /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
 	 move immediate path.  */
@@ -25134,8 +25165,9 @@ aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
 	  real_to_decimal_for_mode (float_buf,
 				    CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
 				    buf_size, buf_size, 1, info.elt_mode);
-
-	  if (lane_count == 1)
+	  if (info.insn == simd_immediate_info::FMOV_SDH)
+	    snprintf (templ, sizeof (templ), "fmov\t%%%c0, %s", element_char, float_buf);
+	  else if (lane_count == 1)
 	    snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
 	  else
 	    snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
diff --git a/gcc/testsuite/gcc.target/aarch64/fmov-zero-cst-1.c b/gcc/testsuite/gcc.target/aarch64/fmov-zero-cst-1.c
new file mode 100644
index 00000000000..9b13ef7b1ef
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/fmov-zero-cst-1.c
@@ -0,0 +1,52 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mcmodel=tiny" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+/* PR target/113856 */
+
+#define vect64 __attribute__((vector_size(8) ))
+#define vect128 __attribute__((vector_size(16) ))
+
+/*
+** f1:
+**	fmov	s0, 1.0e\+0
+**	ret
+*/
+vect64 float f1()
+{
+  return (vect64 float){1.0f, 0};
+}
+
+/*
+** f2:
+**	fmov	s0, 1.0e\+0
+**	ret
+*/
+vect128 float f2()
+{
+  return (vect128 float){1.0f, 0, 0, 0};
+}
+
+/* f3 should only be done for -ffast-math.  */
+/*
+** f3:
+**	ldr	q0, .LC[0-9]+
+**	ret
+*/
+vect128 float f3()
+{
+  return (vect128 float){1.0f, 0, -0.0f, 0.0f};
+}
+
+/* f4 cannot use fmov here.
+   Note this is checked here as {1.0, 0.0, 1.0, 0.0}
+   has CONST_VECTOR_DUPLICATE_P set
+   and is represented internally as: {1.0, 0.0}.  */
+/*
+** f4:
+**	ldr	q0, .LC[0-9]+
+**	ret
+*/
+vect128 float f4()
+{
+  return (vect128 float){1.0f, 0, 1.0f, 0.0f};
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/fmov-zero-cst-2.c b/gcc/testsuite/gcc.target/aarch64/fmov-zero-cst-2.c
new file mode 100644
index 00000000000..c97d85b68a9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/fmov-zero-cst-2.c
@@ -0,0 +1,19 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mcmodel=tiny -ffast-math" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+/* PR target/113856 */
+
+#define vect64 __attribute__((vector_size(8) ))
+#define vect128 __attribute__((vector_size(16) ))
+
+/* f3 can be done with -ffast-math.  */
+/*
+** f3:
+**	fmov	s0, 1.0e\+0
+**	ret
+*/
+vect128 float f3()
+{
+  return (vect128 float){1.0f, 0, -0.0f, 0.0f};
+}
Aarch64 has a way to form some floating-point CSTs via the fmov
instructions; these instructions also zero out the upper parts of the
registers, so they can be used for vector CSTs that have one non-zero
constant that could be formed via fmov in the first element.

This implements this "small" optimization so these vector CSTs don't
need to be loaded from memory.

Built and tested on aarch64-linux-gnu with no regressions.

	PR target/113856

gcc/ChangeLog:

	* config/aarch64/aarch64.cc (struct simd_immediate_info):
	Add FMOV_SDH to insn_type.  For scalar_float_mode constructor
	add insn_in.
	(aarch64_simd_valid_immediate): Catch `{fp, 0...}` vector_cst
	and return a simd_immediate_info which uses FMOV_SDH.
	(aarch64_output_simd_mov_immediate): Support outputting
	fmov for FMOV_SDH.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/fmov-zero-cst-1.c: New test.
	* gcc.target/aarch64/fmov-zero-cst-2.c: New test.

Signed-off-by: Andrew Pinski <quic_apinski@quicinc.com>
---
 gcc/config/aarch64/aarch64.cc                 | 48 ++++++++++++++---
 .../gcc.target/aarch64/fmov-zero-cst-1.c      | 52 +++++++++++++++++++
 .../gcc.target/aarch64/fmov-zero-cst-2.c      | 19 +++++++
 3 files changed, 111 insertions(+), 8 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/fmov-zero-cst-1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/fmov-zero-cst-2.c
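For reference, the kind of constant the patch targets is the one checked
by test f2 above: with the patch, a function like the following sketch is
expected to assemble to a single "fmov s0, 1.0e+0" instead of a
literal-pool load (the type name v4sf is illustrative, not from the
patch):

  typedef float v4sf __attribute__ ((vector_size (16)));

  /* Before the patch: ldr q0, .LC0 (a load from the constant pool).
     After the patch:  fmov s0, 1.0e+0, since fmov writes the low lane
     and zeroes the remaining lanes of the vector register.  */
  v4sf
  one_then_zeros (void)
  {
    return (v4sf) { 1.0f, 0.0f, 0.0f, 0.0f };
  }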