commit 1e3a8ea7fbaed4cd6638b08fb18b951a9c02f889
Author: Julian Brown <jbrown@build6-lucid-cs.sje.mentorg.com>
Date: Fri Jul 13 07:13:09 2012 -0700
Optimize NEON vdup builtins.
@@ -7608,6 +7608,17 @@ arm_rtx_costs_1 (rtx x, enum rtx_code outer, int* total, bool speed)
}
return true;
+ case CONST_VECTOR:
+ if (TARGET_NEON
+ && TARGET_HARD_FLOAT
+ && outer == SET
+ && (VALID_NEON_DREG_MODE (mode) || VALID_NEON_QREG_MODE (mode))
+ && neon_immediate_valid_for_move (x, mode, NULL, NULL))
+ *total = COSTS_N_INSNS (1);
+ else
+ *total = COSTS_N_INSNS (4);
+ return true;
+
default:
*total = COSTS_N_INSNS (4);
return false;
@@ -7948,6 +7959,17 @@ arm_size_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer_code,
*total = COSTS_N_INSNS (4);
return true;
+ case CONST_VECTOR:
+ if (TARGET_NEON
+ && TARGET_HARD_FLOAT
+ && outer_code == SET
+ && (VALID_NEON_DREG_MODE (mode) || VALID_NEON_QREG_MODE (mode))
+ && neon_immediate_valid_for_move (x, mode, NULL, NULL))
+ *total = COSTS_N_INSNS (1);
+ else
+ *total = COSTS_N_INSNS (4);
+ return true;
+
case HIGH:
case LO_SUM:
/* We prefer constant pool entries to MOVW/MOVT pairs, so bump the
@@ -8768,11 +8790,14 @@ vfp3_const_double_rtx (rtx x)
vmov i64 17 aaaaaaaa bbbbbbbb cccccccc dddddddd
eeeeeeee ffffffff gggggggg hhhhhhhh
vmov f32 18 aBbbbbbc defgh000 00000000 00000000
+ vmov f32 19 00000000 00000000 00000000 00000000
For case 18, B = !b. Representable values are exactly those accepted by
vfp3_const_double_index, but are output as floating-point numbers rather
than indices.
+ For case 19, we will change it to vmov.i32 when assembling.
+
Variants 0-5 (inclusive) may also be used as immediates for the second
operand of VORR/VBIC instructions.
@@ -8829,7 +8854,7 @@ neon_valid_immediate (rtx op, enum machine_mode mode, int inverse,
rtx el0 = CONST_VECTOR_ELT (op, 0);
REAL_VALUE_TYPE r0;
- if (!vfp3_const_double_rtx (el0))
+ if (!vfp3_const_double_rtx (el0) && el0 != CONST0_RTX (GET_MODE (el0)))
return -1;
REAL_VALUE_FROM_CONST_DOUBLE (r0, el0);
@@ -8851,7 +8876,10 @@ neon_valid_immediate (rtx op, enum machine_mode mode, int inverse,
if (elementwidth)
*elementwidth = 0;
- return 18;
+ if (el0 == CONST0_RTX (GET_MODE (el0)))
+ return 19;
+ else
+ return 18;
}
/* Splat vector constant out into a byte vector. */
@@ -4,10 +4,8 @@
/* { dg-add-options arm_neon } */
#include <arm_neon.h>
-float32x2_t f_sub_abs_to_vabd_32()
+float32x2_t f_sub_abs_to_vabd_32(float32x2_t val1, float32x2_t val2)
{
- float32x2_t val1 = vdup_n_f32 (10);
- float32x2_t val2 = vdup_n_f32 (30);
float32x2_t sres = vsub_f32(val1, val2);
float32x2_t res = vabs_f32 (sres);
@@ -16,10 +14,8 @@ float32x2_t f_sub_abs_to_vabd_32()
/* { dg-final { scan-assembler "vabd\.f32" } }*/
#include <arm_neon.h>
-int8x8_t sub_abs_to_vabd_8()
+int8x8_t sub_abs_to_vabd_8(int8x8_t val1, int8x8_t val2)
{
- int8x8_t val1 = vdup_n_s8 (10);
- int8x8_t val2 = vdup_n_s8 (30);
int8x8_t sres = vsub_s8(val1, val2);
int8x8_t res = vabs_s8 (sres);
@@ -27,10 +23,8 @@ int8x8_t sub_abs_to_vabd_8()
}
/* { dg-final { scan-assembler "vabd\.s8" } }*/
-int16x4_t sub_abs_to_vabd_16()
+int16x4_t sub_abs_to_vabd_16(int16x4_t val1, int16x4_t val2)
{
- int16x4_t val1 = vdup_n_s16 (10);
- int16x4_t val2 = vdup_n_s16 (30);
int16x4_t sres = vsub_s16(val1, val2);
int16x4_t res = vabs_s16 (sres);
@@ -38,10 +32,8 @@ int16x4_t sub_abs_to_vabd_16()
}
/* { dg-final { scan-assembler "vabd\.s16" } }*/
-int32x2_t sub_abs_to_vabd_32()
+int32x2_t sub_abs_to_vabd_32(int32x2_t val1, int32x2_t val2)
{
- int32x2_t val1 = vdup_n_s32 (10);
- int32x2_t val2 = vdup_n_s32 (30);
int32x2_t sres = vsub_s32(val1, val2);
int32x2_t res = vabs_s32 (sres);
new file mode 100644
@@ -0,0 +1,17 @@
+/* Test the optimization of `vdupq_n_f32' ARM Neon intrinsic. */
+
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_ok } */
+/* { dg-options "-O2" } */
+/* { dg-add-options arm_neon } */
+
+#include <arm_neon.h>
+
+float32x4_t out_float32x4_t;
+void test_vdupq_nf32 (void)
+{
+ out_float32x4_t = vdupq_n_f32 (0.0);
+}
+
+/* { dg-final { scan-assembler "vmov\.f32\[ \]+\[qQ\]\[0-9\]+, #0\.0\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
+/* { dg-final { cleanup-saved-temps } } */
new file mode 100644
@@ -0,0 +1,17 @@
+/* Test the optimization of `vdupq_n_u32' ARM Neon intrinsic. */
+
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_ok } */
+/* { dg-options "-O2" } */
+/* { dg-add-options arm_neon } */
+
+#include <arm_neon.h>
+
+uint32x4_t out_uint32x4_t;
+void test_vdupq_nu32 (void)
+{
+ out_uint32x4_t = vdupq_n_u32 (~0x12000000);
+}
+
+/* { dg-final { scan-assembler "vmov\.i32\[ \]+\[qQ\]\[0-9\]+, #3992977407\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
+/* { dg-final { cleanup-saved-temps } } */
new file mode 100644
@@ -0,0 +1,17 @@
+/* Test the optimization of `vdupq_n_u16' ARM Neon intrinsic. */
+
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_ok } */
+/* { dg-options "-O2" } */
+/* { dg-add-options arm_neon } */
+
+#include <arm_neon.h>
+
+uint16x8_t out_uint16x8_t;
+void test_vdupq_nu16 (void)
+{
+ out_uint16x8_t = vdupq_n_u16 (0x12);
+}
+
+/* { dg-final { scan-assembler "vmov\.i16\[ \]+\[qQ\]\[0-9\]+, #18\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
+/* { dg-final { cleanup-saved-temps } } */
new file mode 100644
@@ -0,0 +1,17 @@
+/* Test the optimization of `vdupq_n_u16' ARM Neon intrinsic. */
+
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_ok } */
+/* { dg-options "-O2" } */
+/* { dg-add-options arm_neon } */
+
+#include <arm_neon.h>
+
+uint16x8_t out_uint16x8_t;
+void test_vdupq_nu16 (void)
+{
+ out_uint16x8_t = vdupq_n_u16 (0x1200);
+}
+
+/* { dg-final { scan-assembler "vmov\.i16\[ \]+\[qQ\]\[0-9\]+, #4608\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
+/* { dg-final { cleanup-saved-temps } } */
new file mode 100644
@@ -0,0 +1,17 @@
+/* Test the optimization of `vdupq_n_u16' ARM Neon intrinsic. */
+
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_ok } */
+/* { dg-options "-O2" } */
+/* { dg-add-options arm_neon } */
+
+#include <arm_neon.h>
+
+uint16x8_t out_uint16x8_t;
+void test_vdupq_nu16 (void)
+{
+ out_uint16x8_t = vdupq_n_u16 (~0x12);
+}
+
+/* { dg-final { scan-assembler "vmov\.i16\[ \]+\[qQ\]\[0-9\]+, #65517\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
+/* { dg-final { cleanup-saved-temps } } */
new file mode 100644
@@ -0,0 +1,17 @@
+/* Test the optimization of `vdupq_n_u16' ARM Neon intrinsic. */
+
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_ok } */
+/* { dg-options "-O2" } */
+/* { dg-add-options arm_neon } */
+
+#include <arm_neon.h>
+
+uint16x8_t out_uint16x8_t;
+void test_vdupq_nu16 (void)
+{
+ out_uint16x8_t = vdupq_n_u16 (~0x1200);
+}
+
+/* { dg-final { scan-assembler "vmov\.i16\[ \]+\[qQ\]\[0-9\]+, #60927\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
+/* { dg-final { cleanup-saved-temps } } */
new file mode 100644
@@ -0,0 +1,17 @@
+/* Test the optimization of `vdupq_n_u8' ARM Neon intrinsic. */
+
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_ok } */
+/* { dg-options "-O2" } */
+/* { dg-add-options arm_neon } */
+
+#include <arm_neon.h>
+
+uint8x16_t out_uint8x16_t;
+void test_vdupq_nu8 (void)
+{
+ out_uint8x16_t = vdupq_n_u8 (0x12);
+}
+
+/* { dg-final { scan-assembler "vmov\.i8\[ \]+\[qQ\]\[0-9\]+, #18\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
+/* { dg-final { cleanup-saved-temps } } */
new file mode 100644
@@ -0,0 +1,17 @@
+/* Test the optimization of `vdupq_n_u32' ARM Neon intrinsic. */
+
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_ok } */
+/* { dg-options "-O2" } */
+/* { dg-add-options arm_neon } */
+
+#include <arm_neon.h>
+
+uint32x4_t out_uint32x4_t;
+void test_vdupq_nu32 (void)
+{
+ out_uint32x4_t = vdupq_n_u32 (0x12ff);
+}
+
+/* { dg-final { scan-assembler "vmov\.i32\[ \]+\[qQ\]\[0-9\]+, #4863\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
+/* { dg-final { cleanup-saved-temps } } */
new file mode 100644
@@ -0,0 +1,17 @@
+/* Test the optimization of `vdupq_n_u32' ARM Neon intrinsic. */
+
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_ok } */
+/* { dg-options "-O2" } */
+/* { dg-add-options arm_neon } */
+
+#include <arm_neon.h>
+
+uint32x4_t out_uint32x4_t;
+void test_vdupq_nu32 (void)
+{
+ out_uint32x4_t = vdupq_n_u32 (0x12ffff);
+}
+
+/* { dg-final { scan-assembler "vmov\.i32\[ \]+\[qQ\]\[0-9\]+, #1245183\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
+/* { dg-final { cleanup-saved-temps } } */
new file mode 100644
@@ -0,0 +1,17 @@
+/* Test the optimization of `vdupq_n_u32' ARM Neon intrinsic. */
+
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_ok } */
+/* { dg-options "-O2" } */
+/* { dg-add-options arm_neon } */
+
+#include <arm_neon.h>
+
+uint32x4_t out_uint32x4_t;
+void test_vdupq_nu32 (void)
+{
+ out_uint32x4_t = vdupq_n_u32 (~0x12ff);
+}
+
+/* { dg-final { scan-assembler "vmov\.i32\[ \]+\[qQ\]\[0-9\]+, #4294962432\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
+/* { dg-final { cleanup-saved-temps } } */
new file mode 100644
@@ -0,0 +1,17 @@
+/* Test the optimization of `vdupq_n_u32' ARM Neon intrinsic. */
+
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_ok } */
+/* { dg-options "-O2" } */
+/* { dg-add-options arm_neon } */
+
+#include <arm_neon.h>
+
+uint32x4_t out_uint32x4_t;
+void test_vdupq_nu32 (void)
+{
+ out_uint32x4_t = vdupq_n_u32 (~0x12ffff);
+}
+
+/* { dg-final { scan-assembler "vmov\.i32\[ \]+\[qQ\]\[0-9\]+, #4293722112\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
+/* { dg-final { cleanup-saved-temps } } */
new file mode 100644
@@ -0,0 +1,17 @@
+/* Test the optimization of `vdupq_n_f32' ARM Neon intrinsic. */
+
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_ok } */
+/* { dg-options "-O2" } */
+/* { dg-add-options arm_neon } */
+
+#include <arm_neon.h>
+
+float32x4_t out_float32x4_t;
+void test_vdupq_nf32 (void)
+{
+ out_float32x4_t = vdupq_n_f32 (0.125);
+}
+
+/* { dg-final { scan-assembler "vmov\.f32\[ \]+\[qQ\]\[0-9\]+, #1\.25e-1\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
+/* { dg-final { cleanup-saved-temps } } */
new file mode 100644
@@ -0,0 +1,17 @@
+/* Test the optimization of `vdupq_n_u32' ARM Neon intrinsic. */
+
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_ok } */
+/* { dg-options "-O2" } */
+/* { dg-add-options arm_neon } */
+
+#include <arm_neon.h>
+
+uint32x4_t out_uint32x4_t;
+void test_vdupq_nu32 (void)
+{
+ out_uint32x4_t = vdupq_n_u32 (0x12);
+}
+
+/* { dg-final { scan-assembler "vmov\.i32\[ \]+\[qQ\]\[0-9\]+, #18\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
+/* { dg-final { cleanup-saved-temps } } */
new file mode 100644
@@ -0,0 +1,17 @@
+/* Test the optimization of `vdupq_n_u32' ARM Neon intrinsic. */
+
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_ok } */
+/* { dg-options "-O2" } */
+/* { dg-add-options arm_neon } */
+
+#include <arm_neon.h>
+
+uint32x4_t out_uint32x4_t;
+void test_vdupq_nu32 (void)
+{
+ out_uint32x4_t = vdupq_n_u32 (0x1200);
+}
+
+/* { dg-final { scan-assembler "vmov\.i32\[ \]+\[qQ\]\[0-9\]+, #4608\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
+/* { dg-final { cleanup-saved-temps } } */
new file mode 100644
@@ -0,0 +1,17 @@
+/* Test the optimization of `vdupq_n_u32' ARM Neon intrinsic. */
+
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_ok } */
+/* { dg-options "-O2" } */
+/* { dg-add-options arm_neon } */
+
+#include <arm_neon.h>
+
+uint32x4_t out_uint32x4_t;
+void test_vdupq_nu32 (void)
+{
+ out_uint32x4_t = vdupq_n_u32 (0x120000);
+}
+
+/* { dg-final { scan-assembler "vmov\.i32\[ \]+\[qQ\]\[0-9\]+, #1179648\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
+/* { dg-final { cleanup-saved-temps } } */
new file mode 100644
@@ -0,0 +1,17 @@
+/* Test the optimization of `vdupq_n_u32' ARM Neon intrinsic. */
+
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_ok } */
+/* { dg-options "-O2" } */
+/* { dg-add-options arm_neon } */
+
+#include <arm_neon.h>
+
+uint32x4_t out_uint32x4_t;
+void test_vdupq_nu32 (void)
+{
+ out_uint32x4_t = vdupq_n_u32 (0x12000000);
+}
+
+/* { dg-final { scan-assembler "vmov\.i32\[ \]+\[qQ\]\[0-9\]+, #301989888\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
+/* { dg-final { cleanup-saved-temps } } */
new file mode 100644
@@ -0,0 +1,17 @@
+/* Test the optimization of `vdupq_n_u32' ARM Neon intrinsic. */
+
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_ok } */
+/* { dg-options "-O2" } */
+/* { dg-add-options arm_neon } */
+
+#include <arm_neon.h>
+
+uint32x4_t out_uint32x4_t;
+void test_vdupq_nu32 (void)
+{
+ out_uint32x4_t = vdupq_n_u32 (~0x12);
+}
+
+/* { dg-final { scan-assembler "vmov\.i32\[ \]+\[qQ\]\[0-9\]+, #4294967277\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
+/* { dg-final { cleanup-saved-temps } } */
new file mode 100644
@@ -0,0 +1,17 @@
+/* Test the optimization of `vdupq_n_u32' ARM Neon intrinsic. */
+
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_ok } */
+/* { dg-options "-O2" } */
+/* { dg-add-options arm_neon } */
+
+#include <arm_neon.h>
+
+uint32x4_t out_uint32x4_t;
+void test_vdupq_nu32 (void)
+{
+ out_uint32x4_t = vdupq_n_u32 (~0x1200);
+}
+
+/* { dg-final { scan-assembler "vmov\.i32\[ \]+\[qQ\]\[0-9\]+, #4294962687\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
+/* { dg-final { cleanup-saved-temps } } */
new file mode 100644
@@ -0,0 +1,17 @@
+/* Test the optimization of `vdupq_n_u32' ARM Neon intrinsic. */
+
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_ok } */
+/* { dg-options "-O2" } */
+/* { dg-add-options arm_neon } */
+
+#include <arm_neon.h>
+
+uint32x4_t out_uint32x4_t;
+void test_vdupq_nu32 (void)
+{
+ out_uint32x4_t = vdupq_n_u32 (~0x120000);
+}
+
+/* { dg-final { scan-assembler "vmov\.i32\[ \]+\[qQ\]\[0-9\]+, #4293787647\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
+/* { dg-final { cleanup-saved-temps } } */