Message ID | 1454525947-14690-3-git-send-email-charles.baylis@linaro.org |
---|---|
State | New |
Headers | show |
Hi Charles, On 03/02/16 18:59, charles.baylis@linaro.org wrote: > From: Charles Baylis <charles.baylis@linaro.org> > > gcc/ChangeLog: > > 2016-02-03 Charles Baylis <charles.baylis@linaro.org> > > PR target/68532 > * config/arm/arm.c (arm_evpc_neon_vzip): Allow for big endian lane > order. > * config/arm/arm_neon.h (vzipq_s8): Adjust shuffle patterns for big > endian. > (vzipq_s16): Likewise. > (vzipq_s32): Likewise. > (vzipq_f32): Likewise. > (vzipq_u8): Likewise. > (vzipq_u16): Likewise. > (vzipq_u32): Likewise. > (vzipq_p8): Likewise. > (vzipq_p16): Likewise. > > Change-Id: I327678f5e73c1de2f413c1d22769ab42ce1d6c16 > > diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c > index e9aa982..24239db 100644 > --- a/gcc/config/arm/arm.c > +++ b/gcc/config/arm/arm.c > @@ -28318,15 +28318,21 @@ arm_evpc_neon_vzip (struct expand_vec_perm_d *d) > unsigned int i, high, mask, nelt = d->nelt; > rtx out0, out1, in0, in1; > rtx (*gen)(rtx, rtx, rtx, rtx); > + int first_elem; > + bool is_swapped; > > if (GET_MODE_UNIT_SIZE (d->vmode) >= 8) > return false; > > + is_swapped = BYTES_BIG_ENDIAN ? true : false; This is just "is_swapped = BYTES_BIG_ENDIAN;" > + > /* Note that these are little-endian tests. Adjust for big-endian later. */ I think you can remove this comment now, like in patch 1/2 > + first_elem = d->perm[neon_endian_lane_map (d->vmode, 0) ^ is_swapped]; > + > high = nelt / 2; > - if (d->perm[0] == high) > + if (first_elem == neon_endian_lane_map (d->vmode, high)) > ; > - else if (d->perm[0] == 0) > + else if (first_elem == neon_endian_lane_map (d->vmode, 0)) > high = 0; > else > return false; > @@ -28334,11 +28340,16 @@ arm_evpc_neon_vzip (struct expand_vec_perm_d *d) > > for (i = 0; i < nelt / 2; i++) > { > - unsigned elt = (i + high) & mask; > - if (d->perm[i * 2] != elt) > + unsigned elt = > + neon_pair_endian_lane_map (d->vmode, i + high) & mask; > + if (d->perm[neon_pair_endian_lane_map (d->vmode, 2 * i + is_swapped)] > + != elt) > return false; > - elt = (elt + nelt) & mask; > - if (d->perm[i * 2 + 1] != elt) > + elt = > + neon_pair_endian_lane_map (d->vmode, i + nelt + high) > + & mask; The "& mask" can go on the previous line. > + if (d->perm[neon_pair_endian_lane_map (d->vmode, 2 * i + !is_swapped)] > + != elt) > return false; > } > > @@ -28362,10 +28373,9 @@ arm_evpc_neon_vzip (struct expand_vec_perm_d *d) > > in0 = d->op0; > in1 = d->op1; > - if (BYTES_BIG_ENDIAN) > + if (is_swapped) > { > std::swap (in0, in1); > - high = !high; > } remove the braces around the std::swap. Ok with these changes. I've tried out both patch and they do fix execution failures on big-endian and don't break any NEON intrinsics tests that I threw at them. > > out0 = d->target; > diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h > index 2e014b6..aa17f49 100644 > --- a/gcc/config/arm/arm_neon.h > +++ b/gcc/config/arm/arm_neon.h > @@ -8453,9 +8453,9 @@ vzipq_s8 (int8x16_t __a, int8x16_t __b) > int8x16x2_t __rv; > #ifdef __ARM_BIG_ENDIAN > __rv.val[0] = __builtin_shuffle (__a, __b, (uint8x16_t) > - { 24, 8, 25, 9, 26, 10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15 }); > + { 20, 4, 21, 5, 22, 6, 23, 7, 16, 0, 17, 1, 18, 2, 19, 3 }); > __rv.val[1] = __builtin_shuffle (__a, __b, (uint8x16_t) > - { 16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7 }); > + { 28, 12, 29, 13, 30, 14, 31, 15, 24, 8, 25, 9, 26, 10, 27, 11 }); > #else > __rv.val[0] = __builtin_shuffle (__a, __b, (uint8x16_t) > { 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 }); > @@ -8471,9 +8471,9 @@ vzipq_s16 (int16x8_t __a, int16x8_t __b) > int16x8x2_t __rv; > #ifdef __ARM_BIG_ENDIAN > __rv.val[0] = __builtin_shuffle (__a, __b, (uint16x8_t) > - { 12, 4, 13, 5, 14, 6, 15, 7 }); > + { 10, 2, 11, 3, 8, 0, 9, 1 }); > __rv.val[1] = __builtin_shuffle (__a, __b, (uint16x8_t) > - { 8, 0, 9, 1, 10, 2, 11, 3 }); > + { 14, 6, 15, 7, 12, 4, 13, 5 }); > #else > __rv.val[0] = __builtin_shuffle (__a, __b, (uint16x8_t) > { 0, 8, 1, 9, 2, 10, 3, 11 }); > @@ -8488,8 +8488,8 @@ vzipq_s32 (int32x4_t __a, int32x4_t __b) > { > int32x4x2_t __rv; > #ifdef __ARM_BIG_ENDIAN > - __rv.val[0] = __builtin_shuffle (__a, __b, (uint32x4_t) { 6, 2, 7, 3 }); > - __rv.val[1] = __builtin_shuffle (__a, __b, (uint32x4_t) { 4, 0, 5, 1 }); > + __rv.val[0] = __builtin_shuffle (__a, __b, (uint32x4_t) { 5, 1, 4, 0 }); > + __rv.val[1] = __builtin_shuffle (__a, __b, (uint32x4_t) { 7, 3, 6, 2 }); > #else > __rv.val[0] = __builtin_shuffle (__a, __b, (uint32x4_t) { 0, 4, 1, 5 }); > __rv.val[1] = __builtin_shuffle (__a, __b, (uint32x4_t) { 2, 6, 3, 7 }); > @@ -8502,8 +8502,8 @@ vzipq_f32 (float32x4_t __a, float32x4_t __b) > { > float32x4x2_t __rv; > #ifdef __ARM_BIG_ENDIAN > - __rv.val[0] = __builtin_shuffle (__a, __b, (uint32x4_t) { 6, 2, 7, 3 }); > - __rv.val[1] = __builtin_shuffle (__a, __b, (uint32x4_t) { 4, 0, 5, 1 }); > + __rv.val[0] = __builtin_shuffle (__a, __b, (uint32x4_t) { 5, 1, 4, 0 }); > + __rv.val[1] = __builtin_shuffle (__a, __b, (uint32x4_t) { 7, 3, 6, 2 }); > #else > __rv.val[0] = __builtin_shuffle (__a, __b, (uint32x4_t) { 0, 4, 1, 5 }); > __rv.val[1] = __builtin_shuffle (__a, __b, (uint32x4_t) { 2, 6, 3, 7 }); > @@ -8517,9 +8517,9 @@ vzipq_u8 (uint8x16_t __a, uint8x16_t __b) > uint8x16x2_t __rv; > #ifdef __ARM_BIG_ENDIAN > __rv.val[0] = __builtin_shuffle (__a, __b, (uint8x16_t) > - { 24, 8, 25, 9, 26, 10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15 }); > + { 20, 4, 21, 5, 22, 6, 23, 7, 16, 0, 17, 1, 18, 2, 19, 3 }); > __rv.val[1] = __builtin_shuffle (__a, __b, (uint8x16_t) > - { 16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7 }); > + { 28, 12, 29, 13, 30, 14, 31, 15, 24, 8, 25, 9, 26, 10, 27, 11 }); > #else > __rv.val[0] = __builtin_shuffle (__a, __b, (uint8x16_t) > { 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 }); > @@ -8535,9 +8535,9 @@ vzipq_u16 (uint16x8_t __a, uint16x8_t __b) > uint16x8x2_t __rv; > #ifdef __ARM_BIG_ENDIAN > __rv.val[0] = __builtin_shuffle (__a, __b, (uint16x8_t) > - { 12, 4, 13, 5, 14, 6, 15, 7 }); > + { 10, 2, 11, 3, 8, 0, 9, 1 }); > __rv.val[1] = __builtin_shuffle (__a, __b, (uint16x8_t) > - { 8, 0, 9, 1, 10, 2, 11, 3 }); > + { 14, 6, 15, 7, 12, 4, 13, 5 }); > #else > __rv.val[0] = __builtin_shuffle (__a, __b, (uint16x8_t) > { 0, 8, 1, 9, 2, 10, 3, 11 }); > @@ -8552,8 +8552,8 @@ vzipq_u32 (uint32x4_t __a, uint32x4_t __b) > { > uint32x4x2_t __rv; > #ifdef __ARM_BIG_ENDIAN > - __rv.val[0] = __builtin_shuffle (__a, __b, (uint32x4_t) { 6, 2, 7, 3 }); > - __rv.val[1] = __builtin_shuffle (__a, __b, (uint32x4_t) { 4, 0, 5, 1 }); > + __rv.val[0] = __builtin_shuffle (__a, __b, (uint32x4_t) { 5, 1, 4, 0 }); > + __rv.val[1] = __builtin_shuffle (__a, __b, (uint32x4_t) { 7, 3, 6, 2 }); > #else > __rv.val[0] = __builtin_shuffle (__a, __b, (uint32x4_t) { 0, 4, 1, 5 }); > __rv.val[1] = __builtin_shuffle (__a, __b, (uint32x4_t) { 2, 6, 3, 7 }); > @@ -8567,9 +8567,9 @@ vzipq_p8 (poly8x16_t __a, poly8x16_t __b) > poly8x16x2_t __rv; > #ifdef __ARM_BIG_ENDIAN > __rv.val[0] = __builtin_shuffle (__a, __b, (uint8x16_t) > - { 24, 8, 25, 9, 26, 10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15 }); > + { 20, 4, 21, 5, 22, 6, 23, 7, 16, 0, 17, 1, 18, 2, 19, 3 }); > __rv.val[1] = __builtin_shuffle (__a, __b, (uint8x16_t) > - { 16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7 }); > + { 28, 12, 29, 13, 30, 14, 31, 15, 24, 8, 25, 9, 26, 10, 27, 11 }); > #else > __rv.val[0] = __builtin_shuffle (__a, __b, (uint8x16_t) > { 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 }); > @@ -8585,9 +8585,9 @@ vzipq_p16 (poly16x8_t __a, poly16x8_t __b) > poly16x8x2_t __rv; > #ifdef __ARM_BIG_ENDIAN > __rv.val[0] = __builtin_shuffle (__a, __b, (uint16x8_t) > - { 12, 4, 13, 5, 14, 6, 15, 7 }); > + { 10, 2, 11, 3, 8, 0, 9, 1 }); > __rv.val[1] = __builtin_shuffle (__a, __b, (uint16x8_t) > - { 8, 0, 9, 1, 10, 2, 11, 3 }); > + { 14, 6, 15, 7, 12, 4, 13, 5 }); > #else > __rv.val[0] = __builtin_shuffle (__a, __b, (uint16x8_t) > { 0, 8, 1, 9, 2, 10, 3, 11 });
diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c index e9aa982..24239db 100644 --- a/gcc/config/arm/arm.c +++ b/gcc/config/arm/arm.c @@ -28318,15 +28318,21 @@ arm_evpc_neon_vzip (struct expand_vec_perm_d *d) unsigned int i, high, mask, nelt = d->nelt; rtx out0, out1, in0, in1; rtx (*gen)(rtx, rtx, rtx, rtx); + int first_elem; + bool is_swapped; if (GET_MODE_UNIT_SIZE (d->vmode) >= 8) return false; + is_swapped = BYTES_BIG_ENDIAN ? true : false; + /* Note that these are little-endian tests. Adjust for big-endian later. */ + first_elem = d->perm[neon_endian_lane_map (d->vmode, 0) ^ is_swapped]; + high = nelt / 2; - if (d->perm[0] == high) + if (first_elem == neon_endian_lane_map (d->vmode, high)) ; - else if (d->perm[0] == 0) + else if (first_elem == neon_endian_lane_map (d->vmode, 0)) high = 0; else return false; @@ -28334,11 +28340,16 @@ arm_evpc_neon_vzip (struct expand_vec_perm_d *d) for (i = 0; i < nelt / 2; i++) { - unsigned elt = (i + high) & mask; - if (d->perm[i * 2] != elt) + unsigned elt = + neon_pair_endian_lane_map (d->vmode, i + high) & mask; + if (d->perm[neon_pair_endian_lane_map (d->vmode, 2 * i + is_swapped)] + != elt) return false; - elt = (elt + nelt) & mask; - if (d->perm[i * 2 + 1] != elt) + elt = + neon_pair_endian_lane_map (d->vmode, i + nelt + high) + & mask; + if (d->perm[neon_pair_endian_lane_map (d->vmode, 2 * i + !is_swapped)] + != elt) return false; } @@ -28362,10 +28373,9 @@ arm_evpc_neon_vzip (struct expand_vec_perm_d *d) in0 = d->op0; in1 = d->op1; - if (BYTES_BIG_ENDIAN) + if (is_swapped) { std::swap (in0, in1); - high = !high; } out0 = d->target; diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h index 2e014b6..aa17f49 100644 --- a/gcc/config/arm/arm_neon.h +++ b/gcc/config/arm/arm_neon.h @@ -8453,9 +8453,9 @@ vzipq_s8 (int8x16_t __a, int8x16_t __b) int8x16x2_t __rv; #ifdef __ARM_BIG_ENDIAN __rv.val[0] = __builtin_shuffle (__a, __b, (uint8x16_t) - { 24, 8, 25, 9, 26, 10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15 }); + { 20, 4, 21, 5, 22, 6, 23, 7, 16, 0, 17, 1, 18, 2, 19, 3 }); __rv.val[1] = __builtin_shuffle (__a, __b, (uint8x16_t) - { 16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7 }); + { 28, 12, 29, 13, 30, 14, 31, 15, 24, 8, 25, 9, 26, 10, 27, 11 }); #else __rv.val[0] = __builtin_shuffle (__a, __b, (uint8x16_t) { 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 }); @@ -8471,9 +8471,9 @@ vzipq_s16 (int16x8_t __a, int16x8_t __b) int16x8x2_t __rv; #ifdef __ARM_BIG_ENDIAN __rv.val[0] = __builtin_shuffle (__a, __b, (uint16x8_t) - { 12, 4, 13, 5, 14, 6, 15, 7 }); + { 10, 2, 11, 3, 8, 0, 9, 1 }); __rv.val[1] = __builtin_shuffle (__a, __b, (uint16x8_t) - { 8, 0, 9, 1, 10, 2, 11, 3 }); + { 14, 6, 15, 7, 12, 4, 13, 5 }); #else __rv.val[0] = __builtin_shuffle (__a, __b, (uint16x8_t) { 0, 8, 1, 9, 2, 10, 3, 11 }); @@ -8488,8 +8488,8 @@ vzipq_s32 (int32x4_t __a, int32x4_t __b) { int32x4x2_t __rv; #ifdef __ARM_BIG_ENDIAN - __rv.val[0] = __builtin_shuffle (__a, __b, (uint32x4_t) { 6, 2, 7, 3 }); - __rv.val[1] = __builtin_shuffle (__a, __b, (uint32x4_t) { 4, 0, 5, 1 }); + __rv.val[0] = __builtin_shuffle (__a, __b, (uint32x4_t) { 5, 1, 4, 0 }); + __rv.val[1] = __builtin_shuffle (__a, __b, (uint32x4_t) { 7, 3, 6, 2 }); #else __rv.val[0] = __builtin_shuffle (__a, __b, (uint32x4_t) { 0, 4, 1, 5 }); __rv.val[1] = __builtin_shuffle (__a, __b, (uint32x4_t) { 2, 6, 3, 7 }); @@ -8502,8 +8502,8 @@ vzipq_f32 (float32x4_t __a, float32x4_t __b) { float32x4x2_t __rv; #ifdef __ARM_BIG_ENDIAN - __rv.val[0] = __builtin_shuffle (__a, __b, (uint32x4_t) { 6, 2, 7, 3 }); - __rv.val[1] = __builtin_shuffle (__a, __b, (uint32x4_t) { 4, 0, 5, 1 }); + __rv.val[0] = __builtin_shuffle (__a, __b, (uint32x4_t) { 5, 1, 4, 0 }); + __rv.val[1] = __builtin_shuffle (__a, __b, (uint32x4_t) { 7, 3, 6, 2 }); #else __rv.val[0] = __builtin_shuffle (__a, __b, (uint32x4_t) { 0, 4, 1, 5 }); __rv.val[1] = __builtin_shuffle (__a, __b, (uint32x4_t) { 2, 6, 3, 7 }); @@ -8517,9 +8517,9 @@ vzipq_u8 (uint8x16_t __a, uint8x16_t __b) uint8x16x2_t __rv; #ifdef __ARM_BIG_ENDIAN __rv.val[0] = __builtin_shuffle (__a, __b, (uint8x16_t) - { 24, 8, 25, 9, 26, 10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15 }); + { 20, 4, 21, 5, 22, 6, 23, 7, 16, 0, 17, 1, 18, 2, 19, 3 }); __rv.val[1] = __builtin_shuffle (__a, __b, (uint8x16_t) - { 16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7 }); + { 28, 12, 29, 13, 30, 14, 31, 15, 24, 8, 25, 9, 26, 10, 27, 11 }); #else __rv.val[0] = __builtin_shuffle (__a, __b, (uint8x16_t) { 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 }); @@ -8535,9 +8535,9 @@ vzipq_u16 (uint16x8_t __a, uint16x8_t __b) uint16x8x2_t __rv; #ifdef __ARM_BIG_ENDIAN __rv.val[0] = __builtin_shuffle (__a, __b, (uint16x8_t) - { 12, 4, 13, 5, 14, 6, 15, 7 }); + { 10, 2, 11, 3, 8, 0, 9, 1 }); __rv.val[1] = __builtin_shuffle (__a, __b, (uint16x8_t) - { 8, 0, 9, 1, 10, 2, 11, 3 }); + { 14, 6, 15, 7, 12, 4, 13, 5 }); #else __rv.val[0] = __builtin_shuffle (__a, __b, (uint16x8_t) { 0, 8, 1, 9, 2, 10, 3, 11 }); @@ -8552,8 +8552,8 @@ vzipq_u32 (uint32x4_t __a, uint32x4_t __b) { uint32x4x2_t __rv; #ifdef __ARM_BIG_ENDIAN - __rv.val[0] = __builtin_shuffle (__a, __b, (uint32x4_t) { 6, 2, 7, 3 }); - __rv.val[1] = __builtin_shuffle (__a, __b, (uint32x4_t) { 4, 0, 5, 1 }); + __rv.val[0] = __builtin_shuffle (__a, __b, (uint32x4_t) { 5, 1, 4, 0 }); + __rv.val[1] = __builtin_shuffle (__a, __b, (uint32x4_t) { 7, 3, 6, 2 }); #else __rv.val[0] = __builtin_shuffle (__a, __b, (uint32x4_t) { 0, 4, 1, 5 }); __rv.val[1] = __builtin_shuffle (__a, __b, (uint32x4_t) { 2, 6, 3, 7 }); @@ -8567,9 +8567,9 @@ vzipq_p8 (poly8x16_t __a, poly8x16_t __b) poly8x16x2_t __rv; #ifdef __ARM_BIG_ENDIAN __rv.val[0] = __builtin_shuffle (__a, __b, (uint8x16_t) - { 24, 8, 25, 9, 26, 10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15 }); + { 20, 4, 21, 5, 22, 6, 23, 7, 16, 0, 17, 1, 18, 2, 19, 3 }); __rv.val[1] = __builtin_shuffle (__a, __b, (uint8x16_t) - { 16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7 }); + { 28, 12, 29, 13, 30, 14, 31, 15, 24, 8, 25, 9, 26, 10, 27, 11 }); #else __rv.val[0] = __builtin_shuffle (__a, __b, (uint8x16_t) { 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 }); @@ -8585,9 +8585,9 @@ vzipq_p16 (poly16x8_t __a, poly16x8_t __b) poly16x8x2_t __rv; #ifdef __ARM_BIG_ENDIAN __rv.val[0] = __builtin_shuffle (__a, __b, (uint16x8_t) - { 12, 4, 13, 5, 14, 6, 15, 7 }); + { 10, 2, 11, 3, 8, 0, 9, 1 }); __rv.val[1] = __builtin_shuffle (__a, __b, (uint16x8_t) - { 8, 0, 9, 1, 10, 2, 11, 3 }); + { 14, 6, 15, 7, 12, 4, 13, 5 }); #else __rv.val[0] = __builtin_shuffle (__a, __b, (uint16x8_t) { 0, 8, 1, 9, 2, 10, 3, 11 });
From: Charles Baylis <charles.baylis@linaro.org> gcc/ChangeLog: 2016-02-03 Charles Baylis <charles.baylis@linaro.org> PR target/68532 * config/arm/arm.c (arm_evpc_neon_vzip): Allow for big endian lane order. * config/arm/arm_neon.h (vzipq_s8): Adjust shuffle patterns for big endian. (vzipq_s16): Likewise. (vzipq_s32): Likewise. (vzipq_f32): Likewise. (vzipq_u8): Likewise. (vzipq_u16): Likewise. (vzipq_u32): Likewise. (vzipq_p8): Likewise. (vzipq_p16): Likewise. Change-Id: I327678f5e73c1de2f413c1d22769ab42ce1d6c16