@@ -8505,6 +8505,18 @@ (define_insn "aarch64_<PERMUTE:perm_insn><mode><vczle><vczbe>"
[(set_attr "type" "neon_permute<q>")]
)
+;; ZIP1 ignores the contents of the upper halves of the registers,
+;; so we can describe 128-bit operations in terms of 64-bit inputs.
+(define_insn "aarch64_zip1<mode>_low"
+ [(set (match_operand:VQ 0 "register_operand" "=w")
+ (unspec:VQ [(match_operand:<VHALF> 1 "register_operand" "w")
+ (match_operand:<VHALF> 2 "register_operand" "w")]
+ UNSPEC_ZIP1))]
+ "TARGET_SIMD"
+ "zip1\t%0.<Vtype>, %1.<Vtype>, %2.<Vtype>"
+ [(set_attr "type" "neon_permute_q")]
+)
+
;; This instruction's pattern is generated directly by
;; aarch64_expand_vec_perm_const, so any changes to the pattern would
;; need corresponding changes there. Note that the immediate (third)
@@ -9685,9 +9697,8 @@ (define_insn_and_split "<optab><Vnarrowq><mode>2"
not sufficient uses of the zero to make the split worthwhile. */
rtx res = simplify_gen_subreg (<VNARROWQ2>mode, operands[0],
<MODE>mode, 0);
- rtx zero = aarch64_gen_shareable_zero (<VNARROWQ2>mode);
- rtx op = lowpart_subreg (<VNARROWQ2>mode, operands[1], <VNARROWQ>mode);
- emit_insn (gen_aarch64_zip1<Vnarrowq2> (res, op, zero));
+ rtx zero = aarch64_gen_shareable_zero (<VNARROWQ>mode);
+ emit_insn (gen_aarch64_zip1<Vnarrowq2>_low (res, operands[1], zero));
DONE;
}
[(set_attr "type" "neon_shift_imm_long")]
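(Illustrative sketch, not part of the patch: the split above relies on ZIP1-with-zero acting as a zero-extending widen when the 128-bit result is viewed at twice the element width. The helper names widen_via_movl and widen_via_zip1 below are made up for illustration, and little-endian lane ordering is assumed.)

#include <arm_neon.h>

/* Reference semantics: zero-extend each byte of X to 16 bits.  */
uint16x8_t widen_via_movl (uint8x8_t x)
{
  return vmovl_u8 (x);
}

/* Equivalent on little-endian: interleave X with zero bytes (ZIP1) and
   reinterpret the 128-bit result as eight 16-bit lanes.  ZIP1 only reads
   the low 64 bits of each input, so whatever sits in the upper half of
   the first operand cannot affect the result; that is the property the
   new aarch64_zip1<mode>_low pattern describes.  */
uint16x8_t widen_via_zip1 (uint8x8_t x)
{
  uint8x16_t zipped = vzip1q_u8 (vcombine_u8 (x, vdup_n_u8 (0)),
                                 vdupq_n_u8 (0));
  return vreinterpretq_u16_u8 (zipped);
}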
new file mode 100644
@@ -0,0 +1,25 @@
+/* { dg-options "-O" } */
+
+#include <arm_neon.h>
+
+void test()
+{
+ while (1)
+ {
+ static const uint16_t jsimd_rgb_ycc_neon_consts[] = {19595, 0, 0, 0, 0, 0, 0, 0};
+ uint16x8_t consts = vld1q_u16(jsimd_rgb_ycc_neon_consts);
+
+ uint8_t tmp_buf[0];
+ uint8x8x3_t input_pixels = vld3_u8(tmp_buf);
+ uint16x8_t r = vmovl_u8(input_pixels.val[1]);
+ uint32x4_t y_l = vmull_laneq_u16(vget_low_u16(r), consts, 0);
+
+ uint32x4_t s = vdupq_n_u32(1);
+ uint16x4_t a = vrshrn_n_u32(s, 16);
+ uint16x4_t y = vrshrn_n_u32(y_l, 16);
+ uint16x8_t ay = vcombine_u16(a, y);
+
+ unsigned char ***out_buf;
+ vst1_u8(out_buf[1][0], vmovn_u16(ay));
+ }
+}
new file mode 100644
@@ -0,0 +1,40 @@
+/* { dg-options "-O2" } */
+
+#pragma GCC aarch64 "arm_neon.h"
+typedef __Uint8x8_t uint8x8_t;
+typedef __Uint16x4_t uint16x4_t;
+typedef __Int16x8_t int16x8_t;
+typedef __Uint16x8_t uint16x8_t;
+int jsimd_extbgrx_ycc_convert_neon_image_width,
+ jsimd_extbgrx_ycc_convert_neon___trans_tmp_1;
+uint16x4_t jsimd_extbgrx_ycc_convert_neon___trans_tmp_2;
+uint16x8_t vcombine_u16();
+uint16x8_t vmovl_u8(uint8x8_t __a) {
+ return __builtin_aarch64_uxtlv8hi_uu(__a);
+}
+__inline int __attribute__((__gnu_inline__)) vmull_laneq_u16();
+uint8x8x4_t vld4_u8();
+void jsimd_extbgrx_ycc_convert_neon() {
+ int scaled_128_5 = jsimd_extbgrx_ycc_convert_neon___trans_tmp_1,
+ cols_remaining = jsimd_extbgrx_ycc_convert_neon_image_width;
+ for (;;)
+ if (cols_remaining) {
+ uint8x8x4_t input_pixels = vld4_u8();
+ uint16x8_t r = vmovl_u8(input_pixels.val[2]);
+ uint16x8_t g = vmovl_u8(input_pixels.val[1]);
+ uint16x8_t b = vmovl_u8(input_pixels.val[0]);
+ int y_l = vmull_laneq_u16(r);
+ uint16x8_t __a = g;
+ jsimd_extbgrx_ycc_convert_neon___trans_tmp_2 =
+ (uint16x4_t)__builtin_aarch64_get_lowv8hi((int16x8_t)__a);
+ __a = b;
+ int cb_l = scaled_128_5;
+ int cb_h = scaled_128_5;
+ int cr_l = scaled_128_5;
+ int cr_h = scaled_128_5;
+ uint16x8_t y_u16 = vcombine_u16(y_l);
+ uint16x8_t cb_u16 = vcombine_u16(cb_l, cb_h);
+ uint16x8_t cr_u16 = vcombine_u16(cr_l, cr_h);
+ __a = y_u16 = cb_u16 = cr_u16;
+ }
+}