@@ -780,7 +780,7 @@ typedef struct
AARCH64_SIMD_BUILTIN_##T##_##N##A,
#undef ENTRY
-#define ENTRY(N, S, M0, M1, M2, M3, U) \
+#define ENTRY(N, S, M0, M1, M2, M3, M4, U) \
AARCH64_##N,
enum aarch64_builtins
@@ -1593,6 +1593,7 @@ enum class aarch64_builtin_signatures
binary,
binary_fpm,
ternary_fpm,
+ ternary_fpm_lane,
unary_fpm,
};
@@ -1643,10 +1644,10 @@ namespace simd_types {
}
#undef ENTRY
-#define ENTRY(N, S, T0, T1, T2, T3, U) \
+#define ENTRY(N, S, T0, T1, T2, T3, T4, U) \
{#N, aarch64_builtin_signatures::S, simd_types::T0, simd_types::T1, \
- simd_types::T2, simd_types::T3, U, \
- aarch64_required_extensions::REQUIRED_EXTENSIONS},
+ simd_types::T2, simd_types::T3, simd_types::T4, U, \
+ aarch64_required_extensions::REQUIRED_EXTENSIONS},
/* Initialize pragma builtins. */
@@ -1654,7 +1655,7 @@ struct aarch64_pragma_builtins_data
{
const char *name;
aarch64_builtin_signatures signature;
- simd_type types[4];
+ simd_type types[5];
int unspec;
aarch64_required_extensions required_extensions;
};
@@ -1667,6 +1668,7 @@ static tree
aarch64_fntype (const aarch64_pragma_builtins_data &builtin_data)
{
tree type0, type1, type2, type3;
+ tree immtype = aarch64_simd_builtin_type (SImode, qualifier_lane_index);
switch (builtin_data.signature)
{
@@ -1701,6 +1703,18 @@ aarch64_fntype (const aarch64_pragma_builtins_data &builtin_data)
return build_function_type_list (type0, type1, type2, type3,
uint64_type_node, NULL_TREE);
+ case aarch64_builtin_signatures::ternary_fpm_lane:
+ type0 = aarch64_simd_builtin_type (builtin_data.types[0].mode,
+ builtin_data.types[0].qualifiers);
+ type1 = aarch64_simd_builtin_type (builtin_data.types[1].mode,
+ builtin_data.types[1].qualifiers);
+ type2 = aarch64_simd_builtin_type (builtin_data.types[2].mode,
+ builtin_data.types[2].qualifiers);
+ type3 = aarch64_simd_builtin_type (builtin_data.types[3].mode,
+ builtin_data.types[3].qualifiers);
+ return build_function_type_list (type0, type1, type2, type3, immtype,
+ uint64_type_node, NULL_TREE);
+
case aarch64_builtin_signatures::unary_fpm:
type0 = aarch64_simd_builtin_type (builtin_data.types[0].mode,
builtin_data.types[0].qualifiers);
@@ -2519,6 +2533,80 @@ aarch64_general_required_extensions (unsigned int code)
return ext::streaming_compatible (0);
}
+namespace function_checker {
+
+/* Report an error unless ARG is an integer constant; return true if so.  */
+bool
+require_integer_constant (location_t location, tree arg)
+{
+  if (TREE_CODE (arg) != INTEGER_CST)
+    {
+      error_at (location, "argument must be an integer constant expression");
+      return false;
+    }
+  return true;
+}
+
+/* Report an error unless constant ARG is in the range [MIN, MAX].  */
+void
+require_immediate_range (location_t location, tree arg, HOST_WIDE_INT min,
+			 HOST_WIDE_INT max)
+{
+  if (wi::to_widest (arg) < min || wi::to_widest (arg) > max)
+    error_at (location, "lane out of range %wd - %wd", min, max);
+}
+
+/* Validates indexing into a vector using the index's size and the instruction,
+   where instruction is represented by the unspec.  This only works for
+   intrinsics declared using pragmas in aarch64-simd-pragma-builtins.def.  */
+void
+check_simd_lane_bounds (location_t location, const aarch64_pragma_builtins_data
+			*builtin_data, tree *args)
+{
+  if (builtin_data == NULL)
+    /* Don't check for functions that are not declared in
+       aarch64-simd-pragma-builtins.def.  */
+    return;
+
+  switch (builtin_data->signature)
+    {
+    case aarch64_builtin_signatures::ternary_fpm_lane:
+      {
+	auto index_arg = args[3];
+	/* wi::to_widest would ICE on a non-constant index, so stop early.  */
+	if (!require_integer_constant (location, index_arg))
+	  return;
+
+	auto vector_to_index_mode = builtin_data->types[3].mode;
+	int vector_to_index_mode_size
+	  = GET_MODE_NUNITS (vector_to_index_mode).to_constant ();
+
+	switch (builtin_data->unspec)
+	  {
+	  case UNSPEC_VDOT2:
+	    require_immediate_range (location, index_arg, 0,
+				     vector_to_index_mode_size / 2 - 1);
+	    break;
+
+	  case UNSPEC_VDOT4:
+	    require_immediate_range (location, index_arg, 0,
+				     vector_to_index_mode_size / 4 - 1);
+	    break;
+
+	  default:
+	    gcc_unreachable ();
+	  }
+	break;
+      }
+
+    default:
+      /* Other signatures don't have lanes; nothing to check.  */
+      break;
+    }
+}
+
+}
+
bool
aarch64_general_check_builtin_call (location_t location, vec<location_t>,
unsigned int code, tree fndecl,
@@ -2530,6 +2618,9 @@ aarch64_general_check_builtin_call (location_t location, vec<location_t>,
if (!aarch64_check_required_extensions (location, decl, required_extensions))
return false;
+ auto builtin_data = aarch64_get_pragma_builtin (code);
+ function_checker::check_simd_lane_bounds (location, builtin_data, args);
+
switch (code)
{
case AARCH64_RSR:
@@ -3425,7 +3516,8 @@ aarch64_expand_pragma_builtin (tree exp, rtx target,
const aarch64_pragma_builtins_data *builtin_data)
{
auto unspec = builtin_data->unspec;
- expand_operand ops[4];
+ expand_operand ops[5];
+ insn_code icode;
switch (builtin_data->signature)
{
@@ -3445,6 +3537,40 @@ aarch64_expand_pragma_builtin (tree exp, rtx target,
break;
}
+    case aarch64_builtin_signatures::ternary_fpm_lane:
+      {
+	auto input1 = expand_normal (CALL_EXPR_ARG (exp, 0));
+	auto input2 = expand_normal (CALL_EXPR_ARG (exp, 1));
+	auto input3 = expand_normal (CALL_EXPR_ARG (exp, 2));
+	auto index = expand_normal (CALL_EXPR_ARG (exp, 3));
+	auto fpm_input = expand_normal (CALL_EXPR_ARG (exp, 4));
+
+	/* check_simd_lane_bounds has already rejected non-constant
+	   indices; this is a defensive re-check.  */
+	if (!CONST_INT_P (index))
+	  {
+	    error_at (EXPR_LOCATION (exp),
+		      "argument should have been a constant");
+	    break;
+	  }
+
+	auto fpmr = gen_rtx_REG (DImode, FPM_REGNUM);
+	emit_move_insn (fpmr, fpm_input);
+
+	create_output_operand (&ops[0], target, builtin_data->types[0].mode);
+	create_input_operand (&ops[1], input1, builtin_data->types[1].mode);
+	create_input_operand (&ops[2], input2, builtin_data->types[2].mode);
+	create_input_operand (&ops[3], input3, builtin_data->types[3].mode);
+	/* The lane number is a const_int_operand immediate.  */
+	create_integer_operand (&ops[4], INTVAL (index));
+
+	icode = code_for_aarch64 (unspec, builtin_data->types[0].mode,
+				  builtin_data->types[1].mode,
+				  builtin_data->types[2].mode,
+				  builtin_data->types[3].mode, SImode);
+	expand_insn (icode, 5, ops);
+	break;
+      }
case aarch64_builtin_signatures::binary_fpm:
{
auto input1 = expand_normal (CALL_EXPR_ARG (exp, 0));
@@ -259,6 +259,10 @@ aarch64_update_cpp_builtins (cpp_reader *pfile)
aarch64_def_or_undef (TARGET_FP8, "__ARM_FEATURE_FP8", pfile);
+ aarch64_def_or_undef (TARGET_FP8DOT2, "__ARM_FEATURE_FP8DOT2", pfile);
+
+ aarch64_def_or_undef (TARGET_FP8DOT4, "__ARM_FEATURE_FP8DOT4", pfile);
+
aarch64_def_or_undef (TARGET_LS64,
"__ARM_FEATURE_LS64", pfile);
aarch64_def_or_undef (TARGET_RCPC, "__ARM_FEATURE_RCPC", pfile);
@@ -234,6 +234,10 @@ AARCH64_OPT_EXTENSION("gcs", GCS, (), (), (), "gcs")
AARCH64_OPT_EXTENSION("fp8", FP8, (SIMD), (), (), "fp8")
+AARCH64_OPT_EXTENSION("fp8dot2", FP8DOT2, (FP8), (), (), "fp8dot2")
+
+AARCH64_OPT_EXTENSION("fp8dot4", FP8DOT4, (FP8), (), (), "fp8dot4")
+
AARCH64_OPT_EXTENSION("faminmax", FAMINMAX, (SIMD), (), (), "faminmax")
#undef AARCH64_OPT_FMV_EXTENSION
@@ -21,17 +21,36 @@
#undef ENTRY_BINARY
#define ENTRY_BINARY(N, S, T0, T1, T2, U) \
- ENTRY (N, S, T0, T1, T2, none, U)
+ ENTRY (N, S, T0, T1, T2, none, none, U)
#undef ENTRY_BINARY_FPM
#define ENTRY_BINARY_FPM(N, S, T0, T1, T2, U) \
- ENTRY (N, S, T0, T1, T2, none, U)
+ ENTRY (N, S, T0, T1, T2, none, none, U)
#define ENTRY_TERNARY_FPM(N, S, T0, T1, T2, T3, U) \
- ENTRY (N, S, T0, T1, T2, T3, U)
-
+ ENTRY (N, S, T0, T1, T2, T3, none, U)
+
+#undef ENTRY_TERNARY_FPM_LANE
+#define ENTRY_TERNARY_FPM_LANE(N, S, T0, T1, T2, T3, U) \
+ ENTRY (N, S, T0, T1, T2, T3, none, U)
+
+#undef ENTRY_VDOT_FPM
+#define ENTRY_VDOT_FPM(T, U) \
+ ENTRY_TERNARY_FPM (vdot_##T##_mf8_fpm, ternary_fpm, T, T, f8, f8, U) \
+ ENTRY_TERNARY_FPM (vdotq_##T##_mf8_fpm, ternary_fpm, T##q, T##q, f8q, f8q, \
+ U) \
+ ENTRY_TERNARY_FPM_LANE (vdot_lane_##T##_mf8_fpm, ternary_fpm_lane, T, T, \
+ f8, f8, U) \
+ ENTRY_TERNARY_FPM_LANE (vdot_laneq_##T##_mf8_fpm, ternary_fpm_lane, T, T, \
+ f8, f8q, U) \
+ ENTRY_TERNARY_FPM_LANE (vdotq_lane_##T##_mf8_fpm, ternary_fpm_lane, T##q, \
+ T##q, f8q, f8, U) \
+ ENTRY_TERNARY_FPM_LANE (vdotq_laneq_##T##_mf8_fpm, ternary_fpm_lane, T##q, \
+ T##q, f8q, f8q, U)
+
+#undef ENTRY_UNARY_FPM
#define ENTRY_UNARY_FPM(N, S, T0, T1, U) \
- ENTRY (N, S, T0, T1, none, none, U)
+ ENTRY (N, S, T0, T1, none, none, none, U)
#undef ENTRY_VHSDF
#define ENTRY_VHSDF(NAME, SIGNATURE, UNSPEC) \
@@ -92,3 +111,13 @@ ENTRY_TERNARY_FPM (vcvt_high_mf8_f32_fpm, ternary_fpm, f8q, f8, f32q, f32q, \
#define REQUIRED_EXTENSIONS nonstreaming_only (AARCH64_FL_FP8)
ENTRY_VHSDF_VHSDI (vscale, binary, UNSPEC_FSCALE)
#undef REQUIRED_EXTENSIONS
+
+// fpm dot product
+#define REQUIRED_EXTENSIONS nonstreaming_only (AARCH64_FL_FP8DOT2)
+ENTRY_VDOT_FPM (f16, UNSPEC_VDOT2)
+#undef REQUIRED_EXTENSIONS
+
+// fpm dot4 product
+#define REQUIRED_EXTENSIONS nonstreaming_only (AARCH64_FL_FP8DOT4)
+ENTRY_VDOT_FPM (f32, UNSPEC_VDOT4)
+#undef REQUIRED_EXTENSIONS
@@ -10067,3 +10067,61 @@
"TARGET_FP8"
"<fpm_uns_op>\t%0.<VHSDF:Vtype>, %1.<VHSDF:Vtype>, %2.<VHSDI:Vtype>"
)
+
+;; fpm vdot2 instructions; fdot accumulates, so operand 1 is tied to the
+(define_insn
+  "@aarch64_<fpm_uns_op><VHF:mode><VHF:mode><VB:mode><VB:mode>"
+  [(set (match_operand:VHF 0 "register_operand" "=w")
+	(unspec:VHF
+	  [(match_operand:VHF 1 "register_operand" "0")
+	   (match_operand:VB 2 "register_operand" "w")
+	   (match_operand:VB 3 "register_operand" "w")
+	   (reg:DI FPM_REGNUM)]
+	  FPM_VDOT2_UNS))]
+  "TARGET_FP8DOT2"
+  "<fpm_uns_op>\t%0.<VHF:Vtype>, %2.<VB:Vtype>, %3.<VB:Vtype>"
+)
+
+;; fpm vdot2 instructions with lane; the accumulator is tied to the output.
+(define_insn
+  "@aarch64_<fpm_uns_op><VHF:mode><VHF:mode><VB:mode><VB2:mode><SI_ONLY:mode>"
+  [(set (match_operand:VHF 0 "register_operand" "=w")
+	(unspec:VHF
+	  [(match_operand:VHF 1 "register_operand" "0")
+	   (match_operand:VB 2 "register_operand" "w")
+	   (match_operand:VB2 3 "register_operand" "w")
+	   (match_operand:SI_ONLY 4 "const_int_operand" "n")
+	   (reg:DI FPM_REGNUM)]
+	  FPM_VDOT2_UNS))]
+  "TARGET_FP8DOT2"
+  "<fpm_uns_op>\t%0.<VHF:Vtype>, %2.<VB:Vtype>, %3.<VHF:Vdotlanetype>[%4]"
+)
+
+;; fpm vdot4 instructions; the accumulator is tied to the output.
+(define_insn
+  "@aarch64_<fpm_uns_op><VDQSF:mode><VDQSF:mode><VB:mode><VB:mode>"
+  [(set (match_operand:VDQSF 0 "register_operand" "=w")
+	(unspec:VDQSF
+	  [(match_operand:VDQSF 1 "register_operand" "0")
+	   (match_operand:VB 2 "register_operand" "w")
+	   (match_operand:VB 3 "register_operand" "w")
+	   (reg:DI FPM_REGNUM)]
+	  FPM_VDOT4_UNS))]
+  "TARGET_FP8DOT4"
+  "<fpm_uns_op>\t%0.<VDQSF:Vtype>, %2.<VB:Vtype>, %3.<VB:Vtype>"
+)
+
+;; fpm vdot4 instructions with lane; the accumulator is tied to the output.
+(define_insn
+  "@aarch64_<fpm_uns_op><VDQSF:mode><VDQSF:mode><VB:mode><VB2:mode><SI_ONLY:mode>"
+  [(set (match_operand:VDQSF 0 "register_operand" "=w")
+	(unspec:VDQSF
+	  [(match_operand:VDQSF 1 "register_operand" "0")
+	   (match_operand:VB 2 "register_operand" "w")
+	   (match_operand:VB2 3 "register_operand" "w")
+	   (match_operand:SI_ONLY 4 "const_int_operand" "n")
+	   (reg:DI FPM_REGNUM)]
+	  FPM_VDOT4_UNS))]
+  "TARGET_FP8DOT4"
+  "<fpm_uns_op>\t%0.<VDQSF:Vtype>, %2.<VB:Vtype>, %3.<VDQSF:Vdotlanetype>[%4]"
+)
@@ -483,6 +483,12 @@ constexpr auto AARCH64_FL_DEFAULT_ISA_MODE ATTRIBUTE_UNUSED
/* fp8 instructions are enabled through +fp8. */
#define TARGET_FP8 AARCH64_HAVE_ISA (FP8)
+/* fp8 dot product instructions are enabled through +fp8dot2. */
+#define TARGET_FP8DOT2 AARCH64_HAVE_ISA (FP8DOT2)
+
+/* fp8 dot product instructions are enabled through +fp8dot4. */
+#define TARGET_FP8DOT4 AARCH64_HAVE_ISA (FP8DOT4)
+
/* Standard register usage. */
/* 31 64-bit general purpose registers R0-R30:
@@ -163,6 +163,10 @@
;; Advanced SIMD Float modes.
(define_mode_iterator VDQF [V2SF V4SF V2DF])
+
+(define_mode_iterator VHF [(V4HF "TARGET_SIMD_F16INST")
+ (V8HF "TARGET_SIMD_F16INST")])
+
(define_mode_iterator VHSDF [(V4HF "TARGET_SIMD_F16INST")
(V8HF "TARGET_SIMD_F16INST")
V2SF V4SF V2DF])
@@ -321,6 +325,7 @@
;; All byte modes.
(define_mode_iterator VB [V8QI V16QI])
+(define_mode_iterator VB2 [VB])
;; 1 and 2 lane DI and DF modes.
(define_mode_iterator V12DIF [V1DI V1DF V2DI V2DF])
@@ -766,6 +771,8 @@
UNSPEC_VCVT2_HIGH_F16 ; Used in aarch64-simd.md.
UNSPEC_VCVT2_LOW_BF16 ; Used in aarch64-simd.md.
UNSPEC_VCVT2_LOW_F16 ; Used in aarch64-simd.md.
+ UNSPEC_VDOT2 ; Used in aarch64-simd.md.
+ UNSPEC_VDOT4 ; Used in aarch64-simd.md.
UNSPEC_TBL ; Used in vector permute patterns.
UNSPEC_TBX ; Used in vector permute patterns.
UNSPEC_CONCAT ; Used in vector permute patterns.
@@ -2427,6 +2434,11 @@
(VNx8HF ".h") (VNx16HF "") (VNx32HF "")
(VNx8HI ".h") (VNx16HI "") (VNx32HI "")])
+
+;; Lane index suffix for fp8 vdot operations depends on the output mode
+(define_mode_attr Vdotlanetype [(V4HF "2b") (V8HF "2b")
+ (V2SF "4b") (V4SF "4b")])
+
;; The number of bytes controlled by a predicate
(define_mode_attr data_bytes [(VNx16BI "1") (VNx8BI "2")
(VNx4BI "4") (VNx2BI "8")])
@@ -4597,6 +4609,10 @@
(define_int_iterator FPM_TERNARY_VCVT_UNS [UNSPEC_VCVT_HIGH_F32])
+(define_int_iterator FPM_VDOT2_UNS [UNSPEC_VDOT2])
+
+(define_int_iterator FPM_VDOT4_UNS [UNSPEC_VDOT4])
+
(define_int_attr fpm_uns_op
[(UNSPEC_FSCALE "fscale")
(UNSPEC_VCVT_F16 "fcvtn")
@@ -4614,7 +4630,9 @@
(UNSPEC_VCVT2_HIGH_BF16 "bf2cvtl2")
(UNSPEC_VCVT2_HIGH_F16 "f2cvtl2")
(UNSPEC_VCVT2_LOW_BF16 "bf2cvtl")
- (UNSPEC_VCVT2_LOW_F16 "f2cvtl")])
+ (UNSPEC_VCVT2_LOW_F16 "f2cvtl")
+ (UNSPEC_VDOT2 "fdot")
+ (UNSPEC_VDOT4 "fdot")])
(define_int_attr fpm_uns_name
[(UNSPEC_VCVT_F16 "vcvt_mf8_f16_fpm")
@@ -21805,6 +21805,10 @@ Enable support for Armv8.9-a/9.4-a translation hardening extension.
Enable the RCpc3 (Release Consistency) extension.
@item fp8
Enable the fp8 (8-bit floating point) extension.
+@item fp8dot2
+Enable the fp8dot2 (8-bit floating point dot product to half-precision)
+extension.
+@item fp8dot4
+Enable the fp8dot4 (8-bit floating point dot product to single-precision)
+extension.
@item faminmax
Enable the Floating Point Absolute Maximum/Minimum extension.
new file mode 100644
@@ -0,0 +1,77 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -march=armv9-a+fp8dot2" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#include "arm_neon.h"
+
+/*
+** test_vdot_f16_fpm:
+** msr fpmr, x0
+** fdot v0.4h, v1.8b, v2.8b
+** ret
+*/
+float16x4_t
+test_vdot_f16_fpm (float16x4_t a, mfloat8x8_t b, mfloat8x8_t c, fpm_t d)
+{
+ return vdot_f16_mf8_fpm (a, b, c, d);
+}
+
+/*
+** test_vdotq_f16_fpm:
+** msr fpmr, x0
+** fdot v0.8h, v1.16b, v2.16b
+** ret
+*/
+float16x8_t
+test_vdotq_f16_fpm (float16x8_t a, mfloat8x16_t b, mfloat8x16_t c, fpm_t d)
+{
+ return vdotq_f16_mf8_fpm (a, b, c, d);
+}
+
+/*
+** test_vdot_lane_f16_fpm:
+** msr fpmr, x0
+** fdot v0.4h, v1.8b, v2.2b\[1\]
+** ret
+*/
+float16x4_t
+test_vdot_lane_f16_fpm (float16x4_t a, mfloat8x8_t b, mfloat8x8_t c, fpm_t d)
+{
+ return vdot_lane_f16_mf8_fpm (a, b, c, 1, d);
+}
+
+/*
+** test_vdot_laneq_f16_fpm:
+** msr fpmr, x0
+** fdot v0.4h, v1.8b, v2.2b\[1\]
+** ret
+*/
+float16x4_t
+test_vdot_laneq_f16_fpm (float16x4_t a, mfloat8x8_t b, mfloat8x16_t c, fpm_t d)
+{
+ return vdot_laneq_f16_mf8_fpm (a, b, c, 1, d);
+}
+
+/*
+** test_vdotq_lane_f16_fpm:
+** msr fpmr, x0
+** fdot v0.8h, v1.16b, v2.2b\[1\]
+** ret
+*/
+float16x8_t
+test_vdotq_lane_f16_fpm (float16x8_t a, mfloat8x16_t b, mfloat8x8_t c, fpm_t d)
+{
+ return vdotq_lane_f16_mf8_fpm (a, b, c, 1, d);
+}
+
+/*
+** test_vdotq_laneq_f16_fpm:
+** msr fpmr, x0
+** fdot v0.8h, v1.16b, v2.2b\[1\]
+** ret
+*/
+float16x8_t
+test_vdotq_laneq_f16_fpm (float16x8_t a, mfloat8x16_t b, mfloat8x16_t c, fpm_t d)
+{
+ return vdotq_laneq_f16_mf8_fpm (a, b, c, 1, d);
+}
new file mode 100644
@@ -0,0 +1,77 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -march=armv9-a+fp8dot4" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#include "arm_neon.h"
+
+/*
+** test_vdot_f32_fpm:
+** msr fpmr, x0
+** fdot v0.2s, v1.8b, v2.8b
+** ret
+*/
+float32x2_t
+test_vdot_f32_fpm (float32x2_t a, mfloat8x8_t b, mfloat8x8_t c, fpm_t d)
+{
+ return vdot_f32_mf8_fpm (a, b, c, d);
+}
+
+/*
+** test_vdotq_f32_fpm:
+** msr fpmr, x0
+** fdot v0.4s, v1.16b, v2.16b
+** ret
+*/
+float32x4_t
+test_vdotq_f32_fpm (float32x4_t a, mfloat8x16_t b, mfloat8x16_t c, fpm_t d)
+{
+ return vdotq_f32_mf8_fpm (a, b, c, d);
+}
+
+/*
+** test_vdot_lane_f32_fpm:
+** msr fpmr, x0
+** fdot v0.2s, v1.8b, v2.4b\[1\]
+** ret
+*/
+float32x2_t
+test_vdot_lane_f32_fpm (float32x2_t a, mfloat8x8_t b, mfloat8x8_t c, fpm_t d)
+{
+ return vdot_lane_f32_mf8_fpm (a, b, c, 1, d);
+}
+
+/*
+** test_vdot_laneq_f32_fpm:
+** msr fpmr, x0
+** fdot v0.2s, v1.8b, v2.4b\[1\]
+** ret
+*/
+float32x2_t
+test_vdot_laneq_f32_fpm (float32x2_t a, mfloat8x8_t b, mfloat8x16_t c, fpm_t d)
+{
+ return vdot_laneq_f32_mf8_fpm (a, b, c, 1, d);
+}
+
+/*
+** test_vdotq_lane_f32_fpm:
+** msr fpmr, x0
+** fdot v0.4s, v1.16b, v2.4b\[1\]
+** ret
+*/
+float32x4_t
+test_vdotq_lane_f32_fpm (float32x4_t a, mfloat8x16_t b, mfloat8x8_t c, fpm_t d)
+{
+ return vdotq_lane_f32_mf8_fpm (a, b, c, 1, d);
+}
+
+/*
+** test_vdotq_laneq_f32_fpm:
+** msr fpmr, x0
+** fdot v0.4s, v1.16b, v2.4b\[1\]
+** ret
+*/
+float32x4_t
+test_vdotq_laneq_f32_fpm (float32x4_t a, mfloat8x16_t b, mfloat8x16_t c, fpm_t d)
+{
+ return vdotq_laneq_f32_mf8_fpm (a, b, c, 1, d);
+}