[2/3] aarch64: Add support for fp8dot2 and fp8dot4

Message ID 20241106100358.3622028-3-saurabh.jha@arm.com
Series aarch64: Add fp8, fp8dot2, fp8dot4, and fp8fma acle

Commit Message

Saurabh Jha Nov. 6, 2024, 10:03 a.m. UTC
The AArch64 FEAT_FP8DOT2 and FEAT_FP8DOT4 extensions introduce 2-way
and 4-way dot product instructions for vectors of 8-bit floating-point
values.

This patch introduces the following intrinsics (see the usage sketch
below):
1. vdot{q}_{f16|f32}_mf8_fpm.
2. vdot{q}_lane{q}_{f16|f32}_mf8_fpm.
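
For reference, here is a minimal usage sketch of the plain and lane
forms, mirroring the new tests (compile with, for example,
-march=armv9-a+fp8dot2+fp8dot4):

  #include <arm_neon.h>

  /* 2-way dot product: pairs of fp8 elements accumulate into the
     fp16 lanes of the accumulator.  */
  float16x4_t
  dot2 (float16x4_t acc, mfloat8x8_t a, mfloat8x8_t b, fpm_t fpm)
  {
    return vdot_f16_mf8_fpm (acc, a, b, fpm);
  }

  /* 4-way lane form: the selected group of four fp8 elements of b is
     used against each group of a, accumulating into fp32 lanes.  */
  float32x4_t
  dot4_lane (float32x4_t acc, mfloat8x16_t a, mfloat8x16_t b, fpm_t fpm)
  {
    return vdotq_laneq_f32_mf8_fpm (acc, a, b, 1, fpm);
  }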

It introduces two architecture extension flags: fp8dot2 and fp8dot4.

We had to add space for another type in the aarch64_pragma_builtins_data
struct, and the macros were updated to reflect that.

We added a new aarch64_builtin_signatures variant, ternary_fpm_lane,
and added support for it in the declaration of types and in the
expansion to RTL.
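
For the lane intrinsics this gives prototypes of roughly the following
shape (parameter names are illustrative); the fpm_t argument is
materialised as a write to FPMR before the fdot:

  float16x4_t vdot_lane_f16_mf8_fpm (float16x4_t vd, mfloat8x8_t vn,
                                     mfloat8x8_t vm, const int lane,
                                     fpm_t fpm);
  float32x4_t vdotq_laneq_f32_mf8_fpm (float32x4_t vd, mfloat8x16_t vn,
                                       mfloat8x16_t vm, const int lane,
                                       fpm_t fpm);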

We added a new namespace, function_checker, to implement range checks
for functions defined using the new pragma approach. The old intrinsic
range checks should remain unaffected. All the new AdvSIMD intrinsics
we define that need lane checks should use the functions in this
namespace to implement them.
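
The accepted bounds are derived from the byte count of the indexed
vector; a minimal sketch of the arithmetic used in
check_simd_lane_bounds:

  /* Valid lane ranges enforced by check_simd_lane_bounds: the index
     selects a pair of bytes (2-way dot) or a group of four bytes
     (4-way dot) in the indexed fp8 vector.  */
  static inline int max_dot2_lane (int nbytes) { return nbytes / 2 - 1; }
  static inline int max_dot4_lane (int nbytes) { return nbytes / 4 - 1; }
  /* nbytes == 8  (mfloat8x8_t):  dot2 lanes 0..3, dot4 lanes 0..1.
     nbytes == 16 (mfloat8x16_t): dot2 lanes 0..7, dot4 lanes 0..3.  */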

gcc/ChangeLog:

	* config/aarch64/aarch64-builtins.cc
	(ENTRY): Change to handle extra type.
	(aarch64_builtin_signatures): Add new variant.
	(struct aarch64_pragma_builtins_data): Add support for another
	type.
	(aarch64_fntype): Handle new signature.
	(require_integer_constant): New function to check whether the
	operand is an integer constant.
	(require_immediate_range): New function to validate index
	ranges.
	(check_simd_lane_bounds): New function to validate index
	operands.
	(aarch64_expand_pragma_builtin): Handle new signature.
	* config/aarch64/aarch64-c.cc
	(aarch64_update_cpp_builtins): Define __ARM_FEATURE_FP8DOT2 and
	__ARM_FEATURE_FP8DOT4.
	* config/aarch64/aarch64-option-extensions.def
	(AARCH64_OPT_EXTENSION): New flags.
	* config/aarch64/aarch64-simd-pragma-builtins.def
	(ENTRY_BINARY):	Change to handle extra type.
	(ENTRY_BINARY_FPM): Change to handle extra type.
	(ENTRY_TERNARY_FPM_LANE): New macro to declare fpm ternary
	intrinsics with a lane index.
	(ENTRY_VDOT_FPM): Change to handle extra type.
	(ENTRY_UNARY_FPM): Change to handle extra type.
	* config/aarch64/aarch64-simd.md: New instruction pattern for
	fp8dot2 and fp8dot4 instructions.
	* config/aarch64/aarch64.h
	(TARGET_FP8DOT2): New flag for fp8dot2 instructions.
	(TARGET_FP8DOT4): New flag for fp8dot4 instructions.
	* config/aarch64/iterators.md: New attributes and iterators.
	* doc/invoke.texi: Document the new fp8dot2 and fp8dot4 flags.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/simd/vdot2_fpmdot.c: New test.
	* gcc.target/aarch64/simd/vdot4_fpmdot.c: New test.

	---

	Is there a better way to validate indices?
---
 gcc/config/aarch64/aarch64-builtins.cc        | 138 +++++++++++++++++-
 gcc/config/aarch64/aarch64-c.cc               |   4 +
 .../aarch64/aarch64-option-extensions.def     |   4 +
 .../aarch64/aarch64-simd-pragma-builtins.def  |  39 ++++-
 gcc/config/aarch64/aarch64-simd.md            |  58 ++++++++
 gcc/config/aarch64/aarch64.h                  |   6 +
 gcc/config/aarch64/iterators.md               |  20 ++-
 gcc/doc/invoke.texi                           |   4 +
 .../gcc.target/aarch64/simd/vdot2_fpmdot.c    |  77 ++++++++++
 .../gcc.target/aarch64/simd/vdot4_fpmdot.c    |  77 ++++++++++
 10 files changed, 415 insertions(+), 12 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/vdot2_fpmdot.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/vdot4_fpmdot.c

Patch

diff --git a/gcc/config/aarch64/aarch64-builtins.cc b/gcc/config/aarch64/aarch64-builtins.cc
index df19bff71d0..ba3bffaa4f9 100644
--- a/gcc/config/aarch64/aarch64-builtins.cc
+++ b/gcc/config/aarch64/aarch64-builtins.cc
@@ -780,7 +780,7 @@  typedef struct
   AARCH64_SIMD_BUILTIN_##T##_##N##A,
 
 #undef ENTRY
-#define ENTRY(N, S, M0, M1, M2, M3, U) \
+#define ENTRY(N, S, M0, M1, M2, M3, M4, U)	\
   AARCH64_##N,
 
 enum aarch64_builtins
@@ -1593,6 +1593,7 @@  enum class aarch64_builtin_signatures
   binary,
   binary_fpm,
   ternary_fpm,
+  ternary_fpm_lane,
   unary_fpm,
 };
 
@@ -1643,10 +1644,10 @@  namespace simd_types {
 }
 
 #undef ENTRY
-#define ENTRY(N, S, T0, T1, T2, T3, U) \
+#define ENTRY(N, S, T0, T1, T2, T3, T4, U)			      \
   {#N, aarch64_builtin_signatures::S, simd_types::T0, simd_types::T1, \
-    simd_types::T2, simd_types::T3, U, \
-    aarch64_required_extensions::REQUIRED_EXTENSIONS},
+      simd_types::T2, simd_types::T3, simd_types::T4, U,	      \
+      aarch64_required_extensions::REQUIRED_EXTENSIONS},
 
 /* Initialize pragma builtins.  */
 
@@ -1654,7 +1655,7 @@  struct aarch64_pragma_builtins_data
 {
   const char *name;
   aarch64_builtin_signatures signature;
-  simd_type types[4];
+  simd_type types[5];
   int unspec;
   aarch64_required_extensions required_extensions;
 };
@@ -1667,6 +1668,7 @@  static tree
 aarch64_fntype (const aarch64_pragma_builtins_data &builtin_data)
 {
   tree type0, type1, type2, type3;
+  tree immtype = aarch64_simd_builtin_type (SImode, qualifier_lane_index);
 
   switch (builtin_data.signature)
     {
@@ -1701,6 +1703,18 @@  aarch64_fntype (const aarch64_pragma_builtins_data &builtin_data)
       return build_function_type_list (type0, type1, type2, type3,
 				       uint64_type_node, NULL_TREE);
 
+    case aarch64_builtin_signatures::ternary_fpm_lane:
+      type0 = aarch64_simd_builtin_type (builtin_data.types[0].mode,
+					 builtin_data.types[0].qualifiers);
+      type1 = aarch64_simd_builtin_type (builtin_data.types[1].mode,
+					 builtin_data.types[1].qualifiers);
+      type2 = aarch64_simd_builtin_type (builtin_data.types[2].mode,
+					 builtin_data.types[2].qualifiers);
+      type3 = aarch64_simd_builtin_type (builtin_data.types[3].mode,
+					 builtin_data.types[3].qualifiers);
+      return build_function_type_list (type0, type1, type2, type3, immtype,
+				       uint64_type_node, NULL_TREE);
+
     case aarch64_builtin_signatures::unary_fpm:
       type0 = aarch64_simd_builtin_type (builtin_data.types[0].mode,
 					 builtin_data.types[0].qualifiers);
@@ -2519,6 +2533,80 @@  aarch64_general_required_extensions (unsigned int code)
   return ext::streaming_compatible (0);
 }
 
+namespace function_checker {
+
+void
+require_integer_constant (location_t location, tree arg)
+{
+  if (TREE_CODE (arg) != INTEGER_CST)
+    {
+      error_at (location, "Constant-type integer argument expected");
+      return;
+    }
+}
+
+void
+require_immediate_range (location_t location, tree arg, HOST_WIDE_INT min,
+			 HOST_WIDE_INT max)
+{
+  if (wi::to_widest (arg) < min || wi::to_widest (arg) > max)
+    {
+      error_at (location, "lane out of range %wd - %wd", min, max);
+      return;
+    }
+}
+
+/* Validates indexing into a vector using the index's size and the instruction,
+   where instruction is represented by the unspec.
+   This only works for intrinsics declared using pragmas in
+   aarch64-simd-pragma-builtins.def.  */
+
+void
+check_simd_lane_bounds (location_t location, const aarch64_pragma_builtins_data
+			*builtin_data, tree *args)
+{
+  if (builtin_data == NULL)
+    // Don't check for functions that are not declared in
+    // aarch64-simd-pragma-builtins.def.
+    return;
+
+  switch (builtin_data->signature)
+    {
+    case aarch64_builtin_signatures::ternary_fpm_lane:
+      {
+	auto index_arg = args[3];
+	require_integer_constant (location, index_arg);
+
+	auto vector_to_index_mode = builtin_data->types[3].mode;
+	int vector_to_index_mode_size
+	  = GET_MODE_NUNITS (vector_to_index_mode).to_constant ();
+
+	switch (builtin_data->unspec)
+	  {
+	  case UNSPEC_VDOT2:
+	    require_immediate_range (location, index_arg, 0,
+				     vector_to_index_mode_size / 2 - 1);
+	    break;
+
+	  case UNSPEC_VDOT4:
+	    require_immediate_range (location, index_arg, 0,
+				     vector_to_index_mode_size / 4 - 1);
+	    break;
+
+	  default:
+	    gcc_unreachable ();
+	  }
+      }
+
+    default:
+      // Other signatures don't have lanes and this check doesn't apply to
+      // them.
+      return;
+    }
+}
+
+};
+
 bool
 aarch64_general_check_builtin_call (location_t location, vec<location_t>,
 				    unsigned int code, tree fndecl,
@@ -2530,6 +2618,9 @@  aarch64_general_check_builtin_call (location_t location, vec<location_t>,
   if (!aarch64_check_required_extensions (location, decl, required_extensions))
     return false;
 
+  auto builtin_data = aarch64_get_pragma_builtin (code);
+  function_checker::check_simd_lane_bounds (location, builtin_data, args);
+
   switch (code)
     {
     case AARCH64_RSR:
@@ -3425,7 +3516,8 @@  aarch64_expand_pragma_builtin (tree exp, rtx target,
 			       const aarch64_pragma_builtins_data *builtin_data)
 {
   auto unspec = builtin_data->unspec;
-  expand_operand ops[4];
+  expand_operand ops[5];
+  insn_code icode;
 
   switch (builtin_data->signature)
     {
@@ -3445,6 +3537,40 @@  aarch64_expand_pragma_builtin (tree exp, rtx target,
 	break;
       }
 
+    case aarch64_builtin_signatures::ternary_fpm_lane:
+      {
+	auto input1 = expand_normal (CALL_EXPR_ARG (exp, 0));
+	auto input2 = expand_normal (CALL_EXPR_ARG (exp, 1));
+	auto input3 = expand_normal (CALL_EXPR_ARG (exp, 2));
+	auto index = expand_normal (CALL_EXPR_ARG (exp, 3));
+	auto fpm_input  = expand_normal (CALL_EXPR_ARG (exp, 4));
+
+	if (!CONST_INT_P (index))
+	{
+	  error_at (EXPR_LOCATION (exp),
+		    "argument should have been a constant");
+	  break;
+	}
+
+	auto fpmr = gen_rtx_REG (DImode, FPM_REGNUM);
+	emit_move_insn (fpmr, fpm_input);
+
+	create_output_operand (&ops[0], target, builtin_data->types[0].mode);
+	create_input_operand (&ops[1], input1, builtin_data->types[1].mode);
+	create_input_operand (&ops[2], input2, builtin_data->types[2].mode);
+	create_input_operand (&ops[3], input3, builtin_data->types[3].mode);
+	create_input_operand (&ops[4], index, SImode);
+
+	icode = code_for_aarch64 (unspec,
+				  builtin_data->types[0].mode,
+				  builtin_data->types[1].mode,
+				  builtin_data->types[2].mode,
+				  builtin_data->types[3].mode,
+				  SImode);
+	expand_insn (icode, 5, ops);
+	break;
+      }
+
     case aarch64_builtin_signatures::binary_fpm:
       {
 	auto input1 = expand_normal (CALL_EXPR_ARG (exp, 0));
diff --git a/gcc/config/aarch64/aarch64-c.cc b/gcc/config/aarch64/aarch64-c.cc
index 68f9180520a..3e30ba5afd9 100644
--- a/gcc/config/aarch64/aarch64-c.cc
+++ b/gcc/config/aarch64/aarch64-c.cc
@@ -259,6 +259,10 @@  aarch64_update_cpp_builtins (cpp_reader *pfile)
 
   aarch64_def_or_undef (TARGET_FP8, "__ARM_FEATURE_FP8", pfile);
 
+  aarch64_def_or_undef (TARGET_FP8DOT2, "__ARM_FEATURE_FP8DOT2", pfile);
+
+  aarch64_def_or_undef (TARGET_FP8DOT4, "__ARM_FEATURE_FP8DOT4", pfile);
+
   aarch64_def_or_undef (TARGET_LS64,
 			"__ARM_FEATURE_LS64", pfile);
   aarch64_def_or_undef (TARGET_RCPC, "__ARM_FEATURE_RCPC", pfile);
diff --git a/gcc/config/aarch64/aarch64-option-extensions.def b/gcc/config/aarch64/aarch64-option-extensions.def
index 8279f5a76ea..fd4d29e5df6 100644
--- a/gcc/config/aarch64/aarch64-option-extensions.def
+++ b/gcc/config/aarch64/aarch64-option-extensions.def
@@ -234,6 +234,10 @@  AARCH64_OPT_EXTENSION("gcs", GCS, (), (), (), "gcs")
 
 AARCH64_OPT_EXTENSION("fp8", FP8, (SIMD), (), (), "fp8")
 
+AARCH64_OPT_EXTENSION("fp8dot2", FP8DOT2, (SIMD), (), (), "fp8dot2")
+
+AARCH64_OPT_EXTENSION("fp8dot4", FP8DOT4, (SIMD), (), (), "fp8dot4")
+
 AARCH64_OPT_EXTENSION("faminmax", FAMINMAX, (SIMD), (), (), "faminmax")
 
 #undef AARCH64_OPT_FMV_EXTENSION
diff --git a/gcc/config/aarch64/aarch64-simd-pragma-builtins.def b/gcc/config/aarch64/aarch64-simd-pragma-builtins.def
index cb5b546c541..9dea2939b47 100644
--- a/gcc/config/aarch64/aarch64-simd-pragma-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-pragma-builtins.def
@@ -21,17 +21,36 @@ 
 
 #undef ENTRY_BINARY
 #define ENTRY_BINARY(N, S, T0, T1, T2, U) \
-  ENTRY (N, S, T0, T1, T2, none, U)
+  ENTRY (N, S, T0, T1, T2, none, none, U)
 
 #undef ENTRY_BINARY_FPM
 #define ENTRY_BINARY_FPM(N, S, T0, T1, T2, U) \
-  ENTRY (N, S, T0, T1, T2, none, U)
+  ENTRY (N, S, T0, T1, T2, none, none, U)
 
 #define ENTRY_TERNARY_FPM(N, S, T0, T1, T2, T3, U) \
-  ENTRY (N, S, T0, T1, T2, T3, U)
-
+  ENTRY (N, S, T0, T1, T2, T3, none, U)
+
+#undef ENTRY_TERNARY_FPM_LANE
+#define ENTRY_TERNARY_FPM_LANE(N, S, T0, T1, T2, T3, U)	\
+  ENTRY (N, S, T0, T1, T2, T3, none, U)
+
+#undef ENTRY_VDOT_FPM
+#define ENTRY_VDOT_FPM(T, U)						\
+  ENTRY_TERNARY_FPM (vdot_##T##_mf8_fpm, ternary_fpm, T, T, f8, f8, U)	\
+  ENTRY_TERNARY_FPM (vdotq_##T##_mf8_fpm, ternary_fpm, T##q, T##q, f8q, f8q, \
+		     U)							\
+  ENTRY_TERNARY_FPM_LANE (vdot_lane_##T##_mf8_fpm, ternary_fpm_lane, T, T, \
+			  f8, f8, U)					\
+  ENTRY_TERNARY_FPM_LANE (vdot_laneq_##T##_mf8_fpm, ternary_fpm_lane, T, T, \
+			  f8, f8q, U)					\
+  ENTRY_TERNARY_FPM_LANE (vdotq_lane_##T##_mf8_fpm, ternary_fpm_lane, T##q, \
+			  T##q, f8q, f8, U)				\
+  ENTRY_TERNARY_FPM_LANE (vdotq_laneq_##T##_mf8_fpm, ternary_fpm_lane, T##q, \
+			  T##q, f8q, f8q, U)
+
+#undef ENTRY_UNARY_FPM
 #define ENTRY_UNARY_FPM(N, S, T0, T1, U) \
-  ENTRY (N, S, T0, T1, none, none, U)
+  ENTRY (N, S, T0, T1, none, none, none, U)
 
 #undef ENTRY_VHSDF
 #define ENTRY_VHSDF(NAME, SIGNATURE, UNSPEC) \
@@ -92,3 +111,13 @@  ENTRY_TERNARY_FPM (vcvt_high_mf8_f32_fpm, ternary_fpm, f8q, f8, f32q, f32q, \
 #define REQUIRED_EXTENSIONS nonstreaming_only (AARCH64_FL_FP8)
 ENTRY_VHSDF_VHSDI (vscale, binary, UNSPEC_FSCALE)
 #undef REQUIRED_EXTENSIONS
+
+// fpm dot product
+#define REQUIRED_EXTENSIONS nonstreaming_only (AARCH64_FL_FP8DOT2)
+ENTRY_VDOT_FPM (f16, UNSPEC_VDOT2)
+#undef REQUIRED_EXTENSIONS
+
+// fpm dot4 product
+#define REQUIRED_EXTENSIONS nonstreaming_only (AARCH64_FL_FP8DOT4)
+ENTRY_VDOT_FPM (f32, UNSPEC_VDOT4)
+#undef REQUIRED_EXTENSIONS
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 87bbfb0e586..ea1ef4963d2 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -10067,3 +10067,61 @@ 
   "TARGET_FP8"
   "<fpm_uns_op>\t%0.<VHSDF:Vtype>, %1.<VHSDF:Vtype>, %2.<VHSDI:Vtype>"
 )
+
+;; fpm vdot2 instructions.
+(define_insn
+  "@aarch64_<fpm_uns_op><VHF:mode><VHF:mode><VB:mode><VB:mode>"
+  [(set (match_operand:VHF 0 "register_operand" "=w")
+	(unspec:VHF
+	 [(match_operand:VHF 1 "register_operand" "w")
+	  (match_operand:VB 2 "register_operand" "w")
+	  (match_operand:VB 3 "register_operand" "w")
+	  (reg:DI FPM_REGNUM)]
+	FPM_VDOT2_UNS))]
+  "TARGET_FP8DOT2"
+  "<fpm_uns_op>\t%1.<VHF:Vtype>, %2.<VB:Vtype>, %3.<VB:Vtype>"
+)
+
+;; fpm vdot2 instructions with lane.
+(define_insn
+  "@aarch64_<fpm_uns_op><VHF:mode><VHF:mode><VB:mode><VB2:mode><SI_ONLY:mode>"
+  [(set (match_operand:VHF 0 "register_operand" "=w")
+	(unspec:VHF
+	 [(match_operand:VHF 1 "register_operand" "w")
+	  (match_operand:VB 2 "register_operand" "w")
+	  (match_operand:VB2 3 "register_operand" "w")
+	  (match_operand:SI_ONLY 4 "const_int_operand" "n")
+	  (reg:DI FPM_REGNUM)]
+	FPM_VDOT2_UNS))]
+  "TARGET_FP8DOT2"
+  "<fpm_uns_op>\t%1.<VHF:Vtype>, %2.<VB:Vtype>, %3.<VHF:Vdotlanetype>[%4]"
+)
+
+;; fpm vdot4 instructions.
+(define_insn
+  "@aarch64_<fpm_uns_op><VDQSF:mode><VDQSF:mode><VB:mode><VB:mode>"
+  [(set (match_operand:VDQSF 0 "register_operand" "=w")
+	(unspec:VDQSF
+	 [(match_operand:VDQSF 1 "register_operand" "w")
+	  (match_operand:VB 2 "register_operand" "w")
+	  (match_operand:VB 3 "register_operand" "w")
+	  (reg:DI FPM_REGNUM)]
+	FPM_VDOT4_UNS))]
+  "TARGET_FP8DOT4"
+  "<fpm_uns_op>\t%1.<VDQSF:Vtype>, %2.<VB:Vtype>, %3.<VB:Vtype>"
+)
+
+;; fpm vdot4 instructions with lane.
+(define_insn
+  "@aarch64_<fpm_uns_op><VDQSF:mode><VDQSF:mode><VB:mode><VB2:mode><SI_ONLY:mode>"
+  [(set (match_operand:VDQSF 0 "register_operand" "=w")
+	(unspec:VDQSF
+	 [(match_operand:VDQSF 1 "register_operand" "w")
+	  (match_operand:VB 2 "register_operand" "w")
+	  (match_operand:VB2 3 "register_operand" "w")
+	  (match_operand:SI_ONLY 4 "const_int_operand" "n")
+	  (reg:DI FPM_REGNUM)]
+	FPM_VDOT4_UNS))]
+  "TARGET_FP8DOT4"
+  "<fpm_uns_op>\t%1.<VDQSF:Vtype>, %2.<VB:Vtype>, %3.<VDQSF:Vdotlanetype>[%4]"
+)
diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
index 593319fd472..bbe56afcb62 100644
--- a/gcc/config/aarch64/aarch64.h
+++ b/gcc/config/aarch64/aarch64.h
@@ -483,6 +483,12 @@  constexpr auto AARCH64_FL_DEFAULT_ISA_MODE ATTRIBUTE_UNUSED
 /* fp8 instructions are enabled through +fp8.  */
 #define TARGET_FP8 AARCH64_HAVE_ISA (FP8)
 
+/* fp8 dot product instructions are enabled through +fp8dot2.  */
+#define TARGET_FP8DOT2 AARCH64_HAVE_ISA (FP8DOT2)
+
+/* fp8 dot product instructions are enabled through +fp8dot4.  */
+#define TARGET_FP8DOT4 AARCH64_HAVE_ISA (FP8DOT4)
+
 /* Standard register usage.  */
 
 /* 31 64-bit general purpose registers R0-R30:
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index e3026c36e1c..45b9e74c231 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -163,6 +163,10 @@ 
 
 ;; Advanced SIMD Float modes.
 (define_mode_iterator VDQF [V2SF V4SF V2DF])
+
+(define_mode_iterator VHF [(V4HF "TARGET_SIMD_F16INST")
+			   (V8HF "TARGET_SIMD_F16INST")])
+
 (define_mode_iterator VHSDF [(V4HF "TARGET_SIMD_F16INST")
 			     (V8HF "TARGET_SIMD_F16INST")
 			     V2SF V4SF V2DF])
@@ -321,6 +325,7 @@ 
 
 ;; All byte modes.
 (define_mode_iterator VB [V8QI V16QI])
+(define_mode_iterator VB2 [VB])
 
 ;; 1 and 2 lane DI and DF modes.
 (define_mode_iterator V12DIF [V1DI V1DF V2DI V2DF])
@@ -766,6 +771,8 @@ 
     UNSPEC_VCVT2_HIGH_F16	; Used in aarch64-simd.md.
     UNSPEC_VCVT2_LOW_BF16	; Used in aarch64-simd.md.
     UNSPEC_VCVT2_LOW_F16	; Used in aarch64-simd.md.
+    UNSPEC_VDOT2		; Used in aarch64-simd.md.
+    UNSPEC_VDOT4		; Used in aarch64-simd.md.
     UNSPEC_TBL		; Used in vector permute patterns.
     UNSPEC_TBX		; Used in vector permute patterns.
     UNSPEC_CONCAT	; Used in vector permute patterns.
@@ -2427,6 +2434,11 @@ 
 			    (VNx8HF ".h") (VNx16HF "") (VNx32HF "")
 			    (VNx8HI ".h") (VNx16HI "") (VNx32HI "")])
 
+
+;; Lane index suffix for fp8 vdot operations depends on the output mode
+(define_mode_attr Vdotlanetype [(V4HF "2b") (V8HF "2b")
+				(V2SF "4b") (V4SF "4b")])
+
 ;; The number of bytes controlled by a predicate
 (define_mode_attr data_bytes [(VNx16BI "1") (VNx8BI "2")
 			      (VNx4BI "4") (VNx2BI "8")])
@@ -4597,6 +4609,10 @@ 
 
 (define_int_iterator FPM_TERNARY_VCVT_UNS [UNSPEC_VCVT_HIGH_F32])
 
+(define_int_iterator FPM_VDOT2_UNS [UNSPEC_VDOT2])
+
+(define_int_iterator FPM_VDOT4_UNS [UNSPEC_VDOT4])
+
 (define_int_attr fpm_uns_op
   [(UNSPEC_FSCALE "fscale")
    (UNSPEC_VCVT_F16 "fcvtn")
@@ -4614,7 +4630,9 @@ 
    (UNSPEC_VCVT2_HIGH_BF16 "bf2cvtl2")
    (UNSPEC_VCVT2_HIGH_F16 "f2cvtl2")
    (UNSPEC_VCVT2_LOW_BF16 "bf2cvtl")
-   (UNSPEC_VCVT2_LOW_F16 "f2cvtl")])
+   (UNSPEC_VCVT2_LOW_F16 "f2cvtl")
+   (UNSPEC_VDOT2 "fdot")
+   (UNSPEC_VDOT4 "fdot")])
 
 (define_int_attr fpm_uns_name
   [(UNSPEC_VCVT_F16 "vcvt_mf8_f16_fpm")
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 7146163d66d..332c664b30f 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -21805,6 +21805,10 @@  Enable support for Armv8.9-a/9.4-a translation hardening extension.
 Enable the RCpc3 (Release Consistency) extension.
 @item fp8
 Enable the fp8 (8-bit floating point) extension.
+@item fp8dot2
+Enable the fp8dot2 (8-bit floating point dot product) extension.
+@item fp8dot4
+Enable the fp8dot4 (8-bit floating point dot product) extension.
 @item faminmax
 Enable the Floating Point Absolute Maximum/Minimum extension.
 
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/vdot2_fpmdot.c b/gcc/testsuite/gcc.target/aarch64/simd/vdot2_fpmdot.c
new file mode 100644
index 00000000000..3e888a67ec7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/vdot2_fpmdot.c
@@ -0,0 +1,77 @@ 
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -march=armv9-a+fp8dot2" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#include "arm_neon.h"
+
+/*
+** test_vdot_f16_fpm:
+**	msr	fpmr, x0
+**	fdot	v0.4h, v1.8b, v2.8b
+**	ret
+*/
+float16x4_t
+test_vdot_f16_fpm (float16x4_t a, mfloat8x8_t b, mfloat8x8_t c, fpm_t d)
+{
+  return vdot_f16_mf8_fpm (a, b, c, d);
+}
+
+/*
+** test_vdotq_f16_fpm:
+**	msr	fpmr, x0
+**	fdot	v0.8h, v1.16b, v2.16b
+**	ret
+*/
+float16x8_t
+test_vdotq_f16_fpm (float16x8_t a, mfloat8x16_t b, mfloat8x16_t c, fpm_t d)
+{
+  return vdotq_f16_mf8_fpm (a, b, c, d);
+}
+
+/*
+** test_vdot_lane_f16_fpm:
+**	msr	fpmr, x0
+**	fdot	v0.4h, v1.8b, v2.2b\[1\]
+**	ret
+*/
+float16x4_t
+test_vdot_lane_f16_fpm (float16x4_t a, mfloat8x8_t b, mfloat8x8_t c, fpm_t d)
+{
+  return vdot_lane_f16_mf8_fpm (a, b, c, 1, d);
+}
+
+/*
+** test_vdot_laneq_f16_fpm:
+**	msr	fpmr, x0
+**	fdot	v0.4h, v1.8b, v2.2b\[1\]
+**	ret
+*/
+float16x4_t
+test_vdot_laneq_f16_fpm (float16x4_t a, mfloat8x8_t b, mfloat8x16_t c, fpm_t d)
+{
+  return vdot_laneq_f16_mf8_fpm (a, b, c, 1, d);
+}
+
+/*
+** test_vdotq_lane_f16_fpm:
+**	msr	fpmr, x0
+**	fdot	v0.8h, v1.16b, v2.2b\[1\]
+**	ret
+*/
+float16x8_t
+test_vdotq_lane_f16_fpm (float16x8_t a, mfloat8x16_t b, mfloat8x8_t c, fpm_t d)
+{
+  return vdotq_lane_f16_mf8_fpm (a, b, c, 1, d);
+}
+
+/*
+** test_vdotq_laneq_f16_fpm:
+**	msr	fpmr, x0
+**	fdot	v0.8h, v1.16b, v2.2b\[1\]
+**	ret
+*/
+float16x8_t
+test_vdotq_laneq_f16_fpm (float16x8_t a, mfloat8x16_t b, mfloat8x16_t c, fpm_t d)
+{
+  return vdotq_laneq_f16_mf8_fpm (a, b, c, 1, d);
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/vdot4_fpmdot.c b/gcc/testsuite/gcc.target/aarch64/simd/vdot4_fpmdot.c
new file mode 100644
index 00000000000..f03dd0a0d36
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/vdot4_fpmdot.c
@@ -0,0 +1,77 @@ 
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -march=armv9-a+fp8dot4" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#include "arm_neon.h"
+
+/*
+** test_vdot_f32_fpm:
+**	msr	fpmr, x0
+**	fdot	v0.2s, v1.8b, v2.8b
+**	ret
+*/
+float32x2_t
+test_vdot_f32_fpm (float32x2_t a, mfloat8x8_t b, mfloat8x8_t c, fpm_t d)
+{
+  return vdot_f32_mf8_fpm (a, b, c, d);
+}
+
+/*
+** test_vdotq_f32_fpm:
+**	msr	fpmr, x0
+**	fdot	v0.4s, v1.16b, v2.16b
+**	ret
+*/
+float32x4_t
+test_vdotq_f32_fpm (float32x4_t a, mfloat8x16_t b, mfloat8x16_t c, fpm_t d)
+{
+  return vdotq_f32_mf8_fpm (a, b, c, d);
+}
+
+/*
+** test_vdot_lane_f32_fpm:
+**	msr	fpmr, x0
+**	fdot	v0.2s, v1.8b, v2.4b\[1\]
+**	ret
+*/
+float32x2_t
+test_vdot_lane_f32_fpm (float32x2_t a, mfloat8x8_t b, mfloat8x8_t c, fpm_t d)
+{
+  return vdot_lane_f32_mf8_fpm (a, b, c, 1, d);
+}
+
+/*
+** test_vdot_laneq_f32_fpm:
+**	msr	fpmr, x0
+**	fdot	v0.2s, v1.8b, v2.4b\[1\]
+**	ret
+*/
+float32x2_t
+test_vdot_laneq_f32_fpm (float32x2_t a, mfloat8x8_t b, mfloat8x16_t c, fpm_t d)
+{
+  return vdot_laneq_f32_mf8_fpm (a, b, c, 1, d);
+}
+
+/*
+** test_vdotq_lane_f32_fpm:
+**	msr	fpmr, x0
+**	fdot	v0.4s, v1.16b, v2.4b\[1\]
+**	ret
+*/
+float32x4_t
+test_vdotq_lane_f32_fpm (float32x4_t a, mfloat8x16_t b, mfloat8x8_t c, fpm_t d)
+{
+  return vdotq_lane_f32_mf8_fpm (a, b, c, 1, d);
+}
+
+/*
+** test_vdotq_laneq_f32_fpm:
+**	msr	fpmr, x0
+**	fdot	v0.4s, v1.16b, v2.4b\[1\]
+**	ret
+*/
+float32x4_t
+test_vdotq_laneq_f32_fpm (float32x4_t a, mfloat8x16_t b, mfloat8x16_t c, fpm_t d)
+{
+  return vdotq_laneq_f32_mf8_fpm (a, b, c, 1, d);
+}