diff mbox series

[V2] VECT: Support mask_len_strided_load/mask_len_strided_store in loop vectorize

Message ID 20231106065205.290215-1-juzhe.zhong@rivai.ai
State New
Headers show
Series [V2] VECT: Support mask_len_strided_load/mask_len_strided_store in loop vectorize | expand

Commit Message

钟居哲 Nov. 6, 2023, 6:52 a.m. UTC
This patch adds strided load/store support on loop vectorizer depending on STMT_VINFO_STRIDED_P.

Bootstrap and regression on X86 passed.

Ok for trunk ?

gcc/ChangeLog:

	* internal-fn.cc (strided_load_direct): New function.
	(strided_store_direct): Ditto.
	(expand_strided_store_optab_fn): Ditto.
	(expand_scatter_store_optab_fn): Add strided store.
	(expand_strided_load_optab_fn): New function.
	(expand_gather_load_optab_fn): Add strided load.
	(direct_strided_load_optab_supported_p): New function.
	(direct_strided_store_optab_supported_p): Ditto.
	(internal_load_fn_p): Add strided load.
	(internal_strided_fn_p): New function.
	(internal_fn_len_index): Add strided load/store.
	(internal_fn_mask_index): Ditto.
	(internal_fn_stored_value_index): Add strided store.
	(internal_strided_fn_supported_p): New function.
	* internal-fn.def (MASK_LEN_STRIDED_LOAD): New IFN.
	(MASK_LEN_STRIDED_STORE): Ditto.
	* internal-fn.h (internal_strided_fn_p): New function.
	(internal_strided_fn_supported_p): Ditto.
	* optabs-query.cc (supports_vec_gather_load_p): Add strided load.
	(supports_vec_scatter_store_p): Add strided store.
	* optabs-query.h (supports_vec_gather_load_p): Add strided load.
	(supports_vec_scatter_store_p): Add strided store.
	* tree-vect-data-refs.cc (vect_prune_runtime_alias_test_list): Add strided load/store.
	(vect_gather_scatter_fn_p): Ditto.
	(vect_check_gather_scatter): Ditto.
	* tree-vect-stmts.cc (check_load_store_for_partial_vectors): Ditto.
	(vect_truncate_gather_scatter_offset): Ditto.
	(vect_use_strided_gather_scatters_p): Ditto.
	(vect_get_strided_load_store_ops): Ditto.
	(vectorizable_store): Ditto.
	(vectorizable_load): Ditto.
	* tree-vectorizer.h (vect_gather_scatter_fn_p): Ditto.

---
 gcc/internal-fn.cc         | 101 ++++++++++++++++++++++++++++++++-----
 gcc/internal-fn.def        |   4 ++
 gcc/internal-fn.h          |   2 +
 gcc/optabs-query.cc        |  25 ++++++---
 gcc/optabs-query.h         |   4 +-
 gcc/tree-vect-data-refs.cc |  45 +++++++++++++----
 gcc/tree-vect-stmts.cc     |  65 ++++++++++++++++++------
 gcc/tree-vectorizer.h      |   2 +-
 8 files changed, 199 insertions(+), 49 deletions(-)

Comments

钟居哲 Nov. 6, 2023, 6:54 a.m. UTC | #1
Sorry. 
This is middle-end patch, sending to wrong  CC lists.
Forget about this patch.



juzhe.zhong@rivai.ai
 
From: Juzhe-Zhong
Date: 2023-11-06 14:52
To: gcc-patches
CC: kito.cheng; kito.cheng; jeffreyalaw; rdapp.gcc; Juzhe-Zhong
Subject: [PATCH V2] VECT: Support mask_len_strided_load/mask_len_strided_store in loop vectorize
This patch adds strided load/store support on loop vectorizer depending on STMT_VINFO_STRIDED_P.
 
Bootstrap and regression on X86 passed.
 
Ok for trunk ?
 
gcc/ChangeLog:
 
* internal-fn.cc (strided_load_direct): New function.
(strided_store_direct): Ditto.
(expand_strided_store_optab_fn): Ditto.
(expand_scatter_store_optab_fn): Add strided store.
(expand_strided_load_optab_fn): New function.
(expand_gather_load_optab_fn): Add strided load.
(direct_strided_load_optab_supported_p): New function.
(direct_strided_store_optab_supported_p): Ditto.
(internal_load_fn_p): Add strided load.
(internal_strided_fn_p): New function.
(internal_fn_len_index): Add strided load/store.
(internal_fn_mask_index): Ditto.
(internal_fn_stored_value_index): Add strided store.
(internal_strided_fn_supported_p): New function.
* internal-fn.def (MASK_LEN_STRIDED_LOAD): New IFN.
(MASK_LEN_STRIDED_STORE): Ditto.
* internal-fn.h (internal_strided_fn_p): New function.
(internal_strided_fn_supported_p): Ditto.
* optabs-query.cc (supports_vec_gather_load_p): Add strided load.
(supports_vec_scatter_store_p): Add strided store.
* optabs-query.h (supports_vec_gather_load_p): Add strided load.
(supports_vec_scatter_store_p): Add strided store.
* tree-vect-data-refs.cc (vect_prune_runtime_alias_test_list): Add strided load/store.
(vect_gather_scatter_fn_p): Ditto.
(vect_check_gather_scatter): Ditto.
* tree-vect-stmts.cc (check_load_store_for_partial_vectors): Ditto.
(vect_truncate_gather_scatter_offset): Ditto.
(vect_use_strided_gather_scatters_p): Ditto.
(vect_get_strided_load_store_ops): Ditto.
(vectorizable_store): Ditto.
(vectorizable_load): Ditto.
* tree-vectorizer.h (vect_gather_scatter_fn_p): Ditto.
 
---
gcc/internal-fn.cc         | 101 ++++++++++++++++++++++++++++++++-----
gcc/internal-fn.def        |   4 ++
gcc/internal-fn.h          |   2 +
gcc/optabs-query.cc        |  25 ++++++---
gcc/optabs-query.h         |   4 +-
gcc/tree-vect-data-refs.cc |  45 +++++++++++++----
gcc/tree-vect-stmts.cc     |  65 ++++++++++++++++++------
gcc/tree-vectorizer.h      |   2 +-
8 files changed, 199 insertions(+), 49 deletions(-)
 
diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
index c7d3564faef..a31a65755c7 100644
--- a/gcc/internal-fn.cc
+++ b/gcc/internal-fn.cc
@@ -164,6 +164,7 @@ init_internal_fns ()
#define load_lanes_direct { -1, -1, false }
#define mask_load_lanes_direct { -1, -1, false }
#define gather_load_direct { 3, 1, false }
+#define strided_load_direct { -1, -1, false }
#define len_load_direct { -1, -1, false }
#define mask_len_load_direct { -1, 4, false }
#define mask_store_direct { 3, 2, false }
@@ -172,6 +173,7 @@ init_internal_fns ()
#define vec_cond_mask_direct { 1, 0, false }
#define vec_cond_direct { 2, 0, false }
#define scatter_store_direct { 3, 1, false }
+#define strided_store_direct { 1, 1, false }
#define len_store_direct { 3, 3, false }
#define mask_len_store_direct { 4, 5, false }
#define vec_set_direct { 3, 3, false }
@@ -3561,62 +3563,87 @@ expand_LAUNDER (internal_fn, gcall *call)
   expand_assignment (lhs, gimple_call_arg (call, 0), false);
}
+#define expand_strided_store_optab_fn expand_scatter_store_optab_fn
+
/* Expand {MASK_,}SCATTER_STORE{S,U} call CALL using optab OPTAB.  */
static void
expand_scatter_store_optab_fn (internal_fn, gcall *stmt, direct_optab optab)
{
+  insn_code icode;
   internal_fn ifn = gimple_call_internal_fn (stmt);
   int rhs_index = internal_fn_stored_value_index (ifn);
   tree base = gimple_call_arg (stmt, 0);
   tree offset = gimple_call_arg (stmt, 1);
-  tree scale = gimple_call_arg (stmt, 2);
   tree rhs = gimple_call_arg (stmt, rhs_index);
   rtx base_rtx = expand_normal (base);
   rtx offset_rtx = expand_normal (offset);
-  HOST_WIDE_INT scale_int = tree_to_shwi (scale);
   rtx rhs_rtx = expand_normal (rhs);
   class expand_operand ops[8];
   int i = 0;
   create_address_operand (&ops[i++], base_rtx);
-  create_input_operand (&ops[i++], offset_rtx, TYPE_MODE (TREE_TYPE (offset)));
-  create_integer_operand (&ops[i++], TYPE_UNSIGNED (TREE_TYPE (offset)));
-  create_integer_operand (&ops[i++], scale_int);
+  if (internal_strided_fn_p (ifn))
+    {
+      create_address_operand (&ops[i++], offset_rtx);
+      icode = direct_optab_handler (optab, TYPE_MODE (TREE_TYPE (rhs)));
+    }
+  else
+    {
+      tree scale = gimple_call_arg (stmt, 2);
+      HOST_WIDE_INT scale_int = tree_to_shwi (scale);
+      create_input_operand (&ops[i++], offset_rtx,
+     TYPE_MODE (TREE_TYPE (offset)));
+      create_integer_operand (&ops[i++], TYPE_UNSIGNED (TREE_TYPE (offset)));
+      create_integer_operand (&ops[i++], scale_int);
+      icode = convert_optab_handler (optab, TYPE_MODE (TREE_TYPE (rhs)),
+      TYPE_MODE (TREE_TYPE (offset)));
+    }
   create_input_operand (&ops[i++], rhs_rtx, TYPE_MODE (TREE_TYPE (rhs)));
   i = add_mask_and_len_args (ops, i, stmt);
-  insn_code icode = convert_optab_handler (optab, TYPE_MODE (TREE_TYPE (rhs)),
-    TYPE_MODE (TREE_TYPE (offset)));
   expand_insn (icode, i, ops);
}
+#define expand_strided_load_optab_fn expand_gather_load_optab_fn
+
/* Expand {MASK_,}GATHER_LOAD call CALL using optab OPTAB.  */
static void
expand_gather_load_optab_fn (internal_fn, gcall *stmt, direct_optab optab)
{
+  insn_code icode;
+  internal_fn ifn = gimple_call_internal_fn (stmt);
   tree lhs = gimple_call_lhs (stmt);
   tree base = gimple_call_arg (stmt, 0);
   tree offset = gimple_call_arg (stmt, 1);
-  tree scale = gimple_call_arg (stmt, 2);
   rtx lhs_rtx = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE);
   rtx base_rtx = expand_normal (base);
   rtx offset_rtx = expand_normal (offset);
-  HOST_WIDE_INT scale_int = tree_to_shwi (scale);
   int i = 0;
   class expand_operand ops[8];
   create_output_operand (&ops[i++], lhs_rtx, TYPE_MODE (TREE_TYPE (lhs)));
   create_address_operand (&ops[i++], base_rtx);
-  create_input_operand (&ops[i++], offset_rtx, TYPE_MODE (TREE_TYPE (offset)));
-  create_integer_operand (&ops[i++], TYPE_UNSIGNED (TREE_TYPE (offset)));
-  create_integer_operand (&ops[i++], scale_int);
+  if (internal_strided_fn_p (ifn))
+    {
+      create_address_operand (&ops[i++], offset_rtx);
+      icode = direct_optab_handler (optab, TYPE_MODE (TREE_TYPE (lhs)));
+    }
+  else
+    {
+      tree scale = gimple_call_arg (stmt, 2);
+      HOST_WIDE_INT scale_int = tree_to_shwi (scale);
+      create_input_operand (&ops[i++], offset_rtx,
+     TYPE_MODE (TREE_TYPE (offset)));
+      create_integer_operand (&ops[i++], TYPE_UNSIGNED (TREE_TYPE (offset)));
+      create_integer_operand (&ops[i++], scale_int);
+      icode = convert_optab_handler (optab, TYPE_MODE (TREE_TYPE (lhs)),
+      TYPE_MODE (TREE_TYPE (offset)));
+    }
   i = add_mask_and_len_args (ops, i, stmt);
-  insn_code icode = convert_optab_handler (optab, TYPE_MODE (TREE_TYPE (lhs)),
-    TYPE_MODE (TREE_TYPE (offset)));
   expand_insn (icode, i, ops);
   if (!rtx_equal_p (lhs_rtx, ops[0].value))
     emit_move_insn (lhs_rtx, ops[0].value);
@@ -4012,6 +4039,7 @@ multi_vector_optab_supported_p (convert_optab optab, tree_pair types,
#define direct_load_lanes_optab_supported_p multi_vector_optab_supported_p
#define direct_mask_load_lanes_optab_supported_p multi_vector_optab_supported_p
#define direct_gather_load_optab_supported_p convert_optab_supported_p
+#define direct_strided_load_optab_supported_p direct_optab_supported_p
#define direct_len_load_optab_supported_p direct_optab_supported_p
#define direct_mask_len_load_optab_supported_p convert_optab_supported_p
#define direct_mask_store_optab_supported_p convert_optab_supported_p
@@ -4020,6 +4048,7 @@ multi_vector_optab_supported_p (convert_optab optab, tree_pair types,
#define direct_vec_cond_mask_optab_supported_p convert_optab_supported_p
#define direct_vec_cond_optab_supported_p convert_optab_supported_p
#define direct_scatter_store_optab_supported_p convert_optab_supported_p
+#define direct_strided_store_optab_supported_p direct_optab_supported_p
#define direct_len_store_optab_supported_p direct_optab_supported_p
#define direct_mask_len_store_optab_supported_p convert_optab_supported_p
#define direct_while_optab_supported_p convert_optab_supported_p
@@ -4596,6 +4625,7 @@ internal_load_fn_p (internal_fn fn)
     case IFN_GATHER_LOAD:
     case IFN_MASK_GATHER_LOAD:
     case IFN_MASK_LEN_GATHER_LOAD:
+    case IFN_MASK_LEN_STRIDED_LOAD:
     case IFN_LEN_LOAD:
     case IFN_MASK_LEN_LOAD:
       return true;
@@ -4648,6 +4678,22 @@ internal_gather_scatter_fn_p (internal_fn fn)
     }
}
+/* Return true if IFN is some form of strided load or strided store.  */
+
+bool
+internal_strided_fn_p (internal_fn fn)
+{
+  switch (fn)
+    {
+    case IFN_MASK_LEN_STRIDED_LOAD:
+    case IFN_MASK_LEN_STRIDED_STORE:
+      return true;
+
+    default:
+      return false;
+    }
+}
+
/* If FN takes a vector len argument, return the index of that argument,
    otherwise return -1.  */
@@ -4683,6 +4729,8 @@ internal_fn_len_index (internal_fn fn)
     case IFN_COND_LEN_XOR:
     case IFN_COND_LEN_SHL:
     case IFN_COND_LEN_SHR:
+    case IFN_MASK_LEN_STRIDED_LOAD:
+    case IFN_MASK_LEN_STRIDED_STORE:
       return 4;
     case IFN_COND_LEN_NEG:
@@ -4776,6 +4824,10 @@ internal_fn_mask_index (internal_fn fn)
     case IFN_MASK_LEN_STORE:
       return 2;
+    case IFN_MASK_LEN_STRIDED_LOAD:
+    case IFN_MASK_LEN_STRIDED_STORE:
+      return 3;
+
     case IFN_MASK_GATHER_LOAD:
     case IFN_MASK_SCATTER_STORE:
     case IFN_MASK_LEN_GATHER_LOAD:
@@ -4796,6 +4848,9 @@ internal_fn_stored_value_index (internal_fn fn)
{
   switch (fn)
     {
+    case IFN_MASK_LEN_STRIDED_STORE:
+      return 2;
+
     case IFN_MASK_STORE:
     case IFN_MASK_STORE_LANES:
     case IFN_SCATTER_STORE:
@@ -4845,6 +4900,24 @@ internal_gather_scatter_fn_supported_p (internal_fn ifn, tree vector_type,
  && insn_operand_matches (icode, 3 + output_ops, GEN_INT (scale)));
}
+/* Return true if the target supports strided load or strided store function
+   IFN.  For loads, VECTOR_TYPE is the vector type of the load result,
+   while for stores it is the vector type of the stored data argument.
+   MEMORY_ELEMENT_TYPE is the type of the memory elements being loaded
+   or stored.  */
+
+bool
+internal_strided_fn_supported_p (internal_fn ifn, tree vector_type,
+ tree memory_element_type)
+{
+  if (!tree_int_cst_equal (TYPE_SIZE (TREE_TYPE (vector_type)),
+    TYPE_SIZE (memory_element_type)))
+    return false;
+  optab optab = direct_internal_fn_optab (ifn);
+  insn_code icode = direct_optab_handler (optab, TYPE_MODE (vector_type));
+  return icode != CODE_FOR_nothing;
+}
+
/* Return true if the target supports IFN_CHECK_{RAW,WAR}_PTRS function IFN
    for pointers of type TYPE when the accesses have LENGTH bytes and their
    common byte alignment is ALIGN.  */
diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
index a2023ab9c3d..922359ed5cf 100644
--- a/gcc/internal-fn.def
+++ b/gcc/internal-fn.def
@@ -199,6 +199,8 @@ DEF_INTERNAL_OPTAB_FN (MASK_GATHER_LOAD, ECF_PURE,
       mask_gather_load, gather_load)
DEF_INTERNAL_OPTAB_FN (MASK_LEN_GATHER_LOAD, ECF_PURE,
       mask_len_gather_load, gather_load)
+DEF_INTERNAL_OPTAB_FN (MASK_LEN_STRIDED_LOAD, ECF_PURE,
+        mask_len_strided_load, strided_load)
DEF_INTERNAL_OPTAB_FN (LEN_LOAD, ECF_PURE, len_load, len_load)
DEF_INTERNAL_OPTAB_FN (MASK_LEN_LOAD, ECF_PURE, mask_len_load, mask_len_load)
@@ -208,6 +210,8 @@ DEF_INTERNAL_OPTAB_FN (MASK_SCATTER_STORE, 0,
       mask_scatter_store, scatter_store)
DEF_INTERNAL_OPTAB_FN (MASK_LEN_SCATTER_STORE, 0,
       mask_len_scatter_store, scatter_store)
+DEF_INTERNAL_OPTAB_FN (MASK_LEN_STRIDED_STORE, 0,
+        mask_len_strided_store, strided_store)
DEF_INTERNAL_OPTAB_FN (MASK_STORE, 0, maskstore, mask_store)
DEF_INTERNAL_OPTAB_FN (STORE_LANES, ECF_CONST, vec_store_lanes, store_lanes)
diff --git a/gcc/internal-fn.h b/gcc/internal-fn.h
index 7d72f4db2d0..ea6a7369f23 100644
--- a/gcc/internal-fn.h
+++ b/gcc/internal-fn.h
@@ -235,12 +235,14 @@ extern bool can_interpret_as_conditional_op_p (gimple *, tree *,
extern bool internal_load_fn_p (internal_fn);
extern bool internal_store_fn_p (internal_fn);
extern bool internal_gather_scatter_fn_p (internal_fn);
+extern bool internal_strided_fn_p (internal_fn);
extern int internal_fn_mask_index (internal_fn);
extern int internal_fn_len_index (internal_fn);
extern int internal_fn_else_index (internal_fn);
extern int internal_fn_stored_value_index (internal_fn);
extern bool internal_gather_scatter_fn_supported_p (internal_fn, tree,
    tree, tree, int);
+extern bool internal_strided_fn_supported_p (internal_fn, tree, tree);
extern bool internal_check_ptrs_fn_supported_p (internal_fn, tree,
poly_uint64, unsigned int);
#define VECT_PARTIAL_BIAS_UNSUPPORTED 127
diff --git a/gcc/optabs-query.cc b/gcc/optabs-query.cc
index 947ccef218c..860e744995b 100644
--- a/gcc/optabs-query.cc
+++ b/gcc/optabs-query.cc
@@ -670,14 +670,18 @@ supports_vec_convert_optab_p (optab op, machine_mode mode)
    for at least one vector mode.  */
bool
-supports_vec_gather_load_p (machine_mode mode)
+supports_vec_gather_load_p (machine_mode mode, bool strided_p)
{
   if (!this_fn_optabs->supports_vec_gather_load[mode])
     this_fn_optabs->supports_vec_gather_load[mode]
       = (supports_vec_convert_optab_p (gather_load_optab, mode)
- || supports_vec_convert_optab_p (mask_gather_load_optab, mode)
- || supports_vec_convert_optab_p (mask_len_gather_load_optab, mode)
- ? 1 : -1);
+      || supports_vec_convert_optab_p (mask_gather_load_optab, mode)
+      || supports_vec_convert_optab_p (mask_len_gather_load_optab, mode)
+      || (strided_p
+ && direct_optab_handler (mask_len_strided_load_optab, mode)
+       != CODE_FOR_nothing)
+    ? 1
+    : -1);
   return this_fn_optabs->supports_vec_gather_load[mode] > 0;
}
@@ -687,14 +691,19 @@ supports_vec_gather_load_p (machine_mode mode)
    for at least one vector mode.  */
bool
-supports_vec_scatter_store_p (machine_mode mode)
+supports_vec_scatter_store_p (machine_mode mode, bool strided_p)
{
   if (!this_fn_optabs->supports_vec_scatter_store[mode])
     this_fn_optabs->supports_vec_scatter_store[mode]
       = (supports_vec_convert_optab_p (scatter_store_optab, mode)
- || supports_vec_convert_optab_p (mask_scatter_store_optab, mode)
- || supports_vec_convert_optab_p (mask_len_scatter_store_optab, mode)
- ? 1 : -1);
+      || supports_vec_convert_optab_p (mask_scatter_store_optab, mode)
+      || supports_vec_convert_optab_p (mask_len_scatter_store_optab,
+       mode)
+      || (strided_p
+ && direct_optab_handler (mask_len_strided_store_optab, mode)
+       != CODE_FOR_nothing)
+    ? 1
+    : -1);
   return this_fn_optabs->supports_vec_scatter_store[mode] > 0;
}
diff --git a/gcc/optabs-query.h b/gcc/optabs-query.h
index 920eb6a1b67..7c22edc5a78 100644
--- a/gcc/optabs-query.h
+++ b/gcc/optabs-query.h
@@ -191,8 +191,8 @@ bool can_compare_and_swap_p (machine_mode, bool);
bool can_atomic_exchange_p (machine_mode, bool);
bool can_atomic_load_p (machine_mode);
bool lshift_cheap_p (bool);
-bool supports_vec_gather_load_p (machine_mode = E_VOIDmode);
-bool supports_vec_scatter_store_p (machine_mode = E_VOIDmode);
+bool supports_vec_gather_load_p (machine_mode = E_VOIDmode, bool = false);
+bool supports_vec_scatter_store_p (machine_mode = E_VOIDmode, bool = false);
bool can_vec_extract (machine_mode, machine_mode);
/* Version of find_widening_optab_handler_and_mode that operates on
diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc
index d5c9c4a11c2..8662a570417 100644
--- a/gcc/tree-vect-data-refs.cc
+++ b/gcc/tree-vect-data-refs.cc
@@ -3903,7 +3903,8 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
/* Check whether we can use an internal function for a gather load
    or scatter store.  READ_P is true for loads and false for stores.
-   MASKED_P is true if the load or store is conditional.  MEMORY_TYPE is
+   MASKED_P is true if the load or store is conditional.  STRIDED_P
+   is true if the load or store is strided access.  MEMORY_TYPE is
    the type of the memory elements being loaded or stored.  OFFSET_TYPE
    is the type of the offset that is being applied to the invariant
    base address.  SCALE is the amount by which the offset should
@@ -3914,8 +3915,8 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
bool
vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
-   tree vectype, tree memory_type, tree offset_type,
-   int scale, internal_fn *ifn_out,
+   bool strided_p, tree vectype, tree memory_type,
+   tree offset_type, int scale, internal_fn *ifn_out,
  tree *offset_vectype_out)
{
   unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
@@ -3926,7 +3927,7 @@ vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
     return false;
   /* Work out which function we need.  */
-  internal_fn ifn, alt_ifn, alt_ifn2;
+  internal_fn ifn, alt_ifn, alt_ifn2, alt_ifn3;
   if (read_p)
     {
       ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
@@ -3935,6 +3936,12 @@ vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
use MASK_LEN_GATHER_LOAD regardless whether len and
mask are valid or not.  */
       alt_ifn2 = IFN_MASK_LEN_GATHER_LOAD;
+      /* When target supports MASK_LEN_STRIDED_LOAD, we can relax the
+ restrictions around the relationship of the vector offset type
+ to the loaded by using a gather load with strided access.
+ E.g. a "gather" of N bytes with a 64-bit stride would in principle
+ be possible without needing an Nx64-bit vector offset type.  */
+      alt_ifn3 = IFN_MASK_LEN_STRIDED_LOAD;
     }
   else
     {
@@ -3944,10 +3951,26 @@ vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
use MASK_LEN_SCATTER_STORE regardless whether len and
mask are valid or not.  */
       alt_ifn2 = IFN_MASK_LEN_SCATTER_STORE;
+      /* When target supports MASK_LEN_STRIDED_STORE, we can relax the
+ restrictions around the relationship of the vector offset type
+ to the stored by using a scatter store with strided access.
+ E.g. a "scatter" of N bytes with a 64-bit stride would in principle
+ be possible without needing an Nx64-bit vector offset type.  */
+      alt_ifn3 = IFN_MASK_LEN_STRIDED_STORE;
     }
   for (;;)
     {
+      /* We don't need to check whether target supports gather/scatter IFN
+ with expected vector offset for gather/scatter with a strided access
+ when target itself support strided load/store IFN.  */
+      if (strided_p
+   && internal_strided_fn_supported_p (alt_ifn3, vectype, memory_type))
+ {
+   *ifn_out = alt_ifn3;
+   return true;
+ }
+
       tree offset_vectype = get_vectype_for_scalar_type (vinfo, offset_type);
       if (!offset_vectype)
return false;
@@ -4030,6 +4053,7 @@ vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
   internal_fn ifn;
   tree offset_vectype;
   bool masked_p = false;
+  bool strided_p = STMT_VINFO_STRIDED_P (stmt_info);
   /* See whether this is already a call to a gather/scatter internal function.
      If not, see whether it's a masked load or store.  */
@@ -4197,12 +4221,14 @@ vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
supports it for at least some offset type.  */
      if (use_ifn_p
  && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
- masked_p, vectype, memory_type,
+ masked_p, strided_p,
+ vectype, memory_type,
signed_char_type_node,
new_scale, &ifn,
&offset_vectype)
  && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
- masked_p, vectype, memory_type,
+ masked_p, strided_p,
+ vectype, memory_type,
unsigned_char_type_node,
new_scale, &ifn,
&offset_vectype))
@@ -4226,7 +4252,8 @@ vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
      && TREE_CODE (off) == SSA_NAME
      && !POINTER_TYPE_P (TREE_TYPE (off))
      && vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
-    masked_p, vectype, memory_type,
+    masked_p, strided_p,
+    vectype, memory_type,
   TREE_TYPE (off), scale, &ifn,
   &offset_vectype))
    break;
@@ -4281,8 +4308,8 @@ vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
   if (use_ifn_p)
     {
       if (!vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr), masked_p,
-      vectype, memory_type, offtype, scale,
-      &ifn, &offset_vectype))
+      strided_p, vectype, memory_type, offtype,
+      scale, &ifn, &offset_vectype))
ifn = IFN_LAST;
       decl = NULL_TREE;
     }
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index f895aaf3083..907d0c0bcbb 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -1515,10 +1515,14 @@ check_load_store_for_partial_vectors (loop_vec_info loop_vinfo, tree vectype,
       internal_fn len_ifn = (is_load
     ? IFN_MASK_LEN_GATHER_LOAD
     : IFN_MASK_LEN_SCATTER_STORE);
-      if (internal_gather_scatter_fn_supported_p (len_ifn, vectype,
-   gs_info->memory_type,
-   gs_info->offset_vectype,
-   gs_info->scale))
+      if (internal_strided_fn_p (gs_info->ifn)
+   && internal_strided_fn_supported_p (gs_info->ifn, vectype,
+       gs_info->memory_type))
+ vect_record_loop_len (loop_vinfo, lens, nvectors, vectype, 1);
+      else if (internal_gather_scatter_fn_supported_p (len_ifn, vectype,
+        gs_info->memory_type,
+        gs_info->offset_vectype,
+        gs_info->scale))
vect_record_loop_len (loop_vinfo, lens, nvectors, vectype, 1);
       else if (internal_gather_scatter_fn_supported_p (ifn, vectype,
       gs_info->memory_type,
@@ -1703,6 +1707,7 @@ vect_truncate_gather_scatter_offset (stmt_vec_info stmt_info,
no narrower than OFFSET_TYPE.  */
       tree memory_type = TREE_TYPE (DR_REF (dr));
       if (!vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr), masked_p,
+      STMT_VINFO_STRIDED_P (stmt_info),
     vectype, memory_type, offset_type, scale,
     &gs_info->ifn, &gs_info->offset_vectype)
  || gs_info->ifn == IFN_LAST)
@@ -1743,6 +1748,15 @@ vect_use_strided_gather_scatters_p (stmt_vec_info stmt_info,
       || gs_info->ifn == IFN_LAST)
     return vect_truncate_gather_scatter_offset (stmt_info, loop_vinfo,
masked_p, gs_info);
+  else if (internal_strided_fn_p (gs_info->ifn))
+    {
+      if (dump_enabled_p ())
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "using strided IFN for strided/grouped access,"
+ " scale = %d\n",
+ gs_info->scale);
+      return true;
+    }
   tree old_offset_type = TREE_TYPE (gs_info->offset);
   tree new_offset_type = TREE_TYPE (gs_info->offset_vectype);
@@ -3012,6 +3026,14 @@ vect_get_strided_load_store_ops (stmt_vec_info stmt_info,
       *dataref_bump = cse_and_gimplify_to_preheader (loop_vinfo, bump);
     }
+  /* Target supports strided load/store use DR_STEP directly.  */
+  if (internal_strided_fn_p (gs_info->ifn))
+    {
+      *vec_offset = cse_and_gimplify_to_preheader (loop_vinfo,
+    unshare_expr (DR_STEP (dr)));
+      return;
+    }
+
   /* The offset given in GS_INFO can have pointer type, so use the element
      type of the vector instead.  */
   tree offset_type = TREE_TYPE (gs_info->offset_vectype);
@@ -9130,7 +9152,7 @@ vectorizable_store (vec_info *vinfo,
vec_offset = vec_offsets[j];
      tree scale = size_int (gs_info.scale);
-       if (gs_info.ifn == IFN_MASK_LEN_SCATTER_STORE)
+       if (internal_fn_len_index (gs_info.ifn) >= 0)
{
  if (loop_lens)
    final_len = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
@@ -9150,10 +9172,18 @@ vectorizable_store (vec_info *vinfo,
      gcall *call;
      if (final_len && final_mask)
- call = gimple_build_call_internal (IFN_MASK_LEN_SCATTER_STORE,
-    7, dataref_ptr, vec_offset,
-    scale, vec_oprnd, final_mask,
-    final_len, bias);
+ {
+   if (internal_strided_fn_p (gs_info.ifn))
+     call
+       = gimple_build_call_internal (IFN_MASK_LEN_STRIDED_STORE,
+     6, dataref_ptr, vec_offset,
+     vec_oprnd, final_mask,
+     final_len, bias);
+   else
+     call = gimple_build_call_internal (
+       IFN_MASK_LEN_SCATTER_STORE, 7, dataref_ptr, vec_offset,
+       scale, vec_oprnd, final_mask, final_len, bias);
+ }
      else if (final_mask)
call
  = gimple_build_call_internal (IFN_MASK_SCATTER_STORE, 5,
@@ -10955,7 +10985,7 @@ vectorizable_load (vec_info *vinfo,
  tree zero = build_zero_cst (vectype);
  tree scale = size_int (gs_info.scale);
-   if (gs_info.ifn == IFN_MASK_LEN_GATHER_LOAD)
+   if (internal_fn_len_index (gs_info.ifn) >= 0)
    {
      if (loop_lens)
final_len
@@ -10978,11 +11008,16 @@ vectorizable_load (vec_info *vinfo,
  gcall *call;
  if (final_len && final_mask)
-     call
-       = gimple_build_call_internal (IFN_MASK_LEN_GATHER_LOAD, 7,
-     dataref_ptr, vec_offset,
-     scale, zero, final_mask,
-     final_len, bias);
+     {
+       if (internal_strided_fn_p (gs_info.ifn))
+ call = gimple_build_call_internal (
+   IFN_MASK_LEN_STRIDED_LOAD, 6, dataref_ptr, vec_offset,
+   zero, final_mask, final_len, bias);
+       else
+ call = gimple_build_call_internal (
+   IFN_MASK_LEN_GATHER_LOAD, 7, dataref_ptr, vec_offset,
+   scale, zero, final_mask, final_len, bias);
+     }
  else if (final_mask)
    call = gimple_build_call_internal (IFN_MASK_GATHER_LOAD, 5,
       dataref_ptr, vec_offset,
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 254d172231d..4d9e9799470 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -2309,7 +2309,7 @@ extern opt_result vect_analyze_data_refs_alignment (loop_vec_info);
extern bool vect_slp_analyze_instance_alignment (vec_info *, slp_instance);
extern opt_result vect_analyze_data_ref_accesses (vec_info *, vec<int> *);
extern opt_result vect_prune_runtime_alias_test_list (loop_vec_info);
-extern bool vect_gather_scatter_fn_p (vec_info *, bool, bool, tree, tree,
+extern bool vect_gather_scatter_fn_p (vec_info *, bool, bool, bool, tree, tree,
      tree, int, internal_fn *, tree *);
extern bool vect_check_gather_scatter (stmt_vec_info, loop_vec_info,
       gather_scatter_info *);
diff mbox series

Patch

diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
index c7d3564faef..a31a65755c7 100644
--- a/gcc/internal-fn.cc
+++ b/gcc/internal-fn.cc
@@ -164,6 +164,7 @@  init_internal_fns ()
 #define load_lanes_direct { -1, -1, false }
 #define mask_load_lanes_direct { -1, -1, false }
 #define gather_load_direct { 3, 1, false }
+#define strided_load_direct { -1, -1, false }
 #define len_load_direct { -1, -1, false }
 #define mask_len_load_direct { -1, 4, false }
 #define mask_store_direct { 3, 2, false }
@@ -172,6 +173,7 @@  init_internal_fns ()
 #define vec_cond_mask_direct { 1, 0, false }
 #define vec_cond_direct { 2, 0, false }
 #define scatter_store_direct { 3, 1, false }
+#define strided_store_direct { 1, 1, false }
 #define len_store_direct { 3, 3, false }
 #define mask_len_store_direct { 4, 5, false }
 #define vec_set_direct { 3, 3, false }
@@ -3561,62 +3563,87 @@  expand_LAUNDER (internal_fn, gcall *call)
   expand_assignment (lhs, gimple_call_arg (call, 0), false);
 }
 
+#define expand_strided_store_optab_fn expand_scatter_store_optab_fn
+
 /* Expand {MASK_,}SCATTER_STORE{S,U} call CALL using optab OPTAB.  */
 
 static void
 expand_scatter_store_optab_fn (internal_fn, gcall *stmt, direct_optab optab)
 {
+  insn_code icode;
   internal_fn ifn = gimple_call_internal_fn (stmt);
   int rhs_index = internal_fn_stored_value_index (ifn);
   tree base = gimple_call_arg (stmt, 0);
   tree offset = gimple_call_arg (stmt, 1);
-  tree scale = gimple_call_arg (stmt, 2);
   tree rhs = gimple_call_arg (stmt, rhs_index);
 
   rtx base_rtx = expand_normal (base);
   rtx offset_rtx = expand_normal (offset);
-  HOST_WIDE_INT scale_int = tree_to_shwi (scale);
   rtx rhs_rtx = expand_normal (rhs);
 
   class expand_operand ops[8];
   int i = 0;
   create_address_operand (&ops[i++], base_rtx);
-  create_input_operand (&ops[i++], offset_rtx, TYPE_MODE (TREE_TYPE (offset)));
-  create_integer_operand (&ops[i++], TYPE_UNSIGNED (TREE_TYPE (offset)));
-  create_integer_operand (&ops[i++], scale_int);
+  if (internal_strided_fn_p (ifn))
+    {
+      create_address_operand (&ops[i++], offset_rtx);
+      icode = direct_optab_handler (optab, TYPE_MODE (TREE_TYPE (rhs)));
+    }
+  else
+    {
+      tree scale = gimple_call_arg (stmt, 2);
+      HOST_WIDE_INT scale_int = tree_to_shwi (scale);
+      create_input_operand (&ops[i++], offset_rtx,
+			    TYPE_MODE (TREE_TYPE (offset)));
+      create_integer_operand (&ops[i++], TYPE_UNSIGNED (TREE_TYPE (offset)));
+      create_integer_operand (&ops[i++], scale_int);
+      icode = convert_optab_handler (optab, TYPE_MODE (TREE_TYPE (rhs)),
+				     TYPE_MODE (TREE_TYPE (offset)));
+    }
   create_input_operand (&ops[i++], rhs_rtx, TYPE_MODE (TREE_TYPE (rhs)));
   i = add_mask_and_len_args (ops, i, stmt);
 
-  insn_code icode = convert_optab_handler (optab, TYPE_MODE (TREE_TYPE (rhs)),
-					   TYPE_MODE (TREE_TYPE (offset)));
   expand_insn (icode, i, ops);
 }
 
+#define expand_strided_load_optab_fn expand_gather_load_optab_fn
+
 /* Expand {MASK_,}GATHER_LOAD call CALL using optab OPTAB.  */
 
 static void
 expand_gather_load_optab_fn (internal_fn, gcall *stmt, direct_optab optab)
 {
+  insn_code icode;
+  internal_fn ifn = gimple_call_internal_fn (stmt);
   tree lhs = gimple_call_lhs (stmt);
   tree base = gimple_call_arg (stmt, 0);
   tree offset = gimple_call_arg (stmt, 1);
-  tree scale = gimple_call_arg (stmt, 2);
 
   rtx lhs_rtx = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE);
   rtx base_rtx = expand_normal (base);
   rtx offset_rtx = expand_normal (offset);
-  HOST_WIDE_INT scale_int = tree_to_shwi (scale);
 
   int i = 0;
   class expand_operand ops[8];
   create_output_operand (&ops[i++], lhs_rtx, TYPE_MODE (TREE_TYPE (lhs)));
   create_address_operand (&ops[i++], base_rtx);
-  create_input_operand (&ops[i++], offset_rtx, TYPE_MODE (TREE_TYPE (offset)));
-  create_integer_operand (&ops[i++], TYPE_UNSIGNED (TREE_TYPE (offset)));
-  create_integer_operand (&ops[i++], scale_int);
+  if (internal_strided_fn_p (ifn))
+    {
+      create_address_operand (&ops[i++], offset_rtx);
+      icode = direct_optab_handler (optab, TYPE_MODE (TREE_TYPE (lhs)));
+    }
+  else
+    {
+      tree scale = gimple_call_arg (stmt, 2);
+      HOST_WIDE_INT scale_int = tree_to_shwi (scale);
+      create_input_operand (&ops[i++], offset_rtx,
+			    TYPE_MODE (TREE_TYPE (offset)));
+      create_integer_operand (&ops[i++], TYPE_UNSIGNED (TREE_TYPE (offset)));
+      create_integer_operand (&ops[i++], scale_int);
+      icode = convert_optab_handler (optab, TYPE_MODE (TREE_TYPE (lhs)),
+				     TYPE_MODE (TREE_TYPE (offset)));
+    }
   i = add_mask_and_len_args (ops, i, stmt);
-  insn_code icode = convert_optab_handler (optab, TYPE_MODE (TREE_TYPE (lhs)),
-					   TYPE_MODE (TREE_TYPE (offset)));
   expand_insn (icode, i, ops);
   if (!rtx_equal_p (lhs_rtx, ops[0].value))
     emit_move_insn (lhs_rtx, ops[0].value);
@@ -4012,6 +4039,7 @@  multi_vector_optab_supported_p (convert_optab optab, tree_pair types,
 #define direct_load_lanes_optab_supported_p multi_vector_optab_supported_p
 #define direct_mask_load_lanes_optab_supported_p multi_vector_optab_supported_p
 #define direct_gather_load_optab_supported_p convert_optab_supported_p
+#define direct_strided_load_optab_supported_p direct_optab_supported_p
 #define direct_len_load_optab_supported_p direct_optab_supported_p
 #define direct_mask_len_load_optab_supported_p convert_optab_supported_p
 #define direct_mask_store_optab_supported_p convert_optab_supported_p
@@ -4020,6 +4048,7 @@  multi_vector_optab_supported_p (convert_optab optab, tree_pair types,
 #define direct_vec_cond_mask_optab_supported_p convert_optab_supported_p
 #define direct_vec_cond_optab_supported_p convert_optab_supported_p
 #define direct_scatter_store_optab_supported_p convert_optab_supported_p
+#define direct_strided_store_optab_supported_p direct_optab_supported_p
 #define direct_len_store_optab_supported_p direct_optab_supported_p
 #define direct_mask_len_store_optab_supported_p convert_optab_supported_p
 #define direct_while_optab_supported_p convert_optab_supported_p
@@ -4596,6 +4625,7 @@  internal_load_fn_p (internal_fn fn)
     case IFN_GATHER_LOAD:
     case IFN_MASK_GATHER_LOAD:
     case IFN_MASK_LEN_GATHER_LOAD:
+    case IFN_MASK_LEN_STRIDED_LOAD:
     case IFN_LEN_LOAD:
     case IFN_MASK_LEN_LOAD:
       return true;
@@ -4648,6 +4678,22 @@  internal_gather_scatter_fn_p (internal_fn fn)
     }
 }
 
+/* Return true if IFN is some form of strided load or strided store.  */
+
+bool
+internal_strided_fn_p (internal_fn fn)
+{
+  switch (fn)
+    {
+    case IFN_MASK_LEN_STRIDED_LOAD:
+    case IFN_MASK_LEN_STRIDED_STORE:
+      return true;
+
+    default:
+      return false;
+    }
+}
+
 /* If FN takes a vector len argument, return the index of that argument,
    otherwise return -1.  */
 
@@ -4683,6 +4729,8 @@  internal_fn_len_index (internal_fn fn)
     case IFN_COND_LEN_XOR:
     case IFN_COND_LEN_SHL:
     case IFN_COND_LEN_SHR:
+    case IFN_MASK_LEN_STRIDED_LOAD:
+    case IFN_MASK_LEN_STRIDED_STORE:
       return 4;
 
     case IFN_COND_LEN_NEG:
@@ -4776,6 +4824,10 @@  internal_fn_mask_index (internal_fn fn)
     case IFN_MASK_LEN_STORE:
       return 2;
 
+    case IFN_MASK_LEN_STRIDED_LOAD:
+    case IFN_MASK_LEN_STRIDED_STORE:
+      return 3;
+
     case IFN_MASK_GATHER_LOAD:
     case IFN_MASK_SCATTER_STORE:
     case IFN_MASK_LEN_GATHER_LOAD:
@@ -4796,6 +4848,9 @@  internal_fn_stored_value_index (internal_fn fn)
 {
   switch (fn)
     {
+    case IFN_MASK_LEN_STRIDED_STORE:
+      return 2;
+
     case IFN_MASK_STORE:
     case IFN_MASK_STORE_LANES:
     case IFN_SCATTER_STORE:
@@ -4845,6 +4900,24 @@  internal_gather_scatter_fn_supported_p (internal_fn ifn, tree vector_type,
 	  && insn_operand_matches (icode, 3 + output_ops, GEN_INT (scale)));
 }
 
+/* Return true if the target supports strided load or strided store function
+   IFN.  For loads, VECTOR_TYPE is the vector type of the load result,
+   while for stores it is the vector type of the stored data argument.
+   MEMORY_ELEMENT_TYPE is the type of the memory elements being loaded
+   or stored.  */
+
+bool
+internal_strided_fn_supported_p (internal_fn ifn, tree vector_type,
+				 tree memory_element_type)
+{
+  if (!tree_int_cst_equal (TYPE_SIZE (TREE_TYPE (vector_type)),
+			   TYPE_SIZE (memory_element_type)))
+    return false;
+  optab optab = direct_internal_fn_optab (ifn);
+  insn_code icode = direct_optab_handler (optab, TYPE_MODE (vector_type));
+  return icode != CODE_FOR_nothing;
+}
+
 /* Return true if the target supports IFN_CHECK_{RAW,WAR}_PTRS function IFN
    for pointers of type TYPE when the accesses have LENGTH bytes and their
    common byte alignment is ALIGN.  */
diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
index a2023ab9c3d..922359ed5cf 100644
--- a/gcc/internal-fn.def
+++ b/gcc/internal-fn.def
@@ -199,6 +199,8 @@  DEF_INTERNAL_OPTAB_FN (MASK_GATHER_LOAD, ECF_PURE,
 		       mask_gather_load, gather_load)
 DEF_INTERNAL_OPTAB_FN (MASK_LEN_GATHER_LOAD, ECF_PURE,
 		       mask_len_gather_load, gather_load)
+DEF_INTERNAL_OPTAB_FN (MASK_LEN_STRIDED_LOAD, ECF_PURE,
+		       mask_len_strided_load, strided_load)
 
 DEF_INTERNAL_OPTAB_FN (LEN_LOAD, ECF_PURE, len_load, len_load)
 DEF_INTERNAL_OPTAB_FN (MASK_LEN_LOAD, ECF_PURE, mask_len_load, mask_len_load)
@@ -208,6 +210,8 @@  DEF_INTERNAL_OPTAB_FN (MASK_SCATTER_STORE, 0,
 		       mask_scatter_store, scatter_store)
 DEF_INTERNAL_OPTAB_FN (MASK_LEN_SCATTER_STORE, 0,
 		       mask_len_scatter_store, scatter_store)
+DEF_INTERNAL_OPTAB_FN (MASK_LEN_STRIDED_STORE, 0,
+		       mask_len_strided_store, strided_store)
 
 DEF_INTERNAL_OPTAB_FN (MASK_STORE, 0, maskstore, mask_store)
 DEF_INTERNAL_OPTAB_FN (STORE_LANES, ECF_CONST, vec_store_lanes, store_lanes)
diff --git a/gcc/internal-fn.h b/gcc/internal-fn.h
index 7d72f4db2d0..ea6a7369f23 100644
--- a/gcc/internal-fn.h
+++ b/gcc/internal-fn.h
@@ -235,12 +235,14 @@  extern bool can_interpret_as_conditional_op_p (gimple *, tree *,
 extern bool internal_load_fn_p (internal_fn);
 extern bool internal_store_fn_p (internal_fn);
 extern bool internal_gather_scatter_fn_p (internal_fn);
+extern bool internal_strided_fn_p (internal_fn);
 extern int internal_fn_mask_index (internal_fn);
 extern int internal_fn_len_index (internal_fn);
 extern int internal_fn_else_index (internal_fn);
 extern int internal_fn_stored_value_index (internal_fn);
 extern bool internal_gather_scatter_fn_supported_p (internal_fn, tree,
 						    tree, tree, int);
+extern bool internal_strided_fn_supported_p (internal_fn, tree, tree);
 extern bool internal_check_ptrs_fn_supported_p (internal_fn, tree,
 						poly_uint64, unsigned int);
 #define VECT_PARTIAL_BIAS_UNSUPPORTED 127
diff --git a/gcc/optabs-query.cc b/gcc/optabs-query.cc
index 947ccef218c..860e744995b 100644
--- a/gcc/optabs-query.cc
+++ b/gcc/optabs-query.cc
@@ -670,14 +670,18 @@  supports_vec_convert_optab_p (optab op, machine_mode mode)
    for at least one vector mode.  */
 
 bool
-supports_vec_gather_load_p (machine_mode mode)
+supports_vec_gather_load_p (machine_mode mode, bool strided_p)
 {
   if (!this_fn_optabs->supports_vec_gather_load[mode])
     this_fn_optabs->supports_vec_gather_load[mode]
       = (supports_vec_convert_optab_p (gather_load_optab, mode)
-	 || supports_vec_convert_optab_p (mask_gather_load_optab, mode)
-	 || supports_vec_convert_optab_p (mask_len_gather_load_optab, mode)
-	 ? 1 : -1);
+	     || supports_vec_convert_optab_p (mask_gather_load_optab, mode)
+	     || supports_vec_convert_optab_p (mask_len_gather_load_optab, mode)
+	     || (strided_p
+		 && direct_optab_handler (mask_len_strided_load_optab, mode)
+		      != CODE_FOR_nothing)
+	   ? 1
+	   : -1);
 
   return this_fn_optabs->supports_vec_gather_load[mode] > 0;
 }
@@ -687,14 +691,19 @@  supports_vec_gather_load_p (machine_mode mode)
    for at least one vector mode.  */
 
 bool
-supports_vec_scatter_store_p (machine_mode mode)
+supports_vec_scatter_store_p (machine_mode mode, bool strided_p)
 {
   if (!this_fn_optabs->supports_vec_scatter_store[mode])
     this_fn_optabs->supports_vec_scatter_store[mode]
       = (supports_vec_convert_optab_p (scatter_store_optab, mode)
-	 || supports_vec_convert_optab_p (mask_scatter_store_optab, mode)
-	 || supports_vec_convert_optab_p (mask_len_scatter_store_optab, mode)
-	 ? 1 : -1);
+	     || supports_vec_convert_optab_p (mask_scatter_store_optab, mode)
+	     || supports_vec_convert_optab_p (mask_len_scatter_store_optab,
+					      mode)
+	     || (strided_p
+		 && direct_optab_handler (mask_len_strided_store_optab, mode)
+		      != CODE_FOR_nothing)
+	   ? 1
+	   : -1);
 
   return this_fn_optabs->supports_vec_scatter_store[mode] > 0;
 }
diff --git a/gcc/optabs-query.h b/gcc/optabs-query.h
index 920eb6a1b67..7c22edc5a78 100644
--- a/gcc/optabs-query.h
+++ b/gcc/optabs-query.h
@@ -191,8 +191,8 @@  bool can_compare_and_swap_p (machine_mode, bool);
 bool can_atomic_exchange_p (machine_mode, bool);
 bool can_atomic_load_p (machine_mode);
 bool lshift_cheap_p (bool);
-bool supports_vec_gather_load_p (machine_mode = E_VOIDmode);
-bool supports_vec_scatter_store_p (machine_mode = E_VOIDmode);
+bool supports_vec_gather_load_p (machine_mode = E_VOIDmode, bool = false);
+bool supports_vec_scatter_store_p (machine_mode = E_VOIDmode, bool = false);
 bool can_vec_extract (machine_mode, machine_mode);
 
 /* Version of find_widening_optab_handler_and_mode that operates on
diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc
index d5c9c4a11c2..8662a570417 100644
--- a/gcc/tree-vect-data-refs.cc
+++ b/gcc/tree-vect-data-refs.cc
@@ -3903,7 +3903,8 @@  vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
 
 /* Check whether we can use an internal function for a gather load
    or scatter store.  READ_P is true for loads and false for stores.
-   MASKED_P is true if the load or store is conditional.  MEMORY_TYPE is
+   MASKED_P is true if the load or store is conditional.  STRIDED_P
+   is true if the load or store is strided access.  MEMORY_TYPE is
    the type of the memory elements being loaded or stored.  OFFSET_TYPE
    is the type of the offset that is being applied to the invariant
    base address.  SCALE is the amount by which the offset should
@@ -3914,8 +3915,8 @@  vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
 
 bool
 vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
-			  tree vectype, tree memory_type, tree offset_type,
-			  int scale, internal_fn *ifn_out,
+			  bool strided_p, tree vectype, tree memory_type,
+			  tree offset_type, int scale, internal_fn *ifn_out,
 			  tree *offset_vectype_out)
 {
   unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
@@ -3926,7 +3927,7 @@  vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
     return false;
 
   /* Work out which function we need.  */
-  internal_fn ifn, alt_ifn, alt_ifn2;
+  internal_fn ifn, alt_ifn, alt_ifn2, alt_ifn3;
   if (read_p)
     {
       ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
@@ -3935,6 +3936,12 @@  vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
 	 use MASK_LEN_GATHER_LOAD regardless whether len and
 	 mask are valid or not.  */
       alt_ifn2 = IFN_MASK_LEN_GATHER_LOAD;
+      /* When target supports MASK_LEN_STRIDED_LOAD, we can relax the
+	 restrictions around the relationship of the vector offset type
+	 to the loaded by using a gather load with strided access.
+	 E.g. a "gather" of N bytes with a 64-bit stride would in principle
+	 be possible without needing an Nx64-bit vector offset type.  */
+      alt_ifn3 = IFN_MASK_LEN_STRIDED_LOAD;
     }
   else
     {
@@ -3944,10 +3951,26 @@  vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
 	 use MASK_LEN_SCATTER_STORE regardless whether len and
 	 mask are valid or not.  */
       alt_ifn2 = IFN_MASK_LEN_SCATTER_STORE;
+      /* When target supports MASK_LEN_STRIDED_STORE, we can relax the
+	 restrictions around the relationship of the vector offset type
+	 to the stored by using a scatter store with strided access.
+	 E.g. a "scatter" of N bytes with a 64-bit stride would in principle
+	 be possible without needing an Nx64-bit vector offset type.  */
+      alt_ifn3 = IFN_MASK_LEN_STRIDED_STORE;
     }
 
   for (;;)
     {
+      /* We don't need to check whether target supports gather/scatter IFN
+	 with expected vector offset for gather/scatter with a strided access
+	 when target itself support strided load/store IFN.  */
+      if (strided_p
+	  && internal_strided_fn_supported_p (alt_ifn3, vectype, memory_type))
+	{
+	  *ifn_out = alt_ifn3;
+	  return true;
+	}
+
       tree offset_vectype = get_vectype_for_scalar_type (vinfo, offset_type);
       if (!offset_vectype)
 	return false;
@@ -4030,6 +4053,7 @@  vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
   internal_fn ifn;
   tree offset_vectype;
   bool masked_p = false;
+  bool strided_p = STMT_VINFO_STRIDED_P (stmt_info);
 
   /* See whether this is already a call to a gather/scatter internal function.
      If not, see whether it's a masked load or store.  */
@@ -4197,12 +4221,14 @@  vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
 		 supports it for at least some offset type.  */
 	      if (use_ifn_p
 		  && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
-						masked_p, vectype, memory_type,
+						masked_p, strided_p,
+						vectype, memory_type,
 						signed_char_type_node,
 						new_scale, &ifn,
 						&offset_vectype)
 		  && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
-						masked_p, vectype, memory_type,
+						masked_p, strided_p,
+						vectype, memory_type,
 						unsigned_char_type_node,
 						new_scale, &ifn,
 						&offset_vectype))
@@ -4226,7 +4252,8 @@  vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
 	      && TREE_CODE (off) == SSA_NAME
 	      && !POINTER_TYPE_P (TREE_TYPE (off))
 	      && vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
-					   masked_p, vectype, memory_type,
+					   masked_p, strided_p,
+					   vectype, memory_type,
 					   TREE_TYPE (off), scale, &ifn,
 					   &offset_vectype))
 	    break;
@@ -4281,8 +4308,8 @@  vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
   if (use_ifn_p)
     {
       if (!vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr), masked_p,
-				     vectype, memory_type, offtype, scale,
-				     &ifn, &offset_vectype))
+				     strided_p, vectype, memory_type, offtype,
+				     scale, &ifn, &offset_vectype))
 	ifn = IFN_LAST;
       decl = NULL_TREE;
     }
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index f895aaf3083..907d0c0bcbb 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -1515,10 +1515,14 @@  check_load_store_for_partial_vectors (loop_vec_info loop_vinfo, tree vectype,
       internal_fn len_ifn = (is_load
 			     ? IFN_MASK_LEN_GATHER_LOAD
 			     : IFN_MASK_LEN_SCATTER_STORE);
-      if (internal_gather_scatter_fn_supported_p (len_ifn, vectype,
-						  gs_info->memory_type,
-						  gs_info->offset_vectype,
-						  gs_info->scale))
+      if (internal_strided_fn_p (gs_info->ifn)
+	  && internal_strided_fn_supported_p (gs_info->ifn, vectype,
+					      gs_info->memory_type))
+	vect_record_loop_len (loop_vinfo, lens, nvectors, vectype, 1);
+      else if (internal_gather_scatter_fn_supported_p (len_ifn, vectype,
+						       gs_info->memory_type,
+						       gs_info->offset_vectype,
+						       gs_info->scale))
 	vect_record_loop_len (loop_vinfo, lens, nvectors, vectype, 1);
       else if (internal_gather_scatter_fn_supported_p (ifn, vectype,
 						       gs_info->memory_type,
@@ -1703,6 +1707,7 @@  vect_truncate_gather_scatter_offset (stmt_vec_info stmt_info,
 	 no narrower than OFFSET_TYPE.  */
       tree memory_type = TREE_TYPE (DR_REF (dr));
       if (!vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr), masked_p,
+				     STMT_VINFO_STRIDED_P (stmt_info),
 				     vectype, memory_type, offset_type, scale,
 				     &gs_info->ifn, &gs_info->offset_vectype)
 	  || gs_info->ifn == IFN_LAST)
@@ -1743,6 +1748,15 @@  vect_use_strided_gather_scatters_p (stmt_vec_info stmt_info,
       || gs_info->ifn == IFN_LAST)
     return vect_truncate_gather_scatter_offset (stmt_info, loop_vinfo,
 						masked_p, gs_info);
+  else if (internal_strided_fn_p (gs_info->ifn))
+    {
+      if (dump_enabled_p ())
+	dump_printf_loc (MSG_NOTE, vect_location,
+			 "using strided IFN for strided/grouped access,"
+			 " scale = %d\n",
+			 gs_info->scale);
+      return true;
+    }
 
   tree old_offset_type = TREE_TYPE (gs_info->offset);
   tree new_offset_type = TREE_TYPE (gs_info->offset_vectype);
@@ -3012,6 +3026,14 @@  vect_get_strided_load_store_ops (stmt_vec_info stmt_info,
       *dataref_bump = cse_and_gimplify_to_preheader (loop_vinfo, bump);
     }
 
+  /* Target supports strided load/store use DR_STEP directly.  */
+  if (internal_strided_fn_p (gs_info->ifn))
+    {
+      *vec_offset = cse_and_gimplify_to_preheader (loop_vinfo,
+						   unshare_expr (DR_STEP (dr)));
+      return;
+    }
+
   /* The offset given in GS_INFO can have pointer type, so use the element
      type of the vector instead.  */
   tree offset_type = TREE_TYPE (gs_info->offset_vectype);
@@ -9130,7 +9152,7 @@  vectorizable_store (vec_info *vinfo,
 		vec_offset = vec_offsets[j];
 	      tree scale = size_int (gs_info.scale);
 
-	      if (gs_info.ifn == IFN_MASK_LEN_SCATTER_STORE)
+	      if (internal_fn_len_index (gs_info.ifn) >= 0)
 		{
 		  if (loop_lens)
 		    final_len = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
@@ -9150,10 +9172,18 @@  vectorizable_store (vec_info *vinfo,
 
 	      gcall *call;
 	      if (final_len && final_mask)
-		call = gimple_build_call_internal (IFN_MASK_LEN_SCATTER_STORE,
-						   7, dataref_ptr, vec_offset,
-						   scale, vec_oprnd, final_mask,
-						   final_len, bias);
+		{
+		  if (internal_strided_fn_p (gs_info.ifn))
+		    call
+		      = gimple_build_call_internal (IFN_MASK_LEN_STRIDED_STORE,
+						    6, dataref_ptr, vec_offset,
+						    vec_oprnd, final_mask,
+						    final_len, bias);
+		  else
+		    call = gimple_build_call_internal (
+		      IFN_MASK_LEN_SCATTER_STORE, 7, dataref_ptr, vec_offset,
+		      scale, vec_oprnd, final_mask, final_len, bias);
+		}
 	      else if (final_mask)
 		call
 		  = gimple_build_call_internal (IFN_MASK_SCATTER_STORE, 5,
@@ -10955,7 +10985,7 @@  vectorizable_load (vec_info *vinfo,
 		  tree zero = build_zero_cst (vectype);
 		  tree scale = size_int (gs_info.scale);
 
-		  if (gs_info.ifn == IFN_MASK_LEN_GATHER_LOAD)
+		  if (internal_fn_len_index (gs_info.ifn) >= 0)
 		    {
 		      if (loop_lens)
 			final_len
@@ -10978,11 +11008,16 @@  vectorizable_load (vec_info *vinfo,
 
 		  gcall *call;
 		  if (final_len && final_mask)
-		    call
-		      = gimple_build_call_internal (IFN_MASK_LEN_GATHER_LOAD, 7,
-						    dataref_ptr, vec_offset,
-						    scale, zero, final_mask,
-						    final_len, bias);
+		    {
+		      if (internal_strided_fn_p (gs_info.ifn))
+			call = gimple_build_call_internal (
+			  IFN_MASK_LEN_STRIDED_LOAD, 6, dataref_ptr, vec_offset,
+			  zero, final_mask, final_len, bias);
+		      else
+			call = gimple_build_call_internal (
+			  IFN_MASK_LEN_GATHER_LOAD, 7, dataref_ptr, vec_offset,
+			  scale, zero, final_mask, final_len, bias);
+		    }
 		  else if (final_mask)
 		    call = gimple_build_call_internal (IFN_MASK_GATHER_LOAD, 5,
 						       dataref_ptr, vec_offset,
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 254d172231d..4d9e9799470 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -2309,7 +2309,7 @@  extern opt_result vect_analyze_data_refs_alignment (loop_vec_info);
 extern bool vect_slp_analyze_instance_alignment (vec_info *, slp_instance);
 extern opt_result vect_analyze_data_ref_accesses (vec_info *, vec<int> *);
 extern opt_result vect_prune_runtime_alias_test_list (loop_vec_info);
-extern bool vect_gather_scatter_fn_p (vec_info *, bool, bool, tree, tree,
+extern bool vect_gather_scatter_fn_p (vec_info *, bool, bool, bool, tree, tree,
 				      tree, int, internal_fn *, tree *);
 extern bool vect_check_gather_scatter (stmt_vec_info, loop_vec_info,
 				       gather_scatter_info *);