This patch looks odd to me.
I don't see memrefs in the trunk code.
Also, I prefer listing all costs in the cost tune info for NF = 2 ~ 8 like ARM SVE does:
  /* Detect cases in which a vector load or store represents an
     LD[234] or ST[234] instruction.  */
  switch (aarch64_ld234_st234_vectors (kind, stmt_info))
    {
    case 2:
      stmt_cost += simd_costs->ld2_st2_permute_cost;
      break;
    case 3:
      stmt_cost += simd_costs->ld3_st3_permute_cost;
      break;
    case 4:
      stmt_cost += simd_costs->ld4_st4_permute_cost;
      break;
    }
juzhe.zhong@rivai.ai
From: Robin Dapp
Date: 2024-02-26 23:54
To: gcc-patches; palmer; Kito Cheng; juzhe.zhong@rivai.ai
CC: rdapp.gcc; jeffreyalaw
Subject: [PATCH] RISC-V: Add initial cost handling for segment loads/stores.
Hi,
This has been sitting on my local tree - I've been wanting to post it
for a while but somehow forgot.
This patch makes segment loads and stores more expensive. It adds
segment_load and segment_store cost fields to the common vector costs
and adds handling to adjust_stmt_cost. In the future we could handle
this in a more fine-grained manner but let's start somehow.
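
To make concrete what gets costed here: an interleaved group access like
the following (a made-up example, not from the patch or testsuite) is
vectorized via VMAT_LOAD_STORE_LANES, i.e. RVV segment instructions such
as vlseg2e32.v/vsseg2e32.v, with one vector load/store statement per
vector in the group:

/* Hypothetical example; with -march=rv64gcv this loop can use
   group-size-2 segment loads/stores (vlseg2e32.v/vsseg2e32.v).  */
void
scale_pairs (float *dst, const float *src, int n)
{
  for (int i = 0; i < n; i++)
    {
      dst[2 * i] = src[2 * i] * 2.0f;         /* field 0 */
      dst[2 * i + 1] = src[2 * i + 1] + 1.0f; /* field 1 */
    }
}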
Regtested on rv64.
Regards
Robin
gcc/ChangeLog:
* config/riscv/riscv-protos.h (struct common_vector_cost): Add
segment_[load/store]_cost.
* config/riscv/riscv-vector-costs.cc (costs::adjust_stmt_cost):
Handle segment loads/stores.
* config/riscv/riscv.cc: Initialize segment_[load/store]_cost
to 1.
gcc/testsuite/ChangeLog:
* gcc.dg/vect/costmodel/riscv/rvv/pr113112-4.c: Expect m4
instead of m2.
---
gcc/config/riscv/riscv-protos.h | 4 +
gcc/config/riscv/riscv-vector-costs.cc | 127 ++++++++++++------
gcc/config/riscv/riscv.cc | 4 +
.../vect/costmodel/riscv/rvv/pr113112-4.c | 4 +-
4 files changed, 95 insertions(+), 44 deletions(-)
diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
index 80efdf2b7e5..2e8ab9990a8 100644
--- a/gcc/config/riscv/riscv-protos.h
+++ b/gcc/config/riscv/riscv-protos.h
@@ -218,6 +218,10 @@ struct common_vector_cost
const int gather_load_cost;
const int scatter_store_cost;
+ /* Segment load/store cost. */
+ const int segment_load_cost;
+ const int segment_store_cost;
+
/* Cost of a vector-to-scalar operation. */
const int vec_to_scalar_cost;
diff --git a/gcc/config/riscv/riscv-vector-costs.cc b/gcc/config/riscv/riscv-vector-costs.cc
index adf9c197df5..d3c12444773 100644
--- a/gcc/config/riscv/riscv-vector-costs.cc
+++ b/gcc/config/riscv/riscv-vector-costs.cc
@@ -1043,6 +1043,24 @@ costs::better_main_loop_than_p (const vector_costs *uncast_other) const
return vector_costs::better_main_loop_than_p (other);
}
+/* Return the group size, i.e. the number of vectors to be loaded by a
+   segmented load/store instruction.  Return 0 if it is not a segmented
+   load/store.  */
+static int
+segment_loadstore_group_size (enum vect_cost_for_stmt kind,
+ stmt_vec_info stmt_info)
+{
+ if ((kind == vector_load || kind == vector_store)
+ && STMT_VINFO_DATA_REF (stmt_info))
+ {
+ stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
+ if (stmt_info
+ && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_LOAD_STORE_LANES)
+ return DR_GROUP_SIZE (stmt_info);
+ }
+ return 0;
+}
+
/* Adjust vectorization cost after calling riscv_builtin_vectorization_cost.
For some statement, we would like to further fine-grain tweak the cost on
top of riscv_builtin_vectorization_cost handling which doesn't have any
@@ -1067,55 +1085,80 @@ costs::adjust_stmt_cost (enum vect_cost_for_stmt kind, loop_vec_info loop,
case vector_load:
case vector_store:
{
- /* Unit-stride vector loads and stores do not have offset addressing
- as opposed to scalar loads and stores.
- If the address depends on a variable we need an additional
- add/sub for each load/store in the worst case. */
- if (stmt_info && stmt_info->stmt)
+ if (stmt_info && stmt_info->stmt && STMT_VINFO_DATA_REF (stmt_info))
{
- data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
- class loop *father = stmt_info->stmt->bb->loop_father;
- if (!loop && father && !father->inner && father->superloops)
+ int group_size;
+ if ((group_size
+ = segment_loadstore_group_size (kind, stmt_info)) > 1)
{
- tree ref;
- if (TREE_CODE (dr->ref) != MEM_REF
- || !(ref = TREE_OPERAND (dr->ref, 0))
- || TREE_CODE (ref) != SSA_NAME)
- break;
+ /* Segment loads and stores. When the group size is > 1
+ the vectorizer will add a vector load/store statement for
+ each vector in the group. Note that STMT_COST is
+ overwritten here rather than adjusted. */
+ if (riscv_v_ext_vector_mode_p (loop->vector_mode))
+ stmt_cost
+ = (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info))
+ ? costs->vla->segment_load_cost
+ : costs->vla->segment_store_cost);
+ else
+ stmt_cost
+ = (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info))
+ ? costs->vls->segment_load_cost
+ : costs->vls->segment_store_cost);
+ break;
+ /* TODO: Indexed and ordered/unordered cost. */
+ }
+ else
+ {
+ /* Unit-stride vector loads and stores do not have offset
+ addressing as opposed to scalar loads and stores.
+ If the address depends on a variable we need an additional
+ add/sub for each load/store in the worst case. */
+ data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
+ class loop *father = stmt_info->stmt->bb->loop_father;
+ if (!loop && father && !father->inner && father->superloops)
+ {
+ tree ref;
+ if (TREE_CODE (dr->ref) != MEM_REF
+ || !(ref = TREE_OPERAND (dr->ref, 0))
+ || TREE_CODE (ref) != SSA_NAME)
+ break;
- if (SSA_NAME_IS_DEFAULT_DEF (ref))
- break;
+ if (SSA_NAME_IS_DEFAULT_DEF (ref))
+ break;
- if (memrefs.contains ({ref, cst0}))
- break;
+ if (memrefs.contains ({ref, cst0}))
+ break;
- memrefs.add ({ref, cst0});
+ memrefs.add ({ref, cst0});
- /* In case we have not seen REF before and the base address
- is a pointer operation try a bit harder. */
- tree base = DR_BASE_ADDRESS (dr);
- if (TREE_CODE (base) == POINTER_PLUS_EXPR
- || TREE_CODE (base) == POINTER_DIFF_EXPR)
- {
- /* Deconstruct BASE's first operand. If it is a binary
- operation, i.e. a base and an "offset" store this
- pair. Only increase the stmt_cost if we haven't seen
- it before. */
- tree argp = TREE_OPERAND (base, 1);
- typedef std::pair<tree, tree> addr_pair;
- addr_pair pair;
- if (TREE_CODE_CLASS (TREE_CODE (argp)) == tcc_binary)
+ /* In case we have not seen REF before and the base
+ address is a pointer operation try a bit harder. */
+ tree base = DR_BASE_ADDRESS (dr);
+ if (TREE_CODE (base) == POINTER_PLUS_EXPR
+ || TREE_CODE (base) == POINTER_DIFF_EXPR)
{
- tree argp0 = tree_strip_nop_conversions
- (TREE_OPERAND (argp, 0));
- tree argp1 = TREE_OPERAND (argp, 1);
- pair = addr_pair (argp0, argp1);
- if (memrefs.contains (pair))
- break;
-
- memrefs.add (pair);
- stmt_cost += builtin_vectorization_cost (scalar_stmt,
- NULL_TREE, 0);
+ /* Deconstruct BASE's first operand. If it is a
+ binary operation, i.e. a base and an "offset"
+ store this pair. Only increase the stmt_cost if
+ we haven't seen it before. */
+ tree argp = TREE_OPERAND (base, 1);
+ typedef std::pair<tree, tree> addr_pair;
+ addr_pair pair;
+ if (TREE_CODE_CLASS (TREE_CODE (argp)) == tcc_binary)
+ {
+ tree argp0 = tree_strip_nop_conversions
+ (TREE_OPERAND (argp, 0));
+ tree argp1 = TREE_OPERAND (argp, 1);
+ pair = addr_pair (argp0, argp1);
+ if (memrefs.contains (pair))
+ break;
+
+ memrefs.add (pair);
+ stmt_cost
+ += builtin_vectorization_cost (scalar_stmt,
+ NULL_TREE, 0);
+ }
}
}
}
diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index 5e984ee2a55..6c2f0ec34f4 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -365,6 +365,8 @@ static const common_vector_cost rvv_vls_vector_cost = {
1, /* fp_stmt_cost */
1, /* gather_load_cost */
1, /* scatter_store_cost */
+ 2, /* segment_load_cost */
+ 2, /* segment_store_cost */
1, /* vec_to_scalar_cost */
1, /* scalar_to_vec_cost */
1, /* permute_cost */
@@ -381,6 +383,8 @@ static const scalable_vector_cost rvv_vla_vector_cost = {
1, /* fp_stmt_cost */
1, /* gather_load_cost */
1, /* scatter_store_cost */
+ 2, /* segment_load_cost */
+ 2, /* segment_store_cost */
1, /* vec_to_scalar_cost */
1, /* scalar_to_vec_cost */
1, /* permute_cost */
diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113112-4.c b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113112-4.c
index 5c55a66ed77..bdeedaf5224 100644
--- a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113112-4.c
+++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113112-4.c
@@ -21,8 +21,8 @@ void move_replacements (rtx *x, rtx *y, int n_replacements)
}
}
-/* { dg-final { scan-assembler {e64,m2} } } */
-/* { dg-final { scan-assembler-not {e64,m4} } } */
+/* { dg-final { scan-assembler-not {e64,m2} } } */
+/* { dg-final { scan-assembler {e64,m4} } } */
/* { dg-final { scan-assembler-not {jr} } } */
/* { dg-final { scan-assembler {ret} } } */
/* { dg-final { scan-assembler-not {sp} } } */
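
The flipped expectations above can be checked along these lines (compiler
name and flags are typical cross-compile values, not taken from the
test's dg options):

  riscv64-unknown-linux-gnu-gcc -O3 -march=rv64gcv -mabi=lp64d -S \
    pr113112-4.c -o - | grep -E 'e64,m[24]'

With the higher segment load/store cost the vectorizer settles on LMUL 4
for this loop, so the vsetvli configurations should show e64,m4 and no
e64,m2.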
> This patch looks odd to me.
> I don't see memrefs in the trunk code.

It's on top of the vle/vse offset handling patch from a while back
that I haven't committed yet.

> Also, I prefer listing all costs in the cost tune info for NF = 2 ~ 8
> like ARM SVE does:

I don't mind having separate costs for each but I figured they scale
anyway with the number of vectors already.  Attached v2 is more
similar to aarch64.

Regards
Robin

Subject: [PATCH v2] RISC-V: Add initial cost handling for segment
loads/stores.

This patch makes segment loads and stores more expensive.  It adds
segment_permute_2 (as well as 4 and 8) cost fields to the common vector
costs and adds handling to adjust_stmt_cost.

gcc/ChangeLog:

	* config/riscv/riscv-protos.h (struct common_vector_cost): Add
	segment_permute cost.
	* config/riscv/riscv-vector-costs.cc (costs::adjust_stmt_cost):
	Handle segment loads/stores.
	* config/riscv/riscv.cc: Initialize segment_permute_[248] to 1.
---
 gcc/config/riscv/riscv-protos.h        |   5 +
 gcc/config/riscv/riscv-vector-costs.cc | 139 +++++++++++++++++--------
 gcc/config/riscv/riscv.cc              |   6 ++
 3 files changed, 108 insertions(+), 42 deletions(-)

diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
index 80efdf2b7e5..9b737aca1a3 100644
--- a/gcc/config/riscv/riscv-protos.h
+++ b/gcc/config/riscv/riscv-protos.h
@@ -218,6 +218,11 @@ struct common_vector_cost
   const int gather_load_cost;
   const int scatter_store_cost;

+  /* Segment load/store permute cost.  */
+  const int segment_permute_2;
+  const int segment_permute_4;
+  const int segment_permute_8;
+
   /* Cost of a vector-to-scalar operation.  */
   const int vec_to_scalar_cost;

diff --git a/gcc/config/riscv/riscv-vector-costs.cc b/gcc/config/riscv/riscv-vector-costs.cc
index adf9c197df5..c8178d71101 100644
--- a/gcc/config/riscv/riscv-vector-costs.cc
+++ b/gcc/config/riscv/riscv-vector-costs.cc
@@ -1043,6 +1043,25 @@ costs::better_main_loop_than_p (const vector_costs *uncast_other) const
   return vector_costs::better_main_loop_than_p (other);
 }

+/* Return the group size, i.e. the number of vectors to be loaded by a
+   segmented load/store instruction.  Return 0 if it is not a segmented
+   load/store.  */
+static int
+segment_loadstore_group_size (enum vect_cost_for_stmt kind,
+			      stmt_vec_info stmt_info)
+{
+  if (stmt_info
+      && (kind == vector_load || kind == vector_store)
+      && STMT_VINFO_DATA_REF (stmt_info))
+    {
+      stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
+      if (stmt_info
+	  && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_LOAD_STORE_LANES)
+	return DR_GROUP_SIZE (stmt_info);
+    }
+  return 0;
+}
+
 /* Adjust vectorization cost after calling riscv_builtin_vectorization_cost.
    For some statement, we would like to further fine-grain tweak the cost on
    top of riscv_builtin_vectorization_cost handling which doesn't have any
@@ -1067,55 +1086,91 @@ costs::adjust_stmt_cost (enum vect_cost_for_stmt kind, loop_vec_info loop,
     case vector_load:
     case vector_store:
 	{
-	  /* Unit-stride vector loads and stores do not have offset addressing
-	     as opposed to scalar loads and stores.
-	     If the address depends on a variable we need an additional
-	     add/sub for each load/store in the worst case.  */
-	  if (stmt_info && stmt_info->stmt)
+	  if (stmt_info && stmt_info->stmt && STMT_VINFO_DATA_REF (stmt_info))
 	    {
-	      data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
-	      class loop *father = stmt_info->stmt->bb->loop_father;
-	      if (!loop && father && !father->inner && father->superloops)
+	      /* Segment loads and stores.  When the group size is > 1
+		 the vectorizer will add a vector load/store statement for
+		 each vector in the group.  Here we additionally add permute
+		 costs for each.  */
+	      /* TODO: Indexed and ordered/unordered cost.  */
+	      int group_size = segment_loadstore_group_size (kind, stmt_info);
+	      if (group_size > 1)
+		{
+		  switch (group_size)
+		    {
+		    case 2:
+		      if (riscv_v_ext_vector_mode_p (loop->vector_mode))
+			stmt_cost += costs->vla->segment_permute_2;
+		      else
+			stmt_cost += costs->vls->segment_permute_2;
+		      break;
+		    case 4:
+		      if (riscv_v_ext_vector_mode_p (loop->vector_mode))
+			stmt_cost += costs->vla->segment_permute_4;
+		      else
+			stmt_cost += costs->vls->segment_permute_4;
+		      break;
+		    case 8:
+		      if (riscv_v_ext_vector_mode_p (loop->vector_mode))
+			stmt_cost += costs->vla->segment_permute_8;
+		      else
+			stmt_cost += costs->vls->segment_permute_8;
+		      break;
+		    default:
+		      break;
+		    }
+		}
+	      else
+		{
-		  tree ref;
-		  if (TREE_CODE (dr->ref) != MEM_REF
-		      || !(ref = TREE_OPERAND (dr->ref, 0))
-		      || TREE_CODE (ref) != SSA_NAME)
-		    break;
+		  /* Unit-stride vector loads and stores do not have offset
+		     addressing as opposed to scalar loads and stores.
+		     If the address depends on a variable we need an additional
+		     add/sub for each load/store in the worst case.  */
+		  data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
+		  class loop *father = stmt_info->stmt->bb->loop_father;
+		  if (!loop && father && !father->inner && father->superloops)
+		    {
+		      tree ref;
+		      if (TREE_CODE (dr->ref) != MEM_REF
+			  || !(ref = TREE_OPERAND (dr->ref, 0))
+			  || TREE_CODE (ref) != SSA_NAME)
+			break;

-		  if (SSA_NAME_IS_DEFAULT_DEF (ref))
-		    break;
+		      if (SSA_NAME_IS_DEFAULT_DEF (ref))
+			break;

-		  if (memrefs.contains ({ref, cst0}))
-		    break;
+		      if (memrefs.contains ({ref, cst0}))
+			break;

-		  memrefs.add ({ref, cst0});
+		      memrefs.add ({ref, cst0});

-		  /* In case we have not seen REF before and the base address
-		     is a pointer operation try a bit harder.  */
-		  tree base = DR_BASE_ADDRESS (dr);
-		  if (TREE_CODE (base) == POINTER_PLUS_EXPR
-		      || TREE_CODE (base) == POINTER_DIFF_EXPR)
-		    {
-		      /* Deconstruct BASE's first operand.  If it is a binary
-			 operation, i.e. a base and an "offset" store this
-			 pair.  Only increase the stmt_cost if we haven't seen
-			 it before.  */
-		      tree argp = TREE_OPERAND (base, 1);
-		      typedef std::pair<tree, tree> addr_pair;
-		      addr_pair pair;
-		      if (TREE_CODE_CLASS (TREE_CODE (argp)) == tcc_binary)
+		      /* In case we have not seen REF before and the base
+			 address is a pointer operation try a bit harder.  */
+		      tree base = DR_BASE_ADDRESS (dr);
+		      if (TREE_CODE (base) == POINTER_PLUS_EXPR
+			  || TREE_CODE (base) == POINTER_DIFF_EXPR)
 			{
-			  tree argp0 = tree_strip_nop_conversions
-			    (TREE_OPERAND (argp, 0));
-			  tree argp1 = TREE_OPERAND (argp, 1);
-			  pair = addr_pair (argp0, argp1);
-			  if (memrefs.contains (pair))
-			    break;
-
-			  memrefs.add (pair);
-			  stmt_cost += builtin_vectorization_cost (scalar_stmt,
-								   NULL_TREE, 0);
+			  /* Deconstruct BASE's first operand.  If it is a
+			     binary operation, i.e. a base and an "offset"
+			     store this pair.  Only increase the stmt_cost if
+			     we haven't seen it before.  */
+			  tree argp = TREE_OPERAND (base, 1);
+			  typedef std::pair<tree, tree> addr_pair;
+			  addr_pair pair;
+			  if (TREE_CODE_CLASS (TREE_CODE (argp)) == tcc_binary)
+			    {
+			      tree argp0 = tree_strip_nop_conversions
+				(TREE_OPERAND (argp, 0));
+			      tree argp1 = TREE_OPERAND (argp, 1);
+			      pair = addr_pair (argp0, argp1);
+			      if (memrefs.contains (pair))
+				break;
+
+			      memrefs.add (pair);
+			      stmt_cost
+				+= builtin_vectorization_cost (scalar_stmt,
+							       NULL_TREE, 0);
+			    }
 			}
 		    }
 		}
diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index 5e984ee2a55..141278ec35e 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -365,6 +365,9 @@ static const common_vector_cost rvv_vls_vector_cost = {
   1, /* fp_stmt_cost */
   1, /* gather_load_cost */
   1, /* scatter_store_cost */
+  1, /* segment_permute (2) */
+  1, /* segment_permute (4) */
+  1, /* segment_permute (8) */
   1, /* vec_to_scalar_cost */
   1, /* scalar_to_vec_cost */
   1, /* permute_cost */
@@ -381,6 +384,9 @@ static const scalable_vector_cost rvv_vla_vector_cost = {
   1, /* fp_stmt_cost */
   1, /* gather_load_cost */
   1, /* scatter_store_cost */
+  1, /* segment_permute (2) */
+  1, /* segment_permute (4) */
+  1, /* segment_permute (8) */
   1, /* vec_to_scalar_cost */
   1, /* scalar_to_vec_cost */
   1, /* permute_cost */
+  /* Segment load/store permute cost.  */
+  const int segment_permute_2;
+  const int segment_permute_4;
+  const int segment_permute_8;

Why do we only have 2/4/8?  I think we should have 2/3/4/5/6/7/8.

juzhe.zhong@rivai.ai

From: Robin Dapp
Date: 2024-02-28 05:27
To: juzhe.zhong@rivai.ai; gcc-patches; palmer; kito.cheng
CC: rdapp.gcc; jeffreyalaw
Subject: Re: [PATCH] RISC-V: Add initial cost handling for segment loads/stores.

> This patch looks odd to me.
> I don't see memrefs in the trunk code.

It's on top of the vle/vse offset handling patch from a while back
that I haven't committed yet.

> Also, I prefer listing all costs in the cost tune info for NF = 2 ~ 8
> like ARM SVE does:

I don't mind having separate costs for each but I figured they scale
anyway with the number of vectors already.  Attached v2 is more
similar to aarch64.

Regards
Robin
> +  /* Segment load/store permute cost.  */
> +  const int segment_permute_2;
> +  const int segment_permute_4;
> +  const int segment_permute_8;
>
> Why do we only have 2/4/8?  I think we should have 2/3/4/5/6/7/8.

No idea why I posted that (wrong) version, I used it for some testing
locally.  Attached is the proper version, still called it v3...

Regards
Robin

Subject: [PATCH v3] RISC-V: Add initial cost handling for segment
loads/stores.

This patch makes segment loads and stores more expensive.  It adds
segment_permute_2 as well as 3 to 8 cost fields to the common vector
costs and adds handling to adjust_stmt_cost.

gcc/ChangeLog:

	* config/riscv/riscv-protos.h (struct common_vector_cost): Add
	segment_permute cost.
	* config/riscv/riscv-vector-costs.cc (costs::adjust_stmt_cost):
	Handle segment loads/stores.
	* config/riscv/riscv.cc: Initialize segment_permute_[2-8] to 1.
---
 gcc/config/riscv/riscv-protos.h        |   9 ++
 gcc/config/riscv/riscv-vector-costs.cc | 163 ++++++++++++++++++-------
 gcc/config/riscv/riscv.cc              |  14 +++
 3 files changed, 144 insertions(+), 42 deletions(-)

diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
index 80efdf2b7e5..90d1fcbb3b1 100644
--- a/gcc/config/riscv/riscv-protos.h
+++ b/gcc/config/riscv/riscv-protos.h
@@ -218,6 +218,15 @@ struct common_vector_cost
   const int gather_load_cost;
   const int scatter_store_cost;

+  /* Segment load/store permute cost.  */
+  const int segment_permute_2;
+  const int segment_permute_3;
+  const int segment_permute_4;
+  const int segment_permute_5;
+  const int segment_permute_6;
+  const int segment_permute_7;
+  const int segment_permute_8;
+
   /* Cost of a vector-to-scalar operation.  */
   const int vec_to_scalar_cost;

diff --git a/gcc/config/riscv/riscv-vector-costs.cc b/gcc/config/riscv/riscv-vector-costs.cc
index adf9c197df5..f4da213fe14 100644
--- a/gcc/config/riscv/riscv-vector-costs.cc
+++ b/gcc/config/riscv/riscv-vector-costs.cc
@@ -1043,6 +1043,25 @@ costs::better_main_loop_than_p (const vector_costs *uncast_other) const
   return vector_costs::better_main_loop_than_p (other);
 }

+/* Return the group size, i.e. the number of vectors to be loaded by a
+   segmented load/store instruction.  Return 0 if it is not a segmented
+   load/store.  */
+static int
+segment_loadstore_group_size (enum vect_cost_for_stmt kind,
+			      stmt_vec_info stmt_info)
+{
+  if (stmt_info
+      && (kind == vector_load || kind == vector_store)
+      && STMT_VINFO_DATA_REF (stmt_info))
+    {
+      stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
+      if (stmt_info
+	  && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_LOAD_STORE_LANES)
+	return DR_GROUP_SIZE (stmt_info);
+    }
+  return 0;
+}
+
 /* Adjust vectorization cost after calling riscv_builtin_vectorization_cost.
    For some statement, we would like to further fine-grain tweak the cost on
    top of riscv_builtin_vectorization_cost handling which doesn't have any
@@ -1067,55 +1086,115 @@ costs::adjust_stmt_cost (enum vect_cost_for_stmt kind, loop_vec_info loop,
     case vector_load:
     case vector_store:
 	{
-	  /* Unit-stride vector loads and stores do not have offset addressing
-	     as opposed to scalar loads and stores.
-	     If the address depends on a variable we need an additional
-	     add/sub for each load/store in the worst case.  */
-	  if (stmt_info && stmt_info->stmt)
+	  if (stmt_info && stmt_info->stmt && STMT_VINFO_DATA_REF (stmt_info))
 	    {
-	      data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
-	      class loop *father = stmt_info->stmt->bb->loop_father;
-	      if (!loop && father && !father->inner && father->superloops)
+	      /* Segment loads and stores.  When the group size is > 1
+		 the vectorizer will add a vector load/store statement for
+		 each vector in the group.  Here we additionally add permute
+		 costs for each.  */
+	      /* TODO: Indexed and ordered/unordered cost.  */
+	      int group_size = segment_loadstore_group_size (kind, stmt_info);
+	      if (group_size > 1)
+		{
+		  switch (group_size)
+		    {
+		    case 2:
+		      if (riscv_v_ext_vector_mode_p (loop->vector_mode))
+			stmt_cost += costs->vla->segment_permute_2;
+		      else
+			stmt_cost += costs->vls->segment_permute_2;
+		      break;
+		    case 3:
+		      if (riscv_v_ext_vector_mode_p (loop->vector_mode))
+			stmt_cost += costs->vla->segment_permute_3;
+		      else
+			stmt_cost += costs->vls->segment_permute_3;
+		      break;
+		    case 4:
+		      if (riscv_v_ext_vector_mode_p (loop->vector_mode))
+			stmt_cost += costs->vla->segment_permute_4;
+		      else
+			stmt_cost += costs->vls->segment_permute_4;
+		      break;
+		    case 5:
+		      if (riscv_v_ext_vector_mode_p (loop->vector_mode))
+			stmt_cost += costs->vla->segment_permute_5;
+		      else
+			stmt_cost += costs->vls->segment_permute_5;
+		      break;
+		    case 6:
+		      if (riscv_v_ext_vector_mode_p (loop->vector_mode))
+			stmt_cost += costs->vla->segment_permute_6;
+		      else
+			stmt_cost += costs->vls->segment_permute_6;
+		      break;
+		    case 7:
+		      if (riscv_v_ext_vector_mode_p (loop->vector_mode))
+			stmt_cost += costs->vla->segment_permute_7;
+		      else
+			stmt_cost += costs->vls->segment_permute_7;
+		      break;
+		    case 8:
+		      if (riscv_v_ext_vector_mode_p (loop->vector_mode))
+			stmt_cost += costs->vla->segment_permute_8;
+		      else
+			stmt_cost += costs->vls->segment_permute_8;
+		      break;
+		    default:
+		      break;
+		    }
+		}
+	      else
+		{
-		  tree ref;
-		  if (TREE_CODE (dr->ref) != MEM_REF
-		      || !(ref = TREE_OPERAND (dr->ref, 0))
-		      || TREE_CODE (ref) != SSA_NAME)
-		    break;
+		  /* Unit-stride vector loads and stores do not have offset
+		     addressing as opposed to scalar loads and stores.
+		     If the address depends on a variable we need an additional
+		     add/sub for each load/store in the worst case.  */
+		  data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
+		  class loop *father = stmt_info->stmt->bb->loop_father;
+		  if (!loop && father && !father->inner && father->superloops)
+		    {
+		      tree ref;
+		      if (TREE_CODE (dr->ref) != MEM_REF
+			  || !(ref = TREE_OPERAND (dr->ref, 0))
+			  || TREE_CODE (ref) != SSA_NAME)
+			break;

-		  if (SSA_NAME_IS_DEFAULT_DEF (ref))
-		    break;
+		      if (SSA_NAME_IS_DEFAULT_DEF (ref))
+			break;

-		  if (memrefs.contains ({ref, cst0}))
-		    break;
+		      if (memrefs.contains ({ref, cst0}))
+			break;

-		  memrefs.add ({ref, cst0});
+		      memrefs.add ({ref, cst0});

-		  /* In case we have not seen REF before and the base address
-		     is a pointer operation try a bit harder.  */
-		  tree base = DR_BASE_ADDRESS (dr);
-		  if (TREE_CODE (base) == POINTER_PLUS_EXPR
-		      || TREE_CODE (base) == POINTER_DIFF_EXPR)
-		    {
-		      /* Deconstruct BASE's first operand.  If it is a binary
-			 operation, i.e. a base and an "offset" store this
-			 pair.  Only increase the stmt_cost if we haven't seen
-			 it before.  */
-		      tree argp = TREE_OPERAND (base, 1);
-		      typedef std::pair<tree, tree> addr_pair;
-		      addr_pair pair;
-		      if (TREE_CODE_CLASS (TREE_CODE (argp)) == tcc_binary)
+		      /* In case we have not seen REF before and the base
+			 address is a pointer operation try a bit harder.  */
+		      tree base = DR_BASE_ADDRESS (dr);
+		      if (TREE_CODE (base) == POINTER_PLUS_EXPR
+			  || TREE_CODE (base) == POINTER_DIFF_EXPR)
 			{
-			  tree argp0 = tree_strip_nop_conversions
-			    (TREE_OPERAND (argp, 0));
-			  tree argp1 = TREE_OPERAND (argp, 1);
-			  pair = addr_pair (argp0, argp1);
-			  if (memrefs.contains (pair))
-			    break;
-
-			  memrefs.add (pair);
-			  stmt_cost += builtin_vectorization_cost (scalar_stmt,
-								   NULL_TREE, 0);
+			  /* Deconstruct BASE's first operand.  If it is a
+			     binary operation, i.e. a base and an "offset"
+			     store this pair.  Only increase the stmt_cost if
+			     we haven't seen it before.  */
+			  tree argp = TREE_OPERAND (base, 1);
+			  typedef std::pair<tree, tree> addr_pair;
+			  addr_pair pair;
+			  if (TREE_CODE_CLASS (TREE_CODE (argp)) == tcc_binary)
+			    {
+			      tree argp0 = tree_strip_nop_conversions
+				(TREE_OPERAND (argp, 0));
+			      tree argp1 = TREE_OPERAND (argp, 1);
+			      pair = addr_pair (argp0, argp1);
+			      if (memrefs.contains (pair))
+				break;
+
+			      memrefs.add (pair);
+			      stmt_cost
+				+= builtin_vectorization_cost (scalar_stmt,
+							       NULL_TREE, 0);
+			    }
 			}
 		    }
 		}
diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index 5e984ee2a55..874a42873d7 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -365,6 +365,13 @@ static const common_vector_cost rvv_vls_vector_cost = {
   1, /* fp_stmt_cost */
   1, /* gather_load_cost */
   1, /* scatter_store_cost */
+  1, /* segment_permute (2) */
+  1, /* segment_permute (3) */
+  1, /* segment_permute (4) */
+  1, /* segment_permute (5) */
+  1, /* segment_permute (6) */
+  1, /* segment_permute (7) */
+  1, /* segment_permute (8) */
   1, /* vec_to_scalar_cost */
   1, /* scalar_to_vec_cost */
   1, /* permute_cost */
@@ -381,6 +388,13 @@ static const scalable_vector_cost rvv_vla_vector_cost = {
   1, /* fp_stmt_cost */
   1, /* gather_load_cost */
   1, /* scatter_store_cost */
+  1, /* segment_permute (2) */
+  1, /* segment_permute (3) */
+  1, /* segment_permute (4) */
+  1, /* segment_permute (5) */
+  1, /* segment_permute (6) */
+  1, /* segment_permute (7) */
+  1, /* segment_permute (8) */
   1, /* vec_to_scalar_cost */
   1, /* scalar_to_vec_cost */
   1, /* permute_cost */
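
As a side note on the shape of this code (a sketch only, under the
assumption such a helper is acceptable — segment_permute_cost is
hypothetical and not part of the posted patch): the repeated vla/vls
selection in every case could be hoisted out by mapping the group size
to the field once:

/* Hypothetical helper: return the per-NF segment permute cost of
   COSTS for GROUP_SIZE, or 0 if the size is out of range.  */
static int
segment_permute_cost (const common_vector_cost *costs, int group_size)
{
  switch (group_size)
    {
    case 2: return costs->segment_permute_2;
    case 3: return costs->segment_permute_3;
    case 4: return costs->segment_permute_4;
    case 5: return costs->segment_permute_5;
    case 6: return costs->segment_permute_6;
    case 7: return costs->segment_permute_7;
    case 8: return costs->segment_permute_8;
    default: return 0;
    }
}

The whole switch in adjust_stmt_cost would then collapse into one
statement that picks costs->vla or costs->vls based on
riscv_v_ext_vector_mode_p.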
Could you rebase to the trunk?  I don't think the segment load/store
cost depends on the previous patch you sent.

juzhe.zhong@rivai.ai

From: Robin Dapp
Date: 2024-03-01 23:07
To: 钟居哲; gcc-patches; palmer; kito.cheng
CC: rdapp.gcc; Jeff Law
Subject: Re: [PATCH] RISC-V: Add initial cost handling for segment loads/stores.

> +  /* Segment load/store permute cost.  */
> +  const int segment_permute_2;
> +  const int segment_permute_4;
> +  const int segment_permute_8;
>
> Why do we only have 2/4/8?  I think we should have 2/3/4/5/6/7/8.

No idea why I posted that (wrong) version, I used it for some testing
locally.  Attached is the proper version, still called it v3...

Regards
Robin
On 3/1/24 8:07 AM, Robin Dapp wrote:
>> +  /* Segment load/store permute cost.  */
>> +  const int segment_permute_2;
>> +  const int segment_permute_4;
>> +  const int segment_permute_8;
>>
>> Why do we only have 2/4/8?  I think we should have 2/3/4/5/6/7/8.
>
> No idea why I posted that (wrong) version, I used it for
> some testing locally.  Attached is the proper version, still
> called it v3...
>
> Regards
> Robin
>
> Subject: [PATCH v3] RISC-V: Add initial cost handling for segment
> loads/stores.
>
> This patch makes segment loads and stores more expensive.  It adds
> segment_permute_2 as well as 3 to 8 cost fields to the common vector
> costs and adds handling to adjust_stmt_cost.
>
> gcc/ChangeLog:
>
> 	* config/riscv/riscv-protos.h (struct common_vector_cost): Add
> 	segment_permute cost.
> 	* config/riscv/riscv-vector-costs.cc (costs::adjust_stmt_cost):
> 	Handle segment loads/stores.
> 	* config/riscv/riscv.cc: Initialize segment_permute_[2-8] to 1.

So where do we stand with this?  Juzhe asked it to be rebased, but I
don't see a rebased version in my inbox and I don't see anything that
looks like this on the trunk.

jeff
> So where do we stand with this?  Juzhe asked it to be rebased, but I
> don't see a rebased version in my inbox and I don't see anything that
> looks like this on the trunk.

I missed this one and figured as we're pretty late in the cycle it can
wait until GCC 15.  Therefore let's call it "deferred".

Regards
Robin
I think it's harmless to let this patch into GCC 14.
So LGTM from my side to land this patch in GCC 14.

juzhe.zhong@rivai.ai

From: Robin Dapp
Date: 2024-03-26 01:07
To: Jeff Law; 钟居哲; gcc-patches; palmer; kito.cheng
CC: rdapp.gcc
Subject: Re: [PATCH] RISC-V: Add initial cost handling for segment loads/stores.

> So where do we stand with this?  Juzhe asked it to be rebased, but I
> don't see a rebased version in my inbox and I don't see anything that
> looks like this on the trunk.

I missed this one and figured as we're pretty late in the cycle it can
wait until GCC 15.  Therefore let's call it "deferred".

Regards
Robin
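
Once a core where segment shuffles really do get slower with more fields
needs tuning, the seven separate fields pay off; an invented fragment of
such a cost table (values purely illustrative, no real uarch measured):

  1, /* gather_load_cost */
  1, /* scatter_store_cost */
  2, /* segment_permute (2) */
  2, /* segment_permute (3) */
  3, /* segment_permute (4) */
  3, /* segment_permute (5) */
  4, /* segment_permute (6) */
  4, /* segment_permute (7) */
  5, /* segment_permute (8) */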