Message ID | PAWPR08MB8982FFA5F943B02C53858EAD83F8A@PAWPR08MB8982.eurprd08.prod.outlook.com |
---|---|
State | New |
Headers | show |
Series | [v2] AArch64: Fix strict-align cpymem/setmem [PR103100] | expand |
ping v2: Use UINTVAL, rename max_mops_size. The cpymemdi/setmemdi implementation doesn't fully support strict alignment. Block the expansion if the alignment is less than 16 with STRICT_ALIGNMENT. Clean up the condition when to use MOPS. Passes regress/bootstrap, OK for commit? gcc/ChangeLog/ PR target/103100 * config/aarch64/aarch64.md (cpymemdi): Remove pattern condition. (setmemdi): Likewise. * config/aarch64/aarch64.cc (aarch64_expand_cpymem): Support strict-align. Cleanup condition for using MOPS. (aarch64_expand_setmem): Likewise. --- diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc index dd6874d13a75f20d10a244578afc355b25c73da2..8a12894d6b80de1031d6e7d02dca680c57bce136 100644 --- a/gcc/config/aarch64/aarch64.cc +++ b/gcc/config/aarch64/aarch64.cc @@ -25261,27 +25261,23 @@ aarch64_expand_cpymem (rtx *operands) int mode_bits; rtx dst = operands[0]; rtx src = operands[1]; + unsigned align = UINTVAL (operands[3]); rtx base; machine_mode cur_mode = BLKmode; + bool size_p = optimize_function_for_size_p (cfun); - /* Variable-sized memcpy can go through the MOPS expansion if available. */ - if (!CONST_INT_P (operands[2])) + /* Variable-sized or strict-align copies may use the MOPS expansion. */ + if (!CONST_INT_P (operands[2]) || (STRICT_ALIGNMENT && align < 16)) return aarch64_expand_cpymem_mops (operands); - unsigned HOST_WIDE_INT size = INTVAL (operands[2]); - - /* Try to inline up to 256 bytes or use the MOPS threshold if available. */ - unsigned HOST_WIDE_INT max_copy_size - = TARGET_MOPS ? aarch64_mops_memcpy_size_threshold : 256; + unsigned HOST_WIDE_INT size = UINTVAL (operands[2]); - bool size_p = optimize_function_for_size_p (cfun); + /* Try to inline up to 256 bytes. */ + unsigned max_copy_size = 256; + unsigned mops_threshold = aarch64_mops_memcpy_size_threshold; - /* Large constant-sized cpymem should go through MOPS when possible. - It should be a win even for size optimization in the general case. - For speed optimization the choice between MOPS and the SIMD sequence - depends on the size of the copy, rather than number of instructions, - alignment etc. */ - if (size > max_copy_size) + /* Large copies use MOPS when available or a library call. */ + if (size > max_copy_size || (TARGET_MOPS && size > mops_threshold)) return aarch64_expand_cpymem_mops (operands); int copy_bits = 256; @@ -25445,12 +25441,13 @@ aarch64_expand_setmem (rtx *operands) unsigned HOST_WIDE_INT len; rtx dst = operands[0]; rtx val = operands[2], src; + unsigned align = UINTVAL (operands[3]); rtx base; machine_mode cur_mode = BLKmode, next_mode; - /* If we don't have SIMD registers or the size is variable use the MOPS - inlined sequence if possible. */ - if (!CONST_INT_P (operands[1]) || !TARGET_SIMD) + /* Variable-sized or strict-align memset may use the MOPS expansion. */ + if (!CONST_INT_P (operands[1]) || !TARGET_SIMD + || (STRICT_ALIGNMENT && align < 16)) return aarch64_expand_setmem_mops (operands); bool size_p = optimize_function_for_size_p (cfun); @@ -25458,10 +25455,13 @@ aarch64_expand_setmem (rtx *operands) /* Default the maximum to 256-bytes when considering only libcall vs SIMD broadcast sequence. */ unsigned max_set_size = 256; + unsigned mops_threshold = aarch64_mops_memset_size_threshold; - len = INTVAL (operands[1]); - if (len > max_set_size && !TARGET_MOPS) - return false; + len = UINTVAL (operands[1]); + + /* Large memset uses MOPS when available or a library call. */ + if (len > max_set_size || (TARGET_MOPS && len > mops_threshold)) + return aarch64_expand_setmem_mops (operands); int cst_val = !!(CONST_INT_P (val) && (INTVAL (val) != 0)); /* The MOPS sequence takes: @@ -25474,12 +25474,6 @@ aarch64_expand_setmem (rtx *operands) the arguments + 1 for the call. */ unsigned libcall_cost = 4; - /* Upper bound check. For large constant-sized setmem use the MOPS sequence - when available. */ - if (TARGET_MOPS - && len >= (unsigned HOST_WIDE_INT) aarch64_mops_memset_size_threshold) - return aarch64_expand_setmem_mops (operands); - /* Attempt a sequence with a vector broadcast followed by stores. Count the number of operations involved to see if it's worth it against the alternatives. A simple counter simd_ops on the @@ -25521,10 +25515,8 @@ aarch64_expand_setmem (rtx *operands) simd_ops++; n -= mode_bits; - /* Do certain trailing copies as overlapping if it's going to be - cheaper. i.e. less instructions to do so. For instance doing a 15 - byte copy it's more efficient to do two overlapping 8 byte copies than - 8 + 4 + 2 + 1. Only do this when -mstrict-align is not supplied. */ + /* Emit trailing writes using overlapping unaligned accesses + (when !STRICT_ALIGNMENT) - this is smaller and faster. */ if (n > 0 && n < copy_limit / 2 && !STRICT_ALIGNMENT) { next_mode = smallest_mode_for_size (n, MODE_INT); diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md index 6d0f072a9dd6d094e8764a513222a9129d8296fa..96508a2580876d1fdbdfa6c67d1a3d02608c1d24 100644 --- a/gcc/config/aarch64/aarch64.md +++ b/gcc/config/aarch64/aarch64.md @@ -1627,7 +1627,7 @@ (define_expand "cpymemdi" (match_operand:BLK 1 "memory_operand") (match_operand:DI 2 "general_operand") (match_operand:DI 3 "immediate_operand")] - "!STRICT_ALIGNMENT || TARGET_MOPS" + "" { if (aarch64_expand_cpymem (operands)) DONE; @@ -1724,7 +1724,7 @@ (define_expand "setmemdi" (match_operand:QI 2 "nonmemory_operand")) ;; Value (use (match_operand:DI 1 "general_operand")) ;; Length (match_operand 3 "immediate_operand")] ;; Align - "TARGET_SIMD || TARGET_MOPS" + "" { if (aarch64_expand_setmem (operands)) DONE;
ping v2: Use UINTVAL, rename max_mops_size. The cpymemdi/setmemdi implementation doesn't fully support strict alignment. Block the expansion if the alignment is less than 16 with STRICT_ALIGNMENT. Clean up the condition when to use MOPS. Passes regress/bootstrap, OK for commit? gcc/ChangeLog/ PR target/103100 * config/aarch64/aarch64.md (cpymemdi): Remove pattern condition. (setmemdi): Likewise. * config/aarch64/aarch64.cc (aarch64_expand_cpymem): Support strict-align. Cleanup condition for using MOPS. (aarch64_expand_setmem): Likewise. --- diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc index dd6874d13a75f20d10a244578afc355b25c73da2..8a12894d6b80de1031d6e7d02dca680c57bce136 100644 --- a/gcc/config/aarch64/aarch64.cc +++ b/gcc/config/aarch64/aarch64.cc @@ -25261,27 +25261,23 @@ aarch64_expand_cpymem (rtx *operands) int mode_bits; rtx dst = operands[0]; rtx src = operands[1]; + unsigned align = UINTVAL (operands[3]); rtx base; machine_mode cur_mode = BLKmode; + bool size_p = optimize_function_for_size_p (cfun); - /* Variable-sized memcpy can go through the MOPS expansion if available. */ - if (!CONST_INT_P (operands[2])) + /* Variable-sized or strict-align copies may use the MOPS expansion. */ + if (!CONST_INT_P (operands[2]) || (STRICT_ALIGNMENT && align < 16)) return aarch64_expand_cpymem_mops (operands); - unsigned HOST_WIDE_INT size = INTVAL (operands[2]); - - /* Try to inline up to 256 bytes or use the MOPS threshold if available. */ - unsigned HOST_WIDE_INT max_copy_size - = TARGET_MOPS ? aarch64_mops_memcpy_size_threshold : 256; + unsigned HOST_WIDE_INT size = UINTVAL (operands[2]); - bool size_p = optimize_function_for_size_p (cfun); + /* Try to inline up to 256 bytes. */ + unsigned max_copy_size = 256; + unsigned mops_threshold = aarch64_mops_memcpy_size_threshold; - /* Large constant-sized cpymem should go through MOPS when possible. - It should be a win even for size optimization in the general case. - For speed optimization the choice between MOPS and the SIMD sequence - depends on the size of the copy, rather than number of instructions, - alignment etc. */ - if (size > max_copy_size) + /* Large copies use MOPS when available or a library call. */ + if (size > max_copy_size || (TARGET_MOPS && size > mops_threshold)) return aarch64_expand_cpymem_mops (operands); int copy_bits = 256; @@ -25445,12 +25441,13 @@ aarch64_expand_setmem (rtx *operands) unsigned HOST_WIDE_INT len; rtx dst = operands[0]; rtx val = operands[2], src; + unsigned align = UINTVAL (operands[3]); rtx base; machine_mode cur_mode = BLKmode, next_mode; - /* If we don't have SIMD registers or the size is variable use the MOPS - inlined sequence if possible. */ - if (!CONST_INT_P (operands[1]) || !TARGET_SIMD) + /* Variable-sized or strict-align memset may use the MOPS expansion. */ + if (!CONST_INT_P (operands[1]) || !TARGET_SIMD + || (STRICT_ALIGNMENT && align < 16)) return aarch64_expand_setmem_mops (operands); bool size_p = optimize_function_for_size_p (cfun); @@ -25458,10 +25455,13 @@ aarch64_expand_setmem (rtx *operands) /* Default the maximum to 256-bytes when considering only libcall vs SIMD broadcast sequence. */ unsigned max_set_size = 256; + unsigned mops_threshold = aarch64_mops_memset_size_threshold; - len = INTVAL (operands[1]); - if (len > max_set_size && !TARGET_MOPS) - return false; + len = UINTVAL (operands[1]); + + /* Large memset uses MOPS when available or a library call. */ + if (len > max_set_size || (TARGET_MOPS && len > mops_threshold)) + return aarch64_expand_setmem_mops (operands); int cst_val = !!(CONST_INT_P (val) && (INTVAL (val) != 0)); /* The MOPS sequence takes: @@ -25474,12 +25474,6 @@ aarch64_expand_setmem (rtx *operands) the arguments + 1 for the call. */ unsigned libcall_cost = 4; - /* Upper bound check. For large constant-sized setmem use the MOPS sequence - when available. */ - if (TARGET_MOPS - && len >= (unsigned HOST_WIDE_INT) aarch64_mops_memset_size_threshold) - return aarch64_expand_setmem_mops (operands); - /* Attempt a sequence with a vector broadcast followed by stores. Count the number of operations involved to see if it's worth it against the alternatives. A simple counter simd_ops on the @@ -25521,10 +25515,8 @@ aarch64_expand_setmem (rtx *operands) simd_ops++; n -= mode_bits; - /* Do certain trailing copies as overlapping if it's going to be - cheaper. i.e. less instructions to do so. For instance doing a 15 - byte copy it's more efficient to do two overlapping 8 byte copies than - 8 + 4 + 2 + 1. Only do this when -mstrict-align is not supplied. */ + /* Emit trailing writes using overlapping unaligned accesses + (when !STRICT_ALIGNMENT) - this is smaller and faster. */ if (n > 0 && n < copy_limit / 2 && !STRICT_ALIGNMENT) { next_mode = smallest_mode_for_size (n, MODE_INT); diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md index 6d0f072a9dd6d094e8764a513222a9129d8296fa..96508a2580876d1fdbdfa6c67d1a3d02608c1d24 100644 --- a/gcc/config/aarch64/aarch64.md +++ b/gcc/config/aarch64/aarch64.md @@ -1627,7 +1627,7 @@ (define_expand "cpymemdi" (match_operand:BLK 1 "memory_operand") (match_operand:DI 2 "general_operand") (match_operand:DI 3 "immediate_operand")] - "!STRICT_ALIGNMENT || TARGET_MOPS" + "" { if (aarch64_expand_cpymem (operands)) DONE; @@ -1724,7 +1724,7 @@ (define_expand "setmemdi" (match_operand:QI 2 "nonmemory_operand")) ;; Value (use (match_operand:DI 1 "general_operand")) ;; Length (match_operand 3 "immediate_operand")] ;; Align - "TARGET_SIMD || TARGET_MOPS" + "" { if (aarch64_expand_setmem (operands)) DONE;
Wilco Dijkstra <Wilco.Dijkstra@arm.com> writes: > v2: Use UINTVAL, rename max_mops_size. > > The cpymemdi/setmemdi implementation doesn't fully support strict alignment. > Block the expansion if the alignment is less than 16 with STRICT_ALIGNMENT. > Clean up the condition when to use MOPS. > > Passes regress/bootstrap, OK for commit? > > gcc/ChangeLog/ > PR target/103100 > * config/aarch64/aarch64.md (cpymemdi): Remove pattern condition. > (setmemdi): Likewise. > * config/aarch64/aarch64.cc (aarch64_expand_cpymem): Support > strict-align. Cleanup condition for using MOPS. > (aarch64_expand_setmem): Likewise. > > --- > > diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc > index dd6874d13a75f20d10a244578afc355b25c73da2..8a12894d6b80de1031d6e7d02dca680c57bce136 100644 > --- a/gcc/config/aarch64/aarch64.cc > +++ b/gcc/config/aarch64/aarch64.cc > @@ -25261,27 +25261,23 @@ aarch64_expand_cpymem (rtx *operands) > int mode_bits; > rtx dst = operands[0]; > rtx src = operands[1]; > + unsigned align = UINTVAL (operands[3]); > rtx base; > machine_mode cur_mode = BLKmode; > + bool size_p = optimize_function_for_size_p (cfun); > > - /* Variable-sized memcpy can go through the MOPS expansion if available. */ > - if (!CONST_INT_P (operands[2])) > + /* Variable-sized or strict-align copies may use the MOPS expansion. */ > + if (!CONST_INT_P (operands[2]) || (STRICT_ALIGNMENT && align < 16)) > return aarch64_expand_cpymem_mops (operands); > > - unsigned HOST_WIDE_INT size = INTVAL (operands[2]); > - > - /* Try to inline up to 256 bytes or use the MOPS threshold if available. */ > - unsigned HOST_WIDE_INT max_copy_size > - = TARGET_MOPS ? aarch64_mops_memcpy_size_threshold : 256; > + unsigned HOST_WIDE_INT size = UINTVAL (operands[2]); > > - bool size_p = optimize_function_for_size_p (cfun); > + /* Try to inline up to 256 bytes. */ > + unsigned max_copy_size = 256; > + unsigned mops_threshold = aarch64_mops_memcpy_size_threshold; > > - /* Large constant-sized cpymem should go through MOPS when possible. > - It should be a win even for size optimization in the general case. > - For speed optimization the choice between MOPS and the SIMD sequence > - depends on the size of the copy, rather than number of instructions, > - alignment etc. */ > - if (size > max_copy_size) > + /* Large copies use MOPS when available or a library call. */ > + if (size > max_copy_size || (TARGET_MOPS && size > mops_threshold)) > return aarch64_expand_cpymem_mops (operands); It feels a little unintuitive to be calling aarch64_expand_cpymem_mops for !TARGET_MOPS, but that's pre-existing, and I can see there are arguments both ways. Although !TARGET_SIMD is a niche interest on current trunk, it becomes important for streaming-compatible mode. So we might want to look again at the different handling of !TARGET_SIMD in this function (where we lower the copy size but not the threshold) and aarch64_expand_setmem (where we bail out early). That's not something for this patch though, just mentioning it. The patch is OK with me, but please give Richard E a day to object. Thanks, Richard > > int copy_bits = 256; > @@ -25445,12 +25441,13 @@ aarch64_expand_setmem (rtx *operands) > unsigned HOST_WIDE_INT len; > rtx dst = operands[0]; > rtx val = operands[2], src; > + unsigned align = UINTVAL (operands[3]); > rtx base; > machine_mode cur_mode = BLKmode, next_mode; > > - /* If we don't have SIMD registers or the size is variable use the MOPS > - inlined sequence if possible. */ > - if (!CONST_INT_P (operands[1]) || !TARGET_SIMD) > + /* Variable-sized or strict-align memset may use the MOPS expansion. */ > + if (!CONST_INT_P (operands[1]) || !TARGET_SIMD > + || (STRICT_ALIGNMENT && align < 16)) > return aarch64_expand_setmem_mops (operands); > > bool size_p = optimize_function_for_size_p (cfun); > @@ -25458,10 +25455,13 @@ aarch64_expand_setmem (rtx *operands) > /* Default the maximum to 256-bytes when considering only libcall vs > SIMD broadcast sequence. */ > unsigned max_set_size = 256; > + unsigned mops_threshold = aarch64_mops_memset_size_threshold; > > - len = INTVAL (operands[1]); > - if (len > max_set_size && !TARGET_MOPS) > - return false; > + len = UINTVAL (operands[1]); > + > + /* Large memset uses MOPS when available or a library call. */ > + if (len > max_set_size || (TARGET_MOPS && len > mops_threshold)) > + return aarch64_expand_setmem_mops (operands); > > int cst_val = !!(CONST_INT_P (val) && (INTVAL (val) != 0)); > /* The MOPS sequence takes: > @@ -25474,12 +25474,6 @@ aarch64_expand_setmem (rtx *operands) > the arguments + 1 for the call. */ > unsigned libcall_cost = 4; > > - /* Upper bound check. For large constant-sized setmem use the MOPS sequence > - when available. */ > - if (TARGET_MOPS > - && len >= (unsigned HOST_WIDE_INT) aarch64_mops_memset_size_threshold) > - return aarch64_expand_setmem_mops (operands); > - > /* Attempt a sequence with a vector broadcast followed by stores. > Count the number of operations involved to see if it's worth it > against the alternatives. A simple counter simd_ops on the > @@ -25521,10 +25515,8 @@ aarch64_expand_setmem (rtx *operands) > simd_ops++; > n -= mode_bits; > > - /* Do certain trailing copies as overlapping if it's going to be > - cheaper. i.e. less instructions to do so. For instance doing a 15 > - byte copy it's more efficient to do two overlapping 8 byte copies than > - 8 + 4 + 2 + 1. Only do this when -mstrict-align is not supplied. */ > + /* Emit trailing writes using overlapping unaligned accesses > + (when !STRICT_ALIGNMENT) - this is smaller and faster. */ > if (n > 0 && n < copy_limit / 2 && !STRICT_ALIGNMENT) > { > next_mode = smallest_mode_for_size (n, MODE_INT); > diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md > index 6d0f072a9dd6d094e8764a513222a9129d8296fa..96508a2580876d1fdbdfa6c67d1a3d02608c1d24 100644 > --- a/gcc/config/aarch64/aarch64.md > +++ b/gcc/config/aarch64/aarch64.md > @@ -1627,7 +1627,7 @@ (define_expand "cpymemdi" > (match_operand:BLK 1 "memory_operand") > (match_operand:DI 2 "general_operand") > (match_operand:DI 3 "immediate_operand")] > - "!STRICT_ALIGNMENT || TARGET_MOPS" > + "" > { > if (aarch64_expand_cpymem (operands)) > DONE; > @@ -1724,7 +1724,7 @@ (define_expand "setmemdi" > (match_operand:QI 2 "nonmemory_operand")) ;; Value > (use (match_operand:DI 1 "general_operand")) ;; Length > (match_operand 3 "immediate_operand")] ;; Align > - "TARGET_SIMD || TARGET_MOPS" > + "" > { > if (aarch64_expand_setmem (operands)) > DONE;
On 29/11/2023 18:09, Richard Sandiford wrote: > Wilco Dijkstra <Wilco.Dijkstra@arm.com> writes: >> v2: Use UINTVAL, rename max_mops_size. >> >> The cpymemdi/setmemdi implementation doesn't fully support strict alignment. >> Block the expansion if the alignment is less than 16 with STRICT_ALIGNMENT. >> Clean up the condition when to use MOPS. >> >> Passes regress/bootstrap, OK for commit? >> >> gcc/ChangeLog/ >> PR target/103100 >> * config/aarch64/aarch64.md (cpymemdi): Remove pattern condition. >> (setmemdi): Likewise. >> * config/aarch64/aarch64.cc (aarch64_expand_cpymem): Support >> strict-align. Cleanup condition for using MOPS. >> (aarch64_expand_setmem): Likewise. >> >> --- >> >> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc >> index dd6874d13a75f20d10a244578afc355b25c73da2..8a12894d6b80de1031d6e7d02dca680c57bce136 100644 >> --- a/gcc/config/aarch64/aarch64.cc >> +++ b/gcc/config/aarch64/aarch64.cc >> @@ -25261,27 +25261,23 @@ aarch64_expand_cpymem (rtx *operands) >> int mode_bits; >> rtx dst = operands[0]; >> rtx src = operands[1]; >> + unsigned align = UINTVAL (operands[3]); >> rtx base; >> machine_mode cur_mode = BLKmode; >> + bool size_p = optimize_function_for_size_p (cfun); >> >> - /* Variable-sized memcpy can go through the MOPS expansion if available. */ >> - if (!CONST_INT_P (operands[2])) >> + /* Variable-sized or strict-align copies may use the MOPS expansion. */ >> + if (!CONST_INT_P (operands[2]) || (STRICT_ALIGNMENT && align < 16)) >> return aarch64_expand_cpymem_mops (operands); >> >> - unsigned HOST_WIDE_INT size = INTVAL (operands[2]); >> - >> - /* Try to inline up to 256 bytes or use the MOPS threshold if available. */ >> - unsigned HOST_WIDE_INT max_copy_size >> - = TARGET_MOPS ? aarch64_mops_memcpy_size_threshold : 256; >> + unsigned HOST_WIDE_INT size = UINTVAL (operands[2]); >> >> - bool size_p = optimize_function_for_size_p (cfun); >> + /* Try to inline up to 256 bytes. */ >> + unsigned max_copy_size = 256; >> + unsigned mops_threshold = aarch64_mops_memcpy_size_threshold; >> >> - /* Large constant-sized cpymem should go through MOPS when possible. >> - It should be a win even for size optimization in the general case. >> - For speed optimization the choice between MOPS and the SIMD sequence >> - depends on the size of the copy, rather than number of instructions, >> - alignment etc. */ >> - if (size > max_copy_size) >> + /* Large copies use MOPS when available or a library call. */ >> + if (size > max_copy_size || (TARGET_MOPS && size > mops_threshold)) >> return aarch64_expand_cpymem_mops (operands); > > It feels a little unintuitive to be calling aarch64_expand_cpymem_mops > for !TARGET_MOPS, but that's pre-existing, and I can see there are > arguments both ways. > > Although !TARGET_SIMD is a niche interest on current trunk, it becomes > important for streaming-compatible mode. So we might want to look > again at the different handling of !TARGET_SIMD in this function (where > we lower the copy size but not the threshold) and aarch64_expand_setmem > (where we bail out early). That's not something for this patch though, > just mentioning it. > > The patch is OK with me, but please give Richard E a day to object. This is fine by me. R. > > Thanks, > Richard > >> >> int copy_bits = 256; >> @@ -25445,12 +25441,13 @@ aarch64_expand_setmem (rtx *operands) >> unsigned HOST_WIDE_INT len; >> rtx dst = operands[0]; >> rtx val = operands[2], src; >> + unsigned align = UINTVAL (operands[3]); >> rtx base; >> machine_mode cur_mode = BLKmode, next_mode; >> >> - /* If we don't have SIMD registers or the size is variable use the MOPS >> - inlined sequence if possible. */ >> - if (!CONST_INT_P (operands[1]) || !TARGET_SIMD) >> + /* Variable-sized or strict-align memset may use the MOPS expansion. */ >> + if (!CONST_INT_P (operands[1]) || !TARGET_SIMD >> + || (STRICT_ALIGNMENT && align < 16)) >> return aarch64_expand_setmem_mops (operands); >> >> bool size_p = optimize_function_for_size_p (cfun); >> @@ -25458,10 +25455,13 @@ aarch64_expand_setmem (rtx *operands) >> /* Default the maximum to 256-bytes when considering only libcall vs >> SIMD broadcast sequence. */ >> unsigned max_set_size = 256; >> + unsigned mops_threshold = aarch64_mops_memset_size_threshold; >> >> - len = INTVAL (operands[1]); >> - if (len > max_set_size && !TARGET_MOPS) >> - return false; >> + len = UINTVAL (operands[1]); >> + >> + /* Large memset uses MOPS when available or a library call. */ >> + if (len > max_set_size || (TARGET_MOPS && len > mops_threshold)) >> + return aarch64_expand_setmem_mops (operands); >> >> int cst_val = !!(CONST_INT_P (val) && (INTVAL (val) != 0)); >> /* The MOPS sequence takes: >> @@ -25474,12 +25474,6 @@ aarch64_expand_setmem (rtx *operands) >> the arguments + 1 for the call. */ >> unsigned libcall_cost = 4; >> >> - /* Upper bound check. For large constant-sized setmem use the MOPS sequence >> - when available. */ >> - if (TARGET_MOPS >> - && len >= (unsigned HOST_WIDE_INT) aarch64_mops_memset_size_threshold) >> - return aarch64_expand_setmem_mops (operands); >> - >> /* Attempt a sequence with a vector broadcast followed by stores. >> Count the number of operations involved to see if it's worth it >> against the alternatives. A simple counter simd_ops on the >> @@ -25521,10 +25515,8 @@ aarch64_expand_setmem (rtx *operands) >> simd_ops++; >> n -= mode_bits; >> >> - /* Do certain trailing copies as overlapping if it's going to be >> - cheaper. i.e. less instructions to do so. For instance doing a 15 >> - byte copy it's more efficient to do two overlapping 8 byte copies than >> - 8 + 4 + 2 + 1. Only do this when -mstrict-align is not supplied. */ >> + /* Emit trailing writes using overlapping unaligned accesses >> + (when !STRICT_ALIGNMENT) - this is smaller and faster. */ >> if (n > 0 && n < copy_limit / 2 && !STRICT_ALIGNMENT) >> { >> next_mode = smallest_mode_for_size (n, MODE_INT); >> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md >> index 6d0f072a9dd6d094e8764a513222a9129d8296fa..96508a2580876d1fdbdfa6c67d1a3d02608c1d24 100644 >> --- a/gcc/config/aarch64/aarch64.md >> +++ b/gcc/config/aarch64/aarch64.md >> @@ -1627,7 +1627,7 @@ (define_expand "cpymemdi" >> (match_operand:BLK 1 "memory_operand") >> (match_operand:DI 2 "general_operand") >> (match_operand:DI 3 "immediate_operand")] >> - "!STRICT_ALIGNMENT || TARGET_MOPS" >> + "" >> { >> if (aarch64_expand_cpymem (operands)) >> DONE; >> @@ -1724,7 +1724,7 @@ (define_expand "setmemdi" >> (match_operand:QI 2 "nonmemory_operand")) ;; Value >> (use (match_operand:DI 1 "general_operand")) ;; Length >> (match_operand 3 "immediate_operand")] ;; Align >> - "TARGET_SIMD || TARGET_MOPS" >> + "" >> { >> if (aarch64_expand_setmem (operands)) >> DONE;
OK to backport to GCC13 (it applies cleanly and regress/bootstrap passes)? Cheers, Wilco On 29/11/2023 18:09, Richard Sandiford wrote: > Wilco Dijkstra <Wilco.Dijkstra@arm.com> writes: >> v2: Use UINTVAL, rename max_mops_size. >> >> The cpymemdi/setmemdi implementation doesn't fully support strict alignment. >> Block the expansion if the alignment is less than 16 with STRICT_ALIGNMENT. >> Clean up the condition when to use MOPS. >> >> Passes regress/bootstrap, OK for commit? >> >> gcc/ChangeLog/ >> PR target/103100 >> * config/aarch64/aarch64.md (cpymemdi): Remove pattern condition. >> (setmemdi): Likewise. >> * config/aarch64/aarch64.cc (aarch64_expand_cpymem): Support >> strict-align. Cleanup condition for using MOPS. >> (aarch64_expand_setmem): Likewise. >> >> --- >> >> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc >> index dd6874d13a75f20d10a244578afc355b25c73da2..8a12894d6b80de1031d6e7d02dca680c57bce136 100644 >> --- a/gcc/config/aarch64/aarch64.cc >> +++ b/gcc/config/aarch64/aarch64.cc >> @@ -25261,27 +25261,23 @@ aarch64_expand_cpymem (rtx *operands) >> int mode_bits; >> rtx dst = operands[0]; >> rtx src = operands[1]; >> + unsigned align = UINTVAL (operands[3]); >> rtx base; >> machine_mode cur_mode = BLKmode; >> + bool size_p = optimize_function_for_size_p (cfun); >> >> - /* Variable-sized memcpy can go through the MOPS expansion if available. */ >> - if (!CONST_INT_P (operands[2])) >> + /* Variable-sized or strict-align copies may use the MOPS expansion. */ >> + if (!CONST_INT_P (operands[2]) || (STRICT_ALIGNMENT && align < 16)) >> return aarch64_expand_cpymem_mops (operands); >> >> - unsigned HOST_WIDE_INT size = INTVAL (operands[2]); >> - >> - /* Try to inline up to 256 bytes or use the MOPS threshold if available. */ >> - unsigned HOST_WIDE_INT max_copy_size >> - = TARGET_MOPS ? aarch64_mops_memcpy_size_threshold : 256; >> + unsigned HOST_WIDE_INT size = UINTVAL (operands[2]); >> >> - bool size_p = optimize_function_for_size_p (cfun); >> + /* Try to inline up to 256 bytes. */ >> + unsigned max_copy_size = 256; >> + unsigned mops_threshold = aarch64_mops_memcpy_size_threshold; >> >> - /* Large constant-sized cpymem should go through MOPS when possible. >> - It should be a win even for size optimization in the general case. >> - For speed optimization the choice between MOPS and the SIMD sequence >> - depends on the size of the copy, rather than number of instructions, >> - alignment etc. */ >> - if (size > max_copy_size) >> + /* Large copies use MOPS when available or a library call. */ >> + if (size > max_copy_size || (TARGET_MOPS && size > mops_threshold)) >> return aarch64_expand_cpymem_mops (operands); > > It feels a little unintuitive to be calling aarch64_expand_cpymem_mops > for !TARGET_MOPS, but that's pre-existing, and I can see there are > arguments both ways. > > Although !TARGET_SIMD is a niche interest on current trunk, it becomes > important for streaming-compatible mode. So we might want to look > again at the different handling of !TARGET_SIMD in this function (where > we lower the copy size but not the threshold) and aarch64_expand_setmem > (where we bail out early). That's not something for this patch though, > just mentioning it. > > The patch is OK with me, but please give Richard E a day to object. This is fine by me. R. > > Thanks, > Richard > >> >> int copy_bits = 256; >> @@ -25445,12 +25441,13 @@ aarch64_expand_setmem (rtx *operands) >> unsigned HOST_WIDE_INT len; >> rtx dst = operands[0]; >> rtx val = operands[2], src; >> + unsigned align = UINTVAL (operands[3]); >> rtx base; >> machine_mode cur_mode = BLKmode, next_mode; >> >> - /* If we don't have SIMD registers or the size is variable use the MOPS >> - inlined sequence if possible. */ >> - if (!CONST_INT_P (operands[1]) || !TARGET_SIMD) >> + /* Variable-sized or strict-align memset may use the MOPS expansion. */ >> + if (!CONST_INT_P (operands[1]) || !TARGET_SIMD >> + || (STRICT_ALIGNMENT && align < 16)) >> return aarch64_expand_setmem_mops (operands); >> >> bool size_p = optimize_function_for_size_p (cfun); >> @@ -25458,10 +25455,13 @@ aarch64_expand_setmem (rtx *operands) >> /* Default the maximum to 256-bytes when considering only libcall vs >> SIMD broadcast sequence. */ >> unsigned max_set_size = 256; >> + unsigned mops_threshold = aarch64_mops_memset_size_threshold; >> >> - len = INTVAL (operands[1]); >> - if (len > max_set_size && !TARGET_MOPS) >> - return false; >> + len = UINTVAL (operands[1]); >> + >> + /* Large memset uses MOPS when available or a library call. */ >> + if (len > max_set_size || (TARGET_MOPS && len > mops_threshold)) >> + return aarch64_expand_setmem_mops (operands); >> >> int cst_val = !!(CONST_INT_P (val) && (INTVAL (val) != 0)); >> /* The MOPS sequence takes: >> @@ -25474,12 +25474,6 @@ aarch64_expand_setmem (rtx *operands) >> the arguments + 1 for the call. */ >> unsigned libcall_cost = 4; >> >> - /* Upper bound check. For large constant-sized setmem use the MOPS sequence >> - when available. */ >> - if (TARGET_MOPS >> - && len >= (unsigned HOST_WIDE_INT) aarch64_mops_memset_size_threshold) >> - return aarch64_expand_setmem_mops (operands); >> - >> /* Attempt a sequence with a vector broadcast followed by stores. >> Count the number of operations involved to see if it's worth it >> against the alternatives. A simple counter simd_ops on the >> @@ -25521,10 +25515,8 @@ aarch64_expand_setmem (rtx *operands) >> simd_ops++; >> n -= mode_bits; >> >> - /* Do certain trailing copies as overlapping if it's going to be >> - cheaper. i.e. less instructions to do so. For instance doing a 15 >> - byte copy it's more efficient to do two overlapping 8 byte copies than >> - 8 + 4 + 2 + 1. Only do this when -mstrict-align is not supplied. */ >> + /* Emit trailing writes using overlapping unaligned accesses >> + (when !STRICT_ALIGNMENT) - this is smaller and faster. */ >> if (n > 0 && n < copy_limit / 2 && !STRICT_ALIGNMENT) >> { >> next_mode = smallest_mode_for_size (n, MODE_INT); >> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md >> index 6d0f072a9dd6d094e8764a513222a9129d8296fa..96508a2580876d1fdbdfa6c67d1a3d02608c1d24 100644 >> --- a/gcc/config/aarch64/aarch64.md >> +++ b/gcc/config/aarch64/aarch64.md >> @@ -1627,7 +1627,7 @@ (define_expand "cpymemdi" >> (match_operand:BLK 1 "memory_operand") >> (match_operand:DI 2 "general_operand") >> (match_operand:DI 3 "immediate_operand")] >> - "!STRICT_ALIGNMENT || TARGET_MOPS" >> + "" >> { >> if (aarch64_expand_cpymem (operands)) >> DONE; >> @@ -1724,7 +1724,7 @@ (define_expand "setmemdi" >> (match_operand:QI 2 "nonmemory_operand")) ;; Value >> (use (match_operand:DI 1 "general_operand")) ;; Length >> (match_operand 3 "immediate_operand")] ;; Align >> - "TARGET_SIMD || TARGET_MOPS" >> + "" >> { >> if (aarch64_expand_setmem (operands)) >> DONE;
Wilco Dijkstra <Wilco.Dijkstra@arm.com> writes: > OK to backport to GCC13 (it applies cleanly and regress/bootstrap passes)? Yes, thanks. Richard > > Cheers, > Wilco > > On 29/11/2023 18:09, Richard Sandiford wrote: >> Wilco Dijkstra <Wilco.Dijkstra@arm.com> writes: >>> v2: Use UINTVAL, rename max_mops_size. >>> >>> The cpymemdi/setmemdi implementation doesn't fully support strict alignment. >>> Block the expansion if the alignment is less than 16 with STRICT_ALIGNMENT. >>> Clean up the condition when to use MOPS. >>> >>> Passes regress/bootstrap, OK for commit? >>> >>> gcc/ChangeLog/ >>> PR target/103100 >>> * config/aarch64/aarch64.md (cpymemdi): Remove pattern condition. >>> (setmemdi): Likewise. >>> * config/aarch64/aarch64.cc (aarch64_expand_cpymem): Support >>> strict-align. Cleanup condition for using MOPS. >>> (aarch64_expand_setmem): Likewise. >>> >>> --- >>> >>> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc >>> index dd6874d13a75f20d10a244578afc355b25c73da2..8a12894d6b80de1031d6e7d02dca680c57bce136 100644 >>> --- a/gcc/config/aarch64/aarch64.cc >>> +++ b/gcc/config/aarch64/aarch64.cc >>> @@ -25261,27 +25261,23 @@ aarch64_expand_cpymem (rtx *operands) >>> int mode_bits; >>> rtx dst = operands[0]; >>> rtx src = operands[1]; >>> + unsigned align = UINTVAL (operands[3]); >>> rtx base; >>> machine_mode cur_mode = BLKmode; >>> + bool size_p = optimize_function_for_size_p (cfun); >>> >>> - /* Variable-sized memcpy can go through the MOPS expansion if available. */ >>> - if (!CONST_INT_P (operands[2])) >>> + /* Variable-sized or strict-align copies may use the MOPS expansion. */ >>> + if (!CONST_INT_P (operands[2]) || (STRICT_ALIGNMENT && align < 16)) >>> return aarch64_expand_cpymem_mops (operands); >>> >>> - unsigned HOST_WIDE_INT size = INTVAL (operands[2]); >>> - >>> - /* Try to inline up to 256 bytes or use the MOPS threshold if available. */ >>> - unsigned HOST_WIDE_INT max_copy_size >>> - = TARGET_MOPS ? aarch64_mops_memcpy_size_threshold : 256; >>> + unsigned HOST_WIDE_INT size = UINTVAL (operands[2]); >>> >>> - bool size_p = optimize_function_for_size_p (cfun); >>> + /* Try to inline up to 256 bytes. */ >>> + unsigned max_copy_size = 256; >>> + unsigned mops_threshold = aarch64_mops_memcpy_size_threshold; >>> >>> - /* Large constant-sized cpymem should go through MOPS when possible. >>> - It should be a win even for size optimization in the general case. >>> - For speed optimization the choice between MOPS and the SIMD sequence >>> - depends on the size of the copy, rather than number of instructions, >>> - alignment etc. */ >>> - if (size > max_copy_size) >>> + /* Large copies use MOPS when available or a library call. */ >>> + if (size > max_copy_size || (TARGET_MOPS && size > mops_threshold)) >>> return aarch64_expand_cpymem_mops (operands); >> >> It feels a little unintuitive to be calling aarch64_expand_cpymem_mops >> for !TARGET_MOPS, but that's pre-existing, and I can see there are >> arguments both ways. >> >> Although !TARGET_SIMD is a niche interest on current trunk, it becomes >> important for streaming-compatible mode. So we might want to look >> again at the different handling of !TARGET_SIMD in this function (where >> we lower the copy size but not the threshold) and aarch64_expand_setmem >> (where we bail out early). That's not something for this patch though, >> just mentioning it. >> >> The patch is OK with me, but please give Richard E a day to object. > > This is fine by me. > > R. > >> >> Thanks, >> Richard >> >>> >>> int copy_bits = 256; >>> @@ -25445,12 +25441,13 @@ aarch64_expand_setmem (rtx *operands) >>> unsigned HOST_WIDE_INT len; >>> rtx dst = operands[0]; >>> rtx val = operands[2], src; >>> + unsigned align = UINTVAL (operands[3]); >>> rtx base; >>> machine_mode cur_mode = BLKmode, next_mode; >>> >>> - /* If we don't have SIMD registers or the size is variable use the MOPS >>> - inlined sequence if possible. */ >>> - if (!CONST_INT_P (operands[1]) || !TARGET_SIMD) >>> + /* Variable-sized or strict-align memset may use the MOPS expansion. */ >>> + if (!CONST_INT_P (operands[1]) || !TARGET_SIMD >>> + || (STRICT_ALIGNMENT && align < 16)) >>> return aarch64_expand_setmem_mops (operands); >>> >>> bool size_p = optimize_function_for_size_p (cfun); >>> @@ -25458,10 +25455,13 @@ aarch64_expand_setmem (rtx *operands) >>> /* Default the maximum to 256-bytes when considering only libcall vs >>> SIMD broadcast sequence. */ >>> unsigned max_set_size = 256; >>> + unsigned mops_threshold = aarch64_mops_memset_size_threshold; >>> >>> - len = INTVAL (operands[1]); >>> - if (len > max_set_size && !TARGET_MOPS) >>> - return false; >>> + len = UINTVAL (operands[1]); >>> + >>> + /* Large memset uses MOPS when available or a library call. */ >>> + if (len > max_set_size || (TARGET_MOPS && len > mops_threshold)) >>> + return aarch64_expand_setmem_mops (operands); >>> >>> int cst_val = !!(CONST_INT_P (val) && (INTVAL (val) != 0)); >>> /* The MOPS sequence takes: >>> @@ -25474,12 +25474,6 @@ aarch64_expand_setmem (rtx *operands) >>> the arguments + 1 for the call. */ >>> unsigned libcall_cost = 4; >>> >>> - /* Upper bound check. For large constant-sized setmem use the MOPS sequence >>> - when available. */ >>> - if (TARGET_MOPS >>> - && len >= (unsigned HOST_WIDE_INT) aarch64_mops_memset_size_threshold) >>> - return aarch64_expand_setmem_mops (operands); >>> - >>> /* Attempt a sequence with a vector broadcast followed by stores. >>> Count the number of operations involved to see if it's worth it >>> against the alternatives. A simple counter simd_ops on the >>> @@ -25521,10 +25515,8 @@ aarch64_expand_setmem (rtx *operands) >>> simd_ops++; >>> n -= mode_bits; >>> >>> - /* Do certain trailing copies as overlapping if it's going to be >>> - cheaper. i.e. less instructions to do so. For instance doing a 15 >>> - byte copy it's more efficient to do two overlapping 8 byte copies than >>> - 8 + 4 + 2 + 1. Only do this when -mstrict-align is not supplied. */ >>> + /* Emit trailing writes using overlapping unaligned accesses >>> + (when !STRICT_ALIGNMENT) - this is smaller and faster. */ >>> if (n > 0 && n < copy_limit / 2 && !STRICT_ALIGNMENT) >>> { >>> next_mode = smallest_mode_for_size (n, MODE_INT); >>> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md >>> index 6d0f072a9dd6d094e8764a513222a9129d8296fa..96508a2580876d1fdbdfa6c67d1a3d02608c1d24 100644 >>> --- a/gcc/config/aarch64/aarch64.md >>> +++ b/gcc/config/aarch64/aarch64.md >>> @@ -1627,7 +1627,7 @@ (define_expand "cpymemdi" >>> (match_operand:BLK 1 "memory_operand") >>> (match_operand:DI 2 "general_operand") >>> (match_operand:DI 3 "immediate_operand")] >>> - "!STRICT_ALIGNMENT || TARGET_MOPS" >>> + "" >>> { >>> if (aarch64_expand_cpymem (operands)) >>> DONE; >>> @@ -1724,7 +1724,7 @@ (define_expand "setmemdi" >>> (match_operand:QI 2 "nonmemory_operand")) ;; Value >>> (use (match_operand:DI 1 "general_operand")) ;; Length >>> (match_operand 3 "immediate_operand")] ;; Align >>> - "TARGET_SIMD || TARGET_MOPS" >>> + "" >>> { >>> if (aarch64_expand_setmem (operands)) >>> DONE;
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc index dd6874d13a75f20d10a244578afc355b25c73da2..8a12894d6b80de1031d6e7d02dca680c57bce136 100644 --- a/gcc/config/aarch64/aarch64.cc +++ b/gcc/config/aarch64/aarch64.cc @@ -25261,27 +25261,23 @@ aarch64_expand_cpymem (rtx *operands) int mode_bits; rtx dst = operands[0]; rtx src = operands[1]; + unsigned align = UINTVAL (operands[3]); rtx base; machine_mode cur_mode = BLKmode; + bool size_p = optimize_function_for_size_p (cfun); - /* Variable-sized memcpy can go through the MOPS expansion if available. */ - if (!CONST_INT_P (operands[2])) + /* Variable-sized or strict-align copies may use the MOPS expansion. */ + if (!CONST_INT_P (operands[2]) || (STRICT_ALIGNMENT && align < 16)) return aarch64_expand_cpymem_mops (operands); - unsigned HOST_WIDE_INT size = INTVAL (operands[2]); - - /* Try to inline up to 256 bytes or use the MOPS threshold if available. */ - unsigned HOST_WIDE_INT max_copy_size - = TARGET_MOPS ? aarch64_mops_memcpy_size_threshold : 256; + unsigned HOST_WIDE_INT size = UINTVAL (operands[2]); - bool size_p = optimize_function_for_size_p (cfun); + /* Try to inline up to 256 bytes. */ + unsigned max_copy_size = 256; + unsigned mops_threshold = aarch64_mops_memcpy_size_threshold; - /* Large constant-sized cpymem should go through MOPS when possible. - It should be a win even for size optimization in the general case. - For speed optimization the choice between MOPS and the SIMD sequence - depends on the size of the copy, rather than number of instructions, - alignment etc. */ - if (size > max_copy_size) + /* Large copies use MOPS when available or a library call. */ + if (size > max_copy_size || (TARGET_MOPS && size > mops_threshold)) return aarch64_expand_cpymem_mops (operands); int copy_bits = 256; @@ -25445,12 +25441,13 @@ aarch64_expand_setmem (rtx *operands) unsigned HOST_WIDE_INT len; rtx dst = operands[0]; rtx val = operands[2], src; + unsigned align = UINTVAL (operands[3]); rtx base; machine_mode cur_mode = BLKmode, next_mode; - /* If we don't have SIMD registers or the size is variable use the MOPS - inlined sequence if possible. */ - if (!CONST_INT_P (operands[1]) || !TARGET_SIMD) + /* Variable-sized or strict-align memset may use the MOPS expansion. */ + if (!CONST_INT_P (operands[1]) || !TARGET_SIMD + || (STRICT_ALIGNMENT && align < 16)) return aarch64_expand_setmem_mops (operands); bool size_p = optimize_function_for_size_p (cfun); @@ -25458,10 +25455,13 @@ aarch64_expand_setmem (rtx *operands) /* Default the maximum to 256-bytes when considering only libcall vs SIMD broadcast sequence. */ unsigned max_set_size = 256; + unsigned mops_threshold = aarch64_mops_memset_size_threshold; - len = INTVAL (operands[1]); - if (len > max_set_size && !TARGET_MOPS) - return false; + len = UINTVAL (operands[1]); + + /* Large memset uses MOPS when available or a library call. */ + if (len > max_set_size || (TARGET_MOPS && len > mops_threshold)) + return aarch64_expand_setmem_mops (operands); int cst_val = !!(CONST_INT_P (val) && (INTVAL (val) != 0)); /* The MOPS sequence takes: @@ -25474,12 +25474,6 @@ aarch64_expand_setmem (rtx *operands) the arguments + 1 for the call. */ unsigned libcall_cost = 4; - /* Upper bound check. For large constant-sized setmem use the MOPS sequence - when available. */ - if (TARGET_MOPS - && len >= (unsigned HOST_WIDE_INT) aarch64_mops_memset_size_threshold) - return aarch64_expand_setmem_mops (operands); - /* Attempt a sequence with a vector broadcast followed by stores. Count the number of operations involved to see if it's worth it against the alternatives. A simple counter simd_ops on the @@ -25521,10 +25515,8 @@ aarch64_expand_setmem (rtx *operands) simd_ops++; n -= mode_bits; - /* Do certain trailing copies as overlapping if it's going to be - cheaper. i.e. less instructions to do so. For instance doing a 15 - byte copy it's more efficient to do two overlapping 8 byte copies than - 8 + 4 + 2 + 1. Only do this when -mstrict-align is not supplied. */ + /* Emit trailing writes using overlapping unaligned accesses + (when !STRICT_ALIGNMENT) - this is smaller and faster. */ if (n > 0 && n < copy_limit / 2 && !STRICT_ALIGNMENT) { next_mode = smallest_mode_for_size (n, MODE_INT); diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md index 6d0f072a9dd6d094e8764a513222a9129d8296fa..96508a2580876d1fdbdfa6c67d1a3d02608c1d24 100644 --- a/gcc/config/aarch64/aarch64.md +++ b/gcc/config/aarch64/aarch64.md @@ -1627,7 +1627,7 @@ (define_expand "cpymemdi" (match_operand:BLK 1 "memory_operand") (match_operand:DI 2 "general_operand") (match_operand:DI 3 "immediate_operand")] - "!STRICT_ALIGNMENT || TARGET_MOPS" + "" { if (aarch64_expand_cpymem (operands)) DONE; @@ -1724,7 +1724,7 @@ (define_expand "setmemdi" (match_operand:QI 2 "nonmemory_operand")) ;; Value (use (match_operand:DI 1 "general_operand")) ;; Length (match_operand 3 "immediate_operand")] ;; Align - "TARGET_SIMD || TARGET_MOPS" + "" { if (aarch64_expand_setmem (operands)) DONE;