@@ -147,15 +147,17 @@ void test5 (uint8_t *a, uint8_t *b, uint8_t *c, uint8_t *d, int n)
/*
** test5:
**...
-** dlstp.8 lr, r[0-9]+
+** (?:mov (r[0-9]+), r3)?
+**...
+** dlstp.8 lr, (?:r[0-9]+|ip)
**...
** vldrb.8 q[0-9]+, \[r1\]
** vldrb.8 q[0-9]+, \[r2\]
**...
** vadd.i8 (q[0-9]+), q[0-9]+, q[0-9]+
**...
-** vstrb.8 \1, \[r2\]
-** vstrb.8 \1, \[r3\]
+** vstrb.8 \2, \[r2\]
+** vstrb.8 \2, \[(r3|\1)\]
** letp lr, .*
**...
*/
@@ -247,7 +249,7 @@ void test8 (int32_t *a, int32_t *b, int32_t *c, int n, int g)
**...
** dlstp.32 lr, r3
** vldrw.32 q[0-9]+, \[r0\], #16
-** vctp.32 r4
+** vctp.32 (?:r4|ip)
** vpst
** vldrwt.32 q[0-9]+, \[r1\], #16
**...
With r15-1618-g9f168b412f4, I get the following asm generated for the test case: .align 1 .align 2 .global test5 .syntax unified .thumb .thumb_func .type test5, %function test5: @ args = 4, pretend = 0, frame = 0 @ frame_needed = 0, uses_anonymous_args = 0 push {r4, r5, r6, lr} ldr r4, [sp, #16] cmp r4, #0 ble .L37 sub ip, r4, #16 adds r6, r2, r4 adds r5, r1, r4 add r0, r0, r4 dlstp.8 lr, r4 .L39: subs r2, r5, r4 subs r1, r0, r4 vldrb.8 q3, [r1] vldrb.8 q2, [r2] subs r2, r6, r4 mov r4, ip sub ip, ip, #16 vadd.i8 q3, q3, q2 vstrb.8 q3, [r2] vstrb.8 q3, [r3] letp lr, .L39 .L37: pop {r4, r5, r6, pc} .size test5, .-test5 ... .align 1 .align 2 .global test8 .syntax unified .thumb .thumb_func .type test8, %function test8: @ args = 4, pretend = 0, frame = 0 @ frame_needed = 0, uses_anonymous_args = 0 push {r4, lr} ldr r4, [sp, #8] cmp r3, #0 ble .L59 dlstp.32 lr, r3 .L61: vldrw.32 q3, [r0], #16 vctp.32 r4 vpst vldrwt.32 q2, [r1], #16 adds r4, r4, #1 vadd.i32 q3, q3, q2 vstrw.32 q3, [r2], #16 letp lr, .L61 .L59: pop {r4, pc} .size test8, .-test8 With r15-1619-g3b9b8d6cfdf, I instead get: .align 1 .align 2 .global test5 .syntax unified .thumb .thumb_func .type test5, %function test5: @ args = 4, pretend = 0, frame = 0 @ frame_needed = 0, uses_anonymous_args = 0 push {r4, r5, r6, lr} ldr ip, [sp, #16] cmp ip, #0 ble .L37 mov r6, r3 sub r3, ip, #16 add r5, r2, ip add r4, r1, ip add r0, r0, ip dlstp.8 lr, ip .L39: sub r2, r4, ip sub r1, r0, ip vldrb.8 q3, [r1] vldrb.8 q2, [r2] sub r2, r5, ip mov ip, r3 subs r3, r3, #16 vadd.i8 q3, q3, q2 vstrb.8 q3, [r2] vstrb.8 q3, [r6] letp lr, .L39 .L37: pop {r4, r5, r6, pc} .size test5, .-test5 ... 
.align 1 .align 2 .global test8 .syntax unified .thumb .thumb_func .type test8, %function test8: @ args = 4, pretend = 0, frame = 0 @ frame_needed = 0, uses_anonymous_args = 0 push {lr} ldr ip, [sp, #4] cmp r3, #0 ble .L59 dlstp.32 lr, r3 .L61: vldrw.32 q3, [r0], #16 vctp.32 ip vpst vldrwt.32 q2, [r1], #16 add ip, ip, #1 vadd.i32 q3, q3, q2 vstrw.32 q3, [r2], #16 letp lr, .L61 .L59: ldr pc, [sp], #4 .size test8, .-test8 As can be seen, with r15-1619-g3b9b8d6cfdf, GCC now uses ip in ways that it did not before. I think this part is fine. It also, for some reason, decides to move r3 into r6 in test5 and then uses r6 later for the vstrb.8. While I suppose it does work, it costs one extra mov, so the code is slightly bigger. With the patch below, I no longer see any failure reported for arm-none-eabi. Even with the slight size increase for test5, is it ok for trunk? -- Since r15-1619-g3b9b8d6cfdf, test5 and test8 fail because "ip" might be used and r3 might be moved to another register for a later dereference. gcc/testsuite/ChangeLog: PR testsuite/116623 * gcc.target/arm/mve/dlstp-compile-asm-2.c: Align test5 and test8 with changes in r15-1619-g3b9b8d6cfdf. Signed-off-by: Torbjörn SVENSSON <torbjorn.svensson@foss.st.com> --- gcc/testsuite/gcc.target/arm/mve/dlstp-compile-asm-2.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-)