Message ID | 20240625183536.1672454-3-richard.henderson@linaro.org |
---|---|
State | New |
Headers | show |
Series | target/arm: AdvSIMD conversion, part 2 | expand |
25.06.2024 21:35, Richard Henderson wrote: > The inner loop, bounded by eltspersegment, must not be > larger than the outer loop, bounded by elements. > > Cc: qemu-stable@nongnu.org > Reviewed-by: Peter Maydell <peter.maydell@linaro.org> > Signed-off-by: Richard Henderson <richard.henderson@linaro.org> > --- > target/arm/tcg/vec_helper.c | 24 ++++++++++++++++-------- > 1 file changed, 16 insertions(+), 8 deletions(-) If my understanding is correct, this one Fixes: f80701cb44d3 ("target/arm: Convert SQDMULH, SQRDMULH to decodetree") and before this commit, there was no issue. Is my understanding correct? Thanks, /mjt > diff --git a/target/arm/tcg/vec_helper.c b/target/arm/tcg/vec_helper.c > index 7b34cc98af..d477479bb1 100644 > --- a/target/arm/tcg/vec_helper.c > +++ b/target/arm/tcg/vec_helper.c > @@ -317,10 +317,12 @@ void HELPER(neon_sqdmulh_idx_h)(void *vd, void *vn, void *vm, > intptr_t i, j, opr_sz = simd_oprsz(desc); > int idx = simd_data(desc); > int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx); > + intptr_t elements = opr_sz / 2; > + intptr_t eltspersegment = MIN(16 / 2, elements); > > - for (i = 0; i < opr_sz / 2; i += 16 / 2) { > + for (i = 0; i < elements; i += 16 / 2) { > int16_t mm = m[i]; > - for (j = 0; j < 16 / 2; ++j) { > + for (j = 0; j < eltspersegment; ++j) { > d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, vq); > } > } > @@ -333,10 +335,12 @@ void HELPER(neon_sqrdmulh_idx_h)(void *vd, void *vn, void *vm, > intptr_t i, j, opr_sz = simd_oprsz(desc); > int idx = simd_data(desc); > int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx); > + intptr_t elements = opr_sz / 2; > + intptr_t eltspersegment = MIN(16 / 2, elements); > > - for (i = 0; i < opr_sz / 2; i += 16 / 2) { > + for (i = 0; i < elements; i += 16 / 2) { > int16_t mm = m[i]; > - for (j = 0; j < 16 / 2; ++j) { > + for (j = 0; j < eltspersegment; ++j) { > d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, vq); > } > } > @@ -512,10 +516,12 @@ void 
HELPER(neon_sqdmulh_idx_s)(void *vd, void *vn, void *vm, > intptr_t i, j, opr_sz = simd_oprsz(desc); > int idx = simd_data(desc); > int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx); > + intptr_t elements = opr_sz / 4; > + intptr_t eltspersegment = MIN(16 / 4, elements); > > - for (i = 0; i < opr_sz / 4; i += 16 / 4) { > + for (i = 0; i < elements; i += 16 / 4) { > int32_t mm = m[i]; > - for (j = 0; j < 16 / 4; ++j) { > + for (j = 0; j < eltspersegment; ++j) { > d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, vq); > } > } > @@ -528,10 +534,12 @@ void HELPER(neon_sqrdmulh_idx_s)(void *vd, void *vn, void *vm, > intptr_t i, j, opr_sz = simd_oprsz(desc); > int idx = simd_data(desc); > int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx); > + intptr_t elements = opr_sz / 4; > + intptr_t eltspersegment = MIN(16 / 4, elements); > > - for (i = 0; i < opr_sz / 4; i += 16 / 4) { > + for (i = 0; i < elements; i += 16 / 4) { > int32_t mm = m[i]; > - for (j = 0; j < 16 / 4; ++j) { > + for (j = 0; j < eltspersegment; ++j) { > d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, vq); > } > }
On 7/1/24 23:48, Michael Tokarev wrote:
> 25.06.2024 21:35, Richard Henderson wrote:
>> The inner loop, bounded by eltspersegment, must not be
>> larger than the outer loop, bounded by elements.
>>
>> Cc: qemu-stable@nongnu.org
>> Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
>> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
>> ---
>> target/arm/tcg/vec_helper.c | 24 ++++++++++++++++--------
>> 1 file changed, 16 insertions(+), 8 deletions(-)
>
> If my understanding is correct, this one
>
> Fixes: f80701cb44d3 ("target/arm: Convert SQDMULH, SQRDMULH to decodetree")
>
> and before this commit, there was no issue.
>
> Is my understanding correct?

Yes. So, not as old a bug as I thought.

r~
diff --git a/target/arm/tcg/vec_helper.c b/target/arm/tcg/vec_helper.c index 7b34cc98af..d477479bb1 100644 --- a/target/arm/tcg/vec_helper.c +++ b/target/arm/tcg/vec_helper.c @@ -317,10 +317,12 @@ void HELPER(neon_sqdmulh_idx_h)(void *vd, void *vn, void *vm, intptr_t i, j, opr_sz = simd_oprsz(desc); int idx = simd_data(desc); int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx); + intptr_t elements = opr_sz / 2; + intptr_t eltspersegment = MIN(16 / 2, elements); - for (i = 0; i < opr_sz / 2; i += 16 / 2) { + for (i = 0; i < elements; i += 16 / 2) { int16_t mm = m[i]; - for (j = 0; j < 16 / 2; ++j) { + for (j = 0; j < eltspersegment; ++j) { d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, vq); } } @@ -333,10 +335,12 @@ void HELPER(neon_sqrdmulh_idx_h)(void *vd, void *vn, void *vm, intptr_t i, j, opr_sz = simd_oprsz(desc); int idx = simd_data(desc); int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx); + intptr_t elements = opr_sz / 2; + intptr_t eltspersegment = MIN(16 / 2, elements); - for (i = 0; i < opr_sz / 2; i += 16 / 2) { + for (i = 0; i < elements; i += 16 / 2) { int16_t mm = m[i]; - for (j = 0; j < 16 / 2; ++j) { + for (j = 0; j < eltspersegment; ++j) { d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, vq); } } @@ -512,10 +516,12 @@ void HELPER(neon_sqdmulh_idx_s)(void *vd, void *vn, void *vm, intptr_t i, j, opr_sz = simd_oprsz(desc); int idx = simd_data(desc); int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx); + intptr_t elements = opr_sz / 4; + intptr_t eltspersegment = MIN(16 / 4, elements); - for (i = 0; i < opr_sz / 4; i += 16 / 4) { + for (i = 0; i < elements; i += 16 / 4) { int32_t mm = m[i]; - for (j = 0; j < 16 / 4; ++j) { + for (j = 0; j < eltspersegment; ++j) { d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, vq); } } @@ -528,10 +534,12 @@ void HELPER(neon_sqrdmulh_idx_s)(void *vd, void *vn, void *vm, intptr_t i, j, opr_sz = simd_oprsz(desc); int idx = simd_data(desc); int32_t *d = vd, *n = vn, *m = (int32_t *)vm + 
H4(idx); + intptr_t elements = opr_sz / 4; + intptr_t eltspersegment = MIN(16 / 4, elements); - for (i = 0; i < opr_sz / 4; i += 16 / 4) { + for (i = 0; i < elements; i += 16 / 4) { int32_t mm = m[i]; - for (j = 0; j < 16 / 4; ++j) { + for (j = 0; j < eltspersegment; ++j) { d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, vq); } }