Message ID | 20240801142322.3948866-4-peter.maydell@linaro.org |
---|---|
State | New |
Headers | show |
Series | None | expand |
01.08.2024 17:23, Peter Maydell wrote: > The FMOPA (widening) SME instruction takes pairs of half-precision > floating point values, widens them to single-precision, does a > two-way dot product and accumulates the results into a > single-precision destination. We don't quite correctly handle the > FPCR bits FZ and FZ16 which control flushing of denormal inputs and > outputs. This is because at the moment we pass a single float_status > value to the helper function, which then uses that configuration for > all the fp operations it does. However, because the inputs to this > operation are float16 and the outputs are float32 we need to use the > fp_status_f16 for the float16 input widening but the normal fp_status > for everything else. Otherwise we will apply the flushing control > FPCR.FZ16 to the 32-bit output rather than the FPCR.FZ control, and > incorrectly flush a denormal output to zero when we should not (or > vice-versa). > > (In commit 207d30b5fdb5b we tried to fix the FZ handling but > didn't get it right, switching from "use FPCR.FZ for everything" to > "use FPCR.FZ16 for everything".) > > Pass the CPU env to the sme_fmopa_h helper instead of an fp_status > pointer, and have the helper pass an extra fp_status into the > f16_dotadd() function so that we can use the right status for the > right parts of this operation. > > Cc: qemu-stable@nongnu.org > Fixes: 207d30b5fdb5 ("target/arm: Use FPST_F16 for SME FMOPA (widening)") > Resolves: https://gitlab.com/qemu-project/qemu/-/issues/2373 I know it's too late already, but it looks like this Fixes needs to be: Fixes: 3916841ac75 ("target/arm: Implement FMOPA, FMOPS (widening)") FWIW. Thanks, /mjt
On Fri, 2 Aug 2024 at 08:45, Michael Tokarev <mjt@tls.msk.ru> wrote: > > 01.08.2024 17:23, Peter Maydell wrote: > > The FMOPA (widening) SME instruction takes pairs of half-precision > > floating point values, widens them to single-precision, does a > > two-way dot product and accumulates the results into a > > single-precision destination. We don't quite correctly handle the > > FPCR bits FZ and FZ16 which control flushing of denormal inputs and > > outputs. This is because at the moment we pass a single float_status > > value to the helper function, which then uses that configuration for > > all the fp operations it does. However, because the inputs to this > > operation are float16 and the outputs are float32 we need to use the > > fp_status_f16 for the float16 input widening but the normal fp_status > > for everything else. Otherwise we will apply the flushing control > > FPCR.FZ16 to the 32-bit output rather than the FPCR.FZ control, and > > incorrectly flush a denormal output to zero when we should not (or > > vice-versa). > > > > (In commit 207d30b5fdb5b we tried to fix the FZ handling but > > didn't get it right, switching from "use FPCR.FZ for everything" to > > "use FPCR.FZ16 for everything".) > > > > Pass the CPU env to the sme_fmopa_h helper instead of an fp_status > > pointer, and have the helper pass an extra fp_status into the > > f16_dotadd() function so that we can use the right status for the > > right parts of this operation. > > > > Cc: qemu-stable@nongnu.org > > Fixes: 207d30b5fdb5 ("target/arm: Use FPST_F16 for SME FMOPA (widening)") > > Resolves: https://gitlab.com/qemu-project/qemu/-/issues/2373 > > I know it's too late already, but it looks like this Fixes needs to be: > Fixes: 3916841ac75 ("target/arm: Implement FMOPA, FMOPS (widening)") It's fixing a mistake in 207d30b5fdb5, which is in turn fixing a mistake in 3916841ac75 (but didn't quite get it right). -- PMM
diff --git a/target/arm/tcg/helper-sme.h b/target/arm/tcg/helper-sme.h index 27eef49a11e..d22bf9d21b0 100644 --- a/target/arm/tcg/helper-sme.h +++ b/target/arm/tcg/helper-sme.h @@ -121,7 +121,7 @@ DEF_HELPER_FLAGS_5(sme_addha_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32) DEF_HELPER_FLAGS_5(sme_addva_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32) DEF_HELPER_FLAGS_7(sme_fmopa_h, TCG_CALL_NO_RWG, - void, ptr, ptr, ptr, ptr, ptr, ptr, i32) + void, ptr, ptr, ptr, ptr, ptr, env, i32) DEF_HELPER_FLAGS_7(sme_fmopa_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, ptr, ptr, i32) DEF_HELPER_FLAGS_7(sme_fmopa_d, TCG_CALL_NO_RWG, diff --git a/target/arm/tcg/sme_helper.c b/target/arm/tcg/sme_helper.c index 3ba826a6ceb..02106809ce1 100644 --- a/target/arm/tcg/sme_helper.c +++ b/target/arm/tcg/sme_helper.c @@ -992,12 +992,23 @@ static inline uint32_t f16mop_adj_pair(uint32_t pair, uint32_t pg, uint32_t neg) } static float32 f16_dotadd(float32 sum, uint32_t e1, uint32_t e2, - float_status *s_std, float_status *s_odd) + float_status *s_f16, float_status *s_std, + float_status *s_odd) { - float64 e1r = float16_to_float64(e1 & 0xffff, true, s_std); - float64 e1c = float16_to_float64(e1 >> 16, true, s_std); - float64 e2r = float16_to_float64(e2 & 0xffff, true, s_std); - float64 e2c = float16_to_float64(e2 >> 16, true, s_std); + /* + * We need three different float_status for different parts of this + * operation: + * - the input conversion of the float16 values must use the + * f16-specific float_status, so that the FPCR.FZ16 control is applied + * - operations on float32 including the final accumulation must use + * the normal float_status, so that FPCR.FZ is applied + * - we have pre-set-up copy of s_std which is set to round-to-odd, + * for the multiply (see below) + */ + float64 e1r = float16_to_float64(e1 & 0xffff, true, s_f16); + float64 e1c = float16_to_float64(e1 >> 16, true, s_f16); + float64 e2r = float16_to_float64(e2 & 0xffff, true, s_f16); + float64 e2c = float16_to_float64(e2 >> 16, true, s_f16); float64 t64; float32 t32; @@ -1019,20 +1030,23 @@ static float32 f16_dotadd(float32 sum, uint32_t e1, uint32_t e2, } void HELPER(sme_fmopa_h)(void *vza, void *vzn, void *vzm, void *vpn, - void *vpm, void *vst, uint32_t desc) + void *vpm, CPUARMState *env, uint32_t desc) { intptr_t row, col, oprsz = simd_maxsz(desc); uint32_t neg = simd_data(desc) * 0x80008000u; uint16_t *pn = vpn, *pm = vpm; - float_status fpst_odd, fpst_std; + float_status fpst_odd, fpst_std, fpst_f16; /* - * Make a copy of float_status because this operation does not - * update the cumulative fp exception status. It also produces - * default nans. Make a second copy with round-to-odd -- see above. + * Make copies of fp_status and fp_status_f16, because this operation + * does not update the cumulative fp exception status. It also + * produces default NaNs. We also need a second copy of fp_status with + * round-to-odd -- see above. */ - fpst_std = *(float_status *)vst; + fpst_f16 = env->vfp.fp_status_f16; + fpst_std = env->vfp.fp_status; set_default_nan_mode(true, &fpst_std); + set_default_nan_mode(true, &fpst_f16); fpst_odd = fpst_std; set_float_rounding_mode(float_round_to_odd, &fpst_odd); @@ -1052,7 +1066,8 @@ void HELPER(sme_fmopa_h)(void *vza, void *vzn, void *vzm, void *vpn, uint32_t m = *(uint32_t *)(vzm + H1_4(col)); m = f16mop_adj_pair(m, pcol, 0); - *a = f16_dotadd(*a, n, m, &fpst_std, &fpst_odd); + *a = f16_dotadd(*a, n, m, + &fpst_f16, &fpst_std, &fpst_odd); } col += 4; pcol >>= 4; diff --git a/target/arm/tcg/translate-sme.c b/target/arm/tcg/translate-sme.c index a50a419af27..ae42ddef7b3 100644 --- a/target/arm/tcg/translate-sme.c +++ b/target/arm/tcg/translate-sme.c @@ -334,8 +334,29 @@ static bool do_outprod_fpst(DisasContext *s, arg_op *a, MemOp esz, return true; } -TRANS_FEAT(FMOPA_h, aa64_sme, do_outprod_fpst, a, - MO_32, FPST_FPCR_F16, gen_helper_sme_fmopa_h) +static bool do_outprod_env(DisasContext *s, arg_op *a, MemOp esz, + gen_helper_gvec_5_ptr *fn) +{ + int svl = streaming_vec_reg_size(s); + uint32_t desc = simd_desc(svl, svl, a->sub); + TCGv_ptr za, zn, zm, pn, pm; + + if (!sme_smza_enabled_check(s)) { + return true; + } + + za = get_tile(s, esz, a->zad); + zn = vec_full_reg_ptr(s, a->zn); + zm = vec_full_reg_ptr(s, a->zm); + pn = pred_full_reg_ptr(s, a->pn); + pm = pred_full_reg_ptr(s, a->pm); + + fn(za, zn, zm, pn, pm, tcg_env, tcg_constant_i32(desc)); + return true; +} + +TRANS_FEAT(FMOPA_h, aa64_sme, do_outprod_env, a, + MO_32, gen_helper_sme_fmopa_h) TRANS_FEAT(FMOPA_s, aa64_sme, do_outprod_fpst, a, MO_32, FPST_FPCR, gen_helper_sme_fmopa_s) TRANS_FEAT(FMOPA_d, aa64_sme_f64f64, do_outprod_fpst, a,