Message ID | b3a1f6980056160646b35d50a673a851383a3bc5.1261078375.git.rth@twiddle.net |
---|---|
State | New |
Headers | show |
On Thu, Dec 17, 2009 at 6:32 PM, Richard Henderson <rth@twiddle.net> wrote: > Implement conditional moves in the x86_64 backend. > > Signed-off-by: Richard Henderson <rth@twiddle.net> > --- > tcg/x86_64/tcg-target.c | 65 ++++++++++++++++++++++++++++++++++++++++++++-- > 1 files changed, 62 insertions(+), 3 deletions(-) > > diff --git a/tcg/x86_64/tcg-target.c b/tcg/x86_64/tcg-target.c > index 2339091..e411755 100644 > --- a/tcg/x86_64/tcg-target.c > +++ b/tcg/x86_64/tcg-target.c > @@ -491,9 +491,8 @@ static void tcg_out_jxx(TCGContext *s, int opc, int label_index) > } > } > > -static void tcg_out_brcond(TCGContext *s, int cond, > - TCGArg arg1, TCGArg arg2, int const_arg2, > - int label_index, int rexw) > +static void tcg_out_cond(TCGContext *s, int cond, TCGArg arg1, > + TCGArg arg2, int const_arg2, int rexw) > { > if (const_arg2) { > if (arg2 == 0) { > @@ -508,9 +507,45 @@ static void tcg_out_brcond(TCGContext *s, int cond, > } else { > tcg_out_modrm(s, 0x01 | (ARITH_CMP << 3) | rexw, arg2, arg1); > } > +} > + > +static void tcg_out_brcond(TCGContext *s, int cond, > + TCGArg arg1, TCGArg arg2, int const_arg2, > + int label_index, int rexw) > +{ > + tcg_out_cond(s, cond, arg1, arg2, const_arg2, rexw); > tcg_out_jxx(s, tcg_cond_to_jcc[cond], label_index); > } > > +static void tcg_out_setcond(TCGContext *s, int cond, TCGArg arg0, > + TCGArg arg1, TCGArg arg2, int const_arg2, int rexw) Perhaps renaming arg0 to dest would make things slightly more readable. > +{ > + int use_xor = (arg0 != arg1 && (const_arg2 || arg0 != arg2)); > + > + if (use_xor) > + tcg_out_movi(s, TCG_TYPE_I32, arg0, 0); > + tcg_out_cond(s, cond, arg1, arg2, const_arg2, rexw); > + tcg_out_modrm(s, 0x90 | tcg_cond_to_jcc[cond] | P_EXT | P_REXB, 0, arg0); A comment saying this is a setcc would be nice. Also note that tcg_out_modrm will generate an unneeded prefix for some registers. cf. the patch I sent to the list months ago. 
> + if (!use_xor) > + tgen_arithi32(s, ARITH_AND, arg0, 0xff); Wouldn't movzbl be better? Regarding the xor optimization, I tested it on my i7 and it was (very) slightly slower running a 64-bit SPEC2k gcc. > +} > + > +static void tcg_out_movcond(TCGContext *s, int cond, TCGArg arg0, > + TCGArg arg1, TCGArg arg2, int const_arg2, > + TCGArg arg3, TCGArg arg4, int rexw) Perhaps renaming arg0 to dest would make things slightly more readable. You should also add a note stating that arg3 != arg4. > +{ > + if (arg0 == arg3) { > + cond = tcg_invert_cond(cond); > + arg3 = arg4; > + arg4 = arg0; > + } > + > + tcg_out_cond(s, cond, arg1, arg2, const_arg2, rexw); > + if (arg0 != arg4) > + tcg_out_mov(s, arg0, arg4); > + tcg_out_modrm(s, 0x40 | tcg_cond_to_jcc[cond] | P_EXT | rexw, arg0, arg3); A comment saying this is cmovcc would be nice. Note, I didn't check the correctness, though it looks OK. I'll make real tests at the next iteration :-) > +} > + > #if defined(CONFIG_SOFTMMU) > > #include "../../softmmu_defs.h" > @@ -1197,6 +1232,24 @@ static inline void tcg_out_op(TCGContext *s, int opc, const TCGArg *args, > tcg_out_modrm(s, 0x8b, args[0], args[1]); > break; > > + case INDEX_op_setcond_i32: > + tcg_out_setcond(s, args[3], args[0], args[1], args[2], > + const_args[2], 0); > + break; > + case INDEX_op_setcond_i64: > + tcg_out_setcond(s, args[3], args[0], args[1], args[2], > + const_args[2], P_REXW); > + break; > + > + case INDEX_op_movcond_i32: > + tcg_out_movcond(s, args[5], args[0], args[1], args[2], > + const_args[2], args[3], args[4], 0); > + break; > + case INDEX_op_movcond_i64: > + tcg_out_movcond(s, args[5], args[0], args[1], args[2], > + const_args[2], args[3], args[4], P_REXW); > + break; > + > case INDEX_op_qemu_ld8u: > tcg_out_qemu_ld(s, args, 0); > break; > @@ -1376,6 +1429,12 @@ static const TCGTargetOpDef x86_64_op_defs[] = { > { INDEX_op_ext16u_i64, { "r", "r"} }, > { INDEX_op_ext32u_i64, { "r", "r"} }, > > + { INDEX_op_setcond_i32, { "r", "r", "re" } }, > 
+ { INDEX_op_setcond_i64, { "r", "r", "re" } }, > + > + { INDEX_op_movcond_i32, { "r", "r", "re", "r", "r" } }, > + { INDEX_op_movcond_i64, { "r", "r", "re", "r", "r" } }, For the i32 variants, "ri" instead of "re" is enough. Laurent > + > { INDEX_op_qemu_ld8u, { "r", "L" } }, > { INDEX_op_qemu_ld8s, { "r", "L" } }, > { INDEX_op_qemu_ld16u, { "r", "L" } }, > -- > 1.6.5.2 > >
On 12/18/2009 03:39 AM, Laurent Desnogues wrote: >> +static void tcg_out_setcond(TCGContext *s, int cond, TCGArg arg0, >> + TCGArg arg1, TCGArg arg2, int const_arg2, int rexw) > > Perhaps renaming arg0 to dest would make things slightly > more readable. Ok. > Also note that tcg_out_modrm will generate an unneeded prefix > for some registers. cf. the patch I sent to the list months ago. Huh. Didn't notice since the disassembler printed what I expected to see. Is fixing this at the same time a requirement for acceptance? I'd prefer to tackle that separately, since no doubt it affects every use of P_REXB. >> + tgen_arithi32(s, ARITH_AND, arg0, 0xff); > > Wouldn't movzbl be better? Handled inside tgen_arithi32: } else if (c == ARITH_AND && val == 0xffu) { /* movzbl */ tcg_out_modrm(s, 0xb6 | P_EXT | P_REXB, r0, r0); I didn't feel the need to replicate that. > Regarding the xor optimization, I tested it on my i7 and it was > (very) slightly slower running a 64-bit SPEC2k gcc. Huh. It used to be recommended. The partial word store used to stall the pipeline until the old value was ready, and the XOR was special-cased as a clear, which broke both the input dependency and also prevented a partial-register stall on the output. Actually, this recommendation is still present: Section 3.5.1.6 in the November 2009 revision of the Intel Optimization Reference Manual. If it's all the same, I'd prefer to keep what I have there. All other things being equal, the XOR is 2 bytes and the MOVZBL is 3. >> +static void tcg_out_movcond(TCGContext *s, int cond, TCGArg arg0, >> + TCGArg arg1, TCGArg arg2, int const_arg2, >> + TCGArg arg3, TCGArg arg4, int rexw) > > Perhaps renaming arg0 to dest would make things slightly > more readable. Ok. > You should also add a note stating that arg3 != arg4. I don't believe that's true though. It's caught immediately when we emit the movcond opcode, but there's no check later once copy-propagation has been done within TCG. 
I check for that in the i386 and sparc backends, because dest==arg3 && dest==arg4 would actually generate incorrect code. Here in the x86_64 backend, where we always use cmov it doesn't generate incorrect code, merely inefficient. I could add an early out for that case, if you prefer. >> + { INDEX_op_setcond_i32, { "r", "r", "re" } }, >> + { INDEX_op_setcond_i64, { "r", "r", "re" } }, >> + >> + { INDEX_op_movcond_i32, { "r", "r", "re", "r", "r" } }, >> + { INDEX_op_movcond_i64, { "r", "r", "re", "r", "r" } }, > > For the i32 variants, "ri" instead of "re" is enough. Ah, quite right. r~
On Fri, Dec 18, 2009 at 6:11 PM, Richard Henderson <rth@twiddle.net> wrote: >> Also note that tcg_out_modrm will generate an unneeded prefix >> for some registers. cf. the patch I sent to the list months ago. > > Huh. Didn't notice since the disassembler printed what I expected to see. > Is fixing this at the same time a requirement for acceptance? > I'd prefer to tackle that separately, since no doubt it affects every use of > P_REXB. I agree this change can be delayed. >>> + tgen_arithi32(s, ARITH_AND, arg0, 0xff); >> >> Wouldn't movzbl be better? > > Handled inside tgen_arithi32: > > } else if (c == ARITH_AND && val == 0xffu) { > /* movzbl */ > tcg_out_modrm(s, 0xb6 | P_EXT | P_REXB, r0, r0); > > I didn't feel the need to replicate that. Oops, I compared with my code which has an explicit movzbl :) >> Regarding the xor optimization, I tested it on my i7 and it was >> (very) slightly slower running a 64-bit SPEC2k gcc. > > Huh. It used to be recommended. The partial word store used to stall the > pipeline until the old value was ready, and the XOR was special-cased as a > clear, which broke both the input dependency and also prevented a > partial-register stall on the output. > > Actually, this recommendation is still present: Section 3.5.1.6 in the > November 2009 revision of the Intel Optimization Reference Manual. > > If it's all the same, I'd prefer to keep what I have there. All other > things being equal, the XOR is 2 bytes and the MOVZBL is 3. I agree too. Anyway my measure is not representative enough to mean anything. And in that case I think shorter code is better, so let's go for XOR. >>> +static void tcg_out_movcond(TCGContext *s, int cond, TCGArg arg0, >>> + TCGArg arg1, TCGArg arg2, int const_arg2, >>> + TCGArg arg3, TCGArg arg4, int rexw) >> >> Perhaps renaming arg0 to dest would make things slightly >> more readable. > > Ok. > >> You should also add a note stating that arg3 != arg4. > > I don't believe that's true though. 
It's caught immediately when we emit > the movcond opcode, but there's no check later once copy-propagation has > been done within TCG. > > I check for that in the i386 and sparc backends, because dest==arg3 && > dest==arg4 would actually generate incorrect code. Here in the x86_64 > backend, where we always use cmov it doesn't generate incorrect code, merely > inefficient. > > I could add an early out for that case, if you prefer. No, you can leave it as is unless someone else objects. Laurent
diff --git a/tcg/x86_64/tcg-target.c b/tcg/x86_64/tcg-target.c index 2339091..e411755 100644 --- a/tcg/x86_64/tcg-target.c +++ b/tcg/x86_64/tcg-target.c @@ -491,9 +491,8 @@ static void tcg_out_jxx(TCGContext *s, int opc, int label_index) } } -static void tcg_out_brcond(TCGContext *s, int cond, - TCGArg arg1, TCGArg arg2, int const_arg2, - int label_index, int rexw) +static void tcg_out_cond(TCGContext *s, int cond, TCGArg arg1, + TCGArg arg2, int const_arg2, int rexw) { if (const_arg2) { if (arg2 == 0) { @@ -508,9 +507,45 @@ static void tcg_out_brcond(TCGContext *s, int cond, } else { tcg_out_modrm(s, 0x01 | (ARITH_CMP << 3) | rexw, arg2, arg1); } +} + +static void tcg_out_brcond(TCGContext *s, int cond, + TCGArg arg1, TCGArg arg2, int const_arg2, + int label_index, int rexw) +{ + tcg_out_cond(s, cond, arg1, arg2, const_arg2, rexw); tcg_out_jxx(s, tcg_cond_to_jcc[cond], label_index); } +static void tcg_out_setcond(TCGContext *s, int cond, TCGArg arg0, + TCGArg arg1, TCGArg arg2, int const_arg2, int rexw) +{ + int use_xor = (arg0 != arg1 && (const_arg2 || arg0 != arg2)); + + if (use_xor) + tcg_out_movi(s, TCG_TYPE_I32, arg0, 0); + tcg_out_cond(s, cond, arg1, arg2, const_arg2, rexw); + tcg_out_modrm(s, 0x90 | tcg_cond_to_jcc[cond] | P_EXT | P_REXB, 0, arg0); + if (!use_xor) + tgen_arithi32(s, ARITH_AND, arg0, 0xff); +} + +static void tcg_out_movcond(TCGContext *s, int cond, TCGArg arg0, + TCGArg arg1, TCGArg arg2, int const_arg2, + TCGArg arg3, TCGArg arg4, int rexw) +{ + if (arg0 == arg3) { + cond = tcg_invert_cond(cond); + arg3 = arg4; + arg4 = arg0; + } + + tcg_out_cond(s, cond, arg1, arg2, const_arg2, rexw); + if (arg0 != arg4) + tcg_out_mov(s, arg0, arg4); + tcg_out_modrm(s, 0x40 | tcg_cond_to_jcc[cond] | P_EXT | rexw, arg0, arg3); +} + #if defined(CONFIG_SOFTMMU) #include "../../softmmu_defs.h" @@ -1197,6 +1232,24 @@ static inline void tcg_out_op(TCGContext *s, int opc, const TCGArg *args, tcg_out_modrm(s, 0x8b, args[0], args[1]); break; + case 
INDEX_op_setcond_i32: + tcg_out_setcond(s, args[3], args[0], args[1], args[2], + const_args[2], 0); + break; + case INDEX_op_setcond_i64: + tcg_out_setcond(s, args[3], args[0], args[1], args[2], + const_args[2], P_REXW); + break; + + case INDEX_op_movcond_i32: + tcg_out_movcond(s, args[5], args[0], args[1], args[2], + const_args[2], args[3], args[4], 0); + break; + case INDEX_op_movcond_i64: + tcg_out_movcond(s, args[5], args[0], args[1], args[2], + const_args[2], args[3], args[4], P_REXW); + break; + case INDEX_op_qemu_ld8u: tcg_out_qemu_ld(s, args, 0); break; @@ -1376,6 +1429,12 @@ static const TCGTargetOpDef x86_64_op_defs[] = { { INDEX_op_ext16u_i64, { "r", "r"} }, { INDEX_op_ext32u_i64, { "r", "r"} }, + { INDEX_op_setcond_i32, { "r", "r", "re" } }, + { INDEX_op_setcond_i64, { "r", "r", "re" } }, + + { INDEX_op_movcond_i32, { "r", "r", "re", "r", "r" } }, + { INDEX_op_movcond_i64, { "r", "r", "re", "r", "r" } }, + { INDEX_op_qemu_ld8u, { "r", "L" } }, { INDEX_op_qemu_ld8s, { "r", "L" } }, { INDEX_op_qemu_ld16u, { "r", "L" } },
Implement conditional moves in the x86_64 backend. Signed-off-by: Richard Henderson <rth@twiddle.net> --- tcg/x86_64/tcg-target.c | 65 ++++++++++++++++++++++++++++++++++++++++++++-- 1 files changed, 62 insertions(+), 3 deletions(-)