Message ID | 1347836915-14091-4-git-send-email-aurelien@aurel32.net |
---|---|
State | New |
Headers | show |
On Mon, Sep 17, 2012 at 1:08 AM, Aurelien Jarno <aurelien@aurel32.net> wrote: > Now that the setcond TCG op is available, it's possible to replace > shl and shr helpers by TCG code. The code generated by TCG is slightly > longer than the code generated by GCC for the helper but is still worth > it as this avoid all the consequences of using an helper: globals saved > back to memory, no possible optimization, call overhead, etc. > > Cc: Peter Maydell <peter.maydell@linaro.org> > Signed-off-by: Aurelien Jarno <aurelien@aurel32.net> > --- > target-arm/helper.h | 2 -- > target-arm/op_helper.c | 16 ---------------- > target-arm/translate.c | 32 ++++++++++++++++++++++++++++---- > 3 files changed, 28 insertions(+), 22 deletions(-) > > diff --git a/target-arm/helper.h b/target-arm/helper.h > index 7151e28..b123d3e 100644 > --- a/target-arm/helper.h > +++ b/target-arm/helper.h > @@ -145,8 +145,6 @@ DEF_HELPER_5(neon_tbl, i32, env, i32, i32, i32, i32) > DEF_HELPER_3(adc_cc, i32, env, i32, i32) > DEF_HELPER_3(sbc_cc, i32, env, i32, i32) > > -DEF_HELPER_3(shl, i32, env, i32, i32) > -DEF_HELPER_3(shr, i32, env, i32, i32) > DEF_HELPER_3(sar, i32, env, i32, i32) > DEF_HELPER_3(shl_cc, i32, env, i32, i32) > DEF_HELPER_3(shr_cc, i32, env, i32, i32) > diff --git a/target-arm/op_helper.c b/target-arm/op_helper.c > index 6095f24..5fcd12c 100644 > --- a/target-arm/op_helper.c > +++ b/target-arm/op_helper.c > @@ -355,22 +355,6 @@ uint32_t HELPER(sbc_cc)(CPUARMState *env, uint32_t a, uint32_t b) > > /* Similarly for variable shift instructions. */ > > -uint32_t HELPER(shl)(CPUARMState *env, uint32_t x, uint32_t i) > -{ > - int shift = i & 0xff; > - if (shift >= 32) > - return 0; > - return x << shift; > -} > - > -uint32_t HELPER(shr)(CPUARMState *env, uint32_t x, uint32_t i) > -{ > - int shift = i & 0xff; > - if (shift >= 32) > - return 0; > - return (uint32_t)x >> shift; > -} > - > uint32_t HELPER(sar)(CPUARMState *env, uint32_t x, uint32_t i) > { > int shift = i & 0xff; > diff --git a/target-arm/translate.c b/target-arm/translate.c > index 19bb1e8..9c29065 100644 > --- a/target-arm/translate.c > +++ b/target-arm/translate.c > @@ -440,6 +440,26 @@ static void gen_sub_CC(TCGv dest, TCGv t0, TCGv t1) > tcg_gen_mov_i32(dest, cpu_NF); > } > > +#define GEN_SHIFT(name) \ > +static void gen_##name(TCGv dest, TCGv t0, TCGv t1) \ > +{ \ > + TCGv tmp1, tmp2; \ > + tmp1 = tcg_temp_new_i32(); \ > + tcg_gen_andi_i32(tmp1, t1, 0xff); \ > + tmp2 = tcg_temp_new_i32(); \ > + tcg_gen_setcondi_i32(TCG_COND_GE, tmp2, tmp1, 32); \ > + tcg_gen_andi_i32(tmp1, tmp1, 0x1f); \ I don't think the 'and 0x1f' is needed given that later you'll and with 0 if the shift amount is >= 32. > + tcg_gen_##name##_i32(dest, t0, tmp1); \ > + tcg_temp_free_i32(tmp1); \ > + tcg_gen_subi_i32(tmp2, tmp2, 1); \ > + tcg_gen_and_i32(dest, dest, tmp2); \ > + tcg_temp_free_i32(tmp2); \ > +} Laurent > +GEN_SHIFT(shl) > +GEN_SHIFT(shr) > +#undef GEN_SHIFT > + > + > /* FIXME: Implement this natively. */ > #define tcg_gen_abs_i32(t0, t1) gen_helper_abs(t0, t1) > > @@ -516,8 +536,12 @@ static inline void gen_arm_shift_reg(TCGv var, int shiftop, > } > } else { > switch (shiftop) { > - case 0: gen_helper_shl(var, cpu_env, var, shift); break; > - case 1: gen_helper_shr(var, cpu_env, var, shift); break; > + case 0: > + gen_shl(var, var, shift); > + break; > + case 1: > + gen_shr(var, var, shift); > + break; > case 2: gen_helper_sar(var, cpu_env, var, shift); break; > case 3: tcg_gen_andi_i32(shift, shift, 0x1f); > tcg_gen_rotr_i32(var, var, shift); break; > @@ -9161,7 +9185,7 @@ static void disas_thumb_insn(CPUARMState *env, DisasContext *s) > break; > case 0x2: /* lsl */ > if (s->condexec_mask) { > - gen_helper_shl(tmp2, cpu_env, tmp2, tmp); > + gen_shl(tmp2, tmp2, tmp); > } else { > gen_helper_shl_cc(tmp2, cpu_env, tmp2, tmp); > gen_logic_CC(tmp2); > @@ -9169,7 +9193,7 @@ static void disas_thumb_insn(CPUARMState *env, DisasContext *s) > break; > case 0x3: /* lsr */ > if (s->condexec_mask) { > - gen_helper_shr(tmp2, cpu_env, tmp2, tmp); > + gen_shr(tmp2, tmp2, tmp); > } else { > gen_helper_shr_cc(tmp2, cpu_env, tmp2, tmp); > gen_logic_CC(tmp2); > -- > 1.7.10.4 > >
On 17 September 2012 10:30, Laurent Desnogues <laurent.desnogues@gmail.com> wrote: > On Mon, Sep 17, 2012 at 1:08 AM, Aurelien Jarno <aurelien@aurel32.net> wrote: >> +#define GEN_SHIFT(name) \ >> +static void gen_##name(TCGv dest, TCGv t0, TCGv t1) \ >> +{ \ >> + TCGv tmp1, tmp2; \ >> + tmp1 = tcg_temp_new_i32(); \ >> + tcg_gen_andi_i32(tmp1, t1, 0xff); \ >> + tmp2 = tcg_temp_new_i32(); \ >> + tcg_gen_setcondi_i32(TCG_COND_GE, tmp2, tmp1, 32); \ >> + tcg_gen_andi_i32(tmp1, tmp1, 0x1f); \ > > I don't think the 'and 0x1f' is needed given that later you'll and > with 0 if the shift amount is >= 32. The TCG shift operations are undefined behaviour (not merely undefined result) if the shift is >= 32, so we must avoid doing that even if we're going to throw away the answer. -- PMM
On Mon, Sep 17, 2012 at 11:43 AM, Peter Maydell <peter.maydell@linaro.org> wrote: > On 17 September 2012 10:30, Laurent Desnogues > <laurent.desnogues@gmail.com> wrote: >> On Mon, Sep 17, 2012 at 1:08 AM, Aurelien Jarno <aurelien@aurel32.net> wrote: >>> +#define GEN_SHIFT(name) \ >>> +static void gen_##name(TCGv dest, TCGv t0, TCGv t1) \ >>> +{ \ >>> + TCGv tmp1, tmp2; \ >>> + tmp1 = tcg_temp_new_i32(); \ >>> + tcg_gen_andi_i32(tmp1, t1, 0xff); \ >>> + tmp2 = tcg_temp_new_i32(); \ >>> + tcg_gen_setcondi_i32(TCG_COND_GE, tmp2, tmp1, 32); \ >>> + tcg_gen_andi_i32(tmp1, tmp1, 0x1f); \ >> >> I don't think the 'and 0x1f' is needed given that later you'll and >> with 0 if the shift amount is >= 32. > > The TCG shift operations are undefined behaviour (not merely > undefined result) if the shift is >= 32, so we must avoid > doing that even if we're going to throw away the answer. That's odd that it doesn't just state that the result is undefined. I wonder what "undefined behavior" means. I understand what undefined behavior (as opposed tu undefined result) means for divisions by 0, but not for a shift larger than data type width. Anyway that makes my comment about removing the & 0x1f pointless. Laurent
On 17 September 2012 14:36, Laurent Desnogues <laurent.desnogues@gmail.com> wrote: > On Mon, Sep 17, 2012 at 11:43 AM, Peter Maydell > <peter.maydell@linaro.org> wrote: >> The TCG shift operations are undefined behaviour (not merely >> undefined result) if the shift is >= 32, so we must avoid >> doing that even if we're going to throw away the answer. > > That's odd that it doesn't just state that the result is undefined. > I wonder what "undefined behavior" means. I understand > what undefined behavior (as opposed tu undefined result) > means for divisions by 0, but not for a shift larger than data > type width. Well, it means anything the implementation likes :-) The only concrete example I could come up with was the old pre-286 behaviour where the 8086 didn't mask the shift count at all, so a variable shift by a large amount would be implemented as that many one-bit-per-cycle shifts and could take an extremely long time. -- PMM
On 09/17/2012 02:43 AM, Peter Maydell wrote: > The TCG shift operations are undefined behaviour (not merely > undefined result) if the shift is >= 32, so we must avoid > doing that even if we're going to throw away the answer. I don't think that ought to be true. In particular, I've just posted a patch to cure the Sparc TCG backend from producing an illegal insn in the face of a shift with a count of 64. I'd much rather we simply had an undefined result. r~
On 09/16/2012 04:08 PM, Aurelien Jarno wrote: > + tcg_gen_setcondi_i32(TCG_COND_GE, tmp2, tmp1, 32); \ > + tcg_gen_andi_i32(tmp1, tmp1, 0x1f); \ > + tcg_gen_##name##_i32(dest, t0, tmp1); \ > + tcg_temp_free_i32(tmp1); \ > + tcg_gen_subi_i32(tmp2, tmp2, 1); \ > + tcg_gen_and_i32(dest, dest, tmp2); \ Ought I revive my movcond patch? Surely you can agree it's much more natural to write this as a conditional move of zero than playing masking games. r~
On 17 September 2012 16:34, Richard Henderson <rth@twiddle.net> wrote: > On 09/17/2012 02:43 AM, Peter Maydell wrote: >> The TCG shift operations are undefined behaviour (not merely >> undefined result) if the shift is >= 32, so we must avoid >> doing that even if we're going to throw away the answer. > > I don't think that ought to be true. > > In particular, I've just posted a patch to cure the Sparc TCG > backend from producing an illegal insn in the face of a shift > with a count of 64. > > I'd much rather we simply had an undefined result. Variously: * I didn't really want to have to audit all the TCG backends to see if they were taking advantage of the 'undefined behaviour' clause * the C standard says 'undefined behaviour', not 'unknown value', for out of range shifts * the TCG spec doesn't currently have the concept of 'unknown value' at all and I wasn't sure it was worth introducing it which all added up to 'not really worth it to save one insn'. Of these, the first is the most significant. If you want to wade through architecture manuals for 8 architectures, be my guest :-) -- PMM
On 09/17/2012 08:41 AM, Peter Maydell wrote: > Of these, the first is the most significant. If you want to wade > through architecture manuals for 8 architectures, be my guest :-) I think that's probably easier than auditing all of our guests now. ;-) r~
On Mon, Sep 17, 2012 at 08:36:31AM -0700, Richard Henderson wrote: > On 09/16/2012 04:08 PM, Aurelien Jarno wrote: > > + tcg_gen_setcondi_i32(TCG_COND_GE, tmp2, tmp1, 32); \ > > + tcg_gen_andi_i32(tmp1, tmp1, 0x1f); \ > > + tcg_gen_##name##_i32(dest, t0, tmp1); \ > > + tcg_temp_free_i32(tmp1); \ > > + tcg_gen_subi_i32(tmp2, tmp2, 1); \ > > + tcg_gen_and_i32(dest, dest, tmp2); \ > > Ought I revive my movcond patch? > > Surely you can agree it's much more natural to write this > as a conditional move of zero than playing masking games. > I agree it's more natural, that said in that case it doesn't spare that many instructions, and I am still not sure we want to introduce such a TCG op. What is important is to avoid helpers and branches which are really killing the performances. If you insist, maybe we can just add a movcond op that uses setcond, so it makes the code more readable? We introduced setcond a lot of time ago, and there are still plenty of places where it's not used. We can get higher performances already by fixing that.
On 09/17/2012 11:09 AM, Aurelien Jarno wrote: > If you insist, maybe we can just add a movcond op that uses setcond, so > it makes the code more readable? Well, that's certainly a good first step. And an easy fall back for hosts that choose not to implement the full conditional move. But at least 4 of our 8 hosts do have a proper cmove insn. But honestly I can't understand the resistance to movcond. r~
On Mon, Sep 17, 2012 at 11:47:05AM -0700, Richard Henderson wrote: > On 09/17/2012 11:09 AM, Aurelien Jarno wrote: > > If you insist, maybe we can just add a movcond op that uses setcond, so > > it makes the code more readable? > > Well, that's certainly a good first step. And an easy fall back for > hosts that choose not to implement the full conditional move. But at > least 4 of our 8 hosts do have a proper cmove insn. > > But honestly I can't understand the resistance to movcond. > I am not fundamentally opposed to a conditional mov instruction, that said there are things I don't like in the way you implemented movcond. When introducing new instructions, especially mandatory ones, we have to really take care of designing it correctly, as changing it latter won't be that easy and might requite an audit of all targets (look at the shifts undefined result/behavior). What I don't really like in the implementation you proposed: - It's a mandatory op. - It's implemented using jumps as a fallback at least on i386 and mips. One of the main reasons to have a movcond instruction, is to avoid jumps. - I am not sure it really matches what's the host CPUs are providing, and also not sure it matches the needs. What you offer is a mix between isel and cmov. One last argument is that in the patch series you offered, movcond wasn't used a lot in the targets. Now that we might use it for implementing shifts this argument has lost parts of its value.
On Mon, Sep 17, 2012 at 03:36:46PM +0200, Laurent Desnogues wrote: > On Mon, Sep 17, 2012 at 11:43 AM, Peter Maydell > <peter.maydell@linaro.org> wrote: > > On 17 September 2012 10:30, Laurent Desnogues > > <laurent.desnogues@gmail.com> wrote: > >> On Mon, Sep 17, 2012 at 1:08 AM, Aurelien Jarno <aurelien@aurel32.net> wrote: > >>> +#define GEN_SHIFT(name) \ > >>> +static void gen_##name(TCGv dest, TCGv t0, TCGv t1) \ > >>> +{ \ > >>> + TCGv tmp1, tmp2; \ > >>> + tmp1 = tcg_temp_new_i32(); \ > >>> + tcg_gen_andi_i32(tmp1, t1, 0xff); \ > >>> + tmp2 = tcg_temp_new_i32(); \ > >>> + tcg_gen_setcondi_i32(TCG_COND_GE, tmp2, tmp1, 32); \ > >>> + tcg_gen_andi_i32(tmp1, tmp1, 0x1f); \ > >> > >> I don't think the 'and 0x1f' is needed given that later you'll and > >> with 0 if the shift amount is >= 32. > > > > The TCG shift operations are undefined behaviour (not merely > > undefined result) if the shift is >= 32, so we must avoid > > doing that even if we're going to throw away the answer. > > That's odd that it doesn't just state that the result is undefined. > I wonder what "undefined behavior" means. I understand > what undefined behavior (as opposed tu undefined result) > means for divisions by 0, but not for a shift larger than data > type width. > > Anyway that makes my comment about removing the & 0x1f > pointless. Hi, I'm a bit late with email but I don't think your comment is irrelevant. It sounds like if we are optimizing for very odd or nonexisting archs. Is there any modern arch that has undefined behaviour (in reality and not only in theoretical specs) these days? Cheers, E
diff --git a/target-arm/helper.h b/target-arm/helper.h index 7151e28..b123d3e 100644 --- a/target-arm/helper.h +++ b/target-arm/helper.h @@ -145,8 +145,6 @@ DEF_HELPER_5(neon_tbl, i32, env, i32, i32, i32, i32) DEF_HELPER_3(adc_cc, i32, env, i32, i32) DEF_HELPER_3(sbc_cc, i32, env, i32, i32) -DEF_HELPER_3(shl, i32, env, i32, i32) -DEF_HELPER_3(shr, i32, env, i32, i32) DEF_HELPER_3(sar, i32, env, i32, i32) DEF_HELPER_3(shl_cc, i32, env, i32, i32) DEF_HELPER_3(shr_cc, i32, env, i32, i32) diff --git a/target-arm/op_helper.c b/target-arm/op_helper.c index 6095f24..5fcd12c 100644 --- a/target-arm/op_helper.c +++ b/target-arm/op_helper.c @@ -355,22 +355,6 @@ uint32_t HELPER(sbc_cc)(CPUARMState *env, uint32_t a, uint32_t b) /* Similarly for variable shift instructions. */ -uint32_t HELPER(shl)(CPUARMState *env, uint32_t x, uint32_t i) -{ - int shift = i & 0xff; - if (shift >= 32) - return 0; - return x << shift; -} - -uint32_t HELPER(shr)(CPUARMState *env, uint32_t x, uint32_t i) -{ - int shift = i & 0xff; - if (shift >= 32) - return 0; - return (uint32_t)x >> shift; -} - uint32_t HELPER(sar)(CPUARMState *env, uint32_t x, uint32_t i) { int shift = i & 0xff; diff --git a/target-arm/translate.c b/target-arm/translate.c index 19bb1e8..9c29065 100644 --- a/target-arm/translate.c +++ b/target-arm/translate.c @@ -440,6 +440,26 @@ static void gen_sub_CC(TCGv dest, TCGv t0, TCGv t1) tcg_gen_mov_i32(dest, cpu_NF); } +#define GEN_SHIFT(name) \ +static void gen_##name(TCGv dest, TCGv t0, TCGv t1) \ +{ \ + TCGv tmp1, tmp2; \ + tmp1 = tcg_temp_new_i32(); \ + tcg_gen_andi_i32(tmp1, t1, 0xff); \ + tmp2 = tcg_temp_new_i32(); \ + tcg_gen_setcondi_i32(TCG_COND_GE, tmp2, tmp1, 32); \ + tcg_gen_andi_i32(tmp1, tmp1, 0x1f); \ + tcg_gen_##name##_i32(dest, t0, tmp1); \ + tcg_temp_free_i32(tmp1); \ + tcg_gen_subi_i32(tmp2, tmp2, 1); \ + tcg_gen_and_i32(dest, dest, tmp2); \ + tcg_temp_free_i32(tmp2); \ +} +GEN_SHIFT(shl) +GEN_SHIFT(shr) +#undef GEN_SHIFT + + /* FIXME: Implement this natively. */ #define tcg_gen_abs_i32(t0, t1) gen_helper_abs(t0, t1) @@ -516,8 +536,12 @@ static inline void gen_arm_shift_reg(TCGv var, int shiftop, } } else { switch (shiftop) { - case 0: gen_helper_shl(var, cpu_env, var, shift); break; - case 1: gen_helper_shr(var, cpu_env, var, shift); break; + case 0: + gen_shl(var, var, shift); + break; + case 1: + gen_shr(var, var, shift); + break; case 2: gen_helper_sar(var, cpu_env, var, shift); break; case 3: tcg_gen_andi_i32(shift, shift, 0x1f); tcg_gen_rotr_i32(var, var, shift); break; @@ -9161,7 +9185,7 @@ static void disas_thumb_insn(CPUARMState *env, DisasContext *s) break; case 0x2: /* lsl */ if (s->condexec_mask) { - gen_helper_shl(tmp2, cpu_env, tmp2, tmp); + gen_shl(tmp2, tmp2, tmp); } else { gen_helper_shl_cc(tmp2, cpu_env, tmp2, tmp); gen_logic_CC(tmp2); @@ -9169,7 +9193,7 @@ static void disas_thumb_insn(CPUARMState *env, DisasContext *s) break; case 0x3: /* lsr */ if (s->condexec_mask) { - gen_helper_shr(tmp2, cpu_env, tmp2, tmp); + gen_shr(tmp2, tmp2, tmp); } else { gen_helper_shr_cc(tmp2, cpu_env, tmp2, tmp); gen_logic_CC(tmp2);
Now that the setcond TCG op is available, it's possible to replace shl and shr helpers by TCG code. The code generated by TCG is slightly longer than the code generated by GCC for the helper but is still worth it as this avoid all the consequences of using an helper: globals saved back to memory, no possible optimization, call overhead, etc. Cc: Peter Maydell <peter.maydell@linaro.org> Signed-off-by: Aurelien Jarno <aurelien@aurel32.net> --- target-arm/helper.h | 2 -- target-arm/op_helper.c | 16 ---------------- target-arm/translate.c | 32 ++++++++++++++++++++++++++++---- 3 files changed, 28 insertions(+), 22 deletions(-)