Message ID | Pine.LNX.4.64.0912171825330.4454@linmac.oyster.ru |
---|---|
State | New |
Headers | show |
On Thu, Dec 17, 2009 at 4:32 PM, malc <av1474@comtv.ru> wrote: [...] > > Some: > a. It breaks tcg on PPC[1]: > > ...qemu/tcg/tcg.c:1378: tcg fatal error What a surprise :-) I can provide a similar patch for ARM (I already have one for my own implementation of setcond), but I'll wait for this patch series to stabilize first. Laurent > b. Documentation for movcond has a typo, t0 is assigned not t1 > > c. Historically things like that were made conditional with > a generic fallback (bswap, neg, not, rot, etc) > > d. Documentation for setcond2 is missing > > e. There's some indentation weirdness here and there and `git am' > complains about added trailing whitespace > > It would also be interesting to learn what impact adding those two > has on performance, any results? > > [..snip..] > > [1] With following i can run some i386 user tests on PPC32 (ls, > openssl) > > diff --git a/tcg/ppc/tcg-target.c b/tcg/ppc/tcg-target.c > index 07e6941..195af13 100644 > --- a/tcg/ppc/tcg-target.c > +++ b/tcg/ppc/tcg-target.c > @@ -316,6 +316,7 @@ static int tcg_target_const_match(tcg_target_long val, > #define STH OPCD(44) > #define STW OPCD(36) > > +#define ADDIC OPCD(12) > #define ADDI OPCD(14) > #define ADDIS OPCD(15) > #define ORI OPCD(24) > @@ -339,6 +340,7 @@ static int tcg_target_const_match(tcg_target_long val, > #define CRANDC XO19(129) > #define CRNAND XO19(225) > #define CROR XO19(449) > +#define CRNOR XO19( 33) > > #define EXTSB XO31(954) > #define EXTSH XO31(922) > @@ -365,6 +367,8 @@ static int tcg_target_const_match(tcg_target_long val, > #define MTSPR XO31(467) > #define SRAWI XO31(824) > #define NEG XO31(104) > +#define MFCR XO31( 19) > +#define CNTLZW XO31( 26) > > #define LBZX XO31( 87) > #define LHZX XO31(279) > @@ -1073,6 +1077,95 @@ static void tcg_out_brcond (TCGContext *s, int cond, > tcg_out_bc (s, tcg_to_bc[cond], label_index); > } > > +static void tcg_out_setcond (TCGContext *s, int cond, TCGArg arg0, > + TCGArg arg1, TCGArg arg2, int const_arg2) > +{ > + 
int crop, sh; > + > + switch (cond) { > + case TCG_COND_EQ: > + if (const_arg2) { > + if ((uint16_t) arg2 == arg2) { > + tcg_out32 (s, XORI | RS (arg1) | RA (0) | arg2); > + } > + else { > + tcg_out_movi (s, TCG_TYPE_I32, 0, arg2); > + tcg_out32 (s, XOR | SAB (arg1, 0, 0)); > + } > + } > + else { > + tcg_out32 (s, XOR | SAB (arg1, 0, arg2)); > + } > + tcg_out32 (s, CNTLZW | RS (0) | RA (0)); > + tcg_out32 (s, (RLWINM > + | RA (arg0) > + | RS (0) > + | SH (27) > + | MB (5) > + | ME (31) > + ) > + ); > + return; > + > + case TCG_COND_NE: > + if (const_arg2) { > + if ((uint16_t) arg2 == arg2) { > + tcg_out32 (s, XORI | RS (arg1) | RA (0) | arg2); > + } > + else { > + tcg_out_movi (s, TCG_TYPE_I32, 0, arg2); > + tcg_out32 (s, XOR | SAB (arg1, 0, 0)); > + } > + } > + else { > + tcg_out32 (s, XOR | SAB (arg1, 0, arg2)); > + } > + > + tcg_out32 (s, ADDIC | RT (arg0) | RA (0) | 0xffff); > + tcg_out32 (s, SUBFE | TAB (arg0, arg0, 0)); > + return; > + > + case TCG_COND_LTU: > + case TCG_COND_LT: > + sh = 29; > + crop = 0; > + break; > + > + case TCG_COND_GEU: > + case TCG_COND_GE: > + sh = 31; > + crop = CRNOR | BT (7, CR_EQ) | BA (7, CR_LT) | BB (7, CR_LT); > + break; > + > + case TCG_COND_LEU: > + case TCG_COND_LE: > + sh = 31; > + crop = CRNOR | BT (7, CR_EQ) | BA (7, CR_GT) | BB (7, CR_GT); > + break; > + > + case TCG_COND_GTU: > + case TCG_COND_GT: > + sh = 30; > + crop = 0; > + break; > + > + default: > + tcg_abort (); > + } > + > + tcg_out_cmp (s, cond, arg1, arg2, const_arg2, 7); > + tcg_out32 (s, MFCR | RT (0)); > + if (crop) tcg_out32 (s, crop); > + tcg_out32 (s, (RLWINM > + | RA (arg0) > + | RS (0) > + | SH (sh) > + | MB (31) > + | ME (31) > + ) > + ); > +} > + > /* XXX: we implement it at the target level to avoid having to > handle cross basic blocks temporaries */ > static void tcg_out_brcond2 (TCGContext *s, const TCGArg *args, > @@ -1496,6 +1589,10 @@ static void tcg_out_op(TCGContext *s, int opc, const TCGArg *args, > tcg_out32 (s, EXTSH | RS (args[1]) | RA 
(args[0])); > break; > > + case INDEX_op_setcond_i32: > + tcg_out_setcond(s, args[3], args[0], args[1], args[2], const_args[2]); > + break; > + > default: > tcg_dump_ops (s, stderr); > tcg_abort (); > @@ -1544,6 +1641,8 @@ static const TCGTargetOpDef ppc_op_defs[] = { > > { INDEX_op_neg_i32, { "r", "r" } }, > > + { INDEX_op_setcond_i32, { "r", "r", "ri" } }, > + > #if TARGET_LONG_BITS == 32 > { INDEX_op_qemu_ld8u, { "r", "L" } }, > { INDEX_op_qemu_ld8s, { "r", "L" } }, > > -- > mailto:av1474@comtv.ru > > >
On 12/17/2009 07:32 AM, malc wrote: >> These new opcodes are considered "required" by the backend, >> because expanding them at the tcg level breaks the basic block. >> There might be some way to emulate within tcg internals, but >> that doesn't seem worthwhile, as essentially all hosts have >> some form of support for these. ... > c. Historically things like that were made conditional with > a generic fallback (bswap, neg, not, rot, etc) I answered this one above. A generic fallback would break the basic block, which would break TCG's simple register allocation. > b. Documentation for movcond has a typo, t0 is assigned not t1 Oops. Will fix. > d. Documentation for setcond2 is missing Ah, I see that brcond2 is missing as well; I'll fix that too. > It would also be interesting to learn what impact adding those two > has on performance, any results? Hmph, not as much as I would have liked. I suppose Intel is getting pretty darned good with its branch prediction. It shaved about 3 minutes off 183.equake from what I posted earlier this week; that's something around a 7% improvement, assuming it's not just all noise (I haven't run that test enough times to see what the variation is). > + case TCG_COND_NE: > + if (const_arg2) { > + if ((uint16_t) arg2 == arg2) { > + tcg_out32 (s, XORI | RS (arg1) | RA (0) | arg2); > + } > + else { > + tcg_out_movi (s, TCG_TYPE_I32, 0, arg2); > + tcg_out32 (s, XOR | SAB (arg1, 0, 0)); > + } > + } > + else { > + tcg_out32 (s, XOR | SAB (arg1, 0, arg2)); > + } > + > + tcg_out32 (s, ADDIC | RT (arg0) | RA (0) | 0xffff); > + tcg_out32 (s, SUBFE | TAB (arg0, arg0, 0)); > + return; Heh, you know a trick that gcc doesn't for powerpc. It just adds an xor at the end of the EQ sequence. r~
On Thu, 17 Dec 2009, Richard Henderson wrote: > On 12/17/2009 07:32 AM, malc wrote: > > > These new opcodes are considered "required" by the backend, > > > because expanding them at the tcg level breaks the basic block. > > > There might be some way to emulate within tcg internals, but > > > that doesn't seem worthwhile, as essentially all hosts have > > > some form of support for these. > .. > > c. Historically things like that were made conditional with > > a generic fallback (bswap, neg, not, rot, etc) > > I answered this one above. A generic fallback would break the > basic block, which would break TCGs simple register allocation. Urgh.. I really hate implementing those xxxx2 ops. See for example this lovely thread: http://www.archivum.info/qemu-devel@nongnu.org/2008-06/00306/%5BQemu-devel%5D_%5B4705%5D_Fix_div%5Bu%5D2. > > > b. Documentation for movcond has a typo, t0 is assigned not t1 > > Oops. Will fix. > > > d. Documentation for setcond2 is missing > > Ah, I see that brcond2 is missing as well; I'll fix that too. > > > It would also be interesting to learn what impact adding those two > > has on performance, any results? > > Hmph, not as much as I would have liked. I suppose Intel is getting pretty > darned good with its branch prediction. It shaved about 3 minutes off > 183.equake from what I posted earlier this week; that's something around a 7% > improvement, assuming it's not just all noise (I havn't run that test enough > times to see what the variation is). If 3 minutes(!!) is only 7% then this test is a monster. 
> > > + case TCG_COND_NE: > > + if (const_arg2) { > > + if ((uint16_t) arg2 == arg2) { > > + tcg_out32 (s, XORI | RS (arg1) | RA (0) | arg2); > > + } > > + else { > > + tcg_out_movi (s, TCG_TYPE_I32, 0, arg2); > > + tcg_out32 (s, XOR | SAB (arg1, 0, 0)); > > + } > > + } > > + else { > > + tcg_out32 (s, XOR | SAB (arg1, 0, arg2)); > > + } > > + > > + tcg_out32 (s, ADDIC | RT (arg0) | RA (0) | 0xffff); > > + tcg_out32 (s, SUBFE | TAB (arg0, arg0, 0)); > > + return; > > Heh, you know a trick that gcc doesn't for powerpc. It just adds an xor at > the end of the EQ sequence. Well, truth be told, I just looked at what gcc 4.4.1 produces for: return op1 != op2 ? 1 : 0; == And did the same. FWIW gcc's handling of LT,LE,GT,GE is not as naive as this implementation (it avoids CR ops/moves when op2 is immediate), but I'm not sure if the gain is worth the pain though, so I left it as is for simplicity's sake. P.S. BTW PPC has the same dilemma w.r.t. conditional moves, on x86 it's cmov that is not universally available, for PPC it's isel, given that there's also some fruit in going with LDBRX where available I guess it's worthwhile investigating the proper interface for enquiring the host CPU features...
On 12/17/2009 09:07 AM, Richard Henderson wrote: >> + case TCG_COND_NE: >> + if (const_arg2) { >> + if ((uint16_t) arg2 == arg2) { >> + tcg_out32 (s, XORI | RS (arg1) | RA (0) | arg2); >> + } >> + else { >> + tcg_out_movi (s, TCG_TYPE_I32, 0, arg2); >> + tcg_out32 (s, XOR | SAB (arg1, 0, 0)); >> + } >> + } >> + else { >> + tcg_out32 (s, XOR | SAB (arg1, 0, arg2)); >> + } >> + >> + tcg_out32 (s, ADDIC | RT (arg0) | RA (0) | 0xffff); >> + tcg_out32 (s, SUBFE | TAB (arg0, arg0, 0)); >> + return; > > Heh, you know a trick that gcc doesn't for powerpc. It just adds an xor > at the end of the EQ sequence. Ah, gcc does know the trick, it just thinks it's only a win on old Power systems. r~
On 12/17/2009 09:47 AM, malc wrote: > Urgh.. I really hate implementing those xxxx2 ops. > > See for example this lovely thread: > http://www.archivum.info/qemu-devel@nongnu.org/2008-06/00306/%5BQemu-devel%5D_%5B4705%5D_Fix_div%5Bu%5D2. Heh, that one's pretty nasty. But fwiw, you can just call into your brcond2 implementation for setcond2. There's a bit more work to do on sparc and mips because they'd prefer we filled the delay slots. r~
On Thu, 17 Dec 2009, Richard Henderson wrote: > On 12/17/2009 07:32 AM, malc wrote: > > > These new opcodes are considered "required" by the backend, > > > because expanding them at the tcg level breaks the basic block. > > > There might be some way to emulate within tcg internals, but > > > that doesn't seem worthwhile, as essentially all hosts have > > > some form of support for these. > .. > > c. Historically things like that were made conditional with > > a generic fallback (bswap, neg, not, rot, etc) > > I answered this one above. A generic fallback would break the > basic block, which would break TCGs simple register allocation. > > > b. Documentation for movcond has a typo, t0 is assigned not t1 > > Oops. Will fix. > > > d. Documentation for setcond2 is missing > > Ah, I see that brcond2 is missing as well; I'll fix that too. > > > It would also be interesting to learn what impact adding those two > > has on performance, any results? > > Hmph, not as much as I would have liked. I suppose Intel is getting pretty > darned good with its branch prediction. It shaved about 3 minutes off > 183.equake from what I posted earlier this week; that's something around a 7% > improvement, assuming it's not just all noise (I havn't run that test enough > times to see what the variation is). > After fixing a bug (crop was done after reading the cr) I ran some openssl speed benchmarks, and, at least here on an MPC7447A, got a speed degradation, tiny but consistent.
Took a very quick glance at the generated code and the first thing i saw was this: ---------------- IN: 0x40082295: movzbl (%eax),%eax 0x40082298: cmp $0x3d,%al 0x4008229a: setne %dl 0x4008229d: test %al,%al 0x4008229f: je 0x400822d2 OP after liveness analysis: mov_i32 tmp2,eax qemu_ld8u tmp0,tmp2,$0xffffffff mov_i32 eax,tmp0 movi_i32 tmp1,$0x3d mov_i32 tmp0,eax nopn $0x2,$0x2 sub_i32 cc_dst,tmp0,tmp1 movi_i32 tmp13,$0xff and_i32 tmp4,cc_dst,tmp13 movi_i32 tmp13,$0x0 setcond_i32 tmp0,tmp4,tmp13,ne movi_i32 tmp14,$0xff and_i32 tmp13,tmp0,tmp14 .... OUT: [size=204] 0x601051b0: lwz r14,0(r27) 0x601051b4: lbzx r14,0,r14 0x601051b8: mr r15,r14 0x601051bc: addi r15,r15,-61 0x601051c0: andi. r15,r15,255 0x601051c4: cmpwi cr6,r15,0 0x601051c8: crnot 4*cr7+eq,4*cr6+eq 0x601051cc: mfcr r0 0x601051d0: rlwinm r15,r0,31,31,31 0x601051d4: andi. r15,r15,255 ... So the fact that setcond produces 0/1 was never communicated to the tcg, not that i would claim that it's possible at all... [..snip..]
On 12/18/2009 07:40 AM, malc wrote: > After fixing a bug (crop was done after reading the cr) i run some > openssl speed benchmarks, and, at least here on an MPC7447A, got a > speed degradation, tiny but consistent. Well, you could try rendering the setcond with branches instead of logical operations. You'll still gain the benefit of not having ended the TCG basic block, and forced the stores of globals to their slots etc etc. > IN: > 0x40082295: movzbl (%eax),%eax > 0x40082298: cmp $0x3d,%al > 0x4008229a: setne %dl > 0x4008229d: test %al,%al > 0x4008229f: je 0x400822d2 > > OP after liveness analysis: > mov_i32 tmp2,eax > qemu_ld8u tmp0,tmp2,$0xffffffff > mov_i32 eax,tmp0 > movi_i32 tmp1,$0x3d > mov_i32 tmp0,eax > nopn $0x2,$0x2 > sub_i32 cc_dst,tmp0,tmp1 > movi_i32 tmp13,$0xff > and_i32 tmp4,cc_dst,tmp13 > movi_i32 tmp13,$0x0 > setcond_i32 tmp0,tmp4,tmp13,ne > movi_i32 tmp14,$0xff > and_i32 tmp13,tmp0,tmp14 > > .... > > OUT: [size=204] > 0x601051b0: lwz r14,0(r27) > 0x601051b4: lbzx r14,0,r14 > 0x601051b8: mr r15,r14 > 0x601051bc: addi r15,r15,-61 > 0x601051c0: andi. r15,r15,255 > 0x601051c4: cmpwi cr6,r15,0 > 0x601051c8: crnot 4*cr7+eq,4*cr6+eq > 0x601051cc: mfcr r0 > 0x601051d0: rlwinm r15,r0,31,31,31 > 0x601051d4: andi. r15,r15,255 > > ... > > So the fact that setcond produces 0/1 was never communicated to the > tcg, not that i would claim that it's possible at all... It isn't. And anyway, if you look at the opcodes generated without the setcond patch you'll see that and 255 in there as well. Some more surgery on the i386 translator could probably get rid of that. All I replaced were sequences of brcond c1,c2,$lab_true movi dest,0 br $lab_over movi dest,1 r~
diff --git a/tcg/ppc/tcg-target.c b/tcg/ppc/tcg-target.c index 07e6941..195af13 100644 --- a/tcg/ppc/tcg-target.c +++ b/tcg/ppc/tcg-target.c @@ -316,6 +316,7 @@ static int tcg_target_const_match(tcg_target_long val, #define STH OPCD(44) #define STW OPCD(36) +#define ADDIC OPCD(12) #define ADDI OPCD(14) #define ADDIS OPCD(15) #define ORI OPCD(24) @@ -339,6 +340,7 @@ static int tcg_target_const_match(tcg_target_long val, #define CRANDC XO19(129) #define CRNAND XO19(225) #define CROR XO19(449) +#define CRNOR XO19( 33) #define EXTSB XO31(954) #define EXTSH XO31(922) @@ -365,6 +367,8 @@ static int tcg_target_const_match(tcg_target_long val, #define MTSPR XO31(467) #define SRAWI XO31(824) #define NEG XO31(104) +#define MFCR XO31( 19) +#define CNTLZW XO31( 26) #define LBZX XO31( 87) #define LHZX XO31(279) @@ -1073,6 +1077,95 @@ static void tcg_out_brcond (TCGContext *s, int cond, tcg_out_bc (s, tcg_to_bc[cond], label_index); } +static void tcg_out_setcond (TCGContext *s, int cond, TCGArg arg0, + TCGArg arg1, TCGArg arg2, int const_arg2) +{ + int crop, sh; + + switch (cond) { + case TCG_COND_EQ: + if (const_arg2) { + if ((uint16_t) arg2 == arg2) { + tcg_out32 (s, XORI | RS (arg1) | RA (0) | arg2); + } + else { + tcg_out_movi (s, TCG_TYPE_I32, 0, arg2); + tcg_out32 (s, XOR | SAB (arg1, 0, 0)); + } + } + else { + tcg_out32 (s, XOR | SAB (arg1, 0, arg2)); + } + tcg_out32 (s, CNTLZW | RS (0) | RA (0)); + tcg_out32 (s, (RLWINM + | RA (arg0) + | RS (0) + | SH (27) + | MB (5) + | ME (31) + ) + ); + return; + + case TCG_COND_NE: + if (const_arg2) { + if ((uint16_t) arg2 == arg2) { + tcg_out32 (s, XORI | RS (arg1) | RA (0) | arg2); + } + else { + tcg_out_movi (s, TCG_TYPE_I32, 0, arg2); + tcg_out32 (s, XOR | SAB (arg1, 0, 0)); + } + } + else { + tcg_out32 (s, XOR | SAB (arg1, 0, arg2)); + } + + tcg_out32 (s, ADDIC | RT (arg0) | RA (0) | 0xffff); + tcg_out32 (s, SUBFE | TAB (arg0, arg0, 0)); + return; + + case TCG_COND_LTU: + case TCG_COND_LT: + sh = 29; + crop = 0; + break; + + 
case TCG_COND_GEU: + case TCG_COND_GE: + sh = 31; + crop = CRNOR | BT (7, CR_EQ) | BA (7, CR_LT) | BB (7, CR_LT); + break; + + case TCG_COND_LEU: + case TCG_COND_LE: + sh = 31; + crop = CRNOR | BT (7, CR_EQ) | BA (7, CR_GT) | BB (7, CR_GT); + break; + + case TCG_COND_GTU: + case TCG_COND_GT: + sh = 30; + crop = 0; + break; + + default: + tcg_abort (); + } + + tcg_out_cmp (s, cond, arg1, arg2, const_arg2, 7); + tcg_out32 (s, MFCR | RT (0)); + if (crop) tcg_out32 (s, crop); + tcg_out32 (s, (RLWINM + | RA (arg0) + | RS (0) + | SH (sh) + | MB (31) + | ME (31) + ) + ); +} + /* XXX: we implement it at the target level to avoid having to handle cross basic blocks temporaries */ static void tcg_out_brcond2 (TCGContext *s, const TCGArg *args, @@ -1496,6 +1589,10 @@ static void tcg_out_op(TCGContext *s, int opc, const TCGArg *args, tcg_out32 (s, EXTSH | RS (args[1]) | RA (args[0])); break; + case INDEX_op_setcond_i32: + tcg_out_setcond(s, args[3], args[0], args[1], args[2], const_args[2]); + break; + default: tcg_dump_ops (s, stderr); tcg_abort (); @@ -1544,6 +1641,8 @@ static const TCGTargetOpDef ppc_op_defs[] = { { INDEX_op_neg_i32, { "r", "r" } }, + { INDEX_op_setcond_i32, { "r", "r", "ri" } }, + #if TARGET_LONG_BITS == 32 { INDEX_op_qemu_ld8u, { "r", "L" } }, { INDEX_op_qemu_ld8s, { "r", "L" } },