Message ID: e36e7f72dbd7724145773f759814a3c0a184c667.1271277329.git.rth@twiddle.net
State: New
On Tue, Apr 13, 2010 at 04:33:59PM -0700, Richard Henderson wrote:
> Define OPC_BSWAP. Factor opcode emission to separate functions.
> Use bswap+shift to implement 16-bit swap instead of a rolw; this
> gets the proper zero-extension required by INDEX_op_bswap16_i32.

This is not required by INDEX_op_bswap16_i32. What is needed is that the
value in the input register has the 16 upper bits set to 0. Considering
that, the rolw instruction is faster than bswap + shift.

> Signed-off-by: Richard Henderson <rth@twiddle.net>
> ---
>  tcg/i386/tcg-target.c |   53 +++++++++++++++++++++++++------------------------
>  1 files changed, 27 insertions(+), 26 deletions(-)
[...]
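[Editorial note: here is a small standalone C model, not part of the thread, of the two
sequences under discussion. The helper names and the test value are illustrative; the
point is that the two sequences agree exactly when the upper 16 bits of the input are
already zero, which is the precondition Aurelien describes for INDEX_op_bswap16_i32.]

/* Illustrative sketch only: hypothetical helpers modelling the two
   instruction sequences in plain C. */
#include <assert.h>
#include <stdint.h>

/* rolw $8, %w0: byte-swaps the low 16 bits, leaves the high bits alone. */
static uint32_t bswap16_via_rolw(uint32_t x)
{
    uint16_t lo = (uint16_t)x;
    return (x & 0xffff0000u) | (uint16_t)((lo << 8) | (lo >> 8));
}

/* bswap %0; shr $16, %0: swaps all four bytes, then keeps the top two. */
static uint32_t bswap16_via_bswap_shr(uint32_t x)
{
    return __builtin_bswap32(x) >> 16;
}

int main(void)
{
    uint32_t v = 0x00001234;    /* upper 16 bits already zero */
    assert(bswap16_via_rolw(v) == 0x3412);
    assert(bswap16_via_bswap_shr(v) == 0x3412);
    return 0;
}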
On 04/18/2010 05:13 PM, Aurelien Jarno wrote:
> On Tue, Apr 13, 2010 at 04:33:59PM -0700, Richard Henderson wrote:
>> Define OPC_BSWAP. Factor opcode emission to separate functions.
>> Use bswap+shift to implement 16-bit swap instead of a rolw; this
>> gets the proper zero-extension required by INDEX_op_bswap16_i32.
>
> This is not required by INDEX_op_bswap16_i32. What is needed is that the
> value in the input register has the 16 upper bits set to 0.

Ah.

> Considering that, the rolw instruction is faster than bswap + shift.

Well, no, it isn't.

static inline int test_rolw(unsigned short *s)
{
    int i, start, end;
    asm volatile("rdtsc\n\t"
                 "movl %%eax, %1\n\t"
                 "movzwl %3,%2\n\t"
                 "rolw $8, %w2\n\t"
                 "addl $1,%2\n\t"
                 "rdtsc"
                 : "=&a"(end), "=r"(start), "=r"(i)
                 : "m"(*s) : "edx");
    return end - start;
}

static inline int test_bswap(unsigned short *s)
{
    int i, start, end;
    asm volatile("rdtsc\n\t"
                 "movl %%eax, %1\n\t"
                 "movzwl %3,%2\n\t"
                 "bswap %2\n\t"
                 "shl $16,%2\n\t"
                 "addl $1,%2\n\t"
                 "rdtsc"
                 : "=&a"(end), "=r"(start), "=r"(i)
                 : "m"(*s) : "edx");
    return end - start;
}

model name : Intel(R) Core(TM)2 Duo CPU T7700 @ 2.40GHz
rolw   60 60 72 60 60 72 60 60 72 60
bswap  60 60 60 60 60 60 60 60 60 60

model name : Dual-Core AMD Opteron(tm) Processor 1210
rolw   9 10 9 9 8 8 8 8 8 8
bswap  9  9 8 8 8 8 8 8 8 8

The rolw sequence isn't ever faster, and it's more unstable, likely due to
the partial register stall I mentioned. I will grant that the rolw sequence
is smaller, and I can adjust this patch to use that sequence if you wish.

r~
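[Editorial note: a minimal driver for the two probes above, assuming the test_rolw()
and test_bswap() definitions from the message are pasted in as written. It just prints
ten samples per row, matching the shape of the numbers Richard quotes; the sampling
loop and output format are a sketch, not part of the original mail.]

#include <stdio.h>

/* Assumes test_rolw() and test_bswap() from the message above are defined
   here; each returns an rdtsc delta for one iteration of its sequence. */

int main(void)
{
    unsigned short s = 0x1234;
    int i;

    printf("rolw ");
    for (i = 0; i < 10; i++)
        printf(" %d", test_rolw(&s));
    printf("\nbswap");
    for (i = 0; i < 10; i++)
        printf(" %d", test_bswap(&s));
    printf("\n");
    return 0;
}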
diff --git a/tcg/i386/tcg-target.c b/tcg/i386/tcg-target.c
index 75b9915..0bafd00 100644
--- a/tcg/i386/tcg-target.c
+++ b/tcg/i386/tcg-target.c
@@ -163,6 +163,7 @@ static inline int tcg_target_const_match(tcg_target_long val,
 
 #define P_EXT        0x100      /* 0x0f opcode prefix */
 
+#define OPC_BSWAP    (0xc8 | P_EXT)
 #define OPC_MOVZBL   (0xb6 | P_EXT)
 #define OPC_MOVZWL   (0xb7 | P_EXT)
 #define OPC_MOVSBL   (0xbe | P_EXT)
@@ -339,6 +340,22 @@ static inline void tcg_out_ext16s(TCGContext *s, int dest, int src)
     tcg_out_modrm(s, OPC_MOVSWL, dest, src);
 }
 
+static inline void tcg_out_bswap32(TCGContext *s, int reg)
+{
+    tcg_out_opc(s, OPC_BSWAP + reg);
+}
+
+static inline void tcg_out_bswap16(TCGContext *s, int reg, int sign)
+{
+    /* This swap+shift combination guarantees that the high part contains
+       the sign or zero extension required.  It also doesn't suffer the
+       problem of partial register stalls that using rolw does.  */
+    tcg_out_bswap32(s, reg);
+    /* shr $16, dest */
+    tcg_out_modrm(s, 0xc1, (sign ? SHIFT_SAR : SHIFT_SHR), reg);
+    tcg_out8(s, 16);
+}
+
 static inline void tgen_arithi(TCGContext *s, int c, int r0, int32_t val, int cf)
 {
     if (!cf && ((c == ARITH_ADD && val == 1) || (c == ARITH_SUB && val == -1))) {
@@ -745,31 +762,21 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args,
         /* movzwl */
         tcg_out_modrm_offset(s, OPC_MOVZWL, data_reg, r0, GUEST_BASE);
         if (bswap) {
-            /* rolw $8, data_reg */
-            tcg_out8(s, 0x66);
-            tcg_out_modrm(s, 0xc1, 0, data_reg);
-            tcg_out8(s, 8);
+            tcg_out_bswap16(s, data_reg, 0);
         }
         break;
     case 1 | 4:
         /* movswl */
         tcg_out_modrm_offset(s, OPC_MOVSWL, data_reg, r0, GUEST_BASE);
         if (bswap) {
-            /* rolw $8, data_reg */
-            tcg_out8(s, 0x66);
-            tcg_out_modrm(s, 0xc1, 0, data_reg);
-            tcg_out8(s, 8);
-
-            /* movswl data_reg, data_reg */
-            tcg_out_modrm(s, OPC_MOVSWL, data_reg, data_reg);
+            tcg_out_bswap16(s, data_reg, 1);
         }
         break;
     case 2:
         /* movl (r0), data_reg */
         tcg_out_modrm_offset(s, 0x8b, data_reg, r0, GUEST_BASE);
         if (bswap) {
-            /* bswap */
-            tcg_out_opc(s, (0xc8 + data_reg) | P_EXT);
+            tcg_out_bswap32(s, data_reg);
         }
         break;
     case 3:
@@ -786,11 +793,10 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args,
             tcg_out_modrm_offset(s, 0x8b, data_reg2, r0, GUEST_BASE + 4);
         } else {
             tcg_out_modrm_offset(s, 0x8b, data_reg, r0, GUEST_BASE + 4);
-            tcg_out_opc(s, (0xc8 + data_reg) | P_EXT);
+            tcg_out_bswap32(s, data_reg);
 
             tcg_out_modrm_offset(s, 0x8b, data_reg2, r0, GUEST_BASE);
-            /* bswap */
-            tcg_out_opc(s, (0xc8 + data_reg2) | P_EXT);
+            tcg_out_bswap32(s, data_reg2);
         }
         break;
     default:
@@ -982,8 +988,7 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args,
     case 2:
         if (bswap) {
             tcg_out_mov(s, r1, data_reg);
-            /* bswap data_reg */
-            tcg_out_opc(s, (0xc8 + r1) | P_EXT);
+            tcg_out_bswap32(s, r1);
             data_reg = r1;
         }
         /* movl */
@@ -992,12 +997,10 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args,
     case 3:
         if (bswap) {
             tcg_out_mov(s, r1, data_reg2);
-            /* bswap data_reg */
-            tcg_out_opc(s, (0xc8 + r1) | P_EXT);
+            tcg_out_bswap32(s, r1);
             tcg_out_modrm_offset(s, 0x89, r1, r0, GUEST_BASE);
             tcg_out_mov(s, r1, data_reg);
-            /* bswap data_reg */
-            tcg_out_opc(s, (0xc8 + r1) | P_EXT);
+            tcg_out_bswap32(s, r1);
             tcg_out_modrm_offset(s, 0x89, r1, r0, GUEST_BASE + 4);
         } else {
             tcg_out_modrm_offset(s, 0x89, data_reg, r0, GUEST_BASE);
@@ -1195,12 +1198,10 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
         break;
 
     case INDEX_op_bswap16_i32:
-        tcg_out8(s, 0x66);
-        tcg_out_modrm(s, 0xc1, SHIFT_ROL, args[0]);
-        tcg_out8(s, 8);
+        tcg_out_bswap16(s, args[0], 0);
         break;
     case INDEX_op_bswap32_i32:
-        tcg_out_opc(s, (0xc8 + args[0]) | P_EXT);
+        tcg_out_bswap32(s, args[0]);
         break;
 
     case INDEX_op_neg_i32:
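[Editorial note: a quick sanity check of what the refactored helpers emit. This is a
standalone sketch, not code from the patch; emit_bswap32 and emit_bswap16_zext are
hypothetical stand-ins for tcg_out_bswap32/tcg_out_bswap16, writing into a plain
buffer and assuming 32-bit register numbers 0-7. For %eax the zero-extending 16-bit
swap should come out as the bytes 0f c8 c1 e8 10, i.e. bswap %eax; shr $16, %eax.]

#include <stdint.h>
#include <stdio.h>

static int emit_bswap32(uint8_t *buf, int reg)
{
    buf[0] = 0x0f;             /* P_EXT: two-byte opcode escape */
    buf[1] = 0xc8 + reg;       /* bswap %reg */
    return 2;
}

static int emit_bswap16_zext(uint8_t *buf, int reg)
{
    int n = emit_bswap32(buf, reg);
    buf[n++] = 0xc1;           /* shift group, 8-bit immediate */
    buf[n++] = 0xe8 | reg;     /* ModRM: mod=11, /5 = shr, r/m = reg */
    buf[n++] = 0x10;           /* immediate $16 */
    return n;
}

int main(void)
{
    uint8_t buf[8];
    int i, n = emit_bswap16_zext(buf, 0);   /* reg 0 = %eax */

    for (i = 0; i < n; i++)
        printf("%02x ", buf[i]);            /* expect: 0f c8 c1 e8 10 */
    printf("\n");
    return 0;
}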
Define OPC_BSWAP. Factor opcode emission to separate functions.
Use bswap+shift to implement 16-bit swap instead of a rolw; this
gets the proper zero-extension required by INDEX_op_bswap16_i32.

Signed-off-by: Richard Henderson <rth@twiddle.net>
---
 tcg/i386/tcg-target.c |   53 +++++++++++++++++++++++++------------------------
 1 files changed, 27 insertions(+), 26 deletions(-)