Message ID | 1442621006-4231-1-git-send-email-gang.chen.5i5j@gmail.com |
---|---|
State | New |
Headers | show |
On 09/18/2015 05:03 PM, gang.chen.5i5j@gmail.com wrote: > +uint64_t helper_v1add(uint64_t a, uint64_t b) > +{ > + uint64_t r = 0; > + int i; > + > + for (i = 0; i < 64; i += 8) { > + int64_t ae = (int8_t)(a >> i); > + int64_t be = (int8_t)(b >> i); > + r |= ((ae + be) & 0xff) << i; > + } > + return r; > +} > + > +uint64_t helper_v2add(uint64_t a, uint64_t b) > +{ > + uint64_t r = 0; > + int i; > + > + for (i = 0; i < 64; i += 16) { > + int64_t ae = (int16_t)(a >> i); > + int64_t be = (int16_t)(b >> i); > + r |= ((ae + be) & 0xffff) << i; > + } > + return r; > +} There's a trick for this that's more efficient for 4 or more elements per vector (i.e. good for v2 and v1, but not v4): a + b = (a & 0x7f7f7f7f) + (b & 0x7f7f7f7f)) ^ ((a ^ b) & 0x80808080) a - b = (a | 0x80808080) - (b & 0x7f7f7f7f)) ^ ((a ^ ~b) & 0x80808080) > +uint64_t helper_v4add(uint64_t a, uint64_t b) > +{ > + uint64_t r = 0; > + int i; > + > + for (i = 0; i < 64; i += 32) { > + int64_t ae = (int32_t)(a >> i); > + int64_t be = (int32_t)(b >> i); > + r |= ((ae + be) & 0xffffffff) << i; > + } > + return r; > +} I should have mentioned this in the previous patch... I think probably it would be best to open-code all, or most of, the v4 operations. Something like static void gen_v4op(TCGv d64, TCGv a64, TCGv b64, void (*generate)(TCGv_i32, TCGv_i32, TCGv_i32)) { TCGv_i32 al = tcg_temp_new_i32(); TCGv_i32 ah = tcg_temp_new_i32(); TCGv_i32 bl = tcg_temp_new_i32(); TCGv_i32 bh = tcg_temp_new_i32(); tcg_gen_extr_i64_i32(al, ah, a64); tcg_gen_extr_i64_i32(bl, bh, b64); generate(al, al, bl); generate(ah, ah, bh); tcg_gen_concat_i32_i64(d64, al, ah); tcg_temp_free_i32(al); tcg_temp_free_i32(ah); tcg_temp_free_i32(bl); tcg_temp_free_i32(bh); } > case OE_RRR(V4ADD, 0, X0): > case OE_RRR(V4ADD, 0, X1): > - return TILEGX_EXCP_OPCODE_UNIMPLEMENTED; > + gen_helper_v4add(tdest, tsrca, tsrcb); And then gen_v4op(tdest, tsrca, tsrcb, tcg_gen_add_i32); r~
On 9/19/15 10:34, Richard Henderson wrote: > On 09/18/2015 05:03 PM, gang.chen.5i5j@gmail.com wrote: >> +uint64_t helper_v1add(uint64_t a, uint64_t b) >> +{ >> + uint64_t r = 0; >> + int i; >> + >> + for (i = 0; i < 64; i += 8) { >> + int64_t ae = (int8_t)(a >> i); >> + int64_t be = (int8_t)(b >> i); >> + r |= ((ae + be) & 0xff) << i; >> + } >> + return r; >> +} >> + >> +uint64_t helper_v2add(uint64_t a, uint64_t b) >> +{ >> + uint64_t r = 0; >> + int i; >> + >> + for (i = 0; i < 64; i += 16) { >> + int64_t ae = (int16_t)(a >> i); >> + int64_t be = (int16_t)(b >> i); >> + r |= ((ae + be) & 0xffff) << i; >> + } >> + return r; >> +} > > There's a trick for this that's more efficient for 4 or more elements per vector (i.e. good for v2 and v1, but not v4): > > a + b = (a & 0x7f7f7f7f) + (b & 0x7f7f7f7f)) ^ ((a ^ b) & 0x80808080) > > a - b = (a | 0x80808080) - (b & 0x7f7f7f7f)) ^ ((a ^ ~b) & 0x80808080) > OK, thanks, for me, it is a good idea. :-) >> +uint64_t helper_v4add(uint64_t a, uint64_t b) >> +{ >> + uint64_t r = 0; >> + int i; >> + >> + for (i = 0; i < 64; i += 32) { >> + int64_t ae = (int32_t)(a >> i); >> + int64_t be = (int32_t)(b >> i); >> + r |= ((ae + be) & 0xffffffff) << i; >> + } >> + return r; >> +} > > I should have mentioned this in the previous patch... > mm... maybe, but at least, I forgot. > I think probably it would be best to open-code all, or most of, the v4 operations. 
Something like > > static void gen_v4op(TCGv d64, TCGv a64, TCGv b64, > void (*generate)(TCGv_i32, TCGv_i32, TCGv_i32)) > { > TCGv_i32 al = tcg_temp_new_i32(); > TCGv_i32 ah = tcg_temp_new_i32(); > TCGv_i32 bl = tcg_temp_new_i32(); > TCGv_i32 bh = tcg_temp_new_i32(); > > tcg_gen_extr_i64_i32(al, ah, a64); > tcg_gen_extr_i64_i32(bl, bh, b64); > generate(al, al, bl); > generate(ah, ah, bh); > tcg_gen_concat_i32_i64(d64, al, ah); > > tcg_temp_free_i32(al); > tcg_temp_free_i32(ah); > tcg_temp_free_i32(bl); > tcg_temp_free_i32(bh); > } > >> case OE_RRR(V4ADD, 0, X0): >> case OE_RRR(V4ADD, 0, X1): >> - return TILEGX_EXCP_OPCODE_UNIMPLEMENTED; >> + gen_helper_v4add(tdest, tsrca, tsrcb); > > And then > > gen_v4op(tdest, tsrca, tsrcb, tcg_gen_add_i32); > OK, thanks. At least for me, what you said sounds reasonable. Thanks.
On 2015年09月19日 10:34, Richard Henderson wrote: > > There's a trick for this that's more efficient for 4 or more elements > per vector (i.e. good for v2 and v1, but not v4): > > a + b = (a & 0x7f7f7f7f) + (b & 0x7f7f7f7f)) ^ ((a ^ b) & 0x80808080) > > a - b = (a | 0x80808080) - (b & 0x7f7f7f7f)) ^ ((a ^ ~b) & 0x80808080) > For me, we need use "(a ^ b) & 0x80..." instead of "(a ^ ~b) & 0x80...". Thanks.
On 09/21/2015 10:54 PM, Chen Gang wrote: > On 2015年09月19日 10:34, Richard Henderson wrote: >> >> There's a trick for this that's more efficient for 4 or more elements >> per vector (i.e. good for v2 and v1, but not v4): >> >> a + b = (a & 0x7f7f7f7f) + (b & 0x7f7f7f7f)) ^ ((a ^ b) & 0x80808080) >> >> a - b = (a | 0x80808080) - (b & 0x7f7f7f7f)) ^ ((a ^ ~b) & 0x80808080) >> > > For me, we need use "(a ^ b) & 0x80..." instead of "(a ^ ~b) & 0x80...". No. What you did wrong was not use (a | 0x80808080). r~
On 9/22/15 22:45, Richard Henderson wrote: > On 09/21/2015 10:54 PM, Chen Gang wrote: >> On 2015年09月19日 10:34, Richard Henderson wrote: >>> >>> There's a trick for this that's more efficient for 4 or more elements >>> per vector (i.e. good for v2 and v1, but not v4): >>> >>> a + b = (a & 0x7f7f7f7f) + (b & 0x7f7f7f7f)) ^ ((a ^ b) & 0x80808080) >>> >>> a - b = (a | 0x80808080) - (b & 0x7f7f7f7f)) ^ ((a ^ ~b) & 0x80808080) >>> >> >> For me, we need use "(a ^ b) & 0x80..." instead of "(a ^ ~b) & 0x80...". > > No. What you did wrong was not use (a | 0x80808080). > Oh, sorry. I shall send patch v3 for it. :-) Thanks.
diff --git a/target-tilegx/helper.h b/target-tilegx/helper.h index 15093973..c366984 100644 --- a/target-tilegx/helper.h +++ b/target-tilegx/helper.h @@ -5,12 +5,20 @@ DEF_HELPER_FLAGS_1(pcnt, TCG_CALL_NO_RWG_SE, i64, i64) DEF_HELPER_FLAGS_1(revbits, TCG_CALL_NO_RWG_SE, i64, i64) DEF_HELPER_FLAGS_3(shufflebytes, TCG_CALL_NO_RWG_SE, i64, i64, i64, i64) +DEF_HELPER_FLAGS_2(v1add, TCG_CALL_NO_RWG_SE, i64, i64, i64) DEF_HELPER_FLAGS_2(v1shl, TCG_CALL_NO_RWG_SE, i64, i64, i64) DEF_HELPER_FLAGS_2(v1shru, TCG_CALL_NO_RWG_SE, i64, i64, i64) DEF_HELPER_FLAGS_2(v1shrs, TCG_CALL_NO_RWG_SE, i64, i64, i64) +DEF_HELPER_FLAGS_2(v1sub, TCG_CALL_NO_RWG_SE, i64, i64, i64) + +DEF_HELPER_FLAGS_2(v2add, TCG_CALL_NO_RWG_SE, i64, i64, i64) DEF_HELPER_FLAGS_2(v2shl, TCG_CALL_NO_RWG_SE, i64, i64, i64) DEF_HELPER_FLAGS_2(v2shru, TCG_CALL_NO_RWG_SE, i64, i64, i64) DEF_HELPER_FLAGS_2(v2shrs, TCG_CALL_NO_RWG_SE, i64, i64, i64) +DEF_HELPER_FLAGS_2(v2sub, TCG_CALL_NO_RWG_SE, i64, i64, i64) + +DEF_HELPER_FLAGS_2(v4add, TCG_CALL_NO_RWG_SE, i64, i64, i64) DEF_HELPER_FLAGS_2(v4shl, TCG_CALL_NO_RWG_SE, i64, i64, i64) DEF_HELPER_FLAGS_2(v4shru, TCG_CALL_NO_RWG_SE, i64, i64, i64) DEF_HELPER_FLAGS_2(v4shrs, TCG_CALL_NO_RWG_SE, i64, i64, i64) +DEF_HELPER_FLAGS_2(v4sub, TCG_CALL_NO_RWG_SE, i64, i64, i64) diff --git a/target-tilegx/simd_helper.c b/target-tilegx/simd_helper.c index 6546337..ec589fe 100644 --- a/target-tilegx/simd_helper.c +++ b/target-tilegx/simd_helper.c @@ -22,6 +22,83 @@ #include "qemu-common.h" #include "exec/helper-proto.h" +uint64_t helper_v1add(uint64_t a, uint64_t b) +{ + uint64_t r = 0; + int i; + + for (i = 0; i < 64; i += 8) { + int64_t ae = (int8_t)(a >> i); + int64_t be = (int8_t)(b >> i); + r |= ((ae + be) & 0xff) << i; + } + return r; +} + +uint64_t helper_v2add(uint64_t a, uint64_t b) +{ + uint64_t r = 0; + int i; + + for (i = 0; i < 64; i += 16) { + int64_t ae = (int16_t)(a >> i); + int64_t be = (int16_t)(b >> i); + r |= ((ae + be) & 0xffff) << i; + } + return r; +} + 
+uint64_t helper_v4add(uint64_t a, uint64_t b) +{ + uint64_t r = 0; + int i; + + for (i = 0; i < 64; i += 32) { + int64_t ae = (int32_t)(a >> i); + int64_t be = (int32_t)(b >> i); + r |= ((ae + be) & 0xffffffff) << i; + } + return r; +} + +uint64_t helper_v1sub(uint64_t a, uint64_t b) +{ + uint64_t r = 0; + int i; + + for (i = 0; i < 64; i += 8) { + int64_t ae = (int8_t)(a >> i); + int64_t be = (int8_t)(b >> i); + r |= ((ae - be) & 0xff) << i; + } + return r; +} + +uint64_t helper_v2sub(uint64_t a, uint64_t b) +{ + uint64_t r = 0; + int i; + + for (i = 0; i < 64; i += 16) { + int64_t ae = (int16_t)(a >> i); + int64_t be = (int16_t)(b >> i); + r |= ((ae - be) & 0xffff) << i; + } + return r; +} + +uint64_t helper_v4sub(uint64_t a, uint64_t b) +{ + uint64_t r = 0; + int i; + + for (i = 0; i < 64; i += 32) { + int64_t ae = (int32_t)(a >> i); + int64_t be = (int32_t)(b >> i); + r |= ((ae - be) & 0xffffffff) << i; + } + return r; +} uint64_t helper_v1shl(uint64_t a, uint64_t b) { diff --git a/target-tilegx/translate.c b/target-tilegx/translate.c index c8247ac..2246243 100644 --- a/target-tilegx/translate.c +++ b/target-tilegx/translate.c @@ -1024,8 +1024,12 @@ static TileExcp gen_rrr_opcode(DisasContext *dc, unsigned opext, break; case OE_RRR(V1ADDUC, 0, X0): case OE_RRR(V1ADDUC, 0, X1): + return TILEGX_EXCP_OPCODE_UNIMPLEMENTED; case OE_RRR(V1ADD, 0, X0): case OE_RRR(V1ADD, 0, X1): + gen_helper_v1add(tdest, tsrca, tsrcb); + mnemonic = "v1add"; + break; case OE_RRR(V1ADIFFU, 0, X0): case OE_RRR(V1AVGU, 0, X0): return TILEGX_EXCP_OPCODE_UNIMPLEMENTED; @@ -1095,12 +1099,20 @@ static TileExcp gen_rrr_opcode(DisasContext *dc, unsigned opext, break; case OE_RRR(V1SUBUC, 0, X0): case OE_RRR(V1SUBUC, 0, X1): + return TILEGX_EXCP_OPCODE_UNIMPLEMENTED; case OE_RRR(V1SUB, 0, X0): case OE_RRR(V1SUB, 0, X1): + gen_helper_v1sub(tdest, tsrca, tsrcb); + mnemonic = "v1sub"; + break; case OE_RRR(V2ADDSC, 0, X0): case OE_RRR(V2ADDSC, 0, X1): + return TILEGX_EXCP_OPCODE_UNIMPLEMENTED; case 
OE_RRR(V2ADD, 0, X0): case OE_RRR(V2ADD, 0, X1): + gen_helper_v2add(tdest, tsrca, tsrcb); + mnemonic = "v2add"; + break; case OE_RRR(V2ADIFFS, 0, X0): case OE_RRR(V2AVGS, 0, X0): case OE_RRR(V2CMPEQ, 0, X0): @@ -1162,13 +1174,20 @@ static TileExcp gen_rrr_opcode(DisasContext *dc, unsigned opext, break; case OE_RRR(V2SUBSC, 0, X0): case OE_RRR(V2SUBSC, 0, X1): + return TILEGX_EXCP_OPCODE_UNIMPLEMENTED; case OE_RRR(V2SUB, 0, X0): case OE_RRR(V2SUB, 0, X1): + gen_helper_v2sub(tdest, tsrca, tsrcb); + mnemonic = "v2sub"; + break; case OE_RRR(V4ADDSC, 0, X0): case OE_RRR(V4ADDSC, 0, X1): + return TILEGX_EXCP_OPCODE_UNIMPLEMENTED; case OE_RRR(V4ADD, 0, X0): case OE_RRR(V4ADD, 0, X1): - return TILEGX_EXCP_OPCODE_UNIMPLEMENTED; + gen_helper_v4add(tdest, tsrca, tsrcb); + mnemonic = "v4add"; + break; case OE_RRR(V4INT_H, 0, X0): case OE_RRR(V4INT_H, 0, X1): tcg_gen_shri_tl(tdest, tsrcb, 32); @@ -1202,9 +1221,12 @@ static TileExcp gen_rrr_opcode(DisasContext *dc, unsigned opext, break; case OE_RRR(V4SUBSC, 0, X0): case OE_RRR(V4SUBSC, 0, X1): + return TILEGX_EXCP_OPCODE_UNIMPLEMENTED; case OE_RRR(V4SUB, 0, X0): case OE_RRR(V4SUB, 0, X1): - return TILEGX_EXCP_OPCODE_UNIMPLEMENTED; + gen_helper_v4sub(tdest, tsrca, tsrcb); + mnemonic = "v4sub"; + break; case OE_RRR(XOR, 0, X0): case OE_RRR(XOR, 0, X1): case OE_RRR(XOR, 5, Y0):