
[09/10,v11] target-tilegx: Generate tcg instructions to finish "Hello world"

Message ID BLU436-SMTP68C6610B03B00ADDE5679FB9C80@phx.gbl
State New

Commit Message

Chen Gang May 30, 2015, 9:18 p.m. UTC
Generate related tcg instructions, and qemu tilegx can finish running
"Hello world". The elf64 binary can be static or shared.

Signed-off-by: Chen Gang <gang.chen.5i5j@gmail.com>
---
 target-tilegx/translate.c | 2787 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 2787 insertions(+)
 create mode 100644 target-tilegx/translate.c

Comments

Richard Henderson June 1, 2015, 6:40 p.m. UTC | #1
First, what happened to the decoding skeleton patch?  You seem to have merged
it with patch 9 here.  That said, see the bottom of this message.

On 05/30/2015 02:18 PM, Chen Gang wrote:
> +/* mfspr can be only in X1 pipe, so it doesn't need to be bufferd */
> +static void gen_mfspr(struct DisasContext *dc, uint8_t rdst, uint16_t imm14)

I'm not keen on this as a comment.  Clearly it could be buffered, with what is
implemented here now.  But there are plenty of SPRs which produce side effects,
and *cannot* be buffered.

Adjust the comment to

/* Many SPR reads have side effects and cannot be buffered.  However, they are
   all in the X1 pipe, which we are executing last, therefore we need not do
   additional buffering.  */

> +/* mtspr can be only in X1 pipe, so it doesn't need to be bufferd */

Same, but s/reads/writes/.

> +#if 1

Do not include this.

> +/*
> + * uint64_t output = 0;
> + * uint32_t counter;
> + * for (counter = 0; counter < (WORD_SIZE / BYTE_SIZE); counter++)
> + * {
> + *     int8_t srca = getByte (rf[SrcA], counter);
> + *     int8_t srcb = signExtend8 (Imm8);
> + *     output = setByte (output, counter, ((srca == srcb) ? 1 : 0));
> + * }
> + * rf[Dest] = output;
> + */
> +static void gen_v1cmpeqi(struct DisasContext *dc,
> +                         uint8_t rdst, uint8_t rsrc, int8_t imm8)

Pass in the condition to use, since you'll eventually need to implement
v1cmpltsi, v1cmpltui.
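
Something along these lines (untested sketch, reusing your extract_v1 and
insert_v1 helpers; callers would pass TCG_COND_EQ, TCG_COND_LT, TCG_COND_LTU):

static void gen_v1cmpi(struct DisasContext *dc,
                       uint8_t rdst, uint8_t rsrc, int8_t imm8, TCGCond cond)
{
    int count;
    TCGv vdst = dest_gr(dc, rdst);
    TCGv vsrc = load_gr(dc, rsrc);
    TCGv tmp = tcg_temp_new_i64();

    tcg_gen_movi_i64(vdst, 0);
    for (count = 0; count < 8; count++) {
        /* Note: the signed conditions will want a sign-extending extract.  */
        extract_v1(tmp, vsrc, count);
        tcg_gen_setcondi_i64(cond, tmp, tmp, imm8);
        insert_v1(vdst, tmp, count);
    }
    tcg_temp_free_i64(tmp);
}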

> +static void gen_v1cmpeq(struct DisasContext *dc,
> +                        uint8_t rdst, uint8_t rsrc, uint8_t rsrcb)

Likewise for v1cmples, v1cmpleu, v1cmplts, v1cmpltu, v1cmpne.

> +    tcg_gen_movi_i64(vdst, 0); /* or Assertion `ts->val_type == TEMP_VAL_REG' */

These comments are unnecessary.  Of course it's illegal to use an uninitialized
temporary.

> +static void gen_v4int_l(struct DisasContext *dc,
> +                        uint8_t rdst, uint8_t rsrc, uint8_t rsrcb)
> +{
> +    qemu_log_mask(CPU_LOG_TB_IN_ASM, "v4int_l r%d, r%d, r%d\n",
> +                  rdst, rsrc, rsrcb);
> +    tcg_gen_deposit_i64(dest_gr(dc, rdst), load_gr(dc, rsrc),
> +                        load_gr(dc, rsrcb), 0, 32);

This is incorrect.  This produces { A1, B0 }, not { A0, B0 }.

As I said, you did want "32, 32" as the field insert, but you have the source
operands in the wrong order.
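
I.e., something like (untested):

  tcg_gen_deposit_i64(dest_gr(dc, rdst), load_gr(dc, rsrcb),
                      load_gr(dc, rsrc), 32, 32);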

> +static void gen_addx(struct DisasContext *dc,
> +                     uint8_t rdst, uint8_t rsrc, uint8_t rsrcb)
> +{
> +    TCGv vdst = dest_gr(dc, rdst);
> +
> +    /* High bits have no effect with low bits, so addx and addxsc are merged. */
> +    qemu_log_mask(CPU_LOG_TB_IN_ASM, "addx(sc) r%d, r%d, r%d\n",
> +                  rdst, rsrc, rsrcb);

Um, no, addxsc does signed saturation before sign extension.
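
One possible (untested) shape for the saturating variant, without a helper:

static void gen_addxsc(struct DisasContext *dc,
                       uint8_t rdst, uint8_t rsrc, uint8_t rsrcb)
{
    TCGv vdst = dest_gr(dc, rdst);
    TCGv t0 = tcg_temp_new_i64();
    TCGv t1 = tcg_temp_new_i64();
    TCGv max = tcg_const_i64(0x7fffffff);
    TCGv min = tcg_const_i64(-0x80000000LL);

    /* The sum of two sign-extended 32-bit values cannot overflow 64 bits,
       so the saturation is just a clamp to the int32 range.  */
    tcg_gen_ext32s_i64(t0, load_gr(dc, rsrc));
    tcg_gen_ext32s_i64(t1, load_gr(dc, rsrcb));
    tcg_gen_add_i64(vdst, t0, t1);
    tcg_gen_movcond_i64(TCG_COND_GT, vdst, vdst, max, max, vdst);
    tcg_gen_movcond_i64(TCG_COND_LT, vdst, vdst, min, min, vdst);

    tcg_temp_free_i64(min);
    tcg_temp_free_i64(max);
    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t0);
}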

> +static void gen_mul_u_u(struct DisasContext *dc,
> +                        uint8_t rdst, uint8_t rsrc, uint8_t rsrcb,
> +                        int8 high, int8 highb, int8 add, const char *code)

A better name for this function is warranted, since it does much more than
mul_u_u.  The add parameter should be type bool.

Given the existence of mul_hs_hs, mul_hu_ls, etc, you're probably better off
passing in extraction functions.  E.g.

static void ext32s_high(TCGv d, TCGv s)
{
    tcg_gen_sari_i64(d, s, 32);
}

static void ext32u_high(TCGv d, TCGv s)
{
    tcg_gen_shri_i64(d, s, 32);
}

  gen_mul(dc, rdst, rsrc, rsrcb, ext32s_high, ext32s_high,
          false, "mul_hs_hs");
  gen_mul(dc, rdst, rsrc, rsrcb, ext32s_high, ext32u_high,
          false, "mul_hs_hu");
  gen_mul(dc, rdst, rsrc, rsrcb, ext32s_high, tcg_gen_ext32s_i64,
          false, "mul_hs_ls");
  gen_mul(dc, rdst, rsrc, rsrcb, ext32s_high, tcg_gen_ext32u_i64,
          false, "mul_hs_lu");

  gen_mul(dc, rdst, rsrc, rsrcb, ext32u_high, ext32u_high,
          false, "mul_hu_hu");
  gen_mul(dc, rdst, rsrc, rsrcb, ext32u_high, tcg_gen_ext32s_i64,
          false, "mul_hu_ls");
  gen_mul(dc, rdst, rsrc, rsrcb, ext32u_high, tcg_gen_ext32u_i64,
          false, "mul_hu_lu");

  gen_mul(dc, rdst, rsrc, rsrcb, tcg_gen_ext32s_i64, tcg_gen_ext32s_i64,
          false, "mul_ls_ls");
  gen_mul(dc, rdst, rsrc, rsrcb, tcg_gen_ext32s_i64, tcg_gen_ext32u_i64,
          false, "mul_ls_lu");

  gen_mul(dc, rdst, rsrc, rsrcb, tcg_gen_ext32u_i64, tcg_gen_ext32u_i64,
          false, "mul_lu_lu");


and of course the same for the mula insns with true instead of false for the
"add" parameter.

> +static void gen_shladd(struct DisasContext *dc,
> +                       uint8_t rdst, uint8_t rsrc, uint8_t rsrcb,
> +                       uint8_t shift, uint8_t cast)

cast should be bool.

> +static void gen_dblalign(struct DisasContext *dc,
> +                         uint8_t rdst, uint8_t rsrc, uint8_t rsrcb)
> +{
> +    TCGv vdst = dest_gr(dc, rdst);
> +    TCGv mask = tcg_temp_new_i64();
> +    TCGv tmp = tcg_temp_new_i64();
> +
> +    qemu_log_mask(CPU_LOG_TB_IN_ASM, "dblalign r%d, r%d, r%d\n",
> +                  rdst, rsrc, rsrcb);
> +
> +    tcg_gen_andi_i64(mask, load_gr(dc, rsrcb), 7);
> +    tcg_gen_muli_i64(mask, mask, 8);

tcg_gen_shli_i64(mask, mask, 3);

> +    tcg_gen_shr_i64(vdst, load_gr(dc, rdst), mask);
> +
> +    tcg_gen_movi_i64(tmp, 64);
> +    tcg_gen_sub_i64(mask, tmp, mask);
> +    tcg_gen_shl_i64(mask, load_gr(dc, rsrc), mask);
> +
> +    tcg_gen_or_i64(vdst, vdst, mask);

Does not produce the correct results for mask == 0.

What you want is when mask == 0, you shift A by 64 bits, i.e. produce a zero.
But you can't do that in TCG (or C for that matter).  Best is to do two shifts:

  tcg_gen_xori_i64(mask, mask, 63); /* compute 1's complement of the shift */
  tcg_gen_shl_i64(mask, load_gr(dc, rsrc), mask);
  tcg_gen_shli_i64(mask, mask, 1); /* one more to produce 2's complement */
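
(For a byte offset of 0 the xori yields 63 and the extra shli supplies the
64th bit, so the A contribution becomes zero as required; for an offset of
k bytes the two shifts still total 64 - 8*k.)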

> +static void gen_ld_add(struct DisasContext *dc,
> +                       uint8_t rdst, uint8_t rsrc, int8_t imm8,
> +                       TCGMemOp ops, const char *code)
> +{
> +    qemu_log_mask(CPU_LOG_TB_IN_ASM, "%s r%d, r%d, %d\n",
> +                  code, rdst, rsrc, imm8);
> +
> +    tcg_gen_qemu_ld_i64(dest_gr(dc, rdst), load_gr(dc, rsrc),
> +                        MMU_USER_IDX, ops);
> +    /*
> +     * Each pipe only have one temp val which is already used, and it is only
> +     * for pipe X1, so can use real register
> +     */
> +    if (rsrc < TILEGX_R_COUNT) {
> +        tcg_gen_addi_i64(cpu_regs[rsrc], load_gr(dc, rsrc), imm8);
> +    }
> +}

This is a poor comment.  Clearly each pipe can have two outputs, so this
limitation is simply of your own design.

Further, the < TILEGX_R_COUNT restriction is also incorrect.  True, you don't
actually implement the top 7 special registers, but that doesn't matter, you
should still be incrementing them.

> +
> +    return;

Do not add bare return statements at the ends of functions.

> +static int gen_blb(struct DisasContext *dc, uint8_t rsrc, int32_t off,
> +                   TCGCond cond, const char *code)

Unused return value.  What were you intending?

> +static void decode_rrr_1_opcode_y0(struct DisasContext *dc,
> +                                   tilegx_bundle_bits bundle)
> +{
> +    uint8_t rsrc = get_SrcA_Y0(bundle);
> +    uint8_t rsrcb = get_SrcB_Y0(bundle);
> +    uint8_t rdst = get_Dest_Y0(bundle);
> +
> +    switch (get_RRROpcodeExtension_Y0(bundle)) {
> +    case UNARY_RRR_1_OPCODE_Y0:
> +        switch (get_UnaryOpcodeExtension_Y0(bundle)) {
> +        case CNTLZ_UNARY_OPCODE_Y0:
> +            gen_cntlz(dc, rdst, rsrc);
> +            return;
> +        case CNTTZ_UNARY_OPCODE_Y0:
> +            gen_cnttz(dc, rdst, rsrc);
> +            return;
> +        case NOP_UNARY_OPCODE_Y0:
> +        case  FNOP_UNARY_OPCODE_Y0:
> +            if (!rsrc && !rdst) {
> +                qemu_log_mask(CPU_LOG_TB_IN_ASM, "(f)nop\n");
> +                return;
> +            }
> +            break;
> +        case FSINGLE_PACK1_UNARY_OPCODE_Y0:
> +        case PCNT_UNARY_OPCODE_Y0:
> +        case REVBITS_UNARY_OPCODE_Y0:
> +        case REVBYTES_UNARY_OPCODE_Y0:
> +        case TBLIDXB0_UNARY_OPCODE_Y0:
> +        case TBLIDXB1_UNARY_OPCODE_Y0:
> +        case TBLIDXB2_UNARY_OPCODE_Y0:
> +        case TBLIDXB3_UNARY_OPCODE_Y0:
> +        default:
> +            break;
> +        }
> +        break;
> +    case SHL1ADD_RRR_1_OPCODE_Y0:
> +        gen_shladd(dc, rdst, rsrc, rsrcb, 1, 0);
> +        return;
> +    case SHL2ADD_RRR_1_OPCODE_Y0:
> +        gen_shladd(dc, rdst, rsrc, rsrcb, 2, 0);
> +        return;
> +    case SHL3ADD_RRR_1_OPCODE_Y0:
> +        gen_shladd(dc, rdst, rsrc, rsrcb, 3, 0);
> +        return;
> +    default:
> +        break;
> +    }
> +
> +    qemu_log_mask(LOG_UNIMP, "UNIMP rrr_1_opcode_y0, [" FMT64X "]\n", bundle);
> +}
> +

I can't help thinking, as I read all of these decode functions, that it would
be better if the output disassembly, i.e. qemu_log_mask(CPU_LOG_TB_IN_ASM, *),
were to happen here, instead of being spread across 99 other functions.

This has a side effect of reducing many of your functions to a single
statement, invoking another tcg generator, at which point it's worth inlining them.

For example:

static void decode_rrr_1_unary_y0(struct DisasContext *dc,
                                  tilegx_bundle_bits bundle,
                                  uint8_t rdst, uint8_t rsrc)
{
    unsigned ext = get_UnaryOpcodeExtension_Y0(bundle);
    const char *mnemonic;
    TCGv vdst, vsrc;

    if (ext == NOP_UNARY_OPCODE_Y0 || ext == FNOP_UNARY_OPCODE_Y0) {
        if (rsrc != 0 || rdst != 0) {
            goto unimplemented;
        }
        qemu_log_mask(CPU_LOG_TB_IN_ASM, "(f)nop\n");
        return;
    }

    vdst = dest_gr(dc, rdst);
    vsrc = load_gr(dc, rsrc);

    switch (ext) {
    case CNTLZ_UNARY_OPCODE_Y0:
        gen_helper_cntlz(vdst, vsrc);
        mnemonic = "cntlz";
        break;
    case CNTTZ_UNARY_OPCODE_Y0:
        gen_helper_cnttz(vdst, vsrc);
        mnemonic = "cnttz";
        break;
    case FSINGLE_PACK1_UNARY_OPCODE_Y0:
    case PCNT_UNARY_OPCODE_Y0:
    case REVBITS_UNARY_OPCODE_Y0:
    case REVBYTES_UNARY_OPCODE_Y0:
    case TBLIDXB0_UNARY_OPCODE_Y0:
    case TBLIDXB1_UNARY_OPCODE_Y0:
    case TBLIDXB2_UNARY_OPCODE_Y0:
    case TBLIDXB3_UNARY_OPCODE_Y0:
    default:
    unimplemented:
        qemu_log_mask(LOG_UNIMP, "UNIMP rrr_1_unary_y0, [" FMT64X "]\n",
                      bundle);
        dc->exception = TILEGX_EXCP_OPCODE_UNIMPLEMENTED;
        return;
    }

    qemu_log_mask(CPU_LOG_TB_IN_ASM, "%s r%d,r%d\n",
                  mnemonic, rdst, rsrc);
}

static void decode_rrr_1_opcode_y0(struct DisasContext *dc,
                                   tilegx_bundle_bits bundle)
{
    unsigned ext = get_RRROpcodeExtension_Y0(bundle);
    uint8_t rsrca = get_SrcA_Y0(bundle);
    uint8_t rsrcb = get_SrcB_Y0(bundle);
    uint8_t rdst = get_Dest_Y0(bundle);
    const char *mnemonic;
    TCGv vdst, vsrca, vsrcb;

    if (ext == UNARY_RRR_1_OPCODE_Y0) {
        decode_rrr_1_unary_y0(dc, bundle, rdst, rsrca);
        return;
    }

    vdst = dest_gr(dc, rdst);
    vsrca = load_gr(dc, rsrca);
    vsrcb = load_gr(dc, rsrcb);

    switch (ext) {
    case SHL1ADD_RRR_1_OPCODE_Y0:
        gen_shladd(vdst, vsrca, vsrcb, 1, 0);
        mnemonic = "shl1add";
        break;
    case SHL2ADD_RRR_1_OPCODE_Y0:
        gen_shladd(vdst, vsrca, vsrcb, 2, 0);
        mnemonic = "shl2add";
        break;
    case SHL3ADD_RRR_1_OPCODE_Y0:
        gen_shladd(vdst, vsrca, vsrcb, 3, 0);
        mnemonic = "shl3add";
        break;
    default:
        qemu_log_mask(LOG_UNIMP, "UNIMP rrr_1_opcode_y0, [" FMT64X "]\n",
                      bundle);
        dc->exception = TILEGX_EXCP_OPCODE_UNIMPLEMENTED;
        return;
    }
    qemu_log_mask(CPU_LOG_TB_IN_ASM, "%s r%d,r%d,r%d\n",
                  mnemonic, rdst, rsrca, rsrcb);
}


r~
Chen Gang June 1, 2015, 8:54 p.m. UTC | #2
Firstly, thank you very much for your valuable work and quick response.

On 6/2/15 02:40, Richard Henderson wrote:
> First, what happened to the decoding skeleton patch?  You seem to have merged
> it with patch 9 here.  That said, see the bottom of this message.
> 

Yes, I merged them together. For me, that makes them easier to read and
discuss (they are in one C file of under 3K lines, related to each other,
and sent together).


> On 05/30/2015 02:18 PM, Chen Gang wrote:
>> +/* mfspr can be only in X1 pipe, so it doesn't need to be bufferd */
>> +static void gen_mfspr(struct DisasContext *dc, uint8_t rdst, uint16_t imm14)
> 
> I'm not keen on this as a comment.  Clearly it could be buffered, with what is
> implemented here now.  But there are plenty of SPRs which produce side
> effects, and *cannot* be buffered.
> 
> Adjust the comment to
> 
> /* Many SPR reads have side effects and cannot be buffered.  However, they are
>    all in the X1 pipe, which we are executing last, therefore we need not do
>    additional buffering.  */
> 
>> +/* mtspr can be only in X1 pipe, so it doesn't need to be bufferd */
> 
> Same, but s/reads/writes/.
>

OK, thanks.
 
>> +#if 1
> 
> Do not include this.
> 

OK, thanks.

>> +/*
>> + * uint64_t output = 0;
>> + * uint32_t counter;
>> + * for (counter = 0; counter < (WORD_SIZE / BYTE_SIZE); counter++)
>> + * {
>> + *     int8_t srca = getByte (rf[SrcA], counter);
>> + *     int8_t srcb = signExtend8 (Imm8);
>> + *     output = setByte (output, counter, ((srca == srcb) ? 1 : 0));
>> + * }
>> + * rf[Dest] = output;
>> + */
>> +static void gen_v1cmpeqi(struct DisasContext *dc,
>> +                         uint8_t rdst, uint8_t rsrc, int8_t imm8)
> 
> Pass in the condition to use, since you'll eventually need to implement
> v1cmpltsi, v1cmpltui.
> 
>> +static void gen_v1cmpeq(struct DisasContext *dc,
>> +                        uint8_t rdst, uint8_t rsrc, uint8_t rsrcb)
> 
> Likewise for v1cmples, v1cmpleu, v1cmplts, v1cmpltu, v1cmpne.
> 

OK, thanks.

>> +    tcg_gen_movi_i64(vdst, 0); /* or Assertion `ts->val_type == TEMP_VAL_REG' */
> 
> These comments are unnecessary.  Of course it's illegal to use an uninitialized
> temporary.
> 

OK, thanks.

>> +static void gen_v4int_l(struct DisasContext *dc,
>> +                        uint8_t rdst, uint8_t rsrc, uint8_t rsrcb)
>> +{
>> +    qemu_log_mask(CPU_LOG_TB_IN_ASM, "v4int_l r%d, r%d, r%d\n",
>> +                  rdst, rsrc, rsrcb);
>> +    tcg_gen_deposit_i64(dest_gr(dc, rdst), load_gr(dc, rsrc),
>> +                        load_gr(dc, rsrcb), 0, 32);
> 
> This is incorrect.  This produces { A1, B0 }, not { A0, B0 }.
> 
> As I said, you did want "32, 32" as the field insert, but you have the source
> operands in the wrong order.
> 

OK, thank you very much.

>> +static void gen_addx(struct DisasContext *dc,
>> +                     uint8_t rdst, uint8_t rsrc, uint8_t rsrcb)
>> +{
>> +    TCGv vdst = dest_gr(dc, rdst);
>> +
>> +    /* High bits have no effect with low bits, so addx and addxsc are merged. */
>> +    qemu_log_mask(CPU_LOG_TB_IN_ASM, "addx(sc) r%d, r%d, r%d\n",
>> +                  rdst, rsrc, rsrcb);
> 
> Um, no, addxsc does signed saturation before sign extension.
> 

OK, thank you very much. I shall fix it with reference to the arm
add_saturate helper function.

>> +static void gen_mul_u_u(struct DisasContext *dc,
>> +                        uint8_t rdst, uint8_t rsrc, uint8_t rsrcb,
>> +                        int8 high, int8 highb, int8 add, const char *code)
> 
> A better name for this function is warranted, since it does much more than
> mul_u_u.  The add parameter should be type bool.
> 
> Given the existence of mul_hs_hs, mul_hu_ls, etc, you're probably better off
> passing in extraction functions.  E.g.
> 
> static void ext32s_high(TCGv d, TCGv s)
> {
>     tcg_gen_sari_i64(d, s, 32);
> }
> 
> static void ext32u_high(TCGv d, TCGv s)
> {
>     tcg_gen_shri_i64(d, s, 32);
> }
> 
>   gen_mul(dc, rdst, rsrc, rsrcb, ext32s_high, ext32s_high,
>           false, "mul_hs_hs");
>   gen_mul(dc, rdst, rsrc, rsrcb, ext32s_high, ext32u_high,
>           false, "mul_hs_hu");
>   gen_mul(dc, rdst, rsrc, rsrcb, ext32s_high, tcg_gen_ext32s_i64,
>           false, "mul_hs_ls");
>   gen_mul(dc, rdst, rsrc, rsrcb, ext32s_high, tcg_gen_ext32u_i64,
>           false, "mul_hs_lu");
> 
>   gen_mul(dc, rdst, rsrc, rsrcb, ext32u_high, ext32u_high,
>           false, "mul_hu_hu");
>   gen_mul(dc, rdst, rsrc, rsrcb, ext32u_high, tcg_gen_ext32s_i64,
>           false, "mul_hu_ls");
>   gen_mul(dc, rdst, rsrc, rsrcb, ext32u_high, tcg_gen_ext32u_i64,
>           false, "mul_hu_lu");
> 
>   gen_mul(dc, rdst, rsrc, rsrcb, tcg_gen_ext32s_i64, tcg_gen_ext32s_i64,
>           false, "mul_ls_ls");
>   gen_mul(dc, rdst, rsrc, rsrcb, tcg_gen_ext32s_i64, tcg_gen_ext32u_i64,
>           false, "mul_ls_lu");
> 
>   gen_mul(dc, rdst, rsrc, rsrcb, tcg_gen_ext32u_i64, tcg_gen_ext32u_i64,
>           false, "mul_lu_lu");
> 
> 
> and of course the same for the mula insns with true instead of false for the
> "add" parameter.
> 

OK, thanks.

>> +static void gen_shladd(struct DisasContext *dc,
>> +                       uint8_t rdst, uint8_t rsrc, uint8_t rsrcb,
>> +                       uint8_t shift, uint8_t cast)
> 
> cast should be bool.
> 

OK, thanks.

>> +static void gen_dblalign(struct DisasContext *dc,
>> +                         uint8_t rdst, uint8_t rsrc, uint8_t rsrcb)
>> +{
>> +    TCGv vdst = dest_gr(dc, rdst);
>> +    TCGv mask = tcg_temp_new_i64();
>> +    TCGv tmp = tcg_temp_new_i64();
>> +
>> +    qemu_log_mask(CPU_LOG_TB_IN_ASM, "dblalign r%d, r%d, r%d\n",
>> +                  rdst, rsrc, rsrcb);
>> +
>> +    tcg_gen_andi_i64(mask, load_gr(dc, rsrcb), 7);
>> +    tcg_gen_muli_i64(mask, mask, 8);
> 
> tcg_gen_shli_i64(mask, mask, 3);
> 

OK, thanks.

>> +    tcg_gen_shr_i64(vdst, load_gr(dc, rdst), mask);
>> +
>> +    tcg_gen_movi_i64(tmp, 64);
>> +    tcg_gen_sub_i64(mask, tmp, mask);
>> +    tcg_gen_shl_i64(mask, load_gr(dc, rsrc), mask);
>> +
>> +    tcg_gen_or_i64(vdst, vdst, mask);
> 
> Does not produce the correct results for mask == 0.
> 
> What you want is when mask == 0, you shift A by 64 bits, i.e. produce a zero.
> But you can't do that in TCG (or C for that matter).  Best is to do two shifts:
> 

OK, thank you very much.

>   tcg_gen_xori_i64(mask, mask, 63); /* compute 1's complement of the shift */
>   tcg_gen_shl_i64(mask, load_gr(dc, rsrc), mask);
>   tcg_gen_shli_i64(mask, mask, 1); /* one more to produce 2's complement */
> 

OK, thanks.

>> +static void gen_ld_add(struct DisasContext *dc,
>> +                       uint8_t rdst, uint8_t rsrc, int8_t imm8,
>> +                       TCGMemOp ops, const char *code)
>> +{
>> +    qemu_log_mask(CPU_LOG_TB_IN_ASM, "%s r%d, r%d, %d\n",
>> +                  code, rdst, rsrc, imm8);
>> +
>> +    tcg_gen_qemu_ld_i64(dest_gr(dc, rdst), load_gr(dc, rsrc),
>> +                        MMU_USER_IDX, ops);
>> +    /*
>> +     * Each pipe only have one temp val which is already used, and it is only
>> +     * for pipe X1, so can use real register
>> +     */
>> +    if (rsrc < TILEGX_R_COUNT) {
>> +        tcg_gen_addi_i64(cpu_regs[rsrc], load_gr(dc, rsrc), imm8);
>> +    }
>> +}
> 
> This is a poor comment.  Clearly each pipe can have two outputs, so this
> limitation is simply of your own design.
> 

OK, thanks. The comments need to be improved.

> Further, the < TILEGX_R_COUNT restriction is also incorrect.  True, you don't
> actually implement the top 7 special registers, but that doesn't matter, you
> should still be incrementing them.
> 

We did not implement them, so we cannot increment them, either.

They are hidden from the outside; otherwise we would have to define and
implement them.

So for me, the current code is correct.

>> +
>> +    return;
> 
> Do not add bare return statements at the ends of functions.
> 

OK, thanks.

>> +static int gen_blb(struct DisasContext *dc, uint8_t rsrc, int32_t off,
>> +                   TCGCond cond, const char *code)
> 
> Unused return value.  What were you intending?
> 

OK, thanks.

>> +static void decode_rrr_1_opcode_y0(struct DisasContext *dc,
>> +                                   tilegx_bundle_bits bundle)
>> +{
>> +    uint8_t rsrc = get_SrcA_Y0(bundle);
>> +    uint8_t rsrcb = get_SrcB_Y0(bundle);
>> +    uint8_t rdst = get_Dest_Y0(bundle);
>> +
>> +    switch (get_RRROpcodeExtension_Y0(bundle)) {
>> +    case UNARY_RRR_1_OPCODE_Y0:
>> +        switch (get_UnaryOpcodeExtension_Y0(bundle)) {
>> +        case CNTLZ_UNARY_OPCODE_Y0:
>> +            gen_cntlz(dc, rdst, rsrc);
>> +            return;
>> +        case CNTTZ_UNARY_OPCODE_Y0:
>> +            gen_cnttz(dc, rdst, rsrc);
>> +            return;
>> +        case NOP_UNARY_OPCODE_Y0:
>> +        case  FNOP_UNARY_OPCODE_Y0:
>> +            if (!rsrc && !rdst) {
>> +                qemu_log_mask(CPU_LOG_TB_IN_ASM, "(f)nop\n");
>> +                return;
>> +            }
>> +            break;
>> +        case FSINGLE_PACK1_UNARY_OPCODE_Y0:
>> +        case PCNT_UNARY_OPCODE_Y0:
>> +        case REVBITS_UNARY_OPCODE_Y0:
>> +        case REVBYTES_UNARY_OPCODE_Y0:
>> +        case TBLIDXB0_UNARY_OPCODE_Y0:
>> +        case TBLIDXB1_UNARY_OPCODE_Y0:
>> +        case TBLIDXB2_UNARY_OPCODE_Y0:
>> +        case TBLIDXB3_UNARY_OPCODE_Y0:
>> +        default:
>> +            break;
>> +        }
>> +        break;
>> +    case SHL1ADD_RRR_1_OPCODE_Y0:
>> +        gen_shladd(dc, rdst, rsrc, rsrcb, 1, 0);
>> +        return;
>> +    case SHL2ADD_RRR_1_OPCODE_Y0:
>> +        gen_shladd(dc, rdst, rsrc, rsrcb, 2, 0);
>> +        return;
>> +    case SHL3ADD_RRR_1_OPCODE_Y0:
>> +        gen_shladd(dc, rdst, rsrc, rsrcb, 3, 0);
>> +        return;
>> +    default:
>> +        break;
>> +    }
>> +
>> +    qemu_log_mask(LOG_UNIMP, "UNIMP rrr_1_opcode_y0, [" FMT64X "]\n", bundle);
>> +}
>> +
> 
> I can't help thinking, as I read all of these decode functions, that it would
> be better if the output disassembly, i.e. qemu_log_mask(CPU_LOG_TB_IN_ASM, *),
> were to happen here, instead of being spread across 99 other functions.
> 
> This has a side effect of reducing many of your functions to a single
> statement, invoking another tcg generator, at which point it's worth inlining them.
> 

OK, thanks.

> For example:
> 
> static void decode_rrr_1_unary_y0(struct DisasContext *dc,
>                                   tilegx_bundle_bits bundle,
>                                   uint8_t rdst, uint8_t rsrc)
> {
>     unsigned ext = get_UnaryOpcodeExtension_Y0(bundle);
>     const char *mnemonic;
>     TCGv vdst, vsrc;
> 
>     if (ext == NOP_UNARY_OPCODE_Y0 || ext == FNOP_UNARY_OPCODE_Y0) {
>         if (rsrc != 0 || rdst != 0) {
>             goto unimplemented;
>         }
>         qemu_log_mask(CPU_LOG_TB_IN_ASM, "(f)nop\n");
>         return;
>     }
> 
>     vdst = dest_gr(dc, rdst);
>     vsrc = load_gr(dc, rsrc);
> 
>     switch (ext) {
>     case CNTLZ_UNARY_OPCODE_Y0:
>         gen_helper_cntlz(vdst, vsrc);
>         mnemonic = "cntlz";
>         break;
>     case CNTTZ_UNARY_OPCODE_Y0:
>         gen_helper_cnttz(vdst, vsrc);
>         mnemonic = "cnttz";
>         break;
>     case FSINGLE_PACK1_UNARY_OPCODE_Y0:
>     case PCNT_UNARY_OPCODE_Y0:
>     case REVBITS_UNARY_OPCODE_Y0:
>     case REVBYTES_UNARY_OPCODE_Y0:
>     case TBLIDXB0_UNARY_OPCODE_Y0:
>     case TBLIDXB1_UNARY_OPCODE_Y0:
>     case TBLIDXB2_UNARY_OPCODE_Y0:
>     case TBLIDXB3_UNARY_OPCODE_Y0:
>     default:
>     unimplemented:
>         qemu_log_mask(LOG_UNIMP, "UNIMP rrr_1_unary_y0, [" FMT64X "]\n",
>                       bundle);
>         dc->exception = TILEGX_EXCP_OPCODE_UNIMPLEMENTED;
>         return;
>     }
> 
>     qemu_log_mask(CPU_LOG_TB_IN_ASM, "%s r%d,r%d\n",
>                   mnemonic, rdst, rsrc);
> }
> 
> static void decode_rrr_1_opcode_y0(struct DisasContext *dc,
>                                    tilegx_bundle_bits bundle)
> {
>     unsigned ext = get_RRROpcodeExtension_Y0(bundle);
>     uint8_t rsrca = get_SrcA_Y0(bundle);
>     uint8_t rsrcb = get_SrcB_Y0(bundle);
>     uint8_t rdst = get_Dest_Y0(bundle);
>     const char *mnemonic;
>     TCGv vdst, vsrca, vsrcb;
> 
>     if (ext == UNARY_RRR_1_OPCODE_Y0) {
>         decode_rrr_1_unary_y0(dc, bundle, rdst, rsrca);
>         return;
>     }
> 
>     vdst = dest_gr(dc, rdst);
>     vsrca = load_gr(dc, rsrca);
>     vsrcb = load_gr(dc, rsrcb);
> 
>     switch (ext) {
>     case SHL1ADD_RRR_1_OPCODE_Y0:
>         gen_shladd(vdst, vsrca, vsrcb, 1, 0);
>         mnemonic = "shl1add";
>         break;
>     case SHL2ADD_RRR_1_OPCODE_Y0:
>         gen_shladd(vdst, vsrca, vsrcb, 2, 0);
>         mnemonic = "shl2add";
>         break;
>     case SHL3ADD_RRR_1_OPCODE_Y0:
>         gen_shladd(vdst, vsrca, vsrcb, 3, 0);
>         mnemonic = "shl3add";
>         break;
>     default:
>         qemu_log_mask(LOG_UNIMP, "UNIMP rrr_1_opcode_y0, [" FMT64X "]\n",
>                       bundle);
>         dc->exception = TILEGX_EXCP_OPCODE_UNIMPLEMENTED;
>         return;
>     }
>     qemu_log_mask(CPU_LOG_TB_IN_ASM, "%s r%d,r%d,r%d\n",
>                   mnemonic, rdst, rsrca, rsrcb);
> }
> 

OK, thank you very much.
Richard Henderson June 2, 2015, 4:32 p.m. UTC | #3
On 06/01/2015 01:54 PM, Chen Gang wrote:
>> Further, the < TILEGX_R_COUNT restriction is also incorrect.  True, you don't
>> actually implement the top 7 special registers, but that doesn't matter, you
>> should still be incrementing them.
>>
> 
> We did not implement them, so we cannot increment them, either.
> 
> They are hidden from the outside; otherwise we would have to define and
> implement them.
> 
> So for me, the current code is correct.

It isn't correct, it's simply functional.  These registers may eventually be
implemented, and at that point this code will fail.  You'll note that your
store_add functions don't have the same problem, because they don't have this
R_COUNT check.  It would be better to increase the number of buffer slots and
do the right thing here in load_add.

My suggestion is to expand tmp_regs to 4, drop tmp_regcur, and have dest_gr
manage all of the indexing.  I.e.

static TCGv dest_gr(DisasContext *dc, uint8_t rdst)
{
    int n = dc->n_tmp_regs++;
    assert(n < ARRAY_SIZE(dc->tmp_regs));
    dc->tmp_regs[n].idx = rdst;
    return dc->tmp_regs[n].val = tcg_temp_new_i64();
}

In this way you can in fact call dest_gr twice within load_add and everything
will Just Work.
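
With that, load-and-add can buffer both of its outputs, e.g. (untested
sketch, with the disassembly logging moved into the decoder as suggested
earlier):

static void gen_ld_add(struct DisasContext *dc,
                       uint8_t rdst, uint8_t rsrc, int8_t imm8, TCGMemOp ops)
{
    TCGv vaddr = load_gr(dc, rsrc);

    tcg_gen_qemu_ld_i64(dest_gr(dc, rdst), vaddr, MMU_USER_IDX, ops);
    /* The post-increment is buffered like any other destination,
       instead of special-casing rsrc < TILEGX_R_COUNT.  */
    tcg_gen_addi_i64(dest_gr(dc, rsrc), vaddr, imm8);
}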


r~
Peter Maydell June 2, 2015, 5:54 p.m. UTC | #4
On 30 May 2015 at 22:18, Chen Gang <xili_gchen_5257@hotmail.com> wrote:
> Generate related tcg instructions, and qemu tilegx can finish running
> "Hello world". The elf64 binary can be static or shared.
>
> Signed-off-by: Chen Gang <gang.chen.5i5j@gmail.com>
> ---
>  target-tilegx/translate.c | 2787 +++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 2787 insertions(+)
>  create mode 100644 target-tilegx/translate.c

For me, this patch is just too long to review sensibly.
(Simply trying to deal with replying to this email in my mail
client was a pain.)

But Richard is going to review this so I'll let him make
that decision.

thanks
-- PMM
Chen Gang June 2, 2015, 8:25 p.m. UTC | #5
On 6/3/15 01:54, Peter Maydell wrote:
> On 30 May 2015 at 22:18, Chen Gang <xili_gchen_5257@hotmail.com> wrote:
>> Generate related tcg instructions, and qemu tilegx can finish running
>> "Hello world". The elf64 binary can be static or shared.
>>
>> Signed-off-by: Chen Gang <gang.chen.5i5j@gmail.com>
>> ---
>>  target-tilegx/translate.c | 2787 +++++++++++++++++++++++++++++++++++++++++++++
>>  1 file changed, 2787 insertions(+)
>>  create mode 100644 target-tilegx/translate.c
> 
> For me, this patch is just too long to review sensibly.
> (Simply trying to deal with replying to this email in my mail
> client was a pain.)
> 
> But Richard is going to review this so I'll let him make
> that decision.
> 

OK, thanks.
Chen Gang June 2, 2015, 9:30 p.m. UTC | #6
On 6/3/15 00:32, Richard Henderson wrote:
> On 06/01/2015 01:54 PM, Chen Gang wrote:
>>> Further, the < TILEGX_R_COUNT restriction is also incorrect.  True, you don't
>>> actually implement the top 7 special registers, but that doesn't matter, you
>>> should still be incrementing them.
>>>
>>
>> We did not implement them, so we cannot increment them, either.
>>
>> They are hidden from the outside; otherwise we would have to define and
>> implement them.
>>
>> So for me, the current code is correct.
> 
> It isn't correct, it's simply functional.  These registers may eventually be
> implemented, and at that point this code will fail.  You'll note that your
> store_add functions don't have the same problem, because they don't have this
> R_COUNT check.  It would be better to increase the number of buffer slots and
> do the right thing here in load_add.
> 

For me, there are two separate questions:

 - Whether we need to implement the additional 7 registers.

   I guess not. But if we really implement them in the future, we need
   only set TILEGX_R_COUNT = TILEGX_R_ZERO, and all things should still
   be OK.

 - Whether we need 2 or more tmp variables for one pipe.

   It is not necessary, but it will make the code simpler.


> My suggestion is to expand tmp_regs to 4, drop tmp_regcur, and have dest_gr
> manage all of the indexing.  I.e.
> 
> static TCGv dest_gr(DisasContext *dc, uint8_t rdst)
> {
>     int n = dc->n_tmp_regs++;
>     assert(n < ARRAY_SIZE(dc->tmp_regs));
>     dc->tmp_regs[n].idx = rdst;
>     return dc->tmp_regs[n].val = tcg_temp_new_i64();
> }
> 
> In this way you can in fact call dest_gr twice within load_add and everything
> will Just Work.
> 

For me, the code is fine (with dc->n_tmp_regs reset for each bundle).

Thanks.
Chen Gang June 7, 2015, 10:20 p.m. UTC | #7
After thinking it over again, I still prefer to keep gen_cntlz() and the
others, for the reasons below:

 - The gen_* functions (including gen_cntlz) are used in multiple places,
   and most of them are not a single statement. Printing the insn inside
   each gen_* is easy (and may be helpful).

 - The decode* functions are for switching on the opcode (the branches),
   not for the implementation (the leaves).

 - Once we use individual functions for all the unary opcode extensions,
   the decode* functions can stay very simple (although they may be long);
   making decode* print the insns would make them look a little more
   complex.


Thanks.

On 6/2/15 04:54, Chen Gang wrote:
>> I can't help thinking, as I read all of these decode functions, that it would
>> > be better if the output disassembly, i.e. qemu_log_mask(CPU_LOG_TB_IN_ASM, *),
>> > were to happen here, instead of being spread across 99 other functions.
>> > 
>> > This has a side effect of reducing many of your functions to a single
>> > statement, invoking another tcg generator, at which point it's worth inlining them.
>> > 
> OK, thanks.
> 
>> > For example:
>> > 
>> > static void decode_rrr_1_unary_y0(struct DisasContext *dc,
>> >                                   tilegx_bundle_bits bundle,
>> >                                   uint8_t rdst, uint8_t rsrc)
>> > {
>> >     unsigned ext = get_UnaryOpcodeExtension_Y0(bundle);
>> >     const char *mnemonic;
>> >     TCGv vdst, vsrc;
>> > 
>> >     if (ext == NOP_UNARY_OPCODE_Y0 || ext == FNOP_UNARY_OPCODE_Y0) {
>> >         if (rsrc != 0 || rdst != 0) {
>> >             goto unimplemented;
>> >         }
>> >         qemu_log_mask(CPU_LOG_TB_IN_ASM, "(f)nop\n");
>> >         return;
>> >     }
>> > 
>> >     vdst = dest_gr(dc, rdst);
>> >     vsrc = load_gr(dc, rsrc);
>> > 
>> >     switch (ext) {
>> >     case CNTLZ_UNARY_OPCODE_Y0:
>> >         gen_helper_cntlz(vdst, vsrc);
>> >         mnemonic = "cntlz";
>> >         break;
>> >     case CNTTZ_UNARY_OPCODE_Y0:
>> >         gen_helper_cnttz(vdst, vsrc);
>> >         mnemonic = "cnttz";
>> >         break;
>> >     case FSINGLE_PACK1_UNARY_OPCODE_Y0:
>> >     case PCNT_UNARY_OPCODE_Y0:
>> >     case REVBITS_UNARY_OPCODE_Y0:
>> >     case REVBYTES_UNARY_OPCODE_Y0:
>> >     case TBLIDXB0_UNARY_OPCODE_Y0:
>> >     case TBLIDXB1_UNARY_OPCODE_Y0:
>> >     case TBLIDXB2_UNARY_OPCODE_Y0:
>> >     case TBLIDXB3_UNARY_OPCODE_Y0:
>> >     default:
>> >     unimplemented:
>> >         qemu_log_mask(LOG_UNIMP, "UNIMP rrr_1_unary_y0, [" FMT64X "]\n",
>> >                       bundle);
>> >         dc->exception = TILEGX_EXCP_OPCODE_UNIMPLEMENTED;
>> >         return;
>> >     }
>> > 
>> >     qemu_log_mask(CPU_LOG_TB_IN_ASM, "%s r%d,r%d\n",
>> >                   mnemonic, rdst, rsrc);
>> > }
>> > 
>> > static void decode_rrr_1_opcode_y0(struct DisasContext *dc,
>> >                                    tilegx_bundle_bits bundle)
>> > {
>> >     unsigned ext = get_RRROpcodeExtension_Y0(bundle);
>> >     uint8_t rsrca = get_SrcA_Y0(bundle);
>> >     uint8_t rsrcb = get_SrcB_Y0(bundle);
>> >     uint8_t rdst = get_Dest_Y0(bundle);
>> >     const char *mnemonic;
>> >     TCGv vdst, vsrca, vsrcb;
>> > 
>> >     if (ext == UNARY_RRR_1_OPCODE_Y0) {
>> >         decode_rrr_1_unary_y0(dc, bundle, rdst, rsrca);
>> >         return;
>> >     }
>> > 
>> >     vdst = dest_gr(dc, rdst);
>> >     vsrca = load_gr(dc, rsrca);
>> >     vsrcb = load_gr(dc, rsrcb);
>> > 
>> >     switch (ext) {
>> >     case SHL1ADD_RRR_1_OPCODE_Y0:
>> >         gen_shladd(vdst, vsrca, vsrcb, 1, 0);
>> >         mnemonic = "shl1add";
>> >         break;
>> >     case SHL2ADD_RRR_1_OPCODE_Y0:
>> >         gen_shladd(vdst, vsrca, vsrcb, 2, 0);
>> >         mnemonic = "shl2add";
>> >         break;
>> >     case SHL3ADD_RRR_1_OPCODE_Y0:
>> >         gen_shladd(vdst, vsrca, vsrcb, 3, 0);
>> >         mnemonic = "shl3add";
>> >         break;
>> >     default:
>> >         qemu_log_mask(LOG_UNIMP, "UNIMP rrr_1_opcode_y0, [" FMT64X "]\n",
>> >                       bundle);
>> >         dc->exception = TILEGX_EXCP_OPCODE_UNIMPLEMENTED;
>> >         return;
>> >     }
>> >     qemu_log_mask(CPU_LOG_TB_IN_ASM, "%s r%d,r%d,r%d\n",
>> >                   mnemonic, rdst, rsrca, rsrcb);
>> > }
>> > 
> OK, thank you very much.
>

Patch

diff --git a/target-tilegx/translate.c b/target-tilegx/translate.c
new file mode 100644
index 0000000..7e45118
--- /dev/null
+++ b/target-tilegx/translate.c
@@ -0,0 +1,2787 @@ 
+/*
+ * QEMU TILE-Gx CPU
+ *
+ *  Copyright (c) 2015 Chen Gang
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see
+ * <http://www.gnu.org/licenses/lgpl-2.1.html>
+ */
+
+#include "cpu.h"
+#include "qemu/log.h"
+#include "disas/disas.h"
+#include "tcg-op.h"
+#include "exec/cpu_ldst.h"
+#include "opcode_tilegx.h"
+#include "spr_def_64.h"
+
+#define FMT64X                          "%016" PRIx64
+
+static TCGv_ptr cpu_env;
+static TCGv cpu_pc;
+static TCGv cpu_regs[TILEGX_R_COUNT];
+static TCGv cpu_spregs[TILEGX_SPR_COUNT];
+#if defined(CONFIG_USER_ONLY)
+static TCGv_i32 cpu_excparam;
+#endif
+
+static const char * const reg_names[] = {
+     "r0",  "r1",  "r2",  "r3",  "r4",  "r5",  "r6",  "r7",
+     "r8",  "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+    "r16", "r17", "r18", "r19", "r20", "r21", "r22", "r23",
+    "r24", "r25", "r26", "r27", "r28", "r29", "r30", "r31",
+    "r32", "r33", "r34", "r35", "r36", "r37", "r38", "r39",
+    "r40", "r41", "r42", "r43", "r44", "r45", "r46", "r47",
+    "r48", "r49", "r50", "r51",  "bp",  "tp",  "sp",  "lr"
+};
+
+static const char * const spreg_names[] = {
+    "cmpexch", "criticalsec", "simcontrol"
+};
+
+/* It is for temporary registers */
+typedef struct DisasContextTemp {
+    uint8_t idx;                   /* index */
+    TCGv val;                      /* value */
+} DisasContextTemp;
+
+/* This is the state at translation time.  */
+typedef struct DisasContext {
+    uint64_t pc;                   /* Current pc */
+    uint64_t exception;            /* Current exception */
+
+    TCGv zero;                     /* For zero register */
+
+    DisasContextTemp *tmp_regcur;  /* Current temporary registers */
+    DisasContextTemp tmp_regs[TILEGX_MAX_INSTRUCTIONS_PER_BUNDLE];
+                                   /* All temporary registers */
+    struct {
+        TCGCond cond;              /* Branch condition */
+        TCGv dest;                 /* pc jump destination, if will jump */
+        TCGv val1;                 /* First value for condition comparing */
+        TCGv val2;                 /* Second value for condition comparing */
+    } jmp;                         /* Jump object, only once in each TB block */
+} DisasContext;
+
+#include "exec/gen-icount.h"
+
+static TCGv load_zero(DisasContext *dc)
+{
+    if (TCGV_IS_UNUSED_I64(dc->zero)) {
+        dc->zero = tcg_const_i64(0);
+    }
+    return dc->zero;
+}
+
+static TCGv load_gr(DisasContext *dc, uint8_t reg)
+{
+    if (likely(reg < TILEGX_R_COUNT)) {
+        return cpu_regs[reg];
+    } else if (reg != TILEGX_R_ZERO) {
+        dc->exception = TILEGX_EXCP_REG_UNSUPPORTED;
+    }
+    return load_zero(dc);
+}
+
+static TCGv dest_gr(DisasContext *dc, uint8_t rdst)
+{
+    DisasContextTemp *tmp = dc->tmp_regcur;
+    tmp->idx = rdst;
+    tmp->val = tcg_temp_new_i64();
+    return tmp->val;
+}
+
+static void gen_exception(DisasContext *dc, int num)
+{
+    TCGv_i32 tmp = tcg_const_i32(num);
+
+    gen_helper_exception(cpu_env, tmp);
+    tcg_temp_free_i32(tmp);
+}
+
+/* mfspr can be only in X1 pipe, so it doesn't need to be bufferd */
+static void gen_mfspr(struct DisasContext *dc, uint8_t rdst, uint16_t imm14)
+{
+    qemu_log_mask(CPU_LOG_TB_IN_ASM, "mfspr r%d, 0x%x\n", rdst, imm14);
+
+    if (rdst >= TILEGX_R_COUNT) {
+        if (rdst != TILEGX_R_ZERO) {
+            dc->exception = TILEGX_EXCP_REG_UNSUPPORTED;
+        }
+        return;
+    }
+
+    switch (imm14) {
+    case SPR_CMPEXCH_VALUE:
+        tcg_gen_mov_i64(cpu_regs[rdst], cpu_spregs[TILEGX_SPR_CMPEXCH]);
+        return;
+    case SPR_INTERRUPT_CRITICAL_SECTION:
+        tcg_gen_mov_i64(cpu_regs[rdst], cpu_spregs[TILEGX_SPR_CRITICAL_SEC]);
+        return;
+    case SPR_SIM_CONTROL:
+        tcg_gen_mov_i64(cpu_regs[rdst], cpu_spregs[TILEGX_SPR_SIM_CONTROL]);
+        return;
+    default:
+        qemu_log_mask(LOG_UNIMP, "UNIMP mfspr 0x%x.\n", imm14);
+    }
+    dc->exception = TILEGX_EXCP_OPCODE_UNIMPLEMENTED;
+}
+
+/* mtspr can be only in X1 pipe, so it doesn't need to be bufferd */
+static void gen_mtspr(struct DisasContext *dc, uint8_t rsrc, uint16_t imm14)
+{
+    qemu_log_mask(CPU_LOG_TB_IN_ASM, "mtspr 0x%x, r%d\n", imm14, rsrc);
+
+    switch (imm14) {
+    case SPR_CMPEXCH_VALUE:
+        tcg_gen_mov_i64(cpu_spregs[TILEGX_SPR_CMPEXCH], load_gr(dc, rsrc));
+        return;
+    case SPR_INTERRUPT_CRITICAL_SECTION:
+        tcg_gen_mov_i64(cpu_spregs[TILEGX_SPR_CRITICAL_SEC], load_gr(dc, rsrc));
+        return;
+    case SPR_SIM_CONTROL:
+        tcg_gen_mov_i64(cpu_spregs[TILEGX_SPR_SIM_CONTROL], load_gr(dc, rsrc));
+        return;
+    default:
+        qemu_log_mask(LOG_UNIMP, "UNIMP mtspr 0x%x.\n", imm14);
+    }
+    dc->exception = TILEGX_EXCP_OPCODE_UNIMPLEMENTED;
+}
+
+#if 1
+static void extract_v1(TCGv out, TCGv in, unsigned byte)
+{
+    tcg_gen_shri_i64(out, in, byte * 8);
+    tcg_gen_ext8u_i64(out, out);
+}
+
+static void insert_v1(TCGv out, TCGv in, unsigned byte)
+{
+    tcg_gen_deposit_i64(out, out, in, byte * 8, 8);
+}
+#endif
+
+static void gen_cmpi(struct DisasContext *dc,
+                     uint8_t rdst, uint8_t rsrc, int8_t imm8,
+                     TCGCond cond, const char *code)
+{
+    qemu_log_mask(CPU_LOG_TB_IN_ASM, "%s r%d, r%d, %d\n",
+                  code, rdst, rsrc, imm8);
+    tcg_gen_setcondi_i64(cond,
+                         dest_gr(dc, rdst), load_gr(dc, rsrc), imm8);
+}
+
+static void gen_cmp(struct DisasContext *dc,
+                    uint8_t rdst, uint8_t rsrc, uint8_t rsrcb,
+                    TCGCond cond, const char *code)
+{
+    qemu_log_mask(CPU_LOG_TB_IN_ASM, "%s r%d, r%d, r%d\n",
+                  code, rdst, rsrc, rsrcb);
+    tcg_gen_setcond_i64(cond, dest_gr(dc, rdst), load_gr(dc, rsrc),
+                        load_gr(dc, rsrcb));
+}
+
+static void gen_atomic_excp(struct DisasContext *dc,
+                            uint8_t rdst, uint8_t rsrc, uint8_t rsrcb,
+                            int excp, const char *code)
+{
+    qemu_log_mask(CPU_LOG_TB_IN_ASM, "%s r%d, r%d, r%d\n",
+                  code, rdst, rsrc, rsrcb);
+#if defined(CONFIG_USER_ONLY)
+    tcg_gen_movi_i32(cpu_excparam, (rdst << 16) | (rsrc << 8) | rsrcb);
+    tcg_gen_movi_i64(cpu_pc, dc->pc + TILEGX_BUNDLE_SIZE_IN_BYTES);
+    dc->exception = excp;
+#else
+    dc->exception = TILEGX_EXCP_OPCODE_UNIMPLEMENTED;
+#endif
+}
+
+/*
+ * uint64_t output = 0;
+ * uint32_t counter;
+ * for (counter = 0; counter < (WORD_SIZE / BYTE_SIZE); counter++)
+ * {
+ *     int8_t srca = getByte (rf[SrcA], counter);
+ *     int8_t srcb = signExtend8 (Imm8);
+ *     output = setByte (output, counter, ((srca == srcb) ? 1 : 0));
+ * }
+ * rf[Dest] = output;
+ */
+static void gen_v1cmpeqi(struct DisasContext *dc,
+                         uint8_t rdst, uint8_t rsrc, int8_t imm8)
+{
+    int count;
+    TCGv vdst = dest_gr(dc, rdst);
+    TCGv vsrc = load_gr(dc, rsrc);
+    TCGv tmp = tcg_temp_new_i64();
+
+    qemu_log_mask(CPU_LOG_TB_IN_ASM, "v1cmpeqi r%d, r%d, %d\n",
+                  rdst, rsrc, imm8);
+
+    tcg_gen_movi_i64(vdst, 0); /* or Assertion `ts->val_type == TEMP_VAL_REG' */
+    for (count = 0; count < 8; count++) {
+        extract_v1(tmp, vsrc, count);
+        tcg_gen_setcondi_i64(TCG_COND_EQ, tmp, tmp, imm8);
+        insert_v1(vdst, tmp, count);
+    }
+    tcg_temp_free_i64(tmp);
+}
+
+static void gen_v1cmpeq(struct DisasContext *dc,
+                        uint8_t rdst, uint8_t rsrc, uint8_t rsrcb)
+{
+    int count;
+    TCGv vdst = dest_gr(dc, rdst);
+    TCGv vsrc = load_gr(dc, rsrc);
+    TCGv vsrcb = load_gr(dc, rsrcb);
+    TCGv tmp = tcg_temp_new_i64();
+    TCGv tmp2 = tcg_temp_new_i64();
+
+    qemu_log_mask(CPU_LOG_TB_IN_ASM, "v1cmpeq r%d, r%d, r%d\n",
+                  rdst, rsrc, rsrcb);
+
+    tcg_gen_movi_i64(vdst, 0); /* or Assertion `ts->val_type == TEMP_VAL_REG' */
+    for (count = 0; count < 8; count++) {
+        extract_v1(tmp, vsrc, count);
+        extract_v1(tmp2, vsrcb, count);
+        tcg_gen_setcond_i64(TCG_COND_EQ, tmp, tmp, tmp2);
+        insert_v1(vdst, tmp, count);
+    }
+    tcg_temp_free_i64(tmp2);
+    tcg_temp_free_i64(tmp);
+}
+
+/*
+ * Description
+ *
+ * Interleave the four low-order bytes of the first operand with the four
+ * low-order bytes of the second operand. The low-order byte of the result will
+ * be the low-order byte of the second operand. For example if the first operand
+ * contains the packed bytes {A7,A6,A5,A4,A3,A2,A1,A0} and the second operand
+ * contains the packed bytes {B7,B6,B5,B4,B3,B2,B1,B0} then the result will be
+ * {A3,B3,A2,B2,A1,B1,A0,B0}.
+ *
+ * Functional Description
+ *
+ *        uint64_t output = 0;
+ *        uint32_t counter;
+ *        for (counter = 0; counter < (WORD_SIZE / BYTE_SIZE); counter++)
+ *          {
+ *            bool asel = ((counter & 1) == 1);
+ *            int in_sel = 0 + counter / 2;
+ *            int8_t srca = getByte (rf[SrcA], in_sel);
+ *            int8_t srcb = getByte (rf[SrcB], in_sel);
+ *            output = setByte (output, counter, (asel ? srca : srcb));
+ *          }
+ *        rf[Dest] = output;
+ */
+static void gen_v1int_l(struct DisasContext *dc,
+                        uint8_t rdst, uint8_t rsrc, uint8_t rsrcb)
+{
+    int count;
+    TCGv vdst = dest_gr(dc, rdst);
+    TCGv vsrc = load_gr(dc, rsrc);
+    TCGv vsrcb = load_gr(dc, rsrcb);
+    TCGv tmp = tcg_temp_new_i64();
+
+    qemu_log_mask(CPU_LOG_TB_IN_ASM, "v1int_l r%d, r%d, r%d\n",
+                  rdst, rsrc, rsrcb);
+
+    tcg_gen_movi_i64(vdst, 0); /* or Assertion `ts->val_type == TEMP_VAL_REG' */
+    for (count = 0; count < 4; count++) {
+        extract_v1(tmp, vsrc, count);
+        insert_v1(vdst, tmp, 2 * count + 1);
+        extract_v1(tmp, vsrcb, count);
+        insert_v1(vdst, tmp, 2 * count);
+    }
+    tcg_temp_free_i64(tmp);
+}
+
+static void gen_v4int_l(struct DisasContext *dc,
+                        uint8_t rdst, uint8_t rsrc, uint8_t rsrcb)
+{
+    qemu_log_mask(CPU_LOG_TB_IN_ASM, "v4int_l r%d, r%d, r%d\n",
+                  rdst, rsrc, rsrcb);
+    tcg_gen_deposit_i64(dest_gr(dc, rdst), load_gr(dc, rsrc),
+                        load_gr(dc, rsrcb), 0, 32);
+}
+
+static void gen_cmov(struct DisasContext *dc,
+                     uint8_t rdst, uint8_t rsrc, uint8_t rsrcb,
+                     TCGCond cond, const char *code)
+{
+    qemu_log_mask(CPU_LOG_TB_IN_ASM, "%s r%d, r%d, r%d\n",
+                  code, rdst, rsrc, rsrcb);
+    tcg_gen_movcond_i64(cond, dest_gr(dc, rdst), load_gr(dc, rsrc),
+                        load_zero(dc), load_gr(dc, rsrcb), load_gr(dc, rdst));
+}
+
+static void gen_menz(struct DisasContext *dc,
+                     uint8_t rdst, uint8_t rsrc, uint8_t rsrcb,
+                     TCGCond cond, const char *code)
+{
+    qemu_log_mask(CPU_LOG_TB_IN_ASM, "%s r%d, r%d, r%d\n",
+                  code, rdst, rsrc, rsrcb);
+
+    tcg_gen_movcond_i64(cond, dest_gr(dc, rdst), load_gr(dc, rsrc),
+                        load_zero(dc), load_gr(dc, rsrcb), load_zero(dc));
+}
+
+static void gen_add(struct DisasContext *dc,
+                    uint8_t rdst, uint8_t rsrc, uint8_t rsrcb)
+{
+    qemu_log_mask(CPU_LOG_TB_IN_ASM, "add r%d, r%d, r%d\n", rdst, rsrc, rsrcb);
+    tcg_gen_add_i64(dest_gr(dc, rdst), load_gr(dc, rsrc), load_gr(dc, rsrcb));
+}
+
+static void gen_addimm(struct DisasContext *dc,
+                       uint8_t rdst, uint8_t rsrc, int16_t imm)
+{
+    qemu_log_mask(CPU_LOG_TB_IN_ASM, "add(l)i r%d, r%d, %d\n",
+                  rdst, rsrc, imm);
+    tcg_gen_addi_i64(dest_gr(dc, rdst), load_gr(dc, rsrc), imm);
+}
+
+static void gen_addx(struct DisasContext *dc,
+                     uint8_t rdst, uint8_t rsrc, uint8_t rsrcb)
+{
+    TCGv vdst = dest_gr(dc, rdst);
+
+    /* High bits have no effect with low bits, so addx and addxsc are merged. */
+    qemu_log_mask(CPU_LOG_TB_IN_ASM, "addx(sc) r%d, r%d, r%d\n",
+                  rdst, rsrc, rsrcb);
+    tcg_gen_add_i64(vdst, load_gr(dc, rsrc), load_gr(dc, rsrcb));
+    tcg_gen_ext32s_i64(vdst, vdst);
+}
+
+static void gen_addximm(struct DisasContext *dc,
+                        uint8_t rdst, uint8_t rsrc, int16_t imm)
+{
+    TCGv vdst = dest_gr(dc, rdst);
+
+    qemu_log_mask(CPU_LOG_TB_IN_ASM, "addx(l)i r%d, r%d, %d\n",
+                  rdst, rsrc, imm);
+    tcg_gen_addi_i64(vdst, load_gr(dc, rsrc), imm);
+    tcg_gen_ext32s_i64(vdst, vdst);
+}
+
+static void gen_sub(struct DisasContext *dc,
+                    uint8_t rdst, uint8_t rsrc, uint8_t rsrcb)
+{
+    qemu_log_mask(CPU_LOG_TB_IN_ASM, "sub r%d, r%d, r%d\n", rdst, rsrc, rsrcb);
+    tcg_gen_sub_i64(dest_gr(dc, rdst), load_gr(dc, rsrc), load_gr(dc, rsrcb));
+}
+
+static void gen_subx(struct DisasContext *dc,
+                     uint8_t rdst, uint8_t rsrc, uint8_t rsrcb)
+{
+    TCGv vdst = dest_gr(dc, rdst);
+
+    qemu_log_mask(CPU_LOG_TB_IN_ASM, "subx r%d, r%d, r%d\n", rdst, rsrc, rsrcb);
+    tcg_gen_sub_i64(vdst, load_gr(dc, rsrc), load_gr(dc, rsrcb));
+    tcg_gen_ext32s_i64(vdst, vdst);
+}
+
+/*
+ * uint64_t mask = 0;
+ * int64_t background = ((rf[SrcA] >> BFEnd) & 1) ? -1ULL : 0ULL;
+ * mask = ((-1ULL) ^ ((-1ULL << ((BFEnd - BFStart) & 63)) << 1));
+ * uint64_t rot_src = (((uint64_t) rf[SrcA]) >> BFStart)
+ *                    | (rf[SrcA] << (64 - BFStart));
+ * rf[Dest] = (rot_src & mask) | (background & ~mask);
+ */
+static void gen_bfexts(struct DisasContext *dc, uint8_t rdst, uint8_t rsrc,
+                       uint8_t start, uint8_t end)
+{
+    uint64_t mask = (-1ULL) ^ ((-1ULL << ((end - start) & 63)) << 1);
+    TCGv vdst = dest_gr(dc, rdst);
+    TCGv tmp = tcg_temp_new_i64();
+
+    qemu_log_mask(CPU_LOG_TB_IN_ASM, "bfexts r%d, r%d, %d, %d\n",
+                  rdst, rsrc, start, end);
+
+    tcg_gen_rotri_i64(vdst, load_gr(dc, rsrc), start);
+    tcg_gen_andi_i64(vdst, vdst, mask);
+
+    tcg_gen_shri_i64(tmp, load_gr(dc, rsrc), end);
+    tcg_gen_andi_i64(tmp, tmp, 1);
+    tcg_gen_neg_i64(tmp, tmp);
+    tcg_gen_andi_i64(tmp, tmp, ~mask);
+    tcg_gen_or_i64(vdst, vdst, tmp);
+
+    tcg_temp_free_i64(tmp);
+}
+
+/*
+ * The related functional description for bfextu in isa document:
+ *
+ * uint64_t mask = 0;
+ * mask = (-1ULL) ^ ((-1ULL << ((BFEnd - BFStart) & 63)) << 1);
+ * uint64_t rot_src = (((uint64_t) rf[SrcA]) >> BFStart)
+ *                    | (rf[SrcA] << (64 - BFStart));
+ * rf[Dest] = rot_src & mask;
+ */
+static void gen_bfextu(struct DisasContext *dc, uint8_t rdst, uint8_t rsrc,
+                       uint8_t start, uint8_t end)
+{
+    uint64_t mask = (-1ULL) ^ ((-1ULL << ((end - start) & 63)) << 1);
+    TCGv vdst = dest_gr(dc, rdst);
+
+    qemu_log_mask(CPU_LOG_TB_IN_ASM, "bfextu r%d, r%d, %d, %d\n",
+                  rdst, rsrc, start, end);
+
+    tcg_gen_rotri_i64(vdst, load_gr(dc, rsrc), start);
+    tcg_gen_andi_i64(vdst, vdst, mask);
+}
+
+/*
+ * mask = (start <= end) ? ((-1ULL << start) ^ ((-1ULL << end) << 1))
+ *                       : ((-1ULL << start) | (-1ULL >> (63 - end)));
+ * uint64_t rot_src = (rf[SrcA] << start)
+ *                    | ((uint64_t) rf[SrcA] >> (64 - start));
+ * rf[Dest] = (rot_src & mask) | (rf[Dest] & (-1ULL ^ mask));
+ */
+static void gen_bfins(struct DisasContext *dc, uint8_t rdst, uint8_t rsrc,
+                      uint8_t start, uint8_t end)
+{
+    uint64_t mask = (start <= end) ? ((-1ULL << start) ^ ((-1ULL << end) << 1))
+                                   : ((-1ULL << start) | (-1ULL >> (63 - end)));
+    TCGv vdst = dest_gr(dc, rdst);
+    TCGv tmp = tcg_temp_new_i64();
+
+    qemu_log_mask(CPU_LOG_TB_IN_ASM, "bfins r%d, r%d, %d, %d\n",
+                  rdst, rsrc, start, end);
+
+    tcg_gen_rotli_i64(tmp, load_gr(dc, rsrc), start);
+
+    tcg_gen_andi_i64(tmp, tmp, mask);
+    tcg_gen_andi_i64(vdst, load_gr(dc, rdst), -1ULL ^ mask);
+    tcg_gen_or_i64(vdst, vdst, tmp);
+
+    tcg_temp_free_i64(tmp);
+}
+
+static void gen_or(struct DisasContext *dc,
+                   uint8_t rdst, uint8_t rsrc, uint8_t rsrcb)
+{
+    qemu_log_mask(CPU_LOG_TB_IN_ASM, "or r%d, r%d, r%d\n", rdst, rsrc, rsrcb);
+    tcg_gen_or_i64(dest_gr(dc, rdst), load_gr(dc, rsrc), load_gr(dc, rsrcb));
+}
+
+static void gen_ori(struct DisasContext *dc,
+                    uint8_t rdst, uint8_t rsrc, int8_t imm8)
+{
+    qemu_log_mask(CPU_LOG_TB_IN_ASM, "ori r%d, r%d, %d\n", rdst, rsrc, imm8);
+    tcg_gen_ori_i64(dest_gr(dc, rdst), load_gr(dc, rsrc), imm8);
+}
+
+static void gen_xor(struct DisasContext *dc,
+                    uint8_t rdst, uint8_t rsrc, uint8_t rsrcb)
+{
+    qemu_log_mask(CPU_LOG_TB_IN_ASM, "xor r%d, r%d, r%d\n", rdst, rsrc, rsrcb);
+    tcg_gen_xor_i64(dest_gr(dc, rdst), load_gr(dc, rsrc), load_gr(dc, rsrcb));
+}
+
+static void gen_nor(struct DisasContext *dc,
+                    uint8_t rdst, uint8_t rsrc, uint8_t rsrcb)
+{
+    qemu_log_mask(CPU_LOG_TB_IN_ASM, "nor r%d, r%d, r%d\n", rdst, rsrc, rsrcb);
+    tcg_gen_nor_i64(dest_gr(dc, rdst), load_gr(dc, rsrc), load_gr(dc, rsrcb));
+}
+
+static void gen_and(struct DisasContext *dc,
+                    uint8_t rdst, uint8_t rsrc, uint8_t rsrcb)
+{
+    qemu_log_mask(CPU_LOG_TB_IN_ASM, "and r%d, r%d, r%d\n", rdst, rsrc, rsrcb);
+    tcg_gen_and_i64(dest_gr(dc, rdst), load_gr(dc, rsrc), load_gr(dc, rsrcb));
+}
+
+static void gen_andi(struct DisasContext *dc,
+                     uint8_t rdst, uint8_t rsrc, int8_t imm8)
+{
+    qemu_log_mask(CPU_LOG_TB_IN_ASM, "andi r%d, r%d, %d\n", rdst, rsrc, imm8);
+    tcg_gen_andi_i64(dest_gr(dc, rdst), load_gr(dc, rsrc), imm8);
+}
+
+static void gen_mulx(struct DisasContext *dc,
+                     uint8_t rdst, uint8_t rsrc, uint8_t rsrcb)
+{
+    TCGv vdst = dest_gr(dc, rdst);
+
+    qemu_log_mask(CPU_LOG_TB_IN_ASM, "mulx r%d, r%d, r%d\n", rdst, rsrc, rsrcb);
+
+    tcg_gen_mul_i64(vdst, load_gr(dc, rsrc), load_gr(dc, rsrcb));
+    tcg_gen_ext32s_i64(vdst, vdst);
+}
+
+static void gen_mul_u_u(struct DisasContext *dc,
+                        uint8_t rdst, uint8_t rsrc, uint8_t rsrcb,
+                        int8 high, int8 highb, int8 add, const char *code)
+{
+    TCGv vdst = dest_gr(dc, rdst);
+    TCGv tmp = tcg_temp_new_i64();
+
+    qemu_log_mask(CPU_LOG_TB_IN_ASM, "%s r%d, r%d, r%d\n",
+                  code, rdst, rsrc, rsrcb);
+
+    if (high) {
+        tcg_gen_shri_i64(tmp, load_gr(dc, rsrc), 32);
+    } else {
+        tcg_gen_andi_i64(tmp, load_gr(dc, rsrc), 0xffffffff);
+    }
+    if (highb) {
+        tcg_gen_shri_i64(vdst, load_gr(dc, rsrcb), 32);
+    } else {
+        tcg_gen_andi_i64(vdst, load_gr(dc, rsrcb), 0xffffffff);
+    }
+    tcg_gen_mul_i64(vdst, tmp, vdst);
+
+    if (add) {
+        tcg_gen_add_i64(vdst, load_gr(dc, rdst), vdst);
+    }
+
+    tcg_temp_free_i64(tmp);
+}
+
+static void gen_shlx(struct DisasContext *dc,
+                     uint8_t rdst, uint8_t rsrc, uint8_t rsrcb)
+{
+    TCGv vdst = dest_gr(dc, rdst);
+
+    qemu_log_mask(CPU_LOG_TB_IN_ASM, "shlx r%d, r%d, r%d\n", rdst, rsrc, rsrcb);
+    tcg_gen_andi_i64(vdst, load_gr(dc, rsrcb), 31);
+    tcg_gen_shl_i64(vdst, load_gr(dc, rsrc), vdst);
+    tcg_gen_ext32s_i64(vdst, vdst);
+}
+
+static void gen_shl(struct DisasContext *dc,
+                    uint8_t rdst, uint8_t rsrc, uint8_t rsrcb)
+{
+    TCGv vdst = dest_gr(dc, rdst);
+
+    qemu_log_mask(CPU_LOG_TB_IN_ASM, "shl r%d, r%d, r%d\n", rdst, rsrc, rsrcb);
+    tcg_gen_andi_i64(vdst, load_gr(dc, rsrcb), 63);
+    tcg_gen_shl_i64(vdst, load_gr(dc, rsrc), vdst);
+}
+
+static void gen_shli(struct DisasContext *dc,
+                     uint8_t rdst, uint8_t rsrc, uint8_t shamt)
+{
+    TCGv vdst = dest_gr(dc, rdst);
+
+    qemu_log_mask(CPU_LOG_TB_IN_ASM, "shli r%d, r%d, %u\n", rdst, rsrc, shamt);
+    tcg_gen_shli_i64(vdst, load_gr(dc, rsrc), shamt);
+}
+
+static void gen_shlxi(struct DisasContext *dc,
+                      uint8_t rdst, uint8_t rsrc, uint8_t shamt)
+{
+    TCGv vdst = dest_gr(dc, rdst);
+
+    qemu_log_mask(CPU_LOG_TB_IN_ASM, "shlxi r%d, r%d, %u\n", rdst, rsrc, shamt);
+    tcg_gen_shli_i64(vdst, load_gr(dc, rsrc), shamt & 31);
+    tcg_gen_ext32s_i64(vdst, vdst);
+}
+
+static void gen_shladd(struct DisasContext *dc,
+                       uint8_t rdst, uint8_t rsrc, uint8_t rsrcb,
+                       uint8_t shift, uint8_t cast)
+{
+    TCGv vdst = dest_gr(dc, rdst);
+
+    qemu_log_mask(CPU_LOG_TB_IN_ASM, "shl%dadd%s r%d, r%d, r%d\n",
+                  shift, cast ? "x" : "", rdst, rsrc, rsrcb);
+    tcg_gen_shli_i64(vdst, load_gr(dc, rsrc), shift);
+    tcg_gen_add_i64(vdst, vdst, load_gr(dc, rsrcb));
+    if (cast) {
+        tcg_gen_ext32s_i64(vdst, vdst);
+    }
+}
+
+static void gen_shl16insli(struct DisasContext *dc,
+                           uint8_t rdst, uint8_t rsrc, uint16_t uimm16)
+{
+    TCGv vdst = dest_gr(dc, rdst);
+
+    qemu_log_mask(CPU_LOG_TB_IN_ASM, "shl16insli r%d, r%d, 0x%x\n",
+                  rdst, rsrc, uimm16);
+    tcg_gen_shli_i64(vdst, load_gr(dc, rsrc), 16);
+    tcg_gen_ori_i64(vdst, vdst, uimm16);
+}
+
+static void gen_shrs(struct DisasContext *dc,
+                     uint8_t rdst, uint8_t rsrc, uint8_t rsrcb)
+{
+    TCGv vdst = dest_gr(dc, rdst);
+
+    qemu_log_mask(CPU_LOG_TB_IN_ASM, "shrs r%d, r%d, r%d\n", rdst, rsrc, rsrcb);
+    tcg_gen_andi_i64(vdst, load_gr(dc, rsrcb), 63);
+    tcg_gen_sar_i64(vdst, load_gr(dc, rsrc), vdst);
+}
+
+static void gen_shrux(struct DisasContext *dc,
+                      uint8_t rdst, uint8_t rsrc, uint8_t rsrcb)
+{
+    TCGv vdst = dest_gr(dc, rdst);
+    TCGv tmp = tcg_temp_new_i64();
+
+    qemu_log_mask(CPU_LOG_TB_IN_ASM, "shrux r%d, r%d, r%d\n",
+                  rdst, rsrc, rsrcb);
+    tcg_gen_andi_i64(vdst, load_gr(dc, rsrcb), 31);
+    tcg_gen_andi_i64(tmp, load_gr(dc, rsrc), 0xffffffff);
+    tcg_gen_shr_i64(vdst, tmp, vdst);
+    tcg_gen_ext32s_i64(vdst, vdst);
+
+    tcg_temp_free_i64(tmp);
+}
+
+static void gen_shru(struct DisasContext *dc,
+                     uint8_t rdst, uint8_t rsrc, uint8_t rsrcb)
+{
+    TCGv vdst = dest_gr(dc, rdst);
+
+    qemu_log_mask(CPU_LOG_TB_IN_ASM, "shru r%d, r%d, r%d\n", rdst, rsrc, rsrcb);
+    tcg_gen_andi_i64(vdst, load_gr(dc, rsrcb), 63);
+    tcg_gen_shr_i64(vdst, load_gr(dc, rsrc), vdst);
+}
+
+static void gen_shufflebytes(struct DisasContext *dc,
+                             uint8_t rdst, uint8_t rsrc, uint8_t rsrcb)
+{
+    qemu_log_mask(CPU_LOG_TB_IN_ASM, "shufflebytes r%d, r%d, r%d\n",
+                  rdst, rsrc, rsrcb);
+    gen_helper_shufflebytes(dest_gr(dc, rdst), load_gr(dc, rdst),
+                            load_gr(dc, rsrc), load_gr(dc, rsrcb));
+}
+
+static void gen_shrsi(struct DisasContext *dc,
+                      uint8_t rdst, uint8_t rsrc, uint8_t shamt)
+{
+    qemu_log_mask(CPU_LOG_TB_IN_ASM, "shrsi r%d, r%d, %u\n", rdst, rsrc, shamt);
+    tcg_gen_sari_i64(dest_gr(dc, rdst), load_gr(dc, rsrc), shamt);
+}
+
+static void gen_shrui(struct DisasContext *dc,
+                      uint8_t rdst, uint8_t rsrc, uint8_t shamt)
+{
+    qemu_log_mask(CPU_LOG_TB_IN_ASM, "shrui r%d, r%d, %u\n", rdst, rsrc, shamt);
+    tcg_gen_shri_i64(dest_gr(dc, rdst), load_gr(dc, rsrc), shamt);
+}
+
+static void gen_shruxi(struct DisasContext *dc,
+                       uint8_t rdst, uint8_t rsrc, uint8_t shamt)
+{
+    TCGv vdst = dest_gr(dc, rdst);
+
+    qemu_log_mask(CPU_LOG_TB_IN_ASM, "shruxi r%d, r%d, %u\n",
+                  rdst, rsrc, shamt);
+    tcg_gen_andi_i64(vdst, load_gr(dc, rsrc), 0xffffffff);
+    tcg_gen_shri_i64(vdst, vdst, shamt & 31);
+    tcg_gen_ext32s_i64(vdst, vdst);
+}
+
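+/* v1shrui: logical right shift of each of the eight bytes by shamt & 7. */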
+static void gen_v1shrui(struct DisasContext *dc,
+                        uint8_t rdst, uint8_t rsrc, uint8_t shamt)
+{
+    int count;
+    TCGv vdst = dest_gr(dc, rdst);
+    TCGv vsrc = load_gr(dc, rsrc);
+    TCGv tmp = tcg_temp_new_i64();
+
+    qemu_log_mask(CPU_LOG_TB_IN_ASM, "v1shrui r%d, r%d, %u\n",
+                  rdst, rsrc, shamt);
+
+    shamt &= 7;
+    /* The destination temporary must be initialized before insert_v1
+       reads it back.  */
+    tcg_gen_movi_i64(vdst, 0);
+    for (count = 0; count < 8; count++) {
+        extract_v1(tmp, vsrc, count);
+        tcg_gen_shri_i64(tmp, tmp, shamt);
+        insert_v1(vdst, tmp, count);
+    }
+    tcg_temp_free_i64(tmp);
+}
+
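+/*
+ * dblalign: rf[Dest] is shifted right by the byte offset held in rf[SrcB]
+ * and the vacated high bytes are filled from rf[SrcA]:
+ *     rf[Dest] = (rf[Dest] >> (8 * (SrcB & 7))) | (rf[SrcA] << (64 - 8 * (SrcB & 7)))
+ */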
+static void gen_dblalign(struct DisasContext *dc,
+                         uint8_t rdst, uint8_t rsrc, uint8_t rsrcb)
+{
+    TCGv vdst = dest_gr(dc, rdst);
+    TCGv mask = tcg_temp_new_i64();
+    TCGv tmp = tcg_temp_new_i64();
+
+    qemu_log_mask(CPU_LOG_TB_IN_ASM, "dblalign r%d, r%d, r%d\n",
+                  rdst, rsrc, rsrcb);
+
+    tcg_gen_andi_i64(mask, load_gr(dc, rsrcb), 7);
+    tcg_gen_muli_i64(mask, mask, 8);
+    tcg_gen_shr_i64(vdst, load_gr(dc, rdst), mask);
+
+    /* Compute rsrc << (64 - mask) as (rsrc << 1) << (63 - mask), so that
+       a byte offset of zero does not produce an undefined 64-bit shift.  */
+    tcg_gen_xori_i64(mask, mask, 63);
+    tcg_gen_shli_i64(tmp, load_gr(dc, rsrc), 1);
+    tcg_gen_shl_i64(tmp, tmp, mask);
+
+    tcg_gen_or_i64(vdst, vdst, tmp);
+
+    tcg_temp_free_i64(tmp);
+    tcg_temp_free_i64(mask);
+}
+
+static void gen_cntlz(struct DisasContext *dc, uint8_t rdst, uint8_t rsrc)
+{
+    qemu_log_mask(CPU_LOG_TB_IN_ASM, "cntlz r%d, r%d\n", rdst, rsrc);
+    gen_helper_cntlz(dest_gr(dc, rdst), load_gr(dc, rsrc));
+}
+
+static void gen_cnttz(struct DisasContext *dc, uint8_t rdst, uint8_t rsrc)
+{
+    qemu_log_mask(CPU_LOG_TB_IN_ASM, "ctz r%d, r%d\n", rdst, rsrc);
+    gen_helper_cnttz(dest_gr(dc, rdst), load_gr(dc, rsrc));
+}
+
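+/* All loads and stores below use MMU_USER_IDX, as this translator
+   currently targets user-mode (linux-user) emulation. */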
+static void gen_ld(struct DisasContext *dc,
+                   uint8_t rdst, uint8_t rsrc,
+                   TCGMemOp ops, const char *code)
+{
+    qemu_log_mask(CPU_LOG_TB_IN_ASM, "%s r%d, r%d\n", code, rdst, rsrc);
+    tcg_gen_qemu_ld_i64(dest_gr(dc, rdst), load_gr(dc, rsrc),
+                        MMU_USER_IDX, ops);
+}
+
+static void gen_ld_add(struct DisasContext *dc,
+                       uint8_t rdst, uint8_t rsrc, int8_t imm8,
+                       TCGMemOp ops, const char *code)
+{
+    qemu_log_mask(CPU_LOG_TB_IN_ASM, "%s r%d, r%d, %d\n",
+                  code, rdst, rsrc, imm8);
+
+    tcg_gen_qemu_ld_i64(dest_gr(dc, rdst), load_gr(dc, rsrc),
+                        MMU_USER_IDX, ops);
+    /*
+     * Each pipe only have one temp val which is already used, and it is only
+     * for pipe X1, so can use real register
+     */
+    if (rsrc < TILEGX_R_COUNT) {
+        tcg_gen_addi_i64(cpu_regs[rsrc], load_gr(dc, rsrc), imm8);
+    }
+}
+
+static void gen_st(struct DisasContext *dc,
+                   uint8_t rsrc, uint8_t rsrcb,
+                   TCGMemOp ops, const char *code)
+{
+    qemu_log_mask(CPU_LOG_TB_IN_ASM, "%s r%d, r%d\n", code, rsrc, rsrcb);
+    tcg_gen_qemu_st_i64(load_gr(dc, rsrcb), load_gr(dc, rsrc),
+                        MMU_USER_IDX, ops);
+}
+
+static void gen_st_add(struct DisasContext *dc,
+                       uint8_t rsrc, uint8_t rsrcb, int8_t imm8,
+                       TCGMemOp ops, const char *code)
+{
+    qemu_log_mask(CPU_LOG_TB_IN_ASM, "%s r%d, r%d, %d\n",
+                  code, rsrc, rsrcb, imm8);
+    tcg_gen_qemu_st_i64(load_gr(dc, rsrcb), load_gr(dc, rsrc),
+                        MMU_USER_IDX, ops);
+    tcg_gen_addi_i64(dest_gr(dc, rsrc), load_gr(dc, rsrc), imm8);
+}
+
+static void gen_lnk(struct DisasContext *dc, uint8_t rdst)
+{
+    qemu_log_mask(CPU_LOG_TB_IN_ASM, "lnk r%d\n", rdst);
+    tcg_gen_movi_i64(dest_gr(dc, rdst), dc->pc + TILEGX_BUNDLE_SIZE_IN_BYTES);
+}
+
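+/* Branches only record their condition, operands and target in dc->jmp
+   here; the compare-and-branch itself is expected to be emitted after
+   the whole bundle has been translated. */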
+static void gen_b(struct DisasContext *dc,
+                  uint8_t rsrc, int32_t off, TCGCond cond, const char *code)
+{
+    uint64_t pos = dc->pc + (int64_t)off * TILEGX_BUNDLE_SIZE_IN_BYTES;
+
+    qemu_log_mask(CPU_LOG_TB_IN_ASM, "%s r%d, %d ([" TARGET_FMT_lx "] %s)\n",
+                  code, rsrc, off, pos, lookup_symbol(pos));
+
+    dc->jmp.dest = tcg_temp_new_i64();
+    dc->jmp.val1 = tcg_temp_new_i64();
+    dc->jmp.val2 = tcg_temp_new_i64();
+
+    dc->jmp.cond = cond;
+    tcg_gen_movi_i64(dc->jmp.dest, pos);
+    tcg_gen_mov_i64(dc->jmp.val1, load_gr(dc, rsrc));
+    tcg_gen_movi_i64(dc->jmp.val2, 0);
+}
+
+static void gen_blb(struct DisasContext *dc, uint8_t rsrc, int32_t off,
+                    TCGCond cond, const char *code)
+{
+    uint64_t pos = dc->pc + (int64_t)off * TILEGX_BUNDLE_SIZE_IN_BYTES;
+
+    qemu_log_mask(CPU_LOG_TB_IN_ASM, "%s r%d, %d ([" TARGET_FMT_lx "] %s)\n",
+                  code, rsrc, off, pos, lookup_symbol(pos));
+
+    dc->jmp.dest = tcg_temp_new_i64();
+    dc->jmp.val1 = tcg_temp_new_i64();
+    dc->jmp.val2 = tcg_temp_new_i64();
+
+    dc->jmp.cond = cond;
+    tcg_gen_movi_i64(dc->jmp.dest, pos);
+    tcg_gen_andi_i64(dc->jmp.val1, load_gr(dc, rsrc), 1);
+    tcg_gen_movi_i64(dc->jmp.val2, 0);
+}
+
+/* Memory fence. */
+static void gen_mf(struct DisasContext *dc)
+{
+    qemu_log_mask(CPU_LOG_TB_IN_ASM, "mf\n");
+    /* FIXME: no implementation appears to be needed for user-mode emulation. */
+}
+
+/* wh64: write-hint 64 bytes, a cache management hint. */
+static void gen_wh64(struct DisasContext *dc, uint8_t rsrc)
+{
+    qemu_log_mask(CPU_LOG_TB_IN_ASM, "wh64 r%d\n", rsrc);
+    /* FIXME: being only a cache hint, no implementation appears to be needed. */
+}
+
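+/* Indirect jump targets are forced to bundle (8 byte) alignment by
+   clearing the low three bits of the address. */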
+static void gen_jr(struct DisasContext *dc, uint8_t rsrc)
+{
+    qemu_log_mask(CPU_LOG_TB_IN_ASM, "jr(p) r%d\n", rsrc);
+
+    dc->jmp.dest = tcg_temp_new_i64();
+
+    dc->jmp.cond = TCG_COND_ALWAYS;
+    tcg_gen_andi_i64(dc->jmp.dest, load_gr(dc, rsrc), ~(sizeof(uint64_t) - 1));
+}
+
+static void gen_jalr(struct DisasContext *dc, uint8_t rsrc)
+{
+    qemu_log_mask(CPU_LOG_TB_IN_ASM, "jalr(p) r%d\n", rsrc);
+
+    dc->jmp.dest = tcg_temp_new_i64();
+    tcg_gen_movi_i64(dest_gr(dc, TILEGX_R_LR),
+                     dc->pc + TILEGX_BUNDLE_SIZE_IN_BYTES);
+
+    dc->jmp.cond = TCG_COND_ALWAYS;
+    tcg_gen_andi_i64(dc->jmp.dest, load_gr(dc, rsrc), ~(sizeof(uint64_t) - 1));
+}
+
+static void gen_j(struct DisasContext *dc, int off)
+{
+    uint64_t pos = dc->pc + (int64_t)off * TILEGX_BUNDLE_SIZE_IN_BYTES;
+
+    qemu_log_mask(CPU_LOG_TB_IN_ASM, "j %d ([" TARGET_FMT_lx "] %s)\n",
+                  off, pos, lookup_symbol(pos));
+
+    dc->jmp.dest = tcg_temp_new_i64();
+
+    dc->jmp.cond = TCG_COND_ALWAYS;
+    tcg_gen_movi_i64(dc->jmp.dest, pos);
+}
+
+static void gen_jal(struct DisasContext *dc, int off)
+{
+    uint64_t pos = dc->pc + (int64_t)off * TILEGX_BUNDLE_SIZE_IN_BYTES;
+
+    qemu_log_mask(CPU_LOG_TB_IN_ASM, "jal %d ([" TARGET_FMT_lx "] %s)\n",
+                  off, pos, lookup_symbol(pos));
+
+    dc->jmp.dest = tcg_temp_new_i64();
+    tcg_gen_movi_i64(dest_gr(dc, TILEGX_R_LR),
+                     dc->pc + TILEGX_BUNDLE_SIZE_IN_BYTES);
+
+    dc->jmp.cond = TCG_COND_ALWAYS;
+    tcg_gen_movi_i64(dc->jmp.dest, pos);
+}
+
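+/* swint1 is used by Linux as the system call trap: advance the PC past
+   this bundle and raise TILEGX_EXCP_SYSCALL for the main loop. */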
+static void gen_swint1(struct DisasContext *dc)
+{
+    qemu_log_mask(CPU_LOG_TB_IN_ASM, "swint1\n");
+
+    tcg_gen_movi_i64(cpu_pc, dc->pc + TILEGX_BUNDLE_SIZE_IN_BYTES);
+    dc->exception = TILEGX_EXCP_SYSCALL;
+}
+
+static void decode_rrr_0_opcode_y0(struct DisasContext *dc,
+                                   tilegx_bundle_bits bundle)
+{
+    uint8_t rsrc = get_SrcA_Y0(bundle);
+    uint8_t rsrcb = get_SrcB_Y0(bundle);
+    uint8_t rdst = get_Dest_Y0(bundle);
+
+    switch (get_RRROpcodeExtension_Y0(bundle)) {
+    case ADD_RRR_0_OPCODE_Y0:
+        gen_add(dc, rdst, rsrc, rsrcb);
+        return;
+    case ADDX_RRR_0_OPCODE_Y0:
+        gen_addx(dc, rdst, rsrc, rsrcb);
+        return;
+    case SUBX_RRR_0_OPCODE_Y0:
+        gen_subx(dc, rdst, rsrc, rsrcb);
+        return;
+    case SUB_RRR_0_OPCODE_Y0:
+        gen_sub(dc, rdst, rsrc, rsrcb);
+        return;
+    default:
+        break;
+    }
+
+    qemu_log_mask(LOG_UNIMP, "UNIMP rrr_0_opcode_y0, [" FMT64X "]\n", bundle);
+    dc->exception = TILEGX_EXCP_OPCODE_UNIMPLEMENTED;
+}
+
+static void decode_rrr_1_opcode_y0(struct DisasContext *dc,
+                                   tilegx_bundle_bits bundle)
+{
+    uint8_t rsrc = get_SrcA_Y0(bundle);
+    uint8_t rsrcb = get_SrcB_Y0(bundle);
+    uint8_t rdst = get_Dest_Y0(bundle);
+
+    switch (get_RRROpcodeExtension_Y0(bundle)) {
+    case UNARY_RRR_1_OPCODE_Y0:
+        switch (get_UnaryOpcodeExtension_Y0(bundle)) {
+        case CNTLZ_UNARY_OPCODE_Y0:
+            gen_cntlz(dc, rdst, rsrc);
+            return;
+        case CNTTZ_UNARY_OPCODE_Y0:
+            gen_cnttz(dc, rdst, rsrc);
+            return;
+        case NOP_UNARY_OPCODE_Y0:
+        case FNOP_UNARY_OPCODE_Y0:
+            if (!rsrc && !rdst) {
+                qemu_log_mask(CPU_LOG_TB_IN_ASM, "(f)nop\n");
+                return;
+            }
+            break;
+        case FSINGLE_PACK1_UNARY_OPCODE_Y0:
+        case PCNT_UNARY_OPCODE_Y0:
+        case REVBITS_UNARY_OPCODE_Y0:
+        case REVBYTES_UNARY_OPCODE_Y0:
+        case TBLIDXB0_UNARY_OPCODE_Y0:
+        case TBLIDXB1_UNARY_OPCODE_Y0:
+        case TBLIDXB2_UNARY_OPCODE_Y0:
+        case TBLIDXB3_UNARY_OPCODE_Y0:
+        default:
+            break;
+        }
+        break;
+    case SHL1ADD_RRR_1_OPCODE_Y0:
+        gen_shladd(dc, rdst, rsrc, rsrcb, 1, 0);
+        return;
+    case SHL2ADD_RRR_1_OPCODE_Y0:
+        gen_shladd(dc, rdst, rsrc, rsrcb, 2, 0);
+        return;
+    case SHL3ADD_RRR_1_OPCODE_Y0:
+        gen_shladd(dc, rdst, rsrc, rsrcb, 3, 0);
+        return;
+    default:
+        break;
+    }
+
+    qemu_log_mask(LOG_UNIMP, "UNIMP rrr_1_opcode_y0, [" FMT64X "]\n", bundle);
+}
+
+static void decode_rrr_2_opcode_y0(struct DisasContext *dc,
+                                   tilegx_bundle_bits bundle)
+{
+    uint8_t rsrc = get_SrcA_Y0(bundle);
+    uint8_t rsrcb = get_SrcB_Y0(bundle);
+    uint8_t rdst = get_Dest_Y0(bundle);
+
+    switch (get_RRROpcodeExtension_Y0(bundle)) {
+    case CMPLES_RRR_2_OPCODE_Y0:
+        gen_cmp(dc, rdst, rsrc, rsrcb, TCG_COND_LE, "cmples");
+        return;
+    case CMPLEU_RRR_2_OPCODE_Y0:
+        gen_cmp(dc, rdst, rsrc, rsrcb, TCG_COND_LEU, "cmpleu");
+        return;
+    case CMPLTS_RRR_2_OPCODE_Y0:
+        gen_cmp(dc, rdst, rsrc, rsrcb, TCG_COND_LT, "cmplts");
+        return;
+    case CMPLTU_RRR_2_OPCODE_Y0:
+        gen_cmp(dc, rdst, rsrc, rsrcb, TCG_COND_LTU, "cmpltu");
+        return;
+    default:
+        break;
+    }
+
+    qemu_log_mask(LOG_UNIMP, "UNIMP rrr_2_opcode_y0, [" FMT64X "]\n", bundle);
+    dc->exception = TILEGX_EXCP_OPCODE_UNIMPLEMENTED;
+}
+
+static void decode_rrr_3_opcode_y0(struct DisasContext *dc,
+                                   tilegx_bundle_bits bundle)
+{
+    uint8_t rsrc = get_SrcA_Y0(bundle);
+    uint8_t rsrcb = get_SrcB_Y0(bundle);
+    uint8_t rdst = get_Dest_Y0(bundle);
+
+    switch (get_RRROpcodeExtension_Y0(bundle)) {
+    case CMPEQ_RRR_3_OPCODE_Y0:
+        gen_cmp(dc, rdst, rsrc, rsrcb, TCG_COND_EQ, "cmpeq");
+        return;
+    case CMPNE_RRR_3_OPCODE_Y0:
+        gen_cmp(dc, rdst, rsrc, rsrcb, TCG_COND_NE, "cmpne");
+        return;
+    case MULX_RRR_3_OPCODE_Y0:
+        gen_mulx(dc, rdst, rsrc, rsrcb);
+        return;
+    case MULAX_RRR_3_OPCODE_Y0:
+    default:
+        break;
+    }
+
+    qemu_log_mask(LOG_UNIMP, "UNIMP rrr_3_opcode_y0, [" FMT64X "]\n", bundle);
+    dc->exception = TILEGX_EXCP_OPCODE_UNIMPLEMENTED;
+}
+
+static void decode_rrr_4_opcode_y0(struct DisasContext *dc,
+                                   tilegx_bundle_bits bundle)
+{
+    uint8_t rsrc = get_SrcA_Y0(bundle);
+    uint8_t rsrcb = get_SrcB_Y0(bundle);
+    uint8_t rdst = get_Dest_Y0(bundle);
+
+    switch (get_RRROpcodeExtension_Y0(bundle)) {
+    case CMOVNEZ_RRR_4_OPCODE_Y0:
+        gen_cmov(dc, rdst, rsrc, rsrcb, TCG_COND_NE, "cmovnez");
+        return;
+    case CMOVEQZ_RRR_4_OPCODE_Y0:
+        gen_cmov(dc, rdst, rsrc, rsrcb, TCG_COND_EQ, "cmoveqz");
+        return;
+    case MNZ_RRR_4_OPCODE_Y0:
+        gen_menz(dc, rdst, rsrc, rsrcb, TCG_COND_NE, "mnz");
+        return;
+    case MZ_RRR_4_OPCODE_Y0:
+        gen_menz(dc, rdst, rsrc, rsrcb, TCG_COND_EQ, "mz");
+        return;
+    default:
+        break;
+    }
+
+    qemu_log_mask(LOG_UNIMP, "UNIMP rrr_4_opcode_y0, [" FMT64X "]\n", bundle);
+    dc->exception = TILEGX_EXCP_OPCODE_UNIMPLEMENTED;
+}
+
+static void decode_rrr_5_opcode_y0(struct DisasContext *dc,
+                                   tilegx_bundle_bits bundle)
+{
+    uint8_t rsrc = get_SrcA_Y0(bundle);
+    uint8_t rsrcb = get_SrcB_Y0(bundle);
+    uint8_t rdst = get_Dest_Y0(bundle);
+
+    switch (get_RRROpcodeExtension_Y0(bundle)) {
+    case OR_RRR_5_OPCODE_Y0:
+        gen_or(dc, rdst, rsrc, rsrcb);
+        return;
+    case AND_RRR_5_OPCODE_Y0:
+        gen_and(dc, rdst, rsrc, rsrcb);
+        return;
+    case NOR_RRR_5_OPCODE_Y0:
+        gen_nor(dc, rdst, rsrc, rsrcb);
+        return;
+    case XOR_RRR_5_OPCODE_Y0:
+        gen_xor(dc, rdst, rsrc, rsrcb);
+        return;
+    default:
+        break;
+    }
+    qemu_log_mask(LOG_UNIMP, "UNIMP rrr_5_opcode_y0, [" FMT64X "]\n", bundle);
+    dc->exception = TILEGX_EXCP_OPCODE_UNIMPLEMENTED;
+}
+
+static void decode_rrr_6_opcode_y0(struct DisasContext *dc,
+                                   tilegx_bundle_bits bundle)
+{
+    uint8_t rsrc = get_SrcA_Y0(bundle);
+    uint8_t rsrcb = get_SrcB_Y0(bundle);
+    uint8_t rdst = get_Dest_Y0(bundle);
+
+    switch (get_RRROpcodeExtension_Y0(bundle)) {
+    case SHL_RRR_6_OPCODE_Y0:
+        gen_shl(dc, rdst, rsrc, rsrcb);
+        return;
+    case SHRS_RRR_6_OPCODE_Y0:
+        gen_shrs(dc, rdst, rsrc, rsrcb);
+        return;
+    case SHRU_RRR_6_OPCODE_Y0:
+        gen_shru(dc, rdst, rsrc, rsrcb);
+        return;
+    case ROTL_RRR_6_OPCODE_Y0:
+    default:
+        break;
+    }
+    qemu_log_mask(LOG_UNIMP, "UNIMP rrr_6_opcode_y0, [" FMT64X "]\n", bundle);
+    dc->exception = TILEGX_EXCP_OPCODE_UNIMPLEMENTED;
+}
+
+static void decode_rrr_9_opcode_y0(struct DisasContext *dc,
+                                   tilegx_bundle_bits bundle)
+{
+    uint8_t rsrc = get_SrcA_Y0(bundle);
+    uint8_t rsrcb = get_SrcB_Y0(bundle);
+    uint8_t rdst = get_Dest_Y0(bundle);
+
+    switch (get_RRROpcodeExtension_Y0(bundle)) {
+    case MULA_HU_HU_RRR_9_OPCODE_Y0:
+        gen_mul_u_u(dc, rdst, rsrc, rsrcb, 1, 1, 1, "mula_hu_hu");
+        return;
+    case MULA_LU_LU_RRR_9_OPCODE_Y0:
+        gen_mul_u_u(dc, rdst, rsrc, rsrcb, 0, 0, 1, "mula_lu_lu");
+        return;
+    case MULA_HS_HS_RRR_9_OPCODE_Y0:
+    case MULA_LS_LS_RRR_9_OPCODE_Y0:
+    default:
+        break;
+    }
+    qemu_log_mask(LOG_UNIMP, "UNIMP rrr_9_opcode_y0, [" FMT64X "]\n", bundle);
+    dc->exception = TILEGX_EXCP_OPCODE_UNIMPLEMENTED;
+}
+
+static void decode_shift_opcode_y0(struct DisasContext *dc,
+                                   tilegx_bundle_bits bundle)
+{
+    uint8_t rsrc = get_SrcA_Y0(bundle);
+    uint8_t shamt = get_ShAmt_Y0(bundle);
+    uint8_t rdst = get_Dest_Y0(bundle);
+
+    switch (get_ShiftOpcodeExtension_Y0(bundle)) {
+    case SHLI_SHIFT_OPCODE_Y0:
+        gen_shli(dc, rdst, rsrc, shamt);
+        return;
+    case SHRUI_SHIFT_OPCODE_Y0:
+        gen_shrui(dc, rdst, rsrc, shamt);
+        return;
+    case SHRSI_SHIFT_OPCODE_Y0:
+        gen_shrsi(dc, rdst, rsrc, shamt);
+        return;
+    case ROTLI_SHIFT_OPCODE_Y0:
+    default:
+        break;
+    }
+    qemu_log_mask(LOG_UNIMP, "UNIMP shift_opcode_y0, [" FMT64X "]\n", bundle);
+    dc->exception = TILEGX_EXCP_OPCODE_UNIMPLEMENTED;
+}
+
+static void decode_rrr_0_opcode_y1(struct DisasContext *dc,
+                                   tilegx_bundle_bits bundle)
+{
+    uint8_t rsrc = get_SrcA_Y1(bundle);
+    uint8_t rsrcb = get_SrcB_Y1(bundle);
+    uint8_t rdst = get_Dest_Y1(bundle);
+
+    switch (get_RRROpcodeExtension_Y1(bundle)) {
+    case ADDX_SPECIAL_0_OPCODE_Y1:
+        gen_addx(dc, rdst, rsrc, rsrcb);
+        return;
+    case ADD_SPECIAL_0_OPCODE_Y1:
+        gen_add(dc, rdst, rsrc, rsrcb);
+        return;
+    case SUBX_RRR_0_OPCODE_Y1:
+        gen_subx(dc, rdst, rsrc, rsrcb);
+        return;
+    case SUB_RRR_0_OPCODE_Y1:
+        gen_sub(dc, rdst, rsrc, rsrcb);
+        return;
+    default:
+        break;
+    }
+    qemu_log_mask(LOG_UNIMP, "UNIMP rrr_0_opcode_y1, [" FMT64X "]\n", bundle);
+    dc->exception = TILEGX_EXCP_OPCODE_UNIMPLEMENTED;
+}
+
+static void decode_rrr_1_opcode_y1(struct DisasContext *dc,
+                                   tilegx_bundle_bits bundle)
+{
+    uint8_t rsrc = get_SrcA_Y1(bundle);
+    uint8_t rsrcb = get_SrcB_Y1(bundle);
+    uint8_t rdst = get_Dest_Y1(bundle);
+
+    switch (get_RRROpcodeExtension_Y1(bundle)) {
+    case UNARY_RRR_1_OPCODE_Y1:
+        switch (get_UnaryOpcodeExtension_Y1(bundle)) {
+        case NOP_UNARY_OPCODE_Y1:
+        case FNOP_UNARY_OPCODE_Y1:
+            if (!rsrc && !rdst) {
+                qemu_log_mask(CPU_LOG_TB_IN_ASM, "(f)nop\n");
+                return;
+            }
+            break;
+        case JALRP_UNARY_OPCODE_Y1:
+        case JALR_UNARY_OPCODE_Y1:
+            if (!rdst) {
+                gen_jalr(dc, rsrc);
+                return;
+            }
+            break;
+        case JR_UNARY_OPCODE_Y1:
+        case JRP_UNARY_OPCODE_Y1:
+            if (!rdst) {
+                gen_jr(dc, rsrc);
+                return;
+            }
+            break;
+        case LNK_UNARY_OPCODE_Y1:
+            if (!rsrc) {
+                gen_lnk(dc, rdst);
+                return;
+            }
+            break;
+        case ILL_UNARY_OPCODE_Y1:
+        default:
+            break;
+        }
+        break;
+    case SHL1ADD_RRR_1_OPCODE_Y1:
+        gen_shladd(dc, rdst, rsrc, rsrcb, 1, 0);
+        return;
+    case SHL2ADD_RRR_1_OPCODE_Y1:
+        gen_shladd(dc, rdst, rsrc, rsrcb, 2, 0);
+        return;
+    case SHL3ADD_RRR_1_OPCODE_Y1:
+        gen_shladd(dc, rdst, rsrc, rsrcb, 3, 0);
+        return;
+    default:
+        break;
+    }
+    qemu_log_mask(LOG_UNIMP, "UNIMP rrr_1_opcode_y1, [" FMT64X "]\n", bundle);
+    dc->exception = TILEGX_EXCP_OPCODE_UNIMPLEMENTED;
+}
+
+static void decode_rrr_2_opcode_y1(struct DisasContext *dc,
+                                   tilegx_bundle_bits bundle)
+{
+    uint8_t rsrc = get_SrcA_Y1(bundle);
+    uint8_t rsrcb = get_SrcB_Y1(bundle);
+    uint8_t rdst = get_Dest_Y1(bundle);
+
+    switch (get_RRROpcodeExtension_Y1(bundle)) {
+    case CMPLES_RRR_2_OPCODE_Y1:
+        gen_cmp(dc, rdst, rsrc, rsrcb, TCG_COND_LE, "cmples");
+        return;
+    case CMPLEU_RRR_2_OPCODE_Y1:
+        gen_cmp(dc, rdst, rsrc, rsrcb, TCG_COND_LEU, "cmpleu");
+        return;
+    case CMPLTS_RRR_2_OPCODE_Y1:
+        gen_cmp(dc, rdst, rsrc, rsrcb, TCG_COND_LT, "cmplts");
+        return;
+    case CMPLTU_RRR_2_OPCODE_Y1:
+        gen_cmp(dc, rdst, rsrc, rsrcb, TCG_COND_LTU, "cmpltu");
+        return;
+    default:
+        break;
+    }
+
+    qemu_log_mask(LOG_UNIMP, "UNIMP rrr_2_opcode_y1, [" FMT64X "]\n", bundle);
+    dc->exception = TILEGX_EXCP_OPCODE_UNIMPLEMENTED;
+}
+
+static void decode_rrr_3_opcode_y1(struct DisasContext *dc,
+                                   tilegx_bundle_bits bundle)
+{
+    uint8_t rsrc = get_SrcA_Y1(bundle);
+    uint8_t rsrcb = get_SrcB_Y1(bundle);
+    uint8_t rdst = get_Dest_Y1(bundle);
+
+    switch (get_RRROpcodeExtension_Y1(bundle)) {
+    case CMPEQ_RRR_3_OPCODE_Y1:
+        gen_cmp(dc, rdst, rsrc, rsrcb, TCG_COND_EQ, "cmpeq");
+        return;
+    case CMPNE_RRR_3_OPCODE_Y1:
+        gen_cmp(dc, rdst, rsrc, rsrcb, TCG_COND_NE, "cmpne");
+        return;
+    default:
+        break;
+    }
+
+    qemu_log_mask(LOG_UNIMP, "UNIMP rrr_3_opcode_y1, [" FMT64X "]\n", bundle);
+    dc->exception = TILEGX_EXCP_OPCODE_UNIMPLEMENTED;
+}
+
+static void decode_rrr_5_opcode_y1(struct DisasContext *dc,
+                                   tilegx_bundle_bits bundle)
+{
+    uint8_t rsrc = get_SrcA_Y1(bundle);
+    uint8_t rsrcb = get_SrcB_Y1(bundle);
+    uint8_t rdst = get_Dest_Y1(bundle);
+
+    switch (get_RRROpcodeExtension_Y1(bundle)) {
+    case OR_RRR_5_OPCODE_Y1:
+        gen_or(dc, rdst, rsrc, rsrcb);
+        return;
+    case AND_RRR_5_OPCODE_Y1:
+        gen_and(dc, rdst, rsrc, rsrcb);
+        return;
+    case NOR_RRR_5_OPCODE_Y1:
+        gen_nor(dc, rdst, rsrc, rsrcb);
+        return;
+    case XOR_RRR_5_OPCODE_Y1:
+        gen_xor(dc, rdst, rsrc, rsrcb);
+        return;
+    default:
+        break;
+    }
+    qemu_log_mask(LOG_UNIMP, "UNIMP rrr_5_opcode_y1, [" FMT64X "]\n", bundle);
+    dc->exception = TILEGX_EXCP_OPCODE_UNIMPLEMENTED;
+}
+
+static void decode_shift_opcode_y1(struct DisasContext *dc,
+                                   tilegx_bundle_bits bundle)
+{
+    uint8_t rsrc = get_SrcA_Y1(bundle);
+    uint8_t rdst = get_Dest_Y1(bundle);
+    uint8_t shamt = get_ShAmt_Y1(bundle);
+
+    switch (get_RRROpcodeExtension_Y1(bundle)) {
+    case SHLI_SHIFT_OPCODE_Y1:
+        gen_shli(dc, rdst, rsrc, shamt);
+        return;
+    case SHRSI_SHIFT_OPCODE_Y1:
+        gen_shrsi(dc, rdst, rsrc, shamt);
+        return;
+    case SHRUI_SHIFT_OPCODE_Y1:
+        gen_shrui(dc, rdst, rsrc, shamt);
+        return;
+    case ROTLI_SHIFT_OPCODE_Y1:
+    default:
+        break;
+    }
+    qemu_log_mask(LOG_UNIMP, "UNIMP shift_opcode_y1, [" FMT64X "]\n", bundle);
+    dc->exception = TILEGX_EXCP_OPCODE_UNIMPLEMENTED;
+}
+
+static void decode_ldst0_opcode_y2(struct DisasContext *dc,
+                                   tilegx_bundle_bits bundle)
+{
+    uint8_t rsrca = get_SrcA_Y2(bundle);
+    uint8_t rsrcbdst = get_SrcBDest_Y2(bundle);
+
+    switch (get_Mode(bundle)) {
+    case MODE_OPCODE_YA2:
+        gen_ld(dc, rsrcbdst, rsrca, MO_SB, "ld1s");
+        return;
+    case MODE_OPCODE_YC2:
+        gen_st(dc, rsrca, rsrcbdst, MO_UB, "st1");
+        return;
+    case MODE_OPCODE_YB2:
+    default:
+        break;
+    }
+    qemu_log_mask(LOG_UNIMP, "UNIMP ldst0_opcode_y2, [" FMT64X "]\n", bundle);
+    dc->exception = TILEGX_EXCP_OPCODE_UNIMPLEMENTED;
+}
+
+static void decode_ldst1_opcode_y2(struct DisasContext *dc,
+                                   tilegx_bundle_bits bundle)
+{
+    uint8_t rsrc = get_SrcA_Y2(bundle);
+    uint8_t rsrcbdst = get_SrcBDest_Y2(bundle);
+
+    switch (get_Mode(bundle)) {
+    case MODE_OPCODE_YA2:
+        if (rsrcbdst == TILEGX_R_ZERO) {
+            /* Prefetch: no operation is needed. */
+            qemu_log_mask(CPU_LOG_TB_IN_ASM, "prefetch r%d\n", rsrc);
+            return;
+        }
+        gen_ld(dc, rsrcbdst, rsrc, MO_UB, "ld1u");
+        return;
+    case MODE_OPCODE_YB2:
+        gen_ld(dc, rsrcbdst, rsrc, MO_LESL, "ld4s");
+        return;
+    case MODE_OPCODE_YC2:
+        gen_st(dc, rsrc, rsrcbdst, MO_LEUW, "st2");
+        return;
+    default:
+        break;
+    }
+    qemu_log_mask(LOG_UNIMP, "UNIMP ldst1_opcode_y2, [" FMT64X "]\n", bundle);
+    dc->exception = TILEGX_EXCP_OPCODE_UNIMPLEMENTED;
+}
+
+static void decode_ldst2_opcode_y2(struct DisasContext *dc,
+                                   tilegx_bundle_bits bundle)
+{
+    uint8_t rsrc = get_SrcA_Y2(bundle);
+    uint8_t rsrcbdst = get_SrcBDest_Y2(bundle);
+
+    switch (get_Mode(bundle)) {
+    case MODE_OPCODE_YC2:
+        gen_st(dc, rsrc, rsrcbdst, MO_LEUL, "st4");
+        return;
+    case MODE_OPCODE_YB2:
+        gen_ld(dc, rsrcbdst, rsrc, MO_LEUL, "ld4u");
+        return;
+    case MODE_OPCODE_YA2:
+    default:
+        break;
+    }
+    qemu_log_mask(LOG_UNIMP, "UNIMP ldst2_opcode_y2, [" FMT64X "]\n", bundle);
+    dc->exception = TILEGX_EXCP_OPCODE_UNIMPLEMENTED;
+}
+
+static void decode_ldst3_opcode_y2(struct DisasContext *dc,
+                                   tilegx_bundle_bits bundle)
+{
+    uint8_t rsrca = get_SrcA_Y2(bundle);
+    uint8_t rsrcbdst = get_SrcBDest_Y2(bundle);
+
+    switch (get_Mode(bundle)) {
+    case MODE_OPCODE_YA2:
+        gen_ld(dc, rsrcbdst, rsrca, MO_LEUW, "ld2u");
+        return;
+    case MODE_OPCODE_YB2:
+        gen_ld(dc, rsrcbdst, rsrca, MO_LEQ, "ld(na)");
+        return;
+    case MODE_OPCODE_YC2:
+        gen_st(dc, rsrca, rsrcbdst, MO_LEQ, "st");
+        return;
+    default:
+        break;
+    }
+    qemu_log_mask(LOG_UNIMP, "UNIMP ldst3_opcode_y2, [" FMT64X "]\n", bundle);
+    dc->exception = TILEGX_EXCP_OPCODE_UNIMPLEMENTED;
+}
+
+static void decode_bf_opcode_x0(struct DisasContext *dc,
+                                tilegx_bundle_bits bundle)
+{
+    uint8_t rsrc = get_SrcA_X0(bundle);
+    uint8_t rdst = get_Dest_X0(bundle);
+    uint8_t start = get_BFStart_X0(bundle);
+    uint8_t end = get_BFEnd_X0(bundle);
+
+    switch (get_BFOpcodeExtension_X0(bundle)) {
+    case BFEXTS_BF_OPCODE_X0:
+        gen_bfexts(dc, rdst, rsrc, start, end);
+        return;
+    case BFEXTU_BF_OPCODE_X0:
+        gen_bfextu(dc, rdst, rsrc, start, end);
+        return;
+    case BFINS_BF_OPCODE_X0:
+        gen_bfins(dc, rdst, rsrc, start, end);
+        return;
+    case MM_BF_OPCODE_X0:
+    default:
+        break;
+    }
+
+    qemu_log_mask(LOG_UNIMP, "UNIMP bf_opcode_x0, [" FMT64X "]\n", bundle);
+    dc->exception = TILEGX_EXCP_OPCODE_UNIMPLEMENTED;
+}
+
+static void decode_imm8_opcode_x0(struct DisasContext *dc,
+                                  tilegx_bundle_bits bundle)
+{
+    uint8_t rsrc = get_SrcA_X0(bundle);
+    uint8_t rdst = get_Dest_X0(bundle);
+    int8_t imm8 = get_Imm8_X0(bundle);
+
+    switch (get_Imm8OpcodeExtension_X0(bundle)) {
+    case ADDI_IMM8_OPCODE_X0:
+        gen_addimm(dc, rdst, rsrc, imm8);
+        return;
+    case ADDXI_IMM8_OPCODE_X0:
+        gen_addximm(dc, rdst, rsrc, imm8);
+        return;
+    case ANDI_IMM8_OPCODE_X0:
+        gen_andi(dc, rdst, rsrc, imm8);
+        return;
+    case CMPEQI_IMM8_OPCODE_X0:
+        gen_cmpi(dc, rdst, rsrc, imm8, TCG_COND_EQ, "cmpeqi");
+        return;
+    case CMPLTSI_IMM8_OPCODE_X0:
+        gen_cmpi(dc, rdst, rsrc, imm8, TCG_COND_LT, "cmpltsi");
+        return;
+    case CMPLTUI_IMM8_OPCODE_X0:
+        gen_cmpi(dc, rdst, rsrc, imm8, TCG_COND_LTU, "cmpltui");
+        return;
+    case ORI_IMM8_OPCODE_X0:
+        gen_ori(dc, rdst, rsrc, imm8);
+        return;
+    case V1CMPEQI_IMM8_OPCODE_X0:
+        gen_v1cmpeqi(dc, rdst, rsrc, imm8);
+        return;
+    case V1ADDI_IMM8_OPCODE_X0:
+    case V1CMPLTSI_IMM8_OPCODE_X0:
+    case V1CMPLTUI_IMM8_OPCODE_X0:
+    case V1MAXUI_IMM8_OPCODE_X0:
+    case V1MINUI_IMM8_OPCODE_X0:
+    case V2ADDI_IMM8_OPCODE_X0:
+    case V2CMPEQI_IMM8_OPCODE_X0:
+    case V2CMPLTSI_IMM8_OPCODE_X0:
+    case V2CMPLTUI_IMM8_OPCODE_X0:
+    case V2MAXSI_IMM8_OPCODE_X0:
+    case V2MINSI_IMM8_OPCODE_X0:
+    case XORI_IMM8_OPCODE_X0:
+    default:
+        break;
+    }
+    qemu_log_mask(LOG_UNIMP, "UNIMP imm8_opcode_x0, [" FMT64X "]\n", bundle);
+    dc->exception = TILEGX_EXCP_OPCODE_UNIMPLEMENTED;
+}
+
+static void decode_rrr_0_opcode_x0(struct DisasContext *dc,
+                                   tilegx_bundle_bits bundle)
+{
+    uint8_t rsrc = get_SrcA_X0(bundle);
+    uint8_t rsrcb = get_SrcB_X0(bundle);
+    uint8_t rdst = get_Dest_X0(bundle);
+
+    switch (get_RRROpcodeExtension_X0(bundle)) {
+    case ADD_RRR_0_OPCODE_X0:
+        gen_add(dc, rdst, rsrc, rsrcb);
+        return;
+    case ADDXSC_RRR_0_OPCODE_X0:
+    case ADDX_RRR_0_OPCODE_X0:
+        gen_addx(dc, rdst, rsrc, rsrcb);
+        return;
+    case AND_RRR_0_OPCODE_X0:
+        gen_and(dc, rdst, rsrc, rsrcb);
+        return;
+    case CMOVEQZ_RRR_0_OPCODE_X0:
+        gen_cmov(dc, rdst, rsrc, rsrcb, TCG_COND_EQ, "cmoveqz");
+        return;
+    case CMOVNEZ_RRR_0_OPCODE_X0:
+        gen_cmov(dc, rdst, rsrc, rsrcb, TCG_COND_NE, "cmovnez");
+        return;
+    case CMPEQ_RRR_0_OPCODE_X0:
+        gen_cmp(dc, rdst, rsrc, rsrcb, TCG_COND_EQ, "cmpeq");
+        return;
+    case CMPLES_RRR_0_OPCODE_X0:
+        gen_cmp(dc, rdst, rsrc, rsrcb, TCG_COND_LE, "cmples");
+        return;
+    case CMPLEU_RRR_0_OPCODE_X0:
+        gen_cmp(dc, rdst, rsrc, rsrcb, TCG_COND_LEU, "cmpleu");
+        return;
+    case CMPLTS_RRR_0_OPCODE_X0:
+        gen_cmp(dc, rdst, rsrc, rsrcb, TCG_COND_LT, "cmplts");
+        return;
+    case CMPLTU_RRR_0_OPCODE_X0:
+        gen_cmp(dc, rdst, rsrc, rsrcb, TCG_COND_LTU, "cmpltu");
+        return;
+    case CMPNE_RRR_0_OPCODE_X0:
+        gen_cmp(dc, rdst, rsrc, rsrcb, TCG_COND_NE, "cmpne");
+        return;
+    case DBLALIGN_RRR_0_OPCODE_X0:
+        gen_dblalign(dc, rdst, rsrc, rsrcb);
+        return;
+    case MNZ_RRR_0_OPCODE_X0:
+        gen_menz(dc, rdst, rsrc, rsrcb, TCG_COND_NE, "mnz");
+        return;
+    case MZ_RRR_0_OPCODE_X0:
+        gen_menz(dc, rdst, rsrc, rsrcb, TCG_COND_EQ, "mz");
+        return;
+    case MULX_RRR_0_OPCODE_X0:
+        gen_mulx(dc, rdst, rsrc, rsrcb);
+        return;
+    case MULA_HU_HU_RRR_0_OPCODE_X0:
+        gen_mul_u_u(dc, rdst, rsrc, rsrcb, 1, 1, 1, "mula_hu_hu");
+        return;
+    case MULA_HU_LU_RRR_0_OPCODE_X0:
+        gen_mul_u_u(dc, rdst, rsrc, rsrcb, 1, 0, 1, "mula_hu_lu");
+        return;
+    case MULA_LU_LU_RRR_0_OPCODE_X0:
+        gen_mul_u_u(dc, rdst, rsrc, rsrcb, 0, 0, 1, "mula_lu_lu");
+        return;
+    case MUL_HU_HU_RRR_0_OPCODE_X0:
+        gen_mul_u_u(dc, rdst, rsrc, rsrcb, 1, 1, 0, "mul_hu_hu");
+        return;
+    case MUL_HU_LU_RRR_0_OPCODE_X0:
+        gen_mul_u_u(dc, rdst, rsrc, rsrcb, 1, 0, 0, "mul_hu_lu");
+        return;
+    case MUL_LU_LU_RRR_0_OPCODE_X0:
+        gen_mul_u_u(dc, rdst, rsrc, rsrcb, 0, 0, 0, "mul_lu_lu");
+        return;
+    case NOR_RRR_0_OPCODE_X0:
+        gen_nor(dc, rdst, rsrc, rsrcb);
+        return;
+    case OR_RRR_0_OPCODE_X0:
+        gen_or(dc, rdst, rsrc, rsrcb);
+        return;
+    case SHL_RRR_0_OPCODE_X0:
+        gen_shl(dc, rdst, rsrc, rsrcb);
+        return;
+    case SHL1ADDX_RRR_0_OPCODE_X0:
+        gen_shladd(dc, rdst, rsrc, rsrcb, 1, 1);
+        return;
+    case SHL1ADD_RRR_0_OPCODE_X0:
+        gen_shladd(dc, rdst, rsrc, rsrcb, 1, 0);
+        return;
+    case SHL2ADDX_RRR_0_OPCODE_X0:
+        gen_shladd(dc, rdst, rsrc, rsrcb, 2, 1);
+        return;
+    case SHL2ADD_RRR_0_OPCODE_X0:
+        gen_shladd(dc, rdst, rsrc, rsrcb, 2, 0);
+        return;
+    case SHL3ADDX_RRR_0_OPCODE_X0:
+        gen_shladd(dc, rdst, rsrc, rsrcb, 3, 1);
+        return;
+    case SHL3ADD_RRR_0_OPCODE_X0:
+        gen_shladd(dc, rdst, rsrc, rsrcb, 3, 0);
+        return;
+    case SHLX_RRR_0_OPCODE_X0:
+        gen_shlx(dc, rdst, rsrc, rsrcb);
+        return;
+    case SHRS_RRR_0_OPCODE_X0:
+        gen_shrs(dc, rdst, rsrc, rsrcb);
+        return;
+    case SHRUX_RRR_0_OPCODE_X0:
+        gen_shrux(dc, rdst, rsrc, rsrcb);
+        return;
+    case SHRU_RRR_0_OPCODE_X0:
+        gen_shru(dc, rdst, rsrc, rsrcb);
+        return;
+    case SHUFFLEBYTES_RRR_0_OPCODE_X0:
+        gen_shufflebytes(dc, rdst, rsrc, rsrcb);
+        return;
+    case SUBX_RRR_0_OPCODE_X0:
+        gen_subx(dc, rdst, rsrc, rsrcb);
+        return;
+    case SUB_RRR_0_OPCODE_X0:
+        gen_sub(dc, rdst, rsrc, rsrcb);
+        return;
+    case UNARY_RRR_0_OPCODE_X0:
+        switch (get_UnaryOpcodeExtension_X0(bundle)) {
+        case CNTLZ_UNARY_OPCODE_X0:
+            gen_cntlz(dc, rdst, rsrc);
+            return;
+        case CNTTZ_UNARY_OPCODE_X0:
+            gen_cnttz(dc, rdst, rsrc);
+            return;
+        case FNOP_UNARY_OPCODE_X0:
+        case NOP_UNARY_OPCODE_X0:
+            if (!rsrc && !rdst) {
+                qemu_log_mask(CPU_LOG_TB_IN_ASM, "(f)nop\n");
+                return;
+            }
+            break;
+        case FSINGLE_PACK1_UNARY_OPCODE_X0:
+        case PCNT_UNARY_OPCODE_X0:
+        case REVBITS_UNARY_OPCODE_X0:
+        case REVBYTES_UNARY_OPCODE_X0:
+        case TBLIDXB0_UNARY_OPCODE_X0:
+        case TBLIDXB1_UNARY_OPCODE_X0:
+        case TBLIDXB2_UNARY_OPCODE_X0:
+        case TBLIDXB3_UNARY_OPCODE_X0:
+        default:
+            break;
+        }
+        break;
+    case V1INT_L_RRR_0_OPCODE_X0:
+        gen_v1int_l(dc, rdst, rsrc, rsrcb);
+        return;
+    case V4INT_L_RRR_0_OPCODE_X0:
+        gen_v4int_l(dc, rdst, rsrc, rsrcb);
+        return;
+    case V1CMPEQ_RRR_0_OPCODE_X0:
+        gen_v1cmpeq(dc, rdst, rsrc, rsrcb);
+        return;
+    case XOR_RRR_0_OPCODE_X0:
+        gen_xor(dc, rdst, rsrc, rsrcb);
+        return;
+    case CMULAF_RRR_0_OPCODE_X0:
+    case CMULA_RRR_0_OPCODE_X0:
+    case CMULFR_RRR_0_OPCODE_X0:
+    case CMULF_RRR_0_OPCODE_X0:
+    case CMULHR_RRR_0_OPCODE_X0:
+    case CMULH_RRR_0_OPCODE_X0:
+    case CMUL_RRR_0_OPCODE_X0:
+    case CRC32_32_RRR_0_OPCODE_X0:
+    case CRC32_8_RRR_0_OPCODE_X0:
+    case DBLALIGN2_RRR_0_OPCODE_X0:
+    case DBLALIGN4_RRR_0_OPCODE_X0:
+    case DBLALIGN6_RRR_0_OPCODE_X0:
+    case FDOUBLE_ADDSUB_RRR_0_OPCODE_X0:
+    case FDOUBLE_ADD_FLAGS_RRR_0_OPCODE_X0:
+    case FDOUBLE_MUL_FLAGS_RRR_0_OPCODE_X0:
+    case FDOUBLE_PACK1_RRR_0_OPCODE_X0:
+    case FDOUBLE_PACK2_RRR_0_OPCODE_X0:
+    case FDOUBLE_SUB_FLAGS_RRR_0_OPCODE_X0:
+    case FDOUBLE_UNPACK_MAX_RRR_0_OPCODE_X0:
+    case FDOUBLE_UNPACK_MIN_RRR_0_OPCODE_X0:
+    case FSINGLE_ADD1_RRR_0_OPCODE_X0:
+    case FSINGLE_ADDSUB2_RRR_0_OPCODE_X0:
+    case FSINGLE_MUL1_RRR_0_OPCODE_X0:
+    case FSINGLE_MUL2_RRR_0_OPCODE_X0:
+    case FSINGLE_PACK2_RRR_0_OPCODE_X0:
+    case FSINGLE_SUB1_RRR_0_OPCODE_X0:
+    case MULAX_RRR_0_OPCODE_X0:
+    case MULA_HS_HS_RRR_0_OPCODE_X0:
+    case MULA_HS_HU_RRR_0_OPCODE_X0:
+    case MULA_HS_LS_RRR_0_OPCODE_X0:
+    case MULA_HS_LU_RRR_0_OPCODE_X0:
+    case MULA_HU_LS_RRR_0_OPCODE_X0:
+    case MULA_LS_LS_RRR_0_OPCODE_X0:
+    case MULA_LS_LU_RRR_0_OPCODE_X0:
+    case MUL_HS_HS_RRR_0_OPCODE_X0:
+    case MUL_HS_HU_RRR_0_OPCODE_X0:
+    case MUL_HS_LS_RRR_0_OPCODE_X0:
+    case MUL_HS_LU_RRR_0_OPCODE_X0:
+    case MUL_HU_LS_RRR_0_OPCODE_X0:
+    case MUL_LS_LS_RRR_0_OPCODE_X0:
+    case MUL_LS_LU_RRR_0_OPCODE_X0:
+    case ROTL_RRR_0_OPCODE_X0:
+    case SUBXSC_RRR_0_OPCODE_X0:
+    case V1ADDUC_RRR_0_OPCODE_X0:
+    case V1ADD_RRR_0_OPCODE_X0:
+    case V1ADIFFU_RRR_0_OPCODE_X0:
+    case V1AVGU_RRR_0_OPCODE_X0:
+    case V1CMPLES_RRR_0_OPCODE_X0:
+    case V1CMPLEU_RRR_0_OPCODE_X0:
+    case V1CMPLTS_RRR_0_OPCODE_X0:
+    case V1CMPLTU_RRR_0_OPCODE_X0:
+    case V1CMPNE_RRR_0_OPCODE_X0:
+    case V1DDOTPUSA_RRR_0_OPCODE_X0:
+    case V1DDOTPUS_RRR_0_OPCODE_X0:
+    case V1DOTPA_RRR_0_OPCODE_X0:
+    case V1DOTPUSA_RRR_0_OPCODE_X0:
+    case V1DOTPUS_RRR_0_OPCODE_X0:
+    case V1DOTP_RRR_0_OPCODE_X0:
+    case V1MAXU_RRR_0_OPCODE_X0:
+    case V1MINU_RRR_0_OPCODE_X0:
+    case V1MNZ_RRR_0_OPCODE_X0:
+    case V1MULTU_RRR_0_OPCODE_X0:
+    case V1MULUS_RRR_0_OPCODE_X0:
+    case V1MULU_RRR_0_OPCODE_X0:
+    case V1MZ_RRR_0_OPCODE_X0:
+    case V1SADAU_RRR_0_OPCODE_X0:
+    case V1SADU_RRR_0_OPCODE_X0:
+    case V1SHL_RRR_0_OPCODE_X0:
+    case V1SHRS_RRR_0_OPCODE_X0:
+    case V1SHRU_RRR_0_OPCODE_X0:
+    case V1SUBUC_RRR_0_OPCODE_X0:
+    case V1SUB_RRR_0_OPCODE_X0:
+    case V1INT_H_RRR_0_OPCODE_X0:
+    case V2INT_H_RRR_0_OPCODE_X0:
+    case V2INT_L_RRR_0_OPCODE_X0:
+    case V4INT_H_RRR_0_OPCODE_X0:
+    case V2ADDSC_RRR_0_OPCODE_X0:
+    case V2ADD_RRR_0_OPCODE_X0:
+    case V2ADIFFS_RRR_0_OPCODE_X0:
+    case V2AVGS_RRR_0_OPCODE_X0:
+    case V2CMPEQ_RRR_0_OPCODE_X0:
+    case V2CMPLES_RRR_0_OPCODE_X0:
+    case V2CMPLEU_RRR_0_OPCODE_X0:
+    case V2CMPLTS_RRR_0_OPCODE_X0:
+    case V2CMPLTU_RRR_0_OPCODE_X0:
+    case V2CMPNE_RRR_0_OPCODE_X0:
+    case V2DOTPA_RRR_0_OPCODE_X0:
+    case V2DOTP_RRR_0_OPCODE_X0:
+    case V2MAXS_RRR_0_OPCODE_X0:
+    case V2MINS_RRR_0_OPCODE_X0:
+    case V2MNZ_RRR_0_OPCODE_X0:
+    case V2MULFSC_RRR_0_OPCODE_X0:
+    case V2MULS_RRR_0_OPCODE_X0:
+    case V2MULTS_RRR_0_OPCODE_X0:
+    case V2MZ_RRR_0_OPCODE_X0:
+    case V2PACKH_RRR_0_OPCODE_X0:
+    case V2PACKL_RRR_0_OPCODE_X0:
+    case V2PACKUC_RRR_0_OPCODE_X0:
+    case V2SADAS_RRR_0_OPCODE_X0:
+    case V2SADAU_RRR_0_OPCODE_X0:
+    case V2SADS_RRR_0_OPCODE_X0:
+    case V2SADU_RRR_0_OPCODE_X0:
+    case V2SHLSC_RRR_0_OPCODE_X0:
+    case V2SHL_RRR_0_OPCODE_X0:
+    case V2SHRS_RRR_0_OPCODE_X0:
+    case V2SHRU_RRR_0_OPCODE_X0:
+    case V2SUBSC_RRR_0_OPCODE_X0:
+    case V2SUB_RRR_0_OPCODE_X0:
+    case V4ADDSC_RRR_0_OPCODE_X0:
+    case V4ADD_RRR_0_OPCODE_X0:
+    case V4PACKSC_RRR_0_OPCODE_X0:
+    case V4SHLSC_RRR_0_OPCODE_X0:
+    case V4SHL_RRR_0_OPCODE_X0:
+    case V4SHRS_RRR_0_OPCODE_X0:
+    case V4SHRU_RRR_0_OPCODE_X0:
+    case V4SUBSC_RRR_0_OPCODE_X0:
+    case V4SUB_RRR_0_OPCODE_X0:
+    case V1DDOTPUA_RRR_0_OPCODE_X0:
+    case V1DDOTPU_RRR_0_OPCODE_X0:
+    case V1DOTPUA_RRR_0_OPCODE_X0:
+    case V1DOTPU_RRR_0_OPCODE_X0:
+    default:
+        break;
+    }
+    qemu_log_mask(LOG_UNIMP, "UNIMP rrr_0_opcode_x0, [" FMT64X "]\n", bundle);
+    dc->exception = TILEGX_EXCP_OPCODE_UNIMPLEMENTED;
+}
+
+static void decode_shift_opcode_x0(struct DisasContext *dc,
+                                   tilegx_bundle_bits bundle)
+{
+    uint8_t rsrc = get_SrcA_X0(bundle);
+    uint8_t rdst = get_Dest_X0(bundle);
+    uint8_t shamt = get_ShAmt_X0(bundle);
+
+    switch (get_ShiftOpcodeExtension_X0(bundle)) {
+    case SHLI_SHIFT_OPCODE_X0:
+        gen_shli(dc, rdst, rsrc, shamt);
+        return;
+    case SHLXI_SHIFT_OPCODE_X0:
+        gen_shlxi(dc, rdst, rsrc, shamt);
+        return;
+    case SHRSI_SHIFT_OPCODE_X0:
+        gen_shrsi(dc, rdst, rsrc, shamt);
+        return;
+    case SHRUI_SHIFT_OPCODE_X0:
+        gen_shrui(dc, rdst, rsrc, shamt);
+        return;
+    case SHRUXI_SHIFT_OPCODE_X0:
+        gen_shruxi(dc, rdst, rsrc, shamt);
+        return;
+    case V1SHRUI_SHIFT_OPCODE_X0:
+        gen_v1shrui(dc, rdst, rsrc, shamt);
+        return;
+    case ROTLI_SHIFT_OPCODE_X0:
+    case V1SHLI_SHIFT_OPCODE_X0:
+    case V1SHRSI_SHIFT_OPCODE_X0:
+    case V2SHLI_SHIFT_OPCODE_X0:
+    case V2SHRSI_SHIFT_OPCODE_X0:
+    case V2SHRUI_SHIFT_OPCODE_X0:
+    default:
+        break;
+    }
+    qemu_log_mask(LOG_UNIMP, "UNIMP shift_opcode_x0, [" FMT64X "]\n", bundle);
+    dc->exception = TILEGX_EXCP_OPCODE_UNIMPLEMENTED;
+}
+
+static void decode_branch_opcode_x1(struct DisasContext *dc,
+                                    tilegx_bundle_bits bundle)
+{
+    uint8_t src = get_SrcA_X1(bundle);
+    int32_t off = sign_extend(get_BrOff_X1(bundle), 17);
+
+    switch (get_BrType_X1(bundle)) {
+    case BEQZT_BRANCH_OPCODE_X1:
+    case BEQZ_BRANCH_OPCODE_X1:
+        gen_b(dc, src, off, TCG_COND_EQ, "beqz(t)");
+        return;
+    case BNEZT_BRANCH_OPCODE_X1:
+    case BNEZ_BRANCH_OPCODE_X1:
+        gen_b(dc, src, off, TCG_COND_NE, "bnez(t)");
+        return;
+    case BLBCT_BRANCH_OPCODE_X1:
+    case BLBC_BRANCH_OPCODE_X1:
+        gen_blb(dc, src, off, TCG_COND_EQ, "blbc(t)");
+        return;
+    case BLBST_BRANCH_OPCODE_X1:
+    case BLBS_BRANCH_OPCODE_X1:
+        gen_blb(dc, src, off, TCG_COND_NE, "blbs(t)");
+        return;
+    case BLEZT_BRANCH_OPCODE_X1:
+    case BLEZ_BRANCH_OPCODE_X1:
+        gen_b(dc, src, off, TCG_COND_LE, "blez(t)");
+        return;
+    case BLTZT_BRANCH_OPCODE_X1:
+    case BLTZ_BRANCH_OPCODE_X1:
+        gen_b(dc, src, off, TCG_COND_LT, "bltz(t)");
+        return;
+    case BGTZT_BRANCH_OPCODE_X1:
+    case BGTZ_BRANCH_OPCODE_X1:
+        gen_b(dc, src, off, TCG_COND_GT, "bgtz(t)");
+        return;
+    case BGEZT_BRANCH_OPCODE_X1:
+    case BGEZ_BRANCH_OPCODE_X1:
+        gen_b(dc, src, off, TCG_COND_GE, "bgez(t)");
+        return;
+    default:
+        break;
+    }
+    qemu_log_mask(LOG_UNIMP, "UNIMP branch_opcode_x1, [" FMT64X "]\n", bundle);
+    dc->exception = TILEGX_EXCP_OPCODE_UNIMPLEMENTED;
+}
+
+static void decode_imm8_opcode_x1(struct DisasContext *dc,
+                                  tilegx_bundle_bits bundle)
+{
+    uint8_t rsrc = get_SrcA_X1(bundle);
+    uint8_t rdst = get_Dest_X1(bundle);
+    int8_t imm8 = get_Imm8_X1(bundle);
+    uint8_t rsrcb = get_SrcB_X1(bundle);
+    int8_t dimm8 = get_Dest_Imm8_X1(bundle);
+
+    switch (get_Imm8OpcodeExtension_X1(bundle)) {
+    case ADDI_IMM8_OPCODE_X1:
+        gen_addimm(dc, rdst, rsrc, imm8);
+        return;
+    case ADDXI_IMM8_OPCODE_X1:
+        gen_addximm(dc, rdst, rsrc, imm8);
+        return;
+    case ANDI_IMM8_OPCODE_X1:
+        gen_andi(dc, rdst, rsrc, imm8);
+        return;
+    case CMPEQI_IMM8_OPCODE_X1:
+        gen_cmpi(dc, rdst, rsrc, imm8, TCG_COND_EQ, "cmpeqi");
+        return;
+    case CMPLTSI_IMM8_OPCODE_X1:
+        gen_cmpi(dc, rdst, rsrc, imm8, TCG_COND_LT, "cmpltsi");
+        return;
+    case CMPLTUI_IMM8_OPCODE_X1:
+        gen_cmpi(dc, rdst, rsrc, imm8, TCG_COND_LTU, "cmpltui");
+        return;
+    case LD1S_ADD_IMM8_OPCODE_X1:
+        gen_ld_add(dc, rdst, rsrc, imm8, MO_SB, "ld1s_add");
+        return;
+    case LD1U_ADD_IMM8_OPCODE_X1:
+        gen_ld_add(dc, rdst, rsrc, imm8, MO_UB, "ld1u_add");
+        return;
+    case LD2S_ADD_IMM8_OPCODE_X1:
+        gen_ld_add(dc, rdst, rsrc, imm8, MO_LESW, "ld2s_add");
+        return;
+    case LD2U_ADD_IMM8_OPCODE_X1:
+        gen_ld_add(dc, rdst, rsrc, imm8, MO_LEUW, "ld2u_add");
+        return;
+    case LD4S_ADD_IMM8_OPCODE_X1:
+        gen_ld_add(dc, rdst, rsrc, imm8, MO_LESL, "ld4s_add");
+        return;
+    case LD4U_ADD_IMM8_OPCODE_X1:
+        gen_ld_add(dc, rdst, rsrc, imm8, MO_LEUL, "ld4u_add");
+        return;
+    case LD_ADD_IMM8_OPCODE_X1:
+        gen_ld_add(dc, rdst, rsrc, imm8, MO_LEQ, "ld(na)_add");
+        return;
+    case MFSPR_IMM8_OPCODE_X1:
+        gen_mfspr(dc, rdst, get_MF_Imm14_X1(bundle));
+        return;
+    case MTSPR_IMM8_OPCODE_X1:
+        gen_mtspr(dc, rsrc, get_MT_Imm14_X1(bundle));
+        return;
+    case ORI_IMM8_OPCODE_X1:
+        gen_ori(dc, rdst, rsrc, imm8);
+        return;
+    case ST_ADD_IMM8_OPCODE_X1:
+        gen_st_add(dc, rsrc, rsrcb, dimm8, MO_LEQ, "st_add");
+        return;
+    case ST1_ADD_IMM8_OPCODE_X1:
+        gen_st_add(dc, rsrc, rsrcb, dimm8, MO_UB, "st1_add");
+        return;
+    case ST2_ADD_IMM8_OPCODE_X1:
+        gen_st_add(dc, rsrc, rsrcb, dimm8, MO_LEUW, "st2_add");
+        return;
+    case ST4_ADD_IMM8_OPCODE_X1:
+        gen_st_add(dc, rsrc, rsrcb, dimm8, MO_LEUL, "st4_add");
+        return;
+    case V1CMPEQI_IMM8_OPCODE_X1:
+        gen_v1cmpeqi(dc, rdst, rsrc, imm8);
+        return;
+    case LDNT1S_ADD_IMM8_OPCODE_X1:
+    case LDNT1U_ADD_IMM8_OPCODE_X1:
+    case LDNT2S_ADD_IMM8_OPCODE_X1:
+    case LDNT2U_ADD_IMM8_OPCODE_X1:
+    case LDNT4S_ADD_IMM8_OPCODE_X1:
+    case LDNT4U_ADD_IMM8_OPCODE_X1:
+    case LDNT_ADD_IMM8_OPCODE_X1:
+    case LWNA_ADD_IMM8_OPCODE_X1:
+    case STNT1_ADD_IMM8_OPCODE_X1:
+    case STNT2_ADD_IMM8_OPCODE_X1:
+    case STNT4_ADD_IMM8_OPCODE_X1:
+    case STNT_ADD_IMM8_OPCODE_X1:
+    case V1ADDI_IMM8_OPCODE_X1:
+    case V1CMPLTSI_IMM8_OPCODE_X1:
+    case V1CMPLTUI_IMM8_OPCODE_X1:
+    case V1MAXUI_IMM8_OPCODE_X1:
+    case V1MINUI_IMM8_OPCODE_X1:
+    case V2ADDI_IMM8_OPCODE_X1:
+    case V2CMPEQI_IMM8_OPCODE_X1:
+    case V2CMPLTSI_IMM8_OPCODE_X1:
+    case V2CMPLTUI_IMM8_OPCODE_X1:
+    case V2MAXSI_IMM8_OPCODE_X1:
+    case V2MINSI_IMM8_OPCODE_X1:
+    case XORI_IMM8_OPCODE_X1:
+    default:
+        qemu_log_mask(LOG_UNIMP, "UNIMP opcode ext: %u\n",
+                      get_Imm8OpcodeExtension_X1(bundle));
+        break;
+    }
+    qemu_log_mask(LOG_UNIMP, "UNIMP imm8_opcode_x1, [" FMT64X "]\n", bundle);
+    dc->exception = TILEGX_EXCP_OPCODE_UNIMPLEMENTED;
+}
+
+static void decode_jump_opcode_x1(struct DisasContext *dc,
+                                  tilegx_bundle_bits bundle)
+{
+    int off = sign_extend(get_JumpOff_X1(bundle), 27);
+
+    switch (get_JumpOpcodeExtension_X1(bundle)) {
+    case JAL_JUMP_OPCODE_X1:
+        gen_jal(dc, off);
+        return;
+    case J_JUMP_OPCODE_X1:
+        gen_j(dc, off);
+        return;
+    default:
+        break;
+    }
+    qemu_log_mask(LOG_UNIMP, "UNIMP jump_opcode_x1, [" FMT64X "]\n", bundle);
+    dc->exception = TILEGX_EXCP_OPCODE_UNIMPLEMENTED;
+}
+
+static void decode_rrr_0_opcode_x1(struct DisasContext *dc,
+                                   tilegx_bundle_bits bundle)
+{
+    uint8_t rsrc = get_SrcA_X1(bundle);
+    uint8_t rsrcb = get_SrcB_X1(bundle);
+    uint8_t rdst = get_Dest_X1(bundle);
+
+    switch (get_RRROpcodeExtension_X1(bundle)) {
+    case ADDX_RRR_0_OPCODE_X1:
+    case ADDXSC_RRR_0_OPCODE_X1:
+        gen_addx(dc, rdst, rsrc, rsrcb);
+        return;
+    case ADD_RRR_0_OPCODE_X1:
+        gen_add(dc, rdst, rsrc, rsrcb);
+        return;
+    case AND_RRR_0_OPCODE_X1:
+        gen_and(dc, rdst, rsrc, rsrcb);
+        return;
+    case CMPEQ_RRR_0_OPCODE_X1:
+        gen_cmp(dc, rdst, rsrc, rsrcb, TCG_COND_EQ, "cmpeq");
+        return;
+    case CMPLES_RRR_0_OPCODE_X1:
+        gen_cmp(dc, rdst, rsrc, rsrcb, TCG_COND_LE, "cmples");
+        return;
+    case CMPLEU_RRR_0_OPCODE_X1:
+        gen_cmp(dc, rdst, rsrc, rsrcb, TCG_COND_LEU, "cmpleu");
+        return;
+    case CMPEXCH4_RRR_0_OPCODE_X1:
+        gen_atomic_excp(dc, rdst, rsrc, rsrcb,
+                        TILEGX_EXCP_OPCODE_CMPEXCH4, "cmpexch4");
+        return;
+    case CMPEXCH_RRR_0_OPCODE_X1:
+        gen_atomic_excp(dc, rdst, rsrc, rsrcb,
+                        TILEGX_EXCP_OPCODE_CMPEXCH, "cmpexch");
+        return;
+    case CMPLTS_RRR_0_OPCODE_X1:
+        gen_cmp(dc, rdst, rsrc, rsrcb, TCG_COND_LT, "cmplts");
+        return;
+    case CMPLTU_RRR_0_OPCODE_X1:
+        gen_cmp(dc, rdst, rsrc, rsrcb, TCG_COND_LTU, "cmpltu");
+        return;
+    case EXCH4_RRR_0_OPCODE_X1:
+        gen_atomic_excp(dc, rdst, rsrc, rsrcb,
+                        TILEGX_EXCP_OPCODE_EXCH4, "exch4");
+        return;
+    case EXCH_RRR_0_OPCODE_X1:
+        gen_atomic_excp(dc, rdst, rsrc, rsrcb,
+                        TILEGX_EXCP_OPCODE_EXCH, "exch");
+        return;
+    case FETCHADD_RRR_0_OPCODE_X1:
+        gen_atomic_excp(dc, rdst, rsrc, rsrcb,
+                        TILEGX_EXCP_OPCODE_FETCHADD, "fetchadd");
+        return;
+    case FETCHADD4_RRR_0_OPCODE_X1:
+        gen_atomic_excp(dc, rdst, rsrc, rsrcb,
+                        TILEGX_EXCP_OPCODE_FETCHADD4, "fetchadd4");
+        return;
+    case FETCHADDGEZ_RRR_0_OPCODE_X1:
+        gen_atomic_excp(dc, rdst, rsrc, rsrcb,
+                        TILEGX_EXCP_OPCODE_FETCHADDGEZ, "fetchaddgez");
+        return;
+    case FETCHADDGEZ4_RRR_0_OPCODE_X1:
+        gen_atomic_excp(dc, rdst, rsrc, rsrcb,
+                        TILEGX_EXCP_OPCODE_FETCHADDGEZ4, "fetchaddgez4");
+        return;
+    case FETCHAND_RRR_0_OPCODE_X1:
+        gen_atomic_excp(dc, rdst, rsrc, rsrcb,
+                        TILEGX_EXCP_OPCODE_FETCHAND, "fetchand");
+        return;
+    case FETCHAND4_RRR_0_OPCODE_X1:
+        gen_atomic_excp(dc, rdst, rsrc, rsrcb,
+                        TILEGX_EXCP_OPCODE_FETCHAND4, "fetchand4");
+        return;
+    case FETCHOR_RRR_0_OPCODE_X1:
+        gen_atomic_excp(dc, rdst, rsrc, rsrcb,
+                        TILEGX_EXCP_OPCODE_FETCHOR, "fetchor");
+        return;
+    case FETCHOR4_RRR_0_OPCODE_X1:
+        gen_atomic_excp(dc, rdst, rsrc, rsrcb,
+                        TILEGX_EXCP_OPCODE_FETCHOR4, "fetchor4");
+        return;
+    case MZ_RRR_0_OPCODE_X1:
+        gen_menz(dc, rdst, rsrc, rsrcb, TCG_COND_EQ, "mz");
+        return;
+    case MNZ_RRR_0_OPCODE_X1:
+        gen_menz(dc, rdst, rsrc, rsrcb, TCG_COND_NE, "mnz");
+        return;
+    case NOR_RRR_0_OPCODE_X1:
+        gen_nor(dc, rdst, rsrc, rsrcb);
+        return;
+    case OR_RRR_0_OPCODE_X1:
+        gen_or(dc, rdst, rsrc, rsrcb);
+        return;
+    case CMPNE_RRR_0_OPCODE_X1:
+        gen_cmp(dc, rdst, rsrc, rsrcb, TCG_COND_NE, "cmpne");
+        return;
+    case SHL1ADDX_RRR_0_OPCODE_X1:
+        gen_shladd(dc, rdst, rsrc, rsrcb, 1, 1);
+        return;
+    case SHL1ADD_RRR_0_OPCODE_X1:
+        gen_shladd(dc, rdst, rsrc, rsrcb, 1, 0);
+        return;
+    case SHL2ADDX_RRR_0_OPCODE_X1:
+        gen_shladd(dc, rdst, rsrc, rsrcb, 2, 1);
+        return;
+    case SHL2ADD_RRR_0_OPCODE_X1:
+        gen_shladd(dc, rdst, rsrc, rsrcb, 2, 0);
+        return;
+    case SHL3ADDX_RRR_0_OPCODE_X1:
+        gen_shladd(dc, rdst, rsrc, rsrcb, 3, 1);
+        return;
+    case SHL3ADD_RRR_0_OPCODE_X1:
+        gen_shladd(dc, rdst, rsrc, rsrcb, 3, 0);
+        return;
+    case SHLX_RRR_0_OPCODE_X1:
+        gen_shlx(dc, rdst, rsrc, rsrcb);
+        return;
+    case SHL_RRR_0_OPCODE_X1:
+        gen_shl(dc, rdst, rsrc, rsrcb);
+        return;
+    case SHRS_RRR_0_OPCODE_X1:
+        gen_shrs(dc, rdst, rsrc, rsrcb);
+        return;
+    case SHRUX_RRR_0_OPCODE_X1:
+        gen_shrux(dc, rdst, rsrc, rsrcb);
+        return;
+    case SHRU_RRR_0_OPCODE_X1:
+        gen_shru(dc, rdst, rsrc, rsrcb);
+        return;
+    case ST1_RRR_0_OPCODE_X1:
+        if (!rdst) {
+            gen_st(dc, rsrc, rsrcb, MO_UB, "st1");
+            return;
+        }
+        break;
+    case ST2_RRR_0_OPCODE_X1:
+        if (!rdst) {
+            gen_st(dc, rsrc, rsrcb, MO_LEUW, "st2");
+            return;
+        }
+        break;
+    case ST4_RRR_0_OPCODE_X1:
+        if (!rdst) {
+            gen_st(dc, rsrc, rsrcb, MO_LEUL, "st4");
+            return;
+        }
+        break;
+    case ST_RRR_0_OPCODE_X1:
+        if (!rdst) {
+            gen_st(dc, rsrc, rsrcb, MO_LEQ, "st");
+            return;
+        }
+        break;
+    case SUB_RRR_0_OPCODE_X1:
+        gen_sub(dc, rdst, rsrc, rsrcb);
+        return;
+    case SUBX_RRR_0_OPCODE_X1:
+        gen_subx(dc, rdst, rsrc, rsrcb);
+        return;
+    case UNARY_RRR_0_OPCODE_X1:
+        switch (get_UnaryOpcodeExtension_X1(bundle)) {
+        case NOP_UNARY_OPCODE_X1:
+        case FNOP_UNARY_OPCODE_X1:
+            if (!rdst && !rsrc) {
+                qemu_log_mask(CPU_LOG_TB_IN_ASM, "(f)nop\n");
+                return;
+            }
+            break;
+        case JALRP_UNARY_OPCODE_X1:
+        case JALR_UNARY_OPCODE_X1:
+            if (!rdst) {
+                gen_jalr(dc, rsrc);
+                return;
+            }
+            break;
+        case JRP_UNARY_OPCODE_X1:
+        case JR_UNARY_OPCODE_X1:
+            if (!rdst) {
+                gen_jr(dc, rsrc);
+                return;
+            }
+            break;
+        case LD1S_UNARY_OPCODE_X1:
+            gen_ld(dc, rdst, rsrc, MO_SB, "ld1s");
+            return;
+        case LD1U_UNARY_OPCODE_X1:
+            gen_ld(dc, rdst, rsrc, MO_UB, "ld1u");
+            return;
+        case LD2S_UNARY_OPCODE_X1:
+            gen_ld(dc, rdst, rsrc, MO_LESW, "ld2s");
+            return;
+        case LD2U_UNARY_OPCODE_X1:
+            gen_ld(dc, rdst, rsrc, MO_LEUW, "ld2u");
+            return;
+        case LD4S_UNARY_OPCODE_X1:
+            gen_ld(dc, rdst, rsrc, MO_LESL, "ld4s");
+            return;
+        case LD4U_UNARY_OPCODE_X1:
+            gen_ld(dc, rdst, rsrc, MO_LEUL, "ld4u");
+            return;
+        case LDNA_UNARY_OPCODE_X1:
+        case LD_UNARY_OPCODE_X1:
+            gen_ld(dc, rdst, rsrc, MO_LEQ, "ld(na)");
+            return;
+        case LNK_UNARY_OPCODE_X1:
+            if (!rsrc) {
+                gen_lnk(dc, rdst);
+                return;
+            }
+            break;
+        case MF_UNARY_OPCODE_X1:
+            if (!rdst && !rsrc) {
+                gen_mf(dc);
+                return;
+            }
+            break;
+        case SWINT1_UNARY_OPCODE_X1:
+            if (!rsrc && !rdst) {
+                gen_swint1(dc);
+                return;
+            }
+            break;
+        case WH64_UNARY_OPCODE_X1:
+            if (!rdst) {
+                gen_wh64(dc, rsrc);
+                return;
+            }
+            break;
+        case DRAIN_UNARY_OPCODE_X1:
+        case DTLBPR_UNARY_OPCODE_X1:
+        case FINV_UNARY_OPCODE_X1:
+        case FLUSHWB_UNARY_OPCODE_X1:
+        case FLUSH_UNARY_OPCODE_X1:
+        case ICOH_UNARY_OPCODE_X1:
+        case ILL_UNARY_OPCODE_X1:
+        case INV_UNARY_OPCODE_X1:
+        case IRET_UNARY_OPCODE_X1:
+        case LDNT1S_UNARY_OPCODE_X1:
+        case LDNT1U_UNARY_OPCODE_X1:
+        case LDNT2S_UNARY_OPCODE_X1:
+        case LDNT2U_UNARY_OPCODE_X1:
+        case LDNT4S_UNARY_OPCODE_X1:
+        case LDNT4U_UNARY_OPCODE_X1:
+        case LDNT_UNARY_OPCODE_X1:
+        case NAP_UNARY_OPCODE_X1:
+        case SWINT0_UNARY_OPCODE_X1:
+        case SWINT2_UNARY_OPCODE_X1:
+        case SWINT3_UNARY_OPCODE_X1:
+        default:
+            break;
+        }
+        break;
+    case V1INT_L_RRR_0_OPCODE_X1:
+        gen_v1int_l(dc, rdst, rsrc, rsrcb);
+        return;
+    case V4INT_L_RRR_0_OPCODE_X1:
+        gen_v4int_l(dc, rdst, rsrc, rsrcb);
+        return;
+    case V1CMPEQ_RRR_0_OPCODE_X1:
+        gen_v1cmpeq(dc, rdst, rsrc, rsrcb);
+        return;
+    case XOR_RRR_0_OPCODE_X1:
+        gen_xor(dc, rdst, rsrc, rsrcb);
+        return;
+    case DBLALIGN2_RRR_0_OPCODE_X1:
+    case DBLALIGN4_RRR_0_OPCODE_X1:
+    case DBLALIGN6_RRR_0_OPCODE_X1:
+    case ROTL_RRR_0_OPCODE_X1:
+    case STNT1_RRR_0_OPCODE_X1:
+    case STNT2_RRR_0_OPCODE_X1:
+    case STNT4_RRR_0_OPCODE_X1:
+    case STNT_RRR_0_OPCODE_X1:
+    case SUBXSC_RRR_0_OPCODE_X1:
+    case V1INT_H_RRR_0_OPCODE_X1:
+    case V2INT_H_RRR_0_OPCODE_X1:
+    case V2INT_L_RRR_0_OPCODE_X1:
+    case V4INT_H_RRR_0_OPCODE_X1:
+    case V1ADDUC_RRR_0_OPCODE_X1:
+    case V1ADD_RRR_0_OPCODE_X1:
+    case V1CMPLES_RRR_0_OPCODE_X1:
+    case V1CMPLEU_RRR_0_OPCODE_X1:
+    case V1CMPLTS_RRR_0_OPCODE_X1:
+    case V1CMPLTU_RRR_0_OPCODE_X1:
+    case V1CMPNE_RRR_0_OPCODE_X1:
+    case V1MAXU_RRR_0_OPCODE_X1:
+    case V1MINU_RRR_0_OPCODE_X1:
+    case V1MNZ_RRR_0_OPCODE_X1:
+    case V1MZ_RRR_0_OPCODE_X1:
+    case V1SHL_RRR_0_OPCODE_X1:
+    case V1SHRS_RRR_0_OPCODE_X1:
+    case V1SHRU_RRR_0_OPCODE_X1:
+    case V1SUBUC_RRR_0_OPCODE_X1:
+    case V1SUB_RRR_0_OPCODE_X1:
+    case V2ADDSC_RRR_0_OPCODE_X1:
+    case V2ADD_RRR_0_OPCODE_X1:
+    case V2CMPEQ_RRR_0_OPCODE_X1:
+    case V2CMPLES_RRR_0_OPCODE_X1:
+    case V2CMPLEU_RRR_0_OPCODE_X1:
+    case V2CMPLTS_RRR_0_OPCODE_X1:
+    case V2CMPLTU_RRR_0_OPCODE_X1:
+    case V2CMPNE_RRR_0_OPCODE_X1:
+    case V2MAXS_RRR_0_OPCODE_X1:
+    case V2MINS_RRR_0_OPCODE_X1:
+    case V2MNZ_RRR_0_OPCODE_X1:
+    case V2MZ_RRR_0_OPCODE_X1:
+    case V2PACKH_RRR_0_OPCODE_X1:
+    case V2PACKL_RRR_0_OPCODE_X1:
+    case V2PACKUC_RRR_0_OPCODE_X1:
+    case V2SHLSC_RRR_0_OPCODE_X1:
+    case V2SHL_RRR_0_OPCODE_X1:
+    case V2SHRS_RRR_0_OPCODE_X1:
+    case V2SHRU_RRR_0_OPCODE_X1:
+    case V2SUBSC_RRR_0_OPCODE_X1:
+    case V2SUB_RRR_0_OPCODE_X1:
+    case V4ADDSC_RRR_0_OPCODE_X1:
+    case V4ADD_RRR_0_OPCODE_X1:
+    case V4PACKSC_RRR_0_OPCODE_X1:
+    case V4SHLSC_RRR_0_OPCODE_X1:
+    case V4SHL_RRR_0_OPCODE_X1:
+    case V4SHRS_RRR_0_OPCODE_X1:
+    case V4SHRU_RRR_0_OPCODE_X1:
+    case V4SUBSC_RRR_0_OPCODE_X1:
+    case V4SUB_RRR_0_OPCODE_X1:
+    default:
+        break;
+    }
+    qemu_log_mask(LOG_UNIMP, "UNIMP rrr_0_opcode_x1, [" FMT64X "]\n", bundle);
+    dc->exception = TILEGX_EXCP_OPCODE_UNIMPLEMENTED;
+}
+
+static void decode_shift_opcode_x1(struct DisasContext *dc,
+                                   tilegx_bundle_bits bundle)
+{
+    uint8_t rsrc = get_SrcA_X1(bundle);
+    uint8_t rdst = get_Dest_X1(bundle);
+    uint8_t shamt = get_ShAmt_X1(bundle);
+
+    switch (get_ShiftOpcodeExtension_X1(bundle)) {
+    case SHLI_SHIFT_OPCODE_X1:
+        gen_shli(dc, rdst, rsrc, shamt);
+        return;
+    case SHLXI_SHIFT_OPCODE_X1:
+        gen_shlxi(dc, rdst, rsrc, shamt);
+        return;
+    case SHRSI_SHIFT_OPCODE_X1:
+        gen_shrsi(dc, rdst, rsrc, shamt);
+        return;
+    case SHRUI_SHIFT_OPCODE_X1:
+        gen_shrui(dc, rdst, rsrc, shamt);
+        return;
+    case SHRUXI_SHIFT_OPCODE_X1:
+        gen_shruxi(dc, rdst, rsrc, shamt);
+        return;
+    case V1SHRUI_SHIFT_OPCODE_X1:
+        gen_v1shrui(dc, rdst, rsrc, shamt);
+        return;
+    case ROTLI_SHIFT_OPCODE_X1:
+    case V1SHLI_SHIFT_OPCODE_X1:
+    case V1SHRSI_SHIFT_OPCODE_X1:
+    case V2SHLI_SHIFT_OPCODE_X1:
+    case V2SHRSI_SHIFT_OPCODE_X1:
+    case V2SHRUI_SHIFT_OPCODE_X1:
+    default:
+        break;
+    }
+    qemu_log_mask(LOG_UNIMP, "UNIMP shift_opcode_x1, [" FMT64X "]\n", bundle);
+    dc->exception = TILEGX_EXCP_OPCODE_UNIMPLEMENTED;
+}
+
+static void decode_y0(struct DisasContext *dc, tilegx_bundle_bits bundle)
+{
+    unsigned int opcode = get_Opcode_Y0(bundle);
+    uint8_t rsrc = get_SrcA_Y0(bundle);
+    uint8_t rdst = get_Dest_Y0(bundle);
+    int8_t imm8 = get_Imm8_Y0(bundle);
+
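+    /* Each pipe buffers its destination write in its own tmp_regs slot;
+       pipe Y0 uses slot 0.  */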
+    dc->tmp_regcur = dc->tmp_regs + 0;
+
+    switch (opcode) {
+    case ADDI_OPCODE_Y0:
+        gen_addimm(dc, rdst, rsrc, imm8);
+        return;
+    case ADDXI_OPCODE_Y0:
+        gen_addximm(dc, rdst, rsrc, imm8);
+        return;
+    case ANDI_OPCODE_Y0:
+        gen_andi(dc, rdst, rsrc, imm8);
+        return;
+    case CMPEQI_OPCODE_Y0:
+        gen_cmpi(dc, rdst, rsrc, imm8, TCG_COND_EQ, "cmpeqi");
+        return;
+    case CMPLTSI_OPCODE_Y0:
+        gen_cmpi(dc, rdst, rsrc, imm8, TCG_COND_LT, "cmpltsi");
+        return;
+    case RRR_0_OPCODE_Y0:
+        decode_rrr_0_opcode_y0(dc, bundle);
+        return;
+    case RRR_1_OPCODE_Y0:
+        decode_rrr_1_opcode_y0(dc, bundle);
+        return;
+    case RRR_2_OPCODE_Y0:
+        decode_rrr_2_opcode_y0(dc, bundle);
+        return;
+    case RRR_3_OPCODE_Y0:
+        decode_rrr_3_opcode_y0(dc, bundle);
+        return;
+    case RRR_4_OPCODE_Y0:
+        decode_rrr_4_opcode_y0(dc, bundle);
+        return;
+    case RRR_5_OPCODE_Y0:
+        decode_rrr_5_opcode_y0(dc, bundle);
+        return;
+    case RRR_6_OPCODE_Y0:
+        decode_rrr_6_opcode_y0(dc, bundle);
+        return;
+    case RRR_9_OPCODE_Y0:
+        decode_rrr_9_opcode_y0(dc, bundle);
+        return;
+    case SHIFT_OPCODE_Y0:
+        decode_shift_opcode_y0(dc, bundle);
+        return;
+    case RRR_7_OPCODE_Y0:
+    case RRR_8_OPCODE_Y0:
+    default:
+        qemu_log_mask(LOG_UNIMP,
+                      "UNIMP y0, opcode %d, bundle [" FMT64X "]\n",
+                      opcode, bundle);
+        dc->exception = TILEGX_EXCP_OPCODE_UNIMPLEMENTED;
+        return;
+    }
+}
+
+static void decode_y1(struct DisasContext *dc, tilegx_bundle_bits bundle)
+{
+    unsigned int opcode = get_Opcode_Y1(bundle);
+    uint8_t rsrc = get_SrcA_Y1(bundle);
+    uint8_t rdst = get_Dest_Y1(bundle);
+    int8_t imm8 = get_Imm8_Y1(bundle);
+
+    dc->tmp_regcur = dc->tmp_regs + 1;
+
+    switch (opcode) {
+    case ADDI_OPCODE_Y1:
+        gen_addimm(dc, rdst, rsrc, imm8);
+        return;
+    case ADDXI_OPCODE_Y1:
+        gen_addximm(dc, rdst, rsrc, imm8);
+        return;
+    case ANDI_OPCODE_Y1:
+        gen_andi(dc, rdst, rsrc, imm8);
+        return;
+    case CMPEQI_OPCODE_Y1:
+        gen_cmpi(dc, rdst, rsrc, imm8, TCG_COND_EQ, "cmpeqi");
+        return;
+    case CMPLTSI_OPCODE_Y1:
+        gen_cmpi(dc, rdst, rsrc, imm8, TCG_COND_LT, "cmpltsi");
+        return;
+    case RRR_0_OPCODE_Y1:
+        decode_rrr_0_opcode_y1(dc, bundle);
+        return;
+    case RRR_1_OPCODE_Y1:
+        decode_rrr_1_opcode_y1(dc, bundle);
+        return;
+    case RRR_2_OPCODE_Y1:
+        decode_rrr_2_opcode_y1(dc, bundle);
+        return;
+    case RRR_3_OPCODE_Y1:
+        decode_rrr_3_opcode_y1(dc, bundle);
+        return;
+    case RRR_5_OPCODE_Y1:
+        decode_rrr_5_opcode_y1(dc, bundle);
+        return;
+    case SHIFT_OPCODE_Y1:
+        decode_shift_opcode_y1(dc, bundle);
+        return;
+    case RRR_4_OPCODE_Y1:
+    case RRR_6_OPCODE_Y1:
+    case RRR_7_OPCODE_Y1:
+    default:
+        qemu_log_mask(LOG_UNIMP,
+                      "UNIMP y1, opcode %d, bundle [" FMT64X "]\n",
+                      opcode, bundle);
+        dc->exception = TILEGX_EXCP_OPCODE_UNIMPLEMENTED;
+        return;
+    }
+}
+
+static void decode_y2(struct DisasContext *dc, tilegx_bundle_bits bundle)
+{
+    unsigned int opcode = get_Opcode_Y2(bundle);
+
+    dc->tmp_regcur = dc->tmp_regs + 2;
+
+    switch (opcode) {
+    case 0: /* LD1S_OPCODE_Y2, ST1_OPCODE_Y2 */
+        decode_ldst0_opcode_y2(dc, bundle);
+        return;
+    case 1: /* LD4S_OPCODE_Y2, LD1U_OPCODE_Y2, ST2_OPCODE_Y2 */
+        decode_ldst1_opcode_y2(dc, bundle);
+        return;
+    case 2: /* LD2S_OPCODE_Y2, LD4U_OPCODE_Y2, ST4_OPCODE_Y2 */
+        decode_ldst2_opcode_y2(dc, bundle);
+        return;
+    case 3: /* LD_OPCODE_Y2, ST_OPCODE_Y2, LD2U_OPCODE_Y2 */
+        decode_ldst3_opcode_y2(dc, bundle);
+        return;
+    default:
+        qemu_log_mask(LOG_UNIMP,
+                      "UNIMP y2, opcode %d, bundle [" FMT64X "]\n",
+                      opcode, bundle);
+        dc->exception = TILEGX_EXCP_OPCODE_UNIMPLEMENTED;
+        return;
+    }
+}
+
+static void decode_x0(struct DisasContext *dc, tilegx_bundle_bits bundle)
+{
+    unsigned int opcode = get_Opcode_X0(bundle);
+    uint8_t rsrc = get_SrcA_X0(bundle);
+    uint8_t rdst = get_Dest_X0(bundle);
+    int16_t imm16 = get_Imm16_X0(bundle);
+
+    dc->tmp_regcur = dc->tmp_regs + 0;
+
+    switch (opcode) {
+    case ADDLI_OPCODE_X0:
+        gen_addimm(dc, rdst, rsrc, imm16);
+        return;
+    case ADDXLI_OPCODE_X0:
+        gen_addximm(dc, rdst, rsrc, imm16);
+        return;
+    case BF_OPCODE_X0:
+        decode_bf_opcode_x0(dc, bundle);
+        return;
+    case IMM8_OPCODE_X0:
+        decode_imm8_opcode_x0(dc, bundle);
+        return;
+    case RRR_0_OPCODE_X0:
+        decode_rrr_0_opcode_x0(dc, bundle);
+        return;
+    case SHIFT_OPCODE_X0:
+        decode_shift_opcode_x0(dc, bundle);
+        return;
+    case SHL16INSLI_OPCODE_X0:
+        gen_shl16insli(dc, rdst, rsrc, (uint16_t)imm16);
+        return;
+    default:
+        qemu_log_mask(LOG_UNIMP,
+                      "UNIMP x0, opcode %d, bundle [" FMT64X "]\n",
+                      opcode, bundle);
+        dc->exception = TILEGX_EXCP_OPCODE_UNIMPLEMENTED;
+        return;
+    }
+}
+
+static void decode_x1(struct DisasContext *dc, tilegx_bundle_bits bundle)
+{
+    unsigned int opcode = get_Opcode_X1(bundle);
+    uint8_t rsrc = get_SrcA_X1(bundle);
+    uint8_t rdst = get_Dest_X1(bundle);
+    int16_t imm16 = get_Imm16_X1(bundle);
+
+    dc->tmp_regcur = dc->tmp_regs + 1;
+
+    switch (opcode) {
+    case ADDLI_OPCODE_X1:
+        gen_addimm(dc, rdst, rsrc, imm16);
+        return;
+    case ADDXLI_OPCODE_X1:
+        gen_addximm(dc, rdst, rsrc, imm16);
+        return;
+    case BRANCH_OPCODE_X1:
+        decode_branch_opcode_x1(dc, bundle);
+        return;
+    case IMM8_OPCODE_X1:
+        decode_imm8_opcode_x1(dc, bundle);
+        return;
+    case JUMP_OPCODE_X1:
+        decode_jump_opcode_x1(dc, bundle);
+        return;
+    case RRR_0_OPCODE_X1:
+        decode_rrr_0_opcode_x1(dc, bundle);
+        return;
+    case SHIFT_OPCODE_X1:
+        decode_shift_opcode_x1(dc, bundle);
+        return;
+    case SHL16INSLI_OPCODE_X1:
+        gen_shl16insli(dc, rdst, rsrc, (uint16_t)imm16);
+        return;
+    default:
+        qemu_log_mask(LOG_UNIMP,
+                      "UNIMP x1, opcode %d, bundle [" FMT64X "]\n",
+                      opcode, bundle);
+        dc->exception = TILEGX_EXCP_OPCODE_UNIMPLEMENTED;
+        return;
+    }
+}
+
+static void translate_one_bundle(struct DisasContext *dc, uint64_t bundle)
+{
+    int i;
+    TCGv tmp;
+
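+    /* Mark every per-bundle temporary as unused before decoding.  */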
+    for (i = 0; i < TILEGX_MAX_INSTRUCTIONS_PER_BUNDLE; i++) {
+        dc->tmp_regs[i].idx = TILEGX_R_NOREG;
+        TCGV_UNUSED_I64(dc->tmp_regs[i].val);
+    }
+
+    if (unlikely(qemu_loglevel_mask(CPU_LOG_TB_OP | CPU_LOG_TB_OP_OPT))) {
+        tcg_gen_debug_insn_start(dc->pc);
+    }
+
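+    /* In Y mode a bundle carries three instructions (Y0, Y1, Y2);
+       otherwise it carries two (X0, X1).  */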
+    if (get_Mode(bundle)) {
+        decode_y0(dc, bundle);
+        decode_y1(dc, bundle);
+        decode_y2(dc, bundle);
+    } else {
+        decode_x0(dc, bundle);
+        decode_x1(dc, bundle);
+    }
+
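+    /* Commit the buffered destination writes now that the whole bundle
+       has been decoded (writes to pseudo-registers are dropped), then
+       free the temporaries.  */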
+    for (i = 0; i < TILEGX_MAX_INSTRUCTIONS_PER_BUNDLE; i++) {
+        if (dc->tmp_regs[i].idx == TILEGX_R_NOREG) {
+            continue;
+        }
+        if (dc->tmp_regs[i].idx < TILEGX_R_COUNT) {
+            tcg_gen_mov_i64(cpu_regs[dc->tmp_regs[i].idx], dc->tmp_regs[i].val);
+        }
+        tcg_temp_free_i64(dc->tmp_regs[i].val);
+    }
+
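+    /* Resolve any branch queued by this bundle: an unconditional jump
+       copies the target into pc, a conditional one selects between the
+       target and the fall-through address; either way the TB ends.  */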
+    if (dc->jmp.cond != TCG_COND_NEVER) {
+        if (dc->jmp.cond == TCG_COND_ALWAYS) {
+            tcg_gen_mov_i64(cpu_pc, dc->jmp.dest);
+        } else {
+            tmp = tcg_const_i64(dc->pc + TILEGX_BUNDLE_SIZE_IN_BYTES);
+            tcg_gen_movcond_i64(dc->jmp.cond, cpu_pc,
+                                dc->jmp.val1, dc->jmp.val2,
+                                dc->jmp.dest, tmp);
+            tcg_temp_free_i64(dc->jmp.val1);
+            tcg_temp_free_i64(dc->jmp.val2);
+            tcg_temp_free_i64(tmp);
+        }
+        tcg_temp_free_i64(dc->jmp.dest);
+        tcg_gen_exit_tb(0);
+    }
+}
+
+static inline void gen_intermediate_code_internal(TileGXCPU *cpu,
+                                                  TranslationBlock *tb,
+                                                  bool search_pc)
+{
+    DisasContext ctx;
+    DisasContext *dc = &ctx;
+
+    CPUTLGState *env = &cpu->env;
+    uint64_t pc_start = tb->pc;
+    uint64_t next_page_start = (pc_start & TARGET_PAGE_MASK) + TARGET_PAGE_SIZE;
+    int j, lj = -1;
+    int num_insns = 0;
+    int max_insns = tb->cflags & CF_COUNT_MASK;
+
+    dc->pc = pc_start;
+    dc->exception = TILEGX_EXCP_NONE;
+    dc->jmp.cond = TCG_COND_NEVER;
+    TCGV_UNUSED_I64(dc->jmp.dest);
+    TCGV_UNUSED_I64(dc->jmp.val1);
+    TCGV_UNUSED_I64(dc->jmp.val2);
+
+    if (!max_insns) {
+        max_insns = CF_COUNT_MASK;
+    }
+    gen_tb_start(tb);
+
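+    /* Translate one bundle at a time until a branch or exception ends
+       the TB, the next page is reached, or the instruction / op-buffer
+       budget is exhausted.  */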
+    do {
+        TCGV_UNUSED_I64(dc->zero);
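+        /* When searching for a PC, record each bundle's pc and icount
+           so restore_state_to_opc() can map an op index back to it.  */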
+        if (search_pc) {
+            j = tcg_op_buf_count();
+            if (lj < j) {
+                lj++;
+                while (lj < j) {
+                    tcg_ctx.gen_opc_instr_start[lj++] = 0;
+                }
+            }
+            tcg_ctx.gen_opc_pc[lj] = dc->pc;
+            tcg_ctx.gen_opc_instr_start[lj] = 1;
+            tcg_ctx.gen_opc_icount[lj] = num_insns;
+        }
+        translate_one_bundle(dc, cpu_ldq_data(env, dc->pc));
+        num_insns++;
+        dc->pc += TILEGX_BUNDLE_SIZE_IN_BYTES;
+        if (dc->exception != TILEGX_EXCP_NONE) {
+            gen_exception(dc, dc->exception);
+            break;
+        }
+    } while (dc->jmp.cond == TCG_COND_NEVER && dc->pc < next_page_start
+             && num_insns < max_insns && !tcg_op_buf_full());
+
+    if (dc->jmp.cond == TCG_COND_NEVER) {
+        tcg_gen_movi_i64(cpu_pc, dc->pc);
+        tcg_gen_exit_tb(0);
+    }
+
+    gen_tb_end(tb, num_insns);
+    if (search_pc) {
+        j = tcg_op_buf_count();
+        lj++;
+        while (lj <= j) {
+            tcg_ctx.gen_opc_instr_start[lj++] = 0;
+        }
+    } else {
+        tb->size = dc->pc - pc_start;
+        tb->icount = num_insns;
+    }
+}
+
+void gen_intermediate_code(CPUTLGState *env, struct TranslationBlock *tb)
+{
+    gen_intermediate_code_internal(tilegx_env_get_cpu(env), tb, false);
+}
+
+void gen_intermediate_code_pc(CPUTLGState *env, struct TranslationBlock *tb)
+{
+    gen_intermediate_code_internal(tilegx_env_get_cpu(env), tb, true);
+}
+
+void restore_state_to_opc(CPUTLGState *env, TranslationBlock *tb, int pc_pos)
+{
+    env->pc = tcg_ctx.gen_opc_pc[pc_pos];
+}
+
+void tilegx_tcg_init(void)
+{
+    int i;
+
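+    /* Register the TCG globals backing the architectural state: pc,
+       the general-purpose registers, the SPRs, and (for user-mode
+       emulation) the exception parameter.  */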
+    cpu_env = tcg_global_reg_new_ptr(TCG_AREG0, "env");
+    cpu_pc = tcg_global_mem_new_i64(TCG_AREG0, offsetof(CPUTLGState, pc), "pc");
+    for (i = 0; i < TILEGX_R_COUNT; i++) {
+        cpu_regs[i] = tcg_global_mem_new_i64(TCG_AREG0,
+                                             offsetof(CPUTLGState, regs[i]),
+                                             reg_names[i]);
+    }
+    for (i = 0; i < TILEGX_SPR_COUNT; i++) {
+        cpu_spregs[i] = tcg_global_mem_new_i64(TCG_AREG0,
+                                               offsetof(CPUTLGState, spregs[i]),
+                                               spreg_names[i]);
+    }
+#if defined(CONFIG_USER_ONLY)
+    cpu_excparam  = tcg_global_mem_new_i32(TCG_AREG0,
+                                           offsetof(CPUTLGState, excparam),
+                                           "cpu_excparam");
+#endif
+}