diff mbox

[05/14] tcg-sparc: Simplify qemu_ld/st direct memory paths.

Message ID 1332894743-27418-6-git-send-email-rth@twiddle.net
State New
Headers show

Commit Message

Richard Henderson March 28, 2012, 12:32 a.m. UTC
Given that we have an opcode for every access size in both endiannesses,
turn the functions into a simple table lookup.

Signed-off-by: Richard Henderson <rth@twiddle.net>
---
 tcg/sparc/tcg-target.c |  384 +++++++++++++++++++-----------------------------
 1 files changed, 150 insertions(+), 234 deletions(-)

Comments

Blue Swirl March 29, 2012, 6:47 p.m. UTC | #1
On Wed, Mar 28, 2012 at 00:32, Richard Henderson <rth@twiddle.net> wrote:
> Given that we have an opcode for all sizes, all endianness,
> turn the functions into a simple table lookup.
>
> Signed-off-by: Richard Henderson <rth@twiddle.net>
> ---
>  tcg/sparc/tcg-target.c |  384 +++++++++++++++++++-----------------------------
>  1 files changed, 150 insertions(+), 234 deletions(-)
>
> diff --git a/tcg/sparc/tcg-target.c b/tcg/sparc/tcg-target.c
> index c74fc2c..5cea5a8 100644
> --- a/tcg/sparc/tcg-target.c
> +++ b/tcg/sparc/tcg-target.c
> @@ -294,6 +294,16 @@ static inline int tcg_target_const_match(tcg_target_long val,
>  #define ASI_PRIMARY_LITTLE 0x88
>  #endif
>
> +#define LDUH_LE    (LDUHA | INSN_ASI(ASI_PRIMARY_LITTLE))
> +#define LDSH_LE    (LDSHA | INSN_ASI(ASI_PRIMARY_LITTLE))
> +#define LDUW_LE    (LDUWA | INSN_ASI(ASI_PRIMARY_LITTLE))
> +#define LDSW_LE    (LDSWA | INSN_ASI(ASI_PRIMARY_LITTLE))
> +#define LDX_LE     (LDXA  | INSN_ASI(ASI_PRIMARY_LITTLE))
> +
> +#define STH_LE     (STHA  | INSN_ASI(ASI_PRIMARY_LITTLE))
> +#define STW_LE     (STWA  | INSN_ASI(ASI_PRIMARY_LITTLE))
> +#define STX_LE     (STXA  | INSN_ASI(ASI_PRIMARY_LITTLE))
> +
>  static inline void tcg_out_arith(TCGContext *s, int rd, int rs1, int rs2,
>                                  int op)
>  {
> @@ -366,66 +376,46 @@ static inline void tcg_out_movi(TCGContext *s, TCGType type,
>     }
>  }
>
> -static inline void tcg_out_ld_raw(TCGContext *s, int ret,
> -                                  tcg_target_long arg)
> +static inline void tcg_out_ldst_rr(TCGContext *s, int data, int a1,
> +                                   int a2, int op)
>  {
> -    tcg_out_sethi(s, ret, arg);
> -    tcg_out32(s, LDUW | INSN_RD(ret) | INSN_RS1(ret) |
> -              INSN_IMM13(arg & 0x3ff));
> +    tcg_out32(s, op | INSN_RD(data) | INSN_RS1(a1) | INSN_RS2(a2));
>  }
>
> -static inline void tcg_out_ld_ptr(TCGContext *s, int ret,
> -                                  tcg_target_long arg)
> +static inline void tcg_out_ldst(TCGContext *s, int ret, int addr,
> +                                int offset, int op)
>  {
> -    if (!check_fit_tl(arg, 10))
> -        tcg_out_movi(s, TCG_TYPE_PTR, ret, arg & ~0x3ffULL);
> -    if (TCG_TARGET_REG_BITS == 64) {
> -        tcg_out32(s, LDX | INSN_RD(ret) | INSN_RS1(ret) |
> -                  INSN_IMM13(arg & 0x3ff));
> -    } else {
> -        tcg_out32(s, LDUW | INSN_RD(ret) | INSN_RS1(ret) |
> -                  INSN_IMM13(arg & 0x3ff));
> -    }
> -}
> -
> -static inline void tcg_out_ldst(TCGContext *s, int ret, int addr, int offset, int op)
> -{
> -    if (check_fit_tl(offset, 13))
> +    if (check_fit_tl(offset, 13)) {
>         tcg_out32(s, op | INSN_RD(ret) | INSN_RS1(addr) |
>                   INSN_IMM13(offset));
> -    else {
> +    } else {
>         tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_I5, offset);
> -        tcg_out32(s, op | INSN_RD(ret) | INSN_RS1(TCG_REG_I5) |
> -                  INSN_RS2(addr));
> +        tcg_out_ldst_rr(s, ret, addr, TCG_REG_I5, op);
>     }
>  }
>
> -static inline void tcg_out_ldst_asi(TCGContext *s, int ret, int addr,
> -                                    int offset, int op, int asi)
> -{
> -    tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_I5, offset);
> -    tcg_out32(s, op | INSN_RD(ret) | INSN_RS1(TCG_REG_I5) |
> -              INSN_ASI(asi) | INSN_RS2(addr));
> -}
> -
>  static inline void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
>                               TCGReg arg1, tcg_target_long arg2)
>  {
> -    if (type == TCG_TYPE_I32)
> -        tcg_out_ldst(s, ret, arg1, arg2, LDUW);
> -    else
> -        tcg_out_ldst(s, ret, arg1, arg2, LDX);
> +    tcg_out_ldst(s, ret, arg1, arg2, (type == TCG_TYPE_I32 ? LDUW : LDX));
>  }
>
>  static inline void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
>                               TCGReg arg1, tcg_target_long arg2)
>  {
> -    if (type == TCG_TYPE_I32)
> -        tcg_out_ldst(s, arg, arg1, arg2, STW);
> -    else
> -        tcg_out_ldst(s, arg, arg1, arg2, STX);
> +    tcg_out_ldst(s, arg, arg1, arg2, (type == TCG_TYPE_I32 ? STW : STX));
> +}
> +
> +static inline void tcg_out_ld_ptr(TCGContext *s, int ret,
> +                                  tcg_target_long arg)
> +{
> +    if (!check_fit_tl(arg, 10)) {
> +        tcg_out_movi(s, TCG_TYPE_PTR, ret, arg & ~0x3ff);
> +    }
> +    tcg_out_ld(s, TCG_TYPE_PTR, ret, ret, arg & 0x3ff);
>  }
>
> +
>  static inline void tcg_out_sety(TCGContext *s, int rs)
>  {
>     tcg_out32(s, WRY | INSN_RS1(TCG_REG_G0) | INSN_RS2(rs));
> @@ -757,22 +747,16 @@ static const void * const qemu_st_helpers[4] = {
>    WHICH is the offset into the CPUTLBEntry structure of the slot to read.
>    This should be offsetof addr_read or addr_write.
>
> -   Outputs:
> -   LABEL_PTRS is filled with the position of the forward jumps to the
> -   TLB miss case.  This will always be a ,PN insn, so a 19-bit offset.
> -
> -   Returns a register loaded with the low part of the address, adjusted
> -   as indicated by the TLB and so is a host address.  Undefined in the
> -   TLB miss case.  */
> +   The result of the TLB comparison is in %[ix]cc.  The sanitized address
> +   is in the returned register, maybe %o0.  The TLB addend is in %o1.  */
>
>  static int tcg_out_tlb_load(TCGContext *s, int addrlo_idx, int mem_index,
> -                            int s_bits, const TCGArg *args,
> -                            uint32_t **label_ptr, int which)
> +                            int s_bits, const TCGArg *args, int which)
>  {
>     const int addrlo = args[addrlo_idx];
> -    const int r0 = tcg_target_call_iarg_regs[0];
> -    const int r1 = tcg_target_call_iarg_regs[1];
> -    const int r2 = tcg_target_call_iarg_regs[2];
> +    const int r0 = TCG_REG_O0;
> +    const int r1 = TCG_REG_O1;
> +    const int r2 = TCG_REG_O2;
>     int addr = addrlo;
>     int tlb_ofs;
>
> @@ -803,110 +787,39 @@ static int tcg_out_tlb_load(TCGContext *s, int addrlo_idx, int mem_index,
>         tlb_ofs = 0;
>     }
>
> -    /* ld [arg1 + which], arg2 */
> +    /* Load the tlb comparator and the addend.  */
>     tcg_out_ld(s, TCG_TYPE_TL, r2, r1, tlb_ofs + which);
> +    tcg_out_ld(s, TCG_TYPE_PTR, r1, r1, tlb_ofs+offsetof(CPUTLBEntry, addend));
>
>     /* subcc arg0, arg2, %g0 */
>     tcg_out_cmp(s, r0, r2, 0);
>
> -    /* bne,pn %[ix]cc, label0 */
> -    *label_ptr = (uint32_t *)s->code_ptr;
> -    tcg_out32(s, (INSN_OP(0) | INSN_COND(COND_NE, 0) | INSN_OP2(0x1) |
> -                  ((TARGET_LONG_BITS == 64) << 21)));
> -
> -    /* TLB Hit.  Compute the host address into r1.  The ld is in the
> -       branch delay slot; harmless for the TLB miss case.  */
> -    tcg_out_ld(s, TCG_TYPE_PTR, r1, r1, tlb_ofs+offsetof(CPUTLBEntry, addend));
> -
> +    /* If the guest address must be zero-extended, do so now.  */
>     if (TCG_TARGET_REG_BITS == 64 && TARGET_LONG_BITS == 32) {
>         tcg_out_arithi(s, r0, addrlo, 0, SHIFT_SRL);
> -        tcg_out_arith(s, r1, r0, r1, ARITH_ADD);
> -    } else {
> -        tcg_out_arith(s, r1, addrlo, r1, ARITH_ADD);
> +        return r0;
>     }
> -
> -    return r1;
> +    return addrlo;
>  }
>  #endif /* CONFIG_SOFTMMU */
>
> -static void tcg_out_qemu_ld_direct(TCGContext *s, int addr, int datalo,
> -                                   int datahi, int sizeop)
> -{
> +static const int qemu_ld_opc[8] = {
>  #ifdef TARGET_WORDS_BIGENDIAN
> -    const int bigendian = 1;
> +    LDUB, LDUH, LDUW, LDX, LDSB, LDSH, LDSW, LDX
>  #else
> -    const int bigendian = 0;
> +    LDUB, LDUH_LE, LDUW_LE, LDX_LE, LDSB, LDSH_LE, LDSW_LE, LDX_LE
>  #endif
> -    switch (sizeop) {
> -    case 0:
> -        /* ldub [addr], datalo */
> -        tcg_out_ldst(s, datalo, addr, 0, LDUB);
> -        break;
> -    case 0 | 4:
> -        /* ldsb [addr], datalo */
> -        tcg_out_ldst(s, datalo, addr, 0, LDSB);
> -        break;
> -    case 1:
> -        if (bigendian) {
> -            /* lduh [addr], datalo */
> -            tcg_out_ldst(s, datalo, addr, 0, LDUH);
> -        } else {
> -            /* lduha [addr] ASI_PRIMARY_LITTLE, datalo */
> -            tcg_out_ldst_asi(s, datalo, addr, 0, LDUHA, ASI_PRIMARY_LITTLE);
> -        }
> -        break;
> -    case 1 | 4:
> -        if (bigendian) {
> -            /* ldsh [addr], datalo */
> -            tcg_out_ldst(s, datalo, addr, 0, LDSH);
> -        } else {
> -            /* ldsha [addr] ASI_PRIMARY_LITTLE, datalo */
> -            tcg_out_ldst_asi(s, datalo, addr, 0, LDSHA, ASI_PRIMARY_LITTLE);
> -        }
> -        break;
> -    case 2:
> -        if (bigendian) {
> -            /* lduw [addr], datalo */
> -            tcg_out_ldst(s, datalo, addr, 0, LDUW);
> -        } else {
> -            /* lduwa [addr] ASI_PRIMARY_LITTLE, datalo */
> -            tcg_out_ldst_asi(s, datalo, addr, 0, LDUWA, ASI_PRIMARY_LITTLE);
> -        }
> -        break;
> -    case 2 | 4:
> -        if (bigendian) {
> -            /* ldsw [addr], datalo */
> -            tcg_out_ldst(s, datalo, addr, 0, LDSW);
> -        } else {
> -            /* ldswa [addr] ASI_PRIMARY_LITTLE, datalo */
> -            tcg_out_ldst_asi(s, datalo, addr, 0, LDSWA, ASI_PRIMARY_LITTLE);
> -        }
> -        break;
> -    case 3:
> -        if (TCG_TARGET_REG_BITS == 64) {
> -            if (bigendian) {
> -                /* ldx [addr], datalo */
> -                tcg_out_ldst(s, datalo, addr, 0, LDX);
> -            } else {
> -                /* ldxa [addr] ASI_PRIMARY_LITTLE, datalo */
> -                tcg_out_ldst_asi(s, datalo, addr, 0, LDXA, ASI_PRIMARY_LITTLE);
> -            }
> -        } else {
> -            if (bigendian) {
> -                tcg_out_ldst(s, datahi, addr, 0, LDUW);
> -                tcg_out_ldst(s, datalo, addr, 4, LDUW);
> -            } else {
> -                tcg_out_ldst_asi(s, datalo, addr, 0, LDUWA, ASI_PRIMARY_LITTLE);
> -                tcg_out_ldst_asi(s, datahi, addr, 4, LDUWA, ASI_PRIMARY_LITTLE);
> -            }
> -        }
> -        break;
> -    default:
> -        tcg_abort();
> -    }
> -}
> +};
>
> -static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, int opc)
> +static const int qemu_st_opc[4] = {
> +#ifdef TARGET_WORDS_BIGENDIAN
> +    STB, STH, STW, STX
> +#else
> +    STB, STH_LE, STW_LE, STX_LE
> +#endif
> +};
> +
> +static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, int sizeop)
>  {
>     int addrlo_idx = 1, datalo, datahi, addr_reg;
>  #if defined(CONFIG_SOFTMMU)
> @@ -915,7 +828,7 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, int opc)
>  #endif
>
>     datahi = datalo = args[0];
> -    if (TCG_TARGET_REG_BITS == 32 && opc == 3) {
> +    if (TCG_TARGET_REG_BITS == 32 && sizeop == 3) {
>         datahi = args[1];
>         addrlo_idx = 2;
>     }
> @@ -923,27 +836,59 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, int opc)
>  #if defined(CONFIG_SOFTMMU)
>     memi_idx = addrlo_idx + 1 + (TARGET_LONG_BITS > TCG_TARGET_REG_BITS);
>     memi = args[memi_idx];
> -    s_bits = opc & 3;
> +    s_bits = sizeop & 3;
>
>     addr_reg = tcg_out_tlb_load(s, addrlo_idx, memi, s_bits, args,
> -                                label_ptr, offsetof(CPUTLBEntry, addr_read));
> +                                offsetof(CPUTLBEntry, addr_read));
>
> -    /* TLB Hit.  */
> -    tcg_out_qemu_ld_direct(s, addr_reg, datalo, datahi, opc);
> +    if (TCG_TARGET_REG_BITS == 32 && sizeop == 3) {
> +        int reg64;
>
> -    /* b,pt,n label1 */
> -    label_ptr[1] = (uint32_t *)s->code_ptr;
> -    tcg_out32(s, (INSN_OP(0) | INSN_COND(COND_A, 0) | INSN_OP2(0x1)
> -                  | (1 << 29) | (1 << 19)));
> +        /* bne,pn %[xi]cc, label0 */
> +        label_ptr[0] = (uint32_t *)s->code_ptr;
> +        tcg_out32(s, (INSN_OP(0) | INSN_COND(COND_NE, 0) | INSN_OP2(0x1)
> +                      | ((TARGET_LONG_BITS == 64) << 21)));
> +
> +        /* TLB Hit.  */
> +        /* Load all 64-bits into an O/G register.  */
> +        reg64 = (datalo < 16 ? datalo : TCG_REG_O0);
> +        tcg_out_ldst_rr(s, reg64, addr_reg, TCG_REG_O1, qemu_ld_opc[sizeop]);
> +
> +        /* Move the two 32-bit pieces into the destination registers.  */
> +        tcg_out_arithi(s, datahi, reg64, 32, SHIFT_SRLX);
> +        if (reg64 != datalo) {
> +            tcg_out_mov(s, TCG_TYPE_I32, datalo, reg64);
> +        }
> +
> +        /* b,pt,n label1 */
> +        label_ptr[1] = (uint32_t *)s->code_ptr;
> +        tcg_out32(s, (INSN_OP(0) | INSN_COND(COND_A, 0) | INSN_OP2(0x1)
> +                      | (1 << 29) | (1 << 19)));
> +    } else {
> +        /* The fast path is exactly one insn.  Thus we can perform the
> +           entire TLB Hit in the (annulled) delay slot of the branch
> +           over the TLB Miss case.  */
> +
> +        /* beq,a,pt %[xi]cc, label0 */
> +        label_ptr[0] = NULL;
> +        label_ptr[1] = (uint32_t *)s->code_ptr;
> +        tcg_out32(s, (INSN_OP(0) | INSN_COND(COND_E, 0) | INSN_OP2(0x1)
> +                      | ((TARGET_LONG_BITS == 64) << 21)
> +                      | (1 << 29) | (1 << 19)));
> +        /* delay slot */
> +        tcg_out_ldst_rr(s, datalo, addr_reg, TCG_REG_O1, qemu_ld_opc[sizeop]);
> +    }
>
>     /* TLB Miss.  */
>
> -    *label_ptr[0] |= INSN_OFF19((unsigned long)s->code_ptr -
> -                                (unsigned long)label_ptr[0]);
> -    n = 0;
> -#ifdef CONFIG_TCG_PASS_AREG0
> -    tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[n++], TCG_AREG0);
> -#endif
> +    if (label_ptr[0]) {
> +        *label_ptr[0] |= INSN_OFF19((unsigned long)s->code_ptr -
> +                                    (unsigned long)label_ptr[0]);
> +    }
> +    n = ARG_OFFSET;
> +    if (ARG_OFFSET) {
> +       tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);

WARNING: suspect code indent for conditional statements (4, 7)
#395: FILE: tcg/sparc/tcg-target.c:889:
+    if (ARG_OFFSET) {
+       tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);

WARNING: suspect code indent for conditional statements (4, 9)
#542: FILE: tcg/sparc/tcg-target.c:1013:
+    if (ARG_OFFSET) {
+         tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);

total: 0 errors, 2 warnings, 525 lines checked

> +    }
>     if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
>         tcg_out_mov(s, TCG_TYPE_REG, tcg_target_call_iarg_regs[n++],
>                     args[addrlo_idx + 1]);
> @@ -971,7 +916,7 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, int opc)
>
>     n = tcg_target_call_oarg_regs[0];
>     /* datalo = sign_extend(arg0) */
> -    switch(opc) {
> +    switch (sizeop) {
>     case 0 | 4:
>         /* Recall that SRA sign extends from bit 31 through bit 63.  */
>         tcg_out_arithi(s, datalo, n, 24, SHIFT_SLL);
> @@ -1008,75 +953,31 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, int opc)
>         tcg_out_arithi(s, TCG_REG_I5, addr_reg, 0, SHIFT_SRL);
>         addr_reg = TCG_REG_I5;
>     }
> -    tcg_out_qemu_ld_direct(s, addr_reg, datalo, datahi, opc);
> -#endif /* CONFIG_SOFTMMU */
> -}
> +    if (TCG_TARGET_REG_BITS == 32 && sizeop == 3) {
> +        int reg64 = (datalo < 16 ? datalo : TCG_REG_O0);
>
> -static void tcg_out_qemu_st_direct(TCGContext *s, int addr, int datalo,
> -                                   int datahi, int sizeop)
> -{
> -#ifdef TARGET_WORDS_BIGENDIAN
> -    const int bigendian = 1;
> -#else
> -    const int bigendian = 0;
> -#endif
> -    switch (sizeop) {
> -    case 0:
> -        /* stb datalo, [addr] */
> -        tcg_out_ldst(s, datalo, addr, 0, STB);
> -        break;
> -    case 1:
> -        if (bigendian) {
> -            /* sth datalo, [addr] */
> -            tcg_out_ldst(s, datalo, addr, 0, STH);
> -        } else {
> -            /* stha datalo, [addr] ASI_PRIMARY_LITTLE */
> -            tcg_out_ldst_asi(s, datalo, addr, 0, STHA, ASI_PRIMARY_LITTLE);
> -        }
> -        break;
> -    case 2:
> -        if (bigendian) {
> -            /* stw datalo, [addr] */
> -            tcg_out_ldst(s, datalo, addr, 0, STW);
> -        } else {
> -            /* stwa datalo, [addr] ASI_PRIMARY_LITTLE */
> -            tcg_out_ldst_asi(s, datalo, addr, 0, STWA, ASI_PRIMARY_LITTLE);
> -        }
> -        break;
> -    case 3:
> -        if (TCG_TARGET_REG_BITS == 64) {
> -            if (bigendian) {
> -                /* stx datalo, [addr] */
> -                tcg_out_ldst(s, datalo, addr, 0, STX);
> -            } else {
> -                /* stxa datalo, [addr] ASI_PRIMARY_LITTLE */
> -                tcg_out_ldst_asi(s, datalo, addr, 0, STXA, ASI_PRIMARY_LITTLE);
> -            }
> -        } else {
> -            if (bigendian) {
> -                tcg_out_ldst(s, datahi, addr, 0, STW);
> -                tcg_out_ldst(s, datalo, addr, 4, STW);
> -            } else {
> -                tcg_out_ldst_asi(s, datalo, addr, 0, STWA, ASI_PRIMARY_LITTLE);
> -                tcg_out_ldst_asi(s, datahi, addr, 4, STWA, ASI_PRIMARY_LITTLE);
> -            }
> +        tcg_out_ldst_rr(s, reg64, addr_reg, TCG_REG_G0, qemu_ld_opc[sizeop]);
> +
> +        tcg_out_arithi(s, datahi, reg64, 32, SHIFT_SRLX);
> +        if (reg64 != datalo) {
> +            tcg_out_mov(s, TCG_TYPE_I32, datalo, reg64);
>         }
> -        break;
> -    default:
> -        tcg_abort();
> +    } else {
> +        tcg_out_ldst_rr(s, datalo, addr_reg, TCG_REG_G0, qemu_ld_opc[sizeop]);
>     }
> +#endif /* CONFIG_SOFTMMU */
>  }
>
> -static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, int opc)
> +static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, int sizeop)
>  {
>     int addrlo_idx = 1, datalo, datahi, addr_reg;
>  #if defined(CONFIG_SOFTMMU)
>     int memi_idx, memi, n;
> -    uint32_t *label_ptr[2];
> +    uint32_t *label_ptr;
>  #endif
>
>     datahi = datalo = args[0];
> -    if (TCG_TARGET_REG_BITS == 32 && opc == 3) {
> +    if (TCG_TARGET_REG_BITS == 32 && sizeop == 3) {
>         datahi = args[1];
>         addrlo_idx = 2;
>     }
> @@ -1085,33 +986,40 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, int opc)
>     memi_idx = addrlo_idx + 1 + (TARGET_LONG_BITS > TCG_TARGET_REG_BITS);
>     memi = args[memi_idx];
>
> -    addr_reg = tcg_out_tlb_load(s, addrlo_idx, memi, opc, args,
> -                                label_ptr, offsetof(CPUTLBEntry, addr_write));
> +    addr_reg = tcg_out_tlb_load(s, addrlo_idx, memi, sizeop, args,
> +                                offsetof(CPUTLBEntry, addr_write));
>
> -    /* TLB Hit.  */
> -    tcg_out_qemu_st_direct(s, addr_reg, datalo, datahi, opc);
> +    if (TCG_TARGET_REG_BITS == 32 && sizeop == 3) {
> +        /* Reconstruct the full 64-bit value in %g1, using %o2 as temp.  */
> +        /* ??? Redefine the temps from %i4/%i5 so that we have a o/g temp. */
> +        tcg_out_arithi(s, TCG_REG_G1, datalo, 0, SHIFT_SRL);
> +        tcg_out_arithi(s, TCG_REG_O2, datahi, 32, SHIFT_SLLX);
> +        tcg_out_arith(s, TCG_REG_G1, TCG_REG_G1, TCG_REG_O2, ARITH_OR);
> +        datalo = TCG_REG_G1;
> +    }
>
> -    /* b,pt,n label1 */
> -    label_ptr[1] = (uint32_t *)s->code_ptr;
> -    tcg_out32(s, (INSN_OP(0) | INSN_COND(COND_A, 0) | INSN_OP2(0x1)
> +    /* The fast path is exactly one insn.  Thus we can perform the entire
> +       TLB Hit in the (annulled) delay slot of the branch over TLB Miss.  */
> +    /* beq,a,pt %[xi]cc, label0 */
> +    label_ptr = (uint32_t *)s->code_ptr;
> +    tcg_out32(s, (INSN_OP(0) | INSN_COND(COND_E, 0) | INSN_OP2(0x1)
> +                  | ((TARGET_LONG_BITS == 64) << 21)
>                   | (1 << 29) | (1 << 19)));
> +    /* delay slot */
> +    tcg_out_ldst_rr(s, datalo, addr_reg, TCG_REG_O1, qemu_st_opc[sizeop]);
>
>     /* TLB Miss.  */
> -
> -    *label_ptr[0] |= INSN_OFF19((unsigned long)s->code_ptr -
> -                                (unsigned long)label_ptr[0]);
> -
> -    n = 0;
> -#ifdef CONFIG_TCG_PASS_AREG0
> -    tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[n++], TCG_AREG0);
> -#endif
> +    n = ARG_OFFSET;
> +    if (ARG_OFFSET) {
> +         tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
> +    }
>     if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
>         tcg_out_mov(s, TCG_TYPE_REG, tcg_target_call_iarg_regs[n++],
>                     args[addrlo_idx + 1]);
>     }
>     tcg_out_mov(s, TCG_TYPE_REG, tcg_target_call_iarg_regs[n++],
>                 args[addrlo_idx]);
> -    if (TCG_TARGET_REG_BITS == 32 && opc == 3) {
> +    if (TCG_TARGET_REG_BITS == 32 && sizeop == 3) {
>         tcg_out_mov(s, TCG_TYPE_REG, tcg_target_call_iarg_regs[n++], datahi);
>     }
>     tcg_out_mov(s, TCG_TYPE_REG, tcg_target_call_iarg_regs[n++], datalo);
> @@ -1123,7 +1031,7 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, int opc)
>                sizeof(long));
>
>     /* qemu_st_helper[s_bits](arg0, arg1, arg2) */
> -    tcg_out32(s, CALL | ((((tcg_target_ulong)qemu_st_helpers[opc]
> +    tcg_out32(s, CALL | ((((tcg_target_ulong)qemu_st_helpers[sizeop]
>                            - (tcg_target_ulong)s->code_ptr) >> 2)
>                          & 0x3fffffff));
>     /* delay slot */
> @@ -1134,15 +1042,23 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, int opc)
>                TCG_TARGET_CALL_STACK_OFFSET - TCG_STATIC_CALL_ARGS_SIZE -
>                sizeof(long));
>
> -    *label_ptr[1] |= INSN_OFF19((unsigned long)s->code_ptr -
> -                                (unsigned long)label_ptr[1]);
> +    *label_ptr |= INSN_OFF19((unsigned long)s->code_ptr -
> +                             (unsigned long)label_ptr);
>  #else
>     addr_reg = args[addrlo_idx];
>     if (TCG_TARGET_REG_BITS == 64 && TARGET_LONG_BITS == 32) {
>         tcg_out_arithi(s, TCG_REG_I5, addr_reg, 0, SHIFT_SRL);
>         addr_reg = TCG_REG_I5;
>     }
> -    tcg_out_qemu_st_direct(s, addr_reg, datalo, datahi, opc);
> +    if (TCG_TARGET_REG_BITS == 32 && sizeop == 3) {
> +        /* Reconstruct the full 64-bit value in %g1, using %o2 as temp.  */
> +        /* ??? Redefine the temps from %i4/%i5 so that we have a o/g temp. */
> +        tcg_out_arithi(s, TCG_REG_G1, datalo, 0, SHIFT_SRL);
> +        tcg_out_arithi(s, TCG_REG_O2, datahi, 32, SHIFT_SLLX);
> +        tcg_out_arith(s, TCG_REG_G1, TCG_REG_G1, TCG_REG_O2, ARITH_OR);
> +        datalo = TCG_REG_G1;
> +    }
> +    tcg_out_ldst_rr(s, datalo, addr_reg, TCG_REG_G0, qemu_st_opc[sizeop]);
>  #endif /* CONFIG_SOFTMMU */
>  }
>
> --
> 1.7.7.6
>
diff mbox

Patch

diff --git a/tcg/sparc/tcg-target.c b/tcg/sparc/tcg-target.c
index c74fc2c..5cea5a8 100644
--- a/tcg/sparc/tcg-target.c
+++ b/tcg/sparc/tcg-target.c
@@ -294,6 +294,16 @@  static inline int tcg_target_const_match(tcg_target_long val,
 #define ASI_PRIMARY_LITTLE 0x88
 #endif
 
+#define LDUH_LE    (LDUHA | INSN_ASI(ASI_PRIMARY_LITTLE))
+#define LDSH_LE    (LDSHA | INSN_ASI(ASI_PRIMARY_LITTLE))
+#define LDUW_LE    (LDUWA | INSN_ASI(ASI_PRIMARY_LITTLE))
+#define LDSW_LE    (LDSWA | INSN_ASI(ASI_PRIMARY_LITTLE))
+#define LDX_LE     (LDXA  | INSN_ASI(ASI_PRIMARY_LITTLE))
+
+#define STH_LE     (STHA  | INSN_ASI(ASI_PRIMARY_LITTLE))
+#define STW_LE     (STWA  | INSN_ASI(ASI_PRIMARY_LITTLE))
+#define STX_LE     (STXA  | INSN_ASI(ASI_PRIMARY_LITTLE))
+
 static inline void tcg_out_arith(TCGContext *s, int rd, int rs1, int rs2,
                                  int op)
 {
@@ -366,66 +376,46 @@  static inline void tcg_out_movi(TCGContext *s, TCGType type,
     }
 }
 
-static inline void tcg_out_ld_raw(TCGContext *s, int ret,
-                                  tcg_target_long arg)
+static inline void tcg_out_ldst_rr(TCGContext *s, int data, int a1,
+                                   int a2, int op)
 {
-    tcg_out_sethi(s, ret, arg);
-    tcg_out32(s, LDUW | INSN_RD(ret) | INSN_RS1(ret) |
-              INSN_IMM13(arg & 0x3ff));
+    tcg_out32(s, op | INSN_RD(data) | INSN_RS1(a1) | INSN_RS2(a2));
 }
 
-static inline void tcg_out_ld_ptr(TCGContext *s, int ret,
-                                  tcg_target_long arg)
+static inline void tcg_out_ldst(TCGContext *s, int ret, int addr,
+                                int offset, int op)
 {
-    if (!check_fit_tl(arg, 10))
-        tcg_out_movi(s, TCG_TYPE_PTR, ret, arg & ~0x3ffULL);
-    if (TCG_TARGET_REG_BITS == 64) {
-        tcg_out32(s, LDX | INSN_RD(ret) | INSN_RS1(ret) |
-                  INSN_IMM13(arg & 0x3ff));
-    } else {
-        tcg_out32(s, LDUW | INSN_RD(ret) | INSN_RS1(ret) |
-                  INSN_IMM13(arg & 0x3ff));
-    }
-}
-
-static inline void tcg_out_ldst(TCGContext *s, int ret, int addr, int offset, int op)
-{
-    if (check_fit_tl(offset, 13))
+    if (check_fit_tl(offset, 13)) {
         tcg_out32(s, op | INSN_RD(ret) | INSN_RS1(addr) |
                   INSN_IMM13(offset));
-    else {
+    } else {
         tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_I5, offset);
-        tcg_out32(s, op | INSN_RD(ret) | INSN_RS1(TCG_REG_I5) |
-                  INSN_RS2(addr));
+        tcg_out_ldst_rr(s, ret, addr, TCG_REG_I5, op);
     }
 }
 
-static inline void tcg_out_ldst_asi(TCGContext *s, int ret, int addr,
-                                    int offset, int op, int asi)
-{
-    tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_I5, offset);
-    tcg_out32(s, op | INSN_RD(ret) | INSN_RS1(TCG_REG_I5) |
-              INSN_ASI(asi) | INSN_RS2(addr));
-}
-
 static inline void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
                               TCGReg arg1, tcg_target_long arg2)
 {
-    if (type == TCG_TYPE_I32)
-        tcg_out_ldst(s, ret, arg1, arg2, LDUW);
-    else
-        tcg_out_ldst(s, ret, arg1, arg2, LDX);
+    tcg_out_ldst(s, ret, arg1, arg2, (type == TCG_TYPE_I32 ? LDUW : LDX));
 }
 
 static inline void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
                               TCGReg arg1, tcg_target_long arg2)
 {
-    if (type == TCG_TYPE_I32)
-        tcg_out_ldst(s, arg, arg1, arg2, STW);
-    else
-        tcg_out_ldst(s, arg, arg1, arg2, STX);
+    tcg_out_ldst(s, arg, arg1, arg2, (type == TCG_TYPE_I32 ? STW : STX));
+}
+
+static inline void tcg_out_ld_ptr(TCGContext *s, int ret,
+                                  tcg_target_long arg)
+{
+    if (!check_fit_tl(arg, 10)) {
+        tcg_out_movi(s, TCG_TYPE_PTR, ret, arg & ~0x3ff);
+    }
+    tcg_out_ld(s, TCG_TYPE_PTR, ret, ret, arg & 0x3ff);
 }
 
+
 static inline void tcg_out_sety(TCGContext *s, int rs)
 {
     tcg_out32(s, WRY | INSN_RS1(TCG_REG_G0) | INSN_RS2(rs));
@@ -757,22 +747,16 @@  static const void * const qemu_st_helpers[4] = {
    WHICH is the offset into the CPUTLBEntry structure of the slot to read.
    This should be offsetof addr_read or addr_write.
 
-   Outputs:
-   LABEL_PTRS is filled with the position of the forward jumps to the
-   TLB miss case.  This will always be a ,PN insn, so a 19-bit offset.
-
-   Returns a register loaded with the low part of the address, adjusted
-   as indicated by the TLB and so is a host address.  Undefined in the
-   TLB miss case.  */
+   The result of the TLB comparison is in %[ix]cc.  The sanitized address
+   is in the returned register, maybe %o0.  The TLB addend is in %o1.  */
 
 static int tcg_out_tlb_load(TCGContext *s, int addrlo_idx, int mem_index,
-                            int s_bits, const TCGArg *args,
-                            uint32_t **label_ptr, int which)
+                            int s_bits, const TCGArg *args, int which)
 {
     const int addrlo = args[addrlo_idx];
-    const int r0 = tcg_target_call_iarg_regs[0];
-    const int r1 = tcg_target_call_iarg_regs[1];
-    const int r2 = tcg_target_call_iarg_regs[2];
+    const int r0 = TCG_REG_O0;
+    const int r1 = TCG_REG_O1;
+    const int r2 = TCG_REG_O2;
     int addr = addrlo;
     int tlb_ofs;
 
@@ -803,110 +787,39 @@  static int tcg_out_tlb_load(TCGContext *s, int addrlo_idx, int mem_index,
         tlb_ofs = 0;
     }
 
-    /* ld [arg1 + which], arg2 */
+    /* Load the tlb comparator and the addend.  */
     tcg_out_ld(s, TCG_TYPE_TL, r2, r1, tlb_ofs + which);
+    tcg_out_ld(s, TCG_TYPE_PTR, r1, r1, tlb_ofs+offsetof(CPUTLBEntry, addend));
 
     /* subcc arg0, arg2, %g0 */
     tcg_out_cmp(s, r0, r2, 0);
 
-    /* bne,pn %[ix]cc, label0 */
-    *label_ptr = (uint32_t *)s->code_ptr;
-    tcg_out32(s, (INSN_OP(0) | INSN_COND(COND_NE, 0) | INSN_OP2(0x1) |
-                  ((TARGET_LONG_BITS == 64) << 21)));
-
-    /* TLB Hit.  Compute the host address into r1.  The ld is in the
-       branch delay slot; harmless for the TLB miss case.  */
-    tcg_out_ld(s, TCG_TYPE_PTR, r1, r1, tlb_ofs+offsetof(CPUTLBEntry, addend));
-
+    /* If the guest address must be zero-extended, do so now.  */
     if (TCG_TARGET_REG_BITS == 64 && TARGET_LONG_BITS == 32) {
         tcg_out_arithi(s, r0, addrlo, 0, SHIFT_SRL);
-        tcg_out_arith(s, r1, r0, r1, ARITH_ADD);
-    } else {
-        tcg_out_arith(s, r1, addrlo, r1, ARITH_ADD);
+        return r0;
     }
-
-    return r1;
+    return addrlo;
 }
 #endif /* CONFIG_SOFTMMU */
 
-static void tcg_out_qemu_ld_direct(TCGContext *s, int addr, int datalo,
-                                   int datahi, int sizeop)
-{
+static const int qemu_ld_opc[8] = {
 #ifdef TARGET_WORDS_BIGENDIAN
-    const int bigendian = 1;
+    LDUB, LDUH, LDUW, LDX, LDSB, LDSH, LDSW, LDX
 #else
-    const int bigendian = 0;
+    LDUB, LDUH_LE, LDUW_LE, LDX_LE, LDSB, LDSH_LE, LDSW_LE, LDX_LE
 #endif
-    switch (sizeop) {
-    case 0:
-        /* ldub [addr], datalo */
-        tcg_out_ldst(s, datalo, addr, 0, LDUB);
-        break;
-    case 0 | 4:
-        /* ldsb [addr], datalo */
-        tcg_out_ldst(s, datalo, addr, 0, LDSB);
-        break;
-    case 1:
-        if (bigendian) {
-            /* lduh [addr], datalo */
-            tcg_out_ldst(s, datalo, addr, 0, LDUH);
-        } else {
-            /* lduha [addr] ASI_PRIMARY_LITTLE, datalo */
-            tcg_out_ldst_asi(s, datalo, addr, 0, LDUHA, ASI_PRIMARY_LITTLE);
-        }
-        break;
-    case 1 | 4:
-        if (bigendian) {
-            /* ldsh [addr], datalo */
-            tcg_out_ldst(s, datalo, addr, 0, LDSH);
-        } else {
-            /* ldsha [addr] ASI_PRIMARY_LITTLE, datalo */
-            tcg_out_ldst_asi(s, datalo, addr, 0, LDSHA, ASI_PRIMARY_LITTLE);
-        }
-        break;
-    case 2:
-        if (bigendian) {
-            /* lduw [addr], datalo */
-            tcg_out_ldst(s, datalo, addr, 0, LDUW);
-        } else {
-            /* lduwa [addr] ASI_PRIMARY_LITTLE, datalo */
-            tcg_out_ldst_asi(s, datalo, addr, 0, LDUWA, ASI_PRIMARY_LITTLE);
-        }
-        break;
-    case 2 | 4:
-        if (bigendian) {
-            /* ldsw [addr], datalo */
-            tcg_out_ldst(s, datalo, addr, 0, LDSW);
-        } else {
-            /* ldswa [addr] ASI_PRIMARY_LITTLE, datalo */
-            tcg_out_ldst_asi(s, datalo, addr, 0, LDSWA, ASI_PRIMARY_LITTLE);
-        }
-        break;
-    case 3:
-        if (TCG_TARGET_REG_BITS == 64) {
-            if (bigendian) {
-                /* ldx [addr], datalo */
-                tcg_out_ldst(s, datalo, addr, 0, LDX);
-            } else {
-                /* ldxa [addr] ASI_PRIMARY_LITTLE, datalo */
-                tcg_out_ldst_asi(s, datalo, addr, 0, LDXA, ASI_PRIMARY_LITTLE);
-            }
-        } else {
-            if (bigendian) {
-                tcg_out_ldst(s, datahi, addr, 0, LDUW);
-                tcg_out_ldst(s, datalo, addr, 4, LDUW);
-            } else {
-                tcg_out_ldst_asi(s, datalo, addr, 0, LDUWA, ASI_PRIMARY_LITTLE);
-                tcg_out_ldst_asi(s, datahi, addr, 4, LDUWA, ASI_PRIMARY_LITTLE);
-            }
-        }
-        break;
-    default:
-        tcg_abort();
-    }
-}
+};
 
-static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, int opc)
+static const int qemu_st_opc[4] = {
+#ifdef TARGET_WORDS_BIGENDIAN
+    STB, STH, STW, STX
+#else
+    STB, STH_LE, STW_LE, STX_LE
+#endif
+};
+
+static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, int sizeop)
 {
     int addrlo_idx = 1, datalo, datahi, addr_reg;
 #if defined(CONFIG_SOFTMMU)
@@ -915,7 +828,7 @@  static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, int opc)
 #endif
 
     datahi = datalo = args[0];
-    if (TCG_TARGET_REG_BITS == 32 && opc == 3) {
+    if (TCG_TARGET_REG_BITS == 32 && sizeop == 3) {
         datahi = args[1];
         addrlo_idx = 2;
     }
@@ -923,27 +836,59 @@  static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, int opc)
 #if defined(CONFIG_SOFTMMU)
     memi_idx = addrlo_idx + 1 + (TARGET_LONG_BITS > TCG_TARGET_REG_BITS);
     memi = args[memi_idx];
-    s_bits = opc & 3;
+    s_bits = sizeop & 3;
 
     addr_reg = tcg_out_tlb_load(s, addrlo_idx, memi, s_bits, args,
-                                label_ptr, offsetof(CPUTLBEntry, addr_read));
+                                offsetof(CPUTLBEntry, addr_read));
 
-    /* TLB Hit.  */
-    tcg_out_qemu_ld_direct(s, addr_reg, datalo, datahi, opc);
+    if (TCG_TARGET_REG_BITS == 32 && sizeop == 3) {
+        int reg64;
 
-    /* b,pt,n label1 */
-    label_ptr[1] = (uint32_t *)s->code_ptr;
-    tcg_out32(s, (INSN_OP(0) | INSN_COND(COND_A, 0) | INSN_OP2(0x1)
-                  | (1 << 29) | (1 << 19)));
+        /* bne,pn %[xi]cc, label0 */
+        label_ptr[0] = (uint32_t *)s->code_ptr;
+        tcg_out32(s, (INSN_OP(0) | INSN_COND(COND_NE, 0) | INSN_OP2(0x1)
+                      | ((TARGET_LONG_BITS == 64) << 21)));
+
+        /* TLB Hit.  */
+        /* Load all 64 bits into an O/G register.  */
+        reg64 = (datalo < 16 ? datalo : TCG_REG_O0);
+        tcg_out_ldst_rr(s, reg64, addr_reg, TCG_REG_O1, qemu_ld_opc[sizeop]);
+
+        /* Move the two 32-bit pieces into the destination registers.  */
+        tcg_out_arithi(s, datahi, reg64, 32, SHIFT_SRLX);
+        if (reg64 != datalo) {
+            tcg_out_mov(s, TCG_TYPE_I32, datalo, reg64);
+        }
+
+        /* b,pt,n label1 */
+        label_ptr[1] = (uint32_t *)s->code_ptr;
+        tcg_out32(s, (INSN_OP(0) | INSN_COND(COND_A, 0) | INSN_OP2(0x1)
+                      | (1 << 29) | (1 << 19)));
+    } else {
+        /* The fast path is exactly one insn.  Thus we can perform the
+           entire TLB Hit in the (annulled) delay slot of the branch
+           over the TLB Miss case.  */
+
+        /* beq,a,pt %[xi]cc, label1 */
+        label_ptr[0] = NULL;
+        label_ptr[1] = (uint32_t *)s->code_ptr;
+        tcg_out32(s, (INSN_OP(0) | INSN_COND(COND_E, 0) | INSN_OP2(0x1)
+                      | ((TARGET_LONG_BITS == 64) << 21)
+                      | (1 << 29) | (1 << 19)));
+        /* delay slot */
+        tcg_out_ldst_rr(s, datalo, addr_reg, TCG_REG_O1, qemu_ld_opc[sizeop]);
+    }
 
     /* TLB Miss.  */
 
-    *label_ptr[0] |= INSN_OFF19((unsigned long)s->code_ptr -
-                                (unsigned long)label_ptr[0]);
-    n = 0;
-#ifdef CONFIG_TCG_PASS_AREG0
-    tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[n++], TCG_AREG0);
-#endif
+    if (label_ptr[0]) {
+        *label_ptr[0] |= INSN_OFF19((unsigned long)s->code_ptr -
+                                    (unsigned long)label_ptr[0]);
+    }
+    n = ARG_OFFSET;
+    if (ARG_OFFSET) {
+       tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
+    }
     if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
         tcg_out_mov(s, TCG_TYPE_REG, tcg_target_call_iarg_regs[n++],
                     args[addrlo_idx + 1]);
@@ -971,7 +916,7 @@  static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, int opc)
 
     n = tcg_target_call_oarg_regs[0];
     /* datalo = sign_extend(arg0) */
-    switch(opc) {
+    switch (sizeop) {
     case 0 | 4:
         /* Recall that SRA sign extends from bit 31 through bit 63.  */
         tcg_out_arithi(s, datalo, n, 24, SHIFT_SLL);
@@ -1008,75 +953,31 @@  static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, int opc)
         tcg_out_arithi(s, TCG_REG_I5, addr_reg, 0, SHIFT_SRL);
         addr_reg = TCG_REG_I5;
     }
-    tcg_out_qemu_ld_direct(s, addr_reg, datalo, datahi, opc);
-#endif /* CONFIG_SOFTMMU */
-}
+    if (TCG_TARGET_REG_BITS == 32 && sizeop == 3) {
+        int reg64 = (datalo < 16 ? datalo : TCG_REG_O0);
 
-static void tcg_out_qemu_st_direct(TCGContext *s, int addr, int datalo,
-                                   int datahi, int sizeop)
-{
-#ifdef TARGET_WORDS_BIGENDIAN
-    const int bigendian = 1;
-#else
-    const int bigendian = 0;
-#endif
-    switch (sizeop) {
-    case 0:
-        /* stb datalo, [addr] */
-        tcg_out_ldst(s, datalo, addr, 0, STB);
-        break;
-    case 1:
-        if (bigendian) {
-            /* sth datalo, [addr] */
-            tcg_out_ldst(s, datalo, addr, 0, STH);
-        } else {
-            /* stha datalo, [addr] ASI_PRIMARY_LITTLE */
-            tcg_out_ldst_asi(s, datalo, addr, 0, STHA, ASI_PRIMARY_LITTLE);
-        }
-        break;
-    case 2:
-        if (bigendian) {
-            /* stw datalo, [addr] */
-            tcg_out_ldst(s, datalo, addr, 0, STW);
-        } else {
-            /* stwa datalo, [addr] ASI_PRIMARY_LITTLE */
-            tcg_out_ldst_asi(s, datalo, addr, 0, STWA, ASI_PRIMARY_LITTLE);
-        }
-        break;
-    case 3:
-        if (TCG_TARGET_REG_BITS == 64) {
-            if (bigendian) {
-                /* stx datalo, [addr] */
-                tcg_out_ldst(s, datalo, addr, 0, STX);
-            } else {
-                /* stxa datalo, [addr] ASI_PRIMARY_LITTLE */
-                tcg_out_ldst_asi(s, datalo, addr, 0, STXA, ASI_PRIMARY_LITTLE);
-            }
-        } else {
-            if (bigendian) {
-                tcg_out_ldst(s, datahi, addr, 0, STW);
-                tcg_out_ldst(s, datalo, addr, 4, STW);
-            } else {
-                tcg_out_ldst_asi(s, datalo, addr, 0, STWA, ASI_PRIMARY_LITTLE);
-                tcg_out_ldst_asi(s, datahi, addr, 4, STWA, ASI_PRIMARY_LITTLE);
-            }
+        tcg_out_ldst_rr(s, reg64, addr_reg, TCG_REG_G0, qemu_ld_opc[sizeop]);
+
+        tcg_out_arithi(s, datahi, reg64, 32, SHIFT_SRLX);
+        if (reg64 != datalo) {
+            tcg_out_mov(s, TCG_TYPE_I32, datalo, reg64);
         }
-        break;
-    default:
-        tcg_abort();
+    } else {
+        tcg_out_ldst_rr(s, datalo, addr_reg, TCG_REG_G0, qemu_ld_opc[sizeop]);
     }
+#endif /* CONFIG_SOFTMMU */
 }
 
-static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, int opc)
+static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, int sizeop)
 {
     int addrlo_idx = 1, datalo, datahi, addr_reg;
 #if defined(CONFIG_SOFTMMU)
     int memi_idx, memi, n;
-    uint32_t *label_ptr[2];
+    uint32_t *label_ptr;
 #endif
 
     datahi = datalo = args[0];
-    if (TCG_TARGET_REG_BITS == 32 && opc == 3) {
+    if (TCG_TARGET_REG_BITS == 32 && sizeop == 3) {
         datahi = args[1];
         addrlo_idx = 2;
     }
@@ -1085,33 +986,40 @@  static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, int opc)
     memi_idx = addrlo_idx + 1 + (TARGET_LONG_BITS > TCG_TARGET_REG_BITS);
     memi = args[memi_idx];
 
-    addr_reg = tcg_out_tlb_load(s, addrlo_idx, memi, opc, args,
-                                label_ptr, offsetof(CPUTLBEntry, addr_write));
+    addr_reg = tcg_out_tlb_load(s, addrlo_idx, memi, sizeop, args,
+                                offsetof(CPUTLBEntry, addr_write));
 
-    /* TLB Hit.  */
-    tcg_out_qemu_st_direct(s, addr_reg, datalo, datahi, opc);
+    if (TCG_TARGET_REG_BITS == 32 && sizeop == 3) {
+        /* Reconstruct the full 64-bit value in %g1, using %o2 as temp.  */
+        /* ??? Redefine the temps from %i4/%i5 so that we have an o/g temp. */
+        tcg_out_arithi(s, TCG_REG_G1, datalo, 0, SHIFT_SRL);
+        tcg_out_arithi(s, TCG_REG_O2, datahi, 32, SHIFT_SLLX);
+        tcg_out_arith(s, TCG_REG_G1, TCG_REG_G1, TCG_REG_O2, ARITH_OR);
+        datalo = TCG_REG_G1;
+    }
 
-    /* b,pt,n label1 */
-    label_ptr[1] = (uint32_t *)s->code_ptr;
-    tcg_out32(s, (INSN_OP(0) | INSN_COND(COND_A, 0) | INSN_OP2(0x1)
+    /* The fast path is exactly one insn.  Thus we can perform the entire
+       TLB Hit in the (annulled) delay slot of the branch over TLB Miss.  */
+    /* beq,a,pt %[xi]cc, label0 */
+    label_ptr = (uint32_t *)s->code_ptr;
+    tcg_out32(s, (INSN_OP(0) | INSN_COND(COND_E, 0) | INSN_OP2(0x1)
+                  | ((TARGET_LONG_BITS == 64) << 21)
                   | (1 << 29) | (1 << 19)));
+    /* delay slot */
+    tcg_out_ldst_rr(s, datalo, addr_reg, TCG_REG_O1, qemu_st_opc[sizeop]);
 
     /* TLB Miss.  */
-
-    *label_ptr[0] |= INSN_OFF19((unsigned long)s->code_ptr -
-                                (unsigned long)label_ptr[0]);
-
-    n = 0;
-#ifdef CONFIG_TCG_PASS_AREG0
-    tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[n++], TCG_AREG0);
-#endif
+    n = ARG_OFFSET;
+    if (ARG_OFFSET) {
+         tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
+    }
     if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
         tcg_out_mov(s, TCG_TYPE_REG, tcg_target_call_iarg_regs[n++],
                     args[addrlo_idx + 1]);
     }
     tcg_out_mov(s, TCG_TYPE_REG, tcg_target_call_iarg_regs[n++],
                 args[addrlo_idx]);
-    if (TCG_TARGET_REG_BITS == 32 && opc == 3) {
+    if (TCG_TARGET_REG_BITS == 32 && sizeop == 3) {
         tcg_out_mov(s, TCG_TYPE_REG, tcg_target_call_iarg_regs[n++], datahi);
     }
     tcg_out_mov(s, TCG_TYPE_REG, tcg_target_call_iarg_regs[n++], datalo);
@@ -1123,7 +1031,7 @@  static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, int opc)
                sizeof(long));
 
     /* qemu_st_helper[s_bits](arg0, arg1, arg2) */
-    tcg_out32(s, CALL | ((((tcg_target_ulong)qemu_st_helpers[opc]
+    tcg_out32(s, CALL | ((((tcg_target_ulong)qemu_st_helpers[sizeop]
                            - (tcg_target_ulong)s->code_ptr) >> 2)
                          & 0x3fffffff));
     /* delay slot */
@@ -1134,15 +1042,23 @@  static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, int opc)
                TCG_TARGET_CALL_STACK_OFFSET - TCG_STATIC_CALL_ARGS_SIZE -
                sizeof(long));
 
-    *label_ptr[1] |= INSN_OFF19((unsigned long)s->code_ptr -
-                                (unsigned long)label_ptr[1]);
+    *label_ptr |= INSN_OFF19((unsigned long)s->code_ptr -
+                             (unsigned long)label_ptr);
 #else
     addr_reg = args[addrlo_idx];
     if (TCG_TARGET_REG_BITS == 64 && TARGET_LONG_BITS == 32) {
         tcg_out_arithi(s, TCG_REG_I5, addr_reg, 0, SHIFT_SRL);
         addr_reg = TCG_REG_I5;
     }
-    tcg_out_qemu_st_direct(s, addr_reg, datalo, datahi, opc);
+    if (TCG_TARGET_REG_BITS == 32 && sizeop == 3) {
+        /* Reconstruct the full 64-bit value in %g1, using %o2 as temp.  */
+        /* ??? Redefine the temps from %i4/%i5 so that we have an o/g temp. */
+        tcg_out_arithi(s, TCG_REG_G1, datalo, 0, SHIFT_SRL);
+        tcg_out_arithi(s, TCG_REG_O2, datahi, 32, SHIFT_SLLX);
+        tcg_out_arith(s, TCG_REG_G1, TCG_REG_G1, TCG_REG_O2, ARITH_OR);
+        datalo = TCG_REG_G1;
+    }
+    tcg_out_ldst_rr(s, datalo, addr_reg, TCG_REG_G0, qemu_st_opc[sizeop]);
 #endif /* CONFIG_SOFTMMU */
 }