@@ -272,9 +272,11 @@ DEF_HELPER_5(vmsumshm, void, env, avr, avr, avr, avr)
 DEF_HELPER_5(vmsumshs, void, env, avr, avr, avr, avr)
 DEF_HELPER_4(vmladduhm, void, avr, avr, avr, avr)
 DEF_HELPER_2(mtvscr, void, env, avr)
+DEF_HELPER_3(lvx, void, env, i32, tl)
 DEF_HELPER_3(lvebx, void, env, avr, tl)
 DEF_HELPER_3(lvehx, void, env, avr, tl)
 DEF_HELPER_3(lvewx, void, env, avr, tl)
+DEF_HELPER_3(stvx, void, env, i32, tl)
 DEF_HELPER_3(stvebx, void, env, avr, tl)
 DEF_HELPER_3(stvehx, void, env, avr, tl)
 DEF_HELPER_3(stvewx, void, env, avr, tl)
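
For reference, QEMU's def-helper machinery expands each of these lines into a C prototype: "env" maps to CPUPPCState *, "i32" to uint32_t, and "tl" to target_ulong, so the two new declarations resolve to roughly:

    void helper_lvx(CPUPPCState *env, uint32_t vr, target_ulong addr);
    void helper_stvx(CPUPPCState *env, uint32_t vr, target_ulong addr);

Unlike the neighbouring lvebx/stvebx entries, which pass an avr pointer, the new helpers take the register number as a plain i32, since they index env->avr[] themselves.
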
@@ -225,6 +225,66 @@ target_ulong helper_lscbx(CPUPPCState *env, target_ulong addr, uint32_t reg,
 #define LO_IDX 0
 #endif
+
+void helper_lvx(CPUPPCState *env, uint32_t vr, target_ulong addr)
+{
+    uintptr_t raddr = GETPC();
+    ppc_avr_t *haddr;
+
+    /* lvx ignores the low four bits of the address */
+    addr &= ~(target_ulong)0xf;
+
+    /* Fast path: if the page is resident, tlb_vaddr_to_host() hands us
+     * a host pointer and we do a single translation for all 16 bytes. */
+    haddr = tlb_vaddr_to_host(env, addr, MMU_DATA_LOAD, env->dmmu_idx);
+    if (haddr) {
+        /* A plain memcpy is only correct when guest byte order matches
+         * host byte order (HI_IDX is 1 iff the host is little-endian);
+         * otherwise byteswap each half and exchange the halves. */
+        if (msr_le == HI_IDX) {
+            memcpy(&env->avr[vr], haddr, 16);
+        } else {
+            env->avr[vr].u64[LO_IDX] = bswap64(haddr->u64[HI_IDX]);
+            env->avr[vr].u64[HI_IDX] = bswap64(haddr->u64[LO_IDX]);
+        }
+    } else {
+        /* Slow path: two 64-bit loads through the full MMU machinery */
+        if (needs_byteswap(env)) {
+            env->avr[vr].u64[LO_IDX] =
+                bswap64(cpu_ldq_data_ra(env, addr, raddr));
+            env->avr[vr].u64[HI_IDX] =
+                bswap64(cpu_ldq_data_ra(env, addr + 8, raddr));
+        } else {
+            env->avr[vr].u64[HI_IDX] = cpu_ldq_data_ra(env, addr, raddr);
+            env->avr[vr].u64[LO_IDX] = cpu_ldq_data_ra(env, addr + 8, raddr);
+        }
+    }
+}
+
+void helper_stvx(CPUPPCState *env, uint32_t vr, target_ulong addr)
+{
+    uintptr_t raddr = GETPC();
+    ppc_avr_t *haddr;
+
+    /* stvx ignores the low four bits of the address */
+    addr &= ~(target_ulong)0xf;
+
+    /* Fast path: single translation, single 16-byte copy */
+    haddr = tlb_vaddr_to_host(env, addr, MMU_DATA_STORE, env->dmmu_idx);
+    if (haddr) {
+        /* Same byte-order reasoning as in helper_lvx() above */
+        if (msr_le == HI_IDX) {
+            memcpy(haddr, &env->avr[vr], 16);
+        } else {
+            haddr->u64[LO_IDX] = bswap64(env->avr[vr].u64[HI_IDX]);
+            haddr->u64[HI_IDX] = bswap64(env->avr[vr].u64[LO_IDX]);
+        }
+    } else {
+        /* Slow path: two 64-bit stores through the full MMU machinery */
+        if (needs_byteswap(env)) {
+            cpu_stq_data_ra(env, addr,
+                            bswap64(env->avr[vr].u64[LO_IDX]), raddr);
+            cpu_stq_data_ra(env, addr + 8,
+                            bswap64(env->avr[vr].u64[HI_IDX]), raddr);
+        } else {
+            cpu_stq_data_ra(env, addr, env->avr[vr].u64[HI_IDX], raddr);
+            cpu_stq_data_ra(env, addr + 8, env->avr[vr].u64[LO_IDX], raddr);
+        }
+    }
+}
+
 /* We use msr_le to determine index ordering in a vector.  However,
    byteswapping is not simply controlled by msr_le.  We also need to take
    into account endianness of the target.  This is done for the little-endian
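
To make the HI_IDX/LO_IDX reasoning above concrete, here is a minimal standalone sketch, not QEMU code, assuming a little-endian host and a GCC/Clang compiler for __builtin_bswap64. It takes 16 bytes laid out in big-endian guest order and shows why each 64-bit half must be byteswapped and the halves exchanged, exactly the non-memcpy branch of the helpers:

    #include <assert.h>
    #include <stdint.h>
    #include <string.h>

    int main(void)
    {
        /* 16 bytes as a big-endian guest lays out the 128-bit value
         * 0x000102030405060708090a0b0c0d0e0f: byte 0 is the MSB of
         * the high doubleword. */
        uint8_t guest[16];
        uint64_t raw[2], hi, lo;
        int i;

        for (i = 0; i < 16; i++) {
            guest[i] = i;
        }
        memcpy(raw, guest, 16);

        /* On a little-endian host each raw half reads back
         * byte-reversed and sits at the opposite index, hence
         * bswap64 plus an exchange of the halves: */
        hi = __builtin_bswap64(raw[0]);
        lo = __builtin_bswap64(raw[1]);
        assert(hi == 0x0001020304050607ULL);
        assert(lo == 0x08090a0b0c0d0e0fULL);
        return 0;
    }

When guest and host byte order agree, the 16 bytes already sit in memory exactly as the two host-order halves expect, which is why the helpers can use a bare memcpy in that case.
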
@@ -15,55 +15,39 @@ static inline TCGv_ptr gen_avr_ptr(int reg)
 }

 #define GEN_VR_LDX(name, opc2, opc3) \
 static void glue(gen_, name)(DisasContext *ctx) \
 { \
     TCGv EA; \
+    TCGv_i32 t0; \
     if (unlikely(!ctx->altivec_enabled)) { \
         gen_exception(ctx, POWERPC_EXCP_VPU); \
         return; \
     } \
     gen_set_access_type(ctx, ACCESS_INT); \
     EA = tcg_temp_new(); \
+    t0 = tcg_const_i32(rD(ctx->opcode)); \
     gen_addr_reg_index(ctx, EA); \
-    tcg_gen_andi_tl(EA, EA, ~0xf); \
-    /* We only need to swap high and low halves. gen_qemu_ld64 does necessary \
-       64-bit byteswap already. */ \
-    if (ctx->le_mode) { \
-        gen_qemu_ld64(ctx, cpu_avrl[rD(ctx->opcode)], EA); \
-        tcg_gen_addi_tl(EA, EA, 8); \
-        gen_qemu_ld64(ctx, cpu_avrh[rD(ctx->opcode)], EA); \
-    } else { \
-        gen_qemu_ld64(ctx, cpu_avrh[rD(ctx->opcode)], EA); \
-        tcg_gen_addi_tl(EA, EA, 8); \
-        gen_qemu_ld64(ctx, cpu_avrl[rD(ctx->opcode)], EA); \
-    } \
+    gen_helper_lvx(cpu_env, t0, EA); \
     tcg_temp_free(EA); \
+    tcg_temp_free_i32(t0); \
 }

 #define GEN_VR_STX(name, opc2, opc3) \
 static void gen_st##name(DisasContext *ctx) \
 { \
     TCGv EA; \
+    TCGv_i32 t0; \
     if (unlikely(!ctx->altivec_enabled)) { \
         gen_exception(ctx, POWERPC_EXCP_VPU); \
         return; \
     } \
     gen_set_access_type(ctx, ACCESS_INT); \
     EA = tcg_temp_new(); \
+    t0 = tcg_const_i32(rD(ctx->opcode)); \
     gen_addr_reg_index(ctx, EA); \
-    tcg_gen_andi_tl(EA, EA, ~0xf); \
-    /* We only need to swap high and low halves. gen_qemu_st64 does necessary \
-       64-bit byteswap already. */ \
-    if (ctx->le_mode) { \
-        gen_qemu_st64(ctx, cpu_avrl[rD(ctx->opcode)], EA); \
-        tcg_gen_addi_tl(EA, EA, 8); \
-        gen_qemu_st64(ctx, cpu_avrh[rD(ctx->opcode)], EA); \
-    } else { \
-        gen_qemu_st64(ctx, cpu_avrh[rD(ctx->opcode)], EA); \
-        tcg_gen_addi_tl(EA, EA, 8); \
-        gen_qemu_st64(ctx, cpu_avrl[rD(ctx->opcode)], EA); \
-    } \
+    gen_helper_stvx(cpu_env, t0, EA); \
     tcg_temp_free(EA); \
+    tcg_temp_free_i32(t0); \
 }
 #define GEN_VR_LVE(name, opc2, opc3, size) \
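
For clarity, here is what the rewritten GEN_VR_LDX now generates for lvx, expanded by hand and lightly reformatted (the opc2/opc3 arguments only matter for the opcode table, not the function body):

    static void gen_lvx(DisasContext *ctx)
    {
        TCGv EA;
        TCGv_i32 t0;

        if (unlikely(!ctx->altivec_enabled)) {
            gen_exception(ctx, POWERPC_EXCP_VPU);
            return;
        }
        gen_set_access_type(ctx, ACCESS_INT);
        EA = tcg_temp_new();
        t0 = tcg_const_i32(rD(ctx->opcode));   /* VRT field */
        gen_addr_reg_index(ctx, EA);           /* EA = (rA|0) + rB */
        gen_helper_lvx(cpu_env, t0, EA);       /* one call, one translation */
        tcg_temp_free(EA);
        tcg_temp_free_i32(t0);
    }

Note that the 16-byte alignment masking has moved into the helper, so the translator no longer emits the tcg_gen_andi_tl.
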
@@ -116,9 +100,9 @@ GEN_VR_LVE(bx, 0x07, 0x00, 1);
 GEN_VR_LVE(hx, 0x07, 0x01, 2);
 GEN_VR_LVE(wx, 0x07, 0x02, 4);
-GEN_VR_STX(svx, 0x07, 0x07);
+GEN_VR_STX(vx, 0x07, 0x07);
 /* As we don't emulate the cache, stvxl is strictly equivalent to stvx */
-GEN_VR_STX(svxl, 0x07, 0x0F);
+GEN_VR_STX(vxl, 0x07, 0x0F);
 GEN_VR_STVE(bx, 0x07, 0x04, 1);
 GEN_VR_STVE(hx, 0x07, 0x05, 2);
@@ -11,8 +11,8 @@ GEN_VR_LDX(lvxl, 0x07, 0x0B),
 GEN_VR_LVE(bx, 0x07, 0x00),
 GEN_VR_LVE(hx, 0x07, 0x01),
 GEN_VR_LVE(wx, 0x07, 0x02),
-GEN_VR_STX(svx, 0x07, 0x07),
-GEN_VR_STX(svxl, 0x07, 0x0F),
+GEN_VR_STX(vx, 0x07, 0x07),
+GEN_VR_STX(vxl, 0x07, 0x0F),
 GEN_VR_STVE(bx, 0x07, 0x04),
 GEN_VR_STVE(hx, 0x07, 0x05),
 GEN_VR_STVE(wx, 0x07, 0x06),
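
The svx -> vx and svxl -> vxl renames in the two files above exist because both macros paste "st" in front of their first argument (gen_st##name in vmx-impl.c, and likewise in the vmx-ops.c table), so the argument must be the bare suffix. A tiny standalone demo of the token pasting involved, with hypothetical STR/NAME helpers rather than QEMU macros:

    #include <stdio.h>

    #define STR(x) #x
    #define NAME(name) STR(gen_st##name)

    int main(void)
    {
        printf("%s\n", NAME(vx));   /* gen_stvx: matches helper_stvx */
        printf("%s\n", NAME(svx));  /* gen_stsvx: the old, odd name */
        return 0;
    }

With "vx"/"vxl" the generated handlers are gen_stvx()/gen_stvxl(), lining up with the stvx/stvxl mnemonics and the new helpers.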

The lvx and stvx accesses are always naturally aligned, so they cannot cross a page boundary. Thus, instead of generating two 8-byte loads, each with its own translation (and a double swap for LE on LE), we use a helper that performs a single translation and memcpy's the result over (or does the appropriate swapping where needed).

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 target-ppc/helper.h             |  2 ++
 target-ppc/mem_helper.c         | 60 +++++++++++++++++++++++++++++++++++++++++
 target-ppc/translate/vmx-impl.c | 38 ++++++++------------------
 target-ppc/translate/vmx-ops.c  |  4 +--
 4 files changed, 75 insertions(+), 29 deletions(-)