@@ -472,4 +472,44 @@ DEF_HELPER_2(neon_qzip8, void, i32, i32)
DEF_HELPER_2(neon_qzip16, void, i32, i32)
DEF_HELPER_2(neon_qzip32, void, i32, i32)
+/* The VLD/VST "multiple structures" ops have a particular set of 'op'
+ * field values which decode into specific (nregs, interleave, spacing)
+ * combinations. There are 11 valid combinations, and each has a load
+ * helper and a store helper for each of the four operand sizes.
+ */
+#define FOREACH_VLDST_L_SIZE(OP, N, I, S) \
+ OP(st, 0, N, I, S) \
+ OP(ld, 0, N, I, S) \
+ OP(st, 1, N, I, S) \
+ OP(ld, 1, N, I, S) \
+ OP(st, 2, N, I, S) \
+ OP(ld, 2, N, I, S) \
+ OP(st, 3, N, I, S) \
+ OP(ld, 3, N, I, S)
+
+#define FOREACH_VLDST_HELPER(OP) \
+ FOREACH_VLDST_L_SIZE(OP, 4, 4, 1) \
+ FOREACH_VLDST_L_SIZE(OP, 4, 4, 2) \
+ FOREACH_VLDST_L_SIZE(OP, 4, 1, 1) \
+ FOREACH_VLDST_L_SIZE(OP, 4, 2, 1) \
+ FOREACH_VLDST_L_SIZE(OP, 3, 3, 1) \
+ FOREACH_VLDST_L_SIZE(OP, 3, 3, 2) \
+ FOREACH_VLDST_L_SIZE(OP, 3, 1, 1) \
+ FOREACH_VLDST_L_SIZE(OP, 1, 1, 1) \
+ FOREACH_VLDST_L_SIZE(OP, 2, 2, 1) \
+ FOREACH_VLDST_L_SIZE(OP, 2, 2, 2) \
+ FOREACH_VLDST_L_SIZE(OP, 2, 1, 1)
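+
+/* For reference, the lines above are in 'op' field order (0..10); in the
+ * ARM ARM encoding of VLDn/VSTn (multiple structures) these are
+ * VLD4/VST4 (ops 0, 1), VLD1/VST1 with 4 regs (2), the 2x2 VLD2/VST2
+ * form (3), VLD3/VST3 (4, 5), VLD1/VST1 with 3 regs (6) and 1 reg (7),
+ * VLD2/VST2 (8, 9) and VLD1/VST1 with 2 regs (10).
+ */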
+
+/* Get the index into a table created by FOREACH_VLDST_HELPER;
+ * the calculation has to match the order in which that macro expands things.
+ */
+#define VLDST_HELPER_INDEX(ld, sz, op) (((op) * 8) + ((sz) * 2) + (ld))
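+
+/* As an illustrative check of that ordering: a load (ld == 1) of size 2
+ * with op 7 (the (1,1,1) combination) gives 7*8 + 2*2 + 1 == 61, which
+ * is the table slot holding the neon_vld_2_1_1_1 helper.
+ */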
+
+#define VLDST_HELPER_NAME(ld, sz, n, i, s) neon_v##ld##_##sz##_##n##_##i##_##s
+
+#define DECLARE_VLDST_HELPER(ld, sz, n, i, s) \
+ DEF_HELPER_2(VLDST_HELPER_NAME(ld, sz, n, i, s), void, i32, i32)
+
+FOREACH_VLDST_HELPER(DECLARE_VLDST_HELPER)
+
#include "def-helper.h"
@@ -2024,3 +2024,130 @@ void HELPER(neon_zip16)(uint32_t rd, uint32_t rm)
env->vfp.regs[rm] = make_float64(m0);
env->vfp.regs[rd] = make_float64(d0);
}
+
+/* Note that these need to handle unaligned accesses */
+#if defined(CONFIG_USER_ONLY)
+#define LDB(addr) ldub(addr)
+#define LDW(addr) lduw(addr)
+#define LDL(addr) ldl(addr)
+#define LDQ(addr) ldq(addr)
+#define STB(addr, val) stb(addr, val)
+#define STW(addr, val) stw(addr, val)
+#define STL(addr, val) stl(addr, val)
+#define STQ(addr, val) stq(addr, val)
+#define DEFINE_USER_VAR do {} while (0)
+#else
+#define LDB(addr) slow_ldb_mmu(addr, user, GETPC())
+#define LDW(addr) slow_ldw_mmu(addr, user, GETPC())
+#define LDL(addr) slow_ldl_mmu(addr, user, GETPC())
+#define LDQ(addr) slow_ldq_mmu(addr, user, GETPC())
+#define STB(addr, val) slow_stb_mmu(addr, val, user, GETPC())
+#define STW(addr, val) slow_stw_mmu(addr, val, user, GETPC())
+#define STL(addr, val) slow_stl_mmu(addr, val, user, GETPC())
+#define STQ(addr, val) slow_stq_mmu(addr, val, user, GETPC())
+#define DEFINE_USER_VAR int user = cpu_mmu_index(env)
+#endif
+
+/* Helper functions for Neon VLDn/VSTn "multiple structures" forms. */
+
+#define NEON_VLDST_HELPER(ldst, size, nregs, interleave, spacing) \
+void HELPER(VLDST_HELPER_NAME(ldst, size, nregs, interleave, spacing)) \
+ (uint32_t startaddr, uint32_t rd) \
+{ \
+ const int stride = (1 << size) * interleave; \
+ int reg; \
+ uint32_t addr = startaddr; \
+ DEFINE_USER_VAR; \
+ for (reg = 0; reg < nregs; reg++) { \
+ if (interleave > 2 || (interleave == 2 && nregs == 2)) { \
+ addr = startaddr + (1 << size) * reg; \
+ } else if (interleave == 2 && nregs == 4 && reg == 2) { \
+ addr = startaddr + (1 << size); \
+ } \
+ DO_OP_##ldst##_##size(); \
+ rd += spacing; \
+ } \
+}
+
+/* The DO_OP_<ldst>_<size> macros below provide the per-iteration
+ * load/store payload; it varies with the operand size and with whether
+ * the op is a load or a store.
+ */
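+/* For example, the (ld, 1, 2, 2, 2) instantiation yields
+ * helper_neon_vld_1_2_2_2 (VLD2.16 with register spacing 2): stride is
+ * 4 bytes, so DO_OP_ld_1 gathers halfwords 0,2,4,6 into the first
+ * d-register and halfwords 1,3,5,7 into the register two above it.
+ */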
+#define DO_OP_ld_0() \
+ do { \
+ uint64_t tmp64 = 0ull; \
+ int i; \
+ for (i = 0; i < 8; i++, addr += stride) { \
+ tmp64 |= (uint64_t)LDB(addr) << (i * 8); \
+ } \
+ env->vfp.regs[rd] = make_float64(tmp64); \
+ } while (0)
+
+#define DO_OP_st_0() \
+ do { \
+ uint64_t tmp64 = float64_val(env->vfp.regs[rd]); \
+ int i; \
+ for (i = 0; i < 8; i++, addr += stride, tmp64 >>= 8) { \
+ STB(addr, tmp64); \
+ } \
+ } while (0)
+
+#define DO_OP_ld_1() \
+ do { \
+ uint64_t tmp64 = 0ull; \
+ int i; \
+ for (i = 0; i < 4; i++, addr += stride) { \
+ tmp64 |= (uint64_t)LDW(addr) << (i * 16); \
+ } \
+ env->vfp.regs[rd] = make_float64(tmp64); \
+ } while (0)
+
+#define DO_OP_st_1() \
+ do { \
+ uint64_t tmp64 = float64_val(env->vfp.regs[rd]); \
+ int i; \
+ for (i = 0; i < 4; i++, addr += stride, tmp64 >>= 16) { \
+ STW(addr, tmp64); \
+ } \
+ } while (0)
+
+#define DO_OP_ld_2() \
+ do { \
+ uint64_t tmp64 = (uint32_t)LDL(addr); \
+ addr += stride; \
+ tmp64 |= (uint64_t)LDL(addr) << 32; \
+ addr += stride; \
+ env->vfp.regs[rd] = make_float64(tmp64); \
+ } while (0)
+
+#define DO_OP_st_2() \
+ do { \
+ uint64_t tmp64 = float64_val(env->vfp.regs[rd]); \
+ STL(addr, tmp64); \
+ addr += stride; \
+ STL(addr, tmp64 >> 32); \
+ addr += stride; \
+ } while (0)
+
+#define DO_OP_ld_3() \
+ do { \
+ env->vfp.regs[rd] = make_float64(LDQ(addr)); \
+ addr += stride; \
+ } while (0)
+
+#define DO_OP_st_3() \
+ do { \
+ STQ(addr, float64_val(env->vfp.regs[rd])); \
+ addr += stride; \
+ } while (0)
+
+FOREACH_VLDST_HELPER(NEON_VLDST_HELPER)
+
+#undef LDB
+#undef LDW
+#undef LDL
+#undef LDQ
+#undef STB
+#undef STW
+#undef STL
+#undef STQ
@@ -3778,25 +3778,36 @@ static void gen_neon_trn_u16(TCGv t0, TCGv t1)
tcg_temp_free_i32(rd);
}
+/* The op field in the instruction indicates the (nregs, interleave,
+ * spacing) combination; for each combination we have a load helper and
+ * a store helper for each of the four operand sizes.
+ */
+#define VLDST_HELPER_POINTER_COMMA(ld, sz, n, i, s) \
+ HELPER(VLDST_HELPER_NAME(ld, sz, n, i, s)),
-static struct {
- int nregs;
- int interleave;
- int spacing;
-} neon_ls_element_type[11] = {
- {4, 4, 1},
- {4, 4, 2},
- {4, 1, 1},
- {4, 2, 1},
- {3, 3, 1},
- {3, 3, 2},
- {3, 1, 1},
- {1, 1, 1},
- {2, 2, 1},
- {2, 2, 2},
- {2, 1, 1}
+static void *neon_vldst_m_helpers[] = {
+ FOREACH_VLDST_HELPER(VLDST_HELPER_POINTER_COMMA)
};
+static void gen_neon_vldst_multiple(CPUState *env, TCGv addr, int rd,
+ int size, int load, int op)
+{
+ /* Generate a call to the correct helper for a neon vld/vst
+ * "multiple structures" form.
+ */
+ TCGv tmp = tcg_const_i32(rd);
+ void *func = neon_vldst_m_helpers[VLDST_HELPER_INDEX(load, size, op)];
+ TCGArg args[2];
+ int sizemask = 0;
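+    /* We pick the helper from a table at translate time, so we can't use
+     * the type-checked gen_helper_* wrappers; build the call by hand
+     * instead. sizemask describes the return and argument types for
+     * tcg_gen_helperN, in the same way def-helper.h does.
+     */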
+ dh_sizemask(void, 0);
+    args[0] = GET_TCGV_i32(addr);
+    dh_sizemask(i32, 1);
+ args[1] = GET_TCGV_i32(tmp);
+ dh_sizemask(i32, 2);
+ tcg_gen_helperN(func, 0, sizemask, TCG_CALL_DUMMY_ARG, 2, args);
+ tcg_temp_free_i32(tmp);
+}
+
/* Translate a NEON load/store element instruction. Return nonzero if the
instruction is invalid. */
static int disas_neon_ls_insn(CPUState * env, DisasContext *s, uint32_t insn)
@@ -3804,19 +3815,15 @@ static int disas_neon_ls_insn(CPUState * env, DisasContext *s, uint32_t insn)
int rd, rn, rm;
int op;
int nregs;
- int interleave;
- int spacing;
int stride;
int size;
int reg;
int pass;
int load;
int shift;
- int n;
TCGv addr;
TCGv tmp;
TCGv tmp2;
- TCGv_i64 tmp64;
if (!s->vfp_enabled)
return 1;
@@ -3845,97 +3852,14 @@ static int disas_neon_ls_insn(CPUState * env, DisasContext *s, uint32_t insn)
default:
break;
}
- nregs = neon_ls_element_type[op].nregs;
- interleave = neon_ls_element_type[op].interleave;
- spacing = neon_ls_element_type[op].spacing;
- if (size == 3 && (interleave | spacing) != 1)
+ if (size == 3 && ((op & 2) == 0 || op == 3)) {
+        /* VLD2/VLD3/VLD4 with 64-bit elements must UNDEF;
+         * only the VLD1/VST1 forms (ops 2, 6, 7 and 10) allow size 3.
+         */
return 1;
+ }
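+    /* op >> 2 groups the op values by register count: ops 0-3 use 4 regs,
+     * 4-6 use 3, 8-10 use 2; op 7 is the single-register VLD1/VST1 case.
+     */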
+ nregs = (4 - (op >> 2)) - ((op == 7) ? 2 : 0);
addr = tcg_temp_new_i32();
load_reg_var(s, addr, rn);
- stride = (1 << size) * interleave;
- for (reg = 0; reg < nregs; reg++) {
- if (interleave > 2 || (interleave == 2 && nregs == 2)) {
- load_reg_var(s, addr, rn);
- tcg_gen_addi_i32(addr, addr, (1 << size) * reg);
- } else if (interleave == 2 && nregs == 4 && reg == 2) {
- load_reg_var(s, addr, rn);
- tcg_gen_addi_i32(addr, addr, 1 << size);
- }
- if (size == 3) {
- if (load) {
- tmp64 = gen_ld64(addr, IS_USER(s));
- neon_store_reg64(tmp64, rd);
- tcg_temp_free_i64(tmp64);
- } else {
- tmp64 = tcg_temp_new_i64();
- neon_load_reg64(tmp64, rd);
- gen_st64(tmp64, addr, IS_USER(s));
- }
- tcg_gen_addi_i32(addr, addr, stride);
- } else {
- for (pass = 0; pass < 2; pass++) {
- if (size == 2) {
- if (load) {
- tmp = gen_ld32(addr, IS_USER(s));
- neon_store_reg(rd, pass, tmp);
- } else {
- tmp = neon_load_reg(rd, pass);
- gen_st32(tmp, addr, IS_USER(s));
- }
- tcg_gen_addi_i32(addr, addr, stride);
- } else if (size == 1) {
- if (load) {
- tmp = gen_ld16u(addr, IS_USER(s));
- tcg_gen_addi_i32(addr, addr, stride);
- tmp2 = gen_ld16u(addr, IS_USER(s));
- tcg_gen_addi_i32(addr, addr, stride);
- tcg_gen_shli_i32(tmp2, tmp2, 16);
- tcg_gen_or_i32(tmp, tmp, tmp2);
- tcg_temp_free_i32(tmp2);
- neon_store_reg(rd, pass, tmp);
- } else {
- tmp = neon_load_reg(rd, pass);
- tmp2 = tcg_temp_new_i32();
- tcg_gen_shri_i32(tmp2, tmp, 16);
- gen_st16(tmp, addr, IS_USER(s));
- tcg_gen_addi_i32(addr, addr, stride);
- gen_st16(tmp2, addr, IS_USER(s));
- tcg_gen_addi_i32(addr, addr, stride);
- }
- } else /* size == 0 */ {
- if (load) {
- TCGV_UNUSED(tmp2);
- for (n = 0; n < 4; n++) {
- tmp = gen_ld8u(addr, IS_USER(s));
- tcg_gen_addi_i32(addr, addr, stride);
- if (n == 0) {
- tmp2 = tmp;
- } else {
- tcg_gen_shli_i32(tmp, tmp, n * 8);
- tcg_gen_or_i32(tmp2, tmp2, tmp);
- tcg_temp_free_i32(tmp);
- }
- }
- neon_store_reg(rd, pass, tmp2);
- } else {
- tmp2 = neon_load_reg(rd, pass);
- for (n = 0; n < 4; n++) {
- tmp = tcg_temp_new_i32();
- if (n == 0) {
- tcg_gen_mov_i32(tmp, tmp2);
- } else {
- tcg_gen_shri_i32(tmp, tmp2, n * 8);
- }
- gen_st8(tmp, addr, IS_USER(s));
- tcg_gen_addi_i32(addr, addr, stride);
- }
- tcg_temp_free_i32(tmp2);
- }
- }
- }
- }
- rd += spacing;
- }
+ gen_neon_vldst_multiple(env, addr, rd, size, load, op);
tcg_temp_free_i32(addr);
stride = nregs * 8;
} else {
Move VLD/VST multiple into helper functions, as some cases can generate
more TCG ops than the maximum per-instruction limit and certainly more
than the recommended 20.

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
---
This patch is inspired by one from the meego tree:
http://git.linaro.org/gitweb?p=qemu/qemu-linaro.git;a=commitdiff;h=a5b2a79c7929726bac5157783de81d22793efd12
but I've reworked it to do the decoding at translate time rather than
in the helper function.

It is intended to apply on top of the neon load/store UNDEF fixes:
http://patchwork.ozlabs.org/patch/91824/
http://patchwork.ozlabs.org/patch/91825/
but I thought it would be better to push it out now for review rather
than waiting for those to be committed.

I hope you all like macros :-)

 target-arm/helper.h      |   40 +++++++++++++
 target-arm/neon_helper.c |  127 +++++++++++++++++++++++++++++++++++++++++
 target-arm/translate.c   |  140 +++++++++++----------------------------------
 3 files changed, 199 insertions(+), 108 deletions(-)