From patchwork Mon Feb 14 10:22:49 2011 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Peter Maydell X-Patchwork-Id: 83059 Return-Path: X-Original-To: incoming@patchwork.ozlabs.org Delivered-To: patchwork-incoming@bilbo.ozlabs.org Received: from lists.gnu.org (lists.gnu.org [199.232.76.165]) (using TLSv1 with cipher DHE-RSA-AES256-SHA (256/256 bits)) (Client did not present a certificate) by ozlabs.org (Postfix) with ESMTPS id 46E8CB71B2 for ; Mon, 14 Feb 2011 21:25:51 +1100 (EST) Received: from localhost ([127.0.0.1]:57314 helo=lists.gnu.org) by lists.gnu.org with esmtp (Exim 4.43) id 1Povci-0002nN-00 for incoming@patchwork.ozlabs.org; Mon, 14 Feb 2011 05:25:40 -0500 Received: from [140.186.70.92] (port=60701 helo=eggs.gnu.org) by lists.gnu.org with esmtp (Exim 4.43) id 1Pova9-0001ep-VU for qemu-devel@nongnu.org; Mon, 14 Feb 2011 05:23:03 -0500 Received: from Debian-exim by eggs.gnu.org with spam-scanned (Exim 4.71) (envelope-from ) id 1Pova8-0007GX-9s for qemu-devel@nongnu.org; Mon, 14 Feb 2011 05:23:01 -0500 Received: from mnementh.archaic.org.uk ([81.2.115.146]:38934) by eggs.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1Pova7-0007FP-Nh for qemu-devel@nongnu.org; Mon, 14 Feb 2011 05:23:00 -0500 Received: from pm215 by mnementh.archaic.org.uk with local (Exim 4.72) (envelope-from ) id 1PovZx-0000ts-Oj; Mon, 14 Feb 2011 10:22:49 +0000 From: Peter Maydell To: qemu-devel@nongnu.org Date: Mon, 14 Feb 2011 10:22:49 +0000 Message-Id: <1297678969-3433-3-git-send-email-peter.maydell@linaro.org> X-Mailer: git-send-email 1.7.2.3 In-Reply-To: <1297678969-3433-1-git-send-email-peter.maydell@linaro.org> References: <1297678969-3433-1-git-send-email-peter.maydell@linaro.org> X-detected-operating-system: by eggs.gnu.org: GNU/Linux 2.6 (newer, 2) X-Received-From: 81.2.115.146 Cc: patches@linaro.org Subject: [Qemu-devel] [PATCH v2 2/2] target-arm: Move Neon VZIP to helper functions X-BeenThere: qemu-devel@nongnu.org X-Mailman-Version: 2.1.5 Precedence: list List-Id: qemu-devel.nongnu.org List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Sender: qemu-devel-bounces+incoming=patchwork.ozlabs.org@nongnu.org Errors-To: qemu-devel-bounces+incoming=patchwork.ozlabs.org@nongnu.org Move the implementation of the Neon VUZP unzip instruction from inline code to helper functions. (At 50+ TCG ops it was well over the recommended limit for coding inline.) The helper implementations also give the correct answers where the inline implementation did not. Signed-off-by: Peter Maydell --- target-arm/helpers.h | 5 ++ target-arm/neon_helper.c | 92 ++++++++++++++++++++++++++++++++++++++ target-arm/translate.c | 109 +++++++++++++++------------------------------- 3 files changed, 133 insertions(+), 73 deletions(-) diff --git a/target-arm/helpers.h b/target-arm/helpers.h index 88c2fce..e494b47 100644 --- a/target-arm/helpers.h +++ b/target-arm/helpers.h @@ -465,5 +465,10 @@ DEF_HELPER_3(neon_unzip16, void, env, i32, i32) DEF_HELPER_3(neon_qunzip8, void, env, i32, i32) DEF_HELPER_3(neon_qunzip16, void, env, i32, i32) DEF_HELPER_3(neon_qunzip32, void, env, i32, i32) +DEF_HELPER_3(neon_zip8, void, env, i32, i32) +DEF_HELPER_3(neon_zip16, void, env, i32, i32) +DEF_HELPER_3(neon_qzip8, void, env, i32, i32) +DEF_HELPER_3(neon_qzip16, void, env, i32, i32) +DEF_HELPER_3(neon_qzip32, void, env, i32, i32) #include "def-helper.h" diff --git a/target-arm/neon_helper.c b/target-arm/neon_helper.c index 6e2a2fd..cee9ee4 100644 --- a/target-arm/neon_helper.c +++ b/target-arm/neon_helper.c @@ -1757,3 +1757,95 @@ void HELPER(neon_unzip16)(CPUState *env, uint32_t rd, uint32_t rm) env->vfp.regs[rm] = make_float64(m0); env->vfp.regs[rd] = make_float64(d0); } + +void HELPER(neon_qzip8)(CPUState *env, uint32_t rd, uint32_t rm) +{ + uint64_t zm0 = float64_val(env->vfp.regs[rm]); + uint64_t zm1 = float64_val(env->vfp.regs[rm + 1]); + uint64_t zd0 = float64_val(env->vfp.regs[rd]); + uint64_t zd1 = float64_val(env->vfp.regs[rd + 1]); + uint64_t d0 = ELEM(zd0, 0, 8) | (ELEM(zm0, 0, 8) << 8) + | (ELEM(zd0, 1, 8) << 16) | (ELEM(zm0, 1, 8) << 24) + | (ELEM(zd0, 2, 8) << 32) | (ELEM(zm0, 2, 8) << 40) + | (ELEM(zd0, 3, 8) << 48) | (ELEM(zm0, 3, 8) << 56); + uint64_t d1 = ELEM(zd0, 4, 8) | (ELEM(zm0, 4, 8) << 8) + | (ELEM(zd0, 5, 8) << 16) | (ELEM(zm0, 5, 8) << 24) + | (ELEM(zd0, 6, 8) << 32) | (ELEM(zm0, 6, 8) << 40) + | (ELEM(zd0, 7, 8) << 48) | (ELEM(zm0, 7, 8) << 56); + uint64_t m0 = ELEM(zd1, 0, 8) | (ELEM(zm1, 0, 8) << 8) + | (ELEM(zd1, 1, 8) << 16) | (ELEM(zm1, 1, 8) << 24) + | (ELEM(zd1, 2, 8) << 32) | (ELEM(zm1, 2, 8) << 40) + | (ELEM(zd1, 3, 8) << 48) | (ELEM(zm1, 3, 8) << 56); + uint64_t m1 = ELEM(zd1, 4, 8) | (ELEM(zm1, 4, 8) << 8) + | (ELEM(zd1, 5, 8) << 16) | (ELEM(zm1, 5, 8) << 24) + | (ELEM(zd1, 6, 8) << 32) | (ELEM(zm1, 6, 8) << 40) + | (ELEM(zd1, 7, 8) << 48) | (ELEM(zm1, 7, 8) << 56); + env->vfp.regs[rm] = make_float64(m0); + env->vfp.regs[rm + 1] = make_float64(m1); + env->vfp.regs[rd] = make_float64(d0); + env->vfp.regs[rd + 1] = make_float64(d1); +} + +void HELPER(neon_qzip16)(CPUState *env, uint32_t rd, uint32_t rm) +{ + uint64_t zm0 = float64_val(env->vfp.regs[rm]); + uint64_t zm1 = float64_val(env->vfp.regs[rm + 1]); + uint64_t zd0 = float64_val(env->vfp.regs[rd]); + uint64_t zd1 = float64_val(env->vfp.regs[rd + 1]); + uint64_t d0 = ELEM(zd0, 0, 16) | (ELEM(zm0, 0, 16) << 16) + | (ELEM(zd0, 1, 16) << 32) | (ELEM(zm0, 1, 16) << 48); + uint64_t d1 = ELEM(zd0, 2, 16) | (ELEM(zm0, 2, 16) << 16) + | (ELEM(zd0, 3, 16) << 32) | (ELEM(zm0, 3, 16) << 48); + uint64_t m0 = ELEM(zd1, 0, 16) | (ELEM(zm1, 0, 16) << 16) + | (ELEM(zd1, 1, 16) << 32) | (ELEM(zm1, 1, 16) << 48); + uint64_t m1 = ELEM(zd1, 2, 16) | (ELEM(zm1, 2, 16) << 16) + | (ELEM(zd1, 3, 16) << 32) | (ELEM(zm1, 3, 16) << 48); + env->vfp.regs[rm] = make_float64(m0); + env->vfp.regs[rm + 1] = make_float64(m1); + env->vfp.regs[rd] = make_float64(d0); + env->vfp.regs[rd + 1] = make_float64(d1); +} + +void HELPER(neon_qzip32)(CPUState *env, uint32_t rd, uint32_t rm) +{ + uint64_t zm0 = float64_val(env->vfp.regs[rm]); + uint64_t zm1 = float64_val(env->vfp.regs[rm + 1]); + uint64_t zd0 = float64_val(env->vfp.regs[rd]); + uint64_t zd1 = float64_val(env->vfp.regs[rd + 1]); + uint64_t d0 = ELEM(zd0, 0, 32) | (ELEM(zm0, 0, 32) << 32); + uint64_t d1 = ELEM(zd0, 1, 32) | (ELEM(zm0, 1, 32) << 32); + uint64_t m0 = ELEM(zd1, 0, 32) | (ELEM(zm1, 0, 32) << 32); + uint64_t m1 = ELEM(zd1, 1, 32) | (ELEM(zm1, 1, 32) << 32); + env->vfp.regs[rm] = make_float64(m0); + env->vfp.regs[rm + 1] = make_float64(m1); + env->vfp.regs[rd] = make_float64(d0); + env->vfp.regs[rd + 1] = make_float64(d1); +} + +void HELPER(neon_zip8)(CPUState *env, uint32_t rd, uint32_t rm) +{ + uint64_t zm = float64_val(env->vfp.regs[rm]); + uint64_t zd = float64_val(env->vfp.regs[rd]); + uint64_t d0 = ELEM(zd, 0, 8) | (ELEM(zm, 0, 8) << 8) + | (ELEM(zd, 1, 8) << 16) | (ELEM(zm, 1, 8) << 24) + | (ELEM(zd, 2, 8) << 32) | (ELEM(zm, 2, 8) << 40) + | (ELEM(zd, 3, 8) << 48) | (ELEM(zm, 3, 8) << 56); + uint64_t m0 = ELEM(zd, 4, 8) | (ELEM(zm, 4, 8) << 8) + | (ELEM(zd, 5, 8) << 16) | (ELEM(zm, 5, 8) << 24) + | (ELEM(zd, 6, 8) << 32) | (ELEM(zm, 6, 8) << 40) + | (ELEM(zd, 7, 8) << 48) | (ELEM(zm, 7, 8) << 56); + env->vfp.regs[rm] = make_float64(m0); + env->vfp.regs[rd] = make_float64(d0); +} + +void HELPER(neon_zip16)(CPUState *env, uint32_t rd, uint32_t rm) +{ + uint64_t zm = float64_val(env->vfp.regs[rm]); + uint64_t zd = float64_val(env->vfp.regs[rd]); + uint64_t d0 = ELEM(zd, 0, 16) | (ELEM(zm, 0, 16) << 16) + | (ELEM(zd, 1, 16) << 32) | (ELEM(zm, 1, 16) << 48); + uint64_t m0 = ELEM(zd, 2, 16) | (ELEM(zm, 2, 16) << 16) + | (ELEM(zd, 3, 16) << 32) | (ELEM(zm, 3, 16) << 48); + env->vfp.regs[rm] = make_float64(m0); + env->vfp.regs[rd] = make_float64(d0); +} diff --git a/target-arm/translate.c b/target-arm/translate.c index 6d94223..50c8fe0 100644 --- a/target-arm/translate.c +++ b/target-arm/translate.c @@ -3653,59 +3653,43 @@ static int gen_neon_unzip(int rd, int rm, int size, int q) return 0; } -static void gen_neon_zip_u8(TCGv t0, TCGv t1) -{ - TCGv rd, rm, tmp; - - rd = new_tmp(); - rm = new_tmp(); - tmp = new_tmp(); - - tcg_gen_andi_i32(rd, t0, 0xff); - tcg_gen_shli_i32(tmp, t1, 8); - tcg_gen_andi_i32(tmp, tmp, 0xff00); - tcg_gen_or_i32(rd, rd, tmp); - tcg_gen_shli_i32(tmp, t0, 16); - tcg_gen_andi_i32(tmp, tmp, 0xff0000); - tcg_gen_or_i32(rd, rd, tmp); - tcg_gen_shli_i32(tmp, t1, 24); - tcg_gen_andi_i32(tmp, tmp, 0xff000000); - tcg_gen_or_i32(rd, rd, tmp); - - tcg_gen_andi_i32(rm, t1, 0xff000000); - tcg_gen_shri_i32(tmp, t0, 8); - tcg_gen_andi_i32(tmp, tmp, 0xff0000); - tcg_gen_or_i32(rm, rm, tmp); - tcg_gen_shri_i32(tmp, t1, 8); - tcg_gen_andi_i32(tmp, tmp, 0xff00); - tcg_gen_or_i32(rm, rm, tmp); - tcg_gen_shri_i32(tmp, t0, 16); - tcg_gen_andi_i32(tmp, tmp, 0xff); - tcg_gen_or_i32(t1, rm, tmp); - tcg_gen_mov_i32(t0, rd); - - dead_tmp(tmp); - dead_tmp(rm); - dead_tmp(rd); -} - -static void gen_neon_zip_u16(TCGv t0, TCGv t1) +static int gen_neon_zip(int rd, int rm, int size, int q) { TCGv tmp, tmp2; - - tmp = new_tmp(); - tmp2 = new_tmp(); - - tcg_gen_andi_i32(tmp, t0, 0xffff); - tcg_gen_shli_i32(tmp2, t1, 16); - tcg_gen_or_i32(tmp, tmp, tmp2); - tcg_gen_andi_i32(t1, t1, 0xffff0000); - tcg_gen_shri_i32(tmp2, t0, 16); - tcg_gen_or_i32(t1, t1, tmp2); - tcg_gen_mov_i32(t0, tmp); - - dead_tmp(tmp2); - dead_tmp(tmp); + if (size == 3 || (!q && size == 2)) { + return 1; + } + tmp = tcg_const_i32(rd); + tmp2 = tcg_const_i32(rm); + if (q) { + switch (size) { + case 0: + gen_helper_neon_qzip8(cpu_env, tmp, tmp2); + break; + case 1: + gen_helper_neon_qzip16(cpu_env, tmp, tmp2); + break; + case 2: + gen_helper_neon_qzip32(cpu_env, tmp, tmp2); + break; + default: + abort(); + } + } else { + switch (size) { + case 0: + gen_helper_neon_zip8(cpu_env, tmp, tmp2); + break; + case 1: + gen_helper_neon_zip16(cpu_env, tmp, tmp2); + break; + default: + abort(); + } + } + tcg_temp_free_i32(tmp); + tcg_temp_free_i32(tmp2); + return 0; } static void gen_neon_trn_u8(TCGv t0, TCGv t1) @@ -5425,29 +5409,8 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn) } break; case 35: /* VZIP */ - /* Reg Before After - Rd A3 A2 A1 A0 B1 A1 B0 A0 - Rm B3 B2 B1 B0 B3 A3 B2 A2 - */ - if (size == 3) + if (gen_neon_zip(rd, rm, size, q)) { return 1; - count = (q ? 4 : 2); - for (n = 0; n < count; n++) { - tmp = neon_load_reg(rd, n); - tmp2 = neon_load_reg(rd, n); - switch (size) { - case 0: gen_neon_zip_u8(tmp, tmp2); break; - case 1: gen_neon_zip_u16(tmp, tmp2); break; - case 2: /* no-op */; break; - default: abort(); - } - neon_store_scratch(n * 2, tmp); - neon_store_scratch(n * 2 + 1, tmp2); - } - for (n = 0; n < count * 2; n++) { - int reg = (n < count) ? rd : rm; - tmp = neon_load_scratch(n); - neon_store_reg(reg, n % count, tmp); } break; case 36: case 37: /* VMOVN, VQMOVUN, VQMOVN */