From patchwork Sat Dec 31 04:54:49 2011
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: Richard Henderson <rth@twiddle.net>
X-Patchwork-Id: 133725
Return-Path: <qemu-devel-bounces+incoming=patchwork.ozlabs.org@nongnu.org>
X-Original-To: incoming@patchwork.ozlabs.org
Delivered-To: patchwork-incoming@bilbo.ozlabs.org
Received: from lists.gnu.org (lists.gnu.org [140.186.70.17])
	(using TLSv1 with cipher AES256-SHA (256/256 bits))
	(Client did not present a certificate)
	by ozlabs.org (Postfix) with ESMTPS id B93F9B6FC9
	for <incoming@patchwork.ozlabs.org>;
	Sat, 31 Dec 2011 15:56:29 +1100 (EST)
Received: from localhost ([::1]:54686 helo=lists.gnu.org)
	by lists.gnu.org with esmtp (Exim 4.71) (envelope-from
	<qemu-devel-bounces+incoming=patchwork.ozlabs.org@nongnu.org>)
	id 1Rgqza-0008SC-2l
	for incoming@patchwork.ozlabs.org; Fri, 30 Dec 2011 23:56:26 -0500
Received: from eggs.gnu.org ([140.186.70.92]:52037)
	by lists.gnu.org with esmtp (Exim 4.71)
	(envelope-from <rth7680@gmail.com>) id 1RgqzR-0008S6-Qr
	for qemu-devel@nongnu.org; Fri, 30 Dec 2011 23:56:18 -0500
Received: from Debian-exim by eggs.gnu.org with spam-scanned (Exim 4.71)
	(envelope-from <rth7680@gmail.com>) id 1RgqzQ-0000fI-B7
	for qemu-devel@nongnu.org; Fri, 30 Dec 2011 23:56:17 -0500
Received: from mail-gx0-f173.google.com ([209.85.161.173]:45215)
	by eggs.gnu.org with esmtp (Exim 4.71)
	(envelope-from <rth7680@gmail.com>) id 1RgqzQ-0000f5-8b
	for qemu-devel@nongnu.org; Fri, 30 Dec 2011 23:56:16 -0500
Received: by mail-gx0-f173.google.com with SMTP id k1so10741955ggn.4
	for <qemu-devel@nongnu.org>; Fri, 30 Dec 2011 20:56:16 -0800 (PST)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gmail.com; s=gamma;
	h=sender:from:to:cc:subject:date:message-id:x-mailer:in-reply-to
	:references; bh=W7V8VElmVZWQktCN9ka/tnbGdfDERx1AVRJtvhiMOgY=;
	b=JODCBEtO0jj4b5T5HK9Evux5uP9Hp8jgqrxohAyYS0pADad8WXZNeS9Y8UGGPT3wRz
	9F7jrZREhddMDwjP81xYr4YBT7EYR78lYOy51GjqeyEXmYWRQZQH4Dl3YI8iVFlPj/dj
	lC3lgiF8hpL+HUO2hvzDTYrHGm0ND6FFiHU1I=
Received: by 10.101.211.3 with SMTP id n3mr8950515anq.75.1325307375990;
	Fri, 30 Dec 2011 20:56:15 -0800 (PST)
Received: from pebble.com ([101.172.149.215]) by mx.google.com with ESMTPS id
	v8sm11097049yhi.10.2011.12.30.20.56.13
	(version=TLSv1/SSLv3 cipher=OTHER);
	Fri, 30 Dec 2011 20:56:15 -0800 (PST)
From: Richard Henderson <rth@twiddle.net>
To: qemu-devel@nongnu.org
Date: Sat, 31 Dec 2011 15:54:49 +1100
Message-Id: <1325307291-6334-2-git-send-email-rth@twiddle.net>
X-Mailer: git-send-email 1.7.7.4
In-Reply-To: <1325307291-6334-1-git-send-email-rth@twiddle.net>
References: <1325307291-6334-1-git-send-email-rth@twiddle.net>
X-detected-operating-system: by eggs.gnu.org: GNU/Linux 2.6 (newer, 2)
X-Received-From: 209.85.161.173
Cc: peter.maydell@linaro.org, afaerber@suse.de,
	Aurelien Jarno <aurelien@aurel32.net>
Subject: [Qemu-devel] [PATCH 2/4] target-mips: Use TCG registers for the FPU.
X-BeenThere: qemu-devel@nongnu.org
X-Mailman-Version: 2.1.14
Precedence: list
List-Id: <qemu-devel.nongnu.org>
List-Unsubscribe: <https://lists.nongnu.org/mailman/options/qemu-devel>,
	<mailto:qemu-devel-request@nongnu.org?subject=unsubscribe>
List-Archive: <http://lists.nongnu.org/archive/html/qemu-devel>
List-Post: <mailto:qemu-devel@nongnu.org>
List-Help: <mailto:qemu-devel-request@nongnu.org?subject=help>
List-Subscribe: <https://lists.nongnu.org/mailman/listinfo/qemu-devel>,
	<mailto:qemu-devel-request@nongnu.org?subject=subscribe>
Errors-To: qemu-devel-bounces+incoming=patchwork.ozlabs.org@nongnu.org
Sender: qemu-devel-bounces+incoming=patchwork.ozlabs.org@nongnu.org

With normal FP, this doesn't have much effect on the generated code,
because most of the FP operations are not CONST/PURE, and so we spill
registers at about the same frequency as the explicit loads/stores.

But with Loongson multimedia instructions, which are all integral and
whose helpers are in fact CONST+PURE, this greatly improves the code.

On a 64-bit host, rather than over-use the deposit operation, we
create TCG registers for both the 64-bit FPU register as a whole
and the two 32-bit halves.  We only ever reference the whole register
or the two half registers in any one TB, so there's no problem with
aliasing.

On a 32-bit host, we only create the 64-bit FPU registers, and then
directly reference the internal 32-bit TCG register halves as needed.

Signed-off-by: Richard Henderson <rth@twiddle.net>
---
 target-mips/translate.c |  123 ++++++++++++++++++++++++++++++++++++-----------
 1 files changed, 95 insertions(+), 28 deletions(-)

diff --git a/target-mips/translate.c b/target-mips/translate.c
index d2aeff0..b6a7aeb 100644
--- a/target-mips/translate.c
+++ b/target-mips/translate.c
@@ -486,6 +486,14 @@ static TCGv cpu_dspctrl, btarget, bcond;
 static TCGv_i32 hflags;
 static TCGv_i32 fpu_fcr0, fpu_fcr31;
 
+/* FPU registers.  The 32-bit and 64-bit views alias the same storage, but
+   any one TB uses only one view, selected by MIPS_HFLAG_F64.  */
+#if TCG_TARGET_REG_BITS == 64
+static TCGv_i32 fpu_f32[32];    /* low 32-bit halves */
+static TCGv_i32 fpu_fh32[32];   /* high 32-bit halves */
+#endif
+static TCGv_i64 fpu_f64[32];    /* whole 64-bit registers */
+
 static uint32_t gen_opc_hflags[OPC_BUF_SIZE];
 
 #include "gen-icount.h"
@@ -555,6 +563,14 @@ static const char *fregnames[] =
       "f16", "f17", "f18", "f19", "f20", "f21", "f22", "f23",
       "f24", "f25", "f26", "f27", "f28", "f29", "f30", "f31", };
 
+#if TCG_TARGET_REG_BITS == 64  /* only 64-bit hosts create the fpu_fh32[] globals */
+static const char *fhregnames[] =    /* TCG names for the fpu_fh32[] globals */
+    { "fh0",  "fh1",  "fh2",  "fh3",  "fh4",  "fh5",  "fh6",  "fh7",
+      "fh8",  "fh9",  "fh10", "fh11", "fh12", "fh13", "fh14", "fh15",
+      "fh16", "fh17", "fh18", "fh19", "fh20", "fh21", "fh22", "fh23",
+      "fh24", "fh25", "fh26", "fh27", "fh28", "fh29", "fh30", "fh31", };
+#endif
+
 #ifdef MIPS_DEBUG_DISAS
 #define MIPS_DEBUG(fmt, ...)                         \
         qemu_log_mask(CPU_LOG_TB_IN_ASM,                \
@@ -647,55 +663,91 @@ static inline void gen_store_srsgpr (int from, int to)
 }
 
 /* Floating point register moves. */
-static inline void gen_load_fpr32 (DisasContext *ctx, TCGv_i32 t, int reg)
+static void gen_load_fpr32 (DisasContext *ctx, TCGv_i32 t, int reg)
 {
-    tcg_gen_ld_i32(t, cpu_env, offsetof(CPUState, active_fpu.fpr[reg].w[FP_ENDIAN_IDX]));
+#if TCG_TARGET_REG_BITS == 32  /* Copy the low 32 bits of FPR 'reg' into t. */
+    tcg_gen_mov_i32(t, TCGV_LOW(fpu_f64[reg]));
+#else
+    if (ctx->hflags & MIPS_HFLAG_F64) {
+        tcg_gen_trunc_i64_i32(t, fpu_f64[reg]);  /* F64: truncate whole reg */
+    } else {
+        tcg_gen_mov_i32(t, fpu_f32[reg]);        /* else use low-half global */
+    }
+#endif
 }
 
-static inline void gen_store_fpr32 (DisasContext *ctx, TCGv_i32 t, int reg)
+static void gen_store_fpr32 (DisasContext *ctx, TCGv_i32 t, int reg)
 {
-    tcg_gen_st_i32(t, cpu_env, offsetof(CPUState, active_fpu.fpr[reg].w[FP_ENDIAN_IDX]));
+#if TCG_TARGET_REG_BITS == 32  /* Write t to the low 32 bits of FPR 'reg'. */
+    tcg_gen_mov_i32(TCGV_LOW(fpu_f64[reg]), t);
+#else
+    if (ctx->hflags & MIPS_HFLAG_F64) {
+        TCGv_i64 t64 = MAKE_TCGV_I64(GET_TCGV_I32(t));  /* NOTE(review): t reused as an i64 operand - confirm */
+        tcg_gen_deposit_i64(fpu_f64[reg], fpu_f64[reg], t64, 0, 32);  /* offset 0, length 32 */
+    } else {
+        tcg_gen_mov_i32(fpu_f32[reg], t);  /* !F64: write the low-half global */
+    }
+#endif
 }
 
-static inline void gen_load_fpr32h (DisasContext *ctx, TCGv_i32 t, int reg)
+static void gen_load_fpr32h (DisasContext *ctx, TCGv_i32 t, int reg)
 {
-    tcg_gen_ld_i32(t, cpu_env, offsetof(CPUState, active_fpu.fpr[reg].w[!FP_ENDIAN_IDX]));
+#if TCG_TARGET_REG_BITS == 32  /* Copy the high 32 bits of FPR 'reg' into t. */
+    tcg_gen_mov_i32(t, TCGV_HIGH(fpu_f64[reg]));
+#else
+    if (ctx->hflags & MIPS_HFLAG_F64) {
+        TCGv_i64 t64 = tcg_temp_new_i64();
+        tcg_gen_shri_i64(t64, fpu_f64[reg], 32);  /* shift the high half down */
+        tcg_gen_trunc_i64_i32(t, t64);
+        tcg_temp_free_i64(t64);
+    } else {
+        tcg_gen_mov_i32(t, fpu_fh32[reg]);        /* !F64: high-half global */
+    }
+#endif
 }
 
-static inline void gen_store_fpr32h (DisasContext *ctx, TCGv_i32 t, int reg)
+static void gen_store_fpr32h (DisasContext *ctx, TCGv_i32 t, int reg)
 {
-    tcg_gen_st_i32(t, cpu_env, offsetof(CPUState, active_fpu.fpr[reg].w[!FP_ENDIAN_IDX]));
+#if TCG_TARGET_REG_BITS == 32  /* Write t to the high 32 bits of FPR 'reg'. */
+    tcg_gen_mov_i32(TCGV_HIGH(fpu_f64[reg]), t);
+#else
+    if (ctx->hflags & MIPS_HFLAG_F64) {
+        TCGv_i64 t64 = MAKE_TCGV_I64(GET_TCGV_I32(t));  /* NOTE(review): t reused as an i64 operand - confirm */
+        tcg_gen_deposit_i64(fpu_f64[reg], fpu_f64[reg], t64, 32, 32);  /* offset 32, length 32 */
+    } else {
+        tcg_gen_mov_i32(fpu_fh32[reg], t);  /* !F64: write the high-half global */
+    }
+#endif
 }
 
-static inline void gen_load_fpr64 (DisasContext *ctx, TCGv_i64 t, int reg)
+static void gen_load_fpr64 (DisasContext *ctx, TCGv_i64 t, int reg)
 {
     if (ctx->hflags & MIPS_HFLAG_F64) {
-        tcg_gen_ld_i64(t, cpu_env, offsetof(CPUState, active_fpu.fpr[reg].d));
+        tcg_gen_mov_i64(t, fpu_f64[reg]);  /* F64: copy the whole register */
     } else {
-        TCGv_i32 t0 = tcg_temp_new_i32();
-        TCGv_i32 t1 = tcg_temp_new_i32();
-        gen_load_fpr32(ctx, t0, reg & ~1);
-        gen_load_fpr32(ctx, t1, reg | 1);
-        tcg_gen_concat_i32_i64(t, t0, t1);
-        tcg_temp_free_i32(t0);
-        tcg_temp_free_i32(t1);
+#if TCG_TARGET_REG_BITS == 32  /* !F64: even reg gives low half, odd reg gives high half */
+        tcg_gen_concat32_i64(t, fpu_f64[reg & ~1], fpu_f64[reg | 1]);
+#else
+        tcg_gen_concat_i32_i64(t, fpu_f32[reg & ~1], fpu_f32[reg | 1]);
+#endif
     }
 }
 
-static inline void gen_store_fpr64 (DisasContext *ctx, TCGv_i64 t, int reg)
+static void gen_store_fpr64 (DisasContext *ctx, TCGv_i64 t, int reg)
 {
     if (ctx->hflags & MIPS_HFLAG_F64) {
-        tcg_gen_st_i64(t, cpu_env, offsetof(CPUState, active_fpu.fpr[reg].d));
+        tcg_gen_mov_i64(fpu_f64[reg], t);  /* F64: overwrite the whole register */
     } else {
-        TCGv_i64 t0 = tcg_temp_new_i64();
-        TCGv_i32 t1 = tcg_temp_new_i32();
-        tcg_gen_trunc_i64_i32(t1, t);
-        gen_store_fpr32(ctx, t1, reg & ~1);
-        tcg_gen_shri_i64(t0, t, 32);
-        tcg_gen_trunc_i64_i32(t1, t0);
-        gen_store_fpr32(ctx, t1, reg | 1);
-        tcg_temp_free_i32(t1);
-        tcg_temp_free_i64(t0);
+#if TCG_TARGET_REG_BITS == 32  /* !F64: low half of t to even reg, high half to odd reg */
+        tcg_gen_mov_i32(TCGV_LOW(fpu_f64[reg & ~1]), TCGV_LOW(t));
+        tcg_gen_mov_i32(TCGV_LOW(fpu_f64[reg | 1]), TCGV_HIGH(t));
+#else
+        TCGv_i64 t64 = tcg_temp_new_i64();
+        tcg_gen_shri_i64(t64, t, 32);                  /* high half of t ... */
+        tcg_gen_trunc_i64_i32(fpu_f32[reg | 1], t64);  /* ... goes to the odd reg */
+        tcg_temp_free_i64(t64);
+        tcg_gen_trunc_i64_i32(fpu_f32[reg & ~1], t);   /* low half to the even reg */
+#endif
     }
 }
 
@@ -12681,6 +12733,21 @@ static void mips_tcg_init(void)
                                        offsetof(CPUState, active_fpu.fcr31),
                                        "fcr31");
 
+#if TCG_TARGET_REG_BITS == 64
+    for (i = 0; i < 32; i++) {  /* low 32-bit halves */
+        int off = offsetof(CPUState, active_fpu.fpr[i].w[FP_ENDIAN_IDX]);
+        fpu_f32[i] = tcg_global_mem_new_i32(TCG_AREG0, off, fregnames[i]);
+    }
+    for (i = 0; i < 32; i++) {  /* high 32-bit halves */
+        int off = offsetof(CPUState, active_fpu.fpr[i].w[!FP_ENDIAN_IDX]);
+        fpu_fh32[i] = tcg_global_mem_new_i32(TCG_AREG0, off, fhregnames[i]);
+    }
+#endif
+    for (i = 0; i < 32; i++) {  /* whole registers: start at .d, not at a 32-bit half */
+        int off = offsetof(CPUState, active_fpu.fpr[i].d);
+        fpu_f64[i] = tcg_global_mem_new_i64(TCG_AREG0, off, fregnames[i]);
+    }
+
     /* register helpers */
 #define GEN_HELPER 2
 #include "helper.h"