@@ -99,18 +99,31 @@ static const int tcg_target_call_oarg_regs[] = {
# define TCG_REG_L1 TCG_REG_EDX
#endif
+/* The host compiler should supply <cpuid.h> to enable runtime features
+ detection, as we're not going to go so far as our own inline assembly.
+ If not available, default values will be assumed. */
+#if defined(CONFIG_CPUID_H)
+#include <cpuid.h>
+#endif
+
/* For 32-bit, we are going to attempt to determine at runtime whether cmov
- is available. However, the host compiler must supply <cpuid.h>, as we're
- not going to go so far as our own inline assembly. */
+ is available. */
#if TCG_TARGET_REG_BITS == 64
# define have_cmov 1
#elif defined(CONFIG_CPUID_H)
-#include <cpuid.h>
static bool have_cmov;
#else
# define have_cmov 0
#endif
+/* If bit_MOVBE is defined in cpuid.h (added in GCC version 4.6), we are
+ going to attempt to determine at runtime whether movbe is available. */
+#if defined(CONFIG_CPUID_H) && defined(bit_MOVBE)
+static bool have_movbe;
+#else
+# define have_movbe 0
+#endif
+
static uint8_t *tb_ret_addr;
static void patch_reloc(uint8_t *code_ptr, int type,
@@ -280,6 +293,8 @@ static inline int tcg_target_const_match(tcg_target_long val,
#define OPC_MOVB_EvIz (0xc6)
#define OPC_MOVL_EvIz (0xc7)
#define OPC_MOVL_Iv (0xb8)
+#define OPC_MOVBE_GyMy (0xf0 | P_EXT2)
+#define OPC_MOVBE_MyGy (0xf1 | P_EXT2)
#define OPC_MOVSBL (0xbe | P_EXT)
#define OPC_MOVSWL (0xbf | P_EXT)
#define OPC_MOVSLQ (0x63 | P_REXW)
@@ -1363,8 +1378,13 @@ static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
break;
case MO_SW:
if (bswap) {
- tcg_out_modrm_offset(s, OPC_MOVZWL + seg, datalo, base, ofs);
- tcg_out_rolw_8(s, datalo);
+ if (have_movbe) {
+ tcg_out_modrm_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
+ datalo, base, ofs);
+ } else {
+ tcg_out_modrm_offset(s, OPC_MOVZWL + seg, datalo, base, ofs);
+ tcg_out_rolw_8(s, datalo);
+ }
tcg_out_modrm(s, OPC_MOVSWL + P_REXW, datalo, datalo);
} else {
tcg_out_modrm_offset(s, OPC_MOVSWL + P_REXW + seg,
@@ -1372,16 +1392,25 @@ static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
}
break;
case MO_UL:
- tcg_out_modrm_offset(s, OPC_MOVL_GvEv + seg, datalo, base, ofs);
- if (bswap) {
- tcg_out_bswap32(s, datalo);
+ if (bswap && have_movbe) {
+ tcg_out_modrm_offset(s, OPC_MOVBE_GyMy + seg, datalo, base, ofs);
+ } else {
+ tcg_out_modrm_offset(s, OPC_MOVL_GvEv + seg, datalo, base, ofs);
+ if (bswap) {
+ tcg_out_bswap32(s, datalo);
+ }
}
break;
#if TCG_TARGET_REG_BITS == 64
case MO_SL:
if (bswap) {
- tcg_out_modrm_offset(s, OPC_MOVL_GvEv + seg, datalo, base, ofs);
- tcg_out_bswap32(s, datalo);
+ if (have_movbe) {
+ tcg_out_modrm_offset(s, OPC_MOVBE_GyMy + seg,
+ datalo, base, ofs);
+ } else {
+ tcg_out_modrm_offset(s, OPC_MOVL_GvEv + seg, datalo, base, ofs);
+ tcg_out_bswap32(s, datalo);
+ }
tcg_out_ext32s(s, datalo, datalo);
} else {
tcg_out_modrm_offset(s, OPC_MOVSLQ + seg, datalo, base, ofs);
@@ -1390,29 +1419,34 @@ static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
#endif
case MO_Q:
if (TCG_TARGET_REG_BITS == 64) {
- tcg_out_modrm_offset(s, OPC_MOVL_GvEv + P_REXW + seg,
- datalo, base, ofs);
- if (bswap) {
- tcg_out_bswap64(s, datalo);
+ if (bswap && have_movbe) {
+ tcg_out_modrm_offset(s, OPC_MOVBE_GyMy + P_REXW + seg,
+ datalo, base, ofs);
+ } else {
+ tcg_out_modrm_offset(s, OPC_MOVL_GvEv + P_REXW + seg,
+ datalo, base, ofs);
+ if (bswap) {
+ tcg_out_bswap64(s, datalo);
+ }
}
} else {
+ int opc = OPC_MOVL_GvEv;
if (bswap) {
int t = datalo;
datalo = datahi;
datahi = t;
+ if (have_movbe) {
+ opc = OPC_MOVBE_GyMy;
+ }
}
if (base != datalo) {
- tcg_out_modrm_offset(s, OPC_MOVL_GvEv + seg,
- datalo, base, ofs);
- tcg_out_modrm_offset(s, OPC_MOVL_GvEv + seg,
- datahi, base, ofs + 4);
+ tcg_out_modrm_offset(s, opc + seg, datalo, base, ofs);
+ tcg_out_modrm_offset(s, opc + seg, datahi, base, ofs + 4);
} else {
- tcg_out_modrm_offset(s, OPC_MOVL_GvEv + seg,
- datahi, base, ofs + 4);
- tcg_out_modrm_offset(s, OPC_MOVL_GvEv + seg,
- datalo, base, ofs);
+ tcg_out_modrm_offset(s, opc + seg, datahi, base, ofs + 4);
+ tcg_out_modrm_offset(s, opc + seg, datalo, base, ofs);
}
- if (bswap) {
+ if (bswap && opc != OPC_MOVBE_GyMy) {
tcg_out_bswap32(s, datalo);
tcg_out_bswap32(s, datahi);
}
@@ -1506,31 +1540,48 @@ static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
datalo, base, ofs);
break;
case MO_16:
- if (bswap) {
- tcg_out_mov(s, TCG_TYPE_I32, scratch, datalo);
- tcg_out_rolw_8(s, scratch);
- datalo = scratch;
+ if (bswap & have_movbe) {
+ tcg_out_modrm_offset(s, OPC_MOVBE_MyGy + P_DATA16 + seg,
+ datalo, base, ofs);
+ } else {
+ if (bswap) {
+ tcg_out_mov(s, TCG_TYPE_I32, scratch, datalo);
+ tcg_out_rolw_8(s, scratch);
+ datalo = scratch;
+ }
+ tcg_out_modrm_offset(s, OPC_MOVL_EvGv + P_DATA16 + seg,
+ datalo, base, ofs);
}
- tcg_out_modrm_offset(s, OPC_MOVL_EvGv + P_DATA16 + seg,
- datalo, base, ofs);
break;
case MO_32:
- if (bswap) {
- tcg_out_mov(s, TCG_TYPE_I32, scratch, datalo);
- tcg_out_bswap32(s, scratch);
- datalo = scratch;
+ if (bswap & have_movbe) {
+ tcg_out_modrm_offset(s, OPC_MOVBE_MyGy + seg, datalo, base, ofs);
+ } else {
+ if (bswap) {
+ tcg_out_mov(s, TCG_TYPE_I32, scratch, datalo);
+ tcg_out_bswap32(s, scratch);
+ datalo = scratch;
+ }
+ tcg_out_modrm_offset(s, OPC_MOVL_EvGv + seg, datalo, base, ofs);
}
- tcg_out_modrm_offset(s, OPC_MOVL_EvGv + seg, datalo, base, ofs);
break;
case MO_64:
if (TCG_TARGET_REG_BITS == 64) {
- if (bswap) {
- tcg_out_mov(s, TCG_TYPE_I64, scratch, datalo);
- tcg_out_bswap64(s, scratch);
- datalo = scratch;
+ if (bswap && have_movbe) {
+ tcg_out_modrm_offset(s, OPC_MOVBE_MyGy + P_REXW + seg,
+ datalo, base, ofs);
+ } else {
+ if (bswap) {
+ tcg_out_mov(s, TCG_TYPE_I64, scratch, datalo);
+ tcg_out_bswap64(s, scratch);
+ datalo = scratch;
+ }
+ tcg_out_modrm_offset(s, OPC_MOVL_EvGv + P_REXW + seg,
+ datalo, base, ofs);
}
- tcg_out_modrm_offset(s, OPC_MOVL_EvGv + P_REXW + seg,
- datalo, base, ofs);
+ } else if (bswap && have_movbe) {
+ tcg_out_modrm_offset(s, OPC_MOVBE_MyGy + seg, datahi, base, ofs);
+ tcg_out_modrm_offset(s, OPC_MOVBE_MyGy + seg, datalo, base, ofs+4);
} else if (bswap) {
tcg_out_mov(s, TCG_TYPE_I32, scratch, datahi);
tcg_out_bswap32(s, scratch);
@@ -2167,13 +2218,24 @@ static void tcg_target_qemu_prologue(TCGContext *s)
static void tcg_target_init(TCGContext *s)
{
- /* For 32-bit, 99% certainty that we're running on hardware that supports
- cmov, but we still need to check. In case cmov is not available, we'll
- use a small forward branch. */
-#ifndef have_cmov
+#if !(defined(have_cmov) && defined(have_movbe))
{
unsigned a, b, c, d;
- have_cmov = (__get_cpuid(1, &a, &b, &c, &d) && (d & bit_CMOV));
+ int ret;
+ ret = __get_cpuid(1, &a, &b, &c, &d);
+
+# ifndef have_cmov
+ /* For 32-bit, 99% certainty that we're running on hardware that
+ supports cmov, but we still need to check. In case cmov is not
+ available, we'll use a small forward branch. */
+ have_cmov = ret && (d & bit_CMOV);
+# endif
+
+# ifndef have_movbe
+ /* MOVBE is only available on Intel Atom and Haswell CPUs, so we
+ need to probe for it. */
+ have_movbe = ret && (c & bit_MOVBE);
+# endif
}
#endif
The movbe instruction has been added on some Intel Atom CPUs and on recent Intel Haswell CPUs. It allows to load/store a value and at the same time bswap it. This patch detects the avaibility of this instruction and when available use it in the qemu load/store routines in replacement of load/store + bswap. Note that for 16-bit unsigned loads, movbe + movzw is basically the same as movzw + bswap, so the patch doesn't touch this case. Signed-off-by: Aurelien Jarno <aurelien@aurel32.net> --- tcg/i386/tcg-target.c | 152 ++++++++++++++++++++++++++++++++++--------------- 1 file changed, 107 insertions(+), 45 deletions(-)