@@ -33,10 +33,11 @@
do { } while (0)
#endif
-#define TCG_CT_CONST_32 0x100
-#define TCG_CT_CONST_NEG 0x200
-#define TCG_CT_CONST_ADDI 0x400
-#define TCG_CT_CONST_MULI 0x800
+#define TCG_CT_CONST_32 0x0100
+#define TCG_CT_CONST_NEG 0x0200
+#define TCG_CT_CONST_ADDI 0x0400
+#define TCG_CT_CONST_MULI 0x0800
+#define TCG_CT_CONST_ANDI 0x1000
#define TCG_TMP0 TCG_REG_R14
@@ -353,6 +354,10 @@ static int target_parse_constraint(TCGArgConstraint *ct, const char **pct_str)
ct->ct &= ~TCG_CT_REG;
ct->ct |= TCG_CT_CONST_MULI;
break;
+ case 'A':
+ ct->ct &= ~TCG_CT_REG;
+ ct->ct |= TCG_CT_CONST_ANDI;
+ break;
default:
break;
}
@@ -362,9 +367,66 @@ static int target_parse_constraint(TCGArgConstraint *ct, const char **pct_str)
return 0;
}
+/* Immediates to be used with logical AND.  This is an optimization only,
+   since a full 64-bit immediate AND can always be performed with 4 sequential
+   NI[LH][LH] instructions.  What we're looking for are immediates that we
+   can load efficiently, where the immediate load plus the reg-reg AND is
+   smaller than the sequential NIs.  */
+
+static int tcg_match_andi(int ct, tcg_target_ulong val)
+{
+ int i;
+
+ if (facilities & FACILITY_EXT_IMM) {
+ if (ct & TCG_CT_CONST_32) {
+            /* All 32-bit ANDs can be performed with a single 48-bit insn. */
+ return 1;
+ }
+
+ /* Zero-extensions. */
+ if (val == 0xff || val == 0xffff || val == 0xffffffff) {
+ return 1;
+ }
+ } else {
+ if (ct & TCG_CT_CONST_32) {
+ val = (uint32_t)val;
+ } else if (val == 0xffffffff) {
+ return 1;
+ }
+ }
+
+ /* Try all 32-bit insns that can perform it in one go. */
+ for (i = 0; i < 4; i++) {
+ tcg_target_ulong mask = ~(0xffffull << i*16);
+ if ((val & mask) == mask) {
+ return 1;
+ }
+ }
+
+ /* Look for 16-bit values performing the mask. These are better
+ to load with LLI[LH][LH]. */
+ for (i = 0; i < 4; i++) {
+ tcg_target_ulong mask = 0xffffull << i*16;
+ if ((val & mask) == val) {
+ return 0;
+ }
+ }
+
+    /* Look for 32-bit values performing the 64-bit mask.  These are
+       better to load with LLI[LH]F or, if the extended-immediate facility
+       is not available, with a pair of LLI insns.  */
+ if ((ct & TCG_CT_CONST_32) == 0) {
+ if (val <= 0xffffffff || (val & 0xffffffff) == 0) {
+ return 0;
+ }
+ }
+
+ return 1;
+}
+
/* Test if a constant matches the constraint. */
-static inline int tcg_target_const_match(tcg_target_long val,
- const TCGArgConstraint *arg_ct)
+static int tcg_target_const_match(tcg_target_long val,
+ const TCGArgConstraint *arg_ct)
{
int ct = arg_ct->ct;
@@ -401,6 +463,8 @@ static inline int tcg_target_const_match(tcg_target_long val,
} else {
return val == (int16_t)val;
}
+ } else if (ct & TCG_CT_CONST_ANDI) {
+ return tcg_match_andi(ct, val);
}
return 0;
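
To make the heuristic concrete, here is a standalone sketch of the matching
logic for the extended-immediate, 64-bit (non-CONST_32) branch only, run over
a few sample constants.  The helper name and the test values are illustrative
and are not part of the patch:

#include <stdio.h>
#include <stdint.h>

/* Mirrors tcg_match_andi above, assuming FACILITY_EXT_IMM and a
   64-bit operand.  Returns 1 if the AND-immediate sequence is used,
   0 if the constant is better loaded into a register for NGR.  */
static int match_andi_ext_imm(uint64_t val)
{
    int i;

    /* Zero-extensions: a single insn does the AND.  */
    if (val == 0xff || val == 0xffff || val == 0xffffffff) {
        return 1;
    }
    /* A single NI[LH][LH] insn covers it.  */
    for (i = 0; i < 4; i++) {
        uint64_t mask = ~(0xffffull << i * 16);
        if ((val & mask) == mask) {
            return 1;
        }
    }
    /* 16-bit value: smaller as an LLI[LH][LH] load plus NGR.  */
    for (i = 0; i < 4; i++) {
        uint64_t mask = 0xffffull << i * 16;
        if ((val & mask) == val) {
            return 0;
        }
    }
    /* 32-bit value: better as an LLI[LH]F load plus NGR.  */
    if (val <= 0xffffffff || (val & 0xffffffff) == 0) {
        return 0;
    }
    return 1;
}

int main(void)
{
    static const uint64_t tests[] = {
        0xfffffffffffff000ull,  /* page mask: 1, AND immediate */
        0x0000000000000ff0ull,  /* small index mask: 0, load + NGR */
        0x00000000fffff000ull,  /* 32-bit value: 0, prefer LLILF + NGR */
        0xff00ff00ff00ff00ull,  /* no single insn: 1, NIF pair still used */
    };
    size_t i;

    for (i = 0; i < sizeof(tests) / sizeof(tests[0]); i++) {
        printf("%#018llx -> %d\n",
               (unsigned long long)tests[i], match_andi_ext_imm(tests[i]));
    }
    return 0;
}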
@@ -764,37 +828,6 @@ static void tgen64_addi(TCGContext *s, TCGReg dest, int64_t val)
}
-static void tgen32_andi(TCGContext *s, TCGReg dest, uint32_t val)
-{
- /* Zero-th, look for no-op. */
- if (val == -1) {
- return;
- }
-
- /* First, look for the zero-extensions. */
- if (val == 0xff) {
- tgen_ext8u(s, dest, dest);
- return;
- }
- if (val == 0xffff) {
- tgen_ext16u(s, dest, dest);
- return;
- }
-
- /* Second, try all 32-bit insns that can perform it in one go. */
- if ((val & 0xffff0000) == 0xffff0000) {
- tcg_out_insn(s, RI, NILL, dest, val);
- return;
- }
- if ((val & 0x0000ffff) == 0x0000ffff) {
- tcg_out_insn(s, RI, NILH, dest, val >> 16);
- return;
- }
-
- /* Lastly, perform the entire operation with a 48-bit insn. */
- tcg_out_insn(s, RIL, NILF, dest, val);
-}
-
static void tgen64_andi(TCGContext *s, TCGReg dest, tcg_target_ulong val)
{
static const S390Opcode ni_insns[4] = {
@@ -806,69 +839,61 @@ static void tgen64_andi(TCGContext *s, TCGReg dest, tcg_target_ulong val)
int i;
- /* Zero-th, look for no-op. */
+ /* Look for no-op. */
if (val == -1) {
return;
}
- /* First, look for the zero-extensions. */
- if (val == 0xff) {
- tgen_ext8u(s, dest, dest);
- return;
- }
- if (val == 0xffff) {
- tgen_ext16u(s, dest, dest);
- return;
- }
+ /* Look for the zero-extensions. */
if (val == 0xffffffff) {
tgen_ext32u(s, dest, dest);
return;
}
- /* Second, try all 32-bit insns that can perform it in one go. */
- for (i = 0; i < 4; i++) {
- tcg_target_ulong mask = ~(0xffffull << i*16);
- if ((val & mask) == mask) {
- tcg_out_insn_RI(s, ni_insns[i], dest, val >> i*16);
+ if (facilities & FACILITY_EXT_IMM) {
+ if (val == 0xff) {
+ tgen_ext8u(s, TCG_TYPE_I64, dest, dest);
return;
}
- }
-
- /* Third, try all 48-bit insns that can perform it in one go. */
- for (i = 0; i < 2; i++) {
- tcg_target_ulong mask = ~(0xffffffffull << i*32);
- if ((val & mask) == mask) {
- tcg_out_insn_RIL(s, nif_insns[i], dest, val >> i*32);
+ if (val == 0xffff) {
+ tgen_ext16u(s, TCG_TYPE_I64, dest, dest);
return;
}
- }
- /* Fourth, look for masks that can be loaded with one instruction
- into a register. This is slightly smaller than using two 48-bit
- masks, as below. */
- for (i = 0; i < 4; i++) {
- tcg_target_ulong mask = ~(0xffffull << i*16);
- if ((val & mask) == 0) {
- tcg_out_movi(s, TCG_TYPE_I64, TCG_TMP0, val);
- tcg_out_insn(s, RRE, NGR, dest, TCG_TMP0);
- return;
+ /* Try all 32-bit insns that can perform it in one go. */
+ for (i = 0; i < 4; i++) {
+ tcg_target_ulong mask = ~(0xffffull << i*16);
+ if ((val & mask) == mask) {
+ tcg_out_insn_RI(s, ni_insns[i], dest, val >> i*16);
+ return;
+ }
}
- }
- for (i = 0; i < 2; i++) {
- tcg_target_ulong mask = ~(0xffffffffull << i*32);
- if ((val & mask) == 0) {
- tcg_out_movi(s, TCG_TYPE_I64, TCG_TMP0, val);
- tcg_out_insn(s, RRE, NGR, dest, TCG_TMP0);
- return;
+ /* Try all 48-bit insns that can perform it in one go. */
+ if (facilities & FACILITY_EXT_IMM) {
+ for (i = 0; i < 2; i++) {
+ tcg_target_ulong mask = ~(0xffffffffull << i*32);
+ if ((val & mask) == mask) {
+ tcg_out_insn_RIL(s, nif_insns[i], dest, val >> i*32);
+ return;
+ }
+ }
}
- }
- /* Last, perform the AND via sequential modifications to the
- high and low parts. Do this via recursion to handle 16-bit
- vs 32-bit masks in each half. */
- tgen64_andi(s, dest, val | 0xffffffff00000000ull);
- tgen64_andi(s, dest, val | 0x00000000ffffffffull);
+        /* Perform the AND via sequential modifications to the high and low
+           parts.  Do this via recursion to handle 16-bit vs 32-bit masks in
+           each half.  */
+        tgen64_andi(s, dest, val | 0xffffffff00000000ull);
+        tgen64_andi(s, dest, val | 0x00000000ffffffffull);
+ } else {
+ /* With no extended-immediate facility, just emit the sequence. */
+ for (i = 0; i < 4; i++) {
+ tcg_target_ulong mask = 0xffffull << i*16;
+ if ((val & mask) != mask) {
+ tcg_out_insn_RI(s, ni_insns[i], dest, val >> i*16);
+ }
+ }
+ }
}
static void tgen64_ori(TCGContext *s, TCGReg dest, tcg_target_ulong val)
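
As an illustration of the selection order in the extended-immediate path of
tgen64_andi, here is a standalone sketch that prints the mnemonics the
function would choose instead of encoding them.  The register number and
helper name are made up for the example:

#include <stdio.h>
#include <stdint.h>

/* Mirror of the selection order above; assumes FACILITY_EXT_IMM.  */
static void andi_sketch(uint64_t val)
{
    static const char *ni[4]  = { "nill", "nilh", "nihl", "nihh" };
    static const char *nif[2] = { "nilf", "nihf" };
    int i;

    if (val == (uint64_t)-1) {
        return;                           /* no-op */
    }
    if (val == 0xff || val == 0xffff || val == 0xffffffff) {
        printf("\tzero-extend low %d bits\n",
               val == 0xff ? 8 : val == 0xffff ? 16 : 32);
        return;
    }
    for (i = 0; i < 4; i++) {             /* one 32-bit RI insn */
        uint64_t mask = ~(0xffffull << i * 16);
        if ((val & mask) == mask) {
            printf("\t%s\t%%r2,0x%x\n", ni[i],
                   (unsigned)(val >> i * 16) & 0xffff);
            return;
        }
    }
    for (i = 0; i < 2; i++) {             /* one 48-bit RIL insn */
        uint64_t mask = ~(0xffffffffull << i * 32);
        if ((val & mask) == mask) {
            printf("\t%s\t%%r2,0x%x\n", nif[i], (unsigned)(val >> i * 32));
            return;
        }
    }
    /* Recurse into the two halves, exactly as the patch does.  */
    andi_sketch(val | 0xffffffff00000000ull);
    andi_sketch(val | 0x00000000ffffffffull);
}

int main(void)
{
    andi_sketch(0xff00ff00ff00ff00ull);   /* prints nilf, then nihf */
    return 0;
}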
@@ -1121,6 +1146,16 @@ static void tcg_out_qemu_st_direct(TCGContext *s, int opc, TCGReg data,
}
#if defined(CONFIG_SOFTMMU)
+static void tgen64_andi_tmp(TCGContext *s, TCGReg dest, tcg_target_ulong val)
+{
+    if (tcg_match_andi(0, val)) {
+        tgen64_andi(s, dest, val);
+    } else {
+        tcg_out_movi(s, TCG_TYPE_I64, TCG_TMP0, val);
+        tcg_out_insn(s, RRE, NGR, dest, TCG_TMP0);
+    }
+}
+
static void tcg_prepare_qemu_ldst(TCGContext* s, TCGReg data_reg,
TCGReg addr_reg, int mem_index, int opc,
uint16_t **label2_ptr_p, int is_store)
@@ -1140,8 +1175,8 @@ static void tcg_prepare_qemu_ldst(TCGContext* s, TCGReg data_reg,
tcg_out_sh64(s, RSY_SRLG, arg1, addr_reg, SH64_REG_NONE,
TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS);
- tgen64_andi(s, arg0, TARGET_PAGE_MASK | ((1 << s_bits) - 1));
- tgen64_andi(s, arg1, (CPU_TLB_SIZE - 1) << CPU_TLB_ENTRY_BITS);
+ tgen64_andi_tmp(s, arg0, TARGET_PAGE_MASK | ((1 << s_bits) - 1));
+ tgen64_andi_tmp(s, arg1, (CPU_TLB_SIZE - 1) << CPU_TLB_ENTRY_BITS);
if (is_store) {
ofs = offsetof(CPUState, tlb_table[mem_index][0].addr_write);
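
For the two TLB masks this gives exactly the split the heuristic predicts:
the page mask matches and tgen64_andi handles it with a single NI insn, while
the small index mask fails the match and is loaded with LLILL before the NGR.
A sketch with assumed configuration values (4 KiB pages, CPU_TLB_SIZE 256,
CPU_TLB_ENTRY_BITS 5; the real values depend on the target):

#include <stdio.h>
#include <stdint.h>

/* Assumed example configuration; real values vary per guest.  */
#define TARGET_PAGE_BITS   12
#define TARGET_PAGE_MASK   (~((1ull << TARGET_PAGE_BITS) - 1))
#define CPU_TLB_SIZE       256
#define CPU_TLB_ENTRY_BITS 5

int main(void)
{
    int s_bits = 3;                     /* 8-byte access */
    uint64_t mask0 = TARGET_PAGE_MASK | ((1 << s_bits) - 1);
    uint64_t mask1 = (uint64_t)(CPU_TLB_SIZE - 1) << CPU_TLB_ENTRY_BITS;

    /* mask0 = 0xfffffffffffff007: matches, one NILL insn suffices.
       mask1 = 0x0000000000001fe0: 16-bit value, LLILL + NGR is smaller.  */
    printf("mask0 = %#llx\nmask1 = %#llx\n",
           (unsigned long long)mask0, (unsigned long long)mask1);
    return 0;
}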
@@ -1413,7 +1448,7 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
case INDEX_op_and_i32:
if (const_args[2]) {
- tgen32_andi(s, args[0], args[2]);
+ tgen64_andi(s, args[0], args[2] | 0xffffffff00000000ull);
} else {
tcg_out_insn(s, RR, NR, args[0], args[2]);
}
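
The and_i32 case above reuses the 64-bit helper by widening the constant with
ones in the high half.  Since the high 32 bits of a 32-bit op's result are
undefined in TCG, the widened AND is a no-op on the high half and the helper
is free to pick the cheapest encoding.  A minimal standalone sanity check of
the widening identity, not part of the patch:

#include <stdint.h>
#include <assert.h>

int main(void)
{
    uint32_t c = 0x0000ff00;
    uint64_t reg = 0x123456789abcdef0ull;
    uint64_t wide = (uint64_t)c | 0xffffffff00000000ull;

    /* Low 32 bits match the 32-bit AND; high 32 bits pass through.  */
    assert((uint32_t)(reg & wide) == ((uint32_t)reg & c));
    assert((reg & wide) >> 32 == reg >> 32);
    return 0;
}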
@@ -1728,7 +1763,7 @@ static const TCGTargetOpDef s390_op_defs[] = {
{ INDEX_op_div2_i32, { "b", "a", "0", "1", "r" } },
{ INDEX_op_divu2_i32, { "b", "a", "0", "1", "r" } },
- { INDEX_op_and_i32, { "r", "0", "ri" } },
+ { INDEX_op_and_i32, { "r", "0", "rWA" } },
{ INDEX_op_or_i32, { "r", "0", "ri" } },
{ INDEX_op_xor_i32, { "r", "0", "ri" } },
{ INDEX_op_neg_i32, { "r", "r" } },
@@ -1789,7 +1824,7 @@ static const TCGTargetOpDef s390_op_defs[] = {
{ INDEX_op_div2_i64, { "b", "a", "0", "1", "r" } },
{ INDEX_op_divu2_i64, { "b", "a", "0", "1", "r" } },
- { INDEX_op_and_i64, { "r", "0", "ri" } },
+ { INDEX_op_and_i64, { "r", "0", "rA" } },
{ INDEX_op_or_i64, { "r", "0", "ri" } },
{ INDEX_op_xor_i64, { "r", "0", "ri" } },
{ INDEX_op_neg_i64, { "r", "r" } },
The 32-bit immediate AND instructions are in the extended-immediate
facility.  Use these only if present.  At the same time, pull the
logic to load immediates into registers into a constraint letter
for TCG.

Signed-off-by: Richard Henderson <rth@twiddle.net>
---
 tcg/s390/tcg-target.c |  209 ++++++++++++++++++++++++++++--------------------
 1 files changed, 122 insertions(+), 87 deletions(-)
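
As a rough standalone illustration of the constraint-letter idea from the
message above: each letter ORs a flag into the operand's constraint, so "rWA"
reads as "register, or a constant satisfying the 32-bit ANDI test".  The
parser loop below is a simplified stand-in for the TCG machinery (real TCG
parses one letter per target_parse_constraint call), and the reading of 'W'
as marking a 32-bit operand (TCG_CT_CONST_32) is inferred from its use in the
and_i32 constraint:

#include <stdio.h>

#define CT_REG        0x0001
#define CT_CONST_32   0x0100   /* mirrors TCG_CT_CONST_32 */
#define CT_CONST_ANDI 0x1000   /* mirrors TCG_CT_CONST_ANDI */

static int parse_constraints(const char *str)
{
    int ct = 0;

    for (; *str; str++) {
        switch (*str) {
        case 'r':                       /* any register */
            ct |= CT_REG;
            break;
        case 'W':                       /* operand is 32-bit */
            ct |= CT_CONST_32;
            break;
        case 'A':                       /* constant usable as AND immediate */
            ct |= CT_CONST_ANDI;
            break;
        }
    }
    return ct;
}

int main(void)
{
    printf("\"rWA\" -> %#06x\n", parse_constraints("rWA"));  /* 0x1101 */
    printf("\"rA\"  -> %#06x\n", parse_constraints("rA"));   /* 0x1001 */
    return 0;
}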