@@ -413,6 +413,10 @@ static bool tcg_target_const_match(int64_t val, int ct,
#define OPC_UD2 (0x0b | P_EXT)
#define OPC_VPBLENDD (0x02 | P_EXT3A | P_DATA16)
#define OPC_VPBLENDVB (0x4c | P_EXT3A | P_DATA16)
+#define OPC_VPBLENDMB (0x66 | P_EXT38 | P_DATA16 | P_EVEX)
+#define OPC_VPBLENDMW (0x66 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
+#define OPC_VPBLENDMD (0x64 | P_EXT38 | P_DATA16 | P_EVEX)
+#define OPC_VPBLENDMQ (0x64 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPCMPB (0x3f | P_EXT3A | P_DATA16 | P_EVEX)
#define OPC_VPCMPUB (0x3e | P_EXT3A | P_DATA16 | P_EVEX)
#define OPC_VPCMPW (0x3f | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
@@ -738,6 +742,16 @@ static void tcg_out_vex_modrm_type(TCGContext *s, int opc,
tcg_out_vex_modrm(s, opc, r, v, rm);
}
+static void tcg_out_evex_modrm_type(TCGContext *s, int opc, int r, int v,
+ int rm, int aaa, bool z, TCGType type)
+{
+ if (type == TCG_TYPE_V256) {
+ opc |= P_VEXL;
+ }
+ tcg_out_evex_opc(s, opc, r, v, rm, 0, aaa, z);
+ tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
+}
+
/* Output an opcode with a full "rm + (index<<shift) + offset" address mode.
We handle either RM and INDEX missing with a negative value. In 64-bit
mode for absolute addresses, ~RM is the size of the immediate operand
@@ -3183,6 +3197,33 @@ static void tcg_out_cmp_vec(TCGContext *s, TCGType type, unsigned vece,
}
}
+static void tcg_out_cmpsel_vec_k1(TCGContext *s, TCGType type, unsigned vece,
+ TCGReg v0, TCGReg c1, TCGReg c2,
+ TCGReg v3, TCGReg v4, TCGCond cond)
+{
+ static const int vpblendm_insn[] = {
+ OPC_VPBLENDMB, OPC_VPBLENDMW, OPC_VPBLENDMD, OPC_VPBLENDMQ
+ };
+ bool z = false;
+
+ /*
+ * We have already eliminated !V3 && !V4.
+ * Swap to place constant in V4 to take advantage of zero-masking.
+ */
+ if (!v3) {
+ z = true;
+ v3 = v4;
+ cond = tcg_invert_cond(cond);
+ } else if (!v4) {
+ z = true;
+ v4 = v3;
+ }
+
+ tcg_out_cmp_vec_k1(s, type, vece, c1, c2, cond);
+ tcg_out_evex_modrm_type(s, vpblendm_insn[vece], v0, v4, v3,
+ /* k1 */1, z, type);
+}
+
static void tcg_out_cmpsel_vec(TCGContext *s, TCGType type, unsigned vece,
TCGReg v0, TCGReg c1, TCGReg c2,
TCGReg v3, TCGReg v4, TCGCond cond)
@@ -3196,6 +3237,11 @@ static void tcg_out_cmpsel_vec(TCGContext *s, TCGType type, unsigned vece,
return;
}
+ if (vece <= MO_16 ? have_avx512bw : have_avx512vl) {
+ tcg_out_cmpsel_vec_k1(s, type, vece, v0, c1, c2, v3, v4, cond);
+ return;
+ }
+
if (tcg_out_cmp_vec_noinv(s, type, vece, TCG_TMP_VEC, c1, c2, cond)) {
TCGReg swap = v3;
v3 = v4;
The avx512 vpblendm* instructions exactly implement cmpsel, using a predicate input. Of course this matches nicely with the avx512 predicate comparison instructions. Signed-off-by: Richard Henderson <richard.henderson@linaro.org> --- tcg/i386/tcg-target.c.inc | 46 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+)