@@ -503,6 +503,44 @@ static inline void tcg_out_ext16u(TCGContext *s, int cond,
#endif
}
+static inline void tcg_out_bswap16(TCGContext *s, int cond, int rd, int rn)
+{ /* Emit ARM code, predicated on cond, that writes to rd the value of rn with its two low bytes swapped. */
+#ifdef USE_ARMV6_INSTRUCTIONS
+ /* rev16 rd, rn: cond is placed in bits 31:28, rd in bits 15:12 and rn in bits 3:0 of the instruction word. */
+ tcg_out32(s, 0x06bf0fb0 | (cond << 28) | (rd << 12) | rn);
+#else /* no rev16: three data-processing ops using TCG_REG_R8 as scratch (operand order presumably rd, rn, rm, shift -- confirm against tcg_out_dat_reg) */
+ tcg_out_dat_reg(s, cond, ARITH_MOV,
+ TCG_REG_R8, 0, rn, SHIFT_IMM_LSL(24)); /* r8 = rn << 24: low byte moved to bits 31:24 */
+ tcg_out_dat_reg(s, cond, ARITH_MOV,
+ TCG_REG_R8, 0, TCG_REG_R8, SHIFT_IMM_LSR(16)); /* r8 = r8 >> 16: that byte now sits in bits 15:8 */
+ tcg_out_dat_reg(s, cond, ARITH_ORR,
+ rd, TCG_REG_R8, rn, SHIFT_IMM_LSR(8)); /* rd = r8 | (rn >> 8); NOTE(review): bits 31:16 of rn would leak into rd, so this assumes they are zero -- confirm callers guarantee that */
+#endif
+}
+
+static inline void tcg_out_bswap32(TCGContext *s, int cond, int rd, int rn)
+{ /* Emit ARM code, predicated on cond, that writes to rd the value of rn with its four bytes reversed. */
+#ifdef USE_ARMV6_INSTRUCTIONS
+ /* rev rd, rn: cond is placed in bits 31:28, rd in bits 15:12 and rn in bits 3:0 of the instruction word. */
+ tcg_out32(s, 0x06bf0f30 | (cond << 28) | (rd << 12) | rn);
+#else
+ /* This code only uses one temporary register (TCG_REG_R8). There is probably
+ a faster way to do that with more temporary registers. The byte diagrams below write rn as ABCD (A = most significant byte); the BIC immediates are assumed to be raw ARM rotate/imm8 operand fields -- TODO confirm against tcg_out_dat_imm. */
+ tcg_out_dat_reg(s, cond, ARITH_MOV,
+ TCG_REG_R8, 0, rn, SHIFT_IMM_ROR(8)); /* r8 = rn ror 8: ABCD -> DABC */
+ tcg_out_dat_imm(s, cond, ARITH_BIC,
+ rd, TCG_REG_R8, 0xff); /* rd = r8 & ~0x000000ff: DAB0 */
+ tcg_out_dat_imm(s, cond, ARITH_BIC,
+ rd, rd, 0xff | 0x800); /* rotate field 8 -> mask 0x00ff0000; rd becomes D0B0 */
+ tcg_out_dat_imm(s, cond, ARITH_BIC,
+ TCG_REG_R8, TCG_REG_R8, 0xff | 0x400); /* rotate field 4 -> mask 0xff000000; r8 becomes 0ABC */
+ tcg_out_dat_imm(s, cond, ARITH_BIC,
+ TCG_REG_R8, TCG_REG_R8, 0xff | 0xc00); /* rotate field 12 -> mask 0x0000ff00; r8 becomes 0A0C */
+ tcg_out_dat_reg(s, cond, ARITH_ORR,
+ rd, rd, TCG_REG_R8, SHIFT_IMM_ROR(16)); /* rd = D0B0 | (0A0C ror 16 = 0C0A) = DCBA */
+#endif
+}
+
static inline void tcg_out_ld32_12(TCGContext *s, int cond,
int rd, int rn, tcg_target_long im)
{
@@ -1520,6 +1558,13 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
tcg_out_qemu_st(s, COND_AL, args, 3);
break;
+ case INDEX_op_bswap16_i32:
+ tcg_out_bswap16(s, COND_AL, args[0], args[1]);
+ break;
+ case INDEX_op_bswap32_i32:
+ tcg_out_bswap32(s, COND_AL, args[0], args[1]);
+ break;
+
case INDEX_op_ext8s_i32:
tcg_out_ext8s(s, COND_AL, args[0], args[1]);
break;
@@ -1607,6 +1652,9 @@ static const TCGTargetOpDef arm_op_defs[] = {
{ INDEX_op_qemu_st64, { "x", "D", "x", "X" } },
#endif
+ { INDEX_op_bswap16_i32, { "r", "r" } },
+ { INDEX_op_bswap32_i32, { "r", "r" } },
+
{ INDEX_op_ext8s_i32, { "r", "r" } },
{ INDEX_op_ext16s_i32, { "r", "r" } },
{ INDEX_op_ext16u_i32, { "r", "r" } },
@@ -62,8 +62,8 @@ enum {
#define TCG_TARGET_HAS_ext16s_i32
#undef TCG_TARGET_HAS_ext8u_i32 /* and r0, r1, #0xff */
#define TCG_TARGET_HAS_ext16u_i32
-// #define TCG_TARGET_HAS_bswap16_i32
-// #define TCG_TARGET_HAS_bswap32_i32
+#define TCG_TARGET_HAS_bswap16_i32 /* rev16 with USE_ARMV6_INSTRUCTIONS, else mov/mov/orr */
+#define TCG_TARGET_HAS_bswap32_i32 /* rev with USE_ARMV6_INSTRUCTIONS, else mov/bic x4/orr */
#define TCG_TARGET_HAS_not_i32
#define TCG_TARGET_HAS_neg_i32
#define TCG_TARGET_HAS_rot_i32
Add bswap16 and bswap32 ops, using either the rev and rev16 instructions on ARMv6+ or shifts and logical operations on earlier ARM versions. In both cases the result uses fewer instructions than the pure TCG version. These ops are also needed by the qemu_ld/st functions. Signed-off-by: Aurelien Jarno <aurelien@aurel32.net> --- tcg/arm/tcg-target.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ tcg/arm/tcg-target.h | 4 ++-- 2 files changed, 50 insertions(+), 2 deletions(-)