diff mbox

[SH] PR 49263 - underutilized "TST #imm, R0" instruction

Message ID 1318641441.19997.404.camel@yam-132-YW-E178-FTW
State New
Headers show

Commit Message

Oleg Endo Oct. 15, 2011, 1:17 a.m. UTC
Hello,

the attached patch is the same as the last proposed patch in the PR but
with some fixed formatting and comments. Hope it's fine like that.

Tested against trunk rev 179778 with 

make -k -j4 check RUNTESTFLAGS="--target_board=sh-sim
\{-m2,-m2a-single,-m4-single,-m4a-single\}\{-mb,-ml\}"

and no new failures (ignoring the impossible -m2a-single -mb
combination).

Cheers,
Oleg

ChangeLog:

2011-10-15  Oleg Endo  <oleg.endo@t-online.de>

	PR target/49263
	* config/sh/sh.h (ZERO_EXTRACT_ANDMASK): New macro.
	* config/sh/sh.c (sh_rtx_costs): Add test instruction case.
	* config/sh/sh.md (tstsi_t): Name existing insn.  Make inner
	and instruction commutative.
	(tsthi_t, tstqi_t, tstqi_t_zero, tstsi_t_and_not,
	tstsi_t_zero_extract_eq, tstsi_t_zero_extract_xor,
	tstsi_t_zero_extract_subreg_xor_little,
	tstsi_t_zero_extract_subreg_xor_big): New insns.
	(*movsicc_t_false, *movsicc_t_true): Replace space with tab in
	asm output.
	(*andsi_compact): Reorder alternatives so that K08 is considered
	first.

testsuite/ChangeLog:

2011-10-15  Oleg Endo  <oleg.endo@t-online.de>

	PR target/49263
	* gcc.target/sh/pr49263.c: New.

Comments

Kaz Kojima Oct. 15, 2011, 2:35 a.m. UTC | #1
Oleg Endo <oleg.endo@t-online.de> wrote:
> the attached patch is the same as the last proposed patch in the PR but
> with some fixed formatting and comments. Hope it's fine like that.
> 
> Tested against trunk rev 179778 with 
> 
> make -k -j4 check RUNTESTFLAGS="--target_board=sh-sim
> \{-m2,-m2a-single,-m4-single,-m4a-single\}\{-mb,-ml\}"
> 
> and no new failures (ignoring the impossible -m2a-single -mb
> combination).

This patch is OK.  Thanks for working on this issue.
I've applied it on trunk as revision 180020.

Regards,
	kaz
diff mbox

Patch

Index: gcc/testsuite/gcc.target/sh/pr49263.c
===================================================================
--- gcc/testsuite/gcc.target/sh/pr49263.c	(revision 0)
+++ gcc/testsuite/gcc.target/sh/pr49263.c	(revision 0)
@@ -0,0 +1,86 @@ 
+/* Verify that TST #imm, R0 instruction is generated if the constant
+   allows it.  Under some circumstances another compare instruction might
+   be selected, which is also fine.  Any AND instructions are considered
+   counter productive and fail the test.  */
+/* { dg-do compile { target "sh*-*-*" } } */
+/* { dg-options "-O2" } */
+/* { dg-final { scan-assembler-not "and" } } */
+
+#define make_func(__valtype__, __valget__, __tstval__, __suff__)\
+  int test_imm_##__tstval__##__suff__ (__valtype__ val) \
+    {\
+      return ((__valget__) & (0x##__tstval__  << 0)) ? -20 : -40;\
+    }
+
+#define make_func_0_F(__valtype__, __valget__, __y__, __suff__)\
+  make_func (__valtype__, __valget__, __y__##0, __suff__)\
+  make_func (__valtype__, __valget__, __y__##1, __suff__)\
+  make_func (__valtype__, __valget__, __y__##2, __suff__)\
+  make_func (__valtype__, __valget__, __y__##3, __suff__)\
+  make_func (__valtype__, __valget__, __y__##4, __suff__)\
+  make_func (__valtype__, __valget__, __y__##5, __suff__)\
+  make_func (__valtype__, __valget__, __y__##6, __suff__)\
+  make_func (__valtype__, __valget__, __y__##7, __suff__)\
+  make_func (__valtype__, __valget__, __y__##8, __suff__)\
+  make_func (__valtype__, __valget__, __y__##9, __suff__)\
+  make_func (__valtype__, __valget__, __y__##A, __suff__)\
+  make_func (__valtype__, __valget__, __y__##B, __suff__)\
+  make_func (__valtype__, __valget__, __y__##C, __suff__)\
+  make_func (__valtype__, __valget__, __y__##D, __suff__)\
+  make_func (__valtype__, __valget__, __y__##E, __suff__)\
+  make_func (__valtype__, __valget__, __y__##F, __suff__)\
+
+#define make_funcs_0_FF(__valtype__, __valget__, __suff__)\
+  make_func_0_F (__valtype__, __valget__, 0, __suff__)\
+  make_func_0_F (__valtype__, __valget__, 1, __suff__)\
+  make_func_0_F (__valtype__, __valget__, 2, __suff__)\
+  make_func_0_F (__valtype__, __valget__, 3, __suff__)\
+  make_func_0_F (__valtype__, __valget__, 4, __suff__)\
+  make_func_0_F (__valtype__, __valget__, 5, __suff__)\
+  make_func_0_F (__valtype__, __valget__, 6, __suff__)\
+  make_func_0_F (__valtype__, __valget__, 7, __suff__)\
+  make_func_0_F (__valtype__, __valget__, 8, __suff__)\
+  make_func_0_F (__valtype__, __valget__, 9, __suff__)\
+  make_func_0_F (__valtype__, __valget__, A, __suff__)\
+  make_func_0_F (__valtype__, __valget__, B, __suff__)\
+  make_func_0_F (__valtype__, __valget__, C, __suff__)\
+  make_func_0_F (__valtype__, __valget__, D, __suff__)\
+  make_func_0_F (__valtype__, __valget__, E, __suff__)\
+  make_func_0_F (__valtype__, __valget__, F, __suff__)\
+
+make_funcs_0_FF (signed char*, *val, int8_mem)
+make_funcs_0_FF (signed char, val, int8_reg)
+
+make_funcs_0_FF (unsigned char*, *val, uint8_mem)
+make_funcs_0_FF (unsigned char, val, uint8_reg)
+
+make_funcs_0_FF (short*, *val, int16_mem)
+make_funcs_0_FF (short, val, int16_reg)
+
+make_funcs_0_FF (unsigned short*, *val, uint16_mem)
+make_funcs_0_FF (unsigned short, val, uint16_reg)
+
+make_funcs_0_FF (int*, *val, int32_mem)
+make_funcs_0_FF (int, val, int32_reg)
+
+make_funcs_0_FF (unsigned int*, *val, uint32_mem)
+make_funcs_0_FF (unsigned int, val, uint32_reg)
+
+make_funcs_0_FF (long long*, *val, int64_lowword_mem)
+make_funcs_0_FF (long long, val, int64_lowword_reg)
+
+make_funcs_0_FF (unsigned long long*, *val, uint64_lowword_mem)
+make_funcs_0_FF (unsigned long long, val, uint64_lowword_reg)
+
+make_funcs_0_FF (long long*, *val >> 32, int64_highword_mem)
+make_funcs_0_FF (long long, val >> 32, int64_highword_reg)
+
+make_funcs_0_FF (unsigned long long*, *val >> 32, uint64_highword_mem)
+make_funcs_0_FF (unsigned long long, val >> 32, uint64_highword_reg)
+
+make_funcs_0_FF (long long*, *val >> 16, int64_midword_mem)
+make_funcs_0_FF (long long, val >> 16, int64_midword_reg)
+
+make_funcs_0_FF (unsigned long long*, *val >> 16, uint64_midword_mem)
+make_funcs_0_FF (unsigned long long, val >> 16, uint64_midword_reg)
+
Index: gcc/config/sh/sh.c
===================================================================
--- gcc/config/sh/sh.c	(revision 179778)
+++ gcc/config/sh/sh.c	(working copy)
@@ -242,7 +242,7 @@ 
 static int flow_dependent_p (rtx, rtx);
 static void flow_dependent_p_1 (rtx, const_rtx, void *);
 static int shiftcosts (rtx);
-static int and_xor_ior_costs (rtx, int code);
+static int and_xor_ior_costs (rtx, int);
 static int addsubcosts (rtx);
 static int multcosts (rtx);
 static bool unspec_caller_rtx_p (rtx);
@@ -2995,6 +2995,20 @@ 
         *total = 8;
       return true;
 
+    case EQ:
+      /* An and with a constant compared against zero is
+	 most likely going to be a TST #imm, R0 instruction.
+	 Notice that this does not catch the zero_extract variants from
+	 the md file.  */
+      if (GET_CODE (XEXP (x, 0)) == AND
+	  && CONST_INT_P (XEXP (x, 1)) && INTVAL (XEXP (x, 1)) == 0)
+	{
+	  *total = 1;
+	  return true;
+	}
+      else
+	return false;
+
     case CONST:
     case LABEL_REF:
     case SYMBOL_REF:
Index: gcc/config/sh/sh.h
===================================================================
--- gcc/config/sh/sh.h	(revision 179778)
+++ gcc/config/sh/sh.h	(working copy)
@@ -1195,6 +1195,9 @@ 
 #define CONST_OK_FOR_K08(VALUE) (((HOST_WIDE_INT)(VALUE))>= 0 \
 				 && ((HOST_WIDE_INT)(VALUE)) <= 255)
 
+#define ZERO_EXTRACT_ANDMASK(EXTRACT_SZ_RTX, EXTRACT_POS_RTX)\
+  (((1 << INTVAL (EXTRACT_SZ_RTX)) - 1) << INTVAL (EXTRACT_POS_RTX))
+
 #if 0
 #define SECONDARY_INOUT_RELOAD_CLASS(CLASS,MODE,X,ELSE) \
   ((((REGCLASS_HAS_FP_REG (CLASS) 					\
Index: gcc/config/sh/sh.md
===================================================================
--- gcc/config/sh/sh.md	(revision 179778)
+++ gcc/config/sh/sh.md	(working copy)
@@ -585,15 +585,164 @@ 
 ;; SImode signed integer comparisons
 ;; -------------------------------------------------------------------------
 
-(define_insn ""
+;; Various patterns to generate the TST #imm, R0 instruction.
+;; Although this adds some pressure on the R0 register, it can potentially
+;; result in faster code, even if the operand has to be moved to R0 first.
+;; This is because on SH4 TST #imm, R0 and MOV Rm, Rn are both MT group 
+;; instructions and thus will be executed in parallel. On SH4A TST #imm, R0
+;; is an EX group instruction but still can be executed in parallel with the
+;; MT group MOV Rm, Rn instruction.
+
+;; Usual TST #imm, R0 patterns for SI, HI and QI
+;; This is usually used for bit patterns other than contiguous bits 
+;; and single bits.
+
+(define_insn "tstsi_t"
   [(set (reg:SI T_REG)
-	(eq:SI (and:SI (match_operand:SI 0 "arith_reg_operand" "z,r")
+	(eq:SI (and:SI (match_operand:SI 0 "logical_operand" "%z,r")
 		       (match_operand:SI 1 "logical_operand" "K08,r"))
 	       (const_int 0)))]
   "TARGET_SH1"
   "tst	%1,%0"
   [(set_attr "type" "mt_group")])
 
+(define_insn "tsthi_t"
+  [(set (reg:SI T_REG)
+	(eq:SI (subreg:SI (and:HI (match_operand:HI 0 "logical_operand" "%z")
+				  (match_operand 1 "const_int_operand")) 0)
+	       (const_int 0)))]
+  "TARGET_SH1
+   && CONST_OK_FOR_K08 (INTVAL (operands[1]))"
+  "tst	%1,%0"
+  [(set_attr "type" "mt_group")])
+
+(define_insn "tstqi_t"
+  [(set (reg:SI T_REG)
+	(eq:SI (subreg:SI (and:QI (match_operand:QI 0 "logical_operand" "%z")
+				  (match_operand 1 "const_int_operand")) 0)
+	       (const_int 0)))]
+  "TARGET_SH1
+   && (CONST_OK_FOR_K08 (INTVAL (operands[1])) 
+       || CONST_OK_FOR_I08 (INTVAL (operands[1])))"
+{
+  operands[1] = GEN_INT (INTVAL (operands[1]) & 255);
+  return "tst	%1,%0";
+}
+  [(set_attr "type" "mt_group")])
+
+;; Test low QI subreg against zero.
+;; This avoids unnecessary zero extension before the test.
+
+(define_insn "tstqi_t_zero"
+  [(set (reg:SI T_REG)
+	(eq:SI (match_operand:QI 0 "logical_operand" "z") (const_int 0)))]
+  "TARGET_SH1"
+  "tst	#255,%0"
+  [(set_attr "type" "mt_group")])
+
+;; Extract LSB, negate and store in T bit.
+
+(define_insn "tstsi_t_and_not"
+  [(set (reg:SI T_REG)
+	 (and:SI (not:SI (match_operand:SI 0 "logical_operand" "z"))
+		 (const_int 1)))]
+  "TARGET_SH1"
+  "tst	#1,%0"
+  [(set_attr "type" "mt_group")])
+
+;; Extract contiguous bits and compare them against zero.
+
+(define_insn "tstsi_t_zero_extract_eq"
+  [(set (reg:SI T_REG)
+	(eq:SI (zero_extract:SI (match_operand 0 "logical_operand" "z")
+		(match_operand:SI 1 "const_int_operand")
+		(match_operand:SI 2 "const_int_operand"))
+         (const_int 0)))]
+  "TARGET_SH1
+   && CONST_OK_FOR_K08 (ZERO_EXTRACT_ANDMASK (operands[1], operands[2]))"
+{
+  operands[1] = GEN_INT (ZERO_EXTRACT_ANDMASK (operands[1], operands[2]));
+  return "tst	%1,%0";
+}
+  [(set_attr "type" "mt_group")])
+
+;; This split is required when testing bits in a QI subreg.
+
+(define_split
+  [(set (reg:SI T_REG)
+   (eq:SI (if_then_else:SI (zero_extract:SI
+			    (match_operand 0 "logical_operand" "")
+			    (match_operand 1 "const_int_operand")
+			    (match_operand 2 "const_int_operand"))
+			   (match_operand 3 "const_int_operand")
+			   (const_int 0))
+	  (const_int 0)))]
+  "TARGET_SH1
+   && ZERO_EXTRACT_ANDMASK (operands[1], operands[2]) == INTVAL (operands[3])
+   && CONST_OK_FOR_K08 (INTVAL (operands[3]))"
+  [(set (reg:SI T_REG) (eq:SI (and:SI (match_dup 0) (match_dup 3))
+			      (const_int 0)))]
+  "
+{
+  if (GET_MODE (operands[0]) == QImode)
+    operands[0] = simplify_gen_subreg (SImode, operands[0], QImode, 0);
+}")
+
+;; Extract single bit, negate and store it in the T bit.
+;; Not used for SH4A.
+
+(define_insn "tstsi_t_zero_extract_xor"
+  [(set (reg:SI T_REG)
+	(zero_extract:SI (xor:SI (match_operand:SI 0 "logical_operand" "z")
+			  (match_operand:SI 3 "const_int_operand"))
+			 (match_operand:SI 1 "const_int_operand")
+			 (match_operand:SI 2 "const_int_operand")))]
+  "TARGET_SH1
+   && ZERO_EXTRACT_ANDMASK (operands[1], operands[2]) == INTVAL (operands[3])
+   && CONST_OK_FOR_K08 (INTVAL (operands[3]))"
+  "tst	%3,%0"
+  [(set_attr "type" "mt_group")])
+
+;; Extract single bit, negate and store it in the T bit.
+;; Used for SH4A little endian.
+
+(define_insn "tstsi_t_zero_extract_subreg_xor_little"
+  [(set (reg:SI T_REG)
+	(zero_extract:SI
+	 (subreg:QI (xor:SI (match_operand:SI 0 "logical_operand" "z")
+			    (match_operand:SI 3 "const_int_operand")) 0)
+	 (match_operand:SI 1 "const_int_operand")
+	 (match_operand:SI 2 "const_int_operand")))]
+  "TARGET_SH1 && TARGET_LITTLE_ENDIAN
+   && ZERO_EXTRACT_ANDMASK (operands[1], operands[2])
+      == (INTVAL (operands[3]) & 255)
+   && CONST_OK_FOR_K08 (INTVAL (operands[3]) & 255)"
+{
+  operands[3] = GEN_INT (INTVAL (operands[3]) & 255);
+  return "tst	%3,%0";
+}
+  [(set_attr "type" "mt_group")])
+
+;; Extract single bit, negate and store it in the T bit.
+;; Used for SH4A big endian.
+
+(define_insn "tstsi_t_zero_extract_subreg_xor_big"
+  [(set (reg:SI T_REG)
+	(zero_extract:SI
+	 (subreg:QI (xor:SI (match_operand:SI 0 "logical_operand" "z")
+			    (match_operand:SI 3 "const_int_operand")) 3)
+	 (match_operand:SI 1 "const_int_operand")
+	 (match_operand:SI 2 "const_int_operand")))]
+  "TARGET_SH1 && ! TARGET_LITTLE_ENDIAN
+   && ZERO_EXTRACT_ANDMASK (operands[1], operands[2])
+      == (INTVAL (operands[3]) & 255)
+   && CONST_OK_FOR_K08 (INTVAL (operands[3]) & 255)"
+{
+  operands[3] = GEN_INT (INTVAL (operands[3]) & 255);
+  return "tst	%3,%0";
+}
+  [(set_attr "type" "mt_group")])
+
 ;; ??? Perhaps should only accept reg/constant if the register is reg 0.
 ;; That would still allow reload to create cmpi instructions, but would
 ;; perhaps allow forcing the constant into a register when that is better.
@@ -1157,7 +1306,7 @@ 
    && (arith_reg_operand (operands[1], SImode)
        || (immediate_operand (operands[1], SImode)
 	   && satisfies_constraint_I08 (operands[1])))"
-  "bt 0f\;mov %1,%0\\n0:"
+  "bt	0f\;mov	%1,%0\\n0:"
   [(set_attr "type" "mt_group,arith") ;; poor approximation
    (set_attr "length" "4")])
 
@@ -1170,7 +1319,7 @@ 
    && (arith_reg_operand (operands[1], SImode)
        || (immediate_operand (operands[1], SImode)
 	   && satisfies_constraint_I08 (operands[1])))"
-  "bf 0f\;mov %1,%0\\n0:"
+  "bf	0f\;mov	%1,%0\\n0:"
   [(set_attr "type" "mt_group,arith") ;; poor approximation
    (set_attr "length" "4")])
 
@@ -3015,9 +3164,9 @@ 
 ;; -------------------------------------------------------------------------
 
 (define_insn "*andsi3_compact"
-  [(set (match_operand:SI 0 "arith_reg_dest" "=r,z")
+  [(set (match_operand:SI 0 "arith_reg_dest" "=z,r")
 	(and:SI (match_operand:SI 1 "arith_reg_operand" "%0,0")
-		(match_operand:SI 2 "logical_operand" "r,K08")))]
+		(match_operand:SI 2 "logical_operand" "K08,r")))]
   "TARGET_SH1"
   "and	%2,%0"
   [(set_attr "type" "arith")])