diff mbox

[SH] Add support for inlined builtin-strcmp (2/2)

Message ID 525FF0F9.4010704@st.com
State New
Headers show

Commit Message

Christian Bruel Oct. 17, 2013, 2:15 p.m. UTC
Hello,

This patch adds support to inline an optimized version of strcmp when
not optimizing for size. The generated code makes use of the cmp/str
instruction to test 4 bytes at a time when correctly aligned.

note that a new pattern was added to match the cmp/str instruction, but
no attempt was made to catch it from combine.

This results in general cycles improvements (against both newlib and
glibc implementations), one of which is a 10%  cycle improvement for a
famous strcmp-biased "benchmark" starting with a D.... , but still standard.

This optimization  can be disabled with -fno-builtin-strcmp.

No regressions on sh4 in big and little endian, and sh2 (sh3, and sh4a
are still running for big and little endian for sanity)

OK for trunk

Thanks

Christian

Comments

Oleg Endo Oct. 17, 2013, 11:05 p.m. UTC | #1
Hi,

On Thu, 2013-10-17 at 16:15 +0200, Christian Bruel wrote:
> Hello,
> 
> This patch adds support to inline an optimized version of strcmp when
> not optimizing for size. The generated code makes use of the cmp/str
> instruction to test 4 bytes at a time when correctly aligned.
> 
> note that a new pattern was added to match the cmp/str instruction, but
> no attempt was made to catch it from combine.
> 
> This results in general cycles improvements (against both newlib and
> glibc implementations), one of which is a 10%  cycle improvement for a
> famous strcmp-biased "benchmark" starting with a D.... , but still standard.

Nice.

> This optimization  can be disabled with -fno-builtin-strcmp.
> 
> No regressions on sh4 in big and little endian, and sh2 (sh3, and sh4a
> are still running for big and little endian for sanity)
> 

I was wondering, in file sh-mem.c, the new function
'sh4_expand_cmpstr' ... why is it SH4-something?  It's a bit confusing,
since cmp/str has been around since ever (i.e. since SH1).  Maybe just
rename it to 'sh_expand_cmpstr' instead?  The function always returns
'true', so maybe just make it return 'void'?

Also, in the expander ...

+  [(set (match_operand:SI 0 "register_operand" "")
+	(compare:SI (match_operand:BLK 1 "memory_operand" "")

... no need to use empty "" constraints.

Cheers,
Oleg
Christian Bruel Oct. 18, 2013, 7:59 a.m. UTC | #2
On 10/18/2013 01:05 AM, Oleg Endo wrote:
> I was wondering, in file sh-mem.c, the new function
> 'sh4_expand_cmpstr' ... why is it SH4-something?  It's a bit confusing,
> since cmp/str has been around since ever (i.e. since SH1). Maybe just
> rename it to 'sh_expand_cmpstr' instead?

Just historical. (SH4* are our primary SH platforms). The code is
enabled/tested for all SH1 of course, I will  rename. Thanks .

>  Maybe just
> rename it to 'sh_expand_cmpstr' instead?  The function always returns
> 'true', so maybe just make it return 'void'?

yes, it's for genericity as I plan to reuse/specialize the code based on
the count parameter for strncmp to be contributed next.
>
> Also, in the expander ...
>
> +  [(set (match_operand:SI 0 "register_operand" "")
> +	(compare:SI (match_operand:BLK 1 "memory_operand" "")
>
> ... no need to use empty "" constraints

OK, thanks

Christian

> Cheers,
> Oleg
>
diff mbox

Patch

2013-10-17  Christian Bruel  <christian.bruel@st.com>

	* gcc/config/sh/sh-mem.c (sh4_expand_cmpstr): New function.
	* gcc/config/sh/sh-protos.h (sh4_expand_cmpstr): Declare.
	* gcc/config/sh/sh.md (cmpstrsi, cmpstr_t): New patterns.
	(rotlhi3_8): Rename.

--- gcc/config/sh/sh.md	2013-10-17 15:14:18.000000000 +0200
+++ gcc-new/config/sh/sh.md	2013-10-16 16:13:49.000000000 +0200
@@ -31,9 +31,6 @@ 
 ;; ??? The MAC.W and MAC.L instructions are not supported.  There is no
 ;; way to generate them.
 
-;; ??? The cmp/str instruction is not supported.  Perhaps it can be used
-;; for a str* inline function.
-
 ;; BSR is not generated by the compiler proper, but when relaxing, it
 ;; generates .uses pseudo-ops that allow linker relaxation to create
 ;; BSR.  This is actually implemented in bfd/{coff,elf32}-sh.c
@@ -4037,7 +4034,7 @@ 
   DONE;
 })
 
-(define_insn "*rotlhi3_8"
+(define_insn "rotlhi3_8"
   [(set (match_operand:HI 0 "arith_reg_dest" "=r")
 	(rotate:HI (match_operand:HI 1 "arith_reg_operand" "r")
 		   (const_int 8)))]
@@ -11912,6 +11909,41 @@ 
   "jsr	@%0%#"
   [(set_attr "type" "sfunc")
    (set_attr "needs_delay_slot" "yes")])
+
+;; byte compare pattern
+;; temp = a ^ b;
+;; !((temp & 0xF000) && (temp & 0x0F00) && (temp & 0x00F0) && (temp & 0x000F))
+(define_insn "cmpstr_t"
+  [(set (reg:SI T_REG)
+	(eq:SI (and:SI
+		(and:SI
+		 (and:SI
+		  (zero_extract:SI (xor:SI (match_operand:SI 0 "arith_reg_operand" "r")
+					   (match_operand:SI 1 "arith_reg_operand" "r"))
+				   (const_int 8) (const_int 0))
+		  (zero_extract:SI (xor:SI (match_dup 0) (match_dup 1))
+				   (const_int 8) (const_int 8)))
+		  (zero_extract:SI (xor:SI (match_dup 0) (match_dup 1))
+				   (const_int 8) (const_int 16)))
+		(zero_extract:SI (xor:SI (match_dup 0) (match_dup 1))
+				 (const_int 8) (const_int 24))) (const_int 0)))]
+  "TARGET_SH1"
+  "cmp/str	%0,%1"
+  [(set_attr "type" "mt_group")])
+
+(define_expand "cmpstrsi"
+  [(set (match_operand:SI 0 "register_operand" "")
+	(compare:SI (match_operand:BLK 1 "memory_operand" "")
+		    (match_operand:BLK 2 "memory_operand" "")))
+   (use (match_operand 3 "immediate_operand" ""))]
+  "TARGET_SH1"
+  "
+{
+   if (! optimize_insn_for_size_p () && sh4_expand_cmpstr(operands))
+      DONE;
+   else FAIL;
+}")
+
 
 ;; -------------------------------------------------------------------------
 ;; Floating point instructions.
diff -ru gcc/config/sh/sh-mem.c gcc-new/config/sh/sh-mem.c
--- gcc/config/sh/sh-mem.c	2013-10-17 14:59:02.000000000 +0200
+++ gcc-new/config/sh/sh-mem.c	2013-10-17 14:57:57.000000000 +0200
@@ -23,6 +23,7 @@ 
 #include "tm.h"
 #include "expr.h"
 #include "tm_p.h"
+#include "basic-block.h"
 
 /* Like force_operand, but guarantees that VALUE ends up in TARGET.  */
 static void
@@ -174,3 +175,130 @@ 
 
   return false;
 }
+
+/* Emit code to perform a strcmp.
+
+   OPERANDS[0] is the destination.
+   OPERANDS[1] is the first string.
+   OPERANDS[2] is the second string.
+   OPERANDS[3] is the align.  */
+bool
+sh4_expand_cmpstr (rtx *operands)
+{
+  rtx s1 = copy_rtx (operands[1]);
+  rtx s2 = copy_rtx (operands[2]);
+  rtx s1_addr = copy_addr_to_reg (XEXP (s1, 0));
+  rtx s2_addr = copy_addr_to_reg (XEXP (s2, 0));
+  rtx tmp0 = gen_reg_rtx (SImode);
+  rtx tmp1 = gen_reg_rtx (SImode);
+  rtx tmp2 = gen_reg_rtx (SImode);
+  rtx tmp3 = gen_reg_rtx (SImode);
+
+  rtx L_return = gen_label_rtx ();
+  rtx L_loop_byte = gen_label_rtx ();
+  rtx L_end_loop_byte = gen_label_rtx ();
+  rtx L_loop_long = gen_label_rtx ();
+  rtx L_end_loop_long = gen_label_rtx ();
+
+  rtx jump, addr1, addr2;
+  int prob_unlikely = REG_BR_PROB_BASE / 10;
+  int prob_likely = REG_BR_PROB_BASE / 4;
+
+  emit_insn (gen_iorsi3 (tmp1, s1_addr, s2_addr));
+  emit_move_insn (tmp0, GEN_INT (3));
+
+  emit_insn (gen_tstsi_t (tmp0, tmp1));
+
+  emit_move_insn (tmp0, const0_rtx);
+
+  jump = emit_jump_insn (gen_branch_false (L_loop_byte));
+  add_int_reg_note (jump, REG_BR_PROB, prob_likely);
+
+  addr1 = adjust_automodify_address (s1, SImode, s1_addr, 0);
+  addr2 = adjust_automodify_address (s2, SImode, s2_addr, 0);
+
+  /* tmp2 is aligned, OK to load.  */
+  emit_move_insn (tmp3, addr2);
+  emit_move_insn (s2_addr, plus_constant (Pmode, s2_addr, 4));
+
+  /*start long loop.  */
+  emit_label (L_loop_long);
+
+  emit_move_insn (tmp2, tmp3);
+
+  /* tmp1 is aligned, OK to load.  */
+  emit_move_insn (tmp1, addr1);
+  emit_move_insn (s1_addr, plus_constant (Pmode, s1_addr, 4));
+
+  /* Is there a 0 byte ?  */
+  emit_insn (gen_andsi3 (tmp3, tmp3, tmp1));
+
+  emit_insn (gen_cmpstr_t (tmp0, tmp3));
+  jump = emit_jump_insn (gen_branch_true (L_end_loop_long));
+  add_int_reg_note (jump, REG_BR_PROB, prob_unlikely);
+
+  emit_insn (gen_cmpeqsi_t (tmp1, tmp2));
+
+  /* tmp2 is aligned, OK to load.  */
+  emit_move_insn (tmp3, addr2);
+  emit_move_insn (s2_addr, plus_constant (Pmode, s2_addr, 4));
+
+  jump = emit_jump_insn (gen_branch_true (L_loop_long));
+  add_int_reg_note (jump, REG_BR_PROB, prob_likely);
+  /* end loop.  */
+
+  /* Fallthu, check if one of the word is greater.  */
+  if (TARGET_LITTLE_ENDIAN)
+    {
+      rtx low_1 = gen_lowpart (HImode, tmp1);
+      rtx low_2 = gen_lowpart (HImode, tmp2);
+
+      emit_insn (gen_rotlhi3_8 (low_1, low_1));
+      emit_insn (gen_rotlhi3_8 (low_2, low_2));
+      emit_insn (gen_rotlsi3_16 (tmp1, tmp1));
+      emit_insn (gen_rotlsi3_16 (tmp2, tmp2));
+      emit_insn (gen_rotlhi3_8 (low_1, low_1));
+      emit_insn (gen_rotlhi3_8 (low_2, low_2));
+    }
+
+  jump = emit_jump_insn (gen_jump_compact (L_return));
+  emit_barrier_after (jump);
+
+  /* start byte loop.  */
+  addr1 = adjust_automodify_address (s1, QImode, s1_addr, 0);
+  addr2 = adjust_automodify_address (s2, QImode, s2_addr, 0);
+
+  emit_label (L_end_loop_long);
+
+  emit_move_insn (s1_addr, plus_constant (Pmode, s1_addr, -4));
+  emit_move_insn (s2_addr, plus_constant (Pmode, s2_addr, -4));
+
+  emit_label (L_loop_byte);
+
+  emit_insn (gen_extendqisi2 (tmp2, addr2));
+  emit_move_insn (s2_addr, plus_constant (Pmode, s2_addr, 1));
+
+  emit_insn (gen_extendqisi2 (tmp1, addr1));
+  emit_move_insn (s1_addr, plus_constant (Pmode, s1_addr, 1));
+
+  emit_insn (gen_cmpeqsi_t (tmp2, const0_rtx));
+  jump = emit_jump_insn (gen_branch_true (L_end_loop_byte));
+  add_int_reg_note (jump, REG_BR_PROB, prob_unlikely);
+
+  emit_insn (gen_cmpeqsi_t (tmp1, tmp2));
+  emit_jump_insn (gen_branch_true (L_loop_byte));
+  add_int_reg_note (jump, REG_BR_PROB, prob_likely);
+  /* end loop.  */
+
+  emit_label (L_end_loop_byte);
+
+  emit_insn (gen_zero_extendqisi2 (tmp2, gen_lowpart (QImode, tmp2)));
+  emit_insn (gen_zero_extendqisi2 (tmp1, gen_lowpart (QImode, tmp1)));
+
+  emit_label (L_return);
+
+  emit_insn (gen_subsi3 (operands[0], tmp1, tmp2));
+
+  return true;
+}
+
diff -ru gcc/config/sh/sh-protos.h gcc-new/config/sh/sh-protos.h
--- gcc/config/sh/sh-protos.h	2013-10-17 15:13:46.000000000 +0200
+++ gcc-new/config/sh/sh-protos.h	2013-10-07 14:38:08.000000000 +0200
@@ -116,6 +116,7 @@ 
 extern void output_pic_addr_const (FILE *, rtx);
 extern bool expand_block_move (rtx *);
 extern void prepare_move_operands (rtx[], enum machine_mode mode);
+extern bool sh4_expand_cmpstr (rtx *);
 extern enum rtx_code prepare_cbranch_operands (rtx *, enum machine_mode mode,
 					       enum rtx_code comparison);
 extern void expand_cbranchsi4 (rtx *operands, enum rtx_code comparison, int);