@@ -16974,7 +16974,7 @@
(use (match_operand 4 "immediate_operand"))]
""
{
- rtx addr1, addr2, countreg, align, out;
+ rtx addr1, addr2, countreg, countout, align, out;
if (optimize_insn_for_size_p () && !TARGET_INLINE_ALL_STRINGOPS)
FAIL;
@@ -17006,6 +17006,7 @@
operands[2] = replace_equiv_address_nv (operands[2], addr2);
countreg = ix86_zero_extend_to_Pmode (operands[3]);
+ countout = gen_reg_rtx (Pmode);
/* %%% Iff we are testing strict equality, we can use known alignment
to good advantage. This may be possible with combine, particularly
@@ -17019,14 +17020,14 @@
emit_move_insn (operands[0], const0_rtx);
DONE;
}
- emit_insn (gen_cmpstrnqi_nz_1 (addr1, addr2, countreg, align,
- operands[1], operands[2]));
+ emit_insn (gen_cmpstrnqi_nz_1 (addr1, addr2, countout, align,
+ operands[1], operands[2], countreg));
}
else
{
emit_insn (gen_cmp_1 (Pmode, countreg, countreg));
- emit_insn (gen_cmpstrnqi_1 (addr1, addr2, countreg, align,
- operands[1], operands[2]));
+ emit_insn (gen_cmpstrnqi_1 (addr1, addr2, countout, align,
+ operands[1], operands[2], countreg));
}
out = gen_lowpart (QImode, operands[0]);
@@ -17060,11 +17061,11 @@
[(parallel [(set (reg:CC FLAGS_REG)
(compare:CC (match_operand 4 "memory_operand")
(match_operand 5 "memory_operand")))
- (use (match_operand 2 "register_operand"))
+ (use (match_operand 6 "register_operand"))
(use (match_operand:SI 3 "immediate_operand"))
(clobber (match_operand 0 "register_operand"))
(clobber (match_operand 1 "register_operand"))
- (clobber (match_dup 2))])]
+ (clobber (match_operand 2 "register_operand"))])]
""
{
if (TARGET_CLD)
@@ -17096,16 +17097,15 @@
(define_expand "cmpstrnqi_1"
[(parallel [(set (reg:CC FLAGS_REG)
- (if_then_else:CC (ne (match_operand 2 "register_operand")
+ (if_then_else:CC (ne (match_operand 6 "register_operand")
(const_int 0))
(compare:CC (match_operand 4 "memory_operand")
(match_operand 5 "memory_operand"))
- (const_int 0)))
+ (reg:CC FLAGS_REG)))
(use (match_operand:SI 3 "immediate_operand"))
- (use (reg:CC FLAGS_REG))
(clobber (match_operand 0 "register_operand"))
(clobber (match_operand 1 "register_operand"))
- (clobber (match_dup 2))])]
+ (clobber (match_operand 2 "register_operand"))])]
""
{
if (TARGET_CLD)
@@ -17118,9 +17118,8 @@
(const_int 0))
(compare:CC (mem:BLK (match_operand:P 4 "register_operand" "0"))
(mem:BLK (match_operand:P 5 "register_operand" "1")))
- (const_int 0)))
+ (reg:CC FLAGS_REG)))
(use (match_operand:SI 3 "immediate_operand" "i"))
- (use (reg:CC FLAGS_REG))
(clobber (match_operand:P 0 "register_operand" "=S"))
(clobber (match_operand:P 1 "register_operand" "=D"))
(clobber (match_operand:P 2 "register_operand" "=c"))]
x86 cmpmemsi pattern - single compare
This patch introduces a cmpmemsi pattern that expands to a single
compare insn sequence, involving one bswapped load from each input mem
block.  It disregards alignment entirely, leaving it to the CPU to deal
with.
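
For illustration, here is a minimal C sketch of the semantics the
single-compare expansion relies on, assuming a little-endian target;
the function name is made up and this is not the code the pattern
emits:

#include <stdint.h>
#include <string.h>

/* Illustrative only: on a little-endian target, byte-swapping a word
   loaded from each block turns memcmp's lexicographic byte order into
   plain unsigned integer order, so one compare of the swapped words
   yields the sign of the result (what cmp + cmpintqi compute).  */
static int
memcmp4_sketch (const void *p1, const void *p2)
{
  uint32_t a, b;
  memcpy (&a, p1, 4);           /* one load from each input mem block  */
  memcpy (&b, p2, 4);
  a = __builtin_bswap32 (a);    /* gen_bswapsi2 in the expander        */
  b = __builtin_bswap32 (b);
  if (a == b)
    return 0;
  return a < b ? -1 : 1;
}
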
for gcc/ChangeLog
* config/i386/i386.md (cmpmemsi): New pattern.
---
gcc/config/i386/i386.md | 114 +++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 114 insertions(+)
@@ -16966,6 +16966,120 @@
(const_string "*")))
(set_attr "mode" "QI")])
+(define_expand "cmpmemsi"
+ [(set (match_operand:SI 0 "register_operand")
+ (compare:SI (match_operand:BLK 1 "general_operand")
+ (match_operand:BLK 2 "general_operand")))
+ (use (match_operand 3 "immediate_operand"))
+ (use (match_operand 4 "immediate_operand"))]
+ ""
+{
+ rtx op1, op2, tmp;
+
+ if (!CONST_INT_P (operands[3]))
+ FAIL;
+
+ if (optimize_insn_for_size_p () && !TARGET_INLINE_ALL_STRINGOPS)
+ FAIL;
+
+ switch (INTVAL (operands[3]))
+ {
+ case 0:
+ emit_move_insn (operands[0], const0_rtx);
+ DONE;
+
+ default:
+ FAIL;
+
+ case 8:
+ if (!TARGET_64BIT)
+ FAIL;
+
+ op1 = gen_rtx_MEM (DImode, XEXP (operands[1], 0));
+ MEM_COPY_ATTRIBUTES (op1, operands[1]);
+
+ tmp = gen_reg_rtx (DImode);
+ emit_insn (gen_bswapdi2 (tmp, op1));
+ op1 = tmp;
+
+ op2 = gen_rtx_MEM (DImode, XEXP (operands[2], 0));
+ MEM_COPY_ATTRIBUTES (op2, operands[2]);
+
+ tmp = gen_reg_rtx (DImode);
+ emit_insn (gen_bswapdi2 (tmp, op2));
+ op2 = tmp;
+
+ emit_insn (gen_cmp_1 (DImode, op1, op2));
+
+ tmp = gen_lowpart (QImode, operands[0]);
+ emit_insn (gen_cmpintqi (tmp));
+ emit_move_insn (operands[0], gen_rtx_SIGN_EXTEND (SImode, tmp));
+ DONE;
+
+ case 4:
+ op1 = gen_rtx_MEM (SImode, XEXP (operands[1], 0));
+ MEM_COPY_ATTRIBUTES (op1, operands[1]);
+
+ tmp = gen_reg_rtx (SImode);
+ emit_insn (gen_bswapsi2 (tmp, op1));
+ op1 = tmp;
+
+ op2 = gen_rtx_MEM (SImode, XEXP (operands[2], 0));
+ MEM_COPY_ATTRIBUTES (op2, operands[2]);
+
+ tmp = gen_reg_rtx (SImode);
+ emit_insn (gen_bswapsi2 (tmp, op2));
+ op2 = tmp;
+
+ emit_insn (gen_cmp_1 (SImode, op1, op2));
+
+ tmp = gen_lowpart (QImode, operands[0]);
+ emit_insn (gen_cmpintqi (tmp));
+ emit_move_insn (operands[0], gen_rtx_SIGN_EXTEND (SImode, tmp));
+ DONE;
+
+ case 2:
+ op1 = gen_rtx_MEM (HImode, XEXP (operands[1], 0));
+ MEM_COPY_ATTRIBUTES (op1, operands[1]);
+
+ tmp = gen_reg_rtx (SImode);
+ emit_insn (gen_zero_extendhisi2 (tmp, op1));
+ emit_insn (gen_bswaphi_lowpart (gen_lowpart (HImode, tmp)));
+ op1 = tmp;
+
+ op2 = gen_rtx_MEM (HImode, XEXP (operands[2], 0));
+ MEM_COPY_ATTRIBUTES (op2, operands[2]);
+
+ tmp = gen_reg_rtx (SImode);
+ emit_insn (gen_zero_extendhisi2 (tmp, op2));
+ emit_insn (gen_bswaphi_lowpart (gen_lowpart (HImode, tmp)));
+ op2 = tmp;
+
+ emit_insn (gen_sub3_insn (operands[0], op1, op2));
+ DONE;
+
+ case 1:
+ op1 = gen_rtx_MEM (QImode, XEXP (operands[1], 0));
+ MEM_COPY_ATTRIBUTES (op1, operands[1]);
+
+ tmp = gen_reg_rtx (SImode);
+ emit_insn (gen_zero_extendqisi2 (tmp, op1));
+ op1 = tmp;
+
+ op2 = gen_rtx_MEM (QImode, XEXP (operands[2], 0));
+ MEM_COPY_ATTRIBUTES (op2, operands[2]);
+
+ tmp = gen_reg_rtx (SImode);
+ emit_insn (gen_zero_extendqisi2 (tmp, op2));
+ op2 = tmp;
+
+ emit_insn (gen_sub3_insn (operands[0], op1, op2));
+ DONE;
+ }
+
+ FAIL;
+})
+
(define_expand "cmpstrnsi"
[(set (match_operand:SI 0 "register_operand")
(compare:SI (match_operand:BLK 1 "general_operand")
extend x86 cmpmemsi to use loops
This patch extends the cmpmemsi expander introduced in the previous
patch to handle lengths that span multiple words, looping at expand
time to emit one compare fragment per word, with early-exit branches
between fragments.
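
For reference, the size heuristic added below can be restated as a
stand-alone C function; the helper name is made up, and only the
arithmetic and the optimize * 3 cutoff come from the patch:

/* Stand-alone restatement of the expander's inlining cutoff, for
   reference only; the helper name is invented.  It counts roughly one
   unit per word or sub-word compare fragment plus fixed overhead and
   refuses to inline when the total exceeds three units per -O level.  */
static int
cmpmem_inline_ok (unsigned long len, int have_64bit, int optimize_level)
{
  unsigned long size = have_64bit ? len / 8 + ((len & 4) != 0) : len / 4;
  if (size)
    size++;
  size += ((len & 1) != 0) + ((len & 2) != 0);
  if (size > 1)
    size++;
  /* E.g. len == 8 on 64-bit gives size == 3, inlined from -O1 on;
     len == 15 gives size == 6, inlined only at -O2 and above.  */
  return size <= (unsigned long) (optimize_level * 3);
}
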
for gcc/ChangeLog
* config/i386/i386.md (cmpmemsi): Expand more than one
fragment compare sequence depending on optimization level.
(subcmpsi3): New expand pattern.
---
gcc/config/i386/i386.md | 204 +++++++++++++++++++++++++++++++++++------------
1 file changed, 154 insertions(+), 50 deletions(-)
@@ -16974,112 +16974,216 @@
(use (match_operand 4 "immediate_operand"))]
""
{
- rtx op1, op2, tmp;
-
if (!CONST_INT_P (operands[3]))
FAIL;
+ unsigned HOST_WIDE_INT todo = UINTVAL (operands[3]);
+
+ /* Balance size expansion with optimization level. This will inline
+ memcmp of up to 4 bytes or 1 word at -O1, 4 words or 1 word plus
+ 3 compares at -O2, and 7 words or 4 words plus 3 compares at -O3.
+ These limits are driven not so much by what makes individual
+ memcmp calls faster as by the significant extra code cache use
+ that each additional sequence of loads, byte swapping and
+ compares incurs. */
+
+ HOST_WIDE_INT size = (TARGET_64BIT ? todo / 8 + !!(todo & 4) : todo / 4);
+ if (size)
+ size++;
+ size += !!(todo & 1) + !!(todo & 2);
+ if (size > 1)
+ size++;
+ if (size > optimize * 3)
+ FAIL;
+
if (optimize_insn_for_size_p () && !TARGET_INLINE_ALL_STRINGOPS)
FAIL;
- switch (INTVAL (operands[3]))
+ if (!todo)
{
- case 0:
emit_move_insn (operands[0], const0_rtx);
DONE;
+ }
- default:
- FAIL;
-
- case 8:
- if (!TARGET_64BIT)
- FAIL;
-
- op1 = gen_rtx_MEM (DImode, XEXP (operands[1], 0));
- MEM_COPY_ATTRIBUTES (op1, operands[1]);
-
- tmp = gen_reg_rtx (DImode);
- emit_insn (gen_bswapdi2 (tmp, op1));
- op1 = tmp;
-
- op2 = gen_rtx_MEM (DImode, XEXP (operands[2], 0));
- MEM_COPY_ATTRIBUTES (op2, operands[2]);
-
- tmp = gen_reg_rtx (DImode);
- emit_insn (gen_bswapdi2 (tmp, op2));
- op2 = tmp;
+ rtx tmpout = operands[0];
+ if (reg_overlap_mentioned_p (operands[0], XEXP (operands[1], 0))
+ || reg_overlap_mentioned_p (operands[0], XEXP (operands[2], 0)))
+ tmpout = gen_reg_rtx (SImode);
- emit_insn (gen_cmp_1 (DImode, op1, op2));
+ rtx_code_label *labnz = 0, *labfv = 0;
+ unsigned HOST_WIDE_INT done = 0;
+ bool needcmpint = false;
- tmp = gen_lowpart (QImode, operands[0]);
- emit_insn (gen_cmpintqi (tmp));
- emit_move_insn (operands[0], gen_rtx_SIGN_EXTEND (SImode, tmp));
- DONE;
+ if (TARGET_64BIT)
+ while (todo >= 8)
+ {
+ rtx op1 = gen_rtx_MEM (DImode, XEXP (operands[1], 0));
+ MEM_COPY_ATTRIBUTES (op1, operands[1]);
+ if (done)
+ op1 = offset_address (op1, GEN_INT (done), 8);
+
+ rtx tmp = gen_reg_rtx (DImode);
+ emit_insn (gen_bswapdi2 (tmp, op1));
+ op1 = tmp;
+
+ rtx op2 = gen_rtx_MEM (DImode, XEXP (operands[2], 0));
+ MEM_COPY_ATTRIBUTES (op2, operands[2]);
+ if (done)
+ op2 = offset_address (op2, GEN_INT (done), 8);
+
+ tmp = gen_reg_rtx (DImode);
+ emit_insn (gen_bswapdi2 (tmp, op2));
+ op2 = tmp;
+
+ emit_insn (gen_cmp_1 (DImode, op1, op2));
+ needcmpint = true;
+
+ done += 8;
+ todo -= 8;
+ if (todo)
+ {
+ if (!labnz)
+ labnz = gen_label_rtx ();
+ LABEL_NUSES (labnz)++;
+ ix86_expand_branch (NE, gen_rtx_REG (CCmode, FLAGS_REG),
+ const0_rtx, labnz);
+ }
+ }
- case 4:
- op1 = gen_rtx_MEM (SImode, XEXP (operands[1], 0));
+ while (todo >= 4)
+ {
+ rtx op1 = gen_rtx_MEM (SImode, XEXP (operands[1], 0));
MEM_COPY_ATTRIBUTES (op1, operands[1]);
+ if (done)
+ op1 = offset_address (op1, GEN_INT (done), 4);
- tmp = gen_reg_rtx (SImode);
+ rtx tmp = gen_reg_rtx (SImode);
emit_insn (gen_bswapsi2 (tmp, op1));
op1 = tmp;
- op2 = gen_rtx_MEM (SImode, XEXP (operands[2], 0));
+ rtx op2 = gen_rtx_MEM (SImode, XEXP (operands[2], 0));
MEM_COPY_ATTRIBUTES (op2, operands[2]);
+ if (done)
+ op2 = offset_address (op2, GEN_INT (done), 4);
tmp = gen_reg_rtx (SImode);
emit_insn (gen_bswapsi2 (tmp, op2));
op2 = tmp;
emit_insn (gen_cmp_1 (SImode, op1, op2));
+ needcmpint = true;
- tmp = gen_lowpart (QImode, operands[0]);
- emit_insn (gen_cmpintqi (tmp));
- emit_move_insn (operands[0], gen_rtx_SIGN_EXTEND (SImode, tmp));
- DONE;
+ done += 4;
+ todo -= 4;
+ if (todo)
+ {
+ if (!labnz)
+ labnz = gen_label_rtx ();
+ LABEL_NUSES (labnz)++;
+ ix86_expand_branch (NE, gen_rtx_REG (CCmode, FLAGS_REG),
+ const0_rtx, labnz);
+ }
+ }
- case 2:
- op1 = gen_rtx_MEM (HImode, XEXP (operands[1], 0));
+ if (todo >= 2)
+ {
+ rtx op1 = gen_rtx_MEM (HImode, XEXP (operands[1], 0));
MEM_COPY_ATTRIBUTES (op1, operands[1]);
+ if (done)
+ op1 = offset_address (op1, GEN_INT (done), 4);
- tmp = gen_reg_rtx (SImode);
+ rtx tmp = gen_reg_rtx (SImode);
emit_insn (gen_zero_extendhisi2 (tmp, op1));
emit_insn (gen_bswaphi_lowpart (gen_lowpart (HImode, tmp)));
op1 = tmp;
- op2 = gen_rtx_MEM (HImode, XEXP (operands[2], 0));
+ rtx op2 = gen_rtx_MEM (HImode, XEXP (operands[2], 0));
MEM_COPY_ATTRIBUTES (op2, operands[2]);
+ if (done)
+ op2 = offset_address (op2, GEN_INT (done), 4);
tmp = gen_reg_rtx (SImode);
emit_insn (gen_zero_extendhisi2 (tmp, op2));
emit_insn (gen_bswaphi_lowpart (gen_lowpart (HImode, tmp)));
op2 = tmp;
- emit_insn (gen_sub3_insn (operands[0], op1, op2));
- DONE;
+ if (needcmpint)
+ emit_insn (gen_cmp_1 (SImode, op1, op2));
+ else
+ emit_insn (gen_subcmpsi3 (tmpout, op1, op2));
- case 1:
- op1 = gen_rtx_MEM (QImode, XEXP (operands[1], 0));
+ done += 2;
+ todo -= 2;
+ if (todo)
+ {
+ rtx_code_label *lab = labnz;
+ if (!needcmpint)
+ lab = labfv = gen_label_rtx ();
+ LABEL_NUSES (lab)++;
+ ix86_expand_branch (NE, gen_rtx_REG (CCmode, FLAGS_REG),
+ const0_rtx, lab);
+ }
+ }
+
+ if (todo >= 1)
+ {
+ rtx op1 = gen_rtx_MEM (QImode, XEXP (operands[1], 0));
MEM_COPY_ATTRIBUTES (op1, operands[1]);
+ if (done)
+ op1 = offset_address (op1, GEN_INT (done), 2);
- tmp = gen_reg_rtx (SImode);
+ rtx tmp = gen_reg_rtx (SImode);
emit_insn (gen_zero_extendqisi2 (tmp, op1));
op1 = tmp;
- op2 = gen_rtx_MEM (QImode, XEXP (operands[2], 0));
+ rtx op2 = gen_rtx_MEM (QImode, XEXP (operands[2], 0));
MEM_COPY_ATTRIBUTES (op2, operands[2]);
+ if (done)
+ op2 = offset_address (op2, GEN_INT (done), 2);
tmp = gen_reg_rtx (SImode);
emit_insn (gen_zero_extendqisi2 (tmp, op2));
op2 = tmp;
- emit_insn (gen_sub3_insn (operands[0], op1, op2));
- DONE;
+ if (needcmpint)
+ emit_insn (gen_cmp_1 (SImode, op1, op2));
+ else
+ emit_insn (gen_subcmpsi3 (tmpout, op1, op2));
+
+ done += 1;
+ todo -= 1;
+ }
+ gcc_assert (!todo);
+
+ if (labnz)
+ emit_label (labnz);
+
+ if (needcmpint)
+ {
+ rtx tmp = gen_lowpart (QImode, tmpout);
+ emit_insn (gen_cmpintqi (tmp));
+ emit_move_insn (tmpout, gen_rtx_SIGN_EXTEND (SImode, tmp));
}
- FAIL;
+ if (labfv)
+ emit_label (labfv);
+
+ if (tmpout != operands[0])
+ emit_move_insn (operands[0], tmpout);
+
+ DONE;
})
+;; Expand a "*sub<mode>_2" pattern with mode=SI.
+(define_expand "subcmpsi3"
+ [(parallel [(set (reg:CC FLAGS_REG)
+ (compare:CC
+ (match_operand:SI 1 "register_operand")
+ (match_operand:SI 2 "register_operand")))
+ (set (match_operand:SI 0 "register_operand")
+ (minus:SI (match_dup 1) (match_dup 2)))])]
+ "")
+
(define_expand "cmpstrnsi"
[(set (match_operand:SI 0 "register_operand")
(compare:SI (match_operand:BLK 1 "general_operand")
DO NOT USE - FTR only - cmpsb-based cmpmemsi pattern for x86
I include this just for the record, in case someone wishes to compare
the performance of memcmp implemented as 'repz cmpsb' (the same
approach used for strncmp) with the implementations in glibc or in the
patchset proposed above.
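
For comparison, what the 'repz cmpsb' sequence plus cmpintqi compute
amounts to, restated as plain C (a sketch only, not the code the
pattern emits; the function name is made up):

#include <stddef.h>

/* Byte-by-byte semantics of 'repz cmpsb' followed by cmpintqi: compare
   bytes while they are equal and the count is not exhausted, then
   reduce the flags of the last compare to -1/0/1.  Sketch only.  */
static int
memcmp_cmpsb_sketch (const unsigned char *s1, const unsigned char *s2,
                     size_t n)
{
  while (n--)
    {
      if (*s1 != *s2)
        return *s1 < *s2 ? -1 : 1;
      s1++;
      s2++;
    }
  return 0;
}
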
---
gcc/config/i386/i386.md | 56 +++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 56 insertions(+)
@@ -16966,6 +16966,62 @@
(const_string "*")))
(set_attr "mode" "QI")])
+(define_expand "cmpmemsi"
+ [(set (match_operand:SI 0 "register_operand")
+ (compare:SI (match_operand:BLK 1 "general_operand")
+ (match_operand:BLK 2 "general_operand")))
+ (use (match_operand 3 "general_operand"))
+ (use (match_operand 4 "immediate_operand"))]
+ ""
+{
+ rtx addr1, addr2, countreg, countout, align, out;
+
+ if (optimize_insn_for_size_p () && !TARGET_INLINE_ALL_STRINGOPS)
+ FAIL;
+
+ /* Can't use this if the user has appropriated ecx, esi or edi. */
+ if (fixed_regs[CX_REG] || fixed_regs[SI_REG] || fixed_regs[DI_REG])
+ FAIL;
+
+ addr1 = copy_addr_to_reg (XEXP (operands[1], 0));
+ addr2 = copy_addr_to_reg (XEXP (operands[2], 0));
+ if (addr1 != XEXP (operands[1], 0))
+ operands[1] = replace_equiv_address_nv (operands[1], addr1);
+ if (addr2 != XEXP (operands[2], 0))
+ operands[2] = replace_equiv_address_nv (operands[2], addr2);
+
+ countreg = ix86_zero_extend_to_Pmode (operands[3]);
+ countout = gen_reg_rtx (Pmode);
+
+ /* %%% Iff we are testing strict equality, we can use known alignment
+ to good advantage. This may be possible with combine, particularly
+ once cc0 is dead. */
+ align = operands[4];
+
+ if (CONST_INT_P (operands[3]))
+ {
+ if (operands[3] == const0_rtx)
+ {
+ emit_move_insn (operands[0], const0_rtx);
+ DONE;
+ }
+ emit_insn (gen_cmpstrnqi_nz_1 (addr1, addr2, countout, align,
+ operands[1], operands[2], countreg));
+ }
+ else
+ {
+ emit_insn (gen_cmp_1 (Pmode, countreg, countreg));
+ emit_insn (gen_cmpstrnqi_1 (addr1, addr2, countout, align,
+ operands[1], operands[2], countreg));
+ }
+
+ out = gen_lowpart (QImode, operands[0]);
+ emit_insn (gen_cmpintqi (out));
+ emit_move_insn (operands[0], gen_rtx_SIGN_EXTEND (SImode, out));
+
+ DONE;
+})
+
(define_expand "cmpstrnsi"
[(set (match_operand:SI 0 "register_operand")
(compare:SI (match_operand:BLK 1 "general_operand")