@@ -227,6 +227,7 @@ extern bool ix86_expand_pinsr (rtx *);
extern void ix86_expand_mul_widen_evenodd (rtx, rtx, rtx, bool, bool);
extern void ix86_expand_mul_widen_hilo (rtx, rtx, rtx, bool, bool);
extern void ix86_expand_sse2_mulv4si3 (rtx, rtx, rtx);
+extern void ix86_expand_sse2_mulvxdi3 (rtx, rtx, rtx);

/* In i386-c.c */
extern void ix86_target_macros (void);
@@ -32293,6 +32293,14 @@ ix86_rtx_costs (rtx x, int code_i, int outer_code_i, int opno, int *total,
extra = 6;
*total = cost->fmul * 2 + cost->fabs * extra;
}
+ /* V*DImode is emulated with 5-8 insns. */
+ else if (mode == V2DImode || mode == V4DImode)
+ {
+ if (TARGET_XOP && mode == V2DImode)
+ *total = cost->fmul * 2 + cost->fabs * 3;
+ else
+ *total = cost->fmul * 3 + cost->fabs * 5;
+ }
/* Without sse4.1, we don't have PMULLD; it's emulated with 7
insns, including two PMULUDQ. */
else if (mode == V4SImode && !(TARGET_SSE4_1 || TARGET_AVX))
@@ -38915,6 +38923,96 @@ ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
}
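+/* Expand a multiplication of V2DImode or V4DImode vectors OP1 and OP2
+ into OP0. Each 64-bit product is computed from 32-bit halves as
+ lo(x)*lo(y) + ((lo(x)*hi(y) + hi(x)*lo(y)) << 32). */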
+void
+ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
+{
+ enum machine_mode mode = GET_MODE (op0);
+ rtx t1, t2, t3, t4, t5, t6;
+
+ if (TARGET_XOP && mode == V2DImode)
+ {
+ /* op1: A,B,C,D, op2: E,F,G,H (32-bit elements, listed high to low) */
+ op1 = gen_lowpart (V4SImode, op1);
+ op2 = gen_lowpart (V4SImode, op2);
+
+ t1 = gen_reg_rtx (V4SImode);
+ t2 = gen_reg_rtx (V4SImode);
+ t3 = gen_reg_rtx (V2DImode);
+ t4 = gen_reg_rtx (V2DImode);
+
+ /* t1: B,A,D,C */
+ emit_insn (gen_sse2_pshufd_1 (t1, op1,
+ GEN_INT (1),
+ GEN_INT (0),
+ GEN_INT (3),
+ GEN_INT (2)));
+
+ /* t2: (B*E),(A*F),(D*G),(C*H) */
+ emit_insn (gen_mulv4si3 (t2, t1, op2));
+
+ /* t3: (B*E)+(A*F), (D*G)+(C*H) */
+ emit_insn (gen_xop_phadddq (t3, t2));
+
+ /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
+ emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));
+
+ /* op0: (((B*E)+(A*F))<<32)+(B*F), (((D*G)+(C*H))<<32)+(D*H) */
+ emit_insn (gen_xop_pmacsdql (op0, op1, op2, t4));
+ }
+ else
+ {
+ enum machine_mode nmode;
+ rtx (*umul) (rtx, rtx, rtx);
+
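+ /* The even-element widening multiply (PMULUDQ) computes the products
+ of the low 32-bit halves of the 64-bit elements. */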
+ if (mode == V2DImode)
+ {
+ umul = gen_sse2_umulv2siv2di3;
+ nmode = V4SImode;
+ }
+ else if (mode == V4DImode)
+ {
+ umul = gen_avx2_umulv4siv4di3;
+ nmode = V8SImode;
+ }
+ else
+ gcc_unreachable ();
+
+ /* Multiply low parts. */
+ t1 = gen_reg_rtx (mode);
+ emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));
+
+ /* Shift input vectors right 32 bits so we can multiply high parts. */
+ t6 = GEN_INT (32);
+ t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
+ t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);
+
+ /* Multiply high parts by low parts. */
+ t4 = gen_reg_rtx (mode);
+ t5 = gen_reg_rtx (mode);
+ emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
+ emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));
+
+ /* Combine and shift the highparts back. */
+ t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
+ t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);
+
+ /* Combine high and low parts. */
+ force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
+ }
+
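+ /* Attach a REG_EQUAL note describing the sequence as one multiplication.
+ In the XOP path op1 and op2 were narrowed to V4SImode, so convert
+ them back to MODE for the note. */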
+ set_unique_reg_note (get_last_insn (), REG_EQUAL,
+ gen_rtx_MULT (mode, gen_lowpart (mode, op1),
+ gen_lowpart (mode, op2)));
+}
+
/* Expand an insert into a vector register through pinsr insn.
Return true if successful. */
@@ -5592,91 +5592,13 @@
(set_attr "prefix" "orig,vex")
(set_attr "mode" "<sseinsnmode>")])
-(define_insn_and_split "mul<mode>3"
+(define_expand "mul<mode>3"
[(set (match_operand:VI8_AVX2 0 "register_operand")
(mult:VI8_AVX2 (match_operand:VI8_AVX2 1 "register_operand")
(match_operand:VI8_AVX2 2 "register_operand")))]
- "TARGET_SSE2
- && can_create_pseudo_p ()"
- "#"
- "&& 1"
- [(const_int 0)]
+ "TARGET_SSE2"
{
- rtx t1, t2, t3, t4, t5, t6, thirtytwo;
- rtx op0, op1, op2;
-
- op0 = operands[0];
- op1 = operands[1];
- op2 = operands[2];
-
- if (TARGET_XOP && <MODE>mode == V2DImode)
- {
- /* op1: A,B,C,D, op2: E,F,G,H */
- op1 = gen_lowpart (V4SImode, op1);
- op2 = gen_lowpart (V4SImode, op2);
-
- t1 = gen_reg_rtx (V4SImode);
- t2 = gen_reg_rtx (V4SImode);
- t3 = gen_reg_rtx (V2DImode);
- t4 = gen_reg_rtx (V2DImode);
-
- /* t1: B,A,D,C */
- emit_insn (gen_sse2_pshufd_1 (t1, op1,
- GEN_INT (1),
- GEN_INT (0),
- GEN_INT (3),
- GEN_INT (2)));
-
- /* t2: (B*E),(A*F),(D*G),(C*H) */
- emit_insn (gen_mulv4si3 (t2, t1, op2));
-
- /* t4: (B*E)+(A*F), (D*G)+(C*H) */
- emit_insn (gen_xop_phadddq (t3, t2));
-
- /* t5: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
- emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));
-
- /* op0: (((B*E)+(A*F))<<32)+(B*F), (((D*G)+(C*H))<<32)+(D*H) */
- emit_insn (gen_xop_pmacsdql (op0, op1, op2, t4));
- }
- else
- {
- t1 = gen_reg_rtx (<MODE>mode);
- t2 = gen_reg_rtx (<MODE>mode);
- t3 = gen_reg_rtx (<MODE>mode);
- t4 = gen_reg_rtx (<MODE>mode);
- t5 = gen_reg_rtx (<MODE>mode);
- t6 = gen_reg_rtx (<MODE>mode);
- thirtytwo = GEN_INT (32);
-
- /* Multiply low parts. */
- emit_insn (gen_<sse2_avx2>_umulv<ssescalarnum>si<mode>3
- (t1, gen_lowpart (<ssepackmode>mode, op1),
- gen_lowpart (<ssepackmode>mode, op2)));
-
- /* Shift input vectors right 32 bits so we can multiply high parts. */
- emit_insn (gen_lshr<mode>3 (t2, op1, thirtytwo));
- emit_insn (gen_lshr<mode>3 (t3, op2, thirtytwo));
-
- /* Multiply high parts by low parts. */
- emit_insn (gen_<sse2_avx2>_umulv<ssescalarnum>si<mode>3
- (t4, gen_lowpart (<ssepackmode>mode, op1),
- gen_lowpart (<ssepackmode>mode, t3)));
- emit_insn (gen_<sse2_avx2>_umulv<ssescalarnum>si<mode>3
- (t5, gen_lowpart (<ssepackmode>mode, op2),
- gen_lowpart (<ssepackmode>mode, t2)));
-
- /* Shift them back. */
- emit_insn (gen_ashl<mode>3 (t4, t4, thirtytwo));
- emit_insn (gen_ashl<mode>3 (t5, t5, thirtytwo));
-
- /* Add the three parts together. */
- emit_insn (gen_add<mode>3 (t6, t1, t4));
- emit_insn (gen_add<mode>3 (op0, t6, t5));
- }
-
- set_unique_reg_note (get_last_insn (), REG_EQUAL,
- gen_rtx_MULT (<MODE>mode, operands[1], operands[2]));
+ ix86_expand_sse2_mulvxdi3 (operands[0], operands[1], operands[2]);
DONE;
})