From patchwork Wed Oct 5 17:47:31 2011 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Richard Henderson X-Patchwork-Id: 117901 Return-Path: X-Original-To: incoming@patchwork.ozlabs.org Delivered-To: patchwork-incoming@bilbo.ozlabs.org Received: from sourceware.org (server1.sourceware.org [209.132.180.131]) by ozlabs.org (Postfix) with SMTP id 638E8B6FF5 for ; Thu, 6 Oct 2011 04:48:55 +1100 (EST) Received: (qmail 32555 invoked by alias); 5 Oct 2011 17:48:46 -0000 Received: (qmail 32066 invoked by uid 22791); 5 Oct 2011 17:48:42 -0000 X-SWARE-Spam-Status: No, hits=-2.2 required=5.0 tests=AWL, BAYES_00, DKIM_SIGNED, DKIM_VALID, FREEMAIL_ENVFROM_END_DIGIT, FREEMAIL_FROM, RCVD_IN_DNSWL_LOW, TW_TM, T_TO_NO_BRKTS_FREEMAIL X-Spam-Check-By: sourceware.org Received: from mail-ww0-f51.google.com (HELO mail-ww0-f51.google.com) (74.125.82.51) by sourceware.org (qpsmtpd/0.43rc1) with ESMTP; Wed, 05 Oct 2011 17:48:27 +0000 Received: by wwf10 with SMTP id 10so2531679wwf.8 for ; Wed, 05 Oct 2011 10:48:25 -0700 (PDT) Received: by 10.227.172.143 with SMTP id l15mr3420969wbz.58.1317836905391; Wed, 05 Oct 2011 10:48:25 -0700 (PDT) Received: from localhost.localdomain (c-71-227-161-214.hsd1.wa.comcast.net. [71.227.161.214]) by mx.google.com with ESMTPS id gd6sm4214618wbb.1.2011.10.05.10.48.22 (version=TLSv1/SSLv3 cipher=OTHER); Wed, 05 Oct 2011 10:48:24 -0700 (PDT) From: Richard Henderson To: gcc-patches@gcc.gnu.org Cc: artyom.shinkaroff@gmail.com, harsha.jagasia@amd.com Subject: [PATCH 2/3] i386: Rewrite ix86_expand_vshuffle. 
Date: Wed, 5 Oct 2011 10:47:31 -0700 Message-Id: <1317836852-16178-3-git-send-email-rth@redhat.com> In-Reply-To: <1317836852-16178-1-git-send-email-rth@redhat.com> References: <1317836852-16178-1-git-send-email-rth@redhat.com> X-IsSubscribed: yes Mailing-List: contact gcc-patches-help@gcc.gnu.org; run by ezmlm Precedence: bulk List-Id: List-Unsubscribe: List-Archive: List-Post: List-Help: Sender: gcc-patches-owner@gcc.gnu.org Delivered-To: mailing list gcc-patches@gcc.gnu.org 1: Handle TARGET_XOP. 2: Reduce code duplication. 3: Use ASHIFT instead of MULT for scaling. 4: Fix errors in building convert-to-v16qi indices. 5: Handle v2di without sse4.1. --- gcc/ChangeLog | 6 + gcc/config/i386/i386-protos.h | 2 +- gcc/config/i386/i386.c | 208 ++++++++++++++++++++--------------------- gcc/config/i386/sse.md | 4 +- 4 files changed, 109 insertions(+), 111 deletions(-) diff --git a/gcc/ChangeLog b/gcc/ChangeLog index a88854d..4b5816d 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -6,6 +6,12 @@ code duplication. Do update_stmt here ... (expand_vector_operations_1): ... not here. + * config/i386/i386.c (ix86_expand_vshuffle): Never fail. Handle + TARGET_XOP. Fix pshufb constant vector creation. Reduce code + duplication. Handle V2DI without SSE4.1. + * config/i386/i386-protos.h (ix86_expand_vshuffle): Update decl. + * config/i386/sse.md (vshuffle): Remove assert for ok. 
+ 2011-10-05 DJ Delorie Nick Clifton diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h index 99327ed..0bbfa9b 100644 --- a/gcc/config/i386/i386-protos.h +++ b/gcc/config/i386/i386-protos.h @@ -123,7 +123,7 @@ extern bool ix86_expand_int_movcc (rtx[]); extern bool ix86_expand_fp_movcc (rtx[]); extern bool ix86_expand_fp_vcond (rtx[]); extern bool ix86_expand_int_vcond (rtx[]); -extern bool ix86_expand_vshuffle (rtx[]); +extern void ix86_expand_vshuffle (rtx[]); extern void ix86_expand_sse_unpack (rtx[], bool, bool); extern bool ix86_expand_int_addcc (rtx[]); extern rtx ix86_expand_call (rtx, rtx, rtx, rtx, rtx, bool); diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 4c1db3a..80a9e73 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -19236,145 +19236,139 @@ ix86_expand_int_vcond (rtx operands[]) return true; } -bool +void ix86_expand_vshuffle (rtx operands[]) { rtx target = operands[0]; rtx op0 = operands[1]; rtx op1 = operands[2]; rtx mask = operands[3]; - rtx new_mask, vt, t1, t2, w_vector; + rtx vt, vec[16]; enum machine_mode mode = GET_MODE (op0); enum machine_mode maskmode = GET_MODE (mask); - enum machine_mode maskinner = GET_MODE_INNER (mode); - rtx vec[16]; - int w, i, j; - bool one_operand_shuffle = op0 == op1; + int w, e, i; + bool one_operand_shuffle = rtx_equal_p (op0, op1); - gcc_assert ((TARGET_SSSE3 || TARGET_AVX) && GET_MODE_BITSIZE (mode) == 128); + gcc_checking_assert (GET_MODE_BITSIZE (mode) == 128); /* Number of elements in the vector. 
*/ - w = GET_MODE_BITSIZE (maskmode) / GET_MODE_BITSIZE (maskinner); - - /* generate w_vector = {w, w, ...} */ - for (i = 0; i < w; i++) - vec[i] = GEN_INT (w); - w_vector = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec)); - - /* mask = mask & {w-1, w-1, w-1,...} */ - for (i = 0; i < w; i++) - vec[i] = GEN_INT (w - 1); + w = GET_MODE_NUNITS (mode); + e = GET_MODE_UNIT_SIZE (mode); - vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec)); - new_mask = expand_simple_binop (maskmode, AND, mask, vt, - NULL_RTX, 0, OPTAB_DIRECT); - - /* If the original vector mode is V16QImode, we can just - use pshufb directly. */ - if (mode == V16QImode && one_operand_shuffle) + if (TARGET_XOP) { - t1 = gen_reg_rtx (V16QImode); - emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, new_mask)); - emit_insn (gen_rtx_SET (VOIDmode, target, t1)); - return true; + /* The XOP VPPERM insn supports three inputs. By ignoring the + one_operand_shuffle special case, we avoid creating another + set of constant vectors in memory. */ + one_operand_shuffle = false; + + /* mask = mask & {2*w-1, ...} */ + vt = GEN_INT (2*w - 1); } - else if (mode == V16QImode) + else { - rtx xops[6]; - - t1 = gen_reg_rtx (V16QImode); - t2 = gen_reg_rtx (V16QImode); - emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, new_mask)); - emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, new_mask)); - - /* mask = mask & {w, w, ...} */ - mask = expand_simple_binop (V16QImode, AND, mask, w_vector, - NULL_RTX, 0, OPTAB_DIRECT); - xops[0] = target; - xops[1] = operands[1]; - xops[2] = operands[2]; - xops[3] = gen_rtx_EQ (mode, mask, w_vector); - xops[4] = t1; - xops[5] = t2; - - return ix86_expand_int_vcond (xops); + /* mask = mask & {w-1, ...} */ + vt = GEN_INT (w - 1); } - /* mask = mask * {w, w, ...} */ - new_mask = expand_simple_binop (maskmode, MULT, new_mask, w_vector, - NULL_RTX, 0, OPTAB_DIRECT); - - /* Convert mask to vector of chars. 
*/ - new_mask = simplify_gen_subreg (V16QImode, new_mask, maskmode, 0); - new_mask = force_reg (V16QImode, new_mask); - - /* Build a helper mask wich we will use in pshufb - (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12} - (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...} - ... */ - for (i = 0; i < w; i++) - for (j = 0; j < 16/w; j++) - vec[i*w+j] = GEN_INT (i*16/w); - vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec)); - vt = force_reg (V16QImode, vt); - - t1 = gen_reg_rtx (V16QImode); - emit_insn (gen_ssse3_pshufbv16qi3 (t1, new_mask, vt)); - new_mask = t1; - - /* Convert it into the byte positions by doing - new_mask = new_mask + {0,1,..,16/w, 0,1,..,16/w, ...} */ for (i = 0; i < w; i++) - for (j = 0; j < 16/w; j++) - vec[i*w+j] = GEN_INT (j); + vec[i] = vt; + vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec)); + mask = expand_simple_binop (maskmode, AND, mask, vt, + NULL_RTX, 0, OPTAB_DIRECT); - vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec)); - new_mask = expand_simple_binop (V16QImode, PLUS, new_mask, vt, + /* For non-QImode operations, convert the word permutation control + into a byte permutation control. */ + if (mode != V16QImode) + { + mask = expand_simple_binop (maskmode, ASHIFT, mask, + GEN_INT (exact_log2 (e)), NULL_RTX, 0, OPTAB_DIRECT); - t1 = gen_reg_rtx (V16QImode); + /* Convert mask to vector of chars. */ + mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask)); + + /* Replicate each of the input bytes into byte positions: + (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8} + (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12} + (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */ + for (i = 0; i < 16; ++i) + vec[i] = GEN_INT (i/e * e); + vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec)); + vt = force_const_mem (V16QImode, vt); + if (TARGET_XOP) + emit_insn (gen_xop_pperm (mask, mask, mask, vt)); + else + emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt)); - /* Convert OP0 to vector of chars. 
*/ - op0 = simplify_gen_subreg (V16QImode, op0, mode, 0); - op0 = force_reg (V16QImode, op0); - emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, new_mask)); + /* Convert it into the byte positions by doing + mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */ + for (i = 0; i < 16; ++i) + vec[i] = GEN_INT (i % e); + vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec)); + vt = force_const_mem (V16QImode, vt); + emit_insn (gen_addv16qi3 (mask, mask, vt)); + } - if (one_operand_shuffle) + /* The actual shuffle operations all operate on V16QImode. */ + op0 = gen_lowpart (V16QImode, op0); + op1 = gen_lowpart (V16QImode, op1); + target = gen_lowpart (V16QImode, target); + + if (TARGET_XOP) { - /* Convert it back from vector of chars to the original mode. */ - t1 = simplify_gen_subreg (mode, t1, V16QImode, 0); - emit_insn (gen_rtx_SET (VOIDmode, target, t1)); - return true; + emit_insn (gen_xop_pperm (target, op0, op1, mask)); + } + else if (one_operand_shuffle) + { + emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask)); } else { - rtx xops[6]; + rtx xops[6], t1, t2; + bool ok; + /* Shuffle the two input vectors independently. */ + t1 = gen_reg_rtx (V16QImode); t2 = gen_reg_rtx (V16QImode); - - /* Convert OP1 to vector of chars. */ - op1 = simplify_gen_subreg (V16QImode, op1, mode, 0); - op1 = force_reg (V16QImode, op1); - emit_insn (gen_ssse3_pshufbv16qi3 (t1, op1, new_mask)); - - /* mask = mask & {w, w, ...} */ - mask = expand_simple_binop (V16QImode, AND, mask, w_vector, + emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask)); + emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask)); + + /* Then merge them together. The key is whether any given control + element contained a bit set that indicates the second word. */ + mask = operands[3]; + vt = GEN_INT (w); + if (maskmode == V2DImode && !TARGET_SSE4_1) + { + /* Without SSE4.1, we don't have V2DImode EQ. Perform one + more shuffle to convert the V2DI input mask into a V4SI + input mask. 
At which point the masking that expand_int_vcond + will work as desired. */ + rtx t3 = gen_reg_rtx (V4SImode); + emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask), + const0_rtx, const0_rtx, + const2_rtx, const2_rtx)); + mask = t3; + maskmode = V4SImode; + e = w = 4; + } + + for (i = 0; i < w; i++) + vec[i] = vt; + vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec)); + vt = force_reg (maskmode, vt); + mask = expand_simple_binop (maskmode, AND, mask, vt, NULL_RTX, 0, OPTAB_DIRECT); - t1 = simplify_gen_subreg (mode, t1, V16QImode, 0); - t2 = simplify_gen_subreg (mode, t2, V16QImode, 0); - - xops[0] = target; - xops[1] = operands[1]; - xops[2] = operands[2]; - xops[3] = gen_rtx_EQ (mode, mask, w_vector); - xops[4] = t1; - xops[5] = t2; - - return ix86_expand_int_vcond (xops); + xops[0] = gen_lowpart (maskmode, operands[0]); + xops[1] = gen_lowpart (maskmode, t2); + xops[2] = gen_lowpart (maskmode, t1); + xops[3] = gen_rtx_EQ (maskmode, mask, vt); + xops[4] = mask; + xops[5] = vt; + ok = ix86_expand_int_vcond (xops); + gcc_assert (ok); } - - return false; } /* Unpack OP[1] into the next wider integer vector type. UNSIGNED_P is diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 251cdde..ee9cf0b 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -6229,12 +6229,10 @@ (match_operand: 3 "register_operand" "")] "TARGET_SSSE3 || TARGET_AVX" { - bool ok = ix86_expand_vshuffle (operands); - gcc_assert (ok); + ix86_expand_vshuffle (operands); DONE; }) - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; ;; Parallel bitwise logical operations