From patchwork Sun Feb 27 13:45:11 2011 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Uros Bizjak X-Patchwork-Id: 84690 Return-Path: X-Original-To: incoming@patchwork.ozlabs.org Delivered-To: patchwork-incoming@bilbo.ozlabs.org Received: from sourceware.org (server1.sourceware.org [209.132.180.131]) by ozlabs.org (Postfix) with SMTP id 9C74DB70F8 for ; Mon, 28 Feb 2011 00:45:25 +1100 (EST) Received: (qmail 16661 invoked by alias); 27 Feb 2011 13:45:23 -0000 Received: (qmail 16646 invoked by uid 22791); 27 Feb 2011 13:45:19 -0000 X-SWARE-Spam-Status: No, hits=-2.2 required=5.0 tests=AWL, BAYES_00, DKIM_SIGNED, DKIM_VALID, DKIM_VALID_AU, FREEMAIL_FROM, RCVD_IN_DNSWL_LOW, TW_ZJ, T_TO_NO_BRKTS_FREEMAIL X-Spam-Check-By: sourceware.org Received: from mail-px0-f175.google.com (HELO mail-px0-f175.google.com) (209.85.212.175) by sourceware.org (qpsmtpd/0.43rc1) with ESMTP; Sun, 27 Feb 2011 13:45:13 +0000 Received: by pxi17 with SMTP id 17so643190pxi.20 for ; Sun, 27 Feb 2011 05:45:11 -0800 (PST) MIME-Version: 1.0 Received: by 10.142.113.6 with SMTP id l6mr3381482wfc.296.1298814311492; Sun, 27 Feb 2011 05:45:11 -0800 (PST) Received: by 10.142.54.6 with HTTP; Sun, 27 Feb 2011 05:45:11 -0800 (PST) Date: Sun, 27 Feb 2011 14:45:11 +0100 Message-ID: Subject: [RFC PATCH, i386]: Vectorize calls to floor, ceil, trunc and rint functions. From: Uros Bizjak To: gcc-patches@gcc.gnu.org Mailing-List: contact gcc-patches-help@gcc.gnu.org; run by ezmlm Precedence: bulk List-Id: List-Unsubscribe: List-Archive: List-Post: List-Help: Sender: gcc-patches-owner@gcc.gnu.org Delivered-To: mailing list gcc-patches@gcc.gnu.org Hello! The attached [RFC] patch vectorizes calls to the floor, ceil, trunc and rint functions (and their float variants) using the SSE4.1 roundps/roundpd instructions. 2011-02-27 Uros Bizjak * config/i386/i386.md (ROUND_FLOOR): New constant. (ROUND_CEIL): Ditto. (ROUND_TRUNC): Ditto. (ROUND_MXCSR): Ditto. 
(ROUND_NO_EXC): Ditto. (rint2): Use new defines instead of numerical constant. (floor2): Ditto. (ceil2): Ditto. (btrunc2): Ditto. * config/i386/i386-builtin-types.def: Define ROUND function type aliases. * config/i386/i386.c (enum ix86_builtins): Add IX86_BUILTIN_{FLOOR,CEIL,TRUNC,RINT}{PS,PD}{,256} defines. (struct builtin_description): Add __builtin_ia32_{floor,ceil,trunc,rint}{pd,ps}{,256} descriptions. (ix86_expand_sse_round): New static function. (ix86_expand_args_builtin): Call ix86_expand_sse_round for ROUND function types. (ix86_builtin_vectorized_function): Handle BUILT_IN_{FLOOR,CEIL,TRUNC,RINT}{,F} builtins. Patch was bootstrapped and regression tested on x86_64-pc-linux-gnu (--with-fpmath=avx). Currently, it does not include test cases, but it is RFC at this gcc development stage anyway. Uros. Index: i386.md =================================================================== --- i386.md (revision 170534) +++ i386.md (working copy) @@ -272,6 +272,15 @@ UNSPECV_SPLIT_STACK_RETURN ]) +;; Constants to represent rounding modes in the ROUND instruction +(define_constants + [(ROUND_FLOOR 0x1) + (ROUND_CEIL 0x2) + (ROUND_TRUNC 0x3) + (ROUND_MXCSR 0x4) + (ROUND_NO_EXC 0x8) + ]) + ;; Constants to represent pcomtrue/pcomfalse variants (define_constants [(PCOM_FALSE 0) @@ -14573,7 +14582,7 @@ FAIL; if (TARGET_ROUND) emit_insn (gen_sse4_1_round2 - (operands[0], operands[1], GEN_INT (0x04))); + (operands[0], operands[1], GEN_INT (ROUND_MXCSR))); else ix86_expand_rint (operand0, operand1); } @@ -14819,7 +14828,7 @@ FAIL; if (TARGET_ROUND) emit_insn (gen_sse4_1_round2 - (operands[0], operands[1], GEN_INT (0x01))); + (operands[0], operands[1], GEN_INT (ROUND_FLOOR))); else if (TARGET_64BIT || (mode != DFmode)) ix86_expand_floorceil (operand0, operand1, true); else @@ -15074,7 +15083,7 @@ { if (TARGET_ROUND) emit_insn (gen_sse4_1_round2 - (operands[0], operands[1], GEN_INT (0x02))); + (operands[0], operands[1], GEN_INT (ROUND_CEIL))); else if (optimize_insn_for_size_p 
()) FAIL; else if (TARGET_64BIT || (mode != DFmode)) @@ -15329,7 +15338,7 @@ { if (TARGET_ROUND) emit_insn (gen_sse4_1_round2 - (operands[0], operands[1], GEN_INT (0x03))); + (operands[0], operands[1], GEN_INT (ROUND_TRUNC))); else if (optimize_insn_for_size_p ()) FAIL; else if (TARGET_64BIT || (mode != DFmode)) Index: i386-builtin-types.def =================================================================== --- i386-builtin-types.def (revision 170534) +++ i386-builtin-types.def (working copy) @@ -377,6 +377,11 @@ DEF_FUNCTION_TYPE (V16QI, V16QI, INT, V1 DEF_FUNCTION_TYPE (V8QI, QI, QI, QI, QI, QI, QI, QI, QI) +DEF_FUNCTION_TYPE_ALIAS (V2DF_FTYPE_V2DF, ROUND) +DEF_FUNCTION_TYPE_ALIAS (V4DF_FTYPE_V4DF, ROUND) +DEF_FUNCTION_TYPE_ALIAS (V4SF_FTYPE_V4SF, ROUND) +DEF_FUNCTION_TYPE_ALIAS (V8SF_FTYPE_V8SF, ROUND) + DEF_FUNCTION_TYPE_ALIAS (INT_FTYPE_V2DF_V2DF, PTEST) DEF_FUNCTION_TYPE_ALIAS (INT_FTYPE_V2DI_V2DI, PTEST) DEF_FUNCTION_TYPE_ALIAS (INT_FTYPE_V4DF_V4DF, PTEST) Index: i386.c =================================================================== --- i386.c (revision 170534) +++ i386.c (working copy) @@ -23916,6 +23916,15 @@ enum ix86_builtins IX86_BUILTIN_ROUNDSD, IX86_BUILTIN_ROUNDSS, + IX86_BUILTIN_FLOORPD, + IX86_BUILTIN_CEILPD, + IX86_BUILTIN_TRUNCPD, + IX86_BUILTIN_RINTPD, + IX86_BUILTIN_FLOORPS, + IX86_BUILTIN_CEILPS, + IX86_BUILTIN_TRUNCPS, + IX86_BUILTIN_RINTPS, + IX86_BUILTIN_PTESTZ, IX86_BUILTIN_PTESTC, IX86_BUILTIN_PTESTNZC, @@ -24083,6 +24092,15 @@ enum ix86_builtins IX86_BUILTIN_ROUNDPD256, IX86_BUILTIN_ROUNDPS256, + IX86_BUILTIN_FLOORPD256, + IX86_BUILTIN_CEILPD256, + IX86_BUILTIN_TRUNCPD256, + IX86_BUILTIN_RINTPD256, + IX86_BUILTIN_FLOORPS256, + IX86_BUILTIN_CEILPS256, + IX86_BUILTIN_TRUNCPS256, + IX86_BUILTIN_RINTPS256, + IX86_BUILTIN_UNPCKHPD256, IX86_BUILTIN_UNPCKLPD256, IX86_BUILTIN_UNPCKHPS256, @@ -25105,6 +25123,16 @@ static const struct builtin_description { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundsd, "__builtin_ia32_roundsd", 
IX86_BUILTIN_ROUNDSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT }, { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundss, "__builtin_ia32_roundss", IX86_BUILTIN_ROUNDSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT }, + { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_floorpd", IX86_BUILTIN_FLOORPD, (enum rtx_code) ROUND_FLOOR, (int) V2DF_FTYPE_V2DF_ROUND }, + { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_ceilpd", IX86_BUILTIN_CEILPD, (enum rtx_code) ROUND_CEIL, (int) V2DF_FTYPE_V2DF_ROUND }, + { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_truncpd", IX86_BUILTIN_TRUNCPD, (enum rtx_code) ROUND_TRUNC, (int) V2DF_FTYPE_V2DF_ROUND }, + { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_rintpd", IX86_BUILTIN_RINTPD, (enum rtx_code) ROUND_MXCSR, (int) V2DF_FTYPE_V2DF_ROUND }, + + { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_floorps", IX86_BUILTIN_FLOORPS, (enum rtx_code) ROUND_FLOOR, (int) V4SF_FTYPE_V4SF_ROUND }, + { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_ceilps", IX86_BUILTIN_CEILPS, (enum rtx_code) ROUND_CEIL, (int) V4SF_FTYPE_V4SF_ROUND }, + { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_truncps", IX86_BUILTIN_TRUNCPS, (enum rtx_code) ROUND_TRUNC, (int) V4SF_FTYPE_V4SF_ROUND }, + { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_rintps", IX86_BUILTIN_RINTPS, (enum rtx_code) ROUND_MXCSR, (int) V4SF_FTYPE_V4SF_ROUND }, + { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestz128", IX86_BUILTIN_PTESTZ, EQ, (int) INT_FTYPE_V2DI_V2DI_PTEST }, { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestc128", IX86_BUILTIN_PTESTC, LTU, (int) INT_FTYPE_V2DI_V2DI_PTEST }, { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestnzc128", IX86_BUILTIN_PTESTNZC, GTU, (int) INT_FTYPE_V2DI_V2DI_PTEST }, @@ -25217,6 +25245,16 @@ static const struct builtin_description { OPTION_MASK_ISA_AVX, 
CODE_FOR_avx_roundpd256, "__builtin_ia32_roundpd256", IX86_BUILTIN_ROUNDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT }, { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_roundps256", IX86_BUILTIN_ROUNDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_INT }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_floorpd256", IX86_BUILTIN_FLOORPD256, (enum rtx_code) ROUND_FLOOR, (int) V4DF_FTYPE_V4DF_ROUND }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_ceilpd256", IX86_BUILTIN_CEILPD256, (enum rtx_code) ROUND_CEIL, (int) V4DF_FTYPE_V4DF_ROUND }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_truncpd256", IX86_BUILTIN_TRUNCPD256, (enum rtx_code) ROUND_TRUNC, (int) V4DF_FTYPE_V4DF_ROUND }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_rintpd256", IX86_BUILTIN_RINTPD256, (enum rtx_code) ROUND_MXCSR, (int) V4DF_FTYPE_V4DF_ROUND }, + + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_floorps256", IX86_BUILTIN_FLOORPS256, (enum rtx_code) ROUND_FLOOR, (int) V8SF_FTYPE_V8SF_ROUND }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_ceilps256", IX86_BUILTIN_CEILPS256, (enum rtx_code) ROUND_CEIL, (int) V8SF_FTYPE_V8SF_ROUND }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_truncps256", IX86_BUILTIN_TRUNCPS256, (enum rtx_code) ROUND_TRUNC, (int) V8SF_FTYPE_V8SF_ROUND }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_rintps256", IX86_BUILTIN_RINTPS256, (enum rtx_code) ROUND_MXCSR, (int) V8SF_FTYPE_V8SF_ROUND }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhpd256, "__builtin_ia32_unpckhpd256", IX86_BUILTIN_UNPCKHPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF }, { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklpd256, "__builtin_ia32_unpcklpd256", IX86_BUILTIN_UNPCKLPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF }, { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhps256, "__builtin_ia32_unpckhps256", IX86_BUILTIN_UNPCKHPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF }, @@ 
-26216,6 +26254,51 @@ ix86_expand_sse_comi (const struct built
   return SUBREG_REG (target);
 }
 
+/* Subroutine of ix86_expand_args_builtin to take care of round insns.
+   D describes the builtin: D->icode is the insn pattern to emit and
+   D->comparison carries the rounding-mode immediate, not a comparison
+   code.  EXP is the CALL_EXPR; its first argument is the vector to be
+   rounded.  TARGET is a suggested place for the result and may be 0.
+   Return the rtx holding the result, or 0 if no insn could be
+   generated.  */
+
+static rtx
+ix86_expand_sse_round (const struct builtin_description *d, tree exp,
+                       rtx target)
+{
+  rtx pat;
+  tree arg0 = CALL_EXPR_ARG (exp, 0);
+  rtx op1, op0 = expand_normal (arg0);
+  enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
+  enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
+
+  /* Use a fresh pseudo unless TARGET is already acceptable as
+     operand 0 of the insn.  */
+  if (optimize || target == 0
+      || GET_MODE (target) != tmode
+      || !insn_data[d->icode].operand[0].predicate (target, tmode))
+    target = gen_reg_rtx (tmode);
+
+  if (VECTOR_MODE_P (mode0))
+    op0 = safe_vector_operand (op0, mode0);
+
+  /* OP0 is passed as operand 1 of the insn, so check it against
+     operand 1's predicate, not the destination's.  */
+  if ((optimize && !register_operand (op0, mode0))
+      || !insn_data[d->icode].operand[1].predicate (op0, mode0))
+    op0 = copy_to_mode_reg (mode0, op0);
+
+  /* The rounding mode is stored in the comparison field of the
+     builtin description.  */
+  op1 = GEN_INT (d->comparison);
+
+  pat = GEN_FCN (d->icode) (target, op0, op1);
+  if (! pat)
+    return 0;
+  emit_insn (pat);
+  return target;
+}
+
 /* Subroutine of ix86_expand_builtin to take care of ptest insns.  */
 
 static rtx
@@ -26485,6 +26568,11 @@ ix86_expand_args_builtin (const struct b
 
   switch ((enum ix86_builtin_func_type) d->flag)
     {
+    case V2DF_FTYPE_V2DF_ROUND:
+    case V4DF_FTYPE_V4DF_ROUND:
+    case V4SF_FTYPE_V4SF_ROUND:
+    case V8SF_FTYPE_V8SF_ROUND:
+      return ix86_expand_sse_round (d, exp, target);
     case INT_FTYPE_V8SF_V8SF_PTEST:
     case INT_FTYPE_V4DI_V4DI_PTEST:
     case INT_FTYPE_V4DF_V4DF_PTEST:
@@ -27581,6 +27669,110 @@ ix86_builtin_vectorized_function (tree f
         }
       break;
 
+    case BUILT_IN_FLOOR:
+      /* The round insn does not trap on denormals.  */
+      if (flag_trapping_math)
+        return NULL_TREE;
+      if (out_mode == DFmode && in_mode == DFmode)
+        {
+          if (out_n == 2 && in_n == 2)
+            return ix86_builtins[IX86_BUILTIN_FLOORPD];
+          else if (out_n == 4 && in_n == 4)
+            return ix86_builtins[IX86_BUILTIN_FLOORPD256];
+        }
+      break;
+
+    case BUILT_IN_FLOORF:
+      /* The round insn does not trap on denormals. 
*/ + if (flag_trapping_math) + return NULL_TREE; + if (out_mode == SFmode && in_mode == SFmode) + { + if (out_n == 4 && in_n == 4) + return ix86_builtins[IX86_BUILTIN_FLOORPS]; + else if (out_n == 8 && in_n == 8) + return ix86_builtins[IX86_BUILTIN_FLOORPS256]; + } + break; + + case BUILT_IN_CEIL: + /* The round insn does not trap on denormals. */ + if (flag_trapping_math) + return NULL_TREE; + if (out_mode == DFmode && in_mode == DFmode) + { + if (out_n == 2 && in_n == 2) + return ix86_builtins[IX86_BUILTIN_CEILPD]; + else if (out_n == 4 && in_n == 4) + return ix86_builtins[IX86_BUILTIN_CEILPD256]; + } + break; + + case BUILT_IN_CEILF: + /* The round insn does not trap on denormals. */ + if (flag_trapping_math) + return NULL_TREE; + if (out_mode == SFmode && in_mode == SFmode) + { + if (out_n == 4 && in_n == 4) + return ix86_builtins[IX86_BUILTIN_CEILPS]; + else if (out_n == 8 && in_n == 8) + return ix86_builtins[IX86_BUILTIN_CEILPS256]; + } + break; + + case BUILT_IN_TRUNC: + /* The round insn does not trap on denormals. */ + if (flag_trapping_math) + return NULL_TREE; + if (out_mode == DFmode && in_mode == DFmode) + { + if (out_n == 2 && in_n == 2) + return ix86_builtins[IX86_BUILTIN_TRUNCPD]; + else if (out_n == 4 && in_n == 4) + return ix86_builtins[IX86_BUILTIN_TRUNCPD256]; + } + break; + + case BUILT_IN_TRUNCF: + /* The round insn does not trap on denormals. */ + if (flag_trapping_math) + return NULL_TREE; + if (out_mode == SFmode && in_mode == SFmode) + { + if (out_n == 4 && in_n == 4) + return ix86_builtins[IX86_BUILTIN_TRUNCPS]; + else if (out_n == 8 && in_n == 8) + return ix86_builtins[IX86_BUILTIN_TRUNCPS256]; + } + break; + + case BUILT_IN_RINT: + /* The round insn does not trap on denormals. 
*/ + if (flag_trapping_math) + return NULL_TREE; + if (out_mode == DFmode && in_mode == DFmode) + { + if (out_n == 2 && in_n == 2) + return ix86_builtins[IX86_BUILTIN_RINTPD]; + else if (out_n == 4 && in_n == 4) + return ix86_builtins[IX86_BUILTIN_RINTPD256]; + } + break; + + case BUILT_IN_RINTF: + /* The round insn does not trap on denormals. */ + if (flag_trapping_math) + return NULL_TREE; + if (out_mode == SFmode && in_mode == SFmode) + { + if (out_n == 4 && in_n == 4) + return ix86_builtins[IX86_BUILTIN_RINTPS]; + else if (out_n == 8 && in_n == 8) + return ix86_builtins[IX86_BUILTIN_RINTPS256]; + } + break; + case BUILT_IN_FMA: if (out_mode == DFmode && in_mode == DFmode) {