===================================================================
@@ -64,7 +64,8 @@ (define_register_constraint "wa" "rs6000
(define_register_constraint "wd" "rs6000_constraints[RS6000_CONSTRAINT_wd]"
"VSX vector register to hold vector double data or NO_REGS.")
-;; we is not currently used
+(define_register_constraint "we" "rs6000_constraints[RS6000_CONSTRAINT_we]"
+ "VSX register if the -mpower9-vector -m64 options were used or NO_REGS.")
(define_register_constraint "wf" "rs6000_constraints[RS6000_CONSTRAINT_wf]"
"VSX vector register to hold vector float data or NO_REGS.")
@@ -147,6 +148,12 @@ (define_memory_constraint "wG"
"Memory operand suitable for TOC fusion memory references"
(match_operand 0 "toc_fusion_mem_wrapped"))
+(define_constraint "wL"
+ "Int constant that is the element number mfvsrld accesses in a vector."
+ (and (match_code "const_int")
+ (and (match_test "TARGET_DIRECT_MOVE_128")
+ (match_test "(ival == VECTOR_ELEMENT_MFVSRLD_64BIT)"))))
+
;; Lq/stq validates the address for load/store quad
(define_memory_constraint "wQ"
"Memory operand suitable for the load/store quad instructions"
===================================================================
@@ -2575,6 +2575,10 @@ rs6000_debug_reg_global (void)
if (TARGET_VSX)
fprintf (stderr, DEBUG_FMT_D, "VSX easy 64-bit scalar element",
(int)VECTOR_ELEMENT_SCALAR_64BIT);
+
+ if (TARGET_DIRECT_MOVE_128)
+ fprintf (stderr, DEBUG_FMT_D, "VSX easy 64-bit mfvsrld element",
+ (int)VECTOR_ELEMENT_MFVSRLD_64BIT);
}
@@ -2986,6 +2990,10 @@ rs6000_init_hard_regno_mode_ok (bool glo
rs6000_constraints[RS6000_CONSTRAINT_wp] = VSX_REGS; /* TFmode */
}
+ /* Support for new direct moves. */
+ if (TARGET_DIRECT_MOVE_128)
+ rs6000_constraints[RS6000_CONSTRAINT_we] = VSX_REGS;
+
/* Set up the reload helper and direct move functions. */
if (TARGET_VSX || TARGET_ALTIVEC)
{
@@ -3034,7 +3042,7 @@ rs6000_init_hard_regno_mode_ok (bool glo
reg_addr[TImode].reload_load = CODE_FOR_reload_ti_di_load;
}
- if (TARGET_DIRECT_MOVE)
+ if (TARGET_DIRECT_MOVE && !TARGET_DIRECT_MOVE_128)
{
reg_addr[TImode].reload_gpr_vsx = CODE_FOR_reload_gpr_from_vsxti;
reg_addr[V1TImode].reload_gpr_vsx = CODE_FOR_reload_gpr_from_vsxv1ti;
@@ -18081,6 +18089,11 @@ rs6000_secondary_reload_simple_move (enu
|| (to_type == VSX_REG_TYPE && from_type == GPR_REG_TYPE)))
return true;
+ else if (TARGET_DIRECT_MOVE_128 && size == 16
+ && ((to_type == VSX_REG_TYPE && from_type == GPR_REG_TYPE)
+ || (to_type == GPR_REG_TYPE && from_type == VSX_REG_TYPE)))
+ return true;
+
else if (TARGET_MFPGPR && TARGET_POWERPC64 && size == 8
&& ((to_type == GPR_REG_TYPE && from_type == FPR_REG_TYPE)
|| (to_type == FPR_REG_TYPE && from_type == GPR_REG_TYPE)))
@@ -18094,7 +18107,7 @@ rs6000_secondary_reload_simple_move (enu
return false;
}
-/* Power8 helper function for rs6000_secondary_reload, handle all of the
+/* Direct move helper function for rs6000_secondary_reload, handle all of the
special direct moves that involve allocating an extra register, return the
insn code of the helper function if there is such a function or
CODE_FOR_nothing if not. */
@@ -18116,8 +18129,8 @@ rs6000_secondary_reload_direct_move (enu
if (size == 16)
{
/* Handle moving 128-bit values from GPRs to VSX point registers on
- power8 when running in 64-bit mode using XXPERMDI to glue the two
- 64-bit values back together. */
+ ISA 2.07 (power8, power9) when running in 64-bit mode using
+ XXPERMDI to glue the two 64-bit values back together. */
if (to_type == VSX_REG_TYPE && from_type == GPR_REG_TYPE)
{
cost = 3; /* 2 mtvsrd's, 1 xxpermdi. */
@@ -18125,7 +18138,7 @@ rs6000_secondary_reload_direct_move (enu
}
/* Handle moving 128-bit values from VSX point registers to GPRs on
- power8 when running in 64-bit mode using XXPERMDI to get access to the
+ ISA 2.07 when running in 64-bit mode using XXPERMDI to get access to the
bottom 64-bit value. */
else if (to_type == GPR_REG_TYPE && from_type == VSX_REG_TYPE)
{
@@ -18153,7 +18166,7 @@ rs6000_secondary_reload_direct_move (enu
if (TARGET_POWERPC64 && size == 16)
{
/* Handle moving 128-bit values from GPRs to VSX point registers on
- power8 when running in 64-bit mode using XXPERMDI to glue the two
+ ISA 2.07 when running in 64-bit mode using XXPERMDI to glue the two
64-bit values back together. */
if (to_type == VSX_REG_TYPE && from_type == GPR_REG_TYPE)
{
@@ -18162,7 +18175,7 @@ rs6000_secondary_reload_direct_move (enu
}
/* Handle moving 128-bit values from VSX point registers to GPRs on
- power8 when running in 64-bit mode using XXPERMDI to get access to the
+ ISA 2.07 when running in 64-bit mode using XXPERMDI to get access to the
bottom 64-bit value. */
else if (to_type == GPR_REG_TYPE && from_type == VSX_REG_TYPE)
{
@@ -18174,8 +18187,8 @@ rs6000_secondary_reload_direct_move (enu
else if (!TARGET_POWERPC64 && size == 8)
{
/* Handle moving 64-bit values from GPRs to floating point registers on
- power8 when running in 32-bit mode using FMRGOW to glue the two 32-bit
- values back together. Altivec register classes must be handled
+ ISA 2.07 when running in 32-bit mode using FMRGOW to glue the two
+ 32-bit values back together. Altivec register classes must be handled
specially since a different instruction is used, and the secondary
reload support requires a single instruction class in the scratch
register constraint. However, right now TFmode is not allowed in
@@ -18202,7 +18215,7 @@ rs6000_secondary_reload_direct_move (enu
/* Return whether a move between two register classes can be done either
directly (simple move) or via a pattern that uses a single extra temporary
-   (using power8's direct move in this case.  */
+   (using ISA 2.07's direct move in this case).  */
static bool
rs6000_secondary_reload_move (enum rs6000_reg_type to_type,
@@ -19241,6 +19254,11 @@ rs6000_output_move_128bit (rtx operands[
if (src_gpr_p)
return "#";
+ if (TARGET_DIRECT_MOVE_128 && src_vsx_p)
+ return (WORDS_BIG_ENDIAN
+ ? "mfvsrd %0,%x1\n\tmfvsrld %L0,%x1"
+ : "mfvsrd %L0,%x1\n\tmfvsrld %0,%x1");
+
else if (TARGET_VSX && TARGET_DIRECT_MOVE && src_vsx_p)
return "#";
}
@@ -19250,6 +19268,11 @@ rs6000_output_move_128bit (rtx operands[
if (src_vsx_p)
return "xxlor %x0,%x1,%x1";
+ else if (TARGET_DIRECT_MOVE_128 && src_gpr_p)
+ return (WORDS_BIG_ENDIAN
+ ? "mtvsrdd %x0,%1,%L1"
+ : "mtvsrdd %x0,%L1,%1");
+
else if (TARGET_DIRECT_MOVE && src_gpr_p)
return "#";
}
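
For illustration only (this sketch is not part of the patch): with the rs6000_output_move_128bit changes above, a plain 128-bit copy between a VSX register and a GPR pair is expected to compile to the new direct-move sequences instead of going through memory or the longer ISA 2.07 reload sequences. The function names, the -mcpu=power9 -m64 -O2 options, and the register numbers in the comments are assumptions for the sketch; the compiler may still choose a different sequence.

/* Sketch of the expected ISA 3.0 direct moves (assumes -mcpu=power9 -m64 -O2).  */
typedef unsigned long long v2du __attribute__ ((vector_size (16)));

unsigned __int128
vsx_to_gprs (v2du v)
{
  /* VSX -> GPR pair: expected to use the two-instruction sequence,
     e.g. on big endian:
	 mfvsrd  3,34		# doubleword 0 -> first GPR
	 mfvsrld 4,34		# doubleword 1 -> second GPR  */
  union { v2du v; unsigned __int128 i; } u;
  u.v = v;
  return u.i;
}

v2du
gprs_to_vsx (unsigned __int128 x)
{
  /* GPR pair -> VSX: expected to use a single mtvsrdd, with the two GPR
     operands swapped between big and little endian as in the output
     templates above.  */
  union { unsigned __int128 i; v2du v; } u;
  u.i = x;
  return u.v;
}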
===================================================================
@@ -760,31 +760,31 @@ (define_split
"")
(define_insn "*vsx_mov<mode>"
- [(set (match_operand:VSX_M 0 "nonimmediate_operand" "=Z,<VSr>,<VSr>,?Z,?<VSa>,?<VSa>,wQ,?&r,??Y,??r,??r,<VSr>,?<VSa>,*r,v,wZ, v")
- (match_operand:VSX_M 1 "input_operand" "<VSr>,Z,<VSr>,<VSa>,Z,<VSa>,r,wQ,r,Y,r,j,j,j,W,v,wZ"))]
+ [(set (match_operand:VSX_M 0 "nonimmediate_operand" "=Z,<VSr>,<VSr>,?Z,?<VSa>,?<VSa>,r,we,wQ,?&r,??Y,??r,??r,<VSr>,?<VSa>,*r,v,wZ,v")
+ (match_operand:VSX_M 1 "input_operand" "<VSr>,Z,<VSr>,<VSa>,Z,<VSa>,we,b,r,wQ,r,Y,r,j,j,j,W,v,wZ"))]
"VECTOR_MEM_VSX_P (<MODE>mode)
&& (register_operand (operands[0], <MODE>mode)
|| register_operand (operands[1], <MODE>mode))"
{
return rs6000_output_move_128bit (operands);
}
- [(set_attr "type" "vecstore,vecload,vecsimple,vecstore,vecload,vecsimple,load,store,store,load, *,vecsimple,vecsimple,*, *,vecstore,vecload")
- (set_attr "length" "4,4,4,4,4,4,12,12,12,12,16,4,4,*,16,4,4")])
+ [(set_attr "type" "vecstore,vecload,vecsimple,vecstore,vecload,vecsimple,mffgpr,mftgpr,load,store,store,load, *,vecsimple,vecsimple,*, *,vecstore,vecload")
+ (set_attr "length" "4,4,4,4,4,4,8,4,12,12,12,12,16,4,4,*,16,4,4")])
;; Unlike other VSX moves, allow the GPRs even for reloading, since a normal
;; use of TImode is for unions. However for plain data movement, slightly
;; favor the vector loads
(define_insn "*vsx_movti_64bit"
- [(set (match_operand:TI 0 "nonimmediate_operand" "=Z,wa,wa,wa,v,v,wZ,wQ,&r,Y,r,r,?r")
- (match_operand:TI 1 "input_operand" "wa,Z,wa,O,W,wZ,v,r,wQ,r,Y,r,n"))]
+ [(set (match_operand:TI 0 "nonimmediate_operand" "=Z,wa,wa,wa,r,we,v,v,wZ,wQ,&r,Y,r,r,?r")
+ (match_operand:TI 1 "input_operand" "wa,Z,wa,O,we,b,W,wZ,v,r,wQ,r,Y,r,n"))]
"TARGET_POWERPC64 && VECTOR_MEM_VSX_P (TImode)
&& (register_operand (operands[0], TImode)
|| register_operand (operands[1], TImode))"
{
return rs6000_output_move_128bit (operands);
}
- [(set_attr "type" "vecstore,vecload,vecsimple,vecsimple,vecsimple,vecstore,vecload,store,load,store,load,*,*")
- (set_attr "length" "4,4,4,4,16,4,4,8,8,8,8,8,8")])
+ [(set_attr "type" "vecstore,vecload,vecsimple,vecsimple,mffgpr,mftgpr,vecsimple,vecstore,vecload,store,load,store,load,*,*")
+ (set_attr "length" "4,4,4,4,8,4,16,4,4,8,8,8,8,8,8")])
(define_insn "*vsx_movti_32bit"
[(set (match_operand:TI 0 "nonimmediate_operand" "=Z,wa,wa,wa,v, v,wZ,Q,Y,????r,????r,????r,r")
@@ -1909,11 +1909,11 @@ (define_expand "vsx_extract_<mode>"
;; Optimize cases were we can do a simple or direct move.
;; Or see if we can avoid doing the move at all
(define_insn "*vsx_extract_<mode>_internal1"
- [(set (match_operand:<VS_scalar> 0 "register_operand" "=d,<VS_64reg>,r")
+ [(set (match_operand:<VS_scalar> 0 "register_operand" "=d,<VS_64reg>,r,r")
(vec_select:<VS_scalar>
- (match_operand:VSX_D 1 "register_operand" "d,<VS_64reg>,<VS_64dm>")
+ (match_operand:VSX_D 1 "register_operand" "d,<VS_64reg>,<VS_64dm>,<VS_64dm>")
(parallel
- [(match_operand:QI 2 "vsx_scalar_64bit" "wD,wD,wD")])))]
+ [(match_operand:QI 2 "vsx_scalar_64bit" "wD,wD,wD,wL")])))]
"VECTOR_MEM_VSX_P (<MODE>mode) && TARGET_POWERPC64 && TARGET_DIRECT_MOVE"
{
int op0_regno = REGNO (operands[0]);
@@ -1923,14 +1923,16 @@ (define_insn "*vsx_extract_<mode>_intern
return "nop";
if (INT_REGNO_P (op0_regno))
- return "mfvsrd %0,%x1";
+ return ((INTVAL (operands[2]) == VECTOR_ELEMENT_MFVSRLD_64BIT)
+	    ? "mfvsrld %0,%x1"
+ : "mfvsrd %0,%x1");
if (FP_REGNO_P (op0_regno) && FP_REGNO_P (op1_regno))
return "fmr %0,%1";
return "xxlor %x0,%x1,%x1";
}
- [(set_attr "type" "fp,vecsimple,mftgpr")
+ [(set_attr "type" "fp,vecsimple,mftgpr,mftgpr")
(set_attr "length" "4")])
(define_insn "*vsx_extract_<mode>_internal2"
===================================================================
@@ -516,6 +516,10 @@ extern int rs6000_vector_align[];
with scalar instructions. */
#define VECTOR_ELEMENT_SCALAR_64BIT ((BYTES_BIG_ENDIAN) ? 0 : 1)
+/* Element number of the 64-bit value in a 128-bit vector that can be accessed
+   with the ISA 3.0 MFVSRLD instruction.  */
+#define VECTOR_ELEMENT_MFVSRLD_64BIT ((BYTES_BIG_ENDIAN) ? 1 : 0)
+
/* Alignment options for fields in structures for sub-targets following
AIX-like ABI.
ALIGN_POWER word-aligns FP doubles (default AIX ABI).
@@ -571,6 +575,8 @@ extern int rs6000_vector_align[];
#define TARGET_XSCVDPSPN (TARGET_DIRECT_MOVE || TARGET_P8_VECTOR)
#define TARGET_XSCVSPDPN (TARGET_DIRECT_MOVE || TARGET_P8_VECTOR)
#define TARGET_VADDUQM (TARGET_P8_VECTOR && TARGET_POWERPC64)
+#define TARGET_DIRECT_MOVE_128 (TARGET_P9_VECTOR && TARGET_DIRECT_MOVE \
+ && TARGET_POWERPC64)
/* Byte/char syncs were added as phased in for ISA 2.06B, but are not present
in power7, so conditionalize them on p8 features. TImode syncs need quad
@@ -1517,6 +1523,7 @@ enum r6000_reg_class_enum {
RS6000_CONSTRAINT_v, /* Altivec registers */
RS6000_CONSTRAINT_wa, /* Any VSX register */
RS6000_CONSTRAINT_wd, /* VSX register for V2DF */
+ RS6000_CONSTRAINT_we, /* VSX register if ISA 3.0 vector. */
RS6000_CONSTRAINT_wf, /* VSX register for V4SF */
RS6000_CONSTRAINT_wg, /* FPR register for -mmfpgpr */
RS6000_CONSTRAINT_wh, /* FPR register for direct moves. */
===================================================================
@@ -7521,7 +7521,10 @@ (define_split
(match_operand:FMOVE128_GPR 1 "input_operand" ""))]
"reload_completed
&& (int_reg_operand (operands[0], <MODE>mode)
- || int_reg_operand (operands[1], <MODE>mode))"
+ || int_reg_operand (operands[1], <MODE>mode))
+ && (!TARGET_DIRECT_MOVE_128
+ || (!vsx_register_operand (operands[0], <MODE>mode)
+ && !vsx_register_operand (operands[1], <MODE>mode)))"
[(pc)]
{ rs6000_split_multireg_move (operands[0], operands[1]); DONE; })
===================================================================
@@ -3121,9 +3121,28 @@ asm ("xvadddp %0,%1,%2" : "=wa" (v1) : "
is not correct.
+If an instruction only takes Altivec registers, you do not want to use
+@code{%x<n>}.
+
+@smallexample
+asm ("xsaddqp %0,%1,%2" : "=v" (v1) : "v" (v2), "v" (v3));
+@end smallexample
+
+is correct because the @code{xsaddqp} instruction only takes Altivec
+registers, while:
+
+@smallexample
+asm ("xsaddqp %x0,%x1,%x2" : "=v" (v1) : "v" (v2), "v" (v3));
+@end smallexample
+
+is incorrect.
+
@item wd
VSX vector register to hold vector double data or NO_REGS.
+@item we
+VSX register if the @option{-mpower9-vector} and @option{-m64} options were used or NO_REGS.
+
@item wf
VSX vector register to hold vector float data or NO_REGS.
@@ -3187,6 +3206,16 @@ Floating point register if the LFIWZX in
@item wD
Int constant that is the element number of the 64-bit scalar in a vector.
+@item wF
+Memory operand suitable for power9 fusion load/stores.
+
+@item wG
+Memory operand suitable for TOC fusion memory references.
+
+@item wL
+Int constant that is the element number that the MFVSRLD instruction
+targets.
+
@item wQ
A memory address that will work with the @code{lq} and @code{stq}
instructions.
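
A hypothetical use of the new "we" constraint in user inline asm (illustrative only, not part of the patched documentation; it assumes -mcpu=power9 -m64 so that the constraint is not NO_REGS, and shows the big-endian doubleword order):

vector double
pack_hi_lo (unsigned long hi, unsigned long lo)
{
  vector double v;
  /* mtvsrdd accepts any VSX register, so the %x output modifier is the
     right way to print the register number for the "we" operand.  The
     first GPR operand uses "b" because mtvsrdd treats RA=0 as the
     literal value zero, just as the "b" constraint in the move pattern
     above does.  */
  __asm__ ("mtvsrdd %x0,%1,%2" : "=we" (v) : "b" (hi), "r" (lo));
  return v;
}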
===================================================================
@@ -0,0 +1,32 @@
+/* { dg-do compile { target { powerpc*-*-linux* && lp64 } } } */
+/* { dg-skip-if "" { powerpc*-*-darwin* } { "*" } { "" } } */
+/* { dg-skip-if "" { powerpc*-*-*spe* } { "*" } { "" } } */
+/* { dg-require-effective-target powerpc_p9vector_ok } */
+/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power9" } } */
+/* { dg-options "-mcpu=power9 -O2" } */
+
+/* Check code generation for direct move of 128-bit vector types. */
+
+void
+test (vector double *p)
+{
+ vector double v1 = *p;
+ vector double v2;
+ vector double v3;
+
+ /* Force memory -> FPR load. */
+ __asm__ (" # reg %x0" : "+d" (v1));
+
+  /* Force VSX -> GPR direct move. */
+ v2 = v1;
+ __asm__ (" # reg %0" : "+r" (v2));
+
+ /* Force GPR -> Altivec direct move. */
+ v3 = v2;
+ __asm__ (" # reg %x0" : "+v" (v3));
+ *p = v3;
+}
+
+/* { dg-final { scan-assembler "mfvsrd" } } */
+/* { dg-final { scan-assembler "mfvsrld" } } */
+/* { dg-final { scan-assembler "mtvsrdd" } } */