===================================================================
@@ -270,7 +270,7 @@
{
HOST_WIDE_INT r;
- if (!TARGET_QUAD_MEMORY)
+ if (!TARGET_QUAD_MEMORY && !TARGET_QUAD_MEMORY_ATOMIC)
return 0;
if (GET_CODE (op) == SUBREG)
@@ -633,6 +633,7 @@
(match_test "offsettable_nonstrict_memref_p (op)")))
;; Return 1 if the operand is suitable for load/store quad memory.
+;; This predicate only checks for non-atomic loads/stores.
(define_predicate "quad_memory_operand"
(match_code "mem")
{
===================================================================
@@ -337,6 +337,10 @@ rs6000_target_modify_macros (bool define
rs6000_define_or_undefine_macro (define_p, "__HTM__");
if ((flags & OPTION_MASK_P8_VECTOR) != 0)
rs6000_define_or_undefine_macro (define_p, "__POWER8_VECTOR__");
+ if ((flags & OPTION_MASK_QUAD_MEMORY) != 0)
+ rs6000_define_or_undefine_macro (define_p, "__QUAD_MEMORY__");
+ if ((flags & OPTION_MASK_QUAD_MEMORY_ATOMIC) != 0)
+ rs6000_define_or_undefine_macro (define_p, "__QUAD_MEMORY_ATOMIC__");
if ((flags & OPTION_MASK_CRYPTO) != 0)
rs6000_define_or_undefine_macro (define_p, "__CRYPTO__");
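
With the two macros above, user code can detect the quad word support at
preprocessing time. A minimal sketch (the macro names come from this hunk;
the typedef fallback is purely illustrative):

    /* Prefer a 128-bit counter when the compiler advertises the
       quad word atomic instructions (lqarx/stqcx.).  */
    #ifdef __QUAD_MEMORY_ATOMIC__
    typedef __int128_t counter_t;
    #else
    typedef long long counter_t;   /* fall back to 64-bit atomics.  */
    #endif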
===================================================================
@@ -53,7 +53,8 @@
| OPTION_MASK_CRYPTO \
| OPTION_MASK_DIRECT_MOVE \
| OPTION_MASK_HTM \
- | OPTION_MASK_QUAD_MEMORY)
+ | OPTION_MASK_QUAD_MEMORY \
+ | OPTION_MASK_QUAD_MEMORY_ATOMIC)
#define POWERPC_7400_MASK (OPTION_MASK_PPC_GFXOPT | OPTION_MASK_ALTIVEC)
===================================================================
@@ -3317,14 +3317,37 @@ rs6000_option_override_internal (bool gl
-  /* The quad memory instructions only works in 64-bit mode. In 32-bit mode,
-     silently turn off quad memory mode.  */
-  if (TARGET_QUAD_MEMORY && !TARGET_POWERPC64)
+  /* The quad memory instructions only work in 64-bit mode.  In 32-bit mode,
+     silently turn off quad memory mode.  */
+  if ((TARGET_QUAD_MEMORY || TARGET_QUAD_MEMORY_ATOMIC) && !TARGET_POWERPC64)
{
if ((rs6000_isa_flags_explicit & OPTION_MASK_QUAD_MEMORY) != 0)
warning (0, N_("-mquad-memory requires 64-bit mode"));
+ if ((rs6000_isa_flags_explicit & OPTION_MASK_QUAD_MEMORY_ATOMIC) != 0)
+ warning (0, N_("-mquad-memory-atomic requires 64-bit mode"));
+
+ rs6000_isa_flags &= ~(OPTION_MASK_QUAD_MEMORY
+ | OPTION_MASK_QUAD_MEMORY_ATOMIC);
+ }
+
+  /* The non-atomic quad memory loads/stores are disabled for little endian,
+     since the words are reversed, but atomic operations can still be done
+     by swapping the words.  */
+ if (TARGET_QUAD_MEMORY && !WORDS_BIG_ENDIAN)
+ {
+ if ((rs6000_isa_flags_explicit & OPTION_MASK_QUAD_MEMORY) != 0)
+ warning (0, N_("-mquad-memory is not available in little endian mode"));
+
rs6000_isa_flags &= ~OPTION_MASK_QUAD_MEMORY;
}
+  /* Assume that if the user asked for normal quad memory instructions, they
+     want the atomic versions as well, unless they explicitly told us not to
+     use quad word atomic instructions.  */
+ if (TARGET_QUAD_MEMORY
+ && !TARGET_QUAD_MEMORY_ATOMIC
+ && ((rs6000_isa_flags_explicit & OPTION_MASK_QUAD_MEMORY_ATOMIC) == 0))
+ rs6000_isa_flags |= OPTION_MASK_QUAD_MEMORY_ATOMIC;
+
/* Enable power8 fusion if we are tuning for power8, even if we aren't
generating power8 instructions. */
if (!(rs6000_isa_flags_explicit & OPTION_MASK_P8_FUSION))
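
The net effect of the blocks above: in 64-bit big endian mode, -mquad-memory
implies -mquad-memory-atomic unless the atomic form was explicitly disabled,
while 32-bit and little endian targets silently lose the non-atomic form. A
quick sanity check of the implication (a sketch relying on the
__QUAD_MEMORY__ and __QUAD_MEMORY_ATOMIC__ macros added in the
rs6000_target_modify_macros hunk above; the command line is only a
suggestion):

    /* Compile with: gcc -m64 -mcpu=power8 -mquad-memory -S check.c  */
    #if defined(__QUAD_MEMORY__) && !defined(__QUAD_MEMORY_ATOMIC__)
    #error "-mquad-memory should have implied -mquad-memory-atomic"
    #endif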
@@ -5875,7 +5898,8 @@ direct_move_p (rtx op0, rtx op1)
return false;
}
-/* Return true if this is a load or store quad operation. */
+/* Return true if this is a load or store quad operation. This function does
+ not handle the atomic quad memory instructions. */
bool
quad_load_store_p (rtx op0, rtx op1)
@@ -30675,6 +30699,7 @@ static struct rs6000_opt_mask const rs60
{ "powerpc-gfxopt", OPTION_MASK_PPC_GFXOPT, false, true },
{ "powerpc-gpopt", OPTION_MASK_PPC_GPOPT, false, true },
{ "quad-memory", OPTION_MASK_QUAD_MEMORY, false, true },
+ { "quad-memory-atomic", OPTION_MASK_QUAD_MEMORY_ATOMIC, false, true },
{ "recip-precision", OPTION_MASK_RECIP_PRECISION, false, true },
{ "string", OPTION_MASK_STRING, false, true },
{ "update", OPTION_MASK_NO_UPDATE, true , true },
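
Because the new mask is listed in the rs6000_opt_mask table, the feature
should also be controllable per function through the target attribute and
#pragma GCC target. A hedged sketch (the function name and body are
illustrative; assumes a 64-bit power8 target):

    /* Enable the quad word atomic instructions for just this function.  */
    __attribute__((target ("quad-memory-atomic")))
    __int128_t
    fetch_add_128 (__int128_t *p, __int128_t v)
    {
      return __atomic_fetch_add (p, v, __ATOMIC_SEQ_CST);
    }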
===================================================================
@@ -524,8 +524,11 @@ extern int rs6000_vector_align[];
/* Byte/char syncs were added as phased in for ISA 2.06B, but are not present
in power7, so conditionalize them on p8 features. TImode syncs need quad
memory support. */
-#define TARGET_SYNC_HI_QI (TARGET_QUAD_MEMORY || TARGET_DIRECT_MOVE)
-#define TARGET_SYNC_TI TARGET_QUAD_MEMORY
+#define TARGET_SYNC_HI_QI (TARGET_QUAD_MEMORY \
+ || TARGET_QUAD_MEMORY_ATOMIC \
+ || TARGET_DIRECT_MOVE)
+
+#define TARGET_SYNC_TI TARGET_QUAD_MEMORY_ATOMIC
/* Power7 has both 32-bit load and store integer for the FPRs, so we don't need
to allocate the SDmode stack slot to get the value into the proper location
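
TARGET_SYNC_TI gates whether the 16-byte __atomic built-ins expand inline to
lqarx/stqcx. sequences instead of falling back to library calls. A minimal
sketch of code that depends on it (assumes a 64-bit power8 target with quad
word atomic support; the function name is illustrative):

    #include <stdbool.h>

    /* 128-bit compare-and-swap; expands to a lqarx/stqcx. loop when
       TARGET_SYNC_TI holds.  */
    bool
    cas_128 (__int128_t *p, __int128_t *expected, __int128_t desired)
    {
      return __atomic_compare_exchange_n (p, expected, desired, false,
                                          __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
    }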
===================================================================
@@ -556,7 +556,11 @@ Use ISA 2.07 transactional memory (HTM)
mquad-memory
Target Report Mask(QUAD_MEMORY) Var(rs6000_isa_flags)
-Generate the quad word memory instructions (lq/stq/lqarx/stqcx).
+Generate the quad word memory instructions (lq/stq).
+
+mquad-memory-atomic
+Target Report Mask(QUAD_MEMORY_ATOMIC) Var(rs6000_isa_flags)
+Generate the quad word memory atomic instructions (lqarx/stqcx).
mcompat-align-parm
Target Report Var(rs6000_compat_align_parm) Init(1) Save
===================================================================
@@ -204,25 +204,46 @@
"<QHI:larx> %0,%y1"
[(set_attr "type" "load_l")])
-;; Use PTImode to get even/odd register pairs
+;; Use PTImode to get even/odd register pairs.
+;; Use a temporary register to force getting an even register for the
+;; lqarx/stqcx. instructions.  Normal optimizations will eliminate this extra
+;; copy on big endian systems.
+
+;; On little endian systems, where the non-atomic quad word load/store
+;; instructions are not used, the address can be register+offset, so make
+;; sure the address is indexed or indirect before register allocation.
+
(define_expand "load_lockedti"
[(use (match_operand:TI 0 "quad_int_reg_operand" ""))
(use (match_operand:TI 1 "memory_operand" ""))]
"TARGET_SYNC_TI"
{
- /* Use a temporary register to force getting an even register for the
- lqarx/stqcrx. instructions. Normal optimizations will eliminate this
- extra copy. */
+ rtx op0 = operands[0];
+ rtx op1 = operands[1];
rtx pti = gen_reg_rtx (PTImode);
- emit_insn (gen_load_lockedpti (pti, operands[1]));
- emit_move_insn (operands[0], gen_lowpart (TImode, pti));
+
+ if (!indexed_or_indirect_operand (op1, TImode))
+ {
+ rtx old_addr = XEXP (op1, 0);
+ rtx new_addr = force_reg (Pmode, old_addr);
+ operands[1] = op1 = change_address (op1, TImode, new_addr);
+ }
+
+ emit_insn (gen_load_lockedpti (pti, op1));
+ if (WORDS_BIG_ENDIAN)
+ emit_move_insn (op0, gen_lowpart (TImode, pti));
+ else
+ {
+ emit_move_insn (gen_lowpart (DImode, op0), gen_highpart (DImode, pti));
+ emit_move_insn (gen_highpart (DImode, op0), gen_lowpart (DImode, pti));
+ }
DONE;
})
(define_insn "load_lockedpti"
[(set (match_operand:PTI 0 "quad_int_reg_operand" "=&r")
(unspec_volatile:PTI
- [(match_operand:TI 1 "memory_operand" "Z")] UNSPECV_LL))]
+ [(match_operand:TI 1 "indexed_or_indirect_operand" "Z")] UNSPECV_LL))]
"TARGET_SYNC_TI
&& !reg_mentioned_p (operands[0], operands[1])
&& quad_int_reg_operand (operands[0], PTImode)"
@@ -238,6 +259,14 @@
"<stcx> %2,%y1"
[(set_attr "type" "store_c")])
+;; Use a temporary register to force getting an even register for the
+;; lqarx/stqcx. instructions.  Normal optimizations will eliminate this extra
+;; copy on big endian systems.
+
+;; On little endian systems, where the non-atomic quad word load/store
+;; instructions are not used, the address can be register+offset, so make
+;; sure the address is indexed or indirect before register allocation.
+
(define_expand "store_conditionalti"
[(use (match_operand:CC 0 "cc_reg_operand" ""))
(use (match_operand:TI 1 "memory_operand" ""))
@@ -247,21 +276,36 @@
rtx op0 = operands[0];
rtx op1 = operands[1];
rtx op2 = operands[2];
- rtx pti_op1 = change_address (op1, PTImode, XEXP (op1, 0));
- rtx pti_op2 = gen_reg_rtx (PTImode);
+ rtx addr = XEXP (op1, 0);
+ rtx pti_mem;
+ rtx pti_reg;
+
+ if (!indexed_or_indirect_operand (op1, TImode))
+ {
+ rtx new_addr = force_reg (Pmode, addr);
+ operands[1] = op1 = change_address (op1, TImode, new_addr);
+ addr = new_addr;
+ }
+
+ pti_mem = change_address (op1, PTImode, addr);
+ pti_reg = gen_reg_rtx (PTImode);
+
+ if (WORDS_BIG_ENDIAN)
+ emit_move_insn (pti_reg, gen_lowpart (PTImode, op2));
+ else
+ {
+ emit_move_insn (gen_lowpart (DImode, pti_reg), gen_highpart (DImode, op2));
+ emit_move_insn (gen_highpart (DImode, pti_reg), gen_lowpart (DImode, op2));
+ }
- /* Use a temporary register to force getting an even register for the
- lqarx/stqcrx. instructions. Normal optimizations will eliminate this
- extra copy. */
- emit_move_insn (pti_op2, gen_lowpart (PTImode, op2));
- emit_insn (gen_store_conditionalpti (op0, pti_op1, pti_op2));
+ emit_insn (gen_store_conditionalpti (op0, pti_mem, pti_reg));
DONE;
})
(define_insn "store_conditionalpti"
[(set (match_operand:CC 0 "cc_reg_operand" "=x")
(unspec_volatile:CC [(const_int 0)] UNSPECV_SC))
- (set (match_operand:PTI 1 "memory_operand" "=Z")
+ (set (match_operand:PTI 1 "indexed_or_indirect_operand" "=Z")
(match_operand:PTI 2 "quad_int_reg_operand" "r"))]
"TARGET_SYNC_TI && quad_int_reg_operand (operands[2], PTImode)"
"stqcx. %2,%y1"
===================================================================
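The load_lockedti and store_conditionalti expanders above form the TImode
load-locked/store-conditional pair used by the generic atomic expansion, so
any 16-byte atomic read-modify-write exercises them, including the word swap
on little endian. A small usage sketch (assumes a 64-bit power8 target; the
function name is illustrative):

    /* 128-bit atomic exchange; expands to a load_lockedti /
       store_conditionalti retry loop.  */
    __int128_t
    xchg_128 (__int128_t *p, __int128_t v)
    {
      return __atomic_exchange_n (p, v, __ATOMIC_ACQ_REL);
    }
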
@@ -858,7 +858,9 @@ See RS/6000 and PowerPC Options.
-msave-toc-indirect -mno-save-toc-indirect @gol
-mpower8-fusion -mno-mpower8-fusion -mpower8-vector -mno-power8-vector @gol
-mcrypto -mno-crypto -mdirect-move -mno-direct-move @gol
--mquad-memory -mno-quad-memory}
+-mquad-memory -mno-quad-memory @gol
+-mquad-memory-atomic -mno-quad-memory-atomic @gol
+-mcompat-align-parm -mno-compat-align-parm}
@emph{RX Options}
@gccoptlist{-m64bit-doubles -m32bit-doubles -fpu -nofpu@gol
@@ -17243,8 +17245,8 @@ following options:
-mpopcntb -mpopcntd -mpowerpc64 @gol
-mpowerpc-gpopt -mpowerpc-gfxopt -msingle-float -mdouble-float @gol
-msimple-fpu -mstring -mmulhw -mdlmzb -mmfpgpr -mvsx @gol
--mcrypto -mdirect-move -mpower8-fusion -mpower8-vector -mquad-memory @gol
--mcompat-align-parm -mno-compat-align-parm}
+-mcrypto -mdirect-move -mpower8-fusion -mpower8-vector @gol
+-mquad-memory -mquad-memory-atomic}
The particular options set for any particular CPU varies between
compiler versions, depending on what setting seems to produce optimal
@@ -17399,10 +17401,18 @@ the vector instructions.
@itemx -mno-quad-memory
@opindex mquad-memory
@opindex mno-quad-memory
-Generate code that uses (does not use) the quad word memory
+Generate code that uses (does not use) the non-atomic quad word memory
instructions. The @option{-mquad-memory} option requires use of
64-bit mode.
+@item -mquad-memory-atomic
+@itemx -mno-quad-memory-atomic
+@opindex mquad-memory-atomic
+@opindex mno-quad-memory-atomic
+Generate code that uses (does not use) the atomic quad word memory
+instructions. The @option{-mquad-memory-atomic} option requires use of
+64-bit mode.
+
@item -mfloat-gprs=@var{yes/single/double/no}
@itemx -mfloat-gprs
@opindex mfloat-gprs
===================================================================
@@ -0,0 +1,67 @@
+/* { dg-do run { target { powerpc*-*-linux* && lp64 } } } */
+/* { dg-skip-if "" { powerpc*-*-darwin* } { "*" } { "" } } */
+/* { dg-skip-if "" { powerpc*-*-*spe* } { "*" } { "" } } */
+/* { dg-require-effective-target p8vector_hw } */
+/* { dg-options "-mcpu=power8 -O2" } */
+
+/* Test whether we get the right bits for quad word atomic instructions. */
+#include <stdlib.h>
+
+static __int128_t quad_fetch_and (__int128_t *, __int128_t value) __attribute__((__noinline__));
+static __int128_t quad_fetch_or (__int128_t *, __int128_t value) __attribute__((__noinline__));
+static __int128_t quad_fetch_add (__int128_t *, __int128_t value) __attribute__((__noinline__));
+
+static __int128_t
+quad_fetch_and (__int128_t *ptr, __int128_t value)
+{
+ return __atomic_fetch_and (ptr, value, __ATOMIC_ACQUIRE);
+}
+
+static __int128_t
+quad_fetch_or (__int128_t *ptr, __int128_t value)
+{
+ return __atomic_fetch_or (ptr, value, __ATOMIC_ACQUIRE);
+}
+
+static __int128_t
+quad_fetch_add (__int128_t *ptr, __int128_t value)
+{
+ return __atomic_fetch_add (ptr, value, __ATOMIC_ACQUIRE);
+}
+
+int
+main (void)
+{
+ __int128_t result;
+ __int128_t value;
+ __int128_t and_input = ((((__int128_t) 0x1234567890abcdefULL) << 64) | ((__int128_t) 0xfedcba0987654321ULL));
+ __int128_t and_value = ((((__int128_t) 0xfffffffffffffff0ULL) << 64) | ((__int128_t) 0xfffffffffffffff0ULL));
+ __int128_t and_exp = ((((__int128_t) 0x1234567890abcde0ULL) << 64) | ((__int128_t) 0xfedcba0987654320ULL));
+
+ __int128_t or_input = ((((__int128_t) 0x1234567890abcdefULL) << 64) | ((__int128_t) 0xfedcba0987654321ULL));
+ __int128_t or_value = ((((__int128_t) 0x0000000000000010ULL) << 64) | ((__int128_t) 0x000000000000000eULL));
+ __int128_t or_exp = ((((__int128_t) 0x1234567890abcdffULL) << 64) | ((__int128_t) 0xfedcba098765432fULL));
+
+ __int128_t add_input = ((((__int128_t) 0x1234567890abcdefULL) << 64) | ((__int128_t) 0xfedcba0987654321ULL));
+ __int128_t add_value = ((((__int128_t) 0x0000000001000000ULL) << 64) | ((__int128_t) 0x0000001000000000ULL));
+ __int128_t add_exp = ((((__int128_t) 0x1234567891abcdefULL) << 64) | ((__int128_t) 0xfedcba1987654321ULL));
+
+
+ value = and_input;
+ result = quad_fetch_and (&value, and_value);
+ if (result != and_input || value != and_exp)
+ abort ();
+
+ value = or_input;
+ result = quad_fetch_or (&value, or_value);
+ if (result != or_input || value != or_exp)
+ abort ();
+
+ value = add_input;
+ result = quad_fetch_add (&value, add_value);
+ if (result != add_input || value != add_exp)
+ abort ();
+
+ return 0;
+}
+