Message ID | 1427383399.2939.25.camel@gnopaine |
---|---|
State | New |
Headers | show |
Oops. Fixed post title. On Thu, 2015-03-26 at 10:23 -0500, Bill Schmidt wrote: > Hi, > > This is a follow-up to > https://gcc.gnu.org/ml/gcc-patches/2015-03/msg01310.html, which > backported the POWER-specific little-endian swap optimization pass to > the 4.9 branch. We also need to backport this to the 4.8 branch. This > patch does that. > > The patch is very similar to the 4.9 backport, except for two things. > First, the passes infrastructure changed quite a bit between 4.8 and > 4.9, so the code to describe the new pass to the pass manager is > somewhat different. Second, I've omitted three of the test cases, which > happen to fail on 4.8 for unrelated reasons. (We run out of volatile > registers and end up saving non-volatiles to the stack in the prologue, > which generates load/swap sequences for now.) > > Tested on powerpc64le-unknown-linux-gnu with no regressions. Is this OK > for 4.8? > > Thanks, > Bill > > > [gcc] > > 2015-03-26 Bill Schmidt <wschmidt@linux.vnet.ibm.com> > > Backport of r214242, r214254, and bug fix patches from mainline > * config/rs6000/rs6000.c (tree-pass.h): New #include. > (rs6000_analyze_swaps): New declaration. > (gate_analyze_swaps): New function. > (execute_analyze_swaps): Likewise. > (pass_analyze_swaps): New struct rtl_opt_pass. > (rs6000_option_override): Register swap-optimization pass. > (swap_web_entry): New class. > (special_handling_values): New enum. > (union_defs): New function. > (union_uses): Likewise. > (insn_is_load_p): Likewise. > (insn_is_store_p): Likewise. > (insn_is_swap_p): Likewise. > (rtx_is_swappable_p): Likewise. > (insn_is_swappable_p): Likewise. > (chain_purpose): New enum. > (chain_contains_only_swaps): New function. > (mark_swaps_for_removal): Likewise. > (swap_const_vector_halves): Likewise. > (adjust_subreg_index): Likewise. > (permute_load): Likewise. > (permute_store): Likewise. > (adjust_extract): Likewise. > (adjust_splat): Likewise. > (handle_special_swappables): Likewise. 
> (replace_swap_with_copy): Likewise. > (dump_swap_insn_table): Likewise. > (rs6000_analyze_swaps): Likewise. > * config/rs6000/rs6000.opt (moptimize-swaps): New option. > * df.h (web_entry_base): New class, replacing struct web_entry. > (web_entry_base::pred): New method. > (web_entry_base::set_pred): Likewise. > (web_entry_base::unionfind_root): Likewise. > (web_entry_base::unionfind_union): Likewise. > (unionfind_root): Delete external reference. > (unionfind_union): Likewise. > (union_defs): Likewise. > * web.c (web_entry_base::unionfind_root): Convert to method. > (web_entry_base::unionfind_union): Likewise. > (web_entry): New class. > (union_match_dups): Convert to use class structure. > (union_defs): Likewise. > (entry_register): Likewise. > (web_main): Likewise. > > > [gcc/testsuite] > > 2015-03-26 Bill Schmidt <wschmidt@linux.vnet.ibm.com> > > Backport r214254 and related tests from mainline > * gcc.target/powerpc/swaps-p8-1.c: New test. > * gcc.target/powerpc/swaps-p8-3.c: New test. > * gcc.target/powerpc/swaps-p8-4.c: New test. > * gcc.target/powerpc/swaps-p8-5.c: New test. > * gcc.target/powerpc/swaps-p8-6.c: New test. > * gcc.target/powerpc/swaps-p8-7.c: New test. > * gcc.target/powerpc/swaps-p8-8.c: New test. > * gcc.target/powerpc/swaps-p8-9.c: New test. > * gcc.target/powerpc/swaps-p8-10.c: New test. > * gcc.target/powerpc/swaps-p8-11.c: New test. > * gcc.target/powerpc/swaps-p8-12.c: New test. > * gcc.target/powerpc/swaps-p8-13.c: New test. > * gcc.target/powerpc/swaps-p8-15.c: New test. > * gcc.target/powerpc/swaps-p8-17.c: New test. 
> > > Index: gcc/config/rs6000/rs6000.c > =================================================================== > --- gcc/config/rs6000/rs6000.c (revision 221696) > +++ gcc/config/rs6000/rs6000.c (working copy) > @@ -61,6 +61,7 @@ > #include "tree-vectorizer.h" > #include "dumpfile.h" > #include "real.h" > +#include "tree-pass.h" > #if TARGET_XCOFF > #include "xcoffout.h" /* get declarations of xcoff_*_section_name */ > #endif > @@ -1153,6 +1154,7 @@ static bool rs6000_secondary_reload_move (enum rs6 > enum machine_mode, > secondary_reload_info *, > bool); > +static unsigned int rs6000_analyze_swaps (function *); > > /* Hash table stuff for keeping track of TOC entries. */ > > @@ -4046,6 +4048,37 @@ rs6000_option_override_internal (bool global_init_ > return ret; > } > > +static bool > +gate_analyze_swaps (void) > +{ > + return (optimize > 0 && !BYTES_BIG_ENDIAN && TARGET_VSX > + && rs6000_optimize_swaps); > +} > + > +static unsigned int > +execute_analyze_swaps (void) > +{ > + return rs6000_analyze_swaps (cfun); > +} > + > +struct rtl_opt_pass pass_analyze_swaps = > +{ > + RTL_PASS, > + "swaps", /* name */ > + OPTGROUP_NONE, /* optinfo_flags */ > + gate_analyze_swaps, /* has_gate */ > + execute_analyze_swaps, /* has_execute */ > + NULL, /* sub */ > + NULL, /* next */ > + 0, /* static_pass_number */ > + TV_NONE, /* tv_id */ > + 0, /* properties_required */ > + 0, /* properties_provided */ > + 0, /* properties_destroyed */ > + 0, /* todo_flags_start */ > + TODO_df_finish, /* todo_flags_finish */ > +}; > + > /* Implement TARGET_OPTION_OVERRIDE. On the RS/6000 this is used to > define the target cpu type. */ > > @@ -4053,6 +4086,13 @@ static void > rs6000_option_override (void) > { > (void) rs6000_option_override_internal (true); > + > + /* Register machine-specific passes. This needs to be done at start-up. > + It's convenient to do it here (like i386 does). 
*/ > + static struct register_pass_info analyze_swaps_info > + = { &pass_analyze_swaps.pass, "cse1", 1, PASS_POS_INSERT_BEFORE }; > + > + register_pass (&analyze_swaps_info); > } > > > @@ -33210,7 +33250,1148 @@ emit_fusion_gpr_load (rtx target, rtx mem) > > return ""; > } > + > +/* Analyze vector computations and remove unnecessary doubleword > + swaps (xxswapdi instructions). This pass is performed only > + for little-endian VSX code generation. > > + For this specific case, loads and stores of 4x32 and 2x64 vectors > + are inefficient. These are implemented using the lvxd2x and > + stvxd2x instructions, which invert the order of doublewords in > + a vector register. Thus the code generation inserts an xxswapdi > + after each such load, and prior to each such store. (For spill > + code after register assignment, an additional xxswapdi is inserted > + following each store in order to return a hard register to its > + unpermuted value.) > + > + The extra xxswapdi instructions reduce performance. This can be > + particularly bad for vectorized code. The purpose of this pass > + is to reduce the number of xxswapdi instructions required for > + correctness. > + > + The primary insight is that much code that operates on vectors > + does not care about the relative order of elements in a register, > + so long as the correct memory order is preserved. If we have > + a computation where all input values are provided by lvxd2x/xxswapdi > + sequences, all outputs are stored using xxswapdi/stvxd2x sequences, > + and all intermediate computations are pure SIMD (independent of > + element order), then all the xxswapdi's associated with the loads > + and stores may be removed. > + > + This pass uses some of the infrastructure and logical ideas from > + the "web" pass in web.c. We create maximal webs of computations > + fitting the description above using union-find. Each such web is > + then optimized by removing its unnecessary xxswapdi instructions. 
> + > + The pass is placed prior to global optimization so that we can > + perform the optimization in the safest and simplest way possible; > + that is, by replacing each xxswapdi insn with a register copy insn. > + Subsequent forward propagation will remove copies where possible. > + > + There are some operations sensitive to element order for which we > + can still allow the operation, provided we modify those operations. > + These include CONST_VECTORs, for which we must swap the first and > + second halves of the constant vector; and SUBREGs, for which we > + must adjust the byte offset to account for the swapped doublewords. > + A remaining opportunity would be non-immediate-form splats, for > + which we should adjust the selected lane of the input. We should > + also make code generation adjustments for sum-across operations, > + since this is a common vectorizer reduction. > + > + Because we run prior to the first split, we can see loads and stores > + here that match *vsx_le_perm_{load,store}_<mode>. These are vanilla > + vector loads and stores that have not yet been split into a permuting > + load/store and a swap. (One way this can happen is with a builtin > + call to vec_vsx_{ld,st}.) We can handle these as well, but rather > + than deleting a swap, we convert the load/store into a permuting > + load/store (which effectively removes the swap). */ > + > +/* Notes on Permutes > + > + We do not currently handle computations that contain permutes. There > + is a general transformation that can be performed correctly, but it > + may introduce more expensive code than it replaces. To handle these > + would require a cost model to determine when to perform the optimization. > + This commentary records how this could be done if desired. > + > + The most general permute is something like this (example for V16QI): > + > + (vec_select:V16QI (vec_concat:V32QI (op1:V16QI) (op2:V16QI)) > + (parallel [(const_int a0) (const_int a1) > + ... 
> + (const_int a14) (const_int a15)])) > + > + where a0,...,a15 are in [0,31] and select elements from op1 and op2 > + to produce the result. > + > + Regardless of mode, we can convert the PARALLEL to a mask of 16 > + byte-element selectors. Let's call this M, with M[i] representing > + the ith byte-element selector value. Then if we swap doublewords > + throughout the computation, we can get correct behavior by replacing > + M with M' as follows: > + > + { M[i+8]+8 : i < 8, M[i+8] in [0,7] U [16,23] > + M'[i] = { M[i+8]-8 : i < 8, M[i+8] in [8,15] U [24,31] > + { M[i-8]+8 : i >= 8, M[i-8] in [0,7] U [16,23] > + { M[i-8]-8 : i >= 8, M[i-8] in [8,15] U [24,31] > + > + This seems promising at first, since we are just replacing one mask > + with another. But certain masks are preferable to others. If M > + is a mask that matches a vmrghh pattern, for example, M' certainly > + will not. Instead of a single vmrghh, we would generate a load of > + M' and a vperm. So we would need to know how many xxswapd's we can > + remove as a result of this transformation to determine if it's > + profitable; and preferably the logic would need to be aware of all > + the special preferable masks. > + > + Another form of permute is an UNSPEC_VPERM, in which the mask is > + already in a register. In some cases, this mask may be a constant > + that we can discover with ud-chains, in which case the above > + transformation is ok. However, the common usage here is for the > + mask to be produced by an UNSPEC_LVSL, in which case the mask > + cannot be known at compile time. In such a case we would have to > + generate several instructions to compute M' as above at run time, > + and a cost model is needed again. */ > + > +/* This is based on the union-find logic in web.c. web_entry_base is > + defined in df.h. */ > +class swap_web_entry : public web_entry_base > +{ > + public: > + /* Pointer to the insn. */ > + rtx insn; > + /* Set if insn contains a mention of a vector register. 
All other > + fields are undefined if this field is unset. */ > + unsigned int is_relevant : 1; > + /* Set if insn is a load. */ > + unsigned int is_load : 1; > + /* Set if insn is a store. */ > + unsigned int is_store : 1; > + /* Set if insn is a doubleword swap. This can either be a register swap > + or a permuting load or store (test is_load and is_store for this). */ > + unsigned int is_swap : 1; > + /* Set if the insn has a live-in use of a parameter register. */ > + unsigned int is_live_in : 1; > + /* Set if the insn has a live-out def of a return register. */ > + unsigned int is_live_out : 1; > + /* Set if the insn contains a subreg reference of a vector register. */ > + unsigned int contains_subreg : 1; > + /* Set if the insn contains a 128-bit integer operand. */ > + unsigned int is_128_int : 1; > + /* Set if this is a call-insn. */ > + unsigned int is_call : 1; > + /* Set if this insn does not perform a vector operation for which > + element order matters, or if we know how to fix it up if it does. > + Undefined if is_swap is set. */ > + unsigned int is_swappable : 1; > + /* A nonzero value indicates what kind of special handling for this > + insn is required if doublewords are swapped. Undefined if > + is_swappable is not set. */ > + unsigned int special_handling : 3; > + /* Set if the web represented by this entry cannot be optimized. */ > + unsigned int web_not_optimizable : 1; > + /* Set if this insn should be deleted. */ > + unsigned int will_delete : 1; > +}; > + > +enum special_handling_values { > + SH_NONE = 0, > + SH_CONST_VECTOR, > + SH_SUBREG, > + SH_NOSWAP_LD, > + SH_NOSWAP_ST, > + SH_EXTRACT, > + SH_SPLAT > +}; > + > +/* Union INSN with all insns containing definitions that reach USE. > + Detect whether USE is live-in to the current function. 
*/ > +static void > +union_defs (swap_web_entry *insn_entry, rtx insn, df_ref use) > +{ > + struct df_link *link = DF_REF_CHAIN (use); > + > + if (!link) > + insn_entry[INSN_UID (insn)].is_live_in = 1; > + > + while (link) > + { > + if (DF_REF_IS_ARTIFICIAL (link->ref)) > + insn_entry[INSN_UID (insn)].is_live_in = 1; > + > + if (DF_REF_INSN_INFO (link->ref)) > + { > + rtx def_insn = DF_REF_INSN (link->ref); > + (void)unionfind_union (insn_entry + INSN_UID (insn), > + insn_entry + INSN_UID (def_insn)); > + } > + > + link = link->next; > + } > +} > + > +/* Union INSN with all insns containing uses reached from DEF. > + Detect whether DEF is live-out from the current function. */ > +static void > +union_uses (swap_web_entry *insn_entry, rtx insn, df_ref def) > +{ > + struct df_link *link = DF_REF_CHAIN (def); > + > + if (!link) > + insn_entry[INSN_UID (insn)].is_live_out = 1; > + > + while (link) > + { > + /* This could be an eh use or some other artificial use; > + we treat these all the same (killing the optimization). */ > + if (DF_REF_IS_ARTIFICIAL (link->ref)) > + insn_entry[INSN_UID (insn)].is_live_out = 1; > + > + if (DF_REF_INSN_INFO (link->ref)) > + { > + rtx use_insn = DF_REF_INSN (link->ref); > + (void)unionfind_union (insn_entry + INSN_UID (insn), > + insn_entry + INSN_UID (use_insn)); > + } > + > + link = link->next; > + } > +} > + > +/* Return 1 iff INSN is a load insn, including permuting loads that > + represent an lvxd2x instruction; else return 0. 
*/ > +static unsigned int > +insn_is_load_p (rtx insn) > +{ > + rtx body = PATTERN (insn); > + > + if (GET_CODE (body) == SET) > + { > + if (GET_CODE (SET_SRC (body)) == MEM) > + return 1; > + > + if (GET_CODE (SET_SRC (body)) == VEC_SELECT > + && GET_CODE (XEXP (SET_SRC (body), 0)) == MEM) > + return 1; > + > + return 0; > + } > + > + if (GET_CODE (body) != PARALLEL) > + return 0; > + > + rtx set = XVECEXP (body, 0, 0); > + > + if (GET_CODE (set) == SET && GET_CODE (SET_SRC (set)) == MEM) > + return 1; > + > + return 0; > +} > + > +/* Return 1 iff INSN is a store insn, including permuting stores that > + represent an stvxd2x instruction; else return 0. */ > +static unsigned int > +insn_is_store_p (rtx insn) > +{ > + rtx body = PATTERN (insn); > + if (GET_CODE (body) == SET && GET_CODE (SET_DEST (body)) == MEM) > + return 1; > + if (GET_CODE (body) != PARALLEL) > + return 0; > + rtx set = XVECEXP (body, 0, 0); > + if (GET_CODE (set) == SET && GET_CODE (SET_DEST (set)) == MEM) > + return 1; > + return 0; > +} > + > +/* Return 1 iff INSN swaps doublewords. This may be a reg-reg swap, > + a permuting load, or a permuting store. 
*/ > +static unsigned int > +insn_is_swap_p (rtx insn) > +{ > + rtx body = PATTERN (insn); > + if (GET_CODE (body) != SET) > + return 0; > + rtx rhs = SET_SRC (body); > + if (GET_CODE (rhs) != VEC_SELECT) > + return 0; > + rtx parallel = XEXP (rhs, 1); > + if (GET_CODE (parallel) != PARALLEL) > + return 0; > + unsigned int len = XVECLEN (parallel, 0); > + if (len != 2 && len != 4 && len != 8 && len != 16) > + return 0; > + for (unsigned int i = 0; i < len / 2; ++i) > + { > + rtx op = XVECEXP (parallel, 0, i); > + if (GET_CODE (op) != CONST_INT || INTVAL (op) != len / 2 + i) > + return 0; > + } > + for (unsigned int i = len / 2; i < len; ++i) > + { > + rtx op = XVECEXP (parallel, 0, i); > + if (GET_CODE (op) != CONST_INT || INTVAL (op) != i - len / 2) > + return 0; > + } > + return 1; > +} > + > +/* Return 1 iff OP is an operand that will not be affected by having > + vector doublewords swapped in memory. */ > +static unsigned int > +rtx_is_swappable_p (rtx op, unsigned int *special) > +{ > + enum rtx_code code = GET_CODE (op); > + int i, j; > + rtx parallel; > + > + switch (code) > + { > + case LABEL_REF: > + case SYMBOL_REF: > + case CLOBBER: > + case REG: > + return 1; > + > + case VEC_CONCAT: > + case ASM_INPUT: > + case ASM_OPERANDS: > + return 0; > + > + case CONST_VECTOR: > + { > + *special = SH_CONST_VECTOR; > + return 1; > + } > + > + case VEC_DUPLICATE: > + /* Opportunity: If XEXP (op, 0) has the same mode as the result, > + and XEXP (op, 1) is a PARALLEL with a single QImode const int, > + it represents a vector splat for which we can do special > + handling. */ > + if (GET_CODE (XEXP (op, 0)) == CONST_INT) > + return 1; > + else if (GET_CODE (XEXP (op, 0)) == REG > + && GET_MODE_INNER (GET_MODE (op)) == GET_MODE (XEXP (op, 0))) > + /* This catches V2DF and V2DI splat, at a minimum. 
*/ > + return 1; > + else if (GET_CODE (XEXP (op, 0)) == VEC_SELECT) > + /* If the duplicated item is from a select, defer to the select > + processing to see if we can change the lane for the splat. */ > + return rtx_is_swappable_p (XEXP (op, 0), special); > + else > + return 0; > + > + case VEC_SELECT: > + /* A vec_extract operation is ok if we change the lane. */ > + if (GET_CODE (XEXP (op, 0)) == REG > + && GET_MODE_INNER (GET_MODE (XEXP (op, 0))) == GET_MODE (op) > + && GET_CODE ((parallel = XEXP (op, 1))) == PARALLEL > + && XVECLEN (parallel, 0) == 1 > + && GET_CODE (XVECEXP (parallel, 0, 0)) == CONST_INT) > + { > + *special = SH_EXTRACT; > + return 1; > + } > + else > + return 0; > + > + case UNSPEC: > + { > + /* Various operations are unsafe for this optimization, at least > + without significant additional work. Permutes are obviously > + problematic, as both the permute control vector and the ordering > + of the target values are invalidated by doubleword swapping. > + Vector pack and unpack modify the number of vector lanes. > + Merge-high/low will not operate correctly on swapped operands. > + Vector shifts across element boundaries are clearly uncool, > + as are vector select and concatenate operations. Vector > + sum-across instructions define one operand with a specific > + order-dependent element, so additional fixup code would be > + needed to make those work. Vector set and non-immediate-form > + vector splat are element-order sensitive. A few of these > + cases might be workable with special handling if required. 
*/ > + int val = XINT (op, 1); > + switch (val) > + { > + default: > + break; > + case UNSPEC_VMRGH_DIRECT: > + case UNSPEC_VMRGL_DIRECT: > + case UNSPEC_VPACK_SIGN_SIGN_SAT: > + case UNSPEC_VPACK_SIGN_UNS_SAT: > + case UNSPEC_VPACK_UNS_UNS_MOD: > + case UNSPEC_VPACK_UNS_UNS_MOD_DIRECT: > + case UNSPEC_VPACK_UNS_UNS_SAT: > + case UNSPEC_VPERM: > + case UNSPEC_VPERM_UNS: > + case UNSPEC_VPERMHI: > + case UNSPEC_VPERMSI: > + case UNSPEC_VPKPX: > + case UNSPEC_VSLDOI: > + case UNSPEC_VSLO: > + case UNSPEC_VSRO: > + case UNSPEC_VSUM2SWS: > + case UNSPEC_VSUM4S: > + case UNSPEC_VSUM4UBS: > + case UNSPEC_VSUMSWS: > + case UNSPEC_VSUMSWS_DIRECT: > + case UNSPEC_VSX_CONCAT: > + case UNSPEC_VSX_SET: > + case UNSPEC_VSX_SLDWI: > + case UNSPEC_VUNPACK_HI_SIGN: > + case UNSPEC_VUNPACK_HI_SIGN_DIRECT: > + case UNSPEC_VUNPACK_LO_SIGN: > + case UNSPEC_VUNPACK_LO_SIGN_DIRECT: > + case UNSPEC_VUPKHPX: > + case UNSPEC_VUPKHS_V4SF: > + case UNSPEC_VUPKHU_V4SF: > + case UNSPEC_VUPKLPX: > + case UNSPEC_VUPKLS_V4SF: > + case UNSPEC_VUPKLU_V4SF: > + /* The following could be handled as an idiom with XXSPLTW. > + These place a scalar in BE element zero, but the XXSPLTW > + will currently expect it in BE element 2 in a swapped > + region. When one of these feeds an XXSPLTW with no other > + defs/uses either way, we can avoid the lane change for > + XXSPLTW and things will be correct. TBD. */ > + case UNSPEC_VSX_CVDPSPN: > + case UNSPEC_VSX_CVSPDP: > + case UNSPEC_VSX_CVSPDPN: > + return 0; > + case UNSPEC_VSPLT_DIRECT: > + *special = SH_SPLAT; > + return 1; > + } > + } > + > + default: > + break; > + } > + > + const char *fmt = GET_RTX_FORMAT (code); > + int ok = 1; > + > + for (i = 0; i < GET_RTX_LENGTH (code); ++i) > + if (fmt[i] == 'e' || fmt[i] == 'u') > + { > + unsigned int special_op = SH_NONE; > + ok &= rtx_is_swappable_p (XEXP (op, i), &special_op); > + /* Ensure we never have two kinds of special handling > + for the same insn. 
*/ > + if (*special != SH_NONE && special_op != SH_NONE > + && *special != special_op) > + return 0; > + *special = special_op; > + } > + else if (fmt[i] == 'E') > + for (j = 0; j < XVECLEN (op, i); ++j) > + { > + unsigned int special_op = SH_NONE; > + ok &= rtx_is_swappable_p (XVECEXP (op, i, j), &special_op); > + /* Ensure we never have two kinds of special handling > + for the same insn. */ > + if (*special != SH_NONE && special_op != SH_NONE > + && *special != special_op) > + return 0; > + *special = special_op; > + } > + > + return ok; > +} > + > +/* Return 1 iff INSN is an operand that will not be affected by > + having vector doublewords swapped in memory (in which case > + *SPECIAL is unchanged), or that can be modified to be correct > + if vector doublewords are swapped in memory (in which case > + *SPECIAL is changed to a value indicating how). */ > +static unsigned int > +insn_is_swappable_p (swap_web_entry *insn_entry, rtx insn, > + unsigned int *special) > +{ > + /* Calls are always bad. */ > + if (GET_CODE (insn) == CALL_INSN) > + return 0; > + > + /* Loads and stores seen here are not permuting, but we can still > + fix them up by converting them to permuting ones. Exceptions: > + UNSPEC_LVE, UNSPEC_LVX, and UNSPEC_STVX, which have a PARALLEL > + body instead of a SET; and UNSPEC_STVE, which has an UNSPEC > + for the SET source. */ > + rtx body = PATTERN (insn); > + int i = INSN_UID (insn); > + > + if (insn_entry[i].is_load) > + { > + if (GET_CODE (body) == SET) > + { > + *special = SH_NOSWAP_LD; > + return 1; > + } > + else > + return 0; > + } > + > + if (insn_entry[i].is_store) > + { > + if (GET_CODE (body) == SET && GET_CODE (SET_SRC (body)) != UNSPEC) > + { > + *special = SH_NOSWAP_ST; > + return 1; > + } > + else > + return 0; > + } > + > + /* Otherwise check the operands for vector lane violations. 
*/ > + return rtx_is_swappable_p (body, special); > +} > + > +enum chain_purpose { FOR_LOADS, FOR_STORES }; > + > +/* Return true if the UD or DU chain headed by LINK is non-empty, > + and every entry on the chain references an insn that is a > + register swap. Furthermore, if PURPOSE is FOR_LOADS, each such > + register swap must have only permuting loads as reaching defs. > + If PURPOSE is FOR_STORES, each such register swap must have only > + register swaps or permuting stores as reached uses. */ > +static bool > +chain_contains_only_swaps (swap_web_entry *insn_entry, struct df_link *link, > + enum chain_purpose purpose) > +{ > + if (!link) > + return false; > + > + for (; link; link = link->next) > + { > + if (!VECTOR_MODE_P (GET_MODE (DF_REF_REG (link->ref)))) > + continue; > + > + if (DF_REF_IS_ARTIFICIAL (link->ref)) > + return false; > + > + rtx reached_insn = DF_REF_INSN (link->ref); > + unsigned uid = INSN_UID (reached_insn); > + > + if (!insn_entry[uid].is_swap || insn_entry[uid].is_load > + || insn_entry[uid].is_store) > + return false; > + > + if (purpose == FOR_LOADS) > + { > + df_ref *use_rec; > + for (use_rec = DF_INSN_UID_USES (uid); *use_rec; use_rec++) > + { > + df_ref use = *use_rec; > + struct df_link *swap_link = DF_REF_CHAIN (use); > + > + while (swap_link) > + { > + if (DF_REF_IS_ARTIFICIAL (link->ref)) > + return false; > + > + rtx swap_def_insn = DF_REF_INSN (swap_link->ref); > + unsigned uid2 = INSN_UID (swap_def_insn); > + > + /* Only permuting loads are allowed. 
*/ > + if (!insn_entry[uid2].is_swap || !insn_entry[uid2].is_load) > + return false; > + > + swap_link = swap_link->next; > + } > + } > + } > + else if (purpose == FOR_STORES) > + { > + df_ref *def_rec; > + for (def_rec = DF_INSN_UID_DEFS (uid); *def_rec; def_rec++) > + { > + df_ref def = *def_rec; > + struct df_link *swap_link = DF_REF_CHAIN (def); > + > + while (swap_link) > + { > + if (DF_REF_IS_ARTIFICIAL (link->ref)) > + return false; > + > + rtx swap_use_insn = DF_REF_INSN (swap_link->ref); > + unsigned uid2 = INSN_UID (swap_use_insn); > + > + /* Permuting stores or register swaps are allowed. */ > + if (!insn_entry[uid2].is_swap || insn_entry[uid2].is_load) > + return false; > + > + swap_link = swap_link->next; > + } > + } > + } > + } > + > + return true; > +} > + > +/* Mark the xxswapdi instructions associated with permuting loads and > + stores for removal. Note that we only flag them for deletion here, > + as there is a possibility of a swap being reached from multiple > + loads, etc. */ > +static void > +mark_swaps_for_removal (swap_web_entry *insn_entry, unsigned int i) > +{ > + rtx insn = insn_entry[i].insn; > + unsigned uid = INSN_UID (insn); > + > + if (insn_entry[i].is_load) > + { > + df_ref *def_rec; > + for (def_rec = DF_INSN_UID_DEFS (uid); *def_rec; def_rec++) > + { > + df_ref def = *def_rec; > + struct df_link *link = DF_REF_CHAIN (def); > + > + /* We know by now that these are swaps, so we can delete > + them confidently. */ > + while (link) > + { > + rtx use_insn = DF_REF_INSN (link->ref); > + insn_entry[INSN_UID (use_insn)].will_delete = 1; > + link = link->next; > + } > + } > + } > + else if (insn_entry[i].is_store) > + { > + df_ref *use_rec; > + for (use_rec = DF_INSN_UID_USES (uid); *use_rec; use_rec++) > + { > + df_ref use = *use_rec; > + /* Ignore uses for addressability. 
*/ > + machine_mode mode = GET_MODE (DF_REF_REG (use)); > + if (!VECTOR_MODE_P (mode)) > + continue; > + > + struct df_link *link = DF_REF_CHAIN (use); > + > + /* We know by now that these are swaps, so we can delete > + them confidently. */ > + while (link) > + { > + rtx def_insn = DF_REF_INSN (link->ref); > + insn_entry[INSN_UID (def_insn)].will_delete = 1; > + link = link->next; > + } > + } > + } > +} > + > +/* OP is either a CONST_VECTOR or an expression containing one. > + Swap the first half of the vector with the second in the first > + case. Recurse to find it in the second. */ > +static void > +swap_const_vector_halves (rtx op) > +{ > + int i; > + enum rtx_code code = GET_CODE (op); > + if (GET_CODE (op) == CONST_VECTOR) > + { > + int half_units = GET_MODE_NUNITS (GET_MODE (op)) / 2; > + for (i = 0; i < half_units; ++i) > + { > + rtx temp = CONST_VECTOR_ELT (op, i); > + CONST_VECTOR_ELT (op, i) = CONST_VECTOR_ELT (op, i + half_units); > + CONST_VECTOR_ELT (op, i + half_units) = temp; > + } > + } > + else > + { > + int j; > + const char *fmt = GET_RTX_FORMAT (code); > + for (i = 0; i < GET_RTX_LENGTH (code); ++i) > + if (fmt[i] == 'e' || fmt[i] == 'u') > + swap_const_vector_halves (XEXP (op, i)); > + else if (fmt[i] == 'E') > + for (j = 0; j < XVECLEN (op, i); ++j) > + swap_const_vector_halves (XVECEXP (op, i, j)); > + } > +} > + > +/* Find all subregs of a vector expression that perform a narrowing, > + and adjust the subreg index to account for doubleword swapping. 
*/ > +static void > +adjust_subreg_index (rtx op) > +{ > + enum rtx_code code = GET_CODE (op); > + if (code == SUBREG > + && (GET_MODE_SIZE (GET_MODE (op)) > + < GET_MODE_SIZE (GET_MODE (XEXP (op, 0))))) > + { > + unsigned int index = SUBREG_BYTE (op); > + if (index < 8) > + index += 8; > + else > + index -= 8; > + SUBREG_BYTE (op) = index; > + } > + > + const char *fmt = GET_RTX_FORMAT (code); > + int i,j; > + for (i = 0; i < GET_RTX_LENGTH (code); ++i) > + if (fmt[i] == 'e' || fmt[i] == 'u') > + adjust_subreg_index (XEXP (op, i)); > + else if (fmt[i] == 'E') > + for (j = 0; j < XVECLEN (op, i); ++j) > + adjust_subreg_index (XVECEXP (op, i, j)); > +} > + > +/* Convert the non-permuting load INSN to a permuting one. */ > +static void > +permute_load (rtx insn) > +{ > + rtx body = PATTERN (insn); > + rtx mem_op = SET_SRC (body); > + rtx tgt_reg = SET_DEST (body); > + machine_mode mode = GET_MODE (tgt_reg); > + int n_elts = GET_MODE_NUNITS (mode); > + int half_elts = n_elts / 2; > + rtx par = gen_rtx_PARALLEL (mode, rtvec_alloc (n_elts)); > + int i, j; > + for (i = 0, j = half_elts; i < half_elts; ++i, ++j) > + XVECEXP (par, 0, i) = GEN_INT (j); > + for (i = half_elts, j = 0; j < half_elts; ++i, ++j) > + XVECEXP (par, 0, i) = GEN_INT (j); > + rtx sel = gen_rtx_VEC_SELECT (mode, mem_op, par); > + SET_SRC (body) = sel; > + INSN_CODE (insn) = -1; /* Force re-recognition. */ > + df_insn_rescan (insn); > + > + if (dump_file) > + fprintf (dump_file, "Replacing load %d with permuted load\n", > + INSN_UID (insn)); > +} > + > +/* Convert the non-permuting store INSN to a permuting one. 
*/ > +static void > +permute_store (rtx insn) > +{ > + rtx body = PATTERN (insn); > + rtx src_reg = SET_SRC (body); > + machine_mode mode = GET_MODE (src_reg); > + int n_elts = GET_MODE_NUNITS (mode); > + int half_elts = n_elts / 2; > + rtx par = gen_rtx_PARALLEL (mode, rtvec_alloc (n_elts)); > + int i, j; > + for (i = 0, j = half_elts; i < half_elts; ++i, ++j) > + XVECEXP (par, 0, i) = GEN_INT (j); > + for (i = half_elts, j = 0; j < half_elts; ++i, ++j) > + XVECEXP (par, 0, i) = GEN_INT (j); > + rtx sel = gen_rtx_VEC_SELECT (mode, src_reg, par); > + SET_SRC (body) = sel; > + INSN_CODE (insn) = -1; /* Force re-recognition. */ > + df_insn_rescan (insn); > + > + if (dump_file) > + fprintf (dump_file, "Replacing store %d with permuted store\n", > + INSN_UID (insn)); > +} > + > +/* Given OP that contains a vector extract operation, adjust the index > + of the extracted lane to account for the doubleword swap. */ > +static void > +adjust_extract (rtx insn) > +{ > + rtx src = SET_SRC (PATTERN (insn)); > + /* The vec_select may be wrapped in a vec_duplicate for a splat, so > + account for that. */ > + rtx sel = GET_CODE (src) == VEC_DUPLICATE ? XEXP (src, 0) : src; > + rtx par = XEXP (sel, 1); > + int half_elts = GET_MODE_NUNITS (GET_MODE (XEXP (sel, 0))) >> 1; > + int lane = INTVAL (XVECEXP (par, 0, 0)); > + lane = lane >= half_elts ? lane - half_elts : lane + half_elts; > + XVECEXP (par, 0, 0) = GEN_INT (lane); > + INSN_CODE (insn) = -1; /* Force re-recognition. */ > + df_insn_rescan (insn); > + > + if (dump_file) > + fprintf (dump_file, "Changing lane for extract %d\n", INSN_UID (insn)); > +} > + > +/* Given OP that contains a vector direct-splat operation, adjust the index > + of the source lane to account for the doubleword swap. 
*/ > +static void > +adjust_splat (rtx insn) > +{ > + rtx body = PATTERN (insn); > + rtx unspec = XEXP (body, 1); > + int half_elts = GET_MODE_NUNITS (GET_MODE (unspec)) >> 1; > + int lane = INTVAL (XVECEXP (unspec, 0, 1)); > + lane = lane >= half_elts ? lane - half_elts : lane + half_elts; > + XVECEXP (unspec, 0, 1) = GEN_INT (lane); > + INSN_CODE (insn) = -1; /* Force re-recognition. */ > + df_insn_rescan (insn); > + > + if (dump_file) > + fprintf (dump_file, "Changing lane for splat %d\n", INSN_UID (insn)); > +} > + > +/* The insn described by INSN_ENTRY[I] can be swapped, but only > + with special handling. Take care of that here. */ > +static void > +handle_special_swappables (swap_web_entry *insn_entry, unsigned i) > +{ > + rtx insn = insn_entry[i].insn; > + rtx body = PATTERN (insn); > + > + switch (insn_entry[i].special_handling) > + { > + default: > + gcc_unreachable (); > + case SH_CONST_VECTOR: > + { > + /* A CONST_VECTOR will only show up somewhere in the RHS of a SET. */ > + gcc_assert (GET_CODE (body) == SET); > + rtx rhs = SET_SRC (body); > + swap_const_vector_halves (rhs); > + if (dump_file) > + fprintf (dump_file, "Swapping constant halves in insn %d\n", i); > + break; > + } > + case SH_SUBREG: > + /* A subreg of the same size is already safe. For subregs that > + select a smaller portion of a reg, adjust the index for > + swapped doublewords. */ > + adjust_subreg_index (body); > + if (dump_file) > + fprintf (dump_file, "Adjusting subreg in insn %d\n", i); > + break; > + case SH_NOSWAP_LD: > + /* Convert a non-permuting load to a permuting one. */ > + permute_load (insn); > + break; > + case SH_NOSWAP_ST: > + /* Convert a non-permuting store to a permuting one. */ > + permute_store (insn); > + break; > + case SH_EXTRACT: > + /* Change the lane on an extract operation. */ > + adjust_extract (insn); > + break; > + case SH_SPLAT: > + /* Change the lane on a direct-splat operation. 
*/ > + adjust_splat (insn); > + break; > + } > +} > + > +/* Find the insn from the Ith table entry, which is known to be a > + register swap Y = SWAP(X). Replace it with a copy Y = X. */ > +static void > +replace_swap_with_copy (swap_web_entry *insn_entry, unsigned i) > +{ > + rtx insn = insn_entry[i].insn; > + rtx body = PATTERN (insn); > + rtx src_reg = XEXP (SET_SRC (body), 0); > + rtx copy = gen_rtx_SET (VOIDmode, SET_DEST (body), src_reg); > + rtx new_insn = emit_insn_before (copy, insn); > + set_block_for_insn (new_insn, BLOCK_FOR_INSN (insn)); > + df_insn_rescan (new_insn); > + > + if (dump_file) > + { > + unsigned int new_uid = INSN_UID (new_insn); > + fprintf (dump_file, "Replacing swap %d with copy %d\n", i, new_uid); > + } > + > + df_insn_delete (BLOCK_FOR_INSN (insn), INSN_UID (insn)); > + remove_insn (insn); > + INSN_DELETED_P (insn) = 1; > +} > + > +/* Dump the swap table to DUMP_FILE. */ > +static void > +dump_swap_insn_table (swap_web_entry *insn_entry) > +{ > + int e = get_max_uid (); > + fprintf (dump_file, "\nRelevant insns with their flag settings\n\n"); > + > + for (int i = 0; i < e; ++i) > + if (insn_entry[i].is_relevant) > + { > + swap_web_entry *pred_entry = (swap_web_entry *)insn_entry[i].pred (); > + fprintf (dump_file, "%6d %6d ", i, > + pred_entry && pred_entry->insn > + ? 
INSN_UID (pred_entry->insn) : 0); > + if (insn_entry[i].is_load) > + fputs ("load ", dump_file); > + if (insn_entry[i].is_store) > + fputs ("store ", dump_file); > + if (insn_entry[i].is_swap) > + fputs ("swap ", dump_file); > + if (insn_entry[i].is_live_in) > + fputs ("live-in ", dump_file); > + if (insn_entry[i].is_live_out) > + fputs ("live-out ", dump_file); > + if (insn_entry[i].contains_subreg) > + fputs ("subreg ", dump_file); > + if (insn_entry[i].is_128_int) > + fputs ("int128 ", dump_file); > + if (insn_entry[i].is_call) > + fputs ("call ", dump_file); > + if (insn_entry[i].is_swappable) > + { > + fputs ("swappable ", dump_file); > + if (insn_entry[i].special_handling == SH_CONST_VECTOR) > + fputs ("special:constvec ", dump_file); > + else if (insn_entry[i].special_handling == SH_SUBREG) > + fputs ("special:subreg ", dump_file); > + else if (insn_entry[i].special_handling == SH_NOSWAP_LD) > + fputs ("special:load ", dump_file); > + else if (insn_entry[i].special_handling == SH_NOSWAP_ST) > + fputs ("special:store ", dump_file); > + else if (insn_entry[i].special_handling == SH_EXTRACT) > + fputs ("special:extract ", dump_file); > + else if (insn_entry[i].special_handling == SH_SPLAT) > + fputs ("special:splat ", dump_file); > + } > + if (insn_entry[i].web_not_optimizable) > + fputs ("unoptimizable ", dump_file); > + if (insn_entry[i].will_delete) > + fputs ("delete ", dump_file); > + fputs ("\n", dump_file); > + } > + fputs ("\n", dump_file); > +} > + > +/* Main entry point for this pass. */ > +unsigned int > +rs6000_analyze_swaps (function *fun) > +{ > + swap_web_entry *insn_entry; > + basic_block bb; > + rtx insn; > + > + /* Dataflow analysis for use-def chains. */ > + df_set_flags (DF_RD_PRUNE_DEAD_DEFS); > + df_chain_add_problem (DF_DU_CHAIN | DF_UD_CHAIN); > + df_analyze (); > + df_set_flags (DF_DEFER_INSN_RESCAN); > + > + /* Allocate structure to represent webs of insns. 
*/ > + insn_entry = XCNEWVEC (swap_web_entry, get_max_uid ()); > + > + /* Walk the insns to gather basic data. */ > + FOR_ALL_BB_FN (bb, fun) > + FOR_BB_INSNS (bb, insn) > + { > + unsigned int uid = INSN_UID (insn); > + if (NONDEBUG_INSN_P (insn)) > + { > + insn_entry[uid].insn = insn; > + > + if (GET_CODE (insn) == CALL_INSN) > + insn_entry[uid].is_call = 1; > + > + /* Walk the uses and defs to see if we mention vector regs. > + Record any constraints on optimization of such mentions. */ > + df_ref *use_rec; > + for (use_rec = DF_INSN_UID_USES (uid); *use_rec; use_rec++) > + { > + df_ref mention = *use_rec; > + /* We use DF_REF_REAL_REG here to get inside any subregs. */ > + machine_mode mode = GET_MODE (DF_REF_REAL_REG (mention)); > + > + /* If a use gets its value from a call insn, it will be > + a hard register and will look like (reg:V4SI 3 3). > + The df analysis creates two mentions for GPR3 and GPR4, > + both DImode. We must recognize this and treat it as a > + vector mention to ensure the call is unioned with this > + use. */ > + if (mode == DImode && DF_REF_INSN_INFO (mention)) > + { > + rtx feeder = DF_REF_INSN (mention); > + /* FIXME: It is pretty hard to get from the df mention > + to the mode of the use in the insn. We arbitrarily > + pick a vector mode here, even though the use might > + be a real DImode. We can be too conservative > + (create a web larger than necessary) because of > + this, so consider eventually fixing this. 
*/ > + if (GET_CODE (feeder) == CALL_INSN) > + mode = V4SImode; > + } > + > + if (VECTOR_MODE_P (mode) || mode == TImode) > + { > + insn_entry[uid].is_relevant = 1; > + if (mode == TImode || mode == V1TImode) > + insn_entry[uid].is_128_int = 1; > + if (DF_REF_INSN_INFO (mention)) > + insn_entry[uid].contains_subreg > + = !rtx_equal_p (DF_REF_REG (mention), > + DF_REF_REAL_REG (mention)); > + union_defs (insn_entry, insn, mention); > + } > + } > + df_ref *def_rec; > + for (def_rec = DF_INSN_UID_DEFS (uid); *def_rec; def_rec++) > + { > + df_ref mention = *def_rec; > + /* We use DF_REF_REAL_REG here to get inside any subregs. */ > + machine_mode mode = GET_MODE (DF_REF_REAL_REG (mention)); > + > + /* If we're loading up a hard vector register for a call, > + it looks like (set (reg:V4SI 9 9) (...)). The df > + analysis creates two mentions for GPR9 and GPR10, both > + DImode. So relying on the mode from the mentions > + isn't sufficient to ensure we union the call into the > + web with the parameter setup code. */ > + if (mode == DImode && GET_CODE (insn) == SET > + && VECTOR_MODE_P (GET_MODE (SET_DEST (insn)))) > + mode = GET_MODE (SET_DEST (insn)); > + > + if (VECTOR_MODE_P (mode) || mode == TImode) > + { > + insn_entry[uid].is_relevant = 1; > + if (mode == TImode || mode == V1TImode) > + insn_entry[uid].is_128_int = 1; > + if (DF_REF_INSN_INFO (mention)) > + insn_entry[uid].contains_subreg > + = !rtx_equal_p (DF_REF_REG (mention), > + DF_REF_REAL_REG (mention)); > + /* REG_FUNCTION_VALUE_P is not valid for subregs. */ > + else if (REG_FUNCTION_VALUE_P (DF_REF_REG (mention))) > + insn_entry[uid].is_live_out = 1; > + union_uses (insn_entry, insn, mention); > + } > + } > + > + if (insn_entry[uid].is_relevant) > + { > + /* Determine if this is a load or store. */ > + insn_entry[uid].is_load = insn_is_load_p (insn); > + insn_entry[uid].is_store = insn_is_store_p (insn); > + > + /* Determine if this is a doubleword swap. 
If not, > + determine whether it can legally be swapped. */ > + if (insn_is_swap_p (insn)) > + insn_entry[uid].is_swap = 1; > + else > + { > + unsigned int special = SH_NONE; > + insn_entry[uid].is_swappable > + = insn_is_swappable_p (insn_entry, insn, &special); > + if (special != SH_NONE && insn_entry[uid].contains_subreg) > + insn_entry[uid].is_swappable = 0; > + else if (special != SH_NONE) > + insn_entry[uid].special_handling = special; > + else if (insn_entry[uid].contains_subreg) > + insn_entry[uid].special_handling = SH_SUBREG; > + } > + } > + } > + } > + > + if (dump_file) > + { > + fprintf (dump_file, "\nSwap insn entry table when first built\n"); > + dump_swap_insn_table (insn_entry); > + } > + > + /* Record unoptimizable webs. */ > + unsigned e = get_max_uid (), i; > + for (i = 0; i < e; ++i) > + { > + if (!insn_entry[i].is_relevant) > + continue; > + > + swap_web_entry *root > + = (swap_web_entry*)(&insn_entry[i])->unionfind_root (); > + unsigned uid = INSN_UID (insn_entry[i].insn); > + > + if (insn_entry[i].is_live_in || insn_entry[i].is_live_out > + || (insn_entry[i].contains_subreg > + && insn_entry[i].special_handling != SH_SUBREG) > + || insn_entry[i].is_128_int || insn_entry[i].is_call > + || !(insn_entry[i].is_swappable || insn_entry[i].is_swap)) > + root->web_not_optimizable = 1; > + > + /* If we have loads or stores that aren't permuting then the > + optimization isn't appropriate. */ > + else if ((insn_entry[i].is_load || insn_entry[i].is_store) > + && !insn_entry[i].is_swap && !insn_entry[i].is_swappable) > + root->web_not_optimizable = 1; > + > + /* If we have permuting loads or stores that are not accompanied > + by a register swap, the optimization isn't appropriate. 
*/ > + else if (insn_entry[i].is_load && insn_entry[i].is_swap) > + { > + df_ref *def_rec; > + > + for (def_rec = DF_INSN_UID_DEFS (uid); *def_rec; def_rec++) > + { > + df_ref def = *def_rec; > + struct df_link *link = DF_REF_CHAIN (def); > + > + if (!chain_contains_only_swaps (insn_entry, link, FOR_LOADS)) > + { > + root->web_not_optimizable = 1; > + break; > + } > + } > + } > + else if (insn_entry[i].is_store && insn_entry[i].is_swap) > + { > + df_ref *use_rec; > + > + for (use_rec = DF_INSN_UID_USES (uid); *use_rec; use_rec++) > + { > + df_ref use = *use_rec; > + struct df_link *link = DF_REF_CHAIN (use); > + > + if (!chain_contains_only_swaps (insn_entry, link, FOR_STORES)) > + { > + root->web_not_optimizable = 1; > + break; > + } > + } > + } > + } > + > + if (dump_file) > + { > + fprintf (dump_file, "\nSwap insn entry table after web analysis\n"); > + dump_swap_insn_table (insn_entry); > + } > + > + /* For each load and store in an optimizable web (which implies > + the loads and stores are permuting), find the associated > + register swaps and mark them for removal. Due to various > + optimizations we may mark the same swap more than once. Also > + perform special handling for swappable insns that require it. */ > + for (i = 0; i < e; ++i) > + if ((insn_entry[i].is_load || insn_entry[i].is_store) > + && insn_entry[i].is_swap) > + { > + swap_web_entry* root_entry > + = (swap_web_entry*)((&insn_entry[i])->unionfind_root ()); > + if (!root_entry->web_not_optimizable) > + mark_swaps_for_removal (insn_entry, i); > + } > + else if (insn_entry[i].is_swappable && insn_entry[i].special_handling) > + { > + swap_web_entry* root_entry > + = (swap_web_entry*)((&insn_entry[i])->unionfind_root ()); > + if (!root_entry->web_not_optimizable) > + handle_special_swappables (insn_entry, i); > + } > + > + /* Now delete the swaps marked for removal. */ > + for (i = 0; i < e; ++i) > + if (insn_entry[i].will_delete) > + replace_swap_with_copy (insn_entry, i); > + > + /* Clean up. 
*/ > + free (insn_entry); > + return 0; > +} > + > > struct gcc_target targetm = TARGET_INITIALIZER; > > Index: gcc/config/rs6000/rs6000.opt > =================================================================== > --- gcc/config/rs6000/rs6000.opt (revision 221696) > +++ gcc/config/rs6000/rs6000.opt (working copy) > @@ -585,3 +585,7 @@ Allow double variables in upper registers with -mc > mupper-regs-sf > Target Undocumented Mask(UPPER_REGS_SF) Var(rs6000_isa_flags) > Allow float variables in upper registers with -mcpu=power8 or -mp8-vector > + > +moptimize-swaps > +Target Undocumented Var(rs6000_optimize_swaps) Init(1) Save > +Analyze and remove doubleword swaps from VSX computations. > Index: gcc/df.h > =================================================================== > --- gcc/df.h (revision 221696) > +++ gcc/df.h (working copy) > @@ -1132,20 +1132,22 @@ df_get_artificial_uses (unsigned int bb_index) > > /* web */ > > -/* This entry is allocated for each reference in the insn stream. */ > -struct web_entry > +class web_entry_base > { > - /* Pointer to the parent in the union/find tree. */ > - struct web_entry *pred; > - /* Newly assigned register to the entry. Set only for roots. */ > - rtx reg; > - void* extra_info; > + private: > + /* Reference to the parent in the union/find tree. */ > + web_entry_base *pred_pvt; > + > + public: > + /* Accessors. */ > + web_entry_base *pred () { return pred_pvt; } > + void set_pred (web_entry_base *p) { pred_pvt = p; } > + > + /* Find representative in union-find tree. */ > + web_entry_base *unionfind_root (); > + > + /* Union with another set, returning TRUE if they are already unioned. 
*/ > + friend bool unionfind_union (web_entry_base *first, web_entry_base *second); > }; > > -extern struct web_entry *unionfind_root (struct web_entry *); > -extern bool unionfind_union (struct web_entry *, struct web_entry *); > -extern void union_defs (df_ref, struct web_entry *, > - unsigned int *used, struct web_entry *, > - bool (*fun) (struct web_entry *, struct web_entry *)); > - > #endif /* GCC_DF_H */ > Index: gcc/testsuite/gcc.target/powerpc/swaps-p8-1.c > =================================================================== > --- gcc/testsuite/gcc.target/powerpc/swaps-p8-1.c (revision 0) > +++ gcc/testsuite/gcc.target/powerpc/swaps-p8-1.c (working copy) > @@ -0,0 +1,35 @@ > +/* { dg-do compile { target { powerpc64le-*-* } } } */ > +/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power8" } } */ > +/* { dg-options "-mcpu=power8 -O3" } */ > +/* { dg-final { scan-assembler "lxvd2x" } } */ > +/* { dg-final { scan-assembler "stxvd2x" } } */ > +/* { dg-final { scan-assembler-not "xxpermdi" } } */ > + > +void abort(); > + > +#define N 16 > + > +signed char ca[N] __attribute__((aligned(16))); > +signed char cb[] __attribute__((aligned(16))) > + = {8, 7, 6, 5, 4, 3, 2, 1, 0, -1, -2, -3, -4, -5, -6, -7}; > +signed char cc[] __attribute__((aligned(16))) > + = {1, 1, 2, 2, 3, 3, 2, 2, 1, 1, 0, 0, -1, -1, -2, -2}; > + > +__attribute__((noinline)) void foo () > +{ > + int i; > + for (i = 0; i < N; i++) { > + ca[i] = cb[i] - cc[i]; > + } > +} > + > +int main () > +{ > + signed char cd[] = {7, 6, 4, 3, 1, 0, 0, -1, -1, -2, -2, -3, -3, -4, -4, -5}; > + int i; > + foo (); > + for (i = 0; i < N; ++i) > + if (ca[i] != cd[i]) > + abort (); > + return 0; > +} > Index: gcc/testsuite/gcc.target/powerpc/swaps-p8-10.c > =================================================================== > --- gcc/testsuite/gcc.target/powerpc/swaps-p8-10.c (revision 0) > +++ gcc/testsuite/gcc.target/powerpc/swaps-p8-10.c (working copy) > @@ -0,0 +1,42 @@ > +/* { dg-do 
run { target { powerpc64le-*-* } } } */ > +/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power8" } } */ > +/* { dg-options "-mcpu=power8 -O3" } */ > + > +void abort (); > + > +#define N 4096 > +int ca[N] __attribute__((aligned(16))); > +int cb[N] __attribute__((aligned(16))); > +int cc[N] __attribute__((aligned(16))); > +int cd[N] __attribute__((aligned(16))); > + > +__attribute__((noinline)) void foo () > +{ > + int i; > + for (i = 0; i < N; i++) { > + ca[i] = ((cb[i] + cc[i]) * cd[i]) >> 3; > + } > +} > + > +__attribute__((noinline)) void init () > +{ > + int i; > + for (i = 0; i < N; ++i) { > + cb[i] = 3 * i - 2048; > + cc[i] = -5 * i + 93; > + cd[i] = i % 2 ? 1 : -1; > + } > +} > + > +int main () > +{ > + int i; > + init (); > + foo (); > + for (i = 0; i < N; ++i) > + if (i % 2 == 1 && ca[i] != (-2 * i - 1955) >> 3) > + abort (); > + else if (i % 2 == 0 && ca[i] != (1955 + 2 * i) >> 3) > + abort (); > + return 0; > +} > Index: gcc/testsuite/gcc.target/powerpc/swaps-p8-11.c > =================================================================== > --- gcc/testsuite/gcc.target/powerpc/swaps-p8-11.c (revision 0) > +++ gcc/testsuite/gcc.target/powerpc/swaps-p8-11.c (working copy) > @@ -0,0 +1,53 @@ > +/* { dg-do run { target { powerpc64le-*-* } } } */ > +/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power8" } } */ > +/* { dg-options "-mcpu=power8 -O3" } */ > + > +#include <altivec.h> > +void abort (); > + > +#define N 4096 > +int ca[N] __attribute__((aligned(16))); > +int cb[N] __attribute__((aligned(16))); > +int cc[N] __attribute__((aligned(16))); > +int cd[N] __attribute__((aligned(16))); > +int hey; > + > +__attribute__((noinline)) void foo () > +{ > + int i; > + vector int va, vb, vc, vd, tmp; > + vector unsigned int threes = vec_splat_u32(3); > + for (i = 0; i < N; i+=4) { > + vb = vec_vsx_ld (0, &cb[i]); > + vc = vec_vsx_ld (0, &cc[i]); > + vd = vec_vsx_ld (0, &cd[i]); > + tmp = vec_add (vb, 
vc); > + tmp = vec_sub (tmp, vd); > + tmp = vec_sra (tmp, threes); > + hey = tmp[3]; > + vec_vsx_st (tmp, 0, &ca[i]); > + } > +} > + > +__attribute__((noinline)) void init () > +{ > + int i; > + for (i = 0; i < N; ++i) { > + cb[i] = 3 * i - 2048; > + cc[i] = -5 * i + 93; > + cd[i] = i + 14; > + } > +} > + > +int main () > +{ > + int i; > + init (); > + foo (); > + for (i = 0; i < N; ++i) > + if (ca[i] != (-3 * i - 1969) >> 3) > + abort (); > + if (hey != ca[N-1]) > + abort (); > + return 0; > +} > Index: gcc/testsuite/gcc.target/powerpc/swaps-p8-12.c > =================================================================== > --- gcc/testsuite/gcc.target/powerpc/swaps-p8-12.c (revision 0) > +++ gcc/testsuite/gcc.target/powerpc/swaps-p8-12.c (working copy) > @@ -0,0 +1,56 @@ > +/* { dg-do compile { target { powerpc64le-*-* } } } */ > +/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power8" } } */ > +/* { dg-options "-mcpu=power8 -O3" } */ > +/* { dg-final { scan-assembler "lxvd2x" } } */ > +/* { dg-final { scan-assembler "stxvd2x" } } */ > +/* { dg-final { scan-assembler-not "xxpermdi" } } */ > + > +#include "altivec.h" > +void abort (); > + > +#define N 4096 > +int ca[N] __attribute__((aligned(16))); > +int cb[N] __attribute__((aligned(16))); > +int cc[N] __attribute__((aligned(16))); > +int cd[N] __attribute__((aligned(16))); > +int hey; > + > +__attribute__((noinline)) void foo () > +{ > + int i; > + vector int va, vb, vc, vd, tmp; > + vector unsigned int threes = vec_splat_u32(3); > + for (i = 0; i < N; i+=4) { > + vb = vec_vsx_ld (0, &cb[i]); > + vc = vec_vsx_ld (0, &cc[i]); > + vd = vec_vsx_ld (0, &cd[i]); > + tmp = vec_add (vb, vc); > + tmp = vec_sub (tmp, vd); > + tmp = vec_sra (tmp, threes); > + hey = tmp[3]; > + vec_vsx_st (tmp, 0, &ca[i]); > + } > +} > + > +__attribute__((noinline)) void init () > +{ > + int i; > + for (i = 0; i < N; ++i) { > + cb[i] = 3 * i - 2048; > + cc[i] = -5 * i + 93; > + cd[i] = i + 14; > + } > +} > + > 
+int main () > +{ > + int i; > + init (); > + foo (); > + for (i = 0; i < N; ++i) > + if (ca[i] != (-3 * i - 1969) >> 3) > + abort (); > + if (hey != ca[N-1]) > + abort (); > + return 0; > +} > Index: gcc/testsuite/gcc.target/powerpc/swaps-p8-13.c > =================================================================== > --- gcc/testsuite/gcc.target/powerpc/swaps-p8-13.c (revision 0) > +++ gcc/testsuite/gcc.target/powerpc/swaps-p8-13.c (working copy) > @@ -0,0 +1,54 @@ > +/* { dg-do run { target { powerpc64le-*-* } } } */ > +/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power8" } } */ > +/* { dg-options "-mcpu=power8 -O3" } */ > + > +#include <altivec.h> > +void abort (); > + > +#define N 4096 > +long long ca[N] __attribute__((aligned(16))); > +long long cb[N] __attribute__((aligned(16))); > +long long cc[N] __attribute__((aligned(16))); > +long long cd[N] __attribute__((aligned(16))); > +long long x; > + > +__attribute__((noinline)) void foo () > +{ > + int i; > + vector long long va, vb, vc, vd, tmp; > + volatile unsigned long long three = 3; > + vector unsigned long long threes = vec_splats (three); > + for (i = 0; i < N; i+=2) { > + vb = vec_vsx_ld (0, (vector long long *)&cb[i]); > + vc = vec_vsx_ld (0, (vector long long *)&cc[i]); > + vd = vec_vsx_ld (0, (vector long long *)&cd[i]); > + tmp = vec_add (vb, vc); > + tmp = vec_sub (tmp, vd); > + tmp = vec_sra (tmp, threes); > + x = vec_extract (tmp, 0); > + vec_vsx_st (tmp, 0, (vector long long *)&ca[i]); > + } > +} > + > +__attribute__((noinline)) void init () > +{ > + int i; > + for (i = 0; i < N; ++i) { > + cb[i] = 3 * i - 2048; > + cc[i] = -5 * i + 93; > + cd[i] = i + 14; > + } > +} > + > +int main () > +{ > + int i; > + init (); > + foo (); > + for (i = 0; i < N; ++i) > + if (ca[i] != (-3 * i - 1969) >> 3) > + abort (); > + if (x != ca[N-1]) > + abort (); > + return 0; > +} > Index: gcc/testsuite/gcc.target/powerpc/swaps-p8-15.c > 
=================================================================== > --- gcc/testsuite/gcc.target/powerpc/swaps-p8-15.c (revision 0) > +++ gcc/testsuite/gcc.target/powerpc/swaps-p8-15.c (working copy) > @@ -0,0 +1,51 @@ > +/* { dg-do compile { target { powerpc64le-*-* } } } */ > +/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power8" } } */ > +/* { dg-options "-mcpu=power8 -O3" } */ > +/* { dg-final { scan-assembler "lxvd2x" } } */ > +/* { dg-final { scan-assembler "stxvd2x" } } */ > +/* { dg-final { scan-assembler "xxspltw" } } */ > + > +/* Currently the analyze_swaps phase cannot optimize this loop because > + of the presence of an UNSPEC_VSX_CVDPSPN. At such time as this is > + handled, we need to add a 'scan-assembler-not "xxpermdi"' directive to > + this test. */ > +#include <altivec.h> > +void abort(); > + > +#define N 4096 > +#define M 10000000 > +vector float ca[N][4] = {0}; > +vector float cb[N][4] = {0}; > +vector float cc[N][4] = {0}; > + > +__attribute__((noinline)) void foo () > +{ > + int i; > + for (i = 0; i < N; i++) { > + cc[i][0] = vec_mul(vec_splats(cb[i][0][0]), ca[i][0]); > + cc[i][0] = vec_madd(cc[i][0],vec_splats(cb[i][0][1]), ca[i][1]); > + cc[i][0] = vec_madd(cc[i][0],vec_splats(cb[i][0][2]), ca[i][2]); > + cc[i][0] = vec_madd(cc[i][0],vec_splats(cb[i][0][3]), ca[i][3]); > + > + cc[i][1] = vec_mul(vec_splats(cb[i][1][0]), ca[i][0]); > + cc[i][1] = vec_madd(cc[i][0],vec_splats(cb[i][1][1]), ca[i][1]); > + cc[i][1] = vec_madd(cc[i][0],vec_splats(cb[i][1][2]), ca[i][2]); > + cc[i][1] = vec_madd(cc[i][0],vec_splats(cb[i][1][3]), ca[i][3]); > + > + cc[i][2] = vec_mul(vec_splats(cb[i][2][0]), ca[i][0]); > + cc[i][2] = vec_madd(cc[i][0],vec_splats(cb[i][2][1]), ca[i][1]); > + cc[i][2] = vec_madd(cc[i][0],vec_splats(cb[i][2][2]), ca[i][2]); > + cc[i][2] = vec_madd(cc[i][0],vec_splats(cb[i][2][3]), ca[i][3]); > + > + cc[i][3] = vec_mul(vec_splats(cb[i][3][0]), ca[i][0]); > + cc[i][3] = 
vec_madd(cc[i][0],vec_splats(cb[i][3][1]), ca[i][1]); > + cc[i][3] = vec_madd(cc[i][0],vec_splats(cb[i][3][2]), ca[i][2]); > + cc[i][3] = vec_madd(cc[i][0],vec_splats(cb[i][3][3]), ca[i][3]); > + } > +} > + > +int main () > +{ > + foo (); > + return 0; > +} > Index: gcc/testsuite/gcc.target/powerpc/swaps-p8-17.c > =================================================================== > --- gcc/testsuite/gcc.target/powerpc/swaps-p8-17.c (revision 0) > +++ gcc/testsuite/gcc.target/powerpc/swaps-p8-17.c (working copy) > @@ -0,0 +1,15 @@ > +/* { dg-do compile { target { powerpc64le-*-* } } } */ > +/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power8" } } */ > +/* { dg-options "-mcpu=power8 -O1" } */ > +/* { dg-final { scan-assembler "lxvd2x" } } */ > +/* { dg-final { scan-assembler "xxpermdi" } } */ > + > +/* Verify that we don't try to do permute removal in the presence of > + vec_ste. This used to ICE. */ > +#include <altivec.h> > + > +void f (void *p) > +{ > + vector unsigned int u32 = vec_vsx_ld (1, (const unsigned int *)p); > + vec_ste (u32, 1, (unsigned int *)p); > +} > Index: gcc/testsuite/gcc.target/powerpc/swaps-p8-3.c > =================================================================== > --- gcc/testsuite/gcc.target/powerpc/swaps-p8-3.c (revision 0) > +++ gcc/testsuite/gcc.target/powerpc/swaps-p8-3.c (working copy) > @@ -0,0 +1,43 @@ > +/* { dg-do compile { target { powerpc64le-*-* } } } */ > +/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power8" } } */ > +/* { dg-options "-mcpu=power8 -O3" } */ > +/* { dg-final { scan-assembler "lxvd2x" } } */ > +/* { dg-final { scan-assembler "stxvd2x" } } */ > +/* { dg-final { scan-assembler-not "xxpermdi" } } */ > + > +void abort (); > + > +#define N 4096 > +signed char ca[N] __attribute__((aligned(16))); > +signed char cb[N] __attribute__((aligned(16))); > +signed char cc[N] __attribute__((aligned(16))); > + > +__attribute__((noinline)) void foo () > +{ > 
+ int i; > + for (i = 0; i < N; i++) { > + ca[i] = cb[i] - cc[i]; > + } > +} > + > +__attribute__((noinline)) void init () > +{ > + int i, ii; > + for (i = 0, ii = 0; i < N; ++i, ii = (ii + 1) % 128) { > + cb[i] = ii - 128; > + cc[i] = ii/2 - 64; > + } > +} > + > +int main () > +{ > + int i, ii; > + init (); > + foo (); > + for (i = 0; i < N; ++i) { > + ii = i % 128; > + if (ca[i] != ii - ii/2 - 64) > + abort (); > + } > + return 0; > +} > Index: gcc/testsuite/gcc.target/powerpc/swaps-p8-4.c > =================================================================== > --- gcc/testsuite/gcc.target/powerpc/swaps-p8-4.c (revision 0) > +++ gcc/testsuite/gcc.target/powerpc/swaps-p8-4.c (working copy) > @@ -0,0 +1,45 @@ > +/* { dg-do compile { target { powerpc64le-*-* } } } */ > +/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power8" } } */ > +/* { dg-options "-mcpu=power8 -O3" } */ > +/* { dg-final { scan-assembler "lxvd2x" } } */ > +/* { dg-final { scan-assembler "stxvd2x" } } */ > +/* { dg-final { scan-assembler-not "xxpermdi" } } */ > + > +void abort (); > + > +#define N 4096 > +int ca[N] __attribute__((aligned(16))); > +int cb[N] __attribute__((aligned(16))); > +int cc[N] __attribute__((aligned(16))); > +int cd[N] __attribute__((aligned(16))); > + > +__attribute__((noinline)) void foo () > +{ > + int i; > + for (i = 0; i < N; i++) { > + ca[i] = (cb[i] + cc[i]) * cd[i]; > + } > +} > + > +__attribute__((noinline)) void init () > +{ > + int i; > + for (i = 0; i < N; ++i) { > + cb[i] = 3 * i - 2048; > + cc[i] = -5 * i + 93; > + cd[i] = i % 2 ? 
1 : -1; > + } > +} > + > +int main () > +{ > + int i; > + init (); > + foo (); > + for (i = 0; i < N; ++i) > + if (i % 2 == 1 && ca[i] != -2 * i - 1955) > + abort (); > + else if (i % 2 == 0 && ca[i] != 1955 + 2 * i) > + abort (); > + return 0; > +} > Index: gcc/testsuite/gcc.target/powerpc/swaps-p8-5.c > =================================================================== > --- gcc/testsuite/gcc.target/powerpc/swaps-p8-5.c (revision 0) > +++ gcc/testsuite/gcc.target/powerpc/swaps-p8-5.c (working copy) > @@ -0,0 +1,45 @@ > +/* { dg-do compile { target { powerpc64le-*-* } } } */ > +/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power8" } } */ > +/* { dg-options "-mcpu=power8 -O3" } */ > +/* { dg-final { scan-assembler "lxvd2x" } } */ > +/* { dg-final { scan-assembler "stxvd2x" } } */ > +/* { dg-final { scan-assembler-not "xxpermdi" } } */ > + > +void abort (); > + > +#define N 4096 > +int ca[N] __attribute__((aligned(16))); > +int cb[N] __attribute__((aligned(16))); > +int cc[N] __attribute__((aligned(16))); > +int cd[N] __attribute__((aligned(16))); > + > +__attribute__((noinline)) void foo () > +{ > + int i; > + for (i = 0; i < N; i++) { > + ca[i] = ((cb[i] + cc[i]) * cd[i]) >> 3; > + } > +} > + > +__attribute__((noinline)) void init () > +{ > + int i; > + for (i = 0; i < N; ++i) { > + cb[i] = 3 * i - 2048; > + cc[i] = -5 * i + 93; > + cd[i] = i % 2 ? 
1 : -1; > + } > +} > + > +int main () > +{ > + int i; > + init (); > + foo (); > + for (i = 0; i < N; ++i) > + if (i % 2 == 1 && ca[i] != (-2 * i - 1955) >> 3) > + abort (); > + else if (i % 2 == 0 && ca[i] != (1955 + 2 * i) >> 3) > + abort (); > + return 0; > +} > Index: gcc/testsuite/gcc.target/powerpc/swaps-p8-6.c > =================================================================== > --- gcc/testsuite/gcc.target/powerpc/swaps-p8-6.c (revision 0) > +++ gcc/testsuite/gcc.target/powerpc/swaps-p8-6.c (working copy) > @@ -0,0 +1,32 @@ > +/* { dg-do run { target { powerpc64le-*-* } } } */ > +/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power8" } } */ > +/* { dg-options "-mcpu=power8 -O3" } */ > + > +void abort(); > + > +#define N 16 > + > +signed char ca[N] __attribute__((aligned(16))); > +signed char cb[] __attribute__((aligned(16))) > + = {8, 7, 6, 5, 4, 3, 2, 1, 0, -1, -2, -3, -4, -5, -6, -7}; > +signed char cc[] __attribute__((aligned(16))) > + = {1, 1, 2, 2, 3, 3, 2, 2, 1, 1, 0, 0, -1, -1, -2, -2}; > + > +__attribute__((noinline)) void foo () > +{ > + int i; > + for (i = 0; i < N; i++) { > + ca[i] = cb[i] - cc[i]; > + } > +} > + > +int main () > +{ > + signed char cd[] = {7, 6, 4, 3, 1, 0, 0, -1, -1, -2, -2, -3, -3, -4, -4, -5}; > + int i; > + foo (); > + for (i = 0; i < N; ++i) > + if (ca[i] != cd[i]) > + abort (); > + return 0; > +} > Index: gcc/testsuite/gcc.target/powerpc/swaps-p8-7.c > =================================================================== > --- gcc/testsuite/gcc.target/powerpc/swaps-p8-7.c (revision 0) > +++ gcc/testsuite/gcc.target/powerpc/swaps-p8-7.c (working copy) > @@ -0,0 +1,38 @@ > +/* { dg-do run { target { powerpc64le-*-* } } } */ > +/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power8" } } */ > +/* { dg-options "-mcpu=power8 -O3" } */ > + > +void abort (); > + > +#define N 256 > +signed char ca[N] __attribute__((aligned(16))); > +signed char cb[N] 
__attribute__((aligned(16))); > +signed char cc[N] __attribute__((aligned(16))); > + > +__attribute__((noinline)) void foo () > +{ > + int i; > + for (i = 0; i < N; i++) { > + ca[i] = cb[i] - cc[i]; > + } > +} > + > +__attribute__((noinline)) void init () > +{ > + int i; > + for (i = 0; i < N; ++i) { > + cb[i] = i - 128; > + cc[i] = i/2 - 64; > + } > +} > + > +int main () > +{ > + int i; > + init (); > + foo (); > + for (i = 0; i < N; ++i) > + if (ca[i] != i - i/2 - 64) > + abort (); > + return 0; > +} > Index: gcc/testsuite/gcc.target/powerpc/swaps-p8-8.c > =================================================================== > --- gcc/testsuite/gcc.target/powerpc/swaps-p8-8.c (revision 0) > +++ gcc/testsuite/gcc.target/powerpc/swaps-p8-8.c (working copy) > @@ -0,0 +1,40 @@ > +/* { dg-do run { target { powerpc64le-*-* } } } */ > +/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power8" } } */ > +/* { dg-options "-mcpu=power8 -O3" } */ > + > +void abort (); > + > +#define N 4096 > +signed char ca[N] __attribute__((aligned(16))); > +signed char cb[N] __attribute__((aligned(16))); > +signed char cc[N] __attribute__((aligned(16))); > + > +__attribute__((noinline)) void foo () > +{ > + int i; > + for (i = 0; i < N; i++) { > + ca[i] = cb[i] - cc[i]; > + } > +} > + > +__attribute__((noinline)) void init () > +{ > + int i, ii; > + for (i = 0, ii = 0; i < N; ++i, ii = (ii + 1) % 128) { > + cb[i] = ii - 128; > + cc[i] = ii/2 - 64; > + } > +} > + > +int main () > +{ > + int i, ii; > + init (); > + foo (); > + for (i = 0; i < N; ++i) { > + ii = i % 128; > + if (ca[i] != ii - ii/2 - 64) > + abort (); > + } > + return 0; > +} > Index: gcc/testsuite/gcc.target/powerpc/swaps-p8-9.c > =================================================================== > --- gcc/testsuite/gcc.target/powerpc/swaps-p8-9.c (revision 0) > +++ gcc/testsuite/gcc.target/powerpc/swaps-p8-9.c (working copy) > @@ -0,0 +1,42 @@ > +/* { dg-do run { target { powerpc64le-*-* } } } */ 
> +/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power8" } } */ > +/* { dg-options "-mcpu=power8 -O3" } */ > + > +void abort (); > + > +#define N 4096 > +int ca[N] __attribute__((aligned(16))); > +int cb[N] __attribute__((aligned(16))); > +int cc[N] __attribute__((aligned(16))); > +int cd[N] __attribute__((aligned(16))); > + > +__attribute__((noinline)) void foo () > +{ > + int i; > + for (i = 0; i < N; i++) { > + ca[i] = (cb[i] + cc[i]) * cd[i]; > + } > +} > + > +__attribute__((noinline)) void init () > +{ > + int i; > + for (i = 0; i < N; ++i) { > + cb[i] = 3 * i - 2048; > + cc[i] = -5 * i + 93; > + cd[i] = i % 2 ? 1 : -1; > + } > +} > + > +int main () > +{ > + int i; > + init (); > + foo (); > + for (i = 0; i < N; ++i) > + if (i % 2 == 1 && ca[i] != -2 * i - 1955) > + abort (); > + else if (i % 2 == 0 && ca[i] != 1955 + 2 * i) > + abort (); > + return 0; > +} > Index: gcc/web.c > =================================================================== > --- gcc/web.c (revision 221696) > +++ gcc/web.c (working copy) > @@ -53,17 +53,17 @@ along with GCC; see the file COPYING3. If not see > > /* Find the root of unionfind tree (the representative of set). */ > > -struct web_entry * > -unionfind_root (struct web_entry *element) > +web_entry_base * > +web_entry_base::unionfind_root () > { > - struct web_entry *element1 = element, *element2; > + web_entry_base *element = this, *element1 = this, *element2; > > - while (element->pred) > - element = element->pred; > - while (element1->pred) > + while (element->pred ()) > + element = element->pred (); > + while (element1->pred ()) > { > - element2 = element1->pred; > - element1->pred = element; > + element2 = element1->pred (); > + element1->set_pred (element); > element1 = element2; > } > return element; > @@ -74,23 +74,32 @@ along with GCC; see the file COPYING3. If not see > nothing is done. Otherwise, return false. 
*/ > > bool > -unionfind_union (struct web_entry *first, struct web_entry *second) > +unionfind_union (web_entry_base *first, web_entry_base *second) > { > - first = unionfind_root (first); > - second = unionfind_root (second); > + first = first->unionfind_root (); > + second = second->unionfind_root (); > if (first == second) > return true; > - second->pred = first; > + second->set_pred (first); > return false; > } > > +class web_entry : public web_entry_base > +{ > + private: > + rtx reg_pvt; > + > + public: > + rtx reg () { return reg_pvt; } > + void set_reg (rtx r) { reg_pvt = r; } > +}; > + > /* For INSN, union all defs and uses that are linked by match_dup. > FUN is the function that does the union. */ > > static void > -union_match_dups (rtx insn, struct web_entry *def_entry, > - struct web_entry *use_entry, > - bool (*fun) (struct web_entry *, struct web_entry *)) > +union_match_dups (rtx insn, web_entry *def_entry, web_entry *use_entry, > + bool (*fun) (web_entry_base *, web_entry_base *)) > { > struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn); > df_ref *use_link = DF_INSN_INFO_USES (insn_info); > @@ -157,9 +166,9 @@ static void > the values 0 and 1 are reserved for use by entry_register. */ > > void > -union_defs (df_ref use, struct web_entry *def_entry, > - unsigned int *used, struct web_entry *use_entry, > - bool (*fun) (struct web_entry *, struct web_entry *)) > +union_defs (df_ref use, web_entry *def_entry, > + unsigned int *used, web_entry *use_entry, > + bool (*fun) (web_entry_base *, web_entry_base *)) > { > struct df_insn_info *insn_info = DF_REF_INSN_INFO (use); > struct df_link *link = DF_REF_CHAIN (use); > @@ -260,15 +269,15 @@ void > /* Find the corresponding register for the given entry. 
*/ > > static rtx > -entry_register (struct web_entry *entry, df_ref ref, unsigned int *used) > +entry_register (web_entry *entry, df_ref ref, unsigned int *used) > { > - struct web_entry *root; > + web_entry *root; > rtx reg, newreg; > > /* Find the corresponding web and see if it has been visited. */ > - root = unionfind_root (entry); > - if (root->reg) > - return root->reg; > + root = (web_entry *)entry->unionfind_root (); > + if (root->reg ()) > + return root->reg (); > > /* We are seeing this web for the first time, do the assignment. */ > reg = DF_REF_REAL_REG (ref); > @@ -292,7 +301,7 @@ static rtx > REGNO (newreg)); > } > > - root->reg = newreg; > + root->set_reg (newreg); > return newreg; > } > > @@ -326,8 +335,8 @@ gate_handle_web (void) > static unsigned int > web_main (void) > { > - struct web_entry *def_entry; > - struct web_entry *use_entry; > + web_entry *def_entry; > + web_entry *use_entry; > unsigned int max = max_reg_num (); > unsigned int *used; > basic_block bb; > @@ -364,9 +373,9 @@ web_main (void) > } > > /* Record the number of uses and defs at the beginning of the optimization. */ > - def_entry = XCNEWVEC (struct web_entry, DF_DEFS_TABLE_SIZE()); > + def_entry = XCNEWVEC (web_entry, DF_DEFS_TABLE_SIZE()); > used = XCNEWVEC (unsigned, max); > - use_entry = XCNEWVEC (struct web_entry, uses_num); > + use_entry = XCNEWVEC (web_entry, uses_num); > > /* Produce the web. */ > FOR_ALL_BB (bb) > >
On Thu, Mar 26, 2015 at 11:25 AM, Bill Schmidt <wschmidt@linux.vnet.ibm.com> wrote: >> This is a follow-up to >> https://gcc.gnu.org/ml/gcc-patches/2015-03/msg01310.html, which >> backported the POWER-specific little-endian swap optimization pass to >> the 4.9 branch. We also need to backport this to the 4.8 branch. This >> patch does that. >> >> The patch is very similar to the 4.9 backport, except for two things. >> First, the passes infrastructure changed quite a bit between 4.8 and >> 4.9, so the code to describe the new pass to the pass manager is >> somewhat different. Second, I've omitted three of the test cases, which >> happen to fail on 4.8 for unrelated reasons. (We run out of volatile >> registers and end up saving non-volatiles to the stack in the prologue, >> which generates load/swap sequences for now.) >> >> Tested on powerpc64le-unknown-linux-gnu with no regressions. Is this OK >> for 4.8? >> >> Thanks, >> Bill >> >> >> [gcc] >> >> 2015-03-26 Bill Schmidt <wschmidt@linux.vnet.ibm.com> >> >> Backport of r214242, r214254, and bug fix patches from mainline >> * config/rs6000/rs6000.c (tree-pass.h): New #include. >> (rs6000_analyze_swaps): New declaration. >> (gate_analyze_swaps): New function. >> (execute_analyze_swaps): Likewise. >> (pass_analyze_swaps): New struct rtl_opt_pass. >> (rs6000_option_override): Register swap-optimization pass. >> (swap_web_entry): New class. >> (special_handling_values): New enum. >> (union_defs): New function. >> (union_uses): Likewise. >> (insn_is_load_p): Likewise. >> (insn_is_store_p): Likewise. >> (insn_is_swap_p): Likewise. >> (rtx_is_swappable_p): Likewise. >> (insn_is_swappable_p): Likewise. >> (chain_purpose): New enum. >> (chain_contains_only_swaps): New function. >> (mark_swaps_for_removal): Likewise. >> (swap_const_vector_halves): Likewise. >> (adjust_subreg_index): Likewise. >> (permute_load): Likewise. >> (permute_store): Likewise. >> (adjust_extract): Likewise. >> (adjust_splat): Likewise. 
>> (handle_special_swappables): Likewise. >> (replace_swap_with_copy): Likewise. >> (dump_swap_insn_table): Likewise. >> (rs6000_analyze_swaps): Likewise. >> * config/rs6000/rs6000.opt (moptimize-swaps): New option. >> * df.h (web_entry_base): New class, replacing struct web_entry. >> (web_entry_base::pred): New method. >> (web_entry_base::set_pred): Likewise. >> (web_entry_base::unionfind_root): Likewise. >> (web_entry_base::unionfind_union): Likewise. >> (unionfind_root): Delete external reference. >> (unionfind_union): Likewise. >> (union_defs): Likewise. >> * web.c (web_entry_base::unionfind_root): Convert to method. >> (web_entry_base::unionfind_union): Likewise. >> (web_entry): New class. >> (union_match_dups): Convert to use class structure. >> (union_defs): Likewise. >> (entry_register): Likewise. >> (web_main): Likewise. >> >> >> [gcc/testsuite] >> >> 2015-03-26 Bill Schmidt <wschmidt@linux.vnet.ibm.com> >> >> Backport r214254 and related tests from mainline >> * gcc.target/powerpc/swaps-p8-1.c: New test. >> * gcc.target/powerpc/swaps-p8-3.c: New test. >> * gcc.target/powerpc/swaps-p8-4.c: New test. >> * gcc.target/powerpc/swaps-p8-5.c: New test. >> * gcc.target/powerpc/swaps-p8-6.c: New test. >> * gcc.target/powerpc/swaps-p8-7.c: New test. >> * gcc.target/powerpc/swaps-p8-8.c: New test. >> * gcc.target/powerpc/swaps-p8-9.c: New test. >> * gcc.target/powerpc/swaps-p8-10.c: New test. >> * gcc.target/powerpc/swaps-p8-11.c: New test. >> * gcc.target/powerpc/swaps-p8-12.c: New test. >> * gcc.target/powerpc/swaps-p8-13.c: New test. >> * gcc.target/powerpc/swaps-p8-15.c: New test. >> * gcc.target/powerpc/swaps-p8-17.c: New test. Okay. Thanks, David
Index: gcc/config/rs6000/rs6000.c =================================================================== --- gcc/config/rs6000/rs6000.c (revision 221696) +++ gcc/config/rs6000/rs6000.c (working copy) @@ -61,6 +61,7 @@ #include "tree-vectorizer.h" #include "dumpfile.h" #include "real.h" +#include "tree-pass.h" #if TARGET_XCOFF #include "xcoffout.h" /* get declarations of xcoff_*_section_name */ #endif @@ -1153,6 +1154,7 @@ static bool rs6000_secondary_reload_move (enum rs6 enum machine_mode, secondary_reload_info *, bool); +static unsigned int rs6000_analyze_swaps (function *); /* Hash table stuff for keeping track of TOC entries. */ @@ -4046,6 +4048,37 @@ rs6000_option_override_internal (bool global_init_ return ret; } +static bool +gate_analyze_swaps (void) +{ + return (optimize > 0 && !BYTES_BIG_ENDIAN && TARGET_VSX + && rs6000_optimize_swaps); +} + +static unsigned int +execute_analyze_swaps (void) +{ + return rs6000_analyze_swaps (cfun); +} + +struct rtl_opt_pass pass_analyze_swaps = +{ + RTL_PASS, + "swaps", /* name */ + OPTGROUP_NONE, /* optinfo_flags */ + gate_analyze_swaps, /* has_gate */ + execute_analyze_swaps, /* has_execute */ + NULL, /* sub */ + NULL, /* next */ + 0, /* static_pass_number */ + TV_NONE, /* tv_id */ + 0, /* properties_required */ + 0, /* properties_provided */ + 0, /* properties_destroyed */ + 0, /* todo_flags_start */ + TODO_df_finish, /* todo_flags_finish */ +}; + /* Implement TARGET_OPTION_OVERRIDE. On the RS/6000 this is used to define the target cpu type. */ @@ -4053,6 +4086,13 @@ static void rs6000_option_override (void) { (void) rs6000_option_override_internal (true); + + /* Register machine-specific passes. This needs to be done at start-up. + It's convenient to do it here (like i386 does). 
*/ + static struct register_pass_info analyze_swaps_info + = { &pass_analyze_swaps.pass, "cse1", 1, PASS_POS_INSERT_BEFORE }; + + register_pass (&analyze_swaps_info); } @@ -33210,7 +33250,1148 @@ emit_fusion_gpr_load (rtx target, rtx mem) return ""; } + +/* Analyze vector computations and remove unnecessary doubleword + swaps (xxswapdi instructions). This pass is performed only + for little-endian VSX code generation. + For this specific case, loads and stores of 4x32 and 2x64 vectors + are inefficient. These are implemented using the lxvd2x and + stxvd2x instructions, which invert the order of doublewords in + a vector register. Thus the code generation inserts an xxswapdi + after each such load, and prior to each such store. (For spill + code after register assignment, an additional xxswapdi is inserted + following each store in order to return a hard register to its + unpermuted value.) + + The extra xxswapdi instructions reduce performance. This can be + particularly bad for vectorized code. The purpose of this pass + is to reduce the number of xxswapdi instructions required for + correctness. + + The primary insight is that much code that operates on vectors + does not care about the relative order of elements in a register, + so long as the correct memory order is preserved. If we have + a computation where all input values are provided by lxvd2x/xxswapdi + sequences, all outputs are stored using xxswapdi/stxvd2x sequences, + and all intermediate computations are pure SIMD (independent of + element order), then all the xxswapdi's associated with the loads + and stores may be removed. + + This pass uses some of the infrastructure and logical ideas from + the "web" pass in web.c. We create maximal webs of computations + fitting the description above using union-find. Each such web is + then optimized by removing its unnecessary xxswapdi instructions. 
+ + The pass is placed prior to global optimization so that we can + perform the optimization in the safest and simplest way possible; + that is, by replacing each xxswapdi insn with a register copy insn. + Subsequent forward propagation will remove copies where possible. + + There are some operations sensitive to element order for which we + can still allow the operation, provided we modify those operations. + These include CONST_VECTORs, for which we must swap the first and + second halves of the constant vector; and SUBREGs, for which we + must adjust the byte offset to account for the swapped doublewords. + A remaining opportunity would be non-immediate-form splats, for + which we should adjust the selected lane of the input. We should + also make code generation adjustments for sum-across operations, + since this is a common vectorizer reduction. + + Because we run prior to the first split, we can see loads and stores + here that match *vsx_le_perm_{load,store}_<mode>. These are vanilla + vector loads and stores that have not yet been split into a permuting + load/store and a swap. (One way this can happen is with a builtin + call to vec_vsx_{ld,st}.) We can handle these as well, but rather + than deleting a swap, we convert the load/store into a permuting + load/store (which effectively removes the swap). */ + +/* Notes on Permutes + + We do not currently handle computations that contain permutes. There + is a general transformation that can be performed correctly, but it + may introduce more expensive code than it replaces. To handle these + would require a cost model to determine when to perform the optimization. + This commentary records how this could be done if desired. + + The most general permute is something like this (example for V16QI): + + (vec_select:V16QI (vec_concat:V32QI (op1:V16QI) (op2:V16QI)) + (parallel [(const_int a0) (const_int a1) + ... 
+ (const_int a14) (const_int a15)])) + + where a0,...,a15 are in [0,31] and select elements from op1 and op2 + to produce the result. + + Regardless of mode, we can convert the PARALLEL to a mask of 16 + byte-element selectors. Let's call this M, with M[i] representing + the ith byte-element selector value. Then if we swap doublewords + throughout the computation, we can get correct behavior by replacing + M with M' as follows: + + { M[i+8]+8 : i < 8, M[i+8] in [0,7] U [16,23] + M'[i] = { M[i+8]-8 : i < 8, M[i+8] in [8,15] U [24,31] + { M[i-8]+8 : i >= 8, M[i-8] in [0,7] U [16,23] + { M[i-8]-8 : i >= 8, M[i-8] in [8,15] U [24,31] + + This seems promising at first, since we are just replacing one mask + with another. But certain masks are preferable to others. If M + is a mask that matches a vmrghh pattern, for example, M' certainly + will not. Instead of a single vmrghh, we would generate a load of + M' and a vperm. So we would need to know how many xxswapd's we can + remove as a result of this transformation to determine if it's + profitable; and preferably the logic would need to be aware of all + the special preferable masks. + + Another form of permute is an UNSPEC_VPERM, in which the mask is + already in a register. In some cases, this mask may be a constant + that we can discover with ud-chains, in which case the above + transformation is ok. However, the common usage here is for the + mask to be produced by an UNSPEC_LVSL, in which case the mask + cannot be known at compile time. In such a case we would have to + generate several instructions to compute M' as above at run time, + and a cost model is needed again. */ + +/* This is based on the union-find logic in web.c. web_entry_base is + defined in df.h. */ +class swap_web_entry : public web_entry_base +{ + public: + /* Pointer to the insn. */ + rtx insn; + /* Set if insn contains a mention of a vector register. All other + fields are undefined if this field is unset. 
*/ + unsigned int is_relevant : 1; + /* Set if insn is a load. */ + unsigned int is_load : 1; + /* Set if insn is a store. */ + unsigned int is_store : 1; + /* Set if insn is a doubleword swap. This can either be a register swap + or a permuting load or store (test is_load and is_store for this). */ + unsigned int is_swap : 1; + /* Set if the insn has a live-in use of a parameter register. */ + unsigned int is_live_in : 1; + /* Set if the insn has a live-out def of a return register. */ + unsigned int is_live_out : 1; + /* Set if the insn contains a subreg reference of a vector register. */ + unsigned int contains_subreg : 1; + /* Set if the insn contains a 128-bit integer operand. */ + unsigned int is_128_int : 1; + /* Set if this is a call-insn. */ + unsigned int is_call : 1; + /* Set if this insn does not perform a vector operation for which + element order matters, or if we know how to fix it up if it does. + Undefined if is_swap is set. */ + unsigned int is_swappable : 1; + /* A nonzero value indicates what kind of special handling for this + insn is required if doublewords are swapped. Undefined if + is_swappable is not set. */ + unsigned int special_handling : 3; + /* Set if the web represented by this entry cannot be optimized. */ + unsigned int web_not_optimizable : 1; + /* Set if this insn should be deleted. */ + unsigned int will_delete : 1; +}; + +enum special_handling_values { + SH_NONE = 0, + SH_CONST_VECTOR, + SH_SUBREG, + SH_NOSWAP_LD, + SH_NOSWAP_ST, + SH_EXTRACT, + SH_SPLAT +}; + +/* Union INSN with all insns containing definitions that reach USE. + Detect whether USE is live-in to the current function. 
*/ +static void +union_defs (swap_web_entry *insn_entry, rtx insn, df_ref use) +{ + struct df_link *link = DF_REF_CHAIN (use); + + if (!link) + insn_entry[INSN_UID (insn)].is_live_in = 1; + + while (link) + { + if (DF_REF_IS_ARTIFICIAL (link->ref)) + insn_entry[INSN_UID (insn)].is_live_in = 1; + + if (DF_REF_INSN_INFO (link->ref)) + { + rtx def_insn = DF_REF_INSN (link->ref); + (void)unionfind_union (insn_entry + INSN_UID (insn), + insn_entry + INSN_UID (def_insn)); + } + + link = link->next; + } +} + +/* Union INSN with all insns containing uses reached from DEF. + Detect whether DEF is live-out from the current function. */ +static void +union_uses (swap_web_entry *insn_entry, rtx insn, df_ref def) +{ + struct df_link *link = DF_REF_CHAIN (def); + + if (!link) + insn_entry[INSN_UID (insn)].is_live_out = 1; + + while (link) + { + /* This could be an eh use or some other artificial use; + we treat these all the same (killing the optimization). */ + if (DF_REF_IS_ARTIFICIAL (link->ref)) + insn_entry[INSN_UID (insn)].is_live_out = 1; + + if (DF_REF_INSN_INFO (link->ref)) + { + rtx use_insn = DF_REF_INSN (link->ref); + (void)unionfind_union (insn_entry + INSN_UID (insn), + insn_entry + INSN_UID (use_insn)); + } + + link = link->next; + } +} + +/* Return 1 iff INSN is a load insn, including permuting loads that + represent an lvxd2x instruction; else return 0. */ +static unsigned int +insn_is_load_p (rtx insn) +{ + rtx body = PATTERN (insn); + + if (GET_CODE (body) == SET) + { + if (GET_CODE (SET_SRC (body)) == MEM) + return 1; + + if (GET_CODE (SET_SRC (body)) == VEC_SELECT + && GET_CODE (XEXP (SET_SRC (body), 0)) == MEM) + return 1; + + return 0; + } + + if (GET_CODE (body) != PARALLEL) + return 0; + + rtx set = XVECEXP (body, 0, 0); + + if (GET_CODE (set) == SET && GET_CODE (SET_SRC (set)) == MEM) + return 1; + + return 0; +} + +/* Return 1 iff INSN is a store insn, including permuting stores that + represent an stvxd2x instruction; else return 0. 
*/ +static unsigned int +insn_is_store_p (rtx insn) +{ + rtx body = PATTERN (insn); + if (GET_CODE (body) == SET && GET_CODE (SET_DEST (body)) == MEM) + return 1; + if (GET_CODE (body) != PARALLEL) + return 0; + rtx set = XVECEXP (body, 0, 0); + if (GET_CODE (set) == SET && GET_CODE (SET_DEST (set)) == MEM) + return 1; + return 0; +} + +/* Return 1 iff INSN swaps doublewords. This may be a reg-reg swap, + a permuting load, or a permuting store. */ +static unsigned int +insn_is_swap_p (rtx insn) +{ + rtx body = PATTERN (insn); + if (GET_CODE (body) != SET) + return 0; + rtx rhs = SET_SRC (body); + if (GET_CODE (rhs) != VEC_SELECT) + return 0; + rtx parallel = XEXP (rhs, 1); + if (GET_CODE (parallel) != PARALLEL) + return 0; + unsigned int len = XVECLEN (parallel, 0); + if (len != 2 && len != 4 && len != 8 && len != 16) + return 0; + for (unsigned int i = 0; i < len / 2; ++i) + { + rtx op = XVECEXP (parallel, 0, i); + if (GET_CODE (op) != CONST_INT || INTVAL (op) != len / 2 + i) + return 0; + } + for (unsigned int i = len / 2; i < len; ++i) + { + rtx op = XVECEXP (parallel, 0, i); + if (GET_CODE (op) != CONST_INT || INTVAL (op) != i - len / 2) + return 0; + } + return 1; +} + +/* Return 1 iff OP is an operand that will not be affected by having + vector doublewords swapped in memory. */ +static unsigned int +rtx_is_swappable_p (rtx op, unsigned int *special) +{ + enum rtx_code code = GET_CODE (op); + int i, j; + rtx parallel; + + switch (code) + { + case LABEL_REF: + case SYMBOL_REF: + case CLOBBER: + case REG: + return 1; + + case VEC_CONCAT: + case ASM_INPUT: + case ASM_OPERANDS: + return 0; + + case CONST_VECTOR: + { + *special = SH_CONST_VECTOR; + return 1; + } + + case VEC_DUPLICATE: + /* Opportunity: If XEXP (op, 0) has the same mode as the result, + and XEXP (op, 1) is a PARALLEL with a single QImode const int, + it represents a vector splat for which we can do special + handling. 
*/ + if (GET_CODE (XEXP (op, 0)) == CONST_INT) + return 1; + else if (GET_CODE (XEXP (op, 0)) == REG + && GET_MODE_INNER (GET_MODE (op)) == GET_MODE (XEXP (op, 0))) + /* This catches V2DF and V2DI splat, at a minimum. */ + return 1; + else if (GET_CODE (XEXP (op, 0)) == VEC_SELECT) + /* If the duplicated item is from a select, defer to the select + processing to see if we can change the lane for the splat. */ + return rtx_is_swappable_p (XEXP (op, 0), special); + else + return 0; + + case VEC_SELECT: + /* A vec_extract operation is ok if we change the lane. */ + if (GET_CODE (XEXP (op, 0)) == REG + && GET_MODE_INNER (GET_MODE (XEXP (op, 0))) == GET_MODE (op) + && GET_CODE ((parallel = XEXP (op, 1))) == PARALLEL + && XVECLEN (parallel, 0) == 1 + && GET_CODE (XVECEXP (parallel, 0, 0)) == CONST_INT) + { + *special = SH_EXTRACT; + return 1; + } + else + return 0; + + case UNSPEC: + { + /* Various operations are unsafe for this optimization, at least + without significant additional work. Permutes are obviously + problematic, as both the permute control vector and the ordering + of the target values are invalidated by doubleword swapping. + Vector pack and unpack modify the number of vector lanes. + Merge-high/low will not operate correctly on swapped operands. + Vector shifts across element boundaries are clearly uncool, + as are vector select and concatenate operations. Vector + sum-across instructions define one operand with a specific + order-dependent element, so additional fixup code would be + needed to make those work. Vector set and non-immediate-form + vector splat are element-order sensitive. A few of these + cases might be workable with special handling if required. 
*/ + int val = XINT (op, 1); + switch (val) + { + default: + break; + case UNSPEC_VMRGH_DIRECT: + case UNSPEC_VMRGL_DIRECT: + case UNSPEC_VPACK_SIGN_SIGN_SAT: + case UNSPEC_VPACK_SIGN_UNS_SAT: + case UNSPEC_VPACK_UNS_UNS_MOD: + case UNSPEC_VPACK_UNS_UNS_MOD_DIRECT: + case UNSPEC_VPACK_UNS_UNS_SAT: + case UNSPEC_VPERM: + case UNSPEC_VPERM_UNS: + case UNSPEC_VPERMHI: + case UNSPEC_VPERMSI: + case UNSPEC_VPKPX: + case UNSPEC_VSLDOI: + case UNSPEC_VSLO: + case UNSPEC_VSRO: + case UNSPEC_VSUM2SWS: + case UNSPEC_VSUM4S: + case UNSPEC_VSUM4UBS: + case UNSPEC_VSUMSWS: + case UNSPEC_VSUMSWS_DIRECT: + case UNSPEC_VSX_CONCAT: + case UNSPEC_VSX_SET: + case UNSPEC_VSX_SLDWI: + case UNSPEC_VUNPACK_HI_SIGN: + case UNSPEC_VUNPACK_HI_SIGN_DIRECT: + case UNSPEC_VUNPACK_LO_SIGN: + case UNSPEC_VUNPACK_LO_SIGN_DIRECT: + case UNSPEC_VUPKHPX: + case UNSPEC_VUPKHS_V4SF: + case UNSPEC_VUPKHU_V4SF: + case UNSPEC_VUPKLPX: + case UNSPEC_VUPKLS_V4SF: + case UNSPEC_VUPKLU_V4SF: + /* The following could be handled as an idiom with XXSPLTW. + These place a scalar in BE element zero, but the XXSPLTW + will currently expect it in BE element 2 in a swapped + region. When one of these feeds an XXSPLTW with no other + defs/uses either way, we can avoid the lane change for + XXSPLTW and things will be correct. TBD. */ + case UNSPEC_VSX_CVDPSPN: + case UNSPEC_VSX_CVSPDP: + case UNSPEC_VSX_CVSPDPN: + return 0; + case UNSPEC_VSPLT_DIRECT: + *special = SH_SPLAT; + return 1; + } + } + + default: + break; + } + + const char *fmt = GET_RTX_FORMAT (code); + int ok = 1; + + for (i = 0; i < GET_RTX_LENGTH (code); ++i) + if (fmt[i] == 'e' || fmt[i] == 'u') + { + unsigned int special_op = SH_NONE; + ok &= rtx_is_swappable_p (XEXP (op, i), &special_op); + /* Ensure we never have two kinds of special handling + for the same insn. 
*/ + if (*special != SH_NONE && special_op != SH_NONE + && *special != special_op) + return 0; + *special = special_op; + } + else if (fmt[i] == 'E') + for (j = 0; j < XVECLEN (op, i); ++j) + { + unsigned int special_op = SH_NONE; + ok &= rtx_is_swappable_p (XVECEXP (op, i, j), &special_op); + /* Ensure we never have two kinds of special handling + for the same insn. */ + if (*special != SH_NONE && special_op != SH_NONE + && *special != special_op) + return 0; + *special = special_op; + } + + return ok; +} + +/* Return 1 iff INSN is an operand that will not be affected by + having vector doublewords swapped in memory (in which case + *SPECIAL is unchanged), or that can be modified to be correct + if vector doublewords are swapped in memory (in which case + *SPECIAL is changed to a value indicating how). */ +static unsigned int +insn_is_swappable_p (swap_web_entry *insn_entry, rtx insn, + unsigned int *special) +{ + /* Calls are always bad. */ + if (GET_CODE (insn) == CALL_INSN) + return 0; + + /* Loads and stores seen here are not permuting, but we can still + fix them up by converting them to permuting ones. Exceptions: + UNSPEC_LVE, UNSPEC_LVX, and UNSPEC_STVX, which have a PARALLEL + body instead of a SET; and UNSPEC_STVE, which has an UNSPEC + for the SET source. */ + rtx body = PATTERN (insn); + int i = INSN_UID (insn); + + if (insn_entry[i].is_load) + { + if (GET_CODE (body) == SET) + { + *special = SH_NOSWAP_LD; + return 1; + } + else + return 0; + } + + if (insn_entry[i].is_store) + { + if (GET_CODE (body) == SET && GET_CODE (SET_SRC (body)) != UNSPEC) + { + *special = SH_NOSWAP_ST; + return 1; + } + else + return 0; + } + + /* Otherwise check the operands for vector lane violations. */ + return rtx_is_swappable_p (body, special); +} + +enum chain_purpose { FOR_LOADS, FOR_STORES }; + +/* Return true if the UD or DU chain headed by LINK is non-empty, + and every entry on the chain references an insn that is a + register swap. 
Furthermore, if PURPOSE is FOR_LOADS, each such + register swap must have only permuting loads as reaching defs. + If PURPOSE is FOR_STORES, each such register swap must have only + register swaps or permuting stores as reached uses. */ +static bool +chain_contains_only_swaps (swap_web_entry *insn_entry, struct df_link *link, + enum chain_purpose purpose) +{ + if (!link) + return false; + + for (; link; link = link->next) + { + if (!VECTOR_MODE_P (GET_MODE (DF_REF_REG (link->ref)))) + continue; + + if (DF_REF_IS_ARTIFICIAL (link->ref)) + return false; + + rtx reached_insn = DF_REF_INSN (link->ref); + unsigned uid = INSN_UID (reached_insn); + + if (!insn_entry[uid].is_swap || insn_entry[uid].is_load + || insn_entry[uid].is_store) + return false; + + if (purpose == FOR_LOADS) + { + df_ref *use_rec; + for (use_rec = DF_INSN_UID_USES (uid); *use_rec; use_rec++) + { + df_ref use = *use_rec; + struct df_link *swap_link = DF_REF_CHAIN (use); + + while (swap_link) + { + if (DF_REF_IS_ARTIFICIAL (link->ref)) + return false; + + rtx swap_def_insn = DF_REF_INSN (swap_link->ref); + unsigned uid2 = INSN_UID (swap_def_insn); + + /* Only permuting loads are allowed. */ + if (!insn_entry[uid2].is_swap || !insn_entry[uid2].is_load) + return false; + + swap_link = swap_link->next; + } + } + } + else if (purpose == FOR_STORES) + { + df_ref *def_rec; + for (def_rec = DF_INSN_UID_DEFS (uid); *def_rec; def_rec++) + { + df_ref def = *def_rec; + struct df_link *swap_link = DF_REF_CHAIN (def); + + while (swap_link) + { + if (DF_REF_IS_ARTIFICIAL (link->ref)) + return false; + + rtx swap_use_insn = DF_REF_INSN (swap_link->ref); + unsigned uid2 = INSN_UID (swap_use_insn); + + /* Permuting stores or register swaps are allowed. */ + if (!insn_entry[uid2].is_swap || insn_entry[uid2].is_load) + return false; + + swap_link = swap_link->next; + } + } + } + } + + return true; +} + +/* Mark the xxswapdi instructions associated with permuting loads and + stores for removal. 
Note that we only flag them for deletion here, + as there is a possibility of a swap being reached from multiple + loads, etc. */ +static void +mark_swaps_for_removal (swap_web_entry *insn_entry, unsigned int i) +{ + rtx insn = insn_entry[i].insn; + unsigned uid = INSN_UID (insn); + + if (insn_entry[i].is_load) + { + df_ref *def_rec; + for (def_rec = DF_INSN_UID_DEFS (uid); *def_rec; def_rec++) + { + df_ref def = *def_rec; + struct df_link *link = DF_REF_CHAIN (def); + + /* We know by now that these are swaps, so we can delete + them confidently. */ + while (link) + { + rtx use_insn = DF_REF_INSN (link->ref); + insn_entry[INSN_UID (use_insn)].will_delete = 1; + link = link->next; + } + } + } + else if (insn_entry[i].is_store) + { + df_ref *use_rec; + for (use_rec = DF_INSN_UID_USES (uid); *use_rec; use_rec++) + { + df_ref use = *use_rec; + /* Ignore uses for addressability. */ + machine_mode mode = GET_MODE (DF_REF_REG (use)); + if (!VECTOR_MODE_P (mode)) + continue; + + struct df_link *link = DF_REF_CHAIN (use); + + /* We know by now that these are swaps, so we can delete + them confidently. */ + while (link) + { + rtx def_insn = DF_REF_INSN (link->ref); + insn_entry[INSN_UID (def_insn)].will_delete = 1; + link = link->next; + } + } + } +} + +/* OP is either a CONST_VECTOR or an expression containing one. + Swap the first half of the vector with the second in the first + case. Recurse to find it in the second. 
*/ +static void +swap_const_vector_halves (rtx op) +{ + int i; + enum rtx_code code = GET_CODE (op); + if (GET_CODE (op) == CONST_VECTOR) + { + int half_units = GET_MODE_NUNITS (GET_MODE (op)) / 2; + for (i = 0; i < half_units; ++i) + { + rtx temp = CONST_VECTOR_ELT (op, i); + CONST_VECTOR_ELT (op, i) = CONST_VECTOR_ELT (op, i + half_units); + CONST_VECTOR_ELT (op, i + half_units) = temp; + } + } + else + { + int j; + const char *fmt = GET_RTX_FORMAT (code); + for (i = 0; i < GET_RTX_LENGTH (code); ++i) + if (fmt[i] == 'e' || fmt[i] == 'u') + swap_const_vector_halves (XEXP (op, i)); + else if (fmt[i] == 'E') + for (j = 0; j < XVECLEN (op, i); ++j) + swap_const_vector_halves (XVECEXP (op, i, j)); + } +} + +/* Find all subregs of a vector expression that perform a narrowing, + and adjust the subreg index to account for doubleword swapping. */ +static void +adjust_subreg_index (rtx op) +{ + enum rtx_code code = GET_CODE (op); + if (code == SUBREG + && (GET_MODE_SIZE (GET_MODE (op)) + < GET_MODE_SIZE (GET_MODE (XEXP (op, 0))))) + { + unsigned int index = SUBREG_BYTE (op); + if (index < 8) + index += 8; + else + index -= 8; + SUBREG_BYTE (op) = index; + } + + const char *fmt = GET_RTX_FORMAT (code); + int i,j; + for (i = 0; i < GET_RTX_LENGTH (code); ++i) + if (fmt[i] == 'e' || fmt[i] == 'u') + adjust_subreg_index (XEXP (op, i)); + else if (fmt[i] == 'E') + for (j = 0; j < XVECLEN (op, i); ++j) + adjust_subreg_index (XVECEXP (op, i, j)); +} + +/* Convert the non-permuting load INSN to a permuting one. 
*/ +static void +permute_load (rtx insn) +{ + rtx body = PATTERN (insn); + rtx mem_op = SET_SRC (body); + rtx tgt_reg = SET_DEST (body); + machine_mode mode = GET_MODE (tgt_reg); + int n_elts = GET_MODE_NUNITS (mode); + int half_elts = n_elts / 2; + rtx par = gen_rtx_PARALLEL (mode, rtvec_alloc (n_elts)); + int i, j; + for (i = 0, j = half_elts; i < half_elts; ++i, ++j) + XVECEXP (par, 0, i) = GEN_INT (j); + for (i = half_elts, j = 0; j < half_elts; ++i, ++j) + XVECEXP (par, 0, i) = GEN_INT (j); + rtx sel = gen_rtx_VEC_SELECT (mode, mem_op, par); + SET_SRC (body) = sel; + INSN_CODE (insn) = -1; /* Force re-recognition. */ + df_insn_rescan (insn); + + if (dump_file) + fprintf (dump_file, "Replacing load %d with permuted load\n", + INSN_UID (insn)); +} + +/* Convert the non-permuting store INSN to a permuting one. */ +static void +permute_store (rtx insn) +{ + rtx body = PATTERN (insn); + rtx src_reg = SET_SRC (body); + machine_mode mode = GET_MODE (src_reg); + int n_elts = GET_MODE_NUNITS (mode); + int half_elts = n_elts / 2; + rtx par = gen_rtx_PARALLEL (mode, rtvec_alloc (n_elts)); + int i, j; + for (i = 0, j = half_elts; i < half_elts; ++i, ++j) + XVECEXP (par, 0, i) = GEN_INT (j); + for (i = half_elts, j = 0; j < half_elts; ++i, ++j) + XVECEXP (par, 0, i) = GEN_INT (j); + rtx sel = gen_rtx_VEC_SELECT (mode, src_reg, par); + SET_SRC (body) = sel; + INSN_CODE (insn) = -1; /* Force re-recognition. */ + df_insn_rescan (insn); + + if (dump_file) + fprintf (dump_file, "Replacing store %d with permuted store\n", + INSN_UID (insn)); +} + +/* Given OP that contains a vector extract operation, adjust the index + of the extracted lane to account for the doubleword swap. */ +static void +adjust_extract (rtx insn) +{ + rtx src = SET_SRC (PATTERN (insn)); + /* The vec_select may be wrapped in a vec_duplicate for a splat, so + account for that. */ + rtx sel = GET_CODE (src) == VEC_DUPLICATE ? 
XEXP (src, 0) : src; + rtx par = XEXP (sel, 1); + int half_elts = GET_MODE_NUNITS (GET_MODE (XEXP (sel, 0))) >> 1; + int lane = INTVAL (XVECEXP (par, 0, 0)); + lane = lane >= half_elts ? lane - half_elts : lane + half_elts; + XVECEXP (par, 0, 0) = GEN_INT (lane); + INSN_CODE (insn) = -1; /* Force re-recognition. */ + df_insn_rescan (insn); + + if (dump_file) + fprintf (dump_file, "Changing lane for extract %d\n", INSN_UID (insn)); +} + +/* Given OP that contains a vector direct-splat operation, adjust the index + of the source lane to account for the doubleword swap. */ +static void +adjust_splat (rtx insn) +{ + rtx body = PATTERN (insn); + rtx unspec = XEXP (body, 1); + int half_elts = GET_MODE_NUNITS (GET_MODE (unspec)) >> 1; + int lane = INTVAL (XVECEXP (unspec, 0, 1)); + lane = lane >= half_elts ? lane - half_elts : lane + half_elts; + XVECEXP (unspec, 0, 1) = GEN_INT (lane); + INSN_CODE (insn) = -1; /* Force re-recognition. */ + df_insn_rescan (insn); + + if (dump_file) + fprintf (dump_file, "Changing lane for splat %d\n", INSN_UID (insn)); +} + +/* The insn described by INSN_ENTRY[I] can be swapped, but only + with special handling. Take care of that here. */ +static void +handle_special_swappables (swap_web_entry *insn_entry, unsigned i) +{ + rtx insn = insn_entry[i].insn; + rtx body = PATTERN (insn); + + switch (insn_entry[i].special_handling) + { + default: + gcc_unreachable (); + case SH_CONST_VECTOR: + { + /* A CONST_VECTOR will only show up somewhere in the RHS of a SET. */ + gcc_assert (GET_CODE (body) == SET); + rtx rhs = SET_SRC (body); + swap_const_vector_halves (rhs); + if (dump_file) + fprintf (dump_file, "Swapping constant halves in insn %d\n", i); + break; + } + case SH_SUBREG: + /* A subreg of the same size is already safe. For subregs that + select a smaller portion of a reg, adjust the index for + swapped doublewords. 
*/ + adjust_subreg_index (body); + if (dump_file) + fprintf (dump_file, "Adjusting subreg in insn %d\n", i); + break; + case SH_NOSWAP_LD: + /* Convert a non-permuting load to a permuting one. */ + permute_load (insn); + break; + case SH_NOSWAP_ST: + /* Convert a non-permuting store to a permuting one. */ + permute_store (insn); + break; + case SH_EXTRACT: + /* Change the lane on an extract operation. */ + adjust_extract (insn); + break; + case SH_SPLAT: + /* Change the lane on a direct-splat operation. */ + adjust_splat (insn); + break; + } +} + +/* Find the insn from the Ith table entry, which is known to be a + register swap Y = SWAP(X). Replace it with a copy Y = X. */ +static void +replace_swap_with_copy (swap_web_entry *insn_entry, unsigned i) +{ + rtx insn = insn_entry[i].insn; + rtx body = PATTERN (insn); + rtx src_reg = XEXP (SET_SRC (body), 0); + rtx copy = gen_rtx_SET (VOIDmode, SET_DEST (body), src_reg); + rtx new_insn = emit_insn_before (copy, insn); + set_block_for_insn (new_insn, BLOCK_FOR_INSN (insn)); + df_insn_rescan (new_insn); + + if (dump_file) + { + unsigned int new_uid = INSN_UID (new_insn); + fprintf (dump_file, "Replacing swap %d with copy %d\n", i, new_uid); + } + + df_insn_delete (BLOCK_FOR_INSN (insn), INSN_UID (insn)); + remove_insn (insn); + INSN_DELETED_P (insn) = 1; +} + +/* Dump the swap table to DUMP_FILE. */ +static void +dump_swap_insn_table (swap_web_entry *insn_entry) +{ + int e = get_max_uid (); + fprintf (dump_file, "\nRelevant insns with their flag settings\n\n"); + + for (int i = 0; i < e; ++i) + if (insn_entry[i].is_relevant) + { + swap_web_entry *pred_entry = (swap_web_entry *)insn_entry[i].pred (); + fprintf (dump_file, "%6d %6d ", i, + pred_entry && pred_entry->insn + ? 
INSN_UID (pred_entry->insn) : 0); + if (insn_entry[i].is_load) + fputs ("load ", dump_file); + if (insn_entry[i].is_store) + fputs ("store ", dump_file); + if (insn_entry[i].is_swap) + fputs ("swap ", dump_file); + if (insn_entry[i].is_live_in) + fputs ("live-in ", dump_file); + if (insn_entry[i].is_live_out) + fputs ("live-out ", dump_file); + if (insn_entry[i].contains_subreg) + fputs ("subreg ", dump_file); + if (insn_entry[i].is_128_int) + fputs ("int128 ", dump_file); + if (insn_entry[i].is_call) + fputs ("call ", dump_file); + if (insn_entry[i].is_swappable) + { + fputs ("swappable ", dump_file); + if (insn_entry[i].special_handling == SH_CONST_VECTOR) + fputs ("special:constvec ", dump_file); + else if (insn_entry[i].special_handling == SH_SUBREG) + fputs ("special:subreg ", dump_file); + else if (insn_entry[i].special_handling == SH_NOSWAP_LD) + fputs ("special:load ", dump_file); + else if (insn_entry[i].special_handling == SH_NOSWAP_ST) + fputs ("special:store ", dump_file); + else if (insn_entry[i].special_handling == SH_EXTRACT) + fputs ("special:extract ", dump_file); + else if (insn_entry[i].special_handling == SH_SPLAT) + fputs ("special:splat ", dump_file); + } + if (insn_entry[i].web_not_optimizable) + fputs ("unoptimizable ", dump_file); + if (insn_entry[i].will_delete) + fputs ("delete ", dump_file); + fputs ("\n", dump_file); + } + fputs ("\n", dump_file); +} + +/* Main entry point for this pass. */ +unsigned int +rs6000_analyze_swaps (function *fun) +{ + swap_web_entry *insn_entry; + basic_block bb; + rtx insn; + + /* Dataflow analysis for use-def chains. */ + df_set_flags (DF_RD_PRUNE_DEAD_DEFS); + df_chain_add_problem (DF_DU_CHAIN | DF_UD_CHAIN); + df_analyze (); + df_set_flags (DF_DEFER_INSN_RESCAN); + + /* Allocate structure to represent webs of insns. */ + insn_entry = XCNEWVEC (swap_web_entry, get_max_uid ()); + + /* Walk the insns to gather basic data. 
*/ + FOR_ALL_BB_FN (bb, fun) + FOR_BB_INSNS (bb, insn) + { + unsigned int uid = INSN_UID (insn); + if (NONDEBUG_INSN_P (insn)) + { + insn_entry[uid].insn = insn; + + if (GET_CODE (insn) == CALL_INSN) + insn_entry[uid].is_call = 1; + + /* Walk the uses and defs to see if we mention vector regs. + Record any constraints on optimization of such mentions. */ + df_ref *use_rec; + for (use_rec = DF_INSN_UID_USES (uid); *use_rec; use_rec++) + { + df_ref mention = *use_rec; + /* We use DF_REF_REAL_REG here to get inside any subregs. */ + machine_mode mode = GET_MODE (DF_REF_REAL_REG (mention)); + + /* If a use gets its value from a call insn, it will be + a hard register and will look like (reg:V4SI 3 3). + The df analysis creates two mentions for GPR3 and GPR4, + both DImode. We must recognize this and treat it as a + vector mention to ensure the call is unioned with this + use. */ + if (mode == DImode && DF_REF_INSN_INFO (mention)) + { + rtx feeder = DF_REF_INSN (mention); + /* FIXME: It is pretty hard to get from the df mention + to the mode of the use in the insn. We arbitrarily + pick a vector mode here, even though the use might + be a real DImode. We can be too conservative + (create a web larger than necessary) because of + this, so consider eventually fixing this. */ + if (GET_CODE (feeder) == CALL_INSN) + mode = V4SImode; + } + + if (VECTOR_MODE_P (mode) || mode == TImode) + { + insn_entry[uid].is_relevant = 1; + if (mode == TImode || mode == V1TImode) + insn_entry[uid].is_128_int = 1; + if (DF_REF_INSN_INFO (mention)) + insn_entry[uid].contains_subreg + = !rtx_equal_p (DF_REF_REG (mention), + DF_REF_REAL_REG (mention)); + union_defs (insn_entry, insn, mention); + } + } + df_ref *def_rec; + for (def_rec = DF_INSN_UID_DEFS (uid); *def_rec; def_rec++) + { + df_ref mention = *def_rec; + /* We use DF_REF_REAL_REG here to get inside any subregs. 
*/ + machine_mode mode = GET_MODE (DF_REF_REAL_REG (mention)); + + /* If we're loading up a hard vector register for a call, + it looks like (set (reg:V4SI 9 9) (...)). The df + analysis creates two mentions for GPR9 and GPR10, both + DImode. So relying on the mode from the mentions + isn't sufficient to ensure we union the call into the + web with the parameter setup code. */ + if (mode == DImode && GET_CODE (insn) == SET + && VECTOR_MODE_P (GET_MODE (SET_DEST (insn)))) + mode = GET_MODE (SET_DEST (insn)); + + if (VECTOR_MODE_P (mode) || mode == TImode) + { + insn_entry[uid].is_relevant = 1; + if (mode == TImode || mode == V1TImode) + insn_entry[uid].is_128_int = 1; + if (DF_REF_INSN_INFO (mention)) + insn_entry[uid].contains_subreg + = !rtx_equal_p (DF_REF_REG (mention), + DF_REF_REAL_REG (mention)); + /* REG_FUNCTION_VALUE_P is not valid for subregs. */ + else if (REG_FUNCTION_VALUE_P (DF_REF_REG (mention))) + insn_entry[uid].is_live_out = 1; + union_uses (insn_entry, insn, mention); + } + } + + if (insn_entry[uid].is_relevant) + { + /* Determine if this is a load or store. */ + insn_entry[uid].is_load = insn_is_load_p (insn); + insn_entry[uid].is_store = insn_is_store_p (insn); + + /* Determine if this is a doubleword swap. If not, + determine whether it can legally be swapped. */ + if (insn_is_swap_p (insn)) + insn_entry[uid].is_swap = 1; + else + { + unsigned int special = SH_NONE; + insn_entry[uid].is_swappable + = insn_is_swappable_p (insn_entry, insn, &special); + if (special != SH_NONE && insn_entry[uid].contains_subreg) + insn_entry[uid].is_swappable = 0; + else if (special != SH_NONE) + insn_entry[uid].special_handling = special; + else if (insn_entry[uid].contains_subreg) + insn_entry[uid].special_handling = SH_SUBREG; + } + } + } + } + + if (dump_file) + { + fprintf (dump_file, "\nSwap insn entry table when first built\n"); + dump_swap_insn_table (insn_entry); + } + + /* Record unoptimizable webs. 
*/ + unsigned e = get_max_uid (), i; + for (i = 0; i < e; ++i) + { + if (!insn_entry[i].is_relevant) + continue; + + swap_web_entry *root + = (swap_web_entry*)(&insn_entry[i])->unionfind_root (); + unsigned uid = INSN_UID (insn_entry[i].insn); + + if (insn_entry[i].is_live_in || insn_entry[i].is_live_out + || (insn_entry[i].contains_subreg + && insn_entry[i].special_handling != SH_SUBREG) + || insn_entry[i].is_128_int || insn_entry[i].is_call + || !(insn_entry[i].is_swappable || insn_entry[i].is_swap)) + root->web_not_optimizable = 1; + + /* If we have loads or stores that aren't permuting then the + optimization isn't appropriate. */ + else if ((insn_entry[i].is_load || insn_entry[i].is_store) + && !insn_entry[i].is_swap && !insn_entry[i].is_swappable) + root->web_not_optimizable = 1; + + /* If we have permuting loads or stores that are not accompanied + by a register swap, the optimization isn't appropriate. */ + else if (insn_entry[i].is_load && insn_entry[i].is_swap) + { + df_ref *def_rec; + + for (def_rec = DF_INSN_UID_DEFS (uid); *def_rec; def_rec++) + { + df_ref def = *def_rec; + struct df_link *link = DF_REF_CHAIN (def); + + if (!chain_contains_only_swaps (insn_entry, link, FOR_LOADS)) + { + root->web_not_optimizable = 1; + break; + } + } + } + else if (insn_entry[i].is_store && insn_entry[i].is_swap) + { + df_ref *use_rec; + + for (use_rec = DF_INSN_UID_USES (uid); *use_rec; use_rec++) + { + df_ref use = *use_rec; + struct df_link *link = DF_REF_CHAIN (use); + + if (!chain_contains_only_swaps (insn_entry, link, FOR_STORES)) + { + root->web_not_optimizable = 1; + break; + } + } + } + } + + if (dump_file) + { + fprintf (dump_file, "\nSwap insn entry table after web analysis\n"); + dump_swap_insn_table (insn_entry); + } + + /* For each load and store in an optimizable web (which implies + the loads and stores are permuting), find the associated + register swaps and mark them for removal. Due to various + optimizations we may mark the same swap more than once. 
Also + perform special handling for swappable insns that require it. */ + for (i = 0; i < e; ++i) + if ((insn_entry[i].is_load || insn_entry[i].is_store) + && insn_entry[i].is_swap) + { + swap_web_entry* root_entry + = (swap_web_entry*)((&insn_entry[i])->unionfind_root ()); + if (!root_entry->web_not_optimizable) + mark_swaps_for_removal (insn_entry, i); + } + else if (insn_entry[i].is_swappable && insn_entry[i].special_handling) + { + swap_web_entry* root_entry + = (swap_web_entry*)((&insn_entry[i])->unionfind_root ()); + if (!root_entry->web_not_optimizable) + handle_special_swappables (insn_entry, i); + } + + /* Now delete the swaps marked for removal. */ + for (i = 0; i < e; ++i) + if (insn_entry[i].will_delete) + replace_swap_with_copy (insn_entry, i); + + /* Clean up. */ + free (insn_entry); + return 0; +} + struct gcc_target targetm = TARGET_INITIALIZER; Index: gcc/config/rs6000/rs6000.opt =================================================================== --- gcc/config/rs6000/rs6000.opt (revision 221696) +++ gcc/config/rs6000/rs6000.opt (working copy) @@ -585,3 +585,7 @@ Allow double variables in upper registers with -mc mupper-regs-sf Target Undocumented Mask(UPPER_REGS_SF) Var(rs6000_isa_flags) Allow float variables in upper registers with -mcpu=power8 or -mp8-vector + +moptimize-swaps +Target Undocumented Var(rs6000_optimize_swaps) Init(1) Save +Analyze and remove doubleword swaps from VSX computations. Index: gcc/df.h =================================================================== --- gcc/df.h (revision 221696) +++ gcc/df.h (working copy) @@ -1132,20 +1132,22 @@ df_get_artificial_uses (unsigned int bb_index) /* web */ -/* This entry is allocated for each reference in the insn stream. */ -struct web_entry +class web_entry_base { - /* Pointer to the parent in the union/find tree. */ - struct web_entry *pred; - /* Newly assigned register to the entry. Set only for roots. 
*/ - rtx reg; - void* extra_info; + private: + /* Reference to the parent in the union/find tree. */ + web_entry_base *pred_pvt; + + public: + /* Accessors. */ + web_entry_base *pred () { return pred_pvt; } + void set_pred (web_entry_base *p) { pred_pvt = p; } + + /* Find representative in union-find tree. */ + web_entry_base *unionfind_root (); + + /* Union with another set, returning TRUE if they are already unioned. */ + friend bool unionfind_union (web_entry_base *first, web_entry_base *second); }; -extern struct web_entry *unionfind_root (struct web_entry *); -extern bool unionfind_union (struct web_entry *, struct web_entry *); -extern void union_defs (df_ref, struct web_entry *, - unsigned int *used, struct web_entry *, - bool (*fun) (struct web_entry *, struct web_entry *)); - #endif /* GCC_DF_H */ Index: gcc/testsuite/gcc.target/powerpc/swaps-p8-1.c =================================================================== --- gcc/testsuite/gcc.target/powerpc/swaps-p8-1.c (revision 0) +++ gcc/testsuite/gcc.target/powerpc/swaps-p8-1.c (working copy) @@ -0,0 +1,35 @@ +/* { dg-do compile { target { powerpc64le-*-* } } } */ +/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power8" } } */ +/* { dg-options "-mcpu=power8 -O3" } */ +/* { dg-final { scan-assembler "lxvd2x" } } */ +/* { dg-final { scan-assembler "stxvd2x" } } */ +/* { dg-final { scan-assembler-not "xxpermdi" } } */ + +void abort(); + +#define N 16 + +signed char ca[N] __attribute__((aligned(16))); +signed char cb[] __attribute__((aligned(16))) + = {8, 7, 6, 5, 4, 3, 2, 1, 0, -1, -2, -3, -4, -5, -6, -7}; +signed char cc[] __attribute__((aligned(16))) + = {1, 1, 2, 2, 3, 3, 2, 2, 1, 1, 0, 0, -1, -1, -2, -2}; + +__attribute__((noinline)) void foo () +{ + int i; + for (i = 0; i < N; i++) { + ca[i] = cb[i] - cc[i]; + } +} + +int main () +{ + signed char cd[] = {7, 6, 4, 3, 1, 0, 0, -1, -1, -2, -2, -3, -3, -4, -4, -5}; + int i; + foo (); + for (i = 0; i < N; ++i) + if (ca[i] != 
cd[i]) + abort (); + return 0; +} Index: gcc/testsuite/gcc.target/powerpc/swaps-p8-10.c =================================================================== --- gcc/testsuite/gcc.target/powerpc/swaps-p8-10.c (revision 0) +++ gcc/testsuite/gcc.target/powerpc/swaps-p8-10.c (working copy) @@ -0,0 +1,42 @@ +/* { dg-do run { target { powerpc64le-*-* } } } */ +/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power8" } } */ +/* { dg-options "-mcpu=power8 -O3" } */ + +void abort (); + +#define N 4096 +int ca[N] __attribute__((aligned(16))); +int cb[N] __attribute__((aligned(16))); +int cc[N] __attribute__((aligned(16))); +int cd[N] __attribute__((aligned(16))); + +__attribute__((noinline)) void foo () +{ + int i; + for (i = 0; i < N; i++) { + ca[i] = ((cb[i] + cc[i]) * cd[i]) >> 3; + } +} + +__attribute__((noinline)) void init () +{ + int i; + for (i = 0; i < N; ++i) { + cb[i] = 3 * i - 2048; + cc[i] = -5 * i + 93; + cd[i] = i % 2 ? 1 : -1; + } +} + +int main () +{ + int i; + init (); + foo (); + for (i = 0; i < N; ++i) + if (i % 2 == 1 && ca[i] != (-2 * i - 1955) >> 3) + abort (); + else if (i % 2 == 0 && ca[i] != (1955 + 2 * i) >> 3) + abort (); + return 0; +} Index: gcc/testsuite/gcc.target/powerpc/swaps-p8-11.c =================================================================== --- gcc/testsuite/gcc.target/powerpc/swaps-p8-11.c (revision 0) +++ gcc/testsuite/gcc.target/powerpc/swaps-p8-11.c (working copy) @@ -0,0 +1,53 @@ +/* { dg-do run { target { powerpc64le-*-* } } } */ +/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power8" } } */ +/* { dg-options "-mcpu=power8 -O3" } */ + +#include <altivec.h> +void abort (); + +#define N 4096 +int ca[N] __attribute__((aligned(16))); +int cb[N] __attribute__((aligned(16))); +int cc[N] __attribute__((aligned(16))); +int cd[N] __attribute__((aligned(16))); +int hey; + +__attribute__((noinline)) void foo () +{ + int i; + vector int va, vb, vc, vd, tmp; + vector unsigned 
int threes = vec_splat_u32(3); + for (i = 0; i < N; i+=4) { + vb = vec_vsx_ld (0, &cb[i]); + vc = vec_vsx_ld (0, &cc[i]); + vd = vec_vsx_ld (0, &cd[i]); + tmp = vec_add (vb, vc); + tmp = vec_sub (tmp, vd); + tmp = vec_sra (tmp, threes); + hey = tmp[3]; + vec_vsx_st (tmp, 0, &ca[i]); + } +} + +__attribute__((noinline)) void init () +{ + int i; + for (i = 0; i < N; ++i) { + cb[i] = 3 * i - 2048; + cc[i] = -5 * i + 93; + cd[i] = i + 14; + } +} + +int main () +{ + int i; + init (); + foo (); + for (i = 0; i < N; ++i) + if (ca[i] != (-3 * i - 1969) >> 3) + abort (); + if (hey != ca[N-1]) + abort (); + return 0; +} Index: gcc/testsuite/gcc.target/powerpc/swaps-p8-12.c =================================================================== --- gcc/testsuite/gcc.target/powerpc/swaps-p8-12.c (revision 0) +++ gcc/testsuite/gcc.target/powerpc/swaps-p8-12.c (working copy) @@ -0,0 +1,56 @@ +/* { dg-do compile { target { powerpc64le-*-* } } } */ +/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power8" } } */ +/* { dg-options "-mcpu=power8 -O3" } */ +/* { dg-final { scan-assembler "lxvd2x" } } */ +/* { dg-final { scan-assembler "stxvd2x" } } */ +/* { dg-final { scan-assembler-not "xxpermdi" } } */ + +#include "altivec.h" +void abort (); + +#define N 4096 +int ca[N] __attribute__((aligned(16))); +int cb[N] __attribute__((aligned(16))); +int cc[N] __attribute__((aligned(16))); +int cd[N] __attribute__((aligned(16))); +int hey; + +__attribute__((noinline)) void foo () +{ + int i; + vector int va, vb, vc, vd, tmp; + vector unsigned int threes = vec_splat_u32(3); + for (i = 0; i < N; i+=4) { + vb = vec_vsx_ld (0, &cb[i]); + vc = vec_vsx_ld (0, &cc[i]); + vd = vec_vsx_ld (0, &cd[i]); + tmp = vec_add (vb, vc); + tmp = vec_sub (tmp, vd); + tmp = vec_sra (tmp, threes); + hey = tmp[3]; + vec_vsx_st (tmp, 0, &ca[i]); + } +} + +__attribute__((noinline)) void init () +{ + int i; + for (i = 0; i < N; ++i) { + cb[i] = 3 * i - 2048; + cc[i] = -5 * i + 93; + cd[i] = i 
+ 14; + } +} + +int main () +{ + int i; + init (); + foo (); + for (i = 0; i < N; ++i) + if (ca[i] != (-3 * i - 1969) >> 3) + abort (); + if (hey != ca[N-1]) + abort (); + return 0; +} Index: gcc/testsuite/gcc.target/powerpc/swaps-p8-13.c =================================================================== --- gcc/testsuite/gcc.target/powerpc/swaps-p8-13.c (revision 0) +++ gcc/testsuite/gcc.target/powerpc/swaps-p8-13.c (working copy) @@ -0,0 +1,54 @@ +/* { dg-do run { target { powerpc64le-*-* } } } */ +/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power8" } } */ +/* { dg-options "-mcpu=power8 -O3" } */ + +#include <altivec.h> +void abort (); + +#define N 4096 +long long ca[N] __attribute__((aligned(16))); +long long cb[N] __attribute__((aligned(16))); +long long cc[N] __attribute__((aligned(16))); +long long cd[N] __attribute__((aligned(16))); +long long x; + +__attribute__((noinline)) void foo () +{ + int i; + vector long long va, vb, vc, vd, tmp; + volatile unsigned long long three = 3; + vector unsigned long long threes = vec_splats (three); + for (i = 0; i < N; i+=2) { + vb = vec_vsx_ld (0, (vector long long *)&cb[i]); + vc = vec_vsx_ld (0, (vector long long *)&cc[i]); + vd = vec_vsx_ld (0, (vector long long *)&cd[i]); + tmp = vec_add (vb, vc); + tmp = vec_sub (tmp, vd); + tmp = vec_sra (tmp, threes); + x = vec_extract (tmp, 0); + vec_vsx_st (tmp, 0, (vector long long *)&ca[i]); + } +} + +__attribute__((noinline)) void init () +{ + int i; + for (i = 0; i < N; ++i) { + cb[i] = 3 * i - 2048; + cc[i] = -5 * i + 93; + cd[i] = i + 14; + } +} + +int main () +{ + int i; + init (); + foo (); + for (i = 0; i < N; ++i) + if (ca[i] != (-3 * i - 1969) >> 3) + abort (); + if (x != ca[N-1]) + abort (); + return 0; +} Index: gcc/testsuite/gcc.target/powerpc/swaps-p8-15.c =================================================================== --- gcc/testsuite/gcc.target/powerpc/swaps-p8-15.c (revision 0) +++ 
gcc/testsuite/gcc.target/powerpc/swaps-p8-15.c (working copy) @@ -0,0 +1,51 @@ +/* { dg-do compile { target { powerpc64le-*-* } } } */ +/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power8" } } */ +/* { dg-options "-mcpu=power8 -O3" } */ +/* { dg-final { scan-assembler "lxvd2x" } } */ +/* { dg-final { scan-assembler "stxvd2x" } } */ +/* { dg-final { scan-assembler "xxspltw" } } */ + +/* Currently the analyze_swaps phase cannot optimize this loop because + of the presence of an UNSPEC_VSX_CVDPSPN. At such time as this is + handled, we need to add a 'scan-assembler-not "xxpermdi"' directive to + this test. */ +#include <altivec.h> +void abort(); + +#define N 4096 +#define M 10000000 +vector float ca[N][4] = {0}; +vector float cb[N][4] = {0}; +vector float cc[N][4] = {0}; + +__attribute__((noinline)) void foo () +{ + int i; + for (i = 0; i < N; i++) { + cc[i][0] = vec_mul(vec_splats(cb[i][0][0]), ca[i][0]); + cc[i][0] = vec_madd(cc[i][0],vec_splats(cb[i][0][1]), ca[i][1]); + cc[i][0] = vec_madd(cc[i][0],vec_splats(cb[i][0][2]), ca[i][2]); + cc[i][0] = vec_madd(cc[i][0],vec_splats(cb[i][0][3]), ca[i][3]); + + cc[i][1] = vec_mul(vec_splats(cb[i][1][0]), ca[i][0]); + cc[i][1] = vec_madd(cc[i][0],vec_splats(cb[i][1][1]), ca[i][1]); + cc[i][1] = vec_madd(cc[i][0],vec_splats(cb[i][1][2]), ca[i][2]); + cc[i][1] = vec_madd(cc[i][0],vec_splats(cb[i][1][3]), ca[i][3]); + + cc[i][2] = vec_mul(vec_splats(cb[i][2][0]), ca[i][0]); + cc[i][2] = vec_madd(cc[i][0],vec_splats(cb[i][2][1]), ca[i][1]); + cc[i][2] = vec_madd(cc[i][0],vec_splats(cb[i][2][2]), ca[i][2]); + cc[i][2] = vec_madd(cc[i][0],vec_splats(cb[i][2][3]), ca[i][3]); + + cc[i][3] = vec_mul(vec_splats(cb[i][3][0]), ca[i][0]); + cc[i][3] = vec_madd(cc[i][0],vec_splats(cb[i][3][1]), ca[i][1]); + cc[i][3] = vec_madd(cc[i][0],vec_splats(cb[i][3][2]), ca[i][2]); + cc[i][3] = vec_madd(cc[i][0],vec_splats(cb[i][3][3]), ca[i][3]); + } +} + +int main () +{ + foo (); + return 0; +} Index: 
gcc/testsuite/gcc.target/powerpc/swaps-p8-17.c =================================================================== --- gcc/testsuite/gcc.target/powerpc/swaps-p8-17.c (revision 0) +++ gcc/testsuite/gcc.target/powerpc/swaps-p8-17.c (working copy) @@ -0,0 +1,15 @@ +/* { dg-do compile { target { powerpc64le-*-* } } } */ +/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power8" } } */ +/* { dg-options "-mcpu=power8 -O1" } */ +/* { dg-final { scan-assembler "lxvd2x" } } */ +/* { dg-final { scan-assembler "xxpermdi" } } */ + +/* Verify that we don't try to do permute removal in the presence of + vec_ste. This used to ICE. */ +#include <altivec.h> + +void f (void *p) +{ + vector unsigned int u32 = vec_vsx_ld (1, (const unsigned int *)p); + vec_ste (u32, 1, (unsigned int *)p); +} Index: gcc/testsuite/gcc.target/powerpc/swaps-p8-3.c =================================================================== --- gcc/testsuite/gcc.target/powerpc/swaps-p8-3.c (revision 0) +++ gcc/testsuite/gcc.target/powerpc/swaps-p8-3.c (working copy) @@ -0,0 +1,43 @@ +/* { dg-do compile { target { powerpc64le-*-* } } } */ +/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power8" } } */ +/* { dg-options "-mcpu=power8 -O3" } */ +/* { dg-final { scan-assembler "lxvd2x" } } */ +/* { dg-final { scan-assembler "stxvd2x" } } */ +/* { dg-final { scan-assembler-not "xxpermdi" } } */ + +void abort (); + +#define N 4096 +signed char ca[N] __attribute__((aligned(16))); +signed char cb[N] __attribute__((aligned(16))); +signed char cc[N] __attribute__((aligned(16))); + +__attribute__((noinline)) void foo () +{ + int i; + for (i = 0; i < N; i++) { + ca[i] = cb[i] - cc[i]; + } +} + +__attribute__((noinline)) void init () +{ + int i, ii; + for (i = 0, ii = 0; i < N; ++i, ii = (ii + 1) % 128) { + cb[i] = ii - 128; + cc[i] = ii/2 - 64; + } +} + +int main () +{ + int i, ii; + init (); + foo (); + for (i = 0; i < N; ++i) { + ii = i % 128; + if (ca[i] != ii - 
ii/2 - 64) + abort (); + } + return 0; +} Index: gcc/testsuite/gcc.target/powerpc/swaps-p8-4.c =================================================================== --- gcc/testsuite/gcc.target/powerpc/swaps-p8-4.c (revision 0) +++ gcc/testsuite/gcc.target/powerpc/swaps-p8-4.c (working copy) @@ -0,0 +1,45 @@ +/* { dg-do compile { target { powerpc64le-*-* } } } */ +/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power8" } } */ +/* { dg-options "-mcpu=power8 -O3" } */ +/* { dg-final { scan-assembler "lxvd2x" } } */ +/* { dg-final { scan-assembler "stxvd2x" } } */ +/* { dg-final { scan-assembler-not "xxpermdi" } } */ + +void abort (); + +#define N 4096 +int ca[N] __attribute__((aligned(16))); +int cb[N] __attribute__((aligned(16))); +int cc[N] __attribute__((aligned(16))); +int cd[N] __attribute__((aligned(16))); + +__attribute__((noinline)) void foo () +{ + int i; + for (i = 0; i < N; i++) { + ca[i] = (cb[i] + cc[i]) * cd[i]; + } +} + +__attribute__((noinline)) void init () +{ + int i; + for (i = 0; i < N; ++i) { + cb[i] = 3 * i - 2048; + cc[i] = -5 * i + 93; + cd[i] = i % 2 ? 
1 : -1; + } +} + +int main () +{ + int i; + init (); + foo (); + for (i = 0; i < N; ++i) + if (i % 2 == 1 && ca[i] != -2 * i - 1955) + abort (); + else if (i % 2 == 0 && ca[i] != 1955 + 2 * i) + abort (); + return 0; +} Index: gcc/testsuite/gcc.target/powerpc/swaps-p8-5.c =================================================================== --- gcc/testsuite/gcc.target/powerpc/swaps-p8-5.c (revision 0) +++ gcc/testsuite/gcc.target/powerpc/swaps-p8-5.c (working copy) @@ -0,0 +1,45 @@ +/* { dg-do compile { target { powerpc64le-*-* } } } */ +/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power8" } } */ +/* { dg-options "-mcpu=power8 -O3" } */ +/* { dg-final { scan-assembler "lxvd2x" } } */ +/* { dg-final { scan-assembler "stxvd2x" } } */ +/* { dg-final { scan-assembler-not "xxpermdi" } } */ + +void abort (); + +#define N 4096 +int ca[N] __attribute__((aligned(16))); +int cb[N] __attribute__((aligned(16))); +int cc[N] __attribute__((aligned(16))); +int cd[N] __attribute__((aligned(16))); + +__attribute__((noinline)) void foo () +{ + int i; + for (i = 0; i < N; i++) { + ca[i] = ((cb[i] + cc[i]) * cd[i]) >> 3; + } +} + +__attribute__((noinline)) void init () +{ + int i; + for (i = 0; i < N; ++i) { + cb[i] = 3 * i - 2048; + cc[i] = -5 * i + 93; + cd[i] = i % 2 ? 
1 : -1; + } +} + +int main () +{ + int i; + init (); + foo (); + for (i = 0; i < N; ++i) + if (i % 2 == 1 && ca[i] != (-2 * i - 1955) >> 3) + abort (); + else if (i % 2 == 0 && ca[i] != (1955 + 2 * i) >> 3) + abort (); + return 0; +} Index: gcc/testsuite/gcc.target/powerpc/swaps-p8-6.c =================================================================== --- gcc/testsuite/gcc.target/powerpc/swaps-p8-6.c (revision 0) +++ gcc/testsuite/gcc.target/powerpc/swaps-p8-6.c (working copy) @@ -0,0 +1,32 @@ +/* { dg-do run { target { powerpc64le-*-* } } } */ +/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power8" } } */ +/* { dg-options "-mcpu=power8 -O3" } */ + +void abort(); + +#define N 16 + +signed char ca[N] __attribute__((aligned(16))); +signed char cb[] __attribute__((aligned(16))) + = {8, 7, 6, 5, 4, 3, 2, 1, 0, -1, -2, -3, -4, -5, -6, -7}; +signed char cc[] __attribute__((aligned(16))) + = {1, 1, 2, 2, 3, 3, 2, 2, 1, 1, 0, 0, -1, -1, -2, -2}; + +__attribute__((noinline)) void foo () +{ + int i; + for (i = 0; i < N; i++) { + ca[i] = cb[i] - cc[i]; + } +} + +int main () +{ + signed char cd[] = {7, 6, 4, 3, 1, 0, 0, -1, -1, -2, -2, -3, -3, -4, -4, -5}; + int i; + foo (); + for (i = 0; i < N; ++i) + if (ca[i] != cd[i]) + abort (); + return 0; +} Index: gcc/testsuite/gcc.target/powerpc/swaps-p8-7.c =================================================================== --- gcc/testsuite/gcc.target/powerpc/swaps-p8-7.c (revision 0) +++ gcc/testsuite/gcc.target/powerpc/swaps-p8-7.c (working copy) @@ -0,0 +1,38 @@ +/* { dg-do run { target { powerpc64le-*-* } } } */ +/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power8" } } */ +/* { dg-options "-mcpu=power8 -O3" } */ + +void abort (); + +#define N 256 +signed char ca[N] __attribute__((aligned(16))); +signed char cb[N] __attribute__((aligned(16))); +signed char cc[N] __attribute__((aligned(16))); + +__attribute__((noinline)) void foo () +{ + int i; + for (i = 0; i < 
N; i++) { + ca[i] = cb[i] - cc[i]; + } +} + +__attribute__((noinline)) void init () +{ + int i; + for (i = 0; i < N; ++i) { + cb[i] = i - 128; + cc[i] = i/2 - 64; + } +} + +int main () +{ + int i; + init (); + foo (); + for (i = 0; i < N; ++i) + if (ca[i] != i - i/2 - 64) + abort (); + return 0; +} Index: gcc/testsuite/gcc.target/powerpc/swaps-p8-8.c =================================================================== --- gcc/testsuite/gcc.target/powerpc/swaps-p8-8.c (revision 0) +++ gcc/testsuite/gcc.target/powerpc/swaps-p8-8.c (working copy) @@ -0,0 +1,40 @@ +/* { dg-do run { target { powerpc64le-*-* } } } */ +/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power8" } } */ +/* { dg-options "-mcpu=power8 -O3" } */ + +void abort (); + +#define N 4096 +signed char ca[N] __attribute__((aligned(16))); +signed char cb[N] __attribute__((aligned(16))); +signed char cc[N] __attribute__((aligned(16))); + +__attribute__((noinline)) void foo () +{ + int i; + for (i = 0; i < N; i++) { + ca[i] = cb[i] - cc[i]; + } +} + +__attribute__((noinline)) void init () +{ + int i, ii; + for (i = 0, ii = 0; i < N; ++i, ii = (ii + 1) % 128) { + cb[i] = ii - 128; + cc[i] = ii/2 - 64; + } +} + +int main () +{ + int i, ii; + init (); + foo (); + for (i = 0; i < N; ++i) { + ii = i % 128; + if (ca[i] != ii - ii/2 - 64) + abort (); + } + return 0; +} Index: gcc/testsuite/gcc.target/powerpc/swaps-p8-9.c =================================================================== --- gcc/testsuite/gcc.target/powerpc/swaps-p8-9.c (revision 0) +++ gcc/testsuite/gcc.target/powerpc/swaps-p8-9.c (working copy) @@ -0,0 +1,42 @@ +/* { dg-do run { target { powerpc64le-*-* } } } */ +/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power8" } } */ +/* { dg-options "-mcpu=power8 -O3" } */ + +void abort (); + +#define N 4096 +int ca[N] __attribute__((aligned(16))); +int cb[N] __attribute__((aligned(16))); +int cc[N] __attribute__((aligned(16))); +int cd[N] 
__attribute__((aligned(16))); + +__attribute__((noinline)) void foo () +{ + int i; + for (i = 0; i < N; i++) { + ca[i] = (cb[i] + cc[i]) * cd[i]; + } +} + +__attribute__((noinline)) void init () +{ + int i; + for (i = 0; i < N; ++i) { + cb[i] = 3 * i - 2048; + cc[i] = -5 * i + 93; + cd[i] = i % 2 ? 1 : -1; + } +} + +int main () +{ + int i; + init (); + foo (); + for (i = 0; i < N; ++i) + if (i % 2 == 1 && ca[i] != -2 * i - 1955) + abort (); + else if (i % 2 == 0 && ca[i] != 1955 + 2 * i) + abort (); + return 0; +} Index: gcc/web.c =================================================================== --- gcc/web.c (revision 221696) +++ gcc/web.c (working copy) @@ -53,17 +53,17 @@ along with GCC; see the file COPYING3. If not see /* Find the root of unionfind tree (the representative of set). */ -struct web_entry * -unionfind_root (struct web_entry *element) +web_entry_base * +web_entry_base::unionfind_root () { - struct web_entry *element1 = element, *element2; + web_entry_base *element = this, *element1 = this, *element2; - while (element->pred) - element = element->pred; - while (element1->pred) + while (element->pred ()) + element = element->pred (); + while (element1->pred ()) { - element2 = element1->pred; - element1->pred = element; + element2 = element1->pred (); + element1->set_pred (element); element1 = element2; } return element; @@ -74,23 +74,32 @@ along with GCC; see the file COPYING3. If not see nothing is done. Otherwise, return false. 
*/ bool -unionfind_union (struct web_entry *first, struct web_entry *second) +unionfind_union (web_entry_base *first, web_entry_base *second) { - first = unionfind_root (first); - second = unionfind_root (second); + first = first->unionfind_root (); + second = second->unionfind_root (); if (first == second) return true; - second->pred = first; + second->set_pred (first); return false; } +class web_entry : public web_entry_base +{ + private: + rtx reg_pvt; + + public: + rtx reg () { return reg_pvt; } + void set_reg (rtx r) { reg_pvt = r; } +}; + /* For INSN, union all defs and uses that are linked by match_dup. FUN is the function that does the union. */ static void -union_match_dups (rtx insn, struct web_entry *def_entry, - struct web_entry *use_entry, - bool (*fun) (struct web_entry *, struct web_entry *)) +union_match_dups (rtx insn, web_entry *def_entry, web_entry *use_entry, + bool (*fun) (web_entry_base *, web_entry_base *)) { struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn); df_ref *use_link = DF_INSN_INFO_USES (insn_info); @@ -157,9 +166,9 @@ static void the values 0 and 1 are reserved for use by entry_register. */ void -union_defs (df_ref use, struct web_entry *def_entry, - unsigned int *used, struct web_entry *use_entry, - bool (*fun) (struct web_entry *, struct web_entry *)) +union_defs (df_ref use, web_entry *def_entry, + unsigned int *used, web_entry *use_entry, + bool (*fun) (web_entry_base *, web_entry_base *)) { struct df_insn_info *insn_info = DF_REF_INSN_INFO (use); struct df_link *link = DF_REF_CHAIN (use); @@ -260,15 +269,15 @@ void /* Find the corresponding register for the given entry. */ static rtx -entry_register (struct web_entry *entry, df_ref ref, unsigned int *used) +entry_register (web_entry *entry, df_ref ref, unsigned int *used) { - struct web_entry *root; + web_entry *root; rtx reg, newreg; /* Find the corresponding web and see if it has been visited. 
*/ - root = unionfind_root (entry); - if (root->reg) - return root->reg; + root = (web_entry *)entry->unionfind_root (); + if (root->reg ()) + return root->reg (); /* We are seeing this web for the first time, do the assignment. */ reg = DF_REF_REAL_REG (ref); @@ -292,7 +301,7 @@ static rtx REGNO (newreg)); } - root->reg = newreg; + root->set_reg (newreg); return newreg; } @@ -326,8 +335,8 @@ gate_handle_web (void) static unsigned int web_main (void) { - struct web_entry *def_entry; - struct web_entry *use_entry; + web_entry *def_entry; + web_entry *use_entry; unsigned int max = max_reg_num (); unsigned int *used; basic_block bb; @@ -364,9 +373,9 @@ web_main (void) } /* Record the number of uses and defs at the beginning of the optimization. */ - def_entry = XCNEWVEC (struct web_entry, DF_DEFS_TABLE_SIZE()); + def_entry = XCNEWVEC (web_entry, DF_DEFS_TABLE_SIZE()); used = XCNEWVEC (unsigned, max); - use_entry = XCNEWVEC (struct web_entry, uses_num); + use_entry = XCNEWVEC (web_entry, uses_num); /* Produce the web. */ FOR_ALL_BB (bb)