===================================================================
@@ -11937,6 +11937,9 @@ SSE2 and SSE3 instruction set support.
@item core2
Intel Core2 CPU with 64-bit extensions, MMX, SSE, SSE2, SSE3 and SSSE3
instruction set support.
+@item corei7
+Intel Core i7 CPU with 64-bit extensions, MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1
+and SSE4.2 instruction set support.
@item atom
Intel Atom CPU with 64-bit extensions, MMX, SSE, SSE2, SSE3 and SSSE3
instruction set support.
===================================================================
@@ -239,6 +239,7 @@ extern const struct processor_costs ix86
#define TARGET_ATHLON_K8 (TARGET_K8 || TARGET_ATHLON)
#define TARGET_NOCONA (ix86_tune == PROCESSOR_NOCONA)
#define TARGET_CORE2 (ix86_tune == PROCESSOR_CORE2)
+#define TARGET_COREI7 (ix86_tune == PROCESSOR_COREI7)
#define TARGET_GENERIC32 (ix86_tune == PROCESSOR_GENERIC32)
#define TARGET_GENERIC64 (ix86_tune == PROCESSOR_GENERIC64)
#define TARGET_GENERIC (TARGET_GENERIC32 || TARGET_GENERIC64)
@@ -274,6 +275,7 @@ enum ix86_tune_indices {
X86_TUNE_HIMODE_MATH,
X86_TUNE_PROMOTE_QI_REGS,
X86_TUNE_PROMOTE_HI_REGS,
+ X86_TUNE_PROMOTE_HI_CONSTANTS,
X86_TUNE_ADD_ESP_4,
X86_TUNE_ADD_ESP_8,
X86_TUNE_SUB_ESP_4,
@@ -348,6 +350,8 @@ extern unsigned char ix86_tune_features[
#define TARGET_HIMODE_MATH ix86_tune_features[X86_TUNE_HIMODE_MATH]
#define TARGET_PROMOTE_QI_REGS ix86_tune_features[X86_TUNE_PROMOTE_QI_REGS]
#define TARGET_PROMOTE_HI_REGS ix86_tune_features[X86_TUNE_PROMOTE_HI_REGS]
+#define TARGET_PROMOTE_HI_CONSTANTS \
+ ix86_tune_features[X86_TUNE_PROMOTE_HI_CONSTANTS]
#define TARGET_ADD_ESP_4 ix86_tune_features[X86_TUNE_ADD_ESP_4]
#define TARGET_ADD_ESP_8 ix86_tune_features[X86_TUNE_ADD_ESP_8]
#define TARGET_SUB_ESP_4 ix86_tune_features[X86_TUNE_SUB_ESP_4]
@@ -597,6 +601,7 @@ enum target_cpu_default
TARGET_CPU_DEFAULT_prescott,
TARGET_CPU_DEFAULT_nocona,
TARGET_CPU_DEFAULT_core2,
+ TARGET_CPU_DEFAULT_corei7,
TARGET_CPU_DEFAULT_atom,
TARGET_CPU_DEFAULT_geode,
@@ -2139,6 +2144,7 @@ enum processor_type
PROCESSOR_K8,
PROCESSOR_NOCONA,
PROCESSOR_CORE2,
+ PROCESSOR_COREI7,
PROCESSOR_GENERIC32,
PROCESSOR_GENERIC64,
PROCESSOR_AMDFAM10,
===================================================================
@@ -349,8 +349,8 @@ (define_constants
;; Processor type.
-(define_attr "cpu" "none,pentium,pentiumpro,geode,k6,athlon,k8,core2,atom,
- generic64,amdfam10,bdver1"
+(define_attr "cpu" "none,pentium,pentiumpro,geode,k6,athlon,k8,core2,corei7,
+ atom,generic64,amdfam10,bdver1"
(const (symbol_ref "ix86_schedule")))
;; A basic instruction type. Refinements due to arguments to be
@@ -388,6 +388,10 @@ (define_attr "unit" "integer,i387,sse,mm
(const_string "unknown")]
(const_string "integer")))
+;; For integer multiply insns, the number of operands.
+(define_attr "mul_operands" ""
+ (const_int 2))
+
;; The (bounding maximum) length of an instruction immediate.
(define_attr "length_immediate" ""
(cond [(eq_attr "type" "incdec,setcc,icmov,str,lea,other,multi,idiv,leave,
@@ -919,6 +923,7 @@ (define_mode_iterator P [(SI "Pmode == S
(include "athlon.md")
(include "geode.md")
(include "atom.md")
+(include "core2.md")
;; Operand and operator predicates and constraints
@@ -7010,6 +7015,7 @@ (define_insn "*mul<mode>3_1"
imul{<imodesuffix>}\t{%2, %1, %0|%0, %1, %2}
imul{<imodesuffix>}\t{%2, %0|%0, %2}"
[(set_attr "type" "imul")
+ (set_attr "mul_operands" "3,2,2")
(set_attr "prefix_0f" "0,0,1")
(set (attr "athlon_decode")
(cond [(eq_attr "cpu" "athlon")
@@ -7040,6 +7046,7 @@ (define_insn "*mulsi3_1_zext"
imul{l}\t{%2, %1, %k0|%k0, %1, %2}
imul{l}\t{%2, %k0|%k0, %2}"
[(set_attr "type" "imul")
+ (set_attr "mul_operands" "3,3,2")
(set_attr "prefix_0f" "0,0,1")
(set (attr "athlon_decode")
(cond [(eq_attr "cpu" "athlon")
@@ -7077,6 +7084,7 @@ (define_insn "*mulhi3_1"
imul{w}\t{%2, %1, %0|%0, %1, %2}
imul{w}\t{%2, %0|%0, %2}"
[(set_attr "type" "imul")
+ (set_attr "mul_operands" "3,3,2")
(set_attr "prefix_0f" "0,0,1")
(set (attr "athlon_decode")
(cond [(eq_attr "cpu" "athlon")
@@ -7103,6 +7111,7 @@ (define_insn "*mulqi3_1"
&& !(MEM_P (operands[1]) && MEM_P (operands[2]))"
"mul{b}\t%2"
[(set_attr "type" "imul")
+ (set_attr "mul_operands" "1")
(set_attr "length_immediate" "0")
(set (attr "athlon_decode")
(if_then_else (eq_attr "cpu" "athlon")
@@ -7144,6 +7153,7 @@ (define_insn "*<u>mul<mode><dwi>3_1"
"!(MEM_P (operands[1]) && MEM_P (operands[2]))"
"<sgnprefix>mul{<imodesuffix>}\t%2"
[(set_attr "type" "imul")
+ (set_attr "mul_operands" "1")
(set_attr "length_immediate" "0")
(set (attr "athlon_decode")
(if_then_else (eq_attr "cpu" "athlon")
@@ -7164,6 +7174,7 @@ (define_insn "*<u>mulqihi3_1"
&& !(MEM_P (operands[1]) && MEM_P (operands[2]))"
"<sgnprefix>mul{b}\t%2"
[(set_attr "type" "imul")
+ (set_attr "mul_operands" "1")
(set_attr "length_immediate" "0")
(set (attr "athlon_decode")
(if_then_else (eq_attr "cpu" "athlon")
@@ -7203,6 +7214,7 @@ (define_insn "*<s>muldi3_highpart_1"
&& !(MEM_P (operands[1]) && MEM_P (operands[2]))"
"<sgnprefix>mul{q}\t%2"
[(set_attr "type" "imul")
+ (set_attr "mul_operands" "1")
(set_attr "length_immediate" "0")
(set (attr "athlon_decode")
(if_then_else (eq_attr "cpu" "athlon")
@@ -7226,6 +7238,7 @@ (define_insn "*<s>mulsi3_highpart_1"
"!(MEM_P (operands[1]) && MEM_P (operands[2]))"
"<sgnprefix>mul{l}\t%2"
[(set_attr "type" "imul")
+ (set_attr "mul_operands" "1")
(set_attr "length_immediate" "0")
(set (attr "athlon_decode")
(if_then_else (eq_attr "cpu" "athlon")
@@ -7249,6 +7262,7 @@ (define_insn "*<s>mulsi3_highpart_zext"
&& !(MEM_P (operands[1]) && MEM_P (operands[2]))"
"<sgnprefix>mul{l}\t%2"
[(set_attr "type" "imul")
+ (set_attr "mul_operands" "1")
(set_attr "length_immediate" "0")
(set (attr "athlon_decode")
(if_then_else (eq_attr "cpu" "athlon")
===================================================================
@@ -0,0 +1,744 @@
+;; Scheduling for Core 2 and derived processors.
+;; Copyright (C) 2004, 2005, 2007, 2008, 2010 Free Software Foundation, Inc.
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify
+;; it under the terms of the GNU General Public License as published by
+;; the Free Software Foundation; either version 3, or (at your option)
+;; any later version.
+;;
+;; GCC is distributed in the hope that it will be useful,
+;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+;; GNU General Public License for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3. If not see
+;; <http://www.gnu.org/licenses/>. */
+
+;; The scheduling description in this file is based on the one in ppro.md,
+;; with additional information obtained from
+;;
+;; "How to optimize for the Pentium family of microprocessors",
+;; by Agner Fog, PhD.
+;;
+;; The major difference from the P6 pipeline is one extra decoder, and
+;; one extra execute unit. Due to micro-op fusion, many insns no longer
+;; need to be decoded in decoder 0, but can be handled by all of them.
+
+;; The core2_idiv, core2_fdiv and core2_ssediv automata are used to
+;; model issue latencies of idiv, fdiv and ssediv type insns.
+(define_automaton "core2_decoder,core2_core,core2_idiv,core2_fdiv,core2_ssediv,core2_load,core2_store")
+
+;; The CPU domain, used for Core i7 bypass latencies
+(define_attr "i7_domain" "int,float,simd"
+ (cond [(eq_attr "type" "fmov,fop,fsgn,fmul,fdiv,fpspc,fcmov,fcmp,fxch,fistp,fisttp,frndint")
+ (const_string "float")
+ (eq_attr "type" "sselog,sselog1,sseiadd,sseiadd1,sseishft,sseishft1,sseimul,
+ sse,ssemov,sseadd,ssemul,ssecmp,ssecomi,ssecvt,
+ ssecvt1,sseicvt,ssediv,sseins,ssemuladd,sse4arg")
+ (cond [(eq_attr "mode" "V4DF,V8SF,V2DF,V4SF,SF,DF")
+ (const_string "float")
+ (eq_attr "mode" "SI")
+ (const_string "int")]
+ (const_string "simd"))
+ (eq_attr "type" "mmx,mmxmov,mmxadd,mmxmul,mmxcmp,mmxcvt,mmxshft")
+ (const_string "simd")]
+ (const_string "int")))
+
+;; As for the Pentium Pro,
+;; - an instruction with 1 uop can be decoded by any of the four
+;; decoders in one cycle.
+;; - an instruction with 1 to 4 uops can be decoded only by decoder 0
+;; but still in only one cycle.
+;; - a complex (microcode) instruction can also only be decoded by
+;; decoder 0, and this takes an unspecified number of cycles.
+;;
+;; The goal is to schedule such that we have a few-one-one uops sequence
+;; in each cycle, to decode as many instructions per cycle as possible.
+(define_cpu_unit "c2_decoder0" "core2_decoder")
+(define_cpu_unit "c2_decoder1" "core2_decoder")
+(define_cpu_unit "c2_decoder2" "core2_decoder")
+(define_cpu_unit "c2_decoder3" "core2_decoder")
+
+;; We first wish to find an instruction for c2_decoder0, so exclude
+;; c2_decoder1, c2_decoder2 and c2_decoder3 from being reserved until
+;; c2_decoder0 is reserved.
+(presence_set "c2_decoder1" "c2_decoder0")
+(presence_set "c2_decoder2" "c2_decoder0")
+(presence_set "c2_decoder3" "c2_decoder0")
+
+;; Most instructions can be decoded on any of the four decoders.
+(define_reservation "c2_decodern" "(c2_decoder0|c2_decoder1|c2_decoder2|c2_decoder3)")
+
+;; The out-of-order core has six pipelines. These are similar to the
+;; Pentium Pro's five pipelines. Port 2 is responsible for memory loads,
+;; port 3 for store address calculations, port 4 for memory stores, and
+;; ports 0, 1 and 5 for everything else.
+
+(define_cpu_unit "c2_p0,c2_p1,c2_p5" "core2_core")
+(define_cpu_unit "c2_p2" "core2_load")
+(define_cpu_unit "c2_p3,c2_p4" "core2_store")
+(define_cpu_unit "c2_idiv" "core2_idiv")
+(define_cpu_unit "c2_fdiv" "core2_fdiv")
+(define_cpu_unit "c2_ssediv" "core2_ssediv")
+
+;; Only the irregular instructions have to be modeled here. A load
+;; increases the latency by 2 or 3, or by nothing if the manual gives
+;; a latency already. Store latencies are not accounted for.
+;;
+;; The simple instructions follow a very regular pattern of 1 uop per
+;; reg-reg operation, 1 uop per load on port 2. and 2 uops per store
+;; on port 4 and port 3. These instructions are modelled at the bottom
+;; of this file.
+;;
+;; For microcoded instructions we don't know how many uops are produced.
+;; These instructions are the "complex" ones in the Intel manuals. All
+;; we _do_ know is that they typically produce four or more uops, so
+;; they can only be decoded on c2_decoder0. Modelling their latencies
+;; doesn't make sense because we don't know how these instructions are
+;; executed in the core. So we just model that they can only be decoded
+;; on decoder 0, and say that it takes a little while before the result
+;; is available.
+(define_insn_reservation "c2_complex_insn" 6
+ (and (eq_attr "cpu" "core2,corei7")
+ (eq_attr "type" "other,multi,str"))
+ "c2_decoder0")
+
+(define_insn_reservation "c2_call" 1
+ (and (eq_attr "cpu" "core2,corei7")
+ (eq_attr "type" "call,callv"))
+ "c2_decoder0")
+
+;; imov with memory operands does not use the integer units.
+;; imovx always decodes to one uop, and also doesn't use the integer
+;; units if it has memory operands.
+(define_insn_reservation "c2_imov" 1
+ (and (eq_attr "cpu" "core2,corei7")
+ (and (eq_attr "memory" "none")
+ (eq_attr "type" "imov,imovx")))
+ "c2_decodern,(c2_p0|c2_p1|c2_p5)")
+
+(define_insn_reservation "c2_imov_load" 4
+ (and (eq_attr "cpu" "core2,corei7")
+ (and (eq_attr "memory" "load")
+ (eq_attr "type" "imov,imovx")))
+ "c2_decodern,c2_p2")
+
+(define_insn_reservation "c2_imov_store" 1
+ (and (eq_attr "cpu" "core2,corei7")
+ (and (eq_attr "memory" "store")
+ (eq_attr "type" "imov")))
+ "c2_decodern,c2_p4+c2_p3")
+
+(define_insn_reservation "c2_icmov" 2
+ (and (eq_attr "cpu" "core2,corei7")
+ (and (eq_attr "memory" "none")
+ (eq_attr "type" "icmov")))
+ "c2_decoder0,(c2_p0|c2_p1|c2_p5)*2")
+
+(define_insn_reservation "c2_icmov_load" 2
+ (and (eq_attr "cpu" "core2,corei7")
+ (and (eq_attr "memory" "load")
+ (eq_attr "type" "icmov")))
+ "c2_decoder0,c2_p2,(c2_p0|c2_p1|c2_p5)*2")
+
+(define_insn_reservation "c2_push_reg" 1
+ (and (eq_attr "cpu" "core2,corei7")
+ (and (eq_attr "memory" "store")
+ (eq_attr "type" "push")))
+ "c2_decodern,c2_p4+c2_p3")
+
+(define_insn_reservation "c2_push_mem" 1
+ (and (eq_attr "cpu" "core2,corei7")
+ (and (eq_attr "memory" "both")
+ (eq_attr "type" "push")))
+ "c2_decoder0,c2_p2,c2_p4+c2_p3")
+
+;; lea executes on port 0 with latency one and throughput 1.
+(define_insn_reservation "c2_lea" 1
+ (and (eq_attr "cpu" "core2,corei7")
+ (and (eq_attr "memory" "none")
+ (eq_attr "type" "lea")))
+ "c2_decodern,c2_p0")
+
+;; Shift and rotate decode as two uops which can go to port 0 or 5.
+;; The load and store units need to be reserved when memory operands
+;; are involved.
+(define_insn_reservation "c2_shift_rotate" 1
+ (and (eq_attr "cpu" "core2,corei7")
+ (and (eq_attr "memory" "none")
+ (eq_attr "type" "ishift,ishift1,rotate,rotate1")))
+ "c2_decodern,(c2_p0|c2_p5)")
+
+(define_insn_reservation "c2_shift_rotate_mem" 4
+ (and (eq_attr "cpu" "core2,corei7")
+ (and (eq_attr "memory" "!none")
+ (eq_attr "type" "ishift,ishift1,rotate,rotate1")))
+ "c2_decoder0,c2_p2,(c2_p0|c2_p5),c2_p4+c2_p3")
+
+;; See comments in ppro.md for the corresponding reservation.
+(define_insn_reservation "c2_branch" 1
+ (and (eq_attr "cpu" "core2,corei7")
+ (and (eq_attr "memory" "none")
+ (eq_attr "type" "ibr")))
+ "c2_decodern,c2_p5")
+
+;; ??? Indirect branches probably have worse latency than this.
+(define_insn_reservation "c2_indirect_branch" 6
+ (and (eq_attr "cpu" "core2,corei7")
+ (and (eq_attr "memory" "!none")
+ (eq_attr "type" "ibr")))
+ "c2_decoder0,c2_p2+c2_p5")
+
+(define_insn_reservation "c2_leave" 4
+ (and (eq_attr "cpu" "core2,corei7")
+ (eq_attr "type" "leave"))
+ "c2_decoder0,c2_p2+(c2_p0|c2_p1),(c2_p0|c2_p1)")
+
+;; mul and imul with two/three operands only execute on port 1 for HImode
+;; and SImode, port 0 for DImode.
+(define_insn_reservation "c2_imul_hisi" 3
+ (and (eq_attr "cpu" "core2,corei7")
+ (and (eq_attr "memory" "none")
+ (and (eq_attr "mode" "HI,SI")
+ (and (eq_attr "type" "imul")
+ (eq_attr "mul_operands" "2,3")))))
+ "c2_decodern,c2_p1")
+
+(define_insn_reservation "c2_imul_hisi_mem" 3
+ (and (eq_attr "cpu" "core2,corei7")
+ (and (eq_attr "memory" "!none")
+ (and (eq_attr "mode" "HI,SI")
+ (and (eq_attr "type" "imul")
+ (eq_attr "mul_operands" "2,3")))))
+ "c2_decoder0,c2_p2+c2_p1")
+
+(define_insn_reservation "c2_imul_di" 5
+ (and (eq_attr "cpu" "core2,corei7")
+ (and (eq_attr "memory" "none")
+ (and (eq_attr "mode" "DI")
+ (and (eq_attr "type" "imul")
+ (eq_attr "mul_operands" "2,3")))))
+ "c2_decodern,c2_p0")
+
+(define_insn_reservation "c2_imul_di_mem" 5
+ (and (eq_attr "cpu" "core2,corei7")
+ (and (eq_attr "memory" "!none")
+ (and (eq_attr "mode" "DI")
+ (and (eq_attr "type" "imul")
+ (eq_attr "mul_operands" "2,3")))))
+ "c2_decoder0,c2_p2+c2_p0")
+
+(define_insn_reservation "c2_imul_qi1" 3
+ (and (eq_attr "cpu" "core2,corei7")
+ (and (eq_attr "memory" "none")
+ (and (eq_attr "mode" "QI")
+ (and (eq_attr "type" "imul")
+ (eq_attr "mul_operands" "1")))))
+ "c2_decodern,c2_p1")
+
+(define_insn_reservation "c2_imul_qi1_mem" 3
+  (and (eq_attr "cpu" "core2,corei7")
+       (and (eq_attr "memory" "!none")
+            (and (eq_attr "mode" "QI")
+                 (and (eq_attr "type" "imul")
+                      (eq_attr "mul_operands" "1")))))
+  "c2_decoder0,c2_p2+c2_p1")
+
+(define_insn_reservation "c2_imul_hisi1" 5
+ (and (eq_attr "cpu" "core2,corei7")
+ (and (eq_attr "memory" "none")
+ (and (eq_attr "mode" "HI,SI")
+ (and (eq_attr "type" "imul")
+ (eq_attr "mul_operands" "1")))))
+ "c2_decoder0,c2_p1")
+
+(define_insn_reservation "c2_imul_hisi1_mem" 5
+  (and (eq_attr "cpu" "core2,corei7")
+       (and (eq_attr "memory" "!none")
+            (and (eq_attr "mode" "HI,SI")
+                 (and (eq_attr "type" "imul")
+                      (eq_attr "mul_operands" "1")))))
+  "c2_decoder0,c2_p2+c2_p1")
+
+(define_insn_reservation "c2_imul_di1" 7
+ (and (eq_attr "cpu" "core2,corei7")
+ (and (eq_attr "memory" "none")
+ (and (eq_attr "mode" "DI")
+ (and (eq_attr "type" "imul")
+ (eq_attr "mul_operands" "1")))))
+ "c2_decoder0,c2_p0")
+
+(define_insn_reservation "c2_imul_di1_mem" 7
+  (and (eq_attr "cpu" "core2,corei7")
+       (and (eq_attr "memory" "!none")
+            (and (eq_attr "mode" "DI")
+                 (and (eq_attr "type" "imul")
+                      (eq_attr "mul_operands" "1")))))
+  "c2_decoder0,c2_p2+c2_p0")
+
+;; div and idiv are very similar, so we model them the same.
+;; QI, HI, and SI have issue latency 12, 21, and 37, respectively.
+;; These issue latencies are modelled via the c2_idiv automaton.
+(define_insn_reservation "c2_idiv_QI" 19
+ (and (eq_attr "cpu" "core2,corei7")
+ (and (eq_attr "memory" "none")
+ (and (eq_attr "mode" "QI")
+ (eq_attr "type" "idiv"))))
+ "c2_decoder0,(c2_p0+c2_idiv)*2,(c2_p0|c2_p1)+c2_idiv,c2_idiv*9")
+
+(define_insn_reservation "c2_idiv_QI_load" 19
+ (and (eq_attr "cpu" "core2,corei7")
+ (and (eq_attr "memory" "load")
+ (and (eq_attr "mode" "QI")
+ (eq_attr "type" "idiv"))))
+ "c2_decoder0,c2_p2+c2_p0+c2_idiv,c2_p0+c2_idiv,(c2_p0|c2_p1)+c2_idiv,c2_idiv*9")
+
+(define_insn_reservation "c2_idiv_HI" 23
+ (and (eq_attr "cpu" "core2,corei7")
+ (and (eq_attr "memory" "none")
+ (and (eq_attr "mode" "HI")
+ (eq_attr "type" "idiv"))))
+ "c2_decoder0,(c2_p0+c2_idiv)*3,(c2_p0|c2_p1)+c2_idiv,c2_idiv*17")
+
+(define_insn_reservation "c2_idiv_HI_load" 23
+ (and (eq_attr "cpu" "core2,corei7")
+ (and (eq_attr "memory" "load")
+ (and (eq_attr "mode" "HI")
+ (eq_attr "type" "idiv"))))
+ "c2_decoder0,c2_p2+c2_p0+c2_idiv,c2_p0+c2_idiv,(c2_p0|c2_p1)+c2_idiv,c2_idiv*18")
+
+(define_insn_reservation "c2_idiv_SI" 39
+ (and (eq_attr "cpu" "core2,corei7")
+ (and (eq_attr "memory" "none")
+ (and (eq_attr "mode" "SI")
+ (eq_attr "type" "idiv"))))
+ "c2_decoder0,(c2_p0+c2_idiv)*3,(c2_p0|c2_p1)+c2_idiv,c2_idiv*33")
+
+(define_insn_reservation "c2_idiv_SI_load" 39
+ (and (eq_attr "cpu" "core2,corei7")
+ (and (eq_attr "memory" "load")
+ (and (eq_attr "mode" "SI")
+ (eq_attr "type" "idiv"))))
+ "c2_decoder0,c2_p2+c2_p0+c2_idiv,c2_p0+c2_idiv,(c2_p0|c2_p1)+c2_idiv,c2_idiv*34")
+
+;; x87 floating point operations.
+
+(define_insn_reservation "c2_fxch" 0
+ (and (eq_attr "cpu" "core2,corei7")
+ (eq_attr "type" "fxch"))
+ "c2_decodern")
+
+(define_insn_reservation "c2_fop" 3
+ (and (eq_attr "cpu" "core2,corei7")
+ (and (eq_attr "memory" "none,unknown")
+ (eq_attr "type" "fop")))
+ "c2_decodern,c2_p1")
+
+(define_insn_reservation "c2_fop_load" 5
+ (and (eq_attr "cpu" "core2,corei7")
+ (and (eq_attr "memory" "load")
+ (eq_attr "type" "fop")))
+ "c2_decoder0,c2_p2+c2_p1,c2_p1")
+
+(define_insn_reservation "c2_fop_store" 3
+ (and (eq_attr "cpu" "core2,corei7")
+ (and (eq_attr "memory" "store")
+ (eq_attr "type" "fop")))
+ "c2_decoder0,c2_p0,c2_p0,c2_p0+c2_p4+c2_p3")
+
+(define_insn_reservation "c2_fop_both" 5
+ (and (eq_attr "cpu" "core2,corei7")
+ (and (eq_attr "memory" "both")
+ (eq_attr "type" "fop")))
+ "c2_decoder0,c2_p2+c2_p0,c2_p0+c2_p4+c2_p3")
+
+(define_insn_reservation "c2_fsgn" 1
+ (and (eq_attr "cpu" "core2,corei7")
+ (eq_attr "type" "fsgn"))
+ "c2_decodern,c2_p0")
+
+(define_insn_reservation "c2_fistp" 5
+ (and (eq_attr "cpu" "core2,corei7")
+ (eq_attr "type" "fistp"))
+ "c2_decoder0,c2_p0*2,c2_p4+c2_p3")
+
+(define_insn_reservation "c2_fcmov" 2
+ (and (eq_attr "cpu" "core2,corei7")
+ (eq_attr "type" "fcmov"))
+ "c2_decoder0,c2_p0*2")
+
+(define_insn_reservation "c2_fcmp" 1
+ (and (eq_attr "cpu" "core2,corei7")
+ (and (eq_attr "memory" "none")
+ (eq_attr "type" "fcmp")))
+ "c2_decodern,c2_p1")
+
+(define_insn_reservation "c2_fcmp_load" 4
+ (and (eq_attr "cpu" "core2,corei7")
+ (and (eq_attr "memory" "load")
+ (eq_attr "type" "fcmp")))
+ "c2_decoder0,c2_p2+c2_p1")
+
+(define_insn_reservation "c2_fmov" 1
+ (and (eq_attr "cpu" "core2,corei7")
+ (and (eq_attr "memory" "none")
+ (eq_attr "type" "fmov")))
+ "c2_decodern,c2_p0")
+
+(define_insn_reservation "c2_fmov_load" 1
+ (and (eq_attr "cpu" "core2,corei7")
+ (and (eq_attr "memory" "load")
+ (and (eq_attr "mode" "!XF")
+ (eq_attr "type" "fmov"))))
+ "c2_decodern,c2_p2")
+
+(define_insn_reservation "c2_fmov_XF_load" 3
+ (and (eq_attr "cpu" "core2,corei7")
+ (and (eq_attr "memory" "load")
+ (and (eq_attr "mode" "XF")
+ (eq_attr "type" "fmov"))))
+ "c2_decoder0,(c2_p2+c2_p0)*2")
+
+(define_insn_reservation "c2_fmov_store" 1
+ (and (eq_attr "cpu" "core2,corei7")
+ (and (eq_attr "memory" "store")
+ (and (eq_attr "mode" "!XF")
+ (eq_attr "type" "fmov"))))
+ "c2_decodern,c2_p3+c2_p4")
+
+(define_insn_reservation "c2_fmov_XF_store" 3
+ (and (eq_attr "cpu" "core2,corei7")
+ (and (eq_attr "memory" "store")
+ (and (eq_attr "mode" "XF")
+ (eq_attr "type" "fmov"))))
+ "c2_decoder0,(c2_p3+c2_p4),(c2_p3+c2_p4)")
+
+;; fmul executes on port 0 with latency 5. It has issue latency 2,
+;; but we don't model this.
+(define_insn_reservation "c2_fmul" 5
+ (and (eq_attr "cpu" "core2,corei7")
+ (and (eq_attr "memory" "none")
+ (eq_attr "type" "fmul")))
+ "c2_decoder0,c2_p0*2")
+
+(define_insn_reservation "c2_fmul_load" 6
+ (and (eq_attr "cpu" "core2,corei7")
+ (and (eq_attr "memory" "load")
+ (eq_attr "type" "fmul")))
+ "c2_decoder0,c2_p2+c2_p0,c2_p0")
+
+;; fdiv latencies depend on the mode of the operands.  XFmode gives
+;; a latency of 38 cycles, DFmode gives 32, and SFmode gives latency 18.
+;; Division by a power of 2 takes only 9 cycles, but we cannot model
+;; that.  Throughput is equal to latency - 1, which we model using the
+;; c2_fdiv automaton.
+(define_insn_reservation "c2_fdiv_SF" 18
+ (and (eq_attr "cpu" "core2,corei7")
+ (and (eq_attr "memory" "none")
+ (and (eq_attr "mode" "SF")
+ (eq_attr "type" "fdiv,fpspc"))))
+ "c2_decodern,c2_p0+c2_fdiv,c2_fdiv*16")
+
+(define_insn_reservation "c2_fdiv_SF_load" 19
+ (and (eq_attr "cpu" "core2,corei7")
+ (and (eq_attr "memory" "load")
+ (and (eq_attr "mode" "SF")
+ (eq_attr "type" "fdiv,fpspc"))))
+ "c2_decoder0,c2_p2+c2_p0+c2_fdiv,c2_fdiv*16")
+
+(define_insn_reservation "c2_fdiv_DF" 32
+ (and (eq_attr "cpu" "core2,corei7")
+ (and (eq_attr "memory" "none")
+ (and (eq_attr "mode" "DF")
+ (eq_attr "type" "fdiv,fpspc"))))
+ "c2_decodern,c2_p0+c2_fdiv,c2_fdiv*30")
+
+(define_insn_reservation "c2_fdiv_DF_load" 33
+ (and (eq_attr "cpu" "core2,corei7")
+ (and (eq_attr "memory" "load")
+ (and (eq_attr "mode" "DF")
+ (eq_attr "type" "fdiv,fpspc"))))
+ "c2_decoder0,c2_p2+c2_p0+c2_fdiv,c2_fdiv*30")
+
+(define_insn_reservation "c2_fdiv_XF" 38
+ (and (eq_attr "cpu" "core2,corei7")
+ (and (eq_attr "memory" "none")
+ (and (eq_attr "mode" "XF")
+ (eq_attr "type" "fdiv,fpspc"))))
+ "c2_decodern,c2_p0+c2_fdiv,c2_fdiv*36")
+
+(define_insn_reservation "c2_fdiv_XF_load" 39
+ (and (eq_attr "cpu" "core2,corei7")
+ (and (eq_attr "memory" "load")
+ (and (eq_attr "mode" "XF")
+ (eq_attr "type" "fdiv,fpspc"))))
+ "c2_decoder0,c2_p2+c2_p0+c2_fdiv,c2_fdiv*36")
+
+;; MMX instructions.
+
+(define_insn_reservation "c2_mmx_add" 1
+ (and (eq_attr "cpu" "core2,corei7")
+ (and (eq_attr "memory" "none")
+ (eq_attr "type" "mmxadd,sseiadd")))
+ "c2_decodern,c2_p0|c2_p5")
+
+(define_insn_reservation "c2_mmx_add_load" 2
+ (and (eq_attr "cpu" "core2,corei7")
+ (and (eq_attr "memory" "load")
+ (eq_attr "type" "mmxadd,sseiadd")))
+ "c2_decodern,c2_p2+c2_p0|c2_p5")
+
+(define_insn_reservation "c2_mmx_shft" 1
+ (and (eq_attr "cpu" "core2,corei7")
+ (and (eq_attr "memory" "none")
+ (eq_attr "type" "mmxshft")))
+ "c2_decodern,c2_p0|c2_p5")
+
+(define_insn_reservation "c2_mmx_shft_load" 2
+ (and (eq_attr "cpu" "core2,corei7")
+ (and (eq_attr "memory" "load")
+ (eq_attr "type" "mmxshft")))
+ "c2_decoder0,c2_p2+c2_p1")
+
+(define_insn_reservation "c2_mmx_sse_shft" 1
+ (and (eq_attr "cpu" "core2,corei7")
+ (and (eq_attr "memory" "none")
+ (and (eq_attr "type" "sseishft")
+ (eq_attr "length_immediate" "!0"))))
+ "c2_decodern,c2_p1")
+
+(define_insn_reservation "c2_mmx_sse_shft_load" 2
+ (and (eq_attr "cpu" "core2,corei7")
+ (and (eq_attr "memory" "load")
+ (and (eq_attr "type" "sseishft")
+ (eq_attr "length_immediate" "!0"))))
+ "c2_decodern,c2_p1")
+
+(define_insn_reservation "c2_mmx_sse_shft1" 2
+ (and (eq_attr "cpu" "core2,corei7")
+ (and (eq_attr "memory" "none")
+ (and (eq_attr "type" "sseishft")
+ (eq_attr "length_immediate" "0"))))
+ "c2_decodern,c2_p1")
+
+(define_insn_reservation "c2_mmx_sse_shft1_load" 3
+ (and (eq_attr "cpu" "core2,corei7")
+ (and (eq_attr "memory" "load")
+ (and (eq_attr "type" "sseishft")
+ (eq_attr "length_immediate" "0"))))
+ "c2_decodern,c2_p1")
+
+(define_insn_reservation "c2_mmx_mul" 3
+ (and (eq_attr "cpu" "core2,corei7")
+ (and (eq_attr "memory" "none")
+ (eq_attr "type" "mmxmul,sseimul")))
+ "c2_decodern,c2_p1")
+
+(define_insn_reservation "c2_mmx_mul_load" 3
+  (and (eq_attr "cpu" "core2,corei7")
+       (and (eq_attr "memory" "load")
+            (eq_attr "type" "mmxmul,sseimul")))
+  "c2_decoder0,c2_p2+c2_p1")
+
+(define_insn_reservation "c2_sse_mmxcvt" 4
+ (and (eq_attr "cpu" "core2,corei7")
+ (and (eq_attr "mode" "DI")
+ (eq_attr "type" "mmxcvt")))
+ "c2_decodern,c2_p1")
+
+;; FIXME: These are Pentium III only, but we cannot tell here if
+;; we're generating code for PentiumPro/Pentium II or Pentium III
+;; (define_insn_reservation "c2_sse_mmxshft" 2
+;; (and (eq_attr "cpu" "core2,corei7")
+;; (and (eq_attr "mode" "TI")
+;; (eq_attr "type" "mmxshft")))
+;; "c2_decodern,c2_p0")
+
+;; The sfence instruction.
+(define_insn_reservation "c2_sse_sfence" 3
+ (and (eq_attr "cpu" "core2,corei7")
+ (and (eq_attr "memory" "unknown")
+ (eq_attr "type" "sse")))
+ "c2_decoder0,c2_p4+c2_p3")
+
+;; FIXME: This reservation is all wrong when we're scheduling sqrtss.
+(define_insn_reservation "c2_sse_SFDF" 3
+ (and (eq_attr "cpu" "core2,corei7")
+ (and (eq_attr "mode" "SF,DF")
+ (eq_attr "type" "sse")))
+ "c2_decodern,c2_p0")
+
+(define_insn_reservation "c2_sse_V4SF" 4
+ (and (eq_attr "cpu" "core2,corei7")
+ (and (eq_attr "mode" "V4SF")
+ (eq_attr "type" "sse")))
+ "c2_decoder0,c2_p1*2")
+
+(define_insn_reservation "c2_sse_addcmp" 3
+ (and (eq_attr "cpu" "core2,corei7")
+ (and (eq_attr "memory" "none")
+ (eq_attr "type" "sseadd,ssecmp,ssecomi")))
+ "c2_decodern,c2_p1")
+
+(define_insn_reservation "c2_sse_addcmp_load" 3
+ (and (eq_attr "cpu" "core2,corei7")
+ (and (eq_attr "memory" "load")
+ (eq_attr "type" "sseadd,ssecmp,ssecomi")))
+ "c2_decodern,c2_p2+c2_p1")
+
+(define_insn_reservation "c2_sse_mul_SF" 4
+ (and (eq_attr "cpu" "core2,corei7")
+ (and (eq_attr "memory" "none")
+ (and (eq_attr "mode" "SF,V4SF")
+ (eq_attr "type" "ssemul"))))
+ "c2_decodern,c2_p0")
+
+(define_insn_reservation "c2_sse_mul_SF_load" 4
+ (and (eq_attr "cpu" "core2,corei7")
+ (and (eq_attr "memory" "load")
+ (and (eq_attr "mode" "SF,V4SF")
+ (eq_attr "type" "ssemul"))))
+ "c2_decodern,c2_p2+c2_p0")
+
+(define_insn_reservation "c2_sse_mul_DF" 5
+ (and (eq_attr "cpu" "core2,corei7")
+ (and (eq_attr "memory" "none")
+ (and (eq_attr "mode" "DF,V2DF")
+ (eq_attr "type" "ssemul"))))
+ "c2_decodern,c2_p0")
+
+(define_insn_reservation "c2_sse_mul_DF_load" 5
+ (and (eq_attr "cpu" "core2,corei7")
+ (and (eq_attr "memory" "load")
+ (and (eq_attr "mode" "DF,V2DF")
+ (eq_attr "type" "ssemul"))))
+ "c2_decodern,c2_p2+c2_p0")
+
+(define_insn_reservation "c2_sse_div_SF" 18
+ (and (eq_attr "cpu" "core2,corei7")
+ (and (eq_attr "memory" "none")
+ (and (eq_attr "mode" "SF,V4SF")
+ (eq_attr "type" "ssediv"))))
+ "c2_decodern,c2_p0,c2_ssediv*17")
+
+(define_insn_reservation "c2_sse_div_SF_load" 18
+  (and (eq_attr "cpu" "core2,corei7")
+       (and (eq_attr "memory" "load")
+            (and (eq_attr "mode" "SF,V4SF")
+                 (eq_attr "type" "ssediv"))))
+  "c2_decodern,(c2_p2+c2_p0),c2_ssediv*17")
+
+(define_insn_reservation "c2_sse_div_DF" 32
+ (and (eq_attr "cpu" "core2,corei7")
+ (and (eq_attr "memory" "none")
+ (and (eq_attr "mode" "DF,V2DF")
+ (eq_attr "type" "ssediv"))))
+ "c2_decodern,c2_p0,c2_ssediv*31")
+
+(define_insn_reservation "c2_sse_div_DF_load" 32
+  (and (eq_attr "cpu" "core2,corei7")
+       (and (eq_attr "memory" "load")
+            (and (eq_attr "mode" "DF,V2DF")
+                 (eq_attr "type" "ssediv"))))
+  "c2_decodern,(c2_p2+c2_p0),c2_ssediv*31")
+
+;; FIXME: these have limited throughput
+(define_insn_reservation "c2_sse_icvt_SF" 4
+ (and (eq_attr "cpu" "core2,corei7")
+ (and (eq_attr "memory" "none")
+ (and (eq_attr "mode" "SF")
+ (eq_attr "type" "sseicvt"))))
+ "c2_decodern,c2_p1")
+
+(define_insn_reservation "c2_sse_icvt_SF_load" 4
+ (and (eq_attr "cpu" "core2,corei7")
+ (and (eq_attr "memory" "!none")
+ (and (eq_attr "mode" "SF")
+ (eq_attr "type" "sseicvt"))))
+ "c2_decodern,c2_p2+c2_p1")
+
+(define_insn_reservation "c2_sse_icvt_DF" 4
+ (and (eq_attr "cpu" "core2,corei7")
+ (and (eq_attr "memory" "none")
+ (and (eq_attr "mode" "DF")
+ (eq_attr "type" "sseicvt"))))
+ "c2_decoder0,c2_p0+c2_p1")
+
+(define_insn_reservation "c2_sse_icvt_DF_load" 4
+ (and (eq_attr "cpu" "core2,corei7")
+ (and (eq_attr "memory" "!none")
+ (and (eq_attr "mode" "DF")
+ (eq_attr "type" "sseicvt"))))
+ "c2_decoder0,(c2_p2+c2_p1)")
+
+(define_insn_reservation "c2_sse_icvt_SI" 3
+ (and (eq_attr "cpu" "core2,corei7")
+ (and (eq_attr "memory" "none")
+ (and (eq_attr "mode" "SI")
+ (eq_attr "type" "sseicvt"))))
+ "c2_decodern,c2_p1")
+
+(define_insn_reservation "c2_sse_icvt_SI_load" 3
+ (and (eq_attr "cpu" "core2,corei7")
+ (and (eq_attr "memory" "!none")
+ (and (eq_attr "mode" "SI")
+ (eq_attr "type" "sseicvt"))))
+ "c2_decodern,(c2_p2+c2_p1)")
+
+(define_insn_reservation "c2_sse_mov" 1
+ (and (eq_attr "cpu" "core2,corei7")
+ (and (eq_attr "memory" "none")
+ (eq_attr "type" "ssemov")))
+ "c2_decodern,(c2_p0|c2_p1|c2_p5)")
+
+(define_insn_reservation "c2_sse_mov_load" 2
+ (and (eq_attr "cpu" "core2,corei7")
+ (and (eq_attr "memory" "load")
+ (eq_attr "type" "ssemov")))
+ "c2_decodern,c2_p2")
+
+(define_insn_reservation "c2_sse_mov_store" 1
+ (and (eq_attr "cpu" "core2,corei7")
+ (and (eq_attr "memory" "store")
+ (eq_attr "type" "ssemov")))
+ "c2_decodern,c2_p4+c2_p3")
+
+;; All other instructions are modelled as simple instructions.
+;; We have already modelled all i387 floating point instructions, so all
+;; other instructions execute on either port 0, 1 or 5. This includes
+;; the ALU units, and the MMX units.
+;;
+;; reg-reg instructions produce 1 uop so they can be decoded on any of
+;; the four decoders.  Loads benefit from micro-op fusion and can be
+;; treated in the same way.
+(define_insn_reservation "c2_insn" 1
+ (and (eq_attr "cpu" "core2,corei7")
+ (and (eq_attr "memory" "none,unknown")
+ (eq_attr "type" "alu,alu1,negnot,incdec,icmp,test,setcc,sseishft1,mmx,mmxcmp")))
+ "c2_decodern,(c2_p0|c2_p1|c2_p5)")
+
+(define_insn_reservation "c2_insn_load" 4
+ (and (eq_attr "cpu" "core2,corei7")
+ (and (eq_attr "memory" "load")
+ (eq_attr "type" "alu,alu1,negnot,incdec,icmp,test,setcc,pop,sseishft1,mmx,mmxcmp")))
+ "c2_decodern,c2_p2,(c2_p0|c2_p1|c2_p5)")
+
+;; register-memory instructions have three uops, so they have to be
+;; decoded on c2_decoder0.
+(define_insn_reservation "c2_insn_store" 1
+ (and (eq_attr "cpu" "core2,corei7")
+ (and (eq_attr "memory" "store")
+ (eq_attr "type" "alu,alu1,negnot,incdec,icmp,test,setcc,sseishft1,mmx,mmxcmp")))
+ "c2_decoder0,(c2_p0|c2_p1|c2_p5),c2_p4+c2_p3")
+
+;; read-modify-store instructions produce 4 uops so they have to be
+;; decoded on c2_decoder0 as well.
+(define_insn_reservation "c2_insn_both" 4
+ (and (eq_attr "cpu" "core2,corei7")
+ (and (eq_attr "memory" "both")
+ (eq_attr "type" "alu,alu1,negnot,incdec,icmp,test,setcc,pop,sseishft1,mmx,mmxcmp")))
+ "c2_decoder0,c2_p2,(c2_p0|c2_p1|c2_p5),c2_p4+c2_p3")
+
===================================================================
@@ -122,6 +122,10 @@ ix86_target_macros_internal (int isa_fla
def_or_undef (parse_in, "__core2");
def_or_undef (parse_in, "__core2__");
break;
+ case PROCESSOR_COREI7:
+ def_or_undef (parse_in, "__corei7");
+ def_or_undef (parse_in, "__corei7__");
+ break;
case PROCESSOR_ATOM:
def_or_undef (parse_in, "__atom");
def_or_undef (parse_in, "__atom__");
@@ -197,6 +201,9 @@ ix86_target_macros_internal (int isa_fla
case PROCESSOR_CORE2:
def_or_undef (parse_in, "__tune_core2__");
break;
+ case PROCESSOR_COREI7:
+ def_or_undef (parse_in, "__tune_corei7__");
+ break;
case PROCESSOR_ATOM:
def_or_undef (parse_in, "__tune_atom__");
break;
===================================================================
@@ -1124,6 +1124,79 @@ struct processor_costs core2_cost = {
};
static const
+struct processor_costs corei7_cost = {
+ COSTS_N_INSNS (1), /* cost of an add instruction */
+ COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
+ COSTS_N_INSNS (1), /* variable shift costs */
+ COSTS_N_INSNS (1), /* constant shift costs */
+ {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
+ COSTS_N_INSNS (3), /* HI */
+ COSTS_N_INSNS (3), /* SI */
+ COSTS_N_INSNS (3), /* DI */
+ COSTS_N_INSNS (3)}, /* other */
+ 0, /* cost of multiply per each bit set (0: multiply cost is independent of operand bit pattern) */
+ {COSTS_N_INSNS (22), /* cost of a divide/mod for QI */
+ COSTS_N_INSNS (22), /* HI */
+ COSTS_N_INSNS (22), /* SI */
+ COSTS_N_INSNS (22), /* DI */
+ COSTS_N_INSNS (22)}, /* other */
+ COSTS_N_INSNS (1), /* cost of movsx */
+ COSTS_N_INSNS (1), /* cost of movzx */
+ 8, /* "large" insn */
+ 16, /* MOVE_RATIO */
+ 2, /* cost for loading QImode using movzbl */
+ {6, 6, 6}, /* cost of loading integer registers
+ in QImode, HImode and SImode.
+ Relative to reg-reg move (2). */
+ {4, 4, 4}, /* cost of storing integer registers */
+ 2, /* cost of reg,reg fld/fst */
+ {6, 6, 6}, /* cost of loading fp registers
+ in SFmode, DFmode and XFmode */
+ {4, 4, 4}, /* cost of storing fp registers
+ in SFmode, DFmode and XFmode */
+ 2, /* cost of moving MMX register */
+ {6, 6}, /* cost of loading MMX registers
+ in SImode and DImode */
+ {4, 4}, /* cost of storing MMX registers
+ in SImode and DImode */
+ 2, /* cost of moving SSE register */
+ {6, 6, 6}, /* cost of loading SSE registers
+ in SImode, DImode and TImode */
+ {4, 4, 4}, /* cost of storing SSE registers
+ in SImode, DImode and TImode */
+ 2, /* MMX or SSE register to integer */
+ 32, /* size of l1 cache. */
+ 256, /* size of l2 cache (the shared last-level cache has no field in this struct). */
+ 128, /* size of prefetch block */
+ 8, /* number of parallel prefetches */
+ 3, /* Branch cost */
+ COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
+ COSTS_N_INSNS (5), /* cost of FMUL instruction. */
+ COSTS_N_INSNS (32), /* cost of FDIV instruction. */
+ COSTS_N_INSNS (1), /* cost of FABS instruction. */
+ COSTS_N_INSNS (1), /* cost of FCHS instruction. */
+ COSTS_N_INSNS (58), /* cost of FSQRT instruction. */
+ {{libcall, {{11, loop}, {-1, rep_prefix_4_byte}}},
+ {libcall, {{32, loop}, {64, rep_prefix_4_byte},
+ {8192, rep_prefix_8_byte}, {-1, libcall}}}},
+ {{libcall, {{8, loop}, {15, unrolled_loop},
+ {2048, rep_prefix_4_byte}, {-1, libcall}}},
+ {libcall, {{24, loop}, {32, unrolled_loop},
+ {8192, rep_prefix_8_byte}, {-1, libcall}}}},
+ 1, /* scalar_stmt_cost. */
+ 1, /* scalar load_cost. */
+ 1, /* scalar_store_cost. */
+ 1, /* vec_stmt_cost. */
+ 1, /* vec_to_scalar_cost. */
+ 1, /* scalar_to_vec_cost. */
+ 1, /* vec_align_load_cost. */
+ 2, /* vec_unalign_load_cost. */
+ 1, /* vec_store_cost. */
+ 3, /* cond_taken_branch_cost. */
+ 1, /* cond_not_taken_branch_cost. */
+};
+
+static const
struct processor_costs atom_cost = {
COSTS_N_INSNS (1), /* cost of an add instruction */
COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
@@ -1355,6 +1428,8 @@ const struct processor_costs *ix86_cost
#define m_PENT4 (1<<PROCESSOR_PENTIUM4)
#define m_NOCONA (1<<PROCESSOR_NOCONA)
#define m_CORE2 (1<<PROCESSOR_CORE2)
+#define m_COREI7 (1<<PROCESSOR_COREI7)
+#define m_CORE2I7 (m_CORE2 | m_COREI7) /* tunings shared by Core 2 and Core i7 */
#define m_ATOM (1<<PROCESSOR_ATOM)
#define m_GEODE (1<<PROCESSOR_GEODE)
@@ -1384,18 +1459,18 @@ static unsigned int initial_ix86_tune_fe
negatively, so enabling for Generic64 seems like good code size
tradeoff.  We can't enable it for 32bit generic because it does not
work well with PPro base chips. */
- m_386 | m_K6_GEODE | m_AMD_MULTIPLE | m_CORE2 | m_GENERIC64,
+ m_386 | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC64,
/* X86_TUNE_PUSH_MEMORY */
m_386 | m_K6_GEODE | m_AMD_MULTIPLE | m_PENT4
- | m_NOCONA | m_CORE2 | m_GENERIC,
+ | m_NOCONA | m_CORE2I7 | m_GENERIC,
/* X86_TUNE_ZERO_EXTEND_WITH_AND */
m_486 | m_PENT,
/* X86_TUNE_UNROLL_STRLEN */
m_486 | m_PENT | m_ATOM | m_PPRO | m_AMD_MULTIPLE | m_K6
- | m_CORE2 | m_GENERIC,
+ | m_CORE2I7 | m_GENERIC,
/* X86_TUNE_DEEP_BRANCH_PREDICTION */
m_ATOM | m_PPRO | m_K6_GEODE | m_AMD_MULTIPLE | m_PENT4 | m_GENERIC,
@@ -1411,12 +1486,12 @@ static unsigned int initial_ix86_tune_fe
/* X86_TUNE_USE_SAHF */
m_ATOM | m_PPRO | m_K6_GEODE | m_K8 | m_AMDFAM10 | m_BDVER1 | m_PENT4
- | m_NOCONA | m_CORE2 | m_GENERIC,
+ | m_NOCONA | m_CORE2I7 | m_GENERIC,
/* X86_TUNE_MOVX: Enable to zero extend integer registers to avoid
partial dependencies. */
m_AMD_MULTIPLE | m_ATOM | m_PPRO | m_PENT4 | m_NOCONA
- | m_CORE2 | m_GENERIC | m_GEODE /* m_386 | m_K6 */,
+ | m_CORE2I7 | m_GENERIC | m_GEODE /* m_386 | m_K6 */,
/* X86_TUNE_PARTIAL_REG_STALL: We probably ought to watch for partial
register stalls on Generic32 compilation setting as well.  However
@@ -1429,19 +1504,19 @@ static unsigned int initial_ix86_tune_fe
m_PPRO,
/* X86_TUNE_PARTIAL_FLAG_REG_STALL */
- m_CORE2 | m_GENERIC,
+ m_CORE2I7 | m_GENERIC,
/* X86_TUNE_USE_HIMODE_FIOP */
m_386 | m_486 | m_K6_GEODE,
/* X86_TUNE_USE_SIMODE_FIOP */
- ~(m_PPRO | m_AMD_MULTIPLE | m_PENT | m_ATOM | m_CORE2 | m_GENERIC),
+ ~(m_PPRO | m_AMD_MULTIPLE | m_PENT | m_ATOM | m_CORE2I7 | m_GENERIC),
/* X86_TUNE_USE_MOV0 */
m_K6,
/* X86_TUNE_USE_CLTD */
- ~(m_PENT | m_ATOM | m_K6 | m_CORE2 | m_GENERIC),
+ ~(m_PENT | m_ATOM | m_K6 | m_CORE2I7 | m_GENERIC),
/* X86_TUNE_USE_XCHGB: Use xchgb %rh,%rl instead of rolw/rorw $8,rx. */
m_PENT4,
@@ -1457,7 +1532,7 @@ static unsigned int initial_ix86_tune_fe
/* X86_TUNE_PROMOTE_QIMODE */
m_K6_GEODE | m_PENT | m_ATOM | m_386 | m_486 | m_AMD_MULTIPLE
- | m_CORE2 | m_GENERIC /* | m_PENT4 ? */,
+ | m_CORE2I7 | m_GENERIC /* | m_PENT4 ? */,
/* X86_TUNE_FAST_PREFIX */
~(m_PENT | m_486 | m_386),
@@ -1478,31 +1553,34 @@ static unsigned int initial_ix86_tune_fe
0,
/* X86_TUNE_PROMOTE_HI_REGS */
- m_PPRO,
+ m_PPRO | m_CORE2I7,
+
+ /* X86_TUNE_PROMOTE_HI_CONSTANTS: force large HImode immediates into registers (via SImode) instead of emitting slow 16-bit immediate operands.  */
+ m_PPRO | m_CORE2I7,
/* X86_TUNE_ADD_ESP_4: Enable if add/sub is preferred over 1/2 push/pop. */
m_ATOM | m_AMD_MULTIPLE | m_K6_GEODE | m_PENT4 | m_NOCONA
- | m_CORE2 | m_GENERIC,
+ | m_CORE2I7 | m_GENERIC,
/* X86_TUNE_ADD_ESP_8 */
m_AMD_MULTIPLE | m_ATOM | m_PPRO | m_K6_GEODE | m_386
- | m_486 | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC,
+ | m_486 | m_PENT4 | m_NOCONA | m_CORE2I7 | m_GENERIC,
/* X86_TUNE_SUB_ESP_4 */
- m_AMD_MULTIPLE | m_ATOM | m_PPRO | m_PENT4 | m_NOCONA | m_CORE2
+ m_AMD_MULTIPLE | m_ATOM | m_PPRO | m_PENT4 | m_NOCONA | m_CORE2I7
| m_GENERIC,
/* X86_TUNE_SUB_ESP_8 */
m_AMD_MULTIPLE | m_ATOM | m_PPRO | m_386 | m_486
- | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC,
+ | m_PENT4 | m_NOCONA | m_CORE2I7 | m_GENERIC,
/* X86_TUNE_INTEGER_DFMODE_MOVES: Enable if integer moves are preferred
for DFmode copies */
- ~(m_AMD_MULTIPLE | m_ATOM | m_PENT4 | m_NOCONA | m_PPRO | m_CORE2
+ ~(m_AMD_MULTIPLE | m_ATOM | m_PENT4 | m_NOCONA | m_PPRO | m_CORE2I7
| m_GENERIC | m_GEODE),
/* X86_TUNE_PARTIAL_REG_DEPENDENCY */
- m_AMD_MULTIPLE | m_ATOM | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC,
+ m_AMD_MULTIPLE | m_ATOM | m_PENT4 | m_NOCONA | m_CORE2I7 | m_GENERIC,
/* X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY: In the Generic model we have a
conflict here in between PPro/Pentium4 based chips that thread 128bit
@@ -1513,7 +1591,7 @@ static unsigned int initial_ix86_tune_fe
shows that disabling this option on P4 brings over 20% SPECfp regression,
while enabling it on K8 brings roughly 2.4% regression that can be partly
masked by careful scheduling of moves. */
- m_ATOM | m_PENT4 | m_NOCONA | m_PPRO | m_CORE2 | m_GENERIC
+ m_ATOM | m_PENT4 | m_NOCONA | m_PPRO | m_CORE2I7 | m_GENERIC
| m_AMDFAM10 | m_BDVER1,
/* X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL */
@@ -1538,13 +1616,13 @@ static unsigned int initial_ix86_tune_fe
m_PPRO | m_PENT4 | m_NOCONA,
/* X86_TUNE_MEMORY_MISMATCH_STALL */
- m_AMD_MULTIPLE | m_ATOM | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC,
+ m_AMD_MULTIPLE | m_ATOM | m_PENT4 | m_NOCONA | m_CORE2I7 | m_GENERIC,
/* X86_TUNE_PROLOGUE_USING_MOVE */
- m_ATHLON_K8 | m_ATOM | m_PPRO | m_CORE2 | m_GENERIC,
+ m_ATHLON_K8 | m_ATOM | m_PPRO | m_CORE2I7 | m_GENERIC,
/* X86_TUNE_EPILOGUE_USING_MOVE */
- m_ATHLON_K8 | m_ATOM | m_PPRO | m_CORE2 | m_GENERIC,
+ m_ATHLON_K8 | m_ATOM | m_PPRO | m_CORE2I7 | m_GENERIC,
/* X86_TUNE_SHIFT1 */
~m_486,
@@ -1560,25 +1638,25 @@ static unsigned int initial_ix86_tune_fe
/* X86_TUNE_FOUR_JUMP_LIMIT: Some CPU cores are not able to predict more
than 4 branch instructions in the 16 byte window. */
- m_ATOM | m_PPRO | m_AMD_MULTIPLE | m_PENT4 | m_NOCONA | m_CORE2
+ m_ATOM | m_PPRO | m_AMD_MULTIPLE | m_PENT4 | m_NOCONA | m_CORE2I7
| m_GENERIC,
/* X86_TUNE_SCHEDULE */
- m_PPRO | m_AMD_MULTIPLE | m_K6_GEODE | m_PENT | m_ATOM | m_CORE2
+ m_PPRO | m_AMD_MULTIPLE | m_K6_GEODE | m_PENT | m_ATOM | m_CORE2I7
| m_GENERIC,
/* X86_TUNE_USE_BT */
- m_AMD_MULTIPLE | m_ATOM | m_CORE2 | m_GENERIC,
+ m_AMD_MULTIPLE | m_ATOM | m_CORE2I7 | m_GENERIC,
/* X86_TUNE_USE_INCDEC */
- ~(m_PENT4 | m_NOCONA | m_GENERIC | m_ATOM),
+ ~(m_PENT4 | m_NOCONA | m_GENERIC | m_CORE2I7 | m_ATOM),
/* X86_TUNE_PAD_RETURNS */
- m_AMD_MULTIPLE | m_CORE2 | m_GENERIC,
+ m_AMD_MULTIPLE | m_GENERIC,
/* X86_TUNE_EXT_80387_CONSTANTS */
m_K6_GEODE | m_ATHLON_K8 | m_ATOM | m_PENT4 | m_NOCONA | m_PPRO
- | m_CORE2 | m_GENERIC,
+ | m_CORE2I7 | m_GENERIC,
/* X86_TUNE_SHORTEN_X87_SSE */
~m_K8,
@@ -1622,7 +1700,7 @@ static unsigned int initial_ix86_tune_fe
/* X86_TUNE_FUSE_CMP_AND_BRANCH: Fuse a compare or test instruction
with a subsequent conditional jump instruction into a single
compare-and-branch uop. */
- m_CORE2 | m_BDVER1,
+ m_CORE2I7 | m_BDVER1,
/* X86_TUNE_OPT_AGU: Optimize for Address Generation Unit. This flag
will impact LEA instruction selection. */
@@ -1652,12 +1730,12 @@ static unsigned int initial_ix86_arch_fe
};
static const unsigned int x86_accumulate_outgoing_args
- = m_AMD_MULTIPLE | m_ATOM | m_PENT4 | m_NOCONA | m_PPRO | m_CORE2
+ = m_AMD_MULTIPLE | m_ATOM | m_PENT4 | m_NOCONA | m_PPRO | m_CORE2I7
| m_GENERIC;
static const unsigned int x86_arch_always_fancy_math_387
= m_PENT | m_ATOM | m_PPRO | m_AMD_MULTIPLE | m_PENT4
- | m_NOCONA | m_CORE2 | m_GENERIC;
+ | m_NOCONA | m_CORE2I7 | m_GENERIC;
static enum stringop_alg stringop_alg = no_stringop;
@@ -2173,6 +2251,7 @@ static const struct ptt processor_target
{&k8_cost, 16, 7, 16, 7, 16},
{&nocona_cost, 0, 0, 0, 0, 0},
{&core2_cost, 16, 10, 16, 10, 16},
+ {&corei7_cost, 16, 10, 16, 10, 16}, /* alignment parameters match the core2 entry above */
{&generic32_cost, 16, 7, 16, 7, 16},
{&generic64_cost, 16, 10, 16, 10, 16},
{&amdfam10_cost, 32, 24, 32, 7, 32},
@@ -2195,6 +2274,7 @@ static const char *const cpu_names[TARGE
"prescott",
"nocona",
"core2",
+ "corei7", /* must stay in the TARGET_CPU_DEFAULT_* enum order */
"atom",
"geode",
"k6",
@@ -2889,6 +2969,9 @@ override_options (bool main_args_p)
{"core2", PROCESSOR_CORE2, CPU_CORE2,
PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
| PTA_SSSE3 | PTA_CX16},
+ {"corei7", PROCESSOR_COREI7, CPU_COREI7,
+ PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
+ | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_CX16}, /* NOTE(review): Nehalem also has POPCNT -- confirm SSE4.2 implies POPCNT in the ISA option handling, otherwise PTA_POPCNT is missing here.  */
{"atom", PROCESSOR_ATOM, CPU_ATOM,
PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
| PTA_SSSE3 | PTA_CX16 | PTA_MOVBE},
@@ -14291,6 +14374,24 @@ ix86_fixup_binary_operands (enum rtx_cod
   if (MEM_P (src1) && !rtx_equal_p (dst, src1))
     src1 = force_reg (mode, src1);
 
+  /* When TARGET_PROMOTE_HI_CONSTANTS, do HImode arithmetic with a large
+     immediate in an SImode register instead: a 16-bit immediate operand
+     needs an operand-size prefix, which is slow to decode on these CPUs.
+     Immediates in [-128, 127] use the sign-extended 8-bit encoding, and
+     AND masks 0xff/0xff00 have cheap byte forms, so those stay as-is.
+     Two fixes over the earlier revision: use CONST_INT_P rather than
+     CONSTANT_P, since INTVAL is only valid on a CONST_INT; and test the
+     masked low 16 bits, since a canonical HImode CONST_INT is stored
+     sign-extended (0xff00 is -256, so the old "!= -65281" test could
+     never fire).  Keep this condition identical to the one in
+     ix86_binary_operator_ok.  */
+  if (TARGET_PROMOTE_HI_CONSTANTS && mode == HImode && CONST_INT_P (src2)
+      && (INTVAL (src2) < -128 || INTVAL (src2) > 127)
+      && (code != AND
+	  || ((INTVAL (src2) & 0xffff) != 0xff
+	      && (INTVAL (src2) & 0xffff) != 0xff00)))
+    src2 = gen_lowpart (HImode, force_reg (SImode, src2));
+
   operands[1] = src1;
   operands[2] = src2;
   return dst;
@@ -14377,6 +14466,17 @@ ix86_binary_operator_ok (enum rtx_code c
   if (MEM_P (src1) && !rtx_equal_p (dst, src1))
     return 0;
 
+  /* Reject the HImode immediates that ix86_fixup_binary_operands
+     promotes to SImode registers, so that later passes cannot
+     reintroduce them.  This condition must stay identical to the
+     one above.  */
+  if (TARGET_PROMOTE_HI_CONSTANTS && mode == HImode && CONST_INT_P (src2)
+      && (INTVAL (src2) < -128 || INTVAL (src2) > 127)
+      && (code != AND
+	  || ((INTVAL (src2) & 0xffff) != 0xff
+	      && (INTVAL (src2) & 0xffff) != 0xff00)))
+    return 0;
+
   return 1;
 }
@@ -20495,6 +20590,7 @@ ix86_issue_rate (void)
return 3;
case PROCESSOR_CORE2:
+ case PROCESSOR_COREI7: /* like Core 2, issues up to 4 insns per cycle */
return 4;
default:
@@ -20569,6 +20665,7 @@ ix86_adjust_cost (rtx insn, rtx link, rt
 {
   enum attr_type insn_type, dep_insn_type;
   enum attr_memory memory;
+  enum attr_i7_domain domain1, domain2;
   rtx set, set2;
   int dep_insn_code_number;
@@ -20711,6 +20808,24 @@ ix86_adjust_cost (rtx insn, rtx link, rt
       else
 	cost = 0;
     }
+      break;
+
+    case PROCESSOR_COREI7:
+      /* Charge the forwarding (bypass) delay between execution domains:
+	 a result produced in one domain and consumed in another pays
+	 extra latency.  Address-generation dependencies are excluded --
+	 they are not a bypass between execution units.  INT<->SIMD
+	 forwarding costs one extra cycle; any other domain crossing
+	 costs two.  (A dead "memory = get_attr_memory (insn);" from the
+	 earlier revision was removed: its value was never used.)  */
+      domain1 = get_attr_i7_domain (insn);
+      domain2 = get_attr_i7_domain (dep_insn);
+      if (domain1 != domain2
+	  && !ix86_agi_dependent (dep_insn, insn))
+	cost += ((domain1 == I7_DOMAIN_SIMD && domain2 == I7_DOMAIN_INT)
+		 || (domain1 == I7_DOMAIN_INT && domain2 == I7_DOMAIN_SIMD)
+		 ? 1 : 2);
+      break;
 
     default:
       break;