From a6eb26cf6f3dbbb726fd6e5e9f02f222df40cdf8 Mon Sep 17 00:00:00 2001 From: pfg Date: Tue, 12 Jun 2012 15:04:18 +0000 Subject: Add experimental support for amdfam10/barcelona from the GCC 4.3 branch. Initial support for the AMD barcelona chipsets has been available in the gcc43 branch under GPLv2 but was not included when the Core 2 support was brought to the system gcc. AMD and some linux distributions (OpenSUSE) did a backport of the amdfam10 support and made them available. Unfortunately this is still experimental and while it can improve performance, enabling the CPUTYPE may break some C++ ports (like clang). Special care was taken to make sure that the patches predate the GPLv3 switch upstream. Tested by: Vladimir Kushnir Reviewed by: mm Approved by: jhb (mentor) MFC after: 2 weeks --- contrib/gcc/ChangeLog.gcc43 | 273 ++++++++++++++++++- contrib/gcc/builtins.c | 18 ++ contrib/gcc/config.gcc | 16 +- contrib/gcc/config/i386/ammintrin.h | 73 +++++ contrib/gcc/config/i386/athlon.md | 378 +++++++++++++++++++++++--- contrib/gcc/config/i386/driver-i386.c | 11 +- contrib/gcc/config/i386/i386.c | 495 +++++++++++++++++++++++++++++----- contrib/gcc/config/i386/i386.h | 132 ++++----- contrib/gcc/config/i386/i386.md | 360 +++++++++++++++++++++---- contrib/gcc/config/i386/i386.opt | 18 +- contrib/gcc/config/i386/mmx.md | 8 +- contrib/gcc/config/i386/pmmintrin.h | 8 +- contrib/gcc/config/i386/sse.md | 133 ++++++++- contrib/gcc/config/i386/tmmintrin.h | 230 +--------------- contrib/gcc/doc/extend.texi | 17 ++ contrib/gcc/doc/invoke.texi | 14 +- contrib/gcc/fold-const.c | 8 +- contrib/gcc/gimplify.c | 4 +- contrib/gcc/tree-ssa-ccp.c | 4 +- contrib/gcc/tree-ssa-pre.c | 17 ++ 20 files changed, 1724 insertions(+), 493 deletions(-) create mode 100644 contrib/gcc/config/i386/ammintrin.h (limited to 'contrib/gcc') diff --git a/contrib/gcc/ChangeLog.gcc43 b/contrib/gcc/ChangeLog.gcc43 index 23b3a39..d15236b0 100644 --- a/contrib/gcc/ChangeLog.gcc43 +++ b/contrib/gcc/ChangeLog.gcc43 @@ -1,3 +1,8 @@ +2007-05-01 Dwarakanath Rajagopal + + * doc/invoke.texi: Fix typo, 'AMD Family 10h core' instead of + 'AMD Family 10 core'. + 2007-05-01 Dwarakanath Rajagopal (r124339) * config/i386/i386.c (override_options): Accept k8-sse3, opteron-sse3 @@ -5,10 +10,39 @@ with SSE3 instruction set support. * doc/invoke.texi: Likewise. +2007-05-01 Dwarakanath Rajagopal + + * config/i386/i386.c (override_options): Tuning 32-byte loop + alignment for amdfam10 architecture. Increasing the max loop + alignment to 24 bytes. + +2007-04-12 Richard Guenther + + PR tree-optimization/24689 + PR tree-optimization/31307 + * fold-const.c (operand_equal_p): Compare INTEGER_CST array + indices by value. + * gimplify.c (canonicalize_addr_expr): To be consistent with + gimplify_compound_lval only set operands two and three of + ARRAY_REFs if they are not gimple_min_invariant. This makes + it never at this place. + * tree-ssa-ccp.c (maybe_fold_offset_to_array_ref): Likewise. + 2007-04-07 H.J. Lu (r123639) * config/i386/i386.c (ix86_handle_option): Handle SSSE3. +2007-03-28 Dwarakanath Rajagopal + + * config.gcc: Accept barcelona as a variant of amdfam10. + * config/i386/i386.c (override_options): Likewise. + * doc/invoke.texi: Likewise. + +2007-02-09 Dwarakanath Rajagopal + + * config/i386/driver-i386.c: Turn on -mtune=native for AMDFAM10. + (bit_SSE4a): New. + 2007-02-08 Harsha Jagasia (r121726) * config/i386/xmmintrin.h: Make inclusion of emmintrin.h @@ -26,6 +60,173 @@ * config/i386/i386.c (override_options): Set PTA_SSSE3 for core2. +2007-02-05 Harsha Jagasia + + * config/i386/athlon.md (athlon_fldxf_k8, athlon_fld_k8, + athlon_fstxf_k8, athlon_fst_k8, athlon_fist, athlon_fmov, + athlon_fadd_load, athlon_fadd_load_k8, athlon_fadd, athlon_fmul, + athlon_fmul_load, athlon_fmul_load_k8, athlon_fsgn, + athlon_fdiv_load, athlon_fdiv_load_k8, athlon_fdiv_k8, + athlon_fpspc_load, athlon_fpspc, athlon_fcmov_load, + athlon_fcmov_load_k8, athlon_fcmov_k8, athlon_fcomi_load_k8, + athlon_fcomi, athlon_fcom_load_k8, athlon_fcom): Added amdfam10. + +2007-02-05 Harsha Jagasia + + * config/i386/i386.md (x86_sahf_1, cmpfp_i_mixed, cmpfp_i_sse, + cmpfp_i_i387, cmpfp_iu_mixed, cmpfp_iu_sse, cmpfp_iu_387, + swapsi, swaphi_1, swapqi_1, swapdi_rex64, fix_truncsfdi_sse, + fix_truncdfdi_sse, fix_truncsfsi_sse, fix_truncdfsi_sse, + x86_fldcw_1, floatsisf2_mixed, floatsisf2_sse, floatdisf2_mixed, + floatdisf2_sse, floatsidf2_mixed, floatsidf2_sse, + floatdidf2_mixed, floatdidf2_sse, muldi3_1_rex64, mulsi3_1, + mulsi3_1_zext, mulhi3_1, mulqi3_1, umulqihi3_1, mulqihi3_insn, + umulditi3_insn, umulsidi3_insn, mulditi3_insn, mulsidi3_insn, + umuldi3_highpart_rex64, umulsi3_highpart_insn, + umulsi3_highpart_zext, smuldi3_highpart_rex64, + smulsi3_highpart_insn, smulsi3_highpart_zext, x86_64_shld, + x86_shld_1, x86_64_shrd, sqrtsf2_mixed, sqrtsf2_sse, + sqrtsf2_i387, sqrtdf2_mixed, sqrtdf2_sse, sqrtdf2_i387, + sqrtextendsfdf2_i387, sqrtxf2, sqrtextendsfxf2_i387, + sqrtextenddfxf2_i387): Added amdfam10_decode. + + * config/i386/athlon.md (athlon_idirect_amdfam10, + athlon_ivector_amdfam10, athlon_idirect_load_amdfam10, + athlon_ivector_load_amdfam10, athlon_idirect_both_amdfam10, + athlon_ivector_both_amdfam10, athlon_idirect_store_amdfam10, + athlon_ivector_store_amdfam10): New define_insn_reservation. + (athlon_idirect_loadmov, athlon_idirect_movstore): Added + amdfam10. + +2007-02-05 Harsha Jagasia + + * config/i386/athlon.md (athlon_call_amdfam10, + athlon_pop_amdfam10, athlon_lea_amdfam10): New + define_insn_reservation. + (athlon_branch, athlon_push, athlon_leave_k8, athlon_imul_k8, + athlon_imul_k8_DI, athlon_imul_mem_k8, athlon_imul_mem_k8_DI, + athlon_idiv, athlon_idiv_mem, athlon_str): Added amdfam10. + +2007-02-05 Harsha Jagasia + + * config/i386/athlon.md (athlon_sseld_amdfam10, + athlon_mmxld_amdfam10, athlon_ssest_amdfam10, + athlon_mmxssest_short_amdfam10): New define_insn_reservation. + +2007-02-05 Harsha Jagasia + + * config/i386/athlon.md (athlon_sseins_amdfam10): New + define_insn_reservation. + * config/i386/i386.md (sseins): Added sseins to define_attr type + and define_attr unit. + * config/i386/sse.md: Set type attribute to sseins for insertq + and insertqi. + +2007-02-05 Harsha Jagasia + + * config/i386/athlon.md (sselog_load_amdfam10, sselog_amdfam10, + ssecmpvector_load_amdfam10, ssecmpvector_amdfam10, + ssecomi_load_amdfam10, ssecomi_amdfam10, + sseaddvector_load_amdfam10, sseaddvector_amdfam10): New + define_insn_reservation. + (ssecmp_load_k8, ssecmp, sseadd_load_k8, seadd): Added amdfam10. + +2007-02-05 Harsha Jagasia + + * config/i386/athlon.md (cvtss2sd_load_amdfam10, + cvtss2sd_amdfam10, cvtps2pd_load_amdfam10, cvtps2pd_amdfam10, + cvtsi2sd_load_amdfam10, cvtsi2ss_load_amdfam10, + cvtsi2sd_amdfam10, cvtsi2ss_amdfam10, cvtsd2ss_load_amdfam10, + cvtsd2ss_amdfam10, cvtpd2ps_load_amdfam10, cvtpd2ps_amdfam10, + cvtsX2si_load_amdfam10, cvtsX2si_amdfam10): New + define_insn_reservation. + + * config/i386/sse.md (cvtsi2ss, cvtsi2ssq, cvtss2si, + cvtss2siq, cvttss2si, cvttss2siq, cvtsi2sd, cvtsi2sdq, + cvtsd2si, cvtsd2siq, cvttsd2si, cvttsd2siq, + cvtpd2dq, cvttpd2dq, cvtsd2ss, cvtss2sd, + cvtpd2ps, cvtps2pd): Added amdfam10_decode attribute. + +2007-02-05 Harsha Jagasia + + * config/i386/athlon.md (athlon_ssedivvector_amdfam10, + athlon_ssedivvector_load_amdfam10, athlon_ssemulvector_amdfam10, + athlon_ssemulvector_load_amdfam10): New define_insn_reservation. + (athlon_ssediv, athlon_ssediv_load_k8, athlon_ssemul, + athlon_ssemul_load_k8): Added amdfam10. + +2007-02-05 Harsha Jagasia + + * config/i386/i386.h (TARGET_SSE_UNALIGNED_MOVE_OPTIMAL): New macro. + (x86_sse_unaligned_move_optimal): New variable. + + * config/i386/i386.c (x86_sse_unaligned_move_optimal): Enable for + m_AMDFAM10. + (ix86_expand_vector_move_misalign): Add code to generate movupd/movups + for unaligned vector SSE double/single precision loads for AMDFAM10. + +2007-02-05 Harsha Jagasia + + * config/i386/i386.h (TARGET_AMDFAM10): New macro. + (TARGET_CPU_CPP_BUILTINS): Add code for amdfam10. + Define TARGET_CPU_DEFAULT_amdfam10. + (TARGET_CPU_DEFAULT_NAMES): Add amdfam10. + (processor_type): Add PROCESSOR_AMDFAM10. + + * config/i386/i386.md: Add amdfam10 as a new cpu attribute to match + processor_type in config/i386/i386.h. + Enable imul peepholes for TARGET_AMDFAM10. + + * config.gcc: Add support for --with-cpu option for amdfam10. + + * config/i386/i386.c (amdfam10_cost): New variable. + (m_AMDFAM10): New macro. + (m_ATHLON_K8_AMDFAM10): New macro. + (x86_use_leave, x86_push_memory, x86_movx, x86_unroll_strlen, + x86_cmove, x86_3dnow_a, x86_deep_branch, x86_use_simode_fiop, + x86_promote_QImode, x86_integer_DFmode_moves, + x86_partial_reg_dependency, x86_memory_mismatch_stall, + x86_accumulate_outgoing_args, x86_arch_always_fancy_math_387, + x86_sse_partial_reg_dependency, x86_sse_typeless_stores, + x86_use_ffreep, x86_use_incdec, x86_four_jump_limit, + x86_schedule, x86_use_bt, x86_cmpxchg16b, x86_pad_returns): + Enable/disable for amdfam10. + (override_options): Add amdfam10_cost to processor_target_table. + Set up PROCESSOR_AMDFAM10 for amdfam10 entry in + processor_alias_table. + (ix86_issue_rate): Add PROCESSOR_AMDFAM10. + (ix86_adjust_cost): Add code for amdfam10. + +2007-02-05 Harsha Jagasia + + * config/i386/i386.opt: Add new Advanced Bit Manipulation (-mabm) + instruction set feature flag. Add new (-mpopcnt) flag for popcnt + instruction. Add new SSE4A (-msse4a) instruction set feature flag. + * config/i386/i386.h: Add builtin definition for SSE4A. + * config/i386/i386.md: Add support for ABM instructions + (popcnt and lzcnt). + * config/i386/sse.md: Add support for SSE4A instructions + (movntss, movntsd, extrq, insertq). + * config/i386/i386.c: Add support for ABM and SSE4A builtins. + Add -march=amdfam10 flag. + * config/i386/ammintrin.h: Add support for SSE4A intrinsics. + * doc/invoke.texi: Add documentation on flags for sse4a, abm, popcnt + and amdfam10. + * doc/extend.texi: Add documentation for SSE4A builtins. + +2007-01-24 Jakub Jelinek + + * config/i386/i386.h (x86_cmpxchg16b): Remove const. + (TARGET_CMPXCHG16B): Define to x86_cmpxchg16b. + * config/i386/i386.c (x86_cmpxchg16b): Remove const. + (override_options): Add PTA_CX16 flag. Set x86_cmpxchg16b + for CPUs that have PTA_CX16 set. + +2007-01-18 Michael Meissner + + * config/i386/i386.c (ix86_compute_frame_layout): Make fprintf's + in #if 0 code type correct. + 2007-01-17 Eric Christopher (r120846) * config.gcc: Support core2 processor. @@ -62,7 +263,30 @@ x86_pad_returns): Add m_CORE2. (override_options): Add entries for Core2. (ix86_issue_rate): Add case for Core2. + +2006-10-28 Uros Bizjak + + * config/i386/i386.h (GENERAL_REGNO_P): Use STACK_POINTER_REGNUM. + (NON_QI_REG_P): Use IN_RANGE. + (REX_INT_REGNO_P): Use IN_RANGE. + (FP_REGNO_P): Use IN_RANGE. + (SSE_REGNO_P): Use IN_RANGE. + (REX_SSE_REGNO_P): Use IN_RANGE. + (MMX_REGNO_P): Use IN_RANGE. + (STACK_REGNO_P): New macro. + (STACK_REG_P): Use STACK_REGNO_P. + (NON_STACK_REG_P): Use STACK_REGNO_P. + (REGNO_OK_FOR_INDEX_P): Use REX_INT_REGNO_P. + (REGNO_OK_FOR_BASE_P): Use GENERAL_REGNO_P. + (REG_OK_FOR_INDEX_NONSTRICT_P): Use REX_INT_REGNO_P. + (REG_OK_FOR_BASE_NONSTRICT_P): Use GENERAL_REGNO_P. + (HARD_REGNO_RENAME_OK): Use !IN_RANGE. +2006-10-28 Uros Bizjak + + * config/i386/i386.c (output_387_ffreep): Create output from a + template string for !HAVE_AS_IX86_FFREEP. + 2006-10-27 Vladimir Makarov (r118090) * config/i386/i386.h (TARGET_GEODE): @@ -95,7 +319,31 @@ * config/i386/geode.md: New file. * doc/invoke.texi: Add entry about geode processor. - + +2006-10-24 Uros Bizjak + + * config/i386/i386.h (FIRST_PSEUDO_REGISTER): Define to 54. + (FIXED_REGISTERS, CALL_USED_REGISTERS): Add fpcr register. + (REG_ALLOC_ORDER): Add one element to allocate fpcr register. + (FRAME_POINTER_REGNUM): Update register number to 21. + (REG_CLASS_CONTENTS): Update contents for added fpcr register. + (HI_REGISTER_NAMES): Add "fpcr" for fpcr register. + + * config/i386/i386.c (regclass_map): Add fpcr entry. + (dbx_register_map, dbx64_register_map, svr4_dbx_register_map): + Add fpcr entry. + (print_reg): Assert REGNO (x) != FPCR_REG. + + * config/i386/i386.md (FPCR_REG, R11_REG): New constants. + (DIRFLAG_REG): Renumber. + (x86_fnstcw_1, x86_fldcw_1): Use FPCR_REG instead of FPSR_REG. + (*sibcall_1_rex64_v, *sibcall_value_1_rex64_v): Use R11_REG. + (sse_prologue_save, *sse_prologue_save_insn): Renumber + hardcoded SSE register numbers. + + * config/i386/mmx.md (mmx_emms, mmx_femms): Renumber + hardcoded MMX register numbers. + 2006-10-24 Richard Guenther PR middle-end/28796 @@ -104,6 +352,17 @@ for deciding optimizations in consistency with fold-const.c (fold_builtin_unordered_cmp): Likewise. +2006-10-24 Richard Guenther + + * builtins.c (fold_builtin_floor): Fold floor (x) where + x is nonnegative to trunc (x). + (fold_builtin_int_roundingfn): Fold lfloor (x) where x is + nonnegative to FIX_TRUNC_EXPR. + +2006-10-22 H.J. Lu + + * config/i386/tmmintrin.h: Remove the duplicated content. + 2006-10-22 H.J. Lu (r117958) * config.gcc (i[34567]86-*-*): Add tmmintrin.h to extra_headers. @@ -170,6 +429,18 @@ * doc/invoke.texi: Document -mssse3/-mno-ssse3 switches. +2006-10-21 H.J. Lu + + * config/i386/i386.md (UNSPEC_LDQQU): Renamed to ... + (UNSPEC_LDDQU): This. + * config/i386/sse.md (sse3_lddqu): Updated. + +2006-10-21 Richard Guenther + + PR tree-optimization/3511 + * tree-ssa-pre.c (phi_translate): Fold CALL_EXPRs that + got new invariant arguments during PHI translation. + 2006-10-21 Richard Guenther * builtins.c (fold_builtin_classify): Fix typo. diff --git a/contrib/gcc/builtins.c b/contrib/gcc/builtins.c index a65d725..3321821 100644 --- a/contrib/gcc/builtins.c +++ b/contrib/gcc/builtins.c @@ -7355,6 +7355,12 @@ fold_builtin_ceil (tree fndecl, tree arglist) } } + /* Fold floor (x) where x is nonnegative to trunc (x). */ + if (tree_expr_nonnegative_p (arg)) + return build_function_call_expr (mathfn_built_in (TREE_TYPE (arg), + BUILT_IN_TRUNC), + arglist); + return fold_trunc_transparent_mathfn (fndecl, arglist); } @@ -7442,6 +7448,18 @@ fold_builtin_int_roundingfn (tree fndecl, tree arglist) } } + switch (DECL_FUNCTION_CODE (fndecl)) + { + CASE_FLT_FN (BUILT_IN_LFLOOR): + CASE_FLT_FN (BUILT_IN_LLFLOOR): + /* Fold lfloor (x) where x is nonnegative to FIX_TRUNC (x). */ + if (tree_expr_nonnegative_p (arg)) + return fold_build1 (FIX_TRUNC_EXPR, TREE_TYPE (TREE_TYPE (fndecl)), + arg); + break; + default:; + } + return fold_fixed_mathfn (fndecl, arglist); } diff --git a/contrib/gcc/config.gcc b/contrib/gcc/config.gcc index 4406593..3e4e596 100644 --- a/contrib/gcc/config.gcc +++ b/contrib/gcc/config.gcc @@ -269,12 +269,12 @@ xscale-*-*) i[34567]86-*-*) cpu_type=i386 extra_headers="mmintrin.h mm3dnow.h xmmintrin.h emmintrin.h - pmmintrin.h tmmintrin.h" + pmmintrin.h tmmintrin.h ammintrin.h" ;; x86_64-*-*) cpu_type=i386 extra_headers="mmintrin.h mm3dnow.h xmmintrin.h emmintrin.h - pmmintrin.h tmmintrin.h" + pmmintrin.h tmmintrin.h ammintrin.h" need_64bit_hwint=yes ;; ia64-*-*) @@ -1209,14 +1209,14 @@ i[34567]86-*-solaris2*) # FIXME: -m64 for i[34567]86-*-* should be allowed just # like -m32 for x86_64-*-*. case X"${with_cpu}" in - Xgeneric|Xcore2|Xnocona|Xx86-64|Xk8|Xopteron|Xathlon64|Xathlon-fx) + Xgeneric|Xcore2|Xnocona|Xx86-64|Xamdfam10|Xbarcelona|Xk8|Xopteron|Xathlon64|Xathlon-fx) ;; X) with_cpu=generic ;; *) echo "Unsupported CPU used in --with-cpu=$with_cpu, supported values:" 1>&2 - echo "generic core2 nocona x86-64 k8 opteron athlon64 athlon-fx" 1>&2 + echo "generic core2 nocona x86-64amd fam10 barcelona k8 opteron athlon64 athlon-fx" 1>&2 exit 1 ;; esac @@ -2515,6 +2515,9 @@ if test x$with_cpu = x ; then ;; i686-*-* | i786-*-*) case ${target_noncanonical} in + amdfam10-*|barcelona-*) + with_cpu=amdfam10 + ;; k8-*|opteron-*|athlon_64-*) with_cpu=k8 ;; @@ -2555,6 +2558,9 @@ if test x$with_cpu = x ; then ;; x86_64-*-*) case ${target_noncanonical} in + amdfam10-*|barcelona-*) + with_cpu=amdfam10 + ;; k8-*|opteron-*|athlon_64-*) with_cpu=k8 ;; @@ -2795,7 +2801,7 @@ case "${target}" in esac # OK ;; - "" | k8 | opteron | athlon64 | athlon-fx | nocona | core2 | generic) + "" | amdfam10 | barcelona | k8 | opteron | athlon64 | athlon-fx | nocona | core2 | generic) # OK ;; *) diff --git a/contrib/gcc/config/i386/ammintrin.h b/contrib/gcc/config/i386/ammintrin.h new file mode 100644 index 0000000..869c288 --- /dev/null +++ b/contrib/gcc/config/i386/ammintrin.h @@ -0,0 +1,73 @@ +/* Copyright (C) 2007 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with GCC; see the file COPYING. If not, write to + the Free Software Foundation, 51 Franklin Street, Fifth Floor, + Boston, MA 02110-1301, USA. */ + +/* As a special exception, if you include this header file into source + files compiled by GCC, this header file does not by itself cause + the resulting executable to be covered by the GNU General Public + License. This exception does not however invalidate any other + reasons why the executable file might be covered by the GNU General + Public License. */ + +/* Implemented from the specification included in the AMD Programmers + Manual Update, version 2.x */ + +#ifndef _AMMINTRIN_H_INCLUDED +#define _AMMINTRIN_H_INCLUDED + +#ifndef __SSE4A__ +# error "SSE4A instruction set not enabled" +#else + +/* We need definitions from the SSE3, SSE2 and SSE header files*/ +#include + +static __inline void __attribute__((__always_inline__)) +_mm_stream_sd (double * __P, __m128d __Y) +{ + __builtin_ia32_movntsd (__P, (__v2df) __Y); +} + +static __inline void __attribute__((__always_inline__)) +_mm_stream_ss (float * __P, __m128 __Y) +{ + __builtin_ia32_movntss (__P, (__v4sf) __Y); +} + +static __inline __m128i __attribute__((__always_inline__)) +_mm_extract_si64 (__m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_extrq ((__v2di) __X, (__v16qi) __Y); +} + +#define _mm_extracti_si64(X, I, L) \ +((__m128i) __builtin_ia32_extrqi ((__v2di)(X), I, L)) + +static __inline __m128i __attribute__((__always_inline__)) +_mm_insert_si64 (__m128i __X,__m128i __Y) +{ + return (__m128i) __builtin_ia32_insertq ((__v2di)__X, (__v2di)__Y); +} + +#define _mm_inserti_si64(X, Y, I, L) \ +((__m128i) __builtin_ia32_insertqi ((__v2di)(X), (__v2di)(Y), I, L)) + + +#endif /* __SSE4A__ */ + +#endif /* _AMMINTRIN_H_INCLUDED */ diff --git a/contrib/gcc/config/i386/athlon.md b/contrib/gcc/config/i386/athlon.md index 6d92b94..a52f9bc 100644 --- a/contrib/gcc/config/i386/athlon.md +++ b/contrib/gcc/config/i386/athlon.md @@ -29,6 +29,8 @@ (const_string "vector")] (const_string "direct"))) +(define_attr "amdfam10_decode" "direct,vector,double" + (const_string "direct")) ;; ;; decode0 decode1 decode2 ;; \ | / @@ -131,18 +133,22 @@ ;; Jump instructions are executed in the branch unit completely transparent to us (define_insn_reservation "athlon_branch" 0 - (and (eq_attr "cpu" "athlon,k8,generic64") + (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10") (eq_attr "type" "ibr")) "athlon-direct,athlon-ieu") (define_insn_reservation "athlon_call" 0 (and (eq_attr "cpu" "athlon,k8,generic64") (eq_attr "type" "call,callv")) "athlon-vector,athlon-ieu") +(define_insn_reservation "athlon_call_amdfam10" 0 + (and (eq_attr "cpu" "amdfam10") + (eq_attr "type" "call,callv")) + "athlon-double,athlon-ieu") ;; Latency of push operation is 3 cycles, but ESP value is available ;; earlier (define_insn_reservation "athlon_push" 2 - (and (eq_attr "cpu" "athlon,k8,generic64") + (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10") (eq_attr "type" "push")) "athlon-direct,athlon-agu,athlon-store") (define_insn_reservation "athlon_pop" 4 @@ -153,12 +159,16 @@ (and (eq_attr "cpu" "k8,generic64") (eq_attr "type" "pop")) "athlon-double,(athlon-ieu+athlon-load)") +(define_insn_reservation "athlon_pop_amdfam10" 3 + (and (eq_attr "cpu" "amdfam10") + (eq_attr "type" "pop")) + "athlon-direct,(athlon-ieu+athlon-load)") (define_insn_reservation "athlon_leave" 3 (and (eq_attr "cpu" "athlon") (eq_attr "type" "leave")) "athlon-vector,(athlon-ieu+athlon-load)") (define_insn_reservation "athlon_leave_k8" 3 - (and (eq_attr "cpu" "k8,generic64") + (and (eq_attr "cpu" "k8,generic64,amdfam10") (eq_attr "type" "leave")) "athlon-double,(athlon-ieu+athlon-load)") @@ -167,6 +177,11 @@ (and (eq_attr "cpu" "athlon,k8,generic64") (eq_attr "type" "lea")) "athlon-direct,athlon-agu,nothing") +;; Lea executes in AGU unit with 1 cycle latency on AMDFAM10 +(define_insn_reservation "athlon_lea_amdfam10" 1 + (and (eq_attr "cpu" "amdfam10") + (eq_attr "type" "lea")) + "athlon-direct,athlon-agu,nothing") ;; Mul executes in special multiplier unit attached to IEU0 (define_insn_reservation "athlon_imul" 5 @@ -176,29 +191,35 @@ "athlon-vector,athlon-ieu0,athlon-mult,nothing,nothing,athlon-ieu0") ;; ??? Widening multiply is vector or double. (define_insn_reservation "athlon_imul_k8_DI" 4 - (and (eq_attr "cpu" "k8,generic64") + (and (eq_attr "cpu" "k8,generic64,amdfam10") (and (eq_attr "type" "imul") (and (eq_attr "mode" "DI") (eq_attr "memory" "none,unknown")))) "athlon-direct0,athlon-ieu0,athlon-mult,nothing,athlon-ieu0") (define_insn_reservation "athlon_imul_k8" 3 - (and (eq_attr "cpu" "k8,generic64") + (and (eq_attr "cpu" "k8,generic64,amdfam10") (and (eq_attr "type" "imul") (eq_attr "memory" "none,unknown"))) "athlon-direct0,athlon-ieu0,athlon-mult,athlon-ieu0") +(define_insn_reservation "athlon_imul_amdfam10_HI" 4 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "imul") + (and (eq_attr "mode" "HI") + (eq_attr "memory" "none,unknown")))) + "athlon-vector,athlon-ieu0,athlon-mult,nothing,athlon-ieu0") (define_insn_reservation "athlon_imul_mem" 8 (and (eq_attr "cpu" "athlon") (and (eq_attr "type" "imul") (eq_attr "memory" "load,both"))) "athlon-vector,athlon-load,athlon-ieu,athlon-mult,nothing,nothing,athlon-ieu") (define_insn_reservation "athlon_imul_mem_k8_DI" 7 - (and (eq_attr "cpu" "k8,generic64") + (and (eq_attr "cpu" "k8,generic64,amdfam10") (and (eq_attr "type" "imul") (and (eq_attr "mode" "DI") (eq_attr "memory" "load,both")))) "athlon-vector,athlon-load,athlon-ieu,athlon-mult,nothing,athlon-ieu") (define_insn_reservation "athlon_imul_mem_k8" 6 - (and (eq_attr "cpu" "k8,generic64") + (and (eq_attr "cpu" "k8,generic64,amdfam10") (and (eq_attr "type" "imul") (eq_attr "memory" "load,both"))) "athlon-vector,athlon-load,athlon-ieu,athlon-mult,athlon-ieu") @@ -209,21 +230,23 @@ ;; other instructions. ;; ??? Experiments show that the idiv can overlap with roughly 6 cycles ;; of the other code +;; Using the same heuristics for amdfam10 as K8 with idiv (define_insn_reservation "athlon_idiv" 6 - (and (eq_attr "cpu" "athlon,k8,generic64") + (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10") (and (eq_attr "type" "idiv") (eq_attr "memory" "none,unknown"))) "athlon-vector,(athlon-ieu0*6+(athlon-fpsched,athlon-fvector))") (define_insn_reservation "athlon_idiv_mem" 9 - (and (eq_attr "cpu" "athlon,k8,generic64") + (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10") (and (eq_attr "type" "idiv") (eq_attr "memory" "load,both"))) "athlon-vector,((athlon-load,athlon-ieu0*6)+(athlon-fpsched,athlon-fvector))") ;; The parallelism of string instructions is not documented. Model it same way ;; as idiv to create smaller automata. This probably does not matter much. +;; Using the same heuristics for amdfam10 as K8 with idiv (define_insn_reservation "athlon_str" 6 - (and (eq_attr "cpu" "athlon,k8,generic64") + (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10") (and (eq_attr "type" "str") (eq_attr "memory" "load,both,store"))) "athlon-vector,athlon-load,athlon-ieu0*6") @@ -234,34 +257,62 @@ (and (eq_attr "unit" "integer,unknown") (eq_attr "memory" "none,unknown")))) "athlon-direct,athlon-ieu") +(define_insn_reservation "athlon_idirect_amdfam10" 1 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "amdfam10_decode" "direct") + (and (eq_attr "unit" "integer,unknown") + (eq_attr "memory" "none,unknown")))) + "athlon-direct,athlon-ieu") (define_insn_reservation "athlon_ivector" 2 (and (eq_attr "cpu" "athlon,k8,generic64") (and (eq_attr "athlon_decode" "vector") (and (eq_attr "unit" "integer,unknown") (eq_attr "memory" "none,unknown")))) "athlon-vector,athlon-ieu,athlon-ieu") +(define_insn_reservation "athlon_ivector_amdfam10" 2 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "amdfam10_decode" "vector") + (and (eq_attr "unit" "integer,unknown") + (eq_attr "memory" "none,unknown")))) + "athlon-vector,athlon-ieu,athlon-ieu") + (define_insn_reservation "athlon_idirect_loadmov" 3 - (and (eq_attr "cpu" "athlon,k8,generic64") + (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10") (and (eq_attr "type" "imov") (eq_attr "memory" "load"))) "athlon-direct,athlon-load") + (define_insn_reservation "athlon_idirect_load" 4 (and (eq_attr "cpu" "athlon,k8,generic64") (and (eq_attr "athlon_decode" "direct") (and (eq_attr "unit" "integer,unknown") (eq_attr "memory" "load")))) "athlon-direct,athlon-load,athlon-ieu") +(define_insn_reservation "athlon_idirect_load_amdfam10" 4 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "amdfam10_decode" "direct") + (and (eq_attr "unit" "integer,unknown") + (eq_attr "memory" "load")))) + "athlon-direct,athlon-load,athlon-ieu") (define_insn_reservation "athlon_ivector_load" 6 (and (eq_attr "cpu" "athlon,k8,generic64") (and (eq_attr "athlon_decode" "vector") (and (eq_attr "unit" "integer,unknown") (eq_attr "memory" "load")))) "athlon-vector,athlon-load,athlon-ieu,athlon-ieu") +(define_insn_reservation "athlon_ivector_load_amdfam10" 6 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "amdfam10_decode" "vector") + (and (eq_attr "unit" "integer,unknown") + (eq_attr "memory" "load")))) + "athlon-vector,athlon-load,athlon-ieu,athlon-ieu") + (define_insn_reservation "athlon_idirect_movstore" 1 - (and (eq_attr "cpu" "athlon,k8,generic64") + (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10") (and (eq_attr "type" "imov") (eq_attr "memory" "store"))) "athlon-direct,athlon-agu,athlon-store") + (define_insn_reservation "athlon_idirect_both" 4 (and (eq_attr "cpu" "athlon,k8,generic64") (and (eq_attr "athlon_decode" "direct") @@ -270,6 +321,15 @@ "athlon-direct,athlon-load, athlon-ieu,athlon-store, athlon-store") +(define_insn_reservation "athlon_idirect_both_amdfam10" 4 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "amdfam10_decode" "direct") + (and (eq_attr "unit" "integer,unknown") + (eq_attr "memory" "both")))) + "athlon-direct,athlon-load, + athlon-ieu,athlon-store, + athlon-store") + (define_insn_reservation "athlon_ivector_both" 6 (and (eq_attr "cpu" "athlon,k8,generic64") (and (eq_attr "athlon_decode" "vector") @@ -279,6 +339,16 @@ athlon-ieu, athlon-ieu, athlon-store") +(define_insn_reservation "athlon_ivector_both_amdfam10" 6 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "amdfam10_decode" "vector") + (and (eq_attr "unit" "integer,unknown") + (eq_attr "memory" "both")))) + "athlon-vector,athlon-load, + athlon-ieu, + athlon-ieu, + athlon-store") + (define_insn_reservation "athlon_idirect_store" 1 (and (eq_attr "cpu" "athlon,k8,generic64") (and (eq_attr "athlon_decode" "direct") @@ -286,6 +356,14 @@ (eq_attr "memory" "store")))) "athlon-direct,(athlon-ieu+athlon-agu), athlon-store") +(define_insn_reservation "athlon_idirect_store_amdfam10" 1 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "amdfam10_decode" "direct") + (and (eq_attr "unit" "integer,unknown") + (eq_attr "memory" "store")))) + "athlon-direct,(athlon-ieu+athlon-agu), + athlon-store") + (define_insn_reservation "athlon_ivector_store" 2 (and (eq_attr "cpu" "athlon,k8,generic64") (and (eq_attr "athlon_decode" "vector") @@ -293,6 +371,13 @@ (eq_attr "memory" "store")))) "athlon-vector,(athlon-ieu+athlon-agu),athlon-ieu, athlon-store") +(define_insn_reservation "athlon_ivector_store_amdfam10" 2 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "amdfam10_decode" "vector") + (and (eq_attr "unit" "integer,unknown") + (eq_attr "memory" "store")))) + "athlon-vector,(athlon-ieu+athlon-agu),athlon-ieu, + athlon-store") ;; Athlon floatin point unit (define_insn_reservation "athlon_fldxf" 12 @@ -302,7 +387,7 @@ (eq_attr "mode" "XF")))) "athlon-vector,athlon-fpload2,athlon-fvector*9") (define_insn_reservation "athlon_fldxf_k8" 13 - (and (eq_attr "cpu" "k8,generic64") + (and (eq_attr "cpu" "k8,generic64,amdfam10") (and (eq_attr "type" "fmov") (and (eq_attr "memory" "load") (eq_attr "mode" "XF")))) @@ -314,7 +399,7 @@ (eq_attr "memory" "load"))) "athlon-direct,athlon-fpload,athlon-fany") (define_insn_reservation "athlon_fld_k8" 2 - (and (eq_attr "cpu" "k8,generic64") + (and (eq_attr "cpu" "k8,generic64,amdfam10") (and (eq_attr "type" "fmov") (eq_attr "memory" "load"))) "athlon-direct,athlon-fploadk8,athlon-fstore") @@ -326,7 +411,7 @@ (eq_attr "mode" "XF")))) "athlon-vector,(athlon-fpsched+athlon-agu),(athlon-store2+(athlon-fvector*7))") (define_insn_reservation "athlon_fstxf_k8" 8 - (and (eq_attr "cpu" "k8,generic64") + (and (eq_attr "cpu" "k8,generic64,amdfam10") (and (eq_attr "type" "fmov") (and (eq_attr "memory" "store,both") (eq_attr "mode" "XF")))) @@ -337,16 +422,16 @@ (eq_attr "memory" "store,both"))) "athlon-direct,(athlon-fpsched+athlon-agu),(athlon-fstore+athlon-store)") (define_insn_reservation "athlon_fst_k8" 2 - (and (eq_attr "cpu" "k8,generic64") + (and (eq_attr "cpu" "k8,generic64,amdfam10") (and (eq_attr "type" "fmov") (eq_attr "memory" "store,both"))) "athlon-direct,(athlon-fpsched+athlon-agu),(athlon-fstore+athlon-store)") (define_insn_reservation "athlon_fist" 4 - (and (eq_attr "cpu" "athlon,k8,generic64") + (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10") (eq_attr "type" "fistp,fisttp")) "athlon-direct,(athlon-fpsched+athlon-agu),(athlon-fstore+athlon-store)") (define_insn_reservation "athlon_fmov" 2 - (and (eq_attr "cpu" "athlon,k8,generic64") + (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10") (eq_attr "type" "fmov")) "athlon-direct,athlon-fpsched,athlon-faddmul") (define_insn_reservation "athlon_fadd_load" 4 @@ -355,12 +440,12 @@ (eq_attr "memory" "load"))) "athlon-direct,athlon-fpload,athlon-fadd") (define_insn_reservation "athlon_fadd_load_k8" 6 - (and (eq_attr "cpu" "k8,generic64") + (and (eq_attr "cpu" "k8,generic64,amdfam10") (and (eq_attr "type" "fop") (eq_attr "memory" "load"))) "athlon-direct,athlon-fploadk8,athlon-fadd") (define_insn_reservation "athlon_fadd" 4 - (and (eq_attr "cpu" "athlon,k8,generic64") + (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10") (eq_attr "type" "fop")) "athlon-direct,athlon-fpsched,athlon-fadd") (define_insn_reservation "athlon_fmul_load" 4 @@ -369,16 +454,16 @@ (eq_attr "memory" "load"))) "athlon-direct,athlon-fpload,athlon-fmul") (define_insn_reservation "athlon_fmul_load_k8" 6 - (and (eq_attr "cpu" "k8,generic64") + (and (eq_attr "cpu" "k8,generic64,amdfam10") (and (eq_attr "type" "fmul") (eq_attr "memory" "load"))) "athlon-direct,athlon-fploadk8,athlon-fmul") (define_insn_reservation "athlon_fmul" 4 - (and (eq_attr "cpu" "athlon,k8,generic64") + (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10") (eq_attr "type" "fmul")) "athlon-direct,athlon-fpsched,athlon-fmul") (define_insn_reservation "athlon_fsgn" 2 - (and (eq_attr "cpu" "athlon,k8,generic64") + (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10") (eq_attr "type" "fsgn")) "athlon-direct,athlon-fpsched,athlon-fmul") (define_insn_reservation "athlon_fdiv_load" 24 @@ -387,7 +472,7 @@ (eq_attr "memory" "load"))) "athlon-direct,athlon-fpload,athlon-fmul") (define_insn_reservation "athlon_fdiv_load_k8" 13 - (and (eq_attr "cpu" "k8,generic64") + (and (eq_attr "cpu" "k8,generic64,amdfam10") (and (eq_attr "type" "fdiv") (eq_attr "memory" "load"))) "athlon-direct,athlon-fploadk8,athlon-fmul") @@ -396,16 +481,16 @@ (eq_attr "type" "fdiv")) "athlon-direct,athlon-fpsched,athlon-fmul") (define_insn_reservation "athlon_fdiv_k8" 11 - (and (eq_attr "cpu" "k8,generic64") + (and (eq_attr "cpu" "k8,generic64,amdfam10") (eq_attr "type" "fdiv")) "athlon-direct,athlon-fpsched,athlon-fmul") (define_insn_reservation "athlon_fpspc_load" 103 - (and (eq_attr "cpu" "athlon,k8,generic64") + (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10") (and (eq_attr "type" "fpspc") (eq_attr "memory" "load"))) "athlon-vector,athlon-fpload,athlon-fvector") (define_insn_reservation "athlon_fpspc" 100 - (and (eq_attr "cpu" "athlon,k8,generic64") + (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10") (eq_attr "type" "fpspc")) "athlon-vector,athlon-fpsched,athlon-fvector") (define_insn_reservation "athlon_fcmov_load" 7 @@ -418,12 +503,12 @@ (eq_attr "type" "fcmov")) "athlon-vector,athlon-fpsched,athlon-fvector") (define_insn_reservation "athlon_fcmov_load_k8" 17 - (and (eq_attr "cpu" "k8,generic64") + (and (eq_attr "cpu" "k8,generic64,amdfam10") (and (eq_attr "type" "fcmov") (eq_attr "memory" "load"))) "athlon-vector,athlon-fploadk8,athlon-fvector") (define_insn_reservation "athlon_fcmov_k8" 15 - (and (eq_attr "cpu" "k8,generic64") + (and (eq_attr "cpu" "k8,generic64,amdfam10") (eq_attr "type" "fcmov")) "athlon-vector,athlon-fpsched,athlon-fvector") ;; fcomi is vector decoded by uses only one pipe. @@ -434,13 +519,13 @@ (eq_attr "memory" "load")))) "athlon-vector,athlon-fpload,athlon-fadd") (define_insn_reservation "athlon_fcomi_load_k8" 5 - (and (eq_attr "cpu" "k8,generic64") + (and (eq_attr "cpu" "k8,generic64,amdfam10") (and (eq_attr "type" "fcmp") (and (eq_attr "athlon_decode" "vector") (eq_attr "memory" "load")))) "athlon-vector,athlon-fploadk8,athlon-fadd") (define_insn_reservation "athlon_fcomi" 3 - (and (eq_attr "cpu" "athlon,k8,generic64") + (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10") (and (eq_attr "athlon_decode" "vector") (eq_attr "type" "fcmp"))) "athlon-vector,athlon-fpsched,athlon-fadd") @@ -450,18 +535,18 @@ (eq_attr "memory" "load"))) "athlon-direct,athlon-fpload,athlon-fadd") (define_insn_reservation "athlon_fcom_load_k8" 4 - (and (eq_attr "cpu" "k8,generic64") + (and (eq_attr "cpu" "k8,generic64,amdfam10") (and (eq_attr "type" "fcmp") (eq_attr "memory" "load"))) "athlon-direct,athlon-fploadk8,athlon-fadd") (define_insn_reservation "athlon_fcom" 2 - (and (eq_attr "cpu" "athlon,k8,generic64") + (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10") (eq_attr "type" "fcmp")) "athlon-direct,athlon-fpsched,athlon-fadd") ;; Never seen by the scheduler because we still don't do post reg-stack ;; scheduling. ;(define_insn_reservation "athlon_fxch" 2 -; (and (eq_attr "cpu" "athlon,k8,generic64") +; (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10") ; (eq_attr "type" "fxch")) ; "athlon-direct,athlon-fpsched,athlon-fany") @@ -516,6 +601,23 @@ (and (eq_attr "type" "mmxmov,ssemov") (eq_attr "memory" "load"))) "athlon-direct,athlon-fploadk8,athlon-fstore") +;; On AMDFAM10 all double, single and integer packed and scalar SSEx data +;; loads generated are direct path, latency of 2 and do not use any FP +;; executions units. No seperate entries for movlpx/movhpx loads, which +;; are direct path, latency of 4 and use the FADD/FMUL FP execution units, +;; as they will not be generated. +(define_insn_reservation "athlon_sseld_amdfam10" 2 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "ssemov") + (eq_attr "memory" "load"))) + "athlon-direct,athlon-fploadk8") +;; On AMDFAM10 MMX data loads generated are direct path, latency of 4 +;; and can use any FP executions units +(define_insn_reservation "athlon_mmxld_amdfam10" 4 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "mmxmov") + (eq_attr "memory" "load"))) + "athlon-direct,athlon-fploadk8, athlon-fany") (define_insn_reservation "athlon_mmxssest" 3 (and (eq_attr "cpu" "k8,generic64") (and (eq_attr "type" "mmxmov,ssemov") @@ -533,6 +635,25 @@ (and (eq_attr "type" "mmxmov,ssemov") (eq_attr "memory" "store,both"))) "athlon-direct,(athlon-fpsched+athlon-agu),(athlon-fstore+athlon-store)") +;; On AMDFAM10 all double, single and integer packed SSEx data stores +;; generated are all double path, latency of 2 and use the FSTORE FP +;; execution unit. No entries seperate for movupx/movdqu, which are +;; vector path, latency of 3 and use the FSTORE*2 FP execution unit, +;; as they will not be generated. +(define_insn_reservation "athlon_ssest_amdfam10" 2 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "ssemov") + (and (eq_attr "mode" "V4SF,V2DF,TI") + (eq_attr "memory" "store,both")))) + "athlon-double,(athlon-fpsched+athlon-agu),((athlon-fstore+athlon-store)*2)") +;; On AMDFAM10 all double, single and integer scalar SSEx and MMX +;; data stores generated are all direct path, latency of 2 and use +;; the FSTORE FP execution unit +(define_insn_reservation "athlon_mmxssest_short_amdfam10" 2 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "mmxmov,ssemov") + (eq_attr "memory" "store,both"))) + "athlon-direct,(athlon-fpsched+athlon-agu),(athlon-fstore+athlon-store)") (define_insn_reservation "athlon_movaps_k8" 2 (and (eq_attr "cpu" "k8,generic64") (and (eq_attr "type" "ssemov") @@ -578,6 +699,11 @@ (and (eq_attr "type" "sselog,sselog1") (eq_attr "memory" "load"))) "athlon-double,athlon-fpload2k8,(athlon-fmul*2)") +(define_insn_reservation "athlon_sselog_load_amdfam10" 4 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "sselog,sselog1") + (eq_attr "memory" "load"))) + "athlon-direct,athlon-fploadk8,(athlon-fadd|athlon-fmul)") (define_insn_reservation "athlon_sselog" 3 (and (eq_attr "cpu" "athlon") (eq_attr "type" "sselog,sselog1")) @@ -586,6 +712,11 @@ (and (eq_attr "cpu" "k8,generic64") (eq_attr "type" "sselog,sselog1")) "athlon-double,athlon-fpsched,athlon-fmul") +(define_insn_reservation "athlon_sselog_amdfam10" 2 + (and (eq_attr "cpu" "amdfam10") + (eq_attr "type" "sselog,sselog1")) + "athlon-direct,athlon-fpsched,(athlon-fadd|athlon-fmul)") + ;; ??? pcmp executes in addmul, probably not worthwhile to bother about that. (define_insn_reservation "athlon_ssecmp_load" 2 (and (eq_attr "cpu" "athlon") @@ -594,13 +725,13 @@ (eq_attr "memory" "load")))) "athlon-direct,athlon-fpload,athlon-fadd") (define_insn_reservation "athlon_ssecmp_load_k8" 4 - (and (eq_attr "cpu" "k8,generic64") + (and (eq_attr "cpu" "k8,generic64,amdfam10") (and (eq_attr "type" "ssecmp") (and (eq_attr "mode" "SF,DF,DI,TI") (eq_attr "memory" "load")))) "athlon-direct,athlon-fploadk8,athlon-fadd") (define_insn_reservation "athlon_ssecmp" 2 - (and (eq_attr "cpu" "athlon,k8,generic64") + (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10") (and (eq_attr "type" "ssecmp") (eq_attr "mode" "SF,DF,DI,TI"))) "athlon-direct,athlon-fpsched,athlon-fadd") @@ -614,6 +745,11 @@ (and (eq_attr "type" "ssecmp") (eq_attr "memory" "load"))) "athlon-double,athlon-fpload2k8,(athlon-fadd*2)") +(define_insn_reservation "athlon_ssecmpvector_load_amdfam10" 4 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "ssecmp") + (eq_attr "memory" "load"))) + "athlon-direct,athlon-fploadk8,athlon-fadd") (define_insn_reservation "athlon_ssecmpvector" 3 (and (eq_attr "cpu" "athlon") (eq_attr "type" "ssecmp")) @@ -622,6 +758,10 @@ (and (eq_attr "cpu" "k8,generic64") (eq_attr "type" "ssecmp")) "athlon-double,athlon-fpsched,(athlon-fadd*2)") +(define_insn_reservation "athlon_ssecmpvector_amdfam10" 2 + (and (eq_attr "cpu" "amdfam10") + (eq_attr "type" "ssecmp")) + "athlon-direct,athlon-fpsched,athlon-fadd") (define_insn_reservation "athlon_ssecomi_load" 4 (and (eq_attr "cpu" "athlon") (and (eq_attr "type" "ssecomi") @@ -632,10 +772,20 @@ (and (eq_attr "type" "ssecomi") (eq_attr "memory" "load"))) "athlon-vector,athlon-fploadk8,athlon-fadd") +(define_insn_reservation "athlon_ssecomi_load_amdfam10" 5 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "ssecomi") + (eq_attr "memory" "load"))) + "athlon-direct,athlon-fploadk8,athlon-fadd") (define_insn_reservation "athlon_ssecomi" 4 (and (eq_attr "cpu" "athlon,k8,generic64") (eq_attr "type" "ssecmp")) "athlon-vector,athlon-fpsched,athlon-fadd") +(define_insn_reservation "athlon_ssecomi_amdfam10" 3 + (and (eq_attr "cpu" "amdfam10") +;; It seems athlon_ssecomi has a bug in the attr_type, fixed for amdfam10 + (eq_attr "type" "ssecomi")) + "athlon-direct,athlon-fpsched,athlon-fadd") (define_insn_reservation "athlon_sseadd_load" 4 (and (eq_attr "cpu" "athlon") (and (eq_attr "type" "sseadd") @@ -643,13 +793,13 @@ (eq_attr "memory" "load")))) "athlon-direct,athlon-fpload,athlon-fadd") (define_insn_reservation "athlon_sseadd_load_k8" 6 - (and (eq_attr "cpu" "k8,generic64") + (and (eq_attr "cpu" "k8,generic64,amdfam10") (and (eq_attr "type" "sseadd") (and (eq_attr "mode" "SF,DF,DI") (eq_attr "memory" "load")))) "athlon-direct,athlon-fploadk8,athlon-fadd") (define_insn_reservation "athlon_sseadd" 4 - (and (eq_attr "cpu" "athlon,k8,generic64") + (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10") (and (eq_attr "type" "sseadd") (eq_attr "mode" "SF,DF,DI"))) "athlon-direct,athlon-fpsched,athlon-fadd") @@ -663,6 +813,11 @@ (and (eq_attr "type" "sseadd") (eq_attr "memory" "load"))) "athlon-double,athlon-fpload2k8,(athlon-fadd*2)") +(define_insn_reservation "athlon_sseaddvector_load_amdfam10" 6 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "sseadd") + (eq_attr "memory" "load"))) + "athlon-direct,athlon-fploadk8,athlon-fadd") (define_insn_reservation "athlon_sseaddvector" 5 (and (eq_attr "cpu" "athlon") (eq_attr "type" "sseadd")) @@ -671,6 +826,10 @@ (and (eq_attr "cpu" "k8,generic64") (eq_attr "type" "sseadd")) "athlon-double,athlon-fpsched,(athlon-fadd*2)") +(define_insn_reservation "athlon_sseaddvector_amdfam10" 4 + (and (eq_attr "cpu" "amdfam10") + (eq_attr "type" "sseadd")) + "athlon-direct,athlon-fpsched,athlon-fadd") ;; Conversions behaves very irregularly and the scheduling is critical here. ;; Take each instruction separately. Assume that the mode is always set to the @@ -684,12 +843,25 @@ (and (eq_attr "mode" "DF") (eq_attr "memory" "load"))))) "athlon-direct,athlon-fploadk8,athlon-fstore") +(define_insn_reservation "athlon_ssecvt_cvtss2sd_load_amdfam10" 7 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "ssecvt") + (and (eq_attr "amdfam10_decode" "double") + (and (eq_attr "mode" "DF") + (eq_attr "memory" "load"))))) + "athlon-double,athlon-fploadk8,(athlon-faddmul+athlon-fstore)") (define_insn_reservation "athlon_ssecvt_cvtss2sd" 2 (and (eq_attr "cpu" "athlon,k8,generic64") (and (eq_attr "type" "ssecvt") (and (eq_attr "athlon_decode" "direct") (eq_attr "mode" "DF")))) "athlon-direct,athlon-fpsched,athlon-fstore") +(define_insn_reservation "athlon_ssecvt_cvtss2sd_amdfam10" 7 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "ssecvt") + (and (eq_attr "amdfam10_decode" "vector") + (eq_attr "mode" "DF")))) + "athlon-vector,athlon-fpsched,athlon-faddmul,(athlon-fstore*2)") ;; cvtps2pd. Model same way the other double decoded FP conversions. (define_insn_reservation "athlon_ssecvt_cvtps2pd_load_k8" 5 (and (eq_attr "cpu" "k8,athlon,generic64") @@ -698,12 +870,25 @@ (and (eq_attr "mode" "V2DF,V4SF,TI") (eq_attr "memory" "load"))))) "athlon-double,athlon-fpload2k8,(athlon-fstore*2)") +(define_insn_reservation "athlon_ssecvt_cvtps2pd_load_amdfam10" 4 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "ssecvt") + (and (eq_attr "amdfam10_decode" "direct") + (and (eq_attr "mode" "V2DF,V4SF,TI") + (eq_attr "memory" "load"))))) + "athlon-direct,athlon-fploadk8,athlon-fstore") (define_insn_reservation "athlon_ssecvt_cvtps2pd_k8" 3 (and (eq_attr "cpu" "k8,athlon,generic64") (and (eq_attr "type" "ssecvt") (and (eq_attr "athlon_decode" "double") (eq_attr "mode" "V2DF,V4SF,TI")))) "athlon-double,athlon-fpsched,athlon-fstore,athlon-fstore") +(define_insn_reservation "athlon_ssecvt_cvtps2pd_amdfam10" 2 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "ssecvt") + (and (eq_attr "amdfam10_decode" "direct") + (eq_attr "mode" "V2DF,V4SF,TI")))) + "athlon-direct,athlon-fpsched,athlon-fstore") ;; cvtsi2sd mem,reg is directpath path (cvtsi2sd reg,reg is doublepath) ;; cvtsi2sd has troughput 1 and is executed in store unit with latency of 6 (define_insn_reservation "athlon_sseicvt_cvtsi2sd_load" 6 @@ -713,6 +898,13 @@ (and (eq_attr "mode" "SF,DF") (eq_attr "memory" "load"))))) "athlon-direct,athlon-fploadk8,athlon-fstore") +(define_insn_reservation "athlon_sseicvt_cvtsi2sd_load_amdfam10" 9 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "sseicvt") + (and (eq_attr "amdfam10_decode" "double") + (and (eq_attr "mode" "SF,DF") + (eq_attr "memory" "load"))))) + "athlon-double,athlon-fploadk8,(athlon-faddmul+athlon-fstore)") ;; cvtsi2ss mem, reg is doublepath (define_insn_reservation "athlon_sseicvt_cvtsi2ss_load" 9 (and (eq_attr "cpu" "athlon") @@ -728,6 +920,13 @@ (and (eq_attr "mode" "SF,DF") (eq_attr "memory" "load"))))) "athlon-double,athlon-fploadk8,(athlon-fstore*2)") +(define_insn_reservation "athlon_sseicvt_cvtsi2ss_load_amdfam10" 9 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "sseicvt") + (and (eq_attr "amdfam10_decode" "double") + (and (eq_attr "mode" "SF,DF") + (eq_attr "memory" "load"))))) + "athlon-double,athlon-fploadk8,(athlon-faddmul+athlon-fstore)") ;; cvtsi2sd reg,reg is double decoded (vector on Athlon) (define_insn_reservation "athlon_sseicvt_cvtsi2sd_k8" 11 (and (eq_attr "cpu" "k8,athlon,generic64") @@ -736,6 +935,13 @@ (and (eq_attr "mode" "SF,DF") (eq_attr "memory" "none"))))) "athlon-double,athlon-fploadk8,athlon-fstore") +(define_insn_reservation "athlon_sseicvt_cvtsi2sd_amdfam10" 14 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "sseicvt") + (and (eq_attr "amdfam10_decode" "vector") + (and (eq_attr "mode" "SF,DF") + (eq_attr "memory" "none"))))) + "athlon-vector,athlon-fploadk8,(athlon-faddmul+athlon-fstore)") ;; cvtsi2ss reg, reg is doublepath (define_insn_reservation "athlon_sseicvt_cvtsi2ss" 14 (and (eq_attr "cpu" "athlon,k8,generic64") @@ -744,6 +950,13 @@ (and (eq_attr "mode" "SF,DF") (eq_attr "memory" "none"))))) "athlon-vector,athlon-fploadk8,(athlon-fvector*2)") +(define_insn_reservation "athlon_sseicvt_cvtsi2ss_amdfam10" 14 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "sseicvt") + (and (eq_attr "amdfam10_decode" "vector") + (and (eq_attr "mode" "SF,DF") + (eq_attr "memory" "none"))))) + "athlon-vector,athlon-fploadk8,(athlon-faddmul+athlon-fstore)") ;; cvtsd2ss mem,reg is doublepath, troughput unknown, latency 9 (define_insn_reservation "athlon_ssecvt_cvtsd2ss_load_k8" 9 (and (eq_attr "cpu" "k8,athlon,generic64") @@ -752,6 +965,13 @@ (and (eq_attr "mode" "SF") (eq_attr "memory" "load"))))) "athlon-double,athlon-fploadk8,(athlon-fstore*3)") +(define_insn_reservation "athlon_ssecvt_cvtsd2ss_load_amdfam10" 9 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "ssecvt") + (and (eq_attr "amdfam10_decode" "double") + (and (eq_attr "mode" "SF") + (eq_attr "memory" "load"))))) + "athlon-double,athlon-fploadk8,(athlon-faddmul+athlon-fstore)") ;; cvtsd2ss reg,reg is vectorpath, troughput unknown, latency 12 (define_insn_reservation "athlon_ssecvt_cvtsd2ss" 12 (and (eq_attr "cpu" "athlon,k8,generic64") @@ -760,6 +980,13 @@ (and (eq_attr "mode" "SF") (eq_attr "memory" "none"))))) "athlon-vector,athlon-fpsched,(athlon-fvector*3)") +(define_insn_reservation "athlon_ssecvt_cvtsd2ss_amdfam10" 8 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "ssecvt") + (and (eq_attr "amdfam10_decode" "vector") + (and (eq_attr "mode" "SF") + (eq_attr "memory" "none"))))) + "athlon-vector,athlon-fpsched,athlon-faddmul,(athlon-fstore*2)") (define_insn_reservation "athlon_ssecvt_cvtpd2ps_load_k8" 8 (and (eq_attr "cpu" "athlon,k8,generic64") (and (eq_attr "type" "ssecvt") @@ -767,6 +994,13 @@ (and (eq_attr "mode" "V4SF,V2DF,TI") (eq_attr "memory" "load"))))) "athlon-double,athlon-fpload2k8,(athlon-fstore*3)") +(define_insn_reservation "athlon_ssecvt_cvtpd2ps_load_amdfam10" 9 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "ssecvt") + (and (eq_attr "amdfam10_decode" "double") + (and (eq_attr "mode" "V4SF,V2DF,TI") + (eq_attr "memory" "load"))))) + "athlon-double,athlon-fploadk8,(athlon-faddmul+athlon-fstore)") ;; cvtpd2ps mem,reg is vectorpath, troughput unknown, latency 10 ;; ??? Why it is fater than cvtsd2ss? (define_insn_reservation "athlon_ssecvt_cvtpd2ps" 8 @@ -776,6 +1010,13 @@ (and (eq_attr "mode" "V4SF,V2DF,TI") (eq_attr "memory" "none"))))) "athlon-vector,athlon-fpsched,athlon-fvector*2") +(define_insn_reservation "athlon_ssecvt_cvtpd2ps_amdfam10" 7 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "ssecvt") + (and (eq_attr "amdfam10_decode" "double") + (and (eq_attr "mode" "V4SF,V2DF,TI") + (eq_attr "memory" "none"))))) + "athlon-double,athlon-fpsched,(athlon-faddmul+athlon-fstore)") ;; cvtsd2si mem,reg is doublepath, troughput 1, latency 9 (define_insn_reservation "athlon_secvt_cvtsX2si_load" 9 (and (eq_attr "cpu" "athlon,k8,generic64") @@ -784,6 +1025,13 @@ (and (eq_attr "mode" "SI,DI") (eq_attr "memory" "load"))))) "athlon-vector,athlon-fploadk8,athlon-fvector") +(define_insn_reservation "athlon_secvt_cvtsX2si_load_amdfam10" 10 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "sseicvt") + (and (eq_attr "amdfam10_decode" "double") + (and (eq_attr "mode" "SI,DI") + (eq_attr "memory" "load"))))) + "athlon-double,athlon-fploadk8,(athlon-fadd+athlon-fstore)") ;; cvtsd2si reg,reg is doublepath, troughput 1, latency 9 (define_insn_reservation "athlon_ssecvt_cvtsX2si" 9 (and (eq_attr "cpu" "athlon") @@ -799,6 +1047,29 @@ (and (eq_attr "mode" "SI,DI") (eq_attr "memory" "none"))))) "athlon-double,athlon-fpsched,athlon-fstore") +(define_insn_reservation "athlon_ssecvt_cvtsX2si_amdfam10" 8 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "sseicvt") + (and (eq_attr "amdfam10_decode" "double") + (and (eq_attr "mode" "SI,DI") + (eq_attr "memory" "none"))))) + "athlon-double,athlon-fpsched,(athlon-fadd+athlon-fstore)") +;; cvtpd2dq reg,mem is doublepath, troughput 1, latency 9 on amdfam10 +(define_insn_reservation "athlon_sseicvt_cvtpd2dq_load_amdfam10" 9 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "sseicvt") + (and (eq_attr "amdfam10_decode" "double") + (and (eq_attr "mode" "TI") + (eq_attr "memory" "load"))))) + "athlon-double,athlon-fploadk8,(athlon-faddmul+athlon-fstore)") +;; cvtpd2dq reg,mem is doublepath, troughput 1, latency 7 on amdfam10 +(define_insn_reservation "athlon_sseicvt_cvtpd2dq_amdfam10" 7 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "sseicvt") + (and (eq_attr "amdfam10_decode" "double") + (and (eq_attr "mode" "TI") + (eq_attr "memory" "none"))))) + "athlon-double,athlon-fpsched,(athlon-faddmul+athlon-fstore)") (define_insn_reservation "athlon_ssemul_load" 4 @@ -808,13 +1079,13 @@ (eq_attr "memory" "load")))) "athlon-direct,athlon-fpload,athlon-fmul") (define_insn_reservation "athlon_ssemul_load_k8" 6 - (and (eq_attr "cpu" "k8,generic64") + (and (eq_attr "cpu" "k8,generic64,amdfam10") (and (eq_attr "type" "ssemul") (and (eq_attr "mode" "SF,DF") (eq_attr "memory" "load")))) "athlon-direct,athlon-fploadk8,athlon-fmul") (define_insn_reservation "athlon_ssemul" 4 - (and (eq_attr "cpu" "athlon,k8,generic64") + (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10") (and (eq_attr "type" "ssemul") (eq_attr "mode" "SF,DF"))) "athlon-direct,athlon-fpsched,athlon-fmul") @@ -828,6 +1099,11 @@ (and (eq_attr "type" "ssemul") (eq_attr "memory" "load"))) "athlon-double,athlon-fpload2k8,(athlon-fmul*2)") +(define_insn_reservation "athlon_ssemulvector_load_amdfam10" 6 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "ssemul") + (eq_attr "memory" "load"))) + "athlon-direct,athlon-fploadk8,athlon-fmul") (define_insn_reservation "athlon_ssemulvector" 5 (and (eq_attr "cpu" "athlon") (eq_attr "type" "ssemul")) @@ -836,6 +1112,10 @@ (and (eq_attr "cpu" "k8,generic64") (eq_attr "type" "ssemul")) "athlon-double,athlon-fpsched,(athlon-fmul*2)") +(define_insn_reservation "athlon_ssemulvector_amdfam10" 4 + (and (eq_attr "cpu" "amdfam10") + (eq_attr "type" "ssemul")) + "athlon-direct,athlon-fpsched,athlon-fmul") ;; divsd timings. divss is faster (define_insn_reservation "athlon_ssediv_load" 20 (and (eq_attr "cpu" "athlon") @@ -844,13 +1124,13 @@ (eq_attr "memory" "load")))) "athlon-direct,athlon-fpload,athlon-fmul*17") (define_insn_reservation "athlon_ssediv_load_k8" 22 - (and (eq_attr "cpu" "k8,generic64") + (and (eq_attr "cpu" "k8,generic64,amdfam10") (and (eq_attr "type" "ssediv") (and (eq_attr "mode" "SF,DF") (eq_attr "memory" "load")))) "athlon-direct,athlon-fploadk8,athlon-fmul*17") (define_insn_reservation "athlon_ssediv" 20 - (and (eq_attr "cpu" "athlon,k8,generic64") + (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10") (and (eq_attr "type" "ssediv") (eq_attr "mode" "SF,DF"))) "athlon-direct,athlon-fpsched,athlon-fmul*17") @@ -864,6 +1144,11 @@ (and (eq_attr "type" "ssediv") (eq_attr "memory" "load"))) "athlon-double,athlon-fpload2k8,athlon-fmul*34") +(define_insn_reservation "athlon_ssedivvector_load_amdfam10" 22 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "ssediv") + (eq_attr "memory" "load"))) + "athlon-direct,athlon-fploadk8,athlon-fmul*17") (define_insn_reservation "athlon_ssedivvector" 39 (and (eq_attr "cpu" "athlon") (eq_attr "type" "ssediv")) @@ -872,3 +1157,12 @@ (and (eq_attr "cpu" "k8,generic64") (eq_attr "type" "ssediv")) "athlon-double,athlon-fmul*34") +(define_insn_reservation "athlon_ssedivvector_amdfam10" 20 + (and (eq_attr "cpu" "amdfam10") + (eq_attr "type" "ssediv")) + "athlon-direct,athlon-fmul*17") +(define_insn_reservation "athlon_sseins_amdfam10" 5 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "sseins") + (eq_attr "mode" "TI"))) + "athlon-vector,athlon-fpsched,athlon-faddmul") diff --git a/contrib/gcc/config/i386/driver-i386.c b/contrib/gcc/config/i386/driver-i386.c index cb9c6c3..6658263 100644 --- a/contrib/gcc/config/i386/driver-i386.c +++ b/contrib/gcc/config/i386/driver-i386.c @@ -1,5 +1,5 @@ /* Subroutines for the gcc driver. - Copyright (C) 2006 Free Software Foundation, Inc. + Copyright (C) 2006, 2007 Free Software Foundation, Inc. This file is part of GCC. @@ -40,6 +40,7 @@ const char *host_detect_local_cpu (int argc, const char **argv); #define bit_SSE3 (1 << 0) #define bit_SSSE3 (1 << 9) +#define bit_SSE4a (1 << 6) #define bit_CMPXCHG16B (1 << 13) #define bit_3DNOW (1 << 31) @@ -68,7 +69,7 @@ const char *host_detect_local_cpu (int argc, const char **argv) unsigned int ext_level; unsigned char has_mmx = 0, has_3dnow = 0, has_3dnowp = 0, has_sse = 0; unsigned char has_sse2 = 0, has_sse3 = 0, has_ssse3 = 0, has_cmov = 0; - unsigned char has_longmode = 0, has_cmpxchg8b = 0; + unsigned char has_longmode = 0, has_cmpxchg8b = 0, has_sse4a = 0; unsigned char is_amd = 0; unsigned int family = 0; bool arch; @@ -120,6 +121,7 @@ const char *host_detect_local_cpu (int argc, const char **argv) has_3dnow = !!(edx & bit_3DNOW); has_3dnowp = !!(edx & bit_3DNOWP); has_longmode = !!(edx & bit_LM); + has_sse4a = !!(ecx & bit_SSE4a); } is_amd = vendor == *(unsigned int*)"Auth"; @@ -132,6 +134,8 @@ const char *host_detect_local_cpu (int argc, const char **argv) processor = PROCESSOR_ATHLON; if (has_sse2 || has_longmode) processor = PROCESSOR_K8; + if (has_sse4a) + processor = PROCESSOR_AMDFAM10; } else { @@ -266,6 +270,9 @@ const char *host_detect_local_cpu (int argc, const char **argv) case PROCESSOR_NOCONA: cpu = "nocona"; break; + case PROCESSOR_AMDFAM10: + cpu = "amdfam10"; + break; case PROCESSOR_GENERIC32: case PROCESSOR_GENERIC64: cpu = "generic"; diff --git a/contrib/gcc/config/i386/i386.c b/contrib/gcc/config/i386/i386.c index 2567fca..c07b721 100644 --- a/contrib/gcc/config/i386/i386.c +++ b/contrib/gcc/config/i386/i386.c @@ -548,6 +548,71 @@ struct processor_costs k8_cost = { COSTS_N_INSNS (35), /* cost of FSQRT instruction. */ }; +struct processor_costs amdfam10_cost = { + COSTS_N_INSNS (1), /* cost of an add instruction */ + COSTS_N_INSNS (2), /* cost of a lea instruction */ + COSTS_N_INSNS (1), /* variable shift costs */ + COSTS_N_INSNS (1), /* constant shift costs */ + {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ + COSTS_N_INSNS (4), /* HI */ + COSTS_N_INSNS (3), /* SI */ + COSTS_N_INSNS (4), /* DI */ + COSTS_N_INSNS (5)}, /* other */ + 0, /* cost of multiply per each bit set */ + {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */ + COSTS_N_INSNS (35), /* HI */ + COSTS_N_INSNS (51), /* SI */ + COSTS_N_INSNS (83), /* DI */ + COSTS_N_INSNS (83)}, /* other */ + COSTS_N_INSNS (1), /* cost of movsx */ + COSTS_N_INSNS (1), /* cost of movzx */ + 8, /* "large" insn */ + 9, /* MOVE_RATIO */ + 4, /* cost for loading QImode using movzbl */ + {3, 4, 3}, /* cost of loading integer registers + in QImode, HImode and SImode. + Relative to reg-reg move (2). */ + {3, 4, 3}, /* cost of storing integer registers */ + 4, /* cost of reg,reg fld/fst */ + {4, 4, 12}, /* cost of loading fp registers + in SFmode, DFmode and XFmode */ + {6, 6, 8}, /* cost of storing fp registers + in SFmode, DFmode and XFmode */ + 2, /* cost of moving MMX register */ + {3, 3}, /* cost of loading MMX registers + in SImode and DImode */ + {4, 4}, /* cost of storing MMX registers + in SImode and DImode */ + 2, /* cost of moving SSE register */ + {4, 4, 3}, /* cost of loading SSE registers + in SImode, DImode and TImode */ + {4, 4, 5}, /* cost of storing SSE registers + in SImode, DImode and TImode */ + 3, /* MMX or SSE register to integer */ + /* On K8: + MOVD reg64, xmmreg Double FSTORE 4 + MOVD reg32, xmmreg Double FSTORE 4 + On AMDFAM10: + MOVD reg64, xmmreg Double FADD 3 + 1/1 1/1 + MOVD reg32, xmmreg Double FADD 3 + 1/1 1/1 */ + 64, /* size of prefetch block */ + /* New AMD processors never drop prefetches; if they cannot be performed + immediately, they are queued. We set number of simultaneous prefetches + to a large constant to reflect this (it probably is not a good idea not + to limit number of prefetches at all, as their execution also takes some + time). */ + 100, /* number of parallel prefetches */ + 2, /* Branch cost */ + COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */ + COSTS_N_INSNS (4), /* cost of FMUL instruction. */ + COSTS_N_INSNS (19), /* cost of FDIV instruction. */ + COSTS_N_INSNS (2), /* cost of FABS instruction. */ + COSTS_N_INSNS (2), /* cost of FCHS instruction. */ + COSTS_N_INSNS (35), /* cost of FSQRT instruction. */ +}; + static const struct processor_costs pentium4_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ @@ -834,11 +899,13 @@ const struct processor_costs *ix86_cost = &pentium_cost; #define m_PENT4 (1<to_allocate -= frame->red_zone_size; frame->stack_pointer_offset -= frame->red_zone_size; #if 0 - fprintf (stderr, "nregs: %i\n", frame->nregs); - fprintf (stderr, "size: %i\n", size); - fprintf (stderr, "alignment1: %i\n", stack_alignment_needed); - fprintf (stderr, "padding1: %i\n", frame->padding1); - fprintf (stderr, "va_arg: %i\n", frame->va_arg_size); - fprintf (stderr, "padding2: %i\n", frame->padding2); - fprintf (stderr, "to_allocate: %i\n", frame->to_allocate); - fprintf (stderr, "red_zone_size: %i\n", frame->red_zone_size); - fprintf (stderr, "frame_pointer_offset: %i\n", frame->frame_pointer_offset); - fprintf (stderr, "hard_frame_pointer_offset: %i\n", - frame->hard_frame_pointer_offset); - fprintf (stderr, "stack_pointer_offset: %i\n", frame->stack_pointer_offset); + fprintf (stderr, "\n"); + fprintf (stderr, "nregs: %ld\n", (long)frame->nregs); + fprintf (stderr, "size: %ld\n", (long)size); + fprintf (stderr, "alignment1: %ld\n", (long)stack_alignment_needed); + fprintf (stderr, "padding1: %ld\n", (long)frame->padding1); + fprintf (stderr, "va_arg: %ld\n", (long)frame->va_arg_size); + fprintf (stderr, "padding2: %ld\n", (long)frame->padding2); + fprintf (stderr, "to_allocate: %ld\n", (long)frame->to_allocate); + fprintf (stderr, "red_zone_size: %ld\n", (long)frame->red_zone_size); + fprintf (stderr, "frame_pointer_offset: %ld\n", (long)frame->frame_pointer_offset); + fprintf (stderr, "hard_frame_pointer_offset: %ld\n", + (long)frame->hard_frame_pointer_offset); + fprintf (stderr, "stack_pointer_offset: %ld\n", (long)frame->stack_pointer_offset); + fprintf (stderr, "current_function_is_leaf: %ld\n", (long)current_function_is_leaf); + fprintf (stderr, "current_function_calls_alloca: %ld\n", (long)current_function_calls_alloca); + fprintf (stderr, "x86_current_function_calls_tls_descriptor: %ld\n", (long)ix86_current_function_calls_tls_descriptor); #endif } @@ -7664,7 +7856,8 @@ print_reg (rtx x, int code, FILE *file) gcc_assert (REGNO (x) != ARG_POINTER_REGNUM && REGNO (x) != FRAME_POINTER_REGNUM && REGNO (x) != FLAGS_REG - && REGNO (x) != FPSR_REG); + && REGNO (x) != FPSR_REG + && REGNO (x) != FPCR_REG); if (ASSEMBLER_DIALECT == ASM_ATT || USER_LABEL_PREFIX[0] == 0) putc ('%', file); @@ -8859,17 +9052,15 @@ output_387_ffreep (rtx *operands ATTRIBUTE_UNUSED, int opno) #if HAVE_AS_IX86_FFREEP return opno ? "ffreep\t%y1" : "ffreep\t%y0"; #else - switch (REGNO (operands[opno])) - { - case FIRST_STACK_REG + 0: return ".word\t0xc0df"; - case FIRST_STACK_REG + 1: return ".word\t0xc1df"; - case FIRST_STACK_REG + 2: return ".word\t0xc2df"; - case FIRST_STACK_REG + 3: return ".word\t0xc3df"; - case FIRST_STACK_REG + 4: return ".word\t0xc4df"; - case FIRST_STACK_REG + 5: return ".word\t0xc5df"; - case FIRST_STACK_REG + 6: return ".word\t0xc6df"; - case FIRST_STACK_REG + 7: return ".word\t0xc7df"; - } + { + static char retval[] = ".word\t0xc_df"; + int regno = REGNO (operands[opno]); + + gcc_assert (FP_REGNO_P (regno)); + + retval[9] = '0' + (regno - FIRST_STACK_REG); + return retval; + } #endif return opno ? "fstp\t%y1" : "fstp\t%y0"; @@ -9247,8 +9438,16 @@ ix86_expand_vector_move_misalign (enum machine_mode mode, rtx operands[]) } if (TARGET_SSE2 && mode == V2DFmode) - { - rtx zero; + { + rtx zero; + + if (TARGET_SSE_UNALIGNED_MOVE_OPTIMAL) + { + op0 = gen_lowpart (V2DFmode, op0); + op1 = gen_lowpart (V2DFmode, op1); + emit_insn (gen_sse2_movupd (op0, op1)); + return; + } /* When SSE registers are split into halves, we can avoid writing to the top half twice. */ @@ -9276,7 +9475,15 @@ ix86_expand_vector_move_misalign (enum machine_mode mode, rtx operands[]) emit_insn (gen_sse2_loadhpd (op0, op0, m)); } else - { + { + if (TARGET_SSE_UNALIGNED_MOVE_OPTIMAL) + { + op0 = gen_lowpart (V4SFmode, op0); + op1 = gen_lowpart (V4SFmode, op1); + emit_insn (gen_sse_movups (op0, op1)); + return; + } + if (TARGET_SSE_PARTIAL_REG_DEPENDENCY) emit_move_insn (op0, CONST0_RTX (mode)); else @@ -13833,6 +14040,7 @@ ix86_issue_rate (void) case PROCESSOR_PENTIUM4: case PROCESSOR_ATHLON: case PROCESSOR_K8: + case PROCESSOR_AMDFAM10: case PROCESSOR_NOCONA: case PROCESSOR_GENERIC32: case PROCESSOR_GENERIC64: @@ -14031,6 +14239,7 @@ ix86_adjust_cost (rtx insn, rtx link, rtx dep_insn, int cost) case PROCESSOR_ATHLON: case PROCESSOR_K8: + case PROCESSOR_AMDFAM10: case PROCESSOR_GENERIC32: case PROCESSOR_GENERIC64: memory = get_attr_memory (insn); @@ -14744,6 +14953,14 @@ enum ix86_builtins IX86_BUILTIN_PABSW128, IX86_BUILTIN_PABSD128, + /* AMDFAM10 - SSE4A New Instructions. */ + IX86_BUILTIN_MOVNTSD, + IX86_BUILTIN_MOVNTSS, + IX86_BUILTIN_EXTRQI, + IX86_BUILTIN_EXTRQ, + IX86_BUILTIN_INSERTQI, + IX86_BUILTIN_INSERTQ, + IX86_BUILTIN_VEC_INIT_V2SI, IX86_BUILTIN_VEC_INIT_V4HI, IX86_BUILTIN_VEC_INIT_V8QI, @@ -15468,6 +15685,18 @@ ix86_init_mmx_sse_builtins (void) = build_function_type_list (void_type_node, pchar_type_node, V16QI_type_node, NULL_TREE); + tree v2di_ftype_v2di_unsigned_unsigned + = build_function_type_list (V2DI_type_node, V2DI_type_node, + unsigned_type_node, unsigned_type_node, + NULL_TREE); + tree v2di_ftype_v2di_v2di_unsigned_unsigned + = build_function_type_list (V2DI_type_node, V2DI_type_node, V2DI_type_node, + unsigned_type_node, unsigned_type_node, + NULL_TREE); + tree v2di_ftype_v2di_v16qi + = build_function_type_list (V2DI_type_node, V2DI_type_node, V16QI_type_node, + NULL_TREE); + tree float80_type; tree float128_type; tree ftype; @@ -15804,6 +16033,20 @@ ix86_init_mmx_sse_builtins (void) def_builtin (MASK_SSSE3, "__builtin_ia32_palignr", di_ftype_di_di_int, IX86_BUILTIN_PALIGNR); + /* AMDFAM10 SSE4A New built-ins */ + def_builtin (MASK_SSE4A, "__builtin_ia32_movntsd", + void_ftype_pdouble_v2df, IX86_BUILTIN_MOVNTSD); + def_builtin (MASK_SSE4A, "__builtin_ia32_movntss", + void_ftype_pfloat_v4sf, IX86_BUILTIN_MOVNTSS); + def_builtin (MASK_SSE4A, "__builtin_ia32_extrqi", + v2di_ftype_v2di_unsigned_unsigned, IX86_BUILTIN_EXTRQI); + def_builtin (MASK_SSE4A, "__builtin_ia32_extrq", + v2di_ftype_v2di_v16qi, IX86_BUILTIN_EXTRQ); + def_builtin (MASK_SSE4A, "__builtin_ia32_insertqi", + v2di_ftype_v2di_v2di_unsigned_unsigned, IX86_BUILTIN_INSERTQI); + def_builtin (MASK_SSE4A, "__builtin_ia32_insertq", + v2di_ftype_v2di_v2di, IX86_BUILTIN_INSERTQ); + /* Access to the vec_init patterns. */ ftype = build_function_type_list (V2SI_type_node, integer_type_node, integer_type_node, NULL_TREE); @@ -16300,9 +16543,9 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED, enum insn_code icode; tree fndecl = TREE_OPERAND (TREE_OPERAND (exp, 0), 0); tree arglist = TREE_OPERAND (exp, 1); - tree arg0, arg1, arg2; - rtx op0, op1, op2, pat; - enum machine_mode tmode, mode0, mode1, mode2, mode3; + tree arg0, arg1, arg2, arg3; + rtx op0, op1, op2, op3, pat; + enum machine_mode tmode, mode0, mode1, mode2, mode3, mode4; unsigned int fcode = DECL_FUNCTION_CODE (fndecl); switch (fcode) @@ -16818,6 +17061,114 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED, emit_insn (pat); return target; + case IX86_BUILTIN_MOVNTSD: + return ix86_expand_store_builtin (CODE_FOR_sse4a_vmmovntv2df, arglist); + + case IX86_BUILTIN_MOVNTSS: + return ix86_expand_store_builtin (CODE_FOR_sse4a_vmmovntv4sf, arglist); + + case IX86_BUILTIN_INSERTQ: + case IX86_BUILTIN_EXTRQ: + icode = (fcode == IX86_BUILTIN_EXTRQ + ? CODE_FOR_sse4a_extrq + : CODE_FOR_sse4a_insertq); + arg0 = TREE_VALUE (arglist); + arg1 = TREE_VALUE (TREE_CHAIN (arglist)); + op0 = expand_normal (arg0); + op1 = expand_normal (arg1); + tmode = insn_data[icode].operand[0].mode; + mode1 = insn_data[icode].operand[1].mode; + mode2 = insn_data[icode].operand[2].mode; + if (! (*insn_data[icode].operand[1].predicate) (op0, mode1)) + op0 = copy_to_mode_reg (mode1, op0); + if (! (*insn_data[icode].operand[2].predicate) (op1, mode2)) + op1 = copy_to_mode_reg (mode2, op1); + if (optimize || target == 0 + || GET_MODE (target) != tmode + || ! (*insn_data[icode].operand[0].predicate) (target, tmode)) + target = gen_reg_rtx (tmode); + pat = GEN_FCN (icode) (target, op0, op1); + if (! pat) + return NULL_RTX; + emit_insn (pat); + return target; + + case IX86_BUILTIN_EXTRQI: + icode = CODE_FOR_sse4a_extrqi; + arg0 = TREE_VALUE (arglist); + arg1 = TREE_VALUE (TREE_CHAIN (arglist)); + arg2 = TREE_VALUE (TREE_CHAIN (TREE_CHAIN (arglist))); + op0 = expand_normal (arg0); + op1 = expand_normal (arg1); + op2 = expand_normal (arg2); + tmode = insn_data[icode].operand[0].mode; + mode1 = insn_data[icode].operand[1].mode; + mode2 = insn_data[icode].operand[2].mode; + mode3 = insn_data[icode].operand[3].mode; + if (! (*insn_data[icode].operand[1].predicate) (op0, mode1)) + op0 = copy_to_mode_reg (mode1, op0); + if (! (*insn_data[icode].operand[2].predicate) (op1, mode2)) + { + error ("index mask must be an immediate"); + return gen_reg_rtx (tmode); + } + if (! (*insn_data[icode].operand[3].predicate) (op2, mode3)) + { + error ("length mask must be an immediate"); + return gen_reg_rtx (tmode); + } + if (optimize || target == 0 + || GET_MODE (target) != tmode + || ! (*insn_data[icode].operand[0].predicate) (target, tmode)) + target = gen_reg_rtx (tmode); + pat = GEN_FCN (icode) (target, op0, op1, op2); + if (! pat) + return NULL_RTX; + emit_insn (pat); + return target; + + case IX86_BUILTIN_INSERTQI: + icode = CODE_FOR_sse4a_insertqi; + arg0 = TREE_VALUE (arglist); + arg1 = TREE_VALUE (TREE_CHAIN (arglist)); + arg2 = TREE_VALUE (TREE_CHAIN (TREE_CHAIN (arglist))); + arg3 = TREE_VALUE (TREE_CHAIN (TREE_CHAIN (TREE_CHAIN (arglist)))); + op0 = expand_normal (arg0); + op1 = expand_normal (arg1); + op2 = expand_normal (arg2); + op3 = expand_normal (arg3); + tmode = insn_data[icode].operand[0].mode; + mode1 = insn_data[icode].operand[1].mode; + mode2 = insn_data[icode].operand[2].mode; + mode3 = insn_data[icode].operand[3].mode; + mode4 = insn_data[icode].operand[4].mode; + + if (! (*insn_data[icode].operand[1].predicate) (op0, mode1)) + op0 = copy_to_mode_reg (mode1, op0); + + if (! (*insn_data[icode].operand[2].predicate) (op1, mode2)) + op1 = copy_to_mode_reg (mode2, op1); + + if (! (*insn_data[icode].operand[3].predicate) (op2, mode3)) + { + error ("index mask must be an immediate"); + return gen_reg_rtx (tmode); + } + if (! (*insn_data[icode].operand[4].predicate) (op3, mode4)) + { + error ("length mask must be an immediate"); + return gen_reg_rtx (tmode); + } + if (optimize || target == 0 + || GET_MODE (target) != tmode + || ! (*insn_data[icode].operand[0].predicate) (target, tmode)) + target = gen_reg_rtx (tmode); + pat = GEN_FCN (icode) (target, op0, op1, op2, op3); + if (! pat) + return NULL_RTX; + emit_insn (pat); + return target; + case IX86_BUILTIN_VEC_INIT_V2SI: case IX86_BUILTIN_VEC_INIT_V4HI: case IX86_BUILTIN_VEC_INIT_V8QI: diff --git a/contrib/gcc/config/i386/i386.h b/contrib/gcc/config/i386/i386.h index b977014..896bf1d 100644 --- a/contrib/gcc/config/i386/i386.h +++ b/contrib/gcc/config/i386/i386.h @@ -141,6 +141,7 @@ extern const struct processor_costs *ix86_cost; #define TARGET_GENERIC32 (ix86_tune == PROCESSOR_GENERIC32) #define TARGET_GENERIC64 (ix86_tune == PROCESSOR_GENERIC64) #define TARGET_GENERIC (TARGET_GENERIC32 || TARGET_GENERIC64) +#define TARGET_AMDFAM10 (ix86_tune == PROCESSOR_AMDFAM10) #define TUNEMASK (1 << ix86_tune) extern const int x86_use_leave, x86_push_memory, x86_zero_extend_with_and; @@ -159,15 +160,16 @@ extern const int x86_accumulate_outgoing_args, x86_prologue_using_move; extern const int x86_epilogue_using_move, x86_decompose_lea; extern const int x86_arch_always_fancy_math_387, x86_shift1; extern const int x86_sse_partial_reg_dependency, x86_sse_split_regs; +extern const int x86_sse_unaligned_move_optimal; extern const int x86_sse_typeless_stores, x86_sse_load0_by_pxor; extern const int x86_use_ffreep; extern const int x86_inter_unit_moves, x86_schedule; extern const int x86_use_bt; -extern const int x86_cmpxchg, x86_cmpxchg8b, x86_cmpxchg16b, x86_xadd; +extern const int x86_cmpxchg, x86_cmpxchg8b, x86_xadd; extern const int x86_use_incdec; extern const int x86_pad_returns; extern const int x86_partial_flag_reg_stall; -extern int x86_prefetch_sse; +extern int x86_prefetch_sse, x86_cmpxchg16b; #define TARGET_USE_LEAVE (x86_use_leave & TUNEMASK) #define TARGET_PUSH_MEMORY (x86_push_memory & TUNEMASK) @@ -207,6 +209,8 @@ extern int x86_prefetch_sse; #define TARGET_PARTIAL_REG_DEPENDENCY (x86_partial_reg_dependency & TUNEMASK) #define TARGET_SSE_PARTIAL_REG_DEPENDENCY \ (x86_sse_partial_reg_dependency & TUNEMASK) +#define TARGET_SSE_UNALIGNED_MOVE_OPTIMAL \ + (x86_sse_unaligned_move_optimal & TUNEMASK) #define TARGET_SSE_SPLIT_REGS (x86_sse_split_regs & TUNEMASK) #define TARGET_SSE_TYPELESS_STORES (x86_sse_typeless_stores & TUNEMASK) #define TARGET_SSE_LOAD0_BY_PXOR (x86_sse_load0_by_pxor & TUNEMASK) @@ -237,7 +241,7 @@ extern int x86_prefetch_sse; #define TARGET_CMPXCHG (x86_cmpxchg & (1 << ix86_arch)) #define TARGET_CMPXCHG8B (x86_cmpxchg8b & (1 << ix86_arch)) -#define TARGET_CMPXCHG16B (x86_cmpxchg16b & (1 << ix86_arch)) +#define TARGET_CMPXCHG16B (x86_cmpxchg16b) #define TARGET_XADD (x86_xadd & (1 << ix86_arch)) #ifndef TARGET_64BIT_DEFAULT @@ -399,6 +403,8 @@ extern const char *host_detect_local_cpu (int argc, const char **argv); } \ else if (TARGET_K8) \ builtin_define ("__tune_k8__"); \ + else if (TARGET_AMDFAM10) \ + builtin_define ("__tune_amdfam10__"); \ else if (TARGET_PENTIUM4) \ builtin_define ("__tune_pentium4__"); \ else if (TARGET_NOCONA) \ @@ -420,6 +426,8 @@ extern const char *host_detect_local_cpu (int argc, const char **argv); builtin_define ("__SSE3__"); \ if (TARGET_SSSE3) \ builtin_define ("__SSSE3__"); \ + if (TARGET_SSE4A) \ + builtin_define ("__SSE4A__"); \ if (TARGET_SSE_MATH && TARGET_SSE) \ builtin_define ("__SSE_MATH__"); \ if (TARGET_SSE_MATH && TARGET_SSE2) \ @@ -475,6 +483,11 @@ extern const char *host_detect_local_cpu (int argc, const char **argv); builtin_define ("__k8"); \ builtin_define ("__k8__"); \ } \ + else if (ix86_arch == PROCESSOR_AMDFAM10) \ + { \ + builtin_define ("__amdfam10"); \ + builtin_define ("__amdfam10__"); \ + } \ else if (ix86_arch == PROCESSOR_PENTIUM4) \ { \ builtin_define ("__pentium4"); \ @@ -513,13 +526,14 @@ extern const char *host_detect_local_cpu (int argc, const char **argv); #define TARGET_CPU_DEFAULT_nocona 17 #define TARGET_CPU_DEFAULT_core2 18 #define TARGET_CPU_DEFAULT_generic 19 +#define TARGET_CPU_DEFAULT_amdfam10 20 #define TARGET_CPU_DEFAULT_NAMES {"i386", "i486", "pentium", "pentium-mmx",\ "pentiumpro", "pentium2", "pentium3", \ "pentium4", "geode", "k6", "k6-2", "k6-3", \ "athlon", "athlon-4", "k8", \ "pentium-m", "prescott", "nocona", \ - "core2", "generic"} + "core2", "generic", "amdfam10"} #ifndef CC1_SPEC #define CC1_SPEC "%(cc1_cpu) " @@ -734,7 +748,7 @@ extern const char *host_detect_local_cpu (int argc, const char **argv); eliminated during reloading in favor of either the stack or frame pointer. */ -#define FIRST_PSEUDO_REGISTER 53 +#define FIRST_PSEUDO_REGISTER 54 /* Number of hardware registers that go into the DWARF-2 unwind info. If not defined, equals FIRST_PSEUDO_REGISTER. */ @@ -754,8 +768,8 @@ extern const char *host_detect_local_cpu (int argc, const char **argv); #define FIXED_REGISTERS \ /*ax,dx,cx,bx,si,di,bp,sp,st,st1,st2,st3,st4,st5,st6,st7*/ \ { 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, \ -/*arg,flags,fpsr,dir,frame*/ \ - 1, 1, 1, 1, 1, \ +/*arg,flags,fpsr,fpcr,dir,frame*/ \ + 1, 1, 1, 1, 1, 1, \ /*xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7*/ \ 0, 0, 0, 0, 0, 0, 0, 0, \ /*mmx0,mmx1,mmx2,mmx3,mmx4,mmx5,mmx6,mmx7*/ \ @@ -782,10 +796,10 @@ extern const char *host_detect_local_cpu (int argc, const char **argv); #define CALL_USED_REGISTERS \ /*ax,dx,cx,bx,si,di,bp,sp,st,st1,st2,st3,st4,st5,st6,st7*/ \ { 1, 1, 1, 0, 3, 3, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, \ -/*arg,flags,fpsr,dir,frame*/ \ - 1, 1, 1, 1, 1, \ +/*arg,flags,fpsr,fpcr,dir,frame*/ \ + 1, 1, 1, 1, 1, 1, \ /*xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7*/ \ - 1, 1, 1, 1, 1, 1, 1, 1, \ + 1, 1, 1, 1, 1, 1, 1, 1, \ /*mmx0,mmx1,mmx2,mmx3,mmx4,mmx5,mmx6,mmx7*/ \ 1, 1, 1, 1, 1, 1, 1, 1, \ /* r8, r9, r10, r11, r12, r13, r14, r15*/ \ @@ -806,11 +820,11 @@ extern const char *host_detect_local_cpu (int argc, const char **argv); { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,\ 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, \ 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, \ - 48, 49, 50, 51, 52 } + 48, 49, 50, 51, 52, 53 } /* ORDER_REGS_FOR_LOCAL_ALLOC is a macro which permits reg_alloc_order to be rearranged based on a particular function. When using sse math, - we want to allocate SSE before x87 registers and vice vera. */ + we want to allocate SSE before x87 registers and vice versa. */ #define ORDER_REGS_FOR_LOCAL_ALLOC x86_order_regs_for_local_alloc () @@ -972,7 +986,7 @@ do { \ #define HARD_FRAME_POINTER_REGNUM 6 /* Base register for access to local variables of the function. */ -#define FRAME_POINTER_REGNUM 20 +#define FRAME_POINTER_REGNUM 21 /* First floating point reg */ #define FIRST_FLOAT_REG 8 @@ -1085,7 +1099,7 @@ do { \ opcode needs reg %ebx. But some systems pass args to the OS in ebx, and the "b" register constraint is useful in asms for syscalls. - The flags and fpsr registers are in no class. */ + The flags, fpsr and fpcr registers are in no class. */ enum reg_class { @@ -1166,21 +1180,21 @@ enum reg_class { 0x10, 0x0 }, { 0x20, 0x0 }, /* SIREG, DIREG */ \ { 0x03, 0x0 }, /* AD_REGS */ \ { 0x0f, 0x0 }, /* Q_REGS */ \ - { 0x1100f0, 0x1fe0 }, /* NON_Q_REGS */ \ - { 0x7f, 0x1fe0 }, /* INDEX_REGS */ \ - { 0x1100ff, 0x0 }, /* LEGACY_REGS */ \ - { 0x1100ff, 0x1fe0 }, /* GENERAL_REGS */ \ + { 0x2100f0, 0x3fc0 }, /* NON_Q_REGS */ \ + { 0x7f, 0x3fc0 }, /* INDEX_REGS */ \ + { 0x2100ff, 0x0 }, /* LEGACY_REGS */ \ + { 0x2100ff, 0x3fc0 }, /* GENERAL_REGS */ \ { 0x100, 0x0 }, { 0x0200, 0x0 },/* FP_TOP_REG, FP_SECOND_REG */\ { 0xff00, 0x0 }, /* FLOAT_REGS */ \ -{ 0x1fe00000,0x1fe000 }, /* SSE_REGS */ \ -{ 0xe0000000, 0x1f }, /* MMX_REGS */ \ -{ 0x1fe00100,0x1fe000 }, /* FP_TOP_SSE_REG */ \ -{ 0x1fe00200,0x1fe000 }, /* FP_SECOND_SSE_REG */ \ -{ 0x1fe0ff00,0x1fe000 }, /* FLOAT_SSE_REGS */ \ - { 0x1ffff, 0x1fe0 }, /* FLOAT_INT_REGS */ \ -{ 0x1fe100ff,0x1fffe0 }, /* INT_SSE_REGS */ \ -{ 0x1fe1ffff,0x1fffe0 }, /* FLOAT_INT_SSE_REGS */ \ -{ 0xffffffff,0x1fffff } \ +{ 0x3fc00000,0x3fc000 }, /* SSE_REGS */ \ +{ 0xc0000000, 0x3f }, /* MMX_REGS */ \ +{ 0x3fc00100,0x3fc000 }, /* FP_TOP_SSE_REG */ \ +{ 0x3fc00200,0x3fc000 }, /* FP_SECOND_SSE_REG */ \ +{ 0x3fc0ff00,0x3fc000 }, /* FLOAT_SSE_REGS */ \ + { 0x1ffff, 0x3fc0 }, /* FLOAT_INT_REGS */ \ +{ 0x3fc100ff,0x3fffc0 }, /* INT_SSE_REGS */ \ +{ 0x3fc1ffff,0x3fffc0 }, /* FLOAT_INT_SSE_REGS */ \ +{ 0xffffffff,0x3fffff } \ } /* The same information, inverted: @@ -1196,11 +1210,10 @@ enum reg_class #define SMALL_REGISTER_CLASSES 1 -#define QI_REG_P(X) \ - (REG_P (X) && REGNO (X) < 4) +#define QI_REG_P(X) (REG_P (X) && REGNO (X) < 4) #define GENERAL_REGNO_P(N) \ - ((N) < 8 || REX_INT_REGNO_P (N)) + ((N) <= STACK_POINTER_REGNUM || REX_INT_REGNO_P (N)) #define GENERAL_REG_P(X) \ (REG_P (X) && GENERAL_REGNO_P (REGNO (X))) @@ -1208,39 +1221,38 @@ enum reg_class #define ANY_QI_REG_P(X) (TARGET_64BIT ? GENERAL_REG_P(X) : QI_REG_P (X)) #define NON_QI_REG_P(X) \ - (REG_P (X) && REGNO (X) >= 4 && REGNO (X) < FIRST_PSEUDO_REGISTER) + (REG_P (X) && IN_RANGE (REGNO (X), 4, FIRST_PSEUDO_REGISTER - 1)) -#define REX_INT_REGNO_P(N) ((N) >= FIRST_REX_INT_REG && (N) <= LAST_REX_INT_REG) +#define REX_INT_REGNO_P(N) \ + IN_RANGE ((N), FIRST_REX_INT_REG, LAST_REX_INT_REG) #define REX_INT_REG_P(X) (REG_P (X) && REX_INT_REGNO_P (REGNO (X))) #define FP_REG_P(X) (REG_P (X) && FP_REGNO_P (REGNO (X))) -#define FP_REGNO_P(N) ((N) >= FIRST_STACK_REG && (N) <= LAST_STACK_REG) +#define FP_REGNO_P(N) IN_RANGE ((N), FIRST_STACK_REG, LAST_STACK_REG) #define ANY_FP_REG_P(X) (REG_P (X) && ANY_FP_REGNO_P (REGNO (X))) #define ANY_FP_REGNO_P(N) (FP_REGNO_P (N) || SSE_REGNO_P (N)) -#define SSE_REGNO_P(N) \ - (((N) >= FIRST_SSE_REG && (N) <= LAST_SSE_REG) \ - || ((N) >= FIRST_REX_SSE_REG && (N) <= LAST_REX_SSE_REG)) +#define SSE_REG_P(N) (REG_P (N) && SSE_REGNO_P (REGNO (N))) +#define SSE_REGNO_P(N) \ + (IN_RANGE ((N), FIRST_SSE_REG, LAST_SSE_REG) \ + || REX_SSE_REGNO_P (N)) #define REX_SSE_REGNO_P(N) \ - ((N) >= FIRST_REX_SSE_REG && (N) <= LAST_REX_SSE_REG) + IN_RANGE ((N), FIRST_REX_SSE_REG, LAST_REX_SSE_REG) #define SSE_REGNO(N) \ ((N) < 8 ? FIRST_SSE_REG + (N) : FIRST_REX_SSE_REG + (N) - 8) -#define SSE_REG_P(N) (REG_P (N) && SSE_REGNO_P (REGNO (N))) #define SSE_FLOAT_MODE_P(MODE) \ ((TARGET_SSE && (MODE) == SFmode) || (TARGET_SSE2 && (MODE) == DFmode)) -#define MMX_REGNO_P(N) ((N) >= FIRST_MMX_REG && (N) <= LAST_MMX_REG) #define MMX_REG_P(XOP) (REG_P (XOP) && MMX_REGNO_P (REGNO (XOP))) +#define MMX_REGNO_P(N) IN_RANGE ((N), FIRST_MMX_REG, LAST_MMX_REG) -#define STACK_REG_P(XOP) \ - (REG_P (XOP) && \ - REGNO (XOP) >= FIRST_STACK_REG && \ - REGNO (XOP) <= LAST_STACK_REG) - -#define NON_STACK_REG_P(XOP) (REG_P (XOP) && ! STACK_REG_P (XOP)) +#define STACK_REG_P(XOP) (REG_P (XOP) && STACK_REGNO_P (REGNO (XOP))) +#define NON_STACK_REG_P(XOP) \ + (REG_P (XOP) && ! STACK_REGNO_P (REGNO (XOP))) +#define STACK_REGNO_P(N) IN_RANGE ((N), FIRST_STACK_REG, LAST_STACK_REG) #define STACK_TOP_P(XOP) (REG_P (XOP) && REGNO (XOP) == FIRST_STACK_REG) @@ -1588,21 +1600,15 @@ typedef struct ix86_args { #define REGNO_OK_FOR_INDEX_P(REGNO) \ ((REGNO) < STACK_POINTER_REGNUM \ - || (REGNO >= FIRST_REX_INT_REG \ - && (REGNO) <= LAST_REX_INT_REG) \ - || ((unsigned) reg_renumber[(REGNO)] >= FIRST_REX_INT_REG \ - && (unsigned) reg_renumber[(REGNO)] <= LAST_REX_INT_REG) \ - || (unsigned) reg_renumber[(REGNO)] < STACK_POINTER_REGNUM) + || REX_INT_REGNO_P (REGNO) \ + || (unsigned) reg_renumber[(REGNO)] < STACK_POINTER_REGNUM \ + || REX_INT_REGNO_P ((unsigned) reg_renumber[(REGNO)])) #define REGNO_OK_FOR_BASE_P(REGNO) \ - ((REGNO) <= STACK_POINTER_REGNUM \ + (GENERAL_REGNO_P (REGNO) \ || (REGNO) == ARG_POINTER_REGNUM \ || (REGNO) == FRAME_POINTER_REGNUM \ - || (REGNO >= FIRST_REX_INT_REG \ - && (REGNO) <= LAST_REX_INT_REG) \ - || ((unsigned) reg_renumber[(REGNO)] >= FIRST_REX_INT_REG \ - && (unsigned) reg_renumber[(REGNO)] <= LAST_REX_INT_REG) \ - || (unsigned) reg_renumber[(REGNO)] <= STACK_POINTER_REGNUM) + || GENERAL_REGNO_P ((unsigned) reg_renumber[(REGNO)])) #define REGNO_OK_FOR_SIREG_P(REGNO) \ ((REGNO) == 4 || reg_renumber[(REGNO)] == 4) @@ -1626,16 +1632,13 @@ typedef struct ix86_args { /* Non strict versions, pseudos are ok. */ #define REG_OK_FOR_INDEX_NONSTRICT_P(X) \ (REGNO (X) < STACK_POINTER_REGNUM \ - || (REGNO (X) >= FIRST_REX_INT_REG \ - && REGNO (X) <= LAST_REX_INT_REG) \ + || REX_INT_REGNO_P (REGNO (X)) \ || REGNO (X) >= FIRST_PSEUDO_REGISTER) #define REG_OK_FOR_BASE_NONSTRICT_P(X) \ - (REGNO (X) <= STACK_POINTER_REGNUM \ + (GENERAL_REGNO_P (REGNO (X)) \ || REGNO (X) == ARG_POINTER_REGNUM \ || REGNO (X) == FRAME_POINTER_REGNUM \ - || (REGNO (X) >= FIRST_REX_INT_REG \ - && REGNO (X) <= LAST_REX_INT_REG) \ || REGNO (X) >= FIRST_PSEUDO_REGISTER) /* Strict versions, hard registers only */ @@ -1940,9 +1943,9 @@ do { \ #define HI_REGISTER_NAMES \ {"ax","dx","cx","bx","si","di","bp","sp", \ "st","st(1)","st(2)","st(3)","st(4)","st(5)","st(6)","st(7)", \ - "argp", "flags", "fpsr", "dirflag", "frame", \ + "argp", "flags", "fpsr", "fpcr", "dirflag", "frame", \ "xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7", \ - "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7" , \ + "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7", \ "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", \ "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15"} @@ -2108,6 +2111,7 @@ enum processor_type PROCESSOR_CORE2, PROCESSOR_GENERIC32, PROCESSOR_GENERIC64, + PROCESSOR_AMDFAM10, PROCESSOR_max }; @@ -2247,7 +2251,7 @@ enum ix86_stack_slot ??? Maybe Pentium chips benefits from renaming, someone can try.... */ #define HARD_REGNO_RENAME_OK(SRC, TARGET) \ - ((SRC) < FIRST_STACK_REG || (SRC) > LAST_STACK_REG) + (! IN_RANGE ((SRC), FIRST_STACK_REG, LAST_STACK_REG)) #define DLL_IMPORT_EXPORT_PREFIX '#' diff --git a/contrib/gcc/config/i386/i386.md b/contrib/gcc/config/i386/i386.md index bd81c4a..e0b0d0c 100644 --- a/contrib/gcc/config/i386/i386.md +++ b/contrib/gcc/config/i386/i386.md @@ -104,7 +104,7 @@ (UNSPEC_MFENCE 44) (UNSPEC_LFENCE 45) (UNSPEC_PSADBW 46) - (UNSPEC_LDQQU 47) + (UNSPEC_LDDQU 47) ; Generic math support (UNSPEC_COPYSIGN 50) @@ -153,6 +153,12 @@ (UNSPEC_PSHUFB 120) (UNSPEC_PSIGN 121) (UNSPEC_PALIGNR 122) + + ; For SSE4A support + (UNSPEC_EXTRQI 130) + (UNSPEC_EXTRQ 131) + (UNSPEC_INSERTQI 132) + (UNSPEC_INSERTQ 133) ]) (define_constants @@ -178,7 +184,9 @@ (SP_REG 7) (FLAGS_REG 17) (FPSR_REG 18) - (DIRFLAG_REG 19) + (FPCR_REG 19) + (DIRFLAG_REG 20) + (R11_REG 41) ]) ;; Insns whose names begin with "x86_" are emitted by gen_FOO calls @@ -192,7 +200,8 @@ ;; Processor type. This attribute must exactly match the processor_type ;; enumeration in i386.h. -(define_attr "cpu" "i386,i486,pentium,pentiumpro,geode,k6,athlon,pentium4,k8,nocona,core2,generic32,generic64" +(define_attr "cpu" "i386,i486,pentium,pentiumpro,geode,k6,athlon,pentium4,k8, + nocona,core2,generic32,generic64,amdfam10" (const (symbol_ref "ix86_tune"))) ;; A basic instruction type. Refinements due to arguments to be @@ -203,10 +212,10 @@ incdec,ishift,ishift1,rotate,rotate1,imul,idiv, icmp,test,ibr,setcc,icmov, push,pop,call,callv,leave, - str,cld, + str,,bitmanip,cld, fmov,fop,fsgn,fmul,fdiv,fpspc,fcmov,fcmp,fxch,fistp,fisttp,frndint, sselog,sselog1,sseiadd,sseishft,sseimul, - sse,ssemov,sseadd,ssemul,ssecmp,ssecomi,ssecvt,sseicvt,ssediv, + sse,ssemov,sseadd,ssemul,ssecmp,ssecomi,ssecvt,sseicvt,ssediv,sseins, mmx,mmxmov,mmxadd,mmxmul,mmxcmp,mmxcvt,mmxshft" (const_string "other")) @@ -220,7 +229,7 @@ (cond [(eq_attr "type" "fmov,fop,fsgn,fmul,fdiv,fpspc,fcmov,fcmp,fxch,fistp,fisttp,frndint") (const_string "i387") (eq_attr "type" "sselog,sselog1,sseiadd,sseishft,sseimul, - sse,ssemov,sseadd,ssemul,ssecmp,ssecomi,ssecvt,sseicvt,ssediv") + sse,ssemov,sseadd,ssemul,ssecmp,ssecomi,ssecvt,sseicvt,ssediv,sseins") (const_string "sse") (eq_attr "type" "mmx,mmxmov,mmxadd,mmxmul,mmxcmp,mmxcvt,mmxshft") (const_string "mmx") @@ -230,7 +239,8 @@ ;; The (bounding maximum) length of an instruction immediate. (define_attr "length_immediate" "" - (cond [(eq_attr "type" "incdec,setcc,icmov,str,cld,lea,other,multi,idiv,leave") + (cond [(eq_attr "type" "incdec,setcc,icmov,str,cld,lea,other,multi,idiv,leave, + bitmanip") (const_int 0) (eq_attr "unit" "i387,sse,mmx") (const_int 0) @@ -284,7 +294,7 @@ ;; Set when 0f opcode prefix is used. (define_attr "prefix_0f" "" (if_then_else - (ior (eq_attr "type" "imovx,setcc,icmov") + (ior (eq_attr "type" "imovx,setcc,icmov,bitmanip") (eq_attr "unit" "sse,mmx")) (const_int 1) (const_int 0))) @@ -413,7 +423,7 @@ (const_string "load") (and (eq_attr "type" "!alu1,negnot,ishift1, - imov,imovx,icmp,test, + imov,imovx,icmp,test,bitmanip, fmov,fcmp,fsgn, sse,ssemov,ssecmp,ssecomi,ssecvt,sseicvt,sselog1, mmx,mmxmov,mmxcmp,mmxcvt") @@ -968,10 +978,11 @@ "sahf" [(set_attr "length" "1") (set_attr "athlon_decode" "vector") + (set_attr "amdfam10_decode" "direct") (set_attr "mode" "SI")]) ;; Pentium Pro can do steps 1 through 3 in one go. - +;; comi*, ucomi*, fcomi*, ficomi*,fucomi* (i387 instructions set condition codes) (define_insn "*cmpfp_i_mixed" [(set (reg:CCFP FLAGS_REG) (compare:CCFP (match_operand 0 "register_operand" "f,x") @@ -985,7 +996,8 @@ (if_then_else (match_operand:SF 1 "" "") (const_string "SF") (const_string "DF"))) - (set_attr "athlon_decode" "vector")]) + (set_attr "athlon_decode" "vector") + (set_attr "amdfam10_decode" "direct")]) (define_insn "*cmpfp_i_sse" [(set (reg:CCFP FLAGS_REG) @@ -1000,7 +1012,8 @@ (if_then_else (match_operand:SF 1 "" "") (const_string "SF") (const_string "DF"))) - (set_attr "athlon_decode" "vector")]) + (set_attr "athlon_decode" "vector") + (set_attr "amdfam10_decode" "direct")]) (define_insn "*cmpfp_i_i387" [(set (reg:CCFP FLAGS_REG) @@ -1019,7 +1032,8 @@ (const_string "DF") ] (const_string "XF"))) - (set_attr "athlon_decode" "vector")]) + (set_attr "athlon_decode" "vector") + (set_attr "amdfam10_decode" "direct")]) (define_insn "*cmpfp_iu_mixed" [(set (reg:CCFPU FLAGS_REG) @@ -1034,7 +1048,8 @@ (if_then_else (match_operand:SF 1 "" "") (const_string "SF") (const_string "DF"))) - (set_attr "athlon_decode" "vector")]) + (set_attr "athlon_decode" "vector") + (set_attr "amdfam10_decode" "direct")]) (define_insn "*cmpfp_iu_sse" [(set (reg:CCFPU FLAGS_REG) @@ -1049,7 +1064,8 @@ (if_then_else (match_operand:SF 1 "" "") (const_string "SF") (const_string "DF"))) - (set_attr "athlon_decode" "vector")]) + (set_attr "athlon_decode" "vector") + (set_attr "amdfam10_decode" "direct")]) (define_insn "*cmpfp_iu_387" [(set (reg:CCFPU FLAGS_REG) @@ -1068,7 +1084,8 @@ (const_string "DF") ] (const_string "XF"))) - (set_attr "athlon_decode" "vector")]) + (set_attr "athlon_decode" "vector") + (set_attr "amdfam10_decode" "direct")]) ;; Move instructions. @@ -1274,7 +1291,8 @@ [(set_attr "type" "imov") (set_attr "mode" "SI") (set_attr "pent_pair" "np") - (set_attr "athlon_decode" "vector")]) + (set_attr "athlon_decode" "vector") + (set_attr "amdfam10_decode" "double")]) (define_expand "movhi" [(set (match_operand:HI 0 "nonimmediate_operand" "") @@ -1391,8 +1409,10 @@ [(set_attr "type" "imov") (set_attr "mode" "SI") (set_attr "pent_pair" "np") - (set_attr "athlon_decode" "vector")]) + (set_attr "athlon_decode" "vector") + (set_attr "amdfam10_decode" "double")]) +;; Not added amdfam10_decode since TARGET_PARTIAL_REG_STALL is disabled for AMDFAM10 (define_insn "*swaphi_2" [(set (match_operand:HI 0 "register_operand" "+r") (match_operand:HI 1 "register_operand" "+r")) @@ -1565,8 +1585,10 @@ [(set_attr "type" "imov") (set_attr "mode" "SI") (set_attr "pent_pair" "np") - (set_attr "athlon_decode" "vector")]) + (set_attr "athlon_decode" "vector") + (set_attr "amdfam10_decode" "vector")]) +;; Not added amdfam10_decode since TARGET_PARTIAL_REG_STALL is disabled for AMDFAM10 (define_insn "*swapqi_2" [(set (match_operand:QI 0 "register_operand" "+q") (match_operand:QI 1 "register_operand" "+q")) @@ -2120,7 +2142,8 @@ [(set_attr "type" "imov") (set_attr "mode" "DI") (set_attr "pent_pair" "np") - (set_attr "athlon_decode" "vector")]) + (set_attr "athlon_decode" "vector") + (set_attr "amdfam10_decode" "double")]) (define_expand "movti" [(set (match_operand:TI 0 "nonimmediate_operand" "") @@ -4150,7 +4173,8 @@ "cvttss2si{q}\t{%1, %0|%0, %1}" [(set_attr "type" "sseicvt") (set_attr "mode" "SF") - (set_attr "athlon_decode" "double,vector")]) + (set_attr "athlon_decode" "double,vector") + (set_attr "amdfam10_decode" "double,double")]) (define_insn "fix_truncdfdi_sse" [(set (match_operand:DI 0 "register_operand" "=r,r") @@ -4159,7 +4183,8 @@ "cvttsd2si{q}\t{%1, %0|%0, %1}" [(set_attr "type" "sseicvt") (set_attr "mode" "DF") - (set_attr "athlon_decode" "double,vector")]) + (set_attr "athlon_decode" "double,vector") + (set_attr "amdfam10_decode" "double,double")]) (define_insn "fix_truncsfsi_sse" [(set (match_operand:SI 0 "register_operand" "=r,r") @@ -4168,7 +4193,8 @@ "cvttss2si\t{%1, %0|%0, %1}" [(set_attr "type" "sseicvt") (set_attr "mode" "DF") - (set_attr "athlon_decode" "double,vector")]) + (set_attr "athlon_decode" "double,vector") + (set_attr "amdfam10_decode" "double,double")]) (define_insn "fix_truncdfsi_sse" [(set (match_operand:SI 0 "register_operand" "=r,r") @@ -4177,7 +4203,8 @@ "cvttsd2si\t{%1, %0|%0, %1}" [(set_attr "type" "sseicvt") (set_attr "mode" "DF") - (set_attr "athlon_decode" "double,vector")]) + (set_attr "athlon_decode" "double,vector") + (set_attr "amdfam10_decode" "double,double")]) ;; Avoid vector decoded forms of the instruction. (define_peephole2 @@ -4423,7 +4450,7 @@ (define_insn "x86_fnstcw_1" [(set (match_operand:HI 0 "memory_operand" "=m") - (unspec:HI [(reg:HI FPSR_REG)] UNSPEC_FSTCW))] + (unspec:HI [(reg:HI FPCR_REG)] UNSPEC_FSTCW))] "TARGET_80387" "fnstcw\t%0" [(set_attr "length" "2") @@ -4431,14 +4458,15 @@ (set_attr "unit" "i387")]) (define_insn "x86_fldcw_1" - [(set (reg:HI FPSR_REG) + [(set (reg:HI FPCR_REG) (unspec:HI [(match_operand:HI 0 "memory_operand" "m")] UNSPEC_FLDCW))] "TARGET_80387" "fldcw\t%0" [(set_attr "length" "2") (set_attr "mode" "HI") (set_attr "unit" "i387") - (set_attr "athlon_decode" "vector")]) + (set_attr "athlon_decode" "vector") + (set_attr "amdfam10_decode" "vector")]) ;; Conversion between fixed point and floating point. @@ -4489,6 +4517,7 @@ (set_attr "mode" "SF") (set_attr "unit" "*,i387,*,*") (set_attr "athlon_decode" "*,*,vector,double") + (set_attr "amdfam10_decode" "*,*,vector,double") (set_attr "fp_int_src" "true")]) (define_insn "*floatsisf2_sse" @@ -4499,6 +4528,7 @@ [(set_attr "type" "sseicvt") (set_attr "mode" "SF") (set_attr "athlon_decode" "vector,double") + (set_attr "amdfam10_decode" "vector,double") (set_attr "fp_int_src" "true")]) (define_insn "*floatsisf2_i387" @@ -4532,6 +4562,7 @@ (set_attr "mode" "SF") (set_attr "unit" "*,i387,*,*") (set_attr "athlon_decode" "*,*,vector,double") + (set_attr "amdfam10_decode" "*,*,vector,double") (set_attr "fp_int_src" "true")]) (define_insn "*floatdisf2_sse" @@ -4542,6 +4573,7 @@ [(set_attr "type" "sseicvt") (set_attr "mode" "SF") (set_attr "athlon_decode" "vector,double") + (set_attr "amdfam10_decode" "vector,double") (set_attr "fp_int_src" "true")]) (define_insn "*floatdisf2_i387" @@ -4600,6 +4632,7 @@ (set_attr "mode" "DF") (set_attr "unit" "*,i387,*,*") (set_attr "athlon_decode" "*,*,double,direct") + (set_attr "amdfam10_decode" "*,*,vector,double") (set_attr "fp_int_src" "true")]) (define_insn "*floatsidf2_sse" @@ -4610,6 +4643,7 @@ [(set_attr "type" "sseicvt") (set_attr "mode" "DF") (set_attr "athlon_decode" "double,direct") + (set_attr "amdfam10_decode" "vector,double") (set_attr "fp_int_src" "true")]) (define_insn "*floatsidf2_i387" @@ -4643,6 +4677,7 @@ (set_attr "mode" "DF") (set_attr "unit" "*,i387,*,*") (set_attr "athlon_decode" "*,*,double,direct") + (set_attr "amdfam10_decode" "*,*,vector,double") (set_attr "fp_int_src" "true")]) (define_insn "*floatdidf2_sse" @@ -4653,6 +4688,7 @@ [(set_attr "type" "sseicvt") (set_attr "mode" "DF") (set_attr "athlon_decode" "double,direct") + (set_attr "amdfam10_decode" "vector,double") (set_attr "fp_int_src" "true")]) (define_insn "*floatdidf2_i387" @@ -6860,6 +6896,14 @@ "TARGET_64BIT" "") +;; On AMDFAM10 +;; IMUL reg64, reg64, imm8 Direct +;; IMUL reg64, mem64, imm8 VectorPath +;; IMUL reg64, reg64, imm32 Direct +;; IMUL reg64, mem64, imm32 VectorPath +;; IMUL reg64, reg64 Direct +;; IMUL reg64, mem64 Direct + (define_insn "*muldi3_1_rex64" [(set (match_operand:DI 0 "register_operand" "=r,r,r") (mult:DI (match_operand:DI 1 "nonimmediate_operand" "%rm,rm,0") @@ -6882,6 +6926,11 @@ (match_operand 1 "memory_operand" "")) (const_string "vector")] (const_string "direct"))) + (set (attr "amdfam10_decode") + (cond [(and (eq_attr "alternative" "0,1") + (match_operand 1 "memory_operand" "")) + (const_string "vector")] + (const_string "direct"))) (set_attr "mode" "DI")]) (define_expand "mulsi3" @@ -6892,6 +6941,14 @@ "" "") +;; On AMDFAM10 +;; IMUL reg32, reg32, imm8 Direct +;; IMUL reg32, mem32, imm8 VectorPath +;; IMUL reg32, reg32, imm32 Direct +;; IMUL reg32, mem32, imm32 VectorPath +;; IMUL reg32, reg32 Direct +;; IMUL reg32, mem32 Direct + (define_insn "*mulsi3_1" [(set (match_operand:SI 0 "register_operand" "=r,r,r") (mult:SI (match_operand:SI 1 "nonimmediate_operand" "%rm,rm,0") @@ -6913,6 +6970,11 @@ (match_operand 1 "memory_operand" "")) (const_string "vector")] (const_string "direct"))) + (set (attr "amdfam10_decode") + (cond [(and (eq_attr "alternative" "0,1") + (match_operand 1 "memory_operand" "")) + (const_string "vector")] + (const_string "direct"))) (set_attr "mode" "SI")]) (define_insn "*mulsi3_1_zext" @@ -6938,6 +7000,11 @@ (match_operand 1 "memory_operand" "")) (const_string "vector")] (const_string "direct"))) + (set (attr "amdfam10_decode") + (cond [(and (eq_attr "alternative" "0,1") + (match_operand 1 "memory_operand" "")) + (const_string "vector")] + (const_string "direct"))) (set_attr "mode" "SI")]) (define_expand "mulhi3" @@ -6948,6 +7015,13 @@ "TARGET_HIMODE_MATH" "") +;; On AMDFAM10 +;; IMUL reg16, reg16, imm8 VectorPath +;; IMUL reg16, mem16, imm8 VectorPath +;; IMUL reg16, reg16, imm16 VectorPath +;; IMUL reg16, mem16, imm16 VectorPath +;; IMUL reg16, reg16 Direct +;; IMUL reg16, mem16 Direct (define_insn "*mulhi3_1" [(set (match_operand:HI 0 "register_operand" "=r,r,r") (mult:HI (match_operand:HI 1 "nonimmediate_operand" "%rm,rm,0") @@ -6966,6 +7040,10 @@ (eq_attr "alternative" "1,2") (const_string "vector")] (const_string "direct"))) + (set (attr "amdfam10_decode") + (cond [(eq_attr "alternative" "0,1") + (const_string "vector")] + (const_string "direct"))) (set_attr "mode" "HI")]) (define_expand "mulqi3" @@ -6976,6 +7054,10 @@ "TARGET_QIMODE_MATH" "") +;;On AMDFAM10 +;; MUL reg8 Direct +;; MUL mem8 Direct + (define_insn "*mulqi3_1" [(set (match_operand:QI 0 "register_operand" "=a") (mult:QI (match_operand:QI 1 "nonimmediate_operand" "%0") @@ -6990,6 +7072,7 @@ (if_then_else (eq_attr "cpu" "athlon") (const_string "vector") (const_string "direct"))) + (set_attr "amdfam10_decode" "direct") (set_attr "mode" "QI")]) (define_expand "umulqihi3" @@ -7016,6 +7099,7 @@ (if_then_else (eq_attr "cpu" "athlon") (const_string "vector") (const_string "direct"))) + (set_attr "amdfam10_decode" "direct") (set_attr "mode" "QI")]) (define_expand "mulqihi3" @@ -7040,6 +7124,7 @@ (if_then_else (eq_attr "cpu" "athlon") (const_string "vector") (const_string "direct"))) + (set_attr "amdfam10_decode" "direct") (set_attr "mode" "QI")]) (define_expand "umulditi3" @@ -7066,6 +7151,7 @@ (if_then_else (eq_attr "cpu" "athlon") (const_string "vector") (const_string "double"))) + (set_attr "amdfam10_decode" "double") (set_attr "mode" "DI")]) ;; We can't use this pattern in 64bit mode, since it results in two separate 32bit registers @@ -7093,6 +7179,7 @@ (if_then_else (eq_attr "cpu" "athlon") (const_string "vector") (const_string "double"))) + (set_attr "amdfam10_decode" "double") (set_attr "mode" "SI")]) (define_expand "mulditi3" @@ -7119,6 +7206,7 @@ (if_then_else (eq_attr "cpu" "athlon") (const_string "vector") (const_string "double"))) + (set_attr "amdfam10_decode" "double") (set_attr "mode" "DI")]) (define_expand "mulsidi3" @@ -7145,6 +7233,7 @@ (if_then_else (eq_attr "cpu" "athlon") (const_string "vector") (const_string "double"))) + (set_attr "amdfam10_decode" "double") (set_attr "mode" "SI")]) (define_expand "umuldi3_highpart" @@ -7181,6 +7270,7 @@ (if_then_else (eq_attr "cpu" "athlon") (const_string "vector") (const_string "double"))) + (set_attr "amdfam10_decode" "double") (set_attr "mode" "DI")]) (define_expand "umulsi3_highpart" @@ -7216,6 +7306,7 @@ (if_then_else (eq_attr "cpu" "athlon") (const_string "vector") (const_string "double"))) + (set_attr "amdfam10_decode" "double") (set_attr "mode" "SI")]) (define_insn "*umulsi3_highpart_zext" @@ -7238,6 +7329,7 @@ (if_then_else (eq_attr "cpu" "athlon") (const_string "vector") (const_string "double"))) + (set_attr "amdfam10_decode" "double") (set_attr "mode" "SI")]) (define_expand "smuldi3_highpart" @@ -7273,6 +7365,7 @@ (if_then_else (eq_attr "cpu" "athlon") (const_string "vector") (const_string "double"))) + (set_attr "amdfam10_decode" "double") (set_attr "mode" "DI")]) (define_expand "smulsi3_highpart" @@ -7307,6 +7400,7 @@ (if_then_else (eq_attr "cpu" "athlon") (const_string "vector") (const_string "double"))) + (set_attr "amdfam10_decode" "double") (set_attr "mode" "SI")]) (define_insn "*smulsi3_highpart_zext" @@ -7328,6 +7422,7 @@ (if_then_else (eq_attr "cpu" "athlon") (const_string "vector") (const_string "double"))) + (set_attr "amdfam10_decode" "double") (set_attr "mode" "SI")]) ;; The patterns that match these are at the end of this file. @@ -10309,7 +10404,8 @@ [(set_attr "type" "ishift") (set_attr "prefix_0f" "1") (set_attr "mode" "DI") - (set_attr "athlon_decode" "vector")]) + (set_attr "athlon_decode" "vector") + (set_attr "amdfam10_decode" "vector")]) (define_expand "x86_64_shift_adj" [(set (reg:CCZ FLAGS_REG) @@ -10524,7 +10620,8 @@ (set_attr "prefix_0f" "1") (set_attr "mode" "SI") (set_attr "pent_pair" "np") - (set_attr "athlon_decode" "vector")]) + (set_attr "athlon_decode" "vector") + (set_attr "amdfam10_decode" "vector")]) (define_expand "x86_shift_adj_1" [(set (reg:CCZ FLAGS_REG) @@ -11284,7 +11381,8 @@ [(set_attr "type" "ishift") (set_attr "prefix_0f" "1") (set_attr "mode" "DI") - (set_attr "athlon_decode" "vector")]) + (set_attr "athlon_decode" "vector") + (set_attr "amdfam10_decode" "vector")]) (define_expand "ashrdi3" [(set (match_operand:DI 0 "shiftdi_operand" "") @@ -14156,7 +14254,7 @@ [(set_attr "type" "call")]) (define_insn "*sibcall_1_rex64_v" - [(call (mem:QI (reg:DI 40)) + [(call (mem:QI (reg:DI R11_REG)) (match_operand 0 "" ""))] "SIBLING_CALL_P (insn) && TARGET_64BIT" "jmp\t*%%r11" @@ -14558,7 +14656,23 @@ [(set (match_dup 0) (xor:SI (match_dup 0) (const_int 31))) (clobber (reg:CC FLAGS_REG))])] "" - "") +{ + if (TARGET_ABM) + { + emit_insn (gen_clzsi2_abm (operands[0], operands[1])); + DONE; + } +}) + +(define_insn "clzsi2_abm" + [(set (match_operand:SI 0 "register_operand" "=r") + (clz:SI (match_operand:SI 1 "nonimmediate_operand" ""))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_ABM" + "lzcnt{l}\t{%1, %0|%0, %1}" + [(set_attr "prefix_rep" "1") + (set_attr "type" "bitmanip") + (set_attr "mode" "SI")]) (define_insn "*bsr" [(set (match_operand:SI 0 "register_operand" "=r") @@ -14567,7 +14681,44 @@ (clobber (reg:CC FLAGS_REG))] "" "bsr{l}\t{%1, %0|%0, %1}" - [(set_attr "prefix_0f" "1")]) + [(set_attr "prefix_0f" "1") + (set_attr "mode" "SI")]) + +(define_insn "popcountsi2" + [(set (match_operand:SI 0 "register_operand" "=r") + (popcount:SI (match_operand:SI 1 "nonimmediate_operand" ""))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_POPCNT" + "popcnt{l}\t{%1, %0|%0, %1}" + [(set_attr "prefix_rep" "1") + (set_attr "type" "bitmanip") + (set_attr "mode" "SI")]) + +(define_insn "*popcountsi2_cmp" + [(set (reg FLAGS_REG) + (compare + (popcount:SI (match_operand:SI 1 "nonimmediate_operand" "rm")) + (const_int 0))) + (set (match_operand:SI 0 "register_operand" "=r") + (popcount:SI (match_dup 1)))] + "TARGET_POPCNT && ix86_match_ccmode (insn, CCZmode)" + "popcnt{l}\t{%1, %0|%0, %1}" + [(set_attr "prefix_rep" "1") + (set_attr "type" "bitmanip") + (set_attr "mode" "SI")]) + +(define_insn "*popcountsi2_cmp_zext" + [(set (reg FLAGS_REG) + (compare + (popcount:SI (match_operand:SI 1 "nonimmediate_operand" "rm")) + (const_int 0))) + (set (match_operand:DI 0 "register_operand" "=r") + (zero_extend:DI(popcount:SI (match_dup 1))))] + "TARGET_64BIT && TARGET_POPCNT && ix86_match_ccmode (insn, CCZmode)" + "popcnt{l}\t{%1, %0|%0, %1}" + [(set_attr "prefix_rep" "1") + (set_attr "type" "bitmanip") + (set_attr "mode" "SI")]) (define_expand "clzdi2" [(parallel @@ -14579,7 +14730,23 @@ [(set (match_dup 0) (xor:DI (match_dup 0) (const_int 63))) (clobber (reg:CC FLAGS_REG))])] "TARGET_64BIT" - "") +{ + if (TARGET_ABM) + { + emit_insn (gen_clzdi2_abm (operands[0], operands[1])); + DONE; + } +}) + +(define_insn "clzdi2_abm" + [(set (match_operand:DI 0 "register_operand" "=r") + (clz:DI (match_operand:DI 1 "nonimmediate_operand" ""))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_64BIT && TARGET_ABM" + "lzcnt{q}\t{%1, %0|%0, %1}" + [(set_attr "prefix_rep" "1") + (set_attr "type" "bitmanip") + (set_attr "mode" "DI")]) (define_insn "*bsr_rex64" [(set (match_operand:DI 0 "register_operand" "=r") @@ -14588,7 +14755,92 @@ (clobber (reg:CC FLAGS_REG))] "TARGET_64BIT" "bsr{q}\t{%1, %0|%0, %1}" - [(set_attr "prefix_0f" "1")]) + [(set_attr "prefix_0f" "1") + (set_attr "mode" "DI")]) + +(define_insn "popcountdi2" + [(set (match_operand:DI 0 "register_operand" "=r") + (popcount:DI (match_operand:DI 1 "nonimmediate_operand" ""))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_64BIT && TARGET_POPCNT" + "popcnt{q}\t{%1, %0|%0, %1}" + [(set_attr "prefix_rep" "1") + (set_attr "type" "bitmanip") + (set_attr "mode" "DI")]) + +(define_insn "*popcountdi2_cmp" + [(set (reg FLAGS_REG) + (compare + (popcount:DI (match_operand:DI 1 "nonimmediate_operand" "rm")) + (const_int 0))) + (set (match_operand:DI 0 "register_operand" "=r") + (popcount:DI (match_dup 1)))] + "TARGET_64BIT && TARGET_POPCNT && ix86_match_ccmode (insn, CCZmode)" + "popcnt{q}\t{%1, %0|%0, %1}" + [(set_attr "prefix_rep" "1") + (set_attr "type" "bitmanip") + (set_attr "mode" "DI")]) + +(define_expand "clzhi2" + [(parallel + [(set (match_operand:HI 0 "register_operand" "") + (minus:HI (const_int 15) + (clz:HI (match_operand:HI 1 "nonimmediate_operand" "")))) + (clobber (reg:CC FLAGS_REG))]) + (parallel + [(set (match_dup 0) (xor:HI (match_dup 0) (const_int 15))) + (clobber (reg:CC FLAGS_REG))])] + "" +{ + if (TARGET_ABM) + { + emit_insn (gen_clzhi2_abm (operands[0], operands[1])); + DONE; + } +}) + +(define_insn "clzhi2_abm" + [(set (match_operand:HI 0 "register_operand" "=r") + (clz:HI (match_operand:HI 1 "nonimmediate_operand" ""))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_ABM" + "lzcnt{w}\t{%1, %0|%0, %1}" + [(set_attr "prefix_rep" "1") + (set_attr "type" "bitmanip") + (set_attr "mode" "HI")]) + +(define_insn "*bsrhi" + [(set (match_operand:HI 0 "register_operand" "=r") + (minus:HI (const_int 15) + (clz:HI (match_operand:HI 1 "nonimmediate_operand" "rm")))) + (clobber (reg:CC FLAGS_REG))] + "" + "bsr{w}\t{%1, %0|%0, %1}" + [(set_attr "prefix_0f" "1") + (set_attr "mode" "HI")]) + +(define_insn "popcounthi2" + [(set (match_operand:HI 0 "register_operand" "=r") + (popcount:HI (match_operand:HI 1 "nonimmediate_operand" ""))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_POPCNT" + "popcnt{w}\t{%1, %0|%0, %1}" + [(set_attr "prefix_rep" "1") + (set_attr "type" "bitmanip") + (set_attr "mode" "HI")]) + +(define_insn "*popcounthi2_cmp" + [(set (reg FLAGS_REG) + (compare + (popcount:HI (match_operand:HI 1 "nonimmediate_operand" "rm")) + (const_int 0))) + (set (match_operand:HI 0 "register_operand" "=r") + (popcount:HI (match_dup 1)))] + "TARGET_POPCNT && ix86_match_ccmode (insn, CCZmode)" + "popcnt{w}\t{%1, %0|%0, %1}" + [(set_attr "prefix_rep" "1") + (set_attr "type" "bitmanip") + (set_attr "mode" "HI")]) ;; Thread-local storage patterns for ELF. ;; @@ -15503,7 +15755,8 @@ "sqrtss\t{%1, %0|%0, %1}" [(set_attr "type" "sse") (set_attr "mode" "SF") - (set_attr "athlon_decode" "*")]) + (set_attr "athlon_decode" "*") + (set_attr "amdfam10_decode" "*")]) (define_insn "*sqrtsf2_i387" [(set (match_operand:SF 0 "register_operand" "=f") @@ -15541,7 +15794,8 @@ "sqrtsd\t{%1, %0|%0, %1}" [(set_attr "type" "sse") (set_attr "mode" "DF") - (set_attr "athlon_decode" "*")]) + (set_attr "athlon_decode" "*") + (set_attr "amdfam10_decode" "*")]) (define_insn "*sqrtdf2_i387" [(set (match_operand:DF 0 "register_operand" "=f") @@ -15570,7 +15824,8 @@ "fsqrt" [(set_attr "type" "fpspc") (set_attr "mode" "XF") - (set_attr "athlon_decode" "direct")]) + (set_attr "athlon_decode" "direct") + (set_attr "amdfam10_decode" "direct")]) (define_insn "*sqrtextendsfxf2_i387" [(set (match_operand:XF 0 "register_operand" "=f") @@ -15590,7 +15845,8 @@ "fsqrt" [(set_attr "type" "fpspc") (set_attr "mode" "XF") - (set_attr "athlon_decode" "direct")]) + (set_attr "athlon_decode" "direct") + (set_attr "amdfam10_decode" "direct")]) (define_insn "fpremxf4" [(set (match_operand:XF 0 "register_operand" "=f") @@ -20391,7 +20647,7 @@ (mult:DI (match_operand:DI 1 "memory_operand" "") (match_operand:DI 2 "immediate_operand" ""))) (clobber (reg:CC FLAGS_REG))])] - "(TARGET_K8 || TARGET_GENERIC64) && !optimize_size + "(TARGET_K8 || TARGET_GENERIC64 || TARGET_AMDFAM10) && !optimize_size && !satisfies_constraint_K (operands[2])" [(set (match_dup 3) (match_dup 1)) (parallel [(set (match_dup 0) (mult:DI (match_dup 3) (match_dup 2))) @@ -20404,7 +20660,7 @@ (mult:SI (match_operand:SI 1 "memory_operand" "") (match_operand:SI 2 "immediate_operand" ""))) (clobber (reg:CC FLAGS_REG))])] - "(TARGET_K8 || TARGET_GENERIC64) && !optimize_size + "(TARGET_K8 || TARGET_GENERIC64 || TARGET_AMDFAM10) && !optimize_size && !satisfies_constraint_K (operands[2])" [(set (match_dup 3) (match_dup 1)) (parallel [(set (match_dup 0) (mult:SI (match_dup 3) (match_dup 2))) @@ -20418,7 +20674,7 @@ (mult:SI (match_operand:SI 1 "memory_operand" "") (match_operand:SI 2 "immediate_operand" "")))) (clobber (reg:CC FLAGS_REG))])] - "(TARGET_K8 || TARGET_GENERIC64) && !optimize_size + "(TARGET_K8 || TARGET_GENERIC64 || TARGET_AMDFAM10) && !optimize_size && !satisfies_constraint_K (operands[2])" [(set (match_dup 3) (match_dup 1)) (parallel [(set (match_dup 0) (zero_extend:DI (mult:SI (match_dup 3) (match_dup 2)))) @@ -20435,7 +20691,7 @@ (match_operand:DI 2 "const_int_operand" ""))) (clobber (reg:CC FLAGS_REG))]) (match_scratch:DI 3 "r")] - "(TARGET_K8 || TARGET_GENERIC64) && !optimize_size + "(TARGET_K8 || TARGET_GENERIC64 || TARGET_AMDFAM10) && !optimize_size && satisfies_constraint_K (operands[2])" [(set (match_dup 3) (match_dup 2)) (parallel [(set (match_dup 0) (mult:DI (match_dup 0) (match_dup 3))) @@ -20451,7 +20707,7 @@ (match_operand:SI 2 "const_int_operand" ""))) (clobber (reg:CC FLAGS_REG))]) (match_scratch:SI 3 "r")] - "(TARGET_K8 || TARGET_GENERIC64) && !optimize_size + "(TARGET_K8 || TARGET_GENERIC64 || TARGET_AMDFAM10) && !optimize_size && satisfies_constraint_K (operands[2])" [(set (match_dup 3) (match_dup 2)) (parallel [(set (match_dup 0) (mult:SI (match_dup 0) (match_dup 3))) @@ -20467,7 +20723,7 @@ (match_operand:HI 2 "immediate_operand" ""))) (clobber (reg:CC FLAGS_REG))]) (match_scratch:HI 3 "r")] - "(TARGET_K8 || TARGET_GENERIC64) && !optimize_size" + "(TARGET_K8 || TARGET_GENERIC64 || TARGET_AMDFAM10) && !optimize_size" [(set (match_dup 3) (match_dup 2)) (parallel [(set (match_dup 0) (mult:HI (match_dup 0) (match_dup 3))) (clobber (reg:CC FLAGS_REG))])] @@ -20646,7 +20902,7 @@ (define_insn "*sibcall_value_1_rex64_v" [(set (match_operand 0 "" "") - (call (mem:QI (reg:DI 40)) + (call (mem:QI (reg:DI R11_REG)) (match_operand:DI 1 "" "")))] "SIBLING_CALL_P (insn) && TARGET_64BIT" "jmp\t*%%r11" @@ -20665,14 +20921,14 @@ (define_expand "sse_prologue_save" [(parallel [(set (match_operand:BLK 0 "" "") - (unspec:BLK [(reg:DI 21) - (reg:DI 22) + (unspec:BLK [(reg:DI 22) (reg:DI 23) (reg:DI 24) (reg:DI 25) (reg:DI 26) (reg:DI 27) - (reg:DI 28)] UNSPEC_SSE_PROLOGUE_SAVE)) + (reg:DI 28) + (reg:DI 29)] UNSPEC_SSE_PROLOGUE_SAVE)) (use (match_operand:DI 1 "register_operand" "")) (use (match_operand:DI 2 "immediate_operand" "")) (use (label_ref:DI (match_operand 3 "" "")))])] @@ -20682,14 +20938,14 @@ (define_insn "*sse_prologue_save_insn" [(set (mem:BLK (plus:DI (match_operand:DI 0 "register_operand" "R") (match_operand:DI 4 "const_int_operand" "n"))) - (unspec:BLK [(reg:DI 21) - (reg:DI 22) + (unspec:BLK [(reg:DI 22) (reg:DI 23) (reg:DI 24) (reg:DI 25) (reg:DI 26) (reg:DI 27) - (reg:DI 28)] UNSPEC_SSE_PROLOGUE_SAVE)) + (reg:DI 28) + (reg:DI 29)] UNSPEC_SSE_PROLOGUE_SAVE)) (use (match_operand:DI 1 "register_operand" "r")) (use (match_operand:DI 2 "const_int_operand" "i")) (use (label_ref:DI (match_operand 3 "" "X")))] diff --git a/contrib/gcc/config/i386/i386.opt b/contrib/gcc/config/i386/i386.opt index aa24920..fa73e17 100644 --- a/contrib/gcc/config/i386/i386.opt +++ b/contrib/gcc/config/i386/i386.opt @@ -1,6 +1,6 @@ ; Options for the IA-32 and AMD64 ports of the compiler. -; Copyright (C) 2005 Free Software Foundation, Inc. +; Copyright (C) 2005, 2006, 2007 Free Software Foundation, Inc. ; ; This file is part of GCC. ; @@ -201,6 +201,22 @@ mssse3 Target Report Mask(SSSE3) Support MMX, SSE, SSE2, SSE3 and SSSE3 built-in functions and code generation +msse4a +Target Report Mask(SSE4A) +Support MMX, SSE, SSE2, SSE3 and SSE4A built-in functions and code generation + +mpopcnt +Target Report Mask(POPCNT) +Support code generation of popcount instruction for popcount built-ins +namely __builtin_popcount, __builtin_popcountl and __builtin_popcountll + +mabm +Target Report Mask(ABM) +Support code generation of Advanced Bit Manipulation (ABM) instructions, +which include popcnt and lzcnt instructions, for popcount and clz built-ins +namely __builtin_popcount, __builtin_popcountl, __builtin_popcountll and +__builtin_clz, __builtin_clzl, __builtin_clzll + msseregparm Target RejectNegative Mask(SSEREGPARM) Use SSE register passing conventions for SF and DF mode diff --git a/contrib/gcc/config/i386/mmx.md b/contrib/gcc/config/i386/mmx.md index 4f0ab2c..6fc9da4 100644 --- a/contrib/gcc/config/i386/mmx.md +++ b/contrib/gcc/config/i386/mmx.md @@ -1396,14 +1396,14 @@ (clobber (reg:XF 13)) (clobber (reg:XF 14)) (clobber (reg:XF 15)) - (clobber (reg:DI 29)) (clobber (reg:DI 30)) (clobber (reg:DI 31)) (clobber (reg:DI 32)) (clobber (reg:DI 33)) (clobber (reg:DI 34)) (clobber (reg:DI 35)) - (clobber (reg:DI 36))] + (clobber (reg:DI 36)) + (clobber (reg:DI 37))] "TARGET_MMX" "emms" [(set_attr "type" "mmx") @@ -1419,14 +1419,14 @@ (clobber (reg:XF 13)) (clobber (reg:XF 14)) (clobber (reg:XF 15)) - (clobber (reg:DI 29)) (clobber (reg:DI 30)) (clobber (reg:DI 31)) (clobber (reg:DI 32)) (clobber (reg:DI 33)) (clobber (reg:DI 34)) (clobber (reg:DI 35)) - (clobber (reg:DI 36))] + (clobber (reg:DI 36)) + (clobber (reg:DI 37))] "TARGET_3DNOW" "femms" [(set_attr "type" "mmx") diff --git a/contrib/gcc/config/i386/pmmintrin.h b/contrib/gcc/config/i386/pmmintrin.h index 7dbf030..39d7c17 100644 --- a/contrib/gcc/config/i386/pmmintrin.h +++ b/contrib/gcc/config/i386/pmmintrin.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2003, 2004, 2005, 2006 Free Software Foundation, Inc. +/* Copyright (C) 2003, 2004, 2005, 2006, 2007 Free Software Foundation, Inc. This file is part of GCC. @@ -30,7 +30,11 @@ #ifndef _PMMINTRIN_H_INCLUDED #define _PMMINTRIN_H_INCLUDED -#ifdef __SSE3__ +#ifndef __SSE3__ +# error "SSE3 instruction set not enabled" +#else + +/* We need definitions from the SSE2 and SSE header files*/ #include #include diff --git a/contrib/gcc/config/i386/sse.md b/contrib/gcc/config/i386/sse.md index 4e25280..6ff9f08 100644 --- a/contrib/gcc/config/i386/sse.md +++ b/contrib/gcc/config/i386/sse.md @@ -1,5 +1,5 @@ ;; GCC machine description for SSE instructions -;; Copyright (C) 2005, 2006 +;; Copyright (C) 2005, 2006, 2007 ;; Free Software Foundation, Inc. ;; ;; This file is part of GCC. @@ -261,7 +261,7 @@ (define_insn "sse3_lddqu" [(set (match_operand:V16QI 0 "register_operand" "=x") (unspec:V16QI [(match_operand:V16QI 1 "memory_operand" "m")] - UNSPEC_LDQQU))] + UNSPEC_LDDQU))] "TARGET_SSE3" "lddqu\t{%1, %0|%0, %1}" [(set_attr "type" "ssecvt") @@ -920,6 +920,7 @@ "cvtsi2ss\t{%2, %0|%0, %2}" [(set_attr "type" "sseicvt") (set_attr "athlon_decode" "vector,double") + (set_attr "amdfam10_decode" "vector,double") (set_attr "mode" "SF")]) (define_insn "sse_cvtsi2ssq" @@ -933,6 +934,7 @@ "cvtsi2ssq\t{%2, %0|%0, %2}" [(set_attr "type" "sseicvt") (set_attr "athlon_decode" "vector,double") + (set_attr "amdfam10_decode" "vector,double") (set_attr "mode" "SF")]) (define_insn "sse_cvtss2si" @@ -946,6 +948,7 @@ "cvtss2si\t{%1, %0|%0, %1}" [(set_attr "type" "sseicvt") (set_attr "athlon_decode" "double,vector") + (set_attr "amdfam10_decode" "double,double") (set_attr "mode" "SI")]) (define_insn "sse_cvtss2siq" @@ -959,6 +962,7 @@ "cvtss2siq\t{%1, %0|%0, %1}" [(set_attr "type" "sseicvt") (set_attr "athlon_decode" "double,vector") + (set_attr "amdfam10_decode" "double,double") (set_attr "mode" "DI")]) (define_insn "sse_cvttss2si" @@ -971,6 +975,7 @@ "cvttss2si\t{%1, %0|%0, %1}" [(set_attr "type" "sseicvt") (set_attr "athlon_decode" "double,vector") + (set_attr "amdfam10_decode" "double,double") (set_attr "mode" "SI")]) (define_insn "sse_cvttss2siq" @@ -983,6 +988,7 @@ "cvttss2siq\t{%1, %0|%0, %1}" [(set_attr "type" "sseicvt") (set_attr "athlon_decode" "double,vector") + (set_attr "amdfam10_decode" "double,double") (set_attr "mode" "DI")]) (define_insn "sse2_cvtdq2ps" @@ -1852,7 +1858,8 @@ "cvtsi2sd\t{%2, %0|%0, %2}" [(set_attr "type" "sseicvt") (set_attr "mode" "DF") - (set_attr "athlon_decode" "double,direct")]) + (set_attr "athlon_decode" "double,direct") + (set_attr "amdfam10_decode" "vector,double")]) (define_insn "sse2_cvtsi2sdq" [(set (match_operand:V2DF 0 "register_operand" "=x,x") @@ -1865,7 +1872,8 @@ "cvtsi2sdq\t{%2, %0|%0, %2}" [(set_attr "type" "sseicvt") (set_attr "mode" "DF") - (set_attr "athlon_decode" "double,direct")]) + (set_attr "athlon_decode" "double,direct") + (set_attr "amdfam10_decode" "vector,double")]) (define_insn "sse2_cvtsd2si" [(set (match_operand:SI 0 "register_operand" "=r,r") @@ -1878,6 +1886,7 @@ "cvtsd2si\t{%1, %0|%0, %1}" [(set_attr "type" "sseicvt") (set_attr "athlon_decode" "double,vector") + (set_attr "amdfam10_decode" "double,double") (set_attr "mode" "SI")]) (define_insn "sse2_cvtsd2siq" @@ -1891,6 +1900,7 @@ "cvtsd2siq\t{%1, %0|%0, %1}" [(set_attr "type" "sseicvt") (set_attr "athlon_decode" "double,vector") + (set_attr "amdfam10_decode" "double,double") (set_attr "mode" "DI")]) (define_insn "sse2_cvttsd2si" @@ -1903,7 +1913,8 @@ "cvttsd2si\t{%1, %0|%0, %1}" [(set_attr "type" "sseicvt") (set_attr "mode" "SI") - (set_attr "athlon_decode" "double,vector")]) + (set_attr "athlon_decode" "double,vector") + (set_attr "amdfam10_decode" "double,double")]) (define_insn "sse2_cvttsd2siq" [(set (match_operand:DI 0 "register_operand" "=r,r") @@ -1915,7 +1926,8 @@ "cvttsd2siq\t{%1, %0|%0, %1}" [(set_attr "type" "sseicvt") (set_attr "mode" "DI") - (set_attr "athlon_decode" "double,vector")]) + (set_attr "athlon_decode" "double,vector") + (set_attr "amdfam10_decode" "double,double")]) (define_insn "sse2_cvtdq2pd" [(set (match_operand:V2DF 0 "register_operand" "=x") @@ -1946,7 +1958,8 @@ "TARGET_SSE2" "cvtpd2dq\t{%1, %0|%0, %1}" [(set_attr "type" "ssecvt") - (set_attr "mode" "TI")]) + (set_attr "mode" "TI") + (set_attr "amdfam10_decode" "double")]) (define_expand "sse2_cvttpd2dq" [(set (match_operand:V4SI 0 "register_operand" "") @@ -1964,7 +1977,8 @@ "TARGET_SSE2" "cvttpd2dq\t{%1, %0|%0, %1}" [(set_attr "type" "ssecvt") - (set_attr "mode" "TI")]) + (set_attr "mode" "TI") + (set_attr "amdfam10_decode" "double")]) (define_insn "sse2_cvtsd2ss" [(set (match_operand:V4SF 0 "register_operand" "=x,x") @@ -1978,20 +1992,22 @@ "cvtsd2ss\t{%2, %0|%0, %2}" [(set_attr "type" "ssecvt") (set_attr "athlon_decode" "vector,double") + (set_attr "amdfam10_decode" "vector,double") (set_attr "mode" "SF")]) (define_insn "sse2_cvtss2sd" - [(set (match_operand:V2DF 0 "register_operand" "=x") + [(set (match_operand:V2DF 0 "register_operand" "=x,x") (vec_merge:V2DF (float_extend:V2DF (vec_select:V2SF - (match_operand:V4SF 2 "nonimmediate_operand" "xm") + (match_operand:V4SF 2 "nonimmediate_operand" "x,m") (parallel [(const_int 0) (const_int 1)]))) - (match_operand:V2DF 1 "register_operand" "0") + (match_operand:V2DF 1 "register_operand" "0,0") (const_int 1)))] "TARGET_SSE2" "cvtss2sd\t{%2, %0|%0, %2}" [(set_attr "type" "ssecvt") + (set_attr "amdfam10_decode" "vector,double") (set_attr "mode" "DF")]) (define_expand "sse2_cvtpd2ps" @@ -2012,7 +2028,8 @@ "TARGET_SSE2" "cvtpd2ps\t{%1, %0|%0, %1}" [(set_attr "type" "ssecvt") - (set_attr "mode" "V4SF")]) + (set_attr "mode" "V4SF") + (set_attr "amdfam10_decode" "double")]) (define_insn "sse2_cvtps2pd" [(set (match_operand:V2DF 0 "register_operand" "=x") @@ -2023,7 +2040,8 @@ "TARGET_SSE2" "cvtps2pd\t{%1, %0|%0, %1}" [(set_attr "type" "ssecvt") - (set_attr "mode" "V2DF")]) + (set_attr "mode" "V2DF") + (set_attr "amdfam10_decode" "direct")]) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; @@ -4524,3 +4542,92 @@ "pabs\t{%1, %0|%0, %1}"; [(set_attr "type" "sselog1") (set_attr "mode" "DI")]) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; +;; AMD SSE4A instructions +;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(define_insn "sse4a_vmmovntv2df" + [(set (match_operand:DF 0 "memory_operand" "=m") + (unspec:DF [(vec_select:DF + (match_operand:V2DF 1 "register_operand" "x") + (parallel [(const_int 0)]))] + UNSPEC_MOVNT))] + "TARGET_SSE4A" + "movntsd\t{%1, %0|%0, %1}" + [(set_attr "type" "ssemov") + (set_attr "mode" "DF")]) + +(define_insn "sse4a_movntdf" + [(set (match_operand:DF 0 "memory_operand" "=m") + (unspec:DF [(match_operand:DF 1 "register_operand" "x")] + UNSPEC_MOVNT))] + "TARGET_SSE4A" + "movntsd\t{%1, %0|%0, %1}" + [(set_attr "type" "ssemov") + (set_attr "mode" "DF")]) + +(define_insn "sse4a_vmmovntv4sf" + [(set (match_operand:SF 0 "memory_operand" "=m") + (unspec:SF [(vec_select:SF + (match_operand:V4SF 1 "register_operand" "x") + (parallel [(const_int 0)]))] + UNSPEC_MOVNT))] + "TARGET_SSE4A" + "movntss\t{%1, %0|%0, %1}" + [(set_attr "type" "ssemov") + (set_attr "mode" "SF")]) + +(define_insn "sse4a_movntsf" + [(set (match_operand:SF 0 "memory_operand" "=m") + (unspec:SF [(match_operand:SF 1 "register_operand" "x")] + UNSPEC_MOVNT))] + "TARGET_SSE4A" + "movntss\t{%1, %0|%0, %1}" + [(set_attr "type" "ssemov") + (set_attr "mode" "SF")]) + +(define_insn "sse4a_extrqi" + [(set (match_operand:V2DI 0 "register_operand" "=x") + (unspec:V2DI [(match_operand:V2DI 1 "register_operand" "0") + (match_operand 2 "const_int_operand" "") + (match_operand 3 "const_int_operand" "")] + UNSPEC_EXTRQI))] + "TARGET_SSE4A" + "extrq\t{%3, %2, %0|%0, %2, %3}" + [(set_attr "type" "sse") + (set_attr "mode" "TI")]) + +(define_insn "sse4a_extrq" + [(set (match_operand:V2DI 0 "register_operand" "=x") + (unspec:V2DI [(match_operand:V2DI 1 "register_operand" "0") + (match_operand:V16QI 2 "register_operand" "x")] + UNSPEC_EXTRQ))] + "TARGET_SSE4A" + "extrq\t{%2, %0|%0, %2}" + [(set_attr "type" "sse") + (set_attr "mode" "TI")]) + +(define_insn "sse4a_insertqi" + [(set (match_operand:V2DI 0 "register_operand" "=x") + (unspec:V2DI [(match_operand:V2DI 1 "register_operand" "0") + (match_operand:V2DI 2 "register_operand" "x") + (match_operand 3 "const_int_operand" "") + (match_operand 4 "const_int_operand" "")] + UNSPEC_INSERTQI))] + "TARGET_SSE4A" + "insertq\t{%4, %3, %2, %0|%0, %2, %3, %4}" + [(set_attr "type" "sseins") + (set_attr "mode" "TI")]) + +(define_insn "sse4a_insertq" + [(set (match_operand:V2DI 0 "register_operand" "=x") + (unspec:V2DI [(match_operand:V2DI 1 "register_operand" "0") + (match_operand:V2DI 2 "register_operand" "x")] + UNSPEC_INSERTQ))] + "TARGET_SSE4A" + "insertq\t{%2, %0|%0, %2}" + [(set_attr "type" "sseins") + (set_attr "mode" "TI")]) diff --git a/contrib/gcc/config/i386/tmmintrin.h b/contrib/gcc/config/i386/tmmintrin.h index e1bedc5..cf9d99d 100644 --- a/contrib/gcc/config/i386/tmmintrin.h +++ b/contrib/gcc/config/i386/tmmintrin.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2006 Free Software Foundation, Inc. +/* Copyright (C) 2006, 2007 Free Software Foundation, Inc. This file is part of GCC. @@ -30,231 +30,11 @@ #ifndef _TMMINTRIN_H_INCLUDED #define _TMMINTRIN_H_INCLUDED -#ifdef __SSSE3__ -#include - -static __inline __m128i __attribute__((__always_inline__)) -_mm_hadd_epi16 (__m128i __X, __m128i __Y) -{ - return (__m128i) __builtin_ia32_phaddw128 ((__v8hi)__X, (__v8hi)__Y); -} - -static __inline __m128i __attribute__((__always_inline__)) -_mm_hadd_epi32 (__m128i __X, __m128i __Y) -{ - return (__m128i) __builtin_ia32_phaddd128 ((__v4si)__X, (__v4si)__Y); -} - -static __inline __m128i __attribute__((__always_inline__)) -_mm_hadds_epi16 (__m128i __X, __m128i __Y) -{ - return (__m128i) __builtin_ia32_phaddsw128 ((__v8hi)__X, (__v8hi)__Y); -} - -static __inline __m64 __attribute__((__always_inline__)) -_mm_hadd_pi16 (__m64 __X, __m64 __Y) -{ - return (__m64) __builtin_ia32_phaddw ((__v4hi)__X, (__v4hi)__Y); -} - -static __inline __m64 __attribute__((__always_inline__)) -_mm_hadd_pi32 (__m64 __X, __m64 __Y) -{ - return (__m64) __builtin_ia32_phaddd ((__v2si)__X, (__v2si)__Y); -} - -static __inline __m64 __attribute__((__always_inline__)) -_mm_hadds_pi16 (__m64 __X, __m64 __Y) -{ - return (__m64) __builtin_ia32_phaddsw ((__v4hi)__X, (__v4hi)__Y); -} - -static __inline __m128i __attribute__((__always_inline__)) -_mm_hsub_epi16 (__m128i __X, __m128i __Y) -{ - return (__m128i) __builtin_ia32_phsubw128 ((__v8hi)__X, (__v8hi)__Y); -} - -static __inline __m128i __attribute__((__always_inline__)) -_mm_hsub_epi32 (__m128i __X, __m128i __Y) -{ - return (__m128i) __builtin_ia32_phsubd128 ((__v4si)__X, (__v4si)__Y); -} - -static __inline __m128i __attribute__((__always_inline__)) -_mm_hsubs_epi16 (__m128i __X, __m128i __Y) -{ - return (__m128i) __builtin_ia32_phsubsw128 ((__v8hi)__X, (__v8hi)__Y); -} - -static __inline __m64 __attribute__((__always_inline__)) -_mm_hsub_pi16 (__m64 __X, __m64 __Y) -{ - return (__m64) __builtin_ia32_phsubw ((__v4hi)__X, (__v4hi)__Y); -} - -static __inline __m64 __attribute__((__always_inline__)) -_mm_hsub_pi32 (__m64 __X, __m64 __Y) -{ - return (__m64) __builtin_ia32_phsubd ((__v2si)__X, (__v2si)__Y); -} - -static __inline __m64 __attribute__((__always_inline__)) -_mm_hsubs_pi16 (__m64 __X, __m64 __Y) -{ - return (__m64) __builtin_ia32_phsubsw ((__v4hi)__X, (__v4hi)__Y); -} - -static __inline __m128i __attribute__((__always_inline__)) -_mm_maddubs_epi16 (__m128i __X, __m128i __Y) -{ - return (__m128i) __builtin_ia32_pmaddubsw128 ((__v16qi)__X, (__v16qi)__Y); -} - -static __inline __m64 __attribute__((__always_inline__)) -_mm_maddubs_pi16 (__m64 __X, __m64 __Y) -{ - return (__m64) __builtin_ia32_pmaddubsw ((__v8qi)__X, (__v8qi)__Y); -} - -static __inline __m128i __attribute__((__always_inline__)) -_mm_mulhrs_epi16 (__m128i __X, __m128i __Y) -{ - return (__m128i) __builtin_ia32_pmulhrsw128 ((__v8hi)__X, (__v8hi)__Y); -} - -static __inline __m64 __attribute__((__always_inline__)) -_mm_mulhrs_pi16 (__m64 __X, __m64 __Y) -{ - return (__m64) __builtin_ia32_pmulhrsw ((__v4hi)__X, (__v4hi)__Y); -} - -static __inline __m128i __attribute__((__always_inline__)) -_mm_shuffle_epi8 (__m128i __X, __m128i __Y) -{ - return (__m128i) __builtin_ia32_pshufb128 ((__v16qi)__X, (__v16qi)__Y); -} - -static __inline __m64 __attribute__((__always_inline__)) -_mm_shuffle_pi8 (__m64 __X, __m64 __Y) -{ - return (__m64) __builtin_ia32_pshufb ((__v8qi)__X, (__v8qi)__Y); -} - -static __inline __m128i __attribute__((__always_inline__)) -_mm_sign_epi8 (__m128i __X, __m128i __Y) -{ - return (__m128i) __builtin_ia32_psignb128 ((__v16qi)__X, (__v16qi)__Y); -} - -static __inline __m128i __attribute__((__always_inline__)) -_mm_sign_epi16 (__m128i __X, __m128i __Y) -{ - return (__m128i) __builtin_ia32_psignw128 ((__v8hi)__X, (__v8hi)__Y); -} - -static __inline __m128i __attribute__((__always_inline__)) -_mm_sign_epi32 (__m128i __X, __m128i __Y) -{ - return (__m128i) __builtin_ia32_psignd128 ((__v4si)__X, (__v4si)__Y); -} - -static __inline __m64 __attribute__((__always_inline__)) -_mm_sign_pi8 (__m64 __X, __m64 __Y) -{ - return (__m64) __builtin_ia32_psignb ((__v8qi)__X, (__v8qi)__Y); -} - -static __inline __m64 __attribute__((__always_inline__)) -_mm_sign_pi16 (__m64 __X, __m64 __Y) -{ - return (__m64) __builtin_ia32_psignw ((__v4hi)__X, (__v4hi)__Y); -} - -static __inline __m64 __attribute__((__always_inline__)) -_mm_sign_pi32 (__m64 __X, __m64 __Y) -{ - return (__m64) __builtin_ia32_psignd ((__v2si)__X, (__v2si)__Y); -} - -#define _mm_alignr_epi8(__X, __Y, __N) \ - ((__m128i)__builtin_ia32_palignr128 ((__v2di) __X, (__v2di) __Y, (__N) * 8)) - -#define _mm_alignr_pi8(__X, __Y, __N) \ - ((__m64)__builtin_ia32_palignr ((long long) (__X), (long long) (__Y), (__N) * 8)) - -static __inline __m128i __attribute__((__always_inline__)) -_mm_abs_epi8 (__m128i __X) -{ - return (__m128i) __builtin_ia32_pabsb128 ((__v16qi)__X); -} - -static __inline __m128i __attribute__((__always_inline__)) -_mm_abs_epi16 (__m128i __X) -{ - return (__m128i) __builtin_ia32_pabsw128 ((__v8hi)__X); -} - -static __inline __m128i __attribute__((__always_inline__)) -_mm_abs_epi32 (__m128i __X) -{ - return (__m128i) __builtin_ia32_pabsd128 ((__v4si)__X); -} - -static __inline __m64 __attribute__((__always_inline__)) -_mm_abs_pi8 (__m64 __X) -{ - return (__m64) __builtin_ia32_pabsb ((__v8qi)__X); -} - -static __inline __m64 __attribute__((__always_inline__)) -_mm_abs_pi16 (__m64 __X) -{ - return (__m64) __builtin_ia32_pabsw ((__v4hi)__X); -} - -static __inline __m64 __attribute__((__always_inline__)) -_mm_abs_pi32 (__m64 __X) -{ - return (__m64) __builtin_ia32_pabsd ((__v2si)__X); -} - -#endif /* __SSSE3__ */ - -#endif /* _TMMINTRIN_H_INCLUDED */ -/* Copyright (C) 2006 Free Software Foundation, Inc. - - This file is part of GCC. - - GCC is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2, or (at your option) - any later version. - - GCC is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with GCC; see the file COPYING. If not, write to - the Free Software Foundation, 59 Temple Place - Suite 330, - Boston, MA 02111-1307, USA. */ - -/* As a special exception, if you include this header file into source - files compiled by GCC, this header file does not by itself cause - the resulting executable to be covered by the GNU General Public - License. This exception does not however invalidate any other - reasons why the executable file might be covered by the GNU General - Public License. */ - -/* Implemented from the specification included in the Intel C++ Compiler - User Guide and Reference, version 9.1. */ - -#ifndef _TMMINTRIN_H_INCLUDED -#define _TMMINTRIN_H_INCLUDED +#ifndef __SSSE3__ +# error "SSSE3 instruction set not enabled" +#else -#ifdef __SSSE3__ +/* We need definitions from the SSE3, SSE2 and SSE header files*/ #include static __inline __m128i __attribute__((__always_inline__)) diff --git a/contrib/gcc/doc/extend.texi b/contrib/gcc/doc/extend.texi index 0c8bb7d..d7a1494 100644 --- a/contrib/gcc/doc/extend.texi +++ b/contrib/gcc/doc/extend.texi @@ -7255,6 +7255,23 @@ v4si __builtin_ia32_pabsd128 (v4si) v8hi __builtin_ia32_pabsw128 (v8hi) @end smallexample +The following built-in functions are available when @option{-msse4a} is used. + +@smallexample +void _mm_stream_sd (double*,__m128d); +Generates the @code{movntsd} machine instruction. +void _mm_stream_ss (float*,__m128); +Generates the @code{movntss} machine instruction. +__m128i _mm_extract_si64 (__m128i, __m128i); +Generates the @code{extrq} machine instruction with only SSE register operands. +__m128i _mm_extracti_si64 (__m128i, int, int); +Generates the @code{extrq} machine instruction with SSE register and immediate operands. +__m128i _mm_insert_si64 (__m128i, __m128i); +Generates the @code{insertq} machine instruction with only SSE register operands. +__m128i _mm_inserti_si64 (__m128i, __m128i, int, int); +Generates the @code{insertq} machine instruction with SSE register and immediate operands. +@end smallexample + The following built-in functions are available when @option{-m3dnow} is used. All of them generate the machine instruction that is part of the name. diff --git a/contrib/gcc/doc/invoke.texi b/contrib/gcc/doc/invoke.texi index 0c39136..0587a9e 100644 --- a/contrib/gcc/doc/invoke.texi +++ b/contrib/gcc/doc/invoke.texi @@ -513,7 +513,7 @@ in the following sections. -mno-fp-ret-in-387 -msoft-float -msvr3-shlib @gol -mno-wide-multiply -mrtd -malign-double @gol -mpreferred-stack-boundary=@var{num} @gol --mmmx -msse -msse2 -msse3 -mssse3 -m3dnow @gol +-mmmx -msse -msse2 -msse3 -mssse3 -msse4a -m3dnow -mpopcnt -mabm @gol -mthreads -mno-align-stringops -minline-all-stringops @gol -mpush-args -maccumulate-outgoing-args -m128bit-long-double @gol -m96bit-long-double -mregparm=@var{num} -msseregparm @gol @@ -9059,6 +9059,10 @@ instruction set support. @item k8, opteron, athlon64, athlon-fx AMD K8 core based CPUs with x86-64 instruction set support. (This supersets MMX, SSE, SSE2, 3dNOW!, enhanced 3dNOW! and 64-bit instruction set extensions.) +@item amdfam10, barcelona +AMD Family 10h core based CPUs with x86-64 instruction set support. (This +supersets MMX, SSE, SSE2, SSE3, SSE4A, 3dNOW!, enhanced 3dNOW!, ABM and 64-bit +instruction set extensions.) @item k8-sse3, opteron-sse3, athlon64-sse3 Improved versions of k8, opteron and athlon64 with SSE3 instruction set support. @item winchip-c6 @@ -9355,8 +9359,14 @@ preferred alignment to @option{-mpreferred-stack-boundary=2}. @itemx -mno-sse3 @item -mssse3 @itemx -mno-ssse3 +@item -msse4a +@item -mno-sse4a @item -m3dnow @itemx -mno-3dnow +@item -mpopcnt +@itemx -mno-popcnt +@item -mabm +@itemx -mno-abm @opindex mmmx @opindex mno-mmx @opindex msse @@ -9364,7 +9374,7 @@ preferred alignment to @option{-mpreferred-stack-boundary=2}. @opindex m3dnow @opindex mno-3dnow These switches enable or disable the use of instructions in the MMX, -SSE, SSE2, SSE3, SSSE3 or 3DNow! extended instruction sets. +SSE, SSE2, SSE3, SSSE3, SSE4A, ABM or 3DNow! extended instruction sets. These extensions are also available as built-in functions: see @ref{X86 Built-in Functions}, for details of the functions enabled and disabled by these switches. diff --git a/contrib/gcc/fold-const.c b/contrib/gcc/fold-const.c index 37d99f8..1aadd30 100644 --- a/contrib/gcc/fold-const.c +++ b/contrib/gcc/fold-const.c @@ -2802,9 +2802,13 @@ operand_equal_p (tree arg0, tree arg1, unsigned int flags) case ARRAY_REF: case ARRAY_RANGE_REF: - /* Operands 2 and 3 may be null. */ + /* Operands 2 and 3 may be null. + Compare the array index by value if it is constant first as we + may have different types but same value here. */ return (OP_SAME (0) - && OP_SAME (1) + && (tree_int_cst_equal (TREE_OPERAND (arg0, 1), + TREE_OPERAND (arg1, 1)) + || OP_SAME (1)) && OP_SAME_WITH_NULL (2) && OP_SAME_WITH_NULL (3)); diff --git a/contrib/gcc/gimplify.c b/contrib/gcc/gimplify.c index 80dcd1a..7efae38 100644 --- a/contrib/gcc/gimplify.c +++ b/contrib/gcc/gimplify.c @@ -1600,9 +1600,7 @@ canonicalize_addr_expr (tree *expr_p) /* All checks succeeded. Build a new node to merge the cast. */ *expr_p = build4 (ARRAY_REF, dctype, obj_expr, TYPE_MIN_VALUE (TYPE_DOMAIN (datype)), - TYPE_MIN_VALUE (TYPE_DOMAIN (datype)), - size_binop (EXACT_DIV_EXPR, TYPE_SIZE_UNIT (dctype), - size_int (TYPE_ALIGN_UNIT (dctype)))); + NULL_TREE, NULL_TREE); *expr_p = build1 (ADDR_EXPR, ctype, *expr_p); } diff --git a/contrib/gcc/tree-ssa-ccp.c b/contrib/gcc/tree-ssa-ccp.c index 6e74f35..e64d80b 100644 --- a/contrib/gcc/tree-ssa-ccp.c +++ b/contrib/gcc/tree-ssa-ccp.c @@ -1621,9 +1621,7 @@ maybe_fold_offset_to_array_ref (tree base, tree offset, tree orig_type) if (!integer_zerop (elt_offset)) idx = int_const_binop (PLUS_EXPR, idx, elt_offset, 0); - return build4 (ARRAY_REF, orig_type, base, idx, min_idx, - size_int (tree_low_cst (elt_size, 1) - / (TYPE_ALIGN_UNIT (elt_type)))); + return build4 (ARRAY_REF, orig_type, base, idx, NULL_TREE, NULL_TREE); } diff --git a/contrib/gcc/tree-ssa-pre.c b/contrib/gcc/tree-ssa-pre.c index 9c7b89f..ba32b3c 100644 --- a/contrib/gcc/tree-ssa-pre.c +++ b/contrib/gcc/tree-ssa-pre.c @@ -1076,6 +1076,7 @@ phi_translate (tree expr, value_set_t set, basic_block pred, tree newexpr; tree vh = get_value_handle (expr); bool listchanged = false; + bool invariantarg = false; VEC (tree, gc) *vuses = VALUE_HANDLE_VUSES (vh); VEC (tree, gc) *tvuses; @@ -1134,10 +1135,26 @@ phi_translate (tree expr, value_set_t set, basic_block pred, if (newval != oldval) { listchanged = true; + invariantarg |= is_gimple_min_invariant (newval); TREE_VALUE (newwalker) = get_value_handle (newval); } } } + + /* In case of new invariant args we might try to fold the call + again. */ + if (invariantarg) + { + tree tmp = fold_ternary (CALL_EXPR, TREE_TYPE (expr), + newop0, newarglist, newop2); + if (tmp) + { + STRIP_TYPE_NOPS (tmp); + if (is_gimple_min_invariant (tmp)) + return tmp; + } + } + if (listchanged) vn_lookup_or_add (newarglist, NULL); -- cgit v1.1