From 21efb4d4640a4f96bcbe9244ad33580d80c582c7 Mon Sep 17 00:00:00 2001 From: Harsha Jagasia Date: Mon, 5 Feb 2007 23:33:54 +0000 Subject: [PATCH] amdfam10 From-SVN: r121625 --- gcc/ChangeLog | 154 +++++++ gcc/config.gcc | 20 +- gcc/config/i386/ammintrin.h | 73 +++ gcc/config/i386/athlon.md | 378 +++++++++++++-- gcc/config/i386/emmintrin.h | 8 +- gcc/config/i386/i386.c | 431 ++++++++++++++++-- gcc/config/i386/i386.h | 17 +- gcc/config/i386/i386.md | 331 ++++++++++++-- gcc/config/i386/i386.opt | 18 +- gcc/config/i386/pmmintrin.h | 8 +- gcc/config/i386/sse.md | 131 +++++- gcc/config/i386/tmmintrin.h | 8 +- gcc/doc/extend.texi | 17 + gcc/doc/invoke.texi | 14 +- gcc/testsuite/ChangeLog | 9 + gcc/testsuite/gcc.dg/i386-cpuid.h | 59 +++ gcc/testsuite/gcc.target/i386/sse4a-extract.c | 100 ++++ gcc/testsuite/gcc.target/i386/sse4a-insert.c | 110 +++++ gcc/testsuite/gcc.target/i386/sse4a-montsd.c | 64 +++ gcc/testsuite/gcc.target/i386/sse4a-montss.c | 64 +++ 20 files changed, 1865 insertions(+), 149 deletions(-) create mode 100644 gcc/config/i386/ammintrin.h create mode 100644 gcc/testsuite/gcc.target/i386/sse4a-extract.c create mode 100644 gcc/testsuite/gcc.target/i386/sse4a-insert.c create mode 100644 gcc/testsuite/gcc.target/i386/sse4a-montsd.c create mode 100644 gcc/testsuite/gcc.target/i386/sse4a-montss.c diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 148de8f7038..8cf29789f67 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,157 @@ +2007-02-05 Harsha Jagasia + + * config/i386/athlon.md (athlon_fldxf_k8, athlon_fld_k8, + athlon_fstxf_k8, athlon_fst_k8, athlon_fist, athlon_fmov, + athlon_fadd_load, athlon_fadd_load_k8, athlon_fadd, athlon_fmul, + athlon_fmul_load, athlon_fmul_load_k8, athlon_fsgn, + athlon_fdiv_load, athlon_fdiv_load_k8, athlon_fdiv_k8, + athlon_fpspc_load, athlon_fpspc, athlon_fcmov_load, + athlon_fcmov_load_k8, athlon_fcmov_k8, athlon_fcomi_load_k8, + athlon_fcomi, athlon_fcom_load_k8, athlon_fcom): Added amdfam10. + +2007-02-05 Harsha Jagasia + + * config/i386/i386.md (x86_sahf_1, cmpfp_i_mixed, cmpfp_i_sse, + cmpfp_i_i387, cmpfp_iu_mixed, cmpfp_iu_sse, cmpfp_iu_387, + swapsi, swaphi_1, swapqi_1, swapdi_rex64, fix_truncsfdi_sse, + fix_truncdfdi_sse, fix_truncsfsi_sse, fix_truncdfsi_sse, + x86_fldcw_1, floatsisf2_mixed, floatsisf2_sse, floatdisf2_mixed, + floatdisf2_sse, floatsidf2_mixed, floatsidf2_sse, + floatdidf2_mixed, floatdidf2_sse, muldi3_1_rex64, mulsi3_1, + mulsi3_1_zext, mulhi3_1, mulqi3_1, umulqihi3_1, mulqihi3_insn, + umulditi3_insn, umulsidi3_insn, mulditi3_insn, mulsidi3_insn, + umuldi3_highpart_rex64, umulsi3_highpart_insn, + umulsi3_highpart_zext, smuldi3_highpart_rex64, + smulsi3_highpart_insn, smulsi3_highpart_zext, x86_64_shld, + x86_shld_1, x86_64_shrd, sqrtsf2_mixed, sqrtsf2_sse, + sqrtsf2_i387, sqrtdf2_mixed, sqrtdf2_sse, sqrtdf2_i387, + sqrtextendsfdf2_i387, sqrtxf2, sqrtextendsfxf2_i387, + sqrtextenddfxf2_i387): Added amdfam10_decode. + + * config/i386/athlon.md (athlon_idirect_amdfam10, + athlon_ivector_amdfam10, athlon_idirect_load_amdfam10, + athlon_ivector_load_amdfam10, athlon_idirect_both_amdfam10, + athlon_ivector_both_amdfam10, athlon_idirect_store_amdfam10, + athlon_ivector_store_amdfam10): New define_insn_reservation. + (athlon_idirect_loadmov, athlon_idirect_movstore): Added + amdfam10. + +2007-02-05 Harsha Jagasia + + * config/i386/athlon.md (athlon_call_amdfam10, + athlon_pop_amdfam10, athlon_lea_amdfam10): New + define_insn_reservation. + (athlon_branch, athlon_push, athlon_leave_k8, athlon_imul_k8, + athlon_imul_k8_DI, athlon_imul_mem_k8, athlon_imul_mem_k8_DI, + athlon_idiv, athlon_idiv_mem, athlon_str): Added amdfam10. + +2007-02-05 Harsha Jagasia + + * config/i386/athlon.md (athlon_sseld_amdfam10, + athlon_mmxld_amdfam10, athlon_ssest_amdfam10, + athlon_mmxssest_short_amdfam10): New define_insn_reservation. + +2007-02-05 Harsha Jagasia + + * config/i386/athlon.md (athlon_sseins_amdfam10): New + define_insn_reservation. + * config/i386/i386.md (sseins): Added sseins to define_attr type + and define_attr unit. + * config/i386/sse.md: Set type attribute to sseins for insertq + and insertqi. + +2007-02-05 Harsha Jagasia + + * config/i386/athlon.md (sselog_load_amdfam10, sselog_amdfam10, + ssecmpvector_load_amdfam10, ssecmpvector_amdfam10, + ssecomi_load_amdfam10, ssecomi_amdfam10, + sseaddvector_load_amdfam10, sseaddvector_amdfam10): New + define_insn_reservation. + (ssecmp_load_k8, ssecmp, sseadd_load_k8, seadd): Added amdfam10. + +2007-02-05 Harsha Jagasia + + * config/i386/athlon.md (cvtss2sd_load_amdfam10, + cvtss2sd_amdfam10, cvtps2pd_load_amdfam10, cvtps2pd_amdfam10, + cvtsi2sd_load_amdfam10, cvtsi2ss_load_amdfam10, + cvtsi2sd_amdfam10, cvtsi2ss_amdfam10, cvtsd2ss_load_amdfam10, + cvtsd2ss_amdfam10, cvtpd2ps_load_amdfam10, cvtpd2ps_amdfam10, + cvtsX2si_load_amdfam10, cvtsX2si_amdfam10): New + define_insn_reservation. + + * config/i386/sse.md (cvtsi2ss, cvtsi2ssq, cvtss2si, + cvtss2siq, cvttss2si, cvttss2siq, cvtsi2sd, cvtsi2sdq, + cvtsd2si, cvtsd2siq, cvttsd2si, cvttsd2siq, + cvtpd2dq, cvttpd2dq, cvtsd2ss, cvtss2sd, + cvtpd2ps, cvtps2pd): Added amdfam10_decode attribute. + +2007-02-05 Harsha Jagasia + + * config/i386/athlon.md (athlon_ssedivvector_amdfam10, + athlon_ssedivvector_load_amdfam10, athlon_ssemulvector_amdfam10, + athlon_ssemulvector_load_amdfam10): New define_insn_reservation. + (athlon_ssediv, athlon_ssediv_load_k8, athlon_ssemul, + athlon_ssemul_load_k8): Added amdfam10. + +2007-02-05 Harsha Jagasia + + * config/i386/i386.h (TARGET_SSE_UNALIGNED_MOVE_OPTIMAL): New macro. + (x86_sse_unaligned_move_optimal): New variable. + + * config/i386/i386.c (x86_sse_unaligned_move_optimal): Enable for + m_AMDFAM10. + (ix86_expand_vector_move_misalign): Add code to generate movupd/movups + for unaligned vector SSE double/single precision loads for AMDFAM10. + +2007-02-05 Harsha Jagasia + + * config/i386/i386.h (TARGET_AMDFAM10): New macro. + (TARGET_CPU_CPP_BUILTINS): Add code for amdfam10. + Define TARGET_CPU_DEFAULT_amdfam10. + (TARGET_CPU_DEFAULT_NAMES): Add amdfam10. + (processor_type): Add PROCESSOR_AMDFAM10. + + * config/i386/i386.md: Add amdfam10 as a new cpu attribute to match + processor_type in config/i386/i386.h. + Enable imul peepholes for TARGET_AMDFAM10. + + * config.gcc: Add support for --with-cpu option for amdfam10. + + * config/i386/i386.c (amdfam10_cost): New variable. + (m_AMDFAM10): New macro. + (m_ATHLON_K8_AMDFAM10): New macro. + (x86_use_leave, x86_push_memory, x86_movx, x86_unroll_strlen, + x86_cmove, x86_3dnow_a, x86_deep_branch, x86_use_simode_fiop, + x86_promote_QImode, x86_integer_DFmode_moves, + x86_partial_reg_dependency, x86_memory_mismatch_stall, + x86_accumulate_outgoing_args, x86_arch_always_fancy_math_387, + x86_sse_partial_reg_dependency, x86_sse_typeless_stores, + x86_use_ffreep, x86_use_incdec, x86_four_jump_limit, + x86_schedule, x86_use_bt, x86_cmpxchg16b, x86_pad_returns): + Enable/disable for amdfam10. + (override_options): Add amdfam10_cost to processor_target_table. + Set up PROCESSOR_AMDFAM10 for amdfam10 entry in + processor_alias_table. + (ix86_issue_rate): Add PROCESSOR_AMDFAM10. + (ix86_adjust_cost): Add code for amdfam10. + +2007-02-05 Harsha Jagasia + + * config/i386/i386.opt: Add new Advanced Bit Manipulation (-mabm) + instruction set feature flag. Add new (-mpopcnt) flag for popcnt + instruction. Add new SSE4A (-msse4a) instruction set feature flag. + * config/i386/i386.h: Add builtin definition for SSE4A. + * config/i386/i386.md: Add support for ABM instructions + (popcnt and lzcnt). + * config/i386/sse.md: Add support for SSE4A instructions + (movntss, movntsd, extrq, insertq). + * config/i386/i386.c: Add support for ABM and SSE4A builtins. + Add -march=amdfam10 flag. + * config/i386/ammintrin.h: Add support for SSE4A intrinsics. + * doc/invoke.texi: Add documentation on flags for sse4a, abm, popcnt + and amdfam10. + * doc/extend.texi: Add documentation for SSE4A builtins. + 2007-02-05 Bob Wilson * config/xtensa/xtensa.c (constantpool_mem_p): Skip over SUBREGs. diff --git a/gcc/config.gcc b/gcc/config.gcc index 0fcaa8f9d0c..17f429c9f2d 100644 --- a/gcc/config.gcc +++ b/gcc/config.gcc @@ -272,12 +272,12 @@ xscale-*-*) i[34567]86-*-*) cpu_type=i386 extra_headers="mmintrin.h mm3dnow.h xmmintrin.h emmintrin.h - pmmintrin.h tmmintrin.h" + pmmintrin.h tmmintrin.h ammintrin.h" ;; x86_64-*-*) cpu_type=i386 extra_headers="mmintrin.h mm3dnow.h xmmintrin.h emmintrin.h - pmmintrin.h tmmintrin.h" + pmmintrin.h tmmintrin.h ammintrin.h" need_64bit_hwint=yes ;; ia64-*-*) @@ -1111,14 +1111,14 @@ i[34567]86-*-linux* | i[34567]86-*-kfreebsd*-gnu | i[34567]86-*-knetbsd*-gnu) # FIXME: -m64 for i[34567]86-*-* should be allowed just # like -m32 for x86_64-*-*. case X"${with_cpu}" in - Xgeneric|Xcore2|Xnocona|Xx86-64|Xk8|Xopteron|Xathlon64|Xathlon-fx) + Xgeneric|Xcore2|Xnocona|Xx86-64|Xamdfam10|Xk8|Xopteron|Xathlon64|Xathlon-fx) ;; X) with_cpu=generic ;; *) echo "Unsupported CPU used in --with-cpu=$with_cpu, supported values:" 1>&2 - echo "generic core2 nocona x86-64 k8 opteron athlon64 athlon-fx" 1>&2 + echo "generic core2 nocona x86-64 amdfam10 k8 opteron athlon64 athlon-fx" 1>&2 exit 1 ;; esac @@ -1240,14 +1240,14 @@ i[34567]86-*-solaris2*) # FIXME: -m64 for i[34567]86-*-* should be allowed just # like -m32 for x86_64-*-*. case X"${with_cpu}" in - Xgeneric|Xcore2|Xnocona|Xx86-64|Xk8|Xopteron|Xathlon64|Xathlon-fx) + Xgeneric|Xcore2|Xnocona|Xx86-64|Xamdfam10|Xk8|Xopteron|Xathlon64|Xathlon-fx) ;; X) with_cpu=generic ;; *) echo "Unsupported CPU used in --with-cpu=$with_cpu, supported values:" 1>&2 - echo "generic core2 nocona x86-64 k8 opteron athlon64 athlon-fx" 1>&2 + echo "generic core2 nocona x86-64 amdfam10 k8 opteron athlon64 athlon-fx" 1>&2 exit 1 ;; esac @@ -2568,6 +2568,9 @@ if test x$with_cpu = x ; then ;; i686-*-* | i786-*-*) case ${target_noncanonical} in + amdfam10-*) + with_cpu=amdfam10 + ;; k8-*|opteron-*|athlon_64-*) with_cpu=k8 ;; @@ -2611,6 +2614,9 @@ if test x$with_cpu = x ; then ;; x86_64-*-*) case ${target_noncanonical} in + amdfam10-*) + with_cpu=amdfam10 + ;; k8-*|opteron-*|athlon_64-*) with_cpu=k8 ;; @@ -2874,7 +2880,7 @@ case "${target}" in esac # OK ;; - "" | k8 | opteron | athlon64 | athlon-fx | nocona | core2 | generic) + "" | amdfam10 | k8 | opteron | athlon64 | athlon-fx | nocona | core2 | generic) # OK ;; *) diff --git a/gcc/config/i386/ammintrin.h b/gcc/config/i386/ammintrin.h new file mode 100644 index 00000000000..869c2880e25 --- /dev/null +++ b/gcc/config/i386/ammintrin.h @@ -0,0 +1,73 @@ +/* Copyright (C) 2007 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with GCC; see the file COPYING. If not, write to + the Free Software Foundation, 51 Franklin Street, Fifth Floor, + Boston, MA 02110-1301, USA. */ + +/* As a special exception, if you include this header file into source + files compiled by GCC, this header file does not by itself cause + the resulting executable to be covered by the GNU General Public + License. This exception does not however invalidate any other + reasons why the executable file might be covered by the GNU General + Public License. */ + +/* Implemented from the specification included in the AMD Programmers + Manual Update, version 2.x */ + +#ifndef _AMMINTRIN_H_INCLUDED +#define _AMMINTRIN_H_INCLUDED + +#ifndef __SSE4A__ +# error "SSE4A instruction set not enabled" +#else + +/* We need definitions from the SSE3, SSE2 and SSE header files*/ +#include + +static __inline void __attribute__((__always_inline__)) +_mm_stream_sd (double * __P, __m128d __Y) +{ + __builtin_ia32_movntsd (__P, (__v2df) __Y); +} + +static __inline void __attribute__((__always_inline__)) +_mm_stream_ss (float * __P, __m128 __Y) +{ + __builtin_ia32_movntss (__P, (__v4sf) __Y); +} + +static __inline __m128i __attribute__((__always_inline__)) +_mm_extract_si64 (__m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_extrq ((__v2di) __X, (__v16qi) __Y); +} + +#define _mm_extracti_si64(X, I, L) \ +((__m128i) __builtin_ia32_extrqi ((__v2di)(X), I, L)) + +static __inline __m128i __attribute__((__always_inline__)) +_mm_insert_si64 (__m128i __X,__m128i __Y) +{ + return (__m128i) __builtin_ia32_insertq ((__v2di)__X, (__v2di)__Y); +} + +#define _mm_inserti_si64(X, Y, I, L) \ +((__m128i) __builtin_ia32_insertqi ((__v2di)(X), (__v2di)(Y), I, L)) + + +#endif /* __SSE4A__ */ + +#endif /* _AMMINTRIN_H_INCLUDED */ diff --git a/gcc/config/i386/athlon.md b/gcc/config/i386/athlon.md index 21d460d677e..708507f4224 100644 --- a/gcc/config/i386/athlon.md +++ b/gcc/config/i386/athlon.md @@ -29,6 +29,8 @@ (const_string "vector")] (const_string "direct"))) +(define_attr "amdfam10_decode" "direct,vector,double" + (const_string "direct")) ;; ;; decode0 decode1 decode2 ;; \ | / @@ -131,18 +133,22 @@ ;; Jump instructions are executed in the branch unit completely transparent to us (define_insn_reservation "athlon_branch" 0 - (and (eq_attr "cpu" "athlon,k8,generic64") + (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10") (eq_attr "type" "ibr")) "athlon-direct,athlon-ieu") (define_insn_reservation "athlon_call" 0 (and (eq_attr "cpu" "athlon,k8,generic64") (eq_attr "type" "call,callv")) "athlon-vector,athlon-ieu") +(define_insn_reservation "athlon_call_amdfam10" 0 + (and (eq_attr "cpu" "amdfam10") + (eq_attr "type" "call,callv")) + "athlon-double,athlon-ieu") ;; Latency of push operation is 3 cycles, but ESP value is available ;; earlier (define_insn_reservation "athlon_push" 2 - (and (eq_attr "cpu" "athlon,k8,generic64") + (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10") (eq_attr "type" "push")) "athlon-direct,athlon-agu,athlon-store") (define_insn_reservation "athlon_pop" 4 @@ -153,12 +159,16 @@ (and (eq_attr "cpu" "k8,generic64") (eq_attr "type" "pop")) "athlon-double,(athlon-ieu+athlon-load)") +(define_insn_reservation "athlon_pop_amdfam10" 3 + (and (eq_attr "cpu" "amdfam10") + (eq_attr "type" "pop")) + "athlon-direct,(athlon-ieu+athlon-load)") (define_insn_reservation "athlon_leave" 3 (and (eq_attr "cpu" "athlon") (eq_attr "type" "leave")) "athlon-vector,(athlon-ieu+athlon-load)") (define_insn_reservation "athlon_leave_k8" 3 - (and (eq_attr "cpu" "k8,generic64") + (and (eq_attr "cpu" "k8,generic64,amdfam10") (eq_attr "type" "leave")) "athlon-double,(athlon-ieu+athlon-load)") @@ -167,6 +177,11 @@ (and (eq_attr "cpu" "athlon,k8,generic64") (eq_attr "type" "lea")) "athlon-direct,athlon-agu,nothing") +;; Lea executes in AGU unit with 1 cycle latency on AMDFAM10 +(define_insn_reservation "athlon_lea_amdfam10" 1 + (and (eq_attr "cpu" "amdfam10") + (eq_attr "type" "lea")) + "athlon-direct,athlon-agu,nothing") ;; Mul executes in special multiplier unit attached to IEU0 (define_insn_reservation "athlon_imul" 5 @@ -176,29 +191,35 @@ "athlon-vector,athlon-ieu0,athlon-mult,nothing,nothing,athlon-ieu0") ;; ??? Widening multiply is vector or double. (define_insn_reservation "athlon_imul_k8_DI" 4 - (and (eq_attr "cpu" "k8,generic64") + (and (eq_attr "cpu" "k8,generic64,amdfam10") (and (eq_attr "type" "imul") (and (eq_attr "mode" "DI") (eq_attr "memory" "none,unknown")))) "athlon-direct0,athlon-ieu0,athlon-mult,nothing,athlon-ieu0") (define_insn_reservation "athlon_imul_k8" 3 - (and (eq_attr "cpu" "k8,generic64") + (and (eq_attr "cpu" "k8,generic64,amdfam10") (and (eq_attr "type" "imul") (eq_attr "memory" "none,unknown"))) "athlon-direct0,athlon-ieu0,athlon-mult,athlon-ieu0") +(define_insn_reservation "athlon_imul_amdfam10_HI" 4 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "imul") + (and (eq_attr "mode" "HI") + (eq_attr "memory" "none,unknown")))) + "athlon-vector,athlon-ieu0,athlon-mult,nothing,athlon-ieu0") (define_insn_reservation "athlon_imul_mem" 8 (and (eq_attr "cpu" "athlon") (and (eq_attr "type" "imul") (eq_attr "memory" "load,both"))) "athlon-vector,athlon-load,athlon-ieu,athlon-mult,nothing,nothing,athlon-ieu") (define_insn_reservation "athlon_imul_mem_k8_DI" 7 - (and (eq_attr "cpu" "k8,generic64") + (and (eq_attr "cpu" "k8,generic64,amdfam10") (and (eq_attr "type" "imul") (and (eq_attr "mode" "DI") (eq_attr "memory" "load,both")))) "athlon-vector,athlon-load,athlon-ieu,athlon-mult,nothing,athlon-ieu") (define_insn_reservation "athlon_imul_mem_k8" 6 - (and (eq_attr "cpu" "k8,generic64") + (and (eq_attr "cpu" "k8,generic64,amdfam10") (and (eq_attr "type" "imul") (eq_attr "memory" "load,both"))) "athlon-vector,athlon-load,athlon-ieu,athlon-mult,athlon-ieu") @@ -209,21 +230,23 @@ ;; other instructions. ;; ??? Experiments show that the idiv can overlap with roughly 6 cycles ;; of the other code +;; Using the same heuristics for amdfam10 as K8 with idiv (define_insn_reservation "athlon_idiv" 6 - (and (eq_attr "cpu" "athlon,k8,generic64") + (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10") (and (eq_attr "type" "idiv") (eq_attr "memory" "none,unknown"))) "athlon-vector,(athlon-ieu0*6+(athlon-fpsched,athlon-fvector))") (define_insn_reservation "athlon_idiv_mem" 9 - (and (eq_attr "cpu" "athlon,k8,generic64") + (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10") (and (eq_attr "type" "idiv") (eq_attr "memory" "load,both"))) "athlon-vector,((athlon-load,athlon-ieu0*6)+(athlon-fpsched,athlon-fvector))") ;; The parallelism of string instructions is not documented. Model it same way ;; as idiv to create smaller automata. This probably does not matter much. +;; Using the same heuristics for amdfam10 as K8 with idiv (define_insn_reservation "athlon_str" 6 - (and (eq_attr "cpu" "athlon,k8,generic64") + (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10") (and (eq_attr "type" "str") (eq_attr "memory" "load,both,store"))) "athlon-vector,athlon-load,athlon-ieu0*6") @@ -234,34 +257,62 @@ (and (eq_attr "unit" "integer,unknown") (eq_attr "memory" "none,unknown")))) "athlon-direct,athlon-ieu") +(define_insn_reservation "athlon_idirect_amdfam10" 1 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "amdfam10_decode" "direct") + (and (eq_attr "unit" "integer,unknown") + (eq_attr "memory" "none,unknown")))) + "athlon-direct,athlon-ieu") (define_insn_reservation "athlon_ivector" 2 (and (eq_attr "cpu" "athlon,k8,generic64") (and (eq_attr "athlon_decode" "vector") (and (eq_attr "unit" "integer,unknown") (eq_attr "memory" "none,unknown")))) "athlon-vector,athlon-ieu,athlon-ieu") +(define_insn_reservation "athlon_ivector_amdfam10" 2 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "amdfam10_decode" "vector") + (and (eq_attr "unit" "integer,unknown") + (eq_attr "memory" "none,unknown")))) + "athlon-vector,athlon-ieu,athlon-ieu") + (define_insn_reservation "athlon_idirect_loadmov" 3 - (and (eq_attr "cpu" "athlon,k8,generic64") + (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10") (and (eq_attr "type" "imov") (eq_attr "memory" "load"))) "athlon-direct,athlon-load") + (define_insn_reservation "athlon_idirect_load" 4 (and (eq_attr "cpu" "athlon,k8,generic64") (and (eq_attr "athlon_decode" "direct") (and (eq_attr "unit" "integer,unknown") (eq_attr "memory" "load")))) "athlon-direct,athlon-load,athlon-ieu") +(define_insn_reservation "athlon_idirect_load_amdfam10" 4 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "amdfam10_decode" "direct") + (and (eq_attr "unit" "integer,unknown") + (eq_attr "memory" "load")))) + "athlon-direct,athlon-load,athlon-ieu") (define_insn_reservation "athlon_ivector_load" 6 (and (eq_attr "cpu" "athlon,k8,generic64") (and (eq_attr "athlon_decode" "vector") (and (eq_attr "unit" "integer,unknown") (eq_attr "memory" "load")))) "athlon-vector,athlon-load,athlon-ieu,athlon-ieu") +(define_insn_reservation "athlon_ivector_load_amdfam10" 6 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "amdfam10_decode" "vector") + (and (eq_attr "unit" "integer,unknown") + (eq_attr "memory" "load")))) + "athlon-vector,athlon-load,athlon-ieu,athlon-ieu") + (define_insn_reservation "athlon_idirect_movstore" 1 - (and (eq_attr "cpu" "athlon,k8,generic64") + (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10") (and (eq_attr "type" "imov") (eq_attr "memory" "store"))) "athlon-direct,athlon-agu,athlon-store") + (define_insn_reservation "athlon_idirect_both" 4 (and (eq_attr "cpu" "athlon,k8,generic64") (and (eq_attr "athlon_decode" "direct") @@ -270,6 +321,15 @@ "athlon-direct,athlon-load, athlon-ieu,athlon-store, athlon-store") +(define_insn_reservation "athlon_idirect_both_amdfam10" 4 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "amdfam10_decode" "direct") + (and (eq_attr "unit" "integer,unknown") + (eq_attr "memory" "both")))) + "athlon-direct,athlon-load, + athlon-ieu,athlon-store, + athlon-store") + (define_insn_reservation "athlon_ivector_both" 6 (and (eq_attr "cpu" "athlon,k8,generic64") (and (eq_attr "athlon_decode" "vector") @@ -279,6 +339,16 @@ athlon-ieu, athlon-ieu, athlon-store") +(define_insn_reservation "athlon_ivector_both_amdfam10" 6 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "amdfam10_decode" "vector") + (and (eq_attr "unit" "integer,unknown") + (eq_attr "memory" "both")))) + "athlon-vector,athlon-load, + athlon-ieu, + athlon-ieu, + athlon-store") + (define_insn_reservation "athlon_idirect_store" 1 (and (eq_attr "cpu" "athlon,k8,generic64") (and (eq_attr "athlon_decode" "direct") @@ -286,6 +356,14 @@ (eq_attr "memory" "store")))) "athlon-direct,(athlon-ieu+athlon-agu), athlon-store") +(define_insn_reservation "athlon_idirect_store_amdfam10" 1 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "amdfam10_decode" "direct") + (and (eq_attr "unit" "integer,unknown") + (eq_attr "memory" "store")))) + "athlon-direct,(athlon-ieu+athlon-agu), + athlon-store") + (define_insn_reservation "athlon_ivector_store" 2 (and (eq_attr "cpu" "athlon,k8,generic64") (and (eq_attr "athlon_decode" "vector") @@ -293,6 +371,13 @@ (eq_attr "memory" "store")))) "athlon-vector,(athlon-ieu+athlon-agu),athlon-ieu, athlon-store") +(define_insn_reservation "athlon_ivector_store_amdfam10" 2 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "amdfam10_decode" "vector") + (and (eq_attr "unit" "integer,unknown") + (eq_attr "memory" "store")))) + "athlon-vector,(athlon-ieu+athlon-agu),athlon-ieu, + athlon-store") ;; Athlon floatin point unit (define_insn_reservation "athlon_fldxf" 12 @@ -302,7 +387,7 @@ (eq_attr "mode" "XF")))) "athlon-vector,athlon-fpload2,athlon-fvector*9") (define_insn_reservation "athlon_fldxf_k8" 13 - (and (eq_attr "cpu" "k8,generic64") + (and (eq_attr "cpu" "k8,generic64,amdfam10") (and (eq_attr "type" "fmov") (and (eq_attr "memory" "load") (eq_attr "mode" "XF")))) @@ -314,7 +399,7 @@ (eq_attr "memory" "load"))) "athlon-direct,athlon-fpload,athlon-fany") (define_insn_reservation "athlon_fld_k8" 2 - (and (eq_attr "cpu" "k8,generic64") + (and (eq_attr "cpu" "k8,generic64,amdfam10") (and (eq_attr "type" "fmov") (eq_attr "memory" "load"))) "athlon-direct,athlon-fploadk8,athlon-fstore") @@ -326,7 +411,7 @@ (eq_attr "mode" "XF")))) "athlon-vector,(athlon-fpsched+athlon-agu),(athlon-store2+(athlon-fvector*7))") (define_insn_reservation "athlon_fstxf_k8" 8 - (and (eq_attr "cpu" "k8,generic64") + (and (eq_attr "cpu" "k8,generic64,amdfam10") (and (eq_attr "type" "fmov") (and (eq_attr "memory" "store,both") (eq_attr "mode" "XF")))) @@ -337,16 +422,16 @@ (eq_attr "memory" "store,both"))) "athlon-direct,(athlon-fpsched+athlon-agu),(athlon-fstore+athlon-store)") (define_insn_reservation "athlon_fst_k8" 2 - (and (eq_attr "cpu" "k8,generic64") + (and (eq_attr "cpu" "k8,generic64,amdfam10") (and (eq_attr "type" "fmov") (eq_attr "memory" "store,both"))) "athlon-direct,(athlon-fpsched+athlon-agu),(athlon-fstore+athlon-store)") (define_insn_reservation "athlon_fist" 4 - (and (eq_attr "cpu" "athlon,k8,generic64") + (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10") (eq_attr "type" "fistp,fisttp")) "athlon-direct,(athlon-fpsched+athlon-agu),(athlon-fstore+athlon-store)") (define_insn_reservation "athlon_fmov" 2 - (and (eq_attr "cpu" "athlon,k8,generic64") + (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10") (eq_attr "type" "fmov")) "athlon-direct,athlon-fpsched,athlon-faddmul") (define_insn_reservation "athlon_fadd_load" 4 @@ -355,12 +440,12 @@ (eq_attr "memory" "load"))) "athlon-direct,athlon-fpload,athlon-fadd") (define_insn_reservation "athlon_fadd_load_k8" 6 - (and (eq_attr "cpu" "k8,generic64") + (and (eq_attr "cpu" "k8,generic64,amdfam10") (and (eq_attr "type" "fop") (eq_attr "memory" "load"))) "athlon-direct,athlon-fploadk8,athlon-fadd") (define_insn_reservation "athlon_fadd" 4 - (and (eq_attr "cpu" "athlon,k8,generic64") + (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10") (eq_attr "type" "fop")) "athlon-direct,athlon-fpsched,athlon-fadd") (define_insn_reservation "athlon_fmul_load" 4 @@ -369,16 +454,16 @@ (eq_attr "memory" "load"))) "athlon-direct,athlon-fpload,athlon-fmul") (define_insn_reservation "athlon_fmul_load_k8" 6 - (and (eq_attr "cpu" "k8,generic64") + (and (eq_attr "cpu" "k8,generic64,amdfam10") (and (eq_attr "type" "fmul") (eq_attr "memory" "load"))) "athlon-direct,athlon-fploadk8,athlon-fmul") (define_insn_reservation "athlon_fmul" 4 - (and (eq_attr "cpu" "athlon,k8,generic64") + (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10") (eq_attr "type" "fmul")) "athlon-direct,athlon-fpsched,athlon-fmul") (define_insn_reservation "athlon_fsgn" 2 - (and (eq_attr "cpu" "athlon,k8,generic64") + (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10") (eq_attr "type" "fsgn")) "athlon-direct,athlon-fpsched,athlon-fmul") (define_insn_reservation "athlon_fdiv_load" 24 @@ -387,7 +472,7 @@ (eq_attr "memory" "load"))) "athlon-direct,athlon-fpload,athlon-fmul") (define_insn_reservation "athlon_fdiv_load_k8" 13 - (and (eq_attr "cpu" "k8,generic64") + (and (eq_attr "cpu" "k8,generic64,amdfam10") (and (eq_attr "type" "fdiv") (eq_attr "memory" "load"))) "athlon-direct,athlon-fploadk8,athlon-fmul") @@ -396,16 +481,16 @@ (eq_attr "type" "fdiv")) "athlon-direct,athlon-fpsched,athlon-fmul") (define_insn_reservation "athlon_fdiv_k8" 11 - (and (eq_attr "cpu" "k8,generic64") + (and (eq_attr "cpu" "k8,generic64,amdfam10") (eq_attr "type" "fdiv")) "athlon-direct,athlon-fpsched,athlon-fmul") (define_insn_reservation "athlon_fpspc_load" 103 - (and (eq_attr "cpu" "athlon,k8,generic64") + (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10") (and (eq_attr "type" "fpspc") (eq_attr "memory" "load"))) "athlon-vector,athlon-fpload,athlon-fvector") (define_insn_reservation "athlon_fpspc" 100 - (and (eq_attr "cpu" "athlon,k8,generic64") + (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10") (eq_attr "type" "fpspc")) "athlon-vector,athlon-fpsched,athlon-fvector") (define_insn_reservation "athlon_fcmov_load" 7 @@ -418,12 +503,12 @@ (eq_attr "type" "fcmov")) "athlon-vector,athlon-fpsched,athlon-fvector") (define_insn_reservation "athlon_fcmov_load_k8" 17 - (and (eq_attr "cpu" "k8,generic64") + (and (eq_attr "cpu" "k8,generic64,amdfam10") (and (eq_attr "type" "fcmov") (eq_attr "memory" "load"))) "athlon-vector,athlon-fploadk8,athlon-fvector") (define_insn_reservation "athlon_fcmov_k8" 15 - (and (eq_attr "cpu" "k8,generic64") + (and (eq_attr "cpu" "k8,generic64,amdfam10") (eq_attr "type" "fcmov")) "athlon-vector,athlon-fpsched,athlon-fvector") ;; fcomi is vector decoded by uses only one pipe. @@ -434,13 +519,13 @@ (eq_attr "memory" "load")))) "athlon-vector,athlon-fpload,athlon-fadd") (define_insn_reservation "athlon_fcomi_load_k8" 5 - (and (eq_attr "cpu" "k8,generic64") + (and (eq_attr "cpu" "k8,generic64,amdfam10") (and (eq_attr "type" "fcmp") (and (eq_attr "athlon_decode" "vector") (eq_attr "memory" "load")))) "athlon-vector,athlon-fploadk8,athlon-fadd") (define_insn_reservation "athlon_fcomi" 3 - (and (eq_attr "cpu" "athlon,k8,generic64") + (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10") (and (eq_attr "athlon_decode" "vector") (eq_attr "type" "fcmp"))) "athlon-vector,athlon-fpsched,athlon-fadd") @@ -450,18 +535,18 @@ (eq_attr "memory" "load"))) "athlon-direct,athlon-fpload,athlon-fadd") (define_insn_reservation "athlon_fcom_load_k8" 4 - (and (eq_attr "cpu" "k8,generic64") + (and (eq_attr "cpu" "k8,generic64,amdfam10") (and (eq_attr "type" "fcmp") (eq_attr "memory" "load"))) "athlon-direct,athlon-fploadk8,athlon-fadd") (define_insn_reservation "athlon_fcom" 2 - (and (eq_attr "cpu" "athlon,k8,generic64") + (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10") (eq_attr "type" "fcmp")) "athlon-direct,athlon-fpsched,athlon-fadd") ;; Never seen by the scheduler because we still don't do post reg-stack ;; scheduling. ;(define_insn_reservation "athlon_fxch" 2 -; (and (eq_attr "cpu" "athlon,k8,generic64") +; (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10") ; (eq_attr "type" "fxch")) ; "athlon-direct,athlon-fpsched,athlon-fany") @@ -516,6 +601,23 @@ (and (eq_attr "type" "mmxmov,ssemov") (eq_attr "memory" "load"))) "athlon-direct,athlon-fploadk8,athlon-fstore") +;; On AMDFAM10 all double, single and integer packed and scalar SSEx data +;; loads generated are direct path, latency of 2 and do not use any FP +;; executions units. No seperate entries for movlpx/movhpx loads, which +;; are direct path, latency of 4 and use the FADD/FMUL FP execution units, +;; as they will not be generated. +(define_insn_reservation "athlon_sseld_amdfam10" 2 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "ssemov") + (eq_attr "memory" "load"))) + "athlon-direct,athlon-fploadk8") +;; On AMDFAM10 MMX data loads generated are direct path, latency of 4 +;; and can use any FP executions units +(define_insn_reservation "athlon_mmxld_amdfam10" 4 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "mmxmov") + (eq_attr "memory" "load"))) + "athlon-direct,athlon-fploadk8, athlon-fany") (define_insn_reservation "athlon_mmxssest" 3 (and (eq_attr "cpu" "k8,generic64") (and (eq_attr "type" "mmxmov,ssemov") @@ -533,6 +635,25 @@ (and (eq_attr "type" "mmxmov,ssemov") (eq_attr "memory" "store,both"))) "athlon-direct,(athlon-fpsched+athlon-agu),(athlon-fstore+athlon-store)") +;; On AMDFAM10 all double, single and integer packed SSEx data stores +;; generated are all double path, latency of 2 and use the FSTORE FP +;; execution unit. No entries seperate for movupx/movdqu, which are +;; vector path, latency of 3 and use the FSTORE*2 FP execution unit, +;; as they will not be generated. +(define_insn_reservation "athlon_ssest_amdfam10" 2 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "ssemov") + (and (eq_attr "mode" "V4SF,V2DF,TI") + (eq_attr "memory" "store,both")))) + "athlon-double,(athlon-fpsched+athlon-agu),((athlon-fstore+athlon-store)*2)") +;; On AMDFAM10 all double, single and integer scalar SSEx and MMX +;; data stores generated are all direct path, latency of 2 and use +;; the FSTORE FP execution unit +(define_insn_reservation "athlon_mmxssest_short_amdfam10" 2 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "mmxmov,ssemov") + (eq_attr "memory" "store,both"))) + "athlon-direct,(athlon-fpsched+athlon-agu),(athlon-fstore+athlon-store)") (define_insn_reservation "athlon_movaps_k8" 2 (and (eq_attr "cpu" "k8,generic64") (and (eq_attr "type" "ssemov") @@ -578,6 +699,11 @@ (and (eq_attr "type" "sselog,sselog1") (eq_attr "memory" "load"))) "athlon-double,athlon-fpload2k8,(athlon-fmul*2)") +(define_insn_reservation "athlon_sselog_load_amdfam10" 4 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "sselog,sselog1") + (eq_attr "memory" "load"))) + "athlon-direct,athlon-fploadk8,(athlon-fadd|athlon-fmul)") (define_insn_reservation "athlon_sselog" 3 (and (eq_attr "cpu" "athlon") (eq_attr "type" "sselog,sselog1")) @@ -586,6 +712,11 @@ (and (eq_attr "cpu" "k8,generic64") (eq_attr "type" "sselog,sselog1")) "athlon-double,athlon-fpsched,athlon-fmul") +(define_insn_reservation "athlon_sselog_amdfam10" 2 + (and (eq_attr "cpu" "amdfam10") + (eq_attr "type" "sselog,sselog1")) + "athlon-direct,athlon-fpsched,(athlon-fadd|athlon-fmul)") + ;; ??? pcmp executes in addmul, probably not worthwhile to bother about that. (define_insn_reservation "athlon_ssecmp_load" 2 (and (eq_attr "cpu" "athlon") @@ -594,13 +725,13 @@ (eq_attr "memory" "load")))) "athlon-direct,athlon-fpload,athlon-fadd") (define_insn_reservation "athlon_ssecmp_load_k8" 4 - (and (eq_attr "cpu" "k8,generic64") + (and (eq_attr "cpu" "k8,generic64,amdfam10") (and (eq_attr "type" "ssecmp") (and (eq_attr "mode" "SF,DF,DI,TI") (eq_attr "memory" "load")))) "athlon-direct,athlon-fploadk8,athlon-fadd") (define_insn_reservation "athlon_ssecmp" 2 - (and (eq_attr "cpu" "athlon,k8,generic64") + (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10") (and (eq_attr "type" "ssecmp") (eq_attr "mode" "SF,DF,DI,TI"))) "athlon-direct,athlon-fpsched,athlon-fadd") @@ -614,6 +745,11 @@ (and (eq_attr "type" "ssecmp") (eq_attr "memory" "load"))) "athlon-double,athlon-fpload2k8,(athlon-fadd*2)") +(define_insn_reservation "athlon_ssecmpvector_load_amdfam10" 4 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "ssecmp") + (eq_attr "memory" "load"))) + "athlon-direct,athlon-fploadk8,athlon-fadd") (define_insn_reservation "athlon_ssecmpvector" 3 (and (eq_attr "cpu" "athlon") (eq_attr "type" "ssecmp")) @@ -622,6 +758,10 @@ (and (eq_attr "cpu" "k8,generic64") (eq_attr "type" "ssecmp")) "athlon-double,athlon-fpsched,(athlon-fadd*2)") +(define_insn_reservation "athlon_ssecmpvector_amdfam10" 2 + (and (eq_attr "cpu" "amdfam10") + (eq_attr "type" "ssecmp")) + "athlon-direct,athlon-fpsched,athlon-fadd") (define_insn_reservation "athlon_ssecomi_load" 4 (and (eq_attr "cpu" "athlon") (and (eq_attr "type" "ssecomi") @@ -632,10 +772,20 @@ (and (eq_attr "type" "ssecomi") (eq_attr "memory" "load"))) "athlon-vector,athlon-fploadk8,athlon-fadd") +(define_insn_reservation "athlon_ssecomi_load_amdfam10" 5 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "ssecomi") + (eq_attr "memory" "load"))) + "athlon-direct,athlon-fploadk8,athlon-fadd") (define_insn_reservation "athlon_ssecomi" 4 (and (eq_attr "cpu" "athlon,k8,generic64") (eq_attr "type" "ssecmp")) "athlon-vector,athlon-fpsched,athlon-fadd") +(define_insn_reservation "athlon_ssecomi_amdfam10" 3 + (and (eq_attr "cpu" "amdfam10") +;; It seems athlon_ssecomi has a bug in the attr_type, fixed for amdfam10 + (eq_attr "type" "ssecomi")) + "athlon-direct,athlon-fpsched,athlon-fadd") (define_insn_reservation "athlon_sseadd_load" 4 (and (eq_attr "cpu" "athlon") (and (eq_attr "type" "sseadd") @@ -643,13 +793,13 @@ (eq_attr "memory" "load")))) "athlon-direct,athlon-fpload,athlon-fadd") (define_insn_reservation "athlon_sseadd_load_k8" 6 - (and (eq_attr "cpu" "k8,generic64") + (and (eq_attr "cpu" "k8,generic64,amdfam10") (and (eq_attr "type" "sseadd") (and (eq_attr "mode" "SF,DF,DI") (eq_attr "memory" "load")))) "athlon-direct,athlon-fploadk8,athlon-fadd") (define_insn_reservation "athlon_sseadd" 4 - (and (eq_attr "cpu" "athlon,k8,generic64") + (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10") (and (eq_attr "type" "sseadd") (eq_attr "mode" "SF,DF,DI"))) "athlon-direct,athlon-fpsched,athlon-fadd") @@ -663,6 +813,11 @@ (and (eq_attr "type" "sseadd") (eq_attr "memory" "load"))) "athlon-double,athlon-fpload2k8,(athlon-fadd*2)") +(define_insn_reservation "athlon_sseaddvector_load_amdfam10" 6 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "sseadd") + (eq_attr "memory" "load"))) + "athlon-direct,athlon-fploadk8,athlon-fadd") (define_insn_reservation "athlon_sseaddvector" 5 (and (eq_attr "cpu" "athlon") (eq_attr "type" "sseadd")) @@ -671,6 +826,10 @@ (and (eq_attr "cpu" "k8,generic64") (eq_attr "type" "sseadd")) "athlon-double,athlon-fpsched,(athlon-fadd*2)") +(define_insn_reservation "athlon_sseaddvector_amdfam10" 4 + (and (eq_attr "cpu" "amdfam10") + (eq_attr "type" "sseadd")) + "athlon-direct,athlon-fpsched,athlon-fadd") ;; Conversions behaves very irregularly and the scheduling is critical here. ;; Take each instruction separately. Assume that the mode is always set to the @@ -684,12 +843,25 @@ (and (eq_attr "mode" "DF") (eq_attr "memory" "load"))))) "athlon-direct,athlon-fploadk8,athlon-fstore") +(define_insn_reservation "athlon_ssecvt_cvtss2sd_load_amdfam10" 7 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "ssecvt") + (and (eq_attr "amdfam10_decode" "double") + (and (eq_attr "mode" "DF") + (eq_attr "memory" "load"))))) + "athlon-double,athlon-fploadk8,(athlon-faddmul+athlon-fstore)") (define_insn_reservation "athlon_ssecvt_cvtss2sd" 2 (and (eq_attr "cpu" "athlon,k8,generic64") (and (eq_attr "type" "ssecvt") (and (eq_attr "athlon_decode" "direct") (eq_attr "mode" "DF")))) "athlon-direct,athlon-fpsched,athlon-fstore") +(define_insn_reservation "athlon_ssecvt_cvtss2sd_amdfam10" 7 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "ssecvt") + (and (eq_attr "amdfam10_decode" "vector") + (eq_attr "mode" "DF")))) + "athlon-vector,athlon-fpsched,athlon-faddmul,(athlon-fstore*2)") ;; cvtps2pd. Model same way the other double decoded FP conversions. (define_insn_reservation "athlon_ssecvt_cvtps2pd_load_k8" 5 (and (eq_attr "cpu" "k8,athlon,generic64") @@ -698,12 +870,25 @@ (and (eq_attr "mode" "V2DF,V4SF,TI") (eq_attr "memory" "load"))))) "athlon-double,athlon-fpload2k8,(athlon-fstore*2)") +(define_insn_reservation "athlon_ssecvt_cvtps2pd_load_amdfam10" 4 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "ssecvt") + (and (eq_attr "amdfam10_decode" "direct") + (and (eq_attr "mode" "V2DF,V4SF,TI") + (eq_attr "memory" "load"))))) + "athlon-direct,athlon-fploadk8,athlon-fstore") (define_insn_reservation "athlon_ssecvt_cvtps2pd_k8" 3 (and (eq_attr "cpu" "k8,athlon,generic64") (and (eq_attr "type" "ssecvt") (and (eq_attr "athlon_decode" "double") (eq_attr "mode" "V2DF,V4SF,TI")))) "athlon-double,athlon-fpsched,athlon-fstore,athlon-fstore") +(define_insn_reservation "athlon_ssecvt_cvtps2pd_amdfam10" 2 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "ssecvt") + (and (eq_attr "amdfam10_decode" "direct") + (eq_attr "mode" "V2DF,V4SF,TI")))) + "athlon-direct,athlon-fpsched,athlon-fstore") ;; cvtsi2sd mem,reg is directpath path (cvtsi2sd reg,reg is doublepath) ;; cvtsi2sd has troughput 1 and is executed in store unit with latency of 6 (define_insn_reservation "athlon_sseicvt_cvtsi2sd_load" 6 @@ -713,6 +898,13 @@ (and (eq_attr "mode" "SF,DF") (eq_attr "memory" "load"))))) "athlon-direct,athlon-fploadk8,athlon-fstore") +(define_insn_reservation "athlon_sseicvt_cvtsi2sd_load_amdfam10" 9 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "sseicvt") + (and (eq_attr "amdfam10_decode" "double") + (and (eq_attr "mode" "SF,DF") + (eq_attr "memory" "load"))))) + "athlon-double,athlon-fploadk8,(athlon-faddmul+athlon-fstore)") ;; cvtsi2ss mem, reg is doublepath (define_insn_reservation "athlon_sseicvt_cvtsi2ss_load" 9 (and (eq_attr "cpu" "athlon") @@ -728,6 +920,13 @@ (and (eq_attr "mode" "SF,DF") (eq_attr "memory" "load"))))) "athlon-double,athlon-fploadk8,(athlon-fstore*2)") +(define_insn_reservation "athlon_sseicvt_cvtsi2ss_load_amdfam10" 9 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "sseicvt") + (and (eq_attr "amdfam10_decode" "double") + (and (eq_attr "mode" "SF,DF") + (eq_attr "memory" "load"))))) + "athlon-double,athlon-fploadk8,(athlon-faddmul+athlon-fstore)") ;; cvtsi2sd reg,reg is double decoded (vector on Athlon) (define_insn_reservation "athlon_sseicvt_cvtsi2sd_k8" 11 (and (eq_attr "cpu" "k8,athlon,generic64") @@ -736,6 +935,13 @@ (and (eq_attr "mode" "SF,DF") (eq_attr "memory" "none"))))) "athlon-double,athlon-fploadk8,athlon-fstore") +(define_insn_reservation "athlon_sseicvt_cvtsi2sd_amdfam10" 14 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "sseicvt") + (and (eq_attr "amdfam10_decode" "vector") + (and (eq_attr "mode" "SF,DF") + (eq_attr "memory" "none"))))) + "athlon-vector,athlon-fploadk8,(athlon-faddmul+athlon-fstore)") ;; cvtsi2ss reg, reg is doublepath (define_insn_reservation "athlon_sseicvt_cvtsi2ss" 14 (and (eq_attr "cpu" "athlon,k8,generic64") @@ -744,6 +950,13 @@ (and (eq_attr "mode" "SF,DF") (eq_attr "memory" "none"))))) "athlon-vector,athlon-fploadk8,(athlon-fvector*2)") +(define_insn_reservation "athlon_sseicvt_cvtsi2ss_amdfam10" 14 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "sseicvt") + (and (eq_attr "amdfam10_decode" "vector") + (and (eq_attr "mode" "SF,DF") + (eq_attr "memory" "none"))))) + "athlon-vector,athlon-fploadk8,(athlon-faddmul+athlon-fstore)") ;; cvtsd2ss mem,reg is doublepath, troughput unknown, latency 9 (define_insn_reservation "athlon_ssecvt_cvtsd2ss_load_k8" 9 (and (eq_attr "cpu" "k8,athlon,generic64") @@ -752,6 +965,13 @@ (and (eq_attr "mode" "SF") (eq_attr "memory" "load"))))) "athlon-double,athlon-fploadk8,(athlon-fstore*3)") +(define_insn_reservation "athlon_ssecvt_cvtsd2ss_load_amdfam10" 9 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "ssecvt") + (and (eq_attr "amdfam10_decode" "double") + (and (eq_attr "mode" "SF") + (eq_attr "memory" "load"))))) + "athlon-double,athlon-fploadk8,(athlon-faddmul+athlon-fstore)") ;; cvtsd2ss reg,reg is vectorpath, troughput unknown, latency 12 (define_insn_reservation "athlon_ssecvt_cvtsd2ss" 12 (and (eq_attr "cpu" "athlon,k8,generic64") @@ -760,6 +980,13 @@ (and (eq_attr "mode" "SF") (eq_attr "memory" "none"))))) "athlon-vector,athlon-fpsched,(athlon-fvector*3)") +(define_insn_reservation "athlon_ssecvt_cvtsd2ss_amdfam10" 8 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "ssecvt") + (and (eq_attr "amdfam10_decode" "vector") + (and (eq_attr "mode" "SF") + (eq_attr "memory" "none"))))) + "athlon-vector,athlon-fpsched,athlon-faddmul,(athlon-fstore*2)") (define_insn_reservation "athlon_ssecvt_cvtpd2ps_load_k8" 8 (and (eq_attr "cpu" "athlon,k8,generic64") (and (eq_attr "type" "ssecvt") @@ -767,6 +994,13 @@ (and (eq_attr "mode" "V4SF,V2DF,TI") (eq_attr "memory" "load"))))) "athlon-double,athlon-fpload2k8,(athlon-fstore*3)") +(define_insn_reservation "athlon_ssecvt_cvtpd2ps_load_amdfam10" 9 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "ssecvt") + (and (eq_attr "amdfam10_decode" "double") + (and (eq_attr "mode" "V4SF,V2DF,TI") + (eq_attr "memory" "load"))))) + "athlon-double,athlon-fploadk8,(athlon-faddmul+athlon-fstore)") ;; cvtpd2ps mem,reg is vectorpath, troughput unknown, latency 10 ;; ??? Why it is fater than cvtsd2ss? (define_insn_reservation "athlon_ssecvt_cvtpd2ps" 8 @@ -776,6 +1010,13 @@ (and (eq_attr "mode" "V4SF,V2DF,TI") (eq_attr "memory" "none"))))) "athlon-vector,athlon-fpsched,athlon-fvector*2") +(define_insn_reservation "athlon_ssecvt_cvtpd2ps_amdfam10" 7 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "ssecvt") + (and (eq_attr "amdfam10_decode" "double") + (and (eq_attr "mode" "V4SF,V2DF,TI") + (eq_attr "memory" "none"))))) + "athlon-double,athlon-fpsched,(athlon-faddmul+athlon-fstore)") ;; cvtsd2si mem,reg is doublepath, troughput 1, latency 9 (define_insn_reservation "athlon_secvt_cvtsX2si_load" 9 (and (eq_attr "cpu" "athlon,k8,generic64") @@ -784,6 +1025,13 @@ (and (eq_attr "mode" "SI,DI") (eq_attr "memory" "load"))))) "athlon-vector,athlon-fploadk8,athlon-fvector") +(define_insn_reservation "athlon_secvt_cvtsX2si_load_amdfam10" 10 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "sseicvt") + (and (eq_attr "amdfam10_decode" "double") + (and (eq_attr "mode" "SI,DI") + (eq_attr "memory" "load"))))) + "athlon-double,athlon-fploadk8,(athlon-fadd+athlon-fstore)") ;; cvtsd2si reg,reg is doublepath, troughput 1, latency 9 (define_insn_reservation "athlon_ssecvt_cvtsX2si" 9 (and (eq_attr "cpu" "athlon") @@ -799,6 +1047,29 @@ (and (eq_attr "mode" "SI,DI") (eq_attr "memory" "none"))))) "athlon-double,athlon-fpsched,athlon-fstore") +(define_insn_reservation "athlon_ssecvt_cvtsX2si_amdfam10" 8 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "sseicvt") + (and (eq_attr "amdfam10_decode" "double") + (and (eq_attr "mode" "SI,DI") + (eq_attr "memory" "none"))))) + "athlon-double,athlon-fpsched,(athlon-fadd+athlon-fstore)") +;; cvtpd2dq reg,mem is doublepath, troughput 1, latency 9 on amdfam10 +(define_insn_reservation "athlon_sseicvt_cvtpd2dq_load_amdfam10" 9 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "sseicvt") + (and (eq_attr "amdfam10_decode" "double") + (and (eq_attr "mode" "TI") + (eq_attr "memory" "load"))))) + "athlon-double,athlon-fploadk8,(athlon-faddmul+athlon-fstore)") +;; cvtpd2dq reg,mem is doublepath, troughput 1, latency 7 on amdfam10 +(define_insn_reservation "athlon_sseicvt_cvtpd2dq_amdfam10" 7 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "sseicvt") + (and (eq_attr "amdfam10_decode" "double") + (and (eq_attr "mode" "TI") + (eq_attr "memory" "none"))))) + "athlon-double,athlon-fpsched,(athlon-faddmul+athlon-fstore)") (define_insn_reservation "athlon_ssemul_load" 4 @@ -808,13 +1079,13 @@ (eq_attr "memory" "load")))) "athlon-direct,athlon-fpload,athlon-fmul") (define_insn_reservation "athlon_ssemul_load_k8" 6 - (and (eq_attr "cpu" "k8,generic64") + (and (eq_attr "cpu" "k8,generic64,amdfam10") (and (eq_attr "type" "ssemul") (and (eq_attr "mode" "SF,DF") (eq_attr "memory" "load")))) "athlon-direct,athlon-fploadk8,athlon-fmul") (define_insn_reservation "athlon_ssemul" 4 - (and (eq_attr "cpu" "athlon,k8,generic64") + (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10") (and (eq_attr "type" "ssemul") (eq_attr "mode" "SF,DF"))) "athlon-direct,athlon-fpsched,athlon-fmul") @@ -828,6 +1099,11 @@ (and (eq_attr "type" "ssemul") (eq_attr "memory" "load"))) "athlon-double,athlon-fpload2k8,(athlon-fmul*2)") +(define_insn_reservation "athlon_ssemulvector_load_amdfam10" 6 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "ssemul") + (eq_attr "memory" "load"))) + "athlon-direct,athlon-fploadk8,athlon-fmul") (define_insn_reservation "athlon_ssemulvector" 5 (and (eq_attr "cpu" "athlon") (eq_attr "type" "ssemul")) @@ -836,6 +1112,10 @@ (and (eq_attr "cpu" "k8,generic64") (eq_attr "type" "ssemul")) "athlon-double,athlon-fpsched,(athlon-fmul*2)") +(define_insn_reservation "athlon_ssemulvector_amdfam10" 4 + (and (eq_attr "cpu" "amdfam10") + (eq_attr "type" "ssemul")) + "athlon-direct,athlon-fpsched,athlon-fmul") ;; divsd timings. divss is faster (define_insn_reservation "athlon_ssediv_load" 20 (and (eq_attr "cpu" "athlon") @@ -844,13 +1124,13 @@ (eq_attr "memory" "load")))) "athlon-direct,athlon-fpload,athlon-fmul*17") (define_insn_reservation "athlon_ssediv_load_k8" 22 - (and (eq_attr "cpu" "k8,generic64") + (and (eq_attr "cpu" "k8,generic64,amdfam10") (and (eq_attr "type" "ssediv") (and (eq_attr "mode" "SF,DF") (eq_attr "memory" "load")))) "athlon-direct,athlon-fploadk8,athlon-fmul*17") (define_insn_reservation "athlon_ssediv" 20 - (and (eq_attr "cpu" "athlon,k8,generic64") + (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10") (and (eq_attr "type" "ssediv") (eq_attr "mode" "SF,DF"))) "athlon-direct,athlon-fpsched,athlon-fmul*17") @@ -864,6 +1144,11 @@ (and (eq_attr "type" "ssediv") (eq_attr "memory" "load"))) "athlon-double,athlon-fpload2k8,athlon-fmul*34") +(define_insn_reservation "athlon_ssedivvector_load_amdfam10" 22 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "ssediv") + (eq_attr "memory" "load"))) + "athlon-direct,athlon-fploadk8,athlon-fmul*17") (define_insn_reservation "athlon_ssedivvector" 39 (and (eq_attr "cpu" "athlon") (eq_attr "type" "ssediv")) @@ -872,3 +1157,12 @@ (and (eq_attr "cpu" "k8,generic64") (eq_attr "type" "ssediv")) "athlon-double,athlon-fmul*34") +(define_insn_reservation "athlon_ssedivvector_amdfam10" 20 + (and (eq_attr "cpu" "amdfam10") + (eq_attr "type" "ssediv")) + "athlon-direct,athlon-fmul*17") +(define_insn_reservation "athlon_sseins_amdfam10" 5 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "sseins") + (eq_attr "mode" "TI"))) + "athlon-vector,athlon-fpsched,athlon-faddmul") diff --git a/gcc/config/i386/emmintrin.h b/gcc/config/i386/emmintrin.h index f867e530a0b..43a62623644 100644 --- a/gcc/config/i386/emmintrin.h +++ b/gcc/config/i386/emmintrin.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2003, 2004, 2005, 2006 Free Software Foundation, Inc. +/* Copyright (C) 2003, 2004, 2005, 2006, 2007 Free Software Foundation, Inc. This file is part of GCC. @@ -30,7 +30,11 @@ #ifndef _EMMINTRIN_H_INCLUDED #define _EMMINTRIN_H_INCLUDED -#ifdef __SSE2__ +#ifndef __SSE2__ +# error "SSE2 instruction set not enabled" +#else + +/* We need definitions from the SSE header files*/ #include /* SSE2 */ diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index ea1284960ad..24ebc440407 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -604,6 +604,80 @@ struct processor_costs k8_cost = { {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}} }; +struct processor_costs amdfam10_cost = { + COSTS_N_INSNS (1), /* cost of an add instruction */ + COSTS_N_INSNS (2), /* cost of a lea instruction */ + COSTS_N_INSNS (1), /* variable shift costs */ + COSTS_N_INSNS (1), /* constant shift costs */ + {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ + COSTS_N_INSNS (4), /* HI */ + COSTS_N_INSNS (3), /* SI */ + COSTS_N_INSNS (4), /* DI */ + COSTS_N_INSNS (5)}, /* other */ + 0, /* cost of multiply per each bit set */ + {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */ + COSTS_N_INSNS (35), /* HI */ + COSTS_N_INSNS (51), /* SI */ + COSTS_N_INSNS (83), /* DI */ + COSTS_N_INSNS (83)}, /* other */ + COSTS_N_INSNS (1), /* cost of movsx */ + COSTS_N_INSNS (1), /* cost of movzx */ + 8, /* "large" insn */ + 9, /* MOVE_RATIO */ + 4, /* cost for loading QImode using movzbl */ + {3, 4, 3}, /* cost of loading integer registers + in QImode, HImode and SImode. + Relative to reg-reg move (2). */ + {3, 4, 3}, /* cost of storing integer registers */ + 4, /* cost of reg,reg fld/fst */ + {4, 4, 12}, /* cost of loading fp registers + in SFmode, DFmode and XFmode */ + {6, 6, 8}, /* cost of storing fp registers + in SFmode, DFmode and XFmode */ + 2, /* cost of moving MMX register */ + {3, 3}, /* cost of loading MMX registers + in SImode and DImode */ + {4, 4}, /* cost of storing MMX registers + in SImode and DImode */ + 2, /* cost of moving SSE register */ + {4, 4, 3}, /* cost of loading SSE registers + in SImode, DImode and TImode */ + {4, 4, 5}, /* cost of storing SSE registers + in SImode, DImode and TImode */ + 3, /* MMX or SSE register to integer */ + /* On K8 + MOVD reg64, xmmreg Double FSTORE 4 + MOVD reg32, xmmreg Double FSTORE 4 + On AMDFAM10 + MOVD reg64, xmmreg Double FADD 3 + 1/1 1/1 + MOVD reg32, xmmreg Double FADD 3 + 1/1 1/1 */ + 64, /* size of prefetch block */ + /* New AMD processors never drop prefetches; if they cannot be performed + immediately, they are queued. We set number of simultaneous prefetches + to a large constant to reflect this (it probably is not a good idea not + to limit number of prefetches at all, as their execution also takes some + time). */ + 100, /* number of parallel prefetches */ + 5, /* Branch cost */ + COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */ + COSTS_N_INSNS (4), /* cost of FMUL instruction. */ + COSTS_N_INSNS (19), /* cost of FDIV instruction. */ + COSTS_N_INSNS (2), /* cost of FABS instruction. */ + COSTS_N_INSNS (2), /* cost of FCHS instruction. */ + COSTS_N_INSNS (35), /* cost of FSQRT instruction. */ + + /* AMDFAM10 has optimized REP instruction for medium sized blocks, but for + very small blocks it is better to use loop. For large blocks, libcall can + do nontemporary accesses and beat inline considerably. */ + {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}}, + {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}}, + {{libcall, {{8, loop}, {24, unrolled_loop}, + {2048, rep_prefix_4_byte}, {-1, libcall}}}, + {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}} +}; + static const struct processor_costs pentium4_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ @@ -917,11 +991,13 @@ const struct processor_costs *ix86_cost = &pentium_cost; #define m_PENT4 (1<SSE reload sequences of fix_trunc?f?i_sse patterns. (define_peephole2 @@ -4488,7 +4513,8 @@ [(set_attr "length" "2") (set_attr "mode" "HI") (set_attr "unit" "i387") - (set_attr "athlon_decode" "vector")]) + (set_attr "athlon_decode" "vector") + (set_attr "amdfam10_decode" "vector")]) ;; Conversion between fixed point and floating point. @@ -4539,6 +4565,7 @@ (set_attr "mode" "SF") (set_attr "unit" "*,i387,*,*") (set_attr "athlon_decode" "*,*,vector,double") + (set_attr "amdfam10_decode" "*,*,vector,double") (set_attr "fp_int_src" "true")]) (define_insn "*floatsisf2_sse" @@ -4549,6 +4576,7 @@ [(set_attr "type" "sseicvt") (set_attr "mode" "SF") (set_attr "athlon_decode" "vector,double") + (set_attr "amdfam10_decode" "vector,double") (set_attr "fp_int_src" "true")]) (define_insn "*floatsisf2_i387" @@ -4582,6 +4610,7 @@ (set_attr "mode" "SF") (set_attr "unit" "*,i387,*,*") (set_attr "athlon_decode" "*,*,vector,double") + (set_attr "amdfam10_decode" "*,*,vector,double") (set_attr "fp_int_src" "true")]) (define_insn "*floatdisf2_sse" @@ -4592,6 +4621,7 @@ [(set_attr "type" "sseicvt") (set_attr "mode" "SF") (set_attr "athlon_decode" "vector,double") + (set_attr "amdfam10_decode" "vector,double") (set_attr "fp_int_src" "true")]) (define_insn "*floatdisf2_i387" @@ -4650,6 +4680,7 @@ (set_attr "mode" "DF") (set_attr "unit" "*,i387,*,*") (set_attr "athlon_decode" "*,*,double,direct") + (set_attr "amdfam10_decode" "*,*,vector,double") (set_attr "fp_int_src" "true")]) (define_insn "*floatsidf2_sse" @@ -4660,6 +4691,7 @@ [(set_attr "type" "sseicvt") (set_attr "mode" "DF") (set_attr "athlon_decode" "double,direct") + (set_attr "amdfam10_decode" "vector,double") (set_attr "fp_int_src" "true")]) (define_insn "*floatsidf2_i387" @@ -4693,6 +4725,7 @@ (set_attr "mode" "DF") (set_attr "unit" "*,i387,*,*") (set_attr "athlon_decode" "*,*,double,direct") + (set_attr "amdfam10_decode" "*,*,vector,double") (set_attr "fp_int_src" "true")]) (define_insn "*floatdidf2_sse" @@ -4703,6 +4736,7 @@ [(set_attr "type" "sseicvt") (set_attr "mode" "DF") (set_attr "athlon_decode" "double,direct") + (set_attr "amdfam10_decode" "vector,double") (set_attr "fp_int_src" "true")]) (define_insn "*floatdidf2_i387" @@ -6910,6 +6944,14 @@ "TARGET_64BIT" "") +;; On AMDFAM10 +;; IMUL reg64, reg64, imm8 Direct +;; IMUL reg64, mem64, imm8 VectorPath +;; IMUL reg64, reg64, imm32 Direct +;; IMUL reg64, mem64, imm32 VectorPath +;; IMUL reg64, reg64 Direct +;; IMUL reg64, mem64 Direct + (define_insn "*muldi3_1_rex64" [(set (match_operand:DI 0 "register_operand" "=r,r,r") (mult:DI (match_operand:DI 1 "nonimmediate_operand" "%rm,rm,0") @@ -6932,6 +6974,11 @@ (match_operand 1 "memory_operand" "")) (const_string "vector")] (const_string "direct"))) + (set (attr "amdfam10_decode") + (cond [(and (eq_attr "alternative" "0,1") + (match_operand 1 "memory_operand" "")) + (const_string "vector")] + (const_string "direct"))) (set_attr "mode" "DI")]) (define_expand "mulsi3" @@ -6942,6 +6989,14 @@ "" "") +;; On AMDFAM10 +;; IMUL reg32, reg32, imm8 Direct +;; IMUL reg32, mem32, imm8 VectorPath +;; IMUL reg32, reg32, imm32 Direct +;; IMUL reg32, mem32, imm32 VectorPath +;; IMUL reg32, reg32 Direct +;; IMUL reg32, mem32 Direct + (define_insn "*mulsi3_1" [(set (match_operand:SI 0 "register_operand" "=r,r,r") (mult:SI (match_operand:SI 1 "nonimmediate_operand" "%rm,rm,0") @@ -6963,6 +7018,11 @@ (match_operand 1 "memory_operand" "")) (const_string "vector")] (const_string "direct"))) + (set (attr "amdfam10_decode") + (cond [(and (eq_attr "alternative" "0,1") + (match_operand 1 "memory_operand" "")) + (const_string "vector")] + (const_string "direct"))) (set_attr "mode" "SI")]) (define_insn "*mulsi3_1_zext" @@ -6988,6 +7048,11 @@ (match_operand 1 "memory_operand" "")) (const_string "vector")] (const_string "direct"))) + (set (attr "amdfam10_decode") + (cond [(and (eq_attr "alternative" "0,1") + (match_operand 1 "memory_operand" "")) + (const_string "vector")] + (const_string "direct"))) (set_attr "mode" "SI")]) (define_expand "mulhi3" @@ -6998,6 +7063,13 @@ "TARGET_HIMODE_MATH" "") +;; On AMDFAM10 +;; IMUL reg16, reg16, imm8 VectorPath +;; IMUL reg16, mem16, imm8 VectorPath +;; IMUL reg16, reg16, imm16 VectorPath +;; IMUL reg16, mem16, imm16 VectorPath +;; IMUL reg16, reg16 Direct +;; IMUL reg16, mem16 Direct (define_insn "*mulhi3_1" [(set (match_operand:HI 0 "register_operand" "=r,r,r") (mult:HI (match_operand:HI 1 "nonimmediate_operand" "%rm,rm,0") @@ -7016,6 +7088,10 @@ (eq_attr "alternative" "1,2") (const_string "vector")] (const_string "direct"))) + (set (attr "amdfam10_decode") + (cond [(eq_attr "alternative" "0,1") + (const_string "vector")] + (const_string "direct"))) (set_attr "mode" "HI")]) (define_expand "mulqi3" @@ -7026,6 +7102,10 @@ "TARGET_QIMODE_MATH" "") +;;On AMDFAM10 +;; MUL reg8 Direct +;; MUL mem8 Direct + (define_insn "*mulqi3_1" [(set (match_operand:QI 0 "register_operand" "=a") (mult:QI (match_operand:QI 1 "nonimmediate_operand" "%0") @@ -7040,6 +7120,7 @@ (if_then_else (eq_attr "cpu" "athlon") (const_string "vector") (const_string "direct"))) + (set_attr "amdfam10_decode" "direct") (set_attr "mode" "QI")]) (define_expand "umulqihi3" @@ -7066,6 +7147,7 @@ (if_then_else (eq_attr "cpu" "athlon") (const_string "vector") (const_string "direct"))) + (set_attr "amdfam10_decode" "direct") (set_attr "mode" "QI")]) (define_expand "mulqihi3" @@ -7090,6 +7172,7 @@ (if_then_else (eq_attr "cpu" "athlon") (const_string "vector") (const_string "direct"))) + (set_attr "amdfam10_decode" "direct") (set_attr "mode" "QI")]) (define_expand "umulditi3" @@ -7116,6 +7199,7 @@ (if_then_else (eq_attr "cpu" "athlon") (const_string "vector") (const_string "double"))) + (set_attr "amdfam10_decode" "double") (set_attr "mode" "DI")]) ;; We can't use this pattern in 64bit mode, since it results in two separate 32bit registers @@ -7143,6 +7227,7 @@ (if_then_else (eq_attr "cpu" "athlon") (const_string "vector") (const_string "double"))) + (set_attr "amdfam10_decode" "double") (set_attr "mode" "SI")]) (define_expand "mulditi3" @@ -7169,6 +7254,7 @@ (if_then_else (eq_attr "cpu" "athlon") (const_string "vector") (const_string "double"))) + (set_attr "amdfam10_decode" "double") (set_attr "mode" "DI")]) (define_expand "mulsidi3" @@ -7195,6 +7281,7 @@ (if_then_else (eq_attr "cpu" "athlon") (const_string "vector") (const_string "double"))) + (set_attr "amdfam10_decode" "double") (set_attr "mode" "SI")]) (define_expand "umuldi3_highpart" @@ -7231,6 +7318,7 @@ (if_then_else (eq_attr "cpu" "athlon") (const_string "vector") (const_string "double"))) + (set_attr "amdfam10_decode" "double") (set_attr "mode" "DI")]) (define_expand "umulsi3_highpart" @@ -7266,6 +7354,7 @@ (if_then_else (eq_attr "cpu" "athlon") (const_string "vector") (const_string "double"))) + (set_attr "amdfam10_decode" "double") (set_attr "mode" "SI")]) (define_insn "*umulsi3_highpart_zext" @@ -7288,6 +7377,7 @@ (if_then_else (eq_attr "cpu" "athlon") (const_string "vector") (const_string "double"))) + (set_attr "amdfam10_decode" "double") (set_attr "mode" "SI")]) (define_expand "smuldi3_highpart" @@ -7323,6 +7413,7 @@ (if_then_else (eq_attr "cpu" "athlon") (const_string "vector") (const_string "double"))) + (set_attr "amdfam10_decode" "double") (set_attr "mode" "DI")]) (define_expand "smulsi3_highpart" @@ -7357,6 +7448,7 @@ (if_then_else (eq_attr "cpu" "athlon") (const_string "vector") (const_string "double"))) + (set_attr "amdfam10_decode" "double") (set_attr "mode" "SI")]) (define_insn "*smulsi3_highpart_zext" @@ -7378,6 +7470,7 @@ (if_then_else (eq_attr "cpu" "athlon") (const_string "vector") (const_string "double"))) + (set_attr "amdfam10_decode" "double") (set_attr "mode" "SI")]) ;; The patterns that match these are at the end of this file. @@ -10359,7 +10452,8 @@ [(set_attr "type" "ishift") (set_attr "prefix_0f" "1") (set_attr "mode" "DI") - (set_attr "athlon_decode" "vector")]) + (set_attr "athlon_decode" "vector") + (set_attr "amdfam10_decode" "vector")]) (define_expand "x86_64_shift_adj" [(set (reg:CCZ FLAGS_REG) @@ -10574,7 +10668,8 @@ (set_attr "prefix_0f" "1") (set_attr "mode" "SI") (set_attr "pent_pair" "np") - (set_attr "athlon_decode" "vector")]) + (set_attr "athlon_decode" "vector") + (set_attr "amdfam10_decode" "vector")]) (define_expand "x86_shift_adj_1" [(set (reg:CCZ FLAGS_REG) @@ -11334,7 +11429,8 @@ [(set_attr "type" "ishift") (set_attr "prefix_0f" "1") (set_attr "mode" "DI") - (set_attr "athlon_decode" "vector")]) + (set_attr "athlon_decode" "vector") + (set_attr "amdfam10_decode" "vector")]) (define_expand "ashrdi3" [(set (match_operand:DI 0 "shiftdi_operand" "") @@ -14608,7 +14704,23 @@ [(set (match_dup 0) (xor:SI (match_dup 0) (const_int 31))) (clobber (reg:CC FLAGS_REG))])] "" - "") +{ + if (TARGET_ABM) + { + emit_insn (gen_clzsi2_abm (operands[0], operands[1])); + DONE; + } +}) + +(define_insn "clzsi2_abm" + [(set (match_operand:SI 0 "register_operand" "=r") + (clz:SI (match_operand:SI 1 "nonimmediate_operand" ""))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_ABM" + "lzcnt{l}\t{%1, %0|%0, %1}" + [(set_attr "prefix_rep" "1") + (set_attr "type" "bitmanip") + (set_attr "mode" "SI")]) (define_insn "*bsr" [(set (match_operand:SI 0 "register_operand" "=r") @@ -14617,7 +14729,44 @@ (clobber (reg:CC FLAGS_REG))] "" "bsr{l}\t{%1, %0|%0, %1}" - [(set_attr "prefix_0f" "1")]) + [(set_attr "prefix_0f" "1") + (set_attr "mode" "SI")]) + +(define_insn "popcountsi2" + [(set (match_operand:SI 0 "register_operand" "=r") + (popcount:SI (match_operand:SI 1 "nonimmediate_operand" ""))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_POPCNT" + "popcnt{l}\t{%1, %0|%0, %1}" + [(set_attr "prefix_rep" "1") + (set_attr "type" "bitmanip") + (set_attr "mode" "SI")]) + +(define_insn "*popcountsi2_cmp" + [(set (reg FLAGS_REG) + (compare + (popcount:SI (match_operand:SI 1 "nonimmediate_operand" "rm")) + (const_int 0))) + (set (match_operand:SI 0 "register_operand" "=r") + (popcount:SI (match_dup 1)))] + "TARGET_POPCNT && ix86_match_ccmode (insn, CCZmode)" + "popcnt{l}\t{%1, %0|%0, %1}" + [(set_attr "prefix_rep" "1") + (set_attr "type" "bitmanip") + (set_attr "mode" "SI")]) + +(define_insn "*popcountsi2_cmp_zext" + [(set (reg FLAGS_REG) + (compare + (popcount:SI (match_operand:SI 1 "nonimmediate_operand" "rm")) + (const_int 0))) + (set (match_operand:DI 0 "register_operand" "=r") + (zero_extend:DI(popcount:SI (match_dup 1))))] + "TARGET_64BIT && TARGET_POPCNT && ix86_match_ccmode (insn, CCZmode)" + "popcnt{l}\t{%1, %0|%0, %1}" + [(set_attr "prefix_rep" "1") + (set_attr "type" "bitmanip") + (set_attr "mode" "SI")]) (define_insn "bswapsi2" [(set (match_operand:SI 0 "register_operand" "=r") @@ -14647,7 +14796,23 @@ [(set (match_dup 0) (xor:DI (match_dup 0) (const_int 63))) (clobber (reg:CC FLAGS_REG))])] "TARGET_64BIT" - "") +{ + if (TARGET_ABM) + { + emit_insn (gen_clzdi2_abm (operands[0], operands[1])); + DONE; + } +}) + +(define_insn "clzdi2_abm" + [(set (match_operand:DI 0 "register_operand" "=r") + (clz:DI (match_operand:DI 1 "nonimmediate_operand" ""))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_64BIT && TARGET_ABM" + "lzcnt{q}\t{%1, %0|%0, %1}" + [(set_attr "prefix_rep" "1") + (set_attr "type" "bitmanip") + (set_attr "mode" "DI")]) (define_insn "*bsr_rex64" [(set (match_operand:DI 0 "register_operand" "=r") @@ -14656,7 +14821,92 @@ (clobber (reg:CC FLAGS_REG))] "TARGET_64BIT" "bsr{q}\t{%1, %0|%0, %1}" - [(set_attr "prefix_0f" "1")]) + [(set_attr "prefix_0f" "1") + (set_attr "mode" "DI")]) + +(define_insn "popcountdi2" + [(set (match_operand:DI 0 "register_operand" "=r") + (popcount:DI (match_operand:DI 1 "nonimmediate_operand" ""))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_64BIT && TARGET_POPCNT" + "popcnt{q}\t{%1, %0|%0, %1}" + [(set_attr "prefix_rep" "1") + (set_attr "type" "bitmanip") + (set_attr "mode" "DI")]) + +(define_insn "*popcountdi2_cmp" + [(set (reg FLAGS_REG) + (compare + (popcount:DI (match_operand:DI 1 "nonimmediate_operand" "rm")) + (const_int 0))) + (set (match_operand:DI 0 "register_operand" "=r") + (popcount:DI (match_dup 1)))] + "TARGET_64BIT && TARGET_POPCNT && ix86_match_ccmode (insn, CCZmode)" + "popcnt{q}\t{%1, %0|%0, %1}" + [(set_attr "prefix_rep" "1") + (set_attr "type" "bitmanip") + (set_attr "mode" "DI")]) + +(define_expand "clzhi2" + [(parallel + [(set (match_operand:HI 0 "register_operand" "") + (minus:HI (const_int 15) + (clz:HI (match_operand:HI 1 "nonimmediate_operand" "")))) + (clobber (reg:CC FLAGS_REG))]) + (parallel + [(set (match_dup 0) (xor:HI (match_dup 0) (const_int 15))) + (clobber (reg:CC FLAGS_REG))])] + "" +{ + if (TARGET_ABM) + { + emit_insn (gen_clzhi2_abm (operands[0], operands[1])); + DONE; + } +}) + +(define_insn "clzhi2_abm" + [(set (match_operand:HI 0 "register_operand" "=r") + (clz:HI (match_operand:HI 1 "nonimmediate_operand" ""))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_ABM" + "lzcnt{w}\t{%1, %0|%0, %1}" + [(set_attr "prefix_rep" "1") + (set_attr "type" "bitmanip") + (set_attr "mode" "HI")]) + +(define_insn "*bsrhi" + [(set (match_operand:HI 0 "register_operand" "=r") + (minus:HI (const_int 15) + (clz:HI (match_operand:HI 1 "nonimmediate_operand" "rm")))) + (clobber (reg:CC FLAGS_REG))] + "" + "bsr{w}\t{%1, %0|%0, %1}" + [(set_attr "prefix_0f" "1") + (set_attr "mode" "HI")]) + +(define_insn "popcounthi2" + [(set (match_operand:HI 0 "register_operand" "=r") + (popcount:HI (match_operand:HI 1 "nonimmediate_operand" ""))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_POPCNT" + "popcnt{w}\t{%1, %0|%0, %1}" + [(set_attr "prefix_rep" "1") + (set_attr "type" "bitmanip") + (set_attr "mode" "HI")]) + +(define_insn "*popcounthi2_cmp" + [(set (reg FLAGS_REG) + (compare + (popcount:HI (match_operand:HI 1 "nonimmediate_operand" "rm")) + (const_int 0))) + (set (match_operand:HI 0 "register_operand" "=r") + (popcount:HI (match_dup 1)))] + "TARGET_POPCNT && ix86_match_ccmode (insn, CCZmode)" + "popcnt{w}\t{%1, %0|%0, %1}" + [(set_attr "prefix_rep" "1") + (set_attr "type" "bitmanip") + (set_attr "mode" "HI")]) ;; Thread-local storage patterns for ELF. ;; @@ -15564,7 +15814,8 @@ "fsqrt" [(set_attr "type" "fpspc") (set_attr "mode" "XF") - (set_attr "athlon_decode" "direct")]) + (set_attr "athlon_decode" "direct") + (set_attr "amdfam10_decode" "direct")]) (define_insn "sqrt_extendxf2_i387" [(set (match_operand:XF 0 "register_operand" "=f") @@ -15575,7 +15826,8 @@ "fsqrt" [(set_attr "type" "fpspc") (set_attr "mode" "XF") - (set_attr "athlon_decode" "direct")]) + (set_attr "athlon_decode" "direct") + (set_attr "amdfam10_decode" "direct")]) (define_insn "*sqrt2_sse" [(set (match_operand:SSEMODEF 0 "register_operand" "=x") @@ -15585,7 +15837,8 @@ "sqrts\t{%1, %0|%0, %1}" [(set_attr "type" "sse") (set_attr "mode" "") - (set_attr "athlon_decode" "*")]) + (set_attr "athlon_decode" "*") + (set_attr "amdfam10_decode" "*")]) (define_expand "sqrt2" [(set (match_operand:X87MODEF12 0 "register_operand" "") @@ -19995,7 +20248,7 @@ (mult:DI (match_operand:DI 1 "memory_operand" "") (match_operand:DI 2 "immediate_operand" ""))) (clobber (reg:CC FLAGS_REG))])] - "(TARGET_K8 || TARGET_GENERIC64) && !optimize_size + "(TARGET_K8 || TARGET_GENERIC64 || TARGET_AMDFAM10) && !optimize_size && !satisfies_constraint_K (operands[2])" [(set (match_dup 3) (match_dup 1)) (parallel [(set (match_dup 0) (mult:DI (match_dup 3) (match_dup 2))) @@ -20008,7 +20261,7 @@ (mult:SI (match_operand:SI 1 "memory_operand" "") (match_operand:SI 2 "immediate_operand" ""))) (clobber (reg:CC FLAGS_REG))])] - "(TARGET_K8 || TARGET_GENERIC64) && !optimize_size + "(TARGET_K8 || TARGET_GENERIC64 || TARGET_AMDFAM10) && !optimize_size && !satisfies_constraint_K (operands[2])" [(set (match_dup 3) (match_dup 1)) (parallel [(set (match_dup 0) (mult:SI (match_dup 3) (match_dup 2))) @@ -20022,7 +20275,7 @@ (mult:SI (match_operand:SI 1 "memory_operand" "") (match_operand:SI 2 "immediate_operand" "")))) (clobber (reg:CC FLAGS_REG))])] - "(TARGET_K8 || TARGET_GENERIC64) && !optimize_size + "(TARGET_K8 || TARGET_GENERIC64 || TARGET_AMDFAM10) && !optimize_size && !satisfies_constraint_K (operands[2])" [(set (match_dup 3) (match_dup 1)) (parallel [(set (match_dup 0) (zero_extend:DI (mult:SI (match_dup 3) (match_dup 2)))) @@ -20039,7 +20292,7 @@ (match_operand:DI 2 "const_int_operand" ""))) (clobber (reg:CC FLAGS_REG))]) (match_scratch:DI 3 "r")] - "(TARGET_K8 || TARGET_GENERIC64) && !optimize_size + "(TARGET_K8 || TARGET_GENERIC64 || TARGET_AMDFAM10) && !optimize_size && satisfies_constraint_K (operands[2])" [(set (match_dup 3) (match_dup 2)) (parallel [(set (match_dup 0) (mult:DI (match_dup 0) (match_dup 3))) @@ -20055,7 +20308,7 @@ (match_operand:SI 2 "const_int_operand" ""))) (clobber (reg:CC FLAGS_REG))]) (match_scratch:SI 3 "r")] - "(TARGET_K8 || TARGET_GENERIC64) && !optimize_size + "(TARGET_K8 || TARGET_GENERIC64 || TARGET_AMDFAM10) && !optimize_size && satisfies_constraint_K (operands[2])" [(set (match_dup 3) (match_dup 2)) (parallel [(set (match_dup 0) (mult:SI (match_dup 0) (match_dup 3))) @@ -20071,7 +20324,7 @@ (match_operand:HI 2 "immediate_operand" ""))) (clobber (reg:CC FLAGS_REG))]) (match_scratch:HI 3 "r")] - "(TARGET_K8 || TARGET_GENERIC64) && !optimize_size" + "(TARGET_K8 || TARGET_GENERIC64 || TARGET_AMDFAM10) && !optimize_size" [(set (match_dup 3) (match_dup 2)) (parallel [(set (match_dup 0) (mult:HI (match_dup 0) (match_dup 3))) (clobber (reg:CC FLAGS_REG))])] diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt index 25b2f2d4e95..61c68074020 100644 --- a/gcc/config/i386/i386.opt +++ b/gcc/config/i386/i386.opt @@ -1,6 +1,6 @@ ; Options for the IA-32 and AMD64 ports of the compiler. -; Copyright (C) 2005 Free Software Foundation, Inc. +; Copyright (C) 2005, 2006, 2007 Free Software Foundation, Inc. ; ; This file is part of GCC. ; @@ -205,6 +205,22 @@ mssse3 Target Report Mask(SSSE3) Support MMX, SSE, SSE2, SSE3 and SSSE3 built-in functions and code generation +msse4a +Target Report Mask(SSE4A) +Support MMX, SSE, SSE2, SSE3 and SSE4A built-in functions and code generation + +mpopcnt +Target Report Mask(POPCNT) +Support code generation of popcount instruction for popcount built-ins +namely __builtin_popcount, __builtin_popcountl and __builtin_popcountll + +mabm +Target Report Mask(ABM) +Support code generation of Advanced Bit Manipulation (ABM) instructions, +which include popcnt and lzcnt instructions, for popcount and clz built-ins +namely __builtin_popcount, __builtin_popcountl, __builtin_popcountll and +__builtin_clz, __builtin_clzl, __builtin_clzll + msseregparm Target RejectNegative Mask(SSEREGPARM) Use SSE register passing conventions for SF and DF mode diff --git a/gcc/config/i386/pmmintrin.h b/gcc/config/i386/pmmintrin.h index 7dbf03043fb..39d7c170a73 100644 --- a/gcc/config/i386/pmmintrin.h +++ b/gcc/config/i386/pmmintrin.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2003, 2004, 2005, 2006 Free Software Foundation, Inc. +/* Copyright (C) 2003, 2004, 2005, 2006, 2007 Free Software Foundation, Inc. This file is part of GCC. @@ -30,7 +30,11 @@ #ifndef _PMMINTRIN_H_INCLUDED #define _PMMINTRIN_H_INCLUDED -#ifdef __SSE3__ +#ifndef __SSE3__ +# error "SSE3 instruction set not enabled" +#else + +/* We need definitions from the SSE2 and SSE header files*/ #include #include diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index a17e7a9cdbc..1bb099e48a9 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -1,5 +1,5 @@ ;; GCC machine description for SSE instructions -;; Copyright (C) 2005, 2006 +;; Copyright (C) 2005, 2006, 2007 ;; Free Software Foundation, Inc. ;; ;; This file is part of GCC. @@ -956,6 +956,7 @@ "cvtsi2ss\t{%2, %0|%0, %2}" [(set_attr "type" "sseicvt") (set_attr "athlon_decode" "vector,double") + (set_attr "amdfam10_decode" "vector,double") (set_attr "mode" "SF")]) (define_insn "sse_cvtsi2ssq" @@ -969,6 +970,7 @@ "cvtsi2ssq\t{%2, %0|%0, %2}" [(set_attr "type" "sseicvt") (set_attr "athlon_decode" "vector,double") + (set_attr "amdfam10_decode" "vector,double") (set_attr "mode" "SF")]) (define_insn "sse_cvtss2si" @@ -992,6 +994,7 @@ "cvtss2si\t{%1, %0|%0, %1}" [(set_attr "type" "sseicvt") (set_attr "athlon_decode" "double,vector") + (set_attr "amdfam10_decode" "double,double") (set_attr "mode" "SI")]) (define_insn "sse_cvtss2siq" @@ -1015,6 +1018,7 @@ "cvtss2siq\t{%1, %0|%0, %1}" [(set_attr "type" "sseicvt") (set_attr "athlon_decode" "double,vector") + (set_attr "amdfam10_decode" "double,double") (set_attr "mode" "DI")]) (define_insn "sse_cvttss2si" @@ -1027,6 +1031,7 @@ "cvttss2si\t{%1, %0|%0, %1}" [(set_attr "type" "sseicvt") (set_attr "athlon_decode" "double,vector") + (set_attr "amdfam10_decode" "double,double") (set_attr "mode" "SI")]) (define_insn "sse_cvttss2siq" @@ -1039,6 +1044,7 @@ "cvttss2siq\t{%1, %0|%0, %1}" [(set_attr "type" "sseicvt") (set_attr "athlon_decode" "double,vector") + (set_attr "amdfam10_decode" "double,double") (set_attr "mode" "DI")]) (define_insn "sse2_cvtdq2ps" @@ -1944,7 +1950,8 @@ "cvtsi2sd\t{%2, %0|%0, %2}" [(set_attr "type" "sseicvt") (set_attr "mode" "DF") - (set_attr "athlon_decode" "double,direct")]) + (set_attr "athlon_decode" "double,direct") + (set_attr "amdfam10_decode" "vector,double")]) (define_insn "sse2_cvtsi2sdq" [(set (match_operand:V2DF 0 "register_operand" "=x,x") @@ -1957,7 +1964,8 @@ "cvtsi2sdq\t{%2, %0|%0, %2}" [(set_attr "type" "sseicvt") (set_attr "mode" "DF") - (set_attr "athlon_decode" "double,direct")]) + (set_attr "athlon_decode" "double,direct") + (set_attr "amdfam10_decode" "vector,double")]) (define_insn "sse2_cvtsd2si" [(set (match_operand:SI 0 "register_operand" "=r,r") @@ -1980,6 +1988,7 @@ "cvtsd2si\t{%1, %0|%0, %1}" [(set_attr "type" "sseicvt") (set_attr "athlon_decode" "double,vector") + (set_attr "amdfam10_decode" "double,double") (set_attr "mode" "SI")]) (define_insn "sse2_cvtsd2siq" @@ -2003,6 +2012,7 @@ "cvtsd2siq\t{%1, %0|%0, %1}" [(set_attr "type" "sseicvt") (set_attr "athlon_decode" "double,vector") + (set_attr "amdfam10_decode" "double,double") (set_attr "mode" "DI")]) (define_insn "sse2_cvttsd2si" @@ -2015,7 +2025,8 @@ "cvttsd2si\t{%1, %0|%0, %1}" [(set_attr "type" "sseicvt") (set_attr "mode" "SI") - (set_attr "athlon_decode" "double,vector")]) + (set_attr "athlon_decode" "double,vector") + (set_attr "amdfam10_decode" "double,double")]) (define_insn "sse2_cvttsd2siq" [(set (match_operand:DI 0 "register_operand" "=r,r") @@ -2027,7 +2038,8 @@ "cvttsd2siq\t{%1, %0|%0, %1}" [(set_attr "type" "sseicvt") (set_attr "mode" "DI") - (set_attr "athlon_decode" "double,vector")]) + (set_attr "athlon_decode" "double,vector") + (set_attr "amdfam10_decode" "double,double")]) (define_insn "sse2_cvtdq2pd" [(set (match_operand:V2DF 0 "register_operand" "=x") @@ -2058,7 +2070,8 @@ "TARGET_SSE2" "cvtpd2dq\t{%1, %0|%0, %1}" [(set_attr "type" "ssecvt") - (set_attr "mode" "TI")]) + (set_attr "mode" "TI") + (set_attr "amdfam10_decode" "double")]) (define_expand "sse2_cvttpd2dq" [(set (match_operand:V4SI 0 "register_operand" "") @@ -2076,7 +2089,8 @@ "TARGET_SSE2" "cvttpd2dq\t{%1, %0|%0, %1}" [(set_attr "type" "ssecvt") - (set_attr "mode" "TI")]) + (set_attr "mode" "TI") + (set_attr "amdfam10_decode" "double")]) (define_insn "sse2_cvtsd2ss" [(set (match_operand:V4SF 0 "register_operand" "=x,x") @@ -2090,20 +2104,22 @@ "cvtsd2ss\t{%2, %0|%0, %2}" [(set_attr "type" "ssecvt") (set_attr "athlon_decode" "vector,double") + (set_attr "amdfam10_decode" "vector,double") (set_attr "mode" "SF")]) (define_insn "sse2_cvtss2sd" - [(set (match_operand:V2DF 0 "register_operand" "=x") + [(set (match_operand:V2DF 0 "register_operand" "=x,x") (vec_merge:V2DF (float_extend:V2DF (vec_select:V2SF - (match_operand:V4SF 2 "nonimmediate_operand" "xm") + (match_operand:V4SF 2 "nonimmediate_operand" "x,m") (parallel [(const_int 0) (const_int 1)]))) - (match_operand:V2DF 1 "register_operand" "0") + (match_operand:V2DF 1 "register_operand" "0,0") (const_int 1)))] "TARGET_SSE2" "cvtss2sd\t{%2, %0|%0, %2}" [(set_attr "type" "ssecvt") + (set_attr "amdfam10_decode" "vector,double") (set_attr "mode" "DF")]) (define_expand "sse2_cvtpd2ps" @@ -2124,7 +2140,8 @@ "TARGET_SSE2" "cvtpd2ps\t{%1, %0|%0, %1}" [(set_attr "type" "ssecvt") - (set_attr "mode" "V4SF")]) + (set_attr "mode" "V4SF") + (set_attr "amdfam10_decode" "double")]) (define_insn "sse2_cvtps2pd" [(set (match_operand:V2DF 0 "register_operand" "=x") @@ -2135,7 +2152,8 @@ "TARGET_SSE2" "cvtps2pd\t{%1, %0|%0, %1}" [(set_attr "type" "ssecvt") - (set_attr "mode" "V2DF")]) + (set_attr "mode" "V2DF") + (set_attr "amdfam10_decode" "direct")]) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; @@ -5146,3 +5164,92 @@ "pabs\t{%1, %0|%0, %1}"; [(set_attr "type" "sselog1") (set_attr "mode" "DI")]) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; +;; AMD SSE4A instructions +;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(define_insn "sse4a_vmmovntv2df" + [(set (match_operand:DF 0 "memory_operand" "=m") + (unspec:DF [(vec_select:DF + (match_operand:V2DF 1 "register_operand" "x") + (parallel [(const_int 0)]))] + UNSPEC_MOVNT))] + "TARGET_SSE4A" + "movntsd\t{%1, %0|%0, %1}" + [(set_attr "type" "ssemov") + (set_attr "mode" "DF")]) + +(define_insn "sse4a_movntdf" + [(set (match_operand:DF 0 "memory_operand" "=m") + (unspec:DF [(match_operand:DF 1 "register_operand" "x")] + UNSPEC_MOVNT))] + "TARGET_SSE4A" + "movntsd\t{%1, %0|%0, %1}" + [(set_attr "type" "ssemov") + (set_attr "mode" "DF")]) + +(define_insn "sse4a_vmmovntv4sf" + [(set (match_operand:SF 0 "memory_operand" "=m") + (unspec:SF [(vec_select:SF + (match_operand:V4SF 1 "register_operand" "x") + (parallel [(const_int 0)]))] + UNSPEC_MOVNT))] + "TARGET_SSE4A" + "movntss\t{%1, %0|%0, %1}" + [(set_attr "type" "ssemov") + (set_attr "mode" "SF")]) + +(define_insn "sse4a_movntsf" + [(set (match_operand:SF 0 "memory_operand" "=m") + (unspec:SF [(match_operand:SF 1 "register_operand" "x")] + UNSPEC_MOVNT))] + "TARGET_SSE4A" + "movntss\t{%1, %0|%0, %1}" + [(set_attr "type" "ssemov") + (set_attr "mode" "SF")]) + +(define_insn "sse4a_extrqi" + [(set (match_operand:V2DI 0 "register_operand" "=x") + (unspec:V2DI [(match_operand:V2DI 1 "register_operand" "0") + (match_operand 2 "const_int_operand" "") + (match_operand 3 "const_int_operand" "")] + UNSPEC_EXTRQI))] + "TARGET_SSE4A" + "extrq\t{%3, %2, %0|%0, %2, %3}" + [(set_attr "type" "sse") + (set_attr "mode" "TI")]) + +(define_insn "sse4a_extrq" + [(set (match_operand:V2DI 0 "register_operand" "=x") + (unspec:V2DI [(match_operand:V2DI 1 "register_operand" "0") + (match_operand:V16QI 2 "register_operand" "x")] + UNSPEC_EXTRQ))] + "TARGET_SSE4A" + "extrq\t{%2, %0|%0, %2}" + [(set_attr "type" "sse") + (set_attr "mode" "TI")]) + +(define_insn "sse4a_insertqi" + [(set (match_operand:V2DI 0 "register_operand" "=x") + (unspec:V2DI [(match_operand:V2DI 1 "register_operand" "0") + (match_operand:V2DI 2 "register_operand" "x") + (match_operand 3 "const_int_operand" "") + (match_operand 4 "const_int_operand" "")] + UNSPEC_INSERTQI))] + "TARGET_SSE4A" + "insertq\t{%4, %3, %2, %0|%0, %2, %3, %4}" + [(set_attr "type" "sseins") + (set_attr "mode" "TI")]) + +(define_insn "sse4a_insertq" + [(set (match_operand:V2DI 0 "register_operand" "=x") + (unspec:V2DI [(match_operand:V2DI 1 "register_operand" "0") + (match_operand:V2DI 2 "register_operand" "x")] + UNSPEC_INSERTQ))] + "TARGET_SSE4A" + "insertq\t{%2, %0|%0, %2}" + [(set_attr "type" "sseins") + (set_attr "mode" "TI")]) diff --git a/gcc/config/i386/tmmintrin.h b/gcc/config/i386/tmmintrin.h index e53dd0f5c55..cf9d99d0901 100644 --- a/gcc/config/i386/tmmintrin.h +++ b/gcc/config/i386/tmmintrin.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2006 Free Software Foundation, Inc. +/* Copyright (C) 2006, 2007 Free Software Foundation, Inc. This file is part of GCC. @@ -30,7 +30,11 @@ #ifndef _TMMINTRIN_H_INCLUDED #define _TMMINTRIN_H_INCLUDED -#ifdef __SSSE3__ +#ifndef __SSSE3__ +# error "SSSE3 instruction set not enabled" +#else + +/* We need definitions from the SSE3, SSE2 and SSE header files*/ #include static __inline __m128i __attribute__((__always_inline__)) diff --git a/gcc/doc/extend.texi b/gcc/doc/extend.texi index f56f77fd0af..2417a73d20a 100644 --- a/gcc/doc/extend.texi +++ b/gcc/doc/extend.texi @@ -7269,6 +7269,23 @@ v4si __builtin_ia32_pabsd128 (v4si) v8hi __builtin_ia32_pabsw128 (v8hi) @end smallexample +The following built-in functions are available when @option{-msse4a} is used. + +@smallexample +void _mm_stream_sd (double*,__m128d); +Generates the @code{movntsd} machine instruction. +void _mm_stream_ss (float*,__m128); +Generates the @code{movntss} machine instruction. +__m128i _mm_extract_si64 (__m128i, __m128i); +Generates the @code{extrq} machine instruction with only SSE register operands. +__m128i _mm_extracti_si64 (__m128i, int, int); +Generates the @code{extrq} machine instruction with SSE register and immediate operands. +__m128i _mm_insert_si64 (__m128i, __m128i); +Generates the @code{insertq} machine instruction with only SSE register operands. +__m128i _mm_inserti_si64 (__m128i, __m128i, int, int); +Generates the @code{insertq} machine instruction with SSE register and immediate operands. +@end smallexample + The following built-in functions are available when @option{-m3dnow} is used. All of them generate the machine instruction that is part of the name. diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi index 4be7de1022b..cae7f8b038d 100644 --- a/gcc/doc/invoke.texi +++ b/gcc/doc/invoke.texi @@ -538,7 +538,7 @@ Objective-C and Objective-C++ Dialects}. -mno-fp-ret-in-387 -msoft-float -msvr3-shlib @gol -mno-wide-multiply -mrtd -malign-double @gol -mpreferred-stack-boundary=@var{num} @gol --mmmx -msse -msse2 -msse3 -mssse3 -m3dnow @gol +-mmmx -msse -msse2 -msse3 -mssse3 -msse4a -m3dnow -mpopcnt -mabm @gol -mthreads -mno-align-stringops -minline-all-stringops @gol -mpush-args -maccumulate-outgoing-args -m128bit-long-double @gol -m96bit-long-double -mregparm=@var{num} -msseregparm @gol @@ -9501,6 +9501,10 @@ instruction set support. @item k8, opteron, athlon64, athlon-fx AMD K8 core based CPUs with x86-64 instruction set support. (This supersets MMX, SSE, SSE2, 3dNOW!, enhanced 3dNOW! and 64-bit instruction set extensions.) +@item amdfam10 +AMD Family 10 core based CPUs with x86-64 instruction set support. (This +supersets MMX, SSE, SSE2, SSE3, SSE4A, 3dNOW!, enhanced 3dNOW!, ABM and 64-bit +instruction set extensions.) @item winchip-c6 IDT Winchip C6 CPU, dealt in same way as i486 with additional MMX instruction set support. @@ -9795,8 +9799,14 @@ preferred alignment to @option{-mpreferred-stack-boundary=2}. @itemx -mno-sse3 @item -mssse3 @itemx -mno-ssse3 +@item -msse4a +@item -mno-sse4a @item -m3dnow @itemx -mno-3dnow +@item -mpopcnt +@itemx -mno-popcnt +@item -mabm +@itemx -mno-abm @opindex mmmx @opindex mno-mmx @opindex msse @@ -9804,7 +9814,7 @@ preferred alignment to @option{-mpreferred-stack-boundary=2}. @opindex m3dnow @opindex mno-3dnow These switches enable or disable the use of instructions in the MMX, -SSE, SSE2, SSE3, SSSE3 or 3DNow! extended instruction sets. +SSE, SSE2, SSE3, SSSE3, SSE4A, ABM or 3DNow! extended instruction sets. These extensions are also available as built-in functions: see @ref{X86 Built-in Functions}, for details of the functions enabled and disabled by these switches. diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 2b88029caf4..03f1aa6462e 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,12 @@ +2007-02-05 Dwarakanath Rajagopal + + * gcc.dg/i386-cpuid.h: Test whether SSE4A is supported + for running tests. + * gcc.target/i386/sse4a-extract.c: New test. + * gcc.target/i386/sse4a-insert.c: New test. + * gcc.target/i386/sse4a-montsd.c: New test. + * gcc.target/i386/sse4a-montss.c: New test. + 2007-02-05 Richard Guenther * gcc.target/i386/vectorize3.c: New testcase. diff --git a/gcc/testsuite/gcc.dg/i386-cpuid.h b/gcc/testsuite/gcc.dg/i386-cpuid.h index 695b4c9ce72..c7b999c7fdf 100644 --- a/gcc/testsuite/gcc.dg/i386-cpuid.h +++ b/gcc/testsuite/gcc.dg/i386-cpuid.h @@ -12,6 +12,10 @@ #define bit_SSE (1 << 25) #define bit_SSE2 (1 << 26) +/* Extended Features */ +/* %ecx */ +#define bit_SSE4a (1 << 6) + #ifndef NOINLINE #define NOINLINE __attribute__ ((noinline)) #endif @@ -60,8 +64,43 @@ i386_get_cpuid (unsigned int *ecx, unsigned int *edx) return 1; } +static inline unsigned int +i386_get_extended_cpuid (unsigned int *ecx, unsigned int *edx) +{ + int fl1; + if (!(i386_get_cpuid (ecx, edx))) + return 0; + + /* Invoke CPUID(0x80000000) to get the highest supported extended function + number */ +#ifdef __x86_64__ + __asm__ ("cpuid" + : "=a" (fl1) : "0" (0x80000000) : "edx", "ecx", "ebx"); +#else + __asm__ ("pushl %%ebx; cpuid; popl %%ebx" + : "=a" (fl1) : "0" (0x80000000) : "edx", "ecx"); +#endif + /* Check if highest supported extended function used below are supported */ + if (fl1 < 0x80000001) + return 0; + + /* Invoke CPUID(0x80000001), return %ecx and %edx; caller can examine bits to + determine what's supported. */ +#ifdef __x86_64__ + __asm__ ("cpuid" + : "=c" (*ecx), "=d" (*edx), "=a" (fl1) : "2" (0x80000001) : "ebx"); +#else + __asm__ ("pushl %%ebx; cpuid; popl %%ebx" + : "=c" (*ecx), "=d" (*edx), "=a" (fl1) : "2" (0x80000001)); +#endif + return 1; +} + + unsigned int i386_cpuid_ecx (void) NOINLINE; unsigned int i386_cpuid_edx (void) NOINLINE; +unsigned int i386_extended_cpuid_ecx (void) NOINLINE; +unsigned int i386_extended_cpuid_edx (void) NOINLINE; unsigned int NOINLINE i386_cpuid_ecx (void) @@ -83,6 +122,26 @@ i386_cpuid_edx (void) return 0; } +unsigned int NOINLINE +i386_extended_cpuid_ecx (void) +{ + unsigned int ecx, edx; + if (i386_get_extended_cpuid (&ecx, &edx)) + return ecx; + else + return 0; +} + +unsigned int NOINLINE +i386_extended_cpuid_edx (void) +{ + unsigned int ecx, edx; + if (i386_get_extended_cpuid (&ecx, &edx)) + return edx; + else + return 0; +} + static inline unsigned int i386_cpuid (void) { diff --git a/gcc/testsuite/gcc.target/i386/sse4a-extract.c b/gcc/testsuite/gcc.target/i386/sse4a-extract.c new file mode 100644 index 00000000000..90ad0f65a23 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/sse4a-extract.c @@ -0,0 +1,100 @@ +/* { dg-do run { target i?86-*-* x86_64-*-* } } */ +/* { dg-options "-O2 -msse4a" } */ +#include +#include +#include "../../gcc.dg/i386-cpuid.h" + +static void sse4a_test (void); + +typedef union +{ + long long i[2]; + __m128i vec; +} LI; + +int +main () +{ + unsigned long cpu_facilities; + + cpu_facilities = i386_extended_cpuid_ecx (); + + /* Run SSE4a test only if host has SSE4a support. */ + if ((cpu_facilities & bit_SSE4a)) + sse4a_test (); + + exit (0); +} + +static long long +sse4a_test_extrq (long long in) +{ + __m128i v1, v2; + long long index_length, pad; + LI v_out; + index_length = 0x0000000000000810; + pad = 0x0; + v1 = _mm_set_epi64x (pad, in); + v2 = _mm_set_epi64x (pad, index_length); + v_out.vec = _mm_extract_si64 (v1, v2); + return (v_out.i[0]); +} + +static long long +sse4a_test_extrqi (long long in) +{ + __m128i v1; + long long pad =0x0; + LI v_out; + v1 = _mm_set_epi64x (pad, in); + v_out.vec = _mm_extracti_si64 (v1, (unsigned int) 0x10,(unsigned int) 0x08); + return (v_out.i[0]); +} + +static chk (long long i1, long long i2) +{ + int n_fails =0; + if (i1 != i2) + n_fails +=1; + return n_fails; +} + +long long vals_in[5] = + { + 0x1234567887654321, + 0x1456782093002490, + 0x2340909123990390, + 0x9595959599595999, + 0x9099038798000029 + }; + +long long vals_out[5] = + { + 0x0000000000006543, + 0x0000000000000024, + 0x0000000000009903, + 0x0000000000005959, + 0x0000000000000000 + }; + +static void +sse4a_test (void) +{ + int i; + int fail = 0; + long long out; + + for (i = 0; i < 5; i += 1) + { + out = sse4a_test_extrq (vals_in[i]); + fail += chk(out, vals_out[i]); + + out = sse4a_test_extrqi (vals_in[i]); + fail += chk(out, vals_out[i]); + } + + if (fail != 0) + abort (); + + exit (0); +} diff --git a/gcc/testsuite/gcc.target/i386/sse4a-insert.c b/gcc/testsuite/gcc.target/i386/sse4a-insert.c new file mode 100644 index 00000000000..69262bdfcb0 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/sse4a-insert.c @@ -0,0 +1,110 @@ +/* { dg-do run { target i?86-*-* x86_64-*-* } } */ +/* { dg-options "-O2 -msse4a" } */ +#include +#include +#include "../../gcc.dg/i386-cpuid.h" + +static void sse4a_test (void); + +typedef union +{ + long long i[2]; + __m128i vec; +} LI; + +int +main () +{ + unsigned long cpu_facilities; + + cpu_facilities = i386_extended_cpuid_ecx (); + + /* Run SSE4a test only if host has SSE4a support. */ + if ((cpu_facilities & bit_SSE4a)) + sse4a_test (); + + exit (0); +} + +static long long +sse4a_test_insert (long long in1, long long in2) +{ + __m128i v1,v2; + long long index_length, pad; + LI v_out; + index_length = 0x0000000000000810; + pad = 0x0; + v1 = _mm_set_epi64x (pad, in1); + v2 = _mm_set_epi64x (index_length, in2); + v_out.vec = _mm_insert_si64 (v1, v2); + return (v_out.i[0]); +} + +static long long +sse4a_test_inserti (long long in1, long long in2) +{ + __m128i v1,v2; + long long pad = 0x0; + LI v_out; + v1 = _mm_set_epi64x (pad, in1); + v2 = _mm_set_epi64x (pad, in2); + v_out.vec = _mm_inserti_si64 (v1, v2, (unsigned int) 0x10, (unsigned int) 0x08); + return (v_out.i[0]); +} + +static chk (long long i1, long long i2) +{ + int n_fails =0; + if (i1 != i2) + n_fails +=1; + return n_fails; +} + +long long vals_in1[5] = + { + 0x1234567887654321, + 0x1456782093002490, + 0x2340909123990390, + 0x9595959599595999, + 0x9099038798000029 + }; + +long long vals_in2[5] = + { + 0x9ABCDEF00FEDCBA9, + 0x234567097289672A, + 0x45476453097BD342, + 0x23569012AE586FF0, + 0x432567ABCDEF765D + }; + +long long vals_out[5] = + { + 0x1234567887CBA921, + 0x1456782093672A90, + 0x2340909123D34290, + 0x95959595996FF099, + 0x9099038798765D29 + }; + +static void +sse4a_test (void) +{ + int i; + int fail = 0; + long long out; + + for (i = 0; i < 5; i += 1) + { + out = sse4a_test_insert (vals_in1[i], vals_in2[i]); + fail += chk(out, vals_out[i]); + + out = sse4a_test_inserti (vals_in1[i], vals_in2[i]); + fail += chk(out, vals_out[i]); + } + + if (fail != 0) + abort (); + + exit (0); +} diff --git a/gcc/testsuite/gcc.target/i386/sse4a-montsd.c b/gcc/testsuite/gcc.target/i386/sse4a-montsd.c new file mode 100644 index 00000000000..e9be98e3bb8 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/sse4a-montsd.c @@ -0,0 +1,64 @@ +/* { dg-do run { target i?86-*-* x86_64-*-* } } */ +/* { dg-options "-O2 -msse4a" } */ +#include +#include +#include "../../gcc.dg/i386-cpuid.h" + +static void sse4a_test (void); + +int +main () +{ + unsigned long cpu_facilities; + + cpu_facilities = i386_extended_cpuid_ecx (); + + /* Run SSE4a test only if host has SSE4a support. */ + if ((cpu_facilities & bit_SSE4a)) + sse4a_test (); + + exit (0); +} + +static void +sse4a_test_movntsd (double *out, double *in) +{ + __m128d in_v2df = _mm_load_sd (in); + _mm_stream_sd (out, in_v2df); +} + +static int +chk_sd (double *v1, double *v2) +{ + int n_fails = 0; + if (v1[0] != v2[0]) + n_fails += 1; + return n_fails; +} + +double vals[10] = + { + 100.0, 200.0, 300.0, 400.0, 5.0, + -1.0, .345, -21.5, 9.32, 8.41 + }; + +static void +sse4a_test (void) +{ + int i; + int fail = 0; + double *out; + + out = (double *) malloc (sizeof (double)); + for (i = 0; i < 10; i += 1) + { + sse4a_test_movntsd (out, &vals[i]); + + fail += chk_sd (out, &vals[i]); + } + + if (fail != 0) + abort (); + + exit (0); +} diff --git a/gcc/testsuite/gcc.target/i386/sse4a-montss.c b/gcc/testsuite/gcc.target/i386/sse4a-montss.c new file mode 100644 index 00000000000..28ecb1cf3c0 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/sse4a-montss.c @@ -0,0 +1,64 @@ +/* { dg-do run { target i?86-*-* x86_64-*-* } } */ +/* { dg-options "-O2 -msse4a" } */ +#include +#include +#include "../../gcc.dg/i386-cpuid.h" + +static void sse4a_test (void); + +int +main () +{ + unsigned long cpu_facilities; + + cpu_facilities = i386_extended_cpuid_ecx (); + + /* Run SSE4a test only if host has SSE4a support. */ + if ((cpu_facilities & bit_SSE4a)) + sse4a_test (); + + exit (0); +} + +static void +sse4a_test_movntss (float *out, float *in) +{ + __m128 in_v4sf = _mm_load_ss (in); + _mm_stream_ss (out, in_v4sf); +} + +static int +chk_ss (float *v1, float *v2) +{ + int n_fails = 0; + if (v1[0] != v2[0]) + n_fails += 1; + return n_fails; +} + +float vals[10] = + { + 100.0, 200.0, 300.0, 400.0, 5.0, + -1.0, .345, -21.5, 9.32, 8.41 + }; + +static void +sse4a_test (void) +{ + int i; + int fail = 0; + float *out; + + out = (float *) malloc (sizeof (float)); + for (i = 0; i < 10; i += 1) + { + sse4a_test_movntss (out, &vals[i]); + + fail += chk_ss (out, &vals[i]); + } + + if (fail != 0) + abort (); + + exit (0); +}